[aarch64] Improve code-gen for vector initialization with single constant element. gcc/ChangeLog: * config/aarch64/aarc64.cc (aarch64_expand_vector_init): Tweak condition if (n_var == n_elts && n_elts <= 16) to allow a single constant, and if maxv == 1, use constant element for duplicating into register. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vec-init-single-const.c: New test. diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 2b0de7ca038..f46750133a6 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -22167,7 +22167,7 @@ aarch64_expand_vector_init (rtx target, rtx vals) and matches[X][1] with the count of duplicate elements (if X is the earliest element which has duplicates). */ - if (n_var == n_elts && n_elts <= 16) + if ((n_var >= n_elts - 1) && n_elts <= 16) { int matches[16][2] = {0}; for (int i = 0; i < n_elts; i++) @@ -22227,6 +22227,18 @@ aarch64_expand_vector_init (rtx target, rtx vals) vector register. For big-endian we want that position to hold the last element of VALS. */ maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0; + + /* If we have a single constant element, use that for duplicating + instead. */ + if (n_var == n_elts - 1) + for (int i = 0; i < n_elts; i++) + if (CONST_INT_P (XVECEXP (vals, 0, i)) + || CONST_DOUBLE_P (XVECEXP (vals, 0, i))) + { + maxelement = i; + break; + } + rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement)); aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode)); } diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c new file mode 100644 index 00000000000..517f47b13ec --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c @@ -0,0 +1,66 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#include + +/* +** f_s8: +** ... +** dup v[0-9]+\.16b, w[0-9]+ +** movi v[0-9]+\.8b, 0x1 +** ins v[0-9]+\.b\[15\], v[0-9]+\.b\[0\] +** ... +** ret +*/ + +int8x16_t f_s8(int8_t x) +{ + return (int8x16_t) { x, x, x, x, x, x, x, x, + x, x, x, x, x, x, x, 1 }; +} + +/* +** f_s16: +** ... +** dup v[0-9]+\.8h, w[0-9]+ +** movi v[0-9]+\.4h, 0x1 +** ins v[0-9]+\.h\[7\], v[0-9]+\.h\[0\] +** ... +** ret +*/ + +int16x8_t f_s16(int16_t x) +{ + return (int16x8_t) { x, x, x, x, x, x, x, 1 }; +} + +/* +** f_s32: +** ... +** movi v[0-9]\.2s, 0x1 +** dup v[0-9]\.4s, w[0-9]+ +** ins v[0-9]+\.s\[3\], v[0-9]+\.s\[0\] +** ... +** ret +*/ + +int32x4_t f_s32(int32_t x) +{ + return (int32x4_t) { x, x, x, 1 }; +} + +/* +** f_s64: +** ... +** fmov d[0-9]+, x[0-9]+ +** mov x[0-9]+, 1 +** ins v[0-9]+\.d\[1\], x[0-9]+ +** ... +** ret +*/ + +int64x2_t f_s64(int64_t x) +{ + return (int64x2_t) { x, 1 }; +}