diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9a79a9e7928..411e85f52a4 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -21984,6 +21984,54 @@ aarch64_simd_make_constant (rtx vals)
   return NULL_RTX;
 }
 
+/* Subroutine of aarch64_expand_vector_init.
+   Check if VALS has the same element at every alternate
+   position starting from START_POS.  */
+
+static
+bool aarch64_init_interleaving_dup_p (rtx vals, int start_pos)
+{
+  for (int i = start_pos + 2; i < XVECLEN (vals, 0); i += 2)
+    if (!rtx_equal_p (XVECEXP (vals, 0, start_pos), XVECEXP (vals, 0, i)))
+      return false;
+  return true;
+}
+
+/* Subroutine of aarch64_expand_vector_init.
+   Check if every alternate element in VALS starting from START_POS
+   is a constant.  */
+
+static
+bool aarch64_init_interleaving_const_p (rtx vals, int start_pos)
+{
+  for (int i = start_pos; i < XVECLEN (vals, 0); i += 2)
+    if (!CONSTANT_P (XVECEXP (vals, 0, i)))
+      return false;
+  return true;
+}
+
+/* Subroutine of aarch64_expand_vector_init.
+   Copy the even-indexed (if CONST_EVEN) or odd-indexed (otherwise)
+   constant elements of VALS into the first half of a new CONST_VECTOR.
+   For example, if VALS is { x, 1, x, 2, x, 3, x, 4 } and CONST_EVEN
+   is false, return { 1, 2, 3, 4, 1, 1, 1, 1 }.
+   Only the first half {0 ... n_elts/2 - 1} matters, since zip1 merges
+   just that half; the second half is filled with the first copied
+   element (a constant, keeping the CONST_VECTOR valid) and discarded.  */
+
+static
+rtx aarch64_init_interleaving_shift_init (rtx vals, bool const_even)
+{
+  int n_elts = XVECLEN (vals, 0);
+  rtvec vec = rtvec_alloc (n_elts);
+  int i;
+  for (i = 0; i < n_elts / 2; i++)
+    RTVEC_ELT (vec, i) = XVECEXP (vals, 0, (const_even) ? 2 * i : 2 * i + 1);
+  for (; i < n_elts; i++)
+    RTVEC_ELT (vec, i) = RTVEC_ELT (vec, 0);
+  return gen_rtx_CONST_VECTOR (GET_MODE (vals), vec);
+}
+
 /* Expand a vector initialisation sequence, such that TARGET is
    initialised to contain VALS.  */
 
@@ -22048,22 +22096,55 @@ aarch64_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
-  /* Check for interleaving case.
-     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
-     Generate following code:
-     dup v0.h, x
-     dup v1.h, y
-     zip1 v0.h, v0.h, v1.h
-     for "large enough" initializer.  */
+  /* Check for interleaving case for "large enough" initializer.
+     Currently we handle the following cases:
+     (a) Even part is dup and odd part is const.
+     (b) Odd part is dup and even part is const.
+     (c) Both even and odd parts are dup.  */
 
   if (n_elts >= 8)
     {
-      int i;
-      for (i = 2; i < n_elts; i++)
-	if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
-	  break;
+      bool even_dup = false, even_const = false;
+      bool odd_dup = false, odd_const = false;
+
+      even_dup = aarch64_init_interleaving_dup_p (vals, 0);
+      if (!even_dup)
+	even_const = aarch64_init_interleaving_const_p (vals, 0);
+
+      odd_dup = aarch64_init_interleaving_dup_p (vals, 1);
+      if (!odd_dup)
+	odd_const = aarch64_init_interleaving_const_p (vals, 1);
+
+      /* The all-constant case should already be handled above.  */
+      gcc_assert (!(even_const && odd_const));
 
-      if (i == n_elts)
+      if (even_dup && odd_const)
+	{
+	  rtx dup_reg = expand_vector_broadcast (mode, XVECEXP (vals, 0, 0));
+	  dup_reg = force_reg (mode, dup_reg);
+
+	  rtx const_reg = gen_reg_rtx (mode);
+	  rtx const_vector = aarch64_init_interleaving_shift_init (vals, false);
+	  aarch64_expand_vector_init (const_reg, const_vector);
+
+	  rtvec v = gen_rtvec (2, dup_reg, const_reg);
+	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+	  return;
+	}
+      else if (odd_dup && even_const)
+	{
+	  rtx dup_reg = expand_vector_broadcast (mode, XVECEXP (vals, 0, 1));
+	  dup_reg = force_reg (mode, dup_reg);
+
+	  rtx const_reg = gen_reg_rtx (mode);
+	  rtx const_vector = aarch64_init_interleaving_shift_init (vals, true);
+	  aarch64_expand_vector_init (const_reg, const_vector);
+
+	  rtvec v = gen_rtvec (2, const_reg, dup_reg);
+	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+	  return;
+	}
+      else if (even_dup && odd_dup)
 	{
 	  machine_mode mode = GET_MODE (target);
 	  rtx dest[2];
diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c
new file mode 100644
index 00000000000..3ad06c00451
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include "arm_neon.h"
+
+/*
+**foo:
+**	...
+**	dup	v[0-9]+\.8h, w[0-9]+
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+*/
+
+int16x8_t foo(int16_t x)
+{
+  return (int16x8_t) { x, 1, x, 2, x, 3, x, 4 };
+}
+
+
+/*
+**foo2:
+**	...
+**	dup	v[0-9]+\.8h, w[0-9]+
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+*/
+
+int16x8_t foo2(int16_t x)
+{
+  return (int16x8_t) { 1, x, 2, x, 3, x, 4, x };
+}
+
+/*
+**foo3:
+**	...
+**	dup	v[0-9]+\.8h, v[0-9]+\.h\[0\]
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+*/
+
+float16x8_t foo3(float16_t x)
+{
+  return (float16x8_t) { x, 1.0, x, 2.0, x, 3.0, x, 4.0 };
+}