* [AArch64] Improve SVE dup intrinsics codegen
@ 2022-05-17 13:28 Andre Vieira (lists)
From: Andre Vieira (lists) @ 2022-05-17 13:28 UTC (permalink / raw)
To: gcc-patches, Richard Sandiford, Kyrylo Tkachov
[-- Attachment #1: Type: text/plain, Size: 931 bytes --]
Hi,
This patch teaches the aarch64 backend to improve codegen for the SVE dup
intrinsics when the NEON input vector has a repeating pattern. It will
attempt to use a smaller NEON vector (or a single element) to limit the
number of instructions needed to construct the input vector.
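As a concrete example (the before/after sequences are the ones quoted in the
comment added by the patch; the wrapper function is just illustrative), for:

  svint32_t foo (int32_t a, int32_t b)
  {
    return svdupq_n_s32 (a, b, a, b);
  }

the generated code goes from:

  dup     v0.4s, w0
  fmov    s1, w1
  ins     v0.s[1], v1.s[0]
  ins     v0.s[3], v1.s[0]
  dup     z0.q, z0.q[0]

to:

  fmov    d0, x0
  ins     v0.s[1], w1
  mov     z0.d, d0

since only the two-element [a, b] input needs to be constructed before
broadcasting it.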
Bootstrapped and regression tested on aarch64-none-linux-gnu.
Is this OK for trunk?
gcc/ChangeLog:
* config/aarch64/aarch64.cc (aarch64_simd_container_mode): Make
it global.
* config/aarch64/aarch64-protos.h
(aarch64_simd_container_mode): Declare it.
* config/aarch64/aarch64-sve.md (*vec_duplicate<mode>_reg):
Rename this to ...
(@aarch64_vec_duplicate_reg_<mode>): ... this.
* config/aarch64/aarch64-sve-builtins-base.cc
(svdupq_impl::expand): Improve codegen when the inputs form a
repeating pattern.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/sve/dup_opt.c: New test.
[-- Attachment #2: sve_dup.patch --]
[-- Type: text/plain, Size: 9651 bytes --]
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 2ac781dff4a93cbe0f0b091147b2521ed1a88750..cfc31b467cf1d3cd79b2dfe6a54e6910dd43b5d8 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -771,6 +771,7 @@ int aarch64_branch_cost (bool, bool);
enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
bool aarch64_advsimd_struct_mode_p (machine_mode mode);
opt_machine_mode aarch64_vq_mode (scalar_mode);
+machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
opt_machine_mode aarch64_full_sve_mode (scalar_mode);
bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index c24c05487246f529f81867d6429e636fd6dc74d0..f8b755a83dc37578363270618323f87c95fa327f 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -875,13 +875,98 @@ public:
argument N to go into architectural lane N, whereas Advanced SIMD
vectors are loaded memory lsb to register lsb. We therefore need
to reverse the elements for big-endian targets. */
- rtx vq_reg = gen_reg_rtx (vq_mode);
rtvec vec = rtvec_alloc (elements_per_vq);
for (unsigned int i = 0; i < elements_per_vq; ++i)
{
unsigned int argno = BYTES_BIG_ENDIAN ? elements_per_vq - i - 1 : i;
RTVEC_ELT (vec, i) = e.args[argno];
}
+
+ /* Look for a repeating pattern in the 128-bit input as that potentially
+ simplifies constructing the input vector.
+ For example, codegen for svdupq_n_s32 (a, b, a, b), could be simplified
+ from:
+ dup v0.4s, w0
+ fmov s1, w1
+ ins v0.s[1], v1.s[0]
+ ins v0.s[3], v1.s[0]
+ dup z0.q, z0.q[0]
+ to:
+ fmov d0, x0
+ ins v0.s[1], w1
+ mov z0.d, d0
+ where we can see it uses a [a, b] input vector reducing the number of
+ needed instructions. */
+ if (elements_per_vq > 1 && mode == e.vector_mode(0))
+ {
+ unsigned int new_elements_n = elements_per_vq;
+ bool group = true;
+ while (group && new_elements_n > 1)
+ {
+ for (unsigned int i = 0; i < new_elements_n / 2; ++i)
+ {
+ if (rtx_equal_p (RTVEC_ELT (vec, i),
+ RTVEC_ELT (vec, new_elements_n / 2 + i)) == 0)
+ {
+ group = false;
+ break;
+ }
+ }
+ if (group)
+ new_elements_n /= 2;
+ }
+ /* We have found a repeating pattern smaller than 128-bits, so use that
+ instead. */
+ if (new_elements_n < elements_per_vq)
+ {
+ unsigned int input_size = 128 / elements_per_vq * new_elements_n;
+ scalar_mode new_mode
+ = int_mode_for_size (input_size, 0).require ();
+ rtx input;
+ if (new_elements_n > 1)
+ {
+ if (input_size < 64)
+ {
+ /* TODO: Remove this when support for 32- and 16-bit vectors
+ is added.
+ */
+ new_elements_n *= 64/input_size;
+ input_size = 64;
+ new_mode = int_mode_for_size (input_size, 0).require ();
+ }
+ input = gen_reg_rtx (new_mode);
+ rtvec new_vec = rtvec_alloc (new_elements_n);
+ for (unsigned int i = 0; i < new_elements_n; ++i)
+ RTVEC_ELT (new_vec, i) = RTVEC_ELT (vec, i);
+
+ machine_mode merge_mode
+ = aarch64_simd_container_mode (element_mode, input_size);
+
+ rtx merge_subreg = simplify_gen_subreg (merge_mode, input,
+ new_mode, 0);
+ aarch64_expand_vector_init (merge_subreg,
+ gen_rtx_PARALLEL (merge_mode,
+ new_vec));
+ }
+ else
+ input = simplify_gen_subreg (new_mode, RTVEC_ELT (vec, 0),
+ element_mode, 0);
+ machine_mode sve_mode
+ = aarch64_full_sve_mode (new_mode).require ();
+
+ rtx target = simplify_gen_subreg (sve_mode, e.possible_target,
+ mode, 0);
+
+ expand_operand ops[2];
+ create_output_operand (&ops[0], target, sve_mode);
+ create_fixed_operand (&ops[1], input);
+ expand_insn (code_for_aarch64_vec_duplicate_reg (sve_mode), 2,
+ ops);
+ return e.possible_target;
+ }
+ }
+
+ rtx vq_reg = gen_reg_rtx (vq_mode);
aarch64_expand_vector_init (vq_reg, gen_rtx_PARALLEL (vq_mode, vec));
/* If the result is a boolean, compare the data vector against zero. */
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index bd60e65b0c3f05f1c931f03807170f3b9d699de5..a7d6041bcda03318ff10f6d425889801b9a8fa63 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2508,7 +2508,7 @@ (define_expand "vec_duplicate<mode>"
;; the scalar input gets spilled to memory during RA. We want to split
;; the load at the first opportunity in order to allow the PTRUE to be
;; optimized with surrounding code.
-(define_insn_and_split "*vec_duplicate<mode>_reg"
+(define_insn_and_split "@aarch64_vec_duplicate_reg_<mode>"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w, w, w")
(vec_duplicate:SVE_ALL
(match_operand:<VEL> 1 "aarch64_sve_dup_operand" "r, w, Uty")))
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f650abbc4ce49cf0947049931f86bad1130c3428..f5e66a43ec5d47e6f5d5540cb41fba0e0e9f92d6 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -301,7 +301,6 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
const_tree type,
int misalignment,
bool is_packed);
-static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
aarch64_addr_query_type);
static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
@@ -20502,7 +20501,7 @@ aarch64_vq_mode (scalar_mode mode)
/* Return appropriate SIMD container
for MODE within a vector of WIDTH bits. */
-static machine_mode
+machine_mode
aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
{
if (TARGET_SVE
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dup_opt.c b/gcc/testsuite/gcc.target/aarch64/sve/dup_opt.c
new file mode 100644
index 0000000000000000000000000000000000000000..66a1fcfb585b2c2b36a1344d4a33811257188dee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/dup_opt.c
@@ -0,0 +1,203 @@
+/* { dg-options { "-O2" } } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+#include <arm_sve.h>
+
+/*
+** float32_0:
+** ins v0.s\[1\], v1.s\[0\]
+** mov z0.d, d0
+** ret
+*/
+svfloat32_t float32_0(float x, float y)
+{
+ return svdupq_n_f32(x, y, x, y);
+}
+
+/*
+** float32_1:
+** mov z0.s, s0
+** ret
+*/
+
+svfloat32_t float32_1(float x)
+{
+ return svdupq_n_f32(x, x, x, x);
+}
+
+/*
+** float16_0:
+** ins v0.h\[1\], v1.h\[0\]
+** ins v0.h\[2\], v2.h\[0\]
+** ins v0.h\[3\], v3.h\[0\]
+** mov z0.d, d0
+** ret
+*/
+
+svfloat16_t float16_0 (float16_t a, float16_t b, float16_t c, float16_t d)
+{
+ return svdupq_n_f16 (a, b, c, d, a, b, c, d);
+}
+
+/*
+** float16_1:
+** dup v0.4h, v0.h\[0\]
+** ins v0.h\[1\], v1.h\[0\]
+** ins v0.h\[3\], v1.h\[0\]
+** mov z0.d, d0
+** ret
+*/
+
+svfloat16_t float16_1 (float16_t a, float16_t b)
+{
+ return svdupq_n_f16 (a, b, a, b, a, b, a, b);
+}
+
+/*
+** float16_2:
+** mov z0.h, h0
+** ret
+*/
+
+svfloat16_t float16_2 (float16_t a)
+{
+ return svdupq_n_f16 (a, a, a, a, a, a, a, a);
+}
+
+/*
+** int64_0:
+** mov z0.d, x0
+** ret
+*/
+
+svint64_t int64_0 (int64_t a)
+{
+ return svdupq_n_s64 (a, a);
+}
+
+/*
+** int32_0:
+** fmov d0, x0
+** ins v0.s\[1\], w1
+** mov z0.d, d0
+** ret
+*/
+
+svuint32_t int32_0(uint32_t a, uint32_t b) {
+ return svdupq_n_u32(a, b, a, b);
+}
+
+/*
+** int32_1:
+** mov z0.s, w0
+** ret
+*/
+
+svint32_t int32_1(int32_t a)
+{
+ return svdupq_n_s32(a, a, a, a);
+}
+
+/*
+** int16_0:
+** ...
+** fmov d0, x0
+** ins v0.h\[1\], w1
+** ins v0.h\[2\], w2
+** ins v0.h\[3\], w3
+** mov z0.d, d0
+** ret
+*/
+
+svint16_t int16_0(int16_t a, int16_t b, int16_t c, int16_t d)
+{
+ return svdupq_n_s16(a, b, c, d, a, b, c, d);
+}
+
+/*
+** int16_1:
+** dup v0.4h, w0
+** ins v0.h\[1\], w1
+** ins v0.h\[3\], w1
+** mov z0.d, d0
+** ret
+*/
+
+svuint16_t int16_1(uint16_t a, uint16_t b)
+{
+ return svdupq_n_u16(a, b, a, b, a, b, a, b);
+}
+
+/*
+** int16_2:
+** mov z0.h, w0
+** ret
+*/
+
+svint16_t int16_2(int16_t a)
+{
+ return svdupq_n_s16(a, a, a, a, a, a, a, a);
+}
+/*
+** int8_0:
+** ...
+** fmov d0, x0
+** ins v0.b\[1\], w1
+** ins v0.b\[2\], w2
+** ins v0.b\[3\], w3
+** ins v0.b\[4\], w4
+** ins v0.b\[5\], w5
+** ins v0.b\[6\], w6
+** ins v0.b\[7\], w7
+** mov z0.d, d0
+** ret
+*/
+
+svuint8_t int8_0(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h)
+{
+ return svdupq_n_u8(a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h);
+}
+
+/*
+** int8_1:
+** dup v0.8b, w0
+** ins v0.b\[1\], w1
+** ins v0.b\[2\], w2
+** ins v0.b\[3\], w3
+** ins v0.b\[5\], w1
+** ins v0.b\[6\], w2
+** ins v0.b\[7\], w3
+** mov z0.d, d0
+** ret
+*/
+
+svint8_t int8_1(int8_t a, int8_t b, int8_t c, int8_t d)
+{
+ return svdupq_n_s8(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d);
+}
+
+/*
+** int8_2:
+** dup v0.8b, w0
+** ins v0.b\[1\], w1
+** ins v0.b\[3\], w1
+** ins v0.b\[5\], w1
+** ins v0.b\[7\], w1
+** mov z0.d, d0
+** ret
+*/
+
+svint8_t int8_2(int8_t a, int8_t b)
+{
+ return svdupq_n_s8(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
+}
+
+/*
+** int8_3:
+** mov z0.b, w0
+** ret
+*/
+
+svint8_t int8_3(int8_t a)
+{
+ return svdupq_n_s8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a);
+}
* Re: [AArch64] Improve SVE dup intrinsics codegen
@ 2022-05-17 17:44 ` Richard Sandiford
From: Richard Sandiford @ 2022-05-17 17:44 UTC (permalink / raw)
To: Andre Vieira (lists); +Cc: gcc-patches, Kyrylo Tkachov
"Andre Vieira (lists)" <andre.simoesdiasvieira@arm.com> writes:
> Hi,
>
> This patch teaches the aarch64 backend to improve codegen for the SVE dup
> intrinsics when the NEON input vector has a repeating pattern. It will
> attempt to use a smaller NEON vector (or a single element) to limit the
> number of instructions needed to construct the input vector.
The new sequences definitely look like an improvement. However, this
change overlaps a bit with what Prathamesh is doing for PR96463.
Stepping back and thinking about how we handle this kind of thing
in general, it might make sense to do the following:
(1) Extend VEC_PERM_EXPR so that it can handle Advanced SIMD inputs
and SVE outputs (for constant permute indices). This is part of
what Prathamesh is doing.
(2a) Add a way for targets to expand such VEC_PERM_EXPRs when the
arguments are CONSTRUCTORs. This would only be useful for
variable-length vectors, since VEC_PERM_EXPRs of CONSTRUCTORs
should be folded to new CONSTRUCTORs for fixed-length vectors.
(2b) Generalise the SVE handling in aarch64_expand_vector_init
to cope with general rtx_vector_builders, rather than just
fixed-length ones, and use it to implement the new hook
added in (2a).
(3a) Use VEC_PERM_EXPRs of CONSTRUCTORs to simplify or replace the
duplicate_and_interleave stuff in SLP (think Richi would be glad
to see this go :-)).
(3b) Make svdupq_impl::fold() lower non-constant inputs to VEC_PERM_EXPRs
of CONSTRUCTORs.
with (3a) and (3b) being independent from each other.
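Roughly, and purely as an illustration of (3b) (a sketch in gimple-like
notation, not an existing dump, and the exact selector representation is an
assumption), svdupq_n_s32 (a, b, a, b) might lower to:

  _1 = {a_2(D), b_3(D), a_2(D), b_3(D)};
  z_4 = VEC_PERM_EXPR <_1, _1, { 0, 1, 2, 3, ... }>;

i.e. a fixed-length V4SI CONSTRUCTOR fed into a VEC_PERM_EXPR whose constant
variable-length selector repeats lanes 0..3 to fill the VNx4SI result;
(1)/(2a) would then be responsible for expanding that.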
The advantages of doing things this way are:
* autovectorised SLP code will benefit from the same tricks as svdupq.
* gimple optimisers get to work with the simplified svdupq form.
If you don't want to do that, or wait for it to happen, perhaps
we could short-circuit the process by doing (2b) on its own.
That is, create an interface like:
void aarch64_expand_vector_init (rtx target, rtx_vector_builder &builder);
Then have svdupq_impl::expand stuff the elements into an
rtx_vector_builder (a bit like svdupq_impl::fold does with a
tree_vector_builder when the elements are constant) and pass the
rtx_vector_builder to this new routine. Then aarch64_expand_vector_init
would be a home for all the optimisations, using the npatterns/
nelts_per_pattern information where useful. It would be good if
possible to integrate it with the existing SVE aarch64_expand_vector_init
code.
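To make that concrete, a minimal sketch of what svdupq_impl::expand might do
with such an interface could look like this (it assumes the proposed overload,
which does not exist yet; the builder encoding is a suggestion, and the
existing big-endian lane reversal and boolean-result handling are omitted for
brevity):

  rtx
  expand (function_expander &e) const
  {
    machine_mode mode = e.vector_mode (0);
    unsigned int elements_per_vq = e.args.length ();
    /* Describe the arguments as ELEMENTS_PER_VQ patterns of one element
       each, so that the builder itself encodes the "repeats every
       128 bits" structure.  */
    rtx_vector_builder builder (mode, elements_per_vq, 1);
    for (unsigned int i = 0; i < elements_per_vq; ++i)
      builder.quick_push (e.args[i]);
    /* Proposed routine from above: it would choose the construction
       strategy using the npatterns/nelts_per_pattern information.  */
    aarch64_expand_vector_init (e.possible_target, builder);
    return e.possible_target;
  }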
This would also make it easier to optimise:
svint8_t int8_2(int8_t a, int8_t b)
{
return svdupq_n_s8(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
}
to the expected 16-bit dup, even without V2QI being defined.
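(For the record, the hoped-for sequence there would be something along the
lines of the following; this is only an illustration, and the exact
instruction selection and big-endian handling are guesses:

  bfi     w0, w1, #8, #8
  mov     z0.h, w0
  ret

i.e. pack the two bytes into one 16-bit value and duplicate it across the .h
lanes, instead of the dup, four ins and mov z0.d, d0 that the testcase
currently expects.)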
Thanks,
Richard
> Bootstrapped and regression tested on aarch64-none-linux-gnu.
>
> Is this OK for trunk?
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64.cc (aarch64_simd_container_mode): Make
> it global.
> * config/aarch64/aarch64-protos.h
> (aarch64_simd_container_mode): Declare it.
> * config/aarch64/aarch64-sve.md (*vec_duplicate<mode>_reg):
> Rename this to ...
> (@aarch64_vec_duplicate_reg_<mode>): ... this.
> * config/aarch64/aarch64-sve-builtins-base.cc
> (svdupq_impl::expand): Improve codegen when the inputs form a
> repeating pattern.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/sve/dup_opt.c: New test.
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 2ac781dff4a93cbe0f0b091147b2521ed1a88750..cfc31b467cf1d3cd79b2dfe6a54e6910dd43b5d8 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -771,6 +771,7 @@ int aarch64_branch_cost (bool, bool);
> enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
> bool aarch64_advsimd_struct_mode_p (machine_mode mode);
> opt_machine_mode aarch64_vq_mode (scalar_mode);
> +machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
> opt_machine_mode aarch64_full_sve_mode (scalar_mode);
> bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
> bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index c24c05487246f529f81867d6429e636fd6dc74d0..f8b755a83dc37578363270618323f87c95fa327f 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -875,13 +875,98 @@ public:
> argument N to go into architectural lane N, whereas Advanced SIMD
> vectors are loaded memory lsb to register lsb. We therefore need
> to reverse the elements for big-endian targets. */
> - rtx vq_reg = gen_reg_rtx (vq_mode);
> rtvec vec = rtvec_alloc (elements_per_vq);
> for (unsigned int i = 0; i < elements_per_vq; ++i)
> {
> unsigned int argno = BYTES_BIG_ENDIAN ? elements_per_vq - i - 1 : i;
> RTVEC_ELT (vec, i) = e.args[argno];
> }
> +
> + /* Look for a repeating pattern in the 128-bit input as that potentially
> + simplifies constructing the input vector.
> + For example, codegen for svdupq_n_s32 (a, b, a, b), could be simplified
> + from:
> + dup v0.4s, w0
> + fmov s1, w1
> + ins v0.s[1], v1.s[0]
> + ins v0.s[3], v1.s[0]
> + dup z0.q, z0.q[0]
> + to:
> + fmov d0, x0
> + ins v0.s[1], w1
> + mov z0.d, d0
> + where we can see it uses a [a, b] input vector reducing the number of
> + needed instructions. */
> + if (elements_per_vq > 1 && mode == e.vector_mode(0))
> + {
> + unsigned int new_elements_n = elements_per_vq;
> + bool group = true;
> + while (group && new_elements_n > 1)
> + {
> + for (unsigned int i = 0; i < new_elements_n / 2; ++i)
> + {
> + if (rtx_equal_p (RTVEC_ELT (vec, i),
> + RTVEC_ELT (vec, new_elements_n / 2 + i)) == 0)
> + {
> + group = false;
> + break;
> + }
> + }
> + if (group)
> + new_elements_n /= 2;
> + }
> + /* We have found a repeating pattern smaller than 128-bits, so use that
> + instead. */
> + if (new_elements_n < elements_per_vq)
> + {
> + unsigned int input_size = 128 / elements_per_vq * new_elements_n;
> + scalar_mode new_mode
> + = int_mode_for_size (input_size, 0).require ();
> + rtx input;
> + if (new_elements_n > 1)
> + {
> + if (input_size < 64)
> + {
> + /* TODO: Remove this when support for 32- and 16-bit vectors
> + is added.
> + */
> + new_elements_n *= 64/input_size;
> + input_size = 64;
> + new_mode = int_mode_for_size (input_size, 0).require ();
> + }
> + input = gen_reg_rtx (new_mode);
> + rtvec new_vec = rtvec_alloc (new_elements_n);
> + for (unsigned int i = 0; i < new_elements_n; ++i)
> + RTVEC_ELT (new_vec, i) = RTVEC_ELT (vec, i);
> +
> + machine_mode merge_mode
> + = aarch64_simd_container_mode (element_mode, input_size);
> +
> + rtx merge_subreg = simplify_gen_subreg (merge_mode, input,
> + new_mode, 0);
> + aarch64_expand_vector_init (merge_subreg,
> + gen_rtx_PARALLEL (merge_mode,
> + new_vec));
> + }
> + else
> + input = simplify_gen_subreg (new_mode, RTVEC_ELT (vec, 0),
> + element_mode, 0);
> + machine_mode sve_mode
> + = aarch64_full_sve_mode (new_mode).require ();
> +
> + rtx target = simplify_gen_subreg (sve_mode, e.possible_target,
> + mode, 0);
> +
> + expand_operand ops[2];
> + create_output_operand (&ops[0], target, sve_mode);
> + create_fixed_operand (&ops[1], input);
> + expand_insn (code_for_aarch64_vec_duplicate_reg (sve_mode), 2,
> + ops);
> + return e.possible_target;
> + }
> + }
> +
> + rtx vq_reg = gen_reg_rtx (vq_mode);
> aarch64_expand_vector_init (vq_reg, gen_rtx_PARALLEL (vq_mode, vec));
>
> /* If the result is a boolean, compare the data vector against zero. */
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index bd60e65b0c3f05f1c931f03807170f3b9d699de5..a7d6041bcda03318ff10f6d425889801b9a8fa63 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -2508,7 +2508,7 @@ (define_expand "vec_duplicate<mode>"
> ;; the scalar input gets spilled to memory during RA. We want to split
> ;; the load at the first opportunity in order to allow the PTRUE to be
> ;; optimized with surrounding code.
> -(define_insn_and_split "*vec_duplicate<mode>_reg"
> +(define_insn_and_split "@aarch64_vec_duplicate_reg_<mode>"
> [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w, w")
> (vec_duplicate:SVE_ALL
> (match_operand:<VEL> 1 "aarch64_sve_dup_operand" "r, w, Uty")))
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index f650abbc4ce49cf0947049931f86bad1130c3428..f5e66a43ec5d47e6f5d5540cb41fba0e0e9f92d6 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -301,7 +301,6 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
> const_tree type,
> int misalignment,
> bool is_packed);
> -static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
> static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
> aarch64_addr_query_type);
> static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
> @@ -20502,7 +20501,7 @@ aarch64_vq_mode (scalar_mode mode)
>
> /* Return appropriate SIMD container
> for MODE within a vector of WIDTH bits. */
> -static machine_mode
> +machine_mode
> aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
> {
> if (TARGET_SVE
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dup_opt.c b/gcc/testsuite/gcc.target/aarch64/sve/dup_opt.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..66a1fcfb585b2c2b36a1344d4a33811257188dee
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/dup_opt.c
> @@ -0,0 +1,203 @@
> +/* { dg-options { "-O2" } } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +#include <arm_sve.h>
> +
> +/*
> +** float32_0:
> +** ins v0.s\[1\], v1.s\[0\]
> +** mov z0.d, d0
> +** ret
> +*/
> +svfloat32_t float32_0(float x, float y)
> +{
> + return svdupq_n_f32(x, y, x, y);
> +}
> +
> +/*
> +** float32_1:
> +** mov z0.s, s0
> +** ret
> +*/
> +
> +svfloat32_t float32_1(float x)
> +{
> + return svdupq_n_f32(x, x, x, x);
> +}
> +
> +/*
> +** float16_0:
> +** ins v0.h\[1\], v1.h\[0\]
> +** ins v0.h\[2\], v2.h\[0\]
> +** ins v0.h\[3\], v3.h\[0\]
> +** mov z0.d, d0
> +** ret
> +*/
> +
> +svfloat16_t float16_0 (float16_t a, float16_t b, float16_t c, float16_t d)
> +{
> + return svdupq_n_f16 (a, b, c, d, a, b, c, d);
> +}
> +
> +/*
> +** float16_1:
> +** dup v0.4h, v0.h\[0\]
> +** ins v0.h\[1\], v1.h\[0\]
> +** ins v0.h\[3\], v1.h\[0\]
> +** mov z0.d, d0
> +** ret
> +*/
> +
> +svfloat16_t float16_1 (float16_t a, float16_t b)
> +{
> + return svdupq_n_f16 (a, b, a, b, a, b, a, b);
> +}
> +
> +/*
> +** float16_2:
> +** mov z0.h, h0
> +** ret
> +*/
> +
> +svfloat16_t float16_2 (float16_t a)
> +{
> + return svdupq_n_f16 (a, a, a, a, a, a, a, a);
> +}
> +
> +/*
> +** int64_0:
> +** mov z0.d, x0
> +** ret
> +*/
> +
> +svint64_t int64_0 (int64_t a)
> +{
> + return svdupq_n_s64 (a, a);
> +}
> +
> +/*
> +** int32_0:
> +** fmov d0, x0
> +** ins v0.s\[1\], w1
> +** mov z0.d, d0
> +** ret
> +*/
> +
> +svuint32_t int32_0(uint32_t a, uint32_t b) {
> + return svdupq_n_u32(a, b, a, b);
> +}
> +
> +/*
> +** int32_1:
> +** mov z0.s, w0
> +** ret
> +*/
> +
> +svint32_t int32_1(int32_t a)
> +{
> + return svdupq_n_s32(a, a, a, a);
> +}
> +
> +/*
> +** int16_0:
> +** ...
> +** fmov d0, x0
> +** ins v0.h\[1\], w1
> +** ins v0.h\[2\], w2
> +** ins v0.h\[3\], w3
> +** mov z0.d, d0
> +** ret
> +*/
> +
> +svint16_t int16_0(int16_t a, int16_t b, int16_t c, int16_t d)
> +{
> + return svdupq_n_s16(a, b, c, d, a, b, c, d);
> +}
> +
> +/*
> +** int16_1:
> +** dup v0.4h, w0
> +** ins v0.h\[1\], w1
> +** ins v0.h\[3\], w1
> +** mov z0.d, d0
> +** ret
> +*/
> +
> +svuint16_t int16_1(uint16_t a, uint16_t b)
> +{
> + return svdupq_n_u16(a, b, a, b, a, b, a, b);
> +}
> +
> +/*
> +** int16_2:
> +** mov z0.h, w0
> +** ret
> +*/
> +
> +svint16_t int16_2(int16_t a)
> +{
> + return svdupq_n_s16(a, a, a, a, a, a, a, a);
> +}
> +/*
> +** int8_0:
> +** ...
> +** fmov d0, x0
> +** ins v0.b\[1\], w1
> +** ins v0.b\[2\], w2
> +** ins v0.b\[3\], w3
> +** ins v0.b\[4\], w4
> +** ins v0.b\[5\], w5
> +** ins v0.b\[6\], w6
> +** ins v0.b\[7\], w7
> +** mov z0.d, d0
> +** ret
> +*/
> +
> +svuint8_t int8_0(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h)
> +{
> + return svdupq_n_u8(a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h);
> +}
> +
> +/*
> +** int8_1:
> +** dup v0.8b, w0
> +** ins v0.b\[1\], w1
> +** ins v0.b\[2\], w2
> +** ins v0.b\[3\], w3
> +** ins v0.b\[5\], w1
> +** ins v0.b\[6\], w2
> +** ins v0.b\[7\], w3
> +** mov z0.d, d0
> +** ret
> +*/
> +
> +svint8_t int8_1(int8_t a, int8_t b, int8_t c, int8_t d)
> +{
> + return svdupq_n_s8(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d);
> +}
> +
> +/*
> +** int8_2:
> +** dup v0.8b, w0
> +** ins v0.b\[1\], w1
> +** ins v0.b\[3\], w1
> +** ins v0.b\[5\], w1
> +** ins v0.b\[7\], w1
> +** mov z0.d, d0
> +** ret
> +*/
> +
> +svint8_t int8_2(int8_t a, int8_t b)
> +{
> + return svdupq_n_s8(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
> +}
> +
> +/*
> +** int8_3:
> +** mov z0.b, w0
> +** ret
> +*/
> +
> +svint8_t int8_3(int8_t a)
> +{
> + return svdupq_n_s8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a);
> +}