public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/vendors/vrull/heads/slp-improvements)] aarch64: expand VEC_PERM into ins + uzp[12]
@ 2024-01-17 19:14 Philipp Tomsich
0 siblings, 0 replies; 5+ messages in thread
From: Philipp Tomsich @ 2024-01-17 19:14 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:3f1357f3759fe6fb67345452b31768a703c84893
commit 3f1357f3759fe6fb67345452b31768a703c84893
Author: Manolis Tsamis <manolis.tsamis@vrull.eu>
Date: Fri Nov 3 14:36:34 2023 +0100
aarch64: expand VEC_PERM into ins + uzp[12]
The AArch64 backend has specific strategies that can be used to expand
VEC_PERM expression (see aarch64_expand_vec_perm_const_1).
The last strategy applied if everything else fails is to use a tbl
instruction, which is known to have very bad latency and performance
(see aarch64_evpc_tbl). There are various improvements and additions
that can be done to reduce the harmful tbl instructions.
The existing mechanisms work for cases that the permute can be done
with a single existing AArch64 vector instruction, but for x264's
first loop we need some patterns that may need two vector
instructions.
On x264, this change results in the following change in instruction
distribution:
tbl: 8 -> 0
ldr: 10 -> 8 (due to the eliminated tbls)
ins: 8 -> 16
uzp: 8 -> 16
A reduction of the newly introduced ins/uzp[12] sequences will be
addressed in a follow-on change.
Ref #344
Diff:
---
gcc/config/aarch64/aarch64.cc | 76 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 76 insertions(+)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a5a6b52730d..7ae94091b7e 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25825,6 +25825,80 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
return true;
}
+/* Recognize patterns suitable for an INS + UZP.
+ This addresses limited permute optimizations before a more generic search
+ algorithm for two operator sequences is implemented. */
+static bool
+aarch64_evpc_ins_uzp (struct expand_vec_perm_d *d)
+{
+ machine_mode mode = d->vmode;
+
+ if (d->vec_flags != VEC_ADVSIMD || BYTES_BIG_ENDIAN)
+ return false;
+
+ unsigned HOST_WIDE_INT nelt = d->perm.length ().to_constant ();
+
+ if (nelt != 4
+ || !d->perm[0].is_constant()
+ || !d->perm[1].is_constant()
+ || !d->perm.series_p (0, 2, d->perm[0], 0)
+ || !d->perm.series_p (1, 2, d->perm[1], 0))
+ return false;
+
+ /* We have a {A, B, A, B} permutation. */
+ HOST_WIDE_INT A = d->perm[0].to_constant ();
+ HOST_WIDE_INT B = d->perm[1].to_constant ();
+
+ if (A >= nelt || B < nelt || d->op0 == d->op1)
+ return false;
+
+ rtx insv;
+ rtx extractv;
+ HOST_WIDE_INT idx, extractindex;
+
+ /* If A is the first element or B is the second element of a UZP1/2 then we
+ can emit this permute as INS + UZP.
+ if (A == 0 || A == 1)
+ {
+ insv = d->op0;
+ extractv = d->op1;
+ idx = A == 0 ? 2 : 3;
+ extractindex = B;
+ }
+ else if (B == nelt + 2 || B == nelt + 3)
+ {
+ insv = d->op1;
+ extractv = d->op0;
+ idx = B == nelt + 2 ? 0 : 1;
+ extractindex = A;
+ }
+ else
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ if (extractindex >= nelt)
+ extractindex -= nelt;
+ gcc_assert (extractindex < nelt);
+
+ /* Emit INS. */
+ insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
+ expand_operand ops[5];
+ create_output_operand (&ops[0], d->target, mode);
+ create_input_operand (&ops[1], insv, mode);
+ create_integer_operand (&ops[2], 1 << idx);
+ create_input_operand (&ops[3], extractv, mode);
+ create_integer_operand (&ops[4], extractindex);
+ expand_insn (icode, 5, ops);
+
+ /* Emit UZP. */
+ emit_set_insn (d->target, gen_rtx_UNSPEC (mode, gen_rtvec (2, d->target, d->target),
+ idx & 1 ? UNSPEC_UZP2 : UNSPEC_UZP1));
+
+ return true;
+}
+
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
@@ -25866,6 +25940,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true;
else if (aarch64_evpc_ins (d))
return true;
+ else if (aarch64_evpc_ins_uzp (d))
+ return true;
else if (aarch64_evpc_reencode (d))
return true;
^ permalink raw reply [flat|nested] 5+ messages in thread
* [gcc(refs/vendors/vrull/heads/slp-improvements)] aarch64: expand VEC_PERM into ins + uzp[12]
@ 2024-02-27 13:37 Philipp Tomsich
0 siblings, 0 replies; 5+ messages in thread
From: Philipp Tomsich @ 2024-02-27 13:37 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:ed7d62118f587c660491efa06e53ab003eef4a41
commit ed7d62118f587c660491efa06e53ab003eef4a41
Author: Manolis Tsamis <manolis.tsamis@vrull.eu>
Date: Fri Nov 3 14:36:34 2023 +0100
aarch64: expand VEC_PERM into ins + uzp[12]
The AArch64 backend has specific strategies that can be used to expand
VEC_PERM expression (see aarch64_expand_vec_perm_const_1).
The last strategy applied if everything else fails is to use a tbl
instruction, which is known to have very bad latency and performance
(see aarch64_evpc_tbl). There are various improvements and additions
that can be done to reduce the harmful tbl instructions.
The existing mechanisms work for cases that the permute can be done
with a single existing AArch64 vector instruction, but for x264's
first loop we need some patterns that may need two vector
instructions.
On x264, this change results in the following change in instruction
distribution:
tbl: 8 -> 0
ldr: 10 -> 8 (due to the eliminated tbls)
ins: 8 -> 16
uzp: 8 -> 16
A reduction of the newly introduced ins/uzp[12] sequences will be
addressed in a follow-on change.
Ref #344
Diff:
---
gcc/config/aarch64/aarch64.cc | 76 +++++++++++++++++++++++++++++
gcc/testsuite/gcc.target/aarch64/vins_uzp.c | 36 ++++++++++++++
2 files changed, 112 insertions(+)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 16318bf9258..a1c214a5104 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25960,6 +25960,80 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
return true;
}
+/* Recognize patterns suitable for an INS + UZP.
+ This addresses limited permute optimizations before a more generic search
+ algorithm for two operator sequences is implemented. */
+static bool
+aarch64_evpc_ins_uzp (struct expand_vec_perm_d *d)
+{
+ machine_mode mode = d->vmode;
+
+ if (d->vec_flags != VEC_ADVSIMD || BYTES_BIG_ENDIAN)
+ return false;
+
+ unsigned HOST_WIDE_INT nelt = d->perm.length ().to_constant ();
+
+ if (nelt != 4
+ || !d->perm[0].is_constant()
+ || !d->perm[1].is_constant()
+ || !d->perm.series_p (0, 2, d->perm[0], 0)
+ || !d->perm.series_p (1, 2, d->perm[1], 0))
+ return false;
+
+ /* We have a {A, B, A, B} permutation. */
+ unsigned HOST_WIDE_INT A = d->perm[0].to_constant ();
+ unsigned HOST_WIDE_INT B = d->perm[1].to_constant ();
+
+ if (A >= nelt || B < nelt || d->op0 == d->op1)
+ return false;
+
+ rtx insv;
+ rtx extractv;
+ unsigned HOST_WIDE_INT idx, extractindex;
+
+ /* If A is the first element or B is the second element of a UZP1/2 then we
+ can emit this permute as INS + UZP.
+ if (A == 0 || A == 1)
+ {
+ insv = d->op0;
+ extractv = d->op1;
+ idx = A == 0 ? 2 : 3;
+ extractindex = B;
+ }
+ else if (B == nelt + 2 || B == nelt + 3)
+ {
+ insv = d->op1;
+ extractv = d->op0;
+ idx = B == nelt + 2 ? 0 : 1;
+ extractindex = A;
+ }
+ else
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ if (extractindex >= nelt)
+ extractindex -= nelt;
+ gcc_assert (extractindex < nelt);
+
+ /* Emit INS. */
+ insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
+ expand_operand ops[5];
+ create_output_operand (&ops[0], d->target, mode);
+ create_input_operand (&ops[1], insv, mode);
+ create_integer_operand (&ops[2], 1 << idx);
+ create_input_operand (&ops[3], extractv, mode);
+ create_integer_operand (&ops[4], extractindex);
+ expand_insn (icode, 5, ops);
+
+ /* Emit UZP. */
+ emit_set_insn (d->target, gen_rtx_UNSPEC (mode, gen_rtvec (2, d->target, d->target),
+ idx & 1 ? UNSPEC_UZP2 : UNSPEC_UZP1));
+
+ return true;
+}
+
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
@@ -26001,6 +26075,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true;
else if (aarch64_evpc_ins (d))
return true;
+ else if (aarch64_evpc_ins_uzp (d))
+ return true;
else if (aarch64_evpc_reencode (d))
return true;
diff --git a/gcc/testsuite/gcc.target/aarch64/vins_uzp.c b/gcc/testsuite/gcc.target/aarch64/vins_uzp.c
new file mode 100644
index 00000000000..d82d1f43c15
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vins_uzp.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target aarch64_little_endian } */
+
+typedef int v4si __attribute__ ((vector_size (4 * sizeof (int))));
+
+v4si case1(v4si a, v4si b) {
+ return __builtin_shufflevector (a, b, 0, 5, 0, 5);
+}
+
+v4si case2(v4si a, v4si b) {
+ return __builtin_shufflevector (a, b, 1, 5, 1, 5);
+}
+
+v4si case3(v4si a, v4si b) {
+ return __builtin_shufflevector (a, b, 0, 6, 0, 6);
+}
+
+v4si case4(v4si a, v4si b) {
+ return __builtin_shufflevector (a, b, 1, 7, 1, 7);
+}
+
+v4si case5(v4si a, v4si b) {
+ return __builtin_shufflevector (a, b, 2, 7, 2, 7);
+}
+
+v4si case6(v4si a, v4si b) {
+ return __builtin_shufflevector (b, a, 2, 7, 2, 7);
+}
+
+v4si case7(v4si a, v4si b) {
+ return __builtin_shufflevector (a, b, 7, 2, 7, 2);
+}
+
+/* { dg-final { scan-assembler-not {\ttbl\t} } } */
+/* { dg-final { scan-assembler-not {\tldr\t} } } */
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [gcc(refs/vendors/vrull/heads/slp-improvements)] aarch64: expand VEC_PERM into ins + uzp[12]
2024-01-23 20:57 Philipp Tomsich
@ 2024-01-23 23:50 ` Andrew Pinski
0 siblings, 0 replies; 5+ messages in thread
From: Andrew Pinski @ 2024-01-23 23:50 UTC (permalink / raw)
To: Philipp Tomsich, Andrew Pinski (QUIC); +Cc: gcc-cvs
On Tue, Jan 23, 2024 at 12:57 PM Philipp Tomsich via Gcc-cvs
<gcc-cvs@gcc.gnu.org> wrote:
>
> https://gcc.gnu.org/g:d61be742513b5b8529ab9ef4022011c471925622
>
> commit d61be742513b5b8529ab9ef4022011c471925622
> Author: Manolis Tsamis <manolis.tsamis@vrull.eu>
> Date: Fri Nov 3 14:36:34 2023 +0100
>
> aarch64: expand VEC_PERM into ins + uzp[12]
>
> The AArch64 backend has specific strategies that can be used to expand
> VEC_PERM expression (see aarch64_expand_vec_perm_const_1).
>
> The last strategy applied if everything else fails is to use a tbl
> instruction, which is known to have very bad latency and performance
> (see aarch64_evpc_tbl). There are various improvements and additions
> that can be done to reduce the harmful tbl instructions.
Actually NOT all cores have very bad performance with TBL. This
definitely needs to be tunable.
Thanks,
Andrew
>
> The existing mechanisms work for cases that the permute can be done
> with a single existing AArch64 vector instruction, but for x264's
> first loop we need some patterns that may need two vector
> instructions.
>
> On x264, this change results in the following change in instruction
> distribution:
> tbl: 8 -> 0
> ldr: 10 -> 8 (due to the eliminated tbls)
> ins: 8 -> 16
> uzp: 8 -> 16
> A reduction of the newly introduced ins/uzp[12] sequences will be
> addressed in a follow-on change.
>
> Ref #344
>
> Diff:
> ---
> gcc/config/aarch64/aarch64.cc | 76 +++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 76 insertions(+)
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index e6bd3fd0bb4..0f2423ef7de 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25890,6 +25890,80 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
> return true;
> }
>
> +/* Recognize patterns suitable for an INS + UZP.
> + This addresses limited permute optimizations before a more generic search
> + algorithm for two operator sequences is implemented. */
> +static bool
> +aarch64_evpc_ins_uzp (struct expand_vec_perm_d *d)
> +{
> + machine_mode mode = d->vmode;
> +
> + if (d->vec_flags != VEC_ADVSIMD || BYTES_BIG_ENDIAN)
> + return false;
> +
> + unsigned HOST_WIDE_INT nelt = d->perm.length ().to_constant ();
> +
> + if (nelt != 4
> + || !d->perm[0].is_constant()
> + || !d->perm[1].is_constant()
> + || !d->perm.series_p (0, 2, d->perm[0], 0)
> + || !d->perm.series_p (1, 2, d->perm[1], 0))
> + return false;
> +
> + /* We have a {A, B, A, B} permutation. */
> + HOST_WIDE_INT A = d->perm[0].to_constant ();
> + HOST_WIDE_INT B = d->perm[1].to_constant ();
> +
> + if (A >= nelt || B < nelt || d->op0 == d->op1)
> + return false;
> +
> + rtx insv;
> + rtx extractv;
> + HOST_WIDE_INT idx, extractindex;
> +
> + /* If A is the first element or B is the second element of a UZP1/2 then we
> + can emit this permute as INS + UZP.
> + if (A == 0 || A == 1)
> + {
> + insv = d->op0;
> + extractv = d->op1;
> + idx = A == 0 ? 2 : 3;
> + extractindex = B;
> + }
> + else if (B == nelt + 2 || B == nelt + 3)
> + {
> + insv = d->op1;
> + extractv = d->op0;
> + idx = B == nelt + 2 ? 0 : 1;
> + extractindex = A;
> + }
> + else
> + return false;
> +
> + if (d->testing_p)
> + return true;
> +
> + if (extractindex >= nelt)
> + extractindex -= nelt;
> + gcc_assert (extractindex < nelt);
> +
> + /* Emit INS. */
> + insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
> + expand_operand ops[5];
> + create_output_operand (&ops[0], d->target, mode);
> + create_input_operand (&ops[1], insv, mode);
> + create_integer_operand (&ops[2], 1 << idx);
> + create_input_operand (&ops[3], extractv, mode);
> + create_integer_operand (&ops[4], extractindex);
> + expand_insn (icode, 5, ops);
> +
> + /* Emit UZP. */
> + emit_set_insn (d->target, gen_rtx_UNSPEC (mode, gen_rtvec (2, d->target, d->target),
> + idx & 1 ? UNSPEC_UZP2 : UNSPEC_UZP1));
> +
> + return true;
> +}
> +
> static bool
> aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> {
> @@ -25931,6 +26005,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> return true;
> else if (aarch64_evpc_ins (d))
> return true;
> + else if (aarch64_evpc_ins_uzp (d))
> + return true;
> else if (aarch64_evpc_reencode (d))
> return true;
^ permalink raw reply [flat|nested] 5+ messages in thread
* [gcc(refs/vendors/vrull/heads/slp-improvements)] aarch64: expand VEC_PERM into ins + uzp[12]
@ 2024-01-23 20:57 Philipp Tomsich
2024-01-23 23:50 ` Andrew Pinski
0 siblings, 1 reply; 5+ messages in thread
From: Philipp Tomsich @ 2024-01-23 20:57 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:d61be742513b5b8529ab9ef4022011c471925622
commit d61be742513b5b8529ab9ef4022011c471925622
Author: Manolis Tsamis <manolis.tsamis@vrull.eu>
Date: Fri Nov 3 14:36:34 2023 +0100
aarch64: expand VEC_PERM into ins + uzp[12]
The AArch64 backend has specific strategies that can be used to expand
VEC_PERM expression (see aarch64_expand_vec_perm_const_1).
The last strategy applied if everything else fails is to use a tbl
instruction, which is known to have very bad latency and performance
(see aarch64_evpc_tbl). There are various improvements and additions
that can be done to reduce the harmful tbl instructions.
The existing mechanisms work for cases that the permute can be done
with a single existing AArch64 vector instruction, but for x264's
first loop we need some patterns that may need two vector
instructions.
On x264, this change results in the following change in instruction
distribution:
tbl: 8 -> 0
ldr: 10 -> 8 (due to the eliminated tbls)
ins: 8 -> 16
uzp: 8 -> 16
A reduction of the newly introduced ins/uzp[12] sequences will be
addressed in a follow-on change.
Ref #344
Diff:
---
gcc/config/aarch64/aarch64.cc | 76 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 76 insertions(+)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index e6bd3fd0bb4..0f2423ef7de 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25890,6 +25890,80 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
return true;
}
+/* Recognize patterns suitable for an INS + UZP.
+ This addresses limited permute optimizations before a more generic search
+ algorithm for two operator sequences is implemented. */
+static bool
+aarch64_evpc_ins_uzp (struct expand_vec_perm_d *d)
+{
+ machine_mode mode = d->vmode;
+
+ if (d->vec_flags != VEC_ADVSIMD || BYTES_BIG_ENDIAN)
+ return false;
+
+ unsigned HOST_WIDE_INT nelt = d->perm.length ().to_constant ();
+
+ if (nelt != 4
+ || !d->perm[0].is_constant()
+ || !d->perm[1].is_constant()
+ || !d->perm.series_p (0, 2, d->perm[0], 0)
+ || !d->perm.series_p (1, 2, d->perm[1], 0))
+ return false;
+
+ /* We have a {A, B, A, B} permutation. */
+ HOST_WIDE_INT A = d->perm[0].to_constant ();
+ HOST_WIDE_INT B = d->perm[1].to_constant ();
+
+ if (A >= nelt || B < nelt || d->op0 == d->op1)
+ return false;
+
+ rtx insv;
+ rtx extractv;
+ HOST_WIDE_INT idx, extractindex;
+
+ /* If A is the first element or B is the second element of a UZP1/2 then we
+ can emit this permute as INS + UZP.
+ if (A == 0 || A == 1)
+ {
+ insv = d->op0;
+ extractv = d->op1;
+ idx = A == 0 ? 2 : 3;
+ extractindex = B;
+ }
+ else if (B == nelt + 2 || B == nelt + 3)
+ {
+ insv = d->op1;
+ extractv = d->op0;
+ idx = B == nelt + 2 ? 0 : 1;
+ extractindex = A;
+ }
+ else
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ if (extractindex >= nelt)
+ extractindex -= nelt;
+ gcc_assert (extractindex < nelt);
+
+ /* Emit INS. */
+ insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
+ expand_operand ops[5];
+ create_output_operand (&ops[0], d->target, mode);
+ create_input_operand (&ops[1], insv, mode);
+ create_integer_operand (&ops[2], 1 << idx);
+ create_input_operand (&ops[3], extractv, mode);
+ create_integer_operand (&ops[4], extractindex);
+ expand_insn (icode, 5, ops);
+
+ /* Emit UZP. */
+ emit_set_insn (d->target, gen_rtx_UNSPEC (mode, gen_rtvec (2, d->target, d->target),
+ idx & 1 ? UNSPEC_UZP2 : UNSPEC_UZP1));
+
+ return true;
+}
+
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
@@ -25931,6 +26005,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true;
else if (aarch64_evpc_ins (d))
return true;
+ else if (aarch64_evpc_ins_uzp (d))
+ return true;
else if (aarch64_evpc_reencode (d))
return true;
^ permalink raw reply [flat|nested] 5+ messages in thread
* [gcc(refs/vendors/vrull/heads/slp-improvements)] aarch64: expand VEC_PERM into ins + uzp[12]
@ 2023-11-28 13:35 Philipp Tomsich
0 siblings, 0 replies; 5+ messages in thread
From: Philipp Tomsich @ 2023-11-28 13:35 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:527c082ecad2383d022857e5b50d3fba8705cbe6
commit 527c082ecad2383d022857e5b50d3fba8705cbe6
Author: Manolis Tsamis <manolis.tsamis@vrull.eu>
Date: Fri Nov 3 14:36:34 2023 +0100
aarch64: expand VEC_PERM into ins + uzp[12]
The AArch64 backend has specific strategies that can be used to expand
VEC_PERM expression (see aarch64_expand_vec_perm_const_1).
The last strategy applied if everything else fails is to use a tbl
instruction, which is known to have very bad latency and performance
(see aarch64_evpc_tbl). There are various improvements and additions
that can be done to reduce the harmful tbl instructions.
The existing mechanisms work for cases that the permute can be done
with a single existing AArch64 vector instruction, but for x264's
first loop we need some patterns that may need two vector
instructions.
On x264, this change results in the following change in instruction
distribution:
tbl: 8 -> 0
ldr: 10 -> 8 (due to the eliminated tbls)
ins: 8 -> 16
uzp: 8 -> 16
A reduction of the newly introduced ins/uzp[12] sequences will be
addressed in a follow-on change.
Ref #344
Diff:
---
gcc/config/aarch64/aarch64.cc | 76 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 76 insertions(+)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f6f6f94bf43..dc89c8fad30 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22548,6 +22548,80 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
return true;
}
+/* Recognize patterns suitable for an INS + UZP.
+ This addresses limited permute optimizations before a more generic search
+ algorithm for two operator sequences is implemented. */
+static bool
+aarch64_evpc_ins_uzp (struct expand_vec_perm_d *d)
+{
+ machine_mode mode = d->vmode;
+
+ if (d->vec_flags != VEC_ADVSIMD || BYTES_BIG_ENDIAN)
+ return false;
+
+ unsigned HOST_WIDE_INT nelt = d->perm.length ().to_constant ();
+
+ if (nelt != 4
+ || !d->perm[0].is_constant()
+ || !d->perm[1].is_constant()
+ || !d->perm.series_p (0, 2, d->perm[0], 0)
+ || !d->perm.series_p (1, 2, d->perm[1], 0))
+ return false;
+
+ /* We have a {A, B, A, B} permutation. */
+ HOST_WIDE_INT A = d->perm[0].to_constant ();
+ HOST_WIDE_INT B = d->perm[1].to_constant ();
+
+ if (A >= nelt || B < nelt || d->op0 == d->op1)
+ return false;
+
+ rtx insv;
+ rtx extractv;
+ HOST_WIDE_INT idx, extractindex;
+
+ /* If A is the first element or B is the second element of a UZP1/2 then we
+ can emit this permute as INS + UZP.
+ if (A == 0 || A == 1)
+ {
+ insv = d->op0;
+ extractv = d->op1;
+ idx = A == 0 ? 2 : 3;
+ extractindex = B;
+ }
+ else if (B == nelt + 2 || B == nelt + 3)
+ {
+ insv = d->op1;
+ extractv = d->op0;
+ idx = B == nelt + 2 ? 0 : 1;
+ extractindex = A;
+ }
+ else
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ if (extractindex >= nelt)
+ extractindex -= nelt;
+ gcc_assert (extractindex < nelt);
+
+ /* Emit INS. */
+ insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
+ expand_operand ops[5];
+ create_output_operand (&ops[0], d->target, mode);
+ create_input_operand (&ops[1], insv, mode);
+ create_integer_operand (&ops[2], 1 << idx);
+ create_input_operand (&ops[3], extractv, mode);
+ create_integer_operand (&ops[4], extractindex);
+ expand_insn (icode, 5, ops);
+
+ /* Emit UZP. */
+ emit_set_insn (d->target, gen_rtx_UNSPEC (mode, gen_rtvec (2, d->target, d->target),
+ idx & 1 ? UNSPEC_UZP2 : UNSPEC_UZP1));
+
+ return true;
+}
+
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
@@ -22589,6 +22663,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true;
else if (aarch64_evpc_ins (d))
return true;
+ else if (aarch64_evpc_ins_uzp (d))
+ return true;
else if (aarch64_evpc_reencode (d))
return true;
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2024-02-27 13:37 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-17 19:14 [gcc(refs/vendors/vrull/heads/slp-improvements)] aarch64: expand VEC_PERM into ins + uzp[12] Philipp Tomsich
-- strict thread matches above, loose matches on Subject: below --
2024-02-27 13:37 Philipp Tomsich
2024-01-23 20:57 Philipp Tomsich
2024-01-23 23:50 ` Andrew Pinski
2023-11-28 13:35 Philipp Tomsich
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).