* [PATCH] [x86] Support 2-instruction vector shuffle for V4SI/V4SF in ix86_expand_vec_perm_const_1.
@ 2022-09-23 6:42 liuhongt
2022-09-23 6:53 ` Jakub Jelinek
0 siblings, 1 reply; 3+ messages in thread
From: liuhongt @ 2022-09-23 6:42 UTC (permalink / raw)
To: gcc-patches; +Cc: crazylht, hjl.tools
x86 has shufps, which selects two elements from the first operand into the
lower 64 bits of the result and two from the second operand into the upper
64 bits. Currently __builtin_shufflevector (op0, op1, 1, 4, 3, 6) is
veclowered, since can_vec_perm_const_p returns false for an sse2 target.
This patch adds a new function to support 2-operand v4si/v4sf
vector shuffles with arbitrary indices for sse2.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?
2022-09-23 Hongtao Liu <hongtao.liu@intel.com>
Liwei Xu <liwei.xu@intel.com>
gcc/ChangeLog:
PR target/53346
* config/i386/i386-expand.cc (expand_vec_perm_shufps_shufps):
New function.
(ix86_expand_vec_perm_const_1): Insert
expand_vec_perm_shufps_shufps at the end of 2-instruction
expand sequence.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr53346-1.c: New test.
* gcc.target/i386/pr53346-2.c: New test.
---
gcc/config/i386/i386-expand.cc | 117 ++++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr53346-1.c | 70 +++++++++++++
gcc/testsuite/gcc.target/i386/pr53346-2.c | 59 +++++++++++
gcc/testsuite/gcc.target/i386/pr53346-3.c | 69 +++++++++++++
gcc/testsuite/gcc.target/i386/pr53346-4.c | 59 +++++++++++
5 files changed, 374 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-4.c
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 5334363e235..43c58111a62 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -19604,6 +19604,120 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
return false;
}
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
+ in terms of a pair of shufps+ shufps/pshufd instructions. */
+static bool
+expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
+{
+ unsigned char perm1[4];
+ machine_mode vmode = d->vmode;
+ bool ok;
+ unsigned i, j, k, count = 0;
+
+ if (d->one_operand_p
+ || (vmode != V4SImode && vmode != V4SFmode))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ for (i = 0; i < 4; ++i)
+ count += d->perm[i] > 3 ? 1 : 0;
+
+ gcc_assert(count & 3);
+
+ rtx tmp = gen_reg_rtx (vmode);
+ /* 2 from op0 and 2 from op1. */
+ if (count == 2)
+ {
+ unsigned char perm2[4];
+ for (i = 0, j = 0, k = 2; i < 4; ++i)
+ if (d->perm[i] & 4)
+ {
+ perm1[k++] = d->perm[i];
+ perm2[i] = k - 1;
+ }
+ else
+ {
+ perm1[j++] = d->perm[i];
+ perm2[i] = j - 1;
+ }
+
+ /* shufps. */
+ ok = expand_vselect_vconcat(tmp, d->op0, d->op1,
+ perm1, d->nelt, false);
+ gcc_assert (ok);
+ if (vmode == V4SImode && TARGET_SSE2)
+ /* pshufd. */
+ ok = expand_vselect (d->target, tmp,
+ perm2, d->nelt, false);
+ else
+ {
+ /* shufps. */
+ perm2[2] += 4;
+ perm2[3] += 4;
+ ok = expand_vselect_vconcat (d->target, tmp, tmp,
+ perm2, d->nelt, false);
+ }
+ gcc_assert (ok);
+ }
+ /* 3 from one op and 1 from another. */
+ else
+ {
+ unsigned pair_idx = 8, lone_idx = 8, shift;
+
+ /* Find the lone index. */
+ for (i = 0; i < 4; ++i)
+ if ((d->perm[i] > 3 && count == 1)
+ || (d->perm[i] < 4 && count == 3))
+ lone_idx = i;
+
+ /* When lone_idx is not 0, it must from second op(count == 1). */
+ gcc_assert ((lone_idx == 0 && count == 3)
+ || (lone_idx != 0 && count == 1));
+
+ /* Find the pair index that sits in the same half as the lone index. */
+ shift = lone_idx & 2;
+ pair_idx = 1 - lone_idx + 2 * shift;
+
+ /* First permutate lone index and pair index into the same vector as
+ [ lone, lone, pair, pair ]. */
+ perm1[1] = perm1[0]
+ = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
+ perm1[3] = perm1[2]
+ = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
+
+ /* Alway put the vector contains lone indx at the first. */
+ if (count == 1)
+ std::swap (d->op0, d->op1);
+
+ /* shufps. */
+ ok = expand_vselect_vconcat(tmp, d->op0, d->op1,
+ perm1, d->nelt, false);
+ gcc_assert (ok);
+
+ /* Refine lone and pair index to original order. */
+ perm1[shift] = lone_idx << 1;
+ perm1[shift + 1] = pair_idx << 1;
+
+ /* Select the remaining 2 elements in another vector. */
+ for (i = 2 - shift; i < 4 - shift; ++i)
+ perm1[i] = (lone_idx == 1) ? (d->perm[i] + 4) : d->perm[i];
+
+ /* Adjust to original selector. */
+ if (lone_idx > 1)
+ std::swap (tmp, d->op1);
+
+ /* shufps. */
+ ok = expand_vselect_vconcat(d->target, tmp, d->op1,
+ perm1, d->nelt, false);
+
+ gcc_assert (ok);
+ }
+
+ return true;
+}
+
/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of a pair of pshuflw + pshufhw instructions. */
@@ -22152,6 +22266,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_2perm_pblendv (d, true))
return true;
+ if (expand_vec_perm_shufps_shufps (d))
+ return true;
+
/* Try sequences of three instructions. */
if (expand_vec_perm_even_odd_pack (d))
diff --git a/gcc/testsuite/gcc.target/i386/pr53346-1.c b/gcc/testsuite/gcc.target/i386/pr53346-1.c
new file mode 100644
index 00000000000..6d230da632c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53346-1.c
@@ -0,0 +1,70 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2 -mno-sse3" } */
+/* { dg-final { scan-assembler-times "shufps" 15 } } */
+/* { dg-final { scan-assembler-times "pshufd" 2 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+
+v4si
+__attribute__((noipa))
+foo (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 1, 2, 5, 3);
+}
+
+v4si
+__attribute__((noipa))
+foo1 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 1, 5, 2, 3);
+}
+
+v4si
+__attribute__((noipa))
+foo2 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 1, 2, 3, 5);
+}
+
+v4si
+__attribute__((noipa))
+foo3 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 1, 4, 5, 6);
+}
+
+v4si
+__attribute__((noipa))
+foo4 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 3, 6, 7, 5);
+}
+
+v4si
+__attribute__((noipa))
+foo5 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 2, 4, 7, 6);
+}
+
+v4si
+__attribute__((noipa))
+foo6 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 2, 4, 3, 6);
+}
+
+v4si
+__attribute__((noipa))
+foo7 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 2, 3, 4, 6);
+}
+
+v4si
+__attribute__((noipa))
+foo8 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 2, 4, 6, 3);
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr53346-2.c b/gcc/testsuite/gcc.target/i386/pr53346-2.c
new file mode 100644
index 00000000000..0c6c7b35e01
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53346-2.c
@@ -0,0 +1,59 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+#include "sse2-check.h"
+#include "pr53346-1.c"
+
+static void
+sse2_test ()
+{
+ v4si a = __extension__(v4si) { 0, 1, 2, 3 };
+ v4si b = __extension__(v4si) { 4, 5, 6, 7 };
+ v4si exp = __extension__(v4si) { 1, 2, 5, 3 };
+ v4si dest;
+ dest = foo (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 1, 5, 2, 3 };
+ dest = foo1 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 1, 2, 3, 5 };
+ dest = foo2 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 1, 4, 5, 6 };
+ dest = foo3 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 3, 6, 7, 5 };
+ dest = foo4 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 2, 4, 7, 6 };
+ dest = foo5 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 2, 4, 3, 6 };
+ dest = foo6 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 2, 3, 4, 6 };
+ dest = foo7 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 2, 4, 6, 3 };
+ dest = foo8 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr53346-3.c b/gcc/testsuite/gcc.target/i386/pr53346-3.c
new file mode 100644
index 00000000000..0b204f6f210
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53346-3.c
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2 -mno-sse3" } */
+/* { dg-final { scan-assembler-times "shufps" 17 } } */
+
+typedef float v4sf __attribute__((vector_size(16)));
+
+v4sf
+__attribute__((noipa))
+foo (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 1, 2, 5, 3);
+}
+
+v4sf
+__attribute__((noipa))
+foo1 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 1, 5, 2, 3);
+}
+
+v4sf
+__attribute__((noipa))
+foo2 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 1, 2, 3, 5);
+}
+
+v4sf
+__attribute__((noipa))
+foo3 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 1, 4, 5, 6);
+}
+
+v4sf
+__attribute__((noipa))
+foo4 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 3, 6, 7, 5);
+}
+
+v4sf
+__attribute__((noipa))
+foo5 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 2, 4, 7, 6);
+}
+
+v4sf
+__attribute__((noipa))
+foo6 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 2, 4, 3, 6);
+}
+
+v4sf
+__attribute__((noipa))
+foo7 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 2, 3, 4, 6);
+}
+
+v4sf
+__attribute__((noipa))
+foo8 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 2, 4, 6, 3);
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr53346-4.c b/gcc/testsuite/gcc.target/i386/pr53346-4.c
new file mode 100644
index 00000000000..9e4e45bd584
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53346-4.c
@@ -0,0 +1,59 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+#include "sse2-check.h"
+#include "pr53346-3.c"
+
+static void
+sse2_test ()
+{
+ v4sf a = __extension__(v4sf) { 0, 1, 2, 3 };
+ v4sf b = __extension__(v4sf) { 4, 5, 6, 7 };
+ v4sf exp = __extension__(v4sf) { 1, 2, 5, 3 };
+ v4sf dest;
+ dest = foo (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 1, 5, 2, 3 };
+ dest = foo1 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 1, 2, 3, 5 };
+ dest = foo2 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 1, 4, 5, 6 };
+ dest = foo3 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 3, 6, 7, 5 };
+ dest = foo4 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 2, 4, 7, 6 };
+ dest = foo5 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 2, 4, 3, 6 };
+ dest = foo6 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 2, 3, 4, 6 };
+ dest = foo7 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 2, 4, 6, 3 };
+ dest = foo8 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+}
--
2.27.0
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] [x86] Support 2-instruction vector shuffle for V4SI/V4SF in ix86_expand_vec_perm_const_1.
2022-09-23 6:42 [PATCH] [x86] Support 2-instruction vector shuffle for V4SI/V4SF in ix86_expand_vec_perm_const_1 liuhongt
@ 2022-09-23 6:53 ` Jakub Jelinek
2022-09-26 3:14 ` liuhongt
0 siblings, 1 reply; 3+ messages in thread
From: Jakub Jelinek @ 2022-09-23 6:53 UTC (permalink / raw)
To: liuhongt; +Cc: gcc-patches
On Fri, Sep 23, 2022 at 02:42:54PM +0800, liuhongt via Gcc-patches wrote:
> 2022-09-23 Hongtao Liu <hongtao.liu@intel.com>
> Liwei Xu <liwei.xu@intel.com>
>
> gcc/ChangeLog:
>
> PR target/53346
> * config/i386/i386-expand.cc (expand_vec_perm_shufps_shufps):
> New function.
> (ix86_expand_vec_perm_const_1): Insert
> expand_vec_perm_shufps_shufps at the end of 2-instruction
> expand sequence.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr53346-1.c: New test.
> * gcc.target/i386/pr53346-2.c: New test.
> ---
> gcc/config/i386/i386-expand.cc | 117 ++++++++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr53346-1.c | 70 +++++++++++++
> gcc/testsuite/gcc.target/i386/pr53346-2.c | 59 +++++++++++
> gcc/testsuite/gcc.target/i386/pr53346-3.c | 69 +++++++++++++
> gcc/testsuite/gcc.target/i386/pr53346-4.c | 59 +++++++++++
> 5 files changed, 374 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-2.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-3.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-4.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 5334363e235..43c58111a62 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -19604,6 +19604,120 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
> return false;
> }
>
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
> + in terms of a pair of shufps+ shufps/pshufd instructions. */
> +static bool
> +expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
> +{
> + unsigned char perm1[4];
> + machine_mode vmode = d->vmode;
> + bool ok;
> + unsigned i, j, k, count = 0;
> +
> + if (d->one_operand_p
> + || (vmode != V4SImode && vmode != V4SFmode))
> + return false;
> +
> + if (d->testing_p)
> + return true;
> +
> + for (i = 0; i < 4; ++i)
> + count += d->perm[i] > 3 ? 1 : 0;
> +
> + gcc_assert(count & 3);
Missing space before (
> + /* shufps. */
> + ok = expand_vselect_vconcat(tmp, d->op0, d->op1,
> + perm1, d->nelt, false);
Ditto.
> + /* When lone_idx is not 0, it must from second op(count == 1). */
> + gcc_assert ((lone_idx == 0 && count == 3)
> + || (lone_idx != 0 && count == 1));
Perhaps write it more simply as
gcc_assert (count == (lone_idx ? 1 : 3));
?
> + /* shufps. */
> + ok = expand_vselect_vconcat(tmp, d->op0, d->op1,
> + perm1, d->nelt, false);
Missing space before (
> + gcc_assert (ok);
> +
> + /* Refine lone and pair index to original order. */
> + perm1[shift] = lone_idx << 1;
> + perm1[shift + 1] = pair_idx << 1;
> +
> + /* Select the remaining 2 elements in another vector. */
> + for (i = 2 - shift; i < 4 - shift; ++i)
> + perm1[i] = (lone_idx == 1) ? (d->perm[i] + 4) : d->perm[i];
All the ()s in the above line aren't needed.
> + /* shufps. */
> + ok = expand_vselect_vconcat(d->target, tmp, d->op1,
> + perm1, d->nelt, false);
Again, missing space
Otherwise LGTM
Jakub
^ permalink raw reply [flat|nested] 3+ messages in thread
* [PATCH] [x86] Support 2-instruction vector shuffle for V4SI/V4SF in ix86_expand_vec_perm_const_1.
2022-09-23 6:53 ` Jakub Jelinek
@ 2022-09-26 3:14 ` liuhongt
0 siblings, 0 replies; 3+ messages in thread
From: liuhongt @ 2022-09-26 3:14 UTC (permalink / raw)
To: gcc-patches; +Cc: crazylht, hjl.tools
>Missing space before (
Changed.
>> + /* shufps. */
>> + ok = expand_vselect_vconcat(tmp, d->op0, d->op1,
>> + perm1, d->nelt, false);
>
>Ditto.
Changed.
>
>> + /* When lone_idx is not 0, it must from second op(count == 1). */
>> + gcc_assert ((lone_idx == 0 && count == 3)
>> + || (lone_idx != 0 && count == 1));
>
>Perhaps write it more simply as
> gcc_assert (count == (lone_idx ? 1 : 3));
>?
Changed.
>
>> + /* shufps. */
>> + ok = expand_vselect_vconcat(tmp, d->op0, d->op1,
>> + perm1, d->nelt, false);
>
>Missing space before (
>
Changed.
>> + gcc_assert (ok);
>> +
>> + /* Refine lone and pair index to original order. */
>> + perm1[shift] = lone_idx << 1;
>> + perm1[shift + 1] = pair_idx << 1;
>> +
>> + /* Select the remaining 2 elements in another vector. */
>> + for (i = 2 - shift; i < 4 - shift; ++i)
>> + perm1[i] = (lone_idx == 1) ? (d->perm[i] + 4) : d->perm[i];
>
>All the ()s in the above line aren't needed.
>
Changed.
>> + /* shufps. */
>> + ok = expand_vselect_vconcat(d->target, tmp, d->op1,
>> + perm1, d->nelt, false);
>
>Again, missing space
>
>Otherwise LGTM
Thanks, here's the updated patch I'm going to check in.
2022-09-23 Hongtao Liu <hongtao.liu@intel.com>
Liwei Xu <liwei.xu@intel.com>
gcc/ChangeLog:
PR target/53346
* config/i386/i386-expand.cc (expand_vec_perm_shufps_shufps):
New function.
(ix86_expand_vec_perm_const_1): Insert
expand_vec_perm_shufps_shufps at the end of 2-instruction
expand sequence.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr53346-1.c: New test.
* gcc.target/i386/pr53346-2.c: New test.
* gcc.target/i386/pr53346-3.c: New test.
* gcc.target/i386/pr53346-4.c: New test.
---
gcc/config/i386/i386-expand.cc | 116 ++++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr53346-1.c | 70 +++++++++++++
gcc/testsuite/gcc.target/i386/pr53346-2.c | 59 +++++++++++
gcc/testsuite/gcc.target/i386/pr53346-3.c | 69 +++++++++++++
gcc/testsuite/gcc.target/i386/pr53346-4.c | 59 +++++++++++
5 files changed, 373 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-4.c
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 5334363e235..6baff6d0e61 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -19604,6 +19604,119 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
return false;
}
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
+ in terms of a pair of shufps+ shufps/pshufd instructions. */
+static bool
+expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
+{
+ unsigned char perm1[4];
+ machine_mode vmode = d->vmode;
+ bool ok;
+ unsigned i, j, k, count = 0;
+
+ if (d->one_operand_p
+ || (vmode != V4SImode && vmode != V4SFmode))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ for (i = 0; i < 4; ++i)
+ count += d->perm[i] > 3 ? 1 : 0;
+
+ gcc_assert (count & 3);
+
+ rtx tmp = gen_reg_rtx (vmode);
+ /* 2 from op0 and 2 from op1. */
+ if (count == 2)
+ {
+ unsigned char perm2[4];
+ for (i = 0, j = 0, k = 2; i < 4; ++i)
+ if (d->perm[i] & 4)
+ {
+ perm1[k++] = d->perm[i];
+ perm2[i] = k - 1;
+ }
+ else
+ {
+ perm1[j++] = d->perm[i];
+ perm2[i] = j - 1;
+ }
+
+ /* shufps. */
+ ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
+ perm1, d->nelt, false);
+ gcc_assert (ok);
+ if (vmode == V4SImode && TARGET_SSE2)
+ /* pshufd. */
+ ok = expand_vselect (d->target, tmp,
+ perm2, d->nelt, false);
+ else
+ {
+ /* shufps. */
+ perm2[2] += 4;
+ perm2[3] += 4;
+ ok = expand_vselect_vconcat (d->target, tmp, tmp,
+ perm2, d->nelt, false);
+ }
+ gcc_assert (ok);
+ }
+ /* 3 from one op and 1 from another. */
+ else
+ {
+ unsigned pair_idx = 8, lone_idx = 8, shift;
+
+ /* Find the lone index. */
+ for (i = 0; i < 4; ++i)
+ if ((d->perm[i] > 3 && count == 1)
+ || (d->perm[i] < 4 && count == 3))
+ lone_idx = i;
+
+ /* When lone_idx is not 0, it must come from the second op (count == 1). */
+ gcc_assert (count == (lone_idx ? 1 : 3));
+
+ /* Find the pair index that sits in the same half as the lone index. */
+ shift = lone_idx & 2;
+ pair_idx = 1 - lone_idx + 2 * shift;
+
+ /* First permute the lone and pair elements into the same vector as
+ [ lone, lone, pair, pair ]. */
+ perm1[1] = perm1[0]
+ = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
+ perm1[3] = perm1[2]
+ = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
+
+ /* Always put the vector that contains the lone index first. */
+ if (count == 1)
+ std::swap (d->op0, d->op1);
+
+ /* shufps. */
+ ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
+ perm1, d->nelt, false);
+ gcc_assert (ok);
+
+ /* Refine lone and pair index to original order. */
+ perm1[shift] = lone_idx << 1;
+ perm1[shift + 1] = pair_idx << 1;
+
+ /* Select the remaining 2 elements in another vector. */
+ for (i = 2 - shift; i < 4 - shift; ++i)
+ perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
+
+ /* Adjust to original selector. */
+ if (lone_idx > 1)
+ std::swap (tmp, d->op1);
+
+ /* shufps. */
+ ok = expand_vselect_vconcat (d->target, tmp, d->op1,
+ perm1, d->nelt, false);
+
+ gcc_assert (ok);
+ }
+
+ return true;
+}
+
/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of a pair of pshuflw + pshufhw instructions. */
@@ -22152,6 +22265,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_2perm_pblendv (d, true))
return true;
+ if (expand_vec_perm_shufps_shufps (d))
+ return true;
+
/* Try sequences of three instructions. */
if (expand_vec_perm_even_odd_pack (d))
diff --git a/gcc/testsuite/gcc.target/i386/pr53346-1.c b/gcc/testsuite/gcc.target/i386/pr53346-1.c
new file mode 100644
index 00000000000..6d230da632c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53346-1.c
@@ -0,0 +1,70 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2 -mno-sse3" } */
+/* { dg-final { scan-assembler-times "shufps" 15 } } */
+/* { dg-final { scan-assembler-times "pshufd" 2 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+
+v4si
+__attribute__((noipa))
+foo (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 1, 2, 5, 3);
+}
+
+v4si
+__attribute__((noipa))
+foo1 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 1, 5, 2, 3);
+}
+
+v4si
+__attribute__((noipa))
+foo2 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 1, 2, 3, 5);
+}
+
+v4si
+__attribute__((noipa))
+foo3 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 1, 4, 5, 6);
+}
+
+v4si
+__attribute__((noipa))
+foo4 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 3, 6, 7, 5);
+}
+
+v4si
+__attribute__((noipa))
+foo5 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 2, 4, 7, 6);
+}
+
+v4si
+__attribute__((noipa))
+foo6 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 2, 4, 3, 6);
+}
+
+v4si
+__attribute__((noipa))
+foo7 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 2, 3, 4, 6);
+}
+
+v4si
+__attribute__((noipa))
+foo8 (v4si a, v4si b)
+{
+ return __builtin_shufflevector (a, b, 2, 4, 6, 3);
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr53346-2.c b/gcc/testsuite/gcc.target/i386/pr53346-2.c
new file mode 100644
index 00000000000..0c6c7b35e01
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53346-2.c
@@ -0,0 +1,59 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+#include "sse2-check.h"
+#include "pr53346-1.c"
+
+static void
+sse2_test ()
+{
+ v4si a = __extension__(v4si) { 0, 1, 2, 3 };
+ v4si b = __extension__(v4si) { 4, 5, 6, 7 };
+ v4si exp = __extension__(v4si) { 1, 2, 5, 3 };
+ v4si dest;
+ dest = foo (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 1, 5, 2, 3 };
+ dest = foo1 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 1, 2, 3, 5 };
+ dest = foo2 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 1, 4, 5, 6 };
+ dest = foo3 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 3, 6, 7, 5 };
+ dest = foo4 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 2, 4, 7, 6 };
+ dest = foo5 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 2, 4, 3, 6 };
+ dest = foo6 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 2, 3, 4, 6 };
+ dest = foo7 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4si) { 2, 4, 6, 3 };
+ dest = foo8 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr53346-3.c b/gcc/testsuite/gcc.target/i386/pr53346-3.c
new file mode 100644
index 00000000000..0b204f6f210
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53346-3.c
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2 -mno-sse3" } */
+/* { dg-final { scan-assembler-times "shufps" 17 } } */
+
+typedef float v4sf __attribute__((vector_size(16)));
+
+v4sf
+__attribute__((noipa))
+foo (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 1, 2, 5, 3);
+}
+
+v4sf
+__attribute__((noipa))
+foo1 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 1, 5, 2, 3);
+}
+
+v4sf
+__attribute__((noipa))
+foo2 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 1, 2, 3, 5);
+}
+
+v4sf
+__attribute__((noipa))
+foo3 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 1, 4, 5, 6);
+}
+
+v4sf
+__attribute__((noipa))
+foo4 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 3, 6, 7, 5);
+}
+
+v4sf
+__attribute__((noipa))
+foo5 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 2, 4, 7, 6);
+}
+
+v4sf
+__attribute__((noipa))
+foo6 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 2, 4, 3, 6);
+}
+
+v4sf
+__attribute__((noipa))
+foo7 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 2, 3, 4, 6);
+}
+
+v4sf
+__attribute__((noipa))
+foo8 (v4sf a, v4sf b)
+{
+ return __builtin_shufflevector (a, b, 2, 4, 6, 3);
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr53346-4.c b/gcc/testsuite/gcc.target/i386/pr53346-4.c
new file mode 100644
index 00000000000..9e4e45bd584
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53346-4.c
@@ -0,0 +1,59 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+#include "sse2-check.h"
+#include "pr53346-3.c"
+
+static void
+sse2_test ()
+{
+ v4sf a = __extension__(v4sf) { 0, 1, 2, 3 };
+ v4sf b = __extension__(v4sf) { 4, 5, 6, 7 };
+ v4sf exp = __extension__(v4sf) { 1, 2, 5, 3 };
+ v4sf dest;
+ dest = foo (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 1, 5, 2, 3 };
+ dest = foo1 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 1, 2, 3, 5 };
+ dest = foo2 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 1, 4, 5, 6 };
+ dest = foo3 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 3, 6, 7, 5 };
+ dest = foo4 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 2, 4, 7, 6 };
+ dest = foo5 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 2, 4, 3, 6 };
+ dest = foo6 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 2, 3, 4, 6 };
+ dest = foo7 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+ exp = __extension__ (v4sf) { 2, 4, 6, 3 };
+ dest = foo8 (a, b);
+ if (__builtin_memcmp (&dest, &exp, 16))
+ __builtin_abort ();
+
+}
--
2.18.1
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2022-09-26 3:16 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-23 6:42 [PATCH] [x86] Support 2-instruction vector shuffle for V4SI/V4SF in ix86_expand_vec_perm_const_1 liuhongt
2022-09-23 6:53 ` Jakub Jelinek
2022-09-26 3:14 ` liuhongt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).