* [PATCH] Fold more shuffle builtins to VEC_PERM_EXPR.
@ 2021-08-26 4:57 liuhongt
2021-08-26 5:05 ` Hongtao Liu
0 siblings, 1 reply; 2+ messages in thread
From: liuhongt @ 2021-08-26 4:57 UTC (permalink / raw)
To: gcc-patches
This patch is a follow-up to [1]; it folds all shufps/shufpd builtins into gimple.
Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
[1] https://gcc.gnu.org/pipermail/gcc-patches/2019-May/521983.html
gcc/
PR target/98167
PR target/43147
* config/i386/i386.c (ix86_gimple_fold_builtin): Fold
IX86_BUILTIN_SHUFPD512, IX86_BUILTIN_SHUFPS512,
IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS,
IX86_BUILTIN_SHUFPS256.
(ix86_masked_all_ones): New function.
gcc/testsuite/
* gcc.target/i386/avx512f-vshufpd-1.c: Adjust testcase.
* gcc.target/i386/avx512f-vshufps-1.c: Adjust testcase.
* gcc.target/i386/pr43147.c: New test.
---
gcc/config/i386/i386.c | 90 ++++++++++++++-----
.../gcc.target/i386/avx512f-vshufpd-1.c | 3 +-
.../gcc.target/i386/avx512f-vshufps-1.c | 3 +-
gcc/testsuite/gcc.target/i386/pr43147.c | 15 ++++
4 files changed, 87 insertions(+), 24 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr43147.c
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ebec8668758..f3eed9f2426 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -17541,6 +17541,20 @@ ix86_vector_shift_count (tree arg1)
return NULL_TREE;
}
+/* Return true if arg_mask is all ones; ELEMS is the number of elements in the corresponding vector. */
+static bool
+ix86_masked_all_ones (unsigned HOST_WIDE_INT elems, tree arg_mask)
+{
+ if (TREE_CODE (arg_mask) != INTEGER_CST)
+ return false;
+
+ unsigned HOST_WIDE_INT mask = TREE_INT_CST_LOW (arg_mask);
+ if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
+ return false;
+
+ return true;
+}
+
static tree
ix86_fold_builtin (tree fndecl, int n_args,
tree *args, bool ignore ATTRIBUTE_UNUSED)
@@ -18026,6 +18040,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
enum tree_code tcode;
unsigned HOST_WIDE_INT count;
bool is_vshift;
+ unsigned HOST_WIDE_INT elems;
switch (fn_code)
{
@@ -18349,17 +18364,11 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
gcc_assert (n_args >= 2);
arg0 = gimple_call_arg (stmt, 0);
arg1 = gimple_call_arg (stmt, 1);
- if (n_args > 2)
- {
- /* This is masked shift. Only optimize if the mask is all ones. */
- tree argl = gimple_call_arg (stmt, n_args - 1);
- if (!tree_fits_uhwi_p (argl))
- break;
- unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl);
- unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
- if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
- break;
- }
+ elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+ /* For masked shift, only optimize if the mask is all ones. */
+ if (n_args > 2
+ && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, n_args - 1)))
+ break;
if (is_vshift)
{
if (TREE_CODE (arg1) != VECTOR_CST)
@@ -18408,25 +18417,62 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
}
break;
+ case IX86_BUILTIN_SHUFPD512:
+ case IX86_BUILTIN_SHUFPS512:
case IX86_BUILTIN_SHUFPD:
+ case IX86_BUILTIN_SHUFPD256:
+ case IX86_BUILTIN_SHUFPS:
+ case IX86_BUILTIN_SHUFPS256:
+ arg0 = gimple_call_arg (stmt, 0);
+ elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+ /* For masked shuffle, only optimize if the mask is all ones. */
+ if (n_args > 3
+ && !ix86_masked_all_ones (elems,
+ gimple_call_arg (stmt, n_args - 1)))
+ break;
arg2 = gimple_call_arg (stmt, 2);
if (TREE_CODE (arg2) == INTEGER_CST)
{
+ unsigned HOST_WIDE_INT shuffle_mask = TREE_INT_CST_LOW (arg2);
+ /* Check valid imm, refer to gcc.target/i386/testimm-10.c. */
+ if (shuffle_mask > 255)
+ return false;
+
+ machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0)));
location_t loc = gimple_location (stmt);
- unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2);
- arg0 = gimple_call_arg (stmt, 0);
+ tree itype = (imode == E_DFmode
+ ? long_long_integer_type_node : integer_type_node);
+ tree vtype = build_vector_type (itype, elems);
+ tree_vector_builder elts (vtype, elems, 1);
+
+
+ /* Transform integer shuffle_mask to vector perm_mask which
+ is used by vec_perm_expr, refer to shufp[sd]256/512 in sse.md. */
+ for (unsigned i = 0; i != elems; i++)
+ {
+ unsigned sel_idx;
+ /* Imm[1:0](if VL > 128, then use Imm[3:2],Imm[5:4],Imm[7:6])
+ provide 2 select controls for each element of the
+ destination. */
+ if (imode == E_DFmode)
+ sel_idx = (i & 1) * elems + (i & ~1)
+ + ((shuffle_mask >> i) & 1);
+ else
+ {
+ /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select
+ controls for each element of the destination. */
+ unsigned j = i % 4;
+ sel_idx = ((i >> 1) & 1) * elems + (i & ~3)
+ + ((shuffle_mask >> 2 * j) & 3);
+ }
+ elts.quick_push (build_int_cst (itype, sel_idx));
+ }
+
+ tree perm_mask = elts.build ();
arg1 = gimple_call_arg (stmt, 1);
- tree itype = long_long_integer_type_node;
- tree vtype = build_vector_type (itype, 2); /* V2DI */
- tree_vector_builder elts (vtype, 2, 1);
- /* Ignore bits other than the lowest 2. */
- elts.quick_push (build_int_cst (itype, imask & 1));
- imask >>= 1;
- elts.quick_push (build_int_cst (itype, 2 + (imask & 1)));
- tree omask = elts.build ();
gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
VEC_PERM_EXPR,
- arg0, arg1, omask);
+ arg0, arg1, perm_mask);
gimple_set_location (g, loc);
gsi_replace (gsi, g, false);
return true;
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
index d1ac01e1c88..8df5b9d4441 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
@@ -7,11 +7,12 @@
#include <immintrin.h>
__m512d x;
+__m512d y;
void extern
avx512f_test (void)
{
- x = _mm512_shuffle_pd (x, x, 56);
+ x = _mm512_shuffle_pd (x, y, 56);
x = _mm512_mask_shuffle_pd (x, 2, x, x, 56);
x = _mm512_maskz_shuffle_pd (2, x, x, 56);
}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
index 07a63fca3ff..378ae4b7101 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
@@ -7,11 +7,12 @@
#include <immintrin.h>
__m512 x;
+__m512 y;
void extern
avx512f_test (void)
{
- x = _mm512_shuffle_ps (x, x, 56);
+ x = _mm512_shuffle_ps (x, y, 56);
x = _mm512_mask_shuffle_ps (x, 2, x, x, 56);
x = _mm512_maskz_shuffle_ps (2, x, x, 56);
}
diff --git a/gcc/testsuite/gcc.target/i386/pr43147.c b/gcc/testsuite/gcc.target/i386/pr43147.c
new file mode 100644
index 00000000000..3c30f917c06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr43147.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-final { scan-assembler "movaps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+#include <x86intrin.h>
+
+__m128
+foo (void)
+{
+ __m128 m = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
+ m = _mm_shuffle_ps(m, m, 0xC9);
+ m = _mm_shuffle_ps(m, m, 0x2D);
+ return m;
+}
--
2.18.1
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH] Fold more shuffle builtins to VEC_PERM_EXPR.
2021-08-26 4:57 [PATCH] Fold more shuffle builtins to VEC_PERM_EXPR liuhongt
@ 2021-08-26 5:05 ` Hongtao Liu
0 siblings, 0 replies; 2+ messages in thread
From: Hongtao Liu @ 2021-08-26 5:05 UTC (permalink / raw)
To: liuhongt; +Cc: GCC Patches, H. J. Lu
On Thu, Aug 26, 2021 at 12:57 PM liuhongt <hongtao.liu@intel.com> wrote:
>
> This patch is a follow-up to [1]; it folds all shufps/shufpd builtins into gimple.
Of course, this applies only to the non-masked versions and to the masked versions whose mask is all ones.
> Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2019-May/521983.html
>
> gcc/
> PR target/98167
> PR target/43147
> * config/i386/i386.c (ix86_gimple_fold_builtin): Fold
> IX86_BUILTIN_SHUFPD512, IX86_BUILTIN_SHUFPS512,
> IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS,
> IX86_BUILTIN_SHUFPS256.
> (ix86_masked_all_ones): New function.
>
> gcc/testsuite/
> * gcc.target/i386/avx512f-vshufpd-1.c: Adjust testcase.
> * gcc.target/i386/avx512f-vshufps-1.c: Adjust testcase.
> * gcc.target/i386/pr43147.c: New test.
> ---
> gcc/config/i386/i386.c | 90 ++++++++++++++-----
> .../gcc.target/i386/avx512f-vshufpd-1.c | 3 +-
> .../gcc.target/i386/avx512f-vshufps-1.c | 3 +-
> gcc/testsuite/gcc.target/i386/pr43147.c | 15 ++++
> 4 files changed, 87 insertions(+), 24 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr43147.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index ebec8668758..f3eed9f2426 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -17541,6 +17541,20 @@ ix86_vector_shift_count (tree arg1)
> return NULL_TREE;
> }
>
> +/* Return true if arg_mask is all ones; ELEMS is the number of elements in the corresponding vector. */
> +static bool
> +ix86_masked_all_ones (unsigned HOST_WIDE_INT elems, tree arg_mask)
> +{
> + if (TREE_CODE (arg_mask) != INTEGER_CST)
> + return false;
> +
> + unsigned HOST_WIDE_INT mask = TREE_INT_CST_LOW (arg_mask);
> + if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
> + return false;
> +
> + return true;
> +}
> +
> static tree
> ix86_fold_builtin (tree fndecl, int n_args,
> tree *args, bool ignore ATTRIBUTE_UNUSED)
> @@ -18026,6 +18040,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
> enum tree_code tcode;
> unsigned HOST_WIDE_INT count;
> bool is_vshift;
> + unsigned HOST_WIDE_INT elems;
>
> switch (fn_code)
> {
> @@ -18349,17 +18364,11 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
> gcc_assert (n_args >= 2);
> arg0 = gimple_call_arg (stmt, 0);
> arg1 = gimple_call_arg (stmt, 1);
> - if (n_args > 2)
> - {
> - /* This is masked shift. Only optimize if the mask is all ones. */
> - tree argl = gimple_call_arg (stmt, n_args - 1);
> - if (!tree_fits_uhwi_p (argl))
> - break;
> - unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl);
> - unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> - if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
> - break;
> - }
> + elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> + /* For masked shift, only optimize if the mask is all ones. */
> + if (n_args > 2
> + && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, n_args - 1)))
> + break;
> if (is_vshift)
> {
> if (TREE_CODE (arg1) != VECTOR_CST)
> @@ -18408,25 +18417,62 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
> }
> break;
>
> + case IX86_BUILTIN_SHUFPD512:
> + case IX86_BUILTIN_SHUFPS512:
> case IX86_BUILTIN_SHUFPD:
> + case IX86_BUILTIN_SHUFPD256:
> + case IX86_BUILTIN_SHUFPS:
> + case IX86_BUILTIN_SHUFPS256:
> + arg0 = gimple_call_arg (stmt, 0);
> + elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> + /* For masked shuffle, only optimize if the mask is all ones. */
> + if (n_args > 3
> + && !ix86_masked_all_ones (elems,
> + gimple_call_arg (stmt, n_args - 1)))
> + break;
> arg2 = gimple_call_arg (stmt, 2);
> if (TREE_CODE (arg2) == INTEGER_CST)
> {
> + unsigned HOST_WIDE_INT shuffle_mask = TREE_INT_CST_LOW (arg2);
> + /* Check valid imm, refer to gcc.target/i386/testimm-10.c. */
> + if (shuffle_mask > 255)
> + return false;
> +
> + machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0)));
> location_t loc = gimple_location (stmt);
> - unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2);
> - arg0 = gimple_call_arg (stmt, 0);
> + tree itype = (imode == E_DFmode
> + ? long_long_integer_type_node : integer_type_node);
> + tree vtype = build_vector_type (itype, elems);
> + tree_vector_builder elts (vtype, elems, 1);
> +
> +
> + /* Transform integer shuffle_mask to vector perm_mask which
> + is used by vec_perm_expr, refer to shufp[sd]256/512 in sse.md. */
> + for (unsigned i = 0; i != elems; i++)
> + {
> + unsigned sel_idx;
> + /* Imm[1:0](if VL > 128, then use Imm[3:2],Imm[5:4],Imm[7:6])
> + provide 2 select controls for each element of the
> + destination. */
> + if (imode == E_DFmode)
> + sel_idx = (i & 1) * elems + (i & ~1)
> + + ((shuffle_mask >> i) & 1);
> + else
> + {
> + /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select
> + controls for each element of the destination. */
> + unsigned j = i % 4;
> + sel_idx = ((i >> 1) & 1) * elems + (i & ~3)
> + + ((shuffle_mask >> 2 * j) & 3);
> + }
> + elts.quick_push (build_int_cst (itype, sel_idx));
> + }
> +
> + tree perm_mask = elts.build ();
> arg1 = gimple_call_arg (stmt, 1);
> - tree itype = long_long_integer_type_node;
> - tree vtype = build_vector_type (itype, 2); /* V2DI */
> - tree_vector_builder elts (vtype, 2, 1);
> - /* Ignore bits other than the lowest 2. */
> - elts.quick_push (build_int_cst (itype, imask & 1));
> - imask >>= 1;
> - elts.quick_push (build_int_cst (itype, 2 + (imask & 1)));
> - tree omask = elts.build ();
> gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
> VEC_PERM_EXPR,
> - arg0, arg1, omask);
> + arg0, arg1, perm_mask);
> gimple_set_location (g, loc);
> gsi_replace (gsi, g, false);
> return true;
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
> index d1ac01e1c88..8df5b9d4441 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
> @@ -7,11 +7,12 @@
> #include <immintrin.h>
>
> __m512d x;
> +__m512d y;
>
> void extern
> avx512f_test (void)
> {
> - x = _mm512_shuffle_pd (x, x, 56);
> + x = _mm512_shuffle_pd (x, y, 56);
> x = _mm512_mask_shuffle_pd (x, 2, x, x, 56);
> x = _mm512_maskz_shuffle_pd (2, x, x, 56);
> }
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
> index 07a63fca3ff..378ae4b7101 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
> @@ -7,11 +7,12 @@
> #include <immintrin.h>
>
> __m512 x;
> +__m512 y;
>
> void extern
> avx512f_test (void)
> {
> - x = _mm512_shuffle_ps (x, x, 56);
> + x = _mm512_shuffle_ps (x, y, 56);
> x = _mm512_mask_shuffle_ps (x, 2, x, x, 56);
> x = _mm512_maskz_shuffle_ps (2, x, x, 56);
> }
> diff --git a/gcc/testsuite/gcc.target/i386/pr43147.c b/gcc/testsuite/gcc.target/i386/pr43147.c
> new file mode 100644
> index 00000000000..3c30f917c06
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr43147.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2" } */
> +/* { dg-final { scan-assembler "movaps" } } */
> +/* { dg-final { scan-assembler-not "shufps" } } */
> +
> +#include <x86intrin.h>
> +
> +__m128
> +foo (void)
> +{
> + __m128 m = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
> + m = _mm_shuffle_ps(m, m, 0xC9);
> + m = _mm_shuffle_ps(m, m, 0x2D);
> + return m;
> +}
> --
> 2.18.1
>
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2021-08-26 4:59 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-26 4:57 [PATCH] Fold more shuffle builtins to VEC_PERM_EXPR liuhongt
2021-08-26 5:05 ` Hongtao Liu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).