public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r12-3177] Fold more shuffle builtins to VEC_PERM_EXPR.
@ 2021-08-27  0:51 hongtao Liu
  0 siblings, 0 replies; only message in thread
From: hongtao Liu @ 2021-08-27  0:51 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:0fa4787bf34b173ce6f198e99b6f6dd8a3f98014

commit r12-3177-g0fa4787bf34b173ce6f198e99b6f6dd8a3f98014
Author: liuhongt <hongtao.liu@intel.com>
Date:   Fri Dec 11 19:02:43 2020 +0800

    Fold more shuffle builtins to VEC_PERM_EXPR.
    
    A follow-up to https://gcc.gnu.org/pipermail/gcc-patches/2019-May/521983.html
    
    gcc/
            PR target/98167
            PR target/43147
            * config/i386/i386.c (ix86_gimple_fold_builtin): Fold
            IX86_BUILTIN_SHUFPD512, IX86_BUILTIN_SHUFPS512,
            IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS,
            IX86_BUILTIN_SHUFPS256.
            (ix86_masked_all_ones): New function.
    
    gcc/testsuite/
            * gcc.target/i386/avx512f-vshufpd-1.c: Adjust testcase.
            * gcc.target/i386/avx512f-vshufps-1.c: Adjust testcase.
            * gcc.target/i386/pr43147.c: New test.

Diff:
---
 gcc/config/i386/i386.c                            | 91 +++++++++++++++++------
 gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c |  3 +-
 gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c |  3 +-
 gcc/testsuite/gcc.target/i386/pr43147.c           | 15 ++++
 4 files changed, 88 insertions(+), 24 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ddbbbceded1..3bb2cab57a3 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -17559,6 +17559,21 @@ ix86_vector_shift_count (tree arg1)
   return NULL_TREE;
 }
 
+/* Return true if ARG_MASK is an INTEGER_CST whose low ELEMS bits are all
+   ones.  ELEMS is the number of elements in the corresponding vector.  */
+static bool
+ix86_masked_all_ones (unsigned HOST_WIDE_INT elems, tree arg_mask)
+{
+  if (TREE_CODE (arg_mask) != INTEGER_CST)
+    return false;
+
+  unsigned HOST_WIDE_INT mask = TREE_INT_CST_LOW (arg_mask);
+  /* Shifting by the full width of the type is undefined behavior.  */
+  if (elems >= HOST_BITS_PER_WIDE_INT)
+    return mask == HOST_WIDE_INT_M1U;
+  return (mask | (HOST_WIDE_INT_M1U << elems)) == HOST_WIDE_INT_M1U;
+}
+
 static tree
 ix86_fold_builtin (tree fndecl, int n_args,
 		   tree *args, bool ignore ATTRIBUTE_UNUSED)
@@ -18044,6 +18059,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
   enum tree_code tcode;
   unsigned HOST_WIDE_INT count;
   bool is_vshift;
+  unsigned HOST_WIDE_INT elems;
 
   switch (fn_code)
     {
@@ -18367,17 +18383,11 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
       gcc_assert (n_args >= 2);
       arg0 = gimple_call_arg (stmt, 0);
       arg1 = gimple_call_arg (stmt, 1);
-      if (n_args > 2)
-	{
-	  /* This is masked shift.  Only optimize if the mask is all ones.  */
-	  tree argl = gimple_call_arg (stmt, n_args - 1);
-	  if (!tree_fits_uhwi_p (argl))
-	    break;
-	  unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl);
-	  unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
-	  if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
-	    break;
-	}
+      elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+      /* For masked shift, only optimize if the mask is all ones.  */
+      if (n_args > 2
+	  && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, n_args - 1)))
+	break;
       if (is_vshift)
 	{
 	  if (TREE_CODE (arg1) != VECTOR_CST)
@@ -18426,25 +18436,62 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 	}
       break;
 
+    case IX86_BUILTIN_SHUFPD512:
+    case IX86_BUILTIN_SHUFPS512:
     case IX86_BUILTIN_SHUFPD:
+    case IX86_BUILTIN_SHUFPD256:
+    case IX86_BUILTIN_SHUFPS:
+    case IX86_BUILTIN_SHUFPS256:
+      arg0 = gimple_call_arg (stmt, 0);
+      elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+      /* For masked shuffle, only optimize if the mask is all ones.  */
+      if (n_args > 3
+	  && !ix86_masked_all_ones (elems,
+				    gimple_call_arg (stmt, n_args - 1)))
+	break;
       arg2 = gimple_call_arg (stmt, 2);
       if (TREE_CODE (arg2) == INTEGER_CST)
 	{
+	  unsigned HOST_WIDE_INT shuffle_mask = TREE_INT_CST_LOW (arg2);
+	  /* Check valid imm, refer to gcc.target/i386/testimm-10.c.  */
+	  if (shuffle_mask > 255)
+	    return false;
+
+	  machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0)));
 	  location_t loc = gimple_location (stmt);
-	  unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2);
-	  arg0 = gimple_call_arg (stmt, 0);
+	  tree itype = (imode == E_DFmode
+			? long_long_integer_type_node : integer_type_node);
+	  tree vtype = build_vector_type (itype, elems);
+	  tree_vector_builder elts (vtype, elems, 1);
+
+
+	  /* Transform integer shuffle_mask to vector perm_mask which
+	     is used by vec_perm_expr, refer to shufp[sd]256/512 in sse.md.  */
+	  for (unsigned i = 0; i != elems; i++)
+	    {
+	      unsigned sel_idx;
+	      /* Imm[1:0](if VL > 128, then use Imm[3:2],Imm[5:4],Imm[7:6])
+		 provide 2 select controls for each element of the
+		 destination.  */
+	      if (imode == E_DFmode)
+		sel_idx = (i & 1) * elems + (i & ~1)
+			  + ((shuffle_mask >> i) & 1);
+	      else
+		{
+		  /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select
+		     controls for each element of the destination.  */
+		  unsigned j = i % 4;
+		  sel_idx = ((i >> 1) & 1) * elems + (i & ~3)
+			    + ((shuffle_mask >> 2 * j) & 3);
+		}
+	      elts.quick_push (build_int_cst (itype, sel_idx));
+	    }
+
+	  tree perm_mask = elts.build ();
 	  arg1 = gimple_call_arg (stmt, 1);
-	  tree itype = long_long_integer_type_node;
-	  tree vtype = build_vector_type (itype, 2); /* V2DI */
-	  tree_vector_builder elts (vtype, 2, 1);
-	  /* Ignore bits other than the lowest 2.  */
-	  elts.quick_push (build_int_cst (itype, imask & 1));
-	  imask >>= 1;
-	  elts.quick_push (build_int_cst (itype, 2 + (imask & 1)));
-	  tree omask = elts.build ();
 	  gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
 					   VEC_PERM_EXPR,
-					   arg0, arg1, omask);
+					   arg0, arg1, perm_mask);
 	  gimple_set_location (g, loc);
 	  gsi_replace (gsi, g, false);
 	  return true;
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
index d1ac01e1c88..8df5b9d4441 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
@@ -7,11 +7,12 @@
 #include <immintrin.h>
 
 __m512d x;
+__m512d y;
 
 void extern
 avx512f_test (void)
 {
-  x = _mm512_shuffle_pd (x, x, 56);
+  x = _mm512_shuffle_pd (x, y, 56);
   x = _mm512_mask_shuffle_pd (x, 2, x, x, 56);
   x = _mm512_maskz_shuffle_pd (2, x, x, 56);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
index 07a63fca3ff..378ae4b7101 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
@@ -7,11 +7,12 @@
 #include <immintrin.h>
 
 __m512 x;
+__m512 y;
 
 void extern
 avx512f_test (void)
 {
-  x = _mm512_shuffle_ps (x, x, 56);
+  x = _mm512_shuffle_ps (x, y, 56);
   x = _mm512_mask_shuffle_ps (x, 2, x, x, 56);
   x = _mm512_maskz_shuffle_ps (2, x, x, 56);
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr43147.c b/gcc/testsuite/gcc.target/i386/pr43147.c
new file mode 100644
index 00000000000..3c30f917c06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr43147.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-final { scan-assembler "movaps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+#include <x86intrin.h>
+
+__m128
+foo (void)
+{
+  __m128 m = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
+  m = _mm_shuffle_ps(m, m, 0xC9);
+  m = _mm_shuffle_ps(m, m, 0x2D);
+  return m;
+}


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-08-27  0:51 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-27  0:51 [gcc r12-3177] Fold more shuffle builtins to VEC_PERM_EXPR hongtao Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).