public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-65] tree-vect-patterns: Improve __builtin_{clz, ctz, ffs}ll vectorization [PR109011]
@ 2023-04-19  9:15 Jakub Jelinek
  0 siblings, 0 replies; only message in thread
From: Jakub Jelinek @ 2023-04-19  9:15 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:ade0a1ee5c6707b950ba284adcfed0514866c12d

commit r14-65-gade0a1ee5c6707b950ba284adcfed0514866c12d
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Wed Apr 19 11:14:23 2023 +0200

    tree-vect-patterns: Improve __builtin_{clz,ctz,ffs}ll vectorization [PR109011]
    
    For __builtin_popcountll tree-vect-patterns.cc has
    vect_recog_popcount_pattern, which improves the vectorized code.
    Without that the vectorization is always multi-type vectorization
    in the loop (at least int and long long types) where we emit two
    .POPCOUNT calls with long long arguments and int return value and then
    widen to long long, so effectively after vectorization do the
    V?DImode -> V?DImode popcount twice, then pack the result into V?SImode
    and immediately unpack.
    
    The following patch extends that handling to __builtin_{clz,ctz,ffs}ll
    builtins as well (as long as there is an optab for them; more to come
    laster).
    
    x86 can do __builtin_popcountll with -mavx512vpopcntdq, __builtin_clzll
    with -mavx512cd, ppc can do __builtin_popcountll and __builtin_clzll
    with -mpower8-vector and __builtin_ctzll with -mpower9-vector, s390
    can do __builtin_{popcount,clz,ctz}ll with -march=z13 -mzarch (i.e. VX).
    
    2023-04-19  Jakub Jelinek  <jakub@redhat.com>
    
            PR tree-optimization/109011
            * tree-vect-patterns.cc (vect_recog_popcount_pattern): Rename to ...
            (vect_recog_popcount_clz_ctz_ffs_pattern): ... this.  Handle also
            CLZ, CTZ and FFS.  Remove vargs variable, use
            gimple_build_call_internal rather than gimple_build_call_internal_vec.
            (vect_vect_recog_func_ptrs): Adjust popcount entry.
    
            * gcc.dg/vect/pr109011-1.c: New test.

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr109011-1.c |  48 +++++++++++
 gcc/tree-vect-patterns.cc              | 148 +++++++++++++++++++++++++++------
 2 files changed, 171 insertions(+), 25 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr109011-1.c b/gcc/testsuite/gcc.dg/vect/pr109011-1.c
new file mode 100644
index 00000000000..707a82aaf43
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr109011-1.c
@@ -0,0 +1,48 @@
+/* PR tree-optimization/109011 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-unroll-loops --param=vect-epilogues-nomask=0 -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx512cd" { target { { i?86-*-* x86_64-*-* } && avx512cd } } } */
+/* { dg-additional-options "-mavx512vpopcntdq" { target { { i?86-*-* x86_64-*-* } && avx512vpopcntdq } } } */
+/* { dg-additional-options "-mpower8-vector" { target powerpc_p8vector_ok } } */
+/* { dg-additional-options "-mpower9-vector" { target powerpc_p9vector_ok } } */
+/* { dg-additional-options "-march=z13 -mzarch" { target s390_vx } } */
+
+void
+foo (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_popcountll (q[i]);
+}
+
+/* { dg-final { scan-tree-dump-times " = \.POPCOUNT \\\(" 1 "optimized" { target { { i?86-*-* x86_64-*-* } && avx512vpopcntdq } } } } */
+/* { dg-final { scan-tree-dump-times " = \.POPCOUNT \\\(" 1 "optimized" { target { powerpc_p8vector_ok || s390_vx } } } } */
+
+void
+bar (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_clzll (q[i]);
+}
+
+/* { dg-final { scan-tree-dump-times " = \.CLZ \\\(" 1 "optimized" { target { { i?86-*-* x86_64-*-* } && avx512cd } } } } */
+/* { dg-final { scan-tree-dump-times " = \.CLZ \\\(" 1 "optimized" { target { powerpc_p8vector_ok || s390_vx } } } } */
+
+void
+baz (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ctzll (q[i]);
+}
+
+/* { dg-final { scan-tree-dump-times " = \.CTZ \\\(" 1 "optimized" { target { powerpc_p9vector_ok || s390_vx } } } } */
+
+void
+qux (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ffsll (q[i]);
+}
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 8802141cd6e..633998e8e3a 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -1501,7 +1501,7 @@ vect_recog_widen_minus_pattern (vec_info *vinfo, stmt_vec_info last_stmt_info,
 				      "vect_recog_widen_minus_pattern");
 }
 
-/* Function vect_recog_popcount_pattern
+/* Function vect_recog_popcount_clz_ctz_ffs_pattern
 
    Try to find the following pattern:
 
@@ -1530,16 +1530,20 @@ vect_recog_widen_minus_pattern (vec_info *vinfo, stmt_vec_info last_stmt_info,
    * Return value: A new stmt that will be used to replace the sequence of
    stmts that constitute the pattern. In this case it will be:
    B = .POPCOUNT (A);
+
+   Similarly for clz, ctz and ffs.
 */
 
 static gimple *
-vect_recog_popcount_pattern (vec_info *vinfo,
-			     stmt_vec_info stmt_vinfo, tree *type_out)
+vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
+					 stmt_vec_info stmt_vinfo,
+					 tree *type_out)
 {
   gassign *last_stmt = dyn_cast <gassign *> (stmt_vinfo->stmt);
-  gimple *popcount_stmt, *pattern_stmt;
+  gimple *call_stmt, *pattern_stmt;
   tree rhs_oprnd, rhs_origin, lhs_oprnd, lhs_type, vec_type, new_var;
-  auto_vec<tree> vargs;
+  internal_fn ifn = IFN_LAST;
+  int addend = 0;
 
   /* Find B = (TYPE1) temp_out. */
   if (!last_stmt)
@@ -1557,51 +1561,137 @@ vect_recog_popcount_pattern (vec_info *vinfo,
   if (TREE_CODE (rhs_oprnd) != SSA_NAME
       || !has_single_use (rhs_oprnd))
     return NULL;
-  popcount_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
+  call_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
 
   /* Find temp_out = __builtin_popcount{,l,ll} (temp_in);  */
-  if (!is_gimple_call (popcount_stmt))
+  if (!is_gimple_call (call_stmt))
     return NULL;
-  switch (gimple_call_combined_fn (popcount_stmt))
+  switch (gimple_call_combined_fn (call_stmt))
     {
+      int val;
     CASE_CFN_POPCOUNT:
+      ifn = IFN_POPCOUNT;
+      break;
+    CASE_CFN_CLZ:
+      ifn = IFN_CLZ;
+      /* Punt if call result is unsigned and defined value at zero
+	 is negative, as the negative value doesn't extend correctly.  */
+      if (TYPE_UNSIGNED (TREE_TYPE (rhs_oprnd))
+	  && gimple_call_internal_p (call_stmt)
+	  && CLZ_DEFINED_VALUE_AT_ZERO
+	       (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val) == 2
+	  && val < 0)
+	return NULL;
+      break;
+    CASE_CFN_CTZ:
+      ifn = IFN_CTZ;
+      /* Punt if call result is unsigned and defined value at zero
+	 is negative, as the negative value doesn't extend correctly.  */
+      if (TYPE_UNSIGNED (TREE_TYPE (rhs_oprnd))
+	  && gimple_call_internal_p (call_stmt)
+	  && CTZ_DEFINED_VALUE_AT_ZERO
+	       (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val) == 2
+	  && val < 0)
+	return NULL;
+      break;
+    CASE_CFN_FFS:
+      ifn = IFN_FFS;
       break;
     default:
       return NULL;
     }
 
-  if (gimple_call_num_args (popcount_stmt) != 1)
+  if (gimple_call_num_args (call_stmt) != 1)
     return NULL;
 
-  rhs_oprnd = gimple_call_arg (popcount_stmt, 0);
+  rhs_oprnd = gimple_call_arg (call_stmt, 0);
   vect_unpromoted_value unprom_diff;
-  rhs_origin = vect_look_through_possible_promotion (vinfo, rhs_oprnd,
-						    &unprom_diff);
+  rhs_origin
+    = vect_look_through_possible_promotion (vinfo, rhs_oprnd, &unprom_diff);
 
   if (!rhs_origin)
     return NULL;
 
-  /* Input and output of .POPCOUNT should be same-precision integer.
-     Also A should be unsigned or same precision as temp_in,
-     otherwise there would be sign_extend from A to temp_in.  */
-  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type)
-      || (!TYPE_UNSIGNED (unprom_diff.type)
-	  && (TYPE_PRECISION (unprom_diff.type)
-	      != TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))))
+  /* Input and output of .POPCOUNT should be same-precision integer.  */
+  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type))
     return NULL;
-  vargs.safe_push (unprom_diff.op);
 
-  vect_pattern_detected ("vec_regcog_popcount_pattern", popcount_stmt);
+  /* Also A should be unsigned or same precision as temp_in, otherwise
+     different builtins/internal functions have different behaviors.  */
+  if (TYPE_PRECISION (unprom_diff.type)
+      != TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))
+    switch (ifn)
+      {
+      case IFN_POPCOUNT:
+	/* For popcount require zero extension, which doesn't add any
+	   further bits to the count.  */
+	if (!TYPE_UNSIGNED (unprom_diff.type))
+	  return NULL;
+	break;
+      case IFN_CLZ:
+	/* clzll (x) == clz (x) + 32 for unsigned x != 0, so ok
+	   if it is undefined at zero or if it matches also for the
+	   defined value there.  */
+	if (!TYPE_UNSIGNED (unprom_diff.type))
+	  return NULL;
+	if (!type_has_mode_precision_p (lhs_type)
+	    || !type_has_mode_precision_p (TREE_TYPE (rhs_oprnd)))
+	  return NULL;
+	addend = (TYPE_PRECISION (TREE_TYPE (rhs_oprnd))
+		  - TYPE_PRECISION (lhs_type));
+	if (gimple_call_internal_p (call_stmt))
+	  {
+	    int val1, val2;
+	    int d1
+	      = CLZ_DEFINED_VALUE_AT_ZERO
+		  (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val1);
+	    int d2
+	      = CLZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (lhs_type),
+					   val2);
+	    if (d1 != 2)
+	      break;
+	    if (d2 != 2 || val1 != val2 + addend)
+	      return NULL;
+	  }
+	break;
+      case IFN_CTZ:
+	/* ctzll (x) == ctz (x) for unsigned or signed x != 0, so ok
+	   if it is undefined at zero or if it matches also for the
+	   defined value there.  */
+	if (gimple_call_internal_p (call_stmt))
+	  {
+	    int val1, val2;
+	    int d1
+	      = CTZ_DEFINED_VALUE_AT_ZERO
+		  (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val1);
+	    int d2
+	      = CTZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (lhs_type),
+					   val2);
+	    if (d1 != 2)
+	      break;
+	    if (d2 != 2 || val1 != val2)
+	      return NULL;
+	  }
+	break;
+      case IFN_FFS:
+	/* ffsll (x) == ffs (x) for unsigned or signed x.  */
+	break;
+      default:
+	gcc_unreachable ();
+      }
+
+  vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern",
+			 call_stmt);
   vec_type = get_vectype_for_scalar_type (vinfo, lhs_type);
-  /* Do it only if the backend has popcount<vector_mode>2 pattern.  */
+  /* Do it only if the backend has popcount<vector_mode>2 etc. pattern.  */
   if (!vec_type
-      || !direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
+      || !direct_internal_fn_supported_p (ifn, vec_type,
 					  OPTIMIZE_FOR_SPEED))
     return NULL;
 
   /* Create B = .POPCOUNT (A).  */
   new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
-  pattern_stmt = gimple_build_call_internal_vec (IFN_POPCOUNT, vargs);
+  pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
   gimple_call_set_lhs (pattern_stmt, new_var);
   gimple_set_location (pattern_stmt, gimple_location (last_stmt));
   *type_out = vec_type;
@@ -1609,6 +1699,14 @@ vect_recog_popcount_pattern (vec_info *vinfo,
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "created pattern stmt: %G", pattern_stmt);
+
+  if (addend)
+    {
+      append_pattern_def_seq (vinfo, stmt_vinfo, pattern_stmt, vec_type);
+      tree ret_var = vect_recog_temp_ssa_var (lhs_type, NULL);
+      pattern_stmt = gimple_build_assign (ret_var, PLUS_EXPR, new_var,
+					  build_int_cst (lhs_type, addend));
+    }
   return pattern_stmt;
 }
 
@@ -6051,7 +6149,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
   { vect_recog_sad_pattern, "sad" },
   { vect_recog_widen_sum_pattern, "widen_sum" },
   { vect_recog_pow_pattern, "pow" },
-  { vect_recog_popcount_pattern, "popcount" },
+  { vect_recog_popcount_clz_ctz_ffs_pattern, "popcount_clz_ctz_ffs" },
   { vect_recog_widen_shift_pattern, "widen_shift" },
   { vect_recog_rotate_pattern, "rotate" },
   { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-04-19  9:15 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-19  9:15 [gcc r14-65] tree-vect-patterns: Improve __builtin_{clz, ctz, ffs}ll vectorization [PR109011] Jakub Jelinek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).