public inbox for gcc-patches@gcc.gnu.org
* [PATCH] tree-vect-patterns: Improve __builtin_{clz,ctz,ffs}ll vectorization [PR109011]
@ 2023-04-19  8:51 Jakub Jelinek
  2023-04-19  8:52 ` Richard Biener
From: Jakub Jelinek @ 2023-04-19  8:51 UTC
  To: Richard Biener; +Cc: gcc-patches

Hi!

For __builtin_popcountll tree-vect-patterns.cc has
vect_recog_popcount_pattern, which improves the vectorized code.
Without it, the vectorization is always multi-type vectorization
in the loop (involving at least the int and long long types), where we
emit two .POPCOUNT calls with long long arguments and an int return
value and then widen to long long, so effectively after vectorization
we do the V?DImode -> V?DImode popcount twice, then pack the result
into V?SImode and immediately unpack it.
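
To make that concrete, here is a rough sketch (not taken from the patch;
the function and variable names are made up) of the scalar shape the
pattern recognizer matches, i.e. the temp_out = popcountll (temp_in);
B = (TYPE1) temp_out sequence described in the comment in the patch:

  /* Sketch of a loop body the pattern matches.  */
  void
  sketch (long long *p, unsigned long long *q, int n)
  {
    for (int i = 0; i < n; ++i)
      {
        int tmp = __builtin_popcountll (q[i]);  /* temp_out = popcountll (temp_in) */
        p[i] = tmp;                             /* B = (TYPE1) temp_out */
      }
  }

The pattern replaces those two statements with a single same-width
internal call, conceptually p[i] = .POPCOUNT (q[i]) in GIMPLE, so the
whole loop can be vectorized in V?DImode only.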

The following patch extends that handling to __builtin_{clz,ctz,ffs}ll
builtins as well (as long as there is an optab for them; more to come
later).
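
The narrowing identities the new cases rely on (spelled out in the
comments in the patch below: clzll (x) == clz (x) + 32 and
ctzll (x) == ctz (x) for a zero-extended x != 0, and
ffsll (x) == ffs (x)) can be sanity-checked with a small standalone
program.  This is just a sketch assuming a target with 32-bit int and
64-bit long long; it is not part of the patch or of the testsuite:

  #include <assert.h>

  int
  main (void)
  {
    /* x is always nonzero and fits in int, so clz/ctz are defined on it
       and the conversions below are value-preserving.  */
    for (unsigned int x = 1; x < 0x40000000u; x = x * 3 + 1)
      {
        unsigned long long xll = x; /* zero extension */
        assert (__builtin_clzll (xll) == __builtin_clz (x) + 32);
        assert (__builtin_ctzll (xll) == __builtin_ctz (x));
        assert (__builtin_ffsll ((long long) xll) == __builtin_ffs ((int) x));
      }
    return 0;
  }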

Bootstrapped/regtested on x86_64-linux and i686-linux, plus tested on
the testcase in crosses to powerpc64le-linux and s390x-linux.  Ok
for trunk?

x86 can do __builtin_popcountll with -mavx512vpopcntdq, __builtin_clzll
with -mavx512cd, ppc can do __builtin_popcountll and __builtin_clzll
with -mpower8-vector and __builtin_ctzll with -mpower9-vector, s390
can do __builtin_{popcount,clz,ctz}ll with -march=z13 -mzarch (i.e. VX).
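
For instance, one way to watch the new pattern in action on an AVX512
box (just an illustrative command line; the flags mirror the dg-options
of the new testcase below) is

  gcc -O3 -fno-unroll-loops -mavx512vpopcntdq -mavx512cd -fdump-tree-optimized -c pr109011-1.c

and then grep for the vectorized .POPCOUNT and .CLZ calls in the
resulting *.optimized dump.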

2023-04-19  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/109011
	* tree-vect-patterns.cc (vect_recog_popcount_pattern): Rename to ...
	(vect_recog_popcount_clz_ctz_ffs_pattern): ... this.  Handle also
	CLZ, CTZ and FFS.  Remove vargs variable, use
	gimple_build_call_internal rather than gimple_build_call_internal_vec.
	(vect_vect_recog_func_ptrs): Adjust popcount entry.

	* gcc.dg/vect/pr109011-1.c: New test.

--- gcc/tree-vect-patterns.cc.jj	2023-03-01 09:51:27.995362601 +0100
+++ gcc/tree-vect-patterns.cc	2023-04-18 17:16:42.733935262 +0200
@@ -1501,7 +1501,7 @@ vect_recog_widen_minus_pattern (vec_info
 				      "vect_recog_widen_minus_pattern");
 }
 
-/* Function vect_recog_popcount_pattern
+/* Function vect_recog_popcount_clz_ctz_ffs_pattern
 
    Try to find the following pattern:
 
@@ -1530,16 +1530,20 @@ vect_recog_widen_minus_pattern (vec_info
    * Return value: A new stmt that will be used to replace the sequence of
    stmts that constitute the pattern. In this case it will be:
    B = .POPCOUNT (A);
+
+   Similarly for clz, ctz and ffs.
 */
 
 static gimple *
-vect_recog_popcount_pattern (vec_info *vinfo,
-			     stmt_vec_info stmt_vinfo, tree *type_out)
+vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
+					 stmt_vec_info stmt_vinfo,
+					 tree *type_out)
 {
   gassign *last_stmt = dyn_cast <gassign *> (stmt_vinfo->stmt);
-  gimple *popcount_stmt, *pattern_stmt;
+  gimple *call_stmt, *pattern_stmt;
   tree rhs_oprnd, rhs_origin, lhs_oprnd, lhs_type, vec_type, new_var;
-  auto_vec<tree> vargs;
+  internal_fn ifn = IFN_LAST;
+  int addend = 0;
 
   /* Find B = (TYPE1) temp_out. */
   if (!last_stmt)
@@ -1557,51 +1561,137 @@ vect_recog_popcount_pattern (vec_info *v
   if (TREE_CODE (rhs_oprnd) != SSA_NAME
       || !has_single_use (rhs_oprnd))
     return NULL;
-  popcount_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
+  call_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
 
   /* Find temp_out = __builtin_popcount{,l,ll} (temp_in);  */
-  if (!is_gimple_call (popcount_stmt))
+  if (!is_gimple_call (call_stmt))
     return NULL;
-  switch (gimple_call_combined_fn (popcount_stmt))
+  switch (gimple_call_combined_fn (call_stmt))
     {
+      int val;
     CASE_CFN_POPCOUNT:
+      ifn = IFN_POPCOUNT;
+      break;
+    CASE_CFN_CLZ:
+      ifn = IFN_CLZ;
+      /* Punt if call result is unsigned and defined value at zero
+	 is negative, as the negative value doesn't extend correctly.  */
+      if (TYPE_UNSIGNED (TREE_TYPE (rhs_oprnd))
+	  && gimple_call_internal_p (call_stmt)
+	  && CLZ_DEFINED_VALUE_AT_ZERO
+	       (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val) == 2
+	  && val < 0)
+	return NULL;
+      break;
+    CASE_CFN_CTZ:
+      ifn = IFN_CTZ;
+      /* Punt if call result is unsigned and defined value at zero
+	 is negative, as the negative value doesn't extend correctly.  */
+      if (TYPE_UNSIGNED (TREE_TYPE (rhs_oprnd))
+	  && gimple_call_internal_p (call_stmt)
+	  && CTZ_DEFINED_VALUE_AT_ZERO
+	       (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val) == 2
+	  && val < 0)
+	return NULL;
+      break;
+    CASE_CFN_FFS:
+      ifn = IFN_FFS;
       break;
     default:
       return NULL;
     }
 
-  if (gimple_call_num_args (popcount_stmt) != 1)
+  if (gimple_call_num_args (call_stmt) != 1)
     return NULL;
 
-  rhs_oprnd = gimple_call_arg (popcount_stmt, 0);
+  rhs_oprnd = gimple_call_arg (call_stmt, 0);
   vect_unpromoted_value unprom_diff;
-  rhs_origin = vect_look_through_possible_promotion (vinfo, rhs_oprnd,
-						    &unprom_diff);
+  rhs_origin
+    = vect_look_through_possible_promotion (vinfo, rhs_oprnd, &unprom_diff);
 
   if (!rhs_origin)
     return NULL;
 
-  /* Input and output of .POPCOUNT should be same-precision integer.
-     Also A should be unsigned or same precision as temp_in,
-     otherwise there would be sign_extend from A to temp_in.  */
-  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type)
-      || (!TYPE_UNSIGNED (unprom_diff.type)
-	  && (TYPE_PRECISION (unprom_diff.type)
-	      != TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))))
+  /* Input and output of .POPCOUNT should be same-precision integer.  */
+  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type))
     return NULL;
-  vargs.safe_push (unprom_diff.op);
 
-  vect_pattern_detected ("vec_regcog_popcount_pattern", popcount_stmt);
+  /* Also A should be unsigned or same precision as temp_in, otherwise
+     different builtins/internal functions have different behaviors.  */
+  if (TYPE_PRECISION (unprom_diff.type)
+      != TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))
+    switch (ifn)
+      {
+      case IFN_POPCOUNT:
+	/* For popcount require zero extension, which doesn't add any
+	   further bits to the count.  */
+	if (!TYPE_UNSIGNED (unprom_diff.type))
+	  return NULL;
+	break;
+      case IFN_CLZ:
+	/* clzll (x) == clz (x) + 32 for unsigned x != 0, so ok
+	   if it is undefined at zero or if it matches also for the
+	   defined value there.  */
+	if (!TYPE_UNSIGNED (unprom_diff.type))
+	  return NULL;
+	if (!type_has_mode_precision_p (lhs_type)
+	    || !type_has_mode_precision_p (TREE_TYPE (rhs_oprnd)))
+	  return NULL;
+	addend = (TYPE_PRECISION (TREE_TYPE (rhs_oprnd))
+		  - TYPE_PRECISION (lhs_type));
+	if (gimple_call_internal_p (call_stmt))
+	  {
+	    int val1, val2;
+	    int d1
+	      = CLZ_DEFINED_VALUE_AT_ZERO
+		  (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val1);
+	    int d2
+	      = CLZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (lhs_type),
+					   val2);
+	    if (d1 != 2)
+	      break;
+	    if (d2 != 2 || val1 != val2 + addend)
+	      return NULL;
+	  }
+	break;
+      case IFN_CTZ:
+	/* ctzll (x) == ctz (x) for unsigned or signed x != 0, so ok
+	   if it is undefined at zero or if it matches also for the
+	   defined value there.  */
+	if (gimple_call_internal_p (call_stmt))
+	  {
+	    int val1, val2;
+	    int d1
+	      = CTZ_DEFINED_VALUE_AT_ZERO
+		  (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val1);
+	    int d2
+	      = CTZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (lhs_type),
+					   val2);
+	    if (d1 != 2)
+	      break;
+	    if (d2 != 2 || val1 != val2)
+	      return NULL;
+	  }
+	break;
+      case IFN_FFS:
+	/* ffsll (x) == ffs (x) for unsigned or signed x.  */
+	break;
+      default:
+	gcc_unreachable ();
+      }
+
+  vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern",
+			 call_stmt);
   vec_type = get_vectype_for_scalar_type (vinfo, lhs_type);
-  /* Do it only if the backend has popcount<vector_mode>2 pattern.  */
+  /* Do it only if the backend has popcount<vector_mode>2 etc. pattern.  */
   if (!vec_type
-      || !direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
+      || !direct_internal_fn_supported_p (ifn, vec_type,
 					  OPTIMIZE_FOR_SPEED))
     return NULL;
 
   /* Create B = .POPCOUNT (A).  */
   new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
-  pattern_stmt = gimple_build_call_internal_vec (IFN_POPCOUNT, vargs);
+  pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
   gimple_call_set_lhs (pattern_stmt, new_var);
   gimple_set_location (pattern_stmt, gimple_location (last_stmt));
   *type_out = vec_type;
@@ -1609,6 +1699,14 @@ vect_recog_popcount_pattern (vec_info *v
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "created pattern stmt: %G", pattern_stmt);
+
+  if (addend)
+    {
+      append_pattern_def_seq (vinfo, stmt_vinfo, pattern_stmt, vec_type);
+      tree ret_var = vect_recog_temp_ssa_var (lhs_type, NULL);
+      pattern_stmt = gimple_build_assign (ret_var, PLUS_EXPR, new_var,
+					  build_int_cst (lhs_type, addend));
+    }
   return pattern_stmt;
 }
 
@@ -6051,7 +6149,7 @@ static vect_recog_func vect_vect_recog_f
   { vect_recog_sad_pattern, "sad" },
   { vect_recog_widen_sum_pattern, "widen_sum" },
   { vect_recog_pow_pattern, "pow" },
-  { vect_recog_popcount_pattern, "popcount" },
+  { vect_recog_popcount_clz_ctz_ffs_pattern, "popcount_clz_ctz_ffs" },
   { vect_recog_widen_shift_pattern, "widen_shift" },
   { vect_recog_rotate_pattern, "rotate" },
   { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },
--- gcc/testsuite/gcc.dg/vect/pr109011-1.c.jj	2023-04-18 14:40:47.117397908 +0200
+++ gcc/testsuite/gcc.dg/vect/pr109011-1.c	2023-04-18 14:40:05.124004362 +0200
@@ -0,0 +1,48 @@
+/* PR tree-optimization/109011 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-unroll-loops --param=vect-epilogues-nomask=0 -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx512cd" { target { { i?86-*-* x86_64-*-* } && avx512cd } } } */
+/* { dg-additional-options "-mavx512vpopcntdq" { target { { i?86-*-* x86_64-*-* } && avx512vpopcntdq } } } */
+/* { dg-additional-options "-mpower8-vector" { target powerpc_p8vector_ok } } */
+/* { dg-additional-options "-mpower9-vector" { target powerpc_p9vector_ok } } */
+/* { dg-additional-options "-march=z13 -mzarch" { target s390_vx } } */
+
+void
+foo (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_popcountll (q[i]);
+}
+
+/* { dg-final { scan-tree-dump-times " = \.POPCOUNT \\\(" 1 "optimized" { target { { i?86-*-* x86_64-*-* } && avx512vpopcntdq } } } } */
+/* { dg-final { scan-tree-dump-times " = \.POPCOUNT \\\(" 1 "optimized" { target { powerpc_p8vector_ok || s390_vx } } } } */
+
+void
+bar (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_clzll (q[i]);
+}
+
+/* { dg-final { scan-tree-dump-times " = \.CLZ \\\(" 1 "optimized" { target { { i?86-*-* x86_64-*-* } && avx512cd } } } } */
+/* { dg-final { scan-tree-dump-times " = \.CLZ \\\(" 1 "optimized" { target { powerpc_p8vector_ok || s390_vx } } } } */
+
+void
+baz (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ctzll (q[i]);
+}
+
+/* { dg-final { scan-tree-dump-times " = \.CTZ \\\(" 1 "optimized" { target { powerpc_p9vector_ok || s390_vx } } } } */
+
+void
+qux (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ffsll (q[i]);
+}

	Jakub


* Re: [PATCH] tree-vect-patterns: Improve __builtin_{clz,ctz,ffs}ll vectorization [PR109011]
  2023-04-19  8:51 [PATCH] tree-vect-patterns: Improve __builtin_{clz,ctz,ffs}ll vectorization [PR109011] Jakub Jelinek
@ 2023-04-19  8:52 ` Richard Biener
From: Richard Biener @ 2023-04-19  8:52 UTC
  To: Jakub Jelinek; +Cc: gcc-patches

On Wed, 19 Apr 2023, Jakub Jelinek wrote:

> Hi!
> 
> For __builtin_popcountll tree-vect-patterns.cc has
> vect_recog_popcount_pattern, which improves the vectorized code.
> Without it, the vectorization is always multi-type vectorization
> in the loop (involving at least the int and long long types), where we
> emit two .POPCOUNT calls with long long arguments and an int return
> value and then widen to long long, so effectively after vectorization
> we do the V?DImode -> V?DImode popcount twice, then pack the result
> into V?SImode and immediately unpack it.
> 
> The following patch extends that handling to __builtin_{clz,ctz,ffs}ll
> builtins as well (as long as there is an optab for them; more to come
> later).
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, plus tested on
> the testcase in crosses to powerpc64le-linux and s390x-linux.  Ok
> for trunk?

OK.

Richard.

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
