public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB
@ 2024-06-21  3:52 pan2.li
  2024-06-21  7:00 ` Richard Biener
                   ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: pan2.li @ 2024-06-21  3:52 UTC (permalink / raw)
  To: gcc-patches
  Cc: juzhe.zhong, kito.cheng, richard.guenther, jeffreyalaw,
	rdapp.gcc, Pan Li

From: Pan Li <pan2.li@intel.com>

The zip benchmark of coremark-pro has one SAT_SUB-like pattern, but
truncated, as below:

void test (uint16_t *x, unsigned b, unsigned n)
{
  unsigned a = 0;
  register uint16_t *p = x;

  do {
    a = *--p;
    *p = (uint16_t)(a >= b ? a - b : 0); // Truncate the result of SAT_SUB
  } while (--n);
}

It will have the gimple below after the ifcvt pass; it cannot hit any
pattern of SAT_SUB and then cannot vectorize to SAT_SUB.

_2 = a_11 - b_12(D);
iftmp.0_13 = (short unsigned int) _2;
_18 = a_11 >= b_12(D);
iftmp.0_5 = _18 ? iftmp.0_13 : 0;

This patch would like to do some reconcile for above pattern to match
the SAT_SUB pattern.  Then the underlying vect pass is able to vectorize
the SAT_SUB.

_2 = a_11 - b_12(D);
_18 = a_11 >= b_12(D);
_pattmp = _18 ? _2 : 0; // .SAT_SUB pattern
iftmp.0_13 = (short unsigned int) _pattmp;
iftmp.0_5 = iftmp.0_13;

The below tests are running for this patch.
1. The rv64gcv fully regression tests.
2. The rv64gcv build with glibc.
3. The x86 bootstrap tests.
4. The x86 fully regression tests.

gcc/ChangeLog:

	* match.pd: Add new match for truncated unsigned sat_sub.
	* tree-if-conv.cc (gimple_truncated_unsigned_integer_sat_sub):
	New external decl from match.pd.
	(tree_if_cond_reconcile_unsigned_integer_sat_sub): New func impl
	to reconcile the truncated sat_sub pattern.
	(tree_if_cond_reconcile): New func impl to reconcile.
	(pass_if_conversion::execute): Try to reconcile after ifcvt.

Signed-off-by: Pan Li <pan2.li@intel.com>
---
 gcc/match.pd        |  9 +++++
 gcc/tree-if-conv.cc | 83 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/gcc/match.pd b/gcc/match.pd
index 3d0689c9312..9617a5f9d5e 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3210,6 +3210,15 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
       && types_match (type, @0, @1))))
 
+/* Unsigned saturation sub and then truncated, aka:
+   Truncated = X >= Y ? (Other Type) (X - Y) : 0.
+ */
+(match (truncated_unsigned_integer_sat_sub @0 @1)
+ (cond (ge @0 @1) (convert (minus @0 @1)) integer_zerop)
+ (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
+      && types_match (@0, @1)
+      && tree_int_cst_lt (TYPE_SIZE (type), TYPE_SIZE (TREE_TYPE (@0))))))
+
 /* x >  y  &&  x != XXX_MIN  -->  x > y
    x >  y  &&  x == XXX_MIN  -->  false . */
 (for eqne (eq ne)
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index 57992b6deca..535743130f2 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -3738,6 +3738,87 @@ bitfields_to_lower_p (class loop *loop,
   return !reads_to_lower.is_empty () || !writes_to_lower.is_empty ();
 }
 
+extern bool gimple_truncated_unsigned_integer_sat_sub (tree, tree*,
+						       tree (*)(tree));
+
+/*
+ * Try to reconcile the stmt pattern as below to match the SAT_SUB
+ * in vectorization.  If and only if the related internal_fn has
+ * been implemented already.
+ *
+ * The reconcile will insert one new stmt named 'a' in below example,
+ * replace the stmt '4' by new added stmt 'b' as well.  Then the stmt
+ * pattern is able to hit the SAT_SUB pattern in the underlying pass.
+ *
+ * 1. _2 = a_11 - b_12(D);
+ * 2. iftmp.0_13 = (short unsigned int) _2;
+ * 3. _18 = a_11 >= b_12(D);
+ * 4. iftmp.0_5 = _18 ? iftmp.0_13 : 0;
+ * ==>
+ * 1. _2 = a_11 - b_12(D);
+ * 3. _18 = a_11 >= b_12(D);
+ * a. pattmp = _18 ? _2 : 0;                     // New insertion
+ * 2. iftmp.0_13 = (short unsigned int) _pattmp; // Move before
+ * b. iftmp.0_5 = iftmp.0_13;
+ *    == Replace ==> 4. iftmp.0_5 = _18 ? iftmp.0_13 : 0;
+ */
+static void
+tree_if_cond_reconcile_unsigned_integer_sat_sub (gimple_stmt_iterator *gsi,
+						 gassign *stmt)
+{
+  tree ops[2];
+  tree lhs = gimple_assign_lhs (stmt);
+  bool supported_p = direct_internal_fn_supported_p (IFN_SAT_SUB,
+						     TREE_TYPE (lhs),
+						     OPTIMIZE_FOR_BOTH);
+
+  if (supported_p && gimple_truncated_unsigned_integer_sat_sub (lhs, ops, NULL))
+    {
+      tree cond = gimple_assign_rhs1 (stmt); // aka _18
+      tree truncated = gimple_assign_rhs2 (stmt); // aka iftmp.0_13
+      gimple *stmt_2 = SSA_NAME_DEF_STMT (truncated);
+      tree minus = gimple_assign_rhs1 (stmt_2); // aka _2
+      tree raw_type = TREE_TYPE (minus);
+      tree zero = build_zero_cst (raw_type);
+      tree tmp = make_temp_ssa_name (raw_type, NULL, "sat_sub_tmp");
+
+      /* For stmt 'a' in above example  */
+      gimple *stmt_a = gimple_build_assign (tmp, COND_EXPR, cond, minus, zero);
+      gsi_insert_before (gsi, stmt_a, GSI_SAME_STMT);
+      update_stmt (stmt_a);
+
+      /* For stmt '2' in above example  */
+      gimple_stmt_iterator stmt_2_gsi = gsi_for_stmt (stmt_2);
+      gsi_move_before (&stmt_2_gsi, gsi, GSI_SAME_STMT);
+      gimple_assign_set_rhs1 (stmt_2, tmp);
+      update_stmt (stmt_2);
+
+      /* For stmt 'b' in above example  */
+      gimple *stmt_b = gimple_build_assign (lhs, NOP_EXPR, truncated);
+      gsi_replace (gsi, stmt_b, /* update_eh_info */ true);
+      update_stmt (stmt_b);
+    }
+}
+
+static void
+tree_if_cond_reconcile (function *fun)
+{
+  basic_block bb;
+  FOR_EACH_BB_FN (bb, fun)
+    {
+      gimple_stmt_iterator gsi;
+      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+	{
+	  gimple *stmt = gsi_stmt (gsi);
+
+	  if (is_gimple_assign (stmt))
+	    {
+	      gassign *assign = dyn_cast <gassign *> (stmt);
+	      tree_if_cond_reconcile_unsigned_integer_sat_sub (&gsi, assign);
+	    }
+	}
+    }
+}
 
 /* If-convert LOOP when it is legal.  For the moment this pass has no
    profitability analysis.  Returns non-zero todo flags when something
@@ -4063,6 +4144,8 @@ pass_if_conversion::execute (function *fun)
 	}
     }
 
+  tree_if_cond_reconcile (fun);
+
   return 0;
 }
 
-- 
2.34.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB
  2024-06-21  3:52 [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB pan2.li
@ 2024-06-21  7:00 ` Richard Biener
  2024-06-21  8:50   ` Li, Pan2
  2024-06-24 13:55 ` [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip pan2.li
  2024-06-27  1:31 ` [PATCH v3] " pan2.li
  2 siblings, 1 reply; 27+ messages in thread
From: Richard Biener @ 2024-06-21  7:00 UTC (permalink / raw)
  To: pan2.li; +Cc: gcc-patches, juzhe.zhong, kito.cheng, jeffreyalaw, rdapp.gcc

On Fri, Jun 21, 2024 at 5:53 AM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> truncated as below:
>
> void test (uint16_t *x, unsigned b, unsigned n)
> {
>   unsigned a = 0;
>   register uint16_t *p = x;
>
>   do {
>     a = *--p;
>     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate the result of SAT_SUB
>   } while (--n);
> }
>
> It will have gimple after ifcvt pass,  it cannot hit any pattern of
> SAT_SUB and then cannot vectorize to SAT_SUB.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>
> This patch would like to do some reconcile for above pattern to match
> the SAT_SUB pattern.  Then the underlying vect pass is able to vectorize
> the SAT_SUB.

Hmm.  I was thinking of allowing

/* Unsigned saturation sub, case 2 (branch with ge):
   SAT_U_SUB = X >= Y ? X - Y : 0.  */
(match (unsigned_integer_sat_sub @0 @1)
 (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
 (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
      && types_match (type, @0, @1))))

to match this by changing it to

/* Unsigned saturation sub, case 2 (branch with ge):
   SAT_U_SUB = X >= Y ? X - Y : 0.  */
(match (unsigned_integer_sat_sub @0 @1)
 (cond^ (ge @0 @1) (convert? (minus @0 @1)) integer_zerop)
 (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
      && types_match (type, @0, @1))))

and when using the gimple_match_* function make sure to consider
that the .SAT_SUB (@0, @1) is converted to the type of the SSA name
we matched?

Richard.

> _2 = a_11 - b_12(D);
> _18 = a_11 >= b_12(D);
> _pattmp = _18 ? _2 : 0; // .SAT_SUB pattern
> iftmp.0_13 = (short unsigned int) _pattmp;
> iftmp.0_5 = iftmp.0_13;
>
> The below tests are running for this patch.
> 1. The rv64gcv fully regression tests.
> 2. The rv64gcv build with glibc.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
>
> gcc/ChangeLog:
>
>         * match.pd: Add new match for trunated unsigned sat_sub.
>         * tree-if-conv.cc (gimple_truncated_unsigned_integer_sat_sub):
>         New external decl from match.pd.
>         (tree_if_cond_reconcile_unsigned_integer_sat_sub): New func impl
>         to reconcile the truncated sat_sub pattern.
>         (tree_if_cond_reconcile): New func impl to reconcile.
>         (pass_if_conversion::execute): Try to reconcile after ifcvt.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd        |  9 +++++
>  gcc/tree-if-conv.cc | 83 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 92 insertions(+)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 3d0689c9312..9617a5f9d5e 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3210,6 +3210,15 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
>        && types_match (type, @0, @1))))
>
> +/* Unsigned saturation sub and then truncated, aka:
> +   Truncated = X >= Y ? (Other Type) (X - Y) : 0.
> + */
> +(match (truncated_unsigned_integer_sat_sub @0 @1)
> + (cond (ge @0 @1) (convert (minus @0 @1)) integer_zerop)
> + (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> +      && types_match (@0, @1)
> +      && tree_int_cst_lt (TYPE_SIZE (type), TYPE_SIZE (TREE_TYPE (@0))))))
> +
>  /* x >  y  &&  x != XXX_MIN  -->  x > y
>     x >  y  &&  x == XXX_MIN  -->  false . */
>  (for eqne (eq ne)
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index 57992b6deca..535743130f2 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -3738,6 +3738,87 @@ bitfields_to_lower_p (class loop *loop,
>    return !reads_to_lower.is_empty () || !writes_to_lower.is_empty ();
>  }
>
> +extern bool gimple_truncated_unsigned_integer_sat_sub (tree, tree*,
> +                                                      tree (*)(tree));
> +
> +/*
> + * Try to reconcile the stmt pattern as below to math the SAT_SUB
> + * in vectorization.  If and only if the related internal_fn has
> + * been implemented already.
> + *
> + * The reconcile will insert one new stmt named 'a' in below example,
> + * replace the stmt '4' by new added stmt 'b' as well.  Then the stmt
> + * pattern is able to hit the SAT_SUB pattern in the underlying pass.
> + *
> + * 1. _2 = a_11 - b_12(D);
> + * 2. iftmp.0_13 = (short unsigned int) _2;
> + * 3. _18 = a_11 >= b_12(D);
> + * 4. iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> + * ==>
> + * 1. _2 = a_11 - b_12(D);
> + * 3. _18 = a_11 >= b_12(D);
> + * a. pattmp = _18 ? _2 : 0;                     // New insertion
> + * 2. iftmp.0_13 = (short unsigned int) _pattmp; // Move before
> + * b. iftmp.0_5 = iftmp.0_13;
> + *    == Replace ==> 4. iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> + */
> +static void
> +tree_if_cond_reconcile_unsigned_integer_sat_sub (gimple_stmt_iterator *gsi,
> +                                                gassign *stmt)
> +{
> +  tree ops[2];
> +  tree lhs = gimple_assign_lhs (stmt);
> +  bool supported_p = direct_internal_fn_supported_p (IFN_SAT_SUB,
> +                                                    TREE_TYPE (lhs),
> +                                                    OPTIMIZE_FOR_BOTH);
> +
> +  if (supported_p && gimple_truncated_unsigned_integer_sat_sub (lhs, ops, NULL))
> +    {
> +      tree cond = gimple_assign_rhs1 (stmt); // aka _18
> +      tree truncated = gimple_assign_rhs2 (stmt); // aka iftmp.0_13
> +      gimple *stmt_2 = SSA_NAME_DEF_STMT (truncated);
> +      tree minus = gimple_assign_rhs1 (stmt_2); // aka _2
> +      tree raw_type = TREE_TYPE (minus);
> +      tree zero = build_zero_cst (raw_type);
> +      tree tmp = make_temp_ssa_name (raw_type, NULL, "sat_sub_tmp");
> +
> +      /* For stmt 'a' in above example  */
> +      gimple *stmt_a = gimple_build_assign (tmp, COND_EXPR, cond, minus, zero);
> +      gsi_insert_before (gsi, stmt_a, GSI_SAME_STMT);
> +      update_stmt (stmt_a);
> +
> +      /* For stmt '2' in above example  */
> +      gimple_stmt_iterator stmt_2_gsi = gsi_for_stmt (stmt_2);
> +      gsi_move_before (&stmt_2_gsi, gsi, GSI_SAME_STMT);
> +      gimple_assign_set_rhs1 (stmt_2, tmp);
> +      update_stmt (stmt_2);
> +
> +      /* For stmt 'b' in above example  */
> +      gimple *stmt_b = gimple_build_assign (lhs, NOP_EXPR, truncated);
> +      gsi_replace (gsi, stmt_b, /* update_eh_info */ true);
> +      update_stmt (stmt_b);
> +    }
> +}
> +
> +static void
> +tree_if_cond_reconcile (function *fun)
> +{
> +  basic_block bb;
> +  FOR_EACH_BB_FN (bb, fun)
> +    {
> +      gimple_stmt_iterator gsi;
> +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +       {
> +         gimple *stmt = gsi_stmt (gsi);
> +
> +         if (is_gimple_assign (stmt))
> +           {
> +             gassign *assign = dyn_cast <gassign *> (stmt);
> +             tree_if_cond_reconcile_unsigned_integer_sat_sub (&gsi, assign);
> +           }
> +       }
> +    }
> +}
>
>  /* If-convert LOOP when it is legal.  For the moment this pass has no
>     profitability analysis.  Returns non-zero todo flags when something
> @@ -4063,6 +4144,8 @@ pass_if_conversion::execute (function *fun)
>         }
>      }
>
> +  tree_if_cond_reconcile (fun);
> +
>    return 0;
>  }
>
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB
  2024-06-21  7:00 ` Richard Biener
@ 2024-06-21  8:50   ` Li, Pan2
  2024-06-21  9:28     ` Richard Biener
  0 siblings, 1 reply; 27+ messages in thread
From: Li, Pan2 @ 2024-06-21  8:50 UTC (permalink / raw)
  To: Richard Biener
  Cc: gcc-patches, juzhe.zhong, kito.cheng, jeffreyalaw, rdapp.gcc

Thanks Richard for comments.

> to match this by changing it to

> /* Unsigned saturation sub, case 2 (branch with ge):
>    SAT_U_SUB = X >= Y ? X - Y : 0.  */
> (match (unsigned_integer_sat_sub @0 @1)
> (cond^ (ge @0 @1) (convert? (minus @0 @1)) integer_zerop)
>  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
>       && types_match (type, @0, @1))))

Do we need another name for this matching? Adding (convert? here may change the semantics of .SAT_SUB.
When we call gimple_unsigned_integer_sat_sub (lhs, ops, NULL), the returned value may differ
from (minus @0 @1). Please correct me if my understanding is wrong.

> and when using the gimple_match_* function make sure to consider
> that the .SAT_SUB (@0, @1) is converted to the type of the SSA name
> we matched?

This may be a problem for the vector part, I guess; it required some additional change to vectorize_convert when
I tried that previously. Let me double check it, and keep you posted.

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: Friday, June 21, 2024 3:00 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB

On Fri, Jun 21, 2024 at 5:53 AM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> truncated as below:
>
> void test (uint16_t *x, unsigned b, unsigned n)
> {
>   unsigned a = 0;
>   register uint16_t *p = x;
>
>   do {
>     a = *--p;
>     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate the result of SAT_SUB
>   } while (--n);
> }
>
> It will have gimple after ifcvt pass,  it cannot hit any pattern of
> SAT_SUB and then cannot vectorize to SAT_SUB.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>
> This patch would like to do some reconcile for above pattern to match
> the SAT_SUB pattern.  Then the underlying vect pass is able to vectorize
> the SAT_SUB.

Hmm.  I was thinking of allowing

/* Unsigned saturation sub, case 2 (branch with ge):
   SAT_U_SUB = X >= Y ? X - Y : 0.  */
(match (unsigned_integer_sat_sub @0 @1)
 (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
 (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
      && types_match (type, @0, @1))))

to match this by changing it to

/* Unsigned saturation sub, case 2 (branch with ge):
   SAT_U_SUB = X >= Y ? X - Y : 0.  */
(match (unsigned_integer_sat_sub @0 @1)
 (cond^ (ge @0 @1) (convert? (minus @0 @1)) integer_zerop)
 (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
      && types_match (type, @0, @1))))

and when using the gimple_match_* function make sure to consider
that the .SAT_SUB (@0, @1) is converted to the type of the SSA name
we matched?

Richard.

> _2 = a_11 - b_12(D);
> _18 = a_11 >= b_12(D);
> _pattmp = _18 ? _2 : 0; // .SAT_SUB pattern
> iftmp.0_13 = (short unsigned int) _pattmp;
> iftmp.0_5 = iftmp.0_13;
>
> The below tests are running for this patch.
> 1. The rv64gcv fully regression tests.
> 2. The rv64gcv build with glibc.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
>
> gcc/ChangeLog:
>
>         * match.pd: Add new match for trunated unsigned sat_sub.
>         * tree-if-conv.cc (gimple_truncated_unsigned_integer_sat_sub):
>         New external decl from match.pd.
>         (tree_if_cond_reconcile_unsigned_integer_sat_sub): New func impl
>         to reconcile the truncated sat_sub pattern.
>         (tree_if_cond_reconcile): New func impl to reconcile.
>         (pass_if_conversion::execute): Try to reconcile after ifcvt.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd        |  9 +++++
>  gcc/tree-if-conv.cc | 83 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 92 insertions(+)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 3d0689c9312..9617a5f9d5e 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3210,6 +3210,15 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
>        && types_match (type, @0, @1))))
>
> +/* Unsigned saturation sub and then truncated, aka:
> +   Truncated = X >= Y ? (Other Type) (X - Y) : 0.
> + */
> +(match (truncated_unsigned_integer_sat_sub @0 @1)
> + (cond (ge @0 @1) (convert (minus @0 @1)) integer_zerop)
> + (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> +      && types_match (@0, @1)
> +      && tree_int_cst_lt (TYPE_SIZE (type), TYPE_SIZE (TREE_TYPE (@0))))))
> +
>  /* x >  y  &&  x != XXX_MIN  -->  x > y
>     x >  y  &&  x == XXX_MIN  -->  false . */
>  (for eqne (eq ne)
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index 57992b6deca..535743130f2 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -3738,6 +3738,87 @@ bitfields_to_lower_p (class loop *loop,
>    return !reads_to_lower.is_empty () || !writes_to_lower.is_empty ();
>  }
>
> +extern bool gimple_truncated_unsigned_integer_sat_sub (tree, tree*,
> +                                                      tree (*)(tree));
> +
> +/*
> + * Try to reconcile the stmt pattern as below to math the SAT_SUB
> + * in vectorization.  If and only if the related internal_fn has
> + * been implemented already.
> + *
> + * The reconcile will insert one new stmt named 'a' in below example,
> + * replace the stmt '4' by new added stmt 'b' as well.  Then the stmt
> + * pattern is able to hit the SAT_SUB pattern in the underlying pass.
> + *
> + * 1. _2 = a_11 - b_12(D);
> + * 2. iftmp.0_13 = (short unsigned int) _2;
> + * 3. _18 = a_11 >= b_12(D);
> + * 4. iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> + * ==>
> + * 1. _2 = a_11 - b_12(D);
> + * 3. _18 = a_11 >= b_12(D);
> + * a. pattmp = _18 ? _2 : 0;                     // New insertion
> + * 2. iftmp.0_13 = (short unsigned int) _pattmp; // Move before
> + * b. iftmp.0_5 = iftmp.0_13;
> + *    == Replace ==> 4. iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> + */
> +static void
> +tree_if_cond_reconcile_unsigned_integer_sat_sub (gimple_stmt_iterator *gsi,
> +                                                gassign *stmt)
> +{
> +  tree ops[2];
> +  tree lhs = gimple_assign_lhs (stmt);
> +  bool supported_p = direct_internal_fn_supported_p (IFN_SAT_SUB,
> +                                                    TREE_TYPE (lhs),
> +                                                    OPTIMIZE_FOR_BOTH);
> +
> +  if (supported_p && gimple_truncated_unsigned_integer_sat_sub (lhs, ops, NULL))
> +    {
> +      tree cond = gimple_assign_rhs1 (stmt); // aka _18
> +      tree truncated = gimple_assign_rhs2 (stmt); // aka iftmp.0_13
> +      gimple *stmt_2 = SSA_NAME_DEF_STMT (truncated);
> +      tree minus = gimple_assign_rhs1 (stmt_2); // aka _2
> +      tree raw_type = TREE_TYPE (minus);
> +      tree zero = build_zero_cst (raw_type);
> +      tree tmp = make_temp_ssa_name (raw_type, NULL, "sat_sub_tmp");
> +
> +      /* For stmt 'a' in above example  */
> +      gimple *stmt_a = gimple_build_assign (tmp, COND_EXPR, cond, minus, zero);
> +      gsi_insert_before (gsi, stmt_a, GSI_SAME_STMT);
> +      update_stmt (stmt_a);
> +
> +      /* For stmt '2' in above example  */
> +      gimple_stmt_iterator stmt_2_gsi = gsi_for_stmt (stmt_2);
> +      gsi_move_before (&stmt_2_gsi, gsi, GSI_SAME_STMT);
> +      gimple_assign_set_rhs1 (stmt_2, tmp);
> +      update_stmt (stmt_2);
> +
> +      /* For stmt 'b' in above example  */
> +      gimple *stmt_b = gimple_build_assign (lhs, NOP_EXPR, truncated);
> +      gsi_replace (gsi, stmt_b, /* update_eh_info */ true);
> +      update_stmt (stmt_b);
> +    }
> +}
> +
> +static void
> +tree_if_cond_reconcile (function *fun)
> +{
> +  basic_block bb;
> +  FOR_EACH_BB_FN (bb, fun)
> +    {
> +      gimple_stmt_iterator gsi;
> +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +       {
> +         gimple *stmt = gsi_stmt (gsi);
> +
> +         if (is_gimple_assign (stmt))
> +           {
> +             gassign *assign = dyn_cast <gassign *> (stmt);
> +             tree_if_cond_reconcile_unsigned_integer_sat_sub (&gsi, assign);
> +           }
> +       }
> +    }
> +}
>
>  /* If-convert LOOP when it is legal.  For the moment this pass has no
>     profitability analysis.  Returns non-zero todo flags when something
> @@ -4063,6 +4144,8 @@ pass_if_conversion::execute (function *fun)
>         }
>      }
>
> +  tree_if_cond_reconcile (fun);
> +
>    return 0;
>  }
>
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB
  2024-06-21  8:50   ` Li, Pan2
@ 2024-06-21  9:28     ` Richard Biener
  2024-06-21 14:45       ` Li, Pan2
  0 siblings, 1 reply; 27+ messages in thread
From: Richard Biener @ 2024-06-21  9:28 UTC (permalink / raw)
  To: Li, Pan2; +Cc: gcc-patches, juzhe.zhong, kito.cheng, jeffreyalaw, rdapp.gcc

On Fri, Jun 21, 2024 at 10:50 AM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Thanks Richard for comments.
>
> > to match this by changing it to
>
> > /* Unsigned saturation sub, case 2 (branch with ge):
> >    SAT_U_SUB = X >= Y ? X - Y : 0.  */
> > (match (unsigned_integer_sat_sub @0 @1)
> > (cond^ (ge @0 @1) (convert? (minus @0 @1)) integer_zerop)
> >  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> >       && types_match (type, @0, @1))))
>
> Do we need another name for this matching ? Add (convert? here may change the sematics of .SAT_SUB.
> When we call gimple_unsigned_integer_sat_sub (lhs, ops, NULL), the converted value may be returned different
> to the (minus @0 @1). Please correct me if my understanding is wrong.

I think gimple_unsigned_integer_sat_sub (lhs, ...) simply matches
(typeof LHS).SAT_SUB (ops[0], ops[1]) now, I don't think it's necessary to
handle the case where typef LHS and typeof ops[0] are equal specially?

> > and when using the gimple_match_* function make sure to consider
> > that the .SAT_SUB (@0, @1) is converted to the type of the SSA name
> > we matched?
>
> This may have problem for vector part I guess, require some additional change from vectorize_convert when
> I try to do that in previous. Let me double check about it, and keep you posted.

You are using gimple_unsigned_integer_sat_sub from pattern recognition, the
thing to do is simply to add a conversion stmt to the pattern sequence in case
the types differ?

But maybe I'm missing something.

Richard.

> Pan
>
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Friday, June 21, 2024 3:00 PM
> To: Li, Pan2 <pan2.li@intel.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> Subject: Re: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB
>
> On Fri, Jun 21, 2024 at 5:53 AM <pan2.li@intel.com> wrote:
> >
> > From: Pan Li <pan2.li@intel.com>
> >
> > The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> > truncated as below:
> >
> > void test (uint16_t *x, unsigned b, unsigned n)
> > {
> >   unsigned a = 0;
> >   register uint16_t *p = x;
> >
> >   do {
> >     a = *--p;
> >     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate the result of SAT_SUB
> >   } while (--n);
> > }
> >
> > It will have gimple after ifcvt pass,  it cannot hit any pattern of
> > SAT_SUB and then cannot vectorize to SAT_SUB.
> >
> > _2 = a_11 - b_12(D);
> > iftmp.0_13 = (short unsigned int) _2;
> > _18 = a_11 >= b_12(D);
> > iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> >
> > This patch would like to do some reconcile for above pattern to match
> > the SAT_SUB pattern.  Then the underlying vect pass is able to vectorize
> > the SAT_SUB.
>
> Hmm.  I was thinking of allowing
>
> /* Unsigned saturation sub, case 2 (branch with ge):
>    SAT_U_SUB = X >= Y ? X - Y : 0.  */
> (match (unsigned_integer_sat_sub @0 @1)
>  (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
>  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
>       && types_match (type, @0, @1))))
>
> to match this by changing it to
>
> /* Unsigned saturation sub, case 2 (branch with ge):
>    SAT_U_SUB = X >= Y ? X - Y : 0.  */
> (match (unsigned_integer_sat_sub @0 @1)
>  (cond^ (ge @0 @1) (convert? (minus @0 @1)) integer_zerop)
>  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
>       && types_match (type, @0, @1))))
>
> and when using the gimple_match_* function make sure to consider
> that the .SAT_SUB (@0, @1) is converted to the type of the SSA name
> we matched?
>
> Richard.
>
> > _2 = a_11 - b_12(D);
> > _18 = a_11 >= b_12(D);
> > _pattmp = _18 ? _2 : 0; // .SAT_SUB pattern
> > iftmp.0_13 = (short unsigned int) _pattmp;
> > iftmp.0_5 = iftmp.0_13;
> >
> > The below tests are running for this patch.
> > 1. The rv64gcv fully regression tests.
> > 2. The rv64gcv build with glibc.
> > 3. The x86 bootstrap tests.
> > 4. The x86 fully regression tests.
> >
> > gcc/ChangeLog:
> >
> >         * match.pd: Add new match for trunated unsigned sat_sub.
> >         * tree-if-conv.cc (gimple_truncated_unsigned_integer_sat_sub):
> >         New external decl from match.pd.
> >         (tree_if_cond_reconcile_unsigned_integer_sat_sub): New func impl
> >         to reconcile the truncated sat_sub pattern.
> >         (tree_if_cond_reconcile): New func impl to reconcile.
> >         (pass_if_conversion::execute): Try to reconcile after ifcvt.
> >
> > Signed-off-by: Pan Li <pan2.li@intel.com>
> > ---
> >  gcc/match.pd        |  9 +++++
> >  gcc/tree-if-conv.cc | 83 +++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 92 insertions(+)
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 3d0689c9312..9617a5f9d5e 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -3210,6 +3210,15 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> >        && types_match (type, @0, @1))))
> >
> > +/* Unsigned saturation sub and then truncated, aka:
> > +   Truncated = X >= Y ? (Other Type) (X - Y) : 0.
> > + */
> > +(match (truncated_unsigned_integer_sat_sub @0 @1)
> > + (cond (ge @0 @1) (convert (minus @0 @1)) integer_zerop)
> > + (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > +      && types_match (@0, @1)
> > +      && tree_int_cst_lt (TYPE_SIZE (type), TYPE_SIZE (TREE_TYPE (@0))))))
> > +
> >  /* x >  y  &&  x != XXX_MIN  -->  x > y
> >     x >  y  &&  x == XXX_MIN  -->  false . */
> >  (for eqne (eq ne)
> > diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> > index 57992b6deca..535743130f2 100644
> > --- a/gcc/tree-if-conv.cc
> > +++ b/gcc/tree-if-conv.cc
> > @@ -3738,6 +3738,87 @@ bitfields_to_lower_p (class loop *loop,
> >    return !reads_to_lower.is_empty () || !writes_to_lower.is_empty ();
> >  }
> >
> > +extern bool gimple_truncated_unsigned_integer_sat_sub (tree, tree*,
> > +                                                      tree (*)(tree));
> > +
> > +/*
> > + * Try to reconcile the stmt pattern as below to math the SAT_SUB
> > + * in vectorization.  If and only if the related internal_fn has
> > + * been implemented already.
> > + *
> > + * The reconcile will insert one new stmt named 'a' in below example,
> > + * replace the stmt '4' by new added stmt 'b' as well.  Then the stmt
> > + * pattern is able to hit the SAT_SUB pattern in the underlying pass.
> > + *
> > + * 1. _2 = a_11 - b_12(D);
> > + * 2. iftmp.0_13 = (short unsigned int) _2;
> > + * 3. _18 = a_11 >= b_12(D);
> > + * 4. iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > + * ==>
> > + * 1. _2 = a_11 - b_12(D);
> > + * 3. _18 = a_11 >= b_12(D);
> > + * a. pattmp = _18 ? _2 : 0;                     // New insertion
> > + * 2. iftmp.0_13 = (short unsigned int) _pattmp; // Move before
> > + * b. iftmp.0_5 = iftmp.0_13;
> > + *    == Replace ==> 4. iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > + */
> > +static void
> > +tree_if_cond_reconcile_unsigned_integer_sat_sub (gimple_stmt_iterator *gsi,
> > +                                                gassign *stmt)
> > +{
> > +  tree ops[2];
> > +  tree lhs = gimple_assign_lhs (stmt);
> > +  bool supported_p = direct_internal_fn_supported_p (IFN_SAT_SUB,
> > +                                                    TREE_TYPE (lhs),
> > +                                                    OPTIMIZE_FOR_BOTH);
> > +
> > +  if (supported_p && gimple_truncated_unsigned_integer_sat_sub (lhs, ops, NULL))
> > +    {
> > +      tree cond = gimple_assign_rhs1 (stmt); // aka _18
> > +      tree truncated = gimple_assign_rhs2 (stmt); // aka iftmp.0_13
> > +      gimple *stmt_2 = SSA_NAME_DEF_STMT (truncated);
> > +      tree minus = gimple_assign_rhs1 (stmt_2); // aka _2
> > +      tree raw_type = TREE_TYPE (minus);
> > +      tree zero = build_zero_cst (raw_type);
> > +      tree tmp = make_temp_ssa_name (raw_type, NULL, "sat_sub_tmp");
> > +
> > +      /* For stmt 'a' in above example  */
> > +      gimple *stmt_a = gimple_build_assign (tmp, COND_EXPR, cond, minus, zero);
> > +      gsi_insert_before (gsi, stmt_a, GSI_SAME_STMT);
> > +      update_stmt (stmt_a);
> > +
> > +      /* For stmt '2' in above example  */
> > +      gimple_stmt_iterator stmt_2_gsi = gsi_for_stmt (stmt_2);
> > +      gsi_move_before (&stmt_2_gsi, gsi, GSI_SAME_STMT);
> > +      gimple_assign_set_rhs1 (stmt_2, tmp);
> > +      update_stmt (stmt_2);
> > +
> > +      /* For stmt 'b' in above example  */
> > +      gimple *stmt_b = gimple_build_assign (lhs, NOP_EXPR, truncated);
> > +      gsi_replace (gsi, stmt_b, /* update_eh_info */ true);
> > +      update_stmt (stmt_b);
> > +    }
> > +}
> > +
> > +static void
> > +tree_if_cond_reconcile (function *fun)
> > +{
> > +  basic_block bb;
> > +  FOR_EACH_BB_FN (bb, fun)
> > +    {
> > +      gimple_stmt_iterator gsi;
> > +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> > +       {
> > +         gimple *stmt = gsi_stmt (gsi);
> > +
> > +         if (is_gimple_assign (stmt))
> > +           {
> > +             gassign *assign = dyn_cast <gassign *> (stmt);
> > +             tree_if_cond_reconcile_unsigned_integer_sat_sub (&gsi, assign);
> > +           }
> > +       }
> > +    }
> > +}
> >
> >  /* If-convert LOOP when it is legal.  For the moment this pass has no
> >     profitability analysis.  Returns non-zero todo flags when something
> > @@ -4063,6 +4144,8 @@ pass_if_conversion::execute (function *fun)
> >         }
> >      }
> >
> > +  tree_if_cond_reconcile (fun);
> > +
> >    return 0;
> >  }
> >
> > --
> > 2.34.1
> >

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB
  2024-06-21  9:28     ` Richard Biener
@ 2024-06-21 14:45       ` Li, Pan2
  2024-06-22 13:19         ` Richard Biener
  0 siblings, 1 reply; 27+ messages in thread
From: Li, Pan2 @ 2024-06-21 14:45 UTC (permalink / raw)
  To: Richard Biener
  Cc: gcc-patches, juzhe.zhong, kito.cheng, jeffreyalaw, rdapp.gcc

Thanks Richard for the suggestion. I tried the (convert? approach with the gimple stmt below but got a missing-definition ICE.
To double confirm, the *type_out should be the vector type of lhs, and we only need to build
one cvt stmt from itype to otype here. Or just return the call directly and set the type_out to the v_otype?

static gimple *
vect_recog_build_binary_gimple_stmt (vec_info *vinfo, gimple *stmt,
                                     internal_fn fn, tree *type_out,
                                     tree lhs, tree op_0, tree op_1)
{
  tree itype = TREE_TYPE (op_0);
  tree otype = TREE_TYPE (lhs);
  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);

  if (v_itype != NULL_TREE && v_otype != NULL_TREE
    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
    {
      gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
      tree itype_ssa = vect_recog_temp_ssa_var (itype, NULL);

      gimple_call_set_lhs (call, itype_ssa);
      gimple_call_set_nothrow (call, /* nothrow_p */ false);
      gimple_set_location (call, gimple_location (stmt));

      *type_out = v_otype;
      gimple *new_stmt = call;

      if (itype != otype)
        {
          tree otype_ssa = vect_recog_temp_ssa_var (otype, NULL);
          new_stmt = gimple_build_assign (otype_ssa, CONVERT_EXPR, itype_ssa);
        }

      return new_stmt;
    }

  return NULL;
}

-----cut the ice---

zip.test.c: In function ‘test’:
zip.test.c:4:6: error: missing definition
    4 | void test (uint16_t *x, unsigned b, unsigned n)
      |      ^~~~
for SSA_NAME: patt_40 in statement:
vect_cst__151 = [vec_duplicate_expr] patt_40;
during GIMPLE pass: vect
dump file: zip.test.c.180t.vect
zip.test.c:4:6: internal compiler error: verify_ssa failed
0x1de0860 verify_ssa(bool, bool)
        /home/pli/gcc/555/riscv-gnu-toolchain/gcc/__RISCV_BUILD__/../gcc/tree-ssa.cc:1203
0x1919f69 execute_function_todo
        /home/pli/gcc/555/riscv-gnu-toolchain/gcc/__RISCV_BUILD__/../gcc/passes.cc:2096
0x1918b46 do_per_function
        /home/pli/gcc/555/riscv-gnu-toolchain/gcc/__RISCV_BUILD__/../gcc/passes.cc:1688
0x191a116 execute_todo

Pan


-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: Friday, June 21, 2024 5:29 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB

On Fri, Jun 21, 2024 at 10:50 AM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Thanks Richard for comments.
>
> > to match this by changing it to
>
> > /* Unsigned saturation sub, case 2 (branch with ge):
> >    SAT_U_SUB = X >= Y ? X - Y : 0.  */
> > (match (unsigned_integer_sat_sub @0 @1)
> > (cond^ (ge @0 @1) (convert? (minus @0 @1)) integer_zerop)
> >  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> >       && types_match (type, @0, @1))))
>
> Do we need another name for this matching?  Adding (convert? here may change the semantics of .SAT_SUB.
> When we call gimple_unsigned_integer_sat_sub (lhs, ops, NULL), the converted value may be returned different
> to the (minus @0 @1). Please correct me if my understanding is wrong.

I think gimple_unsigned_integer_sat_sub (lhs, ...) simply matches
(typeof LHS).SAT_SUB (ops[0], ops[1]) now, I don't think it's necessary to
handle the case where typeof LHS and typeof ops[0] are equal specially?

> > and when using the gimple_match_* function make sure to consider
> > that the .SAT_SUB (@0, @1) is converted to the type of the SSA name
> > we matched?
>
> This may have problem for vector part I guess, require some additional change from vectorize_convert when
> I try to do that in previous. Let me double check about it, and keep you posted.

You are using gimple_unsigned_integer_sat_sub from pattern recognition, the
thing to do is simply to add a conversion stmt to the pattern sequence in case
the types differ?

But maybe I'm missing something.

Richard.

> Pan
>
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Friday, June 21, 2024 3:00 PM
> To: Li, Pan2 <pan2.li@intel.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> Subject: Re: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB
>
> On Fri, Jun 21, 2024 at 5:53 AM <pan2.li@intel.com> wrote:
> >
> > From: Pan Li <pan2.li@intel.com>
> >
> > The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> > truncated as below:
> >
> > void test (uint16_t *x, unsigned b, unsigned n)
> > {
> >   unsigned a = 0;
> >   register uint16_t *p = x;
> >
> >   do {
> >     a = *--p;
> >     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate the result of SAT_SUB
> >   } while (--n);
> > }
> >
> > It will have gimple after ifcvt pass,  it cannot hit any pattern of
> > SAT_SUB and then cannot vectorize to SAT_SUB.
> >
> > _2 = a_11 - b_12(D);
> > iftmp.0_13 = (short unsigned int) _2;
> > _18 = a_11 >= b_12(D);
> > iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> >
> > This patch would like to do some reconcile for above pattern to match
> > the SAT_SUB pattern.  Then the underlying vect pass is able to vectorize
> > the SAT_SUB.
>
> Hmm.  I was thinking of allowing
>
> /* Unsigned saturation sub, case 2 (branch with ge):
>    SAT_U_SUB = X >= Y ? X - Y : 0.  */
> (match (unsigned_integer_sat_sub @0 @1)
>  (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
>  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
>       && types_match (type, @0, @1))))
>
> to match this by changing it to
>
> /* Unsigned saturation sub, case 2 (branch with ge):
>    SAT_U_SUB = X >= Y ? X - Y : 0.  */
> (match (unsigned_integer_sat_sub @0 @1)
>  (cond^ (ge @0 @1) (convert? (minus @0 @1)) integer_zerop)
>  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
>       && types_match (type, @0, @1))))
>
> and when using the gimple_match_* function make sure to consider
> that the .SAT_SUB (@0, @1) is converted to the type of the SSA name
> we matched?
>
> Richard.
>
> > _2 = a_11 - b_12(D);
> > _18 = a_11 >= b_12(D);
> > _pattmp = _18 ? _2 : 0; // .SAT_SUB pattern
> > iftmp.0_13 = (short unsigned int) _pattmp;
> > iftmp.0_5 = iftmp.0_13;
> >
> > The below tests are running for this patch.
> > 1. The rv64gcv fully regression tests.
> > 2. The rv64gcv build with glibc.
> > 3. The x86 bootstrap tests.
> > 4. The x86 fully regression tests.
> >
> > gcc/ChangeLog:
> >
> >         * match.pd: Add new match for truncated unsigned sat_sub.
> >         * tree-if-conv.cc (gimple_truncated_unsigned_integer_sat_sub):
> >         New external decl from match.pd.
> >         (tree_if_cond_reconcile_unsigned_integer_sat_sub): New func impl
> >         to reconcile the truncated sat_sub pattern.
> >         (tree_if_cond_reconcile): New func impl to reconcile.
> >         (pass_if_conversion::execute): Try to reconcile after ifcvt.
> >
> > Signed-off-by: Pan Li <pan2.li@intel.com>
> > ---
> >  gcc/match.pd        |  9 +++++
> >  gcc/tree-if-conv.cc | 83 +++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 92 insertions(+)
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 3d0689c9312..9617a5f9d5e 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -3210,6 +3210,15 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> >        && types_match (type, @0, @1))))
> >
> > +/* Unsigned saturation sub and then truncated, aka:
> > +   Truncated = X >= Y ? (Other Type) (X - Y) : 0.
> > + */
> > +(match (truncated_unsigned_integer_sat_sub @0 @1)
> > + (cond (ge @0 @1) (convert (minus @0 @1)) integer_zerop)
> > + (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > +      && types_match (@0, @1)
> > +      && tree_int_cst_lt (TYPE_SIZE (type), TYPE_SIZE (TREE_TYPE (@0))))))
> > +
> >  /* x >  y  &&  x != XXX_MIN  -->  x > y
> >     x >  y  &&  x == XXX_MIN  -->  false . */
> >  (for eqne (eq ne)
> > diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> > index 57992b6deca..535743130f2 100644
> > --- a/gcc/tree-if-conv.cc
> > +++ b/gcc/tree-if-conv.cc
> > @@ -3738,6 +3738,87 @@ bitfields_to_lower_p (class loop *loop,
> >    return !reads_to_lower.is_empty () || !writes_to_lower.is_empty ();
> >  }
> >
> > +extern bool gimple_truncated_unsigned_integer_sat_sub (tree, tree*,
> > +                                                      tree (*)(tree));
> > +
> > +/*
> > + * Try to reconcile the stmt pattern as below to match the SAT_SUB
> > + * in vectorization.  If and only if the related internal_fn has
> > + * been implemented already.
> > + *
> > + * The reconcile will insert one new stmt named 'a' in below example,
> > + * replace the stmt '4' by new added stmt 'b' as well.  Then the stmt
> > + * pattern is able to hit the SAT_SUB pattern in the underlying pass.
> > + *
> > + * 1. _2 = a_11 - b_12(D);
> > + * 2. iftmp.0_13 = (short unsigned int) _2;
> > + * 3. _18 = a_11 >= b_12(D);
> > + * 4. iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > + * ==>
> > + * 1. _2 = a_11 - b_12(D);
> > + * 3. _18 = a_11 >= b_12(D);
> > + * a. pattmp = _18 ? _2 : 0;                     // New insertion
> > + * 2. iftmp.0_13 = (short unsigned int) _pattmp; // Move before
> > + * b. iftmp.0_5 = iftmp.0_13;
> > + *    == Replace ==> 4. iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > + */
> > +static void
> > +tree_if_cond_reconcile_unsigned_integer_sat_sub (gimple_stmt_iterator *gsi,
> > +                                                gassign *stmt)
> > +{
> > +  tree ops[2];
> > +  tree lhs = gimple_assign_lhs (stmt);
> > +  bool supported_p = direct_internal_fn_supported_p (IFN_SAT_SUB,
> > +                                                    TREE_TYPE (lhs),
> > +                                                    OPTIMIZE_FOR_BOTH);
> > +
> > +  if (supported_p && gimple_truncated_unsigned_integer_sat_sub (lhs, ops, NULL))
> > +    {
> > +      tree cond = gimple_assign_rhs1 (stmt); // aka _18
> > +      tree truncated = gimple_assign_rhs2 (stmt); // aka iftmp.0_13
> > +      gimple *stmt_2 = SSA_NAME_DEF_STMT (truncated);
> > +      tree minus = gimple_assign_rhs1 (stmt_2); // aka _2
> > +      tree raw_type = TREE_TYPE (minus);
> > +      tree zero = build_zero_cst (raw_type);
> > +      tree tmp = make_temp_ssa_name (raw_type, NULL, "sat_sub_tmp");
> > +
> > +      /* For stmt 'a' in above example  */
> > +      gimple *stmt_a = gimple_build_assign (tmp, COND_EXPR, cond, minus, zero);
> > +      gsi_insert_before (gsi, stmt_a, GSI_SAME_STMT);
> > +      update_stmt (stmt_a);
> > +
> > +      /* For stmt '2' in above example  */
> > +      gimple_stmt_iterator stmt_2_gsi = gsi_for_stmt (stmt_2);
> > +      gsi_move_before (&stmt_2_gsi, gsi, GSI_SAME_STMT);
> > +      gimple_assign_set_rhs1 (stmt_2, tmp);
> > +      update_stmt (stmt_2);
> > +
> > +      /* For stmt 'b' in above example  */
> > +      gimple *stmt_b = gimple_build_assign (lhs, NOP_EXPR, truncated);
> > +      gsi_replace (gsi, stmt_b, /* update_eh_info */ true);
> > +      update_stmt (stmt_b);
> > +    }
> > +}
> > +
> > +static void
> > +tree_if_cond_reconcile (function *fun)
> > +{
> > +  basic_block bb;
> > +  FOR_EACH_BB_FN (bb, fun)
> > +    {
> > +      gimple_stmt_iterator gsi;
> > +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> > +       {
> > +         gimple *stmt = gsi_stmt (gsi);
> > +
> > +         if (is_gimple_assign (stmt))
> > +           {
> > +             gassign *assign = dyn_cast <gassign *> (stmt);
> > +             tree_if_cond_reconcile_unsigned_integer_sat_sub (&gsi, assign);
> > +           }
> > +       }
> > +    }
> > +}
> >
> >  /* If-convert LOOP when it is legal.  For the moment this pass has no
> >     profitability analysis.  Returns non-zero todo flags when something
> > @@ -4063,6 +4144,8 @@ pass_if_conversion::execute (function *fun)
> >         }
> >      }
> >
> > +  tree_if_cond_reconcile (fun);
> > +
> >    return 0;
> >  }
> >
> > --
> > 2.34.1
> >

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB
  2024-06-21 14:45       ` Li, Pan2
@ 2024-06-22 13:19         ` Richard Biener
  2024-06-23 12:57           ` Li, Pan2
  0 siblings, 1 reply; 27+ messages in thread
From: Richard Biener @ 2024-06-22 13:19 UTC (permalink / raw)
  To: Li, Pan2; +Cc: gcc-patches, juzhe.zhong, kito.cheng, jeffreyalaw, rdapp.gcc

On Fri, Jun 21, 2024 at 4:45 PM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Thanks Richard for the suggestion. I tried the (convert? approach with the gimple stmt below but got a missing-definition ICE.
> To double confirm, the *type_out should be the vector type of lhs, and we only need to build
> one cvt stmt from itype to otype here. Or just return the call directly and set the type_out to the v_otype?
>
> static gimple *
> vect_recog_build_binary_gimple_stmt (vec_info *vinfo, gimple *stmt,
>                                      internal_fn fn, tree *type_out,
>                                      tree lhs, tree op_0, tree op_1)
> {
>   tree itype = TREE_TYPE (op_0);
>   tree otype = TREE_TYPE (lhs);
>   tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
>   tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
>
>   if (v_itype != NULL_TREE && v_otype != NULL_TREE
>     && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
>     {
>       gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
>       tree itype_ssa = vect_recog_temp_ssa_var (itype, NULL);
>
>       gimple_call_set_lhs (call, itype_ssa);
>       gimple_call_set_nothrow (call, /* nothrow_p */ false);
>       gimple_set_location (call, gimple_location (stmt));
>
>       *type_out = v_otype;
>       gimple *new_stmt = call;
>
>       if (itype != otype)
>         {
>           tree otype_ssa = vect_recog_temp_ssa_var (otype, NULL);
>           new_stmt = gimple_build_assign (otype_ssa, CONVERT_EXPR, itype_ssa);
>         }
>
>       return new_stmt;

You need to refactor this to add to the stmts pattern def sequence
(look for append_pattern_def_seq uses for example)

>     }
>
>   return NULL;
> }
>
> -----cut the ice---
>
> zip.test.c: In function ‘test’:
> zip.test.c:4:6: error: missing definition
>     4 | void test (uint16_t *x, unsigned b, unsigned n)
>       |      ^~~~
> for SSA_NAME: patt_40 in statement:
> vect_cst__151 = [vec_duplicate_expr] patt_40;
> during GIMPLE pass: vect
> dump file: zip.test.c.180t.vect
> zip.test.c:4:6: internal compiler error: verify_ssa failed
> 0x1de0860 verify_ssa(bool, bool)
>         /home/pli/gcc/555/riscv-gnu-toolchain/gcc/__RISCV_BUILD__/../gcc/tree-ssa.cc:1203
> 0x1919f69 execute_function_todo
>         /home/pli/gcc/555/riscv-gnu-toolchain/gcc/__RISCV_BUILD__/../gcc/passes.cc:2096
> 0x1918b46 do_per_function
>         /home/pli/gcc/555/riscv-gnu-toolchain/gcc/__RISCV_BUILD__/../gcc/passes.cc:1688
> 0x191a116 execute_todo
>
> Pan
>
>
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Friday, June 21, 2024 5:29 PM
> To: Li, Pan2 <pan2.li@intel.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> Subject: Re: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB
>
> On Fri, Jun 21, 2024 at 10:50 AM Li, Pan2 <pan2.li@intel.com> wrote:
> >
> > Thanks Richard for comments.
> >
> > > to match this by changing it to
> >
> > > /* Unsigned saturation sub, case 2 (branch with ge):
> > >    SAT_U_SUB = X >= Y ? X - Y : 0.  */
> > > (match (unsigned_integer_sat_sub @0 @1)
> > > (cond^ (ge @0 @1) (convert? (minus @0 @1)) integer_zerop)
> > >  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > >       && types_match (type, @0, @1))))
> >
> > Do we need another name for this matching?  Adding (convert? here may change the semantics of .SAT_SUB.
> > When we call gimple_unsigned_integer_sat_sub (lhs, ops, NULL), the converted value may be returned different
> > to the (minus @0 @1). Please correct me if my understanding is wrong.
>
> I think gimple_unsigned_integer_sat_sub (lhs, ...) simply matches
> (typeof LHS).SAT_SUB (ops[0], ops[1]) now, I don't think it's necessary to
> handle the case where typeof LHS and typeof ops[0] are equal specially?
>
> > > and when using the gimple_match_* function make sure to consider
> > > that the .SAT_SUB (@0, @1) is converted to the type of the SSA name
> > > we matched?
> >
> > This may have problem for vector part I guess, require some additional change from vectorize_convert when
> > I try to do that in previous. Let me double check about it, and keep you posted.
>
> You are using gimple_unsigned_integer_sat_sub from pattern recognition, the
> thing to do is simply to add a conversion stmt to the pattern sequence in case
> the types differ?
>
> But maybe I'm missing something.
>
> Richard.
>
> > Pan
> >
> > -----Original Message-----
> > From: Richard Biener <richard.guenther@gmail.com>
> > Sent: Friday, June 21, 2024 3:00 PM
> > To: Li, Pan2 <pan2.li@intel.com>
> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> > Subject: Re: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB
> >
> > On Fri, Jun 21, 2024 at 5:53 AM <pan2.li@intel.com> wrote:
> > >
> > > From: Pan Li <pan2.li@intel.com>
> > >
> > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> > > truncated as below:
> > >
> > > void test (uint16_t *x, unsigned b, unsigned n)
> > > {
> > >   unsigned a = 0;
> > >   register uint16_t *p = x;
> > >
> > >   do {
> > >     a = *--p;
> > >     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate the result of SAT_SUB
> > >   } while (--n);
> > > }
> > >
> > > It will have gimple after ifcvt pass,  it cannot hit any pattern of
> > > SAT_SUB and then cannot vectorize to SAT_SUB.
> > >
> > > _2 = a_11 - b_12(D);
> > > iftmp.0_13 = (short unsigned int) _2;
> > > _18 = a_11 >= b_12(D);
> > > iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > >
> > > This patch would like to do some reconcile for above pattern to match
> > > the SAT_SUB pattern.  Then the underlying vect pass is able to vectorize
> > > the SAT_SUB.
> >
> > Hmm.  I was thinking of allowing
> >
> > /* Unsigned saturation sub, case 2 (branch with ge):
> >    SAT_U_SUB = X >= Y ? X - Y : 0.  */
> > (match (unsigned_integer_sat_sub @0 @1)
> >  (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> >  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> >       && types_match (type, @0, @1))))
> >
> > to match this by changing it to
> >
> > /* Unsigned saturation sub, case 2 (branch with ge):
> >    SAT_U_SUB = X >= Y ? X - Y : 0.  */
> > (match (unsigned_integer_sat_sub @0 @1)
> >  (cond^ (ge @0 @1) (convert? (minus @0 @1)) integer_zerop)
> >  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> >       && types_match (type, @0, @1))))
> >
> > and when using the gimple_match_* function make sure to consider
> > that the .SAT_SUB (@0, @1) is converted to the type of the SSA name
> > we matched?
> >
> > Richard.
> >
> > > _2 = a_11 - b_12(D);
> > > _18 = a_11 >= b_12(D);
> > > _pattmp = _18 ? _2 : 0; // .SAT_SUB pattern
> > > iftmp.0_13 = (short unsigned int) _pattmp;
> > > iftmp.0_5 = iftmp.0_13;
> > >
> > > The below tests are running for this patch.
> > > 1. The rv64gcv fully regression tests.
> > > 2. The rv64gcv build with glibc.
> > > 3. The x86 bootstrap tests.
> > > 4. The x86 fully regression tests.
> > >
> > > gcc/ChangeLog:
> > >
> > >         * match.pd: Add new match for truncated unsigned sat_sub.
> > >         * tree-if-conv.cc (gimple_truncated_unsigned_integer_sat_sub):
> > >         New external decl from match.pd.
> > >         (tree_if_cond_reconcile_unsigned_integer_sat_sub): New func impl
> > >         to reconcile the truncated sat_sub pattern.
> > >         (tree_if_cond_reconcile): New func impl to reconcile.
> > >         (pass_if_conversion::execute): Try to reconcile after ifcvt.
> > >
> > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > ---
> > >  gcc/match.pd        |  9 +++++
> > >  gcc/tree-if-conv.cc | 83 +++++++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 92 insertions(+)
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index 3d0689c9312..9617a5f9d5e 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -3210,6 +3210,15 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > >        && types_match (type, @0, @1))))
> > >
> > > +/* Unsigned saturation sub and then truncated, aka:
> > > +   Truncated = X >= Y ? (Other Type) (X - Y) : 0.
> > > + */
> > > +(match (truncated_unsigned_integer_sat_sub @0 @1)
> > > + (cond (ge @0 @1) (convert (minus @0 @1)) integer_zerop)
> > > + (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > > +      && types_match (@0, @1)
> > > +      && tree_int_cst_lt (TYPE_SIZE (type), TYPE_SIZE (TREE_TYPE (@0))))))
> > > +
> > >  /* x >  y  &&  x != XXX_MIN  -->  x > y
> > >     x >  y  &&  x == XXX_MIN  -->  false . */
> > >  (for eqne (eq ne)
> > > diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> > > index 57992b6deca..535743130f2 100644
> > > --- a/gcc/tree-if-conv.cc
> > > +++ b/gcc/tree-if-conv.cc
> > > @@ -3738,6 +3738,87 @@ bitfields_to_lower_p (class loop *loop,
> > >    return !reads_to_lower.is_empty () || !writes_to_lower.is_empty ();
> > >  }
> > >
> > > +extern bool gimple_truncated_unsigned_integer_sat_sub (tree, tree*,
> > > +                                                      tree (*)(tree));
> > > +
> > > +/*
> > > + * Try to reconcile the stmt pattern as below to match the SAT_SUB
> > > + * in vectorization.  If and only if the related internal_fn has
> > > + * been implemented already.
> > > + *
> > > + * The reconcile will insert one new stmt named 'a' in below example,
> > > + * replace the stmt '4' by new added stmt 'b' as well.  Then the stmt
> > > + * pattern is able to hit the SAT_SUB pattern in the underlying pass.
> > > + *
> > > + * 1. _2 = a_11 - b_12(D);
> > > + * 2. iftmp.0_13 = (short unsigned int) _2;
> > > + * 3. _18 = a_11 >= b_12(D);
> > > + * 4. iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > > + * ==>
> > > + * 1. _2 = a_11 - b_12(D);
> > > + * 3. _18 = a_11 >= b_12(D);
> > > + * a. pattmp = _18 ? _2 : 0;                     // New insertion
> > > + * 2. iftmp.0_13 = (short unsigned int) _pattmp; // Move before
> > > + * b. iftmp.0_5 = iftmp.0_13;
> > > + *    == Replace ==> 4. iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > > + */
> > > +static void
> > > +tree_if_cond_reconcile_unsigned_integer_sat_sub (gimple_stmt_iterator *gsi,
> > > +                                                gassign *stmt)
> > > +{
> > > +  tree ops[2];
> > > +  tree lhs = gimple_assign_lhs (stmt);
> > > +  bool supported_p = direct_internal_fn_supported_p (IFN_SAT_SUB,
> > > +                                                    TREE_TYPE (lhs),
> > > +                                                    OPTIMIZE_FOR_BOTH);
> > > +
> > > +  if (supported_p && gimple_truncated_unsigned_integer_sat_sub (lhs, ops, NULL))
> > > +    {
> > > +      tree cond = gimple_assign_rhs1 (stmt); // aka _18
> > > +      tree truncated = gimple_assign_rhs2 (stmt); // aka iftmp.0_13
> > > +      gimple *stmt_2 = SSA_NAME_DEF_STMT (truncated);
> > > +      tree minus = gimple_assign_rhs1 (stmt_2); // aka _2
> > > +      tree raw_type = TREE_TYPE (minus);
> > > +      tree zero = build_zero_cst (raw_type);
> > > +      tree tmp = make_temp_ssa_name (raw_type, NULL, "sat_sub_tmp");
> > > +
> > > +      /* For stmt 'a' in above example  */
> > > +      gimple *stmt_a = gimple_build_assign (tmp, COND_EXPR, cond, minus, zero);
> > > +      gsi_insert_before (gsi, stmt_a, GSI_SAME_STMT);
> > > +      update_stmt (stmt_a);
> > > +
> > > +      /* For stmt '2' in above example  */
> > > +      gimple_stmt_iterator stmt_2_gsi = gsi_for_stmt (stmt_2);
> > > +      gsi_move_before (&stmt_2_gsi, gsi, GSI_SAME_STMT);
> > > +      gimple_assign_set_rhs1 (stmt_2, tmp);
> > > +      update_stmt (stmt_2);
> > > +
> > > +      /* For stmt 'b' in above example  */
> > > +      gimple *stmt_b = gimple_build_assign (lhs, NOP_EXPR, truncated);
> > > +      gsi_replace (gsi, stmt_b, /* update_eh_info */ true);
> > > +      update_stmt (stmt_b);
> > > +    }
> > > +}
> > > +
> > > +static void
> > > +tree_if_cond_reconcile (function *fun)
> > > +{
> > > +  basic_block bb;
> > > +  FOR_EACH_BB_FN (bb, fun)
> > > +    {
> > > +      gimple_stmt_iterator gsi;
> > > +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> > > +       {
> > > +         gimple *stmt = gsi_stmt (gsi);
> > > +
> > > +         if (is_gimple_assign (stmt))
> > > +           {
> > > +             gassign *assign = dyn_cast <gassign *> (stmt);
> > > +             tree_if_cond_reconcile_unsigned_integer_sat_sub (&gsi, assign);
> > > +           }
> > > +       }
> > > +    }
> > > +}
> > >
> > >  /* If-convert LOOP when it is legal.  For the moment this pass has no
> > >     profitability analysis.  Returns non-zero todo flags when something
> > > @@ -4063,6 +4144,8 @@ pass_if_conversion::execute (function *fun)
> > >         }
> > >      }
> > >
> > > +  tree_if_cond_reconcile (fun);
> > > +
> > >    return 0;
> > >  }
> > >
> > > --
> > > 2.34.1
> > >

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB
  2024-06-22 13:19         ` Richard Biener
@ 2024-06-23 12:57           ` Li, Pan2
  0 siblings, 0 replies; 27+ messages in thread
From: Li, Pan2 @ 2024-06-23 12:57 UTC (permalink / raw)
  To: Richard Biener
  Cc: gcc-patches, juzhe.zhong, kito.cheng, jeffreyalaw, rdapp.gcc

> You need to refactor this to add to the stmts pattern def sequence
>  (look for append_pattern_def_seq uses for example)

Thanks Richard, really save my day, will have a try in v2.

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: Saturday, June 22, 2024 9:19 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB

On Fri, Jun 21, 2024 at 4:45 PM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Thanks Richard for the suggestion; I tried the (convert? with the gimple stmt below but got a missing-definition ICE.
> To double confirm, the *type_out should be the vector type of lhs, and we only need to build
> one cvt stmt from itype to otype here. Or just return the call directly and set the type_out to the v_otype?
>
> static gimple *
> vect_recog_build_binary_gimple_stmt (vec_info *vinfo, gimple *stmt,
>                                      internal_fn fn, tree *type_out,
>                                      tree lhs, tree op_0, tree op_1)
> {
>   tree itype = TREE_TYPE (op_0);
>   tree otype = TREE_TYPE (lhs);
>   tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
>   tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
>
>   if (v_itype != NULL_TREE && v_otype != NULL_TREE
>     && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
>     {
>       gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
>       tree itype_ssa = vect_recog_temp_ssa_var (itype, NULL);
>
>       gimple_call_set_lhs (call, itype_ssa);
>       gimple_call_set_nothrow (call, /* nothrow_p */ false);
>       gimple_set_location (call, gimple_location (stmt));
>
>       *type_out = v_otype;
>       gimple *new_stmt = call;
>
>       if (itype != otype)
>         {
>           tree otype_ssa = vect_recog_temp_ssa_var (otype, NULL);
>           new_stmt = gimple_build_assign (otype_ssa, CONVERT_EXPR, itype_ssa);
>         }
>
>       return new_stmt;

You need to refactor this to add to the stmts pattern def sequence
(look for append_pattern_def_seq uses for example)

>     }
>
>   return NULL;
> }
>
> -----cut the ice---
>
> zip.test.c: In function ‘test’:
> zip.test.c:4:6: error: missing definition
>     4 | void test (uint16_t *x, unsigned b, unsigned n)
>       |      ^~~~
> for SSA_NAME: patt_40 in statement:
> vect_cst__151 = [vec_duplicate_expr] patt_40;
> during GIMPLE pass: vect
> dump file: zip.test.c.180t.vect
> zip.test.c:4:6: internal compiler error: verify_ssa failed
> 0x1de0860 verify_ssa(bool, bool)
>         /home/pli/gcc/555/riscv-gnu-toolchain/gcc/__RISCV_BUILD__/../gcc/tree-ssa.cc:1203
> 0x1919f69 execute_function_todo
>         /home/pli/gcc/555/riscv-gnu-toolchain/gcc/__RISCV_BUILD__/../gcc/passes.cc:2096
> 0x1918b46 do_per_function
>         /home/pli/gcc/555/riscv-gnu-toolchain/gcc/__RISCV_BUILD__/../gcc/passes.cc:1688
> 0x191a116 execute_todo
>
> Pan
>
>
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Friday, June 21, 2024 5:29 PM
> To: Li, Pan2 <pan2.li@intel.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> Subject: Re: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB
>
> On Fri, Jun 21, 2024 at 10:50 AM Li, Pan2 <pan2.li@intel.com> wrote:
> >
> > Thanks Richard for comments.
> >
> > > to match this by changing it to
> >
> > > /* Unsigned saturation sub, case 2 (branch with ge):
> > >    SAT_U_SUB = X >= Y ? X - Y : 0.  */
> > > (match (unsigned_integer_sat_sub @0 @1)
> > > (cond^ (ge @0 @1) (convert? (minus @0 @1)) integer_zerop)
> > >  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > >       && types_match (type, @0, @1))))
> >
> > Do we need another name for this matching ? Add (convert? here may change the sematics of .SAT_SUB.
> > When we call gimple_unsigned_integer_sat_sub (lhs, ops, NULL), the converted value may be returned different
> > to the (minus @0 @1). Please correct me if my understanding is wrong.
>
> I think gimple_unsigned_integer_sat_sub (lhs, ...) simply matches
> (typeof LHS).SAT_SUB (ops[0], ops[1]) now, I don't think it's necessary to
> handle the case where typef LHS and typeof ops[0] are equal specially?
>
> > > and when using the gimple_match_* function make sure to consider
> > > that the .SAT_SUB (@0, @1) is converted to the type of the SSA name
> > > we matched?
> >
> > This may have problem for vector part I guess, require some additional change from vectorize_convert when
> > I try to do that in previous. Let me double check about it, and keep you posted.
>
> You are using gimple_unsigned_integer_sat_sub from pattern recognition, the
> thing to do is simply to add a conversion stmt to the pattern sequence in case
> the types differ?
>
> But maybe I'm missing something.
>
> Richard.
>
> > Pan
> >
> > -----Original Message-----
> > From: Richard Biener <richard.guenther@gmail.com>
> > Sent: Friday, June 21, 2024 3:00 PM
> > To: Li, Pan2 <pan2.li@intel.com>
> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> > Subject: Re: [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB
> >
> > On Fri, Jun 21, 2024 at 5:53 AM <pan2.li@intel.com> wrote:
> > >
> > > From: Pan Li <pan2.li@intel.com>
> > >
> > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> > > truncated as below:
> > >
> > > void test (uint16_t *x, unsigned b, unsigned n)
> > > {
> > >   unsigned a = 0;
> > >   register uint16_t *p = x;
> > >
> > >   do {
> > >     a = *--p;
> > >     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate the result of SAT_SUB
> > >   } while (--n);
> > > }
> > >
> > > It will have the gimple below after the ifcvt pass; it cannot hit any
> > > pattern of SAT_SUB and thus cannot vectorize to SAT_SUB.
> > >
> > > _2 = a_11 - b_12(D);
> > > iftmp.0_13 = (short unsigned int) _2;
> > > _18 = a_11 >= b_12(D);
> > > iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > >
> > > This patch would like to do some reconcile for above pattern to match
> > > the SAT_SUB pattern.  Then the underlying vect pass is able to vectorize
> > > the SAT_SUB.
> >
> > Hmm.  I was thinking of allowing
> >
> > /* Unsigned saturation sub, case 2 (branch with ge):
> >    SAT_U_SUB = X >= Y ? X - Y : 0.  */
> > (match (unsigned_integer_sat_sub @0 @1)
> >  (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> >  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> >       && types_match (type, @0, @1))))
> >
> > to match this by changing it to
> >
> > /* Unsigned saturation sub, case 2 (branch with ge):
> >    SAT_U_SUB = X >= Y ? X - Y : 0.  */
> > (match (unsigned_integer_sat_sub @0 @1)
> >  (cond^ (ge @0 @1) (convert? (minus @0 @1)) integer_zerop)
> >  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> >       && types_match (type, @0, @1))))
> >
> > and when using the gimple_match_* function make sure to consider
> > that the .SAT_SUB (@0, @1) is converted to the type of the SSA name
> > we matched?
> >
> > Richard.
> >
> > > _2 = a_11 - b_12(D);
> > > _18 = a_11 >= b_12(D);
> > > _pattmp = _18 ? _2 : 0; // .SAT_SUB pattern
> > > iftmp.0_13 = (short unsigned int) _pattmp;
> > > iftmp.0_5 = iftmp.0_13;
> > >
> > > The below tests are running for this patch.
> > > 1. The rv64gcv fully regression tests.
> > > 2. The rv64gcv build with glibc.
> > > 3. The x86 bootstrap tests.
> > > 4. The x86 fully regression tests.
> > >
> > > gcc/ChangeLog:
> > >
> > >         * match.pd: Add new match for truncated unsigned sat_sub.
> > >         * tree-if-conv.cc (gimple_truncated_unsigned_integer_sat_sub):
> > >         New external decl from match.pd.
> > >         (tree_if_cond_reconcile_unsigned_integer_sat_sub): New func impl
> > >         to reconcile the truncated sat_sub pattern.
> > >         (tree_if_cond_reconcile): New func impl to reconcile.
> > >         (pass_if_conversion::execute): Try to reconcile after ifcvt.
> > >
> > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > ---
> > >  gcc/match.pd        |  9 +++++
> > >  gcc/tree-if-conv.cc | 83 +++++++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 92 insertions(+)
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index 3d0689c9312..9617a5f9d5e 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -3210,6 +3210,15 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > >        && types_match (type, @0, @1))))
> > >
> > > +/* Unsigned saturation sub and then truncated, aka:
> > > +   Truncated = X >= Y ? (Other Type) (X - Y) : 0.
> > > + */
> > > +(match (truncated_unsigned_integer_sat_sub @0 @1)
> > > + (cond (ge @0 @1) (convert (minus @0 @1)) integer_zerop)
> > > + (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > > +      && types_match (@0, @1)
> > > +      && tree_int_cst_lt (TYPE_SIZE (type), TYPE_SIZE (TREE_TYPE (@0))))))
> > > +
> > >  /* x >  y  &&  x != XXX_MIN  -->  x > y
> > >     x >  y  &&  x == XXX_MIN  -->  false . */
> > >  (for eqne (eq ne)
> > > diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> > > index 57992b6deca..535743130f2 100644
> > > --- a/gcc/tree-if-conv.cc
> > > +++ b/gcc/tree-if-conv.cc
> > > @@ -3738,6 +3738,87 @@ bitfields_to_lower_p (class loop *loop,
> > >    return !reads_to_lower.is_empty () || !writes_to_lower.is_empty ();
> > >  }
> > >
> > > +extern bool gimple_truncated_unsigned_integer_sat_sub (tree, tree*,
> > > +                                                      tree (*)(tree));
> > > +
> > > +/*
> > > + * Try to reconcile the stmt pattern as below to match the SAT_SUB
> > > + * in vectorization, if and only if the related internal_fn has
> > > + * been implemented already.
> > > + *
> > > + * The reconcile will insert one new stmt named 'a' in below example,
> > > + * replace the stmt '4' by new added stmt 'b' as well.  Then the stmt
> > > + * pattern is able to hit the SAT_SUB pattern in the underlying pass.
> > > + *
> > > + * 1. _2 = a_11 - b_12(D);
> > > + * 2. iftmp.0_13 = (short unsigned int) _2;
> > > + * 3. _18 = a_11 >= b_12(D);
> > > + * 4. iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > > + * ==>
> > > + * 1. _2 = a_11 - b_12(D);
> > > + * 3. _18 = a_11 >= b_12(D);
> > > + * a. pattmp = _18 ? _2 : 0;                     // New insertion
> > > + * 2. iftmp.0_13 = (short unsigned int) _pattmp; // Move before
> > > + * b. iftmp.0_5 = iftmp.0_13;
> > > + *    == Replace ==> 4. iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > > + */
> > > +static void
> > > +tree_if_cond_reconcile_unsigned_integer_sat_sub (gimple_stmt_iterator *gsi,
> > > +                                                gassign *stmt)
> > > +{
> > > +  tree ops[2];
> > > +  tree lhs = gimple_assign_lhs (stmt);
> > > +  bool supported_p = direct_internal_fn_supported_p (IFN_SAT_SUB,
> > > +                                                    TREE_TYPE (lhs),
> > > +                                                    OPTIMIZE_FOR_BOTH);
> > > +
> > > +  if (supported_p && gimple_truncated_unsigned_integer_sat_sub (lhs, ops, NULL))
> > > +    {
> > > +      tree cond = gimple_assign_rhs1 (stmt); // aka _18
> > > +      tree truncated = gimple_assign_rhs2 (stmt); // aka iftmp.0_13
> > > +      gimple *stmt_2 = SSA_NAME_DEF_STMT (truncated);
> > > +      tree minus = gimple_assign_rhs1 (stmt_2); // aka _2
> > > +      tree raw_type = TREE_TYPE (minus);
> > > +      tree zero = build_zero_cst (raw_type);
> > > +      tree tmp = make_temp_ssa_name (raw_type, NULL, "sat_sub_tmp");
> > > +
> > > +      /* For stmt 'a' in above example  */
> > > +      gimple *stmt_a = gimple_build_assign (tmp, COND_EXPR, cond, minus, zero);
> > > +      gsi_insert_before (gsi, stmt_a, GSI_SAME_STMT);
> > > +      update_stmt (stmt_a);
> > > +
> > > +      /* For stmt '2' in above example  */
> > > +      gimple_stmt_iterator stmt_2_gsi = gsi_for_stmt (stmt_2);
> > > +      gsi_move_before (&stmt_2_gsi, gsi, GSI_SAME_STMT);
> > > +      gimple_assign_set_rhs1 (stmt_2, tmp);
> > > +      update_stmt (stmt_2);
> > > +
> > > +      /* For stmt 'b' in above example  */
> > > +      gimple *stmt_b = gimple_build_assign (lhs, NOP_EXPR, truncated);
> > > +      gsi_replace (gsi, stmt_b, /* update_eh_info */ true);
> > > +      update_stmt (stmt_b);
> > > +    }
> > > +}
> > > +
> > > +static void
> > > +tree_if_cond_reconcile (function *fun)
> > > +{
> > > +  basic_block bb;
> > > +  FOR_EACH_BB_FN (bb, fun)
> > > +    {
> > > +      gimple_stmt_iterator gsi;
> > > +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> > > +       {
> > > +         gimple *stmt = gsi_stmt (gsi);
> > > +
> > > +         if (is_gimple_assign (stmt))
> > > +           {
> > > +             gassign *assign = dyn_cast <gassign *> (stmt);
> > > +             tree_if_cond_reconcile_unsigned_integer_sat_sub (&gsi, assign);
> > > +           }
> > > +       }
> > > +    }
> > > +}
> > >
> > >  /* If-convert LOOP when it is legal.  For the moment this pass has no
> > >     profitability analysis.  Returns non-zero todo flags when something
> > > @@ -4063,6 +4144,8 @@ pass_if_conversion::execute (function *fun)
> > >         }
> > >      }
> > >
> > > +  tree_if_cond_reconcile (fun);
> > > +
> > >    return 0;
> > >  }
> > >
> > > --
> > > 2.34.1
> > >

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-21  3:52 [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB pan2.li
  2024-06-21  7:00 ` Richard Biener
@ 2024-06-24 13:55 ` pan2.li
  2024-06-24 19:59   ` Tamar Christina
                     ` (2 more replies)
  2024-06-27  1:31 ` [PATCH v3] " pan2.li
  2 siblings, 3 replies; 27+ messages in thread
From: pan2.li @ 2024-06-24 13:55 UTC (permalink / raw)
  To: gcc-patches
  Cc: juzhe.zhong, kito.cheng, richard.guenther, jeffreyalaw, pinskia, Pan Li

From: Pan Li <pan2.li@intel.com>

The zip benchmark of coremark-pro have one SAT_SUB like pattern but
truncated as below:

void test (uint16_t *x, unsigned b, unsigned n)
{
  unsigned a = 0;
  register uint16_t *p = x;

  do {
    a = *--p;
    *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
  } while (--n);
}

It will have the gimple below before the vect pass; it cannot hit any
pattern of SAT_SUB and thus cannot vectorize to SAT_SUB.

_2 = a_11 - b_12(D);
iftmp.0_13 = (short unsigned int) _2;
_18 = a_11 >= b_12(D);
iftmp.0_5 = _18 ? iftmp.0_13 : 0;

This patch would like to improve the pattern match to recog above
as truncate after .SAT_SUB pattern.  Then we will have the pattern
similar to below,  as well as eliminate the first 3 dead stmt.

_2 = a_11 - b_12(D);
iftmp.0_13 = (short unsigned int) _2;
_18 = a_11 >= b_12(D);
iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));

The below tests are passed for this patch.
1. The rv64gcv fully regression tests.
2. The rv64gcv build with glibc.
3. The x86 bootstrap tests.
4. The x86 fully regression tests.

gcc/ChangeLog:

	* match.pd: Add convert description for minus and capture.
	* tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
	new logic to handle the case where in_type is incompatible with out_type,  as
	well as rename from.
	(vect_recog_build_binary_gimple_stmt): Rename to.
	(vect_recog_sat_add_pattern): Leverage above renamed func.
	(vect_recog_sat_sub_pattern): Ditto.

Signed-off-by: Pan Li <pan2.li@intel.com>
---
 gcc/match.pd              |  4 +--
 gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
 2 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 3d0689c9312..4a4b0b2e72f 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 /* Unsigned saturation sub, case 2 (branch with ge):
    SAT_U_SUB = X >= Y ? X - Y : 0.  */
 (match (unsigned_integer_sat_sub @0 @1)
- (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
+ (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop)
  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
-      && types_match (type, @0, @1))))
+      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
 
 /* Unsigned saturation sub, case 3 (branchless with gt):
    SAT_U_SUB = (X - Y) * (X > Y).  */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index cef901808eb..3d887d36050 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
 extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
 extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
 
-static gcall *
-vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
+static gimple *
+vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
 				     internal_fn fn, tree *type_out,
-				     tree op_0, tree op_1)
+				     tree lhs, tree op_0, tree op_1)
 {
   tree itype = TREE_TYPE (op_0);
-  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
+  tree otype = TREE_TYPE (lhs);
+  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
+  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
 
-  if (vtype != NULL_TREE
-    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
+  if (v_itype != NULL_TREE && v_otype != NULL_TREE
+    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
     {
       gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
+      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
 
-      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
+      gimple_call_set_lhs (call, in_ssa);
       gimple_call_set_nothrow (call, /* nothrow_p */ false);
-      gimple_set_location (call, gimple_location (stmt));
+      gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info)));
+
+      *type_out = v_otype;
 
-      *type_out = vtype;
+      if (types_compatible_p (itype, otype))
+	return call;
+      else
+	{
+	  append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
+	  tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
 
-      return call;
+	  return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa);
+	}
     }
 
   return NULL;
@@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
 
   if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
     {
-      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
-							 IFN_SAT_ADD, type_out,
-							 ops[0], ops[1]);
-      if (call)
+      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
+							  IFN_SAT_ADD, type_out,
+							  lhs, ops[0], ops[1]);
+      if (stmt)
 	{
 	  vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
-	  return call;
+	  return stmt;
 	}
     }
 
@@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
 
   if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
     {
-      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
-							 IFN_SAT_SUB, type_out,
-							 ops[0], ops[1]);
-      if (call)
+      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
+							  IFN_SAT_SUB, type_out,
+							  lhs, ops[0], ops[1]);
+      if (stmt)
 	{
 	  vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
-	  return call;
+	  return stmt;
 	}
     }
 
-- 
2.34.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-24 13:55 ` [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip pan2.li
@ 2024-06-24 19:59   ` Tamar Christina
  2024-06-25  2:25     ` Li, Pan2
  2024-06-26 13:56   ` Richard Biener
  2024-06-27  6:48   ` Uros Bizjak
  2 siblings, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2024-06-24 19:59 UTC (permalink / raw)
  To: pan2.li, gcc-patches
  Cc: juzhe.zhong, kito.cheng, richard.guenther, jeffreyalaw, pinskia

Hi,

> -----Original Message-----
> From: pan2.li@intel.com <pan2.li@intel.com>
> Sent: Monday, June 24, 2024 2:55 PM
> To: gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> jeffreyalaw@gmail.com; pinskia@gmail.com; Pan Li <pan2.li@intel.com>
> Subject: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> 
> From: Pan Li <pan2.li@intel.com>
> 
> The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> truncated as below:
> 
> void test (uint16_t *x, unsigned b, unsigned n)
> {
>   unsigned a = 0;
>   register uint16_t *p = x;
> 
>   do {
>     a = *--p;
>     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
>   } while (--n);
> }
> 
> It will have gimple before vect pass,  it cannot hit any pattern of
> SAT_SUB and then cannot vectorize to SAT_SUB.
> 
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> 
> This patch would like to improve the pattern match to recog above
> as truncate after .SAT_SUB pattern.  Then we will have the pattern
> similar to below,  as well as eliminate the first 3 dead stmt.
> 
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> 

I guess this is because one branch of the  cond is a constant so the
convert is folded in.  I was wondering though,  can't we just push
in the truncate in this case?

i.e. in this case we know both types are unsigned and the difference
positive and max value is the max value of the truncate type.

It seems like folding as a general rule

  _1 = *p_10;
  a_11 = (unsigned int) _1;
  _2 = a_11 - b_12(D);
  iftmp.0_13 = (short unsigned int) _2;
  _18 = a_11 >= b_12(D);
  iftmp.0_5 = _18 ? iftmp.0_13 : 0;
  *p_10 = iftmp.0_5;

Into 

  _1 = *p_10;
  a_11 = (unsigned int) _1;
  _2 = ((short unsigned int) a_11) - ((short unsigned int) b_12(D));
  iftmp.0_13 = _2;
  _18 = a_11 >= b_12(D);
  iftmp.0_5 = _18 ? iftmp.0_13 : 0;
  *p_10 = iftmp.0_5;

Is valid (though might have missed something).  This would negate the need for this change to the vectorizer and saturation detection
but also should generate better vector code. This is what we do in the general case https://godbolt.org/z/dfoj6fWdv
I think here we're just not seeing through the cond.

Typically lots of architectures have cheap truncation operations, so truncating before saturation means you do the cheap
operation first rather than doing the complex operation on the wider type.

That is,

_2 = a_11 - b_12(D);
iftmp.0_13 = (short unsigned int) _2;
_18 = a_11 >= b_12(D);
iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D));

is cheaper than

_2 = a_11 - b_12(D);
iftmp.0_13 = (short unsigned int) _2;
_18 = a_11 >= b_12(D);
iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));

after vectorization.   Normally the vectorizer will try to do this through over-widening detection as well,
but we haven't taught ranger about the ranges of these new IFNs (probably should at some point).

Cheers,
Tamar

> The below tests are passed for this patch.
> 1. The rv64gcv fully regression tests.
> 2. The rv64gcv build with glibc.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
> 
> gcc/ChangeLog:
> 
> 	* match.pd: Add convert description for minus and capture.
> 	* tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
> 	new logic to handle in_type is incompatibile with out_type,  as
> 	well as rename from.
> 	(vect_recog_build_binary_gimple_stmt): Rename to.
> 	(vect_recog_sat_add_pattern): Leverage above renamed func.
> 	(vect_recog_sat_sub_pattern): Ditto.
> 
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd              |  4 +--
>  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
>  2 files changed, 33 insertions(+), 22 deletions(-)
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 3d0689c9312..4a4b0b2e72f 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  /* Unsigned saturation sub, case 2 (branch with ge):
>     SAT_U_SUB = X >= Y ? X - Y : 0.  */
>  (match (unsigned_integer_sat_sub @0 @1)
> - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1)))
> integer_zerop)
>   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> -      && types_match (type, @0, @1))))
> +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
> 
>  /* Unsigned saturation sub, case 3 (branchless with gt):
>     SAT_U_SUB = (X - Y) * (X > Y).  */
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index cef901808eb..3d887d36050 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
>  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
>  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
> 
> -static gcall *
> -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> +static gimple *
> +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>  				     internal_fn fn, tree *type_out,
> -				     tree op_0, tree op_1)
> +				     tree lhs, tree op_0, tree op_1)
>  {
>    tree itype = TREE_TYPE (op_0);
> -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree otype = TREE_TYPE (lhs);
> +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
> 
> -  if (vtype != NULL_TREE
> -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
>      {
>        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
> 
> -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> +      gimple_call_set_lhs (call, in_ssa);
>        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> -      gimple_set_location (call, gimple_location (stmt));
> +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info)));
> +
> +      *type_out = v_otype;
> 
> -      *type_out = vtype;
> +      if (types_compatible_p (itype, otype))
> +	return call;
> +      else
> +	{
> +	  append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> +	  tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
> 
> -      return call;
> +	  return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa);
> +	}
>      }
> 
>    return NULL;
> @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo,
> stmt_vec_info stmt_vinfo,
> 
>    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -							 IFN_SAT_ADD, type_out,
> -							 ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +							  IFN_SAT_ADD, type_out,
> +							  lhs, ops[0], ops[1]);
> +      if (stmt)
>  	{
>  	  vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> -	  return call;
> +	  return stmt;
>  	}
>      }
> 
> @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo,
> stmt_vec_info stmt_vinfo,
> 
>    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -							 IFN_SAT_SUB, type_out,
> -							 ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +							  IFN_SAT_SUB, type_out,
> +							  lhs, ops[0], ops[1]);
> +      if (stmt)
>  	{
>  	  vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> -	  return call;
> +	  return stmt;
>  	}
>      }
> 
> --
> 2.34.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-24 19:59   ` Tamar Christina
@ 2024-06-25  2:25     ` Li, Pan2
  2024-06-25  4:00       ` Tamar Christina
  0 siblings, 1 reply; 27+ messages in thread
From: Li, Pan2 @ 2024-06-25  2:25 UTC (permalink / raw)
  To: Tamar Christina, gcc-patches
  Cc: juzhe.zhong, kito.cheng, richard.guenther, jeffreyalaw, pinskia

Thanks Tamar for the comments. It indeed benefits the vectorized code; for example in RISC-V, we may eliminate some vsetvl insns in the loop for the widening case here.

> iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D));
> is cheaper than
> iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));

I am not sure if it has any correctness problem for this transform, take uint16_t to uint8_t as example.

uint16_t a, b;
uint8_t result = (uint8_t)(a >= b ? a - b : 0);

Given a = 0x100; // 256
           b = 0xff;     // 255
For iftmp.0_5 = .SAT_SUB ((char unsigned) a, (char unsigned) b) = .SAT_SUB (0, 255) = 0
For iftmp.0_5 = (char unsigned).SAT_SUB (a, b) = (char unsigned).SAT_SUB (256, 255) = 1

Please help to correct me if any misunderstanding, thanks again for enlightening.

Pan

-----Original Message-----
From: Tamar Christina <Tamar.Christina@arm.com> 
Sent: Tuesday, June 25, 2024 4:00 AM
To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org
Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com
Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip

Hi,

> -----Original Message-----
> From: pan2.li@intel.com <pan2.li@intel.com>
> Sent: Monday, June 24, 2024 2:55 PM
> To: gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> jeffreyalaw@gmail.com; pinskia@gmail.com; Pan Li <pan2.li@intel.com>
> Subject: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> 
> From: Pan Li <pan2.li@intel.com>
> 
> The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> truncated as below:
> 
> void test (uint16_t *x, unsigned b, unsigned n)
> {
>   unsigned a = 0;
>   register uint16_t *p = x;
> 
>   do {
>     a = *--p;
>     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
>   } while (--n);
> }
> 
> It will have gimple before vect pass,  it cannot hit any pattern of
> SAT_SUB and then cannot vectorize to SAT_SUB.
> 
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> 
> This patch would like to improve the pattern match to recog above
> as truncate after .SAT_SUB pattern.  Then we will have the pattern
> similar to below,  as well as eliminate the first 3 dead stmt.
> 
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> 

I guess this is because one branch of the  cond is a constant so the
convert is folded in.  I was wondering though,  can't we just push
in the truncate in this case?

i.e. in this case we know both types are unsigned and the difference
positive and max value is the max value of the truncate type.

It seems like folding as a general rule

  _1 = *p_10;
  a_11 = (unsigned int) _1;
  _2 = a_11 - b_12(D);
  iftmp.0_13 = (short unsigned int) _2;
  _18 = a_11 >= b_12(D);
  iftmp.0_5 = _18 ? iftmp.0_13 : 0;
  *p_10 = iftmp.0_5;

Into 

  _1 = *p_10;
  a_11 = (unsigned int) _1;
  _2 = ((short unsigned int) a_11) - ((short unsigned int) b_12(D));
  iftmp.0_13 = _2;
  _18 = a_11 >= b_12(D);
  iftmp.0_5 = _18 ? iftmp.0_13 : 0;
  *p_10 = iftmp.0_5;

Is valid (though might have missed something).  This would negate the need for this change to the vectorizer and saturation detection
but also should generate better vector code. This is what we do in the general case https://godbolt.org/z/dfoj6fWdv
I think here we're just not seeing through the cond.

Typically lots of architectures have cheap truncation operations, so truncating before saturation means you do the cheap
operation first rather than doing the complex operation on the wider type.

That is,

_2 = a_11 - b_12(D);
iftmp.0_13 = (short unsigned int) _2;
_18 = a_11 >= b_12(D);
iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D));

is cheaper than

_2 = a_11 - b_12(D);
iftmp.0_13 = (short unsigned int) _2;
_18 = a_11 >= b_12(D);
iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));

after vectorization.   Normally the vectorizer will try to do this through over-widening detection as well,
but we haven't taught ranger about the ranges of these new IFNs (probably should at some point).

Cheers,
Tamar

> The below tests are passed for this patch.
> 1. The rv64gcv fully regression tests.
> 2. The rv64gcv build with glibc.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
> 
> gcc/ChangeLog:
> 
> 	* match.pd: Add convert description for minus and capture.
> 	* tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
> 	new logic to handle in_type is incompatible with out_type,  as
> 	well as rename from.
> 	(vect_recog_build_binary_gimple_stmt): Rename to.
> 	(vect_recog_sat_add_pattern): Leverage above renamed func.
> 	(vect_recog_sat_sub_pattern): Ditto.
> 
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd              |  4 +--
>  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
>  2 files changed, 33 insertions(+), 22 deletions(-)
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 3d0689c9312..4a4b0b2e72f 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  /* Unsigned saturation sub, case 2 (branch with ge):
>     SAT_U_SUB = X >= Y ? X - Y : 0.  */
>  (match (unsigned_integer_sat_sub @0 @1)
> - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1)))
> integer_zerop)
>   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> -      && types_match (type, @0, @1))))
> +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
> 
>  /* Unsigned saturation sub, case 3 (branchless with gt):
>     SAT_U_SUB = (X - Y) * (X > Y).  */
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index cef901808eb..3d887d36050 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
>  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
>  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
> 
> -static gcall *
> -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> +static gimple *
> +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>  				     internal_fn fn, tree *type_out,
> -				     tree op_0, tree op_1)
> +				     tree lhs, tree op_0, tree op_1)
>  {
>    tree itype = TREE_TYPE (op_0);
> -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree otype = TREE_TYPE (lhs);
> +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
> 
> -  if (vtype != NULL_TREE
> -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
>      {
>        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
> 
> -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> +      gimple_call_set_lhs (call, in_ssa);
>        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> -      gimple_set_location (call, gimple_location (stmt));
> +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info)));
> +
> +      *type_out = v_otype;
> 
> -      *type_out = vtype;
> +      if (types_compatible_p (itype, otype))
> +	return call;
> +      else
> +	{
> +	  append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> +	  tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
> 
> -      return call;
> +	  return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa);
> +	}
>      }
> 
>    return NULL;
> @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo,
> stmt_vec_info stmt_vinfo,
> 
>    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -							 IFN_SAT_ADD, type_out,
> -							 ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +							  IFN_SAT_ADD, type_out,
> +							  lhs, ops[0], ops[1]);
> +      if (stmt)
>  	{
>  	  vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> -	  return call;
> +	  return stmt;
>  	}
>      }
> 
> @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo,
> stmt_vec_info stmt_vinfo,
> 
>    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -							 IFN_SAT_SUB, type_out,
> -							 ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +							  IFN_SAT_SUB, type_out,
> +							  lhs, ops[0], ops[1]);
> +      if (stmt)
>  	{
>  	  vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> -	  return call;
> +	  return stmt;
>  	}
>      }
> 
> --
> 2.34.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-25  2:25     ` Li, Pan2
@ 2024-06-25  4:00       ` Tamar Christina
  2024-06-25  6:06         ` Li, Pan2
  0 siblings, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2024-06-25  4:00 UTC (permalink / raw)
  To: Li, Pan2, gcc-patches
  Cc: juzhe.zhong, kito.cheng, richard.guenther, jeffreyalaw, pinskia

> -----Original Message-----
> From: Li, Pan2 <pan2.li@intel.com>
> Sent: Tuesday, June 25, 2024 3:25 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> jeffreyalaw@gmail.com; pinskia@gmail.com
> Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> 
> Thanks Tamar for comments. It indeed benefits the vectorized code, for example in
> RISC-V, we may eliminate some vsetvel insn in loop for widen here.
> 
> > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D));
> > is cheaper than
> > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> 
> I am not sure if it has any correctness problem for this transform, take uint16_t to
> uint8_t as example.
> 
> uint16_t a, b;
> uint8_t result = (uint8_t)(a >= b ? a - b : 0);
> 
> Given a = 0x100; // 256
>            b = 0xff;     // 255
> For iftmp.0_5 = .SAT_SUB ((char unsigned) a, (char unsigned) b) = .SAT_SUB (0,
> 255) = 0
> For iftmp.0_5 = (char unsigned).SAT_SUB (a, b) = (char unsigned).SAT_SUB (256,
> 255) = 1
> 
> Please help to correct me if any misunderstanding, thanks again for enlightening.

Ah, no, you're right, those would end up wrong for saturation. Argh..  Sorry, should have
thought it through more.

Tamar.
> 
> Pan
> 
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Tuesday, June 25, 2024 4:00 AM
> To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> jeffreyalaw@gmail.com; pinskia@gmail.com
> Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> 
> Hi,
> 
> > -----Original Message-----
> > From: pan2.li@intel.com <pan2.li@intel.com>
> > Sent: Monday, June 24, 2024 2:55 PM
> > To: gcc-patches@gcc.gnu.org
> > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> > jeffreyalaw@gmail.com; pinskia@gmail.com; Pan Li <pan2.li@intel.com>
> > Subject: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> >
> > From: Pan Li <pan2.li@intel.com>
> >
> > The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> > truncated as below:
> >
> > void test (uint16_t *x, unsigned b, unsigned n)
> > {
> >   unsigned a = 0;
> >   register uint16_t *p = x;
> >
> >   do {
> >     a = *--p;
> >     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
> >   } while (--n);
> > }
> >
> > It will have gimple before vect pass,  it cannot hit any pattern of
> > SAT_SUB and then cannot vectorize to SAT_SUB.
> >
> > _2 = a_11 - b_12(D);
> > iftmp.0_13 = (short unsigned int) _2;
> > _18 = a_11 >= b_12(D);
> > iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> >
> > This patch would like to improve the pattern match to recog above
> > as truncate after .SAT_SUB pattern.  Then we will have the pattern
> > similar to below,  as well as eliminate the first 3 dead stmt.
> >
> > _2 = a_11 - b_12(D);
> > iftmp.0_13 = (short unsigned int) _2;
> > _18 = a_11 >= b_12(D);
> > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> >
> 
> I guess this is because one branch of the  cond is a constant so the
> convert is folded in.  I was wondering though,  can't we just push
> in the truncate in this case?
> 
> i.e. in this case we know both types are unsigned and the difference
> positive and max value is the max value of the truncate type.
> 
> It seems like folding as a general rule
> 
>   _1 = *p_10;
>   a_11 = (unsigned int) _1;
>   _2 = a_11 - b_12(D);
>   iftmp.0_13 = (short unsigned int) _2;
>   _18 = a_11 >= b_12(D);
>   iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>   *p_10 = iftmp.0_5;
> 
> Into
> 
>   _1 = *p_10;
>   a_11 = (unsigned int) _1;
>   _2 = ((short unsigned int) a_11) - ((short unsigned int) b_12(D));
>   iftmp.0_13 = _2;
>   _18 = a_11 >= b_12(D);
>   iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>   *p_10 = iftmp.0_5;
> 
> Is valid (though might have missed something).  This would negate the need for
> this change to the vectorizer and saturation detection
> but also should generate better vector code. This is what we do in the general case
> https://godbolt.org/z/dfoj6fWdv
> I think here we're just not seeing through the cond.
> 
> Typically lots of architectures have cheap truncation operations, so truncating
> before saturation means you do the cheap
> operation first rather than doing the complex operation on the wider type.
> 
> That is,
> 
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D));
> 
> is cheaper than
> 
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> 
> after vectorization.   Normally the vectorizer will try to do this through over-
> widening detection as well,
> but we haven't taught ranger about the ranges of these new IFNs (probably
> should at some point).
> 
> Cheers,
> Tamar
> 
> > The below tests are passed for this patch.
> > 1. The rv64gcv fully regression tests.
> > 2. The rv64gcv build with glibc.
> > 3. The x86 bootstrap tests.
> > 4. The x86 fully regression tests.
> >
> > gcc/ChangeLog:
> >
> > 	* match.pd: Add convert description for minus and capture.
> > 	* tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
> > 	new logic to handle in_type is incompatible with out_type,  as
> > 	well as rename from.
> > 	(vect_recog_build_binary_gimple_stmt): Rename to.
> > 	(vect_recog_sat_add_pattern): Leverage above renamed func.
> > 	(vect_recog_sat_sub_pattern): Ditto.
> >
> > Signed-off-by: Pan Li <pan2.li@intel.com>
> > ---
> >  gcc/match.pd              |  4 +--
> >  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
> >  2 files changed, 33 insertions(+), 22 deletions(-)
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 3d0689c9312..4a4b0b2e72f 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >  /* Unsigned saturation sub, case 2 (branch with ge):
> >     SAT_U_SUB = X >= Y ? X - Y : 0.  */
> >  (match (unsigned_integer_sat_sub @0 @1)
> > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1)))
> > integer_zerop)
> >   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > -      && types_match (type, @0, @1))))
> > +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
> >
> >  /* Unsigned saturation sub, case 3 (branchless with gt):
> >     SAT_U_SUB = (X - Y) * (X > Y).  */
> > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> > index cef901808eb..3d887d36050 100644
> > --- a/gcc/tree-vect-patterns.cc
> > +++ b/gcc/tree-vect-patterns.cc
> > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
> >  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
> >  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
> >
> > -static gcall *
> > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> > +static gimple *
> > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info
> stmt_info,
> >  				     internal_fn fn, tree *type_out,
> > -				     tree op_0, tree op_1)
> > +				     tree lhs, tree op_0, tree op_1)
> >  {
> >    tree itype = TREE_TYPE (op_0);
> > -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> > +  tree otype = TREE_TYPE (lhs);
> > +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> > +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
> >
> > -  if (vtype != NULL_TREE
> > -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> > +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> > +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
> >      {
> >        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> > +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
> >
> > -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> > +      gimple_call_set_lhs (call, in_ssa);
> >        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> > -      gimple_set_location (call, gimple_location (stmt));
> > +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT
> (stmt_info)));
> > +
> > +      *type_out = v_otype;
> >
> > -      *type_out = vtype;
> > +      if (types_compatible_p (itype, otype))
> > +	return call;
> > +      else
> > +	{
> > +	  append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> > +	  tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
> >
> > -      return call;
> > +	  return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa);
> > +	}
> >      }
> >
> >    return NULL;
> > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo,
> > stmt_vec_info stmt_vinfo,
> >
> >    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
> >      {
> > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > -							 IFN_SAT_ADD, type_out,
> > -							 ops[0], ops[1]);
> > -      if (call)
> > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > +							  IFN_SAT_ADD, type_out,
> > +							  lhs, ops[0], ops[1]);
> > +      if (stmt)
> >  	{
> >  	  vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> > -	  return call;
> > +	  return stmt;
> >  	}
> >      }
> >
> > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo,
> > stmt_vec_info stmt_vinfo,
> >
> >    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
> >      {
> > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > -							 IFN_SAT_SUB, type_out,
> > -							 ops[0], ops[1]);
> > -      if (call)
> > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > +							  IFN_SAT_SUB, type_out,
> > +							  lhs, ops[0], ops[1]);
> > +      if (stmt)
> >  	{
> >  	  vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> > -	  return call;
> > +	  return stmt;
> >  	}
> >      }
> >
> > --
> > 2.34.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-25  4:00       ` Tamar Christina
@ 2024-06-25  6:06         ` Li, Pan2
  2024-06-25  6:11           ` Tamar Christina
  0 siblings, 1 reply; 27+ messages in thread
From: Li, Pan2 @ 2024-06-25  6:06 UTC (permalink / raw)
  To: Tamar Christina, gcc-patches
  Cc: juzhe.zhong, kito.cheng, richard.guenther, jeffreyalaw, pinskia

> Ah, no you're right, those would end up wrong for saturation. Arg..  Sorry should have
> though it through more.

Never mind, but you have enlightened me about an even further optimization with some restrictions. I revisited the pattern; for example, see below.

uint16_t a, b;
uint8_t result = (uint8_t)(a >= b ? a - b : 0);

=> result = (char unsigned).SAT_SUB (a, b)

If a has a def like below
uint8_t other = 0x1f;
a = (uint8_t)other

then we can safely convert result = (char unsigned).SAT_SUB (a, b) to
result = .SAT_SUB ((char unsigned)a, (char unsigned)b)

Then we may have better vectorized code if a is limited to char unsigned. Of course we can do that based on this patch.

Pan

-----Original Message-----
From: Tamar Christina <Tamar.Christina@arm.com> 
Sent: Tuesday, June 25, 2024 12:01 PM
To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org
Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com
Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip

> -----Original Message-----
> From: Li, Pan2 <pan2.li@intel.com>
> Sent: Tuesday, June 25, 2024 3:25 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> jeffreyalaw@gmail.com; pinskia@gmail.com
> Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> 
> Thanks Tamar for comments. It indeed benefits the vectorized code, for example in
> RISC-V, we may eliminate some vsetvel insn in loop for widen here.
> 
> > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D));
> > is cheaper than
> > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> 
> I am not sure if it has any correctness problem for this transform, take uint16_t to
> uint8_t as example.
> 
> uint16_t a, b;
> uint8_t result = (uint8_t)(a >= b ? a - b : 0);
> 
> Given a = 0x100; // 256
>            b = 0xff;     // 255
> For iftmp.0_5 = .SAT_SUB ((char unsigned) a, (char unsigned) b) = .SAT_SUB (0,
> 255) = 0
> For iftmp.0_5 = (char unsigned).SAT_SUB (a, b) = (char unsigned).SAT_SUB (256,
> 255) = 1
> 
> Please help to correct me if any misunderstanding, thanks again for enlightening.

Ah, no, you're right, those would end up wrong for saturation. Argh..  Sorry, should have
thought it through more.

Tamar.
> 
> Pan
> 
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Tuesday, June 25, 2024 4:00 AM
> To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> jeffreyalaw@gmail.com; pinskia@gmail.com
> Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> 
> Hi,
> 
> > -----Original Message-----
> > From: pan2.li@intel.com <pan2.li@intel.com>
> > Sent: Monday, June 24, 2024 2:55 PM
> > To: gcc-patches@gcc.gnu.org
> > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> > jeffreyalaw@gmail.com; pinskia@gmail.com; Pan Li <pan2.li@intel.com>
> > Subject: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> >
> > From: Pan Li <pan2.li@intel.com>
> >
> > The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> > truncated as below:
> >
> > void test (uint16_t *x, unsigned b, unsigned n)
> > {
> >   unsigned a = 0;
> >   register uint16_t *p = x;
> >
> >   do {
> >     a = *--p;
> >     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
> >   } while (--n);
> > }
> >
> > It will have gimple before vect pass,  it cannot hit any pattern of
> > SAT_SUB and then cannot vectorize to SAT_SUB.
> >
> > _2 = a_11 - b_12(D);
> > iftmp.0_13 = (short unsigned int) _2;
> > _18 = a_11 >= b_12(D);
> > iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> >
> > This patch would like to improve the pattern match to recog above
> > as truncate after .SAT_SUB pattern.  Then we will have the pattern
> > similar to below,  as well as eliminate the first 3 dead stmt.
> >
> > _2 = a_11 - b_12(D);
> > iftmp.0_13 = (short unsigned int) _2;
> > _18 = a_11 >= b_12(D);
> > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> >
> 
> I guess this is because one branch of the  cond is a constant so the
> convert is folded in.  I was wondering though,  can't we just push
> in the truncate in this case?
> 
> i.e. in this case we know both types are unsigned and the difference
> positive and max value is the max value of the truncate type.
> 
> It seems like folding as a general rule
> 
>   _1 = *p_10;
>   a_11 = (unsigned int) _1;
>   _2 = a_11 - b_12(D);
>   iftmp.0_13 = (short unsigned int) _2;
>   _18 = a_11 >= b_12(D);
>   iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>   *p_10 = iftmp.0_5;
> 
> Into
> 
>   _1 = *p_10;
>   a_11 = (unsigned int) _1;
>   _2 = ((short unsigned int) a_11) - ((short unsigned int) b_12(D));
>   iftmp.0_13 = _2;
>   _18 = a_11 >= b_12(D);
>   iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>   *p_10 = iftmp.0_5;
> 
> Is valid (though might have missed something).  This would negate the need for
> this change to the vectorizer and saturation detection
> but also should generate better vector code. This is what we do in the general case
> https://godbolt.org/z/dfoj6fWdv
> I think here we're just not seeing through the cond.
> 
> Typically lots of architectures have cheap truncation operations, so truncating
> before saturation means you do the cheap
> operation first rather than doing the complex operation on the wider type.
> 
> That is,
> 
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D));
> 
> is cheaper than
> 
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> 
> after vectorization.   Normally the vectorizer will try to do this through over-
> widening detection as well,
> but we haven't taught ranger about the ranges of these new IFNs (probably
> should at some point).
> 
> Cheers,
> Tamar
> 
> > The below tests are passed for this patch.
> > 1. The rv64gcv fully regression tests.
> > 2. The rv64gcv build with glibc.
> > 3. The x86 bootstrap tests.
> > 4. The x86 fully regression tests.
> >
> > gcc/ChangeLog:
> >
> > 	* match.pd: Add convert description for minus and capture.
> > 	* tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
> > 	new logic to handle in_type is incompatible with out_type,  as
> > 	well as rename from.
> > 	(vect_recog_build_binary_gimple_stmt): Rename to.
> > 	(vect_recog_sat_add_pattern): Leverage above renamed func.
> > 	(vect_recog_sat_sub_pattern): Ditto.
> >
> > Signed-off-by: Pan Li <pan2.li@intel.com>
> > ---
> >  gcc/match.pd              |  4 +--
> >  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
> >  2 files changed, 33 insertions(+), 22 deletions(-)
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 3d0689c9312..4a4b0b2e72f 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >  /* Unsigned saturation sub, case 2 (branch with ge):
> >     SAT_U_SUB = X >= Y ? X - Y : 0.  */
> >  (match (unsigned_integer_sat_sub @0 @1)
> > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1)))
> > integer_zerop)
> >   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > -      && types_match (type, @0, @1))))
> > +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
> >
> >  /* Unsigned saturation sub, case 3 (branchless with gt):
> >     SAT_U_SUB = (X - Y) * (X > Y).  */
> > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> > index cef901808eb..3d887d36050 100644
> > --- a/gcc/tree-vect-patterns.cc
> > +++ b/gcc/tree-vect-patterns.cc
> > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
> >  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
> >  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
> >
> > -static gcall *
> > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> > +static gimple *
> > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info
> stmt_info,
> >  				     internal_fn fn, tree *type_out,
> > -				     tree op_0, tree op_1)
> > +				     tree lhs, tree op_0, tree op_1)
> >  {
> >    tree itype = TREE_TYPE (op_0);
> > -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> > +  tree otype = TREE_TYPE (lhs);
> > +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> > +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
> >
> > -  if (vtype != NULL_TREE
> > -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> > +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> > +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
> >      {
> >        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> > +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
> >
> > -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> > +      gimple_call_set_lhs (call, in_ssa);
> >        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> > -      gimple_set_location (call, gimple_location (stmt));
> > +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT
> (stmt_info)));
> > +
> > +      *type_out = v_otype;
> >
> > -      *type_out = vtype;
> > +      if (types_compatible_p (itype, otype))
> > +	return call;
> > +      else
> > +	{
> > +	  append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> > +	  tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
> >
> > -      return call;
> > +	  return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa);
> > +	}
> >      }
> >
> >    return NULL;
> > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo,
> > stmt_vec_info stmt_vinfo,
> >
> >    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
> >      {
> > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > -							 IFN_SAT_ADD, type_out,
> > -							 ops[0], ops[1]);
> > -      if (call)
> > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > +							  IFN_SAT_ADD, type_out,
> > +							  lhs, ops[0], ops[1]);
> > +      if (stmt)
> >  	{
> >  	  vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> > -	  return call;
> > +	  return stmt;
> >  	}
> >      }
> >
> > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo,
> > stmt_vec_info stmt_vinfo,
> >
> >    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
> >      {
> > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > -							 IFN_SAT_SUB, type_out,
> > -							 ops[0], ops[1]);
> > -      if (call)
> > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > +							  IFN_SAT_SUB, type_out,
> > +							  lhs, ops[0], ops[1]);
> > +      if (stmt)
> >  	{
> >  	  vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> > -	  return call;
> > +	  return stmt;
> >  	}
> >      }
> >
> > --
> > 2.34.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-25  6:06         ` Li, Pan2
@ 2024-06-25  6:11           ` Tamar Christina
  2024-06-25  6:25             ` Li, Pan2
  0 siblings, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2024-06-25  6:11 UTC (permalink / raw)
  To: Li, Pan2, gcc-patches
  Cc: juzhe.zhong, kito.cheng, richard.guenther, jeffreyalaw, pinskia

> -----Original Message-----
> From: Li, Pan2 <pan2.li@intel.com>
> Sent: Tuesday, June 25, 2024 7:06 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> jeffreyalaw@gmail.com; pinskia@gmail.com
> Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> 
> > Ah, no you're right, those would end up wrong for saturation. Arg..  Sorry should
> have
> > though it through more.
> 
> Never mind, but you enlighten me for even more optimize with some restrictions. I
> revisited the pattern, for example as below.
> 
> uint16_t a, b;
> uint8_t result = (uint8_t)(a >= b ? a - b : 0);
> 
> => result = (char unsigned).SAT_SUB (a, b)
> 
> If a has a def like below
> uint8_t other = 0x1f;
> a = (uint8_t)other

You can in principle do this by querying range information,
e.g.

	  gimple_ranger ranger;
	  int_range_max r;
	  if (ranger.range_of_expr (r, oprnd0, stmt) && !r.undefined_p ())
	    {
...

We do this for instance in vect_recog_divmod_pattern.

Tamar

> 
> then we can safely convert result = (char unsigned).SAT_SUB (a, b) to
> result = .SAT_SUB ((char unsigned)a, (char unsigned).b)
> 
> Then we may have better vectorized code if a is limited to char unsigned. Of course
> we can do that based on this patch.
> 
> Pan
> 
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Tuesday, June 25, 2024 12:01 PM
> To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> jeffreyalaw@gmail.com; pinskia@gmail.com
> Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> 
> > -----Original Message-----
> > From: Li, Pan2 <pan2.li@intel.com>
> > Sent: Tuesday, June 25, 2024 3:25 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> > jeffreyalaw@gmail.com; pinskia@gmail.com
> > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> >
> > Thanks Tamar for comments. It indeed benefits the vectorized code, for example
> in
> > RISC-V, we may eliminate some vsetvel insn in loop for widen here.
> >
> > > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int)
> b_12(D));
> > > is cheaper than
> > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> >
> > I am not sure if it has any correctness problem for this transform, take uint16_t
> to
> > uint8_t as example.
> >
> > uint16_t a, b;
> > uint8_t result = (uint8_t)(a >= b ? a - b : 0);
> >
> > Given a = 0x100; // 256
> >            b = 0xff;     // 255
> > For iftmp.0_5 = .SAT_SUB ((char unsigned) a, (char unsigned) b) = .SAT_SUB (0,
> > 255) = 0
> > For iftmp.0_5 = (char unsigned).SAT_SUB (a, b) = (char unsigned).SAT_SUB (256,
> > 255) = 1
> >
> > Please help to correct me if any misunderstanding, thanks again for enlightening.
> 
> Ah, no you're right, those would end up wrong for saturation. Arg..  Sorry should
> have
> though it through more.
> 
> Tamar.
> >
> > Pan
> >
> > -----Original Message-----
> > From: Tamar Christina <Tamar.Christina@arm.com>
> > Sent: Tuesday, June 25, 2024 4:00 AM
> > To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org
> > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> > jeffreyalaw@gmail.com; pinskia@gmail.com
> > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> >
> > Hi,
> >
> > > -----Original Message-----
> > > From: pan2.li@intel.com <pan2.li@intel.com>
> > > Sent: Monday, June 24, 2024 2:55 PM
> > > To: gcc-patches@gcc.gnu.org
> > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> richard.guenther@gmail.com;
> > > jeffreyalaw@gmail.com; pinskia@gmail.com; Pan Li <pan2.li@intel.com>
> > > Subject: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> > >
> > > From: Pan Li <pan2.li@intel.com>
> > >
> > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> > > truncated as below:
> > >
> > > void test (uint16_t *x, unsigned b, unsigned n)
> > > {
> > >   unsigned a = 0;
> > >   register uint16_t *p = x;
> > >
> > >   do {
> > >     a = *--p;
> > >     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
> > >   } while (--n);
> > > }
> > >
> > > It will have gimple before vect pass,  it cannot hit any pattern of
> > > SAT_SUB and then cannot vectorize to SAT_SUB.
> > >
> > > _2 = a_11 - b_12(D);
> > > iftmp.0_13 = (short unsigned int) _2;
> > > _18 = a_11 >= b_12(D);
> > > iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > >
> > > This patch would like to improve the pattern match to recog above
> > > as truncate after .SAT_SUB pattern.  Then we will have the pattern
> > > similar to below,  as well as eliminate the first 3 dead stmt.
> > >
> > > _2 = a_11 - b_12(D);
> > > iftmp.0_13 = (short unsigned int) _2;
> > > _18 = a_11 >= b_12(D);
> > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> > >
> >
> > I guess this is because one branch of the  cond is a constant so the
> > convert is folded in.  I was wondering though,  can't we just push
> > in the truncate in this case?
> >
> > i.e. in this case we know both types are unsigned and the difference
> > positive and max value is the max value of the truncate type.
> >
> > It seems like folding as a general rule
> >
> >   _1 = *p_10;
> >   a_11 = (unsigned int) _1;
> >   _2 = a_11 - b_12(D);
> >   iftmp.0_13 = (short unsigned int) _2;
> >   _18 = a_11 >= b_12(D);
> >   iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> >   *p_10 = iftmp.0_5;
> >
> > Into
> >
> >   _1 = *p_10;
> >   a_11 = (unsigned int) _1;
> >   _2 = ((short unsigned int) a_11) - ((short unsigned int) b_12(D));
> >   iftmp.0_13 = _2;
> >   _18 = a_11 >= b_12(D);
> >   iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> >   *p_10 = iftmp.0_5;
> >
> > Is valid (though might have missed something).  This would negate the need for
> > this change to the vectorizer and saturation detection
> > but also should generate better vector code. This is what we do in the general
> case
> > https://godbolt.org/z/dfoj6fWdv
> > I think here we're just not seeing through the cond.
> >
> > Typically lots of architectures have cheap truncation operations, so truncating
> > before saturation means you do the cheap
> > operation first rather than doing the complex operation on the wider type.
> >
> > That is,
> >
> > _2 = a_11 - b_12(D);
> > iftmp.0_13 = (short unsigned int) _2;
> > _18 = a_11 >= b_12(D);
> > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D));
> >
> > is cheaper than
> >
> > _2 = a_11 - b_12(D);
> > iftmp.0_13 = (short unsigned int) _2;
> > _18 = a_11 >= b_12(D);
> > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> >
> > after vectorization.   Normally the vectorizer will try to do this through over-
> > widening detection as well,
> > but we haven't taught ranger about the ranges of these new IFNs (probably
> > should at some point).
> >
> > Cheers,
> > Tamar
> >
> > > The below tests are passed for this patch.
> > > 1. The rv64gcv fully regression tests.
> > > 2. The rv64gcv build with glibc.
> > > 3. The x86 bootstrap tests.
> > > 4. The x86 fully regression tests.
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* match.pd: Add convert description for minus and capture.
> > > 	* tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
> > > 	new logic to handle in_type is incompatibile with out_type,  as
> > > 	well as rename from.
> > > 	(vect_recog_build_binary_gimple_stmt): Rename to.
> > > 	(vect_recog_sat_add_pattern): Leverage above renamed func.
> > > 	(vect_recog_sat_sub_pattern): Ditto.
> > >
> > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > ---
> > >  gcc/match.pd              |  4 +--
> > >  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
> > >  2 files changed, 33 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index 3d0689c9312..4a4b0b2e72f 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >  /* Unsigned saturation sub, case 2 (branch with ge):
> > >     SAT_U_SUB = X >= Y ? X - Y : 0.  */
> > >  (match (unsigned_integer_sat_sub @0 @1)
> > > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> > > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1)))
> > > integer_zerop)
> > >   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > > -      && types_match (type, @0, @1))))
> > > +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
> > >
> > >  /* Unsigned saturation sub, case 3 (branchless with gt):
> > >     SAT_U_SUB = (X - Y) * (X > Y).  */
> > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> > > index cef901808eb..3d887d36050 100644
> > > --- a/gcc/tree-vect-patterns.cc
> > > +++ b/gcc/tree-vect-patterns.cc
> > > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
> > >  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
> > >  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
> > >
> > > -static gcall *
> > > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> > > +static gimple *
> > > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info
> > stmt_info,
> > >  				     internal_fn fn, tree *type_out,
> > > -				     tree op_0, tree op_1)
> > > +				     tree lhs, tree op_0, tree op_1)
> > >  {
> > >    tree itype = TREE_TYPE (op_0);
> > > -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> > > +  tree otype = TREE_TYPE (lhs);
> > > +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> > > +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
> > >
> > > -  if (vtype != NULL_TREE
> > > -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> > > +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> > > +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
> > >      {
> > >        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> > > +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
> > >
> > > -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> > > +      gimple_call_set_lhs (call, in_ssa);
> > >        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> > > -      gimple_set_location (call, gimple_location (stmt));
> > > +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT
> > (stmt_info)));
> > > +
> > > +      *type_out = v_otype;
> > >
> > > -      *type_out = vtype;
> > > +      if (types_compatible_p (itype, otype))
> > > +	return call;
> > > +      else
> > > +	{
> > > +	  append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> > > +	  tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
> > >
> > > -      return call;
> > > +	  return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa);
> > > +	}
> > >      }
> > >
> > >    return NULL;
> > > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo,
> > > stmt_vec_info stmt_vinfo,
> > >
> > >    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
> > >      {
> > > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > > -							 IFN_SAT_ADD, type_out,
> > > -							 ops[0], ops[1]);
> > > -      if (call)
> > > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > > +							  IFN_SAT_ADD, type_out,
> > > +							  lhs, ops[0], ops[1]);
> > > +      if (stmt)
> > >  	{
> > >  	  vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> > > -	  return call;
> > > +	  return stmt;
> > >  	}
> > >      }
> > >
> > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo,
> > > stmt_vec_info stmt_vinfo,
> > >
> > >    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
> > >      {
> > > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > > -							 IFN_SAT_SUB, type_out,
> > > -							 ops[0], ops[1]);
> > > -      if (call)
> > > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > > +							  IFN_SAT_SUB, type_out,
> > > +							  lhs, ops[0], ops[1]);
> > > +      if (stmt)
> > >  	{
> > >  	  vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> > > -	  return call;
> > > +	  return stmt;
> > >  	}
> > >      }
> > >
> > > --
> > > 2.34.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-25  6:11           ` Tamar Christina
@ 2024-06-25  6:25             ` Li, Pan2
  2024-06-26  3:12               ` Li, Pan2
  0 siblings, 1 reply; 27+ messages in thread
From: Li, Pan2 @ 2024-06-25  6:25 UTC (permalink / raw)
  To: Tamar Christina, gcc-patches
  Cc: juzhe.zhong, kito.cheng, richard.guenther, jeffreyalaw, pinskia

Got it, thanks Tamar, will have a try.

Pan

-----Original Message-----
From: Tamar Christina <Tamar.Christina@arm.com> 
Sent: Tuesday, June 25, 2024 2:11 PM
To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org
Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com
Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip

> -----Original Message-----
> From: Li, Pan2 <pan2.li@intel.com>
> Sent: Tuesday, June 25, 2024 7:06 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> jeffreyalaw@gmail.com; pinskia@gmail.com
> Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> 
> > Ah, no you're right, those would end up wrong for saturation. Arg..  Sorry should
> have
> > though it through more.
> 
> Never mind, but you enlighten me for even more optimize with some restrictions. I
> revisited the pattern, for example as below.
> 
> uint16_t a, b;
> uint8_t result = (uint8_t)(a >= b ? a - b : 0);
> 
> => result = (char unsigned).SAT_SUB (a, b)
> 
> If a has a def like below
> uint8_t other = 0x1f;
> a = (uint8_t)other

You can in principle do this by querying range information,
e.g.

	  gimple_ranger ranger;
	  int_range_max r;
	  if (ranger.range_of_expr (r, oprnd0, stmt) && !r.undefined_p ())
	    {
...

We do this for instance in vect_recog_divmod_pattern.

Tamar

> 
> then we can safely convert result = (char unsigned).SAT_SUB (a, b) to
> result = .SAT_SUB ((char unsigned)a, (char unsigned).b)
> 
> Then we may have better vectorized code if a is limited to char unsigned. Of course
> we can do that based on this patch.
> 
> Pan
> 
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Tuesday, June 25, 2024 12:01 PM
> To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> jeffreyalaw@gmail.com; pinskia@gmail.com
> Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> 
> > -----Original Message-----
> > From: Li, Pan2 <pan2.li@intel.com>
> > Sent: Tuesday, June 25, 2024 3:25 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> > jeffreyalaw@gmail.com; pinskia@gmail.com
> > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> >
> > Thanks Tamar for comments. It indeed benefits the vectorized code, for example
> in
> > RISC-V, we may eliminate some vsetvel insn in loop for widen here.
> >
> > > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int)
> b_12(D));
> > > is cheaper than
> > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> >
> > I am not sure if it has any correctness problem for this transform, take uint16_t
> to
> > uint8_t as example.
> >
> > uint16_t a, b;
> > uint8_t result = (uint8_t)(a >= b ? a - b : 0);
> >
> > Given a = 0x100; // 256
> >            b = 0xff;     // 255
> > For iftmp.0_5 = .SAT_SUB ((char unsigned) a, (char unsigned) b) = .SAT_SUB (0,
> > 255) = 0
> > For iftmp.0_5 = (char unsigned).SAT_SUB (a, b) = (char unsigned).SAT_SUB (256,
> > 255) = 1
> >
> > Please help to correct me if any misunderstanding, thanks again for enlightening.
> 
> Ah, no you're right, those would end up wrong for saturation. Arg..  Sorry should
> have
> though it through more.
> 
> Tamar.
> >
> > Pan
> >
> > -----Original Message-----
> > From: Tamar Christina <Tamar.Christina@arm.com>
> > Sent: Tuesday, June 25, 2024 4:00 AM
> > To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org
> > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> > jeffreyalaw@gmail.com; pinskia@gmail.com
> > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> >
> > Hi,
> >
> > > -----Original Message-----
> > > From: pan2.li@intel.com <pan2.li@intel.com>
> > > Sent: Monday, June 24, 2024 2:55 PM
> > > To: gcc-patches@gcc.gnu.org
> > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> richard.guenther@gmail.com;
> > > jeffreyalaw@gmail.com; pinskia@gmail.com; Pan Li <pan2.li@intel.com>
> > > Subject: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> > >
> > > From: Pan Li <pan2.li@intel.com>
> > >
> > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> > > truncated as below:
> > >
> > > void test (uint16_t *x, unsigned b, unsigned n)
> > > {
> > >   unsigned a = 0;
> > >   register uint16_t *p = x;
> > >
> > >   do {
> > >     a = *--p;
> > >     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
> > >   } while (--n);
> > > }
> > >
> > > It will have gimple before vect pass,  it cannot hit any pattern of
> > > SAT_SUB and then cannot vectorize to SAT_SUB.
> > >
> > > _2 = a_11 - b_12(D);
> > > iftmp.0_13 = (short unsigned int) _2;
> > > _18 = a_11 >= b_12(D);
> > > iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > >
> > > This patch would like to improve the pattern match to recog above
> > > as truncate after .SAT_SUB pattern.  Then we will have the pattern
> > > similar to below,  as well as eliminate the first 3 dead stmt.
> > >
> > > _2 = a_11 - b_12(D);
> > > iftmp.0_13 = (short unsigned int) _2;
> > > _18 = a_11 >= b_12(D);
> > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> > >
> >
> > I guess this is because one branch of the  cond is a constant so the
> > convert is folded in.  I was wondering though,  can't we just push
> > in the truncate in this case?
> >
> > i.e. in this case we know both types are unsigned and the difference
> > positive and max value is the max value of the truncate type.
> >
> > It seems like folding as a general rule
> >
> >   _1 = *p_10;
> >   a_11 = (unsigned int) _1;
> >   _2 = a_11 - b_12(D);
> >   iftmp.0_13 = (short unsigned int) _2;
> >   _18 = a_11 >= b_12(D);
> >   iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> >   *p_10 = iftmp.0_5;
> >
> > Into
> >
> >   _1 = *p_10;
> >   a_11 = (unsigned int) _1;
> >   _2 = ((short unsigned int) a_11) - ((short unsigned int) b_12(D));
> >   iftmp.0_13 = _2;
> >   _18 = a_11 >= b_12(D);
> >   iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> >   *p_10 = iftmp.0_5;
> >
> > Is valid (though might have missed something).  This would negate the need for
> > this change to the vectorizer and saturation detection
> > but also should generate better vector code. This is what we do in the general
> case
> > https://godbolt.org/z/dfoj6fWdv
> > I think here we're just not seeing through the cond.
> >
> > Typically lots of architectures have cheap truncation operations, so truncating
> > before saturation means you do the cheap
> > operation first rather than doing the complex operation on the wider type.
> >
> > That is,
> >
> > _2 = a_11 - b_12(D);
> > iftmp.0_13 = (short unsigned int) _2;
> > _18 = a_11 >= b_12(D);
> > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D));
> >
> > is cheaper than
> >
> > _2 = a_11 - b_12(D);
> > iftmp.0_13 = (short unsigned int) _2;
> > _18 = a_11 >= b_12(D);
> > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> >
> > after vectorization.   Normally the vectorizer will try to do this through over-
> > widening detection as well,
> > but we haven't taught ranger about the ranges of these new IFNs (probably
> > should at some point).
> >
> > Cheers,
> > Tamar
> >
> > > The below tests are passed for this patch.
> > > 1. The rv64gcv fully regression tests.
> > > 2. The rv64gcv build with glibc.
> > > 3. The x86 bootstrap tests.
> > > 4. The x86 fully regression tests.
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* match.pd: Add convert description for minus and capture.
> > > 	* tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
> > > 	new logic to handle in_type is incompatibile with out_type,  as
> > > 	well as rename from.
> > > 	(vect_recog_build_binary_gimple_stmt): Rename to.
> > > 	(vect_recog_sat_add_pattern): Leverage above renamed func.
> > > 	(vect_recog_sat_sub_pattern): Ditto.
> > >
> > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > ---
> > >  gcc/match.pd              |  4 +--
> > >  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
> > >  2 files changed, 33 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index 3d0689c9312..4a4b0b2e72f 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >  /* Unsigned saturation sub, case 2 (branch with ge):
> > >     SAT_U_SUB = X >= Y ? X - Y : 0.  */
> > >  (match (unsigned_integer_sat_sub @0 @1)
> > > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> > > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1)))
> > > integer_zerop)
> > >   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > > -      && types_match (type, @0, @1))))
> > > +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
> > >
> > >  /* Unsigned saturation sub, case 3 (branchless with gt):
> > >     SAT_U_SUB = (X - Y) * (X > Y).  */
> > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> > > index cef901808eb..3d887d36050 100644
> > > --- a/gcc/tree-vect-patterns.cc
> > > +++ b/gcc/tree-vect-patterns.cc
> > > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
> > >  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
> > >  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
> > >
> > > -static gcall *
> > > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> > > +static gimple *
> > > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info
> > stmt_info,
> > >  				     internal_fn fn, tree *type_out,
> > > -				     tree op_0, tree op_1)
> > > +				     tree lhs, tree op_0, tree op_1)
> > >  {
> > >    tree itype = TREE_TYPE (op_0);
> > > -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> > > +  tree otype = TREE_TYPE (lhs);
> > > +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> > > +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
> > >
> > > -  if (vtype != NULL_TREE
> > > -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> > > +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> > > +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
> > >      {
> > >        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> > > +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
> > >
> > > -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> > > +      gimple_call_set_lhs (call, in_ssa);
> > >        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> > > -      gimple_set_location (call, gimple_location (stmt));
> > > +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT
> > (stmt_info)));
> > > +
> > > +      *type_out = v_otype;
> > >
> > > -      *type_out = vtype;
> > > +      if (types_compatible_p (itype, otype))
> > > +	return call;
> > > +      else
> > > +	{
> > > +	  append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> > > +	  tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
> > >
> > > -      return call;
> > > +	  return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa);
> > > +	}
> > >      }
> > >
> > >    return NULL;
> > > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo,
> > > stmt_vec_info stmt_vinfo,
> > >
> > >    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
> > >      {
> > > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > > -							 IFN_SAT_ADD, type_out,
> > > -							 ops[0], ops[1]);
> > > -      if (call)
> > > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > > +							  IFN_SAT_ADD, type_out,
> > > +							  lhs, ops[0], ops[1]);
> > > +      if (stmt)
> > >  	{
> > >  	  vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> > > -	  return call;
> > > +	  return stmt;
> > >  	}
> > >      }
> > >
> > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo,
> > > stmt_vec_info stmt_vinfo,
> > >
> > >    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
> > >      {
> > > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > > -							 IFN_SAT_SUB, type_out,
> > > -							 ops[0], ops[1]);
> > > -      if (call)
> > > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > > +							  IFN_SAT_SUB, type_out,
> > > +							  lhs, ops[0], ops[1]);
> > > +      if (stmt)
> > >  	{
> > >  	  vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> > > -	  return call;
> > > +	  return stmt;
> > >  	}
> > >      }
> > >
> > > --
> > > 2.34.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-25  6:25             ` Li, Pan2
@ 2024-06-26  3:12               ` Li, Pan2
  0 siblings, 0 replies; 27+ messages in thread
From: Li, Pan2 @ 2024-06-26  3:12 UTC (permalink / raw)
  To: Tamar Christina, gcc-patches
  Cc: juzhe.zhong, kito.cheng, richard.guenther, jeffreyalaw, pinskia

Thanks Tamar, gimple_ranger works well for that case, will send another patch after this one.

Pan

-----Original Message-----
From: Li, Pan2 
Sent: Tuesday, June 25, 2024 2:26 PM
To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com
Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip

Got it, thanks Tamar, will have a try.

Pan

-----Original Message-----
From: Tamar Christina <Tamar.Christina@arm.com> 
Sent: Tuesday, June 25, 2024 2:11 PM
To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org
Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com
Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip

> -----Original Message-----
> From: Li, Pan2 <pan2.li@intel.com>
> Sent: Tuesday, June 25, 2024 7:06 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> jeffreyalaw@gmail.com; pinskia@gmail.com
> Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> 
> > Ah, no you're right, those would end up wrong for saturation. Arg..  Sorry should
> have
> > though it through more.
> 
> Never mind, but you enlighten me for even more optimize with some restrictions. I
> revisited the pattern, for example as below.
> 
> uint16_t a, b;
> uint8_t result = (uint8_t)(a >= b ? a - b : 0);
> 
> => result = (char unsigned).SAT_SUB (a, b)
> 
> If a has a def like below
> uint8_t other = 0x1f;
> a = (uint8_t)other

You can in principle do this by querying range information,
e.g.

	  gimple_ranger ranger;
	  int_range_max r;
	  if (ranger.range_of_expr (r, oprnd0, stmt) && !r.undefined_p ())
	    {
...

We do this for instance in vect_recog_divmod_pattern.

Tamar

> 
> then we can safely convert result = (char unsigned).SAT_SUB (a, b) to
> result = .SAT_SUB ((char unsigned)a, (char unsigned).b)
> 
> Then we may have better vectorized code if a is limited to char unsigned. Of course
> we can do that based on this patch.
> 
> Pan
> 
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Tuesday, June 25, 2024 12:01 PM
> To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> jeffreyalaw@gmail.com; pinskia@gmail.com
> Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> 
> > -----Original Message-----
> > From: Li, Pan2 <pan2.li@intel.com>
> > Sent: Tuesday, June 25, 2024 3:25 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> > jeffreyalaw@gmail.com; pinskia@gmail.com
> > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> >
> > Thanks Tamar for comments. It indeed benefits the vectorized code, for example
> in
> > RISC-V, we may eliminate some vsetvl insn in loop for widen here.
> >
> > > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int)
> b_12(D));
> > > is cheaper than
> > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> >
> > I am not sure if it has any correctness problem for this transform, take uint16_t
> to
> > uint8_t as example.
> >
> > uint16_t a, b;
> > uint8_t result = (uint8_t)(a >= b ? a - b : 0);
> >
> > Given a = 0x100; // 256
> >            b = 0xff;     // 255
> > For iftmp.0_5 = .SAT_SUB ((char unsigned) a, (char unsigned) b) = .SAT_SUB (0,
> > 255) = 0
> > For iftmp.0_5 = (char unsigned).SAT_SUB (a, b) = (char unsigned).SAT_SUB (256,
> > 255) = 1
> >
> > Please help to correct me if any misunderstanding, thanks again for enlightening.
> 
> Ah, no you're right, those would end up wrong for saturation. Arg..  Sorry should
> have
> though it through more.
> 
> Tamar.
> >
> > Pan
> >
> > -----Original Message-----
> > From: Tamar Christina <Tamar.Christina@arm.com>
> > Sent: Tuesday, June 25, 2024 4:00 AM
> > To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org
> > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> > jeffreyalaw@gmail.com; pinskia@gmail.com
> > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> >
> > Hi,
> >
> > > -----Original Message-----
> > > From: pan2.li@intel.com <pan2.li@intel.com>
> > > Sent: Monday, June 24, 2024 2:55 PM
> > > To: gcc-patches@gcc.gnu.org
> > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> richard.guenther@gmail.com;
> > > jeffreyalaw@gmail.com; pinskia@gmail.com; Pan Li <pan2.li@intel.com>
> > > Subject: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
> > >
> > > From: Pan Li <pan2.li@intel.com>
> > >
> > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> > > truncated as below:
> > >
> > > void test (uint16_t *x, unsigned b, unsigned n)
> > > {
> > >   unsigned a = 0;
> > >   register uint16_t *p = x;
> > >
> > >   do {
> > >     a = *--p;
> > >     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
> > >   } while (--n);
> > > }
> > >
> > > It will have gimple before vect pass,  it cannot hit any pattern of
> > > SAT_SUB and then cannot vectorize to SAT_SUB.
> > >
> > > _2 = a_11 - b_12(D);
> > > iftmp.0_13 = (short unsigned int) _2;
> > > _18 = a_11 >= b_12(D);
> > > iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > >
> > > This patch would like to improve the pattern match to recog above
> > > as truncate after .SAT_SUB pattern.  Then we will have the pattern
> > > similar to below,  as well as eliminate the first 3 dead stmt.
> > >
> > > _2 = a_11 - b_12(D);
> > > iftmp.0_13 = (short unsigned int) _2;
> > > _18 = a_11 >= b_12(D);
> > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> > >
> >
> > I guess this is because one branch of the  cond is a constant so the
> > convert is folded in.  I was wondering though,  can't we just push
> > in the truncate in this case?
> >
> > i.e. in this case we know both types are unsigned and the difference
> > positive and max value is the max value of the truncate type.
> >
> > It seems like folding as a general rule
> >
> >   _1 = *p_10;
> >   a_11 = (unsigned int) _1;
> >   _2 = a_11 - b_12(D);
> >   iftmp.0_13 = (short unsigned int) _2;
> >   _18 = a_11 >= b_12(D);
> >   iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> >   *p_10 = iftmp.0_5;
> >
> > Into
> >
> >   _1 = *p_10;
> >   a_11 = (unsigned int) _1;
> >   _2 = ((short unsigned int) a_11) - ((short unsigned int) b_12(D));
> >   iftmp.0_13 = _2;
> >   _18 = a_11 >= b_12(D);
> >   iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> >   *p_10 = iftmp.0_5;
> >
> > Is valid (though might have missed something).  This would negate the need for
> > this change to the vectorizer and saturation detection
> > but also should generate better vector code. This is what we do in the general
> case
> > https://godbolt.org/z/dfoj6fWdv
> > I think here we're just not seeing through the cond.
> >
> > Typically lots of architectures have cheap truncation operations, so truncating
> > before saturation means you do the cheap
> > operation first rather than doing the complex operation on the wider type.
> >
> > That is,
> >
> > _2 = a_11 - b_12(D);
> > iftmp.0_13 = (short unsigned int) _2;
> > _18 = a_11 >= b_12(D);
> > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D));
> >
> > is cheaper than
> >
> > _2 = a_11 - b_12(D);
> > iftmp.0_13 = (short unsigned int) _2;
> > _18 = a_11 >= b_12(D);
> > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> >
> > after vectorization.   Normally the vectorizer will try to do this through over-
> > widening detection as well,
> > but we haven't taught ranger about the ranges of these new IFNs (probably
> > should at some point).
> >
> > Cheers,
> > Tamar
> >
> > > The below tests are passed for this patch.
> > > 1. The rv64gcv fully regression tests.
> > > 2. The rv64gcv build with glibc.
> > > 3. The x86 bootstrap tests.
> > > 4. The x86 fully regression tests.
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* match.pd: Add convert description for minus and capture.
> > > 	* tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
> > > 	new logic to handle in_type is incompatibile with out_type,  as
> > > 	well as rename from.
> > > 	(vect_recog_build_binary_gimple_stmt): Rename to.
> > > 	(vect_recog_sat_add_pattern): Leverage above renamed func.
> > > 	(vect_recog_sat_sub_pattern): Ditto.
> > >
> > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > ---
> > >  gcc/match.pd              |  4 +--
> > >  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
> > >  2 files changed, 33 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index 3d0689c9312..4a4b0b2e72f 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >  /* Unsigned saturation sub, case 2 (branch with ge):
> > >     SAT_U_SUB = X >= Y ? X - Y : 0.  */
> > >  (match (unsigned_integer_sat_sub @0 @1)
> > > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> > > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1)))
> > > integer_zerop)
> > >   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > > -      && types_match (type, @0, @1))))
> > > +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
> > >
> > >  /* Unsigned saturation sub, case 3 (branchless with gt):
> > >     SAT_U_SUB = (X - Y) * (X > Y).  */
> > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> > > index cef901808eb..3d887d36050 100644
> > > --- a/gcc/tree-vect-patterns.cc
> > > +++ b/gcc/tree-vect-patterns.cc
> > > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
> > >  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
> > >  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
> > >
> > > -static gcall *
> > > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> > > +static gimple *
> > > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info
> > stmt_info,
> > >  				     internal_fn fn, tree *type_out,
> > > -				     tree op_0, tree op_1)
> > > +				     tree lhs, tree op_0, tree op_1)
> > >  {
> > >    tree itype = TREE_TYPE (op_0);
> > > -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> > > +  tree otype = TREE_TYPE (lhs);
> > > +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> > > +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
> > >
> > > -  if (vtype != NULL_TREE
> > > -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> > > +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> > > +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
> > >      {
> > >        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> > > +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
> > >
> > > -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> > > +      gimple_call_set_lhs (call, in_ssa);
> > >        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> > > -      gimple_set_location (call, gimple_location (stmt));
> > > +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT
> > (stmt_info)));
> > > +
> > > +      *type_out = v_otype;
> > >
> > > -      *type_out = vtype;
> > > +      if (types_compatible_p (itype, otype))
> > > +	return call;
> > > +      else
> > > +	{
> > > +	  append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> > > +	  tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
> > >
> > > -      return call;
> > > +	  return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa);
> > > +	}
> > >      }
> > >
> > >    return NULL;
> > > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo,
> > > stmt_vec_info stmt_vinfo,
> > >
> > >    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
> > >      {
> > > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > > -							 IFN_SAT_ADD, type_out,
> > > -							 ops[0], ops[1]);
> > > -      if (call)
> > > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > > +							  IFN_SAT_ADD, type_out,
> > > +							  lhs, ops[0], ops[1]);
> > > +      if (stmt)
> > >  	{
> > >  	  vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> > > -	  return call;
> > > +	  return stmt;
> > >  	}
> > >      }
> > >
> > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo,
> > > stmt_vec_info stmt_vinfo,
> > >
> > >    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
> > >      {
> > > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > > -							 IFN_SAT_SUB, type_out,
> > > -							 ops[0], ops[1]);
> > > -      if (call)
> > > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > > +							  IFN_SAT_SUB, type_out,
> > > +							  lhs, ops[0], ops[1]);
> > > +      if (stmt)
> > >  	{
> > >  	  vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> > > -	  return call;
> > > +	  return stmt;
> > >  	}
> > >      }
> > >
> > > --
> > > 2.34.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-24 13:55 ` [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip pan2.li
  2024-06-24 19:59   ` Tamar Christina
@ 2024-06-26 13:56   ` Richard Biener
  2024-06-26 14:22     ` Li, Pan2
  2024-06-27  6:48   ` Uros Bizjak
  2 siblings, 1 reply; 27+ messages in thread
From: Richard Biener @ 2024-06-26 13:56 UTC (permalink / raw)
  To: pan2.li; +Cc: gcc-patches, juzhe.zhong, kito.cheng, jeffreyalaw, pinskia

On Mon, Jun 24, 2024 at 3:55 PM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> truncated as below:
>
> void test (uint16_t *x, unsigned b, unsigned n)
> {
>   unsigned a = 0;
>   register uint16_t *p = x;
>
>   do {
>     a = *--p;
>     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
>   } while (--n);
> }
>
> It will have gimple before vect pass,  it cannot hit any pattern of
> SAT_SUB and then cannot vectorize to SAT_SUB.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>
> This patch would like to improve the pattern match to recog above
> as truncate after .SAT_SUB pattern.  Then we will have the pattern
> similar to below,  as well as eliminate the first 3 dead stmt.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
>
> The below tests are passed for this patch.
> 1. The rv64gcv fully regression tests.
> 2. The rv64gcv build with glibc.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
>
> gcc/ChangeLog:
>
>         * match.pd: Add convert description for minus and capture.
>         * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
>         new logic to handle in_type is incompatibile with out_type,  as
>         well as rename from.
>         (vect_recog_build_binary_gimple_stmt): Rename to.
>         (vect_recog_sat_add_pattern): Leverage above renamed func.
>         (vect_recog_sat_sub_pattern): Ditto.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd              |  4 +--
>  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
>  2 files changed, 33 insertions(+), 22 deletions(-)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 3d0689c9312..4a4b0b2e72f 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  /* Unsigned saturation sub, case 2 (branch with ge):
>     SAT_U_SUB = X >= Y ? X - Y : 0.  */
>  (match (unsigned_integer_sat_sub @0 @1)
> - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop)
>   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> -      && types_match (type, @0, @1))))
> +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))

I suppose the other patterns can see similar enhancements for the case
their forms
show up truncated or extended?

>  /* Unsigned saturation sub, case 3 (branchless with gt):
>     SAT_U_SUB = (X - Y) * (X > Y).  */
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index cef901808eb..3d887d36050 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
>  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
>  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
>
> -static gcall *
> -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> +static gimple *
> +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>                                      internal_fn fn, tree *type_out,
> -                                    tree op_0, tree op_1)
> +                                    tree lhs, tree op_0, tree op_1)
>  {
>    tree itype = TREE_TYPE (op_0);
> -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree otype = TREE_TYPE (lhs);
> +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
>
> -  if (vtype != NULL_TREE
> -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
>      {
>        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
>
> -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> +      gimple_call_set_lhs (call, in_ssa);
>        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> -      gimple_set_location (call, gimple_location (stmt));
> +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info)));
> +
> +      *type_out = v_otype;
>
> -      *type_out = vtype;
> +      if (types_compatible_p (itype, otype))
> +       return call;
> +      else
> +       {
> +         append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> +         tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
>
> -      return call;
> +         return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa);

Please use NOP_EXPR here.

> +       }
>      }
>
>    return NULL;
> @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
>    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -                                                        IFN_SAT_ADD, type_out,
> -                                                        ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +                                                         IFN_SAT_ADD, type_out,
> +                                                         lhs, ops[0], ops[1]);
> +      if (stmt)
>         {
>           vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> -         return call;
> +         return stmt;
>         }
>      }
>
> @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
>    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -                                                        IFN_SAT_SUB, type_out,
> -                                                        ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +                                                         IFN_SAT_SUB, type_out,
> +                                                         lhs, ops[0], ops[1]);
> +      if (stmt)
>         {
>           vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> -         return call;
> +         return stmt;
>         }
>      }
>
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-26 13:56   ` Richard Biener
@ 2024-06-26 14:22     ` Li, Pan2
  0 siblings, 0 replies; 27+ messages in thread
From: Li, Pan2 @ 2024-06-26 14:22 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, juzhe.zhong, kito.cheng, jeffreyalaw, pinskia

> I suppose the other patterns can see similar enhancements for the case
> their forms
> show up truncated or extended?

Yes, just want to highlight that this form comes from the zip benchmark.
Of course, the rest forms are planed in underlying Patch(es).

> Please use NOP_EXPR here.

Sure, and will send the v2 if no surprise from test.

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: Wednesday, June 26, 2024 9:56 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com
Subject: Re: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip

On Mon, Jun 24, 2024 at 3:55 PM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> truncated as below:
>
> void test (uint16_t *x, unsigned b, unsigned n)
> {
>   unsigned a = 0;
>   register uint16_t *p = x;
>
>   do {
>     a = *--p;
>     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
>   } while (--n);
> }
>
> It will have gimple before vect pass,  it cannot hit any pattern of
> SAT_SUB and then cannot vectorize to SAT_SUB.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>
> This patch would like to improve the pattern match to recog above
> as truncate after .SAT_SUB pattern.  Then we will have the pattern
> similar to below,  as well as eliminate the first 3 dead stmt.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
>
> The below tests are passed for this patch.
> 1. The rv64gcv fully regression tests.
> 2. The rv64gcv build with glibc.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
>
> gcc/ChangeLog:
>
>         * match.pd: Add convert description for minus and capture.
>         * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
>         new logic to handle in_type is incompatibile with out_type,  as
>         well as rename from.
>         (vect_recog_build_binary_gimple_stmt): Rename to.
>         (vect_recog_sat_add_pattern): Leverage above renamed func.
>         (vect_recog_sat_sub_pattern): Ditto.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd              |  4 +--
>  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
>  2 files changed, 33 insertions(+), 22 deletions(-)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 3d0689c9312..4a4b0b2e72f 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  /* Unsigned saturation sub, case 2 (branch with ge):
>     SAT_U_SUB = X >= Y ? X - Y : 0.  */
>  (match (unsigned_integer_sat_sub @0 @1)
> - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop)
>   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> -      && types_match (type, @0, @1))))
> +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))

I suppose the other patterns can see similar enhancements for the case
their forms
show up truncated or extended?

>  /* Unsigned saturation sub, case 3 (branchless with gt):
>     SAT_U_SUB = (X - Y) * (X > Y).  */
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index cef901808eb..3d887d36050 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
>  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
>  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
>
> -static gcall *
> -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> +static gimple *
> +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>                                      internal_fn fn, tree *type_out,
> -                                    tree op_0, tree op_1)
> +                                    tree lhs, tree op_0, tree op_1)
>  {
>    tree itype = TREE_TYPE (op_0);
> -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree otype = TREE_TYPE (lhs);
> +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
>
> -  if (vtype != NULL_TREE
> -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
>      {
>        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
>
> -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> +      gimple_call_set_lhs (call, in_ssa);
>        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> -      gimple_set_location (call, gimple_location (stmt));
> +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info)));
> +
> +      *type_out = v_otype;
>
> -      *type_out = vtype;
> +      if (types_compatible_p (itype, otype))
> +       return call;
> +      else
> +       {
> +         append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> +         tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
>
> -      return call;
> +         return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa);

Please use NOP_EXPR here.

> +       }
>      }
>
>    return NULL;
> @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
>    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -                                                        IFN_SAT_ADD, type_out,
> -                                                        ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +                                                         IFN_SAT_ADD, type_out,
> +                                                         lhs, ops[0], ops[1]);
> +      if (stmt)
>         {
>           vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> -         return call;
> +         return stmt;
>         }
>      }
>
> @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
>    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -                                                        IFN_SAT_SUB, type_out,
> -                                                        ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +                                                         IFN_SAT_SUB, type_out,
> +                                                         lhs, ops[0], ops[1]);
> +      if (stmt)
>         {
>           vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> -         return call;
> +         return stmt;
>         }
>      }
>
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-21  3:52 [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB pan2.li
  2024-06-21  7:00 ` Richard Biener
  2024-06-24 13:55 ` [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip pan2.li
@ 2024-06-27  1:31 ` pan2.li
  2024-06-27  6:04   ` Richard Biener
  2 siblings, 1 reply; 27+ messages in thread
From: pan2.li @ 2024-06-27  1:31 UTC (permalink / raw)
  To: gcc-patches
  Cc: juzhe.zhong, kito.cheng, richard.guenther, jeffreyalaw,
	rdapp.gcc, Pan Li

From: Pan Li <pan2.li@intel.com>

The zip benchmark of coremark-pro have one SAT_SUB like pattern but
truncated as below:

void test (uint16_t *x, unsigned b, unsigned n)
{
  unsigned a = 0;
  register uint16_t *p = x;

  do {
    a = *--p;
    *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
  } while (--n);
}

It will have gimple before vect pass,  it cannot hit any pattern of
SAT_SUB and then cannot vectorize to SAT_SUB.

_2 = a_11 - b_12(D);
iftmp.0_13 = (short unsigned int) _2;
_18 = a_11 >= b_12(D);
iftmp.0_5 = _18 ? iftmp.0_13 : 0;

This patch would like to improve the pattern match to recog above
as truncate after .SAT_SUB pattern.  Then we will have the pattern
similar to below,  as well as eliminate the first 3 dead stmt.

_2 = a_11 - b_12(D);
iftmp.0_13 = (short unsigned int) _2;
_18 = a_11 >= b_12(D);
iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));

The below tests are passed for this patch.
1. The rv64gcv fully regression tests.
2. The rv64gcv build with glibc.
3. The x86 bootstrap tests.
4. The x86 fully regression tests.

gcc/ChangeLog:

	* match.pd: Add convert description for minus and capture.
	* tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
	new logic to handle in_type is incompatible with out_type,  as
	well as rename from.
	(vect_recog_build_binary_gimple_stmt): Rename to.
	(vect_recog_sat_add_pattern): Leverage above renamed func.
	(vect_recog_sat_sub_pattern): Ditto.

Signed-off-by: Pan Li <pan2.li@intel.com>
---
 gcc/match.pd              |  4 +--
 gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
 2 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index cf8a399a744..820591a36b3 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 /* Unsigned saturation sub, case 2 (branch with ge):
    SAT_U_SUB = X >= Y ? X - Y : 0.  */
 (match (unsigned_integer_sat_sub @0 @1)
- (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
+ (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop)
  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
-      && types_match (type, @0, @1))))
+      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
 
 /* Unsigned saturation sub, case 3 (branchless with gt):
    SAT_U_SUB = (X - Y) * (X > Y).  */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index cef901808eb..519d15f2a43 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
 extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
 extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
 
-static gcall *
-vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
+static gimple *
+vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
 				     internal_fn fn, tree *type_out,
-				     tree op_0, tree op_1)
+				     tree lhs, tree op_0, tree op_1)
 {
   tree itype = TREE_TYPE (op_0);
-  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
+  tree otype = TREE_TYPE (lhs);
+  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
+  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
 
-  if (vtype != NULL_TREE
-    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
+  if (v_itype != NULL_TREE && v_otype != NULL_TREE
+    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
     {
       gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
+      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
 
-      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
+      gimple_call_set_lhs (call, in_ssa);
       gimple_call_set_nothrow (call, /* nothrow_p */ false);
-      gimple_set_location (call, gimple_location (stmt));
+      gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info)));
+
+      *type_out = v_otype;
 
-      *type_out = vtype;
+      if (types_compatible_p (itype, otype))
+	return call;
+      else
+	{
+	  append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
+	  tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
 
-      return call;
+	  return gimple_build_assign (out_ssa, NOP_EXPR, in_ssa);
+	}
     }
 
   return NULL;
@@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
 
   if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
     {
-      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
-							 IFN_SAT_ADD, type_out,
-							 ops[0], ops[1]);
-      if (call)
+      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
+							  IFN_SAT_ADD, type_out,
+							  lhs, ops[0], ops[1]);
+      if (stmt)
 	{
 	  vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
-	  return call;
+	  return stmt;
 	}
     }
 
@@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
 
   if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
     {
-      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
-							 IFN_SAT_SUB, type_out,
-							 ops[0], ops[1]);
-      if (call)
+      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
+							  IFN_SAT_SUB, type_out,
+							  lhs, ops[0], ops[1]);
+      if (stmt)
 	{
 	  vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
-	  return call;
+	  return stmt;
 	}
     }
 
-- 
2.34.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-27  1:31 ` [PATCH v3] " pan2.li
@ 2024-06-27  6:04   ` Richard Biener
  2024-06-27  6:14     ` Li, Pan2
  0 siblings, 1 reply; 27+ messages in thread
From: Richard Biener @ 2024-06-27  6:04 UTC (permalink / raw)
  To: pan2.li; +Cc: gcc-patches, juzhe.zhong, kito.cheng, jeffreyalaw, rdapp.gcc

On Thu, Jun 27, 2024 at 3:31 AM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>

OK

> The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> truncated as below:
>
> void test (uint16_t *x, unsigned b, unsigned n)
> {
>   unsigned a = 0;
>   register uint16_t *p = x;
>
>   do {
>     a = *--p;
>     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
>   } while (--n);
> }
>
> It will have gimple before vect pass,  it cannot hit any pattern of
> SAT_SUB and then cannot vectorize to SAT_SUB.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>
> This patch would like to improve the pattern match to recog above
> as truncate after .SAT_SUB pattern.  Then we will have the pattern
> similar to below,  as well as eliminate the first 3 dead stmt.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
>
> The below tests are passed for this patch.
> 1. The rv64gcv fully regression tests.
> 2. The rv64gcv build with glibc.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
>
> gcc/ChangeLog:
>
>         * match.pd: Add convert description for minus and capture.
>         * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
>         new logic to handle in_type is incompatibile with out_type,  as
>         well as rename from.
>         (vect_recog_build_binary_gimple_stmt): Rename to.
>         (vect_recog_sat_add_pattern): Leverage above renamed func.
>         (vect_recog_sat_sub_pattern): Ditto.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd              |  4 +--
>  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
>  2 files changed, 33 insertions(+), 22 deletions(-)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index cf8a399a744..820591a36b3 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  /* Unsigned saturation sub, case 2 (branch with ge):
>     SAT_U_SUB = X >= Y ? X - Y : 0.  */
>  (match (unsigned_integer_sat_sub @0 @1)
> - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop)
>   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> -      && types_match (type, @0, @1))))
> +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
>
>  /* Unsigned saturation sub, case 3 (branchless with gt):
>     SAT_U_SUB = (X - Y) * (X > Y).  */
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index cef901808eb..519d15f2a43 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
>  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
>  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
>
> -static gcall *
> -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> +static gimple *
> +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>                                      internal_fn fn, tree *type_out,
> -                                    tree op_0, tree op_1)
> +                                    tree lhs, tree op_0, tree op_1)
>  {
>    tree itype = TREE_TYPE (op_0);
> -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree otype = TREE_TYPE (lhs);
> +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
>
> -  if (vtype != NULL_TREE
> -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
>      {
>        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
>
> -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> +      gimple_call_set_lhs (call, in_ssa);
>        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> -      gimple_set_location (call, gimple_location (stmt));
> +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info)));
> +
> +      *type_out = v_otype;
>
> -      *type_out = vtype;
> +      if (types_compatible_p (itype, otype))
> +       return call;
> +      else
> +       {
> +         append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> +         tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
>
> -      return call;
> +         return gimple_build_assign (out_ssa, NOP_EXPR, in_ssa);
> +       }
>      }
>
>    return NULL;
> @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
>    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -                                                        IFN_SAT_ADD, type_out,
> -                                                        ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +                                                         IFN_SAT_ADD, type_out,
> +                                                         lhs, ops[0], ops[1]);
> +      if (stmt)
>         {
>           vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> -         return call;
> +         return stmt;
>         }
>      }
>
> @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
>    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -                                                        IFN_SAT_SUB, type_out,
> -                                                        ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +                                                         IFN_SAT_SUB, type_out,
> +                                                         lhs, ops[0], ops[1]);
> +      if (stmt)
>         {
>           vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> -         return call;
> +         return stmt;
>         }
>      }
>
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-27  6:04   ` Richard Biener
@ 2024-06-27  6:14     ` Li, Pan2
  2024-06-27 14:45       ` Li, Pan2
  0 siblings, 1 reply; 27+ messages in thread
From: Li, Pan2 @ 2024-06-27  6:14 UTC (permalink / raw)
  To: Richard Biener
  Cc: gcc-patches, juzhe.zhong, kito.cheng, jeffreyalaw, rdapp.gcc

> OK

Committed, thanks Richard.

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: Thursday, June 27, 2024 2:04 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip

On Thu, Jun 27, 2024 at 3:31 AM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>

OK

> The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> truncated as below:
>
> void test (uint16_t *x, unsigned b, unsigned n)
> {
>   unsigned a = 0;
>   register uint16_t *p = x;
>
>   do {
>     a = *--p;
>     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
>   } while (--n);
> }
>
> It will have gimple before vect pass,  it cannot hit any pattern of
> SAT_SUB and then cannot vectorize to SAT_SUB.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>
> This patch would like to improve the pattern match to recog above
> as truncate after .SAT_SUB pattern.  Then we will have the pattern
> similar to below,  as well as eliminate the first 3 dead stmt.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
>
> The below tests are passed for this patch.
> 1. The rv64gcv fully regression tests.
> 2. The rv64gcv build with glibc.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
>
> gcc/ChangeLog:
>
>         * match.pd: Add convert description for minus and capture.
>         * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
>         new logic to handle in_type is incompatibile with out_type,  as
>         well as rename from.
>         (vect_recog_build_binary_gimple_stmt): Rename to.
>         (vect_recog_sat_add_pattern): Leverage above renamed func.
>         (vect_recog_sat_sub_pattern): Ditto.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd              |  4 +--
>  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
>  2 files changed, 33 insertions(+), 22 deletions(-)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index cf8a399a744..820591a36b3 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  /* Unsigned saturation sub, case 2 (branch with ge):
>     SAT_U_SUB = X >= Y ? X - Y : 0.  */
>  (match (unsigned_integer_sat_sub @0 @1)
> - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop)
>   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> -      && types_match (type, @0, @1))))
> +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
>
>  /* Unsigned saturation sub, case 3 (branchless with gt):
>     SAT_U_SUB = (X - Y) * (X > Y).  */
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index cef901808eb..519d15f2a43 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
>  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
>  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
>
> -static gcall *
> -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> +static gimple *
> +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>                                      internal_fn fn, tree *type_out,
> -                                    tree op_0, tree op_1)
> +                                    tree lhs, tree op_0, tree op_1)
>  {
>    tree itype = TREE_TYPE (op_0);
> -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree otype = TREE_TYPE (lhs);
> +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
>
> -  if (vtype != NULL_TREE
> -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
>      {
>        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
>
> -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> +      gimple_call_set_lhs (call, in_ssa);
>        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> -      gimple_set_location (call, gimple_location (stmt));
> +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info)));
> +
> +      *type_out = v_otype;
>
> -      *type_out = vtype;
> +      if (types_compatible_p (itype, otype))
> +       return call;
> +      else
> +       {
> +         append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> +         tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
>
> -      return call;
> +         return gimple_build_assign (out_ssa, NOP_EXPR, in_ssa);
> +       }
>      }
>
>    return NULL;
> @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
>    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -                                                        IFN_SAT_ADD, type_out,
> -                                                        ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +                                                         IFN_SAT_ADD, type_out,
> +                                                         lhs, ops[0], ops[1]);
> +      if (stmt)
>         {
>           vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> -         return call;
> +         return stmt;
>         }
>      }
>
> @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
>    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -                                                        IFN_SAT_SUB, type_out,
> -                                                        ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +                                                         IFN_SAT_SUB, type_out,
> +                                                         lhs, ops[0], ops[1]);
> +      if (stmt)
>         {
>           vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> -         return call;
> +         return stmt;
>         }
>      }
>
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-24 13:55 ` [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip pan2.li
  2024-06-24 19:59   ` Tamar Christina
  2024-06-26 13:56   ` Richard Biener
@ 2024-06-27  6:48   ` Uros Bizjak
  2024-06-27  7:00     ` Li, Pan2
  2 siblings, 1 reply; 27+ messages in thread
From: Uros Bizjak @ 2024-06-27  6:48 UTC (permalink / raw)
  To: pan2.li
  Cc: gcc-patches, juzhe.zhong, kito.cheng, richard.guenther,
	jeffreyalaw, pinskia

On Mon, Jun 24, 2024 at 3:55 PM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> truncated as below:
>
> void test (uint16_t *x, unsigned b, unsigned n)
> {
>   unsigned a = 0;
>   register uint16_t *p = x;
>
>   do {
>     a = *--p;
>     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
>   } while (--n);
> }
>
> It will have gimple before vect pass,  it cannot hit any pattern of
> SAT_SUB and then cannot vectorize to SAT_SUB.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>
> This patch would like to improve the pattern match to recog above
> as truncate after .SAT_SUB pattern.  Then we will have the pattern
> similar to below,  as well as eliminate the first 3 dead stmt.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
>
> The below tests are passed for this patch.
> 1. The rv64gcv fully regression tests.
> 2. The rv64gcv build with glibc.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.

I have tried this patch with x86_64 on the testcase from PR51492, but
the compiler does not recognize the .SAT_SUB pattern here.

Is there anything else missing for successful detection?

Uros.

>
> gcc/ChangeLog:
>
>         * match.pd: Add convert description for minus and capture.
>         * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
>         new logic to handle in_type is incompatibile with out_type,  as
>         well as rename from.
>         (vect_recog_build_binary_gimple_stmt): Rename to.
>         (vect_recog_sat_add_pattern): Leverage above renamed func.
>         (vect_recog_sat_sub_pattern): Ditto.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd              |  4 +--
>  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
>  2 files changed, 33 insertions(+), 22 deletions(-)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 3d0689c9312..4a4b0b2e72f 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  /* Unsigned saturation sub, case 2 (branch with ge):
>     SAT_U_SUB = X >= Y ? X - Y : 0.  */
>  (match (unsigned_integer_sat_sub @0 @1)
> - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop)
>   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> -      && types_match (type, @0, @1))))
> +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
>
>  /* Unsigned saturation sub, case 3 (branchless with gt):
>     SAT_U_SUB = (X - Y) * (X > Y).  */
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index cef901808eb..3d887d36050 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
>  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
>  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
>
> -static gcall *
> -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> +static gimple *
> +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>                                      internal_fn fn, tree *type_out,
> -                                    tree op_0, tree op_1)
> +                                    tree lhs, tree op_0, tree op_1)
>  {
>    tree itype = TREE_TYPE (op_0);
> -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree otype = TREE_TYPE (lhs);
> +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
>
> -  if (vtype != NULL_TREE
> -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
>      {
>        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
>
> -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> +      gimple_call_set_lhs (call, in_ssa);
>        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> -      gimple_set_location (call, gimple_location (stmt));
> +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info)));
> +
> +      *type_out = v_otype;
>
> -      *type_out = vtype;
> +      if (types_compatible_p (itype, otype))
> +       return call;
> +      else
> +       {
> +         append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> +         tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
>
> -      return call;
> +         return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa);
> +       }
>      }
>
>    return NULL;
> @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
>    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -                                                        IFN_SAT_ADD, type_out,
> -                                                        ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +                                                         IFN_SAT_ADD, type_out,
> +                                                         lhs, ops[0], ops[1]);
> +      if (stmt)
>         {
>           vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> -         return call;
> +         return stmt;
>         }
>      }
>
> @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
>    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -                                                        IFN_SAT_SUB, type_out,
> -                                                        ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +                                                         IFN_SAT_SUB, type_out,
> +                                                         lhs, ops[0], ops[1]);
> +      if (stmt)
>         {
>           vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> -         return call;
> +         return stmt;
>         }
>      }
>
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-27  6:48   ` Uros Bizjak
@ 2024-06-27  7:00     ` Li, Pan2
  2024-06-27  9:22       ` Uros Bizjak
  0 siblings, 1 reply; 27+ messages in thread
From: Li, Pan2 @ 2024-06-27  7:00 UTC (permalink / raw)
  To: Uros Bizjak
  Cc: gcc-patches, juzhe.zhong, kito.cheng, richard.guenther,
	jeffreyalaw, pinskia

It only requires the backend implement the standard name for vector mode I bet.
How about a simpler one like below.

  #define DEF_VEC_SAT_U_SUB_TRUNC_FMT_1(OUT_T, IN_T)                   \
  void __attribute__((noinline))                                       \
  vec_sat_u_sub_trunc_##OUT_T##_fmt_1 (OUT_T *out, IN_T *op_1, IN_T y, \
       unsigned limit)                 \
  {                                                                    \
    unsigned i;                                                        \
    for (i = 0; i < limit; i++)                                        \
      {                                                                \
        IN_T x = op_1[i];                                              \
        out[i] = (OUT_T)(x >= y ? x - y : 0);                          \
      }                                                                \
  }

DEF_VEC_SAT_U_SUB_TRUNC_FMT_1(uint32_t, uint64_t);

The riscv backend is able to detect the pattern similar as below. I can help to check x86 side after the running test suites.

;;   basic block 2, loop depth 0
;;    pred:       ENTRY
  if (limit_11(D) != 0)
    goto <bb 3>; [89.00%]
  else
    goto <bb 5>; [11.00%]
;;    succ:       3
;;                5
;;   basic block 3, loop depth 0
;;    pred:       2
  vect_cst__71 = [vec_duplicate_expr] y_14(D);
  _78 = (unsigned long) limit_11(D);
;;    succ:       4

;;   basic block 4, loop depth 1
;;    pred:       4
;;                3
  # vectp_op_1.7_68 = PHI <vectp_op_1.7_69(4), op_1_12(D)(3)>
  # vectp_out.12_75 = PHI <vectp_out.12_76(4), out_16(D)(3)>
  # ivtmp_79 = PHI <ivtmp_80(4), _78(3)>
  _81 = .SELECT_VL (ivtmp_79, POLY_INT_CST [2, 2]);
  ivtmp_67 = _81 * 8;
  vect_x_13.9_70 = .MASK_LEN_LOAD (vectp_op_1.7_68, 64B, { -1, ... }, _81, 0);
  vect_patt_48.10_72 = .SAT_SUB (vect_x_13.9_70, vect_cst__71);                              // .SAT_SUB pattern
  vect_patt_49.11_73 = (vector([2,2]) unsigned int) vect_patt_48.10_72;
  ivtmp_74 = _81 * 4;
  .MASK_LEN_STORE (vectp_out.12_75, 32B, { -1, ... }, _81, 0, vect_patt_49.11_73);
  vectp_op_1.7_69 = vectp_op_1.7_68 + ivtmp_67;
  vectp_out.12_76 = vectp_out.12_75 + ivtmp_74;
  ivtmp_80 = ivtmp_79 - _81;

riscv64-unknown-elf-gcc (GCC) 15.0.0 20240627 (experimental)
Copyright (C) 2024 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

Pan

-----Original Message-----
From: Uros Bizjak <ubizjak@gmail.com> 
Sent: Thursday, June 27, 2024 2:48 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com
Subject: Re: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip

On Mon, Jun 24, 2024 at 3:55 PM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> truncated as below:
>
> void test (uint16_t *x, unsigned b, unsigned n)
> {
>   unsigned a = 0;
>   register uint16_t *p = x;
>
>   do {
>     a = *--p;
>     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
>   } while (--n);
> }
>
> It will have gimple before vect pass,  it cannot hit any pattern of
> SAT_SUB and then cannot vectorize to SAT_SUB.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>
> This patch would like to improve the pattern match to recog above
> as truncate after .SAT_SUB pattern.  Then we will have the pattern
> similar to below,  as well as eliminate the first 3 dead stmt.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
>
> The below tests are passed for this patch.
> 1. The rv64gcv fully regression tests.
> 2. The rv64gcv build with glibc.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.

I have tried this patch with x86_64 on the testcase from PR51492, but
the compiler does not recognize the .SAT_SUB pattern here.

Is there anything else missing for successful detection?

Uros.

>
> gcc/ChangeLog:
>
>         * match.pd: Add convert description for minus and capture.
>         * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
>         new logic to handle in_type is incompatibile with out_type,  as
>         well as rename from.
>         (vect_recog_build_binary_gimple_stmt): Rename to.
>         (vect_recog_sat_add_pattern): Leverage above renamed func.
>         (vect_recog_sat_sub_pattern): Ditto.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd              |  4 +--
>  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
>  2 files changed, 33 insertions(+), 22 deletions(-)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 3d0689c9312..4a4b0b2e72f 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  /* Unsigned saturation sub, case 2 (branch with ge):
>     SAT_U_SUB = X >= Y ? X - Y : 0.  */
>  (match (unsigned_integer_sat_sub @0 @1)
> - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop)
>   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> -      && types_match (type, @0, @1))))
> +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
>
>  /* Unsigned saturation sub, case 3 (branchless with gt):
>     SAT_U_SUB = (X - Y) * (X > Y).  */
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index cef901808eb..3d887d36050 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
>  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
>  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
>
> -static gcall *
> -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> +static gimple *
> +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>                                      internal_fn fn, tree *type_out,
> -                                    tree op_0, tree op_1)
> +                                    tree lhs, tree op_0, tree op_1)
>  {
>    tree itype = TREE_TYPE (op_0);
> -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree otype = TREE_TYPE (lhs);
> +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
>
> -  if (vtype != NULL_TREE
> -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
>      {
>        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
>
> -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> +      gimple_call_set_lhs (call, in_ssa);
>        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> -      gimple_set_location (call, gimple_location (stmt));
> +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info)));
> +
> +      *type_out = v_otype;
>
> -      *type_out = vtype;
> +      if (types_compatible_p (itype, otype))
> +       return call;
> +      else
> +       {
> +         append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> +         tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
>
> -      return call;
> +         return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa);
> +       }
>      }
>
>    return NULL;
> @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
>    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -                                                        IFN_SAT_ADD, type_out,
> -                                                        ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +                                                         IFN_SAT_ADD, type_out,
> +                                                         lhs, ops[0], ops[1]);
> +      if (stmt)
>         {
>           vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> -         return call;
> +         return stmt;
>         }
>      }
>
> @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
>    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -                                                        IFN_SAT_SUB, type_out,
> -                                                        ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +                                                         IFN_SAT_SUB, type_out,
> +                                                         lhs, ops[0], ops[1]);
> +      if (stmt)
>         {
>           vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> -         return call;
> +         return stmt;
>         }
>      }
>
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-27  7:00     ` Li, Pan2
@ 2024-06-27  9:22       ` Uros Bizjak
  0 siblings, 0 replies; 27+ messages in thread
From: Uros Bizjak @ 2024-06-27  9:22 UTC (permalink / raw)
  To: Li, Pan2
  Cc: gcc-patches, juzhe.zhong, kito.cheng, richard.guenther,
	jeffreyalaw, pinskia

On Thu, Jun 27, 2024 at 9:01 AM Li, Pan2 <pan2.li@intel.com> wrote:
>
> It only requires the backend implement the standard name for vector mode I bet.

There are several standard names present for x86:
{ss,us}{add,sub}{v8qi,v16qi,v32qi,v64qi,v4hi,v8hi,v16hi,v32hi},
defined in sse.md:

(define_expand "<insn><mode>3<mask_name>"
  [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand")
    (sat_plusminus:VI12_AVX2_AVX512BW
      (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand")
      (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand")))]
  "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")

but all of these handle only 8 and 16 bit elements.

> How about a simpler one like below.
>
>   #define DEF_VEC_SAT_U_SUB_TRUNC_FMT_1(OUT_T, IN_T)                   \
>   void __attribute__((noinline))                                       \
>   vec_sat_u_sub_trunc_##OUT_T##_fmt_1 (OUT_T *out, IN_T *op_1, IN_T y, \
>        unsigned limit)                 \
>   {                                                                    \
>     unsigned i;                                                        \
>     for (i = 0; i < limit; i++)                                        \
>       {                                                                \
>         IN_T x = op_1[i];                                              \
>         out[i] = (OUT_T)(x >= y ? x - y : 0);                          \
>       }                                                                \
>   }
>
> DEF_VEC_SAT_U_SUB_TRUNC_FMT_1(uint32_t, uint64_t);

I tried with:

DEF_VEC_SAT_U_SUB_TRUNC_FMT_1(uint8_t, uint16_t);

And the compiler was able to detect several .SAT_SUB patterns:

$ grep SAT_SUB pr51492-1.c.266t.optimized
 vect_patt_37.14_85 = .SAT_SUB (vect_x_13.12_81, vect_cst__84);
 vect_patt_37.14_86 = .SAT_SUB (vect_x_13.13_83, vect_cst__84);
 vect_patt_42.26_126 = .SAT_SUB (vect_x_62.24_122, vect_cst__125);
 vect_patt_42.26_127 = .SAT_SUB (vect_x_62.25_124, vect_cst__125);
 iftmp.0_24 = .SAT_SUB (x_3, y_14(D));

Uros.

>
> The riscv backend is able to detect a pattern similar to the one below. I can help to check the x86 side after running the test suites.
>
> ;;   basic block 2, loop depth 0
> ;;    pred:       ENTRY
>   if (limit_11(D) != 0)
>     goto <bb 3>; [89.00%]
>   else
>     goto <bb 5>; [11.00%]
> ;;    succ:       3
> ;;                5
> ;;   basic block 3, loop depth 0
> ;;    pred:       2
>   vect_cst__71 = [vec_duplicate_expr] y_14(D);
>   _78 = (unsigned long) limit_11(D);
> ;;    succ:       4
>
> ;;   basic block 4, loop depth 1
> ;;    pred:       4
> ;;                3
>   # vectp_op_1.7_68 = PHI <vectp_op_1.7_69(4), op_1_12(D)(3)>
>   # vectp_out.12_75 = PHI <vectp_out.12_76(4), out_16(D)(3)>
>   # ivtmp_79 = PHI <ivtmp_80(4), _78(3)>
>   _81 = .SELECT_VL (ivtmp_79, POLY_INT_CST [2, 2]);
>   ivtmp_67 = _81 * 8;
>   vect_x_13.9_70 = .MASK_LEN_LOAD (vectp_op_1.7_68, 64B, { -1, ... }, _81, 0);
>   vect_patt_48.10_72 = .SAT_SUB (vect_x_13.9_70, vect_cst__71);                              // .SAT_SUB pattern
>   vect_patt_49.11_73 = (vector([2,2]) unsigned int) vect_patt_48.10_72;
>   ivtmp_74 = _81 * 4;
>   .MASK_LEN_STORE (vectp_out.12_75, 32B, { -1, ... }, _81, 0, vect_patt_49.11_73);
>   vectp_op_1.7_69 = vectp_op_1.7_68 + ivtmp_67;
>   vectp_out.12_76 = vectp_out.12_75 + ivtmp_74;
>   ivtmp_80 = ivtmp_79 - _81;
>
> riscv64-unknown-elf-gcc (GCC) 15.0.0 20240627 (experimental)
> Copyright (C) 2024 Free Software Foundation, Inc.
> This is free software; see the source for copying conditions.  There is NO
> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
>
> Pan
>
> -----Original Message-----
> From: Uros Bizjak <ubizjak@gmail.com>
> Sent: Thursday, June 27, 2024 2:48 PM
> To: Li, Pan2 <pan2.li@intel.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com
> Subject: Re: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
>
> On Mon, Jun 24, 2024 at 3:55 PM <pan2.li@intel.com> wrote:
> >
> > From: Pan Li <pan2.li@intel.com>
> >
> > The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> > truncated as below:
> >
> > void test (uint16_t *x, unsigned b, unsigned n)
> > {
> >   unsigned a = 0;
> >   register uint16_t *p = x;
> >
> >   do {
> >     a = *--p;
> >     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
> >   } while (--n);
> > }
> >

No, the current compiler does not recognize .SAT_SUB for x86 with the
above code, although many vector sat sub instructions involving 16bit
elements are present.

Uros.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-27  6:14     ` Li, Pan2
@ 2024-06-27 14:45       ` Li, Pan2
  2024-06-28  5:38         ` Richard Biener
  0 siblings, 1 reply; 27+ messages in thread
From: Li, Pan2 @ 2024-06-27 14:45 UTC (permalink / raw)
  To: Richard Biener
  Cc: gcc-patches, juzhe.zhong, kito.cheng, jeffreyalaw, rdapp.gcc,
	Tamar Christina

Hi Richard,

As mentioned by Tamar previously, I would like to try even more optimization based on this patch.
Assume we take zip benchmark as example, we may have gimple similar as below

unsigned int _1, _2;
unsigned short int _9;

_9 = (unsigned short int).SAT_SUB (_1, _2);

If we can determine that _1 is in the range of unsigned short, we can distribute the conversion into
the .SAT_SUB, i.e.:

From:
_1 = (unsigned short int)_other;
_9 = (unsigned short int).SAT_SUB (_1, _2);

To:
_9 = .SAT_SUB ((unsigned short int)_1, (unsigned short int)MIN_EXPR (_2, 65535));

Unfortunately, it failed to vectorize when I tried to perform the above changes. The vectorizable_conversion
considers it is not a simple use and then returns failure to vect_analyze_loop_2.

zip.test.c:15:12: note:   ==> examining pattern def statement: patt_42 = (short unsigned int) MIN_EXPR <b_12(D), b_12(D)>;
zip.test.c:15:12: note:   ==> examining statement: patt_42 = (short unsigned int) MIN_EXPR <b_12(D), b_12(D)>;
zip.test.c:15:12: note:   vect_is_simple_use: operand MIN_EXPR <b_12(D), b_12(D)>, type of def: unknown
zip.test.c:15:12: missed:   Unsupported pattern.
zip.test.c:15:12: missed:   use not simple.
zip.test.c:15:12: note:   vect_is_simple_use: operand MIN_EXPR <b_12(D), b_12(D)>, type of def: unknown
zip.test.c:15:12: missed:   Unsupported pattern.
zip.test.c:15:12: missed:   use not simple.
zip.test.c:15:12: note:   vect_is_simple_use: operand MIN_EXPR <b_12(D), b_12(D)>, type of def: unknown
zip.test.c:15:12: missed:   Unsupported pattern.
zip.test.c:15:12: missed:   use not simple.
zip.test.c:7:6: missed:   not vectorized: relevant stmt not supported: patt_42 = (short unsigned int) MIN_EXPR <b_12(D), b_12(D)>;
zip.test.c:15:12: missed:  bad operation or unsupported loop bound. 

I tried to take COND_EXPR here instead of MIN_EXPR but almost the same behavior. I am not sure if we can unblock this by the
vectorizable_conversion or we need some improvements from other pass.

Thanks a lot.

Pan

-----Original Message-----
From: Li, Pan2 
Sent: Thursday, June 27, 2024 2:14 PM
To: Richard Biener <richard.guenther@gmail.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: RE: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip

> OK

Committed, thanks Richard.

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: Thursday, June 27, 2024 2:04 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip

On Thu, Jun 27, 2024 at 3:31 AM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>

OK

> The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> truncated as below:
>
> void test (uint16_t *x, unsigned b, unsigned n)
> {
>   unsigned a = 0;
>   register uint16_t *p = x;
>
>   do {
>     a = *--p;
>     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
>   } while (--n);
> }
>
> It will have gimple before vect pass,  it cannot hit any pattern of
> SAT_SUB and then cannot vectorize to SAT_SUB.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>
> This patch would like to improve the pattern match to recog above
> as truncate after .SAT_SUB pattern.  Then we will have the pattern
> similar to below,  as well as eliminate the first 3 dead stmt.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
>
> The below tests are passed for this patch.
> 1. The rv64gcv fully regression tests.
> 2. The rv64gcv build with glibc.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
>
> gcc/ChangeLog:
>
>         * match.pd: Add convert description for minus and capture.
>         * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
>         new logic to handle in_type is incompatibile with out_type,  as
>         well as rename from.
>         (vect_recog_build_binary_gimple_stmt): Rename to.
>         (vect_recog_sat_add_pattern): Leverage above renamed func.
>         (vect_recog_sat_sub_pattern): Ditto.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd              |  4 +--
>  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
>  2 files changed, 33 insertions(+), 22 deletions(-)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index cf8a399a744..820591a36b3 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  /* Unsigned saturation sub, case 2 (branch with ge):
>     SAT_U_SUB = X >= Y ? X - Y : 0.  */
>  (match (unsigned_integer_sat_sub @0 @1)
> - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop)
>   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> -      && types_match (type, @0, @1))))
> +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
>
>  /* Unsigned saturation sub, case 3 (branchless with gt):
>     SAT_U_SUB = (X - Y) * (X > Y).  */
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index cef901808eb..519d15f2a43 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
>  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
>  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
>
> -static gcall *
> -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> +static gimple *
> +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>                                      internal_fn fn, tree *type_out,
> -                                    tree op_0, tree op_1)
> +                                    tree lhs, tree op_0, tree op_1)
>  {
>    tree itype = TREE_TYPE (op_0);
> -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree otype = TREE_TYPE (lhs);
> +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
>
> -  if (vtype != NULL_TREE
> -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
>      {
>        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
>
> -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> +      gimple_call_set_lhs (call, in_ssa);
>        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> -      gimple_set_location (call, gimple_location (stmt));
> +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info)));
> +
> +      *type_out = v_otype;
>
> -      *type_out = vtype;
> +      if (types_compatible_p (itype, otype))
> +       return call;
> +      else
> +       {
> +         append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> +         tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
>
> -      return call;
> +         return gimple_build_assign (out_ssa, NOP_EXPR, in_ssa);
> +       }
>      }
>
>    return NULL;
> @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
>    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -                                                        IFN_SAT_ADD, type_out,
> -                                                        ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +                                                         IFN_SAT_ADD, type_out,
> +                                                         lhs, ops[0], ops[1]);
> +      if (stmt)
>         {
>           vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> -         return call;
> +         return stmt;
>         }
>      }
>
> @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
>    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
>      {
> -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> -                                                        IFN_SAT_SUB, type_out,
> -                                                        ops[0], ops[1]);
> -      if (call)
> +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> +                                                         IFN_SAT_SUB, type_out,
> +                                                         lhs, ops[0], ops[1]);
> +      if (stmt)
>         {
>           vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> -         return call;
> +         return stmt;
>         }
>      }
>
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-27 14:45       ` Li, Pan2
@ 2024-06-28  5:38         ` Richard Biener
  2024-06-28 13:38           ` Tamar Christina
  0 siblings, 1 reply; 27+ messages in thread
From: Richard Biener @ 2024-06-28  5:38 UTC (permalink / raw)
  To: Li, Pan2
  Cc: gcc-patches, juzhe.zhong, kito.cheng, jeffreyalaw, rdapp.gcc,
	Tamar Christina

On Thu, Jun 27, 2024 at 4:45 PM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Hi Richard,
>
> As mentioned by tamar in previous, would like to try even more optimization based on this patch.
> Assume we take zip benchmark as example, we may have gimple similar as below
>
> unsigned int _1, _2;
> unsigned short int _9;
>
> _9 = (unsigned short int).SAT_SUB (_1, _2);
>
> If we can locate the _1 is in the range of unsigned short, we can distribute the convert into
> the .SAT_SUB, aka:
>
> From:
> _1 = (unsigned int short)_other;
> _9 = (unsigned short int).SAT_SUB (_1, _2);
>
> To:
> _9 = .SAT_SUB ((unsigned int short)_1, (unsigned int short)MIN_EXPR (_2, 65536)));
>
> Unfortunately, it failed to vectorize when I try to perform above changes. The vectorizable_conversion
> considers it is not simple use and then return fail to vect_analyze_loop_2.
>
> zip.test.c:15:12: note:   ==> examining pattern def statement: patt_42 = (short unsigned int) MIN_EXPR <b_12(D), b_12(D)>;
> zip.test.c:15:12: note:   ==> examining statement: patt_42 = (short unsigned int) MIN_EXPR <b_12(D), b_12(D)>;
> zip.test.c:15:12: note:   vect_is_simple_use: operand MIN_EXPR <b_12(D), b_12(D)>, type of def: unknown
> zip.test.c:15:12: missed:   Unsupported pattern.
> zip.test.c:15:12: missed:   use not simple.
> zip.test.c:15:12: note:   vect_is_simple_use: operand MIN_EXPR <b_12(D), b_12(D)>, type of def: unknown
> zip.test.c:15:12: missed:   Unsupported pattern.
> zip.test.c:15:12: missed:   use not simple.
> zip.test.c:15:12: note:   vect_is_simple_use: operand MIN_EXPR <b_12(D), b_12(D)>, type of def: unknown
> zip.test.c:15:12: missed:   Unsupported pattern.
> zip.test.c:15:12: missed:   use not simple.
> zip.test.c:7:6: missed:   not vectorized: relevant stmt not supported: patt_42 = (short unsigned int) MIN_EXPR <b_12(D), b_12(D)>;
> zip.test.c:15:12: missed:  bad operation or unsupported loop bound.
>
> I tried to take COND_EXPR here instead of MIN_EXPR but almost the same behavior. I am not sure if we can unblock this by the
> vectorizable_conversion or we need some improvements from other pass.

I think you're doing the MIN_EXPR wrong - the above says MIN_EXPR
<b_12(D), b_12(D)> which doesn't make
sense anyway.  I suspect you fail to put the MIN_EXPR to a separate statement?

> Thanks a lot.
>
> Pan
>
> -----Original Message-----
> From: Li, Pan2
> Sent: Thursday, June 27, 2024 2:14 PM
> To: Richard Biener <richard.guenther@gmail.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> Subject: RE: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
>
> > OK
>
> Committed, thanks Richard.
>
> Pan
>
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Thursday, June 27, 2024 2:04 PM
> To: Li, Pan2 <pan2.li@intel.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> Subject: Re: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
>
> On Thu, Jun 27, 2024 at 3:31 AM <pan2.li@intel.com> wrote:
> >
> > From: Pan Li <pan2.li@intel.com>
>
> OK
>
> > The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> > truncated as below:
> >
> > void test (uint16_t *x, unsigned b, unsigned n)
> > {
> >   unsigned a = 0;
> >   register uint16_t *p = x;
> >
> >   do {
> >     a = *--p;
> >     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
> >   } while (--n);
> > }
> >
> > It will have gimple before vect pass,  it cannot hit any pattern of
> > SAT_SUB and then cannot vectorize to SAT_SUB.
> >
> > _2 = a_11 - b_12(D);
> > iftmp.0_13 = (short unsigned int) _2;
> > _18 = a_11 >= b_12(D);
> > iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> >
> > This patch would like to improve the pattern match to recog above
> > as truncate after .SAT_SUB pattern.  Then we will have the pattern
> > similar to below,  as well as eliminate the first 3 dead stmt.
> >
> > _2 = a_11 - b_12(D);
> > iftmp.0_13 = (short unsigned int) _2;
> > _18 = a_11 >= b_12(D);
> > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> >
> > The below tests are passed for this patch.
> > 1. The rv64gcv fully regression tests.
> > 2. The rv64gcv build with glibc.
> > 3. The x86 bootstrap tests.
> > 4. The x86 fully regression tests.
> >
> > gcc/ChangeLog:
> >
> >         * match.pd: Add convert description for minus and capture.
> >         * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
> >         new logic to handle in_type is incompatibile with out_type,  as
> >         well as rename from.
> >         (vect_recog_build_binary_gimple_stmt): Rename to.
> >         (vect_recog_sat_add_pattern): Leverage above renamed func.
> >         (vect_recog_sat_sub_pattern): Ditto.
> >
> > Signed-off-by: Pan Li <pan2.li@intel.com>
> > ---
> >  gcc/match.pd              |  4 +--
> >  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
> >  2 files changed, 33 insertions(+), 22 deletions(-)
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index cf8a399a744..820591a36b3 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >  /* Unsigned saturation sub, case 2 (branch with ge):
> >     SAT_U_SUB = X >= Y ? X - Y : 0.  */
> >  (match (unsigned_integer_sat_sub @0 @1)
> > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop)
> >   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > -      && types_match (type, @0, @1))))
> > +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
> >
> >  /* Unsigned saturation sub, case 3 (branchless with gt):
> >     SAT_U_SUB = (X - Y) * (X > Y).  */
> > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> > index cef901808eb..519d15f2a43 100644
> > --- a/gcc/tree-vect-patterns.cc
> > +++ b/gcc/tree-vect-patterns.cc
> > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
> >  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
> >  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
> >
> > -static gcall *
> > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> > +static gimple *
> > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> >                                      internal_fn fn, tree *type_out,
> > -                                    tree op_0, tree op_1)
> > +                                    tree lhs, tree op_0, tree op_1)
> >  {
> >    tree itype = TREE_TYPE (op_0);
> > -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> > +  tree otype = TREE_TYPE (lhs);
> > +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> > +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
> >
> > -  if (vtype != NULL_TREE
> > -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> > +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> > +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
> >      {
> >        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> > +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
> >
> > -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> > +      gimple_call_set_lhs (call, in_ssa);
> >        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> > -      gimple_set_location (call, gimple_location (stmt));
> > +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info)));
> > +
> > +      *type_out = v_otype;
> >
> > -      *type_out = vtype;
> > +      if (types_compatible_p (itype, otype))
> > +       return call;
> > +      else
> > +       {
> > +         append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> > +         tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
> >
> > -      return call;
> > +         return gimple_build_assign (out_ssa, NOP_EXPR, in_ssa);
> > +       }
> >      }
> >
> >    return NULL;
> > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> >
> >    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
> >      {
> > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > -                                                        IFN_SAT_ADD, type_out,
> > -                                                        ops[0], ops[1]);
> > -      if (call)
> > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > +                                                         IFN_SAT_ADD, type_out,
> > +                                                         lhs, ops[0], ops[1]);
> > +      if (stmt)
> >         {
> >           vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> > -         return call;
> > +         return stmt;
> >         }
> >      }
> >
> > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> >
> >    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
> >      {
> > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > -                                                        IFN_SAT_SUB, type_out,
> > -                                                        ops[0], ops[1]);
> > -      if (call)
> > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > +                                                         IFN_SAT_SUB, type_out,
> > +                                                         lhs, ops[0], ops[1]);
> > +      if (stmt)
> >         {
> >           vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> > -         return call;
> > +         return stmt;
> >         }
> >      }
> >
> > --
> > 2.34.1
> >

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-28  5:38         ` Richard Biener
@ 2024-06-28 13:38           ` Tamar Christina
  2024-06-28 15:06             ` Li, Pan2
  0 siblings, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2024-06-28 13:38 UTC (permalink / raw)
  To: Richard Biener, Li, Pan2
  Cc: gcc-patches, juzhe.zhong, kito.cheng, jeffreyalaw, rdapp.gcc

> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Friday, June 28, 2024 6:39 AM
> To: Li, Pan2 <pan2.li@intel.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com; Tamar Christina
> <Tamar.Christina@arm.com>
> Subject: Re: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
> 
> On Thu, Jun 27, 2024 at 4:45 PM Li, Pan2 <pan2.li@intel.com> wrote:
> >
> > Hi Richard,
> >
> > As mentioned by tamar in previous, would like to try even more optimization
> based on this patch.
> > Assume we take zip benchmark as example, we may have gimple similar as below
> >
> > unsigned int _1, _2;
> > unsigned short int _9;
> >
> > _9 = (unsigned short int).SAT_SUB (_1, _2);
> >
> > If we can locate the _1 is in the range of unsigned short, we can distribute the
> convert into
> > the .SAT_SUB, aka:
> >
> > From:
> > _1 = (unsigned int short)_other;
> > _9 = (unsigned short int).SAT_SUB (_1, _2);
> >
> > To:
> > _9 = .SAT_SUB ((unsigned int short)_1, (unsigned int short)MIN_EXPR (_2,
> 65536)));
> >
> > Unfortunately, it failed to vectorize when I try to perform above changes. The
> vectorizable_conversion
> > considers it is not simple use and then return fail to vect_analyze_loop_2.
> >
> > zip.test.c:15:12: note:   ==> examining pattern def statement: patt_42 = (short
> unsigned int) MIN_EXPR <b_12(D), b_12(D)>;
> > zip.test.c:15:12: note:   ==> examining statement: patt_42 = (short unsigned int)
> MIN_EXPR <b_12(D), b_12(D)>;
> > zip.test.c:15:12: note:   vect_is_simple_use: operand MIN_EXPR <b_12(D),
> b_12(D)>, type of def: unknown
> > zip.test.c:15:12: missed:   Unsupported pattern.
> > zip.test.c:15:12: missed:   use not simple.
> > zip.test.c:15:12: note:   vect_is_simple_use: operand MIN_EXPR <b_12(D),
> b_12(D)>, type of def: unknown
> > zip.test.c:15:12: missed:   Unsupported pattern.
> > zip.test.c:15:12: missed:   use not simple.
> > zip.test.c:15:12: note:   vect_is_simple_use: operand MIN_EXPR <b_12(D),
> b_12(D)>, type of def: unknown
> > zip.test.c:15:12: missed:   Unsupported pattern.
> > zip.test.c:15:12: missed:   use not simple.
> > zip.test.c:7:6: missed:   not vectorized: relevant stmt not supported: patt_42 =
> (short unsigned int) MIN_EXPR <b_12(D), b_12(D)>;
> > zip.test.c:15:12: missed:  bad operation or unsupported loop bound.
> >
> > I tried to take COND_EXPR here instead of MIN_EXPR but almost the same
> behavior. I am not sure if we can unblock this by the
> > vectorizable_conversion or we need some improvements from other pass.
> 
> I think you're doing the MIN_EXPR wrong - the above says MIN_EXPR
> <b_12(D), b_12(D)> which doesn't make
> sense anyway.  I suspect you fail to put the MIN_EXPR to a separate statement?
> 

Aye, you need to emit the additional statements through  append_pattern_def_seq,
This is also because the scalar statement doesn’t require them, so it makes costing easier.

The vectorizer expects arguments to be simple use, so compound statements aren't
Supported as they make costing and codegen harder.

Cheers,
Tamar

> > Thanks a lot.
> >
> > Pan
> >
> > -----Original Message-----
> > From: Li, Pan2
> > Sent: Thursday, June 27, 2024 2:14 PM
> > To: Richard Biener <richard.guenther@gmail.com>
> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> > Subject: RE: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
> >
> > > OK
> >
> > Committed, thanks Richard.
> >
> > Pan
> >
> > -----Original Message-----
> > From: Richard Biener <richard.guenther@gmail.com>
> > Sent: Thursday, June 27, 2024 2:04 PM
> > To: Li, Pan2 <pan2.li@intel.com>
> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> > Subject: Re: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
> >
> > On Thu, Jun 27, 2024 at 3:31 AM <pan2.li@intel.com> wrote:
> > >
> > > From: Pan Li <pan2.li@intel.com>
> >
> > OK
> >
> > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> > > truncated as below:
> > >
> > > void test (uint16_t *x, unsigned b, unsigned n)
> > > {
> > >   unsigned a = 0;
> > >   register uint16_t *p = x;
> > >
> > >   do {
> > >     a = *--p;
> > >     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
> > >   } while (--n);
> > > }
> > >
> > > It will have gimple before vect pass,  it cannot hit any pattern of
> > > SAT_SUB and then cannot vectorize to SAT_SUB.
> > >
> > > _2 = a_11 - b_12(D);
> > > iftmp.0_13 = (short unsigned int) _2;
> > > _18 = a_11 >= b_12(D);
> > > iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > >
> > > This patch would like to improve the pattern match to recog above
> > > as truncate after .SAT_SUB pattern.  Then we will have the pattern
> > > similar to below,  as well as eliminate the first 3 dead stmt.
> > >
> > > _2 = a_11 - b_12(D);
> > > iftmp.0_13 = (short unsigned int) _2;
> > > _18 = a_11 >= b_12(D);
> > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> > >
> > > The below tests are passed for this patch.
> > > 1. The rv64gcv fully regression tests.
> > > 2. The rv64gcv build with glibc.
> > > 3. The x86 bootstrap tests.
> > > 4. The x86 fully regression tests.
> > >
> > > gcc/ChangeLog:
> > >
> > >         * match.pd: Add convert description for minus and capture.
> > >         * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
> > >         new logic to handle in_type is incompatibile with out_type,  as
> > >         well as rename from.
> > >         (vect_recog_build_binary_gimple_stmt): Rename to.
> > >         (vect_recog_sat_add_pattern): Leverage above renamed func.
> > >         (vect_recog_sat_sub_pattern): Ditto.
> > >
> > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > ---
> > >  gcc/match.pd              |  4 +--
> > >  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
> > >  2 files changed, 33 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index cf8a399a744..820591a36b3 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >  /* Unsigned saturation sub, case 2 (branch with ge):
> > >     SAT_U_SUB = X >= Y ? X - Y : 0.  */
> > >  (match (unsigned_integer_sat_sub @0 @1)
> > > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> > > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1)))
> integer_zerop)
> > >   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > > -      && types_match (type, @0, @1))))
> > > +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
> > >
> > >  /* Unsigned saturation sub, case 3 (branchless with gt):
> > >     SAT_U_SUB = (X - Y) * (X > Y).  */
> > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> > > index cef901808eb..519d15f2a43 100644
> > > --- a/gcc/tree-vect-patterns.cc
> > > +++ b/gcc/tree-vect-patterns.cc
> > > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
> > >  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
> > >  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
> > >
> > > -static gcall *
> > > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> > > +static gimple *
> > > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info
> stmt_info,
> > >                                      internal_fn fn, tree *type_out,
> > > -                                    tree op_0, tree op_1)
> > > +                                    tree lhs, tree op_0, tree op_1)
> > >  {
> > >    tree itype = TREE_TYPE (op_0);
> > > -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> > > +  tree otype = TREE_TYPE (lhs);
> > > +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> > > +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
> > >
> > > -  if (vtype != NULL_TREE
> > > -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> > > +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> > > +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
> > >      {
> > >        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> > > +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
> > >
> > > -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> > > +      gimple_call_set_lhs (call, in_ssa);
> > >        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> > > -      gimple_set_location (call, gimple_location (stmt));
> > > +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT
> (stmt_info)));
> > > +
> > > +      *type_out = v_otype;
> > >
> > > -      *type_out = vtype;
> > > +      if (types_compatible_p (itype, otype))
> > > +       return call;
> > > +      else
> > > +       {
> > > +         append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> > > +         tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
> > >
> > > -      return call;
> > > +         return gimple_build_assign (out_ssa, NOP_EXPR, in_ssa);
> > > +       }
> > >      }
> > >
> > >    return NULL;
> > > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo,
> stmt_vec_info stmt_vinfo,
> > >
> > >    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
> > >      {
> > > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > > -                                                        IFN_SAT_ADD, type_out,
> > > -                                                        ops[0], ops[1]);
> > > -      if (call)
> > > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > > +                                                         IFN_SAT_ADD, type_out,
> > > +                                                         lhs, ops[0], ops[1]);
> > > +      if (stmt)
> > >         {
> > >           vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> > > -         return call;
> > > +         return stmt;
> > >         }
> > >      }
> > >
> > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo,
> stmt_vec_info stmt_vinfo,
> > >
> > >    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
> > >      {
> > > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > > -                                                        IFN_SAT_SUB, type_out,
> > > -                                                        ops[0], ops[1]);
> > > -      if (call)
> > > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > > +                                                         IFN_SAT_SUB, type_out,
> > > +                                                         lhs, ops[0], ops[1]);
> > > +      if (stmt)
> > >         {
> > >           vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> > > -         return call;
> > > +         return stmt;
> > >         }
> > >      }
> > >
> > > --
> > > 2.34.1
> > >

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
  2024-06-28 13:38           ` Tamar Christina
@ 2024-06-28 15:06             ` Li, Pan2
  0 siblings, 0 replies; 27+ messages in thread
From: Li, Pan2 @ 2024-06-28 15:06 UTC (permalink / raw)
  To: Tamar Christina, Richard Biener
  Cc: gcc-patches, juzhe.zhong, kito.cheng, jeffreyalaw, rdapp.gcc

Thanks Tamar and Richard for the enlightening explanations.

> I think you're doing the MIN_EXPR wrong - the above says MIN_EXPR
> <b_12(D), b_12(D)> which doesn't make
> sense anyway.  I suspect you fail to put the MIN_EXPR to a separate statement?

Makes sense, I will give this another try.

> Aye, you need to emit the additional statements through  append_pattern_def_seq,
> This is also because the scalar statement doesn’t require them, so it makes costing easier.
> The vectorizer expects arguments to be simple use, so compound statements aren't
> Supported as they make costing and codegen harder.

Yes, you are right. The operand is not an ssa_name during the simple-use check, so vectorizable_conversion returns a failure.

Pan

-----Original Message-----
From: Tamar Christina <Tamar.Christina@arm.com> 
Sent: Friday, June 28, 2024 9:39 PM
To: Richard Biener <richard.guenther@gmail.com>; Li, Pan2 <pan2.li@intel.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: RE: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip

> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Friday, June 28, 2024 6:39 AM
> To: Li, Pan2 <pan2.li@intel.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com; Tamar Christina
> <Tamar.Christina@arm.com>
> Subject: Re: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
> 
> On Thu, Jun 27, 2024 at 4:45 PM Li, Pan2 <pan2.li@intel.com> wrote:
> >
> > Hi Richard,
> >
> > As mentioned by tamar in previous, would like to try even more optimization
> based on this patch.
> > Assume we take zip benchmark as example, we may have gimple similar as below
> >
> > unsigned int _1, _2;
> > unsigned short int _9;
> >
> > _9 = (unsigned short int).SAT_SUB (_1, _2);
> >
> > If we can locate the _1 is in the range of unsigned short, we can distribute the
> convert into
> > the .SAT_SUB, aka:
> >
> > From:
> > _1 = (unsigned int short)_other;
> > _9 = (unsigned short int).SAT_SUB (_1, _2);
> >
> > To:
> > _9 = .SAT_SUB ((unsigned int short)_1, (unsigned int short)MIN_EXPR (_2,
> 65536)));
> >
> > Unfortunately, it failed to vectorize when I try to perform above changes. The
> vectorizable_conversion
> > considers it is not simple use and then return fail to vect_analyze_loop_2.
> >
> > zip.test.c:15:12: note:   ==> examining pattern def statement: patt_42 = (short
> unsigned int) MIN_EXPR <b_12(D), b_12(D)>;
> > zip.test.c:15:12: note:   ==> examining statement: patt_42 = (short unsigned int)
> MIN_EXPR <b_12(D), b_12(D)>;
> > zip.test.c:15:12: note:   vect_is_simple_use: operand MIN_EXPR <b_12(D),
> b_12(D)>, type of def: unknown
> > zip.test.c:15:12: missed:   Unsupported pattern.
> > zip.test.c:15:12: missed:   use not simple.
> > zip.test.c:15:12: note:   vect_is_simple_use: operand MIN_EXPR <b_12(D),
> b_12(D)>, type of def: unknown
> > zip.test.c:15:12: missed:   Unsupported pattern.
> > zip.test.c:15:12: missed:   use not simple.
> > zip.test.c:15:12: note:   vect_is_simple_use: operand MIN_EXPR <b_12(D),
> b_12(D)>, type of def: unknown
> > zip.test.c:15:12: missed:   Unsupported pattern.
> > zip.test.c:15:12: missed:   use not simple.
> > zip.test.c:7:6: missed:   not vectorized: relevant stmt not supported: patt_42 =
> (short unsigned int) MIN_EXPR <b_12(D), b_12(D)>;
> > zip.test.c:15:12: missed:  bad operation or unsupported loop bound.
> >
> > I tried to take COND_EXPR here instead of MIN_EXPR but almost the same
> behavior. I am not sure if we can unblock this by the
> > vectorizable_conversion or we need some improvements from other pass.
> 
> I think you're doing the MIN_EXPR wrong - the above says MIN_EXPR
> <b_12(D), b_12(D)> which doesn't make
> sense anyway.  I suspect you fail to put the MIN_EXPR to a separate statement?
> 

Aye, you need to emit the additional statements through  append_pattern_def_seq,
This is also because the scalar statement doesn’t require them, so it makes costing easier.

The vectorizer expects arguments to be simple use, so compound statements aren't
Supported as they make costing and codegen harder.

Cheers,
Tamar

> > Thanks a lot.
> >
> > Pan
> >
> > -----Original Message-----
> > From: Li, Pan2
> > Sent: Thursday, June 27, 2024 2:14 PM
> > To: Richard Biener <richard.guenther@gmail.com>
> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> > Subject: RE: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
> >
> > > OK
> >
> > Committed, thanks Richard.
> >
> > Pan
> >
> > -----Original Message-----
> > From: Richard Biener <richard.guenther@gmail.com>
> > Sent: Thursday, June 27, 2024 2:04 PM
> > To: Li, Pan2 <pan2.li@intel.com>
> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> > Subject: Re: [PATCH v3] Vect: Support truncate after .SAT_SUB pattern in zip
> >
> > On Thu, Jun 27, 2024 at 3:31 AM <pan2.li@intel.com> wrote:
> > >
> > > From: Pan Li <pan2.li@intel.com>
> >
> > OK
> >
> > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but
> > > truncated as below:
> > >
> > > void test (uint16_t *x, unsigned b, unsigned n)
> > > {
> > >   unsigned a = 0;
> > >   register uint16_t *p = x;
> > >
> > >   do {
> > >     a = *--p;
> > >     *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB
> > >   } while (--n);
> > > }
> > >
> > > It will have gimple before vect pass,  it cannot hit any pattern of
> > > SAT_SUB and then cannot vectorize to SAT_SUB.
> > >
> > > _2 = a_11 - b_12(D);
> > > iftmp.0_13 = (short unsigned int) _2;
> > > _18 = a_11 >= b_12(D);
> > > iftmp.0_5 = _18 ? iftmp.0_13 : 0;
> > >
> > > This patch would like to improve the pattern match to recog above
> > > as truncate after .SAT_SUB pattern.  Then we will have the pattern
> > > similar to below,  as well as eliminate the first 3 dead stmt.
> > >
> > > _2 = a_11 - b_12(D);
> > > iftmp.0_13 = (short unsigned int) _2;
> > > _18 = a_11 >= b_12(D);
> > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
> > >
> > > The below tests are passed for this patch.
> > > 1. The rv64gcv fully regression tests.
> > > 2. The rv64gcv build with glibc.
> > > 3. The x86 bootstrap tests.
> > > 4. The x86 fully regression tests.
> > >
> > > gcc/ChangeLog:
> > >
> > >         * match.pd: Add convert description for minus and capture.
> > >         * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
> > >         new logic to handle in_type is incompatibile with out_type,  as
> > >         well as rename from.
> > >         (vect_recog_build_binary_gimple_stmt): Rename to.
> > >         (vect_recog_sat_add_pattern): Leverage above renamed func.
> > >         (vect_recog_sat_sub_pattern): Ditto.
> > >
> > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > ---
> > >  gcc/match.pd              |  4 +--
> > >  gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++---------------
> > >  2 files changed, 33 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index cf8a399a744..820591a36b3 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >  /* Unsigned saturation sub, case 2 (branch with ge):
> > >     SAT_U_SUB = X >= Y ? X - Y : 0.  */
> > >  (match (unsigned_integer_sat_sub @0 @1)
> > > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
> > > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1)))
> integer_zerop)
> > >   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > > -      && types_match (type, @0, @1))))
> > > +      && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1))))
> > >
> > >  /* Unsigned saturation sub, case 3 (branchless with gt):
> > >     SAT_U_SUB = (X - Y) * (X > Y).  */
> > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> > > index cef901808eb..519d15f2a43 100644
> > > --- a/gcc/tree-vect-patterns.cc
> > > +++ b/gcc/tree-vect-patterns.cc
> > > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo,
> > >  extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
> > >  extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
> > >
> > > -static gcall *
> > > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt,
> > > +static gimple *
> > > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info
> stmt_info,
> > >                                      internal_fn fn, tree *type_out,
> > > -                                    tree op_0, tree op_1)
> > > +                                    tree lhs, tree op_0, tree op_1)
> > >  {
> > >    tree itype = TREE_TYPE (op_0);
> > > -  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> > > +  tree otype = TREE_TYPE (lhs);
> > > +  tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
> > > +  tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
> > >
> > > -  if (vtype != NULL_TREE
> > > -    && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH))
> > > +  if (v_itype != NULL_TREE && v_otype != NULL_TREE
> > > +    && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH))
> > >      {
> > >        gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1);
> > > +      tree in_ssa = vect_recog_temp_ssa_var (itype, NULL);
> > >
> > > -      gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
> > > +      gimple_call_set_lhs (call, in_ssa);
> > >        gimple_call_set_nothrow (call, /* nothrow_p */ false);
> > > -      gimple_set_location (call, gimple_location (stmt));
> > > +      gimple_set_location (call, gimple_location (STMT_VINFO_STMT
> (stmt_info)));
> > > +
> > > +      *type_out = v_otype;
> > >
> > > -      *type_out = vtype;
> > > +      if (types_compatible_p (itype, otype))
> > > +       return call;
> > > +      else
> > > +       {
> > > +         append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
> > > +         tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
> > >
> > > -      return call;
> > > +         return gimple_build_assign (out_ssa, NOP_EXPR, in_ssa);
> > > +       }
> > >      }
> > >
> > >    return NULL;
> > > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo,
> stmt_vec_info stmt_vinfo,
> > >
> > >    if (gimple_unsigned_integer_sat_add (lhs, ops, NULL))
> > >      {
> > > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > > -                                                        IFN_SAT_ADD, type_out,
> > > -                                                        ops[0], ops[1]);
> > > -      if (call)
> > > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > > +                                                         IFN_SAT_ADD, type_out,
> > > +                                                         lhs, ops[0], ops[1]);
> > > +      if (stmt)
> > >         {
> > >           vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
> > > -         return call;
> > > +         return stmt;
> > >         }
> > >      }
> > >
> > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo,
> stmt_vec_info stmt_vinfo,
> > >
> > >    if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
> > >      {
> > > -      gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt,
> > > -                                                        IFN_SAT_SUB, type_out,
> > > -                                                        ops[0], ops[1]);
> > > -      if (call)
> > > +      gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> > > +                                                         IFN_SAT_SUB, type_out,
> > > +                                                         lhs, ops[0], ops[1]);
> > > +      if (stmt)
> > >         {
> > >           vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt);
> > > -         return call;
> > > +         return stmt;
> > >         }
> > >      }
> > >
> > > --
> > > 2.34.1
> > >

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2024-06-28 15:06 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-21  3:52 [PATCH v1] Ifcvt: Add cond tree reconcile for truncated .SAT_SUB pan2.li
2024-06-21  7:00 ` Richard Biener
2024-06-21  8:50   ` Li, Pan2
2024-06-21  9:28     ` Richard Biener
2024-06-21 14:45       ` Li, Pan2
2024-06-22 13:19         ` Richard Biener
2024-06-23 12:57           ` Li, Pan2
2024-06-24 13:55 ` [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip pan2.li
2024-06-24 19:59   ` Tamar Christina
2024-06-25  2:25     ` Li, Pan2
2024-06-25  4:00       ` Tamar Christina
2024-06-25  6:06         ` Li, Pan2
2024-06-25  6:11           ` Tamar Christina
2024-06-25  6:25             ` Li, Pan2
2024-06-26  3:12               ` Li, Pan2
2024-06-26 13:56   ` Richard Biener
2024-06-26 14:22     ` Li, Pan2
2024-06-27  6:48   ` Uros Bizjak
2024-06-27  7:00     ` Li, Pan2
2024-06-27  9:22       ` Uros Bizjak
2024-06-27  1:31 ` [PATCH v3] " pan2.li
2024-06-27  6:04   ` Richard Biener
2024-06-27  6:14     ` Li, Pan2
2024-06-27 14:45       ` Li, Pan2
2024-06-28  5:38         ` Richard Biener
2024-06-28 13:38           ` Tamar Christina
2024-06-28 15:06             ` Li, Pan2

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).