* [PATCH] VECT: Apply COND_LEN_* into vectorizable_operation
@ 2023-07-12  4:21 juzhe.zhong
  2023-07-12  9:29 ` Richard Sandiford
  0 siblings, 1 reply; 3+ messages in thread
From: juzhe.zhong @ 2023-07-12  4:21 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.sandiford, rguenther, Ju-Zhe Zhong

From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richard and Richi.
As we discussed before, COND_LEN_* patterns were added for multiple situations.
This patch applies COND_LEN_* to the following situation:

It adds support for the case in "vectorizable_operation" where:
  /* If operating on inactive elements could generate spurious traps,
     we need to restrict the operation to active lanes.  Note that this
     specifically doesn't apply to unhoisted invariants, since they
     operate on the same value for every lane.

     Similarly, if this operation is part of a reduction, a fully-masked
     loop should only change the active lanes of the reduction chain,
     keeping the inactive lanes as-is.  */
  bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
			    || reduc_idx >= 0);

i.e. where mask_out_inactive is true while the loop uses length-based (rather than mask-based) control.

So we can now vectorize the following 2 cases:

1. Integer remainder (which can trap on division by zero):

   #define TEST_TYPE(TYPE) 				\
   __attribute__((noipa))				\
   void vrem_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
   {							\
     for (int i = 0; i < n; i++)				\
       dst[i] = a[i] % b[i];				\
   }
   #define TEST_ALL()	\
   TEST_TYPE(int8_t)

   TEST_ALL()
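
   For int8_t the macros above expand to (with <stdint.h> included):

   __attribute__((noipa))
   void vrem_int8_t (int8_t *dst, int8_t *a, int8_t *b, int n)
   {
     for (int i = 0; i < n; i++)
       dst[i] = a[i] % b[i];	/* TRUNC_MOD_EXPR; traps when b[i] == 0.  */
   }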

With this patch:
  
  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
  ivtmp_45 = _61 * 4;
  vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
  vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
  vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
  .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);

2. Floating-point arithmetic **WITHOUT** -ffast-math
  
   #define TEST_TYPE(TYPE) 				\
   __attribute__((noipa))				\
   void vadd_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
   {							\
     for (int i = 0; i < n; i++)				\
       dst[i] = a[i] + b[i];				\
   }
   #define TEST_ALL()	\
   TEST_TYPE(float)

   TEST_ALL()
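
   For float this expands in the same way to:

   __attribute__((noipa))
   void vadd_float (float *dst, float *a, float *b, int n)
   {
     for (int i = 0; i < n; i++)
       dst[i] = a[i] + b[i];	/* May raise FP exceptions without -ffast-math.  */
   }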

With this patch:
   
  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
  ivtmp_45 = _61 * 4;
  vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
  vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
  vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
  .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);

With this patch, we can make sure such operations won't trap on inactive elements when "mask_out_inactive" is true.
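
Elementwise, the .COND_LEN_ADD calls above behave roughly like the scalar
sketch below (my notation, not GCC code; the effective length is the LEN
operand plus the trailing bias operand, which is 0 in these dumps):

  /* Model of LHS = .COND_LEN_ADD (MASK, A, B, ELSE, LEN, BIAS);
     vf is the number of lanes of the vector type.  */
  static void
  cond_len_add_model (int *lhs, const int *mask, const int *a,
		      const int *b, const int *else_vals,
		      int len, int bias, int vf)
  {
    for (int i = 0; i < vf; i++)
      if (i < len + bias && mask[i])
	lhs[i] = a[i] + b[i];	/* Active lane: operation executes.  */
      else
	lhs[i] = else_vals[i];	/* Inactive lane: fallback value, no trap.  */
  }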

gcc/ChangeLog:

        * internal-fn.cc (FOR_EACH_CODE_LEN_MAPPING): Add COND_LEN_*.
        (get_conditional_len_internal_fn): New function.
        (CASE): Add COND_LEN_*.
        * internal-fn.h (get_conditional_len_internal_fn): New function.
        * tree-vect-stmts.cc (vectorizable_operation): Apply COND_LEN_* to
        operations that could trap.

---
 gcc/internal-fn.cc     | 48 +++++++++++++++++++++++++++++++++
 gcc/internal-fn.h      |  1 +
 gcc/tree-vect-stmts.cc | 60 ++++++++++++++++++++++++++++++++++++++----
 3 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index f9aaf66cf2a..e46dd57b7f0 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4337,6 +4337,54 @@ conditional_internal_fn_code (internal_fn ifn)
     }
 }
 
+/* Invoke T(CODE, IFN) for each conditional len function IFN that maps to a
+   tree code CODE.  */
+#define FOR_EACH_CODE_LEN_MAPPING(T)                                           \
+  T (PLUS_EXPR, IFN_COND_LEN_ADD)                                              \
+  T (MINUS_EXPR, IFN_COND_LEN_SUB)                                             \
+  T (MULT_EXPR, IFN_COND_LEN_MUL)                                              \
+  T (TRUNC_DIV_EXPR, IFN_COND_LEN_DIV)                                         \
+  T (TRUNC_MOD_EXPR, IFN_COND_LEN_MOD)                                         \
+  T (RDIV_EXPR, IFN_COND_LEN_RDIV)                                             \
+  T (MIN_EXPR, IFN_COND_LEN_MIN)                                               \
+  T (MAX_EXPR, IFN_COND_LEN_MAX)                                               \
+  T (BIT_AND_EXPR, IFN_COND_LEN_AND)                                           \
+  T (BIT_IOR_EXPR, IFN_COND_LEN_IOR)                                           \
+  T (BIT_XOR_EXPR, IFN_COND_LEN_XOR)                                           \
+  T (LSHIFT_EXPR, IFN_COND_LEN_SHL)                                            \
+  T (RSHIFT_EXPR, IFN_COND_LEN_SHR)                                            \
+  T (NEGATE_EXPR, IFN_COND_LEN_NEG)
+
+/* Return a function that only performs CODE when a certain condition is met
+   and that uses a given fallback value otherwise.  For example, if CODE is
+   a binary operation associated with conditional function FN:
+
+     LHS = FN (COND, A, B, ELSE, LEN)
+
+   is equivalent to the C expression:
+
+     for (int i = 0; i < LEN; i++)
+       LHS[i] = COND[i] ? A[i] CODE B[i] : ELSE[i];
+
+   operating elementwise if the operands are vectors.
+
+   Return IFN_LAST if no such function exists.  */
+
+internal_fn
+get_conditional_len_internal_fn (tree_code code)
+{
+  switch (code)
+    {
+#define CASE(CODE, IFN)                                                        \
+  case CODE:                                                                   \
+    return IFN;
+      FOR_EACH_CODE_LEN_MAPPING (CASE)
+#undef CASE
+    default:
+      return IFN_LAST;
+    }
+}
+
 /* Invoke T(IFN) for each internal function IFN that also has an
    IFN_COND_* form.  */
 #define FOR_EACH_COND_FN_PAIR(T) \
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 4234bbfed87..dd1bab0bddf 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -224,6 +224,7 @@ extern bool set_edom_supported_p (void);
 
 extern internal_fn get_conditional_internal_fn (tree_code);
 extern internal_fn get_conditional_internal_fn (internal_fn);
+extern internal_fn get_conditional_len_internal_fn (tree_code);
 extern tree_code conditional_internal_fn_code (internal_fn);
 extern internal_fn get_unconditional_internal_fn (internal_fn);
 extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 10e71178ce7..1c90b54bdb7 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6711,7 +6711,9 @@ vectorizable_operation (vec_info *vinfo,
 
   int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
   vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
+  vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
   internal_fn cond_fn = get_conditional_internal_fn (code);
+  internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
 
   /* If operating on inactive elements could generate spurious traps,
      we need to restrict the operation to active lanes.  Note that this
@@ -6734,11 +6736,19 @@ vectorizable_operation (vec_info *vinfo,
 	      || !direct_internal_fn_supported_p (cond_fn, vectype,
 						  OPTIMIZE_FOR_SPEED))
 	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "can't use a fully-masked loop because no"
-				 " conditional operation is available.\n");
-	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+	      if (cond_len_fn != IFN_LAST
+		  && direct_internal_fn_supported_p (cond_len_fn, vectype,
+						     OPTIMIZE_FOR_SPEED))
+		vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
+				      vectype, 1);
+	      else
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				     "can't use a fully-masked loop because no"
+				     " conditional operation is available.\n");
+		  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+		}
 	    }
 	  else
 	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
@@ -6805,6 +6815,7 @@ vectorizable_operation (vec_info *vinfo,
                      "transform binary/unary operation.\n");
 
   bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
 
   /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as
      vectors with unsigned elements, but the result is signed.  So, we
@@ -6971,6 +6982,45 @@ vectorizable_operation (vec_info *vinfo,
 	  gimple_assign_set_lhs (new_stmt, new_temp);
 	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
 	}
+      else if (len_loop_p && mask_out_inactive)
+	{
+	  tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
+					vec_num * ncopies, vectype, i, 1);
+	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	  tree bias = build_int_cst (intQI_type_node, biasval);
+	  /* Dummy mask.  */
+	  tree mask_vectype = truth_type_for (vectype);
+	  tree mask = build_minus_one_cst (mask_vectype);
+	  auto_vec<tree> vops (6);
+	  vops.quick_push (mask);
+	  vops.quick_push (vop0);
+	  if (vop1)
+	    vops.quick_push (vop1);
+	  if (vop2)
+	    vops.quick_push (vop2);
+	  if (reduc_idx >= 0)
+	    {
+	      /* Perform the operation on active elements only and take
+		 inactive elements from the reduction chain input.  */
+	      gcc_assert (!vop2);
+	      vops.quick_push (reduc_idx == 1 ? vop1 : vop0);
+	    }
+	  else
+	    {
+	      auto else_value
+		= targetm.preferred_else_value (cond_len_fn, vectype,
+						vops.length () - 1, &vops[1]);
+	      vops.quick_push (else_value);
+	    }
+	  vops.quick_push (len);
+	  vops.quick_push (bias);
+	  gcall *call = gimple_build_call_internal_vec (cond_len_fn, vops);
+	  new_temp = make_ssa_name (vec_dest, call);
+	  gimple_call_set_lhs (call, new_temp);
+	  gimple_call_set_nothrow (call, true);
+	  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
+	  new_stmt = call;
+	}
       else if (masked_loop_p && mask_out_inactive)
 	{
 	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
-- 
2.36.1



* Re: [PATCH] VECT: Apply COND_LEN_* into vectorizable_operation
  2023-07-12  4:21 [PATCH] VECT: Apply COND_LEN_* into vectorizable_operation juzhe.zhong
@ 2023-07-12  9:29 ` Richard Sandiford
  2023-07-12 10:39   ` juzhe.zhong
  0 siblings, 1 reply; 3+ messages in thread
From: Richard Sandiford @ 2023-07-12  9:29 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: gcc-patches, rguenther

juzhe.zhong@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Hi, Richard and Richi.
> As we discussed before, COND_LEN_* patterns were added for multiple situations.
> This patch applies COND_LEN_* to the following situation:
>
> It adds support for the case in "vectorizable_operation" where:
>   /* If operating on inactive elements could generate spurious traps,
>      we need to restrict the operation to active lanes.  Note that this
>      specifically doesn't apply to unhoisted invariants, since they
>      operate on the same value for every lane.
>
>      Similarly, if this operation is part of a reduction, a fully-masked
>      loop should only change the active lanes of the reduction chain,
>      keeping the inactive lanes as-is.  */
>   bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
> 			    || reduc_idx >= 0);
>
> i.e. where mask_out_inactive is true while the loop uses length-based (rather than mask-based) control.
>
> So we can now vectorize the following 2 cases:
>
> 1. Integer remainder (which can trap on division by zero):
>
>    #define TEST_TYPE(TYPE) 				\
>    __attribute__((noipa))				\
>    void vrem_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
>    {							\
>      for (int i = 0; i < n; i++)				\
>        dst[i] = a[i] % b[i];				\
>    }
>    #define TEST_ALL()	\
>    TEST_TYPE(int8_t)
>
>    TEST_ALL()
>
> With this patch:
>   
>   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
>   ivtmp_45 = _61 * 4;
>   vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
>   vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
>   vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
>   .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
>
> 2. Floating-point arithmetic **WITHOUT** -ffast-math
>   
>    #define TEST_TYPE(TYPE) 				\
>    __attribute__((noipa))				\
>    void vadd_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
>    {							\
>      for (int i = 0; i < n; i++)				\
>        dst[i] = a[i] + b[i];				\
>    }
>    #define TEST_ALL()	\
>    TEST_TYPE(float)
>
>    TEST_ALL()
>
> With this patch:
>    
>   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
>   ivtmp_45 = _61 * 4;
>   vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
>   vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
>   vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
>   .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
>
> With this patch, we can make sure such operations won't trap on inactive elements when "mask_out_inactive" is true.
>
> gcc/ChangeLog:
>
>         * internal-fn.cc (FOR_EACH_CODE_LEN_MAPPING): Add COND_LEN_*.
>         (get_conditional_len_internal_fn): New function.
>         (CASE): Add COND_LEN_*.
>         * internal-fn.h (get_conditional_len_internal_fn): New function.
>         * tree-vect-stmts.cc (vectorizable_operation): Apply COND_LEN_* to
>         operations that could trap.
>
> ---
>  gcc/internal-fn.cc     | 48 +++++++++++++++++++++++++++++++++
>  gcc/internal-fn.h      |  1 +
>  gcc/tree-vect-stmts.cc | 60 ++++++++++++++++++++++++++++++++++++++----
>  3 files changed, 104 insertions(+), 5 deletions(-)
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index f9aaf66cf2a..e46dd57b7f0 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4337,6 +4337,54 @@ conditional_internal_fn_code (internal_fn ifn)
>      }
>  }
>  
> +/* Invoke T(CODE, IFN) for each conditional len function IFN that maps to a
> +   tree code CODE.  */
> +#define FOR_EACH_CODE_LEN_MAPPING(T)                                           \
> +  T (PLUS_EXPR, IFN_COND_LEN_ADD)                                              \
> +  T (MINUS_EXPR, IFN_COND_LEN_SUB)                                             \
> +  T (MULT_EXPR, IFN_COND_LEN_MUL)                                              \
> +  T (TRUNC_DIV_EXPR, IFN_COND_LEN_DIV)                                         \
> +  T (TRUNC_MOD_EXPR, IFN_COND_LEN_MOD)                                         \
> +  T (RDIV_EXPR, IFN_COND_LEN_RDIV)                                             \
> +  T (MIN_EXPR, IFN_COND_LEN_MIN)                                               \
> +  T (MAX_EXPR, IFN_COND_LEN_MAX)                                               \
> +  T (BIT_AND_EXPR, IFN_COND_LEN_AND)                                           \
> +  T (BIT_IOR_EXPR, IFN_COND_LEN_IOR)                                           \
> +  T (BIT_XOR_EXPR, IFN_COND_LEN_XOR)                                           \
> +  T (LSHIFT_EXPR, IFN_COND_LEN_SHL)                                            \
> +  T (RSHIFT_EXPR, IFN_COND_LEN_SHR)                                            \
> +  T (NEGATE_EXPR, IFN_COND_LEN_NEG)

I think we should instead replace:

/* Invoke T(CODE, IFN) for each conditional function IFN that maps to a
   tree code CODE.  */
#define FOR_EACH_CODE_MAPPING(T) \
  T (PLUS_EXPR, IFN_COND_ADD) \
  T (MINUS_EXPR, IFN_COND_SUB) \
  T (MULT_EXPR, IFN_COND_MUL) \
  T (TRUNC_DIV_EXPR, IFN_COND_DIV) \
  T (TRUNC_MOD_EXPR, IFN_COND_MOD) \
  T (RDIV_EXPR, IFN_COND_RDIV) \
  T (MIN_EXPR, IFN_COND_MIN) \
  T (MAX_EXPR, IFN_COND_MAX) \
  T (BIT_AND_EXPR, IFN_COND_AND) \
  T (BIT_IOR_EXPR, IFN_COND_IOR) \
  T (BIT_XOR_EXPR, IFN_COND_XOR) \
  T (LSHIFT_EXPR, IFN_COND_SHL) \
  T (RSHIFT_EXPR, IFN_COND_SHR) \
  T (NEGATE_EXPR, IFN_COND_NEG)

with:

/* Invoke T(CODE, SUFFIX) for each conditional function IFN_COND_##SUFFIX
   that maps to a tree code CODE.  There is also an IFN_COND_LEN_##SUFFIX
   for each such IFN_COND_##SUFFIX.  */
#define FOR_EACH_CODE_MAPPING(T) \
  T (PLUS_EXPR, ADD) \
  T (MINUS_EXPR, SUB) \
  T (MULT_EXPR, MUL) \
  T (TRUNC_DIV_EXPR, DIV) \
  T (TRUNC_MOD_EXPR, MOD) \
  T (RDIV_EXPR, RDIV) \
  T (MIN_EXPR, MIN) \
  T (MAX_EXPR, MAX) \
  T (BIT_AND_EXPR, AND) \
  T (BIT_IOR_EXPR, IOR) \
  T (BIT_XOR_EXPR, XOR) \
  T (LSHIFT_EXPR, SHL) \
  T (RSHIFT_EXPR, SHR) \
  T (NEGATE_EXPR, NEG)

That way we can ensure that IFN_COND_* and IFN_COND_LEN_* stay in sync.
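
With the suffix form, both lookup functions can then be stamped out from
the same mapping, along these lines (a sketch, not necessarily the final
code):

  internal_fn
  get_conditional_len_internal_fn (tree_code code)
  {
    switch (code)
      {
  #define CASE(CODE, SUFFIX) case CODE: return IFN_COND_LEN_##SUFFIX;
        FOR_EACH_CODE_MAPPING (CASE)
  #undef CASE
      default:
        return IFN_LAST;
      }
  }

with get_conditional_internal_fn returning IFN_COND_##SUFFIX from the same
FOR_EACH_CODE_MAPPING in its own CASE definition.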

> +
> +/* Return a function that only performs CODE when a certain condition is met
> +   and that uses a given fallback value otherwise.  For example, if CODE is
> +   a binary operation associated with conditional function FN:
> +
> +     LHS = FN (COND, A, B, ELSE, LEN)

The bias argument is missing.
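
i.e. the description should read something like:

     LHS = FN (COND, A, B, ELSE, LEN, BIAS)

with the equivalent loop running for LEN + BIAS iterations (BIAS being the
target's 0 or -1 partial load/store bias).  This is a sketch of the intent
rather than exact wording.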

> +
> +   is equivalent to the C expression:
> +
> +     for (int i = 0; i < LEN; i++)
> +       LHS[i] = COND[i] ? A[i] CODE B[i] : ELSE[i];
> +
> +   operating elementwise if the operands are vectors.
> +
> +   Return IFN_LAST if no such function exists.  */
> +
> +internal_fn
> +get_conditional_len_internal_fn (tree_code code)
> +{
> +  switch (code)
> +    {
> +#define CASE(CODE, IFN)                                                        \
> +  case CODE:                                                                   \
> +    return IFN;
> +      FOR_EACH_CODE_LEN_MAPPING (CASE)
> +#undef CASE
> +    default:
> +      return IFN_LAST;
> +    }
> +}
> +
>  /* Invoke T(IFN) for each internal function IFN that also has an
>     IFN_COND_* form.  */
>  #define FOR_EACH_COND_FN_PAIR(T) \
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 4234bbfed87..dd1bab0bddf 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -224,6 +224,7 @@ extern bool set_edom_supported_p (void);
>  
>  extern internal_fn get_conditional_internal_fn (tree_code);
>  extern internal_fn get_conditional_internal_fn (internal_fn);
> +extern internal_fn get_conditional_len_internal_fn (tree_code);
>  extern tree_code conditional_internal_fn_code (internal_fn);
>  extern internal_fn get_unconditional_internal_fn (internal_fn);
>  extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 10e71178ce7..1c90b54bdb7 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -6711,7 +6711,9 @@ vectorizable_operation (vec_info *vinfo,
>  
>    int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
>    vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
> +  vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
>    internal_fn cond_fn = get_conditional_internal_fn (code);
> +  internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
>  
>    /* If operating on inactive elements could generate spurious traps,
>       we need to restrict the operation to active lanes.  Note that this
> @@ -6734,11 +6736,19 @@ vectorizable_operation (vec_info *vinfo,
>  	      || !direct_internal_fn_supported_p (cond_fn, vectype,
>  						  OPTIMIZE_FOR_SPEED))
>  	    {
> -	      if (dump_enabled_p ())
> -		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -				 "can't use a fully-masked loop because no"
> -				 " conditional operation is available.\n");
> -	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> +	      if (cond_len_fn != IFN_LAST
> +		  && direct_internal_fn_supported_p (cond_len_fn, vectype,
> +						     OPTIMIZE_FOR_SPEED))
> +		vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
> +				      vectype, 1);
> +	      else
> +		{
> +		  if (dump_enabled_p ())
> +		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +				     "can't use a fully-masked loop because no"
> +				     " conditional operation is available.\n");
> +		  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> +		}
>  	    }
>  	  else
>  	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,

Please instead switch the if condition so that the structure is:

   if (...)
     vect_record_loop_mask (...)
   else if (...)
     vect_record_loop_len (...)
   else
     can't use partial vectors
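
Concretely, something like this (a sketch only; trailing arguments assumed
to match the surrounding code):

   if (cond_fn != IFN_LAST
       && direct_internal_fn_supported_p (cond_fn, vectype,
					  OPTIMIZE_FOR_SPEED))
     vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
			    vectype, NULL);
   else if (cond_len_fn != IFN_LAST
	    && direct_internal_fn_supported_p (cond_len_fn, vectype,
					       OPTIMIZE_FOR_SPEED))
     vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
			   vectype, 1);
   else
     {
       if (dump_enabled_p ())
	 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			  "can't use a fully-masked loop because no"
			  " conditional operation is available.\n");
       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
     }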

> @@ -6805,6 +6815,7 @@ vectorizable_operation (vec_info *vinfo,
>                       "transform binary/unary operation.\n");
>  
>    bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> +  bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
>  
>    /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as
>       vectors with unsigned elements, but the result is signed.  So, we
> @@ -6971,6 +6982,45 @@ vectorizable_operation (vec_info *vinfo,
>  	  gimple_assign_set_lhs (new_stmt, new_temp);
>  	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>  	}
> +      else if (len_loop_p && mask_out_inactive)
> +	{
> +	  tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
> +					vec_num * ncopies, vectype, i, 1);
> +	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +	  tree bias = build_int_cst (intQI_type_node, biasval);
> +	  /* Dummy mask.  */
> +	  tree mask_vectype = truth_type_for (vectype);
> +	  tree mask = build_minus_one_cst (mask_vectype);
> +	  auto_vec<tree> vops (6);
> +	  vops.quick_push (mask);
> +	  vops.quick_push (vop0);
> +	  if (vop1)
> +	    vops.quick_push (vop1);
> +	  if (vop2)
> +	    vops.quick_push (vop2);
> +	  if (reduc_idx >= 0)
> +	    {
> +	      /* Perform the operation on active elements only and take
> +		 inactive elements from the reduction chain input.  */
> +	      gcc_assert (!vop2);
> +	      vops.quick_push (reduc_idx == 1 ? vop1 : vop0);
> +	    }
> +	  else
> +	    {
> +	      auto else_value
> +		= targetm.preferred_else_value (cond_len_fn, vectype,
> +						vops.length () - 1, &vops[1]);
> +	      vops.quick_push (else_value);
> +	    }
> +	  vops.quick_push (len);
> +	  vops.quick_push (bias);
> +	  gcall *call = gimple_build_call_internal_vec (cond_len_fn, vops);
> +	  new_temp = make_ssa_name (vec_dest, call);
> +	  gimple_call_set_lhs (call, new_temp);
> +	  gimple_call_set_nothrow (call, true);
> +	  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> +	  new_stmt = call;
> +	}

We should be able to share the existing code a lot more.  The only
points of difference are the mask (all 1s for IFN_COND_LEN*) and
the extra len and bias arguments.
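
E.g. (hypothetical, not the final code) the mask could be computed as:

   tree mask = masked_loop_p
	       ? vect_get_loop_mask (loop_vinfo, gsi, masks,
				     vec_num * ncopies, vectype, i)
	       : build_minus_one_cst (truth_type_for (vectype));

with the len and bias operands pushed only when len_loop_p, and the call
built with len_loop_p ? cond_len_fn : cond_fn, so that the two branches
collapse into one.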

Thanks,
Richard

>        else if (masked_loop_p && mask_out_inactive)
>  	{
>  	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,


* Re: Re: [PATCH] VECT: Apply COND_LEN_* into vectorizable_operation
  2023-07-12  9:29 ` Richard Sandiford
@ 2023-07-12 10:39   ` juzhe.zhong
  0 siblings, 0 replies; 3+ messages in thread
From: juzhe.zhong @ 2023-07-12 10:39 UTC (permalink / raw)
  To: richard.sandiford; +Cc: gcc-patches, rguenther


Thank you so much.
I have addressed all the comments in the v2 patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624237.html 

Could you take a look and see whether it looks reasonable to you?

Thanks.


juzhe.zhong@rivai.ai
 
