public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-2469] VECT: Apply COND_LEN_* into vectorizable_operation
@ 2023-07-12 14:27 Pan Li
  0 siblings, 0 replies; only message in thread
From: Pan Li @ 2023-07-12 14:27 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:0d4dd7e07a879d6c07a33edb2799710faa95651e

commit r14-2469-g0d4dd7e07a879d6c07a33edb2799710faa95651e
Author: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
Date:   Wed Jul 12 21:17:39 2023 +0800

    VECT: Apply COND_LEN_* into vectorizable_operation
    
    Hi, Richard and Richi.
    As we discussed before, COND_LEN_* patterns were added for multiple situations.
    This patch applies COND_LEN_* for the following situation:
    
    Support for the situation that in "vectorizable_operation":
      /* If operating on inactive elements could generate spurious traps,
         we need to restrict the operation to active lanes.  Note that this
         specifically doesn't apply to unhoisted invariants, since they
         operate on the same value for every lane.
    
         Similarly, if this operation is part of a reduction, a fully-masked
         loop should only change the active lanes of the reduction chain,
         keeping the inactive lanes as-is.  */
      bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
                                || reduc_idx >= 0);
    
    That is, it handles the case where mask_out_inactive is true with length loop control.
    
    So, we can handle the following 2 cases:
    
    1. Integer division:
    
       #define TEST_TYPE(TYPE)                              \
       __attribute__((noipa))                               \
       void vrem_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)        \
       {                                                    \
         for (int i = 0; i < n; i++)                                \
           dst[i] = a[i] % b[i];                            \
       }
       #define TEST_ALL()   \
       TEST_TYPE(int8_t)    \
       TEST_ALL()
    
    With this patch:
    
      _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
      ivtmp_45 = _61 * 4;
      vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
      vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
      vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
      .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
    
    2. Floating-point arithmetic **WITHOUT** -ffast-math
    
       #define TEST_TYPE(TYPE)                              \
       __attribute__((noipa))                               \
       void vadd_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)        \
       {                                                    \
         for (int i = 0; i < n; i++)                                \
           dst[i] = a[i] + b[i];                            \
       }
       #define TEST_ALL()   \
       TEST_TYPE(float)     \
       TEST_ALL()
    
    With this patch:
    
      _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
      ivtmp_45 = _61 * 4;
      vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
      vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
      vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
      .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
    
    With this patch, we can make sure operations won't trap for elements that "mask_out_inactive".
    
    gcc/ChangeLog:
    
            * internal-fn.cc (FOR_EACH_CODE_MAPPING): Adapt for COND_LEN_* support.
            (CASE): Ditto.
            (get_conditional_len_internal_fn): New function.
            * internal-fn.h (get_conditional_len_internal_fn): Ditto.
            * tree-vect-stmts.cc (vectorizable_operation): Adapt for COND_LEN_*
            support.

Diff:
---
 gcc/internal-fn.cc     | 71 +++++++++++++++++++++++++++++++++++++-------------
 gcc/internal-fn.h      |  1 +
 gcc/tree-vect-stmts.cc | 48 ++++++++++++++++++++++++++--------
 3 files changed, 91 insertions(+), 29 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index f9aaf66cf2a..c11123a1173 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4276,23 +4276,24 @@ static void (*const internal_fn_expanders[]) (internal_fn, gcall *) = {
   0
 };
 
-/* Invoke T(CODE, IFN) for each conditional function IFN that maps to a
-   tree code CODE.  */
+/* Invoke T(CODE, SUFFIX) for each conditional function IFN_COND_##SUFFIX
+   that maps to a tree code CODE.  There is also an IFN_COND_LEN_##SUFFIX
+   for each such IFN_COND_##SUFFIX.  */
 #define FOR_EACH_CODE_MAPPING(T) \
-  T (PLUS_EXPR, IFN_COND_ADD) \
-  T (MINUS_EXPR, IFN_COND_SUB) \
-  T (MULT_EXPR, IFN_COND_MUL) \
-  T (TRUNC_DIV_EXPR, IFN_COND_DIV) \
-  T (TRUNC_MOD_EXPR, IFN_COND_MOD) \
-  T (RDIV_EXPR, IFN_COND_RDIV) \
-  T (MIN_EXPR, IFN_COND_MIN) \
-  T (MAX_EXPR, IFN_COND_MAX) \
-  T (BIT_AND_EXPR, IFN_COND_AND) \
-  T (BIT_IOR_EXPR, IFN_COND_IOR) \
-  T (BIT_XOR_EXPR, IFN_COND_XOR) \
-  T (LSHIFT_EXPR, IFN_COND_SHL) \
-  T (RSHIFT_EXPR, IFN_COND_SHR) \
-  T (NEGATE_EXPR, IFN_COND_NEG)
+  T (PLUS_EXPR, ADD) \
+  T (MINUS_EXPR, SUB) \
+  T (MULT_EXPR, MUL) \
+  T (TRUNC_DIV_EXPR, DIV) \
+  T (TRUNC_MOD_EXPR, MOD) \
+  T (RDIV_EXPR, RDIV) \
+  T (MIN_EXPR, MIN) \
+  T (MAX_EXPR, MAX) \
+  T (BIT_AND_EXPR, AND) \
+  T (BIT_IOR_EXPR, IOR) \
+  T (BIT_XOR_EXPR, XOR) \
+  T (LSHIFT_EXPR, SHL) \
+  T (RSHIFT_EXPR, SHR) \
+  T (NEGATE_EXPR, NEG)
 
 /* Return a function that only performs CODE when a certain condition is met
    and that uses a given fallback value otherwise.  For example, if CODE is
@@ -4313,7 +4314,7 @@ get_conditional_internal_fn (tree_code code)
 {
   switch (code)
     {
-#define CASE(CODE, IFN) case CODE: return IFN;
+#define CASE(CODE, IFN) case CODE: return IFN_COND_##IFN;
       FOR_EACH_CODE_MAPPING(CASE)
 #undef CASE
     default:
@@ -4329,7 +4330,7 @@ conditional_internal_fn_code (internal_fn ifn)
 {
   switch (ifn)
     {
-#define CASE(CODE, IFN) case IFN: return CODE;
+#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
       FOR_EACH_CODE_MAPPING(CASE)
 #undef CASE
     default:
@@ -4337,6 +4338,40 @@ conditional_internal_fn_code (internal_fn ifn)
     }
 }
 
+/* Like get_conditional_internal_fn, but return a function that
+   additionally restricts the operation to the leading elements
+   of a vector.  The number of elements to process is given by a length
+   and bias pair, as for IFN_LOAD_LEN.  The values of the remaining
+   elements are taken from the fallback ("else") argument.
+
+   For example, if CODE is a binary operation associated with FN:
+
+     LHS = FN (COND, A, B, ELSE, LEN, BIAS)
+
+   is equivalent to the C code:
+
+     for (int i = 0; i < NUNITS; i++)
+      {
+	if (i < LEN + BIAS && COND[i])
+	  LHS[i] = A[i] CODE B[i];
+	else
+	  LHS[i] = ELSE[i];
+      }
+*/
+
+internal_fn
+get_conditional_len_internal_fn (tree_code code)
+{
+  switch (code)
+    {
+#define CASE(CODE, IFN) case CODE: return IFN_COND_LEN_##IFN;
+      FOR_EACH_CODE_MAPPING(CASE)
+#undef CASE
+    default:
+      return IFN_LAST;
+    }
+}
+
 /* Invoke T(IFN) for each internal function IFN that also has an
    IFN_COND_* form.  */
 #define FOR_EACH_COND_FN_PAIR(T) \
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 4234bbfed87..dd1bab0bddf 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -224,6 +224,7 @@ extern bool set_edom_supported_p (void);
 
 extern internal_fn get_conditional_internal_fn (tree_code);
 extern internal_fn get_conditional_internal_fn (internal_fn);
+extern internal_fn get_conditional_len_internal_fn (tree_code);
 extern tree_code conditional_internal_fn_code (internal_fn);
 extern internal_fn get_unconditional_internal_fn (internal_fn);
 extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 10e71178ce7..dd24f017235 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6711,7 +6711,9 @@ vectorizable_operation (vec_info *vinfo,
 
   int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
   vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
+  vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
   internal_fn cond_fn = get_conditional_internal_fn (code);
+  internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
 
   /* If operating on inactive elements could generate spurious traps,
      we need to restrict the operation to active lanes.  Note that this
@@ -6730,9 +6732,17 @@ vectorizable_operation (vec_info *vinfo,
 	  && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
 	  && mask_out_inactive)
 	{
-	  if (cond_fn == IFN_LAST
-	      || !direct_internal_fn_supported_p (cond_fn, vectype,
-						  OPTIMIZE_FOR_SPEED))
+	  if (cond_fn != IFN_LAST
+	      && direct_internal_fn_supported_p (cond_fn, vectype,
+						 OPTIMIZE_FOR_SPEED))
+	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
+				   vectype, NULL);
+	  else if (cond_len_fn != IFN_LAST
+		   && direct_internal_fn_supported_p (cond_len_fn, vectype,
+						      OPTIMIZE_FOR_SPEED))
+	    vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, vectype,
+				  1);
+	  else
 	    {
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6740,9 +6750,6 @@ vectorizable_operation (vec_info *vinfo,
 				 " conditional operation is available.\n");
 	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
 	    }
-	  else
-	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
-				   vectype, NULL);
 	}
 
       /* Put types on constant and invariant SLP children.  */
@@ -6805,6 +6812,7 @@ vectorizable_operation (vec_info *vinfo,
                      "transform binary/unary operation.\n");
 
   bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
 
   /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as
      vectors with unsigned elements, but the result is signed.  So, we
@@ -6971,11 +6979,16 @@ vectorizable_operation (vec_info *vinfo,
 	  gimple_assign_set_lhs (new_stmt, new_temp);
 	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
 	}
-      else if (masked_loop_p && mask_out_inactive)
+      else if ((masked_loop_p || len_loop_p) && mask_out_inactive)
 	{
-	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
-					  vec_num * ncopies, vectype, i);
-	  auto_vec<tree> vops (5);
+	  tree mask;
+	  if (masked_loop_p)
+	    mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
+				       vec_num * ncopies, vectype, i);
+	  else
+	    /* Dummy mask.  */
+	    mask = build_minus_one_cst (truth_type_for (vectype));
+	  auto_vec<tree> vops (6);
 	  vops.quick_push (mask);
 	  vops.quick_push (vop0);
 	  if (vop1)
@@ -6995,7 +7008,20 @@ vectorizable_operation (vec_info *vinfo,
 		(cond_fn, vectype, vops.length () - 1, &vops[1]);
 	      vops.quick_push (else_value);
 	    }
-	  gcall *call = gimple_build_call_internal_vec (cond_fn, vops);
+	  if (len_loop_p)
+	    {
+	      tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
+					    vec_num * ncopies, vectype, i, 1);
+	      signed char biasval
+		= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	      tree bias = build_int_cst (intQI_type_node, biasval);
+	      vops.quick_push (len);
+	      vops.quick_push (bias);
+	    }
+	  gcall *call
+	    = gimple_build_call_internal_vec (masked_loop_p ? cond_fn
+							    : cond_len_fn,
+					      vops);
 	  new_temp = make_ssa_name (vec_dest, call);
 	  gimple_call_set_lhs (call, new_temp);
 	  gimple_call_set_nothrow (call, true);

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-07-12 14:27 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-12 14:27 [gcc r14-2469] VECT: Apply COND_LEN_* into vectorizable_operation Pan Li

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).