[PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
@ 2023-04-07  1:47 juzhe.zhong
  2023-04-07  3:23 ` Li, Pan2
                   ` (2 more replies)
  0 siblings, 3 replies; 41+ messages in thread
From: juzhe.zhong @ 2023-04-07  1:47 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.sandiford, rguenther, jeffreyalaw, Juzhe-Zhong

From: Juzhe-Zhong <juzhe.zhong@rivai.ai>

This patch adds the WHILE_LEN pattern.
It is inspired by the simple "vvaddint32.s" example in the RVV ISA specification:
https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s

More details are in the implementation of "vect_set_loop_controls_by_while_len"
and its comments.

Consider the following case:
#define N 16
int src[N];
int dest[N];

void
foo (int n)
{
  for (int i = 0; i < n; i++)
    dest[i] = src[i];
}

-march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:

foo:        
        ble     a0,zero,.L1
        lui     a4,%hi(.LANCHOR0)
        addi    a4,a4,%lo(.LANCHOR0)
        addi    a3,a4,64
        csrr    a2,vlenb
.L3:
        vsetvli a5,a0,e32,m1,ta,ma
        vle32.v v1,0(a4)
        sub     a0,a0,a5
        vse32.v v1,0(a3)
        add     a4,a4,a2
        add     a3,a3,a2
        bne     a0,zero,.L3
.L1:
        ret

gcc/ChangeLog:

        * doc/md.texi: Add WHILE_LEN support.
        * internal-fn.cc (while_len_direct): Ditto.
        (expand_while_len_optab_fn): Ditto.
        (direct_while_len_optab_supported_p): Ditto.
        * internal-fn.def (WHILE_LEN): Ditto.
        * optabs.def (OPTAB_D): Ditto.
        * tree-ssa-loop-manip.cc (create_iv): Ditto.
        * tree-ssa-loop-manip.h (create_iv): Ditto.
        * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
        (vect_set_loop_condition_partial_vectors): Ditto.
        * tree-vect-loop.cc (vect_get_loop_len): Ditto.
        * tree-vect-stmts.cc (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (vect_get_loop_len): Ditto.

---
 gcc/doc/md.texi             |  14 +++
 gcc/internal-fn.cc          |  29 ++++++
 gcc/internal-fn.def         |   1 +
 gcc/optabs.def              |   1 +
 gcc/tree-ssa-loop-manip.cc  |   4 +-
 gcc/tree-ssa-loop-manip.h   |   2 +-
 gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
 gcc/tree-vect-loop.cc       |  35 +++++--
 gcc/tree-vect-stmts.cc      |   9 +-
 gcc/tree-vectorizer.h       |   4 +-
 10 files changed, 264 insertions(+), 21 deletions(-)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 8e3113599fd..72178ab014c 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
   operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
 @end smallexample
 
+@cindex @code{while_len@var{m}@var{n}} instruction pattern
+@item @code{while_len@var{m}@var{n}}
+Set operand 0 to the number of elements that the current vector iteration
+will update.  Operand 1 is the total number of elements that still need to
+be updated and operand 2 is the vectorization factor.  The operation is
+equivalent to:
+
+@smallexample
+operand0 = MIN (operand1, operand2);
+@end smallexample
+Operand 2 can be a const_poly_int or poly_int related to the vector mode
+size.  Some targets, such as RISC-V, have a standalone instruction that
+computes MIN (n, MODE SIZE), which saves a general purpose register.
+
 @cindex @code{check_raw_ptrs@var{m}} instruction pattern
 @item @samp{check_raw_ptrs@var{m}}
 Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 6e81dc05e0e..5f44def90d3 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -127,6 +127,7 @@ init_internal_fns ()
 #define cond_binary_direct { 1, 1, true }
 #define cond_ternary_direct { 1, 1, true }
 #define while_direct { 0, 2, false }
+#define while_len_direct { 0, 0, false }
 #define fold_extract_direct { 2, 2, false }
 #define fold_left_direct { 1, 1, false }
 #define mask_fold_left_direct { 1, 1, false }
@@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
     emit_move_insn (lhs_rtx, ops[0].value);
 }
 
+/* Expand WHILE_LEN call STMT using optab OPTAB.  */
+static void
+expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
+{
+  expand_operand ops[3];
+  tree rhs_type[2];
+
+  tree lhs = gimple_call_lhs (stmt);
+  tree lhs_type = TREE_TYPE (lhs);
+  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
+
+  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
+    {
+      tree rhs = gimple_call_arg (stmt, i);
+      rhs_type[i] = TREE_TYPE (rhs);
+      rtx rhs_rtx = expand_normal (rhs);
+      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
+    }
+
+  insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
+
+  expand_insn (icode, 3, ops);
+  if (!rtx_equal_p (lhs_rtx, ops[0].value))
+    emit_move_insn (lhs_rtx, ops[0].value);
+}
+
 /* Expand a call to a convert-like optab using the operands in STMT.
    FN has a single output operand and NARGS input operands.  */
 
@@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_scatter_store_optab_supported_p convert_optab_supported_p
 #define direct_len_store_optab_supported_p direct_optab_supported_p
 #define direct_while_optab_supported_p convert_optab_supported_p
+#define direct_while_len_optab_supported_p direct_optab_supported_p
 #define direct_fold_extract_optab_supported_p direct_optab_supported_p
 #define direct_fold_left_optab_supported_p direct_optab_supported_p
 #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 7fe742c2ae7..3a933abff5d 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
 DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
 
 DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
+DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
 DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
 		       check_raw_ptrs, check_ptrs)
 DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 695f5911b30..f5938bd2c24 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
 OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
 OPTAB_D (len_load_optab, "len_load_$a")
 OPTAB_D (len_store_optab, "len_store_$a")
+OPTAB_D (while_len_optab, "while_len$a")
diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
index 09acc1c94cc..cdbf280e249 100644
--- a/gcc/tree-ssa-loop-manip.cc
+++ b/gcc/tree-ssa-loop-manip.cc
@@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
 void
 create_iv (tree base, tree step, tree var, class loop *loop,
 	   gimple_stmt_iterator *incr_pos, bool after,
-	   tree *var_before, tree *var_after)
+	   tree *var_before, tree *var_after, enum tree_code code)
 {
   gassign *stmt;
   gphi *phi;
   tree initial, step1;
   gimple_seq stmts;
   tree vb, va;
-  enum tree_code incr_op = PLUS_EXPR;
+  enum tree_code incr_op = code;
   edge pe = loop_preheader_edge (loop);
 
   if (var != NULL_TREE)
diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
index d49273a3987..da755320a3a 100644
--- a/gcc/tree-ssa-loop-manip.h
+++ b/gcc/tree-ssa-loop-manip.h
@@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
 typedef void (*transform_callback)(class loop *, void *);
 
 extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
-		       bool, tree *, tree *);
+		       bool, tree *, tree *, enum tree_code = PLUS_EXPR);
 extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
 extern void verify_loop_closed_ssa (bool, class loop * = NULL);
 
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index f60fa50e8f4..f3cd6c51d2e 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
   return next_ctrl;
 }
 
+/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
+   for all the rgroup controls in RGC and return a control that is nonzero
+   when the loop needs to iterate.  Add any new preheader statements to
+   PREHEADER_SEQ and any new loop-header statements to HEADER_SEQ.
+
+   RGC belongs to loop LOOP.  The loop originally iterated NITERS
+   times and has been vectorized according to LOOP_VINFO.
+
+   Unlike vect_set_loop_controls_directly, which iterates a 0-based IV up
+   to TEST_LIMIT - bias, this function uses a decrementing IV.
+
+   In vect_set_loop_controls_by_while_len, the IV starts at
+   TEST_LIMIT - bias and each iteration subtracts from it the length
+   calculated by the IFN_WHILE_LEN pattern.
+
+   Note: the cost of the code generated by this function is modeled
+   by vect_estimate_min_profitable_iters, so changes here may need
+   corresponding changes there.
+
+   1. Single rgroup, the Gimple IR should be:
+
+	<bb 3>
+	_19 = (unsigned long) n_5(D);
+	...
+
+	<bb 4>:
+	...
+	# ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
+	...
+	_22 = .WHILE_LEN (ivtmp_20, vf);
+	...
+	vector statement (use _22);
+	...
+	ivtmp_21 = ivtmp_20 - _22;
+	...
+	if (ivtmp_21 != 0)
+	  goto <bb 4>; [75.00%]
+	else
+	  goto <bb 5>; [25.00%]
+
+	<bb 5>
+	return;
+
+   Note: IFN_WHILE_LEN guarantees that "ivtmp_21 = ivtmp_20 - _22" never
+   underflows below 0.
+
+   2. Multiple rgroup, the Gimple IR should be:
+
+	<bb 3>
+	_70 = (unsigned long) bnd.7_52;
+	_71 = _70 * 2;
+	_72 = MAX_EXPR <_71, 4>;
+	_73 = _72 + 18446744073709551612;
+	...
+
+	<bb 4>:
+	...
+	# ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
+	# ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
+	_76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_iter);
+	_79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_iter);
+	...
+	vector statement (use _79);
+	...
+	vector statement (use _76);
+	...
+	_65 = _79 / 2;
+	vector statement (use _65);
+	...
+	_68 = _76 / 2;
+	vector statement (use _68);
+	...
+	ivtmp_78 = ivtmp_77 - _79;
+	ivtmp_75 = ivtmp_74 - _76;
+	...
+	if (ivtmp_78 != 0)
+	  goto <bb 4>; [75.00%]
+	else
+	  goto <bb 5>; [25.00%]
+
+	<bb 5>
+	return;
+
+*/
+
+static tree
+vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
+				     gimple_seq *preheader_seq,
+				     gimple_seq *header_seq,
+				     rgroup_controls *rgc, tree niters)
+{
+  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+  /* We are not allowing masked approach in WHILE_LEN.  */
+  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+
+  tree ctrl_type = rgc->type;
+  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
+  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
+  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+  /* Calculate the maximum number of item values that the rgroup
+     handles in total, the number that it handles for each iteration
+     of the vector loop.  */
+  tree nitems_total = niters;
+  if (nitems_per_iter != 1)
+    {
+      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
+	 these multiplications don't overflow.  */
+      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
+      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+				   nitems_total, compare_factor);
+    }
+
+  /* Convert the comparison value to the IV type (either a no-op or
+     a promotion).  */
+  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
+
+  /* Create an induction variable that counts the number of items that
+     remain to be processed.  */
+  tree index_before_incr, index_after_incr;
+  gimple_stmt_iterator incr_gsi;
+  bool insert_after;
+  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+
+  /* Test the decremented IV, which will never underflow 0 since we have
+     IFN_WHILE_LEN to guarantee that.  */
+  tree test_limit = nitems_total;
+
+  /* Provide a definition of each control in the group.  */
+  tree ctrl;
+  unsigned int i;
+  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
+    {
+      /* Previous controls will cover BIAS items.  This control covers the
+	 next batch.  */
+      poly_uint64 bias = nitems_per_ctrl * i;
+      tree bias_tree = build_int_cst (iv_type, bias);
+
+      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
+	 BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
+	 control and adjust the bound down by BIAS.  */
+      tree this_test_limit = test_limit;
+      if (i != 0)
+	{
+	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
+					  this_test_limit, bias_tree);
+	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
+					  this_test_limit, bias_tree);
+	}
+
+      /* Create decrement IV.  */
+      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
+		 insert_after, &index_before_incr, &index_after_incr,
+		 MINUS_EXPR);
+
+      poly_uint64 final_vf = vf * nitems_per_iter;
+      tree vf_step = build_int_cst (iv_type, final_vf);
+      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
+				   index_before_incr, vf_step);
+      gassign *assign = gimple_build_assign (ctrl, res_len);
+      gimple_seq_add_stmt (header_seq, assign);
+    }
+
+  return index_after_incr;
+}
+
 /* Set up the iteration condition and rgroup controls for LOOP, given
    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
@@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
 
   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
   unsigned int compare_precision = TYPE_PRECISION (compare_type);
   tree orig_niters = niters;
 
@@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
 	bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
 
 	/* Set up all controls for this group.  */
-	test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
-						     &preheader_seq,
-						     &header_seq,
-						     loop_cond_gsi, rgc,
-						     niters, niters_skip,
-						     might_wrap_p);
+	if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
+					    OPTIMIZE_FOR_SPEED))
+	  test_ctrl
+	    = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
+						   &preheader_seq, &header_seq,
+						   rgc, niters);
+	else
+	  test_ctrl
+	    = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
+					       &header_seq, loop_cond_gsi, rgc,
+					       niters, niters_skip,
+					       might_wrap_p);
       }
 
   /* Emit all accumulated statements.  */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 1ba9f18d73e..5bffd9a6322 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
 
 tree
-vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
-		   unsigned int nvectors, unsigned int index)
+vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
+		   vec_loop_lens *lens, unsigned int nvectors, tree vectype,
+		   unsigned int index)
 {
   rgroup_controls *rgl = &(*lens)[nvectors - 1];
-  bool use_bias_adjusted_len =
-    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+  bool use_bias_adjusted_len
+    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 
   /* Populate the rgroup's len array, if this is the first time we've
      used it.  */
@@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
 	  if (use_bias_adjusted_len)
 	    {
 	      gcc_assert (i == 0);
-	      tree adjusted_len =
-		make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
+	      tree adjusted_len
+		= make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
 	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
 	      rgl->bias_adjusted_ctrl = adjusted_len;
 	    }
@@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
 
   if (use_bias_adjusted_len)
     return rgl->bias_adjusted_ctrl;
+  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
+					   OPTIMIZE_FOR_SPEED))
+    {
+      tree loop_len = rgl->controls[index];
+      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
+      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
+      if (maybe_ne (nunits1, nunits2))
+	{
+	  /* A loop len for data type X can be reused for data type Y
+	     if X has N times more elements than Y and if Y's elements
+	     are N times bigger than X's.  */
+	  gcc_assert (multiple_p (nunits1, nunits2));
+	  unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
+	  gimple_seq seq = NULL;
+	  loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
+				   build_int_cst (iv_type, factor));
+	  if (seq)
+	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+	}
+      return loop_len;
+    }
   else
     return rgl->controls[index];
 }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index efa2d0daa52..708c8a1d806 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
 	      else if (loop_lens)
 		{
 		  tree final_len
-		    = vect_get_loop_len (loop_vinfo, loop_lens,
-					 vec_num * ncopies, vec_num * j + i);
+		    = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
+					 vec_num * ncopies, vectype,
+					 vec_num * j + i);
 		  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
 		  machine_mode vmode = TYPE_MODE (vectype);
 		  opt_machine_mode new_ovmode
@@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
 		    else if (loop_lens && memory_access_type != VMAT_INVARIANT)
 		      {
 			tree final_len
-			  = vect_get_loop_len (loop_vinfo, loop_lens,
-					       vec_num * ncopies,
+			  = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
+					       vec_num * ncopies, vectype,
 					       vec_num * j + i);
 			tree ptr = build_int_cst (ref_type,
 						  align * BITS_PER_UNIT);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 9cf2fb23fe3..e5cf38caf4b 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
 				unsigned int, tree, unsigned int);
 extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
 				  tree, unsigned int);
-extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
-			       unsigned int);
+extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
+			       vec_loop_lens *, unsigned int, tree, unsigned int);
 extern gimple_seq vect_gen_len (tree, tree, tree, tree);
 extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
 extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
-- 
2.36.3


^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-07  1:47 [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization juzhe.zhong
@ 2023-04-07  3:23 ` Li, Pan2
  2023-04-11 12:12 ` juzhe.zhong
  2023-04-19 21:53 ` 钟居哲
  2 siblings, 0 replies; 41+ messages in thread
From: Li, Pan2 @ 2023-04-07  3:23 UTC (permalink / raw)
  To: juzhe.zhong, gcc-patches; +Cc: richard.sandiford, rguenther, jeffreyalaw

Bootstrap on x86 passed with this patch applied; target commit id a8c8351cf4fedb842988eed4f73304019c361e86 (13.0.1 20230407).

Pan

-----Original Message-----
From: Gcc-patches <gcc-patches-bounces+pan2.li=intel.com@gcc.gnu.org> On Behalf Of juzhe.zhong@rivai.ai
Sent: Friday, April 7, 2023 9:48 AM
To: gcc-patches@gcc.gnu.org
Cc: richard.sandiford@arm.com; rguenther@suse.de; jeffreyalaw@gmail.com; Juzhe-Zhong <juzhe.zhong@rivai.ai>
Subject: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization

From: Juzhe-Zhong <juzhe.zhong@rivai.ai>

This patch is to add WHILE_LEN pattern.
It's inspired by RVV ISA simple "vvaddint32.s" example:
https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s

More details are in "vect_set_loop_controls_by_while_len" implementation and comments.

Consider such following case:
#define N 16
int src[N];
int dest[N];

void
foo (int n)
{
  for (int i = 0; i < n; i++)
    dest[i] = src[i];
}

-march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:

foo:        
        ble     a0,zero,.L1
        lui     a4,%hi(.LANCHOR0)
        addi    a4,a4,%lo(.LANCHOR0)
        addi    a3,a4,64
        csrr    a2,vlenb
.L3:
        vsetvli a5,a0,e32,m1,ta,ma
        vle32.v v1,0(a4)
        sub     a0,a0,a5
        vse32.v v1,0(a3)
        add     a4,a4,a2
        add     a3,a3,a2
        bne     a0,zero,.L3
.L1:
        ret

gcc/ChangeLog:

        * doc/md.texi: Add WHILE_LEN support.
        * internal-fn.cc (while_len_direct): Ditto.
        (expand_while_len_optab_fn): Ditto.
        (direct_while_len_optab_supported_p): Ditto.
        * internal-fn.def (WHILE_LEN): Ditto.
        * optabs.def (OPTAB_D): Ditto.
        * tree-ssa-loop-manip.cc (create_iv): Ditto.
        * tree-ssa-loop-manip.h (create_iv): Ditto.
        * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
        (vect_set_loop_condition_partial_vectors): Ditto.
        * tree-vect-loop.cc (vect_get_loop_len): Ditto.
        * tree-vect-stmts.cc (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (vect_get_loop_len): Ditto.

---
 gcc/doc/md.texi             |  14 +++
 gcc/internal-fn.cc          |  29 ++++++
 gcc/internal-fn.def         |   1 +
 gcc/optabs.def              |   1 +
 gcc/tree-ssa-loop-manip.cc  |   4 +-
 gcc/tree-ssa-loop-manip.h   |   2 +-
 gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
 gcc/tree-vect-loop.cc       |  35 +++++--
 gcc/tree-vect-stmts.cc      |   9 +-
 gcc/tree-vectorizer.h       |   4 +-
 10 files changed, 264 insertions(+), 21 deletions(-)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index 8e3113599fd..72178ab014c 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
   operand0[i] = operand0[i - 1] && (operand1 + i < operand2);  @end smallexample
 
+@cindex @code{while_len@var{m}@var{n}} instruction pattern @item 
+@code{while_len@var{m}@var{n}} Set operand 0 to the number of active 
+elements in vector will be updated value.
+operand 1 is the total elements need to be updated value.
+operand 2 is the vectorization factor.
+The operation is equivalent to:
+
+@smallexample
+operand0 = MIN (operand1, operand2);
+operand2 can be const_poly_int or poly_int related to vector mode size.
+Some target like RISC-V has a standalone instruction to get MIN (n, 
+MODE SIZE) so that we can reduce a use of general purpose register.
+@end smallexample
+
 @cindex @code{check_raw_ptrs@var{m}} instruction pattern  @item @samp{check_raw_ptrs@var{m}}  Check whether, given two pointers @var{a} and @var{b} and a length @var{len}, diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index 6e81dc05e0e..5f44def90d3 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -127,6 +127,7 @@ init_internal_fns ()  #define cond_binary_direct { 1, 1, true }  #define cond_ternary_direct { 1, 1, true }  #define while_direct { 0, 2, false }
+#define while_len_direct { 0, 0, false }
 #define fold_extract_direct { 2, 2, false }  #define fold_left_direct { 1, 1, false }  #define mask_fold_left_direct { 1, 1, false } @@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
     emit_move_insn (lhs_rtx, ops[0].value);  }
 
+/* Expand WHILE_LEN call STMT using optab OPTAB.  */ static void 
+expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab 
+optab) {
+  expand_operand ops[3];
+  tree rhs_type[2];
+
+  tree lhs = gimple_call_lhs (stmt);
+  tree lhs_type = TREE_TYPE (lhs);
+  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);  
+ create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
+
+  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
+    {
+      tree rhs = gimple_call_arg (stmt, i);
+      rhs_type[i] = TREE_TYPE (rhs);
+      rtx rhs_rtx = expand_normal (rhs);
+      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
+    }
+
+  insn_code icode = direct_optab_handler (optab, TYPE_MODE 
+ (rhs_type[0]));
+
+  expand_insn (icode, 3, ops);
+  if (!rtx_equal_p (lhs_rtx, ops[0].value))
+    emit_move_insn (lhs_rtx, ops[0].value); }
+
 /* Expand a call to a convert-like optab using the operands in STMT.
    FN has a single output operand and NARGS input operands.  */
 
@@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,  #define direct_scatter_store_optab_supported_p convert_optab_supported_p  #define direct_len_store_optab_supported_p direct_optab_supported_p  #define direct_while_optab_supported_p convert_optab_supported_p
+#define direct_while_len_optab_supported_p direct_optab_supported_p
 #define direct_fold_extract_optab_supported_p direct_optab_supported_p  #define direct_fold_left_optab_supported_p direct_optab_supported_p  #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 7fe742c2ae7..3a933abff5d 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)  DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
 
 DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
+DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, 
+while_len)
 DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
 		       check_raw_ptrs, check_ptrs)
 DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW, diff --git a/gcc/optabs.def b/gcc/optabs.def index 695f5911b30..f5938bd2c24 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)  OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")  OPTAB_D (len_load_optab, "len_load_$a")  OPTAB_D (len_store_optab, "len_store_$a")
+OPTAB_D (while_len_optab, "while_len$a")
diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc index 09acc1c94cc..cdbf280e249 100644
--- a/gcc/tree-ssa-loop-manip.cc
+++ b/gcc/tree-ssa-loop-manip.cc
@@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;  void  create_iv (tree base, tree step, tree var, class loop *loop,
 	   gimple_stmt_iterator *incr_pos, bool after,
-	   tree *var_before, tree *var_after)
+	   tree *var_before, tree *var_after, enum tree_code code)
 {
   gassign *stmt;
   gphi *phi;
   tree initial, step1;
   gimple_seq stmts;
   tree vb, va;
-  enum tree_code incr_op = PLUS_EXPR;
+  enum tree_code incr_op = code;
   edge pe = loop_preheader_edge (loop);
 
   if (var != NULL_TREE)
diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h index d49273a3987..da755320a3a 100644
--- a/gcc/tree-ssa-loop-manip.h
+++ b/gcc/tree-ssa-loop-manip.h
@@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see  typedef void (*transform_callback)(class loop *, void *);
 
 extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
-		       bool, tree *, tree *);
+		       bool, tree *, tree *, enum tree_code = PLUS_EXPR);
 extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);  extern void verify_loop_closed_ssa (bool, class loop * = NULL);
 
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index f60fa50e8f4..f3cd6c51d2e 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
   return next_ctrl;
 }
 
+/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
+   for all the rgroup controls in RGC and return a control that is nonzero
+   when the loop needs to iterate.  Add any new preheader statements to
+   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
+
+   RGC belongs to loop LOOP.  The loop originally iterated NITERS
+   times and has been vectorized according to LOOP_VINFO.
+
+   Unlike vect_set_loop_controls_directly which is iterating from 0-based IV
+   to TEST_LIMIT - bias.
+
+   In vect_set_loop_controls_by_while_len, we are iterating from start at
+   IV = TEST_LIMIT - bias and keep subtract IV by the length calculated by
+   IFN_WHILE_LEN pattern.
+
+   Note: the cost of the code generated by this function is modeled
+   by vect_estimate_min_profitable_iters, so changes here may need
+   corresponding changes there.
+
+   1. Single rgroup, the Gimple IR should be:
+
+	<bb 3>
+	_19 = (unsigned long) n_5(D);
+	...
+
+	<bb 4>:
+	...
+	# ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
+	...
+	_22 = .WHILE_LEN (ivtmp_20, vf);
+	...
+	vector statement (use _22);
+	...
+	ivtmp_21 = ivtmp_20 - _22;
+	...
+	if (ivtmp_21 != 0)
+	  goto <bb 4>; [75.00%]
+	else
+	  goto <bb 5>; [25.00%]
+
+	<bb 5>
+	return;
+
+   Note: IFN_WHILE_LEN will guarantee "ivtmp_21 = ivtmp_20 - _22" never
+   underflow 0.
+
+   2. Multiple rgroup, the Gimple IR should be:
+
+	<bb 3>
+	_70 = (unsigned long) bnd.7_52;
+	_71 = _70 * 2;
+	_72 = MAX_EXPR <_71, 4>;
+	_73 = _72 + 18446744073709551612;
+	...
+
+	<bb 4>:
+	...
+	# ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
+	# ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
+	_76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
+	_79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
+	...
+	vector statement (use _79);
+	...
+	vector statement (use _76);
+	...
+	_65 = _79 / 2;
+	vector statement (use _65);
+	...
+	_68 = _76 / 2;
+	vector statement (use _68);
+	...
+	ivtmp_78 = ivtmp_77 - _79;
+	ivtmp_75 = ivtmp_74 - _76;
+	...
+	if (ivtmp_78 != 0)
+	  goto <bb 4>; [75.00%]
+	else
+	  goto <bb 5>; [25.00%]
+
+	<bb 5>
+	return;
+
+*/
+
+static tree
+vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
+				     gimple_seq *preheader_seq,
+				     gimple_seq *header_seq,
+				     rgroup_controls *rgc, tree niters) {
+  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+  /* We are not allowing masked approach in WHILE_LEN.  */
+  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+
+  tree ctrl_type = rgc->type;
+  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * 
+ rgc->factor;
+  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * 
+ rgc->factor;
+  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+  /* Calculate the maximum number of item values that the rgroup
+     handles in total, the number that it handles for each iteration
+     of the vector loop.  */
+  tree nitems_total = niters;
+  if (nitems_per_iter != 1)
+    {
+      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
+	 these multiplications don't overflow.  */
+      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
+      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+				   nitems_total, compare_factor);
+    }
+
+  /* Convert the comparison value to the IV type (either a no-op or
+     a promotion).  */
+  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
+
+  /* Create an induction variable that counts the number of items
+     processed.  */
+  tree index_before_incr, index_after_incr;  gimple_stmt_iterator 
+ incr_gsi;  bool insert_after;  standard_iv_increment_position (loop, 
+ &incr_gsi, &insert_after);
+
+  /* Test the decremented IV, which will never underflow 0 since we have
+     IFN_WHILE_LEN to gurantee that.  */  tree test_limit = 
+ nitems_total;
+
+  /* Provide a definition of each control in the group.  */
+  tree ctrl;
+  unsigned int i;
+  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
+    {
+      /* Previous controls will cover BIAS items.  This control covers the
+	 next batch.  */
+      poly_uint64 bias = nitems_per_ctrl * i;
+      tree bias_tree = build_int_cst (iv_type, bias);
+
+      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
+	 BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
+	 control and adjust the bound down by BIAS.  */
+      tree this_test_limit = test_limit;
+      if (i != 0)
+	{
+	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
+					  this_test_limit, bias_tree);
+	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
+					  this_test_limit, bias_tree);
+	}
+
+      /* Create decrement IV.  */
+      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
+		 insert_after, &index_before_incr, &index_after_incr,
+		 MINUS_EXPR);
+
+      poly_uint64 final_vf = vf * nitems_per_iter;
+      tree vf_step = build_int_cst (iv_type, final_vf);
+      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
+				   index_before_incr, vf_step);
+      gassign *assign = gimple_build_assign (ctrl, res_len);
+      gimple_seq_add_stmt (header_seq, assign);
+    }
+
+  return index_after_incr;
+}
+
 /* Set up the iteration condition and rgroup controls for LOOP, given
    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is @@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
 
   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
   unsigned int compare_precision = TYPE_PRECISION (compare_type);
   tree orig_niters = niters;
 
@@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
 	bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
 
 	/* Set up all controls for this group.  */
-	test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
-						     &preheader_seq,
-						     &header_seq,
-						     loop_cond_gsi, rgc,
-						     niters, niters_skip,
-						     might_wrap_p);
+	if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
+					    OPTIMIZE_FOR_SPEED))
+	  test_ctrl
+	    = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
+						   &preheader_seq, &header_seq,
+						   rgc, niters);
+	else
+	  test_ctrl
+	    = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
+					       &header_seq, loop_cond_gsi, rgc,
+					       niters, niters_skip,
+					       might_wrap_p);
       }
 
   /* Emit all accumulated statements.  */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 1ba9f18d73e..5bffd9a6322 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
 
 tree
-vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
-		   unsigned int nvectors, unsigned int index)
+vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
+		   vec_loop_lens *lens, unsigned int nvectors, tree vectype,
+		   unsigned int index)
 {
   rgroup_controls *rgl = &(*lens)[nvectors - 1];
-  bool use_bias_adjusted_len =
-    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+  bool use_bias_adjusted_len
+    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 
   /* Populate the rgroup's len array, if this is the first time we've
      used it.  */
@@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
 	  if (use_bias_adjusted_len)
 	    {
 	      gcc_assert (i == 0);
-	      tree adjusted_len =
-		make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
+	      tree adjusted_len
+		= make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
 	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
 	      rgl->bias_adjusted_ctrl = adjusted_len;
 	    }
@@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
 
   if (use_bias_adjusted_len)
     return rgl->bias_adjusted_ctrl;
+  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
+					   OPTIMIZE_FOR_SPEED))
+    {
+      tree loop_len = rgl->controls[index];
+      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
+      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
+      if (maybe_ne (nunits1, nunits2))
+	{
+	  /* A loop len for data type X can be reused for data type Y
+	     if X has N times more elements than Y and if Y's elements
+	     are N times bigger than X's.  */
+	  gcc_assert (multiple_p (nunits1, nunits2));
+	  unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
+	  gimple_seq seq = NULL;
+	  loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
+				   build_int_cst (iv_type, factor));
+	  if (seq)
+	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+	}
+      return loop_len;
+    }
   else
     return rgl->controls[index];
 }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index efa2d0daa52..708c8a1d806 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
 	      else if (loop_lens)
 		{
 		  tree final_len
-		    = vect_get_loop_len (loop_vinfo, loop_lens,
-					 vec_num * ncopies, vec_num * j + i);
+		    = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
+					 vec_num * ncopies, vectype,
+					 vec_num * j + i);
 		  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
 		  machine_mode vmode = TYPE_MODE (vectype);
 		  opt_machine_mode new_ovmode
@@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
 		    else if (loop_lens && memory_access_type != VMAT_INVARIANT)
 		      {
 			tree final_len
-			  = vect_get_loop_len (loop_vinfo, loop_lens,
-					       vec_num * ncopies,
+			  = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
+					       vec_num * ncopies, vectype,
 					       vec_num * j + i);
 			tree ptr = build_int_cst (ref_type,
 						  align * BITS_PER_UNIT);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 9cf2fb23fe3..e5cf38caf4b 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
 				unsigned int, tree, unsigned int);
 extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
 				  tree, unsigned int);
-extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
-			       unsigned int);
+extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
+			       vec_loop_lens *, unsigned int, tree, unsigned int);
 extern gimple_seq vect_gen_len (tree, tree, tree, tree);
 extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
 extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
--
2.36.3


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-07  1:47 [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization juzhe.zhong
  2023-04-07  3:23 ` Li, Pan2
@ 2023-04-11 12:12 ` juzhe.zhong
  2023-04-11 12:44   ` Richard Sandiford
  2023-04-19 21:53 ` 钟居哲
  2 siblings, 1 reply; 41+ messages in thread
From: juzhe.zhong @ 2023-04-11 12:12 UTC (permalink / raw)
  To: 钟居哲, gcc-patches
  Cc: richard.sandiford, rguenther, jeffreyalaw

[-- Attachment #1: Type: text/plain, Size: 19903 bytes --]

Hi, Richards. 
Kindly Ping this patch. 
This is the most important patch for RVV auto-vectorization support.
Bootstraped on X86 has passed.
Feel free to comments.

Thanks.


juzhe.zhong@rivai.ai
 
From: juzhe.zhong
Date: 2023-04-07 09:47
To: gcc-patches
CC: richard.sandiford; rguenther; jeffreyalaw; Juzhe-Zhong
Subject: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
 
This patch is to add WHILE_LEN pattern.
It's inspired by RVV ISA simple "vvaddint32.s" example:
https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
 
More details are in "vect_set_loop_controls_by_while_len" implementation
and comments.
 
Consider such following case:
#define N 16
int src[N];
int dest[N];
 
void
foo (int n)
{
  for (int i = 0; i < n; i++)
    dest[i] = src[i];
}
 
-march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:
 
foo:        
        ble     a0,zero,.L1
        lui     a4,%hi(.LANCHOR0)
        addi    a4,a4,%lo(.LANCHOR0)
        addi    a3,a4,64
        csrr    a2,vlenb
.L3:
        vsetvli a5,a0,e32,m1,ta,ma
        vle32.v v1,0(a4)
        sub     a0,a0,a5
        vse32.v v1,0(a3)
        add     a4,a4,a2
        add     a3,a3,a2
        bne     a0,zero,.L3
.L1:
        ret
 
gcc/ChangeLog:
 
        * doc/md.texi: Add WHILE_LEN support.
        * internal-fn.cc (while_len_direct): Ditto.
        (expand_while_len_optab_fn): Ditto.
        (direct_while_len_optab_supported_p): Ditto.
        * internal-fn.def (WHILE_LEN): Ditto.
        * optabs.def (OPTAB_D): Ditto.
        * tree-ssa-loop-manip.cc (create_iv): Ditto.
        * tree-ssa-loop-manip.h (create_iv): Ditto.
        * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
        (vect_set_loop_condition_partial_vectors): Ditto.
        * tree-vect-loop.cc (vect_get_loop_len): Ditto.
        * tree-vect-stmts.cc (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (vect_get_loop_len): Ditto.
 
---
gcc/doc/md.texi             |  14 +++
gcc/internal-fn.cc          |  29 ++++++
gcc/internal-fn.def         |   1 +
gcc/optabs.def              |   1 +
gcc/tree-ssa-loop-manip.cc  |   4 +-
gcc/tree-ssa-loop-manip.h   |   2 +-
gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
gcc/tree-vect-loop.cc       |  35 +++++--
gcc/tree-vect-stmts.cc      |   9 +-
gcc/tree-vectorizer.h       |   4 +-
10 files changed, 264 insertions(+), 21 deletions(-)
 
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 8e3113599fd..72178ab014c 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
   operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
@end smallexample
+@cindex @code{while_len@var{m}@var{n}} instruction pattern
+@item @code{while_len@var{m}@var{n}}
+Set operand 0 to the number of active elements in vector will be updated value.
+operand 1 is the total elements need to be updated value.
+operand 2 is the vectorization factor.
+The operation is equivalent to:
+
+@smallexample
+operand0 = MIN (operand1, operand2);
+operand2 can be const_poly_int or poly_int related to vector mode size.
+Some target like RISC-V has a standalone instruction to get MIN (n, MODE SIZE) so
+that we can reduce a use of general purpose register.
+@end smallexample
+
@cindex @code{check_raw_ptrs@var{m}} instruction pattern
@item @samp{check_raw_ptrs@var{m}}
Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 6e81dc05e0e..5f44def90d3 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -127,6 +127,7 @@ init_internal_fns ()
#define cond_binary_direct { 1, 1, true }
#define cond_ternary_direct { 1, 1, true }
#define while_direct { 0, 2, false }
+#define while_len_direct { 0, 0, false }
#define fold_extract_direct { 2, 2, false }
#define fold_left_direct { 1, 1, false }
#define mask_fold_left_direct { 1, 1, false }
@@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
     emit_move_insn (lhs_rtx, ops[0].value);
}
+/* Expand WHILE_LEN call STMT using optab OPTAB.  */
+static void
+expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
+{
+  expand_operand ops[3];
+  tree rhs_type[2];
+
+  tree lhs = gimple_call_lhs (stmt);
+  tree lhs_type = TREE_TYPE (lhs);
+  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
+
+  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
+    {
+      tree rhs = gimple_call_arg (stmt, i);
+      rhs_type[i] = TREE_TYPE (rhs);
+      rtx rhs_rtx = expand_normal (rhs);
+      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
+    }
+
+  insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
+
+  expand_insn (icode, 3, ops);
+  if (!rtx_equal_p (lhs_rtx, ops[0].value))
+    emit_move_insn (lhs_rtx, ops[0].value);
+}
+
/* Expand a call to a convert-like optab using the operands in STMT.
    FN has a single output operand and NARGS input operands.  */
@@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
#define direct_scatter_store_optab_supported_p convert_optab_supported_p
#define direct_len_store_optab_supported_p direct_optab_supported_p
#define direct_while_optab_supported_p convert_optab_supported_p
+#define direct_while_len_optab_supported_p direct_optab_supported_p
#define direct_fold_extract_optab_supported_p direct_optab_supported_p
#define direct_fold_left_optab_supported_p direct_optab_supported_p
#define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 7fe742c2ae7..3a933abff5d 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
+DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
       check_raw_ptrs, check_ptrs)
DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 695f5911b30..f5938bd2c24 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
OPTAB_D (len_load_optab, "len_load_$a")
OPTAB_D (len_store_optab, "len_store_$a")
+OPTAB_D (while_len_optab, "while_len$a")
diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
index 09acc1c94cc..cdbf280e249 100644
--- a/gcc/tree-ssa-loop-manip.cc
+++ b/gcc/tree-ssa-loop-manip.cc
@@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
void
create_iv (tree base, tree step, tree var, class loop *loop,
   gimple_stmt_iterator *incr_pos, bool after,
-    tree *var_before, tree *var_after)
+    tree *var_before, tree *var_after, enum tree_code code)
{
   gassign *stmt;
   gphi *phi;
   tree initial, step1;
   gimple_seq stmts;
   tree vb, va;
-  enum tree_code incr_op = PLUS_EXPR;
+  enum tree_code incr_op = code;
   edge pe = loop_preheader_edge (loop);
   if (var != NULL_TREE)
diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
index d49273a3987..da755320a3a 100644
--- a/gcc/tree-ssa-loop-manip.h
+++ b/gcc/tree-ssa-loop-manip.h
@@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
typedef void (*transform_callback)(class loop *, void *);
extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
-        bool, tree *, tree *);
+        bool, tree *, tree *, enum tree_code = PLUS_EXPR);
extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
extern void verify_loop_closed_ssa (bool, class loop * = NULL);
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index f60fa50e8f4..f3cd6c51d2e 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
   return next_ctrl;
}
+/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
+   for all the rgroup controls in RGC and return a control that is nonzero
+   when the loop needs to iterate.  Add any new preheader statements to
+   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
+
+   RGC belongs to loop LOOP.  The loop originally iterated NITERS
+   times and has been vectorized according to LOOP_VINFO.
+
+   Unlike vect_set_loop_controls_directly which is iterating from 0-based IV
+   to TEST_LIMIT - bias.
+
+   In vect_set_loop_controls_by_while_len, we are iterating from start at
+   IV = TEST_LIMIT - bias and keep subtract IV by the length calculated by
+   IFN_WHILE_LEN pattern.
+
+   Note: the cost of the code generated by this function is modeled
+   by vect_estimate_min_profitable_iters, so changes here may need
+   corresponding changes there.
+
+   1. Single rgroup, the Gimple IR should be:
+
+ <bb 3>
+ _19 = (unsigned long) n_5(D);
+ ...
+
+ <bb 4>:
+ ...
+ # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
+ ...
+ _22 = .WHILE_LEN (ivtmp_20, vf);
+ ...
+ vector statement (use _22);
+ ...
+ ivtmp_21 = ivtmp_20 - _22;
+ ...
+ if (ivtmp_21 != 0)
+   goto <bb 4>; [75.00%]
+ else
+   goto <bb 5>; [25.00%]
+
+ <bb 5>
+ return;
+
+   Note: IFN_WHILE_LEN will guarantee "ivtmp_21 = ivtmp_20 - _22" never
+   underflow 0.
+
+   2. Multiple rgroup, the Gimple IR should be:
+
+ <bb 3>
+ _70 = (unsigned long) bnd.7_52;
+ _71 = _70 * 2;
+ _72 = MAX_EXPR <_71, 4>;
+ _73 = _72 + 18446744073709551612;
+ ...
+
+ <bb 4>:
+ ...
+ # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
+ # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
+ _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
+ _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
+ ...
+ vector statement (use _79);
+ ...
+ vector statement (use _76);
+ ...
+ _65 = _79 / 2;
+ vector statement (use _65);
+ ...
+ _68 = _76 / 2;
+ vector statement (use _68);
+ ...
+ ivtmp_78 = ivtmp_77 - _79;
+ ivtmp_75 = ivtmp_74 - _76;
+ ...
+ if (ivtmp_78 != 0)
+   goto <bb 4>; [75.00%]
+ else
+   goto <bb 5>; [25.00%]
+
+ <bb 5>
+ return;
+
+*/
+
+static tree
+vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
+      gimple_seq *preheader_seq,
+      gimple_seq *header_seq,
+      rgroup_controls *rgc, tree niters)
+{
+  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+  /* We are not allowing masked approach in WHILE_LEN.  */
+  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+
+  tree ctrl_type = rgc->type;
+  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
+  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
+  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+  /* Calculate the maximum number of item values that the rgroup
+     handles in total, the number that it handles for each iteration
+     of the vector loop.  */
+  tree nitems_total = niters;
+  if (nitems_per_iter != 1)
+    {
+      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
+ these multiplications don't overflow.  */
+      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
+      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+    nitems_total, compare_factor);
+    }
+
+  /* Convert the comparison value to the IV type (either a no-op or
+     a promotion).  */
+  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
+
+  /* Create an induction variable that counts the number of items
+     processed.  */
+  tree index_before_incr, index_after_incr;
+  gimple_stmt_iterator incr_gsi;
+  bool insert_after;
+  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+
+  /* Test the decremented IV, which will never underflow 0 since we have
+     IFN_WHILE_LEN to gurantee that.  */
+  tree test_limit = nitems_total;
+
+  /* Provide a definition of each control in the group.  */
+  tree ctrl;
+  unsigned int i;
+  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
+    {
+      /* Previous controls will cover BIAS items.  This control covers the
+ next batch.  */
+      poly_uint64 bias = nitems_per_ctrl * i;
+      tree bias_tree = build_int_cst (iv_type, bias);
+
+      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
+ BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
+ control and adjust the bound down by BIAS.  */
+      tree this_test_limit = test_limit;
+      if (i != 0)
+ {
+   this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
+   this_test_limit, bias_tree);
+   this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
+   this_test_limit, bias_tree);
+ }
+
+      /* Create decrement IV.  */
+      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
+ insert_after, &index_before_incr, &index_after_incr,
+ MINUS_EXPR);
+
+      poly_uint64 final_vf = vf * nitems_per_iter;
+      tree vf_step = build_int_cst (iv_type, final_vf);
+      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
+    index_before_incr, vf_step);
+      gassign *assign = gimple_build_assign (ctrl, res_len);
+      gimple_seq_add_stmt (header_seq, assign);
+    }
+
+  return index_after_incr;
+}
+
/* Set up the iteration condition and rgroup controls for LOOP, given
    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
@@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
   unsigned int compare_precision = TYPE_PRECISION (compare_type);
   tree orig_niters = niters;
@@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
/* Set up all controls for this group.  */
- test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
-      &preheader_seq,
-      &header_seq,
-      loop_cond_gsi, rgc,
-      niters, niters_skip,
-      might_wrap_p);
+ if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
+     OPTIMIZE_FOR_SPEED))
+   test_ctrl
+     = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
+    &preheader_seq, &header_seq,
+    rgc, niters);
+ else
+   test_ctrl
+     = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
+        &header_seq, loop_cond_gsi, rgc,
+        niters, niters_skip,
+        might_wrap_p);
       }
   /* Emit all accumulated statements.  */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 1ba9f18d73e..5bffd9a6322 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
tree
-vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
-    unsigned int nvectors, unsigned int index)
+vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
+    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
+    unsigned int index)
{
   rgroup_controls *rgl = &(*lens)[nvectors - 1];
-  bool use_bias_adjusted_len =
-    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+  bool use_bias_adjusted_len
+    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
   /* Populate the rgroup's len array, if this is the first time we've
      used it.  */
@@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
  if (use_bias_adjusted_len)
    {
      gcc_assert (i == 0);
-       tree adjusted_len =
- make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
+       tree adjusted_len
+ = make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
      rgl->bias_adjusted_ctrl = adjusted_len;
    }
@@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
   if (use_bias_adjusted_len)
     return rgl->bias_adjusted_ctrl;
+  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
+    OPTIMIZE_FOR_SPEED))
+    {
+      tree loop_len = rgl->controls[index];
+      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
+      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
+      if (maybe_ne (nunits1, nunits2))
+ {
+   /* A loop len for data type X can be reused for data type Y
+      if X has N times more elements than Y and if Y's elements
+      are N times bigger than X's.  */
+   gcc_assert (multiple_p (nunits1, nunits2));
+   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
+   gimple_seq seq = NULL;
+   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
+    build_int_cst (iv_type, factor));
+   if (seq)
+     gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+ }
+      return loop_len;
+    }
   else
     return rgl->controls[index];
}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index efa2d0daa52..708c8a1d806 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
      else if (loop_lens)
{
  tree final_len
-     = vect_get_loop_len (loop_vinfo, loop_lens,
- vec_num * ncopies, vec_num * j + i);
+     = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
+ vec_num * ncopies, vectype,
+ vec_num * j + i);
  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
  machine_mode vmode = TYPE_MODE (vectype);
  opt_machine_mode new_ovmode
@@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
    else if (loop_lens && memory_access_type != VMAT_INVARIANT)
      {
tree final_len
-   = vect_get_loop_len (loop_vinfo, loop_lens,
-        vec_num * ncopies,
+   = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
+        vec_num * ncopies, vectype,
       vec_num * j + i);
tree ptr = build_int_cst (ref_type,
  align * BITS_PER_UNIT);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 9cf2fb23fe3..e5cf38caf4b 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
unsigned int, tree, unsigned int);
extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
  tree, unsigned int);
-extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
-        unsigned int);
+extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
+        vec_loop_lens *, unsigned int, tree, unsigned int);
extern gimple_seq vect_gen_len (tree, tree, tree, tree);
extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
-- 
2.36.3
 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-11 12:12 ` juzhe.zhong
@ 2023-04-11 12:44   ` Richard Sandiford
  2023-04-12  7:00     ` Richard Biener
  0 siblings, 1 reply; 41+ messages in thread
From: Richard Sandiford @ 2023-04-11 12:44 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: gcc-patches, rguenther, jeffreyalaw

"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Hi, Richards. 
> Kindly Ping this patch. 
> This is the most important patch for RVV auto-vectorization support.
> Bootstraped on X86 has passed.

Can it wait for GCC 14?  It doesn't seem like stage 4 material.

Also, pinging after 5 days seems a bit soon.  It's been a 4-day
holiday weekend for much of Europe.

Thanks,
Richard

> Feel free to comments.
>
> Thanks.
>
>
> juzhe.zhong@rivai.ai
>  
> From: juzhe.zhong
> Date: 2023-04-07 09:47
> To: gcc-patches
> CC: richard.sandiford; rguenther; jeffreyalaw; Juzhe-Zhong
> Subject: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
>  
> This patch is to add WHILE_LEN pattern.
> It's inspired by RVV ISA simple "vvaddint32.s" example:
> https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
>  
> More details are in "vect_set_loop_controls_by_while_len" implementation
> and comments.
>  
> Consider such following case:
> #define N 16
> int src[N];
> int dest[N];
>  
> void
> foo (int n)
> {
>   for (int i = 0; i < n; i++)
>     dest[i] = src[i];
> }
>  
> -march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:
>  
> foo:        
>         ble     a0,zero,.L1
>         lui     a4,%hi(.LANCHOR0)
>         addi    a4,a4,%lo(.LANCHOR0)
>         addi    a3,a4,64
>         csrr    a2,vlenb
> .L3:
>         vsetvli a5,a0,e32,m1,ta,ma
>         vle32.v v1,0(a4)
>         sub     a0,a0,a5
>         vse32.v v1,0(a3)
>         add     a4,a4,a2
>         add     a3,a3,a2
>         bne     a0,zero,.L3
> .L1:
>         ret
>  
> gcc/ChangeLog:
>  
>         * doc/md.texi: Add WHILE_LEN support.
>         * internal-fn.cc (while_len_direct): Ditto.
>         (expand_while_len_optab_fn): Ditto.
>         (direct_while_len_optab_supported_p): Ditto.
>         * internal-fn.def (WHILE_LEN): Ditto.
>         * optabs.def (OPTAB_D): Ditto.
>         * tree-ssa-loop-manip.cc (create_iv): Ditto.
>         * tree-ssa-loop-manip.h (create_iv): Ditto.
>         * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
>         (vect_set_loop_condition_partial_vectors): Ditto.
>         * tree-vect-loop.cc (vect_get_loop_len): Ditto.
>         * tree-vect-stmts.cc (vectorizable_store): Ditto.
>         (vectorizable_load): Ditto.
>         * tree-vectorizer.h (vect_get_loop_len): Ditto.
>  
> ---
> gcc/doc/md.texi             |  14 +++
> gcc/internal-fn.cc          |  29 ++++++
> gcc/internal-fn.def         |   1 +
> gcc/optabs.def              |   1 +
> gcc/tree-ssa-loop-manip.cc  |   4 +-
> gcc/tree-ssa-loop-manip.h   |   2 +-
> gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
> gcc/tree-vect-loop.cc       |  35 +++++--
> gcc/tree-vect-stmts.cc      |   9 +-
> gcc/tree-vectorizer.h       |   4 +-
> 10 files changed, 264 insertions(+), 21 deletions(-)
>  
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 8e3113599fd..72178ab014c 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
>    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
> @end smallexample
> +@cindex @code{while_len@var{m}@var{n}} instruction pattern
> +@item @code{while_len@var{m}@var{n}}
> +Set operand 0 to the number of active elements in vector will be updated value.
> +operand 1 is the total elements need to be updated value.
> +operand 2 is the vectorization factor.
> +The operation is equivalent to:
> +
> +@smallexample
> +operand0 = MIN (operand1, operand2);
> +operand2 can be const_poly_int or poly_int related to vector mode size.
> +Some target like RISC-V has a standalone instruction to get MIN (n, MODE SIZE) so
> +that we can reduce a use of general purpose register.
> +@end smallexample
> +
> @cindex @code{check_raw_ptrs@var{m}} instruction pattern
> @item @samp{check_raw_ptrs@var{m}}
> Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 6e81dc05e0e..5f44def90d3 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -127,6 +127,7 @@ init_internal_fns ()
> #define cond_binary_direct { 1, 1, true }
> #define cond_ternary_direct { 1, 1, true }
> #define while_direct { 0, 2, false }
> +#define while_len_direct { 0, 0, false }
> #define fold_extract_direct { 2, 2, false }
> #define fold_left_direct { 1, 1, false }
> #define mask_fold_left_direct { 1, 1, false }
> @@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>      emit_move_insn (lhs_rtx, ops[0].value);
> }
> +/* Expand WHILE_LEN call STMT using optab OPTAB.  */
> +static void
> +expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> +{
> +  expand_operand ops[3];
> +  tree rhs_type[2];
> +
> +  tree lhs = gimple_call_lhs (stmt);
> +  tree lhs_type = TREE_TYPE (lhs);
> +  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> +  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
> +
> +  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
> +    {
> +      tree rhs = gimple_call_arg (stmt, i);
> +      rhs_type[i] = TREE_TYPE (rhs);
> +      rtx rhs_rtx = expand_normal (rhs);
> +      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
> +    }
> +
> +  insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
> +
> +  expand_insn (icode, 3, ops);
> +  if (!rtx_equal_p (lhs_rtx, ops[0].value))
> +    emit_move_insn (lhs_rtx, ops[0].value);
> +}
> +
> /* Expand a call to a convert-like optab using the operands in STMT.
>     FN has a single output operand and NARGS input operands.  */
> @@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
> #define direct_scatter_store_optab_supported_p convert_optab_supported_p
> #define direct_len_store_optab_supported_p direct_optab_supported_p
> #define direct_while_optab_supported_p convert_optab_supported_p
> +#define direct_while_len_optab_supported_p direct_optab_supported_p
> #define direct_fold_extract_optab_supported_p direct_optab_supported_p
> #define direct_fold_left_optab_supported_p direct_optab_supported_p
> #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 7fe742c2ae7..3a933abff5d 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
> DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
> DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> +DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
> DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
>        check_raw_ptrs, check_ptrs)
> DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 695f5911b30..f5938bd2c24 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
> OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
> OPTAB_D (len_load_optab, "len_load_$a")
> OPTAB_D (len_store_optab, "len_store_$a")
> +OPTAB_D (while_len_optab, "while_len$a")
> diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> index 09acc1c94cc..cdbf280e249 100644
> --- a/gcc/tree-ssa-loop-manip.cc
> +++ b/gcc/tree-ssa-loop-manip.cc
> @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
> void
> create_iv (tree base, tree step, tree var, class loop *loop,
>    gimple_stmt_iterator *incr_pos, bool after,
> -    tree *var_before, tree *var_after)
> +    tree *var_before, tree *var_after, enum tree_code code)
> {
>    gassign *stmt;
>    gphi *phi;
>    tree initial, step1;
>    gimple_seq stmts;
>    tree vb, va;
> -  enum tree_code incr_op = PLUS_EXPR;
> +  enum tree_code incr_op = code;
>    edge pe = loop_preheader_edge (loop);
>    if (var != NULL_TREE)
> diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> index d49273a3987..da755320a3a 100644
> --- a/gcc/tree-ssa-loop-manip.h
> +++ b/gcc/tree-ssa-loop-manip.h
> @@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
> typedef void (*transform_callback)(class loop *, void *);
> extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
> -        bool, tree *, tree *);
> +        bool, tree *, tree *, enum tree_code = PLUS_EXPR);
> extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
> extern void verify_loop_closed_ssa (bool, class loop * = NULL);
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index f60fa50e8f4..f3cd6c51d2e 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    return next_ctrl;
> }
> +/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
> +   for all the rgroup controls in RGC and return a control that is nonzero
> +   when the loop needs to iterate.  Add any new preheader statements to
> +   PREHEADER_SEQ and any new header statements to HEADER_SEQ.
> +
> +   RGC belongs to loop LOOP.  The loop originally iterated NITERS
> +   times and has been vectorized according to LOOP_VINFO.
> +
> +   Unlike vect_set_loop_controls_directly, which iterates a 0-based IV up
> +   to TEST_LIMIT - bias, this function uses a decrementing IV.
> +
> +   In vect_set_loop_controls_by_while_len, the IV starts at
> +   IV = TEST_LIMIT - bias and each iteration subtracts from it the length
> +   calculated by the IFN_WHILE_LEN pattern.
> +
> +   Note: the cost of the code generated by this function is modeled
> +   by vect_estimate_min_profitable_iters, so changes here may need
> +   corresponding changes there.
> +
> +   1. Single rgroup, the Gimple IR should be:
> +
> + <bb 3>
> + _19 = (unsigned long) n_5(D);
> + ...
> +
> + <bb 4>:
> + ...
> + # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
> + ...
> + _22 = .WHILE_LEN (ivtmp_20, vf);
> + ...
> + vector statement (use _22);
> + ...
> + ivtmp_21 = ivtmp_20 - _22;
> + ...
> + if (ivtmp_21 != 0)
> +   goto <bb 4>; [75.00%]
> + else
> +   goto <bb 5>; [25.00%]
> +
> + <bb 5>
> + return;
> +
> +   Note: IFN_WHILE_LEN guarantees that "ivtmp_21 = ivtmp_20 - _22" never
> +   underflows (wraps below 0).
> +
> +   2. Multiple rgroup, the Gimple IR should be:
> +
> + <bb 3>
> + _70 = (unsigned long) bnd.7_52;
> + _71 = _70 * 2;
> + _72 = MAX_EXPR <_71, 4>;
> + _73 = _72 + 18446744073709551612;
> + ...
> +
> + <bb 4>:
> + ...
> + # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
> + # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
> + _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_iter);
> + _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_iter);
> + ...
> + vector statement (use _79);
> + ...
> + vector statement (use _76);
> + ...
> + _65 = _79 / 2;
> + vector statement (use _65);
> + ...
> + _68 = _76 / 2;
> + vector statement (use _68);
> + ...
> + ivtmp_78 = ivtmp_77 - _79;
> + ivtmp_75 = ivtmp_74 - _76;
> + ...
> + if (ivtmp_78 != 0)
> +   goto <bb 4>; [75.00%]
> + else
> +   goto <bb 5>; [25.00%]
> +
> + <bb 5>
> + return;
> +
> +*/
> +
> +static tree
> +vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
> +      gimple_seq *preheader_seq,
> +      gimple_seq *header_seq,
> +      rgroup_controls *rgc, tree niters)
> +{
> +  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +  /* We are not allowing masked approach in WHILE_LEN.  */
> +  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +
> +  tree ctrl_type = rgc->type;
> +  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
> +  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
> +  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> +
> +  /* Calculate the maximum number of item values that the rgroup
> +     handles in total, the number that it handles for each iteration
> +     of the vector loop.  */
> +  tree nitems_total = niters;
> +  if (nitems_per_iter != 1)
> +    {
> +      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
> + these multiplications don't overflow.  */
> +      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
> +      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
> +    nitems_total, compare_factor);
> +    }
> +
> +  /* Convert the comparison value to the IV type (either a no-op or
> +     a promotion).  */
> +  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> +
> +  /* Create an induction variable that counts the number of items
> +     processed.  */
> +  tree index_before_incr, index_after_incr;
> +  gimple_stmt_iterator incr_gsi;
> +  bool insert_after;
> +  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +
> +  /* Test the decremented IV, which will never go below 0 because
> +     IFN_WHILE_LEN guarantees that.  */
> +  tree test_limit = nitems_total;
> +
> +  /* Provide a definition of each control in the group.  */
> +  tree ctrl;
> +  unsigned int i;
> +  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
> +    {
> +      /* Previous controls will cover BIAS items.  This control covers the
> + next batch.  */
> +      poly_uint64 bias = nitems_per_ctrl * i;
> +      tree bias_tree = build_int_cst (iv_type, bias);
> +
> +      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
> + BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
> + control and adjust the bound down by BIAS.  */
> +      tree this_test_limit = test_limit;
> +      if (i != 0)
> + {
> +   this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
> +   this_test_limit, bias_tree);
> +   this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
> +   this_test_limit, bias_tree);
> + }
> +
> +      /* Create decrement IV.  */
> +      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
> + insert_after, &index_before_incr, &index_after_incr,
> + MINUS_EXPR);
> +
> +      poly_uint64 final_vf = vf * nitems_per_iter;
> +      tree vf_step = build_int_cst (iv_type, final_vf);
> +      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
> +    index_before_incr, vf_step);
> +      gassign *assign = gimple_build_assign (ctrl, res_len);
> +      gimple_seq_add_stmt (header_seq, assign);
> +    }
> +
> +  return index_after_incr;
> +}
> +
> /* Set up the iteration condition and rgroup controls for LOOP, given
>     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
>     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
> @@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>    bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>    tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>    unsigned int compare_precision = TYPE_PRECISION (compare_type);
>    tree orig_niters = niters;
> @@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> /* Set up all controls for this group.  */
> - test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
> -      &preheader_seq,
> -      &header_seq,
> -      loop_cond_gsi, rgc,
> -      niters, niters_skip,
> -      might_wrap_p);
> + if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> +     OPTIMIZE_FOR_SPEED))
> +   test_ctrl
> +     = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
> +    &preheader_seq, &header_seq,
> +    rgc, niters);
> + else
> +   test_ctrl
> +     = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
> +        &header_seq, loop_cond_gsi, rgc,
> +        niters, niters_skip,
> +        might_wrap_p);
>        }
>    /* Emit all accumulated statements.  */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 1ba9f18d73e..5bffd9a6322 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>     rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
> tree
> -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> -    unsigned int nvectors, unsigned int index)
> +vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
> +    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
> +    unsigned int index)
> {
>    rgroup_controls *rgl = &(*lens)[nvectors - 1];
> -  bool use_bias_adjusted_len =
> -    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> +  bool use_bias_adjusted_len
> +    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>    /* Populate the rgroup's len array, if this is the first time we've
>       used it.  */
> @@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>   if (use_bias_adjusted_len)
>     {
>       gcc_assert (i == 0);
> -       tree adjusted_len =
> - make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> +       tree adjusted_len
> + = make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
>       SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
>       rgl->bias_adjusted_ctrl = adjusted_len;
>     }
> @@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>    if (use_bias_adjusted_len)
>      return rgl->bias_adjusted_ctrl;
> +  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> +    OPTIMIZE_FOR_SPEED))
> +    {
> +      tree loop_len = rgl->controls[index];
> +      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
> +      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
> +      if (maybe_ne (nunits1, nunits2))
> + {
> +   /* A loop len for data type X can be reused for data type Y
> +      if X has N times more elements than Y and if Y's elements
> +      are N times bigger than X's.  */
> +   gcc_assert (multiple_p (nunits1, nunits2));
> +   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
> +   gimple_seq seq = NULL;
> +   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
> +    build_int_cst (iv_type, factor));
> +   if (seq)
> +     gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
> + }
> +      return loop_len;
> +    }
>    else
>      return rgl->controls[index];
> }
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index efa2d0daa52..708c8a1d806 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
>       else if (loop_lens)
> {
>   tree final_len
> -     = vect_get_loop_len (loop_vinfo, loop_lens,
> - vec_num * ncopies, vec_num * j + i);
> +     = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> + vec_num * ncopies, vectype,
> + vec_num * j + i);
>   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
>   machine_mode vmode = TYPE_MODE (vectype);
>   opt_machine_mode new_ovmode
> @@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
>     else if (loop_lens && memory_access_type != VMAT_INVARIANT)
>       {
> tree final_len
> -   = vect_get_loop_len (loop_vinfo, loop_lens,
> -        vec_num * ncopies,
> +   = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> +        vec_num * ncopies, vectype,
>        vec_num * j + i);
> tree ptr = build_int_cst (ref_type,
>   align * BITS_PER_UNIT);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 9cf2fb23fe3..e5cf38caf4b 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
> unsigned int, tree, unsigned int);
> extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
>   tree, unsigned int);
> -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> -        unsigned int);
> +extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
> +        vec_loop_lens *, unsigned int, tree, unsigned int);
> extern gimple_seq vect_gen_len (tree, tree, tree, tree);
> extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-11 12:44   ` Richard Sandiford
@ 2023-04-12  7:00     ` Richard Biener
  2023-04-12  8:00       ` juzhe.zhong
  0 siblings, 1 reply; 41+ messages in thread
From: Richard Biener @ 2023-04-12  7:00 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: juzhe.zhong, gcc-patches, jeffreyalaw

On Tue, 11 Apr 2023, Richard Sandiford wrote:

> "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> > Hi, Richards. 
> > > Kindly ping on this patch.
> > > This is the most important patch for RVV auto-vectorization support.
> > > Bootstrap on x86 has passed.
> 
> Can it wait for GCC 14?  It doesn't seem like stage 4 material.
> 
> Also, pinging after 5 days seems a bit soon.  It's been a 4-day
> holiday weekend for much of Europe.

Also can you explain why using WHILE_ULT is not possible?  (I've
successfully - to some extent - done that for AVX512 for example)

The patch lacks the description of what WHILE_LEN actually is.

Richard.

> Thanks,
> Richard
> 
> > > Feel free to comment.
> >
> > Thanks.
> >
> >
> > juzhe.zhong@rivai.ai
> >  
> > From: juzhe.zhong
> > Date: 2023-04-07 09:47
> > To: gcc-patches
> > CC: richard.sandiford; rguenther; jeffreyalaw; Juzhe-Zhong
> > Subject: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> > From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> >  
> > > This patch adds the WHILE_LEN pattern.
> > > It's inspired by the simple "vvaddint32.s" example in the RVV ISA spec:
> > https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
> >  
> > More details are in "vect_set_loop_controls_by_while_len" implementation
> > and comments.
> >  
> > > Consider the following case:
> > #define N 16
> > int src[N];
> > int dest[N];
> >  
> > void
> > foo (int n)
> > {
> >   for (int i = 0; i < n; i++)
> >     dest[i] = src[i];
> > }
> >  
> > -march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:
> >  
> > foo:        
> >         ble     a0,zero,.L1
> >         lui     a4,%hi(.LANCHOR0)
> >         addi    a4,a4,%lo(.LANCHOR0)
> >         addi    a3,a4,64
> >         csrr    a2,vlenb
> > .L3:
> >         vsetvli a5,a0,e32,m1,ta,ma
> >         vle32.v v1,0(a4)
> >         sub     a0,a0,a5
> >         vse32.v v1,0(a3)
> >         add     a4,a4,a2
> >         add     a3,a3,a2
> >         bne     a0,zero,.L3
> > .L1:
> >         ret
> >  
> > gcc/ChangeLog:
> >  
> >         * doc/md.texi: Add WHILE_LEN support.
> >         * internal-fn.cc (while_len_direct): Ditto.
> >         (expand_while_len_optab_fn): Ditto.
> >         (direct_while_len_optab_supported_p): Ditto.
> >         * internal-fn.def (WHILE_LEN): Ditto.
> >         * optabs.def (OPTAB_D): Ditto.
> >         * tree-ssa-loop-manip.cc (create_iv): Ditto.
> >         * tree-ssa-loop-manip.h (create_iv): Ditto.
> >         * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
> >         (vect_set_loop_condition_partial_vectors): Ditto.
> >         * tree-vect-loop.cc (vect_get_loop_len): Ditto.
> >         * tree-vect-stmts.cc (vectorizable_store): Ditto.
> >         (vectorizable_load): Ditto.
> >         * tree-vectorizer.h (vect_get_loop_len): Ditto.
> >  
> > ---
> > gcc/doc/md.texi             |  14 +++
> > gcc/internal-fn.cc          |  29 ++++++
> > gcc/internal-fn.def         |   1 +
> > gcc/optabs.def              |   1 +
> > gcc/tree-ssa-loop-manip.cc  |   4 +-
> > gcc/tree-ssa-loop-manip.h   |   2 +-
> > gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
> > gcc/tree-vect-loop.cc       |  35 +++++--
> > gcc/tree-vect-stmts.cc      |   9 +-
> > gcc/tree-vectorizer.h       |   4 +-
> > 10 files changed, 264 insertions(+), 21 deletions(-)
> >  
> > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> > index 8e3113599fd..72178ab014c 100644
> > --- a/gcc/doc/md.texi
> > +++ b/gcc/doc/md.texi
> > @@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
> >    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
> > @end smallexample
> > > +@cindex @code{while_len@var{m}} instruction pattern
> > > +@item @samp{while_len@var{m}}
> > > +Set operand 0 to the number of active elements in the vector that will be updated.
> > > +Operand 1 is the total number of elements that need to be updated.
> > > +Operand 2 is the vectorization factor.
> > +The operation is equivalent to:
> > +
> > +@smallexample
> > +operand0 = MIN (operand1, operand2);
> > > +operand2 can be a const_poly_int or a poly_int related to the vector mode size.
> > > +Some targets, such as RISC-V, have a standalone instruction to compute MIN (n, MODE SIZE), so
> > > +that we can reduce the use of general-purpose registers.
> > +@end smallexample
> > +
> > @cindex @code{check_raw_ptrs@var{m}} instruction pattern
> > @item @samp{check_raw_ptrs@var{m}}
> > Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
> > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > index 6e81dc05e0e..5f44def90d3 100644
> > --- a/gcc/internal-fn.cc
> > +++ b/gcc/internal-fn.cc
> > @@ -127,6 +127,7 @@ init_internal_fns ()
> > #define cond_binary_direct { 1, 1, true }
> > #define cond_ternary_direct { 1, 1, true }
> > #define while_direct { 0, 2, false }
> > +#define while_len_direct { 0, 0, false }
> > #define fold_extract_direct { 2, 2, false }
> > #define fold_left_direct { 1, 1, false }
> > #define mask_fold_left_direct { 1, 1, false }
> > @@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> >      emit_move_insn (lhs_rtx, ops[0].value);
> > }
> > +/* Expand WHILE_LEN call STMT using optab OPTAB.  */
> > +static void
> > +expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> > +{
> > +  expand_operand ops[3];
> > +  tree rhs_type[2];
> > +
> > +  tree lhs = gimple_call_lhs (stmt);
> > +  tree lhs_type = TREE_TYPE (lhs);
> > +  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > +  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
> > +
> > +  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
> > +    {
> > +      tree rhs = gimple_call_arg (stmt, i);
> > +      rhs_type[i] = TREE_TYPE (rhs);
> > +      rtx rhs_rtx = expand_normal (rhs);
> > +      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
> > +    }
> > +
> > +  insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
> > +
> > +  expand_insn (icode, 3, ops);
> > +  if (!rtx_equal_p (lhs_rtx, ops[0].value))
> > +    emit_move_insn (lhs_rtx, ops[0].value);
> > +}
> > +
> > /* Expand a call to a convert-like optab using the operands in STMT.
> >     FN has a single output operand and NARGS input operands.  */
> > @@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
> > #define direct_scatter_store_optab_supported_p convert_optab_supported_p
> > #define direct_len_store_optab_supported_p direct_optab_supported_p
> > #define direct_while_optab_supported_p convert_optab_supported_p
> > +#define direct_while_len_optab_supported_p direct_optab_supported_p
> > #define direct_fold_extract_optab_supported_p direct_optab_supported_p
> > #define direct_fold_left_optab_supported_p direct_optab_supported_p
> > #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
> > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > index 7fe742c2ae7..3a933abff5d 100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
> > DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
> > DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> > +DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
> > DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
> >        check_raw_ptrs, check_ptrs)
> > DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > index 695f5911b30..f5938bd2c24 100644
> > --- a/gcc/optabs.def
> > +++ b/gcc/optabs.def
> > @@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
> > OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
> > OPTAB_D (len_load_optab, "len_load_$a")
> > OPTAB_D (len_store_optab, "len_store_$a")
> > +OPTAB_D (while_len_optab, "while_len$a")
> > diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> > index 09acc1c94cc..cdbf280e249 100644
> > --- a/gcc/tree-ssa-loop-manip.cc
> > +++ b/gcc/tree-ssa-loop-manip.cc
> > @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
> > void
> > create_iv (tree base, tree step, tree var, class loop *loop,
> >    gimple_stmt_iterator *incr_pos, bool after,
> > -    tree *var_before, tree *var_after)
> > +    tree *var_before, tree *var_after, enum tree_code code)
> > {
> >    gassign *stmt;
> >    gphi *phi;
> >    tree initial, step1;
> >    gimple_seq stmts;
> >    tree vb, va;
> > -  enum tree_code incr_op = PLUS_EXPR;
> > +  enum tree_code incr_op = code;
> >    edge pe = loop_preheader_edge (loop);
> >    if (var != NULL_TREE)
> > diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> > index d49273a3987..da755320a3a 100644
> > --- a/gcc/tree-ssa-loop-manip.h
> > +++ b/gcc/tree-ssa-loop-manip.h
> > @@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
> > typedef void (*transform_callback)(class loop *, void *);
> > extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
> > -        bool, tree *, tree *);
> > +        bool, tree *, tree *, enum tree_code = PLUS_EXPR);
> > extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
> > extern void verify_loop_closed_ssa (bool, class loop * = NULL);
> > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > index f60fa50e8f4..f3cd6c51d2e 100644
> > --- a/gcc/tree-vect-loop-manip.cc
> > +++ b/gcc/tree-vect-loop-manip.cc
> > @@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> >    return next_ctrl;
> > }
> > > +/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
> > > +   for all the rgroup controls in RGC and return a control that is nonzero
> > > +   when the loop needs to iterate.  Add any new preheader statements to
> > > +   PREHEADER_SEQ and any new header statements to HEADER_SEQ.
> > +
> > +   RGC belongs to loop LOOP.  The loop originally iterated NITERS
> > +   times and has been vectorized according to LOOP_VINFO.
> > +
> > > +   Unlike vect_set_loop_controls_directly, which iterates a 0-based IV up
> > > +   to TEST_LIMIT - bias, this function uses a decrementing IV.
> > > +
> > > +   In vect_set_loop_controls_by_while_len, the IV starts at
> > > +   IV = TEST_LIMIT - bias and each iteration subtracts from it the length
> > > +   calculated by the IFN_WHILE_LEN pattern.
> > +
> > +   Note: the cost of the code generated by this function is modeled
> > +   by vect_estimate_min_profitable_iters, so changes here may need
> > +   corresponding changes there.
> > +
> > +   1. Single rgroup, the Gimple IR should be:
> > +
> > + <bb 3>
> > + _19 = (unsigned long) n_5(D);
> > + ...
> > +
> > + <bb 4>:
> > + ...
> > + # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
> > + ...
> > + _22 = .WHILE_LEN (ivtmp_20, vf);
> > + ...
> > + vector statement (use _22);
> > + ...
> > + ivtmp_21 = ivtmp_20 - _22;
> > + ...
> > + if (ivtmp_21 != 0)
> > +   goto <bb 4>; [75.00%]
> > + else
> > +   goto <bb 5>; [25.00%]
> > +
> > + <bb 5>
> > + return;
> > +
> > > +   Note: IFN_WHILE_LEN guarantees that "ivtmp_21 = ivtmp_20 - _22" never
> > > +   underflows (wraps below 0).
> > +
> > +   2. Multiple rgroup, the Gimple IR should be:
> > +
> > + <bb 3>
> > + _70 = (unsigned long) bnd.7_52;
> > + _71 = _70 * 2;
> > + _72 = MAX_EXPR <_71, 4>;
> > + _73 = _72 + 18446744073709551612;
> > + ...
> > +
> > + <bb 4>:
> > + ...
> > + # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
> > + # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
> > > + _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_iter);
> > > + _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_iter);
> > + ...
> > + vector statement (use _79);
> > + ...
> > + vector statement (use _76);
> > + ...
> > + _65 = _79 / 2;
> > + vector statement (use _65);
> > + ...
> > + _68 = _76 / 2;
> > + vector statement (use _68);
> > + ...
> > + ivtmp_78 = ivtmp_77 - _79;
> > + ivtmp_75 = ivtmp_74 - _76;
> > + ...
> > + if (ivtmp_78 != 0)
> > +   goto <bb 4>; [75.00%]
> > + else
> > +   goto <bb 5>; [25.00%]
> > +
> > + <bb 5>
> > + return;
> > +
> > +*/
> > +
> > +static tree
> > +vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
> > +      gimple_seq *preheader_seq,
> > +      gimple_seq *header_seq,
> > +      rgroup_controls *rgc, tree niters)
> > +{
> > +  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > +  /* We are not allowing masked approach in WHILE_LEN.  */
> > +  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > +
> > +  tree ctrl_type = rgc->type;
> > +  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
> > +  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
> > +  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > +
> > +  /* Calculate the maximum number of item values that the rgroup
> > +     handles in total, the number that it handles for each iteration
> > +     of the vector loop.  */
> > +  tree nitems_total = niters;
> > +  if (nitems_per_iter != 1)
> > +    {
> > +      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
> > + these multiplications don't overflow.  */
> > +      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
> > +      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
> > +    nitems_total, compare_factor);
> > +    }
> > +
> > +  /* Convert the comparison value to the IV type (either a no-op or
> > +     a promotion).  */
> > +  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> > +
> > +  /* Create an induction variable that counts the number of items
> > +     processed.  */
> > +  tree index_before_incr, index_after_incr;
> > +  gimple_stmt_iterator incr_gsi;
> > +  bool insert_after;
> > +  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> > +
> > > +  /* Test the decremented IV, which will never go below 0 because
> > > +     IFN_WHILE_LEN guarantees that.  */
> > +  tree test_limit = nitems_total;
> > +
> > +  /* Provide a definition of each control in the group.  */
> > +  tree ctrl;
> > +  unsigned int i;
> > +  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
> > +    {
> > +      /* Previous controls will cover BIAS items.  This control covers the
> > + next batch.  */
> > +      poly_uint64 bias = nitems_per_ctrl * i;
> > +      tree bias_tree = build_int_cst (iv_type, bias);
> > +
> > +      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
> > + BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
> > + control and adjust the bound down by BIAS.  */
> > +      tree this_test_limit = test_limit;
> > +      if (i != 0)
> > + {
> > +   this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
> > +   this_test_limit, bias_tree);
> > +   this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
> > +   this_test_limit, bias_tree);
> > + }
> > +
> > +      /* Create decrement IV.  */
> > +      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
> > + insert_after, &index_before_incr, &index_after_incr,
> > + MINUS_EXPR);
> > +
> > +      poly_uint64 final_vf = vf * nitems_per_iter;
> > +      tree vf_step = build_int_cst (iv_type, final_vf);
> > +      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
> > +    index_before_incr, vf_step);
> > +      gassign *assign = gimple_build_assign (ctrl, res_len);
> > +      gimple_seq_add_stmt (header_seq, assign);
> > +    }
> > +
> > +  return index_after_incr;
> > +}
> > +
> > /* Set up the iteration condition and rgroup controls for LOOP, given
> >     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
> >     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
> > @@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> >    bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> >    tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> >    unsigned int compare_precision = TYPE_PRECISION (compare_type);
> >    tree orig_niters = niters;
> > @@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> > bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> > /* Set up all controls for this group.  */
> > - test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
> > -      &preheader_seq,
> > -      &header_seq,
> > -      loop_cond_gsi, rgc,
> > -      niters, niters_skip,
> > -      might_wrap_p);
> > + if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> > +     OPTIMIZE_FOR_SPEED))
> > +   test_ctrl
> > +     = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
> > +    &preheader_seq, &header_seq,
> > +    rgc, niters);
> > + else
> > +   test_ctrl
> > +     = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
> > +        &header_seq, loop_cond_gsi, rgc,
> > +        niters, niters_skip,
> > +        might_wrap_p);
> >        }
> >    /* Emit all accumulated statements.  */
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 1ba9f18d73e..5bffd9a6322 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> >     rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
> > tree
> > -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > -    unsigned int nvectors, unsigned int index)
> > +vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
> > +    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
> > +    unsigned int index)
> > {
> >    rgroup_controls *rgl = &(*lens)[nvectors - 1];
> > -  bool use_bias_adjusted_len =
> > -    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> > +  bool use_bias_adjusted_len
> > +    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> >    /* Populate the rgroup's len array, if this is the first time we've
> >       used it.  */
> > @@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> >   if (use_bias_adjusted_len)
> >     {
> >       gcc_assert (i == 0);
> > -       tree adjusted_len =
> > - make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> > +       tree adjusted_len
> > + = make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> >       SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
> >       rgl->bias_adjusted_ctrl = adjusted_len;
> >     }
> > @@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> >    if (use_bias_adjusted_len)
> >      return rgl->bias_adjusted_ctrl;
> > +  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> > +    OPTIMIZE_FOR_SPEED))
> > +    {
> > +      tree loop_len = rgl->controls[index];
> > +      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
> > +      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
> > +      if (maybe_ne (nunits1, nunits2))
> > + {
> > +   /* A loop len for data type X can be reused for data type Y
> > +      if X has N times more elements than Y and if Y's elements
> > +      are N times bigger than X's.  */
> > +   gcc_assert (multiple_p (nunits1, nunits2));
> > +   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
> > +   gimple_seq seq = NULL;
> > +   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
> > +    build_int_cst (iv_type, factor));
> > +   if (seq)
> > +     gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
> > + }
> > +      return loop_len;
> > +    }
> >    else
> >      return rgl->controls[index];
> > }
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > index efa2d0daa52..708c8a1d806 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
> >       else if (loop_lens)
> > {
> >   tree final_len
> > -     = vect_get_loop_len (loop_vinfo, loop_lens,
> > - vec_num * ncopies, vec_num * j + i);
> > +     = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> > + vec_num * ncopies, vectype,
> > + vec_num * j + i);
> >   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> >   machine_mode vmode = TYPE_MODE (vectype);
> >   opt_machine_mode new_ovmode
> > @@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
> >     else if (loop_lens && memory_access_type != VMAT_INVARIANT)
> >       {
> > tree final_len
> > -   = vect_get_loop_len (loop_vinfo, loop_lens,
> > -        vec_num * ncopies,
> > +   = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> > +        vec_num * ncopies, vectype,
> >        vec_num * j + i);
> > tree ptr = build_int_cst (ref_type,
> >   align * BITS_PER_UNIT);
> > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > index 9cf2fb23fe3..e5cf38caf4b 100644
> > --- a/gcc/tree-vectorizer.h
> > +++ b/gcc/tree-vectorizer.h
> > @@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
> > unsigned int, tree, unsigned int);
> > extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> >   tree, unsigned int);
> > -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> > -        unsigned int);
> > +extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
> > +        vec_loop_lens *, unsigned int, tree, unsigned int);
> > extern gimple_seq vect_gen_len (tree, tree, tree, tree);
> > extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> > extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-12  7:00     ` Richard Biener
@ 2023-04-12  8:00       ` juzhe.zhong
  2023-04-12  8:42         ` Richard Biener
  0 siblings, 1 reply; 41+ messages in thread
From: juzhe.zhong @ 2023-04-12  8:00 UTC (permalink / raw)
  To: rguenther, richard.sandiford; +Cc: gcc-patches, jeffreyalaw

[-- Attachment #1: Type: text/plain, Size: 27849 bytes --]

Thank you very much for reply.

WHILE_LEN is the pattern that calculates the number of vector elements that will be updated in each iteration.
For RVV, we use the vsetvl instruction to calculate this number of elements.

WHILE_ULT cannot work for RVV, since WHILE_ULT generates a mask to predicate vector operations, but RVV does not
use a mask to do the loop strip mining (RVV only uses masks for control flow inside the loop).

Here is the example WHILE_ULT working in ARM SVE:
https://godbolt.org/z/jKsT8E1hP 

The first example is:
void foo (int32_t * __restrict a, int32_t * __restrict b, int n)
{
    for (int i = 0; i < n; i++)
      a[i] = a[i] + b[i];
}

ARM SVE:
foo:
        cmp     w2, 0
        ble     .L1
        mov     x3, 0
        cntw    x4
        whilelo p0.s, wzr, w2
.L3:
        ld1w    z1.s, p0/z, [x0, x3, lsl 2]
        ld1w    z0.s, p0/z, [x1, x3, lsl 2]
        add     z0.s, z0.s, z1.s
        st1w    z0.s, p0, [x0, x3, lsl 2]
        add     x3, x3, x4
        whilelo p0.s, w3, w2
        b.any   .L3
.L1:
        ret

Here, whilelo generates the mask from the range w3 to w2.
For example, if w3 = 0 and w2 = 3 (supposing the machine vector length is > 3),
it will generate the mask 0b111 to predicate the loads and stores.

For RVV, we can't do that since RVV doesn't have a whilelo instruction to generate a predicate mask.
Also, we can't use a mask as the predicate to do loop strip mining since RVV has only a single mask,
which is used to handle flow control inside the loop.

Instead, we use vsetvl to do the strip mining. Based on this, for the same C code, the ideal RVV asm according to the RVV ISA should be:

preheader:
a0 = n (the total number of scalar elements to be processed).
 .....
.L3:
        vsetvli a5,a0,e32,m1,ta,ma    ====> WHILE_LEN pattern generate this instruction, calculate the number of the elements should be updated
        vle32.v v1,0(a4)
        sub     a0,a0,a5      ============> Decrement the induction variable by the a5 (generated by WHILE_LEN)
        ....   

        vadd.vv....
        vse32.v v1,0(a3)
        add     a4,a4,a2
        add     a3,a3,a2
        bne     a0,zero,.L3
.L1:
        ret

So you can see that if n = 3, as in the ARM SVE example (supposing the machine vector length is > 3), then vsetvli a5,a0,e32,m1,ta,ma will
set a5 = 3, and vle32.v/vadd.vv/vse32.v all operate only on element 0, element 1 and element 2.

Besides, WHILE_LEN is defined so that its result never exceeds its input operand, which is "a0" here.
That means "sub a0,a0,a5" can never make a0 underflow below 0.

I have tried to return Pmode in TARGET_VECTORIZE_GET_MASK_MODE 
target hook and then use WHILE_ULT. 

But there are 2 issues:
One is that current GCC iterates from a 0-based IV up to TEST_LIMIT, whereas the optimal RVV flow I showed above
starts at "n" and keeps decreasing it until 0.  Trying to fit the current GCC flow, RVV needs more instructions to do the loop strip mining.

The second is that the mode returned by TARGET_VECTORIZE_GET_MASK_MODE
specifies not only the destination mode for WHILE_ULT but also the mask mode used for flow control.
If we return Pmode, which is used as the length for RVV, we can't use a mask mode like VNx2BI to do the flow control predication.
This is another example:
void foo2 (int32_t * __restrict a, int32_t * __restrict b, int32_t * restrict cond, int n)
{
    for (int i = 0; i < n; i++)
      if (cond[i])
        a[i] = a[i] + b[i];
}

ARM SVE:
        ld1w    z0.s, p0/z, [x2, x4, lsl 2]
        cmpne   p0.s, p0/z, z0.s, #0
        ld1w    z0.s, p0/z, [x0, x4, lsl 2]
        ld1w    z1.s, p0/z, [x1, x4, lsl 2]
        add     z0.s, z0.s, z1.s
        st1w    z0.s, p0, [x0, x4, lsl 2]
        add     x4, x4, x5
        whilelo p0.s, w4, w3
        b.any   .L8

Here we can see that ARM uses the mask mode for both loop strip mining and flow control.

Whereas RVV uses the length generated by vsetvl (WHILE_LEN) to do the loop strip mining and a mask generated by a comparison to do the flow control.

So the ASM generated by my downstream LLVM/GCC:
.L3:
        vsetvli a6,a3,e32,m1,ta,mu   ==========> generate length to predicate RVV operation. 
        vle32.v v0,(a2)
        sub     a3,a3,a6      ==========> decrease the induction variable until 0.
        vmsne.vi        v0,v0,0   ==========> generate mask to predicate RVV operation. 
        vle32.v v24,(a0),v0.t   ===========> here using v0.t is the only mask register to predicate RVV operation
        vle32.v v25,(a1),v0.t
        vadd.vv v24,v24,v25
        vse32.v v24,(a0),v0.t
        add     a2,a2,a4
        add     a0,a0,a4
        add     a1,a1,a4
        bne     a3,zero,.L3
.L1:
        ret


This is how RVV works.
Feel free to comment if you have any questions.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-04-12 15:00
To: Richard Sandiford
CC: juzhe.zhong@rivai.ai; gcc-patches; jeffreyalaw
Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
On Tue, 11 Apr 2023, Richard Sandiford wrote:
 
> "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> > Hi, Richards. 
> > Kindly Ping this patch. 
> > This is the most important patch for RVV auto-vectorization support.
> > Bootstraped on X86 has passed.
> 
> Can it wait for GCC 14?  It doesn't seem like stage 4 material.
> 
> Also, pinging after 5 days seems a bit soon.  It's been a 4-day
> holiday weekend for much of Europe.
 
Also can you explain why using WHILE_ULT is not possible?  (I've
successfully - to some extent - done that for AVX512 for example)
 
The patch lacks the description of what WHILE_LEN actually is.
 
Richard.
 
> Thanks,
> Richard
> 
> > Feel free to comments.
> >
> > Thanks.
> >
> >
> > juzhe.zhong@rivai.ai
> >  
> > From: juzhe.zhong
> > Date: 2023-04-07 09:47
> > To: gcc-patches
> > CC: richard.sandiford; rguenther; jeffreyalaw; Juzhe-Zhong
> > Subject: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> > From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> >  
> > This patch is to add WHILE_LEN pattern.
> > It's inspired by RVV ISA simple "vvaddint32.s" example:
> > https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
> >  
> > More details are in "vect_set_loop_controls_by_while_len" implementation
> > and comments.
> >  
> > Consider such following case:
> > #define N 16
> > int src[N];
> > int dest[N];
> >  
> > void
> > foo (int n)
> > {
> >   for (int i = 0; i < n; i++)
> >     dest[i] = src[i];
> > }
> >  
> > -march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:
> >  
> > foo:        
> >         ble     a0,zero,.L1
> >         lui     a4,%hi(.LANCHOR0)
> >         addi    a4,a4,%lo(.LANCHOR0)
> >         addi    a3,a4,64
> >         csrr    a2,vlenb
> > .L3:
> >         vsetvli a5,a0,e32,m1,ta,ma
> >         vle32.v v1,0(a4)
> >         sub     a0,a0,a5
> >         vse32.v v1,0(a3)
> >         add     a4,a4,a2
> >         add     a3,a3,a2
> >         bne     a0,zero,.L3
> > .L1:
> >         ret
> >  
> > gcc/ChangeLog:
> >  
> >         * doc/md.texi: Add WHILE_LEN support.
> >         * internal-fn.cc (while_len_direct): Ditto.
> >         (expand_while_len_optab_fn): Ditto.
> >         (direct_while_len_optab_supported_p): Ditto.
> >         * internal-fn.def (WHILE_LEN): Ditto.
> >         * optabs.def (OPTAB_D): Ditto.
> >         * tree-ssa-loop-manip.cc (create_iv): Ditto.
> >         * tree-ssa-loop-manip.h (create_iv): Ditto.
> >         * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
> >         (vect_set_loop_condition_partial_vectors): Ditto.
> >         * tree-vect-loop.cc (vect_get_loop_len): Ditto.
> >         * tree-vect-stmts.cc (vectorizable_store): Ditto.
> >         (vectorizable_load): Ditto.
> >         * tree-vectorizer.h (vect_get_loop_len): Ditto.
> >  
> > ---
> > gcc/doc/md.texi             |  14 +++
> > gcc/internal-fn.cc          |  29 ++++++
> > gcc/internal-fn.def         |   1 +
> > gcc/optabs.def              |   1 +
> > gcc/tree-ssa-loop-manip.cc  |   4 +-
> > gcc/tree-ssa-loop-manip.h   |   2 +-
> > gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
> > gcc/tree-vect-loop.cc       |  35 +++++--
> > gcc/tree-vect-stmts.cc      |   9 +-
> > gcc/tree-vectorizer.h       |   4 +-
> > 10 files changed, 264 insertions(+), 21 deletions(-)
> >  
> > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> > index 8e3113599fd..72178ab014c 100644
> > --- a/gcc/doc/md.texi
> > +++ b/gcc/doc/md.texi
> > @@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
> >    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
> > @end smallexample
> > +@cindex @code{while_len@var{m}@var{n}} instruction pattern
> > +@item @code{while_len@var{m}@var{n}}
> > +Set operand 0 to the number of active elements in vector will be updated value.
> > +operand 1 is the total elements need to be updated value.
> > +operand 2 is the vectorization factor.
> > +The operation is equivalent to:
> > +
> > +@smallexample
> > +operand0 = MIN (operand1, operand2);
> > +operand2 can be const_poly_int or poly_int related to vector mode size.
> > +Some target like RISC-V has a standalone instruction to get MIN (n, MODE SIZE) so
> > +that we can reduce a use of general purpose register.
> > +@end smallexample
> > +
> > @cindex @code{check_raw_ptrs@var{m}} instruction pattern
> > @item @samp{check_raw_ptrs@var{m}}
> > Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
> > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > index 6e81dc05e0e..5f44def90d3 100644
> > --- a/gcc/internal-fn.cc
> > +++ b/gcc/internal-fn.cc
> > @@ -127,6 +127,7 @@ init_internal_fns ()
> > #define cond_binary_direct { 1, 1, true }
> > #define cond_ternary_direct { 1, 1, true }
> > #define while_direct { 0, 2, false }
> > +#define while_len_direct { 0, 0, false }
> > #define fold_extract_direct { 2, 2, false }
> > #define fold_left_direct { 1, 1, false }
> > #define mask_fold_left_direct { 1, 1, false }
> > @@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> >      emit_move_insn (lhs_rtx, ops[0].value);
> > }
> > +/* Expand WHILE_LEN call STMT using optab OPTAB.  */
> > +static void
> > +expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> > +{
> > +  expand_operand ops[3];
> > +  tree rhs_type[2];
> > +
> > +  tree lhs = gimple_call_lhs (stmt);
> > +  tree lhs_type = TREE_TYPE (lhs);
> > +  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > +  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
> > +
> > +  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
> > +    {
> > +      tree rhs = gimple_call_arg (stmt, i);
> > +      rhs_type[i] = TREE_TYPE (rhs);
> > +      rtx rhs_rtx = expand_normal (rhs);
> > +      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
> > +    }
> > +
> > +  insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
> > +
> > +  expand_insn (icode, 3, ops);
> > +  if (!rtx_equal_p (lhs_rtx, ops[0].value))
> > +    emit_move_insn (lhs_rtx, ops[0].value);
> > +}
> > +
> > /* Expand a call to a convert-like optab using the operands in STMT.
> >     FN has a single output operand and NARGS input operands.  */
> > @@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
> > #define direct_scatter_store_optab_supported_p convert_optab_supported_p
> > #define direct_len_store_optab_supported_p direct_optab_supported_p
> > #define direct_while_optab_supported_p convert_optab_supported_p
> > +#define direct_while_len_optab_supported_p direct_optab_supported_p
> > #define direct_fold_extract_optab_supported_p direct_optab_supported_p
> > #define direct_fold_left_optab_supported_p direct_optab_supported_p
> > #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
> > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > index 7fe742c2ae7..3a933abff5d 100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
> > DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
> > DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> > +DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
> > DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
> >        check_raw_ptrs, check_ptrs)
> > DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > index 695f5911b30..f5938bd2c24 100644
> > --- a/gcc/optabs.def
> > +++ b/gcc/optabs.def
> > @@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
> > OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
> > OPTAB_D (len_load_optab, "len_load_$a")
> > OPTAB_D (len_store_optab, "len_store_$a")
> > +OPTAB_D (while_len_optab, "while_len$a")
> > diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> > index 09acc1c94cc..cdbf280e249 100644
> > --- a/gcc/tree-ssa-loop-manip.cc
> > +++ b/gcc/tree-ssa-loop-manip.cc
> > @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
> > void
> > create_iv (tree base, tree step, tree var, class loop *loop,
> >    gimple_stmt_iterator *incr_pos, bool after,
> > -    tree *var_before, tree *var_after)
> > +    tree *var_before, tree *var_after, enum tree_code code)
> > {
> >    gassign *stmt;
> >    gphi *phi;
> >    tree initial, step1;
> >    gimple_seq stmts;
> >    tree vb, va;
> > -  enum tree_code incr_op = PLUS_EXPR;
> > +  enum tree_code incr_op = code;
> >    edge pe = loop_preheader_edge (loop);
> >    if (var != NULL_TREE)
> > diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> > index d49273a3987..da755320a3a 100644
> > --- a/gcc/tree-ssa-loop-manip.h
> > +++ b/gcc/tree-ssa-loop-manip.h
> > @@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
> > typedef void (*transform_callback)(class loop *, void *);
> > extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
> > -        bool, tree *, tree *);
> > +        bool, tree *, tree *, enum tree_code = PLUS_EXPR);
> > extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
> > extern void verify_loop_closed_ssa (bool, class loop * = NULL);
> > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > index f60fa50e8f4..f3cd6c51d2e 100644
> > --- a/gcc/tree-vect-loop-manip.cc
> > +++ b/gcc/tree-vect-loop-manip.cc
> > @@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> >    return next_ctrl;
> > }
> > +/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
> > +   for all the rgroup controls in RGC and return a control that is nonzero
> > +   when the loop needs to iterate.  Add any new preheader statements to
> > +   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
> > +
> > +   RGC belongs to loop LOOP.  The loop originally iterated NITERS
> > +   times and has been vectorized according to LOOP_VINFO.
> > +
> > +   Unlike vect_set_loop_controls_directly which is iterating from 0-based IV
> > +   to TEST_LIMIT - bias.
> > +
> > +   In vect_set_loop_controls_by_while_len, we are iterating from start at
> > +   IV = TEST_LIMIT - bias and keep subtract IV by the length calculated by
> > +   IFN_WHILE_LEN pattern.
> > +
> > +   Note: the cost of the code generated by this function is modeled
> > +   by vect_estimate_min_profitable_iters, so changes here may need
> > +   corresponding changes there.
> > +
> > +   1. Single rgroup, the Gimple IR should be:
> > +
> > + <bb 3>
> > + _19 = (unsigned long) n_5(D);
> > + ...
> > +
> > + <bb 4>:
> > + ...
> > + # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
> > + ...
> > + _22 = .WHILE_LEN (ivtmp_20, vf);
> > + ...
> > + vector statement (use _22);
> > + ...
> > + ivtmp_21 = ivtmp_20 - _22;
> > + ...
> > + if (ivtmp_21 != 0)
> > +   goto <bb 4>; [75.00%]
> > + else
> > +   goto <bb 5>; [25.00%]
> > +
> > + <bb 5>
> > + return;
> > +
> > +   Note: IFN_WHILE_LEN will guarantee "ivtmp_21 = ivtmp_20 - _22" never
> > +   underflow 0.
> > +
> > +   2. Multiple rgroup, the Gimple IR should be:
> > +
> > + <bb 3>
> > + _70 = (unsigned long) bnd.7_52;
> > + _71 = _70 * 2;
> > + _72 = MAX_EXPR <_71, 4>;
> > + _73 = _72 + 18446744073709551612;
> > + ...
> > +
> > + <bb 4>:
> > + ...
> > + # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
> > + # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
> > + _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
> > + _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
> > + ...
> > + vector statement (use _79);
> > + ...
> > + vector statement (use _76);
> > + ...
> > + _65 = _79 / 2;
> > + vector statement (use _65);
> > + ...
> > + _68 = _76 / 2;
> > + vector statement (use _68);
> > + ...
> > + ivtmp_78 = ivtmp_77 - _79;
> > + ivtmp_75 = ivtmp_74 - _76;
> > + ...
> > + if (ivtmp_78 != 0)
> > +   goto <bb 4>; [75.00%]
> > + else
> > +   goto <bb 5>; [25.00%]
> > +
> > + <bb 5>
> > + return;
> > +
> > +*/
> > +
> > +static tree
> > +vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
> > +      gimple_seq *preheader_seq,
> > +      gimple_seq *header_seq,
> > +      rgroup_controls *rgc, tree niters)
> > +{
> > +  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > +  /* We are not allowing masked approach in WHILE_LEN.  */
> > +  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > +
> > +  tree ctrl_type = rgc->type;
> > +  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
> > +  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
> > +  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > +
> > +  /* Calculate the maximum number of item values that the rgroup
> > +     handles in total, the number that it handles for each iteration
> > +     of the vector loop.  */
> > +  tree nitems_total = niters;
> > +  if (nitems_per_iter != 1)
> > +    {
> > +      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
> > + these multiplications don't overflow.  */
> > +      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
> > +      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
> > +    nitems_total, compare_factor);
> > +    }
> > +
> > +  /* Convert the comparison value to the IV type (either a no-op or
> > +     a promotion).  */
> > +  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> > +
> > +  /* Create an induction variable that counts the number of items
> > +     processed.  */
> > +  tree index_before_incr, index_after_incr;
> > +  gimple_stmt_iterator incr_gsi;
> > +  bool insert_after;
> > +  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> > +
> > +  /* Test the decremented IV, which will never underflow 0 since we have
> > +     IFN_WHILE_LEN to gurantee that.  */
> > +  tree test_limit = nitems_total;
> > +
> > +  /* Provide a definition of each control in the group.  */
> > +  tree ctrl;
> > +  unsigned int i;
> > +  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
> > +    {
> > +      /* Previous controls will cover BIAS items.  This control covers the
> > + next batch.  */
> > +      poly_uint64 bias = nitems_per_ctrl * i;
> > +      tree bias_tree = build_int_cst (iv_type, bias);
> > +
> > +      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
> > + BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
> > + control and adjust the bound down by BIAS.  */
> > +      tree this_test_limit = test_limit;
> > +      if (i != 0)
> > + {
> > +   this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
> > +   this_test_limit, bias_tree);
> > +   this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
> > +   this_test_limit, bias_tree);
> > + }
> > +
> > +      /* Create decrement IV.  */
> > +      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
> > + insert_after, &index_before_incr, &index_after_incr,
> > + MINUS_EXPR);
> > +
> > +      poly_uint64 final_vf = vf * nitems_per_iter;
> > +      tree vf_step = build_int_cst (iv_type, final_vf);
> > +      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
> > +    index_before_incr, vf_step);
> > +      gassign *assign = gimple_build_assign (ctrl, res_len);
> > +      gimple_seq_add_stmt (header_seq, assign);
> > +    }
> > +
> > +  return index_after_incr;
> > +}
> > +
> > /* Set up the iteration condition and rgroup controls for LOOP, given
> >     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
> >     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
> > @@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> >    bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> >    tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> >    unsigned int compare_precision = TYPE_PRECISION (compare_type);
> >    tree orig_niters = niters;
> > @@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> > bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> > /* Set up all controls for this group.  */
> > - test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
> > -      &preheader_seq,
> > -      &header_seq,
> > -      loop_cond_gsi, rgc,
> > -      niters, niters_skip,
> > -      might_wrap_p);
> > + if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> > +     OPTIMIZE_FOR_SPEED))
> > +   test_ctrl
> > +     = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
> > +    &preheader_seq, &header_seq,
> > +    rgc, niters);
> > + else
> > +   test_ctrl
> > +     = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
> > +        &header_seq, loop_cond_gsi, rgc,
> > +        niters, niters_skip,
> > +        might_wrap_p);
> >        }
> >    /* Emit all accumulated statements.  */
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 1ba9f18d73e..5bffd9a6322 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> >     rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
> > tree
> > -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > -    unsigned int nvectors, unsigned int index)
> > +vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
> > +    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
> > +    unsigned int index)
> > {
> >    rgroup_controls *rgl = &(*lens)[nvectors - 1];
> > -  bool use_bias_adjusted_len =
> > -    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> > +  bool use_bias_adjusted_len
> > +    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> >    /* Populate the rgroup's len array, if this is the first time we've
> >       used it.  */
> > @@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> >   if (use_bias_adjusted_len)
> >     {
> >       gcc_assert (i == 0);
> > -       tree adjusted_len =
> > - make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> > +       tree adjusted_len
> > + = make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> >       SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
> >       rgl->bias_adjusted_ctrl = adjusted_len;
> >     }
> > @@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> >    if (use_bias_adjusted_len)
> >      return rgl->bias_adjusted_ctrl;
> > +  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> > +    OPTIMIZE_FOR_SPEED))
> > +    {
> > +      tree loop_len = rgl->controls[index];
> > +      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
> > +      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
> > +      if (maybe_ne (nunits1, nunits2))
> > + {
> > +   /* A loop len for data type X can be reused for data type Y
> > +      if X has N times more elements than Y and if Y's elements
> > +      are N times bigger than X's.  */
> > +   gcc_assert (multiple_p (nunits1, nunits2));
> > +   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
> > +   gimple_seq seq = NULL;
> > +   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
> > +    build_int_cst (iv_type, factor));
> > +   if (seq)
> > +     gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
> > + }
> > +      return loop_len;
> > +    }
> >    else
> >      return rgl->controls[index];
> > }
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > index efa2d0daa52..708c8a1d806 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
> >       else if (loop_lens)
> > {
> >   tree final_len
> > -     = vect_get_loop_len (loop_vinfo, loop_lens,
> > - vec_num * ncopies, vec_num * j + i);
> > +     = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> > + vec_num * ncopies, vectype,
> > + vec_num * j + i);
> >   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> >   machine_mode vmode = TYPE_MODE (vectype);
> >   opt_machine_mode new_ovmode
> > @@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
> >     else if (loop_lens && memory_access_type != VMAT_INVARIANT)
> >       {
> > tree final_len
> > -   = vect_get_loop_len (loop_vinfo, loop_lens,
> > -        vec_num * ncopies,
> > +   = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> > +        vec_num * ncopies, vectype,
> >        vec_num * j + i);
> > tree ptr = build_int_cst (ref_type,
> >   align * BITS_PER_UNIT);
> > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > index 9cf2fb23fe3..e5cf38caf4b 100644
> > --- a/gcc/tree-vectorizer.h
> > +++ b/gcc/tree-vectorizer.h
> > @@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
> > unsigned int, tree, unsigned int);
> > extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> >   tree, unsigned int);
> > -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> > -        unsigned int);
> > +extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
> > +        vec_loop_lens *, unsigned int, tree, unsigned int);
> > extern gimple_seq vect_gen_len (tree, tree, tree, tree);
> > extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> > extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
> 
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-12  8:00       ` juzhe.zhong
@ 2023-04-12  8:42         ` Richard Biener
  2023-04-12  9:15           ` juzhe.zhong
       [not found]           ` <2023041217154958074655@rivai.ai>
  0 siblings, 2 replies; 41+ messages in thread
From: Richard Biener @ 2023-04-12  8:42 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: richard.sandiford, gcc-patches, jeffreyalaw

On Wed, 12 Apr 2023, juzhe.zhong@rivai.ai wrote:

> Thank you very much for reply.
> 
> WHILE_LEN is the pattern that calculates the number of elements of the vector that will be updated in each iteration.
> For RVV, we use vsetvl instruction to calculate the number of the elements of the vector.
> 
> WHILE_ULT can not work for RVV since WHILE_ULT is generating mask to predicate vector operation, but RVV do not
> use mask to do the loop strip mining (RVV only use mask for control flow inside the loop).
> 
> Here is the example WHILE_ULT working in ARM SVE:
> https://godbolt.org/z/jKsT8E1hP 
> 
> The first example is:
> void foo (int32_t * __restrict a, int32_t * __restrict b, int n)
> {
>     for (int i = 0; i < n; i++)
>       a[i] = a[i] + b[i];
> }
> 
> ARM SVE:
> foo:
>         cmp     w2, 0
>         ble     .L1
>         mov     x3, 0
>         cntw    x4
>         whilelo p0.s, wzr, w2
> .L3:
>         ld1w    z1.s, p0/z, [x0, x3, lsl 2]
>         ld1w    z0.s, p0/z, [x1, x3, lsl 2]
>         add     z0.s, z0.s, z1.s
>         st1w    z0.s, p0, [x0, x3, lsl 2]
>         add     x3, x3, x4
>         whilelo p0.s, w3, w2
>         b.any   .L3
> .L1:
>         ret
> 
> Here, whilelo will generate the mask according to w3 to w2.
> So for example, if w3 = 0, and w2 = 3 (Suppose machine vector length > 3).
> Then it will generate a mask with 0b111 mask to predicate loads and stores.
> 
> For RVV, we can't do that since RVV doesn't have whilelo instructions to generate predicate mask.
> Also, we can't use mask as the predicate to do loop strip mining since RVV only has 1 single mask 
> to handle flow control  inside the loop.
> 
> Instead, we use vsetvl to do the strip mining, so base on this, the same C code, RVV ideal asm according RVV ISA should be:
> 
> preheader:
> a0 = n (the total number of the scalar should be calculated).
>  .....
> .L3:
>         vsetvli a5,a0,e32,m1,ta,ma    ====> WHILE_LEN pattern generate this instruction, calculate the number of the elements should be updated
>         vle32.v v1,0(a4)
>         sub     a0,a0,a5      ============> Decrement the induction variable by the a5 (generated by WHILE_LEN)
>         ....   
> 
>         vadd.vv....
>         vse32.v v1,0(a3)
>         add     a4,a4,a2
>         add     a3,a3,a2
>         bne     a0,zero,.L3
> .L1:
>         ret
> 
> So you will see, if n = 3 like I said for ARM SVE (Suppose machine vector length > 3), then vsetvli a5,a0,e32,m1,ta,ma will
> generate a5 = 3, then the vle32.v/vadd.vv/vse32.v are all doing the operation only on the element 0,  element 1, element 2.
> 
> Besides, WHILE_LEN is defined so that its result never exceeds the input operand, which is "a0".
> That means  sub     a0,a0,a5 will never make a0 underflow below 0.
> 
> I have tried to return Pmode in TARGET_VECTORIZE_GET_MASK_MODE 
> target hook and then use WHILE_ULT. 
> 
> But there are 2 issues:
> One is that current GCC runs the flow from a 0-based IV up to the TEST_LIMIT, whereas the optimal flow for RVV that I showed above
> starts from "n" and keeps decreasing n until 0.  To fit the current flow of GCC, RVV needs more instructions to do the loop strip mining.
> 
> Second is that if we return a Pmode in TARGET_VECTORIZE_GET_MASK_MODE 
> which not only specify the dest mode for WHILE_ULT but also the mask mode of flow control.
> If we return Pmode which is used as the length for RVV. We can't use mask mode like VNx2BI mode to do the flow control predicate.
> This another example:
> void foo2 (int32_t * __restrict a, int32_t * __restrict b, int32_t * restrict cond, int n)
> {
>     for (int i = 0; i < n; i++)
>       if (cond[i])
>         a[i] = a[i] + b[i];
> }
> 
> ARM SVE:
>         ld1w    z0.s, p0/z, [x2, x4, lsl 2]
>         cmpne   p0.s, p0/z, z0.s, #0
>         ld1w    z0.s, p0/z, [x0, x4, lsl 2]
>         ld1w    z1.s, p0/z, [x1, x4, lsl 2]
>         add     z0.s, z0.s, z1.s
>         st1w    z0.s, p0, [x0, x4, lsl 2]
>         add     x4, x4, x5
>         whilelo p0.s, w4, w3
>         b.any   .L8
> 
> Here we can see ARM uses mask mode for both loop strip mining and flow control.
> 
> Whereas RVV uses the length generated by vsetvl (WHILE_LEN) to do the loop strip mining and the mask generated by comparison to do the flow control.
> 
> So the ASM generated by my downstream LLVM/GCC:
> .L3:
>         vsetvli a6,a3,e32,m1,ta,mu   ==========> generate length to predicate RVV operation. 
>         vle32.v v0,(a2)
>         sub     a3,a3,a6      ==========> decrease the induction variable until 0.
>         vmsne.vi        v0,v0,0   ==========> generate mask to predicate RVV operation. 
>         vle32.v v24,(a0),v0.t   ===========> here using v0.t is the only mask register to predicate RVV operation
>         vle32.v v25,(a1),v0.t
>         vadd.vv v24,v24,v25
>         vse32.v v24,(a0),v0.t
>         add     a2,a2,a4
>         add     a0,a0,a4
>         add     a1,a1,a4
>         bne     a3,zero,.L3
> .L1:
>         ret
> 
> 
> This is the how RVV works.
> Feel free to comment if you have any questions.

Thanks for the detailed explanation.  Just to clarify - with RVV
there's only a single mask register, v0.t, or did you want to
say an instruction can only specify a single mask register?
ARM SVE would have a loop control mask and a separate mask
for the if (cond[i]) which would be combined with a mask-and
instruction to a third mask which is then used on the
predicated instructions.

For AVX512 WHILE_ULT is a better match since we need a mask in the
end (but WHILE_ULT isn't a very good match either, so I'm still
working on masked loop support there).

PowerPC and s390x might be able to use WHILE_LEN as well (though
they only have LEN variants of loads and stores) - of course
only "simulating it".  For the fixed-vector-length ISAs the
predicated vector loop IMHO makes most sense for the epilogue to
handle low-trip loops better.

Richard.

> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-04-12 15:00
> To: Richard Sandiford
> CC: juzhe.zhong@rivai.ai; gcc-patches; jeffreyalaw
> Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> On Tue, 11 Apr 2023, Richard Sandiford wrote:
>  
> > "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> > > Hi, Richards. 
> > > Kindly Ping this patch. 
> > > This is the most important patch for RVV auto-vectorization support.
> > > Bootstraped on X86 has passed.
> > 
> > Can it wait for GCC 14?  It doesn't seem like stage 4 material.
> > 
> > Also, pinging after 5 days seems a bit soon.  It's been a 4-day
> > holiday weekend for much of Europe.
>  
> Also can you explain why using WHILE_ULT is not possible?  (I've
> successfully - to some extent - done that for AVX512 for example)
>  
> The patch lacks the description of what WHILE_LEN actually is.
>  
> Richard.
>  
> > Thanks,
> > Richard
> > 
> > > Feel free to comments.
> > >
> > > Thanks.
> > >
> > >
> > > juzhe.zhong@rivai.ai
> > >  
> > > From: juzhe.zhong
> > > Date: 2023-04-07 09:47
> > > To: gcc-patches
> > > CC: richard.sandiford; rguenther; jeffreyalaw; Juzhe-Zhong
> > > Subject: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> > > From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> > >  
> > > This patch is to add WHILE_LEN pattern.
> > > It's inspired by RVV ISA simple "vvaddint32.s" example:
> > > https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
> > >  
> > > More details are in "vect_set_loop_controls_by_while_len" implementation
> > > and comments.
> > >  
> > > Consider such following case:
> > > #define N 16
> > > int src[N];
> > > int dest[N];
> > >  
> > > void
> > > foo (int n)
> > > {
> > >   for (int i = 0; i < n; i++)
> > >     dest[i] = src[i];
> > > }
> > >  
> > > -march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:
> > >  
> > > foo:        
> > >         ble     a0,zero,.L1
> > >         lui     a4,%hi(.LANCHOR0)
> > >         addi    a4,a4,%lo(.LANCHOR0)
> > >         addi    a3,a4,64
> > >         csrr    a2,vlenb
> > > .L3:
> > >         vsetvli a5,a0,e32,m1,ta,ma
> > >         vle32.v v1,0(a4)
> > >         sub     a0,a0,a5
> > >         vse32.v v1,0(a3)
> > >         add     a4,a4,a2
> > >         add     a3,a3,a2
> > >         bne     a0,zero,.L3
> > > .L1:
> > >         ret
> > >  
> > > gcc/ChangeLog:
> > >  
> > >         * doc/md.texi: Add WHILE_LEN support.
> > >         * internal-fn.cc (while_len_direct): Ditto.
> > >         (expand_while_len_optab_fn): Ditto.
> > >         (direct_while_len_optab_supported_p): Ditto.
> > >         * internal-fn.def (WHILE_LEN): Ditto.
> > >         * optabs.def (OPTAB_D): Ditto.
> > >         * tree-ssa-loop-manip.cc (create_iv): Ditto.
> > >         * tree-ssa-loop-manip.h (create_iv): Ditto.
> > >         * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
> > >         (vect_set_loop_condition_partial_vectors): Ditto.
> > >         * tree-vect-loop.cc (vect_get_loop_len): Ditto.
> > >         * tree-vect-stmts.cc (vectorizable_store): Ditto.
> > >         (vectorizable_load): Ditto.
> > >         * tree-vectorizer.h (vect_get_loop_len): Ditto.
> > >  
> > > ---
> > > gcc/doc/md.texi             |  14 +++
> > > gcc/internal-fn.cc          |  29 ++++++
> > > gcc/internal-fn.def         |   1 +
> > > gcc/optabs.def              |   1 +
> > > gcc/tree-ssa-loop-manip.cc  |   4 +-
> > > gcc/tree-ssa-loop-manip.h   |   2 +-
> > > gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
> > > gcc/tree-vect-loop.cc       |  35 +++++--
> > > gcc/tree-vect-stmts.cc      |   9 +-
> > > gcc/tree-vectorizer.h       |   4 +-
> > > 10 files changed, 264 insertions(+), 21 deletions(-)
> > >  
> > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> > > index 8e3113599fd..72178ab014c 100644
> > > --- a/gcc/doc/md.texi
> > > +++ b/gcc/doc/md.texi
> > > @@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
> > >    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
> > > @end smallexample
> > > +@cindex @code{while_len@var{m}@var{n}} instruction pattern
> > > +@item @code{while_len@var{m}@var{n}}
> > > +Set operand 0 to the number of active elements in vector will be updated value.
> > > +operand 1 is the total elements need to be updated value.
> > > +operand 2 is the vectorization factor.
> > > +The operation is equivalent to:
> > > +
> > > +@smallexample
> > > +operand0 = MIN (operand1, operand2);
> > > +operand2 can be const_poly_int or poly_int related to vector mode size.
> > > +Some target like RISC-V has a standalone instruction to get MIN (n, MODE SIZE) so
> > > +that we can reduce a use of general purpose register.
> > > +@end smallexample
> > > +
> > > @cindex @code{check_raw_ptrs@var{m}} instruction pattern
> > > @item @samp{check_raw_ptrs@var{m}}
> > > Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
> > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > index 6e81dc05e0e..5f44def90d3 100644
> > > --- a/gcc/internal-fn.cc
> > > +++ b/gcc/internal-fn.cc
> > > @@ -127,6 +127,7 @@ init_internal_fns ()
> > > #define cond_binary_direct { 1, 1, true }
> > > #define cond_ternary_direct { 1, 1, true }
> > > #define while_direct { 0, 2, false }
> > > +#define while_len_direct { 0, 0, false }
> > > #define fold_extract_direct { 2, 2, false }
> > > #define fold_left_direct { 1, 1, false }
> > > #define mask_fold_left_direct { 1, 1, false }
> > > @@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> > >      emit_move_insn (lhs_rtx, ops[0].value);
> > > }
> > > +/* Expand WHILE_LEN call STMT using optab OPTAB.  */
> > > +static void
> > > +expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> > > +{
> > > +  expand_operand ops[3];
> > > +  tree rhs_type[2];
> > > +
> > > +  tree lhs = gimple_call_lhs (stmt);
> > > +  tree lhs_type = TREE_TYPE (lhs);
> > > +  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > +  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
> > > +
> > > +  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
> > > +    {
> > > +      tree rhs = gimple_call_arg (stmt, i);
> > > +      rhs_type[i] = TREE_TYPE (rhs);
> > > +      rtx rhs_rtx = expand_normal (rhs);
> > > +      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
> > > +    }
> > > +
> > > +  insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
> > > +
> > > +  expand_insn (icode, 3, ops);
> > > +  if (!rtx_equal_p (lhs_rtx, ops[0].value))
> > > +    emit_move_insn (lhs_rtx, ops[0].value);
> > > +}
> > > +
> > > /* Expand a call to a convert-like optab using the operands in STMT.
> > >     FN has a single output operand and NARGS input operands.  */
> > > @@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
> > > #define direct_scatter_store_optab_supported_p convert_optab_supported_p
> > > #define direct_len_store_optab_supported_p direct_optab_supported_p
> > > #define direct_while_optab_supported_p convert_optab_supported_p
> > > +#define direct_while_len_optab_supported_p direct_optab_supported_p
> > > #define direct_fold_extract_optab_supported_p direct_optab_supported_p
> > > #define direct_fold_left_optab_supported_p direct_optab_supported_p
> > > #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
> > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > index 7fe742c2ae7..3a933abff5d 100644
> > > --- a/gcc/internal-fn.def
> > > +++ b/gcc/internal-fn.def
> > > @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
> > > DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
> > > DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> > > +DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
> > > DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
> > >        check_raw_ptrs, check_ptrs)
> > > DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > index 695f5911b30..f5938bd2c24 100644
> > > --- a/gcc/optabs.def
> > > +++ b/gcc/optabs.def
> > > @@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
> > > OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
> > > OPTAB_D (len_load_optab, "len_load_$a")
> > > OPTAB_D (len_store_optab, "len_store_$a")
> > > +OPTAB_D (while_len_optab, "while_len$a")
> > > diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> > > index 09acc1c94cc..cdbf280e249 100644
> > > --- a/gcc/tree-ssa-loop-manip.cc
> > > +++ b/gcc/tree-ssa-loop-manip.cc
> > > @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
> > > void
> > > create_iv (tree base, tree step, tree var, class loop *loop,
> > >    gimple_stmt_iterator *incr_pos, bool after,
> > > -    tree *var_before, tree *var_after)
> > > +    tree *var_before, tree *var_after, enum tree_code code)
> > > {
> > >    gassign *stmt;
> > >    gphi *phi;
> > >    tree initial, step1;
> > >    gimple_seq stmts;
> > >    tree vb, va;
> > > -  enum tree_code incr_op = PLUS_EXPR;
> > > +  enum tree_code incr_op = code;
> > >    edge pe = loop_preheader_edge (loop);
> > >    if (var != NULL_TREE)
> > > diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> > > index d49273a3987..da755320a3a 100644
> > > --- a/gcc/tree-ssa-loop-manip.h
> > > +++ b/gcc/tree-ssa-loop-manip.h
> > > @@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
> > > typedef void (*transform_callback)(class loop *, void *);
> > > extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
> > > -        bool, tree *, tree *);
> > > +        bool, tree *, tree *, enum tree_code = PLUS_EXPR);
> > > extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
> > > extern void verify_loop_closed_ssa (bool, class loop * = NULL);
> > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > > index f60fa50e8f4..f3cd6c51d2e 100644
> > > --- a/gcc/tree-vect-loop-manip.cc
> > > +++ b/gcc/tree-vect-loop-manip.cc
> > > @@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> > >    return next_ctrl;
> > > }
> > > +/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
> > > +   for all the rgroup controls in RGC and return a control that is nonzero
> > > +   when the loop needs to iterate.  Add any new preheader statements to
> > > +   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
> > > +
> > > +   RGC belongs to loop LOOP.  The loop originally iterated NITERS
> > > +   times and has been vectorized according to LOOP_VINFO.
> > > +
> > > +   Unlike vect_set_loop_controls_directly which is iterating from 0-based IV
> > > +   to TEST_LIMIT - bias.
> > > +
> > > +   In vect_set_loop_controls_by_while_len, we are iterating from start at
> > > +   IV = TEST_LIMIT - bias and keep subtract IV by the length calculated by
> > > +   IFN_WHILE_LEN pattern.
> > > +
> > > +   Note: the cost of the code generated by this function is modeled
> > > +   by vect_estimate_min_profitable_iters, so changes here may need
> > > +   corresponding changes there.
> > > +
> > > +   1. Single rgroup, the Gimple IR should be:
> > > +
> > > + <bb 3>
> > > + _19 = (unsigned long) n_5(D);
> > > + ...
> > > +
> > > + <bb 4>:
> > > + ...
> > > + # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
> > > + ...
> > > + _22 = .WHILE_LEN (ivtmp_20, vf);
> > > + ...
> > > + vector statement (use _22);
> > > + ...
> > > + ivtmp_21 = ivtmp_20 - _22;
> > > + ...
> > > + if (ivtmp_21 != 0)
> > > +   goto <bb 4>; [75.00%]
> > > + else
> > > +   goto <bb 5>; [25.00%]
> > > +
> > > + <bb 5>
> > > + return;
> > > +
> > > +   Note: IFN_WHILE_LEN will guarantee "ivtmp_21 = ivtmp_20 - _22" never
> > > +   underflow 0.
> > > +
> > > +   2. Multiple rgroup, the Gimple IR should be:
> > > +
> > > + <bb 3>
> > > + _70 = (unsigned long) bnd.7_52;
> > > + _71 = _70 * 2;
> > > + _72 = MAX_EXPR <_71, 4>;
> > > + _73 = _72 + 18446744073709551612;
> > > + ...
> > > +
> > > + <bb 4>:
> > > + ...
> > > + # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
> > > + # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
> > > + _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
> > > + _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
> > > + ...
> > > + vector statement (use _79);
> > > + ...
> > > + vector statement (use _76);
> > > + ...
> > > + _65 = _79 / 2;
> > > + vector statement (use _65);
> > > + ...
> > > + _68 = _76 / 2;
> > > + vector statement (use _68);
> > > + ...
> > > + ivtmp_78 = ivtmp_77 - _79;
> > > + ivtmp_75 = ivtmp_74 - _76;
> > > + ...
> > > + if (ivtmp_78 != 0)
> > > +   goto <bb 4>; [75.00%]
> > > + else
> > > +   goto <bb 5>; [25.00%]
> > > +
> > > + <bb 5>
> > > + return;
> > > +
> > > +*/
> > > +
> > > +static tree
> > > +vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
> > > +      gimple_seq *preheader_seq,
> > > +      gimple_seq *header_seq,
> > > +      rgroup_controls *rgc, tree niters)
> > > +{
> > > +  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> > > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > > +  /* We are not allowing masked approach in WHILE_LEN.  */
> > > +  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > > +
> > > +  tree ctrl_type = rgc->type;
> > > +  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
> > > +  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
> > > +  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > > +
> > > +  /* Calculate the maximum number of item values that the rgroup
> > > +     handles in total, the number that it handles for each iteration
> > > +     of the vector loop.  */
> > > +  tree nitems_total = niters;
> > > +  if (nitems_per_iter != 1)
> > > +    {
> > > +      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
> > > + these multiplications don't overflow.  */
> > > +      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
> > > +      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
> > > +    nitems_total, compare_factor);
> > > +    }
> > > +
> > > +  /* Convert the comparison value to the IV type (either a no-op or
> > > +     a promotion).  */
> > > +  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> > > +
> > > +  /* Create an induction variable that counts the number of items
> > > +     processed.  */
> > > +  tree index_before_incr, index_after_incr;
> > > +  gimple_stmt_iterator incr_gsi;
> > > +  bool insert_after;
> > > +  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> > > +
> > > +  /* Test the decremented IV, which will never underflow 0 since we have
> > > +     IFN_WHILE_LEN to gurantee that.  */
> > > +  tree test_limit = nitems_total;
> > > +
> > > +  /* Provide a definition of each control in the group.  */
> > > +  tree ctrl;
> > > +  unsigned int i;
> > > +  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
> > > +    {
> > > +      /* Previous controls will cover BIAS items.  This control covers the
> > > + next batch.  */
> > > +      poly_uint64 bias = nitems_per_ctrl * i;
> > > +      tree bias_tree = build_int_cst (iv_type, bias);
> > > +
> > > +      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
> > > + BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
> > > + control and adjust the bound down by BIAS.  */
> > > +      tree this_test_limit = test_limit;
> > > +      if (i != 0)
> > > + {
> > > +   this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
> > > +   this_test_limit, bias_tree);
> > > +   this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
> > > +   this_test_limit, bias_tree);
> > > + }
> > > +
> > > +      /* Create decrement IV.  */
> > > +      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
> > > + insert_after, &index_before_incr, &index_after_incr,
> > > + MINUS_EXPR);
> > > +
> > > +      poly_uint64 final_vf = vf * nitems_per_iter;
> > > +      tree vf_step = build_int_cst (iv_type, final_vf);
> > > +      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
> > > +    index_before_incr, vf_step);
> > > +      gassign *assign = gimple_build_assign (ctrl, res_len);
> > > +      gimple_seq_add_stmt (header_seq, assign);
> > > +    }
> > > +
> > > +  return index_after_incr;
> > > +}
> > > +
> > > /* Set up the iteration condition and rgroup controls for LOOP, given
> > >     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
> > >     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
> > > @@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> > >    bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> > >    tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> > > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > >    unsigned int compare_precision = TYPE_PRECISION (compare_type);
> > >    tree orig_niters = niters;
> > > @@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> > > bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> > > /* Set up all controls for this group.  */
> > > - test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
> > > -      &preheader_seq,
> > > -      &header_seq,
> > > -      loop_cond_gsi, rgc,
> > > -      niters, niters_skip,
> > > -      might_wrap_p);
> > > + if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> > > +     OPTIMIZE_FOR_SPEED))
> > > +   test_ctrl
> > > +     = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
> > > +    &preheader_seq, &header_seq,
> > > +    rgc, niters);
> > > + else
> > > +   test_ctrl
> > > +     = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
> > > +        &header_seq, loop_cond_gsi, rgc,
> > > +        niters, niters_skip,
> > > +        might_wrap_p);
> > >        }
> > >    /* Emit all accumulated statements.  */
> > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > index 1ba9f18d73e..5bffd9a6322 100644
> > > --- a/gcc/tree-vect-loop.cc
> > > +++ b/gcc/tree-vect-loop.cc
> > > @@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > >     rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
> > > tree
> > > -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > > -    unsigned int nvectors, unsigned int index)
> > > +vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
> > > +    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
> > > +    unsigned int index)
> > > {
> > >    rgroup_controls *rgl = &(*lens)[nvectors - 1];
> > > -  bool use_bias_adjusted_len =
> > > -    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> > > +  bool use_bias_adjusted_len
> > > +    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> > > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > >    /* Populate the rgroup's len array, if this is the first time we've
> > >       used it.  */
> > > @@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > >   if (use_bias_adjusted_len)
> > >     {
> > >       gcc_assert (i == 0);
> > > -       tree adjusted_len =
> > > - make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> > > +       tree adjusted_len
> > > + = make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> > >       SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
> > >       rgl->bias_adjusted_ctrl = adjusted_len;
> > >     }
> > > @@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > >    if (use_bias_adjusted_len)
> > >      return rgl->bias_adjusted_ctrl;
> > > +  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> > > +    OPTIMIZE_FOR_SPEED))
> > > +    {
> > > +      tree loop_len = rgl->controls[index];
> > > +      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
> > > +      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
> > > +      if (maybe_ne (nunits1, nunits2))
> > > + {
> > > +   /* A loop len for data type X can be reused for data type Y
> > > +      if X has N times more elements than Y and if Y's elements
> > > +      are N times bigger than X's.  */
> > > +   gcc_assert (multiple_p (nunits1, nunits2));
> > > +   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
> > > +   gimple_seq seq = NULL;
> > > +   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
> > > +    build_int_cst (iv_type, factor));
> > > +   if (seq)
> > > +     gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
> > > + }
> > > +      return loop_len;
> > > +    }
> > >    else
> > >      return rgl->controls[index];
> > > }
> > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > index efa2d0daa52..708c8a1d806 100644
> > > --- a/gcc/tree-vect-stmts.cc
> > > +++ b/gcc/tree-vect-stmts.cc
> > > @@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
> > >       else if (loop_lens)
> > > {
> > >   tree final_len
> > > -     = vect_get_loop_len (loop_vinfo, loop_lens,
> > > - vec_num * ncopies, vec_num * j + i);
> > > +     = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> > > + vec_num * ncopies, vectype,
> > > + vec_num * j + i);
> > >   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> > >   machine_mode vmode = TYPE_MODE (vectype);
> > >   opt_machine_mode new_ovmode
> > > @@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
> > >     else if (loop_lens && memory_access_type != VMAT_INVARIANT)
> > >       {
> > > tree final_len
> > > -   = vect_get_loop_len (loop_vinfo, loop_lens,
> > > -        vec_num * ncopies,
> > > +   = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> > > +        vec_num * ncopies, vectype,
> > >        vec_num * j + i);
> > > tree ptr = build_int_cst (ref_type,
> > >   align * BITS_PER_UNIT);
> > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > index 9cf2fb23fe3..e5cf38caf4b 100644
> > > --- a/gcc/tree-vectorizer.h
> > > +++ b/gcc/tree-vectorizer.h
> > > @@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
> > > unsigned int, tree, unsigned int);
> > > extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> > >   tree, unsigned int);
> > > -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> > > -        unsigned int);
> > > +extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
> > > +        vec_loop_lens *, unsigned int, tree, unsigned int);
> > > extern gimple_seq vect_gen_len (tree, tree, tree, tree);
> > > extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> > > extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
> > 
>  
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-12  8:42         ` Richard Biener
@ 2023-04-12  9:15           ` juzhe.zhong
  2023-04-12  9:29             ` Richard Biener
       [not found]           ` <2023041217154958074655@rivai.ai>
  1 sibling, 1 reply; 41+ messages in thread
From: juzhe.zhong @ 2023-04-12  9:15 UTC (permalink / raw)
  To: rguenther; +Cc: richard.sandiford, gcc-patches, jeffreyalaw

[-- Attachment #1: Type: text/plain, Size: 32531 bytes --]


>> Thanks for the detailed explanation.  Just to clarify - with RVV
>> there's only a single mask register, v0.t, or did you want to
>> say an instruction can only specify a single mask register?

RVV has 32 vector registers (v0~v31) in total.
We can store a vector data value or a mask value in any of them.
We also have mask-logical instructions, for example mask-and, that operate between any vector registers.

However, any vector operation, for example vadd.vv, can only be predicated by v0 (written v0.t in asm), which is the first vector register.
We can't predicate vadd.vv with v1 - v31.

So, you can imagine that every time we want to use a mask to predicate a vector operation, we must always first store the mask value
into v0.

So, we can write intrinsic sequence like this:

vmseq v0,v8,v9 (store mask value to v0)
vmslt v1,v10,v11 (store mask value to v1)
vmand v0,v0,v1
vadd.vv ...v0.t (predicate mask should always be v0).

>> ARM SVE would have a loop control mask and a separate mask
>> for the if (cond[i]) which would be combined with a mask-and
>> instruction to a third mask which is then used on the
>> predicated instructions.

Yeah, I know. The ARM SVE way is more elegant than what RVV does.
However, for RVV, we can't follow this flow.
We don't have a "whilelo" instruction to generate a loop control mask.
We can only do loop control with the length generated by vsetvl.
And we can only use "v0" as the mask to predicate vadd.vv, and a mask value can only be generated by comparison or mask-logical instructions.

>> PowerPC and s390x might be able to use WHILE_LEN as well (though
>> they only have LEN variants of loads and stores) - of course
>> only "simulating it".  For the fixed-vector-length ISAs the
>> predicated vector loop IMHO makes most sense for the epilogue to
>> handle low-trip loops better.

Yeah, I wonder how they do the flow control (if (cond[i])). 
For RVV, you can imagine I will need to add LEN_MASK_LOAD/LEN_MASK_STORE patterns (length generated by WHILE_LEN and mask generated by comparison).

I think we can CC the IBM folks to see whether we can make WHILE_LEN work
for both IBM and RVV?

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-04-12 16:42
To: juzhe.zhong@rivai.ai
CC: richard.sandiford; gcc-patches; jeffreyalaw
Subject: Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
On Wed, 12 Apr 2023, juzhe.zhong@rivai.ai wrote:
 
> Thank you very much for reply.
> 
> WHILE_LEN is the pattern that calculates the number of the elements of the vector will be updated in each iteration.
> For RVV, we use vsetvl instruction to calculate the number of the elements of the vector.
> 
> WHILE_ULT can not work for RVV since WHILE_ULT is generating mask to predicate vector operation, but RVV do not
> use mask to do the loop strip mining (RVV only use mask for control flow inside the loop).
> 
> Here is the example WHILE_ULT working in ARM SVE:
> https://godbolt.org/z/jKsT8E1hP 
> 
> The first example is:
> void foo (int32_t * __restrict a, int32_t * __restrict b, int n)
> {
>     for (int i = 0; i < n; i++)
>       a[i] = a[i] + b[i];
> }
> 
> ARM SVE:
> foo:
>         cmp     w2, 0
>         ble     .L1
>         mov     x3, 0
>         cntw    x4
>         whilelo p0.s, wzr, w2
> .L3:
>         ld1w    z1.s, p0/z, [x0, x3, lsl 2]
>         ld1w    z0.s, p0/z, [x1, x3, lsl 2]
>         add     z0.s, z0.s, z1.s
>         st1w    z0.s, p0, [x0, x3, lsl 2]
>         add     x3, x3, x4
>         whilelo p0.s, w3, w2
>         b.any   .L3
> .L1:
>         ret
> 
> Here, whilelo will generate the mask according to w3 to w2.
> So for example, if w3 = 0, and w2 = 3 (Suppose machine vector length > 3).
> Then it will generate a mask with 0b111 mask to predicate loads and stores.
> 
> For RVV, we can't do that since RVV doesn't have whilelo instructions to generate predicate mask.
> Also, we can't use mask as the predicate to do loop strip mining since RVV only has 1 single mask 
> to handle flow control  inside the loop.
> 
> Instead, we use vsetvl to do the strip mining, so base on this, the same C code, RVV ideal asm according RVV ISA should be:
> 
> preheader:
> a0 = n (the total number of the scalar should be calculated).
>  .....
> .L3:
>         vsetvli a5,a0,e32,m1,ta,ma    ====> WHILE_LEN pattern generate this instruction, calculate the number of the elements should be updated
>         vle32.v v1,0(a4)
>         sub     a0,a0,a5      ============> Decrement the induction variable by the a5 (generated by WHILE_LEN)
>         ....   
> 
>         vadd.vv....
>         vse32.v v1,0(a3)
>         add     a4,a4,a2
>         add     a3,a3,a2
>         bne     a0,zero,.L3
> .L1:
>         ret
> 
> So you will see, if n = 3 like I said for ARM SVE (Suppose machine vector length > 3), then vsetvli a5,a0,e32,m1,ta,ma will
> generate a5 = 3, then the vle32.v/vadd.vv/vse32.v are all doing the operation only on the element 0,  element 1, element 2.
> 
> Besides, WHILE_LEN is defined so that its result never exceeds the input operand, which is "a0".
> That means  sub     a0,a0,a5 will never make a0 underflow below 0.
> 
> I have tried to return Pmode in TARGET_VECTORIZE_GET_MASK_MODE 
> target hook and then use WHILE_ULT. 
> 
> But there are 2 issues:
> One is that current GCC generates a flow that counts up from 0 until TEST_LIMIT, whereas the optimal flow for RVV that I showed above
> starts from "n" and keeps decreasing it until 0.  Trying to fit the current flow of GCC, RVV needs more instructions to do the loop strip mining.
> 
> The second is that the mode returned by TARGET_VECTORIZE_GET_MASK_MODE
> specifies not only the dest mode for WHILE_ULT but also the mask mode used for flow control.
> If we return Pmode, which is used as the length for RVV, we can't use a mask mode like VNx2BI to do the flow control predication.
> Here is another example:
> void foo2 (int32_t * __restrict a, int32_t * __restrict b, int32_t * restrict cond, int n)
> {
>     for (int i = 0; i < n; i++)
>       if (cond[i])
>         a[i] = a[i] + b[i];
> }
> 
> ARM SVE:
>         ld1w    z0.s, p0/z, [x2, x4, lsl 2]
>         cmpne   p0.s, p0/z, z0.s, #0
>         ld1w    z0.s, p0/z, [x0, x4, lsl 2]
>         ld1w    z1.s, p0/z, [x1, x4, lsl 2]
>         add     z0.s, z0.s, z1.s
>         st1w    z0.s, p0, [x0, x4, lsl 2]
>         add     x4, x4, x5
>         whilelo p0.s, w4, w3
>         b.any   .L8
> 
> Here we can see ARM uses mask mode for both loop strip mining and flow control.
> 
> Whereas RVV uses the length generated by vsetvl (WHILE_LEN) to do the loop strip mining and a mask generated by comparison to do the flow control.
> 
> So the ASM generated by my downstream LLVM/GCC:
> .L3:
>         vsetvli a6,a3,e32,m1,ta,mu   ==========> generate length to predicate RVV operation. 
>         vle32.v v0,(a2)
>         sub     a3,a3,a6      ==========> decrease the induction variable until 0.
>         vmsne.vi        v0,v0,0   ==========> generate mask to predicate RVV operation. 
>         vle32.v v24,(a0),v0.t   ===========> here using v0.t is the only mask register to predicate RVV operation
>         vle32.v v25,(a1),v0.t
>         vadd.vv v24,v24,v25
>         vse32.v v24,(a0),v0.t
>         add     a2,a2,a4
>         add     a0,a0,a4
>         add     a1,a1,a4
>         bne     a3,zero,.L3
> .L1:
>         ret
> 
> 
> This is the how RVV works.
> Feel free to comment if you have any questions.
 
Thanks for the detailed explanation.  Just to clarify - with RVV
there's only a single mask register, v0.t, or did you want to
say an instruction can only specify a single mask register?
ARM SVE would have a loop control mask and a separate mask
for the if (cond[i]) which would be combined with a mask-and
instruction to a third mask which is then used on the
predicated instructions.
 
For AVX512 WHILE_ULT is a better match since we need a mask in the
end (but WHILE_ULT isn't a very good match either, so I'm still
working on masked loop support there).
 
PowerPC and s390x might be able to use WHILE_LEN as well (though
they only have LEN variants of loads and stores) - of course
only "simulating it".  For the fixed-vector-length ISAs the
predicated vector loop IMHO makes most sense for the epilogue to
handle low-trip loops better.
 
Richard.
 
> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-04-12 15:00
> To: Richard Sandiford
> CC: juzhe.zhong@rivai.ai; gcc-patches; jeffreyalaw
> Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> On Tue, 11 Apr 2023, Richard Sandiford wrote:
>  
> > "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> > > Hi, Richards. 
> > > Kindly Ping this patch. 
> > > This is the most important patch for RVV auto-vectorization support.
> > > Bootstraped on X86 has passed.
> > 
> > Can it wait for GCC 14?  It doesn't seem like stage 4 material.
> > 
> > Also, pinging after 5 days seems a bit soon.  It's been a 4-day
> > holiday weekend for much of Europe.
>  
> Also can you explain why using WHILE_ULT is not possible?  (I've
> successfully - to some extent - done that for AVX512 for example)
>  
> The patch lacks the description of what WHILE_LEN actually is.
>  
> Richard.
>  
> > Thanks,
> > Richard
> > 
> > > Feel free to comments.
> > >
> > > Thanks.
> > >
> > >
> > > juzhe.zhong@rivai.ai
> > >  
> > > From: juzhe.zhong
> > > Date: 2023-04-07 09:47
> > > To: gcc-patches
> > > CC: richard.sandiford; rguenther; jeffreyalaw; Juzhe-Zhong
> > > Subject: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> > > From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> > >  
> > > This patch is to add WHILE_LEN pattern.
> > > It's inspired by RVV ISA simple "vvaddint32.s" example:
> > > https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
> > >  
> > > More details are in "vect_set_loop_controls_by_while_len" implementation
> > > and comments.
> > >  
> > > Consider such following case:
> > > #define N 16
> > > int src[N];
> > > int dest[N];
> > >  
> > > void
> > > foo (int n)
> > > {
> > >   for (int i = 0; i < n; i++)
> > >     dest[i] = src[i];
> > > }
> > >  
> > > -march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:
> > >  
> > > foo:        
> > >         ble     a0,zero,.L1
> > >         lui     a4,%hi(.LANCHOR0)
> > >         addi    a4,a4,%lo(.LANCHOR0)
> > >         addi    a3,a4,64
> > >         csrr    a2,vlenb
> > > .L3:
> > >         vsetvli a5,a0,e32,m1,ta,ma
> > >         vle32.v v1,0(a4)
> > >         sub     a0,a0,a5
> > >         vse32.v v1,0(a3)
> > >         add     a4,a4,a2
> > >         add     a3,a3,a2
> > >         bne     a0,zero,.L3
> > > .L1:
> > >         ret
> > >  
> > > gcc/ChangeLog:
> > >  
> > >         * doc/md.texi: Add WHILE_LEN support.
> > >         * internal-fn.cc (while_len_direct): Ditto.
> > >         (expand_while_len_optab_fn): Ditto.
> > >         (direct_while_len_optab_supported_p): Ditto.
> > >         * internal-fn.def (WHILE_LEN): Ditto.
> > >         * optabs.def (OPTAB_D): Ditto.
> > >         * tree-ssa-loop-manip.cc (create_iv): Ditto.
> > >         * tree-ssa-loop-manip.h (create_iv): Ditto.
> > >         * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
> > >         (vect_set_loop_condition_partial_vectors): Ditto.
> > >         * tree-vect-loop.cc (vect_get_loop_len): Ditto.
> > >         * tree-vect-stmts.cc (vectorizable_store): Ditto.
> > >         (vectorizable_load): Ditto.
> > >         * tree-vectorizer.h (vect_get_loop_len): Ditto.
> > >  
> > > ---
> > > gcc/doc/md.texi             |  14 +++
> > > gcc/internal-fn.cc          |  29 ++++++
> > > gcc/internal-fn.def         |   1 +
> > > gcc/optabs.def              |   1 +
> > > gcc/tree-ssa-loop-manip.cc  |   4 +-
> > > gcc/tree-ssa-loop-manip.h   |   2 +-
> > > gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
> > > gcc/tree-vect-loop.cc       |  35 +++++--
> > > gcc/tree-vect-stmts.cc      |   9 +-
> > > gcc/tree-vectorizer.h       |   4 +-
> > > 10 files changed, 264 insertions(+), 21 deletions(-)
> > >  
> > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> > > index 8e3113599fd..72178ab014c 100644
> > > --- a/gcc/doc/md.texi
> > > +++ b/gcc/doc/md.texi
> > > @@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
> > >    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
> > > @end smallexample
> > > +@cindex @code{while_len@var{m}@var{n}} instruction pattern
> > > +@item @code{while_len@var{m}@var{n}}
> > > +Set operand 0 to the number of active elements in vector will be updated value.
> > > +operand 1 is the total elements need to be updated value.
> > > +operand 2 is the vectorization factor.
> > > +The operation is equivalent to:
> > > +
> > > +@smallexample
> > > +operand0 = MIN (operand1, operand2);
> > > +operand2 can be const_poly_int or poly_int related to vector mode size.
> > > +Some target like RISC-V has a standalone instruction to get MIN (n, MODE SIZE) so
> > > +that we can reduce a use of general purpose register.
> > > +@end smallexample
> > > +
> > > @cindex @code{check_raw_ptrs@var{m}} instruction pattern
> > > @item @samp{check_raw_ptrs@var{m}}
> > > Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
> > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > index 6e81dc05e0e..5f44def90d3 100644
> > > --- a/gcc/internal-fn.cc
> > > +++ b/gcc/internal-fn.cc
> > > @@ -127,6 +127,7 @@ init_internal_fns ()
> > > #define cond_binary_direct { 1, 1, true }
> > > #define cond_ternary_direct { 1, 1, true }
> > > #define while_direct { 0, 2, false }
> > > +#define while_len_direct { 0, 0, false }
> > > #define fold_extract_direct { 2, 2, false }
> > > #define fold_left_direct { 1, 1, false }
> > > #define mask_fold_left_direct { 1, 1, false }
> > > @@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> > >      emit_move_insn (lhs_rtx, ops[0].value);
> > > }
> > > +/* Expand WHILE_LEN call STMT using optab OPTAB.  */
> > > +static void
> > > +expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> > > +{
> > > +  expand_operand ops[3];
> > > +  tree rhs_type[2];
> > > +
> > > +  tree lhs = gimple_call_lhs (stmt);
> > > +  tree lhs_type = TREE_TYPE (lhs);
> > > +  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > +  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
> > > +
> > > +  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
> > > +    {
> > > +      tree rhs = gimple_call_arg (stmt, i);
> > > +      rhs_type[i] = TREE_TYPE (rhs);
> > > +      rtx rhs_rtx = expand_normal (rhs);
> > > +      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
> > > +    }
> > > +
> > > +  insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
> > > +
> > > +  expand_insn (icode, 3, ops);
> > > +  if (!rtx_equal_p (lhs_rtx, ops[0].value))
> > > +    emit_move_insn (lhs_rtx, ops[0].value);
> > > +}
> > > +
> > > /* Expand a call to a convert-like optab using the operands in STMT.
> > >     FN has a single output operand and NARGS input operands.  */
> > > @@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
> > > #define direct_scatter_store_optab_supported_p convert_optab_supported_p
> > > #define direct_len_store_optab_supported_p direct_optab_supported_p
> > > #define direct_while_optab_supported_p convert_optab_supported_p
> > > +#define direct_while_len_optab_supported_p direct_optab_supported_p
> > > #define direct_fold_extract_optab_supported_p direct_optab_supported_p
> > > #define direct_fold_left_optab_supported_p direct_optab_supported_p
> > > #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
> > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > index 7fe742c2ae7..3a933abff5d 100644
> > > --- a/gcc/internal-fn.def
> > > +++ b/gcc/internal-fn.def
> > > @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
> > > DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
> > > DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> > > +DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
> > > DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
> > >        check_raw_ptrs, check_ptrs)
> > > DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > index 695f5911b30..f5938bd2c24 100644
> > > --- a/gcc/optabs.def
> > > +++ b/gcc/optabs.def
> > > @@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
> > > OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
> > > OPTAB_D (len_load_optab, "len_load_$a")
> > > OPTAB_D (len_store_optab, "len_store_$a")
> > > +OPTAB_D (while_len_optab, "while_len$a")
> > > diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> > > index 09acc1c94cc..cdbf280e249 100644
> > > --- a/gcc/tree-ssa-loop-manip.cc
> > > +++ b/gcc/tree-ssa-loop-manip.cc
> > > @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
> > > void
> > > create_iv (tree base, tree step, tree var, class loop *loop,
> > >    gimple_stmt_iterator *incr_pos, bool after,
> > > -    tree *var_before, tree *var_after)
> > > +    tree *var_before, tree *var_after, enum tree_code code)
> > > {
> > >    gassign *stmt;
> > >    gphi *phi;
> > >    tree initial, step1;
> > >    gimple_seq stmts;
> > >    tree vb, va;
> > > -  enum tree_code incr_op = PLUS_EXPR;
> > > +  enum tree_code incr_op = code;
> > >    edge pe = loop_preheader_edge (loop);
> > >    if (var != NULL_TREE)
> > > diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> > > index d49273a3987..da755320a3a 100644
> > > --- a/gcc/tree-ssa-loop-manip.h
> > > +++ b/gcc/tree-ssa-loop-manip.h
> > > @@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
> > > typedef void (*transform_callback)(class loop *, void *);
> > > extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
> > > -        bool, tree *, tree *);
> > > +        bool, tree *, tree *, enum tree_code = PLUS_EXPR);
> > > extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
> > > extern void verify_loop_closed_ssa (bool, class loop * = NULL);
> > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > > index f60fa50e8f4..f3cd6c51d2e 100644
> > > --- a/gcc/tree-vect-loop-manip.cc
> > > +++ b/gcc/tree-vect-loop-manip.cc
> > > @@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> > >    return next_ctrl;
> > > }
> > > +/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
> > > +   for all the rgroup controls in RGC and return a control that is nonzero
> > > +   when the loop needs to iterate.  Add any new preheader statements to
> > > +   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
> > > +
> > > +   RGC belongs to loop LOOP.  The loop originally iterated NITERS
> > > +   times and has been vectorized according to LOOP_VINFO.
> > > +
> > > +   Unlike vect_set_loop_controls_directly which is iterating from 0-based IV
> > > +   to TEST_LIMIT - bias.
> > > +
> > > +   In vect_set_loop_controls_by_while_len, we are iterating from start at
> > > +   IV = TEST_LIMIT - bias and keep subtract IV by the length calculated by
> > > +   IFN_WHILE_LEN pattern.
> > > +
> > > +   Note: the cost of the code generated by this function is modeled
> > > +   by vect_estimate_min_profitable_iters, so changes here may need
> > > +   corresponding changes there.
> > > +
> > > +   1. Single rgroup, the Gimple IR should be:
> > > +
> > > + <bb 3>
> > > + _19 = (unsigned long) n_5(D);
> > > + ...
> > > +
> > > + <bb 4>:
> > > + ...
> > > + # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
> > > + ...
> > > + _22 = .WHILE_LEN (ivtmp_20, vf);
> > > + ...
> > > + vector statement (use _22);
> > > + ...
> > > + ivtmp_21 = ivtmp_20 - _22;
> > > + ...
> > > + if (ivtmp_21 != 0)
> > > +   goto <bb 4>; [75.00%]
> > > + else
> > > +   goto <bb 5>; [25.00%]
> > > +
> > > + <bb 5>
> > > + return;
> > > +
> > > +   Note: IFN_WHILE_LEN will guarantee "ivtmp_21 = ivtmp_20 - _22" never
> > > +   underflow 0.
> > > +
> > > +   2. Multiple rgroup, the Gimple IR should be:
> > > +
> > > + <bb 3>
> > > + _70 = (unsigned long) bnd.7_52;
> > > + _71 = _70 * 2;
> > > + _72 = MAX_EXPR <_71, 4>;
> > > + _73 = _72 + 18446744073709551612;
> > > + ...
> > > +
> > > + <bb 4>:
> > > + ...
> > > + # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
> > > + # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
> > > + _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
> > > + _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
> > > + ...
> > > + vector statement (use _79);
> > > + ...
> > > + vector statement (use _76);
> > > + ...
> > > + _65 = _79 / 2;
> > > + vector statement (use _65);
> > > + ...
> > > + _68 = _76 / 2;
> > > + vector statement (use _68);
> > > + ...
> > > + ivtmp_78 = ivtmp_77 - _79;
> > > + ivtmp_75 = ivtmp_74 - _76;
> > > + ...
> > > + if (ivtmp_78 != 0)
> > > +   goto <bb 4>; [75.00%]
> > > + else
> > > +   goto <bb 5>; [25.00%]
> > > +
> > > + <bb 5>
> > > + return;
> > > +
> > > +*/
> > > +
> > > +static tree
> > > +vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
> > > +      gimple_seq *preheader_seq,
> > > +      gimple_seq *header_seq,
> > > +      rgroup_controls *rgc, tree niters)
> > > +{
> > > +  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> > > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > > +  /* We are not allowing masked approach in WHILE_LEN.  */
> > > +  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > > +
> > > +  tree ctrl_type = rgc->type;
> > > +  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
> > > +  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
> > > +  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > > +
> > > +  /* Calculate the maximum number of item values that the rgroup
> > > +     handles in total, the number that it handles for each iteration
> > > +     of the vector loop.  */
> > > +  tree nitems_total = niters;
> > > +  if (nitems_per_iter != 1)
> > > +    {
> > > +      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
> > > + these multiplications don't overflow.  */
> > > +      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
> > > +      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
> > > +    nitems_total, compare_factor);
> > > +    }
> > > +
> > > +  /* Convert the comparison value to the IV type (either a no-op or
> > > +     a promotion).  */
> > > +  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> > > +
> > > +  /* Create an induction variable that counts the number of items
> > > +     processed.  */
> > > +  tree index_before_incr, index_after_incr;
> > > +  gimple_stmt_iterator incr_gsi;
> > > +  bool insert_after;
> > > +  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> > > +
> > > +  /* Test the decremented IV, which will never underflow 0 since we have
> > > +     IFN_WHILE_LEN to gurantee that.  */
> > > +  tree test_limit = nitems_total;
> > > +
> > > +  /* Provide a definition of each control in the group.  */
> > > +  tree ctrl;
> > > +  unsigned int i;
> > > +  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
> > > +    {
> > > +      /* Previous controls will cover BIAS items.  This control covers the
> > > + next batch.  */
> > > +      poly_uint64 bias = nitems_per_ctrl * i;
> > > +      tree bias_tree = build_int_cst (iv_type, bias);
> > > +
> > > +      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
> > > + BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
> > > + control and adjust the bound down by BIAS.  */
> > > +      tree this_test_limit = test_limit;
> > > +      if (i != 0)
> > > + {
> > > +   this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
> > > +   this_test_limit, bias_tree);
> > > +   this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
> > > +   this_test_limit, bias_tree);
> > > + }
> > > +
> > > +      /* Create decrement IV.  */
> > > +      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
> > > + insert_after, &index_before_incr, &index_after_incr,
> > > + MINUS_EXPR);
> > > +
> > > +      poly_uint64 final_vf = vf * nitems_per_iter;
> > > +      tree vf_step = build_int_cst (iv_type, final_vf);
> > > +      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
> > > +    index_before_incr, vf_step);
> > > +      gassign *assign = gimple_build_assign (ctrl, res_len);
> > > +      gimple_seq_add_stmt (header_seq, assign);
> > > +    }
> > > +
> > > +  return index_after_incr;
> > > +}
> > > +
> > > /* Set up the iteration condition and rgroup controls for LOOP, given
> > >     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
> > >     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
> > > @@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> > >    bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> > >    tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> > > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > >    unsigned int compare_precision = TYPE_PRECISION (compare_type);
> > >    tree orig_niters = niters;
> > > @@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> > > bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> > > /* Set up all controls for this group.  */
> > > - test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
> > > -      &preheader_seq,
> > > -      &header_seq,
> > > -      loop_cond_gsi, rgc,
> > > -      niters, niters_skip,
> > > -      might_wrap_p);
> > > + if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> > > +     OPTIMIZE_FOR_SPEED))
> > > +   test_ctrl
> > > +     = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
> > > +    &preheader_seq, &header_seq,
> > > +    rgc, niters);
> > > + else
> > > +   test_ctrl
> > > +     = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
> > > +        &header_seq, loop_cond_gsi, rgc,
> > > +        niters, niters_skip,
> > > +        might_wrap_p);
> > >        }
> > >    /* Emit all accumulated statements.  */
> > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > index 1ba9f18d73e..5bffd9a6322 100644
> > > --- a/gcc/tree-vect-loop.cc
> > > +++ b/gcc/tree-vect-loop.cc
> > > @@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > >     rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
> > > tree
> > > -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > > -    unsigned int nvectors, unsigned int index)
> > > +vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
> > > +    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
> > > +    unsigned int index)
> > > {
> > >    rgroup_controls *rgl = &(*lens)[nvectors - 1];
> > > -  bool use_bias_adjusted_len =
> > > -    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> > > +  bool use_bias_adjusted_len
> > > +    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> > > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > >    /* Populate the rgroup's len array, if this is the first time we've
> > >       used it.  */
> > > @@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > >   if (use_bias_adjusted_len)
> > >     {
> > >       gcc_assert (i == 0);
> > > -       tree adjusted_len =
> > > - make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> > > +       tree adjusted_len
> > > + = make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> > >       SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
> > >       rgl->bias_adjusted_ctrl = adjusted_len;
> > >     }
> > > @@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > >    if (use_bias_adjusted_len)
> > >      return rgl->bias_adjusted_ctrl;
> > > +  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> > > +    OPTIMIZE_FOR_SPEED))
> > > +    {
> > > +      tree loop_len = rgl->controls[index];
> > > +      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
> > > +      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
> > > +      if (maybe_ne (nunits1, nunits2))
> > > + {
> > > +   /* A loop len for data type X can be reused for data type Y
> > > +      if X has N times more elements than Y and if Y's elements
> > > +      are N times bigger than X's.  */
> > > +   gcc_assert (multiple_p (nunits1, nunits2));
> > > +   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
> > > +   gimple_seq seq = NULL;
> > > +   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
> > > +    build_int_cst (iv_type, factor));
> > > +   if (seq)
> > > +     gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
> > > + }
> > > +      return loop_len;
> > > +    }
> > >    else
> > >      return rgl->controls[index];
> > > }
> > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > index efa2d0daa52..708c8a1d806 100644
> > > --- a/gcc/tree-vect-stmts.cc
> > > +++ b/gcc/tree-vect-stmts.cc
> > > @@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
> > >       else if (loop_lens)
> > > {
> > >   tree final_len
> > > -     = vect_get_loop_len (loop_vinfo, loop_lens,
> > > - vec_num * ncopies, vec_num * j + i);
> > > +     = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> > > + vec_num * ncopies, vectype,
> > > + vec_num * j + i);
> > >   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> > >   machine_mode vmode = TYPE_MODE (vectype);
> > >   opt_machine_mode new_ovmode
> > > @@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
> > >     else if (loop_lens && memory_access_type != VMAT_INVARIANT)
> > >       {
> > > tree final_len
> > > -   = vect_get_loop_len (loop_vinfo, loop_lens,
> > > -        vec_num * ncopies,
> > > +   = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> > > +        vec_num * ncopies, vectype,
> > >        vec_num * j + i);
> > > tree ptr = build_int_cst (ref_type,
> > >   align * BITS_PER_UNIT);
> > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > index 9cf2fb23fe3..e5cf38caf4b 100644
> > > --- a/gcc/tree-vectorizer.h
> > > +++ b/gcc/tree-vectorizer.h
> > > @@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
> > > unsigned int, tree, unsigned int);
> > > extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> > >   tree, unsigned int);
> > > -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> > > -        unsigned int);
> > > +extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
> > > +        vec_loop_lens *, unsigned int, tree, unsigned int);
> > > extern gimple_seq vect_gen_len (tree, tree, tree, tree);
> > > extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> > > extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
> > 
>  
> 
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
       [not found]           ` <2023041217154958074655@rivai.ai>
@ 2023-04-12  9:20             ` juzhe.zhong
  0 siblings, 0 replies; 41+ messages in thread
From: juzhe.zhong @ 2023-04-12  9:20 UTC (permalink / raw)
  To: rguenther; +Cc: richard.sandiford, gcc-patches, jeffreyalaw

[-- Attachment #1: Type: text/plain, Size: 32894 bytes --]

Sorry for the typo. We can predicate vadd.vv with v1 - v31.
====>
We can't predicate vadd.vv with v1 - v31.


juzhe.zhong@rivai.ai
 
From: juzhe.zhong@rivai.ai
Date: 2023-04-12 17:15
To: rguenther
CC: richard.sandiford; gcc-patches; jeffreyalaw
Subject: Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization

>> Thanks for the detailed explanation.  Just to clarify - with RVV
>> there's only a single mask register, v0.t, or did you want to
>> say an instruction can only specify a single mask register?

RVV has 32 vector registers (v0~v31) in total.
We can store vector data value or mask value in any of them.
We also have mask-logic instruction for example mask-and between any vector register.

However, any vector operation, for example vadd.vv, can only be predicated by v0 (written v0.t in asm), which is the first vector register.
We can predicate vadd.vv with v1 - v31.

So, you can imagine that every time we want to use a mask to predicate a vector operation, we must always first store the mask value
into v0.

So, we can write intrinsic sequence like this:

vmseq v0,v8,v9 (store mask value to v0)
vmslt v1,v10,v11 (store mask value to v1)
vmand v0,v0,v1
vadd.vv ...v0.t (the predicate mask must always be v0).

>> ARM SVE would have a loop control mask and a separate mask
>> for the if (cond[i]) which would be combined with a mask-and
>> instruction to a third mask which is then used on the
>> predicated instructions.

Yeah, I know it. The ARM SVE way is more elegant than the RVV one.
However, for RVV, we can't follow this flow.
We don't have a "whilelo" instruction to generate a loop control mask.
We can only do loop control with the length generated by vsetvl.
And we can only use "v0" as the mask to predicate vadd.vv, and a mask value can only be generated by comparison or mask-logical instructions.

>> PowerPC and s390x might be able to use WHILE_LEN as well (though
>> they only have LEN variants of loads and stores) - of course
>> only "simulating it".  For the fixed-vector-length ISAs the
>> predicated vector loop IMHO makes most sense for the epilogue to
>> handle low-trip loops better.

Yeah, I wonder how they do the flow control (if (cond[i])).
For RVV, you can imagine I will need to add LEN_MASK_LOAD/LEN_MASK_STORE patterns (length generated by WHILE_LEN and mask generated by comparison).

I think we can CC the IBM folks to see whether we can make WHILE_LEN work
for both IBM and RVV?

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-04-12 16:42
To: juzhe.zhong@rivai.ai
CC: richard.sandiford; gcc-patches; jeffreyalaw
Subject: Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
On Wed, 12 Apr 2023, juzhe.zhong@rivai.ai wrote:
 
> Thank you very much for reply.
> 
> WHILE_LEN is the pattern that calculates the number of vector elements that will be updated in each iteration.
> For RVV, we use the vsetvl instruction to calculate this number of elements.
> 
> WHILE_ULT cannot work for RVV since WHILE_ULT generates a mask to predicate vector operations, but RVV does not
> use a mask to do the loop strip mining (RVV only uses masks for control flow inside the loop).
> 
> Here is the example WHILE_ULT working in ARM SVE:
> https://godbolt.org/z/jKsT8E1hP 
> 
> The first example is:
> void foo (int32_t * __restrict a, int32_t * __restrict b, int n)
> {
>     for (int i = 0; i < n; i++)
>       a[i] = a[i] + b[i];
> }
> 
> ARM SVE:
> foo:
>         cmp     w2, 0
>         ble     .L1
>         mov     x3, 0
>         cntw    x4
>         whilelo p0.s, wzr, w2
> .L3:
>         ld1w    z1.s, p0/z, [x0, x3, lsl 2]
>         ld1w    z0.s, p0/z, [x1, x3, lsl 2]
>         add     z0.s, z0.s, z1.s
>         st1w    z0.s, p0, [x0, x3, lsl 2]
>         add     x3, x3, x4
>         whilelo p0.s, w3, w2
>         b.any   .L3
> .L1:
>         ret
> 
> Here, whilelo will generate the mask according to w3 to w2.
> So for example, if w3 = 0, and w2 = 3 (Suppose machine vector length > 3).
> Then it will generate a mask with 0b111 mask to predicate loads and stores.
> 
> For RVV, we can't do that since RVV doesn't have whilelo instructions to generate a predicate mask.
> Also, we can't use a mask as the predicate to do loop strip mining, since RVV can only use a single mask (v0)
> to handle flow control inside the loop.
> 
> Instead, we use vsetvl to do the strip mining. Based on this, for the same C code, the ideal RVV asm according to the RVV ISA should be:
> 
> preheader:
> a0 = n (the total number of the scalar should be calculated).
>  .....
> .L3:
>         vsetvli a5,a0,e32,m1,ta,ma    ====> WHILE_LEN pattern generate this instruction, calculate the number of the elements should be updated
>         vle32.v v1,0(a4)
>         sub     a0,a0,a5      ============> Decrement the induction variable by the a5 (generated by WHILE_LEN)
>         ....   
> 
>         vadd.vv....
>         vse32.v v1,0(a3)
>         add     a4,a4,a2
>         add     a3,a3,a2
>         bne     a0,zero,.L3
> .L1:
>         ret
> 
> So you will see, if n = 3 like I said for ARM SVE (Suppose machine vector length > 3), then vsetvli a5,a0,e32,m1,ta,ma will
> generate a5 = 3, then the vle32.v/vadd.vv/vse32.v are all doing the operation only on the element 0,  element 1, element 2.
> 
> Besides, WHILE_LEN is defined so that its result never exceeds its input operand, which is "a0".
> That means "sub a0,a0,a5" will never make a0 underflow below 0.
> 
> I have tried to return Pmode in TARGET_VECTORIZE_GET_MASK_MODE 
> target hook and then use WHILE_ULT. 
> 
> But there are 2 issues:
> One is that current GCC iterates from a 0-based IV up to the TEST_LIMIT, whereas the optimal flow for RVV I showed above
> starts from "n" and keeps decreasing it until 0.  Trying to fit the current flow of GCC, RVV needs more instructions to do the loop strip mining.
> 
> Second is that the mode we return in TARGET_VECTORIZE_GET_MASK_MODE
> not only specifies the dest mode for WHILE_ULT but also the mask mode of flow control.
> If we return Pmode, which is used as the length for RVV, we can't use a mask mode like VNx2BI to do the flow control predicate.
> This is another example:
> void foo2 (int32_t * __restrict a, int32_t * __restrict b, int32_t * restrict cond, int n)
> {
>     for (int i = 0; i < n; i++)
>       if (cond[i])
>         a[i] = a[i] + b[i];
> }
> 
> ARM SVE:
>         ld1w    z0.s, p0/z, [x2, x4, lsl 2]
>         cmpne   p0.s, p0/z, z0.s, #0
>         ld1w    z0.s, p0/z, [x0, x4, lsl 2]
>         ld1w    z1.s, p0/z, [x1, x4, lsl 2]
>         add     z0.s, z0.s, z1.s
>         st1w    z0.s, p0, [x0, x4, lsl 2]
>         add     x4, x4, x5
>         whilelo p0.s, w4, w3
>         b.any   .L8
> 
> Here we can see ARM uses mask mode for both loop strip mining and flow control.
> 
> Whereas RVV uses the length generated by vsetvl (WHILE_LEN) to do the loop strip mining and a mask generated by comparison to do the flow control.
> 
> So the ASM generated by my downstream LLVM/GCC:
> .L3:
>         vsetvli a6,a3,e32,m1,ta,mu   ==========> generate length to predicate RVV operation. 
>         vle32.v v0,(a2)
>         sub     a3,a3,a6      ==========> decrease the induction variable until 0.
>         vmsne.vi        v0,v0,0   ==========> generate mask to predicate RVV operation. 
>         vle32.v v24,(a0),v0.t   ===========> here using v0.t is the only mask register to predicate RVV operation
>         vle32.v v25,(a1),v0.t
>         vadd.vv v24,v24,v25
>         vse32.v v24,(a0),v0.t
>         add     a2,a2,a4
>         add     a0,a0,a4
>         add     a1,a1,a4
>         bne     a3,zero,.L3
> .L1:
>         ret
> 
> 
> This is how RVV works.
> Feel free to comment if you have any questions.
 
Thanks for the detailed explanation.  Just to clarify - with RVV
there's only a single mask register, v0.t, or did you want to
say an instruction can only specify a single mask register?
ARM SVE would have a loop control mask and a separate mask
for the if (cond[i]) which would be combined with a mask-and
instruction to a third mask which is then used on the
predicated instructions.
 
For AVX512 WHILE_ULT is a better match since we need a mask in the
end (but WHILE_ULT isn't a very good match either, so I'm still
working on masked loop support there).
 
PowerPC and s390x might be able to use WHILE_LEN as well (though
they only have LEN variants of loads and stores) - of course
only "simulating it".  For the fixed-vector-length ISAs the
predicated vector loop IMHO makes most sense for the epilogue to
handle low-trip loops better.
 
Richard.
 
> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-04-12 15:00
> To: Richard Sandiford
> CC: juzhe.zhong@rivai.ai; gcc-patches; jeffreyalaw
> Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> On Tue, 11 Apr 2023, Richard Sandiford wrote:
>  
> > "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> > > Hi, Richards. 
> > > Kindly Ping this patch. 
> > > This is the most important patch for RVV auto-vectorization support.
> > > Bootstrap on X86 has passed.
> > 
> > Can it wait for GCC 14?  It doesn't seem like stage 4 material.
> > 
> > Also, pinging after 5 days seems a bit soon.  It's been a 4-day
> > holiday weekend for much of Europe.
>  
> Also can you explain why using WHILE_ULT is not possible?  (I've
> successfully - to some extent - done that for AVX512 for example)
>  
> The patch lacks the description of what WHILE_LEN actually is.
>  
> Richard.
>  
> > Thanks,
> > Richard
> > 
> > > > Feel free to comment.
> > >
> > > Thanks.
> > >
> > >
> > > juzhe.zhong@rivai.ai
> > >  
> > > From: juzhe.zhong
> > > Date: 2023-04-07 09:47
> > > To: gcc-patches
> > > CC: richard.sandiford; rguenther; jeffreyalaw; Juzhe-Zhong
> > > Subject: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> > > From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> > >  
> > > This patch is to add WHILE_LEN pattern.
> > > It's inspired by RVV ISA simple "vvaddint32.s" example:
> > > https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
> > >  
> > > More details are in "vect_set_loop_controls_by_while_len" implementation
> > > and comments.
> > >  
> > > Consider such following case:
> > > #define N 16
> > > int src[N];
> > > int dest[N];
> > >  
> > > void
> > > foo (int n)
> > > {
> > >   for (int i = 0; i < n; i++)
> > >     dest[i] = src[i];
> > > }
> > >  
> > > -march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:
> > >  
> > > foo:        
> > >         ble     a0,zero,.L1
> > >         lui     a4,%hi(.LANCHOR0)
> > >         addi    a4,a4,%lo(.LANCHOR0)
> > >         addi    a3,a4,64
> > >         csrr    a2,vlenb
> > > .L3:
> > >         vsetvli a5,a0,e32,m1,ta,ma
> > >         vle32.v v1,0(a4)
> > >         sub     a0,a0,a5
> > >         vse32.v v1,0(a3)
> > >         add     a4,a4,a2
> > >         add     a3,a3,a2
> > >         bne     a0,zero,.L3
> > > .L1:
> > >         ret
> > >  
> > > gcc/ChangeLog:
> > >  
> > >         * doc/md.texi: Add WHILE_LEN support.
> > >         * internal-fn.cc (while_len_direct): Ditto.
> > >         (expand_while_len_optab_fn): Ditto.
> > >         (direct_while_len_optab_supported_p): Ditto.
> > >         * internal-fn.def (WHILE_LEN): Ditto.
> > >         * optabs.def (OPTAB_D): Ditto.
> > >         * tree-ssa-loop-manip.cc (create_iv): Ditto.
> > >         * tree-ssa-loop-manip.h (create_iv): Ditto.
> > >         * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
> > >         (vect_set_loop_condition_partial_vectors): Ditto.
> > >         * tree-vect-loop.cc (vect_get_loop_len): Ditto.
> > >         * tree-vect-stmts.cc (vectorizable_store): Ditto.
> > >         (vectorizable_load): Ditto.
> > >         * tree-vectorizer.h (vect_get_loop_len): Ditto.
> > >  
> > > ---
> > > gcc/doc/md.texi             |  14 +++
> > > gcc/internal-fn.cc          |  29 ++++++
> > > gcc/internal-fn.def         |   1 +
> > > gcc/optabs.def              |   1 +
> > > gcc/tree-ssa-loop-manip.cc  |   4 +-
> > > gcc/tree-ssa-loop-manip.h   |   2 +-
> > > gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
> > > gcc/tree-vect-loop.cc       |  35 +++++--
> > > gcc/tree-vect-stmts.cc      |   9 +-
> > > gcc/tree-vectorizer.h       |   4 +-
> > > 10 files changed, 264 insertions(+), 21 deletions(-)
> > >  
> > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> > > index 8e3113599fd..72178ab014c 100644
> > > --- a/gcc/doc/md.texi
> > > +++ b/gcc/doc/md.texi
> > > @@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
> > >    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
> > > @end smallexample
> > > +@cindex @code{while_len@var{m}@var{n}} instruction pattern
> > > +@item @code{while_len@var{m}@var{n}}
> > > +Set operand 0 to the number of active elements in vector will be updated value.
> > > +operand 1 is the total elements need to be updated value.
> > > +operand 2 is the vectorization factor.
> > > +The operation is equivalent to:
> > > +
> > > +@smallexample
> > > +operand0 = MIN (operand1, operand2);
> > > +operand2 can be const_poly_int or poly_int related to vector mode size.
> > > +Some target like RISC-V has a standalone instruction to get MIN (n, MODE SIZE) so
> > > +that we can reduce a use of general purpose register.
> > > +@end smallexample
> > > +
> > > @cindex @code{check_raw_ptrs@var{m}} instruction pattern
> > > @item @samp{check_raw_ptrs@var{m}}
> > > Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
> > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > index 6e81dc05e0e..5f44def90d3 100644
> > > --- a/gcc/internal-fn.cc
> > > +++ b/gcc/internal-fn.cc
> > > @@ -127,6 +127,7 @@ init_internal_fns ()
> > > #define cond_binary_direct { 1, 1, true }
> > > #define cond_ternary_direct { 1, 1, true }
> > > #define while_direct { 0, 2, false }
> > > +#define while_len_direct { 0, 0, false }
> > > #define fold_extract_direct { 2, 2, false }
> > > #define fold_left_direct { 1, 1, false }
> > > #define mask_fold_left_direct { 1, 1, false }
> > > @@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> > >      emit_move_insn (lhs_rtx, ops[0].value);
> > > }
> > > +/* Expand WHILE_LEN call STMT using optab OPTAB.  */
> > > +static void
> > > +expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> > > +{
> > > +  expand_operand ops[3];
> > > +  tree rhs_type[2];
> > > +
> > > +  tree lhs = gimple_call_lhs (stmt);
> > > +  tree lhs_type = TREE_TYPE (lhs);
> > > +  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > +  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
> > > +
> > > +  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
> > > +    {
> > > +      tree rhs = gimple_call_arg (stmt, i);
> > > +      rhs_type[i] = TREE_TYPE (rhs);
> > > +      rtx rhs_rtx = expand_normal (rhs);
> > > +      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
> > > +    }
> > > +
> > > +  insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
> > > +
> > > +  expand_insn (icode, 3, ops);
> > > +  if (!rtx_equal_p (lhs_rtx, ops[0].value))
> > > +    emit_move_insn (lhs_rtx, ops[0].value);
> > > +}
> > > +
> > > /* Expand a call to a convert-like optab using the operands in STMT.
> > >     FN has a single output operand and NARGS input operands.  */
> > > @@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
> > > #define direct_scatter_store_optab_supported_p convert_optab_supported_p
> > > #define direct_len_store_optab_supported_p direct_optab_supported_p
> > > #define direct_while_optab_supported_p convert_optab_supported_p
> > > +#define direct_while_len_optab_supported_p direct_optab_supported_p
> > > #define direct_fold_extract_optab_supported_p direct_optab_supported_p
> > > #define direct_fold_left_optab_supported_p direct_optab_supported_p
> > > #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
> > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > index 7fe742c2ae7..3a933abff5d 100644
> > > --- a/gcc/internal-fn.def
> > > +++ b/gcc/internal-fn.def
> > > @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
> > > DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
> > > DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> > > +DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
> > > DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
> > >        check_raw_ptrs, check_ptrs)
> > > DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > index 695f5911b30..f5938bd2c24 100644
> > > --- a/gcc/optabs.def
> > > +++ b/gcc/optabs.def
> > > @@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
> > > OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
> > > OPTAB_D (len_load_optab, "len_load_$a")
> > > OPTAB_D (len_store_optab, "len_store_$a")
> > > +OPTAB_D (while_len_optab, "while_len$a")
> > > diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> > > index 09acc1c94cc..cdbf280e249 100644
> > > --- a/gcc/tree-ssa-loop-manip.cc
> > > +++ b/gcc/tree-ssa-loop-manip.cc
> > > @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
> > > void
> > > create_iv (tree base, tree step, tree var, class loop *loop,
> > >    gimple_stmt_iterator *incr_pos, bool after,
> > > -    tree *var_before, tree *var_after)
> > > +    tree *var_before, tree *var_after, enum tree_code code)
> > > {
> > >    gassign *stmt;
> > >    gphi *phi;
> > >    tree initial, step1;
> > >    gimple_seq stmts;
> > >    tree vb, va;
> > > -  enum tree_code incr_op = PLUS_EXPR;
> > > +  enum tree_code incr_op = code;
> > >    edge pe = loop_preheader_edge (loop);
> > >    if (var != NULL_TREE)
> > > diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> > > index d49273a3987..da755320a3a 100644
> > > --- a/gcc/tree-ssa-loop-manip.h
> > > +++ b/gcc/tree-ssa-loop-manip.h
> > > @@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
> > > typedef void (*transform_callback)(class loop *, void *);
> > > extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
> > > -        bool, tree *, tree *);
> > > +        bool, tree *, tree *, enum tree_code = PLUS_EXPR);
> > > extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
> > > extern void verify_loop_closed_ssa (bool, class loop * = NULL);
> > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > > index f60fa50e8f4..f3cd6c51d2e 100644
> > > --- a/gcc/tree-vect-loop-manip.cc
> > > +++ b/gcc/tree-vect-loop-manip.cc
> > > @@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> > >    return next_ctrl;
> > > }
> > > +/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
> > > +   for all the rgroup controls in RGC and return a control that is nonzero
> > > +   when the loop needs to iterate.  Add any new preheader statements to
> > > +   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
> > > +
> > > +   RGC belongs to loop LOOP.  The loop originally iterated NITERS
> > > +   times and has been vectorized according to LOOP_VINFO.
> > > +
> > > +   Unlike vect_set_loop_controls_directly which is iterating from 0-based IV
> > > +   to TEST_LIMIT - bias.
> > > +
> > > +   In vect_set_loop_controls_by_while_len, we are iterating from start at
> > > +   IV = TEST_LIMIT - bias and keep subtract IV by the length calculated by
> > > +   IFN_WHILE_LEN pattern.
> > > +
> > > +   Note: the cost of the code generated by this function is modeled
> > > +   by vect_estimate_min_profitable_iters, so changes here may need
> > > +   corresponding changes there.
> > > +
> > > +   1. Single rgroup, the Gimple IR should be:
> > > +
> > > + <bb 3>
> > > + _19 = (unsigned long) n_5(D);
> > > + ...
> > > +
> > > + <bb 4>:
> > > + ...
> > > + # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
> > > + ...
> > > + _22 = .WHILE_LEN (ivtmp_20, vf);
> > > + ...
> > > + vector statement (use _22);
> > > + ...
> > > + ivtmp_21 = ivtmp_20 - _22;
> > > + ...
> > > + if (ivtmp_21 != 0)
> > > +   goto <bb 4>; [75.00%]
> > > + else
> > > +   goto <bb 5>; [25.00%]
> > > +
> > > + <bb 5>
> > > + return;
> > > +
> > > +   Note: IFN_WHILE_LEN will guarantee "ivtmp_21 = ivtmp_20 - _22" never
> > > +   underflow 0.
> > > +
> > > +   2. Multiple rgroup, the Gimple IR should be:
> > > +
> > > + <bb 3>
> > > + _70 = (unsigned long) bnd.7_52;
> > > + _71 = _70 * 2;
> > > + _72 = MAX_EXPR <_71, 4>;
> > > + _73 = _72 + 18446744073709551612;
> > > + ...
> > > +
> > > + <bb 4>:
> > > + ...
> > > + # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
> > > + # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
> > > + _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
> > > + _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
> > > + ...
> > > + vector statement (use _79);
> > > + ...
> > > + vector statement (use _76);
> > > + ...
> > > + _65 = _79 / 2;
> > > + vector statement (use _65);
> > > + ...
> > > + _68 = _76 / 2;
> > > + vector statement (use _68);
> > > + ...
> > > + ivtmp_78 = ivtmp_77 - _79;
> > > + ivtmp_75 = ivtmp_74 - _76;
> > > + ...
> > > + if (ivtmp_78 != 0)
> > > +   goto <bb 4>; [75.00%]
> > > + else
> > > +   goto <bb 5>; [25.00%]
> > > +
> > > + <bb 5>
> > > + return;
> > > +
> > > +*/
> > > +
> > > +static tree
> > > +vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
> > > +      gimple_seq *preheader_seq,
> > > +      gimple_seq *header_seq,
> > > +      rgroup_controls *rgc, tree niters)
> > > +{
> > > +  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> > > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > > +  /* We are not allowing masked approach in WHILE_LEN.  */
> > > +  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > > +
> > > +  tree ctrl_type = rgc->type;
> > > +  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
> > > +  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
> > > +  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > > +
> > > +  /* Calculate the maximum number of item values that the rgroup
> > > +     handles in total, the number that it handles for each iteration
> > > +     of the vector loop.  */
> > > +  tree nitems_total = niters;
> > > +  if (nitems_per_iter != 1)
> > > +    {
> > > +      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
> > > + these multiplications don't overflow.  */
> > > +      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
> > > +      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
> > > +    nitems_total, compare_factor);
> > > +    }
> > > +
> > > +  /* Convert the comparison value to the IV type (either a no-op or
> > > +     a promotion).  */
> > > +  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> > > +
> > > +  /* Create an induction variable that counts the number of items
> > > +     processed.  */
> > > +  tree index_before_incr, index_after_incr;
> > > +  gimple_stmt_iterator incr_gsi;
> > > +  bool insert_after;
> > > +  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> > > +
> > > +  /* Test the decremented IV, which will never underflow 0 since we have
> > > +     IFN_WHILE_LEN to gurantee that.  */
> > > +  tree test_limit = nitems_total;
> > > +
> > > +  /* Provide a definition of each control in the group.  */
> > > +  tree ctrl;
> > > +  unsigned int i;
> > > +  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
> > > +    {
> > > +      /* Previous controls will cover BIAS items.  This control covers the
> > > + next batch.  */
> > > +      poly_uint64 bias = nitems_per_ctrl * i;
> > > +      tree bias_tree = build_int_cst (iv_type, bias);
> > > +
> > > +      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
> > > + BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
> > > + control and adjust the bound down by BIAS.  */
> > > +      tree this_test_limit = test_limit;
> > > +      if (i != 0)
> > > + {
> > > +   this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
> > > +   this_test_limit, bias_tree);
> > > +   this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
> > > +   this_test_limit, bias_tree);
> > > + }
> > > +
> > > +      /* Create decrement IV.  */
> > > +      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
> > > + insert_after, &index_before_incr, &index_after_incr,
> > > + MINUS_EXPR);
> > > +
> > > +      poly_uint64 final_vf = vf * nitems_per_iter;
> > > +      tree vf_step = build_int_cst (iv_type, final_vf);
> > > +      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
> > > +    index_before_incr, vf_step);
> > > +      gassign *assign = gimple_build_assign (ctrl, res_len);
> > > +      gimple_seq_add_stmt (header_seq, assign);
> > > +    }
> > > +
> > > +  return index_after_incr;
> > > +}
> > > +
> > > /* Set up the iteration condition and rgroup controls for LOOP, given
> > >     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
> > >     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
> > > @@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> > >    bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> > >    tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> > > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > >    unsigned int compare_precision = TYPE_PRECISION (compare_type);
> > >    tree orig_niters = niters;
> > > @@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> > > bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> > > /* Set up all controls for this group.  */
> > > - test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
> > > -      &preheader_seq,
> > > -      &header_seq,
> > > -      loop_cond_gsi, rgc,
> > > -      niters, niters_skip,
> > > -      might_wrap_p);
> > > + if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> > > +     OPTIMIZE_FOR_SPEED))
> > > +   test_ctrl
> > > +     = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
> > > +    &preheader_seq, &header_seq,
> > > +    rgc, niters);
> > > + else
> > > +   test_ctrl
> > > +     = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
> > > +        &header_seq, loop_cond_gsi, rgc,
> > > +        niters, niters_skip,
> > > +        might_wrap_p);
> > >        }
> > >    /* Emit all accumulated statements.  */
> > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > index 1ba9f18d73e..5bffd9a6322 100644
> > > --- a/gcc/tree-vect-loop.cc
> > > +++ b/gcc/tree-vect-loop.cc
> > > @@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > >     rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
> > > tree
> > > -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > > -    unsigned int nvectors, unsigned int index)
> > > +vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
> > > +    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
> > > +    unsigned int index)
> > > {
> > >    rgroup_controls *rgl = &(*lens)[nvectors - 1];
> > > -  bool use_bias_adjusted_len =
> > > -    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> > > +  bool use_bias_adjusted_len
> > > +    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> > > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > >    /* Populate the rgroup's len array, if this is the first time we've
> > >       used it.  */
> > > @@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > >   if (use_bias_adjusted_len)
> > >     {
> > >       gcc_assert (i == 0);
> > > -       tree adjusted_len =
> > > - make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> > > +       tree adjusted_len
> > > + = make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> > >       SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
> > >       rgl->bias_adjusted_ctrl = adjusted_len;
> > >     }
> > > @@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > >    if (use_bias_adjusted_len)
> > >      return rgl->bias_adjusted_ctrl;
> > > +  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> > > +    OPTIMIZE_FOR_SPEED))
> > > +    {
> > > +      tree loop_len = rgl->controls[index];
> > > +      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
> > > +      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
> > > +      if (maybe_ne (nunits1, nunits2))
> > > + {
> > > +   /* A loop len for data type X can be reused for data type Y
> > > +      if X has N times more elements than Y and if Y's elements
> > > +      are N times bigger than X's.  */
> > > +   gcc_assert (multiple_p (nunits1, nunits2));
> > > +   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
> > > +   gimple_seq seq = NULL;
> > > +   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
> > > +    build_int_cst (iv_type, factor));
> > > +   if (seq)
> > > +     gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
> > > + }
> > > +      return loop_len;
> > > +    }
> > >    else
> > >      return rgl->controls[index];
> > > }
> > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > index efa2d0daa52..708c8a1d806 100644
> > > --- a/gcc/tree-vect-stmts.cc
> > > +++ b/gcc/tree-vect-stmts.cc
> > > @@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
> > >       else if (loop_lens)
> > > {
> > >   tree final_len
> > > -     = vect_get_loop_len (loop_vinfo, loop_lens,
> > > - vec_num * ncopies, vec_num * j + i);
> > > +     = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> > > + vec_num * ncopies, vectype,
> > > + vec_num * j + i);
> > >   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> > >   machine_mode vmode = TYPE_MODE (vectype);
> > >   opt_machine_mode new_ovmode
> > > @@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
> > >     else if (loop_lens && memory_access_type != VMAT_INVARIANT)
> > >       {
> > > tree final_len
> > > -   = vect_get_loop_len (loop_vinfo, loop_lens,
> > > -        vec_num * ncopies,
> > > +   = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> > > +        vec_num * ncopies, vectype,
> > >        vec_num * j + i);
> > > tree ptr = build_int_cst (ref_type,
> > >   align * BITS_PER_UNIT);
> > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > index 9cf2fb23fe3..e5cf38caf4b 100644
> > > --- a/gcc/tree-vectorizer.h
> > > +++ b/gcc/tree-vectorizer.h
> > > @@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
> > > unsigned int, tree, unsigned int);
> > > extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> > >   tree, unsigned int);
> > > -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> > > -        unsigned int);
> > > +extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
> > > +        vec_loop_lens *, unsigned int, tree, unsigned int);
> > > extern gimple_seq vect_gen_len (tree, tree, tree, tree);
> > > extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> > > extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
> > 
>  
> 
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-12  9:15           ` juzhe.zhong
@ 2023-04-12  9:29             ` Richard Biener
  2023-04-12  9:42               ` Robin Dapp
  2023-04-12 11:17               ` Richard Sandiford
  0 siblings, 2 replies; 41+ messages in thread
From: Richard Biener @ 2023-04-12  9:29 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: richard.sandiford, gcc-patches, jeffreyalaw, rdapp, linkw

On Wed, 12 Apr 2023, juzhe.zhong@rivai.ai wrote:

> 
> >> Thanks for the detailed explanation.  Just to clarify - with RVV
> >> there's only a single mask register, v0.t, or did you want to
> >> say an instruction can only specify a single mask register?
> 
> RVV has 32 (v0~v31) vector registers in total.
> We can store a vector data value or a mask value in any of them.
> We also have mask-logic instructions, for example mask-and between any vector registers.
> 
> However, any vector operation, for example vadd.vv, can only be predicated by v0 (v0.t in asm), which is the first vector register.
> We can't predicate vadd.vv with v1 - v31.
> 
> So, you can imagine that every time we want to use a mask to predicate a vector operation, we must always first store the mask value
> into v0.
> 
> So, we can write intrinsic sequence like this:
> 
> vmseq v0,v8,v9 (store mask value to v0)
> vmslt v1,v10,v11 (store mask value to v1)
> vmand v0,v0,v1
> vadd.vv ...v0.t (predicate mask should always be v0).

Ah, I see - that explains it well.

> >> ARM SVE would have a loop control mask and a separate mask
> >> for the if (cond[i]) which would be combined with a mask-and
> >> instruction to a third mask which is then used on the
> >> predicated instructions.
> 
> Yeah, I know it. The ARM SVE way is more elegant than what RVV does.
> However, for RVV, we can't follow this flow.
> We don't have a "whilelo" instruction to generate the loop control mask.

Yep.  Similar for AVX512 where I have to use a vector compare.  I'm
currently using

 { 0, 1, 2 ... } < { remaining_len, remaining_len, ... }

and careful updating of remaining_len (we know it will either
be adjusted by the full constant vector length or updated to zero).

> We can only do loop control with the length generated by vsetvl.
> And we can only use "v0" as the mask to predicate vadd.vv, and the mask value can only be generated by comparison or mask logical instructions.
> 
> >> PowerPC and s390x might be able to use WHILE_LEN as well (though
> >> they only have LEN variants of loads and stores) - of course
> >> only "simulating it".  For the fixed-vector-length ISAs the
> >> predicated vector loop IMHO makes most sense for the epilogue to
> >> handle low-trip loops better.
> 
> Yeah, I wonder how they do the flow control (if (cond[i])).
> For RVV, you can imagine I will need to add a pattern LEN_MASK_LOAD/LEN_MASK_STORE (length generated by WHILE_LEN and mask generated by comparison).
> 
> I think we can CC IBM folks to see whether we can make WHILE_LEN work
> for both IBM and RVV?

I've CCed them.  Adding WHILE_LEN support to rs6000/s390x would be
mainly the "easy" way to get len-masked (epilog) loop support.  I've
figured actually implementing WHILE_ULT for AVX512 in the backend
results in some code generation challenges so I'm going to play
(again) with open-coding it as outlined above in the vectorizer itself
so followup passes (mostly IVOPTs) can do a better job.

Richard.

> 
> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-04-12 16:42
> To: juzhe.zhong@rivai.ai
> CC: richard.sandiford; gcc-patches; jeffreyalaw
> Subject: Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> On Wed, 12 Apr 2023, juzhe.zhong@rivai.ai wrote:
>  
> > Thank you very much for reply.
> > 
> > WHILE_LEN is the pattern that calculates the number of elements of the vector that will be updated in each iteration.
> > For RVV, we use vsetvl instruction to calculate the number of the elements of the vector.
> > 
> > WHILE_ULT cannot work for RVV since WHILE_ULT generates a mask to predicate vector operations, but RVV does not
> > use a mask to do the loop strip mining (RVV only uses masks for control flow inside the loop).
> > 
> > Here is the example WHILE_ULT working in ARM SVE:
> > https://godbolt.org/z/jKsT8E1hP 
> > 
> > The first example is:
> > void foo (int32_t * __restrict a, int32_t * __restrict b, int n)
> > {
> >     for (int i = 0; i < n; i++)
> >       a[i] = a[i] + b[i];
> > }
> > 
> > ARM SVE:
> > foo:
> >         cmp     w2, 0
> >         ble     .L1
> >         mov     x3, 0
> >         cntw    x4
> >         whilelo p0.s, wzr, w2
> > .L3:
> >         ld1w    z1.s, p0/z, [x0, x3, lsl 2]
> >         ld1w    z0.s, p0/z, [x1, x3, lsl 2]
> >         add     z0.s, z0.s, z1.s
> >         st1w    z0.s, p0, [x0, x3, lsl 2]
> >         add     x3, x3, x4
> >         whilelo p0.s, w3, w2
> >         b.any   .L3
> > .L1:
> >         ret
> > 
> > Here, whilelo will generate the mask according to w3 to w2.
> > So for example, if w3 = 0, and w2 = 3 (Suppose machine vector length > 3).
> > Then it will generate a mask with 0b111 mask to predicate loads and stores.
> > 
> > For RVV, we can't do that since RVV doesn't have whilelo instructions to generate predicate mask.
> > Also, we can't use mask as the predicate to do loop strip mining since RVV only has 1 single mask 
> > to handle flow control  inside the loop.
> > 
> > Instead, we use vsetvl to do the strip mining, so base on this, the same C code, RVV ideal asm according RVV ISA should be:
> > 
> > preheader:
> > a0 = n (the total number of the scalar should be calculated).
> >  .....
> > .L3:
> >         vsetvli a5,a0,e32,m1,ta,ma    ====> WHILE_LEN pattern generate this instruction, calculate the number of the elements should be updated
> >         vle32.v v1,0(a4)
> >         sub     a0,a0,a5      ============> Decrement the induction variable by the a5 (generated by WHILE_LEN)
> >         ....   
> > 
> >         vadd.vv....
> >         vse32.v v1,0(a3)
> >         add     a4,a4,a2
> >         add     a3,a3,a2
> >         bne     a0,zero,.L3
> > .L1:
> >         ret
> > 
> > So you will see, if n = 3 like I said for ARM SVE (Suppose machine vector length > 3), then vsetvli a5,a0,e32,m1,ta,ma will
> > generate a5 = 3, then the vle32.v/vadd.vv/vse32.v are all doing the operation only on the element 0,  element 1, element 2.
> > 
> > Besides, WHILE_LEN is defined so that its result never exceeds the input operand, which is "a0".
> > That means  sub     a0,a0,a5 will never make a0 underflow below 0.
> > 
> > I have tried to return Pmode in TARGET_VECTORIZE_GET_MASK_MODE 
> > target hook and then use WHILE_ULT. 
> > 
> > But there are 2 issues:
> > One is that current GCC runs the IV from 0 up to the TEST_LIMIT, whereas the optimal flow for RVV that I showed above
> > starts from "n" and keeps decreasing it until 0.  Trying to fit the current flow of GCC, RVV needs more instructions to do the loop strip mining.
> > 
> > Second is that the mode returned by TARGET_VECTORIZE_GET_MASK_MODE
> > not only specifies the dest mode for WHILE_ULT but also the mask mode for flow control.
> > If we return Pmode, which is used as the length for RVV, we can't use a mask mode like VNx2BI to do the flow control predication.
> > This another example:
> > void foo2 (int32_t * __restrict a, int32_t * __restrict b, int32_t * restrict cond, int n)
> > {
> >     for (int i = 0; i < n; i++)
> >       if (cond[i])
> >         a[i] = a[i] + b[i];
> > }
> > 
> > ARM SVE:
> >         ld1w    z0.s, p0/z, [x2, x4, lsl 2]
> >         cmpne   p0.s, p0/z, z0.s, #0
> >         ld1w    z0.s, p0/z, [x0, x4, lsl 2]
> >         ld1w    z1.s, p0/z, [x1, x4, lsl 2]
> >         add     z0.s, z0.s, z1.s
> >         st1w    z0.s, p0, [x0, x4, lsl 2]
> >         add     x4, x4, x5
> >         whilelo p0.s, w4, w3
> >         b.any   .L8
> > 
> > Here we can see ARM uses the mask mode for both loop strip mining and flow control.
> > 
> > Whereas RVV uses the length generated by vsetvl (WHILE_LEN) to do the loop strip mining and a mask generated by comparison to do the flow control.
> > 
> > So the ASM generated by my downstream LLVM/GCC:
> > .L3:
> >         vsetvli a6,a3,e32,m1,ta,mu   ==========> generate length to predicate RVV operation. 
> >         vle32.v v0,(a2)
> >         sub     a3,a3,a6      ==========> decrease the induction variable until 0.
> >         vmsne.vi        v0,v0,0   ==========> generate mask to predicate RVV operation. 
> >         vle32.v v24,(a0),v0.t   ===========> here using v0.t is the only mask register to predicate RVV operation
> >         vle32.v v25,(a1),v0.t
> >         vadd.vv v24,v24,v25
> >         vse32.v v24,(a0),v0.t
> >         add     a2,a2,a4
> >         add     a0,a0,a4
> >         add     a1,a1,a4
> >         bne     a3,zero,.L3
> > .L1:
> >         ret
> > 
> > 
> > This is the how RVV works.
> > Feel free to comment if you have any questions.
>  
> Thanks for the detailed explanation.  Just to clarify - with RVV
> there's only a single mask register, v0.t, or did you want to
> say an instruction can only specify a single mask register?
> ARM SVE would have a loop control mask and a separate mask
> for the if (cond[i]) which would be combined with a mask-and
> instruction to a third mask which is then used on the
> predicated instructions.
>  
> For AVX512 WHILE_ULT is a better match since we need a mask in the
> end (but WHILE_ULT isn't a very good match either, so I'm still
> working on masked loop support there).
>  
> PowerPC and s390x might be able to use WHILE_LEN as well (though
> they only have LEN variants of loads and stores) - of course
> only "simulating it".  For the fixed-vector-length ISAs the
> predicated vector loop IMHO makes most sense for the epilogue to
> handle low-trip loops better.
>  
> Richard.
>  
> > Thanks.
> > 
> > 
> > juzhe.zhong@rivai.ai
> >  
> > From: Richard Biener
> > Date: 2023-04-12 15:00
> > To: Richard Sandiford
> > CC: juzhe.zhong@rivai.ai; gcc-patches; jeffreyalaw
> > Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> > On Tue, 11 Apr 2023, Richard Sandiford wrote:
> >  
> > > "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> > > > Hi, Richards. 
> > > > Kindly Ping this patch. 
> > > > This is the most important patch for RVV auto-vectorization support.
> > > > Bootstraped on X86 has passed.
> > > 
> > > Can it wait for GCC 14?  It doesn't seem like stage 4 material.
> > > 
> > > Also, pinging after 5 days seems a bit soon.  It's been a 4-day
> > > holiday weekend for much of Europe.
> >  
> > Also can you explain why using WHILE_ULT is not possible?  (I've
> > successfully - to some extent - done that for AVX512 for example)
> >  
> > The patch lacks the description of what WHILE_LEN actually is.
> >  
> > Richard.
> >  
> > > Thanks,
> > > Richard
> > > 
> > > > Feel free to comments.
> > > >
> > > > Thanks.
> > > >
> > > >
> > > > juzhe.zhong@rivai.ai
> > > >  
> > > > From: juzhe.zhong
> > > > Date: 2023-04-07 09:47
> > > > To: gcc-patches
> > > > CC: richard.sandiford; rguenther; jeffreyalaw; Juzhe-Zhong
> > > > Subject: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> > > > From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> > > >  
> > > > This patch is to add WHILE_LEN pattern.
> > > > It's inspired by RVV ISA simple "vvaddint32.s" example:
> > > > https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
> > > >  
> > > > More details are in "vect_set_loop_controls_by_while_len" implementation
> > > > and comments.
> > > >  
> > > > Consider such following case:
> > > > #define N 16
> > > > int src[N];
> > > > int dest[N];
> > > >  
> > > > void
> > > > foo (int n)
> > > > {
> > > >   for (int i = 0; i < n; i++)
> > > >     dest[i] = src[i];
> > > > }
> > > >  
> > > > -march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:
> > > >  
> > > > foo:        
> > > >         ble     a0,zero,.L1
> > > >         lui     a4,%hi(.LANCHOR0)
> > > >         addi    a4,a4,%lo(.LANCHOR0)
> > > >         addi    a3,a4,64
> > > >         csrr    a2,vlenb
> > > > .L3:
> > > >         vsetvli a5,a0,e32,m1,ta,ma
> > > >         vle32.v v1,0(a4)
> > > >         sub     a0,a0,a5
> > > >         vse32.v v1,0(a3)
> > > >         add     a4,a4,a2
> > > >         add     a3,a3,a2
> > > >         bne     a0,zero,.L3
> > > > .L1:
> > > >         ret
> > > >  
> > > > gcc/ChangeLog:
> > > >  
> > > >         * doc/md.texi: Add WHILE_LEN support.
> > > >         * internal-fn.cc (while_len_direct): Ditto.
> > > >         (expand_while_len_optab_fn): Ditto.
> > > >         (direct_while_len_optab_supported_p): Ditto.
> > > >         * internal-fn.def (WHILE_LEN): Ditto.
> > > >         * optabs.def (OPTAB_D): Ditto.
> > > >         * tree-ssa-loop-manip.cc (create_iv): Ditto.
> > > >         * tree-ssa-loop-manip.h (create_iv): Ditto.
> > > >         * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
> > > >         (vect_set_loop_condition_partial_vectors): Ditto.
> > > >         * tree-vect-loop.cc (vect_get_loop_len): Ditto.
> > > >         * tree-vect-stmts.cc (vectorizable_store): Ditto.
> > > >         (vectorizable_load): Ditto.
> > > >         * tree-vectorizer.h (vect_get_loop_len): Ditto.
> > > >  
> > > > ---
> > > > gcc/doc/md.texi             |  14 +++
> > > > gcc/internal-fn.cc          |  29 ++++++
> > > > gcc/internal-fn.def         |   1 +
> > > > gcc/optabs.def              |   1 +
> > > > gcc/tree-ssa-loop-manip.cc  |   4 +-
> > > > gcc/tree-ssa-loop-manip.h   |   2 +-
> > > > gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
> > > > gcc/tree-vect-loop.cc       |  35 +++++--
> > > > gcc/tree-vect-stmts.cc      |   9 +-
> > > > gcc/tree-vectorizer.h       |   4 +-
> > > > 10 files changed, 264 insertions(+), 21 deletions(-)
> > > >  
> > > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> > > > index 8e3113599fd..72178ab014c 100644
> > > > --- a/gcc/doc/md.texi
> > > > +++ b/gcc/doc/md.texi
> > > > @@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
> > > >    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
> > > > @end smallexample
> > > > +@cindex @code{while_len@var{m}@var{n}} instruction pattern
> > > > +@item @code{while_len@var{m}@var{n}}
> > > > > +Set operand 0 to the number of active elements in the vector that will be updated.
> > > > > +Operand 1 is the total number of elements that need to be updated.
> > > > > +Operand 2 is the vectorization factor.
> > > > +The operation is equivalent to:
> > > > +
> > > > +@smallexample
> > > > +operand0 = MIN (operand1, operand2);
> > > > +operand2 can be const_poly_int or poly_int related to vector mode size.
> > > > +Some target like RISC-V has a standalone instruction to get MIN (n, MODE SIZE) so
> > > > +that we can reduce a use of general purpose register.
> > > > +@end smallexample
> > > > +
> > > > @cindex @code{check_raw_ptrs@var{m}} instruction pattern
> > > > @item @samp{check_raw_ptrs@var{m}}
> > > > Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
> > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > index 6e81dc05e0e..5f44def90d3 100644
> > > > --- a/gcc/internal-fn.cc
> > > > +++ b/gcc/internal-fn.cc
> > > > @@ -127,6 +127,7 @@ init_internal_fns ()
> > > > #define cond_binary_direct { 1, 1, true }
> > > > #define cond_ternary_direct { 1, 1, true }
> > > > #define while_direct { 0, 2, false }
> > > > +#define while_len_direct { 0, 0, false }
> > > > #define fold_extract_direct { 2, 2, false }
> > > > #define fold_left_direct { 1, 1, false }
> > > > #define mask_fold_left_direct { 1, 1, false }
> > > > @@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> > > >      emit_move_insn (lhs_rtx, ops[0].value);
> > > > }
> > > > +/* Expand WHILE_LEN call STMT using optab OPTAB.  */
> > > > +static void
> > > > +expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> > > > +{
> > > > +  expand_operand ops[3];
> > > > +  tree rhs_type[2];
> > > > +
> > > > +  tree lhs = gimple_call_lhs (stmt);
> > > > +  tree lhs_type = TREE_TYPE (lhs);
> > > > +  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > > +  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
> > > > +
> > > > +  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
> > > > +    {
> > > > +      tree rhs = gimple_call_arg (stmt, i);
> > > > +      rhs_type[i] = TREE_TYPE (rhs);
> > > > +      rtx rhs_rtx = expand_normal (rhs);
> > > > +      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
> > > > +    }
> > > > +
> > > > +  insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
> > > > +
> > > > +  expand_insn (icode, 3, ops);
> > > > +  if (!rtx_equal_p (lhs_rtx, ops[0].value))
> > > > +    emit_move_insn (lhs_rtx, ops[0].value);
> > > > +}
> > > > +
> > > > /* Expand a call to a convert-like optab using the operands in STMT.
> > > >     FN has a single output operand and NARGS input operands.  */
> > > > @@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
> > > > #define direct_scatter_store_optab_supported_p convert_optab_supported_p
> > > > #define direct_len_store_optab_supported_p direct_optab_supported_p
> > > > #define direct_while_optab_supported_p convert_optab_supported_p
> > > > +#define direct_while_len_optab_supported_p direct_optab_supported_p
> > > > #define direct_fold_extract_optab_supported_p direct_optab_supported_p
> > > > #define direct_fold_left_optab_supported_p direct_optab_supported_p
> > > > #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
> > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > index 7fe742c2ae7..3a933abff5d 100644
> > > > --- a/gcc/internal-fn.def
> > > > +++ b/gcc/internal-fn.def
> > > > @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
> > > > DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
> > > > DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> > > > +DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
> > > > DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
> > > >        check_raw_ptrs, check_ptrs)
> > > > DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> > > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > > index 695f5911b30..f5938bd2c24 100644
> > > > --- a/gcc/optabs.def
> > > > +++ b/gcc/optabs.def
> > > > @@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
> > > > OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
> > > > OPTAB_D (len_load_optab, "len_load_$a")
> > > > OPTAB_D (len_store_optab, "len_store_$a")
> > > > +OPTAB_D (while_len_optab, "while_len$a")
> > > > diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> > > > index 09acc1c94cc..cdbf280e249 100644
> > > > --- a/gcc/tree-ssa-loop-manip.cc
> > > > +++ b/gcc/tree-ssa-loop-manip.cc
> > > > @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
> > > > void
> > > > create_iv (tree base, tree step, tree var, class loop *loop,
> > > >    gimple_stmt_iterator *incr_pos, bool after,
> > > > -    tree *var_before, tree *var_after)
> > > > +    tree *var_before, tree *var_after, enum tree_code code)
> > > > {
> > > >    gassign *stmt;
> > > >    gphi *phi;
> > > >    tree initial, step1;
> > > >    gimple_seq stmts;
> > > >    tree vb, va;
> > > > -  enum tree_code incr_op = PLUS_EXPR;
> > > > +  enum tree_code incr_op = code;
> > > >    edge pe = loop_preheader_edge (loop);
> > > >    if (var != NULL_TREE)
> > > > diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> > > > index d49273a3987..da755320a3a 100644
> > > > --- a/gcc/tree-ssa-loop-manip.h
> > > > +++ b/gcc/tree-ssa-loop-manip.h
> > > > @@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
> > > > typedef void (*transform_callback)(class loop *, void *);
> > > > extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
> > > > -        bool, tree *, tree *);
> > > > +        bool, tree *, tree *, enum tree_code = PLUS_EXPR);
> > > > extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
> > > > extern void verify_loop_closed_ssa (bool, class loop * = NULL);
> > > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > > > index f60fa50e8f4..f3cd6c51d2e 100644
> > > > --- a/gcc/tree-vect-loop-manip.cc
> > > > +++ b/gcc/tree-vect-loop-manip.cc
> > > > @@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> > > >    return next_ctrl;
> > > > }
> > > > +/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
> > > > +   for all the rgroup controls in RGC and return a control that is nonzero
> > > > +   when the loop needs to iterate.  Add any new preheader statements to
> > > > +   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
> > > > +
> > > > +   RGC belongs to loop LOOP.  The loop originally iterated NITERS
> > > > +   times and has been vectorized according to LOOP_VINFO.
> > > > +
> > > > +   Unlike vect_set_loop_controls_directly which is iterating from 0-based IV
> > > > +   to TEST_LIMIT - bias.
> > > > +
> > > > +   In vect_set_loop_controls_by_while_len, we are iterating from start at
> > > > +   IV = TEST_LIMIT - bias and keep subtract IV by the length calculated by
> > > > +   IFN_WHILE_LEN pattern.
> > > > +
> > > > +   Note: the cost of the code generated by this function is modeled
> > > > +   by vect_estimate_min_profitable_iters, so changes here may need
> > > > +   corresponding changes there.
> > > > +
> > > > +   1. Single rgroup, the Gimple IR should be:
> > > > +
> > > > + <bb 3>
> > > > + _19 = (unsigned long) n_5(D);
> > > > + ...
> > > > +
> > > > + <bb 4>:
> > > > + ...
> > > > + # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
> > > > + ...
> > > > + _22 = .WHILE_LEN (ivtmp_20, vf);
> > > > + ...
> > > > + vector statement (use _22);
> > > > + ...
> > > > + ivtmp_21 = ivtmp_20 - _22;
> > > > + ...
> > > > + if (ivtmp_21 != 0)
> > > > +   goto <bb 4>; [75.00%]
> > > > + else
> > > > +   goto <bb 5>; [25.00%]
> > > > +
> > > > + <bb 5>
> > > > + return;
> > > > +
> > > > +   Note: IFN_WHILE_LEN will guarantee "ivtmp_21 = ivtmp_20 - _22" never
> > > > +   underflow 0.
> > > > +
> > > > +   2. Multiple rgroup, the Gimple IR should be:
> > > > +
> > > > + <bb 3>
> > > > + _70 = (unsigned long) bnd.7_52;
> > > > + _71 = _70 * 2;
> > > > + _72 = MAX_EXPR <_71, 4>;
> > > > + _73 = _72 + 18446744073709551612;
> > > > + ...
> > > > +
> > > > + <bb 4>:
> > > > + ...
> > > > + # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
> > > > + # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
> > > > + _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
> > > > + _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
> > > > + ...
> > > > + vector statement (use _79);
> > > > + ...
> > > > + vector statement (use _76);
> > > > + ...
> > > > + _65 = _79 / 2;
> > > > + vector statement (use _65);
> > > > + ...
> > > > + _68 = _76 / 2;
> > > > + vector statement (use _68);
> > > > + ...
> > > > + ivtmp_78 = ivtmp_77 - _79;
> > > > + ivtmp_75 = ivtmp_74 - _76;
> > > > + ...
> > > > + if (ivtmp_78 != 0)
> > > > +   goto <bb 4>; [75.00%]
> > > > + else
> > > > +   goto <bb 5>; [25.00%]
> > > > +
> > > > + <bb 5>
> > > > + return;
> > > > +
> > > > +*/
> > > > +
> > > > +static tree
> > > > +vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
> > > > +      gimple_seq *preheader_seq,
> > > > +      gimple_seq *header_seq,
> > > > +      rgroup_controls *rgc, tree niters)
> > > > +{
> > > > +  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> > > > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > > > +  /* We are not allowing masked approach in WHILE_LEN.  */
> > > > +  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > > > +
> > > > +  tree ctrl_type = rgc->type;
> > > > +  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
> > > > +  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
> > > > +  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > > > +
> > > > +  /* Calculate the maximum number of item values that the rgroup
> > > > +     handles in total, the number that it handles for each iteration
> > > > +     of the vector loop.  */
> > > > +  tree nitems_total = niters;
> > > > +  if (nitems_per_iter != 1)
> > > > +    {
> > > > +      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
> > > > + these multiplications don't overflow.  */
> > > > +      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
> > > > +      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
> > > > +    nitems_total, compare_factor);
> > > > +    }
> > > > +
> > > > +  /* Convert the comparison value to the IV type (either a no-op or
> > > > +     a promotion).  */
> > > > +  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> > > > +
> > > > +  /* Create an induction variable that counts the number of items
> > > > +     processed.  */
> > > > +  tree index_before_incr, index_after_incr;
> > > > +  gimple_stmt_iterator incr_gsi;
> > > > +  bool insert_after;
> > > > +  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> > > > +
> > > > +  /* Test the decremented IV, which will never underflow 0 since we have
> > > > > +     IFN_WHILE_LEN to guarantee that.  */
> > > > +  tree test_limit = nitems_total;
> > > > +
> > > > +  /* Provide a definition of each control in the group.  */
> > > > +  tree ctrl;
> > > > +  unsigned int i;
> > > > +  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
> > > > +    {
> > > > +      /* Previous controls will cover BIAS items.  This control covers the
> > > > + next batch.  */
> > > > +      poly_uint64 bias = nitems_per_ctrl * i;
> > > > +      tree bias_tree = build_int_cst (iv_type, bias);
> > > > +
> > > > +      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
> > > > + BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
> > > > + control and adjust the bound down by BIAS.  */
> > > > +      tree this_test_limit = test_limit;
> > > > +      if (i != 0)
> > > > + {
> > > > +   this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
> > > > +   this_test_limit, bias_tree);
> > > > +   this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
> > > > +   this_test_limit, bias_tree);
> > > > + }
> > > > +
> > > > +      /* Create decrement IV.  */
> > > > +      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
> > > > + insert_after, &index_before_incr, &index_after_incr,
> > > > + MINUS_EXPR);
> > > > +
> > > > +      poly_uint64 final_vf = vf * nitems_per_iter;
> > > > +      tree vf_step = build_int_cst (iv_type, final_vf);
> > > > +      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
> > > > +    index_before_incr, vf_step);
> > > > +      gassign *assign = gimple_build_assign (ctrl, res_len);
> > > > +      gimple_seq_add_stmt (header_seq, assign);
> > > > +    }
> > > > +
> > > > +  return index_after_incr;
> > > > +}
> > > > +
> > > > /* Set up the iteration condition and rgroup controls for LOOP, given
> > > >     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
> > > >     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
> > > > @@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> > > >    bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> > > >    tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> > > > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > > >    unsigned int compare_precision = TYPE_PRECISION (compare_type);
> > > >    tree orig_niters = niters;
> > > > @@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> > > > bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> > > > /* Set up all controls for this group.  */
> > > > - test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
> > > > -      &preheader_seq,
> > > > -      &header_seq,
> > > > -      loop_cond_gsi, rgc,
> > > > -      niters, niters_skip,
> > > > -      might_wrap_p);
> > > > + if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> > > > +     OPTIMIZE_FOR_SPEED))
> > > > +   test_ctrl
> > > > +     = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
> > > > +    &preheader_seq, &header_seq,
> > > > +    rgc, niters);
> > > > + else
> > > > +   test_ctrl
> > > > +     = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
> > > > +        &header_seq, loop_cond_gsi, rgc,
> > > > +        niters, niters_skip,
> > > > +        might_wrap_p);
> > > >        }
> > > >    /* Emit all accumulated statements.  */
> > > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > > index 1ba9f18d73e..5bffd9a6322 100644
> > > > --- a/gcc/tree-vect-loop.cc
> > > > +++ b/gcc/tree-vect-loop.cc
> > > > @@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > > >     rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
> > > > tree
> > > > -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > > > -    unsigned int nvectors, unsigned int index)
> > > > +vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
> > > > +    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
> > > > +    unsigned int index)
> > > > {
> > > >    rgroup_controls *rgl = &(*lens)[nvectors - 1];
> > > > -  bool use_bias_adjusted_len =
> > > > -    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> > > > +  bool use_bias_adjusted_len
> > > > +    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> > > > +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > > >    /* Populate the rgroup's len array, if this is the first time we've
> > > >       used it.  */
> > > > @@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > > >   if (use_bias_adjusted_len)
> > > >     {
> > > >       gcc_assert (i == 0);
> > > > -       tree adjusted_len =
> > > > - make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> > > > +       tree adjusted_len
> > > > + = make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> > > >       SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
> > > >       rgl->bias_adjusted_ctrl = adjusted_len;
> > > >     }
> > > > @@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> > > >    if (use_bias_adjusted_len)
> > > >      return rgl->bias_adjusted_ctrl;
> > > > +  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> > > > +    OPTIMIZE_FOR_SPEED))
> > > > +    {
> > > > +      tree loop_len = rgl->controls[index];
> > > > +      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
> > > > +      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
> > > > +      if (maybe_ne (nunits1, nunits2))
> > > > + {
> > > > +   /* A loop len for data type X can be reused for data type Y
> > > > +      if X has N times more elements than Y and if Y's elements
> > > > +      are N times bigger than X's.  */
> > > > +   gcc_assert (multiple_p (nunits1, nunits2));
> > > > +   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
> > > > +   gimple_seq seq = NULL;
> > > > +   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
> > > > +    build_int_cst (iv_type, factor));
> > > > +   if (seq)
> > > > +     gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
> > > > + }
> > > > +      return loop_len;
> > > > +    }
> > > >    else
> > > >      return rgl->controls[index];
> > > > }
> > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > > index efa2d0daa52..708c8a1d806 100644
> > > > --- a/gcc/tree-vect-stmts.cc
> > > > +++ b/gcc/tree-vect-stmts.cc
> > > > @@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
> > > >       else if (loop_lens)
> > > > {
> > > >   tree final_len
> > > > -     = vect_get_loop_len (loop_vinfo, loop_lens,
> > > > - vec_num * ncopies, vec_num * j + i);
> > > > +     = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> > > > + vec_num * ncopies, vectype,
> > > > + vec_num * j + i);
> > > >   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> > > >   machine_mode vmode = TYPE_MODE (vectype);
> > > >   opt_machine_mode new_ovmode
> > > > @@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
> > > >     else if (loop_lens && memory_access_type != VMAT_INVARIANT)
> > > >       {
> > > > tree final_len
> > > > -   = vect_get_loop_len (loop_vinfo, loop_lens,
> > > > -        vec_num * ncopies,
> > > > +   = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> > > > +        vec_num * ncopies, vectype,
> > > >        vec_num * j + i);
> > > > tree ptr = build_int_cst (ref_type,
> > > >   align * BITS_PER_UNIT);
> > > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > > index 9cf2fb23fe3..e5cf38caf4b 100644
> > > > --- a/gcc/tree-vectorizer.h
> > > > +++ b/gcc/tree-vectorizer.h
> > > > @@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
> > > > unsigned int, tree, unsigned int);
> > > > extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> > > >   tree, unsigned int);
> > > > -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> > > > -        unsigned int);
> > > > +extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
> > > > +        vec_loop_lens *, unsigned int, tree, unsigned int);
> > > > extern gimple_seq vect_gen_len (tree, tree, tree, tree);
> > > > extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> > > > extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
> > > 
> >  
> > 
>  
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-12  9:29             ` Richard Biener
@ 2023-04-12  9:42               ` Robin Dapp
  2023-04-12 11:17               ` Richard Sandiford
  1 sibling, 0 replies; 41+ messages in thread
From: Robin Dapp @ 2023-04-12  9:42 UTC (permalink / raw)
  To: Richard Biener, juzhe.zhong
  Cc: richard.sandiford, gcc-patches, jeffreyalaw, linkw, stefansf, krebbel

>> I think we can CC IBM folks to see whether we can make WHILE_LEN works 
>> for both IBM and RVV ?
> 
> I've CCed them.  Adding WHILE_LEN support to rs6000/s390x would be
> mainly the "easy" way to get len-masked (epilog) loop support.  I've
> figured actually implementing WHILE_ULT for AVX512 in the backend
> results in some code generation challenges so I'm going to play
> (again) with open-coding it as outlined above in the vectorizer itself
> so followup passes (mostly IVOPTs) can do a better job.

I'm with Ventana now but haven't updated my affiliation yet.  CC'ing Stefan and Andreas fyi.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-12  9:29             ` Richard Biener
  2023-04-12  9:42               ` Robin Dapp
@ 2023-04-12 11:17               ` Richard Sandiford
  2023-04-12 11:37                 ` juzhe.zhong
  2023-04-12 11:42                 ` Richard Biener
  1 sibling, 2 replies; 41+ messages in thread
From: Richard Sandiford @ 2023-04-12 11:17 UTC (permalink / raw)
  To: Richard Biener; +Cc: juzhe.zhong, gcc-patches, jeffreyalaw, rdapp, linkw

Richard Biener <rguenther@suse.de> writes:
> On Wed, 12 Apr 2023, juzhe.zhong@rivai.ai wrote:
>
>> 
>> >> Thanks for the detailed explanation.  Just to clarify - with RVV
>> >> there's only a single mask register, v0.t, or did you want to
>> >> say an instruction can only specify a single mask register?
>> 
>> RVV has 32 (v0~v31) vector register in total.
>> We can store vector data value or mask value in any of them.
>> We also have mask-logic instruction for example mask-and between any vector register.
>> 
>> However, any vector operation for example like vadd.vv can only  predicated by v0 (in asm is v0.t) which is the first vector register.
>> We can't predicate vadd.vv with v1 - v31.
>> 
>> So, you can image every time we want to use a mask to predicate a vector operation, we should always first store the mask value
>> into v0.
>> 
>> So, we can write intrinsic sequence like this:
>> 
>> vmseq v0,v8,v9 (store mask value to v0)
>> vmslt v1,v10,v11 (store mask value to v1)
>> vmand v0,v0,v1
>> vadd.vv ...v0.t (predicate mask should always be mask).
>
> Ah, I see - that explains it well.
>
>> >> ARM SVE would have a loop control mask and a separate mask
>> >> for the if (cond[i]) which would be combined with a mask-and
>> >> instruction to a third mask which is then used on the
>> >> predicated instructions.
>> 
>> Yeah, I know it. ARM SVE way is a more elegant way than RVV do. 
>> However, for RVV, we can't follow this flow.
>> We don't have a  "whilelo" instruction to generate loop control mask.
>
> Yep.  Similar for AVX512 where I have to use a vector compare.  I'm
> currently using
>
>  { 0, 1, 2 ... } < { remaining_len, remaining_len, ... }
>
> and careful updating of remaining_len (we know it will either
> be adjusted by the full constant vector length or updated to zero).
>
>> We only can do loop control with length generated by vsetvl.
>> And we can only use "v0" to mask predicate vadd.vv, and mask value can only generated by comparison or mask logical instructions. 
>> 
>> >> PowerPC and s390x might be able to use WHILE_LEN as well (though
>> >> they only have LEN variants of loads and stores) - of course
>> >> only "simulating it".  For the fixed-vector-length ISAs the
>> >> predicated vector loop IMHO makes most sense for the epilogue to
>> >> handle low-trip loops better.
>> 
>> Yeah, I wonder how they do the flow control (if (cond[i])). 
>> For RVV, you can image I will need to add a pattern LEN_MASK_LOAD/LEN_MASK_STORE (length generated by WHILE_LEN and mask generated by comparison)
>> 
>> I think we can CC IBM folks to see whether we can make WHILE_LEN works 
>> for both IBM and RVV ?
>
> I've CCed them.  Adding WHILE_LEN support to rs6000/s390x would be
> mainly the "easy" way to get len-masked (epilog) loop support.

I think that already works for them (could be misremembering).
However, IIUC, they have no special instruction to calculate the
length (unlike for RVV), and so it's open-coded using vect_get_len.

I suppose my two questions are:

(1) How easy would it be to express WHILE_LEN in normal gimple?
    I haven't thought about this at all, so the answer might be
    "very hard".  But it reminds me a little of UQDEC on AArch64,
    which we open-code using MAX_EXPR and MINUS_EXPR (see
    vect_set_loop_controls_directly).

    I'm not saying WHILE_LEN is the same operation, just that it seems
    like it might be open-codeable in a similar way.

    Even if we can open-code it, we'd still need some way for the
    target to select the "RVV way" from the "s390/PowerPC way".

(2) What effect does using a variable IV step (the result of
    the WHILE_LEN) have on ivopts?  I remember experimenting with
    something similar once (can't remember the context) and not
    having a constant step prevented ivopts from making good
    addressing-mode choices.

Thanks,
Richard



^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-12 11:17               ` Richard Sandiford
@ 2023-04-12 11:37                 ` juzhe.zhong
  2023-04-12 12:24                   ` Richard Sandiford
  2023-04-12 12:56                   ` Kewen.Lin
  2023-04-12 11:42                 ` Richard Biener
  1 sibling, 2 replies; 41+ messages in thread
From: juzhe.zhong @ 2023-04-12 11:37 UTC (permalink / raw)
  To: richard.sandiford, rguenther; +Cc: gcc-patches, jeffreyalaw, rdapp, linkw

[-- Attachment #1: Type: text/plain, Size: 6418 bytes --]

Thank you. Richard.


>> I think that already works for them (could be misremembering).
>> However, IIUC, they have no special instruction to calculate the
>> length (unlike for RVV), and so it's open-coded using vect_get_len.

Yeah, the current flow using min, sub, and then min in vect_get_len
is working for IBM. But I wonder whether switching the current
length-loop-control flow to the WHILE_LEN pattern that this patch adds can
improve their performance.

>> (1) How easy would it be to express WHILE_LEN in normal gimple?
>>     I haven't thought about this at all, so the answer might be
>>     "very hard".  But it reminds me a little of UQDEC on AArch64,
>>     which we open-code using MAX_EXPR and MINUS_EXPR (see
 >>    vect_set_loop_controls_directly).

  >>   I'm not saying WHILE_LEN is the same operation, just that it seems
  >>   like it might be open-codeable in a similar way.

 >>    Even if we can open-code it, we'd still need some way for the
  >>   target to select the "RVV way" from the "s390/PowerPC way".

WHILE_LEN, as I define it in the doc, is

operand0 = MIN (operand1, operand2)

operand1 is the residual number of scalar elements that still need to be updated.
operand2 is the vectorization factor (vf) for a single rgroup;
         for multiple rgroups, operand2 = vf * nitems_per_ctrl.

Do you mean that such a pattern is not well expressed, so we need to replace it with normal
tree code (MIN or MAX) and let the RISC-V backend optimize it into vsetvl?
Sorry, maybe I am not on the same page.
>> (2) What effect does using a variable IV step (the result of
>> the WHILE_LEN) have on ivopts?  I remember experimenting with
>> something similar once (can't remember the context) and not
>> having a constant step prevented ivopts from making good
>> addresing-mode choices.

Thank you so much for pointing this out. Currently, a variable IV step and decreasing n down to 0
work fine for RISC-V downstream GCC, and we didn't find issues related to addressing-mode selection.

I think I must have missed something. Would you mind giving me some hints so that I can study ivopts
to find out which cases may generate inferior codegen for a variable IV step?

Thank you so much.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-04-12 19:17
To: Richard Biener
CC: juzhe.zhong@rivai.ai; gcc-patches; jeffreyalaw; rdapp; linkw
Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
Richard Biener <rguenther@suse.de> writes:
> On Wed, 12 Apr 2023, juzhe.zhong@rivai.ai wrote:
>
>> 
>> >> Thanks for the detailed explanation.  Just to clarify - with RVV
>> >> there's only a single mask register, v0.t, or did you want to
>> >> say an instruction can only specify a single mask register?
>> 
>> RVV has 32 (v0~v31) vector register in total.
>> We can store vector data value or mask value in any of them.
>> We also have mask-logic instruction for example mask-and between any vector register.
>> 
>> However, any vector operation for example like vadd.vv can only  predicated by v0 (in asm is v0.t) which is the first vector register.
>> We can't predicate vadd.vv with v1 - v31.
>> 
>> So, you can image every time we want to use a mask to predicate a vector operation, we should always first store the mask value
>> into v0.
>> 
>> So, we can write intrinsic sequence like this:
>> 
>> vmseq v0,v8,v9 (store mask value to v0)
>> vmslt v1,v10,v11 (store mask value to v1)
>> vmand v0,v0,v1
>> vadd.vv ...v0.t (predicate mask should always be mask).
>
> Ah, I see - that explains it well.
>
>> >> ARM SVE would have a loop control mask and a separate mask
>> >> for the if (cond[i]) which would be combined with a mask-and
>> >> instruction to a third mask which is then used on the
>> >> predicated instructions.
>> 
>> Yeah, I know it. ARM SVE way is a more elegant way than RVV do. 
>> However, for RVV, we can't follow this flow.
>> We don't have a  "whilelo" instruction to generate loop control mask.
>
> Yep.  Similar for AVX512 where I have to use a vector compare.  I'm
> currently using
>
>  { 0, 1, 2 ... } < { remaining_len, remaining_len, ... }
>
> and careful updating of remaining_len (we know it will either
> be adjusted by the full constant vector length or updated to zero).
>
>> We only can do loop control with length generated by vsetvl.
>> And we can only use "v0" to mask predicate vadd.vv, and mask value can only generated by comparison or mask logical instructions. 
>> 
>> >> PowerPC and s390x might be able to use WHILE_LEN as well (though
>> >> they only have LEN variants of loads and stores) - of course
>> >> only "simulating it".  For the fixed-vector-length ISAs the
>> >> predicated vector loop IMHO makes most sense for the epilogue to
>> >> handle low-trip loops better.
>> 
>> Yeah, I wonder how they do the flow control (if (cond[i])). 
>> For RVV, you can image I will need to add a pattern LEN_MASK_LOAD/LEN_MASK_STORE (length generated by WHILE_LEN and mask generated by comparison)
>> 
>> I think we can CC IBM folks to see whether we can make WHILE_LEN works 
>> for both IBM and RVV ?
>
> I've CCed them.  Adding WHILE_LEN support to rs6000/s390x would be
> mainly the "easy" way to get len-masked (epilog) loop support.
 
I think that already works for them (could be misremembering).
However, IIUC, they have no special instruction to calculate the
length (unlike for RVV), and so it's open-coded using vect_get_len.
 
I suppose my two questions are:
 
(1) How easy would it be to express WHILE_LEN in normal gimple?
    I haven't thought about this at all, so the answer might be
    "very hard".  But it reminds me a little of UQDEC on AArch64,
    which we open-code using MAX_EXPR and MINUS_EXPR (see
    vect_set_loop_controls_directly).
 
    I'm not saying WHILE_LEN is the same operation, just that it seems
    like it might be open-codeable in a similar way.
 
    Even if we can open-code it, we'd still need some way for the
    target to select the "RVV way" from the "s390/PowerPC way".
 
(2) What effect does using a variable IV step (the result of
    the WHILE_LEN) have on ivopts?  I remember experimenting with
    something similar once (can't remember the context) and not
    having a constant step prevented ivopts from making good
    addresing-mode choices.
 
Thanks,
Richard
 
 
 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-12 11:17               ` Richard Sandiford
  2023-04-12 11:37                 ` juzhe.zhong
@ 2023-04-12 11:42                 ` Richard Biener
  1 sibling, 0 replies; 41+ messages in thread
From: Richard Biener @ 2023-04-12 11:42 UTC (permalink / raw)
  To: Richard Sandiford, Richard Biener, juzhe.zhong, gcc-patches,
	jeffreyalaw, rdapp, linkw

On Wed, Apr 12, 2023 at 1:18 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Richard Biener <rguenther@suse.de> writes:
> > On Wed, 12 Apr 2023, juzhe.zhong@rivai.ai wrote:
> >
> >>
> >> >> Thanks for the detailed explanation.  Just to clarify - with RVV
> >> >> there's only a single mask register, v0.t, or did you want to
> >> >> say an instruction can only specify a single mask register?
> >>
> >> RVV has 32 (v0~v31) vector register in total.
> >> We can store vector data value or mask value in any of them.
> >> We also have mask-logic instruction for example mask-and between any vector register.
> >>
> >> However, any vector operation for example like vadd.vv can only  predicated by v0 (in asm is v0.t) which is the first vector register.
> >> We can't predicate vadd.vv with v1 - v31.
> >>
> >> So, you can image every time we want to use a mask to predicate a vector operation, we should always first store the mask value
> >> into v0.
> >>
> >> So, we can write intrinsic sequence like this:
> >>
> >> vmseq v0,v8,v9 (store mask value to v0)
> >> vmslt v1,v10,v11 (store mask value to v1)
> >> vmand v0,v0,v1
> >> vadd.vv ...v0.t (predicate mask should always be mask).
> >
> > Ah, I see - that explains it well.
> >
> >> >> ARM SVE would have a loop control mask and a separate mask
> >> >> for the if (cond[i]) which would be combined with a mask-and
> >> >> instruction to a third mask which is then used on the
> >> >> predicated instructions.
> >>
> >> Yeah, I know it. ARM SVE way is a more elegant way than RVV do.
> >> However, for RVV, we can't follow this flow.
> >> We don't have a  "whilelo" instruction to generate loop control mask.
> >
> > Yep.  Similar for AVX512 where I have to use a vector compare.  I'm
> > currently using
> >
> >  { 0, 1, 2 ... } < { remaining_len, remaining_len, ... }
> >
> > and careful updating of remaining_len (we know it will either
> > be adjusted by the full constant vector length or updated to zero).
> >
> >> We only can do loop control with length generated by vsetvl.
> >> And we can only use "v0" to mask predicate vadd.vv, and mask value can only generated by comparison or mask logical instructions.
> >>
> >> >> PowerPC and s390x might be able to use WHILE_LEN as well (though
> >> >> they only have LEN variants of loads and stores) - of course
> >> >> only "simulating it".  For the fixed-vector-length ISAs the
> >> >> predicated vector loop IMHO makes most sense for the epilogue to
> >> >> handle low-trip loops better.
> >>
> >> Yeah, I wonder how they do the flow control (if (cond[i])).
> >> For RVV, you can image I will need to add a pattern LEN_MASK_LOAD/LEN_MASK_STORE (length generated by WHILE_LEN and mask generated by comparison)
> >>
> >> I think we can CC IBM folks to see whether we can make WHILE_LEN works
> >> for both IBM and RVV ?
> >
> > I've CCed them.  Adding WHILE_LEN support to rs6000/s390x would be
> > mainly the "easy" way to get len-masked (epilog) loop support.
>
> I think that already works for them (could be misremembering).
> However, IIUC, they have no special instruction to calculate the
> length (unlike for RVV), and so it's open-coded using vect_get_len.
>
> I suppose my two questions are:
>
> (1) How easy would it be to express WHILE_LEN in normal gimple?
>     I haven't thought about this at all, so the answer might be
>     "very hard".  But it reminds me a little of UQDEC on AArch64,
>     which we open-code using MAX_EXPR and MINUS_EXPR (see
>     vect_set_loop_controls_directly).
>
>     I'm not saying WHILE_LEN is the same operation, just that it seems
>     like it might be open-codeable in a similar way.

I think WHILE_LEN is saturate-to-zero subtraction.  So when the IV
can be expressed signed

   remain = MAX (0, remain - vf);

the details are more complicated when you need an unsigned IV.

It might be that WHILE_LEN for RVV computes remain % VL
so another MIN around (not sure).  For the AVX512 work I
also have a scalar 'remain' like above but currently I'm adding
a branch

do {
 if (remain < vf)
   mask = ... vector compare ..;
 else
   mask = all-ones;
} while (mask-not-all-zeros);

so I'm using the mask as control "IV".  But that's because I do
open-code WHILE_ULT at RTL expansion time and this is how
the vectorizer works for SVE.

When manually creating a loop mask in the vectorizer tracking
'remain' is easier.  Note the extra control flow complicates the
fully masked variant, for the epilog we know remain < vf and
that we'll immediately exit the loop.

>     Even if we can open-code it, we'd still need some way for the
>     target to select the "RVV way" from the "s390/PowerPC way".
>
> (2) What effect does using a variable IV step (the result of
>     the WHILE_LEN) have on ivopts?  I remember experimenting with
>     something similar once (can't remember the context) and not
>     having a constant step prevented ivopts from making good
>     addresing-mode choices.

Any kind of variable length stuff (WHILE_ULT or WHILE_LEN) will probably
make niter analysis fail.  All IV uses that are not SCEV analyzable will
just remain as-is as IVOPTs cannot deal with them either - but usually
that should be only the control IV.

Richard.

>
> Thanks,
> Richard
>
>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-12 11:37                 ` juzhe.zhong
@ 2023-04-12 12:24                   ` Richard Sandiford
  2023-04-12 14:18                     ` 钟居哲
  2023-04-12 12:56                   ` Kewen.Lin
  1 sibling, 1 reply; 41+ messages in thread
From: Richard Sandiford @ 2023-04-12 12:24 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: rguenther, gcc-patches, jeffreyalaw, rdapp, linkw

"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
>>> I think that already works for them (could be misremembering).
>>> However, IIUC, they have no special instruction to calculate the
>>> length (unlike for RVV), and so it's open-coded using vect_get_len.
>
> Yeah, the current flow using min, sub, and then min in vect_get_len
> is working for IBM. But I wonder whether switching the current flow of
> length-loop-control into the WHILE_LEN pattern that this patch can improve
> their performance.
>
>>> (1) How easy would it be to express WHILE_LEN in normal gimple?
>>>     I haven't thought about this at all, so the answer might be
>>>     "very hard".  But it reminds me a little of UQDEC on AArch64,
>>>     which we open-code using MAX_EXPR and MINUS_EXPR (see
>  >>    vect_set_loop_controls_directly).
>
>   >>   I'm not saying WHILE_LEN is the same operation, just that it seems
>   >>   like it might be open-codeable in a similar way.
>
>  >>    Even if we can open-code it, we'd still need some way for the
>   >>   target to select the "RVV way" from the "s390/PowerPC way".
>
> WHILE_LEN in doc I define is
> operand0 = MIN (operand1, operand2)operand1 is the residual number of scalar elements need to be updated.operand2 is vectorization factor (vf) for single rgroup.         if multiple rgroup operan2 = vf * nitems_per_ctrl.You mean such pattern is not well expressed so we need to replace it with normaltree code (MIN OR MAX). And let RISC-V backend to optimize them into vsetvl ?Sorry, maybe I am not on the same page.

It's not so much that we need to do that.  But normally it's only worth
adding internal functions if they do something that is too complicated
to express in simple gimple arithmetic.  The UQDEC case I mentioned:

   z = MAX (x, y) - y

fell into the "simple arithmetic" category for me.  We could have added
an ifn for unsigned saturating decrement, but it didn't seem complicated
enough to merit its own ifn.

>>> (2) What effect does using a variable IV step (the result of
>>> the WHILE_LEN) have on ivopts?  I remember experimenting with
>>> something similar once (can't remember the context) and not
>>> having a constant step prevented ivopts from making good
>>> addresing-mode choices.
>
> Thank you so much for pointing out this. Currently, varialble IV step and decreasing n down to 0 
> works fine for RISC-V downstream GCC and we didn't find issues related addressing-mode choosing.

OK, that's good.  Sounds like it isn't a problem then.

> I think I must missed something, would you mind giving me some hints so that I can study on ivopts
> to find out which case may generate inferior codegens for varialble IV step?

I think AArch64 was sensitive to this because (a) the vectoriser creates
separate IVs for each base address and (b) for SVE, we instead want
invariant base addresses that are indexed by the loop control IV.
Like Richard says, if the loop control IV isn't a SCEV, ivopts isn't
able to use it and so (b) fails.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-12 11:37                 ` juzhe.zhong
  2023-04-12 12:24                   ` Richard Sandiford
@ 2023-04-12 12:56                   ` Kewen.Lin
  2023-04-12 13:22                     ` 钟居哲
  1 sibling, 1 reply; 41+ messages in thread
From: Kewen.Lin @ 2023-04-12 12:56 UTC (permalink / raw)
  To: juzhe.zhong, richard.sandiford, rguenther; +Cc: gcc-patches, jeffreyalaw, rdapp

Hi!

on 2023/4/12 19:37, juzhe.zhong@rivai.ai wrote:
> Thank you. Richard. >> I think that already works for them (could be misremembering). >> However, IIUC, they have no special instruction to calculate the >> length (unlike for RVV), and so it's open-coded using vect_get_len. 
> 
> Thank you. Richard.
> 
> 
>>> I think that already works for them (could be misremembering).
>>> However, IIUC, they have no special instruction to calculate the
>>> length (unlike for RVV), and so it's open-coded using vect_get_len.
> 

Yeah, Richard is right, we don't have some special hardware instruction
for efficient length calculation.

> Yeah, the current flow using min, sub, and then min in vect_get_len
> is working for IBM. But I wonder whether switching the current flow of
> length-loop-control into the WHILE_LEN pattern that this patch can improve
> their performance.

Based on some cons for the vector load/store with length in bytes on Power
(like we need one extra GPR holding the length, the length needs to be the
most significant 8 bits requiring an extra shifting etc.), we use normal
vector load/store in main loop and only use vector load/store with length
for the epilogue.  For the epilogue, the remaining length is known less
than the whole vector length, so the related sequence can be optimized.
I just had a check on s390 code, which also enables it only for the
epilogue.  From this perspective, this WHILE_LEN proposal may not give us
more.  But for the case of vect-partial-vector-usage=2 (fully adopting
vector with length on the main loop), I think the proposed sequence looks
better to me.

BR,
Kewen

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-12 12:56                   ` Kewen.Lin
@ 2023-04-12 13:22                     ` 钟居哲
  2023-04-13  7:29                       ` Kewen.Lin
  0 siblings, 1 reply; 41+ messages in thread
From: 钟居哲 @ 2023-04-12 13:22 UTC (permalink / raw)
  To: linkw, richard.sandiford, rguenther; +Cc: gcc-patches, Jeff Law, rdapp

[-- Attachment #1: Type: text/plain, Size: 2447 bytes --]

Thanks Kewen. 

It seems that the proposed WHILE_LEN can help s390 when using the --param vect-partial-vector-usage=2 compile option.

Would you mind applying this patch, supporting WHILE_LEN in the s390 backend, and testing it to see the overall benefits for s390
as well as the correctness of this sequence?
If it creates any correctness issues for s390 or rs6000 (I saw len_load/len_store in rs6000 too), I can fix this patch for you.

I hope both RVV and IBM targets can gain benefits from this patch.

Thanks.

juzhe.zhong@rivai.ai

From: Kewen.Lin
Date: 2023-04-12 20:56
To: juzhe.zhong@rivai.ai; richard.sandiford; rguenther
CC: gcc-patches; jeffreyalaw; rdapp
Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
Hi!

on 2023/4/12 19:37, juzhe.zhong@rivai.ai wrote:
> Thank you. Richard. >> I think that already works for them (could be misremembering). >> However, IIUC, they have no special instruction to calculate the >> length (unlike for RVV), and so it's open-coded using vect_get_len. 
> 
> Thank you. Richard.
> 
> 
>>> I think that already works for them (could be misremembering).
>>> However, IIUC, they have no special instruction to calculate the
>>> length (unlike for RVV), and so it's open-coded using vect_get_len.
> 

Yeah, Richard is right, we don't have some special hardware instruction
for efficient length calculation.

> Yeah, the current flow using min, sub, and then min in vect_get_len
> is working for IBM. But I wonder whether switching the current flow of
> length-loop-control into the WHILE_LEN pattern that this patch can improve
> their performance.

Based on some cons for the vector load/store with length in bytes on Power
(like we need one extra GPR holding the length, the length needs to be the
most significant 8 bits requiring an extra shifting etc.), we use normal
vector load/store in main loop and only use vector load/store with length
for the epilogue.  For the epilogue, the remaining length is known less
than the whole vector length, so the related sequence can be optimized.
I just had a check on s390 code, which also enables it only for the
epilogue.  From this perspective, this WHILE_LEN proposal may not give us
more.  But for the case of vect-partial-vector-usage=2 (fully adopting
vector with length on the main loop), I think the proposed sequence looks
better to me.

BR,
Kewen

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-12 12:24                   ` Richard Sandiford
@ 2023-04-12 14:18                     ` 钟居哲
  2023-04-13  6:47                       ` Richard Biener
  0 siblings, 1 reply; 41+ messages in thread
From: 钟居哲 @ 2023-04-12 14:18 UTC (permalink / raw)
  To: richard.sandiford
  Cc: rguenther, gcc-patches, Jeff Law, rdapp, linkw, kito.cheng

[-- Attachment #1: Type: text/plain, Size: 4987 bytes --]

>> It's not so much that we need to do that.  But normally it's only worth
>> adding internal functions if they do something that is too complicated
>> to express in simple gimple arithmetic.  The UQDEC case I mentioned:

>>    z = MAX (x, y) - y

>> fell into the "simple arithmetic" category for me.  We could have added
>> an ifn for unsigned saturating decrement, but it didn't seem complicated
>> enough to merit its own ifn.

Ah, I understand your concern. I should admit that WHILE_LEN is a simple arithmetic operation
which just takes the result of

min (remain,vf).

A possible solution is to just use MIN_EXPR (remain,vf),
then add special handling in the umin_optab pattern to recognize "vf" in the backend,
and finally generate vsetvl in the RISC-V backend.

"vf" would be recognized when the operand of umin is a const_int/const_poly_int operand.
Otherwise, we just generate a scalar umin instruction.

However, there is a case where I can't tell whether umin should generate vsetvl or a scalar umin. It is the following case:
void foo (int32_t a)
{
  return min (a, 4);
}

In this case I should generate:
li a1,4
umin a1,a0,a1

instead of generating vsetvl

However, in this case:

void foo (int32_t *a...)
for (int i = 0; i < n; i++)
  a[i] = b[i] + c[i];

with -mriscv-vector-bits=128 (which means each vector can handle 4 INT32)
Then the VF will be 4 too. If we also use MIN_EXPR instead of WHILE_LEN:

...
len = MIN_EXPR (n,4)
v = len_load (len)
....
...

In this case, MIN_EXPR should emit vsetvl.

It's hard for me to tell the difference between these 2 cases...

CC RISC-V port backend maintainer: Kito.

juzhe.zhong@rivai.ai

From: Richard Sandiford
Date: 2023-04-12 20:24
To: juzhe.zhong@rivai.ai
CC: rguenther; gcc-patches; jeffreyalaw; rdapp; linkw
Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
>>> I think that already works for them (could be misremembering).
>>> However, IIUC, they have no special instruction to calculate the
>>> length (unlike for RVV), and so it's open-coded using vect_get_len.
>
> Yeah, the current flow using min, sub, and then min in vect_get_len
> is working for IBM. But I wonder whether switching the current flow of
> length-loop-control into the WHILE_LEN pattern that this patch can improve
> their performance.
>
>>> (1) How easy would it be to express WHILE_LEN in normal gimple?
>>>     I haven't thought about this at all, so the answer might be
>>>     "very hard".  But it reminds me a little of UQDEC on AArch64,
>>>     which we open-code using MAX_EXPR and MINUS_EXPR (see
>  >>    vect_set_loop_controls_directly).
>
>   >>   I'm not saying WHILE_LEN is the same operation, just that it seems
>   >>   like it might be open-codeable in a similar way.
>
>  >>    Even if we can open-code it, we'd still need some way for the
>   >>   target to select the "RVV way" from the "s390/PowerPC way".
>
> WHILE_LEN in doc I define is
> operand0 = MIN (operand1, operand2)operand1 is the residual number of scalar elements need to be updated.operand2 is vectorization factor (vf) for single rgroup.         if multiple rgroup operan2 = vf * nitems_per_ctrl.You mean such pattern is not well expressed so we need to replace it with normaltree code (MIN OR MAX). And let RISC-V backend to optimize them into vsetvl ?Sorry, maybe I am not on the same page.

It's not so much that we need to do that.  But normally it's only worth
adding internal functions if they do something that is too complicated
to express in simple gimple arithmetic.  The UQDEC case I mentioned:

   z = MAX (x, y) - y

fell into the "simple arithmetic" category for me.  We could have added
an ifn for unsigned saturating decrement, but it didn't seem complicated
enough to merit its own ifn.

>>> (2) What effect does using a variable IV step (the result of
>>> the WHILE_LEN) have on ivopts?  I remember experimenting with
>>> something similar once (can't remember the context) and not
>>> having a constant step prevented ivopts from making good
>>> addresing-mode choices.
>
> Thank you so much for pointing out this. Currently, varialble IV step and decreasing n down to 0 
> works fine for RISC-V downstream GCC and we didn't find issues related addressing-mode choosing.

OK, that's good.  Sounds like it isn't a problem then.

> I think I must missed something, would you mind giving me some hints so that I can study on ivopts
> to find out which case may generate inferior codegens for varialble IV step?

I think AArch64 was sensitive to this because (a) the vectoriser creates
separate IVs for each base address and (b) for SVE, we instead want
invariant base addresses that are indexed by the loop control IV.
Like Richard says, if the loop control IV isn't a SCEV, ivopts isn't
able to use it and so (b) fails.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-12 14:18                     ` 钟居哲
@ 2023-04-13  6:47                       ` Richard Biener
  2023-04-13  9:54                         ` juzhe.zhong
  0 siblings, 1 reply; 41+ messages in thread
From: Richard Biener @ 2023-04-13  6:47 UTC (permalink / raw)
  To: 钟居哲
  Cc: richard.sandiford, gcc-patches, Jeff Law, rdapp, linkw, kito.cheng

On Wed, 12 Apr 2023, ??? wrote:

> >> It's not so much that we need to do that.  But normally it's only worth
> >> adding internal functions if they do something that is too complicated
> >> to express in simple gimple arithmetic.  The UQDEC case I mentioned:
> 
> >>    z = MAX (x, y) - y
> 
> >> fell into the "simple arithmetic" category for me.  We could have added
> >> an ifn for unsigned saturating decrement, but it didn't seem complicated
> >> enough to merit its own ifn.
> 
> Ah, I known your concern. I should admit that WHILE_LEN is a simple arithmetic operation
> which is just taking result from
> 
> min (remain,vf).
> 
> The possible solution is to just use MIN_EXPR (remain,vf).
> Then, add speciall handling in umin_optab pattern to recognize "vf" in the backend.
> Finally generate vsetvl in RISC-V backend.
> 
> The "vf" should be recognized as the operand of umin should be const_int/const_poly_int operand.
> Otherwise, just generate umin scalar instruction..
> 
> However, there is a case that I can't recognize umin should generate vsetvl or umin. Is this following case:
> void foo (int32_t a)
> {
>   return min (a, 4);
> }
> 
> In this case I should generate:
> li a1,4
> umin a1,a0,a1
> 
> instead of generating vsetvl
> 
> However, in this case:
> 
> void foo (int32_t *a...)
> for (int i = 0; i < n; i++)
>   a[i] = b[i] + c[i];
> 
> with -mriscv-vector-bits=128 (which means each vector can handle 4 INT32)
> Then the VF will be 4 too. If we also MIN_EXPR instead WHILE_LEN:
> 
> ...
> len = MIN_EXPR (n,4)
> v = len_load (len)
> ....
> ...
> 
> In this case, MIN_EXPR should emit vsetvl.
> 
> It's hard for me to tell the difference between these 2 cases...

But the issue is the same in the reverse with WHILE_LEN, no?
WHILE_LEN just computes a scalar value - you seem to suggest
there's a hidden side-effect of "coalescing" the result with
a hardware vector length register?  I don't think that's good design.

IMHO tieing the scalar result with the uses has to be done where
you emit the other vsetvl instructions.

One convenient thing we have with WHILE_LEN is that it is a key
for the vectorizer to query target capabilities (and preferences).
But of course collecting whether stmts can be vectorized
with length and/or with mask would be better.

Richard.

> CC RISC-V port backend maintainer: Kito.
> 
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Sandiford
> Date: 2023-04-12 20:24
> To: juzhe.zhong\@rivai.ai
> CC: rguenther; gcc-patches; jeffreyalaw; rdapp; linkw
> Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> >>> I think that already works for them (could be misremembering).
> >>> However, IIUC, they have no special instruction to calculate the
> >>> length (unlike for RVV), and so it's open-coded using vect_get_len.
> >
> > Yeah, the current flow using min, sub, and then min in vect_get_len
> > is working for IBM. But I wonder whether switching the current flow of
> > length-loop-control into the WHILE_LEN pattern that this patch can improve
> > their performance.
> >
> >>> (1) How easy would it be to express WHILE_LEN in normal gimple?
> >>>     I haven't thought about this at all, so the answer might be
> >>>     "very hard".  But it reminds me a little of UQDEC on AArch64,
> >>>     which we open-code using MAX_EXPR and MINUS_EXPR (see
> >  >>    vect_set_loop_controls_directly).
> >
> >   >>   I'm not saying WHILE_LEN is the same operation, just that it seems
> >   >>   like it might be open-codeable in a similar way.
> >
> >  >>    Even if we can open-code it, we'd still need some way for the
> >   >>   target to select the "RVV way" from the "s390/PowerPC way".
> >
> > WHILE_LEN in doc I define is
> > operand0 = MIN (operand1, operand2)
> > operand1 is the residual number of scalar elements that need to be updated.
> > operand2 is the vectorization factor (vf) for a single rgroup.
> >          If there are multiple rgroups, operand2 = vf * nitems_per_ctrl.
> > You mean such a pattern is not well expressed, so we need to replace it with normal
> > tree code (MIN or MAX) and let the RISC-V backend optimize it into vsetvl?
> > Sorry, maybe I am not on the same page.
>  
> It's not so much that we need to do that.  But normally it's only worth
> adding internal functions if they do something that is too complicated
> to express in simple gimple arithmetic.  The UQDEC case I mentioned:
>  
>    z = MAX (x, y) - y
>  
> fell into the "simple arithmetic" category for me.  We could have added
> an ifn for unsigned saturating decrement, but it didn't seem complicated
> enough to merit its own ifn.
>  
> >>> (2) What effect does using a variable IV step (the result of
> >>> the WHILE_LEN) have on ivopts?  I remember experimenting with
> >>> something similar once (can't remember the context) and not
> >>> having a constant step prevented ivopts from making good
> >>> addresing-mode choices.
> >
> > Thank you so much for pointing out this. Currently, varialble IV step and decreasing n down to 0 
> > works fine for RISC-V downstream GCC and we didn't find issues related addressing-mode choosing.
>  
> OK, that's good.  Sounds like it isn't a problem then.
>  
> > I think I must missed something, would you mind giving me some hints so that I can study on ivopts
> > to find out which case may generate inferior codegens for varialble IV step?
>  
> I think AArch64 was sensitive to this because (a) the vectoriser creates
> separate IVs for each base address and (b) for SVE, we instead want
> invariant base addresses that are indexed by the loop control IV.
> Like Richard says, if the loop control IV isn't a SCEV, ivopts isn't
> able to use it and so (b) fails.
>  
> Thanks,
> Richard
>  
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-12 13:22                     ` 钟居哲
@ 2023-04-13  7:29                       ` Kewen.Lin
  2023-04-13 13:44                         ` 钟居哲
  0 siblings, 1 reply; 41+ messages in thread
From: Kewen.Lin @ 2023-04-13  7:29 UTC (permalink / raw)
  To: 钟居哲
  Cc: gcc-patches, Jeff Law, rdapp, richard.sandiford, rguenther

Hi Juzhe,

on 2023/4/12 21:22, 钟居哲 wrote:
> Thanks Kewen. 
> 
> It seems that this proposal WHILE_LEN can help s390 when using --param vect-partial-vector-usage=2 compile option.
> 

Yeah, IMHO, the previous sequence vs. the proposed sequence are like:

int
foo (int *__restrict a, int *__restrict b, int n)
{
  if (n <= 0)
    return 0;

  int iv = 0;
  int len = MIN (n, 16);
  int sum = 0;
  do
    {
      sum += a[len] + b[len];
      iv += 16;
      int n1 = MIN (n, iv);                   // line A
      int n2 = n - n1;
      len = MIN (n2, 16);
    }
  while (n > iv);

  return sum;
}

vs.

int
foo (int *__restrict a, int *__restrict b, int n)
{
  if (n <= 0)
    return 0;

  int len;
  int sum = 0;
  do
    {
      len = MIN (n, 16);
      sum += a[len] + b[len];
      n -= len;
    }
  while (n > 0);

  return sum;
}

it at least saves one MIN (at line A) and one length preparation in the
last iteration (it's useless since loop ends).  But I think the concern
that this proposed IV isn't recognized as simple iv may stay.  I tried
to compile the above source files on Power, the former can adopt doloop
optimization but the latter fails to.

> Would you mind apply this patch && support WHILE_LEN in s390 backend and test it to see the overal benefits for s390
> as well as the correctness of this sequence ? 

Sure, if all of you think this approach and this revision is good enough to go forward for this kind of evaluation,
I'm happy to give it a shot, but only for rs6000. ;-)  I noticed that there are some discussions on withdrawing this
WHILE_LEN by using MIN_EXPR instead, I'll stay tuned.

btw, now we only adopt vector with length on the epilogues rather than the main vectorized loops, because of the
non-trivial extra costs for length preparation than just using the normal vector load/store (all lanes), so we don't
care about the performance with --param vect-partial-vector-usage=2 much.  Even if this new proposal can optimize
the length preparation for --param vect-partial-vector-usage=2, the extra costs for length preparation is still
unavoidable (MIN, shifting, one more GPR used), we would still stay with default --param vect-partial-vector-usage=1
(which can't benefit from this new proposal).

BR,
Kewen

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-13  6:47                       ` Richard Biener
@ 2023-04-13  9:54                         ` juzhe.zhong
  2023-04-18  9:32                           ` Richard Sandiford
  0 siblings, 1 reply; 41+ messages in thread
From: juzhe.zhong @ 2023-04-13  9:54 UTC (permalink / raw)
  To: rguenther
  Cc: richard.sandiford, gcc-patches, jeffreyalaw, rdapp, linkw, kito.cheng

[-- Attachment #1: Type: text/plain, Size: 10602 bytes --]

Thank you Richard.

>> But the issue is the same in the reverse with WHILE_LEN, no?
>>WHILE_LEN just computes a scalar value - you seem to suggest
>>there's a hidden side-effect of "coalescing" the result with
>>a hardware vector length register?  I don't think that's good design.
No, I don't plan to suggest there's a hidden side-effect of "coalescing"
the result with a hardware vector length register.

Today, I read the RVV ISA closely again. I realized that this patch is not strictly correct for
all RVV hardware.

According to the RVV ISA, a vsetvli instruction of the form "vsetvli vl, avl, vtype"
sets vl subject to the following constraints:
vl = AVL if AVL ≤ VLMAX
ceil(AVL / 2) ≤ vl ≤ VLMAX if AVL < (2 * VLMAX)
vl = VLMAX if AVL ≥ (2 * VLMAX)
Deterministic on any given implementation for the same input AVL and VLMAX values
The second constraint means that the result of vsetvli is not necessarily VLMAX (the maximum number of elements that an RVV CPU with a specific vector length can update).

So for a vsetvli instruction (vsetvli vl,avl,vtype), the "vl" value can vary among different RVV CPUs, depending on the implementation of the downstream RVV hardware.

 Now I think I should fix this patch since this patch is not always suitable for all hardware.

So according to RVV ISA:
For example, this permits an implementation to set vl = ceil(AVL / 2) for VLMAX < AVL < 2*VLMAX in order to evenly distribute work over the last two iterations of a stripmine loop.

We can have the following 2 different RVV CPUs:

Suppose the total number of elements that need to be updated is 10 (int32_t), and the vector length is 256 bits (at most 8 INT32 elements updated per iteration).

So we need 2 iterations, and the number of elements updated in each iteration depends on the hardware implementation.

So the following 2 hardware implementations are both legal under the RVV standard:

RVV CPU 1:
1st iteration updates 5 elements (it satisfies the constraint ceil (AVL/2) <= vl <= VLMAX), setting vl = ceil (AVL/2) = 5.
2nd iteration updates 5 elements too.

RVV CPU 2:
1st iteration updates 8 elements, setting vl = VLMAX = 8.
2nd iteration updates the remaining 2 elements.

These 2 RVV CPUs are both legal according to the RVV specification.
It's obvious that this patch is correct for RVV CPU 2 but incorrect for RVV CPU 1.

Since the current flow of this patch is as follows:

+	<bb 3>
+	_19 = (unsigned long) n_5(D);
+	...
+
+	<bb 4>:
+	...
+	# ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
+	...
+	_22 = .WHILE_LEN (ivtmp_20, vf);
+	...
+	LEN_LOAD (addr, _22);
+	...
+	addr = addr + vf;
+	ivtmp_21 = ivtmp_20 - _22;
+	...
+	if (ivtmp_21 != 0)
+	  goto <bb 4>; [75.00%]
+	else
+	  goto <bb 5>; [25.00%]
+
+	<bb 5>
+	return;

Here _22, the output of WHILE_LEN, is only used in ivtmp_21 = ivtmp_20 - _22;
which implements the saturating-to-zero subtraction.
The address, in contrast, is updated with "addr = addr + vf;",
i.e. the loop just keeps adding vf to the address.
I think such a sequence is OK for most RVV CPUs so far.
However, for future compatibility, we should use the WHILE_LEN output as the step of the address IV too.

So we should abandon the current way of updating the address, which just keeps adding vf,

and replace "addr = addr + vf".

Instead, we should do like that:

_22 = .WHILE_LEN (ivtmp_20, vf);
....
LEN_LOAD (addr, _22);
tmp = _22 * 4; (Assuming an INT32 calculation: scale _22, which counts INT32 elements, to a byte count for the address computation)
addr = addr + tmp;
....
This makes the result of WHILE_LEN be used not only for remain = remain - len,
but also in the address calculation: tmp = len * (element bytesize); addr = addr + tmp;
Then this flow is correct for all RVV CPUs.
This flow is exactly the same as the example in the RVV ISA specification:
https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
I think I need to change this patch as described above to make it suitable for all RVV CPUs in the world. But I am not sure whether the GCC community will accept this flow, so I am proposing it now before I implement it.
I didn't realize this earlier because my downstream RVV hardware and the open-source simulator both generate "vl" = VLMAX. (Sorry about that)

Expecting any suggestions and comments.
Thank you so much.

juzhe.zhong@rivai.ai

From: Richard Biener
Date: 2023-04-13 14:47
To: 钟居哲
CC: richard.sandiford; gcc-patches; Jeff Law; rdapp; linkw; kito.cheng
Subject: Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
On Wed, 12 Apr 2023, ??? wrote:

> >> It's not so much that we need to do that.  But normally it's only worth
> >> adding internal functions if they do something that is too complicated
> >> to express in simple gimple arithmetic.  The UQDEC case I mentioned:
> 
> >>    z = MAX (x, y) - y
> 
> >> fell into the "simple arithmetic" category for me.  We could have added
> >> an ifn for unsigned saturating decrement, but it didn't seem complicated
> >> enough to merit its own ifn.
> 
> Ah, I known your concern. I should admit that WHILE_LEN is a simple arithmetic operation
> which is just taking result from
> 
> min (remain,vf).
> 
> The possible solution is to just use MIN_EXPR (remain,vf).
> Then, add speciall handling in umin_optab pattern to recognize "vf" in the backend.
> Finally generate vsetvl in RISC-V backend.
> 
> The "vf" should be recognized as the operand of umin should be const_int/const_poly_int operand.
> Otherwise, just generate umin scalar instruction..
> 
> However, there is a case that I can't recognize umin should generate vsetvl or umin. Is this following case:
> void foo (int32_t a)
> {
>   return min (a, 4);
> }
> 
> In this case I should generate:
> li a1,4
> umin a1,a0,a1
> 
> instead of generating vsetvl
> 
> However, in this case:
> 
> void foo (int32_t *a...)
> for (int i = 0; i < n; i++)
>   a[i] = b[i] + c[i];
> 
> with -mriscv-vector-bits=128 (which means each vector can handle 4 INT32)
> Then the VF will be 4 too. If we also MIN_EXPR instead WHILE_LEN:
> 
> ...
> len = MIN_EXPR (n,4)
> v = len_load (len)
> ....
> ...
> 
> In this case, MIN_EXPR should emit vsetvl.
> 
> It's hard for me to tell the difference between these 2 cases...

But the issue is the same in the reverse with WHILE_LEN, no?
WHILE_LEN just computes a scalar value - you seem to suggest
there's a hidden side-effect of "coalescing" the result with
a hardware vector length register?  I don't think that's good design.

IMHO tieing the scalar result with the uses has to be done where
you emit the other vsetvl instructions.

One convenient thing we have with WHILE_LEN is that it is a key
for the vectorizer to query target capabilities (and preferences).
But of course collecting whether stmts can be vectorized
with length and/or with mask would be better.

Richard.

> CC RISC-V port backend maintainer: Kito.
> 
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Sandiford
> Date: 2023-04-12 20:24
> To: juzhe.zhong\@rivai.ai
> CC: rguenther; gcc-patches; jeffreyalaw; rdapp; linkw
> Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> >>> I think that already works for them (could be misremembering).
> >>> However, IIUC, they have no special instruction to calculate the
> >>> length (unlike for RVV), and so it's open-coded using vect_get_len.
> >
> > Yeah, the current flow using min, sub, and then min in vect_get_len
> > is working for IBM. But I wonder whether switching the current flow of
> > length-loop-control into the WHILE_LEN pattern that this patch can improve
> > their performance.
> >
> >>> (1) How easy would it be to express WHILE_LEN in normal gimple?
> >>>     I haven't thought about this at all, so the answer might be
> >>>     "very hard".  But it reminds me a little of UQDEC on AArch64,
> >>>     which we open-code using MAX_EXPR and MINUS_EXPR (see
> >  >>    vect_set_loop_controls_directly).
> >
> >   >>   I'm not saying WHILE_LEN is the same operation, just that it seems
> >   >>   like it might be open-codeable in a similar way.
> >
> >  >>    Even if we can open-code it, we'd still need some way for the
> >   >>   target to select the "RVV way" from the "s390/PowerPC way".
> >
> > WHILE_LEN in doc I define is
> > operand0 = MIN (operand1, operand2)
> > operand1 is the residual number of scalar elements that need to be updated.
> > operand2 is the vectorization factor (vf) for a single rgroup.
> >          If there are multiple rgroups, operand2 = vf * nitems_per_ctrl.
> > You mean such a pattern is not well expressed, so we need to replace it with normal
> > tree code (MIN or MAX) and let the RISC-V backend optimize it into vsetvl?
> > Sorry, maybe I am not on the same page.
>  
> It's not so much that we need to do that.  But normally it's only worth
> adding internal functions if they do something that is too complicated
> to express in simple gimple arithmetic.  The UQDEC case I mentioned:
>  
>    z = MAX (x, y) - y
>  
> fell into the "simple arithmetic" category for me.  We could have added
> an ifn for unsigned saturating decrement, but it didn't seem complicated
> enough to merit its own ifn.
>  
> >>> (2) What effect does using a variable IV step (the result of
> >>> the WHILE_LEN) have on ivopts?  I remember experimenting with
> >>> something similar once (can't remember the context) and not
> >>> having a constant step prevented ivopts from making good
> >>> addresing-mode choices.
> >
> > Thank you so much for pointing out this. Currently, varialble IV step and decreasing n down to 0 
> > works fine for RISC-V downstream GCC and we didn't find issues related addressing-mode choosing.
>  
> OK, that's good.  Sounds like it isn't a problem then.
>  
> > I think I must missed something, would you mind giving me some hints so that I can study on ivopts
> > to find out which case may generate inferior codegens for varialble IV step?
>  
> I think AArch64 was sensitive to this because (a) the vectoriser creates
> separate IVs for each base address and (b) for SVE, we instead want
> invariant base addresses that are indexed by the loop control IV.
> Like Richard says, if the loop control IV isn't a SCEV, ivopts isn't
> able to use it and so (b) fails.
>  
> Thanks,
> Richard
>  
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-13  7:29                       ` Kewen.Lin
@ 2023-04-13 13:44                         ` 钟居哲
  2023-04-14  2:54                           ` Kewen.Lin
  0 siblings, 1 reply; 41+ messages in thread
From: 钟居哲 @ 2023-04-13 13:44 UTC (permalink / raw)
  To: linkw; +Cc: gcc-patches, Jeff Law, rdapp, richard.sandiford, rguenther

[-- Attachment #1: Type: text/plain, Size: 3843 bytes --]

Thanks Kewen.

Current flow in this patch like you said:
....
len = WHILE_LEN (n,vf);
...
v = len_load (addr,len);
..
addr = addr + vf (in byte align);
....

This patch just keeps advancing the address by the vectorization factor (scaled to bytes).
For example, if your vector length is 512 bits, then this patch just updates the address as
addr = addr + 64;

However, after reading the RVV ISA more closely today, I think it would be more appropriate
to update the address as: addr = addr + (len * 4), if len is a number of INT32 elements,
where len is the result computed by WHILE_LEN.

I assume that for IBM targets it's better to just update the address IV by directly adding the whole
register byte size, since I think the second way (address = addr + (len * 4)) is too RVV specific and won't be suitable for IBM. Is that right?
If so, I will keep the flow of this patch (and won't change it to address = addr + (len * 4)) and see what else I need to do for IBM.
I would rather do the RVV-specific part in the RISC-V backend port.

>> I tried
>>to compile the above source files on Power, the former can adopt doloop
>>optimization but the latter fails to. 
You mean GCC cannot do hardware loop optimization when the loop control IV is variable?

juzhe.zhong@rivai.ai

From: Kewen.Lin
Date: 2023-04-13 15:29
To: 钟居哲
CC: gcc-patches; Jeff Law; rdapp; richard.sandiford; rguenther
Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
Hi Juzhe,

on 2023/4/12 21:22, 钟居哲 wrote:
> Thanks Kewen. 
> 
> It seems that this proposal WHILE_LEN can help s390 when using --param vect-partial-vector-usage=2 compile option.
> 

Yeah, IMHO, the previous sequence vs. the proposed sequence are like:

int
foo (int *__restrict a, int *__restrict b, int n)
{
  if (n <= 0)
    return 0;

  int iv = 0;
  int len = MIN (n, 16);
  int sum = 0;
  do
    {
      sum += a[len] + b[len];
      iv += 16;
      int n1 = MIN (n, iv);                   // line A
      int n2 = n - n1;
      len = MIN (n2, 16);
    }
  while (n > iv);

  return sum;
}

vs.

int
foo (int *__restrict a, int *__restrict b, int n)
{
  if (n <= 0)
    return 0;

  int len;
  int sum = 0;
  do
    {
      len = MIN (n, 16);
      sum += a[len] + b[len];
      n -= len;
    }
  while (n > 0);

  return sum;
}

it at least saves one MIN (at line A) and one length preparation in the
last iteration (it's useless since loop ends).  But I think the concern
that this proposed IV isn't recognized as simple iv may stay.  I tried
to compile the above source files on Power, the former can adopt doloop
optimization but the latter fails to.

> Would you mind apply this patch && support WHILE_LEN in s390 backend and test it to see the overal benefits for s390
> as well as the correctness of this sequence ? 

Sure, if all of you think this approach and this revision is good enough to go forward for this kind of evaluation,
I'm happy to give it a shot, but only for rs6000. ;-)  I noticed that there are some discussions on withdrawing this
WHILE_LEN by using MIN_EXPR instead, I'll stay tuned.

btw, now we only adopt vector with length on the epilogues rather than the main vectorized loops, because of the
non-trivial extra costs for length preparation than just using the normal vector load/store (all lanes), so we don't
care about the performance with --param vect-partial-vector-usage=2 much.  Even if this new proposal can optimize
the length preparation for --param vect-partial-vector-usage=2, the extra costs for length preparation is still
unavoidable (MIN, shifting, one more GPR used), we would still stay with default --param vect-partial-vector-usage=1
(which can't benefit from this new proposal).

BR,
Kewen

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-13 13:44                         ` 钟居哲
@ 2023-04-14  2:54                           ` Kewen.Lin
  2023-04-14  3:09                             ` juzhe.zhong
  2023-04-14  3:39                             ` juzhe.zhong
  0 siblings, 2 replies; 41+ messages in thread
From: Kewen.Lin @ 2023-04-14  2:54 UTC (permalink / raw)
  To: 钟居哲
  Cc: gcc-patches, Jeff Law, rdapp, richard.sandiford, rguenther

Hi Juzhe,

on 2023/4/13 21:44, 钟居哲 wrote:
> Thanks Kewen.
> 
> Current flow in this patch like you said:
> ....
> len = WHILE_LEN (n,vf);
> ...
> v = len_load (addr,len);
> ..
> addr = addr + vf (in byte align);
> ....
> 
> This patch is just keep adding address with a vector factor (adjust as byte align).
> For example, if your vector length = 512bit. Then this patch is just updating address as
> addr = addr + 64;
> 
> However, today after I read RVV ISA more deeply, it should be more appropriate that
> the address should updated as : addr = addr + (len * 4) if len is element number of INT32.
> the len is the result by WHILE_LEN which calculate the len.

I just read your detailed explanation on the usage of vsetvli insn (really appreciate that),
it looks that this WHILE_LEN wants some more semantics than MIN, so I assume you still want
to introduce this WHILE_LEN.

> 
> I assume for IBM target, it's better to just update address directly adding the whole register bytesize 
> in address IV. Since I think the second way (address = addr + (len * 4)) is too RVV specific, and won't be suitable for IBM. Is that right?

Yes, we just want to add the whole vector register length in bytes.

> If it is true, I will keep this patch flow (won't change to  address = addr + (len * 4)) to see what else I need to do for IBM.
> I would rather do that in RISC-V backend port.

IMHO, you don't need to push this down to RV backend, just query these ports having len_{load,store}
support with a target hook or special operand in optab while_len (see internal_len_load_store_bias)
for this need, and generate different codes accordingly.  IIUC, for WHILE_LEN, you want it to have
the semantics as what vsetvli performs, but for IBM ports, it would be just like MIN_EXPR, maybe we
can also generate MIN or WHILE_LEN based on this kind of target information.

If the above assumption holds, I wonder if you also want WHILE_LEN to have the implicit effect
to update vector length register?  If yes, the codes with multiple rgroups looks unexpected:

+	_76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
+	_79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);

as the latter one seems to override the former.  Besides, if the given operands are known constants,
it can't directly be folded into constants and do further propagation.   From this perspective, Richi's
suggestion on "tieing the scalar result with the uses" looks better IMHO.

> 
>>> I tried
>>>to compile the above source files on Power, the former can adopt doloop
>>>optimization but the latter fails to. 
> You mean GCC can not do hardward loop optimization when IV loop control is variable ? 

No, for both cases, IV is variable, the dumping at loop2_doloop for the proposed sequence says
"Doloop: Possible infinite iteration case.", it seems to show that for the proposed sequence compiler 
isn't able to figure out the loop is finite, it may miss the range information on n, or it isn't
able to analyze how the invariant involves, but I didn't look into it, all my guesses.

BR,
Kewen

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-14  2:54                           ` Kewen.Lin
@ 2023-04-14  3:09                             ` juzhe.zhong
  2023-04-14  5:40                               ` Kewen.Lin
  2023-04-14  3:39                             ` juzhe.zhong
  1 sibling, 1 reply; 41+ messages in thread
From: juzhe.zhong @ 2023-04-14  3:09 UTC (permalink / raw)
  To: linkw; +Cc: gcc-patches, jeffreyalaw, rdapp, richard.sandiford, rguenther

[-- Attachment #1: Type: text/plain, Size: 4742 bytes --]

>> Yes, we just wants to add the whole vector register length in bytes.
OK, I learn it and appreciate you give me the information.

>> I wonder if you also want WHILE_LEN to have the implicit effect
>>to update vector length register?
>>From this perspective, Richi's
>>suggestion on "tieing the scalar result with the uses" looks better IMHO.
No, I don't want to make WHILE_LEN have an implicit side effect.
I just want to tie the scalar result to its uses.
As for updating the vector length register, I will let the RISC-V backend port do that.
I don't want to bring any RISC-V specific feature into the GCC middle-end.

>>No, for both cases, IV is variable, the dumping at loop2_doloop for the proposed sequence says
>>"Doloop: Possible infinite iteration case.", it seems to show that for the proposed sequence compiler
>>isn't able to figure out the loop is finite, it may miss the range information on n, or it isn't
>>able to analyze how the invariant involves, but I didn't look into it, all my guesses.
Ok, I think it may be fixed in the future.

So, I wonder whether you basically agree with the concept of this patch?
Would you mind giving more suggestions on how I can fix this patch so that it brings more benefit for IBM (s390 or rs6000)?
For example, will you try this patch to see whether it works for IBM in the case of multiple rgroups with SLP?

Thanks.

juzhe.zhong@rivai.ai

From: Kewen.Lin
Date: 2023-04-14 10:54
To: 钟居哲
CC: gcc-patches; Jeff Law; rdapp; richard.sandiford; rguenther
Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
Hi Juzhe,

on 2023/4/13 21:44, 钟居哲 wrote:
> Thanks Kewen.
> 
> Current flow in this patch like you said:
> ....
> len = WHILE_LEN (n,vf);
> ...
> v = len_load (addr,len);
> ..
> addr = addr + vf (in byte align);
> ....
> 
> This patch is just keep adding address with a vector factor (adjust as byte align).
> For example, if your vector length = 512bit. Then this patch is just updating address as
> addr = addr + 64;
> 
> However, today after I read RVV ISA more deeply, it should be more appropriate that
> the address should updated as : addr = addr + (len * 4) if len is element number of INT32.
> the len is the result by WHILE_LEN which calculate the len.

I just read your detailed explanation on the usage of vsetvli insn (really appreciate that),
it looks that this WHILE_LEN wants some more semantics than MIN, so I assume you still want
to introduce this WHILE_LEN.

> 
> I assume for IBM target, it's better to just update address directly adding the whole register bytesize 
> in address IV. Since I think the second way (address = addr + (len * 4)) is too RVV specific, and won't be suitable for IBM. Is that right?

Yes, we just wants to add the whole vector register length in bytes.

> If it is true, I will keep this patch flow (won't change to  address = addr + (len * 4)) to see what else I need to do for IBM.
> I would rather do that in RISC-V backend port.

IMHO, you don't need to push this down to RV backend, just query these ports having len_{load,store}
support with a target hook or special operand in optab while_len (see internal_len_load_store_bias)
for this need, and generate different codes accordingly.  IIUC, for WHILE_LEN, you want it to have
the semantics as what vsetvli performs, but for IBM ports, it would be just like MIN_EXPR, maybe we
can also generate MIN or WHILE_LEN based on this kind of target information.

If the above assumption holds, I wonder if you also want WHILE_LEN to have the implicit effect
to update vector length register?  If yes, the codes with multiple rgroups looks unexpected:

+ _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
+ _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);

as the latter one seems to override the former.  Besides, if the given operands are known constants,
it can't directly be folded into constants and do further propagation.   From this perspective, Richi's
suggestion on "tieing the scalar result with the uses" looks better IMHO.

> 
>>> I tried
>>>to compile the above source files on Power, the former can adopt doloop
>>>optimization but the latter fails to. 
> You mean GCC can not do hardward loop optimization when IV loop control is variable ? 

No, for both cases, IV is variable, the dumping at loop2_doloop for the proposed sequence says
"Doloop: Possible infinite iteration case.", it seems to show that for the proposed sequence compiler 
isn't able to figure out the loop is finite, it may miss the range information on n, or it isn't
able to analyze how the invariant involves, but I didn't look into it, all my guesses.

BR,
Kewen

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-14  2:54                           ` Kewen.Lin
  2023-04-14  3:09                             ` juzhe.zhong
@ 2023-04-14  3:39                             ` juzhe.zhong
  2023-04-14  6:31                               ` Kewen.Lin
  2023-04-14  6:52                               ` Richard Biener
  1 sibling, 2 replies; 41+ messages in thread
From: juzhe.zhong @ 2023-04-14  3:39 UTC (permalink / raw)
  To: linkw; +Cc: gcc-patches, jeffreyalaw, rdapp, richard.sandiford, rguenther

[-- Attachment #1: Type: text/plain, Size: 4024 bytes --]

And also, I have already decided to remove the WHILE_LEN pattern since it seems to be unnecessary.
And as Richard said, it's just simple arithmetic and it's not worthwhile to do that.

So, I plan to replace WHILE_LEN into MIN_EXPR and make everything RVV specific done in RISC-V port.
I think it's more reasonable for IBM use and more target use in the future.

So, this patch will need to be changed to "introduce a new flow to do vectorization loop control", which is a new loop control flow
with saturating subtraction of n down to zero, and add a target hook for it so that we can switch to this flow?

Is it more reasonable ?
Thanks.

juzhe.zhong@rivai.ai

From: Kewen.Lin
Date: 2023-04-14 10:54
To: 钟居哲
CC: gcc-patches; Jeff Law; rdapp; richard.sandiford; rguenther
Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
Hi Juzhe,

on 2023/4/13 21:44, 钟居哲 wrote:
> Thanks Kewen.
> 
> Current flow in this patch like you said:
> ....
> len = WHILE_LEN (n,vf);
> ...
> v = len_load (addr,len);
> ..
> addr = addr + vf (in byte align);
> ....
> 
> This patch is just keep adding address with a vector factor (adjust as byte align).
> For example, if your vector length = 512bit. Then this patch is just updating address as
> addr = addr + 64;
> 
> However, today after I read RVV ISA more deeply, it should be more appropriate that
> the address should updated as : addr = addr + (len * 4) if len is element number of INT32.
> the len is the result by WHILE_LEN which calculate the len.

I just read your detailed explanation on the usage of vsetvli insn (really appreciate that),
it looks that this WHILE_LEN wants some more semantics than MIN, so I assume you still want
to introduce this WHILE_LEN.

> 
> I assume for IBM target, it's better to just update address directly adding the whole register bytesize 
> in address IV. Since I think the second way (address = addr + (len * 4)) is too RVV specific, and won't be suitable for IBM. Is that right?

Yes, we just wants to add the whole vector register length in bytes.

> If it is true, I will keep this patch flow (won't change to  address = addr + (len * 4)) to see what else I need to do for IBM.
> I would rather do that in RISC-V backend port.

IMHO, you don't need to push this down to RV backend, just query these ports having len_{load,store}
support with a target hook or special operand in optab while_len (see internal_len_load_store_bias)
for this need, and generate different codes accordingly.  IIUC, for WHILE_LEN, you want it to have
the semantics as what vsetvli performs, but for IBM ports, it would be just like MIN_EXPR, maybe we
can also generate MIN or WHILE_LEN based on this kind of target information.

If the above assumption holds, I wonder if you also want WHILE_LEN to have the implicit effect
to update vector length register?  If yes, the codes with multiple rgroups looks unexpected:

+ _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
+ _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);

as the latter one seems to override the former.  Besides, if the given operands are known constants,
it can't directly be folded into constants and do further propagation.   From this perspective, Richi's
suggestion on "tieing the scalar result with the uses" looks better IMHO.

> 
>>> I tried
>>>to compile the above source files on Power, the former can adopt doloop
>>>optimization but the latter fails to. 
> You mean GCC can not do hardward loop optimization when IV loop control is variable ? 

No, for both cases, IV is variable, the dumping at loop2_doloop for the proposed sequence says
"Doloop: Possible infinite iteration case.", it seems to show that for the proposed sequence compiler 
isn't able to figure out the loop is finite, it may miss the range information on n, or it isn't
able to analyze how the invariant involves, but I didn't look into it, all my guesses.

BR,
Kewen

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-14  3:09                             ` juzhe.zhong
@ 2023-04-14  5:40                               ` Kewen.Lin
  0 siblings, 0 replies; 41+ messages in thread
From: Kewen.Lin @ 2023-04-14  5:40 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: gcc-patches, jeffreyalaw, rdapp, richard.sandiford, rguenther

Hi Juzhe,

>>> Yes, we just wants to add the whole vector register length in bytes.
> OK, I learn it and appreciate you give me the information.
> 
>>> I wonder if you also want WHILE_LEN to have the implicit effect
>>>to update vector length register?
>>>From this perspective, Richi's
>>>suggestion on "tieing the scalar result with the uses" looks better IMHO.
> No, I don't want to make WHILE_LEN have implict side-effect.
> Just tieing the scalar result with the uses.
> Updating vector length register, I let RISC-V backend port to do that.
> I don't want to involve any RISC-V specific feature into GCC middle-end.
> 

Good, thanks for clarifying, that makes more sense.

>>>No, for both cases, IV is variable, the dumping at loop2_doloop for the proposed sequence says
>>>"Doloop: Possible infinite iteration case.", it seems to show that for the proposed sequence compiler
>>>isn't able to figure out the loop is finite, it may miss the range information on n, or it isn't
>>>able to analyze how the invariant involves, but I didn't look into it, all my guesses.
> Ok, I think it may be fixed in the future.

Yeah, it can be.  It only matters for us when adopting --param vect-partial-vector-usage=2 but it's not
default.

> 
> So, I wonder whether you are basically agree with the concept of this patch?
> Would you mind giving more suggestions  that I can fix this patch to make more benefits for IBM (s390 or rs6000)?
> For example, will you try this patch to see whether it can work for IBM in case of multiple rgroup of SLP?

The concept looks good to me, for IBM ports, it can benefit the length preparation for the case of --param
vect-partial-vector-usage=2 (excepting for possible missing doloop chance), it's neutral for the case of
--param vect-partial-vector-usage=1.  IMHO, if possible you can extend the current function vect_set_loop_controls_directly
rather than adding a new function vect_set_loop_controls_by_while_len, since that function does handle both
masks and lengths (controls).  And as vect_gen_len's comments shows, once you change the length preparation,
you have to adjust the corresponding costs as well.  And sure, once this becomes stable (all decisions from
the discussions settled down, gets fully reviewed in stage 1), I'll test it on Power10 and get back to you.

BR,
Kewen

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-14  3:39                             ` juzhe.zhong
@ 2023-04-14  6:31                               ` Kewen.Lin
  2023-04-14  6:39                                 ` juzhe.zhong
  2023-04-14  6:52                               ` Richard Biener
  1 sibling, 1 reply; 41+ messages in thread
From: Kewen.Lin @ 2023-04-14  6:31 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: gcc-patches, jeffreyalaw, rdapp, richard.sandiford, rguenther

Hi Juzhe,

on 2023/4/14 11:39, juzhe.zhong@rivai.ai wrote:
> And also I already decided to make remove WHILE_LEN pattern since it seems to be unnecessary.
> And as Richard said, it's just a simple airthmetic and it's not worthwhile to do that.
> 
> So, I plan to replace WHILE_LEN into MIN_EXPR and make everything RVV specific done in RISC-V port.

Yeah, MIN_EXPR is enough for IBM ports, but with seeing the special semantic of vsetvli on
"vl = ceil(AVL / 2) for VLMAX < AVL < 2*VLMAX", I'm not sure if it's a good idea for RV, it seems
to put the burden to RV backend.  For one case that the iteration count is known, on the environment
with the above capability, using the vector setting as [1], assuming the given iterations is 10,
fully unrolled, when using MIN_EXPR, the lengths for two iterations would be folded into 8/2, while
using WHILE_LEN artificial folding can make the lengths be 5/5.  I assumed that on the environment
with the above capability 5/5 is optimal than 8/2? that means if we use MIN then RV backend has to
try to make 8/2 to 5/5.  Or it's trivial since RV backend already supports and plans to support this
kind of vsetvli load re-balancing?

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-April/615634.html

> I think it's more reasonable for IBM use and more target use in the future.
> 

If RV needs WHILE_LEN, IMHO they can co-exist, like: for ports defining len_{load,store} but no
while_len, use MIN; for ports defining while_len, then use WHILE_LEN.

> So, this patch will need to changed as "introduce a new flow to do vectorization loop control" which is a new loop control flow
> with saturating subtracting n down to zero, and add a target hook for it so that we can switch to this flow ?

Yes, if you don't need WHILE_LEN, this proposal is more like to enhance the current partial
vectorization with length (mainly on length preparation and loop control).  But why would we need
a new target hook?  You want to keep the existing length handlings in vect_set_loop_controls_directly
unchanged? it seems not necessary.  IIUC, not requiring WHILE_LEN also means that this patch
doesn't necessarily block the other RV backend patches on vector with length exploitation since
the existing vector with length support already works well on functionality.

BR,
Kewen

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-14  6:31                               ` Kewen.Lin
@ 2023-04-14  6:39                                 ` juzhe.zhong
  2023-04-14  7:41                                   ` Kewen.Lin
  0 siblings, 1 reply; 41+ messages in thread
From: juzhe.zhong @ 2023-04-14  6:39 UTC (permalink / raw)
  To: linkw; +Cc: gcc-patches, jeffreyalaw, rdapp, richard.sandiford, rguenther

[-- Attachment #1: Type: text/plain, Size: 4432 bytes --]

>> Yeah, MIN_EXPR is enough for IBM ports, but with seeing the special semantic of vsetvli on
>> "vl = ceil(AVL / 2) for VLMAX < AVL < 2*VLMAX", I'm not sure if it's a good idea for RV, it seems
>> to put the burden to RV backend.  For one case that the iteration count is known, on the environment
>> with the above capability, using the vector setting as [1], assuming the given iterations is 10,
>> fully unrolled, when using MIN_EXPR, the lengths for two iterations would be folded into 8/2, while
>> using WHILE_LEN artificial folding can make the lengths be 5/5.  I assumed that on the environment
>> with the above capability 5/5 is optimal than 8/2? that means if we use MIN then RV backend has to
>> try to make 8/2 to 5/5.  Or it's trivial since RV backend already supports and plans to support this
>> kind of vsetvli load re-balancing?
This is a trivial power optimization feature of RVV. I don't think making it in middle-end is a good idea
since the middle-end is supposed to be totally target-independent. And I figured out re-balancing
vsetvli is not difficult to do that in RISC-V port. 

>> Yes, if you don't need WHILE_LEN, this proposal is more like to enhance the current partial
>> vectorization with length (mainly on length preparation and loop control).  But why would we need
>> a new target hook?  You want to keep the existing length handlings in vect_set_loop_controls_directly
>> unchanged? it seems not necessary.  IIUC, not requiring WHILE_LEN also means that this patch
>> doesn't necessarily block the other RV backend patches on vector with length exploitation since
>> the existing vector with length support already works well on functionality.
Ok, I get your point. I am gonna refine the patch to make it work for both RVV and IBM.

Thanks all your comments.

juzhe.zhong@rivai.ai

From: Kewen.Lin
Date: 2023-04-14 14:31
To: juzhe.zhong@rivai.ai
CC: gcc-patches; jeffreyalaw; rdapp; richard.sandiford; rguenther
Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
Hi Juzhe,

on 2023/4/14 11:39, juzhe.zhong@rivai.ai wrote:
> And also I already decided to make remove WHILE_LEN pattern since it seems to be unnecessary.
> And as Richard said, it's just a simple airthmetic and it's not worthwhile to do that.
> 
> So, I plan to replace WHILE_LEN into MIN_EXPR and make everything RVV specific done in RISC-V port.

Yeah, MIN_EXPR is enough for IBM ports, but with seeing the special semantic of vsetvli on
"vl = ceil(AVL / 2) for VLMAX < AVL < 2*VLMAX", I'm not sure if it's a good idea for RV, it seems
to put the burden to RV backend.  For one case that the iteration count is known, on the environment
with the above capability, using the vector setting as [1], assuming the given iterations is 10,
fully unrolled, when using MIN_EXPR, the lengths for two iterations would be folded into 8/2, while
using WHILE_LEN artificial folding can make the lengths be 5/5.  I assumed that on the environment
with the above capability 5/5 is optimal than 8/2? that means if we use MIN then RV backend has to
try to make 8/2 to 5/5.  Or it's trivial since RV backend already supports and plans to support this
kind of vsetvli load re-balancing?

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-April/615634.html

> I think it's more reasonable for IBM use and more target use in the future.
> 

If RV needs WHILE_LEN, IMHO they can co-exist, like: for ports defining len_{load,store} but no
while_len, use MIN; for ports defining while_len, then use WHILE_LEN.

> So, this patch will need to changed as "introduce a new flow to do vectorization loop control" which is a new loop control flow
> with saturating subtracting n down to zero, and add a target hook for it so that we can switch to this flow ?

Yes, if you don't need WHILE_LEN, this proposal is more like to enhance the current partial
vectorization with length (mainly on length preparation and loop control).  But why would we need
a new target hook?  You want to keep the existing length handlings in vect_set_loop_controls_directly
unchanged? it seems not necessary.  IIUC, not requiring WHILE_LEN also means that this patch
doesn't necessarily block the other RV backend patches on vector with length exploitation since
the existing vector with length support already works well on functionality.

BR,
Kewen

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-14  3:39                             ` juzhe.zhong
  2023-04-14  6:31                               ` Kewen.Lin
@ 2023-04-14  6:52                               ` Richard Biener
  1 sibling, 0 replies; 41+ messages in thread
From: Richard Biener @ 2023-04-14  6:52 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: linkw, gcc-patches, jeffreyalaw, rdapp, richard.sandiford

On Fri, 14 Apr 2023, juzhe.zhong@rivai.ai wrote:

> And also I already decided to make remove WHILE_LEN pattern since it seems to be unnecessary.
> And as Richard said, it's just a simple airthmetic and it's not worthwhile to do that.
> 
> So, I plan to replace WHILE_LEN into MIN_EXPR and make everything RVV specific done in RISC-V port.
> I think it's more reasonable for IBM use and more target use in the future.
> 
> So, this patch will need to changed as "introduce a new flow to do vectorization loop control" which is a new loop control flow
> with saturating subtracting n down to zero, and add a target hook for it so that we can switch to this flow ?
> 
> Is it more reasonable ?

I think we want to change the various IVs the vectorizer uses to
control the exit condition of prologue/vect/epilogue loops to a single
one counting the remaining _scalar_ iterations to zero.  Currently
it's somewhat of a mess which also leads to difficult to CSE expressions
based on derived values of such an IV.

But yes, whether for example the vector loop control stmt should
be a test for zero mask (while-ult) or zero scalar iterations
(or (signed) <= zero) could be subject to a new target hook if it
isn't an obvious choice based on HW capability checks we can already
do.

Richard.

> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Kewen.Lin
> Date: 2023-04-14 10:54
> To: ???
> CC: gcc-patches; Jeff Law; rdapp; richard.sandiford; rguenther
> Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> Hi Juzhe,
>  
> on 2023/4/13 21:44, ??? wrote:
> > Thanks Kewen.
> > 
> > Current flow in this patch like you said:
> > ....
> > len = WHILE_LEN (n,vf);
> > ...
> > v = len_load (addr,len);
> > ..
> > addr = addr + vf (in byte align);
> > ....
> > 
> > This patch is just keep adding address with a vector factor (adjust as byte align).
> > For example, if your vector length = 512bit. Then this patch is just updating address as
> > addr = addr + 64;
> > 
> > However, today after I read RVV ISA more deeply, it should be more appropriate that
> > the address should updated as : addr = addr + (len * 4) if len is element number of INT32.
> > the len is the result by WHILE_LEN which calculate the len.
>  
> I just read your detailed explanation on the usage of vsetvli insn (really appreciate that),
> it looks that this WHILE_LEN wants some more semantics than MIN, so I assume you still want
> to introduce this WHILE_LEN.
>  
> > 
> > I assume for IBM target, it's better to just update address directly adding the whole register bytesize 
> > in address IV. Since I think the second way (address = addr + (len * 4)) is too RVV specific, and won't be suitable for IBM. Is that right?
>  
> Yes, we just wants to add the whole vector register length in bytes.
>  
> > If it is true, I will keep this patch flow (won't change to  address = addr + (len * 4)) to see what else I need to do for IBM.
> > I would rather do that in RISC-V backend port.
>  
> IMHO, you don't need to push this down to RV backend, just query these ports having len_{load,store}
> support with a target hook or special operand in optab while_len (see internal_len_load_store_bias)
> for this need, and generate different codes accordingly.  IIUC, for WHILE_LEN, you want it to have
> the semantics as what vsetvli performs, but for IBM ports, it would be just like MIN_EXPR, maybe we
> can also generate MIN or WHILE_LEN based on this kind of target information.
>  
> If the above assumption holds, I wonder if you also want WHILE_LEN to have the implicit effect
> to update vector length register?  If yes, the codes with multiple rgroups looks unexpected:
>  
> + _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
> + _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
>  
> as the latter one seems to override the former.  Besides, if the given operands are known constants,
> it can't directly be folded into constants and do further propagation.   From this perspective, Richi's
> suggestion on "tieing the scalar result with the uses" looks better IMHO.
>  
> > 
> >>> I tried
> >>>to compile the above source files on Power, the former can adopt doloop
> >>>optimization but the latter fails to. 
> > You mean GCC can not do hardward loop optimization when IV loop control is variable ? 
>  
> No, for both cases, IV is variable, the dumping at loop2_doloop for the proposed sequence says
> "Doloop: Possible infinite iteration case.", it seems to show that for the proposed sequence compiler 
> isn't able to figure out the loop is finite, it may miss the range information on n, or it isn't
> able to analyze how the invariant involves, but I didn't look into it, all my guesses.
>  
> BR,
> Kewen
>  
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-14  6:39                                 ` juzhe.zhong
@ 2023-04-14  7:41                                   ` Kewen.Lin
  0 siblings, 0 replies; 41+ messages in thread
From: Kewen.Lin @ 2023-04-14  7:41 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: gcc-patches, jeffreyalaw, rdapp, richard.sandiford, rguenther

on 2023/4/14 14:39, juzhe.zhong@rivai.ai wrote:
>>> Yeah, MIN_EXPR is enough for IBM ports, but with seeing the special semantic of vsetvli on
>>> "vl = ceil(AVL / 2) for VLMAX < AVL < 2*VLMAX", I'm not sure if it's a good idea for RV, it seems
>>> to put the burden to RV backend.  For one case that the iteration count is known, on the environment
>>> with the above capability, using the vector setting as [1], assuming the given iterations is 10,
>>> fully unrolled, when using MIN_EXPR, the lengths for two iterations would be folded into 8/2, while
>>> using WHILE_LEN artificial folding can make the lengths be 5/5.  I assumed that on the environment
>>> with the above capability 5/5 is optimal than 8/2? that means if we use MIN then RV backend has to
>>> try to make 8/2 to 5/5.  Or it's trivial since RV backend already supports and plans to support this
>>> kind of vsetvli load re-balancing?
> This is a trivial power optimization feature of RVV. I don't think making it in middle-end is a good idea
> since the middle-end is supposed to be totally target-independent. And I figured out re-balancing
> vsetvli is not difficult to do that in RISC-V port. 
> 

OK, thanks for clarifying, sounds good.

> 
>>> Yes, if you don't need WHILE_LEN, this proposal is more like to enhance the current partial
>>> vectorization with length (mainly on length preparation and loop control).  But why would we need
>>> a new target hook?  You want to keep the existing length handlings in vect_set_loop_controls_directly
>>> unchanged? it seems not necessary.  IIUC, not requiring WHILE_LEN also means that this patch
>>> doesn't necessarily block the other RV backend patches on vector with length exploitation since
>>> the existing vector with length support already works well on functionality.
> Ok, I get your point. I am gonna refine the patch to make it work for both RVV and IBM.

Thanks!

BR,
Kewen

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-13  9:54                         ` juzhe.zhong
@ 2023-04-18  9:32                           ` Richard Sandiford
  0 siblings, 0 replies; 41+ messages in thread
From: Richard Sandiford @ 2023-04-18  9:32 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: rguenther, gcc-patches, jeffreyalaw, rdapp, linkw, kito.cheng

"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
>>> But the issue is the same in the reverse with WHILE_LEN, no?
>>>WHILE_LEN just computes a scalar value - you seem to suggest
>>>there's a hidden side-effect of "coalescing" the result with
>>>a hardware vector length register?  I don't think that's good design.
> No, I don't plan to suggest there's a hidden side-effect of "coalescing"
> the result with a hardware vector length register.
>
> Today, I read RVV ISA deeply again. I realize that this patch is not absolute correct for 
> any RVV hardward.
>
> According to RVV ISA, the vsetvl definition:
> an vsetvli instruction which is vsetvli vl, avl, vtype
> vl = AVL if AVL ≤ VLMAX
> ceil(AVL / 2) ≤ vl ≤ VLMAX if AVL < (2 * VLMAX)
> vl = VLMAX if AVL ≥ (2 * VLMAX)
> Deterministic on any given implementation for same input AVL and VLMAX values
> The second constraint make the result of vsetvli is not necessary to be VLMAX (the maximum number of elements will be updated of specific vector-length RVV CPU).
>
> So for a vsetvli instruction (vsetvli vl,avl,vtype). The "vl" value can be various among different RVV CPU depending on the implementation of the downstream RVV hardware.
>
>
>  Now I think I should fix this patch since this patch is not always suitable for all hardware.
>
> So according to RVV ISA:
> For example, this permits an implementation to set vl = ceil(AVL / 2) for VLMAX < AVL < 2*VLMAX in order to evenly distribute work over the last two iterations of a stripmine loop.
>
> We can have these 2 following different RVV CPU:
>
> Suppose  the maximum number of the elements needs to be updated is 10 element (int32_t), and the vector length = 256 bit (update 8 INT32 elements in max).
>
> So there are 2 iterations we need, the number elements of each iteration depending on hardware implementation.
>
> So we can have these 2 following hardware implementation are both legal for RVV standard:
>
> RVV CPU 1:
> 1st iteration update 5 element (it satisfy the constraint ceil (AVL/2) <= vl <= VLMAX), set vl = ceil (AVL/2) = 5
> 2nd iteration update 5 elements too.
>
> RVV CPU 2:
> 1st iteration update 8 elements. set vl = VLMAX = 8.
> 2nd iteration update 3 elments.
>
> These 2 RVV CPU are both legal according to RVV specification standard.
> It's obvious this patch is correct for RVV CPU 2 but incorrect for RVV CPU 1.

Ah, OK.  In that case, I guess a new ifn like WHILE_LEN will be needed
after all.

> Since the current flow of this patch is as follows:
>
> +	<bb 3>
> +	_19 = (unsigned long) n_5(D);
> +	...
> +
> +	<bb 4>:
> +	...
> +	# ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
> +	...
> +	_22 = .WHILE_LEN (ivtmp_20, vf);
> +	...
> +	LEN_LOAD (addr, _22);
> +	...
> +	addr = addr + vf;
> +	ivtmp_21 = ivtmp_20 - _22;
> +	...
> +	if (ivtmp_21 != 0)
> +	  goto <bb 4>; [75.00%]
> +	else
> +	  goto <bb 5>; [25.00%]
> +
> +	<bb 5>
> +	return;
>
> Here the _22 which is the output of WHILE_LEN is only used in ivtmp_21 = ivtmp_20 - _22;
> which serves the saturating-to-zero subtraction. 
> And "addr = addr + vf;" 
> The address is calculated in the loop just keep add vf. 
> Such sequence is Ok for most of the RVV CPU so far I think.
> However, for future compatibility, we should make WHILE_LEN output as the address IV adding value too.
>
> So, we should abandon the current the address loop way which is just keeping add vf.
>
> Replace "addr = addr + vf".
>
> Instead, we should do like that:
>
> _22 = .WHILE_LEN (ivtmp_20, vf);
> ....
> LEN_LOAD (addr, _22);
> tmp = _22 * 4; (Assume it is an INT32 calculation; this converts _22 from a count of INT32 elements into a byte count for address calculation)
> addr = addr + tmp;
> ....
> This makes the result of WHILE_LEN be used not only for remain = remain - len, but also in the address calculation: tmp = len * (element bytesize); addr = addr + tmp;
> Then this flow is the correct flow for all RVV CPU.
> This flow is totally same as example in RVV ISA define:
> https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
> I think I need to change this patch as described above to make it globally suitable for all RVV CPUs in the world. But I am not sure whether the GCC community will accept this flow, so I am proposing it now before I do it. 
> I didn't realize that since my downstream RVV hardware and the open-source simulator generate "vl" = VLMAX. (Sorry about that)

Sounds OK to me FWIW.  Supporting non-(poly-)constant VFs seems useful.

I had a local patch (not submitted) that needed non-constant VFs for
a different reason: to clamp the VF at runtime to avoid hazards.
E.g. if we know that vectorisation is safe up to VF=N but not
beyond that, the patch would limit the VF to N on targets with
larger vectors, rather than punt to the scalar fallback loop.

There's still a key difference between that use case and yours,
since in "my" case the VF would still be invariant (for SVE),
whereas with yours the VF would vary between iterations.
But the general principle is the same.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-07  1:47 [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization juzhe.zhong
  2023-04-07  3:23 ` Li, Pan2
  2023-04-11 12:12 ` juzhe.zhong
@ 2023-04-19 21:53 ` 钟居哲
  2023-04-20  8:52   ` Richard Sandiford
  2 siblings, 1 reply; 41+ messages in thread
From: 钟居哲 @ 2023-04-19 21:53 UTC (permalink / raw)
  To: 钟居哲, gcc-patches
  Cc: richard.sandiford, rguenther, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 19881 bytes --]

Hi, Richards.
Since GCC 14 is open and this patch has been bootstrapped and tested on x86,
is this patch supporting variable IV OK for trunk?

Thanks


juzhe.zhong@rivai.ai
 
From: juzhe.zhong
Date: 2023-04-07 09:47
To: gcc-patches
CC: richard.sandiford; rguenther; jeffreyalaw; Juzhe-Zhong
Subject: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
 
This patch is to add WHILE_LEN pattern.
It's inspired by RVV ISA simple "vvaddint32.s" example:
https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
 
More details are in "vect_set_loop_controls_by_while_len" implementation
and comments.
 
Consider such following case:
#define N 16
int src[N];
int dest[N];
 
void
foo (int n)
{
  for (int i = 0; i < n; i++)
    dest[i] = src[i];
}
 
-march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:
 
foo:        
        ble     a0,zero,.L1
        lui     a4,%hi(.LANCHOR0)
        addi    a4,a4,%lo(.LANCHOR0)
        addi    a3,a4,64
        csrr    a2,vlenb
.L3:
        vsetvli a5,a0,e32,m1,ta,ma
        vle32.v v1,0(a4)
        sub     a0,a0,a5
        vse32.v v1,0(a3)
        add     a4,a4,a2
        add     a3,a3,a2
        bne     a0,zero,.L3
.L1:
        ret
 
gcc/ChangeLog:
 
        * doc/md.texi: Add WHILE_LEN support.
        * internal-fn.cc (while_len_direct): Ditto.
        (expand_while_len_optab_fn): Ditto.
        (direct_while_len_optab_supported_p): Ditto.
        * internal-fn.def (WHILE_LEN): Ditto.
        * optabs.def (OPTAB_D): Ditto.
        * tree-ssa-loop-manip.cc (create_iv): Ditto.
        * tree-ssa-loop-manip.h (create_iv): Ditto.
        * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
        (vect_set_loop_condition_partial_vectors): Ditto.
        * tree-vect-loop.cc (vect_get_loop_len): Ditto.
        * tree-vect-stmts.cc (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (vect_get_loop_len): Ditto.
 
---
gcc/doc/md.texi             |  14 +++
gcc/internal-fn.cc          |  29 ++++++
gcc/internal-fn.def         |   1 +
gcc/optabs.def              |   1 +
gcc/tree-ssa-loop-manip.cc  |   4 +-
gcc/tree-ssa-loop-manip.h   |   2 +-
gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
gcc/tree-vect-loop.cc       |  35 +++++--
gcc/tree-vect-stmts.cc      |   9 +-
gcc/tree-vectorizer.h       |   4 +-
10 files changed, 264 insertions(+), 21 deletions(-)
 
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 8e3113599fd..72178ab014c 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
   operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
@end smallexample
+@cindex @code{while_len@var{m}@var{n}} instruction pattern
+@item @code{while_len@var{m}@var{n}}
+Set operand 0 to the number of active elements in the vector that will be updated.
+Operand 1 is the total number of elements that need to be updated.
+Operand 2 is the vectorization factor.
+The operation is equivalent to:
+
+@smallexample
+operand0 = MIN (operand1, operand2);
+operand2 can be const_poly_int or poly_int related to vector mode size.
+Some targets, such as RISC-V, have a standalone instruction to get MIN (n, MODE SIZE),
+so that we can reduce the use of general purpose registers.
+@end smallexample
+
@cindex @code{check_raw_ptrs@var{m}} instruction pattern
@item @samp{check_raw_ptrs@var{m}}
Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 6e81dc05e0e..5f44def90d3 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -127,6 +127,7 @@ init_internal_fns ()
#define cond_binary_direct { 1, 1, true }
#define cond_ternary_direct { 1, 1, true }
#define while_direct { 0, 2, false }
+#define while_len_direct { 0, 0, false }
#define fold_extract_direct { 2, 2, false }
#define fold_left_direct { 1, 1, false }
#define mask_fold_left_direct { 1, 1, false }
@@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
     emit_move_insn (lhs_rtx, ops[0].value);
}
+/* Expand WHILE_LEN call STMT using optab OPTAB.  */
+static void
+expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
+{
+  expand_operand ops[3];
+  tree rhs_type[2];
+
+  tree lhs = gimple_call_lhs (stmt);
+  tree lhs_type = TREE_TYPE (lhs);
+  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
+
+  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
+    {
+      tree rhs = gimple_call_arg (stmt, i);
+      rhs_type[i] = TREE_TYPE (rhs);
+      rtx rhs_rtx = expand_normal (rhs);
+      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
+    }
+
+  insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
+
+  expand_insn (icode, 3, ops);
+  if (!rtx_equal_p (lhs_rtx, ops[0].value))
+    emit_move_insn (lhs_rtx, ops[0].value);
+}
+
/* Expand a call to a convert-like optab using the operands in STMT.
    FN has a single output operand and NARGS input operands.  */
@@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
#define direct_scatter_store_optab_supported_p convert_optab_supported_p
#define direct_len_store_optab_supported_p direct_optab_supported_p
#define direct_while_optab_supported_p convert_optab_supported_p
+#define direct_while_len_optab_supported_p direct_optab_supported_p
#define direct_fold_extract_optab_supported_p direct_optab_supported_p
#define direct_fold_left_optab_supported_p direct_optab_supported_p
#define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 7fe742c2ae7..3a933abff5d 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
+DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
       check_raw_ptrs, check_ptrs)
DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 695f5911b30..f5938bd2c24 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
OPTAB_D (len_load_optab, "len_load_$a")
OPTAB_D (len_store_optab, "len_store_$a")
+OPTAB_D (while_len_optab, "while_len$a")
diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
index 09acc1c94cc..cdbf280e249 100644
--- a/gcc/tree-ssa-loop-manip.cc
+++ b/gcc/tree-ssa-loop-manip.cc
@@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
void
create_iv (tree base, tree step, tree var, class loop *loop,
   gimple_stmt_iterator *incr_pos, bool after,
-    tree *var_before, tree *var_after)
+    tree *var_before, tree *var_after, enum tree_code code)
{
   gassign *stmt;
   gphi *phi;
   tree initial, step1;
   gimple_seq stmts;
   tree vb, va;
-  enum tree_code incr_op = PLUS_EXPR;
+  enum tree_code incr_op = code;
   edge pe = loop_preheader_edge (loop);
   if (var != NULL_TREE)
diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
index d49273a3987..da755320a3a 100644
--- a/gcc/tree-ssa-loop-manip.h
+++ b/gcc/tree-ssa-loop-manip.h
@@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
typedef void (*transform_callback)(class loop *, void *);
extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
-        bool, tree *, tree *);
+        bool, tree *, tree *, enum tree_code = PLUS_EXPR);
extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
extern void verify_loop_closed_ssa (bool, class loop * = NULL);
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index f60fa50e8f4..f3cd6c51d2e 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
   return next_ctrl;
}
+/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
+   for all the rgroup controls in RGC and return a control that is nonzero
+   when the loop needs to iterate.  Add any new preheader statements to
+   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
+
+   RGC belongs to loop LOOP.  The loop originally iterated NITERS
+   times and has been vectorized according to LOOP_VINFO.
+
+   vect_set_loop_controls_directly iterates a 0-based IV up to
+   TEST_LIMIT - bias.
+
+   vect_set_loop_controls_by_while_len instead starts the IV at
+   TEST_LIMIT - bias and repeatedly subtracts from it the length
+   calculated by the IFN_WHILE_LEN pattern.
+
+   Note: the cost of the code generated by this function is modeled
+   by vect_estimate_min_profitable_iters, so changes here may need
+   corresponding changes there.
+
+   1. Single rgroup, the Gimple IR should be:
+
+ <bb 3>
+ _19 = (unsigned long) n_5(D);
+ ...
+
+ <bb 4>:
+ ...
+ # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
+ ...
+ _22 = .WHILE_LEN (ivtmp_20, vf);
+ ...
+ vector statement (use _22);
+ ...
+ ivtmp_21 = ivtmp_20 - _22;
+ ...
+ if (ivtmp_21 != 0)
+   goto <bb 4>; [75.00%]
+ else
+   goto <bb 5>; [25.00%]
+
+ <bb 5>
+ return;
+
+   Note: IFN_WHILE_LEN guarantees that "ivtmp_21 = ivtmp_20 - _22" never
+   underflows below 0.
+
+   2. Multiple rgroup, the Gimple IR should be:
+
+ <bb 3>
+ _70 = (unsigned long) bnd.7_52;
+ _71 = _70 * 2;
+ _72 = MAX_EXPR <_71, 4>;
+ _73 = _72 + 18446744073709551612;
+ ...
+
+ <bb 4>:
+ ...
+ # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
+ # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
+ _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
+ _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
+ ...
+ vector statement (use _79);
+ ...
+ vector statement (use _76);
+ ...
+ _65 = _79 / 2;
+ vector statement (use _65);
+ ...
+ _68 = _76 / 2;
+ vector statement (use _68);
+ ...
+ ivtmp_78 = ivtmp_77 - _79;
+ ivtmp_75 = ivtmp_74 - _76;
+ ...
+ if (ivtmp_78 != 0)
+   goto <bb 4>; [75.00%]
+ else
+   goto <bb 5>; [25.00%]
+
+ <bb 5>
+ return;
+
+*/
+
+static tree
+vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
+      gimple_seq *preheader_seq,
+      gimple_seq *header_seq,
+      rgroup_controls *rgc, tree niters)
+{
+  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+  /* We are not allowing masked approach in WHILE_LEN.  */
+  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+
+  tree ctrl_type = rgc->type;
+  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
+  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
+  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+  /* Calculate the maximum number of item values that the rgroup
+     handles in total, the number that it handles for each iteration
+     of the vector loop.  */
+  tree nitems_total = niters;
+  if (nitems_per_iter != 1)
+    {
+      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
+ these multiplications don't overflow.  */
+      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
+      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+    nitems_total, compare_factor);
+    }
+
+  /* Convert the comparison value to the IV type (either a no-op or
+     a promotion).  */
+  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
+
+  /* Create an induction variable that counts the number of items
+     processed.  */
+  tree index_before_incr, index_after_incr;
+  gimple_stmt_iterator incr_gsi;
+  bool insert_after;
+  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+
+  /* Test the decremented IV, which will never underflow 0 since we have
+     IFN_WHILE_LEN to guarantee that.  */
+  tree test_limit = nitems_total;
+
+  /* Provide a definition of each control in the group.  */
+  tree ctrl;
+  unsigned int i;
+  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
+    {
+      /* Previous controls will cover BIAS items.  This control covers the
+ next batch.  */
+      poly_uint64 bias = nitems_per_ctrl * i;
+      tree bias_tree = build_int_cst (iv_type, bias);
+
+      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
+ BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
+ control and adjust the bound down by BIAS.  */
+      tree this_test_limit = test_limit;
+      if (i != 0)
+ {
+   this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
+   this_test_limit, bias_tree);
+   this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
+   this_test_limit, bias_tree);
+ }
+
+      /* Create decrement IV.  */
+      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
+ insert_after, &index_before_incr, &index_after_incr,
+ MINUS_EXPR);
+
+      poly_uint64 final_vf = vf * nitems_per_iter;
+      tree vf_step = build_int_cst (iv_type, final_vf);
+      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
+    index_before_incr, vf_step);
+      gassign *assign = gimple_build_assign (ctrl, res_len);
+      gimple_seq_add_stmt (header_seq, assign);
+    }
+
+  return index_after_incr;
+}
+
/* Set up the iteration condition and rgroup controls for LOOP, given
    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
@@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
   unsigned int compare_precision = TYPE_PRECISION (compare_type);
   tree orig_niters = niters;
@@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
/* Set up all controls for this group.  */
- test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
-      &preheader_seq,
-      &header_seq,
-      loop_cond_gsi, rgc,
-      niters, niters_skip,
-      might_wrap_p);
+ if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
+     OPTIMIZE_FOR_SPEED))
+   test_ctrl
+     = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
+    &preheader_seq, &header_seq,
+    rgc, niters);
+ else
+   test_ctrl
+     = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
+        &header_seq, loop_cond_gsi, rgc,
+        niters, niters_skip,
+        might_wrap_p);
       }
   /* Emit all accumulated statements.  */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 1ba9f18d73e..5bffd9a6322 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
tree
-vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
-    unsigned int nvectors, unsigned int index)
+vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
+    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
+    unsigned int index)
{
   rgroup_controls *rgl = &(*lens)[nvectors - 1];
-  bool use_bias_adjusted_len =
-    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+  bool use_bias_adjusted_len
+    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
   /* Populate the rgroup's len array, if this is the first time we've
      used it.  */
@@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
  if (use_bias_adjusted_len)
    {
      gcc_assert (i == 0);
-       tree adjusted_len =
- make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
+       tree adjusted_len
+ = make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
      rgl->bias_adjusted_ctrl = adjusted_len;
    }
@@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
   if (use_bias_adjusted_len)
     return rgl->bias_adjusted_ctrl;
+  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
+    OPTIMIZE_FOR_SPEED))
+    {
+      tree loop_len = rgl->controls[index];
+      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
+      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
+      if (maybe_ne (nunits1, nunits2))
+ {
+   /* A loop len for data type X can be reused for data type Y
+      if X has N times more elements than Y and if Y's elements
+      are N times bigger than X's.  */
+   gcc_assert (multiple_p (nunits1, nunits2));
+   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
+   gimple_seq seq = NULL;
+   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
+    build_int_cst (iv_type, factor));
+   if (seq)
+     gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+ }
+      return loop_len;
+    }
   else
     return rgl->controls[index];
}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index efa2d0daa52..708c8a1d806 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
      else if (loop_lens)
{
  tree final_len
-     = vect_get_loop_len (loop_vinfo, loop_lens,
- vec_num * ncopies, vec_num * j + i);
+     = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
+ vec_num * ncopies, vectype,
+ vec_num * j + i);
  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
  machine_mode vmode = TYPE_MODE (vectype);
  opt_machine_mode new_ovmode
@@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
    else if (loop_lens && memory_access_type != VMAT_INVARIANT)
      {
tree final_len
-   = vect_get_loop_len (loop_vinfo, loop_lens,
-        vec_num * ncopies,
+   = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
+        vec_num * ncopies, vectype,
       vec_num * j + i);
tree ptr = build_int_cst (ref_type,
  align * BITS_PER_UNIT);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 9cf2fb23fe3..e5cf38caf4b 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
unsigned int, tree, unsigned int);
extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
  tree, unsigned int);
-extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
-        unsigned int);
+extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
+        vec_loop_lens *, unsigned int, tree, unsigned int);
extern gimple_seq vect_gen_len (tree, tree, tree, tree);
extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
-- 
2.36.3
 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-19 21:53 ` 钟居哲
@ 2023-04-20  8:52   ` Richard Sandiford
  2023-04-20  8:57     ` juzhe.zhong
  0 siblings, 1 reply; 41+ messages in thread
From: Richard Sandiford @ 2023-04-20  8:52 UTC (permalink / raw)
  To: 钟居哲; +Cc: gcc-patches, rguenther, Jeff Law

钟居哲 <juzhe.zhong@rivai.ai> writes:
> Hi, Richards.
> Since GCC 14 is open and this patch has been boostraped && tested on X86.
> Is this patch supporting variable IV OK for the trunk ?

Doesn't the patch need updating based on the previous discussion?
I thought the outcome was that WHILE_LEN isn't a simple MIN operation
(contrary to the documentation in the patch) and that pointer IVs
would also need to be updated by a variable amount, given that even
non-final iterations might process fewer than VF elements.

Thanks,
Richard

> juzhe.zhong@rivai.ai
>  
> From: juzhe.zhong
> Date: 2023-04-07 09:47
> To: gcc-patches
> CC: richard.sandiford; rguenther; jeffreyalaw; Juzhe-Zhong
> Subject: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
>  
> This patch is to add WHILE_LEN pattern.
> It's inspired by RVV ISA simple "vvaddint32.s" example:
> https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
>  
> More details are in "vect_set_loop_controls_by_while_len" implementation
> and comments.
>  
> Consider such following case:
> #define N 16
> int src[N];
> int dest[N];
>  
> void
> foo (int n)
> {
>   for (int i = 0; i < n; i++)
>     dest[i] = src[i];
> }
>  
> -march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:
>  
> foo:        
>         ble     a0,zero,.L1
>         lui     a4,%hi(.LANCHOR0)
>         addi    a4,a4,%lo(.LANCHOR0)
>         addi    a3,a4,64
>         csrr    a2,vlenb
> .L3:
>         vsetvli a5,a0,e32,m1,ta,ma
>         vle32.v v1,0(a4)
>         sub     a0,a0,a5
>         vse32.v v1,0(a3)
>         add     a4,a4,a2
>         add     a3,a3,a2
>         bne     a0,zero,.L3
> .L1:
>         ret
>  
> gcc/ChangeLog:
>  
>         * doc/md.texi: Add WHILE_LEN support.
>         * internal-fn.cc (while_len_direct): Ditto.
>         (expand_while_len_optab_fn): Ditto.
>         (direct_while_len_optab_supported_p): Ditto.
>         * internal-fn.def (WHILE_LEN): Ditto.
>         * optabs.def (OPTAB_D): Ditto.
>         * tree-ssa-loop-manip.cc (create_iv): Ditto.
>         * tree-ssa-loop-manip.h (create_iv): Ditto.
>         * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
>         (vect_set_loop_condition_partial_vectors): Ditto.
>         * tree-vect-loop.cc (vect_get_loop_len): Ditto.
>         * tree-vect-stmts.cc (vectorizable_store): Ditto.
>         (vectorizable_load): Ditto.
>         * tree-vectorizer.h (vect_get_loop_len): Ditto.
>  
> ---
> gcc/doc/md.texi             |  14 +++
> gcc/internal-fn.cc          |  29 ++++++
> gcc/internal-fn.def         |   1 +
> gcc/optabs.def              |   1 +
> gcc/tree-ssa-loop-manip.cc  |   4 +-
> gcc/tree-ssa-loop-manip.h   |   2 +-
> gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
> gcc/tree-vect-loop.cc       |  35 +++++--
> gcc/tree-vect-stmts.cc      |   9 +-
> gcc/tree-vectorizer.h       |   4 +-
> 10 files changed, 264 insertions(+), 21 deletions(-)
>  
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 8e3113599fd..72178ab014c 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
>    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
> @end smallexample
> +@cindex @code{while_len@var{m}@var{n}} instruction pattern
> +@item @code{while_len@var{m}@var{n}}
> +Set operand 0 to the number of active elements in vector will be updated value.
> +operand 1 is the total elements need to be updated value.
> +operand 2 is the vectorization factor.
> +The operation is equivalent to:
> +
> +@smallexample
> +operand0 = MIN (operand1, operand2);
> +operand2 can be const_poly_int or poly_int related to vector mode size.
> +Some target like RISC-V has a standalone instruction to get MIN (n, MODE SIZE) so
> +that we can reduce a use of general purpose register.
> +@end smallexample
> +
> @cindex @code{check_raw_ptrs@var{m}} instruction pattern
> @item @samp{check_raw_ptrs@var{m}}
> Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 6e81dc05e0e..5f44def90d3 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -127,6 +127,7 @@ init_internal_fns ()
> #define cond_binary_direct { 1, 1, true }
> #define cond_ternary_direct { 1, 1, true }
> #define while_direct { 0, 2, false }
> +#define while_len_direct { 0, 0, false }
> #define fold_extract_direct { 2, 2, false }
> #define fold_left_direct { 1, 1, false }
> #define mask_fold_left_direct { 1, 1, false }
> @@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>      emit_move_insn (lhs_rtx, ops[0].value);
> }
> +/* Expand WHILE_LEN call STMT using optab OPTAB.  */
> +static void
> +expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> +{
> +  expand_operand ops[3];
> +  tree rhs_type[2];
> +
> +  tree lhs = gimple_call_lhs (stmt);
> +  tree lhs_type = TREE_TYPE (lhs);
> +  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> +  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
> +
> +  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
> +    {
> +      tree rhs = gimple_call_arg (stmt, i);
> +      rhs_type[i] = TREE_TYPE (rhs);
> +      rtx rhs_rtx = expand_normal (rhs);
> +      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
> +    }
> +
> +  insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
> +
> +  expand_insn (icode, 3, ops);
> +  if (!rtx_equal_p (lhs_rtx, ops[0].value))
> +    emit_move_insn (lhs_rtx, ops[0].value);
> +}
> +
> /* Expand a call to a convert-like optab using the operands in STMT.
>     FN has a single output operand and NARGS input operands.  */
> @@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
> #define direct_scatter_store_optab_supported_p convert_optab_supported_p
> #define direct_len_store_optab_supported_p direct_optab_supported_p
> #define direct_while_optab_supported_p convert_optab_supported_p
> +#define direct_while_len_optab_supported_p direct_optab_supported_p
> #define direct_fold_extract_optab_supported_p direct_optab_supported_p
> #define direct_fold_left_optab_supported_p direct_optab_supported_p
> #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 7fe742c2ae7..3a933abff5d 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
> DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
> DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> +DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
> DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
>        check_raw_ptrs, check_ptrs)
> DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 695f5911b30..f5938bd2c24 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
> OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
> OPTAB_D (len_load_optab, "len_load_$a")
> OPTAB_D (len_store_optab, "len_store_$a")
> +OPTAB_D (while_len_optab, "while_len$a")
> diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> index 09acc1c94cc..cdbf280e249 100644
> --- a/gcc/tree-ssa-loop-manip.cc
> +++ b/gcc/tree-ssa-loop-manip.cc
> @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
> void
> create_iv (tree base, tree step, tree var, class loop *loop,
>    gimple_stmt_iterator *incr_pos, bool after,
> -    tree *var_before, tree *var_after)
> +    tree *var_before, tree *var_after, enum tree_code code)
> {
>    gassign *stmt;
>    gphi *phi;
>    tree initial, step1;
>    gimple_seq stmts;
>    tree vb, va;
> -  enum tree_code incr_op = PLUS_EXPR;
> +  enum tree_code incr_op = code;
>    edge pe = loop_preheader_edge (loop);
>    if (var != NULL_TREE)
> diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> index d49273a3987..da755320a3a 100644
> --- a/gcc/tree-ssa-loop-manip.h
> +++ b/gcc/tree-ssa-loop-manip.h
> @@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
> typedef void (*transform_callback)(class loop *, void *);
> extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
> -        bool, tree *, tree *);
> +        bool, tree *, tree *, enum tree_code = PLUS_EXPR);
> extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
> extern void verify_loop_closed_ssa (bool, class loop * = NULL);
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index f60fa50e8f4..f3cd6c51d2e 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    return next_ctrl;
> }
> +/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
> +   for all the rgroup controls in RGC and return a control that is nonzero
> +   when the loop needs to iterate.  Add any new preheader statements to
> +   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
> +
> +   RGC belongs to loop LOOP.  The loop originally iterated NITERS
> +   times and has been vectorized according to LOOP_VINFO.
> +
> +   Unlike vect_set_loop_controls_directly which is iterating from 0-based IV
> +   to TEST_LIMIT - bias.
> +
> +   In vect_set_loop_controls_by_while_len, we are iterating from start at
> +   IV = TEST_LIMIT - bias and keep subtract IV by the length calculated by
> +   IFN_WHILE_LEN pattern.
> +
> +   Note: the cost of the code generated by this function is modeled
> +   by vect_estimate_min_profitable_iters, so changes here may need
> +   corresponding changes there.
> +
> +   1. Single rgroup, the Gimple IR should be:
> +
> + <bb 3>
> + _19 = (unsigned long) n_5(D);
> + ...
> +
> + <bb 4>:
> + ...
> + # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
> + ...
> + _22 = .WHILE_LEN (ivtmp_20, vf);
> + ...
> + vector statement (use _22);
> + ...
> + ivtmp_21 = ivtmp_20 - _22;
> + ...
> + if (ivtmp_21 != 0)
> +   goto <bb 4>; [75.00%]
> + else
> +   goto <bb 5>; [25.00%]
> +
> + <bb 5>
> + return;
> +
> +   Note: IFN_WHILE_LEN will guarantee "ivtmp_21 = ivtmp_20 - _22" never
> +   underflow 0.
> +
> +   2. Multiple rgroup, the Gimple IR should be:
> +
> + <bb 3>
> + _70 = (unsigned long) bnd.7_52;
> + _71 = _70 * 2;
> + _72 = MAX_EXPR <_71, 4>;
> + _73 = _72 + 18446744073709551612;
> + ...
> +
> + <bb 4>:
> + ...
> + # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
> + # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
> + _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
> + _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
> + ...
> + vector statement (use _79);
> + ...
> + vector statement (use _76);
> + ...
> + _65 = _79 / 2;
> + vector statement (use _65);
> + ...
> + _68 = _76 / 2;
> + vector statement (use _68);
> + ...
> + ivtmp_78 = ivtmp_77 - _79;
> + ivtmp_75 = ivtmp_74 - _76;
> + ...
> + if (ivtmp_78 != 0)
> +   goto <bb 4>; [75.00%]
> + else
> +   goto <bb 5>; [25.00%]
> +
> + <bb 5>
> + return;
> +
> +*/
> +
> +static tree
> +vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
> +      gimple_seq *preheader_seq,
> +      gimple_seq *header_seq,
> +      rgroup_controls *rgc, tree niters)
> +{
> +  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +  /* We are not allowing masked approach in WHILE_LEN.  */
> +  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +
> +  tree ctrl_type = rgc->type;
> +  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
> +  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
> +  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> +
> +  /* Calculate the maximum number of item values that the rgroup
> +     handles in total, the number that it handles for each iteration
> +     of the vector loop.  */
> +  tree nitems_total = niters;
> +  if (nitems_per_iter != 1)
> +    {
> +      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
> + these multiplications don't overflow.  */
> +      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
> +      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
> +    nitems_total, compare_factor);
> +    }
> +
> +  /* Convert the comparison value to the IV type (either a no-op or
> +     a promotion).  */
> +  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> +
> +  /* Create an induction variable that counts the number of items
> +     processed.  */
> +  tree index_before_incr, index_after_incr;
> +  gimple_stmt_iterator incr_gsi;
> +  bool insert_after;
> +  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +
> +  /* Test the decremented IV, which will never underflow 0 since we have
> +     IFN_WHILE_LEN to gurantee that.  */
> +  tree test_limit = nitems_total;
> +
> +  /* Provide a definition of each control in the group.  */
> +  tree ctrl;
> +  unsigned int i;
> +  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
> +    {
> +      /* Previous controls will cover BIAS items.  This control covers the
> + next batch.  */
> +      poly_uint64 bias = nitems_per_ctrl * i;
> +      tree bias_tree = build_int_cst (iv_type, bias);
> +
> +      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
> + BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
> + control and adjust the bound down by BIAS.  */
> +      tree this_test_limit = test_limit;
> +      if (i != 0)
> + {
> +   this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
> +   this_test_limit, bias_tree);
> +   this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
> +   this_test_limit, bias_tree);
> + }
> +
> +      /* Create decrement IV.  */
> +      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
> + insert_after, &index_before_incr, &index_after_incr,
> + MINUS_EXPR);
> +
> +      poly_uint64 final_vf = vf * nitems_per_iter;
> +      tree vf_step = build_int_cst (iv_type, final_vf);
> +      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
> +    index_before_incr, vf_step);
> +      gassign *assign = gimple_build_assign (ctrl, res_len);
> +      gimple_seq_add_stmt (header_seq, assign);
> +    }
> +
> +  return index_after_incr;
> +}
> +
> /* Set up the iteration condition and rgroup controls for LOOP, given
>     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
>     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
> @@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>    bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>    tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>    unsigned int compare_precision = TYPE_PRECISION (compare_type);
>    tree orig_niters = niters;
> @@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> /* Set up all controls for this group.  */
> - test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
> -      &preheader_seq,
> -      &header_seq,
> -      loop_cond_gsi, rgc,
> -      niters, niters_skip,
> -      might_wrap_p);
> + if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> +     OPTIMIZE_FOR_SPEED))
> +   test_ctrl
> +     = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
> +    &preheader_seq, &header_seq,
> +    rgc, niters);
> + else
> +   test_ctrl
> +     = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
> +        &header_seq, loop_cond_gsi, rgc,
> +        niters, niters_skip,
> +        might_wrap_p);
>        }
>    /* Emit all accumulated statements.  */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 1ba9f18d73e..5bffd9a6322 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>     rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
> tree
> -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> -    unsigned int nvectors, unsigned int index)
> +vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
> +    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
> +    unsigned int index)
> {
>    rgroup_controls *rgl = &(*lens)[nvectors - 1];
> -  bool use_bias_adjusted_len =
> -    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> +  bool use_bias_adjusted_len
> +    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>    /* Populate the rgroup's len array, if this is the first time we've
>       used it.  */
> @@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>   if (use_bias_adjusted_len)
>     {
>       gcc_assert (i == 0);
> -       tree adjusted_len =
> - make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> +       tree adjusted_len
> + = make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
>       SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
>       rgl->bias_adjusted_ctrl = adjusted_len;
>     }
> @@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>    if (use_bias_adjusted_len)
>      return rgl->bias_adjusted_ctrl;
> +  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> +    OPTIMIZE_FOR_SPEED))
> +    {
> +      tree loop_len = rgl->controls[index];
> +      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
> +      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
> +      if (maybe_ne (nunits1, nunits2))
> + {
> +   /* A loop len for data type X can be reused for data type Y
> +      if X has N times more elements than Y and if Y's elements
> +      are N times bigger than X's.  */
> +   gcc_assert (multiple_p (nunits1, nunits2));
> +   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
> +   gimple_seq seq = NULL;
> +   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
> +    build_int_cst (iv_type, factor));
> +   if (seq)
> +     gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
> + }
> +      return loop_len;
> +    }
>    else
>      return rgl->controls[index];
> }
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index efa2d0daa52..708c8a1d806 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
>       else if (loop_lens)
> {
>   tree final_len
> -     = vect_get_loop_len (loop_vinfo, loop_lens,
> - vec_num * ncopies, vec_num * j + i);
> +     = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> + vec_num * ncopies, vectype,
> + vec_num * j + i);
>   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
>   machine_mode vmode = TYPE_MODE (vectype);
>   opt_machine_mode new_ovmode
> @@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
>     else if (loop_lens && memory_access_type != VMAT_INVARIANT)
>       {
> tree final_len
> -   = vect_get_loop_len (loop_vinfo, loop_lens,
> -        vec_num * ncopies,
> +   = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> +        vec_num * ncopies, vectype,
>        vec_num * j + i);
> tree ptr = build_int_cst (ref_type,
>   align * BITS_PER_UNIT);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 9cf2fb23fe3..e5cf38caf4b 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
> unsigned int, tree, unsigned int);
> extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
>   tree, unsigned int);
> -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> -        unsigned int);
> +extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
> +        vec_loop_lens *, unsigned int, tree, unsigned int);
> extern gimple_seq vect_gen_len (tree, tree, tree, tree);
> extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-20  8:52   ` Richard Sandiford
@ 2023-04-20  8:57     ` juzhe.zhong
  2023-04-20  9:11       ` Richard Sandiford
  0 siblings, 1 reply; 41+ messages in thread
From: juzhe.zhong @ 2023-04-20  8:57 UTC (permalink / raw)
  To: richard.sandiford; +Cc: gcc-patches, rguenther, jeffreyalaw

[-- Attachment #1: Type: text/plain, Size: 21768 bytes --]

Thanks, Richard, for reminding me. I originally thought the community would not allow me to support a variable-amount IV and wanted me to do this in the RISC-V backend.
It seems that I can do that in the middle-end. Thank you so much. I will update the patch. Really appreciate it!



juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-04-20 16:52
To: 钟居哲
CC: gcc-patches; rguenther; Jeff Law
Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
钟居哲 <juzhe.zhong@rivai.ai> writes:
> Hi, Richards.
> Since GCC 14 is open and this patch has been bootstrapped && tested on X86.
> Is this patch supporting variable IV OK for the trunk?
 
Doesn't the patch need updating based on the previous discussion?
I thought the outcome was that WHILE_LEN isn't a simple MIN operation
(contrary to the documentation in the patch) and that pointer IVs
would also need to be updated by a variable amount, given that even
non-final iterations might process fewer than VF elements.
 
Thanks,
Richard
 
> juzhe.zhong@rivai.ai
>  
> From: juzhe.zhong
> Date: 2023-04-07 09:47
> To: gcc-patches
> CC: richard.sandiford; rguenther; jeffreyalaw; Juzhe-Zhong
> Subject: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
>  
> This patch is to add WHILE_LEN pattern.
> It's inspired by RVV ISA simple "vvaddint32.s" example:
> https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
>  
> More details are in "vect_set_loop_controls_by_while_len" implementation
> and comments.
>  
> Consider such following case:
> #define N 16
> int src[N];
> int dest[N];
>  
> void
> foo (int n)
> {
>   for (int i = 0; i < n; i++)
>     dest[i] = src[i];
> }
>  
> -march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:
>  
> foo:        
>         ble     a0,zero,.L1
>         lui     a4,%hi(.LANCHOR0)
>         addi    a4,a4,%lo(.LANCHOR0)
>         addi    a3,a4,64
>         csrr    a2,vlenb
> .L3:
>         vsetvli a5,a0,e32,m1,ta,ma
>         vle32.v v1,0(a4)
>         sub     a0,a0,a5
>         vse32.v v1,0(a3)
>         add     a4,a4,a2
>         add     a3,a3,a2
>         bne     a0,zero,.L3
> .L1:
>         ret
>  
> gcc/ChangeLog:
>  
>         * doc/md.texi: Add WHILE_LEN support.
>         * internal-fn.cc (while_len_direct): Ditto.
>         (expand_while_len_optab_fn): Ditto.
>         (direct_while_len_optab_supported_p): Ditto.
>         * internal-fn.def (WHILE_LEN): Ditto.
>         * optabs.def (OPTAB_D): Ditto.
>         * tree-ssa-loop-manip.cc (create_iv): Ditto.
>         * tree-ssa-loop-manip.h (create_iv): Ditto.
>         * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
>         (vect_set_loop_condition_partial_vectors): Ditto.
>         * tree-vect-loop.cc (vect_get_loop_len): Ditto.
>         * tree-vect-stmts.cc (vectorizable_store): Ditto.
>         (vectorizable_load): Ditto.
>         * tree-vectorizer.h (vect_get_loop_len): Ditto.
>  
> ---
> gcc/doc/md.texi             |  14 +++
> gcc/internal-fn.cc          |  29 ++++++
> gcc/internal-fn.def         |   1 +
> gcc/optabs.def              |   1 +
> gcc/tree-ssa-loop-manip.cc  |   4 +-
> gcc/tree-ssa-loop-manip.h   |   2 +-
> gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
> gcc/tree-vect-loop.cc       |  35 +++++--
> gcc/tree-vect-stmts.cc      |   9 +-
> gcc/tree-vectorizer.h       |   4 +-
> 10 files changed, 264 insertions(+), 21 deletions(-)
>  
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 8e3113599fd..72178ab014c 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
>    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
> @end smallexample
> +@cindex @code{while_len@var{m}@var{n}} instruction pattern
> +@item @code{while_len@var{m}@var{n}}
> +Set operand 0 to the number of active elements to process in this iteration.
> +Operand 1 is the total number of elements that remain to be processed.
> +Operand 2 is the vectorization factor.
> +The operation is equivalent to:
> +
> +@smallexample
> +operand0 = MIN (operand1, operand2);
> +operand2 can be const_poly_int or poly_int related to vector mode size.
> +Some target like RISC-V has a standalone instruction to get MIN (n, MODE SIZE) so
> +that we can reduce a use of general purpose register.
> +@end smallexample
> +
> @cindex @code{check_raw_ptrs@var{m}} instruction pattern
> @item @samp{check_raw_ptrs@var{m}}
> Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 6e81dc05e0e..5f44def90d3 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -127,6 +127,7 @@ init_internal_fns ()
> #define cond_binary_direct { 1, 1, true }
> #define cond_ternary_direct { 1, 1, true }
> #define while_direct { 0, 2, false }
> +#define while_len_direct { 0, 0, false }
> #define fold_extract_direct { 2, 2, false }
> #define fold_left_direct { 1, 1, false }
> #define mask_fold_left_direct { 1, 1, false }
> @@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>      emit_move_insn (lhs_rtx, ops[0].value);
> }
> +/* Expand WHILE_LEN call STMT using optab OPTAB.  */
> +static void
> +expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> +{
> +  expand_operand ops[3];
> +  tree rhs_type[2];
> +
> +  tree lhs = gimple_call_lhs (stmt);
> +  tree lhs_type = TREE_TYPE (lhs);
> +  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> +  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
> +
> +  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
> +    {
> +      tree rhs = gimple_call_arg (stmt, i);
> +      rhs_type[i] = TREE_TYPE (rhs);
> +      rtx rhs_rtx = expand_normal (rhs);
> +      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
> +    }
> +
> +  insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
> +
> +  expand_insn (icode, 3, ops);
> +  if (!rtx_equal_p (lhs_rtx, ops[0].value))
> +    emit_move_insn (lhs_rtx, ops[0].value);
> +}
> +
> /* Expand a call to a convert-like optab using the operands in STMT.
>     FN has a single output operand and NARGS input operands.  */
> @@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
> #define direct_scatter_store_optab_supported_p convert_optab_supported_p
> #define direct_len_store_optab_supported_p direct_optab_supported_p
> #define direct_while_optab_supported_p convert_optab_supported_p
> +#define direct_while_len_optab_supported_p direct_optab_supported_p
> #define direct_fold_extract_optab_supported_p direct_optab_supported_p
> #define direct_fold_left_optab_supported_p direct_optab_supported_p
> #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 7fe742c2ae7..3a933abff5d 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
> DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
> DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> +DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
> DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
>        check_raw_ptrs, check_ptrs)
> DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 695f5911b30..f5938bd2c24 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
> OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
> OPTAB_D (len_load_optab, "len_load_$a")
> OPTAB_D (len_store_optab, "len_store_$a")
> +OPTAB_D (while_len_optab, "while_len$a")
> diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> index 09acc1c94cc..cdbf280e249 100644
> --- a/gcc/tree-ssa-loop-manip.cc
> +++ b/gcc/tree-ssa-loop-manip.cc
> @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
> void
> create_iv (tree base, tree step, tree var, class loop *loop,
>    gimple_stmt_iterator *incr_pos, bool after,
> -    tree *var_before, tree *var_after)
> +    tree *var_before, tree *var_after, enum tree_code code)
> {
>    gassign *stmt;
>    gphi *phi;
>    tree initial, step1;
>    gimple_seq stmts;
>    tree vb, va;
> -  enum tree_code incr_op = PLUS_EXPR;
> +  enum tree_code incr_op = code;
>    edge pe = loop_preheader_edge (loop);
>    if (var != NULL_TREE)
> diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> index d49273a3987..da755320a3a 100644
> --- a/gcc/tree-ssa-loop-manip.h
> +++ b/gcc/tree-ssa-loop-manip.h
> @@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
> typedef void (*transform_callback)(class loop *, void *);
> extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
> -        bool, tree *, tree *);
> +        bool, tree *, tree *, enum tree_code = PLUS_EXPR);
> extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
> extern void verify_loop_closed_ssa (bool, class loop * = NULL);
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index f60fa50e8f4..f3cd6c51d2e 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    return next_ctrl;
> }
> +/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
> +   for all the rgroup controls in RGC and return a control that is nonzero
> +   when the loop needs to iterate.  Add any new preheader statements to
> +   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
> +
> +   RGC belongs to loop LOOP.  The loop originally iterated NITERS
> +   times and has been vectorized according to LOOP_VINFO.
> +
> +   vect_set_loop_controls_directly iterates a 0-based IV up to
> +   TEST_LIMIT - bias.
> +
> +   By contrast, vect_set_loop_controls_by_while_len starts the IV at
> +   TEST_LIMIT - bias and keeps subtracting from it the length calculated
> +   by the IFN_WHILE_LEN pattern.
> +
> +   Note: the cost of the code generated by this function is modeled
> +   by vect_estimate_min_profitable_iters, so changes here may need
> +   corresponding changes there.
> +
> +   1. Single rgroup, the Gimple IR should be:
> +
> + <bb 3>
> + _19 = (unsigned long) n_5(D);
> + ...
> +
> + <bb 4>:
> + ...
> + # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
> + ...
> + _22 = .WHILE_LEN (ivtmp_20, vf);
> + ...
> + vector statement (use _22);
> + ...
> + ivtmp_21 = ivtmp_20 - _22;
> + ...
> + if (ivtmp_21 != 0)
> +   goto <bb 4>; [75.00%]
> + else
> +   goto <bb 5>; [25.00%]
> +
> + <bb 5>
> + return;
> +
> +   Note: IFN_WHILE_LEN guarantees that "ivtmp_21 = ivtmp_20 - _22" never
> +   underflows 0.
> +
> +   2. Multiple rgroup, the Gimple IR should be:
> +
> + <bb 3>
> + _70 = (unsigned long) bnd.7_52;
> + _71 = _70 * 2;
> + _72 = MAX_EXPR <_71, 4>;
> + _73 = _72 + 18446744073709551612;
> + ...
> +
> + <bb 4>:
> + ...
> + # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
> + # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
> + _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
> + _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
> + ...
> + vector statement (use _79);
> + ...
> + vector statement (use _76);
> + ...
> + _65 = _79 / 2;
> + vector statement (use _65);
> + ...
> + _68 = _76 / 2;
> + vector statement (use _68);
> + ...
> + ivtmp_78 = ivtmp_77 - _79;
> + ivtmp_75 = ivtmp_74 - _76;
> + ...
> + if (ivtmp_78 != 0)
> +   goto <bb 4>; [75.00%]
> + else
> +   goto <bb 5>; [25.00%]
> +
> + <bb 5>
> + return;
> +
> +*/
> +
> +static tree
> +vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
> +      gimple_seq *preheader_seq,
> +      gimple_seq *header_seq,
> +      rgroup_controls *rgc, tree niters)
> +{
> +  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +  /* We are not allowing masked approach in WHILE_LEN.  */
> +  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +
> +  tree ctrl_type = rgc->type;
> +  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
> +  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
> +  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> +
> +  /* Calculate the maximum number of item values that the rgroup
> +     handles in total, the number that it handles for each iteration
> +     of the vector loop.  */
> +  tree nitems_total = niters;
> +  if (nitems_per_iter != 1)
> +    {
> +      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
> + these multiplications don't overflow.  */
> +      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
> +      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
> +    nitems_total, compare_factor);
> +    }
> +
> +  /* Convert the comparison value to the IV type (either a no-op or
> +     a promotion).  */
> +  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> +
> +  /* Create an induction variable that counts the number of items
> +     processed.  */
> +  tree index_before_incr, index_after_incr;
> +  gimple_stmt_iterator incr_gsi;
> +  bool insert_after;
> +  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +
> +  /* Test the decremented IV, which will never underflow 0 since we have
> +     IFN_WHILE_LEN to guarantee that.  */
> +  tree test_limit = nitems_total;
> +
> +  /* Provide a definition of each control in the group.  */
> +  tree ctrl;
> +  unsigned int i;
> +  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
> +    {
> +      /* Previous controls will cover BIAS items.  This control covers the
> + next batch.  */
> +      poly_uint64 bias = nitems_per_ctrl * i;
> +      tree bias_tree = build_int_cst (iv_type, bias);
> +
> +      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
> + BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
> + control and adjust the bound down by BIAS.  */
> +      tree this_test_limit = test_limit;
> +      if (i != 0)
> + {
> +   this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
> +   this_test_limit, bias_tree);
> +   this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
> +   this_test_limit, bias_tree);
> + }
> +
> +      /* Create decrement IV.  */
> +      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
> + insert_after, &index_before_incr, &index_after_incr,
> + MINUS_EXPR);
> +
> +      poly_uint64 final_vf = vf * nitems_per_iter;
> +      tree vf_step = build_int_cst (iv_type, final_vf);
> +      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
> +    index_before_incr, vf_step);
> +      gassign *assign = gimple_build_assign (ctrl, res_len);
> +      gimple_seq_add_stmt (header_seq, assign);
> +    }
> +
> +  return index_after_incr;
> +}
> +
> /* Set up the iteration condition and rgroup controls for LOOP, given
>     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
>     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
> @@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>    bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>    tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>    unsigned int compare_precision = TYPE_PRECISION (compare_type);
>    tree orig_niters = niters;
> @@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> /* Set up all controls for this group.  */
> - test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
> -      &preheader_seq,
> -      &header_seq,
> -      loop_cond_gsi, rgc,
> -      niters, niters_skip,
> -      might_wrap_p);
> + if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> +     OPTIMIZE_FOR_SPEED))
> +   test_ctrl
> +     = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
> +    &preheader_seq, &header_seq,
> +    rgc, niters);
> + else
> +   test_ctrl
> +     = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
> +        &header_seq, loop_cond_gsi, rgc,
> +        niters, niters_skip,
> +        might_wrap_p);
>        }
>    /* Emit all accumulated statements.  */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 1ba9f18d73e..5bffd9a6322 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>     rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
> tree
> -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> -    unsigned int nvectors, unsigned int index)
> +vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
> +    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
> +    unsigned int index)
> {
>    rgroup_controls *rgl = &(*lens)[nvectors - 1];
> -  bool use_bias_adjusted_len =
> -    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> +  bool use_bias_adjusted_len
> +    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>    /* Populate the rgroup's len array, if this is the first time we've
>       used it.  */
> @@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>   if (use_bias_adjusted_len)
>     {
>       gcc_assert (i == 0);
> -       tree adjusted_len =
> - make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> +       tree adjusted_len
> + = make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
>       SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
>       rgl->bias_adjusted_ctrl = adjusted_len;
>     }
> @@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>    if (use_bias_adjusted_len)
>      return rgl->bias_adjusted_ctrl;
> +  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
> +    OPTIMIZE_FOR_SPEED))
> +    {
> +      tree loop_len = rgl->controls[index];
> +      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
> +      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
> +      if (maybe_ne (nunits1, nunits2))
> + {
> +   /* A loop len for data type X can be reused for data type Y
> +      if X has N times more elements than Y and if Y's elements
> +      are N times bigger than X's.  */
> +   gcc_assert (multiple_p (nunits1, nunits2));
> +   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
> +   gimple_seq seq = NULL;
> +   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
> +    build_int_cst (iv_type, factor));
> +   if (seq)
> +     gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
> + }
> +      return loop_len;
> +    }
>    else
>      return rgl->controls[index];
> }
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index efa2d0daa52..708c8a1d806 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
>       else if (loop_lens)
> {
>   tree final_len
> -     = vect_get_loop_len (loop_vinfo, loop_lens,
> - vec_num * ncopies, vec_num * j + i);
> +     = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> + vec_num * ncopies, vectype,
> + vec_num * j + i);
>   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
>   machine_mode vmode = TYPE_MODE (vectype);
>   opt_machine_mode new_ovmode
> @@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
>     else if (loop_lens && memory_access_type != VMAT_INVARIANT)
>       {
> tree final_len
> -   = vect_get_loop_len (loop_vinfo, loop_lens,
> -        vec_num * ncopies,
> +   = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
> +        vec_num * ncopies, vectype,
>        vec_num * j + i);
> tree ptr = build_int_cst (ref_type,
>   align * BITS_PER_UNIT);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 9cf2fb23fe3..e5cf38caf4b 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
> unsigned int, tree, unsigned int);
> extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
>   tree, unsigned int);
> -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> -        unsigned int);
> +extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
> +        vec_loop_lens *, unsigned int, tree, unsigned int);
> extern gimple_seq vect_gen_len (tree, tree, tree, tree);
> extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-20  8:57     ` juzhe.zhong
@ 2023-04-20  9:11       ` Richard Sandiford
  2023-04-20  9:19         ` juzhe.zhong
  0 siblings, 1 reply; 41+ messages in thread
From: Richard Sandiford @ 2023-04-20  9:11 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: gcc-patches, rguenther, jeffreyalaw

"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Thanks Richard reminding me. I originally think community does not allow me support variable amount IV and let me do this in RISC-V backend.

No, I think that part should and needs to be done in the middle-end,
since if the initial IVs are incorrect, it's very difficult to fix
them up later.

But with the patch as originally presented, WHILE_LEN was just a
simple minimum operation, with only the final iteration being partial.
It didn't make sense IMO for that to be its own IFN.  It was only later
that you said that non-final iterations might be partial too.

And there was pushback against WHILE_LEN having an effect on global
state, rather than being a simple "how many elements should I process?"
calculation.  That last bit -- the global effect of VSETVL -- was the bit
that needed to be kept local to the RISC-V backend.

Thanks,
Richard

> It seems that I can do that in middle-end. Thank you so much. I will update the patch. Really appreciate it!
>
>
>
> juzhe.zhong@rivai.ai
>  
> From: Richard Sandiford
> Date: 2023-04-20 16:52
> To: 钟居哲
> CC: gcc-patches; rguenther; Jeff Law
> Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> 钟居哲 <juzhe.zhong@rivai.ai> writes:
>> Hi, Richards.
>> Since GCC 14 is open and this patch has been boostraped && tested on X86.
>> Is this patch supporting variable IV OK for the trunk ?
>  
> Doesn't the patch need updating based on the previous discussion?
> I thought the outcome was that WHILE_LEN isn't a simple MIN operation
> (contrary to the documentation in the patch) and that pointer IVs
> would also need to be updated by a variable amount, given that even
> non-final iterations might process fewer than VF elements.
>  
> Thanks,
> Richard
>  
>> juzhe.zhong@rivai.ai
>>  
>> From: juzhe.zhong
>> Date: 2023-04-07 09:47
>> To: gcc-patches
>> CC: richard.sandiford; rguenther; jeffreyalaw; Juzhe-Zhong
>> Subject: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
>> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
>>  
>> This patch is to add WHILE_LEN pattern.
>> It's inspired by RVV ISA simple "vvaddint32.s" example:
>> https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
>>  
>> More details are in "vect_set_loop_controls_by_while_len" implementation
>> and comments.
>>  
>> Consider such following case:
>> #define N 16
>> int src[N];
>> int dest[N];
>>  
>> void
>> foo (int n)
>> {
>>   for (int i = 0; i < n; i++)
>>     dest[i] = src[i];
>> }
>>  
>> -march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:
>>  
>> foo:        
>>         ble     a0,zero,.L1
>>         lui     a4,%hi(.LANCHOR0)
>>         addi    a4,a4,%lo(.LANCHOR0)
>>         addi    a3,a4,64
>>         csrr    a2,vlenb
>> .L3:
>>         vsetvli a5,a0,e32,m1,ta,ma
>>         vle32.v v1,0(a4)
>>         sub     a0,a0,a5
>>         vse32.v v1,0(a3)
>>         add     a4,a4,a2
>>         add     a3,a3,a2
>>         bne     a0,zero,.L3
>> .L1:
>>         ret
>>  
>> gcc/ChangeLog:
>>  
>>         * doc/md.texi: Add WHILE_LEN support.
>>         * internal-fn.cc (while_len_direct): Ditto.
>>         (expand_while_len_optab_fn): Ditto.
>>         (direct_while_len_optab_supported_p): Ditto.
>>         * internal-fn.def (WHILE_LEN): Ditto.
>>         * optabs.def (OPTAB_D): Ditto.
>>         * tree-ssa-loop-manip.cc (create_iv): Ditto.
>>         * tree-ssa-loop-manip.h (create_iv): Ditto.
>>         * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
>>         (vect_set_loop_condition_partial_vectors): Ditto.
>>         * tree-vect-loop.cc (vect_get_loop_len): Ditto.
>>         * tree-vect-stmts.cc (vectorizable_store): Ditto.
>>         (vectorizable_load): Ditto.
>>         * tree-vectorizer.h (vect_get_loop_len): Ditto.
>>  
>> ---
>> gcc/doc/md.texi             |  14 +++
>> gcc/internal-fn.cc          |  29 ++++++
>> gcc/internal-fn.def         |   1 +
>> gcc/optabs.def              |   1 +
>> gcc/tree-ssa-loop-manip.cc  |   4 +-
>> gcc/tree-ssa-loop-manip.h   |   2 +-
>> gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
>> gcc/tree-vect-loop.cc       |  35 +++++--
>> gcc/tree-vect-stmts.cc      |   9 +-
>> gcc/tree-vectorizer.h       |   4 +-
>> 10 files changed, 264 insertions(+), 21 deletions(-)
>>  
>> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
>> index 8e3113599fd..72178ab014c 100644
>> --- a/gcc/doc/md.texi
>> +++ b/gcc/doc/md.texi
>> @@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
>>    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
>> @end smallexample
>> +@cindex @code{while_len@var{m}@var{n}} instruction pattern
>> +@item @code{while_len@var{m}@var{n}}
>> +Set operand 0 to the number of active elements in vector will be updated value.
>> +operand 1 is the total elements need to be updated value.
>> +operand 2 is the vectorization factor.
>> +The operation is equivalent to:
>> +
>> +@smallexample
>> +operand0 = MIN (operand1, operand2);
>> +operand2 can be const_poly_int or poly_int related to vector mode size.
>> +Some target like RISC-V has a standalone instruction to get MIN (n, MODE SIZE) so
>> +that we can reduce a use of general purpose register.
>> +@end smallexample
>> +
>> @cindex @code{check_raw_ptrs@var{m}} instruction pattern
>> @item @samp{check_raw_ptrs@var{m}}
>> Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
>> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
>> index 6e81dc05e0e..5f44def90d3 100644
>> --- a/gcc/internal-fn.cc
>> +++ b/gcc/internal-fn.cc
>> @@ -127,6 +127,7 @@ init_internal_fns ()
>> #define cond_binary_direct { 1, 1, true }
>> #define cond_ternary_direct { 1, 1, true }
>> #define while_direct { 0, 2, false }
>> +#define while_len_direct { 0, 0, false }
>> #define fold_extract_direct { 2, 2, false }
>> #define fold_left_direct { 1, 1, false }
>> #define mask_fold_left_direct { 1, 1, false }
>> @@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>>      emit_move_insn (lhs_rtx, ops[0].value);
>> }
>> +/* Expand WHILE_LEN call STMT using optab OPTAB.  */
>> +static void
>> +expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>> +{
>> +  expand_operand ops[3];
>> +  tree rhs_type[2];
>> +
>> +  tree lhs = gimple_call_lhs (stmt);
>> +  tree lhs_type = TREE_TYPE (lhs);
>> +  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
>> +  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
>> +
>> +  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
>> +    {
>> +      tree rhs = gimple_call_arg (stmt, i);
>> +      rhs_type[i] = TREE_TYPE (rhs);
>> +      rtx rhs_rtx = expand_normal (rhs);
>> +      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
>> +    }
>> +
>> +  insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
>> +
>> +  expand_insn (icode, 3, ops);
>> +  if (!rtx_equal_p (lhs_rtx, ops[0].value))
>> +    emit_move_insn (lhs_rtx, ops[0].value);
>> +}
>> +
>> /* Expand a call to a convert-like optab using the operands in STMT.
>>     FN has a single output operand and NARGS input operands.  */
>> @@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
>> #define direct_scatter_store_optab_supported_p convert_optab_supported_p
>> #define direct_len_store_optab_supported_p direct_optab_supported_p
>> #define direct_while_optab_supported_p convert_optab_supported_p
>> +#define direct_while_len_optab_supported_p direct_optab_supported_p
>> #define direct_fold_extract_optab_supported_p direct_optab_supported_p
>> #define direct_fold_left_optab_supported_p direct_optab_supported_p
>> #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
>> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
>> index 7fe742c2ae7..3a933abff5d 100644
>> --- a/gcc/internal-fn.def
>> +++ b/gcc/internal-fn.def
>> @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
>> DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
>> DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
>> +DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
>> DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
>>        check_raw_ptrs, check_ptrs)
>> DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
>> diff --git a/gcc/optabs.def b/gcc/optabs.def
>> index 695f5911b30..f5938bd2c24 100644
>> --- a/gcc/optabs.def
>> +++ b/gcc/optabs.def
>> @@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
>> OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
>> OPTAB_D (len_load_optab, "len_load_$a")
>> OPTAB_D (len_store_optab, "len_store_$a")
>> +OPTAB_D (while_len_optab, "while_len$a")
>> diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
>> index 09acc1c94cc..cdbf280e249 100644
>> --- a/gcc/tree-ssa-loop-manip.cc
>> +++ b/gcc/tree-ssa-loop-manip.cc
>> @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
>> void
>> create_iv (tree base, tree step, tree var, class loop *loop,
>>    gimple_stmt_iterator *incr_pos, bool after,
>> -    tree *var_before, tree *var_after)
>> +    tree *var_before, tree *var_after, enum tree_code code)
>> {
>>    gassign *stmt;
>>    gphi *phi;
>>    tree initial, step1;
>>    gimple_seq stmts;
>>    tree vb, va;
>> -  enum tree_code incr_op = PLUS_EXPR;
>> +  enum tree_code incr_op = code;
>>    edge pe = loop_preheader_edge (loop);
>>    if (var != NULL_TREE)
>> diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
>> index d49273a3987..da755320a3a 100644
>> --- a/gcc/tree-ssa-loop-manip.h
>> +++ b/gcc/tree-ssa-loop-manip.h
>> @@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
>> typedef void (*transform_callback)(class loop *, void *);
>> extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
>> -        bool, tree *, tree *);
>> +        bool, tree *, tree *, enum tree_code = PLUS_EXPR);
>> extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
>> extern void verify_loop_closed_ssa (bool, class loop * = NULL);
>> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
>> index f60fa50e8f4..f3cd6c51d2e 100644
>> --- a/gcc/tree-vect-loop-manip.cc
>> +++ b/gcc/tree-vect-loop-manip.cc
>> @@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>>    return next_ctrl;
>> }
>> +/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
>> +   for all the rgroup controls in RGC and return a control that is nonzero
>> +   when the loop needs to iterate.  Add any new preheader statements to
>> +   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
>> +
>> +   RGC belongs to loop LOOP.  The loop originally iterated NITERS
>> +   times and has been vectorized according to LOOP_VINFO.
>> +
>> +   Unlike vect_set_loop_controls_directly which is iterating from 0-based IV
>> +   to TEST_LIMIT - bias.
>> +
>> +   In vect_set_loop_controls_by_while_len, we are iterating from start at
>> +   IV = TEST_LIMIT - bias and keep subtract IV by the length calculated by
>> +   IFN_WHILE_LEN pattern.
>> +
>> +   Note: the cost of the code generated by this function is modeled
>> +   by vect_estimate_min_profitable_iters, so changes here may need
>> +   corresponding changes there.
>> +
>> +   1. Single rgroup, the Gimple IR should be:
>> +
>> + <bb 3>
>> + _19 = (unsigned long) n_5(D);
>> + ...
>> +
>> + <bb 4>:
>> + ...
>> + # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
>> + ...
>> + _22 = .WHILE_LEN (ivtmp_20, vf);
>> + ...
>> + vector statement (use _22);
>> + ...
>> + ivtmp_21 = ivtmp_20 - _22;
>> + ...
>> + if (ivtmp_21 != 0)
>> +   goto <bb 4>; [75.00%]
>> + else
>> +   goto <bb 5>; [25.00%]
>> +
>> + <bb 5>
>> + return;
>> +
>> +   Note: IFN_WHILE_LEN will guarantee "ivtmp_21 = ivtmp_20 - _22" never
>> +   underflow 0.
>> +
>> +   2. Multiple rgroup, the Gimple IR should be:
>> +
>> + <bb 3>
>> + _70 = (unsigned long) bnd.7_52;
>> + _71 = _70 * 2;
>> + _72 = MAX_EXPR <_71, 4>;
>> + _73 = _72 + 18446744073709551612;
>> + ...
>> +
>> + <bb 4>:
>> + ...
>> + # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
>> + # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
>> + _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
>> + _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
>> + ...
>> + vector statement (use _79);
>> + ...
>> + vector statement (use _76);
>> + ...
>> + _65 = _79 / 2;
>> + vector statement (use _65);
>> + ...
>> + _68 = _76 / 2;
>> + vector statement (use _68);
>> + ...
>> + ivtmp_78 = ivtmp_77 - _79;
>> + ivtmp_75 = ivtmp_74 - _76;
>> + ...
>> + if (ivtmp_78 != 0)
>> +   goto <bb 4>; [75.00%]
>> + else
>> +   goto <bb 5>; [25.00%]
>> +
>> + <bb 5>
>> + return;
>> +
>> +*/
>> +
>> +static tree
>> +vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
>> +      gimple_seq *preheader_seq,
>> +      gimple_seq *header_seq,
>> +      rgroup_controls *rgc, tree niters)
>> +{
>> +  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
>> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>> +  /* We are not allowing masked approach in WHILE_LEN.  */
>> +  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
>> +
>> +  tree ctrl_type = rgc->type;
>> +  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
>> +  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
>> +  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>> +
>> +  /* Calculate the maximum number of item values that the rgroup
>> +     handles in total, the number that it handles for each iteration
>> +     of the vector loop.  */
>> +  tree nitems_total = niters;
>> +  if (nitems_per_iter != 1)
>> +    {
>> +      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
>> + these multiplications don't overflow.  */
>> +      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
>> +      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
>> +    nitems_total, compare_factor);
>> +    }
>> +
>> +  /* Convert the comparison value to the IV type (either a no-op or
>> +     a promotion).  */
>> +  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
>> +
>> +  /* Create an induction variable that counts the number of items
>> +     processed.  */
>> +  tree index_before_incr, index_after_incr;
>> +  gimple_stmt_iterator incr_gsi;
>> +  bool insert_after;
>> +  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
>> +
>> +  /* Test the decremented IV, which will never underflow 0 since we have
>> +     IFN_WHILE_LEN to gurantee that.  */
>> +  tree test_limit = nitems_total;
>> +
>> +  /* Provide a definition of each control in the group.  */
>> +  tree ctrl;
>> +  unsigned int i;
>> +  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
>> +    {
>> +      /* Previous controls will cover BIAS items.  This control covers the
>> + next batch.  */
>> +      poly_uint64 bias = nitems_per_ctrl * i;
>> +      tree bias_tree = build_int_cst (iv_type, bias);
>> +
>> +      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
>> + BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
>> + control and adjust the bound down by BIAS.  */
>> +      tree this_test_limit = test_limit;
>> +      if (i != 0)
>> + {
>> +   this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
>> +   this_test_limit, bias_tree);
>> +   this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
>> +   this_test_limit, bias_tree);
>> + }
>> +
>> +      /* Create decrement IV.  */
>> +      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
>> + insert_after, &index_before_incr, &index_after_incr,
>> + MINUS_EXPR);
>> +
>> +      poly_uint64 final_vf = vf * nitems_per_iter;
>> +      tree vf_step = build_int_cst (iv_type, final_vf);
>> +      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
>> +    index_before_incr, vf_step);
>> +      gassign *assign = gimple_build_assign (ctrl, res_len);
>> +      gimple_seq_add_stmt (header_seq, assign);
>> +    }
>> +
>> +  return index_after_incr;
>> +}
>> +
>> /* Set up the iteration condition and rgroup controls for LOOP, given
>>     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
>>     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
>> @@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>>    bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>>    tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
>> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>>    unsigned int compare_precision = TYPE_PRECISION (compare_type);
>>    tree orig_niters = niters;
>> @@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>> bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
>> /* Set up all controls for this group.  */
>> - test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
>> -      &preheader_seq,
>> -      &header_seq,
>> -      loop_cond_gsi, rgc,
>> -      niters, niters_skip,
>> -      might_wrap_p);
>> + if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
>> +     OPTIMIZE_FOR_SPEED))
>> +   test_ctrl
>> +     = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
>> +    &preheader_seq, &header_seq,
>> +    rgc, niters);
>> + else
>> +   test_ctrl
>> +     = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
>> +        &header_seq, loop_cond_gsi, rgc,
>> +        niters, niters_skip,
>> +        might_wrap_p);
>>        }
>>    /* Emit all accumulated statements.  */
>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>> index 1ba9f18d73e..5bffd9a6322 100644
>> --- a/gcc/tree-vect-loop.cc
>> +++ b/gcc/tree-vect-loop.cc
>> @@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>>     rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
>> tree
>> -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>> -    unsigned int nvectors, unsigned int index)
>> +vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
>> +    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
>> +    unsigned int index)
>> {
>>    rgroup_controls *rgl = &(*lens)[nvectors - 1];
>> -  bool use_bias_adjusted_len =
>> -    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
>> +  bool use_bias_adjusted_len
>> +    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
>> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>>    /* Populate the rgroup's len array, if this is the first time we've
>>       used it.  */
>> @@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>>   if (use_bias_adjusted_len)
>>     {
>>       gcc_assert (i == 0);
>> -       tree adjusted_len =
>> - make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
>> +       tree adjusted_len
>> + = make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
>>       SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
>>       rgl->bias_adjusted_ctrl = adjusted_len;
>>     }
>> @@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>>    if (use_bias_adjusted_len)
>>      return rgl->bias_adjusted_ctrl;
>> +  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
>> +    OPTIMIZE_FOR_SPEED))
>> +    {
>> +      tree loop_len = rgl->controls[index];
>> +      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
>> +      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
>> +      if (maybe_ne (nunits1, nunits2))
>> + {
>> +   /* A loop len for data type X can be reused for data type Y
>> +      if X has N times more elements than Y and if Y's elements
>> +      are N times bigger than X's.  */
>> +   gcc_assert (multiple_p (nunits1, nunits2));
>> +   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
>> +   gimple_seq seq = NULL;
>> +   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
>> +    build_int_cst (iv_type, factor));
>> +   if (seq)
>> +     gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
>> + }
>> +      return loop_len;
>> +    }
>>    else
>>      return rgl->controls[index];
>> }
>> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
>> index efa2d0daa52..708c8a1d806 100644
>> --- a/gcc/tree-vect-stmts.cc
>> +++ b/gcc/tree-vect-stmts.cc
>> @@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
>>       else if (loop_lens)
>> {
>>   tree final_len
>> -     = vect_get_loop_len (loop_vinfo, loop_lens,
>> - vec_num * ncopies, vec_num * j + i);
>> +     = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
>> + vec_num * ncopies, vectype,
>> + vec_num * j + i);
>>   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
>>   machine_mode vmode = TYPE_MODE (vectype);
>>   opt_machine_mode new_ovmode
>> @@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
>>     else if (loop_lens && memory_access_type != VMAT_INVARIANT)
>>       {
>> tree final_len
>> -   = vect_get_loop_len (loop_vinfo, loop_lens,
>> -        vec_num * ncopies,
>> +   = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
>> +        vec_num * ncopies, vectype,
>>        vec_num * j + i);
>> tree ptr = build_int_cst (ref_type,
>>   align * BITS_PER_UNIT);
>> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
>> index 9cf2fb23fe3..e5cf38caf4b 100644
>> --- a/gcc/tree-vectorizer.h
>> +++ b/gcc/tree-vectorizer.h
>> @@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
>> unsigned int, tree, unsigned int);
>> extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
>>   tree, unsigned int);
>> -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
>> -        unsigned int);
>> +extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
>> +        vec_loop_lens *, unsigned int, tree, unsigned int);
>> extern gimple_seq vect_gen_len (tree, tree, tree, tree);
>> extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
>> extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
>  

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-20  9:11       ` Richard Sandiford
@ 2023-04-20  9:19         ` juzhe.zhong
  2023-04-20  9:22           ` Richard Sandiford
  0 siblings, 1 reply; 41+ messages in thread
From: juzhe.zhong @ 2023-04-20  9:19 UTC (permalink / raw)
  To: richard.sandiford; +Cc: gcc-patches, rguenther, jeffreyalaw

[-- Attachment #1: Type: text/plain, Size: 23717 bytes --]

OK. Thanks, Richard.
So let me summarize:
1. The community agrees that I should support variable-amount IVs in the middle-end.
2. We can keep the WHILE_LEN pattern for the case where "not only the final iteration is partial",
    and I should describe that more clearly in the documentation.

I will do these two things in the next update of the patch.
Is that right? Feel free to correct me.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-04-20 17:11
To: juzhe.zhong@rivai.ai
CC: gcc-patches; rguenther; jeffreyalaw
Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Thanks Richard reminding me. I originally think community does not allow me support variable amount IV and let me do this in RISC-V backend.
 
No, I think that part should and needs to be done in the middle-end,
since if the initial IVs are incorrect, it's very difficult to fix
them up later.
 
But with the patch as originally presented, WHILE_LEN was just a
simple minimum operation, with only the final iteration being partial.
It didn't make sense IMO for that to be its own IFN.  It was only later
that you said that non-final iterations might be partial too.
 
And there was pushback against WHILE_LEN having an effect on global
state, rather than being a simple "how many elements should I process?"
calculation.  That last bit -- the global effect of VSETVL -- was the bit
that needed to be kept local to the RISC-V backend.
 
Thanks,
Richard
 
> It seems that I can do that in middle-end. Thank you so much. I will update the patch. Really appreciate it!
>
>
>
> juzhe.zhong@rivai.ai
>  
> From: Richard Sandiford
> Date: 2023-04-20 16:52
> To: 钟居哲
> CC: gcc-patches; rguenther; Jeff Law
> Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> 钟居哲 <juzhe.zhong@rivai.ai> writes:
>> Hi, Richards.
>> Since GCC 14 is open and this patch has been boostraped && tested on X86.
>> Is this patch supporting variable IV OK for the trunk ?
>  
> Doesn't the patch need updating based on the previous discussion?
> I thought the outcome was that WHILE_LEN isn't a simple MIN operation
> (contrary to the documentation in the patch) and that pointer IVs
> would also need to be updated by a variable amount, given that even
> non-final iterations might process fewer than VF elements.
>  
> Thanks,
> Richard
>  
>> juzhe.zhong@rivai.ai
>>  
>> From: juzhe.zhong
>> Date: 2023-04-07 09:47
>> To: gcc-patches
>> CC: richard.sandiford; rguenther; jeffreyalaw; Juzhe-Zhong
>> Subject: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
>> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
>>  
>> This patch is to add WHILE_LEN pattern.
>> It's inspired by RVV ISA simple "vvaddint32.s" example:
>> https://github.com/riscv/riscv-v-spec/blob/master/example/vvaddint32.s
>>  
>> More details are in "vect_set_loop_controls_by_while_len" implementation
>> and comments.
>>  
>> Consider such following case:
>> #define N 16
>> int src[N];
>> int dest[N];
>>  
>> void
>> foo (int n)
>> {
>>   for (int i = 0; i < n; i++)
>>     dest[i] = src[i];
>> }
>>  
>> -march=rv64gcv -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns:
>>  
>> foo:        
>>         ble     a0,zero,.L1
>>         lui     a4,%hi(.LANCHOR0)
>>         addi    a4,a4,%lo(.LANCHOR0)
>>         addi    a3,a4,64
>>         csrr    a2,vlenb
>> .L3:
>>         vsetvli a5,a0,e32,m1,ta,ma
>>         vle32.v v1,0(a4)
>>         sub     a0,a0,a5
>>         vse32.v v1,0(a3)
>>         add     a4,a4,a2
>>         add     a3,a3,a2
>>         bne     a0,zero,.L3
>> .L1:
>>         ret
>>  
>> gcc/ChangeLog:
>>  
>>         * doc/md.texi: Add WHILE_LEN support.
>>         * internal-fn.cc (while_len_direct): Ditto.
>>         (expand_while_len_optab_fn): Ditto.
>>         (direct_while_len_optab_supported_p): Ditto.
>>         * internal-fn.def (WHILE_LEN): Ditto.
>>         * optabs.def (OPTAB_D): Ditto.
>>         * tree-ssa-loop-manip.cc (create_iv): Ditto.
>>         * tree-ssa-loop-manip.h (create_iv): Ditto.
>>         * tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): Ditto.
>>         (vect_set_loop_condition_partial_vectors): Ditto.
>>         * tree-vect-loop.cc (vect_get_loop_len): Ditto.
>>         * tree-vect-stmts.cc (vectorizable_store): Ditto.
>>         (vectorizable_load): Ditto.
>>         * tree-vectorizer.h (vect_get_loop_len): Ditto.
>>  
>> ---
>> gcc/doc/md.texi             |  14 +++
>> gcc/internal-fn.cc          |  29 ++++++
>> gcc/internal-fn.def         |   1 +
>> gcc/optabs.def              |   1 +
>> gcc/tree-ssa-loop-manip.cc  |   4 +-
>> gcc/tree-ssa-loop-manip.h   |   2 +-
>> gcc/tree-vect-loop-manip.cc | 186 ++++++++++++++++++++++++++++++++++--
>> gcc/tree-vect-loop.cc       |  35 +++++--
>> gcc/tree-vect-stmts.cc      |   9 +-
>> gcc/tree-vectorizer.h       |   4 +-
>> 10 files changed, 264 insertions(+), 21 deletions(-)
>>  
>> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
>> index 8e3113599fd..72178ab014c 100644
>> --- a/gcc/doc/md.texi
>> +++ b/gcc/doc/md.texi
>> @@ -4965,6 +4965,20 @@ for (i = 1; i < operand3; i++)
>>    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
>> @end smallexample
>> +@cindex @code{while_len@var{m}@var{n}} instruction pattern
>> +@item @code{while_len@var{m}@var{n}}
>> +Set operand 0 to the number of active elements in vector will be updated value.
>> +operand 1 is the total elements need to be updated value.
>> +operand 2 is the vectorization factor.
>> +The operation is equivalent to:
>> +
>> +@smallexample
>> +operand0 = MIN (operand1, operand2);
>> +operand2 can be const_poly_int or poly_int related to vector mode size.
>> +Some target like RISC-V has a standalone instruction to get MIN (n, MODE SIZE) so
>> +that we can reduce a use of general purpose register.
>> +@end smallexample
>> +
>> @cindex @code{check_raw_ptrs@var{m}} instruction pattern
>> @item @samp{check_raw_ptrs@var{m}}
>> Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
>> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
>> index 6e81dc05e0e..5f44def90d3 100644
>> --- a/gcc/internal-fn.cc
>> +++ b/gcc/internal-fn.cc
>> @@ -127,6 +127,7 @@ init_internal_fns ()
>> #define cond_binary_direct { 1, 1, true }
>> #define cond_ternary_direct { 1, 1, true }
>> #define while_direct { 0, 2, false }
>> +#define while_len_direct { 0, 0, false }
>> #define fold_extract_direct { 2, 2, false }
>> #define fold_left_direct { 1, 1, false }
>> #define mask_fold_left_direct { 1, 1, false }
>> @@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>>      emit_move_insn (lhs_rtx, ops[0].value);
>> }
>> +/* Expand WHILE_LEN call STMT using optab OPTAB.  */
>> +static void
>> +expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>> +{
>> +  expand_operand ops[3];
>> +  tree rhs_type[2];
>> +
>> +  tree lhs = gimple_call_lhs (stmt);
>> +  tree lhs_type = TREE_TYPE (lhs);
>> +  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
>> +  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
>> +
>> +  for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
>> +    {
>> +      tree rhs = gimple_call_arg (stmt, i);
>> +      rhs_type[i] = TREE_TYPE (rhs);
>> +      rtx rhs_rtx = expand_normal (rhs);
>> +      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
>> +    }
>> +
>> +  insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
>> +
>> +  expand_insn (icode, 3, ops);
>> +  if (!rtx_equal_p (lhs_rtx, ops[0].value))
>> +    emit_move_insn (lhs_rtx, ops[0].value);
>> +}
>> +
>> /* Expand a call to a convert-like optab using the operands in STMT.
>>     FN has a single output operand and NARGS input operands.  */
>> @@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
>> #define direct_scatter_store_optab_supported_p convert_optab_supported_p
>> #define direct_len_store_optab_supported_p direct_optab_supported_p
>> #define direct_while_optab_supported_p convert_optab_supported_p
>> +#define direct_while_len_optab_supported_p direct_optab_supported_p
>> #define direct_fold_extract_optab_supported_p direct_optab_supported_p
>> #define direct_fold_left_optab_supported_p direct_optab_supported_p
>> #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
>> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
>> index 7fe742c2ae7..3a933abff5d 100644
>> --- a/gcc/internal-fn.def
>> +++ b/gcc/internal-fn.def
>> @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
>> DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
>> DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
>> +DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
>> DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
>>        check_raw_ptrs, check_ptrs)
>> DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
>> diff --git a/gcc/optabs.def b/gcc/optabs.def
>> index 695f5911b30..f5938bd2c24 100644
>> --- a/gcc/optabs.def
>> +++ b/gcc/optabs.def
>> @@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
>> OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
>> OPTAB_D (len_load_optab, "len_load_$a")
>> OPTAB_D (len_store_optab, "len_store_$a")
>> +OPTAB_D (while_len_optab, "while_len$a")
>> diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
>> index 09acc1c94cc..cdbf280e249 100644
>> --- a/gcc/tree-ssa-loop-manip.cc
>> +++ b/gcc/tree-ssa-loop-manip.cc
>> @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
>> void
>> create_iv (tree base, tree step, tree var, class loop *loop,
>>    gimple_stmt_iterator *incr_pos, bool after,
>> -    tree *var_before, tree *var_after)
>> +    tree *var_before, tree *var_after, enum tree_code code)
>> {
>>    gassign *stmt;
>>    gphi *phi;
>>    tree initial, step1;
>>    gimple_seq stmts;
>>    tree vb, va;
>> -  enum tree_code incr_op = PLUS_EXPR;
>> +  enum tree_code incr_op = code;
>>    edge pe = loop_preheader_edge (loop);
>>    if (var != NULL_TREE)
>> diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
>> index d49273a3987..da755320a3a 100644
>> --- a/gcc/tree-ssa-loop-manip.h
>> +++ b/gcc/tree-ssa-loop-manip.h
>> @@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
>> typedef void (*transform_callback)(class loop *, void *);
>> extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
>> -        bool, tree *, tree *);
>> +        bool, tree *, tree *, enum tree_code = PLUS_EXPR);
>> extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
>> extern void verify_loop_closed_ssa (bool, class loop * = NULL);
>> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
>> index f60fa50e8f4..f3cd6c51d2e 100644
>> --- a/gcc/tree-vect-loop-manip.cc
>> +++ b/gcc/tree-vect-loop-manip.cc
>> @@ -682,6 +682,173 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>>    return next_ctrl;
>> }
>> +/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
>> +   for all the rgroup controls in RGC and return a control that is nonzero
>> +   when the loop needs to iterate.  Add any new preheader statements to
>> +   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
>> +
>> +   RGC belongs to loop LOOP.  The loop originally iterated NITERS
>> +   times and has been vectorized according to LOOP_VINFO.
>> +
>> +   This differs from vect_set_loop_controls_directly, which iterates a
>> +   0-based IV up to TEST_LIMIT - bias.
>> +
>> +   In vect_set_loop_controls_by_while_len, we start the IV at
>> +   TEST_LIMIT - bias and keep subtracting from it the length calculated by
>> +   the IFN_WHILE_LEN pattern.
>> +
>> +   Note: the cost of the code generated by this function is modeled
>> +   by vect_estimate_min_profitable_iters, so changes here may need
>> +   corresponding changes there.
>> +
>> +   1. Single rgroup, the Gimple IR should be:
>> +
>> + <bb 3>
>> + _19 = (unsigned long) n_5(D);
>> + ...
>> +
>> + <bb 4>:
>> + ...
>> + # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
>> + ...
>> + _22 = .WHILE_LEN (ivtmp_20, vf);
>> + ...
>> + vector statement (use _22);
>> + ...
>> + ivtmp_21 = ivtmp_20 - _22;
>> + ...
>> + if (ivtmp_21 != 0)
>> +   goto <bb 4>; [75.00%]
>> + else
>> +   goto <bb 5>; [25.00%]
>> +
>> + <bb 5>
>> + return;
>> +
>> +   Note: IFN_WHILE_LEN guarantees that "ivtmp_21 = ivtmp_20 - _22" never
>> +   underflows 0.
>> +
>> +   2. Multiple rgroup, the Gimple IR should be:
>> +
>> + <bb 3>
>> + _70 = (unsigned long) bnd.7_52;
>> + _71 = _70 * 2;
>> + _72 = MAX_EXPR <_71, 4>;
>> + _73 = _72 + 18446744073709551612;
>> + ...
>> +
>> + <bb 4>:
>> + ...
>> + # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
>> + # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
>> + _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
>> + _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
>> + ...
>> + vector statement (use _79);
>> + ...
>> + vector statement (use _76);
>> + ...
>> + _65 = _79 / 2;
>> + vector statement (use _65);
>> + ...
>> + _68 = _76 / 2;
>> + vector statement (use _68);
>> + ...
>> + ivtmp_78 = ivtmp_77 - _79;
>> + ivtmp_75 = ivtmp_74 - _76;
>> + ...
>> + if (ivtmp_78 != 0)
>> +   goto <bb 4>; [75.00%]
>> + else
>> +   goto <bb 5>; [25.00%]
>> +
>> + <bb 5>
>> + return;
>> +
>> +*/
>> +
>> +static tree
>> +vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
>> +      gimple_seq *preheader_seq,
>> +      gimple_seq *header_seq,
>> +      rgroup_controls *rgc, tree niters)
>> +{
>> +  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
>> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>> +  /* We are not allowing masked approach in WHILE_LEN.  */
>> +  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
>> +
>> +  tree ctrl_type = rgc->type;
>> +  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
>> +  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
>> +  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>> +
>> +  /* Calculate the maximum number of item values that the rgroup
>> +     handles in total, the number that it handles for each iteration
>> +     of the vector loop.  */
>> +  tree nitems_total = niters;
>> +  if (nitems_per_iter != 1)
>> +    {
>> +      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
>> + these multiplications don't overflow.  */
>> +      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
>> +      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
>> +    nitems_total, compare_factor);
>> +    }
>> +
>> +  /* Convert the comparison value to the IV type (either a no-op or
>> +     a promotion).  */
>> +  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
>> +
>> +  /* Create an induction variable that counts the number of items
>> +     processed.  */
>> +  tree index_before_incr, index_after_incr;
>> +  gimple_stmt_iterator incr_gsi;
>> +  bool insert_after;
>> +  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
>> +
>> +  /* Test the decremented IV, which will never underflow 0 since we have
>> +     IFN_WHILE_LEN to guarantee that.  */
>> +  tree test_limit = nitems_total;
>> +
>> +  /* Provide a definition of each control in the group.  */
>> +  tree ctrl;
>> +  unsigned int i;
>> +  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
>> +    {
>> +      /* Previous controls will cover BIAS items.  This control covers the
>> + next batch.  */
>> +      poly_uint64 bias = nitems_per_ctrl * i;
>> +      tree bias_tree = build_int_cst (iv_type, bias);
>> +
>> +      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
>> + BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
>> + control and adjust the bound down by BIAS.  */
>> +      tree this_test_limit = test_limit;
>> +      if (i != 0)
>> + {
>> +   this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
>> +   this_test_limit, bias_tree);
>> +   this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
>> +   this_test_limit, bias_tree);
>> + }
>> +
>> +      /* Create decrement IV.  */
>> +      create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
>> + insert_after, &index_before_incr, &index_after_incr,
>> + MINUS_EXPR);
>> +
>> +      poly_uint64 final_vf = vf * nitems_per_iter;
>> +      tree vf_step = build_int_cst (iv_type, final_vf);
>> +      tree res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
>> +    index_before_incr, vf_step);
>> +      gassign *assign = gimple_build_assign (ctrl, res_len);
>> +      gimple_seq_add_stmt (header_seq, assign);
>> +    }
>> +
>> +  return index_after_incr;
>> +}
>> +
>> /* Set up the iteration condition and rgroup controls for LOOP, given
>>     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
>>     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
>> @@ -703,6 +870,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>>    bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>>    tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
>> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>>    unsigned int compare_precision = TYPE_PRECISION (compare_type);
>>    tree orig_niters = niters;
>> @@ -757,12 +925,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>> bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
>> /* Set up all controls for this group.  */
>> - test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
>> -      &preheader_seq,
>> -      &header_seq,
>> -      loop_cond_gsi, rgc,
>> -      niters, niters_skip,
>> -      might_wrap_p);
>> + if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
>> +     OPTIMIZE_FOR_SPEED))
>> +   test_ctrl
>> +     = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
>> +    &preheader_seq, &header_seq,
>> +    rgc, niters);
>> + else
>> +   test_ctrl
>> +     = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
>> +        &header_seq, loop_cond_gsi, rgc,
>> +        niters, niters_skip,
>> +        might_wrap_p);
>>        }
>>    /* Emit all accumulated statements.  */
>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>> index 1ba9f18d73e..5bffd9a6322 100644
>> --- a/gcc/tree-vect-loop.cc
>> +++ b/gcc/tree-vect-loop.cc
>> @@ -10360,12 +10360,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>>     rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
>> tree
>> -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>> -    unsigned int nvectors, unsigned int index)
>> +vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
>> +    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
>> +    unsigned int index)
>> {
>>    rgroup_controls *rgl = &(*lens)[nvectors - 1];
>> -  bool use_bias_adjusted_len =
>> -    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
>> +  bool use_bias_adjusted_len
>> +    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
>> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>>    /* Populate the rgroup's len array, if this is the first time we've
>>       used it.  */
>> @@ -10386,8 +10388,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>>   if (use_bias_adjusted_len)
>>     {
>>       gcc_assert (i == 0);
>> -       tree adjusted_len =
>> - make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
>> +       tree adjusted_len
>> + = make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
>>       SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
>>       rgl->bias_adjusted_ctrl = adjusted_len;
>>     }
>> @@ -10396,6 +10398,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>>    if (use_bias_adjusted_len)
>>      return rgl->bias_adjusted_ctrl;
>> +  else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
>> +    OPTIMIZE_FOR_SPEED))
>> +    {
>> +      tree loop_len = rgl->controls[index];
>> +      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
>> +      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
>> +      if (maybe_ne (nunits1, nunits2))
>> + {
>> +   /* A loop len for data type X can be reused for data type Y
>> +      if X has N times more elements than Y and if Y's elements
>> +      are N times bigger than X's.  */
>> +   gcc_assert (multiple_p (nunits1, nunits2));
>> +   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
>> +   gimple_seq seq = NULL;
>> +   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
>> +    build_int_cst (iv_type, factor));
>> +   if (seq)
>> +     gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
>> + }
>> +      return loop_len;
>> +    }
>>    else
>>      return rgl->controls[index];
>> }
>> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
>> index efa2d0daa52..708c8a1d806 100644
>> --- a/gcc/tree-vect-stmts.cc
>> +++ b/gcc/tree-vect-stmts.cc
>> @@ -8653,8 +8653,9 @@ vectorizable_store (vec_info *vinfo,
>>       else if (loop_lens)
>> {
>>   tree final_len
>> -     = vect_get_loop_len (loop_vinfo, loop_lens,
>> - vec_num * ncopies, vec_num * j + i);
>> +     = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
>> + vec_num * ncopies, vectype,
>> + vec_num * j + i);
>>   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
>>   machine_mode vmode = TYPE_MODE (vectype);
>>   opt_machine_mode new_ovmode
>> @@ -10009,8 +10010,8 @@ vectorizable_load (vec_info *vinfo,
>>     else if (loop_lens && memory_access_type != VMAT_INVARIANT)
>>       {
>> tree final_len
>> -   = vect_get_loop_len (loop_vinfo, loop_lens,
>> -        vec_num * ncopies,
>> +   = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
>> +        vec_num * ncopies, vectype,
>>        vec_num * j + i);
>> tree ptr = build_int_cst (ref_type,
>>   align * BITS_PER_UNIT);
>> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
>> index 9cf2fb23fe3..e5cf38caf4b 100644
>> --- a/gcc/tree-vectorizer.h
>> +++ b/gcc/tree-vectorizer.h
>> @@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
>> unsigned int, tree, unsigned int);
>> extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
>>   tree, unsigned int);
>> -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
>> -        unsigned int);
>> +extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info,
>> +        vec_loop_lens *, unsigned int, tree, unsigned int);
>> extern gimple_seq vect_gen_len (tree, tree, tree, tree);
>> extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
>> extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
>  
 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-20  9:19         ` juzhe.zhong
@ 2023-04-20  9:22           ` Richard Sandiford
  2023-04-20  9:50             ` Richard Biener
  0 siblings, 1 reply; 41+ messages in thread
From: Richard Sandiford @ 2023-04-20  9:22 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: gcc-patches, rguenther, jeffreyalaw

"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> OK. Thanks Richard.
> So let me conclude:
> 1. Community agree that I should support variable IV in the middle-end.
> 2. We can keep WHILE_LEN pattern when "not only final iteration is partial".
>     And I should describe it more clearly in the doc.
>
> I should do these 2 things in the later update patch.

Sounds good to me, but Richi is the maintainer.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-20  9:22           ` Richard Sandiford
@ 2023-04-20  9:50             ` Richard Biener
  2023-04-20  9:54               ` Richard Sandiford
  0 siblings, 1 reply; 41+ messages in thread
From: Richard Biener @ 2023-04-20  9:50 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: juzhe.zhong, gcc-patches, jeffreyalaw

On Thu, 20 Apr 2023, Richard Sandiford wrote:

> "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> > OK. Thanks Richard.
> > So let me conclude:
> > 1. Community agree that I should support variable IV in the middle-end.
> > 2. We can keep WHILE_LEN pattern when "not only final iteration is partial".
> >     And I should describe it more clearly in the doc.
> >
> > I should do these 2 things in the later update patch.
> 
> Sounds good to me, but Richi is the maintainer.

But I happily defer to you for designing VL stuff.  I suppose it's
time to make you maintainer as well ... (you are global reviewer).

That said, I will look at an updated patch.

Richard.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-20  9:50             ` Richard Biener
@ 2023-04-20  9:54               ` Richard Sandiford
  2023-04-20 10:38                 ` juzhe.zhong
  0 siblings, 1 reply; 41+ messages in thread
From: Richard Sandiford @ 2023-04-20  9:54 UTC (permalink / raw)
  To: Richard Biener; +Cc: juzhe.zhong, gcc-patches, jeffreyalaw

Richard Biener <rguenther@suse.de> writes:
> On Thu, 20 Apr 2023, Richard Sandiford wrote:
>
>> "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
>> > OK. Thanks Richard.
>> > So let me conclude:
>> > 1. Community agree that I should support variable IV in the middle-end.
>> > 2. We can keep WHILE_LEN pattern when "not only final iteration is partial".
>> >     And I should describe it more clearly in the doc.
>> >
>> > I should do these 2 things in the later update patch.
>> 
>> Sounds good to me, but Richi is the maintainer.
>
> But I happily defer to you for designing VL stuff.  I suppose it's
> time to make you maintainer as well ... (you are global reviewer).

Heh, wasn't trying to bag an extra maintainership :-)  I just got a
bit lost in the thread and wasn't sure whether I was contradicting
something you'd said (in which case I'd defer to that).

Richard



^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-20  9:54               ` Richard Sandiford
@ 2023-04-20 10:38                 ` juzhe.zhong
  2023-04-20 12:05                   ` Richard Biener
  0 siblings, 1 reply; 41+ messages in thread
From: juzhe.zhong @ 2023-04-20 10:38 UTC (permalink / raw)
  To: richard.sandiford, rguenther; +Cc: gcc-patches, jeffreyalaw

[-- Attachment #1: Type: text/plain, Size: 1466 bytes --]

Thanks Richards (Sandiford && Biener).
I have a technical question:
To support variable IVs for memory address calculation, is it right that I should make the output of WHILE_LEN
visible in tree-ssa-loop-ivopts.cc, since the address calculation is not done in the loop control handling function?

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-04-20 17:54
To: Richard Biener
CC: juzhe.zhong\@rivai.ai; gcc-patches; jeffreyalaw
Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
Richard Biener <rguenther@suse.de> writes:
> On Thu, 20 Apr 2023, Richard Sandiford wrote:
>
>> "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
>> > OK. Thanks Richard.
>> > So let me conclude:
>> > 1. Community agree that I should support variable IV in the middle-end.
>> > 2. We can keep WHILE_LEN pattern when "not only final iteration is partial".
>> >     And I should describe it more clearly in the doc.
>> >
>> > I should do these 2 things in the later update patch.
>> 
>> Sounds good to me, but Richi is the maintainer.
>
> But I happily defer to you for designing VL stuff.  I suppose it's
> time to make you maintainer as well ... (you are global reviewer).
 
Heh, wasn't trying to bag an extra maintainership :-)  I just got a
bit lost in the thread and wasn't sure whether I was contradicting
something you'd said (in which case I'd defer to that).
 
Richard
 
 
 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
  2023-04-20 10:38                 ` juzhe.zhong
@ 2023-04-20 12:05                   ` Richard Biener
  0 siblings, 0 replies; 41+ messages in thread
From: Richard Biener @ 2023-04-20 12:05 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: richard.sandiford, gcc-patches, jeffreyalaw

On Thu, 20 Apr 2023, juzhe.zhong@rivai.ai wrote:

> Thanks Richards (Sandiford && Biener).
> I have a technical question:
> To support variable IVs for memory address calculation, is it right that I should make the output of WHILE_LEN
> visible in tree-ssa-loop-ivopts.cc, since the address calculation is not done in the loop control handling function?

Well, it is visible, no?  It's going to be no different from SVE
testing the mask for all zeros.

I'd leave trying to make IVOPTs eliminate other counting IVs with
the WHILE_LEN result as a followup exercise - you will likely have
address IVs that do not depend on the WHILE_LEN result for all of
the memory accesses.

Richard.

> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Sandiford
> Date: 2023-04-20 17:54
> To: Richard Biener
> CC: juzhe.zhong\@rivai.ai; gcc-patches; jeffreyalaw
> Subject: Re: [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization
> Richard Biener <rguenther@suse.de> writes:
> > On Thu, 20 Apr 2023, Richard Sandiford wrote:
> >
> >> "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> >> > OK. Thanks Richard.
> >> > So let me conclude:
> >> > 1. Community agree that I should support variable IV in the middle-end.
> >> > 2. We can keep WHILE_LEN pattern when "not only final iteration is partial".
> >> >     And I should describe it more clearly in the doc.
> >> >
> >> > I should do these 2 things in the later update patch.
> >> 
> >> Sounds good to me, but Richi is the maintainer.
> >
> > But I happily defer to you for designing VL stuff.  I suppose it's
> > time to make you maintainer as well ... (you are global reviewer).
>  
> Heh, wasn't trying to bag an extra maintainership :-)  I just got a
> bit lost in the thread and wasn't sure whether I was contradicting
> something you'd said (in which case I'd defer to that).
>  
> Richard
>  
>  
>  
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2023-04-20 12:05 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-07  1:47 [PATCH] VECT: Add WHILE_LEN pattern for decrement IV support for auto-vectorization juzhe.zhong
2023-04-07  3:23 ` Li, Pan2
2023-04-11 12:12 ` juzhe.zhong
2023-04-11 12:44   ` Richard Sandiford
2023-04-12  7:00     ` Richard Biener
2023-04-12  8:00       ` juzhe.zhong
2023-04-12  8:42         ` Richard Biener
2023-04-12  9:15           ` juzhe.zhong
2023-04-12  9:29             ` Richard Biener
2023-04-12  9:42               ` Robin Dapp
2023-04-12 11:17               ` Richard Sandiford
2023-04-12 11:37                 ` juzhe.zhong
2023-04-12 12:24                   ` Richard Sandiford
2023-04-12 14:18                     ` 钟居哲
2023-04-13  6:47                       ` Richard Biener
2023-04-13  9:54                         ` juzhe.zhong
2023-04-18  9:32                           ` Richard Sandiford
2023-04-12 12:56                   ` Kewen.Lin
2023-04-12 13:22                     ` 钟居哲
2023-04-13  7:29                       ` Kewen.Lin
2023-04-13 13:44                         ` 钟居哲
2023-04-14  2:54                           ` Kewen.Lin
2023-04-14  3:09                             ` juzhe.zhong
2023-04-14  5:40                               ` Kewen.Lin
2023-04-14  3:39                             ` juzhe.zhong
2023-04-14  6:31                               ` Kewen.Lin
2023-04-14  6:39                                 ` juzhe.zhong
2023-04-14  7:41                                   ` Kewen.Lin
2023-04-14  6:52                               ` Richard Biener
2023-04-12 11:42                 ` Richard Biener
     [not found]           ` <2023041217154958074655@rivai.ai>
2023-04-12  9:20             ` juzhe.zhong
2023-04-19 21:53 ` 钟居哲
2023-04-20  8:52   ` Richard Sandiford
2023-04-20  8:57     ` juzhe.zhong
2023-04-20  9:11       ` Richard Sandiford
2023-04-20  9:19         ` juzhe.zhong
2023-04-20  9:22           ` Richard Sandiford
2023-04-20  9:50             ` Richard Biener
2023-04-20  9:54               ` Richard Sandiford
2023-04-20 10:38                 ` juzhe.zhong
2023-04-20 12:05                   ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).