public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Add first-order recurrence autovectorization
@ 2022-09-30  8:00 juzhe.zhong
  2022-10-06 12:13 ` Richard Biener
  0 siblings, 1 reply; 5+ messages in thread
From: juzhe.zhong @ 2022-09-30  8:00 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.sandiford, richard.guenther, Ju-Zhe Zhong

From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, After fixing previous ICE. 
I add full implementation (insert permutation to get correct result.)

The gimple IR is correct now I think:
  # t_21 = PHI <_4(6), t_12(9)>
  # i_22 = PHI <i_17(6), 0(9)>
  # vectp_a.6_26 = PHI <vectp_a.6_25(6), a_14(D)(9)>
  # vect_vec_recur_.9_9 = PHI <vect__4.8_19(6), vect_cst__18(9)>
  # vectp_b.11_7 = PHI <vectp_b.11_30(6), b_15(D)(9)>
  # curr_cnt_36 = PHI <next_cnt_35(6), _32(9)>
  # loop_len_20 = PHI <next_len_34(6), _32(9)>
  _38 = .WHILE_LEN (loop_len_20, 32, POLY_INT_CST [4, 4]);
  while_len_37 = _38;
  _1 = (long unsigned int) i_22;
  _2 = _1 * 4;
  _3 = a_14(D) + _2;
  vect__4.8_19 = .LEN_LOAD (vectp_a.6_26, 32B, loop_len_20, 0);
  _4 = *_3;
  _5 = b_15(D) + _2;
  vect_vec_recur_.9_9 = VEC_PERM_EXPR <vect_vec_recur_.9_9, vect__4.8_19, { POLY_INT_CST [3, 4], POLY_INT_CST [4, 4], POLY_INT_CST [5, 4], ... }>;

But I encounter another ICE:
0x169e0e7 process_bb
        ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:7498
0x16a09af do_rpo_vn(function*, edge_def*, bitmap_head*, bool, bool, vn_lookup_kind)
        ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:8109
0x16a0fe7 do_rpo_vn(function*, edge_def*, bitmap_head*)
        ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:8205
0x179b7db execute
        ../../../riscv-gcc/gcc/tree-vectorizer.cc:1365

Could you help me with this? After fixing this ICE, I think the loop vectorizer
can run correctly. Maybe you can test is in X86 or ARM after fixing this ICE.

Thanks.

---
 gcc/tree-vect-loop.cc  | 248 ++++++++++++++++++++++++++++++++++++++++-
 gcc/tree-vect-stmts.cc |  47 +++++++-
 gcc/tree-vectorizer.h  |   4 +
 3 files changed, 296 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 2536cc3cf49..1f0279d0d28 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -529,6 +529,65 @@ vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
   return false;
 }
 
+/* Returns true if Phi is a first-order recurrence. A first-order
+   recurrence is a non-reduction recurrence relation in which the value of
+   the recurrence in the current loop iteration equals a value defined in
+   the previous iteration.  */
+
+static bool
+vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
+				   gphi *phi)
+{
+  /* Ensure the phi node is in the loop header and has two incoming values.  */
+  if (gimple_bb (phi) != loop->header || gimple_phi_num_args (phi) != 2)
+    return false;
+
+  /* Ensure the loop has a preheader and a single latch block. The loop
+     vectorizer will need the latch to set up the next iteration of the loop. */
+  edge preheader = loop_preheader_edge (loop);
+  edge latch = loop_latch_edge (loop);
+  if (!preheader || !latch)
+    return false;
+
+  /* Ensure the phi node's incoming blocks are the loop preheader and latch.  */
+  if (!PHI_ARG_DEF_FROM_EDGE (phi, preheader)
+      || !PHI_ARG_DEF_FROM_EDGE (phi, latch))
+    return false;
+
+  /* Get the previous value. The previous value comes from the latch edge while
+     the initial value comes form the preheader edge.  */
+  gimple *previous = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, latch));
+  if (!previous)
+    return false;
+
+  /* Ensure every use_stmt of the phi node is dominated by the previous value.
+     The dominance requirement ensures the loop vectorizer will not need to
+     vectorize the initial value prior to the first iteration of the loop.  */
+  gimple *use_stmt;
+  imm_use_iterator imm_iter;
+  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_phi_result (phi))
+    if (use_stmt)
+      if (!dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt),
+			   gimple_bb (previous)))
+	return false;
+
+  /* First-order recurrence autovectorization needs shuffle vector.  */
+  tree scalar_type = TREE_TYPE (PHI_RESULT (phi));
+  tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
+  /* First-order recurrence autovectorization needs to handle permutation
+     with indices = [nunits-1, nunits, nunits+1, ...].  */
+  machine_mode mode = TYPE_MODE (vectype);
+  poly_int64 nunits = GET_MODE_NUNITS (mode);
+  vec_perm_builder sel (nunits, 1, 3);
+  for (int i = 0; i < 3; ++i)
+    sel.quick_push (nunits - 1 + i);
+  vec_perm_indices indices (sel, 1, nunits * 2);
+  if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
+    return false;
+
+  return true;
+}
+
 /* Function vect_analyze_scalar_cycles_1.
 
    Examine the cross iteration def-use cycles of scalar variables
@@ -666,6 +725,8 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
                 }
             }
         }
+      else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
+	STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
       else
         if (dump_enabled_p ())
           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -1808,9 +1869,13 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
 
           gcc_assert (stmt_info);
 
+          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
+	    ok = vectorizable_dep_phi (loop_vinfo, stmt_info, NULL, NULL);
+
           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
                || STMT_VINFO_LIVE_P (stmt_info))
-              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
+              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
+              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
 	    /* A scalar-dependence cycle that we don't support.  */
 	    return opt_result::failure_at (phi,
 					   "not vectorized:"
@@ -8267,6 +8332,118 @@ vectorizable_phi (vec_info *,
   return true;
 }
 
+/* Vectorizes Dep PHIs.  */
+
+bool
+vectorizable_dep_phi (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+		      gimple **vec_stmt, slp_tree slp_node)
+{
+  if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
+    return false;
+
+  gphi *phi = as_a<gphi *> (stmt_info->stmt);
+
+  /* So far we only support first-order recurrence auto-vectorization.  */
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
+    return false;
+
+  if (!vec_stmt) /* transformation not required.  */
+    {
+      /* So far we don't support first-order recurrence for SLP
+       * auto-vectorization.   */
+      if (slp_node)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "do not support first-order recurrence"
+			     "autovectorization for SLP node\n");
+	  return false;
+	}
+      STMT_VINFO_TYPE (stmt_info) = dep_phi_info_type;
+      return true;
+    }
+
+  /* This is the second phase of vectorizing first-order rececurrences. An
+  overview of the transformation is described below. Suppose we have the
+  following loop.
+
+    int32_t t = 0;
+    for (int i = 0; i < n; ++i)
+      {
+	b[i] = a[i] - t;
+	t = a[i];
+      }
+
+  There is a first-order recurrence on "a". For this loop, the shorthand
+  scalar IR looks like:
+
+    scalar.preheader:
+      init = a[-1]
+      br loop.body
+
+    scalar.body:
+      i = PHI <0(scalar.preheader), i+1(scalar.body)>
+      _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
+      _1 = a[i]
+      b[i] = _1 - _2
+      br cond, scalar.body, ...
+
+  In this example, _2 is a recurrence because it's value depends on the
+  previous iteration. In the first phase of vectorization, we created a
+  temporary value for _2. We now complete the vectorization and produce the
+  shorthand vector IR shown below (VF = 4).
+
+    vector.preheader:
+      vect_init = vect_cst(..., ..., ..., a[-1])
+      br vector.body
+
+    vector.body
+      i = PHI <0(vector.preheader), i+4(vector.body)>
+      vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
+      vect_2 = a[i, i+1, i+2, i+3];
+      vect_3 = vector(vect_1(3), vect_2(0, 1, 2))
+      b[i, i+1, i+2, i+3] = vect_2 - vect_3
+      br cond, vector.body, middle.block
+
+    middle.block:
+      x = vect_2(3)
+      br scalar.preheader
+
+    scalar.ph:
+      s_init = PHI <x(middle.block), a[-1], otherwise>
+      br scalar.body
+
+  After execution completes the vector loop, we extract the next value of
+  the recurrence (x) to use as the initial value in the scalar loop.  */
+
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
+  basic_block bb = gimple_bb (stmt_info->stmt);
+  tree vec_dest
+    = vect_get_new_vect_var (vectype, vect_simple_var, "vec_recur_");
+  auto_vec<tree> vec_oprnds;
+  tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
+
+  tree vec_init = build_vector_from_val (vectype, preheader);
+  vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
+  vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
+		     !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
+		     gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
+  /* Create the vectorized first-order PHI node.  */
+  gphi *new_phi = create_phi_node (vec_dest, bb);
+  add_phi_arg (new_phi, vec_oprnds[0], loop_latch_edge (loop),
+	       UNKNOWN_LOCATION);
+  add_phi_arg (new_phi, vec_init, loop_preheader_edge (loop), UNKNOWN_LOCATION);
+  if (slp_node)
+    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
+  else
+    STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
+  if (!slp_node)
+    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+  return true;
+}
+
 /* Return true if VECTYPE represents a vector that requires lowering
    by the vector lowering pass.  */
 
@@ -10476,6 +10653,29 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
   epilogue_vinfo->shared->save_datarefs ();
 }
 
+/* Return true if the stmt is the first statement that uses the result
+   of a first-order recurrence phi node.  */
+
+static gphi *
+vect_use_first_order_phi_result_p (auto_vec<gphi *> &first_order_phi_vec,
+				   gimple *stmt)
+{
+  for (unsigned int i = 0; i < first_order_phi_vec.length (); ++i)
+    {
+      gphi *phi = first_order_phi_vec[i];
+      tree phi_result = PHI_RESULT (phi);
+      gimple *first_use;
+      use_operand_p use_p;
+      if (!single_imm_use (phi_result, &use_p, &first_use))
+	continue;
+
+      if (first_use == stmt)
+	return phi;
+    }
+
+  return NULL;
+}
+
 /* Function vect_transform_loop.
 
    The analysis phase has determined that the loop is vectorizable.
@@ -10624,6 +10824,31 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
       basic_block bb = bbs[i];
       stmt_vec_info stmt_info;
 
+      /* It's used to save the first-order recurrence phi node.
+         Consider the following case:
+         # t_19 = PHI <_4(6), 0(5)>
+         ...
+         _4 = *_3;
+         ...
+         _6 = _4 - t_19;
+         
+         The vectorization sequence should be:
+         1. Vectorize _4 = *_3;
+         2. Vectorize # t_19 = PHI <_4(6), 0(5)>
+         3, Vectorize _6 = _4 - t_19;
+         
+         Flow:
+         1. So we save the first-order recurrence PHI in first_order_phi_vec
+	    and skip vectorizing it tentatively when iterating phi statement 
+	    (save t_19 = PHI <_4(6), 0(5)>).
+         2. Vectorize all statements when iterating all statements
+            until we reach the statement who is using the result of
+            first-order recurrence PHI (vectorize _4 = *_3;).
+         3. Stop iterating and return to vectorize the PHI saved
+	    in first_order_phi_vec (# t_19 = PHI <_4(6), 0(5)>).
+         4. Conintue iterating and vectorize the residual statements.  */
+      auto_vec<gphi *> first_order_phi_vec;
+
       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 	   gsi_next (&si))
 	{
@@ -10677,9 +10902,16 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
-	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
+	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
+	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
 	      && ! PURE_SLP_STMT (stmt_info))
 	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
+
+	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
+	    {
+	      first_order_phi_vec.safe_push (phi);
+	      continue;
+	    }
 	}
 
       for (gimple_stmt_iterator si = gsi_start_bb (bb);
@@ -10698,6 +10930,18 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 	      /* Ignore vector stmts created in the outer loop.  */
 	      stmt_info = loop_vinfo->lookup_stmt (stmt);
 
+	      gphi *first_order_phi
+		= vect_use_first_order_phi_result_p (first_order_phi_vec, stmt);
+	      if (first_order_phi)
+		{
+		  stmt_vec_info first_order_stmt_info
+		    = loop_vinfo->lookup_stmt (first_order_phi);
+		  gimple_stmt_iterator first_order_si
+		    = gsi_for_stmt (first_order_phi);
+		  vect_transform_loop_stmt (loop_vinfo, first_order_stmt_info,
+					    &first_order_si, NULL);
+		}
+
 	      /* vector stmts created in the outer-loop during vectorization of
 		 stmts in an inner-loop may not have a stmt_info, and do not
 		 need to be vectorized.  */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c8d1efc45e5..ad37bbea30c 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1503,6 +1503,41 @@ vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo,
       while (ncopies--)
 	vec_oprnds->quick_push (vop);
     }
+  else if (dt == vect_first_order_recurrence)
+    {
+      def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
+      gcc_assert (STMT_VINFO_VEC_STMTS (def_stmt_info).length () == ncopies);
+      tree vector_type;
+
+      if (vectype)
+	vector_type = vectype;
+      else
+	vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op));
+
+      /* Insert shufflevector to for first-order recurrence autovectorization.
+	 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...].  */
+      machine_mode mode = TYPE_MODE (vector_type);
+      poly_int64 nunits = GET_MODE_NUNITS (mode);
+      vec_perm_builder sel (nunits, 1, 3);
+      for (int i = 0; i < 3; ++i)
+	sel.quick_push (nunits - 1 + i);
+      vec_perm_indices indices (sel, 1, nunits * 2);
+      tree perm = vec_perm_indices_to_tree (vector_type, indices);
+      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+
+      for (unsigned i = 0; i < ncopies; ++i)
+	{
+	  gphi *phi = as_a<gphi *> (STMT_VINFO_VEC_STMTS (def_stmt_info)[i]);
+	  tree latch = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
+	  tree recur = gimple_phi_result (phi);
+	  gassign *assign
+	    = gimple_build_assign (recur, VEC_PERM_EXPR, recur, latch, perm);
+	  gimple_assign_set_lhs (assign, recur);
+	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt_vinfo->stmt);
+	  vect_finish_stmt_generation (vinfo, stmt_vinfo, assign, &gsi);
+	  vec_oprnds->quick_push (recur);
+	}
+    }
   else
     {
       def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
@@ -11404,6 +11439,12 @@ vect_transform_stmt (vec_info *vinfo,
       gcc_assert (done);
       break;
 
+    case dep_phi_info_type:
+      done = vectorizable_dep_phi (as_a <loop_vec_info> (vinfo),
+				  stmt_info, &vec_stmt, slp_node);
+      gcc_assert (done);
+      break;
+
     case phi_info_type:
       done = vectorizable_phi (vinfo, stmt_info, &vec_stmt, slp_node, NULL);
       gcc_assert (done);
@@ -11804,6 +11845,9 @@ vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt,
 	case vect_nested_cycle:
 	  dump_printf (MSG_NOTE, "nested cycle\n");
 	  break;
+	case vect_first_order_recurrence:
+	  dump_printf (MSG_NOTE, "first order recurrence\n");
+	  break;
 	case vect_unknown_def_type:
 	  dump_printf (MSG_NOTE, "unknown\n");
 	  break;
@@ -11852,7 +11896,8 @@ vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt,
       || *dt == vect_induction_def
       || *dt == vect_reduction_def
       || *dt == vect_double_reduction_def
-      || *dt == vect_nested_cycle)
+      || *dt == vect_nested_cycle
+			|| *dt == vect_first_order_recurrence)
     {
       *vectype = STMT_VINFO_VECTYPE (def_stmt_info);
       gcc_assert (*vectype != NULL_TREE);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 4870c754499..2c8e7660b4e 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -65,6 +65,7 @@ enum vect_def_type {
   vect_reduction_def,
   vect_double_reduction_def,
   vect_nested_cycle,
+  vect_first_order_recurrence,
   vect_unknown_def_type
 };
 
@@ -1027,6 +1028,7 @@ enum stmt_vec_info_type {
   cycle_phi_info_type,
   lc_phi_info_type,
   phi_info_type,
+  dep_phi_info_type,
   loop_exit_ctrl_vec_info_type
 };
 
@@ -2331,6 +2333,8 @@ extern bool vectorizable_lc_phi (loop_vec_info, stmt_vec_info,
 				 gimple **, slp_tree);
 extern bool vectorizable_phi (vec_info *, stmt_vec_info, gimple **, slp_tree,
 			      stmt_vector_for_cost *);
+extern bool vectorizable_dep_phi (loop_vec_info, stmt_vec_info,
+				 gimple **, slp_tree);
 extern bool vect_emulated_vector_p (tree);
 extern bool vect_can_vectorize_without_simd_p (tree_code);
 extern bool vect_can_vectorize_without_simd_p (code_helper);
-- 
2.36.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Add first-order recurrence autovectorization
  2022-09-30  8:00 [PATCH] Add first-order recurrence autovectorization juzhe.zhong
@ 2022-10-06 12:13 ` Richard Biener
  2022-10-06 13:07   ` Richard Biener
  0 siblings, 1 reply; 5+ messages in thread
From: Richard Biener @ 2022-10-06 12:13 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: gcc-patches, richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 2741 bytes --]

On Fri, Sep 30, 2022 at 10:00 AM <juzhe.zhong@rivai.ai> wrote:
>
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Hi, After fixing previous ICE.
> I add full implementation (insert permutation to get correct result.)
>
> The gimple IR is correct now I think:
>   # t_21 = PHI <_4(6), t_12(9)>
>   # i_22 = PHI <i_17(6), 0(9)>
>   # vectp_a.6_26 = PHI <vectp_a.6_25(6), a_14(D)(9)>
>   # vect_vec_recur_.9_9 = PHI <vect__4.8_19(6), vect_cst__18(9)>
>   # vectp_b.11_7 = PHI <vectp_b.11_30(6), b_15(D)(9)>
>   # curr_cnt_36 = PHI <next_cnt_35(6), _32(9)>
>   # loop_len_20 = PHI <next_len_34(6), _32(9)>
>   _38 = .WHILE_LEN (loop_len_20, 32, POLY_INT_CST [4, 4]);
>   while_len_37 = _38;
>   _1 = (long unsigned int) i_22;
>   _2 = _1 * 4;
>   _3 = a_14(D) + _2;
>   vect__4.8_19 = .LEN_LOAD (vectp_a.6_26, 32B, loop_len_20, 0);
>   _4 = *_3;
>   _5 = b_15(D) + _2;
>   vect_vec_recur_.9_9 = VEC_PERM_EXPR <vect_vec_recur_.9_9, vect__4.8_19, { POLY_INT_CST [3, 4], POLY_INT_CST [4, 4], POLY_INT_CST [5, 4], ... }>;
>
> But I encounter another ICE:
> 0x169e0e7 process_bb
>         ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:7498
> 0x16a09af do_rpo_vn(function*, edge_def*, bitmap_head*, bool, bool, vn_lookup_kind)
>         ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:8109
> 0x16a0fe7 do_rpo_vn(function*, edge_def*, bitmap_head*)
>         ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:8205
> 0x179b7db execute
>         ../../../riscv-gcc/gcc/tree-vectorizer.cc:1365
>
> Could you help me with this? After fixing this ICE, I think the loop vectorizer
> can run correctly. Maybe you can test is in X86 or ARM after fixing this ICE.

Sorry for the late reply, the issue is that we have

vect_vec_recur_.7_7 = VEC_PERM_EXPR <vect_vec_recur_.7_7, vect__4.6_9,
{ 7, 8, 9, 10, 11, 12, 13, 14 }>;

thus

+      for (unsigned i = 0; i < ncopies; ++i)
+       {
+         gphi *phi = as_a<gphi *> (STMT_VINFO_VEC_STMTS (def_stmt_info)[i]);
+         tree latch = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
+         tree recur = gimple_phi_result (phi);
+         gassign *assign
+           = gimple_build_assign (recur, VEC_PERM_EXPR, recur, latch, perm);
+         gimple_assign_set_lhs (assign, recur);

needs to create a new SSA name for each LHS.  You shouldn't create code in
vect_get_vec_defs_for_operand either.

Let me mangle the patch a bit.

The attached is what I came up with, the permutes need to be generated when
the backedge PHI values are filled in.  Missing are ncopies > 1 handling, we'd
need to think of how the initial value and the permutes would work here, missing
is SLP support but more importantly handling in the epilogue (so on x86 requires
constant loop bound)
I've added a testcase that triggers on x86_64.

Richard.

[-- Attachment #2: p --]
[-- Type: application/octet-stream, Size: 15523 bytes --]

From 2c6e3265c1d6675b3427766184708b46558b6f11 Mon Sep 17 00:00:00 2001
From: Richard Biener <rguenther@suse.de>
Date: Thu, 6 Oct 2022 13:56:09 +0200
Subject: [PATCH] Vectorization of first-order recurrences
To: gcc-patches@gcc.gnu.org

Missing:
 * ncopies > 1
 * SLP
 * epilogue handling
 * costing
 * more testcases (with runtime verification)

	* tree-vectorizer.h:
	* tree-vect-loop.cc
	* tree-vect-stmts.cc

	* gcc.dg/vect/vect-recurr-1.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/vect-recurr-1.c |  14 ++
 gcc/tree-vect-loop.cc                     | 228 ++++++++++++++++++++--
 gcc/tree-vect-stmts.cc                    |  12 +-
 gcc/tree-vectorizer.h                     |   4 +
 4 files changed, 240 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-recurr-1.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-recurr-1.c b/gcc/testsuite/gcc.dg/vect/vect-recurr-1.c
new file mode 100644
index 00000000000..60c7ae117c1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-recurr-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+
+void foo (int * __restrict__ a, int * __restrict__ b, int * __restrict__ c)
+{
+  int t = *c;
+  for (int i = 0; i < 64; ++i)
+    {
+      b[i] = a[i] - t;
+      t = a[i];
+    }
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 2536cc3cf49..7b6ba1e04c9 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -529,6 +529,39 @@ vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
   return false;
 }
 
+/* Returns true if Phi is a first-order recurrence. A first-order
+   recurrence is a non-reduction recurrence relation in which the value of
+   the recurrence in the current loop iteration equals a value defined in
+   the previous iteration.  */
+
+static bool
+vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
+				   gphi *phi)
+{
+  /* Ensure the loop latch definition is from within the loop.  */
+  edge latch = loop_latch_edge (loop);
+  tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
+  if (TREE_CODE (ldef) != SSA_NAME
+      || SSA_NAME_IS_DEFAULT_DEF (ldef)
+      || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
+    return false;
+
+  /* First-order recurrence autovectorization needs shuffle vector.  */
+  tree scalar_type = TREE_TYPE (PHI_RESULT (phi));
+  tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
+  /* First-order recurrence autovectorization needs to handle permutation
+     with indices = [nunits-1, nunits, nunits+1, ...].  */
+  poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  vec_perm_builder sel (nunits, 1, 3);
+  for (int i = 0; i < 3; ++i)
+    sel.quick_push (nunits - 1 + i);
+  vec_perm_indices indices (sel, 1, nunits * 2);
+  if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype), indices))
+    return false;
+
+  return true;
+}
+
 /* Function vect_analyze_scalar_cycles_1.
 
    Examine the cross iteration def-use cycles of scalar variables
@@ -666,6 +699,8 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
                 }
             }
         }
+      else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
+	STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
       else
         if (dump_enabled_p ())
           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -1810,7 +1845,8 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
 
           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
                || STMT_VINFO_LIVE_P (stmt_info))
-              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
+	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
+	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
 	    /* A scalar-dependence cycle that we don't support.  */
 	    return opt_result::failure_at (phi,
 					   "not vectorized:"
@@ -1831,6 +1867,10 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
 		       && ! PURE_SLP_STMT (stmt_info))
 		ok = vectorizable_reduction (loop_vinfo,
 					     stmt_info, NULL, NULL, &cost_vec);
+	      else if (STMT_VINFO_DEF_TYPE (stmt_info)
+			 == vect_first_order_recurrence)
+		ok = vectorizable_dep_phi (loop_vinfo, stmt_info, NULL, NULL,
+					   &cost_vec);
             }
 
 	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
@@ -8267,6 +8307,119 @@ vectorizable_phi (vec_info *,
   return true;
 }
 
+/* Vectorizes Dep PHIs.  */
+
+bool
+vectorizable_dep_phi (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+		      gimple **vec_stmt, slp_tree slp_node,
+		      stmt_vector_for_cost *)
+{
+  if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
+    return false;
+
+  gphi *phi = as_a<gphi *> (stmt_info->stmt);
+
+  /* So far we only support first-order recurrence auto-vectorization.  */
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
+    return false;
+
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  if (!vec_stmt) /* transformation not required.  */
+    {
+      /* So far we don't support first-order recurrence for SLP
+	 auto-vectorization.   */
+      if (slp_node)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "do not support first-order recurrence"
+			     "autovectorization for SLP node\n");
+	  return false;
+	}
+
+      /* ???  We probably can shift in zeros for the initial values?  */
+      unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
+      if (ncopies > 1)
+	return false;
+
+      /* ???  Account for the necessary permutes.  */
+      STMT_VINFO_TYPE (stmt_info) = dep_phi_info_type;
+      return true;
+    }
+
+  /* This is the second phase of vectorizing first-order rececurrences. An
+     overview of the transformation is described below. Suppose we have the
+     following loop.
+
+     int32_t t = 0;
+     for (int i = 0; i < n; ++i)
+       {
+	b[i] = a[i] - t;
+	t = a[i];
+      }
+
+    There is a first-order recurrence on "a". For this loop, the shorthand
+    scalar IR looks like:
+
+    scalar.preheader:
+      init = a[-1]
+      br loop.body
+
+    scalar.body:
+      i = PHI <0(scalar.preheader), i+1(scalar.body)>
+      _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
+      _1 = a[i]
+      b[i] = _1 - _2
+      br cond, scalar.body, ...
+
+    In this example, _2 is a recurrence because it's value depends on the
+    previous iteration. In the first phase of vectorization, we created a
+    temporary value for _2. We now complete the vectorization and produce the
+    shorthand vector IR shown below (VF = 4).
+
+    vector.preheader:
+      vect_init = vect_cst(..., ..., ..., a[-1])
+      br vector.body
+
+    vector.body
+      i = PHI <0(vector.preheader), i+4(vector.body)>
+      vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
+      vect_2 = a[i, i+1, i+2, i+3];
+      vect_3 = vector(vect_1(3), vect_2(0, 1, 2))
+      b[i, i+1, i+2, i+3] = vect_2 - vect_3
+      br cond, vector.body, middle.block
+
+    middle.block:
+      x = vect_2(3)
+      br scalar.preheader
+
+    scalar.ph:
+      s_init = PHI <x(middle.block), a[-1], otherwise>
+      br scalar.body
+
+    After execution completes the vector loop, we extract the next value of
+    the recurrence (x) to use as the initial value in the scalar loop.  */
+
+  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+  basic_block bb = gimple_bb (phi);
+  tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
+  tree vec_init = build_vector_from_val (vectype, preheader);
+  vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
+  /* Create the vectorized first-order PHI node.  */
+  tree vec_dest = vect_get_new_vect_var (vectype,
+					 vect_simple_var, "vec_recur_");
+  gphi *new_phi = create_phi_node (vec_dest, bb);
+  add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
+  if (slp_node)
+    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
+  else
+    STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
+
+  if (!slp_node)
+    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+  return true;
+}
+
 /* Return true if VECTYPE represents a vector that requires lowering
    by the vector lowering pass.  */
 
@@ -10223,27 +10376,66 @@ maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
   imm_use_iterator iter;
   use_operand_p use_p;
   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
-    if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
-      if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
-	  && (phi_info = loop_vinfo->lookup_stmt (phi))
-	  && STMT_VINFO_RELEVANT_P (phi_info)
-	  && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
+    {
+      gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
+      if (!phi)
+	continue;
+      if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
+	    && (phi_info = loop_vinfo->lookup_stmt (phi))
+	    && STMT_VINFO_RELEVANT_P (phi_info)))
+	continue;
+      loop_p loop = gimple_bb (phi)->loop_father;
+      edge e = loop_latch_edge (loop);
+      if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
+	continue;
+
+      if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
 	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
 	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
 	{
-	  loop_p loop = gimple_bb (phi)->loop_father;
-	  edge e = loop_latch_edge (loop);
-	  if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
+	  vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
+	  vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
+	  gcc_assert (phi_defs.length () == latch_defs.length ());
+	  for (unsigned i = 0; i < phi_defs.length (); ++i)
+	    add_phi_arg (as_a <gphi *> (phi_defs[i]),
+			 gimple_get_lhs (latch_defs[i]), e,
+			 gimple_phi_arg_location (phi, e->dest_idx));
+	}
+      else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
+	{
+	  /* Insert shufflevector to for first-order recurrence
+	     autovectorization.
+	       result = VEC_PERM <vec_recur, vect_1,
+				  index[nunits-1, nunits, ...]>.  */
+	  tree vectype = STMT_VINFO_VECTYPE (phi_info);
+	  poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+	  vec_perm_builder sel (nunits, 1, 3);
+	  for (int i = 0; i < 3; ++i)
+	    sel.quick_push (nunits - 1 + i);
+	  vec_perm_indices indices (sel, 1, nunits * 2);
+	  tree perm = vect_gen_perm_mask_checked (vectype, indices);
+
+	  vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
+	  vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
+	  gcc_assert (phi_defs.length () == latch_defs.length ());
+	  /* ???  For ncopies > 1 this scheme might not be correct.  */
+	  gcc_assert (phi_defs.length () == 1);
+	  for (unsigned i = 0; i < phi_defs.length (); ++i)
 	    {
-	      vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
-	      vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
-	      gcc_assert (phi_defs.length () == latch_defs.length ());
-	      for (unsigned i = 0; i < phi_defs.length (); ++i)
-		add_phi_arg (as_a <gphi *> (phi_defs[i]),
-			     gimple_get_lhs (latch_defs[i]), e,
-			     gimple_phi_arg_location (phi, e->dest_idx));
+	      tree recur = make_ssa_name (vectype);
+	      gassign *assign
+		= gimple_build_assign (recur, VEC_PERM_EXPR,
+				       gimple_phi_result (phi_defs[i]),
+				       gimple_get_lhs (latch_defs[i]), perm);
+	      gimple_stmt_iterator gsi = gsi_for_stmt (latch_defs[i]);
+	      gsi_next (&gsi);
+	      vect_finish_stmt_generation (loop_vinfo, def_stmt_info,
+					   assign, &gsi);
+	      add_phi_arg (as_a <gphi *> (phi_defs[i]), recur, e,
+			   gimple_phi_arg_location (phi, e->dest_idx));
 	    }
 	}
+    }
 }
 
 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
@@ -10652,6 +10844,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
+	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
 	      && ! PURE_SLP_STMT (stmt_info))
 	    {
@@ -10677,7 +10870,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
-	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
+	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
+	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
 	      && ! PURE_SLP_STMT (stmt_info))
 	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
 	}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c8d1efc45e5..68b79a8f988 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -11404,6 +11404,12 @@ vect_transform_stmt (vec_info *vinfo,
       gcc_assert (done);
       break;
 
+    case dep_phi_info_type:
+      done = vectorizable_dep_phi (as_a <loop_vec_info> (vinfo),
+				  stmt_info, &vec_stmt, slp_node, NULL);
+      gcc_assert (done);
+      break;
+
     case phi_info_type:
       done = vectorizable_phi (vinfo, stmt_info, &vec_stmt, slp_node, NULL);
       gcc_assert (done);
@@ -11804,6 +11810,9 @@ vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt,
 	case vect_nested_cycle:
 	  dump_printf (MSG_NOTE, "nested cycle\n");
 	  break;
+	case vect_first_order_recurrence:
+	  dump_printf (MSG_NOTE, "first order recurrence\n");
+	  break;
 	case vect_unknown_def_type:
 	  dump_printf (MSG_NOTE, "unknown\n");
 	  break;
@@ -11852,7 +11861,8 @@ vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt,
       || *dt == vect_induction_def
       || *dt == vect_reduction_def
       || *dt == vect_double_reduction_def
-      || *dt == vect_nested_cycle)
+      || *dt == vect_nested_cycle
+			|| *dt == vect_first_order_recurrence)
     {
       *vectype = STMT_VINFO_VECTYPE (def_stmt_info);
       gcc_assert (*vectype != NULL_TREE);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 4870c754499..a8a83ced252 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -65,6 +65,7 @@ enum vect_def_type {
   vect_reduction_def,
   vect_double_reduction_def,
   vect_nested_cycle,
+  vect_first_order_recurrence,
   vect_unknown_def_type
 };
 
@@ -1027,6 +1028,7 @@ enum stmt_vec_info_type {
   cycle_phi_info_type,
   lc_phi_info_type,
   phi_info_type,
+  dep_phi_info_type,
   loop_exit_ctrl_vec_info_type
 };
 
@@ -2331,6 +2333,8 @@ extern bool vectorizable_lc_phi (loop_vec_info, stmt_vec_info,
 				 gimple **, slp_tree);
 extern bool vectorizable_phi (vec_info *, stmt_vec_info, gimple **, slp_tree,
 			      stmt_vector_for_cost *);
+extern bool vectorizable_dep_phi (loop_vec_info, stmt_vec_info,
+				 gimple **, slp_tree, stmt_vector_for_cost *);
 extern bool vect_emulated_vector_p (tree);
 extern bool vect_can_vectorize_without_simd_p (tree_code);
 extern bool vect_can_vectorize_without_simd_p (code_helper);
-- 
2.35.3


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Add first-order recurrence autovectorization
  2022-10-06 12:13 ` Richard Biener
@ 2022-10-06 13:07   ` Richard Biener
  2022-10-07 12:24     ` Richard Biener
  0 siblings, 1 reply; 5+ messages in thread
From: Richard Biener @ 2022-10-06 13:07 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: gcc-patches, richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 3013 bytes --]

On Thu, Oct 6, 2022 at 2:13 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Fri, Sep 30, 2022 at 10:00 AM <juzhe.zhong@rivai.ai> wrote:
> >
> > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> >
> > Hi, After fixing previous ICE.
> > I add full implementation (insert permutation to get correct result.)
> >
> > The gimple IR is correct now I think:
> >   # t_21 = PHI <_4(6), t_12(9)>
> >   # i_22 = PHI <i_17(6), 0(9)>
> >   # vectp_a.6_26 = PHI <vectp_a.6_25(6), a_14(D)(9)>
> >   # vect_vec_recur_.9_9 = PHI <vect__4.8_19(6), vect_cst__18(9)>
> >   # vectp_b.11_7 = PHI <vectp_b.11_30(6), b_15(D)(9)>
> >   # curr_cnt_36 = PHI <next_cnt_35(6), _32(9)>
> >   # loop_len_20 = PHI <next_len_34(6), _32(9)>
> >   _38 = .WHILE_LEN (loop_len_20, 32, POLY_INT_CST [4, 4]);
> >   while_len_37 = _38;
> >   _1 = (long unsigned int) i_22;
> >   _2 = _1 * 4;
> >   _3 = a_14(D) + _2;
> >   vect__4.8_19 = .LEN_LOAD (vectp_a.6_26, 32B, loop_len_20, 0);
> >   _4 = *_3;
> >   _5 = b_15(D) + _2;
> >   vect_vec_recur_.9_9 = VEC_PERM_EXPR <vect_vec_recur_.9_9, vect__4.8_19, { POLY_INT_CST [3, 4], POLY_INT_CST [4, 4], POLY_INT_CST [5, 4], ... }>;
> >
> > But I encounter another ICE:
> > 0x169e0e7 process_bb
> >         ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:7498
> > 0x16a09af do_rpo_vn(function*, edge_def*, bitmap_head*, bool, bool, vn_lookup_kind)
> >         ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:8109
> > 0x16a0fe7 do_rpo_vn(function*, edge_def*, bitmap_head*)
> >         ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:8205
> > 0x179b7db execute
> >         ../../../riscv-gcc/gcc/tree-vectorizer.cc:1365
> >
> > Could you help me with this? After fixing this ICE, I think the loop vectorizer
> > can run correctly. Maybe you can test is in X86 or ARM after fixing this ICE.
>
> Sorry for the late reply, the issue is that we have
>
> vect_vec_recur_.7_7 = VEC_PERM_EXPR <vect_vec_recur_.7_7, vect__4.6_9,
> { 7, 8, 9, 10, 11, 12, 13, 14 }>;
>
> thus
>
> +      for (unsigned i = 0; i < ncopies; ++i)
> +       {
> +         gphi *phi = as_a<gphi *> (STMT_VINFO_VEC_STMTS (def_stmt_info)[i]);
> +         tree latch = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> +         tree recur = gimple_phi_result (phi);
> +         gassign *assign
> +           = gimple_build_assign (recur, VEC_PERM_EXPR, recur, latch, perm);
> +         gimple_assign_set_lhs (assign, recur);
>
> needs to create a new SSA name for each LHS.  You shouldn't create code in
> vect_get_vec_defs_for_operand either.
>
> Let me mangle the patch a bit.
>
> The attached is what I came up with, the permutes need to be generated when
> the backedge PHI values are filled in.  Missing are ncopies > 1 handling, we'd
> need to think of how the initial value and the permutes would work here, missing
> is SLP support but more importantly handling in the epilogue (so on x86 requires
> constant loop bound)
> I've added a testcase that triggers on x86_64.

Actually I broke it, the following is more correct.

Richard.

> Richard.

[-- Attachment #2: p --]
[-- Type: application/octet-stream, Size: 16865 bytes --]

From c14e445f79188d0634e7c92bd5264107820c1035 Mon Sep 17 00:00:00 2001
From: Richard Biener <rguenther@suse.de>
Date: Thu, 6 Oct 2022 13:56:09 +0200
Subject: [PATCH] Vectorization of first-order recurrences
To: gcc-patches@gcc.gnu.org

Missing:
 * ncopies > 1
 * SLP
 * epilogue handling
 * costing
 * more testcases (with runtime verification)

	* tree-vectorizer.h:
	* tree-vect-loop.cc
	* tree-vect-stmts.cc

	* gcc.dg/vect/vect-recurr-1.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/vect-recurr-1.c |  38 ++++
 gcc/tree-vect-loop.cc                     | 254 ++++++++++++++++++++--
 gcc/tree-vect-stmts.cc                    |  12 +-
 gcc/tree-vectorizer.h                     |   4 +
 4 files changed, 290 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-recurr-1.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-recurr-1.c b/gcc/testsuite/gcc.dg/vect/vect-recurr-1.c
new file mode 100644
index 00000000000..6eb59fdf854
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-recurr-1.c
@@ -0,0 +1,38 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+
+void __attribute__((noipa))
+foo (int * __restrict__ a, int * __restrict__ b, int * __restrict__ c)
+{
+  int t = *c;
+  for (int i = 0; i < 64; ++i)
+    {
+      b[i] = a[i] - t;
+      t = a[i];
+    }
+}
+
+int a[64], b[64];
+
+int
+main ()
+{
+  check_vect ();
+  for (int i = 0; i < 64; ++i)
+    {
+      a[i] = i;
+      __asm__ volatile ("" ::: "memory");
+    }
+  int c = 7;
+  foo (a, b, &c);
+  for (int i = 1; i < 64; ++i)
+    if (b[i] != a[i] - a[i-1])
+      abort ();
+  if (b[0] != -7)
+    abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 2536cc3cf49..f37251bb1ef 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -529,6 +529,52 @@ vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
   return false;
 }
 
+/* Returns true if Phi is a first-order recurrence. A first-order
+   recurrence is a non-reduction recurrence relation in which the value of
+   the recurrence in the current loop iteration equals a value defined in
+   the previous iteration.  */
+
+static bool
+vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
+				   gphi *phi)
+{
+  /* Ensure the loop latch definition is from within the loop.  */
+  edge latch = loop_latch_edge (loop);
+  tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
+  if (TREE_CODE (ldef) != SSA_NAME
+      || SSA_NAME_IS_DEFAULT_DEF (ldef)
+      || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
+    return false;
+
+  tree def = gimple_phi_result (phi);
+
+  /* Ensure every use_stmt of the phi node is dominated by the latch
+     definition.  */
+  imm_use_iterator imm_iter;
+  use_operand_p use_p;
+  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
+    if (!vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
+				     USE_STMT (use_p)))
+      return false;
+
+  /* First-order recurrence autovectorization needs shuffle vector.  */
+  tree scalar_type = TREE_TYPE (def);
+  tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
+  if (!vectype)
+    return false;
+  /* First-order recurrence autovectorization needs to handle permutation
+     with indices = [nunits-1, nunits, nunits+1, ...].  */
+  poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  vec_perm_builder sel (nunits, 1, 3);
+  for (int i = 0; i < 3; ++i)
+    sel.quick_push (nunits - 1 + i);
+  vec_perm_indices indices (sel, 1, nunits * 2);
+  if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype), indices))
+    return false;
+
+  return true;
+}
+
 /* Function vect_analyze_scalar_cycles_1.
 
    Examine the cross iteration def-use cycles of scalar variables
@@ -666,6 +712,8 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
                 }
             }
         }
+      else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
+	STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
       else
         if (dump_enabled_p ())
           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -1810,7 +1858,8 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
 
           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
                || STMT_VINFO_LIVE_P (stmt_info))
-              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
+	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
+	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
 	    /* A scalar-dependence cycle that we don't support.  */
 	    return opt_result::failure_at (phi,
 					   "not vectorized:"
@@ -1831,6 +1880,10 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
 		       && ! PURE_SLP_STMT (stmt_info))
 		ok = vectorizable_reduction (loop_vinfo,
 					     stmt_info, NULL, NULL, &cost_vec);
+	      else if (STMT_VINFO_DEF_TYPE (stmt_info)
+			 == vect_first_order_recurrence)
+		ok = vectorizable_dep_phi (loop_vinfo, stmt_info, NULL, NULL,
+					   &cost_vec);
             }
 
 	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
@@ -8267,6 +8320,145 @@ vectorizable_phi (vec_info *,
   return true;
 }
 
+/* Vectorizes Dep PHIs.  */
+
+bool
+vectorizable_dep_phi (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+		      gimple **vec_stmt, slp_tree slp_node,
+		      stmt_vector_for_cost *)
+{
+  if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
+    return false;
+
+  gphi *phi = as_a<gphi *> (stmt_info->stmt);
+
+  /* So far we only support first-order recurrence auto-vectorization.  */
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
+    return false;
+
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  if (!vec_stmt) /* transformation not required.  */
+    {
+      /* So far we don't support first-order recurrence for SLP
+	 auto-vectorization.   */
+      if (slp_node)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "do not support first-order recurrence"
+			     "autovectorization for SLP node\n");
+	  return false;
+	}
+
+      /* ???  We probably can shift in zeros for the initial values?  */
+      unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
+      if (ncopies > 1)
+	return false;
+
+      /* ???  Account for the necessary permutes.  */
+      STMT_VINFO_TYPE (stmt_info) = dep_phi_info_type;
+      return true;
+    }
+
+  /* This is the second phase of vectorizing first-order rececurrences. An
+     overview of the transformation is described below. Suppose we have the
+     following loop.
+
+     int32_t t = 0;
+     for (int i = 0; i < n; ++i)
+       {
+	b[i] = a[i] - t;
+	t = a[i];
+      }
+
+    There is a first-order recurrence on "a". For this loop, the shorthand
+    scalar IR looks like:
+
+    scalar.preheader:
+      init = a[-1]
+      br loop.body
+
+    scalar.body:
+      i = PHI <0(scalar.preheader), i+1(scalar.body)>
+      _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
+      _1 = a[i]
+      b[i] = _1 - _2
+      br cond, scalar.body, ...
+
+    In this example, _2 is a recurrence because it's value depends on the
+    previous iteration. In the first phase of vectorization, we created a
+    temporary value for _2. We now complete the vectorization and produce the
+    shorthand vector IR shown below (VF = 4).
+
+    vector.preheader:
+      vect_init = vect_cst(..., ..., ..., a[-1])
+      br vector.body
+
+    vector.body
+      i = PHI <0(vector.preheader), i+4(vector.body)>
+      vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
+      vect_2 = a[i, i+1, i+2, i+3];
+      vect_3 = vector(vect_1(3), vect_2(0, 1, 2))
+      b[i, i+1, i+2, i+3] = vect_2 - vect_3
+      br cond, vector.body, middle.block
+
+    middle.block:
+      x = vect_2(3)
+      br scalar.preheader
+
+    scalar.ph:
+      s_init = PHI <x(middle.block), a[-1], otherwise>
+      br scalar.body
+
+    After execution completes the vector loop, we extract the next value of
+    the recurrence (x) to use as the initial value in the scalar loop.  */
+
+  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+  basic_block bb = gimple_bb (phi);
+  tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
+  tree vec_init = build_vector_from_val (vectype, preheader);
+  vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
+  /* Create the vectorized first-order PHI node.  */
+  tree vec_dest = vect_get_new_vect_var (vectype,
+					 vect_simple_var, "vec_recur_");
+  gphi *new_phi = create_phi_node (vec_dest, bb);
+  add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
+
+  /* Insert shufflevector to for first-order recurrence
+     autovectorization.
+     result = VEC_PERM <vec_recur, vect_1,
+     index[nunits-1, nunits, ...]>.  */
+  poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  vec_perm_builder sel (nunits, 1, 3);
+  for (int i = 0; i < 3; ++i)
+    sel.quick_push (nunits - 1 + i);
+  vec_perm_indices indices (sel, 1, nunits * 2);
+  tree perm = vect_gen_perm_mask_checked (vectype, indices);
+
+  /* Insert the required permute after the latch definition.  The
+     second operand is tentative and will be updated when we have
+     vectorized the latch definition.  */
+  vec_dest = make_ssa_name (vectype);
+  gassign *vperm
+    = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
+			   gimple_phi_result (new_phi),
+			   gimple_phi_result (new_phi), perm);
+  edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
+  gimple_stmt_iterator gsi2
+    = gsi_for_stmt (SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le)));
+  gsi_next (&gsi2);
+  vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
+
+  if (slp_node)
+    SLP_TREE_VEC_STMTS (slp_node).quick_push (vperm);
+  else
+    STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
+
+  if (!slp_node)
+    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+  return true;
+}
+
 /* Return true if VECTYPE represents a vector that requires lowering
    by the vector lowering pass.  */
 
@@ -10223,27 +10415,53 @@ maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
   imm_use_iterator iter;
   use_operand_p use_p;
   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
-    if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
-      if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
-	  && (phi_info = loop_vinfo->lookup_stmt (phi))
-	  && STMT_VINFO_RELEVANT_P (phi_info)
-	  && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
+    {
+      gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
+      if (!phi)
+	continue;
+      if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
+	    && (phi_info = loop_vinfo->lookup_stmt (phi))
+	    && STMT_VINFO_RELEVANT_P (phi_info)))
+	continue;
+      loop_p loop = gimple_bb (phi)->loop_father;
+      edge e = loop_latch_edge (loop);
+      if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
+	continue;
+
+      if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
 	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
 	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
 	{
-	  loop_p loop = gimple_bb (phi)->loop_father;
-	  edge e = loop_latch_edge (loop);
-	  if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
+	  vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
+	  vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
+	  gcc_assert (phi_defs.length () == latch_defs.length ());
+	  for (unsigned i = 0; i < phi_defs.length (); ++i)
+	    add_phi_arg (as_a <gphi *> (phi_defs[i]),
+			 gimple_get_lhs (latch_defs[i]), e,
+			 gimple_phi_arg_location (phi, e->dest_idx));
+	}
+      else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
+	{
+	  /* For first order recurrences we have to update both uses of
+	     the latch definition, the one in the PHI node and the one
+	     in the generated VEC_PERM_EXPR.  */
+	  vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
+	  vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
+	  gcc_assert (phi_defs.length () == latch_defs.length ());
+	  /* ???  For ncopies > 1 this scheme might not be correct.  */
+	  gcc_assert (phi_defs.length () == 1);
+	  for (unsigned i = 0; i < phi_defs.length (); ++i)
 	    {
-	      vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
-	      vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
-	      gcc_assert (phi_defs.length () == latch_defs.length ());
-	      for (unsigned i = 0; i < phi_defs.length (); ++i)
-		add_phi_arg (as_a <gphi *> (phi_defs[i]),
-			     gimple_get_lhs (latch_defs[i]), e,
-			     gimple_phi_arg_location (phi, e->dest_idx));
+	      gassign *perm = as_a <gassign *> (phi_defs[i]);
+	      gphi *vphi = as_a <gphi *>
+			     (SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm)));
+	      gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
+	      update_stmt (perm);
+	      add_phi_arg (vphi, gimple_get_lhs (latch_defs[i]), e,
+			   gimple_phi_arg_location (phi, e->dest_idx));
 	    }
 	}
+    }
 }
 
 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
@@ -10652,6 +10870,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
+	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
 	      && ! PURE_SLP_STMT (stmt_info))
 	    {
@@ -10677,7 +10896,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
-	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
+	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
+	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
 	      && ! PURE_SLP_STMT (stmt_info))
 	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
 	}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c8d1efc45e5..68b79a8f988 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -11404,6 +11404,12 @@ vect_transform_stmt (vec_info *vinfo,
       gcc_assert (done);
       break;
 
+    case dep_phi_info_type:
+      done = vectorizable_dep_phi (as_a <loop_vec_info> (vinfo),
+				  stmt_info, &vec_stmt, slp_node, NULL);
+      gcc_assert (done);
+      break;
+
     case phi_info_type:
       done = vectorizable_phi (vinfo, stmt_info, &vec_stmt, slp_node, NULL);
       gcc_assert (done);
@@ -11804,6 +11810,9 @@ vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt,
 	case vect_nested_cycle:
 	  dump_printf (MSG_NOTE, "nested cycle\n");
 	  break;
+	case vect_first_order_recurrence:
+	  dump_printf (MSG_NOTE, "first order recurrence\n");
+	  break;
 	case vect_unknown_def_type:
 	  dump_printf (MSG_NOTE, "unknown\n");
 	  break;
@@ -11852,7 +11861,8 @@ vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt,
       || *dt == vect_induction_def
       || *dt == vect_reduction_def
       || *dt == vect_double_reduction_def
-      || *dt == vect_nested_cycle)
+      || *dt == vect_nested_cycle
+			|| *dt == vect_first_order_recurrence)
     {
       *vectype = STMT_VINFO_VECTYPE (def_stmt_info);
       gcc_assert (*vectype != NULL_TREE);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 4870c754499..a8a83ced252 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -65,6 +65,7 @@ enum vect_def_type {
   vect_reduction_def,
   vect_double_reduction_def,
   vect_nested_cycle,
+  vect_first_order_recurrence,
   vect_unknown_def_type
 };
 
@@ -1027,6 +1028,7 @@ enum stmt_vec_info_type {
   cycle_phi_info_type,
   lc_phi_info_type,
   phi_info_type,
+  dep_phi_info_type,
   loop_exit_ctrl_vec_info_type
 };
 
@@ -2331,6 +2333,8 @@ extern bool vectorizable_lc_phi (loop_vec_info, stmt_vec_info,
 				 gimple **, slp_tree);
 extern bool vectorizable_phi (vec_info *, stmt_vec_info, gimple **, slp_tree,
 			      stmt_vector_for_cost *);
+extern bool vectorizable_dep_phi (loop_vec_info, stmt_vec_info,
+				 gimple **, slp_tree, stmt_vector_for_cost *);
 extern bool vect_emulated_vector_p (tree);
 extern bool vect_can_vectorize_without_simd_p (tree_code);
 extern bool vect_can_vectorize_without_simd_p (code_helper);
-- 
2.35.3


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Add first-order recurrence autovectorization
  2022-10-06 13:07   ` Richard Biener
@ 2022-10-07 12:24     ` Richard Biener
  2022-10-07 13:11       ` 钟居哲
  0 siblings, 1 reply; 5+ messages in thread
From: Richard Biener @ 2022-10-07 12:24 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: gcc-patches, richard.sandiford

On Thu, Oct 6, 2022 at 3:07 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Thu, Oct 6, 2022 at 2:13 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Fri, Sep 30, 2022 at 10:00 AM <juzhe.zhong@rivai.ai> wrote:
> > >
> > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > >
> > > Hi, After fixing previous ICE.
> > > I add full implementation (insert permutation to get correct result.)
> > >
> > > The gimple IR is correct now I think:
> > >   # t_21 = PHI <_4(6), t_12(9)>
> > >   # i_22 = PHI <i_17(6), 0(9)>
> > >   # vectp_a.6_26 = PHI <vectp_a.6_25(6), a_14(D)(9)>
> > >   # vect_vec_recur_.9_9 = PHI <vect__4.8_19(6), vect_cst__18(9)>
> > >   # vectp_b.11_7 = PHI <vectp_b.11_30(6), b_15(D)(9)>
> > >   # curr_cnt_36 = PHI <next_cnt_35(6), _32(9)>
> > >   # loop_len_20 = PHI <next_len_34(6), _32(9)>
> > >   _38 = .WHILE_LEN (loop_len_20, 32, POLY_INT_CST [4, 4]);
> > >   while_len_37 = _38;
> > >   _1 = (long unsigned int) i_22;
> > >   _2 = _1 * 4;
> > >   _3 = a_14(D) + _2;
> > >   vect__4.8_19 = .LEN_LOAD (vectp_a.6_26, 32B, loop_len_20, 0);
> > >   _4 = *_3;
> > >   _5 = b_15(D) + _2;
> > >   vect_vec_recur_.9_9 = VEC_PERM_EXPR <vect_vec_recur_.9_9, vect__4.8_19, { POLY_INT_CST [3, 4], POLY_INT_CST [4, 4], POLY_INT_CST [5, 4], ... }>;
> > >
> > > But I encounter another ICE:
> > > 0x169e0e7 process_bb
> > >         ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:7498
> > > 0x16a09af do_rpo_vn(function*, edge_def*, bitmap_head*, bool, bool, vn_lookup_kind)
> > >         ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:8109
> > > 0x16a0fe7 do_rpo_vn(function*, edge_def*, bitmap_head*)
> > >         ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:8205
> > > 0x179b7db execute
> > >         ../../../riscv-gcc/gcc/tree-vectorizer.cc:1365
> > >
> > > Could you help me with this? After fixing this ICE, I think the loop vectorizer
> > > can run correctly. Maybe you can test is in X86 or ARM after fixing this ICE.
> >
> > Sorry for the late reply, the issue is that we have
> >
> > vect_vec_recur_.7_7 = VEC_PERM_EXPR <vect_vec_recur_.7_7, vect__4.6_9,
> > { 7, 8, 9, 10, 11, 12, 13, 14 }>;
> >
> > thus
> >
> > +      for (unsigned i = 0; i < ncopies; ++i)
> > +       {
> > +         gphi *phi = as_a<gphi *> (STMT_VINFO_VEC_STMTS (def_stmt_info)[i]);
> > +         tree latch = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> > +         tree recur = gimple_phi_result (phi);
> > +         gassign *assign
> > +           = gimple_build_assign (recur, VEC_PERM_EXPR, recur, latch, perm);
> > +         gimple_assign_set_lhs (assign, recur);
> >
> > needs to create a new SSA name for each LHS.  You shouldn't create code in
> > vect_get_vec_defs_for_operand either.
> >
> > Let me mangle the patch a bit.
> >
> > The attached is what I came up with, the permutes need to be generated when
> > the backedge PHI values are filled in.  Missing are ncopies > 1 handling, we'd
> > need to think of how the initial value and the permutes would work here, missing
> > is SLP support but more importantly handling in the epilogue (so on x86 requires
> > constant loop bound)
> > I've added a testcase that triggers on x86_64.
>
> Actually I broke it, the following is more correct.

So let me finish the patch.  I have everything besides the epilogue
handling done,
I'll get to that somewhen next week.

Richard.

> Richard.
>
> > Richard.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Re: [PATCH] Add first-order recurrence autovectorization
  2022-10-07 12:24     ` Richard Biener
@ 2022-10-07 13:11       ` 钟居哲
  0 siblings, 0 replies; 5+ messages in thread
From: 钟居哲 @ 2022-10-07 13:11 UTC (permalink / raw)
  To: richard.guenther; +Cc: gcc-patches, richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 3841 bytes --]

Sorry for late reply. I just got back from vacation (a week).
I was planning to finish this patch after vacation. It seems that you almost finished.
That's great! Thank you so much.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2022-10-07 20:24
To: juzhe.zhong
CC: gcc-patches; richard.sandiford
Subject: Re: [PATCH] Add first-order recurrence autovectorization
On Thu, Oct 6, 2022 at 3:07 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Thu, Oct 6, 2022 at 2:13 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Fri, Sep 30, 2022 at 10:00 AM <juzhe.zhong@rivai.ai> wrote:
> > >
> > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > >
> > > Hi, After fixing previous ICE.
> > > I add full implementation (insert permutation to get correct result.)
> > >
> > > The gimple IR is correct now I think:
> > >   # t_21 = PHI <_4(6), t_12(9)>
> > >   # i_22 = PHI <i_17(6), 0(9)>
> > >   # vectp_a.6_26 = PHI <vectp_a.6_25(6), a_14(D)(9)>
> > >   # vect_vec_recur_.9_9 = PHI <vect__4.8_19(6), vect_cst__18(9)>
> > >   # vectp_b.11_7 = PHI <vectp_b.11_30(6), b_15(D)(9)>
> > >   # curr_cnt_36 = PHI <next_cnt_35(6), _32(9)>
> > >   # loop_len_20 = PHI <next_len_34(6), _32(9)>
> > >   _38 = .WHILE_LEN (loop_len_20, 32, POLY_INT_CST [4, 4]);
> > >   while_len_37 = _38;
> > >   _1 = (long unsigned int) i_22;
> > >   _2 = _1 * 4;
> > >   _3 = a_14(D) + _2;
> > >   vect__4.8_19 = .LEN_LOAD (vectp_a.6_26, 32B, loop_len_20, 0);
> > >   _4 = *_3;
> > >   _5 = b_15(D) + _2;
> > >   vect_vec_recur_.9_9 = VEC_PERM_EXPR <vect_vec_recur_.9_9, vect__4.8_19, { POLY_INT_CST [3, 4], POLY_INT_CST [4, 4], POLY_INT_CST [5, 4], ... }>;
> > >
> > > But I encounter another ICE:
> > > 0x169e0e7 process_bb
> > >         ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:7498
> > > 0x16a09af do_rpo_vn(function*, edge_def*, bitmap_head*, bool, bool, vn_lookup_kind)
> > >         ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:8109
> > > 0x16a0fe7 do_rpo_vn(function*, edge_def*, bitmap_head*)
> > >         ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:8205
> > > 0x179b7db execute
> > >         ../../../riscv-gcc/gcc/tree-vectorizer.cc:1365
> > >
> > > Could you help me with this? After fixing this ICE, I think the loop vectorizer
> > > can run correctly. Maybe you can test is in X86 or ARM after fixing this ICE.
> >
> > Sorry for the late reply, the issue is that we have
> >
> > vect_vec_recur_.7_7 = VEC_PERM_EXPR <vect_vec_recur_.7_7, vect__4.6_9,
> > { 7, 8, 9, 10, 11, 12, 13, 14 }>;
> >
> > thus
> >
> > +      for (unsigned i = 0; i < ncopies; ++i)
> > +       {
> > +         gphi *phi = as_a<gphi *> (STMT_VINFO_VEC_STMTS (def_stmt_info)[i]);
> > +         tree latch = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> > +         tree recur = gimple_phi_result (phi);
> > +         gassign *assign
> > +           = gimple_build_assign (recur, VEC_PERM_EXPR, recur, latch, perm);
> > +         gimple_assign_set_lhs (assign, recur);
> >
> > needs to create a new SSA name for each LHS.  You shouldn't create code in
> > vect_get_vec_defs_for_operand either.
> >
> > Let me mangle the patch a bit.
> >
> > The attached is what I came up with, the permutes need to be generated when
> > the backedge PHI values are filled in.  Missing are ncopies > 1 handling, we'd
> > need to think of how the initial value and the permutes would work here, missing
> > is SLP support but more importantly handling in the epilogue (so on x86 requires
> > constant loop bound)
> > I've added a testcase that triggers on x86_64.
>
> Actually I broke it, the following is more correct.
 
So let me finish the patch.  I have everything besides the epilogue
handling done,
I'll get to that somewhen next week.
 
Richard.
 
> Richard.
>
> > Richard.
 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2022-10-07 13:11 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-30  8:00 [PATCH] Add first-order recurrence autovectorization juzhe.zhong
2022-10-06 12:13 ` Richard Biener
2022-10-06 13:07   ` Richard Biener
2022-10-07 12:24     ` Richard Biener
2022-10-07 13:11       ` 钟居哲

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).