public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
@ 2023-08-11 14:24 juzhe.zhong
  2023-08-14  6:53 ` Richard Biener
  0 siblings, 1 reply; 19+ messages in thread
From: juzhe.zhong @ 2023-08-11 14:24 UTC (permalink / raw)
  To: gcc-patches; +Cc: rguenther, richard.sandiford, Ju-Zhe Zhong

From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richard and Richi.

This patch add support live vectorization by VEC_EXTRACT for LEN loop control.

Consider this following case:

#include <stdint.h>

#define EXTRACT_LAST(TYPE)			\
  TYPE __attribute__ ((noinline, noclone))	\
  test_##TYPE (TYPE *x, int n, TYPE value)	\
  {						\
    TYPE last;					\
    for (int j = 0; j < n; ++j)			\
      {						\
	last = x[j];				\
	x[j] = last * value;			\
      }						\
    return last;				\
  }

#define TEST_ALL(T)				\
  T (uint8_t)					\

TEST_ALL (EXTRACT_LAST)

ARM SVE IR:

Preheader:
  max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });

Loop:
  ...
  # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
  ...
  vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
  vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
  .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
  ...
  next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
  ...

Epilogue:
  _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);

For RVV since we prefer len in loop control, after this patch for RVV:

Loop:
  ...
  loop_len_22 = SELECT_VL;
  vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
  vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
  .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
  ...

Epilogue:
  _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);

Details of this approach:

1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
            for LEN loop control.
   
   This function we check whether target support:
    - Use LEN as the loop control.
    - Support VEC_EXTRACT optab.

2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.

3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).

The only difference between mask and len is that len is using length generated by SELECT_VL and
use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.

gcc/ChangeLog:

	* tree-vect-loop.cc (vectorizable_live_operation): Add loop len control.

---
 gcc/tree-vect-loop.cc | 78 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 62 insertions(+), 16 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index bf8d677b584..a011e2dacb2 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10278,17 +10278,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
       /* No transformation required.  */
       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
 	{
-	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
-					       OPTIMIZE_FOR_SPEED))
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "can't operate on partial vectors "
-				 "because the target doesn't support extract "
-				 "last reduction.\n");
-	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
-	    }
-	  else if (slp_node)
+	  if (slp_node)
 	    {
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -10308,9 +10298,28 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	  else
 	    {
 	      gcc_assert (ncopies == 1 && !slp_node);
-	      vect_record_loop_mask (loop_vinfo,
-				     &LOOP_VINFO_MASKS (loop_vinfo),
-				     1, vectype, NULL);
+	      if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
+						  OPTIMIZE_FOR_SPEED))
+		vect_record_loop_mask (loop_vinfo,
+				       &LOOP_VINFO_MASKS (loop_vinfo),
+				       1, vectype, NULL);
+	      else if (convert_optab_handler (vec_extract_optab,
+					      TYPE_MODE (vectype),
+					      TYPE_MODE (TREE_TYPE (vectype)))
+		       != CODE_FOR_nothing)
+		vect_record_loop_len (loop_vinfo,
+				      &LOOP_VINFO_LENS (loop_vinfo),
+				      1, vectype, 1);
+	      else
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (
+		      MSG_MISSED_OPTIMIZATION, vect_location,
+		      "can't operate on partial vectors "
+		      "because the target doesn't support extract "
+		      "last reduction.\n");
+		  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+		}
 	    }
 	}
       /* ???  Enable for loop costing as well.  */
@@ -10336,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
   gimple *vec_stmt;
   if (slp_node)
     {
-      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!loop_vinfo
+		  || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+		      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
 
       /* Get the correct slp vectorized stmt.  */
       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
@@ -10380,7 +10391,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 
       gimple_seq stmts = NULL;
       tree new_tree;
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+	{
+	  /* Emit:
+
+	       SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+
+	     where VEC_LHS is the vectorized live-out result and MASK is
+	     the loop mask for the final iteration.  */
+	  gcc_assert (ncopies == 1 && !slp_node);
+	  gimple_seq tem = NULL;
+	  gimple_stmt_iterator gsi = gsi_last (tem);
+	  tree len
+	    = vect_get_loop_len (loop_vinfo, &gsi,
+				 &LOOP_VINFO_LENS (loop_vinfo),
+				 1, vectype, 0, 0);
+
+	  /* BIAS - 1.  */
+	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	  tree bias_minus_one
+	    = int_const_binop (MINUS_EXPR,
+			       build_int_cst (TREE_TYPE (len), biasval),
+			       build_one_cst (TREE_TYPE (len)));
+
+	  /* LAST_INDEX = LEN + (BIAS - 1).  */
+	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
+					  len, bias_minus_one);
+
+	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
+	  tree scalar_res
+	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
+			    vec_lhs_phi, last_index);
+
+	  /* Convert the extracted vector element to the scalar type.  */
+	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+	}
+      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 	{
 	  /* Emit:
 
-- 
2.36.3


^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2023-08-15  9:05 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-11 14:24 [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization juzhe.zhong
2023-08-14  6:53 ` Richard Biener
2023-08-14  7:09   ` juzhe.zhong
2023-08-14  8:51     ` Kewen.Lin
2023-08-14  8:58       ` Robin Dapp
2023-08-14  9:15         ` Richard Biener
2023-08-14  9:19         ` Kewen.Lin
2023-08-14  9:26           ` juzhe.zhong
2023-08-14 12:07             ` Richard Biener
2023-08-14 12:08           ` juzhe.zhong
2023-08-14 12:45             ` Kewen.Lin
2023-08-14 13:28               ` 钟居哲
2023-08-14 13:44                 ` Richard Biener
2023-08-14 18:51               ` Stefan Schulze Frielinghaus
2023-08-15  6:25                 ` Kewen.Lin
2023-08-15  6:58                   ` Richard Biener
2023-08-15  8:52                     ` Richard Sandiford
2023-08-15  8:55                       ` Richard Biener
2023-08-15  9:05                         ` Richard Sandiford

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).