public inbox for gcc-patches@gcc.gnu.org
* [PATCH V4] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer
@ 2023-07-04 12:43 juzhe.zhong
  2023-07-04 13:12 ` juzhe.zhong
From: juzhe.zhong @ 2023-07-04 12:43 UTC
  To: gcc-patches; +Cc: richard.sandiford, rguenther, Ju-Zhe Zhong

From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richard and Richi.

This version addresses the comments from Richi.

Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE whenever the target supports them.

I have fully tested these 4 formats:

length = vf is a dummy length,
mask = {-1, -1, ...} is a dummy mask.

1. no length, no mask
   LEN_MASK_GATHER_LOAD (..., length = vf, mask = {-1,-1,...})
2. length exists, no mask
   LEN_MASK_GATHER_LOAD (..., len, mask = {-1,-1,...})
3. mask exists, no length
   LEN_MASK_GATHER_LOAD (..., length = vf, mask)
4. both length and mask exist
   LEN_MASK_GATHER_LOAD (..., length, mask)

All of these work fine in this patch.
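
On the construction side, here is a minimal sketch of how the patch
builds the gather call (condensed from the vectorizable_load hunk
below; after the internal-fn.cc change the mask sits at argument
index 4 and the length at index 5, 0-based):

  /* 7 arguments: pointer, offset vector, scale, else value,
     mask, len, bias.  */
  gcall *call
    = gimple_build_call_internal (IFN_LEN_MASK_GATHER_LOAD, 7,
                                  dataref_ptr, vec_offset, scale,
                                  zero, final_mask, final_len, bias);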

Here is an example:

void
f (int *restrict a,
   int *restrict b, int n,
   int base, int step,
   int *restrict cond)
{
  for (int i = 0; i < n; ++i)
    {
      if (cond[i])
        a[i * 4] = b[i];
    }
}

Gimple IR:

  <bb 3> [local count: 105119324]:
  _58 = (unsigned long) n_13(D);

  <bb 4> [local count: 630715945]:
  # vectp_cond.7_45 = PHI <vectp_cond.7_46(4), cond_14(D)(3)>
  # vectp_b.11_51 = PHI <vectp_b.11_52(4), b_15(D)(3)>
  # vectp_a.14_55 = PHI <vectp_a.14_56(4), a_16(D)(3)>
  # ivtmp_59 = PHI <ivtmp_60(4), _58(3)>
  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
  ivtmp_44 = _61 * 4;
  vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
  mask__24.10_49 = vect__4.9_47 != { 0, ... };
  vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
  ivtmp_54 = _61 * 16;
  .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, vect__8.13_53, _61, 0, mask__24.10_49);
  vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
  vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
  vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
  ivtmp_60 = ivtmp_59 - _61;
  if (ivtmp_60 != 0)
    goto <bb 4>; [83.33%]
  else
    goto <bb 5>; [16.67%]
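
For comparison, in case 2 above (a length but no real mask, e.g. the
same loop without the cond[i] test), the dump would use the dummy
all-ones mask instead. An illustrative sketch in the style of the dump
above, with made-up SSA names:

  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
  vect__8 = .LEN_MASK_LOAD (vectp_b, 32B, _61, 0, { -1, ... });
  .LEN_MASK_SCATTER_STORE (vectp_a, { 0, 16, 32, ... }, 1, vect__8, _61, 0, { -1, ... });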

Ok for trunk?

gcc/ChangeLog:

        * internal-fn.cc (internal_fn_len_index): Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
        (internal_fn_mask_index): Ditto.
        * optabs-query.cc (supports_vec_gather_load_p): Ditto.
        (supports_vec_scatter_store_p): Ditto.
        * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
        * tree-vect-patterns.cc (vect_recog_gather_scatter_pattern): Ditto.
        * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
        (vect_get_strided_load_store_ops): Ditto.
        (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.

---
 gcc/internal-fn.cc         |   6 +-
 gcc/optabs-query.cc        |   2 +
 gcc/tree-vect-data-refs.cc |  18 +++++-
 gcc/tree-vect-patterns.cc  |   4 +-
 gcc/tree-vect-stmts.cc     | 122 +++++++++++++++++++++++++++++++------
 5 files changed, 129 insertions(+), 23 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 303df102d81..bec60cdf4d0 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4472,7 +4472,7 @@ internal_fn_len_index (internal_fn fn)
 
     case IFN_LEN_MASK_GATHER_LOAD:
     case IFN_LEN_MASK_SCATTER_STORE:
-      return 4;
+      return 5;
 
     default:
       return -1;
@@ -4497,11 +4497,9 @@ internal_fn_mask_index (internal_fn fn)
     case IFN_MASK_SCATTER_STORE:
     case IFN_LEN_MASK_LOAD:
     case IFN_LEN_MASK_STORE:
-      return 4;
-
     case IFN_LEN_MASK_GATHER_LOAD:
     case IFN_LEN_MASK_SCATTER_STORE:
-      return 6;
+      return 4;
 
     default:
       return (conditional_internal_fn_code (fn) != ERROR_MARK
diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 2fdd0d34354..bf1f484e874 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
     this_fn_optabs->supports_vec_gather_load[mode]
       = (supports_vec_convert_optab_p (gather_load_optab, mode)
 	 || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+	 || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
 	 ? 1 : -1);
 
   return this_fn_optabs->supports_vec_gather_load[mode] > 0;
@@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
     this_fn_optabs->supports_vec_scatter_store[mode]
       = (supports_vec_convert_optab_p (scatter_store_optab, mode)
 	 || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+	 || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
 	 ? 1 : -1);
 
   return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index ebe93832b1e..ab2af103cb4 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
     return false;
 
   /* Work out which function we need.  */
-  internal_fn ifn, alt_ifn;
+  internal_fn ifn, alt_ifn, alt_ifn2;
   if (read_p)
     {
       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
       alt_ifn = IFN_MASK_GATHER_LOAD;
+      /* When the target supports LEN_MASK_GATHER_LOAD, we always
+	 use LEN_MASK_GATHER_LOAD regardless of whether len and
+	 mask are valid.  */
+      alt_ifn2 = IFN_LEN_MASK_GATHER_LOAD;
     }
   else
     {
       ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
       alt_ifn = IFN_MASK_SCATTER_STORE;
+      /* When the target supports LEN_MASK_SCATTER_STORE, we always
+	 use LEN_MASK_SCATTER_STORE regardless of whether len and
+	 mask are valid.  */
+      alt_ifn2 = IFN_LEN_MASK_SCATTER_STORE;
     }
 
   for (;;)
@@ -3909,6 +3917,14 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
 	  *offset_vectype_out = offset_vectype;
 	  return true;
 	}
+      else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
+						       memory_type,
+						       offset_vectype, scale))
+	{
+	  *ifn_out = alt_ifn2;
+	  *offset_vectype_out = offset_vectype;
+	  return true;
+	}
 
       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
 	  && TYPE_PRECISION (offset_type) >= element_bits)
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index de20e9d59cb..1bc36b043a0 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -6075,7 +6075,9 @@ vect_recog_gather_scatter_pattern (vec_info *vinfo,
     mask = vect_convert_mask_for_vectype (mask, gs_vectype, stmt_info,
 					  loop_vinfo);
   else if (gs_info.ifn == IFN_MASK_SCATTER_STORE
-	   || gs_info.ifn == IFN_MASK_GATHER_LOAD)
+	   || gs_info.ifn == IFN_MASK_GATHER_LOAD
+	   || gs_info.ifn == IFN_LEN_MASK_SCATTER_STORE
+	   || gs_info.ifn == IFN_LEN_MASK_GATHER_LOAD)
     mask = build_int_cst (TREE_TYPE (truth_type_for (gs_vectype)), -1);
 
   /* Get the invariant base and non-invariant offset, converting the
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a0c39268bf0..c5341ac01e5 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1771,6 +1771,18 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
 						   gs_info->offset_vectype,
 						   gs_info->scale))
 	{
+	  ifn = (is_load
+		 ? IFN_LEN_MASK_GATHER_LOAD
+		 : IFN_LEN_MASK_SCATTER_STORE);
+	  if (internal_gather_scatter_fn_supported_p (ifn, vectype,
+						      gs_info->memory_type,
+						      gs_info->offset_vectype,
+						      gs_info->scale))
+	    {
+	      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+	      vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+	      return;
+	    }
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 			     "can't operate on partial vectors because"
@@ -3129,16 +3141,39 @@ vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
 static void
 vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
 				 loop_vec_info loop_vinfo,
+				 gimple_stmt_iterator *gsi,
 				 gather_scatter_info *gs_info,
-				 tree *dataref_bump, tree *vec_offset)
+				 tree *dataref_bump, tree *vec_offset,
+				 vec_loop_lens *loop_lens)
 {
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 
-  tree bump = size_binop (MULT_EXPR,
-			  fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
-			  size_int (TYPE_VECTOR_SUBPARTS (vectype)));
-  *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
+  if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
+    {
+      /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
+	 ivtmp_8 = _31 * 16 (step in bytes);
+	 .LEN_MASK_SCATTER_STORE (vectp_a.9_7, ... );
+	 vectp_a.9_26 = vectp_a.9_7 + ivtmp_8;  */
+      tree loop_len
+	= vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
+      tree tmp
+	= fold_build2 (MULT_EXPR, sizetype,
+		       fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
+		       loop_len);
+      tree bump = make_temp_ssa_name (sizetype, NULL, "ivtmp");
+      gassign *assign = gimple_build_assign (bump, tmp);
+      gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+      *dataref_bump = bump;
+    }
+  else
+    {
+      tree bump
+	= size_binop (MULT_EXPR,
+		      fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
+		      size_int (TYPE_VECTOR_SUBPARTS (vectype)));
+      *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
+    }
 
   /* The offset given in GS_INFO can have pointer type, so use the element
      type of the vector instead.  */
@@ -8685,8 +8720,8 @@ vectorizable_store (vec_info *vinfo,
   else if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       aggr_type = elem_type;
-      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
-				       &bump, &vec_offset);
+      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
+				       &bump, &vec_offset, loop_lens);
     }
   else
     {
@@ -8915,6 +8950,8 @@ vectorizable_store (vec_info *vinfo,
 	      unsigned HOST_WIDE_INT align;
 
 	      tree final_mask = NULL_TREE;
+	      tree final_len = NULL_TREE;
+	      tree bias = NULL_TREE;
 	      if (loop_masks)
 		final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
 						 vec_num * ncopies,
@@ -8929,8 +8966,36 @@ vectorizable_store (vec_info *vinfo,
 		  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 		    vec_offset = vec_offsets[vec_num * j + i];
 		  tree scale = size_int (gs_info.scale);
+
+		  if (gs_info.ifn == IFN_LEN_MASK_SCATTER_STORE)
+		    {
+		      if (loop_lens)
+			final_len
+			  = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+					       vec_num * ncopies, vectype,
+					       vec_num * j + i, 1);
+		      else
+			final_len
+			  = build_int_cst (sizetype,
+					   TYPE_VECTOR_SUBPARTS (vectype));
+		      signed char biasval
+			= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+		      bias = build_int_cst (intQI_type_node, biasval);
+		      if (!final_mask)
+			{
+			  mask_vectype = truth_type_for (vectype);
+			  final_mask = build_minus_one_cst (mask_vectype);
+			}
+		    }
+
 		  gcall *call;
-		  if (final_mask)
+		  if (final_len && final_len)
+		    call
+		      = gimple_build_call_internal (IFN_LEN_MASK_SCATTER_STORE,
+						    7, dataref_ptr, vec_offset,
+						    scale, vec_oprnd, final_mask,
+						    final_len, bias);
+		  else if (final_mask)
 		    call = gimple_build_call_internal
 		      (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
 		       scale, vec_oprnd, final_mask);
@@ -9047,9 +9112,6 @@ vectorizable_store (vec_info *vinfo,
 	      machine_mode vmode = TYPE_MODE (vectype);
 	      machine_mode new_vmode = vmode;
 	      internal_fn partial_ifn = IFN_LAST;
-	      /* Produce 'len' and 'bias' argument.  */
-	      tree final_len = NULL_TREE;
-	      tree bias = NULL_TREE;
 	      if (loop_lens)
 		{
 		  opt_machine_mode new_ovmode
@@ -10177,8 +10239,8 @@ vectorizable_load (vec_info *vinfo,
   else if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       aggr_type = elem_type;
-      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
-				       &bump, &vec_offset);
+      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
+				       &bump, &vec_offset, loop_lens);
     }
   else
     {
@@ -10339,6 +10401,8 @@ vectorizable_load (vec_info *vinfo,
 	  for (i = 0; i < vec_num; i++)
 	    {
 	      tree final_mask = NULL_TREE;
+	      tree final_len = NULL_TREE;
+	      tree bias = NULL_TREE;
 	      if (loop_masks
 		  && memory_access_type != VMAT_INVARIANT)
 		final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
@@ -10368,8 +10432,35 @@ vectorizable_load (vec_info *vinfo,
 			  vec_offset = vec_offsets[vec_num * j + i];
 			tree zero = build_zero_cst (vectype);
 			tree scale = size_int (gs_info.scale);
+
+			if (gs_info.ifn == IFN_LEN_MASK_GATHER_LOAD)
+			  {
+			    if (loop_lens)
+			      final_len
+				= vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+						     vec_num * ncopies, vectype,
+						     vec_num * j + i, 1);
+			    else
+			      final_len = build_int_cst (sizetype,
+							 TYPE_VECTOR_SUBPARTS (
+							   vectype));
+			    signed char biasval
+			      = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+			    bias = build_int_cst (intQI_type_node, biasval);
+			    if (!final_mask)
+			      {
+				mask_vectype = truth_type_for (vectype);
+				final_mask = build_minus_one_cst (mask_vectype);
+			      }
+			  }
+
 			gcall *call;
-			if (final_mask)
+			if (final_len && final_mask)
+			  call = gimple_build_call_internal (
+			    IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr,
+			    vec_offset, scale, zero, final_mask, final_len,
+			    bias);
+			else if (final_mask)
 			  call = gimple_build_call_internal
 			    (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
 			     vec_offset, scale, zero, final_mask);
@@ -10462,9 +10553,6 @@ vectorizable_load (vec_info *vinfo,
 		    machine_mode vmode = TYPE_MODE (vectype);
 		    machine_mode new_vmode = vmode;
 		    internal_fn partial_ifn = IFN_LAST;
-		    /* Produce 'len' and 'bias' argument.  */
-		    tree final_len = NULL_TREE;
-		    tree bias = NULL_TREE;
 		    if (loop_lens)
 		      {
 			opt_machine_mode new_ovmode
-- 
2.36.3



* Re: [PATCH V4] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer
  2023-07-04 12:43 [PATCH V4] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer juzhe.zhong
@ 2023-07-04 13:12 ` juzhe.zhong
From: juzhe.zhong @ 2023-07-04 13:12 UTC
  To: 钟居哲, gcc-patches; +Cc: richard.sandiford, rguenther


Sorry, I made a mistake in V4:
+		  if (final_len && final_len)
It should be:
+		  if (final_len && final_mask)
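
With that one-token fix, the scatter-store call site reads as below
(a sketch of the corrected V4 hunk, not the exact V5 text):

  gcall *call;
  if (final_len && final_mask)
    call
      = gimple_build_call_internal (IFN_LEN_MASK_SCATTER_STORE,
                                    7, dataref_ptr, vec_offset,
                                    scale, vec_oprnd, final_mask,
                                    final_len, bias);
  else if (final_mask)
    call = gimple_build_call_internal
      (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
       scale, vec_oprnd, final_mask);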

I fixed it in the V5 patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/623589.html 

Thanks.


juzhe.zhong@rivai.ai
 
 

