public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH V6] VECT: Apply LEN_MASK_{LOAD,STORE} into vectorizer
@ 2023-06-22 23:51 juzhe.zhong
  2023-06-23  7:55 ` Bernhard Reutner-Fischer
  2023-06-23  8:08 ` Richard Sandiford
  0 siblings, 2 replies; 8+ messages in thread
From: juzhe.zhong @ 2023-06-22 23:51 UTC (permalink / raw)
  To: gcc-patches; +Cc: rguenther, richard.sandiford, rep.dot.nop, Ju-Zhe Zhong

From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Address comments from Richard and Bernhard on the V5 patch.
V6 fixes all the issues raised in their comments.

gcc/ChangeLog:

        * internal-fn.cc (expand_partial_store_optab_fn): Adapt for LEN_MASK_STORE.
        (internal_load_fn_p): Add LEN_MASK_LOAD.
        (internal_store_fn_p): Add LEN_MASK_STORE.
        (internal_fn_mask_index): Add LEN_MASK_{LOAD,STORE}.
        (internal_fn_stored_value_index): Add LEN_MASK_STORE.
        (internal_len_load_store_bias):  Add LEN_MASK_{LOAD,STORE}.
        * optabs-tree.cc (can_vec_mask_load_store_p): Adapt for LEN_MASK_{LOAD,STORE}.
        (get_len_load_store_mode): Ditto.
        * optabs-tree.h (can_vec_mask_load_store_p): Ditto.
        (get_len_load_store_mode): Ditto.
        * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
        (get_all_ones_mask): New function.
        (vectorizable_store): Apply LEN_MASK_{LOAD,STORE} into vectorizer.
        (vectorizable_load): Ditto.

---
 gcc/internal-fn.cc     |  37 ++++++-
 gcc/optabs-tree.cc     |  86 ++++++++++++++--
 gcc/optabs-tree.h      |   6 +-
 gcc/tree-vect-stmts.cc | 221 +++++++++++++++++++++++++++++------------
 4 files changed, 267 insertions(+), 83 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index c911ae790cb..1c2fd487e2a 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -2949,7 +2949,7 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
  * OPTAB.  */
 
 static void
-expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
+expand_partial_store_optab_fn (internal_fn ifn, gcall *stmt, convert_optab optab)
 {
   class expand_operand ops[5];
   tree type, lhs, rhs, maskt, biast;
@@ -2957,7 +2957,7 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
   insn_code icode;
 
   maskt = gimple_call_arg (stmt, 2);
-  rhs = gimple_call_arg (stmt, 3);
+  rhs = gimple_call_arg (stmt, internal_fn_stored_value_index (ifn));
   type = TREE_TYPE (rhs);
   lhs = expand_call_mem_ref (type, stmt, 0);
 
@@ -4435,6 +4435,7 @@ internal_load_fn_p (internal_fn fn)
     case IFN_GATHER_LOAD:
     case IFN_MASK_GATHER_LOAD:
     case IFN_LEN_LOAD:
+    case IFN_LEN_MASK_LOAD:
       return true;
 
     default:
@@ -4455,6 +4456,7 @@ internal_store_fn_p (internal_fn fn)
     case IFN_SCATTER_STORE:
     case IFN_MASK_SCATTER_STORE:
     case IFN_LEN_STORE:
+    case IFN_LEN_MASK_STORE:
       return true;
 
     default:
@@ -4498,6 +4500,10 @@ internal_fn_mask_index (internal_fn fn)
     case IFN_MASK_SCATTER_STORE:
       return 4;
 
+    case IFN_LEN_MASK_LOAD:
+    case IFN_LEN_MASK_STORE:
+      return 3;
+
     default:
       return (conditional_internal_fn_code (fn) != ERROR_MARK
 	      || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);
@@ -4519,6 +4525,9 @@ internal_fn_stored_value_index (internal_fn fn)
     case IFN_LEN_STORE:
       return 3;
 
+    case IFN_LEN_MASK_STORE:
+      return 4;
+
     default:
       return -1;
     }
@@ -4583,13 +4592,33 @@ internal_len_load_store_bias (internal_fn ifn, machine_mode mode)
 {
   optab optab = direct_internal_fn_optab (ifn);
   insn_code icode = direct_optab_handler (optab, mode);
+  int bias_opno = 3;
+
+  if (icode == CODE_FOR_nothing)
+    {
+      machine_mode mask_mode;
+      if (!targetm.vectorize.get_mask_mode (mode).exists (&mask_mode))
+	return VECT_PARTIAL_BIAS_UNSUPPORTED;
+      if (ifn == IFN_LEN_LOAD)
+	{
+	  /* Try LEN_MASK_LOAD.  */
+	  optab = direct_internal_fn_optab (IFN_LEN_MASK_LOAD);
+	}
+      else
+	{
+	  /* Try LEN_MASK_STORE.  */
+	  optab = direct_internal_fn_optab (IFN_LEN_MASK_STORE);
+	}
+      icode = convert_optab_handler (optab, mode, mask_mode);
+      bias_opno = 4;
+    }
 
   if (icode != CODE_FOR_nothing)
     {
       /* For now we only support biases of 0 or -1.  Try both of them.  */
-      if (insn_operand_matches (icode, 3, GEN_INT (0)))
+      if (insn_operand_matches (icode, bias_opno, GEN_INT (0)))
 	return 0;
-      if (insn_operand_matches (icode, 3, GEN_INT (-1)))
+      if (insn_operand_matches (icode, bias_opno, GEN_INT (-1)))
 	return -1;
     }
 
diff --git a/gcc/optabs-tree.cc b/gcc/optabs-tree.cc
index 77bf745ae40..e6ae15939d3 100644
--- a/gcc/optabs-tree.cc
+++ b/gcc/optabs-tree.cc
@@ -543,19 +543,50 @@ target_supports_op_p (tree type, enum tree_code code,
 	  && optab_handler (ot, TYPE_MODE (type)) != CODE_FOR_nothing);
 }
 
-/* Return true if target supports vector masked load/store for mode.  */
+/* Return true if the target has support for masked load/store.
+   We can support masked load/store by either mask{load,store}
+   or len_mask{load,store}.
+   This helper function checks whether the target supports masked
+   load/store and, if IFN is nonnull, stores the corresponding function
+   in *IFN (IFN_MASK_{LOAD,STORE} or IFN_LEN_MASK_{LOAD,STORE}).  */
+
+static bool
+target_supports_mask_load_store_p (machine_mode mode, machine_mode mask_mode,
+				   bool is_load, internal_fn *ifn)
+{
+  optab op = is_load ? maskload_optab : maskstore_optab;
+  optab len_op = is_load ? len_maskload_optab : len_maskstore_optab;
+  if (convert_optab_handler (op, mode, mask_mode) != CODE_FOR_nothing)
+    {
+      if (ifn)
+	*ifn = is_load ? IFN_MASK_LOAD : IFN_MASK_STORE;
+      return true;
+    }
+  else if (convert_optab_handler (len_op, mode, mask_mode) != CODE_FOR_nothing)
+    {
+      if (ifn)
+	*ifn = is_load ? IFN_LEN_MASK_LOAD : IFN_LEN_MASK_STORE;
+      return true;
+    }
+  return false;
+}
+
+/* Return true if target supports vector masked load/store for mode.
+   The last argument, IFN, is an optional output pointer.  If it is
+   nonnull, set *IFN to IFN_MASK_{LOAD,STORE} or IFN_LEN_MASK_{LOAD,STORE}
+   according to which optab the target supports.  */
 
 bool
 can_vec_mask_load_store_p (machine_mode mode,
 			   machine_mode mask_mode,
-			   bool is_load)
+			   bool is_load,
+			   internal_fn *ifn)
 {
-  optab op = is_load ? maskload_optab : maskstore_optab;
   machine_mode vmode;
 
   /* If mode is vector mode, check it directly.  */
   if (VECTOR_MODE_P (mode))
-    return convert_optab_handler (op, mode, mask_mode) != CODE_FOR_nothing;
+    return target_supports_mask_load_store_p (mode, mask_mode, is_load, ifn);
 
   /* Otherwise, return true if there is some vector mode with
      the mask load/store supported.  */
@@ -569,7 +600,7 @@ can_vec_mask_load_store_p (machine_mode mode,
   vmode = targetm.vectorize.preferred_simd_mode (smode);
   if (VECTOR_MODE_P (vmode)
       && targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode)
-      && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
+      && target_supports_mask_load_store_p (vmode, mask_mode, is_load, ifn))
     return true;
 
   auto_vector_modes vector_modes;
@@ -577,33 +608,66 @@ can_vec_mask_load_store_p (machine_mode mode,
   for (machine_mode base_mode : vector_modes)
     if (related_vector_mode (base_mode, smode).exists (&vmode)
 	&& targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode)
-	&& convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
+	&& target_supports_mask_load_store_p (vmode, mask_mode, is_load, ifn))
       return true;
   return false;
 }
 
+/* Return true if the target has support for len load/store.
+   We can support len load/store by either len_{load,store}
+   or len_mask{load,store}.
+   This helper function checks whether the target supports len
+   load/store and, if IFN is nonnull, stores the corresponding function
+   in *IFN (IFN_LEN_{LOAD,STORE} or IFN_LEN_MASK_{LOAD,STORE}).  */
+
+static bool
+target_supports_len_load_store_p (machine_mode mode, bool is_load,
+				  internal_fn *ifn)
+{
+  optab op = is_load ? len_load_optab : len_store_optab;
+  optab masked_op = is_load ? len_maskload_optab : len_maskstore_optab;
+
+  if (direct_optab_handler (op, mode))
+    {
+      if (ifn)
+	*ifn = is_load ? IFN_LEN_LOAD : IFN_LEN_STORE;
+      return true;
+    }
+  machine_mode mask_mode;
+  if (targetm.vectorize.get_mask_mode (mode).exists (&mask_mode)
+      && convert_optab_handler (masked_op, mode, mask_mode) != CODE_FOR_nothing)
+    {
+      if (ifn)
+	*ifn = is_load ? IFN_LEN_MASK_LOAD : IFN_LEN_MASK_STORE;
+      return true;
+    }
+  return false;
+}
+
 /* If target supports vector load/store with length for vector mode MODE,
    return the corresponding vector mode, otherwise return opt_machine_mode ().
    There are two flavors for vector load/store with length, one is to measure
    length with bytes, the other is to measure length with lanes.
    As len_{load,store} optabs point out, for the flavor with bytes, we use
-   VnQI to wrap the other supportable same size vector modes.  */
+   VnQI to wrap the other supportable same size vector modes.
+   The last argument, IFN, is an optional output pointer.  If it is
+   nonnull, set *IFN to IFN_LEN_{LOAD,STORE} or IFN_LEN_MASK_{LOAD,STORE}
+   according to which optab the target supports.  */
 
 opt_machine_mode
-get_len_load_store_mode (machine_mode mode, bool is_load)
+get_len_load_store_mode (machine_mode mode, bool is_load, internal_fn *ifn)
 {
-  optab op = is_load ? len_load_optab : len_store_optab;
   gcc_assert (VECTOR_MODE_P (mode));
 
   /* Check if length in lanes supported for this mode directly.  */
-  if (direct_optab_handler (op, mode))
+  if (target_supports_len_load_store_p (mode, is_load, ifn))
     return mode;
 
   /* Check if length in bytes supported for same vector size VnQI.  */
   machine_mode vmode;
   poly_uint64 nunits = GET_MODE_SIZE (mode);
   if (related_vector_mode (mode, QImode, nunits).exists (&vmode)
-      && direct_optab_handler (op, vmode))
+      && target_supports_len_load_store_p (vmode, is_load, ifn))
     return vmode;
 
   return opt_machine_mode ();
diff --git a/gcc/optabs-tree.h b/gcc/optabs-tree.h
index a3f79b6bd43..e421fc24289 100644
--- a/gcc/optabs-tree.h
+++ b/gcc/optabs-tree.h
@@ -47,7 +47,9 @@ bool expand_vec_cond_expr_p (tree, tree, enum tree_code);
 void init_tree_optimization_optabs (tree);
 bool target_supports_op_p (tree, enum tree_code,
 			   enum optab_subtype = optab_default);
-bool can_vec_mask_load_store_p (machine_mode, machine_mode, bool);
-opt_machine_mode get_len_load_store_mode (machine_mode, bool);
+bool can_vec_mask_load_store_p (machine_mode, machine_mode, bool,
+				internal_fn * = nullptr);
+opt_machine_mode get_len_load_store_mode (machine_mode, bool,
+					  internal_fn * = nullptr);
 
 #endif
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 056a0ecb2be..44fb2507efb 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1819,16 +1819,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   machine_mode mask_mode;
-  bool using_partial_vectors_p = false;
-  if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
-      && can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
-    {
-      nvectors = group_memory_nvectors (group_size * vf, nunits);
-      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
-      using_partial_vectors_p = true;
-    }
-
   machine_mode vmode;
+  bool using_partial_vectors_p = false;
   if (get_len_load_store_mode (vecmode, is_load).exists (&vmode))
     {
       nvectors = group_memory_nvectors (group_size * vf, nunits);
@@ -1837,6 +1829,13 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
       vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, factor);
       using_partial_vectors_p = true;
     }
+  else if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
+	   && can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
+    {
+      nvectors = group_memory_nvectors (group_size * vf, nunits);
+      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
+      using_partial_vectors_p = true;
+    }
 
   if (!using_partial_vectors_p)
     {
@@ -8944,30 +8943,63 @@ vectorizable_store (vec_info *vinfo,
 		  vec_oprnd = new_temp;
 		}
 
-	      /* Arguments are ready.  Create the new vector stmt.  */
-	      if (final_mask)
-		{
-		  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
-		  gcall *call
-		    = gimple_build_call_internal (IFN_MASK_STORE, 4,
-						  dataref_ptr, ptr,
-						  final_mask, vec_oprnd);
-		  gimple_call_set_nothrow (call, true);
-		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
-		  new_stmt = call;
-		}
-	      else if (loop_lens)
+	      /* Compute IFN when LOOP_LENS or final_mask valid.  */
+	      machine_mode vmode = TYPE_MODE (vectype);
+	      machine_mode new_vmode = vmode;
+	      internal_fn partial_ifn = IFN_LAST;
+	      /* Produce 'len' and 'bias' argument.  */
+	      tree final_len = NULL_TREE;
+	      tree bias = NULL_TREE;
+	      if (loop_lens)
 		{
-		  machine_mode vmode = TYPE_MODE (vectype);
 		  opt_machine_mode new_ovmode
-		    = get_len_load_store_mode (vmode, false);
-		  machine_mode new_vmode = new_ovmode.require ();
+		    = get_len_load_store_mode (vmode, false, &partial_ifn);
+		  new_vmode = new_ovmode.require ();
 		  unsigned factor
 		    = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
-		  tree final_len
-		    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
-					 vec_num * ncopies, vectype,
-					 vec_num * j + i, factor);
+		  final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+						 vec_num * ncopies, vectype,
+						 vec_num * j + i, factor);
+		}
+	      else if (final_mask)
+		{
+		  if (!can_vec_mask_load_store_p (vmode,
+						  TYPE_MODE (mask_vectype),
+						  false, &partial_ifn))
+		    gcc_unreachable ();
+		}
+
+	      if (partial_ifn == IFN_LEN_MASK_STORE)
+		{
+		  if (!final_len)
+		    {
+		      /* Pass VF value to 'len' argument of
+		         LEN_MASK_STORE if LOOP_LENS is invalid.  */
+		      tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+		      final_len
+			= build_int_cst (iv_type,
+					 TYPE_VECTOR_SUBPARTS (vectype));
+		    }
+		  if (!final_mask)
+		    {
+		      /* Pass all ones value to 'mask' argument of
+			 LEN_MASK_STORE if final_mask is invalid.  */
+		      mask_vectype = truth_type_for (vectype);
+		      final_mask = build_minus_one_cst (mask_vectype);
+		    }
+		}
+	      if (final_len)
+		{
+		  signed char biasval
+		    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+
+		  bias = build_int_cst (intQI_type_node, biasval);
+		}
+
+	      /* Arguments are ready.  Create the new vector stmt.  */
+	      if (final_len)
+		{
+		  gcall *call;
 		  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
 		  /* Need conversion if it's wrapped with VnQI.  */
 		  if (vmode != new_vmode)
@@ -8987,14 +9019,27 @@ vectorizable_store (vec_info *vinfo,
 		      vec_oprnd = var;
 		    }
 
-		  signed char biasval =
-		    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
-
-		  tree bias = build_int_cst (intQI_type_node, biasval);
+		  if (partial_ifn == IFN_LEN_MASK_STORE)
+		    call = gimple_build_call_internal (IFN_LEN_MASK_STORE, 6,
+						       dataref_ptr, ptr,
+						       final_len, final_mask,
+						       vec_oprnd, bias);
+		  else
+		    call
+		      = gimple_build_call_internal (IFN_LEN_STORE, 5,
+						    dataref_ptr, ptr, final_len,
+						    vec_oprnd, bias);
+		  gimple_call_set_nothrow (call, true);
+		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
+		  new_stmt = call;
+		}
+	      else if (final_mask)
+		{
+		  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
 		  gcall *call
-		    = gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr,
-						  ptr, final_len, vec_oprnd,
-						  bias);
+		    = gimple_build_call_internal (IFN_MASK_STORE, 4,
+						  dataref_ptr, ptr,
+						  final_mask, vec_oprnd);
 		  gimple_call_set_nothrow (call, true);
 		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
 		  new_stmt = call;
@@ -10304,45 +10349,77 @@ vectorizable_load (vec_info *vinfo,
 					      align, misalign);
 		    align = least_bit_hwi (misalign | align);
 
-		    if (final_mask)
-		      {
-			tree ptr = build_int_cst (ref_type,
-						  align * BITS_PER_UNIT);
-			gcall *call
-			  = gimple_build_call_internal (IFN_MASK_LOAD, 3,
-							dataref_ptr, ptr,
-							final_mask);
-			gimple_call_set_nothrow (call, true);
-			new_stmt = call;
-			data_ref = NULL_TREE;
-		      }
-		    else if (loop_lens && memory_access_type != VMAT_INVARIANT)
+		    /* Compute IFN when LOOP_LENS or final_mask valid.  */
+		    machine_mode vmode = TYPE_MODE (vectype);
+		    machine_mode new_vmode = vmode;
+		    internal_fn partial_ifn = IFN_LAST;
+		    /* Produce 'len' and 'bias' argument.  */
+		    tree final_len = NULL_TREE;
+		    tree bias = NULL_TREE;
+		    if (loop_lens)
 		      {
-			machine_mode vmode = TYPE_MODE (vectype);
 			opt_machine_mode new_ovmode
-			  = get_len_load_store_mode (vmode, true);
-			machine_mode new_vmode = new_ovmode.require ();
+			  = get_len_load_store_mode (vmode, true,
+						     &partial_ifn);
+			new_vmode = new_ovmode.require ();
 			unsigned factor = (new_ovmode == vmode)
 					    ? 1
 					    : GET_MODE_UNIT_SIZE (vmode);
-			tree final_len
+			final_len
 			  = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
 					       vec_num * ncopies, vectype,
 					       vec_num * j + i, factor);
-			tree ptr
-			  = build_int_cst (ref_type, align * BITS_PER_UNIT);
-
-			tree qi_type = unsigned_intQI_type_node;
+		      }
+		    else if (final_mask)
+		      {
+			if (!can_vec_mask_load_store_p (
+			      vmode, TYPE_MODE (mask_vectype), true,
+			      &partial_ifn))
+			  gcc_unreachable ();
+		      }
 
-			signed char biasval =
-			  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+		    if (partial_ifn == IFN_LEN_MASK_LOAD)
+		      {
+			if (!final_len)
+			  {
+			    /* Pass VF value to 'len' argument of
+			       LEN_MASK_LOAD if LOOP_LENS is invalid.  */
+			    tree iv_type
+			      = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+			    final_len
+			      = build_int_cst (iv_type,
+					       TYPE_VECTOR_SUBPARTS (vectype));
+			  }
+			if (!final_mask)
+			  {
+			    /* Pass all ones value to 'mask' argument of
+			       LEN_MASK_LOAD if final_mask is invalid.  */
+			    mask_vectype = truth_type_for (vectype);
+			    final_mask = build_minus_one_cst (mask_vectype);
+			  }
+		      }
+		    if (final_len)
+		      {
+			signed char biasval
+			  = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
 
-			tree bias = build_int_cst (intQI_type_node, biasval);
+			bias = build_int_cst (intQI_type_node, biasval);
+		      }
 
-			gcall *call
-			  = gimple_build_call_internal (IFN_LEN_LOAD, 4,
-							dataref_ptr, ptr,
-							final_len, bias);
+		    if (final_len && memory_access_type != VMAT_INVARIANT)
+		      {
+			tree ptr
+			  = build_int_cst (ref_type, align * BITS_PER_UNIT);
+			gcall *call;
+			if (partial_ifn == IFN_LEN_MASK_LOAD)
+			  call = gimple_build_call_internal (IFN_LEN_MASK_LOAD,
+							     5, dataref_ptr,
+							     ptr, final_len,
+							     final_mask, bias);
+			else
+			  call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
+							     dataref_ptr, ptr,
+							     final_len, bias);
 			gimple_call_set_nothrow (call, true);
 			new_stmt = call;
 			data_ref = NULL_TREE;
@@ -10350,8 +10427,8 @@ vectorizable_load (vec_info *vinfo,
 			/* Need conversion if it's wrapped with VnQI.  */
 			if (vmode != new_vmode)
 			  {
-			    tree new_vtype
-			      = build_vector_type_for_mode (qi_type, new_vmode);
+			    tree new_vtype = build_vector_type_for_mode (
+			      unsigned_intQI_type_node, new_vmode);
 			    tree var = vect_get_new_ssa_name (new_vtype,
 							      vect_simple_var);
 			    gimple_set_lhs (call, var);
@@ -10363,6 +10440,18 @@ vectorizable_load (vec_info *vinfo,
 						     VIEW_CONVERT_EXPR, op);
 			  }
 		      }
+		    else if (final_mask)
+		      {
+			tree ptr = build_int_cst (ref_type,
+						  align * BITS_PER_UNIT);
+			gcall *call
+			  = gimple_build_call_internal (IFN_MASK_LOAD, 3,
+							dataref_ptr, ptr,
+							final_mask);
+			gimple_call_set_nothrow (call, true);
+			new_stmt = call;
+			data_ref = NULL_TREE;
+		      }
 		    else
 		      {
 			tree ltype = vectype;
-- 
2.36.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH V6] VECT: Apply LEN_MASK_{LOAD,STORE} into vectorizer
  2023-06-22 23:51 [PATCH V6] VECT: Apply LEN_MASK_{LOAD,STORE} into vectorizer juzhe.zhong
@ 2023-06-23  7:55 ` Bernhard Reutner-Fischer
  2023-06-23  8:03   ` Richard Sandiford
  2023-06-23  8:08 ` Richard Sandiford
  1 sibling, 1 reply; 8+ messages in thread
From: Bernhard Reutner-Fischer @ 2023-06-23  7:55 UTC (permalink / raw)
  To: juzhe.zhong, gcc-patches; +Cc: rguenther, richard.sandiford, Ju-Zhe Zhong

On 23 June 2023 01:51:12 CEST, juzhe.zhong@rivai.ai wrote:
>From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

I am sorry but I somehow overlooked a trivial spot in V5.
Nit which does not warrant an immediate next version, but please consider it before pushing iff approved:

>+	      if (final_len)
>+		{
>+		  signed char biasval
>+		    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>+
>+		  bias = build_int_cst (intQI_type_node, biasval);
>+		}
>+
>+	      /* Arguments are ready.  Create the new vector stmt.  */
>+	      if (final_len)
>+		{

Fuse the block below into the one above as the condition seems to be identical?
thanks,

>+		  gcall *call;
> 		  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> 		  /* Need conversion if it's wrapped with VnQI.  */
> 		  if (vmode != new_vmode)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH V6] VECT: Apply LEN_MASK_{LOAD,STORE} into vectorizer
  2023-06-23  7:55 ` Bernhard Reutner-Fischer
@ 2023-06-23  8:03   ` Richard Sandiford
  2023-06-23  8:37     ` 钟居哲
  2023-06-23  9:38     ` Bernhard Reutner-Fischer
  0 siblings, 2 replies; 8+ messages in thread
From: Richard Sandiford @ 2023-06-23  8:03 UTC (permalink / raw)
  To: Bernhard Reutner-Fischer; +Cc: juzhe.zhong, gcc-patches, rguenther

Bernhard Reutner-Fischer <rep.dot.nop@gmail.com> writes:
> On 23 June 2023 01:51:12 CEST, juzhe.zhong@rivai.ai wrote:
>>From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> I am sorry but I somehow overlooked a trivial spot in V5.
> Nit which does not warrant an immediate next version, but please consider it before pushing iff approved:
>
>>+	      if (final_len)
>>+		{
>>+		  signed char biasval
>>+		    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>>+
>>+		  bias = build_int_cst (intQI_type_node, biasval);
>>+		}
>>+
>>+	      /* Arguments are ready.  Create the new vector stmt.  */
>>+	      if (final_len)
>>+		{
>
> Fuse the block below into the one above as the condition seems to be identical?

Yeah, true, but I think the idea is that the code above “Arguments are
ready” is calculating argument values, and the code after it is creating
code.  These are two separate steps, and the fact that the two final_len
blocks end up being consecutive is something of a coincidence.

So personally I think we should keep the structure in the patch.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH V6] VECT: Apply LEN_MASK_{LOAD,STORE} into vectorizer
  2023-06-22 23:51 [PATCH V6] VECT: Apply LEN_MASK_{LOAD,STORE} into vectorizer juzhe.zhong
  2023-06-23  7:55 ` Bernhard Reutner-Fischer
@ 2023-06-23  8:08 ` Richard Sandiford
  2023-06-23  8:10   ` 钟居哲
  1 sibling, 1 reply; 8+ messages in thread
From: Richard Sandiford @ 2023-06-23  8:08 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: gcc-patches, rguenther, rep.dot.nop

juzhe.zhong@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Address comments from Richard and Bernhard from V5 patch.
> V6 fixed all issues according their comments.
>
> gcc/ChangeLog:
>
>         * internal-fn.cc (expand_partial_store_optab_fn): Adapt for LEN_MASK_STORE.
>         (internal_load_fn_p): Add LEN_MASK_LOAD.
>         (internal_store_fn_p): Add LEN_MASK_STORE.
>         (internal_fn_mask_index): Add LEN_MASK_{LOAD,STORE}.
>         (internal_fn_stored_value_index): Add LEN_MASK_STORE.
>         (internal_len_load_store_bias):  Add LEN_MASK_{LOAD,STORE}.
>         * optabs-tree.cc (can_vec_mask_load_store_p): Adapt for LEN_MASK_{LOAD,STORE}.
>         (get_len_load_store_mode): Ditto.
>         * optabs-tree.h (can_vec_mask_load_store_p): Ditto.
>         (get_len_load_store_mode): Ditto.
>         * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
>         (get_all_ones_mask): New function.
>         (vectorizable_store): Apply LEN_MASK_{LOAD,STORE} into vectorizer.
>         (vectorizable_load): Ditto.

Given Richard was happy with the previous version and this addresses
my comments from V5: OK, thanks.

Richard

>
> ---
>  gcc/internal-fn.cc     |  37 ++++++-
>  gcc/optabs-tree.cc     |  86 ++++++++++++++--
>  gcc/optabs-tree.h      |   6 +-
>  gcc/tree-vect-stmts.cc | 221 +++++++++++++++++++++++++++++------------
>  4 files changed, 267 insertions(+), 83 deletions(-)
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index c911ae790cb..1c2fd487e2a 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -2949,7 +2949,7 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>   * OPTAB.  */
>  
>  static void
> -expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> +expand_partial_store_optab_fn (internal_fn ifn, gcall *stmt, convert_optab optab)
>  {
>    class expand_operand ops[5];
>    tree type, lhs, rhs, maskt, biast;
> @@ -2957,7 +2957,7 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>    insn_code icode;
>  
>    maskt = gimple_call_arg (stmt, 2);
> -  rhs = gimple_call_arg (stmt, 3);
> +  rhs = gimple_call_arg (stmt, internal_fn_stored_value_index (ifn));
>    type = TREE_TYPE (rhs);
>    lhs = expand_call_mem_ref (type, stmt, 0);
>  
> @@ -4435,6 +4435,7 @@ internal_load_fn_p (internal_fn fn)
>      case IFN_GATHER_LOAD:
>      case IFN_MASK_GATHER_LOAD:
>      case IFN_LEN_LOAD:
> +    case IFN_LEN_MASK_LOAD:
>        return true;
>  
>      default:
> @@ -4455,6 +4456,7 @@ internal_store_fn_p (internal_fn fn)
>      case IFN_SCATTER_STORE:
>      case IFN_MASK_SCATTER_STORE:
>      case IFN_LEN_STORE:
> +    case IFN_LEN_MASK_STORE:
>        return true;
>  
>      default:
> @@ -4498,6 +4500,10 @@ internal_fn_mask_index (internal_fn fn)
>      case IFN_MASK_SCATTER_STORE:
>        return 4;
>  
> +    case IFN_LEN_MASK_LOAD:
> +    case IFN_LEN_MASK_STORE:
> +      return 3;
> +
>      default:
>        return (conditional_internal_fn_code (fn) != ERROR_MARK
>  	      || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);
> @@ -4519,6 +4525,9 @@ internal_fn_stored_value_index (internal_fn fn)
>      case IFN_LEN_STORE:
>        return 3;
>  
> +    case IFN_LEN_MASK_STORE:
> +      return 4;
> +
>      default:
>        return -1;
>      }
> @@ -4583,13 +4592,33 @@ internal_len_load_store_bias (internal_fn ifn, machine_mode mode)
>  {
>    optab optab = direct_internal_fn_optab (ifn);
>    insn_code icode = direct_optab_handler (optab, mode);
> +  int bias_opno = 3;
> +
> +  if (icode == CODE_FOR_nothing)
> +    {
> +      machine_mode mask_mode;
> +      if (!targetm.vectorize.get_mask_mode (mode).exists (&mask_mode))
> +	return VECT_PARTIAL_BIAS_UNSUPPORTED;
> +      if (ifn == IFN_LEN_LOAD)
> +	{
> +	  /* Try LEN_MASK_LOAD.  */
> +	  optab = direct_internal_fn_optab (IFN_LEN_MASK_LOAD);
> +	}
> +      else
> +	{
> +	  /* Try LEN_MASK_STORE.  */
> +	  optab = direct_internal_fn_optab (IFN_LEN_MASK_STORE);
> +	}
> +      icode = convert_optab_handler (optab, mode, mask_mode);
> +      bias_opno = 4;
> +    }
>  
>    if (icode != CODE_FOR_nothing)
>      {
>        /* For now we only support biases of 0 or -1.  Try both of them.  */
> -      if (insn_operand_matches (icode, 3, GEN_INT (0)))
> +      if (insn_operand_matches (icode, bias_opno, GEN_INT (0)))
>  	return 0;
> -      if (insn_operand_matches (icode, 3, GEN_INT (-1)))
> +      if (insn_operand_matches (icode, bias_opno, GEN_INT (-1)))
>  	return -1;
>      }
>  
> diff --git a/gcc/optabs-tree.cc b/gcc/optabs-tree.cc
> index 77bf745ae40..e6ae15939d3 100644
> --- a/gcc/optabs-tree.cc
> +++ b/gcc/optabs-tree.cc
> @@ -543,19 +543,50 @@ target_supports_op_p (tree type, enum tree_code code,
>  	  && optab_handler (ot, TYPE_MODE (type)) != CODE_FOR_nothing);
>  }
>  
> -/* Return true if target supports vector masked load/store for mode.  */
> +/* Return true if the target has support for masked load/store.
> +   We can support masked load/store by either mask{load,store}
> +   or len_mask{load,store}.
> +   This helper function checks whether target supports masked
> +   load/store and return corresponding IFN in the last argument
> +   (IFN_MASK_{LOAD,STORE} or IFN_LEN_MASK_{LOAD,STORE}).  */
> +
> +static bool
> +target_supports_mask_load_store_p (machine_mode mode, machine_mode mask_mode,
> +				   bool is_load, internal_fn *ifn)
> +{
> +  optab op = is_load ? maskload_optab : maskstore_optab;
> +  optab len_op = is_load ? len_maskload_optab : len_maskstore_optab;
> +  if (convert_optab_handler (op, mode, mask_mode) != CODE_FOR_nothing)
> +    {
> +      if (ifn)
> +	*ifn = is_load ? IFN_MASK_LOAD : IFN_MASK_STORE;
> +      return true;
> +    }
> +  else if (convert_optab_handler (len_op, mode, mask_mode) != CODE_FOR_nothing)
> +    {
> +      if (ifn)
> +	*ifn = is_load ? IFN_LEN_MASK_LOAD : IFN_LEN_MASK_STORE;
> +      return true;
> +    }
> +  return false;
> +}
> +
> +/* Return true if target supports vector masked load/store for mode.
> +   An additional output in the last argument which is the IFN pointer.
> +   We set IFN as MASK_{LOAD,STORE} or LEN_MASK_{LOAD,STORE} according
> +   which optab is supported in the target.  */
>  
>  bool
>  can_vec_mask_load_store_p (machine_mode mode,
>  			   machine_mode mask_mode,
> -			   bool is_load)
> +			   bool is_load,
> +			   internal_fn *ifn)
>  {
> -  optab op = is_load ? maskload_optab : maskstore_optab;
>    machine_mode vmode;
>  
>    /* If mode is vector mode, check it directly.  */
>    if (VECTOR_MODE_P (mode))
> -    return convert_optab_handler (op, mode, mask_mode) != CODE_FOR_nothing;
> +    return target_supports_mask_load_store_p (mode, mask_mode, is_load, ifn);
>  
>    /* Otherwise, return true if there is some vector mode with
>       the mask load/store supported.  */
> @@ -569,7 +600,7 @@ can_vec_mask_load_store_p (machine_mode mode,
>    vmode = targetm.vectorize.preferred_simd_mode (smode);
>    if (VECTOR_MODE_P (vmode)
>        && targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode)
> -      && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
> +      && target_supports_mask_load_store_p (vmode, mask_mode, is_load, ifn))
>      return true;
>  
>    auto_vector_modes vector_modes;
> @@ -577,33 +608,66 @@ can_vec_mask_load_store_p (machine_mode mode,
>    for (machine_mode base_mode : vector_modes)
>      if (related_vector_mode (base_mode, smode).exists (&vmode)
>  	&& targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode)
> -	&& convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
> +	&& target_supports_mask_load_store_p (vmode, mask_mode, is_load, ifn))
>        return true;
>    return false;
>  }
>  
> +/* Return true if the target has support for len load/store.
> +   We can support len load/store by either len_{load,store}
> +   or len_mask{load,store}.
> +   This helper function checks whether target supports len
> +   load/store and return corresponding IFN in the last argument
> +   (IFN_LEN_{LOAD,STORE} or IFN_LEN_MASK_{LOAD,STORE}).  */
> +
> +static bool
> +target_supports_len_load_store_p (machine_mode mode, bool is_load,
> +				  internal_fn *ifn)
> +{
> +  optab op = is_load ? len_load_optab : len_store_optab;
> +  optab masked_op = is_load ? len_maskload_optab : len_maskstore_optab;
> +
> +  if (direct_optab_handler (op, mode))
> +    {
> +      if (ifn)
> +	*ifn = is_load ? IFN_LEN_LOAD : IFN_LEN_STORE;
> +      return true;
> +    }
> +  machine_mode mask_mode;
> +  if (targetm.vectorize.get_mask_mode (mode).exists (&mask_mode)
> +      && convert_optab_handler (masked_op, mode, mask_mode) != CODE_FOR_nothing)
> +    {
> +      if (ifn)
> +	*ifn = is_load ? IFN_LEN_MASK_LOAD : IFN_LEN_MASK_STORE;
> +      return true;
> +    }
> +  return false;
> +}
> +
>  /* If target supports vector load/store with length for vector mode MODE,
>     return the corresponding vector mode, otherwise return opt_machine_mode ().
>     There are two flavors for vector load/store with length, one is to measure
>     length with bytes, the other is to measure length with lanes.
>     As len_{load,store} optabs point out, for the flavor with bytes, we use
> -   VnQI to wrap the other supportable same size vector modes.  */
> +   VnQI to wrap the other supportable same size vector modes.
> +   An additional output in the last argument which is the IFN pointer.
> +   We set IFN as LEN_{LOAD,STORE} or LEN_MASK_{LOAD,STORE} according
> +   which optab is supported in the target.  */
>  
>  opt_machine_mode
> -get_len_load_store_mode (machine_mode mode, bool is_load)
> +get_len_load_store_mode (machine_mode mode, bool is_load, internal_fn *ifn)
>  {
> -  optab op = is_load ? len_load_optab : len_store_optab;
>    gcc_assert (VECTOR_MODE_P (mode));
>  
>    /* Check if length in lanes supported for this mode directly.  */
> -  if (direct_optab_handler (op, mode))
> +  if (target_supports_len_load_store_p (mode, is_load, ifn))
>      return mode;
>  
>    /* Check if length in bytes supported for same vector size VnQI.  */
>    machine_mode vmode;
>    poly_uint64 nunits = GET_MODE_SIZE (mode);
>    if (related_vector_mode (mode, QImode, nunits).exists (&vmode)
> -      && direct_optab_handler (op, vmode))
> +      && target_supports_len_load_store_p (vmode, is_load, ifn))
>      return vmode;
>  
>    return opt_machine_mode ();
> diff --git a/gcc/optabs-tree.h b/gcc/optabs-tree.h
> index a3f79b6bd43..e421fc24289 100644
> --- a/gcc/optabs-tree.h
> +++ b/gcc/optabs-tree.h
> @@ -47,7 +47,9 @@ bool expand_vec_cond_expr_p (tree, tree, enum tree_code);
>  void init_tree_optimization_optabs (tree);
>  bool target_supports_op_p (tree, enum tree_code,
>  			   enum optab_subtype = optab_default);
> -bool can_vec_mask_load_store_p (machine_mode, machine_mode, bool);
> -opt_machine_mode get_len_load_store_mode (machine_mode, bool);
> +bool can_vec_mask_load_store_p (machine_mode, machine_mode, bool,
> +				internal_fn * = nullptr);
> +opt_machine_mode get_len_load_store_mode (machine_mode, bool,
> +					  internal_fn * = nullptr);
>  
>  #endif
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 056a0ecb2be..44fb2507efb 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -1819,16 +1819,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
>    poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>    machine_mode mask_mode;
> -  bool using_partial_vectors_p = false;
> -  if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
> -      && can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
> -    {
> -      nvectors = group_memory_nvectors (group_size * vf, nunits);
> -      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
> -      using_partial_vectors_p = true;
> -    }
> -
>    machine_mode vmode;
> +  bool using_partial_vectors_p = false;
>    if (get_len_load_store_mode (vecmode, is_load).exists (&vmode))
>      {
>        nvectors = group_memory_nvectors (group_size * vf, nunits);
> @@ -1837,6 +1829,13 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>        vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, factor);
>        using_partial_vectors_p = true;
>      }
> +  else if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
> +	   && can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
> +    {
> +      nvectors = group_memory_nvectors (group_size * vf, nunits);
> +      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
> +      using_partial_vectors_p = true;
> +    }
>  
>    if (!using_partial_vectors_p)
>      {
> @@ -8944,30 +8943,63 @@ vectorizable_store (vec_info *vinfo,
>  		  vec_oprnd = new_temp;
>  		}
>  
> -	      /* Arguments are ready.  Create the new vector stmt.  */
> -	      if (final_mask)
> -		{
> -		  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> -		  gcall *call
> -		    = gimple_build_call_internal (IFN_MASK_STORE, 4,
> -						  dataref_ptr, ptr,
> -						  final_mask, vec_oprnd);
> -		  gimple_call_set_nothrow (call, true);
> -		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> -		  new_stmt = call;
> -		}
> -	      else if (loop_lens)
> +	      /* Compute IFN when LOOP_LENS or final_mask valid.  */
> +	      machine_mode vmode = TYPE_MODE (vectype);
> +	      machine_mode new_vmode = vmode;
> +	      internal_fn partial_ifn = IFN_LAST;
> +	      /* Produce 'len' and 'bias' argument.  */
> +	      tree final_len = NULL_TREE;
> +	      tree bias = NULL_TREE;
> +	      if (loop_lens)
>  		{
> -		  machine_mode vmode = TYPE_MODE (vectype);
>  		  opt_machine_mode new_ovmode
> -		    = get_len_load_store_mode (vmode, false);
> -		  machine_mode new_vmode = new_ovmode.require ();
> +		    = get_len_load_store_mode (vmode, false, &partial_ifn);
> +		  new_vmode = new_ovmode.require ();
>  		  unsigned factor
>  		    = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
> -		  tree final_len
> -		    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> -					 vec_num * ncopies, vectype,
> -					 vec_num * j + i, factor);
> +		  final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +						 vec_num * ncopies, vectype,
> +						 vec_num * j + i, factor);
> +		}
> +	      else if (final_mask)
> +		{
> +		  if (!can_vec_mask_load_store_p (vmode,
> +						  TYPE_MODE (mask_vectype),
> +						  false, &partial_ifn))
> +		    gcc_unreachable ();
> +		}
> +
> +	      if (partial_ifn == IFN_LEN_MASK_STORE)
> +		{
> +		  if (!final_len)
> +		    {
> +		      /* Pass VF value to 'len' argument of
> +		         LEN_MASK_STORE if LOOP_LENS is invalid.  */
> +		      tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +		      final_len
> +			= build_int_cst (iv_type,
> +					 TYPE_VECTOR_SUBPARTS (vectype));
> +		    }
> +		  if (!final_mask)
> +		    {
> +		      /* Pass all ones value to 'mask' argument of
> +			 LEN_MASK_STORE if final_mask is invalid.  */
> +		      mask_vectype = truth_type_for (vectype);
> +		      final_mask = build_minus_one_cst (mask_vectype);
> +		    }
> +		}
> +	      if (final_len)
> +		{
> +		  signed char biasval
> +		    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +
> +		  bias = build_int_cst (intQI_type_node, biasval);
> +		}
> +
> +	      /* Arguments are ready.  Create the new vector stmt.  */
> +	      if (final_len)
> +		{
> +		  gcall *call;
>  		  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
>  		  /* Need conversion if it's wrapped with VnQI.  */
>  		  if (vmode != new_vmode)
> @@ -8987,14 +9019,27 @@ vectorizable_store (vec_info *vinfo,
>  		      vec_oprnd = var;
>  		    }
>  
> -		  signed char biasval =
> -		    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> -
> -		  tree bias = build_int_cst (intQI_type_node, biasval);
> +		  if (partial_ifn == IFN_LEN_MASK_STORE)
> +		    call = gimple_build_call_internal (IFN_LEN_MASK_STORE, 6,
> +						       dataref_ptr, ptr,
> +						       final_len, final_mask,
> +						       vec_oprnd, bias);
> +		  else
> +		    call
> +		      = gimple_build_call_internal (IFN_LEN_STORE, 5,
> +						    dataref_ptr, ptr, final_len,
> +						    vec_oprnd, bias);
> +		  gimple_call_set_nothrow (call, true);
> +		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> +		  new_stmt = call;
> +		}
> +	      else if (final_mask)
> +		{
> +		  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
>  		  gcall *call
> -		    = gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr,
> -						  ptr, final_len, vec_oprnd,
> -						  bias);
> +		    = gimple_build_call_internal (IFN_MASK_STORE, 4,
> +						  dataref_ptr, ptr,
> +						  final_mask, vec_oprnd);
>  		  gimple_call_set_nothrow (call, true);
>  		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
>  		  new_stmt = call;
> @@ -10304,45 +10349,77 @@ vectorizable_load (vec_info *vinfo,
>  					      align, misalign);
>  		    align = least_bit_hwi (misalign | align);
>  
> -		    if (final_mask)
> -		      {
> -			tree ptr = build_int_cst (ref_type,
> -						  align * BITS_PER_UNIT);
> -			gcall *call
> -			  = gimple_build_call_internal (IFN_MASK_LOAD, 3,
> -							dataref_ptr, ptr,
> -							final_mask);
> -			gimple_call_set_nothrow (call, true);
> -			new_stmt = call;
> -			data_ref = NULL_TREE;
> -		      }
> -		    else if (loop_lens && memory_access_type != VMAT_INVARIANT)
> +		    /* Compute IFN when LOOP_LENS or final_mask valid.  */
> +		    machine_mode vmode = TYPE_MODE (vectype);
> +		    machine_mode new_vmode = vmode;
> +		    internal_fn partial_ifn = IFN_LAST;
> +		    /* Produce 'len' and 'bias' argument.  */
> +		    tree final_len = NULL_TREE;
> +		    tree bias = NULL_TREE;
> +		    if (loop_lens)
>  		      {
> -			machine_mode vmode = TYPE_MODE (vectype);
>  			opt_machine_mode new_ovmode
> -			  = get_len_load_store_mode (vmode, true);
> -			machine_mode new_vmode = new_ovmode.require ();
> +			  = get_len_load_store_mode (vmode, true,
> +						     &partial_ifn);
> +			new_vmode = new_ovmode.require ();
>  			unsigned factor = (new_ovmode == vmode)
>  					    ? 1
>  					    : GET_MODE_UNIT_SIZE (vmode);
> -			tree final_len
> +			final_len
>  			  = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
>  					       vec_num * ncopies, vectype,
>  					       vec_num * j + i, factor);
> -			tree ptr
> -			  = build_int_cst (ref_type, align * BITS_PER_UNIT);
> -
> -			tree qi_type = unsigned_intQI_type_node;
> +		      }
> +		    else if (final_mask)
> +		      {
> +			if (!can_vec_mask_load_store_p (
> +			      vmode, TYPE_MODE (mask_vectype), true,
> +			      &partial_ifn))
> +			  gcc_unreachable ();
> +		      }
>  
> -			signed char biasval =
> -			  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +		    if (partial_ifn == IFN_LEN_MASK_LOAD)
> +		      {
> +			if (!final_len)
> +			  {
> +			    /* Pass VF value to 'len' argument of
> +			       LEN_MASK_LOAD if LOOP_LENS is invalid.  */
> +			    tree iv_type
> +			      = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +			    final_len
> +			      = build_int_cst (iv_type,
> +					       TYPE_VECTOR_SUBPARTS (vectype));
> +			  }
> +			if (!final_mask)
> +			  {
> +			    /* Pass all ones value to 'mask' argument of
> +			       LEN_MASK_LOAD if final_mask is invalid.  */
> +			    mask_vectype = truth_type_for (vectype);
> +			    final_mask = build_minus_one_cst (mask_vectype);
> +			  }
> +		      }
> +		    if (final_len)
> +		      {
> +			signed char biasval
> +			  = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>  
> -			tree bias = build_int_cst (intQI_type_node, biasval);
> +			bias = build_int_cst (intQI_type_node, biasval);
> +		      }
>  
> -			gcall *call
> -			  = gimple_build_call_internal (IFN_LEN_LOAD, 4,
> -							dataref_ptr, ptr,
> -							final_len, bias);
> +		    if (final_len && memory_access_type != VMAT_INVARIANT)
> +		      {
> +			tree ptr
> +			  = build_int_cst (ref_type, align * BITS_PER_UNIT);
> +			gcall *call;
> +			if (partial_ifn == IFN_LEN_MASK_LOAD)
> +			  call = gimple_build_call_internal (IFN_LEN_MASK_LOAD,
> +							     5, dataref_ptr,
> +							     ptr, final_len,
> +							     final_mask, bias);
> +			else
> +			  call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
> +							     dataref_ptr, ptr,
> +							     final_len, bias);
>  			gimple_call_set_nothrow (call, true);
>  			new_stmt = call;
>  			data_ref = NULL_TREE;
> @@ -10350,8 +10427,8 @@ vectorizable_load (vec_info *vinfo,
>  			/* Need conversion if it's wrapped with VnQI.  */
>  			if (vmode != new_vmode)
>  			  {
> -			    tree new_vtype
> -			      = build_vector_type_for_mode (qi_type, new_vmode);
> +			    tree new_vtype = build_vector_type_for_mode (
> +			      unsigned_intQI_type_node, new_vmode);
>  			    tree var = vect_get_new_ssa_name (new_vtype,
>  							      vect_simple_var);
>  			    gimple_set_lhs (call, var);
> @@ -10363,6 +10440,18 @@ vectorizable_load (vec_info *vinfo,
>  						     VIEW_CONVERT_EXPR, op);
>  			  }
>  		      }
> +		    else if (final_mask)
> +		      {
> +			tree ptr = build_int_cst (ref_type,
> +						  align * BITS_PER_UNIT);
> +			gcall *call
> +			  = gimple_build_call_internal (IFN_MASK_LOAD, 3,
> +							dataref_ptr, ptr,
> +							final_mask);
> +			gimple_call_set_nothrow (call, true);
> +			new_stmt = call;
> +			data_ref = NULL_TREE;
> +		      }
>  		    else
>  		      {
>  			tree ltype = vectype;

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: Re: [PATCH V6] VECT: Apply LEN_MASK_{LOAD,STORE} into vectorizer
  2023-06-23  8:08 ` Richard Sandiford
@ 2023-06-23  8:10   ` 钟居哲
  0 siblings, 0 replies; 8+ messages in thread
From: 钟居哲 @ 2023-06-23  8:10 UTC (permalink / raw)
  To: richard.sandiford; +Cc: gcc-patches, rguenther, Bernhard Reutner-Fischer

[-- Attachment #1: Type: text/plain, Size: 22078 bytes --]

Hi, Richard.

I saw Bernhard's comments.

Should I send a V7 that fuses the two 'if (final_len)' blocks together, as the final version to be merged?

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-06-23 16:08
To: juzhe.zhong
CC: gcc-patches; rguenther; rep.dot.nop
Subject: Re: [PATCH V6] VECT: Apply LEN_MASK_{LOAD,STORE} into vectorizer
juzhe.zhong@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Address comments from Richard and Bernhard from V5 patch.
> V6 fixed all issues according their comments.
>
> gcc/ChangeLog:
>
>         * internal-fn.cc (expand_partial_store_optab_fn): Adapt for LEN_MASK_STORE.
>         (internal_load_fn_p): Add LEN_MASK_LOAD.
>         (internal_store_fn_p): Add LEN_MASK_STORE.
>         (internal_fn_mask_index): Add LEN_MASK_{LOAD,STORE}.
>         (internal_fn_stored_value_index): Add LEN_MASK_STORE.
>         (internal_len_load_store_bias):  Add LEN_MASK_{LOAD,STORE}.
>         * optabs-tree.cc (can_vec_mask_load_store_p): Adapt for LEN_MASK_{LOAD,STORE}.
>         (get_len_load_store_mode): Ditto.
>         * optabs-tree.h (can_vec_mask_load_store_p): Ditto.
>         (get_len_load_store_mode): Ditto.
>         * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
>         (get_all_ones_mask): New function.
>         (vectorizable_store): Apply LEN_MASK_{LOAD,STORE} into vectorizer.
>         (vectorizable_load): Ditto.
 
Given Richard was happy with the previous version and this addresses
my comments from V5: OK, thanks.
 
Richard
 
>
> ---
>  gcc/internal-fn.cc     |  37 ++++++-
>  gcc/optabs-tree.cc     |  86 ++++++++++++++--
>  gcc/optabs-tree.h      |   6 +-
>  gcc/tree-vect-stmts.cc | 221 +++++++++++++++++++++++++++++------------
>  4 files changed, 267 insertions(+), 83 deletions(-)
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index c911ae790cb..1c2fd487e2a 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -2949,7 +2949,7 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>   * OPTAB.  */
>  
>  static void
> -expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> +expand_partial_store_optab_fn (internal_fn ifn, gcall *stmt, convert_optab optab)
>  {
>    class expand_operand ops[5];
>    tree type, lhs, rhs, maskt, biast;
> @@ -2957,7 +2957,7 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>    insn_code icode;
>  
>    maskt = gimple_call_arg (stmt, 2);
> -  rhs = gimple_call_arg (stmt, 3);
> +  rhs = gimple_call_arg (stmt, internal_fn_stored_value_index (ifn));
>    type = TREE_TYPE (rhs);
>    lhs = expand_call_mem_ref (type, stmt, 0);
>  
> @@ -4435,6 +4435,7 @@ internal_load_fn_p (internal_fn fn)
>      case IFN_GATHER_LOAD:
>      case IFN_MASK_GATHER_LOAD:
>      case IFN_LEN_LOAD:
> +    case IFN_LEN_MASK_LOAD:
>        return true;
>  
>      default:
> @@ -4455,6 +4456,7 @@ internal_store_fn_p (internal_fn fn)
>      case IFN_SCATTER_STORE:
>      case IFN_MASK_SCATTER_STORE:
>      case IFN_LEN_STORE:
> +    case IFN_LEN_MASK_STORE:
>        return true;
>  
>      default:
> @@ -4498,6 +4500,10 @@ internal_fn_mask_index (internal_fn fn)
>      case IFN_MASK_SCATTER_STORE:
>        return 4;
>  
> +    case IFN_LEN_MASK_LOAD:
> +    case IFN_LEN_MASK_STORE:
> +      return 3;
> +
>      default:
>        return (conditional_internal_fn_code (fn) != ERROR_MARK
>        || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);
> @@ -4519,6 +4525,9 @@ internal_fn_stored_value_index (internal_fn fn)
>      case IFN_LEN_STORE:
>        return 3;
>  
> +    case IFN_LEN_MASK_STORE:
> +      return 4;
> +
>      default:
>        return -1;
>      }
> @@ -4583,13 +4592,33 @@ internal_len_load_store_bias (internal_fn ifn, machine_mode mode)
>  {
>    optab optab = direct_internal_fn_optab (ifn);
>    insn_code icode = direct_optab_handler (optab, mode);
> +  int bias_opno = 3;
> +
> +  if (icode == CODE_FOR_nothing)
> +    {
> +      machine_mode mask_mode;
> +      if (!targetm.vectorize.get_mask_mode (mode).exists (&mask_mode))
> + return VECT_PARTIAL_BIAS_UNSUPPORTED;
> +      if (ifn == IFN_LEN_LOAD)
> + {
> +   /* Try LEN_MASK_LOAD.  */
> +   optab = direct_internal_fn_optab (IFN_LEN_MASK_LOAD);
> + }
> +      else
> + {
> +   /* Try LEN_MASK_STORE.  */
> +   optab = direct_internal_fn_optab (IFN_LEN_MASK_STORE);
> + }
> +      icode = convert_optab_handler (optab, mode, mask_mode);
> +      bias_opno = 4;
> +    }
>  
>    if (icode != CODE_FOR_nothing)
>      {
>        /* For now we only support biases of 0 or -1.  Try both of them.  */
> -      if (insn_operand_matches (icode, 3, GEN_INT (0)))
> +      if (insn_operand_matches (icode, bias_opno, GEN_INT (0)))
>  return 0;
> -      if (insn_operand_matches (icode, 3, GEN_INT (-1)))
> +      if (insn_operand_matches (icode, bias_opno, GEN_INT (-1)))
>  return -1;
>      }
>  
> diff --git a/gcc/optabs-tree.cc b/gcc/optabs-tree.cc
> index 77bf745ae40..e6ae15939d3 100644
> --- a/gcc/optabs-tree.cc
> +++ b/gcc/optabs-tree.cc
> @@ -543,19 +543,50 @@ target_supports_op_p (tree type, enum tree_code code,
>    && optab_handler (ot, TYPE_MODE (type)) != CODE_FOR_nothing);
>  }
>  
> -/* Return true if target supports vector masked load/store for mode.  */
> +/* Return true if the target has support for masked load/store.
> +   We can support masked load/store by either mask{load,store}
> +   or len_mask{load,store}.
> +   This helper function checks whether target supports masked
> +   load/store and return corresponding IFN in the last argument
> +   (IFN_MASK_{LOAD,STORE} or IFN_LEN_MASK_{LOAD,STORE}).  */
> +
> +static bool
> +target_supports_mask_load_store_p (machine_mode mode, machine_mode mask_mode,
> +    bool is_load, internal_fn *ifn)
> +{
> +  optab op = is_load ? maskload_optab : maskstore_optab;
> +  optab len_op = is_load ? len_maskload_optab : len_maskstore_optab;
> +  if (convert_optab_handler (op, mode, mask_mode) != CODE_FOR_nothing)
> +    {
> +      if (ifn)
> + *ifn = is_load ? IFN_MASK_LOAD : IFN_MASK_STORE;
> +      return true;
> +    }
> +  else if (convert_optab_handler (len_op, mode, mask_mode) != CODE_FOR_nothing)
> +    {
> +      if (ifn)
> + *ifn = is_load ? IFN_LEN_MASK_LOAD : IFN_LEN_MASK_STORE;
> +      return true;
> +    }
> +  return false;
> +}
> +
> +/* Return true if target supports vector masked load/store for mode.
> +   An additional output in the last argument which is the IFN pointer.
> +   We set IFN as MASK_{LOAD,STORE} or LEN_MASK_{LOAD,STORE} according
> +   which optab is supported in the target.  */
>  
>  bool
>  can_vec_mask_load_store_p (machine_mode mode,
>     machine_mode mask_mode,
> -    bool is_load)
> +    bool is_load,
> +    internal_fn *ifn)
>  {
> -  optab op = is_load ? maskload_optab : maskstore_optab;
>    machine_mode vmode;
>  
>    /* If mode is vector mode, check it directly.  */
>    if (VECTOR_MODE_P (mode))
> -    return convert_optab_handler (op, mode, mask_mode) != CODE_FOR_nothing;
> +    return target_supports_mask_load_store_p (mode, mask_mode, is_load, ifn);
>  
>    /* Otherwise, return true if there is some vector mode with
>       the mask load/store supported.  */
> @@ -569,7 +600,7 @@ can_vec_mask_load_store_p (machine_mode mode,
>    vmode = targetm.vectorize.preferred_simd_mode (smode);
>    if (VECTOR_MODE_P (vmode)
>        && targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode)
> -      && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
> +      && target_supports_mask_load_store_p (vmode, mask_mode, is_load, ifn))
>      return true;
>  
>    auto_vector_modes vector_modes;
> @@ -577,33 +608,66 @@ can_vec_mask_load_store_p (machine_mode mode,
>    for (machine_mode base_mode : vector_modes)
>      if (related_vector_mode (base_mode, smode).exists (&vmode)
>  && targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode)
> - && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
> + && target_supports_mask_load_store_p (vmode, mask_mode, is_load, ifn))
>        return true;
>    return false;
>  }
>  
> +/* Return true if the target has support for len load/store.
> +   We can support len load/store by either len_{load,store}
> +   or len_mask{load,store}.
> +   This helper function checks whether target supports len
> +   load/store and return corresponding IFN in the last argument
> +   (IFN_LEN_{LOAD,STORE} or IFN_LEN_MASK_{LOAD,STORE}).  */
> +
> +static bool
> +target_supports_len_load_store_p (machine_mode mode, bool is_load,
> +   internal_fn *ifn)
> +{
> +  optab op = is_load ? len_load_optab : len_store_optab;
> +  optab masked_op = is_load ? len_maskload_optab : len_maskstore_optab;
> +
> +  if (direct_optab_handler (op, mode))
> +    {
> +      if (ifn)
> + *ifn = is_load ? IFN_LEN_LOAD : IFN_LEN_STORE;
> +      return true;
> +    }
> +  machine_mode mask_mode;
> +  if (targetm.vectorize.get_mask_mode (mode).exists (&mask_mode)
> +      && convert_optab_handler (masked_op, mode, mask_mode) != CODE_FOR_nothing)
> +    {
> +      if (ifn)
> + *ifn = is_load ? IFN_LEN_MASK_LOAD : IFN_LEN_MASK_STORE;
> +      return true;
> +    }
> +  return false;
> +}
> +
>  /* If target supports vector load/store with length for vector mode MODE,
>     return the corresponding vector mode, otherwise return opt_machine_mode ().
>     There are two flavors for vector load/store with length, one is to measure
>     length with bytes, the other is to measure length with lanes.
>     As len_{load,store} optabs point out, for the flavor with bytes, we use
> -   VnQI to wrap the other supportable same size vector modes.  */
> +   VnQI to wrap the other supportable same size vector modes.
> +   An additional output in the last argument which is the IFN pointer.
> +   We set IFN as LEN_{LOAD,STORE} or LEN_MASK_{LOAD,STORE} according
> +   which optab is supported in the target.  */
>  
>  opt_machine_mode
> -get_len_load_store_mode (machine_mode mode, bool is_load)
> +get_len_load_store_mode (machine_mode mode, bool is_load, internal_fn *ifn)
>  {
> -  optab op = is_load ? len_load_optab : len_store_optab;
>    gcc_assert (VECTOR_MODE_P (mode));
>  
>    /* Check if length in lanes supported for this mode directly.  */
> -  if (direct_optab_handler (op, mode))
> +  if (target_supports_len_load_store_p (mode, is_load, ifn))
>      return mode;
>  
>    /* Check if length in bytes supported for same vector size VnQI.  */
>    machine_mode vmode;
>    poly_uint64 nunits = GET_MODE_SIZE (mode);
>    if (related_vector_mode (mode, QImode, nunits).exists (&vmode)
> -      && direct_optab_handler (op, vmode))
> +      && target_supports_len_load_store_p (vmode, is_load, ifn))
>      return vmode;
>  
>    return opt_machine_mode ();
> diff --git a/gcc/optabs-tree.h b/gcc/optabs-tree.h
> index a3f79b6bd43..e421fc24289 100644
> --- a/gcc/optabs-tree.h
> +++ b/gcc/optabs-tree.h
> @@ -47,7 +47,9 @@ bool expand_vec_cond_expr_p (tree, tree, enum tree_code);
>  void init_tree_optimization_optabs (tree);
>  bool target_supports_op_p (tree, enum tree_code,
>     enum optab_subtype = optab_default);
> -bool can_vec_mask_load_store_p (machine_mode, machine_mode, bool);
> -opt_machine_mode get_len_load_store_mode (machine_mode, bool);
> +bool can_vec_mask_load_store_p (machine_mode, machine_mode, bool,
> + internal_fn * = nullptr);
> +opt_machine_mode get_len_load_store_mode (machine_mode, bool,
> +   internal_fn * = nullptr);
>  
>  #endif
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 056a0ecb2be..44fb2507efb 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -1819,16 +1819,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
>    poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>    machine_mode mask_mode;
> -  bool using_partial_vectors_p = false;
> -  if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
> -      && can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
> -    {
> -      nvectors = group_memory_nvectors (group_size * vf, nunits);
> -      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
> -      using_partial_vectors_p = true;
> -    }
> -
>    machine_mode vmode;
> +  bool using_partial_vectors_p = false;
>    if (get_len_load_store_mode (vecmode, is_load).exists (&vmode))
>      {
>        nvectors = group_memory_nvectors (group_size * vf, nunits);
> @@ -1837,6 +1829,13 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>        vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, factor);
>        using_partial_vectors_p = true;
>      }
> +  else if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
> +    && can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
> +    {
> +      nvectors = group_memory_nvectors (group_size * vf, nunits);
> +      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
> +      using_partial_vectors_p = true;
> +    }
>  
>    if (!using_partial_vectors_p)
>      {
> @@ -8944,30 +8943,63 @@ vectorizable_store (vec_info *vinfo,
>    vec_oprnd = new_temp;
>  }
>  
> -       /* Arguments are ready.  Create the new vector stmt.  */
> -       if (final_mask)
> - {
> -   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> -   gcall *call
> -     = gimple_build_call_internal (IFN_MASK_STORE, 4,
> -   dataref_ptr, ptr,
> -   final_mask, vec_oprnd);
> -   gimple_call_set_nothrow (call, true);
> -   vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> -   new_stmt = call;
> - }
> -       else if (loop_lens)
> +       /* Compute IFN when LOOP_LENS or final_mask valid.  */
> +       machine_mode vmode = TYPE_MODE (vectype);
> +       machine_mode new_vmode = vmode;
> +       internal_fn partial_ifn = IFN_LAST;
> +       /* Produce 'len' and 'bias' argument.  */
> +       tree final_len = NULL_TREE;
> +       tree bias = NULL_TREE;
> +       if (loop_lens)
>  {
> -   machine_mode vmode = TYPE_MODE (vectype);
>    opt_machine_mode new_ovmode
> -     = get_len_load_store_mode (vmode, false);
> -   machine_mode new_vmode = new_ovmode.require ();
> +     = get_len_load_store_mode (vmode, false, &partial_ifn);
> +   new_vmode = new_ovmode.require ();
>    unsigned factor
>      = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
> -   tree final_len
> -     = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> - vec_num * ncopies, vectype,
> - vec_num * j + i, factor);
> +   final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> + vec_num * ncopies, vectype,
> + vec_num * j + i, factor);
> + }
> +       else if (final_mask)
> + {
> +   if (!can_vec_mask_load_store_p (vmode,
> +   TYPE_MODE (mask_vectype),
> +   false, &partial_ifn))
> +     gcc_unreachable ();
> + }
> +
> +       if (partial_ifn == IFN_LEN_MASK_STORE)
> + {
> +   if (!final_len)
> +     {
> +       /* Pass VF value to 'len' argument of
> +          LEN_MASK_STORE if LOOP_LENS is invalid.  */
> +       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +       final_len
> + = build_int_cst (iv_type,
> + TYPE_VECTOR_SUBPARTS (vectype));
> +     }
> +   if (!final_mask)
> +     {
> +       /* Pass all ones value to 'mask' argument of
> + LEN_MASK_STORE if final_mask is invalid.  */
> +       mask_vectype = truth_type_for (vectype);
> +       final_mask = build_minus_one_cst (mask_vectype);
> +     }
> + }
> +       if (final_len)
> + {
> +   signed char biasval
> +     = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +
> +   bias = build_int_cst (intQI_type_node, biasval);
> + }
> +
> +       /* Arguments are ready.  Create the new vector stmt.  */
> +       if (final_len)
> + {
> +   gcall *call;
>    tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
>    /* Need conversion if it's wrapped with VnQI.  */
>    if (vmode != new_vmode)
> @@ -8987,14 +9019,27 @@ vectorizable_store (vec_info *vinfo,
>        vec_oprnd = var;
>      }
>  
> -   signed char biasval =
> -     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> -
> -   tree bias = build_int_cst (intQI_type_node, biasval);
> +   if (partial_ifn == IFN_LEN_MASK_STORE)
> +     call = gimple_build_call_internal (IFN_LEN_MASK_STORE, 6,
> +        dataref_ptr, ptr,
> +        final_len, final_mask,
> +        vec_oprnd, bias);
> +   else
> +     call
> +       = gimple_build_call_internal (IFN_LEN_STORE, 5,
> +     dataref_ptr, ptr, final_len,
> +     vec_oprnd, bias);
> +   gimple_call_set_nothrow (call, true);
> +   vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> +   new_stmt = call;
> + }
> +       else if (final_mask)
> + {
> +   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
>    gcall *call
> -     = gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr,
> -   ptr, final_len, vec_oprnd,
> -   bias);
> +     = gimple_build_call_internal (IFN_MASK_STORE, 4,
> +   dataref_ptr, ptr,
> +   final_mask, vec_oprnd);
>    gimple_call_set_nothrow (call, true);
>    vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
>    new_stmt = call;
> @@ -10304,45 +10349,77 @@ vectorizable_load (vec_info *vinfo,
>        align, misalign);
>      align = least_bit_hwi (misalign | align);
>  
> -     if (final_mask)
> -       {
> - tree ptr = build_int_cst (ref_type,
> -   align * BITS_PER_UNIT);
> - gcall *call
> -   = gimple_build_call_internal (IFN_MASK_LOAD, 3,
> - dataref_ptr, ptr,
> - final_mask);
> - gimple_call_set_nothrow (call, true);
> - new_stmt = call;
> - data_ref = NULL_TREE;
> -       }
> -     else if (loop_lens && memory_access_type != VMAT_INVARIANT)
> +     /* Compute IFN when LOOP_LENS or final_mask valid.  */
> +     machine_mode vmode = TYPE_MODE (vectype);
> +     machine_mode new_vmode = vmode;
> +     internal_fn partial_ifn = IFN_LAST;
> +     /* Produce 'len' and 'bias' argument.  */
> +     tree final_len = NULL_TREE;
> +     tree bias = NULL_TREE;
> +     if (loop_lens)
>        {
> - machine_mode vmode = TYPE_MODE (vectype);
>  opt_machine_mode new_ovmode
> -   = get_len_load_store_mode (vmode, true);
> - machine_mode new_vmode = new_ovmode.require ();
> +   = get_len_load_store_mode (vmode, true,
> +      &partial_ifn);
> + new_vmode = new_ovmode.require ();
>  unsigned factor = (new_ovmode == vmode)
>      ? 1
>      : GET_MODE_UNIT_SIZE (vmode);
> - tree final_len
> + final_len
>    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
>         vec_num * ncopies, vectype,
>         vec_num * j + i, factor);
> - tree ptr
> -   = build_int_cst (ref_type, align * BITS_PER_UNIT);
> -
> - tree qi_type = unsigned_intQI_type_node;
> +       }
> +     else if (final_mask)
> +       {
> + if (!can_vec_mask_load_store_p (
> +       vmode, TYPE_MODE (mask_vectype), true,
> +       &partial_ifn))
> +   gcc_unreachable ();
> +       }
>  
> - signed char biasval =
> -   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +     if (partial_ifn == IFN_LEN_MASK_LOAD)
> +       {
> + if (!final_len)
> +   {
> +     /* Pass VF value to 'len' argument of
> +        LEN_MASK_LOAD if LOOP_LENS is invalid.  */
> +     tree iv_type
> +       = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +     final_len
> +       = build_int_cst (iv_type,
> +        TYPE_VECTOR_SUBPARTS (vectype));
> +   }
> + if (!final_mask)
> +   {
> +     /* Pass all ones value to 'mask' argument of
> +        LEN_MASK_LOAD if final_mask is invalid.  */
> +     mask_vectype = truth_type_for (vectype);
> +     final_mask = build_minus_one_cst (mask_vectype);
> +   }
> +       }
> +     if (final_len)
> +       {
> + signed char biasval
> +   = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>  
> - tree bias = build_int_cst (intQI_type_node, biasval);
> + bias = build_int_cst (intQI_type_node, biasval);
> +       }
>  
> - gcall *call
> -   = gimple_build_call_internal (IFN_LEN_LOAD, 4,
> - dataref_ptr, ptr,
> - final_len, bias);
> +     if (final_len && memory_access_type != VMAT_INVARIANT)
> +       {
> + tree ptr
> +   = build_int_cst (ref_type, align * BITS_PER_UNIT);
> + gcall *call;
> + if (partial_ifn == IFN_LEN_MASK_LOAD)
> +   call = gimple_build_call_internal (IFN_LEN_MASK_LOAD,
> +      5, dataref_ptr,
> +      ptr, final_len,
> +      final_mask, bias);
> + else
> +   call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
> +      dataref_ptr, ptr,
> +      final_len, bias);
>  gimple_call_set_nothrow (call, true);
>  new_stmt = call;
>  data_ref = NULL_TREE;
> @@ -10350,8 +10427,8 @@ vectorizable_load (vec_info *vinfo,
>  /* Need conversion if it's wrapped with VnQI.  */
>  if (vmode != new_vmode)
>    {
> -     tree new_vtype
> -       = build_vector_type_for_mode (qi_type, new_vmode);
> +     tree new_vtype = build_vector_type_for_mode (
> +       unsigned_intQI_type_node, new_vmode);
>      tree var = vect_get_new_ssa_name (new_vtype,
>        vect_simple_var);
>      gimple_set_lhs (call, var);
> @@ -10363,6 +10440,18 @@ vectorizable_load (vec_info *vinfo,
>       VIEW_CONVERT_EXPR, op);
>    }
>        }
> +     else if (final_mask)
> +       {
> + tree ptr = build_int_cst (ref_type,
> +   align * BITS_PER_UNIT);
> + gcall *call
> +   = gimple_build_call_internal (IFN_MASK_LOAD, 3,
> + dataref_ptr, ptr,
> + final_mask);
> + gimple_call_set_nothrow (call, true);
> + new_stmt = call;
> + data_ref = NULL_TREE;
> +       }
>      else
>        {
>  tree ltype = vectype;
 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: Re: [PATCH V6] VECT: Apply LEN_MASK_{LOAD,STORE} into vectorizer
  2023-06-23  8:03   ` Richard Sandiford
@ 2023-06-23  8:37     ` 钟居哲
  2023-06-23  9:38     ` Bernhard Reutner-Fischer
  1 sibling, 0 replies; 8+ messages in thread
From: 钟居哲 @ 2023-06-23  8:37 UTC (permalink / raw)
  To: richard.sandiford, Bernhard Reutner-Fischer; +Cc: gcc-patches, rguenther

[-- Attachment #1: Type: text/plain, Size: 1572 bytes --]

Oh, OK. Thanks so much, Richard.
I will merge V6 once the regression tests have finished.

Previously, I didn't understand whether you wanted a V7 (I tried to use Google Translate to translate your words :)
Now I understand that you are happy with V6.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-06-23 16:03
To: Bernhard Reutner-Fischer
CC: juzhe.zhong; gcc-patches; rguenther
Subject: Re: [PATCH V6] VECT: Apply LEN_MASK_{LOAD,STORE} into vectorizer
Bernhard Reutner-Fischer <rep.dot.nop@gmail.com> writes:
> On 23 June 2023 01:51:12 CEST, juzhe.zhong@rivai.ai wrote:
>>From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> I am sorry but I somehow overlooked a trivial spot in V5.
> Nit which does not warrant an immediate next version, but please consider it before pushing iff approved:
>
>>+       if (final_len)
>>+ {
>>+   signed char biasval
>>+     = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>>+
>>+   bias = build_int_cst (intQI_type_node, biasval);
>>+ }
>>+
>>+       /* Arguments are ready.  Create the new vector stmt.  */
>>+       if (final_len)
>>+ {
>
> Fuse the block below into the one above as the condition seems to be identical?
 
Yeah, true, but I think the idea is that the code above “Arguments are
ready” is calculating argument values, and the code after it is creating
code.  These are two separate steps, and the fact that the two final_len
blocks end up being consecutive is something of a coincidence.
 
So personally I think we should keep the structure in the patch.
 
Thanks,
Richard
 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH V6] VECT: Apply LEN_MASK_{LOAD,STORE} into vectorizer
  2023-06-23  8:03   ` Richard Sandiford
  2023-06-23  8:37     ` 钟居哲
@ 2023-06-23  9:38     ` Bernhard Reutner-Fischer
  2023-06-24  0:53       ` Li, Pan2
  1 sibling, 1 reply; 8+ messages in thread
From: Bernhard Reutner-Fischer @ 2023-06-23  9:38 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: juzhe.zhong, gcc-patches, rguenther

On 23 June 2023 10:03:45 CEST, Richard Sandiford <richard.sandiford@arm.com> wrote:

>> Fuse the block below into the one above as the condition seems to be identical?
>
>Yeah, true, but I think the idea is that the code above “Arguments are
>ready” is calculating argument values, and the code after it is creating
>code.  These are two separate steps, and the fact that the two final_len
>blocks end up being consecutive is something of a coincidence.
>
>So personally I think we should keep the structure in the patch.

Sure, works for me.
thanks,

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH V6] VECT: Apply LEN_MASK_{LOAD,STORE} into vectorizer
  2023-06-23  9:38     ` Bernhard Reutner-Fischer
@ 2023-06-24  0:53       ` Li, Pan2
  0 siblings, 0 replies; 8+ messages in thread
From: Li, Pan2 @ 2023-06-24  0:53 UTC (permalink / raw)
  To: Bernhard Reutner-Fischer, Richard Sandiford
  Cc: juzhe.zhong, gcc-patches, rguenther

Committed, as it passed both bootstrap and regression tests.

Pan

-----Original Message-----
From: Gcc-patches <gcc-patches-bounces+pan2.li=intel.com@gcc.gnu.org> On Behalf Of Bernhard Reutner-Fischer via Gcc-patches
Sent: Friday, June 23, 2023 5:39 PM
To: Richard Sandiford <richard.sandiford@arm.com>
Cc: juzhe.zhong@rivai.ai; gcc-patches@gcc.gnu.org; rguenther@suse.de
Subject: Re: [PATCH V6] VECT: Apply LEN_MASK_{LOAD,STORE} into vectorizer

On 23 June 2023 10:03:45 CEST, Richard Sandiford <richard.sandiford@arm.com> wrote:

>> Fuse the block below into the one above as the condition seems to be identical?
>
>Yeah, true, but I think the idea is that the code above “Arguments are
>ready” is calculating argument values, and the code after it is creating
>code.  These are two separate steps, and the fact that the two final_len
>blocks end up being consecutive is something of a coincidence.
>
>So personally I think we should keep the structure in the patch.

Sure, works for me.
thanks,

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2023-06-24  0:53 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-22 23:51 [PATCH V6] VECT: Apply LEN_MASK_{LOAD,STORE} into vectorizer juzhe.zhong
2023-06-23  7:55 ` Bernhard Reutner-Fischer
2023-06-23  8:03   ` Richard Sandiford
2023-06-23  8:37     ` 钟居哲
2023-06-23  9:38     ` Bernhard Reutner-Fischer
2023-06-24  0:53       ` Li, Pan2
2023-06-23  8:08 ` Richard Sandiford
2023-06-23  8:10   ` 钟居哲

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).