public inbox for gcc-patches@gcc.gnu.org
* [PATCH V5] Machine Description: Add LEN_MASK_{GATHER_LOAD, SCATTER_STORE} pattern
@ 2023-06-30 10:41 juzhe.zhong
  2023-06-30 10:41 ` [PATCH] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer juzhe.zhong
  2023-07-02  9:35 ` [PATCH V5] Machine Description: Add LEN_MASK_{GATHER_LOAD, SCATTER_STORE} pattern Richard Sandiford
  0 siblings, 2 replies; 3+ messages in thread
From: juzhe.zhong @ 2023-06-30 10:41 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.sandiford, rguenther, rdapp.gcc, Ju-Zhe Zhong

From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richi and Richard.

This patch adds LEN_MASK_{GATHER_LOAD,SCATTER_STORE} to allow targets to
handle flow control by mask and loop control by length on gather/scatter memory
operations. Consider the following case:

#include <stdint.h>
void
f (uint8_t *restrict a,
   uint8_t *restrict b, int n,
   int base, int step,
   int *restrict cond)
{
  for (int i = 0; i < n; ++i)
    {
      if (cond[i])
        a[i * step + base] = b[i * step + base];
    }
}

We hope RVV can vectorize such a case into the following IR:

loop_len = SELECT_VL
control_mask = comparison
v = LEN_MASK_GATHER_LOAD (.., loop_len, control_mask, bias)
LEN_MASK_SCATTER_STORE (... v, ..., loop_len, control_mask, bias)
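
To make the semantics concrete, the behaviour documented in the md.texi change
below can be modelled by scalar reference loops like the ones here (an
illustrative sketch only; the ref_* functions are hypothetical and not part of
the patch):

#include <stdint.h>
#include <stdbool.h>

/* Reference semantics of LEN_MASK_GATHER_LOAD: at most len + bias elements
   are loaded; elements whose mask bit is clear (and all elements at or
   beyond len + bias) are left undefined in the result.  */
void
ref_len_mask_gather_load (uint8_t *dest, uint8_t *base,
                          const intptr_t *offsets, intptr_t scale,
                          intptr_t len, intptr_t bias, const bool *mask)
{
  for (intptr_t i = 0; i < len + bias; i++)
    if (mask[i])
      dest[i] = base[offsets[i] * scale];
}

/* Reference semantics of LEN_MASK_SCATTER_STORE: at most len + bias elements
   of src are stored; elements whose mask bit is clear are not stored.  */
void
ref_len_mask_scatter_store (uint8_t *base, const intptr_t *offsets,
                            intptr_t scale, const uint8_t *src,
                            intptr_t len, intptr_t bias, const bool *mask)
{
  for (intptr_t i = 0; i < len + bias; i++)
    if (mask[i])
      base[offsets[i] * scale] = src[i];
}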

This patch doesn't apply these patterns in the vectorizer; it only adds the
patterns and updates the documentation.

I will send a patch that applies these patterns in the vectorizer soon after
this patch is approved.

Thanks.

---
 gcc/doc/md.texi     | 17 ++++++++++++
 gcc/internal-fn.cc  | 67 +++++++++++++++++++++++++++++++++++++++++++--
 gcc/internal-fn.def |  8 ++++--
 gcc/internal-fn.h   |  1 +
 gcc/optabs.def      |  2 ++
 5 files changed, 90 insertions(+), 5 deletions(-)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 9648fdc846a..df41b5251d4 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5040,6 +5040,15 @@ operand 5.  Bit @var{i} of the mask is set if element @var{i}
 of the result should be loaded from memory and clear if element @var{i}
 of the result should be set to zero.
 
+@cindex @code{len_mask_gather_load@var{m}@var{n}} instruction pattern
+@item @samp{len_mask_gather_load@var{m}@var{n}}
+Like @samp{gather_load@var{m}@var{n}}, but takes an extra length operand (operand 5),
+a mask operand (operand 6) as well as a bias operand (operand 7).  Similar to len_maskload,
+the instruction loads at most (operand 5 + operand 7) elements from memory.
+Bit @var{i} of the mask is set if element @var{i} of the result should
+be loaded from memory and clear if element @var{i} of the result should be undefined.
+Mask elements @var{i} with i > (operand 5) are ignored.
+
 @cindex @code{scatter_store@var{m}@var{n}} instruction pattern
 @item @samp{scatter_store@var{m}@var{n}}
 Store a vector of mode @var{m} into several distinct memory locations.
@@ -5069,6 +5078,14 @@ Like @samp{scatter_store@var{m}@var{n}}, but takes an extra mask operand as
 operand 5.  Bit @var{i} of the mask is set if element @var{i}
 of the result should be stored to memory.
 
+@cindex @code{len_mask_scatter_store@var{m}@var{n}} instruction pattern
+@item @samp{len_mask_scatter_store@var{m}@var{n}}
+Like @samp{scatter_store@var{m}@var{n}}, but takes an extra length operand (operand 5),
+a mask operand (operand 6) as well as a bias operand (operand 7).  The instruction stores
+at most (operand 5 + operand 7) elements of (operand 4) to memory.
+Bit @var{i} of the mask is set if element @var{i} of (operand 4) should be stored.
+Mask elements @var{i} with i > (operand 5) are ignored.
+
 @cindex @code{vec_set@var{m}} instruction pattern
 @item @samp{vec_set@var{m}}
 Set given field in the vector value.  Operand 0 is the vector to modify,
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 9017176dc7a..da3827481e9 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -3537,7 +3537,7 @@ expand_scatter_store_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
   HOST_WIDE_INT scale_int = tree_to_shwi (scale);
   rtx rhs_rtx = expand_normal (rhs);
 
-  class expand_operand ops[6];
+  class expand_operand ops[8];
   int i = 0;
   create_address_operand (&ops[i++], base_rtx);
   create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset)));
@@ -3546,9 +3546,23 @@ expand_scatter_store_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
   create_input_operand (&ops[i++], rhs_rtx, TYPE_MODE (TREE_TYPE (rhs)));
   if (mask_index >= 0)
     {
+      if (optab == len_mask_scatter_store_optab)
+	{
+	  tree len = gimple_call_arg (stmt, internal_fn_len_index (ifn));
+	  rtx len_rtx = expand_normal (len);
+	  create_convert_operand_from (&ops[i++], len_rtx,
+				       TYPE_MODE (TREE_TYPE (len)),
+				       TYPE_UNSIGNED (TREE_TYPE (len)));
+	}
       tree mask = gimple_call_arg (stmt, mask_index);
       rtx mask_rtx = expand_normal (mask);
       create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask)));
+      if (optab == len_mask_scatter_store_optab)
+	{
+	  tree biast = gimple_call_arg (stmt, gimple_call_num_args (stmt) - 1);
+	  rtx bias = expand_normal (biast);
+	  create_input_operand (&ops[i++], bias, QImode);
+	}
     }
 
   insn_code icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE (rhs)),
@@ -3559,7 +3573,7 @@ expand_scatter_store_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
 /* Expand {MASK_,}GATHER_LOAD call CALL using optab OPTAB.  */
 
 static void
-expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
+expand_gather_load_optab_fn (internal_fn ifn, gcall *stmt, direct_optab optab)
 {
   tree lhs = gimple_call_lhs (stmt);
   tree base = gimple_call_arg (stmt, 0);
@@ -3572,7 +3586,7 @@ expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
   HOST_WIDE_INT scale_int = tree_to_shwi (scale);
 
   int i = 0;
-  class expand_operand ops[6];
+  class expand_operand ops[8];
   create_output_operand (&ops[i++], lhs_rtx, TYPE_MODE (TREE_TYPE (lhs)));
   create_address_operand (&ops[i++], base_rtx);
   create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset)));
@@ -3584,6 +3598,20 @@ expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
       rtx mask_rtx = expand_normal (mask);
       create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask)));
     }
+  else if (optab == len_mask_gather_load_optab)
+    {
+      tree len = gimple_call_arg (stmt, internal_fn_len_index (ifn));
+      rtx len_rtx = expand_normal (len);
+      create_convert_operand_from (&ops[i++], len_rtx,
+				   TYPE_MODE (TREE_TYPE (len)),
+				   TYPE_UNSIGNED (TREE_TYPE (len)));
+      tree mask = gimple_call_arg (stmt, internal_fn_mask_index (ifn));
+      rtx mask_rtx = expand_normal (mask);
+      create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask)));
+      tree biast = gimple_call_arg (stmt, gimple_call_num_args (stmt) - 1);
+      rtx bias = expand_normal (biast);
+      create_input_operand (&ops[i++], bias, QImode);
+    }
   insn_code icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs)),
 					   TYPE_MODE (TREE_TYPE (offset)));
   expand_insn (icode, i, ops);
@@ -4434,6 +4462,7 @@ internal_load_fn_p (internal_fn fn)
     case IFN_MASK_LOAD_LANES:
     case IFN_GATHER_LOAD:
     case IFN_MASK_GATHER_LOAD:
+    case IFN_LEN_MASK_GATHER_LOAD:
     case IFN_LEN_LOAD:
     case IFN_LEN_MASK_LOAD:
       return true;
@@ -4455,6 +4484,7 @@ internal_store_fn_p (internal_fn fn)
     case IFN_MASK_STORE_LANES:
     case IFN_SCATTER_STORE:
     case IFN_MASK_SCATTER_STORE:
+    case IFN_LEN_MASK_SCATTER_STORE:
     case IFN_LEN_STORE:
     case IFN_LEN_MASK_STORE:
       return true;
@@ -4473,8 +4503,10 @@ internal_gather_scatter_fn_p (internal_fn fn)
     {
     case IFN_GATHER_LOAD:
     case IFN_MASK_GATHER_LOAD:
+    case IFN_LEN_MASK_GATHER_LOAD:
     case IFN_SCATTER_STORE:
     case IFN_MASK_SCATTER_STORE:
+    case IFN_LEN_MASK_SCATTER_STORE:
       return true;
 
     default:
@@ -4504,6 +4536,34 @@ internal_fn_mask_index (internal_fn fn)
     case IFN_LEN_MASK_STORE:
       return 3;
 
+    case IFN_LEN_MASK_GATHER_LOAD:
+    case IFN_LEN_MASK_SCATTER_STORE:
+      return 5;
+
+    default:
+      return (conditional_internal_fn_code (fn) != ERROR_MARK
+	      || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);
+    }
+}
+
+/* If FN takes a vector len argument, return the index of that argument,
+   otherwise return -1.  */
+
+int
+internal_fn_len_index (internal_fn fn)
+{
+  switch (fn)
+    {
+    case IFN_LEN_LOAD:
+    case IFN_LEN_STORE:
+    case IFN_LEN_MASK_LOAD:
+    case IFN_LEN_MASK_STORE:
+      return 2;
+
+    case IFN_LEN_MASK_GATHER_LOAD:
+    case IFN_LEN_MASK_SCATTER_STORE:
+      return 4;
+
     default:
       return (conditional_internal_fn_code (fn) != ERROR_MARK
 	      || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);
@@ -4522,6 +4582,7 @@ internal_fn_stored_value_index (internal_fn fn)
     case IFN_MASK_STORE_LANES:
     case IFN_SCATTER_STORE:
     case IFN_MASK_SCATTER_STORE:
+    case IFN_LEN_MASK_SCATTER_STORE:
     case IFN_LEN_STORE:
       return 3;
 
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index bc947c0fde7..5be24decf88 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -48,14 +48,14 @@ along with GCC; see the file COPYING3.  If not see
    - mask_load: currently just maskload
    - load_lanes: currently just vec_load_lanes
    - mask_load_lanes: currently just vec_mask_load_lanes
-   - gather_load: used for {mask_,}gather_load
+   - gather_load: used for {mask_,len_mask_,}gather_load
    - len_load: currently just len_load
    - len_maskload: currently just len_maskload
 
    - mask_store: currently just maskstore
    - store_lanes: currently just vec_store_lanes
    - mask_store_lanes: currently just vec_mask_store_lanes
-   - scatter_store: used for {mask_,}scatter_store
+   - scatter_store: used for {mask_,len_mask_,}scatter_store
    - len_store: currently just len_store
    - len_maskstore: currently just len_maskstore
 
@@ -157,6 +157,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE,
 DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load)
 DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
 		       mask_gather_load, gather_load)
+DEF_INTERNAL_OPTAB_FN (LEN_MASK_GATHER_LOAD, ECF_PURE,
+		       len_mask_gather_load, gather_load)
 
 DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load)
 DEF_INTERNAL_OPTAB_FN (LEN_MASK_LOAD, ECF_PURE, len_maskload, len_maskload)
@@ -164,6 +166,8 @@ DEF_INTERNAL_OPTAB_FN (LEN_MASK_LOAD, ECF_PURE, len_maskload, len_maskload)
 DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store)
 DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0,
 		       mask_scatter_store, scatter_store)
+DEF_INTERNAL_OPTAB_FN (LEN_MASK_SCATTER_STORE, 0,
+		       len_mask_scatter_store, scatter_store)
 
 DEF_INTERNAL_OPTAB_FN (MASK_STORE, 0, maskstore, mask_store)
 DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 8f21068e300..4234bbfed87 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -234,6 +234,7 @@ extern bool internal_load_fn_p (internal_fn);
 extern bool internal_store_fn_p (internal_fn);
 extern bool internal_gather_scatter_fn_p (internal_fn);
 extern int internal_fn_mask_index (internal_fn);
+extern int internal_fn_len_index (internal_fn);
 extern int internal_fn_stored_value_index (internal_fn);
 extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
 						    tree, tree, int);
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 9533eb11565..58933e61817 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -95,8 +95,10 @@ OPTAB_CD(len_maskload_optab, "len_maskload$a$b")
 OPTAB_CD(len_maskstore_optab, "len_maskstore$a$b")
 OPTAB_CD(gather_load_optab, "gather_load$a$b")
 OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b")
+OPTAB_CD(len_mask_gather_load_optab, "len_mask_gather_load$a$b")
 OPTAB_CD(scatter_store_optab, "scatter_store$a$b")
 OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b")
+OPTAB_CD(len_mask_scatter_store_optab, "len_mask_scatter_store$a$b")
 OPTAB_CD(vec_extract_optab, "vec_extract$a$b")
 OPTAB_CD(vec_init_optab, "vec_init$a$b")
 
-- 
2.36.3



* [PATCH] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer
  2023-06-30 10:41 [PATCH V5] Machine Description: Add LEN_MASK_{GATHER_LOAD, SCATTER_STORE} pattern juzhe.zhong
@ 2023-06-30 10:41 ` juzhe.zhong
  2023-07-02  9:35 ` [PATCH V5] Machine Description: Add LEN_MASK_{GATHER_LOAD, SCATTER_STORE} pattern Richard Sandiford
  1 sibling, 0 replies; 3+ messages in thread
From: juzhe.zhong @ 2023-06-30 10:41 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.sandiford, rguenther, rdapp.gcc, Ju-Zhe Zhong

From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richard and Richi.
It seems that the implementation of LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE is simple
and the code change is not big.

Here is an example:

#include <stdint.h>
void
f (uint8_t *restrict a,
   uint8_t *restrict b, int n,
   int base, int step,
   int *restrict cond)
{
  for (int i = 0; i < n; ++i)
    {
      if (cond[i])
        a[i * step + base] = b[i * step + base];
    }
}

With this patch:

  <bb 6> [local count: 84095460]:
  _58 = (unsigned int) base_19(D);
  _61 = (unsigned long) b_20(D);
  _63 = (unsigned long) a_21(D);
  vect_cst__105 = [vec_duplicate_expr] _58;
  _110 = (unsigned long) n_16(D);

  <bb 7> [local count: 504572759]:
  # vect_vec_iv_.8_95 = PHI <_96(7), { 0, 1, 2, ... }(6)>
  # vectp_cond.9_99 = PHI <vectp_cond.9_100(7), cond_17(D)(6)>
  # ivtmp_111 = PHI <ivtmp_112(7), _110(6)>
  _113 = .SELECT_VL (ivtmp_111, POLY_INT_CST [4, 4]);
  _96 = vect_vec_iv_.8_95 + { POLY_INT_CST [4, 4], ... };
  ivtmp_98 = _113 * 4;
  vect__24.11_101 = .LEN_MASK_LOAD (vectp_cond.9_99, 32B, _113, { -1, ... }, 0);
  mask__14.12_103 = vect__24.11_101 != { 0, ... };
  vect__59.13_104 = VIEW_CONVERT_EXPR<vector([4,4]) unsigned int>(vect_vec_iv_.8_95);
  vect__60.14_106 = vect__59.13_104 + vect_cst__105;
  vect__12.15_107 = VIEW_CONVERT_EXPR<vector([4,4]) int>(vect__60.14_106);
  vect_patt_5.16_108 = .LEN_MASK_GATHER_LOAD (_61, vect__12.15_107, 4, { 0, ... }, _113, mask__14.12_103, 0);
  .LEN_MASK_SCATTER_STORE (_63, vect__12.15_107, 4, vect_patt_5.16_108, _113, mask__14.12_103, 0);
  vectp_cond.9_100 = vectp_cond.9_99 + ivtmp_98;
  ivtmp_112 = ivtmp_111 - _113;
  if (ivtmp_112 != 0)
    goto <bb 7>; [83.33%]
  else
    goto <bb 8>; [16.67%]

gcc/ChangeLog:

        * optabs-query.cc (supports_vec_gather_load_p): Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE.
        (supports_vec_scatter_store_p): Ditto.
        * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
        * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
        (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.

---
 gcc/optabs-query.cc        |  2 +
 gcc/tree-vect-data-refs.cc | 18 ++++++++-
 gcc/tree-vect-stmts.cc     | 81 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 98 insertions(+), 3 deletions(-)

diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 2fdd0d34354..bf1f484e874 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
     this_fn_optabs->supports_vec_gather_load[mode]
       = (supports_vec_convert_optab_p (gather_load_optab, mode)
 	 || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+	 || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
 	 ? 1 : -1);
 
   return this_fn_optabs->supports_vec_gather_load[mode] > 0;
@@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
     this_fn_optabs->supports_vec_scatter_store[mode]
       = (supports_vec_convert_optab_p (scatter_store_optab, mode)
 	 || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+	 || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
 	 ? 1 : -1);
 
   return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index ebe93832b1e..01016284c48 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
     return false;
 
   /* Work out which function we need.  */
-  internal_fn ifn, alt_ifn;
+  internal_fn ifn, alt_ifn, len_mask_ifn;
   if (read_p)
     {
       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
       alt_ifn = IFN_MASK_GATHER_LOAD;
+      /* When target supports LEN_MASK_GATHER_LOAD, we always
+	 use LEN_MASK_GATHER_LOAD regardless whether len and
+	 mask are valid or not.  */
+      len_mask_ifn = IFN_LEN_MASK_GATHER_LOAD;
     }
   else
     {
       ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
       alt_ifn = IFN_MASK_SCATTER_STORE;
+      /* When target supports LEN_MASK_SCATTER_STORE, we always
+	 use LEN_MASK_SCATTER_STORE regardless whether len and
+	 mask are valid or not.  */
+      len_mask_ifn = IFN_LEN_MASK_SCATTER_STORE;
     }
 
   for (;;)
@@ -3909,6 +3917,14 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
 	  *offset_vectype_out = offset_vectype;
 	  return true;
 	}
+      else if (internal_gather_scatter_fn_supported_p (len_mask_ifn, vectype,
+						       memory_type,
+						       offset_vectype, scale))
+	{
+	  *ifn_out = ifn;
+	  *offset_vectype_out = offset_vectype;
+	  return true;
+	}
 
       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
 	  && TYPE_PRECISION (offset_type) >= element_bits)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 68faa8ead39..fa0387353cf 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1771,6 +1771,17 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
 						   gs_info->offset_vectype,
 						   gs_info->scale))
 	{
+	  internal_fn len_mask_ifn
+	    = (is_load ? IFN_LEN_MASK_GATHER_LOAD : IFN_LEN_MASK_SCATTER_STORE);
+	  if (internal_gather_scatter_fn_supported_p (len_mask_ifn, vectype,
+						      gs_info->memory_type,
+						      gs_info->offset_vectype,
+						      gs_info->scale))
+	    {
+	      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+	      vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+	      return;
+	    }
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 			     "can't operate on partial vectors because"
@@ -8930,7 +8941,40 @@ vectorizable_store (vec_info *vinfo,
 		    vec_offset = vec_offsets[vec_num * j + i];
 		  tree scale = size_int (gs_info.scale);
 		  gcall *call;
-		  if (final_mask)
+		  if (internal_gather_scatter_fn_supported_p (
+			IFN_LEN_MASK_SCATTER_STORE, vectype,
+			gs_info.memory_type, TREE_TYPE (vec_offset),
+			gs_info.scale))
+		    {
+		      tree final_len = NULL_TREE;
+		      tree bias = NULL_TREE;
+		      if (loop_lens)
+			{
+			  final_len
+			    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+						 vec_num * ncopies, vectype,
+						 vec_num * j + i, 1);
+			}
+		      else
+			{
+			  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+			  final_len
+			    = build_int_cst (iv_type,
+					     TYPE_VECTOR_SUBPARTS (vectype));
+			}
+		      signed char biasval
+			= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+		      bias = build_int_cst (intQI_type_node, biasval);
+		      if (!final_mask)
+			{
+			  mask_vectype = truth_type_for (vectype);
+			  final_mask = build_minus_one_cst (mask_vectype);
+			}
+		      call = gimple_build_call_internal (
+			IFN_LEN_MASK_SCATTER_STORE, 7, dataref_ptr, vec_offset,
+			scale, vec_oprnd, final_len, final_mask, bias);
+		    }
+		  else if (final_mask)
 		    call = gimple_build_call_internal
 		      (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
 		       scale, vec_oprnd, final_mask);
@@ -10368,7 +10412,40 @@ vectorizable_load (vec_info *vinfo,
 			tree zero = build_zero_cst (vectype);
 			tree scale = size_int (gs_info.scale);
 			gcall *call;
-			if (final_mask)
+			if (internal_gather_scatter_fn_supported_p (
+			      IFN_LEN_MASK_GATHER_LOAD, vectype,
+			      gs_info.memory_type, TREE_TYPE (vec_offset),
+			      gs_info.scale))
+			  {
+			    tree final_len = NULL_TREE;
+			    tree bias = NULL_TREE;
+			    if (loop_lens)
+			      {
+				final_len = vect_get_loop_len (
+				  loop_vinfo, gsi, loop_lens, vec_num * ncopies,
+				  vectype, vec_num * j + i, 1);
+			      }
+			    else
+			      {
+				tree iv_type
+				  = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+				final_len = build_int_cst (
+				  iv_type, TYPE_VECTOR_SUBPARTS (vectype));
+			      }
+			    signed char biasval
+			      = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+			    bias = build_int_cst (intQI_type_node, biasval);
+			    if (!final_mask)
+			      {
+				mask_vectype = truth_type_for (vectype);
+				final_mask = build_minus_one_cst (mask_vectype);
+			      }
+			    call = gimple_build_call_internal (
+			      IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr,
+			      vec_offset, scale, zero, final_len, final_mask,
+			      bias);
+			  }
+			else if (final_mask)
 			  call = gimple_build_call_internal
 			    (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
 			     vec_offset, scale, zero, final_mask);
-- 
2.36.3



* Re: [PATCH V5] Machine Description: Add LEN_MASK_{GATHER_LOAD, SCATTER_STORE} pattern
  2023-06-30 10:41 [PATCH V5] Machine Description: Add LEN_MASK_{GATHER_LOAD, SCATTER_STORE} pattern juzhe.zhong
  2023-06-30 10:41 ` [PATCH] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer juzhe.zhong
@ 2023-07-02  9:35 ` Richard Sandiford
  1 sibling, 0 replies; 3+ messages in thread
From: Richard Sandiford @ 2023-07-02  9:35 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: gcc-patches, rguenther, rdapp.gcc

juzhe.zhong@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Hi, Richi and Richard.
>
> This patch adds LEN_MASK_{GATHER_LOAD,SCATTER_STORE} to allow targets to
> handle flow control by mask and loop control by length on gather/scatter memory
> operations. Consider the following case:
>
> #include <stdint.h>
> void
> f (uint8_t *restrict a,
>    uint8_t *restrict b, int n,
>    int base, int step,
>    int *restrict cond)
> {
>   for (int i = 0; i < n; ++i)
>     {
>       if (cond[i])
>         a[i * step + base] = b[i * step + base];
>     }
> }
>
> We hope RVV can vectorize such a case into the following IR:
>
> loop_len = SELECT_VL
> control_mask = comparison
> v = LEN_MASK_GATHER_LOAD (.., loop_len, control_mask, bias)
> LEN_MASK_SCATTER_STORE (... v, ..., loop_len, control_mask, bias)
>
> This patch doesn't apply these patterns in the vectorizer; it only adds the
> patterns and updates the documentation.
>
> I will send a patch that applies these patterns in the vectorizer soon after
> this patch is approved.
>
> Thanks.
>
> ---
>  gcc/doc/md.texi     | 17 ++++++++++++
>  gcc/internal-fn.cc  | 67 +++++++++++++++++++++++++++++++++++++++++++--
>  gcc/internal-fn.def |  8 ++++--
>  gcc/internal-fn.h   |  1 +
>  gcc/optabs.def      |  2 ++
>  5 files changed, 90 insertions(+), 5 deletions(-)
>
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 9648fdc846a..df41b5251d4 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5040,6 +5040,15 @@ operand 5.  Bit @var{i} of the mask is set if element @var{i}
>  of the result should be loaded from memory and clear if element @var{i}
>  of the result should be set to zero.
>  
> +@cindex @code{len_mask_gather_load@var{m}@var{n}} instruction pattern
> +@item @samp{len_mask_gather_load@var{m}@var{n}}
> +Like @samp{gather_load@var{m}@var{n}}, but takes an extra length operand (operand 5),
> +a mask operand (operand 6) as well as a bias operand (operand 7).  Similar to len_maskload,
> +the instruction loads at most (operand 5 + operand 7) elements from memory.
> +Bit @var{i} of the mask is set if element @var{i} of the result should
> +be loaded from memory and clear if element @var{i} of the result should be undefined.
> +Mask elements @var{i} with i > (operand 5) are ignored.

Nit: second i should be @var{i} too.

> +
>  @cindex @code{scatter_store@var{m}@var{n}} instruction pattern
>  @item @samp{scatter_store@var{m}@var{n}}
>  Store a vector of mode @var{m} into several distinct memory locations.
> @@ -5069,6 +5078,14 @@ Like @samp{scatter_store@var{m}@var{n}}, but takes an extra mask operand as
>  operand 5.  Bit @var{i} of the mask is set if element @var{i}
>  of the result should be stored to memory.
>  
> +@cindex @code{len_mask_scatter_store@var{m}@var{n}} instruction pattern
> +@item @samp{len_mask_scatter_store@var{m}@var{n}}
> +Like @samp{scatter_store@var{m}@var{n}}, but takes an extra length operand (operand 5),
> +a mask operand (operand 6) as well as a bias operand (operand 7).  The instruction stores
> +at most (operand 5 + operand 7) elements of (operand 4) to memory.
> +Bit @var{i} of the mask is set if element @var{i} of (operand 4) should be stored.
> +Mask elements @var{i} with i > (operand 5) are ignored.

Same here.

> +
>  @cindex @code{vec_set@var{m}} instruction pattern
>  @item @samp{vec_set@var{m}}
>  Set given field in the vector value.  Operand 0 is the vector to modify,
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 9017176dc7a..da3827481e9 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -3537,7 +3537,7 @@ expand_scatter_store_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
>    HOST_WIDE_INT scale_int = tree_to_shwi (scale);
>    rtx rhs_rtx = expand_normal (rhs);
>  
> -  class expand_operand ops[6];
> +  class expand_operand ops[8];
>    int i = 0;
>    create_address_operand (&ops[i++], base_rtx);
>    create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset)));
> @@ -3546,9 +3546,23 @@ expand_scatter_store_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
>    create_input_operand (&ops[i++], rhs_rtx, TYPE_MODE (TREE_TYPE (rhs)));
>    if (mask_index >= 0)
>      {
> +      if (optab == len_mask_scatter_store_optab)
> +	{
> +	  tree len = gimple_call_arg (stmt, internal_fn_len_index (ifn));
> +	  rtx len_rtx = expand_normal (len);
> +	  create_convert_operand_from (&ops[i++], len_rtx,
> +				       TYPE_MODE (TREE_TYPE (len)),
> +				       TYPE_UNSIGNED (TREE_TYPE (len)));
> +	}
>        tree mask = gimple_call_arg (stmt, mask_index);
>        rtx mask_rtx = expand_normal (mask);
>        create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask)));
> +      if (optab == len_mask_scatter_store_optab)
> +	{
> +	  tree biast = gimple_call_arg (stmt, gimple_call_num_args (stmt) - 1);
> +	  rtx bias = expand_normal (biast);
> +	  create_input_operand (&ops[i++], bias, QImode);
> +	}
>      }

Guess this is personal preference, but IMO it would be more natural to have:

  if (optab == len_mask_scatter_store_optab)
    {
      ...
    }
  if (mask_index >= 0)
    {
      ...
    }
  if (optab == len_mask_scatter_store_optab)
    {
      ...
    }

since that's the structure we'd need if LEN_GATHER_LOAD was a thing.
Also...

>  
>    insn_code icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE (rhs)),
> @@ -3559,7 +3573,7 @@ expand_scatter_store_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
>  /* Expand {MASK_,}GATHER_LOAD call CALL using optab OPTAB.  */
>  
>  static void
> -expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
> +expand_gather_load_optab_fn (internal_fn ifn, gcall *stmt, direct_optab optab)
>  {
>    tree lhs = gimple_call_lhs (stmt);
>    tree base = gimple_call_arg (stmt, 0);
> @@ -3572,7 +3586,7 @@ expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
>    HOST_WIDE_INT scale_int = tree_to_shwi (scale);
>  
>    int i = 0;
> -  class expand_operand ops[6];
> +  class expand_operand ops[8];
>    create_output_operand (&ops[i++], lhs_rtx, TYPE_MODE (TREE_TYPE (lhs)));
>    create_address_operand (&ops[i++], base_rtx);
>    create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset)));
> @@ -3584,6 +3598,20 @@ expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
>        rtx mask_rtx = expand_normal (mask);
>        create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask)));
>      }
> +  else if (optab == len_mask_gather_load_optab)
> +    {
> +      tree len = gimple_call_arg (stmt, internal_fn_len_index (ifn));
> +      rtx len_rtx = expand_normal (len);
> +      create_convert_operand_from (&ops[i++], len_rtx,
> +				   TYPE_MODE (TREE_TYPE (len)),
> +				   TYPE_UNSIGNED (TREE_TYPE (len)));
> +      tree mask = gimple_call_arg (stmt, internal_fn_mask_index (ifn));
> +      rtx mask_rtx = expand_normal (mask);
> +      create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask)));
> +      tree biast = gimple_call_arg (stmt, gimple_call_num_args (stmt) - 1);
> +      rtx bias = expand_normal (biast);
> +      create_input_operand (&ops[i++], bias, QImode);
> +    }

...I think we've now got too many copies of this construct: the two above,
plus expand_partial_load_optab_fn and expand_partial_store_optab_fn.
It would be good to have a helper along the lines of:

static unsigned int
add_mask_and_len_args (expand_operand *ops, unsigned int opno,
		       gcall *stmt)
{
  ...
}

that makes use of internal_fn_len_index and internal_fn_mask_index.
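
Roughly, an untested sketch (reusing only calls that already appear in this
patch, and keeping the current len/mask/bias argument order) could be:

static unsigned int
add_mask_and_len_args (expand_operand *ops, unsigned int opno, gcall *stmt)
{
  internal_fn ifn = gimple_call_internal_fn (stmt);
  int len_index = internal_fn_len_index (ifn);
  int mask_index = internal_fn_mask_index (ifn);
  if (len_index >= 0)
    {
      /* Convert the length to whatever mode the pattern expects.  */
      tree len = gimple_call_arg (stmt, len_index);
      rtx len_rtx = expand_normal (len);
      create_convert_operand_from (&ops[opno++], len_rtx,
				   TYPE_MODE (TREE_TYPE (len)),
				   TYPE_UNSIGNED (TREE_TYPE (len)));
    }
  if (mask_index >= 0)
    {
      tree mask = gimple_call_arg (stmt, mask_index);
      create_input_operand (&ops[opno++], expand_normal (mask),
			    TYPE_MODE (TREE_TYPE (mask)));
    }
  if (len_index >= 0)
    {
      /* The bias is currently the last argument of the call.  */
      tree bias = gimple_call_arg (stmt, gimple_call_num_args (stmt) - 1);
      create_input_operand (&ops[opno++], expand_normal (bias), QImode);
    }
  return opno;
}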

Don't shoot me, but I think we might have made a mistake by putting
the mask between the length and the bias.  It seems more natural
to keep the length-related arguments consecutive, so that any bias
argument is always immediately after any length argument.

Would you mind doing a separate patch to swap the len_maskload/len_maskstore
arguments around?  “mask, len, bias” or “len, bias, mask” would be OK,
although I suppose “len, bias, mask” fits the order in the name.

>    insn_code icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs)),
>  					   TYPE_MODE (TREE_TYPE (offset)));
>    expand_insn (icode, i, ops);
> @@ -4434,6 +4462,7 @@ internal_load_fn_p (internal_fn fn)
>      case IFN_MASK_LOAD_LANES:
>      case IFN_GATHER_LOAD:
>      case IFN_MASK_GATHER_LOAD:
> +    case IFN_LEN_MASK_GATHER_LOAD:
>      case IFN_LEN_LOAD:
>      case IFN_LEN_MASK_LOAD:
>        return true;
> @@ -4455,6 +4484,7 @@ internal_store_fn_p (internal_fn fn)
>      case IFN_MASK_STORE_LANES:
>      case IFN_SCATTER_STORE:
>      case IFN_MASK_SCATTER_STORE:
> +    case IFN_LEN_MASK_SCATTER_STORE:
>      case IFN_LEN_STORE:
>      case IFN_LEN_MASK_STORE:
>        return true;
> @@ -4473,8 +4503,10 @@ internal_gather_scatter_fn_p (internal_fn fn)
>      {
>      case IFN_GATHER_LOAD:
>      case IFN_MASK_GATHER_LOAD:
> +    case IFN_LEN_MASK_GATHER_LOAD:
>      case IFN_SCATTER_STORE:
>      case IFN_MASK_SCATTER_STORE:
> +    case IFN_LEN_MASK_SCATTER_STORE:
>        return true;
>  
>      default:
> @@ -4504,6 +4536,34 @@ internal_fn_mask_index (internal_fn fn)
>      case IFN_LEN_MASK_STORE:
>        return 3;
>  
> +    case IFN_LEN_MASK_GATHER_LOAD:
> +    case IFN_LEN_MASK_SCATTER_STORE:
> +      return 5;
> +
> +    default:
> +      return (conditional_internal_fn_code (fn) != ERROR_MARK
> +	      || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);
> +    }
> +}
> +
> +/* If FN takes a vector len argument, return the index of that argument,
> +   otherwise return -1.  */
> +
> +int
> +internal_fn_len_index (internal_fn fn)
> +{
> +  switch (fn)
> +    {
> +    case IFN_LEN_LOAD:
> +    case IFN_LEN_STORE:
> +    case IFN_LEN_MASK_LOAD:
> +    case IFN_LEN_MASK_STORE:
> +      return 2;
> +
> +    case IFN_LEN_MASK_GATHER_LOAD:
> +    case IFN_LEN_MASK_SCATTER_STORE:
> +      return 4;
> +
>      default:
>        return (conditional_internal_fn_code (fn) != ERROR_MARK
>  	      || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);

This default case isn't correct for this function.
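Presumably a plain

      default:
        return -1;

is what's wanted here instead (just a guess at the intended fix).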

Thanks,
Richard

> @@ -4522,6 +4582,7 @@ internal_fn_stored_value_index (internal_fn fn)
>      case IFN_MASK_STORE_LANES:
>      case IFN_SCATTER_STORE:
>      case IFN_MASK_SCATTER_STORE:
> +    case IFN_LEN_MASK_SCATTER_STORE:
>      case IFN_LEN_STORE:
>        return 3;
>  
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index bc947c0fde7..5be24decf88 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -48,14 +48,14 @@ along with GCC; see the file COPYING3.  If not see
>     - mask_load: currently just maskload
>     - load_lanes: currently just vec_load_lanes
>     - mask_load_lanes: currently just vec_mask_load_lanes
> -   - gather_load: used for {mask_,}gather_load
> +   - gather_load: used for {mask_,len_mask_,}gather_load
>     - len_load: currently just len_load
>     - len_maskload: currently just len_maskload
>  
>     - mask_store: currently just maskstore
>     - store_lanes: currently just vec_store_lanes
>     - mask_store_lanes: currently just vec_mask_store_lanes
> -   - scatter_store: used for {mask_,}scatter_store
> +   - scatter_store: used for {mask_,len_mask_,}scatter_store
>     - len_store: currently just len_store
>     - len_maskstore: currently just len_maskstore
>  
> @@ -157,6 +157,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE,
>  DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load)
>  DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
>  		       mask_gather_load, gather_load)
> +DEF_INTERNAL_OPTAB_FN (LEN_MASK_GATHER_LOAD, ECF_PURE,
> +		       len_mask_gather_load, gather_load)
>  
>  DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load)
>  DEF_INTERNAL_OPTAB_FN (LEN_MASK_LOAD, ECF_PURE, len_maskload, len_maskload)
> @@ -164,6 +166,8 @@ DEF_INTERNAL_OPTAB_FN (LEN_MASK_LOAD, ECF_PURE, len_maskload, len_maskload)
>  DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store)
>  DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0,
>  		       mask_scatter_store, scatter_store)
> +DEF_INTERNAL_OPTAB_FN (LEN_MASK_SCATTER_STORE, 0,
> +		       len_mask_scatter_store, scatter_store)
>  
>  DEF_INTERNAL_OPTAB_FN (MASK_STORE, 0, maskstore, mask_store)
>  DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 8f21068e300..4234bbfed87 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -234,6 +234,7 @@ extern bool internal_load_fn_p (internal_fn);
>  extern bool internal_store_fn_p (internal_fn);
>  extern bool internal_gather_scatter_fn_p (internal_fn);
>  extern int internal_fn_mask_index (internal_fn);
> +extern int internal_fn_len_index (internal_fn);
>  extern int internal_fn_stored_value_index (internal_fn);
>  extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
>  						    tree, tree, int);
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 9533eb11565..58933e61817 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -95,8 +95,10 @@ OPTAB_CD(len_maskload_optab, "len_maskload$a$b")
>  OPTAB_CD(len_maskstore_optab, "len_maskstore$a$b")
>  OPTAB_CD(gather_load_optab, "gather_load$a$b")
>  OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b")
> +OPTAB_CD(len_mask_gather_load_optab, "len_mask_gather_load$a$b")
>  OPTAB_CD(scatter_store_optab, "scatter_store$a$b")
>  OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b")
> +OPTAB_CD(len_mask_scatter_store_optab, "len_mask_scatter_store$a$b")
>  OPTAB_CD(vec_extract_optab, "vec_extract$a$b")
>  OPTAB_CD(vec_init_optab, "vec_init$a$b")
