[PATCH V2] VECT: Add COND_LEN_* operations for loop control with length targets

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH V2] VECT: Add COND_LEN_* operations for loop control with length targets
@ 2023-07-10 11:35 juzhe.zhong
  2023-07-11  0:59 ` juzhe.zhong
  2023-07-11 11:00 ` Richard Biener
  0 siblings, 2 replies; 4+ messages in thread
From: juzhe.zhong @ 2023-07-10 11:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.sandiford, rguenther, Ju-Zhe Zhong

From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richard and Richi.

This patch adds cond_len_* operation patterns for targets that support loop control with length.

These patterns will be used in the following cases:

1. Integer division:
   void
   f (int32_t *restrict a, int32_t *restrict b, int32_t *restrict c, int n)
   {
     for (int i = 0; i < n; ++i)
      {
        a[i] = b[i] / c[i];
      }
   }

  ARM SVE IR:
  
  ...
  max_mask_36 = .WHILE_ULT (0, bnd.5_32, { 0, ... });

  Loop:
  ...
  # loop_mask_29 = PHI <next_mask_37(4), max_mask_36(3)>
  ...
  vect__4.8_28 = .MASK_LOAD (_33, 32B, loop_mask_29);
  ...
  vect__6.11_25 = .MASK_LOAD (_20, 32B, loop_mask_29);
  vect__8.12_24 = .COND_DIV (loop_mask_29, vect__4.8_28, vect__6.11_25, vect__4.8_28);
  ...
  .MASK_STORE (_1, 32B, loop_mask_29, vect__8.12_24);
  ...
  next_mask_37 = .WHILE_ULT (_2, bnd.5_32, { 0, ... });
  ...
  
  For targets like RVV that support loop control with length, we want to see IR as follows:
  
  Loop:
  ...
  # loop_len_29 = SELECT_VL
  ...
  vect__4.8_28 = .LEN_MASK_LOAD (_33, 32B, loop_len_29);
  ...
  vect__6.11_25 = .LEN_MASK_LOAD (_20, 32B, loop_len_29);
  vect__8.12_24 = .COND_LEN_DIV (dummy_mask, vect__4.8_28, vect__6.11_25, vect__4.8_28, loop_len_29, bias);
  ...
  .LEN_MASK_STORE (_1, 32B, loop_len_29, vect__8.12_24);
  ...
  next_mask_37 = .WHILE_ULT (_2, bnd.5_32, { 0, ... });
  ...
  
  Note that here we use an all-ones mask: dummy_mask = { -1, -1, ..., -1 }

2. Integer conditional division:
   Similar to case (1) but with a condition:
   void
   f (int32_t *restrict a, int32_t *restrict b, int32_t *restrict c, int32_t * cond, int n)
   {
     for (int i = 0; i < n; ++i)
       {
         if (cond[i])
         a[i] = b[i] / c[i];
       }
   }
   
   ARM SVE:
   ...
   max_mask_76 = .WHILE_ULT (0, bnd.6_52, { 0, ... });

   Loop:
   ...
   # loop_mask_55 = PHI <next_mask_77(5), max_mask_76(4)>
   ...
   vect__4.9_56 = .MASK_LOAD (_51, 32B, loop_mask_55);
   mask__29.10_58 = vect__4.9_56 != { 0, ... };
   vec_mask_and_61 = loop_mask_55 & mask__29.10_58;
   ...
   vect__6.13_62 = .MASK_LOAD (_24, 32B, vec_mask_and_61);
   ...
   vect__8.16_66 = .MASK_LOAD (_1, 32B, vec_mask_and_61);
   vect__10.17_68 = .COND_DIV (vec_mask_and_61, vect__6.13_62, vect__8.16_66, vect__6.13_62);
   ...
   .MASK_STORE (_2, 32B, vec_mask_and_61, vect__10.17_68);
   ...
   next_mask_77 = .WHILE_ULT (_3, bnd.6_52, { 0, ... });
   
   Here, ARM SVE uses vec_mask_and_61 = loop_mask_55 & mask__29.10_58; to guarantee the correct result.
   
   However, targets with length control cannot perform this elegant flow; for RVV, we would expect:
   
   Loop:
   ...
   loop_len_55 = SELECT_VL
   ...
   mask__29.10_58 = vect__4.9_56 != { 0, ... };
   ...
   vect__10.17_68 = .COND_LEN_DIV (mask__29.10_58, vect__6.13_62, vect__8.16_66, vect__6.13_62, loop_len_55, bias);
   ...

   Here we expect COND_LEN_DIV predicated by a real mask which is the outcome of comparison: mask__29.10_58 = vect__4.9_56 != { 0, ... };
   and a real length which is produced by loop control : loop_len_55 = SELECT_VL
   
3. Conditional floating-point operations (no -ffast-math):
   
    void
    f (float *restrict a, float *restrict b, int32_t *restrict cond, int n)
    {
      for (int i = 0; i < n; ++i)
        {
          if (cond[i])
          a[i] = b[i] + a[i];
        }
    }
  
  ARM SVE IR:
  max_mask_70 = .WHILE_ULT (0, bnd.6_46, { 0, ... });

  ...
  # loop_mask_49 = PHI <next_mask_71(4), max_mask_70(3)>
  ...
  mask__27.10_52 = vect__4.9_50 != { 0, ... };
  vec_mask_and_55 = loop_mask_49 & mask__27.10_52;
  ...
  vect__9.17_62 = .COND_ADD (vec_mask_and_55, vect__6.13_56, vect__8.16_60, vect__6.13_56);
  ...
  next_mask_71 = .WHILE_ULT (_22, bnd.6_46, { 0, ... });
  ...
  
  For RVV, we would expect IR:
  
  ...
  loop_len_49 = SELECT_VL
  ...
  mask__27.10_52 = vect__4.9_50 != { 0, ... };
  ...
  vect__9.17_62 = .COND_LEN_ADD (mask__27.10_52, vect__6.13_56, vect__8.16_60, vect__6.13_56, loop_len_49, bias);
  ...

4. Conditional un-ordered reduction:
   
   int32_t
   f (int32_t *restrict a, 
   int32_t *restrict cond, int n)
   {
     int32_t result = 0;
     for (int i = 0; i < n; ++i)
       {
           if (cond[i])
         result += a[i];
       }
     return result;
   }
   
   ARM SVE IR:
     
     Loop:
     # vect_result_18.7_37 = PHI <vect__33.16_51(4), { 0, ... }(3)>
     ...
     # loop_mask_40 = PHI <next_mask_58(4), max_mask_57(3)>
     ...
     mask__17.11_43 = vect__4.10_41 != { 0, ... };
     vec_mask_and_46 = loop_mask_40 & mask__17.11_43;
     ...
     vect__33.16_51 = .COND_ADD (vec_mask_and_46, vect_result_18.7_37, vect__7.14_47, vect_result_18.7_37);
     ...
     next_mask_58 = .WHILE_ULT (_15, bnd.6_36, { 0, ... });
     ...
   
     Epilogue:
     _53 = .REDUC_PLUS (vect__33.16_51); [tail call]
   
   For RVV, we expect:
     
    Loop:
     # vect_result_18.7_37 = PHI <vect__33.16_51(4), { 0, ... }(3)>
     ...
     loop_len_40 = SELECT_VL
     ...
     mask__17.11_43 = vect__4.10_41 != { 0, ... };
     ...
     vect__33.16_51 = .COND_LEN_ADD (mask__17.11_43, vect_result_18.7_37, vect__7.14_47, vect_result_18.7_37, loop_len_40, bias);
     ...
     next_mask_58 = .WHILE_ULT (_15, bnd.6_36, { 0, ... });
     ...
   
     Epilogue:
     _53 = .REDUC_PLUS (vect__33.16_51); [tail call]
     
     I named these patterns "cond_len_*" because I want the length operand to come after the mask operand, with all other operands
     in the same order as in the "cond_*" patterns. This ordering will make the subsequent loop vectorizer support easier.

gcc/ChangeLog:

        * doc/md.texi: Add COND_LEN_* operations for loop control with length.
        * internal-fn.cc (cond_len_unary_direct): Ditto.
        (cond_len_binary_direct): Ditto.
        (cond_len_ternary_direct): Ditto.
        (expand_cond_len_unary_optab_fn): Ditto.
        (expand_cond_len_binary_optab_fn): Ditto.
        (expand_cond_len_ternary_optab_fn): Ditto.
        (direct_cond_len_unary_optab_supported_p): Ditto.
        (direct_cond_len_binary_optab_supported_p): Ditto.
        (direct_cond_len_ternary_optab_supported_p): Ditto.
        * internal-fn.def (COND_LEN_ADD): Ditto.
        (COND_LEN_SUB): Ditto.
        (COND_LEN_MUL): Ditto.
        (COND_LEN_DIV): Ditto.
        (COND_LEN_MOD): Ditto.
        (COND_LEN_RDIV): Ditto.
        (COND_LEN_MIN): Ditto.
        (COND_LEN_MAX): Ditto.
        (COND_LEN_FMIN): Ditto.
        (COND_LEN_FMAX): Ditto.
        (COND_LEN_AND): Ditto.
        (COND_LEN_IOR): Ditto.
        (COND_LEN_XOR): Ditto.
        (COND_LEN_SHL): Ditto.
        (COND_LEN_SHR): Ditto.
        (COND_LEN_FMA): Ditto.
        (COND_LEN_FMS): Ditto.
        (COND_LEN_FNMA): Ditto.
        (COND_LEN_FNMS): Ditto.
        (COND_LEN_NEG): Ditto.
        * optabs.def (OPTAB_D): Ditto.

---
 gcc/doc/md.texi     | 80 +++++++++++++++++++++++++++++++++++++++++++++
 gcc/internal-fn.cc  | 15 +++++++++
 gcc/internal-fn.def | 38 +++++++++++++++++++++
 gcc/optabs.def      | 24 ++++++++++++++
 4 files changed, 157 insertions(+)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index b30a824488b..cbcb992e5d7 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -7234,6 +7234,86 @@ for (i = 0; i < GET_MODE_NUNITS (@var{m}); i++)
   op0[i] = op1[i] ? fma (op2[i], op3[i], op4[i]) : op5[i];
 @end smallexample
 
+@cindex @code{cond_len_add@var{mode}} instruction pattern
+@cindex @code{cond_len_sub@var{mode}} instruction pattern
+@cindex @code{cond_len_mul@var{mode}} instruction pattern
+@cindex @code{cond_len_div@var{mode}} instruction pattern
+@cindex @code{cond_len_udiv@var{mode}} instruction pattern
+@cindex @code{cond_len_mod@var{mode}} instruction pattern
+@cindex @code{cond_len_umod@var{mode}} instruction pattern
+@cindex @code{cond_len_and@var{mode}} instruction pattern
+@cindex @code{cond_len_ior@var{mode}} instruction pattern
+@cindex @code{cond_len_xor@var{mode}} instruction pattern
+@cindex @code{cond_len_smin@var{mode}} instruction pattern
+@cindex @code{cond_len_smax@var{mode}} instruction pattern
+@cindex @code{cond_len_umin@var{mode}} instruction pattern
+@cindex @code{cond_len_umax@var{mode}} instruction pattern
+@cindex @code{cond_len_fmin@var{mode}} instruction pattern
+@cindex @code{cond_len_fmax@var{mode}} instruction pattern
+@cindex @code{cond_len_ashl@var{mode}} instruction pattern
+@cindex @code{cond_len_ashr@var{mode}} instruction pattern
+@cindex @code{cond_len_lshr@var{mode}} instruction pattern
+@item @samp{cond_len_add@var{mode}}
+@itemx @samp{cond_len_sub@var{mode}}
+@itemx @samp{cond_len_mul@var{mode}}
+@itemx @samp{cond_len_div@var{mode}}
+@itemx @samp{cond_len_udiv@var{mode}}
+@itemx @samp{cond_len_mod@var{mode}}
+@itemx @samp{cond_len_umod@var{mode}}
+@itemx @samp{cond_len_and@var{mode}}
+@itemx @samp{cond_len_ior@var{mode}}
+@itemx @samp{cond_len_xor@var{mode}}
+@itemx @samp{cond_len_smin@var{mode}}
+@itemx @samp{cond_len_smax@var{mode}}
+@itemx @samp{cond_len_umin@var{mode}}
+@itemx @samp{cond_len_umax@var{mode}}
+@itemx @samp{cond_len_fmin@var{mode}}
+@itemx @samp{cond_len_fmax@var{mode}}
+@itemx @samp{cond_len_ashl@var{mode}}
+@itemx @samp{cond_len_ashr@var{mode}}
+@itemx @samp{cond_len_lshr@var{mode}}
+When operand 1 is true and element index < operand 5 + operand 6, perform an operation on operands 2 and 3 and
+store the result in operand 0, otherwise store operand 4 in operand 0.
+The operation only works when the operands are vectors.
+
+@smallexample
+for (i = 0; i < ops[5] + ops[6]; i++)
+  op0[i] = op1[i] ? op2[i] @var{op} op3[i] : op4[i];
+@end smallexample
+
+where, for example, @var{op} is @code{+} for @samp{cond_len_add@var{mode}}.
+
+When defined for floating-point modes, the contents of @samp{op3[i]}
+are not interpreted if @samp{op1[i]} is false, just like they would not
+be in a normal C @samp{?:} condition.
+
+Operands 0, 2, 3 and 4 all have mode @var{m}.  Operand 1 is a scalar
+integer if @var{m} is scalar, otherwise it has the mode returned by
+@code{TARGET_VECTORIZE_GET_MASK_MODE}.  Operand 5 has whichever
+integer mode the target prefers.
+
+@samp{cond_len_@var{op}@var{mode}} generally corresponds to a conditional
+form of @samp{@var{op}@var{mode}3}.  As an exception, the vector forms
+of shifts correspond to patterns like @code{vashl@var{mode}3} rather
+than patterns like @code{ashl@var{mode}3}.
+
+@cindex @code{cond_len_fma@var{mode}} instruction pattern
+@cindex @code{cond_len_fms@var{mode}} instruction pattern
+@cindex @code{cond_len_fnma@var{mode}} instruction pattern
+@cindex @code{cond_len_fnms@var{mode}} instruction pattern
+@item @samp{cond_len_fma@var{mode}}
+@itemx @samp{cond_len_fms@var{mode}}
+@itemx @samp{cond_len_fnma@var{mode}}
+@itemx @samp{cond_len_fnms@var{mode}}
+Like @samp{cond_len_add@var{m}}, except that the conditional operation
+takes 3 operands rather than two.  For example, the vector form of
+@samp{cond_len_fma@var{mode}} is equivalent to:
+
+@smallexample
+for (i = 0; i < ops[6] + ops[7]; i++)
+  op0[i] = op1[i] ? fma (op2[i], op3[i], op4[i]) : op5[i];
+@end smallexample
+
 @cindex @code{neg@var{mode}cc} instruction pattern
 @item @samp{neg@var{mode}cc}
 Similar to @samp{mov@var{mode}cc} but for conditional negation.  Conditionally
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 278db7b1805..f9aaf66cf2a 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -183,6 +183,9 @@ init_internal_fns ()
 #define cond_unary_direct { 1, 1, true }
 #define cond_binary_direct { 1, 1, true }
 #define cond_ternary_direct { 1, 1, true }
+#define cond_len_unary_direct { 1, 1, true }
+#define cond_len_binary_direct { 1, 1, true }
+#define cond_len_ternary_direct { 1, 1, true }
 #define while_direct { 0, 2, false }
 #define fold_extract_direct { 2, 2, false }
 #define fold_left_direct { 1, 1, false }
@@ -3869,6 +3872,15 @@ expand_convert_optab_fn (internal_fn fn, gcall *stmt, convert_optab optab,
 #define expand_cond_ternary_optab_fn(FN, STMT, OPTAB) \
   expand_direct_optab_fn (FN, STMT, OPTAB, 5)
 
+#define expand_cond_len_unary_optab_fn(FN, STMT, OPTAB) \
+  expand_direct_optab_fn (FN, STMT, OPTAB, 5)
+
+#define expand_cond_len_binary_optab_fn(FN, STMT, OPTAB) \
+  expand_direct_optab_fn (FN, STMT, OPTAB, 6)
+
+#define expand_cond_len_ternary_optab_fn(FN, STMT, OPTAB) \
+  expand_direct_optab_fn (FN, STMT, OPTAB, 7)
+
 #define expand_fold_extract_optab_fn(FN, STMT, OPTAB) \
   expand_direct_optab_fn (FN, STMT, OPTAB, 3)
 
@@ -3964,6 +3976,9 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_cond_unary_optab_supported_p direct_optab_supported_p
 #define direct_cond_binary_optab_supported_p direct_optab_supported_p
 #define direct_cond_ternary_optab_supported_p direct_optab_supported_p
+#define direct_cond_len_unary_optab_supported_p direct_optab_supported_p
+#define direct_cond_len_binary_optab_supported_p direct_optab_supported_p
+#define direct_cond_len_ternary_optab_supported_p direct_optab_supported_p
 #define direct_mask_load_optab_supported_p convert_optab_supported_p
 #define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p
 #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 238b7ee0bc9..ea750a921ed 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -72,6 +72,10 @@ along with GCC; see the file COPYING3.  If not see
    - fold_left: for scalar = FN (scalar, vector), keyed off the vector mode
    - check_ptrs: used for check_{raw,war}_ptrs
 
+   - cond_len_unary: a conditional unary optab, such as cond_len_neg<mode>
+   - cond_len_binary: a conditional binary optab, such as cond_len_add<mode>
+   - cond_len_ternary: a conditional ternary optab, such as cond_len_fma_rev<mode>
+
    DEF_INTERNAL_SIGNED_OPTAB_FN defines an internal function that
    maps to one of two optabs, depending on the signedness of an input.
    SIGNED_OPTAB and UNSIGNED_OPTAB are the optabs for signed and
@@ -248,6 +252,40 @@ DEF_INTERNAL_OPTAB_FN (COND_FNMS, ECF_CONST, cond_fnms, cond_ternary)
 
 DEF_INTERNAL_OPTAB_FN (COND_NEG, ECF_CONST, cond_neg, cond_unary)
 
+DEF_INTERNAL_OPTAB_FN (COND_LEN_ADD, ECF_CONST, cond_len_add, cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_SUB, ECF_CONST, cond_len_sub, cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_MUL, ECF_CONST, cond_len_smul, cond_len_binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_DIV, ECF_CONST, first, cond_len_sdiv,
+			      cond_len_udiv, cond_len_binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_MOD, ECF_CONST, first, cond_len_smod,
+			      cond_len_umod, cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_RDIV, ECF_CONST, cond_len_sdiv, cond_len_binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_MIN, ECF_CONST, first, cond_len_smin,
+			      cond_len_umin, cond_len_binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_MAX, ECF_CONST, first, cond_len_smax,
+			      cond_len_umax, cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_FMIN, ECF_CONST, cond_len_fmin, cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_FMAX, ECF_CONST, cond_len_fmax, cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_AND, ECF_CONST | ECF_NOTHROW, cond_len_and,
+		       cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_IOR, ECF_CONST | ECF_NOTHROW, cond_len_ior,
+		       cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_XOR, ECF_CONST | ECF_NOTHROW, cond_len_xor,
+		       cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_SHL, ECF_CONST | ECF_NOTHROW, cond_len_ashl,
+		       cond_len_binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_SHR, ECF_CONST | ECF_NOTHROW, first,
+			      cond_len_ashr, cond_len_lshr, cond_len_binary)
+
+DEF_INTERNAL_OPTAB_FN (COND_LEN_FMA, ECF_CONST, cond_len_fma, cond_len_ternary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_FMS, ECF_CONST, cond_len_fms, cond_len_ternary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_FNMA, ECF_CONST, cond_len_fnma,
+		       cond_len_ternary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_FNMS, ECF_CONST, cond_len_fnms,
+		       cond_len_ternary)
+
+DEF_INTERNAL_OPTAB_FN (COND_LEN_NEG, ECF_CONST, cond_len_neg, cond_len_unary)
+
 DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary)
 
 DEF_INTERNAL_OPTAB_FN (REDUC_PLUS, ECF_CONST | ECF_NOTHROW,
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 73c9a0c760f..3dae228fba6 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -254,6 +254,30 @@ OPTAB_D (cond_fms_optab, "cond_fms$a")
 OPTAB_D (cond_fnma_optab, "cond_fnma$a")
 OPTAB_D (cond_fnms_optab, "cond_fnms$a")
 OPTAB_D (cond_neg_optab, "cond_neg$a")
+OPTAB_D (cond_len_add_optab, "cond_len_add$a")
+OPTAB_D (cond_len_sub_optab, "cond_len_sub$a")
+OPTAB_D (cond_len_smul_optab, "cond_len_mul$a")
+OPTAB_D (cond_len_sdiv_optab, "cond_len_div$a")
+OPTAB_D (cond_len_smod_optab, "cond_len_mod$a")
+OPTAB_D (cond_len_udiv_optab, "cond_len_udiv$a")
+OPTAB_D (cond_len_umod_optab, "cond_len_umod$a")
+OPTAB_D (cond_len_and_optab, "cond_len_and$a")
+OPTAB_D (cond_len_ior_optab, "cond_len_ior$a")
+OPTAB_D (cond_len_xor_optab, "cond_len_xor$a")
+OPTAB_D (cond_len_ashl_optab, "cond_len_ashl$a")
+OPTAB_D (cond_len_ashr_optab, "cond_len_ashr$a")
+OPTAB_D (cond_len_lshr_optab, "cond_len_lshr$a")
+OPTAB_D (cond_len_smin_optab, "cond_len_smin$a")
+OPTAB_D (cond_len_smax_optab, "cond_len_smax$a")
+OPTAB_D (cond_len_umin_optab, "cond_len_umin$a")
+OPTAB_D (cond_len_umax_optab, "cond_len_umax$a")
+OPTAB_D (cond_len_fmin_optab, "cond_len_fmin$a")
+OPTAB_D (cond_len_fmax_optab, "cond_len_fmax$a")
+OPTAB_D (cond_len_fma_optab, "cond_len_fma$a")
+OPTAB_D (cond_len_fms_optab, "cond_len_fms$a")
+OPTAB_D (cond_len_fnma_optab, "cond_len_fnma$a")
+OPTAB_D (cond_len_fnms_optab, "cond_len_fnms$a")
+OPTAB_D (cond_len_neg_optab, "cond_len_neg$a")
 OPTAB_D (cmov_optab, "cmov$a6")
 OPTAB_D (cstore_optab, "cstore$a4")
 OPTAB_D (ctrap_optab, "ctrap$a4")
-- 
2.36.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH V2] VECT: Add COND_LEN_* operations for loop control with length targets
  2023-07-10 11:35 [PATCH V2] VECT: Add COND_LEN_* operations for loop control with length targets juzhe.zhong
@ 2023-07-11  0:59 ` juzhe.zhong
  2023-07-11 11:00 ` Richard Biener
  1 sibling, 0 replies; 4+ messages in thread
From: juzhe.zhong @ 2023-07-11  0:59 UTC (permalink / raw)
  To: 钟居哲, gcc-patches; +Cc: richard.sandiford, rguenther

[-- Attachment #1: Type: text/plain, Size: 18605 bytes --]

Bootstrapped and regression-tested on x86 last night with no unexpected failures.

This patch already includes the 'BIAS' argument.

Ok for trunk ?


juzhe.zhong@rivai.ai
 
From: juzhe.zhong
Date: 2023-07-10 19:35
To: gcc-patches
CC: richard.sandiford; rguenther; Ju-Zhe Zhong
Subject: [PATCH V2] VECT: Add COND_LEN_* operations for loop control with length targets
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
 
Hi, Richard and Richi.
 
This patch is adding cond_len_* operations pattern for target support loop control with length.
 
These patterns will be used in these following case:
 
1. Integer division:
   void
   f (int32_t *restrict a, int32_t *restrict b, int32_t *restrict c, int n)
   {
     for (int i = 0; i < n; ++i)
      {
        a[i] = b[i] / c[i];
      }
   }
 
  ARM SVE IR:
  
  ...
  max_mask_36 = .WHILE_ULT (0, bnd.5_32, { 0, ... });
 
  Loop:
  ...
  # loop_mask_29 = PHI <next_mask_37(4), max_mask_36(3)>
  ...
  vect__4.8_28 = .MASK_LOAD (_33, 32B, loop_mask_29);
  ...
  vect__6.11_25 = .MASK_LOAD (_20, 32B, loop_mask_29);
  vect__8.12_24 = .COND_DIV (loop_mask_29, vect__4.8_28, vect__6.11_25, vect__4.8_28);
  ...
  .MASK_STORE (_1, 32B, loop_mask_29, vect__8.12_24);
  ...
  next_mask_37 = .WHILE_ULT (_2, bnd.5_32, { 0, ... });
  ...
  
  For target like RVV who support loop control with length, we want to see IR as follows:
  
  Loop:
  ...
  # loop_len_29 = SELECT_VL
  ...
  vect__4.8_28 = .LEN_MASK_LOAD (_33, 32B, loop_len_29);
  ...
  vect__6.11_25 = .LEN_MASK_LOAD (_20, 32B, loop_len_29);
  vect__8.12_24 = .COND_LEN_DIV (dummp_mask, vect__4.8_28, vect__6.11_25, vect__4.8_28, loop_len_29, bias);
  ...
  .LEN_MASK_STORE (_1, 32B, loop_len_29, vect__8.12_24);
  ...
  next_mask_37 = .WHILE_ULT (_2, bnd.5_32, { 0, ... });
  ...
  
  Notice here, we use dummp_mask = { -1, -1, .... , -1 }
 
2. Integer conditional division:
   Similar case with (1) but with condtion:
   void
   f (int32_t *restrict a, int32_t *restrict b, int32_t *restrict c, int32_t * cond, int n)
   {
     for (int i = 0; i < n; ++i)
       {
         if (cond[i])
         a[i] = b[i] / c[i];
       }
   }
   
   ARM SVE:
   ...
   max_mask_76 = .WHILE_ULT (0, bnd.6_52, { 0, ... });
 
   Loop:
   ...
   # loop_mask_55 = PHI <next_mask_77(5), max_mask_76(4)>
   ...
   vect__4.9_56 = .MASK_LOAD (_51, 32B, loop_mask_55);
   mask__29.10_58 = vect__4.9_56 != { 0, ... };
   vec_mask_and_61 = loop_mask_55 & mask__29.10_58;
   ...
   vect__6.13_62 = .MASK_LOAD (_24, 32B, vec_mask_and_61);
   ...
   vect__8.16_66 = .MASK_LOAD (_1, 32B, vec_mask_and_61);
   vect__10.17_68 = .COND_DIV (vec_mask_and_61, vect__6.13_62, vect__8.16_66, vect__6.13_62);
   ...
   .MASK_STORE (_2, 32B, vec_mask_and_61, vect__10.17_68);
   ...
   next_mask_77 = .WHILE_ULT (_3, bnd.6_52, { 0, ... });
   
   Here, ARM SVE use vec_mask_and_61 = loop_mask_55 & mask__29.10_58; to gurantee the correct result.
   
   However, target with length control can not perform this elegant flow, for RVV, we would expect:
   
   Loop:
   ...
   loop_len_55 = SELECT_VL
   ...
   mask__29.10_58 = vect__4.9_56 != { 0, ... };
   ...
   vect__10.17_68 = .COND_LEN_DIV (mask__29.10_58, vect__6.13_62, vect__8.16_66, vect__6.13_62, loop_len_55, bias);
   ...
 
   Here we expect COND_LEN_DIV predicated by a real mask which is the outcome of comparison: mask__29.10_58 = vect__4.9_56 != { 0, ... };
   and a real length which is produced by loop control : loop_len_55 = SELECT_VL
   
3. conditional Floating-point operations (no -ffast-math):
   
    void
    f (float *restrict a, float *restrict b, int32_t *restrict cond, int n)
    {
      for (int i = 0; i < n; ++i)
        {
          if (cond[i])
          a[i] = b[i] + a[i];
        }
    }
  
  ARM SVE IR:
  max_mask_70 = .WHILE_ULT (0, bnd.6_46, { 0, ... });
 
  ...
  # loop_mask_49 = PHI <next_mask_71(4), max_mask_70(3)>
  ...
  mask__27.10_52 = vect__4.9_50 != { 0, ... };
  vec_mask_and_55 = loop_mask_49 & mask__27.10_52;
  ...
  vect__9.17_62 = .COND_ADD (vec_mask_and_55, vect__6.13_56, vect__8.16_60, vect__6.13_56);
  ...
  next_mask_71 = .WHILE_ULT (_22, bnd.6_46, { 0, ... });
  ...
  
  For RVV, we would expect IR:
  
  ...
  loop_len_49 = SELECT_VL
  ...
  mask__27.10_52 = vect__4.9_50 != { 0, ... };
  ...
  vect__9.17_62 = .COND_LEN_ADD (mask__27.10_52, vect__6.13_56, vect__8.16_60, vect__6.13_56, loop_len_49, bias);
  ...
 
4. Conditional un-ordered reduction:
   
   int32_t
   f (int32_t *restrict a, 
   int32_t *restrict cond, int n)
   {
     int32_t result = 0;
     for (int i = 0; i < n; ++i)
       {
           if (cond[i])
         result += a[i];
       }
     return result;
   }
   
   ARM SVE IR:
     
     Loop:
     # vect_result_18.7_37 = PHI <vect__33.16_51(4), { 0, ... }(3)>
     ...
     # loop_mask_40 = PHI <next_mask_58(4), max_mask_57(3)>
     ...
     mask__17.11_43 = vect__4.10_41 != { 0, ... };
     vec_mask_and_46 = loop_mask_40 & mask__17.11_43;
     ...
     vect__33.16_51 = .COND_ADD (vec_mask_and_46, vect_result_18.7_37, vect__7.14_47, vect_result_18.7_37);
     ...
     next_mask_58 = .WHILE_ULT (_15, bnd.6_36, { 0, ... });
     ...
   
     Epilogue:
     _53 = .REDUC_PLUS (vect__33.16_51); [tail call]
   
   For RVV, we expect:
     
    Loop:
     # vect_result_18.7_37 = PHI <vect__33.16_51(4), { 0, ... }(3)>
     ...
     loop_len_40 = SELECT_VL
     ...
     mask__17.11_43 = vect__4.10_41 != { 0, ... };
     ...
     vect__33.16_51 = .COND_LEN_ADD (mask__17.11_43, vect_result_18.7_37, vect__7.14_47, vect_result_18.7_37, loop_len_40, bias);
     ...
     next_mask_58 = .WHILE_ULT (_15, bnd.6_36, { 0, ... });
     ...
   
     Epilogue:
     _53 = .REDUC_PLUS (vect__33.16_51); [tail call]
     
     I name these patterns as "cond_len_*" since I want the length operand comes after mask operand and all other operands except length operand
     same order as "cond_*" patterns. Such order will make life easier in the following loop vectorizer support.
 
gcc/ChangeLog:
 
        * doc/md.texi: Add COND_LEN_* operations for loop control with length.
        * internal-fn.cc (cond_len_unary_direct): Ditto.
        (cond_len_binary_direct): Ditto.
        (cond_len_ternary_direct): Ditto.
        (expand_cond_len_unary_optab_fn): Ditto.
        (expand_cond_len_binary_optab_fn): Ditto.
        (expand_cond_len_ternary_optab_fn): Ditto.
        (direct_cond_len_unary_optab_supported_p): Ditto.
        (direct_cond_len_binary_optab_supported_p): Ditto.
        (direct_cond_len_ternary_optab_supported_p): Ditto.
        * internal-fn.def (COND_LEN_ADD): Ditto.
        (COND_LEN_SUB): Ditto.
        (COND_LEN_MUL): Ditto.
        (COND_LEN_DIV): Ditto.
        (COND_LEN_MOD): Ditto.
        (COND_LEN_RDIV): Ditto.
        (COND_LEN_MIN): Ditto.
        (COND_LEN_MAX): Ditto.
        (COND_LEN_FMIN): Ditto.
        (COND_LEN_FMAX): Ditto.
        (COND_LEN_AND): Ditto.
        (COND_LEN_IOR): Ditto.
        (COND_LEN_XOR): Ditto.
        (COND_LEN_SHL): Ditto.
        (COND_LEN_SHR): Ditto.
        (COND_LEN_FMA): Ditto.
        (COND_LEN_FMS): Ditto.
        (COND_LEN_FNMA): Ditto.
        (COND_LEN_FNMS): Ditto.
        (COND_LEN_NEG): Ditto.
        * optabs.def (OPTAB_D): Ditto.
 
---
gcc/doc/md.texi     | 80 +++++++++++++++++++++++++++++++++++++++++++++
gcc/internal-fn.cc  | 15 +++++++++
gcc/internal-fn.def | 38 +++++++++++++++++++++
gcc/optabs.def      | 24 ++++++++++++++
4 files changed, 157 insertions(+)
 
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index b30a824488b..cbcb992e5d7 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -7234,6 +7234,86 @@ for (i = 0; i < GET_MODE_NUNITS (@var{m}); i++)
   op0[i] = op1[i] ? fma (op2[i], op3[i], op4[i]) : op5[i];
@end smallexample
+@cindex @code{cond_len_add@var{mode}} instruction pattern
+@cindex @code{cond_len_sub@var{mode}} instruction pattern
+@cindex @code{cond_len_mul@var{mode}} instruction pattern
+@cindex @code{cond_len_div@var{mode}} instruction pattern
+@cindex @code{cond_len_udiv@var{mode}} instruction pattern
+@cindex @code{cond_len_mod@var{mode}} instruction pattern
+@cindex @code{cond_len_umod@var{mode}} instruction pattern
+@cindex @code{cond_len_and@var{mode}} instruction pattern
+@cindex @code{cond_len_ior@var{mode}} instruction pattern
+@cindex @code{cond_len_xor@var{mode}} instruction pattern
+@cindex @code{cond_len_smin@var{mode}} instruction pattern
+@cindex @code{cond_len_smax@var{mode}} instruction pattern
+@cindex @code{cond_len_umin@var{mode}} instruction pattern
+@cindex @code{cond_len_umax@var{mode}} instruction pattern
+@cindex @code{cond_len_fmin@var{mode}} instruction pattern
+@cindex @code{cond_len_fmax@var{mode}} instruction pattern
+@cindex @code{cond_len_ashl@var{mode}} instruction pattern
+@cindex @code{cond_len_ashr@var{mode}} instruction pattern
+@cindex @code{cond_len_lshr@var{mode}} instruction pattern
+@item @samp{cond_len_add@var{mode}}
+@itemx @samp{cond_len_sub@var{mode}}
+@itemx @samp{cond_len_mul@var{mode}}
+@itemx @samp{cond_len_div@var{mode}}
+@itemx @samp{cond_len_udiv@var{mode}}
+@itemx @samp{cond_len_mod@var{mode}}
+@itemx @samp{cond_len_umod@var{mode}}
+@itemx @samp{cond_len_and@var{mode}}
+@itemx @samp{cond_len_ior@var{mode}}
+@itemx @samp{cond_len_xor@var{mode}}
+@itemx @samp{cond_len_smin@var{mode}}
+@itemx @samp{cond_len_smax@var{mode}}
+@itemx @samp{cond_len_umin@var{mode}}
+@itemx @samp{cond_len_umax@var{mode}}
+@itemx @samp{cond_len_fmin@var{mode}}
+@itemx @samp{cond_len_fmax@var{mode}}
+@itemx @samp{cond_len_ashl@var{mode}}
+@itemx @samp{cond_len_ashr@var{mode}}
+@itemx @samp{cond_len_lshr@var{mode}}
+When operand 1 is true and element index < operand 5 + operand 6, perform an operation on operands 2 and 3 and
+store the result in operand 0, otherwise store operand 4 in operand 0.
+The operation only works for the operands are vectors.
+
+@smallexample
+for (i = 0; i < ops[5] + ops[6]; i++)
+  op0[i] = op1[i] ? op2[i] @var{op} op3[i] : op4[i];
+@end smallexample
+
+where, for example, @var{op} is @code{+} for @samp{cond_len_add@var{mode}}.
+
+When defined for floating-point modes, the contents of @samp{op3[i]}
+are not interpreted if @samp{op1[i]} is false, just like they would not
+be in a normal C @samp{?:} condition.
+
+Operands 0, 2, 3 and 4 all have mode @var{m}.  Operand 1 is a scalar
+integer if @var{m} is scalar, otherwise it has the mode returned by
+@code{TARGET_VECTORIZE_GET_MASK_MODE}.  Operand 5 has whichever
+integer mode the target prefers.
+
+@samp{cond_@var{op}@var{mode}} generally corresponds to a conditional
+form of @samp{@var{op}@var{mode}3}.  As an exception, the vector forms
+of shifts correspond to patterns like @code{vashl@var{mode}3} rather
+than patterns like @code{ashl@var{mode}3}.
+
+@cindex @code{cond_len_fma@var{mode}} instruction pattern
+@cindex @code{cond_len_fms@var{mode}} instruction pattern
+@cindex @code{cond_len_fnma@var{mode}} instruction pattern
+@cindex @code{cond_len_fnms@var{mode}} instruction pattern
+@item @samp{cond_len_fma@var{mode}}
+@itemx @samp{cond_len_fms@var{mode}}
+@itemx @samp{cond_len_fnma@var{mode}}
+@itemx @samp{cond_len_fnms@var{mode}}
+Like @samp{cond_len_add@var{m}}, except that the conditional operation
+takes 3 operands rather than two.  For example, the vector form of
+@samp{cond_len_fma@var{mode}} is equivalent to:
+
+@smallexample
+for (i = 0; i < ops[6] + ops[7]; i++)
+  op0[i] = op1[i] ? fma (op2[i], op3[i], op4[i]) : op5[i];
+@end smallexample
+
@cindex @code{neg@var{mode}cc} instruction pattern
@item @samp{neg@var{mode}cc}
Similar to @samp{mov@var{mode}cc} but for conditional negation.  Conditionally
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 278db7b1805..f9aaf66cf2a 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -183,6 +183,9 @@ init_internal_fns ()
#define cond_unary_direct { 1, 1, true }
#define cond_binary_direct { 1, 1, true }
#define cond_ternary_direct { 1, 1, true }
+#define cond_len_unary_direct { 1, 1, true }
+#define cond_len_binary_direct { 1, 1, true }
+#define cond_len_ternary_direct { 1, 1, true }
#define while_direct { 0, 2, false }
#define fold_extract_direct { 2, 2, false }
#define fold_left_direct { 1, 1, false }
@@ -3869,6 +3872,15 @@ expand_convert_optab_fn (internal_fn fn, gcall *stmt, convert_optab optab,
#define expand_cond_ternary_optab_fn(FN, STMT, OPTAB) \
   expand_direct_optab_fn (FN, STMT, OPTAB, 5)
+#define expand_cond_len_unary_optab_fn(FN, STMT, OPTAB) \
+  expand_direct_optab_fn (FN, STMT, OPTAB, 5)
+
+#define expand_cond_len_binary_optab_fn(FN, STMT, OPTAB) \
+  expand_direct_optab_fn (FN, STMT, OPTAB, 6)
+
+#define expand_cond_len_ternary_optab_fn(FN, STMT, OPTAB) \
+  expand_direct_optab_fn (FN, STMT, OPTAB, 7)
+
#define expand_fold_extract_optab_fn(FN, STMT, OPTAB) \
   expand_direct_optab_fn (FN, STMT, OPTAB, 3)
@@ -3964,6 +3976,9 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
#define direct_cond_unary_optab_supported_p direct_optab_supported_p
#define direct_cond_binary_optab_supported_p direct_optab_supported_p
#define direct_cond_ternary_optab_supported_p direct_optab_supported_p
+#define direct_cond_len_unary_optab_supported_p direct_optab_supported_p
+#define direct_cond_len_binary_optab_supported_p direct_optab_supported_p
+#define direct_cond_len_ternary_optab_supported_p direct_optab_supported_p
#define direct_mask_load_optab_supported_p convert_optab_supported_p
#define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p
#define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 238b7ee0bc9..ea750a921ed 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -72,6 +72,10 @@ along with GCC; see the file COPYING3.  If not see
    - fold_left: for scalar = FN (scalar, vector), keyed off the vector mode
    - check_ptrs: used for check_{raw,war}_ptrs
+   - cond_len_unary: a conditional unary optab, such as cond_len_neg<mode>
+   - cond_len_binary: a conditional binary optab, such as cond_len_add<mode>
+   - cond_len_ternary: a conditional ternary optab, such as cond_len_fma_rev<mode>
+
    DEF_INTERNAL_SIGNED_OPTAB_FN defines an internal function that
    maps to one of two optabs, depending on the signedness of an input.
    SIGNED_OPTAB and UNSIGNED_OPTAB are the optabs for signed and
@@ -248,6 +252,40 @@ DEF_INTERNAL_OPTAB_FN (COND_FNMS, ECF_CONST, cond_fnms, cond_ternary)
DEF_INTERNAL_OPTAB_FN (COND_NEG, ECF_CONST, cond_neg, cond_unary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_ADD, ECF_CONST, cond_len_add, cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_SUB, ECF_CONST, cond_len_sub, cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_MUL, ECF_CONST, cond_len_smul, cond_len_binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_DIV, ECF_CONST, first, cond_len_sdiv,
+       cond_len_udiv, cond_len_binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_MOD, ECF_CONST, first, cond_len_smod,
+       cond_len_umod, cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_RDIV, ECF_CONST, cond_len_sdiv, cond_len_binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_MIN, ECF_CONST, first, cond_len_smin,
+       cond_len_umin, cond_len_binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_MAX, ECF_CONST, first, cond_len_smax,
+       cond_len_umax, cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_FMIN, ECF_CONST, cond_len_fmin, cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_FMAX, ECF_CONST, cond_len_fmax, cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_AND, ECF_CONST | ECF_NOTHROW, cond_len_and,
+        cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_IOR, ECF_CONST | ECF_NOTHROW, cond_len_ior,
+        cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_XOR, ECF_CONST | ECF_NOTHROW, cond_len_xor,
+        cond_len_binary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_SHL, ECF_CONST | ECF_NOTHROW, cond_len_ashl,
+        cond_len_binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_SHR, ECF_CONST | ECF_NOTHROW, first,
+       cond_len_ashr, cond_len_lshr, cond_len_binary)
+
+DEF_INTERNAL_OPTAB_FN (COND_LEN_FMA, ECF_CONST, cond_len_fma, cond_len_ternary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_FMS, ECF_CONST, cond_len_fms, cond_len_ternary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_FNMA, ECF_CONST, cond_len_fnma,
+        cond_len_ternary)
+DEF_INTERNAL_OPTAB_FN (COND_LEN_FNMS, ECF_CONST, cond_len_fnms,
+        cond_len_ternary)
+
+DEF_INTERNAL_OPTAB_FN (COND_LEN_NEG, ECF_CONST, cond_len_neg, cond_len_unary)
+
DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary)
DEF_INTERNAL_OPTAB_FN (REDUC_PLUS, ECF_CONST | ECF_NOTHROW,
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 73c9a0c760f..3dae228fba6 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -254,6 +254,30 @@ OPTAB_D (cond_fms_optab, "cond_fms$a")
OPTAB_D (cond_fnma_optab, "cond_fnma$a")
OPTAB_D (cond_fnms_optab, "cond_fnms$a")
OPTAB_D (cond_neg_optab, "cond_neg$a")
+OPTAB_D (cond_len_add_optab, "cond_len_add$a")
+OPTAB_D (cond_len_sub_optab, "cond_len_sub$a")
+OPTAB_D (cond_len_smul_optab, "cond_len_mul$a")
+OPTAB_D (cond_len_sdiv_optab, "cond_len_div$a")
+OPTAB_D (cond_len_smod_optab, "cond_len_mod$a")
+OPTAB_D (cond_len_udiv_optab, "cond_len_udiv$a")
+OPTAB_D (cond_len_umod_optab, "cond_len_umod$a")
+OPTAB_D (cond_len_and_optab, "cond_len_and$a")
+OPTAB_D (cond_len_ior_optab, "cond_len_ior$a")
+OPTAB_D (cond_len_xor_optab, "cond_len_xor$a")
+OPTAB_D (cond_len_ashl_optab, "cond_len_ashl$a")
+OPTAB_D (cond_len_ashr_optab, "cond_len_ashr$a")
+OPTAB_D (cond_len_lshr_optab, "cond_len_lshr$a")
+OPTAB_D (cond_len_smin_optab, "cond_len_smin$a")
+OPTAB_D (cond_len_smax_optab, "cond_len_smax$a")
+OPTAB_D (cond_len_umin_optab, "cond_len_umin$a")
+OPTAB_D (cond_len_umax_optab, "cond_len_umax$a")
+OPTAB_D (cond_len_fmin_optab, "cond_len_fmin$a")
+OPTAB_D (cond_len_fmax_optab, "cond_len_fmax$a")
+OPTAB_D (cond_len_fma_optab, "cond_len_fma$a")
+OPTAB_D (cond_len_fms_optab, "cond_len_fms$a")
+OPTAB_D (cond_len_fnma_optab, "cond_len_fnma$a")
+OPTAB_D (cond_len_fnms_optab, "cond_len_fnms$a")
+OPTAB_D (cond_len_neg_optab, "cond_len_neg$a")
OPTAB_D (cmov_optab, "cmov$a6")
OPTAB_D (cstore_optab, "cstore$a4")
OPTAB_D (ctrap_optab, "ctrap$a4")
-- 
2.36.1
 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH V2] VECT: Add COND_LEN_* operations for loop control with length targets
  2023-07-10 11:35 [PATCH V2] VECT: Add COND_LEN_* operations for loop control with length targets juzhe.zhong
  2023-07-11  0:59 ` juzhe.zhong
@ 2023-07-11 11:00 ` Richard Biener
  2023-07-11 12:40   ` Li, Pan2
  1 sibling, 1 reply; 4+ messages in thread
From: Richard Biener @ 2023-07-11 11:00 UTC (permalink / raw)
  To: Ju-Zhe Zhong; +Cc: gcc-patches, richard.sandiford

On Mon, 10 Jul 2023, juzhe.zhong@rivai.ai wrote:

> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Hi, Richard and Richi.
> 
> This patch adds cond_len_* operation patterns for targets that support loop control with length.
> 
> These patterns will be used in the following cases:
> 
> 1. Integer division:
>    void
>    f (int32_t *restrict a, int32_t *restrict b, int32_t *restrict c, int n)
>    {
>      for (int i = 0; i < n; ++i)
>       {
>         a[i] = b[i] / c[i];
>       }
>    }
> 
>   ARM SVE IR:
>   
>   ...
>   max_mask_36 = .WHILE_ULT (0, bnd.5_32, { 0, ... });
> 
>   Loop:
>   ...
>   # loop_mask_29 = PHI <next_mask_37(4), max_mask_36(3)>
>   ...
>   vect__4.8_28 = .MASK_LOAD (_33, 32B, loop_mask_29);
>   ...
>   vect__6.11_25 = .MASK_LOAD (_20, 32B, loop_mask_29);
>   vect__8.12_24 = .COND_DIV (loop_mask_29, vect__4.8_28, vect__6.11_25, vect__4.8_28);
>   ...
>   .MASK_STORE (_1, 32B, loop_mask_29, vect__8.12_24);
>   ...
>   next_mask_37 = .WHILE_ULT (_2, bnd.5_32, { 0, ... });
>   ...
>   
>   For targets like RVV that support loop control with length, we want to see IR as follows:
>   
>   Loop:
>   ...
>   # loop_len_29 = SELECT_VL
>   ...
>   vect__4.8_28 = .LEN_MASK_LOAD (_33, 32B, loop_len_29);
>   ...
>   vect__6.11_25 = .LEN_MASK_LOAD (_20, 32B, loop_len_29);
>   vect__8.12_24 = .COND_LEN_DIV (dummp_mask, vect__4.8_28, vect__6.11_25, vect__4.8_28, loop_len_29, bias);
>   ...
>   .LEN_MASK_STORE (_1, 32B, loop_len_29, vect__8.12_24);
>   ...
>   next_mask_37 = .WHILE_ULT (_2, bnd.5_32, { 0, ... });
>   ...
>   
>   Notice here, we use dummp_mask = { -1, -1, .... , -1 }
> 
> 2. Integer conditional division:
>    Similar to case (1), but with a condition:
>    void
>    f (int32_t *restrict a, int32_t *restrict b, int32_t *restrict c, int32_t * cond, int n)
>    {
>      for (int i = 0; i < n; ++i)
>        {
>          if (cond[i])
>          a[i] = b[i] / c[i];
>        }
>    }
>    
>    ARM SVE:
>    ...
>    max_mask_76 = .WHILE_ULT (0, bnd.6_52, { 0, ... });
> 
>    Loop:
>    ...
>    # loop_mask_55 = PHI <next_mask_77(5), max_mask_76(4)>
>    ...
>    vect__4.9_56 = .MASK_LOAD (_51, 32B, loop_mask_55);
>    mask__29.10_58 = vect__4.9_56 != { 0, ... };
>    vec_mask_and_61 = loop_mask_55 & mask__29.10_58;
>    ...
>    vect__6.13_62 = .MASK_LOAD (_24, 32B, vec_mask_and_61);
>    ...
>    vect__8.16_66 = .MASK_LOAD (_1, 32B, vec_mask_and_61);
>    vect__10.17_68 = .COND_DIV (vec_mask_and_61, vect__6.13_62, vect__8.16_66, vect__6.13_62);
>    ...
>    .MASK_STORE (_2, 32B, vec_mask_and_61, vect__10.17_68);
>    ...
>    next_mask_77 = .WHILE_ULT (_3, bnd.6_52, { 0, ... });
>    
>    Here, ARM SVE uses vec_mask_and_61 = loop_mask_55 & mask__29.10_58; to guarantee the correct result.
>    
>    However, targets with length control cannot perform this elegant flow; for RVV, we would expect:
>    
>    Loop:
>    ...
>    loop_len_55 = SELECT_VL
>    ...
>    mask__29.10_58 = vect__4.9_56 != { 0, ... };
>    ...
>    vect__10.17_68 = .COND_LEN_DIV (mask__29.10_58, vect__6.13_62, vect__8.16_66, vect__6.13_62, loop_len_55, bias);
>    ...
> 
>    Here we expect COND_LEN_DIV to be predicated by a real mask, which is the outcome of the comparison: mask__29.10_58 = vect__4.9_56 != { 0, ... };
>    and by a real length, which is produced by the loop control: loop_len_55 = SELECT_VL
>    
> 3. conditional Floating-point operations (no -ffast-math):
>    
>     void
>     f (float *restrict a, float *restrict b, int32_t *restrict cond, int n)
>     {
>       for (int i = 0; i < n; ++i)
>         {
>           if (cond[i])
>           a[i] = b[i] + a[i];
>         }
>     }
>   
>   ARM SVE IR:
>   max_mask_70 = .WHILE_ULT (0, bnd.6_46, { 0, ... });
> 
>   ...
>   # loop_mask_49 = PHI <next_mask_71(4), max_mask_70(3)>
>   ...
>   mask__27.10_52 = vect__4.9_50 != { 0, ... };
>   vec_mask_and_55 = loop_mask_49 & mask__27.10_52;
>   ...
>   vect__9.17_62 = .COND_ADD (vec_mask_and_55, vect__6.13_56, vect__8.16_60, vect__6.13_56);
>   ...
>   next_mask_71 = .WHILE_ULT (_22, bnd.6_46, { 0, ... });
>   ...
>   
>   For RVV, we would expect IR:
>   
>   ...
>   loop_len_49 = SELECT_VL
>   ...
>   mask__27.10_52 = vect__4.9_50 != { 0, ... };
>   ...
>   vect__9.17_62 = .COND_LEN_ADD (mask__27.10_52, vect__6.13_56, vect__8.16_60, vect__6.13_56, loop_len_49, bias);
>   ...
> 
> 4. Conditional un-ordered reduction:
>    
>    int32_t
>    f (int32_t *restrict a, 
>    int32_t *restrict cond, int n)
>    {
>      int32_t result = 0;
>      for (int i = 0; i < n; ++i)
>        {
>            if (cond[i])
>          result += a[i];
>        }
>      return result;
>    }
>    
>    ARM SVE IR:
>      
>      Loop:
>      # vect_result_18.7_37 = PHI <vect__33.16_51(4), { 0, ... }(3)>
>      ...
>      # loop_mask_40 = PHI <next_mask_58(4), max_mask_57(3)>
>      ...
>      mask__17.11_43 = vect__4.10_41 != { 0, ... };
>      vec_mask_and_46 = loop_mask_40 & mask__17.11_43;
>      ...
>      vect__33.16_51 = .COND_ADD (vec_mask_and_46, vect_result_18.7_37, vect__7.14_47, vect_result_18.7_37);
>      ...
>      next_mask_58 = .WHILE_ULT (_15, bnd.6_36, { 0, ... });
>      ...
>    
>      Epilogue:
>      _53 = .REDUC_PLUS (vect__33.16_51); [tail call]
>    
>    For RVV, we expect:
>      
>     Loop:
>      # vect_result_18.7_37 = PHI <vect__33.16_51(4), { 0, ... }(3)>
>      ...
>      loop_len_40 = SELECT_VL
>      ...
>      mask__17.11_43 = vect__4.10_41 != { 0, ... };
>      ...
>      vect__33.16_51 = .COND_LEN_ADD (mask__17.11_43, vect_result_18.7_37, vect__7.14_47, vect_result_18.7_37, loop_len_40, bias);
>      ...
>      next_mask_58 = .WHILE_ULT (_15, bnd.6_36, { 0, ... });
>      ...
>    
>      Epilogue:
>      _53 = .REDUC_PLUS (vect__33.16_51); [tail call]
>      
>      I named these patterns "cond_len_*" because I want the length operand to come after the mask operand, with all operands other than the length operand
>      in the same order as in the "cond_*" patterns. This ordering will make life easier in the subsequent loop vectorizer support.

OK.

Thanks,
Richard.

> gcc/ChangeLog:
> 
>         * doc/md.texi: Add COND_LEN_* operations for loop control with length.
>         * internal-fn.cc (cond_len_unary_direct): Ditto.
>         (cond_len_binary_direct): Ditto.
>         (cond_len_ternary_direct): Ditto.
>         (expand_cond_len_unary_optab_fn): Ditto.
>         (expand_cond_len_binary_optab_fn): Ditto.
>         (expand_cond_len_ternary_optab_fn): Ditto.
>         (direct_cond_len_unary_optab_supported_p): Ditto.
>         (direct_cond_len_binary_optab_supported_p): Ditto.
>         (direct_cond_len_ternary_optab_supported_p): Ditto.
>         * internal-fn.def (COND_LEN_ADD): Ditto.
>         (COND_LEN_SUB): Ditto.
>         (COND_LEN_MUL): Ditto.
>         (COND_LEN_DIV): Ditto.
>         (COND_LEN_MOD): Ditto.
>         (COND_LEN_RDIV): Ditto.
>         (COND_LEN_MIN): Ditto.
>         (COND_LEN_MAX): Ditto.
>         (COND_LEN_FMIN): Ditto.
>         (COND_LEN_FMAX): Ditto.
>         (COND_LEN_AND): Ditto.
>         (COND_LEN_IOR): Ditto.
>         (COND_LEN_XOR): Ditto.
>         (COND_LEN_SHL): Ditto.
>         (COND_LEN_SHR): Ditto.
>         (COND_LEN_FMA): Ditto.
>         (COND_LEN_FMS): Ditto.
>         (COND_LEN_FNMA): Ditto.
>         (COND_LEN_FNMS): Ditto.
>         (COND_LEN_NEG): Ditto.
>         * optabs.def (OPTAB_D): Ditto.
> 
> ---
>  gcc/doc/md.texi     | 80 +++++++++++++++++++++++++++++++++++++++++++++
>  gcc/internal-fn.cc  | 15 +++++++++
>  gcc/internal-fn.def | 38 +++++++++++++++++++++
>  gcc/optabs.def      | 24 ++++++++++++++
>  4 files changed, 157 insertions(+)
> 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index b30a824488b..cbcb992e5d7 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -7234,6 +7234,86 @@ for (i = 0; i < GET_MODE_NUNITS (@var{m}); i++)
>    op0[i] = op1[i] ? fma (op2[i], op3[i], op4[i]) : op5[i];
>  @end smallexample
>  
> +@cindex @code{cond_len_add@var{mode}} instruction pattern
> +@cindex @code{cond_len_sub@var{mode}} instruction pattern
> +@cindex @code{cond_len_mul@var{mode}} instruction pattern
> +@cindex @code{cond_len_div@var{mode}} instruction pattern
> +@cindex @code{cond_len_udiv@var{mode}} instruction pattern
> +@cindex @code{cond_len_mod@var{mode}} instruction pattern
> +@cindex @code{cond_len_umod@var{mode}} instruction pattern
> +@cindex @code{cond_len_and@var{mode}} instruction pattern
> +@cindex @code{cond_len_ior@var{mode}} instruction pattern
> +@cindex @code{cond_len_xor@var{mode}} instruction pattern
> +@cindex @code{cond_len_smin@var{mode}} instruction pattern
> +@cindex @code{cond_len_smax@var{mode}} instruction pattern
> +@cindex @code{cond_len_umin@var{mode}} instruction pattern
> +@cindex @code{cond_len_umax@var{mode}} instruction pattern
> +@cindex @code{cond_len_fmin@var{mode}} instruction pattern
> +@cindex @code{cond_len_fmax@var{mode}} instruction pattern
> +@cindex @code{cond_len_ashl@var{mode}} instruction pattern
> +@cindex @code{cond_len_ashr@var{mode}} instruction pattern
> +@cindex @code{cond_len_lshr@var{mode}} instruction pattern
> +@item @samp{cond_len_add@var{mode}}
> +@itemx @samp{cond_len_sub@var{mode}}
> +@itemx @samp{cond_len_mul@var{mode}}
> +@itemx @samp{cond_len_div@var{mode}}
> +@itemx @samp{cond_len_udiv@var{mode}}
> +@itemx @samp{cond_len_mod@var{mode}}
> +@itemx @samp{cond_len_umod@var{mode}}
> +@itemx @samp{cond_len_and@var{mode}}
> +@itemx @samp{cond_len_ior@var{mode}}
> +@itemx @samp{cond_len_xor@var{mode}}
> +@itemx @samp{cond_len_smin@var{mode}}
> +@itemx @samp{cond_len_smax@var{mode}}
> +@itemx @samp{cond_len_umin@var{mode}}
> +@itemx @samp{cond_len_umax@var{mode}}
> +@itemx @samp{cond_len_fmin@var{mode}}
> +@itemx @samp{cond_len_fmax@var{mode}}
> +@itemx @samp{cond_len_ashl@var{mode}}
> +@itemx @samp{cond_len_ashr@var{mode}}
> +@itemx @samp{cond_len_lshr@var{mode}}
> +When operand 1 is true and element index < operand 5 + operand 6, perform an operation on operands 2 and 3 and
> +store the result in operand 0, otherwise store operand 4 in operand 0.
> +The operation only works when the operands are vectors.
> +
> +@smallexample
> +for (i = 0; i < ops[5] + ops[6]; i++)
> +  op0[i] = op1[i] ? op2[i] @var{op} op3[i] : op4[i];
> +@end smallexample
> +
> +where, for example, @var{op} is @code{+} for @samp{cond_len_add@var{mode}}.
> +
> +When defined for floating-point modes, the contents of @samp{op3[i]}
> +are not interpreted if @samp{op1[i]} is false, just like they would not
> +be in a normal C @samp{?:} condition.
> +
> +Operands 0, 2, 3 and 4 all have mode @var{m}.  Operand 1 is a scalar
> +integer if @var{m} is scalar, otherwise it has the mode returned by
> +@code{TARGET_VECTORIZE_GET_MASK_MODE}.  Operand 5 has whichever
> +integer mode the target prefers.
> +
> +@samp{cond_len_@var{op}@var{mode}} generally corresponds to a conditional
> +form of @samp{@var{op}@var{mode}3}.  As an exception, the vector forms
> +of shifts correspond to patterns like @code{vashl@var{mode}3} rather
> +than patterns like @code{ashl@var{mode}3}.
> +
> +@cindex @code{cond_len_fma@var{mode}} instruction pattern
> +@cindex @code{cond_len_fms@var{mode}} instruction pattern
> +@cindex @code{cond_len_fnma@var{mode}} instruction pattern
> +@cindex @code{cond_len_fnms@var{mode}} instruction pattern
> +@item @samp{cond_len_fma@var{mode}}
> +@itemx @samp{cond_len_fms@var{mode}}
> +@itemx @samp{cond_len_fnma@var{mode}}
> +@itemx @samp{cond_len_fnms@var{mode}}
> +Like @samp{cond_len_add@var{m}}, except that the conditional operation
> +takes 3 operands rather than two.  For example, the vector form of
> +@samp{cond_len_fma@var{mode}} is equivalent to:
> +
> +@smallexample
> +for (i = 0; i < ops[6] + ops[7]; i++)
> +  op0[i] = op1[i] ? fma (op2[i], op3[i], op4[i]) : op5[i];
> +@end smallexample
> +
>  @cindex @code{neg@var{mode}cc} instruction pattern
>  @item @samp{neg@var{mode}cc}
>  Similar to @samp{mov@var{mode}cc} but for conditional negation.  Conditionally
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 278db7b1805..f9aaf66cf2a 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -183,6 +183,9 @@ init_internal_fns ()
>  #define cond_unary_direct { 1, 1, true }
>  #define cond_binary_direct { 1, 1, true }
>  #define cond_ternary_direct { 1, 1, true }
> +#define cond_len_unary_direct { 1, 1, true }
> +#define cond_len_binary_direct { 1, 1, true }
> +#define cond_len_ternary_direct { 1, 1, true }
>  #define while_direct { 0, 2, false }
>  #define fold_extract_direct { 2, 2, false }
>  #define fold_left_direct { 1, 1, false }
> @@ -3869,6 +3872,15 @@ expand_convert_optab_fn (internal_fn fn, gcall *stmt, convert_optab optab,
>  #define expand_cond_ternary_optab_fn(FN, STMT, OPTAB) \
>    expand_direct_optab_fn (FN, STMT, OPTAB, 5)
>  
> +#define expand_cond_len_unary_optab_fn(FN, STMT, OPTAB) \
> +  expand_direct_optab_fn (FN, STMT, OPTAB, 5)
> +
> +#define expand_cond_len_binary_optab_fn(FN, STMT, OPTAB) \
> +  expand_direct_optab_fn (FN, STMT, OPTAB, 6)
> +
> +#define expand_cond_len_ternary_optab_fn(FN, STMT, OPTAB) \
> +  expand_direct_optab_fn (FN, STMT, OPTAB, 7)
> +
>  #define expand_fold_extract_optab_fn(FN, STMT, OPTAB) \
>    expand_direct_optab_fn (FN, STMT, OPTAB, 3)
>  
> @@ -3964,6 +3976,9 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
>  #define direct_cond_unary_optab_supported_p direct_optab_supported_p
>  #define direct_cond_binary_optab_supported_p direct_optab_supported_p
>  #define direct_cond_ternary_optab_supported_p direct_optab_supported_p
> +#define direct_cond_len_unary_optab_supported_p direct_optab_supported_p
> +#define direct_cond_len_binary_optab_supported_p direct_optab_supported_p
> +#define direct_cond_len_ternary_optab_supported_p direct_optab_supported_p
>  #define direct_mask_load_optab_supported_p convert_optab_supported_p
>  #define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p
>  #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 238b7ee0bc9..ea750a921ed 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -72,6 +72,10 @@ along with GCC; see the file COPYING3.  If not see
>     - fold_left: for scalar = FN (scalar, vector), keyed off the vector mode
>     - check_ptrs: used for check_{raw,war}_ptrs
>  
> +   - cond_len_unary: a conditional unary optab, such as cond_len_neg<mode>
> +   - cond_len_binary: a conditional binary optab, such as cond_len_add<mode>
> +   - cond_len_ternary: a conditional ternary optab, such as cond_len_fma_rev<mode>
> +
>     DEF_INTERNAL_SIGNED_OPTAB_FN defines an internal function that
>     maps to one of two optabs, depending on the signedness of an input.
>     SIGNED_OPTAB and UNSIGNED_OPTAB are the optabs for signed and
> @@ -248,6 +252,40 @@ DEF_INTERNAL_OPTAB_FN (COND_FNMS, ECF_CONST, cond_fnms, cond_ternary)
>  
>  DEF_INTERNAL_OPTAB_FN (COND_NEG, ECF_CONST, cond_neg, cond_unary)
>  
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_ADD, ECF_CONST, cond_len_add, cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_SUB, ECF_CONST, cond_len_sub, cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_MUL, ECF_CONST, cond_len_smul, cond_len_binary)
> +DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_DIV, ECF_CONST, first, cond_len_sdiv,
> +			      cond_len_udiv, cond_len_binary)
> +DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_MOD, ECF_CONST, first, cond_len_smod,
> +			      cond_len_umod, cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_RDIV, ECF_CONST, cond_len_sdiv, cond_len_binary)
> +DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_MIN, ECF_CONST, first, cond_len_smin,
> +			      cond_len_umin, cond_len_binary)
> +DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_MAX, ECF_CONST, first, cond_len_smax,
> +			      cond_len_umax, cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_FMIN, ECF_CONST, cond_len_fmin, cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_FMAX, ECF_CONST, cond_len_fmax, cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_AND, ECF_CONST | ECF_NOTHROW, cond_len_and,
> +		       cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_IOR, ECF_CONST | ECF_NOTHROW, cond_len_ior,
> +		       cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_XOR, ECF_CONST | ECF_NOTHROW, cond_len_xor,
> +		       cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_SHL, ECF_CONST | ECF_NOTHROW, cond_len_ashl,
> +		       cond_len_binary)
> +DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_SHR, ECF_CONST | ECF_NOTHROW, first,
> +			      cond_len_ashr, cond_len_lshr, cond_len_binary)
> +
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_FMA, ECF_CONST, cond_len_fma, cond_len_ternary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_FMS, ECF_CONST, cond_len_fms, cond_len_ternary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_FNMA, ECF_CONST, cond_len_fnma,
> +		       cond_len_ternary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_FNMS, ECF_CONST, cond_len_fnms,
> +		       cond_len_ternary)
> +
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_NEG, ECF_CONST, cond_len_neg, cond_len_unary)
> +
>  DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary)
>  
>  DEF_INTERNAL_OPTAB_FN (REDUC_PLUS, ECF_CONST | ECF_NOTHROW,
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 73c9a0c760f..3dae228fba6 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -254,6 +254,30 @@ OPTAB_D (cond_fms_optab, "cond_fms$a")
>  OPTAB_D (cond_fnma_optab, "cond_fnma$a")
>  OPTAB_D (cond_fnms_optab, "cond_fnms$a")
>  OPTAB_D (cond_neg_optab, "cond_neg$a")
> +OPTAB_D (cond_len_add_optab, "cond_len_add$a")
> +OPTAB_D (cond_len_sub_optab, "cond_len_sub$a")
> +OPTAB_D (cond_len_smul_optab, "cond_len_mul$a")
> +OPTAB_D (cond_len_sdiv_optab, "cond_len_div$a")
> +OPTAB_D (cond_len_smod_optab, "cond_len_mod$a")
> +OPTAB_D (cond_len_udiv_optab, "cond_len_udiv$a")
> +OPTAB_D (cond_len_umod_optab, "cond_len_umod$a")
> +OPTAB_D (cond_len_and_optab, "cond_len_and$a")
> +OPTAB_D (cond_len_ior_optab, "cond_len_ior$a")
> +OPTAB_D (cond_len_xor_optab, "cond_len_xor$a")
> +OPTAB_D (cond_len_ashl_optab, "cond_len_ashl$a")
> +OPTAB_D (cond_len_ashr_optab, "cond_len_ashr$a")
> +OPTAB_D (cond_len_lshr_optab, "cond_len_lshr$a")
> +OPTAB_D (cond_len_smin_optab, "cond_len_smin$a")
> +OPTAB_D (cond_len_smax_optab, "cond_len_smax$a")
> +OPTAB_D (cond_len_umin_optab, "cond_len_umin$a")
> +OPTAB_D (cond_len_umax_optab, "cond_len_umax$a")
> +OPTAB_D (cond_len_fmin_optab, "cond_len_fmin$a")
> +OPTAB_D (cond_len_fmax_optab, "cond_len_fmax$a")
> +OPTAB_D (cond_len_fma_optab, "cond_len_fma$a")
> +OPTAB_D (cond_len_fms_optab, "cond_len_fms$a")
> +OPTAB_D (cond_len_fnma_optab, "cond_len_fnma$a")
> +OPTAB_D (cond_len_fnms_optab, "cond_len_fnms$a")
> +OPTAB_D (cond_len_neg_optab, "cond_len_neg$a")
>  OPTAB_D (cmov_optab, "cmov$a6")
>  OPTAB_D (cstore_optab, "cstore$a4")
>  OPTAB_D (ctrap_optab, "ctrap$a4")
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 4+ messages in thread

* RE: [PATCH V2] VECT: Add COND_LEN_* operations for loop control with length targets
  2023-07-11 11:00 ` Richard Biener
@ 2023-07-11 12:40   ` Li, Pan2
  0 siblings, 0 replies; 4+ messages in thread
From: Li, Pan2 @ 2023-07-11 12:40 UTC (permalink / raw)
  To: Richard Biener, Ju-Zhe Zhong; +Cc: gcc-patches, richard.sandiford

Committed, thanks Richard.

Pan

-----Original Message-----
From: Gcc-patches <gcc-patches-bounces+pan2.li=intel.com@gcc.gnu.org> On Behalf Of Richard Biener via Gcc-patches
Sent: Tuesday, July 11, 2023 7:01 PM
To: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
Cc: gcc-patches@gcc.gnu.org; richard.sandiford@arm.com
Subject: Re: [PATCH V2] VECT: Add COND_LEN_* operations for loop control with length targets

On Mon, 10 Jul 2023, juzhe.zhong@rivai.ai wrote:

> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Hi, Richard and Richi.
> 
> This patch adds cond_len_* operation patterns for targets that support loop control with length.
> 
> These patterns will be used in the following cases:
> 
> 1. Integer division:
>    void
>    f (int32_t *restrict a, int32_t *restrict b, int32_t *restrict c, int n)
>    {
>      for (int i = 0; i < n; ++i)
>       {
>         a[i] = b[i] / c[i];
>       }
>    }
> 
>   ARM SVE IR:
>   
>   ...
>   max_mask_36 = .WHILE_ULT (0, bnd.5_32, { 0, ... });
> 
>   Loop:
>   ...
>   # loop_mask_29 = PHI <next_mask_37(4), max_mask_36(3)>
>   ...
>   vect__4.8_28 = .MASK_LOAD (_33, 32B, loop_mask_29);
>   ...
>   vect__6.11_25 = .MASK_LOAD (_20, 32B, loop_mask_29);
>   vect__8.12_24 = .COND_DIV (loop_mask_29, vect__4.8_28, vect__6.11_25, vect__4.8_28);
>   ...
>   .MASK_STORE (_1, 32B, loop_mask_29, vect__8.12_24);
>   ...
>   next_mask_37 = .WHILE_ULT (_2, bnd.5_32, { 0, ... });
>   ...
>   
>   For targets like RVV that support loop control with length, we want to see IR as follows:
>   
>   Loop:
>   ...
>   # loop_len_29 = SELECT_VL
>   ...
>   vect__4.8_28 = .LEN_MASK_LOAD (_33, 32B, loop_len_29);
>   ...
>   vect__6.11_25 = .LEN_MASK_LOAD (_20, 32B, loop_len_29);
>   vect__8.12_24 = .COND_LEN_DIV (dummp_mask, vect__4.8_28, vect__6.11_25, vect__4.8_28, loop_len_29, bias);
>   ...
>   .LEN_MASK_STORE (_1, 32B, loop_len_29, vect__8.12_24);
>   ...
>   next_mask_37 = .WHILE_ULT (_2, bnd.5_32, { 0, ... });
>   ...
>   
>   Notice here, we use dummp_mask = { -1, -1, .... , -1 }
> 
> 2. Integer conditional division:
>    Similar to case (1), but with a condition:
>    void
>    f (int32_t *restrict a, int32_t *restrict b, int32_t *restrict c, int32_t * cond, int n)
>    {
>      for (int i = 0; i < n; ++i)
>        {
>          if (cond[i])
>          a[i] = b[i] / c[i];
>        }
>    }
>    
>    ARM SVE:
>    ...
>    max_mask_76 = .WHILE_ULT (0, bnd.6_52, { 0, ... });
> 
>    Loop:
>    ...
>    # loop_mask_55 = PHI <next_mask_77(5), max_mask_76(4)>
>    ...
>    vect__4.9_56 = .MASK_LOAD (_51, 32B, loop_mask_55);
>    mask__29.10_58 = vect__4.9_56 != { 0, ... };
>    vec_mask_and_61 = loop_mask_55 & mask__29.10_58;
>    ...
>    vect__6.13_62 = .MASK_LOAD (_24, 32B, vec_mask_and_61);
>    ...
>    vect__8.16_66 = .MASK_LOAD (_1, 32B, vec_mask_and_61);
>    vect__10.17_68 = .COND_DIV (vec_mask_and_61, vect__6.13_62, vect__8.16_66, vect__6.13_62);
>    ...
>    .MASK_STORE (_2, 32B, vec_mask_and_61, vect__10.17_68);
>    ...
>    next_mask_77 = .WHILE_ULT (_3, bnd.6_52, { 0, ... });
>    
>    Here, ARM SVE uses vec_mask_and_61 = loop_mask_55 & mask__29.10_58; to guarantee the correct result.
>    
>    However, targets with length control cannot perform this elegant flow; for RVV, we would expect:
>    
>    Loop:
>    ...
>    loop_len_55 = SELECT_VL
>    ...
>    mask__29.10_58 = vect__4.9_56 != { 0, ... };
>    ...
>    vect__10.17_68 = .COND_LEN_DIV (mask__29.10_58, vect__6.13_62, vect__8.16_66, vect__6.13_62, loop_len_55, bias);
>    ...
> 
>    Here we expect COND_LEN_DIV to be predicated by a real mask, which is the outcome of the comparison: mask__29.10_58 = vect__4.9_56 != { 0, ... };
>    and by a real length, which is produced by the loop control: loop_len_55 = SELECT_VL
>    
> 3. conditional Floating-point operations (no -ffast-math):
>    
>     void
>     f (float *restrict a, float *restrict b, int32_t *restrict cond, int n)
>     {
>       for (int i = 0; i < n; ++i)
>         {
>           if (cond[i])
>           a[i] = b[i] + a[i];
>         }
>     }
>   
>   ARM SVE IR:
>   max_mask_70 = .WHILE_ULT (0, bnd.6_46, { 0, ... });
> 
>   ...
>   # loop_mask_49 = PHI <next_mask_71(4), max_mask_70(3)>
>   ...
>   mask__27.10_52 = vect__4.9_50 != { 0, ... };
>   vec_mask_and_55 = loop_mask_49 & mask__27.10_52;
>   ...
>   vect__9.17_62 = .COND_ADD (vec_mask_and_55, vect__6.13_56, vect__8.16_60, vect__6.13_56);
>   ...
>   next_mask_71 = .WHILE_ULT (_22, bnd.6_46, { 0, ... });
>   ...
>   
>   For RVV, we would expect IR:
>   
>   ...
>   loop_len_49 = SELECT_VL
>   ...
>   mask__27.10_52 = vect__4.9_50 != { 0, ... };
>   ...
>   vect__9.17_62 = .COND_LEN_ADD (mask__27.10_52, vect__6.13_56, vect__8.16_60, vect__6.13_56, loop_len_49, bias);
>   ...
> 
> 4. Conditional un-ordered reduction:
>    
>    int32_t
>    f (int32_t *restrict a, 
>    int32_t *restrict cond, int n)
>    {
>      int32_t result = 0;
>      for (int i = 0; i < n; ++i)
>        {
>            if (cond[i])
>          result += a[i];
>        }
>      return result;
>    }
>    
>    ARM SVE IR:
>      
>      Loop:
>      # vect_result_18.7_37 = PHI <vect__33.16_51(4), { 0, ... }(3)>
>      ...
>      # loop_mask_40 = PHI <next_mask_58(4), max_mask_57(3)>
>      ...
>      mask__17.11_43 = vect__4.10_41 != { 0, ... };
>      vec_mask_and_46 = loop_mask_40 & mask__17.11_43;
>      ...
>      vect__33.16_51 = .COND_ADD (vec_mask_and_46, vect_result_18.7_37, vect__7.14_47, vect_result_18.7_37);
>      ...
>      next_mask_58 = .WHILE_ULT (_15, bnd.6_36, { 0, ... });
>      ...
>    
>      Epilogue:
>      _53 = .REDUC_PLUS (vect__33.16_51); [tail call]
>    
>    For RVV, we expect:
>      
>     Loop:
>      # vect_result_18.7_37 = PHI <vect__33.16_51(4), { 0, ... }(3)>
>      ...
>      loop_len_40 = SELECT_VL
>      ...
>      mask__17.11_43 = vect__4.10_41 != { 0, ... };
>      ...
>      vect__33.16_51 = .COND_LEN_ADD (mask__17.11_43, vect_result_18.7_37, vect__7.14_47, vect_result_18.7_37, loop_len_40, bias);
>      ...
>      next_mask_58 = .WHILE_ULT (_15, bnd.6_36, { 0, ... });
>      ...
>    
>      Epilogue:
>      _53 = .REDUC_PLUS (vect__33.16_51); [tail call]
>      
>      I named these patterns "cond_len_*" because I want the length operand to come after the mask operand, with all operands other than the length operand
>      in the same order as in the "cond_*" patterns. This ordering will make life easier in the subsequent loop vectorizer support.

OK.

Thanks,
Richard.

> gcc/ChangeLog:
> 
>         * doc/md.texi: Add COND_LEN_* operations for loop control with length.
>         * internal-fn.cc (cond_len_unary_direct): Ditto.
>         (cond_len_binary_direct): Ditto.
>         (cond_len_ternary_direct): Ditto.
>         (expand_cond_len_unary_optab_fn): Ditto.
>         (expand_cond_len_binary_optab_fn): Ditto.
>         (expand_cond_len_ternary_optab_fn): Ditto.
>         (direct_cond_len_unary_optab_supported_p): Ditto.
>         (direct_cond_len_binary_optab_supported_p): Ditto.
>         (direct_cond_len_ternary_optab_supported_p): Ditto.
>         * internal-fn.def (COND_LEN_ADD): Ditto.
>         (COND_LEN_SUB): Ditto.
>         (COND_LEN_MUL): Ditto.
>         (COND_LEN_DIV): Ditto.
>         (COND_LEN_MOD): Ditto.
>         (COND_LEN_RDIV): Ditto.
>         (COND_LEN_MIN): Ditto.
>         (COND_LEN_MAX): Ditto.
>         (COND_LEN_FMIN): Ditto.
>         (COND_LEN_FMAX): Ditto.
>         (COND_LEN_AND): Ditto.
>         (COND_LEN_IOR): Ditto.
>         (COND_LEN_XOR): Ditto.
>         (COND_LEN_SHL): Ditto.
>         (COND_LEN_SHR): Ditto.
>         (COND_LEN_FMA): Ditto.
>         (COND_LEN_FMS): Ditto.
>         (COND_LEN_FNMA): Ditto.
>         (COND_LEN_FNMS): Ditto.
>         (COND_LEN_NEG): Ditto.
>         * optabs.def (OPTAB_D): Ditto.
> 
> ---
>  gcc/doc/md.texi     | 80 +++++++++++++++++++++++++++++++++++++++++++++
>  gcc/internal-fn.cc  | 15 +++++++++
>  gcc/internal-fn.def | 38 +++++++++++++++++++++
>  gcc/optabs.def      | 24 ++++++++++++++
>  4 files changed, 157 insertions(+)
> 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index b30a824488b..cbcb992e5d7 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -7234,6 +7234,86 @@ for (i = 0; i < GET_MODE_NUNITS (@var{m}); i++)
>    op0[i] = op1[i] ? fma (op2[i], op3[i], op4[i]) : op5[i];
>  @end smallexample
>  
> +@cindex @code{cond_len_add@var{mode}} instruction pattern
> +@cindex @code{cond_len_sub@var{mode}} instruction pattern
> +@cindex @code{cond_len_mul@var{mode}} instruction pattern
> +@cindex @code{cond_len_div@var{mode}} instruction pattern
> +@cindex @code{cond_len_udiv@var{mode}} instruction pattern
> +@cindex @code{cond_len_mod@var{mode}} instruction pattern
> +@cindex @code{cond_len_umod@var{mode}} instruction pattern
> +@cindex @code{cond_len_and@var{mode}} instruction pattern
> +@cindex @code{cond_len_ior@var{mode}} instruction pattern
> +@cindex @code{cond_len_xor@var{mode}} instruction pattern
> +@cindex @code{cond_len_smin@var{mode}} instruction pattern
> +@cindex @code{cond_len_smax@var{mode}} instruction pattern
> +@cindex @code{cond_len_umin@var{mode}} instruction pattern
> +@cindex @code{cond_len_umax@var{mode}} instruction pattern
> +@cindex @code{cond_len_fmin@var{mode}} instruction pattern
> +@cindex @code{cond_len_fmax@var{mode}} instruction pattern
> +@cindex @code{cond_len_ashl@var{mode}} instruction pattern
> +@cindex @code{cond_len_ashr@var{mode}} instruction pattern
> +@cindex @code{cond_len_lshr@var{mode}} instruction pattern
> +@item @samp{cond_len_add@var{mode}}
> +@itemx @samp{cond_len_sub@var{mode}}
> +@itemx @samp{cond_len_mul@var{mode}}
> +@itemx @samp{cond_len_div@var{mode}}
> +@itemx @samp{cond_len_udiv@var{mode}}
> +@itemx @samp{cond_len_mod@var{mode}}
> +@itemx @samp{cond_len_umod@var{mode}}
> +@itemx @samp{cond_len_and@var{mode}}
> +@itemx @samp{cond_len_ior@var{mode}}
> +@itemx @samp{cond_len_xor@var{mode}}
> +@itemx @samp{cond_len_smin@var{mode}}
> +@itemx @samp{cond_len_smax@var{mode}}
> +@itemx @samp{cond_len_umin@var{mode}}
> +@itemx @samp{cond_len_umax@var{mode}}
> +@itemx @samp{cond_len_fmin@var{mode}}
> +@itemx @samp{cond_len_fmax@var{mode}}
> +@itemx @samp{cond_len_ashl@var{mode}}
> +@itemx @samp{cond_len_ashr@var{mode}}
> +@itemx @samp{cond_len_lshr@var{mode}}
> +When operand 1 is true and the element index is less than operand 5 + operand 6,
> +perform an operation on operands 2 and 3 and store the result in operand 0;
> +otherwise store operand 4 in operand 0.  The operation only works when the operands are vectors.
> +
> +@smallexample
> +for (i = 0; i < ops[5] + ops[6]; i++)
> +  op0[i] = op1[i] ? op2[i] @var{op} op3[i] : op4[i];
> +@end smallexample
> +
> +where, for example, @var{op} is @code{+} for @samp{cond_len_add@var{mode}}.
> +
> +When defined for floating-point modes, the contents of @samp{op3[i]}
> +are not interpreted if @samp{op1[i]} is false, just like they would not
> +be in a normal C @samp{?:} condition.
> +
> +Operands 0, 2, 3 and 4 all have mode @var{m}.  Operand 1 is a scalar
> +integer if @var{m} is scalar, otherwise it has the mode returned by
> +@code{TARGET_VECTORIZE_GET_MASK_MODE}.  Operand 5 has whichever
> +integer mode the target prefers.
> +
> +@samp{cond_len_@var{op}@var{mode}} generally corresponds to a conditional
> +form of @samp{@var{op}@var{mode}3}.  As an exception, the vector forms
> +of shifts correspond to patterns like @code{vashl@var{mode}3} rather
> +than patterns like @code{ashl@var{mode}3}.
> +
> +@cindex @code{cond_len_fma@var{mode}} instruction pattern
> +@cindex @code{cond_len_fms@var{mode}} instruction pattern
> +@cindex @code{cond_len_fnma@var{mode}} instruction pattern
> +@cindex @code{cond_len_fnms@var{mode}} instruction pattern
> +@item @samp{cond_len_fma@var{mode}}
> +@itemx @samp{cond_len_fms@var{mode}}
> +@itemx @samp{cond_len_fnma@var{mode}}
> +@itemx @samp{cond_len_fnms@var{mode}}
> +Like @samp{cond_len_add@var{m}}, except that the conditional operation
> +takes 3 operands rather than two.  For example, the vector form of
> +@samp{cond_len_fma@var{mode}} is equivalent to:
> +
> +@smallexample
> +for (i = 0; i < ops[6] + ops[7]; i++)
> +  op0[i] = op1[i] ? fma (op2[i], op3[i], op4[i]) : op5[i];
> +@end smallexample
> +
>  @cindex @code{neg@var{mode}cc} instruction pattern
>  @item @samp{neg@var{mode}cc}
>  Similar to @samp{mov@var{mode}cc} but for conditional negation.  Conditionally
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 278db7b1805..f9aaf66cf2a 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -183,6 +183,9 @@ init_internal_fns ()
>  #define cond_unary_direct { 1, 1, true }
>  #define cond_binary_direct { 1, 1, true }
>  #define cond_ternary_direct { 1, 1, true }
> +#define cond_len_unary_direct { 1, 1, true }
> +#define cond_len_binary_direct { 1, 1, true }
> +#define cond_len_ternary_direct { 1, 1, true }
>  #define while_direct { 0, 2, false }
>  #define fold_extract_direct { 2, 2, false }
>  #define fold_left_direct { 1, 1, false }
> @@ -3869,6 +3872,15 @@ expand_convert_optab_fn (internal_fn fn, gcall *stmt, convert_optab optab,
>  #define expand_cond_ternary_optab_fn(FN, STMT, OPTAB) \
>    expand_direct_optab_fn (FN, STMT, OPTAB, 5)
>  
> +#define expand_cond_len_unary_optab_fn(FN, STMT, OPTAB) \
> +  expand_direct_optab_fn (FN, STMT, OPTAB, 5)
> +
> +#define expand_cond_len_binary_optab_fn(FN, STMT, OPTAB) \
> +  expand_direct_optab_fn (FN, STMT, OPTAB, 6)
> +
> +#define expand_cond_len_ternary_optab_fn(FN, STMT, OPTAB) \
> +  expand_direct_optab_fn (FN, STMT, OPTAB, 7)
> +
>  #define expand_fold_extract_optab_fn(FN, STMT, OPTAB) \
>    expand_direct_optab_fn (FN, STMT, OPTAB, 3)
>  
> @@ -3964,6 +3976,9 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
>  #define direct_cond_unary_optab_supported_p direct_optab_supported_p
>  #define direct_cond_binary_optab_supported_p direct_optab_supported_p
>  #define direct_cond_ternary_optab_supported_p direct_optab_supported_p
> +#define direct_cond_len_unary_optab_supported_p direct_optab_supported_p
> +#define direct_cond_len_binary_optab_supported_p direct_optab_supported_p
> +#define direct_cond_len_ternary_optab_supported_p direct_optab_supported_p
>  #define direct_mask_load_optab_supported_p convert_optab_supported_p
>  #define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p
>  #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 238b7ee0bc9..ea750a921ed 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -72,6 +72,10 @@ along with GCC; see the file COPYING3.  If not see
>     - fold_left: for scalar = FN (scalar, vector), keyed off the vector mode
>     - check_ptrs: used for check_{raw,war}_ptrs
>  
> +   - cond_len_unary: a conditional unary optab, such as cond_len_neg<mode>
> +   - cond_len_binary: a conditional binary optab, such as cond_len_add<mode>
> +   - cond_len_ternary: a conditional ternary optab, such as cond_len_fma<mode>
> +
>     DEF_INTERNAL_SIGNED_OPTAB_FN defines an internal function that
>     maps to one of two optabs, depending on the signedness of an input.
>     SIGNED_OPTAB and UNSIGNED_OPTAB are the optabs for signed and
> @@ -248,6 +252,40 @@ DEF_INTERNAL_OPTAB_FN (COND_FNMS, ECF_CONST, cond_fnms, cond_ternary)
>  
>  DEF_INTERNAL_OPTAB_FN (COND_NEG, ECF_CONST, cond_neg, cond_unary)
>  
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_ADD, ECF_CONST, cond_len_add, cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_SUB, ECF_CONST, cond_len_sub, cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_MUL, ECF_CONST, cond_len_smul, cond_len_binary)
> +DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_DIV, ECF_CONST, first, cond_len_sdiv,
> +			      cond_len_udiv, cond_len_binary)
> +DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_MOD, ECF_CONST, first, cond_len_smod,
> +			      cond_len_umod, cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_RDIV, ECF_CONST, cond_len_sdiv, cond_len_binary)
> +DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_MIN, ECF_CONST, first, cond_len_smin,
> +			      cond_len_umin, cond_len_binary)
> +DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_MAX, ECF_CONST, first, cond_len_smax,
> +			      cond_len_umax, cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_FMIN, ECF_CONST, cond_len_fmin, cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_FMAX, ECF_CONST, cond_len_fmax, cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_AND, ECF_CONST | ECF_NOTHROW, cond_len_and,
> +		       cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_IOR, ECF_CONST | ECF_NOTHROW, cond_len_ior,
> +		       cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_XOR, ECF_CONST | ECF_NOTHROW, cond_len_xor,
> +		       cond_len_binary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_SHL, ECF_CONST | ECF_NOTHROW, cond_len_ashl,
> +		       cond_len_binary)
> +DEF_INTERNAL_SIGNED_OPTAB_FN (COND_LEN_SHR, ECF_CONST | ECF_NOTHROW, first,
> +			      cond_len_ashr, cond_len_lshr, cond_len_binary)
> +
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_FMA, ECF_CONST, cond_len_fma, cond_len_ternary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_FMS, ECF_CONST, cond_len_fms, cond_len_ternary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_FNMA, ECF_CONST, cond_len_fnma,
> +		       cond_len_ternary)
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_FNMS, ECF_CONST, cond_len_fnms,
> +		       cond_len_ternary)
> +
> +DEF_INTERNAL_OPTAB_FN (COND_LEN_NEG, ECF_CONST, cond_len_neg, cond_len_unary)
> +
>  DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary)
>  
>  DEF_INTERNAL_OPTAB_FN (REDUC_PLUS, ECF_CONST | ECF_NOTHROW,
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 73c9a0c760f..3dae228fba6 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -254,6 +254,30 @@ OPTAB_D (cond_fms_optab, "cond_fms$a")
>  OPTAB_D (cond_fnma_optab, "cond_fnma$a")
>  OPTAB_D (cond_fnms_optab, "cond_fnms$a")
>  OPTAB_D (cond_neg_optab, "cond_neg$a")
> +OPTAB_D (cond_len_add_optab, "cond_len_add$a")
> +OPTAB_D (cond_len_sub_optab, "cond_len_sub$a")
> +OPTAB_D (cond_len_smul_optab, "cond_len_mul$a")
> +OPTAB_D (cond_len_sdiv_optab, "cond_len_div$a")
> +OPTAB_D (cond_len_smod_optab, "cond_len_mod$a")
> +OPTAB_D (cond_len_udiv_optab, "cond_len_udiv$a")
> +OPTAB_D (cond_len_umod_optab, "cond_len_umod$a")
> +OPTAB_D (cond_len_and_optab, "cond_len_and$a")
> +OPTAB_D (cond_len_ior_optab, "cond_len_ior$a")
> +OPTAB_D (cond_len_xor_optab, "cond_len_xor$a")
> +OPTAB_D (cond_len_ashl_optab, "cond_len_ashl$a")
> +OPTAB_D (cond_len_ashr_optab, "cond_len_ashr$a")
> +OPTAB_D (cond_len_lshr_optab, "cond_len_lshr$a")
> +OPTAB_D (cond_len_smin_optab, "cond_len_smin$a")
> +OPTAB_D (cond_len_smax_optab, "cond_len_smax$a")
> +OPTAB_D (cond_len_umin_optab, "cond_len_umin$a")
> +OPTAB_D (cond_len_umax_optab, "cond_len_umax$a")
> +OPTAB_D (cond_len_fmin_optab, "cond_len_fmin$a")
> +OPTAB_D (cond_len_fmax_optab, "cond_len_fmax$a")
> +OPTAB_D (cond_len_fma_optab, "cond_len_fma$a")
> +OPTAB_D (cond_len_fms_optab, "cond_len_fms$a")
> +OPTAB_D (cond_len_fnma_optab, "cond_len_fnma$a")
> +OPTAB_D (cond_len_fnms_optab, "cond_len_fnms$a")
> +OPTAB_D (cond_len_neg_optab, "cond_len_neg$a")
>  OPTAB_D (cmov_optab, "cmov$a6")
>  OPTAB_D (cstore_optab, "cstore$a4")
>  OPTAB_D (ctrap_optab, "ctrap$a4")
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-07-11 12:41 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-10 11:35 [PATCH V2] VECT: Add COND_LEN_* operations for loop control with length targets juzhe.zhong
2023-07-11  0:59 ` juzhe.zhong
2023-07-11 11:00 ` Richard Biener
2023-07-11 12:40   ` Li, Pan2

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).