Hi, Robin.

I have fixed all the issues in the V4 patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/623856.html

1. Apply your approach to poly_int.
2. Fix comments.
3. Normalize the "modulo" code and remove the redundancy.
...

Could you take a look at it?

Thanks.


juzhe.zhong@rivai.ai

From: Robin Dapp
Date: 2023-07-07 17:43
To: Juzhe-Zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; palmer; palmer; jeffreyalaw
Subject: Re: [PATCH V3] RISC-V: Support gather_load/scatter RVV auto-vectorization

Hi Juzhe,

thanks, that's quite a chunk :) and it took me a while to go through it.

> @@ -564,7 +565,14 @@ const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
>  static rtx
>  gen_const_vector_dup (machine_mode mode, poly_int64 val)
>  {
> -  rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
> +  scalar_mode smode = GET_MODE_INNER (mode);
> +  rtx c = gen_int_mode (val, smode);
> +  if (!val.is_constant () && GET_MODE_SIZE (smode) > GET_MODE_SIZE (Pmode))
> +    {
> +      rtx dup = gen_reg_rtx (mode);
> +      emit_insn (gen_vec_duplicate (mode, dup, c));
> +      return dup;
> +    }
>    return gen_const_vec_duplicate (mode, c);
>  }

It's a bit weird that the function now also emits an insn.  It's not
similar to the aarch64 variant anymore then, I suppose.  If so, please
remove the comment.

> +
>  /* This function emits a masked instruction.  */
>  void
>  emit_vlmax_masked_mu_insn (unsigned icode, int op_num, rtx *ops)
> @@ -1162,7 +1203,6 @@ expand_const_vector (rtx target, rtx src)
>        }
>        else
>        {
> -         elt = force_reg (elt_mode, elt);
>          rtx ops[] = {tmp, elt};
>          emit_vlmax_insn (code_for_pred_broadcast (mode), RVV_UNOP, ops);
>        }
> @@ -2431,6 +2471,25 @@ expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
>    return false;
>  }

elt_mode is unused after your patch.  Please remove it or we will have
a bootstrap error.

> @@ -2444,42 +2503,47 @@ expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
>       index is in range of [0, nunits - 1].  A single vrgather instructions is
>       enough.  Since we will use vrgatherei16.vv for variable-length vector,
>       it is never out of range and we don't need to modulo the index.  */
> -  if (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, nunits - 1))
> +  if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1))
>      {
>        emit_vlmax_gather_insn (target, op0, sel);
>        return;
>      }
>
>    /* Check if the two values vectors are the same.  */
> -  if (rtx_equal_p (op0, op1) || const_vec_duplicate_p (sel))
> +  if (rtx_equal_p (op0, op1))
>      {
>        /* Note: vec_perm indices are supposed to wrap when they go beyond the
>           size of the two value vectors, i.e. the upper bits of the indices
>           are effectively ignored.  RVV vrgather instead produces 0 for any
>           out-of-range indices, so we need to modulo all the vec_perm indices
>           to ensure they are all in range of [0, nunits - 1].  */
> -      rtx max_sel = gen_const_vector_dup (sel_mode, nunits - 1);
> -      rtx sel_mod = expand_simple_binop (sel_mode, AND, sel, max_sel, NULL, 0,
> -                                         OPTAB_DIRECT);
> -      emit_vlmax_gather_insn (target, op1, sel_mod);
> +      rtx sel_mod = modulo_sel_indices (sel, nunits - 1);
> +      emit_vlmax_gather_insn (target, op0, sel_mod);
>        return;
>      }

While reading it I considered unifying both cases and having
modulo_sel_indices just do nothing when the constant already satisfies
the range requirement.  Would that work?
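Something along these lines is what I had in mind -- just an untested
sketch that reuses the helper names visible in the hunk above, not the
actual implementation from your patch:

/* Sketch only: return SEL unchanged when it is already known to be in
   range, otherwise mask it down to [0, MAX_SEL].  */
static rtx
modulo_sel_indices (rtx sel, poly_int64 max_sel)
{
  machine_mode sel_mode = GET_MODE (sel);
  /* A constant selector that already lies in [0, MAX_SEL] needs no
     masking.  */
  if (CONST_VECTOR_P (sel) && const_vec_all_in_range_p (sel, 0, max_sel))
    return sel;
  rtx max = gen_const_vector_dup (sel_mode, max_sel);
  return expand_simple_binop (sel_mode, AND, sel, max, NULL, 0,
                              OPTAB_DIRECT);
}

Then both the in-range case and the rtx_equal_p case above could call
it unconditionally.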
> -                                        OPTAB_DIRECT);
> +      poly_uint64 value = rtx_to_poly_int64 (elt);
> +      rtx op = op0;
> +      if (maybe_gt (value, nunits - 1))
> +        {
> +          sel = gen_const_vector_dup (sel_mode, value - nunits);
> +          op = op1;
> +        }
> +      emit_vlmax_gather_insn (target, op, sel);
>      }

That's again a "modulo".  Could that also fit in modulo_sel_indices?
Your call in the end, it just feels like we do the same thing kind of
differently in several places but no strict preference here.

> +/* Return true if it is the strided load/store.  */
> +static bool
> +strided_load_store_p (rtx vec_offset, rtx *base, rtx *step)
> +{
> +  if (const_vec_series_p (vec_offset, base, step))
> +    return true;

The vectorizer will never emit this but we wouldn't want a step of 1
here, right?

> +
> +  /* For strided load/store, vectorizer always generates
> +     VEC_SERIES_EXPR for vec_offset.  */
> +  tree expr = REG_EXPR (vec_offset);
> +  if (!expr || TREE_CODE (expr) != SSA_NAME)
> +    return false;
> +
> +  /* Check if it is GIMPLE like: _88 = VEC_SERIES_EXPR <0, _87>;  */
> +  gimple *def_stmt = SSA_NAME_DEF_STMT (expr);
> +  if (!def_stmt || !is_gimple_assign (def_stmt)
> +      || gimple_assign_rhs_code (def_stmt) != VEC_SERIES_EXPR)
> +    return false;

Interesting to query the gimple here.  As long as the vectorizer
doesn't do strided stores separately, I guess we can live with that.

> +  rtx ptr, vec_offset, vec_reg, len, mask;
> +  bool zero_extend_p;
> +  int scale_log2;
> +  if (is_load)
> +    {
> +      vec_reg = ops[0];
> +      ptr = ops[1];
> +      vec_offset = ops[2];
> +      zero_extend_p = INTVAL (ops[3]);
> +      scale_log2 = exact_log2 (INTVAL (ops[4]));
> +      len = ops[5];
> +      mask = ops[7];
> +    }
> +  else
> +    {
> +      vec_reg = ops[4];
> +      ptr = ops[0];
> +      vec_offset = ops[1];
> +      zero_extend_p = INTVAL (ops[2]);
> +      scale_log2 = exact_log2 (INTVAL (ops[3]));
> +      len = ops[5];
> +      mask = ops[7];
> +    }
> +  bool is_dummp_len = poly_int_rtx_p (len, &value) && known_eq (value, nunits);

What's dummp?  dumb?  It looks like it's used for switching between
vlmax/nonvlmax so a different name might be advisable.

> +          emit_insn (gen_extend_insn (extend_step, step, Pmode,
> +                                      GET_MODE (step),
> +                                      zero_extend_p ? true : false));
> +          step = extend_step;
> +        }
> +    }

See below.

> +  rtx mem = validize_mem (gen_rtx_MEM (vec_mode, ptr));
> +  /* Emit vlse.v if it's load. Otherwise, emit vsse.v.  */
> +  if (is_load)
> +    {
> +      insn_code icode = code_for_pred_strided_load (vec_mode);
> +      if (is_dummp_len)
> +        {
> +          rtx load_ops[]
> +            = {vec_reg, mask, RVV_VUNDEF (vec_mode), mem, step};
> +          emit_vlmax_masked_insn (icode, RVV_GATHER_M_OP, load_ops);
> +        }
> +      else
> +        {
> +          rtx load_ops[]
> +            = {vec_reg, mask, RVV_VUNDEF (vec_mode), mem, step};
> +          emit_nonvlmax_masked_insn (icode, RVV_GATHER_M_OP, load_ops, len);
> +        }
> +    }

The ops are similar, better to define them outside of the if/else.  I
would also rather have both in the same emit helper but that was
discussed before.  The following similar patterns all look a bit
"boilerplate-ish".  Tolerable for now I guess.

> +  if (inner_offsize < inner_vsize)
> +    {
> +      /* 7.2. Vector Load/Store Addressing Modes.
> +         If the vector offset elements are narrower than XLEN, they are
> +         zero-extended to XLEN before adding to the ptr effective address. If
> +         the vector offset elements are wider than XLEN, the least-significant
> +         XLEN bits are used in the address calculation. An implementation must
> +         raise an illegal instruction exception if the EEW is not supported for
> +         offset elements.  */
> +      if (!zero_extend_p || (zero_extend_p && scale_log2 != 0))
> +        {
> +          if (zero_extend_p)
> +            inner_idx_mode
> +              = int_mode_for_size (inner_offsize * 2, 0).require ();
> +          else
> +            inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
> +          machine_mode new_idx_mode
> +            = get_vector_mode (inner_idx_mode, nunits).require ();
> +          rtx tmp = gen_reg_rtx (new_idx_mode);
> +          emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
> +                                      zero_extend_p ? true : false));
> +          vec_offset = tmp;
> +          idx_mode = new_idx_mode;
> +        }
> +    }

This one I don't get.  Why do we still need to zero_extend when the
hardware does it for us?  Shouldn't we only sign extend when the
expander says so?  Actually, we should probably even scan-assembler-not
for (v)zext, and additionally scan-assembler for (v)sext in the
respective cases.  Besides, couldn't we do a widening shift when
combining it with scale_log != 0?

> +  if (SUBREG_P (src) && CONST_POLY_INT_P (SUBREG_REG (src)))
> +    {
> +      /* Since VLENB result feteched by csrr is always within Pmode size,
> +         we always set the upper bits of the REG with larger mode than Pmode
> +         as 0.  The code sequence is transformed as follows:
> +
> +         - (set (reg:SI 136)
> +                (subreg:SI (const_poly_int:DI [16,16]) 0))
> +           -> (set (reg:SI 136) (const_poly_int:SI [16,16]))
> +
> +         - (set (reg:SI 136)
> +                (subreg:SI (const_poly_int:DI [16,16]) 4))
> +           -> (set (reg:SI 136) (const_int 0))  */
> +      rtx poly_rtx = SUBREG_REG (src);
> +      poly_int64 value = rtx_to_poly_int64 (poly_rtx);
> +      /* The only possible case is handling CONST_POLY_INT:DI in RV32 system. */
> +      gcc_assert (!TARGET_64BIT && mode == SImode
> +                  && GET_MODE (poly_rtx) == DImode && known_ge (value, 0U));
> +      if (known_eq (SUBREG_BYTE (src), GET_MODE_SIZE (Pmode)))
> +        emit_move_insn (dest, const0_rtx);
> +      else
> +        emit_move_insn (dest, gen_int_mode (value, Pmode));
> +      return true;
> +    }

As discussed separately, I'd rather go with

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index d40d430db1b..6735c7763b5 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -2061,7 +2061,14 @@ riscv_legitimize_poly_move (machine_mode mode, rtx dest, rtx tmp, rtx src)
      (m, n) = base * magn + constant.
      This calculation doesn't need div operation.  */
 
-  emit_move_insn (tmp, gen_int_mode (BYTES_PER_RISCV_VECTOR, mode));
+  if (mode <= Pmode)
+    emit_move_insn (tmp, gen_int_mode (BYTES_PER_RISCV_VECTOR, mode));
+  else
+    {
+      emit_move_insn (gen_highpart (Pmode, tmp), CONST0_RTX (Pmode));
+      emit_move_insn (gen_lowpart (Pmode, tmp),
+                      gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
+    }
 
   if (BYTES_PER_RISCV_VECTOR.is_constant ())
     {
@@ -2168,7 +2175,7 @@ riscv_legitimize_move (machine_mode mode, rtx dest, rtx src)
       return false;
     }
 
-  if (satisfies_constraint_vp (src))
+  if (satisfies_constraint_vp (src) && GET_MODE (src) == Pmode)
     return false;

instead.  This avoids the (surprising) weird subreg being generated at
all and we don't need to ensure, probably redundantly, that the
const_poly value is in range etc.

> +       && (immediate_operand (operands[3], Pmode)
> +           || (CONST_POLY_INT_P (operands[3])
> +               && known_ge (rtx_to_poly_int64 (operands[3]), 0U)
> +               && known_le (rtx_to_poly_int64 (operands[3]), GET_MODE_SIZE (mode)))))
> +    {
> +      rtx tmp = gen_reg_rtx (Pmode);
> +      poly_int64 value = rtx_to_poly_int64 (operands[3]);
> +      emit_move_insn (tmp, gen_int_mode (value, Pmode));
> +      operands[3] = gen_rtx_SIGN_EXTEND (mode, tmp);
> +    }

Maybe add a single-line comment as for the other existing cases.

> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-2.c
> @@ -0,0 +1,82 @@
> +/* { dg-do run { target { riscv_vector } } } */

As discussed separately, I would prefer separate run tests for
zvfh(min) but I guess we can live with them just not being vectorized
for now.

Regards
 Robin