Hi, Robin.

I have fixed all the issues in the V4 patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/623856.html

1. Apply your approach to poly_int.
2. Fix comments.
3. Normalize the "modulo" code and remove the redundancy.
...

Could you take a look at it?

Thanks.


juzhe.zhong@rivai.ai

From: Robin Dapp
Date: 2023-07-07 17:43
To: Juzhe-Zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; palmer; palmer; jeffreyalaw
Subject: Re: [PATCH V3] RISC-V: Support gather_load/scatter RVV auto-vectorization

Hi Juzhe,

thanks, that's quite a chunk :) and it took me a while to go through it.

> @@ -564,7 +565,14 @@ const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
>  static rtx
>  gen_const_vector_dup (machine_mode mode, poly_int64 val)
>  {
> -  rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
> +  scalar_mode smode = GET_MODE_INNER (mode);
> +  rtx c = gen_int_mode (val, smode);
> +  if (!val.is_constant () && GET_MODE_SIZE (smode) > GET_MODE_SIZE (Pmode))
> +    {
> +      rtx dup = gen_reg_rtx (mode);
> +      emit_insn (gen_vec_duplicate (mode, dup, c));
> +      return dup;
> +    }
>    return gen_const_vec_duplicate (mode, c);
>  }

It's a bit weird that the function now also emits an insn.  It's not
similar to the aarch64 variant anymore then, I suppose.  If so, please
remove the comment.

> +
>  /* This function emits a masked instruction.  */
>  void
>  emit_vlmax_masked_mu_insn (unsigned icode, int op_num, rtx *ops)
> @@ -1162,7 +1203,6 @@ expand_const_vector (rtx target, rtx src)
>        }
>        else
>        {
> -         elt = force_reg (elt_mode, elt);
>          rtx ops[] = {tmp, elt};
>          emit_vlmax_insn (code_for_pred_broadcast (mode), RVV_UNOP, ops);
>        }
> @@ -2431,6 +2471,25 @@ expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
>    return false;
>  }

elt_mode is unused after your patch.  Please remove it or we will have
a bootstrap error.

> @@ -2444,42 +2503,47 @@ expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
>       index is in range of [0, nunits - 1].  A single vrgather instructions is
>       enough.  Since we will use vrgatherei16.vv for variable-length vector,
>       it is never out of range and we don't need to modulo the index.  */
> -  if (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, nunits - 1))
> +  if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1))
>      {
>        emit_vlmax_gather_insn (target, op0, sel);
>        return;
>      }
>
>    /* Check if the two values vectors are the same.  */
> -  if (rtx_equal_p (op0, op1) || const_vec_duplicate_p (sel))
> +  if (rtx_equal_p (op0, op1))
>      {
>        /* Note: vec_perm indices are supposed to wrap when they go beyond the
>           size of the two value vectors, i.e. the upper bits of the indices
>           are effectively ignored.  RVV vrgather instead produces 0 for any
>           out-of-range indices, so we need to modulo all the vec_perm indices
>           to ensure they are all in range of [0, nunits - 1].  */
> -      rtx max_sel = gen_const_vector_dup (sel_mode, nunits - 1);
> -      rtx sel_mod = expand_simple_binop (sel_mode, AND, sel, max_sel, NULL, 0,
> -                                         OPTAB_DIRECT);
> -      emit_vlmax_gather_insn (target, op1, sel_mod);
> +      rtx sel_mod = modulo_sel_indices (sel, nunits - 1);
> +      emit_vlmax_gather_insn (target, op0, sel_mod);
>        return;
>      }

While reading it I considered unifying both cases and having
modulo_sel_indices just do nothing when the constant already satisfies
the range requirement.  Would that work?
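Something along these lines is what I had in mind -- just an untested
sketch that reuses the helper names visible in the hunk above, not the
actual implementation from your patch:

/* Sketch only: return SEL unchanged when it is already known to be in
   range, otherwise mask it down to [0, MAX_SEL].  */
static rtx
modulo_sel_indices (rtx sel, poly_int64 max_sel)
{
  machine_mode sel_mode = GET_MODE (sel);
  /* A constant selector that already lies in [0, MAX_SEL] needs no
     masking.  */
  if (CONST_VECTOR_P (sel) && const_vec_all_in_range_p (sel, 0, max_sel))
    return sel;
  rtx max = gen_const_vector_dup (sel_mode, max_sel);
  return expand_simple_binop (sel_mode, AND, sel, max, NULL, 0,
                              OPTAB_DIRECT);
}

Then both the in-range case and the rtx_equal_p case above could call
it unconditionally.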
> -                                        OPTAB_DIRECT);
> +      poly_uint64 value = rtx_to_poly_int64 (elt);
> +      rtx op = op0;
> +      if (maybe_gt (value, nunits - 1))
> +        {
> +          sel = gen_const_vector_dup (sel_mode, value - nunits);
> +          op = op1;
> +        }
> +      emit_vlmax_gather_insn (target, op, sel);
>      }

That's again a "modulo".  Could that also fit in modulo_sel_indices?
Your call in the end, it just feels like we do the same thing kind of
differently in several places but no strict preference here.

> +/* Return true if it is the strided load/store.  */
> +static bool
> +strided_load_store_p (rtx vec_offset, rtx *base, rtx *step)
> +{
> +  if (const_vec_series_p (vec_offset, base, step))
> +    return true;

The vectorizer will never emit this but we wouldn't want a step of 1
here, right?

> +
> +  /* For strided load/store, vectorizer always generates
> +     VEC_SERIES_EXPR for vec_offset.  */
> +  tree expr = REG_EXPR (vec_offset);
> +  if (!expr || TREE_CODE (expr) != SSA_NAME)
> +    return false;
> +
> +  /* Check if it is GIMPLE like: _88 = VEC_SERIES_EXPR <0, _87>;  */
> +  gimple *def_stmt = SSA_NAME_DEF_STMT (expr);
> +  if (!def_stmt || !is_gimple_assign (def_stmt)
> +      || gimple_assign_rhs_code (def_stmt) != VEC_SERIES_EXPR)
> +    return false;

Interesting to query the gimple here.  As long as the vectorizer
doesn't do strided stores separately, I guess we can live with that.

> +  rtx ptr, vec_offset, vec_reg, len, mask;
> +  bool zero_extend_p;
> +  int scale_log2;
> +  if (is_load)
> +    {
> +      vec_reg = ops[0];
> +      ptr = ops[1];
> +      vec_offset = ops[2];
> +      zero_extend_p = INTVAL (ops[3]);
> +      scale_log2 = exact_log2 (INTVAL (ops[4]));
> +      len = ops[5];
> +      mask = ops[7];
> +    }
> +  else
> +    {
> +      vec_reg = ops[4];
> +      ptr = ops[0];
> +      vec_offset = ops[1];
> +      zero_extend_p = INTVAL (ops[2]);
> +      scale_log2 = exact_log2 (INTVAL (ops[3]));
> +      len = ops[5];
> +      mask = ops[7];
> +    }
> +  bool is_dummp_len = poly_int_rtx_p (len, &value) && known_eq (value, nunits);

What's dummp?  dumb?  It looks like it's used for switching between
vlmax/nonvlmax so a different name might be advisable.

> +          emit_insn (gen_extend_insn (extend_step, step, Pmode,
> +                                      GET_MODE (step),
> +                                      zero_extend_p ? true : false));
> +          step = extend_step;
> +        }
> +    }

See below.

> +  rtx mem = validize_mem (gen_rtx_MEM (vec_mode, ptr));
> +  /* Emit vlse.v if it's load. Otherwise, emit vsse.v.  */
> +  if (is_load)
> +    {
> +      insn_code icode = code_for_pred_strided_load (vec_mode);
> +      if (is_dummp_len)
> +        {
> +          rtx load_ops[]
> +            = {vec_reg, mask, RVV_VUNDEF (vec_mode), mem, step};
> +          emit_vlmax_masked_insn (icode, RVV_GATHER_M_OP, load_ops);
> +        }
> +      else
> +        {
> +          rtx load_ops[]
> +            = {vec_reg, mask, RVV_VUNDEF (vec_mode), mem, step};
> +          emit_nonvlmax_masked_insn (icode, RVV_GATHER_M_OP, load_ops, len);
> +        }
> +    }

The ops are similar, better to define them outside of the if/else.  I
would also rather have both in the same emit helper but that was
discussed before.  The following similar patterns all look a bit
"boilerplate-ish".  Tolerable for now I guess.

> +  if (inner_offsize < inner_vsize)
> +    {
> +      /* 7.2. Vector Load/Store Addressing Modes.
> +         If the vector offset elements are narrower than XLEN, they are
> +         zero-extended to XLEN before adding to the ptr effective address. If
> +         the vector offset elements are wider than XLEN, the least-significant
> +         XLEN bits are used in the address calculation. An implementation must
> +         raise an illegal instruction exception if the EEW is not supported for
> +         offset elements.  */
> +      if (!zero_extend_p || (zero_extend_p && scale_log2 != 0))
> +        {
> +          if (zero_extend_p)
> +            inner_idx_mode
> +              = int_mode_for_size (inner_offsize * 2, 0).require ();
> +          else
> +            inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
> +          machine_mode new_idx_mode
> +            = get_vector_mode (inner_idx_mode, nunits).require ();
> +          rtx tmp = gen_reg_rtx (new_idx_mode);
> +          emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
> +                                      zero_extend_p ? true : false));
> +          vec_offset = tmp;
> +          idx_mode = new_idx_mode;
> +        }
> +    }

This one I don't get.  Why do we still need to zero_extend when the
hardware does it for us?  Shouldn't we only sign extend when the
expander says so?  Actually, we should probably even scan-assembler-not
for (v)zext, and additionally scan-assembler for (v)sext in the
respective cases.  Besides, couldn't we do a widening shift when
combining it with scale_log != 0?

> +  if (SUBREG_P (src) && CONST_POLY_INT_P (SUBREG_REG (src)))
> +    {
> +      /* Since VLENB result feteched by csrr is always within Pmode size,
> +         we always set the upper bits of the REG with larger mode than Pmode
> +         as 0.  The code sequence is transformed as follows:
> +
> +         - (set (reg:SI 136)
> +                (subreg:SI (const_poly_int:DI [16,16]) 0))
> +           -> (set (reg:SI 136) (const_poly_int:SI [16,16]))
> +
> +         - (set (reg:SI 136)
> +                (subreg:SI (const_poly_int:DI [16,16]) 4))
> +           -> (set (reg:SI 136) (const_int 0))  */
> +      rtx poly_rtx = SUBREG_REG (src);
> +      poly_int64 value = rtx_to_poly_int64 (poly_rtx);
> +      /* The only possible case is handling CONST_POLY_INT:DI in RV32 system. */
> +      gcc_assert (!TARGET_64BIT && mode == SImode
> +                  && GET_MODE (poly_rtx) == DImode && known_ge (value, 0U));
> +      if (known_eq (SUBREG_BYTE (src), GET_MODE_SIZE (Pmode)))
> +        emit_move_insn (dest, const0_rtx);
> +      else
> +        emit_move_insn (dest, gen_int_mode (value, Pmode));
> +      return true;
> +    }

As discussed separately, I'd rather go with

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index d40d430db1b..6735c7763b5 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -2061,7 +2061,14 @@ riscv_legitimize_poly_move (machine_mode mode, rtx dest, rtx tmp, rtx src)
      (m, n) = base * magn + constant.
      This calculation doesn't need div operation.  */
 
-  emit_move_insn (tmp, gen_int_mode (BYTES_PER_RISCV_VECTOR, mode));
+  if (mode <= Pmode)
+    emit_move_insn (tmp, gen_int_mode (BYTES_PER_RISCV_VECTOR, mode));
+  else
+    {
+      emit_move_insn (gen_highpart (Pmode, tmp), CONST0_RTX (Pmode));
+      emit_move_insn (gen_lowpart (Pmode, tmp),
+                      gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
+    }
 
   if (BYTES_PER_RISCV_VECTOR.is_constant ())
     {
@@ -2168,7 +2175,7 @@ riscv_legitimize_move (machine_mode mode, rtx dest, rtx src)
       return false;
     }
 
-  if (satisfies_constraint_vp (src))
+  if (satisfies_constraint_vp (src) && GET_MODE (src) == Pmode)
     return false;

instead.  This avoids the (surprising) weird subreg being generated at
all and we don't need to ensure, probably redundantly, that the
const_poly value is in range etc.

> +       && (immediate_operand (operands[3], Pmode)
> +           || (CONST_POLY_INT_P (operands[3])
> +               && known_ge (rtx_to_poly_int64 (operands[3]), 0U)
> +               && known_le (rtx_to_poly_int64 (operands[3]), GET_MODE_SIZE (mode)))))
> +    {
> +      rtx tmp = gen_reg_rtx (Pmode);
> +      poly_int64 value = rtx_to_poly_int64 (operands[3]);
> +      emit_move_insn (tmp, gen_int_mode (value, Pmode));
> +      operands[3] = gen_rtx_SIGN_EXTEND (mode, tmp);
> +    }

Maybe add a single-line comment as for the other existing cases.

> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-2.c
> @@ -0,0 +1,82 @@
> +/* { dg-do run { target { riscv_vector } } } */

As discussed separately, I would prefer separate run tests for
zvfh(min) but I guess we can live with them just not being vectorized
for now.

Regards
 Robin