Hi， Richard.
Would you mind take a look at the loop control part again:

static gcond *
vect_set_loop_condition_partial_vectors (class loop *loop,
loop_vec_info loop_vinfo, tree niters,
tree final_iv, bool niters_maybe_zero,
gimple_stmt_iterator loop_cond_gsi)
...
tree loop_len_x = NULL_TREE;
  FOR_EACH_VEC_ELT (*controls, i, rgc)
    if (!rgc->controls.is_empty ())
      {
...

/* Set up all controls for this group.  */
if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
   OPTIMIZE_FOR_SPEED))
 test_ctrl
   = vect_set_loop_controls_by_select_vl (loop, loop_vinfo,
  &preheader_seq, &header_seq,
  rgc, niters, &loop_len_x);
...
      }

static tree
vect_set_loop_controls_by_select_vl (class loop *loop, loop_vec_info loop_vinfo,
    gimple_seq *preheader_seq,
    gimple_seq *header_seq,
    rgroup_controls *rgc, tree niters, tree *x)
{
  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
  /* We are not allowing masked approach in SELECT_VL.  */
  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));

  tree ctrl_type = rgc->type;
  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  /* Calculate the maximum number of item values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop.  */
  tree nitems_total = niters;
  if (nitems_per_iter != 1)
    {
      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
these multiplications don't overflow.  */
      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
  nitems_total, compare_factor);
    }

  /* Convert the comparison value to the IV type (either a no-op or
     a promotion).  */
  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);

  /* Create an induction variable that counts the number of items
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);

  /* Test the decremented IV, which will never underflow 0 since we have
     IFN_SELECT_VL to gurantee that.  */
  tree test_limit = nitems_total;

  /* Provide a definition of each control in the group.  */
  tree ctrl;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
    {
      /* Previous controls will cover BIAS items.  This control covers the
next batch.  */
      poly_uint64 bias = nitems_per_ctrl * i;
      tree bias_tree = build_int_cst (iv_type, bias);

      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
control and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
{
 this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
 this_test_limit, bias_tree);
 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
 this_test_limit, bias_tree);
}

      /* Create decrement IV.  */
      create_iv (this_test_limit, MINUS_EXPR, ctrl, NULL_TREE, loop, &incr_gsi,
insert_after, &index_before_incr, &index_after_incr);

      tree res_len;
      if (rgc->controls.length () != 1)
{
 if (nitems_per_iter == 1)
   {
     /* Generte length = (X - VF*I/N) capped to the range [0, VF/N]. */
     /* step = VF * I / N.  */
     tree step
= build_int_cst (iv_type,
exact_div (vf * i, rgc->controls.length ()));
     /* Make sure (X - VF*I/N) never underflow zero.  */
     tree max = gimple_build (header_seq, MAX_EXPR, iv_type, *x, step);
     res_len
= gimple_build (header_seq, MIN_EXPR, iv_type,
index_before_incr,
build_int_cst (iv_type, vf * nitems_per_iter));
   }
 else
   {
     /* For SLP, we can't allow non-VF number of elements to be
processed in non-final iteration. We force the number of
elements to be processed in each non-final iteration is VF
elements. If we allow non-VF elements processing in non-final
iteration will make SLP too complicated and produce inferior
codegen.

  For example:

   If non-final iteration process VF elements.

     ...
     .LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
     .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
     ...

   If non-final iteration process non-VF elements.

     ...
     .LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
     if (_71 % 2 == 0)
      .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
     else
      .LEN_STORE (vectp_f.8_56, 128B, _72, { 2, 1, 2, 1 }, 0);
     ...

This is the simple case of 2-elements interleaved vector SLP.
We consider other interleave vector, the situation will become
more complicated.  */
     res_len
= gimple_build (header_seq, MIN_EXPR, iv_type,
index_before_incr,
build_int_cst (iv_type, vf * nitems_per_iter));
   }
}
      else
{
 res_len
   = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
   index_before_incr, build_int_cst (iv_type, vf));
}
      gassign *assign = gimple_build_assign (ctrl, res_len);
      gimple_seq_add_stmt (header_seq, assign);
      if (rgc->controls.length () == 1)
*x = ctrl;
    }

  return index_after_incr;
}

Am I understand correctly ? I think I may need to implement VEC_PACK_TRUNC to test your idea.

Thanks


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-04-26 17:06
To: juzhe.zhong\@rivai.ai
CC: gcc-patches; rguenther
Subject: Re: [PATCH] VECT: Add decrement IV iteration loop control by variable amount support
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Thank you so much for pointing out this issue.
>
> After reading your comments carefully, I need to revise "vect_set_loop_controls_by_while_len"  in  loop control like this:
>
> vect_set_loop_controls_by_while_len
> ... 
> tree X = NULL_TREE;
> FOR_EACH_VEC_ELT (rgc->controls, i, ctrl)
> ...
> if (i == 0) {
>   X = gimple_build (WHILE_LEN);
>   gimple_build_assign (ctrl, X);
> } else {
>   // (X - VF*I/N) capped to the range [0, VF/N]
>   tree t = gimple_build (MINUS, X, build_int_cst (VF*I/N));
>   gimple_build_assign (ctrl, t);
> }
> }
> ....
>
> Am I understand your idea correctly ?
 
I think it's more that rgc->controls.length () == 1 is a special case,
rather than i == 0 being a special case.
 
That is, rgc->controls.length () == 1 can use a single WHILE_LEN to
calculate the number of scalars that will be processed by the current
loop iteration.  Let's call it X.  Then all rgroups with
rgc->controls.length () > 1 will be based on X rather than using
WHILE_LEN.  (And they would do that even for the first control in the
group, i.e. for i == 0.)
 
I'm not saying it has to be this way.  It might be that a different
arrangement is better for the later RVV processing.  But there needs
to be something in the gimple-level description, and something in
the optab documentation, that guarantees that whatever code we
generate for these cases works correctly.
 
BTW, very minor thing (I should have raised it earlier), but maybe
something like SELECT_VL would be a better name than WHILE_LEN?
WHILE_ULT means "while (IV) is unsigned less than" and so describes
an operation in terms of its arguments.  But I think WHILE_LEN is
more describing an operation based on its use case.
 
Thanks,
Richard
 
 
>
> So the example you shows in ARM SVE gimple IR, is like this:
>
> _3 = &MEM <vector([2,2]) long int> [(long int *)_2];
>   vect__4.6_15 = .MASK_LOAD (_3, 64B, loop_mask_21); (INT64)
>   _5 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [16B, 16B]];
>   vect__4.7_8 = .MASK_LOAD (_5, 64B, loop_mask_20);(INT64)
>   _7 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [32B, 32B]];
>   vect__4.8_28 = .MASK_LOAD (_7, 64B, loop_mask_19);(INT64)
>   _24 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [48B, 48B]];
>   vect__4.9_30 = .MASK_LOAD (_24, 64B, loop_mask_16); (INT64)
> vect__7.11_31 = VEC_PACK_TRUNC_EXPR <vect__4.6_15, vect__4.7_8>;
>   vect__7.11_32 = VEC_PACK_TRUNC_EXPR <vect__4.8_28, vect__4.9_30>;
>   vect__7.10_33 = VEC_PACK_TRUNC_EXPR <vect__7.11_31, vect__7.11_32>;
> ...
> .MASK_STORE (_13, 16B, loop_mask_36, vect__7.10_33); (INT16)
>
> If it is changed into WHILE_LEN style,  it should be:
>   
>    X = WHILE_LEN;
> _3 = &MEM <vector([2,2]) long int> [(long int *)_2];
>   vect__4.6_15 = .LEN_LOAD (_3, 64B, X - VF*1/N); (INT64)
>   _5 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*1/N)*8 ];
>   vect__4.7_8 = .LEN_LOAD (_5, 64B, X - VF*2/N);(INT64)
>   _7 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*2/N)*8];
>   vect__4.8_28 = .LEN_LOAD (_7, 64B, X - VF*3/N);(INT64)
>   _24 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*3/N)*8];
>   vect__4.9_30 = .LEN_LOAD (_24, 64B, X - VF*4/N); (INT64)
> vect__7.11_31 = VEC_PACK_TRUNC_EXPR <vect__4.6_15, vect__4.7_8>;
>   vect__7.11_32 = VEC_PACK_TRUNC_EXPR <vect__4.8_28, vect__4.9_30>;
>   vect__7.10_33 = VEC_PACK_TRUNC_EXPR <vect__7.11_31, vect__7.11_32>;
> ...
> .LEN_STORE (_13, 16B, X, vect__7.10_33); (INT16)
>
> Is this correct ? 
>
> Thanks.
>
>
> juzhe.zhong@rivai.ai
>  
> From: Richard Sandiford
> Date: 2023-04-26 16:06
> To: juzhe.zhong\@rivai.ai
> CC: gcc-patches; rguenther
> Subject: Re: [PATCH] VECT: Add decrement IV iteration loop control by variable amount support
> "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
>> Thanks Richard so much.
>>
>>>> I don't think that's guaranteed by the proposed definition of WHILE_LEN.
>>>> The first int64_t WHILE_LEN could come up short, and return something
>>>> less than VF/2.
>>
>> I am so sorry that the comments of vect_set_loop_controls_by_while_len
>> is totally misleading and incorrect and I have sent V3 patch to fix that.
>> Actually, I don't use WHILE_LEN in multi-rgroups situation, instead, I use MIN_EXPR
>> to force VF elements for each non-final iteration to make sure result is correct.
>>
>> Yes, I agree with you that WHILE_LEN will produce issues for SLP situation that
>> having multi-rgroups since WHILE_LEN definition is allow target produces non-VF
>> outcome in non-final iteration. 
>  
> Yeah, I'd read that you weren't using WHILE_LEN for SLP.  I was talking
> specifically about non-SLP though (nitems_per_iter == 1).  Consider:
>  
> void f(short *x, long *y) {
>   for (int i = 0; i < 100; ++i)
>     x[i] = y[i];
> }
>  
> compiled at -O3 -fno-vect-cost-model for SVE:
>  
>         whilelo p4.d, wzr, w6
>         whilelo p3.d, wzr, w5
>         whilelo p2.h, wzr, w3
>         whilelo p1.d, wzr, w3
>         whilelo p0.d, wzr, w4
> .L2:
>         ld1d    z2.d, p0/z, [x1, #1, mul vl]
>         ld1d    z0.d, p1/z, [x1]
>         ld1d    z1.d, p3/z, [x1, #2, mul vl]
>         uzp1    z0.s, z0.s, z2.s
>         ld1d    z2.d, p4/z, [x1, #3, mul vl]
>         uzp1    z1.s, z1.s, z2.s
>         uzp1    z0.h, z0.h, z1.h
>         st1h    z0.h, p2, [x0, x2, lsl 1]
>         add     x2, x2, x8
>         whilelo p2.h, w2, w3
>         whilelo p4.d, w2, w6
>         whilelo p3.d, w2, w5
>         whilelo p0.d, w2, w4
>         add     x1, x1, x7
>         whilelo p1.d, w2, w3
>         b.any   .L2
>  
> This is a non-SLP loop.  We have two rgroups: a single-mask/control
> rgroup for the short vector, and a 4-mask/control rgroup for the long
> vector.  And the loop converts the Nth long scalar (selected from 4
> concatenated vectors) to the Nth short scalar (in a single vector).
>  
> It's therefore important that the 4-mask/control rgroup and the
> single-mask/control rgroup treat the same lanes/scalar iterations
> as active and the same lanes/scalar iterations as inactive.
>  
> But if I read the code correctly, the patch would generate 5 WHILE_LENs
> for this case, since nitems_per_iter==1 for all 5 controls.  And I don't
> think the documentation of WHILE_LEN guarantees that that will work
> correctly, given that WHILE_LEN isn't a simple MIN operation.
>  
> It might be that it works correctly on RVV, given the later
> backend-specific processing.  But I'm looking at this as a purely
> gimple thing.  If something guarantees that the above works then
> I think the WHILE_LEN documentation needs to be updated.
>  
> From the current documentation of WHILE_LEN, I think the safe
> approach would be to use WHILE_LEN for a single-control rgroup
> and then "expand" that to larger control rgroups using arithmetic.
> Specifically, if the length computed by the single-control rgroup
> is X, then control I in an N-control rgroup would need to be:
>  
>    (X - VF*I/N) capped to the range [0, VF/N]
>  
> SVE does something similar for:
>  
> void f(short *x, int *y) {
>   for (int i = 0; i < 100; ++i)
>     x[i] = y[i];
> }
>  
> Here we use a single WHILELO and then unpack it, rather than use
> three independent WHILELOs:
>  
>         whilelo p0.h, wzr, w3
> .L2:
>         punpklo p2.h, p0.b
>         punpkhi p1.h, p0.b
>         ld1w    z0.s, p2/z, [x1, x2, lsl 2]
>         ld1w    z1.s, p1/z, [x5, x2, lsl 2]
>         uzp1    z0.h, z0.h, z1.h
>         st1h    z0.h, p0, [x0, x2, lsl 1]
>         add     x2, x2, x4
>         whilelo p0.h, w2, w3
>         b.any   .L2
>  
> This is dreadful code (hence the -fno-vect-cost-model).  And I'm sure
> it's equally bad for RVV.  But the point is that we can generate it,
> and in more exotic cases it might even be worthwhile.
>  
> Thanks,
> Richard
>