More details for Case 2:

+       _72 = MIN_EXPR <ivtmp_70, ...>;
+       _75 = MIN_EXPR <ivtmp_73, ...>;
+       ...
+       .LEN_STORE (vectp_f.8_51, 128B, _75, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+       vectp_f.8_56 = vectp_f.8_51 + 16;
+       .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+       ...
+       _61 = _75 / 2;
+       .LEN_STORE (vectp_d.10_59, 128B, _61, { 3, 3, 3, 3 }, 0);
+       vectp_d.10_63 = vectp_d.10_59 + 16;
+       _64 = _72 / 2;
+       .LEN_STORE (vectp_d.10_63, 128B, _64, { 3, 3, 3, 3 }, 0);

You may be confused by "_61 = _75 / 2;" and "_64 = _72 / 2;".
Well, this is similar to the VIEW_CONVERT_EXPR handling of masks in ARM SVE.
Like ARM SVE:

tree
vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
                    unsigned int nvectors, tree vectype, unsigned int index)
{
  rgroup_controls *rgm = &(*masks)[nvectors - 1];
  tree mask_type = rgm->type;

  /* Populate the rgroup's mask array, if this is the first time we've
     used it.  */
  if (rgm->controls.is_empty ())
    {
      rgm->controls.safe_grow_cleared (nvectors, true);
      for (unsigned int i = 0; i < nvectors; ++i)
        {
          tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
          /* Provide a dummy definition until the real one is available.  */
          SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
          rgm->controls[i] = mask;
        }
    }

  tree mask = rgm->controls[index];
  if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
                TYPE_VECTOR_SUBPARTS (vectype)))
    {
      /* A loop mask for data type X can be reused for data type Y
         if X has N times more elements than Y and if Y's elements
         are N times bigger than X's.  In this case each sequence
         of N elements in the loop mask will be all-zero or all-one.
         We can then view-convert the mask so that each sequence of
         N elements is replaced by a single element.  */
      gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
                              TYPE_VECTOR_SUBPARTS (vectype)));
      gimple_seq seq = NULL;
      mask_type = truth_type_for (vectype);
      mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
      if (seq)
        gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
    }
  return mask;
}

I am doing a similar thing for RVV:

+vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
+                   vec_loop_lens *lens, unsigned int nvectors, tree vectype,
+                   unsigned int index)
 {
   rgroup_controls *rgl = &(*lens)[nvectors - 1];
   bool use_bias_adjusted_len =
     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);

   /* Populate the rgroup's len array, if this is the first time we've
      used it.  */
@@ -10400,6 +10403,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,

   if (use_bias_adjusted_len)
     return rgl->bias_adjusted_ctrl;
+  else if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
+                                           OPTIMIZE_FOR_SPEED))
+    {
+      tree loop_len = rgl->controls[index];
+      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
+      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
+      if (maybe_ne (nunits1, nunits2))
+        {
+          /* A loop len for data type X can be reused for data type Y
+             if X has N times more elements than Y and if Y's elements
+             are N times bigger than X's.  */
+          gcc_assert (multiple_p (nunits1, nunits2));
+          unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
+          gimple_seq seq = NULL;
+          loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
+                                   build_int_cst (iv_type, factor));
+          if (seq)
+            gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+        }
+      return loop_len;
+    }
   else
     return rgl->controls[index];
 }

Thanks.
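To make the reuse rule concrete, here is a tiny standalone C model of the
division above (illustrative only: the function name and the numbers are
made up, this is not GCC code):

#include <assert.h>
#include <stdio.h>

/* Model of reusing a loop length recorded for an N-times-narrower element
   type: the length counts lanes, so a length for 8 x int16 becomes a
   length for 4 x int32 by dividing by the factor N = 2.  */
static unsigned
reuse_loop_len (unsigned len, unsigned nunits_src, unsigned nunits_dst)
{
  assert (nunits_src % nunits_dst == 0);      /* the multiple_p check */
  unsigned factor = nunits_src / nunits_dst;  /* exact_div (...)      */
  return len / factor;                        /* the RDIV_EXPR        */
}

int
main (void)
{
  /* _75 = 6 active int16 lanes -> _61 = 3 active int32 lanes.  */
  printf ("%u\n", reuse_loop_len (6, 8, 4));  /* prints 3 */
  return 0;
}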
juzhe.zhong@rivai.ai

From: juzhe.zhong@rivai.ai
Date: 2023-05-09 21:27
To: richard.sandiford
CC: gcc-patches; rguenther
Subject: Re: Re: [PATCH V4] VECT: Add decrement IV iteration loop control by variable amount support

Thanks, Richard.

>> Could you go into more details about this?  I imagined that for case 3,
>> there would be a single SELECT_VL that decides how many scalar iterations
>> will be handled by the current vector iteration, then we would "expand"
>> the result (using MIN_EXPRs) to the multi-control cases.

For case 2, here is the example:

+   2. Multiple rgroup, the Gimple IR should be:
+
+       # i_23 = PHI
+       # vectp_f.8_51 = PHI
+       # vectp_d.10_59 = PHI
+       # ivtmp_70 = PHI
+       # ivtmp_73 = PHI
+       _72 = MIN_EXPR <ivtmp_70, ...>;
+       _75 = MIN_EXPR <ivtmp_73, ...>;
+       _1 = i_23 * 2;
+       _2 = (long unsigned int) _1;
+       _3 = _2 * 2;
+       _4 = f_15(D) + _3;
+       _5 = _2 + 1;
+       _6 = _5 * 2;
+       _7 = f_15(D) + _6;
+       .LEN_STORE (vectp_f.8_51, 128B, _75, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+       vectp_f.8_56 = vectp_f.8_51 + 16;
+       .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+       _8 = (long unsigned int) i_23;
+       _9 = _8 * 4;
+       _10 = d_18(D) + _9;
+       _61 = _75 / 2;
+       .LEN_STORE (vectp_d.10_59, 128B, _61, { 3, 3, 3, 3 }, 0);
+       vectp_d.10_63 = vectp_d.10_59 + 16;
+       _64 = _72 / 2;
+       .LEN_STORE (vectp_d.10_63, 128B, _64, { 3, 3, 3, 3 }, 0);
+       i_20 = i_23 + 1;
+       vectp_f.8_52 = vectp_f.8_56 + 16;
+       vectp_d.10_60 = vectp_d.10_63 + 16;
+       ivtmp_74 = ivtmp_73 - _75;
+       ivtmp_71 = ivtmp_70 - _72;
+       if (ivtmp_74 != 0)
+         goto ; [83.33%]
+       else
+         goto ; [16.67%]
+
+   Note: We DO NOT use .SELECT_VL in SLP auto-vectorization for multiple
+   rgroups.  Instead, we use MIN_EXPR to guarantee we always use VF as the
+   iteration amount for multiple rgroups.

The analysis of the flow of multiple rgroups:

+       _72 = MIN_EXPR <ivtmp_70, ...>;
+       _75 = MIN_EXPR <ivtmp_73, ...>;
+       ...
+       .LEN_STORE (vectp_f.8_51, 128B, _75, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+       vectp_f.8_56 = vectp_f.8_51 + 16;
+       .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+       ...
+       _61 = _75 / 2;
+       .LEN_STORE (vectp_d.10_59, 128B, _61, { 3, 3, 3, 3 }, 0);
+       vectp_d.10_63 = vectp_d.10_59 + 16;
+       _64 = _72 / 2;
+       .LEN_STORE (vectp_d.10_63, 128B, _64, { 3, 3, 3, 3 }, 0);

Here, if we used SELECT_VL instead of MIN_EXPR: since we define the outcome
of SELECT_VL to be any number in a non-final iteration, it is not easy to
adjust the address pointer IV (vectp_f.8_56 = vectp_f.8_51 + 16;) and the
next length (_61 = _75 / 2;).

For case 3:

+   3. Multiple rgroups for non-SLP auto-vectorization.
+
+       # ivtmp_26 = PHI
+       # ivtmp.35_10 = PHI
+       # ivtmp.36_2 = PHI
+       _28 = MIN_EXPR <ivtmp_26, POLY_INT_CST [8, 8]>;
+       loop_len_15 = MIN_EXPR <_28, POLY_INT_CST [4, 4]>;
+       loop_len_16 = _28 - loop_len_15;
+       _29 = (void *) ivtmp.35_10;
+       _7 = &MEM [(int *)_29];
+       vect__1.25_17 = .LEN_LOAD (_7, 128B, loop_len_15, 0);
+       _33 = _29 + POLY_INT_CST [16, 16];
+       _34 = &MEM [(int *)_33];
+       vect__1.26_19 = .LEN_LOAD (_34, 128B, loop_len_16, 0);
+       vect__2.27_20 = VEC_PACK_TRUNC_EXPR <vect__1.25_17, vect__1.26_19>;
+       _30 = (void *) ivtmp.36_2;
+       _31 = &MEM [(short int *)_30];
+       .LEN_STORE (_31, 128B, _28, vect__2.27_20, 0);
+       ivtmp_27 = ivtmp_26 - _28;
+       ivtmp.35_11 = ivtmp.35_10 + POLY_INT_CST [32, 32];
+       ivtmp.36_8 = ivtmp.36_2 + POLY_INT_CST [16, 16];
+       if (ivtmp_27 != 0)
+         goto ; [83.33%]
+       else
+         goto ; [16.67%]
+
+   The total length: _28 = MIN_EXPR <ivtmp_26, POLY_INT_CST [8, 8]>;
+
+   The length of the first half vector:
+       loop_len_15 = MIN_EXPR <_28, POLY_INT_CST [4, 4]>;
+
+   The length of the second half vector:
+       loop_len_15 = MIN_EXPR <_28, POLY_INT_CST [4, 4]>;
+       loop_len_16 = _28 - loop_len_15;
+
+   1). _28 always <= POLY_INT_CST [8, 8].
+   2). When _28 <= POLY_INT_CST [4, 4], the second half vector is not processed.
+   3). When _28 > POLY_INT_CST [4, 4], the second half vector is processed.

We know that in Case 3 we should deal with 2 vectors:

        vect__2.27_20 = VEC_PACK_TRUNC_EXPR <vect__1.25_17, vect__1.26_19>;

First we use "_28 = MIN_EXPR <ivtmp_26, POLY_INT_CST [8, 8]>;" to generate the
number of elements to be processed for these 2 vectors.

Second, we use "loop_len_15 = MIN_EXPR <_28, POLY_INT_CST [4, 4]>;".
"loop_len_15" is the number of elements to be processed for the first vector.

Then, "loop_len_16 = _28 - loop_len_15;".
"loop_len_16" is the number of elements to be processed for the second vector.

I think "loop_len_15 = MIN_EXPR <_28, POLY_INT_CST [4, 4]>;" is very similar
to unpacklo in ARM SVE, and "loop_len_16 = _28 - loop_len_15;" is very similar
to unpackhi in ARM SVE.
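For intuition, a minimal C model of this split (illustrative only; the
poly-int values are pinned to their minimum, so the total cap is 8 and the
half-vector cap is 4):

#include <stdio.h>

int
main (void)
{
  unsigned vf = 8;                                    /* POLY_INT_CST [8, 8] */
  for (unsigned remain = vf; remain > 0; remain--)
    {
      unsigned total = remain < vf ? remain : vf;     /* _28         */
      unsigned lo = total < vf / 2 ? total : vf / 2;  /* loop_len_15 */
      unsigned hi = total - lo;                       /* loop_len_16 */
      printf ("total %u -> first %u, second %u\n", total, lo, hi);
    }
  return 0;
}

When total <= 4 the second half length is 0 (not processed), matching
points 2) and 3) above.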
>> It's up to you.  If you don't think select_vl is worth it then it would
>> obviously make the vectoriser changes a bit simpler.
>> But making the vectoriser simpler isn't IMO the goal here.  SELECT_VL
>> seems like a perfectly reasonable construct to add to target-independent
>> code.  We just need to work out some of the details.

Ok, I also prefer keeping select_vl.

>> FWIW, I share Kewen's concern about duplicating too much logic between
>> masks, current lengths, and SELECT_VL lengths.  But I haven't looked at
>> the patch yet and so I don't know how easy it would be to avoid that.

I understand the concern.  The current implementation is in the isolated
function "vect_set_loop_controls_by_select_vl", which makes the implementation
easier to review.  Maybe we can first make the whole implementation in
"vect_set_loop_controls_by_select_vl" stable after review, and then try to
incorporate the code of "vect_set_loop_controls_by_select_vl" into
"vect_set_loop_controls_directly".

Thanks.


juzhe.zhong@rivai.ai

From: Richard Sandiford
Date: 2023-05-09 20:59
To: 钟居哲
CC: gcc-patches; rguenther
Subject: Re: [PATCH V4] VECT: Add decrement IV iteration loop control by variable amount support

钟居哲 writes:
> Hi, Richards.  I would like to give more information about this patch so
> that it will be easier for you to review.
>
> Currently, I see 3 situations that we need to handle in case of the loop
> control IV in auto-vectorization:
> 1. Single rgroup loop control (ncopies == 1 && vec_num == 1, so
>    loop_len.length () == 1 or rgc->length () == 1).
> 2. Multiple rgroup for SLP.
> 3. Multiple rgroup for non-SLP, which Richard Sandiford pointed out
>    previously (for example, VEC_PACK_TRUNC).
>
> To talk about this patch, let me talk about the RVV LLVM implementation
> first, which inspired me to send this patch:
> https://reviews.llvm.org/D99750
>
> In the LLVM implementation, they are adding a middle-end IR called
> "get_vector_length" which has totally the same functionality as "select_vl"
> in this patch (I called it "while_len" previously; now I have renamed it
> "select_vl" following Richard's suggestion).
>
> The LLVM implementation only lets "get_vector_length" calculate the number
> of elements in a single rgroup loop.
> For multi rgroup, let's take a look at it:
> https://godbolt.org/z/3GP78efTY
>
> void
> foo1 (short *__restrict f, int *__restrict d, int n)
> {
>   for (int i = 0; i < n; ++i)
>     {
>       f[i * 2 + 0] = 1;
>       f[i * 2 + 1] = 2;
>       d[i] = 3;
>     }
> }
>
> RISC-V Clang:
> foo1:                                   # @foo1
> # %bb.0:
>         blez    a2, .LBB0_8
> # %bb.1:
>         li      a3, 16
>         bgeu    a2, a3, .LBB0_3
> # %bb.2:
>         li      a6, 0
>         j       .LBB0_6
> .LBB0_3:
>         andi    a6, a2, -16
>         lui     a3, 32
>         addiw   a3, a3, 1
>         vsetivli        zero, 8, e32, m2, ta, ma
>         vmv.v.x v8, a3
>         vmv.v.i v10, 3
>         mv      a4, a6
>         mv      a5, a1
>         mv      a3, a0
> .LBB0_4:                                # =>This Inner Loop Header: Depth=1
>         addi    a7, a5, 32
>         addi    t0, a3, 32
>         vsetivli        zero, 16, e16, m2, ta, ma
>         vse16.v v8, (a3)
>         vse16.v v8, (t0)
>         vsetivli        zero, 8, e32, m2, ta, ma
>         vse32.v v10, (a5)
>         vse32.v v10, (a7)
>         addi    a3, a3, 64
>         addi    a4, a4, -16
>         addi    a5, a5, 64
>         bnez    a4, .LBB0_4
> # %bb.5:
>         beq     a6, a2, .LBB0_8
> .LBB0_6:
>         slli    a3, a6, 2
>         add     a0, a0, a3
>         addi    a0, a0, 2
>         add     a1, a1, a3
>         sub     a2, a2, a6
>         li      a3, 1
>         li      a4, 2
>         li      a5, 3
> .LBB0_7:                                # =>This Inner Loop Header: Depth=1
>         sh      a3, -2(a0)
>         sh      a4, 0(a0)
>         sw      a5, 0(a1)
>         addi    a0, a0, 4
>         addi    a2, a2, -1
>         addi    a1, a1, 4
>         bnez    a2, .LBB0_7
> .LBB0_8:
>         ret
>
> ARM GCC:
> foo1:
>         cmp     w2, 0
>         ble     .L1
>         addvl   x4, x0, #1
>         mov     x3, 0
>         cntb    x7
>         cntb    x6, all, mul #2
>         sbfiz   x2, x2, 1, 32
>         ptrue   p0.b, all
>         mov     x5, x2
>         adrp    x8, .LC0
>         uqdech  x5
>         add     x8, x8, :lo12:.LC0
>         whilelo p1.h, xzr, x5
>         ld1rw   z1.s, p0/z, [x8]
>         mov     z0.s, #3
>         whilelo p0.h, xzr, x2
> .L3:
>         st1h    z1.h, p0, [x0, x3, lsl 1]
>         st1h    z1.h, p1, [x4, x3, lsl 1]
>         st1w    z0.s, p1, [x1, #1, mul vl]
>         add     x3, x3, x7
>         whilelo p1.h, x3, x5
>         st1w    z0.s, p0, [x1]
>         add     x1, x1, x6
>         whilelo p0.h, x3, x2
>         b.any   .L3
> .L1:
>         ret
>
> It's very obvious that ARM GCC has much better codegen, since RVV LLVM just
> uses a SIMD style to handle multi-rgroup SLP auto-vectorization.
>
> Well, I totally agree that we should add the length handling in
> auto-vectorization not only for single rgroup but also for multiple rgroups.
> However, when I was trying to implement multiple-rgroup lengths for both SLP
> and non-SLP and testing, it turned out to be hard to use select_vl, since the
> "select_vl" pattern allows a non-VF flexible length (length <= min (remain, VF))
> in any iteration; it consumes many more operations for adjusting the loop
> control IV and the data reference address pointer IV than just using "MIN_EXPR".
>
> So for Case 2 && Case 3, I just use MIN_EXPR directly instead of SELECT_VL,
> after several internal tests.

Could you go into more details about this?  I imagined that for case 3,
there would be a single SELECT_VL that decides how many scalar iterations
will be handled by the current vector iteration, then we would "expand"
the result (using MIN_EXPRs) to the multi-control cases.

In a sense that replicates what the SVE code above is doing.  But for SVE,
it's possible to "optimise" the unpacking of a WHILELO result due to the
lack of implementation-defined behaviour.  So conceptually we have a single
WHILELO that is unpacked one layer to give two masks.  But in reality we
optimise that to two WHILELOs.

No such optimisation is possible with SELECT_VL, and maybe that produces
poor-quality code.  That might be what you mean (haven't had a chance to
look at the patch itself yet, but hope to tomorrow).

For case 2 (max_nscalars_per_iter > 1), I think it would make conceptual
sense to pass max_nscalars_per_iter to SELECT_VL or (alternatively)
multiply the result of the SELECT_VL by max_nscalars_per_iter.
But it's only worth doing one of those two things if it produces reasonable
code for RVV.

> Now, based on these situations, we only have "select_vl" for single-rgroup;
> for multiple-rgroup (both SLP and non-SLP), we just use MIN_EXPR.
>
> Is it more appropriate that we should remove "select_vl" and just use
> MIN_EXPR to force VF elements in each non-final iteration for single
> rgroup as well?

It's up to you.  If you don't think select_vl is worth it then it would
obviously make the vectoriser changes a bit simpler.

But making the vectoriser simpler isn't IMO the goal here.  SELECT_VL
seems like a perfectly reasonable construct to add to target-independent
code.  We just need to work out some of the details.

FWIW, I share Kewen's concern about duplicating too much logic between
masks, current lengths, and SELECT_VL lengths.  But I haven't looked at
the patch yet and so I don't know how easy it would be to avoid that.

Thanks,
Richard

> Like the codegen according to the RVV ISA example (shown as RVV LLVM):
> https://repo.hca.bsc.es/epic/z/oynhzP
>
> ASM:
> vec_add: # @vec_add
>         blez    a3, .LBB0_3
>         li      a4, 0
> .LBB0_2: # %vector.body
>         sub     a5, a3, a4
>         vsetvli a6, a5, e64, m1, ta, mu  ==> change it into a6 = min (a5, VF) && vsetvli zero, a6, e64, m1, ta, mu
>         slli    a7, a4, 3
>         add     a5, a1, a7
>         vle64.v v8, (a5)
>         add     a5, a2, a7
>         vle64.v v9, (a5)
>         vfadd.vv        v8, v8, v9
>         add     a7, a7, a0
>         add     a4, a4, a6
>         vse64.v v8, (a7)
>         bne     a4, a3, .LBB0_2
> .LBB0_3: # %for.cond.cleanup
>         ret
>
> So if we remove "select_vl" and just use MIN_EXPR to force VF elements in
> each non-final iteration, we will end up with something like this:
>
> vec_add: # @vec_add
>         blez    a3, .LBB0_3
>         csrr    VF in bytes, vlenb
> .LBB0_2: # %vector.body
>         sub     a5, a3, VF in elements
>         a6 = min (a5, VF in elements)
>         vsetvli zero, a6, e64, m1, ta, mu
>         add     a5, a1, VF in bytes
>         vle64.v v8, (a5)
>         add     a5, a2, VF in bytes
>         vle64.v v9, (a5)
>         vfadd.vv        v8, v8, v9
>         add     a7, a7, a0
>         vse64.v v8, (a7)
>         bne     a4, a3, .LBB0_2
> .LBB0_3: # %for.cond.cleanup
>         ret
>
> Both kinds of codegen work well for RVV.
> Only the second one cannot have the RVV special optimization (evenly
> distributed workload in the last 2 iterations).
>
> Expecting any suggestions from you.
> Thanks.
>
>
> juzhe.zhong@rivai.ai
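[For intuition, here is a hypothetical C model (not part of the patch) of two
conforming select_vl policies: the plain MIN policy, and the RVV-style even
distribution of the tail over the last two iterations.  Any result vl with
0 < vl <= MIN (remain, vf) conforms for remain > 0.]

#include <stdio.h>

static unsigned
select_vl_min (unsigned remain, unsigned vf)
{
  return remain < vf ? remain : vf;   /* md.texi Case 1 below */
}

static unsigned
select_vl_even (unsigned remain, unsigned vf)
{
  if (remain <= vf)
    return remain;                    /* final iteration        */
  if (remain < 2 * vf)
    return (remain + 1) / 2;          /* split the tail evenly  */
  return vf;                          /* md.texi Case 2 below   */
}

int
main (void)
{
  /* With vf = 8 and 19 scalar iterations, the even policy yields
     vl = 8, 6, 5 instead of the min policy's 8, 8, 3.  */
  for (unsigned remain = 19; remain > 0; remain -= select_vl_even (remain, 8))
    printf ("remain %2u -> vl %u (min policy: %u)\n",
            remain, select_vl_even (remain, 8), select_vl_min (remain, 8));
  return 0;
}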
> From: juzhe.zhong
> Date: 2023-05-04 21:25
> To: gcc-patches
> CC: richard.sandiford; rguenther; Ju-Zhe Zhong
> Subject: [PATCH V4] VECT: Add decrement IV iteration loop control by variable amount support
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> This patch is fixing the V3 patch:
> https://patchwork.sourceware.org/project/gcc/patch/20230407014741.139387-1-juzhe.zhong@rivai.ai/
>
> Fix issues according to Richard Sandiford && Richard Biener.
>
> 1. Rename the WHILE_LEN pattern into SELECT_VL according to Richard Sandiford.
> 2. Support multiple-rgroup for non-SLP auto-vectorization.
>
>    For the vec_pack_trunc pattern (multi-rgroup of non-SLP), we generate the total length:
>
>         _36 = MIN_EXPR <..., POLY_INT_CST [8, 8]>;
>
>    First length (MIN (X, VF/N)):
>         loop_len_15 = MIN_EXPR <_36, POLY_INT_CST [2, 2]>;
>
>    Second length (X - MIN (X, 1 * VF/N)):
>         loop_len_16 = _36 - loop_len_15;
>
>    Third length (X - MIN (X, 2 * VF/N)):
>         _38 = MIN_EXPR <_36, POLY_INT_CST [4, 4]>;
>         loop_len_17 = _36 - _38;
>
>    Fourth length (X - MIN (X, 3 * VF/N)):
>         _39 = MIN_EXPR <_36, POLY_INT_CST [6, 6]>;
>         loop_len_18 = _36 - _39;
>
>    The reason that I use MIN_EXPR instead of SELECT_VL to calculate the total
>    length is that using SELECT_VL to adapt the induction IV consumes more
>    instructions than just using MIN_EXPR.  Also, during testing, I found it
>    hard to adjust the lengths correctly according to SELECT_VL.
>
>    So, in this patch we only use SELECT_VL for single-rgroup with a single
>    length control.
>
> 3. Fix the documentation of select_vl for Richard Biener (remove mode N).
> 4. Fix the comments of vect_set_loop_controls_by_select_vl according to Richard Biener.
> 5. Keep loop_vinfo as the first parameter of "vect_get_loop_len".
> 6. Make the requirement check of get_while_len_data_ref_ptr gated at the caller site.
>
> More comments from Richard Biener:
>>> So it's not actually saturating.  The saturating operation is done by .WHILE_LEN?
> I define the outcome of SELECT_VL (n, vf) (WHILE_LEN) = IN_RANGE (0, min (n, vf)),
> which makes the loop control counter never underflow zero.
>
>>> I see.  I wonder if it makes sense to leave .WHILE_LEN aside for a start,
>>> the above scheme should also work for single rgroups, no?
>>> As said, it _looks_ like you can progress without .WHILE_LEN and using
>>> .WHILE_LEN is a pure optimization?
> Yes, SELECT_VL (WHILE_LEN) is a pure optimization for single-rgroup and allows
> the target to use any length = IN_RANGE (0, min (n, vf)) in each iteration.
>
> Let me know if I missed something for the V3 patch.
> Thanks.
>
> ---
>  gcc/cfgloopmanip.cc            |   2 +-
>  gcc/doc/md.texi                |  34 +++
>  gcc/gimple-loop-interchange.cc |   2 +-
>  gcc/internal-fn.def            |   1 +
>  gcc/optabs.def                 |   1 +
>  gcc/tree-ssa-loop-ivcanon.cc   |   2 +-
>  gcc/tree-ssa-loop-ivopts.cc    |   2 +-
>  gcc/tree-ssa-loop-manip.cc     |  18 +-
>  gcc/tree-ssa-loop-manip.h      |   4 +-
>  gcc/tree-vect-data-refs.cc     |   8 +-
>  gcc/tree-vect-loop-manip.cc    | 374 ++++++++++++++++++++++++++++++++-
>  gcc/tree-vect-loop.cc          |  32 ++-
>  gcc/tree-vect-stmts.cc         |  89 +++++++-
>  gcc/tree-vectorizer.h          |   4 +-
>  14 files changed, 535 insertions(+), 38 deletions(-)
>
> diff --git a/gcc/cfgloopmanip.cc b/gcc/cfgloopmanip.cc
> index 0e3ad8ed742..6e09dcbb0b1 100644
> --- a/gcc/cfgloopmanip.cc
> +++ b/gcc/cfgloopmanip.cc
> @@ -826,7 +826,7 @@ create_empty_loop_on_edge (edge entry_edge,
>      }
>    gsi = gsi_last_bb (loop_header);
> -  create_iv (initial_value, stride, iv, loop, &gsi, false,
> +  create_iv (initial_value, PLUS_EXPR, stride, iv, loop, &gsi, false,
>               iv_before, iv_after);
>    /* Insert loop exit condition.  */
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index cc4a93a8763..99cf0cdbdca 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -4974,6 +4974,40 @@ for (i = 1; i < operand3; i++)
>    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
>  @end smallexample
>
> +@cindex @code{select_vl@var{m}} instruction pattern
> +@item @code{select_vl@var{m}}
> +Set operand 0 to the number of elements to be processed in the current
> +iteration.  Operand 1 is the number of elements remaining to be processed.
> +Operand 2 is the vectorization factor.
> +The value of operand 0 is target dependent and flexible in each iteration.
> +The operation of this pattern can be:
> +
> +@smallexample
> +Case 1:
> +operand0 = MIN (operand1, operand2);
> +operand2 can be const_poly_int or poly_int related to vector mode size.
> +Some targets like RISC-V have a standalone instruction to get MIN (n, MODE SIZE) so
> +that we can reduce a use of a general purpose register.
> +
> +In this case, only the last iteration of the loop is a partial iteration.
> +@end smallexample
> +
> +@smallexample
> +Case 2:
> +if (operand1 <= operand2)
> +  operand0 = operand1;
> +else if (operand1 < 2 * operand2)
> +  operand0 = IN_RANGE (ceil (operand1 / 2), operand2);
> +else
> +  operand0 = operand2;
> +
> +This case will evenly distribute work over the last 2 iterations of a stripmine loop.
> +@end smallexample
> +
> +The output of this pattern is not only used as the IV of the loop control
> +counter, but is also used as the IV of the address calculation with
> +multiply/shift operations.  This allows us to dynamically adjust the number
> +of elements processed in each iteration of the loop.
> +
>  @cindex @code{check_raw_ptrs@var{m}} instruction pattern
>  @item @samp{check_raw_ptrs@var{m}}
>  Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
> diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-interchange.cc
> index 1b77bfd46b2..e5590374e59 100644
> --- a/gcc/gimple-loop-interchange.cc
> +++ b/gcc/gimple-loop-interchange.cc
> @@ -1185,7 +1185,7 @@ tree_loop_interchange::map_inductions_to_loop (loop_cand &src, loop_cand &tgt)
>        tree var_before, var_after;
>        tree base = unshare_expr (iv->init_expr);
>        tree step = unshare_expr (iv->step);
> -      create_iv (base, step, SSA_NAME_VAR (iv->var),
> +      create_iv (base, PLUS_EXPR, step, SSA_NAME_VAR (iv->var),
>                   tgt.m_loop, &incr_pos, false, &var_before, &var_after);
>        bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_before));
>        bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 7fe742c2ae7..6f6fa7d37f9 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
>  DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
>
>  DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> +DEF_INTERNAL_OPTAB_FN (SELECT_VL, ECF_CONST | ECF_NOTHROW, select_vl, binary)
>  DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
>                         check_raw_ptrs, check_ptrs)
>  DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 695f5911b30..b637471b76e 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
>  OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
>  OPTAB_D (len_load_optab, "len_load_$a")
>  OPTAB_D (len_store_optab, "len_store_$a")
> +OPTAB_D (select_vl_optab, "select_vl$a")
> diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> index e41ec73a52a..d4e113c44df 100644
> --- a/gcc/tree-ssa-loop-ivcanon.cc
> +++ b/gcc/tree-ssa-loop-ivcanon.cc
> @@ -113,7 +113,7 @@ create_canonical_iv (class loop *loop, edge exit, tree niter,
>                             niter,
>                             build_int_cst (type, 1));
>    incr_at = gsi_last_bb (in->src);
> -  create_iv (niter,
> +  create_iv (niter, PLUS_EXPR,
>               build_int_cst (type, -1),
>               NULL_TREE, loop,
>               &incr_at, false, var_before, &var);
> diff --git a/gcc/tree-ssa-loop-ivopts.cc b/gcc/tree-ssa-loop-ivopts.cc
> index 78e8cbc75b5..8640fe2d487 100644
> --- a/gcc/tree-ssa-loop-ivopts.cc
> +++ b/gcc/tree-ssa-loop-ivopts.cc
> @@ -7267,7 +7267,7 @@ create_new_iv (struct ivopts_data *data, struct iv_cand *cand)
>
>    base = unshare_expr (cand->iv->base);
>
> -  create_iv (base, unshare_expr (cand->iv->step),
> +  create_iv (base, PLUS_EXPR, unshare_expr (cand->iv->step),
>               cand->var_before, data->current_loop,
>               &incr_pos, after, &cand->var_before, &cand->var_after);
>  }
> diff --git
a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> index 909b705d00d..5abca64379e 100644
> --- a/gcc/tree-ssa-loop-manip.cc
> +++ b/gcc/tree-ssa-loop-manip.cc
> @@ -47,7 +47,9 @@ along with GCC; see the file COPYING3.  If not see
>     so that we can free them all at once.  */
>  static bitmap_obstack loop_renamer_obstack;
>
> -/* Creates an induction variable with value BASE + STEP * iteration in LOOP.
> +/* Creates an induction variable with value BASE (+/-) STEP * iteration in LOOP.
> +   If CODE is PLUS_EXPR, the induction variable is BASE + STEP * iteration.
> +   If CODE is MINUS_EXPR, the induction variable is BASE - STEP * iteration.
>     It is expected that neither BASE nor STEP are shared with other expressions
>     (unless the sharing rules allow this).  Use VAR as a base var_decl for it
>     (if NULL, a new temporary will be created).  The increment will occur at
> @@ -57,8 +59,8 @@ static bitmap_obstack loop_renamer_obstack;
>     VAR_AFTER (unless they are NULL).  */
>
>  void
> -create_iv (tree base, tree step, tree var, class loop *loop,
> -           gimple_stmt_iterator *incr_pos, bool after,
> +create_iv (tree base, tree_code code, tree step, tree var,
> +           class loop *loop, gimple_stmt_iterator *incr_pos, bool after,
>             tree *var_before, tree *var_after)
>  {
>    gassign *stmt;
> @@ -66,7 +68,9 @@ create_iv (tree base, tree step, tree var, class loop *loop,
>    tree initial, step1;
>    gimple_seq stmts;
>    tree vb, va;
> -  enum tree_code incr_op = PLUS_EXPR;
> +  /* The code can only be PLUS_EXPR or MINUS_EXPR.  */
> +  gcc_assert (code == PLUS_EXPR || code == MINUS_EXPR);
> +  tree_code incr_op = code;
>    edge pe = loop_preheader_edge (loop);
>
>    if (var != NULL_TREE)
> @@ -1365,7 +1369,7 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
>    tree ctr_before, ctr_after;
>    gimple_stmt_iterator bsi = gsi_last_nondebug_bb (new_exit->src);
>    exit_if = as_a <gcond *> (gsi_stmt (bsi));
> -  create_iv (exit_base, exit_step, NULL_TREE, loop,
> +  create_iv (exit_base, PLUS_EXPR, exit_step, NULL_TREE, loop,
>               &bsi, false, &ctr_before, &ctr_after);
>    gimple_cond_set_code (exit_if, exit_cmp);
>    gimple_cond_set_lhs (exit_if, ctr_after);
> @@ -1580,8 +1584,8 @@ canonicalize_loop_ivs (class loop *loop, tree *nit, bool bump_in_latch)
>      gsi = gsi_last_bb (loop->latch);
>    else
>      gsi = gsi_last_nondebug_bb (loop->header);
> -  create_iv (build_int_cst_type (type, 0), build_int_cst (type, 1), NULL_TREE,
> -             loop, &gsi, bump_in_latch, &var_before, NULL);
> +  create_iv (build_int_cst_type (type, 0), PLUS_EXPR, build_int_cst (type, 1),
> +             NULL_TREE, loop, &gsi, bump_in_latch, &var_before, NULL);
>
>    rewrite_all_phi_nodes_with_iv (loop, var_before);
> diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> index d49273a3987..bda09f51d56 100644
> --- a/gcc/tree-ssa-loop-manip.h
> +++ b/gcc/tree-ssa-loop-manip.h
> @@ -22,8 +22,8 @@ along with GCC; see the file COPYING3.  If not see
>
>  typedef void (*transform_callback)(class loop *, void *);
>
> -extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
> -                       bool, tree *, tree *);
> +extern void create_iv (tree, tree_code, tree, tree, class loop *,
> +                       gimple_stmt_iterator *, bool, tree *, tree *);
>  extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
>  extern void verify_loop_closed_ssa (bool, class loop * = NULL);
>
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index 6721ab6efc4..5c9103b16e5 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -5099,7 +5099,7 @@ vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
>
>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
>
> -  create_iv (aggr_ptr_init,
> +  create_iv (aggr_ptr_init, PLUS_EXPR,
>               fold_convert (aggr_ptr_type, iv_step),
>               aggr_ptr, loop, &incr_gsi, insert_after,
>               &indx_before_incr, &indx_after_incr);
> @@ -5129,9 +5129,9 @@ vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
>      {
>        standard_iv_increment_position (containing_loop, &incr_gsi,
>                                        &insert_after);
> -      create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
> -                 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
> -                 &indx_after_incr);
> +      create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
> +                 aggr_ptr, containing_loop, &incr_gsi, insert_after,
> +                 &indx_before_incr, &indx_after_incr);
>        incr = gsi_stmt (incr_gsi);
>
>        /* Copy the points-to information if it exists.  */
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index 44bd5f2c805..d63ded5d4f0 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -385,6 +385,48 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
>    return false;
>  }
>
> +/* Try to use permutes to define the lens in DEST_RGM using the lens
> +   in SRC_RGM, given that the former has twice as many lens as the
> +   latter.  Return true on success, adding any new statements to SEQ.  */
> +
> +static bool
> +vect_maybe_permute_loop_lens (tree iv_type, gimple_seq *seq,
> +                              rgroup_controls *dest_rgm,
> +                              rgroup_controls *src_rgm)
> +{
> +  tree ctrl_type = dest_rgm->type;
> +  poly_uint64 nitems_per_ctrl
> +    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
> +  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter)
> +    {
> +      for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
> +        {
> +          tree src = src_rgm->controls[i / dest_rgm->controls.length ()];
> +          tree dest = dest_rgm->controls[i];
> +          gassign *stmt;
> +          if (i == 0)
> +            {
> +              /* MIN (X, VF*I/N) capped to the range [0, VF/N].  */
> +              tree factor = build_int_cst (iv_type, nitems_per_ctrl);
> +              stmt = gimple_build_assign (dest, MIN_EXPR, src, factor);
> +              gimple_seq_add_stmt (seq, stmt);
> +            }
> +          else
> +            {
> +              /* (X - MIN (X, VF*I/N)) capped to the range [0, VF/N].  */
> +              tree factor = build_int_cst (iv_type, nitems_per_ctrl * i);
> +              tree temp = make_ssa_name (iv_type);
> +              stmt = gimple_build_assign (temp, MIN_EXPR, src, factor);
> +              gimple_seq_add_stmt (seq, stmt);
> +              stmt = gimple_build_assign (dest, MINUS_EXPR, src, temp);
> +              gimple_seq_add_stmt (seq, stmt);
> +            }
> +        }
> +      return true;
> +    }
> +  return false;
> +}
> +
>  /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
>     for all the rgroup controls in RGC and return a control that is nonzero
>     when the loop needs to iterate.  Add any new preheader statements to
>     PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
> @@ -468,8 +510,9 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    gimple_stmt_iterator incr_gsi;
>    bool insert_after;
>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> -  create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
> -             &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
> +  create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
> +             loop, &incr_gsi, insert_after, &index_before_incr,
> +             &index_after_incr);
>
>    tree zero_index = build_int_cst (compare_type, 0);
>    tree test_index, test_limit, first_limit;
> @@ -682,6 +725,300 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    return next_ctrl;
>  }
>
> +/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
> +   for all the rgroup controls in RGC and return a control that is nonzero
> +   when the loop needs to iterate.  Add any new preheader statements to
> +   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
> +
> +   RGC belongs to loop LOOP.  The loop originally iterated NITERS
> +   times and has been vectorized according to LOOP_VINFO.
> +
> +   Unlike vect_set_loop_controls_directly, which iterates a 0-based IV
> +   up to TEST_LIMIT - bias, in vect_set_loop_controls_by_select_vl we
> +   start the IV at TEST_LIMIT - bias and keep subtracting from it the
> +   length calculated by the IFN_SELECT_VL pattern.
> +
> +   1. Single rgroup, the Gimple IR should be:
> +
> +       # vectp_B.6_8 = PHI
> +       # vectp_B.8_16 = PHI
> +       # vectp_A.11_19 = PHI
> +       # vectp_A.13_22 = PHI
> +       # ivtmp_26 = PHI
> +       _28 = .SELECT_VL (ivtmp_26, POLY_INT_CST [4, 4]);
> +       ivtmp_15 = _28 * 4;
> +       vect__1.10_18 = .LEN_LOAD (vectp_B.8_16, 128B, _28, 0);
> +       _1 = B[i_10];
> +       .LEN_STORE (vectp_A.13_22, 128B, _28, vect__1.10_18, 0);
> +       i_7 = i_10 + 1;
> +       vectp_B.8_17 = vectp_B.8_16 + ivtmp_15;
> +       vectp_A.13_23 = vectp_A.13_22 + ivtmp_15;
> +       ivtmp_27 = ivtmp_26 - _28;
> +       if (ivtmp_27 != 0)
> +         goto ; [83.33%]
> +       else
> +         goto ; [16.67%]
> +
> +   Note: We use the outcome of .SELECT_VL to adjust both the loop control
> +   IV and the data reference pointer IV.
> +
> +   1). The result of .SELECT_VL:
> +       _28 = .SELECT_VL (ivtmp_26, POLY_INT_CST [4, 4]);
> +   _28 is not necessarily VF in every iteration; instead, we allow _28
> +   to be any value as long as _28 <= VF.  Such a flexible SELECT_VL
> +   pattern allows the target various optimizations across vector loop
> +   iterations.  A target like RISC-V has a special application vector
> +   length calculation instruction which will distribute the workload
> +   evenly over the last 2 iterations.
> +
> +   Another example is that we could even allow _28 <= VF / 2 so that
> +   some machine can run vector code in low power mode.
> +
> +   2). Loop control IV:
> +       ivtmp_27 = ivtmp_26 - _28;
> +       if (ivtmp_27 != 0)
> +         goto ; [83.33%]
> +       else
> +         goto ; [16.67%]
> +
> +   This is the saturating subtraction towards zero; the outcome of
> +   .SELECT_VL will make ivtmp_27 never underflow zero.
> +
> +   3). Data reference pointer IV:
> +       ivtmp_15 = _28 * 4;
> +       vectp_B.8_17 = vectp_B.8_16 + ivtmp_15;
> +       vectp_A.13_23 = vectp_A.13_22 + ivtmp_15;
> +
> +   The pointer IV is adjusted accurately according to the .SELECT_VL.
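[As an aside: a hypothetical scalar C model of the single-rgroup flow above,
illustrative only and not part of the patch.  The .SELECT_VL result drives
both the loop-control IV and the data-pointer IVs.]

#include <stdio.h>

static unsigned
select_vl (unsigned remain, unsigned vf)
{
  return remain < vf ? remain : vf;   /* any value <= MIN conforms */
}

static void
copy (int *a, const int *b, unsigned n)
{
  unsigned vf = 4;                    /* stands in for POLY_INT_CST [4, 4] */
  for (unsigned remain = n; remain > 0; )
    {
      unsigned vl = select_vl (remain, vf);  /* _28 = .SELECT_VL (...)      */
      for (unsigned i = 0; i < vl; i++)      /* .LEN_LOAD / .LEN_STORE      */
        a[i] = b[i];
      a += vl;                               /* vectp_A... += _28 * 4 bytes */
      b += vl;
      remain -= vl;                          /* ivtmp_27 = ivtmp_26 - _28   */
    }
}

int
main (void)
{
  int src[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, dst[10] = { 0 };
  copy (dst, src, 10);
  printf ("%d %d\n", dst[0], dst[9]);  /* prints: 0 9 */
  return 0;
}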
> +
> +   2. Multiple rgroup, the Gimple IR should be:
> +
> +       # i_23 = PHI
> +       # vectp_f.8_51 = PHI
> +       # vectp_d.10_59 = PHI
> +       # ivtmp_70 = PHI
> +       # ivtmp_73 = PHI
> +       _72 = MIN_EXPR <ivtmp_70, ...>;
> +       _75 = MIN_EXPR <ivtmp_73, ...>;
> +       _1 = i_23 * 2;
> +       _2 = (long unsigned int) _1;
> +       _3 = _2 * 2;
> +       _4 = f_15(D) + _3;
> +       _5 = _2 + 1;
> +       _6 = _5 * 2;
> +       _7 = f_15(D) + _6;
> +       .LEN_STORE (vectp_f.8_51, 128B, _75, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
> +       vectp_f.8_56 = vectp_f.8_51 + 16;
> +       .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
> +       _8 = (long unsigned int) i_23;
> +       _9 = _8 * 4;
> +       _10 = d_18(D) + _9;
> +       _61 = _75 / 2;
> +       .LEN_STORE (vectp_d.10_59, 128B, _61, { 3, 3, 3, 3 }, 0);
> +       vectp_d.10_63 = vectp_d.10_59 + 16;
> +       _64 = _72 / 2;
> +       .LEN_STORE (vectp_d.10_63, 128B, _64, { 3, 3, 3, 3 }, 0);
> +       i_20 = i_23 + 1;
> +       vectp_f.8_52 = vectp_f.8_56 + 16;
> +       vectp_d.10_60 = vectp_d.10_63 + 16;
> +       ivtmp_74 = ivtmp_73 - _75;
> +       ivtmp_71 = ivtmp_70 - _72;
> +       if (ivtmp_74 != 0)
> +         goto ; [83.33%]
> +       else
> +         goto ; [16.67%]
> +
> +   Note: We DO NOT use .SELECT_VL in SLP auto-vectorization for multiple
> +   rgroups.  Instead, we use MIN_EXPR to guarantee we always use VF as the
> +   iteration amount for multiple rgroups.
> +
> +   The analysis of the flow of multiple rgroups:
> +       _72 = MIN_EXPR <ivtmp_70, ...>;
> +       _75 = MIN_EXPR <ivtmp_73, ...>;
> +       ...
> +       .LEN_STORE (vectp_f.8_51, 128B, _75, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
> +       vectp_f.8_56 = vectp_f.8_51 + 16;
> +       .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
> +       ...
> +       _61 = _75 / 2;
> +       .LEN_STORE (vectp_d.10_59, 128B, _61, { 3, 3, 3, 3 }, 0);
> +       vectp_d.10_63 = vectp_d.10_59 + 16;
> +       _64 = _72 / 2;
> +       .LEN_STORE (vectp_d.10_63, 128B, _64, { 3, 3, 3, 3 }, 0);
> +
> +   We use "_72 = MIN_EXPR <ivtmp_70, ...>;" to generate the number of
> +   elements to be processed in each iteration.
> +
> +   The related STOREs:
> +       _72 = MIN_EXPR <ivtmp_70, ...>;
> +       .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
> +       _64 = _72 / 2;
> +       .LEN_STORE (vectp_d.10_63, 128B, _64, { 3, 3, 3, 3 }, 0);
> +   These 2 STOREs store 2 vectors, and the second vector has half as many
> +   elements as the first vector.  So the length of the second STORE will
> +   be _64 = _72 / 2;.  It's similar to the VIEW_CONVERT handling of masks
> +   in SLP.
> +
> +   3. Multiple rgroups for non-SLP auto-vectorization.
> +
> +       # ivtmp_26 = PHI
> +       # ivtmp.35_10 = PHI
> +       # ivtmp.36_2 = PHI
> +       _28 = MIN_EXPR <ivtmp_26, POLY_INT_CST [8, 8]>;
> +       loop_len_15 = MIN_EXPR <_28, POLY_INT_CST [4, 4]>;
> +       loop_len_16 = _28 - loop_len_15;
> +       _29 = (void *) ivtmp.35_10;
> +       _7 = &MEM [(int *)_29];
> +       vect__1.25_17 = .LEN_LOAD (_7, 128B, loop_len_15, 0);
> +       _33 = _29 + POLY_INT_CST [16, 16];
> +       _34 = &MEM [(int *)_33];
> +       vect__1.26_19 = .LEN_LOAD (_34, 128B, loop_len_16, 0);
> +       vect__2.27_20 = VEC_PACK_TRUNC_EXPR <vect__1.25_17, vect__1.26_19>;
> +       _30 = (void *) ivtmp.36_2;
> +       _31 = &MEM [(short int *)_30];
> +       .LEN_STORE (_31, 128B, _28, vect__2.27_20, 0);
> +       ivtmp_27 = ivtmp_26 - _28;
> +       ivtmp.35_11 = ivtmp.35_10 + POLY_INT_CST [32, 32];
> +       ivtmp.36_8 = ivtmp.36_2 + POLY_INT_CST [16, 16];
> +       if (ivtmp_27 != 0)
> +         goto ; [83.33%]
> +       else
> +         goto ; [16.67%]
> +
> +   The total length: _28 = MIN_EXPR <ivtmp_26, POLY_INT_CST [8, 8]>;
> +
> +   The length of the first half vector:
> +       loop_len_15 = MIN_EXPR <_28, POLY_INT_CST [4, 4]>;
> +
> +   The length of the second half vector:
> +       loop_len_15 = MIN_EXPR <_28, POLY_INT_CST [4, 4]>;
> +       loop_len_16 = _28 - loop_len_15;
> +
> +   1). _28 always <= POLY_INT_CST [8, 8].
> +   2). When _28 <= POLY_INT_CST [4, 4], the second half vector is not processed.
> +   3). When _28 > POLY_INT_CST [4, 4], the second half vector is processed.
> +*/
> +
> +static tree
> +vect_set_loop_controls_by_select_vl (class loop *loop, loop_vec_info loop_vinfo,
> +                                     gimple_seq *preheader_seq,
> +                                     gimple_seq *header_seq,
> +                                     rgroup_controls *rgc, tree niters,
> +                                     unsigned int controls_length)
> +{
> +  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +  /* We are not allowing a masked approach in SELECT_VL.  */
> +  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +
> +  tree ctrl_type = rgc->type;
> +  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
> +  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
> +  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> +
> +  /* Calculate the maximum number of item values that the rgroup
> +     handles in total, the number that it handles for each iteration
> +     of the vector loop.  */
> +  tree nitems_total = niters;
> +  if (nitems_per_iter != 1)
> +    {
> +      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
> +         these multiplications don't overflow.  */
> +      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
> +      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
> +                                   nitems_total, compare_factor);
> +    }
> +
> +  /* Convert the comparison value to the IV type (either a no-op or
> +     a promotion).  */
> +  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> +
> +  /* Create an induction variable that counts the number of items
> +     processed.  */
> +  tree index_before_incr, index_after_incr;
> +  gimple_stmt_iterator incr_gsi;
> +  bool insert_after;
> +  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +
> +  /* Test the decremented IV, which will never underflow 0 since we have
> +     IFN_SELECT_VL to guarantee that.  */
> +  tree test_limit = nitems_total;
> +
> +  /* Provide a definition of each control in the group.  */
> +  tree ctrl;
> +  unsigned int i;
> +  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
> +    {
> +      /* Previous controls will cover BIAS items.  This control covers the
> +         next batch.  */
> +      poly_uint64 bias = nitems_per_ctrl * i;
> +      tree bias_tree = build_int_cst (iv_type, bias);
> +
> +      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
> +         BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
> +         control and adjust the bound down by BIAS.  */
> +      tree this_test_limit = test_limit;
> +      if (i != 0)
> +        {
> +          this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
> +                                          this_test_limit, bias_tree);
> +          this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
> +                                          this_test_limit, bias_tree);
> +        }
> +
> +      /* Create decrement IV.  */
> +      create_iv (this_test_limit, MINUS_EXPR, ctrl, NULL_TREE, loop, &incr_gsi,
> +                 insert_after, &index_before_incr, &index_after_incr);
> +
> +      poly_uint64 final_vf = vf * nitems_per_iter;
> +      tree vf_step = build_int_cst (iv_type, final_vf);
> +      tree res_len;
> +      if (nitems_per_iter != 1 || controls_length != 1)
> +        {
> +          /* For SLP, we can't allow a non-VF number of elements to be
> +             processed in non-final iterations.  We force the number of
> +             elements to be processed in each non-final iteration to be VF.
> +             Allowing non-VF elements to be processed in non-final iterations
> +             would make SLP too complicated and produce inferior codegen.
> +
> +             For example:
> +
> +             If each non-final iteration processes VF elements:
> +
> +                ...
> +                .LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
> +                .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
> +                ...
> +
> +             If a non-final iteration processes a non-VF number of elements:
> +
> +                ...
> +                .LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
> +                if (_71 % 2 == 0)
> +                  .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
> +                else
> +                  .LEN_STORE (vectp_f.8_56, 128B, _72, { 2, 1, 2, 1 }, 0);
> +                ...
> +
> +             This is the simple case of 2-element interleaved vector SLP.
> +             If we consider other interleaved vectors, the situation becomes
> +             even more complicated.  */
> +          res_len = gimple_build (header_seq, MIN_EXPR, iv_type,
> +                                  index_before_incr, vf_step);
> +        }
> +      else
> +        {
> +          res_len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
> +                                  index_before_incr, vf_step);
> +        }
> +      gassign *assign = gimple_build_assign (ctrl, res_len);
> +      gimple_seq_add_stmt (header_seq, assign);
> +    }
> +
> +  return index_after_incr;
> +}
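[To see why a non-VF iteration would force the rotated constant in the second
example of the comment above, consider this small illustrative C program;
the { 1, 2 } interleave is taken from the example, everything else is made up.]

#include <stdio.h>

int
main (void)
{
  /* The second store begins at scalar lane VL1, so the interleaved
     constant it must use depends on the parity of VL1.  */
  for (unsigned vl1 = 3; vl1 <= 4; vl1++)
    {
      printf ("vl1 = %u, second store constant: { ", vl1);
      for (unsigned lane = 0; lane < 4; lane++)
        printf ("%u ", (vl1 + lane) % 2 ? 2 : 1);
      printf ("}\n");
    }
  /* prints: vl1 = 3 -> { 2 1 2 1 }, vl1 = 4 -> { 1 2 1 2 } */
  return 0;
}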
> +
>  /* Set up the iteration condition and rgroup controls for LOOP, given
>     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
>     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
> @@ -703,6 +1040,10 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>    bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>
>    tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +  bool use_vl_p = !use_masks_p
> +                  && direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
> +                                                     OPTIMIZE_FOR_SPEED);
>    unsigned int compare_precision = TYPE_PRECISION (compare_type);
>    tree orig_niters = niters;
> @@ -752,17 +1093,32 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>            continue;
>          }
>
> +      if (use_vl_p && rgc->controls.length () != 1)
> +        {
> +          rgroup_controls *sub_rgc
> +            = &(*controls)[nmasks / rgc->controls.length () - 1];
> +          if (!sub_rgc->controls.is_empty ()
> +              && vect_maybe_permute_loop_lens (iv_type, &header_seq, rgc,
> +                                               sub_rgc))
> +            continue;
> +        }
> +
>        /* See whether zero-based IV would ever generate all-false masks
>           or zero length before wrapping around.  */
>        bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
>
>        /* Set up all controls for this group.  */
> -      test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
> -                                                   &preheader_seq,
> -                                                   &header_seq,
> -                                                   loop_cond_gsi, rgc,
> -                                                   niters, niters_skip,
> -                                                   might_wrap_p);
> +      if (use_vl_p)
> +        test_ctrl
> +          = vect_set_loop_controls_by_select_vl (loop, loop_vinfo,
> +                                                 &preheader_seq, &header_seq,
> +                                                 rgc, niters,
> +                                                 controls->length ());
> +      else
> +        test_ctrl
> +          = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
> +                                             &header_seq, loop_cond_gsi, rgc,
> +                                             niters, niters_skip,
> +                                             might_wrap_p);
>      }
>
>    /* Emit all accumulated statements.  */
> @@ -893,7 +1249,7 @@ vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
>      }
>
>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> -  create_iv (init, step, NULL_TREE, loop,
> +  create_iv (init, PLUS_EXPR, step, NULL_TREE, loop,
>               &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
>    indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
>                                                true, NULL_TREE, true,
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 6ea0f21fd13..d7cf92576d1 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -5567,7 +5567,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>    gimple_stmt_iterator incr_gsi;
>    bool insert_after;
>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> -  create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
> +  create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
>               insert_after, &indx_before_incr, &indx_after_incr);
>
>    /* Next create a new phi node vector (NEW_PHI_TREE) which starts
> @@ -10361,15 +10361,18 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>  }
>
>  /* Given a complete set of length LENS, extract length number INDEX for an
> -   rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
> +   rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.
> +   Insert any set-up statements before GSI.  */
>
>  tree
> -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> -                   unsigned int nvectors, unsigned int index)
> +vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
> +                   vec_loop_lens *lens, unsigned int nvectors, tree vectype,
> +                   unsigned int index)
>  {
>    rgroup_controls *rgl = &(*lens)[nvectors - 1];
>    bool use_bias_adjusted_len =
>      LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> +  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>
>    /* Populate the rgroup's len array, if this is the first time we've
>       used it.  */
> @@ -10400,6 +10403,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>
>    if (use_bias_adjusted_len)
>      return rgl->bias_adjusted_ctrl;
> +  else if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
> +                                           OPTIMIZE_FOR_SPEED))
> +    {
> +      tree loop_len = rgl->controls[index];
> +      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
> +      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
> +      if (maybe_ne (nunits1, nunits2))
> +        {
> +          /* A loop len for data type X can be reused for data type Y
> +             if X has N times more elements than Y and if Y's elements
> +             are N times bigger than X's.  */
> +          gcc_assert (multiple_p (nunits1, nunits2));
> +          unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
> +          gimple_seq seq = NULL;
> +          loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
> +                                   build_int_cst (iv_type, factor));
> +          if (seq)
> +            gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
> +        }
> +      return loop_len;
> +    }
>    else
>      return rgl->controls[index];
>  }
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 3ad6a7d28d7..26813ab56e9 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -3147,6 +3147,61 @@ vect_get_data_ptr_increment (vec_info *vinfo,
>    return iv_step;
>  }
>
> +/* Prepare the pointer IVs which need to be updated by a variable amount.
> +   Such variable amount is the outcome of .SELECT_VL.  In this case, we can
> +   allow each iteration to process a flexible number of elements as long as
> +   the number is <= VF elements.
> +
> +   Return the data reference according to SELECT_VL.
> +   If new statements are needed, insert them before GSI.  */
> +
> +static tree
> +get_select_vl_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
> +                            tree aggr_type, class loop *at_loop, tree offset,
> +                            tree *dummy, gimple_stmt_iterator *gsi,
> +                            bool simd_lane_access_p, vec_loop_lens *loop_lens,
> +                            dr_vec_info *dr_info,
> +                            vect_memory_access_type memory_access_type)
> +{
> +  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
> +  tree step = vect_dr_behavior (vinfo, dr_info)->step;
> +
> +  /* TODO: We don't support gather/scatter or load_lanes/store_lanes for
> +     pointer IVs that are updated by a variable amount, but we will support
> +     them in the future.  */
> +  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
> +              && memory_access_type != VMAT_LOAD_STORE_LANES);
> +
> +  /* When we support the SELECT_VL pattern, we dynamically adjust
> +     the memory address by the .SELECT_VL result.
> +
> +     The result of .SELECT_VL is the number of elements to
> +     be processed in each iteration.  So the memory address
> +     adjustment operation should be:
> +
> +     bytesize = GET_MODE_SIZE (element_mode (aggr_type));
> +     addr = addr + .SELECT_VL (ARG..) * bytesize;
> +  */
> +  gimple *ptr_incr;
> +  tree loop_len
> +    = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, aggr_type, 0);
> +  tree len_type = TREE_TYPE (loop_len);
> +  poly_uint64 bytesize = GET_MODE_SIZE (element_mode (aggr_type));
> +  /* Since the outcome of .SELECT_VL is a count of elements, we should
> +     convert it to a byte size so that it can be used in the variable-amount
> +     adjustment of the address pointer IV.  */
> +  tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
> +                          build_int_cst (len_type, bytesize));
> +  if (tree_int_cst_sgn (step) == -1)
> +    tmp = fold_build1 (NEGATE_EXPR, len_type, tmp);
> +  tree bump = make_temp_ssa_name (len_type, NULL, "ivtmp");
> +  gassign *assign = gimple_build_assign (bump, tmp);
> +  gsi_insert_before (gsi, assign, GSI_SAME_STMT);
> +  return vect_create_data_ref_ptr (vinfo, stmt_info, aggr_type, at_loop, offset,
> +                                   dummy, gsi, &ptr_incr, simd_lane_access_p,
> +                                   bump);
> +}
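[A hypothetical scalar model of the bump computation above, illustrative
only; the names are made up.]

#include <stddef.h>
#include <stdio.h>

/* .SELECT_VL returns a count of elements, so the pointer IV must advance
   by that count scaled to bytes, negated when the access runs downwards.  */
static ptrdiff_t
select_vl_bump (unsigned vl, size_t elem_size, int step_sign)
{
  ptrdiff_t bump = (ptrdiff_t) (vl * elem_size);  /* loop_len * bytesize */
  return step_sign < 0 ? -bump : bump;            /* NEGATE_EXPR         */
}

int
main (void)
{
  printf ("%td\n", select_vl_bump (5, sizeof (int), 1));   /* prints 20  */
  printf ("%td\n", select_vl_bump (5, sizeof (int), -1));  /* prints -20 */
  return 0;
}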
> +
>  /* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64,128}.  */
>
>  static bool
> @@ -8279,7 +8334,7 @@ vectorizable_store (vec_info *vinfo,
>        stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base);
>        ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep);
> -      create_iv (stride_base, ivstep, NULL,
> +      create_iv (stride_base, PLUS_EXPR, ivstep, NULL,
>                   loop, &incr_gsi, insert_after,
>                   &offvar, NULL);
>        incr = gsi_stmt (incr_gsi);
> @@ -8540,6 +8595,17 @@ vectorizable_store (vec_info *vinfo,
>              vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
>                                           slp_node, &gs_info, &dataref_ptr,
>                                           &vec_offsets);
> +          else if (loop_lens && loop_lens->length () == 1
> +                   && direct_internal_fn_supported_p (
> +                     IFN_SELECT_VL, LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo),
> +                     OPTIMIZE_FOR_SPEED)
> +                   && memory_access_type != VMAT_INVARIANT)
> +            dataref_ptr
> +              = get_select_vl_data_ref_ptr (vinfo, stmt_info, aggr_type,
> +                                            simd_lane_access_p ? loop : NULL,
> +                                            offset, &dummy, gsi,
> +                                            simd_lane_access_p, loop_lens,
> +                                            dr_info, memory_access_type);
>            else
>              dataref_ptr
>                = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> @@ -8788,8 +8854,9 @@ vectorizable_store (vec_info *vinfo,
>                else if (loop_lens)
>                  {
>                    tree final_len
> -                    = vect_get_loop_len (loop_vinfo, loop_lens,
> -                                         vec_num * ncopies, vec_num * j + i);
> +                    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +                                         vec_num * ncopies, vectype,
> +                                         vec_num * j + i);
>                    tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
>                    machine_mode vmode = TYPE_MODE (vectype);
>                    opt_machine_mode new_ovmode
> @@ -9450,7 +9517,7 @@ vectorizable_load (vec_info *vinfo,
>        stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base);
>        ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep);
> -      create_iv (stride_base, ivstep, NULL,
> +      create_iv (stride_base, PLUS_EXPR, ivstep, NULL,
>                   loop, &incr_gsi, insert_after,
>                   &offvar, NULL);
> @@ -9928,6 +9995,16 @@ vectorizable_load (vec_info *vinfo,
>                slp_node, &gs_info, &dataref_ptr,
>                &vec_offsets);
>            }
> +        else if (loop_lens && loop_lens->length () == 1
> +                 && direct_internal_fn_supported_p (
> +                   IFN_SELECT_VL, LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo),
> +                   OPTIMIZE_FOR_SPEED)
> +                 && memory_access_type != VMAT_INVARIANT)
> +          dataref_ptr
> +            = get_select_vl_data_ref_ptr (vinfo, stmt_info, aggr_type,
> +                                          at_loop, offset, &dummy, gsi,
> +                                          simd_lane_access_p, loop_lens,
> +                                          dr_info, memory_access_type);
>          else
>            dataref_ptr
>              = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> @@ -10144,8 +10221,8 @@ vectorizable_load (vec_info *vinfo,
>              else if (loop_lens && memory_access_type != VMAT_INVARIANT)
>                {
>                  tree final_len
> -                  = vect_get_loop_len (loop_vinfo, loop_lens,
> -                                       vec_num * ncopies,
> +                  = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +                                       vec_num * ncopies, vectype,
>                                         vec_num * j + i);
>                  tree ptr = build_int_cst (ref_type,
>                                            align * BITS_PER_UNIT);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 9cf2fb23fe3..357bc5c7315 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
>                                  unsigned int, tree, unsigned int);
>  extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
>                                    tree, unsigned int);
> -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> -                               unsigned int);
> +extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
> +                               vec_loop_lens *, unsigned int, tree,
> +                               unsigned int);
>  extern gimple_seq vect_gen_len (tree, tree, tree, tree);
>  extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
>  extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);