From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <SRS0=JN2h=B3=gmail.com=npickito@sourceware.org>
Received: from mail-vk1-xa2b.google.com (mail-vk1-xa2b.google.com [IPv6:2607:f8b0:4864:20::a2b])
	by sourceware.org (Postfix) with ESMTPS id 0504F3857B98
	for <gcc-patches@gcc.gnu.org>; Wed,  7 Jun 2023 02:38:21 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 0504F3857B98
Authentication-Results: sourceware.org; dmarc=pass (p=none dis=none) header.from=gmail.com
Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=gmail.com
Received: by mail-vk1-xa2b.google.com with SMTP id 71dfb90a1353d-45d1c1404easo1208375e0c.1
        for <gcc-patches@gcc.gnu.org>; Tue, 06 Jun 2023 19:38:21 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=gmail.com; s=20221208; t=1686105500; x=1688697500;
        h=content-transfer-encoding:cc:to:subject:message-id:date:from
         :in-reply-to:references:mime-version:from:to:cc:subject:date
         :message-id:reply-to;
        bh=j4z593eNP0kPoL+TmfKIhhRjEyeYjCgLrV/U/GScFOY=;
        b=L+Z7N8RZixq7s337m2rl03vV3fB2UmgEj9i6RYluEkhJZJlkW3L559OE1yJmfij4Xx
         Ahv3UApZfpgbgqaBpX87kNbwg+64vyNyFFrRQ/UYbKE4UFZHbZMpWg03j4xlztuuMxh5
         L93EmI5i8FiZSnxfGfikc3+gxCktx1pkm80ttVzbYfCy3G4afjjgvkRg0LASjHyWLAcr
         LZBif0OSE76vPJrUZPFjXX+PK2uYcpQH7Tl9i9H4QXmUfs1Oqw1NgQdPb5E/TOjKQlLX
         FGbIeV9a7SKQ68XQZ1ptTLD+rVP9CS9omGGleKzX1iGSWgx5Hj2eB0plc2EyDXYR307t
         5YEg==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=1e100.net; s=20221208; t=1686105500; x=1688697500;
        h=content-transfer-encoding:cc:to:subject:message-id:date:from
         :in-reply-to:references:mime-version:x-gm-message-state:from:to:cc
         :subject:date:message-id:reply-to;
        bh=j4z593eNP0kPoL+TmfKIhhRjEyeYjCgLrV/U/GScFOY=;
        b=ClKJ0jRZ/Yhuv8kSnKauzzmJCARuDmhatqJrplH40zfyNlhXgMu8wi/hf5l6QC7nG4
         yUGsr1pK6lMCppc02e7ntZklfaISMiBHHaObJRKtuqCuK2oJq8JbOb4eM8n66BQ8rMLu
         snPPbcmm3MBDOndXa+W2wfZKVy+9HZ74SOkuLnW+yV6Eb7abwc3SuNCa6VzR1+CdnmVS
         JZ7zlbSDFvDOZycbpTix0G5vYEBIfEryawqHYMHtUOSbEgGLRaKhxs7FGGefuh9cwDbH
         HoyIpjgS9Wq0uzFxXpB57VE6j+p/Pd+qqIgVBHVssBKzea/fSYWCltgdR5FW2QvfcP6r
         7SJQ==
X-Gm-Message-State: AC+VfDyUPiVY+YOYccIyTFQ+e/MbpCNLpqUuLKegM6i2sHJSRmWytF0H
	w+26HcBD5MtzI0X00U7eJnUBxNCoksojP4YxTZc=
X-Google-Smtp-Source: ACHHUZ47gEZrSHZKvRLmoC/j1oW38ai1bsZWvCmrHfhDiEkNxw1YYOorNqtJVWjQCmY1RDP4UwmBA3scPXMini7ejIc=
X-Received: by 2002:a1f:bf0b:0:b0:462:7eb0:6f8 with SMTP id
 p11-20020a1fbf0b000000b004627eb006f8mr1287806vkf.1.1686105499884; Tue, 06 Jun
 2023 19:38:19 -0700 (PDT)
MIME-Version: 1.0
References: <20230606041635.226494-1-juzhe.zhong@rivai.ai> <E249DB918182191C+2023060708380880756997@rivai.ai>
In-Reply-To: <E249DB918182191C+2023060708380880756997@rivai.ai>
From: Kito Cheng <kito.cheng@gmail.com>
Date: Wed, 7 Jun 2023 10:38:08 +0800
Message-ID: <CA+yXCZCykK6T9A77kgWy=JSD=hKeRsYB8YsF8t14T0nPAxpQOg@mail.gmail.com>
Subject: Re: [PATCH] RISC-V: Support RVV VLA SLP auto-vectorization
To: "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai>
Cc: gcc-patches <gcc-patches@gcc.gnu.org>, "Kito.cheng" <kito.cheng@sifive.com>, 
	palmer <palmer@dabbelt.com>, palmer <palmer@rivosinc.com>, 
	jeffreyalaw <jeffreyalaw@gmail.com>, Robin Dapp <rdapp.gcc@gmail.com>, 
	"pan2.li" <pan2.li@intel.com>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Spam-Status: No, score=-7.8 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,FREEMAIL_FROM,GIT_PATCH_0,KAM_SHORT,RCVD_IN_DNSWL_NONE,SCC_10_SHORT_WORD_LINES,SCC_5_SHORT_WORD_LINES,SPF_HELO_NONE,SPF_PASS,TXREP,T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org
List-Id: <gcc-patches.gcc.gnu.org>

A few comments, but all of them are asking for more comments :P

> @@ -398,6 +410,48 @@ rvv_builder::get_merge_scalar_mask (unsigned int ind=
ex_in_pattern) const
>    return gen_int_mode (mask, inner_int_mode ());
>  }
>
> +/* Return true if the variable-length vector is single step.  */
> +bool
> +rvv_builder::single_step_npatterns_p () const

What is single_step_npatterns? Could you add a more detailed comment?

> +{
> +  if (nelts_per_pattern () !=3D 3)
> +    return false;
> +
> +  poly_int64 step
> +    =3D rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt =
(0));
> +  for (unsigned int i =3D 0; i < npatterns (); i++)
> +    {
> +      poly_int64 ele0 =3D rtx_to_poly_int64 (elt (i));
> +      poly_int64 ele1 =3D rtx_to_poly_int64 (elt (npatterns () + i));
> +      poly_int64 ele2 =3D rtx_to_poly_int64 (elt (npatterns () * 2 + i))=
;
> +      poly_int64 diff1 =3D ele1 - ele0;
> +      poly_int64 diff2 =3D ele2 - ele1;
> +      if (maybe_ne (step, diff1) || maybe_ne (step, diff2))
> +       return false;
> +    }
> +  return true;
> +}
> +
> +/* Return true if all elements of NPATTERNS are equal.
> +
> +   E.g. NPATTERNS =3D 4:
> +     { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
> +   E.g. NPATTERNS =3D 8:
> +     { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }
> +*/
> +bool
> +rvv_builder::npatterns_all_equal_p () const
> +{
> +  poly_int64 ele0 =3D rtx_to_poly_int64 (elt (0));
> +  for (unsigned int i =3D 1; i < npatterns (); i++)
> +    {
> +      poly_int64 ele =3D rtx_to_poly_int64 (elt (i));
> +      if (!known_eq (ele, ele0))
> +       return false;
> +    }
> +  return true;
> +}
> +
>  static unsigned
>  get_sew (machine_mode mode)
>  {
> @@ -425,7 +479,7 @@ const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT m=
inval,
>     future.  */
>
>  static bool
> -const_vec_all_in_range_p (rtx vec, HOST_WIDE_INT minval, HOST_WIDE_INT m=
axval)
> +const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
>  {
>    if (!CONST_VECTOR_P (vec)
>        || GET_MODE_CLASS (GET_MODE (vec)) !=3D MODE_VECTOR_INT)
> @@ -440,8 +494,10 @@ const_vec_all_in_range_p (rtx vec, HOST_WIDE_INT min=
val, HOST_WIDE_INT maxval)
>    for (int i =3D 0; i < nunits; i++)
>      {
>        rtx vec_elem =3D CONST_VECTOR_ELT (vec, i);
> -      if (!CONST_INT_P (vec_elem)
> -         || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
> +      poly_int64 value;
> +      if (!poly_int_rtx_p (vec_elem, &value)
> +         || maybe_lt (value, minval)
> +         || maybe_gt (value, maxval))
>         return false;
>      }
>    return true;
> @@ -453,7 +509,7 @@ const_vec_all_in_range_p (rtx vec, HOST_WIDE_INT minv=
al, HOST_WIDE_INT maxval)
>     future.  */
>
>  static rtx
> -gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
> +gen_const_vector_dup (machine_mode mode, poly_int64 val)
>  {
>    rtx c =3D gen_int_mode (val, GET_MODE_INNER (mode));
>    return gen_const_vec_duplicate (mode, c);
> @@ -727,7 +783,10 @@ emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
>    rtx elt;
>    insn_code icode;
>    machine_mode data_mode =3D GET_MODE (target);
> -  if (const_vec_duplicate_p (sel, &elt))
> +  machine_mode sel_mode =3D GET_MODE (sel);
> +  if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
> +    icode =3D code_for_pred_gatherei16 (data_mode);
> +  else if (const_vec_duplicate_p (sel, &elt))
>      {
>        icode =3D code_for_pred_gather_scalar (data_mode);
>        sel =3D elt;
> @@ -744,7 +803,10 @@ emit_vlmax_masked_gather_mu_insn (rtx target, rtx op=
, rtx sel, rtx mask)
>    rtx elt;
>    insn_code icode;
>    machine_mode data_mode =3D GET_MODE (target);
> -  if (const_vec_duplicate_p (sel, &elt))
> +  machine_mode sel_mode =3D GET_MODE (sel);
> +  if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
> +    icode =3D code_for_pred_gatherei16 (data_mode);
> +  else if (const_vec_duplicate_p (sel, &elt))
>      {
>        icode =3D code_for_pred_gather_scalar (data_mode);
>        sel =3D elt;
> @@ -895,11 +957,130 @@ expand_const_vector (rtx target, rtx src)
>        return;
>      }
>
> -  /* TODO: We only support const duplicate vector for now. More cases
> -     will be supported when we support auto-vectorization:
> +  /* Handle variable-length vector.  */
> +  unsigned int nelts_per_pattern =3D CONST_VECTOR_NELTS_PER_PATTERN (src=
);
> +  unsigned int npatterns =3D CONST_VECTOR_NPATTERNS (src);
> +  rvv_builder builder (mode, npatterns, nelts_per_pattern);
> +  for (unsigned int i =3D 0; i < nelts_per_pattern; i++)
> +    {
> +      for (unsigned int j =3D 0; j < npatterns; j++)
> +       builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j));
> +    }
> +  builder.finalize ();
>
> -       1. multiple elts duplicate vector.
> -       2. multiple patterns with multiple elts.  */
> +  if (CONST_VECTOR_DUPLICATE_P (src))


When I read this check, I thought CONST_VECTOR_DUPLICATE_P was a
predicate for a vector whose elements all have the same value, like
[a, a, a, a, ...], but it seems that is not the case? Could you add a
comment explaining that?

> +    {
> +      if (builder.can_duplicate_repeating_sequence_p ())

Please also add a comment explaining this check.

> +       {
> +         rtx ele =3D builder.get_merged_repeating_sequence ();
> +         rtx dup =3D expand_vector_broadcast (builder.new_mode (), ele);
> +         emit_move_insn (target, gen_lowpart (mode, dup));
> +       }
> +      else

And please add a comment for this else branch as well.

> +       {
> +         unsigned int nbits =3D npatterns - 1;
> +
> +         /* Generate vid =3D { 0, 1, 2, 3, 4, 5, 6, 7, ... }.  */
> +         rtx vid =3D gen_reg_rtx (builder.int_mode ());
> +         rtx op[] =3D {vid};
> +         emit_vlmax_insn (code_for_pred_series (builder.int_mode ()),
> +                          RVV_MISC_OP, op);
> +
> +         /* Generate vid_repeat =3D { 0, 1, ... nbits, ... }  */
> +         rtx vid_repeat =3D gen_reg_rtx (builder.int_mode ());
> +         rtx and_ops[] =3D {vid_repeat, vid,
> +                          gen_int_mode (nbits, builder.inner_int_mode ()=
)};
> +         emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()=
),
> +                          RVV_BINOP, and_ops);
> +
> +         rtx tmp =3D gen_reg_rtx (builder.mode ());
> +         rtx dup_ops[] =3D {tmp, builder.elt (0)};
> +         emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), RVV=
_UNOP,
> +                          dup_ops);
> +         for (unsigned int i =3D 1; i < builder.npatterns (); i++)
> +           {
> +             /* Generate mask according to i.  */
> +             rtx mask =3D gen_reg_rtx (builder.mask_mode ());
> +             rtx const_vec =3D gen_const_vector_dup (builder.int_mode ()=
, i);
> +             expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
> +
> +             /* Merge scalar to each i.  */
> +             rtx tmp2 =3D gen_reg_rtx (builder.mode ());
> +             rtx merge_ops[] =3D {tmp2, tmp, builder.elt (i), mask};
> +             insn_code icode =3D code_for_pred_merge_scalar (builder.mod=
e ());
> +             emit_vlmax_merge_insn (icode, RVV_MERGE_OP, merge_ops);
> +             tmp =3D tmp2;
> +           }
> +         emit_move_insn (target, tmp);
> +       }
> +      return;
> +    }
> +  else if (CONST_VECTOR_STEPPED_P (src))
> +    {
> +      gcc_assert (GET_MODE_CLASS (mode) =3D=3D MODE_VECTOR_INT);
> +      if (builder.single_step_npatterns_p ())
> +       {
> +         /* Describe the case by choosing NPATTERNS =3D 4 as an example.=
  */
> +         rtx base, step;
> +         if (builder.npatterns_all_equal_p ())
> +           {
> +             /* Generate the variable-length vector as below:
> +                E.g. { 0, 0, 0, 0, 8, 8, 8, 8, 16, 16, 16, 16, ... } */

Please add a more general comment, like:
{ a, a, a, a, a + step, a + step, a + step, a + step,
  a + step * 2, a + step * 2, a + step * 2, a + step * 2, ... }

> +             /* Step 1: Generate base =3D { 0, 0, 0, 0, 0, 0, 0, ... }. =
 */
> +             base =3D expand_vector_broadcast (builder.mode (), builder.=
elt (0));
> +           }
> +         else
> +           {
> +             /* Generate the variable-length vector as below:
> +                E.g. { 0, 6, 0, 6, 8, 14, 8, 14, 16, 22, 16, 22, ... } *=
/

Please add a more general comment, like:
{ a, b, a, b, a + step, b + step, a + step, b + step,
  a + step * 2, b + step * 2, a + step * 2, b + step * 2, ... }

> +             /* Step 1: Generate base =3D { 0, 6, 0, 6, ... }.  */
> +             rvv_builder new_builder (builder.mode (), builder.npatterns=
 (),
> +                                      1);
> +             for (unsigned int i =3D 0; i < builder.npatterns (); ++i)
> +               new_builder.quick_push (builder.elt (i));
> +             rtx new_vec =3D new_builder.build ();
> +             base =3D gen_reg_rtx (builder.mode ());
> +             emit_move_insn (base, new_vec);
> +           }
> +
> +         /* Step 2: Generate step =3D gen_int_mode (diff, mode).  */
> +         poly_int64 value1 =3D rtx_to_poly_int64 (builder.elt (0));
> +         poly_int64 value2
> +           =3D rtx_to_poly_int64 (builder.elt (builder.npatterns ()));
> +         poly_int64 diff =3D value2 - value1;
> +         step =3D gen_int_mode (diff, builder.inner_mode ());
> +
> +         /* Step 3: Generate vid =3D { 0, 1, 2, 3, 4, 5, 6, 7, ... }.  *=
/
> +         rtx vid =3D gen_reg_rtx (builder.mode ());
> +         rtx op[] =3D {vid};
> +         emit_vlmax_insn (code_for_pred_series (builder.mode ()), RVV_MI=
SC_OP,
> +                          op);
> +
> +         /* Step 4: Generate factor =3D { 0, 0, 0, 0, 1, 1, 1, 1, ... }.=
  */
> +         rtx factor =3D gen_reg_rtx (builder.mode ());
> +         rtx shift_ops[]
> +           =3D {factor, vid,
> +              gen_int_mode (exact_log2 (builder.npatterns ()), Pmode)};

Do we check somewhere that builder.npatterns () must be a power of 2?

> +         emit_vlmax_insn (code_for_pred_scalar (LSHIFTRT, builder.mode (=
)),
> +                          RVV_BINOP, shift_ops);
> +
> +         /* Step 5: Generate adjusted step =3D { 0, 0, 0, 0, diff, diff,=
 ... } */
> +         rtx adjusted_step =3D gen_reg_rtx (builder.mode ());
> +         rtx mul_ops[] =3D {adjusted_step, factor, step};
> +         emit_vlmax_insn (code_for_pred_scalar (MULT, builder.mode ()),
> +                          RVV_BINOP, mul_ops);
> +
> +         /* Step 6: Generate the final result.  */
> +         rtx add_ops[] =3D {target, base, adjusted_step};
> +         emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()), RVV_BIN=
OP,
> +                          add_ops);
> +       }
> +      else
> +       /* TODO: We will enable more variable-length vector in the future=
.  */
> +       gcc_unreachable ();
> +    }
> +  else
> +    gcc_unreachable ();
>  }
>
>  /* Expand a pre-RA RVV data move from SRC to DEST.

On Wed, Jun 7, 2023 at 8:39=E2=80=AFAM juzhe.zhong@rivai.ai
<juzhe.zhong@rivai.ai> wrote:
>
> Ping this patch. Ok for trunk ?
> Since following patches are blocked by this.
>
>
>
> juzhe.zhong@rivai.ai
>
> From: juzhe.zhong
> Date: 2023-06-06 12:16
> To: gcc-patches
> CC: kito.cheng; kito.cheng; palmer; palmer; jeffreyalaw; rdapp.gcc; pan2.=
li; Juzhe-Zhong
> Subject: [PATCH] RISC-V: Support RVV VLA SLP auto-vectorization
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
>
> This patch enables basic VLA SLP auto-vectorization.
> Consider this following case:
> void
> f (uint8_t *restrict a, uint8_t *restrict b)
> {
>   for (int i =3D 0; i < 100; ++i)
>     {
>       a[i * 8 + 0] =3D b[i * 8 + 7] + 1;
>       a[i * 8 + 1] =3D b[i * 8 + 7] + 2;
>       a[i * 8 + 2] =3D b[i * 8 + 7] + 8;
>       a[i * 8 + 3] =3D b[i * 8 + 7] + 4;
>       a[i * 8 + 4] =3D b[i * 8 + 7] + 5;
>       a[i * 8 + 5] =3D b[i * 8 + 7] + 6;
>       a[i * 8 + 6] =3D b[i * 8 + 7] + 7;
>       a[i * 8 + 7] =3D b[i * 8 + 7] + 3;
>     }
> }
>
> To enable VLA SLP auto-vectorization, we should be able to handle this fo=
llowing const vector:
>
> 1. NPATTERNS =3D 8, NELTS_PER_PATTERN =3D 3.
> { 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16,=
 16, 16, ... }
>
> 2. NPATTERNS =3D 8, NELTS_PER_PATTERN =3D 1.
> { 1, 2, 8, 4, 5, 6, 7, 3, ... }
>
> And these vector can be generated at prologue.
>
> After this patch, we end up with this following codegen:
>
> Prologue:
> ...
>         vsetvli a7,zero,e16,m2,ta,ma
>         vid.v   v4
>         vsrl.vi v4,v4,3
>         li      a3,8
>         vmul.vx v4,v4,a3  =3D=3D=3D> v4 =3D { 0, 0, 0, 0, 0, 0, 0, 0, 8, =
8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16, ... }
> ...
>         li      t1,67633152
>         addi    t1,t1,513
>         li      a3,50790400
>         addi    a3,a3,1541
>         slli    a3,a3,32
>         add     a3,a3,t1
>         vsetvli t1,zero,e64,m1,ta,ma
>         vmv.v.x v3,a3   =3D=3D=3D> v3 =3D { 1, 2, 8, 4, 5, 6, 7, 3, ... }
> ...
> LoopBody:
> ...
>         min     a3,...
>         vsetvli zero,a3,e8,m1,ta,ma
>         vle8.v  v2,0(a6)
>         vsetvli a7,zero,e8,m1,ta,ma
>         vrgatherei16.vv v1,v2,v4
>         vadd.vv v1,v1,v3
>         vsetvli zero,a3,e8,m1,ta,ma
>         vse8.v  v1,0(a2)
>         add     a6,a6,a4
>         add     a2,a2,a4
>         mv      a3,a5
>         add     a5,a5,t1
>         bgtu    a3,a4,.L3
> ...
>
> Note: we need to use "vrgatherei16.vv" instead of "vrgather.vv" for SEW =
=3D 8 since "vrgatherei16.vv" can cover larger
>       range than "vrgather.vv" (which only can maximum element index =3D =
255).
> Epilogue:
>         lbu     a5,799(a1)
>         addiw   a4,a5,1
>         sb      a4,792(a0)
>         addiw   a4,a5,2
>         sb      a4,793(a0)
>         addiw   a4,a5,8
>         sb      a4,794(a0)
>         addiw   a4,a5,4
>         sb      a4,795(a0)
>         addiw   a4,a5,5
>         sb      a4,796(a0)
>         addiw   a4,a5,6
>         sb      a4,797(a0)
>         addiw   a4,a5,7
>         sb      a4,798(a0)
>         addiw   a5,a5,3
>         sb      a5,799(a0)
>         ret
>
> There is one more last thing we need to do is the "Epilogue auto-vectoriz=
ation" which needs VLS modes support.
> I will support VLS modes for "Epilogue auto-vectorization" in the future.
>
> gcc/ChangeLog:
>
>         * config/riscv/riscv-protos.h (expand_vec_perm_const): New functi=
on.
>         * config/riscv/riscv-v.cc (rvv_builder::can_duplicate_repeating_s=
equence_p): Support POLY handling.
>         (rvv_builder::single_step_npatterns_p): New function.
>         (rvv_builder::npatterns_all_equal_p): Ditto.
>         (const_vec_all_in_range_p): Support POLY handling.
>         (gen_const_vector_dup): Ditto.
>         (emit_vlmax_gather_insn): Add vrgatherei16.
>         (emit_vlmax_masked_gather_mu_insn): Ditto.
>         (expand_const_vector): Add VLA SLP const vector support.
>         (expand_vec_perm): Support POLY.
>         (struct expand_vec_perm_d): New struct.
>         (shuffle_generic_patterns): New function.
>         (expand_vec_perm_const_1): Ditto.
>         (expand_vec_perm_const): Ditto.
>         * config/riscv/riscv.cc (riscv_vectorize_vec_perm_const): Ditto.
>         (TARGET_VECTORIZE_VEC_PERM_CONST): New targethook.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/autovec/scalable-1.c: Adapt testcase for V=
LA vectorizer.
>         * gcc.target/riscv/rvv/autovec/v-1.c: Ditto.
>         * gcc.target/riscv/rvv/autovec/zve32f_zvl128b-1.c: Ditto.
>         * gcc.target/riscv/rvv/autovec/zve32x_zvl128b-1.c: Ditto.
>         * gcc.target/riscv/rvv/autovec/zve64d-1.c: Ditto.
>         * gcc.target/riscv/rvv/autovec/zve64d_zvl128b-1.c: Ditto.
>         * gcc.target/riscv/rvv/autovec/zve64f-1.c: Ditto.
>         * gcc.target/riscv/rvv/autovec/zve64f_zvl128b-1.c: Ditto.
>         * gcc.target/riscv/rvv/autovec/zve64x_zvl128b-1.c: Ditto.
>         * gcc.target/riscv/rvv/autovec/partial/slp-1.c: New test.
>         * gcc.target/riscv/rvv/autovec/partial/slp-2.c: New test.
>         * gcc.target/riscv/rvv/autovec/partial/slp-3.c: New test.
>         * gcc.target/riscv/rvv/autovec/partial/slp-4.c: New test.
>         * gcc.target/riscv/rvv/autovec/partial/slp-5.c: New test.
>         * gcc.target/riscv/rvv/autovec/partial/slp-6.c: New test.
>         * gcc.target/riscv/rvv/autovec/partial/slp-7.c: New test.
>         * gcc.target/riscv/rvv/autovec/partial/slp_run-1.c: New test.
>         * gcc.target/riscv/rvv/autovec/partial/slp_run-2.c: New test.
>         * gcc.target/riscv/rvv/autovec/partial/slp_run-3.c: New test.
>         * gcc.target/riscv/rvv/autovec/partial/slp_run-4.c: New test.
>         * gcc.target/riscv/rvv/autovec/partial/slp_run-5.c: New test.
>         * gcc.target/riscv/rvv/autovec/partial/slp_run-6.c: New test.
>         * gcc.target/riscv/rvv/autovec/partial/slp_run-7.c: New test.
>
> ---
> gcc/config/riscv/riscv-protos.h               |   2 +
> gcc/config/riscv/riscv-v.cc                   | 352 ++++++++++++++++--
> gcc/config/riscv/riscv.cc                     |  16 +
> .../riscv/rvv/autovec/partial/slp-1.c         |  22 ++
> .../riscv/rvv/autovec/partial/slp-2.c         |  22 ++
> .../riscv/rvv/autovec/partial/slp-3.c         |  22 ++
> .../riscv/rvv/autovec/partial/slp-4.c         |  22 ++
> .../riscv/rvv/autovec/partial/slp-5.c         |  22 ++
> .../riscv/rvv/autovec/partial/slp-6.c         |  23 ++
> .../riscv/rvv/autovec/partial/slp-7.c         |  15 +
> .../riscv/rvv/autovec/partial/slp_run-1.c     |  66 ++++
> .../riscv/rvv/autovec/partial/slp_run-2.c     |  67 ++++
> .../riscv/rvv/autovec/partial/slp_run-3.c     |  67 ++++
> .../riscv/rvv/autovec/partial/slp_run-4.c     |  67 ++++
> .../riscv/rvv/autovec/partial/slp_run-5.c     |  67 ++++
> .../riscv/rvv/autovec/partial/slp_run-6.c     |  67 ++++
> .../riscv/rvv/autovec/partial/slp_run-7.c     |  58 +++
> .../gcc.target/riscv/rvv/autovec/scalable-1.c |   2 +-
> .../gcc.target/riscv/rvv/autovec/v-1.c        |   7 +-
> .../riscv/rvv/autovec/zve32f_zvl128b-1.c      |   2 +-
> .../riscv/rvv/autovec/zve32x_zvl128b-1.c      |   2 +-
> .../gcc.target/riscv/rvv/autovec/zve64d-1.c   |   2 +-
> .../riscv/rvv/autovec/zve64d_zvl128b-1.c      |   2 +-
> .../gcc.target/riscv/rvv/autovec/zve64f-1.c   |   2 +-
> .../riscv/rvv/autovec/zve64f_zvl128b-1.c      |   2 +-
> .../riscv/rvv/autovec/zve64x_zvl128b-1.c      |   2 +-
> 26 files changed, 963 insertions(+), 37 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp=
-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp=
-2.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp=
-3.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp=
-4.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp=
-5.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp=
-6.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp=
-7.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp=
_run-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp=
_run-2.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp=
_run-3.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp=
_run-4.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp=
_run-5.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp=
_run-6.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp=
_run-7.c
>
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-pro=
tos.h
> index d770e5e826e..27ecd16e496 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -168,6 +168,8 @@ void init_builtins (void);
> const char *mangle_builtin_type (const_tree);
> #ifdef GCC_TARGET_H
> bool verify_type_context (location_t, type_context_kind, const_tree, bool=
);
> +bool expand_vec_perm_const (machine_mode, machine_mode, rtx, rtx, rtx,
> +     const vec_perm_indices &);
> #endif
> void handle_pragma_vector (void);
> tree builtin_decl (unsigned, bool);
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index 83277fc2c05..4864429ed06 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -251,9 +251,12 @@ public:
>      m_inner_mode =3D GET_MODE_INNER (mode);
>      m_inner_bits_size =3D GET_MODE_BITSIZE (m_inner_mode);
>      m_inner_bytes_size =3D GET_MODE_SIZE (m_inner_mode);
> +    m_mask_mode =3D get_mask_mode (mode).require ();
>      gcc_assert (
>        int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mod=
e));
> +    m_int_mode
> +      =3D get_vector_mode (m_inner_int_mode, GET_MODE_NUNITS (mode)).req=
uire ();
>    }
>    bool can_duplicate_repeating_sequence_p ();
> @@ -262,9 +265,14 @@ public:
>    bool repeating_sequence_use_merge_profitable_p ();
>    rtx get_merge_scalar_mask (unsigned int) const;
> +  bool single_step_npatterns_p () const;
> +  bool npatterns_all_equal_p () const;
> +
>    machine_mode new_mode () const { return m_new_mode; }
>    scalar_mode inner_mode () const { return m_inner_mode; }
>    scalar_int_mode inner_int_mode () const { return m_inner_int_mode; }
> +  machine_mode mask_mode () const { return m_mask_mode; }
> +  machine_mode int_mode () const { return m_int_mode; }
>    unsigned int inner_bits_size () const { return m_inner_bits_size; }
>    unsigned int inner_bytes_size () const { return m_inner_bytes_size; }
> @@ -273,6 +281,8 @@ private:
>    scalar_int_mode m_inner_int_mode;
>    machine_mode m_new_mode;
>    scalar_int_mode m_new_inner_mode;
> +  machine_mode m_mask_mode;
> +  machine_mode m_int_mode;
>    unsigned int m_inner_bits_size;
>    unsigned int m_inner_bytes_size;
> };
> @@ -290,7 +300,9 @@ rvv_builder::can_duplicate_repeating_sequence_p ()
>        || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
>        || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mo=
de))
>      return false;
> -  return repeating_sequence_p (0, full_nelts ().to_constant (), npattern=
s ());
> +  if (full_nelts ().is_constant ())
> +    return repeating_sequence_p (0, full_nelts ().to_constant (), npatte=
rns ());
> +  return nelts_per_pattern () =3D=3D 1;
> }
> /* Return true if it is a repeating sequence that using
> @@ -398,6 +410,48 @@ rvv_builder::get_merge_scalar_mask (unsigned int ind=
ex_in_pattern) const
>    return gen_int_mode (mask, inner_int_mode ());
> }
> +/* Return true if the variable-length vector is single step.  */
> +bool
> +rvv_builder::single_step_npatterns_p () const
> +{
> +  if (nelts_per_pattern () !=3D 3)
> +    return false;
> +
> +  poly_int64 step
> +    =3D rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt =
(0));
> +  for (unsigned int i =3D 0; i < npatterns (); i++)
> +    {
> +      poly_int64 ele0 =3D rtx_to_poly_int64 (elt (i));
> +      poly_int64 ele1 =3D rtx_to_poly_int64 (elt (npatterns () + i));
> +      poly_int64 ele2 =3D rtx_to_poly_int64 (elt (npatterns () * 2 + i))=
;
> +      poly_int64 diff1 =3D ele1 - ele0;
> +      poly_int64 diff2 =3D ele2 - ele1;
> +      if (maybe_ne (step, diff1) || maybe_ne (step, diff2))
> + return false;
> +    }
> +  return true;
> +}
> +
> +/* Return true if all elements of NPATTERNS are equal.
> +
> +   E.g. NPATTERNS =3D 4:
> +     { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
> +   E.g. NPATTERNS =3D 8:
> +     { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }
> +*/
> +bool
> +rvv_builder::npatterns_all_equal_p () const
> +{
> +  poly_int64 ele0 =3D rtx_to_poly_int64 (elt (0));
> +  for (unsigned int i =3D 1; i < npatterns (); i++)
> +    {
> +      poly_int64 ele =3D rtx_to_poly_int64 (elt (i));
> +      if (!known_eq (ele, ele0))
> + return false;
> +    }
> +  return true;
> +}
> +
> static unsigned
> get_sew (machine_mode mode)
> {
> @@ -425,7 +479,7 @@ const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT m=
inval,
>     future.  */
> static bool
> -const_vec_all_in_range_p (rtx vec, HOST_WIDE_INT minval, HOST_WIDE_INT m=
axval)
> +const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
> {
>    if (!CONST_VECTOR_P (vec)
>        || GET_MODE_CLASS (GET_MODE (vec)) !=3D MODE_VECTOR_INT)
> @@ -440,8 +494,10 @@ const_vec_all_in_range_p (rtx vec, HOST_WIDE_INT min=
val, HOST_WIDE_INT maxval)
>    for (int i =3D 0; i < nunits; i++)
>      {
>        rtx vec_elem =3D CONST_VECTOR_ELT (vec, i);
> -      if (!CONST_INT_P (vec_elem)
> -   || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
> +      poly_int64 value;
> +      if (!poly_int_rtx_p (vec_elem, &value)
> +   || maybe_lt (value, minval)
> +   || maybe_gt (value, maxval))
> return false;
>      }
>    return true;
> @@ -453,7 +509,7 @@ const_vec_all_in_range_p (rtx vec, HOST_WIDE_INT minv=
al, HOST_WIDE_INT maxval)
>     future.  */
> static rtx
> -gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
> +gen_const_vector_dup (machine_mode mode, poly_int64 val)
> {
>    rtx c =3D gen_int_mode (val, GET_MODE_INNER (mode));
>    return gen_const_vec_duplicate (mode, c);
> @@ -727,7 +783,10 @@ emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
>    rtx elt;
>    insn_code icode;
>    machine_mode data_mode =3D GET_MODE (target);
> -  if (const_vec_duplicate_p (sel, &elt))
> +  machine_mode sel_mode =3D GET_MODE (sel);
> +  if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
> +    icode =3D code_for_pred_gatherei16 (data_mode);
> +  else if (const_vec_duplicate_p (sel, &elt))
>      {
>        icode =3D code_for_pred_gather_scalar (data_mode);
>        sel =3D elt;
> @@ -744,7 +803,10 @@ emit_vlmax_masked_gather_mu_insn (rtx target, rtx op=
, rtx sel, rtx mask)
>    rtx elt;
>    insn_code icode;
>    machine_mode data_mode =3D GET_MODE (target);
> -  if (const_vec_duplicate_p (sel, &elt))
> +  machine_mode sel_mode =3D GET_MODE (sel);
> +  if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
> +    icode =3D code_for_pred_gatherei16 (data_mode);
> +  else if (const_vec_duplicate_p (sel, &elt))
>      {
>        icode =3D code_for_pred_gather_scalar (data_mode);
>        sel =3D elt;
> @@ -895,11 +957,130 @@ expand_const_vector (rtx target, rtx src)
>        return;
>      }
> -  /* TODO: We only support const duplicate vector for now. More cases
> -     will be supported when we support auto-vectorization:
> +  /* Handle variable-length vector.  */
> +  unsigned int nelts_per_pattern =3D CONST_VECTOR_NELTS_PER_PATTERN (src=
);
> +  unsigned int npatterns =3D CONST_VECTOR_NPATTERNS (src);
> +  rvv_builder builder (mode, npatterns, nelts_per_pattern);
> +  for (unsigned int i =3D 0; i < nelts_per_pattern; i++)
> +    {
> +      for (unsigned int j =3D 0; j < npatterns; j++)
> + builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j));
> +    }
> +  builder.finalize ();
> -       1. multiple elts duplicate vector.
> -       2. multiple patterns with multiple elts.  */
> +  if (CONST_VECTOR_DUPLICATE_P (src))
> +    {
> +      if (builder.can_duplicate_repeating_sequence_p ())
> + {
> +   rtx ele =3D builder.get_merged_repeating_sequence ();
> +   rtx dup =3D expand_vector_broadcast (builder.new_mode (), ele);
> +   emit_move_insn (target, gen_lowpart (mode, dup));
> + }
> +      else
> + {
> +   unsigned int nbits =3D npatterns - 1;
> +
> +   /* Generate vid =3D { 0, 1, 2, 3, 4, 5, 6, 7, ... }.  */
> +   rtx vid =3D gen_reg_rtx (builder.int_mode ());
> +   rtx op[] =3D {vid};
> +   emit_vlmax_insn (code_for_pred_series (builder.int_mode ()),
> +    RVV_MISC_OP, op);
> +
> +   /* Generate vid_repeat =3D { 0, 1, ... nbits, ... }  */
> +   rtx vid_repeat =3D gen_reg_rtx (builder.int_mode ());
> +   rtx and_ops[] =3D {vid_repeat, vid,
> +    gen_int_mode (nbits, builder.inner_int_mode ())};
> +   emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
> +    RVV_BINOP, and_ops);
> +
> +   rtx tmp =3D gen_reg_rtx (builder.mode ());
> +   rtx dup_ops[] =3D {tmp, builder.elt (0)};
> +   emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), RVV_UNOP,
> +    dup_ops);
> +   for (unsigned int i =3D 1; i < builder.npatterns (); i++)
> +     {
> +       /* Generate mask according to i.  */
> +       rtx mask =3D gen_reg_rtx (builder.mask_mode ());
> +       rtx const_vec =3D gen_const_vector_dup (builder.int_mode (), i);
> +       expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
> +
> +       /* Merge scalar to each i.  */
> +       rtx tmp2 =3D gen_reg_rtx (builder.mode ());
> +       rtx merge_ops[] =3D {tmp2, tmp, builder.elt (i), mask};
> +       insn_code icode =3D code_for_pred_merge_scalar (builder.mode ());
> +       emit_vlmax_merge_insn (icode, RVV_MERGE_OP, merge_ops);
> +       tmp =3D tmp2;
> +     }
> +   emit_move_insn (target, tmp);
> + }
> +      return;
> +    }
> +  else if (CONST_VECTOR_STEPPED_P (src))
> +    {
> +      gcc_assert (GET_MODE_CLASS (mode) =3D=3D MODE_VECTOR_INT);
> +      if (builder.single_step_npatterns_p ())
> + {
> +   /* Describe the case by choosing NPATTERNS =3D 4 as an example.  */
> +   rtx base, step;
> +   if (builder.npatterns_all_equal_p ())
> +     {
> +       /* Generate the variable-length vector as below:
> + E.g. { 0, 0, 0, 0, 8, 8, 8, 8, 16, 16, 16, 16, ... } */
> +       /* Step 1: Generate base =3D { 0, 0, 0, 0, 0, 0, 0, ... }.  */
> +       base =3D expand_vector_broadcast (builder.mode (), builder.elt (0=
));
> +     }
> +   else
> +     {
> +       /* Generate the variable-length vector as below:
> + E.g. { 0, 6, 0, 6, 8, 14, 8, 14, 16, 22, 16, 22, ... } */
> +       /* Step 1: Generate base =3D { 0, 6, 0, 6, ... }.  */
> +       rvv_builder new_builder (builder.mode (), builder.npatterns (),
> +        1);
> +       for (unsigned int i =3D 0; i < builder.npatterns (); ++i)
> + new_builder.quick_push (builder.elt (i));
> +       rtx new_vec =3D new_builder.build ();
> +       base =3D gen_reg_rtx (builder.mode ());
> +       emit_move_insn (base, new_vec);
> +     }
> +
> +   /* Step 2: Generate step =3D gen_int_mode (diff, mode).  */
> +   poly_int64 value1 =3D rtx_to_poly_int64 (builder.elt (0));
> +   poly_int64 value2
> +     =3D rtx_to_poly_int64 (builder.elt (builder.npatterns ()));
> +   poly_int64 diff =3D value2 - value1;
> +   step =3D gen_int_mode (diff, builder.inner_mode ());
> +
> +   /* Step 3: Generate vid =3D { 0, 1, 2, 3, 4, 5, 6, 7, ... }.  */
> +   rtx vid =3D gen_reg_rtx (builder.mode ());
> +   rtx op[] =3D {vid};
> +   emit_vlmax_insn (code_for_pred_series (builder.mode ()), RVV_MISC_OP,
> +    op);
> +
> +   /* Step 4: Generate factor =3D { 0, 0, 0, 0, 1, 1, 1, 1, ... }.  */
> +   rtx factor =3D gen_reg_rtx (builder.mode ());
> +   rtx shift_ops[]
> +     =3D {factor, vid,
> +        gen_int_mode (exact_log2 (builder.npatterns ()), Pmode)};
> +   emit_vlmax_insn (code_for_pred_scalar (LSHIFTRT, builder.mode ()),
> +    RVV_BINOP, shift_ops);
> +
> +   /* Step 5: Generate adjusted step =3D { 0, 0, 0, 0, diff, diff, ... }=
 */
> +   rtx adjusted_step =3D gen_reg_rtx (builder.mode ());
> +   rtx mul_ops[] =3D {adjusted_step, factor, step};
> +   emit_vlmax_insn (code_for_pred_scalar (MULT, builder.mode ()),
> +    RVV_BINOP, mul_ops);
> +
> +   /* Step 6: Generate the final result.  */
> +   rtx add_ops[] =3D {target, base, adjusted_step};
> +   emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()), RVV_BINOP,
> +    add_ops);
> + }
> +      else
> + /* TODO: We will enable more variable-length vectors in the future.  */
> + gcc_unreachable ();
> +    }
> +  else
> +    gcc_unreachable ();
> }
> /* Expand a pre-RA RVV data move from SRC to DEST.
> @@ -2029,14 +2210,13 @@ expand_vec_perm (rtx target, rtx op0, rtx op1, rt=
x sel)
> {
>    machine_mode data_mode =3D GET_MODE (target);
>    machine_mode sel_mode =3D GET_MODE (sel);
> -
> -  /* Enforced by the pattern condition.  */
> -  int nunits =3D GET_MODE_NUNITS (sel_mode).to_constant ();
> +  poly_uint64 nunits =3D GET_MODE_NUNITS (sel_mode);
>    /* Check if the sel only references the first values vector. If each s=
elect
>       index is in range of [0, nunits - 1]. A single vrgather instruction=
s is
> -     enough.  */
> -  if (const_vec_all_in_range_p (sel, 0, nunits - 1))
> +     enough. Since we will use vrgatherei16.vv for variable-length vecto=
r,
> +     it is never out of range and we don't need to modulo the index.  */
> +  if (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, nunits=
 - 1))
>      {
>        emit_vlmax_gather_insn (target, op0, sel);
>        return;
> @@ -2057,14 +2237,20 @@ expand_vec_perm (rtx target, rtx op0, rtx op1, rt=
x sel)
>        return;
>      }
> -  /* Note: vec_perm indices are supposed to wrap when they go beyond the
> -     size of the two value vectors, i.e. the upper bits of the indices
> -     are effectively ignored.  RVV vrgather instead produces 0 for any
> -     out-of-range indices, so we need to modulo all the vec_perm indices
> -     to ensure they are all in range of [0, 2 * nunits - 1].  */
> +  rtx sel_mod =3D sel;
>    rtx max_sel =3D gen_const_vector_dup (sel_mode, 2 * nunits - 1);
> -  rtx sel_mod
> -    =3D expand_simple_binop (sel_mode, AND, sel, max_sel, NULL, 0, OPTAB=
_DIRECT);
> +  /* We don't need to modulo indices for VLA vector.
> +     Since we should guarantee they aren't out of range before.  */
> +  if (nunits.is_constant ())
> +    {
> +      /* Note: vec_perm indices are supposed to wrap when they go beyond=
 the
> + size of the two value vectors, i.e. the upper bits of the indices
> + are effectively ignored.  RVV vrgather instead produces 0 for any
> + out-of-range indices, so we need to modulo all the vec_perm indices
> + to ensure they are all in range of [0, 2 * nunits - 1].  */
> +      sel_mod =3D expand_simple_binop (sel_mode, AND, sel, max_sel, NULL=
, 0,
> +      OPTAB_DIRECT);
> +    }
>    /* This following sequence is handling the case that:
>       __builtin_shufflevector (vec1, vec2, index...), the index can be an=
y
> @@ -2094,4 +2280,124 @@ expand_vec_perm (rtx target, rtx op0, rtx op1, rt=
x sel)
>    emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
> }
> +/* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV.  */
> +
> +/* vec_perm support.  */
> +
> +struct expand_vec_perm_d
> +{
> +  rtx target, op0, op1;
> +  vec_perm_indices perm;
> +  machine_mode vmode;
> +  machine_mode op_mode;
> +  bool one_vector_p;
> +  bool testing_p;
> +};
> +
> +/* Recognize the pattern that can be shuffled by generic approach.  */
> +
> +static bool
> +shuffle_generic_patterns (struct expand_vec_perm_d *d)
> +{
> +  machine_mode sel_mode =3D related_int_vector_mode (d->vmode).require (=
);
> +  poly_uint64 nunits =3D GET_MODE_NUNITS (d->vmode);
> +
> +  /* For constant size indices, we don't need to handle it here.
> +     Just leave it to vec_perm<mode>.  */
> +  if (d->perm.length ().is_constant ())
> +    return false;
> +
> +  /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
> +     Otherwise, it could overflow the index range.  */
> +  if (GET_MODE_INNER (d->vmode) =3D=3D QImode
> +      && !get_vector_mode (HImode, nunits).exists (&sel_mode))
> +    return false;
> +
> +  /* Success! */
> +  if (d->testing_p)
> +    return true;
> +
> +  rtx sel =3D vec_perm_indices_to_rtx (sel_mode, d->perm);
> +  expand_vec_perm (d->target, d->op0, d->op1, force_reg (sel_mode, sel))=
;
> +  return true;
> +}
> +
> +static bool
> +expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> +{
> +  gcc_assert (d->op_mode !=3D E_VOIDmode);
> +
> +  /* The pattern matching functions above are written to look for a smal=
l
> +     number to begin the sequence (0, 1, N/2).  If we begin with an inde=
x
> +     from the second operand, we can swap the operands.  */
> +  poly_int64 nelt =3D d->perm.length ();
> +  if (known_ge (d->perm[0], nelt))
> +    {
> +      d->perm.rotate_inputs (1);
> +      std::swap (d->op0, d->op1);
> +    }
> +
> +  if (known_gt (nelt, 1))
> +    {
> +      if (d->vmode =3D=3D d->op_mode)
> + {
> +   if (shuffle_generic_patterns (d))
> +     return true;
> +   return false;
> + }
> +      else
> + return false;
> +    }
> +  return false;
> +}
> +
> +bool
> +expand_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx tar=
get,
> +        rtx op0, rtx op1, const vec_perm_indices &sel)
> +{
> +  /* RVV doesn't have Mask type pack/unpack instructions and we don't us=
e
> +     mask to do the iteration loop control. Just disable it directly.  *=
/
> +  if (GET_MODE_CLASS (vmode) =3D=3D MODE_VECTOR_BOOL)
> +    return false;
> +
> +  struct expand_vec_perm_d d;
> +
> +  /* Check whether the mask can be applied to a single vector.  */
> +  if (sel.ninputs () =3D=3D 1 || (op0 && rtx_equal_p (op0, op1)))
> +    d.one_vector_p =3D true;
> +  else if (sel.all_from_input_p (0))
> +    {
> +      d.one_vector_p =3D true;
> +      op1 =3D op0;
> +    }
> +  else if (sel.all_from_input_p (1))
> +    {
> +      d.one_vector_p =3D true;
> +      op0 =3D op1;
> +    }
> +  else
> +    d.one_vector_p =3D false;
> +
> +  d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
> +      sel.nelts_per_input ());
> +  d.vmode =3D vmode;
> +  d.op_mode =3D op_mode;
> +  d.target =3D target;
> +  d.op0 =3D op0;
> +  if (op0 =3D=3D op1)
> +    d.op1 =3D d.op0;
> +  else
> +    d.op1 =3D op1;
> +  d.testing_p =3D !target;
> +
> +  if (!d.testing_p)
> +    return expand_vec_perm_const_1 (&d);
> +
> +  rtx_insn *last =3D get_last_insn ();
> +  bool ret =3D expand_vec_perm_const_1 (&d);
> +  gcc_assert (last =3D=3D get_last_insn ());
> +
> +  return ret;
> +}
> +
> } // namespace riscv_vector
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index caa7858b864..5d22012b591 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -7631,6 +7631,19 @@ riscv_vectorize_related_mode (machine_mode vector_=
mode, scalar_mode element_mode
>    return default_vectorize_related_mode (vector_mode, element_mode, nuni=
ts);
> }
> +/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
> +
> +static bool
> +riscv_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode=
,
> + rtx target, rtx op0, rtx op1,
> + const vec_perm_indices &sel)
> +{
> +  if (TARGET_VECTOR && riscv_v_ext_vector_mode_p (vmode))
> +    return riscv_vector::expand_vec_perm_const (vmode, op_mode, target, =
op0,
> + op1, sel);
> +
> +  return false;
> +}
> /* Initialize the GCC target structure.  */
> #undef TARGET_ASM_ALIGNED_HI_OP
> @@ -7930,6 +7943,9 @@ riscv_vectorize_related_mode (machine_mode vector_m=
ode, scalar_mode element_mode
> #undef TARGET_VECTORIZE_RELATED_MODE
> #define TARGET_VECTORIZE_RELATED_MODE riscv_vectorize_related_mode
> +#undef TARGET_VECTORIZE_VEC_PERM_CONST
> +#define TARGET_VECTORIZE_VEC_PERM_CONST riscv_vectorize_vec_perm_const
> +
> struct gcc_target targetm =3D TARGET_INITIALIZER;
> #include "gt-riscv.h"
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c b=
/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c
> new file mode 100644
> index 00000000000..befb518e2dd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=3Drv32gcv -mabi=3Dilp32d --param risc=
v-autovec-preference=3Dscalable -fdump-tree-optimized-details" } */
> +
> +#include <stdint-gcc.h>
> +
> +void __attribute__ ((noipa))
> +f (int8_t *restrict a, int8_t *restrict b, int n)
> +{
> +  for (int i =3D 0; i < n; ++i)
> +    {
> +      a[i * 8 + 0] =3D b[i * 8 + 37] + 1;
> +      a[i * 8 + 1] =3D b[i * 8 + 37] + 2;
> +      a[i * 8 + 2] =3D b[i * 8 + 37] + 8;
> +      a[i * 8 + 3] =3D b[i * 8 + 37] + 4;
> +      a[i * 8 + 4] =3D b[i * 8 + 37] + 5;
> +      a[i * 8 + 5] =3D b[i * 8 + 37] + 6;
> +      a[i * 8 + 6] =3D b[i * 8 + 37] + 7;
> +      a[i * 8 + 7] =3D b[i * 8 + 37] + 3;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-2.c b=
/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-2.c
> new file mode 100644
> index 00000000000..ac817451295
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-2.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=3Drv32gcv -mabi=3Dilp32d --param risc=
v-autovec-preference=3Dscalable -fdump-tree-optimized-details" } */
> +
> +#include <stdint-gcc.h>
> +
> +void __attribute__ ((noipa))
> +f (int16_t *restrict a, int16_t *restrict b, int n)
> +{
> +  for (int i =3D 0; i < n; ++i)
> +    {
> +      a[i * 8 + 0] =3D b[i * 8 + 37] + 1;
> +      a[i * 8 + 1] =3D b[i * 8 + 37] + 2;
> +      a[i * 8 + 2] =3D b[i * 8 + 37] + 8;
> +      a[i * 8 + 3] =3D b[i * 8 + 37] + 4;
> +      a[i * 8 + 4] =3D b[i * 8 + 37] + 5;
> +      a[i * 8 + 5] =3D b[i * 8 + 37] + 6;
> +      a[i * 8 + 6] =3D b[i * 8 + 37] + 7;
> +      a[i * 8 + 7] =3D b[i * 8 + 37] + 3;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c b=
/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c
> new file mode 100644
> index 00000000000..73962055b03
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=3Drv32gcv -mabi=3Dilp32d --param risc=
v-autovec-preference=3Dscalable -fdump-tree-optimized-details" } */
> +
> +#include <stdint-gcc.h>
> +
> +void __attribute__ ((noipa))
> +f (int8_t *restrict a, int8_t *restrict b, int n)
> +{
> +  for (int i =3D 0; i < n; ++i)
> +    {
> +      a[i * 8] =3D b[i * 8 + 1] + 1;
> +      a[i * 8 + 1] =3D b[i * 8 + 7] + 2;
> +      a[i * 8 + 2] =3D b[i * 8 + 1] + 3;
> +      a[i * 8 + 3] =3D b[i * 8 + 7] + 4;
> +      a[i * 8 + 4] =3D b[i * 8 + 1] + 5;
> +      a[i * 8 + 5] =3D b[i * 8 + 7] + 6;
> +      a[i * 8 + 6] =3D b[i * 8 + 1] + 7;
> +      a[i * 8 + 7] =3D b[i * 8 + 7] + 8;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-4.c b=
/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-4.c
> new file mode 100644
> index 00000000000..fa216fc8c40
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-4.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=3Drv32gcv -mabi=3Dilp32d --param risc=
v-autovec-preference=3Dscalable -fdump-tree-optimized-details" } */
> +
> +#include <stdint-gcc.h>
> +
> +void __attribute__ ((noipa))
> +f (int16_t *restrict a, int16_t *restrict b, int n)
> +{
> +  for (int i =3D 0; i < n; ++i)
> +    {
> +      a[i * 8 + 0] =3D b[i * 8 + 1] + 1;
> +      a[i * 8 + 1] =3D b[i * 8 + 7] + 2;
> +      a[i * 8 + 2] =3D b[i * 8 + 1] + 3;
> +      a[i * 8 + 3] =3D b[i * 8 + 7] + 4;
> +      a[i * 8 + 4] =3D b[i * 8 + 1] + 5;
> +      a[i * 8 + 5] =3D b[i * 8 + 7] + 6;
> +      a[i * 8 + 6] =3D b[i * 8 + 1] + 7;
> +      a[i * 8 + 7] =3D b[i * 8 + 7] + 8;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c b=
/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
> new file mode 100644
> index 00000000000..899ed9e310b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=3Drv32gcv -mabi=3Dilp32d --param risc=
v-autovec-preference=3Dscalable -fdump-tree-optimized-details" } */
> +
> +#include <stdint-gcc.h>
> +
> +void __attribute__ ((noipa))
> +f (int8_t *restrict a, int8_t *restrict b, int n)
> +{
> +  for (int i =3D 0; i < n; ++i)
> +    {
> +      a[i * 8 + 0] =3D b[i * 8 + 1] + 1;
> +      a[i * 8 + 1] =3D b[i * 8 + 7] + 2;
> +      a[i * 8 + 2] =3D b[i * 8 + 4] + 3;
> +      a[i * 8 + 3] =3D b[i * 8 + 8] + 4;
> +      a[i * 8 + 4] =3D b[i * 8 + 1] + 5;
> +      a[i * 8 + 5] =3D b[i * 8 + 7] + 6;
> +      a[i * 8 + 6] =3D b[i * 8 + 4] + 7;
> +      a[i * 8 + 7] =3D b[i * 8 + 8] + 8;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-6.c b=
/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-6.c
> new file mode 100644
> index 00000000000..fb87cc00cea
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-6.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=3Drv32gcv -mabi=3Dilp32d --param risc=
v-autovec-preference=3Dscalable -fdump-tree-optimized-details" } */
> +
> +#include <stdint-gcc.h>
> +
> +void __attribute__ ((noipa))
> +f (uint8_t *restrict a, uint8_t *restrict b, int n)
> +{
> +  for (int i =3D 0; i < n; ++i)
> +    {
> +      a[i * 8 + 0] =3D b[i * 8 + 1] + 1;
> +      a[i * 8 + 1] =3D b[i * 8 + 2] + 2;
> +      a[i * 8 + 2] =3D b[i * 8 + 6] + 8;
> +      a[i * 8 + 3] =3D b[i * 8 + 7] + 4;
> +      a[i * 8 + 4] =3D b[i * 8 + 3] + 5;
> +      a[i * 8 + 5] =3D b[i * 8 + 4] + 6;
> +      a[i * 8 + 6] =3D b[i * 8 + 5] + 7;
> +      a[i * 8 + 7] =3D b[i * 8 + 0] + 3;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-7.c b=
/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-7.c
> new file mode 100644
> index 00000000000..3dd744b586e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-7.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=3Drv32gcv -mabi=3Dilp32d --param risc=
v-autovec-preference=3Dscalable -fdump-tree-optimized-details" } */
> +
> +#include <stdint-gcc.h>
> +
> +void __attribute__ ((noipa))
> +f (float *__restrict f, double *__restrict d, int n)
> +{
> +  for (int i =3D 0; i < n; ++i)
> +    {
> +      f[i * 2 + 0] =3D 1;
> +      f[i * 2 + 1] =3D 2;
> +      d[i] =3D 3;
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-1=
.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-1.c
> new file mode 100644
> index 00000000000..16f078a0433
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-1.c
> @@ -0,0 +1,66 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param riscv-autovec-preference=3Dscalable"=
 } */
> +
> +#include "slp-1.c"
> +
> +#define LIMIT 128
> +void __attribute__ ((optimize (0)))
> +f_golden (int8_t *restrict a, int8_t *restrict b, int n)
> +{
> +  for (int i =3D 0; i < n; ++i)
> +    {
> +      a[i * 8 + 0] =3D b[i * 8 + 37] + 1;
> +      a[i * 8 + 1] =3D b[i * 8 + 37] + 2;
> +      a[i * 8 + 2] =3D b[i * 8 + 37] + 8;
> +      a[i * 8 + 3] =3D b[i * 8 + 37] + 4;
> +      a[i * 8 + 4] =3D b[i * 8 + 37] + 5;
> +      a[i * 8 + 5] =3D b[i * 8 + 37] + 6;
> +      a[i * 8 + 6] =3D b[i * 8 + 37] + 7;
> +      a[i * 8 + 7] =3D b[i * 8 + 37] + 3;
> +    }
> +}
> +
> +int
> +main (void)
> +{
> +#define RUN(NUM)                                                        =
       \
> +  int8_t a_##NUM[NUM * 8 + 8] =3D {0};                                  =
         \
> +  int8_t a_golden_##NUM[NUM * 8 + 8] =3D {0};                           =
         \
> +  int8_t b_##NUM[NUM * 8 + 37] =3D {0};                                 =
         \
> +  for (int i =3D 0; i < NUM * 8 + 37; i++)                              =
         \
> +    {                                                                   =
       \
> +      if (i % NUM =3D=3D 0)                                             =
           \
> + b_##NUM[i] =3D (i + NUM) % LIMIT;                                      =
  \
> +      else                                                              =
       \
> + b_##NUM[i] =3D (i - NUM) % (-LIMIT);                                   =
  \
> +    }                                                                   =
       \
> +  f (a_##NUM, b_##NUM, NUM);                                            =
       \
> +  f_golden (a_golden_##NUM, b_##NUM, NUM);                              =
       \
> +  for (int i =3D 0; i < NUM * 8 + 8; i++)                               =
         \
> +    {                                                                   =
       \
> +      if (a_##NUM[i] !=3D a_golden_##NUM[i])                            =
         \
> + __builtin_abort ();                                                    =
\
> +    }
> +
> +  RUN (3);
> +  RUN (5);
> +  RUN (15);
> +  RUN (16);
> +  RUN (17);
> +  RUN (31);
> +  RUN (32);
> +  RUN (33);
> +  RUN (63);
> +  RUN (64);
> +  RUN (65);
> +  RUN (127);
> +  RUN (128);
> +  RUN (129);
> +  RUN (239);
> +  RUN (359);
> +  RUN (498);
> +  RUN (799);
> +  RUN (977);
> +  RUN (5789);
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-2=
.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-2.c
> new file mode 100644
> index 00000000000..41f688f628c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-2.c
> @@ -0,0 +1,67 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param riscv-autovec-preference=3Dscalable"=
 } */
> +
> +#include "slp-2.c"
> +
> +#define LIMIT 32767
> +
> +void __attribute__ ((optimize (0)))
> +f_golden (int16_t *restrict a, int16_t *restrict b, int n)
> +{
> +  for (int i =3D 0; i < n; ++i)
> +    {
> +      a[i * 8 + 0] =3D b[i * 8 + 37] + 1;
> +      a[i * 8 + 1] =3D b[i * 8 + 37] + 2;
> +      a[i * 8 + 2] =3D b[i * 8 + 37] + 8;
> +      a[i * 8 + 3] =3D b[i * 8 + 37] + 4;
> +      a[i * 8 + 4] =3D b[i * 8 + 37] + 5;
> +      a[i * 8 + 5] =3D b[i * 8 + 37] + 6;
> +      a[i * 8 + 6] =3D b[i * 8 + 37] + 7;
> +      a[i * 8 + 7] =3D b[i * 8 + 37] + 3;
> +    }
> +}
> +
> +int
> +main (void)
> +{
> +#define RUN(NUM)                                                        =
       \
> +  int16_t a_##NUM[NUM * 8 + 8] =3D {0};                                 =
         \
> +  int16_t a_golden_##NUM[NUM * 8 + 8] =3D {0};                          =
         \
> +  int16_t b_##NUM[NUM * 8 + 37] =3D {0};                                =
         \
> +  for (int i =3D 0; i < NUM * 8 + 37; i++)                              =
         \
> +    {                                                                   =
       \
> +      if (i % NUM =3D=3D 0)                                             =
           \
> + b_##NUM[i] =3D (i + NUM) % LIMIT;                                      =
  \
> +      else                                                              =
       \
> + b_##NUM[i] =3D (i - NUM) % (-LIMIT);                                   =
  \
> +    }                                                                   =
       \
> +  f (a_##NUM, b_##NUM, NUM);                                            =
       \
> +  f_golden (a_golden_##NUM, b_##NUM, NUM);                              =
       \
> +  for (int i =3D 0; i < NUM * 8 + 8; i++)                               =
         \
> +    {                                                                   =
       \
> +      if (a_##NUM[i] !=3D a_golden_##NUM[i])                            =
         \
> + __builtin_abort ();                                                    =
\
> +    }
> +
> +  RUN (3);
> +  RUN (5);
> +  RUN (15);
> +  RUN (16);
> +  RUN (17);
> +  RUN (31);
> +  RUN (32);
> +  RUN (33);
> +  RUN (63);
> +  RUN (64);
> +  RUN (65);
> +  RUN (127);
> +  RUN (128);
> +  RUN (129);
> +  RUN (239);
> +  RUN (359);
> +  RUN (498);
> +  RUN (799);
> +  RUN (977);
> +  RUN (5789);
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-3=
.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-3.c
> new file mode 100644
> index 00000000000..30996cb2c6e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-3.c
> @@ -0,0 +1,67 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param riscv-autovec-preference=3Dscalable"=
 } */
> +
> +#include "slp-3.c"
> +
> +#define LIMIT 128
> +
> +void __attribute__ ((optimize (0)))
> +f_golden (int8_t *restrict a, int8_t *restrict b, int n)
> +{
> +  for (int i =3D 0; i < n; ++i)
> +    {
> +      a[i * 8] =3D b[i * 8 + 1] + 1;
> +      a[i * 8 + 1] =3D b[i * 8 + 7] + 2;
> +      a[i * 8 + 2] =3D b[i * 8 + 1] + 3;
> +      a[i * 8 + 3] =3D b[i * 8 + 7] + 4;
> +      a[i * 8 + 4] =3D b[i * 8 + 1] + 5;
> +      a[i * 8 + 5] =3D b[i * 8 + 7] + 6;
> +      a[i * 8 + 6] =3D b[i * 8 + 1] + 7;
> +      a[i * 8 + 7] =3D b[i * 8 + 7] + 8;
> +    }
> +}
> +
> +int
> +main (void)
> +{
> +#define RUN(NUM)                                                        =
       \
> +  int8_t a_##NUM[NUM * 8 + 8] =3D {0};                                  =
         \
> +  int8_t a_golden_##NUM[NUM * 8 + 8] =3D {0};                           =
         \
> +  int8_t b_##NUM[NUM * 8 + 8] =3D {0};                                  =
         \
> +  for (int i =3D 0; i < NUM * 8 + 8; i++)                               =
         \
> +    {                                                                   =
       \
> +      if (i % NUM =3D=3D 0)                                             =
           \
> + b_##NUM[i] =3D (i + NUM) % LIMIT;                                      =
  \
> +      else                                                              =
       \
> + b_##NUM[i] =3D (i - NUM) % (-LIMIT);                                   =
  \
> +    }                                                                   =
       \
> +  f (a_##NUM, b_##NUM, NUM);                                            =
       \
> +  f_golden (a_golden_##NUM, b_##NUM, NUM);                              =
       \
> +  for (int i =3D 0; i < NUM * 8 + 8; i++)                               =
         \
> +    {                                                                   =
       \
> +      if (a_##NUM[i] !=3D a_golden_##NUM[i])                            =
         \
> + __builtin_abort ();                                                    =
\
> +    }
> +
> +  RUN (3);
> +  RUN (5);
> +  RUN (15);
> +  RUN (16);
> +  RUN (17);
> +  RUN (31);
> +  RUN (32);
> +  RUN (33);
> +  RUN (63);
> +  RUN (64);
> +  RUN (65);
> +  RUN (127);
> +  RUN (128);
> +  RUN (129);
> +  RUN (239);
> +  RUN (359);
> +  RUN (498);
> +  RUN (799);
> +  RUN (977);
> +  RUN (5789);
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-4=
.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-4.c
> new file mode 100644
> index 00000000000..3d43ef0890c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-4.c
> @@ -0,0 +1,67 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param riscv-autovec-preference=3Dscalable"=
 } */
> +
> +#include "slp-4.c"
> +
> +#define LIMIT 32767
> +
> +void __attribute__ ((optimize (0)))
> +f_golden (int16_t *restrict a, int16_t *restrict b, int n)
> +{
> +  for (int i =3D 0; i < n; ++i)
> +    {
> +      a[i * 8] =3D b[i * 8 + 1] + 1;
> +      a[i * 8 + 1] =3D b[i * 8 + 7] + 2;
> +      a[i * 8 + 2] =3D b[i * 8 + 1] + 3;
> +      a[i * 8 + 3] =3D b[i * 8 + 7] + 4;
> +      a[i * 8 + 4] =3D b[i * 8 + 1] + 5;
> +      a[i * 8 + 5] =3D b[i * 8 + 7] + 6;
> +      a[i * 8 + 6] =3D b[i * 8 + 1] + 7;
> +      a[i * 8 + 7] =3D b[i * 8 + 7] + 8;
> +    }
> +}
> +
> +int
> +main (void)
> +{
> +#define RUN(NUM)                                                        =
       \
> +  int16_t a_##NUM[NUM * 8 + 8] =3D {0};                                 =
         \
> +  int16_t a_golden_##NUM[NUM * 8 + 8] =3D {0};                          =
         \
> +  int16_t b_##NUM[NUM * 8 + 8] =3D {0};                                 =
         \
> +  for (int i =3D 0; i < NUM * 8 + 8; i++)                               =
         \
> +    {                                                                   =
       \
> +      if (i % NUM =3D=3D 0)                                             =
           \
> + b_##NUM[i] =3D (i + NUM) % LIMIT;                                      =
  \
> +      else                                                              =
       \
> + b_##NUM[i] =3D (i - NUM) % (-LIMIT);                                   =
  \
> +    }                                                                   =
       \
> +  f (a_##NUM, b_##NUM, NUM);                                            =
       \
> +  f_golden (a_golden_##NUM, b_##NUM, NUM);                              =
       \
> +  for (int i =3D 0; i < NUM * 8 + 8; i++)                               =
         \
> +    {                                                                   =
       \
> +      if (a_##NUM[i] !=3D a_golden_##NUM[i])                            =
         \
> + __builtin_abort ();                                                    =
\
> +    }
> +
> +  RUN (3);
> +  RUN (5);
> +  RUN (15);
> +  RUN (16);
> +  RUN (17);
> +  RUN (31);
> +  RUN (32);
> +  RUN (33);
> +  RUN (63);
> +  RUN (64);
> +  RUN (65);
> +  RUN (127);
> +  RUN (128);
> +  RUN (129);
> +  RUN (239);
> +  RUN (359);
> +  RUN (498);
> +  RUN (799);
> +  RUN (977);
> +  RUN (5789);
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-5=
.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-5.c
> new file mode 100644
> index 00000000000..814308bd7af
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-5.c
> @@ -0,0 +1,67 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param riscv-autovec-preference=3Dscalable"=
 } */
> +
> +#include "slp-5.c"
> +
> +#define LIMIT 128
> +
> +void __attribute__ ((optimize (0)))
> +f_golden (int8_t *restrict a, int8_t *restrict b, int n)
> +{
> +  for (int i =3D 0; i < n; ++i)
> +    {
> +      a[i * 8 + 0] =3D b[i * 8 + 1] + 1;
> +      a[i * 8 + 1] =3D b[i * 8 + 7] + 2;
> +      a[i * 8 + 2] =3D b[i * 8 + 4] + 3;
> +      a[i * 8 + 3] =3D b[i * 8 + 8] + 4;
> +      a[i * 8 + 4] =3D b[i * 8 + 1] + 5;
> +      a[i * 8 + 5] =3D b[i * 8 + 7] + 6;
> +      a[i * 8 + 6] =3D b[i * 8 + 4] + 7;
> +      a[i * 8 + 7] =3D b[i * 8 + 8] + 8;
> +    }
> +}
> +
> +int
> +main (void)
> +{
> +#define RUN(NUM)                                                        =
       \
> +  int8_t a_##NUM[NUM * 8 + 8] =3D {0};                                  =
         \
> +  int8_t a_golden_##NUM[NUM * 8 + 8] =3D {0};                           =
         \
> +  int8_t b_##NUM[NUM * 8 + 9] =3D {0};                                  =
         \
> +  for (int i =3D 0; i < NUM * 8 + 9; i++)                               =
         \
> +    {                                                                   =
       \
> +      if (i % NUM =3D=3D 0)                                             =
           \
> + b_##NUM[i] =3D (i + NUM) % LIMIT;                                      =
  \
> +      else                                                              =
       \
> + b_##NUM[i] =3D (i - NUM) % (-LIMIT);                                   =
  \
> +    }                                                                   =
       \
> +  f (a_##NUM, b_##NUM, NUM);                                            =
       \
> +  f_golden (a_golden_##NUM, b_##NUM, NUM);                              =
       \
> +  for (int i =3D 0; i < NUM * 8 + 8; i++)                               =
         \
> +    {                                                                   =
       \
> +      if (a_##NUM[i] !=3D a_golden_##NUM[i])                            =
         \
> + __builtin_abort ();                                                    =
\
> +    }
> +
> +  RUN (3);
> +  RUN (5);
> +  RUN (15);
> +  RUN (16);
> +  RUN (17);
> +  RUN (31);
> +  RUN (32);
> +  RUN (33);
> +  RUN (63);
> +  RUN (64);
> +  RUN (65);
> +  RUN (127);
> +  RUN (128);
> +  RUN (129);
> +  RUN (239);
> +  RUN (359);
> +  RUN (498);
> +  RUN (799);
> +  RUN (977);
> +  RUN (5789);
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-6=
.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-6.c
> new file mode 100644
> index 00000000000..e317eeac2f2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-6.c
> @@ -0,0 +1,67 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param riscv-autovec-preference=3Dscalable"=
 } */
> +
> +#include "slp-6.c"
> +
> +#define LIMIT 128
> +
> +void __attribute__ ((optimize (0)))
> +f_golden (int8_t *restrict a, int8_t *restrict b, int n)
> +{
> +  for (int i =3D 0; i < n; ++i)
> +    {
> +      a[i * 8 + 0] =3D b[i * 8 + 1] + 1;
> +      a[i * 8 + 1] =3D b[i * 8 + 2] + 2;
> +      a[i * 8 + 2] =3D b[i * 8 + 6] + 8;
> +      a[i * 8 + 3] =3D b[i * 8 + 7] + 4;
> +      a[i * 8 + 4] =3D b[i * 8 + 3] + 5;
> +      a[i * 8 + 5] =3D b[i * 8 + 4] + 6;
> +      a[i * 8 + 6] =3D b[i * 8 + 5] + 7;
> +      a[i * 8 + 7] =3D b[i * 8 + 0] + 3;
> +    }
> +}
> +
> +int
> +main (void)
> +{
> +#define RUN(NUM)                                                        =
       \
> +  int8_t a_##NUM[NUM * 8 + 8] =3D {0};                                  =
         \
> +  int8_t a_golden_##NUM[NUM * 8 + 8] =3D {0};                           =
         \
> +  int8_t b_##NUM[NUM * 8 + 9] =3D {0};                                  =
         \
> +  for (int i =3D 0; i < NUM * 8 + 9; i++)                               =
         \
> +    {                                                                   =
       \
> +      if (i % NUM =3D=3D 0)                                             =
           \
> + b_##NUM[i] =3D (i + NUM) % LIMIT;                                      =
  \
> +      else                                                              =
       \
> + b_##NUM[i] =3D (i - NUM) % (-LIMIT);                                   =
  \
> +    }                                                                   =
       \
> +  f (a_##NUM, b_##NUM, NUM);                                            =
       \
> +  f_golden (a_golden_##NUM, b_##NUM, NUM);                              =
       \
> +  for (int i =3D 0; i < NUM * 8 + 8; i++)                               =
         \
> +    {                                                                   =
       \
> +      if (a_##NUM[i] !=3D a_golden_##NUM[i])                            =
         \
> + __builtin_abort ();                                                    =
\
> +    }
> +
> +  RUN (3);
> +  RUN (5);
> +  RUN (15);
> +  RUN (16);
> +  RUN (17);
> +  RUN (31);
> +  RUN (32);
> +  RUN (33);
> +  RUN (63);
> +  RUN (64);
> +  RUN (65);
> +  RUN (127);
> +  RUN (128);
> +  RUN (129);
> +  RUN (239);
> +  RUN (359);
> +  RUN (498);
> +  RUN (799);
> +  RUN (977);
> +  RUN (5789);
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-7=
.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-7.c
> new file mode 100644
> index 00000000000..a8e4781988e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-7.c
> @@ -0,0 +1,58 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param riscv-autovec-preference=3Dscalable"=
 } */
> +
> +#include "slp-7.c"
> +
> +void
> +f_golden (float *__restrict f, double *__restrict d, int n)
> +{
> +  for (int i =3D 0; i < n; ++i)
> +    {
> +      f[i * 2 + 0] =3D 1;
> +      f[i * 2 + 1] =3D 2;
> +      d[i] =3D 3;
> +    }
> +}
> +
> +int
> +main (void)
> +{
> +#define RUN(NUM)                                                        =
       \
> +  float a_##NUM[NUM * 2 + 2] =3D {0};                                   =
         \
> +  float a_golden_##NUM[NUM * 2 + 2] =3D {0};                            =
         \
> +  double b_##NUM[NUM] =3D {0};                                          =
         \
> +  double b_golden_##NUM[NUM] =3D {0};                                   =
         \
> +  f (a_##NUM, b_##NUM, NUM);                                            =
       \
> +  f_golden (a_golden_##NUM, b_golden_##NUM, NUM);                       =
       \
> +  for (int i =3D 0; i < NUM; i++)                                       =
         \
> +    {                                                                   =
       \
> +      if (a_##NUM[i * 2 + 0] !=3D a_golden_##NUM[i * 2 + 0])            =
         \
> + __builtin_abort ();                                                    =
\
> +      if (a_##NUM[i * 2 + 1] !=3D a_golden_##NUM[i * 2 + 1])            =
         \
> + __builtin_abort ();                                                    =
\
> +      if (b_##NUM[i] !=3D b_golden_##NUM[i])                            =
         \
> + __builtin_abort ();                                                    =
\
> +    }
> +
> +  RUN (3);
> +  RUN (5);
> +  RUN (15);
> +  RUN (16);
> +  RUN (17);
> +  RUN (31);
> +  RUN (32);
> +  RUN (33);
> +  RUN (63);
> +  RUN (64);
> +  RUN (65);
> +  RUN (127);
> +  RUN (128);
> +  RUN (129);
> +  RUN (239);
> +  RUN (359);
> +  RUN (498);
> +  RUN (799);
> +  RUN (977);
> +  RUN (5789);
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/scalable-1.c b/gc=
c/testsuite/gcc.target/riscv/rvv/autovec/scalable-1.c
> index 500b0adce66..3c03a87377d 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/scalable-1.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/scalable-1.c
> @@ -14,4 +14,4 @@ f (int32_t *__restrict f, int32_t *__restrict d, int n)
>      }
> }
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 0 =
"vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 =
"vect" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/v-1.c b/gcc/tests=
uite/gcc.target/riscv/rvv/autovec/v-1.c
> index 383c82a3b7c..e68d05f5f48 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/v-1.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/v-1.c
> @@ -3,9 +3,4 @@
> #include "template-1.h"
> -/* Currently, we don't support SLP auto-vectorization for VLA. But it's
> -   necessary that we add this testcase here to make sure such unsupporte=
d SLP
> -   auto-vectorization will not cause an ICE. We will enable "vect" check=
ing when
> -   we support SLP auto-vectorization for VLA in the future.  */
> -
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 0 =
"vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 =
"vect" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve32f_zvl128b-1.=
c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve32f_zvl128b-1.c
> index 23cc1c8651f..ecfda79e19a 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve32f_zvl128b-1.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve32f_zvl128b-1.c
> @@ -3,4 +3,4 @@
> #include "template-1.h"
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 0 =
"vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3 =
"vect" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve32x_zvl128b-1.=
c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve32x_zvl128b-1.c
> index 4f130f02f67..1394f08f2b9 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve32x_zvl128b-1.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve32x_zvl128b-1.c
> @@ -3,4 +3,4 @@
> #include "template-1.h"
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 0 =
"vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 2 =
"vect" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64d-1.c b/gcc/=
testsuite/gcc.target/riscv/rvv/autovec/zve64d-1.c
> index 823d51a03cb..c5e89996fa4 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64d-1.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64d-1.c
> @@ -3,4 +3,4 @@
> #include "template-1.h"
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 2 =
"vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3 =
"vect" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64d_zvl128b-1.=
c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64d_zvl128b-1.c
> index 5ead22746d3..6b320ca6f38 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64d_zvl128b-1.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64d_zvl128b-1.c
> @@ -3,4 +3,4 @@
> #include "template-1.h"
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 0 =
"vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 =
"vect" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64f-1.c b/gcc/=
testsuite/gcc.target/riscv/rvv/autovec/zve64f-1.c
> index e03d1b44ca6..6c2a002de9c 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64f-1.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64f-1.c
> @@ -3,4 +3,4 @@
> #include "template-1.h"
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 2 =
"vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3 =
"vect" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64f_zvl128b-1.=
c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64f_zvl128b-1.c
> index 5bb2d9d96fa..ae3f066477c 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64f_zvl128b-1.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64f_zvl128b-1.c
> @@ -3,4 +3,4 @@
> #include "template-1.h"
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 0 =
"vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 =
"vect" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64x_zvl128b-1.=
c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64x_zvl128b-1.c
> index 71820ece4b2..fc676a3865e 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64x_zvl128b-1.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve64x_zvl128b-1.c
> @@ -3,4 +3,4 @@
> #include "template-1.h"
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 0 =
"vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3 =
"vect" } } */
> --
> 2.36.1
>