public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] RISC-V: Add VLS modes for GNU vectors
@ 2023-06-18 23:06 Juzhe-Zhong
  2023-06-19 18:25 ` Jeff Law
  0 siblings, 1 reply; 4+ messages in thread
From: Juzhe-Zhong @ 2023-06-18 23:06 UTC (permalink / raw)
  To: gcc-patches; +Cc: kito.cheng, palmer, rdapp.gcc, jeffreyalaw, Juzhe-Zhong

This patch is a proposal and is **NOT** ready to push, since
after this patch the total number of machine modes will exceed 255, which will
cause an ICE in LTO:
  internal compiler error: in bp_pack_int_in_range, at data-streamer.h:290
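
(For context, the assert that fires is the range check in GCC's bit-packing
streamer.  A paraphrased sketch of the failing check, not the exact
data-streamer.h source:

  /* Paraphrase of bp_pack_int_in_range: VAL must fit the declared range.
     The mode table is streamed with an upper bound of 255 today, so
     defining modes beyond that limit trips this assert during LTO.  */
  gcc_checking_assert (val >= min && val <= max);

so the LTO streaming limit has to be widened before these VLS modes can land.)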

We need to add VLS modes for the following reasons:
1. Enhance GNU vectors codegen:
   For example:
     typedef int32_t vnx8si __attribute__ ((vector_size (32)));

     __attribute__ ((noipa)) void
     f_vnx8si (int32_t * in, int32_t * out)
     {
       vnx8si v = *(vnx8si*)in;
       *(vnx8si *) out = v;
     } 

compile option: --param=riscv-autovec-preference=scalable
    before this patch:
    f_vnx8si:
        ld      a2,0(a0)
        ld      a3,8(a0)
        ld      a4,16(a0)
        ld      a5,24(a0)
        addi    sp,sp,-32
        sd      a2,0(a1)
        sd      a3,8(a1)
        sd      a4,16(a1)
        sd      a5,24(a1)
        addi    sp,sp,32
        jr      ra

After this patch:
    f_vnx8si:
        vsetivli        zero,8,e32,m2,ta,ma
        vle32.v v2,0(a0)
        vse32.v v2,0(a1)
        ret
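
(For reference, the before/after output above corresponds to a command line of
this shape.  The cross-compiler name and -march string here are assumptions;
only the --param option is taken from above:

  riscv64-unknown-elf-gcc -O3 -march=rv64gcv -mabi=lp64d \
      --param=riscv-autovec-preference=scalable -S vnx8si.c)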

2. Enhance VLA SLP:
void
f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c)
{
  for (int i = 0; i < 100; ++i)
    {
      a[i * 8] = b[i * 8] + c[i * 8];
      a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1];
      a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2];
      a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3];
      a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4];
      a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5];
      a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6];
      a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7];
    }
}


...
Loop body:
 ...
 vrgatherei16.vv...
 ...

Tail:
        lbu     a4,792(a1)
        lbu     a5,792(a2)
        addw    a5,a5,a4
        sb      a5,792(a0)
        lbu     a5,793(a2)
        addw    a5,a5,a4
        sb      a5,793(a0)
        lbu     a4,794(a1)
        lbu     a5,794(a2)
        addw    a5,a5,a4
        sb      a5,794(a0)
        lbu     a5,795(a2)
        addw    a5,a5,a4
        sb      a5,795(a0)
        lbu     a4,796(a1)
        lbu     a5,796(a2)
        addw    a5,a5,a4
        sb      a5,796(a0)
        lbu     a5,797(a2)
        addw    a5,a5,a4
        sb      a5,797(a0)
        lbu     a4,798(a1)
        lbu     a5,798(a2)
        addw    a5,a5,a4
        sb      a5,798(a0)
        lbu     a5,799(a2)
        addw    a5,a5,a4
        sb      a5,799(a0)
        ret

The tail elements need VLS modes to be vectorized, as ARM SVE does:

f:
        mov     x3, 0
        cntb    x5
        mov     x4, 792
        whilelo p7.b, xzr, x4
.L2:
        ld1b    z31.b, p7/z, [x1, x3]
        ld1b    z30.b, p7/z, [x2, x3]
        trn1    z31.b, z31.b, z31.b
        add     z31.b, z31.b, z30.b
        st1b    z31.b, p7, [x0, x3]
        add     x3, x3, x5
        whilelo p7.b, x3, x4
        b.any   .L2
Tail:
        ldr     b31, [x1, 792]
        ldr     b27, [x1, 794]
        ldr     b28, [x1, 796]
        dup     v31.8b, v31.b[0]
        ldr     b29, [x1, 798]
        ldr     d30, [x2, 792]
        ins     v31.b[2], v27.b[0]
        ins     v31.b[3], v27.b[0]
        ins     v31.b[4], v28.b[0]
        ins     v31.b[5], v28.b[0]
        ins     v31.b[6], v29.b[0]
        ins     v31.b[7], v29.b[0]
        add     v31.8b, v30.8b, v31.8b
        str     d31, [x0, 792]
        ret

Notice that ARM SVE uses Advanced SIMD (Neon) modes to vectorize the tail.
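
(On RISC-V, covering a tail like this is what the small fixed-length modes in
this patch are for, e.g. V8QI for an 8-byte group.  A minimal GNU-vector
testcase exercising that mode, analogous to the vnx8si example above, would be:

  typedef uint8_t v8qi __attribute__ ((vector_size (8)));

  __attribute__ ((noipa)) void
  f_v8qi (uint8_t *in, uint8_t *out)
  {
    v8qi v = *(v8qi *) in;
    *(v8qi *) out = v;
  }

where the hope is a single vle8.v/vse8.v pair under a fixed vl of 8 rather
than scalar loads and stores.)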

gcc/ChangeLog:

        * config/riscv/riscv-modes.def (VECTOR_BOOL_MODE): Add VLS modes for GNU vectors.
        (ADJUST_ALIGNMENT): Ditto.
        (ADJUST_BYTESIZE): Ditto.
        (ADJUST_PRECISION): Ditto.
        (VECTOR_MODES): Ditto.
        * config/riscv/riscv-protos.h (riscv_v_ext_vls_mode_p): Ditto.
        (get_regno_alignment): Ditto.
        * config/riscv/riscv-v.cc (INCLUDE_ALGORITHM): Ditto.
        (const_vlmax_p): Ditto.
        (legitimize_move): Ditto.
        (get_vlmul): Ditto.
        (get_regno_alignment): Ditto.
        (get_ratio): Ditto.
        (get_vector_mode): Ditto.
        * config/riscv/riscv-vector-switch.def (VLS_ENTRY): Ditto.
        * config/riscv/riscv.cc (riscv_v_ext_vls_mode_p): Ditto.
        (VLS_ENTRY): Ditto.
        (riscv_v_ext_mode_p): Ditto.
        (riscv_hard_regno_nregs): Ditto.
        (riscv_hard_regno_mode_ok): Ditto.
        * config/riscv/riscv.md: Ditto.
        * config/riscv/vector-iterators.md: Ditto.
        * config/riscv/vector.md: Ditto.
        * config/riscv/autovec-vls.md: New file.

---
 gcc/config/riscv/autovec-vls.md          | 102 +++++++++++++++++++
 gcc/config/riscv/riscv-modes.def         |  72 +++++++++++++
 gcc/config/riscv/riscv-protos.h          |   2 +
 gcc/config/riscv/riscv-v.cc              | 122 ++++++++++++++++++++++-
 gcc/config/riscv/riscv-vector-switch.def |  62 ++++++++++++
 gcc/config/riscv/riscv.cc                |  45 ++++++---
 gcc/config/riscv/riscv.md                |   6 +-
 gcc/config/riscv/vector-iterators.md     |  57 +++++++++++
 gcc/config/riscv/vector.md               | 113 +++++++++++++++++++--
 9 files changed, 557 insertions(+), 24 deletions(-)
 create mode 100644 gcc/config/riscv/autovec-vls.md

diff --git a/gcc/config/riscv/autovec-vls.md b/gcc/config/riscv/autovec-vls.md
new file mode 100644
index 00000000000..d48b2b601c1
--- /dev/null
+++ b/gcc/config/riscv/autovec-vls.md
@@ -0,0 +1,102 @@
+;; Machine description for VLS of RVV auto-vectorization.
+;; Copyright (C) 2023 Free Software Foundation, Inc.
+;; Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
+
+;; This file is part of GCC.
+
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; This pattern is similar to the one in vector.md but excludes the avl_type
+;; operand.  We don't need to include the avl_type operand since we never use
+;; vsetvli rd,zero for VLS modes.
+(define_insn_and_split "@pred_mov<mode>"
+  [(set (match_operand:VLS 0 "reg_or_mem_operand"        "=vr,    vr,    vd,     m,    vr,    vr,    vr,    vr")
+    (if_then_else:VLS
+      (unspec:<VM>
+        [(match_operand:<VM> 1 "vector_mask_operand"   "vmWc1,   Wc1,    vm, vmWc1,   Wc1,   Wc1,   Wc1,   Wc1")
+         (match_operand 4 "vector_length_operand"      "   rK,    rK,    rK,    rK,    rK,    rK,    rK,    rK")
+         (match_operand 5 "const_int_operand"          "    i,     i,     i,     i,     i,     i,     i,     i")
+         (match_operand 6 "const_int_operand"          "    i,     i,     i,     i,     i,     i,     i,     i")
+         (reg:SI VL_REGNUM)
+         (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+      (match_operand:VLS 3 "vector_move_operand"       "    m,     m,     m,    vr,    vr,    vr, viWc0, viWc0")
+      (match_operand:VLS 2 "vector_merge_operand"      "    0,    vu,    vu,    vu,    vu,     0,    vu,     0")))]
+  "TARGET_VECTOR && (MEM_P (operands[0]) || MEM_P (operands[3])
+   || CONST_VECTOR_P (operands[1]))"
+  "@
+   vle<sew>.v\t%0,%3%p1
+   vle<sew>.v\t%0,%3
+   vle<sew>.v\t%0,%3,%1.t
+   vse<sew>.v\t%3,%0%p1
+   vmv.v.v\t%0,%3
+   vmv.v.v\t%0,%3
+   vmv.v.i\t%0,%v3
+   vmv.v.i\t%0,%v3"
+  "&& register_operand (operands[0], <MODE>mode)
+   && register_operand (operands[3], <MODE>mode)
+   && satisfies_constraint_vu (operands[2])"
+  [(set (match_dup 0) (match_dup 3))]
+  ""
+  [(set_attr "type" "vlde,vlde,vlde,vste,vimov,vimov,vimov,vimov")
+   (set_attr "mode" "<MODE>")])
+
+;; -----------------------------------------------------------------
+;; ---- Moves Operations
+;; -----------------------------------------------------------------
+
+(define_expand "mov<mode>"
+  [(set (match_operand:VLS 0 "reg_or_mem_operand")
+	(match_operand:VLS 1 "general_operand"))]
+  "TARGET_VECTOR"
+{
+  if (riscv_vector::legitimize_move (operands[0], operands[1]))
+    DONE;
+})
+
+(define_insn "*mov<mode>_reg"
+  [(set (match_operand:VLS 0 "register_operand" "=vr")
+	(match_operand:VLS 1 "register_operand" " vr"))]
+  "TARGET_VECTOR"
+  "vmv%m1r.v\t%0,%1"
+  [(set_attr "type" "vmov")
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "@mov<VLS:mode><P:mode>_lra"
+  [(parallel
+    [(set (match_operand:VLS 0 "reg_or_mem_operand")
+	  (match_operand:VLS 1 "reg_or_mem_operand"))
+   (clobber (match_scratch:P 2))])]
+  "TARGET_VECTOR && (lra_in_progress || reload_completed)"
+{})
+
+(define_insn_and_split "*mov<VLS:mode><P:mode>_lra"
+  [(set (match_operand:VLS 0 "reg_or_mem_operand" "=vr, m,vr")
+	(match_operand:VLS 1 "reg_or_mem_operand" "  m,vr,vr"))
+   (clobber (match_scratch:P 2 "=&r,&r,X"))]
+  "TARGET_VECTOR && (lra_in_progress || reload_completed)"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  if (REG_P (operands[0]) && REG_P (operands[1]))
+      emit_insn (gen_rtx_SET (operands[0], operands[1]));
+  else
+    {
+      riscv_vector::emit_vlmax_vsetvl (<VLS:MODE>mode, operands[2]);
+      riscv_vector::emit_vlmax_insn (code_for_pred_mov (<VLS:MODE>mode),
+				     riscv_vector::RVV_UNOP, operands, operands[2]);
+    }
+  DONE;
+})
diff --git a/gcc/config/riscv/riscv-modes.def b/gcc/config/riscv/riscv-modes.def
index 19a4f9fb3db..f8aaf9597b6 100644
--- a/gcc/config/riscv/riscv-modes.def
+++ b/gcc/config/riscv/riscv-modes.def
@@ -318,6 +318,78 @@ RVV_TUPLE_PARTIAL_MODES (6)
 RVV_TUPLE_PARTIAL_MODES (7)
 RVV_TUPLE_PARTIAL_MODES (8)
 
+/* VLS modes used for SIMD auto-vectorization of the epilogue.  Note that the
+   return type of GET_MODE_BITSIZE is poly_uint16 or unsigned short.
+   The maximum bitsize of all vector modes here is 65536 = 8192 (LMUL = 1) * 8,
+   even though the RISC-V 'V' ISA spec allows a maximum bitsize of 65536 * 8.
+
+   Note: We don't enable vector modes of TI/TF.  */
+
+VECTOR_BOOL_MODE (V2BI, 2, BI, 2);           /*    V2BI  */
+VECTOR_BOOL_MODE (V4BI, 4, BI, 4);           /*    V4BI  */
+VECTOR_BOOL_MODE (V8BI, 8, BI, 8);           /*    V8BI  */
+VECTOR_BOOL_MODE (V16BI, 16, BI, 16);        /*   V16BI  */
+VECTOR_BOOL_MODE (V32BI, 32, BI, 32);        /*   V32BI  */
+VECTOR_BOOL_MODE (V64BI, 64, BI, 64);        /*   V64BI  */
+VECTOR_BOOL_MODE (V128BI, 128, BI, 128);     /*  V128BI  */
+VECTOR_BOOL_MODE (V256BI, 256, BI, 256);     /*  V256BI  */
+VECTOR_BOOL_MODE (V512BI, 512, BI, 512);     /*  V512BI  */
+VECTOR_BOOL_MODE (V1024BI, 1024, BI, 1024);  /* V1024BI  */
+VECTOR_BOOL_MODE (V2048BI, 2048, BI, 2048);  /* V2048BI  */
+VECTOR_BOOL_MODE (V4096BI, 4096, BI, 4096);  /* V4096BI  */
+
+ADJUST_ALIGNMENT (V2BI, 1);
+ADJUST_ALIGNMENT (V4BI, 1);
+ADJUST_ALIGNMENT (V8BI, 1);
+ADJUST_ALIGNMENT (V16BI, 1);
+ADJUST_ALIGNMENT (V32BI, 1);
+ADJUST_ALIGNMENT (V64BI, 1);
+ADJUST_ALIGNMENT (V128BI, 1);
+ADJUST_ALIGNMENT (V256BI, 1);
+ADJUST_ALIGNMENT (V512BI, 1);
+ADJUST_ALIGNMENT (V1024BI, 1);
+ADJUST_ALIGNMENT (V2048BI, 1);
+ADJUST_ALIGNMENT (V4096BI, 1);
+
+ADJUST_BYTESIZE (V2BI, TARGET_MIN_VLEN / 8);
+ADJUST_BYTESIZE (V4BI, TARGET_MIN_VLEN / 8);
+ADJUST_BYTESIZE (V8BI, TARGET_MIN_VLEN / 8);
+ADJUST_BYTESIZE (V16BI, TARGET_MIN_VLEN / 8);
+ADJUST_BYTESIZE (V32BI, TARGET_MIN_VLEN / 8);
+ADJUST_BYTESIZE (V64BI, TARGET_MIN_VLEN / 8);
+ADJUST_BYTESIZE (V128BI, TARGET_MIN_VLEN / 8);
+ADJUST_BYTESIZE (V256BI, TARGET_MIN_VLEN / 8);
+ADJUST_BYTESIZE (V512BI, TARGET_MIN_VLEN / 8);
+ADJUST_BYTESIZE (V1024BI, TARGET_MIN_VLEN / 8);
+ADJUST_BYTESIZE (V2048BI, TARGET_MIN_VLEN / 8);
+ADJUST_BYTESIZE (V4096BI, TARGET_MIN_VLEN / 8);
+
+ADJUST_PRECISION (V2BI, 2);
+ADJUST_PRECISION (V4BI, 4);
+ADJUST_PRECISION (V8BI, 8);
+ADJUST_PRECISION (V16BI, 16);
+ADJUST_PRECISION (V32BI, 32);
+ADJUST_PRECISION (V64BI, 64);
+ADJUST_PRECISION (V128BI, 128);
+ADJUST_PRECISION (V256BI, 256);
+ADJUST_PRECISION (V512BI, 512);
+ADJUST_PRECISION (V1024BI, 1024);
+ADJUST_PRECISION (V2048BI, 2048);
+ADJUST_PRECISION (V4096BI, 4096);
+
+VECTOR_MODES (INT, 2);        /*    V2QI                         */
+VECTOR_MODES (INT, 4);        /*    V4QI    V2HI                 */
+VECTOR_MODES (INT, 8);        /*    V8QI    V4HI    V2SI         */
+VECTOR_MODES (INT, 16);       /*   V16QI    V8HI    V4SI    V2DI */
+VECTOR_MODES (INT, 32);       /*   V32QI   V16HI    V8SI    V4DI */
+VECTOR_MODES (INT, 64);       /*   V64QI   V32HI   V16SI    V8DI */
+VECTOR_MODES (INT, 128);      /*  V128QI   V64HI   V32SI   V16DI */
+VECTOR_MODES (INT, 256);      /*  V256QI  V128HI   V64SI   V32DI */
+VECTOR_MODES (INT, 512);      /*  V512QI  V256HI  V128SI   V64DI */
+VECTOR_MODES (INT, 1024);     /* V1024QI  V512HI  V256SI  V128DI */
+VECTOR_MODES (INT, 2048);     /* V2048QI V1024HI  V512SI  V256DI */
+VECTOR_MODES (INT, 4096);     /* V4096QI V2048HI V1024SI  V512DI */
+
 /* TODO: According to RISC-V 'V' ISA spec, the maximun vector length can
    be 65536 for a single vector register which means the vector mode in
    GCC can be maximum = 65536 * 8 bits (LMUL=8).
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index b23a9c12465..284c2630751 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -82,6 +82,7 @@ extern void riscv_reinit (void);
 extern poly_uint64 riscv_regmode_natural_size (machine_mode);
 extern bool riscv_v_ext_vector_mode_p (machine_mode);
 extern bool riscv_v_ext_tuple_mode_p (machine_mode);
+extern bool riscv_v_ext_vls_mode_p (machine_mode);
 extern bool riscv_shamt_matches_mask_p (int, HOST_WIDE_INT);
 extern void riscv_subword_address (rtx, rtx *, rtx *, rtx *, rtx *);
 extern void riscv_lshift_subword (machine_mode, rtx, rtx, rtx *);
@@ -191,6 +192,7 @@ void emit_vlmax_cmp_insn (unsigned, rtx *);
 void emit_vlmax_cmp_mu_insn (unsigned, rtx *);
 void emit_vlmax_masked_mu_insn (unsigned, int, rtx *);
 enum vlmul_type get_vlmul (machine_mode);
+int get_regno_alignment (machine_mode);
 unsigned int get_ratio (machine_mode);
 unsigned int get_nf (machine_mode);
 machine_mode get_subpart_mode (machine_mode);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 6421e933ca9..6fc1c433069 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -25,6 +25,7 @@
    the vector.md.  */
 #define RVV_INSN_OPERANDS_MAX 11
 
+#define INCLUDE_ALGORITHM
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -60,6 +61,11 @@ const_vlmax_p (machine_mode mode)
 {
   poly_uint64 nuints = GET_MODE_NUNITS (mode);
 
+  /* For VLS modes, we always use vsetvli zero,rs1
+     instead of vsetvli rd,zero.  */
+  if (riscv_v_ext_vls_mode_p (mode))
+    return true;
+
   return nuints.is_constant ()
     /* The vsetivli can only hold register 0~31.  */
     ? (IN_RANGE (nuints.to_constant (), 0, 31))
@@ -198,7 +204,7 @@ public:
 
     if (!all_mask_p)
       add_policy_operand ();
-    if (m_needs_avl_p)
+    if (m_needs_avl_p && !riscv_v_ext_vls_mode_p (m_dest_mode))
       add_avl_type_operand (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX);
     expand (icode, any_mem_p);
   }
@@ -1194,7 +1200,8 @@ legitimize_move (rtx dest, rtx src)
    * require one extra general purpose register, but it's not allowed during LRA
    * process, so we have a special move pattern used for LRA, which will defer
    * the expansion after LRA.  */
-  if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
+  if ((riscv_v_ext_vls_mode_p (mode)
+       || known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
        || GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
       && lra_in_progress)
     {
@@ -1202,7 +1209,8 @@ legitimize_move (rtx dest, rtx src)
       return true;
     }
 
-  if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
+  if (riscv_v_ext_vector_mode_p (mode)
+      && known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
       && GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
     {
       /* Need to force register if mem <- !reg.  */
@@ -1285,6 +1293,59 @@ static mode_vtype_group mode_vtype_infos;
 enum vlmul_type
 get_vlmul (machine_mode mode)
 {
+  /* For VLS modes, the vlmul should be dynamically
+     calculated since we need to adjust VLMUL according
+     to TARGET_MIN_VLEN.  */
+  if (riscv_v_ext_vls_mode_p (mode))
+    {
+      int size = GET_MODE_BITSIZE (mode).to_constant ();
+      int inner_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
+      if (size < TARGET_MIN_VLEN)
+	{
+	  int factor = TARGET_MIN_VLEN / size;
+	  if (inner_size == 8)
+	    factor = std::min (factor, 8);
+	  else if (inner_size == 16)
+	    factor = std::min (factor, 4);
+	  else if (inner_size == 32)
+	    factor = std::min (factor, 2);
+	  else if (inner_size == 64)
+	    factor = std::min (factor, 1);
+	  else
+	    gcc_unreachable ();
+
+	  switch (factor)
+	    {
+	    case 2:
+	      return LMUL_F2;
+	    case 4:
+	      return LMUL_F4;
+	    case 8:
+	      return LMUL_F8;
+
+	    default:
+	      gcc_unreachable ();
+	    }
+	}
+      else
+	{
+	  int factor = size / TARGET_MIN_VLEN;
+	  switch (factor)
+	    {
+	    case 1:
+	      return LMUL_1;
+	    case 2:
+	      return LMUL_2;
+	    case 4:
+	      return LMUL_4;
+	    case 8:
+	      return LMUL_8;
+
+	    default:
+	      gcc_unreachable ();
+	    }
+	}
+    }
   if (TARGET_MIN_VLEN >= 128)
     return mode_vtype_infos.vlmul_for_for_vlen128[mode];
   else if (TARGET_MIN_VLEN == 32)
@@ -1293,6 +1354,32 @@ get_vlmul (machine_mode mode)
     return mode_vtype_infos.vlmul_for_min_vlen64[mode];
 }
 
+/* Get the REGNO alignment of a vector mode.
+   The alignment equals LMUL when LMUL >= 1.
+   Otherwise, the alignment is 1.  */
+int
+get_regno_alignment (machine_mode mode)
+{
+  /* 3.3.2. LMUL = 2,4,8, register numbers should be multiple of 2,4,8.
+     but for mask vector register, register numbers can be any number. */
+  int lmul = 1;
+  machine_mode rvv_mode = mode;
+  if (riscv_v_ext_vls_mode_p (rvv_mode))
+    {
+      int size = GET_MODE_BITSIZE (rvv_mode).to_constant ();
+      if (size < TARGET_MIN_VLEN)
+	return 1;
+      else
+	return size / TARGET_MIN_VLEN;
+    }
+  if (riscv_v_ext_tuple_mode_p (rvv_mode))
+    rvv_mode = riscv_vector::get_subpart_mode (rvv_mode);
+  poly_int64 size = GET_MODE_SIZE (rvv_mode);
+  if (known_gt (size, UNITS_PER_V_REG))
+    lmul = exact_div (size, UNITS_PER_V_REG).to_constant ();
+  return lmul;
+}
+
 /* Return the NF value of the corresponding mode.  */
 unsigned int
 get_nf (machine_mode mode)
@@ -1317,6 +1404,31 @@ get_subpart_mode (machine_mode mode)
 unsigned int
 get_ratio (machine_mode mode)
 {
+  if (riscv_v_ext_vls_mode_p (mode))
+    {
+      unsigned int sew = get_sew (mode);
+      vlmul_type vlmul = get_vlmul (mode);
+      switch (vlmul)
+	{
+	case LMUL_1:
+	  return sew;
+	case LMUL_2:
+	  return sew / 2;
+	case LMUL_4:
+	  return sew / 4;
+	case LMUL_8:
+	  return sew / 8;
+	case LMUL_F8:
+	  return sew * 8;
+	case LMUL_F4:
+	  return sew * 4;
+	case LMUL_F2:
+	  return sew * 2;
+
+	default:
+	  gcc_unreachable ();
+	}
+    }
   if (TARGET_MIN_VLEN >= 128)
     return mode_vtype_infos.ratio_for_for_vlen128[mode];
   else if (TARGET_MIN_VLEN == 32)
@@ -1397,7 +1509,9 @@ get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits)
   FOR_EACH_MODE_IN_CLASS (mode, mclass)
     if (inner_mode == GET_MODE_INNER (mode)
 	&& known_eq (nunits, GET_MODE_NUNITS (mode))
-	&& riscv_v_ext_vector_mode_p (mode))
+	&& (riscv_v_ext_vector_mode_p (mode)
+	    || (!riscv_vector_chunks.is_constant ()
+		&& riscv_v_ext_vls_mode_p (mode))))
       return mode;
   return opt_machine_mode ();
 }
diff --git a/gcc/config/riscv/riscv-vector-switch.def b/gcc/config/riscv/riscv-vector-switch.def
index 52f07709f99..b476e7c22e5 100644
--- a/gcc/config/riscv/riscv-vector-switch.def
+++ b/gcc/config/riscv/riscv-vector-switch.def
@@ -90,6 +90,9 @@ TODO: FP16 vector needs support of 'zvfh', we don't support it yet.  */
 		    RATIO_FOR_MIN_VLEN64, VLMUL_FOR_MIN_VLEN128,               \
 		    RATIO_FOR_MIN_VLEN128)
 #endif
+#ifndef VLS_ENTRY
+#define VLS_ENTRY(MODE, REQUIREMENT)
+#endif
 
 /* Mask modes. Disable VNx64BImode when TARGET_MIN_VLEN == 32.  */
 ENTRY (VNx128BI, TARGET_MIN_VLEN >= 128, LMUL_RESERVED, 0, LMUL_RESERVED, 0, LMUL_8, 1)
@@ -339,5 +342,64 @@ TUPLE_ENTRY (VNx6x1DF, TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN < 128, VNx1DF
 TUPLE_ENTRY (VNx7x1DF, TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN < 128, VNx1DF, 7, LMUL_RESERVED, 0, LMUL_1, 64, LMUL_RESERVED, 0)
 TUPLE_ENTRY (VNx8x1DF, TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN < 128, VNx1DF, 8, LMUL_RESERVED, 0, LMUL_1, 64, LMUL_RESERVED, 0)
 
+/* The following VLS modes should satisfy the constraint:
+   GET_MODE_BITSIZE (MODE) <= TARGET_MIN_VLEN * 8.  */
+VLS_ENTRY (V2BI, true)
+VLS_ENTRY (V4BI, true)
+VLS_ENTRY (V8BI, true)
+VLS_ENTRY (V16BI, true)
+VLS_ENTRY (V32BI, true)
+VLS_ENTRY (V64BI, TARGET_MIN_VLEN >= 64)
+VLS_ENTRY (V128BI, TARGET_MIN_VLEN >= 128)
+VLS_ENTRY (V256BI, TARGET_MIN_VLEN >= 256)
+VLS_ENTRY (V512BI, TARGET_MIN_VLEN >= 512)
+VLS_ENTRY (V1024BI, TARGET_MIN_VLEN >= 1024)
+VLS_ENTRY (V2048BI, TARGET_MIN_VLEN >= 2048)
+VLS_ENTRY (V4096BI, TARGET_MIN_VLEN >= 4096)
+
+VLS_ENTRY (V2QI, true)
+VLS_ENTRY (V4QI, true)
+VLS_ENTRY (V8QI, true)
+VLS_ENTRY (V16QI, true)
+VLS_ENTRY (V32QI, true)
+VLS_ENTRY (V64QI, TARGET_MIN_VLEN >= 64)
+VLS_ENTRY (V128QI, TARGET_MIN_VLEN >= 128)
+VLS_ENTRY (V256QI, TARGET_MIN_VLEN >= 256)
+VLS_ENTRY (V512QI, TARGET_MIN_VLEN >= 512)
+VLS_ENTRY (V1024QI, TARGET_MIN_VLEN >= 1024)
+VLS_ENTRY (V2048QI, TARGET_MIN_VLEN >= 2048)
+VLS_ENTRY (V4096QI, TARGET_MIN_VLEN >= 4096)
+VLS_ENTRY (V2HI, true)
+VLS_ENTRY (V4HI, true)
+VLS_ENTRY (V8HI, true)
+VLS_ENTRY (V16HI, true)
+VLS_ENTRY (V32HI, TARGET_MIN_VLEN >= 64)
+VLS_ENTRY (V64HI, TARGET_MIN_VLEN >= 128)
+VLS_ENTRY (V128HI, TARGET_MIN_VLEN >= 256)
+VLS_ENTRY (V256HI, TARGET_MIN_VLEN >= 512)
+VLS_ENTRY (V512HI, TARGET_MIN_VLEN >= 1024)
+VLS_ENTRY (V1024HI, TARGET_MIN_VLEN >= 2048)
+VLS_ENTRY (V2048HI, TARGET_MIN_VLEN >= 4096)
+VLS_ENTRY (V2SI, true)
+VLS_ENTRY (V4SI, true)
+VLS_ENTRY (V8SI, true)
+VLS_ENTRY (V16SI, TARGET_MIN_VLEN >= 64)
+VLS_ENTRY (V32SI, TARGET_MIN_VLEN >= 128)
+VLS_ENTRY (V64SI, TARGET_MIN_VLEN >= 256)
+VLS_ENTRY (V128SI, TARGET_MIN_VLEN >= 512)
+VLS_ENTRY (V256SI, TARGET_MIN_VLEN >= 1024)
+VLS_ENTRY (V512SI, TARGET_MIN_VLEN >= 2048)
+VLS_ENTRY (V1024SI, TARGET_MIN_VLEN >= 4096)
+VLS_ENTRY (V2DI, TARGET_VECTOR_ELEN_64)
+VLS_ENTRY (V4DI, TARGET_VECTOR_ELEN_64)
+VLS_ENTRY (V8DI, TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 64)
+VLS_ENTRY (V16DI, TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 128)
+VLS_ENTRY (V32DI, TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 256)
+VLS_ENTRY (V64DI, TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 512)
+VLS_ENTRY (V128DI, TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 1024)
+VLS_ENTRY (V256DI, TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 2048)
+VLS_ENTRY (V512DI, TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 4096)
+
 #undef ENTRY
 #undef TUPLE_ENTRY
+#undef VLS_ENTRY
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index e5ae4e81b7a..70051257fa5 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -1029,12 +1029,31 @@ riscv_v_ext_tuple_mode_p (machine_mode mode)
   return false;
 }
 
+/* Return true if MODE is an RVV-enabled VLS mode.  */
+
+bool
+riscv_v_ext_vls_mode_p (machine_mode mode)
+{
+#define VLS_ENTRY(MODE, REQUIREMENT)                                           \
+  case MODE##mode:                                                             \
+    return REQUIREMENT;
+  switch (mode)
+    {
+#include "riscv-vector-switch.def"
+    default:
+      return false;
+    }
+
+  return false;
+}
+
 /* Return true if it is either RVV vector mode or RVV tuple mode.  */
 
 static bool
 riscv_v_ext_mode_p (machine_mode mode)
 {
-  return riscv_v_ext_vector_mode_p (mode) || riscv_v_ext_tuple_mode_p (mode);
+  return riscv_v_ext_vector_mode_p (mode) || riscv_v_ext_tuple_mode_p (mode)
+	 || riscv_v_ext_vls_mode_p (mode);
 }
 
 /* Call from ADJUST_NUNITS in riscv-modes.def. Return the correct
@@ -6186,6 +6205,16 @@ riscv_hard_regno_nregs (unsigned int regno, machine_mode mode)
 	}
     }
 
+  /* For VLS modes, we allocate registers according to TARGET_MIN_VLEN.  */
+  if (riscv_v_ext_vls_mode_p (mode))
+    {
+      int size = GET_MODE_SIZE (mode).to_constant ();
+      if (size < TARGET_MIN_VLEN)
+	return 1;
+      else
+	return size / TARGET_MIN_VLEN;
+    }
+
   /* mode for VL or VTYPE are just a marker, not holding value,
      so it always consume one register.  */
   if (VTYPE_REG_P (regno) || VL_REG_P (regno) || VXRM_REG_P (regno)
@@ -6245,17 +6274,9 @@ riscv_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
       if (!V_REG_P (regno + nregs - 1))
 	return false;
 
-      /* 3.3.2. LMUL = 2,4,8, register numbers should be multiple of 2,4,8.
-	 but for mask vector register, register numbers can be any number. */
-      int lmul = 1;
-      machine_mode rvv_mode = mode;
-      if (riscv_v_ext_tuple_mode_p (rvv_mode))
-	rvv_mode = riscv_vector::get_subpart_mode (rvv_mode);
-      poly_int64 size = GET_MODE_SIZE (rvv_mode);
-      if (known_gt (size, UNITS_PER_V_REG))
-	lmul = exact_div (size, UNITS_PER_V_REG).to_constant ();
-      if (lmul != 1)
-	return ((regno % lmul) == 0);
+      int regno_alignment = riscv_vector::get_regno_alignment (mode);
+      if (regno_alignment != 1)
+	return ((regno % regno_alignment) == 0);
     }
   else if (VTYPE_REG_P (regno) || VL_REG_P (regno) || VXRM_REG_P (regno)
 	   || FRM_REG_P (regno))
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index d8e935cb934..ed184f963ac 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -202,7 +202,11 @@
   VNx2x1DI,VNx3x1DI,VNx4x1DI,VNx5x1DI,VNx6x1DI,VNx7x1DI,VNx8x1DI,
   VNx2x8DF,VNx2x4DF,VNx3x4DF,VNx4x4DF,
   VNx2x2DF,VNx3x2DF,VNx4x2DF,VNx5x2DF,VNx6x2DF,VNx7x2DF,VNx8x2DF,
-  VNx2x1DF,VNx3x1DF,VNx4x1DF,VNx5x1DF,VNx6x1DF,VNx7x1DF,VNx8x1DF"
+  VNx2x1DF,VNx3x1DF,VNx4x1DF,VNx5x1DF,VNx6x1DF,VNx7x1DF,VNx8x1DF,
+  V2QI,V4QI,V8QI,V16QI,V32QI,V64QI,V128QI,V256QI,V512QI,V1024QI,V2048QI,V4096QI,
+  V2HI,V4HI,V8HI,V16HI,V32HI,V64HI,V128HI,V256HI,V512HI,V1024HI,V2048HI,
+  V2SI,V4SI,V8SI,V16SI,V32SI,V64SI,V128SI,V256SI,V512SI,V1024SI,
+  V2DI,V4DI,V8DI,V16DI,V32DI,V64DI,V128DI,V256DI,V512DI"
   (const_string "unknown"))
 
 ;; True if the main data type is twice the size of a word.
diff --git a/gcc/config/riscv/vector-iterators.md b/gcc/config/riscv/vector-iterators.md
index 8c71c9e22cc..5f732240ab5 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -1033,6 +1033,19 @@
   (VNx2x4DF "VNx4BI") (VNx3x4DF "VNx4BI") (VNx4x4DF "VNx4BI")
   (VNx2x2DF "VNx2BI") (VNx3x2DF "VNx2BI") (VNx4x2DF "VNx2BI") (VNx5x2DF "VNx2BI") (VNx6x2DF "VNx2BI") (VNx7x2DF "VNx2BI") (VNx8x2DF "VNx2BI")
   (VNx2x1DF "VNx1BI") (VNx3x1DF "VNx1BI") (VNx4x1DF "VNx1BI") (VNx5x1DF "VNx1BI") (VNx6x1DF "VNx1BI") (VNx7x1DF "VNx1BI") (VNx8x1DF "VNx1BI")
+
+  ;; VLS modes.
+  (V2QI "V2BI") (V4QI "V4BI") (V8QI "V8BI") (V16QI "V16BI") (V32QI "V32BI")
+  (V64QI "V64BI") (V128QI "V128BI") (V256QI "V256BI") (V512QI "V512BI")
+  (V1024QI "V1024BI") (V2048QI "V2048BI") (V4096QI "V4096BI")
+  (V2HI "V2BI") (V4HI "V4BI") (V8HI "V8BI") (V16HI "V16BI")
+  (V32HI "V32BI") (V64HI "V64BI") (V128HI "V128BI") (V256HI "V256BI")
+  (V512HI "V512BI") (V1024HI "V1024BI") (V2048HI "V2048BI")
+  (V2SI "V2BI") (V4SI "V4BI") (V8SI "V8BI")
+  (V16SI "V16BI") (V32SI "V32BI") (V64SI "V64BI")
+  (V128SI "V128BI") (V256SI "V256BI") (V512SI "V512BI") (V1024SI "V1024BI")
+  (V2DI "V2BI") (V4DI "V4BI") (V8DI "V8BI") (V16DI "V16BI") (V32DI "V32BI")
+  (V64DI "V64BI") (V128DI "V128BI") (V256DI "V256BI") (V512DI "V512BI")
 ])
 
 (define_mode_attr vm [
@@ -1136,6 +1149,15 @@
   (VNx2x4DF "64") (VNx3x4DF "64") (VNx4x4DF "64")
   (VNx2x2DF "64") (VNx3x2DF "64") (VNx4x2DF "64") (VNx5x2DF "64") (VNx6x2DF "64") (VNx7x2DF "64") (VNx8x2DF "64")
   (VNx2x1DF "64") (VNx3x1DF "64") (VNx4x1DF "64") (VNx5x1DF "64") (VNx6x1DF "64") (VNx7x1DF "64") (VNx8x1DF "64")
+
+  ;; VLS modes.
+  (V2QI "8") (V4QI "8") (V8QI "8") (V16QI "8") (V32QI "8") (V64QI "8") (V128QI "8") (V256QI "8") (V512QI "8")
+  (V1024QI "8") (V2048QI "8") (V4096QI "8")
+  (V2HI "16") (V4HI "16") (V8HI "16") (V16HI "16") (V32HI "16") (V64HI "16") (V128HI "16") (V256HI "16")
+  (V512HI "16") (V1024HI "16") (V2048HI "16")
+  (V2SI "32") (V4SI "32") (V8SI "32") (V16SI "32") (V32SI "32") (V64SI "32") (V128SI "32") (V256SI "32")
+  (V512SI "32") (V1024SI "32")
+  (V2DI "64") (V4DI "64") (V8DI "64") (V16DI "64") (V32DI "64") (V64DI "64") (V128DI "64") (V256DI "64") (V512DI "64")
 ])
 
 (define_mode_attr double_trunc_sew [
@@ -1774,3 +1796,38 @@
 			      (mult "%3,%4")])
 
 (define_code_attr sz [(sign_extend "s") (zero_extend "z")])
+
+;; VLS modes.
+(define_mode_iterator VLS [V2QI V4QI V8QI V16QI V32QI
+  (V64QI "TARGET_MIN_VLEN >= 64")
+  (V128QI "TARGET_MIN_VLEN >= 128")
+  (V256QI "TARGET_MIN_VLEN >= 256")
+  (V512QI "TARGET_MIN_VLEN >= 512")
+  (V1024QI "TARGET_MIN_VLEN >= 1024")
+  (V2048QI "TARGET_MIN_VLEN >= 2048")
+  (V4096QI "TARGET_MIN_VLEN >= 4096")
+  V2HI V4HI V8HI V16HI
+  (V32HI "TARGET_MIN_VLEN >= 64")
+  (V64HI "TARGET_MIN_VLEN >= 128")
+  (V128HI "TARGET_MIN_VLEN >= 256")
+  (V256HI "TARGET_MIN_VLEN >= 512")
+  (V512HI "TARGET_MIN_VLEN >= 1024")
+  (V1024HI "TARGET_MIN_VLEN >= 2048")
+  (V2048HI "TARGET_MIN_VLEN >= 4096")
+  V2SI V4SI V8SI
+  (V16SI "TARGET_MIN_VLEN >= 64")
+  (V32SI "TARGET_MIN_VLEN >= 128")
+  (V64SI "TARGET_MIN_VLEN >= 256")
+  (V128SI "TARGET_MIN_VLEN >= 512")
+  (V256SI "TARGET_MIN_VLEN >= 1024")
+  (V512SI "TARGET_MIN_VLEN >= 2048")
+  (V1024SI "TARGET_MIN_VLEN >= 4096")
+  (V2DI "TARGET_VECTOR_ELEN_64")
+  (V4DI "TARGET_VECTOR_ELEN_64")
+  (V8DI "TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 64")
+  (V16DI "TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 128")
+  (V32DI "TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 256")
+  (V64DI "TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 512")
+  (V128DI "TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 1024")
+  (V256DI "TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 2048")
+  (V512DI "TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 4096")])
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 1d1847bd85a..3322b3512c5 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -25,6 +25,7 @@
 ;; - Intrinsics (https://github.com/riscv/rvv-intrinsic-doc)
 ;; - Auto-vectorization (autovec.md)
 ;; - Optimization (autovec-opt.md)
+;; - VLS patterns (autovec-vls.md)
 
 (include "vector-iterators.md")
 
@@ -90,7 +91,8 @@
 			  VNx2x8QI,VNx3x8QI,VNx4x8QI,VNx5x8QI,VNx6x8QI,VNx7x8QI,VNx8x8QI,\
 			  VNx2x4QI,VNx3x4QI,VNx4x4QI,VNx5x4QI,VNx6x4QI,VNx7x4QI,VNx8x4QI,\
 			  VNx2x2QI,VNx3x2QI,VNx4x2QI,VNx5x2QI,VNx6x2QI,VNx7x2QI,VNx8x2QI,\
-			  VNx2x1QI,VNx3x1QI,VNx4x1QI,VNx5x1QI,VNx6x1QI,VNx7x1QI,VNx8x1QI")
+			  VNx2x1QI,VNx3x1QI,VNx4x1QI,VNx5x1QI,VNx6x1QI,VNx7x1QI,VNx8x1QI,\
+			  V2QI,V4QI,V8QI,V16QI,V32QI,V64QI,V128QI,V256QI,V512QI,V1024QI,V2048QI,V4096QI")
 	 (const_int 8)
 	 (eq_attr "mode" "VNx1HI,VNx2HI,VNx4HI,VNx8HI,VNx16HI,VNx32HI,VNx64HI,\
 			  VNx1HF,VNx2HF,VNx4HF,VNx8HF,VNx16HF,VNx32HF,VNx64HF,\
@@ -98,7 +100,8 @@
 			  VNx2x8HI,VNx3x8HI,VNx4x8HI,VNx5x8HI,VNx6x8HI,VNx7x8HI,VNx8x8HI,\
 			  VNx2x4HI,VNx3x4HI,VNx4x4HI,VNx5x4HI,VNx6x4HI,VNx7x4HI,VNx8x4HI,\
 			  VNx2x2HI,VNx3x2HI,VNx4x2HI,VNx5x2HI,VNx6x2HI,VNx7x2HI,VNx8x2HI,\
-			  VNx2x1HI,VNx3x1HI,VNx4x1HI,VNx5x1HI,VNx6x1HI,VNx7x1HI,VNx8x1HI")
+			  VNx2x1HI,VNx3x1HI,VNx4x1HI,VNx5x1HI,VNx6x1HI,VNx7x1HI,VNx8x1HI,\
+			  V2HI,V4HI,V8HI,V16HI,V32HI,V64HI,V128HI,V256HI,V512HI,V1024HI,V2048HI")
 	 (const_int 16)
 	 (eq_attr "mode" "VNx1SI,VNx2SI,VNx4SI,VNx8SI,VNx16SI,VNx32SI,\
 			  VNx1SF,VNx2SF,VNx4SF,VNx8SF,VNx16SF,VNx32SF,\
@@ -109,7 +112,8 @@
 			  VNx2x16SF,VNx2x8SF,VNx3x8SF,VNx4x8SF,\
 			  VNx2x4SF,VNx3x4SF,VNx4x4SF,VNx5x4SF,VNx6x4SF,VNx7x4SF,VNx8x4SF,\
 			  VNx2x2SF,VNx3x2SF,VNx4x2SF,VNx5x2SF,VNx6x2SF,VNx7x2SF,VNx8x2SF,\
-			  VNx2x1SF,VNx3x1SF,VNx4x1SF,VNx5x1SF,VNx6x1SF,VNx7x1SF,VNx8x1SF")
+			  VNx2x1SF,VNx3x1SF,VNx4x1SF,VNx5x1SF,VNx6x1SF,VNx7x1SF,VNx8x1SF,\
+			  V2SI,V4SI,V8SI,V16SI,V32SI,V64SI,V128SI,V256SI,V512SI,V1024SI")
 	 (const_int 32)
 	 (eq_attr "mode" "VNx1DI,VNx2DI,VNx4DI,VNx8DI,VNx16DI,\
 			  VNx1DF,VNx2DF,VNx4DF,VNx8DF,VNx16DF,\
@@ -118,7 +122,8 @@
 			  VNx2x1DI,VNx3x1DI,VNx4x1DI,VNx5x1DI,VNx6x1DI,VNx7x1DI,VNx8x1DI,\
 			  VNx2x8DF,VNx2x4DF,VNx3x4DF,VNx4x4DF,\
 			  VNx2x2DF,VNx3x2DF,VNx4x2DF,VNx5x2DF,VNx6x2DF,VNx7x2DF,VNx8x2DF,\
-			  VNx2x1DF,VNx3x1DF,VNx4x1DF,VNx5x1DF,VNx6x1DF,VNx7x1DF,VNx8x1DF")
+			  VNx2x1DF,VNx3x1DF,VNx4x1DF,VNx5x1DF,VNx6x1DF,VNx7x1DF,VNx8x1DF,\
+			  V2DI,V4DI,V8DI,V16DI,V32DI,V64DI,V128DI,V256DI,V512DI")
 	 (const_int 64)]
 	(const_int INVALID_ATTRIBUTE)))
 
@@ -197,7 +202,51 @@
 	 (eq_attr "mode" "VNx8DI,VNx8DF,VNx2x8DI,VNx2x8DF")
 	   (symbol_ref "riscv_vector::get_vlmul(E_VNx8DImode)")
 	 (eq_attr "mode" "VNx16DI,VNx16DF")
-	   (symbol_ref "riscv_vector::get_vlmul(E_VNx16DImode)")]
+	   (symbol_ref "riscv_vector::get_vlmul(E_VNx16DImode)")
+
+	 ;; VLS modes.
+	 (eq_attr "mode" "V2QI") (symbol_ref "riscv_vector::get_vlmul(E_V2QImode)")
+	 (eq_attr "mode" "V4QI") (symbol_ref "riscv_vector::get_vlmul(E_V4QImode)")
+	 (eq_attr "mode" "V8QI") (symbol_ref "riscv_vector::get_vlmul(E_V8QImode)")
+	 (eq_attr "mode" "V16QI") (symbol_ref "riscv_vector::get_vlmul(E_V16QImode)")
+	 (eq_attr "mode" "V32QI") (symbol_ref "riscv_vector::get_vlmul(E_V32QImode)")
+	 (eq_attr "mode" "V64QI") (symbol_ref "riscv_vector::get_vlmul(E_V64QImode)")
+	 (eq_attr "mode" "V128QI") (symbol_ref "riscv_vector::get_vlmul(E_V128QImode)")
+	 (eq_attr "mode" "V256QI") (symbol_ref "riscv_vector::get_vlmul(E_V256QImode)")
+	 (eq_attr "mode" "V512QI") (symbol_ref "riscv_vector::get_vlmul(E_V512QImode)")
+	 (eq_attr "mode" "V1024QI") (symbol_ref "riscv_vector::get_vlmul(E_V1024QImode)")
+	 (eq_attr "mode" "V2048QI") (symbol_ref "riscv_vector::get_vlmul(E_V2048QImode)")
+	 (eq_attr "mode" "V4096QI") (symbol_ref "riscv_vector::get_vlmul(E_V4096QImode)")
+	 (eq_attr "mode" "V2HI") (symbol_ref "riscv_vector::get_vlmul(E_V2HImode)")
+	 (eq_attr "mode" "V4HI") (symbol_ref "riscv_vector::get_vlmul(E_V4HImode)")
+	 (eq_attr "mode" "V8HI") (symbol_ref "riscv_vector::get_vlmul(E_V8HImode)")
+	 (eq_attr "mode" "V16HI") (symbol_ref "riscv_vector::get_vlmul(E_V16HImode)")
+	 (eq_attr "mode" "V32HI") (symbol_ref "riscv_vector::get_vlmul(E_V32HImode)")
+	 (eq_attr "mode" "V64HI") (symbol_ref "riscv_vector::get_vlmul(E_V64HImode)")
+	 (eq_attr "mode" "V128HI") (symbol_ref "riscv_vector::get_vlmul(E_V128HImode)")
+	 (eq_attr "mode" "V256HI") (symbol_ref "riscv_vector::get_vlmul(E_V256HImode)")
+	 (eq_attr "mode" "V512HI") (symbol_ref "riscv_vector::get_vlmul(E_V512HImode)")
+	 (eq_attr "mode" "V1024HI") (symbol_ref "riscv_vector::get_vlmul(E_V1024HImode)")
+	 (eq_attr "mode" "V2048HI") (symbol_ref "riscv_vector::get_vlmul(E_V2048HImode)")
+	 (eq_attr "mode" "V2SI") (symbol_ref "riscv_vector::get_vlmul(E_V2SImode)")
+	 (eq_attr "mode" "V4SI") (symbol_ref "riscv_vector::get_vlmul(E_V4SImode)")
+	 (eq_attr "mode" "V8SI") (symbol_ref "riscv_vector::get_vlmul(E_V8SImode)")
+	 (eq_attr "mode" "V16SI") (symbol_ref "riscv_vector::get_vlmul(E_V16SImode)")
+	 (eq_attr "mode" "V32SI") (symbol_ref "riscv_vector::get_vlmul(E_V32SImode)")
+	 (eq_attr "mode" "V64SI") (symbol_ref "riscv_vector::get_vlmul(E_V64SImode)")
+	 (eq_attr "mode" "V128SI") (symbol_ref "riscv_vector::get_vlmul(E_V128SImode)")
+	 (eq_attr "mode" "V256SI") (symbol_ref "riscv_vector::get_vlmul(E_V256SImode)")
+	 (eq_attr "mode" "V512SI") (symbol_ref "riscv_vector::get_vlmul(E_V512SImode)")
+	 (eq_attr "mode" "V1024SI") (symbol_ref "riscv_vector::get_vlmul(E_V1024SImode)")
+	 (eq_attr "mode" "V2DI") (symbol_ref "riscv_vector::get_vlmul(E_V2DImode)")
+	 (eq_attr "mode" "V4DI") (symbol_ref "riscv_vector::get_vlmul(E_V4DImode)")
+	 (eq_attr "mode" "V8DI") (symbol_ref "riscv_vector::get_vlmul(E_V8DImode)")
+	 (eq_attr "mode" "V16DI") (symbol_ref "riscv_vector::get_vlmul(E_V16DImode)")
+	 (eq_attr "mode" "V32DI") (symbol_ref "riscv_vector::get_vlmul(E_V32DImode)")
+	 (eq_attr "mode" "V64DI") (symbol_ref "riscv_vector::get_vlmul(E_V64DImode)")
+	 (eq_attr "mode" "V128DI") (symbol_ref "riscv_vector::get_vlmul(E_V128DImode)")
+	 (eq_attr "mode" "V256DI") (symbol_ref "riscv_vector::get_vlmul(E_V256DImode)")
+	 (eq_attr "mode" "V512DI") (symbol_ref "riscv_vector::get_vlmul(E_V512DImode)")]
 	(const_int INVALID_ATTRIBUTE)))
 
 ;; It is valid for instruction that require sew/lmul ratio.
@@ -290,7 +339,51 @@
 	 (eq_attr "mode" "VNx8DI,VNx8DF,VNx2x8DI,VNx2x8DF")
 	   (symbol_ref "riscv_vector::get_ratio(E_VNx8DImode)")
 	 (eq_attr "mode" "VNx16DI,VNx16DF")
-	   (symbol_ref "riscv_vector::get_ratio(E_VNx16DImode)")]
+	   (symbol_ref "riscv_vector::get_ratio(E_VNx16DImode)")
+
+	 ;; VLS modes.
+	 (eq_attr "mode" "V2QI") (symbol_ref "riscv_vector::get_ratio(E_V2QImode)")
+	 (eq_attr "mode" "V4QI") (symbol_ref "riscv_vector::get_ratio(E_V4QImode)")
+	 (eq_attr "mode" "V8QI") (symbol_ref "riscv_vector::get_ratio(E_V8QImode)")
+	 (eq_attr "mode" "V16QI") (symbol_ref "riscv_vector::get_ratio(E_V16QImode)")
+	 (eq_attr "mode" "V32QI") (symbol_ref "riscv_vector::get_ratio(E_V32QImode)")
+	 (eq_attr "mode" "V64QI") (symbol_ref "riscv_vector::get_ratio(E_V64QImode)")
+	 (eq_attr "mode" "V128QI") (symbol_ref "riscv_vector::get_ratio(E_V128QImode)")
+	 (eq_attr "mode" "V256QI") (symbol_ref "riscv_vector::get_ratio(E_V256QImode)")
+	 (eq_attr "mode" "V512QI") (symbol_ref "riscv_vector::get_ratio(E_V512QImode)")
+	 (eq_attr "mode" "V1024QI") (symbol_ref "riscv_vector::get_ratio(E_V1024QImode)")
+	 (eq_attr "mode" "V2048QI") (symbol_ref "riscv_vector::get_ratio(E_V2048QImode)")
+	 (eq_attr "mode" "V4096QI") (symbol_ref "riscv_vector::get_ratio(E_V4096QImode)")
+	 (eq_attr "mode" "V2HI") (symbol_ref "riscv_vector::get_ratio(E_V2HImode)")
+	 (eq_attr "mode" "V4HI") (symbol_ref "riscv_vector::get_ratio(E_V4HImode)")
+	 (eq_attr "mode" "V8HI") (symbol_ref "riscv_vector::get_ratio(E_V8HImode)")
+	 (eq_attr "mode" "V16HI") (symbol_ref "riscv_vector::get_ratio(E_V16HImode)")
+	 (eq_attr "mode" "V32HI") (symbol_ref "riscv_vector::get_ratio(E_V32HImode)")
+	 (eq_attr "mode" "V64HI") (symbol_ref "riscv_vector::get_ratio(E_V64HImode)")
+	 (eq_attr "mode" "V128HI") (symbol_ref "riscv_vector::get_ratio(E_V128HImode)")
+	 (eq_attr "mode" "V256HI") (symbol_ref "riscv_vector::get_ratio(E_V256HImode)")
+	 (eq_attr "mode" "V512HI") (symbol_ref "riscv_vector::get_ratio(E_V512HImode)")
+	 (eq_attr "mode" "V1024HI") (symbol_ref "riscv_vector::get_ratio(E_V1024HImode)")
+	 (eq_attr "mode" "V2048HI") (symbol_ref "riscv_vector::get_ratio(E_V2048HImode)")
+	 (eq_attr "mode" "V2SI") (symbol_ref "riscv_vector::get_ratio(E_V2SImode)")
+	 (eq_attr "mode" "V4SI") (symbol_ref "riscv_vector::get_ratio(E_V4SImode)")
+	 (eq_attr "mode" "V8SI") (symbol_ref "riscv_vector::get_ratio(E_V8SImode)")
+	 (eq_attr "mode" "V16SI") (symbol_ref "riscv_vector::get_ratio(E_V16SImode)")
+	 (eq_attr "mode" "V32SI") (symbol_ref "riscv_vector::get_ratio(E_V32SImode)")
+	 (eq_attr "mode" "V64SI") (symbol_ref "riscv_vector::get_ratio(E_V64SImode)")
+	 (eq_attr "mode" "V128SI") (symbol_ref "riscv_vector::get_ratio(E_V128SImode)")
+	 (eq_attr "mode" "V256SI") (symbol_ref "riscv_vector::get_ratio(E_V256SImode)")
+	 (eq_attr "mode" "V512SI") (symbol_ref "riscv_vector::get_ratio(E_V512SImode)")
+	 (eq_attr "mode" "V1024SI") (symbol_ref "riscv_vector::get_ratio(E_V1024SImode)")
+	 (eq_attr "mode" "V2DI") (symbol_ref "riscv_vector::get_ratio(E_V2DImode)")
+	 (eq_attr "mode" "V4DI") (symbol_ref "riscv_vector::get_ratio(E_V4DImode)")
+	 (eq_attr "mode" "V8DI") (symbol_ref "riscv_vector::get_ratio(E_V8DImode)")
+	 (eq_attr "mode" "V16DI") (symbol_ref "riscv_vector::get_ratio(E_V16DImode)")
+	 (eq_attr "mode" "V32DI") (symbol_ref "riscv_vector::get_ratio(E_V32DImode)")
+	 (eq_attr "mode" "V64DI") (symbol_ref "riscv_vector::get_ratio(E_V64DImode)")
+	 (eq_attr "mode" "V128DI") (symbol_ref "riscv_vector::get_ratio(E_V128DImode)")
+	 (eq_attr "mode" "V256DI") (symbol_ref "riscv_vector::get_ratio(E_V256DImode)")
+	 (eq_attr "mode" "V512DI") (symbol_ref "riscv_vector::get_ratio(E_V512DImode)")]
 	(const_int INVALID_ATTRIBUTE)))
 
 ;; The index of operand[] to get the merge op.
@@ -406,7 +499,12 @@
 
 ;; The avl type value.
 (define_attr "avl_type" ""
-  (cond [(eq_attr "type" "vlde,vldff,vste,vimov,vimov,vimov,vfmov,vext,vimerge,\
+  (cond [(eq_attr "mode" "V2QI,V4QI,V8QI,V16QI,V32QI,V64QI,V128QI,V256QI,V512QI,V1024QI,V2048QI,V4096QI,
+			  V2HI,V4HI,V8HI,V16HI,V32HI,V64HI,V128HI,V256HI,V512HI,V1024HI,V2048HI,
+			  V2SI,V4SI,V8SI,V16SI,V32SI,V64SI,V128SI,V256SI,V512SI,V1024SI,
+			  V2DI,V4DI,V8DI,V16DI,V32DI,V64DI,V128DI,V256DI,V512DI")
+	   (symbol_ref "riscv_vector::NONVLMAX")
+	(eq_attr "type" "vlde,vldff,vste,vimov,vimov,vimov,vfmov,vext,vimerge,\
 			  vfsqrt,vfrecp,vfmerge,vfcvtitof,vfcvtftoi,vfwcvtitof,\
 			  vfwcvtftoi,vfwcvtftof,vfncvtitof,vfncvtftoi,vfncvtftof,\
 			  vfclass,vired,viwred,vfredu,vfredo,vfwredu,vfwredo,\
@@ -8299,3 +8397,4 @@
 
 (include "autovec.md")
 (include "autovec-opt.md")
+(include "autovec-vls.md")
-- 
2.36.3


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] RISC-V: Add VLS modes for GNU vectors
  2023-06-18 23:06 [PATCH] RISC-V: Add VLS modes for GNU vectors Juzhe-Zhong
@ 2023-06-19 18:25 ` Jeff Law
  2023-06-19 21:45   ` 钟居哲
  0 siblings, 1 reply; 4+ messages in thread
From: Jeff Law @ 2023-06-19 18:25 UTC (permalink / raw)
  To: Juzhe-Zhong, gcc-patches; +Cc: kito.cheng, palmer, rdapp.gcc



On 6/18/23 17:06, Juzhe-Zhong wrote:
> This patch is a propsal patch is **NOT** ready to push since
> after this patch the total machine modes will exceed 255 which will create ICE
> in LTO:
>    internal compiler error: in bp_pack_int_in_range, at data-streamer.h:290
Right.  Note that an ack from Jakub or Richi will be sufficient for the 
LTO fixes to go forward.


> 
> The reason we need to add VLS modes for following reason:
> 1. Enhance GNU vectors codegen:
>     For example:
>       typedef int32_t vnx8si __attribute__ ((vector_size (32)));
> 
>       __attribute__ ((noipa)) void
>       f_vnx8si (int32_t * in, int32_t * out)
>       {
>         vnx8si v = *(vnx8si*)in;
>         *(vnx8si *) out = v;
>       }
> 
> compile option: --param=riscv-autovec-preference=scalable
>      before this patch:
>      f_vnx8si:
>          ld      a2,0(a0)
>          ld      a3,8(a0)
>          ld      a4,16(a0)
>          ld      a5,24(a0)
>          addi    sp,sp,-32
>          sd      a2,0(a1)
>          sd      a3,8(a1)
>          sd      a4,16(a1)
>          sd      a5,24(a1)
>          addi    sp,sp,32
>          jr      ra
> 
> After this patch:
>         f_vnx8si:
>          vsetivli        zero,8,e32,m2,ta,ma
>          vle32.v v2,0(a0)
>          vse32.v v2,0(a1)
>          ret
> 
> 2. Ehance VLA SLP:
> void
> f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c)
> {
>    for (int i = 0; i < 100; ++i)
>      {
>        a[i * 8] = b[i * 8] + c[i * 8];
>        a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1];
>        a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2];
>        a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3];
>        a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4];
>        a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5];
>        a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6];
>        a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7];
>      }
> }
> 
> 
> ..
> Loop body:
>   ...
>   vrgatherei16.vv...
>   ...
> 
> Tail:
>   lbu     a4,792(a1)
>          lbu     a5,792(a2)
>          addw    a5,a5,a4
>          sb      a5,792(a0)
>          lbu     a5,793(a2)
>          addw    a5,a5,a4
>          sb      a5,793(a0)
>          lbu     a4,794(a1)
>          lbu     a5,794(a2)
>          addw    a5,a5,a4
>          sb      a5,794(a0)
>          lbu     a5,795(a2)
>          addw    a5,a5,a4
>          sb      a5,795(a0)
>          lbu     a4,796(a1)
>          lbu     a5,796(a2)
>          addw    a5,a5,a4
>          sb      a5,796(a0)
>          lbu     a5,797(a2)
>          addw    a5,a5,a4
>          sb      a5,797(a0)
>          lbu     a4,798(a1)
>          lbu     a5,798(a2)
>          addw    a5,a5,a4
>          sb      a5,798(a0)
>          lbu     a5,799(a2)
>          addw    a5,a5,a4
>          sb      a5,799(a0)
>          ret
> 
> The tail elements need VLS modes to vectorize like ARM SVE:
> 
> f:
>          mov     x3, 0
>          cntb    x5
>          mov     x4, 792
>          whilelo p7.b, xzr, x4
> .L2:
>          ld1b    z31.b, p7/z, [x1, x3]
>          ld1b    z30.b, p7/z, [x2, x3]
>          trn1    z31.b, z31.b, z31.b
>          add     z31.b, z31.b, z30.b
>          st1b    z31.b, p7, [x0, x3]
>          add     x3, x3, x5
>          whilelo p7.b, x3, x4
>          b.any   .L2
> Tail:
>          ldr     b31, [x1, 792]
>          ldr     b27, [x1, 794]
>          ldr     b28, [x1, 796]
>          dup     v31.8b, v31.b[0]
>          ldr     b29, [x1, 798]
>          ldr     d30, [x2, 792]
>          ins     v31.b[2], v27.b[0]
>          ins     v31.b[3], v27.b[0]
>          ins     v31.b[4], v28.b[0]
>          ins     v31.b[5], v28.b[0]
>          ins     v31.b[6], v29.b[0]
>          ins     v31.b[7], v29.b[0]
>          add     v31.8b, v30.8b, v31.8b
>          str     d31, [x0, 792]
>          ret
> 
> Notice ARM SVE use ADVSIMD modes (Neon) to vectorize the tail.



> 
> gcc/ChangeLog:
> 
>          * config/riscv/riscv-modes.def (VECTOR_BOOL_MODE): Add VLS modes for GNU vectors.
>          (ADJUST_ALIGNMENT): Ditto.
>          (ADJUST_BYTESIZE): Ditto.
> 
>          (ADJUST_PRECISION): Ditto.
>          (VECTOR_MODES): Ditto.
>          * config/riscv/riscv-protos.h (riscv_v_ext_vls_mode_p): Ditto.
>          (get_regno_alignment): Ditto.
>          * config/riscv/riscv-v.cc (INCLUDE_ALGORITHM): Ditto.
>          (const_vlmax_p): Ditto.
>          (legitimize_move): Ditto.
>          (get_vlmul): Ditto.
>          (get_regno_alignment): Ditto.
>          (get_ratio): Ditto.
>          (get_vector_mode): Ditto.
>          * config/riscv/riscv-vector-switch.def (VLS_ENTRY): Ditto.
>          * config/riscv/riscv.cc (riscv_v_ext_vls_mode_p): Ditto.
>          (VLS_ENTRY): Ditto.
>          (riscv_v_ext_mode_p): Ditto.
>          (riscv_hard_regno_nregs): Ditto.
>          (riscv_hard_regno_mode_ok): Ditto.
>          * config/riscv/riscv.md: Ditto.
>          * config/riscv/vector-iterators.md: Ditto.
>          * config/riscv/vector.md: Ditto.
>          * config/riscv/autovec-vls.md: New file.
> 
> ---
So I expected we were going to have to define some static length 
patterns at some point.  So this isn't a huge surprise.






> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index 6421e933ca9..6fc1c433069 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -25,6 +25,7 @@
>      the vector.md.  */
>   #define RVV_INSN_OPERANDS_MAX 11
>   
> +#define INCLUDE_ALGORITHM
I must have missed something in this patch.  I didn't see anything 
obvious which needed INCLUDE_ALGORITHM.


>   /* Return true if it is either RVV vector mode or RVV tuple mode.  */
>   
>   static bool
>   riscv_v_ext_mode_p (machine_mode mode)
>   {
> -  return riscv_v_ext_vector_mode_p (mode) || riscv_v_ext_tuple_mode_p (mode);
> +  return riscv_v_ext_vector_mode_p (mode) || riscv_v_ext_tuple_mode_p (mode)
> +	 || riscv_v_ext_vls_mode_p (mode);
>   }
Formatting nit.  When you wrap a line, go ahead and add the parenthesis 
and indent the operator appropriately.

If you need INCLUDE_ALGORITHM, then fine, just assume I missed whatever 
caused you to need it.  If not, remove it.  Fix the formatting nit and 
this is good for the trunk.  Go ahead and post the V2, but consider it 
pre-approved.

jeff

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Re: [PATCH] RISC-V: Add VLS modes for GNU vectors
  2023-06-19 18:25 ` Jeff Law
@ 2023-06-19 21:45   ` 钟居哲
  2023-06-19 22:07     ` Jeff Law
  0 siblings, 1 reply; 4+ messages in thread
From: 钟居哲 @ 2023-06-19 21:45 UTC (permalink / raw)
  To: Jeff Law, gcc-patches; +Cc: kito.cheng, palmer, rdapp.gcc

[-- Attachment #1: Type: text/plain, Size: 6865 bytes --]

Hi, Jeff.
Thanks for comment.

I added INCLUDE_ALGORITHM since I use std::min.
Compilation failed when I didn't add INCLUDE_ALGORITHM.

Is INCLUDE_ALGORITHM so expensive that you don't want it?


juzhe.zhong@rivai.ai
 
From: Jeff Law
Date: 2023-06-20 02:25
To: Juzhe-Zhong; gcc-patches
CC: kito.cheng; palmer; rdapp.gcc
Subject: Re: [PATCH] RISC-V: Add VLS modes for GNU vectors
 
 
On 6/18/23 17:06, Juzhe-Zhong wrote:
> This patch is a propsal patch is **NOT** ready to push since
> after this patch the total machine modes will exceed 255 which will create ICE
> in LTO:
>    internal compiler error: in bp_pack_int_in_range, at data-streamer.h:290
Right.  Note that an ack from Jakub or Richi will be sufficient for the 
LTO fixes to go forward.
 
 
> 
> The reason we need to add VLS modes for following reason:
> 1. Enhance GNU vectors codegen:
>     For example:
>       typedef int32_t vnx8si __attribute__ ((vector_size (32)));
> 
>       __attribute__ ((noipa)) void
>       f_vnx8si (int32_t * in, int32_t * out)
>       {
>         vnx8si v = *(vnx8si*)in;
>         *(vnx8si *) out = v;
>       }
> 
> compile option: --param=riscv-autovec-preference=scalable
>      before this patch:
>      f_vnx8si:
>          ld      a2,0(a0)
>          ld      a3,8(a0)
>          ld      a4,16(a0)
>          ld      a5,24(a0)
>          addi    sp,sp,-32
>          sd      a2,0(a1)
>          sd      a3,8(a1)
>          sd      a4,16(a1)
>          sd      a5,24(a1)
>          addi    sp,sp,32
>          jr      ra
> 
> After this patch:
>         f_vnx8si:
>          vsetivli        zero,8,e32,m2,ta,ma
>          vle32.v v2,0(a0)
>          vse32.v v2,0(a1)
>          ret
> 
> 2. Ehance VLA SLP:
> void
> f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c)
> {
>    for (int i = 0; i < 100; ++i)
>      {
>        a[i * 8] = b[i * 8] + c[i * 8];
>        a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1];
>        a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2];
>        a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3];
>        a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4];
>        a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5];
>        a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6];
>        a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7];
>      }
> }
> 
> 
> ..
> Loop body:
>   ...
>   vrgatherei16.vv...
>   ...
> 
> Tail:
>   lbu     a4,792(a1)
>          lbu     a5,792(a2)
>          addw    a5,a5,a4
>          sb      a5,792(a0)
>          lbu     a5,793(a2)
>          addw    a5,a5,a4
>          sb      a5,793(a0)
>          lbu     a4,794(a1)
>          lbu     a5,794(a2)
>          addw    a5,a5,a4
>          sb      a5,794(a0)
>          lbu     a5,795(a2)
>          addw    a5,a5,a4
>          sb      a5,795(a0)
>          lbu     a4,796(a1)
>          lbu     a5,796(a2)
>          addw    a5,a5,a4
>          sb      a5,796(a0)
>          lbu     a5,797(a2)
>          addw    a5,a5,a4
>          sb      a5,797(a0)
>          lbu     a4,798(a1)
>          lbu     a5,798(a2)
>          addw    a5,a5,a4
>          sb      a5,798(a0)
>          lbu     a5,799(a2)
>          addw    a5,a5,a4
>          sb      a5,799(a0)
>          ret
> 
> The tail elements need VLS modes to vectorize like ARM SVE:
> 
> f:
>          mov     x3, 0
>          cntb    x5
>          mov     x4, 792
>          whilelo p7.b, xzr, x4
> .L2:
>          ld1b    z31.b, p7/z, [x1, x3]
>          ld1b    z30.b, p7/z, [x2, x3]
>          trn1    z31.b, z31.b, z31.b
>          add     z31.b, z31.b, z30.b
>          st1b    z31.b, p7, [x0, x3]
>          add     x3, x3, x5
>          whilelo p7.b, x3, x4
>          b.any   .L2
> Tail:
>          ldr     b31, [x1, 792]
>          ldr     b27, [x1, 794]
>          ldr     b28, [x1, 796]
>          dup     v31.8b, v31.b[0]
>          ldr     b29, [x1, 798]
>          ldr     d30, [x2, 792]
>          ins     v31.b[2], v27.b[0]
>          ins     v31.b[3], v27.b[0]
>          ins     v31.b[4], v28.b[0]
>          ins     v31.b[5], v28.b[0]
>          ins     v31.b[6], v29.b[0]
>          ins     v31.b[7], v29.b[0]
>          add     v31.8b, v30.8b, v31.8b
>          str     d31, [x0, 792]
>          ret
> 
> Notice ARM SVE use ADVSIMD modes (Neon) to vectorize the tail.
 
 
 
> 
> gcc/ChangeLog:
> 
>          * config/riscv/riscv-modes.def (VECTOR_BOOL_MODE): Add VLS modes for GNU vectors.
>          (ADJUST_ALIGNMENT): Ditto.
>          (ADJUST_BYTESIZE): Ditto.
> 
>          (ADJUST_PRECISION): Ditto.
>          (VECTOR_MODES): Ditto.
>          * config/riscv/riscv-protos.h (riscv_v_ext_vls_mode_p): Ditto.
>          (get_regno_alignment): Ditto.
>          * config/riscv/riscv-v.cc (INCLUDE_ALGORITHM): Ditto.
>          (const_vlmax_p): Ditto.
>          (legitimize_move): Ditto.
>          (get_vlmul): Ditto.
>          (get_regno_alignment): Ditto.
>          (get_ratio): Ditto.
>          (get_vector_mode): Ditto.
>          * config/riscv/riscv-vector-switch.def (VLS_ENTRY): Ditto.
>          * config/riscv/riscv.cc (riscv_v_ext_vls_mode_p): Ditto.
>          (VLS_ENTRY): Ditto.
>          (riscv_v_ext_mode_p): Ditto.
>          (riscv_hard_regno_nregs): Ditto.
>          (riscv_hard_regno_mode_ok): Ditto.
>          * config/riscv/riscv.md: Ditto.
>          * config/riscv/vector-iterators.md: Ditto.
>          * config/riscv/vector.md: Ditto.
>          * config/riscv/autovec-vls.md: New file.
> 
> ---
So I expected we were going to have to define some static length 
patterns at some point.  So this isn't a huge surprise.
 
 
 
 
 
 
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index 6421e933ca9..6fc1c433069 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -25,6 +25,7 @@
>      the vector.md.  */
>   #define RVV_INSN_OPERANDS_MAX 11
>   
> +#define INCLUDE_ALGORITHM
I must have missed something in this patch.  I didn't see anything 
obvious which needed INCLUDE_ALGORITHM.
 
 
>   /* Return true if it is either RVV vector mode or RVV tuple mode.  */
>   
>   static bool
>   riscv_v_ext_mode_p (machine_mode mode)
>   {
> -  return riscv_v_ext_vector_mode_p (mode) || riscv_v_ext_tuple_mode_p (mode);
> +  return riscv_v_ext_vector_mode_p (mode) || riscv_v_ext_tuple_mode_p (mode)
> + || riscv_v_ext_vls_mode_p (mode);
>   }
Formatting nit.  When you wrap a line, go ahead and add the parenthesis 
and indent the operator appropriately.
 
If you need INCLUDE_ALGORITHM, then fine, just assume I missed whatever 
caused you to need it.  If not remove it.   Fix the formatting nit and 
this is good for the trunk.  Go ahead and post the V2, but consider it 
pre-approved.
 
jeff
 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] RISC-V: Add VLS modes for GNU vectors
  2023-06-19 21:45   ` 钟居哲
@ 2023-06-19 22:07     ` Jeff Law
  0 siblings, 0 replies; 4+ messages in thread
From: Jeff Law @ 2023-06-19 22:07 UTC (permalink / raw)
  To: 钟居哲, gcc-patches; +Cc: kito.cheng, palmer, rdapp.gcc



On 6/19/23 15:45, 钟居哲 wrote:
> Hi, Jeff.
> Thanks for comment.
> 
> I add INCLUDE_ALGORITHM since I use std:min.
> I failed to compile when I didn't add INCLUDE_ALGORITHM.
> 
> Is INCLUDE_ALGORITHM expensive that you don't want it?

It just stood out as unexpected.  There are no concerns with std::min and 
the like.

Jeff


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-06-19 22:07 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-18 23:06 [PATCH] RISC-V: Add VLS modes for GNU vectors Juzhe-Zhong
2023-06-19 18:25 ` Jeff Law
2023-06-19 21:45   ` 钟居哲
2023-06-19 22:07     ` Jeff Law

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).