Could you test it  v4096qi ?
Also, VLS modes need to be tested.



juzhe.zhong@rivai.ai
 
From: pan2.li
Date: 2023-11-15 11:48
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v1] RISC-V: Refine the mask generation for vec_init case 2
From: Pan Li <pan2.li@intel.com>
 
We take vec_init element int mode when generate the mask for
case 2. But actually we don't need as many bits as the element.
The extra bigger mode may introduce some unnecessary insns.
For example as below code:
 
typedef int64_t v16di __attribute__ ((vector_size (16 * 8)));
 
void __attribute__ ((noinline, noclone))
  foo (int64_t *out, int64_t x, int64_t y)
{
  v16di v = {y, x, y, x, y, x, y, x, y, x, y, x, y, x, y, x};
  *(v16di *) out = v;
}
 
We will have VDImode when generate the 0b0101010101010101 mask but
actually VHImode is good enough here. This patch would like to
refine the mask generation to avoid:
1. Unnecessary scalar to generate big constant mask.
2. Unnecessary vector insn to v0 mask.
 
Before this patch:
foo:
  li      a5,-1431654400
  li      a4,-1431654400               <== unnecessary insn
  addi    a5,a5,-1365                  <== unnecessary insn
  addi    a4,a4,-1366
  slli    a5,a5,32                     <== unnecessary insn
  add     a5,a5,a4                     <== unnecessary insn
  vsetivli        zero,16,e64,m8,ta,ma
  vmv.v.x v8,a2
  vmv.s.x v16,a5
  vmv1r.v v0,v16                       <== unnecessary insn
  vmerge.vxm      v8,v8,a1,v0
  vse64.v v8,0(a0)
  ret
 
After this patch:
foo:
  li      a5,-20480
  addiw   a5,a5,-1366
  vsetivli        zero,16,e64,m8,ta,ma
  vmv.s.x v0,a5
  vmv.v.x v8,a2
  vmerge.vxm      v8,v8,a1,v0
  vs8r.v  v8,0(a0)
  ret
 
gcc/ChangeLog:
 
* config/riscv/riscv-v.cc (rvv_builder::get_merge_scalar_mask):
Add inner_mode mask arg for mask int mode.
(get_repeating_sequence_dup_machine_mode): Add mask_bit_mode arg
to get the good enough vector int mode on precision.
(expand_vector_init_merge_repeating_sequence): Pass required args
to above func.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-10.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-11.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-6.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-7.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-8.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-9.c: New test.
 
Signed-off-by: Pan Li <pan2.li@intel.com>
---
gcc/config/riscv/riscv-v.cc                   | 54 ++++++++++++++-----
.../vls-vlmax/init-repeat-sequence-10.c       | 28 ++++++++++
.../vls-vlmax/init-repeat-sequence-11.c       | 26 +++++++++
.../vls-vlmax/init-repeat-sequence-6.c        | 27 ++++++++++
.../vls-vlmax/init-repeat-sequence-7.c        | 25 +++++++++
.../vls-vlmax/init-repeat-sequence-8.c        | 27 ++++++++++
.../vls-vlmax/init-repeat-sequence-9.c        | 25 +++++++++
7 files changed, 200 insertions(+), 12 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-10.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-11.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-6.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-7.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-8.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-9.c
 
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 265a298f447..ffb645eccf3 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -416,7 +416,7 @@ public:
   bool repeating_sequence_use_merge_profitable_p ();
   bool combine_sequence_use_slideup_profitable_p ();
   bool combine_sequence_use_merge_profitable_p ();
-  rtx get_merge_scalar_mask (unsigned int) const;
+  rtx get_merge_scalar_mask (unsigned int, machine_mode) const;
   bool single_step_npatterns_p () const;
   bool npatterns_all_equal_p () const;
@@ -592,7 +592,8 @@ rvv_builder::get_merged_repeating_sequence ()
    To merge "b", the mask should be 0101....
*/
rtx
-rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern) const
+rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern,
+     machine_mode inner_mode) const
{
   unsigned HOST_WIDE_INT mask = 0;
   unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
@@ -611,7 +612,7 @@ rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern) const
   for (int i = 0; i < limit; i++)
     mask |= base_mask << (i * npatterns ());
-  return gen_int_mode (mask, inner_int_mode ());
+  return gen_int_mode (mask, inner_mode);
}
/* Return true if the variable-length vector is single step.
@@ -919,17 +920,45 @@ emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
/* Emit merge instruction.  */
static machine_mode
-get_repeating_sequence_dup_machine_mode (const rvv_builder &builder)
+get_repeating_sequence_dup_machine_mode (const rvv_builder &builder,
+ machine_mode mask_bit_mode)
{
-  poly_uint64 dup_nunits = GET_MODE_NUNITS (builder.mode ());
+  unsigned mask_precision = GET_MODE_PRECISION (mask_bit_mode).to_constant ();
+  unsigned mask_scalar_size = mask_precision > builder.inner_bits_size ()
+    ? builder.inner_bits_size () : mask_precision;
-  if (known_ge (GET_MODE_SIZE (builder.mode ()), BYTES_PER_RISCV_VECTOR))
+  scalar_mode inner_mode;
+  unsigned minimal_bits_size;
+
+  switch (mask_scalar_size)
     {
-      dup_nunits = exact_div (BYTES_PER_RISCV_VECTOR,
- builder.inner_bytes_size ());
+      case 8:
+ inner_mode = QImode;
+ minimal_bits_size = TARGET_MIN_VLEN / 8; /* AKA RVVMF8.  */
+ break;
+      case 16:
+ inner_mode = HImode;
+ minimal_bits_size = TARGET_MIN_VLEN / 4; /* AKA RVVMF4.  */
+ break;
+      case 32:
+ inner_mode = SImode;
+ minimal_bits_size = TARGET_MIN_VLEN / 2; /* AKA RVVMF2.  */
+ break;
+      case 64:
+ inner_mode = DImode;
+ minimal_bits_size = TARGET_MIN_VLEN / 1; /* AKA RVVM1.  */
+ break;
+      default:
+ gcc_unreachable ();
+ break;
     }
-  return get_vector_mode (builder.inner_int_mode (), dup_nunits).require ();
+  gcc_assert (mask_precision % mask_scalar_size == 0);
+
+  uint64_t dup_nunit = mask_precision > mask_scalar_size
+    ? mask_precision / mask_scalar_size : minimal_bits_size / mask_scalar_size;
+
+  return get_vector_mode (inner_mode, dup_nunit).require ();
}
/* Expand series const vector.  */
@@ -2130,9 +2159,9 @@ expand_vector_init_merge_repeating_sequence (rtx target,
      since we don't have such instruction in RVV.
      Instead, we should use INT mode (QI/HI/SI/DI) with integer move
      instruction to generate the mask data we want.  */
-  machine_mode mask_int_mode
-    = get_repeating_sequence_dup_machine_mode (builder);
   machine_mode mask_bit_mode = get_mask_mode (builder.mode ());
+  machine_mode mask_int_mode
+    = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode);
   uint64_t full_nelts = builder.full_nelts ().to_constant ();
   /* Step 1: Broadcast the first pattern.  */
@@ -2143,7 +2172,8 @@ expand_vector_init_merge_repeating_sequence (rtx target,
   for (unsigned int i = 1; i < builder.npatterns (); i++)
     {
       /* Step 2-1: Generate mask register v0 for each merge.  */
-      rtx merge_mask = builder.get_merge_scalar_mask (i);
+      rtx merge_mask
+ = builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode));
       rtx mask = gen_reg_rtx (mask_bit_mode);
       rtx dup = gen_reg_rtx (mask_int_mode);
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-10.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-10.c
new file mode 100644
index 00000000000..ccce5052dc2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-10.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+
+typedef int64_t vnx32di __attribute__ ((vector_size (32 * 8)));
+
+/*
+** f_vnx32di:
+**   vsetvli\s+[axt][0-9]+,\s*zero,\s*e64,\s*m2,\s*ta,\s*ma
+**   ...
+**   vmv\.v\.x\s+v[0-9]+,\s*[axt][0-9]+
+**   ...
+**   vmv\.s\.x\s+v0,\s*[axt][0-9]+
+**   vmerge\.vxm\s+v[0-9]+,\s*v[0-9]+,\s*[axt][0-9]+,\s*v0
+**   vs2r\.v\s+v[0-9]+,\s*0\([axt][0-9]+\)
+**   ret
+*/
+__attribute__ ((noipa)) void
+f_vnx32di (int64_t a, int64_t b, int64_t *out)
+{
+  vnx32di v = {
+    a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+    a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+  };
+  *(vnx32di *) out = v;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-11.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-11.c
new file mode 100644
index 00000000000..a62eee8a5ae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-11.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef double vnx32df __attribute__ ((vector_size (32 * 8)));
+
+/*
+** f_vnx32df:
+**   vsetvli\s+[axt][0-9]+\s*,zero,\s*e64,\s*m2,\s*ta,\s*ma
+**   ...
+**   vfmv\.v\.f\s+v[0-9]+,\s*[af]+[0-9]+
+**   ...
+**   vmv\.s\.x\s+v0,\s*[axt][0-9]+
+**   vfmerge\.vfm\s+v[0-9]+,\s*v[0-9]+,\s*[af]+[0-9]+,\s*v0
+**   vs2r\.v\s+v[0-9]+,\s*0\([axt][0-9]+\)
+**   ret
+*/
+__attribute__ ((noipa)) void
+f_vnx32df (double a, double b, double *out)
+{
+  vnx32df v = {
+    a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+    a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+  };
+  *(vnx32df *) out = v;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-6.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-6.c
new file mode 100644
index 00000000000..4f8a78b3161
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-6.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+
+typedef int64_t vnx16di __attribute__ ((vector_size (16 * 8)));
+
+/*
+** f_vnx16di:
+**   vsetivli\s+zero,\s*16,\s*e64,\s*m8,\s*ta,\s*ma
+**   ...
+**   vmv\.v\.x\s+v[0-9]+,\s*[axt][0-9]+
+**   ...
+**   vmv\.s\.x\s+v0,\s*[axt][0-9]+
+**   vmerge\.vxm\s+v[0-9]+,\s*v[0-9]+,\s*[axt][0-9]+,\s*v0
+**   vs8r\.v\s+v[0-9]+,\s*0\([axt][0-9]+\)
+**   ret
+*/
+__attribute__ ((noipa)) void
+f_vnx16di (int64_t a, int64_t b, int64_t *out)
+{
+  vnx16di v = {
+    a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+  };
+  *(vnx16di *) out = v;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-7.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-7.c
new file mode 100644
index 00000000000..f0d14db8fa8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-7.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef double vnx16df __attribute__ ((vector_size (16 * 8)));
+
+/*
+** f_vnx16df:
+**   vsetivli\s+zero,\s*16,\s*e64,\s*m8,\s*ta,\s*ma
+**   ...
+**   vfmv\.v\.f\s+v[0-9]+,\s*[af]+[0-9]+
+**   ...
+**   vmv\.s\.x\s+v0,\s*[axt][0-9]+
+**   vfmerge\.vfm\s+v[0-9]+,\s*v[0-9]+,\s*[af]+[0-9]+,\s*v0
+**   vs8r\.v\s+v[0-9]+,\s*0\([axt][0-9]+\)
+**   ret
+*/
+__attribute__ ((noipa)) void
+f_vnx16df (double a, double b, double *out)
+{
+  vnx16df v = {
+    a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+  };
+  *(vnx16df *) out = v;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-8.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-8.c
new file mode 100644
index 00000000000..fd986e6b649
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-8.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+
+typedef int64_t vnx16di __attribute__ ((vector_size (16 * 8)));
+
+/*
+** f_vnx16di:
+**   vsetivli\s+zero,\s*16,\s*e64,\s*m1,\s*ta,\s*ma
+**   ...
+**   vmv\.v\.x\s+v[0-9]+,\s*[axt][0-9]+
+**   ...
+**   vmv\.s\.x\s+v0,\s*[axt][0-9]+
+**   vmerge\.vxm\s+v[0-9]+,\s*v[0-9]+,\s*[axt][0-9]+,\s*v0
+**   vs1r\.v\s+v[0-9]+,\s*0\([axt][0-9]+\)
+**   ret
+*/
+__attribute__ ((noipa)) void
+f_vnx16di (int64_t a, int64_t b, int64_t *out)
+{
+  vnx16di v = {
+    a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+  };
+  *(vnx16di *) out = v;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-9.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-9.c
new file mode 100644
index 00000000000..753221ffdbf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-9.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef double vnx16df __attribute__ ((vector_size (16 * 8)));
+
+/*
+** f_vnx16df:
+**   vsetivli\s+zero,\s*16,\s*e64,\s*m1,\s*ta,\s*ma
+**   ...
+**   vfmv\.v\.f\s+v[0-9]+,\s*[af]+[0-9]+
+**   ...
+**   vmv\.s\.x\s+v0,\s*[axt][0-9]+
+**   vfmerge\.vfm\s+v[0-9]+,\s*v[0-9]+,\s*[af]+[0-9]+,\s*v0
+**   vs1r\.v\s+v[0-9]+,\s*0\([axt][0-9]+\)
+**   ret
+*/
+__attribute__ ((noipa)) void
+f_vnx16df (double a, double b, double *out)
+{
+  vnx16df v = {
+    a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+  };
+  *(vnx16df *) out = v;
+}
-- 
2.34.1