public inbox for gcc-patches@gcc.gnu.org
* [PATCH] RISC-V: Optimize permutation codegen with vcompress
@ 2023-07-11  6:38 juzhe.zhong
  2023-07-11 12:17 ` Robin Dapp
  2023-07-11 23:18 ` Jeff Law
  0 siblings, 2 replies; 8+ messages in thread
From: juzhe.zhong @ 2023-07-11  6:38 UTC (permalink / raw)
  To: gcc-patches; +Cc: kito.cheng, kito.cheng, jeffreyalaw, rdapp.gcc, Ju-Zhe Zhong

From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

This patch recognizes specific permutation patterns to which the compress approach can be applied.

Consider the following case:
#include <stdint.h>
typedef int8_t vnx64i __attribute__ ((vector_size (64)));
#define MASK_64                                                                \
  1, 2, 3, 5, 7, 9, 10, 11, 12, 14, 15, 17, 19, 21, 22, 23, 26, 28, 30, 31,    \
    37, 38, 41, 46, 47, 53, 54, 55, 60, 61, 62, 63, 76, 77, 78, 79, 80, 81,    \
    82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,    \
    100, 101, 102, 103, 104, 105, 106, 107
void __attribute__ ((noinline, noclone)) test_1 (int8_t *x, int8_t *y, int8_t *out)
{
  vnx64i v1 = *(vnx64i*)x;
  vnx64i v2 = *(vnx64i*)y;
  vnx64i v3 = __builtin_shufflevector (v1, v2, MASK_64);
  *(vnx64i*)out = v3;
}

https://godbolt.org/z/P33nev6cW

Before this patch:
        lui     a4,%hi(.LANCHOR0)
        addi    a4,a4,%lo(.LANCHOR0)
        vl4re8.v        v4,0(a4)
        li      a4,64
        vsetvli a5,zero,e8,m4,ta,mu
        vl4re8.v        v20,0(a0)
        vl4re8.v        v16,0(a1)
        vmv.v.x v12,a4
        vrgather.vv     v8,v20,v4
        vmsgeu.vv       v0,v4,v12
        vsub.vv v4,v4,v12
        vrgather.vv     v8,v16,v4,v0.t
        vs4r.v  v8,0(a2)
        ret

After this patch:
	lui	a4,%hi(.LANCHOR0)
	addi	a4,a4,%lo(.LANCHOR0)
	vsetvli	a5,zero,e8,m4,ta,ma
	vl4re8.v	v12,0(a1)
	vl4re8.v	v8,0(a0)
	vlm.v	v0,0(a4)
	vslideup.vi	v4,v12,20
	vcompress.vm	v4,v8,v0
	vs4r.v	v4,0(a2)
	ret
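
For reference, the transform can be modeled in scalar C roughly as below.
This is only an illustrative sketch; the helper name, the fixed N and the
element type are made up for the example and do not appear in the patch.
It assumes SEL already matches the recognized shape: the low part of SEL
picks increasing indices from OP0 and the tail is a consecutive run of
indices into OP1.

    #define N 4
    typedef int elem;

    /* Hypothetical scalar model of the generated code: slide OP1 up so
       that its selected tail run ends in the last lane, then compress the
       OP0 elements named by the low part of SEL into the low lanes.  */
    static void
    ref_compress_shuffle (const elem op0[N], const elem op1[N],
                          const int sel[N], elem out[N])
    {
      /* Compress point: first selector index that refers to OP1.  */
      int cp = 0;
      while (cp < N && sel[cp] < N)
        cp++;

      /* vslideup: OP1[sel[N - 1] - N] must land in lane N - 1.  */
      int slideup = N - (sel[N - 1] % N) - 1;
      for (int i = N - 1; i >= slideup; i--)
        out[i] = op1[i - slideup];

      /* vcompress: pack the OP0 elements picked by sel[0..cp-1] into the
         low lanes; the mask is 1 exactly for those OP0 indices.  */
      for (int i = 0; i < cp; i++)
        out[i] = op0[sel[i]];
    }

For a selector like { 0, 2, 6, 7 } this needs no slide-up; for
{ 0, 2, 5, 6 } op1 is first slid up by one lane.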

gcc/ChangeLog:

        * config/riscv/riscv-protos.h (enum insn_type): Add vcompress optimization.
        * config/riscv/riscv-v.cc (emit_vlmax_compress_insn): Ditto.
        (shuffle_compress_patterns): Ditto.
        (expand_vec_perm_const_1): Ditto.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-1.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-2.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-3.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-4.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-5.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-6.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-1.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-2.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-3.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-4.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-5.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-6.c: New test.

---
 gcc/config/riscv/riscv-protos.h               |   1 +
 gcc/config/riscv/riscv-v.cc                   | 156 ++++++++++++
 .../riscv/rvv/autovec/vls-vlmax/compress-1.c  |  21 ++
 .../riscv/rvv/autovec/vls-vlmax/compress-2.c  |  46 ++++
 .../riscv/rvv/autovec/vls-vlmax/compress-3.c  |  60 +++++
 .../riscv/rvv/autovec/vls-vlmax/compress-4.c  |  81 +++++++
 .../riscv/rvv/autovec/vls-vlmax/compress-5.c  |  85 +++++++
 .../riscv/rvv/autovec/vls-vlmax/compress-6.c  |  95 ++++++++
 .../rvv/autovec/vls-vlmax/compress_run-1.c    |  27 +++
 .../rvv/autovec/vls-vlmax/compress_run-2.c    |  51 ++++
 .../rvv/autovec/vls-vlmax/compress_run-3.c    |  79 +++++++
 .../rvv/autovec/vls-vlmax/compress_run-4.c    | 117 +++++++++
 .../rvv/autovec/vls-vlmax/compress_run-5.c    | 149 ++++++++++++
 .../rvv/autovec/vls-vlmax/compress_run-6.c    | 222 ++++++++++++++++++
 14 files changed, 1190 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-4.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-5.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-6.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-4.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-5.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-6.c

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 5766e3597e8..6cd5c6639c9 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -148,6 +148,7 @@ enum insn_type
   RVV_WIDEN_TERNOP = 4,
   RVV_SCALAR_MOV_OP = 4, /* +1 for VUNDEF according to vector.md.  */
   RVV_SLIDE_OP = 4,      /* Dest, VUNDEF, source and offset.  */
+  RVV_COMPRESS_OP = 4,
 };
 enum vlmul_type
 {
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 8d5bed7ebe4..1a61f9a18cd 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -1040,6 +1040,24 @@ emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
   emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
 }
 
+/* Emit compress instruction.  */
+static void
+emit_vlmax_compress_insn (unsigned icode, rtx *ops)
+{
+  machine_mode dest_mode = GET_MODE (ops[0]);
+  machine_mode mask_mode = get_mask_mode (dest_mode).require ();
+  insn_expander<RVV_INSN_OPERANDS_MAX> e (RVV_COMPRESS_OP,
+					  /* HAS_DEST_P */ true,
+					  /* FULLY_UNMASKED_P */ false,
+					  /* USE_REAL_MERGE_P */ true,
+					  /* HAS_AVL_P */ true,
+					  /* VLMAX_P */ true, dest_mode,
+					  mask_mode);
+
+  e.set_policy (TAIL_ANY);
+  e.emit_insn ((enum insn_code) icode, ops);
+}
+
 /* Emit merge instruction.  */
 
 static machine_mode
@@ -2573,6 +2591,142 @@ shuffle_merge_patterns (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize the patterns where we can use the compress operation to shuffle
+   the vectors.  The perm selector of a compress pattern is divided into 2 parts:
+   the first part consists of arbitrary index values < NUNITS,
+   the second part is a trailing run of consecutive index values >= NUNITS.
+
+   E.g.
+   v = VEC_PERM_EXPR (v0, v1, selector),
+   selector = { 0, 2, 6, 7 }
+
+   We can transform such pattern into:
+
+   op1 = vcompress (op0, mask)
+   mask = { 1, 0, 1, 0 }
+   v = op1.  */
+
+static bool
+shuffle_compress_patterns (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  poly_int64 vec_len = d->perm.length ();
+
+  if (!vec_len.is_constant ())
+    return false;
+
+  int vlen = vec_len.to_constant ();
+
+  /* The compress pattern isn't worthwhile with fewer than 4 elements,
+     and we can't modulo the indices for the compress pattern.  */
+  if (known_ge (d->perm[vlen - 1], vlen * 2) || vlen < 4)
+    return false;
+
+  /* Compress pattern doesn't work for one vector.  */
+  if (d->one_vector_p)
+    return false;
+
+  /* The compress point is the position such that, for every selector index
+     i >= compress point, the selector values form a consecutive increasing
+     series and each value is >= NUNITS.  In this case, we can compress all
+     elements with index i < compress point into op1.  */
+  int compress_point = -1;
+  for (int i = 0; i < vlen; i++)
+    {
+      if (compress_point < 0 && known_ge (d->perm[i], vec_len))
+	{
+	  compress_point = i;
+	  break;
+	}
+    }
+
+  /* We don't apply the compress approach if we can't find the compress point.  */
+  if (compress_point < 0)
+    return false;
+
+  /* The selector must be a consecutive increasing series from the compress point.  */
+  if (!d->perm.series_p (compress_point, 1, d->perm[compress_point], 1))
+    return false;
+
+  /* We can only apply the compress approach when all index values from 0 to
+     the compress point are increasing.  */
+  for (int i = 1; i < compress_point; i++)
+    if (known_le (d->perm[i], d->perm[i - 1]))
+      return false;
+
+  /* Check whether we need to slide op1 up to apply the compress approach.
+
+       E.g. For index = { 0, 2, 6, 7 }, d->perm[vlen - 1] = 7, which is
+	    2 * NUNITS - 1, so we don't need to slide up.
+
+	    For index = { 0, 2, 5, 6 }, we need to slide op1 up before
+	    we apply the compress approach.  */
+  bool need_slideup_p = maybe_ne (d->perm[vlen - 1], 2 * vec_len - 1);
+
+  /* If we leave it to be handled directly by the general gather,
+     the code sequence will be:
+	VECTOR LOAD  selector
+	GEU          mask, selector, NUNITS
+	GATHER       dest, op0, selector
+	SUB          selector, selector, NUNITS
+	GATHER       dest, op1, selector, mask
+     Each ALU operation is considered COST = 1 and each VECTOR LOAD is
+     considered COST = 4, so the general gather handling has COST = 9.
+     TODO: This cost is not accurate; we could adjust it via tune info.  */
+  int general_cost = 9;
+
+  /* If we can use the compress approach, the code sequence will be:
+	MASK LOAD    mask
+	COMPRESS     op1, op0, mask
+     If it needs slide up, it will be:
+	MASK LOAD    mask
+	SLIDEUP      op1
+	COMPRESS     op1, op0, mask
+     By default, mask load COST = 2.
+     TODO: This cost is not accurate; we could adjust it via tune info.  */
+  int compress_cost = 4;
+
+  if (general_cost <= compress_cost)
+    return false;
+
+  /* Build a mask that is true for each op0 element the selector picks.  */
+  machine_mode mask_mode = get_mask_mode (vmode).require ();
+  rvv_builder builder (mask_mode, vlen, 1);
+  for (int i = 0; i < vlen; i++)
+    {
+      bool is_compress_index = false;
+      for (int j = 0; j < compress_point; j++)
+	{
+	  if (known_eq (d->perm[j], i))
+	    {
+	      is_compress_index = true;
+	      break;
+	    }
+	}
+      if (is_compress_index)
+	builder.quick_push (CONST1_RTX (BImode));
+      else
+	builder.quick_push (CONST0_RTX (BImode));
+    }
+  rtx mask = force_reg (mask_mode, builder.build ());
+
+  rtx merge = d->op1;
+  if (need_slideup_p)
+    {
+      int slideup_cnt = vlen - (d->perm[vlen - 1].to_constant () % vlen) - 1;
+      rtx ops[] = {d->target, RVV_VUNDEF (vmode), d->op1,
+		   gen_int_mode (slideup_cnt, Pmode)};
+      insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
+      emit_vlmax_slide_insn (icode, ops);
+      merge = d->target;
+    }
+
+  insn_code icode = code_for_pred_compress (vmode);
+  rtx ops[] = {d->target, merge, d->op0, mask};
+  emit_vlmax_compress_insn (icode, ops);
+  return true;
+}
+
 /* Recognize decompress patterns:
 
    1. VEC_PERM_EXPR op0 and op1
@@ -2696,6 +2850,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 	{
 	  if (shuffle_merge_patterns (d))
 	    return true;
+	  if (shuffle_compress_patterns (d))
+	    return true;
 	  if (shuffle_decompress_patterns (d))
 	    return true;
 	  if (shuffle_generic_patterns (d))
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-1.c
new file mode 100644
index 00000000000..5983757dfd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-1.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <stdint-gcc.h>
+
+typedef int8_t vnx4i __attribute__ ((vector_size (4)));
+typedef uint8_t vnx4ui __attribute__ ((vector_size (4)));
+
+#define MASK_4  0, 2, 6, 7 
+
+vnx4i __attribute__ ((noinline, noclone)) test_1 (vnx4i x, vnx4i y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+vnx4ui __attribute__ ((noinline, noclone)) test_2 (vnx4ui x, vnx4ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+/* { dg-final { scan-assembler-times {\tvcompress\.vm} 2 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-2.c
new file mode 100644
index 00000000000..c6cd7bb895e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-2.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <stdint-gcc.h>
+
+typedef int8_t vnx8i __attribute__ ((vector_size (8)));
+typedef int16_t vnx4i __attribute__ ((vector_size (8)));
+typedef uint8_t vnx8ui __attribute__ ((vector_size (8)));
+typedef uint16_t vnx4ui __attribute__ ((vector_size (8)));
+typedef _Float16 vnx4f __attribute__ ((vector_size (8)));
+
+#define MASK_4 1, 3, 6, 7
+#define MASK_8 2, 3, 5, 6, 11, 12, 13, 14
+
+vnx8i __attribute__ ((noinline, noclone))
+test_1 (vnx8i x, vnx8i y)
+{
+  return __builtin_shufflevector (x, y, MASK_8);
+}
+
+vnx4i __attribute__ ((noinline, noclone))
+test_2 (vnx4i x, vnx4i y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+vnx8ui __attribute__ ((noinline, noclone))
+test_3 (vnx8ui x, vnx8ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_8);
+}
+
+vnx4ui __attribute__ ((noinline, noclone))
+test_4 (vnx4ui x, vnx4ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+vnx4f __attribute__ ((noinline, noclone))
+test_5 (vnx4f x, vnx4f y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+/* { dg-final { scan-assembler-times {\tvcompress\.vm} 5 } } */
+/* { dg-final { scan-assembler-times {\tvslideup} 2 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-3.c
new file mode 100644
index 00000000000..0fc2cefe5a7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-3.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <stdint-gcc.h>
+
+typedef int8_t vnx16i __attribute__ ((vector_size (16)));
+typedef int16_t vnx8i __attribute__ ((vector_size (16)));
+typedef int32_t vnx4i __attribute__ ((vector_size (16)));
+typedef uint8_t vnx16ui __attribute__ ((vector_size (16)));
+typedef uint16_t vnx8ui __attribute__ ((vector_size (16)));
+typedef uint32_t vnx4ui __attribute__ ((vector_size (16)));
+typedef _Float16 vnx8f __attribute__ ((vector_size (16)));
+typedef float vnx4f __attribute__ ((vector_size (16)));
+
+#define MASK_4 0, 2, 5, 6
+#define MASK_8 0, 1, 2, 5, 10, 11, 12, 13
+#define MASK_16 1, 3, 4, 6, 7, 8, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27
+
+vnx16i __attribute__ ((noinline, noclone)) test_1 (vnx16i x, vnx16i y)
+{
+  return __builtin_shufflevector (x, y, MASK_16);
+}
+
+vnx8i __attribute__ ((noinline, noclone)) test_2 (vnx8i x, vnx8i y)
+{
+  return __builtin_shufflevector (x, y, MASK_8);
+}
+
+vnx4i __attribute__ ((noinline, noclone)) test_3 (vnx4i x, vnx4i y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+vnx16ui __attribute__ ((noinline, noclone)) test_4 (vnx16ui x, vnx16ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_16);
+}
+
+vnx8ui __attribute__ ((noinline, noclone)) test_5 (vnx8ui x, vnx8ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_8);
+}
+
+vnx4ui __attribute__ ((noinline, noclone)) test_6 (vnx4ui x, vnx4ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+vnx8f __attribute__ ((noinline, noclone)) test_7 (vnx8f x, vnx8f y)
+{
+  return __builtin_shufflevector (x, y, MASK_8);
+}
+
+vnx4f __attribute__ ((noinline, noclone)) test_8 (vnx4f x, vnx4f y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+/* { dg-final { scan-assembler-times {\tvcompress.vm} 8 } } */
+/* { dg-final { scan-assembler-times {\tvslideup} 8 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-4.c
new file mode 100644
index 00000000000..54b89ed41a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-4.c
@@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <stdint-gcc.h>
+
+typedef int8_t vnx32i __attribute__ ((vector_size (32)));
+typedef int16_t vnx16i __attribute__ ((vector_size (32)));
+typedef int32_t vnx8i __attribute__ ((vector_size (32)));
+typedef int64_t vnx4i __attribute__ ((vector_size (32)));
+typedef uint8_t vnx32ui __attribute__ ((vector_size (32)));
+typedef uint16_t vnx16ui __attribute__ ((vector_size (32)));
+typedef uint32_t vnx8ui __attribute__ ((vector_size (32)));
+typedef uint64_t vnx4ui __attribute__ ((vector_size (32)));
+typedef _Float16 vnx16f __attribute__ ((vector_size (32)));
+typedef float vnx8f __attribute__ ((vector_size (32)));
+typedef double vnx4f __attribute__ ((vector_size (32)));
+
+#define MASK_4 1, 2, 5, 6
+#define MASK_8 1, 2, 5, 7, 11, 12, 13, 14
+#define MASK_16 0, 3, 5, 7, 10, 11, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
+#define MASK_32                                                                \
+  3, 4, 6, 7, 9, 10, 14, 15, 17, 20, 21, 22, 24, 27, 29, 31, 34, 35, 36, 37,   \
+    38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49
+
+vnx32i __attribute__ ((noinline, noclone)) test_1 (vnx32i x, vnx32i y)
+{
+  return __builtin_shufflevector (x, y, MASK_32);
+}
+
+vnx16i __attribute__ ((noinline, noclone)) test_2 (vnx16i x, vnx16i y)
+{
+  return __builtin_shufflevector (x, y, MASK_16);
+}
+
+vnx8i __attribute__ ((noinline, noclone)) test_3 (vnx8i x, vnx8i y)
+{
+  return __builtin_shufflevector (x, y, MASK_8);
+}
+
+vnx4i __attribute__ ((noinline, noclone)) test_4 (vnx4i x, vnx4i y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+vnx32ui __attribute__ ((noinline, noclone)) test_5 (vnx32ui x, vnx32ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_32);
+}
+
+vnx16ui __attribute__ ((noinline, noclone)) test_6 (vnx16ui x, vnx16ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_16);
+}
+
+vnx8ui __attribute__ ((noinline, noclone)) test_7 (vnx8ui x, vnx8ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_8);
+}
+
+vnx4ui __attribute__ ((noinline, noclone)) test_8 (vnx4ui x, vnx4ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+vnx16f __attribute__ ((noinline, noclone)) test_9 (vnx16f x, vnx16f y)
+{
+  return __builtin_shufflevector (x, y, MASK_16);
+}
+
+vnx8f __attribute__ ((noinline, noclone)) test_10 (vnx8f x, vnx8f y)
+{
+  return __builtin_shufflevector (x, y, MASK_8);
+}
+
+vnx4f __attribute__ ((noinline, noclone)) test_11 (vnx4f x, vnx4f y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+/* { dg-final { scan-assembler-times {\tvcompress.vm} 11 } } */
+/* { dg-final { scan-assembler-times {\tvslideup} 11 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-5.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-5.c
new file mode 100644
index 00000000000..4b2750264e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-5.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <stdint-gcc.h>
+
+typedef int8_t vnx64i __attribute__ ((vector_size (64)));
+typedef int16_t vnx32i __attribute__ ((vector_size (64)));
+typedef int32_t vnx16i __attribute__ ((vector_size (64)));
+typedef int64_t vnx8i __attribute__ ((vector_size (64)));
+typedef uint8_t vnx64ui __attribute__ ((vector_size (64)));
+typedef uint16_t vnx32ui __attribute__ ((vector_size (64)));
+typedef uint32_t vnx16ui __attribute__ ((vector_size (64)));
+typedef uint64_t vnx8ui __attribute__ ((vector_size (64)));
+typedef _Float16 vnx32f __attribute__ ((vector_size (64)));
+typedef float vnx16f __attribute__ ((vector_size (64)));
+typedef double vnx8f __attribute__ ((vector_size (64)));
+
+#define MASK_4 0, 2, 5, 6
+#define MASK_8 0, 1, 5, 7, 11, 12, 13, 14
+#define MASK_16 0, 2, 3, 8, 10, 11, 12, 14, 24, 25, 26, 27, 28, 29, 30, 31
+#define MASK_32                                                                \
+  4, 5, 6, 7, 9, 10, 12, 14, 18, 20, 22, 23, 25, 27, 28, 29, 35, 36, 37, 38,   \
+    39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50
+#define MASK_64                                                                \
+  1, 2, 3, 5, 7, 9, 10, 11, 12, 14, 15, 17, 19, 21, 22, 23, 26, 28, 30, 31,    \
+    37, 38, 41, 46, 47, 53, 54, 55, 60, 61, 62, 63, 76, 77, 78, 79, 80, 81,    \
+    82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,    \
+    100, 101, 102, 103, 104, 105, 106, 107
+
+vnx64i __attribute__ ((noinline, noclone)) test_1 (vnx64i x, vnx64i y)
+{
+  return __builtin_shufflevector (x, y, MASK_64);
+}
+
+vnx32i __attribute__ ((noinline, noclone)) test_2 (vnx32i x, vnx32i y)
+{
+  return __builtin_shufflevector (x, y, MASK_32);
+}
+
+vnx16i __attribute__ ((noinline, noclone)) test_3 (vnx16i x, vnx16i y)
+{
+  return __builtin_shufflevector (x, y, MASK_16);
+}
+
+vnx8i __attribute__ ((noinline, noclone)) test_4 (vnx8i x, vnx8i y)
+{
+  return __builtin_shufflevector (x, y, MASK_8);
+}
+
+vnx64ui __attribute__ ((noinline, noclone)) test_5 (vnx64ui x, vnx64ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_64);
+}
+
+vnx32ui __attribute__ ((noinline, noclone)) test_6 (vnx32ui x, vnx32ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_32);
+}
+
+vnx16ui __attribute__ ((noinline, noclone)) test_7 (vnx16ui x, vnx16ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_16);
+}
+
+vnx8ui __attribute__ ((noinline, noclone)) test_8 (vnx8ui x, vnx8ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_8);
+}
+
+vnx32f __attribute__ ((noinline, noclone)) test_9 (vnx32f x, vnx32f y)
+{
+  return __builtin_shufflevector (x, y, MASK_32);
+}
+
+vnx16f __attribute__ ((noinline, noclone)) test_10 (vnx16f x, vnx16f y)
+{
+  return __builtin_shufflevector (x, y, MASK_16);
+}
+
+vnx8f __attribute__ ((noinline, noclone)) test_11 (vnx8f x, vnx8f y)
+{
+  return __builtin_shufflevector (x, y, MASK_8);
+}
+
+/* { dg-final { scan-assembler-times {\tvcompress.vm} 11 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-6.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-6.c
new file mode 100644
index 00000000000..4b85c71a55e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress-6.c
@@ -0,0 +1,95 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <stdint-gcc.h>
+
+typedef int8_t vnx128i __attribute__ ((vector_size (128)));
+typedef int16_t vnx64i __attribute__ ((vector_size (128)));
+typedef int32_t vnx32i __attribute__ ((vector_size (128)));
+typedef int64_t vnx16i __attribute__ ((vector_size (128)));
+typedef uint8_t vnx128ui __attribute__ ((vector_size (128)));
+typedef uint16_t vnx64ui __attribute__ ((vector_size (128)));
+typedef uint32_t vnx32ui __attribute__ ((vector_size (128)));
+typedef uint64_t vnx16ui __attribute__ ((vector_size (128)));
+typedef _Float16 vnx64f __attribute__ ((vector_size (128)));
+typedef float vnx32f __attribute__ ((vector_size (128)));
+typedef double vnx16f __attribute__ ((vector_size (128)));
+
+#define MASK_4 0, 3, 6, 7
+#define MASK_8 0, 2, 3, 4, 10, 11, 12, 13
+#define MASK_16 2, 3, 4, 6, 7, 8, 9, 12, 20, 21, 22, 23, 24, 25, 26, 27
+#define MASK_32                                                                \
+  0, 1, 3, 4, 7, 8, 12, 13, 14, 19, 21, 22, 23, 27, 29, 31, 41, 42, 43, 44,    \
+    45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56
+#define MASK_64                                                                \
+  0, 2, 3, 4, 5, 7, 11, 13, 14, 16, 17, 19, 20, 22, 23, 24, 27, 28, 30, 31,    \
+    35, 37, 39, 40, 44, 45, 46, 53, 54, 56, 61, 63, 68, 69, 70, 71, 72, 73,    \
+    74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,    \
+    92, 93, 94, 95, 96, 97, 98, 99
+
+#define MASK_128                                                               \
+  1, 3, 4, 5, 6, 7, 8, 10, 12, 14, 15, 16, 17, 18, 22, 25, 28, 29, 30, 31, 36, \
+    37, 40, 41, 42, 43, 44, 46, 52, 54, 55, 58, 61, 62, 64, 67, 68, 69, 70,    \
+    71, 76, 77, 78, 80, 82, 83, 84, 86, 87, 88, 91, 94, 95, 99, 102, 104, 106, \
+    110, 112, 115, 116, 125, 126, 127, 144, 145, 146, 147, 148, 149, 150, 151, \
+    152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, \
+    167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, \
+    182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, \
+    197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207
+
+vnx128i __attribute__ ((noinline, noclone)) test_1 (vnx128i x, vnx128i y)
+{
+  return __builtin_shufflevector (x, y, MASK_128);
+}
+
+vnx64i __attribute__ ((noinline, noclone)) test_2 (vnx64i x, vnx64i y)
+{
+  return __builtin_shufflevector (x, y, MASK_64);
+}
+
+vnx32i __attribute__ ((noinline, noclone)) test_3 (vnx32i x, vnx32i y)
+{
+  return __builtin_shufflevector (x, y, MASK_32);
+}
+
+vnx16i __attribute__ ((noinline, noclone)) test_4 (vnx16i x, vnx16i y)
+{
+  return __builtin_shufflevector (x, y, MASK_16);
+}
+
+vnx128ui __attribute__ ((noinline, noclone)) test_5 (vnx128ui x, vnx128ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_128);
+}
+
+vnx64ui __attribute__ ((noinline, noclone)) test_6 (vnx64ui x, vnx64ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_64);
+}
+
+vnx32ui __attribute__ ((noinline, noclone)) test_7 (vnx32ui x, vnx32ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_32);
+}
+
+vnx16ui __attribute__ ((noinline, noclone)) test_8 (vnx16ui x, vnx16ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_16);
+}
+
+vnx64f __attribute__ ((noinline, noclone)) test_9 (vnx64f x, vnx64f y)
+{
+  return __builtin_shufflevector (x, y, MASK_64);
+}
+
+vnx32f __attribute__ ((noinline, noclone)) test_10 (vnx32f x, vnx32f y)
+{
+  return __builtin_shufflevector (x, y, MASK_32);
+}
+
+vnx16f __attribute__ ((noinline, noclone)) test_11 (vnx16f x, vnx16f y)
+{
+  return __builtin_shufflevector (x, y, MASK_16);
+}
+
+/* { dg-final { scan-assembler-times {\tvcompress.vm} 11 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-1.c
new file mode 100644
index 00000000000..a2c1312d0ae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-1.c
@@ -0,0 +1,27 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "-O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <assert.h>
+#include "compress-1.c"
+
+int
+main (void)
+{
+  vnx4i test_1_x = {0, 1, 2, 4};
+  vnx4i test_1_y = {4, 5, 7, 8};
+  vnx4i test_1_except = {0, 2, 7, 8};
+  vnx4i test_1_real;
+  test_1_real = test_1 (test_1_x, test_1_y);
+  for (int i = 0; i < 4; i++)
+    assert (test_1_real[i] == test_1_except[i]);
+
+  vnx4ui test_2_x = {0, 1, 2, 4};
+  vnx4ui test_2_y = {4, 5, 6, 8};
+  vnx4ui test_2_except = {0, 2, 6, 8};
+  vnx4ui test_2_real;
+  test_2_real = test_2 (test_2_x, test_2_y);
+  for (int i = 0; i < 4; i++)
+    assert (test_2_real[i] == test_2_except[i]);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-2.c
new file mode 100644
index 00000000000..9f9f152357f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-2.c
@@ -0,0 +1,51 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "-O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <assert.h>
+#include "compress-2.c"
+
+int
+main (void)
+{
+  vnx8i test_1_x = {0, 1, 2, 3, 5, 6, 7, 8};
+  vnx8i test_1_y = {8, 9, 10, 11, 13, 14, 15, 16};
+  vnx8i test_1_except = {2, 3, 6, 7, 11, 13, 14, 15};
+  vnx8i test_1_real;
+  test_1_real = test_1 (test_1_x, test_1_y);
+  for (int i = 0; i < 8; i++)
+    assert (test_1_real[i] == test_1_except[i]);
+
+  vnx4i test_2_x = {1, 2, 3, 4};
+  vnx4i test_2_y = {5, 6, 7, 8};
+  vnx4i test_2_except = {2, 4, 7, 8};
+  vnx4i test_2_real;
+  test_2_real = test_2 (test_2_x, test_2_y);
+  for (int i = 0; i < 4; i++)
+    assert (test_2_real[i] == test_2_except[i]);
+
+  vnx8ui test_3_x = {0, 1, 2, 3, 4, 5, 6, 8};
+  vnx8ui test_3_y = {8, 9, 10, 11, 12, 13, 15, 16};
+  vnx8ui test_3_except = {2, 3, 5, 6, 11, 12, 13, 15};
+  vnx8ui test_3_real;
+  test_3_real = test_3 (test_3_x, test_3_y);
+  for (int i = 0; i < 8; i++)
+    assert (test_3_real[i] == test_3_except[i]);
+
+  vnx4ui test_4_x = {1, 2, 3, 4};
+  vnx4ui test_4_y = {4, 5, 6, 8};
+  vnx4ui test_4_except = {2, 4, 6, 8};
+  vnx4ui test_4_real;
+  test_4_real = test_4 (test_4_x, test_4_y);
+  for (int i = 0; i < 4; i++)
+    assert (test_4_real[i] == test_4_except[i]);
+
+  vnx4f test_5_x = {0, 1, 3, 4};
+  vnx4f test_5_y = {4, 5, 6, 7};
+  vnx4f test_5_except = {1, 4, 6, 7};
+  vnx4f test_5_real;
+  test_5_real = test_5 (test_5_x, test_5_y);
+  for (int i = 0; i < 4; i++)
+    assert (test_5_real[i] == test_5_except[i]);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-3.c
new file mode 100644
index 00000000000..8a29a5b7203
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-3.c
@@ -0,0 +1,79 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "-O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <assert.h>
+#include "compress-3.c"
+
+int
+main (void)
+{
+  vnx16i test_1_x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  vnx16i test_1_y
+    = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx16i test_1_except
+    = {1, 3, 4, 6, 7, 8, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27};
+  vnx16i test_1_real;
+  test_1_real = test_1 (test_1_x, test_1_y);
+  for (int i = 0; i < 16; i++)
+    assert (test_1_real[i] == test_1_except[i]);
+
+  vnx8i test_2_x = {0, 1, 2, 3, 4, 5, 6, 7};
+  vnx8i test_2_y = {8, 9, 10, 11, 12, 13, 14, 15};
+  vnx8i test_2_except = {0, 1, 2, 5, 10, 11, 12, 13};
+  vnx8i test_2_real;
+  test_2_real = test_2 (test_2_x, test_2_y);
+  for (int i = 0; i < 8; i++)
+    assert (test_2_real[i] == test_2_except[i]);
+
+  vnx4i test_3_x = {0, 1, 2, 3};
+  vnx4i test_3_y = {4, 5, 6, 7};
+  vnx4i test_3_except = {0, 2, 5, 6};
+  vnx4i test_3_real;
+  test_3_real = test_3 (test_3_x, test_3_y);
+  for (int i = 0; i < 4; i++)
+    assert (test_3_real[i] == test_3_except[i]);
+
+  vnx16ui test_4_x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  vnx16ui test_4_y
+    = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx16ui test_4_except
+    = {1, 3, 4, 6, 7, 8, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27};
+  vnx16ui test_4_real;
+  test_4_real = test_4 (test_4_x, test_4_y);
+  for (int i = 0; i < 16; i++)
+    assert (test_4_real[i] == test_4_except[i]);
+
+  vnx8ui test_5_x = {0, 1, 2, 3, 4, 5, 6, 7};
+  vnx8ui test_5_y = {8, 9, 10, 11, 12, 13, 14, 15};
+  vnx8ui test_5_except = {0, 1, 2, 5, 10, 11, 12, 13};
+  vnx8ui test_5_real;
+  test_5_real = test_5 (test_5_x, test_5_y);
+  for (int i = 0; i < 8; i++)
+    assert (test_5_real[i] == test_5_except[i]);
+
+  vnx4ui test_6_x = {0, 1, 2, 3};
+  vnx4ui test_6_y = {4, 5, 6, 7};
+  vnx4ui test_6_except = {0, 2, 5, 6};
+  vnx4ui test_6_real;
+  test_6_real = test_6 (test_6_x, test_6_y);
+  for (int i = 0; i < 4; i++)
+    assert (test_6_real[i] == test_6_except[i]);
+
+  vnx8f test_7_x = {0, 1, 2, 3, 4, 5, 6, 7};
+  vnx8f test_7_y = {8, 9, 10, 11, 12, 13, 14, 15};
+  vnx8f test_7_except = {0, 1, 2, 5, 10, 11, 12, 13};
+  vnx8f test_7_real;
+  test_7_real = test_7 (test_7_x, test_7_y);
+  for (int i = 0; i < 8; i++)
+    assert (test_7_real[i] == test_7_except[i]);
+
+  vnx4f test_8_x = {0, 1, 2, 3};
+  vnx4f test_8_y = {4, 5, 6, 7};
+  vnx4f test_8_except = {0, 2, 5, 6};
+  vnx4f test_8_real;
+  test_8_real = test_8 (test_8_x, test_8_y);
+  for (int i = 0; i < 4; i++)
+    assert (test_8_real[i] == test_8_except[i]);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-4.c
new file mode 100644
index 00000000000..42b3fa16c8f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-4.c
@@ -0,0 +1,117 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "-O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <assert.h>
+#include "compress-4.c"
+
+int
+main (void)
+{
+  vnx32i test_1_x
+    = {0,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32};
+  vnx32i test_1_y
+    = {32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+       48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61, 62, 63, 64};
+  vnx32i test_1_except
+    = {4,  5,  7,  8,  10, 11, 15, 16, 18, 21, 22, 23, 25, 28, 30, 32,
+       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49};
+  vnx32i test_1_real;
+  test_1_real = test_1 (test_1_x, test_1_y);
+  for (int i = 0; i < 32; i++)
+    assert (test_1_real[i] == test_1_except[i]);
+
+  vnx16i test_2_x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  vnx16i test_2_y
+    = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32};
+  vnx16i test_2_except
+    = {0, 3, 5, 7, 10, 11, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
+  vnx16i test_2_real;
+  test_2_real = test_2 (test_2_x, test_2_y);
+  for (int i = 0; i < 16; i++)
+    assert (test_2_real[i] == test_2_except[i]);
+
+  vnx8i test_3_x = {0, 1, 2, 4, 5, 6, 7, 8};
+  vnx8i test_3_y = {8, 10, 11, 12, 13, 14, 15, 16};
+  vnx8i test_3_except = {1, 2, 6, 8, 12, 13, 14, 15};
+  vnx8i test_3_real;
+  test_3_real = test_3 (test_3_x, test_3_y);
+  for (int i = 0; i < 8; i++)
+    assert (test_3_real[i] == test_3_except[i]);
+
+  vnx4i test_4_x = {0, 2, 3, 4};
+  vnx4i test_4_y = {4, 5, 7, 8};
+  vnx4i test_4_except = {2, 3, 5, 7};
+  vnx4i test_4_real;
+  test_4_real = test_4 (test_4_x, test_4_y);
+  for (int i = 0; i < 4; i++)
+    assert (test_4_real[i] == test_4_except[i]);
+
+  vnx32ui test_5_x
+    = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32};
+  vnx32ui test_5_y
+    = {32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+       48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64};
+  vnx32ui test_5_except
+    = {3,  4,  6,  7,  9,  10, 14, 15, 17, 20, 21, 22, 24, 27, 29, 32,
+       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49};
+  vnx32ui test_5_real;
+  test_5_real = test_5 (test_5_x, test_5_y);
+  for (int i = 0; i < 32; i++)
+    assert (test_5_real[i] == test_5_except[i]);
+
+  vnx16ui test_6_x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16};
+  vnx16ui test_6_y
+    = {16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32};
+  vnx16ui test_6_except
+    = {0, 3, 5, 7, 10, 11, 15, 16, 16, 17, 19, 20, 21, 22, 23, 24};
+  vnx16ui test_6_real;
+  test_6_real = test_6 (test_6_x, test_6_y);
+  for (int i = 0; i < 16; i++)
+    assert (test_6_real[i] == test_6_except[i]);
+
+  vnx8ui test_7_x = {0, 2, 3, 4, 5, 6, 7, 8};
+  vnx8ui test_7_y = {8, 9, 10, 12, 13, 14, 15, 16};
+  vnx8ui test_7_except = {2, 3, 6, 8, 12, 13, 14, 15};
+  vnx8ui test_7_real;
+  test_7_real = test_7 (test_7_x, test_7_y);
+  for (int i = 0; i < 8; i++)
+    assert (test_7_real[i] == test_7_except[i]);
+
+  vnx4ui test_8_x = {0, 2, 3, 4};
+  vnx4ui test_8_y = {5, 6, 7, 8};
+  vnx4ui test_8_except = {2, 3, 6, 7};
+  vnx4ui test_8_real;
+  test_8_real = test_8 (test_8_x, test_8_y);
+  for (int i = 0; i < 4; i++)
+    assert (test_8_real[i] == test_8_except[i]);
+
+  vnx16f test_9_x = {0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  vnx16f test_9_y
+    = {16, 17, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32};
+  vnx16f test_9_except
+    = {0, 3, 6, 8, 11, 12, 15, 16, 16, 17, 18, 20, 21, 22, 23, 24};
+  vnx16f test_9_real;
+  test_9_real = test_9 (test_9_x, test_9_y);
+  for (int i = 0; i < 16; i++)
+    assert (test_9_real[i] == test_9_except[i]);
+
+  vnx8f test_10_x = {0, 1, 2, 3, 4, 5, 6, 7};
+  vnx8f test_10_y = {8, 9, 10, 12, 13, 14, 15, 16};
+  vnx8f test_10_except = {1, 2, 5, 7, 12, 13, 14, 15};
+  vnx8f test_10_real;
+  test_10_real = test_10 (test_10_x, test_10_y);
+  for (int i = 0; i < 8; i++)
+    assert (test_10_real[i] == test_10_except[i]);
+
+  vnx4f test_11_x = {0, 2, 3, 4};
+  vnx4f test_11_y = {4, 6, 7, 8};
+  vnx4f test_11_except = {2, 3, 6, 7};
+  vnx4f test_11_real;
+  test_11_real = test_11 (test_11_x, test_11_y);
+  for (int i = 0; i < 4; i++)
+    assert (test_11_real[i] == test_11_except[i]);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-5.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-5.c
new file mode 100644
index 00000000000..8f15f45977b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-5.c
@@ -0,0 +1,149 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "-O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <assert.h>
+#include "compress-5.c"
+
+int
+main (void)
+{
+  vnx64i test_1_x
+    = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+       32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+       48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+  vnx64i test_1_y
+    = {64,  65,	 66,  67,  68,	69,  70,  71,  72,  73,	 74,  75,  76,
+       77,  78,	 79,  80,  81,	82,  83,  84,  85,  86,	 87,  88,  89,
+       90,  91,	 92,  93,  94,	95,  96,  97,  98,  99,	 100, 101, 102,
+       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
+       116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127};
+  vnx64i test_1_except
+    = {1,  2,  3,  5,  7,  9,  10, 11, 12,  14,	 15,  17,  19,	21,  22,  23,
+       26, 28, 30, 31, 37, 38, 41, 46, 47,  53,	 54,  55,  60,	61,  62,  63,
+       76, 77, 78, 79, 80, 81, 82, 83, 84,  85,	 86,  87,  88,	89,  90,  91,
+       92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107};
+  vnx64i test_1_real;
+  test_1_real = test_1 (test_1_x, test_1_y);
+  for (int i = 0; i < 64; i++)
+    assert (test_1_real[i] == test_1_except[i]);
+
+  vnx32i test_2_x
+    = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx32i test_2_y
+    = {32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+       48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+  vnx32i test_2_except
+    = {4,  5,  6,  7,  9,  10, 12, 14, 18, 20, 22, 23, 25, 27, 28, 29,
+       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50};
+  vnx32i test_2_real;
+  test_2_real = test_2 (test_2_x, test_2_y);
+  for (int i = 0; i < 32; i++)
+    assert (test_2_real[i] == test_2_except[i]);
+
+  vnx16i test_3_x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  vnx16i test_3_y
+    = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx16i test_3_except
+    = {0, 2, 3, 8, 10, 11, 12, 14, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx16i test_3_real;
+  test_3_real = test_3 (test_3_x, test_3_y);
+  for (int i = 0; i < 16; i++)
+    assert (test_3_real[i] == test_3_except[i]);
+
+  vnx8i test_4_x = {0, 1, 2, 3, 4, 5, 6, 7};
+  vnx8i test_4_y = {8, 9, 10, 11, 12, 13, 14, 15};
+  vnx8i test_4_except = {0, 1, 5, 7, 11, 12, 13, 14};
+  vnx8i test_4_real;
+  test_4_real = test_4 (test_4_x, test_4_y);
+  for (int i = 0; i < 8; i++)
+    assert (test_4_real[i] == test_4_except[i]);
+
+  vnx64ui test_5_x
+    = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+       32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+       48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+  vnx64ui test_5_y
+    = {64,  65,	 66,  67,  68,	69,  70,  71,  72,  73,	 74,  75,  76,
+       77,  78,	 79,  80,  81,	82,  83,  84,  85,  86,	 87,  88,  89,
+       90,  91,	 92,  93,  94,	95,  96,  97,  98,  99,	 100, 101, 102,
+       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
+       116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127};
+  vnx64ui test_5_except
+    = {1,  2,  3,  5,  7,  9,  10, 11, 12,  14,	 15,  17,  19,	21,  22,  23,
+       26, 28, 30, 31, 37, 38, 41, 46, 47,  53,	 54,  55,  60,	61,  62,  63,
+       76, 77, 78, 79, 80, 81, 82, 83, 84,  85,	 86,  87,  88,	89,  90,  91,
+       92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107};
+  vnx64ui test_5_real;
+  test_5_real = test_5 (test_5_x, test_5_y);
+  for (int i = 0; i < 64; i++)
+    assert (test_5_real[i] == test_5_except[i]);
+
+  vnx32ui test_6_x
+    = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx32ui test_6_y
+    = {32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+       48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+  vnx32ui test_6_except
+    = {4,  5,  6,  7,  9,  10, 12, 14, 18, 20, 22, 23, 25, 27, 28, 29,
+       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50};
+  vnx32ui test_6_real;
+  test_6_real = test_6 (test_6_x, test_6_y);
+  for (int i = 0; i < 32; i++)
+    assert (test_6_real[i] == test_6_except[i]);
+
+  vnx16ui test_7_x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  vnx16ui test_7_y
+    = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx16ui test_7_except
+    = {0, 2, 3, 8, 10, 11, 12, 14, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx16ui test_7_real;
+  test_7_real = test_7 (test_7_x, test_7_y);
+  for (int i = 0; i < 16; i++)
+    assert (test_7_real[i] == test_7_except[i]);
+
+  vnx8ui test_8_x = {0, 1, 2, 3, 4, 5, 6, 7};
+  vnx8ui test_8_y = {8, 9, 10, 11, 12, 13, 14, 15};
+  vnx8ui test_8_except = {0, 1, 5, 7, 11, 12, 13, 14};
+  vnx8ui test_8_real;
+  test_8_real = test_8 (test_8_x, test_8_y);
+  for (int i = 0; i < 8; i++)
+    assert (test_8_real[i] == test_8_except[i]);
+
+  vnx32f test_9_x
+    = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx32f test_9_y
+    = {32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+       48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+  vnx32f test_9_except
+    = {4,  5,  6,  7,  9,  10, 12, 14, 18, 20, 22, 23, 25, 27, 28, 29,
+       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50};
+  vnx32f test_9_real;
+  test_9_real = test_9 (test_9_x, test_9_y);
+  for (int i = 0; i < 32; i++)
+    assert (test_9_real[i] == test_9_except[i]);
+
+  vnx16f test_10_x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  vnx16f test_10_y
+    = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx16f test_10_except
+    = {0, 2, 3, 8, 10, 11, 12, 14, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx16f test_10_real;
+  test_10_real = test_10 (test_10_x, test_10_y);
+  for (int i = 0; i < 16; i++)
+    assert (test_10_real[i] == test_10_except[i]);
+
+  vnx8f test_11_x = {0, 1, 2, 3, 4, 5, 6, 7};
+  vnx8f test_11_y = {8, 9, 10, 11, 12, 13, 14, 15};
+  vnx8f test_11_except = {0, 1, 5, 7, 11, 12, 13, 14};
+  vnx8f test_11_real;
+  test_11_real = test_11 (test_11_x, test_11_y);
+  for (int i = 0; i < 8; i++)
+    assert (test_11_real[i] == test_11_except[i]);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-6.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-6.c
new file mode 100644
index 00000000000..5139ea2b93e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-6.c
@@ -0,0 +1,222 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "-O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <assert.h>
+#include "compress-6.c"
+
+int
+main (void)
+{
+  vnx128i test_1_x
+    = {0,   1,	 2,   3,   4,	5,   6,	  7,   8,   9,	 10,  11,  12,
+       13,  14,	 15,  16,  17,	18,  19,  20,  21,  22,	 23,  24,  25,
+       26,  27,	 28,  29,  30,	31,  32,  33,  34,  35,	 36,  37,  38,
+       39,  40,	 41,  42,  43,	44,  45,  46,  47,  48,	 49,  50,  51,
+       52,  53,	 54,  55,  56,	57,  58,  59,  60,  61,	 62,  63,  64,
+       65,  66,	 67,  68,  69,	70,  71,  72,  73,  74,	 75,  76,  77,
+       78,  79,	 80,  81,  82,	83,  84,  85,  86,  87,	 88,  89,  90,
+       91,  92,	 93,  94,  95,	96,  97,  98,  99,  100, 101, 102, 103,
+       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127};
+  vnx128i test_1_y
+    = {128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
+       141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
+       154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166,
+       167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+       180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192,
+       193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205,
+       206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218,
+       219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231,
+       232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244,
+       245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255};
+  vnx128i test_1_except
+    = {1,   3,	 4,   5,   6,	7,   8,	  10,  12,  14,	 15,  16,  17,
+       18,  22,	 25,  28,  29,	30,  31,  36,  37,  40,	 41,  42,  43,
+       44,  46,	 52,  54,  55,	58,  61,  62,  64,  67,	 68,  69,  70,
+       71,  76,	 77,  78,  80,	82,  83,  84,  86,  87,	 88,  91,  94,
+       95,  99,	 102, 104, 106, 110, 112, 115, 116, 125, 126, 127, 144,
+       145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
+       158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170,
+       171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183,
+       184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196,
+       197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207};
+  vnx128i test_1_real;
+  test_1_real = test_1 (test_1_x, test_1_y);
+  for (int i = 0; i < 128; i++)
+    assert (test_1_real[i] == test_1_except[i]);
+
+  vnx64i test_2_x
+    = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+       32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+       48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+  vnx64i test_2_y
+    = {64,  65,	 66,  67,  68,	69,  70,  71,  72,  73,	 74,  75,  76,
+       77,  78,	 79,  80,  81,	82,  83,  84,  85,  86,	 87,  88,  89,
+       90,  91,	 92,  93,  94,	95,  96,  97,  98,  99,	 100, 101, 102,
+       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
+       116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127};
+  vnx64i test_2_except
+    = {0,  2,  3,  4,  5,  7,  11, 13, 14, 16, 17, 19, 20, 22, 23, 24,
+       27, 28, 30, 31, 35, 37, 39, 40, 44, 45, 46, 53, 54, 56, 61, 63,
+       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
+       84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99};
+  vnx64i test_2_real;
+  test_2_real = test_2 (test_2_x, test_2_y);
+  for (int i = 0; i < 64; i++)
+    assert (test_2_real[i] == test_2_except[i]);
+
+  vnx32i test_3_x
+    = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx32i test_3_y
+    = {32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+       48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+  vnx32i test_3_except
+    = {0,  1,  3,  4,  7,  8,  12, 13, 14, 19, 21, 22, 23, 27, 29, 31,
+       41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56};
+  vnx32i test_3_real;
+  test_3_real = test_3 (test_3_x, test_3_y);
+  for (int i = 0; i < 32; i++)
+    assert (test_3_real[i] == test_3_except[i]);
+
+  vnx16i test_4_x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  vnx16i test_4_y
+    = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx16i test_4_except
+    = {2, 3, 4, 6, 7, 8, 9, 12, 20, 21, 22, 23, 24, 25, 26, 27};
+  vnx16i test_4_real;
+  test_4_real = test_4 (test_4_x, test_4_y);
+  for (int i = 0; i < 16; i++)
+    assert (test_4_real[i] == test_4_except[i]);
+
+  vnx128ui test_5_x
+    = {0,   1,	 2,   3,   4,	5,   6,	  7,   8,   9,	 10,  11,  12,
+       13,  14,	 15,  16,  17,	18,  19,  20,  21,  22,	 23,  24,  25,
+       26,  27,	 28,  29,  30,	31,  32,  33,  34,  35,	 36,  37,  38,
+       39,  40,	 41,  42,  43,	44,  45,  46,  47,  48,	 49,  50,  51,
+       52,  53,	 54,  55,  56,	57,  58,  59,  60,  61,	 62,  63,  64,
+       65,  66,	 67,  68,  69,	70,  71,  72,  73,  74,	 75,  76,  77,
+       78,  79,	 80,  81,  82,	83,  84,  85,  86,  87,	 88,  89,  90,
+       91,  92,	 93,  94,  95,	96,  97,  98,  99,  100, 101, 102, 103,
+       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127};
+  vnx128ui test_5_y
+    = {128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
+       141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
+       154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166,
+       167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+       180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192,
+       193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205,
+       206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218,
+       219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231,
+       232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244,
+       245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255};
+  vnx128ui test_5_except
+    = {1,   3,	 4,   5,   6,	7,   8,	  10,  12,  14,	 15,  16,  17,
+       18,  22,	 25,  28,  29,	30,  31,  36,  37,  40,	 41,  42,  43,
+       44,  46,	 52,  54,  55,	58,  61,  62,  64,  67,	 68,  69,  70,
+       71,  76,	 77,  78,  80,	82,  83,  84,  86,  87,	 88,  91,  94,
+       95,  99,	 102, 104, 106, 110, 112, 115, 116, 125, 126, 127, 144,
+       145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
+       158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170,
+       171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183,
+       184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196,
+       197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207};
+  vnx128ui test_5_real;
+  test_5_real = test_5 (test_5_x, test_5_y);
+  for (int i = 0; i < 128; i++)
+    assert (test_5_real[i] == test_5_except[i]);
+
+  vnx64ui test_6_x
+    = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+       32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+       48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+  vnx64ui test_6_y
+    = {64,  65,	 66,  67,  68,	69,  70,  71,  72,  73,	 74,  75,  76,
+       77,  78,	 79,  80,  81,	82,  83,  84,  85,  86,	 87,  88,  89,
+       90,  91,	 92,  93,  94,	95,  96,  97,  98,  99,	 100, 101, 102,
+       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
+       116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127};
+  vnx64ui test_6_except
+    = {0,  2,  3,  4,  5,  7,  11, 13, 14, 16, 17, 19, 20, 22, 23, 24,
+       27, 28, 30, 31, 35, 37, 39, 40, 44, 45, 46, 53, 54, 56, 61, 63,
+       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
+       84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99};
+  vnx64ui test_6_real;
+  test_6_real = test_6 (test_6_x, test_6_y);
+  for (int i = 0; i < 64; i++)
+    assert (test_6_real[i] == test_6_except[i]);
+
+  vnx32ui test_7_x
+    = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx32ui test_7_y
+    = {32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+       48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+  vnx32ui test_7_except
+    = {0,  1,  3,  4,  7,  8,  12, 13, 14, 19, 21, 22, 23, 27, 29, 31,
+       41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56};
+  vnx32ui test_7_real;
+  test_7_real = test_7 (test_7_x, test_7_y);
+  for (int i = 0; i < 32; i++)
+    assert (test_7_real[i] == test_7_except[i]);
+
+  vnx16ui test_8_x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  vnx16ui test_8_y
+    = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx16ui test_8_except
+    = {2, 3, 4, 6, 7, 8, 9, 12, 20, 21, 22, 23, 24, 25, 26, 27};
+  vnx16ui test_8_real;
+  test_8_real = test_8 (test_8_x, test_8_y);
+  for (int i = 0; i < 16; i++)
+    assert (test_8_real[i] == test_8_except[i]);
+
+  vnx64f test_9_x
+    = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+       32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+       48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+  vnx64f test_9_y
+    = {64,  65,	 66,  67,  68,	69,  70,  71,  72,  73,	 74,  75,  76,
+       77,  78,	 79,  80,  81,	82,  83,  84,  85,  86,	 87,  88,  89,
+       90,  91,	 92,  93,  94,	95,  96,  97,  98,  99,	 100, 101, 102,
+       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
+       116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127};
+  vnx64f test_9_except
+    = {0,  2,  3,  4,  5,  7,  11, 13, 14, 16, 17, 19, 20, 22, 23, 24,
+       27, 28, 30, 31, 35, 37, 39, 40, 44, 45, 46, 53, 54, 56, 61, 63,
+       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
+       84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99};
+  vnx64f test_9_real;
+  test_9_real = test_9 (test_9_x, test_9_y);
+  for (int i = 0; i < 64; i++)
+    assert (test_9_real[i] == test_9_except[i]);
+
+  vnx32f test_10_x
+    = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx32f test_10_y
+    = {32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+       48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+  vnx32f test_10_except
+    = {0,  1,  3,  4,  7,  8,  12, 13, 14, 19, 21, 22, 23, 27, 29, 31,
+       41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56};
+  vnx32f test_10_real;
+  test_10_real = test_10 (test_10_x, test_10_y);
+  for (int i = 0; i < 32; i++)
+    assert (test_10_real[i] == test_10_except[i]);
+
+  vnx16f test_11_x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  vnx16f test_11_y
+    = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  vnx16f test_11_except
+    = {2, 3, 4, 6, 7, 8, 9, 12, 20, 21, 22, 23, 24, 25, 26, 27};
+  vnx16f test_11_real;
+  test_11_real = test_11 (test_11_x, test_11_y);
+  for (int i = 0; i < 16; i++)
+    assert (test_11_real[i] == test_11_except[i]);
+
+  return 0;
+}
-- 
2.36.1



* Re: [PATCH] RISC-V: Optimize permutation codegen with vcompress
  2023-07-11  6:38 [PATCH] RISC-V: Optimize permutation codegen with vcompress juzhe.zhong
@ 2023-07-11 12:17 ` Robin Dapp
  2023-07-11 12:21   ` 钟居哲
  2023-07-11 23:18 ` Jeff Law
  1 sibling, 1 reply; 8+ messages in thread
From: Robin Dapp @ 2023-07-11 12:17 UTC (permalink / raw)
  To: juzhe.zhong, gcc-patches; +Cc: rdapp.gcc, kito.cheng, kito.cheng, jeffreyalaw

Hi Juzhe,

looks good from my side, thanks.  While going through it I
thought of some related cases that we could still handle
differently but I didn't bother to formalize them for now.
Most likely we already handle them in the shortest way
anyway.  I'm going to check on that when I find some time
at some point. 

In the tests I noticed that most (all?) of them are pretty
evenly split (half/half) between first and second source vector.
Wouldn't we want some more variety there? Still OK without
that IMHO.

Regards
 Robin


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: Re: [PATCH] RISC-V: Optimize permutation codegen with vcompress
  2023-07-11 12:17 ` Robin Dapp
@ 2023-07-11 12:21   ` 钟居哲
  2023-07-11 12:24     ` Robin Dapp
  0 siblings, 1 reply; 8+ messages in thread
From: 钟居哲 @ 2023-07-11 12:21 UTC (permalink / raw)
  To: rdapp.gcc, gcc-patches; +Cc: rdapp.gcc, kito.cheng, kito.cheng, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 1007 bytes --]

The compress optimization already covers all of these variations.
It's not necessary to force a half/half split; we can apply the compress
approach to any permutation index that has the compress shape.

You can apply this patch and try such masks to see.

Thanks.



juzhe.zhong@rivai.ai
 
From: Robin Dapp
Date: 2023-07-11 20:17
To: juzhe.zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Optimize permutation codegen with vcompress
Hi Juzhe,
 
looks good from my side, thanks.  While going through it I
thought of some related cases that we could still handle
differently but I didn't bother to formalize them for now.
Most likely we already handle them in the shortest way
anyway.  I'm going to check on that when I find some time
at some point. 
 
In the tests I noticed that most (all?) of them are pretty
evenly split (half/half) between first and second source vector.
Wouldn't we want some more variety there? Still OK without
that IMHO.
 
Regards
Robin
 
 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] RISC-V: Optimize permutation codegen with vcompress
  2023-07-11 12:21   ` 钟居哲
@ 2023-07-11 12:24     ` Robin Dapp
  2023-07-11 12:30       ` 钟居哲
  0 siblings, 1 reply; 8+ messages in thread
From: Robin Dapp @ 2023-07-11 12:24 UTC (permalink / raw)
  To: 钟居哲, gcc-patches
  Cc: rdapp.gcc, kito.cheng, kito.cheng, Jeff Law

> The compress optimization already covers all of these variations.
> It's not necessary to force a half/half split; we can apply the compress
> approach to any permutation index that has the compress shape.

Yes, that's clear.  I meant the testcases are mostly designed
like

MASK4 1, 2, 6, 7

instead of variation like

MASK4 0, 5, 6, 7

or something else.  But this wouldn't add a lot of coverage
since we're searching for the "pivot" either way.
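
For illustration, such an unevenly split testcase could look roughly
like the following (just a sketch; the typedef, mask macro and function
name are made up for the example and are not taken from the patch):

  #include <stdint.h>
  typedef int8_t vnx4qi __attribute__ ((vector_size (4)));
  /* Element 0 comes from the first vector, indices 5..7 select the
     last three elements of the second vector, so the split is 1/3
     rather than half/half.  */
  #define MASK_4 0, 5, 6, 7
  void __attribute__ ((noinline, noclone))
  test_uneven (int8_t *x, int8_t *y, int8_t *out)
  {
    vnx4qi v1 = *(vnx4qi *) x;
    vnx4qi v2 = *(vnx4qi *) y;
    vnx4qi v3 = __builtin_shufflevector (v1, v2, MASK_4);
    *(vnx4qi *) out = v3;
  }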

Regards
 Robin

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: Re: [PATCH] RISC-V: Optimize permutation codegen with vcompress
  2023-07-11 12:24     ` Robin Dapp
@ 2023-07-11 12:30       ` 钟居哲
  2023-07-11 12:35         ` Robin Dapp
  0 siblings, 1 reply; 8+ messages in thread
From: 钟居哲 @ 2023-07-11 12:30 UTC (permalink / raw)
  To: rdapp.gcc, gcc-patches; +Cc: rdapp.gcc, kito.cheng, kito.cheng, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 840 bytes --]

MASK4 0, 5, 6, 7 definitely works as well.

The optimization is generic; it applies as long as the permutation index
matches the semantics of the vcompress instruction in the RVV ISA spec.
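
Roughly, the shape being matched is an increasing subset of elements
taken from the first operand followed by a run of consecutive elements
taken from the second operand.  A simplified sketch of that check (not
the actual shuffle_compress_patterns code, which enforces further
constraints) could look like this:

  /* Return true if SEL (NELTS elements per operand) has the compress
     shape: strictly increasing indices into operand 0, followed by
     consecutive indices into operand 1.  */
  static bool
  compress_like_p (const int *sel, int nelts)
  {
    int pivot = 0;
    /* Leading part: strictly increasing indices into operand 0.  */
    while (pivot < nelts && sel[pivot] < nelts
           && (pivot == 0 || sel[pivot] > sel[pivot - 1]))
      pivot++;
    /* Trailing part: consecutive indices into operand 1.  */
    for (int i = pivot; i < nelts; i++)
      if (sel[i] < nelts || (i > pivot && sel[i] != sel[i - 1] + 1))
        return false;
    return true;
  }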



juzhe.zhong@rivai.ai
 
From: Robin Dapp
Date: 2023-07-11 20:24
To: 钟居哲; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; Jeff Law
Subject: Re: [PATCH] RISC-V: Optimize permutation codegen with vcompress
> The compress optimization already covers all of these variations.
> It's not necessary to force a half/half split; we can apply the compress
> approach to any permutation index that has the compress shape.
 
Yes, that's clear.  I meant the testcases are mostly designed
like
 
MASK4 1, 2, 6, 7
 
instead of variation like
 
MASK4 0, 5, 6, 7
 
or something else.  But this wouldn't add a lot of coverage
since we're searching for the "pivot" either way.
 
Regards
Robin
 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] RISC-V: Optimize permutation codegen with vcompress
  2023-07-11 12:30       ` 钟居哲
@ 2023-07-11 12:35         ` Robin Dapp
  0 siblings, 0 replies; 8+ messages in thread
From: Robin Dapp @ 2023-07-11 12:35 UTC (permalink / raw)
  To: 钟居哲, gcc-patches
  Cc: rdapp.gcc, kito.cheng, kito.cheng, Jeff Law

> MASK4 0, 5, 6, 7 definitely works as well.

Sure :)  My remark was that the tests are all(?)
evenly split and a bit more variation would have been nice.
Not that it doesn't work; I'm OK with it as is.

Regards
 Robin

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] RISC-V: Optimize permutation codegen with vcompress
  2023-07-11  6:38 [PATCH] RISC-V: Optimize permutation codegen with vcompress juzhe.zhong
  2023-07-11 12:17 ` Robin Dapp
@ 2023-07-11 23:18 ` Jeff Law
  2023-07-11 23:35   ` Li, Pan2
  1 sibling, 1 reply; 8+ messages in thread
From: Jeff Law @ 2023-07-11 23:18 UTC (permalink / raw)
  To: juzhe.zhong, gcc-patches; +Cc: kito.cheng, kito.cheng, rdapp.gcc



On 7/11/23 00:38, juzhe.zhong@rivai.ai wrote:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> This patch is to recognize specific permutation pattern which can be applied compress approach.
> 
> Consider this following case:
> #include <stdint.h>
> typedef int8_t vnx64i __attribute__ ((vector_size (64)));
> #define MASK_64                                                                \
>    1, 2, 3, 5, 7, 9, 10, 11, 12, 14, 15, 17, 19, 21, 22, 23, 26, 28, 30, 31,    \
>      37, 38, 41, 46, 47, 53, 54, 55, 60, 61, 62, 63, 76, 77, 78, 79, 80, 81,    \
>      82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,    \
>      100, 101, 102, 103, 104, 105, 106, 107
> void __attribute__ ((noinline, noclone)) test_1 (int8_t *x, int8_t *y, int8_t *out)
> {
>    vnx64i v1 = *(vnx64i*)x;
>    vnx64i v2 = *(vnx64i*)y;
>    vnx64i v3 = __builtin_shufflevector (v1, v2, MASK_64);
>    *(vnx64i*)out = v3;
> }
> 
> https://godbolt.org/z/P33nev6cW
> 
> Before this patch:
>          lui     a4,%hi(.LANCHOR0)
>          addi    a4,a4,%lo(.LANCHOR0)
>          vl4re8.v        v4,0(a4)
>          li      a4,64
>          vsetvli a5,zero,e8,m4,ta,mu
>          vl4re8.v        v20,0(a0)
>          vl4re8.v        v16,0(a1)
>          vmv.v.x v12,a4
>          vrgather.vv     v8,v20,v4
>          vmsgeu.vv       v0,v4,v12
>          vsub.vv v4,v4,v12
>          vrgather.vv     v8,v16,v4,v0.t
>          vs4r.v  v8,0(a2)
>          ret
> 
> After this patch:
> 	lui	a4,%hi(.LANCHOR0)
> 	addi	a4,a4,%lo(.LANCHOR0)
> 	vsetvli	a5,zero,e8,m4,ta,ma
> 	vl4re8.v	v12,0(a1)
> 	vl4re8.v	v8,0(a0)
> 	vlm.v	v0,0(a4)
> 	vslideup.vi	v4,v12,20
> 	vcompress.vm	v4,v8,v0
> 	vs4r.v	v4,0(a2)
> 	ret
> 
> gcc/ChangeLog:
> 
>          * config/riscv/riscv-protos.h (enum insn_type): Add vcompress optimization.
>          * config/riscv/riscv-v.cc (emit_vlmax_compress_insn): Ditto.
>          (shuffle_compress_patterns): Ditto.
>          (expand_vec_perm_const_1): Ditto.
> 
> gcc/testsuite/ChangeLog:
> 
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-1.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-2.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-3.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-4.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-5.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-6.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-1.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-2.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-3.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-4.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-5.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-6.c: New test.
I had to look at this a few times, but I think that's because it's been 
polluted by another vector architecture's handling of compressed 
vectors.  What you're doing looks quite reasonable.

OK for the trunk.

jeff


^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] RISC-V: Optimize permutation codegen with vcompress
  2023-07-11 23:18 ` Jeff Law
@ 2023-07-11 23:35   ` Li, Pan2
  0 siblings, 0 replies; 8+ messages in thread
From: Li, Pan2 @ 2023-07-11 23:35 UTC (permalink / raw)
  To: Jeff Law, juzhe.zhong, gcc-patches; +Cc: kito.cheng, kito.cheng, rdapp.gcc

Committed, thanks Jeff.

Pan

-----Original Message-----
From: Gcc-patches <gcc-patches-bounces+pan2.li=intel.com@gcc.gnu.org> On Behalf Of Jeff Law via Gcc-patches
Sent: Wednesday, July 12, 2023 7:19 AM
To: juzhe.zhong@rivai.ai; gcc-patches@gcc.gnu.org
Cc: kito.cheng@gmail.com; kito.cheng@sifive.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH] RISC-V: Optimize permutation codegen with vcompress



On 7/11/23 00:38, juzhe.zhong@rivai.ai wrote:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> This patch is to recognize specific permutation pattern which can be applied compress approach.
> 
> Consider this following case:
> #include <stdint.h>
> typedef int8_t vnx64i __attribute__ ((vector_size (64)));
> #define MASK_64                                                                \
>    1, 2, 3, 5, 7, 9, 10, 11, 12, 14, 15, 17, 19, 21, 22, 23, 26, 28, 30, 31,    \
>      37, 38, 41, 46, 47, 53, 54, 55, 60, 61, 62, 63, 76, 77, 78, 79, 80, 81,    \
>      82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,    \
>      100, 101, 102, 103, 104, 105, 106, 107
> void __attribute__ ((noinline, noclone)) test_1 (int8_t *x, int8_t *y, int8_t *out)
> {
>    vnx64i v1 = *(vnx64i*)x;
>    vnx64i v2 = *(vnx64i*)y;
>    vnx64i v3 = __builtin_shufflevector (v1, v2, MASK_64);
>    *(vnx64i*)out = v3;
> }
> 
> https://godbolt.org/z/P33nev6cW
> 
> Before this patch:
>          lui     a4,%hi(.LANCHOR0)
>          addi    a4,a4,%lo(.LANCHOR0)
>          vl4re8.v        v4,0(a4)
>          li      a4,64
>          vsetvli a5,zero,e8,m4,ta,mu
>          vl4re8.v        v20,0(a0)
>          vl4re8.v        v16,0(a1)
>          vmv.v.x v12,a4
>          vrgather.vv     v8,v20,v4
>          vmsgeu.vv       v0,v4,v12
>          vsub.vv v4,v4,v12
>          vrgather.vv     v8,v16,v4,v0.t
>          vs4r.v  v8,0(a2)
>          ret
> 
> After this patch:
> 	lui	a4,%hi(.LANCHOR0)
> 	addi	a4,a4,%lo(.LANCHOR0)
> 	vsetvli	a5,zero,e8,m4,ta,ma
> 	vl4re8.v	v12,0(a1)
> 	vl4re8.v	v8,0(a0)
> 	vlm.v	v0,0(a4)
> 	vslideup.vi	v4,v12,20
> 	vcompress.vm	v4,v8,v0
> 	vs4r.v	v4,0(a2)
> 	ret
> 
> gcc/ChangeLog:
> 
>          * config/riscv/riscv-protos.h (enum insn_type): Add vcompress optimization.
>          * config/riscv/riscv-v.cc (emit_vlmax_compress_insn): Ditto.
>          (shuffle_compress_patterns): Ditto.
>          (expand_vec_perm_const_1): Ditto.
> 
> gcc/testsuite/ChangeLog:
> 
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-1.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-2.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-3.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-4.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-5.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress-6.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-1.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-2.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-3.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-4.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-5.c: New test.
>          * gcc.target/riscv/rvv/autovec/vls-vlmax/compress_run-6.c: New test.
I had to look at this a few times, but I think that's because it's been 
polluted by another vector architecture's handling of compressed 
vectors.  What you're doing looks quite reasonable.

OK for the trunk.

jeff


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2023-07-11 23:35 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-11  6:38 [PATCH] RISC-V: Optimize permutation codegen with vcompress juzhe.zhong
2023-07-11 12:17 ` Robin Dapp
2023-07-11 12:21   ` 钟居哲
2023-07-11 12:24     ` Robin Dapp
2023-07-11 12:30       ` 钟居哲
2023-07-11 12:35         ` Robin Dapp
2023-07-11 23:18 ` Jeff Law
2023-07-11 23:35   ` Li, Pan2

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).