[PATCH V2] RISC-V: Enhance RVV VLA SLP auto-vectorization with decompress operation

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH V2] RISC-V: Enhance RVV VLA SLP auto-vectorization with decompress operation
@ 2023-06-12 15:11 juzhe.zhong
  2023-06-12 19:42 ` Jeff Law
  0 siblings, 1 reply; 3+ messages in thread
From: juzhe.zhong @ 2023-06-12 15:11 UTC (permalink / raw)
  To: gcc-patches; +Cc: kito.cheng, palmer, rdapp.gcc, jeffreyalaw, Juzhe-Zhong

From: Juzhe-Zhong <juzhe.zhong@rivai.ai>

According to RVV ISA:
https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc

We can enhance VLA SLP auto-vectorization with (16.5.1. Synthesizing vdecompress)
Decompress operation.

Case 1 (nunits = POLY_INT_CST [16, 16]):
_48 = VEC_PERM_EXPR <_37, _35, { 0, POLY_INT_CST [16, 16], 1, POLY_INT_CST [17, 16], 2, POLY_INT_CST [18, 16], ... }>;
We can optimize such VLA SLP permuation pattern into:
_48 = vdecompress (_37, _35, mask = { 0, 1, 0, 1, ... };

Case 2 (nunits = POLY_INT_CST [16, 16]):
_23 = VEC_PERM_EXPR <_46, _44, { POLY_INT_CST [1, 1], POLY_INT_CST [3, 3], POLY_INT_CST [2, 1], POLY_INT_CST [4, 3], POLY_INT_CST [3, 1], POLY_INT_CST [5, 3], ... }>;
We can optimize such VLA SLP permuation pattern into:
_48 = vdecompress (slidedown(_46, 1/2 nunits), slidedown(_44, 1/2 nunits), mask = { 0, 1, 0, 1, ... };

For example:
void __attribute__ ((noinline, noclone))
vec_slp (uint64_t *restrict a, uint64_t b, uint64_t c, int n)
{
  for (int i = 0; i < n; ++i)
    {
      a[i * 2] += b;
      a[i * 2 + 1] += c;
    }
}

ASM:
...
        vid.v   v0
        vand.vi v0,v0,1
        vmseq.vi        v0,v0,1  ===> mask = { 0, 1, 0, 1, ... }
vdecompress:
        viota.m v3,v0   
        vrgather.vv     v2,v1,v3,v0.t
Loop:
        vsetvli zero,a5,e64,m1,ta,ma
        vle64.v v1,0(a0)
        vsetvli a6,zero,e64,m1,ta,ma
        vadd.vv v1,v2,v1
        vsetvli zero,a5,e64,m1,ta,ma
        mv      a5,a3
        vse64.v v1,0(a0)
        add     a3,a3,a1
        add     a0,a0,a2
        bgtu    a5,a4,.L4


gcc/ChangeLog:

        * config/riscv/riscv-v.cc (emit_vlmax_decompress_insn): New function.
        (shuffle_decompress_patterns): New function.
        (expand_vec_perm_const_1): Add decompress optimization.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/partial/slp-8.c: New test.
        * gcc.target/riscv/rvv/autovec/partial/slp-9.c: New test.
        * gcc.target/riscv/rvv/autovec/partial/slp_run-8.c: New test.
        * gcc.target/riscv/rvv/autovec/partial/slp_run-9.c: New test.

---
 gcc/config/riscv/riscv-v.cc                   | 111 ++++++++++++++++++
 .../riscv/rvv/autovec/partial/slp-8.c         |  30 +++++
 .../riscv/rvv/autovec/partial/slp-9.c         |  31 +++++
 .../riscv/rvv/autovec/partial/slp_run-8.c     |  30 +++++
 .../riscv/rvv/autovec/partial/slp_run-9.c     |  30 +++++
 5 files changed, 232 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-8.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-9.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-8.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-9.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index e1b85a5af91..fb970344521 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -836,6 +836,46 @@ emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
   emit_vlmax_masked_mu_insn (icode, RVV_BINOP_MU, ops);
 }
 
+/* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
+   https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
+
+  There is no inverse vdecompress provided, as this operation can be readily
+  synthesized using iota and a masked vrgather:
+
+      Desired functionality of 'vdecompress'
+	7 6 5 4 3 2 1 0     # vid
+
+	      e d c b a     # packed vector of 5 elements
+	1 0 0 1 1 1 0 1     # mask vector of 8 elements
+	p q r s t u v w     # destination register before vdecompress
+
+	e q r d c b v a     # result of vdecompress
+       # v0 holds mask
+       # v1 holds packed data
+       # v11 holds input expanded vector and result
+       viota.m v10, v0                 # Calc iota from mask in v0
+       vrgather.vv v11, v1, v10, v0.t  # Expand into destination
+     p q r s t u v w  # v11 destination register
+	   e d c b a  # v1 source vector
+     1 0 0 1 1 1 0 1  # v0 mask vector
+
+     4 4 4 3 2 1 1 0  # v10 result of viota.m
+     e q r d c b v a  # v11 destination after vrgather using viota.m under mask
+*/
+static void
+emit_vlmax_decompress_insn (rtx target, rtx op, rtx mask)
+{
+  machine_mode data_mode = GET_MODE (target);
+  machine_mode sel_mode = related_int_vector_mode (data_mode).require ();
+  if (GET_MODE_INNER (data_mode) == QImode)
+    sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require ();
+
+  rtx sel = gen_reg_rtx (sel_mode);
+  rtx iota_ops[] = {sel, mask};
+  emit_vlmax_insn (code_for_pred_iota (sel_mode), RVV_UNOP, iota_ops);
+  emit_vlmax_masked_gather_mu_insn (target, op, sel, mask);
+}
+
 /* Emit merge instruction.  */
 
 static machine_mode
@@ -2337,6 +2377,75 @@ struct expand_vec_perm_d
   bool testing_p;
 };
 
+/* Recognize decompress patterns:
+
+   1. VEC_PERM_EXPR op0 and op1
+      with isel = { 0, nunits, 1, nunits + 1, ... }.
+      Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
+
+   2. VEC_PERM_EXPR op0 and op1
+      with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
+      Slide down op0 and op1 with OFFSET = 1/2 nunits.
+      Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
+*/
+static bool
+shuffle_decompress_patterns (struct expand_vec_perm_d *d)
+{
+  poly_uint64 nelt = d->perm.length ();
+  machine_mode mask_mode = get_mask_mode (d->vmode).require ();
+
+  /* For constant size indices, we dont't need to handle it here.
+     Just leave it to vec_perm<mode>.  */
+  if (d->perm.length ().is_constant ())
+    return false;
+
+  poly_uint64 first = d->perm[0];
+  if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
+      || !d->perm.series_p (0, 2, first, 1)
+      || !d->perm.series_p (1, 2, first + nelt, 1))
+    return false;
+
+  /* Permuting two SEW8 variable-length vectors need vrgatherei16.vv.
+     Otherwise, it could overflow the index range.  */
+  machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
+  if (GET_MODE_INNER (d->vmode) == QImode
+      && !get_vector_mode (HImode, nelt).exists (&sel_mode))
+    return false;
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  rtx op0, op1;
+  if (known_eq (first, 0U))
+    {
+      op0 = d->op0;
+      op1 = d->op1;
+    }
+  else
+    {
+      op0 = gen_reg_rtx (d->vmode);
+      op1 = gen_reg_rtx (d->vmode);
+      insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
+      rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
+      rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
+      emit_vlmax_insn (icode, RVV_BINOP, ops0);
+      emit_vlmax_insn (icode, RVV_BINOP, ops1);
+    }
+  /* Generate { 0, 1, .... } mask.  */
+  rtx vid = gen_reg_rtx (sel_mode);
+  rtx vid_repeat = gen_reg_rtx (sel_mode);
+  emit_insn (gen_vec_series (sel_mode, vid, const0_rtx, const1_rtx));
+  rtx and_ops[] = {vid_repeat, vid, const1_rtx};
+  emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), RVV_BINOP, and_ops);
+  rtx const_vec = gen_const_vector_dup (sel_mode, 1);
+  rtx mask = gen_reg_rtx (mask_mode);
+  expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
+  emit_move_insn (d->target, op0);
+  emit_vlmax_decompress_insn (d->target, op1, mask);
+  return true;
+}
+
 /* Recognize the pattern that can be shuffled by generic approach.  */
 
 static bool
@@ -2388,6 +2497,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
     {
       if (d->vmode == d->op_mode)
 	{
+	  if (shuffle_decompress_patterns (d))
+	    return true;
 	  if (shuffle_generic_patterns (d))
 	    return true;
 	  return false;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-8.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-8.c
new file mode 100644
index 00000000000..2568d6947a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-8.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param riscv-autovec-preference=scalable -fno-vect-cost-model -fdump-tree-optimized-details" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE)                                                         \
+  TYPE __attribute__ ((noinline, noclone))                                     \
+  vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)                     \
+  {                                                                            \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+	a[i * 2] += b;                                                         \
+	a[i * 2 + 1] += c;                                                     \
+      }                                                                        \
+  }
+
+#define TEST_ALL(T)                                                            \
+  T (int8_t)                                                                   \
+  T (uint8_t)                                                                  \
+  T (int16_t)                                                                  \
+  T (uint16_t)                                                                 \
+  T (int32_t)                                                                  \
+  T (uint32_t)                                                                 \
+  T (int64_t)                                                                  \
+  T (uint64_t)
+
+TEST_ALL (VEC_PERM)
+
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 2 "optimized" } } */
+/* { dg-final { scan-assembler-times {viota.m} 2 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-9.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-9.c
new file mode 100644
index 00000000000..d410e57adbd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-9.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param riscv-autovec-preference=scalable -fno-vect-cost-model -fdump-tree-optimized-details" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE)                                                         \
+  TYPE __attribute__ ((noinline, noclone))                                     \
+  vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)                     \
+  {                                                                            \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+	a[i * 4] += b;                                                         \
+	a[i * 4 + 1] += c;                                                     \
+	a[i * 4 + 2] += b;                                                     \
+	a[i * 4 + 3] += c;                                                     \
+      }                                                                        \
+  }
+
+#define TEST_ALL(T)                                                            \
+  T (int8_t)                                                                   \
+  T (uint8_t)                                                                  \
+  T (int16_t)                                                                  \
+  T (uint16_t)                                                                 \
+  T (int32_t)                                                                  \
+  T (uint32_t)                                                                 \
+  T (int64_t)                                                                  \
+  T (uint64_t)
+
+TEST_ALL (VEC_PERM)
+
+/* { dg-final { scan-assembler-times {viota.m} 2 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-8.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-8.c
new file mode 100644
index 00000000000..39ae513812b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-8.c
@@ -0,0 +1,30 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param riscv-autovec-preference=scalable -fno-vect-cost-model" } */
+
+#include "slp-8.c"
+
+#define N (103 * 2)
+
+#define HARNESS(TYPE)						\
+  {								\
+    TYPE a[N], b[2] = { 3, 11 };				\
+    for (unsigned int i = 0; i < N; ++i)			\
+      {								\
+	a[i] = i * 2 + i % 5;					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    vec_slp_##TYPE (a, b[0], b[1], N / 2);			\
+    for (unsigned int i = 0; i < N; ++i)			\
+      {								\
+	TYPE orig = i * 2 + i % 5;				\
+	TYPE expected = orig + b[i % 2];			\
+	if (a[i] != expected)					\
+	  __builtin_abort ();					\
+      }								\
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST_ALL (HARNESS)
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-9.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-9.c
new file mode 100644
index 00000000000..791cfbc2b47
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-9.c
@@ -0,0 +1,30 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param riscv-autovec-preference=scalable -fno-vect-cost-model" } */
+
+#include "slp-9.c"
+
+#define N (103 * 4)
+
+#define HARNESS(TYPE)						\
+  {								\
+    TYPE a[N], b[2] = { 3, 11 };				\
+    for (unsigned int i = 0; i < N; ++i)			\
+      {								\
+	a[i] = i * 2 + i % 5;					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    vec_slp_##TYPE (a, b[0], b[1], N / 4);			\
+    for (unsigned int i = 0; i < N; ++i)			\
+      {								\
+	TYPE orig = i * 2 + i % 5;				\
+	TYPE expected = orig + b[i % 2];			\
+	if (a[i] != expected)					\
+	  __builtin_abort ();					\
+      }								\
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST_ALL (HARNESS)
+}
-- 
2.36.1


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH V2] RISC-V: Enhance RVV VLA SLP auto-vectorization with decompress operation
  2023-06-12 15:11 [PATCH V2] RISC-V: Enhance RVV VLA SLP auto-vectorization with decompress operation juzhe.zhong
@ 2023-06-12 19:42 ` Jeff Law
  2023-06-13  1:28   ` Li, Pan2
  0 siblings, 1 reply; 3+ messages in thread
From: Jeff Law @ 2023-06-12 19:42 UTC (permalink / raw)
  To: juzhe.zhong, gcc-patches; +Cc: kito.cheng, palmer, rdapp.gcc



On 6/12/23 09:11, juzhe.zhong@rivai.ai wrote:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> 
> According to RVV ISA:
> https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
> 
> We can enhance VLA SLP auto-vectorization with (16.5.1. Synthesizing vdecompress)
> Decompress operation.
> 
> Case 1 (nunits = POLY_INT_CST [16, 16]):
> _48 = VEC_PERM_EXPR <_37, _35, { 0, POLY_INT_CST [16, 16], 1, POLY_INT_CST [17, 16], 2, POLY_INT_CST [18, 16], ... }>;
> We can optimize such VLA SLP permuation pattern into:
> _48 = vdecompress (_37, _35, mask = { 0, 1, 0, 1, ... };
> 
> Case 2 (nunits = POLY_INT_CST [16, 16]):
> _23 = VEC_PERM_EXPR <_46, _44, { POLY_INT_CST [1, 1], POLY_INT_CST [3, 3], POLY_INT_CST [2, 1], POLY_INT_CST [4, 3], POLY_INT_CST [3, 1], POLY_INT_CST [5, 3], ... }>;
> We can optimize such VLA SLP permuation pattern into:
> _48 = vdecompress (slidedown(_46, 1/2 nunits), slidedown(_44, 1/2 nunits), mask = { 0, 1, 0, 1, ... };
> 
> For example:
> void __attribute__ ((noinline, noclone))
> vec_slp (uint64_t *restrict a, uint64_t b, uint64_t c, int n)
> {
>    for (int i = 0; i < n; ++i)
>      {
>        a[i * 2] += b;
>        a[i * 2 + 1] += c;
>      }
> }
> 
> ASM:
> ...
>          vid.v   v0
>          vand.vi v0,v0,1
>          vmseq.vi        v0,v0,1  ===> mask = { 0, 1, 0, 1, ... }
> vdecompress:
>          viota.m v3,v0
>          vrgather.vv     v2,v1,v3,v0.t
> Loop:
>          vsetvli zero,a5,e64,m1,ta,ma
>          vle64.v v1,0(a0)
>          vsetvli a6,zero,e64,m1,ta,ma
>          vadd.vv v1,v2,v1
>          vsetvli zero,a5,e64,m1,ta,ma
>          mv      a5,a3
>          vse64.v v1,0(a0)
>          add     a3,a3,a1
>          add     a0,a0,a2
>          bgtu    a5,a4,.L4
> 
> 
> gcc/ChangeLog:
> 
>          * config/riscv/riscv-v.cc (emit_vlmax_decompress_insn): New function.
>          (shuffle_decompress_patterns): New function.
>          (expand_vec_perm_const_1): Add decompress optimization.
> 
> gcc/testsuite/ChangeLog:
> 
>          * gcc.target/riscv/rvv/autovec/partial/slp-8.c: New test.
>          * gcc.target/riscv/rvv/autovec/partial/slp-9.c: New test.
>          * gcc.target/riscv/rvv/autovec/partial/slp_run-8.c: New test.
>          * gcc.target/riscv/rvv/autovec/partial/slp_run-9.c: New test.
I've been wanting to get inside expand_vec_perm_const to see what 
opportunities might exist to improve code in there.  We had good success 
mining this space at a prior employer.  While we had a lot of weird 
idioms and costs to consider it was well worth the time.

So quite happy to see you diving into this code.

OK for the trunk,
Jeff


^ permalink raw reply	[flat|nested] 3+ messages in thread

* RE: [PATCH V2] RISC-V: Enhance RVV VLA SLP auto-vectorization with decompress operation
  2023-06-12 19:42 ` Jeff Law
@ 2023-06-13  1:28   ` Li, Pan2
  0 siblings, 0 replies; 3+ messages in thread
From: Li, Pan2 @ 2023-06-13  1:28 UTC (permalink / raw)
  To: Jeff Law, juzhe.zhong, gcc-patches; +Cc: kito.cheng, palmer, rdapp.gcc

Committed, thanks Jeff.

Pan

-----Original Message-----
From: Gcc-patches <gcc-patches-bounces+pan2.li=intel.com@gcc.gnu.org> On Behalf Of Jeff Law via Gcc-patches
Sent: Tuesday, June 13, 2023 3:43 AM
To: juzhe.zhong@rivai.ai; gcc-patches@gcc.gnu.org
Cc: kito.cheng@sifive.com; palmer@rivosinc.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH V2] RISC-V: Enhance RVV VLA SLP auto-vectorization with decompress operation



On 6/12/23 09:11, juzhe.zhong@rivai.ai wrote:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> 
> According to RVV ISA:
> https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
> 
> We can enhance VLA SLP auto-vectorization with (16.5.1. Synthesizing 
> vdecompress) Decompress operation.
> 
> Case 1 (nunits = POLY_INT_CST [16, 16]):
> _48 = VEC_PERM_EXPR <_37, _35, { 0, POLY_INT_CST [16, 16], 1, 
> POLY_INT_CST [17, 16], 2, POLY_INT_CST [18, 16], ... }>; We can optimize such VLA SLP permuation pattern into:
> _48 = vdecompress (_37, _35, mask = { 0, 1, 0, 1, ... };
> 
> Case 2 (nunits = POLY_INT_CST [16, 16]):
> _23 = VEC_PERM_EXPR <_46, _44, { POLY_INT_CST [1, 1], POLY_INT_CST [3, 
> 3], POLY_INT_CST [2, 1], POLY_INT_CST [4, 3], POLY_INT_CST [3, 1], POLY_INT_CST [5, 3], ... }>; We can optimize such VLA SLP permuation pattern into:
> _48 = vdecompress (slidedown(_46, 1/2 nunits), slidedown(_44, 1/2 
> nunits), mask = { 0, 1, 0, 1, ... };
> 
> For example:
> void __attribute__ ((noinline, noclone)) vec_slp (uint64_t *restrict 
> a, uint64_t b, uint64_t c, int n) {
>    for (int i = 0; i < n; ++i)
>      {
>        a[i * 2] += b;
>        a[i * 2 + 1] += c;
>      }
> }
> 
> ASM:
> ...
>          vid.v   v0
>          vand.vi v0,v0,1
>          vmseq.vi        v0,v0,1  ===> mask = { 0, 1, 0, 1, ... }
> vdecompress:
>          viota.m v3,v0
>          vrgather.vv     v2,v1,v3,v0.t
> Loop:
>          vsetvli zero,a5,e64,m1,ta,ma
>          vle64.v v1,0(a0)
>          vsetvli a6,zero,e64,m1,ta,ma
>          vadd.vv v1,v2,v1
>          vsetvli zero,a5,e64,m1,ta,ma
>          mv      a5,a3
>          vse64.v v1,0(a0)
>          add     a3,a3,a1
>          add     a0,a0,a2
>          bgtu    a5,a4,.L4
> 
> 
> gcc/ChangeLog:
> 
>          * config/riscv/riscv-v.cc (emit_vlmax_decompress_insn): New function.
>          (shuffle_decompress_patterns): New function.
>          (expand_vec_perm_const_1): Add decompress optimization.
> 
> gcc/testsuite/ChangeLog:
> 
>          * gcc.target/riscv/rvv/autovec/partial/slp-8.c: New test.
>          * gcc.target/riscv/rvv/autovec/partial/slp-9.c: New test.
>          * gcc.target/riscv/rvv/autovec/partial/slp_run-8.c: New test.
>          * gcc.target/riscv/rvv/autovec/partial/slp_run-9.c: New test.
I've been wanting to get inside expand_vec_perm_const to see what opportunities might exist to improve code in there.  We had good success mining this space at a prior employer.  While we had a lot of weird idioms and costs to consider it was well worth the time.

So quite happy to see you diving into this code.

OK for the trunk,
Jeff


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-06-13  1:29 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-12 15:11 [PATCH V2] RISC-V: Enhance RVV VLA SLP auto-vectorization with decompress operation juzhe.zhong
2023-06-12 19:42 ` Jeff Law
2023-06-13  1:28   ` Li, Pan2

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).