From: Jeff Law
To: gcc-cvs@gcc.gnu.org
Subject: [gcc(refs/vendors/riscv/heads/gcc-13-with-riscv-opts)] RISC-V: Enhance RVV VLA SLP auto-vectorization with decompress operation
X-Act-Checkin: gcc
X-Git-Author: Juzhe-Zhong
X-Git-Refname: refs/vendors/riscv/heads/gcc-13-with-riscv-opts
X-Git-Oldrev: a97440f879f5f3c8cdcaa1dc443a8a7512d74347
X-Git-Newrev: 0639fd5c8b68497e76739e6b4278913b0748bb51
Message-Id: <20230619114045.3CAF93858D28@sourceware.org>
Date: Mon, 19 Jun 2023 11:40:45 +0000 (GMT)
List-Id:

https://gcc.gnu.org/g:0639fd5c8b68497e76739e6b4278913b0748bb51

commit 0639fd5c8b68497e76739e6b4278913b0748bb51
Author: Juzhe-Zhong
Date:   Mon Jun 12 23:11:07 2023 +0800

    RISC-V: Enhance RVV VLA SLP auto-vectorization with decompress operation

    According to the RVV ISA:
    https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
    we can enhance VLA SLP auto-vectorization with the decompress operation
    (16.5.1. Synthesizing vdecompress).

    Case 1 (nunits = POLY_INT_CST [16, 16]):
      _48 = VEC_PERM_EXPR <_37, _35, { 0, POLY_INT_CST [16, 16], 1, POLY_INT_CST [17, 16], 2, POLY_INT_CST [18, 16], ... }>;

    We can optimize such a VLA SLP permutation pattern into:
      _48 = vdecompress (_37, _35, mask = { 0, 1, 0, 1, ... });

    Case 2 (nunits = POLY_INT_CST [16, 16]):
      _23 = VEC_PERM_EXPR <_46, _44, { POLY_INT_CST [1, 1], POLY_INT_CST [3, 3], POLY_INT_CST [2, 1], POLY_INT_CST [4, 3], POLY_INT_CST [3, 1], POLY_INT_CST [5, 3], ... }>;

    We can optimize such a VLA SLP permutation pattern into:
      _23 = vdecompress (slidedown (_46, 1/2 nunits), slidedown (_44, 1/2 nunits), mask = { 0, 1, 0, 1, ... });

    For example:

      void __attribute__ ((noinline, noclone))
      vec_slp (uint64_t *restrict a, uint64_t b, uint64_t c, int n)
      {
        for (int i = 0; i < n; ++i)
          {
            a[i * 2] += b;
            a[i * 2 + 1] += c;
          }
      }

    ASM:
      ...
      vid.v       v0
      vand.vi     v0,v0,1
      vmseq.vi    v0,v0,1          ===> mask = { 0, 1, 0, 1, ... }
      vdecompress:
      viota.m     v3,v0
      vrgather.vv v2,v1,v3,v0.t
    Loop:
      vsetvli     zero,a5,e64,m1,ta,ma
      vle64.v     v1,0(a0)
      vsetvli     a6,zero,e64,m1,ta,ma
      vadd.vv     v1,v2,v1
      vsetvli     zero,a5,e64,m1,ta,ma
      mv          a5,a3
      vse64.v     v1,0(a0)
      add         a3,a3,a1
      add         a0,a0,a2
      bgtu        a5,a4,.L4

    gcc/ChangeLog:

            * config/riscv/riscv-v.cc (emit_vlmax_decompress_insn): New function.
            (shuffle_decompress_patterns): New function.
            (expand_vec_perm_const_1): Add decompress optimization.

    gcc/testsuite/ChangeLog:

            * gcc.target/riscv/rvv/autovec/partial/slp-8.c: New test.
            * gcc.target/riscv/rvv/autovec/partial/slp-9.c: New test.
            * gcc.target/riscv/rvv/autovec/partial/slp_run-8.c: New test.
            * gcc.target/riscv/rvv/autovec/partial/slp_run-9.c: New test.
Diff:
---
 gcc/config/riscv/riscv-v.cc                        | 111 +++++++++++++++++++++
 .../gcc.target/riscv/rvv/autovec/partial/slp-8.c   |  30 ++++++
 .../gcc.target/riscv/rvv/autovec/partial/slp-9.c   |  31 ++++++
 .../riscv/rvv/autovec/partial/slp_run-8.c          |  30 ++++++
 .../riscv/rvv/autovec/partial/slp_run-9.c          |  30 ++++++
 5 files changed, 232 insertions(+)

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index e1b85a5af91..fb970344521 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -836,6 +836,46 @@ emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
   emit_vlmax_masked_mu_insn (icode, RVV_BINOP_MU, ops);
 }
 
+/* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
+   https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
+
+   There is no inverse vdecompress provided, as this operation can be readily
+   synthesized using iota and a masked vrgather:
+
+   Desired functionality of 'vdecompress'
+   7 6 5 4 3 2 1 0     # vid
+
+         e d c b a     # packed vector of 5 elements
+   1 0 0 1 1 1 0 1     # mask vector of 8 elements
+   p q r s t u v w     # destination register before vdecompress
+
+   e q r d c b v a     # result of vdecompress
+                                    # v0 holds mask
+                                    # v1 holds packed data
+                                    # v11 holds input expanded vector and result
+   viota.m v10, v0                  # Calc iota from mask in v0
+   vrgather.vv v11, v1, v10, v0.t   # Expand into destination
+   p q r s t u v w                  # v11 destination register
+         e d c b a                  # v1 source vector
+   1 0 0 1 1 1 0 1                  # v0 mask vector
+
+   4 4 4 3 2 1 1 0                  # v10 result of viota.m
+   e q r d c b v a                  # v11 destination after vrgather using viota.m under mask
+*/
+static void
+emit_vlmax_decompress_insn (rtx target, rtx op, rtx mask)
+{
+  machine_mode data_mode = GET_MODE (target);
+  machine_mode sel_mode = related_int_vector_mode (data_mode).require ();
+  if (GET_MODE_INNER (data_mode) == QImode)
+    sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require ();
+
+  rtx sel = gen_reg_rtx (sel_mode);
+  rtx iota_ops[] = {sel, mask};
+  emit_vlmax_insn (code_for_pred_iota (sel_mode), RVV_UNOP, iota_ops);
+  emit_vlmax_masked_gather_mu_insn (target, op, sel, mask);
+}
+
 /* Emit merge instruction.  */
 
 static machine_mode
@@ -2337,6 +2377,75 @@ struct expand_vec_perm_d
   bool testing_p;
 };
 
+/* Recognize decompress patterns:
+
+   1. VEC_PERM_EXPR op0 and op1
+      with isel = { 0, nunits, 1, nunits + 1, ... }.
+      Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
+
+   2. VEC_PERM_EXPR op0 and op1
+      with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
+      Slide down op0 and op1 with OFFSET = 1/2 nunits.
+      Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
+*/
+static bool
+shuffle_decompress_patterns (struct expand_vec_perm_d *d)
+{
+  poly_uint64 nelt = d->perm.length ();
+  machine_mode mask_mode = get_mask_mode (d->vmode).require ();
+
+  /* For constant size indices, we don't need to handle them here.
+     Just leave them to vec_perm.  */
+  if (d->perm.length ().is_constant ())
+    return false;
+
+  poly_uint64 first = d->perm[0];
+  if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
+      || !d->perm.series_p (0, 2, first, 1)
+      || !d->perm.series_p (1, 2, first + nelt, 1))
+    return false;
+
+  /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
+     Otherwise, it could overflow the index range.  */
+  machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
+  if (GET_MODE_INNER (d->vmode) == QImode
+      && !get_vector_mode (HImode, nelt).exists (&sel_mode))
+    return false;
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  rtx op0, op1;
+  if (known_eq (first, 0U))
+    {
+      op0 = d->op0;
+      op1 = d->op1;
+    }
+  else
+    {
+      op0 = gen_reg_rtx (d->vmode);
+      op1 = gen_reg_rtx (d->vmode);
+      insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
+      rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
+      rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
+      emit_vlmax_insn (icode, RVV_BINOP, ops0);
+      emit_vlmax_insn (icode, RVV_BINOP, ops1);
+    }
+
+  /* Generate { 0, 1, 0, 1, ... } mask.  */
+  rtx vid = gen_reg_rtx (sel_mode);
+  rtx vid_repeat = gen_reg_rtx (sel_mode);
+  emit_insn (gen_vec_series (sel_mode, vid, const0_rtx, const1_rtx));
+  rtx and_ops[] = {vid_repeat, vid, const1_rtx};
+  emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), RVV_BINOP, and_ops);
+  rtx const_vec = gen_const_vector_dup (sel_mode, 1);
+  rtx mask = gen_reg_rtx (mask_mode);
+  expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
+
+  emit_move_insn (d->target, op0);
+  emit_vlmax_decompress_insn (d->target, op1, mask);
+  return true;
+}
+
 /* Recognize the pattern that can be shuffled by generic approach.  */
 
 static bool
@@ -2388,6 +2497,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
   if (d->vmode == d->op_mode)
     {
+      if (shuffle_decompress_patterns (d))
+	return true;
       if (shuffle_generic_patterns (d))
	return true;
       return false;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-8.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-8.c
new file mode 100644
index 00000000000..2568d6947a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-8.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param riscv-autovec-preference=scalable -fno-vect-cost-model -fdump-tree-optimized-details" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE)						\
+  TYPE __attribute__ ((noinline, noclone))			\
+  vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)	\
+  {								\
+    for (int i = 0; i < n; ++i)					\
+      {								\
+	a[i * 2] += b;						\
+	a[i * 2 + 1] += c;					\
+      }								\
+  }
+
+#define TEST_ALL(T)	\
+  T (int8_t)		\
+  T (uint8_t)		\
+  T (int16_t)		\
+  T (uint16_t)		\
+  T (int32_t)		\
+  T (uint32_t)		\
+  T (int64_t)		\
+  T (uint64_t)
+
+TEST_ALL (VEC_PERM)
+
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 2 "optimized" } } */
+/* { dg-final { scan-assembler-times {viota.m} 2 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-9.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-9.c
new file mode 100644
index 00000000000..d410e57adbd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-9.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param riscv-autovec-preference=scalable -fno-vect-cost-model -fdump-tree-optimized-details" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE)						\
+  TYPE __attribute__ ((noinline, noclone))			\
+  vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)	\
+  {								\
+    for (int i = 0; i < n; ++i)					\
+      {								\
+	a[i * 4] += b;						\
+	a[i * 4 + 1] += c;					\
+	a[i * 4 + 2] += b;					\
+	a[i * 4 + 3] += c;					\
+      }								\
+  }
+
+#define TEST_ALL(T)	\
+  T (int8_t)		\
+  T (uint8_t)		\
+  T (int16_t)		\
+  T (uint16_t)		\
+  T (int32_t)		\
+  T (uint32_t)		\
+  T (int64_t)		\
+  T (uint64_t)
+
+TEST_ALL (VEC_PERM)
+
+/* { dg-final { scan-assembler-times {viota.m} 2 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-8.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-8.c
new file mode 100644
index 00000000000..39ae513812b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-8.c
@@ -0,0 +1,30 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param riscv-autovec-preference=scalable -fno-vect-cost-model" } */
+
+#include "slp-8.c"
+
+#define N (103 * 2)
+
+#define HARNESS(TYPE)						\
+  {								\
+    TYPE a[N], b[2] = { 3, 11 };				\
+    for (unsigned int i = 0; i < N; ++i)			\
+      {								\
+	a[i] = i * 2 + i % 5;					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    vec_slp_##TYPE (a, b[0], b[1], N / 2);			\
+    for (unsigned int i = 0; i < N; ++i)			\
+      {								\
+	TYPE orig = i * 2 + i % 5;				\
+	TYPE expected = orig + b[i % 2];			\
+	if (a[i] != expected)					\
+	  __builtin_abort ();					\
+      }								\
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST_ALL (HARNESS)
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-9.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-9.c
new file mode 100644
index 00000000000..791cfbc2b47
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-9.c
@@ -0,0 +1,30 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param riscv-autovec-preference=scalable -fno-vect-cost-model" } */
+
+#include "slp-9.c"
+
+#define N (103 * 4)
+
+#define HARNESS(TYPE)						\
+  {								\
+    TYPE a[N], b[2] = { 3, 11 };				\
+    for (unsigned int i = 0; i < N; ++i)			\
+      {								\
+	a[i] = i * 2 + i % 5;					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    vec_slp_##TYPE (a, b[0], b[1], N / 4);			\
+    for (unsigned int i = 0; i < N; ++i)			\
+      {								\
+	TYPE orig = i * 2 + i % 5;				\
+	TYPE expected = orig + b[i % 2];			\
+	if (a[i] != expected)					\
+	  __builtin_abort ();					\
+      }								\
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST_ALL (HARNESS)
+}