From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
To: gcc-patches@gcc.gnu.org
Cc: kito.cheng@gmail.com, kito.cheng@sifive.com,
jeffreyalaw@gmail.com, rdapp.gcc@gmail.com,
Juzhe-Zhong <juzhe.zhong@rivai.ai>
Subject: [PATCH] RISC-V: Optimize VLA SLP with duplicate VLA shuffle indices
Date: Fri, 17 Nov 2023 12:43:19 +0800
Message-ID: <20231117044319.3912782-1-juzhe.zhong@rivai.ai>
While evaluating dynamic LMUL, we noticed we can do better on VLA SLP with duplicated VLA shuffle indices.
Consider the following case:
void
foo (uint16_t *restrict a, uint16_t *restrict b, int n)
{
for (int i = 0; i < n; ++i)
{
a[i * 8] = b[i * 8 + 3] + 1;
a[i * 8 + 1] = b[i * 8 + 6] + 1;
a[i * 8 + 2] = b[i * 8 + 0] + 1;
a[i * 8 + 3] = b[i * 8 + 2] + 1;
a[i * 8 + 4] = b[i * 8 + 1] + 1;
a[i * 8 + 5] = b[i * 8 + 7] + 1;
a[i * 8 + 6] = b[i * 8 + 5] + 1;
a[i * 8 + 7] = b[i * 8 + 4] + 1;
}
}
We want to generate the following indices:
{ 3, 6, 0, 2, 1, 7, 5, 4, 11, 14, 8, 10, 9, 15, 13, 12, 19, 22, 16, 18, 17, 23, 21, 20, ... }
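These indices are simply the element position plus a repeating per-lane offset. A minimal scalar sketch of that decomposition (illustration only; the helper name build_indices is hypothetical):

#include <stdint.h>

/* idx[i] = i + offset[i & 7], where offset[k] = perm[k] - k for the
   permutation { 3, 6, 0, 2, 1, 7, 5, 4 } used in the loop above.  */
static void
build_indices (uint16_t *idx, int n)
{
  static const int16_t offset[8] = { 3, 5, -2, -1, -3, 2, -1, -3 };
  for (int i = 0; i < n; i++)
    idx[i] = (uint16_t) (i + offset[i & 7]); /* 3, 6, 0, 2, 1, 7, 5, 4, 11, ... */
}

This repeating-offset shape is what both code sequences below materialize: vid.v produces i, and a vector holding the repeating offsets is added to it.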
Before this patch, we used pairs of vmseq + vmerge instructions to create such a vector in the prologue:
https://godbolt.org/z/r919WPabK
foo:
ble a2,zero,.L5
vsetvli a5,zero,e16,m8,ta,ma
vid.v v16
vand.vi v24,v16,7
vmseq.vi v0,v24,1
vmseq.vi v1,v24,2
vmv.v.i v8,3
vmerge.vim v8,v8,5,v0
vmv1r.v v0,v1
vmseq.vi v2,v24,3
vmerge.vim v8,v8,-2,v0
vmv1r.v v0,v2
vmseq.vi v1,v24,4
csrr a4,vlenb
vmerge.vim v8,v8,-1,v0
csrr a6,vlenb
vmv1r.v v0,v1
vmseq.vi v2,v24,5
slli a3,a4,2
vmerge.vim v8,v8,-3,v0
slli a6,a6,2
vmv1r.v v0,v2
vmseq.vi v1,v24,6
vmerge.vim v8,v8,2,v0
addi a6,a6,-1
vmv1r.v v0,v1
slli a2,a2,3
slli a4,a4,3
neg a7,a3
vmseq.vi v2,v24,7
vmerge.vim v8,v8,-1,v0
vmv.v.x v24,a6
vmv1r.v v0,v2
vmerge.vim v8,v8,-3,v0
vadd.vv v16,v16,v8
vand.vv v24,v16,v24
.L3:
minu a5,a2,a3
vsetvli zero,a5,e16,m8,ta,ma
mv a6,a2
vle16.v v16,0(a1)
vrgather.vv v8,v16,v24
vadd.vi v8,v8,1
vse16.v v8,0(a0)
add a1,a1,a4
add a0,a0,a4
add a2,a2,a7
bgtu a6,a3,.L3
.L5:
ret
After this patch:
foo:
ble a2,zero,.L5
li a6,-536875008
slli a6,a6,4
addi a6,a6,3
csrr a4,vlenb
csrr a5,vlenb
li t1,-536850432
slli a6,a6,16
slli a3,a4,1
addi a6,a6,-3
slli a5,a5,1
slli t1,t1,4
vsetvli t3,zero,e64,m4,ta,ma
addi a5,a5,-1
vmv.v.x v4,a6
addi t1,t1,3
slli a2,a2,3
slli a4,a4,2
neg a7,a3
vslide1up.vx v16,v4,t1
vsetvli a6,zero,e16,m4,ta,ma
vid.v v12
vmv.v.x v4,a5
vand.vi v20,v12,7
vrgather.vv v8,v16,v20
vadd.vv v12,v12,v8
vand.vv v12,v12,v4
.L3:
minu a5,a2,a3
vsetvli zero,a5,e16,m4,ta,ma
mv a6,a2
vle16.v v8,0(a1)
vrgather.vv v4,v8,v12
vadd.vi v4,v4,1
vse16.v v4,0(a0)
add a1,a1,a4
add a0,a0,a4
add a2,a2,a7
bgtu a6,a3,.L3
.L5:
ret
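The new prologue pre-packs the repeating 16-bit offsets { 3, 5, -2, -1, -3, 2, -1, -3 } into two 64-bit scalars (built with li/slli/addi and materialized with vmv.v.x + vslide1up.vx), then expands them with a vrgather. A rough scalar sketch of the packing performed by the new rvv_builder::merge_pattern (illustration only; the helper name pack_elements is hypothetical):

#include <stdint.h>

/* Pack ELE_NUM narrow elements of BITS bits each into one 64-bit value,
   least-significant element first, mirroring the AND/ASHIFT/IOR sequence
   in merge_pattern.  */
static uint64_t
pack_elements (const int16_t *elts, unsigned ele_num, unsigned bits)
{
  uint64_t val = 0;
  for (unsigned k = 0; k < ele_num; k++)
    val |= ((uint64_t) elts[k] & ((1ULL << bits) - 1)) << (bits * k);
  return val;
}

For example, pack_elements ((int16_t[]){ 3, 5, -2, -1 }, 4, 16) and pack_elements ((int16_t[]){ -3, 2, -1, -3 }, 4, 16) correspond to the two 64-bit constants built in t1 and a6 above.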
Optimization:
1. Reduces the number of dynamic instructions in the prologue by 9.
2. Fewer vector instructions reduce hardware pipeline consumption.
gcc/ChangeLog:
* config/riscv/riscv-v.cc (rvv_builder::merge_pattern): New function.
(expand_vector_init_trailing_same_elem): Adapt function.
(expand_const_vector): Ditto.
(expand_vec_init): Ditto.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c: Adapt test.
* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c: Ditto.
* gcc.dg/vect/costmodel/riscv/rvv/slp-1.c: New test.
---
gcc/config/riscv/riscv-v.cc | 174 +++++++++++++-----
.../costmodel/riscv/rvv/dynamic-lmul4-6.c | 7 +-
.../costmodel/riscv/rvv/dynamic-lmul4-8.c | 7 +-
.../gcc.dg/vect/costmodel/riscv/rvv/slp-1.c | 40 ++++
4 files changed, 174 insertions(+), 54 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/slp-1.c
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 000d0c2c721..6a2009ffb05 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -420,6 +420,7 @@ public:
bool single_step_npatterns_p () const;
bool npatterns_all_equal_p () const;
+ void merge_pattern (rtx);
machine_mode new_mode () const { return m_new_mode; }
scalar_mode inner_mode () const { return m_inner_mode; }
@@ -679,6 +680,50 @@ rvv_builder::npatterns_all_equal_p () const
return true;
}
+/* Merge the pattern to reduce the number of elements we need to process.
+
+ E.g. v = { 0, 1, 2, 3 }, mode = V4SI.
+ Since we can use RVV instructions with EEW = 64, transform it into:
+ v = { 1 << 32 | 0, 3 << 32 | 2 }, mode = V2DI. */
+void
+rvv_builder::merge_pattern (rtx src)
+{
+ if (this->inner_bits_size () >= GET_MODE_BITSIZE (Pmode))
+ return;
+ unsigned int ele_num = GET_MODE_BITSIZE (Pmode) / this->inner_bits_size ();
+ poly_uint64 nunits = exact_div (this->full_nelts (), ele_num);
+ machine_mode mode;
+ if (!get_vector_mode (Pmode, nunits).exists (&mode))
+ return;
+ this->new_vector (mode, this->npatterns () / ele_num,
+ this->nelts_per_pattern ());
+ rtx imm = gen_int_mode ((1ULL << this->inner_bits_size ()) - 1, Pmode);
+ for (unsigned int i = 0; i < this->nelts_per_pattern (); i++)
+ {
+ for (unsigned int j = 0; j < this->npatterns (); j++)
+ {
+ rtx val = const0_rtx;
+ for (unsigned int k = 0; k < ele_num; k++)
+ {
+ rtx e
+ = gen_lowpart (Pmode, CONST_VECTOR_ELT (src, k + j * ele_num));
+ e = expand_simple_binop (Pmode, AND, e, imm, NULL_RTX, false,
+ OPTAB_DIRECT);
+ e = expand_simple_binop (
+ Pmode, ASHIFT, e,
+ gen_int_mode (this->inner_bits_size () * k, Pmode), NULL_RTX,
+ false, OPTAB_DIRECT);
+ val = expand_simple_binop (Pmode, IOR, e, val, NULL_RTX, false,
+ OPTAB_DIRECT);
+ }
+ this->quick_push (val);
+ }
+ }
+ this->finalize ();
+}
+
static unsigned
get_sew (machine_mode mode)
{
@@ -1040,6 +1085,39 @@ expand_vec_series (rtx dest, rtx base, rtx step)
emit_move_insn (dest, result);
}
+/* Subroutine of expand_vec_init and expand_const_vector to handle the case
+ when all trailing elements of the builder are the same.
+ This works as follows:
+ (a) Use the expand_insn interface to broadcast the last vector element
+ into TARGET.
+ (b) Insert the remaining elements into TARGET using vslide1up.
+
+ ??? The heuristic used by expand_vec_init is to do the above only if the
+ number of same trailing elements is greater than leading_ndups, loosely
+ based on the heuristic from mostly_zeros_p. May need fine-tuning. */
+
+static void
+expand_vector_init_trailing_same_elem (rtx target,
+ const rtx_vector_builder &builder,
+ int start, int nelts_reqd)
+{
+ machine_mode mode = GET_MODE (target);
+
+ rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
+ for (int i = start - 1; i >= 0; i--)
+ {
+ unsigned int unspec
+ = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
+ insn_code icode = code_for_pred_slide (unspec, mode);
+ rtx tmp = gen_reg_rtx (mode);
+ rtx ops[] = {tmp, dup, builder.elt (i)};
+ emit_vlmax_insn (icode, BINARY_OP, ops);
+ /* slide1up needs source and dest to be different REGs. */
+ dup = tmp;
+ }
+
+ emit_move_insn (target, dup);
+}
+
static void
expand_const_vector (rtx target, rtx src)
{
@@ -1139,6 +1217,43 @@ expand_const_vector (rtx target, rtx src)
rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
emit_move_insn (target, gen_lowpart (mode, dup));
}
+ /* We don't apply this approach for LMUL = 8 since vrgather.vv doesn't
+ allow the destination to overlap any source register, and a repeating
+ VLA vector is always generated by an addition. Such a VLA constant
+ vector would consume 32 registers at LMUL = 8, causing serious register
+ pressure. */
+ else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT && npatterns > 2)
+ {
+ /* We handle the case where we can't find a vector container to hold
+ an element of bitsize = NPATTERNS * ele_bitsize.
+
+ NPATTERNS = 8, element width = 16
+ v = { 3, 5, -2, -1, -3, 2, -1, -3, ... }
+ Since NPATTERNS * element width = 128, we can't find a container
+ to hold it.
+
+ In this case, we build the NPATTERNS pattern elements with a broadcast
+ plus vslide1up sequence and shuffle them into place with vrgather. */
+
+ /* Generate index = vid & (NPATTERNS - 1) = { 0, 1, ..., 7, 0, 1, ..., 7, ... }. */
+ rtx vid = gen_reg_rtx (builder.int_mode ());
+ expand_vec_series (vid, const0_rtx, const1_rtx);
+ rtx index = gen_reg_rtx (builder.int_mode ());
+ rtx range = gen_int_mode (builder.npatterns () - 1,
+ GET_MODE_INNER (builder.int_mode ()));
+ rtx and_ops[] = {index, vid, range};
+ emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
+ BINARY_OP, and_ops);
+
+ /* Create vector = { 3, 5, -2, -1, -3, 2, -1, -3, X, ... , X }. */
+ builder.merge_pattern (src);
+ rtx init = gen_reg_rtx (builder.mode ());
+ expand_vector_init_trailing_same_elem (init, builder,
+ builder.npatterns () - 1,
+ builder.npatterns ());
+
+ /* Shuffle vector with index. */
+ emit_vlmax_gather_insn (target, gen_lowpart (mode, init), index);
+ }
else
{
/* We handle the case that we can't find a vector containter to hold
@@ -2268,47 +2383,6 @@ expand_vector_init_merge_combine_sequence (rtx target,
emit_vlmax_insn (icode, MERGE_OP, merge_ops);
}
-/* Subroutine of expand_vec_init to handle case
- when all trailing elements of builder are same.
- This works as follows:
- (a) Use expand_insn interface to broadcast last vector element in TARGET.
- (b) Insert remaining elements in TARGET using insr.
-
- ??? The heuristic used is to do above if number of same trailing elements
- is greater than leading_ndups, loosely based on
- heuristic from mostly_zeros_p. May need fine-tuning. */
-
-static bool
-expand_vector_init_trailing_same_elem (rtx target,
- const rtx_vector_builder &builder,
- int nelts_reqd)
-{
- int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
- int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
- machine_mode mode = GET_MODE (target);
-
- if (trailing_ndups > leading_ndups)
- {
- rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
- for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
- {
- unsigned int unspec
- = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
- insn_code icode = code_for_pred_slide (unspec, mode);
- rtx tmp = gen_reg_rtx (mode);
- rtx ops[] = {tmp, dup, builder.elt (i)};
- emit_vlmax_insn (icode, BINARY_OP, ops);
- /* slide1up need source and dest to be different REG. */
- dup = tmp;
- }
-
- emit_move_insn (target, dup);
- return true;
- }
-
- return false;
-}
-
/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
void
@@ -2378,11 +2452,19 @@ expand_vec_init (rtx target, rtx vals)
/* Optimize trailing same elements sequence:
v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x}; */
- if (!expand_vector_init_trailing_same_elem (target, v, nelts))
- /* Handle common situation by vslide1down. This function can handle any
- situation of vec_init<mode>. Only the cases that are not optimized above
- will fall through here. */
- expand_vector_init_insert_elems (target, v, nelts);
+ int leading_ndups = v.count_dups (0, nelts - 1, 1);
+ int trailing_ndups = v.count_dups (nelts - 1, -1, -1);
+ if (trailing_ndups > leading_ndups)
+ {
+ expand_vector_init_trailing_same_elem (target, v, nelts - trailing_ndups,
+ nelts);
+ return;
+ }
+
+ /* Handle common situation by vslide1down. This function can handle any
+ situation of vec_init<mode>. Only the cases that are not optimized above
+ will fall through here. */
+ expand_vector_init_insert_elems (target, v, nelts);
}
/* Get insn code for corresponding comparison. */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c
index c7b88635923..d3149052b13 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c
@@ -4,7 +4,7 @@
#include <stdint-gcc.h>
void
-foo (uint8_t *restrict a, uint8_t *restrict b, int n)
+foo (uint16_t *restrict a, uint16_t *restrict b, int n)
{
for (int i = 0; i < n; ++i)
{
@@ -19,9 +19,8 @@ foo (uint8_t *restrict a, uint8_t *restrict b, int n)
}
}
-/* { dg-final { scan-assembler {e8,m4} } } */
-/* { dg-final { scan-assembler-times {csrr} 1 } } */
-/* { dg-final { scan-tree-dump-not "Maximum lmul = 8" "vect" } } */
+/* { dg-final { scan-assembler {e16,m4} } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
/* { dg-final { scan-tree-dump-not "Maximum lmul = 2" "vect" } } */
/* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c
index 4d9175e86f8..4c428f33a79 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c
@@ -4,7 +4,7 @@
#include <stdint-gcc.h>
void
-foo (uint8_t *restrict a, uint8_t *restrict b, int n)
+foo (uint16_t *restrict a, uint16_t *restrict b, int n)
{
for (int i = 0; i < n; ++i)
{
@@ -28,9 +28,8 @@ foo (uint8_t *restrict a, uint8_t *restrict b, int n)
}
}
-/* { dg-final { scan-assembler {e8,m4} } } */
-/* { dg-final { scan-assembler-times {csrr} 1 } } */
-/* { dg-final { scan-tree-dump-not "Maximum lmul = 8" "vect" } } */
+/* { dg-final { scan-assembler {e16,m4} } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
/* { dg-final { scan-tree-dump-not "Maximum lmul = 2" "vect" } } */
/* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/slp-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/slp-1.c
new file mode 100644
index 00000000000..23bf59b0cb3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/slp-1.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic -fno-schedule-insns -fdump-tree-vect-details" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+
+/*
+** foo:
+** ...
+** vslide1up\.vx\s+v[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+
+** ...
+** vrgather\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+** ...
+** vle16\.v\s+v8,\s*0\([a-x0-9]+\)
+** ...
+** vrgather\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+** ...
+*/
+void
+foo (uint16_t *restrict a, uint16_t *restrict b, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ a[i * 8] = b[i * 8 + 3] + 1;
+ a[i * 8 + 1] = b[i * 8 + 6] + 1;
+ a[i * 8 + 2] = b[i * 8 + 0] + 1;
+ a[i * 8 + 3] = b[i * 8 + 2] + 1;
+ a[i * 8 + 4] = b[i * 8 + 1] + 1;
+ a[i * 8 + 5] = b[i * 8 + 7] + 1;
+ a[i * 8 + 6] = b[i * 8 + 5] + 1;
+ a[i * 8 + 7] = b[i * 8 + 4] + 1;
+ }
+}
+
+/* { dg-final { scan-assembler-times {vslide1up\.vx} 1 } } */
+/* { dg-final { scan-assembler-times {vrgather\.vv} 2 } } */
+/* { dg-final { scan-assembler-not {vmv1r\.v} } } */
+/* { dg-final { scan-assembler-not {vmv2r\.v} } } */
+/* { dg-final { scan-assembler-not {vmv4r\.v} } } */
+/* { dg-final { scan-assembler-not {vmv8r\.v} } } */
--
2.36.3