* [PATCH] RISC-V: Support highest overlap for wv instructions
@ 2023-12-09 4:06 Juzhe-Zhong
2023-12-11 7:54 ` Robin Dapp
0 siblings, 1 reply; 2+ messages in thread
From: Juzhe-Zhong @ 2023-12-09 4:06 UTC (permalink / raw)
To: gcc-patches; +Cc: kito.cheng, kito.cheng, jeffreyalaw, rdapp.gcc, Juzhe-Zhong
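According to the RVV ISA, a widening instruction's destination register group may overlap its narrower source operand when the overlap is in the highest-numbered part of the destination group; e.g. vwadd.wv v2, v2, v3 is legal because the narrow source v3 is the highest-numbered register of the destination group v2-v3.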
Before this patch:
nop
vsetivli zero,4,e8,m4,tu,ma
vle16.v v8,0(a0)
vmv8r.v v0,v8
vwsub.wv v0,v8,v12
nop
addi a4,a0,100
vle16.v v8,0(a4)
vmv8r.v v24,v8
vwsub.wv v24,v8,v12
nop
addi a4,a0,200
vle16.v v8,0(a4)
vmv8r.v v16,v8
vwsub.wv v16,v8,v12
nop
After this patch:
nop
vsetivli zero,4,e8,m4,tu,ma
vle16.v v0,0(a0)
vwsub.wv v0,v0,v4
nop
addi a4,a0,100
vle16.v v24,0(a4)
vwsub.wv v24,v24,v28
nop
addi a4,a0,200
vle16.v v16,0(a4)
vwsub.wv v16,v16,v20
PR target/112431
gcc/ChangeLog:
* config/riscv/vector.md: Support highest overlap for wv instructions.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/base/pr112431-39.c: New test.
* gcc.target/riscv/rvv/base/pr112431-40.c: New test.
* gcc.target/riscv/rvv/base/pr112431-41.c: New test.
---
gcc/config/riscv/vector.md | 88 +++++-----
.../gcc.target/riscv/rvv/base/pr112431-39.c | 158 ++++++++++++++++++
.../gcc.target/riscv/rvv/base/pr112431-40.c | 94 +++++++++++
.../gcc.target/riscv/rvv/base/pr112431-41.c | 62 +++++++
4 files changed, 360 insertions(+), 42 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-39.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-40.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-41.c
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index ba0714a9971..31c13a6dcca 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -3795,46 +3795,48 @@
(set_attr "group_overlap" "W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84,none,none")])
(define_insn "@pred_single_widen_sub<any_extend:su><mode>"
- [(set (match_operand:VWEXTI 0 "register_operand" "=&vr,&vr")
+ [(set (match_operand:VWEXTI 0 "register_operand" "=vd, vr, vd, vr, vd, vr, vd, vr, vd, vr, vd, vr, ?&vr, ?&vr")
(if_then_else:VWEXTI
(unspec:<VM>
- [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1,vmWc1")
- (match_operand 5 "vector_length_operand" " rK, rK")
- (match_operand 6 "const_int_operand" " i, i")
- (match_operand 7 "const_int_operand" " i, i")
- (match_operand 8 "const_int_operand" " i, i")
+ [(match_operand:<VM> 1 "vector_mask_operand" " vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1,vmWc1,vmWc1")
+ (match_operand 5 "vector_length_operand" " rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK")
+ (match_operand 6 "const_int_operand" " i, i, i, i, i, i, i, i, i, i, i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i, i, i, i, i, i, i, i, i, i, i")
+ (match_operand 8 "const_int_operand" " i, i, i, i, i, i, i, i, i, i, i, i, i, i")
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
(minus:VWEXTI
- (match_operand:VWEXTI 3 "register_operand" " vr, vr")
+ (match_operand:VWEXTI 3 "register_operand" " vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr")
(any_extend:VWEXTI
- (match_operand:<V_DOUBLE_TRUNC> 4 "register_operand" " vr, vr")))
- (match_operand:VWEXTI 2 "vector_merge_operand" " vu, 0")))]
+ (match_operand:<V_DOUBLE_TRUNC> 4 "register_operand" "W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84, vr, vr")))
+ (match_operand:VWEXTI 2 "vector_merge_operand" " vu, vu, 0, 0, vu, vu, 0, 0, vu, vu, 0, 0, vu, 0")))]
"TARGET_VECTOR"
"vwsub<any_extend:u>.wv\t%0,%3,%4%p1"
[(set_attr "type" "viwalu")
- (set_attr "mode" "<V_DOUBLE_TRUNC>")])
+ (set_attr "mode" "<V_DOUBLE_TRUNC>")
+ (set_attr "group_overlap" "W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84,none,none")])
(define_insn "@pred_single_widen_add<any_extend:su><mode>"
- [(set (match_operand:VWEXTI 0 "register_operand" "=&vr,&vr")
+ [(set (match_operand:VWEXTI 0 "register_operand" "=vd, vr, vd, vr, vd, vr, vd, vr, vd, vr, vd, vr, ?&vr, ?&vr")
(if_then_else:VWEXTI
(unspec:<VM>
- [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1,vmWc1")
- (match_operand 5 "vector_length_operand" " rK, rK")
- (match_operand 6 "const_int_operand" " i, i")
- (match_operand 7 "const_int_operand" " i, i")
- (match_operand 8 "const_int_operand" " i, i")
+ [(match_operand:<VM> 1 "vector_mask_operand" " vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1,vmWc1,vmWc1")
+ (match_operand 5 "vector_length_operand" " rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK")
+ (match_operand 6 "const_int_operand" " i, i, i, i, i, i, i, i, i, i, i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i, i, i, i, i, i, i, i, i, i, i")
+ (match_operand 8 "const_int_operand" " i, i, i, i, i, i, i, i, i, i, i, i, i, i")
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
(plus:VWEXTI
(any_extend:VWEXTI
- (match_operand:<V_DOUBLE_TRUNC> 4 "register_operand" " vr, vr"))
- (match_operand:VWEXTI 3 "register_operand" " vr, vr"))
- (match_operand:VWEXTI 2 "vector_merge_operand" " vu, 0")))]
+ (match_operand:<V_DOUBLE_TRUNC> 4 "register_operand" "W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84, vr, vr"))
+ (match_operand:VWEXTI 3 "register_operand" " vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr"))
+ (match_operand:VWEXTI 2 "vector_merge_operand" " vu, vu, 0, 0, vu, vu, 0, 0, vu, vu, 0, 0, vu, 0")))]
"TARGET_VECTOR"
"vwadd<any_extend:u>.wv\t%0,%3,%4%p1"
[(set_attr "type" "viwalu")
- (set_attr "mode" "<V_DOUBLE_TRUNC>")])
+ (set_attr "mode" "<V_DOUBLE_TRUNC>")
+ (set_attr "group_overlap" "W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84,none,none")])
(define_insn "@pred_single_widen_<plus_minus:optab><any_extend:su><mode>_scalar"
[(set (match_operand:VWEXTI 0 "register_operand" "=vr, vr")
@@ -7073,54 +7075,56 @@
(set_attr "group_overlap" "W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84,none,none")])
(define_insn "@pred_single_widen_add<mode>"
- [(set (match_operand:VWEXTF 0 "register_operand" "=&vr, &vr")
+ [(set (match_operand:VWEXTF 0 "register_operand" "=vd, vr, vd, vr, vd, vr, vd, vr, vd, vr, vd, vr, ?&vr, ?&vr")
(if_then_else:VWEXTF
(unspec:<VM>
- [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1,vmWc1")
- (match_operand 5 "vector_length_operand" " rK, rK")
- (match_operand 6 "const_int_operand" " i, i")
- (match_operand 7 "const_int_operand" " i, i")
- (match_operand 8 "const_int_operand" " i, i")
- (match_operand 9 "const_int_operand" " i, i")
+ [(match_operand:<VM> 1 "vector_mask_operand" " vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1,vmWc1,vmWc1")
+ (match_operand 5 "vector_length_operand" " rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK")
+ (match_operand 6 "const_int_operand" " i, i, i, i, i, i, i, i, i, i, i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i, i, i, i, i, i, i, i, i, i, i")
+ (match_operand 8 "const_int_operand" " i, i, i, i, i, i, i, i, i, i, i, i, i, i")
+ (match_operand 9 "const_int_operand" " i, i, i, i, i, i, i, i, i, i, i, i, i, i")
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)
(reg:SI FRM_REGNUM)] UNSPEC_VPREDICATE)
(plus:VWEXTF
(float_extend:VWEXTF
- (match_operand:<V_DOUBLE_TRUNC> 4 "register_operand" " vr, vr"))
- (match_operand:VWEXTF 3 "register_operand" " vr, vr"))
- (match_operand:VWEXTF 2 "vector_merge_operand" " vu, 0")))]
+ (match_operand:<V_DOUBLE_TRUNC> 4 "register_operand" "W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84, vr, vr"))
+ (match_operand:VWEXTF 3 "register_operand" " vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr"))
+ (match_operand:VWEXTF 2 "vector_merge_operand" " vu, vu, 0, 0, vu, vu, 0, 0, vu, vu, 0, 0, vu, 0")))]
"TARGET_VECTOR"
"vfwadd.wv\t%0,%3,%4%p1"
[(set_attr "type" "vfwalu")
(set_attr "mode" "<V_DOUBLE_TRUNC>")
(set (attr "frm_mode")
- (symbol_ref "riscv_vector::get_frm_mode (operands[9])"))])
+ (symbol_ref "riscv_vector::get_frm_mode (operands[9])"))
+ (set_attr "group_overlap" "W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84,none,none")])
(define_insn "@pred_single_widen_sub<mode>"
- [(set (match_operand:VWEXTF 0 "register_operand" "=&vr, &vr")
+ [(set (match_operand:VWEXTF 0 "register_operand" "=vd, vr, vd, vr, vd, vr, vd, vr, vd, vr, vd, vr, ?&vr, ?&vr")
(if_then_else:VWEXTF
(unspec:<VM>
- [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1,vmWc1")
- (match_operand 5 "vector_length_operand" " rK, rK")
- (match_operand 6 "const_int_operand" " i, i")
- (match_operand 7 "const_int_operand" " i, i")
- (match_operand 8 "const_int_operand" " i, i")
- (match_operand 9 "const_int_operand" " i, i")
+ [(match_operand:<VM> 1 "vector_mask_operand" " vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1,vmWc1,vmWc1")
+ (match_operand 5 "vector_length_operand" " rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK")
+ (match_operand 6 "const_int_operand" " i, i, i, i, i, i, i, i, i, i, i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i, i, i, i, i, i, i, i, i, i, i")
+ (match_operand 8 "const_int_operand" " i, i, i, i, i, i, i, i, i, i, i, i, i, i")
+ (match_operand 9 "const_int_operand" " i, i, i, i, i, i, i, i, i, i, i, i, i, i")
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)
(reg:SI FRM_REGNUM)] UNSPEC_VPREDICATE)
(minus:VWEXTF
- (match_operand:VWEXTF 3 "register_operand" " vr, vr")
+ (match_operand:VWEXTF 3 "register_operand" " vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr, vr")
(float_extend:VWEXTF
- (match_operand:<V_DOUBLE_TRUNC> 4 "register_operand" " vr, vr")))
- (match_operand:VWEXTF 2 "vector_merge_operand" " vu, 0")))]
+ (match_operand:<V_DOUBLE_TRUNC> 4 "register_operand" "W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84, vr, vr")))
+ (match_operand:VWEXTF 2 "vector_merge_operand" " vu, vu, 0, 0, vu, vu, 0, 0, vu, vu, 0, 0, vu, 0")))]
"TARGET_VECTOR"
"vfwsub.wv\t%0,%3,%4%p1"
[(set_attr "type" "vfwalu")
(set_attr "mode" "<V_DOUBLE_TRUNC>")
(set (attr "frm_mode")
- (symbol_ref "riscv_vector::get_frm_mode (operands[9])"))])
+ (symbol_ref "riscv_vector::get_frm_mode (operands[9])"))
+ (set_attr "group_overlap" "W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84,none,none")])
(define_insn "@pred_single_widen_<plus_minus:optab><mode>_scalar"
[(set (match_operand:VWEXTF 0 "register_operand" "=vr, vr")
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-39.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-39.c
new file mode 100644
index 00000000000..47820dd29f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-39.c
@@ -0,0 +1,158 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+void
+foo (void *in, void *out, int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v0 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v0 = __riscv_vwsub_wv_i16m2_tu (v0, v0, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v0, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v1 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v1 = __riscv_vwsub_wv_i16m2_tu (v1, v1, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v1, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v2 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v2 = __riscv_vwsub_wv_i16m2_tu (v2, v2, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v2, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v3 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v3 = __riscv_vwsub_wv_i16m2_tu (v3, v3, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v3, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v4 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v4 = __riscv_vwsub_wv_i16m2_tu (v4, v4, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v4, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v5 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v5 = __riscv_vwsub_wv_i16m2_tu (v5, v5, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v5, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v6 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v6 = __riscv_vwsub_wv_i16m2_tu (v6, v6, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v6, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v7 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v7 = __riscv_vwsub_wv_i16m2_tu (v7, v7, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v7, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v8 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v8 = __riscv_vwsub_wv_i16m2_tu (v8, v8, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v8, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v9 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v9 = __riscv_vwsub_wv_i16m2_tu (v9, v9, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v9, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v10 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v10 = __riscv_vwsub_wv_i16m2_tu (v10, v10, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v10, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v11 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v11 = __riscv_vwsub_wv_i16m2_tu (v11, v11, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v11, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v12 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v12 = __riscv_vwsub_wv_i16m2_tu (v12, v12, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v12, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v13 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v13 = __riscv_vwsub_wv_i16m2_tu (v13, v13, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v13, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v14 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v14 = __riscv_vwsub_wv_i16m2_tu (v14, v14, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v14, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint8m1_t v15_n = __riscv_vle8_v_i8m1 (in, 4);in+=100;
+ vint16m2_t v15 = __riscv_vwcvt_x_x_v_i16m2 (v15_n, 4);
+
+ asm volatile("nop" ::: "memory");
+ __riscv_vsse16_v_i16m2 (out, 4, v0, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v1, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v2, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v3, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v4, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v5, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v6, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v7, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v8, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v9, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v10, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v11, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v12, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v13, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v14, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v15, 4);out+=100;
+ }
+}
+
+void
+foo2 (void *in, void *out, int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v0 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v0 = __riscv_vwadd_wv_i16m2_tu (v0, v0, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v0, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v1 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v1 = __riscv_vwadd_wv_i16m2_tu (v1, v1, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v1, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v2 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v2 = __riscv_vwadd_wv_i16m2_tu (v2, v2, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v2, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v3 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v3 = __riscv_vwadd_wv_i16m2_tu (v3, v3, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v3, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v4 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v4 = __riscv_vwadd_wv_i16m2_tu (v4, v4, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v4, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v5 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v5 = __riscv_vwadd_wv_i16m2_tu (v5, v5, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v5, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v6 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v6 = __riscv_vwadd_wv_i16m2_tu (v6, v6, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v6, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v7 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v7 = __riscv_vwadd_wv_i16m2_tu (v7, v7, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v7, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v8 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v8 = __riscv_vwadd_wv_i16m2_tu (v8, v8, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v8, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v9 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v9 = __riscv_vwadd_wv_i16m2_tu (v9, v9, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v9, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v10 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v10 = __riscv_vwadd_wv_i16m2_tu (v10, v10, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v10, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v11 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v11 = __riscv_vwadd_wv_i16m2_tu (v11, v11, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v11, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v12 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v12 = __riscv_vwadd_wv_i16m2_tu (v12, v12, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v12, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v13 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v13 = __riscv_vwadd_wv_i16m2_tu (v13, v13, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v13, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m2_t v14 = __riscv_vle16_v_i16m2 (in, 4);in+=100;
+ v14 = __riscv_vwadd_wv_i16m2_tu (v14, v14, __riscv_vreinterpret_v_i16m1_i8m1 (__riscv_vget_v_i16m2_i16m1 (v14, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint8m1_t v15_n = __riscv_vle8_v_i8m1 (in, 4);in+=100;
+ vint16m2_t v15 = __riscv_vwcvt_x_x_v_i16m2 (v15_n, 4);
+
+ asm volatile("nop" ::: "memory");
+ __riscv_vsse16_v_i16m2 (out, 4, v0, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v1, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v2, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v3, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v4, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v5, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v6, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v7, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v8, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v9, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v10, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v11, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v12, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v13, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v14, 4);out+=100;
+ __riscv_vsse16_v_i16m2 (out, 4, v15, 4);out+=100;
+ }
+}
+
+/* { dg-final { scan-assembler-not {vmv1r} } } */
+/* { dg-final { scan-assembler-not {vmv2r} } } */
+/* { dg-final { scan-assembler-not {vmv4r} } } */
+/* { dg-final { scan-assembler-not {vmv8r} } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-40.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-40.c
new file mode 100644
index 00000000000..e44b8010579
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-40.c
@@ -0,0 +1,94 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+void
+foo (void *in, void *out, int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ asm volatile("nop" ::: "memory");
+ vint16m4_t v0 = __riscv_vle16_v_i16m4 (in, 4);in+=100;
+ v0 = __riscv_vwsub_wv_i16m4_tu (v0, v0, __riscv_vreinterpret_v_i16m2_i8m2 (__riscv_vget_v_i16m4_i16m2 (v0, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m4_t v1 = __riscv_vle16_v_i16m4 (in, 4);in+=100;
+ v1 = __riscv_vwsub_wv_i16m4_tu (v1, v1, __riscv_vreinterpret_v_i16m2_i8m2 (__riscv_vget_v_i16m4_i16m2 (v1, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m4_t v2 = __riscv_vle16_v_i16m4 (in, 4);in+=100;
+ v2 = __riscv_vwsub_wv_i16m4_tu (v2, v2, __riscv_vreinterpret_v_i16m2_i8m2 (__riscv_vget_v_i16m4_i16m2 (v2, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m4_t v3 = __riscv_vle16_v_i16m4 (in, 4);in+=100;
+ v3 = __riscv_vwsub_wv_i16m4_tu (v3, v3, __riscv_vreinterpret_v_i16m2_i8m2 (__riscv_vget_v_i16m4_i16m2 (v3, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m4_t v4 = __riscv_vle16_v_i16m4 (in, 4);in+=100;
+ v4 = __riscv_vwsub_wv_i16m4_tu (v4, v4, __riscv_vreinterpret_v_i16m2_i8m2 (__riscv_vget_v_i16m4_i16m2 (v4, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m4_t v5 = __riscv_vle16_v_i16m4 (in, 4);in+=100;
+ v5 = __riscv_vwsub_wv_i16m4_tu (v5, v5, __riscv_vreinterpret_v_i16m2_i8m2 (__riscv_vget_v_i16m4_i16m2 (v5, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m4_t v6 = __riscv_vle16_v_i16m4 (in, 4);in+=100;
+ v6 = __riscv_vwsub_wv_i16m4_tu (v6, v6, __riscv_vreinterpret_v_i16m2_i8m2 (__riscv_vget_v_i16m4_i16m2 (v6, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint8m2_t v7_n = __riscv_vle8_v_i8m2 (in, 4);in+=100;
+ vint16m4_t v7 = __riscv_vwcvt_x_x_v_i16m4 (v7_n, 4);
+
+ asm volatile("nop" ::: "memory");
+ __riscv_vsse16_v_i16m4 (out, 4, v0, 4);out+=100;
+ __riscv_vsse16_v_i16m4 (out, 4, v1, 4);out+=100;
+ __riscv_vsse16_v_i16m4 (out, 4, v2, 4);out+=100;
+ __riscv_vsse16_v_i16m4 (out, 4, v3, 4);out+=100;
+ __riscv_vsse16_v_i16m4 (out, 4, v4, 4);out+=100;
+ __riscv_vsse16_v_i16m4 (out, 4, v5, 4);out+=100;
+ __riscv_vsse16_v_i16m4 (out, 4, v6, 4);out+=100;
+ __riscv_vsse16_v_i16m4 (out, 4, v7, 4);out+=100;
+ }
+}
+
+void
+foo2 (void *in, void *out, int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ asm volatile("nop" ::: "memory");
+ vint16m4_t v0 = __riscv_vle16_v_i16m4 (in, 4);in+=100;
+ v0 = __riscv_vwadd_wv_i16m4_tu (v0, v0, __riscv_vreinterpret_v_i16m2_i8m2 (__riscv_vget_v_i16m4_i16m2 (v0, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m4_t v1 = __riscv_vle16_v_i16m4 (in, 4);in+=100;
+ v1 = __riscv_vwadd_wv_i16m4_tu (v1, v1, __riscv_vreinterpret_v_i16m2_i8m2 (__riscv_vget_v_i16m4_i16m2 (v1, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m4_t v2 = __riscv_vle16_v_i16m4 (in, 4);in+=100;
+ v2 = __riscv_vwadd_wv_i16m4_tu (v2, v2, __riscv_vreinterpret_v_i16m2_i8m2 (__riscv_vget_v_i16m4_i16m2 (v2, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m4_t v3 = __riscv_vle16_v_i16m4 (in, 4);in+=100;
+ v3 = __riscv_vwadd_wv_i16m4_tu (v3, v3, __riscv_vreinterpret_v_i16m2_i8m2 (__riscv_vget_v_i16m4_i16m2 (v3, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m4_t v4 = __riscv_vle16_v_i16m4 (in, 4);in+=100;
+ v4 = __riscv_vwadd_wv_i16m4_tu (v4, v4, __riscv_vreinterpret_v_i16m2_i8m2 (__riscv_vget_v_i16m4_i16m2 (v4, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m4_t v5 = __riscv_vle16_v_i16m4 (in, 4);in+=100;
+ v5 = __riscv_vwadd_wv_i16m4_tu (v5, v5, __riscv_vreinterpret_v_i16m2_i8m2 (__riscv_vget_v_i16m4_i16m2 (v5, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m4_t v6 = __riscv_vle16_v_i16m4 (in, 4);in+=100;
+ v6 = __riscv_vwadd_wv_i16m4_tu (v6, v6, __riscv_vreinterpret_v_i16m2_i8m2 (__riscv_vget_v_i16m4_i16m2 (v6, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint8m2_t v7_n = __riscv_vle8_v_i8m2 (in, 4);in+=100;
+ vint16m4_t v7 = __riscv_vwcvt_x_x_v_i16m4 (v7_n, 4);
+
+ asm volatile("nop" ::: "memory");
+ __riscv_vsse16_v_i16m4 (out, 4, v0, 4);out+=100;
+ __riscv_vsse16_v_i16m4 (out, 4, v1, 4);out+=100;
+ __riscv_vsse16_v_i16m4 (out, 4, v2, 4);out+=100;
+ __riscv_vsse16_v_i16m4 (out, 4, v3, 4);out+=100;
+ __riscv_vsse16_v_i16m4 (out, 4, v4, 4);out+=100;
+ __riscv_vsse16_v_i16m4 (out, 4, v5, 4);out+=100;
+ __riscv_vsse16_v_i16m4 (out, 4, v6, 4);out+=100;
+ __riscv_vsse16_v_i16m4 (out, 4, v7, 4);out+=100;
+ }
+}
+
+/* { dg-final { scan-assembler-not {vmv1r} } } */
+/* { dg-final { scan-assembler-not {vmv2r} } } */
+/* { dg-final { scan-assembler-not {vmv4r} } } */
+/* { dg-final { scan-assembler-not {vmv8r} } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-41.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-41.c
new file mode 100644
index 00000000000..dc27006f6f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-41.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+void
+foo (void *in, void *out, int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ asm volatile("nop" ::: "memory");
+ vint16m8_t v0 = __riscv_vle16_v_i16m8 (in, 4);in+=100;
+ v0 = __riscv_vwsub_wv_i16m8_tu (v0, v0, __riscv_vreinterpret_v_i16m4_i8m4 (__riscv_vget_v_i16m8_i16m4 (v0, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m8_t v1 = __riscv_vle16_v_i16m8 (in, 4);in+=100;
+ v1 = __riscv_vwsub_wv_i16m8_tu (v1, v1, __riscv_vreinterpret_v_i16m4_i8m4 (__riscv_vget_v_i16m8_i16m4 (v1, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m8_t v2 = __riscv_vle16_v_i16m8 (in, 4);in+=100;
+ v2 = __riscv_vwsub_wv_i16m8_tu (v2, v2, __riscv_vreinterpret_v_i16m4_i8m4 (__riscv_vget_v_i16m8_i16m4 (v2, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint8m4_t v3_n = __riscv_vle8_v_i8m4 (in, 4);in+=100;
+ vint16m8_t v3 = __riscv_vwcvt_x_x_v_i16m8 (v3_n, 4);
+
+ asm volatile("nop" ::: "memory");
+ __riscv_vsse16_v_i16m8 (out, 4, v0, 4);out+=100;
+ __riscv_vsse16_v_i16m8 (out, 4, v1, 4);out+=100;
+ __riscv_vsse16_v_i16m8 (out, 4, v2, 4);out+=100;
+ __riscv_vsse16_v_i16m8 (out, 4, v3, 4);out+=100;
+ }
+}
+
+void
+foo2 (void *in, void *out, int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ asm volatile("nop" ::: "memory");
+ vint16m8_t v0 = __riscv_vle16_v_i16m8 (in, 4);in+=100;
+ v0 = __riscv_vwadd_wv_i16m8_tu (v0, v0, __riscv_vreinterpret_v_i16m4_i8m4 (__riscv_vget_v_i16m8_i16m4 (v0, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m8_t v1 = __riscv_vle16_v_i16m8 (in, 4);in+=100;
+ v1 = __riscv_vwadd_wv_i16m8_tu (v1, v1, __riscv_vreinterpret_v_i16m4_i8m4 (__riscv_vget_v_i16m8_i16m4 (v1, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint16m8_t v2 = __riscv_vle16_v_i16m8 (in, 4);in+=100;
+ v2 = __riscv_vwadd_wv_i16m8_tu (v2, v2, __riscv_vreinterpret_v_i16m4_i8m4 (__riscv_vget_v_i16m8_i16m4 (v2, 1)), 4);
+ asm volatile("nop" ::: "memory");
+ vint8m4_t v3_n = __riscv_vle8_v_i8m4 (in, 4);in+=100;
+ vint16m8_t v3 = __riscv_vwcvt_x_x_v_i16m8 (v3_n, 4);
+
+ asm volatile("nop" ::: "memory");
+ __riscv_vsse16_v_i16m8 (out, 4, v0, 4);out+=100;
+ __riscv_vsse16_v_i16m8 (out, 4, v1, 4);out+=100;
+ __riscv_vsse16_v_i16m8 (out, 4, v2, 4);out+=100;
+ __riscv_vsse16_v_i16m8 (out, 4, v3, 4);out+=100;
+ }
+}
+
+/* { dg-final { scan-assembler-not {vmv1r} } } */
+/* { dg-final { scan-assembler-not {vmv2r} } } */
+/* { dg-final { scan-assembler-not {vmv4r} } } */
+/* { dg-final { scan-assembler-not {vmv8r} } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
--
2.36.3
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH] RISC-V: Support highest overlap for wv instructions
2023-12-09 4:06 [PATCH] RISC-V: Support highest overlap for wv instructions Juzhe-Zhong
@ 2023-12-11 7:54 ` Robin Dapp
0 siblings, 0 replies; 2+ messages in thread
From: Robin Dapp @ 2023-12-11 7:54 UTC (permalink / raw)
To: Juzhe-Zhong, gcc-patches; +Cc: rdapp.gcc, kito.cheng, kito.cheng, jeffreyalaw
LGTM, thanks.
Regards
Robin
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2023-12-11 7:54 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-09 4:06 [PATCH] RISC-V: Support highest overlap for wv instructions Juzhe-Zhong
2023-12-11 7:54 ` Robin Dapp
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).