public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c/112401] New: RISC-V: So many redundant move instructions due to subreg handling on vector mode
@ 2023-11-06 3:52 juzhe.zhong at rivai dot ai
2023-11-06 3:53 ` [Bug c/112401] " juzhe.zhong at rivai dot ai
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-11-06 3:52 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112401
Bug ID: 112401
Summary: RISC-V: So many redundant move instructions due to
subreg handling on vector mode
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: juzhe.zhong at rivai dot ai
Target Milestone: ---
Consider the following case:
https://godbolt.org/z/8nc6r4joc
Compared with LLVM, we have many redundant "vmv1r" move instructions:
#include <riscv_vector.h>
/* Reproducer for the redundant "vmv1r" moves described in this report.

   Loads one vint32m8_t value from `in`, extracts its eight vint32m1_t
   parts (indices 0..7), adds each part to itself `m` times, and stores
   part k to `out + 4*k`.

   NOTE(review): the trailing 32 and 4 arguments of the load/add intrinsics
   are presumably the vl operands (the reported assembly sets vl to 32 and
   4 respectively) -- confirm against the intrinsics specification.  */
void
subreg_to_reg_1 (int32_t *in, int32_t *out, size_t m)
{
/* One m8 value; every vN below is a part of this same value.  */
vint32m8_t result = __riscv_vle32_v_i32m8 (in, 32);
/* Extract parts 0..7 of `result` into separate m1 variables.  */
vint32m1_t v0 = __riscv_vget_v_i32m8_i32m1 (result, 0);
vint32m1_t v1 = __riscv_vget_v_i32m8_i32m1 (result, 1);
vint32m1_t v2 = __riscv_vget_v_i32m8_i32m1 (result, 2);
vint32m1_t v3 = __riscv_vget_v_i32m8_i32m1 (result, 3);
vint32m1_t v4 = __riscv_vget_v_i32m8_i32m1 (result, 4);
vint32m1_t v5 = __riscv_vget_v_i32m8_i32m1 (result, 5);
vint32m1_t v6 = __riscv_vget_v_i32m8_i32m1 (result, 6);
vint32m1_t v7 = __riscv_vget_v_i32m8_i32m1 (result, 7);
/* `m` only controls the trip count; each iteration replaces vN with
   vN + vN for all eight parts.  */
for (size_t i = 0; i < m; i++)
{
v0 = __riscv_vadd_vv_i32m1(v0, v0, 4);
v1 = __riscv_vadd_vv_i32m1(v1, v1, 4);
v2 = __riscv_vadd_vv_i32m1(v2, v2, 4);
v3 = __riscv_vadd_vv_i32m1(v3, v3, 4);
v4 = __riscv_vadd_vv_i32m1(v4, v4, 4);
v5 = __riscv_vadd_vv_i32m1(v5, v5, 4);
v6 = __riscv_vadd_vv_i32m1(v6, v6, 4);
v7 = __riscv_vadd_vv_i32m1(v7, v7, 4);
}
/* Store part k at an offset of 4*k int32_t elements from `out`.  */
*(vint32m1_t*)(out+4*0) = v0;
*(vint32m1_t*)(out+4*1) = v1;
*(vint32m1_t*)(out+4*2) = v2;
*(vint32m1_t*)(out+4*3) = v3;
*(vint32m1_t*)(out+4*4) = v4;
*(vint32m1_t*)(out+4*5) = v5;
*(vint32m1_t*)(out+4*6) = v6;
*(vint32m1_t*)(out+4*7) = v7;
}
This issue happens not only on RISC-V but also on all other targets.
Lehua will send a patch to support subreg liveness tracking in GCC soon.
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug c/112401] RISC-V: So many redundant move instructions due to subreg handling on vector mode
2023-11-06 3:52 [Bug c/112401] New: RISC-V: So many redundant move instructions due to subreg handling on vector mode juzhe.zhong at rivai dot ai
@ 2023-11-06 3:53 ` juzhe.zhong at rivai dot ai
2024-01-18 1:27 ` [Bug rtl-optimization/112401] " juzhe.zhong at rivai dot ai
2024-01-18 1:28 ` juzhe.zhong at rivai dot ai
2 siblings, 0 replies; 4+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-11-06 3:53 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112401
--- Comment #1 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
GCC ASM:
subreg_to_reg_1:
li a5,32
vsetvli zero,a5,e32,m8,ta,ma
vle32.v v16,0(a0)
vmv1r.v v8,v16
vmv1r.v v7,v17
vmv1r.v v6,v18
vmv1r.v v5,v19
vmv1r.v v4,v20
vmv1r.v v3,v21
vmv1r.v v2,v22
vmv1r.v v1,v23
beq a2,zero,.L2
li a5,0
vsetivli zero,4,e32,m1,ta,ma
.L3:
addi a5,a5,1
vadd.vv v8,v8,v8
vadd.vv v7,v7,v7
vadd.vv v6,v6,v6
vadd.vv v5,v5,v5
vadd.vv v4,v4,v4
vadd.vv v3,v3,v3
vadd.vv v2,v2,v2
vadd.vv v1,v1,v1
bne a2,a5,.L3
.L2:
vs1r.v v8,0(a1)
addi a5,a1,16
vs1r.v v7,0(a5)
addi a5,a1,32
vs1r.v v6,0(a5)
addi a5,a1,48
vs1r.v v5,0(a5)
addi a5,a1,64
vs1r.v v4,0(a5)
addi a5,a1,80
vs1r.v v3,0(a5)
addi a5,a1,96
vs1r.v v2,0(a5)
addi a1,a1,112
vs1r.v v1,0(a1)
ret
LLVM ASM:
subreg_to_reg_1: # @subreg_to_reg_1
li a3, 32
vsetvli zero, a3, e32, m8, ta, ma
vle32.v v8, (a0)
addi a0, a1, 16
beqz a2, .LBB0_2
.LBB0_1: # =>This Inner Loop Header: Depth=1
vsetivli zero, 4, e32, m1, ta, ma
vadd.vv v8, v8, v8
vadd.vv v9, v9, v9
vadd.vv v10, v10, v10
vadd.vv v11, v11, v11
vadd.vv v12, v12, v12
vadd.vv v13, v13, v13
vadd.vv v14, v14, v14
addi a2, a2, -1
vadd.vv v15, v15, v15
bnez a2, .LBB0_1
.LBB0_2:
vs1r.v v8, (a1)
vs1r.v v9, (a0)
addi a1, a0, 16
vs1r.v v10, (a1)
addi a1, a0, 32
vs1r.v v11, (a1)
addi a1, a0, 48
vs1r.v v12, (a1)
addi a1, a0, 64
vs1r.v v13, (a1)
addi a1, a0, 80
vs1r.v v14, (a1)
addi a0, a0, 96
vs1r.v v15, (a0)
ret
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug rtl-optimization/112401] RISC-V: So many redundant move instructions due to subreg handling on vector mode
2023-11-06 3:52 [Bug c/112401] New: RISC-V: So many redundant move instructions due to subreg handling on vector mode juzhe.zhong at rivai dot ai
2023-11-06 3:53 ` [Bug c/112401] " juzhe.zhong at rivai dot ai
@ 2024-01-18 1:27 ` juzhe.zhong at rivai dot ai
2024-01-18 1:28 ` juzhe.zhong at rivai dot ai
2 siblings, 0 replies; 4+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-18 1:27 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112401
--- Comment #2 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
Adding another test case:
/* Second reproducer: a segmented load followed by an immediate repack.

   Reads a vfloat32m1x4_t tuple from `src` with a 4-field segment load,
   extracts fields 0..3, recombines them in the same order into one
   vfloat32m4_t, and stores that value to `dst`.

   NOTE(review): the trailing 4 and 16 arguments are presumably the vl
   operands of the load and store (the reported assembly sets vl to 4 and
   16) -- confirm against the intrinsics specification.  */
void matrix_4x4_transpose_segmented_load(float* dst, float* src)
{
vfloat32m1x4_t data = __riscv_vlseg4e32_v_f32m1x4(src, 4);
/* Pull the four fields out of the tuple...  */
vfloat32m1_t data0 = __riscv_vget_v_f32m1x4_f32m1(data, 0);
vfloat32m1_t data1 = __riscv_vget_v_f32m1x4_f32m1(data, 1);
vfloat32m1_t data2 = __riscv_vget_v_f32m1x4_f32m1(data, 2);
vfloat32m1_t data3 = __riscv_vget_v_f32m1x4_f32m1(data, 3);
/* ...and pack them, unchanged and in order, into a single m4 value.  */
vfloat32m4_t packedData = __riscv_vcreate_v_f32m1_f32m4(data0,
data1,
data2,
data3);
__riscv_vse32_v_f32m4(dst, packedData, 16);
}
matrix_4x4_transpose_segmented_load:
vsetivli zero,4,e32,m1,ta,ma
vlseg4e32.v v8,(a1)
vsetivli zero,16,e32,m4,ta,ma
vmv1r.v v4,v8
vmv1r.v v5,v9
vmv1r.v v6,v10
vmv1r.v v7,v11
vse32.v v4,0(a0)
ret
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug rtl-optimization/112401] RISC-V: So many redundant move instructions due to subreg handling on vector mode
2023-11-06 3:52 [Bug c/112401] New: RISC-V: So many redundant move instructions due to subreg handling on vector mode juzhe.zhong at rivai dot ai
2023-11-06 3:53 ` [Bug c/112401] " juzhe.zhong at rivai dot ai
2024-01-18 1:27 ` [Bug rtl-optimization/112401] " juzhe.zhong at rivai dot ai
@ 2024-01-18 1:28 ` juzhe.zhong at rivai dot ai
2 siblings, 0 replies; 4+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-18 1:28 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112401
--- Comment #3 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
/* Third reproducer: builds a vfloat32m4_t result from the four m1 parts of
   `src` using masked and unmasked slideup/slidedown intrinsics.

   Steps, as written below:
     1. split `src` into inMat0..inMat3;
     2. build two mask values from the constants 0xaaaa and 0x5555;
     3. combine neighbouring parts with masked slides by 1 (transMat0..3);
     4. combine those results with unmasked slides by 2 (outMat0..3);
     5. pack outMat0..3 into the returned vfloat32m4_t.  */
vfloat32m4_t matrix_4x4_transpose_vslide(vfloat32m4_t src) {
vfloat32m1_t inMat0 = __riscv_vget_v_f32m4_f32m1(src, 0);
vfloat32m1_t inMat1 = __riscv_vget_v_f32m4_f32m1(src, 1);
vfloat32m1_t inMat2 = __riscv_vget_v_f32m4_f32m1(src, 2);
vfloat32m1_t inMat3 = __riscv_vget_v_f32m4_f32m1(src, 3);
// Mask sources; each is reinterpreted as a vbool32_t before use.
vuint32m1_t oddMask_u32 = __riscv_vmv_v_x_u32m1(0xaaaa, 1);
vuint32m1_t evenMask_u32 = __riscv_vmv_v_x_u32m1(0x5555, 1);
vbool32_t oddMask = __riscv_vreinterpret_v_u32m1_b32(oddMask_u32);
// vl=4 in the following
// should be mapped to vslideup.vi
vfloat32m1_t transMat0 = __riscv_vslideup_vx_f32m1_tumu(oddMask,
inMat0,
inMat1,
1, 4);
vfloat32m1_t transMat2 = __riscv_vslideup_vx_f32m1_tumu(oddMask,
inMat2,
inMat3,
1, 4);
vbool32_t evenMask = __riscv_vreinterpret_v_u32m1_b32(evenMask_u32);
// should be mapped to vslidedown.vi
vfloat32m1_t transMat1 = __riscv_vslidedown_vx_f32m1_tumu(evenMask,
inMat1,
inMat0,
1, 4);
vfloat32m1_t transMat3 = __riscv_vslidedown_vx_f32m1_tumu(evenMask,
inMat3,
inMat2,
1, 4);
// should be mapped to vslideup.vi
vfloat32m1_t outMat0 = __riscv_vslideup_vx_f32m1_tu(transMat0,
transMat2,
2, 4);
vfloat32m1_t outMat1 = __riscv_vslideup_vx_f32m1_tu(transMat1,
transMat3,
2, 4);
// vl=2 in the following
// should be mapped to vslidedown.vi
vfloat32m1_t outMat2 = __riscv_vslidedown_vx_f32m1_tu(transMat2,
transMat0,
2, 2);
vfloat32m1_t outMat3 = __riscv_vslidedown_vx_f32m1_tu(transMat3,
transMat1,
2, 2);
// Pack the four m1 results, in order, into the m4 return value.
return __riscv_vcreate_v_f32m1_f32m4(outMat0,
outMat1,
outMat2,
outMat3);
}
matrix_4x4_transpose_vslide:
li a4,45056
addiw a4,a4,-1366
vsetivli zero,1,e32,m1,ta,ma
li a5,20480
vmv.v.x v0,a4
vsetivli zero,4,e32,m1,tu,mu
vl4re32.v v4,0(a1)
addiw a5,a5,1365
vmv1r.v v12,v4
vmv1r.v v3,v6
vslideup.vi v12,v5,1,v0.t
vslideup.vi v3,v7,1,v0.t
vsetivli zero,1,e32,m1,ta,ma
vmv1r.v v1,v12
vmv.v.x v0,a5
vsetivli zero,4,e32,m1,tu,mu
vslideup.vi v1,v3,2
vmv1r.v v2,v5
vmv1r.v v8,v1
vslidedown.vi v2,v4,1,v0.t
vmv1r.v v1,v7
vmv1r.v v4,v2
vslidedown.vi v1,v6,1,v0.t
vslideup.vi v4,v1,2
vsetivli zero,2,e32,m1,tu,ma
vmv1r.v v9,v4
vslidedown.vi v3,v12,2
vslidedown.vi v1,v2,2
vmv1r.v v10,v3
vmv1r.v v11,v1
vs4r.v v8,0(a0)
ret
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2024-01-18 1:28 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-06 3:52 [Bug c/112401] New: RISC-V: So many redundant move instructions due to subreg handling on vector mode juzhe.zhong at rivai dot ai
2023-11-06 3:53 ` [Bug c/112401] " juzhe.zhong at rivai dot ai
2024-01-18 1:27 ` [Bug rtl-optimization/112401] " juzhe.zhong at rivai dot ai
2024-01-18 1:28 ` juzhe.zhong at rivai dot ai
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).