public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c/108271] New: Missed RVV cost model
@ 2023-01-03 2:03 juzhe.zhong at rivai dot ai
2023-01-03 2:08 ` [Bug target/108271] " pinskia at gcc dot gnu.org
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-01-03 2:03 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108271
Bug ID: 108271
Summary: Missed RVV cost model
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: juzhe.zhong at rivai dot ai
Target Milestone: ---
#include "riscv_vector.h"
/* Reproducer for this report: a straight-line sequence of RVV intrinsic
   loads and stores that copies data from `in` to `out` at many different
   offsets and element widths.

   in, out  - restrict-qualified int buffers; all offsets below are pointer
              arithmetic on int *, i.e. counted in ints, before the cast.
   mask_in  - points to a vbool64_t that is read once and used as the mask
              operand of the masked intrinsics.
   n        - number of loop iterations; the loop body does not run when
              n <= 0.  */
void f3 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
{
/* One-off float32 copy at offset 10000, done before the loop with a
   vector-length argument of 19.  */
vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
__riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
/* The mask is loaded once here, outside the loop, and reused below.  */
vbool64_t mask = *(vbool64_t*)mask_in;
for (int i = 0; i < n; i++)
{
/* Unmasked 16-bit and 32-bit copies at offsets i + 1 and i + 2.  */
vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
__riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
__riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
/* Masked 32-bit load (_tumu variant) that takes v2 as an extra vector
   operand.  The result is stored through a plain vint32mf2_t pointer
   dereference instead of a store intrinsic.  */
vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in +
i + 200), 13);
*(vint32mf2_t*)(out + i + 200) = v3;
/* Three masked 64-bit loads (_m, _tum, _mu variants), each feeding the
   next one as an operand.  Only the final store is the masked (_m) form.  */
vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i +
300), 11);
__riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i
+ 500), 11);
__riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i +
600), 11);
__riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
/* 8-bit copies.  NOTE(review): v8, v9 and v10 are loaded but never used;
   all four stores below write v7.  Both assembly listings in this report
   reflect that (one vle8.v followed by four vse8.v), so the code is kept
   as reported rather than "corrected".  */
vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
__riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
vuint8mf4_t v8 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 800), 11);
__riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 800), v7, 11);
vuint8mf4_t v9 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 900), 11);
__riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 900), v7, 11);
vuint8mf4_t v10 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 1000), 11);
__riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 1000), v7, 11);
}
}
-O3 -S ASM:
f3:
li a5,40960
addi a5,a5,-960
addi sp,sp,-64
sd s4,24(sp)
add a4,a0,a5
add a5,a1,a5
vsetivli zero,19,e32,mf2,ta,ma
vle32.v v24,0(a4)
vse32.v v24,0(a5)
vsetvli s4,zero,e8,mf8,ta,ma
vlm.v v0,0(a2)
ble a3,zero,.L1
addi a3,a3,1
sd s3,32(sp)
slli a3,a3,2
li s3,4096
sd s2,40(sp)
sd s5,16(sp)
sd s6,8(sp)
addi t6,s3,-1700
addi t5,s3,-1300
addi s6,s3,-900
addi s5,s3,-500
sd s0,56(sp)
sd s1,48(sp)
addi a0,a0,4
addi a4,a1,4
add s2,a1,a3
addi s3,s3,-100
.L3:
vsetivli zero,19,e16,mf2,ta,ma
mv a5,a4
vle16.v v24,0(a0)
mv a3,a0
vse16.v v24,0(a4)
addi a0,a0,4
vsetivli zero,19,e32,mf2,ta,ma
addi a4,a4,4
vle32.v v24,0(a0)
addi s1,a3,796
vse32.v v24,0(a4)
vsetivli zero,13,e32,mf2,tu,mu
addi s0,a5,796
vle32.v v24,0(s1),v0.t
addi a1,a3,1196
addi t4,a5,1196
addi t2,a3,1996
addi t3,a5,1996
add t0,a3,t6
vsetvli s4,zero,e32,mf2,ta,ma
add t1,a5,t6
vse32.v v24,0(s0)
add a7,a5,t5
vsetivli zero,11,e64,m1,tu,mu
add a6,a5,s6
vle64.v v24,0(a1),v0.t
add a2,a5,s5
vse64.v v24,0(t4)
add a3,a3,t5
vle64.v v24,0(t2),v0.t
add a5,a5,s3
vse64.v v24,0(t3)
vle64.v v24,0(t0),v0.t
vse64.v v24,0(t1),v0.t
vsetivli zero,11,e8,mf4,ta,ma
vle8.v v24,0(a3)
vse8.v v24,0(a7)
vse8.v v24,0(a6)
vse8.v v24,0(a2)
vse8.v v24,0(a5)
bne s2,a4,.L3
ld s0,56(sp)
ld s1,48(sp)
ld s2,40(sp)
ld s3,32(sp)
ld s5,16(sp)
ld s6,8(sp)
.L1:
ld s4,24(sp)
addi sp,sp,64
jr ra
GCC allocates redundant stack space and generates a lot of redundant ld and sd
instructions.
However, if we use -O3 -fno-schedule-insns ASM:
f3:
li a5,40960
addi a5,a5,-960
add a4,a0,a5
add a5,a1,a5
vsetivli zero,19,e32,mf2,ta,ma
vle32.v v24,0(a4)
vse32.v v24,0(a5)
vsetvli t3,zero,e8,mf8,ta,ma
vlm.v v0,0(a2)
ble a3,zero,.L1
addi a3,a3,1
li t1,4096
slli a3,a3,2
addi a4,a1,4
addi a7,t1,-1700
addi a6,t1,-1300
addi t5,t1,-900
addi t4,t1,-500
addi a2,a0,4
add a1,a1,a3
addi t1,t1,-100
.L3:
mv a3,a2
vsetivli zero,19,e16,mf2,ta,ma
mv a5,a4
vle16.v v24,0(a2)
addi a0,a3,796
vse16.v v24,0(a4)
addi a2,a2,4
vsetivli zero,19,e32,mf2,ta,ma
addi a4,a4,4
vle32.v v24,0(a2)
vse32.v v24,0(a4)
vsetivli zero,13,e32,mf2,tu,mu
vle32.v v24,0(a0),v0.t
addi a0,a5,796
vsetvli t3,zero,e32,mf2,ta,ma
vse32.v v24,0(a0)
addi a0,a3,1196
vsetivli zero,11,e64,m1,tu,mu
vle64.v v24,0(a0),v0.t
addi a0,a5,1196
vse64.v v24,0(a0)
addi a0,a3,1996
vle64.v v24,0(a0),v0.t
addi a0,a5,1996
vse64.v v24,0(a0)
add a0,a3,a7
vle64.v v24,0(a0),v0.t
add a3,a3,a6
add a0,a5,a7
vse64.v v24,0(a0),v0.t
vsetivli zero,11,e8,mf4,ta,ma
vle8.v v24,0(a3)
add a3,a5,a6
vse8.v v24,0(a3)
add a3,a5,t5
vse8.v v24,0(a3)
add a3,a5,t4
add a5,a5,t1
vse8.v v24,0(a3)
vse8.v v24,0(a5)
bne a1,a4,.L3
.L1:
ret
This issue is gone. We should correctly adjust the RVV instruction COST model
so that the codegen with -fno-schedule-insns and without -fno-schedule-insns
is the same.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug target/108271] Missed RVV cost model
2023-01-03 2:03 [Bug c/108271] New: Missed RVV cost model juzhe.zhong at rivai dot ai
@ 2023-01-03 2:08 ` pinskia at gcc dot gnu.org
2023-01-03 2:09 ` juzhe.zhong at rivai dot ai
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: pinskia at gcc dot gnu.org @ 2023-01-03 2:08 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108271
--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
More likely there should be a generic scheduling model that handles RVV too
instead of a cost issue ...
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug target/108271] Missed RVV cost model
2023-01-03 2:03 [Bug c/108271] New: Missed RVV cost model juzhe.zhong at rivai dot ai
2023-01-03 2:08 ` [Bug target/108271] " pinskia at gcc dot gnu.org
@ 2023-01-03 2:09 ` juzhe.zhong at rivai dot ai
2023-08-25 7:43 ` rdapp at gcc dot gnu.org
2023-12-21 8:25 ` juzhe.zhong at rivai dot ai
3 siblings, 0 replies; 5+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-01-03 2:09 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108271
--- Comment #2 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
(In reply to Andrew Pinski from comment #1)
> More likely there should be a generic scheduling model that handles RVV too
> instead of a cost issue ...
OK. This issue should be fixed after I support all RVV intrinsics.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug target/108271] Missed RVV cost model
2023-01-03 2:03 [Bug c/108271] New: Missed RVV cost model juzhe.zhong at rivai dot ai
2023-01-03 2:08 ` [Bug target/108271] " pinskia at gcc dot gnu.org
2023-01-03 2:09 ` juzhe.zhong at rivai dot ai
@ 2023-08-25 7:43 ` rdapp at gcc dot gnu.org
2023-12-21 8:25 ` juzhe.zhong at rivai dot ai
3 siblings, 0 replies; 5+ messages in thread
From: rdapp at gcc dot gnu.org @ 2023-08-25 7:43 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108271
Robin Dapp <rdapp at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |rdapp at gcc dot gnu.org
--- Comment #3 from Robin Dapp <rdapp at gcc dot gnu.org> ---
This is basically the same problem as PR108412. As long as loads/stores have a
high(ish) latency and we mostly do load/store, they will tend to lump together
at the end of the function. Setting vector load/store to a latency of <= 2
helps here and we might want to do this in order to avoid excessive spilling.
I had to deal with this before, e.g. in SPEC2006's calculix.
In the end insn scheduling wouldn't buy us anything and rather caused more
spilling, causing performance degradation.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug target/108271] Missed RVV cost model
2023-01-03 2:03 [Bug c/108271] New: Missed RVV cost model juzhe.zhong at rivai dot ai
` (2 preceding siblings ...)
2023-08-25 7:43 ` rdapp at gcc dot gnu.org
@ 2023-12-21 8:25 ` juzhe.zhong at rivai dot ai
3 siblings, 0 replies; 5+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-12-21 8:25 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108271
JuzheZhong <juzhe.zhong at rivai dot ai> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |RESOLVED
Resolution|--- |FIXED
--- Comment #4 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
This issue is fixed when we use -mtune=sifive-u74 so it won't be a problem.
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2023-12-21 8:25 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-03 2:03 [Bug c/108271] New: Missed RVV cost model juzhe.zhong at rivai dot ai
2023-01-03 2:08 ` [Bug target/108271] " pinskia at gcc dot gnu.org
2023-01-03 2:09 ` juzhe.zhong at rivai dot ai
2023-08-25 7:43 ` rdapp at gcc dot gnu.org
2023-12-21 8:25 ` juzhe.zhong at rivai dot ai
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).