public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c/110559] New: Bad mask_load/mask_store codegen of RVV
@ 2023-07-05 11:30 juzhe.zhong at rivai dot ai
2023-07-07 14:50 ` [Bug target/110559] " rdapp at gcc dot gnu.org
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-07-05 11:30 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110559
Bug ID: 110559
Summary: Bad mask_load/mask_store codegen of RVV
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: juzhe.zhong at rivai dot ai
Target Milestone: ---
#include <stdint-gcc.h>
#define INDEX8 uint8_t
#define INDEX16 uint16_t
#define INDEX32 uint32_t
#define INDEX64 uint64_t
#define TEST_LOOP(DATA_TYPE, BITS)
\
void __attribute__ ((noinline, noclone))
\
f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src,
\
INDEX##BITS *restrict indices, INDEX##BITS *restrict cond)
\
{
\
for (int i = 0; i < 128; ++i)
\
if (cond[i])
\
dest[i] += src[i]; \
}
#define TEST_ALL(T)
\
T (int8_t, 8)
\
T (uint8_t, 8)
\
T (int16_t, 16)
\
T (uint16_t, 16)
\
T (_Float16, 16)
\
T (int32_t, 32)
\
T (uint32_t, 32)
\
T (float, 32)
\
T (int64_t, 64)
\
T (uint64_t, 64)
\
T (double, 64)
TEST_ALL (TEST_LOOP)
riscv32: --param riscv-autovec-preference=fixed-vlmax -O3:
f_int8_t:
addi sp,sp,-48
sw s0,44(sp)
sw s1,40(sp)
sw s2,36(sp)
sw s3,32(sp)
sw s4,28(sp)
sw s5,24(sp)
sw s6,20(sp)
sw s7,16(sp)
sw s8,12(sp)
sw s9,8(sp)
vsetivli zero,16,e8,m1,ta,ma
addi s9,a3,16
vmv.v.i v1,0
vl1re8.v v8,0(a3)
vmsne.vv v8,v8,v1
vmv1r.v v0,v8
vl1re8.v v7,0(s9)
vle8.v v9,0(a0),v0.t
vmsne.vv v7,v7,v1
vle8.v v15,0(a1),v0.t
addi s8,a1,16
vmv1r.v v0,v7
addi s7,a3,32
vle8.v v14,0(s8),v0.t
vl1re8.v v6,0(s7)
addi s5,a1,32
vmsne.vv v6,v6,v1
addi s6,a3,48
vmv1r.v v0,v6
vl1re8.v v5,0(s6)
vle8.v v13,0(s5),v0.t
vmsne.vv v5,v5,v1
addi s4,a1,48
vmv1r.v v0,v5
addi s3,a3,64
vle8.v v12,0(s4),v0.t
vl1re8.v v4,0(s3)
addi s1,a1,64
vmsne.vv v4,v4,v1
addi s2,a3,80
vmv1r.v v0,v4
addi s0,a1,80
addi t2,a3,96
vle8.v v11,0(s1),v0.t
vl1re8.v v2,0(s2)
vl1re8.v v3,0(t2)
vmsne.vv v2,v2,v1
vmsne.vv v3,v3,v1
vmv1r.v v0,v2
vadd.vv v9,v9,v15
vle8.v v10,0(s0),v0.t
vsetvli a5,zero,e8,m1,ta,ma
vmv1r.v v0,v8
addi t4,a0,16
vse8.v v9,0(a0),v0.t
vsetivli zero,16,e8,m1,ta,ma
vmv1r.v v0,v7
vle8.v v8,0(t4),v0.t
vadd.vv v8,v8,v14
vsetvli a5,zero,e8,m1,ta,ma
addi t3,a0,32
vse8.v v8,0(t4),v0.t
vsetivli zero,16,e8,m1,ta,ma
vmv1r.v v0,v6
vle8.v v7,0(t3),v0.t
vadd.vv v7,v7,v13
vsetvli a5,zero,e8,m1,ta,ma
addi t1,a0,48
vse8.v v7,0(t3),v0.t
vsetivli zero,16,e8,m1,ta,ma
vmv1r.v v0,v5
vle8.v v6,0(t1),v0.t
vadd.vv v6,v6,v12
vsetvli a5,zero,e8,m1,ta,ma
addi a7,a0,64
vse8.v v6,0(t1),v0.t
vsetivli zero,16,e8,m1,ta,ma
vmv1r.v v0,v4
vle8.v v5,0(a7),v0.t
vadd.vv v5,v5,v11
vsetvli a5,zero,e8,m1,ta,ma
addi a6,a0,80
vse8.v v5,0(a7),v0.t
vsetivli zero,16,e8,m1,ta,ma
vmv1r.v v0,v2
vle8.v v4,0(a6),v0.t
vadd.vv v4,v4,v10
vsetvli a5,zero,e8,m1,ta,ma
addi a2,a0,96
vse8.v v4,0(a6),v0.t
addi t0,a1,96
vsetivli zero,16,e8,m1,ta,ma
addi t6,a3,112
vmv1r.v v0,v3
vl1re8.v v2,0(t6)
vle8.v v4,0(a2),v0.t
vle8.v v5,0(t0),v0.t
vmsne.vv v1,v2,v1
vadd.vv v4,v4,v5
vsetvli a5,zero,e8,m1,ta,ma
addi a4,a0,112
vse8.v v4,0(a2),v0.t
addi t5,a1,112
vsetivli zero,16,e8,m1,ta,ma
vmv1r.v v0,v1
vle8.v v2,0(a4),v0.t
vle8.v v3,0(t5),v0.t
vadd.vv v2,v2,v3
vsetvli a5,zero,e8,m1,ta,ma
vse8.v v2,0(a4),v0.t
lw s0,44(sp)
lw s1,40(sp)
lw s2,36(sp)
lw s3,32(sp)
lw s4,28(sp)
lw s5,24(sp)
lw s6,20(sp)
lw s7,16(sp)
lw s8,12(sp)
lw s9,8(sp)
addi sp,sp,48
jr ra
This codegen is very ugly and bad, with too many "lw", "sw" and "vmv1r.v" instructions.
But with -fno-schedule-insns, codegen becomes much more reasonable:
f_uint8_t:
vsetivli zero,16,e8,m1,ta,ma
addi a2,a3,16
vmv.v.i v1,0
vl1re8.v v0,0(a3)
vmsne.vv v0,v0,v1
vle8.v v2,0(a0),v0.t
vle8.v v3,0(a1),v0.t
vadd.vv v2,v2,v3
vsetvli a5,zero,e8,m1,ta,ma
addi a4,a0,16
vse8.v v2,0(a0),v0.t
vsetivli zero,16,e8,m1,ta,ma
vl1re8.v v0,0(a2)
addi a2,a1,16
vmsne.vv v0,v0,v1
vle8.v v3,0(a4),v0.t
vle8.v v2,0(a2),v0.t
addi a2,a3,32
vadd.vv v2,v2,v3
vsetvli a5,zero,e8,m1,ta,ma
vse8.v v2,0(a4),v0.t
vsetivli zero,16,e8,m1,ta,ma
addi a4,a0,32
vl1re8.v v0,0(a2)
addi a2,a1,32
vmsne.vv v0,v0,v1
vle8.v v2,0(a4),v0.t
vle8.v v3,0(a2),v0.t
addi a2,a3,48
vadd.vv v2,v2,v3
vsetvli a5,zero,e8,m1,ta,ma
vse8.v v2,0(a4),v0.t
vsetivli zero,16,e8,m1,ta,ma
addi a4,a0,48
vl1re8.v v0,0(a2)
addi a2,a1,48
vmsne.vv v0,v0,v1
vle8.v v2,0(a4),v0.t
vle8.v v3,0(a2),v0.t
addi a2,a3,64
vadd.vv v2,v2,v3
vsetvli a5,zero,e8,m1,ta,ma
vse8.v v2,0(a4),v0.t
vsetivli zero,16,e8,m1,ta,ma
addi a4,a0,64
vl1re8.v v0,0(a2)
addi a2,a1,64
vmsne.vv v0,v0,v1
vle8.v v2,0(a4),v0.t
vle8.v v3,0(a2),v0.t
addi a2,a3,80
vadd.vv v2,v2,v3
vsetvli a5,zero,e8,m1,ta,ma
vse8.v v2,0(a4),v0.t
vsetivli zero,16,e8,m1,ta,ma
addi a4,a0,80
vl1re8.v v0,0(a2)
addi a2,a1,80
vmsne.vv v0,v0,v1
vle8.v v2,0(a4),v0.t
vle8.v v3,0(a2),v0.t
addi a2,a3,96
vadd.vv v2,v2,v3
vsetvli a5,zero,e8,m1,ta,ma
vse8.v v2,0(a4),v0.t
vsetivli zero,16,e8,m1,ta,ma
addi a4,a0,96
vl1re8.v v0,0(a2)
addi a2,a1,96
vmsne.vv v0,v0,v1
vle8.v v2,0(a4),v0.t
vle8.v v3,0(a2),v0.t
vadd.vv v2,v2,v3
vsetvli a5,zero,e8,m1,ta,ma
addi a0,a0,112
vse8.v v2,0(a4),v0.t
addi a3,a3,112
vsetivli zero,16,e8,m1,ta,ma
addi a1,a1,112
vl1re8.v v0,0(a3)
vmsne.vv v0,v0,v1
vle8.v v1,0(a0),v0.t
vle8.v v2,0(a1),v0.t
vadd.vv v1,v1,v2
vsetvli a5,zero,e8,m1,ta,ma
vse8.v v1,0(a0),v0.t
ret
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug target/110559] Bad mask_load/mask_store codegen of RVV
2023-07-05 11:30 [Bug c/110559] New: Bad mask_load/mask_store codegen of RVV juzhe.zhong at rivai dot ai
@ 2023-07-07 14:50 ` rdapp at gcc dot gnu.org
2023-07-07 14:55 ` law at gcc dot gnu.org
2023-08-25 13:30 ` rdapp at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: rdapp at gcc dot gnu.org @ 2023-07-07 14:50 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110559
--- Comment #1 from Robin Dapp <rdapp at gcc dot gnu.org> ---
This can be improved in parts by enabling register-pressure aware scheduling.
The rest is due to the default issue rate of 1. Setting proper instruction
latency will then obviously cause a bit more reordering but my tests haven't
shown a lot of additional spilling.
I'm going to set the scheduler options in a patch next week.
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug target/110559] Bad mask_load/mask_store codegen of RVV
2023-07-05 11:30 [Bug c/110559] New: Bad mask_load/mask_store codegen of RVV juzhe.zhong at rivai dot ai
2023-07-07 14:50 ` [Bug target/110559] " rdapp at gcc dot gnu.org
@ 2023-07-07 14:55 ` law at gcc dot gnu.org
2023-08-25 13:30 ` rdapp at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: law at gcc dot gnu.org @ 2023-07-07 14:55 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110559
Jeffrey A. Law <law at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Last reconfirmed| |2023-07-07
Ever confirmed|0 |1
--- Comment #2 from Jeffrey A. Law <law at gcc dot gnu.org> ---
Yea, we definitely want pressure sensitive scheduling. While it's more
valuable for scalar cases, it can help with some vector as well. Also note
there's two variants of the pressure sensitive scheduler support. I think we
use the newer one which is supposed to be better, but I don't think we've
really evaluated one over the other.
Setting issue rate to 1 for the first pass scheduler is a bit of a hack, though
not terribly uncommon. It's something I've wanted to go back and review, so
fully support you digging into that as well.
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug target/110559] Bad mask_load/mask_store codegen of RVV
2023-07-05 11:30 [Bug c/110559] New: Bad mask_load/mask_store codegen of RVV juzhe.zhong at rivai dot ai
2023-07-07 14:50 ` [Bug target/110559] " rdapp at gcc dot gnu.org
2023-07-07 14:55 ` law at gcc dot gnu.org
@ 2023-08-25 13:30 ` rdapp at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: rdapp at gcc dot gnu.org @ 2023-08-25 13:30 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110559
--- Comment #3 from Robin Dapp <rdapp at gcc dot gnu.org> ---
I got back to this again today, now that pressure-aware scheduling is the
default. As mentioned before, it helps but doesn't get rid of the spills.
Testing with the "generic ooo" scheduling model it looks like vector load/store
latency of 6 is too high. Yet, even setting them to 1 is not enough to get rid
of spills entirely. What helps is additionally lowering the vector alu latency
to 2 (from the default 3).
I'm not really sure how to properly handle this. As far as I can tell spilling
is always going to happen if we try to "wait" for dependencies and delay the
dependent instructions. In my experience the hardware does a better job at
live scheduling anyway and we only make things worse in several cases.
Previously I experimented with setting the latency of most instructions to 1
with few exceptions, and instead ensuring a proper instruction mix, i.e. trying to
keep every execution unit busy. That's not a panacea either, though.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2023-08-25 13:30 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-05 11:30 [Bug c/110559] New: Bad mask_load/mask_store codegen of RVV juzhe.zhong at rivai dot ai
2023-07-07 14:50 ` [Bug target/110559] " rdapp at gcc dot gnu.org
2023-07-07 14:55 ` law at gcc dot gnu.org
2023-08-25 13:30 ` rdapp at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).