public inbox for gcc-bugs@sourceware.org
* [Bug c/110559] New: Bad mask_load/mask_store codegen of RVV
@ 2023-07-05 11:30 juzhe.zhong at rivai dot ai
  2023-07-07 14:50 ` [Bug target/110559] " rdapp at gcc dot gnu.org
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-07-05 11:30 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110559

            Bug ID: 110559
           Summary: Bad mask_load/mask_store codegen of RVV
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: juzhe.zhong at rivai dot ai
  Target Milestone: ---

#include <stdint-gcc.h>

#define INDEX8 uint8_t
#define INDEX16 uint16_t
#define INDEX32 uint32_t
#define INDEX64 uint64_t

#define TEST_LOOP(DATA_TYPE, BITS)                                            \
  void __attribute__ ((noinline, noclone))                                    \
  f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src,           \
                 INDEX##BITS *restrict indices, INDEX##BITS *restrict cond)   \
  {                                                                           \
    for (int i = 0; i < 128; ++i)                                             \
      if (cond[i])                                                            \
        dest[i] += src[i];                                                    \
  }

#define TEST_ALL(T)                                                           \
  T (int8_t, 8)                                                               \
  T (uint8_t, 8)                                                              \
  T (int16_t, 16)                                                             \
  T (uint16_t, 16)                                                            \
  T (_Float16, 16)                                                            \
  T (int32_t, 32)                                                             \
  T (uint32_t, 32)                                                            \
  T (float, 32)                                                               \
  T (int64_t, 64)                                                             \
  T (uint64_t, 64)                                                            \
  T (double, 64)

TEST_ALL (TEST_LOOP)
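
A command line along these lines reproduces the codegen shown below (a sketch
only: the cross-compiler name and the -march/-mabi strings are illustrative,
the relevant options are the ones quoted under "riscv32"):

  riscv32-unknown-elf-gcc -O3 -march=rv32gcv -mabi=ilp32d \
      --param riscv-autovec-preference=fixed-vlmax -S bug.c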

riscv32: --param riscv-autovec-preference=fixed-vlmax -O3:
f_int8_t:
        addi    sp,sp,-48
        sw      s0,44(sp)
        sw      s1,40(sp)
        sw      s2,36(sp)
        sw      s3,32(sp)
        sw      s4,28(sp)
        sw      s5,24(sp)
        sw      s6,20(sp)
        sw      s7,16(sp)
        sw      s8,12(sp)
        sw      s9,8(sp)
        vsetivli        zero,16,e8,m1,ta,ma
        addi    s9,a3,16
        vmv.v.i v1,0
        vl1re8.v        v8,0(a3)
        vmsne.vv        v8,v8,v1
        vmv1r.v v0,v8
        vl1re8.v        v7,0(s9)
        vle8.v  v9,0(a0),v0.t
        vmsne.vv        v7,v7,v1
        vle8.v  v15,0(a1),v0.t
        addi    s8,a1,16
        vmv1r.v v0,v7
        addi    s7,a3,32
        vle8.v  v14,0(s8),v0.t
        vl1re8.v        v6,0(s7)
        addi    s5,a1,32
        vmsne.vv        v6,v6,v1
        addi    s6,a3,48
        vmv1r.v v0,v6
        vl1re8.v        v5,0(s6)
        vle8.v  v13,0(s5),v0.t
        vmsne.vv        v5,v5,v1
        addi    s4,a1,48
        vmv1r.v v0,v5
        addi    s3,a3,64
        vle8.v  v12,0(s4),v0.t
        vl1re8.v        v4,0(s3)
        addi    s1,a1,64
        vmsne.vv        v4,v4,v1
        addi    s2,a3,80
        vmv1r.v v0,v4
        addi    s0,a1,80
        addi    t2,a3,96
        vle8.v  v11,0(s1),v0.t
        vl1re8.v        v2,0(s2)
        vl1re8.v        v3,0(t2)
        vmsne.vv        v2,v2,v1
        vmsne.vv        v3,v3,v1
        vmv1r.v v0,v2
        vadd.vv v9,v9,v15
        vle8.v  v10,0(s0),v0.t
        vsetvli a5,zero,e8,m1,ta,ma
        vmv1r.v v0,v8
        addi    t4,a0,16
        vse8.v  v9,0(a0),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        vmv1r.v v0,v7
        vle8.v  v8,0(t4),v0.t
        vadd.vv v8,v8,v14
        vsetvli a5,zero,e8,m1,ta,ma
        addi    t3,a0,32
        vse8.v  v8,0(t4),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        vmv1r.v v0,v6
        vle8.v  v7,0(t3),v0.t
        vadd.vv v7,v7,v13
        vsetvli a5,zero,e8,m1,ta,ma
        addi    t1,a0,48
        vse8.v  v7,0(t3),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        vmv1r.v v0,v5
        vle8.v  v6,0(t1),v0.t
        vadd.vv v6,v6,v12
        vsetvli a5,zero,e8,m1,ta,ma
        addi    a7,a0,64
        vse8.v  v6,0(t1),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        vmv1r.v v0,v4
        vle8.v  v5,0(a7),v0.t
        vadd.vv v5,v5,v11
        vsetvli a5,zero,e8,m1,ta,ma
        addi    a6,a0,80
        vse8.v  v5,0(a7),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        vmv1r.v v0,v2
        vle8.v  v4,0(a6),v0.t
        vadd.vv v4,v4,v10
        vsetvli a5,zero,e8,m1,ta,ma
        addi    a2,a0,96
        vse8.v  v4,0(a6),v0.t
        addi    t0,a1,96
        vsetivli        zero,16,e8,m1,ta,ma
        addi    t6,a3,112
        vmv1r.v v0,v3
        vl1re8.v        v2,0(t6)
        vle8.v  v4,0(a2),v0.t
        vle8.v  v5,0(t0),v0.t
        vmsne.vv        v1,v2,v1
        vadd.vv v4,v4,v5
        vsetvli a5,zero,e8,m1,ta,ma
        addi    a4,a0,112
        vse8.v  v4,0(a2),v0.t
        addi    t5,a1,112
        vsetivli        zero,16,e8,m1,ta,ma
        vmv1r.v v0,v1
        vle8.v  v2,0(a4),v0.t
        vle8.v  v3,0(t5),v0.t
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        vse8.v  v2,0(a4),v0.t
        lw      s0,44(sp)
        lw      s1,40(sp)
        lw      s2,36(sp)
        lw      s3,32(sp)
        lw      s4,28(sp)
        lw      s5,24(sp)
        lw      s6,20(sp)
        lw      s7,16(sp)
        lw      s8,12(sp)
        lw      s9,8(sp)
        addi    sp,sp,48
        jr      ra

This codegen is very ugly and bad: too many "lw", "sw" and "vmv1r.v"
instructions.

But with -fno-schedule-insns, the codegen becomes much more reasonable:

f_uint8_t:
        vsetivli        zero,16,e8,m1,ta,ma
        addi    a2,a3,16
        vmv.v.i v1,0
        vl1re8.v        v0,0(a3)
        vmsne.vv        v0,v0,v1
        vle8.v  v2,0(a0),v0.t
        vle8.v  v3,0(a1),v0.t
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        addi    a4,a0,16
        vse8.v  v2,0(a0),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        vl1re8.v        v0,0(a2)
        addi    a2,a1,16
        vmsne.vv        v0,v0,v1
        vle8.v  v3,0(a4),v0.t
        vle8.v  v2,0(a2),v0.t
        addi    a2,a3,32
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        vse8.v  v2,0(a4),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        addi    a4,a0,32
        vl1re8.v        v0,0(a2)
        addi    a2,a1,32
        vmsne.vv        v0,v0,v1
        vle8.v  v2,0(a4),v0.t
        vle8.v  v3,0(a2),v0.t
        addi    a2,a3,48
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        vse8.v  v2,0(a4),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        addi    a4,a0,48
        vl1re8.v        v0,0(a2)
        addi    a2,a1,48
        vmsne.vv        v0,v0,v1
        vle8.v  v2,0(a4),v0.t
        vle8.v  v3,0(a2),v0.t
        addi    a2,a3,64
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        vse8.v  v2,0(a4),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        addi    a4,a0,64
        vl1re8.v        v0,0(a2)
        addi    a2,a1,64
        vmsne.vv        v0,v0,v1
        vle8.v  v2,0(a4),v0.t
        vle8.v  v3,0(a2),v0.t
        addi    a2,a3,80
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        vse8.v  v2,0(a4),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        addi    a4,a0,80
        vl1re8.v        v0,0(a2)
        addi    a2,a1,80
        vmsne.vv        v0,v0,v1
        vle8.v  v2,0(a4),v0.t
        vle8.v  v3,0(a2),v0.t
        addi    a2,a3,96
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        vse8.v  v2,0(a4),v0.t
        vsetivli        zero,16,e8,m1,ta,ma
        addi    a4,a0,96
        vl1re8.v        v0,0(a2)
        addi    a2,a1,96
        vmsne.vv        v0,v0,v1
        vle8.v  v2,0(a4),v0.t
        vle8.v  v3,0(a2),v0.t
        vadd.vv v2,v2,v3
        vsetvli a5,zero,e8,m1,ta,ma
        addi    a0,a0,112
        vse8.v  v2,0(a4),v0.t
        addi    a3,a3,112
        vsetivli        zero,16,e8,m1,ta,ma
        addi    a1,a1,112
        vl1re8.v        v0,0(a3)
        vmsne.vv        v0,v0,v1
        vle8.v  v1,0(a0),v0.t
        vle8.v  v2,0(a1),v0.t
        vadd.vv v1,v1,v2
        vsetvli a5,zero,e8,m1,ta,ma
        vse8.v  v1,0(a0),v0.t
        ret


* [Bug target/110559] Bad mask_load/mask_store codegen of RVV
  2023-07-05 11:30 [Bug c/110559] New: Bad mask_load/mask_store codegen of RVV juzhe.zhong at rivai dot ai
@ 2023-07-07 14:50 ` rdapp at gcc dot gnu.org
  2023-07-07 14:55 ` law at gcc dot gnu.org
  2023-08-25 13:30 ` rdapp at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: rdapp at gcc dot gnu.org @ 2023-07-07 14:50 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110559

--- Comment #1 from Robin Dapp <rdapp at gcc dot gnu.org> ---
This can be improved in part by enabling register-pressure-aware scheduling.
The rest is due to the default issue rate of 1.  Setting proper instruction
latencies will obviously cause a bit more reordering, but my tests haven't
shown a lot of additional spilling.

I'm going to set the scheduler options in a patch next week.
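
For example (illustrative invocation only; the planned patch would set the
corresponding scheduler tuning in the backend), pressure-aware scheduling can
already be requested explicitly with the generic GCC flag:

  riscv32-unknown-elf-gcc -O3 --param riscv-autovec-preference=fixed-vlmax \
      -fsched-pressure -S bug.c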


* [Bug target/110559] Bad mask_load/mask_store codegen of RVV
  2023-07-05 11:30 [Bug c/110559] New: Bad mask_load/mask_store codegen of RVV juzhe.zhong at rivai dot ai
  2023-07-07 14:50 ` [Bug target/110559] " rdapp at gcc dot gnu.org
@ 2023-07-07 14:55 ` law at gcc dot gnu.org
  2023-08-25 13:30 ` rdapp at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: law at gcc dot gnu.org @ 2023-07-07 14:55 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110559

Jeffrey A. Law <law at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2023-07-07
     Ever confirmed|0                           |1

--- Comment #2 from Jeffrey A. Law <law at gcc dot gnu.org> ---
Yea, we definitely want pressure-sensitive scheduling.  While it's more
valuable for scalar cases, it can help with some vector cases as well.  Also
note there are two variants of the pressure-sensitive scheduler support.  I
think we use the newer one, which is supposed to be better, but I don't think
we've really evaluated one over the other.

Setting the issue rate to 1 for the first-pass scheduler is a bit of a hack,
though not terribly uncommon.  It's something I've wanted to go back and
review, so I fully support you digging into that as well.
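
For reference, the two variants can be selected explicitly through a generic
GCC parameter (illustrative invocation only):

  # 1 selects the older weighted-pressure algorithm,
  # 2 the newer model-based one.
  riscv32-unknown-elf-gcc -O3 -fsched-pressure \
      --param=sched-pressure-algorithm=2 -S bug.c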


* [Bug target/110559] Bad mask_load/mask_store codegen of RVV
  2023-07-05 11:30 [Bug c/110559] New: Bad mask_load/mask_store codegen of RVV juzhe.zhong at rivai dot ai
  2023-07-07 14:50 ` [Bug target/110559] " rdapp at gcc dot gnu.org
  2023-07-07 14:55 ` law at gcc dot gnu.org
@ 2023-08-25 13:30 ` rdapp at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: rdapp at gcc dot gnu.org @ 2023-08-25 13:30 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110559

--- Comment #3 from Robin Dapp <rdapp at gcc dot gnu.org> ---
I got back to this again today, now that pressure-aware scheduling is the
default.  As mentioned before, it helps but doesn't get rid of the spills.

Testing with the "generic ooo" scheduling model it looks like vector load/store
latency of 6 is too high.  Yet, even setting them to 1 is not enough to get rid
of spills entirely.  What helps is additionally lowering the vector alu latency
to 2 (from the default 3).

I'm not really sure how to properly handle this.  As far as I can tell spilling
is always going to happen if we try to "wait" for dependencies and delay the
dependent instructions.  In my experience the hardware does a better job at
live scheduling anyway and we only make things worse in several cases. 
Previously I experimented with setting the latency of most instructions to 1
with few exceptions and instead ensure a proper instruction mix i.e. trying to
keep every execution unit busy.  That's not a panacea either, though.
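
To make the latency experiment concrete, here is a hypothetical fragment in
GCC's machine-description pipeline syntax.  All names (automaton, unit,
reservations) and the type-attribute values are purely illustrative and do
not reflect the actual generic-ooo.md contents; only the latency numbers
correspond to the experiment described above:

  ;; Illustrative sketch only -- not the real generic-ooo model.
  (define_automaton "sketch_vec_pipe")
  (define_cpu_unit "sketch_vxu" "sketch_vec_pipe")

  ;; Vector loads/stores: latency lowered from the default 6 to 1.
  (define_insn_reservation "sketch_vec_ldst" 1
    (and (eq_attr "tune" "generic_ooo")
         (eq_attr "type" "vlde,vlds,vste,vsts"))
    "sketch_vxu")

  ;; Vector integer ALU ops: latency lowered from 3 to 2.
  (define_insn_reservation "sketch_vec_alu" 2
    (and (eq_attr "tune" "generic_ooo")
         (eq_attr "type" "vialu"))
    "sketch_vxu")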

