[Bug c/112401] New: RISC-V: So many redundant move instructions due to subreg handling on vector mode

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug c/112401] New: RISC-V: So many redundant move instructions due to subreg handling on vector mode
@ 2023-11-06  3:52 juzhe.zhong at rivai dot ai
  2023-11-06  3:53 ` [Bug c/112401] " juzhe.zhong at rivai dot ai
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-11-06  3:52 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112401

            Bug ID: 112401
           Summary: RISC-V: So many redundant move instructions due to
                    subreg handling on vector mode
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: juzhe.zhong at rivai dot ai
  Target Milestone: ---

Consider the following case:

https://godbolt.org/z/8nc6r4joc

Compared with LLVM, GCC emits many redundant "vmv1r" move instructions.

#include <riscv_vector.h>

/* Reproducer for PR 112401.

   Loads one vint32m8_t from IN (vl = 32), splits it into its eight
   vint32m1_t parts with vget, adds every part to itself M times (vl = 4),
   and writes the eight parts to OUT at consecutive 4-element offsets.  */
void
subreg_to_reg_1 (int32_t *in, int32_t *out, size_t m)
{
  vint32m8_t result = __riscv_vle32_v_i32m8 (in, 32);
  /* Each vget picks one m1 part of RESULT by index.  The report attributes
     the extra vmv1r.v copies to how these sub-part extractions are handled.  */
  vint32m1_t v0 = __riscv_vget_v_i32m8_i32m1 (result, 0);
  vint32m1_t v1 = __riscv_vget_v_i32m8_i32m1 (result, 1);
  vint32m1_t v2 = __riscv_vget_v_i32m8_i32m1 (result, 2);
  vint32m1_t v3 = __riscv_vget_v_i32m8_i32m1 (result, 3);
  vint32m1_t v4 = __riscv_vget_v_i32m8_i32m1 (result, 4);
  vint32m1_t v5 = __riscv_vget_v_i32m8_i32m1 (result, 5);
  vint32m1_t v6 = __riscv_vget_v_i32m8_i32m1 (result, 6);
  vint32m1_t v7 = __riscv_vget_v_i32m8_i32m1 (result, 7);
  /* Every part is both source and destination of its own add, so the eight
     values stay live across the whole loop.  */
  for (size_t i = 0; i < m; i++)
    {
      v0 = __riscv_vadd_vv_i32m1(v0, v0, 4);
      v1 = __riscv_vadd_vv_i32m1(v1, v1, 4);
      v2 = __riscv_vadd_vv_i32m1(v2, v2, 4);
      v3 = __riscv_vadd_vv_i32m1(v3, v3, 4);
      v4 = __riscv_vadd_vv_i32m1(v4, v4, 4);
      v5 = __riscv_vadd_vv_i32m1(v5, v5, 4);
      v6 = __riscv_vadd_vv_i32m1(v6, v6, 4);
      v7 = __riscv_vadd_vv_i32m1(v7, v7, 4);
    }
  /* Store part k at out + 4*k (int32_t elements).  */
  *(vint32m1_t*)(out+4*0) = v0;
  *(vint32m1_t*)(out+4*1) = v1;
  *(vint32m1_t*)(out+4*2) = v2;
  *(vint32m1_t*)(out+4*3) = v3;
  *(vint32m1_t*)(out+4*4) = v4;
  *(vint32m1_t*)(out+4*5) = v5;
  *(vint32m1_t*)(out+4*6) = v6;
  *(vint32m1_t*)(out+4*7) = v7;
}

This issue does not only happen on RISC-V; it also happens on all other targets.

Lehua will send a patch to support subreg liveness tracking on GCC soon.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug c/112401] RISC-V: So many redundant move instructions due to subreg handling on vector mode
  2023-11-06  3:52 [Bug c/112401] New: RISC-V: So many redundant move instructions due to subreg handling on vector mode juzhe.zhong at rivai dot ai
@ 2023-11-06  3:53 ` juzhe.zhong at rivai dot ai
  2024-01-18  1:27 ` [Bug rtl-optimization/112401] " juzhe.zhong at rivai dot ai
  2024-01-18  1:28 ` juzhe.zhong at rivai dot ai
  2 siblings, 0 replies; 4+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-11-06  3:53 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112401

--- Comment #1 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
GCC ASM:

# GCC output for subreg_to_reg_1 (a0 = in, a1 = out, a2 = m).
# The m8 load lands in v16..v23, and each part is then copied into a separate
# register (v8..v1) before use; these eight vmv1r.v are the moves the report
# calls redundant.
subreg_to_reg_1:
        li      a5,32
        vsetvli zero,a5,e32,m8,ta,ma
        vle32.v v16,0(a0)
        # One whole-register copy per vget in the C source.
        vmv1r.v v8,v16
        vmv1r.v v7,v17
        vmv1r.v v6,v18
        vmv1r.v v5,v19
        vmv1r.v v4,v20
        vmv1r.v v3,v21
        vmv1r.v v2,v22
        vmv1r.v v1,v23
        # Skip the loop entirely when m == 0.
        beq     a2,zero,.L2
        li      a5,0
        vsetivli        zero,4,e32,m1,ta,ma
        # Loop body: a5 counts iterations up to a2; each copy is added to itself.
.L3:
        addi    a5,a5,1
        vadd.vv v8,v8,v8
        vadd.vv v7,v7,v7
        vadd.vv v6,v6,v6
        vadd.vv v5,v5,v5
        vadd.vv v4,v4,v4
        vadd.vv v3,v3,v3
        vadd.vv v2,v2,v2
        vadd.vv v1,v1,v1
        bne     a2,a5,.L3
        # Store the eight parts at a1 + 0, 16, ..., 112 bytes.
.L2:
        vs1r.v  v8,0(a1)
        addi    a5,a1,16
        vs1r.v  v7,0(a5)
        addi    a5,a1,32
        vs1r.v  v6,0(a5)
        addi    a5,a1,48
        vs1r.v  v5,0(a5)
        addi    a5,a1,64
        vs1r.v  v4,0(a5)
        addi    a5,a1,80
        vs1r.v  v3,0(a5)
        addi    a5,a1,96
        vs1r.v  v2,0(a5)
        addi    a1,a1,112
        vs1r.v  v1,0(a1)
        ret

LLVM ASM:

# LLVM output for the same function (a0 = in, a1 = out, a2 = m).
# The m8 load lands in v8..v15 and the loop and stores use those registers
# directly; there is no vmv1r.v anywhere in this listing.
subreg_to_reg_1:                        # @subreg_to_reg_1
        li      a3, 32
        vsetvli zero, a3, e32, m8, ta, ma
        vle32.v v8, (a0)
        addi    a0, a1, 16
        # Skip the loop entirely when m == 0.
        beqz    a2, .LBB0_2
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        vsetivli        zero, 4, e32, m1, ta, ma
        vadd.vv v8, v8, v8
        vadd.vv v9, v9, v9
        vadd.vv v10, v10, v10
        vadd.vv v11, v11, v11
        vadd.vv v12, v12, v12
        vadd.vv v13, v13, v13
        vadd.vv v14, v14, v14
        # a2 is counted down to zero as the loop counter.
        addi    a2, a2, -1
        vadd.vv v15, v15, v15
        bnez    a2, .LBB0_1
        # Store v8..v15 in place: first at out, the rest at (out + 16) + 0, 16, ..., 96.
.LBB0_2:
        vs1r.v  v8, (a1)
        vs1r.v  v9, (a0)
        addi    a1, a0, 16
        vs1r.v  v10, (a1)
        addi    a1, a0, 32
        vs1r.v  v11, (a1)
        addi    a1, a0, 48
        vs1r.v  v12, (a1)
        addi    a1, a0, 64
        vs1r.v  v13, (a1)
        addi    a1, a0, 80
        vs1r.v  v14, (a1)
        addi    a0, a0, 96
        vs1r.v  v15, (a0)
        ret

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug rtl-optimization/112401] RISC-V: So many redundant move instructions due to subreg handling on vector mode
  2023-11-06  3:52 [Bug c/112401] New: RISC-V: So many redundant move instructions due to subreg handling on vector mode juzhe.zhong at rivai dot ai
  2023-11-06  3:53 ` [Bug c/112401] " juzhe.zhong at rivai dot ai
@ 2024-01-18  1:27 ` juzhe.zhong at rivai dot ai
  2024-01-18  1:28 ` juzhe.zhong at rivai dot ai
  2 siblings, 0 replies; 4+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-18  1:27 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112401

--- Comment #2 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
Add one more test:

/* Second reproducer: segment-loads a 4-field tuple from SRC (vl = 4),
   extracts the four vfloat32m1_t fields with vget, packs them into a single
   vfloat32m4_t with vcreate, and stores 16 elements to DST.
   Going by the name, this is meant as a 4x4 float matrix transpose.  */
void matrix_4x4_transpose_segmented_load(float* dst, float* src)
{
    vfloat32m1x4_t data = __riscv_vlseg4e32_v_f32m1x4(src, 4);
    vfloat32m1_t data0 = __riscv_vget_v_f32m1x4_f32m1(data, 0);
    vfloat32m1_t data1 = __riscv_vget_v_f32m1x4_f32m1(data, 1);
    vfloat32m1_t data2 = __riscv_vget_v_f32m1x4_f32m1(data, 2);
    vfloat32m1_t data3 = __riscv_vget_v_f32m1x4_f32m1(data, 3);
    /* The four fields are recombined in their original order (0, 1, 2, 3).  */
    vfloat32m4_t packedData = __riscv_vcreate_v_f32m1_f32m4(data0,
                                                            data1,
                                                            data2,
                                                            data3);
    __riscv_vse32_v_f32m4(dst, packedData, 16);
}

# GCC output for matrix_4x4_transpose_segmented_load (a0 = dst, a1 = src).
# The segment load fills v8..v11; those four registers are then copied
# one-to-one into v4..v7 just so the m4 store can read them from v4.
matrix_4x4_transpose_segmented_load:
        vsetivli        zero,4,e32,m1,ta,ma
        vlseg4e32.v     v8,(a1)
        vsetivli        zero,16,e32,m4,ta,ma
        # Copies v8..v11 -> v4..v7, same order, no other use of v8..v11.
        vmv1r.v v4,v8
        vmv1r.v v5,v9
        vmv1r.v v6,v10
        vmv1r.v v7,v11
        vse32.v v4,0(a0)
        ret

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug rtl-optimization/112401] RISC-V: So many redundant move instructions due to subreg handling on vector mode
  2023-11-06  3:52 [Bug c/112401] New: RISC-V: So many redundant move instructions due to subreg handling on vector mode juzhe.zhong at rivai dot ai
  2023-11-06  3:53 ` [Bug c/112401] " juzhe.zhong at rivai dot ai
  2024-01-18  1:27 ` [Bug rtl-optimization/112401] " juzhe.zhong at rivai dot ai
@ 2024-01-18  1:28 ` juzhe.zhong at rivai dot ai
  2 siblings, 0 replies; 4+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-18  1:28 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112401

--- Comment #3 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
/* Third reproducer: splits SRC into its four vfloat32m1_t parts, combines
   them pairwise with masked vslideup/vslidedown by 1, then with unmasked
   tail-undisturbed slides by 2, and packs the four results back into a
   vfloat32m4_t with vcreate.
   Going by the name, this is meant as a 4x4 float matrix transpose.  */
vfloat32m4_t matrix_4x4_transpose_vslide(vfloat32m4_t src) {
    vfloat32m1_t inMat0 = __riscv_vget_v_f32m4_f32m1(src, 0);
    vfloat32m1_t inMat1 = __riscv_vget_v_f32m4_f32m1(src, 1);
    vfloat32m1_t inMat2 = __riscv_vget_v_f32m4_f32m1(src, 2);
    vfloat32m1_t inMat3 = __riscv_vget_v_f32m4_f32m1(src, 3);
// Mask bit patterns are materialised as element 0 of a u32 vector (vl = 1)
// and reinterpreted as vbool32_t below.
vuint32m1_t oddMask_u32 = __riscv_vmv_v_x_u32m1(0xaaaa, 1);
vuint32m1_t evenMask_u32 = __riscv_vmv_v_x_u32m1(0x5555, 1);

vbool32_t oddMask = __riscv_vreinterpret_v_u32m1_b32(oddMask_u32);
// vl=4 in the following
// should be mapped to vslideup.vi
vfloat32m1_t transMat0 = __riscv_vslideup_vx_f32m1_tumu(oddMask,
                                                        inMat0,
                                                        inMat1,
                                                        1, 4);
vfloat32m1_t transMat2 = __riscv_vslideup_vx_f32m1_tumu(oddMask,
                                                        inMat2,
                                                        inMat3,
                                                         1, 4);

vbool32_t evenMask = __riscv_vreinterpret_v_u32m1_b32(evenMask_u32);
// should be mapped to vslidedown.vi
vfloat32m1_t transMat1 = __riscv_vslidedown_vx_f32m1_tumu(evenMask,
                                                          inMat1,
                                                          inMat0,
                                                          1, 4);
vfloat32m1_t transMat3 = __riscv_vslidedown_vx_f32m1_tumu(evenMask,
                                                          inMat3,
                                                          inMat2,
                                                          1, 4);

// should be mapped to vslideup.vi
vfloat32m1_t outMat0 = __riscv_vslideup_vx_f32m1_tu(transMat0,
                                                    transMat2,
                                                    2, 4);
vfloat32m1_t outMat1 = __riscv_vslideup_vx_f32m1_tu(transMat1,
                                                    transMat3,
                                                    2, 4);

// vl=2 in the following
// should be mapped to vslidedown.vi
vfloat32m1_t outMat2 = __riscv_vslidedown_vx_f32m1_tu(transMat2,
                                                      transMat0,
                                                      2, 2);
vfloat32m1_t outMat3 = __riscv_vslidedown_vx_f32m1_tu(transMat3,
                                                      transMat1,
                                                      2, 2);

return __riscv_vcreate_v_f32m1_f32m4(outMat0,
                                     outMat1,
                                     outMat2,
                                     outMat3);




}


# GCC output for matrix_4x4_transpose_vslide.
# The input is read from the address in a1 into v4..v7 and the result is
# written from v8..v11 to the address in a0.  The listing contains ten
# vmv1r.v whole-register copies, the kind of move this report is about.
matrix_4x4_transpose_vslide:
        # a4 = 45056 - 1366 = 0xaaaa (odd mask).
        li      a4,45056
        addiw   a4,a4,-1366
        vsetivli        zero,1,e32,m1,ta,ma
        li      a5,20480
        vmv.v.x v0,a4
        vsetivli        zero,4,e32,m1,tu,mu
        vl4re32.v       v4,0(a1)
        # a5 = 20480 + 1365 = 0x5555 (even mask).
        addiw   a5,a5,1365
        # The slides below overwrite their destination, so GCC first copies
        # the destination operand into a fresh register.
        vmv1r.v v12,v4
        vmv1r.v v3,v6
        vslideup.vi     v12,v5,1,v0.t
        vslideup.vi     v3,v7,1,v0.t
        vsetivli        zero,1,e32,m1,ta,ma
        vmv1r.v v1,v12
        vmv.v.x v0,a5
        vsetivli        zero,4,e32,m1,tu,mu
        vslideup.vi     v1,v3,2
        vmv1r.v v2,v5
        # v8 = first part of the returned m4 value.
        vmv1r.v v8,v1
        vslidedown.vi   v2,v4,1,v0.t
        vmv1r.v v1,v7
        vmv1r.v v4,v2
        vslidedown.vi   v1,v6,1,v0.t
        vslideup.vi     v4,v1,2
        vsetivli        zero,2,e32,m1,tu,ma
        # v9, v10, v11 = remaining parts of the returned m4 value.
        vmv1r.v v9,v4
        vslidedown.vi   v3,v12,2
        vslidedown.vi   v1,v2,2
        vmv1r.v v10,v3
        vmv1r.v v11,v1
        vs4r.v  v8,0(a0)
        ret

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2024-01-18  1:28 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-06  3:52 [Bug c/112401] New: RISC-V: So many redundant move instructions due to subreg handling on vector mode juzhe.zhong at rivai dot ai
2023-11-06  3:53 ` [Bug c/112401] " juzhe.zhong at rivai dot ai
2024-01-18  1:27 ` [Bug rtl-optimization/112401] " juzhe.zhong at rivai dot ai
2024-01-18  1:28 ` juzhe.zhong at rivai dot ai

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).