From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id 7B9913858D32; Tue, 3 Jan 2023 02:03:10 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 7B9913858D32 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1672711390; bh=joH32LKyGTKZrTbXrn19RrpPDRxP7LG2BwdBtY/vuXg=; h=From:To:Subject:Date:From; b=hnMzTWsZ2eRpKmJ7qtCOQHG6ifK27HM2vnk7kkZCMhmWH9wDZSt5dNHcsrmHcJdPY rlwSW5e8RBlBjuCCQXxlzD9Kx198qThkIInDZlGhw9b9BWlk/oS+4l9U+UtgW7DEql ZIfo41oOobWxpOWnQaIveWLjOLV2fdQhAni1zLhQ= From: "juzhe.zhong at rivai dot ai" To: gcc-bugs@gcc.gnu.org Subject: [Bug c/108271] New: Missed RVV cost model Date: Tue, 03 Jan 2023 02:03:10 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: c X-Bugzilla-Version: 13.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: juzhe.zhong at rivai dot ai X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D108271 Bug ID: 108271 Summary: Missed RVV cost model Product: gcc Version: 13.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c Assignee: unassigned at gcc dot gnu.org Reporter: juzhe.zhong at rivai dot ai Target Milestone: --- #include "riscv_vector.h" void f3 (int * restrict in, int * restrict out, void * restrict mask_in, in= t n) { vfloat32mf2_t v =3D __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19); __riscv_vse32_v_f32mf2 ((float *)(out + 
10000), v, 19); vbool64_t mask =3D *(vbool64_t*)mask_in; for (int i =3D 0; i < n; i++) { vint16mf2_t v1 =3D __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 1= 9); __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19); vint32mf2_t v2 =3D __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 1= 9); __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19); vint32mf2_t v3 =3D __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)= (in + i + 200), 13); *(vint32mf2_t*)(out + i + 200) =3D v3; vfloat64m1_t v4 =3D __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11); __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11); vfloat64m1_t v5 =3D __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(i= n + i + 500), 11); __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11); vfloat64m1_t v6 =3D __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in= + i + 600), 11); __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11); vuint8mf4_t v7 =3D __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 1= 1); __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11); vuint8mf4_t v8 =3D __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 800), 1= 1); __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 800), v7, 11); vuint8mf4_t v9 =3D __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 900), 1= 1); __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 900), v7, 11); vuint8mf4_t v10 =3D __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 1000),= 11); __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 1000), v7, 11); } } -O3 -S ASM: f3: li a5,40960 addi a5,a5,-960 addi sp,sp,-64 sd s4,24(sp) add a4,a0,a5 add a5,a1,a5 vsetivli zero,19,e32,mf2,ta,ma vle32.v v24,0(a4) vse32.v v24,0(a5) vsetvli s4,zero,e8,mf8,ta,ma vlm.v v0,0(a2) ble a3,zero,.L1 addi a3,a3,1 sd s3,32(sp) slli a3,a3,2 li s3,4096 sd s2,40(sp) sd s5,16(sp) sd s6,8(sp) addi t6,s3,-1700 addi t5,s3,-1300 addi s6,s3,-900 addi s5,s3,-500 sd s0,56(sp) sd s1,48(sp) addi a0,a0,4 addi a4,a1,4 add s2,a1,a3 addi s3,s3,-100 .L3: vsetivli zero,19,e16,mf2,ta,ma mv a5,a4 vle16.v v24,0(a0) 
mv a3,a0 vse16.v v24,0(a4) addi a0,a0,4 vsetivli zero,19,e32,mf2,ta,ma addi a4,a4,4 vle32.v v24,0(a0) addi s1,a3,796 vse32.v v24,0(a4) vsetivli zero,13,e32,mf2,tu,mu addi s0,a5,796 vle32.v v24,0(s1),v0.t addi a1,a3,1196 addi t4,a5,1196 addi t2,a3,1996 addi t3,a5,1996 add t0,a3,t6 vsetvli s4,zero,e32,mf2,ta,ma add t1,a5,t6 vse32.v v24,0(s0) add a7,a5,t5 vsetivli zero,11,e64,m1,tu,mu add a6,a5,s6 vle64.v v24,0(a1),v0.t add a2,a5,s5 vse64.v v24,0(t4) add a3,a3,t5 vle64.v v24,0(t2),v0.t add a5,a5,s3 vse64.v v24,0(t3) vle64.v v24,0(t0),v0.t vse64.v v24,0(t1),v0.t vsetivli zero,11,e8,mf4,ta,ma vle8.v v24,0(a3) vse8.v v24,0(a7) vse8.v v24,0(a6) vse8.v v24,0(a2) vse8.v v24,0(a5) bne s2,a4,.L3 ld s0,56(sp) ld s1,48(sp) ld s2,40(sp) ld s3,32(sp) ld s5,16(sp) ld s6,8(sp) .L1: ld s4,24(sp) addi sp,sp,64 jr ra GCC allocates redundant stack space and generates a lot of redundant ld and sd instructions. However, if we use -O3 -fno-schedule-insns ASM: f3: li a5,40960 addi a5,a5,-960 add a4,a0,a5 add a5,a1,a5 vsetivli zero,19,e32,mf2,ta,ma vle32.v v24,0(a4) vse32.v v24,0(a5) vsetvli t3,zero,e8,mf8,ta,ma vlm.v v0,0(a2) ble a3,zero,.L1 addi a3,a3,1 li t1,4096 slli a3,a3,2 addi a4,a1,4 addi a7,t1,-1700 addi a6,t1,-1300 addi t5,t1,-900 addi t4,t1,-500 addi a2,a0,4 add a1,a1,a3 addi t1,t1,-100 .L3: mv a3,a2 vsetivli zero,19,e16,mf2,ta,ma mv a5,a4 vle16.v v24,0(a2) addi a0,a3,796 vse16.v v24,0(a4) addi a2,a2,4 vsetivli zero,19,e32,mf2,ta,ma addi a4,a4,4 vle32.v v24,0(a2) vse32.v v24,0(a4) vsetivli zero,13,e32,mf2,tu,mu vle32.v v24,0(a0),v0.t addi a0,a5,796 vsetvli t3,zero,e32,mf2,ta,ma vse32.v v24,0(a0) addi a0,a3,1196 vsetivli zero,11,e64,m1,tu,mu vle64.v v24,0(a0),v0.t addi a0,a5,1196 vse64.v v24,0(a0) addi a0,a3,1996 vle64.v v24,0(a0),v0.t addi a0,a5,1996 vse64.v v24,0(a0) add a0,a3,a7 vle64.v v24,0(a0),v0.t add a3,a3,a6 add a0,a5,a7 vse64.v v24,0(a0),v0.t vsetivli zero,11,e8,mf4,ta,ma vle8.v v24,0(a3) add a3,a5,a6 vse8.v v24,0(a3) add a3,a5,t5 vse8.v v24,0(a3) add a3,a5,t4 add a5,a5,t1 vse8.v 
v24,0(a3) vse8.v v24,0(a5) bne a1,a4,.L3 .L1: ret This issue is gone. We should correctly adjust the RVV instruction cost model so that the codegen with and without -fno-schedule-insns is the same.