From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id 355A5387085F; Fri, 5 Mar 2021 14:11:48 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 355A5387085F From: "hubicka at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug middle-end/99408] New: s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc Date: Fri, 05 Mar 2021 14:11:48 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: middle-end X-Bugzilla-Version: 11.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: hubicka at gcc dot gnu.org X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-BeenThere: gcc-bugs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-bugs mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 05 Mar 2021 14:11:48 -0000 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D99408 Bug ID: 99408 Summary: s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc Product: gcc Version: 11.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: hubicka at gcc dot gnu.org Target Milestone: --- typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D]; void main(void) { for (int nl =3D 0; nl < 
iterations; nl++) { for (int i =3D 0; i < LEN_1D-1; i++){ a[i+1] =3D b[i]+c[i]; b[i] =3D c[i]*e[i]; d[i] =3D a[i]*e[i]; } } } Built with -march=3Dznver2 -Ofast I get: main: .LFB0: .cfi_startproc vmovaps c+127968(%rip), %xmm5 vmovaps e+127968(%rip), %xmm4 movl $100000, %edx vmovq c+127984(%rip), %xmm9 vmovq e+127984(%rip), %xmm10 vmovss c+127992(%rip), %xmm7 vmovss e+127992(%rip), %xmm3 vmovss c+127984(%rip), %xmm13 vmulps %xmm4, %xmm5, %xmm6 vmulps %xmm9, %xmm10, %xmm12 vmulss %xmm3, %xmm7, %xmm11 .p2align 4 .p2align 3 .L2: xorl %eax, %eax .p2align 4 .p2align 3 .L4: vmovaps c(%rax), %ymm2 addq $32, %rax vaddps b-32(%rax), %ymm2, %ymm0 vmovups %ymm0, a-28(%rax) vmulps e-32(%rax), %ymm2, %ymm0 vmovaps e-32(%rax), %ymm2 vmovaps %ymm0, b-32(%rax) vmulps a-32(%rax), %ymm2, %ymm0 vmovaps %ymm0, d-32(%rax) cmpq $127968, %rax jne .L4 vaddps b+127968(%rip), %xmm5, %xmm1 vaddss b+127984(%rip), %xmm13, %xmm2 decl %edx vmovaps %xmm6, b+127968(%rip) vmovq b+127984(%rip), %xmm0 vmovlps %xmm12, b+127984(%rip) vaddps %xmm0, %xmm9, %xmm0 vmovups %xmm1, a+127972(%rip) vshufps $255, %xmm1, %xmm1, %xmm1 vmulps a+127968(%rip), %xmm4, %xmm8 vunpcklps %xmm2, %xmm1, %xmm1 vaddss b+127992(%rip), %xmm7, %xmm2 vmovss %xmm11, b+127992(%rip) vmulps %xmm10, %xmm1, %xmm1 vmovlps %xmm0, a+127988(%rip) vmovshdup %xmm0, %xmm0 vmulss %xmm3, %xmm0, %xmm0 vmovss %xmm2, a+127996(%rip) jne .L2 vmovaps %xmm8, d+127968(%rip) vmovlps %xmm1, d+127984(%rip) vmovss %xmm0, d+127992(%rip) vzeroupper ret Clang does: main: # @main .cfi_startproc # %bb.0: vbroadcastss a(%rip), %ymm0 vmovss e+127968(%rip), %xmm1 # xmm1 =3D mem[0],zero,zero= ,zero vmovss e+127980(%rip), %xmm2 # xmm2 =3D mem[0],zero,zero= ,zero vmovss c+127984(%rip), %xmm4 # xmm4 =3D mem[0],zero,zero= ,zero vmovss e+127984(%rip), %xmm5 # xmm5 =3D mem[0],zero,zero= ,zero vmovss c+127988(%rip), %xmm8 # xmm8 =3D mem[0],zero,zero= ,zero vmovss e+127988(%rip), %xmm9 # xmm9 =3D mem[0],zero,zero= ,zero vmovss c+127992(%rip), %xmm11 # xmm11 =3D 
mem[0],zero,zer= o,zero vmovss e+127992(%rip), %xmm12 # xmm12 =3D mem[0],zero,zer= o,zero xorl %eax, %eax vmovups %ymm0, -56(%rsp) # 32-byte Spill vmovss c+127968(%rip), %xmm0 # xmm0 =3D mem[0],zero,zero= ,zero vmovss %xmm1, -64(%rsp) # 4-byte Spill vmulss %xmm4, %xmm5, %xmm3 vmulss %xmm8, %xmm9, %xmm10 vmulss %xmm11, %xmm12, %xmm13 vmovss %xmm0, -60(%rsp) # 4-byte Spill vmulss %xmm0, %xmm1, %xmm0 vmovss e+127972(%rip), %xmm1 # xmm1 =3D mem[0],zero,zero= ,zero vmovss %xmm0, -68(%rsp) # 4-byte Spill vmovss c+127972(%rip), %xmm0 # xmm0 =3D mem[0],zero,zero= ,zero vmovss %xmm1, -76(%rsp) # 4-byte Spill vmovss %xmm0, -72(%rsp) # 4-byte Spill vmulss %xmm0, %xmm1, %xmm0 vmovss e+127976(%rip), %xmm1 # xmm1 =3D mem[0],zero,zero= ,zero vmovss %xmm0, -80(%rsp) # 4-byte Spill vmovss c+127976(%rip), %xmm0 # xmm0 =3D mem[0],zero,zero= ,zero vmovss %xmm1, -88(%rsp) # 4-byte Spill vmovss %xmm0, -84(%rsp) # 4-byte Spill vmulss %xmm0, %xmm1, %xmm0 vmovss c+127980(%rip), %xmm1 # xmm1 =3D mem[0],zero,zero= ,zero vmovss %xmm0, -92(%rsp) # 4-byte Spill vmulss %xmm1, %xmm2, %xmm0 vmovss %xmm0, -96(%rsp) # 4-byte Spill .p2align 4, 0x90 .LBB0_1: # =3D>This Loop Header: Depth=3D1 # Child Loop BB0_2 Depth 2 vmovups -56(%rsp), %ymm14 # 32-byte Reload xorl %ecx, %ecx .p2align 4, 0x90 .LBB0_2: # Parent Loop BB0_1 Depth=3D1 # =3D> This Inner Loop Header: Dep= th=3D2 vmovups c(%rcx), %ymm7 vmovaps %ymm14, %ymm15 vmovups e(%rcx), %ymm0 vaddps b(%rcx), %ymm7, %ymm14 vmulps %ymm7, %ymm0, %ymm7 vperm2f128 $33, %ymm14, %ymm15, %ymm15 # ymm15 =3D ymm15[2,3],ymm14[0,1] vmovups %ymm14, a+4(%rcx) vmovups %ymm7, b(%rcx) vshufps $3, %ymm14, %ymm15, %ymm15 # ymm15 =3D ymm15[3,0],ymm14[0,0],ymm15[7,4],ymm14[4,4] vshufps $152, %ymm14, %ymm15, %ymm15 # ymm15 =3D ymm15[0,2],ymm14[1,2],ymm15[4,6],ymm14[5,6] vmulps %ymm0, %ymm15, %ymm0 vmovups %ymm0, d(%rcx) addq $32, %rcx cmpq $127968, %rcx # imm =3D 0x1F3E0 jne .LBB0_2 # %bb.3: # in Loop: Header=3DBB0_1 Depth= =3D1 vextractf128 $1, %ymm14, %xmm0 vmovss 
-60(%rsp), %xmm7 # 4-byte Reload # xmm7 =3D mem[0],zero,zero,zero vmovss -68(%rsp), %xmm6 # 4-byte Reload # xmm6 =3D mem[0],zero,zero,zero incl %eax vpermilps $231, %xmm0, %xmm0 # xmm0 =3D xmm0[3,1,2,3] vmulss -64(%rsp), %xmm0, %xmm0 # 4-byte Folded Reload vaddss b+127968(%rip), %xmm7, %xmm7 vmovss %xmm6, b+127968(%rip) vmovss -80(%rsp), %xmm6 # 4-byte Reload # xmm6 =3D mem[0],zero,zero,zero vmovss %xmm0, d+127968(%rip) vmovss -72(%rsp), %xmm0 # 4-byte Reload # xmm0 =3D mem[0],zero,zero,zero vmovss %xmm7, a+127972(%rip) vmulss -76(%rsp), %xmm7, %xmm7 # 4-byte Folded Reload vaddss b+127972(%rip), %xmm0, %xmm0 vmovss %xmm6, b+127972(%rip) vmovss -84(%rsp), %xmm6 # 4-byte Reload # xmm6 =3D mem[0],zero,zero,zero vmovss %xmm7, d+127972(%rip) vaddss b+127976(%rip), %xmm6, %xmm7 vmovss -92(%rsp), %xmm6 # 4-byte Reload # xmm6 =3D mem[0],zero,zero,zero vmovss %xmm0, a+127976(%rip) vmulss -88(%rsp), %xmm0, %xmm0 # 4-byte Folded Reload vmovss %xmm6, b+127976(%rip) vmovss -96(%rsp), %xmm6 # 4-byte Reload # xmm6 =3D mem[0],zero,zero,zero vmovss %xmm7, a+127980(%rip) vmulss %xmm2, %xmm7, %xmm7 vmovss %xmm0, d+127976(%rip) vaddss b+127980(%rip), %xmm1, %xmm0 vmovss %xmm7, d+127980(%rip) vmovss %xmm6, b+127980(%rip) vaddss b+127984(%rip), %xmm4, %xmm7 vmovss %xmm3, b+127984(%rip) vmovss %xmm0, a+127984(%rip) vmulss %xmm5, %xmm0, %xmm0 vmovss %xmm0, d+127984(%rip) vaddss b+127988(%rip), %xmm8, %xmm0 vmovss %xmm10, b+127988(%rip) vaddss b+127992(%rip), %xmm11, %xmm6 vmovss %xmm13, b+127992(%rip) vmovss %xmm7, a+127988(%rip) vmulss %xmm7, %xmm9, %xmm7 vmovss %xmm7, d+127988(%rip) vmovss %xmm0, a+127992(%rip) vmulss %xmm0, %xmm12, %xmm0 vmovss %xmm6, a+127996(%rip) vmovss %xmm0, d+127992(%rip) cmpl $100000, %eax # imm =3D 0x186A0 jne .LBB0_1 # %bb.4: vzeroupper retq Runtime with clang is 0.443s and GCC 2.317s. With -fno-tree-vectorize I get 2.153s=