From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id C89D23836C50; Fri, 5 Mar 2021 16:20:54 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org C89D23836C50 From: "hubicka at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug middle-end/99416] New: s211 benchmark of TSVC is vectorized by icc and not by gcc Date: Fri, 05 Mar 2021 16:20:54 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: middle-end X-Bugzilla-Version: 11.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: hubicka at gcc dot gnu.org X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-BeenThere: gcc-bugs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-bugs mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 05 Mar 2021 16:20:54 -0000 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99416 Bug ID: 99416 Summary: s211 benchmark of TSVC is vectorized by icc and not by gcc Product: gcc Version: 11.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: hubicka at gcc dot gnu.org Target Milestone: --- typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D]; void main() { for (int nl = 0; nl < iterations; nl++) { for (int i = 1; i < LEN_1D-1; i++) { a[i] 
= b[i - 1] + c[i] * d[i]; b[i] = b[i + 1] - e[i] * d[i]; } } } Icc produces: ain: ..B1.1: # Preds ..B1.0 # Execution count [0.00e+00] .cfi_startproc ..___tag_value_ain.1: ..L2: #9.1 subq $136, %rsp #9.1 .cfi_def_cfa_offset 144 xorl %edx, %edx #11.5 lea 12+d(%rip), %r8 #14.38 vmovss (%r8), %xmm0 #14.38 movl $7, %edi #13.38 lea 12+e(%rip), %r9 #14.38 vmulss (%r9), %xmm0, %xmm12 #14.38 xorl %esi, %esi #13.38 lea 12+c(%rip), %r10 #13.38 vmulss (%r10), %xmm0, %xmm0 #13.38 vmovss 16(%r8), %xmm4 #14.38 movl $31977, %ecx #12.9 vmulss 16(%r9), %xmm4, %xmm14 #14.38 movl $31975, %eax #12.9 lea 24+b(%rip), %r11 #14.20 vmovss (%r11), %xmm11 #14.20 vmovss 4(%r8), %xmm6 #14.38 vmovss %xmm12, 104(%rsp) #14.38[spill] vmovss %xmm11, 8(%rsp) #14.20[spill] vmulss 4(%r9), %xmm6, %xmm12 #14.38 vmulss 4(%r10), %xmm6, %xmm11 #13.38 vmovss 127984+d(%rip), %xmm6 #14.38 vmovss 8(%r8), %xmm13 #14.38 vmovss %xmm14, 96(%rsp) #14.38[spill] vmulss 127984+e(%rip), %xmm6, %xmm14 #14.38 vmulss 8(%r9), %xmm13, %xmm1 #14.38 vmovss %xmm14, 112(%rsp) #14.38[spill] vmovss 127988+d(%rip), %xmm14 #14.38 vmovss %xmm1, 16(%rsp) #14.38[spill] vmulss 8(%r10), %xmm13, %xmm1 #13.38 vmulss 16(%r10), %xmm4, %xmm13 #13.38 vmulss 127988+e(%rip), %xmm14, %xmm4 #14.38 vmovss %xmm4, 120(%rsp) #14.38[spill] vmulss 127988+c(%rip), %xmm14, %xmm4 #13.38 vmovss -4(%r11), %xmm5 #14.20 vmovss -8(%r8), %xmm2 #14.38 vmovss 12(%r8), %xmm15 #14.38 vmovss %xmm4, 24(%rsp) #13.38[spill] vmovss 127992+d(%rip), %xmm4 #14.38 vmovss %xmm5, (%rsp) #14.20[spill] vmulss -8(%r9), %xmm2, %xmm3 #14.38 vmulss -8(%r10), %xmm2, %xmm5 #13.38 vmulss 12(%r9), %xmm15, %xmm2 #14.38 vmulss 12(%r10), %xmm15, %xmm15 #13.38 vmulss 127992+e(%rip), %xmm4, %xmm14 #14.38 vmulss 127992+c(%rip), %xmm4, %xmm4 #13.38 vmovss -4(%r8), %xmm10 #14.38 vmulss -4(%r9), %xmm10, %xmm7 #14.38 vmulss -4(%r10), %xmm10, %xmm10 #13.38 vmovss %xmm7, 88(%rsp) #14.38[spill] vmovss %xmm4, 32(%rsp) #13.38[spill] vmovss %xmm15, 56(%rsp) #13.31[spill] 
vmovss %xmm14, 40(%rsp) #13.31[spill] vmovss %xmm3, 80(%rsp) #13.31[spill] vmovss -16(%r11), %xmm9 #14.20 vmovss -12(%r11), %xmm8 #14.20 vmovss -8(%r11), %xmm7 #14.20 vmovss 127984+c(%rip), %xmm4 #13.31 vmovss %xmm1, 64(%rsp) #13.31[spill] vmovss %xmm0, 48(%rsp) #13.31[spill] vmovss %xmm2, 72(%rsp) #13.31[spill] vmovss 16(%rsp), %xmm14 #13.31[spill] vmovss 8(%rsp), %xmm15 #13.31[spill] vmovss (%rsp), %xmm3 #13.31[spill] # LOE rax rcx rbx rbp rsi rdi r12 r13 r14 r15 edx xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 ..B1.2: # Preds ..B1.10 ..B1.1 # Execution count [1.00e+05] movq %rdi, %r8 #12.9 vsubss 80(%rsp), %xmm9, %xmm0 #14.38[spill] vsubss 88(%rsp), %xmm8, %xmm1 #14.38[spill] vsubss 104(%rsp), %xmm7, %xmm2 #14.38[spill] vsubss %xmm14, %xmm15, %xmm7 #14.38 vsubss %xmm12, %xmm3, %xmm3 #14.38 vmovss 28+b(%rip), %xmm8 #14.20 vmovss 32+b(%rip), %xmm15 #14.20 vmovss %xmm0, 4+b(%rip) #14.13 vmovss %xmm1, 8+b(%rip) #14.13 vmovss %xmm2, 12+b(%rip) #14.13 vmovss %xmm3, 16+b(%rip) #14.13 vmovss %xmm7, 20+b(%rip) #14.13 vsubss 72(%rsp), %xmm8, %xmm9 #14.38[spill] vsubss 96(%rsp), %xmm15, %xmm0 #14.38[spill] vmovss %xmm9, 24+b(%rip) #14.13 vmovss %xmm0, 28+b(%rip) #14.13 # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14 ..B1.3: # Preds ..B1.3 ..B1.2 # Execution count [3.20e+09] vmovups 4+e(,%r8,4), %ymm1 #14.31 lea (,%r8,4), %r9 #14.13 vmovups 36+e(,%r8,4), %ymm3 #14.31 vmovups 68+e(,%r8,4), %ymm8 #14.31 vmovups 100+e(,%r8,4), %ymm15 #14.31 vmovups 4+d(,%r8,4), %ymm0 #14.38 vmovups 36+d(,%r8,4), %ymm2 #14.38 vmovups 68+d(,%r8,4), %ymm7 #14.38 vmovups 100+d(,%r8,4), %ymm9 #14.38 vfnmadd213ps 8+b(,%r8,4), %ymm0, %ymm1 #14.38 vfnmadd213ps 40+b(,%r8,4), %ymm2, %ymm3 #14.38 vfnmadd213ps 72+b(,%r8,4), %ymm7, %ymm8 #14.38 vfnmadd213ps 104+b(,%r8,4), %ymm9, %ymm15 #14.38 vmovups %ymm1, 4+b(%r9) #14.13 vmovups %ymm3, 36+b(%r9) #14.13 vmovups %ymm8, 68+b(%r9) #14.13 vmovups %ymm15, 
100+b(%r9) #14.13 addq $32, %r8 #12.9 cmpq $31975, %r8 #12.9 jb ..B1.3 # Prob 99% #12.9 # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14 ..B1.4: # Preds ..B1.3 # Execution count [1.00e+05] movq %rsi, %r9 #12.9 movq %rcx, %r8 #12.9 # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14 ..B1.5: # Preds ..B1.5 ..B1.4 # Execution count [3.20e+09] vmovups 127904+e(,%r9,4), %xmm1 #14.31 vmovups 127904+d(,%r9,4), %xmm0 #14.38 vfnmadd213ps b(,%r8,4), %xmm0, %xmm1 #14.38 addq $4, %r8 #12.9 vmovups %xmm1, 127904+b(,%r9,4) #14.13 addq $4, %r9 #12.9 cmpq $20, %r9 #12.9 jb ..B1.5 # Prob 99% #12.9 # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14 ..B1.6: # Preds ..B1.5 # Execution count [1.00e+05] vmovss 127996+b(%rip), %xmm9 #14.20 movq %rdi, %r8 #12.9 vmovss 127992+b(%rip), %xmm1 #14.20 vmovss 127988+b(%rip), %xmm2 #14.20 vaddss b(%rip), %xmm5, %xmm7 #13.38 vaddss 4+b(%rip), %xmm10, %xmm3 #13.38 vsubss 40(%rsp), %xmm9, %xmm8 #14.38[spill] vsubss 112(%rsp), %xmm2, %xmm2 #14.38[spill] vsubss 120(%rsp), %xmm1, %xmm1 #14.38[spill] vmovss %xmm7, 4+a(%rip) #13.13 vmovss 16+b(%rip), %xmm7 #13.20 vmovss %xmm3, 8+a(%rip) #13.13 vmovss 8+b(%rip), %xmm9 #13.20 vmovss %xmm8, 127992+b(%rip) #14.13 vmovss 12+b(%rip), %xmm8 #13.20 vmovss %xmm2, 127984+b(%rip) #14.13 vaddss %xmm11, %xmm8, %xmm0 #13.38 vaddss 64(%rsp), %xmm7, %xmm3 #13.38[spill] vaddss 48(%rsp), %xmm9, %xmm15 #13.38[spill] vmovss %xmm3, 20+a(%rip) #13.13 vmovss 20+b(%rip), %xmm3 #13.20 vmovss %xmm15, 12+a(%rip) #13.13 vmovss %xmm0, 16+a(%rip) #13.13 vmovss %xmm1, 127988+b(%rip) #14.13 vmovss %xmm9, (%rsp) #13.13[spill] vaddss 56(%rsp), %xmm3, %xmm15 #13.38[spill] vmovss %xmm15, 24+a(%rip) #13.13 vmovss 24+b(%rip), %xmm15 #13.20 vaddss %xmm13, %xmm15, %xmm0 #13.38 vmovss %xmm0, 28+a(%rip) #13.13 # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 edx xmm1 xmm2 xmm3 xmm4 
xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 ..B1.7: # Preds ..B1.7 ..B1.6 # Execution count [3.20e+09] vmovups 4+c(,%r8,4), %ymm9 #13.31 lea (,%r8,4), %r9 #13.13 vmovups 4+d(,%r8,4), %ymm0 #13.38 vfmadd213ps b(,%r8,4), %ymm0, %ymm9 #13.38 vmovups 36+d(,%r8,4), %ymm0 #13.38 vmovups %ymm9, 4+a(%r9) #13.13 vmovups 36+c(,%r8,4), %ymm9 #13.31 vfmadd213ps 32+b(,%r8,4), %ymm0, %ymm9 #13.38 vmovups 68+d(,%r8,4), %ymm0 #13.38 vmovups %ymm9, 36+a(%r9) #13.13 vmovups 68+c(,%r8,4), %ymm9 #13.31 vfmadd213ps 64+b(,%r8,4), %ymm0, %ymm9 #13.38 vmovups 100+d(,%r8,4), %ymm0 #13.38 vmovups %ymm9, 68+a(%r9) #13.13 vmovups 100+c(,%r8,4), %ymm9 #13.31 vfmadd213ps 96+b(,%r8,4), %ymm0, %ymm9 #13.38 addq $32, %r8 #12.9 vmovups %ymm9, 100+a(%r9) #13.13 cmpq $31975, %r8 #12.9 jb ..B1.7 # Prob 99% #12.9 # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 ..B1.8: # Preds ..B1.7 # Execution count [1.00e+05] movq %rsi, %r9 #12.9 movq %rax, %r8 #12.9 # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 ..B1.9: # Preds ..B1.9 ..B1.8 # Execution count [3.20e+09] vmovups 127904+c(,%r9,4), %xmm9 #13.31 vmovups 127904+d(,%r9,4), %xmm0 #13.38 vfmadd213ps b(,%r8,4), %xmm0, %xmm9 #13.38 addq $4, %r8 #12.9 vmovups %xmm9, 127904+a(,%r9,4) #13.13 addq $4, %r9 #12.9 cmpq $20, %r9 #12.9 jb ..B1.9 # Prob 99% #12.9 # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 ..B1.10: # Preds ..B1.9 # Execution count [1.07e+09] incl %edx #11.5 vmovss 127980+b(%rip), %xmm0 #13.20 vmovss (%rsp), %xmm9 #[spill] vfmadd231ss %xmm6, %xmm4, %xmm0 #13.38 cmpl $100000, %edx #11.5 jb ..B1.2 # Prob 99% #11.5 # LOE rax rcx rbx rbp rsi rdi r12 r13 r14 r15 edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 ..B1.11: # Preds ..B1.10 # 
Execution count [1.00e+00] vmovss %xmm0, 127984+a(%rip) #13.13 vaddss 32(%rsp), %xmm1, %xmm1 #13.38[spill] vaddss 24(%rsp), %xmm2, %xmm2 #13.38[spill] vmovss %xmm1, 127992+a(%rip) #13.13 vmovss %xmm2, 127988+a(%rip) #13.13 vzeroupper #17.1 addq $136, %rsp #17.1 .cfi_def_cfa_offset 8 ret #17.1