From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id DD252386EC70; Fri, 5 Mar 2021 15:50:14 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org DD252386EC70 From: "hubicka at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug middle-end/99415] New: s115 benchmark of TSVC is vectorized by icc and not by gcc Date: Fri, 05 Mar 2021 15:50:14 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: middle-end X-Bugzilla-Version: 11.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: hubicka at gcc dot gnu.org X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-BeenThere: gcc-bugs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-bugs mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 05 Mar 2021 15:50:15 -0000 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99415 Bug ID: 99415 Summary: s115 benchmark of TSVC is vectorized by icc and not by gcc Product: gcc Version: 11.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: hubicka at gcc dot gnu.org Target Milestone: --- typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 real_t a[LEN_1D],aa[LEN_2D][LEN_2D]; void main() { for (int nl = 0; nl < 1000*(iterations/LEN_2D); nl++) { for (int j = 0; j < LEN_2D; j++) { for (int i = 
j+1; i < LEN_2D; i++) { a[i] -= aa[j][i] * a[j]; } } } } is built by icc as: main: ..B1.1: # Preds ..B1.0 # Execution count [1.17e-01] .cfi_startproc ..___tag_value_main.1: ..L2: #9.1 pushq %rbp #9.1 .cfi_def_cfa_offset 16 movq %rsp, %rbp #9.1 .cfi_def_cfa 6, 16 .cfi_offset 6, -16 andq $-128, %rsp #9.1 pushq %r14 #9.1 pushq %r15 #9.1 pushq %rbx #9.1 subq $104, %rsp #9.1 movl $3, %edi #9.1 xorl %esi, %esi #9.1 call __intel_new_feature_proc_init #9.1 .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 # LOE rbx r12 r13 r14 r15 ..B1.29: # Preds ..B1.1 # Execution count [1.17e-01] vstmxcsr (%rsp) #9.1 xorl %eax, %eax #11.5 orl $32832, (%rsp) #9.1 vldmxcsr (%rsp) #9.1 # LOE r12 r13 eax ..B1.2: # Preds ..B1.22 ..B1.29 # Execution count [4.50e+04] xorl %r11d, %r11d #12.9 xorl %edi, %edi #12.9 xorl %ebx, %ebx #12.9 xorl %r9d, %r9d #12.9 xorl %esi, %esi #12.9 # LOE rbx rsi r11 r12 r13 eax edi r9d ..B1.3: # Preds ..B1.21 ..B1.2 # Execution count [1.15e+07] incl %edi #13.28 decl %r9d #13.28 cmpl $256, %edi #13.35 jge ..B1.21 # Prob 50% #13.35 # LOE rbx rsi r11 r12 r13 eax edi r9d ..B1.4: # Preds ..B1.3 # Execution count [1.04e+07] lea 256(%r9), %r10d #13.35 cmpl $16, %r10d #13.13 jl ..B1.25 # Prob 10% #13.13 # LOE rbx rsi r11 r12 r13 eax edi r9d r10d ..B1.5: # Preds ..B1.4 # Execution count [1.04e+07] lea 4+aa(%rsi,%rbx), %r8 #14.25 andq $31, %r8 #13.13 lea (%rsi,%rbx), %r14 #14.25 movl %r8d, %edx #13.13 negl %edx #13.13 addl $32, %edx #13.13 shrl $2, %edx #13.13 testl %r8d, %r8d #13.13 cmovne %edx, %r8d #13.13 lea 16(%r8), %ecx #13.13 cmpl %ecx, %r10d #13.13 jl ..B1.25 # Prob 10% #13.13 # LOE rbx rsi r8 r11 r12 r13 r14 eax edi r9d r10d ..B1.6: # Preds ..B1.5 # Execution count [1.15e+07] movl 
%r10d, %ecx #13.13 xorl %r15d, %r15d #13.13 subl %r8d, %ecx #13.13 xorl %edx, %edx #13.13 andl $15, %ecx #13.13 negl %ecx #13.13 addl %r10d, %ecx #13.13 testl %r8d, %r8d #13.13 jbe ..B1.10 # Prob 9% #13.13 # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d ..B1.7: # Preds ..B1.6 # Execution count [1.04e+07] vmovss a(%rbx), %xmm0 #14.36 # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d xmm0 ..B1.8: # Preds ..B1.8 ..B1.7 # Execution count [3.33e+11] vmovss 4+aa(%rdx,%r14), %xmm1 #14.25 incq %r15 #13.13 vfnmadd213ss 4+a(%rdx,%rbx), %xmm0, %xmm1 #14.17 vmovss %xmm1, 4+a(%rdx,%rbx) #14.17 addq $4, %rdx #13.13 cmpq %r8, %r15 #13.13 jb ..B1.8 # Prob 99% #13.13 # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d xmm0 ..B1.10: # Preds ..B1.8 ..B1.6 # Execution count [1.04e+07] vbroadcastss a(,%r11,4), %ymm0 #14.36 lea (%r8,%r11), %r15 #13.13 movslq %ecx, %rdx #13.13 .align 16,0x90 # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d ymm0 ..B1.11: # Preds ..B1.11 ..B1.10 # Execution count [3.33e+11] vmovups 4+aa(%r14,%r8,4), %ymm1 #14.25 vmovups 36+aa(%r14,%r8,4), %ymm2 #14.25 vfnmadd213ps 4+a(,%r15,4), %ymm0, %ymm1 #14.17 vfnmadd213ps 36+a(,%r15,4), %ymm0, %ymm2 #14.17 vmovups %ymm1, 4+a(,%r15,4) #14.17 vmovups %ymm2, 36+a(,%r15,4) #14.17 addq $16, %r8 #13.13 addq $16, %r15 #13.13 cmpq %rdx, %r8 #13.13 jb ..B1.11 # Prob 99% #13.13 # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d ymm0 ..B1.12: # Preds ..B1.11 # Execution count [1.04e+07] lea 1(%rcx), %r8d #13.13 cmpl %r10d, %r8d #13.13 ja ..B1.21 # Prob 50% #13.13 # LOE rdx rbx rsi r11 r12 r13 r14 eax ecx edi r9d r10d ..B1.13: # Preds ..B1.12 # Execution count [1.04e+07] movslq %r10d, %r10 #13.13 subq %rdx, %r10 #13.13 cmpq $4, %r10 #13.13 jl ..B1.24 # Prob 10% #13.13 # LOE rdx rbx rsi r10 r11 r12 r13 r14 eax ecx edi r9d ..B1.14: # Preds ..B1.13 # Execution count [1.04e+07] movl %r10d, %r8d #13.13 lea (%r14,%rdx,4), %r14 #14.25 andl $-4, %r8d #13.13 addq %r11, %rdx 
#13.13 movslq %r8d, %r8 #13.13 xorl %r15d, %r15d #13.13 # LOE rdx rbx rsi r8 r10 r11 r12 r13 r14 r15 eax ecx edi r9d ..B1.15: # Preds ..B1.15 ..B1.14 # Execution count [3.33e+11] vbroadcastss a(%rbx), %xmm1 #14.36 vmovups 4+aa(%r14,%r15,4), %xmm0 #14.25 vfnmadd213ps 4+a(,%rdx,4), %xmm0, %xmm1 #14.17 addq $4, %r15 #13.13 vmovups %xmm1, 4+a(,%rdx,4) #14.17 addq $4, %rdx #13.13 cmpq %r8, %r15 #13.13 jb ..B1.15 # Prob 99% #13.13 # LOE rdx rbx rsi r8 r10 r11 r12 r13 r14 r15 eax ecx edi r9d ..B1.17: # Preds ..B1.15 ..B1.24 ..B1.26 # Execution count [1.15e+07] lea (,%r8,4), %r14 #13.13 cmpq %r10, %r8 #13.13 jae ..B1.21 # Prob 9% #13.13 # LOE rbx rsi r8 r10 r11 r12 r13 r14 eax ecx edi r9d ..B1.18: # Preds ..B1.17 # Execution count [1.04e+07] movslq %ecx, %rcx #14.17 lea (%rsi,%r11,4), %r15 #14.25 lea (,%rcx,4), %rdx #14.25 lea (%rdx,%r11,4), %rdx #14.17 lea (%r15,%rcx,4), %rcx #14.25 # LOE rdx rcx rbx rsi r8 r10 r11 r12 r13 r14 eax edi r9d ..B1.19: # Preds ..B1.19 ..B1.18 # Execution count [3.33e+11] vmovss a(,%r11,4), %xmm1 #14.36 incq %r8 #13.13 vmovss 4+aa(%r14,%rcx), %xmm0 #14.25 vfnmadd213ss 4+a(%r14,%rdx), %xmm0, %xmm1 #14.17 vmovss %xmm1, 4+a(%r14,%rdx) #14.17 addq $4, %r14 #13.13 cmpq %r10, %r8 #13.13 jb ..B1.19 # Prob 99% #13.13 # LOE rdx rcx rbx rsi r8 r10 r11 r12 r13 r14 eax edi r9d ..B1.21: # Preds ..B1.19 ..B1.25 ..B1.12 ..B1.17 ..B1.3 # # Execution count [1.15e+07] addq $4, %rbx #13.28 addq $1024, %rsi #13.28 incq %r11 #13.28 cmpl $256, %edi #12.9 jb ..B1.3 # Prob 99% #12.9 # LOE rbx rsi r11 r12 r13 eax edi r9d ..B1.22: # Preds ..B1.21 # Execution count [4.50e+04] .byte 15 #11.5 .byte 31 #11.5 .byte 128 #11.5 .byte 0 #11.5 .byte 0 #11.5 .byte 0 #11.5 .byte 0 #11.5 incl %eax #11.5 cmpl $390000, %eax #11.5 jb ..B1.2 # Prob 99% #11.5 # LOE r12 r13 eax ..B1.23: # Preds ..B1.22 # Execution count [1.17e-01] vzeroupper #19.1 xorl %eax, %eax #19.1 addq $104, %rsp #19.1 .cfi_restore 3 popq %rbx #19.1 .cfi_restore 15 popq %r15 #19.1 .cfi_restore 14 
popq %r14 #19.1 movq %rbp, %rsp #19.1 popq %rbp #19.1 .cfi_def_cfa 7, 8 .cfi_restore 6 ret #19.1 .cfi_def_cfa 6, 16 .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 .cfi_offset 6, -16 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 # LOE ..B1.24: # Preds ..B1.13 # Execution count [1.04e+06]: Infreq xorl %r8d, %r8d #13.13 jmp ..B1.17 # Prob 100% #13.13 # LOE rbx rsi r8 r10 r11 r12 r13 eax ecx edi r9d ..B1.25: # Preds ..B1.5 ..B1.4 # Execution count [1.15e+06]: Infreq xorl %ecx, %ecx #13.13 cmpl $1, %r10d #13.13 jb ..B1.21 # Prob 50% #13.13 # LOE rbx rsi r11 r12 r13 eax ecx edi r9d r10d ..B1.26: # Preds ..B1.25 # Execution count [5.77e+05]: Infreq movslq %r10d, %r10 #13.13 xorl %r8d, %r8d #13.13 jmp ..B1.17 # Prob 100% #13.13 which runs in 0.7s, while the gcc binary needs 5.7s.