From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id BD646385483C; Wed, 17 Mar 2021 18:49:50 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org BD646385483C From: "hubicka at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug middle-end/99634] New: s2102 benchmarks of TSVC is vectorized better by icc than gcc Date: Wed, 17 Mar 2021 18:49:50 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: middle-end X-Bugzilla-Version: 11.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: hubicka at gcc dot gnu.org X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-BeenThere: gcc-bugs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-bugs mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 17 Mar 2021 18:49:50 -0000 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D99634 Bug ID: 99634 Summary: s2102 benchmarks of TSVC is vectorized better by icc than gcc Product: gcc Version: 11.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: hubicka at gcc dot gnu.org Target Milestone: --- typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 // array definitions real_t a[LEN_2D],d[LEN_2D],aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D= ],tt[LEN_2D][LEN_2D]; int main(struct args_t * 
func_args) { // diagonals // identity matrix, best results vectorize both inner and outer loops for (int nl =3D 0; nl < 100*(iterations/LEN_2D); nl++) { for (int i =3D 0; i < LEN_2D; i++) { for (int j =3D 0; j < LEN_2D; j++) { aa[j][i] =3D (real_t)0.; } aa[i][i] =3D (real_t)1.; } dummy(); } return aa[0][0]; } is vectorized by icc as: min: # parameter 1: %rdi ..B1.1: # Preds ..B1.0 # Execution count [5.00e-03] .cfi_startproc ..___tag_value_min.1: ..L2: #36.1 pushq %rbp #36.1 .cfi_def_cfa_offset 16 movq %rsp, %rbp #36.1 .cfi_def_cfa 6, 16 .cfi_offset 6, -16 andq $-32, %rsp #36.1 movl $aa, %edi #38.13 xorl %esi, %esi #38.13 movl $262144, %edx #38.13 call _intel_fast_memset #38.13 # LOE rbx r12 r13 r14 r15 ..B1.2: # Preds ..B1.1 # Execution count [1.00e+00] vmovups .L_2il0floatpacket.0(%rip), %ymm1 #41.24 xorl %edx, %edx #37.9 xorl %eax, %eax #37.9 vextractf128 $1, %ymm1, %xmm0 #41.13 # LOE rax rdx rbx r12 r13 r14 r15 xmm0 xmm1 ..B1.3: # Preds ..B1.3 ..B1.2 # Execution count [2.56e+02] vextractps $3, %xmm1, 44204+aa(%rax,%rdx,4) #41.13 lea (%rax,%rdx,4), %rcx #41.13 vmovss %xmm0, 45232+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm0, 46260+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm0, 47288+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm0, 48316+aa(%rax,%rdx,4) #41.13 vmovss %xmm1, 49344+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm1, 50372+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm1, 51400+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm1, 52428+aa(%rax,%rdx,4) #41.13 vmovss %xmm0, 53456+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm0, 54484+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm0, 55512+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm0, 56540+aa(%rax,%rdx,4) #41.13 vmovss %xmm1, 57568+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm1, 58596+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm1, 59624+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm1, 60652+aa(%rax,%rdx,4) #41.13 vmovss %xmm0, 61680+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm0, 62708+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm0, 63736+aa(%rax,%rdx,4) 
#41.13 vextractps $3, %xmm0, 64764+aa(%rax,%rdx,4) #41.13 vmovss %xmm1, 65792+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm1, 66820+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm1, 67848+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm1, 68876+aa(%rax,%rdx,4) #41.13 vmovss %xmm0, 69904+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm0, 70932+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm0, 71960+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm0, 72988+aa(%rax,%rdx,4) #41.13 vmovss %xmm1, 74016+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm1, 75044+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm1, 76072+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm1, 77100+aa(%rax,%rdx,4) #41.13 vmovss %xmm0, 78128+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm0, 79156+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm0, 80184+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm0, 81212+aa(%rax,%rdx,4) #41.13 vmovss %xmm1, 82240+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm1, 83268+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm1, 84296+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm1, 85324+aa(%rax,%rdx,4) #41.13 vmovss %xmm0, 86352+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm0, 87380+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm0, 88408+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm0, 89436+aa(%rax,%rdx,4) #41.13 vmovss %xmm1, 90464+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm1, 91492+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm1, 92520+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm1, 93548+aa(%rax,%rdx,4) #41.13 vmovss %xmm0, 94576+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm0, 95604+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm0, 96632+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm0, 97660+aa(%rax,%rdx,4) #41.13 vmovss %xmm1, 98688+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm1, 99716+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm1, 100744+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm1, 101772+aa(%rax,%rdx,4) #41.13 vmovss %xmm0, 102800+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm0, 103828+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm0, 104856+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm0, 
105884+aa(%rax,%rdx,4) #41.13 vmovss %xmm1, 106912+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm1, 107940+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm1, 108968+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm1, 109996+aa(%rax,%rdx,4) #41.13 vmovss %xmm0, 111024+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm0, 112052+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm0, 113080+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm0, 114108+aa(%rax,%rdx,4) #41.13 vmovss %xmm1, 115136+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm1, 116164+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm1, 117192+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm1, 118220+aa(%rax,%rdx,4) #41.13 vmovss %xmm0, 119248+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm0, 120276+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm0, 121304+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm0, 122332+aa(%rax,%rdx,4) #41.13 vmovss %xmm1, 123360+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm1, 124388+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm1, 125416+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm1, 126444+aa(%rax,%rdx,4) #41.13 vmovss %xmm0, 127472+aa(%rax,%rdx,4) #41.13 vextractps $1, %xmm0, 128500+aa(%rax,%rdx,4) #41.13 vextractps $2, %xmm0, 129528+aa(%rax,%rdx,4) #41.13 vextractps $3, %xmm0, 130556+aa(%rax,%rdx,4) #41.13 addq $128, %rdx #37.9 addq $131072, %rax #37.9 vmovss %xmm1, aa(%rcx) #41.13 vextractps $1, %xmm1, 1028+aa(%rcx) #41.13 vextractps $2, %xmm1, 2056+aa(%rcx) #41.13 vextractps $3, %xmm1, 3084+aa(%rcx) #41.13 vmovss %xmm0, 4112+aa(%rcx) #41.13 vextractps $1, %xmm0, 5140+aa(%rcx) #41.13 vextractps $2, %xmm0, 6168+aa(%rcx) #41.13 vextractps $3, %xmm0, 7196+aa(%rcx) #41.13 vmovss %xmm1, 8224+aa(%rcx) #41.13 vextractps $1, %xmm1, 9252+aa(%rcx) #41.13 vextractps $2, %xmm1, 10280+aa(%rcx) #41.13 vextractps $3, %xmm1, 11308+aa(%rcx) #41.13 vmovss %xmm0, 12336+aa(%rcx) #41.13 vextractps $1, %xmm0, 13364+aa(%rcx) #41.13 vextractps $2, %xmm0, 14392+aa(%rcx) #41.13 vextractps $3, %xmm0, 15420+aa(%rcx) #41.13 vmovss %xmm1, 16448+aa(%rcx) #41.13 vextractps $1, %xmm1, 
17476+aa(%rcx) #41.13 vextractps $2, %xmm1, 18504+aa(%rcx) #41.13 vextractps $3, %xmm1, 19532+aa(%rcx) #41.13 vmovss %xmm0, 20560+aa(%rcx) #41.13 vextractps $1, %xmm0, 21588+aa(%rcx) #41.13 vextractps $2, %xmm0, 22616+aa(%rcx) #41.13 vextractps $3, %xmm0, 23644+aa(%rcx) #41.13 vmovss %xmm1, 24672+aa(%rcx) #41.13 vextractps $1, %xmm1, 25700+aa(%rcx) #41.13 vextractps $2, %xmm1, 26728+aa(%rcx) #41.13 vextractps $3, %xmm1, 27756+aa(%rcx) #41.13 vmovss %xmm0, 28784+aa(%rcx) #41.13 vextractps $1, %xmm0, 29812+aa(%rcx) #41.13 vextractps $2, %xmm0, 30840+aa(%rcx) #41.13 vextractps $3, %xmm0, 31868+aa(%rcx) #41.13 vmovss %xmm1, 32896+aa(%rcx) #41.13 vextractps $1, %xmm1, 33924+aa(%rcx) #41.13 vextractps $2, %xmm1, 34952+aa(%rcx) #41.13 vextractps $3, %xmm1, 35980+aa(%rcx) #41.13 vmovss %xmm0, 37008+aa(%rcx) #41.13 vextractps $1, %xmm0, 38036+aa(%rcx) #41.13 vextractps $2, %xmm0, 39064+aa(%rcx) #41.13 vextractps $3, %xmm0, 40092+aa(%rcx) #41.13 vmovss %xmm1, 41120+aa(%rcx) #41.13 vextractps $1, %xmm1, 42148+aa(%rcx) #41.13 vextractps $2, %xmm1, 43176+aa(%rcx) #41.13 cmpq $256, %rdx #37.9 jb ..B1.3 # Prob 99% #37.9 # LOE rax rdx rbx r12 r13 r14 r15 xmm0 xmm1 ..B1.4: # Preds ..B1.3 # Execution count [1.00e+00] vzeroupper #43.1 movq %rbp, %rsp #43.1 popq %rbp #43.1 .cfi_def_cfa 7, 8 .cfi_restore 6 ret #43.1=