From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id 89F88383F425; Fri, 24 Jun 2022 15:56:40 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 89F88383F425 From: "hubicka at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug middle-end/106081] New: missed vectorization Date: Fri, 24 Jun 2022 15:56:40 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: middle-end X-Bugzilla-Version: 13.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: hubicka at gcc dot gnu.org X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-BeenThere: gcc-bugs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-bugs mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 24 Jun 2022 15:56:40 -0000 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D106081 Bug ID: 106081 Summary: missed vectorization Product: gcc Version: 13.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: hubicka at gcc dot gnu.org Target Milestone: --- This testcase (derived from ImageMagick) struct pixels { short a,b,c,d; } *pixels; struct dpixels { double a,b,c,d; }; double test(double *k) { struct dpixels results=3D{}; for (int u=3D0; u<10000;u++,k--) { results.a +=3D *k*pixels[u].a; results.b +=3D *k*pixels[u].b; results.c +=3D *k*pixels[u].c; results.d +=3D 
*k*pixels[u].d; } return results.a+results.b*2+results.c*3+results.d*4; } gets vectorized by clang: test: # @test .cfi_startproc # %bb.0: movq pixels(%rip), %rax vxorpd %xmm0, %xmm0, %xmm0 xorl %ecx, %ecx .p2align 4, 0x90 .LBB0_1: # =3D>This Inner Loop Header: Depth= =3D1 vpmovsxwd (%rax), %xmm1 vbroadcastsd (%rdi,%rcx,8), %ymm2 addq $8, %rax decq %rcx vcvtdq2pd %xmm1, %ymm1 vfmadd231pd %ymm2, %ymm1, %ymm0 # ymm0 =3D (ymm1 * ymm2) + = ymm0 cmpq $-10000, %rcx # imm =3D 0xD8F0 jne .LBB0_1 # %bb.2: vpermilpd $1, %xmm0, %xmm1 # xmm1 =3D xmm0[1,0] vfmadd132sd .LCPI0_0(%rip), %xmm0, %xmm1 # xmm1 =3D (xmm1 * mem= ) + xmm0 vextractf128 $1, %ymm0, %xmm0 vfmadd231sd .LCPI0_1(%rip), %xmm0, %xmm1 # xmm1 =3D (xmm0 * mem= ) + xmm1 vpermilpd $1, %xmm0, %xmm0 # xmm0 =3D xmm0[1,0] vfmadd132sd .LCPI0_2(%rip), %xmm1, %xmm0 # xmm0 =3D (xmm0 * mem= ) + xmm1 vzeroupper retq but not by GCC. Original loop is: 0.94 : 423cb0: vmovdqu (%rsi,%rdi,8),%xmm5 // morphology.c:2984 : 2983 if ( IsNaN(*k) ) continue; 0.29 : 423cb5: vpermilpd $0x1,(%rcx),%xmm4 : 2982 for (u=3D0; u < (ssize_t) kernel->width; u++, k--) { 0.46 : 423cbb: add $0x2,%rdi 0.07 : 423cbf: add $0xfffffffffffffff0,%rcx : 2984 result.red +=3D (*k)*k_pixels[u].red; 0.03 : 423cc3: vpshufb %xmm12,%xmm5,%xmm6 6.81 : 423cc8: vcvtdq2pd %xmm6,%xmm6 13.05 : 423ccc: vfmadd231pd %xmm6,%xmm4,%xmm1 : 2985 result.green +=3D (*k)*k_pixels[u].green; 17.45 : 423cd1: vpshufb %xmm15,%xmm5,%xmm6 // morphology.c:2985 0.33 : 423cd6: vcvtdq2pd %xmm6,%xmm6 0.00 : 423cda: vfmadd231pd %xmm6,%xmm4,%xmm3 : 2986 result.blue +=3D (*k)*k_pixels[u].blue; 15.28 : 423cdf: vpshufb %xmm13,%xmm5,%xmm6 // morphology.c:2986 : 2987 result.opacity +=3D (*k)*k_pixels[u].opacity; 0.00 : 423ce4: vpshufb %xmm8,%xmm5,%xmm5 : 2986 result.blue +=3D (*k)*k_pixels[u].blue; 0.00 : 423ce9: vcvtdq2pd %xmm6,%xmm6 : 2987 result.opacity +=3D (*k)*k_pixels[u].opacity; 0.21 : 423ced: vcvtdq2pd %xmm5,%xmm5 : 2986 result.blue +=3D (*k)*k_pixels[u].blue; 0.97 : 423cf1: vfmadd231pd 
%xmm6,%xmm4,%xmm0 : 2987 result.opacity +=3D (*k)*k_pixels[u].opacity; 19.16 : 423cf6: vfmadd231pd %xmm5,%xmm4,%xmm2 // morphology.c:2987 : 2982 for (u=3D0; u < (ssize_t) kernel->width; u++, k--) { 14.51 : 423cfb: cmp %rdi,%rbp // morphology.c:2982 0.00 : 423cfe: jne 423cb0 Changing short to double makes it vectorized: .L2: vmovupd (%rax), %ymm4 vmovupd 64(%rax), %ymm2 subq $-128, %rax subq $32, %rdx vunpcklpd -96(%rax), %ymm4, %ymm1 vunpckhpd -96(%rax), %ymm4, %ymm0 vmovupd -64(%rax), %ymm4 vunpckhpd -32(%rax), %ymm2, %ymm2 vunpcklpd -32(%rax), %ymm4, %ymm4 vpermpd $27, 32(%rdx), %ymm3 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm0, %ymm0 vpermpd $216, %ymm2, %ymm2 vpermpd $216, %ymm4, %ymm4 vunpcklpd %ymm2, %ymm0, %ymm10 vunpckhpd %ymm2, %ymm0, %ymm0 vunpckhpd %ymm4, %ymm1, %ymm9 vunpcklpd %ymm4, %ymm1, %ymm1 vpermpd $216, %ymm10, %ymm10 vpermpd $216, %ymm0, %ymm0 vfmadd231pd %ymm3, %ymm10, %ymm6 vfmadd231pd %ymm3, %ymm0, %ymm8 vpermpd $216, %ymm9, %ymm9 vpermpd $216, %ymm1, %ymm1 vfmadd231pd %ymm3, %ymm1, %ymm5 vfmadd231pd %ymm3, %ymm9, %ymm7 cmpq %rax, %rcx jne .L2 However, clang's code looks shorter: LBB0_1: # =3D>This Inner Loop Header: Depth= =3D1 vbroadcastsd (%rdi,%rcx,8), %ymm1 vfmadd231pd (%rax), %ymm1, %ymm0 # ymm0 =3D (ymm1 * mem) + y= mm0 addq $32, %rax decq %rcx cmpq $-10000, %rcx # imm =3D 0xD8F0 jne .LBB0_1 We loop vectorize while clang SLP vectorizes, it seems.=