From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: 
Received: by sourceware.org (Postfix, from userid 48)
	id C1D443858C83; Fri, 22 Apr 2022 21:46:02 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org C1D443858C83
From: "john_platts at hotmail dot com" 
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/105354] New: __builtin_shuffle for alignr generates suboptimal code unless SSSE3 is enabled
Date: Fri, 22 Apr 2022 21:46:02 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: new
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: target
X-Bugzilla-Version: 11.2.0
X-Bugzilla-Keywords: missed-optimization
X-Bugzilla-Severity: normal
X-Bugzilla-Who: john_platts at hotmail dot com
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status keywords bug_severity priority component assigned_to reporter target_milestone
Message-ID: 
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
X-BeenThere: gcc-bugs@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-bugs mailing list
List-Unsubscribe: ,
List-Archive: 
List-Post: 
List-Help: 
List-Subscribe: ,
X-List-Received-Date: Fri, 22 Apr 2022 21:46:02 -0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105354

            Bug ID: 105354
           Summary: __builtin_shuffle for alignr generates suboptimal code
                    unless SSSE3 is enabled
           Product: gcc
           Version: 11.2.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: john_platts at hotmail dot com
  Target Milestone: ---

The code below generates suboptimal code when SSE2 is enabled but SSSE3 is
not enabled:

#include <cstdint>

typedef std::uint8_t Simd128U8VectT __attribute__((__vector_size__(16)));

template <int RotateAmt>
static inline Simd128U8VectT RotateRightByByteAmt(Simd128U8VectT vect) noexcept {
    constexpr int NormalizedRotateAmt = RotateAmt & 15;
    if constexpr (NormalizedRotateAmt == 0)
        return vect;
    else
        return __builtin_shuffle(vect, vect, (Simd128U8VectT){
            NormalizedRotateAmt,      NormalizedRotateAmt + 1,
            NormalizedRotateAmt + 2,  NormalizedRotateAmt + 3,
            NormalizedRotateAmt + 4,  NormalizedRotateAmt + 5,
            NormalizedRotateAmt + 6,  NormalizedRotateAmt + 7,
            NormalizedRotateAmt + 8,  NormalizedRotateAmt + 9,
            NormalizedRotateAmt + 10, NormalizedRotateAmt + 11,
            NormalizedRotateAmt + 12, NormalizedRotateAmt + 13,
            NormalizedRotateAmt + 14, NormalizedRotateAmt + 15 });
}

auto func1(Simd128U8VectT vect) noexcept {
    return RotateRightByByteAmt<5>(vect);
}

Here is the code that is generated on GCC 11 when the -O2 -mssse3 options
are specified:

func1(unsigned char __vector(16)):
        palignr xmm0, xmm0, 5
        ret
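For reference, the SSSE3 output above is the same instruction one would get
by writing the rotate by hand with the palignr intrinsic. The snippet below
is only an illustrative sketch (the helper name is made up and is not part
of the reproducer); it assumes <immintrin.h>:

// Sketch only: hand-written SSSE3 equivalent of RotateRightByByteAmt<5>.
// _mm_alignr_epi8(a, b, n) shifts the 32-byte concatenation a:b right by
// n bytes; with a == b this is a byte rotate of the vector.
#include <immintrin.h>

__m128i RotateRightBy5_SSSE3(__m128i vect) noexcept {
    return _mm_alignr_epi8(vect, vect, 5);  // palignr xmm, xmm, 5
}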
Here is the code that is generated on GCC 11 when the -O2 option is
specified but the -mssse3 option is not specified, on 64-bit x86 platforms:

func1(unsigned char __vector(16)):
        sub     rsp, 144
        movd    ecx, xmm0
        movaps  XMMWORD PTR [rsp+8], xmm0
        movzx   edx, BYTE PTR [rsp+20]
        movzx   ecx, cl
        movaps  XMMWORD PTR [rsp+24], xmm0
        movzx   eax, BYTE PTR [rsp+35]
        sal     rdx, 8
        movaps  XMMWORD PTR [rsp+40], xmm0
        or      rdx, rax
        movzx   eax, BYTE PTR [rsp+50]
        movaps  XMMWORD PTR [rsp+56], xmm0
        sal     rdx, 8
        movaps  XMMWORD PTR [rsp+72], xmm0
        or      rdx, rax
        movzx   eax, BYTE PTR [rsp+65]
        movaps  XMMWORD PTR [rsp+88], xmm0
        sal     rdx, 8
        movaps  XMMWORD PTR [rsp+104], xmm0
        or      rdx, rax
        movzx   eax, BYTE PTR [rsp+80]
        movaps  XMMWORD PTR [rsp-104], xmm0
        sal     rdx, 8
        movaps  XMMWORD PTR [rsp-88], xmm0
        movzx   edi, BYTE PTR [rsp-85]
        or      rdx, rax
        movzx   eax, BYTE PTR [rsp+95]
        movaps  XMMWORD PTR [rsp-72], xmm0
        sal     rdx, 8
        movaps  XMMWORD PTR [rsp-56], xmm0
        or      rdx, rax
        movzx   eax, BYTE PTR [rsp+110]
        movaps  XMMWORD PTR [rsp-40], xmm0
        sal     rdx, 8
        movaps  XMMWORD PTR [rsp-24], xmm0
        or      rdx, rax
        movzx   eax, BYTE PTR [rsp-100]
        movaps  XMMWORD PTR [rsp+120], xmm0
        movzx   esi, BYTE PTR [rsp+125]
        movaps  XMMWORD PTR [rsp-8], xmm0
        sal     rdx, 8
        sal     rax, 8
        or      rdx, rsi
        or      rax, rdi
        movzx   edi, BYTE PTR [rsp-70]
        sal     rax, 8
        or      rax, rdi
        movzx   edi, BYTE PTR [rsp-55]
        sal     rax, 8
        or      rax, rdi
        sal     rax, 8
        or      rax, rcx
        movzx   ecx, BYTE PTR [rsp-25]
        sal     rax, 8
        or      rax, rcx
        movzx   ecx, BYTE PTR [rsp-10]
        sal     rax, 8
        or      rax, rcx
        movzx   ecx, BYTE PTR [rsp+5]
        mov     QWORD PTR [rsp-120], rdx
        sal     rax, 8
        or      rax, rcx
        mov     QWORD PTR [rsp-112], rax
        movdqa  xmm0, XMMWORD PTR [rsp-120]
        add     rsp, 144
        ret

Here is a more optimal implementation of the above code on 64-bit x86
platforms when SSE2 is enabled but SSSE3 is not enabled:

func1(unsigned char __vector(16)):
        movdqa  xmm1, xmm0
        psrldq  xmm1, 5
        pslldq  xmm0, 11
        por     xmm0, xmm1
        ret
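For comparison, the optimal sequence above corresponds to a hand-written
SSE2 fallback using intrinsics. The snippet below is only an illustrative
sketch (the function name is made up, and <emmintrin.h> is assumed); it
should compile to essentially the psrldq/pslldq/por sequence shown above:

// Sketch only: hand-written SSE2 fallback for RotateRightByByteAmt<5>.
// The logical right shift brings bytes 5..15 down to positions 0..10,
// the left shift moves bytes 0..4 up to positions 11..15, and the OR
// combines the two halves into the rotated result.
#include <emmintrin.h>

__m128i RotateRightBy5_SSE2(__m128i vect) noexcept {
    __m128i lo = _mm_srli_si128(vect, 5);   // psrldq xmm, 5
    __m128i hi = _mm_slli_si128(vect, 11);  // pslldq xmm, 11
    return _mm_or_si128(hi, lo);            // por
}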