From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id AC84A3858C39; Wed, 8 Dec 2021 00:55:07 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org AC84A3858C39 From: "john_platts at hotmail dot com" To: gcc-bugs@gcc.gnu.org Subject: [Bug target/103611] New: GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets Date: Wed, 08 Dec 2021 00:55:07 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: target X-Bugzilla-Version: 11.2.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: john_platts at hotmail dot com X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-BeenThere: gcc-bugs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-bugs mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 08 Dec 2021 00:55:07 -0000 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103611 Bug ID: 103611 Summary: GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets Product: gcc Version: 11.2.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: john_platts at hotmail dot com Target Milestone: --- Here is some code for extracting 64-bit integers from a SSE2 vector: #include <cstdint> #include <emmintrin.h> template<int ElemIdx> std::int64_t SSE2ExtractInt64(__m128i vect) noexcept {
static_assert(ElemIdx == (ElemIdx & 1), "ElemIdx must be between 0 and 1"); __m128i vect2; if constexpr(ElemIdx == 0) { vect2 = _mm_shuffle_epi32(vect, 1); } else { vect2 = _mm_shuffle_epi32(vect, 3); vect = _mm_shuffle_epi32(vect, 2); } auto loVal = std::uint32_t(_mm_cvtsi128_si32(vect)); auto hiVal = std::uint32_t(_mm_cvtsi128_si32(vect2)); return std::int64_t(loVal) | std::int64_t(std::uint64_t(hiVal) << 32); } template std::int64_t SSE2ExtractInt64<0>(__m128i vect) noexcept; template std::int64_t SSE2ExtractInt64<1>(__m128i vect) noexcept; Here is the assembly code that is generated when the above C++ code is compiled with the -O2 -std=c++17 -march=nocona -mtune=skylake -m32 options: _Z16SSE2ExtractInt64ILi0EExDv2_x: pushl %ebx pshufd $1, %xmm0, %xmm1 xorl %ebx, %ebx movd %xmm1, %edx movd %xmm0, %eax orl %ebx, %edx orb $0, %ah popl %ebx ret _Z16SSE2ExtractInt64ILi1EExDv2_x: pushl %esi pshufd $3, %xmm0, %xmm1 xorl %esi, %esi pushl %ebx pshufd $2, %xmm0, %xmm0 movl %esi, %edx movd %xmm1, %ecx movd %xmm0, %eax popl %ebx orb $0, %ah orl %ecx, %edx popl %esi ret Here is a more optimal implementation of the above functions: _Z16SSE2ExtractInt64ILi0EExDv2_x: pshufd $1, %xmm0, %xmm1 movd %xmm1, %edx movd %xmm0, %eax ret _Z16SSE2ExtractInt64ILi1EExDv2_x: pshufd $3, %xmm0, %xmm1 pshufd $2, %xmm0, %xmm0 movd %xmm1, %edx movd %xmm0, %eax ret Here is the code that is generated when the above C++ code is compiled with clang 13.0.0 with the -O2 -std=c++17 -march=nocona -mtune=skylake -m32 options: _Z16SSE2ExtractInt64ILi0EExDv2_x: # @_Z16SSE2ExtractInt64ILi0EExDv2_x movd %xmm0, %eax pshufd $85, %xmm0, %xmm0 # xmm0 = xmm0[1,1,1,1] movd %xmm0, %edx retl _Z16SSE2ExtractInt64ILi1EExDv2_x: # @_Z16SSE2ExtractInt64ILi1EExDv2_x pshufd $238, %xmm0, %xmm1 # xmm1 = xmm0[2,3,2,3] movd %xmm1, %eax pshufd $255, %xmm0, %xmm0 # xmm0 = xmm0[3,3,3,3] movd %xmm0, %edx retl