public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/103611] New: GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets
@ 2021-12-08 0:55 john_platts at hotmail dot com
2021-12-08 1:04 ` [Bug target/103611] " john_platts at hotmail dot com
` (6 more replies)
0 siblings, 7 replies; 8+ messages in thread
From: john_platts at hotmail dot com @ 2021-12-08 0:55 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103611
Bug ID: 103611
Summary: GCC generates suboptimal code for SSE2/SSE4.1 64-bit
integer element extraction on 32-bit x86 targets
Product: gcc
Version: 11.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: john_platts at hotmail dot com
Target Milestone: ---
Here is some code for extracting 64-bit integers from an SSE2 vector:
#include <cstdint>
#include <immintrin.h>
// Reproducer: returns 64-bit element ElemIdx (0 or 1) of an SSE2 vector as a
// std::int64_t, built from two 32-bit halves that are each moved out of lane 0
// of a vector register. The code is intentionally left exactly as reported so
// that it keeps matching the assembly listings quoted below it.
template<int ElemIdx>
std::int64_t SSE2ExtractInt64(__m128i vect) noexcept {
// ElemIdx == (ElemIdx & 1) is true only for 0 and 1, so any other index is
// rejected at compile time.
static_assert(ElemIdx == (ElemIdx & 1), "ElemIdx must be between 0 and 1");
__m128i vect2;
if constexpr(ElemIdx == 0) {
// Bring 32-bit element 1 (the high half of 64-bit element 0) into lane 0 of
// vect2; the low half is already in lane 0 of vect.
vect2 = _mm_shuffle_epi32(vect, 1);
} else {
// Bring 32-bit element 3 (high half) into lane 0 of vect2 and 32-bit
// element 2 (low half) into lane 0 of vect.
vect2 = _mm_shuffle_epi32(vect, 3);
vect = _mm_shuffle_epi32(vect, 2);
}
// _mm_cvtsi128_si32 reads lane 0 as an int; converting to std::uint32_t first
// means the later widening to 64 bits zero-extends instead of sign-extending.
auto loVal = std::uint32_t(_mm_cvtsi128_si32(vect));
auto hiVal = std::uint32_t(_mm_cvtsi128_si32(vect2));
// Recombine the halves as (hi << 32) | lo.
return std::int64_t(loVal) | std::int64_t(std::uint64_t(hiVal) << 32);
}
// Explicit instantiations for both valid indices, so that code is emitted for
// each specialization (these are the two symbols shown in the listings).
template std::int64_t SSE2ExtractInt64<0>(__m128i vect) noexcept;
template std::int64_t SSE2ExtractInt64<1>(__m128i vect) noexcept;
Here is the assembly code that is generated when the above C++ code is compiled
with the -O2 -std=c++17 -march=nocona -mtune=skylake -m32 options:
_Z16SSE2ExtractInt64ILi0EExDv2_x:
pushl %ebx
pshufd $1, %xmm0, %xmm1
xorl %ebx, %ebx
movd %xmm1, %edx
movd %xmm0, %eax
orl %ebx, %edx
orb $0, %ah
popl %ebx
ret
_Z16SSE2ExtractInt64ILi1EExDv2_x:
pushl %esi
pshufd $3, %xmm0, %xmm1
xorl %esi, %esi
pushl %ebx
pshufd $2, %xmm0, %xmm0
movl %esi, %edx
movd %xmm1, %ecx
movd %xmm0, %eax
popl %ebx
orb $0, %ah
orl %ecx, %edx
popl %esi
ret
Here is a more optimal implementation of the above functions:
_Z16SSE2ExtractInt64ILi0EExDv2_x:
pshufd $1, %xmm0, %xmm1
movd %xmm1, %edx
movd %xmm0, %eax
ret
_Z16SSE2ExtractInt64ILi1EExDv2_x:
pshufd $3, %xmm0, %xmm1
pshufd $2, %xmm0, %xmm0
movd %xmm1, %edx
movd %xmm0, %eax
ret
Here is the code that is generated when the above C++ code is compiled with
clang 13.0.0 with the -O2 -std=c++17 -march=nocona -mtune=skylake -m32 options:
_Z16SSE2ExtractInt64ILi0EExDv2_x: # @_Z16SSE2ExtractInt64ILi0EExDv2_x
movd %xmm0, %eax
pshufd $85, %xmm0, %xmm0 # xmm0 = xmm0[1,1,1,1]
movd %xmm0, %edx
retl
_Z16SSE2ExtractInt64ILi1EExDv2_x: # @_Z16SSE2ExtractInt64ILi1EExDv2_x
pshufd $238, %xmm0, %xmm1 # xmm1 = xmm0[2,3,2,3]
movd %xmm1, %eax
pshufd $255, %xmm0, %xmm0 # xmm0 = xmm0[3,3,3,3]
movd %xmm0, %edx
retl
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/103611] GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets
2021-12-08 0:55 [Bug target/103611] New: GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets john_platts at hotmail dot com
@ 2021-12-08 1:04 ` john_platts at hotmail dot com
2021-12-08 1:10 ` john_platts at hotmail dot com
` (5 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: john_platts at hotmail dot com @ 2021-12-08 1:04 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103611
--- Comment #1 from John Platts <john_platts at hotmail dot com> ---
Here is some C++ code for extracting 64-bit integers from a __m128i vector
using SSE4.1:
#include <cstdint>
#include <immintrin.h>
// Reproducer: returns 64-bit element ElemIdx (0 or 1) of a __m128i as a
// std::int64_t by extracting its two 32-bit halves with _mm_extract_epi32
// (SSE4.1) and recombining them. The code is intentionally left exactly as
// reported so that it keeps matching the assembly listings quoted below it.
template<int ElemIdx>
std::int64_t SSE41ExtractInt64(__m128i vect) noexcept {
// ElemIdx == (ElemIdx & 1) is true only for 0 and 1, so any other index is
// rejected at compile time.
static_assert(ElemIdx == (ElemIdx & 1), "ElemIdx must be between 0 and 1");
std::uint32_t loVal;
std::uint32_t hiVal;
if constexpr(ElemIdx == 0) {
// 64-bit element 0 is made of 32-bit elements 0 (low) and 1 (high).
loVal = std::uint32_t(_mm_extract_epi32(vect, 0));
hiVal = std::uint32_t(_mm_extract_epi32(vect, 1));
} else {
// 64-bit element 1 is made of 32-bit elements 2 (low) and 3 (high).
loVal = std::uint32_t(_mm_extract_epi32(vect, 2));
hiVal = std::uint32_t(_mm_extract_epi32(vect, 3));
}
// Both halves are unsigned, so widening zero-extends; recombine as
// (hi << 32) | lo.
return std::int64_t(loVal) | std::int64_t(std::uint64_t(hiVal) << 32);
}
// Explicit instantiations for both valid indices, so that code is emitted for
// each specialization (these are the two symbols shown in the listings).
template std::int64_t SSE41ExtractInt64<0>(__m128i vect) noexcept;
template std::int64_t SSE41ExtractInt64<1>(__m128i vect) noexcept;
Here is the assembly code that is generated when the above C++ code is compiled
with the -O2 -std=c++17 -march=core2 -msse4.1 -mtune=skylake -m32 options:
_Z17SSE41ExtractInt64ILi0EExDv2_x:
subl $28, %esp
pmovzxdq %xmm0, %xmm1
movq %xmm1, 8(%esp)
pextrd $1, %xmm0, %eax
movl %eax, %edx
movl 8(%esp), %eax
orl 12(%esp), %edx
orb $0, %ah
addl $28, %esp
ret
_Z17SSE41ExtractInt64ILi1EExDv2_x:
pushl %ebx
pextrd $2, %xmm0, %ecx
psrldq $12, %xmm0
xorl %ebx, %ebx
movd %xmm0, %edx
movl %ecx, %eax
orl %ebx, %edx
orb $0, %ah
popl %ebx
ret
Here is more optimal code for the above functions:
_Z17SSE41ExtractInt64ILi0EExDv2_x:
movd %xmm0, %eax
pextrd $1, %xmm0, %edx
ret
_Z17SSE41ExtractInt64ILi1EExDv2_x:
pextrd $2, %xmm0, %eax
pextrd $3, %xmm0, %edx
ret
Here is the code that is generated when the above C++ code is compiled with
clang 13.0.0 with the -O2 -std=c++17 -march=core2 -msse4.1 -mtune=skylake -m32
options:
_Z17SSE41ExtractInt64ILi0EExDv2_x: # @_Z17SSE41ExtractInt64ILi0EExDv2_x
movd %xmm0, %eax
pextrd $1, %xmm0, %edx
retl
_Z17SSE41ExtractInt64ILi1EExDv2_x: # @_Z17SSE41ExtractInt64ILi1EExDv2_x
extractps $2, %xmm0, %eax
extractps $3, %xmm0, %edx
retl
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/103611] GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets
2021-12-08 0:55 [Bug target/103611] New: GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets john_platts at hotmail dot com
2021-12-08 1:04 ` [Bug target/103611] " john_platts at hotmail dot com
@ 2021-12-08 1:10 ` john_platts at hotmail dot com
2021-12-08 1:28 ` pinskia at gcc dot gnu.org
` (4 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: john_platts at hotmail dot com @ 2021-12-08 1:10 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103611
--- Comment #2 from John Platts <john_platts at hotmail dot com> ---
Here is some code for extracting 64-bit integers from an SSE2 vector using GCC
vector extensions:
#include <cstdint>
#include <immintrin.h>
// 16-byte GCC vector type whose elements are std::int64_t (two elements).
using Int64M128Vect [[__gnu__::__vector_size__(16)]] = std::int64_t;
// Reproducer: returns 64-bit element ElemIdx (0 or 1) of a __m128i by
// converting it to Int64M128Vect and subscripting the vector directly, leaving
// the choice of instructions entirely to the compiler. The code is
// intentionally left exactly as reported so that it keeps matching the
// assembly listings quoted below it.
template<int ElemIdx>
std::int64_t SSE2ExtractInt64(__m128i vect) noexcept {
// ElemIdx == (ElemIdx & 1) is true only for 0 and 1, so any other index is
// rejected at compile time.
static_assert(ElemIdx == (ElemIdx & 1), "ElemIdx must be between 0 and 1");
return Int64M128Vect(vect)[ElemIdx];
}
// Explicit instantiations for both valid indices, so that code is emitted for
// each specialization (these are the two symbols shown in the listings).
template std::int64_t SSE2ExtractInt64<0>(__m128i vect) noexcept;
template std::int64_t SSE2ExtractInt64<1>(__m128i vect) noexcept;
Here is the output of the above C++ code when compiled with the -O2 -std=c++17
-march=nocona -mtune=skylake -m32 options:
_Z16SSE2ExtractInt64ILi0EExDv2_x:
subl $28, %esp
movq %xmm0, 8(%esp)
movl 8(%esp), %eax
movl 12(%esp), %edx
addl $28, %esp
ret
_Z16SSE2ExtractInt64ILi1EExDv2_x:
subl $28, %esp
movhps %xmm0, 8(%esp)
movl 8(%esp), %eax
movl 12(%esp), %edx
addl $28, %esp
ret
Here is the output of the above C++ code when compiled with the -O2 -std=c++17
-march=nocona -mtune=skylake -m64 options:
_Z16SSE2ExtractInt64ILi0EElDv2_x:
movq %xmm0, %rax
ret
_Z16SSE2ExtractInt64ILi1EElDv2_x:
movhlps %xmm0, %xmm1
movq %xmm1, %rax
ret
Here is the output of the above C++ code when compiled with the -O2 -std=c++17
-march=core2 -msse4.1 -mtune=skylake -m32 options:
_Z16SSE2ExtractInt64ILi0EExDv2_x:
movd %xmm0, %eax
pextrd $1, %xmm0, %edx
ret
_Z16SSE2ExtractInt64ILi1EExDv2_x:
subl $28, %esp
movhps %xmm0, 8(%esp)
movl 8(%esp), %eax
movl 12(%esp), %edx
addl $28, %esp
ret
Here is the output of the above C++ code when compiled with the -O2 -std=c++17
-march=core2 -msse4.1 -mtune=skylake -m64 options:
_Z16SSE2ExtractInt64ILi0EElDv2_x:
movq %xmm0, %rax
ret
_Z16SSE2ExtractInt64ILi1EElDv2_x:
pextrq $1, %xmm0, %rax
ret
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/103611] GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets
2021-12-08 0:55 [Bug target/103611] New: GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets john_platts at hotmail dot com
2021-12-08 1:04 ` [Bug target/103611] " john_platts at hotmail dot com
2021-12-08 1:10 ` john_platts at hotmail dot com
@ 2021-12-08 1:28 ` pinskia at gcc dot gnu.org
2021-12-08 1:36 ` john_platts at hotmail dot com
` (3 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-12-08 1:28 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103611
--- Comment #3 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
Hmm, GCC 4.8.1-5.5.0 produces:
long long SSE2ExtractInt64<0>(long long __vector):
.LFB499:
.cfi_startproc
pshufd xmm1, xmm0, 1
movd eax, xmm0
movd edx, xmm1
ret
long long SSE2ExtractInt64<1>(long long __vector):
.LFB500:
.cfi_startproc
pshufd xmm1, xmm0, 3
pshufd xmm0, xmm0, 2
movd edx, xmm1
movd eax, xmm0
ret
For the code in comment #0.
And always used memory for code in comment #2.
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/103611] GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets
2021-12-08 0:55 [Bug target/103611] New: GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets john_platts at hotmail dot com
` (2 preceding siblings ...)
2021-12-08 1:28 ` pinskia at gcc dot gnu.org
@ 2021-12-08 1:36 ` john_platts at hotmail dot com
2021-12-13 18:52 ` cvs-commit at gcc dot gnu.org
` (2 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: john_platts at hotmail dot com @ 2021-12-08 1:36 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103611
--- Comment #4 from John Platts <john_platts at hotmail dot com> ---
(In reply to Andrew Pinski from comment #3)
> Hmm, GCC 4.8.1-5.5.0 produces:
> long long SSE2ExtractInt64<0>(long long __vector):
> .LFB499:
> .cfi_startproc
> pshufd xmm1, xmm0, 1
> movd eax, xmm0
> movd edx, xmm1
> ret
> long long SSE2ExtractInt64<1>(long long __vector):
> .LFB500:
> .cfi_startproc
> pshufd xmm1, xmm0, 3
> pshufd xmm0, xmm0, 2
> movd edx, xmm1
> movd eax, xmm0
> ret
>
> For the code in comment #0.
> And always used memory for code in comment #2.
I have noticed that the suboptimal code generation for the code in comment #0
and comment #1 does not occur with GCC 5.5 or earlier, but does occur with
GCC 6.1 or later.
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/103611] GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets
2021-12-08 0:55 [Bug target/103611] New: GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets john_platts at hotmail dot com
` (3 preceding siblings ...)
2021-12-08 1:36 ` john_platts at hotmail dot com
@ 2021-12-13 18:52 ` cvs-commit at gcc dot gnu.org
2021-12-18 13:53 ` cvs-commit at gcc dot gnu.org
2021-12-21 21:30 ` roger at nextmovesoftware dot com
6 siblings, 0 replies; 8+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2021-12-13 18:52 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103611
--- Comment #5 from CVS Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Roger Sayle <sayle@gcc.gnu.org>:
https://gcc.gnu.org/g:149739c39475f3691e67aa0aee4f205f4e83392f
commit r12-5943-g149739c39475f3691e67aa0aee4f205f4e83392f
Author: Roger Sayle <roger@nextmovesoftware.com>
Date: Mon Dec 13 18:48:22 2021 +0000
x86: Avoid generating orb $0, %ah
I'll post my proposed fix for PR target/103611 shortly, but this patch
fixes another missed optimization opportunity revealed by that PR.
Occasionally, reload materializes integer constants during register
allocation, resulting in unnecessary instructions such as:
(insn 23 31 24 2 (parallel [
(set (reg:SI 0 ax [99])
(ior:SI (reg:SI 0 ax [99])
(const_int 0 [0])))
(clobber (reg:CC 17 flags))
]) "pr103611.c":18:73 550 {*iorsi_1}
(nil))
These then get "optimized" during the split2 pass, which realizes that
no bits outside of 0xff00 are set, so this operation can be implemented
by operating on just the highpart of a QIreg_operand, i.e. %ah, %bh, %ch
etc., which leads to the useless "orb $0, %ah" seen in the reported PR.
This fix catches the case of const0_rtx in the relevant splitter, either
eliminating the instruction or turning it into a simple move.
2021-12-13 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386.md (define_split any_or:SWI248 -> orb %?h):
Optimize the case where the integer constant operand is zero.
gcc/testsuite/ChangeLog
* gcc.target/i386/pr103611-1.c: New test case.
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/103611] GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets
2021-12-08 0:55 [Bug target/103611] New: GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets john_platts at hotmail dot com
` (4 preceding siblings ...)
2021-12-13 18:52 ` cvs-commit at gcc dot gnu.org
@ 2021-12-18 13:53 ` cvs-commit at gcc dot gnu.org
2021-12-21 21:30 ` roger at nextmovesoftware dot com
6 siblings, 0 replies; 8+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2021-12-18 13:53 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103611
--- Comment #6 from CVS Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Roger Sayle <sayle@gcc.gnu.org>:
https://gcc.gnu.org/g:31048012db98f5ec9c2ba537bfd850374bdd771f
commit r12-6058-g31048012db98f5ec9c2ba537bfd850374bdd771f
Author: Roger Sayle <roger@nextmovesoftware.com>
Date: Sat Dec 18 13:51:56 2021 +0000
x86: PR target/103611: Splitter for DST:DI = (HI:SI<<32)|LO:SI.
A common idiom is to create a DImode value from the "concat" of two SImode
values, using "(long long)hi << 32 | (long long)lo", where the operation
may be ior, xor or plus. On x86, with -m32, the high and low parts of
a DImode register are actually different SImode registers (typically %edx
and %eax) so ideally this idiom should reduce to two move instructions
(or optimally, just clever register allocation).
Unfortunately, GCC currently performs the IOR operation above on -m32,
and worse allocates DImode registers (split to SImode register pairs)
for both the zero extended HI and LO values.
Hence, for test1 from the new test case below:
/* 16-byte GCC vector of int (four 32-bit elements). */
typedef int __v4si __attribute__ ((__vector_size__ (16)));
/* Builds a 64-bit value from the two lowest 32-bit elements of v:
   element 0 becomes the low half and element 1 the high half.  This is the
   (hi << 32) | zext(lo) idiom that the commit message describes splitting
   into two moves on 32-bit targets.  */
long long test1(__v4si v) {
/* Convert to unsigned first so that widening to long long zero-extends.  */
unsigned int loVal = (unsigned int)v[0];
unsigned int hiVal = (unsigned int)v[1];
return (long long)(loVal) | ((long long)(hiVal) << 32);
}
we currently generate (with -m32 -O2 -msse4.1):
test1: subl $28, %esp
pextrd $1, %xmm0, %eax
pmovzxdq %xmm0, %xmm1
movq %xmm1, 8(%esp)
movl %eax, %edx
movl 8(%esp), %eax
orl 12(%esp), %edx
addl $28, %esp
orb $0, %ah
ret
with this patch we now generate:
test1: pextrd $1, %xmm0, %edx
movd %xmm0, %eax
ret
The fix is to recognize and split the idiom (hi<<32)|zext(lo) prior
to register allocation on !TARGET_64BIT, simplifying this sequence to
"highpart(dst) = hi; lowpart(dst) = lo".
The one minor complication is that sse.md's define_insn for
*vec_extractv4si_0_zext_sse4 can sometimes interfere with this
optimization. It turns out that on !TARGET_64BIT, the zero_extend:DI
following vec_select:SI isn't free, and this insn gets split back
into multiple instructions during later passes, but too late to
be optimized away by this patch/reload. Hence the last hunk of
this patch is to restrict *vec_extractv4si_0_zext_sse4 to TARGET_64BIT.
Checking PR target/80286, where *vec_extractv4si_0_zext_sse4 was
first added, this seems reasonable.
2021-12-18 Roger Sayle <roger@nextmovesoftware.com>
Uroš Bizjak <ubizjak@gmail.com>
gcc/ChangeLog
PR target/103611
* config/i386/i386.md (any_or_plus): New code iterator.
(define_split): Split (HI<<32)|zext(LO) into piece-wise
move instructions on !TARGET_64BIT.
* config/i386/sse.md (*vec_extractv4si_0_zext_sse4):
Restrict to TARGET_64BIT.
gcc/testsuite/ChangeLog
PR target/103611
* gcc.target/i386/pr103611-2.c: New test case.
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/103611] GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets
2021-12-08 0:55 [Bug target/103611] New: GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets john_platts at hotmail dot com
` (5 preceding siblings ...)
2021-12-18 13:53 ` cvs-commit at gcc dot gnu.org
@ 2021-12-21 21:30 ` roger at nextmovesoftware dot com
6 siblings, 0 replies; 8+ messages in thread
From: roger at nextmovesoftware dot com @ 2021-12-21 21:30 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103611
Roger Sayle <roger at nextmovesoftware dot com> changed:
What |Removed |Added
----------------------------------------------------------------------------
Target Milestone|--- |12.0
Status|UNCONFIRMED |RESOLVED
Resolution|--- |FIXED
CC| |roger at nextmovesoftware dot com
--- Comment #7 from Roger Sayle <roger at nextmovesoftware dot com> ---
This should now be fixed on mainline.
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2021-12-21 21:30 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-08 0:55 [Bug target/103611] New: GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets john_platts at hotmail dot com
2021-12-08 1:04 ` [Bug target/103611] " john_platts at hotmail dot com
2021-12-08 1:10 ` john_platts at hotmail dot com
2021-12-08 1:28 ` pinskia at gcc dot gnu.org
2021-12-08 1:36 ` john_platts at hotmail dot com
2021-12-13 18:52 ` cvs-commit at gcc dot gnu.org
2021-12-18 13:53 ` cvs-commit at gcc dot gnu.org
2021-12-21 21:30 ` roger at nextmovesoftware dot com
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).