public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/114944] New: Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2
@ 2024-05-04 14:00 john_platts at hotmail dot com
2024-05-04 14:15 ` [Bug target/114944] " john_platts at hotmail dot com
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: john_platts at hotmail dot com @ 2024-05-04 14:00 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944
Bug ID: 114944
Summary: Codegen of __builtin_shuffle for an 16-byte uint8_t
vector is suboptimal on SSE2
Product: gcc
Version: 13.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: john_platts at hotmail dot com
Target Milestone: ---
Here is a snippet of code that has suboptimal codegen on SSE2:
#include <stdint.h>
#include <emmintrin.h>
__m128i SSE2ShuffleI8(__m128i a, __m128i b) {
typedef uint8_t GccU8M128Vec __attribute__((__vector_size__(16)));
return reinterpret_cast<__m128i>(__builtin_shuffle(
reinterpret_cast<GccU8M128Vec>(a), reinterpret_cast<GccU8M128Vec>(b)));
}
Here is the code that is generated when the above code is compiled on x86_64
GCC 13.2.0 with the -O2 option:
SSE2ShuffleI8(long long __vector(2), long long __vector(2)):
push r15
movd r11d, xmm1
push r14
and r11d, 15
push r13
push r12
push rbp
push rbx
sub rsp, 160
movaps XMMWORD PTR [rsp+8], xmm1
movzx edx, BYTE PTR [rsp+16]
movaps XMMWORD PTR [rsp+24], xmm1
movzx eax, BYTE PTR [rsp+31]
movaps XMMWORD PTR [rsp+40], xmm1
mov rcx, rdx
movzx r15d, BYTE PTR [rsp+46]
and ecx, 15
and eax, 15
movaps XMMWORD PTR [rsp+120], xmm1
movzx ebx, BYTE PTR [rsp+121]
mov QWORD PTR [rsp-120], rcx
and r15d, 15
movaps XMMWORD PTR [rsp+136], xmm0
and ebx, 15
movaps XMMWORD PTR [rsp+104], xmm1
movzx ebp, BYTE PTR [rsp+106]
movaps XMMWORD PTR [rsp+88], xmm1
movzx r12d, BYTE PTR [rsp+91]
movaps XMMWORD PTR [rsp+72], xmm1
movzx r13d, BYTE PTR [rsp+76]
and ebp, 15
movaps XMMWORD PTR [rsp+56], xmm1
movzx r14d, BYTE PTR [rsp+61]
and r12d, 15
movaps XMMWORD PTR [rsp-8], xmm1
movzx edx, BYTE PTR [rsp+1]
and r13d, 15
movaps XMMWORD PTR [rsp-24], xmm1
movzx ecx, BYTE PTR [rsp-14]
and r14d, 15
movaps XMMWORD PTR [rsp-40], xmm1
movzx esi, BYTE PTR [rsp-29]
and edx, 15
movaps XMMWORD PTR [rsp-56], xmm1
movzx edi, BYTE PTR [rsp-44]
and ecx, 15
movaps XMMWORD PTR [rsp-72], xmm1
movzx r8d, BYTE PTR [rsp-59]
and esi, 15
movaps XMMWORD PTR [rsp-88], xmm1
movzx r9d, BYTE PTR [rsp-74]
and edi, 15
movaps XMMWORD PTR [rsp-104], xmm1
movzx r10d, BYTE PTR [rsp-89]
and r8d, 15
movzx eax, BYTE PTR [rsp+136+rax]
movzx r15d, BYTE PTR [rsp+136+r15]
and r9d, 15
movzx r14d, BYTE PTR [rsp+136+r14]
sal rax, 8
movzx ebp, BYTE PTR [rsp+136+rbp]
movzx r13d, BYTE PTR [rsp+136+r13]
and r10d, 15
or rax, r15
movzx r12d, BYTE PTR [rsp+136+r12]
movzx ebx, BYTE PTR [rsp+136+rbx]
sal rax, 8
movzx edi, BYTE PTR [rsp+136+rdi]
movzx r9d, BYTE PTR [rsp+136+r9]
or rax, r14
movzx esi, BYTE PTR [rsp+136+rsi]
movzx r8d, BYTE PTR [rsp+136+r8]
sal rax, 8
movzx ecx, BYTE PTR [rsp+136+rcx]
movzx edx, BYTE PTR [rsp+136+rdx]
or rax, r13
sal rax, 8
or rax, r12
sal rax, 8
or rax, rbp
sal rax, 8
or rax, rbx
movzx ebx, BYTE PTR [rsp+136+r11]
sal rax, 8
mov r11, rax
movzx eax, BYTE PTR [rsp+136+r10]
sal rax, 8
or rax, r9
sal rax, 8
or r11, rbx
or rax, r8
sal rax, 8
or rax, rdi
sal rax, 8
or rax, rsi
sal rax, 8
or rax, rcx
mov rcx, QWORD PTR [rsp-120]
mov QWORD PTR [rsp-120], r11
sal rax, 8
or rax, rdx
movzx edx, BYTE PTR [rsp+136+rcx]
sal rax, 8
or rax, rdx
mov QWORD PTR [rsp-112], rax
movdqa xmm0, XMMWORD PTR [rsp-120]
add rsp, 160
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
The above code allocates more stack space than necessary and stores xmm1 (the
index vector) multiple times.
Here is a more optimal version of SSE2ShuffleI8:
.LSSE2ShuffleI8_Element_Mask:
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
SSE2ShuffleI8:
push rbp
push r15
push r14
push r13
push r12
push rbx
movdqa XMMWORD PTR [rsp - 24], xmm0
pand xmm1, XMMWORD PTR .LSSE2ShuffleI8_Element_Mask[rip]
movdqa XMMWORD PTR [rsp - 56], xmm1
movzx eax, BYTE PTR [rsp - 56]
movzx ebx, BYTE PTR [rsp - 55]
movzx ecx, BYTE PTR [rsp - 54]
movzx edx, BYTE PTR [rsp - 53]
movzx esi, BYTE PTR [rsp - 52]
movzx edi, BYTE PTR [rsp - 51]
movzx ebp, BYTE PTR [rsp - 50]
movzx r8d, BYTE PTR [rsp - 49]
movzx r9d, BYTE PTR [rsp - 48]
movzx r10d, BYTE PTR [rsp - 47]
movzx r11d, BYTE PTR [rsp - 46]
movzx r12d, BYTE PTR [rsp - 45]
movzx r13d, BYTE PTR [rsp - 44]
movzx r14d, BYTE PTR [rsp - 43]
movzx r15d, BYTE PTR [rsp - 42]
movzx eax, BYTE PTR [rsp + rax - 24]
movzx ebx, BYTE PTR [rsp + rbx - 24]
movzx ecx, BYTE PTR [rsp + rcx - 24]
movzx edx, BYTE PTR [rsp + rdx - 24]
movzx esi, BYTE PTR [rsp + rsi - 24]
movzx edi, BYTE PTR [rsp + rdi - 24]
movzx ebp, BYTE PTR [rsp + rbp - 24]
movzx r8d, BYTE PTR [rsp + r8 - 24]
movd xmm0, eax
movzx eax, BYTE PTR [rsp - 41]
movzx r9d, BYTE PTR [rsp + r9 - 24]
movzx r10d, BYTE PTR [rsp + r10 - 24]
movzx r11d, BYTE PTR [rsp + r11 - 24]
movzx r12d, BYTE PTR [rsp + r12 - 24]
movzx r13d, BYTE PTR [rsp + r13 - 24]
movzx r14d, BYTE PTR [rsp + r14 - 24]
movzx r15d, BYTE PTR [rsp + r15 - 24]
movzx eax, BYTE PTR [rsp + rax - 24]
movd xmm1, ebx
movd xmm2, ecx
movd xmm3, edx
movd xmm4, esi
movd xmm5, edi
movd xmm6, ebp
movd xmm7, r8d
movd xmm8, r9d
movd xmm9, r10d
movd xmm10, r11d
movd xmm11, r12d
movd xmm12, r13d
movd xmm13, r14d
movd xmm14, r15d
movd xmm15, eax
punpcklbw xmm0, xmm1
punpcklbw xmm2, xmm3
punpcklbw xmm4, xmm5
punpcklbw xmm6, xmm7
punpcklbw xmm8, xmm9
punpcklbw xmm10, xmm11
punpcklbw xmm12, xmm13
punpcklbw xmm14, xmm15
punpcklwd xmm0, xmm2
punpcklwd xmm4, xmm6
punpcklwd xmm8, xmm10
punpcklwd xmm12, xmm14
punpckldq xmm0, xmm4
punpckldq xmm8, xmm12
punpcklqdq xmm0, xmm8
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
The second version of the SSE2ShuffleI8 op above requires 79 instructions and
only 80 bytes of stack, compared to the first version of the SSE2ShuffleI8 op
(which is generated by GCC 13.2.0), which allocates 160 bytes of stack, uses an
additional 120 bytes of stack in the red zone, and generates 114 instructions.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug target/114944] Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2
2024-05-04 14:00 [Bug target/114944] New: Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2 john_platts at hotmail dot com
@ 2024-05-04 14:15 ` john_platts at hotmail dot com
2024-05-06 13:35 ` john_platts at hotmail dot com
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: john_platts at hotmail dot com @ 2024-05-04 14:15 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944
John Platts <john_platts at hotmail dot com> changed:
What |Removed |Added
----------------------------------------------------------------------------
Target| |x86_64-*-*, i?86-*-*
--- Comment #1 from John Platts <john_platts at hotmail dot com> ---
Here is another snippet of code that has suboptimal codegen on SSE2 with GCC
13.2.0:
#include <stdint.h>
#include <emmintrin.h>
__m128i SSE2ShuffleI8(__m128i a, __m128i b) {
alignas(16) uint8_t a_lanes[16];
alignas(16) uint8_t b_lanes[16];
_mm_store_si128(reinterpret_cast<__m128i*>(a_lanes), a);
_mm_store_si128(reinterpret_cast<__m128i*>(b_lanes),
_mm_and_si128(b, _mm_set1_epi8(static_cast<char>(15))));
__m128i v0 = _mm_cvtsi32_si128(a_lanes[b_lanes[0]]);
__m128i v1 = _mm_cvtsi32_si128(a_lanes[b_lanes[1]]);
__m128i v2 = _mm_cvtsi32_si128(a_lanes[b_lanes[2]]);
__m128i v3 = _mm_cvtsi32_si128(a_lanes[b_lanes[3]]);
__m128i v4 = _mm_cvtsi32_si128(a_lanes[b_lanes[4]]);
__m128i v5 = _mm_cvtsi32_si128(a_lanes[b_lanes[5]]);
__m128i v6 = _mm_cvtsi32_si128(a_lanes[b_lanes[6]]);
__m128i v7 = _mm_cvtsi32_si128(a_lanes[b_lanes[7]]);
__m128i v8 = _mm_cvtsi32_si128(a_lanes[b_lanes[8]]);
__m128i v9 = _mm_cvtsi32_si128(a_lanes[b_lanes[9]]);
__m128i v10 = _mm_cvtsi32_si128(a_lanes[b_lanes[10]]);
__m128i v11 = _mm_cvtsi32_si128(a_lanes[b_lanes[11]]);
__m128i v12 = _mm_cvtsi32_si128(a_lanes[b_lanes[12]]);
__m128i v13 = _mm_cvtsi32_si128(a_lanes[b_lanes[13]]);
__m128i v14 = _mm_cvtsi32_si128(a_lanes[b_lanes[14]]);
__m128i v15 = _mm_cvtsi32_si128(a_lanes[b_lanes[15]]);
v0 = _mm_unpacklo_epi8(v0, v1);
v2 = _mm_unpacklo_epi8(v2, v3);
v4 = _mm_unpacklo_epi8(v4, v5);
v6 = _mm_unpacklo_epi8(v6, v7);
v8 = _mm_unpacklo_epi8(v8, v9);
v10 = _mm_unpacklo_epi8(v10, v11);
v12 = _mm_unpacklo_epi8(v12, v13);
v14 = _mm_unpacklo_epi8(v14, v15);
v0 = _mm_unpacklo_epi16(v0, v2);
v4 = _mm_unpacklo_epi16(v4, v6);
v8 = _mm_unpacklo_epi16(v8, v10);
v12 = _mm_unpacklo_epi16(v12, v14);
v0 = _mm_unpacklo_epi32(v0, v4);
v8 = _mm_unpacklo_epi32(v8, v12);
return _mm_unpacklo_epi64(v0, v8);
}
Here is the code that is generated when the above code is compiled on x86_64
GCC 13.2.0 with the -O2 option:
SSE2ShuffleI8(long long __vector(2), long long __vector(2)):
sub rsp, 144
pand xmm1, XMMWORD PTR .LC0[rip]
movaps XMMWORD PTR [rsp+120], xmm0
movd eax, xmm1
movzx eax, al
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp+104], xmm1
movd xmm0, eax
movzx eax, BYTE PTR [rsp+105]
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp+88], xmm1
movd xmm2, eax
movzx eax, BYTE PTR [rsp+90]
punpcklbw xmm0, xmm2
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp+72], xmm1
movd xmm8, eax
movzx eax, BYTE PTR [rsp+75]
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp+56], xmm1
movd xmm2, eax
movzx eax, BYTE PTR [rsp+60]
punpcklbw xmm8, xmm2
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp+40], xmm1
punpcklwd xmm0, xmm8
movd xmm5, eax
movzx eax, BYTE PTR [rsp+45]
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp+24], xmm1
movd xmm2, eax
movzx eax, BYTE PTR [rsp+30]
punpcklbw xmm5, xmm2
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp+8], xmm1
movd xmm7, eax
movzx eax, BYTE PTR [rsp+15]
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp-8], xmm1
movd xmm2, eax
movzx eax, BYTE PTR [rsp]
punpcklbw xmm7, xmm2
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp-24], xmm1
punpcklwd xmm5, xmm7
punpckldq xmm0, xmm5
movd xmm3, eax
movzx eax, BYTE PTR [rsp-15]
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp-40], xmm1
movd xmm4, eax
movzx eax, BYTE PTR [rsp-30]
punpcklbw xmm3, xmm4
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp-56], xmm1
movd xmm6, eax
movzx eax, BYTE PTR [rsp-45]
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp-72], xmm1
movd xmm2, eax
movzx eax, BYTE PTR [rsp-60]
punpcklbw xmm6, xmm2
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp-88], xmm1
punpcklwd xmm3, xmm6
movd xmm2, eax
movzx eax, BYTE PTR [rsp-75]
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp-104], xmm1
movd xmm4, eax
movzx eax, BYTE PTR [rsp-90]
punpcklbw xmm2, xmm4
movzx eax, BYTE PTR [rsp+120+rax]
movaps XMMWORD PTR [rsp-120], xmm1
movd xmm4, eax
movzx eax, BYTE PTR [rsp-105]
movzx eax, BYTE PTR [rsp+120+rax]
add rsp, 144
movd xmm1, eax
punpcklbw xmm4, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
punpcklwd xmm1, xmm4
punpckldq xmm2, xmm1
punpcklqdq xmm0, xmm2
ret
.LC0:
.quad 1085102592571150095
.quad 1085102592571150095
In the SSE2ShuffleI8 code generated above, GCC 13.2.0 unnecessarily stores the
result of _mm_and_si128(b, _mm_set1_epi8(static_cast<char>(15))) into 15
different memory locations when optimizations are enabled.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug target/114944] Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2
2024-05-04 14:00 [Bug target/114944] New: Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2 john_platts at hotmail dot com
2024-05-04 14:15 ` [Bug target/114944] " john_platts at hotmail dot com
@ 2024-05-06 13:35 ` john_platts at hotmail dot com
2024-05-06 15:52 ` amonakov at gcc dot gnu.org
2024-05-06 16:16 ` amonakov at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: john_platts at hotmail dot com @ 2024-05-06 13:35 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944
--- Comment #2 from John Platts <john_platts at hotmail dot com> ---
Here is more optimal codegen for SSE2ShuffleI8 on x86_64:
SSE2ShuffleI8(long long __vector(2), long long __vector(2)):
pand xmm1, XMMWORD PTR .LC0[rip]
movaps XMMWORD PTR [rsp-24], xmm0
movd eax, xmm1
movzx eax, al
movzx eax, BYTE PTR [rsp-24+rax]
movaps XMMWORD PTR [rsp-40], xmm1
movd xmm0, eax
movzx eax, BYTE PTR [rsp-39]
movzx eax, BYTE PTR [rsp-24+rax]
movd xmm2, eax
movzx eax, BYTE PTR [rsp-38]
punpcklbw xmm0, xmm2
movzx eax, BYTE PTR [rsp-24+rax]
movd xmm8, eax
movzx eax, BYTE PTR [rsp-37]
movzx eax, BYTE PTR [rsp-24+rax]
movd xmm2, eax
movzx eax, BYTE PTR [rsp-36]
punpcklbw xmm8, xmm2
movzx eax, BYTE PTR [rsp-24+rax]
punpcklwd xmm0, xmm8
movd xmm5, eax
movzx eax, BYTE PTR [rsp-35]
movzx eax, BYTE PTR [rsp-24+rax]
movd xmm2, eax
movzx eax, BYTE PTR [rsp-34]
punpcklbw xmm5, xmm2
movzx eax, BYTE PTR [rsp-24+rax]
movd xmm7, eax
movzx eax, BYTE PTR [rsp-33]
movzx eax, BYTE PTR [rsp-24+rax]
movd xmm2, eax
movzx eax, BYTE PTR [rsp-32]
punpcklbw xmm7, xmm2
movzx eax, BYTE PTR [rsp-24+rax]
punpcklwd xmm5, xmm7
punpckldq xmm0, xmm5
movd xmm3, eax
movzx eax, BYTE PTR [rsp-31]
movzx eax, BYTE PTR [rsp-24+rax]
movd xmm4, eax
movzx eax, BYTE PTR [rsp-30]
punpcklbw xmm3, xmm4
movzx eax, BYTE PTR [rsp-24+rax]
movd xmm6, eax
movzx eax, BYTE PTR [rsp-29]
movzx eax, BYTE PTR [rsp-24+rax]
movd xmm2, eax
movzx eax, BYTE PTR [rsp-28]
punpcklbw xmm6, xmm2
movzx eax, BYTE PTR [rsp-24+rax]
punpcklwd xmm3, xmm6
movd xmm2, eax
movzx eax, BYTE PTR [rsp-27]
movzx eax, BYTE PTR [rsp-24+rax]
movd xmm4, eax
movzx eax, BYTE PTR [rsp-26]
punpcklbw xmm2, xmm4
movzx eax, BYTE PTR [rsp-24+rax]
movd xmm4, eax
movzx eax, BYTE PTR [rsp-25]
movzx eax, BYTE PTR [rsp-24+rax]
movd xmm1, eax
punpcklbw xmm4, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
punpcklwd xmm1, xmm4
punpckldq xmm2, xmm1
punpcklqdq xmm0, xmm2
ret
.LC0:
.quad 1085102592571150095
.quad 1085102592571150095
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug target/114944] Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2
2024-05-04 14:00 [Bug target/114944] New: Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2 john_platts at hotmail dot com
2024-05-04 14:15 ` [Bug target/114944] " john_platts at hotmail dot com
2024-05-06 13:35 ` john_platts at hotmail dot com
@ 2024-05-06 15:52 ` amonakov at gcc dot gnu.org
2024-05-06 16:16 ` amonakov at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: amonakov at gcc dot gnu.org @ 2024-05-06 15:52 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944
Alexander Monakov <amonakov at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |amonakov at gcc dot gnu.org
--- Comment #3 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
Throughput-wise, the code in comment 2 has a significant bottleneck on port 5
on Haswell and Skylake (31 uops out of 70 go to port 5). Straightforward code
that does 16x movzx-movzx-movb for each byte should fare better, even
considering the load-store penalty for retrieving the vector from memory:
pand xmm1, XMMWORD PTR .LC0[rip]
movaps XMMWORD PTR [rsp-56], xmm0
movaps XMMWORD PTR [rsp-40], xmm1
movzx eax, BYTE PTR [rsp-40]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-24], al
movzx eax, BYTE PTR [rsp-39]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-23], al
movzx eax, BYTE PTR [rsp-38]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-22], al
movzx eax, BYTE PTR [rsp-37]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-21], al
movzx eax, BYTE PTR [rsp-36]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-20], al
movzx eax, BYTE PTR [rsp-35]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-19], al
movzx eax, BYTE PTR [rsp-34]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-18], al
movzx eax, BYTE PTR [rsp-33]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-17], al
movzx eax, BYTE PTR [rsp-32]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-16], al
movzx eax, BYTE PTR [rsp-31]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-15], al
movzx eax, BYTE PTR [rsp-30]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-14], al
movzx eax, BYTE PTR [rsp-29]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-13], al
movzx eax, BYTE PTR [rsp-28]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-12], al
movzx eax, BYTE PTR [rsp-27]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-11], al
movzx eax, BYTE PTR [rsp-26]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-10], al
movzx eax, BYTE PTR [rsp-25]
movzx eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-9], al
movdqa xmm0, XMMWORD PTR [rsp-24]
If you want to avoid the load-store forwarding stall, perhaps you can assemble
two halves of the shuffled vector in GPRs (e.g. do 'movzx ecx, byte [...]; shl
eax, 8; mov al, byte [...+rcx]'), then merge two 64-bit GPRs into one 128-bit
vector.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug target/114944] Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2
2024-05-04 14:00 [Bug target/114944] New: Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2 john_platts at hotmail dot com
` (2 preceding siblings ...)
2024-05-06 15:52 ` amonakov at gcc dot gnu.org
@ 2024-05-06 16:16 ` amonakov at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: amonakov at gcc dot gnu.org @ 2024-05-06 16:16 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944
--- Comment #4 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
Like this:
pand xmm1, XMMWORD PTR .LC0[rip]
movaps XMMWORD PTR [rsp-40], xmm0
xor eax, eax
xor edx, edx
movaps XMMWORD PTR [rsp-24], xmm1
movzx ecx, BYTE PTR [rsp-17]
mov al, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-9]
mov dl, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-18]
sal rax, 8
mov al, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-10]
sal rdx, 8
mov dl, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-19]
sal rax, 8
mov al, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-11]
sal rdx, 8
mov dl, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-20]
sal rax, 8
mov al, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-12]
sal rdx, 8
mov dl, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-21]
sal rax, 8
mov al, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-13]
sal rdx, 8
mov dl, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-22]
sal rax, 8
mov al, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-14]
sal rdx, 8
mov dl, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-23]
sal rax, 8
mov al, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-15]
sal rdx, 8
mov dl, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-24]
sal rax, 8
mov al, BYTE PTR [rsp-40+rcx]
movzx ecx, BYTE PTR [rsp-16]
sal rdx, 8
mov dl, BYTE PTR [rsp-40+rcx]
movq xmm0, rax
movq xmm2, rdx
punpcklqdq xmm0, xmm2
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2024-05-06 16:16 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-04 14:00 [Bug target/114944] New: Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2 john_platts at hotmail dot com
2024-05-04 14:15 ` [Bug target/114944] " john_platts at hotmail dot com
2024-05-06 13:35 ` john_platts at hotmail dot com
2024-05-06 15:52 ` amonakov at gcc dot gnu.org
2024-05-06 16:16 ` amonakov at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).