public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/114944] New: Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2
@ 2024-05-04 14:00 john_platts at hotmail dot com
  2024-05-04 14:15 ` [Bug target/114944] " john_platts at hotmail dot com
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: john_platts at hotmail dot com @ 2024-05-04 14:00 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944

            Bug ID: 114944
           Summary: Codegen of __builtin_shuffle for an 16-byte uint8_t
                    vector is suboptimal on SSE2
           Product: gcc
           Version: 13.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: john_platts at hotmail dot com
  Target Milestone: ---

Here is a snippet of code that has suboptimal codegen on SSE2:

#include <stdint.h>
#include <emmintrin.h>

__m128i SSE2ShuffleI8(__m128i a, __m128i b) {
  typedef uint8_t GccU8M128Vec __attribute__((__vector_size__(16)));
  return reinterpret_cast<__m128i>(__builtin_shuffle(
    reinterpret_cast<GccU8M128Vec>(a), reinterpret_cast<GccU8M128Vec>(b)));
}

Here is the code that is generated when the above code is compiled on x86_64
GCC 13.2.0 with the -O2 option:
SSE2ShuffleI8(long long __vector(2), long long __vector(2)):
        push    r15
        movd    r11d, xmm1
        push    r14
        and     r11d, 15
        push    r13
        push    r12
        push    rbp
        push    rbx
        sub     rsp, 160
        movaps  XMMWORD PTR [rsp+8], xmm1
        movzx   edx, BYTE PTR [rsp+16]
        movaps  XMMWORD PTR [rsp+24], xmm1
        movzx   eax, BYTE PTR [rsp+31]
        movaps  XMMWORD PTR [rsp+40], xmm1
        mov     rcx, rdx
        movzx   r15d, BYTE PTR [rsp+46]
        and     ecx, 15
        and     eax, 15
        movaps  XMMWORD PTR [rsp+120], xmm1
        movzx   ebx, BYTE PTR [rsp+121]
        mov     QWORD PTR [rsp-120], rcx
        and     r15d, 15
        movaps  XMMWORD PTR [rsp+136], xmm0
        and     ebx, 15
        movaps  XMMWORD PTR [rsp+104], xmm1
        movzx   ebp, BYTE PTR [rsp+106]
        movaps  XMMWORD PTR [rsp+88], xmm1
        movzx   r12d, BYTE PTR [rsp+91]
        movaps  XMMWORD PTR [rsp+72], xmm1
        movzx   r13d, BYTE PTR [rsp+76]
        and     ebp, 15
        movaps  XMMWORD PTR [rsp+56], xmm1
        movzx   r14d, BYTE PTR [rsp+61]
        and     r12d, 15
        movaps  XMMWORD PTR [rsp-8], xmm1
        movzx   edx, BYTE PTR [rsp+1]
        and     r13d, 15
        movaps  XMMWORD PTR [rsp-24], xmm1
        movzx   ecx, BYTE PTR [rsp-14]
        and     r14d, 15
        movaps  XMMWORD PTR [rsp-40], xmm1
        movzx   esi, BYTE PTR [rsp-29]
        and     edx, 15
        movaps  XMMWORD PTR [rsp-56], xmm1
        movzx   edi, BYTE PTR [rsp-44]
        and     ecx, 15
        movaps  XMMWORD PTR [rsp-72], xmm1
        movzx   r8d, BYTE PTR [rsp-59]
        and     esi, 15
        movaps  XMMWORD PTR [rsp-88], xmm1
        movzx   r9d, BYTE PTR [rsp-74]
        and     edi, 15
        movaps  XMMWORD PTR [rsp-104], xmm1
        movzx   r10d, BYTE PTR [rsp-89]
        and     r8d, 15
        movzx   eax, BYTE PTR [rsp+136+rax]
        movzx   r15d, BYTE PTR [rsp+136+r15]
        and     r9d, 15
        movzx   r14d, BYTE PTR [rsp+136+r14]
        sal     rax, 8
        movzx   ebp, BYTE PTR [rsp+136+rbp]
        movzx   r13d, BYTE PTR [rsp+136+r13]
        and     r10d, 15
        or      rax, r15
        movzx   r12d, BYTE PTR [rsp+136+r12]
        movzx   ebx, BYTE PTR [rsp+136+rbx]
        sal     rax, 8
        movzx   edi, BYTE PTR [rsp+136+rdi]
        movzx   r9d, BYTE PTR [rsp+136+r9]
        or      rax, r14
        movzx   esi, BYTE PTR [rsp+136+rsi]
        movzx   r8d, BYTE PTR [rsp+136+r8]
        sal     rax, 8
        movzx   ecx, BYTE PTR [rsp+136+rcx]
        movzx   edx, BYTE PTR [rsp+136+rdx]
        or      rax, r13
        sal     rax, 8
        or      rax, r12
        sal     rax, 8
        or      rax, rbp
        sal     rax, 8
        or      rax, rbx
        movzx   ebx, BYTE PTR [rsp+136+r11]
        sal     rax, 8
        mov     r11, rax
        movzx   eax, BYTE PTR [rsp+136+r10]
        sal     rax, 8
        or      rax, r9
        sal     rax, 8
        or      r11, rbx
        or      rax, r8
        sal     rax, 8
        or      rax, rdi
        sal     rax, 8
        or      rax, rsi
        sal     rax, 8
        or      rax, rcx
        mov     rcx, QWORD PTR [rsp-120]
        mov     QWORD PTR [rsp-120], r11
        sal     rax, 8
        or      rax, rdx
        movzx   edx, BYTE PTR [rsp+136+rcx]
        sal     rax, 8
        or      rax, rdx
        mov     QWORD PTR [rsp-112], rax
        movdqa  xmm0, XMMWORD PTR [rsp-120]
        add     rsp, 160
        pop     rbx
        pop     rbp
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret

The above code allocates more stack space than is necessary and
stores xmm1 (the index vector) to memory multiple times.

Here is a more optimal version of SSE2ShuffleI8:
.LSSE2ShuffleI8_Element_Mask:
  .byte 15
  .byte 15
  .byte 15
  .byte 15
  .byte 15
  .byte 15
  .byte 15
  .byte 15
  .byte 15
  .byte 15
  .byte 15
  .byte 15
  .byte 15
  .byte 15
  .byte 15
  .byte 15

SSE2ShuffleI8:
  push    rbp
  push    r15
  push    r14
  push    r13
  push    r12
  push    rbx
  movdqa  XMMWORD PTR [rsp - 24], xmm0
  pand    xmm1, XMMWORD PTR .LSSE2ShuffleI8_Element_Mask[rip]
  movdqa  XMMWORD PTR [rsp - 56], xmm1
  movzx   eax, BYTE PTR [rsp - 56]
  movzx   ebx, BYTE PTR [rsp - 55]
  movzx   ecx, BYTE PTR [rsp - 54]
  movzx   edx, BYTE PTR [rsp - 53]
  movzx   esi, BYTE PTR [rsp - 52]
  movzx   edi, BYTE PTR [rsp - 51]
  movzx   ebp, BYTE PTR [rsp - 50]
  movzx   r8d, BYTE PTR [rsp - 49]
  movzx   r9d, BYTE PTR [rsp - 48]
  movzx   r10d, BYTE PTR [rsp - 47]
  movzx   r11d, BYTE PTR [rsp - 46]
  movzx   r12d, BYTE PTR [rsp - 45]
  movzx   r13d, BYTE PTR [rsp - 44]
  movzx   r14d, BYTE PTR [rsp - 43]
  movzx   r15d, BYTE PTR [rsp - 42]
  movzx   eax, BYTE PTR [rsp + rax - 24]
  movzx   ebx, BYTE PTR [rsp + rbx - 24]
  movzx   ecx, BYTE PTR [rsp + rcx - 24]
  movzx   edx, BYTE PTR [rsp + rdx - 24]
  movzx   esi, BYTE PTR [rsp + rsi - 24]
  movzx   edi, BYTE PTR [rsp + rdi - 24]
  movzx   ebp, BYTE PTR [rsp + rbp - 24]
  movzx   r8d, BYTE PTR [rsp + r8 - 24]
  movd    xmm0, eax
  movzx   eax, BYTE PTR [rsp - 41]
  movzx   r9d, BYTE PTR [rsp + r9 - 24]
  movzx   r10d, BYTE PTR [rsp + r10 - 24]
  movzx   r11d, BYTE PTR [rsp + r11 - 24]
  movzx   r12d, BYTE PTR [rsp + r12 - 24]
  movzx   r13d, BYTE PTR [rsp + r13 - 24]
  movzx   r14d, BYTE PTR [rsp + r14 - 24]
  movzx   r15d, BYTE PTR [rsp + r15 - 24]
  movzx   eax, BYTE PTR [rsp + rax - 24]
  movd    xmm1, ebx
  movd    xmm2, ecx
  movd    xmm3, edx
  movd    xmm4, esi
  movd    xmm5, edi
  movd    xmm6, ebp
  movd    xmm7, r8d
  movd    xmm8, r9d
  movd    xmm9, r10d
  movd    xmm10, r11d
  movd    xmm11, r12d
  movd    xmm12, r13d
  movd    xmm13, r14d
  movd    xmm14, r15d
  movd    xmm15, eax
  punpcklbw xmm0, xmm1
  punpcklbw xmm2, xmm3
  punpcklbw xmm4, xmm5
  punpcklbw xmm6, xmm7
  punpcklbw xmm8, xmm9
  punpcklbw xmm10, xmm11
  punpcklbw xmm12, xmm13
  punpcklbw xmm14, xmm15
  punpcklwd xmm0, xmm2
  punpcklwd xmm4, xmm6
  punpcklwd xmm8, xmm10
  punpcklwd xmm12, xmm14
  punpckldq xmm0, xmm4
  punpckldq xmm8, xmm12
  punpcklqdq xmm0, xmm8
  pop     rbx
  pop     r12
  pop     r13
  pop     r14
  pop     r15
  pop     rbp
  ret

The second version of the SSE2ShuffleI8 op above requires 79 instructions and
only 80 bytes of stack. In comparison, the first version of the SSE2ShuffleI8
op (which is generated by GCC 13.2.0) generates 114 instructions, allocates
160 bytes of stack, and uses an additional 120 bytes of stack in the red
zone.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/114944] Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2
  2024-05-04 14:00 [Bug target/114944] New: Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2 john_platts at hotmail dot com
@ 2024-05-04 14:15 ` john_platts at hotmail dot com
  2024-05-06 13:35 ` john_platts at hotmail dot com
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: john_platts at hotmail dot com @ 2024-05-04 14:15 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944

John Platts <john_platts at hotmail dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Target|                            |x86_64-*-*, i?86-*-*

--- Comment #1 from John Platts <john_platts at hotmail dot com> ---
Here is another snippet of code that has suboptimal codegen on SSE2 with GCC
13.2.0:
#include <stdint.h>
#include <emmintrin.h>

__m128i SSE2ShuffleI8(__m128i a, __m128i b) {
  alignas(16) uint8_t a_lanes[16];
  alignas(16) uint8_t b_lanes[16];

  _mm_store_si128(reinterpret_cast<__m128i*>(a_lanes), a);
  _mm_store_si128(reinterpret_cast<__m128i*>(b_lanes),
                  _mm_and_si128(b, _mm_set1_epi8(static_cast<char>(15))));

  __m128i v0 = _mm_cvtsi32_si128(a_lanes[b_lanes[0]]);
  __m128i v1 = _mm_cvtsi32_si128(a_lanes[b_lanes[1]]);
  __m128i v2 = _mm_cvtsi32_si128(a_lanes[b_lanes[2]]);
  __m128i v3 = _mm_cvtsi32_si128(a_lanes[b_lanes[3]]);
  __m128i v4 = _mm_cvtsi32_si128(a_lanes[b_lanes[4]]);
  __m128i v5 = _mm_cvtsi32_si128(a_lanes[b_lanes[5]]);
  __m128i v6 = _mm_cvtsi32_si128(a_lanes[b_lanes[6]]);
  __m128i v7 = _mm_cvtsi32_si128(a_lanes[b_lanes[7]]);
  __m128i v8 = _mm_cvtsi32_si128(a_lanes[b_lanes[8]]);
  __m128i v9 = _mm_cvtsi32_si128(a_lanes[b_lanes[9]]);
  __m128i v10 = _mm_cvtsi32_si128(a_lanes[b_lanes[10]]);
  __m128i v11 = _mm_cvtsi32_si128(a_lanes[b_lanes[11]]);
  __m128i v12 = _mm_cvtsi32_si128(a_lanes[b_lanes[12]]);
  __m128i v13 = _mm_cvtsi32_si128(a_lanes[b_lanes[13]]);
  __m128i v14 = _mm_cvtsi32_si128(a_lanes[b_lanes[14]]);
  __m128i v15 = _mm_cvtsi32_si128(a_lanes[b_lanes[15]]);

  v0 = _mm_unpacklo_epi8(v0, v1);
  v2 = _mm_unpacklo_epi8(v2, v3);
  v4 = _mm_unpacklo_epi8(v4, v5);
  v6 = _mm_unpacklo_epi8(v6, v7);
  v8 = _mm_unpacklo_epi8(v8, v9);
  v10 = _mm_unpacklo_epi8(v10, v11);
  v12 = _mm_unpacklo_epi8(v12, v13);
  v14 = _mm_unpacklo_epi8(v14, v15);

  v0 = _mm_unpacklo_epi16(v0, v2);
  v4 = _mm_unpacklo_epi16(v4, v6);
  v8 = _mm_unpacklo_epi16(v8, v10);
  v12 = _mm_unpacklo_epi16(v12, v14);

  v0 = _mm_unpacklo_epi32(v0, v4);
  v8 = _mm_unpacklo_epi32(v8, v12);

  return _mm_unpacklo_epi64(v0, v8);
}

Here is the code that is generated when the above code is compiled on x86_64
GCC 13.2.0 with the -O2 option:
SSE2ShuffleI8(long long __vector(2), long long __vector(2)):
        sub     rsp, 144
        pand    xmm1, XMMWORD PTR .LC0[rip]
        movaps  XMMWORD PTR [rsp+120], xmm0
        movd    eax, xmm1
        movzx   eax, al
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp+104], xmm1
        movd    xmm0, eax
        movzx   eax, BYTE PTR [rsp+105]
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp+88], xmm1
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp+90]
        punpcklbw       xmm0, xmm2
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp+72], xmm1
        movd    xmm8, eax
        movzx   eax, BYTE PTR [rsp+75]
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp+56], xmm1
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp+60]
        punpcklbw       xmm8, xmm2
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp+40], xmm1
        punpcklwd       xmm0, xmm8
        movd    xmm5, eax
        movzx   eax, BYTE PTR [rsp+45]
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp+24], xmm1
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp+30]
        punpcklbw       xmm5, xmm2
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp+8], xmm1
        movd    xmm7, eax
        movzx   eax, BYTE PTR [rsp+15]
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-8], xmm1
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp]
        punpcklbw       xmm7, xmm2
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-24], xmm1
        punpcklwd       xmm5, xmm7
        punpckldq       xmm0, xmm5
        movd    xmm3, eax
        movzx   eax, BYTE PTR [rsp-15]
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-40], xmm1
        movd    xmm4, eax
        movzx   eax, BYTE PTR [rsp-30]
        punpcklbw       xmm3, xmm4
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-56], xmm1
        movd    xmm6, eax
        movzx   eax, BYTE PTR [rsp-45]
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-72], xmm1
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp-60]
        punpcklbw       xmm6, xmm2
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-88], xmm1
        punpcklwd       xmm3, xmm6
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp-75]
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-104], xmm1
        movd    xmm4, eax
        movzx   eax, BYTE PTR [rsp-90]
        punpcklbw       xmm2, xmm4
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-120], xmm1
        movd    xmm4, eax
        movzx   eax, BYTE PTR [rsp-105]
        movzx   eax, BYTE PTR [rsp+120+rax]
        add     rsp, 144
        movd    xmm1, eax
        punpcklbw       xmm4, xmm1
        movdqa  xmm1, xmm2
        movdqa  xmm2, xmm3
        punpcklwd       xmm1, xmm4
        punpckldq       xmm2, xmm1
        punpcklqdq      xmm0, xmm2
        ret
.LC0:
        .quad   1085102592571150095
        .quad   1085102592571150095

In the SSE2ShuffleI8 code generated above GCC 13.2.0 unnecessarily stores the
result of _mm_and_si128(b, _mm_set1_epi8(static_cast<char>(15))) into 15
different memory locations when optimizations are enabled.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/114944] Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2
  2024-05-04 14:00 [Bug target/114944] New: Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2 john_platts at hotmail dot com
  2024-05-04 14:15 ` [Bug target/114944] " john_platts at hotmail dot com
@ 2024-05-06 13:35 ` john_platts at hotmail dot com
  2024-05-06 15:52 ` amonakov at gcc dot gnu.org
  2024-05-06 16:16 ` amonakov at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: john_platts at hotmail dot com @ 2024-05-06 13:35 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944

--- Comment #2 from John Platts <john_platts at hotmail dot com> ---
Here is more optimal codegen for SSE2ShuffleI8 on x86_64:
SSE2ShuffleI8(long long __vector(2), long long __vector(2)):
        pand    xmm1, XMMWORD PTR .LC0[rip]
        movaps  XMMWORD PTR [rsp-24], xmm0
        movd    eax, xmm1
        movzx   eax, al
        movzx   eax, BYTE PTR [rsp-24+rax]
        movaps  XMMWORD PTR [rsp-40], xmm1
        movd    xmm0, eax
        movzx   eax, BYTE PTR [rsp-39]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp-38]
        punpcklbw       xmm0, xmm2
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm8, eax
        movzx   eax, BYTE PTR [rsp-37]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp-36]
        punpcklbw       xmm8, xmm2
        movzx   eax, BYTE PTR [rsp-24+rax]
        punpcklwd       xmm0, xmm8
        movd    xmm5, eax
        movzx   eax, BYTE PTR [rsp-35]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp-34]
        punpcklbw       xmm5, xmm2
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm7, eax
        movzx   eax, BYTE PTR [rsp-33]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp-32]
        punpcklbw       xmm7, xmm2
        movzx   eax, BYTE PTR [rsp-24+rax]
        punpcklwd       xmm5, xmm7
        punpckldq       xmm0, xmm5
        movd    xmm3, eax
        movzx   eax, BYTE PTR [rsp-31]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm4, eax
        movzx   eax, BYTE PTR [rsp-30]
        punpcklbw       xmm3, xmm4
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm6, eax
        movzx   eax, BYTE PTR [rsp-29]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp-28]
        punpcklbw       xmm6, xmm2
        movzx   eax, BYTE PTR [rsp-24+rax]
        punpcklwd       xmm3, xmm6
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp-27]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm4, eax
        movzx   eax, BYTE PTR [rsp-26]
        punpcklbw       xmm2, xmm4
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm4, eax
        movzx   eax, BYTE PTR [rsp-25]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm1, eax
        punpcklbw       xmm4, xmm1
        movdqa  xmm1, xmm2
        movdqa  xmm2, xmm3
        punpcklwd       xmm1, xmm4
        punpckldq       xmm2, xmm1
        punpcklqdq      xmm0, xmm2
        ret
.LC0:
        .quad   1085102592571150095
        .quad   1085102592571150095

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/114944] Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2
  2024-05-04 14:00 [Bug target/114944] New: Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2 john_platts at hotmail dot com
  2024-05-04 14:15 ` [Bug target/114944] " john_platts at hotmail dot com
  2024-05-06 13:35 ` john_platts at hotmail dot com
@ 2024-05-06 15:52 ` amonakov at gcc dot gnu.org
  2024-05-06 16:16 ` amonakov at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: amonakov at gcc dot gnu.org @ 2024-05-06 15:52 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944

Alexander Monakov <amonakov at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |amonakov at gcc dot gnu.org

--- Comment #3 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
Throughput-wise, the code in comment 2 has a significant bottleneck on port 5
on Haswell and Skylake (31 uops out of 70 go to port 5). Straightforward code
that does 16x movzx-movzx-movb for each byte should fare better, even
considering the load-store penalty for retrieving the vector from memory:

        pand    xmm1, XMMWORD PTR .LC0[rip]
        movaps  XMMWORD PTR [rsp-56], xmm0
        movaps  XMMWORD PTR [rsp-40], xmm1
        movzx   eax, BYTE PTR [rsp-40]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-24], al
        movzx   eax, BYTE PTR [rsp-39]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-23], al
        movzx   eax, BYTE PTR [rsp-38]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-22], al
        movzx   eax, BYTE PTR [rsp-37]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-21], al
        movzx   eax, BYTE PTR [rsp-36]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-20], al
        movzx   eax, BYTE PTR [rsp-35]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-19], al
        movzx   eax, BYTE PTR [rsp-34]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-18], al
        movzx   eax, BYTE PTR [rsp-33]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-17], al
        movzx   eax, BYTE PTR [rsp-32]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-16], al
        movzx   eax, BYTE PTR [rsp-31]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-15], al
        movzx   eax, BYTE PTR [rsp-30]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-14], al
        movzx   eax, BYTE PTR [rsp-29]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-13], al
        movzx   eax, BYTE PTR [rsp-28]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-12], al
        movzx   eax, BYTE PTR [rsp-27]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-11], al
        movzx   eax, BYTE PTR [rsp-26]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-10], al
        movzx   eax, BYTE PTR [rsp-25]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-9], al
        movdqa  xmm0, XMMWORD PTR [rsp-24]

If you want to avoid the load-store forwarding stall, perhaps you can assemble
two halves of the shuffled vector on GPRs (e.g. do 'movzx ecx, byte[...]; shl
eax, 8; mov al, byte [...+rcx]), then merge two 64-bit GPRs into one 128-bit
vector.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/114944] Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2
  2024-05-04 14:00 [Bug target/114944] New: Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2 john_platts at hotmail dot com
                   ` (2 preceding siblings ...)
  2024-05-06 15:52 ` amonakov at gcc dot gnu.org
@ 2024-05-06 16:16 ` amonakov at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: amonakov at gcc dot gnu.org @ 2024-05-06 16:16 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944

--- Comment #4 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
Like this:

        pand    xmm1, XMMWORD PTR .LC0[rip]
        movaps  XMMWORD PTR [rsp-40], xmm0
        xor     eax, eax
        xor     edx, edx
        movaps  XMMWORD PTR [rsp-24], xmm1
        movzx   ecx, BYTE PTR [rsp-17]
        mov     al, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-9]
        mov     dl, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-18]
        sal     rax, 8
        mov     al, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-10]
        sal     rdx, 8
        mov     dl, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-19]
        sal     rax, 8
        mov     al, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-11]
        sal     rdx, 8
        mov     dl, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-20]
        sal     rax, 8
        mov     al, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-12]
        sal     rdx, 8
        mov     dl, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-21]
        sal     rax, 8
        mov     al, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-13]
        sal     rdx, 8
        mov     dl, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-22]
        sal     rax, 8
        mov     al, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-14]
        sal     rdx, 8
        mov     dl, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-23]
        sal     rax, 8
        mov     al, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-15]
        sal     rdx, 8
        mov     dl, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-24]
        sal     rax, 8
        mov     al, BYTE PTR [rsp-40+rcx]
        movzx   ecx, BYTE PTR [rsp-16]
        sal     rdx, 8
        mov     dl, BYTE PTR [rsp-40+rcx]
        movq    xmm0, rax
        movq    xmm2, rdx
        punpcklqdq      xmm0, xmm2

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-05-06 16:16 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-04 14:00 [Bug target/114944] New: Codegen of __builtin_shuffle for an 16-byte uint8_t vector is suboptimal on SSE2 john_platts at hotmail dot com
2024-05-04 14:15 ` [Bug target/114944] " john_platts at hotmail dot com
2024-05-06 13:35 ` john_platts at hotmail dot com
2024-05-06 15:52 ` amonakov at gcc dot gnu.org
2024-05-06 16:16 ` amonakov at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).