public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector
@ 2021-07-22 12:41 hjl.tools at gmail dot com
  2021-07-27 10:14 ` [Bug target/101579] " rguenth at gcc dot gnu.org
                   ` (7 more replies)
  0 siblings, 8 replies; 9+ messages in thread
From: hjl.tools at gmail dot com @ 2021-07-22 12:41 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579

            Bug ID: 101579
           Summary: Suboptimal codegen for __builtin_shufflevector
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hjl.tools at gmail dot com
                CC: crazylht at gmail dot com
  Target Milestone: ---
            Target: i386,x86-64

For

---
typedef unsigned int __attribute__((__vector_size__ (32))) U;
typedef unsigned char __attribute__((__vector_size__ (64))) V;

V g;

U
foo (void)
{
  V v = __builtin_shufflevector (g, g,
                                 0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0, 4, 3, 1,
2,
                                 2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0, 0, 5,
2,
                                 0, 3, 3, 3, 3, 4, 5, 0, 1, 5, 2, 1, 0, 1, 1,
2,
                                 3, 2, 0, 5, 4, 5, 1, 0, 1, 4, 4, 3, 4, 5, 2,
0)
;
  v ^= 255;
  V w = v + g;
  U u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1];
  return u;
}
---

GCC 12 -march=skylake -O2 generates

        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        vpcmpeqd        %ymm4, %ymm4, %ymm4
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-64, %rsp
        subq    $72, %rsp
        movzbl  g+2(%rip), %ecx
        movzbl  g+1(%rip), %edx
        movzbl  g(%rip), %eax
        movzbl  g+3(%rip), %edi
        movzbl  g+5(%rip), %esi
        movzbl  g+4(%rip), %r8d
        vmovd   %ecx, %xmm7
        vmovd   %edi, %xmm0
        vmovd   %edx, %xmm1
        vmovd   %eax, %xmm5
        vmovdqa %xmm7, -72(%rsp)
        vpinsrb $1, %eax, %xmm7, %xmm11
        vmovd   %r8d, %xmm2
        vmovd   %esi, %xmm7
        vpinsrb $1, %edx, %xmm5, %xmm6
        vpinsrb $1, %edi, %xmm2, %xmm14
        vpinsrb $1, %ecx, %xmm1, %xmm12
        vpinsrb $1, %ecx, %xmm0, %xmm15
        vpinsrb $1, %edx, %xmm7, %xmm8
        vpinsrb $1, %eax, %xmm0, %xmm3
        vpunpcklwd      %xmm11, %xmm6, %xmm13
        vpunpcklwd      %xmm12, %xmm14, %xmm9
        vpunpcklwd      %xmm6, %xmm8, %xmm8
        vpunpcklwd      %xmm3, %xmm15, %xmm3
        vpunpckldq      %xmm8, %xmm13, %xmm8
        vpunpckldq      %xmm9, %xmm3, %xmm3
        vpunpcklqdq     %xmm3, %xmm8, %xmm3
        vpaddb  g(%rip), %ymm4, %ymm10
        vmovdqa %xmm14, -88(%rsp)
        vmovdqa %xmm3, -104(%rsp)
        vpinsrb $1, %r8d, %xmm0, %xmm9
        vpinsrb $1, %ecx, %xmm7, %xmm4
        vpinsrb $1, %ecx, %xmm2, %xmm3
        vpinsrb $1, %edx, %xmm0, %xmm14
        vpinsrb $1, %edx, %xmm1, %xmm8
        vpinsrb $1, %eax, %xmm5, %xmm13
        vpunpcklwd      %xmm4, %xmm13, %xmm13
        vpunpcklwd      %xmm8, %xmm9, %xmm8
        vpunpcklwd      %xmm3, %xmm11, %xmm3
        vpunpcklwd      %xmm12, %xmm14, %xmm14
        vmovdqa -104(%rsp), %xmm4
        vpunpckldq      %xmm13, %xmm8, %xmm8
        vpunpckldq      %xmm14, %xmm3, %xmm3
        vpunpcklqdq     %xmm8, %xmm3, %xmm3
        vmovdqa -72(%rsp), %xmm13
        vinserti128     $0x1, %xmm3, %ymm4, %ymm3
        vpsubb  %ymm3, %ymm10, %ymm10
        vmovdqa %ymm10, -56(%rsp)
        vmovdqa %xmm10, %xmm8
        vpinsrb $1, %esi, %xmm1, %xmm3
        vpinsrb $1, %edi, %xmm5, %xmm10
        vpinsrb $1, %edi, %xmm0, %xmm0
        vpinsrb $1, %eax, %xmm7, %xmm7
        vpinsrb $1, %edx, %xmm13, %xmm13
        vpunpcklwd      %xmm13, %xmm3, %xmm3
        vpunpcklwd      %xmm0, %xmm10, %xmm0
        vpunpcklwd      %xmm7, %xmm9, %xmm9
        vpunpcklwd      %xmm12, %xmm6, %xmm6
        vpunpckldq      %xmm6, %xmm3, %xmm6
        vpunpckldq      %xmm9, %xmm0, %xmm0
        vpunpcklqdq     %xmm6, %xmm0, %xmm0
        vpinsrb $1, %eax, %xmm1, %xmm6
        vpinsrb $1, %r8d, %xmm1, %xmm1
        vpunpcklwd      -88(%rsp), %xmm1, %xmm1
        vpinsrb $1, %esi, %xmm5, %xmm5
        vpinsrb $1, %esi, %xmm2, %xmm2
        vpunpcklwd      %xmm5, %xmm15, %xmm3
        vpunpcklwd      %xmm6, %xmm2, %xmm5
        vpunpcklwd      %xmm11, %xmm2, %xmm2
        vpcmpeqd        %ymm4, %ymm4, %ymm4
        vpunpckldq      %xmm5, %xmm3, %xmm3
        vpunpckldq      %xmm2, %xmm1, %xmm1
        vpaddb  g+32(%rip), %ymm4, %ymm4
        vpunpcklqdq     %xmm1, %xmm3, %xmm1
        vinserti128     $0x1, %xmm1, %ymm0, %ymm0
        vpsubb  %ymm0, %ymm4, %ymm4
        vmovdqa %xmm8, 8(%rsp)
        vmovdqa %ymm4, -24(%rsp)
        vmovdqa -40(%rsp), %xmm4
        vpbroadcastd    12(%rsp), %ymm0
        vmovdqa %xmm4, 24(%rsp)
        vpaddd  8(%rsp), %ymm0, %ymm0
        leave
        .cfi_def_cfa 7, 8
        ret

clang 12 generates

foo:                                    # @foo
        .cfi_startproc
# %bb.0:
        vmovdqa g(%rip), %ymm0
        vpcmpeqd        %ymm1, %ymm1, %ymm1
        vpxor   %ymm1, %ymm0, %ymm1
        vpermq  $68, %ymm1, %ymm1               # ymm1 = ymm1[0,1,0,1]
        vpshufb .LCPI0_0(%rip), %ymm1, %ymm1    # ymm1 =
ymm1[0,1,2,0,5,1,0,1,3,
2,3,0,4,3,1,2,18,16,20,18,19,17,17,18,19,20,17,17,16,16,21,18]
        vpaddb  %ymm0, %ymm1, %ymm0
        vpbroadcastd    .LCPI0_1(%rip), %ymm1   # ymm1 = [1,1,1,1,1,1,1,1]
        vpermd  %ymm0, %ymm1, %ymm1
        vpaddd  %ymm0, %ymm1, %ymm0
        retq
.Lfunc_end0:
        .size   foo, .Lfunc_end0-foo
        .cfi_endproc

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
  2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
@ 2021-07-27 10:14 ` rguenth at gcc dot gnu.org
  2021-07-27 10:35 ` jakub at gcc dot gnu.org
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-07-27 10:14 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2021-07-27
     Ever confirmed|0                           |1
           Keywords|                            |missed-optimization

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
The original GIMPLE looks like

  g.0_1 = g;
  g.1_2 = g;
  v = VEC_PERM_EXPR <g.0_1, g.0_1, { 0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0, 4, 3,
1, 2, 2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0, 0, 5, 2, 0, 3, 3, 3, 3, 4, 5, 0,
1, 5, 2, 1, 0, 1, 1, 2, 3, 2, 0, 5, 4, 5, 1, 0, 1, 4, 4, 3, 4, 5, 2, 0 }>;
  v = ~v;
  g.2_3 = g;
  w = v + g.2_3;
  D.2465.a = w;
  _4 = D.2465.b;
  D.2466.a = w;
  D.2467 = BIT_FIELD_REF <D.2466.b, 32, 32>;
  _5 = {D.2467, D.2467, D.2467, D.2467, D.2467, D.2467, D.2467, D.2467};
  u = _4 + _5;
  D.2468 = u;
  return D.2468;

but then veclower ends up open-coding the VEC_PERM_EXPR because the constant
permute is not supported by the target.  The same seems to be true for the
upper/lower half selection for some reason.

Note veclower doesn't have any code trying to implement a not supported
VEC_PERM_EXPR as a combination of supported ones.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
  2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
  2021-07-27 10:14 ` [Bug target/101579] " rguenth at gcc dot gnu.org
@ 2021-07-27 10:35 ` jakub at gcc dot gnu.org
  2021-07-28  9:01 ` crazylht at gmail dot com
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: jakub at gcc dot gnu.org @ 2021-07-27 10:35 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579

Jakub Jelinek <jakub at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |jakub at gcc dot gnu.org

--- Comment #2 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
As
typedef unsigned char V __attribute__((vector_size (32)));

V
foo (V x)
{
  return __builtin_shufflevector (x, x, 0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0, 4,
3, 1, 2,
                                        2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0,
0, 5, 2);
}

V
bar (V x)
{
  return __builtin_shufflevector (x, x, 0, 3, 3, 3, 3, 4, 5, 0, 1, 5, 2, 1, 0,
1, 1, 2,
                                        3, 2, 0, 5, 4, 5, 1, 0, 1, 4, 4, 3, 4,
5, 2, 0);
}
with -O2 -mavx2 is handled, I'd say this is veclower task to determine that the
particular permutation could be cheaply implemented with two permutations of
half-sized vectors and ask the backend if it supports those.
Of course there can be other permutations that can't be implemented that way
easily and might e.g. need more half-sized permutations...

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
  2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
  2021-07-27 10:14 ` [Bug target/101579] " rguenth at gcc dot gnu.org
  2021-07-27 10:35 ` jakub at gcc dot gnu.org
@ 2021-07-28  9:01 ` crazylht at gmail dot com
  2021-07-28  9:40 ` crazylht at gmail dot com
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-07-28  9:01 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579

--- Comment #3 from Hongtao.liu <crazylht at gmail dot com> ---
Codegen could be improved by -mavx512bw, but still not good as clang12

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
  2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
                   ` (2 preceding siblings ...)
  2021-07-28  9:01 ` crazylht at gmail dot com
@ 2021-07-28  9:40 ` crazylht at gmail dot com
  2021-07-28  9:45 ` crazylht at gmail dot com
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-07-28  9:40 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579

--- Comment #4 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Jakub Jelinek from comment #2)
> As
> typedef unsigned char V __attribute__((vector_size (32)));
> 
> V
> foo (V x)
> {
>   return __builtin_shufflevector (x, x, 0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0,
> 4, 3, 1, 2,
> 					2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0, 0, 5, 2);
> }
> 
> V
> bar (V x)
> {
>   return __builtin_shufflevector (x, x, 0, 3, 3, 3, 3, 4, 5, 0, 1, 5, 2, 1,
> 0, 1, 1, 2,
> 					3, 2, 0, 5, 4, 5, 1, 0, 1, 4, 4, 3, 4, 5, 2, 0);
> }
> with -O2 -mavx2 is handled, I'd say this is veclower task to determine that
> the particular permutation could be cheaply implemented with two
> permutations of half-sized vectors and ask the backend if it supports those.
> Of course there can be other permutations that can't be implemented that way
> easily and might e.g. need more half-sized permutations...
I looks to me that middle end should be able to transform 64-byte vector
shuffle to 32-byte vector shuffle when data flow analysis shows the upper part
of the vector is never used.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
  2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
                   ` (3 preceding siblings ...)
  2021-07-28  9:40 ` crazylht at gmail dot com
@ 2021-07-28  9:45 ` crazylht at gmail dot com
  2021-07-28  9:49 ` jakub at gcc dot gnu.org
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-07-28  9:45 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579

--- Comment #5 from Hongtao.liu <crazylht at gmail dot com> ---
After manually eliminating the upper part vector shuffle, codegen is much
better https://godbolt.org/z/d3YhzzYfo

typedef unsigned int __attribute__((__vector_size__ (32))) U;
typedef unsigned char __attribute__((__vector_size__ (32))) V;

V g;

U
foo (void)
{
  V v = __builtin_shufflevector (g, g,
                                 0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0, 4, 3, 1,
2,
                                 2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0, 0, 5,
2)
;
  v ^= 255;
  V w = v + g;
  U u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1];
  return u;
}

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
  2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
                   ` (4 preceding siblings ...)
  2021-07-28  9:45 ` crazylht at gmail dot com
@ 2021-07-28  9:49 ` jakub at gcc dot gnu.org
  2021-07-28 10:03 ` crazylht at gmail dot com
  2021-07-28 10:12 ` crazylht at gmail dot com
  7 siblings, 0 replies; 9+ messages in thread
From: jakub at gcc dot gnu.org @ 2021-07-28  9:49 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579

--- Comment #6 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
(In reply to Hongtao.liu from comment #4)
> I looks to me that middle end should be able to transform 64-byte vector
> shuffle to 32-byte vector shuffle when data flow analysis shows the upper
> part of the vector is never used.

That is not needed in this case.  The permutation is such that all indices for
the first half read from one half (in this case the first) and all indices for
the second half read from one half (in this case the first again), so it is
identical to a vector containing permutation of the first half with the first
half of the indices and permutation of the first half again with the second
half of the indices.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
  2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
                   ` (5 preceding siblings ...)
  2021-07-28  9:49 ` jakub at gcc dot gnu.org
@ 2021-07-28 10:03 ` crazylht at gmail dot com
  2021-07-28 10:12 ` crazylht at gmail dot com
  7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-07-28 10:03 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579

--- Comment #7 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Jakub Jelinek from comment #6)
> (In reply to Hongtao.liu from comment #4)
> > I looks to me that middle end should be able to transform 64-byte vector
> > shuffle to 32-byte vector shuffle when data flow analysis shows the upper
> > part of the vector is never used.
> 
> That is not needed in this case.  The permutation is such that all indices
> for the first half read from one half (in this case the first) and all
> indices for the second half read from one half (in this case the first
> again), so it is
> identical to a vector containing permutation of the first half with the
> first half of the indices and permutation of the first half again with the
> second half of the indices.

  U u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1];
  return u;

I means the result only use the first half, we can just create a tmp v1 with
v1 = __builtin_shufflevector (g, g,
                                 0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0, 4, 3, 1,
2,
                                 2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0, 0, 5,
2)(In reply to Jakub Jelinek from comment #6)
> (In reply to Hongtao.liu from comment #4)
> > I looks to me that middle end should be able to transform 64-byte vector
> > shuffle to 32-byte vector shuffle when data flow analysis shows the upper
> > part of the vector is never used.
> 
> That is not needed in this case.  The permutation is such that all indices
> for the first half read from one half (in this case the first) and all
> indices for the second half read from one half (in this case the first
> again), so it is
> identical to a vector containing permutation of the first half with the
> first half of the indices and permutation of the first half again with the
> second half of the indices.

  U u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1];
  return u;

I means the result u only cared about the first half, we can drop the second
half, it's redundant.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
  2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
                   ` (6 preceding siblings ...)
  2021-07-28 10:03 ` crazylht at gmail dot com
@ 2021-07-28 10:12 ` crazylht at gmail dot com
  7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-07-28 10:12 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579

--- Comment #8 from Hongtao.liu <crazylht at gmail dot com> ---
> 
>   U u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1];
>   return u;
> 
> I means the result u only cared about the first half, we can drop the second
> half, it's redundant.

Sorry for the typo, just want to clarify

result u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1]

where "(union { V a; U b; }) w).b" only cares about the first half, so does
"((union { V a; U b; }) w).b[1]" and "u", the second half of v is never used
and can be optimized away.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-07-28 10:12 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
2021-07-27 10:14 ` [Bug target/101579] " rguenth at gcc dot gnu.org
2021-07-27 10:35 ` jakub at gcc dot gnu.org
2021-07-28  9:01 ` crazylht at gmail dot com
2021-07-28  9:40 ` crazylht at gmail dot com
2021-07-28  9:45 ` crazylht at gmail dot com
2021-07-28  9:49 ` jakub at gcc dot gnu.org
2021-07-28 10:03 ` crazylht at gmail dot com
2021-07-28 10:12 ` crazylht at gmail dot com

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).