public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector
@ 2021-07-22 12:41 hjl.tools at gmail dot com
2021-07-27 10:14 ` [Bug target/101579] " rguenth at gcc dot gnu.org
` (7 more replies)
0 siblings, 8 replies; 9+ messages in thread
From: hjl.tools at gmail dot com @ 2021-07-22 12:41 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579
Bug ID: 101579
Summary: Suboptimal codegen for __builtin_shufflevector
Product: gcc
Version: 12.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: hjl.tools at gmail dot com
CC: crazylht at gmail dot com
Target Milestone: ---
Target: i386,x86-64
For
---
typedef unsigned int __attribute__((__vector_size__ (32))) U;
typedef unsigned char __attribute__((__vector_size__ (64))) V;
V g;
U
foo (void)
{
V v = __builtin_shufflevector (g, g,
0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0, 4, 3, 1,
2,
2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0, 0, 5,
2,
0, 3, 3, 3, 3, 4, 5, 0, 1, 5, 2, 1, 0, 1, 1,
2,
3, 2, 0, 5, 4, 5, 1, 0, 1, 4, 4, 3, 4, 5, 2,
0)
;
v ^= 255;
V w = v + g;
U u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1];
return u;
}
---
GCC 12 -march=skylake -O2 generates
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
vpcmpeqd %ymm4, %ymm4, %ymm4
movq %rsp, %rbp
.cfi_def_cfa_register 6
andq $-64, %rsp
subq $72, %rsp
movzbl g+2(%rip), %ecx
movzbl g+1(%rip), %edx
movzbl g(%rip), %eax
movzbl g+3(%rip), %edi
movzbl g+5(%rip), %esi
movzbl g+4(%rip), %r8d
vmovd %ecx, %xmm7
vmovd %edi, %xmm0
vmovd %edx, %xmm1
vmovd %eax, %xmm5
vmovdqa %xmm7, -72(%rsp)
vpinsrb $1, %eax, %xmm7, %xmm11
vmovd %r8d, %xmm2
vmovd %esi, %xmm7
vpinsrb $1, %edx, %xmm5, %xmm6
vpinsrb $1, %edi, %xmm2, %xmm14
vpinsrb $1, %ecx, %xmm1, %xmm12
vpinsrb $1, %ecx, %xmm0, %xmm15
vpinsrb $1, %edx, %xmm7, %xmm8
vpinsrb $1, %eax, %xmm0, %xmm3
vpunpcklwd %xmm11, %xmm6, %xmm13
vpunpcklwd %xmm12, %xmm14, %xmm9
vpunpcklwd %xmm6, %xmm8, %xmm8
vpunpcklwd %xmm3, %xmm15, %xmm3
vpunpckldq %xmm8, %xmm13, %xmm8
vpunpckldq %xmm9, %xmm3, %xmm3
vpunpcklqdq %xmm3, %xmm8, %xmm3
vpaddb g(%rip), %ymm4, %ymm10
vmovdqa %xmm14, -88(%rsp)
vmovdqa %xmm3, -104(%rsp)
vpinsrb $1, %r8d, %xmm0, %xmm9
vpinsrb $1, %ecx, %xmm7, %xmm4
vpinsrb $1, %ecx, %xmm2, %xmm3
vpinsrb $1, %edx, %xmm0, %xmm14
vpinsrb $1, %edx, %xmm1, %xmm8
vpinsrb $1, %eax, %xmm5, %xmm13
vpunpcklwd %xmm4, %xmm13, %xmm13
vpunpcklwd %xmm8, %xmm9, %xmm8
vpunpcklwd %xmm3, %xmm11, %xmm3
vpunpcklwd %xmm12, %xmm14, %xmm14
vmovdqa -104(%rsp), %xmm4
vpunpckldq %xmm13, %xmm8, %xmm8
vpunpckldq %xmm14, %xmm3, %xmm3
vpunpcklqdq %xmm8, %xmm3, %xmm3
vmovdqa -72(%rsp), %xmm13
vinserti128 $0x1, %xmm3, %ymm4, %ymm3
vpsubb %ymm3, %ymm10, %ymm10
vmovdqa %ymm10, -56(%rsp)
vmovdqa %xmm10, %xmm8
vpinsrb $1, %esi, %xmm1, %xmm3
vpinsrb $1, %edi, %xmm5, %xmm10
vpinsrb $1, %edi, %xmm0, %xmm0
vpinsrb $1, %eax, %xmm7, %xmm7
vpinsrb $1, %edx, %xmm13, %xmm13
vpunpcklwd %xmm13, %xmm3, %xmm3
vpunpcklwd %xmm0, %xmm10, %xmm0
vpunpcklwd %xmm7, %xmm9, %xmm9
vpunpcklwd %xmm12, %xmm6, %xmm6
vpunpckldq %xmm6, %xmm3, %xmm6
vpunpckldq %xmm9, %xmm0, %xmm0
vpunpcklqdq %xmm6, %xmm0, %xmm0
vpinsrb $1, %eax, %xmm1, %xmm6
vpinsrb $1, %r8d, %xmm1, %xmm1
vpunpcklwd -88(%rsp), %xmm1, %xmm1
vpinsrb $1, %esi, %xmm5, %xmm5
vpinsrb $1, %esi, %xmm2, %xmm2
vpunpcklwd %xmm5, %xmm15, %xmm3
vpunpcklwd %xmm6, %xmm2, %xmm5
vpunpcklwd %xmm11, %xmm2, %xmm2
vpcmpeqd %ymm4, %ymm4, %ymm4
vpunpckldq %xmm5, %xmm3, %xmm3
vpunpckldq %xmm2, %xmm1, %xmm1
vpaddb g+32(%rip), %ymm4, %ymm4
vpunpcklqdq %xmm1, %xmm3, %xmm1
vinserti128 $0x1, %xmm1, %ymm0, %ymm0
vpsubb %ymm0, %ymm4, %ymm4
vmovdqa %xmm8, 8(%rsp)
vmovdqa %ymm4, -24(%rsp)
vmovdqa -40(%rsp), %xmm4
vpbroadcastd 12(%rsp), %ymm0
vmovdqa %xmm4, 24(%rsp)
vpaddd 8(%rsp), %ymm0, %ymm0
leave
.cfi_def_cfa 7, 8
ret
clang 12 generates
foo: # @foo
.cfi_startproc
# %bb.0:
vmovdqa g(%rip), %ymm0
vpcmpeqd %ymm1, %ymm1, %ymm1
vpxor %ymm1, %ymm0, %ymm1
vpermq $68, %ymm1, %ymm1 # ymm1 = ymm1[0,1,0,1]
vpshufb .LCPI0_0(%rip), %ymm1, %ymm1 # ymm1 =
ymm1[0,1,2,0,5,1,0,1,3,
2,3,0,4,3,1,2,18,16,20,18,19,17,17,18,19,20,17,17,16,16,21,18]
vpaddb %ymm0, %ymm1, %ymm0
vpbroadcastd .LCPI0_1(%rip), %ymm1 # ymm1 = [1,1,1,1,1,1,1,1]
vpermd %ymm0, %ymm1, %ymm1
vpaddd %ymm0, %ymm1, %ymm0
retq
.Lfunc_end0:
.size foo, .Lfunc_end0-foo
.cfi_endproc
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
@ 2021-07-27 10:14 ` rguenth at gcc dot gnu.org
2021-07-27 10:35 ` jakub at gcc dot gnu.org
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-07-27 10:14 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Last reconfirmed| |2021-07-27
Ever confirmed|0 |1
Keywords| |missed-optimization
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
The original GIMPLE looks like
g.0_1 = g;
g.1_2 = g;
v = VEC_PERM_EXPR <g.0_1, g.0_1, { 0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0, 4, 3,
1, 2, 2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0, 0, 5, 2, 0, 3, 3, 3, 3, 4, 5, 0,
1, 5, 2, 1, 0, 1, 1, 2, 3, 2, 0, 5, 4, 5, 1, 0, 1, 4, 4, 3, 4, 5, 2, 0 }>;
v = ~v;
g.2_3 = g;
w = v + g.2_3;
D.2465.a = w;
_4 = D.2465.b;
D.2466.a = w;
D.2467 = BIT_FIELD_REF <D.2466.b, 32, 32>;
_5 = {D.2467, D.2467, D.2467, D.2467, D.2467, D.2467, D.2467, D.2467};
u = _4 + _5;
D.2468 = u;
return D.2468;
but then veclower ends up open-coding the VEC_PERM_EXPR because the constant
permute is not supported by the target. The same seems to be true for the
upper/lower half selection for some reason.
Note veclower doesn't have any code trying to implement a not supported
VEC_PERM_EXPR as a combination of supported ones.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
2021-07-27 10:14 ` [Bug target/101579] " rguenth at gcc dot gnu.org
@ 2021-07-27 10:35 ` jakub at gcc dot gnu.org
2021-07-28 9:01 ` crazylht at gmail dot com
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: jakub at gcc dot gnu.org @ 2021-07-27 10:35 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579
Jakub Jelinek <jakub at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |jakub at gcc dot gnu.org
--- Comment #2 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
As
typedef unsigned char V __attribute__((vector_size (32)));
V
foo (V x)
{
return __builtin_shufflevector (x, x, 0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0, 4,
3, 1, 2,
2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0,
0, 5, 2);
}
V
bar (V x)
{
return __builtin_shufflevector (x, x, 0, 3, 3, 3, 3, 4, 5, 0, 1, 5, 2, 1, 0,
1, 1, 2,
3, 2, 0, 5, 4, 5, 1, 0, 1, 4, 4, 3, 4,
5, 2, 0);
}
with -O2 -mavx2 is handled, I'd say this is veclower task to determine that the
particular permutation could be cheaply implemented with two permutations of
half-sized vectors and ask the backend if it supports those.
Of course there can be other permutations that can't be implemented that way
easily and might e.g. need more half-sized permutations...
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
2021-07-27 10:14 ` [Bug target/101579] " rguenth at gcc dot gnu.org
2021-07-27 10:35 ` jakub at gcc dot gnu.org
@ 2021-07-28 9:01 ` crazylht at gmail dot com
2021-07-28 9:40 ` crazylht at gmail dot com
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-07-28 9:01 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579
--- Comment #3 from Hongtao.liu <crazylht at gmail dot com> ---
Codegen could be improved by -mavx512bw, but still not good as clang12
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
` (2 preceding siblings ...)
2021-07-28 9:01 ` crazylht at gmail dot com
@ 2021-07-28 9:40 ` crazylht at gmail dot com
2021-07-28 9:45 ` crazylht at gmail dot com
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-07-28 9:40 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579
--- Comment #4 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Jakub Jelinek from comment #2)
> As
> typedef unsigned char V __attribute__((vector_size (32)));
>
> V
> foo (V x)
> {
> return __builtin_shufflevector (x, x, 0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0,
> 4, 3, 1, 2,
> 2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0, 0, 5, 2);
> }
>
> V
> bar (V x)
> {
> return __builtin_shufflevector (x, x, 0, 3, 3, 3, 3, 4, 5, 0, 1, 5, 2, 1,
> 0, 1, 1, 2,
> 3, 2, 0, 5, 4, 5, 1, 0, 1, 4, 4, 3, 4, 5, 2, 0);
> }
> with -O2 -mavx2 is handled, I'd say this is veclower task to determine that
> the particular permutation could be cheaply implemented with two
> permutations of half-sized vectors and ask the backend if it supports those.
> Of course there can be other permutations that can't be implemented that way
> easily and might e.g. need more half-sized permutations...
I looks to me that middle end should be able to transform 64-byte vector
shuffle to 32-byte vector shuffle when data flow analysis shows the upper part
of the vector is never used.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
` (3 preceding siblings ...)
2021-07-28 9:40 ` crazylht at gmail dot com
@ 2021-07-28 9:45 ` crazylht at gmail dot com
2021-07-28 9:49 ` jakub at gcc dot gnu.org
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-07-28 9:45 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579
--- Comment #5 from Hongtao.liu <crazylht at gmail dot com> ---
After manually eliminating the upper part vector shuffle, codegen is much
better https://godbolt.org/z/d3YhzzYfo
typedef unsigned int __attribute__((__vector_size__ (32))) U;
typedef unsigned char __attribute__((__vector_size__ (32))) V;
V g;
U
foo (void)
{
V v = __builtin_shufflevector (g, g,
0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0, 4, 3, 1,
2,
2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0, 0, 5,
2)
;
v ^= 255;
V w = v + g;
U u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1];
return u;
}
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
` (4 preceding siblings ...)
2021-07-28 9:45 ` crazylht at gmail dot com
@ 2021-07-28 9:49 ` jakub at gcc dot gnu.org
2021-07-28 10:03 ` crazylht at gmail dot com
2021-07-28 10:12 ` crazylht at gmail dot com
7 siblings, 0 replies; 9+ messages in thread
From: jakub at gcc dot gnu.org @ 2021-07-28 9:49 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579
--- Comment #6 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
(In reply to Hongtao.liu from comment #4)
> I looks to me that middle end should be able to transform 64-byte vector
> shuffle to 32-byte vector shuffle when data flow analysis shows the upper
> part of the vector is never used.
That is not needed in this case. The permutation is such that all indices for
the first half read from one half (in this case the first) and all indices for
the second half read from one half (in this case the first again), so it is
identical to a vector containing permutation of the first half with the first
half of the indices and permutation of the first half again with the second
half of the indices.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
` (5 preceding siblings ...)
2021-07-28 9:49 ` jakub at gcc dot gnu.org
@ 2021-07-28 10:03 ` crazylht at gmail dot com
2021-07-28 10:12 ` crazylht at gmail dot com
7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-07-28 10:03 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579
--- Comment #7 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Jakub Jelinek from comment #6)
> (In reply to Hongtao.liu from comment #4)
> > I looks to me that middle end should be able to transform 64-byte vector
> > shuffle to 32-byte vector shuffle when data flow analysis shows the upper
> > part of the vector is never used.
>
> That is not needed in this case. The permutation is such that all indices
> for the first half read from one half (in this case the first) and all
> indices for the second half read from one half (in this case the first
> again), so it is
> identical to a vector containing permutation of the first half with the
> first half of the indices and permutation of the first half again with the
> second half of the indices.
U u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1];
return u;
I means the result only use the first half, we can just create a tmp v1 with
v1 = __builtin_shufflevector (g, g,
0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0, 4, 3, 1,
2,
2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0, 0, 5,
2)(In reply to Jakub Jelinek from comment #6)
> (In reply to Hongtao.liu from comment #4)
> > I looks to me that middle end should be able to transform 64-byte vector
> > shuffle to 32-byte vector shuffle when data flow analysis shows the upper
> > part of the vector is never used.
>
> That is not needed in this case. The permutation is such that all indices
> for the first half read from one half (in this case the first) and all
> indices for the second half read from one half (in this case the first
> again), so it is
> identical to a vector containing permutation of the first half with the
> first half of the indices and permutation of the first half again with the
> second half of the indices.
U u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1];
return u;
I means the result u only cared about the first half, we can drop the second
half, it's redundant.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug target/101579] Suboptimal codegen for __builtin_shufflevector
2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
` (6 preceding siblings ...)
2021-07-28 10:03 ` crazylht at gmail dot com
@ 2021-07-28 10:12 ` crazylht at gmail dot com
7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-07-28 10:12 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579
--- Comment #8 from Hongtao.liu <crazylht at gmail dot com> ---
>
> U u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1];
> return u;
>
> I means the result u only cared about the first half, we can drop the second
> half, it's redundant.
Sorry for the typo, just want to clarify
result u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1]
where "(union { V a; U b; }) w).b" only cares about the first half, so does
"((union { V a; U b; }) w).b[1]" and "u", the second half of v is never used
and can be optimized away.
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2021-07-28 10:12 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-22 12:41 [Bug target/101579] New: Suboptimal codegen for __builtin_shufflevector hjl.tools at gmail dot com
2021-07-27 10:14 ` [Bug target/101579] " rguenth at gcc dot gnu.org
2021-07-27 10:35 ` jakub at gcc dot gnu.org
2021-07-28 9:01 ` crazylht at gmail dot com
2021-07-28 9:40 ` crazylht at gmail dot com
2021-07-28 9:45 ` crazylht at gmail dot com
2021-07-28 9:49 ` jakub at gcc dot gnu.org
2021-07-28 10:03 ` crazylht at gmail dot com
2021-07-28 10:12 ` crazylht at gmail dot com
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).