public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/98399] New: x86: Awful code generation for shifting vectors
@ 2020-12-20 14:13 gabravier at gmail dot com
2020-12-21 1:57 ` [Bug target/98399] " crazylht at gmail dot com
` (5 more replies)
0 siblings, 6 replies; 7+ messages in thread
From: gabravier at gmail dot com @ 2020-12-20 14:13 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98399
Bug ID: 98399
Summary: x86: Awful code generation for shifting vectors
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: gabravier at gmail dot com
Target Milestone: ---
typedef char U __attribute__((vector_size(16)));
U f(U u)
{
return u >> (u & 1);
}
When compiled with -O3, LLVM generates the following for this code:
.LCPI0_0:
.zero 16,1
f(char __vector(16)): # @f(char __vector(16))
movdqa xmm3, xmmword ptr [rip + .LCPI0_0] # xmm3 =
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
pand xmm3, xmm0
punpckhbw xmm1, xmm0 # xmm1 =
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
psllw xmm3, 5
punpckhbw xmm4, xmm3 # xmm4 =
xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
pxor xmm2, xmm2
pxor xmm5, xmm5
pcmpgtw xmm5, xmm4
movdqa xmm6, xmm5
pandn xmm6, xmm1
psraw xmm1, 4
pand xmm1, xmm5
por xmm1, xmm6
paddw xmm4, xmm4
pxor xmm5, xmm5
pcmpgtw xmm5, xmm4
movdqa xmm6, xmm5
pandn xmm6, xmm1
psraw xmm1, 2
pand xmm1, xmm5
por xmm1, xmm6
paddw xmm4, xmm4
pxor xmm5, xmm5
pcmpgtw xmm5, xmm4
movdqa xmm4, xmm5
pandn xmm4, xmm1
psraw xmm1, 1
pand xmm1, xmm5
por xmm1, xmm4
psrlw xmm1, 8
punpcklbw xmm0, xmm0 # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
punpcklbw xmm3, xmm3 # xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
pxor xmm4, xmm4
pcmpgtw xmm4, xmm3
movdqa xmm5, xmm4
pandn xmm5, xmm0
psraw xmm0, 4
pand xmm0, xmm4
por xmm0, xmm5
paddw xmm3, xmm3
pxor xmm4, xmm4
pcmpgtw xmm4, xmm3
movdqa xmm5, xmm4
pandn xmm5, xmm0
psraw xmm0, 2
pand xmm0, xmm4
por xmm0, xmm5
paddw xmm3, xmm3
pcmpgtw xmm2, xmm3
movdqa xmm3, xmm2
pandn xmm3, xmm0
psraw xmm0, 1
pand xmm0, xmm2
por xmm0, xmm3
psrlw xmm0, 8
packuswb xmm0, xmm1
ret
This is rather long; however, GCC generates this:
f(char __vector(16)):
push r15
movd edx, xmm0
push r14
push r13
push r12
push rbp
push rbx
sub rsp, 400
movdqa xmm1, XMMWORD PTR .LC0[rip]
movaps XMMWORD PTR [rsp+376], xmm0
movzx ebx, BYTE PTR [rsp+377]
pand xmm1, xmm0
movaps XMMWORD PTR [rsp+344], xmm0
movzx ebp, BYTE PTR [rsp+346]
movd ecx, xmm1
movaps XMMWORD PTR [rsp+360], xmm1
sar dl, cl
movzx ecx, BYTE PTR [rsp+361]
movaps XMMWORD PTR [rsp+328], xmm1
movaps XMMWORD PTR [rsp+312], xmm0
movzx edx, dl
movzx r12d, BYTE PTR [rsp+315]
sar bl, cl
movzx ecx, BYTE PTR [rsp+330]
movaps XMMWORD PTR [rsp+296], xmm1
movaps XMMWORD PTR [rsp+280], xmm0
movzx ebx, bl
movzx r13d, BYTE PTR [rsp+284]
sar bpl, cl
movzx ecx, BYTE PTR [rsp+299]
movaps XMMWORD PTR [rsp+264], xmm1
movaps XMMWORD PTR [rsp+248], xmm0
movzx ebp, bpl
movzx r14d, BYTE PTR [rsp+253]
sar r12b, cl
movzx ecx, BYTE PTR [rsp+268]
movaps XMMWORD PTR [rsp+232], xmm1
movaps XMMWORD PTR [rsp+216], xmm0
movzx r12d, r12b
movzx r15d, BYTE PTR [rsp+222]
sar r13b, cl
movzx ecx, BYTE PTR [rsp+237]
movaps XMMWORD PTR [rsp+200], xmm1
movzx r13d, r13b
sar r14b, cl
movzx ecx, BYTE PTR [rsp+206]
movaps XMMWORD PTR [rsp+184], xmm0
movzx eax, BYTE PTR [rsp+191]
movaps XMMWORD PTR [rsp+168], xmm1
movzx r14d, r14b
sar r15b, cl
movzx ecx, BYTE PTR [rsp+175]
movaps XMMWORD PTR [rsp+120], xmm0
movzx edi, BYTE PTR [rsp+129]
movaps XMMWORD PTR [rsp+152], xmm0
movzx esi, BYTE PTR [rsp+160]
movzx r15d, r15b
sar al, cl
movaps XMMWORD PTR [rsp+136], xmm1
movzx ecx, BYTE PTR [rsp+144]
movaps XMMWORD PTR [rsp+104], xmm1
sar sil, cl
movzx ecx, BYTE PTR [rsp+113]
movaps XMMWORD PTR [rsp+88], xmm0
mov BYTE PTR [rsp-89], sil
sar dil, cl
movaps XMMWORD PTR [rsp+72], xmm1
movzx ecx, BYTE PTR [rsp+82]
movzx esi, dil
movzx edi, BYTE PTR [rsp+98]
movaps XMMWORD PTR [rsp+56], xmm0
movzx r8d, BYTE PTR [rsp+67]
movaps XMMWORD PTR [rsp+40], xmm1
sar dil, cl
movzx ecx, BYTE PTR [rsp+51]
movaps XMMWORD PTR [rsp+24], xmm0
movzx r9d, BYTE PTR [rsp+36]
movaps XMMWORD PTR [rsp+8], xmm1
movzx edi, dil
sar r8b, cl
movzx ecx, BYTE PTR [rsp+20]
movaps XMMWORD PTR [rsp-8], xmm0
movzx r10d, BYTE PTR [rsp+5]
movaps XMMWORD PTR [rsp-24], xmm1
movzx r8d, r8b
sar r9b, cl
movzx ecx, BYTE PTR [rsp-11]
mov BYTE PTR [rsp-120], al
movaps XMMWORD PTR [rsp-40], xmm0
movzx r9d, r9b
sar r10b, cl
movaps XMMWORD PTR [rsp-56], xmm1
movzx ecx, BYTE PTR [rsp-42]
movzx r11d, BYTE PTR [rsp-26]
movaps XMMWORD PTR [rsp-72], xmm0
movzx eax, BYTE PTR [rsp-57]
movzx r10d, r10b
sar r11b, cl
movaps XMMWORD PTR [rsp-88], xmm1
movzx ecx, BYTE PTR [rsp-73]
movzx r11d, r11b
sar al, cl
movzx ecx, al
movzx eax, BYTE PTR [rsp-120]
sal rcx, 8
sal rax, 8
or rcx, r11
or rax, r15
sal rax, 8
or rax, r14
sal rax, 8
or rax, r13
sal rax, 8
or rax, r12
sal rax, 8
or rax, rbp
sal rax, 8
or rax, rbx
movzx ebx, BYTE PTR [rsp-89]
sal rax, 8
sal rcx, 8
or rcx, r10
or rax, rdx
sal rcx, 8
mov QWORD PTR [rsp-120], rax
or rcx, r9
sal rcx, 8
or rcx, r8
sal rcx, 8
or rcx, rdi
sal rcx, 8
or rcx, rsi
sal rcx, 8
or rcx, rbx
mov QWORD PTR [rsp-112], rcx
movdqa xmm0, XMMWORD PTR [rsp-120]
add rsp, 400
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
.LC0:
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
Using flags such as `-mavx2` seems to marginally improve the situation, but on
LLVM it results in far better code generation:
.LCPI0_0:
.zero 16,1
f(char __vector(16)): # @f(char __vector(16))
vpand xmm1, xmm0, xmmword ptr [rip + .LCPI0_0]
vpsllw xmm1, xmm1, 5
vpunpckhbw xmm2, xmm1, xmm1 # xmm2 =
xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
vpunpckhbw xmm3, xmm0, xmm0 # xmm3 =
xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
vpsraw xmm4, xmm3, 4
vpblendvb xmm3, xmm3, xmm4, xmm2
vpsraw xmm4, xmm3, 2
vpaddw xmm2, xmm2, xmm2
vpblendvb xmm3, xmm3, xmm4, xmm2
vpsraw xmm4, xmm3, 1
vpaddw xmm2, xmm2, xmm2
vpblendvb xmm2, xmm3, xmm4, xmm2
vpsrlw xmm2, xmm2, 8
vpunpcklbw xmm1, xmm1, xmm1 # xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
vpunpcklbw xmm0, xmm0, xmm0 # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
vpsraw xmm3, xmm0, 4
vpblendvb xmm0, xmm0, xmm3, xmm1
vpsraw xmm3, xmm0, 2
vpaddw xmm1, xmm1, xmm1
vpblendvb xmm0, xmm0, xmm3, xmm1
vpsraw xmm3, xmm0, 1
vpaddw xmm1, xmm1, xmm1
vpblendvb xmm0, xmm0, xmm3, xmm1
vpsrlw xmm0, xmm0, 8
vpackuswb xmm0, xmm0, xmm2
ret
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug target/98399] x86: Awful code generation for shifting vectors
2020-12-20 14:13 [Bug target/98399] New: x86: Awful code generation for shifting vectors gabravier at gmail dot com
@ 2020-12-21 1:57 ` crazylht at gmail dot com
2020-12-21 11:45 ` crazylht at gmail dot com
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: crazylht at gmail dot com @ 2020-12-21 1:57 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98399
--- Comment #1 from Hongtao.liu <crazylht at gmail dot com> ---
I'll take a look.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug target/98399] x86: Awful code generation for shifting vectors
2020-12-20 14:13 [Bug target/98399] New: x86: Awful code generation for shifting vectors gabravier at gmail dot com
2020-12-21 1:57 ` [Bug target/98399] " crazylht at gmail dot com
@ 2020-12-21 11:45 ` crazylht at gmail dot com
2020-12-21 11:59 ` crazylht at gmail dot com
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: crazylht at gmail dot com @ 2020-12-21 11:45 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98399
--- Comment #2 from Hongtao.liu <crazylht at gmail dot com> ---
It's the problem of veclower
test1.c.180t.veclower21
;; Function f (f, funcdef_no=0, decl_uid=3990, cgraph_uid=1, symbol_order=0)
U f (U u)
{
vector(16) char _1;
U _3;
char _5;
char _6;
unsigned int _7;
char _8;
char _9;
char _10;
unsigned int _11;
char _12;
char _13;
char _14;
unsigned int _15;
char _16;
char _17;
char _18;
unsigned int _19;
char _20;
char _21;
char _22;
unsigned int _23;
char _24;
char _25;
char _26;
unsigned int _27;
char _28;
char _29;
char _30;
unsigned int _31;
char _32;
char _33;
char _34;
unsigned int _35;
char _36;
char _37;
char _38;
unsigned int _39;
char _40;
char _41;
char _42;
unsigned int _43;
char _44;
char _45;
char _46;
unsigned int _47;
char _48;
char _49;
char _50;
unsigned int _51;
char _52;
char _53;
char _54;
unsigned int _55;
char _56;
char _57;
char _58;
unsigned int _59;
char _60;
char _61;
char _62;
unsigned int _63;
char _64;
char _65;
char _66;
unsigned int _67;
char _68;
<bb 2> [local count: 1073741824]:
_1 = u_2(D) & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
_5 = BIT_FIELD_REF <u_2(D), 8, 0>;
_6 = BIT_FIELD_REF <_1, 8, 0>;
_7 = (unsigned int) _6;
_8 = _5 >> _7;
_9 = BIT_FIELD_REF <u_2(D), 8, 8>;
_10 = BIT_FIELD_REF <_1, 8, 8>;
_11 = (unsigned int) _10;
_12 = _9 >> _11;
_13 = BIT_FIELD_REF <u_2(D), 8, 16>;
_14 = BIT_FIELD_REF <_1, 8, 16>;
_15 = (unsigned int) _14;
_16 = _13 >> _15;
_17 = BIT_FIELD_REF <u_2(D), 8, 24>;
_18 = BIT_FIELD_REF <_1, 8, 24>;
_19 = (unsigned int) _18;
_20 = _17 >> _19;
_21 = BIT_FIELD_REF <u_2(D), 8, 32>;
_22 = BIT_FIELD_REF <_1, 8, 32>;
_23 = (unsigned int) _22;
_24 = _21 >> _23;
_25 = BIT_FIELD_REF <u_2(D), 8, 40>;
_26 = BIT_FIELD_REF <_1, 8, 40>;
_27 = (unsigned int) _26;
_28 = _25 >> _27;
_29 = BIT_FIELD_REF <u_2(D), 8, 48>;
_30 = BIT_FIELD_REF <_1, 8, 48>;
_31 = (unsigned int) _30;
_32 = _29 >> _31;
_33 = BIT_FIELD_REF <u_2(D), 8, 56>;
_34 = BIT_FIELD_REF <_1, 8, 56>;
_35 = (unsigned int) _34;
_36 = _33 >> _35;
_37 = BIT_FIELD_REF <u_2(D), 8, 64>;
_38 = BIT_FIELD_REF <_1, 8, 64>;
_39 = (unsigned int) _38;
_40 = _37 >> _39;
_41 = BIT_FIELD_REF <u_2(D), 8, 72>;
_42 = BIT_FIELD_REF <_1, 8, 72>;
_43 = (unsigned int) _42;
_44 = _41 >> _43;
_45 = BIT_FIELD_REF <u_2(D), 8, 80>;
_46 = BIT_FIELD_REF <_1, 8, 80>;
_47 = (unsigned int) _46;
_48 = _45 >> _47;
_49 = BIT_FIELD_REF <u_2(D), 8, 88>;
_50 = BIT_FIELD_REF <_1, 8, 88>;
_51 = (unsigned int) _50;
_52 = _49 >> _51;
_53 = BIT_FIELD_REF <u_2(D), 8, 96>;
_54 = BIT_FIELD_REF <_1, 8, 96>;
_55 = (unsigned int) _54;
_56 = _53 >> _55;
_57 = BIT_FIELD_REF <u_2(D), 8, 104>;
_58 = BIT_FIELD_REF <_1, 8, 104>;
_59 = (unsigned int) _58;
_60 = _57 >> _59;
_61 = BIT_FIELD_REF <u_2(D), 8, 112>;
_62 = BIT_FIELD_REF <_1, 8, 112>;
_63 = (unsigned int) _62;
_64 = _61 >> _63;
_65 = BIT_FIELD_REF <u_2(D), 8, 120>;
_66 = BIT_FIELD_REF <_1, 8, 120>;
_67 = (unsigned int) _66;
_68 = _65 >> _67;
_3 = {_8, _12, _16, _20, _24, _28, _32, _36, _40, _44, _48, _52, _56, _60,
_64, _68};
return _3;
}
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug target/98399] x86: Awful code generation for shifting vectors
2020-12-20 14:13 [Bug target/98399] New: x86: Awful code generation for shifting vectors gabravier at gmail dot com
2020-12-21 1:57 ` [Bug target/98399] " crazylht at gmail dot com
2020-12-21 11:45 ` crazylht at gmail dot com
@ 2020-12-21 11:59 ` crazylht at gmail dot com
2021-01-05 9:32 ` rguenth at gcc dot gnu.org
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: crazylht at gmail dot com @ 2020-12-21 11:59 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98399
--- Comment #3 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Hongtao.liu from comment #2)
> It's the problem of veclower
>
The root cause is a missing expander for vashrv16qi: although x86 doesn't have a
vector int8 shift instruction, it can be emulated with vector int16 shifts.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug target/98399] x86: Awful code generation for shifting vectors
2020-12-20 14:13 [Bug target/98399] New: x86: Awful code generation for shifting vectors gabravier at gmail dot com
` (2 preceding siblings ...)
2020-12-21 11:59 ` crazylht at gmail dot com
@ 2021-01-05 9:32 ` rguenth at gcc dot gnu.org
2021-06-24 5:03 ` crazylht at gmail dot com
2021-08-25 5:29 ` pinskia at gcc dot gnu.org
5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-01-05 9:32 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98399
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Last reconfirmed| |2021-01-05
Ever confirmed|0 |1
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug target/98399] x86: Awful code generation for shifting vectors
2020-12-20 14:13 [Bug target/98399] New: x86: Awful code generation for shifting vectors gabravier at gmail dot com
` (3 preceding siblings ...)
2021-01-05 9:32 ` rguenth at gcc dot gnu.org
@ 2021-06-24 5:03 ` crazylht at gmail dot com
2021-08-25 5:29 ` pinskia at gcc dot gnu.org
5 siblings, 0 replies; 7+ messages in thread
From: crazylht at gmail dot com @ 2021-06-24 5:03 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98399
Bug 98399 depends on bug 98434, which changed state.
Bug 98434 Summary: [AVX512] Missing expander for vashl<VI2_AVX512BW>, vlshr<VI2_AVX512BW>, vashr{v32hi,v16hi,v4di,v8di}
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98434
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |RESOLVED
Resolution|--- |FIXED
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug target/98399] x86: Awful code generation for shifting vectors
2020-12-20 14:13 [Bug target/98399] New: x86: Awful code generation for shifting vectors gabravier at gmail dot com
` (4 preceding siblings ...)
2021-06-24 5:03 ` crazylht at gmail dot com
@ 2021-08-25 5:29 ` pinskia at gcc dot gnu.org
5 siblings, 0 replies; 7+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-08-25 5:29 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98399
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|NEW |RESOLVED
Resolution|--- |DUPLICATE
--- Comment #4 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
This is a dup of bug 98934. The issues are the same except for &1.
*** This bug has been marked as a duplicate of bug 98934 ***
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2021-08-25 5:29 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-20 14:13 [Bug target/98399] New: x86: Awful code generation for shifting vectors gabravier at gmail dot com
2020-12-21 1:57 ` [Bug target/98399] " crazylht at gmail dot com
2020-12-21 11:45 ` crazylht at gmail dot com
2020-12-21 11:59 ` crazylht at gmail dot com
2021-01-05 9:32 ` rguenth at gcc dot gnu.org
2021-06-24 5:03 ` crazylht at gmail dot com
2021-08-25 5:29 ` pinskia at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).