public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/55155] New: Autovectorization does not use unaligned loads/stores
@ 2012-10-31 21:33 sgunderson at bigfoot dot com
2012-10-31 21:44 ` [Bug tree-optimization/55155] " pinskia at gcc dot gnu.org
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: sgunderson at bigfoot dot com @ 2012-10-31 21:33 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55155
Bug #: 55155
Summary: Autovectorization does not use unaligned loads/stores
Classification: Unclassified
Product: gcc
Version: 4.7.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassigned@gcc.gnu.org
ReportedBy: sgunderson@bigfoot.com
Hi,
I am on
gcc version 4.7.1 (Debian 4.7.1-7)
and a project of mine had code that looked like this:
beklager:~> cat example.cpp
void func(float * __restrict prod_features, float * __restrict
grad_prod_features, float alpha, unsigned num_prods) {
float *pf = (float *)__builtin_assume_aligned(prod_features, 16);
float *gpf = (float *)__builtin_assume_aligned(grad_prod_features, 16);
for (unsigned i = 0; i < num_prods * 16; ++i) {
prod_features[i] -= alpha * grad_prod_features[i];
//pf[i] -= alpha * gpf[i];
}
}
This would seem like a great case for autovectorization, so I tried:
beklager:~> g++ -Wall -O2 -ftree-vectorize -msse4.1 -c example.cpp
example.cpp: In function ‘void func(float*, float*, float, unsigned int)’:
example.cpp:2:9: warning: unused variable ‘pf’ [-Wunused-variable]
example.cpp:3:9: warning: unused variable ‘gpf’ [-Wunused-variable]
The resulting code, however, is a train wreck:
beklager:~> objdump --disassemble --demangle example.o
example.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <func(float*, float*, float, unsigned int)>:
0: 55 push %rbp
1: c1 e2 04 shl $0x4,%edx
4: 85 d2 test %edx,%edx
6: 53 push %rbx
7: 0f 84 ef 00 00 00 je fc <func(float*, float*, float,
unsigned int)+0xfc>
d: 49 89 f8 mov %rdi,%r8
10: 41 83 e0 0f and $0xf,%r8d
14: 49 c1 e8 02 shr $0x2,%r8
18: 49 f7 d8 neg %r8
1b: 41 83 e0 03 and $0x3,%r8d
1f: 44 39 c2 cmp %r8d,%edx
22: 44 0f 42 c2 cmovb %edx,%r8d
26: 83 fa 04 cmp $0x4,%edx
29: 0f 87 d0 00 00 00 ja ff <func(float*, float*, float,
unsigned int)+0xff>
2f: 41 89 d0 mov %edx,%r8d
32: 31 c0 xor %eax,%eax
34: 0f 1f 40 00 nopl 0x0(%rax)
38: f3 0f 10 14 86 movss (%rsi,%rax,4),%xmm2
3d: 8d 48 01 lea 0x1(%rax),%ecx
40: f3 0f 59 d0 mulss %xmm0,%xmm2
44: f3 0f 10 0c 87 movss (%rdi,%rax,4),%xmm1
49: f3 0f 5c ca subss %xmm2,%xmm1
4d: f3 0f 11 0c 87 movss %xmm1,(%rdi,%rax,4)
52: 48 83 c0 01 add $0x1,%rax
56: 41 39 c0 cmp %eax,%r8d
59: 77 dd ja 38 <func(float*, float*, float,
unsigned int)+0x38>
5b: 44 39 c2 cmp %r8d,%edx
5e: 0f 84 98 00 00 00 je fc <func(float*, float*, float,
unsigned int)+0xfc>
64: 89 d5 mov %edx,%ebp
66: 45 89 c1 mov %r8d,%r9d
69: 44 29 c5 sub %r8d,%ebp
6c: 41 89 eb mov %ebp,%r11d
6f: 41 c1 eb 02 shr $0x2,%r11d
73: 42 8d 1c 9d 00 00 00 lea 0x0(,%r11,4),%ebx
7a: 00
7b: 85 db test %ebx,%ebx
7d: 74 59 je d8 <func(float*, float*, float,
unsigned int)+0xd8>
7f: 0f 28 c8 movaps %xmm0,%xmm1
82: 49 c1 e1 02 shl $0x2,%r9
86: 0f 57 db xorps %xmm3,%xmm3
89: 4e 8d 14 0f lea (%rdi,%r9,1),%r10
8d: 0f c6 c9 00 shufps $0x0,%xmm1,%xmm1
91: 49 01 f1 add %rsi,%r9
94: 31 c0 xor %eax,%eax
96: 45 31 c0 xor %r8d,%r8d
99: 0f 28 e1 movaps %xmm1,%xmm4
9c: 0f 1f 40 00 nopl 0x0(%rax)
a0: 0f 28 cb movaps %xmm3,%xmm1
a3: 41 83 c0 01 add $0x1,%r8d
a7: 41 0f 28 14 02 movaps (%r10,%rax,1),%xmm2
ac: 41 0f 12 0c 01 movlps (%r9,%rax,1),%xmm1
b1: 41 0f 16 4c 01 08 movhps 0x8(%r9,%rax,1),%xmm1
b7: 0f 59 cc mulps %xmm4,%xmm1
ba: 0f 5c d1 subps %xmm1,%xmm2
bd: 41 0f 29 14 02 movaps %xmm2,(%r10,%rax,1)
c2: 48 83 c0 10 add $0x10,%rax
c6: 45 39 d8 cmp %r11d,%r8d
c9: 72 d5 jb a0 <func(float*, float*, float,
unsigned int)+0xa0>
cb: 01 d9 add %ebx,%ecx
cd: 39 dd cmp %ebx,%ebp
cf: 74 2b je fc <func(float*, float*, float,
unsigned int)+0xfc>
d1: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
d8: 41 89 c8 mov %ecx,%r8d
db: 83 c1 01 add $0x1,%ecx
de: f3 42 0f 10 14 86 movss (%rsi,%r8,4),%xmm2
e4: 4a 8d 04 87 lea (%rdi,%r8,4),%rax
e8: 39 ca cmp %ecx,%edx
ea: f3 0f 59 d0 mulss %xmm0,%xmm2
ee: f3 0f 10 08 movss (%rax),%xmm1
f2: f3 0f 5c ca subss %xmm2,%xmm1
f6: f3 0f 11 08 movss %xmm1,(%rax)
fa: 77 dc ja d8 <func(float*, float*, float,
unsigned int)+0xd8>
fc: 5b pop %rbx
fd: 5d pop %rbp
fe: c3 retq
ff: 45 85 c0 test %r8d,%r8d
102: 0f 85 2a ff ff ff jne 32 <func(float*, float*, float,
unsigned int)+0x32>
108: 31 c9 xor %ecx,%ecx
10a: e9 55 ff ff ff jmpq 64 <func(float*, float*, float,
unsigned int)+0x64>
There are two potential issues here:
1. It knows that my two arrays are not necessarily 16-byte aligned, so it emits
a huge body of code around it. (If I comment out the line in the inner loop and
uncomment the one next to it, much of this code disappears.) It should simply
write the loop using unaligned loads/stores (movups) instead of trying to piece
together packed scalars with movlps and movhps itself.
2. For some reason, it doesn't understand that (num_prods * 16) is always
divisible by four, so it emits extra code to handle leftover elements.
If I change num_prods to a constant (e.g. 64), and use the variables that are
assumed 16-aligned, the output is much more sane:
beklager:~> cat example.cpp
void func(float * __restrict prod_features, float * __restrict
grad_prod_features, float alpha, unsigned num_prods) {
float *pf = (float *)__builtin_assume_aligned(prod_features, 16);
float *gpf = (float *)__builtin_assume_aligned(grad_prod_features, 16);
for (unsigned i = 0; i < 64 * 16; ++i) {
//prod_features[i] -= alpha * grad_prod_features[i];
pf[i] -= alpha * gpf[i];
}
}
beklager:~> g++ -Wall -O2 -ftree-vectorize -msse4.1 -c example.cpp
beklager:~> objdump --disassemble --demangle example.o
example.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <func(float*, float*, float, unsigned int)>:
0: 0f 28 c8 movaps %xmm0,%xmm1
3: 31 c0 xor %eax,%eax
5: 0f c6 c9 00 shufps $0x0,%xmm1,%xmm1
9: 0f 28 d1 movaps %xmm1,%xmm2
c: 0f 1f 40 00 nopl 0x0(%rax)
10: 0f 28 0c 06 movaps (%rsi,%rax,1),%xmm1
14: 0f 59 ca mulps %xmm2,%xmm1
17: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
1b: 0f 5c c1 subps %xmm1,%xmm0
1e: 0f 29 04 07 movaps %xmm0,(%rdi,%rax,1)
22: 48 83 c0 10 add $0x10,%rax
26: 48 3d 00 10 00 00 cmp $0x1000,%rax
2c: 75 e2 jne 10 <func(float*, float*, float,
unsigned int)+0x10>
2e: f3 c3 repz retq
although in this case, one could argue that it should have fused the
movaps+subps+movaps to a single subps from memory.
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug tree-optimization/55155] Autovectorization does not use unaligned loads/stores
2012-10-31 21:33 [Bug tree-optimization/55155] New: Autovectorization does not use unaligned loads/stores sgunderson at bigfoot dot com
@ 2012-10-31 21:44 ` pinskia at gcc dot gnu.org
2021-06-08 8:28 ` pinskia at gcc dot gnu.org
2022-11-07 20:02 ` aldyh at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: pinskia at gcc dot gnu.org @ 2012-10-31 21:44 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55155
--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> 2012-10-31 21:44:20 UTC ---
<bb 2>:
_19 = num_prods_6(D) * 16;
if (_19 != 0)
goto <bb 4>;
else
goto <bb 3>;
<bb 3>:
return;
<bb 4>:
_16 = ASSERT_EXPR <_19, _19 != 0>;
...
if (_16 <= 4)
goto <bb 5>;
else
goto <bb 21>;
We should have an assert_expr that _16 is also greater than or equal to 16.
Note that this dump is with unsigned changed to __SIZE_TYPE__, so that casting
does not get in the way.
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug tree-optimization/55155] Autovectorization does not use unaligned loads/stores
2012-10-31 21:33 [Bug tree-optimization/55155] New: Autovectorization does not use unaligned loads/stores sgunderson at bigfoot dot com
2012-10-31 21:44 ` [Bug tree-optimization/55155] " pinskia at gcc dot gnu.org
@ 2021-06-08 8:28 ` pinskia at gcc dot gnu.org
2022-11-07 20:02 ` aldyh at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-06-08 8:28 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55155
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Target Milestone|--- |8.0
Status|UNCONFIRMED |RESOLVED
Resolution|--- |FIXED
--- Comment #2 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
This has been fixed since at least GCC 8.
_Z4funcPfS_fj:
sall $4, %edx
je .L1
shrl $2, %edx
xorl %eax, %eax
shufps $0, %xmm0, %xmm0
salq $4, %rdx
.p2align 4,,10
.p2align 3
.L3:
movups (%rsi,%rax), %xmm1
movups (%rdi,%rax), %xmm2
mulps %xmm0, %xmm1
subps %xmm1, %xmm2
movups %xmm2, (%rdi,%rax)
addq $16, %rax
cmpq %rax, %rdx
jne .L3
.L1:
ret
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug tree-optimization/55155] Autovectorization does not use unaligned loads/stores
2012-10-31 21:33 [Bug tree-optimization/55155] New: Autovectorization does not use unaligned loads/stores sgunderson at bigfoot dot com
2012-10-31 21:44 ` [Bug tree-optimization/55155] " pinskia at gcc dot gnu.org
2021-06-08 8:28 ` pinskia at gcc dot gnu.org
@ 2022-11-07 20:02 ` aldyh at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: aldyh at gcc dot gnu.org @ 2022-11-07 20:02 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55155
Bug 55155 depends on bug 55157, which changed state.
Bug 55157 Summary: Missed VRP with != 0 and multiply
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55157
What |Removed |Added
----------------------------------------------------------------------------
Status|NEW |RESOLVED
Resolution|--- |FIXED
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2022-11-07 20:02 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-10-31 21:33 [Bug tree-optimization/55155] New: Autovectorization does not use unaligned loads/stores sgunderson at bigfoot dot com
2012-10-31 21:44 ` [Bug tree-optimization/55155] " pinskia at gcc dot gnu.org
2021-06-08 8:28 ` pinskia at gcc dot gnu.org
2022-11-07 20:02 ` aldyh at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).