public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/39821] New: 120% slowdown with vectorizer
@ 2009-04-20 0:23 ramiro86 at hotmail dot com
2009-04-20 9:26 ` [Bug tree-optimization/39821] " rguenth at gcc dot gnu dot org
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: ramiro86 at hotmail dot com @ 2009-04-20 0:23 UTC (permalink / raw)
To: gcc-bugs
The vectorizer produces horrible code with this testcase:
$ cat dotproduct.c
#include "inttypes.h"
int64_t dotproduct(int32_t *v1, int32_t *v2, int order)
{
int64_t accum = 0;
while (order--)
accum += (int64_t) *v1++ * *v2++;
return accum;
}
int64_t dotproduct_order4(int32_t *v1, int32_t *v2, int order)
{
return dotproduct(v1, v2, 4);
}
$ gcc-4.4rc1 -o dotproduct.o -c dotproduct.c -O3
$ gcc-4.4rc1 -o dotproduct-no-vectorize.o -c dotproduct.c -O3 -fno-tree-vectorize
$ objdump -d dotproduct.o
dotproduct.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <dotproduct>:
0: 31 c0 xor %eax,%eax
2: 85 d2 test %edx,%edx
4: 0f 84 4e 01 00 00 je 158 <dotproduct+0x158>
a: 41 89 d0 mov %edx,%r8d
d: 44 8d 52 ff lea -0x1(%rdx),%r10d
11: 41 c1 e8 02 shr $0x2,%r8d
15: 83 fa 03 cmp $0x3,%edx
18: 46 8d 0c 85 00 00 00 lea 0x0(,%r8,4),%r9d
1f: 00
20: 76 05 jbe 27 <dotproduct+0x27>
22: 45 85 c9 test %r9d,%r9d
25: 75 09 jne 30 <dotproduct+0x30>
27: 31 c0 xor %eax,%eax
29: e9 fc 00 00 00 jmpq 12a <dotproduct+0x12a>
2e: 66 90 xchg %ax,%ax
30: 66 0f ef c0 pxor %xmm0,%xmm0
34: 31 c0 xor %eax,%eax
36: 66 45 0f ef c9 pxor %xmm9,%xmm9
3b: 31 c9 xor %ecx,%ecx
3d: 0f 1f 00 nopl (%rax)
40: f3 0f 6f 14 07 movdqu (%rdi,%rax,1),%xmm2
45: 83 c1 01 add $0x1,%ecx
48: 66 41 0f 6f d9 movdqa %xmm9,%xmm3
4d: f3 0f 6f 24 06 movdqu (%rsi,%rax,1),%xmm4
52: 66 45 0f 6f c1 movdqa %xmm9,%xmm8
57: 66 0f 6f ea movdqa %xmm2,%xmm5
5b: 48 83 c0 10 add $0x10,%rax
5f: 66 0f 66 dc pcmpgtd %xmm4,%xmm3
63: 66 0f 6f fc movdqa %xmm4,%xmm7
67: 66 44 0f 66 c2 pcmpgtd %xmm2,%xmm8
6c: 41 39 c8 cmp %ecx,%r8d
6f: 66 0f 62 fb punpckldq %xmm3,%xmm7
73: 66 41 0f 62 e8 punpckldq %xmm8,%xmm5
78: 66 0f 6a e3 punpckhdq %xmm3,%xmm4
7c: 66 41 0f 6a d0 punpckhdq %xmm8,%xmm2
81: 66 0f 6f cf movdqa %xmm7,%xmm1
85: 66 0f 6f f5 movdqa %xmm5,%xmm6
89: 66 44 0f 6f d7 movdqa %xmm7,%xmm10
8e: 66 0f f4 cd pmuludq %xmm5,%xmm1
92: 66 0f 6f da movdqa %xmm2,%xmm3
96: 66 0f 73 d6 20 psrlq $0x20,%xmm6
9b: 66 0f f4 f7 pmuludq %xmm7,%xmm6
9f: 66 41 0f 73 d2 20 psrlq $0x20,%xmm10
a5: 66 0f 73 f6 20 psllq $0x20,%xmm6
aa: 66 41 0f f4 ea pmuludq %xmm10,%xmm5
af: 66 0f d4 ce paddq %xmm6,%xmm1
b3: 66 0f 73 f5 20 psllq $0x20,%xmm5
b8: 66 0f d4 cd paddq %xmm5,%xmm1
bc: 66 0f 6f ec movdqa %xmm4,%xmm5
c0: 66 0f d4 c8 paddq %xmm0,%xmm1
c4: 66 0f 73 d3 20 psrlq $0x20,%xmm3
c9: 66 0f 6f c4 movdqa %xmm4,%xmm0
cd: 66 0f f4 dc pmuludq %xmm4,%xmm3
d1: 66 0f 73 f3 20 psllq $0x20,%xmm3
d6: 66 0f 73 d5 20 psrlq $0x20,%xmm5
db: 66 0f f4 c2 pmuludq %xmm2,%xmm0
df: 66 0f f4 d5 pmuludq %xmm5,%xmm2
e3: 66 0f d4 c3 paddq %xmm3,%xmm0
e7: 66 0f 73 f2 20 psllq $0x20,%xmm2
ec: 66 0f d4 c2 paddq %xmm2,%xmm0
f0: 66 0f d4 c1 paddq %xmm1,%xmm0
f4: 0f 87 46 ff ff ff ja 40 <dotproduct+0x40>
fa: 42 8d 0c 8d 00 00 00 lea 0x0(,%r9,4),%ecx
101: 00
102: 66 0f 6f c8 movdqa %xmm0,%xmm1
106: 45 29 ca sub %r9d,%r10d
109: 89 c9 mov %ecx,%ecx
10b: 66 0f 73 d9 08 psrldq $0x8,%xmm1
110: 66 0f d4 c1 paddq %xmm1,%xmm0
114: 48 01 cf add %rcx,%rdi
117: 48 01 ce add %rcx,%rsi
11a: 44 39 ca cmp %r9d,%edx
11d: 66 0f d6 44 24 f8 movq %xmm0,-0x8(%rsp)
123: 48 8b 44 24 f8 mov -0x8(%rsp),%rax
128: 74 2e je 158 <dotproduct+0x158>
12a: 45 89 d2 mov %r10d,%r10d
12d: 31 d2 xor %edx,%edx
12f: 4e 8d 0c 95 04 00 00 lea 0x4(,%r10,4),%r9
136: 00
137: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
13e: 00 00
140: 48 63 0c 16 movslq (%rsi,%rdx,1),%rcx
144: 4c 63 04 17 movslq (%rdi,%rdx,1),%r8
148: 48 83 c2 04 add $0x4,%rdx
14c: 49 0f af c8 imul %r8,%rcx
150: 48 01 c8 add %rcx,%rax
153: 4c 39 ca cmp %r9,%rdx
156: 75 e8 jne 140 <dotproduct+0x140>
158: f3 c3 repz retq
15a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
0000000000000160 <dotproduct_order4>:
160: 66 0f ef c0 pxor %xmm0,%xmm0
164: f3 0f 6f 0f movdqu (%rdi),%xmm1
168: f3 0f 6f 1e movdqu (%rsi),%xmm3
16c: 66 0f 6f d0 movdqa %xmm0,%xmm2
170: 66 0f 6f f1 movdqa %xmm1,%xmm6
174: 66 0f 66 c1 pcmpgtd %xmm1,%xmm0
178: 66 0f 6f fb movdqa %xmm3,%xmm7
17c: 66 0f 66 d3 pcmpgtd %xmm3,%xmm2
180: 66 0f 62 f0 punpckldq %xmm0,%xmm6
184: 66 0f 62 fa punpckldq %xmm2,%xmm7
188: 66 0f 6a da punpckhdq %xmm2,%xmm3
18c: 66 0f 6a c8 punpckhdq %xmm0,%xmm1
190: 66 0f 6f ee movdqa %xmm6,%xmm5
194: 66 44 0f 6f c7 movdqa %xmm7,%xmm8
199: 66 0f 6f e7 movdqa %xmm7,%xmm4
19d: 66 0f 6f c3 movdqa %xmm3,%xmm0
1a1: 66 0f 73 d5 20 psrlq $0x20,%xmm5
1a6: 66 44 0f f4 c6 pmuludq %xmm6,%xmm8
1ab: 66 0f f4 ef pmuludq %xmm7,%xmm5
1af: 66 0f 6f d1 movdqa %xmm1,%xmm2
1b3: 66 0f 73 d4 20 psrlq $0x20,%xmm4
1b8: 66 0f 73 f5 20 psllq $0x20,%xmm5
1bd: 66 0f f4 e6 pmuludq %xmm6,%xmm4
1c1: 66 41 0f d4 e8 paddq %xmm8,%xmm5
1c6: 66 0f 73 f4 20 psllq $0x20,%xmm4
1cb: 66 0f d4 e5 paddq %xmm5,%xmm4
1cf: 66 0f 6f eb movdqa %xmm3,%xmm5
1d3: 66 0f f4 c1 pmuludq %xmm1,%xmm0
1d7: 66 0f 73 d2 20 psrlq $0x20,%xmm2
1dc: 66 0f f4 d3 pmuludq %xmm3,%xmm2
1e0: 66 0f 73 f2 20 psllq $0x20,%xmm2
1e5: 66 0f d4 c2 paddq %xmm2,%xmm0
1e9: 66 0f 73 d5 20 psrlq $0x20,%xmm5
1ee: 66 0f f4 cd pmuludq %xmm5,%xmm1
1f2: 66 0f 73 f1 20 psllq $0x20,%xmm1
1f7: 66 0f d4 c1 paddq %xmm1,%xmm0
1fb: 66 0f d4 c4 paddq %xmm4,%xmm0
1ff: 66 0f 6f c8 movdqa %xmm0,%xmm1
203: 66 0f 73 d9 08 psrldq $0x8,%xmm1
208: 66 0f d4 c1 paddq %xmm1,%xmm0
20c: 66 0f d6 44 24 f8 movq %xmm0,-0x8(%rsp)
212: 48 8b 44 24 f8 mov -0x8(%rsp),%rax
217: c3 retq
$ objdump -d dotproduct-no-vectorize.o
dotproduct-no-vectorize.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <dotproduct>:
0: 31 c0 xor %eax,%eax
2: 85 d2 test %edx,%edx
4: 74 2a je 30 <dotproduct+0x30>
6: 83 ea 01 sub $0x1,%edx
9: 4c 8d 0c 95 04 00 00 lea 0x4(,%rdx,4),%r9
10: 00
11: 31 d2 xor %edx,%edx
13: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
18: 48 63 0c 16 movslq (%rsi,%rdx,1),%rcx
1c: 4c 63 04 17 movslq (%rdi,%rdx,1),%r8
20: 48 83 c2 04 add $0x4,%rdx
24: 49 0f af c8 imul %r8,%rcx
28: 48 01 c8 add %rcx,%rax
2b: 4c 39 ca cmp %r9,%rdx
2e: 75 e8 jne 18 <dotproduct+0x18>
30: f3 c3 repz retq
32: 66 66 66 66 66 2e 0f nopw %cs:0x0(%rax,%rax,1)
39: 1f 84 00 00 00 00 00
0000000000000040 <dotproduct_order4>:
40: 48 63 07 movslq (%rdi),%rax
43: 48 63 16 movslq (%rsi),%rdx
46: 48 63 4f 04 movslq 0x4(%rdi),%rcx
4a: 48 0f af d0 imul %rax,%rdx
4e: 48 63 46 04 movslq 0x4(%rsi),%rax
52: 48 0f af c1 imul %rcx,%rax
56: 48 63 4f 08 movslq 0x8(%rdi),%rcx
5a: 48 01 c2 add %rax,%rdx
5d: 48 63 46 08 movslq 0x8(%rsi),%rax
61: 48 0f af c1 imul %rcx,%rax
65: 48 63 4f 0c movslq 0xc(%rdi),%rcx
69: 48 01 c2 add %rax,%rdx
6c: 48 63 46 0c movslq 0xc(%rsi),%rax
70: 48 0f af c1 imul %rcx,%rax
74: 48 01 d0 add %rdx,%rax
77: c3 retq
--
Summary: 120% slowdown with vectorizer
Product: gcc
Version: 4.4.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: ramiro86 at hotmail dot com
GCC target triplet: x86_64-linux-gnu
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39821
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/39821] 120% slowdown with vectorizer
2009-04-20 0:23 [Bug tree-optimization/39821] New: 120% slowdown with vectorizer ramiro86 at hotmail dot com
@ 2009-04-20 9:26 ` rguenth at gcc dot gnu dot org
2009-04-20 20:52 ` ubizjak at gmail dot com
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu dot org @ 2009-04-20 9:26 UTC (permalink / raw)
To: gcc-bugs
------- Comment #1 from rguenth at gcc dot gnu dot org 2009-04-20 09:26 -------
The vectorizer creates
vect_var_.128_46 = M*vect_p.123_44{misalignment: 0};
vect_var_.129_47 = [vec_unpack_lo_expr] vect_var_.128_46;
vect_var_.129_48 = [vec_unpack_hi_expr] vect_var_.128_46;
vect_var_.135_53 = M*vect_p.130_51{misalignment: 0};
vect_var_.136_54 = [vec_unpack_lo_expr] vect_var_.135_53;
vect_var_.136_55 = [vec_unpack_hi_expr] vect_var_.135_53;
vect_var_.137_56 = vect_var_.136_54 * vect_var_.129_47;
vect_var_.137_57 = vect_var_.136_55 * vect_var_.129_48;
vect_var_.138_59 = vect_var_.137_56 + vect_var_.138_58;
vect_var_.138_60 = vect_var_.137_57 + vect_var_.138_59;
v1_14 = v1_26 + 4;
but the widening unpacking results in abysmal code generated. Where are
all the shifts coming from?
--
rguenth at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |uros at gcc dot gnu dot org,
| |irar at il dot ibm dot com
Status|UNCONFIRMED |NEW
Ever Confirmed|0 |1
Keywords| |missed-optimization
Last reconfirmed|0000-00-00 00:00:00 |2009-04-20 09:26:17
date| |
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39821
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/39821] 120% slowdown with vectorizer
2009-04-20 0:23 [Bug tree-optimization/39821] New: 120% slowdown with vectorizer ramiro86 at hotmail dot com
2009-04-20 9:26 ` [Bug tree-optimization/39821] " rguenth at gcc dot gnu dot org
@ 2009-04-20 20:52 ` ubizjak at gmail dot com
2009-04-21 0:09 ` ramiro86 at hotmail dot com
2009-04-21 0:10 ` ramiro86 at hotmail dot com
3 siblings, 0 replies; 5+ messages in thread
From: ubizjak at gmail dot com @ 2009-04-20 20:52 UTC (permalink / raw)
To: gcc-bugs
------- Comment #2 from ubizjak at gmail dot com 2009-04-20 20:52 -------
(In reply to comment #1)
> but the widening unpacking results in abysmal code generated. Where are
> all the shifts coming from?
Not from unpacking, but from the mulv2di pattern in sse.md.
Can you please attach the full source needed to create an executable testcase? IIRC,
execution times depend on the target processor, and perhaps the vect cost should be
updated for this case.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39821
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/39821] 120% slowdown with vectorizer
2009-04-20 0:23 [Bug tree-optimization/39821] New: 120% slowdown with vectorizer ramiro86 at hotmail dot com
2009-04-20 9:26 ` [Bug tree-optimization/39821] " rguenth at gcc dot gnu dot org
2009-04-20 20:52 ` ubizjak at gmail dot com
@ 2009-04-21 0:09 ` ramiro86 at hotmail dot com
2009-04-21 0:10 ` ramiro86 at hotmail dot com
3 siblings, 0 replies; 5+ messages in thread
From: ramiro86 at hotmail dot com @ 2009-04-21 0:09 UTC (permalink / raw)
To: gcc-bugs
------- Comment #3 from ramiro86 at hotmail dot com 2009-04-21 00:08 -------
Created an attachment (id=17660)
--> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=17660&action=view)
tarball of a simple testcase
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39821
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/39821] 120% slowdown with vectorizer
2009-04-20 0:23 [Bug tree-optimization/39821] New: 120% slowdown with vectorizer ramiro86 at hotmail dot com
` (2 preceding siblings ...)
2009-04-21 0:09 ` ramiro86 at hotmail dot com
@ 2009-04-21 0:10 ` ramiro86 at hotmail dot com
3 siblings, 0 replies; 5+ messages in thread
From: ramiro86 at hotmail dot com @ 2009-04-21 0:10 UTC (permalink / raw)
To: gcc-bugs
------- Comment #4 from ramiro86 at hotmail dot com 2009-04-21 00:10 -------
I've attached a simple testcase. The system I'm running this on is a q6600 with
64-bit Linux.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39821
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2009-04-21 0:10 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-20 0:23 [Bug tree-optimization/39821] New: 120% slowdown with vectorizer ramiro86 at hotmail dot com
2009-04-20 9:26 ` [Bug tree-optimization/39821] " rguenth at gcc dot gnu dot org
2009-04-20 20:52 ` ubizjak at gmail dot com
2009-04-21 0:09 ` ramiro86 at hotmail dot com
2009-04-21 0:10 ` ramiro86 at hotmail dot com
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).