[Bug tree-optimization/39821] New: 120% slowdown with vectorizer

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug tree-optimization/39821]  New: 120% slowdown with vectorizer
@ 2009-04-20  0:23 ramiro86 at hotmail dot com
  2009-04-20  9:26 ` [Bug tree-optimization/39821] " rguenth at gcc dot gnu dot org
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: ramiro86 at hotmail dot com @ 2009-04-20  0:23 UTC (permalink / raw)
  To: gcc-bugs

The vectorizer produces horrible code with this testcase:

$ cat dotproduct.c 
#include "inttypes.h"

int64_t dotproduct(int32_t *v1, int32_t *v2, int order)
{
    int64_t accum = 0;
    while (order--)
        accum += (int64_t) *v1++ * *v2++;
    return accum;
}

int64_t dotproduct_order4(int32_t *v1, int32_t *v2, int order)
{
    return dotproduct(v1, v2, 4);
}
$ gcc-4.4rc1 -o dotproduct.o -c dotproduct.c -O3
$ gcc-4.4rc1 -o dotproduct-no-vectorize.o -c dotproduct.c -O3 -fno-tree-vectorize
$ objdump -d dotproduct.o

dotproduct.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <dotproduct>:
   0:   31 c0                   xor    %eax,%eax
   2:   85 d2                   test   %edx,%edx
   4:   0f 84 4e 01 00 00       je     158 <dotproduct+0x158>
   a:   41 89 d0                mov    %edx,%r8d
   d:   44 8d 52 ff             lea    -0x1(%rdx),%r10d
  11:   41 c1 e8 02             shr    $0x2,%r8d
  15:   83 fa 03                cmp    $0x3,%edx
  18:   46 8d 0c 85 00 00 00    lea    0x0(,%r8,4),%r9d
  1f:   00 
  20:   76 05                   jbe    27 <dotproduct+0x27>
  22:   45 85 c9                test   %r9d,%r9d
  25:   75 09                   jne    30 <dotproduct+0x30>
  27:   31 c0                   xor    %eax,%eax
  29:   e9 fc 00 00 00          jmpq   12a <dotproduct+0x12a>
  2e:   66 90                   xchg   %ax,%ax
  30:   66 0f ef c0             pxor   %xmm0,%xmm0
  34:   31 c0                   xor    %eax,%eax
  36:   66 45 0f ef c9          pxor   %xmm9,%xmm9
  3b:   31 c9                   xor    %ecx,%ecx
  3d:   0f 1f 00                nopl   (%rax)
  40:   f3 0f 6f 14 07          movdqu (%rdi,%rax,1),%xmm2
  45:   83 c1 01                add    $0x1,%ecx
  48:   66 41 0f 6f d9          movdqa %xmm9,%xmm3
  4d:   f3 0f 6f 24 06          movdqu (%rsi,%rax,1),%xmm4
  52:   66 45 0f 6f c1          movdqa %xmm9,%xmm8
  57:   66 0f 6f ea             movdqa %xmm2,%xmm5
  5b:   48 83 c0 10             add    $0x10,%rax
  5f:   66 0f 66 dc             pcmpgtd %xmm4,%xmm3
  63:   66 0f 6f fc             movdqa %xmm4,%xmm7
  67:   66 44 0f 66 c2          pcmpgtd %xmm2,%xmm8
  6c:   41 39 c8                cmp    %ecx,%r8d
  6f:   66 0f 62 fb             punpckldq %xmm3,%xmm7
  73:   66 41 0f 62 e8          punpckldq %xmm8,%xmm5
  78:   66 0f 6a e3             punpckhdq %xmm3,%xmm4
  7c:   66 41 0f 6a d0          punpckhdq %xmm8,%xmm2
  81:   66 0f 6f cf             movdqa %xmm7,%xmm1
  85:   66 0f 6f f5             movdqa %xmm5,%xmm6
  89:   66 44 0f 6f d7          movdqa %xmm7,%xmm10
  8e:   66 0f f4 cd             pmuludq %xmm5,%xmm1
  92:   66 0f 6f da             movdqa %xmm2,%xmm3
  96:   66 0f 73 d6 20          psrlq  $0x20,%xmm6
  9b:   66 0f f4 f7             pmuludq %xmm7,%xmm6
  9f:   66 41 0f 73 d2 20       psrlq  $0x20,%xmm10
  a5:   66 0f 73 f6 20          psllq  $0x20,%xmm6
  aa:   66 41 0f f4 ea          pmuludq %xmm10,%xmm5
  af:   66 0f d4 ce             paddq  %xmm6,%xmm1
  b3:   66 0f 73 f5 20          psllq  $0x20,%xmm5
  b8:   66 0f d4 cd             paddq  %xmm5,%xmm1
  bc:   66 0f 6f ec             movdqa %xmm4,%xmm5
  c0:   66 0f d4 c8             paddq  %xmm0,%xmm1
  c4:   66 0f 73 d3 20          psrlq  $0x20,%xmm3
  c9:   66 0f 6f c4             movdqa %xmm4,%xmm0
  cd:   66 0f f4 dc             pmuludq %xmm4,%xmm3
  d1:   66 0f 73 f3 20          psllq  $0x20,%xmm3
  d6:   66 0f 73 d5 20          psrlq  $0x20,%xmm5
  db:   66 0f f4 c2             pmuludq %xmm2,%xmm0
  df:   66 0f f4 d5             pmuludq %xmm5,%xmm2
  e3:   66 0f d4 c3             paddq  %xmm3,%xmm0
  e7:   66 0f 73 f2 20          psllq  $0x20,%xmm2
  ec:   66 0f d4 c2             paddq  %xmm2,%xmm0
  f0:   66 0f d4 c1             paddq  %xmm1,%xmm0
  f4:   0f 87 46 ff ff ff       ja     40 <dotproduct+0x40>
  fa:   42 8d 0c 8d 00 00 00    lea    0x0(,%r9,4),%ecx
 101:   00 
 102:   66 0f 6f c8             movdqa %xmm0,%xmm1
 106:   45 29 ca                sub    %r9d,%r10d
 109:   89 c9                   mov    %ecx,%ecx
 10b:   66 0f 73 d9 08          psrldq $0x8,%xmm1
 110:   66 0f d4 c1             paddq  %xmm1,%xmm0
 114:   48 01 cf                add    %rcx,%rdi
 117:   48 01 ce                add    %rcx,%rsi
 11a:   44 39 ca                cmp    %r9d,%edx
 11d:   66 0f d6 44 24 f8       movq   %xmm0,-0x8(%rsp)
 123:   48 8b 44 24 f8          mov    -0x8(%rsp),%rax
 128:   74 2e                   je     158 <dotproduct+0x158>
 12a:   45 89 d2                mov    %r10d,%r10d
 12d:   31 d2                   xor    %edx,%edx
 12f:   4e 8d 0c 95 04 00 00    lea    0x4(,%r10,4),%r9
 136:   00 
 137:   66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
 13e:   00 00 
 140:   48 63 0c 16             movslq (%rsi,%rdx,1),%rcx
 144:   4c 63 04 17             movslq (%rdi,%rdx,1),%r8
 148:   48 83 c2 04             add    $0x4,%rdx
 14c:   49 0f af c8             imul   %r8,%rcx
 150:   48 01 c8                add    %rcx,%rax
 153:   4c 39 ca                cmp    %r9,%rdx
 156:   75 e8                   jne    140 <dotproduct+0x140>
 158:   f3 c3                   repz retq 
 15a:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)

0000000000000160 <dotproduct_order4>:
 160:   66 0f ef c0             pxor   %xmm0,%xmm0
 164:   f3 0f 6f 0f             movdqu (%rdi),%xmm1
 168:   f3 0f 6f 1e             movdqu (%rsi),%xmm3
 16c:   66 0f 6f d0             movdqa %xmm0,%xmm2
 170:   66 0f 6f f1             movdqa %xmm1,%xmm6
 174:   66 0f 66 c1             pcmpgtd %xmm1,%xmm0
 178:   66 0f 6f fb             movdqa %xmm3,%xmm7
 17c:   66 0f 66 d3             pcmpgtd %xmm3,%xmm2
 180:   66 0f 62 f0             punpckldq %xmm0,%xmm6
 184:   66 0f 62 fa             punpckldq %xmm2,%xmm7
 188:   66 0f 6a da             punpckhdq %xmm2,%xmm3
 18c:   66 0f 6a c8             punpckhdq %xmm0,%xmm1
 190:   66 0f 6f ee             movdqa %xmm6,%xmm5
 194:   66 44 0f 6f c7          movdqa %xmm7,%xmm8
 199:   66 0f 6f e7             movdqa %xmm7,%xmm4
 19d:   66 0f 6f c3             movdqa %xmm3,%xmm0
 1a1:   66 0f 73 d5 20          psrlq  $0x20,%xmm5
 1a6:   66 44 0f f4 c6          pmuludq %xmm6,%xmm8
 1ab:   66 0f f4 ef             pmuludq %xmm7,%xmm5
 1af:   66 0f 6f d1             movdqa %xmm1,%xmm2
 1b3:   66 0f 73 d4 20          psrlq  $0x20,%xmm4
 1b8:   66 0f 73 f5 20          psllq  $0x20,%xmm5
 1bd:   66 0f f4 e6             pmuludq %xmm6,%xmm4
 1c1:   66 41 0f d4 e8          paddq  %xmm8,%xmm5
 1c6:   66 0f 73 f4 20          psllq  $0x20,%xmm4
 1cb:   66 0f d4 e5             paddq  %xmm5,%xmm4
 1cf:   66 0f 6f eb             movdqa %xmm3,%xmm5
 1d3:   66 0f f4 c1             pmuludq %xmm1,%xmm0
 1d7:   66 0f 73 d2 20          psrlq  $0x20,%xmm2
 1dc:   66 0f f4 d3             pmuludq %xmm3,%xmm2
 1e0:   66 0f 73 f2 20          psllq  $0x20,%xmm2
 1e5:   66 0f d4 c2             paddq  %xmm2,%xmm0
 1e9:   66 0f 73 d5 20          psrlq  $0x20,%xmm5
 1ee:   66 0f f4 cd             pmuludq %xmm5,%xmm1
 1f2:   66 0f 73 f1 20          psllq  $0x20,%xmm1
 1f7:   66 0f d4 c1             paddq  %xmm1,%xmm0
 1fb:   66 0f d4 c4             paddq  %xmm4,%xmm0
 1ff:   66 0f 6f c8             movdqa %xmm0,%xmm1
 203:   66 0f 73 d9 08          psrldq $0x8,%xmm1
 208:   66 0f d4 c1             paddq  %xmm1,%xmm0
 20c:   66 0f d6 44 24 f8       movq   %xmm0,-0x8(%rsp)
 212:   48 8b 44 24 f8          mov    -0x8(%rsp),%rax
 217:   c3                      retq   
$ objdump -d dotproduct-no-vectorize.o

dotproduct-no-vectorize.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <dotproduct>:
   0:   31 c0                   xor    %eax,%eax
   2:   85 d2                   test   %edx,%edx
   4:   74 2a                   je     30 <dotproduct+0x30>
   6:   83 ea 01                sub    $0x1,%edx
   9:   4c 8d 0c 95 04 00 00    lea    0x4(,%rdx,4),%r9
  10:   00 
  11:   31 d2                   xor    %edx,%edx
  13:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  18:   48 63 0c 16             movslq (%rsi,%rdx,1),%rcx
  1c:   4c 63 04 17             movslq (%rdi,%rdx,1),%r8
  20:   48 83 c2 04             add    $0x4,%rdx
  24:   49 0f af c8             imul   %r8,%rcx
  28:   48 01 c8                add    %rcx,%rax
  2b:   4c 39 ca                cmp    %r9,%rdx
  2e:   75 e8                   jne    18 <dotproduct+0x18>
  30:   f3 c3                   repz retq 
  32:   66 66 66 66 66 2e 0f    nopw   %cs:0x0(%rax,%rax,1)
  39:   1f 84 00 00 00 00 00 

0000000000000040 <dotproduct_order4>:
  40:   48 63 07                movslq (%rdi),%rax
  43:   48 63 16                movslq (%rsi),%rdx
  46:   48 63 4f 04             movslq 0x4(%rdi),%rcx
  4a:   48 0f af d0             imul   %rax,%rdx
  4e:   48 63 46 04             movslq 0x4(%rsi),%rax
  52:   48 0f af c1             imul   %rcx,%rax
  56:   48 63 4f 08             movslq 0x8(%rdi),%rcx
  5a:   48 01 c2                add    %rax,%rdx
  5d:   48 63 46 08             movslq 0x8(%rsi),%rax
  61:   48 0f af c1             imul   %rcx,%rax
  65:   48 63 4f 0c             movslq 0xc(%rdi),%rcx
  69:   48 01 c2                add    %rax,%rdx
  6c:   48 63 46 0c             movslq 0xc(%rsi),%rax
  70:   48 0f af c1             imul   %rcx,%rax
  74:   48 01 d0                add    %rdx,%rax
  77:   c3                      retq


-- 
           Summary: 120% slowdown with vectorizer
           Product: gcc
           Version: 4.4.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: ramiro86 at hotmail dot com
GCC target triplet: x86_64-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39821


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/39821] 120% slowdown with vectorizer
  2009-04-20  0:23 [Bug tree-optimization/39821] New: 120% slowdown with vectorizer ramiro86 at hotmail dot com
@ 2009-04-20  9:26 ` rguenth at gcc dot gnu dot org
  2009-04-20 20:52 ` ubizjak at gmail dot com
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu dot org @ 2009-04-20  9:26 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #1 from rguenth at gcc dot gnu dot org  2009-04-20 09:26 -------
The vectorizer creates

  vect_var_.128_46 = M*vect_p.123_44{misalignment: 0};
  vect_var_.129_47 = [vec_unpack_lo_expr] vect_var_.128_46;
  vect_var_.129_48 = [vec_unpack_hi_expr] vect_var_.128_46;
  vect_var_.135_53 = M*vect_p.130_51{misalignment: 0};
  vect_var_.136_54 = [vec_unpack_lo_expr] vect_var_.135_53;
  vect_var_.136_55 = [vec_unpack_hi_expr] vect_var_.135_53;
  vect_var_.137_56 = vect_var_.136_54 * vect_var_.129_47;
  vect_var_.137_57 = vect_var_.136_55 * vect_var_.129_48;
  vect_var_.138_59 = vect_var_.137_56 + vect_var_.138_58;
  vect_var_.138_60 = vect_var_.137_57 + vect_var_.138_59;
  v1_14 = v1_26 + 4;

but the widening unpacking results in abysmal code generated.  Where are
all the shifts coming from?


-- 

rguenth at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |uros at gcc dot gnu dot org,
                   |                            |irar at il dot ibm dot com
             Status|UNCONFIRMED                 |NEW
     Ever Confirmed|0                           |1
           Keywords|                            |missed-optimization
   Last reconfirmed|0000-00-00 00:00:00         |2009-04-20 09:26:17
               date|                            |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39821


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/39821] 120% slowdown with vectorizer
  2009-04-20  0:23 [Bug tree-optimization/39821] New: 120% slowdown with vectorizer ramiro86 at hotmail dot com
  2009-04-20  9:26 ` [Bug tree-optimization/39821] " rguenth at gcc dot gnu dot org
@ 2009-04-20 20:52 ` ubizjak at gmail dot com
  2009-04-21  0:09 ` ramiro86 at hotmail dot com
  2009-04-21  0:10 ` ramiro86 at hotmail dot com
  3 siblings, 0 replies; 5+ messages in thread
From: ubizjak at gmail dot com @ 2009-04-20 20:52 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #2 from ubizjak at gmail dot com  2009-04-20 20:52 -------
(In reply to comment #1)

> but the widening unpacking results in abysmal code generated.  Where are
> all the shifts coming from?

Not from unpacking, but from the mulv2di pattern in sse.md.

Can you please attach the full source needed to build an executable testcase?
IIRC, execution times depend on the target processor, and perhaps the vect cost
should be updated for this case.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39821


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/39821] 120% slowdown with vectorizer
  2009-04-20  0:23 [Bug tree-optimization/39821] New: 120% slowdown with vectorizer ramiro86 at hotmail dot com
  2009-04-20  9:26 ` [Bug tree-optimization/39821] " rguenth at gcc dot gnu dot org
  2009-04-20 20:52 ` ubizjak at gmail dot com
@ 2009-04-21  0:09 ` ramiro86 at hotmail dot com
  2009-04-21  0:10 ` ramiro86 at hotmail dot com
  3 siblings, 0 replies; 5+ messages in thread
From: ramiro86 at hotmail dot com @ 2009-04-21  0:09 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #3 from ramiro86 at hotmail dot com  2009-04-21 00:08 -------
Created an attachment (id=17660)
 --> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=17660&action=view)
tarball of a simple testcase


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39821


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/39821] 120% slowdown with vectorizer
  2009-04-20  0:23 [Bug tree-optimization/39821] New: 120% slowdown with vectorizer ramiro86 at hotmail dot com
                   ` (2 preceding siblings ...)
  2009-04-21  0:09 ` ramiro86 at hotmail dot com
@ 2009-04-21  0:10 ` ramiro86 at hotmail dot com
  3 siblings, 0 replies; 5+ messages in thread
From: ramiro86 at hotmail dot com @ 2009-04-21  0:10 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #4 from ramiro86 at hotmail dot com  2009-04-21 00:10 -------
I've attached a simple testcase. The system I'm running this on is a q6600 with
64-bit Linux.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39821


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2009-04-21  0:10 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-20  0:23 [Bug tree-optimization/39821] New: 120% slowdown with vectorizer ramiro86 at hotmail dot com
2009-04-20  9:26 ` [Bug tree-optimization/39821] " rguenth at gcc dot gnu dot org
2009-04-20 20:52 ` ubizjak at gmail dot com
2009-04-21  0:09 ` ramiro86 at hotmail dot com
2009-04-21  0:10 ` ramiro86 at hotmail dot com

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).