[Bug tree-optimization/38682] New: [4.4 Regression] speed regression with sse intrinsics and -ffast-math

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug tree-optimization/38682]  New: [4.4 Regression] speed regression with sse intrinsics and -ffast-math
@ 2008-12-31 10:55 tim at klingt dot org
  2008-12-31 15:35 ` [Bug target/38682] " pinskia at gcc dot gnu dot org
                   ` (6 more replies)
  0 siblings, 7 replies; 8+ messages in thread
From: tim at klingt dot org @ 2008-12-31 10:55 UTC (permalink / raw)
  To: gcc-bugs

There is a speed regression in gcc-4.4 (compared to gcc-4.3) with the following code:

void bench_3(float * out, float * in1, float in2, float slope, unsigned int n)
{
    __m128 arg2 = _mm_set_ps(in2, in2+slope, in2+slope+slope,
in2+slope+slope+slope);
    const __m128 vslope = _mm_set_ps1(slope+slope+slope+slope);

    std::size_t loops = n / 4;

    do {
        __m128 arg1 = _mm_load_ps(in1);
        __m128 result = _mm_add_ps(arg1, arg2);
        arg2 = _mm_add_ps(arg2, vslope);
        _mm_store_ps(out, result);
        in1+=4;
        out+=4;
    } while (--loops);
}

gcc-4.3 generates the code:
0000000000400f00 <bench_3(float*, float*, float, float, unsigned int)>:
  400f00:       0f 28 e1                movaps %xmm1,%xmm4
  400f03:       c1 ea 02                shr    $0x2,%edx
  400f06:       f3 0f 58 e0             addss  %xmm0,%xmm4
  400f0a:       89 d2                   mov    %edx,%edx
  400f0c:       0f 28 dc                movaps %xmm4,%xmm3
  400f0f:       31 c0                   xor    %eax,%eax
  400f11:       f3 0f 58 d9             addss  %xmm1,%xmm3
  400f15:       0f 14 e0                unpcklps %xmm0,%xmm4
  400f18:       0f 28 d3                movaps %xmm3,%xmm2
  400f1b:       f3 0f 58 d1             addss  %xmm1,%xmm2
  400f1f:       f3 0f 59 0d 79 17 00    mulss  0x1779(%rip),%xmm1        # 4026a0 <_IO_stdin_used+0xa0>
  400f26:       00 
  400f27:       0f 14 d3                unpcklps %xmm3,%xmm2
  400f2a:       0f c6 c9 00             shufps $0x0,%xmm1,%xmm1
  400f2e:       0f 16 d4                movlhps %xmm4,%xmm2
  400f31:       0f 1f 80 00 00 00 00    nopl   0x0(%rax)
  400f38:       0f 28 04 06             movaps (%rsi,%rax,1),%xmm0
  400f3c:       0f 58 c2                addps  %xmm2,%xmm0
  400f3f:       0f 58 d1                addps  %xmm1,%xmm2
  400f42:       0f 29 04 07             movaps %xmm0,(%rdi,%rax,1)
  400f46:       48 83 c0 10             add    $0x10,%rax
  400f4a:       48 ff ca                dec    %rdx
  400f4d:       75 e9                   jne    400f38 <bench_3(float*, float*, float, float, unsigned int)+0x38>
  400f4f:       f3 c3                   repz retq 
  400f51:       66 66 66 66 66 66 2e    nopw   %cs:0x0(%rax,%rax,1)
  400f58:       0f 1f 84 00 00 00 00 
  400f5f:       00 

while gcc-4.4 generates:
0000000000400ea0 <bench_3(float*, float*, float, float, unsigned int)>:
  400ea0:       0f 28 d9                movaps %xmm1,%xmm3
  400ea3:       c1 ea 02                shr    $0x2,%edx
  400ea6:       f3 0f 58 d8             addss  %xmm0,%xmm3
  400eaa:       89 d2                   mov    %edx,%edx
  400eac:       0f 28 d3                movaps %xmm3,%xmm2
  400eaf:       31 c0                   xor    %eax,%eax
  400eb1:       f3 0f 58 d1             addss  %xmm1,%xmm2
  400eb5:       0f 14 d8                unpcklps %xmm0,%xmm3
  400eb8:       0f 28 e2                movaps %xmm2,%xmm4
  400ebb:       f3 0f 58 e1             addss  %xmm1,%xmm4
  400ebf:       f3 0f 59 0d 39 1e 00    mulss  0x1e39(%rip),%xmm1        # 402d00 <_IO_stdin_used+0xa0>
  400ec6:       00 
  400ec7:       0f 28 c4                movaps %xmm4,%xmm0
  400eca:       0f c6 c9 00             shufps $0x0,%xmm1,%xmm1
  400ece:       0f 14 c2                unpcklps %xmm2,%xmm0
  400ed1:       0f 16 c3                movlhps %xmm3,%xmm0
  400ed4:       0f 1f 40 00             nopl   0x0(%rax)
  400ed8:       0f 28 d0                movaps %xmm0,%xmm2
  400edb:       0f 58 c1                addps  %xmm1,%xmm0
  400ede:       0f 58 14 06             addps  (%rsi,%rax,1),%xmm2
  400ee2:       0f 29 14 07             movaps %xmm2,(%rdi,%rax,1)
  400ee6:       48 83 c0 10             add    $0x10,%rax
  400eea:       48 ff ca                dec    %rdx
  400eed:       75 e9                   jne    400ed8 <bench_3(float*, float*, float, float, unsigned int)+0x38>
  400eef:       f3 c3                   repz retq 
  400ef1:       66 66 66 66 66 66 2e    nopw   %cs:0x0(%rax,%rax,1)
  400ef8:       0f 1f 84 00 00 00 00 
  400eff:       00 

The movaps at 400ec7 is not generated by gcc-4.3. The code generated by
gcc-4.4 runs about 7% slower on a Core 2 (x86_64).

gcc -v:
Using built-in specs.
Target: x86_64-linux-gnu
Configured with: ../gcc-4.4-20081226/configure -v
--with-bugurl=file:///usr/share/doc/gcc-snapshot/README.Bugs
--enable-languages=c,c++ --prefix=/usr/local/lib/gcc-snapshot --enable-shared
--with-system-zlib --disable-nls --enable-clocale=gnu --enable-libstdcxx-debug
--enable-gtk-cairo --disable-plugin --enable-objc-gc --enable-mpfr
--disable-werror --build=x86_64-linux-gnu --host=x86_64-linux-gnu
--target=x86_64-linux-gnu
Thread model: posix
gcc version 4.4.0 20081226 (experimental) (GCC)


-- 
           Summary: [4.4 Regression] speed regression with sse intrinsics
                    and -ffast-math
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: tim at klingt dot org
GCC target triplet: x86_64-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38682


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug target/38682] [4.4 Regression] speed regression with sse intrinsics and -ffast-math
  2008-12-31 10:55 [Bug tree-optimization/38682] New: [4.4 Regression] speed regression with sse intrinsics and -ffast-math tim at klingt dot org
@ 2008-12-31 15:35 ` pinskia at gcc dot gnu dot org
  2009-01-01  5:16 ` pinskia at gcc dot gnu dot org
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: pinskia at gcc dot gnu dot org @ 2008-12-31 15:35 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #1 from pinskia at gcc dot gnu dot org  2008-12-31 15:34 -------
This is a target issue really.  The number and type of instructions is the
same.  The difference is just a little reassociation in the addition.


-- 

pinskia at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
          Component|tree-optimization           |target


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38682


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug target/38682] [4.4 Regression] speed regression with sse intrinsics and -ffast-math
  2008-12-31 10:55 [Bug tree-optimization/38682] New: [4.4 Regression] speed regression with sse intrinsics and -ffast-math tim at klingt dot org
  2008-12-31 15:35 ` [Bug target/38682] " pinskia at gcc dot gnu dot org
@ 2009-01-01  5:16 ` pinskia at gcc dot gnu dot org
  2009-01-05 11:29 ` rguenth at gcc dot gnu dot org
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: pinskia at gcc dot gnu dot org @ 2009-01-01  5:16 UTC (permalink / raw)
  To: gcc-bugs



-- 

pinskia at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Target Milestone|---                         |4.4.0
            Version|unknown                     |4.4.0


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38682


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug target/38682] [4.4 Regression] speed regression with sse intrinsics and -ffast-math
  2008-12-31 10:55 [Bug tree-optimization/38682] New: [4.4 Regression] speed regression with sse intrinsics and -ffast-math tim at klingt dot org
  2008-12-31 15:35 ` [Bug target/38682] " pinskia at gcc dot gnu dot org
  2009-01-01  5:16 ` pinskia at gcc dot gnu dot org
@ 2009-01-05 11:29 ` rguenth at gcc dot gnu dot org
  2009-01-06 20:22 ` rguenth at gcc dot gnu dot org
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: rguenth at gcc dot gnu dot org @ 2009-01-05 11:29 UTC (permalink / raw)
  To: gcc-bugs



-- 

rguenth at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Severity|enhancement                 |normal
           Keywords|                            |missed-optimization


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38682


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug target/38682] [4.4 Regression] speed regression with sse intrinsics and -ffast-math
  2008-12-31 10:55 [Bug tree-optimization/38682] New: [4.4 Regression] speed regression with sse intrinsics and -ffast-math tim at klingt dot org
                   ` (2 preceding siblings ...)
  2009-01-05 11:29 ` rguenth at gcc dot gnu dot org
@ 2009-01-06 20:22 ` rguenth at gcc dot gnu dot org
  2009-01-31 14:33 ` bonzini at gnu dot org
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: rguenth at gcc dot gnu dot org @ 2009-01-06 20:22 UTC (permalink / raw)
  To: gcc-bugs



-- 

rguenth at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Priority|P3                          |P2


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38682


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug target/38682] [4.4 Regression] speed regression with sse intrinsics and -ffast-math
  2008-12-31 10:55 [Bug tree-optimization/38682] New: [4.4 Regression] speed regression with sse intrinsics and -ffast-math tim at klingt dot org
                   ` (3 preceding siblings ...)
  2009-01-06 20:22 ` rguenth at gcc dot gnu dot org
@ 2009-01-31 14:33 ` bonzini at gnu dot org
  2009-01-31 15:39 ` bonzini at gnu dot org
  2009-01-31 16:23 ` bonzini at gnu dot org
  6 siblings, 0 replies; 8+ messages in thread
From: bonzini at gnu dot org @ 2009-01-31 14:33 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #2 from bonzini at gnu dot org  2009-01-31 14:33 -------
??? Andrew, there's 11 vs. 12 instructions.


-- 

bonzini at gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |bonzini at gnu dot org
             Status|UNCONFIRMED                 |NEW
     Ever Confirmed|0                           |1
   Last reconfirmed|0000-00-00 00:00:00         |2009-01-31 14:33:24
               date|                            |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38682


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug target/38682] [4.4 Regression] speed regression with sse intrinsics and -ffast-math
  2008-12-31 10:55 [Bug tree-optimization/38682] New: [4.4 Regression] speed regression with sse intrinsics and -ffast-math tim at klingt dot org
                   ` (4 preceding siblings ...)
  2009-01-31 14:33 ` bonzini at gnu dot org
@ 2009-01-31 15:39 ` bonzini at gnu dot org
  2009-01-31 16:23 ` bonzini at gnu dot org
  6 siblings, 0 replies; 8+ messages in thread
From: bonzini at gnu dot org @ 2009-01-31 15:39 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #3 from bonzini at gnu dot org  2009-01-31 15:39 -------
In both versions there's some pessimization in the expansion of _mm_set_ps and
_mm_set_ps1.  It's probably easier to fix than the regression.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38682


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug target/38682] [4.4 Regression] speed regression with sse intrinsics and -ffast-math
  2008-12-31 10:55 [Bug tree-optimization/38682] New: [4.4 Regression] speed regression with sse intrinsics and -ffast-math tim at klingt dot org
                   ` (5 preceding siblings ...)
  2009-01-31 15:39 ` bonzini at gnu dot org
@ 2009-01-31 16:23 ` bonzini at gnu dot org
  6 siblings, 0 replies; 8+ messages in thread
From: bonzini at gnu dot org @ 2009-01-31 16:23 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #4 from bonzini at gnu dot org  2009-01-31 16:23 -------
I see optimal code with trunk:

.LFB8:
        movaps  %xmm1, %xmm4
        shrl    $2, %edx
        mov     %edx, %edx
        xorl    %eax, %eax
        addss   %xmm0, %xmm4
        movaps  %xmm4, %xmm3
        unpcklps        %xmm0, %xmm4
        addss   %xmm1, %xmm3
        movaps  %xmm3, %xmm2
        addss   %xmm1, %xmm2
        mulss   .LC0(%rip), %xmm1
        unpcklps        %xmm3, %xmm2
        shufps  $0, %xmm1, %xmm1
        movlhps %xmm4, %xmm2
        .align 16
.L2:
        movaps  (%rsi,%rax), %xmm0
        addps   %xmm2, %xmm0
        addps   %xmm1, %xmm2
        movaps  %xmm0, (%rdi,%rax)
        addq    $16, %rax
        subq    $1, %rdx
        jne     .L2


-- 

bonzini at gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|NEW                         |RESOLVED
         Resolution|                            |FIXED


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38682


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2009-01-31 16:23 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-12-31 10:55 [Bug tree-optimization/38682] New: [4.4 Regression] speed regression with sse intrinsics and -ffast-math tim at klingt dot org
2008-12-31 15:35 ` [Bug target/38682] " pinskia at gcc dot gnu dot org
2009-01-01  5:16 ` pinskia at gcc dot gnu dot org
2009-01-05 11:29 ` rguenth at gcc dot gnu dot org
2009-01-06 20:22 ` rguenth at gcc dot gnu dot org
2009-01-31 14:33 ` bonzini at gnu dot org
2009-01-31 15:39 ` bonzini at gnu dot org
2009-01-31 16:23 ` bonzini at gnu dot org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).