public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c/38134]  New: gcc-4.4 speed regression with sse code
@ 2008-11-15 15:56 tim at klingt dot org
  2008-11-15 15:57 ` [Bug c/38134] " tim at klingt dot org
                   ` (21 more replies)
  0 siblings, 22 replies; 23+ messages in thread
From: tim at klingt dot org @ 2008-11-15 15:56 UTC (permalink / raw)
  To: gcc-bugs

the attached program, a SIMD-ified version of the tanf function, shows a 20%
performance regression from gcc-4.3 to gcc-4.4:

the compared compilers are
g++-4.3
Using built-in specs.
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu 4.3.2-1ubuntu11'
--with-bugurl=file:///usr/share/doc/gcc-4.3/README.Bugs
--enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --enable-shared
--with-system-zlib --libexecdir=/usr/lib --without-included-gettext
--enable-threads=posix --enable-nls --with-gxx-include-dir=/usr/include/c++/4.3
--program-suffix=-4.3 --enable-clocale=gnu --enable-libstdcxx-debug
--enable-objc-gc --enable-mpfr --enable-checking=release
--build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 4.3.2 (Ubuntu 4.3.2-1ubuntu11) 

and

Using built-in specs.
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu
20081024-0ubuntu1' --with-bugurl=file:///usr/share/doc/gcc-snapshot/README.Bugs
--enable-languages=c,c++,java,fortran,objc,obj-c++,ada
--prefix=/usr/lib/gcc-snapshot --enable-shared --with-system-zlib --disable-nls
--enable-clocale=gnu --enable-libstdcxx-debug --enable-java-awt=gtk
--enable-gtk-cairo --disable-plugin --with-java-home=/usr/lib/gcc-snapshot
--enable-java-home --with-jvm-root-dir=/usr/lib/gcc-snapshot/jvm
--with-jvm-jar-dir=/usr/lib/gcc-snapshot/jvm-exports
--with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-mpfr
--disable-werror --build=x86_64-linux-gnu --host=x86_64-linux-gnu
--target=x86_64-linux-gnu
Thread model: posix
gcc version 4.4.0 20081024 (experimental) [trunk revision 141342] (Ubuntu
20081024-0ubuntu1) 

the interesting part is the inner loop of the bench_1_simd function. 
gcc-4.4 generates:

.L54:
        movaps  in(%rax), %xmm0
        movdqa  %xmm14, %xmm3
        addl    $4, %edx
        pand    %xmm0, %xmm3
#APP
# 325 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm3, %xmm0
# 0 "" 2
#NO_APP
        movaps  %xmm0, %xmm4
        movaps  %xmm0, %xmm15
        mulps   %xmm13, %xmm4
#APP
# 328 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvttps2dq %xmm4, %xmm4
# 0 "" 2
#NO_APP
        movdqa  %xmm4, %xmm1
        pand    %xmm12, %xmm1
        paddd   %xmm1, %xmm4
#APP
# 331 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvtdq2ps  %xmm4, %xmm1
# 0 "" 2
#NO_APP
        pand    .LC15(%rip), %xmm4
        movaps  %xmm1, %xmm2
        psrld   $1, %xmm4
        mulps   %xmm11, %xmm2
        subps   %xmm2, %xmm15
        movaps  %xmm15, %xmm2
        movaps  %xmm1, %xmm15
        mulps   %xmm9, %xmm1
        mulps   %xmm10, %xmm15
        subps   %xmm15, %xmm2
        movaps  %xmm8, %xmm15
        subps   %xmm1, %xmm2
        cmpltps %xmm0, %xmm15
        movaps  %xmm2, %xmm1
        mulps   %xmm2, %xmm1
        movaps  %xmm1, %xmm0
        mulps   %xmm7, %xmm0
        addps   .LC10(%rip), %xmm0
        mulps   %xmm1, %xmm0
        addps   .LC11(%rip), %xmm0
        mulps   %xmm1, %xmm0
        addps   .LC12(%rip), %xmm0
        mulps   %xmm1, %xmm0
        addps   .LC13(%rip), %xmm0
        mulps   %xmm1, %xmm0
        addps   .LC14(%rip), %xmm0
        mulps   %xmm1, %xmm0
        movdqa  %xmm5, %xmm1
        mulps   %xmm2, %xmm0
        psubd   %xmm4, %xmm1
        addps   %xmm2, %xmm0
        movdqa  %xmm1, %xmm4
#APP
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andps %xmm15, %xmm0
# 0 "" 2
#NO_APP
        movaps  .LC16(%rip), %xmm1
#APP
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andnps %xmm2, %xmm15
# 0 "" 2
#NO_APP
        movaps  %xmm6, %xmm2
#APP
# 344 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        orps  %xmm15, %xmm0
# 0 "" 2
#NO_APP
        addps   %xmm0, %xmm1
        divps   %xmm1, %xmm2
        movaps  %xmm2, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andps %xmm4, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andnps %xmm0, %xmm4
# 0 "" 2
#NO_APP
        movaps  %xmm1, %xmm0
#APP
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        orps  %xmm4, %xmm0
# 0 "" 2
# 349 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm3, %xmm0
# 0 "" 2
#NO_APP
        movaps  %xmm0, out(%rax)
        addq    $16, %rax
        cmpl    %edi, %edx
        jne     .L54

while gcc-4.3 generates:
.L48:
        movaps  in(%rax), %xmm2
        movdqa  .LC2(%rip), %xmm5
        movaps  .LC3(%rip), %xmm0
        pand    %xmm2, %xmm5
        movdqa  .LC4(%rip), %xmm4
        movaps  .LC5(%rip), %xmm1
        addl    $4, %edx
#APP
# 325 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm5, %xmm2
# 0 "" 2
#NO_APP
        mulps   %xmm2, %xmm0
        movaps  %xmm2, %xmm3
#APP
# 328 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvttps2dq %xmm0, %xmm0
# 0 "" 2
#NO_APP
        pand    %xmm0, %xmm4
        paddd   %xmm0, %xmm4
#APP
# 331 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvtdq2ps  %xmm4, %xmm0
# 0 "" 2
#NO_APP
        pand    %xmm9, %xmm4
        mulps   %xmm0, %xmm1
        psrld   $1, %xmm4
        subps   %xmm1, %xmm3
        movaps  .LC6(%rip), %xmm1
        mulps   %xmm0, %xmm1
        mulps   .LC7(%rip), %xmm0
        subps   %xmm1, %xmm3
        subps   %xmm0, %xmm3
        movaps  .LC8(%rip), %xmm0
        movaps  %xmm3, %xmm1
        cmpltps %xmm2, %xmm0
        mulps   %xmm3, %xmm1
        movaps  %xmm0, %xmm2
        movaps  %xmm1, %xmm0
        mulps   %xmm15, %xmm0
        addps   %xmm14, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm13, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm12, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm11, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm10, %xmm0
        mulps   %xmm1, %xmm0
        mulps   %xmm3, %xmm0
        addps   %xmm3, %xmm0
#APP
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andps %xmm2, %xmm0
# 0 "" 2
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andnps %xmm3, %xmm2
# 0 "" 2
#NO_APP
        movaps  %xmm7, %xmm3
#APP
# 344 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        orps  %xmm2, %xmm0
# 0 "" 2
#NO_APP
        movdqa  %xmm6, %xmm2
        movaps  %xmm0, %xmm1
        psubd   %xmm4, %xmm2
        addps   %xmm8, %xmm1
        divps   %xmm1, %xmm3
        movaps  %xmm3, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andps %xmm2, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andnps %xmm0, %xmm2
# 0 "" 2
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        orps  %xmm2, %xmm1
# 0 "" 2
# 349 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm5, %xmm1
# 0 "" 2
#NO_APP
        movaps  %xmm1, out(%rax)
        addq    $16, %rax
        cmpl    %edi, %edx
        jne     .L48

the code generated by gcc-4.4 requires more memory accesses. the code was
generated with the flags -O3 -march=core2. while the assembly code is generated
for the x86_64 architecture, similar results can be seen with x86 code (4.4 is
about 14% slower than 4.3).


-- 
           Summary: gcc-4.4 speed regression with sse code
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: tim at klingt dot org


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug c/38134] gcc-4.4 speed regression with sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
@ 2008-11-15 15:57 ` tim at klingt dot org
  2008-11-15 16:47 ` ubizjak at gmail dot com
                   ` (20 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: tim at klingt dot org @ 2008-11-15 15:57 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #1 from tim at klingt dot org  2008-11-15 15:55 -------
Created an attachment (id=16684)
 --> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=16684&action=view)
compressed preprocessed source


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug c/38134] gcc-4.4 speed regression with sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
  2008-11-15 15:57 ` [Bug c/38134] " tim at klingt dot org
@ 2008-11-15 16:47 ` ubizjak at gmail dot com
  2008-11-15 17:05 ` tim at klingt dot org
                   ` (19 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: ubizjak at gmail dot com @ 2008-11-15 16:47 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #2 from ubizjak at gmail dot com  2008-11-15 16:46 -------
Can you try with -fno-ira?


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug c/38134] gcc-4.4 speed regression with sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
  2008-11-15 15:57 ` [Bug c/38134] " tim at klingt dot org
  2008-11-15 16:47 ` ubizjak at gmail dot com
@ 2008-11-15 17:05 ` tim at klingt dot org
  2008-11-15 20:32 ` [Bug target/38134] [4.4 Regression] speed regression with inline-asm " rguenth at gcc dot gnu dot org
                   ` (18 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: tim at klingt dot org @ 2008-11-15 17:05 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #3 from tim at klingt dot org  2008-11-15 17:04 -------
i tried to run the benchmark with -fno-ira, which turned out to be about 20%
slower than without the flag.

anyway, i found, that the preprocessed source generated by gcc-4.3 cannot be
compiled with gcc-4.4 ... the specific file can be found here
http://tim.klingt.org/git?p=nova-server.git;a=blob;f=benchmarks/simd_tan_benchmarks.cpp;h=c575996de0dc916a8e654af7a36350be9c22327e;hb=844d3cf991cbbbe74b34277696dda0b940769b28


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (2 preceding siblings ...)
  2008-11-15 17:05 ` tim at klingt dot org
@ 2008-11-15 20:32 ` rguenth at gcc dot gnu dot org
  2008-11-16  0:07 ` hjl dot tools at gmail dot com
                   ` (17 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: rguenth at gcc dot gnu dot org @ 2008-11-15 20:32 UTC (permalink / raw)
  To: gcc-bugs



-- 

rguenth at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
 GCC target triplet|                            |x86_64-*-*-*
           Keywords|                            |missed-optimization
            Summary|gcc-4.4 speed regression    |[4.4 Regression] speed
                   |with inline-asm sse code    |regression with inline-asm
                   |                            |sse code
   Target Milestone|---                         |4.4.0
            Version|unknown                     |4.4.0


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (3 preceding siblings ...)
  2008-11-15 20:32 ` [Bug target/38134] [4.4 Regression] speed regression with inline-asm " rguenth at gcc dot gnu dot org
@ 2008-11-16  0:07 ` hjl dot tools at gmail dot com
  2008-11-16  0:09 ` hjl dot tools at gmail dot com
                   ` (16 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: hjl dot tools at gmail dot com @ 2008-11-16  0:07 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #4 from hjl dot tools at gmail dot com  2008-11-16 00:06 -------
(In reply to comment #3)
> i tried to run the benchmark with -fno-ira, which turned out to be about 20%
> slower than without the flag.
> 

Can you try "-O3 -march=core2 -mtune=generic" and "-O3 -march=core2
-mtune=generic -fno-ira" ?


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (4 preceding siblings ...)
  2008-11-16  0:07 ` hjl dot tools at gmail dot com
@ 2008-11-16  0:09 ` hjl dot tools at gmail dot com
  2008-11-17  9:36 ` jakub at gcc dot gnu dot org
                   ` (15 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: hjl dot tools at gmail dot com @ 2008-11-16  0:09 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #5 from hjl dot tools at gmail dot com  2008-11-16 00:08 -------
(In reply to comment #3)
> anyway, i found, that the preprocessed source generated by gcc-4.3 cannot be
> compiled with gcc-4.4 ... the specific file can be found here
> http://tim.klingt.org/git?p=nova-server.git;a=blob;f=benchmarks/simd_tan_benchmarks.cpp;h=c575996de0dc916a8e654af7a36350be9c22327e;hb=844d3cf991cbbbe74b34277696dda0b940769b28
> 

Please upload both preprocessed sources generated by gcc 4.3 and gcc 4.4.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (5 preceding siblings ...)
  2008-11-16  0:09 ` hjl dot tools at gmail dot com
@ 2008-11-17  9:36 ` jakub at gcc dot gnu dot org
  2008-11-17 18:13 ` ubizjak at gmail dot com
                   ` (14 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: jakub at gcc dot gnu dot org @ 2008-11-17  9:36 UTC (permalink / raw)
  To: gcc-bugs



-- 

jakub at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Priority|P3                          |P2


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (6 preceding siblings ...)
  2008-11-17  9:36 ` jakub at gcc dot gnu dot org
@ 2008-11-17 18:13 ` ubizjak at gmail dot com
  2008-11-17 18:20 ` tim at klingt dot org
                   ` (13 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: ubizjak at gmail dot com @ 2008-11-17 18:13 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #6 from ubizjak at gmail dot com  2008-11-17 18:11 -------
I think that

        addps   .LC10(%rip), %xmm0
        mulps   %xmm1, %xmm0
        addps   .LC11(%rip), %xmm0
        mulps   %xmm1, %xmm0
        addps   .LC12(%rip), %xmm0
        mulps   %xmm1, %xmm0
        addps   .LC13(%rip), %xmm0
        mulps   %xmm1, %xmm0
        addps   .LC14(%rip), %xmm0
        mulps   %xmm1, %xmm0

is the bottleneck. Perhaps we should split implicit memory operands out of the
insn by some generic peephole (if a register is available) and schedule loads
appropriately.

OTOH, loop optimizer should detect invariant loads and move them out of the
loop.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (7 preceding siblings ...)
  2008-11-17 18:13 ` ubizjak at gmail dot com
@ 2008-11-17 18:20 ` tim at klingt dot org
  2008-11-17 18:31 ` tim at klingt dot org
                   ` (12 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: tim at klingt dot org @ 2008-11-17 18:20 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #7 from tim at klingt dot org  2008-11-17 18:19 -------
Created an attachment (id=16710)
 --> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=16710&action=view)
compressed preprocessed source, gcc-4.4


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (8 preceding siblings ...)
  2008-11-17 18:20 ` tim at klingt dot org
@ 2008-11-17 18:31 ` tim at klingt dot org
  2008-11-17 18:50 ` tim at klingt dot org
                   ` (11 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: tim at klingt dot org @ 2008-11-17 18:31 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #8 from tim at klingt dot org  2008-11-17 18:30 -------
Created an attachment (id=16711)
 --> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=16711&action=view)
16684: compressed preprocessed source, gcc-4.3


-- 

tim at klingt dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
  Attachment #16684|0                           |1
        is obsolete|                            |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (9 preceding siblings ...)
  2008-11-17 18:31 ` tim at klingt dot org
@ 2008-11-17 18:50 ` tim at klingt dot org
  2009-02-03  9:47 ` bonzini at gnu dot org
                   ` (10 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: tim at klingt dot org @ 2008-11-17 18:50 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #9 from tim at klingt dot org  2008-11-17 18:49 -------
i have updated the test program and attached preprocessed sources of gcc 4.3
and 4.4

the loop prefix contains
4.4 (9 invariant loads, one store of a generated constant to the stack):
        pxor    %xmm5, %xmm5
        xorl    %eax, %eax
        movdqa  %xmm5, %xmm0
        xorl    %edx, %edx
        pcmpeqd %xmm5, %xmm0
        movaps  .LC2(%rip), %xmm14
        psrld   $31, %xmm0
        movdqa  .LC3(%rip), %xmm13
        pslld   $31, %xmm0
        movaps  .LC4(%rip), %xmm12
        movaps  .LC5(%rip), %xmm11
        movaps  .LC6(%rip), %xmm10
        movaps  .LC7(%rip), %xmm9
        movaps  .LC8(%rip), %xmm8
        movaps  .LC9(%rip), %xmm7
        movaps  .LC16(%rip), %xmm6
        movdqa  %xmm0, -24(%rsp)

4.3 (8 invariant loads, store one generated constant in register):
        pxor    %xmm6, %xmm6
        xorl    %edx, %edx
        movdqa  %xmm6, %xmm0
        xorl    %eax, %eax
        pcmpeqd %xmm6, %xmm0
        movaps  .LC9(%rip), %xmm15
        psrld   $31, %xmm0
        movaps  .LC10(%rip), %xmm14
        pslld   $31, %xmm0
        movaps  .LC11(%rip), %xmm13
        movaps  .LC12(%rip), %xmm12
        movaps  .LC13(%rip), %xmm11
        movdqa  .LC14(%rip), %xmm10
        movaps  .LC15(%rip), %xmm9
        movaps  .LC16(%rip), %xmm8
        movdqa  %xmm0, %xmm7




body:
4.3 (7 loads from memory, 2 loads are used in the next instruction, others are
used later):
.L48:
        movaps  in(%rax), %xmm2
        movaps  .LC2(%rip), %xmm0
        movdqa  %xmm2, %xmm5
        movdqa  .LC3(%rip), %xmm4
        pand    %xmm7, %xmm5
        movaps  .LC4(%rip), %xmm1
        addl    $4, %edx
#APP
# 324 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm5, %xmm2
# 0 "" 2
#NO_APP
        mulps   %xmm2, %xmm0
        movaps  %xmm2, %xmm3
#APP
# 327 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvttps2dq %xmm0, %xmm0
# 0 "" 2
#NO_APP
        pand    %xmm0, %xmm4
        paddd   %xmm0, %xmm4
#APP
# 330 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvtdq2ps  %xmm4, %xmm0
# 0 "" 2
#NO_APP
        pand    %xmm10, %xmm4
        mulps   %xmm0, %xmm1
        psrld   $1, %xmm4
        subps   %xmm1, %xmm3
        movaps  .LC5(%rip), %xmm1
        mulps   %xmm0, %xmm1
        mulps   .LC6(%rip), %xmm0
        subps   %xmm1, %xmm3
        subps   %xmm0, %xmm3
        movaps  .LC7(%rip), %xmm0
        movaps  %xmm3, %xmm1
        cmpltps %xmm2, %xmm0
        mulps   %xmm3, %xmm1
        movaps  %xmm0, %xmm2
        movaps  .LC8(%rip), %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm15, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm14, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm13, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm12, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm11, %xmm0
        mulps   %xmm1, %xmm0
        mulps   %xmm3, %xmm0
        addps   %xmm3, %xmm0
#APP
# 341 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andps %xmm2, %xmm0
# 0 "" 2
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andnps %xmm3, %xmm2
# 0 "" 2
#NO_APP
        movaps  %xmm8, %xmm3
#APP
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        orps  %xmm2, %xmm0
# 0 "" 2
#NO_APP
        movdqa  %xmm6, %xmm2
        movaps  %xmm0, %xmm1
        psubd   %xmm4, %xmm2
        addps   %xmm9, %xmm1
        divps   %xmm1, %xmm3
        movaps  %xmm3, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andps %xmm2, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andnps %xmm0, %xmm2
# 0 "" 2
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        orps  %xmm2, %xmm1
# 0 "" 2
# 348 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm5, %xmm1
# 0 "" 2
#NO_APP
        movaps  %xmm1, out(%rax)
        addq    $16, %rax
        cmpl    %edi, %edx
        jne     .L48


4.4 (6 loads from memory, 5 loads are used as memory argument to opcodes):
.L54:
        movaps  in(%rax), %xmm2
        movdqa  -24(%rsp), %xmm3
        addl    $4, %edx
        pand    %xmm2, %xmm3
#APP
# 324 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm3, %xmm2
# 0 "" 2
#NO_APP
        movaps  %xmm2, %xmm4
        movaps  %xmm2, %xmm15
        mulps   %xmm14, %xmm4
#APP
# 327 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvttps2dq %xmm4, %xmm4
# 0 "" 2
#NO_APP
        movdqa  %xmm4, %xmm0
        pand    %xmm13, %xmm0
        paddd   %xmm0, %xmm4
#APP
# 330 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvtdq2ps  %xmm4, %xmm0
# 0 "" 2
#NO_APP
        pand    .LC14(%rip), %xmm4
        movaps  %xmm0, %xmm1
        psrld   $1, %xmm4
        mulps   %xmm12, %xmm1
        subps   %xmm1, %xmm15
        movaps  %xmm15, %xmm1
        movaps  %xmm0, %xmm15
        mulps   %xmm10, %xmm0
        mulps   %xmm11, %xmm15
        subps   %xmm15, %xmm1
        movaps  %xmm9, %xmm15
        subps   %xmm0, %xmm1
        cmpltps %xmm2, %xmm15
        movaps  %xmm1, %xmm0
        movaps  %xmm15, %xmm2
        mulps   %xmm1, %xmm0
        movaps  %xmm0, %xmm15
        mulps   %xmm8, %xmm15
        addps   %xmm7, %xmm15
        mulps   %xmm0, %xmm15
        addps   .LC10(%rip), %xmm15
        mulps   %xmm0, %xmm15
        addps   .LC11(%rip), %xmm15
        mulps   %xmm0, %xmm15
        addps   .LC12(%rip), %xmm15
        mulps   %xmm0, %xmm15
        addps   .LC13(%rip), %xmm15
        mulps   %xmm15, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm1, %xmm0
#APP
# 341 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andps %xmm2, %xmm0
# 0 "" 2
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andnps %xmm1, %xmm2
# 0 "" 2
#NO_APP
        movdqa  %xmm5, %xmm1
        psubd   %xmm4, %xmm1
        movdqa  %xmm1, %xmm4
        movaps  .LC15(%rip), %xmm1
#APP
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        orps  %xmm2, %xmm0
# 0 "" 2
#NO_APP
        movaps  %xmm6, %xmm2
        addps   %xmm0, %xmm1
        divps   %xmm1, %xmm2
        movaps  %xmm2, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andps %xmm4, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andnps %xmm0, %xmm4
# 0 "" 2
#NO_APP
        movaps  %xmm1, %xmm0
#APP
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        orps  %xmm4, %xmm0
# 0 "" 2
# 348 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm3, %xmm0
# 0 "" 2
#NO_APP
        movaps  %xmm0, out(%rax)
        addq    $16, %rax
        cmpl    %edi, %edx
        jne     .L54

hth


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (10 preceding siblings ...)
  2008-11-17 18:50 ` tim at klingt dot org
@ 2009-02-03  9:47 ` bonzini at gnu dot org
  2009-02-03 10:36 ` ubizjak at gmail dot com
                   ` (9 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: bonzini at gnu dot org @ 2009-02-03  9:47 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #10 from bonzini at gnu dot org  2009-02-03 09:47 -------
Can you try the patch of PR38824?


-- 

bonzini at gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |WAITING


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (11 preceding siblings ...)
  2009-02-03  9:47 ` bonzini at gnu dot org
@ 2009-02-03 10:36 ` ubizjak at gmail dot com
  2009-02-03 11:17 ` bonzini at gnu dot org
                   ` (8 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: ubizjak at gmail dot com @ 2009-02-03 10:36 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #11 from ubizjak at gmail dot com  2009-02-03 10:36 -------
(In reply to comment #10)
> Can you try the patch of PR38824?

I have tried with a similar peephole2 recognizer. The problem is that there is
no spare "x" register to allocate as a temporary, so peephole2 is ineffective
in this particular case.


-- 

ubizjak at gmail dot com changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Last reconfirmed|0000-00-00 00:00:00         |2009-02-03 10:36:46
               date|                            |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (12 preceding siblings ...)
  2009-02-03 10:36 ` ubizjak at gmail dot com
@ 2009-02-03 11:17 ` bonzini at gnu dot org
  2009-02-03 11:34 ` ubizjak at gmail dot com
                   ` (7 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: bonzini at gnu dot org @ 2009-02-03 11:17 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #12 from bonzini at gnu dot org  2009-02-03 11:17 -------
What if we forbid altogether memory operands and we *synthesize* them with a
peephole2?  Anyway, it seems safe to me to declare this a dup of PR38824?


-- 

bonzini at gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|WAITING                     |NEW
     Ever Confirmed|0                           |1
   Last reconfirmed|2009-02-03 10:36:46         |2009-02-03 11:17:38
               date|                            |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (13 preceding siblings ...)
  2009-02-03 11:17 ` bonzini at gnu dot org
@ 2009-02-03 11:34 ` ubizjak at gmail dot com
  2009-02-13  9:57 ` [Bug target/38134] [4.4 Regression] speed regression with many loop invariants bonzini at gnu dot org
                   ` (6 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: ubizjak at gmail dot com @ 2009-02-03 11:34 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #13 from ubizjak at gmail dot com  2009-02-03 11:34 -------
(In reply to comment #12)
> What if we forbid altogether memory operands and we *synthesize* them with a
> peephole2?  Anyway, it seems safe to me to declare this a dup of PR38824?

I think that we will hit PR 19398 then...


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4 Regression] speed regression with many loop invariants
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (14 preceding siblings ...)
  2009-02-03 11:34 ` ubizjak at gmail dot com
@ 2009-02-13  9:57 ` bonzini at gnu dot org
  2009-02-13 10:03 ` steven at gcc dot gnu dot org
                   ` (5 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: bonzini at gnu dot org @ 2009-02-13  9:57 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #14 from bonzini at gnu dot org  2009-02-13 09:57 -------
It seems to me that it would help to have a postreload LIM pass that would
concentrate on loop-invariant memory accesses that are as cheap as or cheaper than
loading back a spill.  These would be excluded from the current (non-strict)
LIM.  Is it crazy?


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4 Regression] speed regression with many loop invariants
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (15 preceding siblings ...)
  2009-02-13  9:57 ` [Bug target/38134] [4.4 Regression] speed regression with many loop invariants bonzini at gnu dot org
@ 2009-02-13 10:03 ` steven at gcc dot gnu dot org
  2009-04-21 16:02 ` [Bug target/38134] [4.4/4.5 " jakub at gcc dot gnu dot org
                   ` (4 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: steven at gcc dot gnu dot org @ 2009-02-13 10:03 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #15 from steven at gcc dot gnu dot org  2009-02-13 10:03 -------
Re. Comment #14

No, this is not crazy.  It is called postreload-gcse.  But it is a stupid pass
that doesn't handle all cases it ought to handle.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4/4.5 Regression] speed regression with many loop invariants
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (16 preceding siblings ...)
  2009-02-13 10:03 ` steven at gcc dot gnu dot org
@ 2009-04-21 16:02 ` jakub at gcc dot gnu dot org
  2009-07-22 10:35 ` jakub at gcc dot gnu dot org
                   ` (3 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: jakub at gcc dot gnu dot org @ 2009-04-21 16:02 UTC (permalink / raw)
  To: gcc-bugs



-- 

jakub at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Target Milestone|4.4.0                       |4.4.1


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4/4.5 Regression] speed regression with many loop invariants
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (17 preceding siblings ...)
  2009-04-21 16:02 ` [Bug target/38134] [4.4/4.5 " jakub at gcc dot gnu dot org
@ 2009-07-22 10:35 ` jakub at gcc dot gnu dot org
  2009-10-15 12:56 ` jakub at gcc dot gnu dot org
                   ` (2 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: jakub at gcc dot gnu dot org @ 2009-07-22 10:35 UTC (permalink / raw)
  To: gcc-bugs



-- 

jakub at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Target Milestone|4.4.1                       |4.4.2


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4/4.5 Regression] speed regression with many loop invariants
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (18 preceding siblings ...)
  2009-07-22 10:35 ` jakub at gcc dot gnu dot org
@ 2009-10-15 12:56 ` jakub at gcc dot gnu dot org
  2010-01-21 13:16 ` jakub at gcc dot gnu dot org
  2010-04-30  9:01 ` [Bug target/38134] [4.4/4.5/4.6 " jakub at gcc dot gnu dot org
  21 siblings, 0 replies; 23+ messages in thread
From: jakub at gcc dot gnu dot org @ 2009-10-15 12:56 UTC (permalink / raw)
  To: gcc-bugs



-- 

jakub at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Target Milestone|4.4.2                       |4.4.3


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4/4.5 Regression] speed regression with many loop invariants
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (19 preceding siblings ...)
  2009-10-15 12:56 ` jakub at gcc dot gnu dot org
@ 2010-01-21 13:16 ` jakub at gcc dot gnu dot org
  2010-04-30  9:01 ` [Bug target/38134] [4.4/4.5/4.6 " jakub at gcc dot gnu dot org
  21 siblings, 0 replies; 23+ messages in thread
From: jakub at gcc dot gnu dot org @ 2010-01-21 13:16 UTC (permalink / raw)
  To: gcc-bugs



-- 

jakub at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Target Milestone|4.4.3                       |4.4.4


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Bug target/38134] [4.4/4.5/4.6 Regression] speed regression with many loop invariants
  2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
                   ` (20 preceding siblings ...)
  2010-01-21 13:16 ` jakub at gcc dot gnu dot org
@ 2010-04-30  9:01 ` jakub at gcc dot gnu dot org
  21 siblings, 0 replies; 23+ messages in thread
From: jakub at gcc dot gnu dot org @ 2010-04-30  9:01 UTC (permalink / raw)
  To: gcc-bugs



-- 

jakub at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Target Milestone|4.4.4                       |4.4.5


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2010-04-30  8:54 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
2008-11-15 15:57 ` [Bug c/38134] " tim at klingt dot org
2008-11-15 16:47 ` ubizjak at gmail dot com
2008-11-15 17:05 ` tim at klingt dot org
2008-11-15 20:32 ` [Bug target/38134] [4.4 Regression] speed regression with inline-asm " rguenth at gcc dot gnu dot org
2008-11-16  0:07 ` hjl dot tools at gmail dot com
2008-11-16  0:09 ` hjl dot tools at gmail dot com
2008-11-17  9:36 ` jakub at gcc dot gnu dot org
2008-11-17 18:13 ` ubizjak at gmail dot com
2008-11-17 18:20 ` tim at klingt dot org
2008-11-17 18:31 ` tim at klingt dot org
2008-11-17 18:50 ` tim at klingt dot org
2009-02-03  9:47 ` bonzini at gnu dot org
2009-02-03 10:36 ` ubizjak at gmail dot com
2009-02-03 11:17 ` bonzini at gnu dot org
2009-02-03 11:34 ` ubizjak at gmail dot com
2009-02-13  9:57 ` [Bug target/38134] [4.4 Regression] speed regression with many loop invariants bonzini at gnu dot org
2009-02-13 10:03 ` steven at gcc dot gnu dot org
2009-04-21 16:02 ` [Bug target/38134] [4.4/4.5 " jakub at gcc dot gnu dot org
2009-07-22 10:35 ` jakub at gcc dot gnu dot org
2009-10-15 12:56 ` jakub at gcc dot gnu dot org
2010-01-21 13:16 ` jakub at gcc dot gnu dot org
2010-04-30  9:01 ` [Bug target/38134] [4.4/4.5/4.6 " jakub at gcc dot gnu dot org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).