public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c/38134] New: gcc-4.4 speed regression with sse code
@ 2008-11-15 15:56 tim at klingt dot org
2008-11-15 15:57 ` [Bug c/38134] " tim at klingt dot org
` (21 more replies)
0 siblings, 22 replies; 23+ messages in thread
From: tim at klingt dot org @ 2008-11-15 15:56 UTC (permalink / raw)
To: gcc-bugs
the attached program, a SIMD-ified version of the tanf function, shows a 20%
performance regression from gcc-4.3 to gcc-4.4:
the compared compilers are
g++-4.3
Using built-in specs.
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu 4.3.2-1ubuntu11'
--with-bugurl=file:///usr/share/doc/gcc-4.3/README.Bugs
--enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --enable-shared
--with-system-zlib --libexecdir=/usr/lib --without-included-gettext
--enable-threads=posix --enable-nls --with-gxx-include-dir=/usr/include/c++/4.3
--program-suffix=-4.3 --enable-clocale=gnu --enable-libstdcxx-debug
--enable-objc-gc --enable-mpfr --enable-checking=release
--build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 4.3.2 (Ubuntu 4.3.2-1ubuntu11)
and
Using built-in specs.
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu
20081024-0ubuntu1' --with-bugurl=file:///usr/share/doc/gcc-snapshot/README.Bugs
--enable-languages=c,c++,java,fortran,objc,obj-c++,ada
--prefix=/usr/lib/gcc-snapshot --enable-shared --with-system-zlib --disable-nls
--enable-clocale=gnu --enable-libstdcxx-debug --enable-java-awt=gtk
--enable-gtk-cairo --disable-plugin --with-java-home=/usr/lib/gcc-snapshot
--enable-java-home --with-jvm-root-dir=/usr/lib/gcc-snapshot/jvm
--with-jvm-jar-dir=/usr/lib/gcc-snapshot/jvm-exports
--with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-mpfr
--disable-werror --build=x86_64-linux-gnu --host=x86_64-linux-gnu
--target=x86_64-linux-gnu
Thread model: posix
gcc version 4.4.0 20081024 (experimental) [trunk revision 141342] (Ubuntu
20081024-0ubuntu1)
the interesting part is the inner loop of the bench_1_simd function.
gcc-4.4 generates:
.L54:
movaps in(%rax), %xmm0
movdqa %xmm14, %xmm3
addl $4, %edx
pand %xmm0, %xmm3
#APP
# 325 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
xorps %xmm3, %xmm0
# 0 "" 2
#NO_APP
movaps %xmm0, %xmm4
movaps %xmm0, %xmm15
mulps %xmm13, %xmm4
#APP
# 328 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
cvttps2dq %xmm4, %xmm4
# 0 "" 2
#NO_APP
movdqa %xmm4, %xmm1
pand %xmm12, %xmm1
paddd %xmm1, %xmm4
#APP
# 331 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
cvtdq2ps %xmm4, %xmm1
# 0 "" 2
#NO_APP
pand .LC15(%rip), %xmm4
movaps %xmm1, %xmm2
psrld $1, %xmm4
mulps %xmm11, %xmm2
subps %xmm2, %xmm15
movaps %xmm15, %xmm2
movaps %xmm1, %xmm15
mulps %xmm9, %xmm1
mulps %xmm10, %xmm15
subps %xmm15, %xmm2
movaps %xmm8, %xmm15
subps %xmm1, %xmm2
cmpltps %xmm0, %xmm15
movaps %xmm2, %xmm1
mulps %xmm2, %xmm1
movaps %xmm1, %xmm0
mulps %xmm7, %xmm0
addps .LC10(%rip), %xmm0
mulps %xmm1, %xmm0
addps .LC11(%rip), %xmm0
mulps %xmm1, %xmm0
addps .LC12(%rip), %xmm0
mulps %xmm1, %xmm0
addps .LC13(%rip), %xmm0
mulps %xmm1, %xmm0
addps .LC14(%rip), %xmm0
mulps %xmm1, %xmm0
movdqa %xmm5, %xmm1
mulps %xmm2, %xmm0
psubd %xmm4, %xmm1
addps %xmm2, %xmm0
movdqa %xmm1, %xmm4
#APP
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
andps %xmm15, %xmm0
# 0 "" 2
#NO_APP
movaps .LC16(%rip), %xmm1
#APP
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
andnps %xmm2, %xmm15
# 0 "" 2
#NO_APP
movaps %xmm6, %xmm2
#APP
# 344 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
orps %xmm15, %xmm0
# 0 "" 2
#NO_APP
addps %xmm0, %xmm1
divps %xmm1, %xmm2
movaps %xmm2, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
andps %xmm4, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
andnps %xmm0, %xmm4
# 0 "" 2
#NO_APP
movaps %xmm1, %xmm0
#APP
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
orps %xmm4, %xmm0
# 0 "" 2
# 349 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
xorps %xmm3, %xmm0
# 0 "" 2
#NO_APP
movaps %xmm0, out(%rax)
addq $16, %rax
cmpl %edi, %edx
jne .L54
while gcc-4.3 generates:
.L48:
movaps in(%rax), %xmm2
movdqa .LC2(%rip), %xmm5
movaps .LC3(%rip), %xmm0
pand %xmm2, %xmm5
movdqa .LC4(%rip), %xmm4
movaps .LC5(%rip), %xmm1
addl $4, %edx
#APP
# 325 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
xorps %xmm5, %xmm2
# 0 "" 2
#NO_APP
mulps %xmm2, %xmm0
movaps %xmm2, %xmm3
#APP
# 328 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
cvttps2dq %xmm0, %xmm0
# 0 "" 2
#NO_APP
pand %xmm0, %xmm4
paddd %xmm0, %xmm4
#APP
# 331 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
cvtdq2ps %xmm4, %xmm0
# 0 "" 2
#NO_APP
pand %xmm9, %xmm4
mulps %xmm0, %xmm1
psrld $1, %xmm4
subps %xmm1, %xmm3
movaps .LC6(%rip), %xmm1
mulps %xmm0, %xmm1
mulps .LC7(%rip), %xmm0
subps %xmm1, %xmm3
subps %xmm0, %xmm3
movaps .LC8(%rip), %xmm0
movaps %xmm3, %xmm1
cmpltps %xmm2, %xmm0
mulps %xmm3, %xmm1
movaps %xmm0, %xmm2
movaps %xmm1, %xmm0
mulps %xmm15, %xmm0
addps %xmm14, %xmm0
mulps %xmm1, %xmm0
addps %xmm13, %xmm0
mulps %xmm1, %xmm0
addps %xmm12, %xmm0
mulps %xmm1, %xmm0
addps %xmm11, %xmm0
mulps %xmm1, %xmm0
addps %xmm10, %xmm0
mulps %xmm1, %xmm0
mulps %xmm3, %xmm0
addps %xmm3, %xmm0
#APP
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
andps %xmm2, %xmm0
# 0 "" 2
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
andnps %xmm3, %xmm2
# 0 "" 2
#NO_APP
movaps %xmm7, %xmm3
#APP
# 344 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
orps %xmm2, %xmm0
# 0 "" 2
#NO_APP
movdqa %xmm6, %xmm2
movaps %xmm0, %xmm1
psubd %xmm4, %xmm2
addps %xmm8, %xmm1
divps %xmm1, %xmm3
movaps %xmm3, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
andps %xmm2, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
andnps %xmm0, %xmm2
# 0 "" 2
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
orps %xmm2, %xmm1
# 0 "" 2
# 349 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
xorps %xmm5, %xmm1
# 0 "" 2
#NO_APP
movaps %xmm1, out(%rax)
addq $16, %rax
cmpl %edi, %edx
jne .L48
the code generated by gcc-4.4 requires more memory accesses. the code was
generated with the flags -O3 -march=core2. while the assembly code is generated
for the x86_64 architecture, similar results can be seen with x86 code (4.4 is
about 14% slower than 4.3)
--
Summary: gcc-4.4 speed regression with sse code
Product: gcc
Version: unknown
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: tim at klingt dot org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug c/38134] gcc-4.4 speed regression with sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
@ 2008-11-15 15:57 ` tim at klingt dot org
2008-11-15 16:47 ` ubizjak at gmail dot com
` (20 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: tim at klingt dot org @ 2008-11-15 15:57 UTC (permalink / raw)
To: gcc-bugs
------- Comment #1 from tim at klingt dot org 2008-11-15 15:55 -------
Created an attachment (id=16684)
--> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=16684&action=view)
compressed preprocessed source
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug c/38134] gcc-4.4 speed regression with sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
2008-11-15 15:57 ` [Bug c/38134] " tim at klingt dot org
@ 2008-11-15 16:47 ` ubizjak at gmail dot com
2008-11-15 17:05 ` tim at klingt dot org
` (19 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: ubizjak at gmail dot com @ 2008-11-15 16:47 UTC (permalink / raw)
To: gcc-bugs
------- Comment #2 from ubizjak at gmail dot com 2008-11-15 16:46 -------
Can you try with -fno-ira?
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug c/38134] gcc-4.4 speed regression with sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
2008-11-15 15:57 ` [Bug c/38134] " tim at klingt dot org
2008-11-15 16:47 ` ubizjak at gmail dot com
@ 2008-11-15 17:05 ` tim at klingt dot org
2008-11-15 20:32 ` [Bug target/38134] [4.4 Regression] speed regression with inline-asm " rguenth at gcc dot gnu dot org
` (18 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: tim at klingt dot org @ 2008-11-15 17:05 UTC (permalink / raw)
To: gcc-bugs
------- Comment #3 from tim at klingt dot org 2008-11-15 17:04 -------
i tried to run the benchmark with -fno-ira, which turned out to be about 20%
slower than without the flag.
anyway, i found, that the preprocessed source generated by gcc-4.3 cannot be
compiled with gcc-4.4 ... the specific file can be found here
http://tim.klingt.org/git?p=nova-server.git;a=blob;f=benchmarks/simd_tan_benchmarks.cpp;h=c575996de0dc916a8e654af7a36350be9c22327e;hb=844d3cf991cbbbe74b34277696dda0b940769b28
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (2 preceding siblings ...)
2008-11-15 17:05 ` tim at klingt dot org
@ 2008-11-15 20:32 ` rguenth at gcc dot gnu dot org
2008-11-16 0:07 ` hjl dot tools at gmail dot com
` (17 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: rguenth at gcc dot gnu dot org @ 2008-11-15 20:32 UTC (permalink / raw)
To: gcc-bugs
--
rguenth at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
GCC target triplet| |x86_64-*-*-*
Keywords| |missed-optimization
Summary|gcc-4.4 speed regression |[4.4 Regression] speed
|with inline-asm sse code |regression with inline-asm
| |sse code
Target Milestone|--- |4.4.0
Version|unknown |4.4.0
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (3 preceding siblings ...)
2008-11-15 20:32 ` [Bug target/38134] [4.4 Regression] speed regression with inline-asm " rguenth at gcc dot gnu dot org
@ 2008-11-16 0:07 ` hjl dot tools at gmail dot com
2008-11-16 0:09 ` hjl dot tools at gmail dot com
` (16 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: hjl dot tools at gmail dot com @ 2008-11-16 0:07 UTC (permalink / raw)
To: gcc-bugs
------- Comment #4 from hjl dot tools at gmail dot com 2008-11-16 00:06 -------
(In reply to comment #3)
> i tried to run the benchmark with -fno-ira, which turned out to be about 20%
> slower than without the flag.
>
Can you try "-O3 -march=core2 -mtune=generic" and "-O3 -march=core2
-mtune=generic -fno-ira" ?
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (4 preceding siblings ...)
2008-11-16 0:07 ` hjl dot tools at gmail dot com
@ 2008-11-16 0:09 ` hjl dot tools at gmail dot com
2008-11-17 9:36 ` jakub at gcc dot gnu dot org
` (15 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: hjl dot tools at gmail dot com @ 2008-11-16 0:09 UTC (permalink / raw)
To: gcc-bugs
------- Comment #5 from hjl dot tools at gmail dot com 2008-11-16 00:08 -------
(In reply to comment #3)
> anyway, i found, that the preprocessed source generated by gcc-4.3 cannot be
> compiled with gcc-4.4 ... the specific file can be found here
> http://tim.klingt.org/git?p=nova-server.git;a=blob;f=benchmarks/simd_tan_benchmarks.cpp;h=c575996de0dc916a8e654af7a36350be9c22327e;hb=844d3cf991cbbbe74b34277696dda0b940769b28
>
Please upload both preprocessed sources generated by gcc 4.3 and gcc 4.4.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (5 preceding siblings ...)
2008-11-16 0:09 ` hjl dot tools at gmail dot com
@ 2008-11-17 9:36 ` jakub at gcc dot gnu dot org
2008-11-17 18:13 ` ubizjak at gmail dot com
` (14 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: jakub at gcc dot gnu dot org @ 2008-11-17 9:36 UTC (permalink / raw)
To: gcc-bugs
--
jakub at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
Priority|P3 |P2
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (6 preceding siblings ...)
2008-11-17 9:36 ` jakub at gcc dot gnu dot org
@ 2008-11-17 18:13 ` ubizjak at gmail dot com
2008-11-17 18:20 ` tim at klingt dot org
` (13 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: ubizjak at gmail dot com @ 2008-11-17 18:13 UTC (permalink / raw)
To: gcc-bugs
------- Comment #6 from ubizjak at gmail dot com 2008-11-17 18:11 -------
I think that
addps .LC10(%rip), %xmm0
mulps %xmm1, %xmm0
addps .LC11(%rip), %xmm0
mulps %xmm1, %xmm0
addps .LC12(%rip), %xmm0
mulps %xmm1, %xmm0
addps .LC13(%rip), %xmm0
mulps %xmm1, %xmm0
addps .LC14(%rip), %xmm0
mulps %xmm1, %xmm0
is the bottleneck. Perhaps we should split implicit memory operands out of the
insn by some generic peephole (if the register is available) and schedule loads
appropriately.
OTOH, loop optimizer should detect invariant loads and move them out of the
loop.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (7 preceding siblings ...)
2008-11-17 18:13 ` ubizjak at gmail dot com
@ 2008-11-17 18:20 ` tim at klingt dot org
2008-11-17 18:31 ` tim at klingt dot org
` (12 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: tim at klingt dot org @ 2008-11-17 18:20 UTC (permalink / raw)
To: gcc-bugs
------- Comment #7 from tim at klingt dot org 2008-11-17 18:19 -------
Created an attachment (id=16710)
--> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=16710&action=view)
compressed preprocessed source, gcc-4.4
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (8 preceding siblings ...)
2008-11-17 18:20 ` tim at klingt dot org
@ 2008-11-17 18:31 ` tim at klingt dot org
2008-11-17 18:50 ` tim at klingt dot org
` (11 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: tim at klingt dot org @ 2008-11-17 18:31 UTC (permalink / raw)
To: gcc-bugs
------- Comment #8 from tim at klingt dot org 2008-11-17 18:30 -------
Created an attachment (id=16711)
--> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=16711&action=view)
16684: compressed preprocessed source, gcc-4.3
--
tim at klingt dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
Attachment #16684|0 |1
is obsolete| |
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (9 preceding siblings ...)
2008-11-17 18:31 ` tim at klingt dot org
@ 2008-11-17 18:50 ` tim at klingt dot org
2009-02-03 9:47 ` bonzini at gnu dot org
` (10 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: tim at klingt dot org @ 2008-11-17 18:50 UTC (permalink / raw)
To: gcc-bugs
------- Comment #9 from tim at klingt dot org 2008-11-17 18:49 -------
i have updated the test program and attached preprocessed sources of gcc 4.3
and 4.4
the loop prefix contains
4.4 (9 invariant loads, one store of a generated constant to the stack):
pxor %xmm5, %xmm5
xorl %eax, %eax
movdqa %xmm5, %xmm0
xorl %edx, %edx
pcmpeqd %xmm5, %xmm0
movaps .LC2(%rip), %xmm14
psrld $31, %xmm0
movdqa .LC3(%rip), %xmm13
pslld $31, %xmm0
movaps .LC4(%rip), %xmm12
movaps .LC5(%rip), %xmm11
movaps .LC6(%rip), %xmm10
movaps .LC7(%rip), %xmm9
movaps .LC8(%rip), %xmm8
movaps .LC9(%rip), %xmm7
movaps .LC16(%rip), %xmm6
movdqa %xmm0, -24(%rsp)
4.3 (8 invariant loads, store one generated constant in register):
pxor %xmm6, %xmm6
xorl %edx, %edx
movdqa %xmm6, %xmm0
xorl %eax, %eax
pcmpeqd %xmm6, %xmm0
movaps .LC9(%rip), %xmm15
psrld $31, %xmm0
movaps .LC10(%rip), %xmm14
pslld $31, %xmm0
movaps .LC11(%rip), %xmm13
movaps .LC12(%rip), %xmm12
movaps .LC13(%rip), %xmm11
movdqa .LC14(%rip), %xmm10
movaps .LC15(%rip), %xmm9
movaps .LC16(%rip), %xmm8
movdqa %xmm0, %xmm7
body:
4.3 (7 loads from memory, 2 loads are used in the next instruction, others are
used later):
.L48:
movaps in(%rax), %xmm2
movaps .LC2(%rip), %xmm0
movdqa %xmm2, %xmm5
movdqa .LC3(%rip), %xmm4
pand %xmm7, %xmm5
movaps .LC4(%rip), %xmm1
addl $4, %edx
#APP
# 324 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
xorps %xmm5, %xmm2
# 0 "" 2
#NO_APP
mulps %xmm2, %xmm0
movaps %xmm2, %xmm3
#APP
# 327 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
cvttps2dq %xmm0, %xmm0
# 0 "" 2
#NO_APP
pand %xmm0, %xmm4
paddd %xmm0, %xmm4
#APP
# 330 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
cvtdq2ps %xmm4, %xmm0
# 0 "" 2
#NO_APP
pand %xmm10, %xmm4
mulps %xmm0, %xmm1
psrld $1, %xmm4
subps %xmm1, %xmm3
movaps .LC5(%rip), %xmm1
mulps %xmm0, %xmm1
mulps .LC6(%rip), %xmm0
subps %xmm1, %xmm3
subps %xmm0, %xmm3
movaps .LC7(%rip), %xmm0
movaps %xmm3, %xmm1
cmpltps %xmm2, %xmm0
mulps %xmm3, %xmm1
movaps %xmm0, %xmm2
movaps .LC8(%rip), %xmm0
mulps %xmm1, %xmm0
addps %xmm15, %xmm0
mulps %xmm1, %xmm0
addps %xmm14, %xmm0
mulps %xmm1, %xmm0
addps %xmm13, %xmm0
mulps %xmm1, %xmm0
addps %xmm12, %xmm0
mulps %xmm1, %xmm0
addps %xmm11, %xmm0
mulps %xmm1, %xmm0
mulps %xmm3, %xmm0
addps %xmm3, %xmm0
#APP
# 341 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
andps %xmm2, %xmm0
# 0 "" 2
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
andnps %xmm3, %xmm2
# 0 "" 2
#NO_APP
movaps %xmm8, %xmm3
#APP
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
orps %xmm2, %xmm0
# 0 "" 2
#NO_APP
movdqa %xmm6, %xmm2
movaps %xmm0, %xmm1
psubd %xmm4, %xmm2
addps %xmm9, %xmm1
divps %xmm1, %xmm3
movaps %xmm3, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
andps %xmm2, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
andnps %xmm0, %xmm2
# 0 "" 2
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
orps %xmm2, %xmm1
# 0 "" 2
# 348 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
xorps %xmm5, %xmm1
# 0 "" 2
#NO_APP
movaps %xmm1, out(%rax)
addq $16, %rax
cmpl %edi, %edx
jne .L48
4.4 (6 loads from memory, 5 loads are used as memory argument to opcodes):
.L54:
movaps in(%rax), %xmm2
movdqa -24(%rsp), %xmm3
addl $4, %edx
pand %xmm2, %xmm3
#APP
# 324 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
xorps %xmm3, %xmm2
# 0 "" 2
#NO_APP
movaps %xmm2, %xmm4
movaps %xmm2, %xmm15
mulps %xmm14, %xmm4
#APP
# 327 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
cvttps2dq %xmm4, %xmm4
# 0 "" 2
#NO_APP
movdqa %xmm4, %xmm0
pand %xmm13, %xmm0
paddd %xmm0, %xmm4
#APP
# 330 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
cvtdq2ps %xmm4, %xmm0
# 0 "" 2
#NO_APP
pand .LC14(%rip), %xmm4
movaps %xmm0, %xmm1
psrld $1, %xmm4
mulps %xmm12, %xmm1
subps %xmm1, %xmm15
movaps %xmm15, %xmm1
movaps %xmm0, %xmm15
mulps %xmm10, %xmm0
mulps %xmm11, %xmm15
subps %xmm15, %xmm1
movaps %xmm9, %xmm15
subps %xmm0, %xmm1
cmpltps %xmm2, %xmm15
movaps %xmm1, %xmm0
movaps %xmm15, %xmm2
mulps %xmm1, %xmm0
movaps %xmm0, %xmm15
mulps %xmm8, %xmm15
addps %xmm7, %xmm15
mulps %xmm0, %xmm15
addps .LC10(%rip), %xmm15
mulps %xmm0, %xmm15
addps .LC11(%rip), %xmm15
mulps %xmm0, %xmm15
addps .LC12(%rip), %xmm15
mulps %xmm0, %xmm15
addps .LC13(%rip), %xmm15
mulps %xmm15, %xmm0
mulps %xmm1, %xmm0
addps %xmm1, %xmm0
#APP
# 341 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
andps %xmm2, %xmm0
# 0 "" 2
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
andnps %xmm1, %xmm2
# 0 "" 2
#NO_APP
movdqa %xmm5, %xmm1
psubd %xmm4, %xmm1
movdqa %xmm1, %xmm4
movaps .LC15(%rip), %xmm1
#APP
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
orps %xmm2, %xmm0
# 0 "" 2
#NO_APP
movaps %xmm6, %xmm2
addps %xmm0, %xmm1
divps %xmm1, %xmm2
movaps %xmm2, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
andps %xmm4, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
andnps %xmm0, %xmm4
# 0 "" 2
#NO_APP
movaps %xmm1, %xmm0
#APP
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
orps %xmm4, %xmm0
# 0 "" 2
# 348 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
xorps %xmm3, %xmm0
# 0 "" 2
#NO_APP
movaps %xmm0, out(%rax)
addq $16, %rax
cmpl %edi, %edx
jne .L54
hth
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (10 preceding siblings ...)
2008-11-17 18:50 ` tim at klingt dot org
@ 2009-02-03 9:47 ` bonzini at gnu dot org
2009-02-03 10:36 ` ubizjak at gmail dot com
` (9 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: bonzini at gnu dot org @ 2009-02-03 9:47 UTC (permalink / raw)
To: gcc-bugs
------- Comment #10 from bonzini at gnu dot org 2009-02-03 09:47 -------
Can you try the patch of PR38824?
--
bonzini at gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |WAITING
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (11 preceding siblings ...)
2009-02-03 9:47 ` bonzini at gnu dot org
@ 2009-02-03 10:36 ` ubizjak at gmail dot com
2009-02-03 11:17 ` bonzini at gnu dot org
` (8 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: ubizjak at gmail dot com @ 2009-02-03 10:36 UTC (permalink / raw)
To: gcc-bugs
------- Comment #11 from ubizjak at gmail dot com 2009-02-03 10:36 -------
(In reply to comment #10)
> Can you try the patch of PR38824?
I have tried with a similar peephole2 recognizer. The problem is that there is
no spare "x" register to allocate as a temporary, so peephole2 is ineffective
in this particular case.
--
ubizjak at gmail dot com changed:
What |Removed |Added
----------------------------------------------------------------------------
Last reconfirmed|0000-00-00 00:00:00 |2009-02-03 10:36:46
date| |
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (12 preceding siblings ...)
2009-02-03 10:36 ` ubizjak at gmail dot com
@ 2009-02-03 11:17 ` bonzini at gnu dot org
2009-02-03 11:34 ` ubizjak at gmail dot com
` (7 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: bonzini at gnu dot org @ 2009-02-03 11:17 UTC (permalink / raw)
To: gcc-bugs
------- Comment #12 from bonzini at gnu dot org 2009-02-03 11:17 -------
What if we forbid altogether memory operands and we *synthesize* them with a
peephole2? Anyway, it seems safe to me to declare this a dup of PR38824?
--
bonzini at gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|WAITING |NEW
Ever Confirmed|0 |1
Last reconfirmed|2009-02-03 10:36:46 |2009-02-03 11:17:38
date| |
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (13 preceding siblings ...)
2009-02-03 11:17 ` bonzini at gnu dot org
@ 2009-02-03 11:34 ` ubizjak at gmail dot com
2009-02-13 9:57 ` [Bug target/38134] [4.4 Regression] speed regression with many loop invariants bonzini at gnu dot org
` (6 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: ubizjak at gmail dot com @ 2009-02-03 11:34 UTC (permalink / raw)
To: gcc-bugs
------- Comment #13 from ubizjak at gmail dot com 2009-02-03 11:34 -------
(In reply to comment #12)
> What if we forbid altogether memory operands and we *synthesize* them with a
> peephole2? Anyway, it seems safe to me to declare this a dup of PR38824?
I think that we will hit PR 19398 then...
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with many loop invariants
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (14 preceding siblings ...)
2009-02-03 11:34 ` ubizjak at gmail dot com
@ 2009-02-13 9:57 ` bonzini at gnu dot org
2009-02-13 10:03 ` steven at gcc dot gnu dot org
` (5 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: bonzini at gnu dot org @ 2009-02-13 9:57 UTC (permalink / raw)
To: gcc-bugs
------- Comment #14 from bonzini at gnu dot org 2009-02-13 09:57 -------
It seems to me that it would help to have a postreload LIM pass that would
concentrate on loop-invariant memory accesses that are as cheap or cheaper than
loading back a spill. These would be excluded from the current (non-strict)
LIM. Is it crazy?
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with many loop invariants
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (15 preceding siblings ...)
2009-02-13 9:57 ` [Bug target/38134] [4.4 Regression] speed regression with many loop invariants bonzini at gnu dot org
@ 2009-02-13 10:03 ` steven at gcc dot gnu dot org
2009-04-21 16:02 ` [Bug target/38134] [4.4/4.5 " jakub at gcc dot gnu dot org
` (4 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: steven at gcc dot gnu dot org @ 2009-02-13 10:03 UTC (permalink / raw)
To: gcc-bugs
------- Comment #15 from steven at gcc dot gnu dot org 2009-02-13 10:03 -------
Re. Comment #14
No, this is not crazy. It is called postreload-gcse. But it is a stupid pass
that doesn't handle all cases it ought to handle.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4/4.5 Regression] speed regression with many loop invariants
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (16 preceding siblings ...)
2009-02-13 10:03 ` steven at gcc dot gnu dot org
@ 2009-04-21 16:02 ` jakub at gcc dot gnu dot org
2009-07-22 10:35 ` jakub at gcc dot gnu dot org
` (3 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: jakub at gcc dot gnu dot org @ 2009-04-21 16:02 UTC (permalink / raw)
To: gcc-bugs
--
jakub at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
Target Milestone|4.4.0 |4.4.1
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4/4.5 Regression] speed regression with many loop invariants
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (17 preceding siblings ...)
2009-04-21 16:02 ` [Bug target/38134] [4.4/4.5 " jakub at gcc dot gnu dot org
@ 2009-07-22 10:35 ` jakub at gcc dot gnu dot org
2009-10-15 12:56 ` jakub at gcc dot gnu dot org
` (2 subsequent siblings)
21 siblings, 0 replies; 23+ messages in thread
From: jakub at gcc dot gnu dot org @ 2009-07-22 10:35 UTC (permalink / raw)
To: gcc-bugs
--
jakub at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
Target Milestone|4.4.1 |4.4.2
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4/4.5 Regression] speed regression with many loop invariants
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (18 preceding siblings ...)
2009-07-22 10:35 ` jakub at gcc dot gnu dot org
@ 2009-10-15 12:56 ` jakub at gcc dot gnu dot org
2010-01-21 13:16 ` jakub at gcc dot gnu dot org
2010-04-30 9:01 ` [Bug target/38134] [4.4/4.5/4.6 " jakub at gcc dot gnu dot org
21 siblings, 0 replies; 23+ messages in thread
From: jakub at gcc dot gnu dot org @ 2009-10-15 12:56 UTC (permalink / raw)
To: gcc-bugs
--
jakub at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
Target Milestone|4.4.2 |4.4.3
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4/4.5 Regression] speed regression with many loop invariants
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (19 preceding siblings ...)
2009-10-15 12:56 ` jakub at gcc dot gnu dot org
@ 2010-01-21 13:16 ` jakub at gcc dot gnu dot org
2010-04-30 9:01 ` [Bug target/38134] [4.4/4.5/4.6 " jakub at gcc dot gnu dot org
21 siblings, 0 replies; 23+ messages in thread
From: jakub at gcc dot gnu dot org @ 2010-01-21 13:16 UTC (permalink / raw)
To: gcc-bugs
--
jakub at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
Target Milestone|4.4.3 |4.4.4
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4/4.5/4.6 Regression] speed regression with many loop invariants
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
` (20 preceding siblings ...)
2010-01-21 13:16 ` jakub at gcc dot gnu dot org
@ 2010-04-30 9:01 ` jakub at gcc dot gnu dot org
21 siblings, 0 replies; 23+ messages in thread
From: jakub at gcc dot gnu dot org @ 2010-04-30 9:01 UTC (permalink / raw)
To: gcc-bugs
--
jakub at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
Target Milestone|4.4.4 |4.4.5
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
^ permalink raw reply [flat|nested] 23+ messages in thread
end of thread, other threads:[~2010-04-30 8:54 UTC | newest]
Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org
2008-11-15 15:57 ` [Bug c/38134] " tim at klingt dot org
2008-11-15 16:47 ` ubizjak at gmail dot com
2008-11-15 17:05 ` tim at klingt dot org
2008-11-15 20:32 ` [Bug target/38134] [4.4 Regression] speed regression with inline-asm " rguenth at gcc dot gnu dot org
2008-11-16 0:07 ` hjl dot tools at gmail dot com
2008-11-16 0:09 ` hjl dot tools at gmail dot com
2008-11-17 9:36 ` jakub at gcc dot gnu dot org
2008-11-17 18:13 ` ubizjak at gmail dot com
2008-11-17 18:20 ` tim at klingt dot org
2008-11-17 18:31 ` tim at klingt dot org
2008-11-17 18:50 ` tim at klingt dot org
2009-02-03 9:47 ` bonzini at gnu dot org
2009-02-03 10:36 ` ubizjak at gmail dot com
2009-02-03 11:17 ` bonzini at gnu dot org
2009-02-03 11:34 ` ubizjak at gmail dot com
2009-02-13 9:57 ` [Bug target/38134] [4.4 Regression] speed regression with many loop invariants bonzini at gnu dot org
2009-02-13 10:03 ` steven at gcc dot gnu dot org
2009-04-21 16:02 ` [Bug target/38134] [4.4/4.5 " jakub at gcc dot gnu dot org
2009-07-22 10:35 ` jakub at gcc dot gnu dot org
2009-10-15 12:56 ` jakub at gcc dot gnu dot org
2010-01-21 13:16 ` jakub at gcc dot gnu dot org
2010-04-30 9:01 ` [Bug target/38134] [4.4/4.5/4.6 " jakub at gcc dot gnu dot org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).