public inbox for gcc-bugs@sourceware.org help / color / mirror / Atom feed
* [Bug c/38134] New: gcc-4.4 speed regression with sse code @ 2008-11-15 15:56 tim at klingt dot org 2008-11-15 15:57 ` [Bug c/38134] " tim at klingt dot org ` (21 more replies) 0 siblings, 22 replies; 23+ messages in thread From: tim at klingt dot org @ 2008-11-15 15:56 UTC (permalink / raw) To: gcc-bugs the attached program, a simdfied version of the tanf function, shows a 20% performance regression from gcc-4.3 to gcc-4.4: the compared compilers are g++-4.3 Using built-in specs. Target: x86_64-linux-gnu Configured with: ../src/configure -v --with-pkgversion='Ubuntu 4.3.2-1ubuntu11' --with-bugurl=file:///usr/share/doc/gcc-4.3/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --enable-shared --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --enable-nls --with-gxx-include-dir=/usr/include/c++/4.3 --program-suffix=-4.3 --enable-clocale=gnu --enable-libstdcxx-debug --enable-objc-gc --enable-mpfr --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu Thread model: posix gcc version 4.3.2 (Ubuntu 4.3.2-1ubuntu11) and Using built-in specs. 
Target: x86_64-linux-gnu Configured with: ../src/configure -v --with-pkgversion='Ubuntu 20081024-0ubuntu1' --with-bugurl=file:///usr/share/doc/gcc-snapshot/README.Bugs --enable-languages=c,c++,java,fortran,objc,obj-c++,ada --prefix=/usr/lib/gcc-snapshot --enable-shared --with-system-zlib --disable-nls --enable-clocale=gnu --enable-libstdcxx-debug --enable-java-awt=gtk --enable-gtk-cairo --disable-plugin --with-java-home=/usr/lib/gcc-snapshot --enable-java-home --with-jvm-root-dir=/usr/lib/gcc-snapshot/jvm --with-jvm-jar-dir=/usr/lib/gcc-snapshot/jvm-exports --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-mpfr --disable-werror --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu Thread model: posix gcc version 4.4.0 20081024 (experimental) [trunk revision 141342] (Ubuntu 20081024-0ubuntu1) the interesting part is the inner loop of the bench_1_simd function. gcc-4.4 generates: .L54: movaps in(%rax), %xmm0 movdqa %xmm14, %xmm3 addl $4, %edx pand %xmm0, %xmm3 #APP # 325 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 xorps %xmm3, %xmm0 # 0 "" 2 #NO_APP movaps %xmm0, %xmm4 movaps %xmm0, %xmm15 mulps %xmm13, %xmm4 #APP # 328 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 cvttps2dq %xmm4, %xmm4 # 0 "" 2 #NO_APP movdqa %xmm4, %xmm1 pand %xmm12, %xmm1 paddd %xmm1, %xmm4 #APP # 331 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 cvtdq2ps %xmm4, %xmm1 # 0 "" 2 #NO_APP pand .LC15(%rip), %xmm4 movaps %xmm1, %xmm2 psrld $1, %xmm4 mulps %xmm11, %xmm2 subps %xmm2, %xmm15 movaps %xmm15, %xmm2 movaps %xmm1, %xmm15 mulps %xmm9, %xmm1 mulps %xmm10, %xmm15 subps %xmm15, %xmm2 movaps %xmm8, %xmm15 subps %xmm1, %xmm2 cmpltps %xmm0, %xmm15 movaps %xmm2, %xmm1 mulps %xmm2, %xmm1 movaps %xmm1, %xmm0 mulps %xmm7, %xmm0 addps .LC10(%rip), %xmm0 mulps %xmm1, %xmm0 addps .LC11(%rip), %xmm0 mulps %xmm1, %xmm0 addps .LC12(%rip), %xmm0 mulps %xmm1, %xmm0 addps .LC13(%rip), %xmm0 mulps %xmm1, 
%xmm0 addps .LC14(%rip), %xmm0 mulps %xmm1, %xmm0 movdqa %xmm5, %xmm1 mulps %xmm2, %xmm0 psubd %xmm4, %xmm1 addps %xmm2, %xmm0 movdqa %xmm1, %xmm4 #APP # 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 andps %xmm15, %xmm0 # 0 "" 2 #NO_APP movaps .LC16(%rip), %xmm1 #APP # 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 andnps %xmm2, %xmm15 # 0 "" 2 #NO_APP movaps %xmm6, %xmm2 #APP # 344 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 orps %xmm15, %xmm0 # 0 "" 2 #NO_APP addps %xmm0, %xmm1 divps %xmm1, %xmm2 movaps %xmm2, %xmm1 #APP # 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1 andps %xmm4, %xmm1 # 0 "" 2 # 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1 andnps %xmm0, %xmm4 # 0 "" 2 #NO_APP movaps %xmm1, %xmm0 #APP # 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1 orps %xmm4, %xmm0 # 0 "" 2 # 349 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 xorps %xmm3, %xmm0 # 0 "" 2 #NO_APP movaps %xmm0, out(%rax) addq $16, %rax cmpl %edi, %edx jne .L54 while gcc-4.3 generates: .L48: movaps in(%rax), %xmm2 movdqa .LC2(%rip), %xmm5 movaps .LC3(%rip), %xmm0 pand %xmm2, %xmm5 movdqa .LC4(%rip), %xmm4 movaps .LC5(%rip), %xmm1 addl $4, %edx #APP # 325 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 xorps %xmm5, %xmm2 # 0 "" 2 #NO_APP mulps %xmm2, %xmm0 movaps %xmm2, %xmm3 #APP # 328 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 cvttps2dq %xmm0, %xmm0 # 0 "" 2 #NO_APP pand %xmm0, %xmm4 paddd %xmm0, %xmm4 #APP # 331 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 cvtdq2ps %xmm4, %xmm0 # 0 "" 2 #NO_APP pand %xmm9, %xmm4 mulps %xmm0, %xmm1 psrld $1, %xmm4 subps %xmm1, %xmm3 movaps .LC6(%rip), %xmm1 mulps %xmm0, %xmm1 mulps .LC7(%rip), %xmm0 subps %xmm1, %xmm3 subps %xmm0, %xmm3 movaps .LC8(%rip), %xmm0 movaps %xmm3, %xmm1 cmpltps %xmm2, %xmm0 mulps %xmm3, %xmm1 movaps 
%xmm0, %xmm2 movaps %xmm1, %xmm0 mulps %xmm15, %xmm0 addps %xmm14, %xmm0 mulps %xmm1, %xmm0 addps %xmm13, %xmm0 mulps %xmm1, %xmm0 addps %xmm12, %xmm0 mulps %xmm1, %xmm0 addps %xmm11, %xmm0 mulps %xmm1, %xmm0 addps %xmm10, %xmm0 mulps %xmm1, %xmm0 mulps %xmm3, %xmm0 addps %xmm3, %xmm0 #APP # 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 andps %xmm2, %xmm0 # 0 "" 2 # 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 andnps %xmm3, %xmm2 # 0 "" 2 #NO_APP movaps %xmm7, %xmm3 #APP # 344 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 orps %xmm2, %xmm0 # 0 "" 2 #NO_APP movdqa %xmm6, %xmm2 movaps %xmm0, %xmm1 psubd %xmm4, %xmm2 addps %xmm8, %xmm1 divps %xmm1, %xmm3 movaps %xmm3, %xmm1 #APP # 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1 andps %xmm2, %xmm1 # 0 "" 2 # 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1 andnps %xmm0, %xmm2 # 0 "" 2 # 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1 orps %xmm2, %xmm1 # 0 "" 2 # 349 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 xorps %xmm5, %xmm1 # 0 "" 2 #NO_APP movaps %xmm1, out(%rax) addq $16, %rax cmpl %edi, %edx jne .L48 the code generated by gcc-4.4 requires more memory access. the code was generated with the flags -O3 -march=core2. while the assembly code is generated for the x86_64 architecture, similar results can be seen with x86 code (4.4 is about 14% slower than 4.3) -- Summary: gcc-4.4 speed regression with sse code Product: gcc Version: unknown Status: UNCONFIRMED Severity: normal Priority: P3 Component: c AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: tim at klingt dot org http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug c/38134] gcc-4.4 speed regression with sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org @ 2008-11-15 15:57 ` tim at klingt dot org 2008-11-15 16:47 ` ubizjak at gmail dot com ` (20 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: tim at klingt dot org @ 2008-11-15 15:57 UTC (permalink / raw) To: gcc-bugs ------- Comment #1 from tim at klingt dot org 2008-11-15 15:55 ------- Created an attachment (id=16684) --> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=16684&action=view) compressed preprocessed source -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug c/38134] gcc-4.4 speed regression with sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org 2008-11-15 15:57 ` [Bug c/38134] " tim at klingt dot org @ 2008-11-15 16:47 ` ubizjak at gmail dot com 2008-11-15 17:05 ` tim at klingt dot org ` (19 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: ubizjak at gmail dot com @ 2008-11-15 16:47 UTC (permalink / raw) To: gcc-bugs ------- Comment #2 from ubizjak at gmail dot com 2008-11-15 16:46 ------- Can you try with -fno-ira? -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug c/38134] gcc-4.4 speed regression with sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org 2008-11-15 15:57 ` [Bug c/38134] " tim at klingt dot org 2008-11-15 16:47 ` ubizjak at gmail dot com @ 2008-11-15 17:05 ` tim at klingt dot org 2008-11-15 20:32 ` [Bug target/38134] [4.4 Regression] speed regression with inline-asm " rguenth at gcc dot gnu dot org ` (18 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: tim at klingt dot org @ 2008-11-15 17:05 UTC (permalink / raw) To: gcc-bugs ------- Comment #3 from tim at klingt dot org 2008-11-15 17:04 ------- i tried to run the benchmark with -fno-ira, which turned out to be about 20% slower than without the flag. anyway, i found, that the preprocessed source generated by gcc-4.3 cannot be compiled with gcc-4.4 ... the specific file can be found here http://tim.klingt.org/git?p=nova-server.git;a=blob;f=benchmarks/simd_tan_benchmarks.cpp;h=c575996de0dc916a8e654af7a36350be9c22327e;hb=844d3cf991cbbbe74b34277696dda0b940769b28 -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (2 preceding siblings ...) 2008-11-15 17:05 ` tim at klingt dot org @ 2008-11-15 20:32 ` rguenth at gcc dot gnu dot org 2008-11-16 0:07 ` hjl dot tools at gmail dot com ` (17 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: rguenth at gcc dot gnu dot org @ 2008-11-15 20:32 UTC (permalink / raw) To: gcc-bugs -- rguenth at gcc dot gnu dot org changed: What |Removed |Added ---------------------------------------------------------------------------- GCC target triplet| |x86_64-*-*-* Keywords| |missed-optimization Summary|gcc-4.4 speed regression |[4.4 Regression] speed |with inline-asm sse code |regression with inline-asm | |sse code Target Milestone|--- |4.4.0 Version|unknown |4.4.0 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (3 preceding siblings ...) 2008-11-15 20:32 ` [Bug target/38134] [4.4 Regression] speed regression with inline-asm " rguenth at gcc dot gnu dot org @ 2008-11-16 0:07 ` hjl dot tools at gmail dot com 2008-11-16 0:09 ` hjl dot tools at gmail dot com ` (16 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: hjl dot tools at gmail dot com @ 2008-11-16 0:07 UTC (permalink / raw) To: gcc-bugs ------- Comment #4 from hjl dot tools at gmail dot com 2008-11-16 00:06 ------- (In reply to comment #3) > i tried to run the benchmark with -fno-ira, which turned out to be about 20% > slower than without the flag. > Can you try "-O3 -march=core2 -mtune=generic" and "-O3 -march=core2 -mtune=generic -fno-ira" ? -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (4 preceding siblings ...) 2008-11-16 0:07 ` hjl dot tools at gmail dot com @ 2008-11-16 0:09 ` hjl dot tools at gmail dot com 2008-11-17 9:36 ` jakub at gcc dot gnu dot org ` (15 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: hjl dot tools at gmail dot com @ 2008-11-16 0:09 UTC (permalink / raw) To: gcc-bugs ------- Comment #5 from hjl dot tools at gmail dot com 2008-11-16 00:08 ------- (In reply to comment #3) > anyway, i found, that the preprocessed source generated by gcc-4.3 cannot be > compiled with gcc-4.4 ... the specific file can be found here > http://tim.klingt.org/git?p=nova-server.git;a=blob;f=benchmarks/simd_tan_benchmarks.cpp;h=c575996de0dc916a8e654af7a36350be9c22327e;hb=844d3cf991cbbbe74b34277696dda0b940769b28 > Please upload both preprocessed sources generated by gcc 4.3 and gcc 4.4. -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (5 preceding siblings ...) 2008-11-16 0:09 ` hjl dot tools at gmail dot com @ 2008-11-17 9:36 ` jakub at gcc dot gnu dot org 2008-11-17 18:13 ` ubizjak at gmail dot com ` (14 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: jakub at gcc dot gnu dot org @ 2008-11-17 9:36 UTC (permalink / raw) To: gcc-bugs -- jakub at gcc dot gnu dot org changed: What |Removed |Added ---------------------------------------------------------------------------- Priority|P3 |P2 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (7 preceding siblings ...) 2008-11-17 9:36 ` jakub at gcc dot gnu dot org @ 2008-11-17 18:13 ` ubizjak at gmail dot com 2008-11-17 18:20 ` tim at klingt dot org ` (13 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: ubizjak at gmail dot com @ 2008-11-17 18:13 UTC (permalink / raw) To: gcc-bugs ------- Comment #6 from ubizjak at gmail dot com 2008-11-17 18:11 ------- I think that addps .LC10(%rip), %xmm0 mulps %xmm1, %xmm0 addps .LC11(%rip), %xmm0 mulps %xmm1, %xmm0 addps .LC12(%rip), %xmm0 mulps %xmm1, %xmm0 addps .LC13(%rip), %xmm0 mulps %xmm1, %xmm0 addps .LC14(%rip), %xmm0 mulps %xmm1, %xmm0 is the bottleneck. Perhaps we should split implicit memory operands out of the insn by some generic peephole (if the register is available) and schedule loads appropriately. OTOH, the loop optimizer should detect invariant loads and move them out of the loop. -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (7 preceding siblings ...) 2008-11-17 18:13 ` ubizjak at gmail dot com @ 2008-11-17 18:20 ` tim at klingt dot org 2008-11-17 18:31 ` tim at klingt dot org ` (12 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: tim at klingt dot org @ 2008-11-17 18:20 UTC (permalink / raw) To: gcc-bugs ------- Comment #7 from tim at klingt dot org 2008-11-17 18:19 ------- Created an attachment (id=16710) --> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=16710&action=view) compressed preprocessed source, gcc-4.4 -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (8 preceding siblings ...) 2008-11-17 18:20 ` tim at klingt dot org @ 2008-11-17 18:31 ` tim at klingt dot org 2008-11-17 18:50 ` tim at klingt dot org ` (11 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: tim at klingt dot org @ 2008-11-17 18:31 UTC (permalink / raw) To: gcc-bugs ------- Comment #8 from tim at klingt dot org 2008-11-17 18:30 ------- Created an attachment (id=16711) --> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=16711&action=view) 16684: compressed preprocessed source, gcc-4.3 -- tim at klingt dot org changed: What |Removed |Added ---------------------------------------------------------------------------- Attachment #16684|0 |1 is obsolete| | http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (9 preceding siblings ...) 2008-11-17 18:31 ` tim at klingt dot org @ 2008-11-17 18:50 ` tim at klingt dot org 2009-02-03 9:47 ` bonzini at gnu dot org ` (10 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: tim at klingt dot org @ 2008-11-17 18:50 UTC (permalink / raw) To: gcc-bugs ------- Comment #9 from tim at klingt dot org 2008-11-17 18:49 ------- i have updated the test program and attached preprocessed sources of gcc 4.3 and 4.4 the loop prefix contains 4.4 (9 invariant loads, one store of a generated constant to the stack): pxor %xmm5, %xmm5 xorl %eax, %eax movdqa %xmm5, %xmm0 xorl %edx, %edx pcmpeqd %xmm5, %xmm0 movaps .LC2(%rip), %xmm14 psrld $31, %xmm0 movdqa .LC3(%rip), %xmm13 pslld $31, %xmm0 movaps .LC4(%rip), %xmm12 movaps .LC5(%rip), %xmm11 movaps .LC6(%rip), %xmm10 movaps .LC7(%rip), %xmm9 movaps .LC8(%rip), %xmm8 movaps .LC9(%rip), %xmm7 movaps .LC16(%rip), %xmm6 movdqa %xmm0, -24(%rsp) 4.3 (8 invariant loads, store one generated constant in register): pxor %xmm6, %xmm6 xorl %edx, %edx movdqa %xmm6, %xmm0 xorl %eax, %eax pcmpeqd %xmm6, %xmm0 movaps .LC9(%rip), %xmm15 psrld $31, %xmm0 movaps .LC10(%rip), %xmm14 pslld $31, %xmm0 movaps .LC11(%rip), %xmm13 movaps .LC12(%rip), %xmm12 movaps .LC13(%rip), %xmm11 movdqa .LC14(%rip), %xmm10 movaps .LC15(%rip), %xmm9 movaps .LC16(%rip), %xmm8 movdqa %xmm0, %xmm7 body: 4.3 (7 loads from memory, 2 loads are used in the next instruction, others are used later): .L48: movaps in(%rax), %xmm2 movaps .LC2(%rip), %xmm0 movdqa %xmm2, %xmm5 movdqa .LC3(%rip), %xmm4 pand %xmm7, %xmm5 movaps .LC4(%rip), %xmm1 addl $4, %edx #APP # 324 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 xorps %xmm5, %xmm2 # 0 "" 2 #NO_APP mulps %xmm2, %xmm0 movaps %xmm2, %xmm3 #APP # 327 
"benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 cvttps2dq %xmm0, %xmm0 # 0 "" 2 #NO_APP pand %xmm0, %xmm4 paddd %xmm0, %xmm4 #APP # 330 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 cvtdq2ps %xmm4, %xmm0 # 0 "" 2 #NO_APP pand %xmm10, %xmm4 mulps %xmm0, %xmm1 psrld $1, %xmm4 subps %xmm1, %xmm3 movaps .LC5(%rip), %xmm1 mulps %xmm0, %xmm1 mulps .LC6(%rip), %xmm0 subps %xmm1, %xmm3 subps %xmm0, %xmm3 movaps .LC7(%rip), %xmm0 movaps %xmm3, %xmm1 cmpltps %xmm2, %xmm0 mulps %xmm3, %xmm1 movaps %xmm0, %xmm2 movaps .LC8(%rip), %xmm0 mulps %xmm1, %xmm0 addps %xmm15, %xmm0 mulps %xmm1, %xmm0 addps %xmm14, %xmm0 mulps %xmm1, %xmm0 addps %xmm13, %xmm0 mulps %xmm1, %xmm0 addps %xmm12, %xmm0 mulps %xmm1, %xmm0 addps %xmm11, %xmm0 mulps %xmm1, %xmm0 mulps %xmm3, %xmm0 addps %xmm3, %xmm0 #APP # 341 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 andps %xmm2, %xmm0 # 0 "" 2 # 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 andnps %xmm3, %xmm2 # 0 "" 2 #NO_APP movaps %xmm8, %xmm3 #APP # 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 orps %xmm2, %xmm0 # 0 "" 2 #NO_APP movdqa %xmm6, %xmm2 movaps %xmm0, %xmm1 psubd %xmm4, %xmm2 addps %xmm9, %xmm1 divps %xmm1, %xmm3 movaps %xmm3, %xmm1 #APP # 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1 andps %xmm2, %xmm1 # 0 "" 2 # 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1 andnps %xmm0, %xmm2 # 0 "" 2 # 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1 orps %xmm2, %xmm1 # 0 "" 2 # 348 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 xorps %xmm5, %xmm1 # 0 "" 2 #NO_APP movaps %xmm1, out(%rax) addq $16, %rax cmpl %edi, %edx jne .L48 4.4 (6 loads from memory, 5 loads are used as memory argument to opcodes): .L54: movaps in(%rax), %xmm2 movdqa -24(%rsp), %xmm3 addl $4, %edx pand %xmm2, %xmm3 #APP # 324 
"benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 xorps %xmm3, %xmm2 # 0 "" 2 #NO_APP movaps %xmm2, %xmm4 movaps %xmm2, %xmm15 mulps %xmm14, %xmm4 #APP # 327 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 cvttps2dq %xmm4, %xmm4 # 0 "" 2 #NO_APP movdqa %xmm4, %xmm0 pand %xmm13, %xmm0 paddd %xmm0, %xmm4 #APP # 330 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 cvtdq2ps %xmm4, %xmm0 # 0 "" 2 #NO_APP pand .LC14(%rip), %xmm4 movaps %xmm0, %xmm1 psrld $1, %xmm4 mulps %xmm12, %xmm1 subps %xmm1, %xmm15 movaps %xmm15, %xmm1 movaps %xmm0, %xmm15 mulps %xmm10, %xmm0 mulps %xmm11, %xmm15 subps %xmm15, %xmm1 movaps %xmm9, %xmm15 subps %xmm0, %xmm1 cmpltps %xmm2, %xmm15 movaps %xmm1, %xmm0 movaps %xmm15, %xmm2 mulps %xmm1, %xmm0 movaps %xmm0, %xmm15 mulps %xmm8, %xmm15 addps %xmm7, %xmm15 mulps %xmm0, %xmm15 addps .LC10(%rip), %xmm15 mulps %xmm0, %xmm15 addps .LC11(%rip), %xmm15 mulps %xmm0, %xmm15 addps .LC12(%rip), %xmm15 mulps %xmm0, %xmm15 addps .LC13(%rip), %xmm15 mulps %xmm15, %xmm0 mulps %xmm1, %xmm0 addps %xmm1, %xmm0 #APP # 341 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 andps %xmm2, %xmm0 # 0 "" 2 # 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 andnps %xmm1, %xmm2 # 0 "" 2 #NO_APP movdqa %xmm5, %xmm1 psubd %xmm4, %xmm1 movdqa %xmm1, %xmm4 movaps .LC15(%rip), %xmm1 #APP # 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 orps %xmm2, %xmm0 # 0 "" 2 #NO_APP movaps %xmm6, %xmm2 addps %xmm0, %xmm1 divps %xmm1, %xmm2 movaps %xmm2, %xmm1 #APP # 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1 andps %xmm4, %xmm1 # 0 "" 2 # 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1 andnps %xmm0, %xmm4 # 0 "" 2 #NO_APP movaps %xmm1, %xmm0 #APP # 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1 orps %xmm4, %xmm0 # 0 "" 2 # 348 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1 xorps 
%xmm3, %xmm0 # 0 "" 2 #NO_APP movaps %xmm0, out(%rax) addq $16, %rax cmpl %edi, %edx jne .L54 hth -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (10 preceding siblings ...) 2008-11-17 18:50 ` tim at klingt dot org @ 2009-02-03 9:47 ` bonzini at gnu dot org 2009-02-03 10:36 ` ubizjak at gmail dot com ` (9 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: bonzini at gnu dot org @ 2009-02-03 9:47 UTC (permalink / raw) To: gcc-bugs ------- Comment #10 from bonzini at gnu dot org 2009-02-03 09:47 ------- Can you try the patch of PR38824? -- bonzini at gnu dot org changed: What |Removed |Added ---------------------------------------------------------------------------- Status|UNCONFIRMED |WAITING http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (11 preceding siblings ...) 2009-02-03 9:47 ` bonzini at gnu dot org @ 2009-02-03 10:36 ` ubizjak at gmail dot com 2009-02-03 11:17 ` bonzini at gnu dot org ` (8 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: ubizjak at gmail dot com @ 2009-02-03 10:36 UTC (permalink / raw) To: gcc-bugs ------- Comment #11 from ubizjak at gmail dot com 2009-02-03 10:36 ------- (In reply to comment #10) > Can you try the patch of PR38824? I have tried with a similar peephole2 recognizer. The problem is that there is no spare "x" register to allocate as a temporary, so peephole2 is ineffective in this particular case. -- ubizjak at gmail dot com changed: What |Removed |Added ---------------------------------------------------------------------------- Last reconfirmed|0000-00-00 00:00:00 |2009-02-03 10:36:46 date| | http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (12 preceding siblings ...) 2009-02-03 10:36 ` ubizjak at gmail dot com @ 2009-02-03 11:17 ` bonzini at gnu dot org 2009-02-03 11:34 ` ubizjak at gmail dot com ` (7 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: bonzini at gnu dot org @ 2009-02-03 11:17 UTC (permalink / raw) To: gcc-bugs ------- Comment #12 from bonzini at gnu dot org 2009-02-03 11:17 ------- What if we forbid altogether memory operands and we *synthesize* them with a peephole2? Anyway, it seems safe to me to declare this a dup of PR38824? -- bonzini at gnu dot org changed: What |Removed |Added ---------------------------------------------------------------------------- Status|WAITING |NEW Ever Confirmed|0 |1 Last reconfirmed|2009-02-03 10:36:46 |2009-02-03 11:17:38 date| | http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (13 preceding siblings ...) 2009-02-03 11:17 ` bonzini at gnu dot org @ 2009-02-03 11:34 ` ubizjak at gmail dot com 2009-02-13 9:57 ` [Bug target/38134] [4.4 Regression] speed regression with many loop invariants bonzini at gnu dot org ` (6 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: ubizjak at gmail dot com @ 2009-02-03 11:34 UTC (permalink / raw) To: gcc-bugs ------- Comment #13 from ubizjak at gmail dot com 2009-02-03 11:34 ------- (In reply to comment #12) > What if we forbid altogether memory operands and we *synthesize* them with a > peephole2? Anyway, it seems safe to me to declare this a dup of PR38824? I think that we will hit PR 19398 then... -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with many loop invariants 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (14 preceding siblings ...) 2009-02-03 11:34 ` ubizjak at gmail dot com @ 2009-02-13 9:57 ` bonzini at gnu dot org 2009-02-13 10:03 ` steven at gcc dot gnu dot org ` (5 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: bonzini at gnu dot org @ 2009-02-13 9:57 UTC (permalink / raw) To: gcc-bugs ------- Comment #14 from bonzini at gnu dot org 2009-02-13 09:57 ------- It seems to me that it would help to have a postreload LIM pass that would concentrate on loop-invariant memory accesses that are as cheap or cheaper than loading back a spill. These would be excluded from the current (non-strict) LIM. Is it crazy? -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4 Regression] speed regression with many loop invariants 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (15 preceding siblings ...) 2009-02-13 9:57 ` [Bug target/38134] [4.4 Regression] speed regression with many loop invariants bonzini at gnu dot org @ 2009-02-13 10:03 ` steven at gcc dot gnu dot org 2009-04-21 16:02 ` [Bug target/38134] [4.4/4.5 " jakub at gcc dot gnu dot org ` (4 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: steven at gcc dot gnu dot org @ 2009-02-13 10:03 UTC (permalink / raw) To: gcc-bugs ------- Comment #15 from steven at gcc dot gnu dot org 2009-02-13 10:03 ------- Re. Comment #14 No, this is not crazy. It is called postreload-gcse. But it is a stupid pass that doesn't handle all cases it ought to handle. -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4/4.5 Regression] speed regression with many loop invariants 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (16 preceding siblings ...) 2009-02-13 10:03 ` steven at gcc dot gnu dot org @ 2009-04-21 16:02 ` jakub at gcc dot gnu dot org 2009-07-22 10:35 ` jakub at gcc dot gnu dot org ` (3 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: jakub at gcc dot gnu dot org @ 2009-04-21 16:02 UTC (permalink / raw) To: gcc-bugs -- jakub at gcc dot gnu dot org changed: What |Removed |Added ---------------------------------------------------------------------------- Target Milestone|4.4.0 |4.4.1 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4/4.5 Regression] speed regression with many loop invariants 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (17 preceding siblings ...) 2009-04-21 16:02 ` [Bug target/38134] [4.4/4.5 " jakub at gcc dot gnu dot org @ 2009-07-22 10:35 ` jakub at gcc dot gnu dot org 2009-10-15 12:56 ` jakub at gcc dot gnu dot org ` (2 subsequent siblings) 21 siblings, 0 replies; 23+ messages in thread From: jakub at gcc dot gnu dot org @ 2009-07-22 10:35 UTC (permalink / raw) To: gcc-bugs -- jakub at gcc dot gnu dot org changed: What |Removed |Added ---------------------------------------------------------------------------- Target Milestone|4.4.1 |4.4.2 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4/4.5 Regression] speed regression with many loop invariants 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (18 preceding siblings ...) 2009-07-22 10:35 ` jakub at gcc dot gnu dot org @ 2009-10-15 12:56 ` jakub at gcc dot gnu dot org 2010-01-21 13:16 ` jakub at gcc dot gnu dot org 2010-04-30 9:01 ` [Bug target/38134] [4.4/4.5/4.6 " jakub at gcc dot gnu dot org 21 siblings, 0 replies; 23+ messages in thread From: jakub at gcc dot gnu dot org @ 2009-10-15 12:56 UTC (permalink / raw) To: gcc-bugs -- jakub at gcc dot gnu dot org changed: What |Removed |Added ---------------------------------------------------------------------------- Target Milestone|4.4.2 |4.4.3 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4/4.5 Regression] speed regression with many loop invariants 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (19 preceding siblings ...) 2009-10-15 12:56 ` jakub at gcc dot gnu dot org @ 2010-01-21 13:16 ` jakub at gcc dot gnu dot org 2010-04-30 9:01 ` [Bug target/38134] [4.4/4.5/4.6 " jakub at gcc dot gnu dot org 21 siblings, 0 replies; 23+ messages in thread From: jakub at gcc dot gnu dot org @ 2010-01-21 13:16 UTC (permalink / raw) To: gcc-bugs -- jakub at gcc dot gnu dot org changed: What |Removed |Added ---------------------------------------------------------------------------- Target Milestone|4.4.3 |4.4.4 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
* [Bug target/38134] [4.4/4.5/4.6 Regression] speed regression with many loop invariants 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org ` (20 preceding siblings ...) 2010-01-21 13:16 ` jakub at gcc dot gnu dot org @ 2010-04-30 9:01 ` jakub at gcc dot gnu dot org 21 siblings, 0 replies; 23+ messages in thread From: jakub at gcc dot gnu dot org @ 2010-04-30 9:01 UTC (permalink / raw) To: gcc-bugs -- jakub at gcc dot gnu dot org changed: What |Removed |Added ---------------------------------------------------------------------------- Target Milestone|4.4.4 |4.4.5 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134 ^ permalink raw reply [flat|nested] 23+ messages in thread
end of thread, other threads:[~2010-04-30 8:54 UTC | newest] Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2008-11-15 15:56 [Bug c/38134] New: gcc-4.4 speed regression with sse code tim at klingt dot org 2008-11-15 15:57 ` [Bug c/38134] " tim at klingt dot org 2008-11-15 16:47 ` ubizjak at gmail dot com 2008-11-15 17:05 ` tim at klingt dot org 2008-11-15 20:32 ` [Bug target/38134] [4.4 Regression] speed regression with inline-asm " rguenth at gcc dot gnu dot org 2008-11-16 0:07 ` hjl dot tools at gmail dot com 2008-11-16 0:09 ` hjl dot tools at gmail dot com 2008-11-17 9:36 ` jakub at gcc dot gnu dot org 2008-11-17 18:13 ` ubizjak at gmail dot com 2008-11-17 18:20 ` tim at klingt dot org 2008-11-17 18:31 ` tim at klingt dot org 2008-11-17 18:50 ` tim at klingt dot org 2009-02-03 9:47 ` bonzini at gnu dot org 2009-02-03 10:36 ` ubizjak at gmail dot com 2009-02-03 11:17 ` bonzini at gnu dot org 2009-02-03 11:34 ` ubizjak at gmail dot com 2009-02-13 9:57 ` [Bug target/38134] [4.4 Regression] speed regression with many loop invariants bonzini at gnu dot org 2009-02-13 10:03 ` steven at gcc dot gnu dot org 2009-04-21 16:02 ` [Bug target/38134] [4.4/4.5 " jakub at gcc dot gnu dot org 2009-07-22 10:35 ` jakub at gcc dot gnu dot org 2009-10-15 12:56 ` jakub at gcc dot gnu dot org 2010-01-21 13:16 ` jakub at gcc dot gnu dot org 2010-04-30 9:01 ` [Bug target/38134] [4.4/4.5/4.6 " jakub at gcc dot gnu dot org
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).