public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
From: "tim at klingt dot org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug c/38134]  New: gcc-4.4 speed regression with sse code
Date: Sat, 15 Nov 2008 15:56:00 -0000	[thread overview]
Message-ID: <bug-38134-12873@http.gcc.gnu.org/bugzilla/> (raw)

The attached program, a SIMD-ified version of the tanf function, shows a 20%
performance regression from gcc-4.3 to gcc-4.4:

the compared compilers are
g++-4.3
Using built-in specs.
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu 4.3.2-1ubuntu11'
--with-bugurl=file:///usr/share/doc/gcc-4.3/README.Bugs
--enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --enable-shared
--with-system-zlib --libexecdir=/usr/lib --without-included-gettext
--enable-threads=posix --enable-nls --with-gxx-include-dir=/usr/include/c++/4.3
--program-suffix=-4.3 --enable-clocale=gnu --enable-libstdcxx-debug
--enable-objc-gc --enable-mpfr --enable-checking=release
--build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 4.3.2 (Ubuntu 4.3.2-1ubuntu11) 

and

Using built-in specs.
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu
20081024-0ubuntu1' --with-bugurl=file:///usr/share/doc/gcc-snapshot/README.Bugs
--enable-languages=c,c++,java,fortran,objc,obj-c++,ada
--prefix=/usr/lib/gcc-snapshot --enable-shared --with-system-zlib --disable-nls
--enable-clocale=gnu --enable-libstdcxx-debug --enable-java-awt=gtk
--enable-gtk-cairo --disable-plugin --with-java-home=/usr/lib/gcc-snapshot
--enable-java-home --with-jvm-root-dir=/usr/lib/gcc-snapshot/jvm
--with-jvm-jar-dir=/usr/lib/gcc-snapshot/jvm-exports
--with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-mpfr
--disable-werror --build=x86_64-linux-gnu --host=x86_64-linux-gnu
--target=x86_64-linux-gnu
Thread model: posix
gcc version 4.4.0 20081024 (experimental) [trunk revision 141342] (Ubuntu
20081024-0ubuntu1) 

The interesting part is the inner loop of the bench_1_simd function.
gcc-4.4 generates:

.L54:
        movaps  in(%rax), %xmm0
        movdqa  %xmm14, %xmm3
        addl    $4, %edx
        pand    %xmm0, %xmm3
#APP
# 325 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm3, %xmm0
# 0 "" 2
#NO_APP
        movaps  %xmm0, %xmm4
        movaps  %xmm0, %xmm15
        mulps   %xmm13, %xmm4
#APP
# 328 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvttps2dq %xmm4, %xmm4
# 0 "" 2
#NO_APP
        movdqa  %xmm4, %xmm1
        pand    %xmm12, %xmm1
        paddd   %xmm1, %xmm4
#APP
# 331 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvtdq2ps  %xmm4, %xmm1
# 0 "" 2
#NO_APP
        pand    .LC15(%rip), %xmm4
        movaps  %xmm1, %xmm2
        psrld   $1, %xmm4
        mulps   %xmm11, %xmm2
        subps   %xmm2, %xmm15
        movaps  %xmm15, %xmm2
        movaps  %xmm1, %xmm15
        mulps   %xmm9, %xmm1
        mulps   %xmm10, %xmm15
        subps   %xmm15, %xmm2
        movaps  %xmm8, %xmm15
        subps   %xmm1, %xmm2
        cmpltps %xmm0, %xmm15
        movaps  %xmm2, %xmm1
        mulps   %xmm2, %xmm1
        movaps  %xmm1, %xmm0
        mulps   %xmm7, %xmm0
        addps   .LC10(%rip), %xmm0
        mulps   %xmm1, %xmm0
        addps   .LC11(%rip), %xmm0
        mulps   %xmm1, %xmm0
        addps   .LC12(%rip), %xmm0
        mulps   %xmm1, %xmm0
        addps   .LC13(%rip), %xmm0
        mulps   %xmm1, %xmm0
        addps   .LC14(%rip), %xmm0
        mulps   %xmm1, %xmm0
        movdqa  %xmm5, %xmm1
        mulps   %xmm2, %xmm0
        psubd   %xmm4, %xmm1
        addps   %xmm2, %xmm0
        movdqa  %xmm1, %xmm4
#APP
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andps %xmm15, %xmm0
# 0 "" 2
#NO_APP
        movaps  .LC16(%rip), %xmm1
#APP
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andnps %xmm2, %xmm15
# 0 "" 2
#NO_APP
        movaps  %xmm6, %xmm2
#APP
# 344 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        orps  %xmm15, %xmm0
# 0 "" 2
#NO_APP
        addps   %xmm0, %xmm1
        divps   %xmm1, %xmm2
        movaps  %xmm2, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andps %xmm4, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andnps %xmm0, %xmm4
# 0 "" 2
#NO_APP
        movaps  %xmm1, %xmm0
#APP
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        orps  %xmm4, %xmm0
# 0 "" 2
# 349 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm3, %xmm0
# 0 "" 2
#NO_APP
        movaps  %xmm0, out(%rax)
        addq    $16, %rax
        cmpl    %edi, %edx
        jne     .L54

while gcc-4.3 generates:
.L48:
        movaps  in(%rax), %xmm2
        movdqa  .LC2(%rip), %xmm5
        movaps  .LC3(%rip), %xmm0
        pand    %xmm2, %xmm5
        movdqa  .LC4(%rip), %xmm4
        movaps  .LC5(%rip), %xmm1
        addl    $4, %edx
#APP
# 325 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm5, %xmm2
# 0 "" 2
#NO_APP
        mulps   %xmm2, %xmm0
        movaps  %xmm2, %xmm3
#APP
# 328 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvttps2dq %xmm0, %xmm0
# 0 "" 2
#NO_APP
        pand    %xmm0, %xmm4
        paddd   %xmm0, %xmm4
#APP
# 331 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvtdq2ps  %xmm4, %xmm0
# 0 "" 2
#NO_APP
        pand    %xmm9, %xmm4
        mulps   %xmm0, %xmm1
        psrld   $1, %xmm4
        subps   %xmm1, %xmm3
        movaps  .LC6(%rip), %xmm1
        mulps   %xmm0, %xmm1
        mulps   .LC7(%rip), %xmm0
        subps   %xmm1, %xmm3
        subps   %xmm0, %xmm3
        movaps  .LC8(%rip), %xmm0
        movaps  %xmm3, %xmm1
        cmpltps %xmm2, %xmm0
        mulps   %xmm3, %xmm1
        movaps  %xmm0, %xmm2
        movaps  %xmm1, %xmm0
        mulps   %xmm15, %xmm0
        addps   %xmm14, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm13, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm12, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm11, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm10, %xmm0
        mulps   %xmm1, %xmm0
        mulps   %xmm3, %xmm0
        addps   %xmm3, %xmm0
#APP
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andps %xmm2, %xmm0
# 0 "" 2
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andnps %xmm3, %xmm2
# 0 "" 2
#NO_APP
        movaps  %xmm7, %xmm3
#APP
# 344 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        orps  %xmm2, %xmm0
# 0 "" 2
#NO_APP
        movdqa  %xmm6, %xmm2
        movaps  %xmm0, %xmm1
        psubd   %xmm4, %xmm2
        addps   %xmm8, %xmm1
        divps   %xmm1, %xmm3
        movaps  %xmm3, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andps %xmm2, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andnps %xmm0, %xmm2
# 0 "" 2
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        orps  %xmm2, %xmm1
# 0 "" 2
# 349 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm5, %xmm1
# 0 "" 2
#NO_APP
        movaps  %xmm1, out(%rax)
        addq    $16, %rax
        cmpl    %edi, %edx
        jne     .L48

The code generated by gcc-4.4 requires more memory accesses. The code was
generated with the flags -O3 -march=core2. While the assembly code is generated
for the x86_64 architecture, similar results can be seen with x86 code (4.4 is
about 14% slower than 4.3).


-- 
           Summary: gcc-4.4 speed regression with sse code
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: tim at klingt dot org


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134


             reply	other threads:[~2008-11-15 15:56 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-11-15 15:56 tim at klingt dot org [this message]
2008-11-15 15:57 ` [Bug c/38134] " tim at klingt dot org
2008-11-15 16:47 ` ubizjak at gmail dot com
2008-11-15 17:05 ` tim at klingt dot org
2008-11-15 20:32 ` [Bug target/38134] [4.4 Regression] speed regression with inline-asm " rguenth at gcc dot gnu dot org
2008-11-16  0:07 ` hjl dot tools at gmail dot com
2008-11-16  0:09 ` hjl dot tools at gmail dot com
2008-11-17  9:36 ` jakub at gcc dot gnu dot org
2008-11-17 18:13 ` ubizjak at gmail dot com
2008-11-17 18:20 ` tim at klingt dot org
2008-11-17 18:31 ` tim at klingt dot org
2008-11-17 18:50 ` tim at klingt dot org
2009-02-03  9:47 ` bonzini at gnu dot org
2009-02-03 10:36 ` ubizjak at gmail dot com
2009-02-03 11:17 ` bonzini at gnu dot org
2009-02-03 11:34 ` ubizjak at gmail dot com
2009-02-13  9:57 ` [Bug target/38134] [4.4 Regression] speed regression with many loop invariants bonzini at gnu dot org
2009-02-13 10:03 ` steven at gcc dot gnu dot org
2009-04-21 16:02 ` [Bug target/38134] [4.4/4.5 " jakub at gcc dot gnu dot org
2009-07-22 10:35 ` jakub at gcc dot gnu dot org
2009-10-15 12:56 ` jakub at gcc dot gnu dot org
2010-01-21 13:16 ` jakub at gcc dot gnu dot org
2010-04-30  9:01 ` [Bug target/38134] [4.4/4.5/4.6 " jakub at gcc dot gnu dot org

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bug-38134-12873@http.gcc.gnu.org/bugzilla/ \
    --to=gcc-bugzilla@gcc.gnu.org \
    --cc=gcc-bugs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).