[Bug rtl-optimization/59857] New: 4.8.2 loop optimization is worse than 4.5.1 under ARM

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug rtl-optimization/59857] New: 4.8.2 loop optimization is worse than 4.5.1 under ARM
@ 2014-01-17 11:26 xuelingko at yahoo dot com.tw
  2014-01-17 12:29 ` [Bug rtl-optimization/59857] " rguenth at gcc dot gnu.org
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: xuelingko at yahoo dot com.tw @ 2014-01-17 11:26 UTC (permalink / raw)
  To: gcc-bugs

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59857

            Bug ID: 59857
           Summary: 4.8.2 loop optimization is worse than 4.5.1 under ARM
           Product: gcc
           Version: 4.8.2
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: xuelingko at yahoo dot com.tw

I compiled a simple source file, memread.c, with gcc 4.8.2 and 4.5.1.

The C code is:
int TEST_Memread(ulv * pSrc, unsigned int nCount)
{
    unsigned int val;
    ulv *p1 = NULL;
    unsigned int i;

    p1 = (ulv *) pSrc;
    for (i = 0; i < nCount; i++)
        val = *p1++;

    return 10;
}

# gcc -Wall -O2 -static -g -gstabs+ -c memread.c

4.8.2:
Target: armv7a
Configured with: ../gcc-4.8.2/configure
--prefix=/tmp/root/usr/toolchain-4.8.2-vfp/cortex-a7/gcc
--host=x86_64-pc-linux-gnu --build=x86_64-pc-linux-gnu
--target=armv7a-mediatek-linux-gnueabi
--with-sysroot=/tmp/root/usr/toolchain-4.8.2-vfp/cortex-a7/gcc/sysroot
--with-arch=armv7-a --with-tune=cortex-a7 --with-cpu=cortex-a7 --with-interwork
--with-fpu=vfpv4-d16 --with-float=softfp --with-gnu-as --with-gnu-ld
--disable-nls --enable-shared --enable-__cxa_atexit --disable-multilib
--enable-c99 --enable-long-long --enable-threads=posix --enable-languages=c,c++
--with-gmp=/tmp/root/build/x86_64 --with-mpfr=/tmp/root/build/x86_64
--with-cloog=/tmp/root/build/x86_64 --with-isl=/tmp/root/build/x86_64
--with-libelf=/tmp/root/build/x86_64
--program-transform-name='s,^,armv7a_001_vfp-linux-gnueabi-,'
--with-mpc=/tmp/root/build/x86_64 --enable-lto --without-system-libunwind
--disable-rpath --with-host-libstdcxx='-static-libgcc
-Wl,-Bstatic,-lstdc++,-Bdynamic,-lm'
--with-specs='%{!fno-unwind-tables:-funwind-tables}'
--with-build-time-tools=/tmp/root/usr/toolchain-4.8.2-vfp/cortex-a7/binutils/armv7a/bin
--enable-cxx-flags='-g -O2'
Thread model: posix
gcc version 4.8.2 20131014 (prerelease) (Linaro GCC 4.8-2013.10)

4.5.1:
Target: armv7a
Configured with: ../gcc-4.5.1/configure
--prefix=/tmp/root/usr/toolchain-4.5.1-vfp/cortex-a9/gcc
--host=i686-pc-linux-gnu --target=armv7a
--with-sysroot=/tmp/root/usr/toolchain-4.5.1-vfp/cortex-a9/gcc/sysroot
--with-arch=armv7-a --with-tune=cortex-a9 --with-cpu=cortex-a9 --with-interwork
--with-fpu=vfp --with-float=softfp --with-gnu-as --with-gnu-ld --disable-nls
--enable-shared --enable-__cxa_atexit --disable-multilib --enable-c99
--enable-long-long --enable-threads=posix --enable-languages=c,c++
--with-gmp=/tmp/root/build/i686 --with-mpfr=/tmp/root/build/i686
--with-ppl=/tmp/root/build/i686 --with-cloog=/tmp/root/build/i686
--with-libelf=/tmp/root/build/i686 --program-transform-name='s,^,armv7a-,'
--with-mpc=/tmp/root/build/i686 --enable-lto --without-system-libunwind
--disable-rpath --with-host-libstdcxx='-static-libgcc
-Wl,-Bstatic,-lstdc++,-Bdynamic,-lm'
--with-specs='%{!fno-unwind-tables:-funwind-tables}'
--with-build-time-tools=/tmp/root/usr/toolchain-4.5.1-vfp/cortex-a9/binutils/armv7a/bin/
--enable-cxx-flags='-g -O2'
Thread model: posix
gcc version 4.5.1 (GCC)




The objdump of 4.8.2 is

Disassembly of section .text:

00000000 <TEST_Memread>:
    unsigned int val;
    ulv *p1 = NULL;
    unsigned int i;

    p1 = (ulv *) pSrc;
    for (i = 0; i < nCount; i++)
   0:    e3510000     cmp    r1, #0
   4:    0a000005     beq    20 <TEST_Memread+0x20>
   8:    e3a03000     mov    r3, #0
        val = *p1++;
   c:    e5902000     ldr    r2, [r0]
    unsigned int val;
    ulv *p1 = NULL;
    unsigned int i;

    p1 = (ulv *) pSrc;
    for (i = 0; i < nCount; i++)
  10:    e2833001     add    r3, r3, #1
  14:    e1530001     cmp    r3, r1
        val = *p1++;
  18:    e2800004     add    r0, r0, #4
    unsigned int val;
    ulv *p1 = NULL;
    unsigned int i;

    p1 = (ulv *) pSrc;
    for (i = 0; i < nCount; i++)
  1c:    1afffffa     bne    c <TEST_Memread+0xc>
        val = *p1++;

    return 10;
}
  20:    e3a0000a     mov    r0, #10
  24:    e12fff1e     bx    lr


The objdump of 4.5.1 is

Disassembly of section .text:

00000000 <TEST_Memread>:
    unsigned int val;
    ulv *p1 = NULL;
    unsigned int i;

    p1 = (ulv *) pSrc;
    for (i = 0; i < nCount; i++)
   0:    e3510000     cmp    r1, #0
   4:    0a000004     beq    1c <TEST_Memread+0x1c>
   8:    e3a03000     mov    r3, #0
   c:    e2833001     add    r3, r3, #1
        val = *p1++;
  10:    e4902004     ldr    r2, [r0], #4
    unsigned int val;
    ulv *p1 = NULL;
    unsigned int i;

    p1 = (ulv *) pSrc;
    for (i = 0; i < nCount; i++)
  14:    e1510003     cmp    r1, r3
  18:    8afffffb     bhi    c <TEST_Memread+0xc>
        val = *p1++;

    return 10;
}
  1c:    e3a0000a     mov    r0, #10
  20:    e12fff1e     bx    lr



The main difference between them is:
4.8.2:
   c:    e5902000     ldr    r2, [r0]
  18:    e2800004     add    r0, r0, #4
4.5.1:
  10:    e4902004     ldr    r2, [r0], #4

For this loop, the code generated by 4.8.2 reaches only 80% of the performance
of the code generated by 4.5.1, which makes memory-read results worse with 4.8.2.


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug rtl-optimization/59857] 4.8.2 loop optimization is worse than 4.5.1 under ARM
  2014-01-17 11:26 [Bug rtl-optimization/59857] New: 4.8.2 loop optimization is worse than 4.5.1 under ARM xuelingko at yahoo dot com.tw
@ 2014-01-17 12:29 ` rguenth at gcc dot gnu.org
  2014-01-17 15:42 ` rearnsha at gcc dot gnu.org
  2014-01-20  8:39 ` xuelingko at yahoo dot com.tw
  2 siblings, 0 replies; 4+ messages in thread
From: rguenth at gcc dot gnu.org @ 2014-01-17 12:29 UTC (permalink / raw)
  To: gcc-bugs

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59857

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |WAITING
   Last reconfirmed|                            |2014-01-17
     Ever confirmed|0                           |1

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
The testcase doesn't compile.  I'd expect the loop to be gone (stupid testcase)
and be optimized to val = p1[nCount-1].


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug rtl-optimization/59857] 4.8.2 loop optimization is worse than 4.5.1 under ARM
  2014-01-17 11:26 [Bug rtl-optimization/59857] New: 4.8.2 loop optimization is worse than 4.5.1 under ARM xuelingko at yahoo dot com.tw
  2014-01-17 12:29 ` [Bug rtl-optimization/59857] " rguenth at gcc dot gnu.org
@ 2014-01-17 15:42 ` rearnsha at gcc dot gnu.org
  2014-01-20  8:39 ` xuelingko at yahoo dot com.tw
  2 siblings, 0 replies; 4+ messages in thread
From: rearnsha at gcc dot gnu.org @ 2014-01-17 15:42 UTC (permalink / raw)
  To: gcc-bugs

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59857

--- Comment #2 from Richard Earnshaw <rearnsha at gcc dot gnu.org> ---
My suspicion is that ulv is short-hand for unsigned long volatile -- since
without it this testcase is completely degenerate:  val isn't used at all, so
when ulv is not volatile, the entire function reduces to 'return 10;'

The difference between the outputs when ulv is volatile is then down to
no longer using post-increment operations on volatile memory accesses.  I
suspect this was done to fix problems we've had with incorrect optimizations of
volatile operations.

I think we've seen similar bug reports to this in the past.  GCC is *very*
conservative when dealing with volatile objects, so I'm not surprised we don't
do this optimization.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug rtl-optimization/59857] 4.8.2 loop optimization is worse than 4.5.1 under ARM
  2014-01-17 11:26 [Bug rtl-optimization/59857] New: 4.8.2 loop optimization is worse than 4.5.1 under ARM xuelingko at yahoo dot com.tw
  2014-01-17 12:29 ` [Bug rtl-optimization/59857] " rguenth at gcc dot gnu.org
  2014-01-17 15:42 ` rearnsha at gcc dot gnu.org
@ 2014-01-20  8:39 ` xuelingko at yahoo dot com.tw
  2 siblings, 0 replies; 4+ messages in thread
From: xuelingko at yahoo dot com.tw @ 2014-01-20  8:39 UTC (permalink / raw)
  To: gcc-bugs

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59857

--- Comment #3 from Jacky Ko <xuelingko at yahoo dot com.tw> ---
You are right. ulv is volatile, the typedef in the code is 

typedef unsigned int volatile ulv;

I'm sorry that I didn't provide the definition.

I modified the C code as below:

int TEST_Memread(unsigned int * pSrc, volatile unsigned int *pDst, unsigned int
nCount)
{
    unsigned int *p1 = NULL;
    unsigned int i;

    p1 = (unsigned int *) pSrc;
    for (i = 0; i < nCount; i++)
        *pDst = *p1++;

    return 0;
}

After testing it, the performance is the same with 4.5.1 and 4.8.2. If the
pointed-to type is not volatile, both versions perform the load and the
post-increment in a single instruction.


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2014-01-20  8:39 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-01-17 11:26 [Bug rtl-optimization/59857] New: 4.8.2 loop optimization is worse than 4.5.1 under ARM xuelingko at yahoo dot com.tw
2014-01-17 12:29 ` [Bug rtl-optimization/59857] " rguenth at gcc dot gnu.org
2014-01-17 15:42 ` rearnsha at gcc dot gnu.org
2014-01-20  8:39 ` xuelingko at yahoo dot com.tw

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).