public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug rtl-optimization/36712]  New: Inefficient loop unrolling
@ 2008-07-03  9:07 bmei at broadcom dot com
  2009-05-20 13:20 ` [Bug rtl-optimization/36712] " ramana at gcc dot gnu dot org
                   ` (19 more replies)
  0 siblings, 20 replies; 21+ messages in thread
From: bmei at broadcom dot com @ 2008-07-03  9:07 UTC (permalink / raw)
  To: gcc-bugs

Loop unrolling generates far worse code compared with manually unrolled code.
In the following code, the first version is GCC-unrolled and the second is
manually unrolled.

The GCC-unrolled version suffers mainly from two issues. First, the
load/store offsets are held in registers, so extra ADD instructions are
needed to increase the offset over the iterations. In contrast, the manually
unrolled code uses immediate offsets efficiently and needs only one ADD to
adjust the base register at the end. Second, the alias (dependence) analysis
is overly conservative: the LOAD instruction of the next unrolled iteration
cannot be moved across the preceding STORE instruction even though they
clearly do not alias. I suspect the failure of the alias analysis is related
to the first issue, i.e. the handling of base and offset addresses. The
.sched2 file shows that the first loop body requires 57 cycles whereas the
second one takes 50 cycles for arm9 (56 cycles vs. 34 cycles for XScale). It
becomes even worse for our VLIW port, due to the longer latency of the MUL
and load instructions and the inability to fill all slots (120 cycles vs. 20
cycles).

tst.c
void Unroll( short s, int * restrict b_inout, int *restrict out)
{
        int i;
        for (i=0; i<64; i++)
        {
                b_inout[i] = b_inout[i] * s;
        }
}
arm-elf-gcc tst.c -O2  -std=c99 -S  -v -fdump-tree-all  -da  -mcpu=arm9
-funroll-loops
Unroll:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        mov     r0, r0, asl #16
        stmfd   sp!, {r4, r5, r6}
        mov     r4, r1
        mov     r6, r0, asr #16
        mov     r5, #0
.L2:
        ldr     r1, [r4, r5]
        add     ip, r5, #4
        mul     r0, r6, r1
        str     r0, [r4, r5]
        ldr     r3, [r4, ip]
        add     r0, ip, #4
        mul     r2, r6, r3
        str     r2, [r4, ip]
        ldr     r1, [r4, r0]
        add     ip, r5, #12
        mul     r3, r6, r1
        str     r3, [r4, r0]
        ldr     r2, [r4, ip]
        add     r1, r5, #16
        mul     r3, r6, r2
        str     r3, [r4, ip]
        ldr     r0, [r4, r1]
        add     ip, r5, #20
        mul     r3, r6, r0
        str     r3, [r4, r1]
        ldr     r2, [r4, ip]
        add     r1, r5, #24
        mul     r0, r6, r2
        str     r0, [r4, ip]
        ldr     r3, [r4, r1]
        add     ip, r5, #28
        mul     r0, r6, r3
        str     r0, [r4, r1]
        ldr     r2, [r4, ip]
        add     r5, r5, #32
        mul     r3, r6, r2
        cmp     r5, #256
        str     r3, [r4, ip]
        bne     .L2
        ldmfd   sp!, {r4, r5, r6}
        bx      lr
        .size   Unroll, .-Unroll


tst2.c:
void ManualUnroll( short s, int * restrict b_inout, int *restrict out)
{
        int i;
        for (i=0; i<64;)
        {
                b_inout[i] = b_inout[i] * s;
                i++;
                b_inout[i] = b_inout[i] * s;
                i++;
                b_inout[i] = b_inout[i] * s;
                i++;
                b_inout[i] = b_inout[i] * s;
                i++;
                b_inout[i] = b_inout[i] * s;
                i++;
                b_inout[i] = b_inout[i] * s;
                i++;
                b_inout[i] = b_inout[i] * s;
                i++;
                b_inout[i] = b_inout[i] * s;
                i++;
        }
}

arm-elf-gcc tst2.c -O2  -std=c99 -S  -mcpu=arm9

ManualUnroll:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        mov     r0, r0, asl #16
        stmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
        mov     sl, r1
        mov     r9, r0, asr #16
        add     fp, r1, #256
.L7:
        ldr     r3, [sl, #0]
        ldr     r2, [sl, #4]
        ldr     r1, [sl, #8]
        ldr     r0, [sl, #12]
        ldr     ip, [sl, #16]
        add     r4, sl, #20
        ldmia   r4, {r4, r5, r6}        @ phole ldm
        mul     r7, r9, r3
        mul     r8, r9, r2
        mul     r3, r9, r1
        mul     r2, r9, r0
        mul     r1, r9, ip
        mul     r0, r9, r4
        mul     ip, r9, r5
        mul     r4, r9, r6
        stmia   sl, {r7, r8}    @ phole stm
        str     r3, [sl, #8]
        str     r2, [sl, #12]
        str     r1, [sl, #16]
        str     r0, [sl, #20]
        str     ip, [sl, #24]
        str     r4, [sl, #28]
        add     sl, sl, #32
        cmp     sl, fp
        bne     .L7
        ldmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
        bx      lr
        .size   ManualUnroll, .-ManualUnroll
        .ident  "GCC: (GNU) 4.4.0 20080530 (experimental)"


My ARM compiler is built with the following configuration:
CC="gcc -m32 -static" CFLAGS="-g"
RANLIB_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-ranlib"
AR_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-ar"
AS_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-as"
LD_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-ld"
../src/configure --prefix=/home/bmei/work/trunck-arm --enable-languages=c
--disable-nls --target=arm-elf  --disable-shared
--with-mpfr=/projects/firepath/tools/team/packages/x86_64-rhel3-32/mpfr/2.3.0
--with-gmp=/projects/firepath/tools/team/packages/x86_64-rhel3-32/gmp/4.2.2
--disable-libssp


-- 
           Summary: Inefficient loop unrolling
           Product: gcc
           Version: 4.4.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: bmei at broadcom dot com
GCC target triplet: arm-elf-gcc


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
@ 2009-05-20 13:20 ` ramana at gcc dot gnu dot org
  2009-05-20 14:09 ` rguenth at gcc dot gnu dot org
                   ` (18 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: ramana at gcc dot gnu dot org @ 2009-05-20 13:20 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #1 from ramana at gcc dot gnu dot org  2009-05-20 13:19 -------
Can be reproduced with trunk today.


-- 

ramana at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
     Ever Confirmed|0                           |1
   Last reconfirmed|0000-00-00 00:00:00         |2009-05-20 13:19:56
               date|                            |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
  2009-05-20 13:20 ` [Bug rtl-optimization/36712] " ramana at gcc dot gnu dot org
@ 2009-05-20 14:09 ` rguenth at gcc dot gnu dot org
  2009-05-20 14:14 ` ramana at gcc dot gnu dot org
                   ` (17 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: rguenth at gcc dot gnu dot org @ 2009-05-20 14:09 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #2 from rguenth at gcc dot gnu dot org  2009-05-20 14:09 -------
I think there is no induction variable optimization on RTL anymore.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
  2009-05-20 13:20 ` [Bug rtl-optimization/36712] " ramana at gcc dot gnu dot org
  2009-05-20 14:09 ` rguenth at gcc dot gnu dot org
@ 2009-05-20 14:14 ` ramana at gcc dot gnu dot org
  2009-05-20 14:17 ` bmei at broadcom dot com
                   ` (16 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: ramana at gcc dot gnu dot org @ 2009-05-20 14:14 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #3 from ramana at gcc dot gnu dot org  2009-05-20 14:14 -------
There was a discussion thread here:
http://gcc.gnu.org/ml/gcc/2008-07/msg00037.html
One of the solutions that Bingfeng was investigating was unrolling loops
before ivopts, which is useful in certain cases.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (2 preceding siblings ...)
  2009-05-20 14:14 ` ramana at gcc dot gnu dot org
@ 2009-05-20 14:17 ` bmei at broadcom dot com
  2009-05-20 17:51 ` dje dot gcc at gmail dot com
                   ` (15 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: bmei at broadcom dot com @ 2009-05-20 14:17 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #4 from bmei at broadcom dot com  2009-05-20 14:17 -------
I implemented a tree-level loop-unrolling pass in our private port, which
takes advantage of the later tree ivopts pass. It produces much better code than
rtl-level loop unrolling in such scenarios. I am not sure whether I should
submit it for 4.5.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (3 preceding siblings ...)
  2009-05-20 14:17 ` bmei at broadcom dot com
@ 2009-05-20 17:51 ` dje dot gcc at gmail dot com
  2009-05-21  8:38 ` bmei at broadcom dot com
                   ` (14 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: dje dot gcc at gmail dot com @ 2009-05-20 17:51 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #5 from dje dot gcc at gmail dot com  2009-05-20 17:51 -------
Subject: Re:  Inefficient loop unrolling

> I implemented a tree-level loop-unrolling pass in our private porting, which
> takes advantage of later tree ivopt pass. It produces much better code than
> rtl-level loop unrolling in such scenarios. Not sure whether should submit for
> 4.5.

Why "not sure"?  The worst anyone can say is "no".  Of course you
should submit it.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (4 preceding siblings ...)
  2009-05-20 17:51 ` dje dot gcc at gmail dot com
@ 2009-05-21  8:38 ` bmei at broadcom dot com
  2009-10-15 12:12 ` drow at gcc dot gnu dot org
                   ` (13 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: bmei at broadcom dot com @ 2009-05-21  8:38 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #6 from bmei at broadcom dot com  2009-05-21 08:38 -------
I have only submitted small patches before. Adding a pass (which may need a new
command-line option and disabling the old rtl-level unrolling) seems like a big
undertaking to me. I don't know what the procedure is.

My code also contains my own implementation of #pragma unroll. I need to clean
it up for the public patch. 


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (5 preceding siblings ...)
  2009-05-21  8:38 ` bmei at broadcom dot com
@ 2009-10-15 12:12 ` drow at gcc dot gnu dot org
  2010-01-25 21:10 ` froydnj at gcc dot gnu dot org
                   ` (12 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: drow at gcc dot gnu dot org @ 2009-10-15 12:12 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #7 from drow at gcc dot gnu dot org  2009-10-15 12:12 -------
I really would like to see this submitted - at least as a starting point for
discussion.  You don't need to do anything different than for a small patch; if
you've missed any steps, a reviewer will tell you.

Another problem I noticed is that if RTL loop unrolling unrolls a loop, only
the first iteration will use auto-inc patterns.  This leads to silly omissions
in the assembly.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (6 preceding siblings ...)
  2009-10-15 12:12 ` drow at gcc dot gnu dot org
@ 2010-01-25 21:10 ` froydnj at gcc dot gnu dot org
  2010-02-04 11:12 ` rearnsha at gcc dot gnu dot org
                   ` (11 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: froydnj at gcc dot gnu dot org @ 2010-01-25 21:10 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #8 from froydnj at gcc dot gnu dot org  2010-01-25 21:10 -------
First, something has gotten better; an arm-eabi gcc (-O2 -std=c99 -mcpu=arm9
-funroll-loops) from 20091209 gives:

Unroll:
        @ Function supports interworking.
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        stmfd   sp!, {r4, r5, r6, r7, r8}
        add     r8, r1, #256
.L2:
        ldr     ip, [r1, #0]
        mov     r7, r1
        mul     r2, ip, r0
        str     r2, [r7], #4
        ldr     r3, [r1, #4]
        ldr     r5, [r7, #4]
        mul     r6, r3, r0
        str     r6, [r7, #0]
        ldr     r4, [r1, #12]
        ldr     ip, [r1, #16]
        add     r2, r1, #20
        ldmia   r2, {r2, r3, r7}        @ phole ldm
        mul     r6, r5, r0
        mul     r5, r4, r0
        mul     r4, ip, r0
        mul     ip, r2, r0
        mul     r2, r3, r0
        mul     r3, r7, r0
        str     r6, [r1, #8]
        str     r5, [r1, #12]
        str     r4, [r1, #16]
        str     ip, [r1, #20]
        str     r2, [r1, #24]
        add     r1, r1, #32
        cmp     r1, r8
        str     r3, [r1, #-4]
        bne     .L2
        ldmfd   sp!, {r4, r5, r6, r7, r8}
        bx      lr
        .size   Unroll, .-Unroll
        .ident  "GCC: (GNU) 4.5.0 20091209 (experimental)"

which, if not close to ManualUnroll from the first comment, is much better than
the initial example.

Second, the problem Daniel mentioned concerning auto-inc/dec not doing the
right thing is because of the cleverness of
loop-unroll.c:analyze_iv_to_split_insn.  It breaks the code shape that
auto-inc/dec needs.  (You can see its effects in the assembly above; the
spurious move to r7 at the top of the loop.)  Even if you disable that bit of
RTL loop unrolling, you also need to disable the web pass so as to not really
break the code shape for auto-inc/dec and introduce spurious moves into the
RTL.  Once you do that, you get:

Unroll:
        @ Function supports interworking.
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        add     ip, r1, #256
.L2:
        ldr     r2, [r1, #0]
        mul     r3, r2, r0
        str     r3, [r1], #4
        ldr     r2, [r1, #0]
        mul     r3, r2, r0
        str     r3, [r1], #4
        ldr     r2, [r1, #0]
        mul     r3, r2, r0
        str     r3, [r1], #4
        ldr     r2, [r1, #0]
        mul     r3, r2, r0
        str     r3, [r1], #4
        ldr     r2, [r1, #0]
        mul     r3, r2, r0
        str     r3, [r1], #4
        ldr     r2, [r1, #0]
        mul     r3, r2, r0
        str     r3, [r1], #4
        ldr     r2, [r1, #0]
        mul     r3, r2, r0
        str     r3, [r1], #4
        ldr     r2, [r1, #0]
        mul     r3, r2, r0
        str     r3, [r1], #4
        cmp     r1, ip
        bne     .L2
        bx      lr


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (7 preceding siblings ...)
  2010-01-25 21:10 ` froydnj at gcc dot gnu dot org
@ 2010-02-04 11:12 ` rearnsha at gcc dot gnu dot org
  2010-02-04 11:21 ` steven at gcc dot gnu dot org
                   ` (10 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: rearnsha at gcc dot gnu dot org @ 2010-02-04 11:12 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #9 from rearnsha at gcc dot gnu dot org  2010-02-04 11:11 -------
(In reply to comment #8)

>         ldr     r2, [r1, #0]
>         mul     r3, r2, r0
>         str     r3, [r1], #4
>         ldr     r2, [r1, #0]
>         mul     r3, r2, r0
>         str     r3, [r1], #4
>         ldr     r2, [r1, #0]
[...]

Ugh, on a dual-issue core with load delay slots that code will REALLY suck.
There's almost nothing that can be dual-issued, and the loaded values are used
in the instruction immediately after the load.


-- 

rearnsha at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |rearnsha at gcc dot gnu dot
                   |                            |org


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (8 preceding siblings ...)
  2010-02-04 11:12 ` rearnsha at gcc dot gnu dot org
@ 2010-02-04 11:21 ` steven at gcc dot gnu dot org
  2010-02-04 11:47 ` rguenth at gcc dot gnu dot org
                   ` (9 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-04 11:21 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #10 from steven at gcc dot gnu dot org  2010-02-04 11:21 -------
I'm going to crack this bug.


-- 

steven at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
         AssignedTo|unassigned at gcc dot gnu   |steven at gcc dot gnu dot
                   |dot org                     |org
             Status|NEW                         |ASSIGNED
   Last reconfirmed|2009-05-20 13:19:56         |2010-02-04 11:21:25
               date|                            |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (9 preceding siblings ...)
  2010-02-04 11:21 ` steven at gcc dot gnu dot org
@ 2010-02-04 11:47 ` rguenth at gcc dot gnu dot org
  2010-02-04 14:55 ` steven at gcc dot gnu dot org
                   ` (8 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: rguenth at gcc dot gnu dot org @ 2010-02-04 11:47 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #11 from rguenth at gcc dot gnu dot org  2010-02-04 11:47 -------
Also try the patches from PR42617 to see if they improve the pre-regalloc
scheduling.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (10 preceding siblings ...)
  2010-02-04 11:47 ` rguenth at gcc dot gnu dot org
@ 2010-02-04 14:55 ` steven at gcc dot gnu dot org
  2010-02-04 14:57 ` steven at gcc dot gnu dot org
                   ` (7 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-04 14:55 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #12 from steven at gcc dot gnu dot org  2010-02-04 14:54 -------
With the patches from bug 42617 applied, I get the following:

        .file   "tst.c"
        .text
        .align  2
        .global Unroll
        .type   Unroll, %function
Unroll:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        mov     r0, r0, asl #16
        stmfd   sp!, {r4, r5, r6, r7, r8}
        mov     r0, r0, asr #16
        add     r8, r1, #256
.L2:
        ldr     ip, [r1, #0]
        mov     r7, r1
        mul     r2, ip, r0
        str     r2, [r7], #4
        ldr     r3, [r1, #4]
        ldr     r5, [r7, #4]
        mul     r6, r3, r0
        str     r6, [r7, #0]
        ldr     r4, [r1, #12]
        ldr     ip, [r1, #16]
        add     r2, r1, #20
        ldmia   r2, {r2, r3, r7}        @ phole ldm
        mul     r6, r5, r0
        mul     r5, r4, r0
        mul     r4, ip, r0
        mul     ip, r2, r0
        mul     r2, r3, r0
        mul     r3, r7, r0
        str     r6, [r1, #8]
        str     r5, [r1, #12]
        str     r4, [r1, #16]
        str     ip, [r1, #20]
        str     r2, [r1, #24]
        add     r1, r1, #32
        cmp     r1, r8
        str     r3, [r1, #-4]
        bne     .L2
        ldmfd   sp!, {r4, r5, r6, r7, r8}
        bx      lr
        .size   Unroll, .-Unroll
        .ident  "GCC: (GNU) 4.5.0 20100204 (experimental) [trunk revision
156492]"

(flags: -std=c99 -mcpu=arm9 -O2 -funroll-loops)


This is good but not perfect.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (11 preceding siblings ...)
  2010-02-04 14:55 ` steven at gcc dot gnu dot org
@ 2010-02-04 14:57 ` steven at gcc dot gnu dot org
  2010-02-04 15:19 ` steven at gcc dot gnu dot org
                   ` (6 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-04 14:57 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #13 from steven at gcc dot gnu dot org  2010-02-04 14:56 -------
With -fno-web, the patches from bug 42617 do not help and the output is the
same as that of comment #8 (second asm dump).


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (12 preceding siblings ...)
  2010-02-04 14:57 ` steven at gcc dot gnu dot org
@ 2010-02-04 15:19 ` steven at gcc dot gnu dot org
  2010-02-04 16:07 ` steven at gcc dot gnu dot org
                   ` (5 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-04 15:19 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #14 from steven at gcc dot gnu dot org  2010-02-04 15:19 -------
Part of the problem comes from the way IVOPTS optimizes the memory access:

;; Generating RTL for gimple basic block 3

;; D.1814_10 = MEM[base: D.1846_29];

(insn 52 51 0 tst.c:6 (set (reg:SI 172 [ D.1814 ])
        (mem:SI (reg:SI 179 [ ivtmp.20 ]) [2 *D.1846_29 S4 A32])) -1 (nil))

;; ivtmp.20_24 = ivtmp.20_25 + 4;

(insn 53 52 0 tst.c:6 (set (reg:SI 179 [ ivtmp.20 ])
        (plus:SI (reg:SI 179 [ ivtmp.20 ])
            (const_int 4 [0x4]))) -1 (nil))

;; MEM[base: D.1847_30, offset: 4294967292] = D.1816_13;

(insn 54 53 55 tst.c:6 (set (reg:SI 189)
        (mult:SI (reg:SI 172 [ D.1814 ])
            (reg:SI 180 [ pretmp.11 ]))) -1 (nil))

(insn 55 54 0 tst.c:6 (set (mem:SI (plus:SI (reg:SI 179 [ ivtmp.20 ])
                (const_int -4 [0xfffffffffffffffc])) [2 *D.1847_30 S4 A32])
        (reg:SI 189)) -1 (nil))

;; if (ivtmp.20_24 != D.1849_32)

(insn 57 55 58 tst.c:4 (set (reg:CC 24 cc)
        (compare:CC (reg:SI 179 [ ivtmp.20 ])
            (reg:SI 183 [ D.1849 ]))) -1 (nil))

(jump_insn 58 57 0 tst.c:4 (set (pc)
        (if_then_else (ne (reg:CC 24 cc)
                (const_int 0 [0x0]))
            (label_ref 56)
            (pc))) -1 (expr_list:REG_BR_PROB (const_int 9844 [0x2674])
        (nil)))


This yields the sequence:
   52 r172:SI=[r179:SI]
   53 r179:SI=r179:SI+0x4
   54 r189:SI=r172:SI*r180:SI
   55 [r179:SI-0x4]=r189:SI

and we never get rid of this again.  There is another bug related to this one:
bug 31849. The patch there probably needs dusting off, and then I'll see if it
improves things for this bug as well.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (13 preceding siblings ...)
  2010-02-04 15:19 ` steven at gcc dot gnu dot org
@ 2010-02-04 16:07 ` steven at gcc dot gnu dot org
  2010-02-05 13:33 ` steven at gcc dot gnu dot org
                   ` (4 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-04 16:07 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #15 from steven at gcc dot gnu dot org  2010-02-04 16:06 -------
The patches for bug 31849 have been committed, it seems, and they don't help
for this case.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (14 preceding siblings ...)
  2010-02-04 16:07 ` steven at gcc dot gnu dot org
@ 2010-02-05 13:33 ` steven at gcc dot gnu dot org
  2010-02-05 13:58 ` rakdver at kam dot mff dot cuni dot cz
                   ` (3 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-05 13:33 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #16 from steven at gcc dot gnu dot org  2010-02-05 13:33 -------
I'm trying to coerce IVOPTS into producing the following, optimal code in the
GIMPLE optimizers (without much luck, so far):

<bb 2>:
  pretmp.11_26 = (int) s_11(D);
  ivtmp.20_28 = (long unsigned int) b_inout_5(D);
  D.1848_31 = (long unsigned int) b_inout_5(D);
  D.1849_32 = D.1848_31 + 256;

<bb 3>:
 # ivtmp.20_25 = PHI <ivtmp.20_24(4), ivtmp.20_28(2)>
 D.1846_29 = (void *) ivtmp.20_25;
 D.1814_10 = MEM[base: D.1846_29]{*D.1813};
 D.1816_13 = D.1814_10 * pretmp.11_26;
 D.1847_30 = (void *) ivtmp.20_25;
 MEM[base: D.1847_30]{*D.1813} = D.1816_13;
 ivtmp.20_24 = ivtmp.20_25 + 4;
 if (ivtmp.20_24 != D.1849_32)
   goto <bb 4>;
 else
   goto <bb 5>;

<bb 4>:
  goto <bb 3>;

<bb 5>:
  return;


If we can get the compiler to generate the above code in IVOPTS, then we should
get the same code as the hand-unrolled example (although it also looks like a
bit more scheduler look-ahead freedom is necessary).

I asked Zdenek for help. For ARM9 he found the following problem:

/quote/
Address costs:
 index costs 6
 cst + index costs 2

arm_arm_address_cost pretends that having reg + cst as an address is cheaper
than having reg by itself.  Ivopts are happy to make this happen :-)

There used to be the same problem on x86, which was eventually fixed by making
address_cost reflect the real cost of addresses, rather than "which addressing
modes should CSE prefer" metrics.
/quote/

Of course, all cases where the cost of "index" > the cost of "index + cst"
result in code like that of an unpatched compiler. But if I adjust the cost to make
"index" cost only 1 or 2, I get this:

<bb 2>:
  pretmp.11_26 = (int) s_11(D);
  ivtmp.25_28 = (long unsigned int) b_inout_5(D);

<bb 3>:
  # i_20 = PHI <i_14(4), 0(2)>
  # ivtmp.25_25 = PHI <ivtmp.25_24(4), ivtmp.25_28(2)>
  D.1846_29 = (void *) ivtmp.25_25;
  D.1814_10 = MEM[base: D.1846_29]{*D.1813};
  D.1816_13 = D.1814_10 * pretmp.11_26;
  D.1847_30 = (void *) ivtmp.25_25;
  MEM[base: D.1847_30]{*D.1813} = D.1816_13;
  ivtmp.25_24 = ivtmp.25_25 + 4;
  i_14 = i_20 + 1;
  if (i_14 != 64)
    goto <bb 4>;
  else
    goto <bb 5>;

<bb 4>:
  goto <bb 3>;

<bb 5>:
  return;


That is still not optimal: We get an extra IV for some reason.


-- 

steven at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |rakdver at gcc dot gnu dot
                   |                            |org


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (15 preceding siblings ...)
  2010-02-05 13:33 ` steven at gcc dot gnu dot org
@ 2010-02-05 13:58 ` rakdver at kam dot mff dot cuni dot cz
  2010-02-05 14:02 ` steven at gcc dot gnu dot org
                   ` (2 subsequent siblings)
  19 siblings, 0 replies; 21+ messages in thread
From: rakdver at kam dot mff dot cuni dot cz @ 2010-02-05 13:58 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #17 from rakdver at kam dot mff dot cuni dot cz  2010-02-05 13:58 -------
Subject: Re:  Inefficient loop unrolling

>  But if I adjust the cost to make
> "index" cost only 1 or 2, I get this:
> 
> <bb 2>:
>   pretmp.11_26 = (int) s_11(D);
>   ivtmp.25_28 = (long unsigned int) b_inout_5(D);
> 
> <bb 3>:
>   # i_20 = PHI <i_14(4), 0(2)>
>   # ivtmp.25_25 = PHI <ivtmp.25_24(4), ivtmp.25_28(2)>
>   D.1846_29 = (void *) ivtmp.25_25;
>   D.1814_10 = MEM[base: D.1846_29]{*D.1813};
>   D.1816_13 = D.1814_10 * pretmp.11_26;
>   D.1847_30 = (void *) ivtmp.25_25;
>   MEM[base: D.1847_30]{*D.1813} = D.1816_13;
>   ivtmp.25_24 = ivtmp.25_25 + 4;
>   i_14 = i_20 + 1;
>   if (i_14 != 64)
>     goto <bb 4>;
>   else
>     goto <bb 5>;
> 
> <bb 4>:
>   goto <bb 3>;
> 
> <bb 5>:
>   return;

what configuration and flags are you using?  For me, replacing
arm_arm_address_cost with return 1 resulted
in

<bb 3>:
  # ivtmp.25_25 = PHI <ivtmp.25_24(3), ivtmp.25_28(2)>
  D.1816_29 = (void *) ivtmp.25_25;
  D.1784_10 = MEM[base: D.1816_29]{*D.1783};
  D.1786_13 = pretmp.11_26 * D.1784_10;
  MEM[base: D.1816_29]{*D.1783} = D.1786_13;
  ivtmp.25_24 = ivtmp.25_25 + 4;
  if (ivtmp.25_24 != D.1819_32)

Zdenek


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug rtl-optimization/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (16 preceding siblings ...)
  2010-02-05 13:58 ` rakdver at kam dot mff dot cuni dot cz
@ 2010-02-05 14:02 ` steven at gcc dot gnu dot org
  2010-02-05 14:58 ` [Bug target/36712] " steven at gcc dot gnu dot org
  2010-02-12 22:46 ` steven at gcc dot gnu dot org
  19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-05 14:02 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #18 from steven at gcc dot gnu dot org  2010-02-05 14:02 -------
I used "-O2 -std=c99 -mcpu=arm9 -funroll-loops" and I manually hacked the
address costs in GDB, changing them from:

Address costs:
 index costs 6
 cst + index costs 2


...to this...:

Address costs:
 index costs 1
 cst + index costs 2


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug target/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (17 preceding siblings ...)
  2010-02-05 14:02 ` steven at gcc dot gnu dot org
@ 2010-02-05 14:58 ` steven at gcc dot gnu dot org
  2010-02-12 22:46 ` steven at gcc dot gnu dot org
  19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-05 14:58 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #19 from steven at gcc dot gnu dot org  2010-02-05 14:58 -------
Interesting: for "-march=armv5te -mthumb" the code after IVOPTS is already the
ideal code (see e.g. comment #17). The reason is that the address cost function
for Thumb (arm_thumb_address_cost) is, of course, not the same as the one for
ARM (arm_arm_address_cost), so the correct code comes out automatically:

Address costs:
  index costs 1
  cst + index costs 1

In arm_arm_address_cost, "index" as a naked REG is the most expensive of all
addresses. In arm_thumb_address_cost it is the cheapest.

In conclusion: this is a target cost problem, not a generic rtl-optimization
bug. Adjusting the component accordingly. An ARM maintainer will have to figure
out a better implementation of arm_arm_address_cost.


-- 

steven at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
         AssignedTo|steven at gcc dot gnu dot   |unassigned at gcc dot gnu
                   |org                         |dot org
             Status|ASSIGNED                    |NEW
          Component|rtl-optimization            |target


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Bug target/36712] Inefficient loop unrolling
  2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
                   ` (18 preceding siblings ...)
  2010-02-05 14:58 ` [Bug target/36712] " steven at gcc dot gnu dot org
@ 2010-02-12 22:46 ` steven at gcc dot gnu dot org
  19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-12 22:46 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #20 from steven at gcc dot gnu dot org  2010-02-12 22:46 -------
Bug 27016 is another example of poor IVOPTS results due to poor choices in
arm_arm_address_cost.


-- 

steven at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
  BugsThisDependsOn|                            |27016


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712


^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2010-02-12 22:46 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-07-03  9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
2009-05-20 13:20 ` [Bug rtl-optimization/36712] " ramana at gcc dot gnu dot org
2009-05-20 14:09 ` rguenth at gcc dot gnu dot org
2009-05-20 14:14 ` ramana at gcc dot gnu dot org
2009-05-20 14:17 ` bmei at broadcom dot com
2009-05-20 17:51 ` dje dot gcc at gmail dot com
2009-05-21  8:38 ` bmei at broadcom dot com
2009-10-15 12:12 ` drow at gcc dot gnu dot org
2010-01-25 21:10 ` froydnj at gcc dot gnu dot org
2010-02-04 11:12 ` rearnsha at gcc dot gnu dot org
2010-02-04 11:21 ` steven at gcc dot gnu dot org
2010-02-04 11:47 ` rguenth at gcc dot gnu dot org
2010-02-04 14:55 ` steven at gcc dot gnu dot org
2010-02-04 14:57 ` steven at gcc dot gnu dot org
2010-02-04 15:19 ` steven at gcc dot gnu dot org
2010-02-04 16:07 ` steven at gcc dot gnu dot org
2010-02-05 13:33 ` steven at gcc dot gnu dot org
2010-02-05 13:58 ` rakdver at kam dot mff dot cuni dot cz
2010-02-05 14:02 ` steven at gcc dot gnu dot org
2010-02-05 14:58 ` [Bug target/36712] " steven at gcc dot gnu dot org
2010-02-12 22:46 ` steven at gcc dot gnu dot org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).