public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug rtl-optimization/36712] New: Inefficient loop unrolling
@ 2008-07-03 9:07 bmei at broadcom dot com
2009-05-20 13:20 ` [Bug rtl-optimization/36712] " ramana at gcc dot gnu dot org
` (19 more replies)
0 siblings, 20 replies; 21+ messages in thread
From: bmei at broadcom dot com @ 2008-07-03 9:07 UTC (permalink / raw)
To: gcc-bugs
GCC's loop unrolling generates far worse code than manual unrolling does.
In the following code, the first version is GCC-unrolled and the second is
manually unrolled.
The GCC-unrolled version mainly suffers from two issues. First, the
load/store offsets are registers, so extra ADD instructions are needed to
increment the offset on each iteration. In contrast, the manually unrolled
code makes efficient use of immediate offsets and only needs one ADD to
adjust the base register at the end. Second, the alias (dependence) analysis
is overly conservative. The LOAD instruction of the next unrolled iteration
cannot be hoisted above the previous STORE instruction even though they
clearly do not alias. I suspect the failure of alias analysis is related to
the first issue of handling base and offset addresses. The .sched2 file shows
that the first loop body requires 57 cycles whereas the second one takes
50 cycles for arm9 (56 cycles vs. 34 cycles for XScale). It becomes even
worse for our VLIW port due to the longer latency of MUL and LOAD
instructions and the inability to fill all slots (120 cycles vs. 20
cycles).
tst.c
void Unroll( short s, int * restrict b_inout, int *restrict out)
{
int i;
for (i=0; i<64; i++)
{
b_inout[i] = b_inout[i] * s;
}
}
arm-elf-gcc tst.c -O2 -std=c99 -S -v -fdump-tree-all -da -mcpu=arm9
-funroll-loops
Unroll:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
mov r0, r0, asl #16
stmfd sp!, {r4, r5, r6}
mov r4, r1
mov r6, r0, asr #16
mov r5, #0
.L2:
ldr r1, [r4, r5]
add ip, r5, #4
mul r0, r6, r1
str r0, [r4, r5]
ldr r3, [r4, ip]
add r0, ip, #4
mul r2, r6, r3
str r2, [r4, ip]
ldr r1, [r4, r0]
add ip, r5, #12
mul r3, r6, r1
str r3, [r4, r0]
ldr r2, [r4, ip]
add r1, r5, #16
mul r3, r6, r2
str r3, [r4, ip]
ldr r0, [r4, r1]
add ip, r5, #20
mul r3, r6, r0
str r3, [r4, r1]
ldr r2, [r4, ip]
add r1, r5, #24
mul r0, r6, r2
str r0, [r4, ip]
ldr r3, [r4, r1]
add ip, r5, #28
mul r0, r6, r3
str r0, [r4, r1]
ldr r2, [r4, ip]
add r5, r5, #32
mul r3, r6, r2
cmp r5, #256
str r3, [r4, ip]
bne .L2
ldmfd sp!, {r4, r5, r6}
bx lr
.size Unroll, .-Unroll
tst2.c:
void ManualUnroll( short s, int * restrict b_inout, int *restrict out)
{
int i;
for (i=0; i<64;)
{
b_inout[i] = b_inout[i] * s;
i++;
b_inout[i] = b_inout[i] * s;
i++;
b_inout[i] = b_inout[i] * s;
i++;
b_inout[i] = b_inout[i] * s;
i++;
b_inout[i] = b_inout[i] * s;
i++;
b_inout[i] = b_inout[i] * s;
i++;
b_inout[i] = b_inout[i] * s;
i++;
b_inout[i] = b_inout[i] * s;
i++;
}
}
arm-elf-gcc tst2.c -O2 -std=c99 -S -mcpu=arm9
ManualUnroll:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
mov r0, r0, asl #16
stmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
mov sl, r1
mov r9, r0, asr #16
add fp, r1, #256
.L7:
ldr r3, [sl, #0]
ldr r2, [sl, #4]
ldr r1, [sl, #8]
ldr r0, [sl, #12]
ldr ip, [sl, #16]
add r4, sl, #20
ldmia r4, {r4, r5, r6} @ phole ldm
mul r7, r9, r3
mul r8, r9, r2
mul r3, r9, r1
mul r2, r9, r0
mul r1, r9, ip
mul r0, r9, r4
mul ip, r9, r5
mul r4, r9, r6
stmia sl, {r7, r8} @ phole stm
str r3, [sl, #8]
str r2, [sl, #12]
str r1, [sl, #16]
str r0, [sl, #20]
str ip, [sl, #24]
str r4, [sl, #28]
add sl, sl, #32
cmp sl, fp
bne .L7
ldmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
bx lr
.size ManualUnroll, .-ManualUnroll
.ident "GCC: (GNU) 4.4.0 20080530 (experimental)"
My ARM compiler is built with the following configuration:
CC="gcc -m32 -static" CFLAGS="-g"
RANLIB_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-ranlib"
AR_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-ar"
AS_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-as"
LD_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-ld"
../src/configure --prefix=/home/bmei/work/trunck-arm --enable-languages=c
--disable-nls --target=arm-elf --disable-shared
--with-mpfr=/projects/firepath/tools/team/packages/x86_64-rhel3-32/mpfr/2.3.0
--with-gmp=/projects/firepath/tools/team/packages/x86_64-rhel3-32/gmp/4.2.2
--disable-libssp
--
Summary: Inefficient loop unrolling
Product: gcc
Version: 4.4.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: bmei at broadcom dot com
GCC target triplet: arm-elf-gcc
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
@ 2009-05-20 13:20 ` ramana at gcc dot gnu dot org
2009-05-20 14:09 ` rguenth at gcc dot gnu dot org
` (18 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: ramana at gcc dot gnu dot org @ 2009-05-20 13:20 UTC (permalink / raw)
To: gcc-bugs
------- Comment #1 from ramana at gcc dot gnu dot org 2009-05-20 13:19 -------
Can be reproduced with trunk today.
--
ramana at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Ever Confirmed|0 |1
Last reconfirmed|0000-00-00 00:00:00 |2009-05-20 13:19:56
date| |
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
2009-05-20 13:20 ` [Bug rtl-optimization/36712] " ramana at gcc dot gnu dot org
@ 2009-05-20 14:09 ` rguenth at gcc dot gnu dot org
2009-05-20 14:14 ` ramana at gcc dot gnu dot org
` (17 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: rguenth at gcc dot gnu dot org @ 2009-05-20 14:09 UTC (permalink / raw)
To: gcc-bugs
------- Comment #2 from rguenth at gcc dot gnu dot org 2009-05-20 14:09 -------
I think there is no induction variable optimization on RTL anymore.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
2009-05-20 13:20 ` [Bug rtl-optimization/36712] " ramana at gcc dot gnu dot org
2009-05-20 14:09 ` rguenth at gcc dot gnu dot org
@ 2009-05-20 14:14 ` ramana at gcc dot gnu dot org
2009-05-20 14:17 ` bmei at broadcom dot com
` (16 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: ramana at gcc dot gnu dot org @ 2009-05-20 14:14 UTC (permalink / raw)
To: gcc-bugs
------- Comment #3 from ramana at gcc dot gnu dot org 2009-05-20 14:14 -------
There was a discussion thread here:
http://gcc.gnu.org/ml/gcc/2008-07/msg00037.html. One of the solutions that
Bingfeng was investigating was unrolling loops before ivopts, which can be
useful in certain cases.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (2 preceding siblings ...)
2009-05-20 14:14 ` ramana at gcc dot gnu dot org
@ 2009-05-20 14:17 ` bmei at broadcom dot com
2009-05-20 17:51 ` dje dot gcc at gmail dot com
` (15 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: bmei at broadcom dot com @ 2009-05-20 14:17 UTC (permalink / raw)
To: gcc-bugs
------- Comment #4 from bmei at broadcom dot com 2009-05-20 14:17 -------
I implemented a tree-level loop-unrolling pass in our private porting, which
takes advantage of later tree ivopt pass. It produces much better code than
rtl-level loop unrolling in such scenarios. Not sure whether should submit for
4.5.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (3 preceding siblings ...)
2009-05-20 14:17 ` bmei at broadcom dot com
@ 2009-05-20 17:51 ` dje dot gcc at gmail dot com
2009-05-21 8:38 ` bmei at broadcom dot com
` (14 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: dje dot gcc at gmail dot com @ 2009-05-20 17:51 UTC (permalink / raw)
To: gcc-bugs
------- Comment #5 from dje dot gcc at gmail dot com 2009-05-20 17:51 -------
Subject: Re: Inefficient loop unrolling
> I implemented a tree-level loop-unrolling pass in our private porting, which
> takes advantage of later tree ivopt pass. It produces much better code than
> rtl-level loop unrolling in such scenarios. Not sure whether should submit for
> 4.5.
Why "not sure"? The worst anyone can say is "no". Of course you
should submit it.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (4 preceding siblings ...)
2009-05-20 17:51 ` dje dot gcc at gmail dot com
@ 2009-05-21 8:38 ` bmei at broadcom dot com
2009-10-15 12:12 ` drow at gcc dot gnu dot org
` (13 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: bmei at broadcom dot com @ 2009-05-21 8:38 UTC (permalink / raw)
To: gcc-bugs
------- Comment #6 from bmei at broadcom dot com 2009-05-21 08:38 -------
I have only submitted small patches before. Adding a pass (which may need a new
command-line option and disabling the old rtl-level unrolling) seems to be a big
issue to me. I don't know what the procedure is.
My code also contains my own implementation of #pragma unroll. I need to clean
it up for the public patch.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (5 preceding siblings ...)
2009-05-21 8:38 ` bmei at broadcom dot com
@ 2009-10-15 12:12 ` drow at gcc dot gnu dot org
2010-01-25 21:10 ` froydnj at gcc dot gnu dot org
` (12 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: drow at gcc dot gnu dot org @ 2009-10-15 12:12 UTC (permalink / raw)
To: gcc-bugs
------- Comment #7 from drow at gcc dot gnu dot org 2009-10-15 12:12 -------
I really would like to see this submitted - at least as a starting point for
discussion. You don't need to do anything different than for a small patch; if
you've missed any steps, a reviewer will tell you.
Another problem I noticed is that if RTL loop unrolling unrolls a loop, only
the first iteration will use auto-inc patterns. This leads to silly omissions
in the assembly.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (6 preceding siblings ...)
2009-10-15 12:12 ` drow at gcc dot gnu dot org
@ 2010-01-25 21:10 ` froydnj at gcc dot gnu dot org
2010-02-04 11:12 ` rearnsha at gcc dot gnu dot org
` (11 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: froydnj at gcc dot gnu dot org @ 2010-01-25 21:10 UTC (permalink / raw)
To: gcc-bugs
------- Comment #8 from froydnj at gcc dot gnu dot org 2010-01-25 21:10 -------
First, something has gotten better; an arm-eabi gcc (-O2 -std=c99 -mcpu=arm9
-funroll-loops) from 20091209 gives:
Unroll:
@ Function supports interworking.
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
stmfd sp!, {r4, r5, r6, r7, r8}
add r8, r1, #256
.L2:
ldr ip, [r1, #0]
mov r7, r1
mul r2, ip, r0
str r2, [r7], #4
ldr r3, [r1, #4]
ldr r5, [r7, #4]
mul r6, r3, r0
str r6, [r7, #0]
ldr r4, [r1, #12]
ldr ip, [r1, #16]
add r2, r1, #20
ldmia r2, {r2, r3, r7} @ phole ldm
mul r6, r5, r0
mul r5, r4, r0
mul r4, ip, r0
mul ip, r2, r0
mul r2, r3, r0
mul r3, r7, r0
str r6, [r1, #8]
str r5, [r1, #12]
str r4, [r1, #16]
str ip, [r1, #20]
str r2, [r1, #24]
add r1, r1, #32
cmp r1, r8
str r3, [r1, #-4]
bne .L2
ldmfd sp!, {r4, r5, r6, r7, r8}
bx lr
.size Unroll, .-Unroll
.ident "GCC: (GNU) 4.5.0 20091209 (experimental)"
which, if not close to ManualUnroll from the first comment, is much better than
the initial example.
Second, the problem Daniel mentioned concerning auto-inc/dec not doing the
right thing is because of the cleverness of
loop-unroll.c:analyze_iv_to_split_insn. It breaks the code shape that
auto-inc/dec needs. (You can see its effects in the assembly above; the
spurious move to r7 at the top of the loop.) Even if you disable that bit of
RTL loop unrolling, you also need to disable the web pass so as to not really
break the code shape for auto-inc/dec and introduce spurious moves into the
RTL. Once you do that, you get:
Unroll:
@ Function supports interworking.
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
add ip, r1, #256
.L2:
ldr r2, [r1, #0]
mul r3, r2, r0
str r3, [r1], #4
ldr r2, [r1, #0]
mul r3, r2, r0
str r3, [r1], #4
ldr r2, [r1, #0]
mul r3, r2, r0
str r3, [r1], #4
ldr r2, [r1, #0]
mul r3, r2, r0
str r3, [r1], #4
ldr r2, [r1, #0]
mul r3, r2, r0
str r3, [r1], #4
ldr r2, [r1, #0]
mul r3, r2, r0
str r3, [r1], #4
ldr r2, [r1, #0]
mul r3, r2, r0
str r3, [r1], #4
ldr r2, [r1, #0]
mul r3, r2, r0
str r3, [r1], #4
cmp r1, ip
bne .L2
bx lr
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (7 preceding siblings ...)
2010-01-25 21:10 ` froydnj at gcc dot gnu dot org
@ 2010-02-04 11:12 ` rearnsha at gcc dot gnu dot org
2010-02-04 11:21 ` steven at gcc dot gnu dot org
` (10 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: rearnsha at gcc dot gnu dot org @ 2010-02-04 11:12 UTC (permalink / raw)
To: gcc-bugs
------- Comment #9 from rearnsha at gcc dot gnu dot org 2010-02-04 11:11 -------
(In reply to comment #8)
> ldr r2, [r1, #0]
> mul r3, r2, r0
> str r3, [r1], #4
> ldr r2, [r1, #0]
> mul r3, r2, r0
> str r3, [r1], #4
> ldr r2, [r1, #0]
[...]
Ugh, on a dual-issue core with load delay slots that code will REALLY suck.
There's almost nothing that can be dual issued and the loaded values are used
in the instruction immediately after the load.
--
rearnsha at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |rearnsha at gcc dot gnu dot
| |org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (8 preceding siblings ...)
2010-02-04 11:12 ` rearnsha at gcc dot gnu dot org
@ 2010-02-04 11:21 ` steven at gcc dot gnu dot org
2010-02-04 11:47 ` rguenth at gcc dot gnu dot org
` (9 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-04 11:21 UTC (permalink / raw)
To: gcc-bugs
------- Comment #10 from steven at gcc dot gnu dot org 2010-02-04 11:21 -------
I'm going to crack this bug.
--
steven at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
AssignedTo|unassigned at gcc dot gnu |steven at gcc dot gnu dot
|dot org |org
Status|NEW |ASSIGNED
Last reconfirmed|2009-05-20 13:19:56 |2010-02-04 11:21:25
date| |
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (9 preceding siblings ...)
2010-02-04 11:21 ` steven at gcc dot gnu dot org
@ 2010-02-04 11:47 ` rguenth at gcc dot gnu dot org
2010-02-04 14:55 ` steven at gcc dot gnu dot org
` (8 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: rguenth at gcc dot gnu dot org @ 2010-02-04 11:47 UTC (permalink / raw)
To: gcc-bugs
------- Comment #11 from rguenth at gcc dot gnu dot org 2010-02-04 11:47 -------
Also try the patches from PR42617 to see if they improve the pre-regalloc
scheduling.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (10 preceding siblings ...)
2010-02-04 11:47 ` rguenth at gcc dot gnu dot org
@ 2010-02-04 14:55 ` steven at gcc dot gnu dot org
2010-02-04 14:57 ` steven at gcc dot gnu dot org
` (7 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-04 14:55 UTC (permalink / raw)
To: gcc-bugs
------- Comment #12 from steven at gcc dot gnu dot org 2010-02-04 14:54 -------
With the patches from bug 42617 applied, I get the following:
.file "tst.c"
.text
.align 2
.global Unroll
.type Unroll, %function
Unroll:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
mov r0, r0, asl #16
stmfd sp!, {r4, r5, r6, r7, r8}
mov r0, r0, asr #16
add r8, r1, #256
.L2:
ldr ip, [r1, #0]
mov r7, r1
mul r2, ip, r0
str r2, [r7], #4
ldr r3, [r1, #4]
ldr r5, [r7, #4]
mul r6, r3, r0
str r6, [r7, #0]
ldr r4, [r1, #12]
ldr ip, [r1, #16]
add r2, r1, #20
ldmia r2, {r2, r3, r7} @ phole ldm
mul r6, r5, r0
mul r5, r4, r0
mul r4, ip, r0
mul ip, r2, r0
mul r2, r3, r0
mul r3, r7, r0
str r6, [r1, #8]
str r5, [r1, #12]
str r4, [r1, #16]
str ip, [r1, #20]
str r2, [r1, #24]
add r1, r1, #32
cmp r1, r8
str r3, [r1, #-4]
bne .L2
ldmfd sp!, {r4, r5, r6, r7, r8}
bx lr
.size Unroll, .-Unroll
.ident "GCC: (GNU) 4.5.0 20100204 (experimental) [trunk revision
156492]"
(flags: -std=c99 -mcpu=arm9 -O2 -funroll-loops)
This is good but not perfect.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (11 preceding siblings ...)
2010-02-04 14:55 ` steven at gcc dot gnu dot org
@ 2010-02-04 14:57 ` steven at gcc dot gnu dot org
2010-02-04 15:19 ` steven at gcc dot gnu dot org
` (6 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-04 14:57 UTC (permalink / raw)
To: gcc-bugs
------- Comment #13 from steven at gcc dot gnu dot org 2010-02-04 14:56 -------
With -fno-web, the patches from bug 42617 do not help and the output is the
same as that of comment #8 (second asm dump).
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (12 preceding siblings ...)
2010-02-04 14:57 ` steven at gcc dot gnu dot org
@ 2010-02-04 15:19 ` steven at gcc dot gnu dot org
2010-02-04 16:07 ` steven at gcc dot gnu dot org
` (5 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-04 15:19 UTC (permalink / raw)
To: gcc-bugs
------- Comment #14 from steven at gcc dot gnu dot org 2010-02-04 15:19 -------
Part of the problem comes from the way IVOPTS optimizes the memory access:
;; Generating RTL for gimple basic block 3
;; D.1814_10 = MEM[base: D.1846_29];
(insn 52 51 0 tst.c:6 (set (reg:SI 172 [ D.1814 ])
(mem:SI (reg:SI 179 [ ivtmp.20 ]) [2 *D.1846_29 S4 A32])) -1 (nil))
;; ivtmp.20_24 = ivtmp.20_25 + 4;
(insn 53 52 0 tst.c:6 (set (reg:SI 179 [ ivtmp.20 ])
(plus:SI (reg:SI 179 [ ivtmp.20 ])
(const_int 4 [0x4]))) -1 (nil))
;; MEM[base: D.1847_30, offset: 4294967292] = D.1816_13;
(insn 54 53 55 tst.c:6 (set (reg:SI 189)
(mult:SI (reg:SI 172 [ D.1814 ])
(reg:SI 180 [ pretmp.11 ]))) -1 (nil))
(insn 55 54 0 tst.c:6 (set (mem:SI (plus:SI (reg:SI 179 [ ivtmp.20 ])
(const_int -4 [0xfffffffffffffffc])) [2 *D.1847_30 S4 A32])
(reg:SI 189)) -1 (nil))
;; if (ivtmp.20_24 != D.1849_32)
(insn 57 55 58 tst.c:4 (set (reg:CC 24 cc)
(compare:CC (reg:SI 179 [ ivtmp.20 ])
(reg:SI 183 [ D.1849 ]))) -1 (nil))
(jump_insn 58 57 0 tst.c:4 (set (pc)
(if_then_else (ne (reg:CC 24 cc)
(const_int 0 [0x0]))
(label_ref 56)
(pc))) -1 (expr_list:REG_BR_PROB (const_int 9844 [0x2674])
(nil)))
This yields the sequence:
52 r172:SI=[r179:SI]
53 r179:SI=r179:SI+0x4
54 r189:SI=r172:SI*r180:SI
55 [r179:SI-0x4]=r189:SI
and we never get rid of this again. There is another bug related to this one:
bug 31849. The patch there probably needs dusting off, and then I'll see if it
improves things for this bug as well.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (13 preceding siblings ...)
2010-02-04 15:19 ` steven at gcc dot gnu dot org
@ 2010-02-04 16:07 ` steven at gcc dot gnu dot org
2010-02-05 13:33 ` steven at gcc dot gnu dot org
` (4 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-04 16:07 UTC (permalink / raw)
To: gcc-bugs
------- Comment #15 from steven at gcc dot gnu dot org 2010-02-04 16:06 -------
The patches for bug 31849 have been committed, it seems, and they don't help
for this case.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (14 preceding siblings ...)
2010-02-04 16:07 ` steven at gcc dot gnu dot org
@ 2010-02-05 13:33 ` steven at gcc dot gnu dot org
2010-02-05 13:58 ` rakdver at kam dot mff dot cuni dot cz
` (3 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-05 13:33 UTC (permalink / raw)
To: gcc-bugs
------- Comment #16 from steven at gcc dot gnu dot org 2010-02-05 13:33 -------
I'm trying to coerce IVOPTS into producing the following, optimal code in the
GIMPLE optimizers (without much luck, so far):
<bb 2>:
pretmp.11_26 = (int) s_11(D);
ivtmp.20_28 = (long unsigned int) b_inout_5(D);
D.1848_31 = (long unsigned int) b_inout_5(D);
D.1849_32 = D.1848_31 + 256;
<bb 3>:
# ivtmp.20_25 = PHI <ivtmp.20_24(4), ivtmp.20_28(2)>
D.1846_29 = (void *) ivtmp.20_25;
D.1814_10 = MEM[base: D.1846_29]{*D.1813};
D.1816_13 = D.1814_10 * pretmp.11_26;
D.1847_30 = (void *) ivtmp.20_25;
MEM[base: D.1847_30]{*D.1813} = D.1816_13;
ivtmp.20_24 = ivtmp.20_25 + 4;
if (ivtmp.20_24 != D.1849_32)
goto <bb 4>;
else
goto <bb 5>;
<bb 4>:
goto <bb 3>;
<bb 5>:
return;
If we can get the compiler to generate the above code in IVOPTS, then we should
get the same code as the hand-unrolled example (although it also looks like a
bit more scheduler look-ahead freedom is necessary).
I asked Zdenek for help. For ARM9 he found the following problem:
/quote/
Address costs:
index costs 6
cst + index costs 2
arm_arm_address_cost pretends that having reg + cst as an address is cheaper
than having reg by itself. Ivopts are happy to make this happen :-)
There used to be the same problem on x86, which was eventually fixed by making
address_cost reflect the real cost of addresses, rather than "which addressing
modes should CSE prefer" metrics.
/quote/
Of course, all cases where the cost of "index" > the cost of "index + cst" result in
code like that of an unpatched compiler. But if I adjust the cost to make
"index" cost only 1 or 2, I get this:
<bb 2>:
pretmp.11_26 = (int) s_11(D);
ivtmp.25_28 = (long unsigned int) b_inout_5(D);
<bb 3>:
# i_20 = PHI <i_14(4), 0(2)>
# ivtmp.25_25 = PHI <ivtmp.25_24(4), ivtmp.25_28(2)>
D.1846_29 = (void *) ivtmp.25_25;
D.1814_10 = MEM[base: D.1846_29]{*D.1813};
D.1816_13 = D.1814_10 * pretmp.11_26;
D.1847_30 = (void *) ivtmp.25_25;
MEM[base: D.1847_30]{*D.1813} = D.1816_13;
ivtmp.25_24 = ivtmp.25_25 + 4;
i_14 = i_20 + 1;
if (i_14 != 64)
goto <bb 4>;
else
goto <bb 5>;
<bb 4>:
goto <bb 3>;
<bb 5>:
return;
That is still not optimal: We get an extra IV for some reason.
--
steven at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |rakdver at gcc dot gnu dot
| |org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (15 preceding siblings ...)
2010-02-05 13:33 ` steven at gcc dot gnu dot org
@ 2010-02-05 13:58 ` rakdver at kam dot mff dot cuni dot cz
2010-02-05 14:02 ` steven at gcc dot gnu dot org
` (2 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: rakdver at kam dot mff dot cuni dot cz @ 2010-02-05 13:58 UTC (permalink / raw)
To: gcc-bugs
------- Comment #17 from rakdver at kam dot mff dot cuni dot cz 2010-02-05 13:58 -------
Subject: Re: Inefficient loop unrolling
> But if I adjust the cost to make
> "index" cost only 1 or 2, I get this:
>
> <bb 2>:
> pretmp.11_26 = (int) s_11(D);
> ivtmp.25_28 = (long unsigned int) b_inout_5(D);
>
> <bb 3>:
> # i_20 = PHI <i_14(4), 0(2)>
> # ivtmp.25_25 = PHI <ivtmp.25_24(4), ivtmp.25_28(2)>
> D.1846_29 = (void *) ivtmp.25_25;
> D.1814_10 = MEM[base: D.1846_29]{*D.1813};
> D.1816_13 = D.1814_10 * pretmp.11_26;
> D.1847_30 = (void *) ivtmp.25_25;
> MEM[base: D.1847_30]{*D.1813} = D.1816_13;
> ivtmp.25_24 = ivtmp.25_25 + 4;
> i_14 = i_20 + 1;
> if (i_14 != 64)
> goto <bb 4>;
> else
> goto <bb 5>;
>
> <bb 4>:
> goto <bb 3>;
>
> <bb 5>:
> return;
what configuration and flags are you using? For me, replacing
arm_arm_address_cost with return 1 resulted
in
<bb 3>:
# ivtmp.25_25 = PHI <ivtmp.25_24(3), ivtmp.25_28(2)>
D.1816_29 = (void *) ivtmp.25_25;
D.1784_10 = MEM[base: D.1816_29]{*D.1783};
D.1786_13 = pretmp.11_26 * D.1784_10;
MEM[base: D.1816_29]{*D.1783} = D.1786_13;
ivtmp.25_24 = ivtmp.25_25 + 4;
if (ivtmp.25_24 != D.1819_32)
Zdenek
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug rtl-optimization/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (16 preceding siblings ...)
2010-02-05 13:58 ` rakdver at kam dot mff dot cuni dot cz
@ 2010-02-05 14:02 ` steven at gcc dot gnu dot org
2010-02-05 14:58 ` [Bug target/36712] " steven at gcc dot gnu dot org
2010-02-12 22:46 ` steven at gcc dot gnu dot org
19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-05 14:02 UTC (permalink / raw)
To: gcc-bugs
------- Comment #18 from steven at gcc dot gnu dot org 2010-02-05 14:02 -------
I used "-O2 -std=c99 -mcpu=arm9 -funroll-loops" and I manually hacked the cost
in GDB to change from:
Address costs:
index costs 6
cst + index costs 2
...to this...:
Address costs:
index costs 1
cst + index costs 2
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug target/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (17 preceding siblings ...)
2010-02-05 14:02 ` steven at gcc dot gnu dot org
@ 2010-02-05 14:58 ` steven at gcc dot gnu dot org
2010-02-12 22:46 ` steven at gcc dot gnu dot org
19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-05 14:58 UTC (permalink / raw)
To: gcc-bugs
------- Comment #19 from steven at gcc dot gnu dot org 2010-02-05 14:58 -------
Interesting: for " -march=armv5te -mthumb" the code after IVOPTS is the perfect
code (from e.g. comment #17). The reason is that the address cost function for
Thumb (arm_thumb_address_cost) is of course not the same as that for ARM
(arm_arm_address_cost) so the correct code comes out automatically:
Address costs:
index costs 1
cst + index costs 1
In arm_arm_address_cost, "index" as a naked REG is the most expensive of all
addresses. In arm_thumb_address_cost it is the cheapest.
In conclusion: This is a target cost problem, not a generic rtl-optimization
bug. Adjusting as such. An ARM maintainer will have to figure out a better
implementation of arm_arm_address_cost.
--
steven at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
AssignedTo|steven at gcc dot gnu dot |unassigned at gcc dot gnu
|org |dot org
Status|ASSIGNED |NEW
Component|rtl-optimization |target
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Bug target/36712] Inefficient loop unrolling
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
` (18 preceding siblings ...)
2010-02-05 14:58 ` [Bug target/36712] " steven at gcc dot gnu dot org
@ 2010-02-12 22:46 ` steven at gcc dot gnu dot org
19 siblings, 0 replies; 21+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-12 22:46 UTC (permalink / raw)
To: gcc-bugs
------- Comment #20 from steven at gcc dot gnu dot org 2010-02-12 22:46 -------
Bug 27016 is another example of poor IVOPTS due to poor choices in
arm_arm_address_cost
--
steven at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
BugsThisDependsOn| |27016
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
^ permalink raw reply [flat|nested] 21+ messages in thread
end of thread, other threads:[~2010-02-12 22:46 UTC | newest]
Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-07-03 9:07 [Bug rtl-optimization/36712] New: Inefficient loop unrolling bmei at broadcom dot com
2009-05-20 13:20 ` [Bug rtl-optimization/36712] " ramana at gcc dot gnu dot org
2009-05-20 14:09 ` rguenth at gcc dot gnu dot org
2009-05-20 14:14 ` ramana at gcc dot gnu dot org
2009-05-20 14:17 ` bmei at broadcom dot com
2009-05-20 17:51 ` dje dot gcc at gmail dot com
2009-05-21 8:38 ` bmei at broadcom dot com
2009-10-15 12:12 ` drow at gcc dot gnu dot org
2010-01-25 21:10 ` froydnj at gcc dot gnu dot org
2010-02-04 11:12 ` rearnsha at gcc dot gnu dot org
2010-02-04 11:21 ` steven at gcc dot gnu dot org
2010-02-04 11:47 ` rguenth at gcc dot gnu dot org
2010-02-04 14:55 ` steven at gcc dot gnu dot org
2010-02-04 14:57 ` steven at gcc dot gnu dot org
2010-02-04 15:19 ` steven at gcc dot gnu dot org
2010-02-04 16:07 ` steven at gcc dot gnu dot org
2010-02-05 13:33 ` steven at gcc dot gnu dot org
2010-02-05 13:58 ` rakdver at kam dot mff dot cuni dot cz
2010-02-05 14:02 ` steven at gcc dot gnu dot org
2010-02-05 14:58 ` [Bug target/36712] " steven at gcc dot gnu dot org
2010-02-12 22:46 ` steven at gcc dot gnu dot org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).