public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/17593] New: Overaggressive Speculative Code Motion
@ 2004-09-21 20:53 steinmtz at us dot ibm dot com
2004-09-21 20:56 ` [Bug target/17593] Over Aggressive " steinmtz at us dot ibm dot com
2004-09-22 3:54 ` pinskia at gcc dot gnu dot org
0 siblings, 2 replies; 3+ messages in thread
From: steinmtz at us dot ibm dot com @ 2004-09-21 20:53 UTC (permalink / raw)
To: gcc-bugs
Description:
This example comes from procedure "mainGtU" in blocksort.c which is part of
bzip2-1.0.2
The procedure is not scheduled because it exceeds the threshold allowed
for "max-sched-region-blocks" which defaults to 10. If we use --param to bump
that value to 30, then gcc will schedule this procedure.
The bad news is that once scheduled, it performs worse than if it were not
scheduled. The main reason appears to be that overaggressive speculative
motions are taking place within the "do" loop. Many of the adds/shifts
required in the loop are moved prior to the first exit condition. Thus, if
the loop exits early, we are computing a lot of results that will never get
used.
In addition, the aggressive code motion introduces much higher register
pressure causing the procedure to use non-volatile registers and in turn
adding save-restore code to the prolog and epilog.
Testcase:
typedef unsigned char Bool;
typedef unsigned int UInt32;
typedef unsigned char UChar;
typedef unsigned short UInt16;
typedef int Int32;
#define True ((Bool)1)
#define False ((Bool)0)
#define AssertD(cond,msg) /* */
Bool mainGtU ( UInt32 i1,
UInt32 i2,
UChar* block,
UInt16* quadrant,
UInt32 nblock,
Int32* budget )
{
Int32 k;
UChar c1, c2;
UInt16 s1, s2;
AssertD ( i1 != i2, "mainGtU" );
/* 1 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
i1++; i2++;
/* 2 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
i1++; i2++;
/* 3 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
i1++; i2++;
/* 4 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
i1++; i2++;
/* 5 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
i1++; i2++;
/* 6 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
i1++; i2++;
/* 7 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
i1++; i2++;
/* 8 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
i1++; i2++;
/* 9 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
i1++; i2++;
/* 10 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
i1++; i2++;
/* 11 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
i1++; i2++;
/* 12 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
i1++; i2++;
k = nblock + 8;
do {
/* 1 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
s1 = quadrant[i1]; s2 = quadrant[i2];
if (s1 != s2) return (s1 > s2);
i1++; i2++;
/* 2 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
s1 = quadrant[i1]; s2 = quadrant[i2];
if (s1 != s2) return (s1 > s2);
i1++; i2++;
/* 3 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
s1 = quadrant[i1]; s2 = quadrant[i2];
if (s1 != s2) return (s1 > s2);
i1++; i2++;
/* 4 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
s1 = quadrant[i1]; s2 = quadrant[i2];
if (s1 != s2) return (s1 > s2);
i1++; i2++;
/* 5 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
s1 = quadrant[i1]; s2 = quadrant[i2];
if (s1 != s2) return (s1 > s2);
i1++; i2++;
/* 6 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
s1 = quadrant[i1]; s2 = quadrant[i2];
if (s1 != s2) return (s1 > s2);
i1++; i2++;
/* 7 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
s1 = quadrant[i1]; s2 = quadrant[i2];
if (s1 != s2) return (s1 > s2);
i1++; i2++;
/* 8 */
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
s1 = quadrant[i1]; s2 = quadrant[i2];
if (s1 != s2) return (s1 > s2);
i1++; i2++;
if (i1 >= nblock) i1 -= nblock;
if (i2 >= nblock) i2 -= nblock;
k -= 8;
(*budget)--;
}
while (k >= 0);
return False;
}
Assembly for non-schedule case:
Using gcc 4.0:
gcc -c -m32 -O2 -mcpu=power4 bzip2_mainGtU.c
mainGtU:
lbzx 0,3,5
mr 11,3
lbzx 9,5,4
stwu 1,-16(1)
cmpw 7,0,9
stw 31,12(1)
subfc 3,0,9
subfe 3,3,3
neg 3,3
beq- 7,.L85
.L4:
lwz 31,12(1)
addi 1,1,16
blr
.L85:
add 10,11,5
add 12,4,5
lbz 0,1(10)
lbz 9,1(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,2(10)
lbz 9,2(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,3(10)
lbz 9,3(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,4(10)
lbz 9,4(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,5(10)
lbz 9,5(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,6(10)
lbz 9,6(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,7(10)
lbz 9,7(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,8(10)
lbz 9,8(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,9(10)
lbz 9,9(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,10(10)
lbz 9,10(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,11(10)
lbz 9,11(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
addi 10,11,12
addi 4,4,12
addi 31,7,8
li 12,0
.L27:
lbzx 3,5,10
lbzx 0,5,4
cmpw 7,3,0
bne- 7,.L81
slwi 0,10,1
slwi 9,4,1
lhzx 3,6,0
lhzx 0,9,6
cmpw 7,3,0
bne- 7,.L81
addi 11,10,1
addi 9,4,1
lbzx 3,5,11
lbzx 0,5,9
cmpw 7,3,0
bne- 7,.L81
slwi 0,11,1
slwi 9,9,1
lhzx 3,6,0
lhzx 0,9,6
cmpw 7,3,0
bne- 7,.L81
addi 11,10,2
addi 9,4,2
lbzx 3,5,11
lbzx 0,5,9
cmpw 7,3,0
bne- 7,.L81
slwi 0,11,1
slwi 9,9,1
lhzx 3,6,0
lhzx 0,9,6
cmpw 7,3,0
bne- 7,.L81
addi 11,10,3
addi 9,4,3
lbzx 3,5,11
lbzx 0,5,9
cmpw 7,3,0
bne- 7,.L81
slwi 0,11,1
slwi 9,9,1
lhzx 3,6,0
lhzx 0,9,6
cmpw 7,3,0
bne- 7,.L81
addi 11,10,4
addi 9,4,4
lbzx 3,5,11
lbzx 0,5,9
cmpw 7,3,0
bne- 7,.L81
slwi 0,11,1
slwi 9,9,1
lhzx 3,6,0
lhzx 0,9,6
cmpw 7,3,0
bne- 7,.L81
addi 11,10,5
addi 9,4,5
lbzx 3,5,11
lbzx 0,5,9
cmpw 7,3,0
bne- 7,.L81
slwi 0,11,1
slwi 9,9,1
lhzx 3,6,0
lhzx 0,9,6
cmpw 7,3,0
bne- 7,.L81
addi 11,10,6
addi 9,4,6
lbzx 3,5,11
lbzx 0,5,9
cmpw 7,3,0
bne- 7,.L81
slwi 0,11,1
slwi 9,9,1
lhzx 3,6,0
lhzx 0,9,6
cmpw 7,3,0
bne- 7,.L81
addi 11,10,7
addi 9,4,7
lbzx 3,5,11
lbzx 0,5,9
cmpw 7,3,0
bne- 7,.L81
slwi 0,11,1
slwi 9,9,1
lhzx 3,6,0
lhzx 0,9,6
cmpw 7,3,0
bne- 7,.L81
addi 10,10,8
addi 4,4,8
cmplw 7,7,10
bgt- 7,.L60
subf 10,7,10
.L60:
cmplw 7,7,4
bgt- 7,.L62
subf 4,7,4
.L62:
addi 12,12,-8
lwz 9,0(8)
add. 0,31,12
addi 9,9,-1
stw 9,0(8)
bge+ 0,.L27
li 3,0
b .L4
.L81:
subfc 3,3,0
subfe 3,3,3
neg 3,3
b .L4
Assembly for scheduled case:
Using gcc 4.0:
gcc -c -m32 -O2 -mcpu=power4 --param max-sched-region-blocks=30
bzip2_mainGtU.c
See notes within code:
mainGtU:
mflr 0
stwu 1,-80(1)
mr 11,3
lbzx 9,5,4
stw 0,84(1)
lbzx 0,3,5
stw 14,8(1) - register save code due to increased reg pressure
cmpw 7,0,9
stw 15,12(1) .
subfc 3,0,9
subfe 3,3,3
neg 3,3
stw 16,16(1) .
stw 17,20(1) .
stw 18,24(1) .
stw 19,28(1) .
stw 20,32(1) .
stw 21,36(1) .
stw 22,40(1) .
stw 23,44(1) .
stw 24,48(1) .
stw 25,52(1) .
stw 26,56(1) .
stw 27,60(1) .
stw 28,64(1) .
stw 29,68(1) .
stw 30,72(1) .
stw 31,76(1) --------------------------------------------------
beq- 7,.L85
.L4:
lwz 0,84(1)
lwz 14,8(1) - register restore code due to increased reg pressure
lwz 15,12(1) .
mtlr 0
lwz 16,16(1) .
lwz 17,20(1) .
lwz 18,24(1) .
lwz 19,28(1) .
lwz 20,32(1) .
lwz 21,36(1) .
lwz 22,40(1) .
lwz 23,44(1) .
lwz 24,48(1) .
lwz 25,52(1) .
lwz 26,56(1) .
lwz 27,60(1) .
lwz 28,64(1) .
lwz 29,68(1) .
lwz 30,72(1) .
lwz 31,76(1) -----------------------------------------------------
addi 1,1,80
blr
.L85:
add 10,11,5
add 12,4,5
lbz 0,1(10)
lbz 9,1(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,2(10)
lbz 9,2(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,3(10)
lbz 9,3(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,4(10)
lbz 9,4(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,5(10)
lbz 9,5(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,6(10)
lbz 9,6(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,7(10)
lbz 9,7(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,8(10)
lbz 9,8(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,9(10)
lbz 9,9(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,10(10)
lbz 9,10(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
lbz 0,11(10)
lbz 9,11(12)
cmpw 7,0,9
subfc 3,0,9
subfe 3,3,3
neg 3,3
bne+ 7,.L4
addi 11,11,12
addi 4,4,12
addi 22,7,8
li 23,0
.L27: - start of "do" loop.
lbzx 3,5,11
addi 9,11,1 - many of the add/shifts in loop get
addi 10,4,1 . speculatively moved here. They cause the
lbzx 0,5,4 . pathlength of the procedure to increase in
addi 29,11,2 . the case where the loop exits early.
addi 24,4,2 .
addi 30,11,3 . They are also the cause of the increased
addi 25,4,3 . register pressure that is causing the reg
cmpw 7,3,0 . allocator to resort to the use of non-volatiles
slwi 28,11,1 .
slwi 27,4,1 .
slwi 21,9,1 . Opportunities exist near the bottom of the
slwi 20,10,1 . loop to perform some of these in parallel
slwi 19,29,1 . with other instructions rather than increase
slwi 18,24,1 . the path length here.
slwi 17,30,1 .
slwi 16,25,1 .
addi 31,11,4 .
addi 26,4,4 .
bne- 7,.L81 - first possible loop exit.
lhzx 3,28,6
slwi 15,31,1
slwi 14,26,1
lhzx 0,27,6
cmpw 7,3,0
bne- 7,.L81
lbzx 3,5,9
addi 12,11,5
addi 27,4,5
lbzx 0,5,10
cmpw 7,3,0
bne- 7,.L81
slwi 0,12,1
lhzx 3,21,6
mtctr 0
slwi 0,27,1
mtlr 0
lhzx 0,20,6
cmpw 7,3,0
bne- 7,.L81
lbzx 3,5,29
addi 10,11,6
addi 28,4,6
lbzx 0,5,24
cmpw 7,3,0
bne- 7,.L81
lhzx 3,19,6
slwi 24,10,1
slwi 21,28,1
lhzx 0,18,6
cmpw 7,3,0
bne- 7,.L81
lbzx 3,5,30
addi 9,11,7
addi 29,4,7
lbzx 0,5,25
cmpw 7,3,0
bne- 7,.L81
lhzx 3,17,6
slwi 30,9,1
slwi 25,29,1
lhzx 0,16,6
cmpw 7,3,0
bne- 7,.L81
lbzx 3,5,31
addi 11,11,8
addi 23,23,-8
lbzx 0,5,26
addi 4,4,8
cmplw 6,7,11
add. 31,22,23
cmplw 1,7,4
cmpw 7,3,0
bne- 7,.L81
lhzx 3,15,6
lhzx 0,14,6
cmpw 7,3,0
bne- 7,.L81
lbzx 3,5,12
lbzx 0,5,27
cmpw 7,3,0
bne- 7,.L81
mfctr 0
mflr 31
lhzx 3,6,0
lhzx 0,31,6
cmpw 7,3,0
bne- 7,.L81
lbzx 3,5,10
lbzx 0,5,28
cmpw 7,3,0
bne- 7,.L81
lhzx 3,24,6
lhzx 0,21,6
cmpw 7,3,0
bne- 7,.L81
lbzx 3,5,9
lbzx 0,5,29
cmpw 7,3,0
bne- 7,.L81
lhzx 3,30,6
lhzx 0,25,6
cmpw 7,3,0
bne- 7,.L81
bgt- 6,.L60
subf 11,7,11
.L60:
bgt- 1,.L62
subf 4,7,4
.L62:
lwz 9,0(8)
addi 9,9,-1
stw 9,0(8)
bge+ 0,.L27
li 3,0
b .L4
.L81:
subfc 3,3,0
subfe 3,3,3
neg 3,3
b .L4
--
Summary: Overaggressive Speculative Code Motion
Product: gcc
Version: 4.0.0
Status: UNCONFIRMED
Severity: enhancement
Priority: P1
Component: target
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: steinmtz at us dot ibm dot com
CC: gcc-bugs at gcc dot gnu dot org,steinmtz at us dot ibm
dot com
GCC build triplet: powerpc64-linux
GCC host triplet: powerpc64-linux
GCC target triplet: powerpc64-linux
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17593
^ permalink raw reply [flat|nested] 3+ messages in thread
* [Bug target/17593] Over Aggressive Speculative Code Motion
2004-09-21 20:53 [Bug target/17593] New: Overaggressive Speculative Code Motion steinmtz at us dot ibm dot com
@ 2004-09-21 20:56 ` steinmtz at us dot ibm dot com
2004-09-22 3:54 ` pinskia at gcc dot gnu dot org
1 sibling, 0 replies; 3+ messages in thread
From: steinmtz at us dot ibm dot com @ 2004-09-21 20:56 UTC (permalink / raw)
To: gcc-bugs
--
What |Removed |Added
----------------------------------------------------------------------------
Summary|Overaggressive Speculative |Over Aggressive Speculative
|Code Motion |Code Motion
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17593
^ permalink raw reply [flat|nested] 3+ messages in thread
* [Bug target/17593] Over Aggressive Speculative Code Motion
2004-09-21 20:53 [Bug target/17593] New: Overaggressive Speculative Code Motion steinmtz at us dot ibm dot com
2004-09-21 20:56 ` [Bug target/17593] Over Aggressive " steinmtz at us dot ibm dot com
@ 2004-09-22 3:54 ` pinskia at gcc dot gnu dot org
1 sibling, 0 replies; 3+ messages in thread
From: pinskia at gcc dot gnu dot org @ 2004-09-22 3:54 UTC (permalink / raw)
To: gcc-bugs
------- Additional Comments From pinskia at gcc dot gnu dot org 2004-09-22 03:54 -------
Confirmed.
--
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Ever Confirmed| |1
Last reconfirmed|0000-00-00 00:00:00 |2004-09-22 03:54:39
date| |
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17593
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2004-09-22 3:54 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-09-21 20:53 [Bug target/17593] New: Overaggressive Speculative Code Motion steinmtz at us dot ibm dot com
2004-09-21 20:56 ` [Bug target/17593] Over Aggressive " steinmtz at us dot ibm dot com
2004-09-22 3:54 ` pinskia at gcc dot gnu dot org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).