public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/42972] New: Very bad
@ 2010-02-05 11:13 steven at gcc dot gnu dot org
2010-02-05 12:43 ` [Bug tree-optimization/42972] Very bad bit field code steven at gcc dot gnu dot org
` (4 more replies)
0 siblings, 5 replies; 8+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-05 11:13 UTC (permalink / raw)
To: gcc-bugs
Taken from http://hardwarebug.org/2010/01/30/bit-field-badness/
-----------------------------------
struct bf1_31 {
unsigned a:1;
unsigned b:31;
};
void func(struct bf1_31 *p, int n, int a)
{
int i = 0;
do {
if (p[i].a)
p[i].b += a;
} while (++i < n);
}
-----------------------------------
GCC produces this dreadful code for arm-elf with options -march=armv5te -O3:
.file "t.c"
.text
.align 2
.global func
.type func, %function
func:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
mov r3, #0
str r4, [sp, #-4]!
.L3:
ldrb ip, [r0, #0] @ zero_extendqisi2
add r3, r3, #1
tst ip, #1
ldrne ip, [r0, #0]
addne r4, r2, ip, lsr #1
andne ip, ip, #1
orrne ip, ip, r4, asl #1
strne ip, [r0, #0]
cmp r3, r1
add r0, r0, #4
blt .L3
ldmfd sp!, {r4}
bx lr
.size func, .-func
.ident "GCC: (GNU) 4.5.0 20100204 (experimental) [trunk revision
156492]"
What is wrong with this code:
1) The same value is loaded from memory twice (from [r0, #0])
2) Mask/shift/or operations are used where a simple shifted add would suffice
3) Write-back addressing is not used
4) The loop control counts up and compares instead of counting down
(Re. 4: The loop is clearly invertible. Does GCC fail to see this? Or is this
optimization just not implemented?)
Hand-optimized assembly would look like this according to Hardwarebug:
func:
ldr r3, [r0], #4
tst r3, #1
add r3, r3, r2, lsl #1
strne r3, [r0, #-4]
subs r1, r1, #1
bgt func
bx lr
Best compiler available produces this:
func:
mov r3, #0
push {r4, lr}
loop:
ldr ip, [r0, r3, lsl #2]
tst ip, #1
addne ip, ip, r2, lsl #1
strne ip, [r0, r3, lsl #2]
add r3, r3, #1
cmp r3, r1
blt loop
pop {r4, pc}
Part of the problem seems to be that the redundant load is not exposed in
GIMPLE (altough the RTL optimizers should also be able to see it). The
.139t.optimized dump looks like this:
1 ;; Function func (func)
2
3 func (struct bf1_31 * p, int n, int a)
4 {
5 long unsigned int ivtmp.16;
6 int i;
7 <unnamed-unsigned:31> D.1973;
8 unsigned int D.1972;
9 int D.1971;
10 int D.1970;
11 <unnamed-unsigned:31> D.1969;
12 unsigned char D.1966;
13 unsigned char D.1965;
14 struct bf1_31 * D.1964;
15
16 <bb 2>:
17 ivtmp.16_30 = (long unsigned int) p_5(D);
18
19 <bb 3>:
20 # i_1 = PHI <0(2), i_21(5)>
21 # ivtmp.16_32 = PHI <ivtmp.16_30(2), ivtmp.16_31(5)>
22 D.1964_38 = (struct bf1_31 *) ivtmp.16_32;
23 D.1965_7 = BIT_FIELD_REF <*D.1964_38, 8, 0>;
24 D.1966_8 = D.1965_7 & 1;
25 if (D.1966_8 != 0)
26 goto <bb 4>;
27 else
28 goto <bb 5>;
29
30 <bb 4>:
31 D.1969_15 = D.1964_38->b;
32 D.1970_16 = (int) D.1969_15;
33 D.1971_18 = a_17(D) + D.1970_16;
34 D.1972_19 = (unsigned int) D.1971_18;
35 D.1973_20 = (<unnamed-unsigned:31>) D.1972_19;
36 D.1964_38->b = D.1973_20;
37
38 <bb 5>:
39 i_21 = i_1 + 1;
40 ivtmp.16_31 = ivtmp.16_32 + 4;
41 if (i_21 < n_22(D))
42 goto <bb 3>;
43 else
44 goto <bb 6>;
45
46 <bb 6>:
47 return;
48
49 }
There are two loads from *D.1964_38 at line 23 and line 31, but one is a
BIT_FIELD_REF and the other is an INDIRECT_REF, so the redundant load is not
noticed.
RTL PRE fails to kill the redundant load because the expressions for the loads
are not the same:
Index 0 (hash value 10)
(zero_extend:SI (mem/s:QI (reg:SI 158 [ ivtmp.16 ]) [0+0 S1 A32]))
Index 3 (hash value 7)
(mem/s:SI (reg:SI 158 [ ivtmp.16 ]) [0+0 S4 A32])
CSE also does not see the redundant load although it could notice it:
* It sees the path:
;; Following path with 17 sets: 3 4
* It sees the first load:
(insn 23 22 24 3 t.c:10 (set (reg:SI 169)
(zero_extend:SI (mem/s:QI (reg:SI 158 [ ivtmp.16 ]) [0+0 S1 A32]))) 148
{*arm_zero_extendqisi2} (nil))
* It sees the second load:
(insn 31 30 32 4 t.c:11 (set (reg:SI 174)
(mem/s:SI (reg:SI 158 [ ivtmp.16 ]) [0+0 S4 A32])) 166
{*arm_movsi_insn} (nil))
The problem here probably is the same as for RTL PRE, that the zero_extend
hides the redundancy.
--
Summary: Very bad
Product: gcc
Version: 4.5.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: steven at gcc dot gnu dot org
GCC target triplet: arm-elf
BugsThisDependsOn: 16996
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42972
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug tree-optimization/42972] Very bad bit field code
2010-02-05 11:13 [Bug tree-optimization/42972] New: Very bad steven at gcc dot gnu dot org
@ 2010-02-05 12:43 ` steven at gcc dot gnu dot org
2010-02-05 12:46 ` [Bug middle-end/42972] " steven at gcc dot gnu dot org
` (3 subsequent siblings)
4 siblings, 0 replies; 8+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-05 12:43 UTC (permalink / raw)
To: gcc-bugs
------- Comment #1 from steven at gcc dot gnu dot org 2010-02-05 12:42 -------
In the .expand dump there is already something funny about the code generated
for the bit field expressions.
For example the code generated for this:
D.1966_8 = D.1965_7 & 1;
if (D.1966_8 != 0)
TER will perform the forward substitution:
D.1966_8 replace with --> D.1966_8 = D.1965_7 & 1;
The expanders generate the following code for this sequence:
;; if (D.1966_8 != 0)
(insn 23 22 24 t.c:10 (set (reg:SI 169)
(zero_extend:SI (mem/s:QI (reg/f:SI 164 [ D.1964 ]) [0+0 S1 A32]))) -1
(nil))
(insn 24 23 25 t.c:10 (set (reg:QI 168)
(subreg:QI (reg:SI 169) 0)) -1 (nil))
(insn 25 24 26 t.c:10 (set (reg:SI 170)
(and:SI (subreg:SI (reg:QI 168) 0)
(const_int 1 [0x1]))) -1 (nil))
(insn 26 25 27 t.c:10 (set (reg:QI 171)
(subreg:QI (reg:SI 170) 0)) -1 (nil))
(insn 27 26 28 t.c:10 (set (reg:SI 172)
(and:SI (subreg:SI (reg:QI 171) 0)
(const_int 255 [0xff]))) -1 (nil))
(insn 28 27 29 t.c:10 (set (reg:CC 24 cc)
(compare:CC (reg:SI 172)
(const_int 0 [0x0]))) -1 (nil))
(jump_insn 29 28 0 t.c:10 (set (pc)
(if_then_else (eq (reg:CC 24 cc)
(const_int 0 [0x0]))
(label_ref 0)
(pc))) -1 (expr_list:REG_BR_PROB (const_int 5000 [0x1388])
(nil)))
What are insn 26 and insn 27 for?
Then the update of p[i].b:
;; D.1964_38->b = D.1973_20;
(insn 31 30 32 t.c:11 (set (reg:SI 174)
(mem/s:SI (reg/f:SI 164 [ D.1964 ]) [0+0 S4 A32])) -1 (nil))
(insn 32 31 33 t.c:11 (set (reg:SI 173)
(lshiftrt:SI (reg:SI 174)
(const_int 1 [0x1]))) -1 (nil))
(insn 33 32 34 t.c:11 (set (reg:SI 175)
(plus:SI (reg/v:SI 167 [ a ])
(reg:SI 173))) -1 (nil))
(insn 34 33 35 t.c:11 (set (reg:SI 176)
(mem/s/j:SI (reg/f:SI 164 [ D.1964 ]) [0+0 S4 A32])) -1 (nil))
(insn 35 34 36 t.c:11 (set (reg:SI 177)
(and:SI (reg:SI 175)
(const_int 2147483647 [0x7fffffff]))) -1 (nil))
(insn 36 35 37 t.c:11 (set (reg:SI 178)
(and:SI (reg:SI 176)
(const_int 1 [0x1]))) -1 (nil))
(insn 37 36 38 t.c:11 (set (reg:SI 177)
(ashift:SI (reg:SI 177)
(const_int 1 [0x1]))) -1 (nil))
(insn 38 37 39 t.c:11 (set (reg:SI 176)
(ior:SI (reg:SI 177)
(reg:SI 178))) -1 (nil))
(insn 39 38 0 t.c:11 (set (mem/s/j:SI (reg/f:SI 164 [ D.1964 ]) [0+0 S4 A32])
(reg:SI 176)) -1 (nil))
Insns 32, 33, and 37 should use a shifted add instead. The mask of insn 35
looks redundant (the bit is shifted out in insn 37 iiuc) but it is not removed
before combine. Combine generates the shifted add, except that it shifts
"p[i]->b" one bit to the right, instead of "a" one bit left -- apparently not
noticing that everything is shifted back left again in insn 38:
(insn 31 30 32 4 t.c:11 (set (reg:SI 174)
(mem/s:SI (reg:SI 158 [ ivtmp.16 ]) [0+0 S4 A32])) 166
{*arm_movsi_insn}
(insn 33 32 35 4 t.c:11 (set (reg:SI 175)
(plus:SI (lshiftrt:SI (reg:SI 174)
(const_int 1 [0x1]))
(reg/v:SI 167 [ a ]))) 269 {*arith_shiftsi} (nil))
(insn 36 35 37 4 t.c:11 (set (reg:SI 178)
(and:SI (reg:SI 174)
(const_int 1 [0x1]))) 67 {*arm_andsi3_insn} (expr_list:REG_DEAD
(reg:SI 174)
(nil)))
(insn 38 37 39 4 t.c:11 (set (reg:SI 176)
(ior:SI (ashift:SI (reg:SI 175)
(const_int 1 [0x1]))
(reg:SI 178))) 269 {*arith_shiftsi} (expr_list:REG_DEAD (reg:SI
175)
(expr_list:REG_DEAD (reg:SI 178)
(nil))))
(insn 39 38 40 4 t.c:11 (set (mem/s/j:SI (reg:SI 158 [ ivtmp.16 ]) [0+0 S4
A32])
(reg:SI 176)) 166 {*arm_movsi_insn} (expr_list:REG_DEAD (reg:SI 176)
(nil)))
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42972
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug middle-end/42972] Very bad bit field code
2010-02-05 11:13 [Bug tree-optimization/42972] New: Very bad steven at gcc dot gnu dot org
2010-02-05 12:43 ` [Bug tree-optimization/42972] Very bad bit field code steven at gcc dot gnu dot org
@ 2010-02-05 12:46 ` steven at gcc dot gnu dot org
2010-02-05 13:17 ` rguenth at gcc dot gnu dot org
` (2 subsequent siblings)
4 siblings, 0 replies; 8+ messages in thread
From: steven at gcc dot gnu dot org @ 2010-02-05 12:46 UTC (permalink / raw)
To: gcc-bugs
------- Comment #2 from steven at gcc dot gnu dot org 2010-02-05 12:45 -------
This is both a tree-optimization and an rtl-optimization problem, so setting
component to "middle-end" (there is no generic "optimization" component
anymore).
--
steven at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Component|tree-optimization |middle-end
Ever Confirmed|0 |1
Last reconfirmed|0000-00-00 00:00:00 |2010-02-05 12:45:53
date| |
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42972
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug middle-end/42972] Very bad bit field code
2010-02-05 11:13 [Bug tree-optimization/42972] New: Very bad steven at gcc dot gnu dot org
2010-02-05 12:43 ` [Bug tree-optimization/42972] Very bad bit field code steven at gcc dot gnu dot org
2010-02-05 12:46 ` [Bug middle-end/42972] " steven at gcc dot gnu dot org
@ 2010-02-05 13:17 ` rguenth at gcc dot gnu dot org
2010-02-05 13:26 ` rguenth at gcc dot gnu dot org
2010-02-05 15:36 ` matz at gcc dot gnu dot org
4 siblings, 0 replies; 8+ messages in thread
From: rguenth at gcc dot gnu dot org @ 2010-02-05 13:17 UTC (permalink / raw)
To: gcc-bugs
------- Comment #3 from rguenth at gcc dot gnu dot org 2010-02-05 13:17 -------
On the (retired) mem-ref branch bitfield loads/stores were lowered very early
to read-extract-modify-write operations so the tree level would have optimized
this.
But of course people complained that architectures that can do bitfield
stores would be pessimized if we do not retain the funny BIT_FIELD_REFs
of memory.
Basically without some form of lowering the tree level is lost.
mem-ref branch lowered the fn to
i = 0;
<D.1562>:;
D.1564 = (long unsigned int) i;
D.1565 = D.1564 * 4;
D.1566 = p + D.1565;
MEML.0 = IMEM <unsigned int {2}, D.1566>;
D.1567 = BIT_FIELD_REF <MEML.0, 1, 0>;
if (D.1567 != 0) goto <D.1574>; else goto <D.1575>;
<D.1574>:;
D.1564 = (long unsigned int) i;
D.1565 = D.1564 * 4;
D.1566 = p + D.1565;
D.1564 = (long unsigned int) i;
D.1565 = D.1564 * 4;
D.1566 = p + D.1565;
MEML.1 = IMEM <unsigned int {2}, D.1566>;
D.1568 = BIT_FIELD_REF <MEML.1, 31, 1>;
D.1569 = (int) D.1568;
D.1570 = D.1569 + a;
D.1571 = (unsigned int) D.1570;
D.1572 = (<unnamed-unsigned:31>) D.1571;
MEML.2 = IMEM <unsigned int {2}, D.1566>;
MEML.2 = BIT_FIELD_EXPR <MEML.2, D.1572, 31, 1>;
IMEM <unsigned int {2}, D.1566> = MEML.2;
<D.1575>:;
i = i + 1;
if (i < n) goto <D.1562>; else goto <D.1563>;
<D.1563>:;
return;
so FRE sees the redundant load and we expand from
<bb 2>:
ivtmp.20_15 = (unsigned int *) p_5(D);
<bb 3>:
# ivtmp.20_13 = PHI <ivtmp.20_15(2), ivtmp.20_22(5)>
# i_1 = PHI <0(2), i_24(5)>
MEML.0_7 = IMEM <unsigned int {2}, ivtmp.20_13>;
D.1567_8 = BIT_FIELD_REF <MEML.0_7, 1, 0>;
if (D.1567_8 != 0)
goto <bb 4>;
else
goto <bb 5>;
<bb 4>:
D.1568_16 = BIT_FIELD_REF <MEML.0_7, 31, 1>;
D.1569_17 = (int) D.1568_16;
D.1570_19 = D.1569_17 + a_18(D);
D.1571_20 = (unsigned int) D.1570_19;
D.1572_21 = (<unnamed-unsigned:31>) D.1571_20;
MEML.2_23 = BIT_FIELD_EXPR <MEML.0_7, D.1572_21, 31, 1>;
IMEM <unsigned int {2}, ivtmp.20_13> = MEML.2_23;
<bb 5>:
i_24 = i_1 + 1;
ivtmp.20_22 = ivtmp.20_13 + 4;
if (i_24 < n_25(D))
goto <bb 3>;
else
goto <bb 6>;
<bb 6>:
return;
on x86_64 the generated code was
func:
.LFB2:
movl %edx, %r8d
xorl %ecx, %ecx
.p2align 4,,10
.p2align 3
.L3:
movl (%rdi), %eax
testb $1, %al
je .L2
movl %eax, %edx
shrl %eax
addl %r8d, %eax
andl $1, %edx
addl %eax, %eax
orl %eax, %edx
movl %edx, (%rdi)
.L2:
addl $1, %ecx
addq $4, %rdi
cmpl %esi, %ecx
jl .L3
rep
ret
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42972
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug middle-end/42972] Very bad bit field code
2010-02-05 11:13 [Bug tree-optimization/42972] New: Very bad steven at gcc dot gnu dot org
` (2 preceding siblings ...)
2010-02-05 13:17 ` rguenth at gcc dot gnu dot org
@ 2010-02-05 13:26 ` rguenth at gcc dot gnu dot org
2010-02-05 15:36 ` matz at gcc dot gnu dot org
4 siblings, 0 replies; 8+ messages in thread
From: rguenth at gcc dot gnu dot org @ 2010-02-05 13:26 UTC (permalink / raw)
To: gcc-bugs
------- Comment #4 from rguenth at gcc dot gnu dot org 2010-02-05 13:25 -------
(In reply to comment #1)
> In the .expand dump there is already something funny about the code generated
> for the bit field expressions.
>
> For example the code generated for this:
> D.1966_8 = D.1965_7 & 1;
> if (D.1966_8 != 0)
>
> TER will perform the forward substitution:
> D.1966_8 replace with --> D.1966_8 = D.1965_7 & 1;
Note that this dump is misleading. The expression is only _marked_ for
possible forwarding. It is the resposibility of the individual expanders
to do the expression lookup and in-place expansion.
And of course forwarded statements are still regularly expanded anyway
(well, to dead code) - that's probably what you see.
--
rguenth at gcc dot gnu dot org changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |matz at gcc dot gnu dot org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42972
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug middle-end/42972] Very bad bit field code
2010-02-05 11:13 [Bug tree-optimization/42972] New: Very bad steven at gcc dot gnu dot org
` (3 preceding siblings ...)
2010-02-05 13:26 ` rguenth at gcc dot gnu dot org
@ 2010-02-05 15:36 ` matz at gcc dot gnu dot org
4 siblings, 0 replies; 8+ messages in thread
From: matz at gcc dot gnu dot org @ 2010-02-05 15:36 UTC (permalink / raw)
To: gcc-bugs
------- Comment #5 from matz at gcc dot gnu dot org 2010-02-05 15:36 -------
The code for the if() looks sane on x86-64:
-----------------------------------------
;; if (D.2729_8 != 0)
(insn 16 15 17 pr42972.c:10 (set (reg:QI 87)
(mem/s:QI (reg/f:DI 82 [ D.2727 ]) [0 S1 A32])) -1 (nil))
(insn 17 16 18 pr42972.c:10 (parallel [
(set (reg:QI 86)
(and:QI (reg:QI 87)
(const_int 1 [0x1])))
(clobber (reg:CC 17 flags))
]) -1 (expr_list:REG_EQUAL (and:QI (mem/s:QI (reg/f:DI 82 [ D.2727 ])
[0 S1 A32])
(const_int 1 [0x1]))
(nil)))
(insn 18 17 19 pr42972.c:10 (set (reg:CCZ 17 flags)
(compare:CCZ (reg:QI 86)
(const_int 0 [0x0]))) -1 (nil))
(jump_insn 19 18 0 pr42972.c:10 (set (pc)
(if_then_else (eq (reg:CCZ 17 flags)
(const_int 0 [0x0]))
(label_ref 0)
(pc))) -1 (expr_list:REG_BR_PROB (const_int 5000 [0x1388])
(nil)))
---------------------------------
Btw, instructions marked for forwarding (by TER) are not expanded to useless
code, but not expanded at all (or better said, only at the point of use).
The back-and-forth between QImode and SImode on arm seems to stem from
register promotion trying really hard to work on only SImode regs.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42972
^ permalink raw reply [flat|nested] 8+ messages in thread
[parent not found: <bug-42972-4@http.gcc.gnu.org/bugzilla/>]
end of thread, other threads:[~2021-07-20 23:59 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-02-05 11:13 [Bug tree-optimization/42972] New: Very bad steven at gcc dot gnu dot org
2010-02-05 12:43 ` [Bug tree-optimization/42972] Very bad bit field code steven at gcc dot gnu dot org
2010-02-05 12:46 ` [Bug middle-end/42972] " steven at gcc dot gnu dot org
2010-02-05 13:17 ` rguenth at gcc dot gnu dot org
2010-02-05 13:26 ` rguenth at gcc dot gnu dot org
2010-02-05 15:36 ` matz at gcc dot gnu dot org
[not found] <bug-42972-4@http.gcc.gnu.org/bugzilla/>
2012-03-19 4:11 ` steven at gcc dot gnu.org
2021-07-20 23:59 ` pinskia at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).