public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug rtl-optimization/115344] New: Missing loop counter reversal
@ 2024-06-04 12:47 cmuellner at gcc dot gnu.org
2024-06-04 17:49 ` [Bug tree-optimization/115344] " pinskia at gcc dot gnu.org
` (5 more replies)
0 siblings, 6 replies; 7+ messages in thread
From: cmuellner at gcc dot gnu.org @ 2024-06-04 12:47 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344
Bug ID: 115344
Summary: Missing loop counter reversal
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: cmuellner at gcc dot gnu.org
Target Milestone: ---
Let's take a simple for-loop with an unknown bound:
void bar ();
void foo1 (int n) {
for (int i = 0; i < n; i++) {
bar ();
}
}
We see that two variables are in the program,
but we could eliminate the loop variable `i` as follows:
void bar ();
void foo2 (int n) {
while (n) {
bar ();
n--;
}
}
Optimizing the loop as above has the following benefits:
- No need for a register for the loop variable `i`
- No need for an additional slot in the stack frame
- No need for instructions to save/restore the loop variable register in the
prologue/epilogue
- No need for an initialization instruction for the loop variable `i` (to zero)
LLVM does this transformation on (at least) x86-64, RISC-V (rv64gc),
and AArch64 with -O3, but GCC does not.
Tests have been done with trunk and older GCC releases (I've tested down to GCC
4.4).
Related bug tickets:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=22041 (open - with uses of the
loop counter as an array index)
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=31238 (closed - fixed for GCC
4.5.0)
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40886 (closed - fixed for GCC
4.5.0)
GCC AArch64 -O3:
foo1:
cmp w0, 0
ble .L6
stp x29, x30, [sp, -32]!
mov x29, sp
stp x19, x20, [sp, 16]
mov w20, w0
mov w19, 0
.L3:
add w19, w19, 1
bl bar
cmp w20, w19
bne .L3
ldp x19, x20, [sp, 16]
ldp x29, x30, [sp], 32
ret
.L6:
ret
foo2:
cbz w0, .L18
stp x29, x30, [sp, -32]!
mov x29, sp
str x19, [sp, 16]
mov w19, w0
.L12:
bl bar
subs w19, w19, #1
bne .L12
ldr x19, [sp, 16]
ldp x29, x30, [sp], 32
ret
.L18:
ret
LLVM AArch64 -O3:
foo1: // @foo1
cmp w0, #1
b.lt .LBB0_4
stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
str x19, [sp, #16] // 8-byte Folded Spill
mov x29, sp
mov w19, w0
.LBB0_2: // =>This Inner Loop Header: Depth=1
bl bar
subs w19, w19, #1
b.ne .LBB0_2
ldr x19, [sp, #16] // 8-byte Folded Reload
ldp x29, x30, [sp], #32 // 16-byte Folded Reload
.LBB0_4:
ret
foo2: // @foo2
cbz w0, .LBB1_4
stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
str x19, [sp, #16] // 8-byte Folded Spill
mov x29, sp
mov w19, w0
.LBB1_2: // =>This Inner Loop Header: Depth=1
bl bar
subs w19, w19, #1
b.ne .LBB1_2
ldr x19, [sp, #16] // 8-byte Folded Reload
ldp x29, x30, [sp], #32 // 16-byte Folded Reload
.LBB1_4:
ret
GCC RISC-V -O3 -march=rv64gc:
foo1:
ble a0,zero,.L6
addi sp,sp,-32
sd s0,16(sp)
sd s1,8(sp)
sd ra,24(sp)
mv s1,a0
li s0,0
.L3:
addiw s0,s0,1
call bar
bne s1,s0,.L3
ld ra,24(sp)
ld s0,16(sp)
ld s1,8(sp)
addi sp,sp,32
jr ra
.L6:
ret
foo2:
beq a0,zero,.L18
addi sp,sp,-16
sd s0,0(sp)
sd ra,8(sp)
mv s0,a0
.L12:
addiw s0,s0,-1
call bar
bne s0,zero,.L12
ld ra,8(sp)
ld s0,0(sp)
addi sp,sp,16
jr ra
.L18:
ret
LLVM RISC-V -O3 -march=rv64gc
foo1: # @foo1
blez a0, .LBB0_4
addi sp, sp, -16
sd ra, 8(sp) # 8-byte Folded Spill
sd s0, 0(sp) # 8-byte Folded Spill
mv s0, a0
.LBB0_2: # =>This Inner Loop Header: Depth=1
call bar
addiw s0, s0, -1
bnez s0, .LBB0_2
ld ra, 8(sp) # 8-byte Folded Reload
ld s0, 0(sp) # 8-byte Folded Reload
addi sp, sp, 16
.LBB0_4:
ret
foo2: # @foo2
beqz a0, .LBB1_4
addi sp, sp, -16
sd ra, 8(sp) # 8-byte Folded Spill
sd s0, 0(sp) # 8-byte Folded Spill
mv s0, a0
.LBB1_2: # =>This Inner Loop Header: Depth=1
call bar
addiw s0, s0, -1
bnez s0, .LBB1_2
ld ra, 8(sp) # 8-byte Folded Reload
ld s0, 0(sp) # 8-byte Folded Reload
addi sp, sp, 16
.LBB1_4:
ret
GCC x86-64 -O3:
foo1:
testl %edi, %edi
jle .L6
pushq %rbp
movl %edi, %ebp
pushq %rbx
xorl %ebx, %ebx
subq $8, %rsp
.L3:
xorl %eax, %eax
addl $1, %ebx
call bar
cmpl %ebx, %ebp
jne .L3
addq $8, %rsp
popq %rbx
popq %rbp
ret
.L6:
ret
foo2:
testl %edi, %edi
je .L18
pushq %rbx
movl %edi, %ebx
.L12:
xorl %eax, %eax
call bar
subl $1, %ebx
jne .L12
popq %rbx
ret
LLVM x86-64 -O3:
foo1: # @foo1
testl %edi, %edi
jle .LBB0_4
pushq %rbx
movl %edi, %ebx
.LBB0_2: # =>This Inner Loop Header: Depth=1
xorl %eax, %eax
callq bar@PLT
decl %ebx
jne .LBB0_2
popq %rbx
.LBB0_4:
retq
foo2: # @foo2
testl %edi, %edi
je .LBB1_4
pushq %rbx
movl %edi, %ebx
.LBB1_2: # =>This Inner Loop Header: Depth=1
xorl %eax, %eax
callq bar@PLT
decl %ebx
jne .LBB1_2
popq %rbx
.LBB1_4:
retq
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/115344] Missing loop counter reversal
2024-06-04 12:47 [Bug rtl-optimization/115344] New: Missing loop counter reversal cmuellner at gcc dot gnu.org
@ 2024-06-04 17:49 ` pinskia at gcc dot gnu.org
2024-06-05 6:50 ` rguenth at gcc dot gnu.org
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: pinskia at gcc dot gnu.org @ 2024-06-04 17:49 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Severity|normal |enhancement
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/115344] Missing loop counter reversal
2024-06-04 12:47 [Bug rtl-optimization/115344] New: Missing loop counter reversal cmuellner at gcc dot gnu.org
2024-06-04 17:49 ` [Bug tree-optimization/115344] " pinskia at gcc dot gnu.org
@ 2024-06-05 6:50 ` rguenth at gcc dot gnu.org
2024-06-24 6:18 ` andi-gcc at firstfloor dot org
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-06-05 6:50 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Ever confirmed|0 |1
Last reconfirmed| |2024-06-05
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
IVOPTs can do this with and I also think without the help of IVCANON which
could add a decrementing IV (it only does that for constant number of
iterations
for some reason).
I'm not sure why, for this example, IVOPTs doesn't add a candidate IV
that decrements to zero. I see
Predict doloop failure due to target specific checks.
so the doloop candidate isn't added?
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/115344] Missing loop counter reversal
2024-06-04 12:47 [Bug rtl-optimization/115344] New: Missing loop counter reversal cmuellner at gcc dot gnu.org
2024-06-04 17:49 ` [Bug tree-optimization/115344] " pinskia at gcc dot gnu.org
2024-06-05 6:50 ` rguenth at gcc dot gnu.org
@ 2024-06-24 6:18 ` andi-gcc at firstfloor dot org
2024-06-24 6:24 ` pinskia at gcc dot gnu.org
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: andi-gcc at firstfloor dot org @ 2024-06-24 6:18 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344
Andi Kleen <andi-gcc at firstfloor dot org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |andi-gcc at firstfloor dot org
--- Comment #2 from Andi Kleen <andi-gcc at firstfloor dot org> ---
If you go back long enough the RTL loop optimizer used to support that. It
broke at some point, even before that one was removed.
>so the doloop candidate isn't added?
x86 doesn't have hardware do loops, it's only for some obscure targets like
some DSPs.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/115344] Missing loop counter reversal
2024-06-04 12:47 [Bug rtl-optimization/115344] New: Missing loop counter reversal cmuellner at gcc dot gnu.org
` (2 preceding siblings ...)
2024-06-24 6:18 ` andi-gcc at firstfloor dot org
@ 2024-06-24 6:24 ` pinskia at gcc dot gnu.org
2024-06-24 17:04 ` andi-gcc at firstfloor dot org
2024-06-24 17:13 ` andi-gcc at firstfloor dot org
5 siblings, 0 replies; 7+ messages in thread
From: pinskia at gcc dot gnu.org @ 2024-06-24 6:24 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344
--- Comment #3 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
(In reply to Andi Kleen from comment #2)
> x86 doesn't have hardware do loops, it's only for some obscure targets like
> some DSPs.
Actually doloop was added to support rs6000(powerpc)'s loop counter register
...
So it is not just for obscure DSPs targets.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/115344] Missing loop counter reversal
2024-06-04 12:47 [Bug rtl-optimization/115344] New: Missing loop counter reversal cmuellner at gcc dot gnu.org
` (3 preceding siblings ...)
2024-06-24 6:24 ` pinskia at gcc dot gnu.org
@ 2024-06-24 17:04 ` andi-gcc at firstfloor dot org
2024-06-24 17:13 ` andi-gcc at firstfloor dot org
5 siblings, 0 replies; 7+ messages in thread
From: andi-gcc at firstfloor dot org @ 2024-06-24 17:04 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344
--- Comment #4 from Andi Kleen <andi-gcc at firstfloor dot org> ---
Pedantry aside the basic problem is that doloop optimization depends on the
target supporting doloop, but the loop reversal would be useful everywhere.
So there are two options: add doloop to every target of interest or make the
reversal optimization independent.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/115344] Missing loop counter reversal
2024-06-04 12:47 [Bug rtl-optimization/115344] New: Missing loop counter reversal cmuellner at gcc dot gnu.org
` (4 preceding siblings ...)
2024-06-24 17:04 ` andi-gcc at firstfloor dot org
@ 2024-06-24 17:13 ` andi-gcc at firstfloor dot org
5 siblings, 0 replies; 7+ messages in thread
From: andi-gcc at firstfloor dot org @ 2024-06-24 17:13 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344
--- Comment #5 from Andi Kleen <andi-gcc at firstfloor dot org> ---
Also the other problem is that doloop optimization is only for known bounds,
while generic reversal works for unknown too
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2024-06-24 17:13 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-04 12:47 [Bug rtl-optimization/115344] New: Missing loop counter reversal cmuellner at gcc dot gnu.org
2024-06-04 17:49 ` [Bug tree-optimization/115344] " pinskia at gcc dot gnu.org
2024-06-05 6:50 ` rguenth at gcc dot gnu.org
2024-06-24 6:18 ` andi-gcc at firstfloor dot org
2024-06-24 6:24 ` pinskia at gcc dot gnu.org
2024-06-24 17:04 ` andi-gcc at firstfloor dot org
2024-06-24 17:13 ` andi-gcc at firstfloor dot org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).