[Bug rtl-optimization/115344] New: Missing loop counter reversal

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug rtl-optimization/115344] New: Missing loop counter reversal
@ 2024-06-04 12:47 cmuellner at gcc dot gnu.org
  2024-06-04 17:49 ` [Bug tree-optimization/115344] " pinskia at gcc dot gnu.org
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: cmuellner at gcc dot gnu.org @ 2024-06-04 12:47 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344

            Bug ID: 115344
           Summary: Missing loop counter reversal
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: cmuellner at gcc dot gnu.org
  Target Milestone: ---

Let's take a simple for-loop with an unknown bound:

void bar ();
void foo1 (int n) {
    for (int i = 0; i < n; i++) {
        bar ();
    }
}

We see that two variables are in the program,
but we could eliminate the loop variable `i` as follows:

void bar ();
void foo2 (int n) {
    while (n) {
        bar ();
        n--;
    }
}

Optimizing the loop as above has the following benefits:
- No need for a register for the loop variable `i`
- No need for an additional slot in the stack frame
- No need for instructions to save/restore the loop variable register in the
prologue/epilogue
- No need for an initialization instruction for the loop variable `i` (to zero)

LLVM does this transformation on (at least) x86-64,  RISC-V (rv64gc),
and AArch64 with -O3, but GCC does not.
Tests have been done with trunk and older GCC releases (I've tested down to GCC
4.4).

Related bug tickets:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=22041 (open - with uses of the
loop counter as an array index)
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=31238 (closed - fixed for GCC
4.5.0)
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40886 (closed - fixed for GCC
4.5.0)

GCC AArch64 -O3:
foo1:
        cmp     w0, 0
        ble     .L6
        stp     x29, x30, [sp, -32]!
        mov     x29, sp
        stp     x19, x20, [sp, 16]
        mov     w20, w0
        mov     w19, 0
.L3:
        add     w19, w19, 1
        bl      bar
        cmp     w20, w19
        bne     .L3
        ldp     x19, x20, [sp, 16]
        ldp     x29, x30, [sp], 32
        ret
.L6:
        ret
foo2:
        cbz     w0, .L18
        stp     x29, x30, [sp, -32]!
        mov     x29, sp
        str     x19, [sp, 16]
        mov     w19, w0
.L12:
        bl      bar
        subs    w19, w19, #1
        bne     .L12
        ldr     x19, [sp, 16]
        ldp     x29, x30, [sp], 32
        ret
.L18:
        ret

LLVM AArch64 -O3:
foo1:                                   // @foo1
        cmp     w0, #1
        b.lt    .LBB0_4
        stp     x29, x30, [sp, #-32]!           // 16-byte Folded Spill
        str     x19, [sp, #16]                  // 8-byte Folded Spill
        mov     x29, sp
        mov     w19, w0
.LBB0_2:                                // =>This Inner Loop Header: Depth=1
        bl      bar
        subs    w19, w19, #1
        b.ne    .LBB0_2
        ldr     x19, [sp, #16]                  // 8-byte Folded Reload
        ldp     x29, x30, [sp], #32             // 16-byte Folded Reload
.LBB0_4:
        ret
foo2:                                   // @foo2
        cbz     w0, .LBB1_4
        stp     x29, x30, [sp, #-32]!           // 16-byte Folded Spill
        str     x19, [sp, #16]                  // 8-byte Folded Spill
        mov     x29, sp
        mov     w19, w0
.LBB1_2:                                // =>This Inner Loop Header: Depth=1
        bl      bar
        subs    w19, w19, #1
        b.ne    .LBB1_2
        ldr     x19, [sp, #16]                  // 8-byte Folded Reload
        ldp     x29, x30, [sp], #32             // 16-byte Folded Reload
.LBB1_4:
        ret

GCC RISC-V -O3 -march=rv64gc:
foo1:
        ble     a0,zero,.L6
        addi    sp,sp,-32
        sd      s0,16(sp)
        sd      s1,8(sp)
        sd      ra,24(sp)
        mv      s1,a0
        li      s0,0
.L3:
        addiw   s0,s0,1
        call    bar
        bne     s1,s0,.L3
        ld      ra,24(sp)
        ld      s0,16(sp)
        ld      s1,8(sp)
        addi    sp,sp,32
        jr      ra
.L6:
        ret
foo2:
        beq     a0,zero,.L18
        addi    sp,sp,-16
        sd      s0,0(sp)
        sd      ra,8(sp)
        mv      s0,a0
.L12:
        addiw   s0,s0,-1
        call    bar
        bne     s0,zero,.L12
        ld      ra,8(sp)
        ld      s0,0(sp)
        addi    sp,sp,16
        jr      ra
.L18:
        ret

LLVM RISC-V -O3 -march=rv64gc
foo1:                                   # @foo1
        blez    a0, .LBB0_4
        addi    sp, sp, -16
        sd      ra, 8(sp)                       # 8-byte Folded Spill
        sd      s0, 0(sp)                       # 8-byte Folded Spill
        mv      s0, a0
.LBB0_2:                                # =>This Inner Loop Header: Depth=1
        call    bar
        addiw   s0, s0, -1
        bnez    s0, .LBB0_2
        ld      ra, 8(sp)                       # 8-byte Folded Reload
        ld      s0, 0(sp)                       # 8-byte Folded Reload
        addi    sp, sp, 16
.LBB0_4:
        ret
foo2:                                   # @foo2
        beqz    a0, .LBB1_4
        addi    sp, sp, -16
        sd      ra, 8(sp)                       # 8-byte Folded Spill
        sd      s0, 0(sp)                       # 8-byte Folded Spill
        mv      s0, a0
.LBB1_2:                                # =>This Inner Loop Header: Depth=1
        call    bar
        addiw   s0, s0, -1
        bnez    s0, .LBB1_2
        ld      ra, 8(sp)                       # 8-byte Folded Reload
        ld      s0, 0(sp)                       # 8-byte Folded Reload
        addi    sp, sp, 16
.LBB1_4:
        ret

GCC x86-64 -O3:
foo1:
        testl   %edi, %edi
        jle     .L6
        pushq   %rbp
        movl    %edi, %ebp
        pushq   %rbx
        xorl    %ebx, %ebx
        subq    $8, %rsp
.L3:
        xorl    %eax, %eax
        addl    $1, %ebx
        call    bar
        cmpl    %ebx, %ebp
        jne     .L3
        addq    $8, %rsp
        popq    %rbx
        popq    %rbp
        ret
.L6:
        ret
foo2:
        testl   %edi, %edi
        je      .L18
        pushq   %rbx
        movl    %edi, %ebx
.L12:
        xorl    %eax, %eax
        call    bar
        subl    $1, %ebx
        jne     .L12
        popq    %rbx
        ret

LLVM x86-64 -O3:
foo1:                                   # @foo1
        testl   %edi, %edi
        jle     .LBB0_4
        pushq   %rbx
        movl    %edi, %ebx
.LBB0_2:                                # =>This Inner Loop Header: Depth=1
        xorl    %eax, %eax
        callq   bar@PLT
        decl    %ebx
        jne     .LBB0_2
        popq    %rbx
.LBB0_4:
        retq
foo2:                                   # @foo2
        testl   %edi, %edi
        je      .LBB1_4
        pushq   %rbx
        movl    %edi, %ebx
.LBB1_2:                                # =>This Inner Loop Header: Depth=1
        xorl    %eax, %eax
        callq   bar@PLT
        decl    %ebx
        jne     .LBB1_2
        popq    %rbx
.LBB1_4:
        retq

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/115344] Missing loop counter reversal
  2024-06-04 12:47 [Bug rtl-optimization/115344] New: Missing loop counter reversal cmuellner at gcc dot gnu.org
@ 2024-06-04 17:49 ` pinskia at gcc dot gnu.org
  2024-06-05  6:50 ` rguenth at gcc dot gnu.org
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: pinskia at gcc dot gnu.org @ 2024-06-04 17:49 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Severity|normal                      |enhancement

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/115344] Missing loop counter reversal
  2024-06-04 12:47 [Bug rtl-optimization/115344] New: Missing loop counter reversal cmuellner at gcc dot gnu.org
  2024-06-04 17:49 ` [Bug tree-optimization/115344] " pinskia at gcc dot gnu.org
@ 2024-06-05  6:50 ` rguenth at gcc dot gnu.org
  2024-06-24  6:18 ` andi-gcc at firstfloor dot org
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-06-05  6:50 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
     Ever confirmed|0                           |1
   Last reconfirmed|                            |2024-06-05

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
IVOPTs can do this with and I also think without the help of IVCANON which
could add a decrementing IV (it only does that for constant number of
iterations
for some reason).

I'm not sure why, for this example, IVOPTs doesn't add a candidate IV
that decrements to zero.  I see

Predict doloop failure due to target specific checks.

so the doloop candidate isn't added?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/115344] Missing loop counter reversal
  2024-06-04 12:47 [Bug rtl-optimization/115344] New: Missing loop counter reversal cmuellner at gcc dot gnu.org
  2024-06-04 17:49 ` [Bug tree-optimization/115344] " pinskia at gcc dot gnu.org
  2024-06-05  6:50 ` rguenth at gcc dot gnu.org
@ 2024-06-24  6:18 ` andi-gcc at firstfloor dot org
  2024-06-24  6:24 ` pinskia at gcc dot gnu.org
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: andi-gcc at firstfloor dot org @ 2024-06-24  6:18 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344

Andi Kleen <andi-gcc at firstfloor dot org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |andi-gcc at firstfloor dot org

--- Comment #2 from Andi Kleen <andi-gcc at firstfloor dot org> ---

If you go back long enough the RTL loop optimizer used to support that. It
broke at some point, even before that one was removed.

>so the doloop candidate isn't added?

x86 doesn't have hardware do loops, it's only for some obscure targets like
some DSPs.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/115344] Missing loop counter reversal
  2024-06-04 12:47 [Bug rtl-optimization/115344] New: Missing loop counter reversal cmuellner at gcc dot gnu.org
                   ` (2 preceding siblings ...)
  2024-06-24  6:18 ` andi-gcc at firstfloor dot org
@ 2024-06-24  6:24 ` pinskia at gcc dot gnu.org
  2024-06-24 17:04 ` andi-gcc at firstfloor dot org
  2024-06-24 17:13 ` andi-gcc at firstfloor dot org
  5 siblings, 0 replies; 7+ messages in thread
From: pinskia at gcc dot gnu.org @ 2024-06-24  6:24 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344

--- Comment #3 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
(In reply to Andi Kleen from comment #2)
> x86 doesn't have hardware do loops, it's only for some obscure targets like
> some DSPs.

Actually doloop was added to support rs6000(powerpc)'s loop counter register
...
So it is not just for obscure DSPs targets.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/115344] Missing loop counter reversal
  2024-06-04 12:47 [Bug rtl-optimization/115344] New: Missing loop counter reversal cmuellner at gcc dot gnu.org
                   ` (3 preceding siblings ...)
  2024-06-24  6:24 ` pinskia at gcc dot gnu.org
@ 2024-06-24 17:04 ` andi-gcc at firstfloor dot org
  2024-06-24 17:13 ` andi-gcc at firstfloor dot org
  5 siblings, 0 replies; 7+ messages in thread
From: andi-gcc at firstfloor dot org @ 2024-06-24 17:04 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344

--- Comment #4 from Andi Kleen <andi-gcc at firstfloor dot org> ---
Pedantry aside the basic problem is that doloop optimization depends on the
target supporting doloop, but the loop reversal would be useful everywhere.

So there are two options: add doloop to every target of interest or make the
reversal optimization independent.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/115344] Missing loop counter reversal
  2024-06-04 12:47 [Bug rtl-optimization/115344] New: Missing loop counter reversal cmuellner at gcc dot gnu.org
                   ` (4 preceding siblings ...)
  2024-06-24 17:04 ` andi-gcc at firstfloor dot org
@ 2024-06-24 17:13 ` andi-gcc at firstfloor dot org
  5 siblings, 0 replies; 7+ messages in thread
From: andi-gcc at firstfloor dot org @ 2024-06-24 17:13 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344

--- Comment #5 from Andi Kleen <andi-gcc at firstfloor dot org> ---
Also the other problem is that doloop optimization is only for known bounds,
while generic reversal works for unknown too

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-06-24 17:13 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-04 12:47 [Bug rtl-optimization/115344] New: Missing loop counter reversal cmuellner at gcc dot gnu.org
2024-06-04 17:49 ` [Bug tree-optimization/115344] " pinskia at gcc dot gnu.org
2024-06-05  6:50 ` rguenth at gcc dot gnu.org
2024-06-24  6:18 ` andi-gcc at firstfloor dot org
2024-06-24  6:24 ` pinskia at gcc dot gnu.org
2024-06-24 17:04 ` andi-gcc at firstfloor dot org
2024-06-24 17:13 ` andi-gcc at firstfloor dot org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).