public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug rtl-optimization/110093] New: [12/13/14 Regression][avr] Move frenzy leading to code bloat
@ 2023-06-02 13:22 gjl at gcc dot gnu.org
  2023-06-05  6:38 ` [Bug rtl-optimization/110093] " rguenth at gcc dot gnu.org
                   ` (8 more replies)
  0 siblings, 9 replies; 10+ messages in thread
From: gjl at gcc dot gnu.org @ 2023-06-02 13:22 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110093

            Bug ID: 110093
           Summary: [12/13/14 Regression][avr] Move frenzy leading to code
                    bloat
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: gjl at gcc dot gnu.org
  Target Milestone: ---

So here is a C test case:

long add (long aa, long bb, long cc)
{
    if (cc < 0)
        return aa - cc;
    return aa + bb;
}

$ avr-gcc-8 -S -Os -dp
compiles this to the following assembly:

add:
        push r14                 ;  30  [c=4 l=1]  pushqi1/0
        push r15                 ;  31  [c=4 l=1]  pushqi1/0
        push r16                 ;  32  [c=4 l=1]  pushqi1/0
        push r17                 ;  33  [c=4 l=1]  pushqi1/0
/* prologue: function */
/* frame size = 0 */
/* stack size = 4 */
        sbrs r17,7       ;  42  [c=28 l=2]  *sbrx_and_branchsi
        rjmp .L2        
        sub r22,r14      ;  11  [c=16 l=4]  subsi3/0
        sbc r23,r15
        sbc r24,r16
        sbc r25,r17
.L1:
/* epilogue start */
        pop r17          ;  36  [c=4 l=1]  popqi
        pop r16          ;  37  [c=4 l=1]  popqi
        pop r15          ;  38  [c=4 l=1]  popqi
        pop r14          ;  39  [c=4 l=1]  popqi
        ret              ;  40  [c=0 l=1]  return_from_epilogue
.L2:
        add r22,r18      ;  16  [c=16 l=4]  addsi3/0
        adc r23,r19
        adc r24,r20
        adc r25,r21
        rjmp .L1                 ;  43  [c=4 l=1]  jump

Notice that the operations on aa in SI:22 (i.e. r22...r25) can be done in
place, no moves needed.  The superfluous PUSHes and POPs are PR109910, which
is yet another issue...

The code from above then deteriorates with v12, v13 and v14 20230501 into a
move bonanza that starts moving stuff for no reason, leading to high register
pressure: the required stack increases from 4 bytes to 14 bytes, and the code
size increases from 20 instructions to 56:

add:
        push r4          ;  85  [c=4 l=1]  pushqi1/0
        push r5          ;  86  [c=4 l=1]  pushqi1/0
        push r6          ;  87  [c=4 l=1]  pushqi1/0
        push r7          ;  88  [c=4 l=1]  pushqi1/0
        push r8          ;  89  [c=4 l=1]  pushqi1/0
        push r9          ;  90  [c=4 l=1]  pushqi1/0
        push r10                 ;  91  [c=4 l=1]  pushqi1/0
        push r11                 ;  92  [c=4 l=1]  pushqi1/0
        push r12                 ;  93  [c=4 l=1]  pushqi1/0
        push r13                 ;  94  [c=4 l=1]  pushqi1/0
        push r14                 ;  95  [c=4 l=1]  pushqi1/0
        push r15                 ;  96  [c=4 l=1]  pushqi1/0
        push r16                 ;  97  [c=4 l=1]  pushqi1/0
        push r17                 ;  98  [c=4 l=1]  pushqi1/0
/* prologue: function */
/* frame size = 0 */
/* stack size = 14 */
.L__stack_usage = 14
        mov r4,r22       ;  68  [c=4 l=1]  movqi_insn/0
        mov r5,r23       ;  69  [c=4 l=1]  movqi_insn/0
        mov r6,r24       ;  70  [c=4 l=1]  movqi_insn/0
        mov r7,r25       ;  71  [c=4 l=1]  movqi_insn/0
        mov r8,r18       ;  72  [c=4 l=1]  movqi_insn/0
        mov r9,r19       ;  73  [c=4 l=1]  movqi_insn/0
        mov r10,r20      ;  74  [c=4 l=1]  movqi_insn/0
        mov r11,r21      ;  75  [c=4 l=1]  movqi_insn/0
        mov r12,r14      ;  78  [c=4 l=1]  movqi_insn/0
        mov r13,r15      ;  79  [c=4 l=1]  movqi_insn/0
        mov r14,r16      ;  80  [c=4 l=1]  movqi_insn/0
        mov r15,r17      ;  81  [c=4 l=1]  movqi_insn/0
        mov r25,r7       ;  66  [c=4 l=4]  *movsi/0
        mov r24,r6
        mov r23,r5
        mov r22,r4
        sbrs r15,7       ;  117 [c=28 l=2]  *sbrx_and_branchsi
        rjmp .L2        
        sub r22,r12      ;  67  [c=16 l=4]  *subsi3/0
        sbc r23,r13
        sbc r24,r14
        sbc r25,r15
.L1:
/* epilogue start */
        pop r17          ;  101 [c=4 l=1]  popqi
        pop r16          ;  102 [c=4 l=1]  popqi
        pop r15          ;  103 [c=4 l=1]  popqi
        pop r14          ;  104 [c=4 l=1]  popqi
        pop r13          ;  105 [c=4 l=1]  popqi
        pop r12          ;  106 [c=4 l=1]  popqi
        pop r11          ;  107 [c=4 l=1]  popqi
        pop r10          ;  108 [c=4 l=1]  popqi
        pop r9           ;  109 [c=4 l=1]  popqi
        pop r8           ;  110 [c=4 l=1]  popqi
        pop r7           ;  111 [c=4 l=1]  popqi
        pop r6           ;  112 [c=4 l=1]  popqi
        pop r5           ;  113 [c=4 l=1]  popqi
        pop r4           ;  114 [c=4 l=1]  popqi
        ret              ;  115 [c=0 l=1]  return_from_epilogue
.L2:
        add r22,r8       ;  65  [c=16 l=4]  *addsi3/0
        adc r23,r9
        adc r24,r10
        adc r25,r11
        rjmp .L1                 ;  118 [c=4 l=1]  jump

Then finally, with v14 20230602, the craziness increases even more and now
even requires a stack frame and a frame pointer.  The register allocator
starts to move stuff to a stack slot and back again.  Code size increases
again, from 56 instructions to 68, with more stack usage:

add:
        push r4          ;  84  [c=4 l=1]  pushqi1/0
        push r5          ;  85  [c=4 l=1]  pushqi1/0
        push r6          ;  86  [c=4 l=1]  pushqi1/0
        push r7          ;  87  [c=4 l=1]  pushqi1/0
        push r8          ;  88  [c=4 l=1]  pushqi1/0
        push r9          ;  89  [c=4 l=1]  pushqi1/0
        push r10                 ;  90  [c=4 l=1]  pushqi1/0
        push r11                 ;  91  [c=4 l=1]  pushqi1/0
        push r14                 ;  92  [c=4 l=1]  pushqi1/0
        push r15                 ;  93  [c=4 l=1]  pushqi1/0
        push r16                 ;  94  [c=4 l=1]  pushqi1/0
        push r17                 ;  95  [c=4 l=1]  pushqi1/0
        push r28                 ;  96  [c=4 l=1]  pushqi1/0
        push r29                 ;  97  [c=4 l=1]  pushqi1/0
         ; SP -= 4       ;  101 [c=4 l=2]  *addhi3_sp
        rcall . 
        rcall . 
        in r28,__SP_L__  ;  127 [c=4 l=2]  *movhi/7
        in r29,__SP_H__
/* prologue: function */
/* frame size = 4 */
/* stack size = 18 */
.L__stack_usage = 18
        mov r8,r22       ;  69  [c=4 l=1]  movqi_insn/0
        mov r9,r23       ;  70  [c=4 l=1]  movqi_insn/0
        mov r10,r24      ;  71  [c=4 l=1]  movqi_insn/0
        mov r11,r25      ;  72  [c=4 l=1]  movqi_insn/0
        std Y+1,r18      ;  73  [c=4 l=1]  movqi_insn/2
        std Y+2,r19      ;  74  [c=4 l=1]  movqi_insn/2
        std Y+3,r20      ;  75  [c=4 l=1]  movqi_insn/2
        std Y+4,r21      ;  76  [c=4 l=1]  movqi_insn/2
        mov r4,r14       ;  77  [c=4 l=1]  movqi_insn/0
        mov r5,r15       ;  78  [c=4 l=1]  movqi_insn/0
        mov r6,r16       ;  79  [c=4 l=1]  movqi_insn/0
        mov r7,r17       ;  80  [c=4 l=1]  movqi_insn/0
        sbrs r7,7        ;  124 [c=28 l=2]  *sbrx_and_branchsi
        rjmp .L2        
        mov r25,r11      ;  67  [c=4 l=4]  *movsi/0
        mov r24,r10
        mov r23,r9
        mov r22,r8
        sub r22,r4       ;  68  [c=16 l=4]  *subsi3/0
        sbc r23,r5
        sbc r24,r6
        sbc r25,r7
.L1:
/* epilogue start */
         ; SP += 4       ;  107 [c=4 l=4]  *addhi3_sp
        pop __tmp_reg__
        pop __tmp_reg__
        pop __tmp_reg__
        pop __tmp_reg__
        pop r29          ;  108 [c=4 l=1]  popqi
        pop r28          ;  109 [c=4 l=1]  popqi
        pop r17          ;  110 [c=4 l=1]  popqi
        pop r16          ;  111 [c=4 l=1]  popqi
        pop r15          ;  112 [c=4 l=1]  popqi
        pop r14          ;  113 [c=4 l=1]  popqi
        pop r11          ;  114 [c=4 l=1]  popqi
        pop r10          ;  115 [c=4 l=1]  popqi
        pop r9           ;  116 [c=4 l=1]  popqi
        pop r8           ;  117 [c=4 l=1]  popqi
        pop r7           ;  118 [c=4 l=1]  popqi
        pop r6           ;  119 [c=4 l=1]  popqi
        pop r5           ;  120 [c=4 l=1]  popqi
        pop r4           ;  121 [c=4 l=1]  popqi
        ret              ;  122 [c=0 l=1]  return_from_epilogue
.L2:
        ldd r22,Y+1      ;  65  [c=16 l=4]  *movsi/2
        ldd r23,Y+2
        ldd r24,Y+3
        ldd r25,Y+4
        add r22,r8       ;  66  [c=16 l=4]  *addsi3/0
        adc r23,r9
        adc r24,r10
        adc r25,r11
        rjmp .L1                 ;  125 [c=4 l=1]  jump

So we have the following results:

Optimal code:                 Size 12 Instr, no Stack
avr-gcc v8:                   Size 20 Instr,  4 Stack
avr-gcc v12, v13, v14 (May)   Size 56 Instr, 14 Stack
avr-gcc v14 (June)            Size 68 Instr, 18 Stack + Frame Pointer


Target: avr
Configured with: --target=avr --disable-nls --with-gnu-as --with-gnu-ld
--disable-shared --enable-languages=c,c++

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2024-06-20  9:12 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-02 13:22 [Bug rtl-optimization/110093] New: [12/13/14 Regression][avr] Move frenzy leading to code bloat gjl at gcc dot gnu.org
2023-06-05  6:38 ` [Bug rtl-optimization/110093] " rguenth at gcc dot gnu.org
2023-06-13 10:18 ` gjl at gcc dot gnu.org
2023-08-22 14:25 ` gjl at gcc dot gnu.org
2023-08-22 14:25 ` gjl at gcc dot gnu.org
2023-08-29 17:15 ` vmakarov at gcc dot gnu.org
2023-08-30  8:16 ` gjl at gcc dot gnu.org
2023-08-30 14:04 ` vmakarov at gcc dot gnu.org
2024-03-08 15:35 ` law at gcc dot gnu.org
2024-06-20  9:12 ` [Bug rtl-optimization/110093] [12/13/14/15 " rguenth at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).