From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id AFFD13858D3C; Fri, 2 Jun 2023 13:22:53 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org AFFD13858D3C DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1685712173; bh=uaTIAtfRCJ3iLfhnc59S39+y7S1FbIQ5iyoYJK3nmXI=; h=From:To:Subject:Date:From; b=H4yx/9GYrxFcuzJ4zSgswQkcKbc1dx2PL7lt2sj+4nSdIj0pZvkwC/jYGdGjbOkd3 ZUH2IH4lk8A80hw1M0iIAQJxOeHL7zIwnggjU72abectveyeJ8DCXCxK0dmORwLrs4 xCn5D1GPbh5QwYsQCBFLHj5RpkAjNzYudin+B4u4= From: "gjl at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug rtl-optimization/110093] New: [12/13/14 Regression][avr] Move frenzy leading to code bloat Date: Fri, 02 Jun 2023 13:22:53 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: rtl-optimization X-Bugzilla-Version: 14.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: gjl at gcc dot gnu.org X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D110093 Bug ID: 110093 Summary: [12/13/14 Regression][avr] Move frenzy leading to code bloat Product: gcc Version: 14.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: rtl-optimization Assignee: unassigned at gcc dot gnu.org Reporter: gjl at gcc dot gnu.org Target Milestone: --- So here is a C test case: long add (long aa, long bb, long cc) { if (cc < 0) return aa - cc; return aa + bb; } $ 
avr-gcc-8 -S -Os -dp compiles this to the following assembly: add: push r14 ; 30 [c=3D4 l=3D1] pushqi1/0 push r15 ; 31 [c=3D4 l=3D1] pushqi1/0 push r16 ; 32 [c=3D4 l=3D1] pushqi1/0 push r17 ; 33 [c=3D4 l=3D1] pushqi1/0 /* prologue: function */ /* frame size =3D 0 */ /* stack size =3D 4 */ sbrs r17,7 ; 42 [c=3D28 l=3D2] *sbrx_and_branchsi rjmp .L2=20=20=20=20=20=20=20=20 sub r22,r14 ; 11 [c=3D16 l=3D4] subsi3/0 sbc r23,r15 sbc r24,r16 sbc r25,r17 .L1: /* epilogue start */ pop r17 ; 36 [c=3D4 l=3D1] popqi pop r16 ; 37 [c=3D4 l=3D1] popqi pop r15 ; 38 [c=3D4 l=3D1] popqi pop r14 ; 39 [c=3D4 l=3D1] popqi ret ; 40 [c=3D0 l=3D1] return_from_epilogue .L2: add r22,r18 ; 16 [c=3D16 l=3D4] addsi3/0 adc r23,r19 adc r24,r20 adc r25,r21 rjmp .L1 ; 43 [c=3D4 l=3D1] jump Notice that the operations on aa in SI:22 can be done in place, no moves needed. The superfluous PUSHes and POPs are PR109910, which is yet another issue... The code from above then deteriorates with v12, v13, v14 20230501 to a move bonanza that starts moving stuff for no reason, leading to high register pressure, a required stack increase from 4 bytes to 14 bytes, and a code size increase from 20 instructions to 56: add: push r4 ; 85 [c=3D4 l=3D1] pushqi1/0 push r5 ; 86 [c=3D4 l=3D1] pushqi1/0 push r6 ; 87 [c=3D4 l=3D1] pushqi1/0 push r7 ; 88 [c=3D4 l=3D1] pushqi1/0 push r8 ; 89 [c=3D4 l=3D1] pushqi1/0 push r9 ; 90 [c=3D4 l=3D1] pushqi1/0 push r10 ; 91 [c=3D4 l=3D1] pushqi1/0 push r11 ; 92 [c=3D4 l=3D1] pushqi1/0 push r12 ; 93 [c=3D4 l=3D1] pushqi1/0 push r13 ; 94 [c=3D4 l=3D1] pushqi1/0 push r14 ; 95 [c=3D4 l=3D1] pushqi1/0 push r15 ; 96 [c=3D4 l=3D1] pushqi1/0 push r16 ; 97 [c=3D4 l=3D1] pushqi1/0 push r17 ; 98 [c=3D4 l=3D1] pushqi1/0 /* prologue: function */ /* frame size =3D 0 */ /* stack size =3D 14 */ .L__stack_usage =3D 14 mov r4,r22 ; 68 [c=3D4 l=3D1] movqi_insn/0 mov r5,r23 ; 69 [c=3D4 l=3D1] movqi_insn/0 mov r6,r24 ; 70 [c=3D4 l=3D1] movqi_insn/0 mov r7,r25 ; 71 [c=3D4 l=3D1] movqi_insn/0 mov r8,r18 ; 72 [c=3D4 
l=3D1] movqi_insn/0 mov r9,r19 ; 73 [c=3D4 l=3D1] movqi_insn/0 mov r10,r20 ; 74 [c=3D4 l=3D1] movqi_insn/0 mov r11,r21 ; 75 [c=3D4 l=3D1] movqi_insn/0 mov r12,r14 ; 78 [c=3D4 l=3D1] movqi_insn/0 mov r13,r15 ; 79 [c=3D4 l=3D1] movqi_insn/0 mov r14,r16 ; 80 [c=3D4 l=3D1] movqi_insn/0 mov r15,r17 ; 81 [c=3D4 l=3D1] movqi_insn/0 mov r25,r7 ; 66 [c=3D4 l=3D4] *movsi/0 mov r24,r6 mov r23,r5 mov r22,r4 sbrs r15,7 ; 117 [c=3D28 l=3D2] *sbrx_and_branchsi rjmp .L2=20=20=20=20=20=20=20=20 sub r22,r12 ; 67 [c=3D16 l=3D4] *subsi3/0 sbc r23,r13 sbc r24,r14 sbc r25,r15 .L1: /* epilogue start */ pop r17 ; 101 [c=3D4 l=3D1] popqi pop r16 ; 102 [c=3D4 l=3D1] popqi pop r15 ; 103 [c=3D4 l=3D1] popqi pop r14 ; 104 [c=3D4 l=3D1] popqi pop r13 ; 105 [c=3D4 l=3D1] popqi pop r12 ; 106 [c=3D4 l=3D1] popqi pop r11 ; 107 [c=3D4 l=3D1] popqi pop r10 ; 108 [c=3D4 l=3D1] popqi pop r9 ; 109 [c=3D4 l=3D1] popqi pop r8 ; 110 [c=3D4 l=3D1] popqi pop r7 ; 111 [c=3D4 l=3D1] popqi pop r6 ; 112 [c=3D4 l=3D1] popqi pop r5 ; 113 [c=3D4 l=3D1] popqi pop r4 ; 114 [c=3D4 l=3D1] popqi ret ; 115 [c=3D0 l=3D1] return_from_epilogue .L2: add r22,r8 ; 65 [c=3D16 l=3D4] *addsi3/0 adc r23,r9 adc r24,r10 adc r25,r11 rjmp .L1 ; 118 [c=3D4 l=3D1] jump Then finally, with v14 20230602, craziness increases even more and even requires a stack frame and a frame pointer. The register allocator starts to move stuff to a stack slot and back again. 
Code size increases again from 56 instructions to 68, with more stack usage: add: push r4 ; 84 [c=3D4 l=3D1] pushqi1/0 push r5 ; 85 [c=3D4 l=3D1] pushqi1/0 push r6 ; 86 [c=3D4 l=3D1] pushqi1/0 push r7 ; 87 [c=3D4 l=3D1] pushqi1/0 push r8 ; 88 [c=3D4 l=3D1] pushqi1/0 push r9 ; 89 [c=3D4 l=3D1] pushqi1/0 push r10 ; 90 [c=3D4 l=3D1] pushqi1/0 push r11 ; 91 [c=3D4 l=3D1] pushqi1/0 push r14 ; 92 [c=3D4 l=3D1] pushqi1/0 push r15 ; 93 [c=3D4 l=3D1] pushqi1/0 push r16 ; 94 [c=3D4 l=3D1] pushqi1/0 push r17 ; 95 [c=3D4 l=3D1] pushqi1/0 push r28 ; 96 [c=3D4 l=3D1] pushqi1/0 push r29 ; 97 [c=3D4 l=3D1] pushqi1/0 ; SP -=3D 4 ; 101 [c=3D4 l=3D2] *addhi3_sp rcall .=20 rcall .=20 in r28,__SP_L__ ; 127 [c=3D4 l=3D2] *movhi/7 in r29,__SP_H__ /* prologue: function */ /* frame size =3D 4 */ /* stack size =3D 18 */ .L__stack_usage =3D 18 mov r8,r22 ; 69 [c=3D4 l=3D1] movqi_insn/0 mov r9,r23 ; 70 [c=3D4 l=3D1] movqi_insn/0 mov r10,r24 ; 71 [c=3D4 l=3D1] movqi_insn/0 mov r11,r25 ; 72 [c=3D4 l=3D1] movqi_insn/0 std Y+1,r18 ; 73 [c=3D4 l=3D1] movqi_insn/2 std Y+2,r19 ; 74 [c=3D4 l=3D1] movqi_insn/2 std Y+3,r20 ; 75 [c=3D4 l=3D1] movqi_insn/2 std Y+4,r21 ; 76 [c=3D4 l=3D1] movqi_insn/2 mov r4,r14 ; 77 [c=3D4 l=3D1] movqi_insn/0 mov r5,r15 ; 78 [c=3D4 l=3D1] movqi_insn/0 mov r6,r16 ; 79 [c=3D4 l=3D1] movqi_insn/0 mov r7,r17 ; 80 [c=3D4 l=3D1] movqi_insn/0 sbrs r7,7 ; 124 [c=3D28 l=3D2] *sbrx_and_branchsi rjmp .L2=20=20=20=20=20=20=20=20 mov r25,r11 ; 67 [c=3D4 l=3D4] *movsi/0 mov r24,r10 mov r23,r9 mov r22,r8 sub r22,r4 ; 68 [c=3D16 l=3D4] *subsi3/0 sbc r23,r5 sbc r24,r6 sbc r25,r7 .L1: /* epilogue start */ ; SP +=3D 4 ; 107 [c=3D4 l=3D4] *addhi3_sp pop __tmp_reg__ pop __tmp_reg__ pop __tmp_reg__ pop __tmp_reg__ pop r29 ; 108 [c=3D4 l=3D1] popqi pop r28 ; 109 [c=3D4 l=3D1] popqi pop r17 ; 110 [c=3D4 l=3D1] popqi pop r16 ; 111 [c=3D4 l=3D1] popqi pop r15 ; 112 [c=3D4 l=3D1] popqi pop r14 ; 113 [c=3D4 l=3D1] popqi pop r11 ; 114 [c=3D4 l=3D1] popqi pop r10 ; 115 [c=3D4 l=3D1] popqi pop r9 ; 116 
[c=3D4 l=3D1] popqi pop r8 ; 117 [c=3D4 l=3D1] popqi pop r7 ; 118 [c=3D4 l=3D1] popqi pop r6 ; 119 [c=3D4 l=3D1] popqi pop r5 ; 120 [c=3D4 l=3D1] popqi pop r4 ; 121 [c=3D4 l=3D1] popqi ret ; 122 [c=3D0 l=3D1] return_from_epilogue .L2: ldd r22,Y+1 ; 65 [c=3D16 l=3D4] *movsi/2 ldd r23,Y+2 ldd r24,Y+3 ldd r25,Y+4 add r22,r8 ; 66 [c=3D16 l=3D4] *addsi3/0 adc r23,r9 adc r24,r10 adc r25,r11 rjmp .L1 ; 125 [c=3D4 l=3D1] jump So we have the following results: Optimal code: Size 12 Instr, no Stack avr-gcc v8: Size 20 Instr, 4 Stack avr-gcc v12, v13, v14 (May) Size 56 Instr, 14 Stack avr-gcc v14 (June) Size 68 Instr, 18 Stack + Frame Pointer Target: avr Configured with: --target=3Davr --disable-nls --with-gnu-as --with-gnu-ld --disable-shared --enable-languages=3Dc,c++=