public inbox for gcc-bugs@sourceware.org help / color / mirror / Atom feed
* [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 @ 2022-12-16 9:00 jakub at gcc dot gnu.org 2022-12-16 9:01 ` [Bug testsuite/108141] " jakub at gcc dot gnu.org ` (7 more replies) 0 siblings, 8 replies; 9+ messages in thread From: jakub at gcc dot gnu.org @ 2022-12-16 9:00 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141 Bug ID: 108141 Summary: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 Product: gcc Version: 13.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: testsuite Assignee: unassigned at gcc dot gnu.org Reporter: jakub at gcc dot gnu.org Target Milestone: --- On ia32, I get +FAIL: gcc.target/i386/pr64110.c scan-assembler vmovd[\\\\t ] starting with r13-4727-g12abd5a7d13209f79664ea603b3f3517f71b8c4f The emitted code is quite different, though without larger analysis I can't tell if it is worse or better. ^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug testsuite/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org @ 2022-12-16 9:01 ` jakub at gcc dot gnu.org 2022-12-16 9:03 ` jakub at gcc dot gnu.org ` (6 subsequent siblings) 7 siblings, 0 replies; 9+ messages in thread From: jakub at gcc dot gnu.org @ 2022-12-16 9:01 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141 Jakub Jelinek <jakub at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- CC| |hjl.tools at gmail dot com, | |vmakarov at gcc dot gnu.org Target Milestone|--- |13.0 ^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug testsuite/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org 2022-12-16 9:01 ` [Bug testsuite/108141] " jakub at gcc dot gnu.org @ 2022-12-16 9:03 ` jakub at gcc dot gnu.org 2022-12-16 11:12 ` rguenth at gcc dot gnu.org ` (5 subsequent siblings) 7 siblings, 0 replies; 9+ messages in thread From: jakub at gcc dot gnu.org @ 2022-12-16 9:03 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141 --- Comment #1 from Jakub Jelinek <jakub at gcc dot gnu.org> --- Just counting number of instructions it is shorter though: --- pr64110.s.r13-4726 2022-12-16 03:57:42.000000000 -0500 +++ pr64110.s.r13-4727 2022-12-16 03:57:48.000000000 -0500 @@ -11,106 +11,105 @@ bar: pushl %ebx andl $-32, %esp subl $64, %esp - movzwl 8(%ebp), %edi + movzwl 8(%ebp), %eax + movw %ax, 30(%esp) movl a, %eax - vmovd %edi, %xmm1 + vpbroadcastw 30(%esp), %xmm2 + vpbroadcastw 30(%esp), %ymm0 leal -1(%eax), %edx - vmovd %edi, %xmm0 movl %edx, a - vpbroadcastw %xmm1, %xmm1 - vpbroadcastw %xmm0, %ymm0 + vmovdqa %xmm2, (%esp) testl %eax, %eax - je .L24 - movw %di, 62(%esp) - vmovdqa %xmm1, 32(%esp) - vmovdqa %ymm0, (%esp) - vzeroupper + je .L23 .p2align 4,,10 .p2align 3 .L2: + vmovdqa %ymm0, 32(%esp) + vzeroupper call foo + vmovdqa 32(%esp), %ymm0 testl %eax, %eax jle .L3 - leal -1(%eax), %edi + leal -1(%eax), %esi movl b, %ebx - movl %edi, 52(%esp) - cmpl $14, %edi + movl %esi, 24(%esp) + cmpl $14, %esi jbe .L10 movl %eax, %ecx movl %ebx, %edx shrl $4, %ecx sall $5, %ecx - leal (%ecx,%ebx), %esi + leal (%ecx,%ebx), %edi andl $32, %ecx - jne .L21 - movzwl 62(%esp), %edi - vmovdqa (%esp), %ymm0 + je .L5 + leal 32(%ebx), %edx + vmovdqu %ymm0, (%ebx) + cmpl %edi, %edx + je .L22 .p2align 4,,10 .p2align 3 .L5: vmovdqu %ymm0, (%edx) addl $64, %edx vmovdqu %ymm0, -32(%edx) - cmpl %esi, %edx + cmpl %edi, %edx jne .L5 - movw %di, 62(%esp) .L22: movl %eax, %edx andl $-16, %edx - movl %edx, %esi + movl %edx, %edi leal (%ebx,%edx,2), %ecx cmpl %edx, %eax - je .L29 - vzeroupper + je .L6 .L4: - movl %eax, %edi - subl %esi, %edi - movl %edi, 56(%esp) - decl %edi - cmpl $6, %edi + movl %eax, %esi + subl %edi, %esi + movl %esi, 32(%esp) + decl %esi + cmpl $6, %esi jbe .L7 - movl 56(%esp), %edi - vmovdqa 32(%esp), %xmm2 - vmovdqu %xmm2, (%ebx,%esi,2) - movl %edi, %esi - andl $-8, %esi - addl %esi, %edx - leal (%ecx,%esi,2), %ecx - movl %edi, %esi - andl $7, %esi + movl 32(%esp), %esi + vmovdqa (%esp), %xmm1 + vmovdqu %xmm1, (%ebx,%edi,2) + movl %esi, %edi + andl $-8, %edi + addl %edi, %edx + leal (%ecx,%edi,2), %ecx + movl %esi, %edi + andl $7, %edi je .L6 .L7: - movzwl 62(%esp), %edi - leal 1(%edx), %esi - movw %di, (%ecx) - cmpl %esi, %eax - jle .L6 - leal 2(%edx), %esi - movw %di, 2(%ecx) - cmpl %esi, %eax - jle .L6 - leal 3(%edx), %esi - movw %di, 4(%ecx) - cmpl %esi, %eax - jle .L6 - leal 4(%edx), %esi - movw %di, 6(%ecx) - cmpl %esi, %eax - jle .L6 - leal 5(%edx), %esi - movw %di, 8(%ecx) - cmpl %esi, %eax + movzwl 30(%esp), %esi + leal 1(%edx), %edi + movw %si, (%ecx) + cmpl %edi, %eax + jle .L6 + leal 2(%edx), %edi + movw %si, 2(%ecx) + cmpl %edi, %eax + jle .L6 + leal 3(%edx), %edi + movw %si, 4(%ecx) + cmpl %edi, %eax + jle .L6 + leal 4(%edx), %edi + movw %si, 6(%ecx) + cmpl %edi, %eax + jle .L6 + leal 5(%edx), %edi + movw %si, 8(%ecx) + cmpl %edi, %eax jle .L6 addl $6, %edx - movw %di, 10(%ecx) + movw %si, 10(%ecx) cmpl %edx, %eax jle .L6 - movw %di, 12(%ecx) + movw %si, 12(%ecx) .L6: leal (%ebx,%eax,2), %eax - movzwl 62(%esp), %ecx + movzwl 30(%esp), %ecx movl %eax, b - movl 52(%esp), %eax + movl 24(%esp), %eax movw %cx, (%ebx,%eax,2) .L3: movl a, %eax @@ -118,39 +117,19 @@ bar: movl %edx, a testl %eax, %eax jne .L2 +.L23: + vzeroupper leal -12(%ebp), %esp popl %ebx popl %esi popl %edi popl %ebp ret - .p2align 4,,10 - .p2align 3 -.L21: - vmovdqa (%esp), %ymm3 - leal 32(%ebx), %edx - vmovdqu %ymm3, (%ebx) - cmpl %esi, %edx - je .L22 - movzwl 62(%esp), %edi - vmovdqa %ymm3, %ymm0 - jmp .L5 .L10: movl %ebx, %ecx - xorl %esi, %esi + xorl %edi, %edi xorl %edx, %edx jmp .L4 -.L29: - vzeroupper - jmp .L6 -.L24: - vzeroupper - leal -12(%ebp), %esp - popl %ebx - popl %esi - popl %edi - popl %ebp - ret .size bar, .-bar .globl b .bss ^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug testsuite/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org 2022-12-16 9:01 ` [Bug testsuite/108141] " jakub at gcc dot gnu.org 2022-12-16 9:03 ` jakub at gcc dot gnu.org @ 2022-12-16 11:12 ` rguenth at gcc dot gnu.org 2022-12-16 11:36 ` jakub at gcc dot gnu.org ` (4 subsequent siblings) 7 siblings, 0 replies; 9+ messages in thread From: rguenth at gcc dot gnu.org @ 2022-12-16 11:12 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141 Richard Biener <rguenth at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- CC| |crazylht at gmail dot com Target| |i?86-*-* --- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> --- it looks better overall but the key difference is: - movzwl 8(%ebp), %edi + movzwl 8(%ebp), %eax + movw %ax, 30(%esp) ... - vmovd %edi, %xmm1 + vpbroadcastw 30(%esp), %xmm2 + vpbroadcastw 30(%esp), %ymm0 ... - vmovd %edi, %xmm0 ... - vpbroadcastw %xmm1, %xmm1 - vpbroadcastw %xmm0, %ymm0 I wonder whether optimal would be vpbroadcasstw 8(%ebp), %xmm2 vpbroadcasstw 8(%ebp), %ymm0 though. ^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug testsuite/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org ` (2 preceding siblings ...) 2022-12-16 11:12 ` rguenth at gcc dot gnu.org @ 2022-12-16 11:36 ` jakub at gcc dot gnu.org 2022-12-16 11:46 ` rguenth at gcc dot gnu.org ` (3 subsequent siblings) 7 siblings, 0 replies; 9+ messages in thread From: jakub at gcc dot gnu.org @ 2022-12-16 11:36 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141 --- Comment #3 from Jakub Jelinek <jakub at gcc dot gnu.org> --- Yeah. For the PR64110: typedef short V __attribute__((vector_size (32))); V foo (short x) { return (V) { x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x }; } we emit with -m64 -O2 -mavx2 vmovd %edi, %xmm0 vpbroadcastw %xmm0, %ymm0 which is I think right and for -m32 -O2 -mavx2 vpbroadcastw 8(%ebp), %ymm0 which again is optimal. Though, in the pr64110.c test we went with -O3 -march=core-avx2 -m32 from r219682: movzwl (%ecx), %esi movw %si, -28(%ebp) vpbroadcastw -28(%ebp), %ymm1 vmovdqa %ymm1, -88(%ebp) to r219683: movzwl (%ecx), %esi vmovd %esi, %xmm1 vpbroadcastw %xmm1, %ymm1 vmovdqa %ymm1, -88(%ebp) to r13-4726: movzwl 8(%ebp), %edi vmovd %edi, %xmm1 vpbroadcastw %xmm1, %xmm1 vmovdqa %xmm1, 32(%esp) to r13-4727: movzwl 8(%ebp), %eax movw %ax, 30(%esp) vpbroadcastw 30(%esp), %xmm2 vmovdqa %xmm2, (%esp) while vpbroadcastw 8(%ebp), %xmm2 vmovdqa %xmm2, (%esp) would be best. From this POV I think r13-4727 is actually a step backwards because previously we were at least loading it into GPR, moving to SSE and broadcasting there, while now we move into GPR, spill to memory and broadcast from memory. Before combine we have: (insn 2 8 3 2 (set (reg:SI 120 [ x ]) (mem/c:SI (reg/f:SI 16 argp) [2 x+0 S4 A32])) "pr64110.c":11:1 83 {*movsi_internal} (nil)) (insn 3 2 4 2 (set (reg/v:HI 119 [ x ]) (subreg:HI (reg:SI 120 [ x ]) 0)) "pr64110.c":11:1 84 {*movhi_internal} (expr_list:REG_DEAD (reg:SI 120 [ x ]) (nil))) ... and in another bb (insn 63 140 35 3 (set (reg:V8HI 140) (vec_duplicate:V8HI (reg/v:HI 119 [ x ]))) "pr64110.c":16:7 7985 {*vec_dupv8hi} (nil)) (insn 35 63 18 3 (set (reg:V16HI 141 [ vect_cst__52 ]) (vec_duplicate:V16HI (reg/v:HI 119 [ x ]))) 7984 {*vec_dupv16hi} (nil)) so I bet that is the reason why combine doesn't merge those into just the broadcast. As for the xmm vs. ymm, it is only loop-invariant that moves those 2 dups (insn 63 and 35) next to each other, and the question is what kind of optimization pass could figure out that insn 35 is a superset of insn 63 and change it into insn 35 + lowpart subreg to set pseudo 140 from low half of 141. ^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug testsuite/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org ` (3 preceding siblings ...) 2022-12-16 11:36 ` jakub at gcc dot gnu.org @ 2022-12-16 11:46 ` rguenth at gcc dot gnu.org 2022-12-16 18:22 ` cvs-commit at gcc dot gnu.org ` (2 subsequent siblings) 7 siblings, 0 replies; 9+ messages in thread From: rguenth at gcc dot gnu.org @ 2022-12-16 11:46 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141 --- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> --- (In reply to Jakub Jelinek from comment #3) [...] ... From this POV I think r13-4727 is actually a step backwards > because previously we were at least loading it into GPR, moving to SSE and > broadcasting there, > while now we move into GPR, spill to memory and broadcast from memory. > Before combine we have: > (insn 2 8 3 2 (set (reg:SI 120 [ x ]) > (mem/c:SI (reg/f:SI 16 argp) [2 x+0 S4 A32])) "pr64110.c":11:1 83 > {*movsi_internal} > (nil)) > (insn 3 2 4 2 (set (reg/v:HI 119 [ x ]) > (subreg:HI (reg:SI 120 [ x ]) 0)) "pr64110.c":11:1 84 > {*movhi_internal} > (expr_list:REG_DEAD (reg:SI 120 [ x ]) > (nil))) > ... > and in another bb > (insn 63 140 35 3 (set (reg:V8HI 140) > (vec_duplicate:V8HI (reg/v:HI 119 [ x ]))) "pr64110.c":16:7 7985 > {*vec_dupv8hi} > (nil)) > (insn 35 63 18 3 (set (reg:V16HI 141 [ vect_cst__52 ]) > (vec_duplicate:V16HI (reg/v:HI 119 [ x ]))) 7984 {*vec_dupv16hi} > (nil)) > so I bet that is the reason why combine doesn't merge those into just the > broadcast. Yep. And probably fwprop doesnt consider MEMs (or even two defs) at all. I suppose we don't want to combine insn 2 + 3 into a HImode MEM by itself? OTOH there's no fwprop after combine. > As for the xmm vs. ymm, it is only loop-invariant that moves those 2 dups > (insn 63 and 35) next to each other, and the question is what kind of > optimization pass could figure out that insn 35 is a superset of insn 63 and > change it into insn 35 + lowpart subreg to set pseudo 140 from low half of > 141. There's only a peephole or alternatively scheduling heuristic + CSE (we need the V16HI duplicate before the V8HI one) I can think of. CSE could also tentatively record "larger" computations and modify the earlier stmt if uses of that larger compute appears. ^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug testsuite/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org ` (4 preceding siblings ...) 2022-12-16 11:46 ` rguenth at gcc dot gnu.org @ 2022-12-16 18:22 ` cvs-commit at gcc dot gnu.org 2022-12-20 16:59 ` [Bug target/108141] " jakub at gcc dot gnu.org 2023-03-03 15:11 ` vmakarov at gcc dot gnu.org 7 siblings, 0 replies; 9+ messages in thread From: cvs-commit at gcc dot gnu.org @ 2022-12-16 18:22 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141 --- Comment #5 from CVS Commits <cvs-commit at gcc dot gnu.org> --- The master branch has been updated by Vladimir Makarov <vmakarov@gcc.gnu.org>: https://gcc.gnu.org/g:b50fe16a3b2214c418ecc5febc0bb21bc17912b7 commit r13-4749-gb50fe16a3b2214c418ecc5febc0bb21bc17912b7 Author: Vladimir N. Makarov <vmakarov@redhat.com> Date: Fri Dec 16 13:16:31 2022 -0500 Revert "IRA: Check that reg classes contain a hard reg of given mode in reg move cost calculation" The patch resulted in new PRs: PR target/108145 PR testsuite/108141 So I am reverting the patch. This reverts commit 12abd5a7d13209f79664ea603b3f3517f71b8c4f. ^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug target/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org ` (5 preceding siblings ...) 2022-12-16 18:22 ` cvs-commit at gcc dot gnu.org @ 2022-12-20 16:59 ` jakub at gcc dot gnu.org 2023-03-03 15:11 ` vmakarov at gcc dot gnu.org 7 siblings, 0 replies; 9+ messages in thread From: jakub at gcc dot gnu.org @ 2022-12-20 16:59 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141 Jakub Jelinek <jakub at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- Resolution|--- |FIXED Status|UNCONFIRMED |RESOLVED --- Comment #6 from Jakub Jelinek <jakub at gcc dot gnu.org> --- The change has been reverted, so this is no longer a regression. ^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug target/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org ` (6 preceding siblings ...) 2022-12-20 16:59 ` [Bug target/108141] " jakub at gcc dot gnu.org @ 2023-03-03 15:11 ` vmakarov at gcc dot gnu.org 7 siblings, 0 replies; 9+ messages in thread From: vmakarov at gcc dot gnu.org @ 2023-03-03 15:11 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141 --- Comment #7 from Vladimir Makarov <vmakarov at gcc dot gnu.org> --- (In reply to Jakub Jelinek from comment #6) > The change has been reverted, so this is no longer a regression. Just for the info. The patch I reverted resulted in wrong calculation of pressure classes (there was a single pressure class ALL_REGS). This affected register pressure calculation and as a consequence using one region only. W/o the patch IRA uses regional register allocation for the loops in the test. I pushed another patch for PR90706. I hope it will not create such problems as the previous patch. ^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2023-03-03 15:11 UTC | newest] Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org 2022-12-16 9:01 ` [Bug testsuite/108141] " jakub at gcc dot gnu.org 2022-12-16 9:03 ` jakub at gcc dot gnu.org 2022-12-16 11:12 ` rguenth at gcc dot gnu.org 2022-12-16 11:36 ` jakub at gcc dot gnu.org 2022-12-16 11:46 ` rguenth at gcc dot gnu.org 2022-12-16 18:22 ` cvs-commit at gcc dot gnu.org 2022-12-20 16:59 ` [Bug target/108141] " jakub at gcc dot gnu.org 2023-03-03 15:11 ` vmakarov at gcc dot gnu.org
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).