public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32
@ 2022-12-16 9:00 jakub at gcc dot gnu.org
2022-12-16 9:01 ` [Bug testsuite/108141] " jakub at gcc dot gnu.org
` (7 more replies)
0 siblings, 8 replies; 9+ messages in thread
From: jakub at gcc dot gnu.org @ 2022-12-16 9:00 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141
Bug ID: 108141
Summary: [13 Regression] gcc.target/i386/pr64110.c FAIL since
r13-4727 on ia32
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: testsuite
Assignee: unassigned at gcc dot gnu.org
Reporter: jakub at gcc dot gnu.org
Target Milestone: ---
On ia32, I get
+FAIL: gcc.target/i386/pr64110.c scan-assembler vmovd[\\\\t ]
starting with r13-4727-g12abd5a7d13209f79664ea603b3f3517f71b8c4f
The emitted code is quite different, though without larger analysis I can't
tell if it is worse or better.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug testsuite/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32
2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org
@ 2022-12-16 9:01 ` jakub at gcc dot gnu.org
2022-12-16 9:03 ` jakub at gcc dot gnu.org
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: jakub at gcc dot gnu.org @ 2022-12-16 9:01 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141
Jakub Jelinek <jakub at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |hjl.tools at gmail dot com,
| |vmakarov at gcc dot gnu.org
Target Milestone|--- |13.0
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug testsuite/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32
2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org
2022-12-16 9:01 ` [Bug testsuite/108141] " jakub at gcc dot gnu.org
@ 2022-12-16 9:03 ` jakub at gcc dot gnu.org
2022-12-16 11:12 ` rguenth at gcc dot gnu.org
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: jakub at gcc dot gnu.org @ 2022-12-16 9:03 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141
--- Comment #1 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Just counting number of instructions it is shorter though:
--- pr64110.s.r13-4726 2022-12-16 03:57:42.000000000 -0500
+++ pr64110.s.r13-4727 2022-12-16 03:57:48.000000000 -0500
@@ -11,106 +11,105 @@ bar:
pushl %ebx
andl $-32, %esp
subl $64, %esp
- movzwl 8(%ebp), %edi
+ movzwl 8(%ebp), %eax
+ movw %ax, 30(%esp)
movl a, %eax
- vmovd %edi, %xmm1
+ vpbroadcastw 30(%esp), %xmm2
+ vpbroadcastw 30(%esp), %ymm0
leal -1(%eax), %edx
- vmovd %edi, %xmm0
movl %edx, a
- vpbroadcastw %xmm1, %xmm1
- vpbroadcastw %xmm0, %ymm0
+ vmovdqa %xmm2, (%esp)
testl %eax, %eax
- je .L24
- movw %di, 62(%esp)
- vmovdqa %xmm1, 32(%esp)
- vmovdqa %ymm0, (%esp)
- vzeroupper
+ je .L23
.p2align 4,,10
.p2align 3
.L2:
+ vmovdqa %ymm0, 32(%esp)
+ vzeroupper
call foo
+ vmovdqa 32(%esp), %ymm0
testl %eax, %eax
jle .L3
- leal -1(%eax), %edi
+ leal -1(%eax), %esi
movl b, %ebx
- movl %edi, 52(%esp)
- cmpl $14, %edi
+ movl %esi, 24(%esp)
+ cmpl $14, %esi
jbe .L10
movl %eax, %ecx
movl %ebx, %edx
shrl $4, %ecx
sall $5, %ecx
- leal (%ecx,%ebx), %esi
+ leal (%ecx,%ebx), %edi
andl $32, %ecx
- jne .L21
- movzwl 62(%esp), %edi
- vmovdqa (%esp), %ymm0
+ je .L5
+ leal 32(%ebx), %edx
+ vmovdqu %ymm0, (%ebx)
+ cmpl %edi, %edx
+ je .L22
.p2align 4,,10
.p2align 3
.L5:
vmovdqu %ymm0, (%edx)
addl $64, %edx
vmovdqu %ymm0, -32(%edx)
- cmpl %esi, %edx
+ cmpl %edi, %edx
jne .L5
- movw %di, 62(%esp)
.L22:
movl %eax, %edx
andl $-16, %edx
- movl %edx, %esi
+ movl %edx, %edi
leal (%ebx,%edx,2), %ecx
cmpl %edx, %eax
- je .L29
- vzeroupper
+ je .L6
.L4:
- movl %eax, %edi
- subl %esi, %edi
- movl %edi, 56(%esp)
- decl %edi
- cmpl $6, %edi
+ movl %eax, %esi
+ subl %edi, %esi
+ movl %esi, 32(%esp)
+ decl %esi
+ cmpl $6, %esi
jbe .L7
- movl 56(%esp), %edi
- vmovdqa 32(%esp), %xmm2
- vmovdqu %xmm2, (%ebx,%esi,2)
- movl %edi, %esi
- andl $-8, %esi
- addl %esi, %edx
- leal (%ecx,%esi,2), %ecx
- movl %edi, %esi
- andl $7, %esi
+ movl 32(%esp), %esi
+ vmovdqa (%esp), %xmm1
+ vmovdqu %xmm1, (%ebx,%edi,2)
+ movl %esi, %edi
+ andl $-8, %edi
+ addl %edi, %edx
+ leal (%ecx,%edi,2), %ecx
+ movl %esi, %edi
+ andl $7, %edi
je .L6
.L7:
- movzwl 62(%esp), %edi
- leal 1(%edx), %esi
- movw %di, (%ecx)
- cmpl %esi, %eax
- jle .L6
- leal 2(%edx), %esi
- movw %di, 2(%ecx)
- cmpl %esi, %eax
- jle .L6
- leal 3(%edx), %esi
- movw %di, 4(%ecx)
- cmpl %esi, %eax
- jle .L6
- leal 4(%edx), %esi
- movw %di, 6(%ecx)
- cmpl %esi, %eax
- jle .L6
- leal 5(%edx), %esi
- movw %di, 8(%ecx)
- cmpl %esi, %eax
+ movzwl 30(%esp), %esi
+ leal 1(%edx), %edi
+ movw %si, (%ecx)
+ cmpl %edi, %eax
+ jle .L6
+ leal 2(%edx), %edi
+ movw %si, 2(%ecx)
+ cmpl %edi, %eax
+ jle .L6
+ leal 3(%edx), %edi
+ movw %si, 4(%ecx)
+ cmpl %edi, %eax
+ jle .L6
+ leal 4(%edx), %edi
+ movw %si, 6(%ecx)
+ cmpl %edi, %eax
+ jle .L6
+ leal 5(%edx), %edi
+ movw %si, 8(%ecx)
+ cmpl %edi, %eax
jle .L6
addl $6, %edx
- movw %di, 10(%ecx)
+ movw %si, 10(%ecx)
cmpl %edx, %eax
jle .L6
- movw %di, 12(%ecx)
+ movw %si, 12(%ecx)
.L6:
leal (%ebx,%eax,2), %eax
- movzwl 62(%esp), %ecx
+ movzwl 30(%esp), %ecx
movl %eax, b
- movl 52(%esp), %eax
+ movl 24(%esp), %eax
movw %cx, (%ebx,%eax,2)
.L3:
movl a, %eax
@@ -118,39 +117,19 @@ bar:
movl %edx, a
testl %eax, %eax
jne .L2
+.L23:
+ vzeroupper
leal -12(%ebp), %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
- .p2align 4,,10
- .p2align 3
-.L21:
- vmovdqa (%esp), %ymm3
- leal 32(%ebx), %edx
- vmovdqu %ymm3, (%ebx)
- cmpl %esi, %edx
- je .L22
- movzwl 62(%esp), %edi
- vmovdqa %ymm3, %ymm0
- jmp .L5
.L10:
movl %ebx, %ecx
- xorl %esi, %esi
+ xorl %edi, %edi
xorl %edx, %edx
jmp .L4
-.L29:
- vzeroupper
- jmp .L6
-.L24:
- vzeroupper
- leal -12(%ebp), %esp
- popl %ebx
- popl %esi
- popl %edi
- popl %ebp
- ret
.size bar, .-bar
.globl b
.bss
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug testsuite/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32
2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org
2022-12-16 9:01 ` [Bug testsuite/108141] " jakub at gcc dot gnu.org
2022-12-16 9:03 ` jakub at gcc dot gnu.org
@ 2022-12-16 11:12 ` rguenth at gcc dot gnu.org
2022-12-16 11:36 ` jakub at gcc dot gnu.org
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: rguenth at gcc dot gnu.org @ 2022-12-16 11:12 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |crazylht at gmail dot com
Target| |i?86-*-*
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
it looks better overall but the key difference is:
- movzwl 8(%ebp), %edi
+ movzwl 8(%ebp), %eax
+ movw %ax, 30(%esp)
...
- vmovd %edi, %xmm1
+ vpbroadcastw 30(%esp), %xmm2
+ vpbroadcastw 30(%esp), %ymm0
...
- vmovd %edi, %xmm0
...
- vpbroadcastw %xmm1, %xmm1
- vpbroadcastw %xmm0, %ymm0
I wonder whether optimal would be
vpbroadcasstw 8(%ebp), %xmm2
vpbroadcasstw 8(%ebp), %ymm0
though.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug testsuite/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32
2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org
` (2 preceding siblings ...)
2022-12-16 11:12 ` rguenth at gcc dot gnu.org
@ 2022-12-16 11:36 ` jakub at gcc dot gnu.org
2022-12-16 11:46 ` rguenth at gcc dot gnu.org
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: jakub at gcc dot gnu.org @ 2022-12-16 11:36 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141
--- Comment #3 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Yeah.
For the PR64110:
typedef short V __attribute__((vector_size (32)));
V
foo (short x)
{
return (V) { x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x };
}
we emit with -m64 -O2 -mavx2
vmovd %edi, %xmm0
vpbroadcastw %xmm0, %ymm0
which is I think right and for -m32 -O2 -mavx2
vpbroadcastw 8(%ebp), %ymm0
which again is optimal.
Though, in the pr64110.c test we went with -O3 -march=core-avx2 -m32
from r219682:
movzwl (%ecx), %esi
movw %si, -28(%ebp)
vpbroadcastw -28(%ebp), %ymm1
vmovdqa %ymm1, -88(%ebp)
to r219683:
movzwl (%ecx), %esi
vmovd %esi, %xmm1
vpbroadcastw %xmm1, %ymm1
vmovdqa %ymm1, -88(%ebp)
to r13-4726:
movzwl 8(%ebp), %edi
vmovd %edi, %xmm1
vpbroadcastw %xmm1, %xmm1
vmovdqa %xmm1, 32(%esp)
to r13-4727:
movzwl 8(%ebp), %eax
movw %ax, 30(%esp)
vpbroadcastw 30(%esp), %xmm2
vmovdqa %xmm2, (%esp)
while
vpbroadcastw 8(%ebp), %xmm2
vmovdqa %xmm2, (%esp)
would be best. From this POV I think r13-4727 is actually a step backwards
because previously we were at least loading it into GPR, moving to SSE and
broadcasting there,
while now we move into GPR, spill to memory and broadcast from memory.
Before combine we have:
(insn 2 8 3 2 (set (reg:SI 120 [ x ])
(mem/c:SI (reg/f:SI 16 argp) [2 x+0 S4 A32])) "pr64110.c":11:1 83
{*movsi_internal}
(nil))
(insn 3 2 4 2 (set (reg/v:HI 119 [ x ])
(subreg:HI (reg:SI 120 [ x ]) 0)) "pr64110.c":11:1 84 {*movhi_internal}
(expr_list:REG_DEAD (reg:SI 120 [ x ])
(nil)))
...
and in another bb
(insn 63 140 35 3 (set (reg:V8HI 140)
(vec_duplicate:V8HI (reg/v:HI 119 [ x ]))) "pr64110.c":16:7 7985
{*vec_dupv8hi}
(nil))
(insn 35 63 18 3 (set (reg:V16HI 141 [ vect_cst__52 ])
(vec_duplicate:V16HI (reg/v:HI 119 [ x ]))) 7984 {*vec_dupv16hi}
(nil))
so I bet that is the reason why combine doesn't merge those into just the
broadcast.
As for the xmm vs. ymm, it is only loop-invariant that moves those 2 dups (insn
63 and 35) next to each other, and the question is what kind of optimization
pass could figure out that insn 35 is a superset of insn 63 and change it into
insn 35 + lowpart subreg to set pseudo 140 from low half of 141.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug testsuite/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32
2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org
` (3 preceding siblings ...)
2022-12-16 11:36 ` jakub at gcc dot gnu.org
@ 2022-12-16 11:46 ` rguenth at gcc dot gnu.org
2022-12-16 18:22 ` cvs-commit at gcc dot gnu.org
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: rguenth at gcc dot gnu.org @ 2022-12-16 11:46 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Jakub Jelinek from comment #3)
[...]
... From this POV I think r13-4727 is actually a step backwards
> because previously we were at least loading it into GPR, moving to SSE and
> broadcasting there,
> while now we move into GPR, spill to memory and broadcast from memory.
> Before combine we have:
> (insn 2 8 3 2 (set (reg:SI 120 [ x ])
> (mem/c:SI (reg/f:SI 16 argp) [2 x+0 S4 A32])) "pr64110.c":11:1 83
> {*movsi_internal}
> (nil))
> (insn 3 2 4 2 (set (reg/v:HI 119 [ x ])
> (subreg:HI (reg:SI 120 [ x ]) 0)) "pr64110.c":11:1 84
> {*movhi_internal}
> (expr_list:REG_DEAD (reg:SI 120 [ x ])
> (nil)))
> ...
> and in another bb
> (insn 63 140 35 3 (set (reg:V8HI 140)
> (vec_duplicate:V8HI (reg/v:HI 119 [ x ]))) "pr64110.c":16:7 7985
> {*vec_dupv8hi}
> (nil))
> (insn 35 63 18 3 (set (reg:V16HI 141 [ vect_cst__52 ])
> (vec_duplicate:V16HI (reg/v:HI 119 [ x ]))) 7984 {*vec_dupv16hi}
> (nil))
> so I bet that is the reason why combine doesn't merge those into just the
> broadcast.
Yep. And probably fwprop doesnt consider MEMs (or even two defs) at all.
I suppose we don't want to combine insn 2 + 3 into a HImode MEM by itself?
OTOH there's no fwprop after combine.
> As for the xmm vs. ymm, it is only loop-invariant that moves those 2 dups
> (insn 63 and 35) next to each other, and the question is what kind of
> optimization pass could figure out that insn 35 is a superset of insn 63 and
> change it into insn 35 + lowpart subreg to set pseudo 140 from low half of
> 141.
There's only a peephole or alternatively scheduling heuristic + CSE (we
need the V16HI duplicate before the V8HI one) I can think of.
CSE could also tentatively record "larger" computations and modify the
earlier stmt if uses of that larger compute appears.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug testsuite/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32
2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org
` (4 preceding siblings ...)
2022-12-16 11:46 ` rguenth at gcc dot gnu.org
@ 2022-12-16 18:22 ` cvs-commit at gcc dot gnu.org
2022-12-20 16:59 ` [Bug target/108141] " jakub at gcc dot gnu.org
2023-03-03 15:11 ` vmakarov at gcc dot gnu.org
7 siblings, 0 replies; 9+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2022-12-16 18:22 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141
--- Comment #5 from CVS Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Vladimir Makarov <vmakarov@gcc.gnu.org>:
https://gcc.gnu.org/g:b50fe16a3b2214c418ecc5febc0bb21bc17912b7
commit r13-4749-gb50fe16a3b2214c418ecc5febc0bb21bc17912b7
Author: Vladimir N. Makarov <vmakarov@redhat.com>
Date: Fri Dec 16 13:16:31 2022 -0500
Revert "IRA: Check that reg classes contain a hard reg of given mode in reg
move cost calculation"
The patch resulted in new PRs:
PR target/108145
PR testsuite/108141
So I am reverting the patch.
This reverts commit 12abd5a7d13209f79664ea603b3f3517f71b8c4f.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug target/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32
2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org
` (5 preceding siblings ...)
2022-12-16 18:22 ` cvs-commit at gcc dot gnu.org
@ 2022-12-20 16:59 ` jakub at gcc dot gnu.org
2023-03-03 15:11 ` vmakarov at gcc dot gnu.org
7 siblings, 0 replies; 9+ messages in thread
From: jakub at gcc dot gnu.org @ 2022-12-20 16:59 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141
Jakub Jelinek <jakub at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Resolution|--- |FIXED
Status|UNCONFIRMED |RESOLVED
--- Comment #6 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
The change has been reverted, so this is no longer a regression.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug target/108141] [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32
2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org
` (6 preceding siblings ...)
2022-12-20 16:59 ` [Bug target/108141] " jakub at gcc dot gnu.org
@ 2023-03-03 15:11 ` vmakarov at gcc dot gnu.org
7 siblings, 0 replies; 9+ messages in thread
From: vmakarov at gcc dot gnu.org @ 2023-03-03 15:11 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141
--- Comment #7 from Vladimir Makarov <vmakarov at gcc dot gnu.org> ---
(In reply to Jakub Jelinek from comment #6)
> The change has been reverted, so this is no longer a regression.
Just for the info. The patch I reverted resulted in wrong calculation of
pressure classes (there was a single pressure class ALL_REGS). This affected
register pressure calculation and as a consequence using one region only. W/o
the patch IRA uses regional register allocation for the loops in the test.
I pushed another patch for PR90706. I hope it will not create such problems as
the previous patch.
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2023-03-03 15:11 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-16 9:00 [Bug testsuite/108141] New: [13 Regression] gcc.target/i386/pr64110.c FAIL since r13-4727 on ia32 jakub at gcc dot gnu.org
2022-12-16 9:01 ` [Bug testsuite/108141] " jakub at gcc dot gnu.org
2022-12-16 9:03 ` jakub at gcc dot gnu.org
2022-12-16 11:12 ` rguenth at gcc dot gnu.org
2022-12-16 11:36 ` jakub at gcc dot gnu.org
2022-12-16 11:46 ` rguenth at gcc dot gnu.org
2022-12-16 18:22 ` cvs-commit at gcc dot gnu.org
2022-12-20 16:59 ` [Bug target/108141] " jakub at gcc dot gnu.org
2023-03-03 15:11 ` vmakarov at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).