public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/108229] New: [13 Regression] unprofitable STV transform
@ 2022-12-26 17:38 amonakov at gcc dot gnu.org
2022-12-27 12:06 ` [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928 marxin at gcc dot gnu.org
` (4 more replies)
0 siblings, 5 replies; 6+ messages in thread
From: amonakov at gcc dot gnu.org @ 2022-12-26 17:38 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229
Bug ID: 108229
Summary: [13 Regression] unprofitable STV transform
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: amonakov at gcc dot gnu.org
Target Milestone: ---
Target: x86_64-*-*
In the following example, STV is making a very unprofitable transformation on
trunk, but not on gcc-12:
#include <stddef.h>
#include <stdint.h>
struct b {
struct b *next;
uint64_t data[511];
};
typedef uint64_t u64v2 __attribute__((vector_size(16)));
static inline
void vsum(u64v2 s[], uint64_t *x, size_t n)
{
typedef u64v2 u64v2_u __attribute__((may_alias));
u64v2_u *vx = (void *)x;
for (; n; vx += 4, n -= 8) {
s[0] += vx[0];
s[1] += vx[1];
s[2] += vx[2];
s[3] += vx[3];
}
}
uint64_t sum(struct b *b)
{
uint64_t s = 0;
u64v2 vs[4] = { 0 };
do {
vsum(vs, b->data + 7, 511-7);
#pragma GCC unroll(7)
for (int i = 0; i < 7; i++)
s += b->data[i];
} while ((b = b->next));
vs[0] += vs[1] + vs[2] + vs[3];
return s + vs[0][0] + vs[0][1];
}
gcc -O2 -mavx (-mavx is not necessary, plain -O2 also triggers it):
sum:
vpxor xmm2, xmm2, xmm2
vmovdqa xmm1, xmm2
vmovdqa xmm3, xmm2
vmovdqa xmm0, xmm2
vmovdqa xmm5, xmm2
.L3:
lea rax, [rdi+64]
lea rdx, [rdi+4096]
.L2:
vpaddq xmm0, xmm0, XMMWORD PTR [rax]
vpaddq xmm3, xmm3, XMMWORD PTR [rax+16]
add rax, 64
vpaddq xmm1, xmm1, XMMWORD PTR [rax-32]
vpaddq xmm2, xmm2, XMMWORD PTR [rax-16]
cmp rdx, rax
jne .L2
vmovq xmm6, QWORD PTR [rdi+16]
vmovq xmm4, QWORD PTR [rdi+8]
vpaddq xmm4, xmm4, xmm6
vpaddq xmm4, xmm4, xmm5
vmovq xmm5, QWORD PTR [rdi+24]
vpaddq xmm4, xmm4, xmm5
vmovq xmm5, QWORD PTR [rdi+32]
vpaddq xmm4, xmm4, xmm5
vmovq xmm5, QWORD PTR [rdi+40]
vpaddq xmm4, xmm4, xmm5
vmovq xmm5, QWORD PTR [rdi+48]
vpaddq xmm4, xmm4, xmm5
vmovq xmm5, QWORD PTR [rdi+56]
mov rdi, QWORD PTR [rdi]
vpaddq xmm5, xmm4, xmm5
test rdi, rdi
jne .L3
vpaddq xmm1, xmm1, xmm2
vpaddq xmm0, xmm0, xmm3
vpaddq xmm0, xmm0, xmm1
vmovdqa xmm1, xmm0
vpsrldq xmm0, xmm0, 8
vpaddq xmm0, xmm1, xmm0
vpaddq xmm0, xmm0, xmm5
vmovq rax, xmm0
ret
compare with gcc -O2 -mavx -mno-stv:
sum:
vpxor xmm2, xmm2, xmm2
xor edx, edx
vmovdqa xmm1, xmm2
vmovdqa xmm3, xmm2
vmovdqa xmm0, xmm2
.L3:
lea rax, [rdi+64]
lea rcx, [rdi+4096]
.L2:
vpaddq xmm0, xmm0, XMMWORD PTR [rax]
vpaddq xmm3, xmm3, XMMWORD PTR [rax+16]
add rax, 64
vpaddq xmm1, xmm1, XMMWORD PTR [rax-32]
vpaddq xmm2, xmm2, XMMWORD PTR [rax-16]
cmp rcx, rax
jne .L2
mov rax, QWORD PTR [rdi+16]
add rax, QWORD PTR [rdi+8]
add rdx, rax
add rdx, QWORD PTR [rdi+24]
add rdx, QWORD PTR [rdi+32]
add rdx, QWORD PTR [rdi+40]
add rdx, QWORD PTR [rdi+48]
add rdx, QWORD PTR [rdi+56]
mov rdi, QWORD PTR [rdi]
test rdi, rdi
jne .L3
vpaddq xmm0, xmm0, xmm3
vpaddq xmm1, xmm1, xmm2
vpaddq xmm0, xmm0, xmm1
vmovq rcx, xmm0
vpextrq rax, xmm0, 1
add rax, rcx
add rax, rdx
ret
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928
2022-12-26 17:38 [Bug target/108229] New: [13 Regression] unprofitable STV transform amonakov at gcc dot gnu.org
@ 2022-12-27 12:06 ` marxin at gcc dot gnu.org
2022-12-28 9:40 ` roger at nextmovesoftware dot com
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: marxin at gcc dot gnu.org @ 2022-12-27 12:06 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229
Martin Liška <marxin at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Last reconfirmed| |2022-12-27
CC| |marxin at gcc dot gnu.org,
| |sayle at gcc dot gnu.org
Status|UNCONFIRMED |NEW
Ever confirmed|0 |1
Summary|[13 Regression] |[13 Regression]
|unprofitable STV transform |unprofitable STV transform
| |since
| |r13-4873-g0b2c1369d035e928
--- Comment #1 from Martin Liška <marxin at gcc dot gnu.org> ---
Started with r13-4873-g0b2c1369d035e928.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928
2022-12-26 17:38 [Bug target/108229] New: [13 Regression] unprofitable STV transform amonakov at gcc dot gnu.org
2022-12-27 12:06 ` [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928 marxin at gcc dot gnu.org
@ 2022-12-28 9:40 ` roger at nextmovesoftware dot com
2022-12-28 10:37 ` amonakov at gcc dot gnu.org
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: roger at nextmovesoftware dot com @ 2022-12-28 9:40 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229
Roger Sayle <roger at nextmovesoftware dot com> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |roger at nextmovesoftware dot com
--- Comment #2 from Roger Sayle <roger at nextmovesoftware dot com> ---
Created attachment 54159
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=54159&action=edit
compute_convert_gain patch
I believe the attached patch should improve things, by tweaking the gains/costs
used by the STV pass. Previously, GCC wasn't taking into account the
conversion of memory references in operands of PLUS. In terms of u-ops/memory
loads, the before and after in the bugzilla PR are pretty much equivalent, so
the effect is subtle and probably depends on target microarchitecture. If
folks could benchmark this change, and post the results I'd very much
appreciate it. It would also be good to understand why/where this change
was/is considered "unprofitable". The one thing I can guarantee is that with
this proposed patch, GCC no longer increases function size with -Os [a tangible
measure of unprofitable].
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928
2022-12-26 17:38 [Bug target/108229] New: [13 Regression] unprofitable STV transform amonakov at gcc dot gnu.org
2022-12-27 12:06 ` [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928 marxin at gcc dot gnu.org
2022-12-28 9:40 ` roger at nextmovesoftware dot com
@ 2022-12-28 10:37 ` amonakov at gcc dot gnu.org
2023-01-03 13:38 ` cvs-commit at gcc dot gnu.org
2023-01-07 9:45 ` roger at nextmovesoftware dot com
4 siblings, 0 replies; 6+ messages in thread
From: amonakov at gcc dot gnu.org @ 2022-12-28 10:37 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229
--- Comment #3 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
Thank you! I considered this unprofitable for these reasons:
1. As you said, the code grows in size, but the speed benefit is not clear.
2. The transform converts load+add operations in a loop, and their final uses
outside of the loop. How does the costing work in this case, i.e. how are
changes for the more frequently executed instructions are weighted against
changes for the instructions that will be executed once?
3. The scalar 'add reg, mem' instruction results in one micro-fused uop that is
handled as one uop during renaming (one of narrowest point in the pipeline). It
is then issued on two execution units (for the load and for the add).
4. On AMD, there are separate fp/simd pipes, so when the code is already
simd-heavy as in this example, STV offloads instructions from the integer pipes
to the possibly already-busy simd/fp pipes.
That said, the transformed portion is small relative to the inner loop of the
example, so benchmarking yesterday's trunk with/without -mno-stv on Zen 2, I
get:
27.26 bytes/cycle, 3.07 instruction/cycle
vs.
26.01 bytes/cycle, 2.97 instruction/cycle
So it's not the end of the world for this particular example, but I wanted to
raise the issue in case there's a costing problem in STV that needs correcting.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928
2022-12-26 17:38 [Bug target/108229] New: [13 Regression] unprofitable STV transform amonakov at gcc dot gnu.org
` (2 preceding siblings ...)
2022-12-28 10:37 ` amonakov at gcc dot gnu.org
@ 2023-01-03 13:38 ` cvs-commit at gcc dot gnu.org
2023-01-07 9:45 ` roger at nextmovesoftware dot com
4 siblings, 0 replies; 6+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2023-01-03 13:38 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229
--- Comment #4 from CVS Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Roger Sayle <sayle@gcc.gnu.org>:
https://gcc.gnu.org/g:de59d8bd163a4b2e50ab566441ab49d7212c3356
commit r13-4976-gde59d8bd163a4b2e50ab566441ab49d7212c3356
Author: Roger Sayle <roger@nextmovesoftware.com>
Date: Tue Jan 3 13:37:31 2023 +0000
PR target/108229: A minor STV compute_convert_gain tweak on x86.
This patch addresses PR target/108229, which is a change in code
generation during the STV pass, due to the recently approved patch
to handle vec_select (reductions) in the vector unit. The recent
change is innocent, but exposes a latent STV "gain" calculation issue
that is benign (or closely balanced) on most microarchitectures.
The issue is when STV considers converting PLUS with a MEM operand.
On TARGET_64BIT (m=1):
addq 24(%rdi), %rdx // 4 bytes
or with -m32 (m=2)
addl 24(%esi), %eax // 3 bytes
adcl 28(%esi), %edx // 3 bytes
is being converted by STV to
vmovq 24(%rdi), %xmm5 // 5 bytes
vpaddq %xmm5, %xmm4, %xmm4 // 4 bytes
The current code in general_scalar_chain::compute_convert_gain
considers that scalar unit addition is replaced with a vector
unit addition (usually about the same cost), but doesn't consider
anything special about MEM operands, assuming that a scalar load
gains/costs nothing compared to a vector load. We can allow the
backend slightly better fine tuning by including in the gain
calculation that m scalar loads are being replaced by one vector
load, and when optimizing for size including that we're increasing
code size (e.g. an extra vmovq instruction for a MEM operand).
This patch is a win on the CSiBE benchmark when compiled with -Os.
2023-01-03 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/108229
* config/i386/i386-features.cc
(general_scalar_chain::compute_convert_gain) <case PLUS>: Consider
the gain/cost of converting a MEM operand.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928
2022-12-26 17:38 [Bug target/108229] New: [13 Regression] unprofitable STV transform amonakov at gcc dot gnu.org
` (3 preceding siblings ...)
2023-01-03 13:38 ` cvs-commit at gcc dot gnu.org
@ 2023-01-07 9:45 ` roger at nextmovesoftware dot com
4 siblings, 0 replies; 6+ messages in thread
From: roger at nextmovesoftware dot com @ 2023-01-07 9:45 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229
Roger Sayle <roger at nextmovesoftware dot com> changed:
What |Removed |Added
----------------------------------------------------------------------------
Resolution|--- |FIXED
Status|NEW |RESOLVED
--- Comment #5 from Roger Sayle <roger at nextmovesoftware dot com> ---
This should now be fixed on mainline. The backend's microarchitecture tuning
now has much better control over whether Alexander's loop is converted by STV,
and this no longer happens at -Os where this obviously isn't a win/correct.
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2023-01-07 9:45 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-26 17:38 [Bug target/108229] New: [13 Regression] unprofitable STV transform amonakov at gcc dot gnu.org
2022-12-27 12:06 ` [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928 marxin at gcc dot gnu.org
2022-12-28 9:40 ` roger at nextmovesoftware dot com
2022-12-28 10:37 ` amonakov at gcc dot gnu.org
2023-01-03 13:38 ` cvs-commit at gcc dot gnu.org
2023-01-07 9:45 ` roger at nextmovesoftware dot com
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).