[Bug target/108229] New: [13 Regression] unprofitable STV transform

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug target/108229] New: [13 Regression] unprofitable STV transform
@ 2022-12-26 17:38 amonakov at gcc dot gnu.org
  2022-12-27 12:06 ` [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928 marxin at gcc dot gnu.org
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: amonakov at gcc dot gnu.org @ 2022-12-26 17:38 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229

            Bug ID: 108229
           Summary: [13 Regression] unprofitable STV transform
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: amonakov at gcc dot gnu.org
  Target Milestone: ---
            Target: x86_64-*-*

In the following example, STV is making a very unprofitable transformation on
trunk, but not on gcc-12:

#include <stddef.h>
#include <stdint.h>

struct b {
        struct b *next;
        uint64_t data[511];
};

typedef uint64_t u64v2 __attribute__((vector_size(16)));
static inline
void vsum(u64v2 s[], uint64_t *x, size_t n)
{
        typedef u64v2 u64v2_u __attribute__((may_alias));
        u64v2_u *vx = (void *)x;
        for (; n; vx += 4, n -= 8) {
                s[0] += vx[0];
                s[1] += vx[1];
                s[2] += vx[2];
                s[3] += vx[3];
        }
}

uint64_t sum(struct b *b)
{
        uint64_t s = 0;
        u64v2 vs[4] = { 0 };
        do {
                vsum(vs, b->data + 7, 511-7);
#pragma GCC unroll(7)
                for (int i = 0; i < 7; i++)
                        s += b->data[i];
        } while ((b = b->next));
        vs[0] += vs[1] + vs[2] + vs[3];
        return s + vs[0][0] + vs[0][1];
}

gcc -O2 -mavx (-mavx is not necessary, plain -O2 also triggers it):

sum:
        vpxor   xmm2, xmm2, xmm2
        vmovdqa xmm1, xmm2
        vmovdqa xmm3, xmm2
        vmovdqa xmm0, xmm2
        vmovdqa xmm5, xmm2
.L3:
        lea     rax, [rdi+64]
        lea     rdx, [rdi+4096]
.L2:
        vpaddq  xmm0, xmm0, XMMWORD PTR [rax]
        vpaddq  xmm3, xmm3, XMMWORD PTR [rax+16]
        add     rax, 64
        vpaddq  xmm1, xmm1, XMMWORD PTR [rax-32]
        vpaddq  xmm2, xmm2, XMMWORD PTR [rax-16]
        cmp     rdx, rax
        jne     .L2
        vmovq   xmm6, QWORD PTR [rdi+16]
        vmovq   xmm4, QWORD PTR [rdi+8]
        vpaddq  xmm4, xmm4, xmm6
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+24]
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+32]
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+40]
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+48]
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+56]
        mov     rdi, QWORD PTR [rdi]
        vpaddq  xmm5, xmm4, xmm5
        test    rdi, rdi
        jne     .L3
        vpaddq  xmm1, xmm1, xmm2
        vpaddq  xmm0, xmm0, xmm3
        vpaddq  xmm0, xmm0, xmm1
        vmovdqa xmm1, xmm0
        vpsrldq xmm0, xmm0, 8
        vpaddq  xmm0, xmm1, xmm0
        vpaddq  xmm0, xmm0, xmm5
        vmovq   rax, xmm0
        ret

compare with gcc -O2 -mavx -mno-stv:

sum:
        vpxor   xmm2, xmm2, xmm2
        xor     edx, edx
        vmovdqa xmm1, xmm2
        vmovdqa xmm3, xmm2
        vmovdqa xmm0, xmm2
.L3:
        lea     rax, [rdi+64]
        lea     rcx, [rdi+4096]
.L2:
        vpaddq  xmm0, xmm0, XMMWORD PTR [rax]
        vpaddq  xmm3, xmm3, XMMWORD PTR [rax+16]
        add     rax, 64
        vpaddq  xmm1, xmm1, XMMWORD PTR [rax-32]
        vpaddq  xmm2, xmm2, XMMWORD PTR [rax-16]
        cmp     rcx, rax
        jne     .L2
        mov     rax, QWORD PTR [rdi+16]
        add     rax, QWORD PTR [rdi+8]
        add     rdx, rax
        add     rdx, QWORD PTR [rdi+24]
        add     rdx, QWORD PTR [rdi+32]
        add     rdx, QWORD PTR [rdi+40]
        add     rdx, QWORD PTR [rdi+48]
        add     rdx, QWORD PTR [rdi+56]
        mov     rdi, QWORD PTR [rdi]
        test    rdi, rdi
        jne     .L3
        vpaddq  xmm0, xmm0, xmm3
        vpaddq  xmm1, xmm1, xmm2
        vpaddq  xmm0, xmm0, xmm1
        vmovq   rcx, xmm0
        vpextrq rax, xmm0, 1
        add     rax, rcx
        add     rax, rdx
        ret

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928
  2022-12-26 17:38 [Bug target/108229] New: [13 Regression] unprofitable STV transform amonakov at gcc dot gnu.org
@ 2022-12-27 12:06 ` marxin at gcc dot gnu.org
  2022-12-28  9:40 ` roger at nextmovesoftware dot com
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: marxin at gcc dot gnu.org @ 2022-12-27 12:06 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229

Martin Liška <marxin at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Last reconfirmed|                            |2022-12-27
                 CC|                            |marxin at gcc dot gnu.org,
                   |                            |sayle at gcc dot gnu.org
             Status|UNCONFIRMED                 |NEW
     Ever confirmed|0                           |1
            Summary|[13 Regression]             |[13 Regression]
                   |unprofitable STV transform  |unprofitable STV transform
                   |                            |since
                   |                            |r13-4873-g0b2c1369d035e928

--- Comment #1 from Martin Liška <marxin at gcc dot gnu.org> ---
Started with r13-4873-g0b2c1369d035e928.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928
  2022-12-26 17:38 [Bug target/108229] New: [13 Regression] unprofitable STV transform amonakov at gcc dot gnu.org
  2022-12-27 12:06 ` [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928 marxin at gcc dot gnu.org
@ 2022-12-28  9:40 ` roger at nextmovesoftware dot com
  2022-12-28 10:37 ` amonakov at gcc dot gnu.org
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: roger at nextmovesoftware dot com @ 2022-12-28  9:40 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229

Roger Sayle <roger at nextmovesoftware dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |roger at nextmovesoftware dot com

--- Comment #2 from Roger Sayle <roger at nextmovesoftware dot com> ---
Created attachment 54159
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=54159&action=edit
compute_convert_gain patch

I believe the attached patch should improve things, by tweaking the gains/costs
used by the STV pass.  Previously, GCC wasn't taking into account the
conversion of memory references in operands of PLUS.  In terms of u-ops/memory
loads, the before and after in the bugzilla PR are pretty much equivalent, so
the effect is subtle and probably depends on target microarchitecture.  If
folks could benchmark this change, and post the results I'd very much
appreciate it.  It would also be good to understand why/where this change
was/is considered "unprofitable".  The one thing I can guarantee is that with
this proposed patch, GCC no longer increases function size with -Os [a tangible
measure of unprofitable].

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928
  2022-12-26 17:38 [Bug target/108229] New: [13 Regression] unprofitable STV transform amonakov at gcc dot gnu.org
  2022-12-27 12:06 ` [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928 marxin at gcc dot gnu.org
  2022-12-28  9:40 ` roger at nextmovesoftware dot com
@ 2022-12-28 10:37 ` amonakov at gcc dot gnu.org
  2023-01-03 13:38 ` cvs-commit at gcc dot gnu.org
  2023-01-07  9:45 ` roger at nextmovesoftware dot com
  4 siblings, 0 replies; 6+ messages in thread
From: amonakov at gcc dot gnu.org @ 2022-12-28 10:37 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229

--- Comment #3 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
Thank you! I considered this unprofitable for these reasons:

1. As you said, the code grows in size, but the speed benefit is not clear.

2. The transform converts load+add operations in a loop, and their final uses
outside of the loop. How does the costing work in this case, i.e. how are
changes for the more frequently executed instructions are weighted against
changes for the instructions that will be executed once?

3. The scalar 'add reg, mem' instruction results in one micro-fused uop that is
handled as one uop during renaming (one of narrowest point in the pipeline). It
is then issued on two execution units (for the load and for the add).

4. On AMD, there are separate fp/simd pipes, so when the code is already
simd-heavy as in this example, STV offloads instructions from the integer pipes
to the possibly already-busy simd/fp pipes.

That said, the transformed portion is small relative to the inner loop of the
example, so benchmarking yesterday's trunk with/without -mno-stv on Zen 2, I
get:

27.26 bytes/cycle, 3.07 instruction/cycle

vs.

26.01 bytes/cycle, 2.97 instruction/cycle

So it's not the end of the world for this particular example, but I wanted to
raise the issue in case there's a costing problem in STV that needs correcting.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928
  2022-12-26 17:38 [Bug target/108229] New: [13 Regression] unprofitable STV transform amonakov at gcc dot gnu.org
                   ` (2 preceding siblings ...)
  2022-12-28 10:37 ` amonakov at gcc dot gnu.org
@ 2023-01-03 13:38 ` cvs-commit at gcc dot gnu.org
  2023-01-07  9:45 ` roger at nextmovesoftware dot com
  4 siblings, 0 replies; 6+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2023-01-03 13:38 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229

--- Comment #4 from CVS Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Roger Sayle <sayle@gcc.gnu.org>:

https://gcc.gnu.org/g:de59d8bd163a4b2e50ab566441ab49d7212c3356

commit r13-4976-gde59d8bd163a4b2e50ab566441ab49d7212c3356
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Tue Jan 3 13:37:31 2023 +0000

    PR target/108229: A minor STV compute_convert_gain tweak on x86.

    This patch addresses PR target/108229, which is a change in code
    generation during the STV pass, due to the recently approved patch
    to handle vec_select (reductions) in the vector unit.  The recent
    change is innocent, but exposes a latent STV "gain" calculation issue
    that is benign (or closely balanced) on most microarchitectures.

    The issue is when STV considers converting PLUS with a MEM operand.

    On TARGET_64BIT (m=1):
            addq 24(%rdi), %rdx             // 4 bytes
    or with -m32 (m=2)
            addl    24(%esi), %eax          // 3 bytes
            adcl    28(%esi), %edx          // 3 bytes
    is being converted by STV to
            vmovq   24(%rdi), %xmm5         // 5 bytes
            vpaddq  %xmm5, %xmm4, %xmm4     // 4 bytes

    The current code in general_scalar_chain::compute_convert_gain
    considers that scalar unit addition is replaced with a vector
    unit addition (usually about the same cost), but doesn't consider
    anything special about MEM operands, assuming that a scalar load
    gains/costs nothing compared to a vector load.  We can allow the
    backend slightly better fine tuning by including in the gain
    calculation that m scalar loads are being replaced by one vector
    load, and when optimizing for size including that we're increasing
    code size (e.g. an extra vmovq instruction for a MEM operand).

    This patch is a win on the CSiBE benchmark when compiled with -Os.

    2023-01-03  Roger Sayle  <roger@nextmovesoftware.com>

    gcc/ChangeLog
            PR target/108229
            * config/i386/i386-features.cc
            (general_scalar_chain::compute_convert_gain) <case PLUS>: Consider
            the gain/cost of converting a MEM operand.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928
  2022-12-26 17:38 [Bug target/108229] New: [13 Regression] unprofitable STV transform amonakov at gcc dot gnu.org
                   ` (3 preceding siblings ...)
  2023-01-03 13:38 ` cvs-commit at gcc dot gnu.org
@ 2023-01-07  9:45 ` roger at nextmovesoftware dot com
  4 siblings, 0 replies; 6+ messages in thread
From: roger at nextmovesoftware dot com @ 2023-01-07  9:45 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229

Roger Sayle <roger at nextmovesoftware dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
         Resolution|---                         |FIXED
             Status|NEW                         |RESOLVED

--- Comment #5 from Roger Sayle <roger at nextmovesoftware dot com> ---
This should now be fixed on mainline.  The backend's microarchitecture tuning
now has much better control over whether Alexander's loop is converted by STV,
and this no longer happens at -Os where this obviously isn't a win/correct.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-01-07  9:45 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-26 17:38 [Bug target/108229] New: [13 Regression] unprofitable STV transform amonakov at gcc dot gnu.org
2022-12-27 12:06 ` [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928 marxin at gcc dot gnu.org
2022-12-28  9:40 ` roger at nextmovesoftware dot com
2022-12-28 10:37 ` amonakov at gcc dot gnu.org
2023-01-03 13:38 ` cvs-commit at gcc dot gnu.org
2023-01-07  9:45 ` roger at nextmovesoftware dot com

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).