public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/108229] New: [13 Regression] unprofitable STV transform
@ 2022-12-26 17:38 amonakov at gcc dot gnu.org
  2022-12-27 12:06 ` [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928 marxin at gcc dot gnu.org
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: amonakov at gcc dot gnu.org @ 2022-12-26 17:38 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229

            Bug ID: 108229
           Summary: [13 Regression] unprofitable STV transform
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: amonakov at gcc dot gnu.org
  Target Milestone: ---
            Target: x86_64-*-*

In the following example, the STV (scalar-to-vector) pass makes a very
unprofitable transformation on trunk, but not on gcc-12:

#include <stddef.h>
#include <stdint.h>

/* Singly linked list node: a link pointer followed by 511 64-bit words. */
struct b {
        struct b *next;         /* next node; sum() stops when this is null */
        uint64_t data[511];     /* payload words summed by sum() */
};

/* GNU C vector type: 16 bytes, i.e. two uint64_t lanes. */
typedef uint64_t u64v2 __attribute__((vector_size(16)));
/*
 * Add the words starting at x into the four vector accumulators s[0..3].
 *
 * s: four accumulators, updated in place.
 * x: first word to read; accessed as a sequence of 16-byte vectors.
 * n: number of uint64_t words to consume. Each iteration reads eight words
 *    and subtracts 8 from n, and the loop exits only when n is exactly 0,
 *    so n must be a multiple of 8 for the loop to terminate.
 */
static inline
void vsum(u64v2 s[], uint64_t *x, size_t n)
{
        /* may_alias variant so the uint64_t storage can be read as vectors. */
        typedef u64v2 u64v2_u __attribute__((may_alias));
        u64v2_u *vx = (void *)x;
        /* One iteration = four vectors = eight words, one per accumulator. */
        for (; n; vx += 4, n -= 8) {
                s[0] += vx[0];
                s[1] += vx[1];
                s[2] += vx[2];
                s[3] += vx[3];
        }
}

/*
 * Return the sum of every data word in the list starting at b.
 *
 * For each node, data[7..510] (504 words) are accumulated through vsum()
 * into four vector accumulators, and data[0..6] are added to a scalar
 * accumulator. After the last node the vector accumulators are folded
 * together and both lanes are added to the scalar total.
 *
 * b must be non-null: the do/while body dereferences it before the
 * loop condition is tested.
 */
uint64_t sum(struct b *b)
{
        uint64_t s = 0;
        u64v2 vs[4] = { 0 };    /* all four vector accumulators start at zero */
        do {
                /* 511-7 = 504 words, a multiple of the 8 words per vsum() step. */
                vsum(vs, b->data + 7, 511-7);
                /* Leading seven words go into the scalar accumulator. */
#pragma GCC unroll(7)
                for (int i = 0; i < 7; i++)
                        s += b->data[i];
        } while ((b = b->next));        /* advance; stop on a null next pointer */
        /* Fold the four vector accumulators into vs[0]. */
        vs[0] += vs[1] + vs[2] + vs[3];
        /* Add both 64-bit lanes of the folded vector to the scalar sum. */
        return s + vs[0][0] + vs[0][1];
}

gcc -O2 -mavx (-mavx is not necessary, plain -O2 also triggers it):

sum:
        vpxor   xmm2, xmm2, xmm2
        vmovdqa xmm1, xmm2
        vmovdqa xmm3, xmm2
        vmovdqa xmm0, xmm2
        vmovdqa xmm5, xmm2
.L3:
        lea     rax, [rdi+64]
        lea     rdx, [rdi+4096]
.L2:
        vpaddq  xmm0, xmm0, XMMWORD PTR [rax]
        vpaddq  xmm3, xmm3, XMMWORD PTR [rax+16]
        add     rax, 64
        vpaddq  xmm1, xmm1, XMMWORD PTR [rax-32]
        vpaddq  xmm2, xmm2, XMMWORD PTR [rax-16]
        cmp     rdx, rax
        jne     .L2
        vmovq   xmm6, QWORD PTR [rdi+16]
        vmovq   xmm4, QWORD PTR [rdi+8]
        vpaddq  xmm4, xmm4, xmm6
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+24]
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+32]
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+40]
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+48]
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+56]
        mov     rdi, QWORD PTR [rdi]
        vpaddq  xmm5, xmm4, xmm5
        test    rdi, rdi
        jne     .L3
        vpaddq  xmm1, xmm1, xmm2
        vpaddq  xmm0, xmm0, xmm3
        vpaddq  xmm0, xmm0, xmm1
        vmovdqa xmm1, xmm0
        vpsrldq xmm0, xmm0, 8
        vpaddq  xmm0, xmm1, xmm0
        vpaddq  xmm0, xmm0, xmm5
        vmovq   rax, xmm0
        ret

compare with gcc -O2 -mavx -mno-stv:

sum:
        vpxor   xmm2, xmm2, xmm2
        xor     edx, edx
        vmovdqa xmm1, xmm2
        vmovdqa xmm3, xmm2
        vmovdqa xmm0, xmm2
.L3:
        lea     rax, [rdi+64]
        lea     rcx, [rdi+4096]
.L2:
        vpaddq  xmm0, xmm0, XMMWORD PTR [rax]
        vpaddq  xmm3, xmm3, XMMWORD PTR [rax+16]
        add     rax, 64
        vpaddq  xmm1, xmm1, XMMWORD PTR [rax-32]
        vpaddq  xmm2, xmm2, XMMWORD PTR [rax-16]
        cmp     rcx, rax
        jne     .L2
        mov     rax, QWORD PTR [rdi+16]
        add     rax, QWORD PTR [rdi+8]
        add     rdx, rax
        add     rdx, QWORD PTR [rdi+24]
        add     rdx, QWORD PTR [rdi+32]
        add     rdx, QWORD PTR [rdi+40]
        add     rdx, QWORD PTR [rdi+48]
        add     rdx, QWORD PTR [rdi+56]
        mov     rdi, QWORD PTR [rdi]
        test    rdi, rdi
        jne     .L3
        vpaddq  xmm0, xmm0, xmm3
        vpaddq  xmm1, xmm1, xmm2
        vpaddq  xmm0, xmm0, xmm1
        vmovq   rcx, xmm0
        vpextrq rax, xmm0, 1
        add     rax, rcx
        add     rax, rdx
        ret

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-01-07  9:45 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-26 17:38 [Bug target/108229] New: [13 Regression] unprofitable STV transform amonakov at gcc dot gnu.org
2022-12-27 12:06 ` [Bug target/108229] [13 Regression] unprofitable STV transform since r13-4873-g0b2c1369d035e928 marxin at gcc dot gnu.org
2022-12-28  9:40 ` roger at nextmovesoftware dot com
2022-12-28 10:37 ` amonakov at gcc dot gnu.org
2023-01-03 13:38 ` cvs-commit at gcc dot gnu.org
2023-01-07  9:45 ` roger at nextmovesoftware dot com

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).