public inbox for gcc-bugs@sourceware.org
From: "rguenth at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/108724] Poor codegen when summing two arrays without AVX or SSE
Date: Fri, 10 Feb 2023 10:00:29 +0000	[thread overview]
Message-ID: <bug-108724-4-pVrJb1tiY4@http.gcc.gnu.org/bugzilla/> (raw)
In-Reply-To: <bug-108724-4@http.gcc.gnu.org/bugzilla/>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108724

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           See Also|                            |https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101801

--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
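
For context, the loop in question is of roughly this shape (a hypothetical
reconstruction, not the literal testcase from the PR; restrict is assumed
here since the generated code shows no runtime alias checks):

/* Sum two arrays of 16 ints; built for x86-64 with SSE/AVX disabled,
   e.g. -O3 -mno-sse -mno-avx (the exact flags are an assumption).  */
void
foo (int *restrict a, const int *restrict b, const int *restrict c)
{
  for (int i = 0; i < 16; i++)
    a[i] = b[i] + c[i];
}
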
So the vectorizer thinks that

foo:
.LFB0:
        .cfi_startproc
        movabsq $9223372034707292159, %rax
        movq    %rdi, %rcx
        movq    (%rdx), %r10
        movq    (%rsi), %rdi
        movq    %rax, %r8
        movq    %rax, %r9
        movq    %rax, %r11
        andq    %r10, %r8
        andq    %rdi, %r9
        addq    %r8, %r9
        movq    %rdi, %r8
        movabsq $-9223372034707292160, %rdi
        xorq    %r10, %r8
        movq    8(%rdx), %r10
        andq    %rdi, %r8
        xorq    %r9, %r8
        movq    %rax, %r9
        movq    %r8, (%rcx)
        movq    8(%rsi), %r8
        andq    %r10, %r9
        andq    %r8, %r11
        xorq    %r10, %r8
        movq    16(%rdx), %r10
        addq    %r11, %r9
        andq    %rdi, %r8
        movq    %rax, %r11
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 8(%rcx)
        movq    16(%rsi), %r8
        andq    %r8, %r9
        xorq    %r10, %r8
        movq    24(%rdx), %r10
        addq    %r11, %r9
        andq    %rdi, %r8
        movq    %rax, %r11
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 16(%rcx)
        movq    24(%rsi), %r8
        andq    %r8, %r9
        xorq    %r10, %r8
        movq    32(%rdx), %r10
        addq    %r11, %r9
        andq    %rdi, %r8
        movq    %rax, %r11
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 24(%rcx)
        movq    32(%rsi), %r8
        andq    %r8, %r9
        xorq    %r10, %r8
        movq    40(%rdx), %r10
        addq    %r11, %r9
        andq    %rdi, %r8
        movq    %rax, %r11
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 32(%rcx)
        movq    40(%rsi), %r8
        andq    %r8, %r9
        addq    %r11, %r9
        xorq    %r10, %r8
        movq    48(%rsi), %r10
        movq    %rax, %r11
        andq    %rdi, %r8
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 40(%rcx)
        movq    48(%rdx), %r8
        movq    56(%rdx), %rdx
        andq    %r8, %r9
        xorq    %r10, %r8
        addq    %r11, %r9
        andq    %rdi, %r8
        xorq    %r9, %r8
        movq    %r8, 48(%rcx)
        movq    56(%rsi), %r8
        movq    %rax, %rsi
        andq    %rdx, %rsi
        andq    %r8, %rax
        xorq    %r8, %rdx
        addq    %rsi, %rax
        andq    %rdi, %rdx
        xorq    %rdx, %rax
        movq    %rax, 56(%rcx)
        ret

will be faster than not vectorizing (a C sketch of the lane-masking trick
used above follows the second listing).  Not vectorizing produces

foo:
.LFB0:
        .cfi_startproc
        movq    %rsi, %rcx
        movl    (%rsi), %esi
        addl    (%rdx), %esi
        movl    %esi, (%rdi)
        movl    4(%rdx), %esi
        addl    4(%rcx), %esi
        movl    %esi, 4(%rdi)
        movl    8(%rdx), %esi
        addl    8(%rcx), %esi
        movl    %esi, 8(%rdi)
        movl    12(%rdx), %esi
        addl    12(%rcx), %esi
        movl    %esi, 12(%rdi)
        movl    16(%rdx), %esi
        addl    16(%rcx), %esi
        movl    %esi, 16(%rdi)
        movl    20(%rdx), %esi
        addl    20(%rcx), %esi
        movl    %esi, 20(%rdi)
        movl    24(%rdx), %esi
        addl    24(%rcx), %esi
        movl    %esi, 24(%rdi)
        movl    28(%rdx), %esi
        addl    28(%rcx), %esi
        movl    %esi, 28(%rdi)
        movl    32(%rdx), %esi
        addl    32(%rcx), %esi
        movl    %esi, 32(%rdi)
        movl    36(%rdx), %esi
        addl    36(%rcx), %esi
        movl    %esi, 36(%rdi)
        movl    40(%rdx), %esi
        addl    40(%rcx), %esi
        movl    %esi, 40(%rdi)
        movl    44(%rdx), %esi
        addl    44(%rcx), %esi
        movl    %esi, 44(%rdi)
        movl    48(%rdx), %esi
        addl    48(%rcx), %esi
        movl    %esi, 48(%rdi)
        movl    52(%rdx), %esi
        addl    52(%rcx), %esi
        movl    %esi, 52(%rdi)
        movl    56(%rdx), %esi
        movl    60(%rdx), %edx
        addl    56(%rcx), %esi
        addl    60(%rcx), %edx
        movl    %esi, 56(%rdi)
        movl    %edx, 60(%rdi)
        ret
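
The first listing above is the lowered form of the word_mode vector add:
each 64-bit register holds two 32-bit lanes, and the constants
0x7fffffff7fffffff / 0x8000000080000000 keep a carry in the low lane from
spilling into the high lane.  A minimal C sketch of that trick (a
reconstruction from the assembly above, not the vectorizer's literal
output):

#include <stdint.h>

/* Add two 32-bit lanes packed into one 64-bit word without letting a
   carry cross the lane boundary: add the low 31 bits of each lane,
   then recompute each lane's top bit with XOR.  */
static inline uint64_t
add_two_si_lanes (uint64_t a, uint64_t b)
{
  const uint64_t lo_mask = 0x7fffffff7fffffffULL;  /* 9223372034707292159  */
  const uint64_t hi_mask = 0x8000000080000000ULL;  /* -9223372034707292160 */
  return ((a & lo_mask) + (b & lo_mask)) ^ ((a ^ b) & hi_mask);
}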

The vectorizer produces un-lowered vector adds, which is good in case
follow-up optimizations are possible (the ops are not obfuscated), but also
bad because unrolling then misestimates the size.  The costs are:

*_3 1 times scalar_load costs 12 in prologue
*_5 1 times scalar_load costs 12 in prologue 
_4 + _6 1 times scalar_stmt costs 4 in prologue
_8 1 times scalar_store costs 12 in prologue 

vs

*_3 1 times unaligned_load (misalign -1) costs 12 in body
*_5 1 times unaligned_load (misalign -1) costs 12 in body
_4 + _6 1 times vector_stmt costs 4 in body
_4 + _6 5 times scalar_stmt costs 20 in body
_8 1 times unaligned_store (misalign -1) costs 12 in body

and as usual the savings from the wide loads and store outweigh the
loss on the arithmetic side, but it's a close call:

  Vector inside of loop cost: 60
  Scalar iteration cost: 40

so 2*40 = 80 > 60 and the vectorization is deemed profitable.
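
Spelled out, the check boils down to comparing the vector body cost against
the vectorization factor times the scalar iteration cost (a simplified
sketch with the numbers from the dump above; the real computation in the
vectorizer also accounts for prologue/epilogue costs):

/* Simplified profitability comparison, not the actual vectorizer code.  */
int vf = 2;                 /* two ints per 64-bit word_mode vector */
int vec_inside_cost = 60;   /* "Vector inside of loop cost"         */
int scalar_iter_cost = 40;  /* "Scalar iteration cost"              */
int profitable = vec_inside_cost < vf * scalar_iter_cost;   /* 60 < 80 */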

The easiest fix is to avoid vectorizing; another possibility is to adhere
to the vectorizer's decision and expand to the lowered sequence immediately
within the vectorizer itself.  The original goal was to remove the hard cap
on the number of elements, but this bug shows that work was incomplete.

I'm going to reinstate the hard cap and revisit this for GCC 14.
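
Until the cap is back, disabling loop vectorization sidesteps the problem;
a workaround sketch (the function name is made up, the flag and the
attribute are standard GCC options):

/* Either build with -fno-tree-vectorize, or disable it per function:  */
__attribute__((optimize ("no-tree-vectorize")))
void
foo_novec (int *restrict a, const int *restrict b, const int *restrict c)
{
  for (int i = 0; i < 16; i++)
    a[i] = b[i] + c[i];
}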

Thread overview: 13+ messages
2023-02-08 19:17 [Bug target/108724] New: [11 regression] " gbs at canishe dot com
2023-02-08 19:30 ` [Bug tree-optimization/108724] " pinskia at gcc dot gnu.org
2023-02-09  9:37 ` crazylht at gmail dot com
2023-02-09 13:54 ` rguenth at gcc dot gnu.org
2023-02-10 10:00 ` rguenth at gcc dot gnu.org [this message]
2023-02-10 10:07 ` [Bug tree-optimization/108724] [11/12/13 Regression] " rguenth at gcc dot gnu.org
2023-02-10 11:22 ` cvs-commit at gcc dot gnu.org
2023-02-10 11:22 ` [Bug tree-optimization/108724] [11/12 " rguenth at gcc dot gnu.org
2023-03-15  9:48 ` cvs-commit at gcc dot gnu.org
2023-05-05  8:34 ` [Bug tree-optimization/108724] [11 " rguenth at gcc dot gnu.org
2023-05-05 12:06 ` [Bug target/108724] " rguenth at gcc dot gnu.org
2023-05-23 12:55 ` rguenth at gcc dot gnu.org
2023-05-29 10:08 ` jakub at gcc dot gnu.org
