From: "rguenth at gcc dot gnu.org"
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/108724] Poor codegen when summing two arrays without AVX or SSE
Date: Fri, 10 Feb 2023 10:00:29 +0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108724

Richard Biener changed:

           What    |Removed |Added
----------------------------------------------------------------------------
           See Also|        |https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101801

--- Comment #4 from Richard Biener ---
So the vectorizer thinks that

foo:
.LFB0:
        .cfi_startproc
        movabsq $9223372034707292159, %rax
        movq    %rdi, %rcx
        movq    (%rdx), %r10
        movq    (%rsi), %rdi
        movq    %rax, %r8
        movq    %rax, %r9
        movq    %rax, %r11
        andq    %r10, %r8
        andq    %rdi, %r9
        addq    %r8, %r9
        movq    %rdi, %r8
        movabsq $-9223372034707292160, %rdi
        xorq    %r10, %r8
        movq    8(%rdx), %r10
        andq    %rdi, %r8
        xorq    %r9, %r8
        movq    %rax, %r9
        movq    %r8, (%rcx)
        movq    8(%rsi), %r8
        andq    %r10, %r9
        andq    %r8, %r11
        xorq    %r10, %r8
        movq    16(%rdx), %r10
        addq    %r11, %r9
        andq    %rdi, %r8
        movq    %rax, %r11
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 8(%rcx)
        movq    16(%rsi), %r8
        andq    %r8, %r9
        xorq    %r10, %r8
        movq    24(%rdx), %r10
        addq    %r11, %r9
        andq    %rdi, %r8
        movq    %rax, %r11
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 16(%rcx)
        movq    24(%rsi), %r8
        andq    %r8, %r9
        xorq    %r10, %r8
        movq    32(%rdx), %r10
        addq    %r11, %r9
        andq    %rdi, %r8
        movq    %rax, %r11
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 24(%rcx)
        movq    32(%rsi), %r8
        andq    %r8, %r9
        xorq    %r10, %r8
        movq    40(%rdx), %r10
        addq    %r11, %r9
        andq    %rdi, %r8
        movq    %rax, %r11
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 32(%rcx)
        movq    40(%rsi), %r8
        andq    %r8, %r9
        addq    %r11, %r9
        xorq    %r10, %r8
        movq    48(%rsi), %r10
        movq    %rax, %r11
        andq    %rdi, %r8
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 40(%rcx)
        movq    48(%rdx), %r8
        movq    56(%rdx), %rdx
        andq    %r8, %r9
        xorq    %r10, %r8
        addq    %r11, %r9
        andq    %rdi, %r8
        xorq    %r9, %r8
        movq    %r8, 48(%rcx)
        movq    56(%rsi), %r8
        movq    %rax, %rsi
        andq    %rdx, %rsi
        andq    %r8, %rax
        xorq    %r8, %rdx
        addq    %rsi, %rax
        andq    %rdi, %rdx
        xorq    %rdx, %rax
        movq    %rax, 56(%rcx)
        ret

will be faster than when not vectorizing.
Not vectorizing produces

foo:
.LFB0:
        .cfi_startproc
        movq    %rsi, %rcx
        movl    (%rsi), %esi
        addl    (%rdx), %esi
        movl    %esi, (%rdi)
        movl    4(%rdx), %esi
        addl    4(%rcx), %esi
        movl    %esi, 4(%rdi)
        movl    8(%rdx), %esi
        addl    8(%rcx), %esi
        movl    %esi, 8(%rdi)
        movl    12(%rdx), %esi
        addl    12(%rcx), %esi
        movl    %esi, 12(%rdi)
        movl    16(%rdx), %esi
        addl    16(%rcx), %esi
        movl    %esi, 16(%rdi)
        movl    20(%rdx), %esi
        addl    20(%rcx), %esi
        movl    %esi, 20(%rdi)
        movl    24(%rdx), %esi
        addl    24(%rcx), %esi
        movl    %esi, 24(%rdi)
        movl    28(%rdx), %esi
        addl    28(%rcx), %esi
        movl    %esi, 28(%rdi)
        movl    32(%rdx), %esi
        addl    32(%rcx), %esi
        movl    %esi, 32(%rdi)
        movl    36(%rdx), %esi
        addl    36(%rcx), %esi
        movl    %esi, 36(%rdi)
        movl    40(%rdx), %esi
        addl    40(%rcx), %esi
        movl    %esi, 40(%rdi)
        movl    44(%rdx), %esi
        addl    44(%rcx), %esi
        movl    %esi, 44(%rdi)
        movl    48(%rdx), %esi
        addl    48(%rcx), %esi
        movl    %esi, 48(%rdi)
        movl    52(%rdx), %esi
        addl    52(%rcx), %esi
        movl    %esi, 52(%rdi)
        movl    56(%rdx), %esi
        movl    60(%rdx), %edx
        addl    56(%rcx), %esi
        addl    60(%rcx), %edx
        movl    %esi, 56(%rdi)
        movl    %edx, 60(%rdi)
        ret

The vectorizer produces un-lowered vector adds, which is good in case
follow-up optimizations are possible (the ops are not obfuscated), but also
bad because unrolling estimates the size in the wrong way.  The costs are

  *_3 1 times scalar_load costs 12 in prologue
  *_5 1 times scalar_load costs 12 in prologue
  _4 + _6 1 times scalar_stmt costs 4 in prologue
  _8 1 times scalar_store costs 12 in prologue

vs

  *_3 1 times unaligned_load (misalign -1) costs 12 in body
  *_5 1 times unaligned_load (misalign -1) costs 12 in body
  _4 + _6 1 times vector_stmt costs 4 in body
  _4 + _6 5 times scalar_stmt costs 20 in body
  _8 1 times unaligned_store (misalign -1) costs 12 in body

and as usual the savings from the wide loads and stores outweigh the loss on
the arithmetic side, but it's a close call:

  Vector inside of loop cost: 60
  Scalar iteration cost: 40

so 2*40 > 60 and the vectorization is deemed profitable.

The easiest fix is to avoid vectorizing; another possibility is to stick
with the vectorizer's decision and expand to the lowered sequence
immediately from within the vectorizer itself.

The original goal was to remove this hard cap on the number of elements, but
this bug shows that work was incomplete.  I'm going to reinstate the hard
cap and revisit this for GCC 14.