From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
	id 9CF103858D39; Wed,  8 Feb 2023 19:17:27 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 9CF103858D39
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1675883847;
	bh=02voY2S2Wf80PWtzGozq6CvIZyrbk4aAaIKqfi1ZfL0=;
	h=From:To:Subject:Date:From;
	b=hEIg/MA//a7H40h7cLRFt9xSbe+k4C9nNMdcTaH0Q3fji9XmTBd2SNUBVIY5Q6wbu
	 2aeCfr5cj6+rHvZMDf7779gxfx/TYup07R+dGVdoBOWJNWw+MLXC+VNlHjIiwDudLj
	 pQHDVEmX1fIMf5uiVrquFh6kdPSj1rO6+QtPcoo4=
From: "gbs at canishe dot com" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/108724] New: [11 regression] Poor codegen when summing
 two arrays without AVX or SSE
Date: Wed, 08 Feb 2023 19:17:27 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: new
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: target
X-Bugzilla-Version: 13.0
X-Bugzilla-Keywords: 
X-Bugzilla-Severity: normal
X-Bugzilla-Who: gbs at canishe dot com
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status
 bug_severity priority component assigned_to reporter target_milestone
Message-ID: <bug-108724-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
List-Id: <gcc-bugs.sourceware.org>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D108724

            Bug ID: 108724
           Summary: [11 regression] Poor codegen when summing two arrays
                    without AVX or SSE
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: gbs at canishe dot com
  Target Milestone: ---

This program:

void foo(int *a, const int *__restrict b, const int *__restrict c)
{
  for (int i =3D 0; i < 16; i++) {
    a[i] =3D b[i] + c[i];
  }
}


when compiled for x86-64 by GCC 11.1+ with -O3 -mno-avx -mno-sse, produces:

foo:
        movq    %rdx, %rax
        subq    $8, %rsp
        movl    (%rsi), %edx
        movq    %rsi, %rcx
        addl    (%rax), %edx
        movl    4(%rax), %esi
        movq    $0, (%rsp)
        movl    %edx, (%rsp)
        movq    (%rsp), %rdx
        addl    4(%rcx), %esi
        movq    %rdx, -8(%rsp)
        movl    %esi, -4(%rsp)
        movq    -8(%rsp), %rdx
        movq    %rdx, (%rdi)
        movl    8(%rax), %edx
        addl    8(%rcx), %edx
        movq    $0, -16(%rsp)
        movl    %edx, -16(%rsp)
        movq    -16(%rsp), %rdx
        movl    12(%rcx), %esi
        addl    12(%rax), %esi
        movq    %rdx, -24(%rsp)
        movl    %esi, -20(%rsp)
        movq    -24(%rsp), %rdx
        movq    %rdx, 8(%rdi)
        [snip more of the same]
        movl    48(%rcx), %edx
        movq    $0, -96(%rsp)
        addl    48(%rax), %edx
        movl    %edx, -96(%rsp)
        movq    -96(%rsp), %rdx
        movl    52(%rcx), %esi
        addl    52(%rax), %esi
        movq    %rdx, -104(%rsp)
        movl    %esi, -100(%rsp)
        movq    -104(%rsp), %rdx
        movq    %rdx, 48(%rdi)
        movl    56(%rcx), %edx
        movq    $0, -112(%rsp)
        addl    56(%rax), %edx
        movl    %edx, -112(%rsp)
        movq    -112(%rsp), %rdx
        movl    60(%rcx), %ecx
        addl    60(%rax), %ecx
        movq    %rdx, -120(%rsp)
        movl    %ecx, -116(%rsp)
        movq    -120(%rsp), %rdx
        movq    %rdx, 56(%rdi)
        addq    $8, %rsp
        ret

(Godbolt link: https://godbolt.org/z/qq9dbP8ed)

This is bizarre - instead of keeping intermediate results in registers or
writing them directly to *a, it stores them on the stack, which is bound
to be slow. (GCC 10.4, and Clang, produce more or less what I would expect,
using only the provided arrays and a register.) I haven't done any
benchmarking myself, but Jonathan Wakely's results (on the gcc-help list:
https://gcc.gnu.org/pipermail/gcc-help/2023-February/142181.html) seem to
bear this out.

>From a bisect, this behavior seems to have been introduced by commit
33c0f246f799b7403171e97f31276a8feddd05c9 (tree-optimization/97626 - handle =
SCCs
properly in SLP stmt analysis) from Oct 2020, and persists into GCC trunk.=