From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
 id 0A4FD3833005; Wed, 18 Nov 2020 12:08:18 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 0A4FD3833005
From: "andysem at mail dot ru" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/97891] New: [x86] Consider using registers on large
 initializations
Date: Wed, 18 Nov 2020 12:08:17 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: new
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: target
X-Bugzilla-Version: 11.0
X-Bugzilla-Keywords: 
X-Bugzilla-Severity: normal
X-Bugzilla-Who: andysem at mail dot ru
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status
 bug_severity priority component assigned_to reporter target_milestone
Message-ID: <bug-97891-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
X-BeenThere: gcc-bugs@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-bugs mailing list <gcc-bugs.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-bugs>,
 <mailto:gcc-bugs-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-bugs/>
List-Post: <mailto:gcc-bugs@gcc.gnu.org>
List-Help: <mailto:gcc-bugs-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-bugs>,
 <mailto:gcc-bugs-request@gcc.gnu.org?subject=subscribe>
X-List-Received-Date: Wed, 18 Nov 2020 12:08:18 -0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D97891

            Bug ID: 97891
           Summary: [x86] Consider using registers on large
                    initializations
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: andysem at mail dot ru
  Target Milestone: ---

Consider the following example code:

struct A
{
    long a;
    short b;
    int c;
    char d;
    long x;
    bool y;
    int z;
    char* p;

    A() :
        a(0), b(0), c(0), d(0), x(0), y(false), z(0), p(0)
    {}
};

void test(A* p, unsigned int count)
{
    for (unsigned int i =3D 0; i < count; ++i)
    {
        p[i] =3D A();
    }
}

When compiled with "-O3 -march=3Dnehalem" the generated code is:

test(A*, unsigned int):
        testl   %esi, %esi
        je      .L1
        leal    -1(%rsi), %eax
        leaq    (%rax,%rax,2), %rax
        salq    $4, %rax
        leaq    48(%rdi,%rax), %rax
.L3:
        xorl    %edx, %edx
        movq    $0, (%rdi)
        addq    $48, %rdi
        movw    %dx, -40(%rdi)
        movl    $0, -36(%rdi)
        movb    $0, -32(%rdi)
        movq    $0, -24(%rdi)
        movb    $0, -16(%rdi)
        movl    $0, -12(%rdi)
        movq    $0, -8(%rdi)
        cmpq    %rax, %rdi
        jne     .L3
.L1:
        ret

https://gcc.godbolt.org/z/TrfWYr

Here, the main loop body between .L3 and .L1 is 60 bytes long, with a
significant amount of space wasted on the $0 constants encoded in the mov
instructions. It would be more efficient to use a single zero register in
all member initializations, especially given that %edx is already used
that way.

A loop rewritten like this:

    for (unsigned int i =3D 0; i < count; ++i)
    {
        __asm__
        (
            "movq    %q1, (%0)\n\t"
            "movw    %w1, 8(%0)\n\t"
            "movl    %1, 12(%0)\n\t"
            "movb    %b1, 16(%0)\n\t"
            "movq    %q1, 24(%0)\n\t"
            "movb    %b1, 32(%0)\n\t"
            "movl    %1, 36(%0)\n\t"
            "movq    %q1, 40(%0)\n\t"
            : : "r" (p + i), "q" (0)
        );
    }

compiles to:

test(A*, unsigned int):
        testl   %esi, %esi
        je      .L1
        leal    -1(%rsi), %eax
        leaq    (%rax,%rax,2), %rax
        salq    $4, %rax
        leaq    48(%rdi,%rax), %rdx
        xorl    %eax, %eax
.L3:
        movq    %rax, (%rdi)
        movw    %ax, 8(%rdi)
        movl    %eax, 12(%rdi)
        movb    %al, 16(%rdi)
        movq    %rax, 24(%rdi)
        movb    %al, 32(%rdi)
        movl    %eax, 36(%rdi)
        movq    %rax, 40(%rdi)

        addq    $48, %rdi
        cmpq    %rdx, %rdi
        jne     .L3
.L1:
        ret

Here, the loop between .L3 and .L1 only takes 34 bytes, which is nearly half
the original size.

Constant (for example, zero) initialization is a frequently used pattern
for initializing structures, so sequences like the above are quite
widespread.
Converting cases like this to the use of registers could save some code size
and reduce cache pressure.=