From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
	id 5EE5E3858CDB; Thu, 18 May 2023 09:36:01 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 5EE5E3858CDB
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1684402561;
	bh=j1qtam9bAlwEg3DdWWEDBPe7cXHHpQ9hhqShm36ORWg=;
	h=From:To:Subject:Date:In-Reply-To:References:From;
	b=p5dd8lLV9PNDnWQw1QEF45MSBm0DGVxeEBoBrLhAQgy8aPmRZWfSYBO9sffYSh2O4
	 /RfZGa/yYcC1Z5X8LwSdGaF8ta4SkMOm0fkBccGo78XHShVZaE+wIPuQLdny6Xq29g
	 jNFR7EtcbEIvFWkdTY6nFZ3/8jwg7J7kjGsXXCGA=
From: "hubicka at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug middle-end/109849] suboptimal code for vector walking loop
Date: Thu, 18 May 2023 09:35:59 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: middle-end
X-Bugzilla-Version: 13.0
X-Bugzilla-Keywords: missed-optimization
X-Bugzilla-Severity: normal
X-Bugzilla-Who: hubicka at gcc dot gnu.org
X-Bugzilla-Status: NEW
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: 
Message-ID: <bug-109849-4-IwiLGRRWq3@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-109849-4@http.gcc.gnu.org/bugzilla/>
References: <bug-109849-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
List-Id: <gcc-bugs.sourceware.org>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D109849
--- Comment #8 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
We can only SRA if the address is non-escaping.  Clang does not seem to nee=
d it
to optimize better:

jan@localhost:~> cat t.c
extern void q(int *);
__attribute__ ((noinline))
void
test()
{
        for (int a =3D 0; a < 1000;a++)
                if (!(a%100))
                        q(&a);
}
int
main()
{
        for (int a =3D 0; a < 1000000;a++)
                test ();
}
jan@localhost:~> cat t2.c
void q(int *a)
{
}
jan@localhost:~> gcc -O2 t.c t2.c ; perf stat ./a.out

 Performance counter stats for './a.out':

          2,916.73 msec task-clock:u                     #    0.999 CPUs
utilized=20=20=20=20=20=20=20=20=20=20=20=20=20
                 0      context-switches:u               #    0.000 /sec=20=
=20=20=20=20=20=20=20
                 0      cpu-migrations:u                 #    0.000 /sec=20=
=20=20=20=20=20=20=20
                52      page-faults:u                    #   17.828 /sec=20=
=20=20=20=20=20=20=20
     8,344,719,833      cycles:u                         #    2.861 GHz=20=
=20=20=20=20=20=20=20=20
        13,561,375      stalled-cycles-frontend:u        #    0.16% frontend
cycles idle=20=20=20=20=20=20
     5,128,112,757      stalled-cycles-backend:u         #   61.45% backend
cycles idle=20=20=20=20=20=20=20
    10,050,172,242      instructions:u                   #    1.20  insn per
cycle=20=20=20=20=20=20=20=20=20=20=20=20
                                                  #    0.51  stalled cycles=
 per
insn=20=20=20
     2,034,043,082      branches:u                       #  697.370 M/sec=
=20=20=20=20=20=20=20
        11,186,312      branch-misses:u                  #    0.55% of all
branches=20=20=20=20=20=20=20=20=20=20=20

       2.918344737 seconds time elapsed

       2.917844000 seconds user
       0.000000000 seconds sys


jan@localhost:~> clang -O2 t.c t2.c ; perf stat ./a.out

 Performance counter stats for './a.out':

            664.40 msec task-clock:u                     #    0.999 CPUs
utilized=20=20=20=20=20=20=20=20=20=20=20=20=20
                 0      context-switches:u               #    0.000 /sec=20=
=20=20=20=20=20=20=20
                 0      cpu-migrations:u                 #    0.000 /sec=20=
=20=20=20=20=20=20=20
                54      page-faults:u                    #   81.276 /sec=20=
=20=20=20=20=20=20=20
     2,318,095,848      cycles:u                         #    3.489 GHz=20=
=20=20=20=20=20=20=20=20
        10,417,694      stalled-cycles-frontend:u        #    0.45% frontend
cycles idle=20=20=20=20=20=20
     1,057,731,301      stalled-cycles-backend:u         #   45.63% backend
cycles idle=20=20=20=20=20=20=20
    10,062,172,840      instructions:u                   #    4.34  insn per
cycle=20=20=20=20=20=20=20=20=20=20=20=20
                                                  #    0.11  stalled cycles=
 per
insn=20=20=20
     2,034,042,724      branches:u                       #    3.061 G/sec=
=20=20=20=20=20=20=20
        10,003,620      branch-misses:u                  #    0.49% of all
branches=20=20=20=20=20=20=20=20=20=20=20

       0.665267996 seconds time elapsed

       0.665247000 seconds user
       0.000000000 seconds sys


We do:

        jmp     .L3
        .p2align 4,,10
        .p2align 3
.L2:
        movl    12(%rsp), %eax
        addl    $1, %eax
        movl    %eax, 12(%rsp)
        cmpl    $999, %eax
        jg      .L7
.L3:
        imull   $-1030792151, %eax, %eax
        addl    $85899344, %eax
        rorl    $2, %eax
        cmpl    $42949672, %eax
        ja      .L2
        leaq    12(%rsp), %rdi
        call    q
        jmp     .L2

Which has a stupid store-to-load dependency in the inner loop. Clang keeps =
the
store but optimizes away the load:

        jmp     .LBB0_1
        .p2align        4, 0x90
.LBB0_3:                                #   in Loop: Header=3DBB0_1 Depth=
=3D1
        leal    1(%rax), %ecx
        movl    %ecx, 12(%rsp)
        cmpl    $999, %eax                      # imm =3D 0x3E7
        movl    %ecx, %eax
        jge     .LBB0_4
.LBB0_1:                                # =3D>This Inner Loop Header: Depth=
=3D1
        imull   $-1030792151, %eax, %ecx        # imm =3D 0xC28F5C29
        addl    $85899344, %ecx                 # imm =3D 0x51EB850
        rorl    $2, %ecx
        cmpl    $42949672, %ecx                 # imm =3D 0x28F5C28
        ja      .LBB0_3
# %bb.2:                                #   in Loop: Header=3DBB0_1 Depth=
=3D1
        movq    %rbx, %rdi
        callq   q@PLT
        movl    12(%rsp), %eax
        jmp     .LBB0_3

I wonder what makes clang think it needs @PLT, though.
Why do we not consider the load as partially redundant with itself?=