From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
	id EF0E63947412; Wed, 16 Nov 2022 14:10:07 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org EF0E63947412
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1668607807;
	bh=L3DhYg4P7uKd9sGPMHerm04lTQ1B7Ep4t9RxLWimo0A=;
	h=From:To:Subject:Date:From;
	b=xWyrUPrXQwAOmArN2rYMr6lZPPqlE7G8xf5M/9GGXrdpkeuMt6361ioJnkxVWid6N
	 SAp/7on3ZucrO0hdn98H9GL2w77JqNas68LkHnPKb9BhL4hNPcERWcgxIEuAx9cTgU
	 M/qBnfP21u0nB8M/P+0/Oqg311RedoHemy/qTUy0=
From: "hubicka at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/107715] New: TSVC s161 for double runs at
 zen4 30 times slower when vectorization is enabled
Date: Wed, 16 Nov 2022 14:10:07 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: new
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: tree-optimization
X-Bugzilla-Version: 13.0
X-Bugzilla-Keywords: 
X-Bugzilla-Severity: normal
X-Bugzilla-Who: hubicka at gcc dot gnu.org
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status
 bug_severity priority component assigned_to reporter target_milestone
Message-ID: <bug-107715-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
List-Id: <gcc-bugs.sourceware.org>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D107715

            Bug ID: 107715
           Summary: TSVC s161 for double runs at zen4 30 times slower when
                    vectorization is enabled
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

jh@alberti:~/tsvc/bin> more test.c
typedef double real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
int
main()
{
    for (int nl =3D 0; nl < iterations/2; nl++) {
        for (int i =3D 0; i < LEN_1D-1; ++i) {
            if (b[i] < (real_t)0.) {
                goto L20;
            }
            a[i] =3D c[i] + d[i] * e[i];
            goto L10;
L20:
            c[i+1] =3D a[i] + d[i] * d[i];
L10:
            ;
        }
    }
    return 0;
}

jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=3Dnative test.c
-fno-tree-vectorize
jh@alberti:~/tsvc/bin> time ./a.out

real    0m1.170s
user    0m1.170s
sys     0m0.000s

jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=3Dnative test.=
c=20
jh@alberti:~/tsvc/bin> time ./a.out

real    0m37.269s
user    0m37.258s
sys     0m0.004s


It is not quite clear to me why this happens. It seems that all the time is
spent in vmovapd:
       =E2=94=82 b0:=E2=94=8C=E2=94=80=E2=86=92vmovapd     0x6bc880(%rax),%=
zmm2
       =E2=94=82    =E2=94=82  vmovapd     0x63f880(%rax),%zmm0
  0.00 =E2=94=82    =E2=94=82  vcmpltpd    %zmm1,%zmm2,%k1=20=20=20=20=20
       =E2=94=82    =E2=94=82  vmovapd     0x6fb080(%rax),%zmm2
       =E2=94=82    =E2=94=82  vfmadd132pd %zmm0,%zmm2,%zmm0=20=20=20
       =E2=94=82    =E2=94=82  vmovapd     0x6bc880(%rax),%zmm2
       =E2=94=82    =E2=94=82  vmovupd     %zmm0,0x67e088(%rax){%k1}
 99.94 =E2=94=82    =E2=94=82  vmovapd     0x63f880(%rax),%zmm0
       =E2=94=82    =E2=94=82  add         $0x40,%rax=20=20=20=20=20=20=20=
=20=20=20
       =E2=94=82    =E2=94=82  vcmpgepd    %zmm1,%zmm2,%k1=20=20=20=20=20
       =E2=94=82    =E2=94=82  vmovapd     0x67e040(%rax),%zmm2
  0.02 =E2=94=82    =E2=94=82  vfmadd132pd 0x601040(%rax),%zmm2,%zmm0
  0.04 =E2=94=82    =E2=94=82  vmovapd     0x6fb040(%rax),%zmm2
  0.00 =E2=94=82    =E2=94=82  vblendmpd   %zmm0,%zmm2,%zmm0{%k1}
       =E2=94=82    =E2=94=82  vmovapd     %zmm0,0x6fb040(%rax)=20=20
       =E2=94=82    =E2=94=82  cmp         $0x3e7c0,%rax=20=20=20=20=20=20=
=20=20=20
       =E2=94=82    =E2=94=94=E2=94=80=E2=94=80jne         b0=20=20=20

Since I do not initialize the arrays in the reduced testcase, we always
execute the jump to L20.

Extending the testcase with array initialization:
typedef double real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
enum {SET1D_RECIP_IDX =3D -1, SET1D_RECIP_IDX_SQ =3D -2};
void set_1d_array(real_t * arr, int length, real_t value, int stride)
{
    if (stride =3D=3D SET1D_RECIP_IDX) {
        for (int i =3D 0; i < length; i++) {
            arr[i] =3D 1. / (real_t) (i+1);
        }
    } else if (stride =3D=3D SET1D_RECIP_IDX_SQ) {
        for (int i =3D 0; i < length; i++) {
            arr[i] =3D 1. / (real_t) ((i+1) * (i+1));
        }
    } else {
        for (int i =3D 0; i < length; i +=3D stride) {
            arr[i] =3D value;
        }
    }
}
int
main()
{
    set_1d_array(a, LEN_1D, 1.,1);
    set_1d_array(b, LEN_1D, 1.,1);
    set_1d_array(c, LEN_1D, 1.,1);
    set_1d_array(d, LEN_1D, 1.,1);
    set_1d_array(e, LEN_1D, 1.,1);
    for (int nl =3D 0; nl < iterations/2; nl++) {
        for (int i =3D 0; i < LEN_1D-1; ++i) {
            if (b[i] < (real_t)0.) {
                goto L20;
            }
            a[i] =3D c[i] + d[i] * e[i];
            goto L10;
L20:
            c[i+1] =3D a[i] + d[i] * d[i];
L10:
            ;
        }
    }
    return 0;
}
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=3Dnative test.=
c=20
-fno-tree-vectorize
jh@alberti:~/tsvc/bin> time ./a.out

real    0m0.910s
user    0m0.910s
sys     0m0.000s
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=3Dnative test.=
c=20=20
jh@alberti:~/tsvc/bin> time ./a.out

real    0m1.866s
user    0m1.866s
sys     0m0.000s
jh@alberti:~/tsvc/bin>=20

still gets about 2x regression for vectorization.=