From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id 4ABD6395C014; Wed, 16 Nov 2022 17:30:10 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 4ABD6395C014 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1668619810; bh=gb+bzrgQQ0zin+vk/GbEohlNV2V68Zg7+WzAgJ5vDTE=; h=From:To:Subject:Date:From; b=uqB7mjup/sXZoIeTm2d5gdUi3kauKAMTp+6NqDycpBH5R9V9MYGfskFzdeqbKmRy0 f+v04XVLMgK7bMIZCMufzr+0JZ8+SMBDLy9T3ugjp0STqDxmeUxYxFN2A3XEM7Ayho Tz50QFb/F1RhwWR/of0PypQjp2C6lvO9S1uxPqlY= From: "hubicka at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug middle-end/107719] New: 14% regression on TSVC s3113 on znve4 compared to GCC 7.5 Date: Wed, 16 Nov 2022 17:30:09 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: middle-end X-Bugzilla-Version: 13.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: hubicka at gcc dot gnu.org X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D107719 Bug ID: 107719 Summary: 14% regression on TSVC s3113 on znve4 compared to GCC 7.5 Product: gcc Version: 13.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: hubicka at gcc dot gnu.org Target Milestone: --- jh@alberti:~/tsvc/bin> cat tt5.c #include typedef double real_t; #define iterations 100000 #define LEN_1D 32000 #define 
LEN_2D 256 real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D]; real_t qq; int main(void) { // reductions // maximum of absolute value real_t max; for (int nl =3D 0; nl < iterations*4; nl++) { max =3D fabs(a[0]); for (int i =3D 0; i < LEN_1D; i++) { if ((fabs(a[i])) > max) { max =3D fabs(a[i]); } } qq +=3D max; } return max; } jh@alberti:~/tsvc/bin> /home/jh/trunk-install/bin/gcc -Ofast -march=3Dnative tt5.c ; perf stat ./a.out Performance counter stats for './a.out': 913.92 msec task-clock:u # 0.999 CPUs utilized= =20=20=20=20=20=20 0 context-switches:u # 0.000 /sec=20=20=20= =20=20=20=20=20=20=20=20=20=20=20=20 0 cpu-migrations:u # 0.000 /sec=20=20=20= =20=20=20=20=20=20=20=20=20=20=20=20 108 page-faults:u # 118.172 /sec=20=20=20= =20=20=20=20=20=20=20=20=20=20=20=20 3,342,731,634 cycles:u # 3.658 GHz=20=20=20= =20=20=20=20=20=20=20=20=20=20=20=20 (83.37%) 15,353 stalled-cycles-frontend:u # 0.00% frontend cycles idle (83.37%) 12,484 stalled-cycles-backend:u # 0.00% backend cycles idle (83.38%) 7,989,930,772 instructions:u # 2.39 insn per cycle= =20=20=20=20=20 # 0.00 stalled cycles= per insn (83.37%) 1,597,552,117 branches:u # 1.748 G/sec=20=20=20= =20=20=20=20=20=20=20=20=20=20 (83.37%) 401,094 branch-misses:u # 0.03% of all branche= s=20=20=20 (83.13%) 0.914933333 seconds time elapsed 0.914630000 seconds user 0.000000000 seconds sys jh@alberti:~/tsvc/bin> gcc -Ofast -march=3Dnative tt5.c ; perf stat ./a.out Performance counter stats for './a.out': 880.97 msec task-clock:u # 0.999 CPUs utilized= =20=20=20=20=20=20 0 context-switches:u # 0.000 /sec=20=20=20= =20=20=20=20=20=20=20=20=20=20=20=20 0 cpu-migrations:u # 0.000 /sec=20=20=20= =20=20=20=20=20=20=20=20=20=20=20=20 110 page-faults:u # 124.862 /sec=20=20=20= =20=20=20=20=20=20=20=20=20=20=20=20 3,218,698,288 cycles:u # 3.654 GHz=20=20=20= =20=20=20=20=20=20=20=20=20=20=20=20 (83.21%) 11,566 stalled-cycles-frontend:u # 0.00% frontend cycles idle (83.21%) 12,185 stalled-cycles-backend:u # 0.00% backend 
cycles idle (83.21%) 7,989,544,164 instructions:u # 2.48 insn per cycle= =20=20=20=20=20 # 0.00 stalled cycles= per insn (83.48%) 1,597,229,244 branches:u # 1.813 G/sec=20=20=20= =20=20=20=20=20=20=20=20=20=20 (83.66%) 401,157 branch-misses:u # 0.03% of all branche= s=20=20=20 (83.23%) 0.881919601 seconds time elapsed 0.881627000 seconds user 0.000000000 seconds sys The difference is outside the noise. GCC 7.5 generates: main: .LFB0: .cfi_startproc vmovsd a(%rip), %xmm4 vmovsd qq(%rip), %xmm3 movl $400000, %ecx movl $a+256000, %edx vmovapd .LC1(%rip), %zmm2 vandps .LC0(%rip), %xmm4, %xmm4 vbroadcastsd %xmm4, %zmm4 .p2align 4,,15 .L3: movl $a, %eax vmovapd %zmm4, %zmm0 .p2align 4,,15 .L2: vandpd (%rax), %zmm2, %zmm1 addq $64, %rax vmaxpd %zmm1, %zmm0, %zmm0 cmpq %rax, %rdx jne .L2 vshufi32x4 $78, %zmm0, %zmm0, %zmm1 decl %ecx vmaxpd %zmm0, %zmm1, %zmm0 vshufi32x4 $77, %zmm0, %zmm0, %zmm1 vmaxpd %zmm0, %zmm1, %zmm1 vpshufd $254, %zmm1, %zmm0 vmaxpd %zmm1, %zmm0, %zmm0 vaddsd %xmm0, %xmm3, %xmm3 jne .L3 vmovsd %xmm3, qq(%rip) vcvttsd2si %xmm0, %eax vzeroupper ret .cfi_endproc while trunk generates: main: .LFB0: .cfi_startproc vmovsd a(%rip), %xmm4 vmovsd qq(%rip), %xmm3 movl $400000, %ecx movl $a+256000, %edx vandpd .LC0(%rip), %xmm4, %xmm4 vbroadcastsd .LC2(%rip), %zmm2 vbroadcastsd %xmm4, %zmm4 .p2align 4 .p2align 3 .L3: vmovapd %zmm4, %zmm0 movl $a, %eax .p2align 4 .p2align 3 .L2: vandpd (%rax), %zmm2, %zmm1 addq $64, %rax vmaxpd %zmm1, %zmm0, %zmm0 cmpq %rax, %rdx jne .L2 vextractf64x4 $0x1, %zmm0, %ymm1 decl %ecx vmaxpd %ymm0, %ymm1, %ymm0 vextractf64x2 $0x1, %ymm0, %xmm1 vmaxpd %xmm0, %xmm1, %xmm1 vunpckhpd %xmm1, %xmm1, %xmm0 vmaxpd %xmm1, %xmm0, %xmm0 vaddsd %xmm0, %xmm3, %xmm3 jne .L3 vmovsd %xmm3, qq(%rip) vcvttsd2sil %xmm0, %eax vzeroupper ret .cfi_endproc So there is no difference in the inner loop (.L2); the diff between the two listings is: @@ -11,67 +11,82 @@ vmovsd qq(%rip), %xmm3 movl $400000, %ecx movl $a+256000, %edx - vmovapd .LC1(%rip), %zmm2 - vandps .LC0(%rip), %xmm4, %xmm4 + vandpd .LC0(%rip), %xmm4, %xmm4 + vbroadcastsd
.LC2(%rip), %zmm2 vbroadcastsd %xmm4, %zmm4 - .p2align 4,,15 + .p2align 4 + .p2align 3 .L3: - movl $a, %eax vmovapd %zmm4, %zmm0 - .p2align 4,,15 + movl $a, %eax + .p2align 4 + .p2align 3 .L2: vandpd (%rax), %zmm2, %zmm1 addq $64, %rax vmaxpd %zmm1, %zmm0, %zmm0 cmpq %rax, %rdx jne .L2 - vshufi32x4 $78, %zmm0, %zmm0, %zmm1 + vextractf64x4 $0x1, %zmm0, %ymm1 decl %ecx - vmaxpd %zmm0, %zmm1, %zmm0 - vshufi32x4 $77, %zmm0, %zmm0, %zmm1 - vmaxpd %zmm0, %zmm1, %zmm1 - vpshufd $254, %zmm1, %zmm0 - vmaxpd %zmm1, %zmm0, %zmm0 + vmaxpd %ymm0, %ymm1, %ymm0 + vextractf64x2 $0x1, %ymm0, %xmm1 + vmaxpd %xmm0, %xmm1, %xmm1 + vunpckhpd %xmm1, %xmm1, %xmm0 + vmaxpd %xmm1, %xmm0, %xmm0 vaddsd %xmm0, %xmm3, %xmm3 jne .L3 vmovsd %xmm3, qq(%rip) - vcvttsd2si %xmm0, %eax + vcvttsd2sil %xmm0, %eax=