public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/107719] New: 14% regression on TSVC s3113 on znve4 compared to GCC 7.5
@ 2022-11-16 17:30 hubicka at gcc dot gnu.org
2022-11-16 17:42 ` [Bug middle-end/107719] " amonakov at gcc dot gnu.org
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: hubicka at gcc dot gnu.org @ 2022-11-16 17:30 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107719
Bug ID: 107719
Summary: 14% regression on TSVC s3113 on znve4 compared to GCC
7.5
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
jh@alberti:~/tsvc/bin> cat tt5.c
#include <math.h>
typedef double real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
real_t qq;
int
main(void)
{
// reductions
// maximum of absolute value
real_t max;
for (int nl = 0; nl < iterations*4; nl++) {
max = fabs(a[0]);
for (int i = 0; i < LEN_1D; i++) {
if ((fabs(a[i])) > max) {
max = fabs(a[i]);
}
}
qq += max;
}
return max;
}
jh@alberti:~/tsvc/bin> /home/jh/trunk-install/bin/gcc -Ofast -march=native tt5.c ; perf stat ./a.out
Performance counter stats for './a.out':
913.92 msec task-clock:u # 0.999 CPUs utilized
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
108 page-faults:u # 118.172 /sec
3,342,731,634 cycles:u # 3.658 GHz
(83.37%)
15,353 stalled-cycles-frontend:u # 0.00% frontend cycles
idle (83.37%)
12,484 stalled-cycles-backend:u # 0.00% backend cycles
idle (83.38%)
7,989,930,772 instructions:u # 2.39 insn per cycle
# 0.00 stalled cycles per
insn (83.37%)
1,597,552,117 branches:u # 1.748 G/sec
(83.37%)
401,094 branch-misses:u # 0.03% of all branches
(83.13%)
0.914933333 seconds time elapsed
0.914630000 seconds user
0.000000000 seconds sys
jh@alberti:~/tsvc/bin> gcc -Ofast -march=native tt5.c ; perf stat ./a.out
Performance counter stats for './a.out':
880.97 msec task-clock:u # 0.999 CPUs utilized
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
110 page-faults:u # 124.862 /sec
3,218,698,288 cycles:u # 3.654 GHz
(83.21%)
11,566 stalled-cycles-frontend:u # 0.00% frontend cycles
idle (83.21%)
12,185 stalled-cycles-backend:u # 0.00% backend cycles
idle (83.21%)
7,989,544,164 instructions:u # 2.48 insn per cycle
# 0.00 stalled cycles per
insn (83.48%)
1,597,229,244 branches:u # 1.813 G/sec
(83.66%)
401,157 branch-misses:u # 0.03% of all branches
(83.23%)
0.881919601 seconds time elapsed
0.881627000 seconds user
0.000000000 seconds sys
The difference is outside the noise. GCC 7.5 generates:
main:
.LFB0:
.cfi_startproc
vmovsd a(%rip), %xmm4
vmovsd qq(%rip), %xmm3
movl $400000, %ecx
movl $a+256000, %edx
vmovapd .LC1(%rip), %zmm2
vandps .LC0(%rip), %xmm4, %xmm4
vbroadcastsd %xmm4, %zmm4
.p2align 4,,15
.L3:
movl $a, %eax
vmovapd %zmm4, %zmm0
.p2align 4,,15
.L2:
vandpd (%rax), %zmm2, %zmm1
addq $64, %rax
vmaxpd %zmm1, %zmm0, %zmm0
cmpq %rax, %rdx
jne .L2
vshufi32x4 $78, %zmm0, %zmm0, %zmm1
decl %ecx
vmaxpd %zmm0, %zmm1, %zmm0
vshufi32x4 $77, %zmm0, %zmm0, %zmm1
vmaxpd %zmm0, %zmm1, %zmm1
vpshufd $254, %zmm1, %zmm0
vmaxpd %zmm1, %zmm0, %zmm0
vaddsd %xmm0, %xmm3, %xmm3
jne .L3
vmovsd %xmm3, qq(%rip)
vcvttsd2si %xmm0, %eax
vzeroupper
ret
.cfi_endproc
while trunk generates:
main:
.LFB0:
.cfi_startproc
vmovsd a(%rip), %xmm4
vmovsd qq(%rip), %xmm3
movl $400000, %ecx
movl $a+256000, %edx
vandpd .LC0(%rip), %xmm4, %xmm4
vbroadcastsd .LC2(%rip), %zmm2
vbroadcastsd %xmm4, %zmm4
.p2align 4
.p2align 3
.L3:
vmovapd %zmm4, %zmm0
movl $a, %eax
.p2align 4
.p2align 3
.L2:
vandpd (%rax), %zmm2, %zmm1
addq $64, %rax
vmaxpd %zmm1, %zmm0, %zmm0
cmpq %rax, %rdx
jne .L2
vextractf64x4 $0x1, %zmm0, %ymm1
decl %ecx
vmaxpd %ymm0, %ymm1, %ymm0
vextractf64x2 $0x1, %ymm0, %xmm1
vmaxpd %xmm0, %xmm1, %xmm1
vunpckhpd %xmm1, %xmm1, %xmm0
vmaxpd %xmm1, %xmm0, %xmm0
vaddsd %xmm0, %xmm3, %xmm3
jne .L3
vmovsd %xmm3, qq(%rip)
vcvttsd2sil %xmm0, %eax
vzeroupper
ret
.cfi_endproc
So there is no difference in the inner loop:
@@ -11,67 +11,82 @@
vmovsd qq(%rip), %xmm3
movl $400000, %ecx
movl $a+256000, %edx
- vmovapd .LC1(%rip), %zmm2
- vandps .LC0(%rip), %xmm4, %xmm4
+ vandpd .LC0(%rip), %xmm4, %xmm4
+ vbroadcastsd .LC2(%rip), %zmm2
vbroadcastsd %xmm4, %zmm4
- .p2align 4,,15
+ .p2align 4
+ .p2align 3
.L3:
- movl $a, %eax
vmovapd %zmm4, %zmm0
- .p2align 4,,15
+ movl $a, %eax
+ .p2align 4
+ .p2align 3
.L2:
vandpd (%rax), %zmm2, %zmm1
addq $64, %rax
vmaxpd %zmm1, %zmm0, %zmm0
cmpq %rax, %rdx
jne .L2
- vshufi32x4 $78, %zmm0, %zmm0, %zmm1
+ vextractf64x4 $0x1, %zmm0, %ymm1
decl %ecx
- vmaxpd %zmm0, %zmm1, %zmm0
- vshufi32x4 $77, %zmm0, %zmm0, %zmm1
- vmaxpd %zmm0, %zmm1, %zmm1
- vpshufd $254, %zmm1, %zmm0
- vmaxpd %zmm1, %zmm0, %zmm0
+ vmaxpd %ymm0, %ymm1, %ymm0
+ vextractf64x2 $0x1, %ymm0, %xmm1
+ vmaxpd %xmm0, %xmm1, %xmm1
+ vunpckhpd %xmm1, %xmm1, %xmm0
+ vmaxpd %xmm1, %xmm0, %xmm0
vaddsd %xmm0, %xmm3, %xmm3
jne .L3
vmovsd %xmm3, qq(%rip)
- vcvttsd2si %xmm0, %eax
+ vcvttsd2sil %xmm0, %eax
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug middle-end/107719] 14% regression on TSVC s3113 on znve4 compared to GCC 7.5
2022-11-16 17:30 [Bug middle-end/107719] New: 14% regression on TSVC s3113 on znve4 compared to GCC 7.5 hubicka at gcc dot gnu.org
@ 2022-11-16 17:42 ` amonakov at gcc dot gnu.org
2023-01-13 10:22 ` rguenth at gcc dot gnu.org
2023-07-19 11:12 ` rguenth at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: amonakov at gcc dot gnu.org @ 2022-11-16 17:42 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107719
Alexander Monakov <amonakov at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |amonakov at gcc dot gnu.org
--- Comment #1 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
As you say, the inner loop is the same, and it iterates 32000 times. Most
likely it crosses an instruction fetch boundary differently; try
-falign-loops=32.
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug middle-end/107719] 14% regression on TSVC s3113 on znve4 compared to GCC 7.5
2022-11-16 17:30 [Bug middle-end/107719] New: 14% regression on TSVC s3113 on znve4 compared to GCC 7.5 hubicka at gcc dot gnu.org
2022-11-16 17:42 ` [Bug middle-end/107719] " amonakov at gcc dot gnu.org
@ 2023-01-13 10:22 ` rguenth at gcc dot gnu.org
2023-07-19 11:12 ` rguenth at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-01-13 10:22 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107719
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Target| |x86_64-*-*
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
yep, probably not "real"
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug middle-end/107719] 14% regression on TSVC s3113 on znve4 compared to GCC 7.5
2022-11-16 17:30 [Bug middle-end/107719] New: 14% regression on TSVC s3113 on znve4 compared to GCC 7.5 hubicka at gcc dot gnu.org
2022-11-16 17:42 ` [Bug middle-end/107719] " amonakov at gcc dot gnu.org
2023-01-13 10:22 ` rguenth at gcc dot gnu.org
@ 2023-07-19 11:12 ` rguenth at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-07-19 11:12 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107719
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Resolution|--- |INVALID
Status|UNCONFIRMED |RESOLVED
--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
I can't reproduce the difference in runtime. I also think you're running into
a misaligned inner loop here.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2023-07-19 11:12 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-16 17:30 [Bug middle-end/107719] New: 14% regression on TSVC s3113 on znve4 compared to GCC 7.5 hubicka at gcc dot gnu.org
2022-11-16 17:42 ` [Bug middle-end/107719] " amonakov at gcc dot gnu.org
2023-01-13 10:22 ` rguenth at gcc dot gnu.org
2023-07-19 11:12 ` rguenth at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).