public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/107715] New: TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled
@ 2022-11-16 14:10 hubicka at gcc dot gnu.org
2022-11-16 15:08 ` [Bug tree-optimization/107715] " rguenth at gcc dot gnu.org
` (4 more replies)
0 siblings, 5 replies; 6+ messages in thread
From: hubicka at gcc dot gnu.org @ 2022-11-16 14:10 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107715
Bug ID: 107715
Summary: TSVC s161 for double runs at zen4 30 times slower when
vectorization is enabled
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
jh@alberti:~/tsvc/bin> more test.c
typedef double real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
int
main()
{
for (int nl = 0; nl < iterations/2; nl++) {
for (int i = 0; i < LEN_1D-1; ++i) {
if (b[i] < (real_t)0.) {
goto L20;
}
a[i] = c[i] + d[i] * e[i];
goto L10;
L20:
c[i+1] = a[i] + d[i] * d[i];
L10:
;
}
}
return 0;
}
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c
-fno-tree-vectorize
jh@alberti:~/tsvc/bin> time ./a.out
real 0m1.170s
user 0m1.170s
sys 0m0.000s
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c
jh@alberti:~/tsvc/bin> time ./a.out
real 0m37.269s
user 0m37.258s
sys 0m0.004s
It is not quite clear to me why this happens. It seems that all the time is
spent by movapd:
│ b0:┌─→vmovapd 0x6bc880(%rax),%zmm2
│ │ vmovapd 0x63f880(%rax),%zmm0
0.00 │ │ vcmpltpd %zmm1,%zmm2,%k1
│ │ vmovapd 0x6fb080(%rax),%zmm2
│ │ vfmadd132pd %zmm0,%zmm2,%zmm0
│ │ vmovapd 0x6bc880(%rax),%zmm2
│ │ vmovupd %zmm0,0x67e088(%rax){%k1}
99.94 │ │ vmovapd 0x63f880(%rax),%zmm0
│ │ add $0x40,%rax
│ │ vcmpgepd %zmm1,%zmm2,%k1
│ │ vmovapd 0x67e040(%rax),%zmm2
0.02 │ │ vfmadd132pd 0x601040(%rax),%zmm2,%zmm0
0.04 │ │ vmovapd 0x6fb040(%rax),%zmm2
0.00 │ │ vblendmpd %zmm0,%zmm2,%zmm0{%k1}
│ │ vmovapd %zmm0,0x6fb040(%rax)
│ │ cmp $0x3e7c0,%rax
│ └──jne b0
Since I do not initialize the array in the reduced testcase we always execute the
jump to L20.
Extending the testcase with array initialization:
typedef double real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
enum {SET1D_RECIP_IDX = -1, SET1D_RECIP_IDX_SQ = -2};
void set_1d_array(real_t * arr, int length, real_t value, int stride)
{
if (stride == SET1D_RECIP_IDX) {
for (int i = 0; i < length; i++) {
arr[i] = 1. / (real_t) (i+1);
}
} else if (stride == SET1D_RECIP_IDX_SQ) {
for (int i = 0; i < length; i++) {
arr[i] = 1. / (real_t) ((i+1) * (i+1));
}
} else {
for (int i = 0; i < length; i += stride) {
arr[i] = value;
}
}
}
int
main()
{
set_1d_array(a, LEN_1D, 1.,1);
set_1d_array(b, LEN_1D, 1.,1);
set_1d_array(c, LEN_1D, 1.,1);
set_1d_array(d, LEN_1D, 1.,1);
set_1d_array(e, LEN_1D, 1.,1);
for (int nl = 0; nl < iterations/2; nl++) {
for (int i = 0; i < LEN_1D-1; ++i) {
if (b[i] < (real_t)0.) {
goto L20;
}
a[i] = c[i] + d[i] * e[i];
goto L10;
L20:
c[i+1] = a[i] + d[i] * d[i];
L10:
;
}
}
return 0;
}
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c
-fno-tree-vectorize
jh@alberti:~/tsvc/bin> time ./a.out
real 0m0.910s
user 0m0.910s
sys 0m0.000s
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c
jh@alberti:~/tsvc/bin> time ./a.out
real 0m1.866s
user 0m1.866s
sys 0m0.000s
jh@alberti:~/tsvc/bin>
still gets about 2x regression for vectorization.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/107715] TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled
2022-11-16 14:10 [Bug tree-optimization/107715] New: TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled hubicka at gcc dot gnu.org
@ 2022-11-16 15:08 ` rguenth at gcc dot gnu.org
2022-11-16 15:28 ` hubicka at ucw dot cz
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: rguenth at gcc dot gnu.org @ 2022-11-16 15:08 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107715
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
Because store data races are allowed with -Ofast masked stores are not used so
we instead get
vect__ifc__80.24_114 = VEC_COND_EXPR <mask__58.15_104, vect__45.20_109,
vect__ifc__78.23_113>;
_ifc__80 = _58 ? _45 : _ifc__78;
MEM <vector(8) double> [(double *)vectp_c.25_116] = vect__ifc__80.24_114;
which somehow is later turned into masked stores? In fact we expand from
vect__43.18_107 = MEM <vector(8) double> [(double *)&a + ivtmp.75_134 * 1];
vect__ifc__78.23_113 = MEM <vector(8) double> [(double *)&c + 8B +
ivtmp.75_134 * 1];
_97 = .COND_FMA (mask__58.15_104, vect_pretmp_36.14_102,
vect_pretmp_36.14_102, vect__43.18_107, vect__ifc__78.23_113);
MEM <vector(8) double> [(double *)&c + 8B + ivtmp.75_134 * 1] = _97;
vect__38.29_121 = MEM <vector(8) double> [(double *)&c + ivtmp.75_134 * 1];
vect__39.32_124 = MEM <vector(8) double> [(double *)&e + ivtmp.75_134 * 1];
_98 = vect__35.11_99 >= { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
_100 = .COND_FMA (_98, vect_pretmp_36.14_102, vect__39.32_124,
vect__38.29_121, vect__43.18_107);
MEM <vector(8) double> [(double *)&a + ivtmp.75_134 * 1] = _100;
the vectorizer has optimize_mask_stores () which is supposed to replace
.MASK_STORE with
if (mask != { 0, 0, 0 ... })
<code depending on the mask store>
and thus optimize the mask == 0 case. But that only triggers for .MASK_STORE.
You can see this when you force .MASK_STORE via -O3 -ffast-math (without
-fallow-store-data-races) you get this effect:
.L2:
vcmppd $13, %zmm1, %zmm0, %k1
kortestb %k1, %k1
jne .L33
.L3:
addq $64, %rax
cmpq $255936, %rax
je .L34
.L4:
vmovapd b(%rax), %zmm0
vmovapd d(%rax), %zmm2
vcmppd $1, %zmm1, %zmm0, %k1
kortestb %k1, %k1
je .L2
vmovapd %zmm2, %zmm3
vfmadd213pd a(%rax), %zmm2, %zmm3
vmovupd %zmm3, c+8(%rax){%k1}
vcmppd $13, %zmm1, %zmm0, %k1
kortestb %k1, %k1
je .L3
.p2align 4
.p2align 3
.L33:
vmovapd c(%rax), %zmm3
vfmadd132pd e(%rax), %zmm3, %zmm2
vmovapd %zmm2, a(%rax){%k1}
addq $64, %rax
cmpq $255936, %rax
jne .L4
.L34:
kortestb %k3, %k3
jne .L35
maybe you can benchmark with that? Still it shouldn't be 40 times slower,
but maybe that's the cache effect of using 4 arrays instead of 3? 4
arrays need 1MB while 3 fit into 750kB? L1 is exactly 1MB so we might
run into aliasing issues there with the 4 arrays.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/107715] TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled
2022-11-16 14:10 [Bug tree-optimization/107715] New: TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled hubicka at gcc dot gnu.org
2022-11-16 15:08 ` [Bug tree-optimization/107715] " rguenth at gcc dot gnu.org
@ 2022-11-16 15:28 ` hubicka at ucw dot cz
2022-11-16 15:35 ` amonakov at gcc dot gnu.org
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: hubicka at ucw dot cz @ 2022-11-16 15:28 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107715
--- Comment #2 from Jan Hubicka <hubicka at ucw dot cz> ---
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107715
>
> --- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
> Because store data races are allowed with -Ofast masked stores are not used so
> we instead get
>
> vect__ifc__80.24_114 = VEC_COND_EXPR <mask__58.15_104, vect__45.20_109,
> vect__ifc__78.23_113>;
> _ifc__80 = _58 ? _45 : _ifc__78;
> MEM <vector(8) double> [(double *)vectp_c.25_116] = vect__ifc__80.24_114;
>
> which somehow is later turned into masked stores? In fact we expand from
>
> vect__43.18_107 = MEM <vector(8) double> [(double *)&a + ivtmp.75_134 * 1];
> vect__ifc__78.23_113 = MEM <vector(8) double> [(double *)&c + 8B +
> ivtmp.75_134 * 1];
> _97 = .COND_FMA (mask__58.15_104, vect_pretmp_36.14_102,
> vect_pretmp_36.14_102, vect__43.18_107, vect__ifc__78.23_113);
> MEM <vector(8) double> [(double *)&c + 8B + ivtmp.75_134 * 1] = _97;
> vect__38.29_121 = MEM <vector(8) double> [(double *)&c + ivtmp.75_134 * 1];
> vect__39.32_124 = MEM <vector(8) double> [(double *)&e + ivtmp.75_134 * 1];
> _98 = vect__35.11_99 >= { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
> _100 = .COND_FMA (_98, vect_pretmp_36.14_102, vect__39.32_124,
> vect__38.29_121, vect__43.18_107);
> MEM <vector(8) double> [(double *)&a + ivtmp.75_134 * 1] = _100;
>
> the vectorizer has optimize_mask_stores () which is supposed to replace
> .MASK_STORE with
>
> if (mask != { 0, 0, 0 ... })
> <code depending on the mask store>
>
> and thus optimize the mask == 0 case. But that only triggers for .MASK_STORE.
>
> You can see this when you force .MASK_STORE via -O3 -ffast-math (without
> -fallow-store-data-races) you get this effect:
Yep, -fno-allow-store-data-races fixes the problem
jh@alberti:~/tsvc/bin> /home/jh/trunk-install/bin/gcc test.c -Ofast
-march=native -lm
jh@alberti:~/tsvc/bin> perf stat ./a.out
Performance counter stats for './a.out':
37,289.50 msec task-clock:u # 1.000 CPUs utilized
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
431 page-faults:u # 11.558 /sec
137,411,365,539 cycles:u # 3.685 GHz
(83.33%)
991,673,172 stalled-cycles-frontend:u # 0.72% frontend cycles
idle (83.34%)
506,793 stalled-cycles-backend:u # 0.00% backend cycles
idle (83.34%)
3,400,375,204 instructions:u # 0.02 insn per cycle
# 0.29 stalled cycles per
insn (83.34%)
200,235,802 branches:u # 5.370 M/sec
(83.34%)
73,962 branch-misses:u # 0.04% of all branches
(83.33%)
37.305121352 seconds time elapsed
37.285467000 seconds user
0.000000000 seconds sys
jh@alberti:~/tsvc/bin> /home/jh/trunk-install/bin/gcc test.c -Ofast
-march=native -lm -fno-allow-store-data-races
jh@alberti:~/tsvc/bin> perf stat ./a.out
Performance counter stats for './a.out':
667.95 msec task-clock:u # 0.999 CPUs utilized
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
367 page-faults:u # 549.439 /sec
2,434,906,671 cycles:u # 3.645 GHz
(83.24%)
19,681 stalled-cycles-frontend:u # 0.00% frontend cycles
idle (83.24%)
12,495 stalled-cycles-backend:u # 0.00% backend cycles
idle (83.24%)
2,793,482,139 instructions:u # 1.15 insn per cycle
# 0.00 stalled cycles per
insn (83.24%)
598,879,536 branches:u # 896.588 M/sec
(83.78%)
50,649 branch-misses:u # 0.01% of all branches
(83.26%)
0.668807640 seconds time elapsed
0.668660000 seconds user
0.000000000 seconds sys
So I suppose it is L1 trashing. l1-dcache-loads goes up from
2,000,413,936 to 11,044,576,207
I suppose it would be too fancy for the vectorizer to work out the overall
memory consumption here :) It sort of should have all the info...
Honza
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/107715] TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled
2022-11-16 14:10 [Bug tree-optimization/107715] New: TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled hubicka at gcc dot gnu.org
2022-11-16 15:08 ` [Bug tree-optimization/107715] " rguenth at gcc dot gnu.org
2022-11-16 15:28 ` hubicka at ucw dot cz
@ 2022-11-16 15:35 ` amonakov at gcc dot gnu.org
2022-11-16 17:20 ` [Bug tree-optimization/107715] TSVC s161 and s277 " hubicka at gcc dot gnu.org
2022-11-21 10:02 ` marxin at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: amonakov at gcc dot gnu.org @ 2022-11-16 15:35 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107715
--- Comment #3 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
There's a forward dependency over 'c' (read of c[i] vs. write of c[i+1] with
'i' iterating forward), and the vectorized variant takes the hit on each
iteration. How is a slowdown even surprising?
For the non-vectorized variant you have at most 50% iterations waiting on the
previous, when 'b' has positive and negative elements in alternation, but the
generator doesn't elicit this worst case.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/107715] TSVC s161 and s277 for double runs at zen4 30 times slower when vectorization is enabled
2022-11-16 14:10 [Bug tree-optimization/107715] New: TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled hubicka at gcc dot gnu.org
` (2 preceding siblings ...)
2022-11-16 15:35 ` amonakov at gcc dot gnu.org
@ 2022-11-16 17:20 ` hubicka at gcc dot gnu.org
2022-11-21 10:02 ` marxin at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: hubicka at gcc dot gnu.org @ 2022-11-16 17:20 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107715
Jan Hubicka <hubicka at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Summary|TSVC s161 for double runs |TSVC s161 and s277 for
|at zen4 30 times slower |double runs at zen4 30
|when vectorization is |times slower when
|enabled |vectorization is enabled
--- Comment #4 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
This looks to be the same issue but also a regression wrt GCC 7.5.0
typedef double real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
real_t qq;
int
main(void)
{
for (int nl = 0; nl < iterations; nl++) {
for (int i = 0; i < LEN_1D-1; i++) {
if (a[i] >= (real_t)0.) {
goto L20;
}
if (b[i] >= (real_t)0.) {
goto L30;
}
a[i] += c[i] * d[i];
L30:
b[i+1] = c[i] + d[i] * e[i];
L20:
;
}
}
return 1;
}
jh@alberti:~/tsvc/bin> gcc -Ofast -march=native tt4.c ; time ./a.out
real 0m0.879s
user 0m0.879s
sys 0m0.000s
jh@alberti:~/tsvc/bin> /home/jh/trunk-install/bin/gcc -Ofast -march=native
tt4.c ; time ./a.out
real 0m1.742s
user 0m1.741s
sys 0m0.000s
jh@alberti:~/tsvc/bin> clang -Ofast -march=native tt4.c ; time ./a.out
real 0m0.879s
user 0m0.879s
sys 0m0.000s
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/107715] TSVC s161 and s277 for double runs at zen4 30 times slower when vectorization is enabled
2022-11-16 14:10 [Bug tree-optimization/107715] New: TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled hubicka at gcc dot gnu.org
` (3 preceding siblings ...)
2022-11-16 17:20 ` [Bug tree-optimization/107715] TSVC s161 and s277 " hubicka at gcc dot gnu.org
@ 2022-11-21 10:02 ` marxin at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: marxin at gcc dot gnu.org @ 2022-11-21 10:02 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107715
Martin Liška <marxin at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Last reconfirmed| |2022-11-21
Ever confirmed|0 |1
CC| |marxin at gcc dot gnu.org
Status|UNCONFIRMED |NEW
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2022-11-21 10:02 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-16 14:10 [Bug tree-optimization/107715] New: TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled hubicka at gcc dot gnu.org
2022-11-16 15:08 ` [Bug tree-optimization/107715] " rguenth at gcc dot gnu.org
2022-11-16 15:28 ` hubicka at ucw dot cz
2022-11-16 15:35 ` amonakov at gcc dot gnu.org
2022-11-16 17:20 ` [Bug tree-optimization/107715] TSVC s161 and s277 " hubicka at gcc dot gnu.org
2022-11-21 10:02 ` marxin at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).