public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/108376] New: TSVC s1279 runs 40% faster with aocc than gcc at zen4
@ 2023-01-11 18:56 hubicka at gcc dot gnu.org
2023-01-11 20:21 ` [Bug middle-end/108376] " amonakov at gcc dot gnu.org
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: hubicka at gcc dot gnu.org @ 2023-01-11 18:56 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108376
Bug ID: 108376
Summary: TSVC s1279 runs 40% faster with aocc than gcc at zen4
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
jh@alberti:~/tsvc/bin> more s1279.c
#include <math.h>
#include <malloc.h>
typedef float real_t;
#define iterations 1000000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
real_t aa[LEN_2D][LEN_2D];
real_t bb[LEN_2D][LEN_2D];
real_t cc[LEN_2D][LEN_2D];
real_t qq;
int
main(void)
{
// reductions
// if to max reduction
real_t x;
int * __restrict__ ip = (int *) malloc(LEN_1D*sizeof(real_t));
for (int i = 0; i < LEN_1D; i = i+5){
(ip)[i] = (i+4);
(ip)[i+1] = (i+2);
(ip)[i+2] = (i);
(ip)[i+3] = (i+3);
(ip)[i+4] = (i+1);
}
for (int nl = 0; nl < iterations; nl++) {
for (int i = 0; i < LEN_1D; i++) {
if (a[i] < (real_t)0.) {
if (b[i] > a[i]) {
c[i] += d[i] * e[i];
}
}
}
//dummy(a, b, c, d, e, aa, bb, cc, 0.);
}
return x;
}
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native s1279.c
jh@alberti:~/tsvc/bin> perf stat ./a.out
Performance counter stats for './a.out':
2762.85 msec task-clock:u # 0.999 CPUs utilized
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
265 page-faults:u # 95.915 /sec
10155904052 cycles:u # 3.676 GHz
(83.34%)
20767 stalled-cycles-frontend:u # 0.00% frontend cycles
idle (83.36%)
36970 stalled-cycles-backend:u # 0.00% backend cycles
idle (83.36%)
27985795691 instructions:u # 2.76 insn per cycle
# 0.00 stalled cycles per
insn (83.36%)
1999265642 branches:u # 723.624 M/sec
(83.36%)
502031 branch-misses:u # 0.03% of all branches
(83.23%)
2.764553907 seconds time elapsed
2.763249000 seconds user
0.000000000 seconds sys
jh@alberti:~/tsvc/bin> ~/aocc-compiler-4.0.0/bin/clang -Ofast -march=native
s1279.c
jh@alberti:~/tsvc/bin> perf stat ./a.out
Performance counter stats for './a.out':
1980.94 msec task-clock:u # 0.999 CPUs utilized
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
77 page-faults:u # 38.871 /sec
7261166980 cycles:u # 3.666 GHz
(83.25%)
16796 stalled-cycles-frontend:u # 0.00% frontend cycles
idle (83.25%)
34506 stalled-cycles-backend:u # 0.00% backend cycles
idle (83.25%)
10498254812 instructions:u # 1.45 insn per cycle
# 0.00 stalled cycles per
insn (83.40%)
1500160478 branches:u # 757.299 M/sec
(83.45%)
1000905 branch-misses:u # 0.07% of all branches
(83.40%)
1.982364055 seconds time elapsed
1.981460000 seconds user
0.000000000 seconds sys
aocc does:
.LBB0_6: # %for.inc43.vec.bb
# in Loop: Header=BB0_2 Depth=2
addq $256, %rcx # imm = 0x100
cmpq $128000, %rcx # imm = 0x1F400
je .LBB0_7
.LBB0_2: # %vector.body
# Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
vmovups a(%rcx), %zmm1
vmovups a+64(%rcx), %zmm2
vmovups a+128(%rcx), %zmm3
vmovups a+192(%rcx), %zmm4
# implicit-def: $k4
vcmpltps %zmm0, %zmm1, %k0
vcmpltps %zmm0, %zmm2, %k1
vcmpltps %zmm0, %zmm3, %k2
vcmpltps %zmm0, %zmm4, %k3
kunpckwd %k0, %k1, %k0
kunpckwd %k2, %k3, %k1
# implicit-def: $k2
# implicit-def: $k3
kunpckdq %k0, %k1, %k0
# implicit-def: $k1
kortestq %k0, %k0
je .LBB0_4
# %bb.3: # %if.then.vec.bb
# in Loop: Header=BB0_2 Depth=2
vcmpltps b(%rcx), %zmm1, %k1
vcmpltps b+64(%rcx), %zmm2, %k2
vcmpltps b+128(%rcx), %zmm3, %k3
vcmpltps b+192(%rcx), %zmm4, %k4
.LBB0_4: # %if.then.vec.join.bb
# in Loop: Header=BB0_2 Depth=2
kunpckwd %k1, %k2, %k5
kunpckwd %k3, %k4, %k6
kunpckdq %k5, %k6, %k5
ktestq %k0, %k5
je .LBB0_6
So mask registers do the conditionals
and GCC with 256bit vectors:
.L2:
vmovdqa %ymm7, %ymm1
vmovdqa %ymm8, %ymm0
addq $160, %rax
vpaddd %ymm4, %ymm8, %ymm8
vpaddd %ymm18, %ymm1, %ymm2
vpaddd %ymm17, %ymm1, %ymm1
vpaddd %ymm4, %ymm7, %ymm7
vextracti64x2 $1, %ymm2, %xmm3
vmovq %xmm2, -160(%rax)
vpextrq $1, %xmm2, -140(%rax)
vmovq %xmm1, -80(%rax)
vpextrq $1, %xmm1, -60(%rax)
valignq $3, %ymm2, %ymm2, %ymm2
vmovq %xmm3, -120(%rax)
vmovdqa %ymm0, %ymm3
vmovq %xmm2, -100(%rax)
vextracti64x2 $1, %ymm1, %xmm2
valignq $3, %ymm1, %ymm1, %ymm1
vmovq %xmm2, -40(%rax)
vpaddd %ymm5, %ymm0, %ymm2
vmovd %xmm2, -144(%rax)
vpextrd $1, %xmm2, -124(%rax)
vpextrd $2, %xmm2, -104(%rax)
vmovq %xmm1, -20(%rax)
vpaddd %ymm6, %ymm0, %ymm1
vpermt2d %ymm1, %ymm16, %ymm3
vpextrd $3, %xmm2, -84(%rax)
vmovq %xmm3, -152(%rax)
vmovdqa %ymm0, %ymm3
vpermt2d %ymm1, %ymm15, %ymm3
vmovq %xmm3, -132(%rax)
vmovdqa %ymm0, %ymm3
vpermt2d %ymm1, %ymm14, %ymm3
vmovq %xmm3, -112(%rax)
vmovdqa %ymm0, %ymm3
vpermt2d %ymm1, %ymm13, %ymm3
vmovq %xmm3, -92(%rax)
vmovdqa %ymm0, %ymm3
vpermt2d %ymm1, %ymm12, %ymm3
vmovq %xmm3, -72(%rax)
vmovdqa %ymm0, %ymm3
vpermt2d %ymm1, %ymm11, %ymm3
vmovq %xmm3, -52(%rax)
vmovdqa %ymm0, %ymm3
vpermt2d %ymm1, %ymm9, %ymm0
vmovq %xmm0, -12(%rax)
vpermt2d %ymm1, %ymm10, %ymm3
vextracti32x4 $1, %ymm2, %xmm0
vmovq %xmm3, -32(%rax)
vmovd %xmm0, -64(%rax)
valignd $5, %ymm2, %ymm2, %ymm0
vmovd %xmm0, -44(%rax)
valignd $6, %ymm2, %ymm2, %ymm0
valignd $7, %ymm2, %ymm2, %ymm2
vmovd %xmm0, -24(%rax)
vmovd %xmm2, -4(%rax)
cmpq %rax, %rcx
jne .L2
with 512bit vectors:
.L2:
vmovdqa32 %zmm5, %zmm1
addq $320, %rax
vpaddd %zmm2, %zmm5, %zmm5
vmovdqa32 %zmm6, %zmm0
vpaddd %zmm2, %zmm6, %zmm6
vpaddd %zmm24, %zmm1, %zmm25
vpaddd %zmm23, %zmm1, %zmm1
valignq $3, %ymm25, %ymm25, %ymm26
vmovq %xmm25, -320(%rax)
vpextrq $1, %xmm25, -300(%rax)
vmovq %xmm1, -160(%rax)
vpextrq $1, %xmm1, -140(%rax)
vextracti64x2 $1, %ymm25, %xmm27
vextracti64x4 $0x1, %zmm25, %ymm25
vmovq %xmm26, -260(%rax)
vmovq %xmm25, -240(%rax)
vpextrq $1, %xmm25, -220(%rax)
vextracti64x2 $1, %ymm25, %xmm26
vmovq %xmm27, -280(%rax)
valignq $3, %ymm25, %ymm25, %ymm25
vmovq %xmm26, -200(%rax)
vmovq %xmm25, -180(%rax)
valignq $3, %ymm1, %ymm1, %ymm25
vextracti64x2 $1, %ymm1, %xmm26
vextracti64x4 $0x1, %zmm1, %ymm1
vmovq %xmm25, -100(%rax)
vmovq %xmm1, -80(%rax)
vpextrq $1, %xmm1, -60(%rax)
vextracti64x2 $1, %ymm1, %xmm25
vmovq %xmm26, -120(%rax)
vmovdqa32 %zmm0, %zmm26
valignq $3, %ymm1, %ymm1, %ymm1
vmovq %xmm25, -40(%rax)
vpaddd %zmm3, %zmm0, %zmm25
vmovq %xmm1, -20(%rax)
vpaddd %zmm4, %zmm0, %zmm1
vpermt2d %zmm1, %zmm22, %zmm26
vmovq %xmm26, -312(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm21, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -292(%rax)
vpermt2d %zmm1, %zmm20, %zmm26
vmovq %xmm26, -272(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm19, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -252(%rax)
vpermt2d %zmm1, %zmm18, %zmm26
vmovq %xmm26, -232(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm17, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -212(%rax)
vpermt2d %zmm1, %zmm16, %zmm26
vmovq %xmm26, -192(%rax)
vmovdqa32 %zmm0, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -172(%rax)
vpermt2d %zmm1, %zmm14, %zmm26
vmovq %xmm26, -152(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm13, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -132(%rax)
vpermt2d %zmm1, %zmm12, %zmm26
vmovq %xmm26, -112(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm11, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -92(%rax)
vpermt2d %zmm1, %zmm10, %zmm26
vmovq %xmm26, -72(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm9, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm7, %zmm0
vmovq %xmm0, -12(%rax)
movq %rdx, -52(%rax)
vmovdqa32 %ymm25, %ymm0
vpermt2d %zmm1, %zmm8, %zmm26
vextracti32x4 $1, %ymm25, %xmm1
vmovq %xmm26, -32(%rax)
vmovd %xmm25, -304(%rax)
vpextrd $1, %xmm0, -284(%rax)
vpextrd $2, %xmm0, -264(%rax)
vmovd %xmm1, -224(%rax)
valignd $5, %ymm25, %ymm25, %ymm1
vpextrd $3, %xmm0, -244(%rax)
valignd $7, %ymm25, %ymm25, %ymm0
vmovd %xmm1, -204(%rax)
valignd $6, %ymm25, %ymm25, %ymm1
vmovd %xmm0, -164(%rax)
vextracti32x8 $0x1, %zmm25, %ymm0
vmovd %xmm0, -144(%rax)
vpextrd $1, %xmm0, -124(%rax)
vmovd %xmm1, -184(%rax)
vextracti32x4 $1, %ymm0, %xmm1
vpextrd $2, %xmm0, -104(%rax)
vpextrd $3, %xmm0, -84(%rax)
vmovd %xmm1, -64(%rax)
valignd $5, %ymm0, %ymm0, %ymm1
vmovd %xmm1, -44(%rax)
valignd $6, %ymm0, %ymm0, %ymm1
valignd $7, %ymm0, %ymm0, %ymm0
vmovd %xmm1, -24(%rax)
vmovd %xmm0, -4(%rax)
cmpq %rax, %rcx
jne .L2
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug middle-end/108376] TSVC s1279 runs 40% faster with aocc than gcc at zen4
2023-01-11 18:56 [Bug middle-end/108376] New: TSVC s1279 runs 40% faster with aocc than gcc at zen4 hubicka at gcc dot gnu.org
@ 2023-01-11 20:21 ` amonakov at gcc dot gnu.org
2023-01-12 10:34 ` rguenth at gcc dot gnu.org
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: amonakov at gcc dot gnu.org @ 2023-01-11 20:21 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108376
Alexander Monakov <amonakov at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |amonakov at gcc dot gnu.org
--- Comment #1 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
I think your GCC dumps are for the wrong loop.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug middle-end/108376] TSVC s1279 runs 40% faster with aocc than gcc at zen4
2023-01-11 18:56 [Bug middle-end/108376] New: TSVC s1279 runs 40% faster with aocc than gcc at zen4 hubicka at gcc dot gnu.org
2023-01-11 20:21 ` [Bug middle-end/108376] " amonakov at gcc dot gnu.org
@ 2023-01-12 10:34 ` rguenth at gcc dot gnu.org
2023-01-14 22:30 ` hubicka at gcc dot gnu.org
2024-02-09 13:47 ` rguenth at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-01-12 10:34 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108376
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Last reconfirmed| |2023-01-12
Status|UNCONFIRMED |NEW
Ever confirmed|0 |1
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
As far as I can see a[] is all zeros. AOCC basically preserves the
loop control flow when if (a[i] < 0.) for all elements processed in the
iteration, likewise for if (b[i] > a[i]) but GCC if-converts this all
down to combined masking of the guarded code.
I think the testcase as-is is too artificial to be relevant. GCC
has code to do such a thing to convert masked stores, but in this case
we are not using masked stores or masked loads:
.L3:
vmovaps a(%rax), %ymm3
vmovaps b(%rax), %ymm4
vmovaps c(%rax), %ymm7
addq $32, %rax
vmovaps c-32(%rax), %ymm0
vmovaps e-32(%rax), %ymm5
vcmpps $1, %ymm1, %ymm3, %k1
vcmpps $14, %ymm3, %ymm4, %k1{%k1}
vfmadd231ps d-32(%rax), %ymm5, %ymm0{%k1}
vfmadd231ps d-32(%rax), %ymm5, %ymm0
vblendmps %ymm0, %ymm7, %ymm0{%k1}
vmovaps %ymm0, c-32(%rax)
cmpq $128000, %rax
jne .L3
I suspect if you do a less optimal initialization of a/b then the AOCC
code will be slower.
Note GCC applies unroll-and-jam to the loop (the outer iteration is
visibly redundant, so we are eventually doing half of the work as AOCC ;))
Confirmed for us not vectorizing control flow but if-converting.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug middle-end/108376] TSVC s1279 runs 40% faster with aocc than gcc at zen4
2023-01-11 18:56 [Bug middle-end/108376] New: TSVC s1279 runs 40% faster with aocc than gcc at zen4 hubicka at gcc dot gnu.org
2023-01-11 20:21 ` [Bug middle-end/108376] " amonakov at gcc dot gnu.org
2023-01-12 10:34 ` rguenth at gcc dot gnu.org
@ 2023-01-14 22:30 ` hubicka at gcc dot gnu.org
2024-02-09 13:47 ` rguenth at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: hubicka at gcc dot gnu.org @ 2023-01-14 22:30 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108376
--- Comment #3 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
If I make the arrays random then GCC code is indeed faster:
#include <math.h>
#include <malloc.h>
typedef float real_t;
#define iterations 1000000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
real_t aa[LEN_2D][LEN_2D];
real_t bb[LEN_2D][LEN_2D];
real_t cc[LEN_2D][LEN_2D];
real_t qq;
int
main(void)
{
// reductions
// if to max reduction
real_t x;
for (int i = 0; i < LEN_1D; i++)
{
a[i]=(rand() %5) - 3;
b[i]=(rand() %6) - 3;
}
for (int nl = 0; nl < iterations; nl++) {
for (int i = 0; i < LEN_1D; i++) {
if (a[i] < (real_t)0.) {
if (b[i] > a[i]) {
c[i] += d[i] * e[i];
}
}
}
//dummy(a, b, c, d, e, aa, bb, cc, 0.);
}
return x;
}
jh@alberti:~/tsvc/bin> ~/aocc-compiler-4.0.0/bin/clang -Ofast s1279.c
-march=native
s1279.c:23:14: warning: implicit declaration of function 'rand' is invalid in
C99 [-Wimplicit-function-declaration]
a[i]=(rand() %5) - 3;
^
1 warning generated.
jh@alberti:~/tsvc/bin> time ./a.out
real 0m5.638s
user 0m5.636s
sys 0m0.000s
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast s1279.c -march=native
s1279.c: In function 'main':
s1279.c:23:14: warning: implicit declaration of function 'rand'
[-Wimplicit-function-declaration]
23 | a[i]=(rand() %5) - 3;
| ^~~~
jh@alberti:~/tsvc/bin> time ./a.out
real 0m2.791s
user 0m2.790s
sys 0m0.000s
sorry for wrong code, just for reference the loop compiles as:
.L4:
xorl %eax, %eax
.p2align 4
.p2align 3
.L3:
vmovaps a(%rax), %ymm2
vmovaps b(%rax), %ymm3
vmovaps c(%rax), %ymm6
addq $32, %rax
vmovaps c-32(%rax), %ymm0
vmovaps e-32(%rax), %ymm4
vcmpps $1, %ymm1, %ymm2, %k1
vcmpps $14, %ymm2, %ymm3, %k1{%k1}
vfmadd231ps d-32(%rax), %ymm4, %ymm0{%k1}
vfmadd231ps d-32(%rax), %ymm4, %ymm0
vblendmps %ymm0, %ymm6, %ymm0{%k1}
vmovaps %ymm0, c-32(%rax)
cmpq $128000, %rax
jne .L3
decl %edx
jne .L4
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug middle-end/108376] TSVC s1279 runs 40% faster with aocc than gcc at zen4
2023-01-11 18:56 [Bug middle-end/108376] New: TSVC s1279 runs 40% faster with aocc than gcc at zen4 hubicka at gcc dot gnu.org
` (2 preceding siblings ...)
2023-01-14 22:30 ` hubicka at gcc dot gnu.org
@ 2024-02-09 13:47 ` rguenth at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-02-09 13:47 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108376
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Resolution|--- |WONTFIX
Status|NEW |RESOLVED
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
So I'd say INVALID or WONTFIX.
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2024-02-09 13:47 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-11 18:56 [Bug middle-end/108376] New: TSVC s1279 runs 40% faster with aocc than gcc at zen4 hubicka at gcc dot gnu.org
2023-01-11 20:21 ` [Bug middle-end/108376] " amonakov at gcc dot gnu.org
2023-01-12 10:34 ` rguenth at gcc dot gnu.org
2023-01-14 22:30 ` hubicka at gcc dot gnu.org
2024-02-09 13:47 ` rguenth at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).