* [Bug tree-optimization/99408] s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc
From: hubicka at gcc dot gnu.org @ 2022-11-16 17:03 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99408
--- Comment #2 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
This also reproduces with zen4 and double.
jh@alberti:~/tsvc/bin> cat tt.c
typedef double real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
void
main(void)
{
for (int nl = 0; nl < iterations; nl++) {
for (int i = 0; i < LEN_1D-1; i++){
a[i+1] = b[i]+c[i];
b[i] = c[i]*e[i];
d[i] = a[i]*e[i];
}
}
}
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native tt.c
jh@alberti:~/tsvc/bin> time ./a.out
real 0m3.590s
user 0m3.585s
sys 0m0.004s
jh@alberti:~/tsvc/bin> clang -Ofast -march=native tt.c
tt.c:6:1: warning: return type of 'main' is not 'int' [-Wmain-return-type]
void
^
tt.c:6:1: note: change return type to 'int'
void
^~~~
int
1 warning generated.
jh@alberti:~/tsvc/bin> time ./a.out
real 0m1.538s
user 0m1.538s
sys 0m0.000s
gcc generates:
│ 60: vmovapd 0x67e080(%rax),%zmm7
0.15 │ vmovapd 0x601080(%rax),%zmm2
1.07 │ add $0x40,%rax
│ vaddpd 0x6bc840(%rax),%zmm7,%zmm0
0.00 │ vmovupd %zmm0,0x6fb048(%rax)
11.10 │ vmulpd 0x601040(%rax),%zmm7,%zmm0
9.46 │ vmovapd %zmm0,0x6bc840(%rax)
0.01 │ vmulpd 0x6fb040(%rax),%zmm2,%zmm0
78.20 │ vmovapd %zmm0,0x63f840(%rax)
│ cmp $0x3e7c0,%rax
│ ↑ jne 60
clang generates:
│ nop
│ a0: vmovupd (%r9,%rdx,1),%zmm15
0.46 │ vmovupd (%r9,%rdi,1),%zmm19
0.22 │ vmovupd 0x40(%r9,%rdx,1),%zmm16
0.56 │ vmovupd 0x40(%r9,%rdi,1),%zmm22
0.92 │ vmovupd 0x80(%r9,%rdx,1),%zmm17
1.85 │ vmovupd 0x80(%r9,%rdi,1),%zmm21
1.51 │ vaddpd (%r9,%rcx,1),%zmm15,%zmm18
0.84 │ vmulpd %zmm15,%zmm19,%zmm15
0.47 │ vmovupd %zmm15,(%r9,%rcx,1)
3.37 │ vaddpd 0x40(%r9,%rcx,1),%zmm16,%zmm15
0.56 │ vmulpd %zmm16,%zmm22,%zmm16
0.69 │ vmovupd %zmm16,0x40(%r9,%rcx,1)
3.82 │ vmovupd %zmm18,0x8(%r9,%rsi,1)
3.27 │ vmovapd %zmm15,%zmm20
│ vmovupd %zmm15,0x48(%r9,%rsi,1)
3.60 │ vpermt2pd %zmm18,%zmm13,%zmm20
0.47 │ vpermt2pd %zmm14,%zmm13,%zmm18
0.36 │ vmulpd %zmm19,%zmm18,%zmm18
1.07 │ vmulpd %zmm22,%zmm20,%zmm14
1.33 │ vmovupd %zmm18,(%r9,%r8,1)
6.31 │ vmovupd %zmm14,0x40(%r9,%r8,1)
8.02 │ vaddpd 0x80(%r9,%rcx,1),%zmm17,%zmm14
0.53 │ vmovapd %zmm14,%zmm16
0.05 │ vmovupd %zmm14,0x88(%r9,%rsi,1)
3.08 │ vpermt2pd %zmm15,%zmm13,%zmm16
0.41 │ vmulpd %zmm17,%zmm21,%zmm15
0.20 │ vmovupd %zmm15,0x80(%r9,%rcx,1)
1.60 │ vmulpd %zmm21,%zmm16,%zmm15
1.16 │ vmovupd %zmm15,0x80(%r9,%r8,1)
3.13 │ add $0xc0,%r9
│ cmp $0x3e7c0,%r9
0.03 │ ↑ jne a0
So the difference is how the forward dependency is handled here (a[i+1] stored in one iteration, read as a[i] in the next)?
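For illustration, a minimal scalar sketch of that idea, reusing the declarations from tt.c above (the function name kernel_scalar_rotated is made up, and this is not claimed to be either compiler's actual transform): the value stored to a[i+1] can be kept live in a register and reused as a[i] in the next iteration instead of being reloaded from memory, which is roughly what clang's vpermt2pd lane rotation achieves at vector width:

/* Scalar sketch of the register-carried dependence; assumes the globals
   and the real_t typedef from tt.c above. */
void kernel_scalar_rotated (void)
{
  real_t prev_a = a[0];                /* a[0] is never written by the loop */
  for (int i = 0; i < LEN_1D - 1; i++)
    {
      real_t next_a = b[i] + c[i];     /* the new a[i+1], kept in a register */
      a[i + 1] = next_a;
      b[i] = c[i] * e[i];
      d[i] = prev_a * e[i];            /* reuse the register, no reload of a[i] */
      prev_a = next_a;                 /* carry into the next iteration */
    }
}

At vector factor 8 the carried value is the last lane of the previous sum vector, and vpermt2pd concatenates it with the first seven lanes of the new one. gcc's loop instead re-reads a[] with the load 0x6fb040(%rax), which overlaps the immediately preceding store to 0x6fb048(%rax) by all but one lane, so it likely defeats store-to-load forwarding every iteration.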
* [Bug tree-optimization/99408] s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc
From: hubicka at gcc dot gnu.org @ 2023-01-11 19:03 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99408
--- Comment #3 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
With zen4 the gcc-built loop takes 19s, while the aocc build takes 6.6s.
aocc:
.LBB0_1: # %for.cond22.preheader
# =>This Loop Header: Depth=1
# Child Loop BB0_2 Depth 2
vbroadcastss a(%rip), %zmm20
xorl %ecx, %ecx
.p2align 4, 0x90
.LBB0_2: # %vector.body
# Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
vmovups c(%rcx), %zmm13
vmovaps %zmm20, %zmm12
vmovups e(%rcx), %zmm0
vaddps b(%rcx), %zmm13, %zmm20
vmulps %zmm13, %zmm0, %zmm13
vmovaps %zmm20, %zmm15
vpermt2ps %zmm12, %zmm29, %zmm15
vmovups %zmm20, a+4(%rcx)
vmovups %zmm13, b(%rcx)
vmulps %zmm0, %zmm15, %zmm12
vmovups %zmm12, d(%rcx)
addq $64, %rcx
cmpq $127936, %rcx # imm = 0x1F3C0
jne .LBB0_2
# %bb.3: # %middle.block
vextractf32x4 $3, %zmm20, %xmm5
vmovss -4(%rsp), %xmm2 # 4-byte Reload
# xmm2 = mem[0],zero,zero,zero
vmovss -12(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
incl %eax
vaddss b+127936(%rip), %xmm2, %xmm2
vpermilps $231, %xmm5, %xmm5 # xmm5 = xmm5[3,1,2,3]
vmulss -8(%rsp), %xmm5, %xmm5 # 4-byte Folded Reload
vmovss %xmm0, b+127936(%rip)
vmovss -16(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vmovss %xmm2, a+127940(%rip)
vmulss -20(%rsp), %xmm2, %xmm2 # 4-byte Folded Reload
vmovss %xmm5, d+127936(%rip)
vaddss b+127940(%rip), %xmm0, %xmm5
vmovss -24(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vmovss %xmm0, b+127940(%rip)
vmovss -28(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vmovss %xmm2, d+127940(%rip)
vaddss b+127944(%rip), %xmm0, %xmm2
vmovss -36(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vmovss %xmm5, a+127944(%rip)
vmulss -32(%rsp), %xmm5, %xmm5 # 4-byte Folded Reload
vmovss %xmm0, b+127944(%rip)
vmovss -40(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vmovss %xmm2, a+127948(%rip)
vmulss %xmm22, %xmm2, %xmm2
vmovss %xmm5, d+127944(%rip)
vaddss b+127948(%rip), %xmm21, %xmm5
vmovss %xmm2, d+127948(%rip)
vmovss %xmm0, b+127948(%rip)
vmovss -44(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vaddss b+127952(%rip), %xmm24, %xmm2
vmovss %xmm5, a+127952(%rip)
vmulss %xmm25, %xmm5, %xmm5
vmovss %xmm0, b+127952(%rip)
vmovss -48(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vmovss %xmm5, d+127952(%rip)
vaddss b+127956(%rip), %xmm27, %xmm5
vmovss %xmm2, a+127956(%rip)
vmulss %xmm28, %xmm2, %xmm2
vmovss %xmm2, d+127956(%rip)
vmovss %xmm0, b+127956(%rip)
vmovss -52(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vaddss b+127960(%rip), %xmm30, %xmm2
vmovss %xmm5, a+127960(%rip)
vmulss %xmm31, %xmm5, %xmm5
vmovss %xmm5, d+127960(%rip)
vmovss %xmm0, b+127960(%rip)
vmovss -56(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vaddss b+127964(%rip), %xmm16, %xmm5
vmovss %xmm2, a+127964(%rip)
vmulss %xmm18, %xmm2, %xmm2
vmovss %xmm2, d+127964(%rip)
vmovss %xmm0, b+127964(%rip)
vmovss -60(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vaddss b+127968(%rip), %xmm19, %xmm2
vmovss %xmm5, a+127968(%rip)
vmulss %xmm1, %xmm5, %xmm5
vmovss %xmm5, d+127968(%rip)
vmovss %xmm0, b+127968(%rip)
vmovss -64(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vaddss b+127972(%rip), %xmm3, %xmm5
vmovss %xmm2, a+127972(%rip)
vmulss %xmm4, %xmm2, %xmm2
vmovss %xmm2, d+127972(%rip)
vmovss %xmm0, b+127972(%rip)
vmovss -68(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vaddss b+127976(%rip), %xmm6, %xmm2
vmovss %xmm5, a+127976(%rip)
vmulss %xmm7, %xmm5, %xmm5
vmovss %xmm5, d+127976(%rip)
vmovss %xmm0, b+127976(%rip)
vmovss -72(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vaddss b+127980(%rip), %xmm9, %xmm5
vmovss %xmm2, a+127980(%rip)
vmulss %xmm2, %xmm10, %xmm2
vmovss %xmm2, d+127980(%rip)
vmovss %xmm0, b+127980(%rip)
vmovss -76(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vaddss b+127984(%rip), %xmm14, %xmm2
vmovss %xmm5, a+127984(%rip)
vmulss %xmm17, %xmm5, %xmm5
vmovss %xmm5, d+127984(%rip)
vmovss %xmm0, b+127984(%rip)
vmovss -80(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vaddss b+127988(%rip), %xmm23, %xmm5
vmovss %xmm2, a+127988(%rip)
vmulss %xmm26, %xmm2, %xmm2
vmovss %xmm2, d+127988(%rip)
vmovss %xmm0, b+127988(%rip)
vaddss b+127992(%rip), %xmm8, %xmm2
vmulss %xmm5, %xmm11, %xmm0
vmovss %xmm5, a+127992(%rip)
vmovss %xmm0, d+127992(%rip)
vmovss -84(%rsp), %xmm0 # 4-byte Reload
# xmm0 = mem[0],zero,zero,zero
vmovss %xmm2, a+127996(%rip)
vmovss %xmm0, b+127992(%rip)
cmpl $1000000, %eax # imm = 0xF4240
jne .LBB0_1
gcc:
.L2:
vmovdqa32 %zmm5, %zmm1
addq $320, %rax
vpaddd %zmm2, %zmm5, %zmm5
vmovdqa32 %zmm6, %zmm0
vpaddd %zmm2, %zmm6, %zmm6
vpaddd %zmm24, %zmm1, %zmm25
vpaddd %zmm23, %zmm1, %zmm1
valignq $3, %ymm25, %ymm25, %ymm26
vmovq %xmm25, -320(%rax)
vpextrq $1, %xmm25, -300(%rax)
vmovq %xmm1, -160(%rax)
vpextrq $1, %xmm1, -140(%rax)
vextracti64x2 $1, %ymm25, %xmm27
vextracti64x4 $0x1, %zmm25, %ymm25
vmovq %xmm26, -260(%rax)
vmovq %xmm25, -240(%rax)
vpextrq $1, %xmm25, -220(%rax)
vextracti64x2 $1, %ymm25, %xmm26
vmovq %xmm27, -280(%rax)
valignq $3, %ymm25, %ymm25, %ymm25
vmovq %xmm26, -200(%rax)
vmovq %xmm25, -180(%rax)
valignq $3, %ymm1, %ymm1, %ymm25
vextracti64x2 $1, %ymm1, %xmm26
vextracti64x4 $0x1, %zmm1, %ymm1
vmovq %xmm25, -100(%rax)
vmovq %xmm1, -80(%rax)
vpextrq $1, %xmm1, -60(%rax)
vextracti64x2 $1, %ymm1, %xmm25
vmovq %xmm26, -120(%rax)
vmovdqa32 %zmm0, %zmm26
valignq $3, %ymm1, %ymm1, %ymm1
vmovq %xmm25, -40(%rax)
vpaddd %zmm3, %zmm0, %zmm25
vmovq %xmm1, -20(%rax)
vpaddd %zmm4, %zmm0, %zmm1
vpermt2d %zmm1, %zmm22, %zmm26
vmovq %xmm26, -312(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm21, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -292(%rax)
vpermt2d %zmm1, %zmm20, %zmm26
vmovq %xmm26, -272(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm19, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -252(%rax)
vpermt2d %zmm1, %zmm18, %zmm26
vmovq %xmm26, -232(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm17, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -212(%rax)
vpermt2d %zmm1, %zmm16, %zmm26
vmovq %xmm26, -192(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm15, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -172(%rax)
vpermt2d %zmm1, %zmm14, %zmm26
vmovq %xmm26, -152(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm13, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -132(%rax)
vpermt2d %zmm1, %zmm12, %zmm26
vmovq %xmm26, -112(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm11, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -92(%rax)
vpermt2d %zmm1, %zmm10, %zmm26
vmovq %xmm26, -72(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm9, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm7, %zmm0
vmovq %xmm0, -12(%rax)
movq %rdx, -52(%rax)
vmovdqa32 %ymm25, %ymm0
vpermt2d %zmm1, %zmm8, %zmm26
vextracti32x4 $1, %ymm25, %xmm1
vmovq %xmm26, -32(%rax)
vmovd %xmm25, -304(%rax)
vpextrd $1, %xmm0, -284(%rax)
vpextrd $2, %xmm0, -264(%rax)
vmovd %xmm1, -224(%rax)
valignd $5, %ymm25, %ymm25, %ymm1
vpextrd $3, %xmm0, -244(%rax)
valignd $7, %ymm25, %ymm25, %ymm0
vmovd %xmm1, -204(%rax)
valignd $6, %ymm25, %ymm25, %ymm1
vmovd %xmm0, -164(%rax)
vextracti32x8 $0x1, %zmm25, %ymm0
vmovd %xmm0, -144(%rax)
vpextrd $1, %xmm0, -124(%rax)
vmovd %xmm1, -184(%rax)
vextracti32x4 $1, %ymm0, %xmm1
vpextrd $2, %xmm0, -104(%rax)
vpextrd $3, %xmm0, -84(%rax)
vmovd %xmm1, -64(%rax)
valignd $5, %ymm0, %ymm0, %ymm1
vmovd %xmm1, -44(%rax)
valignd $6, %ymm0, %ymm0, %ymm1
valignd $7, %ymm0, %ymm0, %ymm0
vmovd %xmm1, -24(%rax)
vmovd %xmm0, -4(%rax)
cmpq %rcx, %rax
jne .L2