public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc
@ 2021-03-05 14:30 hubicka at gcc dot gnu.org
2021-03-05 14:41 ` [Bug middle-end/99411] s311 and s31111 " hubicka at gcc dot gnu.org
` (7 more replies)
0 siblings, 8 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 14:30 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411
Bug ID: 99411
Summary: s311 benchmark of TSVC is vectorized by clang better
than by gcc
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
typedef float real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D];
// s311 reproducer: sums every element of the global array a into one float
// accumulator and repeats the whole reduction iterations*10 times.
// Returns 1 if the final sum is greater than 4, otherwise 0.
int main()
{
// reductions
// sum reduction
real_t sum;
// The outer loop only repeats the kernel; sum is reset on every pass, so the
// value tested at the end comes from the last pass alone.
for (int nl = 0; nl < iterations*10; nl++) {
sum = (real_t)0.;
// The reduction under discussion: one serial chain of float additions.
for (int i = 0; i < LEN_1D; i++) {
sum += a[i];
}
}
return sum > 4;
}
We produce with -O2 -march=znver2
# gcc -O2 -march=znver2 output for the loops above.  This is a fragment: the
# function label and the initialisation of %edx are not part of the listing.
.L2:
# Start of one outer-loop pass: point %rax at a and zero the accumulator.
movl $a, %eax
vxorps %xmm0, %xmm0, %xmm0
.p2align 4
.p2align 3
.L3:
# Inner loop: a single 32-byte vaddps per iteration, always into %ymm0, so
# every add depends on the result of the previous one.
vaddps (%rax), %ymm0, %ymm0
addq $32, %rax
cmpq $a+128000, %rax
jne .L3
# Horizontal reduction of %ymm0 to one float.  It is placed before
# "jne .L2", so it runs on every outer-loop pass.
vextractf128 $0x1, %ymm0, %xmm1
# Outer-loop counter; the flags set here are the ones tested by "jne .L2".
decl %edx
vaddps %xmm0, %xmm1, %xmm1
vmovhlps %xmm1, %xmm1, %xmm0
vaddps %xmm1, %xmm0, %xmm0
vshufps $85, %xmm0, %xmm0, %xmm1
vaddps %xmm0, %xmm1, %xmm0
jne .L2
# Return value: 1 if the reduced sum is above the constant at .LC0, else 0.
xorl %eax, %eax
vcomiss .LC0(%rip), %xmm0
seta %al
vzeroupper
ret
.cfi_endproc
clang does:
# clang output for the same source.  The inner loop keeps four separate
# vector accumulators instead of one.
main: # @main
.cfi_startproc
# %bb.0:
# %eax counts outer-loop passes (compared against 1000000 further down).
xorl %eax, %eax
.p2align 4, 0x90
.LBB0_1: # =>This Loop Header: Depth=1
# Child Loop BB0_2 Depth 2
# Zero the four accumulators %ymm0..%ymm3 at the start of each pass.
# %rcx is a negative byte offset that counts up towards zero.
vxorps %xmm0, %xmm0, %xmm0
movq $-128000, %rcx # imm = 0xFFFE0C00
vxorps %xmm1, %xmm1, %xmm1
vxorps %xmm2, %xmm2, %xmm2
vxorps %xmm3, %xmm3, %xmm3
.p2align 4, 0x90
.LBB0_2: # Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
# Four 32-byte adds per iteration (128 bytes of a), each into its own
# register, so the four add chains do not depend on one another.
vaddps a+128000(%rcx), %ymm0, %ymm0
vaddps a+128032(%rcx), %ymm1, %ymm1
vaddps a+128064(%rcx), %ymm2, %ymm2
vaddps a+128096(%rcx), %ymm3, %ymm3
# Subtracting -128 advances the offset by 128; the loop ends when it hits 0.
subq $-128, %rcx
jne .LBB0_2
# %bb.3: # in Loop: Header=BB0_1 Depth=1
incl %eax
cmpl $1000000, %eax # imm = 0xF4240
jne .LBB0_1
# %bb.4:
# Only after the outer loop has finished: merge the four accumulators and
# reduce the result horizontally to a single float.
vaddps %ymm0, %ymm1, %ymm0
xorl %eax, %eax
vaddps %ymm0, %ymm2, %ymm0
vaddps %ymm0, %ymm3, %ymm0
vextractf128 $1, %ymm0, %xmm1
vaddps %xmm1, %xmm0, %xmm0
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddps %xmm1, %xmm0, %xmm0
vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3]
vaddss %xmm1, %xmm0, %xmm0
# Return value: 1 if the sum is above the constant at .LCPI0_0, else 0.
vucomiss .LCPI0_0(%rip), %xmm0
seta %al
vzeroupper
retq
On zen3 hardware the gcc version runs in 2.4s, while clang's runs in 0.8s
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug middle-end/99411] s311 and s31111 benchmark of TSVC is vectorized by clang better than by gcc
2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
@ 2021-03-05 14:41 ` hubicka at gcc dot gnu.org
2021-03-05 14:43 ` [Bug middle-end/99411] s311, s312 " hubicka at gcc dot gnu.org
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 14:41 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411
Jan Hubicka <hubicka at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Summary|s311 benchmark of TSVC is |s311 and s31111 benchmark
|vectorized by clang better |of TSVC is vectorized by
|than by gcc |clang better than by gcc
--- Comment #1 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
I think this is the same case
typedef float real_t;
#define iterations 1000000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D];
// Returns the sum of the four consecutive elements A[0]..A[3].
// The additions are performed in index order, starting from 0.0.
real_t test(real_t* A){
real_t s = (real_t)0.0;
for (int i = 0; i < 4; i++)
s += A[i];
return s;
}
// Reproducer added to the bug summary as s31111: the sum reduction is written
// as eight calls to test(), each covering four elements, instead of one loop.
// Returns 1 if the final sum is greater than 4, otherwise 0.
int main()
{
// reductions
// sum reduction
real_t sum;
// The outer loop repeats the kernel 2000*iterations times; sum is reset on
// every pass.
for (int nl = 0; nl < 2000*iterations; nl++) {
sum = (real_t)0.;
// Eight adjacent 4-element slices, together covering a[0]..a[31].
sum += test(a);
sum += test(&a[4]);
sum += test(&a[8]);
sum += test(&a[12]);
sum += test(&a[16]);
sum += test(&a[20]);
sum += test(&a[24]);
sum += test(&a[28]);
}
return sum>4;
}
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug middle-end/99411] s311, s312 and s31111 benchmark of TSVC is vectorized by clang better than by gcc
2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
2021-03-05 14:41 ` [Bug middle-end/99411] s311 and s31111 " hubicka at gcc dot gnu.org
@ 2021-03-05 14:43 ` hubicka at gcc dot gnu.org
2021-03-05 14:46 ` [Bug middle-end/99411] s311, s312, s31111 " hubicka at gcc dot gnu.org
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 14:43 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411
Jan Hubicka <hubicka at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Summary|s311 and s31111 benchmark |s311, s312 and s31111
|of TSVC is vectorized by |benchmark of TSVC is
|clang better than by gcc |vectorized by clang better
| |than by gcc
--- Comment #2 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
another one:
// %3.1
typedef float real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D];
// Product-reduction reproducer (added to the bug summary as s312): multiplies
// every element of the global array a into one float accumulator and repeats
// the whole reduction 10*iterations times.
// Returns 1 if the final product is greater than 0, otherwise 0.
int main ()
{
// reductions
// product reduction
real_t prod;
// The outer loop only repeats the kernel; prod is reset to 1 on every pass.
for (int nl = 0; nl < 10*iterations; nl++) {
prod = (real_t)1.;
// One serial chain of float multiplications.
for (int i = 0; i < LEN_1D; i++) {
prod *= a[i];
}
}
return prod > 0;
}
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug middle-end/99411] s311, s312, s31111 and s31111 benchmark of TSVC is vectorized by clang better than by gcc
2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
2021-03-05 14:41 ` [Bug middle-end/99411] s311 and s31111 " hubicka at gcc dot gnu.org
2021-03-05 14:43 ` [Bug middle-end/99411] s311, s312 " hubicka at gcc dot gnu.org
@ 2021-03-05 14:46 ` hubicka at gcc dot gnu.org
2021-03-05 14:49 ` [Bug middle-end/99411] s311, s312, s31111 and s31111, s3110 " hubicka at gcc dot gnu.org
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 14:46 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411
Jan Hubicka <hubicka at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Summary|s311, s312 and s31111 |s311, s312, s31111 and
|benchmark of TSVC is |s31111 benchmark of TSVC is
|vectorized by clang better |vectorized by clang better
|than by gcc |than by gcc
--- Comment #3 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
and yet another one
typedef float real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D];
// Conditional sum-reduction reproducer: adds only the elements of a that are
// strictly greater than zero, repeating the reduction iterations/2 times.
// NOTE(review): presumably TSVC s3111 -- the summary lists "s31111" twice; confirm.
// Returns 1 if the final sum is greater than 4, otherwise 0.
int main()
{
// reductions
// conditional sum reduction
real_t sum;
// The outer loop only repeats the kernel; sum is reset on every pass.
for (int nl = 0; nl < iterations/2; nl++) {
sum = 0.;
for (int i = 0; i < LEN_1D; i++) {
// Elements that are zero or negative are skipped.
if (a[i] > (real_t)0.) {
sum += a[i];
}
}
}
return sum > 4;
}
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug middle-end/99411] s311, s312, s31111 and s31111, s3110 benchmark of TSVC is vectorized by clang better than by gcc
2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
` (2 preceding siblings ...)
2021-03-05 14:46 ` [Bug middle-end/99411] s311, s312, s31111 " hubicka at gcc dot gnu.org
@ 2021-03-05 14:49 ` hubicka at gcc dot gnu.org
2021-03-05 15:03 ` [Bug middle-end/99411] s311, s312, s31111, s31111, s3110, vsumr " hubicka at gcc dot gnu.org
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 14:49 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411
Jan Hubicka <hubicka at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Summary|s311, s312, s31111 and |s311, s312, s31111 and
|s31111 benchmark of TSVC is |s31111, s3110 benchmark of
|vectorized by clang better |TSVC is vectorized by clang
|than by gcc |better than by gcc
--- Comment #4 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
typedef float real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D];
real_t aa[LEN_2D][LEN_2D];
// Reproducer added to the bug summary as s3110: searches the 2-D array aa for
// its maximum element and records the row and column where it was found.
// Returns max + xindex+1 + yindex+1, converted from float to int.
int main()
{
// reductions
// if to max with index reduction 2 dimensions
// similar to S315
int xindex, yindex;
real_t max, chksum;
// The whole search is repeated 100*(iterations/(LEN_2D)) times; max and both
// indices are reset at the start of every pass.
for (int nl = 0; nl < 100*(iterations/(LEN_2D)); nl++) {
max = aa[(0)][0];
xindex = 0;
yindex = 0;
for (int i = 0; i < LEN_2D; i++) {
for (int j = 0; j < LEN_2D; j++) {
// Strict '>' keeps the first position at which the maximum occurs.
if (aa[i][j] > max) {
max = aa[i][j];
xindex = i;
yindex = j;
}
}
}
// chksum is assigned here but never read in this snippet.
chksum = max + (real_t) xindex + (real_t) yindex;
}
return max + xindex+1 + yindex+1;
}
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug middle-end/99411] s311, s312, s31111, s31111, s3110, vsumr benchmark of TSVC is vectorized by clang better than by gcc
2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
` (3 preceding siblings ...)
2021-03-05 14:49 ` [Bug middle-end/99411] s311, s312, s31111 and s31111, s3110 " hubicka at gcc dot gnu.org
@ 2021-03-05 15:03 ` hubicka at gcc dot gnu.org
2021-03-08 8:24 ` [Bug tree-optimization/99411] " rguenth at gcc dot gnu.org
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 15:03 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411
Jan Hubicka <hubicka at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Summary|s311, s312, s31111 and |s311, s312, s31111, s31111,
|s31111, s3110 benchmark of |s3110, vsumr benchmark of
|TSVC is vectorized by clang |TSVC is vectorized by clang
|better than by gcc |better than by gcc
--- Comment #5 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
typedef float real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D];
// Reproducer added to the bug summary as vsumr: sums every element of the
// global array a into one float accumulator, iterations*10 times over.
// Returns the final sum converted from float to int.
int main()
{
// control loops
// vector sum reduction
real_t sum;
// The outer loop only repeats the kernel; sum is reset on every pass.
for (int nl = 0; nl < iterations*10; nl++) {
sum = 0.;
// One serial chain of float additions.
for (int i = 0; i < LEN_1D; i++) {
sum += a[i];
}
}
return sum;
}
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug tree-optimization/99411] s311, s312, s31111, s31111, s3110, vsumr benchmark of TSVC is vectorized by clang better than by gcc
2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
` (4 preceding siblings ...)
2021-03-05 15:03 ` [Bug middle-end/99411] s311, s312, s31111, s31111, s3110, vsumr " hubicka at gcc dot gnu.org
@ 2021-03-08 8:24 ` rguenth at gcc dot gnu.org
2022-11-16 17:16 ` hubicka at gcc dot gnu.org
2023-01-11 22:36 ` hubicka at gcc dot gnu.org
7 siblings, 0 replies; 9+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-03-08 8:24 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Blocks| |53947
Keywords| |missed-optimization
Component|middle-end |tree-optimization
--- Comment #6 from Richard Biener <rguenth at gcc dot gnu.org> ---
So clang uses a larger VF (unroll of the vectorized loop) here. I think we
have another PR about this.
Referenced Bugs:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug tree-optimization/99411] s311, s312, s31111, s31111, s3110, vsumr benchmark of TSVC is vectorized by clang better than by gcc
2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
` (5 preceding siblings ...)
2021-03-08 8:24 ` [Bug tree-optimization/99411] " rguenth at gcc dot gnu.org
@ 2022-11-16 17:16 ` hubicka at gcc dot gnu.org
2023-01-11 22:36 ` hubicka at gcc dot gnu.org
7 siblings, 0 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2022-11-16 17:16 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411
--- Comment #7 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
With znver4 current trunk and clang15 I still see this problem (clang code is
about 60% faster) for s311, s312 and s3111.
Curiously, s31111 and s3110 no longer show a regression.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [Bug tree-optimization/99411] s311, s312, s31111, s31111, s3110, vsumr benchmark of TSVC is vectorized by clang better than by gcc
2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
` (6 preceding siblings ...)
2022-11-16 17:16 ` hubicka at gcc dot gnu.org
@ 2023-01-11 22:36 ` hubicka at gcc dot gnu.org
7 siblings, 0 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2023-01-11 22:36 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411
--- Comment #8 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
Compared to aocc we also do worse on zen4:
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native s311.c
jh@alberti:~/tsvc/bin> time ./a.out
real 0m3.207s
user 0m3.206s
sys 0m0.000s
jh@alberti:~/tsvc/bin> ~/aocc-compiler-4.0.0/bin/clang -Ofast -march=native
s311.c
jh@alberti:~/tsvc/bin> time ./a.out
real 0m1.221s
user 0m1.221s
sys 0m0.000s
The aocc code seems similar to clang's from two years ago, except for the
additional use of avx512.
# aocc 4.0.0 output for s311.c built with -Ofast -march=native.  The inner
# loop keeps four separate accumulators in 64-byte %zmm registers.
main: # @main
.cfi_startproc
# %bb.0: # %entry
# %eax counts outer-loop passes (compared against 1000000 further down).
xorl %eax, %eax
.p2align 4, 0x90
.LBB0_1: # %vector.ph
# =>This Loop Header: Depth=1
# Child Loop BB0_2 Depth 2
# Zero the four accumulators at the start of each pass.  %rcx is a negative
# byte offset that counts up towards zero.
vxorps %xmm0, %xmm0, %xmm0
movq $-128000, %rcx # imm = 0xFFFE0C00
vxorps %xmm1, %xmm1, %xmm1
vxorps %xmm2, %xmm2, %xmm2
vxorps %xmm3, %xmm3, %xmm3
.p2align 4, 0x90
.LBB0_2: # %vector.body
# Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
# Four 64-byte adds per iteration (256 bytes of a), each into its own
# register, so the four add chains do not depend on one another.
vaddps a+128000(%rcx), %zmm0, %zmm0
vaddps a+128064(%rcx), %zmm1, %zmm1
vaddps a+128128(%rcx), %zmm2, %zmm2
vaddps a+128192(%rcx), %zmm3, %zmm3
addq $256, %rcx # imm = 0x100
jne .LBB0_2
# %bb.3: # %middle.block
# in Loop: Header=BB0_1 Depth=1
incl %eax
cmpl $1000000, %eax # imm = 0xF4240
jne .LBB0_1
# %bb.4: # %for.cond.cleanup
# Only after the outer loop has finished: merge the four accumulators, then
# reduce 512 -> 256 -> 128 bits and finally within the 128-bit register.
vaddps %zmm0, %zmm1, %zmm0
xorl %eax, %eax
vaddps %zmm0, %zmm2, %zmm0
vaddps %zmm0, %zmm3, %zmm0
vextractf64x4 $1, %zmm0, %ymm1
vaddps %zmm1, %zmm0, %zmm0
vextractf128 $1, %ymm0, %xmm1
vaddps %xmm1, %xmm0, %xmm0
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddps %xmm1, %xmm0, %xmm0
vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3]
vaddss %xmm1, %xmm0, %xmm0
# Return value: 1 if the sum is above the constant at .LCPI0_0, else 0.
vucomiss .LCPI0_0(%rip), %xmm0
seta %al
vzeroupper
retq
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2023-01-11 22:36 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
2021-03-05 14:41 ` [Bug middle-end/99411] s311 and s31111 " hubicka at gcc dot gnu.org
2021-03-05 14:43 ` [Bug middle-end/99411] s311, s312 " hubicka at gcc dot gnu.org
2021-03-05 14:46 ` [Bug middle-end/99411] s311, s312, s31111 " hubicka at gcc dot gnu.org
2021-03-05 14:49 ` [Bug middle-end/99411] s311, s312, s31111 and s31111, s3110 " hubicka at gcc dot gnu.org
2021-03-05 15:03 ` [Bug middle-end/99411] s311, s312, s31111, s31111, s3110, vsumr " hubicka at gcc dot gnu.org
2021-03-08 8:24 ` [Bug tree-optimization/99411] " rguenth at gcc dot gnu.org
2022-11-16 17:16 ` hubicka at gcc dot gnu.org
2023-01-11 22:36 ` hubicka at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).