public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/99416] New: s211 benchmark of TSVC is vectorized by icc and not by gcc
@ 2021-03-05 16:20 hubicka at gcc dot gnu.org
2021-03-08 8:50 ` [Bug tree-optimization/99416] " rguenth at gcc dot gnu.org
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 16:20 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99416
Bug ID: 99416
Summary: s211 benchmark of TSVC is vectorized by icc and not by
gcc
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
typedef float real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
void main()
{
for (int nl = 0; nl < iterations; nl++) {
for (int i = 1; i < LEN_1D-1; i++) {
a[i] = b[i - 1] + c[i] * d[i];
b[i] = b[i + 1] - e[i] * d[i];
}
}
}
Icc produces:
ain:
..B1.1: # Preds ..B1.0
# Execution count [0.00e+00]
.cfi_startproc
..___tag_value_ain.1:
..L2:
#9.1
subq $136, %rsp #9.1
.cfi_def_cfa_offset 144
xorl %edx, %edx #11.5
lea 12+d(%rip), %r8 #14.38
vmovss (%r8), %xmm0 #14.38
movl $7, %edi #13.38
lea 12+e(%rip), %r9 #14.38
vmulss (%r9), %xmm0, %xmm12 #14.38
xorl %esi, %esi #13.38
lea 12+c(%rip), %r10 #13.38
vmulss (%r10), %xmm0, %xmm0 #13.38
vmovss 16(%r8), %xmm4 #14.38
movl $31977, %ecx #12.9
vmulss 16(%r9), %xmm4, %xmm14 #14.38
movl $31975, %eax #12.9
lea 24+b(%rip), %r11 #14.20
vmovss (%r11), %xmm11 #14.20
vmovss 4(%r8), %xmm6 #14.38
vmovss %xmm12, 104(%rsp) #14.38[spill]
vmovss %xmm11, 8(%rsp) #14.20[spill]
vmulss 4(%r9), %xmm6, %xmm12 #14.38
vmulss 4(%r10), %xmm6, %xmm11 #13.38
vmovss 127984+d(%rip), %xmm6 #14.38
vmovss 8(%r8), %xmm13 #14.38
vmovss %xmm14, 96(%rsp) #14.38[spill]
vmulss 127984+e(%rip), %xmm6, %xmm14 #14.38
vmulss 8(%r9), %xmm13, %xmm1 #14.38
vmovss %xmm14, 112(%rsp) #14.38[spill]
vmovss 127988+d(%rip), %xmm14 #14.38
vmovss %xmm1, 16(%rsp) #14.38[spill]
vmulss 8(%r10), %xmm13, %xmm1 #13.38
vmulss 16(%r10), %xmm4, %xmm13 #13.38
vmulss 127988+e(%rip), %xmm14, %xmm4 #14.38
vmovss %xmm4, 120(%rsp) #14.38[spill]
vmulss 127988+c(%rip), %xmm14, %xmm4 #13.38
vmovss -4(%r11), %xmm5 #14.20
vmovss -8(%r8), %xmm2 #14.38
vmovss 12(%r8), %xmm15 #14.38
vmovss %xmm4, 24(%rsp) #13.38[spill]
vmovss 127992+d(%rip), %xmm4 #14.38
vmovss %xmm5, (%rsp) #14.20[spill]
vmulss -8(%r9), %xmm2, %xmm3 #14.38
vmulss -8(%r10), %xmm2, %xmm5 #13.38
vmulss 12(%r9), %xmm15, %xmm2 #14.38
vmulss 12(%r10), %xmm15, %xmm15 #13.38
vmulss 127992+e(%rip), %xmm4, %xmm14 #14.38
vmulss 127992+c(%rip), %xmm4, %xmm4 #13.38
vmovss -4(%r8), %xmm10 #14.38
vmulss -4(%r9), %xmm10, %xmm7 #14.38
vmulss -4(%r10), %xmm10, %xmm10 #13.38
vmovss %xmm7, 88(%rsp) #14.38[spill]
vmovss %xmm4, 32(%rsp) #13.38[spill]
vmovss %xmm15, 56(%rsp) #13.31[spill]
vmovss %xmm14, 40(%rsp) #13.31[spill]
vmovss %xmm3, 80(%rsp) #13.31[spill]
vmovss -16(%r11), %xmm9 #14.20
vmovss -12(%r11), %xmm8 #14.20
vmovss -8(%r11), %xmm7 #14.20
vmovss 127984+c(%rip), %xmm4 #13.31
vmovss %xmm1, 64(%rsp) #13.31[spill]
vmovss %xmm0, 48(%rsp) #13.31[spill]
vmovss %xmm2, 72(%rsp) #13.31[spill]
vmovss 16(%rsp), %xmm14 #13.31[spill]
vmovss 8(%rsp), %xmm15 #13.31[spill]
vmovss (%rsp), %xmm3 #13.31[spill]
# LOE rax rcx rbx rbp rsi rdi r12 r13 r14 r15
edx xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
..B1.2: # Preds ..B1.10 ..B1.1
# Execution count [1.00e+05]
movq %rdi, %r8 #12.9
vsubss 80(%rsp), %xmm9, %xmm0 #14.38[spill]
vsubss 88(%rsp), %xmm8, %xmm1 #14.38[spill]
vsubss 104(%rsp), %xmm7, %xmm2 #14.38[spill]
vsubss %xmm14, %xmm15, %xmm7 #14.38
vsubss %xmm12, %xmm3, %xmm3 #14.38
vmovss 28+b(%rip), %xmm8 #14.20
vmovss 32+b(%rip), %xmm15 #14.20
vmovss %xmm0, 4+b(%rip) #14.13
vmovss %xmm1, 8+b(%rip) #14.13
vmovss %xmm2, 12+b(%rip) #14.13
vmovss %xmm3, 16+b(%rip) #14.13
vmovss %xmm7, 20+b(%rip) #14.13
vsubss 72(%rsp), %xmm8, %xmm9 #14.38[spill]
vsubss 96(%rsp), %xmm15, %xmm0 #14.38[spill]
vmovss %xmm9, 24+b(%rip) #14.13
vmovss %xmm0, 28+b(%rip) #14.13
# LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14
r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14
..B1.3: # Preds ..B1.3 ..B1.2
# Execution count [3.20e+09]
vmovups 4+e(,%r8,4), %ymm1 #14.31
lea (,%r8,4), %r9 #14.13
vmovups 36+e(,%r8,4), %ymm3 #14.31
vmovups 68+e(,%r8,4), %ymm8 #14.31
vmovups 100+e(,%r8,4), %ymm15 #14.31
vmovups 4+d(,%r8,4), %ymm0 #14.38
vmovups 36+d(,%r8,4), %ymm2 #14.38
vmovups 68+d(,%r8,4), %ymm7 #14.38
vmovups 100+d(,%r8,4), %ymm9 #14.38
vfnmadd213ps 8+b(,%r8,4), %ymm0, %ymm1 #14.38
vfnmadd213ps 40+b(,%r8,4), %ymm2, %ymm3 #14.38
vfnmadd213ps 72+b(,%r8,4), %ymm7, %ymm8 #14.38
vfnmadd213ps 104+b(,%r8,4), %ymm9, %ymm15 #14.38
vmovups %ymm1, 4+b(%r9) #14.13
vmovups %ymm3, 36+b(%r9) #14.13
vmovups %ymm8, 68+b(%r9) #14.13
vmovups %ymm15, 100+b(%r9) #14.13
addq $32, %r8 #12.9
cmpq $31975, %r8 #12.9
jb ..B1.3 # Prob 99% #12.9
# LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14
r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14
..B1.4: # Preds ..B1.3
# Execution count [1.00e+05]
movq %rsi, %r9 #12.9
movq %rcx, %r8 #12.9
# LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14
r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14
..B1.5: # Preds ..B1.5 ..B1.4
# Execution count [3.20e+09]
vmovups 127904+e(,%r9,4), %xmm1 #14.31
vmovups 127904+d(,%r9,4), %xmm0 #14.38
vfnmadd213ps b(,%r8,4), %xmm0, %xmm1 #14.38
addq $4, %r8 #12.9
vmovups %xmm1, 127904+b(,%r9,4) #14.13
addq $4, %r9 #12.9
cmpq $20, %r9 #12.9
jb ..B1.5 # Prob 99% #12.9
# LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14
r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14
..B1.6: # Preds ..B1.5
# Execution count [1.00e+05]
vmovss 127996+b(%rip), %xmm9 #14.20
movq %rdi, %r8 #12.9
vmovss 127992+b(%rip), %xmm1 #14.20
vmovss 127988+b(%rip), %xmm2 #14.20
vaddss b(%rip), %xmm5, %xmm7 #13.38
vaddss 4+b(%rip), %xmm10, %xmm3 #13.38
vsubss 40(%rsp), %xmm9, %xmm8 #14.38[spill]
vsubss 112(%rsp), %xmm2, %xmm2 #14.38[spill]
vsubss 120(%rsp), %xmm1, %xmm1 #14.38[spill]
vmovss %xmm7, 4+a(%rip) #13.13
vmovss 16+b(%rip), %xmm7 #13.20
vmovss %xmm3, 8+a(%rip) #13.13
vmovss 8+b(%rip), %xmm9 #13.20
vmovss %xmm8, 127992+b(%rip) #14.13
vmovss 12+b(%rip), %xmm8 #13.20
vmovss %xmm2, 127984+b(%rip) #14.13
vaddss %xmm11, %xmm8, %xmm0 #13.38
vaddss 64(%rsp), %xmm7, %xmm3 #13.38[spill]
vaddss 48(%rsp), %xmm9, %xmm15 #13.38[spill]
vmovss %xmm3, 20+a(%rip) #13.13
vmovss 20+b(%rip), %xmm3 #13.20
vmovss %xmm15, 12+a(%rip) #13.13
vmovss %xmm0, 16+a(%rip) #13.13
vmovss %xmm1, 127988+b(%rip) #14.13
vmovss %xmm9, (%rsp) #13.13[spill]
vaddss 56(%rsp), %xmm3, %xmm15 #13.38[spill]
vmovss %xmm15, 24+a(%rip) #13.13
vmovss 24+b(%rip), %xmm15 #13.20
vaddss %xmm13, %xmm15, %xmm0 #13.38
vmovss %xmm0, 28+a(%rip) #13.13
# LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15
edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
..B1.7: # Preds ..B1.7 ..B1.6
# Execution count [3.20e+09]
vmovups 4+c(,%r8,4), %ymm9 #13.31
lea (,%r8,4), %r9 #13.13
vmovups 4+d(,%r8,4), %ymm0 #13.38
vfmadd213ps b(,%r8,4), %ymm0, %ymm9 #13.38
vmovups 36+d(,%r8,4), %ymm0 #13.38
vmovups %ymm9, 4+a(%r9) #13.13
vmovups 36+c(,%r8,4), %ymm9 #13.31
vfmadd213ps 32+b(,%r8,4), %ymm0, %ymm9 #13.38
vmovups 68+d(,%r8,4), %ymm0 #13.38
vmovups %ymm9, 36+a(%r9) #13.13
vmovups 68+c(,%r8,4), %ymm9 #13.31
vfmadd213ps 64+b(,%r8,4), %ymm0, %ymm9 #13.38
vmovups 100+d(,%r8,4), %ymm0 #13.38
vmovups %ymm9, 68+a(%r9) #13.13
vmovups 100+c(,%r8,4), %ymm9 #13.31
vfmadd213ps 96+b(,%r8,4), %ymm0, %ymm9 #13.38
addq $32, %r8 #12.9
vmovups %ymm9, 100+a(%r9) #13.13
cmpq $31975, %r8 #12.9
jb ..B1.7 # Prob 99% #12.9
# LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14
r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14
xmm15
..B1.8: # Preds ..B1.7
# Execution count [1.00e+05]
movq %rsi, %r9 #12.9
movq %rax, %r8 #12.9
# LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14
r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14
xmm15
..B1.9: # Preds ..B1.9 ..B1.8
# Execution count [3.20e+09]
vmovups 127904+c(,%r9,4), %xmm9 #13.31
vmovups 127904+d(,%r9,4), %xmm0 #13.38
vfmadd213ps b(,%r8,4), %xmm0, %xmm9 #13.38
addq $4, %r8 #12.9
vmovups %xmm9, 127904+a(,%r9,4) #13.13
addq $4, %r9 #12.9
cmpq $20, %r9 #12.9
jb ..B1.9 # Prob 99% #12.9
# LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14
r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14
xmm15
..B1.10: # Preds ..B1.9
# Execution count [1.07e+09]
incl %edx #11.5
vmovss 127980+b(%rip), %xmm0 #13.20
vmovss (%rsp), %xmm9 #[spill]
vfmadd231ss %xmm6, %xmm4, %xmm0 #13.38
cmpl $100000, %edx #11.5
jb ..B1.2 # Prob 99% #11.5
# LOE rax rcx rbx rbp rsi rdi r12 r13 r14 r15
edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13
xmm14 xmm15
..B1.11: # Preds ..B1.10
# Execution count [1.00e+00]
vmovss %xmm0, 127984+a(%rip) #13.13
vaddss 32(%rsp), %xmm1, %xmm1 #13.38[spill]
vaddss 24(%rsp), %xmm2, %xmm2 #13.38[spill]
vmovss %xmm1, 127992+a(%rip) #13.13
vmovss %xmm2, 127988+a(%rip) #13.13
vzeroupper #17.1
addq $136, %rsp #17.1
.cfi_def_cfa_offset 8
ret #17.1
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/99416] s211 benchmark of TSVC is vectorized by icc and not by gcc
2021-03-05 16:20 [Bug middle-end/99416] New: s211 benchmark of TSVC is vectorized by icc and not by gcc hubicka at gcc dot gnu.org
@ 2021-03-08 8:50 ` rguenth at gcc dot gnu.org
2022-07-11 13:20 ` rguenth at gcc dot gnu.org
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-03-08 8:50 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99416
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Blocks| |53947
Last reconfirmed| |2021-03-08
Ever confirmed|0 |1
Status|UNCONFIRMED |NEW
Component|middle-end |tree-optimization
Keywords| |missed-optimization
CC| |amker at gcc dot gnu.org,
| |rguenth at gcc dot gnu.org
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
Confirmed. ICC applies loop distribution but again our cost-modeling doesn't
want that to happen.
I suspect we want to detect extra incentives there (make dependences "good",
allow interchange, etc.)
Referenced Bugs:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/99416] s211 benchmark of TSVC is vectorized by icc and not by gcc
2021-03-05 16:20 [Bug middle-end/99416] New: s211 benchmark of TSVC is vectorized by icc and not by gcc hubicka at gcc dot gnu.org
2021-03-08 8:50 ` [Bug tree-optimization/99416] " rguenth at gcc dot gnu.org
@ 2022-07-11 13:20 ` rguenth at gcc dot gnu.org
2022-07-11 13:27 ` rguenth at gcc dot gnu.org
2022-07-12 9:59 ` rguenth at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu.org @ 2022-07-11 13:20 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99416
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|NEW |ASSIGNED
Assignee|unassigned at gcc dot gnu.org |rguenth at gcc dot gnu.org
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
Note that after "fixing" (disabling) the costing issue we get to:
Fuse partitions because they are in the same dependence scc:
Part 1: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 17, 18, 20, 21
Part 2: 1, 2, 3, 7, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21
Loop nest 1 not distributed.
Still, a[i] = b[i - 1] can be performed separately, but only as the second
loop, because b[i] = b[i + 1] needs to be performed first. That means the
dependence analysis interpretation needs improvement.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/99416] s211 benchmark of TSVC is vectorized by icc and not by gcc
2021-03-05 16:20 [Bug middle-end/99416] New: s211 benchmark of TSVC is vectorized by icc and not by gcc hubicka at gcc dot gnu.org
2021-03-08 8:50 ` [Bug tree-optimization/99416] " rguenth at gcc dot gnu.org
2022-07-11 13:20 ` rguenth at gcc dot gnu.org
@ 2022-07-11 13:27 ` rguenth at gcc dot gnu.org
2022-07-12 9:59 ` rguenth at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu.org @ 2022-07-11 13:27 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99416
--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
Note it's only the outer loop that confuses us here. With that removed we have
the following because of yet another "heuristic" to disable distribution.
Possible alias data dependence to break:
Fuse partitions because there is no point to distribute loop:
Part 1: 0, 1, 5, 9, 10, 11, 12, 13, 14, 15, 16
Part 2: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 16
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/99416] s211 benchmark of TSVC is vectorized by icc and not by gcc
2021-03-05 16:20 [Bug middle-end/99416] New: s211 benchmark of TSVC is vectorized by icc and not by gcc hubicka at gcc dot gnu.org
` (2 preceding siblings ...)
2022-07-11 13:27 ` rguenth at gcc dot gnu.org
@ 2022-07-12 9:59 ` rguenth at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu.org @ 2022-07-12 9:59 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99416
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #3)
> Note it's only the outer loop that confuses us here. With that removed we
> have
> the following because of yet another "heuristic" to disable distribution.
In fact we first analyze the whole nest but then continue to look at the inner
loop only, so this isn't really an issue.
The fusing because of shared memory refs is only because of the double use
of d[i]; b[i], b[i-1] or b[i+1] are not detected as problematic for
distribution (the "same memory object" check isn't working as intended).
Fuse partitions because they have shared memory refs:
Part 1: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 16
Part 2: 0, 1, 5, 9, 10, 11, 12, 13, 14, 15, 16
note the intersection of both partitions includes half of the stmts
(0, 1, 5, 9, 15, 16) that would be duplicated (5 is the d[i] load) while
the other half is different.
To defeat the final fusing reason we need a positive motivation, like
tracking whether we know a partition can or cannot be vectorized (or
whether we are not sure). For the partition containing the b[i], b[i+1]
dependence distance of 1 we know we cannot vectorize (with a VF > 1).
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2022-07-12 9:59 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-05 16:20 [Bug middle-end/99416] New: s211 benchmark of TSVC is vectorized by icc and not by gcc hubicka at gcc dot gnu.org
2021-03-08 8:50 ` [Bug tree-optimization/99416] " rguenth at gcc dot gnu.org
2022-07-11 13:20 ` rguenth at gcc dot gnu.org
2022-07-11 13:27 ` rguenth at gcc dot gnu.org
2022-07-12 9:59 ` rguenth at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).