public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/99634] New: s2102 benchmarks of TSVC is vectorized better by icc than gcc
@ 2021-03-17 18:49 hubicka at gcc dot gnu.org
2021-03-18 9:03 ` [Bug middle-end/99634] s2102 benchmarks of TSVC is vectorized better by icc than gcc, interchange is missing rguenth at gcc dot gnu.org
2023-01-11 22:28 ` hubicka at gcc dot gnu.org
0 siblings, 2 replies; 3+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-17 18:49 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99634
Bug ID: 99634
Summary: s2102 benchmarks of TSVC is vectorized better by icc
than gcc
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
typedef float real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
// array definitions
// Only aa is referenced by the kernel below; the other arrays are unused in
// this excerpt.
real_t
a[LEN_2D],d[LEN_2D],aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D];
// Reduced reproducer for the s2102 kernel: repeatedly rewrites aa so that it
// holds an identity matrix.
// NOTE(review): struct args_t and dummy() are not declared anywhere in this
// excerpt; presumably they come from the TSVC harness -- confirm before
// compiling this stand-alone.
int main(struct args_t * func_args)
{
// diagonals
// identity matrix, best results vectorize both inner and outer loops
// The nl loop only repeats the kernel: 100*(iterations/LEN_2D) evaluates to
// 100*390 = 39000 repetitions (integer division).
for (int nl = 0; nl < 100*(iterations/LEN_2D); nl++) {
for (int i = 0; i < LEN_2D; i++) {
// Zero column i. j indexes the row (the slow dimension), so consecutive
// stores of this inner loop are LEN_2D elements apart, not contiguous.
for (int j = 0; j < LEN_2D; j++) {
aa[j][i] = (real_t)0.;
}
// Overwrite the diagonal element of the column just cleared. This store hits
// the same array as the inner-loop stores, so it must stay ordered after them.
aa[i][i] = (real_t)1.;
}
// Call to a function whose definition is not visible here, made once per
// repetition.
dummy();
}
// The float value is implicitly converted to main's int return type.
return aa[0][0];
}
is vectorized by icc as:
min:
# parameter 1: %rdi
..B1.1: # Preds ..B1.0
# Execution count [5.00e-03]
.cfi_startproc
..___tag_value_min.1:
..L2:
#36.1
pushq %rbp #36.1
.cfi_def_cfa_offset 16
movq %rsp, %rbp #36.1
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-32, %rsp #36.1
movl $aa, %edi #38.13
xorl %esi, %esi #38.13
movl $262144, %edx #38.13
call _intel_fast_memset #38.13
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.1
# Execution count [1.00e+00]
vmovups .L_2il0floatpacket.0(%rip), %ymm1 #41.24
xorl %edx, %edx #37.9
xorl %eax, %eax #37.9
vextractf128 $1, %ymm1, %xmm0 #41.13
# LOE rax rdx rbx r12 r13 r14 r15 xmm0 xmm1
..B1.3: # Preds ..B1.3 ..B1.2
# Execution count [2.56e+02]
vextractps $3, %xmm1, 44204+aa(%rax,%rdx,4) #41.13
lea (%rax,%rdx,4), %rcx #41.13
vmovss %xmm0, 45232+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm0, 46260+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm0, 47288+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm0, 48316+aa(%rax,%rdx,4) #41.13
vmovss %xmm1, 49344+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm1, 50372+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm1, 51400+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm1, 52428+aa(%rax,%rdx,4) #41.13
vmovss %xmm0, 53456+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm0, 54484+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm0, 55512+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm0, 56540+aa(%rax,%rdx,4) #41.13
vmovss %xmm1, 57568+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm1, 58596+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm1, 59624+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm1, 60652+aa(%rax,%rdx,4) #41.13
vmovss %xmm0, 61680+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm0, 62708+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm0, 63736+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm0, 64764+aa(%rax,%rdx,4) #41.13
vmovss %xmm1, 65792+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm1, 66820+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm1, 67848+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm1, 68876+aa(%rax,%rdx,4) #41.13
vmovss %xmm0, 69904+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm0, 70932+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm0, 71960+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm0, 72988+aa(%rax,%rdx,4) #41.13
vmovss %xmm1, 74016+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm1, 75044+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm1, 76072+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm1, 77100+aa(%rax,%rdx,4) #41.13
vmovss %xmm0, 78128+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm0, 79156+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm0, 80184+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm0, 81212+aa(%rax,%rdx,4) #41.13
vmovss %xmm1, 82240+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm1, 83268+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm1, 84296+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm1, 85324+aa(%rax,%rdx,4) #41.13
vmovss %xmm0, 86352+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm0, 87380+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm0, 88408+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm0, 89436+aa(%rax,%rdx,4) #41.13
vmovss %xmm1, 90464+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm1, 91492+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm1, 92520+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm1, 93548+aa(%rax,%rdx,4) #41.13
vmovss %xmm0, 94576+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm0, 95604+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm0, 96632+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm0, 97660+aa(%rax,%rdx,4) #41.13
vmovss %xmm1, 98688+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm1, 99716+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm1, 100744+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm1, 101772+aa(%rax,%rdx,4) #41.13
vmovss %xmm0, 102800+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm0, 103828+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm0, 104856+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm0, 105884+aa(%rax,%rdx,4) #41.13
vmovss %xmm1, 106912+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm1, 107940+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm1, 108968+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm1, 109996+aa(%rax,%rdx,4) #41.13
vmovss %xmm0, 111024+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm0, 112052+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm0, 113080+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm0, 114108+aa(%rax,%rdx,4) #41.13
vmovss %xmm1, 115136+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm1, 116164+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm1, 117192+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm1, 118220+aa(%rax,%rdx,4) #41.13
vmovss %xmm0, 119248+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm0, 120276+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm0, 121304+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm0, 122332+aa(%rax,%rdx,4) #41.13
vmovss %xmm1, 123360+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm1, 124388+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm1, 125416+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm1, 126444+aa(%rax,%rdx,4) #41.13
vmovss %xmm0, 127472+aa(%rax,%rdx,4) #41.13
vextractps $1, %xmm0, 128500+aa(%rax,%rdx,4) #41.13
vextractps $2, %xmm0, 129528+aa(%rax,%rdx,4) #41.13
vextractps $3, %xmm0, 130556+aa(%rax,%rdx,4) #41.13
addq $128, %rdx #37.9
addq $131072, %rax #37.9
vmovss %xmm1, aa(%rcx) #41.13
vextractps $1, %xmm1, 1028+aa(%rcx) #41.13
vextractps $2, %xmm1, 2056+aa(%rcx) #41.13
vextractps $3, %xmm1, 3084+aa(%rcx) #41.13
vmovss %xmm0, 4112+aa(%rcx) #41.13
vextractps $1, %xmm0, 5140+aa(%rcx) #41.13
vextractps $2, %xmm0, 6168+aa(%rcx) #41.13
vextractps $3, %xmm0, 7196+aa(%rcx) #41.13
vmovss %xmm1, 8224+aa(%rcx) #41.13
vextractps $1, %xmm1, 9252+aa(%rcx) #41.13
vextractps $2, %xmm1, 10280+aa(%rcx) #41.13
vextractps $3, %xmm1, 11308+aa(%rcx) #41.13
vmovss %xmm0, 12336+aa(%rcx) #41.13
vextractps $1, %xmm0, 13364+aa(%rcx) #41.13
vextractps $2, %xmm0, 14392+aa(%rcx) #41.13
vextractps $3, %xmm0, 15420+aa(%rcx) #41.13
vmovss %xmm1, 16448+aa(%rcx) #41.13
vextractps $1, %xmm1, 17476+aa(%rcx) #41.13
vextractps $2, %xmm1, 18504+aa(%rcx) #41.13
vextractps $3, %xmm1, 19532+aa(%rcx) #41.13
vmovss %xmm0, 20560+aa(%rcx) #41.13
vextractps $1, %xmm0, 21588+aa(%rcx) #41.13
vextractps $2, %xmm0, 22616+aa(%rcx) #41.13
vextractps $3, %xmm0, 23644+aa(%rcx) #41.13
vmovss %xmm1, 24672+aa(%rcx) #41.13
vextractps $1, %xmm1, 25700+aa(%rcx) #41.13
vextractps $2, %xmm1, 26728+aa(%rcx) #41.13
vextractps $3, %xmm1, 27756+aa(%rcx) #41.13
vmovss %xmm0, 28784+aa(%rcx) #41.13
vextractps $1, %xmm0, 29812+aa(%rcx) #41.13
vextractps $2, %xmm0, 30840+aa(%rcx) #41.13
vextractps $3, %xmm0, 31868+aa(%rcx) #41.13
vmovss %xmm1, 32896+aa(%rcx) #41.13
vextractps $1, %xmm1, 33924+aa(%rcx) #41.13
vextractps $2, %xmm1, 34952+aa(%rcx) #41.13
vextractps $3, %xmm1, 35980+aa(%rcx) #41.13
vmovss %xmm0, 37008+aa(%rcx) #41.13
vextractps $1, %xmm0, 38036+aa(%rcx) #41.13
vextractps $2, %xmm0, 39064+aa(%rcx) #41.13
vextractps $3, %xmm0, 40092+aa(%rcx) #41.13
vmovss %xmm1, 41120+aa(%rcx) #41.13
vextractps $1, %xmm1, 42148+aa(%rcx) #41.13
vextractps $2, %xmm1, 43176+aa(%rcx) #41.13
cmpq $256, %rdx #37.9
jb ..B1.3 # Prob 99% #37.9
# LOE rax rdx rbx r12 r13 r14 r15 xmm0 xmm1
..B1.4: # Preds ..B1.3
# Execution count [1.00e+00]
vzeroupper #43.1
movq %rbp, %rsp #43.1
popq %rbp #43.1
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #43.1
^ permalink raw reply [flat|nested] 3+ messages in thread
* [Bug middle-end/99634] s2102 benchmarks of TSVC is vectorized better by icc than gcc, interchange is missing
2021-03-17 18:49 [Bug middle-end/99634] New: s2102 benchmarks of TSVC is vectorized better by icc than gcc hubicka at gcc dot gnu.org
@ 2021-03-18 9:03 ` rguenth at gcc dot gnu.org
2023-01-11 22:28 ` hubicka at gcc dot gnu.org
1 sibling, 0 replies; 3+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-03-18 9:03 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99634
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |amker at gcc dot gnu.org
Ever confirmed|0 |1
Last reconfirmed| |2021-03-18
Summary|s2102 benchmarks of TSVC is |s2102 benchmarks of TSVC is
|vectorized better by icc |vectorized better by icc
|than gcc |than gcc, interchange is
| |missing
Status|UNCONFIRMED |NEW
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
We cannot analyze the dependence between aa[j][i] and aa[i][i] in outer loop
vectorization.
ICC seems to completely unroll the inner loop, doing scalar stores for
everything. The only thing "vectorized" is the constant. Not sure
why it uses vextractps at all though - probably an artifact of indeed
being produced by its vectorization.
So what ICC does is quite stupid and not vectorized. I guess simply
unrolling would end up being faster than ICC. Unfortunately we're
doing
.L3:
movl $0x00000000, (%rax)
movl $0x00000000, 1024(%rax)
movl $0x00000000, 2048(%rax)
addq $8192, %rax
movl $0x00000000, -5120(%rax)
movl $0x00000000, -4096(%rax)
movl $0x00000000, -3072(%rax)
movl $0x00000000, -2048(%rax)
movl $0x00000000, -1024(%rax)
cmpq %rdx, %rax
jne .L3
rather than using a register source operand. We also end up not
streaming to consecutive stores.
So the interesting transform is not vectorization but instead
doing interchange again. Not sure if we're confused by the
dependence (likely) and thus we'd need loop distribution again.
^ permalink raw reply [flat|nested] 3+ messages in thread
* [Bug middle-end/99634] s2102 benchmarks of TSVC is vectorized better by icc than gcc, interchange is missing
2021-03-17 18:49 [Bug middle-end/99634] New: s2102 benchmarks of TSVC is vectorized better by icc than gcc hubicka at gcc dot gnu.org
2021-03-18 9:03 ` [Bug middle-end/99634] s2102 benchmarks of TSVC is vectorized better by icc than gcc, interchange is missing rguenth at gcc dot gnu.org
@ 2023-01-11 22:28 ` hubicka at gcc dot gnu.org
1 sibling, 0 replies; 3+ messages in thread
From: hubicka at gcc dot gnu.org @ 2023-01-11 22:28 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99634
--- Comment #2 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
AOCC-produced code is:
.LBB0_2: # %vector.body
# Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
vpbroadcastq %rdx, %zmm4
kxnorw %k0, %k0, %k1
incq %rdx
vpsllq $2, %zmm4, %zmm4
vpaddq %zmm4, %zmm0, %zmm4
vpaddq %zmm7, %zmm4, %zmm5
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq .LCPI0_0(%rip), %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq .LCPI0_3(%rip), %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq .LCPI0_2(%rip), %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm11, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq .LCPI0_4(%rip), %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm13, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm12, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm15, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm14, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm17, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm16, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm19, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm18, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm21, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm20, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm23, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm22, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm25, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm24, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm27, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm26, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm29, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm28, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm31, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm30, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm2, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm1, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm8, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm6, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm5) {%k1}
vpaddq %zmm10, %zmm4, %zmm5
kxnorw %k0, %k0, %k1
vpaddq %zmm9, %zmm4, %zmm4
vscatterqps %ymm3, (,%zmm5) {%k1}
kxnorw %k0, %k0, %k1
vscatterqps %ymm3, (,%zmm4) {%k1}
movl $1065353216, (%rcx) # imm = 0x3F800000
addq $1028, %rcx # imm = 0x404
cmpq $256, %rdx # imm = 0x100
jne .LBB0_2
# %bb.3: # %for.cond.cleanup3
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2023-01-11 22:28 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-17 18:49 [Bug middle-end/99634] New: s2102 benchmarks of TSVC is vectorized better by icc than gcc hubicka at gcc dot gnu.org
2021-03-18 9:03 ` [Bug middle-end/99634] s2102 benchmarks of TSVC is vectorized better by icc than gcc, interchange is missing rguenth at gcc dot gnu.org
2023-01-11 22:28 ` hubicka at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).