public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/64716] New: Missed vectorization in a hot code of SPEC2000 ammp
@ 2015-01-21 16:58 vmakarov at gcc dot gnu.org
2015-01-21 17:36 ` [Bug tree-optimization/64716] " vmakarov at gcc dot gnu.org
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: vmakarov at gcc dot gnu.org @ 2015-01-21 16:58 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64716
Bug ID: 64716
Summary: Missed vectorization in a hot code of SPEC2000 ammp
Product: gcc
Version: 5.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: vmakarov at gcc dot gnu.org
Created attachment 34521
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=34521&action=edit
Preprocessed rectmm.c from SPEC2000 ammp
GCC does not vectorize one of the hottest code regions in SPECFP2000 ammp
(function mm_fv_update_nonbon in file rectmm.c) on x86-64 when -Ofast
-march=core-avx2 -ffast-math -fno-schedule-insns2 is used. The
preprocessed rectmm.i is in the attachment.
The source code in the consideration is
r0 = 1./(*vector)[j+3];
r = r0*r0;
r = r*r*r;
xt = a1->q*a2->q*dielectric*r0;
yt = a1->a*a2->a*r;
zt = a1->b*a2->b*r*r;
k = xt - yt + zt;
xt = xt*r0; yt = yt*r0; zt = zt*r0;
k1 = xt - yt*6. + zt*12.;
xt = xt*r0; yt = yt*r0; zt = zt*r0;
k2 = xt*3.; ka2 = - yt*6.*8.; kb2 = zt*12.*14;
k1 = -k1;
xt = (*vector)[j]*r0 ;
yt = (*vector)[j+1]*r0 ;
zt = (*vector)[j+2] *r0;
a1->VP += k;
a2->dpx -= k1*xt;
a1->dpx += k1*xt;
a2->dpy -= k1*yt;
a1->dpy += k1*yt;
a2->dpz -= k1*zt;
a1->dpz += k1*zt;
xt2 = xt*xt; yt2 = yt*yt; zt2 = zt*zt;
a2->qxx -= k2*(xt2 - 1./3) + ka2*(xt2 - 1./8)+kb2*(xt2-1./14) ;
a1->qxx -= k2*(xt2 - 1./3) + ka2*(xt2 - 1./8)+kb2*(xt2-1./14) ;
a2->qxy -= (k2+ka2+kb2)*yt*xt;
a1->qxy -= (k2+ka2+kb2)*yt*xt;
a2->qxz -= (k2+ka2+kb2)*zt*xt;
a1->qxz -= (k2+ka2+kb2)*zt*xt;
a2->qyy -= k2*(yt2 - 1./3) + ka2*(yt2 - 1./8)+kb2*(yt2-1./14) ;
a1->qyy -= k2*(yt2 - 1./3) + ka2*(yt2 - 1./8)+kb2*(yt2-1./14) ;
a2->qyz -= (k2+ka2+kb2)*yt*zt;
a1->qyz -= (k2+ka2+kb2)*yt*zt;
a2->qzz -= k2*(zt2 - 1./3) + ka2*(zt2 - 1./8)+kb2*(zt2-1./14) ;
a1->qzz -= k2*(zt2 - 1./3) + ka2*(zt2 - 1./8)+kb2*(zt2-1./14) ;
GCC on the trunk generates 118 insns
.L85:
.cfi_restore_state
vmovsd .LC12(%rip), %xmm7
vdivsd %xmm0, %xmm7, %xmm6
vmulsd %xmm6, %xmm6, %xmm0
vmulsd %xmm0, %xmm0, %xmm10
vmulsd %xmm10, %xmm0, %xmm0
vmovsd 56(%rbx), %xmm12
vmulsd 56(%rdi), %xmm12, %xmm12
vmulsd %xmm4, %xmm12, %xmm12
vmulsd %xmm12, %xmm6, %xmm12
vmovsd 64(%rbx), %xmm10
vmulsd 64(%rdi), %xmm10, %xmm10
vmulsd %xmm10, %xmm0, %xmm11
vmovsd 72(%rbx), %xmm10
vmulsd 72(%rdi), %xmm10, %xmm10
vmulsd %xmm10, %xmm0, %xmm10
vmulsd %xmm0, %xmm10, %xmm10
vmulsd %xmm12, %xmm6, %xmm0
vmulsd %xmm11, %xmm6, %xmm1
vmulsd %xmm10, %xmm6, %xmm2
vmulsd .LC22(%rip), %xmm2, %xmm8
vfnmadd231sd %xmm9, %xmm1, %xmm8
vaddsd %xmm8, %xmm0, %xmm8
vmulsd .LC21(%rip), %xmm6, %xmm5
vmulsd %xmm0, %xmm5, %xmm5
vmulsd %xmm1, %xmm6, %xmm0
vxorpd %xmm15, %xmm0, %xmm0
vmulsd .LC24(%rip), %xmm0, %xmm3
vmulsd .LC25(%rip), %xmm6, %xmm7
vmulsd %xmm2, %xmm7, %xmm7
vxorpd %xmm15, %xmm8, %xmm8
movslq %esi, %rax
vmulsd (%r12,%rax,8), %xmm6, %xmm2
leal 1(%rsi), %eax
cltq
vmulsd (%r12,%rax,8), %xmm6, %xmm1
leal 2(%rsi), %eax
cltq
vmulsd (%r12,%rax,8), %xmm6, %xmm0
vaddsd 208(%rbx), %xmm12, %xmm12
vaddsd %xmm12, %xmm10, %xmm10
vsubsd %xmm11, %xmm10, %xmm10
vmovsd %xmm10, 208(%rbx)
vmovapd %xmm8, %xmm6
vfnmadd213sd 240(%rdi), %xmm2, %xmm6
vmovsd %xmm6, 240(%rdi)
vmovapd %xmm8, %xmm6
vfmadd213sd 240(%rbx), %xmm2, %xmm6
vmovsd %xmm6, 240(%rbx)
vmovapd %xmm8, %xmm6
vfnmadd213sd 248(%rdi), %xmm1, %xmm6
vmovsd %xmm6, 248(%rdi)
vmovapd %xmm8, %xmm6
vfmadd213sd 248(%rbx), %xmm1, %xmm6
vmovsd %xmm6, 248(%rbx)
vmovapd %xmm8, %xmm6
vfnmadd213sd 256(%rdi), %xmm0, %xmm6
vmovsd %xmm6, 256(%rdi)
vfmadd213sd 256(%rbx), %xmm0, %xmm8
vmovsd %xmm8, 256(%rbx)
vmovsd .LC26(%rip), %xmm8
vmovapd %xmm2, %xmm11
vfnmadd132sd %xmm2, %xmm8, %xmm11
vmulsd %xmm11, %xmm5, %xmm11
vmovsd .LC27(%rip), %xmm6
vmovapd %xmm2, %xmm10
vfnmadd132sd %xmm2, %xmm6, %xmm10
vmovapd %xmm10, %xmm12
vfmadd132sd %xmm7, %xmm11, %xmm12
vmovsd .LC28(%rip), %xmm10
vmovapd %xmm2, %xmm11
vfnmadd132sd %xmm2, %xmm10, %xmm11
vfmadd132sd %xmm3, %xmm12, %xmm11
vaddsd 264(%rdi), %xmm11, %xmm12
vmovsd %xmm12, 264(%rdi)
vaddsd 264(%rbx), %xmm11, %xmm11
vmovsd %xmm11, 264(%rbx)
vaddsd %xmm7, %xmm5, %xmm12
vaddsd %xmm12, %xmm3, %xmm12
vmulsd %xmm12, %xmm1, %xmm11
vmovapd %xmm2, %xmm13
vfnmadd213sd 272(%rdi), %xmm11, %xmm13
vmovsd %xmm13, 272(%rdi)
vmovapd %xmm2, %xmm13
vfnmadd213sd 272(%rbx), %xmm11, %xmm13
vmovsd %xmm13, 272(%rbx)
vmulsd %xmm0, %xmm2, %xmm2
vmovapd %xmm12, %xmm13
vfnmadd213sd 280(%rdi), %xmm2, %xmm13
vmovsd %xmm13, 280(%rdi)
vfnmadd213sd 280(%rbx), %xmm12, %xmm2
vmovsd %xmm2, 280(%rbx)
vmovapd %xmm1, %xmm2
vfnmadd132sd %xmm1, %xmm8, %xmm2
vmulsd %xmm2, %xmm5, %xmm12
vmovapd %xmm1, %xmm2
vfnmadd132sd %xmm1, %xmm6, %xmm2
vfmadd132sd %xmm7, %xmm12, %xmm2
vfnmadd132sd %xmm1, %xmm10, %xmm1
vfmadd132sd %xmm3, %xmm2, %xmm1
vaddsd 288(%rdi), %xmm1, %xmm2
vmovsd %xmm2, 288(%rdi)
vaddsd 288(%rbx), %xmm1, %xmm1
vmovsd %xmm1, 288(%rbx)
vmovapd %xmm0, %xmm1
vfnmadd213sd 296(%rdi), %xmm11, %xmm1
vmovsd %xmm1, 296(%rdi)
vfnmadd213sd 296(%rbx), %xmm0, %xmm11
vmovsd %xmm11, 296(%rbx)
vfnmadd231sd %xmm0, %xmm0, %xmm8
vmulsd %xmm8, %xmm5, %xmm5
vfnmadd231sd %xmm0, %xmm0, %xmm6
vfmadd132sd %xmm6, %xmm5, %xmm7
vfnmadd132sd %xmm0, %xmm10, %xmm0
vfmadd132sd %xmm3, %xmm7, %xmm0
vaddsd 304(%rdi), %xmm0, %xmm1
vmovsd %xmm1, 304(%rdi)
vaddsd 304(%rbx), %xmm0, %xmm0
vmovsd %xmm0, 304(%rbx)
LLVM-3.5 with -Ofast -ffast-math -march=core-avx2 generates
107 insns (10% less than GCC!):
.LBB0_135: # %if.then1703
# in Loop: Header=BB0_132 Depth=3
leal (,%r15,4), %eax
vmovsd .LCPI0_4(%rip), %xmm1
vdivsd %xmm0, %xmm1, %xmm1
vmulsd %xmm1, %xmm1, %xmm0
vmulsd %xmm0, %xmm0, %xmm2
vmulsd %xmm2, %xmm0, %xmm0
vmovsd 56(%r13), %xmm2
vmovsd 64(%r13), %xmm3
vmulsd 56(%rcx), %xmm2, %xmm2
vmovsd 368(%rsp), %xmm4 # 8-byte Reload
vmulsd %xmm2, %xmm4, %xmm2
vmulsd %xmm2, %xmm1, %xmm2
vmulsd 64(%rcx), %xmm3, %xmm3
vmulsd %xmm3, %xmm0, %xmm3
vmovsd 72(%r13), %xmm4
vmulsd 72(%rcx), %xmm4, %xmm4
vmulsd %xmm0, %xmm0, %xmm0
vmulsd %xmm4, %xmm0, %xmm0
vsubsd %xmm3, %xmm2, %xmm4
vaddsd %xmm0, %xmm4, %xmm5
vmulsd %xmm2, %xmm1, %xmm2
vmulsd %xmm3, %xmm1, %xmm3
vmulsd %xmm0, %xmm1, %xmm0
vmovsd .LCPI0_9(%rip), %xmm4
vfmsub213sd %xmm2, %xmm3, %xmm4
vmovsd .LCPI0_10(%rip), %xmm6
vfmadd213sd %xmm4, %xmm0, %xmm6
vmulsd %xmm2, %xmm1, %xmm2
vmulsd %xmm3, %xmm1, %xmm4
vmulsd %xmm0, %xmm1, %xmm0
vmulsd .LCPI0_11(%rip), %xmm2, %xmm11
vmulsd .LCPI0_12(%rip), %xmm4, %xmm14
vmulsd .LCPI0_13(%rip), %xmm0, %xmm10
cltq
vpermilpd $0, %xmm1, %xmm0 # xmm0 = xmm1[0,0]
vmulpd (%r11,%rax,8), %xmm0, %xmm0
orl $2, %eax
cltq
vmulsd (%r11,%rax,8), %xmm1, %xmm9
vaddsd 208(%r13), %xmm5, %xmm5
vmovsd %xmm5, 208(%r13)
vpermilpd $0, %xmm6, %xmm5 # xmm5 = xmm6[0,0]
vmulpd %xmm0, %xmm5, %xmm5
vmovupd 240(%rcx), %xmm7
vsubpd %xmm5, %xmm7, %xmm7
vmovupd %xmm7, 240(%rcx)
vaddpd 240(%r13), %xmm5, %xmm5
vmovupd %xmm5, 240(%r13)
vmulsd %xmm6, %xmm9, %xmm5
vmovsd 256(%rcx), %xmm6
vsubsd %xmm5, %xmm6, %xmm6
vmovsd %xmm6, 256(%rcx)
vaddsd 256(%r13), %xmm5, %xmm5
vmovsd %xmm5, 256(%r13)
vmulsd %xmm0, %xmm0, %xmm5
vunpckhpd %xmm0, %xmm0, %xmm8 # xmm8 = xmm0[1,1]
vmulsd %xmm8, %xmm8, %xmm15
vmulsd %xmm9, %xmm9, %xmm7
vmovsd .LCPI0_14(%rip), %xmm3
vaddsd %xmm3, %xmm5, %xmm1
vmovsd .LCPI0_15(%rip), %xmm4
vaddsd %xmm4, %xmm5, %xmm2
vmulsd %xmm2, %xmm14, %xmm2
vfmadd213sd %xmm2, %xmm11, %xmm1
vmovsd .LCPI0_16(%rip), %xmm6
vaddsd %xmm6, %xmm5, %xmm5
vfmadd213sd %xmm1, %xmm10, %xmm5
vaddsd %xmm3, %xmm15, %xmm1
vaddsd %xmm4, %xmm15, %xmm2
vmulsd %xmm2, %xmm14, %xmm2
vfmadd213sd %xmm2, %xmm11, %xmm1
vaddsd %xmm3, %xmm7, %xmm2
vaddsd %xmm4, %xmm7, %xmm3
vmulsd %xmm3, %xmm14, %xmm3
vfmadd213sd %xmm3, %xmm11, %xmm2
vaddsd %xmm14, %xmm11, %xmm3
vaddsd %xmm6, %xmm15, %xmm4
vfmadd213sd %xmm1, %xmm10, %xmm4
vaddsd %xmm6, %xmm7, %xmm1
vfmadd213sd %xmm2, %xmm10, %xmm1
vaddsd %xmm10, %xmm3, %xmm2
vmulsd %xmm2, %xmm8, %xmm3
vmulsd %xmm3, %xmm0, %xmm6
vunpcklpd %xmm6, %xmm5, %xmm5 # xmm5 = xmm5[0],xmm6[0]
vmovupd 264(%rcx), %xmm6
vsubpd %xmm5, %xmm6, %xmm6
vmovupd %xmm6, 264(%rcx)
vmovupd 264(%r13), %xmm6
vsubpd %xmm5, %xmm6, %xmm5
vmovupd %xmm5, 264(%r13)
vmulsd %xmm2, %xmm9, %xmm2
vmulsd %xmm2, %xmm0, %xmm0
vunpcklpd %xmm4, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm4[0]
vmovupd 280(%rcx), %xmm2
vsubpd %xmm0, %xmm2, %xmm2
vmovupd %xmm2, 280(%rcx)
vmovupd 280(%r13), %xmm2
vsubpd %xmm0, %xmm2, %xmm0
vmovupd %xmm0, 280(%r13)
vmulsd %xmm3, %xmm9, %xmm0
vunpcklpd %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm1[0]
vmovupd 296(%rcx), %xmm1
vsubpd %xmm0, %xmm1, %xmm1
vmovupd %xmm1, 296(%rcx)
vmovupd 296(%r13), %xmm1
vsubpd %xmm0, %xmm1, %xmm0
vmovupd %xmm0, 296(%r13)
It is achieved by vectorization, please see vsubpd and vmulpd in LLVM
generated code.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/64716] Missed vectorization in a hot code of SPEC2000 ammp
2015-01-21 16:58 [Bug tree-optimization/64716] New: Missed vectorization in a hot code of SPEC2000 ammp vmakarov at gcc dot gnu.org
@ 2015-01-21 17:36 ` vmakarov at gcc dot gnu.org
2015-01-21 19:37 ` jakub at gcc dot gnu.org
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: vmakarov at gcc dot gnu.org @ 2015-01-21 17:36 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64716
--- Comment #1 from Vladimir Makarov <vmakarov at gcc dot gnu.org> ---
Created attachment 34523
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=34523&action=edit
rectmm.c code annotated by gcov to see other hot code parts
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/64716] Missed vectorization in a hot code of SPEC2000 ammp
2015-01-21 16:58 [Bug tree-optimization/64716] New: Missed vectorization in a hot code of SPEC2000 ammp vmakarov at gcc dot gnu.org
2015-01-21 17:36 ` [Bug tree-optimization/64716] " vmakarov at gcc dot gnu.org
@ 2015-01-21 19:37 ` jakub at gcc dot gnu.org
2015-01-21 19:54 ` jakub at gcc dot gnu.org
2021-07-21 3:28 ` pinskia at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: jakub at gcc dot gnu.org @ 2015-01-21 19:37 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64716
Jakub Jelinek <jakub at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Last reconfirmed| |2015-01-21
CC| |jakub at gcc dot gnu.org
Blocks|53947 |
Ever confirmed|0 |1
--- Comment #2 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
This is about SLP vectorization, and for some reason we treat the whole bb as
one item to slp vectorize, rather than trying to vectorize just individual
statements where beneficial.
So we end up with:
pr64716.c:2633:12: note: Build SLP failed: unrolling required in basic block
SLP
pr64716.c:2633:12: note: Failed to SLP the basic block.
pr64716.c:2633:12: note: not vectorized: failed to find SLP opportunities in
basic block.
where 2633 in my copy is that r0 = 1./(*vector)[j+3]; line - group_size is 6 on
that stmt and nunits is 4 (for AVX2) or 2 (for 128-bit vectors).
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/64716] Missed vectorization in a hot code of SPEC2000 ammp
2015-01-21 16:58 [Bug tree-optimization/64716] New: Missed vectorization in a hot code of SPEC2000 ammp vmakarov at gcc dot gnu.org
2015-01-21 17:36 ` [Bug tree-optimization/64716] " vmakarov at gcc dot gnu.org
2015-01-21 19:37 ` jakub at gcc dot gnu.org
@ 2015-01-21 19:54 ` jakub at gcc dot gnu.org
2021-07-21 3:28 ` pinskia at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: jakub at gcc dot gnu.org @ 2015-01-21 19:54 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64716
--- Comment #3 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Say on:
a2->qyz -= (k2+ka2+kb2)*yt*zt;
a1->qyz -= (k2+ka2+kb2)*yt*zt;
a2->qzz -= k2*(zt2 - 1./3) + ka2*(zt2 - 1./8)+kb2*(zt2-1./14) ;
a1->qzz -= k2*(zt2 - 1./3) + ka2*(zt2 - 1./8)+kb2*(zt2-1./14) ;
it seems that
temp1 = (k2+ka2+kb2)*yt*zt
and
temp2 = k2*(zt2 - 1./3) + ka2*(zt2 - 1./8)+kb2*(zt2-1./14)
are computed in scalar code, then combined into a V2DFmode vector and the
a1->qyz -= temp1;
a1->qzz -= temp2;
a2->qyz -= temp1;
a2->qzz -= temp2;
is already performed using vectorized code. We'd need to carefully analyze the
costs if putting the scalars into the vector is beneficial, but supposedly it
is if the score shows that.
Or the:
xt = (*vector)[j] * r0;
yt = (*vector)[j + 1] * r0;
zt = (*vector)[j + 2] * r0;
a2->dpx -= k1 * xt;
a1->dpx += k1 * xt;
a2->dpy -= k1 * yt;
a1->dpy += k1 * yt;
a2->dpz -= k1 * zt;
a1->dpz += k1 * zt;
part shows that even though this would be ideally vectorized with V3DFmode
vectors, it can be vectorized using V2DFmode + scalar for the *z* elements.
Or say for a group of 6 we could consider vectorizing with a 4-unit vector and a
2-unit vector for the remainder (perhaps split apart the SLP instance for that,
and analyze each individually?).
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/64716] Missed vectorization in a hot code of SPEC2000 ammp
2015-01-21 16:58 [Bug tree-optimization/64716] New: Missed vectorization in a hot code of SPEC2000 ammp vmakarov at gcc dot gnu.org
` (2 preceding siblings ...)
2015-01-21 19:54 ` jakub at gcc dot gnu.org
@ 2021-07-21 3:28 ` pinskia at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-07-21 3:28 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64716
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Severity|normal |enhancement
Keywords| |missed-optimization
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2021-07-21 3:28 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-01-21 16:58 [Bug tree-optimization/64716] New: Missed vectorization in a hot code of SPEC2000 ammp vmakarov at gcc dot gnu.org
2015-01-21 17:36 ` [Bug tree-optimization/64716] " vmakarov at gcc dot gnu.org
2015-01-21 19:37 ` jakub at gcc dot gnu.org
2015-01-21 19:54 ` jakub at gcc dot gnu.org
2021-07-21 3:28 ` pinskia at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).