public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/51062] New: SLP vectorization of dot (inner) product
@ 2011-11-09 14:55 vincenzo.innocente at cern dot ch
2011-11-09 15:00 ` [Bug tree-optimization/51062] " rguenth at gcc dot gnu.org
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: vincenzo.innocente at cern dot ch @ 2011-11-09 14:55 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51062
Bug #: 51062
Summary: SLP vectorization of dot (inner) product
Classification: Unclassified
Product: gcc
Version: 4.7.0
Status: UNCONFIRMED
Severity: enhancement
Priority: P3
Component: tree-optimization
AssignedTo: unassigned@gcc.gnu.org
ReportedBy: vincenzo.innocente@cern.ch
SLP is working nicely in 4.7.
The most needed missing bit is the ability to vectorize a dot product (using,
for instance, _mm_dp_ps for SSE4).
Any chance of getting this any time soon?
small test here
cat dot.cc
struct V {
float x,y,z,w;
};
V a;
V b;
float dot() {
return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
}
V sum() {
V v=a;
v.x+=b.x; v.y+=b.y; v.z+=b.z; v.w+=b.w;
return v;
}
c++ -Ofast -c dot.cc -march=corei7
otool -X -t -v -V dot.o | c++filt
dot():
movss _b+0x00000004(%rip),%xmm0
movss _b(%rip),%xmm1
mulss _a+0x00000004(%rip),%xmm0
mulss _a(%rip),%xmm1
addss %xmm1,%xmm0
movss _b+0x00000008(%rip),%xmm1
mulss _a+0x00000008(%rip),%xmm1
addss %xmm1,%xmm0
movss _b+0x0000000c(%rip),%xmm1
mulss _a+0x0000000c(%rip),%xmm1
addss %xmm1,%xmm0
ret
nopl (%rax)
sum():
movaps _b(%rip),%xmm0
addps _a(%rip),%xmm0
movaps %xmm0,0xc8(%rsp)
movq 0xc8(%rsp),%rax
movaps %xmm0,0xe8(%rsp)
movq _a(%rsp),%xmm1
movd %rax,%xmm0
ret
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug tree-optimization/51062] SLP vectorization of dot (inner) product
2011-11-09 14:55 [Bug tree-optimization/51062] New: SLP vectorization of dot (inner) product vincenzo.innocente at cern dot ch
@ 2011-11-09 15:00 ` rguenth at gcc dot gnu.org
2011-11-09 16:28 ` irar at il dot ibm.com
2012-11-30 13:54 ` vincenzo.innocente at cern dot ch
2 siblings, 0 replies; 4+ messages in thread
From: rguenth at gcc dot gnu.org @ 2011-11-09 15:00 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51062
Richard Guenther <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Last reconfirmed| |2011-11-09
CC| |irar at gcc dot gnu.org
Ever Confirmed|0 |1
--- Comment #1 from Richard Guenther <rguenth at gcc dot gnu.org> 2011-11-09 14:58:42 UTC ---
I think we do not perform pattern detection in SLP mode. Ira?
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug tree-optimization/51062] SLP vectorization of dot (inner) product
2011-11-09 14:55 [Bug tree-optimization/51062] New: SLP vectorization of dot (inner) product vincenzo.innocente at cern dot ch
2011-11-09 15:00 ` [Bug tree-optimization/51062] " rguenth at gcc dot gnu.org
@ 2011-11-09 16:28 ` irar at il dot ibm.com
2012-11-30 13:54 ` vincenzo.innocente at cern dot ch
2 siblings, 0 replies; 4+ messages in thread
From: irar at il dot ibm.com @ 2011-11-09 16:28 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51062
Ira Rosen <irar at il dot ibm.com> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |irar at il dot ibm.com
--- Comment #2 from Ira Rosen <irar at il dot ibm.com> 2011-11-09 16:13:08 UTC ---
(In reply to comment #1)
> I think we do not perform pattern detection in SLP mode. Ira?
Right. I actually had a patch for pattern detection in SLP ready a couple of
hours after Stage 1 was over. But this patch doesn't handle dot product (and
widen-sum), since these patterns look for a reduction, i.e., a loop, so it will
need some additional work.
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug tree-optimization/51062] SLP vectorization of dot (inner) product
2011-11-09 14:55 [Bug tree-optimization/51062] New: SLP vectorization of dot (inner) product vincenzo.innocente at cern dot ch
2011-11-09 15:00 ` [Bug tree-optimization/51062] " rguenth at gcc dot gnu.org
2011-11-09 16:28 ` irar at il dot ibm.com
@ 2012-11-30 13:54 ` vincenzo.innocente at cern dot ch
2 siblings, 0 replies; 4+ messages in thread
From: vincenzo.innocente at cern dot ch @ 2012-11-30 13:54 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51062
vincenzo Innocente <vincenzo.innocente at cern dot ch> changed:
What |Removed |Added
----------------------------------------------------------------------------
Version|4.7.0 |4.8.0
--- Comment #3 from vincenzo Innocente <vincenzo.innocente at cern dot ch> 2012-11-30 13:53:41 UTC ---
in 4.8 using
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;
typedef double __attribute__( ( vector_size( 32 ) ) ) float64x4_t;
the scalar product works well IF WRITTEN as a loop!
In the following,
dot_product2 produces exactly the code I would have expected to be emitted for
"dot_product".
It would be nice to also have the reduction of a single vector emit a
horizontal-sum…
float dot_product(float32x4_t x, float32x4_t y) {
float32x4_t res = x*y;
float ret=0;
for (int i=0;i!=4;++i) ret+=res[i];
return ret;
}
float dot_product2(float32x4_t x, float32x4_t y) {
float ret=0;
for (int i=0;i!=4;++i) ret+=x[i]*y[i];
return ret;
}
double dot_product(float64x4_t x, float64x4_t y) {
float64x4_t res = x*y;
double ret=0;
for (int i=0;i!=4;++i) ret+=res[i];
return ret;
}
double dot_product2(float64x4_t x, float64x4_t y) {
double ret=0;
for (int i=0;i!=4;++i) ret+=x[i]*y[i];
return ret;
}
c++ -Ofast -ftree-vectorizer-verbose=2 -S cross.cc -march=corei7-avx; cat
cross.s | c++filt
dot_product(float __vector, float __vector):
LFB2:
vmulps %xmm1, %xmm0, %xmm1
vmovaps %xmm1, %xmm0
vshufps $85, %xmm1, %xmm1, %xmm2
vaddss %xmm0, %xmm2, %xmm0
vunpckhps %xmm1, %xmm1, %xmm2
vshufps $255, %xmm1, %xmm1, %xmm1
vaddss %xmm2, %xmm0, %xmm0
vaddss %xmm1, %xmm0, %xmm0
ret
LFE2:
.align 4,0x90
.globl dot_product2(float __vector, float __vector)
dot_product2(float __vector, float __vector):
LFB3:
vmulps %xmm0, %xmm1, %xmm1
vhaddps %xmm1, %xmm1, %xmm0
vhaddps %xmm0, %xmm0, %xmm0
ret
LFE3:
.align 4,0x90
.globl dot_product(double __vector, double __vector)
dot_product(double __vector, double __vector):
LFB4:
vmulpd %ymm1, %ymm0, %ymm1
vmovapd %xmm1, %xmm0
vextractf128 $0x1, %ymm1, %xmm1
vhaddpd %xmm0, %xmm0, %xmm0
vmovapd %xmm1, %xmm2
vunpckhpd %xmm1, %xmm1, %xmm1
vaddsd %xmm2, %xmm0, %xmm0
vaddsd %xmm1, %xmm0, %xmm0
vzeroupper
ret
LFE4:
.align 4,0x90
.globl dot_product2(double __vector, double __vector)
dot_product2(double __vector, double __vector):
LFB5:
vmulpd %ymm0, %ymm1, %ymm1
vhaddpd %ymm1, %ymm1, %ymm1
pushq %rbp
LCFI0:
movq %rsp, %rbp
LCFI1:
andq $-32, %rsp
addq $16, %rsp
vperm2f128 $1, %ymm1, %ymm1, %ymm0
vaddpd %ymm0, %ymm1, %ymm1
vmovapd %xmm1, %xmm0
vzeroupper
leave
LCFI2:
ret
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2012-11-30 13:54 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-11-09 14:55 [Bug tree-optimization/51062] New: SLP vectorization of dot (inner) product vincenzo.innocente at cern dot ch
2011-11-09 15:00 ` [Bug tree-optimization/51062] " rguenth at gcc dot gnu.org
2011-11-09 16:28 ` irar at il dot ibm.com
2012-11-30 13:54 ` vincenzo.innocente at cern dot ch
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).