From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 11812 invoked by alias); 30 Nov 2012 13:54:01 -0000 Received: (qmail 11728 invoked by uid 48); 30 Nov 2012 13:53:42 -0000 From: "vincenzo.innocente at cern dot ch" To: gcc-bugs@gcc.gnu.org Subject: [Bug tree-optimization/51062] SLP vectorization of dot (inner) product Date: Fri, 30 Nov 2012 13:54:00 -0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: tree-optimization X-Bugzilla-Keywords: X-Bugzilla-Severity: enhancement X-Bugzilla-Who: vincenzo.innocente at cern dot ch X-Bugzilla-Status: NEW X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Changed-Fields: Version Message-ID: In-Reply-To: References: X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-bugs-owner@gcc.gnu.org X-SW-Source: 2012-11/txt/msg03003.txt.bz2 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51062 vincenzo Innocente changed: What |Removed |Added ---------------------------------------------------------------------------- Version|4.7.0 |4.8.0 --- Comment #3 from vincenzo Innocente 2012-11-30 13:53:41 UTC --- in 4.8 using typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t; typedef double __attribute__( ( vector_size( 32 ) ) ) float64x4_t; the scalar product works well IF WRITTEN as a loop! in the following dot_product2 produces exactly the code I would have expected to be emitted by "dot_product"..
would be nice to have also the reduction of a single vector to emit horizontal-sum… float dot_product(float32x4_t x, float32x4_t y) { float32x4_t res = x*y; float ret=0; for (int i=0;i!=4;++i) ret+=res[i]; return ret; } float dot_product2(float32x4_t x, float32x4_t y) { float ret=0; for (int i=0;i!=4;++i) ret+=x[i]*y[i]; return ret; } double dot_product(float64x4_t x, float64x4_t y) { float64x4_t res = x*y; double ret=0; for (int i=0;i!=4;++i) ret+=res[i]; return ret; } double dot_product2(float64x4_t x, float64x4_t y) { double ret=0; for (int i=0;i!=4;++i) ret+=x[i]*y[i]; return ret; } c++ -Ofast -ftree-vectorizer-verbose=2 -S cross.cc -march=corei7-avx; cat cross.s | c++filt dot_product(float __vector, float __vector): LFB2: vmulps %xmm1, %xmm0, %xmm1 vmovaps %xmm1, %xmm0 vshufps $85, %xmm1, %xmm1, %xmm2 vaddss %xmm0, %xmm2, %xmm0 vunpckhps %xmm1, %xmm1, %xmm2 vshufps $255, %xmm1, %xmm1, %xmm1 vaddss %xmm2, %xmm0, %xmm0 vaddss %xmm1, %xmm0, %xmm0 ret LFE2: .align 4,0x90 .globl dot_product2(float __vector, float __vector) dot_product2(float __vector, float __vector): LFB3: vmulps %xmm0, %xmm1, %xmm1 vhaddps %xmm1, %xmm1, %xmm0 vhaddps %xmm0, %xmm0, %xmm0 ret LFE3: .align 4,0x90 .globl dot_product(double __vector, double __vector) dot_product(double __vector, double __vector): LFB4: vmulpd %ymm1, %ymm0, %ymm1 vmovapd %xmm1, %xmm0 vextractf128 $0x1, %ymm1, %xmm1 vhaddpd %xmm0, %xmm0, %xmm0 vmovapd %xmm1, %xmm2 vunpckhpd %xmm1, %xmm1, %xmm1 vaddsd %xmm2, %xmm0, %xmm0 vaddsd %xmm1, %xmm0, %xmm0 vzeroupper ret LFE4: .align 4,0x90 .globl dot_product2(double __vector, double __vector) dot_product2(double __vector, double __vector): LFB5: vmulpd %ymm0, %ymm1, %ymm1 vhaddpd %ymm1, %ymm1, %ymm1 pushq %rbp LCFI0: movq %rsp, %rbp LCFI1: andq $-32, %rsp addq $16, %rsp vperm2f128 $1, %ymm1, %ymm1, %ymm0 vaddpd %ymm0, %ymm1, %ymm1 vmovapd %xmm1, %xmm0 vzeroupper leave LCFI2: ret