From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugs-return-408341-listarch-gcc-bugs=gcc.gnu.org@gcc.gnu.org>
Received: (qmail 11812 invoked by alias); 30 Nov 2012 13:54:01 -0000
Received: (qmail 11728 invoked by uid 48); 30 Nov 2012 13:53:42 -0000
From: "vincenzo.innocente at cern dot ch" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/51062] SLP vectorization of dot (inner) product
Date: Fri, 30 Nov 2012 13:54:00 -0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: tree-optimization
X-Bugzilla-Keywords:
X-Bugzilla-Severity: enhancement
X-Bugzilla-Who: vincenzo.innocente at cern dot ch
X-Bugzilla-Status: NEW
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Changed-Fields: Version
Message-ID: <bug-51062-4-ufhv0YmD9K@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-51062-4@http.gcc.gnu.org/bugzilla/>
References: <bug-51062-4@http.gcc.gnu.org/bugzilla/>
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
MIME-Version: 1.0
Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-bugs.gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-bugs/>
List-Post: <mailto:gcc-bugs@gcc.gnu.org>
List-Help: <mailto:gcc-bugs-help@gcc.gnu.org>
Sender: gcc-bugs-owner@gcc.gnu.org
X-SW-Source: 2012-11/txt/msg03003.txt.bz2


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D51062

vincenzo Innocente <vincenzo.innocente at cern dot ch> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Version|4.7.0                       |4.8.0

--- Comment #3 from vincenzo Innocente <vincenzo.innocente at cern dot ch> =
2012-11-30 13:53:41 UTC ---
in 4.8 using=20
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;
typedef double __attribute__( ( vector_size( 32 ) ) ) float64x4_t;
the scalar product works well IF WRITTEN as a loop!

in the following=20
dot_product2 produces exactly the code I would have expected to be emitted =
by
"dot_product"..
It would be nice if the reduction of a single vector also emitted a
horizontal sum=E2=80=A6


float dot_product(float32x4_t x, float32x4_t y) {
  float32x4_t res =3D x*y;
  float ret=3D0;
  for (int i=3D0;i!=3D4;++i) ret+=3Dres[i];
  return ret;
}

float dot_product2(float32x4_t x, float32x4_t y) {
  float ret=3D0;
  for (int i=3D0;i!=3D4;++i) ret+=3Dx[i]*y[i];
  return ret;
}


double dot_product(float64x4_t x, float64x4_t y) {
  float64x4_t res =3D x*y;
  double ret=3D0;
  for (int i=3D0;i!=3D4;++i) ret+=3Dres[i];
  return ret;
}

double dot_product2(float64x4_t x, float64x4_t y) {
  double ret=3D0;
  for (int i=3D0;i!=3D4;++i) ret+=3Dx[i]*y[i];
  return ret;
}

c++ -Ofast -ftree-vectorizer-verbose=3D2 -S cross.cc  -march=3Dcorei7-avx; =
cat
cross.s | c++filt
dot_product(float __vector, float __vector):
LFB2:
    vmulps    %xmm1, %xmm0, %xmm1
    vmovaps    %xmm1, %xmm0
    vshufps    $85, %xmm1, %xmm1, %xmm2
    vaddss    %xmm0, %xmm2, %xmm0
    vunpckhps    %xmm1, %xmm1, %xmm2
    vshufps    $255, %xmm1, %xmm1, %xmm1
    vaddss    %xmm2, %xmm0, %xmm0
    vaddss    %xmm1, %xmm0, %xmm0
    ret
LFE2:
    .align 4,0x90
    .globl dot_product2(float __vector, float __vector)
dot_product2(float __vector, float __vector):
LFB3:
    vmulps    %xmm0, %xmm1, %xmm1
    vhaddps    %xmm1, %xmm1, %xmm0
    vhaddps    %xmm0, %xmm0, %xmm0
    ret
LFE3:
    .align 4,0x90
    .globl dot_product(double __vector, double __vector)
dot_product(double __vector, double __vector):
LFB4:
    vmulpd    %ymm1, %ymm0, %ymm1
    vmovapd    %xmm1, %xmm0
    vextractf128    $0x1, %ymm1, %xmm1
    vhaddpd    %xmm0, %xmm0, %xmm0
    vmovapd    %xmm1, %xmm2
    vunpckhpd    %xmm1, %xmm1, %xmm1
    vaddsd    %xmm2, %xmm0, %xmm0
    vaddsd    %xmm1, %xmm0, %xmm0
    vzeroupper
    ret
LFE4:
    .align 4,0x90
    .globl dot_product2(double __vector, double __vector)
dot_product2(double __vector, double __vector):
LFB5:
    vmulpd    %ymm0, %ymm1, %ymm1
    vhaddpd    %ymm1, %ymm1, %ymm1
    pushq    %rbp
LCFI0:
    movq    %rsp, %rbp
LCFI1:
    andq    $-32, %rsp
    addq    $16, %rsp
    vperm2f128    $1, %ymm1, %ymm1, %ymm0
    vaddpd    %ymm0, %ymm1, %ymm1
    vmovapd    %xmm1, %xmm0
    vzeroupper
    leave
LCFI2:
    ret