From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
 id D09003858405; Wed, 22 Dec 2021 13:44:10 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org D09003858405
From: "hubicka at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/103797] Clang vectorized LightPixel while GCC
 does not
Date: Wed, 22 Dec 2021 13:44:10 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: tree-optimization
X-Bugzilla-Version: unknown
X-Bugzilla-Keywords: missed-optimization
X-Bugzilla-Severity: enhancement
X-Bugzilla-Who: hubicka at gcc dot gnu.org
X-Bugzilla-Status: NEW
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: bug_status
Message-ID: <bug-103797-4-US1NMQQg6o@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-103797-4@http.gcc.gnu.org/bugzilla/>
References: <bug-103797-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
X-BeenThere: gcc-bugs@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-bugs mailing list <gcc-bugs.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-bugs>,
 <mailto:gcc-bugs-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-bugs/>
List-Post: <mailto:gcc-bugs@gcc.gnu.org>
List-Help: <mailto:gcc-bugs-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-bugs>,
 <mailto:gcc-bugs-request@gcc.gnu.org?subject=subscribe>
X-List-Received-Date: Wed, 22 Dec 2021 13:44:10 -0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D103797

Jan Hubicka <hubicka at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|WAITING                     |NEW
--- Comment #7 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
OK, here is a completely fake testcase that does similar operations:

#include <math.h>
struct test {float x; float y; float z;} test;
float f;
void
t()
{
  float x =3D test.x;
  float y =3D test.y;
  float z =3D test.z;

  x =3D x * f;
  y =3D y * f;
  z =3D z * f;
  x =3D sqrt (x);
  y =3D sqrt (y);
  z =3D sqrt (z);
  x =3D x / f;
  y =3D y / f;
  z =3D z / f;
  test.x=3Dx;
  test.y=3Dy;
  test.z=3Dz;
}

We seem to fail to vectorize it with:

t.c:20:9: missed:   op not supported by target.=20=20=20=20=20=20=20=20=20=
=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20
t.c:17:5: missed:   not vectorized: relevant stmt not supported: x_15 =3D x=
_24 /
f.0_1;

clang seems to use divps happily, so I am not sure why it is not supported.
Even funnier is that with -Ofast it is compiled into multiplication by the
reciprocal:

t:
.LFB0:
        .cfi_startproc
        movss   f(%rip), %xmm4
        movss   .LC0(%rip), %xmm2
        movss   test(%rip), %xmm0
        movss   test+4(%rip), %xmm3
        divss   %xmm4, %xmm2
        movss   test+8(%rip), %xmm1
        mulss   %xmm4, %xmm0
        mulss   %xmm4, %xmm3
        mulss   %xmm4, %xmm1
        sqrtss  %xmm0, %xmm0
        sqrtss  %xmm3, %xmm3
        sqrtss  %xmm1, %xmm1
        mulss   %xmm2, %xmm0
        mulss   %xmm2, %xmm3
        mulss   %xmm2, %xmm1
        unpcklps        %xmm3, %xmm0
        movlps  %xmm0, test(%rip)
        movss   %xmm1, test+8(%rip)
        ret


and rewriting it that way by hand:

#include <math.h>
struct test {float x; float y; float z;} test;
float f;
void
t()
{
  float x =3D test.x;
  float y =3D test.y;
  float z =3D test.z;
  float m =3D 1/f;

  x =3D x * f;
  y =3D y * f;
  z =3D z * f;
  x =3D sqrt (x);
  y =3D sqrt (y);
  z =3D sqrt (z);
  x =3D x * m;
  y =3D y * m;
  z =3D z * m;
  test.x=3Dx;
  test.y=3Dy;
  test.z=3Dz;
}

gets the expected result:
t:
.LFB0:
        .cfi_startproc
        movss   f(%rip), %xmm0
        movq    test(%rip), %xmm1
        movaps  %xmm0, %xmm2
        shufps  $0xe0, %xmm2, %xmm2
        mulps   %xmm1, %xmm2
        movss   .LC0(%rip), %xmm1
        divss   %xmm0, %xmm1
        mulss   test+8(%rip), %xmm0
        sqrtps  %xmm2, %xmm2
        sqrtss  %xmm0, %xmm0
        movaps  %xmm1, %xmm3
        shufps  $0xe0, %xmm3, %xmm3
        mulss   %xmm0, %xmm1
        mulps   %xmm3, %xmm2
        movss   %xmm1, test+8(%rip)
        movlps  %xmm2, test(%rip)
        ret
        .cfi_endproc

Having this, however, I do not see SLP analyzing the divide in the original
code at all.=