From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
 id 81368384C82F; Mon, 16 Aug 2021 05:08:55 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 81368384C82F
From: "crazylht at gmail dot com" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/101929] New: r12-2549 regress x264_r by 4% on
 CLX.
Date: Mon, 16 Aug 2021 05:08:55 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: new
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: tree-optimization
X-Bugzilla-Version: 12.0
X-Bugzilla-Keywords: 
X-Bugzilla-Severity: normal
X-Bugzilla-Who: crazylht at gmail dot com
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status
 bug_severity priority component assigned_to reporter cc target_milestone
 cf_gcchost cf_gcctarget
Message-ID: <bug-101929-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
X-BeenThere: gcc-bugs@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-bugs mailing list <gcc-bugs.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-bugs>,
 <mailto:gcc-bugs-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-bugs/>
List-Post: <mailto:gcc-bugs@gcc.gnu.org>
List-Help: <mailto:gcc-bugs-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-bugs>,
 <mailto:gcc-bugs-request@gcc.gnu.org?subject=subscribe>
X-List-Received-Date: Mon, 16 Aug 2021 05:08:55 -0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D101929

            Bug ID: 101929
           Summary: r12-2549 regress x264_r by 4% on CLX.
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: crazylht at gmail dot com
                CC: hjl.tools at gmail dot com, wwwhhhyyy333 at gmail dot c=
om
  Target Milestone: ---
              Host: x86_64-pc-linux-gnu
            Target: x86_64-*-* i?86-*-*

The regression is in x264_pixel_satd_8x4

typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned short uint16_t;

// in: a pseudo-simd number of the form x+(y<<16)
// return: abs(x)+(abs(y)<<16)
// Both 16-bit halves of the 32-bit word are handled by one expression.
static inline
uint32_t abs2( uint32_t a )
{
    // Bits 15 and 31 of a are shifted down to bits 0 and 16 and then
    // multiplied by 0xffff, so s holds 0xffff in each 16-bit half whose
    // top bit was set in a, and 0 in the other halves.
    uint32_t s =3D ((a>>15)&0x10001)*0xffff;
    // Add-then-xor with that mask. Looking at one half on its own, a
    // mask of 0xffff maps v to ~(v-1), which is -v in 16-bit two's
    // complement; a mask of 0 leaves v unchanged.
    return (a+s)^s;
}

// Four-input butterfly: forms the sums and differences of (s0, s1) and
// of (s2, s3) in int temporaries t0..t3, then assigns
//   d0 <- t0 + t2,  d2 <- t0 - t2,  d1 <- t1 + t3,  d3 <- t1 - t3.
// d0..d3 must be assignable expressions. Each of s0..s3 is expanded
// twice and without parentheses, so pass simple, side-effect-free
// operands. The expansion is a bare brace block, not do/while(0).
#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
    int t0 =3D s0 + s1;\
    int t1 =3D s0 - s1;\
    int t2 =3D s2 + s3;\
    int t3 =3D s2 - s3;\
    d0 =3D t0 + t2;\
    d2 =3D t0 - t2;\
    d1 =3D t1 + t3;\
    d3 =3D t1 - t3;\
}

// Reads 4 rows of 8 bytes from pix1 and from pix2, applies HADAMARD4
// along each row and then along each column of the packed differences,
// and returns a value derived from the sum of abs2() of the results.
// pix1 / pix2: first byte of each block; i_pix1 / i_pix2: amount added
// to the respective pointer after every row.
int
x264_pixel_satd_8x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    // tmp[i] receives the four HADAMARD4 outputs for row i.
    uint32_t tmp[4][4];
    uint32_t a0, a1, a2, a3;
    int sum =3D 0;
    // Row pass: both pointers step by their stride after each row.
    for( int i =3D 0; i < 4; i++, pix1 +=3D i_pix1, pix2 +=3D i_pix2 )
    {
        // Each aK packs two byte differences into one 32-bit word:
        // column K as the low term, column K+4 shifted left by 16
        // (the x+(y<<16) form that abs2() takes).
        a0 =3D (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
        a1 =3D (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
        a2 =3D (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
        a3 =3D (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 =
);
    }
    // Column pass: HADAMARD4 over tmp[0..3][i], then accumulate abs2()
    // of the four outputs into sum.
    for( int i =3D 0; i < 4; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][=
i]
);
        sum +=3D abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    // Add the low 16 bits of sum to its high 16 bits, then halve.
    return (((uint16_t)sum) + ((uint32_t)sum>>16)) >> 1;
}

After the cost of a vector CTOR was increased, slp1 no longer vectorizes
the code below:
git diff my.slp1 original.slp1

-  _820 =3D {_187, _189, _187, _189};
-  vect_t2_188.65_821 =3D VIEW_CONVERT_EXPR<vector(4) int>(_820);
-  vect__200.67_823 =3D vect_t0_184.64_819 - vect_t2_188.65_821;
-  vect__191.66_822 =3D vect_t0_184.64_819 + vect_t2_188.65_821;
-  _824 =3D VEC_PERM_EXPR <vect__191.66_822, vect__200.67_823, { 0, 1, 6, 7=
 }>;
-  vect__192.68_825 =3D VIEW_CONVERT_EXPR<vector(4) unsigned int>(_824);
   t3_190 =3D (int) _189;
   _191 =3D t0_184 + t2_188;
   _192 =3D (unsigned int) _191;
+  tmp[0][0] =3D _192;
   _194 =3D t0_184 - t2_188;
   _195 =3D (unsigned int) _194;
+  tmp[0][2] =3D _195;
   _197 =3D t1_186 + t3_190;
   _198 =3D (unsigned int) _197;
+  tmp[0][1] =3D _198;
   _200 =3D t1_186 - t3_190;
   _201 =3D (unsigned int) _200;
-  MEM <vector(4) unsigned int> [(unsigned int *)&tmp] =3D vect__192.68_825;
+  tmp[0][3] =3D _201;

But the vectorized version somehow helps fre eliminate redundant vector
loads, which then gives even better performance.

git diff dump.veclower21 dump.fre5

   MEM <vector(4) unsigned int> [(unsigned int *)&tmp + 48B] =3D vect__54.8=
9_852;
-  vect__63.9_482 =3D MEM <vector(4) unsigned int> [(unsigned int *)&tmp];
-  vect__64.12_478 =3D MEM <vector(4) unsigned int> [(unsigned int *)&tmp +=
 16B];
-  vect__65.13_477 =3D vect__63.9_482 + vect__64.12_478;
+  vect__65.13_477 =3D vect__192.68_825 + vect__273.75_834;
   vect_t0_100.14_476 =3D VIEW_CONVERT_EXPR<vector(4) int>(vect__65.13_477);
-  vect__67.15_475 =3D vect__63.9_482 - vect__64.12_478;
+  vect__67.15_475 =3D vect__192.68_825 - vect__273.75_834;
   vect_t1_101.16_474 =3D VIEW_CONVERT_EXPR<vector(4) int>(vect__67.15_475);
-  vect__68.19_470 =3D MEM <vector(4) unsigned int> [(unsigned int *)&tmp +=
 32B];
-  vect__69.22_466 =3D MEM <vector(4) unsigned int> [(unsigned int *)&tmp +=
 48B];
-  vect__70.23_465 =3D vect__68.19_470 + vect__69.22_466;
+  vect__70.23_465 =3D vect__354.82_843 + vect__54.89_852;

If slp1 could recognize this and include the upper part in the comparison
of scalar cost vs vector cost, gcc would vectorize, but currently it
doesn't.=