From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id 4DEE23858C36; Tue, 30 Jan 2024 09:29:39 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 4DEE23858C36 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1706606979; bh=5SK7obmusIo8S6/iugkzQWhSwAvFJ7c445Uk0UZG15s=; h=From:To:Subject:Date:In-Reply-To:References:From; b=i7MoG/EO/Soy5IVut9GK0YmzgaNLYGeckTBcWXmtts2fVEyadNYKfBJZHgRcvpAaw Di2Zlo0BsFUqylhj9riGQ4VL6kE3cmR692lefO9yLkprJWzsUFtD/cI4WmdgYXEtZf XXYaK5kFo0H3z+WdvbJUTRRdEnFUG7tqSCD12zEY= From: "liuhongt at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug target/113600] [14 regression] 525.x264_r run-time regresses by 8% with PGO -Ofast -march=znver4 Date: Tue, 30 Jan 2024 09:29:36 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: target X-Bugzilla-Version: 14.0 X-Bugzilla-Keywords: missed-optimization X-Bugzilla-Severity: normal X-Bugzilla-Who: liuhongt at gcc dot gnu.org X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: 14.0 X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: Message-ID: In-Reply-To: References: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D113600 --- Comment #5 from Hongtao Liu --- It looks like x264_pixel_satd_16x16 consumes more time after my commit. An extracted test case is below. Note that there is no __attribute__((always_inline)) in the original x264_pixel_satd_8x4; it is added here to force inlining (under PGO, the function is hot and will be inlined). typedef unsigned char uint8_t; typedef unsigned uint32_t; typedef unsigned short uint16_t; static inline uint32_t abs2( uint32_t a ) { uint32_t 
s =3D ((a>>15)&0x10001)*0xffff; return (a+s)^s; } int __attribute__((always_inline)) x264_pixel_satd_8x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) { uint32_t tmp[4][4]; uint32_t a0, a1, a2, a3; int sum =3D 0; for( int i =3D 0; i < 4; i++, pix1 +=3D i_pix1, pix2 +=3D i_pix2 ) { a0 =3D (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16); a1 =3D (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16); a2 =3D (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16); a3 =3D (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16); { int t0 =3D a0 + a1; int t1 =3D a0 - a1; int t2 =3D a2 + a3; int t3 = =3D a2 - a3; tmp[i][0] =3D t0 + t2; tmp[i][2] =3D t0 - t2; tmp[i][1] =3D t1 + t3; tmp[i]= [3] =3D t1 - t3;}; } for( int i =3D 0; i < 4; i++ ) { { int t0 =3D tmp[0][i] + tmp[1][i]; int t1 =3D tmp[0][i] - tmp[1][i];= int t2 =3D tmp[2][i] + tmp[3][i]; int t3 =3D tmp[2][i] - tmp[3][i]; a0 =3D t0 + t2= ; a2 =3D t0 - t2; a1 =3D t1 + t3; a3 =3D t1 - t3;}; sum +=3D abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); } return (((uint16_t)sum) + ((uint32_t)sum>>16)) >> 1; } int x264_pixel_satd_16x16( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_= pix2 ) { int sum =3D x264_pixel_satd_8x4( pix1, i_pix1, pix2, i_pix2 ) + x264_pixel_satd_8x4( pix1+4*i_pix1, i_pix1, pix2+4*i_pix2, i_pix2 ); sum+=3D x264_pixel_satd_8x4( pix1+8, i_pix1, pix2+8, i_pix2 ) + x264_pixel_satd_8x4( pix1+8+4*i_pix1, i_pix1, pix2+8+4*i_pix2, i_pix2= ); sum+=3D x264_pixel_satd_8x4( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2= ) + x264_pixel_satd_8x4( pix1+12*i_pix1, i_pix1, pix2+12*i_pix2, i_pix2 ); sum+=3D x264_pixel_satd_8x4( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_= pix2 ) + x264_pixel_satd_8x4( pix1+8+12*i_pix1, i_pix1, pix2+8+12*i_pix2, i_pi= x2 ); return sum; } After my commit, SLP fails to split the group of size 16 (vector int(16)) into smaller groups of 4 + 12 and misses vectorization for the case below. 
vect_t2_2445.784_8503 =3D VIEW_CONVERT_EXPR(_8502); vect__2457.786_8505 =3D vect_t0_2441.783_8501 - vect_t2_2445.784_8503; vect__2448.785_8504 =3D vect_t0_2441.783_8501 + vect_t2_2445.784_8503; _8506 =3D VEC_PERM_EXPR ; vect__2449.787_8507 =3D VIEW_CONVERT_EXPR(_8506); t3_2447 =3D (int) _2446; _2448 =3D t0_2441 + t2_2445; _2449 =3D (unsigned int) _2448; _2451 =3D t0_2441 - t2_2445; _2452 =3D (unsigned int) _2451; _2454 =3D t1_2443 + t3_2447; _2455 =3D (unsigned int) _2454; _2457 =3D t1_2443 - t3_2447; _2458 =3D (unsigned int) _2457; MEM [(unsigned int *)&tmp + 16B] =3D vect__2449.787_8507; The vector store will be optimized away together with the later vector load, so in the bad case there is an STLF (store-to-load forwarding) issue.