From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id 9561B3871018; Tue, 17 Aug 2021 09:26:48 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 9561B3871018 From: "linkw at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug tree-optimization/101944] New: suboptimal SLP for reduced case from namd_r Date: Tue, 17 Aug 2021 09:26:48 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: tree-optimization X-Bugzilla-Version: 12.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: linkw at gcc dot gnu.org X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-BeenThere: gcc-bugs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-bugs mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 17 Aug 2021 09:26:48 -0000 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101944 Bug ID: 101944 Summary: suboptimal SLP for reduced case from namd_r Product: gcc Version: 12.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: linkw at gcc dot gnu.org Target Milestone: --- For SPEC2017 bmk 508.namd_r, it's observed that it degraded by -3.73% at -O2 -ftree-slp-vectorize vs baseline -O2 on Power9 with either default cost model or very cheap cost model. By isolating functions, several functions are responsible for it. 
One typical case is the below reduced one: ------------- TEST CASE typedef double BigReal; extern BigReal table_four_i[8]; extern void func1(BigReal *); extern void func2(BigReal *); extern void func3(BigReal, BigReal, BigReal, BigReal); void foo(BigReal scaling, BigReal *A1, BigReal *B1, BigReal diffa) { BigReal vdwEnergy = 0; func1(&vdwEnergy); const BigReal A = scaling * *A1; const BigReal B = scaling * *B1; BigReal vdw_d = A * table_four_i[0] - B * table_four_i[2]; BigReal vdw_c = A * table_four_i[1] - B * table_four_i[3]; BigReal vdw_b = A * table_four_i[4] - B * table_four_i[6]; BigReal vdw_a = A * table_four_i[5] - B * table_four_i[7]; register BigReal vdw_val = ((diffa * vdw_d * (1 / 6.) + vdw_c * (1 / 4.)) * diffa + vdw_b * (1 / 2.)) * diffa + vdw_a; vdwEnergy -= vdw_val; func2 (&vdwEnergy); func3 (vdw_a, vdw_b, vdw_c, vdw_d); } ------------- Options: -O2 -ffast-math -ftree-slp-vectorize -mcpu=power9 Scalar version at optimized dumping: func1 (&vdwEnergy); _1 = *A1_32(D); A_34 = _1 * scaling_33(D); _2 = *B1_35(D); B_36 = _2 * scaling_33(D); _3 = table_four_i[0]; _5 = table_four_i[2]; _6 = _5 * B_36; vdw_d_37 = .FMS (_3, A_34, _6); _7 = table_four_i[1]; _9 = table_four_i[3]; _10 = _9 * B_36; vdw_c_38 = .FMS (_7, A_34, _10); _11 = table_four_i[4]; _13 = table_four_i[6]; _14 = _13 * B_36; vdw_b_39 = .FMS (_11, A_34, _14); _15 = table_four_i[5]; _17 = table_four_i[7]; _18 = _17 * B_36; vdw_a_40 = .FMS (_15, A_34, _18); _51 = diffa_41(D) * 1.666666666666666574148081281236954964697360992431640625e-1; _21 = vdw_c_38 * 2.5e-1; _22 = .FMA (vdw_d_37, _51, _21); _24 = vdw_b_39 * 5.0e-1; _25 = .FMA (_22, diffa_41(D), _24); _26 = _25 * diffa_41(D); vdwEnergy.0_27 = vdwEnergy; _49 = _18 + vdwEnergy.0_27; _52 = .FMA (_15, A_34, _26); _28 = _49 - _52; vdwEnergy = _28; func2 (&vdwEnergy); Vector version at optimized dumping: func1 (&vdwEnergy); _1 = *A1_32(D); A_34 = _1 * 
scaling_33(D); _49 = {A_34, A_34}; _2 = *B1_35(D); B_36 = _2 * scaling_33(D); _54 = {B_36, B_36}; vect__3.6_48 = MEM [(double *)&table_four_i]; vect__5.10_52 = MEM [(double *)&table_four_i + 16B]; vect__6.11_55 = vect__5.10_52 * _54; vect_vdw_d_37.12_56 = .FMS (vect__3.6_48, _49, vect__6.11_55); _58 = BIT_FIELD_REF ; _57 = BIT_FIELD_REF ; _11 = table_four_i[4]; _13 = table_four_i[6]; _14 = _13 * B_36; vdw_b_39 = .FMS (_11, A_34, _14); _15 = table_four_i[5]; _17 = table_four_i[7]; _18 = _17 * B_36; vdw_a_40 = .FMS (_15, A_34, _18); _51 = diffa_41(D) * 1.666666666666666574148081281236954964697360992431640625e-1; _59 = {_51, 2.5e-1}; vect__20.13_60 = vect_vdw_d_37.12_56 * _59; _61 = .REDUC_PLUS (vect__20.13_60); _24 = vdw_b_39 * 5.0e-1; _25 = .FMA (diffa_41(D), _61, _24); _26 = _25 * diffa_41(D); vdwEnergy.0_27 = vdwEnergy; _66 = _18 + vdwEnergy.0_27; _68 = .FMA (_15, A_34, _26); _28 = _66 - _68; vdwEnergy = _28; func2 (&vdwEnergy); reduc.c:24:34: note: Cost model analysis for part in loop 0: Vector cost: 16 Scalar cost: 17=