From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
 id 3B2A33860C3E; Thu, 15 Apr 2021 06:35:57 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 3B2A33860C3E
From: "crazylht at gmail dot com" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/100089] New: [11 Performance regression ] 30%
 for denbench/mp2decoddata2 with -O3
Date: Thu, 15 Apr 2021 06:35:57 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: new
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: tree-optimization
X-Bugzilla-Version: 11.0
X-Bugzilla-Keywords: 
X-Bugzilla-Severity: normal
X-Bugzilla-Who: crazylht at gmail dot com
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status
 bug_severity priority component assigned_to reporter cc target_milestone
 cf_gcchost cf_gcctarget attachments.created
Message-ID: <bug-100089-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
X-BeenThere: gcc-bugs@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-bugs mailing list <gcc-bugs.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-bugs>,
 <mailto:gcc-bugs-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-bugs/>
List-Post: <mailto:gcc-bugs@gcc.gnu.org>
List-Help: <mailto:gcc-bugs-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-bugs>,
 <mailto:gcc-bugs-request@gcc.gnu.org?subject=subscribe>
X-List-Received-Date: Thu, 15 Apr 2021 06:35:57 -0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D100089

            Bug ID: 100089
           Summary: [11 Performance regression ] 30% for
                    denbench/mp2decoddata2 with -O3
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: crazylht at gmail dot com
                CC: hjl.tools at gmail dot com
  Target Milestone: ---
              Host: x86_64-pc-linux-gnu
            Target: x86_64-*-* i?86-*-*

Created attachment 50597
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=3D50597&action=3Dedit
denbench_mp2decoddata2.cpp

https://godbolt.org/z/EGoz1zx61

cat test.cpp

/*
 * Transform the eight values blk[0] .. blk[7] in place.
 *
 * Presumably one row pass of an 8x8 inverse DCT, going by the names
 * idctrow / Fast_IDCT -- confirm against the full benchmark source.
 *
 * blk: must point to at least eight e_s16 elements; indexes 0..7 are
 *      both read and written.
 */
static inline void idctrow(e_s16 *blk)
{
   e_s32 x0, x1, x2, x3, x4, x5, x6, x7, x8;


   /*
    * Load blk[1] .. blk[7] into x1 .. x7 (blk[4] is shifted left by 11
    * first) and OR the loaded values together.  The OR is zero only when
    * every one of the seven terms is zero.
    */
   if (!((x1 =3D blk[4]<<11) | (x2 =3D blk[6]) | (x3 =3D blk[2]) |
      (x4 =3D blk[1]) | (x5 =3D blk[7]) | (x6 =3D blk[5]) | (x7 =3D blk[3])=
))
   {
      /*
       * Shortcut: store blk[0]<<3 into all eight entries and return.
       * The (e_s16) cast binds to blk[0] before the shift, so the shift
       * itself happens in the promoted type and the narrowing is done by
       * the assignment.
       */
      blk[0]=3Dblk[1]=3Dblk[2]=3Dblk[3]=3Dblk[4]=3Dblk[5]=3Dblk[6]=3Dblk[7]=
=3D(e_s16)blk[0]<<3;
      return;
   }

   /* blk[0] shifted left by 11, plus 128 (presumably a rounding bias for
      the final >>8 -- TODO confirm). */
   x0 =3D (blk[0]<<11) + 128;


   /* Combine the pairs (x4, x5) and (x6, x7) with fixed integer
      multipliers; x8 is used as scratch for the shared product. */
   x8 =3D 565*(x4+x5);
   x4 =3D x8 + (2841 -565)*x4;
   x5 =3D x8 - (2841 +565)*x5;
   x8 =3D 2408*(x6+x7);
   x6 =3D x8 - (2408 -1609)*x6;
   x7 =3D x8 - (2408 +1609)*x7;


   /* Sum/difference steps.  The statement order matters: each variable
      is overwritten only after its previous value has been consumed. */
   x8 =3D x0 + x1;
   x0 -=3D x1;
   x1 =3D 1108*(x3+x2);
   x2 =3D x1 - (2676 +1108)*x2;
   x3 =3D x1 + (2676 -1108)*x3;
   x1 =3D x4 + x6;
   x4 -=3D x6;
   x6 =3D x5 + x7;
   x5 -=3D x7;
   x7 =3D x8 + x3;
   x8 -=3D x3;
   x3 =3D x0 + x2;
   x0 -=3D x2;
   /* x2 is computed from the old x4 and x5, then x4 from the same pair. */
   x2 =3D (181*(x4+x5)+128)>>8;
   x4 =3D (181*(x4-x5)+128)>>8;

   /* Write the eight results back, each shifted right by 8 and narrowed
      to e_s16. */
   blk[0] =3D (e_s16)((x7+x1)>>8);
   blk[1] =3D (e_s16)((x3+x2)>>8);
   blk[2] =3D (e_s16)((x0+x4)>>8);
   blk[3] =3D (e_s16)((x8+x6)>>8);
   blk[4] =3D (e_s16)((x8-x6)>>8);
   blk[5] =3D (e_s16)((x0-x4)>>8);
   blk[6] =3D (e_s16)((x3-x2)>>8);
   blk[7] =3D (e_s16)((x7-x1)>>8);

}

/*
 * Apply idctrow to eight consecutive groups of eight elements, starting
 * at block, block+8, ..., block+56.  The data is modified in place.
 *
 * block: must point to at least 64 e_s16 elements.
 * Returns 1 unconditionally.
 *
 * The noipa attribute is a GCC extension; presumably it is here to keep
 * this function as a standalone unit in the reproducer -- see the GCC
 * function attribute documentation.
 */
int
__attribute__ ((noipa))
Fast_IDCT(e_s16 *block)
{
   e_s32 i;

   /* One idctrow call per group of eight elements. */
   for (i=3D0; i<8; i++)
      idctrow(block+8*i);

   return 1;
}

pass_ifcvt transforms the if branch in idctrow into conditional moves.
pass_vect then finds that, although there is no loop vectorization
opportunity, there are opportunities for SLP. However, the SLP cost model
does not consider the cost of these conditional moves, so codegen ends up
with a large number of redundant test and cmov instructions.

test.cpp:76:11: note:   stmt 1 MEM[(e_s16 *)_3 + 2B] =3D _ifc__264;
test.cpp:76:11: note:   stmt 2 MEM[(e_s16 *)_3 + 4B] =3D _ifc__267;
test.cpp:76:11: note:   stmt 3 MEM[(e_s16 *)_3 + 6B] =3D _ifc__270;
test.cpp:76:11: note:   stmt 4 MEM[(e_s16 *)_3 + 8B] =3D _ifc__273;
test.cpp:76:11: note:   stmt 5 MEM[(e_s16 *)_3 + 10B] =3D _ifc__276;
test.cpp:76:11: note:   stmt 6 MEM[(e_s16 *)_3 + 12B] =3D _ifc__279;
test.cpp:76:11: note:   stmt 7 MEM[(e_s16 *)_3 + 14B] =3D _ifc__282;
test.cpp:76:11: note:   children 0x3ec9580
test.cpp:76:11: note: node (external) 0x3ec9580 (max_nunits=3D1, refcnt=3D1)
test.cpp:76:11: note:   { _ifc__261, _ifc__264, _ifc__267, _ifc__270,
_ifc__273, _ifc__276, _ifc__279, _ifc__282 }
test.cpp:76:11: note: Cost model analysis:=20
0x3c1aee0 _ifc__261 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__264 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__267 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__270 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__273 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__276 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__279 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__282 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__261 1 times unaligned_store (misalign -1) costs 12 in body
0x3c1aee0 <unknown> 1 times vec_construct costs 32 in prologue
test.cpp:76:11: note: Cost model analysis for part in loop 1:
  Vector cost: 44
  Scalar cost: 96


int Fast_IDCT (e_s16 * block)
{
  vector(8) short int * vectp.78;
  vector(8) short int * vectp.77;
  e_s32 x0;
  e_s32 x1;
  e_s32 x2;
  e_s32 x3;
  e_s32 x4;
  e_s32 x5;
  e_s32 x6;
  e_s32 x7;
  e_s32 x8;
  e_s32 i;
  long unsigned int i.0_1;
  long unsigned int _2;
  e_s16 * _3;
  unsigned long ivtmp_4;
  unsigned long ivtmp_5;
  short int _10;
  int _11;
  int _12;
  short int _14;
  short int _16;
  short int _17;
  short int _19;
  short int _20;
  short int _22;
  short int _23;
  short int _25;
  short int _26;
  short int _28;
  short int _29;
  long int _31;
  int _34;
  short int _35;
  int _38;
  int _39;
  long int _41;
  long int _43;
  long int _45;
  long int _47;
  long int _49;
  long int _51;
  long int _55;
  long int _57;
  long int _59;
  long int _69;
  long int _70;
  long int _71;
  long int _73;
  long int _74;
  long int _75;
  long int _77;
  long int _78;
  short int _79;
  long int _80;
  long int _81;
  short int _82;
  long int _83;
  long int _84;
  short int _85;
  long int _86;
  long int _87;
  short int _88;
  long int _89;
  long int _90;
  short int _91;
  long int _92;
  long int _93;
  short int _94;
  long int _95;
  long int _96;
  short int _97;
  long int _98;
  long int _99;
  short int _100;
  long int _118;
  bool _121;
  short int pretmp_159;
  int _160;
  short int _ifc__237;
  short int _ifc__240;
  short int _ifc__243;
  short int _ifc__246;
  short int _ifc__249;
  short int _ifc__252;
  short int _ifc__255;
  short int _ifc__258;
  short int _ifc__261;
  short int _ifc__264;
  short int _ifc__267;
  short int _ifc__270;
  short int _ifc__273;
  short int _ifc__276;
  short int _ifc__279;
  short int _ifc__282;
  vector(8) short int _1154;

  <bb 2> [local count: 119292720]:
  _121 =3D 1;

  <bb 3> [local count: 954449105]:
  # i_122 =3D PHI <i_9(8), 0(2)>
  # ivtmp_5 =3D PHI <ivtmp_4(8), 8(2)>
  i.0_1 =3D (long unsigned int) i_122;
  _2 =3D i.0_1 * 16;
  _3 =3D block_7(D) + _2;
  _10 =3D MEM[(e_s16 *)_3 + 8B];
  _11 =3D (int) _10;
  _12 =3D _11 << 11;
  x1_13 =3D (e_s32) _12;
  _14 =3D MEM[(e_s16 *)_3 + 12B];
  _17 =3D MEM[(e_s16 *)_3 + 4B];
  _16 =3D _14 | _17;
  _20 =3D MEM[(e_s16 *)_3 + 2B];
  _19 =3D _16 | _20;
  _23 =3D MEM[(e_s16 *)_3 + 14B];
  _22 =3D _19 | _23;
  _26 =3D MEM[(e_s16 *)_3 + 10B];
  _25 =3D _22 | _26;
  _29 =3D MEM[(e_s16 *)_3 + 6B];
  _28 =3D _25 | _29;
  _118 =3D (long int) _28;
  _31 =3D x1_13 | _118;
  pretmp_159 =3D *_3;
  _160 =3D (int) pretmp_159;
  _34 =3D _160 << 3;
  _35 =3D (short int) _34;
  _ifc__237 =3D _31 !=3D 0 ? _23 : _35;
  _ifc__240 =3D _31 !=3D 0 ? _14 : _35;
  _ifc__243 =3D _31 !=3D 0 ? _26 : _35;
  _ifc__246 =3D _31 !=3D 0 ? _10 : _35;
  _ifc__249 =3D _31 !=3D 0 ? _29 : _35;
  _ifc__252 =3D _31 !=3D 0 ? _17 : _35;
  _ifc__255 =3D _31 !=3D 0 ? _20 : _35;
  _ifc__258 =3D _31 =3D=3D 0 ? _35 : pretmp_159;
  x2_15 =3D (e_s32) _14;
  x3_18 =3D (e_s32) _17;
  x4_21 =3D (e_s32) _20;
  x5_24 =3D (e_s32) _23;
  x6_27 =3D (e_s32) _26;
  x7_30 =3D (e_s32) _29;
  _38 =3D _160 << 11;
  _39 =3D _38 + 128;
  x0_40 =3D (e_s32) _39;
  _41 =3D x4_21 + x5_24;
  x8_42 =3D _41 * 565;
  _43 =3D x4_21 * 2276;
  x4_44 =3D x8_42 + _43;
  _45 =3D x5_24 * -3406;
  x5_46 =3D x8_42 + _45;
  _47 =3D x6_27 + x7_30;
  x8_48 =3D _47 * 2408;
  _49 =3D x6_27 * -799;
  x6_50 =3D x8_48 + _49;
  _51 =3D x7_30 * -4017;
  x7_52 =3D x8_48 + _51;
  x8_53 =3D x1_13 + x0_40;
  x0_54 =3D x0_40 - x1_13;
  _55 =3D x2_15 + x3_18;
  x1_56 =3D _55 * 1108;
  _57 =3D x2_15 * -3784;
  x2_58 =3D x1_56 + _57;
  _59 =3D x3_18 * 1568;
  x3_60 =3D x1_56 + _59;
  x1_61 =3D x4_44 + x6_50;
  x4_62 =3D x4_44 - x6_50;
  x6_63 =3D x5_46 + x7_52;
  x5_64 =3D x5_46 - x7_52;
  x7_65 =3D x8_53 + x3_60;
  x8_66 =3D x8_53 - x3_60;
  x3_67 =3D x0_54 + x2_58;
  x0_68 =3D x0_54 - x2_58;
  _69 =3D x4_62 + x5_64;
  _70 =3D _69 * 181;
  _71 =3D _70 + 128;
  x2_72 =3D _71 >> 8;
  _73 =3D x4_62 - x5_64;
  _74 =3D _73 * 181;
  _75 =3D _74 + 128;
  x4_76 =3D _75 >> 8;
  _77 =3D x1_61 + x7_65;
  _78 =3D _77 >> 8;
  _79 =3D (short int) _78;
  _ifc__261 =3D _31 !=3D 0 ? _79 : _ifc__258;
  _80 =3D x3_67 + x2_72;
  _81 =3D _80 >> 8;
  _82 =3D (short int) _81;
  _ifc__264 =3D _31 !=3D 0 ? _82 : _ifc__255;
  _83 =3D x0_68 + x4_76;
  _84 =3D _83 >> 8;
  _85 =3D (short int) _84;
  _ifc__267 =3D _31 !=3D 0 ? _85 : _ifc__252;
  _86 =3D x6_63 + x8_66;
  _87 =3D _86 >> 8;
  _88 =3D (short int) _87;
  _ifc__270 =3D _31 !=3D 0 ? _88 : _ifc__249;
  _89 =3D x8_66 - x6_63;
  _90 =3D _89 >> 8;
  _91 =3D (short int) _90;
  _ifc__273 =3D _31 !=3D 0 ? _91 : _ifc__246;
  _92 =3D x0_68 - x4_76;
  _93 =3D _92 >> 8;
  _94 =3D (short int) _93;
  _ifc__276 =3D _31 !=3D 0 ? _94 : _ifc__243;
  _95 =3D x3_67 - x2_72;
  _96 =3D _95 >> 8;
  _97 =3D (short int) _96;
  _ifc__279 =3D _31 !=3D 0 ? _97 : _ifc__240;
  _98 =3D x7_65 - x1_61;
  _99 =3D _98 >> 8;
  _100 =3D (short int) _99;
  _ifc__282 =3D _31 !=3D 0 ? _100 : _ifc__237;
  _1154 =3D {_ifc__261, _ifc__264, _ifc__267, _ifc__270, _ifc__273, _ifc__2=
76,
_ifc__279, _ifc__282};
  vectp.78_1155 =3D _3;
  MEM <vector(8) short int> [(e_s16 *)vectp.78_1155] =3D _1154;
  i_9 =3D i_122 + 1;
  ivtmp_4 =3D ivtmp_5 - 1;
  if (ivtmp_4 !=3D 0)
    goto <bb 8>; [87.50%]
  else
    goto <bb 7>; [12.50%]

  <bb 8> [local count: 835156386]:
  goto <bb 3>; [100.00%]

  <bb 7> [local count: 119292720]:
  return 1;

}=