From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
	id 5E9293858CDB; Fri, 13 Oct 2023 10:19:48 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 5E9293858CDB
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1697192388;
	bh=s+9hHK6eEPhIz4MkiHTWTcEZTVoP947cZ6tQPkZ2q6o=;
	h=From:To:Subject:Date:In-Reply-To:References:From;
	b=yMnYI0xXwwJMfQC93j1U10R95/wnP1XgXfrKEOcP03bVHzRfQx92cn6vDFv2FSfR8
	 d9u9oQw0YTLyPTwnj1PRC131+lq1w16fONFxwzqYSaXKprk4BNd4hZVol55pUu5ZBd
	 8mvSCgAJXfWBv/idFfRAasdXF0Zfbd/5v4SsbINA=
From: "linkw at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/111591] ppc64be: miscompilation with -mstrict-align /
 -O3
Date: Fri, 13 Oct 2023 10:19:46 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: target
X-Bugzilla-Version: 13.2.0
X-Bugzilla-Keywords: needs-bisection
X-Bugzilla-Severity: normal
X-Bugzilla-Who: linkw at gcc dot gnu.org
X-Bugzilla-Status: ASSIGNED
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: linkw at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: 
Message-ID: <bug-111591-4-D4GrXdSj0R@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-111591-4@http.gcc.gnu.org/bugzilla/>
References: <bug-111591-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
List-Id: <gcc-bugs.sourceware.org>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D111591
--- Comment #16 from Kewen Lin <linkw at gcc dot gnu.org> ---
Tracing it down with template specialization, the abort happens on

  auto vn_b =3D Load(dn, in_b.get());
  HWY_ASSERT_VEC_EQ(
      dw, vw_signed_max,
      SatWidenMulPairwiseAdd(
          dw, InterleaveLower(dn_u, BitCast(dn_u, vn_b), vn_unsigned_max),
          InterleaveLower(dn, vn_b, vn_signed_max)));

with "void operator()(int8_t, CappedTag<int8_t, 8> dn)"

By isolating it, I found that "b0" doesn't get the expected result in function

template <class DI16, class VU8, class VI8>
HWY_API Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) {
  RebindToUnsigned<decltype(di16)> du16;
  auto a0 =3D And(BitCast(di16, a), Set(di16, 255));
  auto b0 =3D ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b)));
  auto a1 =3D BitCast(di16, ShiftRight<8>(BitCast(du16, a)));
  auto b1 =3D ShiftRight<8>(BitCast(di16, b));
  return SaturatedAdd(Mul(a0, b0), Mul(a1, b1));
}

specialized with=20
template <> HWY_API Vec128<int16_t, 4> SatWidenMulPairwiseAdd(Simd<int16_t,=
 4,
0> di16, Vec128<uint8_t, 8> a, Vec128<int8_t, 8> b)

Further, I found that the unexpected values come from ShiftLeft<8>. The
tree-optimized code looks as expected, but the final insn sequence is in
the wrong order. Either -fdisable-rtl-sched2 or -fdisable-rtl-sched1 makes
it pass. With a counter, I see an unexpected insn movement in sched2 on
insn 395.

...

 1436: %10:DI=3D0x70
      REG_EQUIV 0x70
 1438: %9:DI=3D0xc0
      REG_EQUIV 0xc0
 1437: %8:DI=3D0x1e0
      REG_EQUIV 0x1e0
 1441: %7:DI=3D0xd0
      REG_EQUIV 0xd0
  389: %0:V2DI=3D[%1:DI+%9:DI]
      REG_DEAD %9:DI
      REG_EQUAL [sfp:DI+0xc0]
 1445: %5:DI=3D0xb0
      REG_EQUIV 0xb0
 1714: %9:DI=3D0xff0000
      REG_EQUIV 0xff0000
  373: [%1:DI+0x70]=3D%4:DI
      REG_DEAD %4:DI
  375: [%1:DI+0x78]=3D%6:DI
      REG_DEAD %6:DI
 1715: %9:DI=3D%9:DI|0xff
 1785: %25:DI=3Dhigh(unspec[`*.LC8',%2:DI] 47)
 1716: %9:DI=3D%9:DI&0xffffffff|%9:DI<<0x20
      REG_EQUIV 0xff00ff00ff00ff
  410: %28:DI=3D%1:DI+0xae
      REG_EQUAL sfp:DI+0xae
    6: %31:SI=3D0
      REG_EQUAL 0
 1786: %25:DI=3D%25:DI+low(unspec[`*.LC8',%2:DI] 47)
      REG_DEAD %2:DI
      REG_EQUAL `*.LC8'
  392: [%1:DI+%7:DI]=3D%0:V2DI
      REG_DEAD %7:DI
                                         // in the unexpected version,
                                         // insn 395 is moved here.
 1738: %12:V2DI=3D[%1:DI+%10:DI]
  376: [%1:DI+%8:DI]=3D%12:V2DI
      REG_DEAD %12:V2DI
      REG_DEAD %8:DI
      REG_EQUIV [sfp:DI+%8:DI]
      REG_EQUAL [sfp:DI+0x70]
  390: [%1:DI+%10:DI]=3D%0:V2DI            // this store writes 16 bytes
                                         // at [%1:DI+0x70], so the read
                                         // below can't be moved above it
      REG_DEAD %0:V2DI
  395: %4:DI=3Dzero_extend([%1:DI+0x70])   //  <------ this is expected
  398: %6:DI=3Dzero_extend([%1:DI+0x72])
  401: %7:DI=3Dzero_extend([%1:DI+0x74])
  404: %8:DI=3Dzero_extend([%1:DI+0x76])
  396: %4:SI=3D%4:SI<<0x8
  399: %6:SI=3D%6:SI<<0x8
  402: %7:SI=3D%7:SI<<0x8
  405: %8:SI=3D%8:SI<<0x8

 ....

The tree-optimized IR for this part looks as expected?

  <bb 51> [local count: 119292722]:
  v =3D a;
  MEM <unsigned char[16]> [(char * {ref-all})&D.38735] =3D MEM <unsigned
char[16]> [(char * {ref-all})&v];
  v =3D{v} {CLOBBER(eol)};
  vect_a_raw_0_1121.562_722 =3D MEM <vector(4) short int> [(short int
*)&D.38735];
  _215 =3D VIEW_CONVERT_EXPR<long unsigned int>(vect_a_raw_0_1121.562_722);
  _830 =3D _215 & 71777214294589695;
  _1549 =3D BIT_FIELD_REF <_830, 16, 32>;
  _1537 =3D BIT_FIELD_REF <_830, 16, 16>;
  _323 =3D BIT_FIELD_REF <_830, 16, 0>;
  v =3D b;
  MEM <unsigned char[16]> [(char * {ref-all})&b00] =3D MEM <unsigned char[1=
6]>
[(char * {ref-all})&v];=20

                          =3D=3D> ref-all here, so this should be executed
                          before any reads below?

  v =3D{v} {CLOBBER(eol)};
  v =3D b00;
  raw_u_1323 =3D v.raw[0];
  _1324 =3D raw_u_1323 << 8;
  v.raw[0] =3D _1324;
  raw_u_1403 =3D v.raw[1];
  _1404 =3D raw_u_1403 << 8;
  v.raw[1] =3D _1404;
  raw_u_1447 =3D v.raw[2];
  _1448 =3D raw_u_1447 << 8;
  v.raw[2] =3D _1448;
  raw_u_128 =3D v.raw[3];
  _129 =3D raw_u_128 << 8;
  v.raw[3] =3D _129;
  b01 =3D v;
  v =3D{v} {CLOBBER(eol)};
  ivtmp.577_734 =3D (unsigned long) &MEM <struct Vec128> [(void *)&b01 + -2=
B];

...

I guess there is some way to keep this kind of aliasing information after
expansion; it needs more investigation into why sched considers it safe to
move.=