From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
	id A82C338582BE; Wed,  7 Feb 2024 18:28:50 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org A82C338582BE
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1707330530;
	bh=i0K3/tCtqmwsTMX5LIQCr+yFG/13hCVFi8c0DpALPQQ=;
	h=From:To:Subject:Date:In-Reply-To:References:From;
	b=hfa3RsRH92EGqgqHjkNGh5UYQROPL3bD0vsM5KcmSNJsmCuxGIEMoi1kcQcWugfqD
	 Us4S4fx+IfjX/D5PIGLuIC5NziBUvErhAGgBkcj7+83kreCvd8BIJ4cOj2KV/o1L+P
	 vky74I+onc251lYZ7WYEDaoe88Pth0evMx3npu1c=
From: "jakub at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/113774] wrong code with _BitInt() arithmetics
 at -O2
Date: Wed, 07 Feb 2024 18:28:49 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: tree-optimization
X-Bugzilla-Version: 14.0
X-Bugzilla-Keywords: wrong-code
X-Bugzilla-Severity: normal
X-Bugzilla-Who: jakub at gcc dot gnu.org
X-Bugzilla-Status: NEW
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: everconfirmed cc bug_status cf_reconfirmed_on
Message-ID: <bug-113774-4-UEBbDB3BOR@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-113774-4@http.gcc.gnu.org/bugzilla/>
References: <bug-113774-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
List-Id: <gcc-bugs.sourceware.org>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D113774

Jakub Jelinek <jakub at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
     Ever confirmed|0                           |1
                 CC|                            |rguenth at gcc dot gnu.org
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2024-02-07
--- Comment #2 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
/* PR tree-optimization/113774 */
/* { dg-do run { target bitint } } */
/* { dg-options "-std=3Dc23 -pedantic-errors" } */
/* { dg-skip-if "" { ! run_expensive_tests }  { "*" } { "-O0" "-O2" } } */
/* { dg-skip-if "" { ! run_expensive_tests } { "-flto" } { "" } } */

#if __BITINT_MAXWIDTH__ >=3D 512
unsigned _BitInt(512) u;
unsigned _BitInt(512) v;

void
foo (unsigned _BitInt(255) a, unsigned _BitInt(257) b, unsigned _BitInt(512)
*r)
{
  b +=3D v;
  b |=3D a - b;
  unsigned _BitInt(512) c =3D b * 6;
  unsigned _BitInt(512) h =3D c >> u;
  *r =3D h;
}
#endif

int
main ()
{
#if __BITINT_MAXWIDTH__ >=3D 512
  unsigned _BitInt(512) x;
  foo (0x10000000000000000wb, 0x10000000000000001wb, &x);
  if (x !=3D
0x1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffawb)
    __builtin_abort ();
#endif
  return 0;
}

This looks like a PRE bug to me.
The bug at runtime is that b |=3D a - b (i.e. the operand of the multiplica=
tion)
which ought to be
0x01ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffuwb I t=
hink
is different.
What bitint lowering emits to compute this are 2 separate loops + straight =
line
code after each to handle the most significant limb of 257-bit unsigned
_BitInts.
The first computes b +=3D v; and because v is 0, it doesn't change anything=
 and
uses
the same underlying variable (the PARM_DECL) for both the input and result,=
 the
second
one performs the b |=3D a - b which is complicated because there needs to b=
e zero
extension from 255-bit a to 257 bits.  This loop handles 2 limbs at a time,=
 so
iterates twice:
  <bb 5> [local count: 1073741824]:
  # _49 =3D PHI <0(4), _50(11)>
  # _55 =3D PHI <0(4), _56(11)>
  _51 =3D VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_49];
  if (_49 < 3)
    goto <bb 6>; [80.00%]
  else
    goto <bb 7>; [20.00%]

  <bb 6> [local count: 1073741824]:
  _52 =3D VIEW_CONVERT_EXPR<unsigned long[4]>(a)[_49];

  <bb 7> [local count: 1073741824]:
  # _53 =3D PHI <_52(6), 0(5)>
  _54 =3D VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_49];
  _57 =3D .USUBC (_53, _54, _55);
  _58 =3D IMAGPART_EXPR <_57>;
  _59 =3D REALPART_EXPR <_57>;
  _60 =3D _59 | _51;
  bitint.6[_49] =3D _60;
  _61 =3D _49 + 1;
  _62 =3D VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_61];
  if (_61 <=3D 3)
    goto <bb 8>; [80.00%]
  else
    goto <bb 11>; [20.00%]

  <bb 8> [local count: 1073741824]:
  if (_61 =3D=3D 3)
    goto <bb 10>; [20.00%]
  else
    goto <bb 9>; [80.00%]

  <bb 9> [local count: 1073741824]:
  _63 =3D VIEW_CONVERT_EXPR<unsigned long[4]>(a)[_61];
  goto <bb 11>; [100.00%]

  <bb 10> [local count: 214748360]:
  _64 =3D MEM <unsigned long> [(unsigned _BitInt(255) *)&a + 24B];
  _65 =3D (<unnamed-unsigned:63>) _64;
  _66 =3D (unsigned long) _65;

  <bb 11> [local count: 1073741824]:
  # _67 =3D PHI <_63(9), 0(7), _66(10)>
  _68 =3D VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_61];
  _69 =3D .USUBC (_67, _68, _58);
  _56 =3D IMAGPART_EXPR <_69>;
  _70 =3D REALPART_EXPR <_69>;
  _71 =3D _70 | _62;
  bitint.6[_61] =3D _71;
  _50 =3D _49 + 2;
  if (_50 !=3D 4)
    goto <bb 5>; [0.05%]
  else
    goto <bb 12>; [99.95%]

  <bb 12> [local count: 1073741824]:
  _72 =3D MEM <unsigned long> [(unsigned _BitInt(257) *)&b + 32B];
  _73 =3D (<unnamed-unsigned:1>) _72;
  _74 =3D MEM <unsigned long> [(unsigned _BitInt(257) *)&b + 32B];
  _75 =3D (<unnamed-unsigned:1>) _74;
  _76 =3D (unsigned long) _75;
  _77 =3D .USUBC (0, _76, _56);
  _78 =3D IMAGPART_EXPR <_77>;
  _79 =3D REALPART_EXPR <_77>;
  _80 =3D (<unnamed-unsigned:1>) _79;
  _81 =3D _80 | _73;
  _82 =3D (unsigned long) _81;
  MEM[(unsigned long *)&bitint.6 + 32B] =3D _82;
  a =3D{v} {CLOBBER(eos)};

bitint.6 is the result of b | (zext (a) - b) which is then used as argument=
 to
__mulbitint3.
Now, there is something I should improve, eventhough later optimizations li=
ke
VRP IMHO ought to be able to figure it out, because the loop iterates just
twice, _49 will be in the [0, 2] range (actually 0 or 2), so _49 < 3 condit=
ion
is actually always true and similarly _61 <=3D 3 is also always true (becau=
se _61
is in [1, 3] range (actually 1 or 3)).
Guess I should in the handle_cast handling look at the m_upwards_2limb exact
value and figure out conditions which will be always true or always false.

Anyway, with the above which is IMHO correct but non-optimal let's see what
happens to the processing of the second least significant limb, i.e. the se=
cond
half of the first iteration which is what is IMHO miscompiled during PRE.
First thread1 pass goes wild and threads everything it thinks it is possibl=
e to
thread.
The second least significant limb is handled in
  # _67 =3D PHI <_63(8), 0(6)>
  # _144 =3D PHI <_143(8), _136(6)>
  # _146 =3D PHI <_145(8), _140(6)>
  # _148 =3D PHI <_147(8), _141(6)>
  _68 =3D VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_146];
  _69 =3D .USUBC (_67, _68, _144);
  _56 =3D IMAGPART_EXPR <_69>;
  _70 =3D REALPART_EXPR <_69>;
  _71 =3D _70 | _148;
  bitint.6[_146] =3D _71;
and the important thing to look at is the second .USUBC argument there, so =
_68,
load from b's _146
index, and second LS limb is entering this from bb 8 with
  <bb 8> [local count: 1073741824]:
  # _143 =3D PHI <_10(7), _136(6)>
  # _145 =3D PHI <_4(7), _140(6)>
  # _147 =3D PHI <_3(7), _141(6)>
  _63 =3D VIEW_CONVERT_EXPR<unsigned long[4]>(a)[_145];
  goto <bb 10>; [100.00%]
entered from bb 7, where
  _4 =3D _49 + 1;
and earlier:
  <bb 5> [local count: 1073741824]:
  # _49 =3D PHI <0(4), _50(10)>
  # _55 =3D PHI <0(4), _56(10)>
is the head of the whole loop entered from bb 4, so _146 is 1 in the first
iteration.
All good.  This is the state which is also in walloca2.
But PRE changes this to:
@@ -144,9 +251,10 @@ void foo (unsigned _BitInt(255) a, unsig
   # DEBUG BEGIN_STMT

   <bb 5> [local count: 1073741824]:
-  # _49 =3D PHI <0(4), _50(28)>
-  # _55 =3D PHI <0(4), _56(28)>
+  # _49 =3D PHI <0(4), _50(10)>
+  # _55 =3D PHI <0(4), _56(10)>
   _51 =3D VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_49];
+  _179 =3D _49 + 1;
   if (_49 <=3D 2)
     goto <bb 7>; [80.00%]
   else
@@ -158,9 +266,7 @@ void foo (unsigned _BitInt(255) a, unsig
   _137 =3D REALPART_EXPR <_135>;
   _138 =3D _51 | _137;
   bitint.6[_49] =3D _138;
-  _140 =3D _49 + 1;
-  _141 =3D VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_140];
-  if (_140 <=3D 3)
+  if (_179 <=3D 3)
     goto <bb 8>; [0.00%]
   else
     goto <bb 10>; [100.00%]
@@ -172,18 +278,16 @@ void foo (unsigned _BitInt(255) a, unsig
   _9 =3D REALPART_EXPR <_11>;
   _6 =3D _9 | _51;
   bitint.6[_49] =3D _6;
-  _4 =3D _49 + 1;
-  _3 =3D VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_4];
-  if (_4 =3D=3D 3)
+  _3 =3D VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_179];
+  if (_179 =3D=3D 3)
     goto <bb 9>; [20.00%]
   else
     goto <bb 8>; [80.00%]

   <bb 8> [local count: 1073741824]:
   # _143 =3D PHI <_10(7), _136(6)>
-  # _145 =3D PHI <_4(7), _140(6)>
-  # _147 =3D PHI <_3(7), _141(6)>
-  _63 =3D VIEW_CONVERT_EXPR<unsigned long[4]>(a)[_145];
+  # _147 =3D PHI <_3(7), _134(6)>
+  _63 =3D VIEW_CONVERT_EXPR<unsigned long[4]>(a)[_179];
   goto <bb 10>; [100.00%]

   <bb 9> [local count: 214748360]:
@@ -200,27 +304,21 @@ void foo (unsigned _BitInt(255) a, unsig
   <bb 10> [local count: 858993464]:
   # _67 =3D PHI <_63(8), 0(6)>
   # _144 =3D PHI <_143(8), _136(6)>
-  # _146 =3D PHI <_145(8), _140(6)>
-  # _148 =3D PHI <_147(8), _141(6)>
-  _68 =3D VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_146];
-  _69 =3D .USUBC (_67, _68, _144);
+  # _148 =3D PHI <_147(8), _134(6)>
+  _69 =3D .USUBC (_67, _134, _144);
   _56 =3D IMAGPART_EXPR <_69>;
   _70 =3D REALPART_EXPR <_69>;
   _71 =3D _70 | _148;
-  bitint.6[_146] =3D _71;
+  bitint.6[_179] =3D _71;
   _50 =3D _49 + 2;
   if (_50 !=3D 4)
-    goto <bb 28>; [0.06%]
+    goto <bb 5>; [0.06%]
   else
     goto <bb 11>; [99.94%]

Now, the bug is IMHO in using _134 in the .USUBC call, _134 is the
VIEW_CONVERT_EXPR<unsigned long[5]>(b)[4] value,
so something that is valid when bb 10 is entered from the edge from bb 6, b=
ut
when entered from bb 8 it ought
to be VIEW_CONVERT_EXPR<unsigned long[4]>(b)[_179], so _3, so the right thi=
ng
would be to use there _148
which contains that value.
Later on we completely unroll the loop and there it is clearly visible, for=
 the
least significant limb
we correctly compute bitint.6[0] =3D b[0] | (a[0] - b[0]), while for the se=
cond
least significant limb
bitint.6[1] =3D b[1] | (a[1] - (b[4] & 1) - carry) and so while bitint.6[0]=
 is
-1, bitint.6[1] is 1.=