From: "pinskia at gcc dot gnu.org"
To: gcc-bugs@gcc.gnu.org
Subject: [Bug middle-end/109907] Missed optimization for bit extraction (uses shift instead of single bit-test)
Date: Fri, 26 May 2023 23:11:13 +0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109907

--- Comment #24 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
(In reply to Georg-Johann Lay from comment #23)
> Thank you so much for looking into this.
> 
> For the test case from comment #21 though, the problem is somewhere in tree
> optimizations.
> 
> > unsigned char lfsr32_mpp_ge0 (unsigned long number)
> > {
> >     unsigned char b = 0;
> >     if (number >= 0) b--;
> >     if (number & (1UL << 29)) b++;
> >     if (number & (1UL << 13)) b++;
> > 
> >     return b;
> > }
> 
> The -fdump-tree-optimized dump reads:
> 
> ;; Function lfsr32_mpp_ge0 (lfsr32_mpp_ge0, funcdef_no=0, decl_uid=1880,
> cgraph_uid=1, symbol_order=0)
> 
> unsigned char lfsr32_mpp_ge0 (long unsigned int number)
> {
>   unsigned char b;
>   long unsigned int _1;
>   long unsigned int _2;
>   _Bool _3;
>   unsigned char _8;
>   _Bool _9;
>   unsigned char _10;
>   unsigned char _11;
> 
>   <bb 2> [local count: 1073741824]:
>   _1 = number_5(D) & 536870912;
>   _2 = number_5(D) & 8192;
>   if (_2 != 0)
>     goto <bb 4>; [50.00%]
>   else
>     goto <bb 3>; [50.00%]
> 
>   <bb 3> [local count: 536870912]:
>   _9 = _1 == 0;
>   _10 = (unsigned char) _9;
>   _11 = -_10;
>   goto <bb 5>; [100.00%]
> 
>   <bb 4> [local count: 536870913]:
>   _3 = _1 != 0;
>   _8 = (unsigned char) _3;
> 
>   <bb 5> [local count: 1073741824]:
>   # b_4 = PHI <_11(3), _8(4)>
>   return b_4;
> }

Oh yes, this is exactly the case my patch at
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/619068.html actually solves.
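For anyone reading the dump above without much GIMPLE background: it is essentially the following C. This is a rough, hand-written rendering purely for illustration; the helper names (lfsr32_mpp_ge0_gimple and the small main driver) are made up here and are not part of the testcase or the patch.

```
#include <assert.h>

/* The original testcase from comment #21.  */
static unsigned char lfsr32_mpp_ge0 (unsigned long number)
{
  unsigned char b = 0;
  if (number >= 0) b--;                   /* always true for unsigned */
  if (number & (1UL << 29)) b++;
  if (number & (1UL << 13)) b++;
  return b;
}

/* Rough C rendering of the -fdump-tree-optimized output above;
   _1 and _2 mirror the SSA names in the dump.  */
static unsigned char lfsr32_mpp_ge0_gimple (unsigned long number)
{
  unsigned long _1 = number & 536870912;  /* bit 29 */
  unsigned long _2 = number & 8192;       /* bit 13 */

  if (_2 != 0)
    return (unsigned char) (_1 != 0);     /* bb 4: _8  */
  return (unsigned char) -(_1 == 0);      /* bb 3: _11 */
}

int main (void)
{
  /* Spot-check that the rendering agrees with the original testcase.  */
  unsigned long vals[] = { 0, 1UL << 13, 1UL << 29,
                           (1UL << 13) | (1UL << 29), ~0UL };
  for (unsigned i = 0; i < sizeof vals / sizeof vals[0]; i++)
    assert (lfsr32_mpp_ge0 (vals[i]) == lfsr32_mpp_ge0_gimple (vals[i]));
  return 0;
}
```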
With the patch, expansion should be able to detect that _1 has nonzero bits with just one bit set and expand _3 using single_bit_test.

For example, we get:

;; Generating RTL for gimple basic block 3

;; _11 = -_10;

(insn 10 9 11 (set (reg:QI 51)
        (zero_extract:QI (subreg:QI (reg:SI 43 [ _1 ]) 3)
            (const_int 1 [0x1])
            (const_int 5 [0x5]))) "t21.c":5:6 -1
     (nil))

(insn 11 10 12 (set (reg:QI 53)
        (const_int 1 [0x1])) "t21.c":5:6 -1
     (nil))

(insn 12 11 13 (set (reg:QI 52)
        (xor:QI (reg:QI 51)
            (reg:QI 53))) "t21.c":5:6 -1
     (nil))

(insn 13 12 0 (set (reg/v:QI 48 [ ])
        (neg:QI (reg:QI 52))) "t21.c":5:6 -1
     (nil))

Which is exactly what you want, right?

Overall we get:
```
lfsr32_mpp_ge0:
	push r16
	push r17
/* prologue: function */
/* frame size = 0 */
/* stack size = 2 */
.L__stack_usage = 2
	mov r16,r22
	mov r17,r23
	mov r18,r24
	mov r19,r25
	mov r27,r19
	mov r26,r18
	mov r25,r17
	mov r24,r16
	clr r24
	clr r25
	clr r26
	andi r27,32
	bst r27,5
	clr r24
	bld r24,0
	sbrs r17,5
	subi r24,lo8(-(-1))
.L1:
/* epilogue start */
	pop r17
	pop r16
	ret
```
Which is much better than before (it could still be improved further, but that is for another time).

To finish this patch up, I still need to do some cost modelling and such, which I might get to this weekend.

Even for aarch64 we do slightly better. Before the patch:
```
lfsr32_mpp_ge0:
	and	x1, x0, 536870912
	tbnz	x0, 13, .L2
	cmp	x1, 0
	csetm	w0, eq
	and	w0, w0, 255
	ret
.L2:
	cmp	x1, 0
	cset	w0, ne
	ret
```
After the patch:
```
lfsr32_mpp_ge0:
	and	x1, x0, 536870912
	tbnz	x0, 13, .L2
	ubfx	x0, x1, 29, 1
	eor	w0, w0, 1
	neg	w0, w0
	and	w0, w0, 255
	ret
.L2:
	ubfx	x0, x1, 29, 1
	and	w0, w0, 255
	ret
```
Though if there were a way to remove the first `and`, that would be best (and the last `and` too; that one is still there because of fuzziness with combine still trying to do `ne` there).

Note that even though the code increases in size, the cost of cset on some processors is 2 cycles rather than 1.
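As a final aside, the source-level equivalence that both the AVR zero_extract and the aarch64 ubfx sequences rely on is simply that testing one known bit does not need a full-width AND plus a compare. A minimal sanity-check sketch follows; the function names bit29_and_cmp and bit29_extract are made up for illustration and appear nowhere in the patch.

```
#include <assert.h>

/* What the GIMPLE computes for _3: a full-width AND against the mask,
   then a compare against zero.  */
static unsigned char bit29_and_cmp (unsigned long number)
{
  return (number & (1UL << 29)) != 0;
}

/* What the single_bit_test expansion amounts to: pull the one
   interesting bit down and mask it off (zero_extract on AVR,
   ubfx on aarch64).  */
static unsigned char bit29_extract (unsigned long number)
{
  return (number >> 29) & 1;
}

int main (void)
{
  /* A few spot checks that the two forms agree.  */
  unsigned long vals[] = { 0, 1UL << 29, 1UL << 13, ~0UL, 0x2468ace0UL };
  for (unsigned i = 0; i < sizeof vals / sizeof vals[0]; i++)
    assert (bit29_and_cmp (vals[i]) == bit29_extract (vals[i]));
  return 0;
}
```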