From: "already5chosen at yahoo dot com" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug libquadmath/105101] incorrect rounding for sqrtq
Date: Fri, 15 Apr 2022 13:49:01 +0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105101

--- Comment #9 from Michael_S <already5chosen at yahoo dot com> ---
(In reply to Michael_S from comment #4)
> If you want a quick fix for immediate shipment, you can take this:
> 
> #include <math.h>
> #include <quadmath.h>
> 
> __float128 quick_and_dirty_sqrtq(__float128 x)
> {
>   if (isnanq(x))
>     return x;
> 
>   if (x==0)
>     return x;
> 
>   if (x < 0)
>     return nanq("");
> 
>   if (isinfq(x))
>     return x;
> 
>   int xExp;
>   x = frexpq(x, &xExp);
>   if (xExp & 1)
>     x *= 2.0; // x in [0.5:2.0)
> 
>   __float128 r = (__float128)(1.0/sqrt((double)x)); // r=rsqrt(x) estimate (53 bits)
>   r *= 1.5 - r*r*x*0.5; // NR iteration improves precision of r to 105.4 bits
> 
>   __float128 y = x*r;   // y=sqrt(x) estimate (105.4 bits)
> 
>   // extended-precision NR iteration
>   __float128 yH = (double)y;
>   __float128 yL = y - yH;
> 
>   __float128 deltaX = x - yH*yH;
>   deltaX -= yH*yL*2;
>   deltaX -= yL*yL;
>   y += deltaX*r*0.5; // improves precision of y to ~210.2 bits. Not enough
>                      // for perfect rounding, but not too bad
> 
>   return ldexpq(y, xExp >> 1);
> }
> 
> It is very slow, even slower than what you have now, which by itself is
> quite astonishingly slow.
> It is also not sufficiently precise for correct rounding in all cases.
> But, at least, the worst error is something like (0.5+2**-98) ULP, so you
> are unlikely to ever be caught by black-box testing.
> Its biggest advantage is extreme portability: it should run on all
> platforms where double == IEEE binary64 and __float128 == IEEE binary128.
> 
> Maybe a few days later I'll have a better variant for "good" 64-bit
> platforms, i.e. for those where we have __int128.
> It would be 15-25 times faster than the variant above and the rounding
> would be mathematically correct rather than just "impossible to be caught"
> like above.
> But it would not run everywhere.
> Also, I want to give it away under MIT or BSD license, rather than under
> GPL.
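(By the way, if somebody wants to see how far the function above is from
correct rounding, a black-box harness along the lines below will do. It is
an illustrative sketch only - the input distribution and iteration count
are arbitrary - and it assumes GCC with libquadmath, built with something
like 'gcc -O2 test.c -lquadmath'.)

#include <stdio.h>
#include <stdlib.h>
#include <quadmath.h>

__float128 quick_and_dirty_sqrtq(__float128 x); // the function quoted above

int main(void)
{
  __float128 max_diff = 0;
  for (long i = 0; i < 1000000; ++i) {
    // random mantissa in [1:2), random exponent in [-32:31]
    __float128 m = 1.0Q + (__float128)rand()/RAND_MAX
                        + (__float128)rand()*ldexpq(1.0Q, -70);
    __float128 x = ldexpq(m, rand() % 64 - 32);
    __float128 a = quick_and_dirty_sqrtq(x);
    __float128 b = sqrtq(x);
    int ex;
    frexpq(b, &ex);
    __float128 diff = fabsq(a - b) * ldexpq(1.0Q, 113 - ex); // in units of b's LSB
    if (diff > max_diff)
      max_diff = diff;
  }
  char buf[64];
  quadmath_snprintf(buf, sizeof(buf), "%.4Qf", max_diff);
  printf("max observed difference: %s LSB\n", buf);
  return 0;
}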
Here.

#include <stdint.h>
#include <string.h>
#include <quadmath.h>

static __inline uint64_t umulx64(uint64_t mult1, uint64_t mult2, int rshift)
{
  return (uint64_t)(((unsigned __int128)mult1 * mult2) >> rshift);
}

static __inline uint64_t umulh64(uint64_t mult1, uint64_t mult2)
{
  return umulx64(mult1, mult2, 64);
}

static __inline __float128 ret__float128(uint64_t wordH, uint64_t wordL)
{
  unsigned __int128 res_u128 = ((unsigned __int128)wordH << 64) | wordL;
  __float128 result;
  memcpy(&result, &res_u128, sizeof(result)); // hopefully __int128 and __float128 have the same endianness
  return result;
}

__float128 slow_and_clean_sqrtq(__float128 src)
{
  typedef unsigned __int128 __uint128;
  const uint64_t SIGN_BIT   = (uint64_t)1  << 63;
  const uint64_t MANT_H_MSK = (uint64_t)-1 >> 16;
  const uint64_t nNAN_MSW   = (uint64_t)0xFFFF8 << 44; // -NaN

  __uint128 src_u128;
  memcpy(&src_u128, &src, sizeof(src_u128));
  uint64_t mshalf = (uint64_t)(src_u128 >> 64); // MS half
  uint64_t mantL  = (uint64_t)(src_u128);       // LS half

  // parse MS half of source
  unsigned exp   = mshalf >> 48;
  uint64_t mantH = mshalf & MANT_H_MSK;
  unsigned expx  = exp + 0x3FFF;
  if (exp - 1 >= 0x7FFF-1) { // special cases: exp == 0, exp == max_exp or sign == 1
    if (exp > 0x7fff) { // negative
      if (exp == 0xFFFF) { // NaN or -Inf
        if ((mantH | mantL) == 0) // -Inf
          mshalf = nNAN_MSW;
        return ret__float128(mshalf, mantL); // in case of NaN leave the number intact
      }
      if (mshalf == SIGN_BIT && mantL == 0) // -0
        return ret__float128(mshalf, mantL); // in case of -0 leave the number intact
      // negative normal or subnormal
      mantL  = 0;
      mshalf = nNAN_MSW;
      return ret__float128(mshalf, mantL);
    }
    // non-negative
    if (exp == 0x7fff) // NaN or +Inf
      return ret__float128(mshalf, mantL); // in case of positive NaN or Inf leave the number intact

    // exp==0 : zero or subnormal
    if ((mantH | mantL) == 0) // +0
      return ret__float128(mshalf, mantL); // in case of +0 leave the number intact

    // positive subnormal
    // normalize
    if (mantH == 0) {
      expx -= 48;
      int lz = __builtin_clzll(mantL);
      expx -= lz;
      mantL <<= lz + 1;
      mantH  = mantL >> 16;
      mantL  = mantL << 48;
    } else {
      int lz = __builtin_clzll(mantH) - 16;
      expx -= lz;
      mantH = (mantH << (lz+1)) | mantL >> (63-lz);
      mantL = mantL << (lz + 1);
    }
    mantH &= MANT_H_MSK;
  }

  // Normal case
  int e = expx & 1; // e = LS bit of the exponent
  const uint64_t BIT_30 = (uint64_t)1 << 30;
  static const uint8_t rsqrt_tab[64] = { // floor(1/sqrt(1+(i+1)/64)*512) - 256
    252, 248, 244, 240, 237, 233, 230, 226,
    223, 220, 216, 213, 210, 207, 204, 201,
    199, 196, 193, 190, 188, 185, 183, 180,
    178, 175, 173, 171, 168, 166, 164, 162,
    159, 157, 155, 153, 151, 149, 147, 145,
    143, 141, 139, 138, 136, 134, 132, 131,
    129, 127, 125, 124, 122, 121, 119, 117,
    116, 114, 113, 111, 110, 108, 107, 106,
  };
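  // (Aside: the 64 entries above can be regenerated with a sketch like
  //    for (int i = 0; i < 64; i++)
  //      printf("%d,%c", (int)(512.0/sqrt(1.0 + (i+1)/64.0)) - 256,
  //             i % 8 == 7 ? '\n' : ' ');
  //  assuming the usual <stdio.h>/<math.h>. Each entry is taken at the upper
  //  end of its input interval, which is what makes r0 a low estimate.)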
  // Look up an approximate value of rsqrt(x) = 1/sqrt(x),
  // where x = 1:mantH:mantL
  unsigned r0 = rsqrt_tab[mantH >> (48-6)] + 256; // scale = 2**9
  // r0 is a low estimate, r0*sqrt(x) is in range [0.99119507..0.99991213]
  // Est. precision: ~6.82 bits
  //
  // Improve the precision of the estimate by a 2nd-order NR iteration with
  // a custom polynomial:
  //   r1 = r0 * (P2*(r0*r0*x)**2 + P1*(r0*r0*x) + P0)
  // where
  //   P2 =  0.38345164828933775
  //   P1 = -1.26684607889193623
  //   P0 =  1.88339443966540210
  // The calculation is implemented in a way that minimizes latency
  // and minimizes the precision required of the coefficients, as
  //   ((rrx0_n + A)**2 + B)*(r0*F[e])
  // where rrx0_n = 1 - r0*r0*x.
  //
  // The last multiplication serves the additional purpose of adjustment by
  // exponent (multiplication by sqrt(0.5)).
  // Only the upper 30 bits of the mantissa are used in the evaluation. Using
  // a minimal number of input bits simplifies validation by exhaustive search,
  // with minimal negative effect on the worst-case precision.
  //
  uint32_t rrx0 = ((r0*r0 * ((mantH >> 18)+BIT_30+1))>>16)+1; // scale = 2**32
  // rrx0 is calculated with x1_u = x/2**34 + 1 instead of x1 = x/2**34, in
  // order to assure that for any possible values of the 34 LS bits of x the
  // estimate r1 is lower than the exact value of r
  const uint32_t A = 2799880908;                               // scale = 2**32
  const uint32_t B = 2343892134;                               // scale = 2**30
  static uint32_t const f_tab[2] = { 3293824581, 2329085697 }; // F/sqrt(2**i), scale = 2**33
  uint32_t rrx0_n = 0 - rrx0;                          // scale = 2**32, nbits=27
  uint32_t termA  = rrx0_n + A;                        // scale = 2**32, nbits=32
  uint64_t termAA = ((uint64_t)termA * termA) >> 34;   // scale = 2**30, nbits=30
  uint64_t termAB = termAA + B;                        // scale = 2**30, nbits=32
  uint64_t termRF = (r0*(uint64_t)f_tab[e]) >> 9;      // scale = 2**33, nbits=32
  uint64_t r1 = (termAB*termRF) >> 33;                 // scale = 2**30, nbits=30
  // r1 is a low estimate
  // r1 is in range [0x1ffffffe .. 0x3fffffe1]
  // r1*sqrt(x) is in range [0.999_999_891_641_538_03..0.999_999_999_782_706_57], i.e.
  // est. precision: ~23.13 bits
  //
  // Further improve the precision of the estimate by a canonical 2nd-order
  // NR iteration:
  //   r2 = r1 * (P2*(r1*r1*x)**2 + P1*(r1*r1*x) + P0)
  // where
  //   P2 =  3/8
  //   P1 = -5/4
  //   P0 = 15/8
  // At this point precision is of high importance, so the evaluation is done
  // in a way that minimizes the impact of truncation while still doing the
  // majority of the work with efficient 64-bit arithmetic.
  // The transformed equation is r2 = r1 + r1*rrx1_n*(rrx1_n*3 + 4)/8,
  // where rrx1_n = 1 - r1*r1*x.
  //
  uint64_t mant64 = (mantH << 16) | (mantL >> 48);     // upper 64 bits of the mantissa
  uint64_t r1r1 = (r1*r1) << e;                        // scale = 2**60, nbits=60
  __uint128 rrx1_x = (__uint128)r1r1*mant64 + r1r1;    // r1*r1*(x+1), scale 2**124
  uint64_t rrx1 = (uint64_t)(rrx1_x >> 53)+(r1r1 << 11)+1; // r1*r1*x, scale = 2**71, nbits=64
  // rrx1 is calculated with x2_u = x+1 instead of x2 = x, in order to assure
  // that for any possible values of the 48 LS bits of x the estimates r2 and
  // y2 are lower than the exact values of r and y
  uint64_t rrx1_n = 0 - rrx1;                       // rrx1_n = 1-r1^2*x, scale = 2**71, nbits=49
  uint64_t term1  = rrx1_n;                         // rrx1_n*(1/2), scale = 2**72, nbits=49
  uint64_t term2  = umulh64(rrx1_n*3, rrx1_n) >> 9; // rrx1_n*rrx1_n*(3/8), scale = 2**72, nbits=27
  uint64_t deltaR = umulh64(r1 << 26, term1+term2); // (rrx1_n*(1/2)+rrx1_n*rrx1_n*(3/8))*r1, scale = 2**64, nbits=41
  uint64_t r2 = (r1 << 34) + deltaR;                // scale = 2**64, nbits=64
  // r2 is a low estimate
  // r2 is in range [0x8000000000000000..0xffffffffffffffff]
  // r2*sqrt(x) is in range
  // [0.999_999_999_999_999_999_888_676 .. 0.999_999_999_999_999_999_999_999_989]
  // Est. precision: ~62.96 bits
  // Worst-case errors are caused by unlucky truncation of cases where the
  // exact result scaled by 2**64 is of the form N+eps
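  // (To see that the transformed equation matches the canonical polynomial,
  //  substitute t = r1*r1*x = 1 - d into P2*t**2 + P1*t + P0:
  //    15/8 - (5/4)*(1-d) + (3/8)*(1-d)**2 = 1 + d/2 + (3/8)*d**2,
  //  hence r2 = r1*(1 + d/2 + (3/8)*d**2) = r1 + r1*d*(3*d + 4)/8,
  //  which is exactly the term1/term2/deltaR computation above.)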
  // Calculate y = estimate of sqrt(1+x)-1 = x*rsqrt(x)+rsqrt(x)
  uint64_t y2 = (umulh64(mant64, r2) + r2) << e;   // scale = 2**64, nbits=64
  if (r2 >= (uint64_t)(-2))
    y2 = 0; // x so close to 1 that y2 wrapped around; y = 0 is still a valid low estimate
  //
  // Improve the precision of y by a 1st-order NR iteration.
  // It is the canonical NR iteration y = (x + y*y)/(2*y), used
  // in the form y = y + (x - y*y)*r/2
  //
  // Calculate bits[-60:-123] of dX = (x+1)*2**e - (y+1)*(y+1).
  // All upper bits of the difference are known to be 0.
  __uint128 y2y2 = (__uint128)y2 * y2;             // y*y, scale 2**128
  uint64_t yy2 = (uint64_t)(y2y2 >> 5)+(y2 << 60); // y*y+2*y, scale 2**123
  uint64_t dx = ((mantL << (e+11)) - yy2);         // scale 2**123
  uint64_t deltaY = umulh64(dx, r2);               // dX*r/2, scale 2**124

  // Round to nearest.
  // deltaY carries 12 guard bits below the 112-bit result; since y2 and
  // deltaY are low estimates, the error is one-sided and small, so only
  // guard-bit patterns close to the midpoint are hard cases.
  // (MAX_ERR is a conservative bound of that one-sided error.)
  const uint64_t GUARD_MSK = ((uint64_t)1 << 12) - 1; // scale 2**124
  const uint64_t GUARD_BIT = (uint64_t)1 << 11;       // half of the output LSB, scale 2**124
  const uint64_t MAX_ERR   = 6;                       // scale 2**124
  uint64_t round_incr = GUARD_BIT;
  uint64_t gbits = deltaY & GUARD_MSK;
  if (gbits - (GUARD_BIT - MAX_ERR) < 2*MAX_ERR) { // hard case
    // Decide the rounding direction by squaring midY - the point exactly
    // between the two candidate results.
    __uint128 midY = (((__uint128)y2 << 49) + (deltaY >> 11)) | 1 | ((__uint128)1 << 113); // scale 2**113
    // midY resides between two representable output points
    __uint128 rndDir = midY*midY;                // scale 2**226 (wraps mod 2**128; only low bits are needed)
    uint64_t rndDirH = (uint64_t)(rndDir >> 64); // scale 2**162
    if ((rndDirH >> (162-113+e)) & 1)
      round_incr = GUARD_MSK; // guard bit of the square == 1 means that the square of midY < x, so round up
  }
  deltaY = (deltaY + round_incr) >> 12;           // scale 2**112

  __uint128 y = ((__uint128)y2 << 48) + deltaY;   // scale 2**112

  // format output
  mantL = (uint64_t)(y);
  mantH = (uint64_t)(y >> 64) & MANT_H_MSK;
  uint64_t resExp = expx / 2;
  mshalf = (resExp << 48) | mantH;
  return ret__float128(mshalf, mantL);
}

Hopefully, this one is perfectly precise.
On Intel (Ivy Bridge, Haswell, Skylake) it is quite fast. On AMD (Zen3) it
is less so; I have not yet figured out what causes the slowdown. But even
on AMD it is slow only relative to the same code on Intel: relative to the
current library it is 7.4 times faster.

Also, I hope that this post can be considered as giving the code away to
any takers under a BSD-like license. If it is not, then tell me how to do
it right.
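(For anyone who wants to double-check the correct-rounding claim in a
black-box way, below is a minimal cross-checking sketch against MPFR, which
can produce correctly rounded binary128 square roots. It assumes an MPFR
build with --enable-float128, which provides mpfr_set_float128()/
mpfr_get_float128() when MPFR_WANT_FLOAT128 is defined; the input generator
and iteration count are arbitrary. Link with -lmpfr -lgmp -lquadmath.)

#define MPFR_WANT_FLOAT128
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <quadmath.h>
#include <gmp.h>
#include <mpfr.h>

__float128 slow_and_clean_sqrtq(__float128 src); // the function above

// correctly rounded reference: binary128 has a 113-bit significand
static __float128 ref_sqrtq(__float128 x)
{
  mpfr_t t;
  mpfr_init2(t, 113);
  mpfr_set_float128(t, x, MPFR_RNDN);
  mpfr_sqrt(t, t, MPFR_RNDN);
  __float128 res = mpfr_get_float128(t, MPFR_RNDN);
  mpfr_clear(t);
  return res;
}

int main(void)
{
  for (long i = 0; i < 10000000; ++i) {
    // random positive finite bit pattern (not uniform, but good enough here)
    unsigned __int128 bits = 0;
    for (int k = 0; k < 5; ++k)
      bits = (bits << 28) ^ (unsigned)rand();
    bits &= ~((unsigned __int128)1 << 127); // clear the sign bit
    __float128 x;
    memcpy(&x, &bits, sizeof(x));
    if (isnanq(x) || isinfq(x))
      continue;
    __float128 got  = slow_and_clean_sqrtq(x);
    __float128 want = ref_sqrtq(x);
    if (memcmp(&got, &want, sizeof(got)) != 0) {
      char bx[64], bg[64], bw[64];
      quadmath_snprintf(bx, sizeof(bx), "%.28Qe", x);
      quadmath_snprintf(bg, sizeof(bg), "%.28Qe", got);
      quadmath_snprintf(bw, sizeof(bw), "%.28Qe", want);
      printf("mismatch: x=%s\n  got =%s\n  want=%s\n", bx, bg, bw);
      return 1;
    }
  }
  printf("no mismatches\n");
  return 0;
}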