From: "already5chosen at yahoo dot com" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug libquadmath/105101] incorrect rounding for sqrtq
Date: Fri, 15 Apr 2022 13:49:01 +0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105101

--- Comment #9 from Michael_S <already5chosen at yahoo dot com> ---
(In reply to Michael_S from comment #4)
> If you want a quick fix for immediate shipment, you can take this:
> 
> #include <math.h>
> #include <quadmath.h>
> 
> __float128 quick_and_dirty_sqrtq(__float128 x)
> {
>   if (isnanq(x))
>     return x;
> 
>   if (x==0)
>     return x;
> 
>   if (x < 0)
>     return nanq("");
> 
>   if (isinfq(x))
>     return x;
> 
>   int xExp;
>   x = frexpq(x, &xExp);
>   if (xExp & 1)
>     x *= 2.0; // x in [0.5:2.0)
> 
>   __float128 r = (__float128)(1.0/sqrt((double)x)); // r=rsqrt(x) estimate (53 bits)
>   r *= 1.5 - r*r*x*0.5; // NR iteration improves precision of r to 105.4 bits
> 
>   __float128 y = x*r;   // y=sqrt(x) estimate (105.4 bits)
> 
>   // extended-precision NR iteration
>   __float128 yH = (double)y;
>   __float128 yL = y - yH;
> 
>   __float128 deltaX = x - yH*yH;
>   deltaX -= yH*yL*2;
>   deltaX -= yL*yL;
>   y += deltaX*r*0.5; // improves precision of y to ~210.2 bits. Not enough
>                      // for perfect rounding, but not too bad
> 
>   return ldexpq(y, xExp >> 1);
> }
> 
> It is very slow, even slower than what you have now, which by itself is
> quite astonishingly slow.
> It is also not sufficiently precise for correct rounding in all cases.
> But, at least, the worst error is something like (0.5+2**-98) ULP, so you
> are unlikely to ever be caught by black-box testing.
> Its biggest advantage is extreme portability: it should run on all
> platforms where double == IEEE binary64 and __float128 == IEEE binary128.
> 
> Maybe a few days later I'll have a better variant for "good" 64-bit
> platforms, i.e. for those where we have __int128.
> It would be 15-25 times faster than the variant above and the rounding
> would be mathematically correct rather than just "impossible to be caught"
> like above.
> But it would not run everywhere.
> Also, I want to give it away under MIT or BSD license, rather than under
> GPL.
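(By the way, if somebody wants to see how far the function above is from
correct rounding, a black-box harness along the lines below will do. It is
an illustrative sketch only - the input distribution and iteration count
are arbitrary - and it assumes GCC with libquadmath, built with something
like 'gcc -O2 test.c -lquadmath'.)

#include <stdio.h>
#include <stdlib.h>
#include <quadmath.h>

__float128 quick_and_dirty_sqrtq(__float128 x); // the function quoted above

int main(void)
{
  __float128 max_diff = 0;
  for (long i = 0; i < 1000000; ++i) {
    // random mantissa in [1:2), random exponent in [-32:31]
    __float128 m = 1.0Q + (__float128)rand()/RAND_MAX
                        + (__float128)rand()*ldexpq(1.0Q, -70);
    __float128 x = ldexpq(m, rand() % 64 - 32);
    __float128 a = quick_and_dirty_sqrtq(x);
    __float128 b = sqrtq(x);
    int ex;
    frexpq(b, &ex);
    __float128 diff = fabsq(a - b) * ldexpq(1.0Q, 113 - ex); // in units of b's LSB
    if (diff > max_diff)
      max_diff = diff;
  }
  char buf[64];
  quadmath_snprintf(buf, sizeof(buf), "%.4Qf", max_diff);
  printf("max observed difference: %s LSB\n", buf);
  return 0;
}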
Here.

#include <stdint.h>
#include <string.h>
#include <quadmath.h>

static __inline uint64_t umulx64(uint64_t mult1, uint64_t mult2, int rshift)
{
  return (uint64_t)(((unsigned __int128)mult1 * mult2) >> rshift);
}

static __inline uint64_t umulh64(uint64_t mult1, uint64_t mult2)
{
  return umulx64(mult1, mult2, 64);
}

static __inline __float128 ret__float128(uint64_t wordH, uint64_t wordL)
{
  unsigned __int128 res_u128 = ((unsigned __int128)wordH << 64) | wordL;
  __float128 result;
  memcpy(&result, &res_u128, sizeof(result)); // hopefully __int128 and __float128 have the same endianness
  return result;
}

__float128 slow_and_clean_sqrtq(__float128 src)
{
  typedef unsigned __int128 __uint128;
  const uint64_t SIGN_BIT   = (uint64_t)1  << 63;
  const uint64_t MANT_H_MSK = (uint64_t)-1 >> 16;
  const uint64_t nNAN_MSW   = (uint64_t)0xFFFF8 << 44; // -NaN

  __uint128 src_u128;
  memcpy(&src_u128, &src, sizeof(src_u128));
  uint64_t mshalf = (uint64_t)(src_u128 >> 64); // MS half
  uint64_t mantL  = (uint64_t)(src_u128);       // LS half

  // parse MS half of source
  unsigned exp   = mshalf >> 48;
  uint64_t mantH = mshalf & MANT_H_MSK;
  unsigned expx  = exp + 0x3FFF;
  if (exp - 1 >= 0x7FFF-1) { // special cases: exp == 0, exp == max_exp or sign == 1
    if (exp > 0x7fff) { // negative
      if (exp == 0xFFFF) { // NaN or -Inf
        if ((mantH | mantL) == 0) // -Inf
          mshalf = nNAN_MSW;
        return ret__float128(mshalf, mantL); // in case of NaN leave the number intact
      }
      if (mshalf == SIGN_BIT && mantL == 0) // -0
        return ret__float128(mshalf, mantL); // in case of -0 leave the number intact
      // negative normal or subnormal
      mantL  = 0;
      mshalf = nNAN_MSW;
      return ret__float128(mshalf, mantL);
    }
    // non-negative
    if (exp == 0x7fff) // NaN or +Inf
      return ret__float128(mshalf, mantL); // in case of positive NaN or Inf leave the number intact

    // exp==0 : zero or subnormal
    if ((mantH | mantL) == 0) // +0
      return ret__float128(mshalf, mantL); // in case of +0 leave the number intact

    // positive subnormal
    // normalize
    if (mantH == 0) {
      expx -= 48;
      int lz = __builtin_clzll(mantL);
      expx -= lz;
      mantL <<= lz + 1;
      mantH  = mantL >> 16;
      mantL  = mantL << 48;
    } else {
      int lz = __builtin_clzll(mantH) - 16;
      expx -= lz;
      mantH = (mantH << (lz+1)) | mantL >> (63-lz);
      mantL = mantL << (lz + 1);
    }
    mantH &= MANT_H_MSK;
  }

  // Normal case
  int e = expx & 1; // e = LS bit of the exponent
  const uint64_t BIT_30 = (uint64_t)1 << 30;
  static const uint8_t rsqrt_tab[64] = { // floor(1/sqrt(1+(i+1)/64)*512) - 256
    252, 248, 244, 240, 237, 233, 230, 226,
    223, 220, 216, 213, 210, 207, 204, 201,
    199, 196, 193, 190, 188, 185, 183, 180,
    178, 175, 173, 171, 168, 166, 164, 162,
    159, 157, 155, 153, 151, 149, 147, 145,
    143, 141, 139, 138, 136, 134, 132, 131,
    129, 127, 125, 124, 122, 121, 119, 117,
    116, 114, 113, 111, 110, 108, 107, 106,
  };
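  // (Aside: the 64 entries above can be regenerated with a sketch like
  //    for (int i = 0; i < 64; i++)
  //      printf("%d,%c", (int)(512.0/sqrt(1.0 + (i+1)/64.0)) - 256,
  //             i % 8 == 7 ? '\n' : ' ');
  //  assuming the usual <stdio.h>/<math.h>. Each entry is taken at the upper
  //  end of its input interval, which is what makes r0 a low estimate.)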
  // Look up an approximate value of rsqrt(x) = 1/sqrt(x),
  // where x = 1:mantH:mantL
  unsigned r0 = rsqrt_tab[mantH >> (48-6)] + 256; // scale = 2**9
  // r0 is a low estimate, r0*sqrt(x) is in range [0.99119507..0.99991213]
  // Est. precision: ~6.82 bits
  //
  // Improve the precision of the estimate by a 2nd-order NR iteration with
  // a custom polynomial:
  //   r1 = r0 * (P2*(r0*r0*x)**2 + P1*(r0*r0*x) + P0)
  // where
  //   P2 =  0.38345164828933775
  //   P1 = -1.26684607889193623
  //   P0 =  1.88339443966540210
  // The calculation is implemented in a way that minimizes latency
  // and minimizes the precision required of the coefficients, as
  //   ((rrx0_n + A)**2 + B)*(r0*F[e])
  // where rrx0_n = 1 - r0*r0*x.
  //
  // The last multiplication serves the additional purpose of adjustment by
  // exponent (multiplication by sqrt(0.5)).
  // Only the upper 30 bits of the mantissa are used in the evaluation. Using
  // a minimal number of input bits simplifies validation by exhaustive search,
  // with minimal negative effect on the worst-case precision.
  //
  uint32_t rrx0 = ((r0*r0 * ((mantH >> 18)+BIT_30+1))>>16)+1; // scale = 2**32
  // rrx0 is calculated with x1_u = x/2**34 + 1 instead of x1 = x/2**34, in
  // order to assure that for any possible values of the 34 LS bits of x the
  // estimate r1 is lower than the exact value of r
  const uint32_t A = 2799880908;                               // scale = 2**32
  const uint32_t B = 2343892134;                               // scale = 2**30
  static uint32_t const f_tab[2] = { 3293824581, 2329085697 }; // F/sqrt(2**i), scale = 2**33
  uint32_t rrx0_n = 0 - rrx0;                          // scale = 2**32, nbits=27
  uint32_t termA  = rrx0_n + A;                        // scale = 2**32, nbits=32
  uint64_t termAA = ((uint64_t)termA * termA) >> 34;   // scale = 2**30, nbits=30
  uint64_t termAB = termAA + B;                        // scale = 2**30, nbits=32
  uint64_t termRF = (r0*(uint64_t)f_tab[e]) >> 9;      // scale = 2**33, nbits=32
  uint64_t r1 = (termAB*termRF) >> 33;                 // scale = 2**30, nbits=30
  // r1 is a low estimate
  // r1 is in range [0x1ffffffe .. 0x3fffffe1]
  // r1*sqrt(x) is in range [0.999_999_891_641_538_03..0.999_999_999_782_706_57], i.e.
  // est. precision: ~23.13 bits
  //
  // Further improve the precision of the estimate by a canonical 2nd-order
  // NR iteration:
  //   r2 = r1 * (P2*(r1*r1*x)**2 + P1*(r1*r1*x) + P0)
  // where
  //   P2 =  3/8
  //   P1 = -5/4
  //   P0 = 15/8
  // At this point precision is of high importance, so the evaluation is done
  // in a way that minimizes the impact of truncation while still doing the
  // majority of the work with efficient 64-bit arithmetic.
  // The transformed equation is r2 = r1 + r1*rrx1_n*(rrx1_n*3 + 4)/8,
  // where rrx1_n = 1 - r1*r1*x.
  //
  uint64_t mant64 = (mantH << 16) | (mantL >> 48);     // upper 64 bits of the mantissa
  uint64_t r1r1 = (r1*r1) << e;                        // scale = 2**60, nbits=60
  __uint128 rrx1_x = (__uint128)r1r1*mant64 + r1r1;    // r1*r1*(x+1), scale 2**124
  uint64_t rrx1 = (uint64_t)(rrx1_x >> 53)+(r1r1 << 11)+1; // r1*r1*x, scale = 2**71, nbits=64
  // rrx1 is calculated with x2_u = x+1 instead of x2 = x, in order to assure
  // that for any possible values of the 48 LS bits of x the estimates r2 and
  // y2 are lower than the exact values of r and y
  uint64_t rrx1_n = 0 - rrx1;                       // rrx1_n = 1-r1^2*x, scale = 2**71, nbits=49
  uint64_t term1  = rrx1_n;                         // rrx1_n*(1/2), scale = 2**72, nbits=49
  uint64_t term2  = umulh64(rrx1_n*3, rrx1_n) >> 9; // rrx1_n*rrx1_n*(3/8), scale = 2**72, nbits=27
  uint64_t deltaR = umulh64(r1 << 26, term1+term2); // (rrx1_n*(1/2)+rrx1_n*rrx1_n*(3/8))*r1, scale = 2**64, nbits=41
  uint64_t r2 = (r1 << 34) + deltaR;                // scale = 2**64, nbits=64
  // r2 is a low estimate
  // r2 is in range [0x8000000000000000..0xffffffffffffffff]
  // r2*sqrt(x) is in range
  // [0.999_999_999_999_999_999_888_676 .. 0.999_999_999_999_999_999_999_999_989]
  // Est. precision: ~62.96 bits
  // Worst-case errors are caused by unlucky truncation of cases where the
  // exact result scaled by 2**64 is of the form N+eps
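  // (To see that the transformed equation matches the canonical polynomial,
  //  substitute t = r1*r1*x = 1 - d into P2*t**2 + P1*t + P0:
  //    15/8 - (5/4)*(1-d) + (3/8)*(1-d)**2 = 1 + d/2 + (3/8)*d**2,
  //  hence r2 = r1*(1 + d/2 + (3/8)*d**2) = r1 + r1*d*(3*d + 4)/8,
  //  which is exactly the term1/term2/deltaR computation above.)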
  // Calculate y = estimate of sqrt(1+x)-1 = x*rsqrt(x)+rsqrt(x)
  uint64_t y2 = (umulh64(mant64, r2) + r2) << e;   // scale = 2**64, nbits=64
  if (r2 >= (uint64_t)(-2))
    y2 = 0; // x so close to 1 that y2 wrapped around; y = 0 is still a valid low estimate
  //
  // Improve the precision of y by a 1st-order NR iteration.
  // It is the canonical NR iteration y = (x + y*y)/(2*y), used
  // in the form y = y + (x - y*y)*r/2
  //
  // Calculate bits[-60:-123] of dX = (x+1)*2**e - (y+1)*(y+1).
  // All upper bits of the difference are known to be 0.
  __uint128 y2y2 = (__uint128)y2 * y2;             // y*y, scale 2**128
  uint64_t yy2 = (uint64_t)(y2y2 >> 5)+(y2 << 60); // y*y+2*y, scale 2**123
  uint64_t dx = ((mantL << (e+11)) - yy2);         // scale 2**123
  uint64_t deltaY = umulh64(dx, r2);               // dX*r/2, scale 2**124

  // Round to nearest.
  // deltaY carries 12 guard bits below the 112-bit result; since y2 and
  // deltaY are low estimates, the error is one-sided and small, so only
  // guard-bit patterns close to the midpoint are hard cases.
  // (MAX_ERR is a conservative bound of that one-sided error.)
  const uint64_t GUARD_MSK = ((uint64_t)1 << 12) - 1; // scale 2**124
  const uint64_t GUARD_BIT = (uint64_t)1 << 11;       // half of the output LSB, scale 2**124
  const uint64_t MAX_ERR   = 6;                       // scale 2**124
  uint64_t round_incr = GUARD_BIT;
  uint64_t gbits = deltaY & GUARD_MSK;
  if (gbits - (GUARD_BIT - MAX_ERR) < 2*MAX_ERR) { // hard case
    // Decide the rounding direction by squaring midY - the point exactly
    // between the two candidate results.
    __uint128 midY = (((__uint128)y2 << 49) + (deltaY >> 11)) | 1 | ((__uint128)1 << 113); // scale 2**113
    // midY resides between two representable output points
    __uint128 rndDir = midY*midY;                // scale 2**226 (wraps mod 2**128; only low bits are needed)
    uint64_t rndDirH = (uint64_t)(rndDir >> 64); // scale 2**162
    if ((rndDirH >> (162-113+e)) & 1)
      round_incr = GUARD_MSK; // guard bit of the square == 1 means that the square of midY < x, so round up
  }
  deltaY = (deltaY + round_incr) >> 12;           // scale 2**112

  __uint128 y = ((__uint128)y2 << 48) + deltaY;   // scale 2**112

  // format output
  mantL = (uint64_t)(y);
  mantH = (uint64_t)(y >> 64) & MANT_H_MSK;
  uint64_t resExp = expx / 2;
  mshalf = (resExp << 48) | mantH;
  return ret__float128(mshalf, mantL);
}

Hopefully, this one is perfectly precise.
On Intel (Ivy Bridge, Haswell, Skylake) it is quite fast. On AMD (Zen3) it
is less so; I have not yet figured out what causes the slowdown. But even
on AMD it is slow only relative to the same code on Intel: relative to the
current library it is 7.4 times faster.

Also, I hope that this post can be considered as giving the code away to
any takers under a BSD-like license. If it is not, then tell me how to do
it right.
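(For anyone who wants to double-check the correct-rounding claim in a
black-box way, below is a minimal cross-checking sketch against MPFR, which
can produce correctly rounded binary128 square roots. It assumes an MPFR
build with --enable-float128, which provides mpfr_set_float128()/
mpfr_get_float128() when MPFR_WANT_FLOAT128 is defined; the input generator
and iteration count are arbitrary. Link with -lmpfr -lgmp -lquadmath.)

#define MPFR_WANT_FLOAT128
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <quadmath.h>
#include <gmp.h>
#include <mpfr.h>

__float128 slow_and_clean_sqrtq(__float128 src); // the function above

// correctly rounded reference: binary128 has a 113-bit significand
static __float128 ref_sqrtq(__float128 x)
{
  mpfr_t t;
  mpfr_init2(t, 113);
  mpfr_set_float128(t, x, MPFR_RNDN);
  mpfr_sqrt(t, t, MPFR_RNDN);
  __float128 res = mpfr_get_float128(t, MPFR_RNDN);
  mpfr_clear(t);
  return res;
}

int main(void)
{
  for (long i = 0; i < 10000000; ++i) {
    // random positive finite bit pattern (not uniform, but good enough here)
    unsigned __int128 bits = 0;
    for (int k = 0; k < 5; ++k)
      bits = (bits << 28) ^ (unsigned)rand();
    bits &= ~((unsigned __int128)1 << 127); // clear the sign bit
    __float128 x;
    memcpy(&x, &bits, sizeof(x));
    if (isnanq(x) || isinfq(x))
      continue;
    __float128 got  = slow_and_clean_sqrtq(x);
    __float128 want = ref_sqrtq(x);
    if (memcmp(&got, &want, sizeof(got)) != 0) {
      char bx[64], bg[64], bw[64];
      quadmath_snprintf(bx, sizeof(bx), "%.28Qe", x);
      quadmath_snprintf(bg, sizeof(bg), "%.28Qe", got);
      quadmath_snprintf(bw, sizeof(bw), "%.28Qe", want);
      printf("mismatch: x=%s\n  got =%s\n  want=%s\n", bx, bg, bw);
      return 1;
    }
  }
  printf("no mismatches\n");
  return 0;
}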