From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mailout08.t-online.de (mailout08.t-online.de [194.25.134.20]) by sourceware.org (Postfix) with ESMTPS id 011E7385781F; Fri, 20 Aug 2021 09:58:05 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 011E7385781F Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=t-online.de Authentication-Results: sourceware.org; spf=none smtp.mailfrom=t-online.de Received: from fwd00.aul.t-online.de (fwd00.aul.t-online.de [172.20.26.147]) by mailout08.t-online.de (Postfix) with SMTP id 8872317099; Fri, 20 Aug 2021 11:58:04 +0200 (CEST) Received: from H270 (EfXDZBZ-Qhjp1-TKhaOp0Y+EfcUNPqPWMovNy4AkICLo3wTj10jOyfbB2-Hw4+ywEv@[91.56.241.188]) by fwd00.t-online.de with (TLSv1.2:ECDHE-RSA-AES256-SHA384 encrypted) esmtp id 1mH1Hd-0ujIZc0; Fri, 20 Aug 2021 11:57:53 +0200 Message-ID: From: "Stefan Kanthak" To: "Szabolcs Nagy" , "Adhemerval Zanella" Cc: , References: <20210818125119.GH25257@arm.com> <741b2c48-a754-51c5-cb72-a2f97795e30f@linaro.org> In-Reply-To: <741b2c48-a754-51c5-cb72-a2f97795e30f@linaro.org> Subject: [PATCH/2nd version] Re: nextafter() about an order of magnitude slower than trivial implementation Date: Fri, 20 Aug 2021 11:52:50 +0200 Organization: Me, myself & IT MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="----=_NextPart_000_0605_01D795B9.E70323B0" X-Priority: 3 X-MSMail-Priority: Normal X-Mailer: Microsoft Windows Mail 6.0.6002.18197 X-MimeOLE: Produced By Microsoft MimeOLE V6.1.7601.24158 X-ID: EfXDZBZ-Qhjp1-TKhaOp0Y+EfcUNPqPWMovNy4AkICLo3wTj10jOyfbB2-Hw4+ywEv X-TOI-EXPURGATEID: 150726::1629453473-00010D54-ADF2E2A3/0/0 CLEAN NORMAL X-TOI-MSGID: 0f09ab59-575a-46e2-904e-8d8db810f52c X-Spam-Status: No, score=-1.1 required=5.0 tests=BAYES_00, FREEMAIL_FROM, KAM_DMARC_STATUS, KAM_LAZY_DOMAIN_SECURITY, RCVD_IN_DNSWL_NONE, RCVD_IN_MSPIKE_H3, RCVD_IN_MSPIKE_WL, SPF_HELO_NONE, SPF_NONE, TXREP autolearn=no autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-help@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-help mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 20 Aug 2021 09:58:07 -0000 This is a multi-part message in MIME format. ------=_NextPart_000_0605_01D795B9.E70323B0 Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: 7bit Adhemerval Zanella wrote: > On 18/08/2021 14:11, Stefan Kanthak wrote: >> Szabolcs Nagy wrote: >> >>> The 08/16/2021 18:03, Stefan Kanthak wrote: >>>> Testing and benchmarking an exponential function that consumes >>>> about 10ns/call on an AMD EPYC 7262, I noticed that nextafter() >>>> itself is DEAD SLOW: it consumes about 8.5ns/call! >>>> >>>> The following (quick&dirty) implementation consumes 0.7ns/call, >>>> i.e. is about an order of magnitude faster: >>> >>> correctly measuring latency on a modern x86_64 core: >>> >>> musl: 3.16 ns >>> glibc: 5.68 ns >>> your: 5.72 ns > > Thanks for bring this up, if you want to contribute a patch See attachment. For completeness sake and to ease an initial review, I post the full source here too: without ALTERNATE defined, it uses integer arithmetic to the extent possible, like the original. Choose whatever you like best or gives better throughput/latency. Changes from the 1st version of the patch: - leftover FP comparision substituted with integer operations; - increment/decrement logic simplified/unified, avoiding a conditional branch. I expect the compiler to detect the common clause "&& ((xx.s < 0) == (yy.s < 0)))" introduced with these changes. Stefan JFTR: .../sysdeps/ieee754/flt-32/s_nextafterf.c as well as .../sysdeps/i386/fpu/s_nextafterl.c will need the same changes. --- .../math/s_nextafter.c --- /* IEEE functions * nextafter(x,y) * return the next machine floating-point number of x in the * direction toward y. * Special cases: */ /* Ugly hack so that the aliasing works. */ #define __nexttoward __internal___nexttoward #define nexttoward __internal_nexttoward #include #include #include #include #include #include double __nextafter(double x, double y) { union { double d; int64_t s; uint64_t u; } xx = {x}, yy = {y}; uint64_t ax = xx.u + xx.u; /* |x|<<1 */ uint64_t ay = yy.u + yy.u; /* |y|<<1 */ if(ax > 0x7ffull << 53) /* x is nan */ return x; if(ay > 0x7ffull << 53) /* y is nan */ return y; #ifdef ALTERNATE if(x == y) return y; /* x=y, return y */ #else if((ax == ay) && ((xx.s < 0) == (yy.s < 0))) return y; #endif if(ax == 0) { /* x == 0 */ xx.u = yy.s < 0 ? 0x8000000000000001ull : 1ull; xx.d = math_opt_barrier (xx.d); xx.d += 0.0; math_force_eval (xx.d); /* raise underflow flag */ return xx.d; /* return +-minsubnormal */ } #ifdef ALTERNATE if((x < y) == (x < 0.0)) --xx.u; else ++xx.u; #else if((ax < ay) && ((xx.s < 0) == (yy.s < 0))) ++xx.u; else --xx.u; #endif ax = xx.u + xx.u; /* |x|<<1 */ if((ax >= 0x7ffull << 53) /* overflow */ || (ax < 0x001ull << 53)) { /* underflow */ xx.d = math_opt_barrier (xx.d); xx.d += 0.0; math_force_eval (xx.d); /* raise overflow/underflow flag */ __set_errno (ERANGE); } return xx.d; } libm_alias_double (__nextafter, nextafter) #ifdef NO_LONG_DOUBLE strong_alias (__nextafter, __nexttowardl) weak_alias (__nexttowardl, nexttowardl) #undef __nexttoward strong_alias (__nextafter, __nexttoward) #undef nexttoward weak_alias (__nextafter, nexttoward) #endif --- EOF --- ------=_NextPart_000_0605_01D795B9.E70323B0 Content-Type: application/octet-stream; name="glibc.patch" Content-Transfer-Encoding: quoted-printable Content-Disposition: attachment; filename="glibc.patch" --- -/math/s_nextafter.c=0A= +++ +/math/s_nextafter.c=0A= @@ -1,19 +1,3 @@=0A= -/* @(#)s_nextafter.c 5.1 93/09/24 */=0A= -/*=0A= - * = =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=0A= - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.=0A= - *=0A= - * Developed at SunPro, a Sun Microsystems, Inc. business.=0A= - * Permission to use, copy, modify, and distribute this=0A= - * software is freely granted, provided that this notice=0A= - * is preserved.=0A= - * = =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=0A= - */=0A= -=0A= -#if defined(LIBM_SCCS) && !defined(lint)=0A= -static char rcsid[] =3D "$NetBSD: s_nextafter.c,v 1.8 1995/05/10 = 20:47:58 jtc Exp $";=0A= -#endif=0A= -=0A= /* IEEE functions=0A= * nextafter(x,y)=0A= * return the next machine floating-point number of x in the=0A= @@ -34,56 +18,55 @@=0A= =0A= double __nextafter(double x, double y)=0A= {=0A= - int32_t hx,hy,ix,iy;=0A= - uint32_t lx,ly;=0A= + union {=0A= + double d;=0A= + int64_t s;=0A= + uint64_t u;=0A= + } xx =3D {x}, yy =3D {y};=0A= =0A= - EXTRACT_WORDS(hx,lx,x);=0A= - EXTRACT_WORDS(hy,ly,y);=0A= - ix =3D hx&0x7fffffff; /* |x| */=0A= - iy =3D hy&0x7fffffff; /* |y| */=0A= + uint64_t ax =3D xx.u + xx.u; /* |x|<<1 */=0A= + uint64_t ay =3D yy.u + yy.u; /* |y|<<1 */=0A= =0A= - if(((ix>=3D0x7ff00000)&&((ix-0x7ff00000)|lx)!=3D0) || /* x is nan */=0A= - ((iy>=3D0x7ff00000)&&((iy-0x7ff00000)|ly)!=3D0)) /* y is nan */=0A= - return x+y;=0A= - if(x=3D=3Dy) return y; /* x=3Dy, return y */=0A= - if((ix|lx)=3D=3D0) { /* x =3D=3D 0 */=0A= - double u;=0A= - INSERT_WORDS(x,hy&0x80000000,1); /* return +-minsubnormal */=0A= - u =3D math_opt_barrier (x);=0A= - u =3D u*u;=0A= - math_force_eval (u); /* raise underflow flag */=0A= + if(ax > 0x7ffull << 53) /* x is nan */=0A= return x;=0A= + if(ay > 0x7ffull << 53) /* y is nan */=0A= + return y;=0A= +#ifdef ALTERNATE=0A= + if(x =3D=3D y) return y; /* x=3Dy, return y */=0A= +#else=0A= + if((ax =3D=3D ay)=0A= + && ((xx.s < 0) =3D=3D (yy.s < 0)))=0A= + return y;=0A= +#endif=0A= + if(ax =3D=3D 0) { /* x =3D=3D 0 */=0A= + xx.u =3D yy.s < 0 ? 0x8000000000000001ull : 1ull;=0A= + xx.d =3D math_opt_barrier (xx.d);=0A= + xx.d +=3D 0.0;=0A= + math_force_eval (xx.d); /* raise underflow flag */=0A= + return xx.d; /* return +-minsubnormal */=0A= }=0A= - if(hx>=3D0) { /* x > 0 */=0A= - if(hx>hy||((hx=3D=3Dhy)&&(lx>ly))) { /* x > y, x -=3D ulp */=0A= - if(lx=3D=3D0) hx -=3D 1;=0A= - lx -=3D 1;=0A= - } else { /* x < y, x +=3D ulp */=0A= - lx +=3D 1;=0A= - if(lx=3D=3D0) hx +=3D 1;=0A= - }=0A= - } else { /* x < 0 */=0A= - if(hy>=3D0||hx>hy||((hx=3D=3Dhy)&&(lx>ly))){/* x < y, x -=3D ulp */=0A= - if(lx=3D=3D0) hx -=3D 1;=0A= - lx -=3D 1;=0A= - } else { /* x > y, x +=3D ulp */=0A= - lx +=3D 1;=0A= - if(lx=3D=3D0) hx +=3D 1;=0A= - }=0A= +#ifdef ALTERNATE=0A= + if((x < y) =3D=3D (x < 0.0))=0A= + --xx.u;=0A= + else=0A= + ++xx.u;=0A= }=0A= - hy =3D hx&0x7ff00000;=0A= - if(hy>=3D0x7ff00000) {=0A= - double u =3D x+x; /* overflow */=0A= - math_force_eval (u);=0A= - __set_errno (ERANGE);=0A= +#else=0A= + if((ax < ay)=0A= + && ((xx.s < 0) =3D=3D (yy.s < 0)))=0A= + ++xx.u;=0A= + else=0A= + --xx.u;=0A= +#endif=0A= + ax =3D xx.u + xx.u; /* |x|<<1 */=0A= + if((ax >=3D 0x7ffull << 53) /* overflow */=0A= + || (ax < 0x001ull << 53)) { /* underflow */=0A= + xx.d =3D math_opt_barrier (xx.d);=0A= + xx.d +=3D 0.0;=0A= + math_force_eval (xx.d); /* raise overflow/underflow flag */=0A= + __set_errno (ERANGE);=0A= }=0A= - if(hy<0x00100000) {=0A= - double u =3D x*x; /* underflow */=0A= - math_force_eval (u); /* raise underflow flag */=0A= - __set_errno (ERANGE);=0A= - }=0A= - INSERT_WORDS(x,hx,lx);=0A= - return x;=0A= + return xx.d;=0A= }=0A= libm_alias_double (__nextafter, nextafter)=0A= #ifdef NO_LONG_DOUBLE=0A= ------=_NextPart_000_0605_01D795B9.E70323B0--