From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtpout2.vodafonemail.de (smtpout2.vodafonemail.de [145.253.239.133]) by sourceware.org (Postfix) with ESMTPS id E6A0039ADC0E; Thu, 19 Aug 2021 18:00:04 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org E6A0039ADC0E Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=nexgo.de Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=nexgo.de Received: from smtp.vodafone.de (smtpa05.fra-mediabeam.com [10.2.0.36]) by smtpout2.vodafonemail.de (Postfix) with ESMTP id 7C66C1228A3; Thu, 19 Aug 2021 20:00:03 +0200 (CEST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=nexgo.de; s=vfde-smtpout-mb-15sep; t=1629396003; bh=n3cd+/KNewlLdb29d9D9y4bU+z8iIdP7HmdgdVztVqo=; h=From:To:Cc:References:In-Reply-To:Subject:Date; b=Uqoe/TxZcfMmbPuwgTnWYhr8iPa40viOKWsCXyDyiBBdme9UAqinCN4gTkDqVeFc2 r+AUzBlBzwHXrn8ss+jyjrp1oZSt+h6mkHOJOVcFMzE0YusCQ08S3gKLVYyt+ICLWy HsBb2GUKQVrCvUyIqzP9rCWI5NsvXqzVuV1RTbJE= Received: from H270 (p5b38f1bc.dip0.t-ipconnect.de [91.56.241.188]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-SHA384 (256/256 bits)) (No client certificate requested) by smtp.vodafone.de (Postfix) with ESMTPSA id B0B8C140231; Thu, 19 Aug 2021 18:00:02 +0000 (UTC) Message-ID: From: "Stefan Kanthak" To: "Szabolcs Nagy" , "Adhemerval Zanella" Cc: , References: <20210818125119.GH25257@arm.com> <741b2c48-a754-51c5-cb72-a2f97795e30f@linaro.org> In-Reply-To: <741b2c48-a754-51c5-cb72-a2f97795e30f@linaro.org> Subject: [PATCH] Re: nextafter() about an order of magnitude slower than trivial implementation Date: Thu, 19 Aug 2021 19:57:28 +0200 Organization: Me, myself & IT MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="----=_NextPart_000_04D8_01D79534.7049ED10" X-Priority: 3 X-MSMail-Priority: Normal X-Mailer: Microsoft Windows Mail 6.0.6002.18197 X-MimeOLE: Produced By Microsoft MimeOLE V6.1.7601.24158 X-purgate-type: clean X-purgate-Ad: Categorized by eleven eXpurgate (R) http://www.eleven.de X-purgate: This mail is considered clean (visit http://www.eleven.de for further information) X-purgate: clean X-purgate-size: 8574 X-purgate-ID: 155817::1629396003-00004EF9-6AC23C21/0/0 X-Spam-Status: No, score=-2.7 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, RCVD_IN_DNSWL_LOW, RCVD_IN_MSPIKE_H2, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-help@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-help mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 19 Aug 2021 18:00:16 -0000 This is a multi-part message in MIME format. ------=_NextPart_000_04D8_01D79534.7049ED10 Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: 7bit Adhemerval Zanella wrote: > On 18/08/2021 14:11, Stefan Kanthak wrote: >> Szabolcs Nagy wrote: >> >>> The 08/16/2021 18:03, Stefan Kanthak wrote: >>>> Testing and benchmarking an exponential function that consumes >>>> about 10ns/call on an AMD EPYC 7262, I noticed that nextafter() >>>> itself is DEAD SLOW: it consumes about 8.5ns/call! >>>> >>>> The following (quick&dirty) implementation consumes 0.7ns/call, >>>> i.e. is about an order of magnitude faster: >>> >>> correctly measuring latency on a modern x86_64 core: >>> >>> musl: 3.16 ns >>> glibc: 5.68 ns >>> your: 5.72 ns > > Thanks for bring this up, if you want to contribute a patch See attachment. For completeness sake and to ease an initial review, I post the full source here too: without ALTERNATE defined, it uses integer arithmetic to the extent possible, like the original. Choose whatever you like best or gives better throughput/latency. Stefan JFTR: .../sysdeps/ieee754/flt-32/s_nextafterf.c as well as .../sysdeps/i386/fpu/s_nextafterl.c will need the same changes. --- .../math/s_nextafter.c --- /* IEEE functions * nextafter(x,y) * return the next machine floating-point number of x in the * direction toward y. * Special cases: */ /* Ugly hack so that the aliasing works. */ #define __nexttoward __internal___nexttoward #define nexttoward __internal_nexttoward #include #include #include #include #include #include double __nextafter(double x, double y) { union { double d; int64_t s; uint64_t u; } xx = {x}, yy = {y}; uint64_t ax = xx.u + xx.u; /* |x|<<1 */ uint64_t ay = yy.u + yy.u; /* |y|<<1 */ if(ax > 0x7ffull << 53) /* x is nan */ return x; if(ay > 0x7ffull << 53) /* y is nan */ return y; if(x == y) return y; /* x=y, return y */ if(ax == 0ull) { /* x == 0 */ xx.u = yy.s < 0 ? 0x8000000000000001ull : 1ull; xx.d = math_opt_barrier (xx.d); xx.d += 0.0; math_force_eval (xx.d); /* raise underflow flag */ return xx.d; /* return +-minsubnormal */ } #ifdef ALTERNATE if((x < y) == (x < 0.0)) --xx.u; else ++xx.u; #else if(xx.s < 0) { /* x < 0 */ if(yy.s < 0 && xx.s < yy.s) ++xx.s; /* x < 0, x > y, x += ulp */ else --xx.s; /* x < 0, x < y, x -= ulp */ } else { /* x > 0 */ if(xx.s > yy.s) --xx.s; /* x > 0, x > y, x -= ulp */ else ++xx.s; /* x > 0, x < y, x += ulp */ } #endif ax = xx.u + xx.u; /* |x|<<1 */ if((ax >= 0x7ffull << 53) /* overflow */ || (ax < 0x001ull << 53)) { /* underflow */ xx.d = math_opt_barrier (xx.d); xx.d += 0.0; math_force_eval (xx.d); /* raise overflow/underflow flag */ __set_errno (ERANGE); } return xx.d; } libm_alias_double (__nextafter, nextafter) #ifdef NO_LONG_DOUBLE strong_alias (__nextafter, __nexttowardl) weak_alias (__nexttowardl, nexttowardl) #undef __nexttoward strong_alias (__nextafter, __nexttoward) #undef nexttoward weak_alias (__nextafter, nexttoward) #endif --- EOF --- ------=_NextPart_000_04D8_01D79534.7049ED10 Content-Type: application/octet-stream; name="glibc.patch" Content-Transfer-Encoding: quoted-printable Content-Disposition: attachment; filename="glibc.patch" --- -/math/s_nextafter.c=0A= +++ +/math/s_nextafter.c=0A= @@ -1,19 +1,3 @@=0A= -/* @(#)s_nextafter.c 5.1 93/09/24 */=0A= -/*=0A= - * = =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=0A= - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.=0A= - *=0A= - * Developed at SunPro, a Sun Microsystems, Inc. business.=0A= - * Permission to use, copy, modify, and distribute this=0A= - * software is freely granted, provided that this notice=0A= - * is preserved.=0A= - * = =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=0A= - */=0A= -=0A= -#if defined(LIBM_SCCS) && !defined(lint)=0A= -static char rcsid[] =3D "$NetBSD: s_nextafter.c,v 1.8 1995/05/10 = 20:47:58 jtc Exp $";=0A= -#endif=0A= -=0A= /* IEEE functions=0A= * nextafter(x,y)=0A= * return the next machine floating-point number of x in the=0A= @@ -34,55 +18,53 @@=0A= =0A= double __nextafter(double x, double y)=0A= {=0A= - int32_t hx,hy,ix,iy;=0A= - uint32_t lx,ly;=0A= + union {=0A= + double d;=0A= + int64_t s;=0A= + uint64_t u;=0A= + } xx =3D {x}, yy =3D {y};=0A= =0A= - EXTRACT_WORDS(hx,lx,x);=0A= - EXTRACT_WORDS(hy,ly,y);=0A= - ix =3D hx&0x7fffffff; /* |x| */=0A= - iy =3D hy&0x7fffffff; /* |y| */=0A= + uint64_t ax =3D xx.u + xx.u; /* |x|<<1 */=0A= + uint64_t ay =3D yy.u + yy.u; /* |y|<<1 */=0A= =0A= - if(((ix>=3D0x7ff00000)&&((ix-0x7ff00000)|lx)!=3D0) || /* x is nan */=0A= - ((iy>=3D0x7ff00000)&&((iy-0x7ff00000)|ly)!=3D0)) /* y is nan */=0A= - return x+y;=0A= - if(x=3D=3Dy) return y; /* x=3Dy, return y */=0A= - if((ix|lx)=3D=3D0) { /* x =3D=3D 0 */=0A= - double u;=0A= - INSERT_WORDS(x,hy&0x80000000,1); /* return +-minsubnormal */=0A= - u =3D math_opt_barrier (x);=0A= - u =3D u*u;=0A= - math_force_eval (u); /* raise underflow flag */=0A= + if(ax > 0x7ffull << 53) /* x is nan */=0A= return x;=0A= + if(ay > 0x7ffull << 53) /* y is nan */=0A= + return y;=0A= + if(x =3D=3D y) return y; /* x=3Dy, return y */=0A= + if(ax =3D=3D 0ull) { /* x =3D=3D 0 */=0A= + xx.u =3D yy.s < 0 ? 0x8000000000000001ull : 1ull;=0A= + xx.d =3D math_opt_barrier (xx.d);=0A= + xx.d +=3D 0.0;=0A= + math_force_eval (xx.d); /* raise underflow flag */=0A= + return xx.d; /* return +-minsubnormal */=0A= }=0A= - if(hx>=3D0) { /* x > 0 */=0A= - if(hx>hy||((hx=3D=3Dhy)&&(lx>ly))) { /* x > y, x -=3D ulp */=0A= - if(lx=3D=3D0) hx -=3D 1;=0A= - lx -=3D 1;=0A= - } else { /* x < y, x +=3D ulp */=0A= - lx +=3D 1;=0A= - if(lx=3D=3D0) hx +=3D 1;=0A= - }=0A= - } else { /* x < 0 */=0A= - if(hy>=3D0||hx>hy||((hx=3D=3Dhy)&&(lx>ly))){/* x < y, x -=3D ulp */=0A= - if(lx=3D=3D0) hx -=3D 1;=0A= - lx -=3D 1;=0A= - } else { /* x > y, x +=3D ulp */=0A= - lx +=3D 1;=0A= - if(lx=3D=3D0) hx +=3D 1;=0A= - }=0A= +#ifdef ALTERNATE=0A= + if ((x < y) =3D=3D (x < 0.0))=0A= + --xx.u;=0A= + else=0A= + ++xx.u;=0A= +#else=0A= + if(xx.s < 0) { /* x < 0 */=0A= + if(yy.s < 0 && xx.s < yy.s)=0A= + ++xx.s; /* x < 0, x > y, x +=3D ulp */=0A= + else=0A= + --xx.s; /* x < 0, x < y, x -=3D ulp */=0A= + } else { /* x > 0 */=0A= + if(xx.s > yy.s)=0A= + --xx.s; /* x > 0, x > y, x -=3D ulp */=0A= + else=0A= + ++xx.s; /* x > 0, x < y, x +=3D ulp */=0A= }=0A= - hy =3D hx&0x7ff00000;=0A= - if(hy>=3D0x7ff00000) {=0A= - double u =3D x+x; /* overflow */=0A= - math_force_eval (u);=0A= +#endif=0A= + ax =3D xx.u + xx.u; /* |x|<<1 */=0A= + if((ax >=3D 0x7ffull << 53) /* overflow */=0A= + || (ax < 0x001ull << 53)) { /* underflow */=0A= + xx.d =3D math_opt_barrier (xx.d);=0A= + xx.d +=3D 0.0;=0A= + math_force_eval (xx.d); /* raise overflow/underflow flag */=0A= __set_errno (ERANGE);=0A= }=0A= - if(hy<0x00100000) {=0A= - double u =3D x*x; /* underflow */=0A= - math_force_eval (u); /* raise underflow flag */=0A= - __set_errno (ERANGE);=0A= - }=0A= - INSERT_WORDS(x,hx,lx);=0A= return xx.d;=0A= }=0A= libm_alias_double (__nextafter, nextafter)=0A= ------=_NextPart_000_04D8_01D79534.7049ED10--