[PATCH/2nd version] Re: nextafter() about an order of magnitude slower than trivial implementation

public inbox for libc-help@sourceware.org
 help / color / mirror / Atom feed

From: "Stefan Kanthak" <stefan.kanthak@t-online.de>
To: "Szabolcs Nagy" <szabolcs.nagy@arm.com>,
	"Adhemerval Zanella" <adhemerval.zanella@linaro.org>
Cc: <libc-help@sourceware.org>, <libc-alpha@sourceware.org>
Subject: [PATCH/2nd version] Re: nextafter() about an order of magnitude slower than trivial implementation
Date: Fri, 20 Aug 2021 11:52:50 +0200	[thread overview]
Message-ID: <AEC2AFE4A2D94C449A5ED31A392459C0@H270> (raw)
In-Reply-To: <741b2c48-a754-51c5-cb72-a2f97795e30f@linaro.org>

[-- Attachment #1: Type: text/plain, Size: 3682 bytes --]

Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:

> On 18/08/2021 14:11, Stefan Kanthak wrote:
>> Szabolcs Nagy <szabolcs.nagy@arm.com> wrote:
>> 
>>> The 08/16/2021 18:03, Stefan Kanthak wrote:
>>>> Testing and benchmarking an exponential function that consumes
>>>> about 10ns/call on an AMD EPYC 7262, I noticed that nextafter()
>>>> itself is DEAD SLOW: it consumes about 8.5ns/call!
>>>>
>>>> The following (quick&dirty) implementation consumes 0.7ns/call,
>>>> i.e. is about an order of magnitude faster:
>>>
>>> correctly measuring latency on a modern x86_64 core:
>>>
>>> musl: 3.16 ns
>>> glibc: 5.68 ns
>>> your: 5.72 ns
> 
> Thanks for bring this up, if you want to contribute a patch

See attachment.
For completeness sake and to ease an initial review, I post the full
source here too: without ALTERNATE defined, it uses integer arithmetic
to the extent possible, like the original.
Choose whatever you like best or gives better throughput/latency.

Changes from the 1st version of the patch:
- leftover FP comparision substituted with integer operations;
- increment/decrement logic simplified/unified, avoiding a
  conditional branch.
I expect the compiler to detect the common clause
 "&& ((xx.s < 0) == (yy.s < 0)))" introduced with these changes.

Stefan

JFTR: .../sysdeps/ieee754/flt-32/s_nextafterf.c as well as
      .../sysdeps/i386/fpu/s_nextafterl.c will need the same changes.

--- .../math/s_nextafter.c ---
/* IEEE functions
 *      nextafter(x,y)
 *      return the next machine floating-point number of x in the
 *      direction toward y.
 *   Special cases:
 */

/* Ugly hack so that the aliasing works.  */
#define __nexttoward __internal___nexttoward
#define nexttoward __internal_nexttoward

#include <errno.h>
#include <math.h>
#include <math-barriers.h>
#include <math_private.h>
#include <float.h>
#include <libm-alias-double.h>

double __nextafter(double x, double y)
{
        union {
            double d;
            int64_t s;
            uint64_t u;
        } xx = {x}, yy = {y};

        uint64_t ax = xx.u + xx.u;      /* |x|<<1 */
        uint64_t ay = yy.u + yy.u;      /* |y|<<1 */

        if(ax > 0x7ffull << 53)         /* x is nan */
            return x;
        if(ay > 0x7ffull << 53)         /* y is nan */
            return y;
#ifdef ALTERNATE
        if(x == y) return y;            /* x=y, return y */
#else
        if((ax == ay)
        && ((xx.s < 0) == (yy.s < 0)))
            return y;
#endif
        if(ax == 0) {                   /* x == 0 */
            xx.u = yy.s < 0 ? 0x8000000000000001ull : 1ull;
            xx.d = math_opt_barrier (xx.d);
            xx.d += 0.0;
            math_force_eval (xx.d);     /* raise underflow flag */
            return xx.d;                /* return +-minsubnormal */
        }
#ifdef ALTERNATE
        if((x < y) == (x < 0.0))
            --xx.u;
        else
            ++xx.u;
#else
        if((ax < ay)
        && ((xx.s < 0) == (yy.s < 0)))
            ++xx.u;
        else
            --xx.u;
#endif
        ax = xx.u + xx.u;               /* |x|<<1 */
        if((ax >= 0x7ffull << 53)       /* overflow  */
        || (ax < 0x001ull << 53)) {     /* underflow */
            xx.d = math_opt_barrier (xx.d);
            xx.d += 0.0;
            math_force_eval (xx.d);     /* raise overflow/underflow flag */
            __set_errno (ERANGE);
        }
        return xx.d;
}
libm_alias_double (__nextafter, nextafter)
#ifdef NO_LONG_DOUBLE
strong_alias (__nextafter, __nexttowardl)
weak_alias (__nexttowardl, nexttowardl)
#undef __nexttoward
strong_alias (__nextafter, __nexttoward)
#undef nexttoward
weak_alias (__nextafter, nexttoward)
#endif
--- EOF ---

[-- Attachment #2: glibc.patch --]
[-- Type: application/octet-stream, Size: 3447 bytes --]

--- -/math/s_nextafter.c
+++ +/math/s_nextafter.c
@@ -1,19 +1,3 @@
-/* @(#)s_nextafter.c 5.1 93/09/24 */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
-#if defined(LIBM_SCCS) && !defined(lint)
-static char rcsid[] = "$NetBSD: s_nextafter.c,v 1.8 1995/05/10 20:47:58 jtc Exp $";
-#endif
-
 /* IEEE functions
  *	nextafter(x,y)
  *	return the next machine floating-point number of x in the
@@ -34,56 +18,55 @@
 
 double __nextafter(double x, double y)
 {
-	int32_t hx,hy,ix,iy;
-	uint32_t lx,ly;
+	union {
+	    double d;
+	    int64_t s;
+	    uint64_t u;
+	} xx = {x}, yy = {y};
 
-	EXTRACT_WORDS(hx,lx,x);
-	EXTRACT_WORDS(hy,ly,y);
-	ix = hx&0x7fffffff;		/* |x| */
-	iy = hy&0x7fffffff;		/* |y| */
+	uint64_t ax = xx.u + xx.u;	/* |x|<<1 */
+	uint64_t ay = yy.u + yy.u;	/* |y|<<1 */
 
-	if(((ix>=0x7ff00000)&&((ix-0x7ff00000)|lx)!=0) ||   /* x is nan */
-	   ((iy>=0x7ff00000)&&((iy-0x7ff00000)|ly)!=0))     /* y is nan */
-	   return x+y;
-	if(x==y) return y;		/* x=y, return y */
-	if((ix|lx)==0) {			/* x == 0 */
-	    double u;
-	    INSERT_WORDS(x,hy&0x80000000,1);	/* return +-minsubnormal */
-	    u = math_opt_barrier (x);
-	    u = u*u;
-	    math_force_eval (u);		/* raise underflow flag */
+	if(ax > 0x7ffull << 53)		/* x is nan */
 	    return x;
+	if(ay > 0x7ffull << 53)		/* y is nan */
+	    return y;
+#ifdef ALTERNATE
+	if(x == y) return y;		/* x=y, return y */
+#else
+	if((ax == ay)
+	&& ((xx.s < 0) == (yy.s < 0)))
+	    return y;
+#endif
+	if(ax == 0) {			/* x == 0 */
+	    xx.u = yy.s < 0 ? 0x8000000000000001ull : 1ull;
+	    xx.d = math_opt_barrier (xx.d);
+	    xx.d += 0.0;
+	    math_force_eval (xx.d);	/* raise underflow flag */
+	    return xx.d;		/* return +-minsubnormal */
 	}
-	if(hx>=0) {				/* x > 0 */
-	    if(hx>hy||((hx==hy)&&(lx>ly))) {	/* x > y, x -= ulp */
-		if(lx==0) hx -= 1;
-		lx -= 1;
-	    } else {				/* x < y, x += ulp */
-		lx += 1;
-		if(lx==0) hx += 1;
-	    }
-	} else {				/* x < 0 */
-	    if(hy>=0||hx>hy||((hx==hy)&&(lx>ly))){/* x < y, x -= ulp */
-		if(lx==0) hx -= 1;
-		lx -= 1;
-	    } else {				/* x > y, x += ulp */
-		lx += 1;
-		if(lx==0) hx += 1;
-	    }
+#ifdef ALTERNATE
+	if((x < y) == (x < 0.0))
+	    --xx.u;
+	else
+	    ++xx.u;
 	}
-	hy = hx&0x7ff00000;
-	if(hy>=0x7ff00000) {
-	  double u = x+x;	/* overflow  */
-	  math_force_eval (u);
-	  __set_errno (ERANGE);
+#else
+	if((ax < ay)
+	&& ((xx.s < 0) == (yy.s < 0)))
+	    ++xx.u;
+	else
+	    --xx.u;
+#endif
+	ax = xx.u + xx.u;		/* |x|<<1 */
+	if((ax >= 0x7ffull << 53)	/* overflow  */
+	|| (ax < 0x001ull << 53)) {	/* underflow */
+	  xx.d = math_opt_barrier (xx.d);
+	  xx.d += 0.0;
+	  math_force_eval (xx.d);	/* raise overflow/underflow flag */
+	  __set_errno (ERANGE);
 	}
-	if(hy<0x00100000) {
-	    double u = x*x;			/* underflow */
-	    math_force_eval (u);		/* raise underflow flag */
-	  __set_errno (ERANGE);
-	}
-	INSERT_WORDS(x,hx,lx);
-	return x;
+ 	return xx.d;
 }
 libm_alias_double (__nextafter, nextafter)
 #ifdef NO_LONG_DOUBLE

next prev parent reply	other threads:[~2021-08-20  9:58 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-08-16 16:03 Stefan Kanthak
2021-08-18 12:51 ` Szabolcs Nagy
2021-08-18 17:11   ` Stefan Kanthak
2021-08-19 11:20     ` Adhemerval Zanella
2021-08-19 17:57       ` [PATCH] " Stefan Kanthak
2021-08-20  9:52       ` Stefan Kanthak [this message]
2021-08-20 16:55         ` [PATCH/2nd version] " Joseph Myers
2021-08-20 20:19           ` Stefan Kanthak
2021-08-20 21:03             ` Joseph Myers
2021-08-23 12:50             ` Adhemerval Zanella

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=AEC2AFE4A2D94C449A5ED31A392459C0@H270 \
    --to=stefan.kanthak@t-online.de \
    --cc=adhemerval.zanella@linaro.org \
    --cc=libc-alpha@sourceware.org \
    --cc=libc-help@sourceware.org \
    --cc=szabolcs.nagy@arm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).