public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] Improves __ieee754_exp(x) performance by 18-37% when |x| < 1.0397
@ 2018-03-20 23:02 Patrick McGehearty
  2018-03-29 10:38 ` Szabolcs Nagy
  0 siblings, 1 reply; 2+ messages in thread
From: Patrick McGehearty @ 2018-03-20 23:02 UTC (permalink / raw)
  To: libc-alpha

Adds a fast path to e_exp.c when |x| < 1.03972053527832.
When values are tested in isolation, reduction in execution
time is: aarch 30%, sparc 18%, x86 37%.
When comparing benchtests/bench.out which includes values
outside that range, the gains are:
aarch 8%, sparc 5%, x86 9%.

make check is clean (no increase in ulp for any math test).
Testing 20M values for each rounding mode in that range shows
approximately one in 200 values is off by 1 ulp. No value tested
for exp(x) changed by 2 or more ulp.

No observed change in performance or accuracy for x outside
fast path range.

These changes will be active for all platforms that don't provide
their own exp() routines. They will also be active for ieee754
versions of ccos, ccosh, cosh, csin, csinh, sinh, exp10, gamma, and
erf.

ChangeLog:
2018-03-20  Patrick McGehearty <patrick.mcgehearty@oracle.com>

        * sysdeps/ieee754/dbl-64/e_exp.c: faster __ieee754_exp()
        * sysdeps/ieee754/dbl-64/eexp.tbl: New file for e_exp.c
---
 sysdeps/ieee754/dbl-64/e_exp.c  |   48 +++++++++--
 sysdeps/ieee754/dbl-64/eexp.tbl |  172 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 212 insertions(+), 8 deletions(-)
 create mode 100644 sysdeps/ieee754/dbl-64/eexp.tbl

diff --git a/sysdeps/ieee754/dbl-64/e_exp.c b/sysdeps/ieee754/dbl-64/e_exp.c
index 62035a8..e47b8af 100644
--- a/sysdeps/ieee754/dbl-64/e_exp.c
+++ b/sysdeps/ieee754/dbl-64/e_exp.c
@@ -40,6 +40,7 @@
 #include <math_private.h>
 #include <fenv.h>
 #include <float.h>
+#include "eexp.tbl"
 
 #ifndef SECTION
 # define SECTION
@@ -50,8 +51,10 @@ SECTION
 __ieee754_exp (double x)
 {
   double bexp, t, eps, del, base, y, al, bet, res, rem, cor;
+  double z;
   mynumber junk1, junk2, binexp = {{0, 0}};
   int4 i, j, m, n, ex;
+  int4 k;
   double retval;
 
   {
@@ -61,7 +64,42 @@ __ieee754_exp (double x)
     m = junk1.i[HIGH_HALF];
     n = m & hugeint;
 
-    if (n > smallint && n < bigint)
+    if (n < 0x3ff0a2b2)		/* |x| < 1.03972053527832 */
+      {
+	if (n < 0x3f862e42)	/* |x| < 3/2 ln 2 */
+	  {
+	    if (n < 0x3ed00000)	/* |x| < 1/64 ln 2 */
+	      {
+		if (n < 0x3e300000)	/* |x| < 2^18 */
+		  {
+		    retval = one + junk1.x;
+		    goto ret;
+		  }
+		retval = one + junk1.x * (one + half * junk1.x);
+		goto ret;
+	      }
+	    t = junk1.x * junk1.x;
+	    retval = junk1.x + (t * (half + junk1.x * t2) +
+				(t * t) * (t3 + junk1.x * t4 + t * t5));
+	    retval = one + retval;
+	    goto ret;
+	  }
+
+	/* Find the multiple of 2^-6 nearest x.  */
+	k = n >> 20;
+	j = (0x00100000 | (n & 0x000fffff)) >> (0x40c - k);
+	j = (j - 1) & ~1;
+	if (m < 0)
+	  j += 134;
+	z = junk1.x - TBL2[j];
+	t = z * z;
+	retval = z + (t * (half + (z * t2))
+		      + (t * t) * (t3 + z * t4 + t * t5));
+	retval = TBL2[j + 1] + TBL2[j + 1] * retval;
+	goto ret;
+      }
+
+    if (n < bigint)		/* && |x| >= 1.03972053527832 */
       {
 	y = x * log2e.x + three51.x;
 	bexp = y - three51.x;	/*  multiply the result by 2**bexp        */
@@ -74,7 +112,7 @@ __ieee754_exp (double x)
 	y = t + three33.x;
 	base = y - three33.x;	/* t rounded to a multiple of 2**-18      */
 	junk2.x = y;
-	del = (t - base) - eps;	/*  x = bexp*ln(2) + base + del           */
+	del = (t - base) - eps;	/*  x = bexp*ln(2) + base + del	          */
 	eps = del + del * del * (p3.x * del + p2.x);
 
 	binexp.i[HIGH_HALF] = (junk1.i[LOW_HALF] + 1023) << 20;
@@ -94,12 +132,6 @@ __ieee754_exp (double x)
 	goto ret;
       }
 
-    if (n <= smallint)
-      {
-	retval = 1.0;
-	goto ret;
-      }
-
     if (n >= badint)
       {
 	if (n > infint)
diff --git a/sysdeps/ieee754/dbl-64/eexp.tbl b/sysdeps/ieee754/dbl-64/eexp.tbl
new file mode 100644
index 0000000..3b5fe40
--- /dev/null
+++ b/sysdeps/ieee754/dbl-64/eexp.tbl
@@ -0,0 +1,172 @@
+/* EXP function tables - for use in computing double precision exponential
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* For i = 0, ..., 66,
+     TBL2[2*i] is a double precision number near (i+1)*2^-6, and
+     TBL2[2*i+1] = exp(TBL2[2*i]) to within a relative error less
+     than 2^-60.
+
+   For i = 67, ..., 133,
+     TBL2[2*i] is a double precision number near -(i+1)*2^-6, and
+     TBL2[2*i+1] = exp(TBL2[2*i]) to within a relative error less
+     than 2^-60.  */
+
+static const double TBL2[268] = {
+    0x1.ffffffffffc82p-7,   0x1.04080ab55de32p+0,
+    0x1.fffffffffffdbp-6,   0x1.08205601127ecp+0,
+    0x1.80000000000a0p-5,   0x1.0c49236829e91p+0,
+    0x1.fffffffffff79p-5,   0x1.1082b577d34e9p+0,
+    0x1.3fffffffffffcp-4,   0x1.14cd4fc989cd6p+0,
+    0x1.8000000000060p-4,   0x1.192937074e0d4p+0,
+    0x1.c000000000061p-4,   0x1.1d96b0eff0e80p+0,
+    0x1.fffffffffffd6p-4,   0x1.2216045b6f5cap+0,
+    0x1.1ffffffffff58p-3,   0x1.26a7793f6014cp+0,
+    0x1.3ffffffffff75p-3,   0x1.2b4b58b372c65p+0,
+    0x1.5ffffffffff00p-3,   0x1.3001ecf601ad1p+0,
+    0x1.8000000000020p-3,   0x1.34cb8170b583ap+0,
+    0x1.9ffffffffa629p-3,   0x1.39a862bd3b344p+0,
+    0x1.c00000000000fp-3,   0x1.3e98deaa11dcep+0,
+    0x1.e00000000007fp-3,   0x1.439d443f5f16dp+0,
+    0x1.0000000000072p-2,   0x1.48b5e3c3e81abp+0,
+    0x1.0fffffffffecap-2,   0x1.4de30ec211dfbp+0,
+    0x1.1ffffffffff8fp-2,   0x1.5325180cfacd2p+0,
+    0x1.300000000003bp-2,   0x1.587c53c5a7b04p+0,
+    0x1.4000000000034p-2,   0x1.5de9176046007p+0,
+    0x1.4ffffffffff89p-2,   0x1.636bb9a98322fp+0,
+    0x1.5ffffffffffe7p-2,   0x1.690492cbf942ap+0,
+    0x1.6ffffffffff78p-2,   0x1.6eb3fc55b1e45p+0,
+    0x1.7ffffffffff65p-2,   0x1.747a513dbef32p+0,
+    0x1.8ffffffffffd5p-2,   0x1.7a57ede9ea22ep+0,
+    0x1.9ffffffffff6ep-2,   0x1.804d30347b50fp+0,
+    0x1.affffffffffc3p-2,   0x1.865a7772164aep+0,
+    0x1.c000000000053p-2,   0x1.8c802477b0030p+0,
+    0x1.d00000000004dp-2,   0x1.92be99a09bf1ep+0,
+    0x1.e000000000096p-2,   0x1.99163ad4b1e08p+0,
+    0x1.efffffffffefap-2,   0x1.9f876d8e8c4fcp+0,
+    0x1.fffffffffffd0p-2,   0x1.a61298e1e0688p+0,
+    0x1.0800000000002p-1,   0x1.acb82581eee56p+0,
+    0x1.100000000001fp-1,   0x1.b3787dc80f979p+0,
+    0x1.17ffffffffff8p-1,   0x1.ba540dba56e4fp+0,
+    0x1.1fffffffffffap-1,   0x1.c14b431256441p+0,
+    0x1.27fffffffffc4p-1,   0x1.c85e8d43f7c9bp+0,
+    0x1.2fffffffffffdp-1,   0x1.cf8e5d84758a6p+0,
+    0x1.380000000001fp-1,   0x1.d6db26d16cd84p+0,
+    0x1.3ffffffffffd8p-1,   0x1.de455df80e39bp+0,
+    0x1.4800000000052p-1,   0x1.e5cd799c6a59cp+0,
+    0x1.4ffffffffffc8p-1,   0x1.ed73f240dc10cp+0,
+    0x1.5800000000013p-1,   0x1.f539424d90f71p+0,
+    0x1.5ffffffffffbcp-1,   0x1.fd1de6182f885p+0,
+    0x1.680000000002dp-1,   0x1.02912df5ce741p+1,
+    0x1.7000000000040p-1,   0x1.06a39207f0a2ap+1,
+    0x1.780000000004fp-1,   0x1.0ac660691652ap+1,
+    0x1.7ffffffffff6fp-1,   0x1.0ef9db467dcabp+1,
+    0x1.87fffffffffe5p-1,   0x1.133e45d82e943p+1,
+    0x1.9000000000035p-1,   0x1.1793e4652cc6dp+1,
+    0x1.97fffffffffb3p-1,   0x1.1bfafc47bda48p+1,
+    0x1.a000000000000p-1,   0x1.2073d3f1bd518p+1,
+    0x1.a80000000004ap-1,   0x1.24feb2f105ce2p+1,
+    0x1.affffffffffedp-1,   0x1.299be1f3e7f11p+1,
+    0x1.b7ffffffffffbp-1,   0x1.2e4baacdb6611p+1,
+    0x1.c00000000001dp-1,   0x1.330e587b62b39p+1,
+    0x1.c800000000079p-1,   0x1.37e437282d538p+1,
+    0x1.cffffffffff51p-1,   0x1.3ccd943268248p+1,
+    0x1.d7fffffffff74p-1,   0x1.41cabe304cadcp+1,
+    0x1.e000000000011p-1,   0x1.46dc04f4e5343p+1,
+    0x1.e80000000001ep-1,   0x1.4c01b9950a124p+1,
+    0x1.effffffffff9ep-1,   0x1.513c2e6c73196p+1,
+    0x1.f7fffffffffedp-1,   0x1.568bb722dd586p+1,
+    0x1.0000000000034p+0,   0x1.5bf0a8b1457b0p+1,
+    0x1.03fffffffffe2p+0,   0x1.616b5967376dfp+1,
+    0x1.07fffffffff4bp+0,   0x1.66fc20f0337a9p+1,
+    0x1.0bffffffffffdp+0,   0x1.6ca35859290f5p+1,
+   -0x1.fffffffffffe4p-7,   0x1.f80feabfeefa5p-1,
+   -0x1.ffffffffffb0bp-6,   0x1.f03f56a88b5fep-1,
+   -0x1.7ffffffffffa7p-5,   0x1.e88dc6afecfc5p-1,
+   -0x1.ffffffffffea8p-5,   0x1.e0fabfbc702b8p-1,
+   -0x1.3ffffffffffb3p-4,   0x1.d985c89d041acp-1,
+   -0x1.7ffffffffffe3p-4,   0x1.d22e6a0197c06p-1,
+   -0x1.bffffffffff9ap-4,   0x1.caf42e73a4c89p-1,
+   -0x1.fffffffffff98p-4,   0x1.c3d6a24ed822dp-1,
+   -0x1.1ffffffffffe9p-3,   0x1.bcd553b9d7b67p-1,
+   -0x1.3ffffffffffe0p-3,   0x1.b5efd29f24c2dp-1,
+   -0x1.5fffffffff553p-3,   0x1.af25b0a61a9f4p-1,
+   -0x1.7ffffffffff8bp-3,   0x1.a876812c08794p-1,
+   -0x1.9fffffffffe51p-3,   0x1.a1e1d93d68828p-1,
+   -0x1.bffffffffff6ep-3,   0x1.9b674f8f2f3f5p-1,
+   -0x1.dffffffffff7fp-3,   0x1.95067c7837a0cp-1,
+   -0x1.fffffffffff7ap-3,   0x1.8ebef9eac8225p-1,
+   -0x1.0fffffffffffep-2,   0x1.8890636e31f55p-1,
+   -0x1.1ffffffffff41p-2,   0x1.827a56188975ep-1,
+   -0x1.2ffffffffffbap-2,   0x1.7c7c708877656p-1,
+   -0x1.3fffffffffff8p-2,   0x1.769652df22f81p-1,
+   -0x1.4ffffffffff90p-2,   0x1.70c79eba33c2fp-1,
+   -0x1.5ffffffffffdbp-2,   0x1.6b0ff72deb8aap-1,
+   -0x1.6ffffffffff9ap-2,   0x1.656f00bf5798ep-1,
+   -0x1.7ffffffffff9fp-2,   0x1.5fe4615e98eb0p-1,
+   -0x1.8ffffffffffeep-2,   0x1.5a6fc061433cep-1,
+   -0x1.9fffffffffc4ap-2,   0x1.5510c67cd26cdp-1,
+   -0x1.affffffffff30p-2,   0x1.4fc71dc13566bp-1,
+   -0x1.bfffffffffff0p-2,   0x1.4a9271936fd0ep-1,
+   -0x1.cfffffffffff3p-2,   0x1.45726ea84fb8cp-1,
+   -0x1.dfffffffffff3p-2,   0x1.4066c2ff3912bp-1,
+   -0x1.effffffffff80p-2,   0x1.3b6f1ddd05ab9p-1,
+   -0x1.fffffffffffdfp-2,   0x1.368b2fc6f9614p-1,
+   -0x1.0800000000000p-1,   0x1.31baaa7dca843p-1,
+   -0x1.0ffffffffffa4p-1,   0x1.2cfd40f8bdce4p-1,
+   -0x1.17fffffffff0ap-1,   0x1.2852a760d5ce7p-1,
+   -0x1.2000000000000p-1,   0x1.23ba930c1568bp-1,
+   -0x1.27fffffffffbbp-1,   0x1.1f34ba78d568dp-1,
+   -0x1.2fffffffffe32p-1,   0x1.1ac0d5492c1dbp-1,
+   -0x1.37ffffffff042p-1,   0x1.165e9c3e67ef2p-1,
+   -0x1.3ffffffffff77p-1,   0x1.120dc93499431p-1,
+   -0x1.47fffffffff6bp-1,   0x1.0dce171e34ecep-1,
+   -0x1.4fffffffffff1p-1,   0x1.099f41ffbe588p-1,
+   -0x1.57ffffffffe02p-1,   0x1.058106eb8a7aep-1,
+   -0x1.5ffffffffffe5p-1,   0x1.017323fd9002ep-1,
+   -0x1.67fffffffffb0p-1,   0x1.faeab0ae9386cp-2,
+   -0x1.6ffffffffffb2p-1,   0x1.f30ec837503d7p-2,
+   -0x1.77fffffffff7fp-1,   0x1.eb5210d627133p-2,
+   -0x1.7ffffffffffe8p-1,   0x1.e3b40ebefcd95p-2,
+   -0x1.87fffffffffc8p-1,   0x1.dc3448110dae2p-2,
+   -0x1.8fffffffffb30p-1,   0x1.d4d244cf4ef06p-2,
+   -0x1.97fffffffffefp-1,   0x1.cd8d8ed8ee395p-2,
+   -0x1.9ffffffffffa7p-1,   0x1.c665b1e1f1e5cp-2,
+   -0x1.a7fffffffffdcp-1,   0x1.bf5a3b6bf18d6p-2,
+   -0x1.affffffffff95p-1,   0x1.b86ababeef93bp-2,
+   -0x1.b7fffffffffcbp-1,   0x1.b196c0e24d256p-2,
+   -0x1.bffffffffff32p-1,   0x1.aadde095dadf7p-2,
+   -0x1.c7fffffffff6ap-1,   0x1.a43fae4b047c9p-2,
+   -0x1.cffffffffffb6p-1,   0x1.9dbbc01e182a4p-2,
+   -0x1.d7fffffffffcap-1,   0x1.9751adcfa81ecp-2,
+   -0x1.dffffffffffcdp-1,   0x1.910110be0699ep-2,
+   -0x1.e7ffffffffffbp-1,   0x1.8ac983dedbc69p-2,
+   -0x1.effffffffff88p-1,   0x1.84aaa3b8d51a9p-2,
+   -0x1.f7fffffffffbbp-1,   0x1.7ea40e5d6d92ep-2,
+   -0x1.fffffffffffdbp-1,   0x1.78b56362cef53p-2,
+   -0x1.03fffffffff00p+0,   0x1.72de43ddcb1f2p-2,
+   -0x1.07ffffffffe6fp+0,   0x1.6d1e525bed085p-2,
+   -0x1.0bfffffffffd6p+0,   0x1.677532dda1c57p-2};
+
+static const double
+  half = 0.5,
+  one = 1.0,
+/* t2-t5 terms used for polynomial computation.  */
+  t2 = 0x1.5555555555555p-3, /* 1.6666666666666665741e-1 */
+  t3 = 0x1.5555555555555p-5, /* 4.1666666666666664354e-2 */
+  t4 = 0x1.1111111111111p-7, /* 8.3333333333333332177e-3 */
+  t5 = 0x1.6c16c16c16c17p-10; /* 1.3888888888888889419e-3 */
-- 
1.7.1

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] Improves __ieee754_exp(x) performance by 18-37% when |x| < 1.0397
  2018-03-20 23:02 [PATCH] Improves __ieee754_exp(x) performance by 18-37% when |x| < 1.0397 Patrick McGehearty
@ 2018-03-29 10:38 ` Szabolcs Nagy
  0 siblings, 0 replies; 2+ messages in thread
From: Szabolcs Nagy @ 2018-03-29 10:38 UTC (permalink / raw)
  To: Patrick McGehearty; +Cc: libc-alpha

* Patrick McGehearty <patrick.mcgehearty@oracle.com> [2018-03-20 19:01:56 -0400]:
> Adds a fast path to e_exp.c when |x| < 1.03972053527832.
> When values are tested in isolation, reduction in execution
> time is: aarch 30%, sparc 18%, x86 37%.
> When comparing benchtests/bench.out which includes values
> outside that range, the gains are:
> aarch 8%, sparc 5%, x86 9%.
> 
> make check is clean (no increase in ulp for any math test).
> Testing 20M values for each rounding mode in that range shows
> approximately one in 200 values is off by 1 ulp. No value tested
> for exp(x) changed by 2 or more ulp.
> 
> No observed change in performance or accuracy for x outside
> fast path range.
> 
> These changes will be active for all platforms that don't provide
> their own exp() routines. They will also be active for ieee754
> versions of ccos, ccosh, cosh, csin, csinh, sinh, exp10, gamma, and
> erf.
> 
> ChangeLog:
> 2018-03-20  Patrick McGehearty <patrick.mcgehearty@oracle.com>
> 
>         * sysdeps/ieee754/dbl-64/e_exp.c: faster __ieee754_exp()
>         * sysdeps/ieee754/dbl-64/eexp.tbl: New file for e_exp.c

i think this is ok,
further improvements can be done separately.
i don't yet know when i will be able to post my version.

i'd mention that it increases rodata size by about 2k.

(i think the worst case ulp error is < 0.513 somewhere
around 0.5078 or -0.836 which is acceptable)

> @@ -74,7 +112,7 @@ __ieee754_exp (double x)
>  	y = t + three33.x;
>  	base = y - three33.x;	/* t rounded to a multiple of 2**-18      */
>  	junk2.x = y;
> -	del = (t - base) - eps;	/*  x = bexp*ln(2) + base + del           */
> +	del = (t - base) - eps;	/*  x = bexp*ln(2) + base + del	          */

this is a spurious whitespace change.

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2018-03-29 10:38 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-20 23:02 [PATCH] Improves __ieee754_exp(x) performance by 18-37% when |x| < 1.0397 Patrick McGehearty
2018-03-29 10:38 ` Szabolcs Nagy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).