From: Paul Zimmermann <Paul.Zimmermann@inria.fr>
To: Joe Ramsay <Joe.Ramsay@arm.com>
Cc: libc-alpha@sourceware.org, Joe.Ramsay@arm.com
Subject: Re: [PATCH v2] math: Add new exp10 implementation
Date: Fri, 01 Dec 2023 11:51:18 +0100 [thread overview]
Message-ID: <p9u0cyvqurfd.fsf@coriandre.loria.fr> (raw)
In-Reply-To: <20231201094945.49035-1-Joe.Ramsay@arm.com> (message from Joe Ramsay on Fri, 1 Dec 2023 09:49:45 +0000)
Hi Joe,
looks good to me (precision-wise). I didn't check the speed improvement.
Paul
> From: Joe Ramsay <Joe.Ramsay@arm.com>
> CC: Joe Ramsay <Joe.Ramsay@arm.com>
> Date: Fri, 1 Dec 2023 09:49:45 +0000
> NoDisclaimer: true
>
> New implementation is based on the existing exp/exp2, with different
> reduction constants and polynomial. Worst-case error in round-to-
> nearest is 0.513 ULP.
>
> The exp/exp2 shared table is reused for exp10 - .rodata size of
> e_exp_data increases by 64 bytes.
>
> As for exp/exp2, targets with single-instruction rounding/conversion
> intrinsics can use them by toggling TOINT_INTRINSICS=1 and adding the
> necessary code to their math_private.h.
>
> Improvements on Neoverse V1 compared to current GLIBC master:
> exp10 thruput: 3.3x in [-0x1.439b746e36b52p+8 0x1.34413509f79ffp+8]
> exp10 latency: 1.8x in [-0x1.439b746e36b52p+8 0x1.34413509f79ffp+8]
>
> Tested on:
> aarch64-linux-gnu (TOINT_INTRINSICS, fma contraction) and
> x86_64-linux-gnu (!TOINT_INTRINSICS, no fma contraction)
> ---
> Differences from v1:
> * Stop preventing inlining of special-case - no good reason for this,
> in fact performance is slightly better if it is inlined
> * Remove configurable wide poly
> * Update max ULP based on several runs of the subdivide program with
> threshold=1000000
> * Define float constant in hex format
> * No benchtest added, as there is already one for exp10 (I couldn't
> see a way to have multiple intervals in exp10-inputs?). Anyway the
> intervals in the previous version's commit message were chosen
> fairly arbitrarily - added speedup measurement for the interval used
> in exp10-inputs
> Thanks,
> Joe
> sysdeps/ieee754/dbl-64/e_exp10.c | 144 ++++++++++++++++++++++-----
> sysdeps/ieee754/dbl-64/e_exp_data.c | 11 ++
> sysdeps/ieee754/dbl-64/math_config.h | 4 +
> 3 files changed, 135 insertions(+), 24 deletions(-)
>
> diff --git a/sysdeps/ieee754/dbl-64/e_exp10.c b/sysdeps/ieee754/dbl-64/e_exp10.c
> index fa47f4f922..08069140c0 100644
> --- a/sysdeps/ieee754/dbl-64/e_exp10.c
> +++ b/sysdeps/ieee754/dbl-64/e_exp10.c
> @@ -16,36 +16,132 @@
> <https://www.gnu.org/licenses/>. */
>
> #include <math.h>
> +#include <math-barriers.h>
> +#include <math-narrow-eval.h>
> #include <math_private.h>
> #include <float.h>
> #include <libm-alias-finite.h>
> +#include "math_config.h"
>
> -static const double log10_high = 0x2.4d7637p0;
> -static const double log10_low = 0x7.6aaa2b05ba95cp-28;
> +#define N (1 << EXP_TABLE_BITS)
> +#define IndexMask (N - 1)
> +#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */
> +#define UFlowBound -0x1.5ep+8 /* -350. */
> +#define SmallTop 0x3c6 /* top12(0x1p-57). */
> +#define BigTop 0x407 /* top12(0x1p8). */
> +#define Thresh 0x41 /* BigTop - SmallTop. */
> +#define Shift __exp_data.shift
> +#define C(i) __exp_data.exp10_poly[i]
>
> +static double
> +special_case (uint64_t sbits, double_t tmp, uint64_t ki)
> +{
> + double_t scale, y;
> +
> + if (ki - (1ull << 16) < 0x80000000)
> + {
> + /* The exponent of scale might have overflowed by 1. */
> + sbits -= 1ull << 52;
> + scale = asdouble (sbits);
> + y = 2 * (scale + scale * tmp);
> + return check_oflow (y);
> + }
> +
> + /* n < 0, need special care in the subnormal range. */
> + sbits += 1022ull << 52;
> + scale = asdouble (sbits);
> + y = scale + scale * tmp;
> +
> + if (y < 1.0)
> + {
> + /* Round y to the right precision before scaling it into the subnormal
> + range to avoid double rounding that can cause 0.5+E/2 ulp error where
> + E is the worst-case ulp error outside the subnormal range. So this
> + is only useful if the goal is better than 1 ulp worst-case error. */
> + double_t lo = scale - y + scale * tmp;
> + double_t hi = 1.0 + y;
> + lo = 1.0 - hi + y + lo;
> + y = math_narrow_eval (hi + lo) - 1.0;
> + /* Avoid -0.0 with downward rounding. */
> + if (WANT_ROUNDING && y == 0.0)
> + y = 0.0;
> + /* The underflow exception needs to be signaled explicitly. */
> + math_force_eval (math_opt_barrier (0x1p-1022) * 0x1p-1022);
> + }
> + y = 0x1p-1022 * y;
> +
> + return check_uflow (y);
> +}
> +
> +/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP. */
> double
> -__ieee754_exp10 (double arg)
> +__ieee754_exp10 (double x)
> {
> - int32_t lx;
> - double arg_high, arg_low;
> - double exp_high, exp_low;
> -
> - if (!isfinite (arg))
> - return __ieee754_exp (arg);
> - if (arg < DBL_MIN_10_EXP - DBL_DIG - 10)
> - return DBL_MIN * DBL_MIN;
> - else if (arg > DBL_MAX_10_EXP + 1)
> - return DBL_MAX * DBL_MAX;
> - else if (fabs (arg) < 0x1p-56)
> - return 1.0;
> -
> - GET_LOW_WORD (lx, arg);
> - lx &= 0xf8000000;
> - arg_high = arg;
> - SET_LOW_WORD (arg_high, lx);
> - arg_low = arg - arg_high;
> - exp_high = arg_high * log10_high;
> - exp_low = arg_high * log10_low + arg_low * M_LN10;
> - return __ieee754_exp (exp_high) * __ieee754_exp (exp_low);
> + uint64_t ix = asuint64 (x);
> + uint32_t abstop = (ix >> 52) & 0x7ff;
> +
> + if (__glibc_unlikely (abstop - SmallTop >= Thresh))
> + {
> + if (abstop - SmallTop >= 0x80000000)
> + /* Avoid spurious underflow for tiny x.
> + Note: 0 is common input. */
> + return x + 1;
> + if (abstop == 0x7ff)
> + return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0;
> + if (x >= OFlowBound)
> + return __math_oflow (0);
> + if (x < UFlowBound)
> + return __math_uflow (0);
> +
> + /* Large x is special-cased below. */
> + abstop = 0;
> + }
> +
> + /* Reduce x: z = x * N / log10(2), k = round(z). */
> + double_t z = __exp_data.invlog10_2N * x;
> + double_t kd;
> + int64_t ki;
> +#if TOINT_INTRINSICS
> + kd = roundtoint (z);
> + ki = converttoint (z);
> +#else
> + kd = math_narrow_eval (z + Shift);
> + kd -= Shift;
> + ki = kd;
> +#endif
> +
> + /* r = x - k * log10(2), r in [-0.5, 0.5]. */
> + double_t r = x;
> + r = __exp_data.neglog10_2hiN * kd + r;
> + r = __exp_data.neglog10_2loN * kd + r;
> +
> + /* exp10(x) = 2^(k/N) * 2^(r/N).
> + Approximate the two components separately. */
> +
> + /* s = 2^(k/N), using lookup table. */
> + uint64_t e = ki << (52 - EXP_TABLE_BITS);
> + uint64_t i = (ki & IndexMask) * 2;
> + uint64_t u = __exp_data.tab[i + 1];
> + uint64_t sbits = u + e;
> +
> + double_t tail = asdouble (__exp_data.tab[i]);
> +
> + /* 2^(r/N) ~= 1 + r * Poly(r). */
> + double_t r2 = r * r;
> + double_t p = C (0) + r * C (1);
> + double_t y = C (2) + r * C (3);
> + y = y + r2 * C (4);
> + y = p + r2 * y;
> + y = tail + y * r;
> +
> + if (__glibc_unlikely (abstop == 0))
> + return special_case (sbits, y, ki);
> +
> + /* Assemble components:
> + y = 2^(r/N) * 2^(k/N)
> + ~= (y + 1) * s. */
> + double_t s = asdouble (sbits);
> + return s * y + s;
> }
> +
> libm_alias_finite (__ieee754_exp10, __exp10)
> diff --git a/sysdeps/ieee754/dbl-64/e_exp_data.c b/sysdeps/ieee754/dbl-64/e_exp_data.c
> index d498b8bc3b..5c774afbcb 100644
> --- a/sysdeps/ieee754/dbl-64/e_exp_data.c
> +++ b/sysdeps/ieee754/dbl-64/e_exp_data.c
> @@ -58,6 +58,17 @@ const struct exp_data __exp_data = {
> 0x1.5d7e09b4e3a84p-10,
> #endif
> },
> +.invlog10_2N = 0x1.a934f0979a371p1 * N,
> +.neglog10_2hiN = -0x1.3441350ap-2 / N,
> +.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N,
> +.exp10_poly = {
> +/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ]. */
> +0x1.26bb1bbb55516p1,
> +0x1.53524c73ce9fep1,
> +0x1.0470591ce4b26p1,
> +0x1.2bd76577fe684p0,
> +0x1.1446eeccd0efbp-1
> +},
> // 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
> // tab[2*k] = asuint64(T[k])
> // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
> diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
> index 19af33fd86..d617addfbe 100644
> --- a/sysdeps/ieee754/dbl-64/math_config.h
> +++ b/sysdeps/ieee754/dbl-64/math_config.h
> @@ -201,6 +201,10 @@ extern const struct exp_data
> double poly[4]; /* Last four coefficients. */
> double exp2_shift;
> double exp2_poly[EXP2_POLY_ORDER];
> + double invlog10_2N;
> + double neglog10_2hiN;
> + double neglog10_2loN;
> + double exp10_poly[5];
> uint64_t tab[2*(1 << EXP_TABLE_BITS)];
> } __exp_data attribute_hidden;
>
> --
> 2.27.0
>
>
next prev parent reply other threads:[~2023-12-01 10:51 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-12-01 9:49 Joe Ramsay
2023-12-01 10:51 ` Paul Zimmermann [this message]
2023-12-04 15:09 ` Szabolcs Nagy
2023-12-04 15:15 ` Adhemerval Zanella Netto
2023-12-04 15:55 ` Szabolcs Nagy
2023-12-04 16:37 ` Adhemerval Zanella
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=p9u0cyvqurfd.fsf@coriandre.loria.fr \
--to=paul.zimmermann@inria.fr \
--cc=Joe.Ramsay@arm.com \
--cc=libc-alpha@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).