* RFC: Improve hypot performance
@ 2021-11-17 15:58 Wilco Dijkstra
2021-11-17 19:02 ` Paul A. Clarke
0 siblings, 1 reply; 6+ messages in thread
From: Wilco Dijkstra @ 2021-11-17 15:58 UTC (permalink / raw)
To: Adhemerval Zanella; +Cc: 'GNU C Library'
Hi Adhemerval,
Here is an early version of a much faster hypot implementation. It uses fma
to significantly outperform the powerpc version both in throughput and
latency. It has a worst-case ULP of ~0.949 and passes the testsuite. The
powerpc version has a worst-case ULP of ~1.21 and several test failures.
It applies on top of your hypot patch series. I didn't optimize the non-fma
case since modern targets have fma. It'll be interesting to compare it on
Power. You'll need to correctly set FAST_FMINMAX to indicate support for
inlined fmin/fmax instructions (this will be added to math_private.h for
targets that have it), without it the code tries to use a conditional move
since using a branch here is really bad for performance.
Cheers,
Wilco
---
diff --git a/sysdeps/ieee754/dbl-64/e_hypot.c b/sysdeps/ieee754/dbl-64/e_hypot.c
index d20bc3e3657e350a1103a8f8477db35ee60399e0..3906711788cff5d66725e5879bb4d6c36dd24dc7 100644
--- a/sysdeps/ieee754/dbl-64/e_hypot.c
+++ b/sysdeps/ieee754/dbl-64/e_hypot.c
@@ -37,73 +37,40 @@
#include <math-svid-compat.h>
#include <errno.h>
+#define FAST_FMINMAX 1
+//#undef __FP_FAST_FMA
+
+#define SCALE 0x1p-600
+#define LARGE_VAL 0x1p+511
+#define TINY_VAL 0x1p-459
+#define EPS 0x1p-54
+
+
static inline double
handle_errno (double r)
{
+ r = math_narrow_eval (r);
if (isinf (r))
__set_errno (ERANGE);
return r;
}
-/* sqrt (DBL_EPSILON / 2.0) */
-#define SQRT_EPS_DIV_2 0x1.6a09e667f3bcdp-27
-/* DBL_MIN / (sqrt (DBL_EPSILON / 2.0)) */
-#define DBL_MIN_THRESHOLD 0x1.6a09e667f3bcdp-996
-/* eps (double) * sqrt (DBL_MIN)) */
-#define SCALE 0x1p-563
-/* 1 / eps (sqrt (DBL_MIN) */
-#define INV_SCALE 0x1p+563
-/* sqrt (DBL_MAX) */
-#define SQRT_DBL_MAX 0x1.6a09e667f3bccp+511
-/* sqrt (DBL_MIN) */
-#define SQRT_DBL_MIN 0x1p-511
-
-double
-__hypot (double x, double y)
+static inline double
+kernel (double ax, double ay)
{
- if ((isinf (x) || isinf (y))
- && !issignaling (x) && !issignaling (y))
- return INFINITY;
- if (isnan (x) || isnan (y))
- return x + y;
-
- double ax = fabs (x);
- double ay = fabs (y);
- if (ay > ax)
- {
- double tmp = ax;
- ax = ay;
- ay = tmp;
- }
-
- /* Widely varying operands. The DBL_MIN_THRESHOLD check is used to avoid
- a spurious underflow from the multiplication. */
- if (ax >= DBL_MIN_THRESHOLD && ay <= ax * SQRT_EPS_DIV_2)
- return (ay == 0.0)
- ? ax
- : handle_errno (math_narrow_eval (ax + DBL_TRUE_MIN));
+ double t1, t2;
+#ifdef __FP_FAST_FMA
+ t1 = ay + ay;
+ t2 = ax - ay;
- double scale = SCALE;
- if (ax > SQRT_DBL_MAX)
- {
- ax *= scale;
- ay *= scale;
- scale = INV_SCALE;
- }
- else if (ay < SQRT_DBL_MIN)
- {
- ax /= scale;
- ay /= scale;
- }
+ if (t1 >= ax)
+ return sqrt (fma (t1, ax, t2 * t2));
else
- scale = 1.0;
-
+ return sqrt (fma (ax, ax, ay * ay));
+#else
double h = sqrt (ax * ax + ay * ay);
- double t1, t2;
- if (h == 0.0)
- return h;
- else if (h <= 2.0 * ay)
+ if (h <= 2.0 * ay)
{
double delta = h - ay;
t1 = ax * (2.0 * delta - ax);
@@ -112,14 +79,57 @@ __hypot (double x, double y)
else
{
double delta = h - ax;
- t1 = 2.0 * delta * (ax - 2 * ay);
+ t1 = 2.0 * delta * (ax - 2.0 * ay);
t2 = (4.0 * delta - ay) * ay + delta * delta;
}
h -= (t1 + t2) / (2.0 * h);
- h = math_narrow_eval (h * scale);
- math_check_force_underflow_nonneg (h);
- return handle_errno (h);
+ return h;
+#endif
+}
+
+
+double
+__hypot (double x, double y)
+{
+ if (!isfinite (x) || !isfinite (y))
+ {
+ if ((isinf (x) || isinf (y))
+ && !issignaling_inline (x) && !issignaling_inline (y))
+ return INFINITY;
+ return x + y;
+ }
+
+ x = fabs (x);
+ y = fabs (y);
+
+ double ax = FAST_FMINMAX ? fmax (x, y) : (x < y ? y : x);
+ double ay = FAST_FMINMAX ? fmin (x, y) : (x < y ? x : y);
+
+ if (__glibc_unlikely (ax > LARGE_VAL))
+ {
+ if (__glibc_unlikely (ay <= ax * EPS))
+ return handle_errno (ax + ay);
+
+ return handle_errno (kernel (ax * SCALE, ay * SCALE) / SCALE);
+ }
+
+ if (__glibc_unlikely (ay < TINY_VAL))
+ {
+ if (__glibc_unlikely (ax >= ay / EPS))
+ return math_narrow_eval (ax + ay);
+
+ ax = math_narrow_eval (kernel (ax / SCALE, ay / SCALE) * SCALE);
+ math_check_force_underflow_nonneg (ax);
+ return ax;
+ }
+
+ /* Common case: ax is not huge and ay is not tiny. */
+ if (__glibc_unlikely (ay <= ax * EPS))
+ return math_narrow_eval (ax + ay);
+
+ return math_narrow_eval (kernel (ax, ay));
}
+
strong_alias (__hypot, __ieee754_hypot)
libm_alias_finite (__ieee754_hypot, __hypot)
#if LIBM_SVID_COMPAT
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: RFC: Improve hypot performance
2021-11-17 15:58 RFC: Improve hypot performance Wilco Dijkstra
@ 2021-11-17 19:02 ` Paul A. Clarke
2021-11-18 12:37 ` Wilco Dijkstra
0 siblings, 1 reply; 6+ messages in thread
From: Paul A. Clarke @ 2021-11-17 19:02 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: Adhemerval Zanella, 'GNU C Library'
On Wed, Nov 17, 2021 at 03:58:53PM +0000, Wilco Dijkstra via Libc-alpha wrote:
> Hi Adhemerval,
>
> Here is an early version of a much faster hypot implementation. It uses fma
> to significantly outperform the powerpc version both in throughput and
> latency. It has a worst-case ULP of ~0.949 and passes the testsuite. The
> powerpc version has a worst-case ULP of ~1.21 and several test failures.
>
> It applies on top of your hypot patch series. I didn't optimize the non-fma
> case since modern targets have fma. It'll be interesting to compare it on
> Power. You'll need to correctly set FAST_FMINMAX to indicate support for
> inlined fmin/fmax instructions (this will be added to math_private.h for
> targets that have it), without it the code tries to use a conditional move
> since using a branch here is really bad for performance.
On Power10, this implementation is still has a large delta compared to the
current implementation:
current on Power10:
"hypot": {
"workload-random": {
"duration": 5.27306e+08,
"iterations": 4.8e+07,
"reciprocal-throughput": 8.26834,
"latency": 13.7027,
"max-throughput": 1.20943e+08,
"min-throughput": 7.29781e+07
}
}
with Adhemerval's patches and this patch:
"hypot": {
"workload-random": {
"duration": 5.34629e+08,
"iterations": 3.6e+07,
"reciprocal-throughput": 12.5006,
"latency": 17.201,
"max-throughput": 7.99959e+07,
"min-throughput": 5.81362e+07
}
}
PC
> ---
>
> diff --git a/sysdeps/ieee754/dbl-64/e_hypot.c b/sysdeps/ieee754/dbl-64/e_hypot.c
> index d20bc3e3657e350a1103a8f8477db35ee60399e0..3906711788cff5d66725e5879bb4d6c36dd24dc7 100644
> --- a/sysdeps/ieee754/dbl-64/e_hypot.c
> +++ b/sysdeps/ieee754/dbl-64/e_hypot.c
> @@ -37,73 +37,40 @@
> #include <math-svid-compat.h>
> #include <errno.h>
>
> +#define FAST_FMINMAX 1
> +//#undef __FP_FAST_FMA
> +
> +#define SCALE 0x1p-600
> +#define LARGE_VAL 0x1p+511
> +#define TINY_VAL 0x1p-459
> +#define EPS 0x1p-54
> +
> +
> static inline double
> handle_errno (double r)
> {
> + r = math_narrow_eval (r);
> if (isinf (r))
> __set_errno (ERANGE);
> return r;
> }
>
> -/* sqrt (DBL_EPSILON / 2.0) */
> -#define SQRT_EPS_DIV_2 0x1.6a09e667f3bcdp-27
> -/* DBL_MIN / (sqrt (DBL_EPSILON / 2.0)) */
> -#define DBL_MIN_THRESHOLD 0x1.6a09e667f3bcdp-996
> -/* eps (double) * sqrt (DBL_MIN)) */
> -#define SCALE 0x1p-563
> -/* 1 / eps (sqrt (DBL_MIN) */
> -#define INV_SCALE 0x1p+563
> -/* sqrt (DBL_MAX) */
> -#define SQRT_DBL_MAX 0x1.6a09e667f3bccp+511
> -/* sqrt (DBL_MIN) */
> -#define SQRT_DBL_MIN 0x1p-511
> -
> -double
> -__hypot (double x, double y)
> +static inline double
> +kernel (double ax, double ay)
> {
> - if ((isinf (x) || isinf (y))
> - && !issignaling (x) && !issignaling (y))
> - return INFINITY;
> - if (isnan (x) || isnan (y))
> - return x + y;
> -
> - double ax = fabs (x);
> - double ay = fabs (y);
> - if (ay > ax)
> - {
> - double tmp = ax;
> - ax = ay;
> - ay = tmp;
> - }
> -
> - /* Widely varying operands. The DBL_MIN_THRESHOLD check is used to avoid
> - a spurious underflow from the multiplication. */
> - if (ax >= DBL_MIN_THRESHOLD && ay <= ax * SQRT_EPS_DIV_2)
> - return (ay == 0.0)
> - ? ax
> - : handle_errno (math_narrow_eval (ax + DBL_TRUE_MIN));
> + double t1, t2;
> +#ifdef __FP_FAST_FMA
> + t1 = ay + ay;
> + t2 = ax - ay;
>
> - double scale = SCALE;
> - if (ax > SQRT_DBL_MAX)
> - {
> - ax *= scale;
> - ay *= scale;
> - scale = INV_SCALE;
> - }
> - else if (ay < SQRT_DBL_MIN)
> - {
> - ax /= scale;
> - ay /= scale;
> - }
> + if (t1 >= ax)
> + return sqrt (fma (t1, ax, t2 * t2));
> else
> - scale = 1.0;
> -
> + return sqrt (fma (ax, ax, ay * ay));
> +#else
> double h = sqrt (ax * ax + ay * ay);
>
> - double t1, t2;
> - if (h == 0.0)
> - return h;
> - else if (h <= 2.0 * ay)
> + if (h <= 2.0 * ay)
> {
> double delta = h - ay;
> t1 = ax * (2.0 * delta - ax);
> @@ -112,14 +79,57 @@ __hypot (double x, double y)
> else
> {
> double delta = h - ax;
> - t1 = 2.0 * delta * (ax - 2 * ay);
> + t1 = 2.0 * delta * (ax - 2.0 * ay);
> t2 = (4.0 * delta - ay) * ay + delta * delta;
> }
> h -= (t1 + t2) / (2.0 * h);
> - h = math_narrow_eval (h * scale);
> - math_check_force_underflow_nonneg (h);
> - return handle_errno (h);
> + return h;
> +#endif
> +}
> +
> +
> +double
> +__hypot (double x, double y)
> +{
> + if (!isfinite (x) || !isfinite (y))
> + {
> + if ((isinf (x) || isinf (y))
> + && !issignaling_inline (x) && !issignaling_inline (y))
> + return INFINITY;
> + return x + y;
> + }
> +
> + x = fabs (x);
> + y = fabs (y);
> +
> + double ax = FAST_FMINMAX ? fmax (x, y) : (x < y ? y : x);
> + double ay = FAST_FMINMAX ? fmin (x, y) : (x < y ? x : y);
> +
> + if (__glibc_unlikely (ax > LARGE_VAL))
> + {
> + if (__glibc_unlikely (ay <= ax * EPS))
> + return handle_errno (ax + ay);
> +
> + return handle_errno (kernel (ax * SCALE, ay * SCALE) / SCALE);
> + }
> +
> + if (__glibc_unlikely (ay < TINY_VAL))
> + {
> + if (__glibc_unlikely (ax >= ay / EPS))
> + return math_narrow_eval (ax + ay);
> +
> + ax = math_narrow_eval (kernel (ax / SCALE, ay / SCALE) * SCALE);
> + math_check_force_underflow_nonneg (ax);
> + return ax;
> + }
> +
> + /* Common case: ax is not huge and ay is not tiny. */
> + if (__glibc_unlikely (ay <= ax * EPS))
> + return math_narrow_eval (ax + ay);
> +
> + return math_narrow_eval (kernel (ax, ay));
> }
> +
> strong_alias (__hypot, __ieee754_hypot)
> libm_alias_finite (__ieee754_hypot, __hypot)
> #if LIBM_SVID_COMPAT
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: RFC: Improve hypot performance
2021-11-17 19:02 ` Paul A. Clarke
@ 2021-11-18 12:37 ` Wilco Dijkstra
2021-11-18 15:43 ` Paul A. Clarke
0 siblings, 1 reply; 6+ messages in thread
From: Wilco Dijkstra @ 2021-11-18 12:37 UTC (permalink / raw)
To: Paul A. Clarke; +Cc: Adhemerval Zanella, 'GNU C Library'
Hi Paul,
> On Power10, this implementation is still has a large delta compared to the
> current implementation:
It's obvious something went wrong here since it is slower than Adhemerval's
version - did you look at the generated code?
It should use fma, and no division or function calls (if you see calls to fmin/fmax
you need to set FAST_FMINMAX to 0).
Cheers,
Wilco
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: RFC: Improve hypot performance
2021-11-18 12:37 ` Wilco Dijkstra
@ 2021-11-18 15:43 ` Paul A. Clarke
2021-11-18 16:15 ` Paul A. Clarke
0 siblings, 1 reply; 6+ messages in thread
From: Paul A. Clarke @ 2021-11-18 15:43 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: Adhemerval Zanella, 'GNU C Library'
On Thu, Nov 18, 2021 at 12:37:24PM +0000, Wilco Dijkstra wrote:
> > On Power10, this implementation is still has a large delta compared to the
> > current implementation:
>
> It's obvious something went wrong here since it is slower than Adhemerval's
> version - did you look at the generated code?
>
> It should use fma, and no division or function calls (if you see calls to fmin/fmax
> you need to set FAST_FMINMAX to 0).
Ugh. That was it. I missed that subtlety. (Why would I not want "FAST" to be
true? :-)
Anyway, with FAST_FMINMAX set to 1, the results are much better, and compare
favorably with the current implementation:
current:
"hypot": {
"workload-random": {
"duration": 5.26088e+08,
"iterations": 4.8e+07,
"reciprocal-throughput": 8.2599,
"latency": 13.6604,
"max-throughput": 1.21067e+08,
"min-throughput": 7.32042e+07
}
}
with Adhemerval's patch and this patch with FAST_FMINMAX=1:
"hypot": {
"workload-random": {
"duration": 5.30541e+08,
"iterations": 5.2e+07,
"reciprocal-throughput": 7.22461,
"latency": 13.1808,
"max-throughput": 1.38416e+08,
"min-throughput": 7.58679e+07
}
}
PC
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: RFC: Improve hypot performance
2021-11-18 15:43 ` Paul A. Clarke
@ 2021-11-18 16:15 ` Paul A. Clarke
2021-11-18 17:39 ` Wilco Dijkstra
0 siblings, 1 reply; 6+ messages in thread
From: Paul A. Clarke @ 2021-11-18 16:15 UTC (permalink / raw)
To: Wilco Dijkstra, 'GNU C Library'
On Thu, Nov 18, 2021 at 09:43:12AM -0600, Paul A. Clarke via Libc-alpha wrote:
> On Thu, Nov 18, 2021 at 12:37:24PM +0000, Wilco Dijkstra wrote:
> > > On Power10, this implementation is still has a large delta compared to the
> > > current implementation:
> >
> > It's obvious something went wrong here since it is slower than Adhemerval's
> > version - did you look at the generated code?
> >
> > It should use fma, and no division or function calls (if you see calls to fmin/fmax
> > you need to set FAST_FMINMAX to 0).
>
> Ugh. That was it. I missed that subtlety. (Why would I not want "FAST" to be
> true? :-)
>
> Anyway, with FAST_FMINMAX set to 1, the results are much better, and compare
That should say "with FAST_FMINMAX set to *0*".
> favorably with the current implementation:
>
> current:
> "hypot": {
> "workload-random": {
> "duration": 5.26088e+08,
> "iterations": 4.8e+07,
> "reciprocal-throughput": 8.2599,
> "latency": 13.6604,
> "max-throughput": 1.21067e+08,
> "min-throughput": 7.32042e+07
> }
> }
>
> with Adhemerval's patch and this patch with FAST_FMINMAX=1:
> "hypot": {
> "workload-random": {
> "duration": 5.30541e+08,
> "iterations": 5.2e+07,
> "reciprocal-throughput": 7.22461,
> "latency": 13.1808,
> "max-throughput": 1.38416e+08,
> "min-throughput": 7.58679e+07
> }
> }
>
> PC
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: RFC: Improve hypot performance
2021-11-18 16:15 ` Paul A. Clarke
@ 2021-11-18 17:39 ` Wilco Dijkstra
0 siblings, 0 replies; 6+ messages in thread
From: Wilco Dijkstra @ 2021-11-18 17:39 UTC (permalink / raw)
To: Paul A. Clarke; +Cc: 'GNU C Library', Adhemerval Zanella
Hi Paul,
Great, that's about 14% higher throughput and 3.5% lower latency!
Looking in more detail, it appears AArch64 may be the only target
which inlines fmin/fmax with -O2, so the default is now off unless
math_private.h overrides it.
Cheers,
Wilco
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2021-11-18 17:39 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-17 15:58 RFC: Improve hypot performance Wilco Dijkstra
2021-11-17 19:02 ` Paul A. Clarke
2021-11-18 12:37 ` Wilco Dijkstra
2021-11-18 15:43 ` Paul A. Clarke
2021-11-18 16:15 ` Paul A. Clarke
2021-11-18 17:39 ` Wilco Dijkstra
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).