Re: [PATCH] math: Improve fmod(f) performance

public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed

From: Adhemerval Zanella Netto <adhemerval.zanella@linaro.org>
To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>,
	'GNU C Library' <libc-alpha@sourceware.org>
Subject: Re: [PATCH] math: Improve fmod(f) performance
Date: Thu, 13 Apr 2023 12:58:19 -0300	[thread overview]
Message-ID: <0baece75-8f99-da08-4094-18f99238cb12@linaro.org> (raw)
In-Reply-To: <PAWPR08MB8982428AD5DD18A711D4F31783989@PAWPR08MB8982.eurprd08.prod.outlook.com>



On 13/04/23 11:29, Wilco Dijkstra wrote:
> Optimize the fast paths (x < y) and (x/y < 2^12).  Delay handling of special cases to reduce
> amount of code executed before the fast paths.  Performance of the close-exponents benchmark
> improves by 18-19% on Cortex-A72 and Neoverse V1. Passes regress.

I am seeing different numbers on x86_64 (Ryzen 9), below are the best 
'mean' from 3 bench-fmod runs:

                    master      patch
subnormals         8.77307    8.83486
normal             284.637    293.503
close-exponents    11.9665    11.3463

So at least with current 'close-exponents' from bench-fmod, which was
generated from exponents between -10 and 10, the gain is more modest
(and normal inputs does show a small regression).  This should be ok, 
but I also think we need to outline that A72 gains might not show on
different hardware.

So maybe also add another bench-fmod set for |x/y| < 2^12 to show
the potential gains.

> 
> ---
> 
> diff --git a/sysdeps/ieee754/dbl-64/e_fmod.c b/sysdeps/ieee754/dbl-64/e_fmod.c
> index caae4e47e2358daced51a22342ec6e34a04f6fce..7c0ef11cb6c53561e64fab4f2ece17692dff3e5f 100644
> --- a/sysdeps/ieee754/dbl-64/e_fmod.c
> +++ b/sysdeps/ieee754/dbl-64/e_fmod.c
> @@ -40,10 +40,10 @@
>  
>     r == x % y == (x % (N * y)) % y
>  
> -   And with mx/my being mantissa of double floating point number (which uses
> +   And with mx/my being mantissa of a double floating point number (which uses
>     less bits than the storage type), on each step the argument reduction can
>     be improved by 11 (which is the size of uint64_t minus MANTISSA_WIDTH plus
> -   the signal bit):
> +   the implicit one bit):
>  
>     mx * 2^ex == 2^11 * mx * 2^(ex - 11)
>  
> @@ -54,7 +54,12 @@
>         mx << 11;
>         ex -= 11;
>         mx %= my;
> -     }  */
> +     }
> +
> +   Special cases:
> +     - If x or y is a NaN, a NaN is returned.
> +     - If x is an infinity, or y is zero, a NaN is returned and EDOM is set.
> +     - If x is +0/-0, and y is not zero, +0/-0 is returned.  */
>  
>  double
>  __fmod (double x, double y)

Ok.

> @@ -67,62 +72,70 @@ __fmod (double x, double y)
>    hx ^= sx;
>    hy &= ~SIGN_MASK;
>  
> -  /* Special cases:
> -     - If x or y is a Nan, NaN is returned.
> -     - If x is an inifinity, a NaN is returned and EDOM is set.
> -     - If y is zero, Nan is returned.
> -     - If x is +0/-0, and y is not zero, +0/-0 is returned.  */
> -  if (__glibc_unlikely (hy == 0
> -			|| hx >= EXPONENT_MASK || hy > EXPONENT_MASK))
> -    {
> -      if (is_nan (hx) || is_nan (hy))
> -	return (x * y) / (x * y);
> -      return __math_edom ((x * y) / (x * y));
> -    }
> -
> -  if (__glibc_unlikely (hx <= hy))
> +  /* If x < y, return x (unless y is a NaN).  */
> +  if (__glibc_likely (hx < hy))
>      {
> -      if (hx < hy)
> -	return x;
> -      return asdouble (sx);
> +      /* If y is a NaN, return a NaN.  */
> +      if (__glibc_unlikely (hy > EXPONENT_MASK))
> +	return x * y;
> +      return x;
>      }
>  
>    int ex = hx >> MANTISSA_WIDTH;
>    int ey = hy >> MANTISSA_WIDTH;
> +  int exp_diff = ex - ey;
> +
> +  /* Common case where exponents are close: |x/y| < 2^12, x not inf/NaN
> +     and |x%y| not denormal.  */
> +  if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH
> +		      && ey > MANTISSA_WIDTH
> +		      && exp_diff <= EXPONENT_WIDTH))
> +    {
> +      uint64_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK;
> +      uint64_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK;
> +
> +      mx %= (my >> exp_diff);
> +
> +      if (__glibc_unlikely (mx == 0))
> +	return asdouble (sx);
> +      int shift = __builtin_clzll (mx);

Maybe clz_uint64 here?

> +      ex -= shift + 1;
> +      mx <<= shift;
> +      mx = sx | (mx >> EXPONENT_WIDTH);
> +      return asdouble (mx + ((uint64_t)ex << MANTISSA_WIDTH));
> +    }
>  
> -  /* Common case where exponents are close: ey >= -907 and |x/y| < 2^52,  */
> -  if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH))
> +  if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK))
>      {
> -      uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1);
> -      uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1);
> +      /* If x is a NaN, return a NaN.  */
> +      if (hx > EXPONENT_MASK)
> +	return x * y;
>  
> -      uint64_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my;
> -      return make_double (d, ey - 1, sx);
> +      /* If x is an infinity or y is zero, return a NaN and set EDOM.  */
> +      return __math_edom ((x * y) / (x * y));
>      }
>  
> -  /* Special case, both x and y are subnormal.  */
> -  if (__glibc_unlikely (ex == 0 && ey == 0))
> +  /* Special case, both x and y are denormal.  */
> +  if (__glibc_unlikely (ex == 0))
>      return asdouble (sx | hx % hy);
>  
> -  /* Convert |x| and |y| to 'mx + 2^ex' and 'my + 2^ey'.  Assume that hx is
> -     not subnormal by conditions above.  */
> +  /* Extract normalized mantissas - hx is not denormal and hy != 0.  */
>    uint64_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1);
> -  ex--;
>    uint64_t my = get_mantissa (hy) | (MANTISSA_MASK + 1);
> -
>    int lead_zeros_my = EXPONENT_WIDTH;
> -  if (__glibc_likely (ey > 0))
> -    ey--;
> -  else
> +
> +  ey--;
> +  /* Special case for denormal y.  */
> +  if (__glibc_unlikely (ey < 0))
>      {
>        my = hy;
> +      ey = 0;
> +      exp_diff--;
>        lead_zeros_my = clz_uint64 (my);
>      }
>  
> -  /* Assume hy != 0  */
>    int tail_zeros_my = ctz_uint64 (my);
>    int sides_zeroes = lead_zeros_my + tail_zeros_my;
> -  int exp_diff = ex - ey;
>  
>    int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my;
>    my >>= right_shift;
> @@ -141,8 +154,7 @@ __fmod (double x, double y)
>    if (exp_diff == 0)
>      return make_double (mx, ey, sx);
>  
> -  /* Assume modulo/divide operation is slow, so use multiplication with invert
> -     values.  */
> +  /* Multiplication with the inverse is faster than repeated modulo.  */
>    uint64_t inv_hy = UINT64_MAX / my;
>    while (exp_diff > sides_zeroes) {
>      exp_diff -= sides_zeroes;

Ok.

> diff --git a/sysdeps/ieee754/flt-32/e_fmodf.c b/sysdeps/ieee754/flt-32/e_fmodf.c
> index 763900efdaa8222431e189e50a7e62baf465a54d..14f3fcae256d03ceb4bf91898a7a003bbfcb854a 100644
> --- a/sysdeps/ieee754/flt-32/e_fmodf.c
> +++ b/sysdeps/ieee754/flt-32/e_fmodf.c
> @@ -40,10 +40,10 @@
>  
>     r == x % y == (x % (N * y)) % y
>  
> -   And with mx/my being mantissa of double floating point number (which uses
> +   And with mx/my being mantissa of a single floating point number (which uses
>     less bits than the storage type), on each step the argument reduction can
>     be improved by 8 (which is the size of uint32_t minus MANTISSA_WIDTH plus
> -   the signal bit):
> +   the implicit one bit):
>  
>     mx * 2^ex == 2^8 * mx * 2^(ex - 8)
>  
> @@ -54,7 +54,12 @@
>         mx << 8;
>         ex -= 8;
>         mx %= my;
> -     }  */
> +     }
> +
> +   Special cases:
> +     - If x or y is a NaN, a NaN is returned.
> +     - If x is an infinity, or y is zero, a NaN is returned and EDOM is set.
> +     - If x is +0/-0, and y is not zero, +0/-0 is returned.  */
>  
>  float
>  __fmodf (float x, float y)
> @@ -67,61 +72,69 @@ __fmodf (float x, float y)
>    hx ^= sx;
>    hy &= ~SIGN_MASK;
>  
> -  /* Special cases:
> -     - If x or y is a Nan, NaN is returned.
> -     - If x is an inifinity, a NaN is returned.
> -     - If y is zero, Nan is returned.
> -     - If x is +0/-0, and y is not zero, +0/-0 is returned.  */
> -  if (__glibc_unlikely (hy == 0
> -			|| hx >= EXPONENT_MASK || hy > EXPONENT_MASK))
> -    {
> -      if (is_nan (hx) || is_nan (hy))
> -	return (x * y) / (x * y);
> -      return __math_edomf ((x * y) / (x * y));
> -    }
> -
> -  if (__glibc_unlikely (hx <= hy))
> +  if (__glibc_likely (hx < hy))
>      {
> -      if (hx < hy)
> -	return x;
> -      return asfloat (sx);
> +      /* If y is a NaN, return a NaN.  */
> +      if (__glibc_unlikely (hy > EXPONENT_MASK))
> +	return x * y;
> +      return x;
>      }
>  
>    int ex = hx >> MANTISSA_WIDTH;
>    int ey = hy >> MANTISSA_WIDTH;
> +  int exp_diff = ex - ey;
> +
> +  /* Common case where exponents are close: |x/y| < 2^9, x not inf/NaN
> +     and |x%y| not denormal.  */
> +  if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH
> +		     && ey > MANTISSA_WIDTH
> +		     && exp_diff <= EXPONENT_WIDTH))
> +    {
> +      uint32_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK;
> +      uint32_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK;
> +
> +      mx %= (my >> exp_diff);
> +
> +      if (__glibc_unlikely (mx == 0))
> +	return asfloat (sx);
> +      int shift = __builtin_clz (mx);
> +      ex -= shift + 1;
> +      mx <<= shift;
> +      mx = sx | (mx >> EXPONENT_WIDTH);
> +      return asfloat (mx + ((uint32_t)ex << MANTISSA_WIDTH));
> +    }
>  
> -  /* Common case where exponents are close: ey >= -103 and |x/y| < 2^8,  */
> -  if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH))
> +  if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK))
>      {
> -      uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1);
> -      uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1);
> +      /* If x is a NaN, return a NaN.  */
> +      if (hx > EXPONENT_MASK)
> +	return x * y;
>  
> -      uint32_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my;
> -      return make_float (d, ey - 1, sx);
> +      /* If x is an infinity or y is zero, return a NaN and set EDOM.  */
> +      return __math_edomf ((x * y) / (x * y));
>      }
>  
> -  /* Special case, both x and y are subnormal.  */
> -  if (__glibc_unlikely (ex == 0 && ey == 0))
> +  /* Special case, both x and y are denormal.  */
> +  if (__glibc_unlikely (ex == 0))
>      return asfloat (sx | hx % hy);
>  
> -  /* Convert |x| and |y| to 'mx + 2^ex' and 'my + 2^ey'.  Assume that hx is
> -     not subnormal by conditions above.  */
> +  /* Extract normalized mantissas - hx is not denormal and hy != 0.  */
>    uint32_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1);
> -  ex--;
> -
>    uint32_t my = get_mantissa (hy) | (MANTISSA_MASK + 1);
>    int lead_zeros_my = EXPONENT_WIDTH;
> -  if (__glibc_likely (ey > 0))
> -    ey--;
> -  else
> +
> +  ey--;
> +  /* Special case for denormal y.  */
> +  if (__glibc_unlikely (ey < 0))
>      {
>        my = hy;
> +      ey = 0;
> +      exp_diff--;
>        lead_zeros_my = __builtin_clz (my);
>      }
>  
>    int tail_zeros_my = __builtin_ctz (my);
>    int sides_zeroes = lead_zeros_my + tail_zeros_my;
> -  int exp_diff = ex - ey;
>  
>    int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my;
>    my >>= right_shift;
> @@ -140,8 +153,7 @@ __fmodf (float x, float y)
>    if (exp_diff == 0)
>      return make_float (mx, ey, sx);
>  
> -  /* Assume modulo/divide operation is slow, so use multiplication with invert
> -     values.  */
> +  /* Multiplication with the inverse is faster than repeated modulo.  */
>    uint32_t inv_hy = UINT32_MAX / my;
>    while (exp_diff > sides_zeroes) {
>      exp_diff -= sides_zeroes;

next prev parent reply	other threads:[~2023-04-13 15:58 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-04-13 14:29 Wilco Dijkstra
2023-04-13 15:58 ` Adhemerval Zanella Netto [this message]
2023-04-13 20:45   ` Wilco Dijkstra
2023-04-13 20:56     ` Adhemerval Zanella Netto

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=0baece75-8f99-da08-4094-18f99238cb12@linaro.org \
    --to=adhemerval.zanella@linaro.org \
    --cc=Wilco.Dijkstra@arm.com \
    --cc=libc-alpha@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).