From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <wilco@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1895)
	id 55E853857BB2; Mon, 17 Apr 2023 12:04:53 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 55E853857BB2
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org;
	s=default; t=1681733093;
	bh=zXlLds4O4zAxdKzK/+X2eyI/PoMdaylz2kwY/zdCDIg=;
	h=From:To:Subject:Date:From;
	b=jbfjY0/l/V98HUhGkH9Mjl6SaBrDWjiRzG/jPdLYnc5Qg+Ol/gALYNCSaVmwiQyKr
	 9Waz8zos2/Nl2byPsmBU6t/fApFS0E57oEhuWipQzUkZz+WhIT1O+RMq2llmWJdIoN
	 +t8Xc5eWrcslJ4R2UPOrX0tfRRUvTgq9NJpUYRUQ=
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Wilco Dijkstra <wilco@sourceware.org>
To: glibc-cvs@sourceware.org
Subject: [glibc] math: Improve fmod(f) performance
X-Act-Checkin: glibc
X-Git-Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 2623479105a7e11ccd1e504b3f549cadbb875a42
X-Git-Newrev: 76d0f094dd177e303b36d7b77e21673f244a4b53
Message-Id: <20230417120453.55E853857BB2@sourceware.org>
Date: Mon, 17 Apr 2023 12:04:53 +0000 (GMT)
List-Id: <glibc-cvs.sourceware.org>

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=76d0f094dd177e303b36d7b77e21673f244a4b53

commit 76d0f094dd177e303b36d7b77e21673f244a4b53
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date:   Mon Apr 17 12:42:18 2023 +0100

    math: Improve fmod(f) performance
    
    Optimize the fast paths (x < y) and (x/y < 2^12).  Delay handling of special
    cases to reduce the number of instructions executed before the fast paths.
    Performance improvements for fmod:
    
                    Skylake Zen2    Neoverse V1
    subnormals      11.8%   4.2%    11.5%
    normal          3.9%    0.01%   -0.5%
    close-exponents 6.3%    5.6%    19.4%
    
    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

Diff:
---
 sysdeps/ieee754/dbl-64/e_fmod.c  | 90 +++++++++++++++++++++++-----------------
 sysdeps/ieee754/flt-32/e_fmodf.c | 88 ++++++++++++++++++++++-----------------
 2 files changed, 101 insertions(+), 77 deletions(-)

diff --git a/sysdeps/ieee754/dbl-64/e_fmod.c b/sysdeps/ieee754/dbl-64/e_fmod.c
index caae4e47e2..0f04fdf77e 100644
--- a/sysdeps/ieee754/dbl-64/e_fmod.c
+++ b/sysdeps/ieee754/dbl-64/e_fmod.c
@@ -40,10 +40,10 @@
 
    r == x % y == (x % (N * y)) % y
 
-   And with mx/my being mantissa of double floating point number (which uses
+   And with mx/my being mantissa of a double floating point number (which uses
    less bits than the storage type), on each step the argument reduction can
    be improved by 11 (which is the size of uint64_t minus MANTISSA_WIDTH plus
-   the signal bit):
+   the implicit one bit):
 
    mx * 2^ex == 2^11 * mx * 2^(ex - 11)
 
@@ -54,7 +54,12 @@
        mx << 11;
        ex -= 11;
        mx %= my;
-     }  */
+     }
+
+   Special cases:
+     - If x or y is a NaN, a NaN is returned.
+     - If x is an infinity, or y is zero, a NaN is returned and EDOM is set.
+     - If x is +0/-0, and y is not zero, +0/-0 is returned.  */
 
 double
 __fmod (double x, double y)
@@ -67,62 +72,70 @@ __fmod (double x, double y)
   hx ^= sx;
   hy &= ~SIGN_MASK;
 
-  /* Special cases:
-     - If x or y is a Nan, NaN is returned.
-     - If x is an inifinity, a NaN is returned and EDOM is set.
-     - If y is zero, Nan is returned.
-     - If x is +0/-0, and y is not zero, +0/-0 is returned.  */
-  if (__glibc_unlikely (hy == 0
-			|| hx >= EXPONENT_MASK || hy > EXPONENT_MASK))
-    {
-      if (is_nan (hx) || is_nan (hy))
-	return (x * y) / (x * y);
-      return __math_edom ((x * y) / (x * y));
-    }
-
-  if (__glibc_unlikely (hx <= hy))
+  /* If x < y, return x (unless y is a NaN).  */
+  if (__glibc_likely (hx < hy))
     {
-      if (hx < hy)
-	return x;
-      return asdouble (sx);
+      /* If y is a NaN, return a NaN.  */
+      if (__glibc_unlikely (hy > EXPONENT_MASK))
+	return x * y;
+      return x;
     }
 
   int ex = hx >> MANTISSA_WIDTH;
   int ey = hy >> MANTISSA_WIDTH;
+  int exp_diff = ex - ey;
+
+  /* Common case where exponents are close: |x/y| < 2^12, x not inf/NaN
+     and |x%y| not denormal.  */
+  if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH
+		      && ey > MANTISSA_WIDTH
+		      && exp_diff <= EXPONENT_WIDTH))
+    {
+      uint64_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK;
+      uint64_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK;
+
+      mx %= (my >> exp_diff);
+
+      if (__glibc_unlikely (mx == 0))
+	return asdouble (sx);
+      int shift = clz_uint64 (mx);
+      ex -= shift + 1;
+      mx <<= shift;
+      mx = sx | (mx >> EXPONENT_WIDTH);
+      return asdouble (mx + ((uint64_t)ex << MANTISSA_WIDTH));
+    }
 
-  /* Common case where exponents are close: ey >= -907 and |x/y| < 2^52,  */
-  if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH))
+  if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK))
     {
-      uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1);
-      uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1);
+      /* If x is a NaN, return a NaN.  */
+      if (hx > EXPONENT_MASK)
+	return x * y;
 
-      uint64_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my;
-      return make_double (d, ey - 1, sx);
+      /* If x is an infinity or y is zero, return a NaN and set EDOM.  */
+      return __math_edom ((x * y) / (x * y));
     }
 
-  /* Special case, both x and y are subnormal.  */
-  if (__glibc_unlikely (ex == 0 && ey == 0))
+  /* Special case, both x and y are denormal.  */
+  if (__glibc_unlikely (ex == 0))
     return asdouble (sx | hx % hy);
 
-  /* Convert |x| and |y| to 'mx + 2^ex' and 'my + 2^ey'.  Assume that hx is
-     not subnormal by conditions above.  */
+  /* Extract normalized mantissas - hx is not denormal and hy != 0.  */
   uint64_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1);
-  ex--;
   uint64_t my = get_mantissa (hy) | (MANTISSA_MASK + 1);
-
   int lead_zeros_my = EXPONENT_WIDTH;
-  if (__glibc_likely (ey > 0))
-    ey--;
-  else
+
+  ey--;
+  /* Special case for denormal y.  */
+  if (__glibc_unlikely (ey < 0))
     {
       my = hy;
+      ey = 0;
+      exp_diff--;
       lead_zeros_my = clz_uint64 (my);
     }
 
-  /* Assume hy != 0  */
   int tail_zeros_my = ctz_uint64 (my);
   int sides_zeroes = lead_zeros_my + tail_zeros_my;
-  int exp_diff = ex - ey;
 
   int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my;
   my >>= right_shift;
@@ -141,8 +154,7 @@ __fmod (double x, double y)
   if (exp_diff == 0)
     return make_double (mx, ey, sx);
 
-  /* Assume modulo/divide operation is slow, so use multiplication with invert
-     values.  */
+  /* Multiplication with the inverse is faster than repeated modulo.  */
   uint64_t inv_hy = UINT64_MAX / my;
   while (exp_diff > sides_zeroes) {
     exp_diff -= sides_zeroes;
diff --git a/sysdeps/ieee754/flt-32/e_fmodf.c b/sysdeps/ieee754/flt-32/e_fmodf.c
index 763900efda..14f3fcae25 100644
--- a/sysdeps/ieee754/flt-32/e_fmodf.c
+++ b/sysdeps/ieee754/flt-32/e_fmodf.c
@@ -40,10 +40,10 @@
 
    r == x % y == (x % (N * y)) % y
 
-   And with mx/my being mantissa of double floating point number (which uses
+   And with mx/my being mantissa of a single floating point number (which uses
    less bits than the storage type), on each step the argument reduction can
    be improved by 8 (which is the size of uint32_t minus MANTISSA_WIDTH plus
-   the signal bit):
+   the implicit one bit):
 
    mx * 2^ex == 2^8 * mx * 2^(ex - 8)
 
@@ -54,7 +54,12 @@
        mx << 8;
        ex -= 8;
        mx %= my;
-     }  */
+     }
+
+   Special cases:
+     - If x or y is a NaN, a NaN is returned.
+     - If x is an infinity, or y is zero, a NaN is returned and EDOM is set.
+     - If x is +0/-0, and y is not zero, +0/-0 is returned.  */
 
 float
 __fmodf (float x, float y)
@@ -67,61 +72,69 @@ __fmodf (float x, float y)
   hx ^= sx;
   hy &= ~SIGN_MASK;
 
-  /* Special cases:
-     - If x or y is a Nan, NaN is returned.
-     - If x is an inifinity, a NaN is returned.
-     - If y is zero, Nan is returned.
-     - If x is +0/-0, and y is not zero, +0/-0 is returned.  */
-  if (__glibc_unlikely (hy == 0
-			|| hx >= EXPONENT_MASK || hy > EXPONENT_MASK))
-    {
-      if (is_nan (hx) || is_nan (hy))
-	return (x * y) / (x * y);
-      return __math_edomf ((x * y) / (x * y));
-    }
-
-  if (__glibc_unlikely (hx <= hy))
+  if (__glibc_likely (hx < hy))
     {
-      if (hx < hy)
-	return x;
-      return asfloat (sx);
+      /* If y is a NaN, return a NaN.  */
+      if (__glibc_unlikely (hy > EXPONENT_MASK))
+	return x * y;
+      return x;
     }
 
   int ex = hx >> MANTISSA_WIDTH;
   int ey = hy >> MANTISSA_WIDTH;
+  int exp_diff = ex - ey;
+
+  /* Common case where exponents are close: |x/y| < 2^9, x not inf/NaN
+     and |x%y| not denormal.  */
+  if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH
+		     && ey > MANTISSA_WIDTH
+		     && exp_diff <= EXPONENT_WIDTH))
+    {
+      uint32_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK;
+      uint32_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK;
+
+      mx %= (my >> exp_diff);
+
+      if (__glibc_unlikely (mx == 0))
+	return asfloat (sx);
+      int shift = __builtin_clz (mx);
+      ex -= shift + 1;
+      mx <<= shift;
+      mx = sx | (mx >> EXPONENT_WIDTH);
+      return asfloat (mx + ((uint32_t)ex << MANTISSA_WIDTH));
+    }
 
-  /* Common case where exponents are close: ey >= -103 and |x/y| < 2^8,  */
-  if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH))
+  if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK))
     {
-      uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1);
-      uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1);
+      /* If x is a NaN, return a NaN.  */
+      if (hx > EXPONENT_MASK)
+	return x * y;
 
-      uint32_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my;
-      return make_float (d, ey - 1, sx);
+      /* If x is an infinity or y is zero, return a NaN and set EDOM.  */
+      return __math_edomf ((x * y) / (x * y));
     }
 
-  /* Special case, both x and y are subnormal.  */
-  if (__glibc_unlikely (ex == 0 && ey == 0))
+  /* Special case, both x and y are denormal.  */
+  if (__glibc_unlikely (ex == 0))
     return asfloat (sx | hx % hy);
 
-  /* Convert |x| and |y| to 'mx + 2^ex' and 'my + 2^ey'.  Assume that hx is
-     not subnormal by conditions above.  */
+  /* Extract normalized mantissas - hx is not denormal and hy != 0.  */
   uint32_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1);
-  ex--;
-
   uint32_t my = get_mantissa (hy) | (MANTISSA_MASK + 1);
   int lead_zeros_my = EXPONENT_WIDTH;
-  if (__glibc_likely (ey > 0))
-    ey--;
-  else
+
+  ey--;
+  /* Special case for denormal y.  */
+  if (__glibc_unlikely (ey < 0))
     {
       my = hy;
+      ey = 0;
+      exp_diff--;
       lead_zeros_my = __builtin_clz (my);
     }
 
   int tail_zeros_my = __builtin_ctz (my);
   int sides_zeroes = lead_zeros_my + tail_zeros_my;
-  int exp_diff = ex - ey;
 
   int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my;
   my >>= right_shift;
@@ -140,8 +153,7 @@ __fmodf (float x, float y)
   if (exp_diff == 0)
     return make_float (mx, ey, sx);
 
-  /* Assume modulo/divide operation is slow, so use multiplication with invert
-     values.  */
+  /* Multiplication with the inverse is faster than repeated modulo.  */
   uint32_t inv_hy = UINT32_MAX / my;
   while (exp_diff > sides_zeroes) {
     exp_diff -= sides_zeroes;