From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1895) id 55E853857BB2; Mon, 17 Apr 2023 12:04:53 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 55E853857BB2 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1681733093; bh=zXlLds4O4zAxdKzK/+X2eyI/PoMdaylz2kwY/zdCDIg=; h=From:To:Subject:Date:From; b=jbfjY0/l/V98HUhGkH9Mjl6SaBrDWjiRzG/jPdLYnc5Qg+Ol/gALYNCSaVmwiQyKr 9Waz8zos2/Nl2byPsmBU6t/fApFS0E57oEhuWipQzUkZz+WhIT1O+RMq2llmWJdIoN +t8Xc5eWrcslJ4R2UPOrX0tfRRUvTgq9NJpUYRUQ= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Wilco Dijkstra To: glibc-cvs@sourceware.org Subject: [glibc] math: Improve fmod(f) performance X-Act-Checkin: glibc X-Git-Author: Wilco Dijkstra X-Git-Refname: refs/heads/master X-Git-Oldrev: 2623479105a7e11ccd1e504b3f549cadbb875a42 X-Git-Newrev: 76d0f094dd177e303b36d7b77e21673f244a4b53 Message-Id: <20230417120453.55E853857BB2@sourceware.org> Date: Mon, 17 Apr 2023 12:04:53 +0000 (GMT) List-Id: https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=76d0f094dd177e303b36d7b77e21673f244a4b53 commit 76d0f094dd177e303b36d7b77e21673f244a4b53 Author: Wilco Dijkstra Date: Mon Apr 17 12:42:18 2023 +0100 math: Improve fmod(f) performance Optimize the fast paths (x < y) and (x/y < 2^12). Delay handling of special cases to reduce the number of instructions executed before the fast paths. Performance improvements for fmod: Skylake Zen2 Neoverse V1 subnormals 11.8% 4.2% 11.5% normal 3.9% 0.01% -0.5% close-exponents 6.3% 5.6% 19.4% Reviewed-by: Adhemerval Zanella Diff: --- sysdeps/ieee754/dbl-64/e_fmod.c | 90 +++++++++++++++++++++++----------------- sysdeps/ieee754/flt-32/e_fmodf.c | 88 ++++++++++++++++++++++----------------- 2 files changed, 101 insertions(+), 77 deletions(-) diff --git a/sysdeps/ieee754/dbl-64/e_fmod.c b/sysdeps/ieee754/dbl-64/e_fmod.c index caae4e47e2..0f04fdf77e 100644 --- a/sysdeps/ieee754/dbl-64/e_fmod.c +++ b/sysdeps/ieee754/dbl-64/e_fmod.c @@ -40,10 +40,10 @@ r == x % y == (x % (N * y)) % y - And with mx/my being mantissa of double floating point number (which uses + And with mx/my being mantissa of a double floating point number (which uses less bits than the storage type), on each step the argument reduction can be improved by 11 (which is the size of uint64_t minus MANTISSA_WIDTH plus - the signal bit): + the implicit one bit): mx * 2^ex == 2^11 * mx * 2^(ex - 11) @@ -54,7 +54,12 @@ mx << 11; ex -= 11; mx %= my; - } */ + } + + Special cases: + - If x or y is a NaN, a NaN is returned. + - If x is an infinity, or y is zero, a NaN is returned and EDOM is set. + - If x is +0/-0, and y is not zero, +0/-0 is returned. */ double __fmod (double x, double y) @@ -67,62 +72,70 @@ __fmod (double x, double y) hx ^= sx; hy &= ~SIGN_MASK; - /* Special cases: - - If x or y is a Nan, NaN is returned. - - If x is an inifinity, a NaN is returned and EDOM is set. - - If y is zero, Nan is returned. - - If x is +0/-0, and y is not zero, +0/-0 is returned. */ - if (__glibc_unlikely (hy == 0 - || hx >= EXPONENT_MASK || hy > EXPONENT_MASK)) - { - if (is_nan (hx) || is_nan (hy)) - return (x * y) / (x * y); - return __math_edom ((x * y) / (x * y)); - } - - if (__glibc_unlikely (hx <= hy)) + /* If x < y, return x (unless y is a NaN). */ + if (__glibc_likely (hx < hy)) { - if (hx < hy) - return x; - return asdouble (sx); + /* If y is a NaN, return a NaN. */ + if (__glibc_unlikely (hy > EXPONENT_MASK)) + return x * y; + return x; } int ex = hx >> MANTISSA_WIDTH; int ey = hy >> MANTISSA_WIDTH; + int exp_diff = ex - ey; + + /* Common case where exponents are close: |x/y| < 2^12, x not inf/NaN + and |x%y| not denormal. */ + if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH + && ey > MANTISSA_WIDTH + && exp_diff <= EXPONENT_WIDTH)) + { + uint64_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK; + uint64_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK; + + mx %= (my >> exp_diff); + + if (__glibc_unlikely (mx == 0)) + return asdouble (sx); + int shift = clz_uint64 (mx); + ex -= shift + 1; + mx <<= shift; + mx = sx | (mx >> EXPONENT_WIDTH); + return asdouble (mx + ((uint64_t)ex << MANTISSA_WIDTH)); + } - /* Common case where exponents are close: ey >= -907 and |x/y| < 2^52, */ - if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH)) + if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK)) { - uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1); - uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1); + /* If x is a NaN, return a NaN. */ + if (hx > EXPONENT_MASK) + return x * y; - uint64_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my; - return make_double (d, ey - 1, sx); + /* If x is an infinity or y is zero, return a NaN and set EDOM. */ + return __math_edom ((x * y) / (x * y)); } - /* Special case, both x and y are subnormal. */ - if (__glibc_unlikely (ex == 0 && ey == 0)) + /* Special case, both x and y are denormal. */ + if (__glibc_unlikely (ex == 0)) return asdouble (sx | hx % hy); - /* Convert |x| and |y| to 'mx + 2^ex' and 'my + 2^ey'. Assume that hx is - not subnormal by conditions above. */ + /* Extract normalized mantissas - hx is not denormal and hy != 0. */ uint64_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1); - ex--; uint64_t my = get_mantissa (hy) | (MANTISSA_MASK + 1); - int lead_zeros_my = EXPONENT_WIDTH; - if (__glibc_likely (ey > 0)) - ey--; - else + + ey--; + /* Special case for denormal y. */ + if (__glibc_unlikely (ey < 0)) { my = hy; + ey = 0; + exp_diff--; lead_zeros_my = clz_uint64 (my); } - /* Assume hy != 0 */ int tail_zeros_my = ctz_uint64 (my); int sides_zeroes = lead_zeros_my + tail_zeros_my; - int exp_diff = ex - ey; int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my; my >>= right_shift; @@ -141,8 +154,7 @@ __fmod (double x, double y) if (exp_diff == 0) return make_double (mx, ey, sx); - /* Assume modulo/divide operation is slow, so use multiplication with invert - values. */ + /* Multiplication with the inverse is faster than repeated modulo. */ uint64_t inv_hy = UINT64_MAX / my; while (exp_diff > sides_zeroes) { exp_diff -= sides_zeroes; diff --git a/sysdeps/ieee754/flt-32/e_fmodf.c b/sysdeps/ieee754/flt-32/e_fmodf.c index 763900efda..14f3fcae25 100644 --- a/sysdeps/ieee754/flt-32/e_fmodf.c +++ b/sysdeps/ieee754/flt-32/e_fmodf.c @@ -40,10 +40,10 @@ r == x % y == (x % (N * y)) % y - And with mx/my being mantissa of double floating point number (which uses + And with mx/my being mantissa of a single floating point number (which uses less bits than the storage type), on each step the argument reduction can be improved by 8 (which is the size of uint32_t minus MANTISSA_WIDTH plus - the signal bit): + the implicit one bit): mx * 2^ex == 2^8 * mx * 2^(ex - 8) @@ -54,7 +54,12 @@ mx << 8; ex -= 8; mx %= my; - } */ + } + + Special cases: + - If x or y is a NaN, a NaN is returned. + - If x is an infinity, or y is zero, a NaN is returned and EDOM is set. + - If x is +0/-0, and y is not zero, +0/-0 is returned. */ float __fmodf (float x, float y) @@ -67,61 +72,69 @@ __fmodf (float x, float y) hx ^= sx; hy &= ~SIGN_MASK; - /* Special cases: - - If x or y is a Nan, NaN is returned. - - If x is an inifinity, a NaN is returned. - - If y is zero, Nan is returned. - - If x is +0/-0, and y is not zero, +0/-0 is returned. */ - if (__glibc_unlikely (hy == 0 - || hx >= EXPONENT_MASK || hy > EXPONENT_MASK)) - { - if (is_nan (hx) || is_nan (hy)) - return (x * y) / (x * y); - return __math_edomf ((x * y) / (x * y)); - } - - if (__glibc_unlikely (hx <= hy)) + if (__glibc_likely (hx < hy)) { - if (hx < hy) - return x; - return asfloat (sx); + /* If y is a NaN, return a NaN. */ + if (__glibc_unlikely (hy > EXPONENT_MASK)) + return x * y; + return x; } int ex = hx >> MANTISSA_WIDTH; int ey = hy >> MANTISSA_WIDTH; + int exp_diff = ex - ey; + + /* Common case where exponents are close: |x/y| < 2^9, x not inf/NaN + and |x%y| not denormal. */ + if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH + && ey > MANTISSA_WIDTH + && exp_diff <= EXPONENT_WIDTH)) + { + uint32_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK; + uint32_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK; + + mx %= (my >> exp_diff); + + if (__glibc_unlikely (mx == 0)) + return asfloat (sx); + int shift = __builtin_clz (mx); + ex -= shift + 1; + mx <<= shift; + mx = sx | (mx >> EXPONENT_WIDTH); + return asfloat (mx + ((uint32_t)ex << MANTISSA_WIDTH)); + } - /* Common case where exponents are close: ey >= -103 and |x/y| < 2^8, */ - if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH)) + if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK)) { - uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1); - uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1); + /* If x is a NaN, return a NaN. */ + if (hx > EXPONENT_MASK) + return x * y; - uint32_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my; - return make_float (d, ey - 1, sx); + /* If x is an infinity or y is zero, return a NaN and set EDOM. */ + return __math_edomf ((x * y) / (x * y)); } - /* Special case, both x and y are subnormal. */ - if (__glibc_unlikely (ex == 0 && ey == 0)) + /* Special case, both x and y are denormal. */ + if (__glibc_unlikely (ex == 0)) return asfloat (sx | hx % hy); - /* Convert |x| and |y| to 'mx + 2^ex' and 'my + 2^ey'. Assume that hx is - not subnormal by conditions above. */ + /* Extract normalized mantissas - hx is not denormal and hy != 0. */ uint32_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1); - ex--; - uint32_t my = get_mantissa (hy) | (MANTISSA_MASK + 1); int lead_zeros_my = EXPONENT_WIDTH; - if (__glibc_likely (ey > 0)) - ey--; - else + + ey--; + /* Special case for denormal y. */ + if (__glibc_unlikely (ey < 0)) { my = hy; + ey = 0; + exp_diff--; lead_zeros_my = __builtin_clz (my); } int tail_zeros_my = __builtin_ctz (my); int sides_zeroes = lead_zeros_my + tail_zeros_my; - int exp_diff = ex - ey; int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my; my >>= right_shift; @@ -140,8 +153,7 @@ __fmodf (float x, float y) if (exp_diff == 0) return make_float (mx, ey, sx); - /* Assume modulo/divide operation is slow, so use multiplication with invert - values. */ + /* Multiplication with the inverse is faster than repeated modulo. */ uint32_t inv_hy = UINT32_MAX / my; while (exp_diff > sides_zeroes) { exp_diff -= sides_zeroes;