From: "Oliver Schädlich" <oliver.schaedlich@gmail.com>
To: Adhemerval Zanella Netto <adhemerval.zanella@linaro.org>,
libc-alpha <libc-alpha@sourceware.org>
Subject: Re: More efficient fmod()
Date: Sun, 2 Oct 2022 08:45:45 +0200
Message-ID: <040a73fa-6171-061a-3251-4c532a405b24@gmail.com>
In-Reply-To: <fda43638-5f9d-2214-238e-37a2a37d010b@linaro.org>
I've benchmarked my code against e_fmod.c and
the performance difference is still the same:
#include <atomic>
#include <bit>
#include <chrono>
#include <cmath>
#include <concepts>
#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

using namespace std;
using namespace std::chrono;

extern "C" double myFmodC( double counter, double denominator );

// sink that keeps the accumulated sums from being optimized away
atomic<double> aSum;

int main()
{
    mt19937_64 mt;
    // reinterpret a bit pattern as a double
    auto dbl = []( uint64_t u ) constexpr -> double { return bit_cast<double>( u ); };
    // positive finite doubles only: no zeroes, infinities or NaNs
    uniform_int_distribution<uint64_t> uid( 1, 0x7FEFFFFFFFFFFFFFu );
    constexpr size_t
        N = 0x1000,
        ROUNDS = 1'000;
    vector<double> values( N );
    for( double &v : values )
        v = dbl( uid( mt ) );
    // returns the average time per call in nanoseconds
    auto bench = [&]<typename ModFn>( ModFn modFn ) -> double
        requires requires( ModFn modFn ) { { modFn( 1.0, 1.0 ) } -> same_as<double>; }
    {
        double sum = 0.0;
        auto start = high_resolution_clock::now();
        for( size_t r = ROUNDS; r--; )
            for( size_t i = 0; i != N; ++i )
                sum += modFn( values[i], values[(i + 1) % N] );
        double ns = duration_cast<nanoseconds>(
            high_resolution_clock::now() - start ).count() / ((double)N * ROUNDS);
        aSum.store( sum, memory_order_relaxed );
        return ns;
    };
    cout << bench( myFmodC ) << endl;
    cout << bench( []( double a, double b ) -> double { return fmod( a, b ); } ) << endl;
}
On 01.10.2022 at 22:16, Adhemerval Zanella Netto wrote:
>
> On 01/10/22 13:34, Oliver Schädlich via Libc-help wrote:
>> I found that fmod() could be faster.
>> This is my implementation:
>>
>> #include <stdint.h>
>> #include <string.h>
>> #include <fenv.h>
>> #if defined(_MSC_VER)
>> #include <intrin.h>
>> #endif
>>
>> #if defined(__GNUC__) || defined(__clang__)
>> #define likely(x) __builtin_expect((x), 1)
>> #define unlikely(x) __builtin_expect((x), 0)
>> #else
>> #define likely(x) (x)
>> #define unlikely(x) (x)
>> #endif
>>
>> inline uint64_t bin( double d )
>> {
>> uint64_t u;
>> memcpy( &u, &d, sizeof d );
>> return u;
>> }
>>
>> inline double dbl( uint64_t u )
>> {
>> double d;
>> memcpy( &d, &u, sizeof u );
>> return d;
>> }
>>
>> inline double invalid( uint64_t u )
>> {
>> feraiseexcept( FE_INVALID );
>> return dbl( u );
>> }
>>
>> #define SIGN_BIT ((uint64_t)1 << 63)
>> #define EXP_MASK ((uint64_t)0x7FF << 52)
>> #define IMPLICIT_BIT ((uint64_t)1 << 52)
>> #define MANT_MASK (IMPLICIT_BIT - 1)
>> #define QNAN_BIT (IMPLICIT_BIT >> 1)
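>>
>> // IEEE-754 binary64 layout: bit 63 sign, bits 62-52 biased exponent,
>> // bits 51-0 mantissa; normal values carry an implicit bit at 52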
>>
>> inline void normalize( uint64_t *mant, int *exp )
>> {
>> #if defined(__GNUC__) || defined(__clang__)
>> unsigned bits;
>> bits = __builtin_clz( *mant ) - 11;
>> *mant <<= bits;
>> #elif defined(_MSC_VER)
>> unsigned long idx;
>> _BitScanReverse64( &idx, *mant );
>> // _BitScanReverse64 yields the MSB index, not a leading-zero count
>> unsigned bits = 52 - idx;
>> *mant <<= bits;
>> #else
>> unsigned bits;
>> for( bits = 0; !(*mant & IMPLICIT_BIT); *mant <<= 1, ++bits );
>> #endif
>> *exp -= bits;
>> }
>>
>> double myFmodC( double counter, double denominator )
>> {
>> uint64_t const
>> bCounter = bin( counter ),
>> bDenom = bin( denominator );
>> uint64_t const sign = (bCounter ^ bDenom) & SIGN_BIT;
>> if( unlikely((bCounter & EXP_MASK) == EXP_MASK) )
>> // +/-[Inf|QNaN|SNaN] % ... = -QNaN
>> // follow SSE/AVX-rules, first NaN rules, i.e.
>> // first parameter determines non-SNaN/QNaN-bits
>> return invalid( SIGN_BIT | bCounter | QNAN_BIT );
>> if( unlikely((bDenom & EXP_MASK) == EXP_MASK) )
>> // +/-x % +/-[Inf|QNaN|SNaN]
>> if( likely(!(bDenom & MANT_MASK)) )
>> // +/-x % +/-Inf = -/+x
>> return dbl( sign | bCounter & ~SIGN_BIT );
>> else
>> // +/-x % +/-[QNaN|SNaN] = -NaN
>> return invalid( SIGN_BIT | bDenom | QNAN_BIT );
>> int
>> counterExp = (bCounter & EXP_MASK) >> 52,
>> denomExp = (bDenom & EXP_MASK) >> 52;
>> uint64_t
>> counterMant = (uint64_t)!!counterExp << 52 | bCounter & MANT_MASK,
>> denomMant = (uint64_t)!!denomExp << 52 | bDenom & MANT_MASK;
>> if( unlikely(!counterExp) )
>> // counter is denormal
>> if( likely(!counterMant) )
>> // counter == +/-0.0
>> if( likely(denomMant) )
>> // +/-0.0 % +/-x = -/+0.0
>> return dbl( sign );
>> else
>> // +/-0.0 % +/-0.0 = -QNaN
>> return invalid( SIGN_BIT | EXP_MASK | QNAN_BIT );
>> else
>> // normalize counter
>> normalize( &counterMant, &counterExp ),
>> ++counterExp;
>> if( unlikely(!denomExp) )
>> // denominator is denormal
>> if( likely(!denomMant) )
>> // +/-x % +/-0.0 = -/+QNaN
>> return invalid( SIGN_BIT | EXP_MASK | QNAN_BIT );
>> else
>> // normalize denominator
>> normalize( &denomMant, &denomExp ),
>> ++denomExp;
>> int exp = counterExp;
>> uint64_t remainderMant = counterMant;
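>> // shift-and-subtract loop: align remainderMant with denomMant,
>> // subtract, renormalize, and stop once the remainder's exponent
>> // would drop below the denominator's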
>> for( ; ; )
>> {
>> int below = remainderMant < denomMant;
>> if( unlikely(exp - below < denomExp) )
>> break;
>> exp -= below;
>> remainderMant <<= below;
>> if( unlikely(!(remainderMant -= denomMant)) )
>> {
>> exp = 0;
>> break;
>> }
>> normalize( &remainderMant, &exp );
>> }
>> if( unlikely(exp <= 0) )
>> // denormal result
>> remainderMant >>= -exp + 1,
>> exp = 0;
>> return dbl( sign | (uint64_t)exp << 52 | remainderMant & MANT_MASK );
>> }
>>
>> If I chose random pairs of doubles the above code takes nearly
>> 40% less time on my Zen1-CPU than with the current glibc-Code.
> This is interesting; if you could work on a patch for glibc, I can help you
> both with adjusting the patch and with reviewing it. Some remarks:
>
> 1. How are you benchmarking the code? Are you comparing a statically
> linked myFmodC against the dynamically linked libc? If so, the
> comparison will not take into account the PLT call overhead.
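>
>    One way to level that out (a minimal sketch; fmod_fn, fn_mine and
>    fn_libc are illustrative names) is to call both candidates through a
>    volatile function pointer, so each call pays the same indirect-call
>    cost regardless of how each function is linked:
>
>    #include <math.h>
>
>    extern double myFmodC (double, double);
>
>    typedef double (*fmod_fn) (double, double);
>
>    /* volatile forces a reload of the pointer on every call, so the
>       compiler can neither inline nor convert to a direct call */
>    volatile fmod_fn fn_mine = myFmodC;  /* statically linked candidate */
>    volatile fmod_fn fn_libc = fmod;     /* PLT-routed libc fmod */
>
>    /* in the loop: sum += fn_mine (a, b); resp. fn_libc (a, b); */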
>
> 2. On x86_64 the fmod implementation will generate a tail call to
> __ieee754_fmod (math/w_fmod_compat.c) plus some comparisons. So glibc
> carries extra overhead that is not strictly required (in recent math
> optimizations we do what you did in your code, where errno is ignored;
> it requires a symbol version, though).
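>
>    In rough terms the wrapper adds something like this around the
>    kernel (a simplified sketch with the hypothetical name
>    fmod_with_errno; not the verbatim math/w_fmod_compat.c):
>
>    #include <errno.h>
>    #include <math.h>
>
>    extern double __ieee754_fmod (double, double);
>
>    double fmod_with_errno (double x, double y)
>    {
>      double r = __ieee754_fmod (x, y);
>      /* the extra comparisons: domain errors must set errno */
>      if (isnan (r) && !isnan (x) && !isnan (y))
>        errno = EDOM;
>      return r;
>    }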
>
> 3. __builtin_clz might not be the best option on all architectures,
> since it can turn into a libcall to compiler-supplied code. We have
> some macros in longlong.h that avoid it (although for a performance
> comparison on x86_64 it won't matter). Also, __builtin_clz is not
> correct for 64-bit values on 32-bit targets.
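>
>    For instance (a sketch; clz64 is an illustrative helper, while
>    longlong.h's count_leading_zeros covers the word-size case):
>
>    #include <stdint.h>
>
>    /* 64-bit-safe leading-zero count: __builtin_clzll takes unsigned
>       long long, so it is correct for the full mantissa even on 32-bit
>       targets (where it may lower to a libcall); like the builtin it
>       is undefined for x == 0 */
>    static inline int clz64 (uint64_t x)
>    {
>    #if defined (__GNUC__) || defined (__clang__)
>      return __builtin_clzll (x);
>    #else
>      int n = 0;
>      for (uint64_t bit = (uint64_t) 1 << 63; !(x & bit); bit >>= 1)
>        n++;
>      return n;
>    #endif
>    }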
>
> So to get a better performance profile, you will need to replace the
> implementation at sysdeps/ieee754/dbl-64/e_fmod.c, rebuild glibc, run
> the math tests to check for regressions, and then profile to see
> whether this yields better performance. It would also be good to have
> a benchtest benchmark that evaluates both throughput and latency,
> similar to what we have for sin/cos/etc.
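>
>    The two measurements stress different things; roughly (a sketch of
>    the idea, not the actual benchtests harness):
>
>    #include <math.h>
>    #include <stddef.h>
>
>    /* throughput: independent calls that the CPU can overlap */
>    double bench_throughput (const double *a, const double *b, size_t n)
>    {
>      double sum = 0.0;
>      for (size_t i = 0; i < n; i++)
>        sum += fmod (a[i], b[i]);
>      return sum;
>    }
>
>    /* latency: each call waits for the previous result */
>    double bench_latency (const double *a, const double *b, size_t n)
>    {
>      double r = 1.0;
>      for (size_t i = 0; i < n; i++)
>        r = fmod (r + a[i], b[i]);
>      return r;
>    }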