* More efficient fmod()
@ 2022-10-01 16:34 Oliver Schädlich
2022-10-01 20:19 ` Adhemerval Zanella Netto
0 siblings, 1 reply; 2+ messages in thread
From: Oliver Schädlich @ 2022-10-01 16:34 UTC (permalink / raw)
To: libc-help
I found that fmod() could be faster.
This is my implementation:
#include <stdint.h>
#include <string.h>
#include <fenv.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#if defined(__GNUC__) || defined(__clang__)
#define likely(x) __builtin_expect((x), 1)
#define unlikely(x) __builtin_expect((x), 0)
#else
#define likely(x) (x)
#define unlikely(x) (x)
#endif
/* Reinterpret the bit pattern of a double as a uint64_t.
   The type-pun goes through memcpy to avoid strict-aliasing UB.
   Made 'static': a plain C99 'inline' definition provides no external
   definition, which can fail to link when the compiler chooses not to
   inline a call. */
static inline uint64_t bin( double d )
{
uint64_t u;
memcpy( &u, &d, sizeof d );
return u;
}
/* Reinterpret the bit pattern of a uint64_t as a double (inverse of
   bin()); memcpy avoids strict-aliasing UB.
   Made 'static' because a bare C99 'inline' function emits no external
   definition and can cause link failures when not inlined. */
static inline double dbl( uint64_t u )
{
double d;
memcpy( &d, &u, sizeof u );
return d;
}
/* Raise FE_INVALID and return the double whose bit pattern is u;
   used to produce the QNaN results of invalid operations.
   Made 'static' for the same C99 inline-linkage reason as bin()/dbl(). */
static inline double invalid( uint64_t u )
{
feraiseexcept( FE_INVALID );
return dbl( u );
}
#define SIGN_BIT ((uint64_t)1 << 63)
#define EXP_MASK ((uint64_t)0x7FF << 52)
#define IMPLCIT_BIT ((uint64_t)1 << 52)
#define MANT_MASK (IMPLCIT_BIT - 1)
#define QNAN_BIT (IMPLCIT_BIT >> 1)
/* Left-shift *mant until the implicit bit (bit 52) is set and lower
   *exp by the shift count.  Precondition: *mant != 0.
   Fixes:
   - GCC/clang: use __builtin_clzll.  __builtin_clz takes unsigned int,
     so passing the 64-bit mantissa silently truncated it and produced
     a wrong shift count whenever int is 32 bits.
   - MSVC: _BitScanReverse64 yields the *index* of the most significant
     set bit, so the required shift is 52 - index; the old code shifted
     by index - 11 and then subtracted the raw index from *exp. */
static inline void normalize( uint64_t *mant, int *exp )
{
unsigned bits;
#if defined(__GNUC__) || defined(__clang__)
bits = (unsigned)__builtin_clzll( *mant ) - 11u;
#elif defined(_MSC_VER)
unsigned long msb;
_BitScanReverse64( &msb, *mant );
bits = 52u - (unsigned)msb;
#else
uint64_t m;
for( bits = 0, m = *mant; !(m & IMPLCIT_BIT); m <<= 1, ++bits );
#endif
*mant <<= bits;
*exp -= (int)bits;
}
/* fmod(counter, denominator) for IEEE-754 binary64, computed with
   integer arithmetic on the bit patterns.
   Fix: ISO C fmod returns a result with the sign of the *dividend*
   (fmod(5.0, -3.0) == 2.0).  The previous code XOR-ed both sign bits,
   which negated the result whenever the denominator was negative.
   NaN handling deliberately follows SSE/AVX rules (forced negative
   QNaN) rather than glibc's NaN propagation — see comments below. */
double myFmodC( double counter, double denominator )
{
uint64_t const
bCounter = bin( counter ),
bDenom = bin( denominator );
// C-standard fmod semantics: the result carries the dividend's sign
uint64_t const sign = bCounter & SIGN_BIT;
if( unlikely((bCounter & EXP_MASK) == EXP_MASK) )
// +/-[Inf|QNaN|SNaN] % ... = -QNaN
// follow SSE/AVX-rules, first NaN rules, i.e.
// first parameter determines non-SNaN/QNaN-bits
return invalid( SIGN_BIT | bCounter | QNAN_BIT );
if( unlikely((bDenom & EXP_MASK) == EXP_MASK) )
// +/-x % +/-[Inf|QNaN|SNaN]
if( likely(!(bDenom & MANT_MASK)) )
// +/-x % +/-Inf = +/-x (unchanged dividend, per C fmod)
return dbl( sign | bCounter & ~SIGN_BIT );
else
// +/-x % +/-[QNaN|SNaN] = -QNaN
return invalid( SIGN_BIT | bDenom | QNAN_BIT );
// biased exponents; 0 means the value is zero or subnormal
int
counterExp = (bCounter & EXP_MASK) >> 52,
denomExp = (bDenom & EXP_MASK) >> 52;
// mantissas with the implicit bit materialized for normal inputs
uint64_t
counterMant = (uint64_t)!!counterExp << 52 | bCounter & MANT_MASK,
denomMant = (uint64_t)!!denomExp << 52 | bDenom & MANT_MASK;
if( unlikely(!counterExp) )
// counter is zero or subnormal
if( likely(!counterMant) )
// counter == +/-0.0
if( likely(denomMant) )
// +/-0.0 % +/-x = +/-0.0 (sign of the dividend)
return dbl( sign );
else
// +/-0.0 % +/-0.0 = -QNaN
return invalid( SIGN_BIT | EXP_MASK | QNAN_BIT );
else
// bring the subnormal counter into normalized mant/exp form
normalize( &counterMant, &counterExp ),
++counterExp;
if( unlikely(!denomExp) )
// denominator is zero or subnormal
if( likely(!denomMant) )
// +/-x % +/-0.0 = -QNaN
return invalid( SIGN_BIT | EXP_MASK | QNAN_BIT );
else
// bring the subnormal denominator into normalized mant/exp form
normalize( &denomMant, &denomExp ),
++denomExp;
// shift-and-subtract long division on (mantissa, exponent) pairs;
// the fmod remainder is exact, so no rounding is ever needed
int exp = counterExp;
uint64_t remainderMant = counterMant;
for( ; ; )
{
// if the remainder's mantissa is below the divisor's, borrow one
// binade: shift left once and drop the exponent by one
int below = remainderMant < denomMant;
if( unlikely(exp - below < denomExp) )
// remainder is now smaller in magnitude than the divisor: done
break;
exp -= below;
remainderMant <<= below;
if( unlikely(!(remainderMant -= denomMant)) )
{
// counter is an exact multiple of denominator: result is +/-0.0
exp = 0;
break;
}
// restore the implicit bit (bit 52) after the subtraction
normalize( &remainderMant, &exp );
};
if( unlikely(exp <= 0) )
// subnormal result: shift the mantissa into place; the shifted-out
// bits are zero because the remainder is exactly representable
remainderMant >>= -exp + 1,
exp = 0;
return dbl( sign | (uint64_t)exp << 52 | remainderMant & MANT_MASK );
}
If I chose random pairs of doubles the above code takes nearly
40% less time on my Zen1-CPU than with the current glibc-Code.
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: More efficient fmod()
2022-10-01 16:34 More efficient fmod() Oliver Schädlich
@ 2022-10-01 20:19 ` Adhemerval Zanella Netto
0 siblings, 0 replies; 2+ messages in thread
From: Adhemerval Zanella Netto @ 2022-10-01 20:19 UTC (permalink / raw)
To: libc-help
On 01/10/22 13:34, Oliver Schädlich via Libc-help wrote:
> I found that fmod() could be faster.
> This is my implementation:
>
> #include <stdint.h>
> #include <string.h>
> #include <fenv.h>
> #if defined(_MSC_VER)
> #include <intrin.h>
> #endif
>
> #if defined(__GNUC__) || defined(__clang__)
> #define likely(x) __builtin_expect((x), 1)
> #define unlikely(x) __builtin_expect((x), 0)
> #else
> #define likely(x) (x)
> #define unlikely(x) (x)
> #endif
>
> inline uint64_t bin( double d )
> {
> uint64_t u;
> memcpy( &u, &d, sizeof d );
> return u;
> }
>
> inline double dbl( uint64_t u )
> {
> double d;
> memcpy( &d, &u, sizeof u );
> return d;
> }
>
> inline double invalid( uint64_t u )
> {
> feraiseexcept( FE_INVALID );
> return dbl( u );
> }
>
> #define SIGN_BIT ((uint64_t)1 << 63)
> #define EXP_MASK ((uint64_t)0x7FF << 52)
> #define IMPLCIT_BIT ((uint64_t)1 << 52)
> #define MANT_MASK (IMPLCIT_BIT - 1)
> #define QNAN_BIT (IMPLCIT_BIT >> 1)
>
> inline void normalize( uint64_t *mant, int *exp )
> {
> #if defined(__GNUC__) || defined(__clang__)
> unsigned bits;
> bits = __builtin_clz( *mant ) - 11;
> *mant <<= bits;
> #elif defined(_MSC_VER)
> unsigned long bits;
> _BitScanReverse64( &bits, *mant );
> *mant <<= bits - 11 ;
> #else
> unsigned bits;
> for( bits = 0; !(*mant & IMPLCIT_BIT); *mant <<= 1, ++bits );
> #endif
> *exp -= bits;
> }
>
> double myFmodC( double counter, double denominator )
> {
> uint64_t const
> bCounter = bin( counter ),
> bDenom = bin( denominator );
> uint64_t const sign = (bCounter ^ bDenom) & SIGN_BIT;
> if( unlikely((bCounter & EXP_MASK) == EXP_MASK) )
> // +/-[Inf|QNaN|SNaN] % ... = -QNaN
> // follow SSE/AVX-rules, first NaN rules, i.e.
> // first parameter determines non-SNaN/QNaN-bits
> return invalid( SIGN_BIT | bCounter | QNAN_BIT );
> if( unlikely((bDenom & EXP_MASK) == EXP_MASK) )
> // +/-x % +/-[Inf|QNan|SNaN]
> if( likely(!(bDenom & MANT_MASK)) )
> // +/-x % +/-Inf = -/+x
> return dbl( sign | bCounter & ~SIGN_BIT );
> else
> // +/-x % +/-[QNaN|SNaN] = -NaN
> return invalid( SIGN_BIT | bDenom | QNAN_BIT );
> int
> counterExp = (bCounter & EXP_MASK) >> 52,
> denomExp = (bDenom & EXP_MASK) >> 52;
> uint64_t
> counterMant = (uint64_t)!!counterExp << 52 | bCounter & MANT_MASK,
> denomMant = (uint64_t)!!denomExp << 52 | bDenom & MANT_MASK;
> if( unlikely(!counterExp) )
> // counter is denormal
> if( likely(!counterMant) )
> // counter == +/-0.0
> if( likely(denomMant) )
> // +/-0.0 % +/-x = -/+0.0
> return dbl( sign );
> else
> // +/-0.0 % +/-0.0 = -QNaN
> return invalid( SIGN_BIT | EXP_MASK | QNAN_BIT );
> else
> // normalize counter
> normalize( &counterMant, &counterExp ),
> ++counterExp;
> if( unlikely(!denomExp) )
> // denominator is denormal
> if( likely(!denomMant) )
> // +/-x % +/-0.0 = -/+QNaN
> return invalid( SIGN_BIT | EXP_MASK | QNAN_BIT );
> else
> // normalize denominator
> normalize( &denomMant, &denomExp ),
> ++denomExp;
> int exp = counterExp;
> uint64_t remainderMant = counterMant;
> for( ; ; )
> {
> int below = remainderMant < denomMant;
> if( unlikely(exp - below < denomExp) )
> break;
> exp -= below;
> remainderMant <<= below;
> if( unlikely(!(remainderMant -= denomMant)) )
> {
> exp = 0;
> break;
> }
> normalize( &remainderMant, &exp );
> };
> if( unlikely(exp <= 0) )
> // denormal result
> remainderMant >>= -exp + 1,
> exp = 0;
> return dbl( sign | (uint64_t)exp << 52 | remainderMant & MANT_MASK );
> }
>
> If I chose random pairs of doubles the above code takes nearly
> 40% less time on my Zen1-CPU than with the current glibc-Code.
This is interesting, if you could work on a patch for glibc I can help you
both adjusting the patch and on reviewing it. Some remarks:
1. How are you benchmarking the code? Are you comparing by static linking the
myFmodC against the dynamic linked libc? If so this will not take in
consideration the plt call overhead on the comparison.
2. On x86_64 the fmod implementation will generate a tail call to
__ieee754_fmod (math/w_fmod_compat.c) plus some comparisons. So glibc
has extra overhead where it is not strictly required (in recent
math optimizations we follow what you did in your code, where errno is
ignored — it requires a symbol version though).
3. __builtin_clz might not be the best option on all architectures, since it
can result in a libcall to compiler-supplied code. We have some macros
in longlong.h that avoid it (although for a performance comparison on
x86_64 it won't matter). Also, __builtin_clz is not correct for
64-bit operands on 32-bit targets.
So to have a better performance profile, you will need to replace the
implementation at sysdeps/ieee754/dbl-64/e_fmod.c, rebuild glibc, run the
math tests to check for regression, and then profile to see if this yield
better performance. It would be good to have also a benchtest benchmark
that evaluates both throughput and latency, similar to what we have for
sin/cos/etc.
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2022-10-01 20:19 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-01 16:34 More efficient fmod() Oliver Schädlich
2022-10-01 20:19 ` Adhemerval Zanella Netto
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).