From: Oliver Schädlich
Date: Sun, 2 Oct 2022 08:45:45 +0200
Subject: Re: More efficient fmod()
To: Adhemerval Zanella Netto, libc-alpha

I've benchmarked my code against e_fmod.c and the difference in
performance is still the same:

#include <atomic>
#include <bit>
#include <chrono>
#include <cmath>
#include <concepts>
#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

using namespace std;
using namespace std::chrono;

atomic<double> aSum;                    // keeps the sums from being optimized away
double myFmod( double x, double y );    // the implementation under test

int main()
{
    mt19937_64 mt;
    auto dbl = []( uint64_t u ) constexpr -> double { return bit_cast<double>( u ); };
    uniform_int_distribution<uint64_t> uid( 1, 0x7FEFFFFFFFFFFFFFu );
    constexpr size_t
        N = 0x1000,
        ROUNDS = 1'000;
    vector<double> values( N );
    for( double &v : values )
        v = dbl( uid( mt ) );
    auto bench = [&]<typename ModFn>( ModFn modFn ) -> double
        requires requires( ModFn modFn ) { { modFn( 1.0, 1.0 ) } -> same_as<double>; }
    {
        double sum = 0.0;
        auto start = high_resolution_clock::now();
        for( size_t r = ROUNDS; r--; )
            for( size_t i = 0; i != N; ++i )
                sum += modFn( values[i], values[(i + 1) % N] );
        double ns = duration_cast<nanoseconds>( high_resolution_clock::now() - start ).count()
                    / ((double)N * ROUNDS);
        aSum.store( sum, memory_order_relaxed );
        return ns;
    };
    cout << bench( myFmod ) << endl;
    cout << bench( []( double a, double b ) -> double { return fmod( a, b ); } ) << endl;
}

On 01.10.2022 at 22:16, Adhemerval Zanella Netto wrote:
>
> On 01/10/22 13:34, Oliver Schädlich via Libc-help wrote:
>> I found that fmod() could be faster.
>> This is my implementation:
>>
>> #include <stdint.h>
>> #include <string.h>
>> #include <fenv.h>
>> #if defined(_MSC_VER)
>>     #include <intrin.h>
>> #endif
>>
>> #if defined(__GNUC__) || defined(__clang__)
>>     #define likely(x) __builtin_expect((x), 1)
>>     #define unlikely(x) __builtin_expect((x), 0)
>> #else
>>     #define likely(x) (x)
>>     #define unlikely(x) (x)
>> #endif
>>
>> inline uint64_t bin( double d )
>> {
>>     uint64_t u;
>>     memcpy( &u, &d, sizeof d );
>>     return u;
>> }
>>
>> inline double dbl( uint64_t u )
>> {
>>     double d;
>>     memcpy( &d, &u, sizeof u );
>>     return d;
>> }
>>
>> inline double invalid( uint64_t u )
>> {
>>     feraiseexcept( FE_INVALID );
>>     return dbl( u );
>> }
>>
>> #define SIGN_BIT ((uint64_t)1 << 63)
>> #define EXP_MASK ((uint64_t)0x7FF << 52)
>> #define IMPLICIT_BIT ((uint64_t)1 << 52)
>> #define MANT_MASK (IMPLICIT_BIT - 1)
>> #define QNAN_BIT (IMPLICIT_BIT >> 1)
>>
>> inline void normalize( uint64_t *mant, int *exp )
>> {
>> #if defined(__GNUC__) || defined(__clang__)
>>     unsigned bits;
>>     bits = __builtin_clz( *mant ) - 11;
>>     *mant <<= bits;
>> #elif defined(_MSC_VER)
>>     unsigned long bits;
>>     _BitScanReverse64( &bits, *mant );
>>     *mant <<= bits - 11;
>> #else
>>     unsigned bits;
>>     for( bits = 0; !(*mant & IMPLICIT_BIT); *mant <<= 1, ++bits );
>> #endif
>>     *exp -= bits;
>> }
>>
>> double myFmodC( double counter, double denominator )
>> {
>>     uint64_t const
>>         bCounter = bin( counter ),
>>         bDenom = bin( denominator );
>>     uint64_t const sign = (bCounter ^ bDenom) & SIGN_BIT;
>>     if( unlikely((bCounter & EXP_MASK) == EXP_MASK) )
>>         // +/-[Inf|QNaN|SNaN] % ... = -QNaN
>>         // follow SSE/AVX rules, first-NaN rules, i.e.
>>         // the first parameter determines the non-SNaN/QNaN bits
>>         return invalid( SIGN_BIT | bCounter | QNAN_BIT );
>>     if( unlikely((bDenom & EXP_MASK) == EXP_MASK) )
>>         // +/-x % +/-[Inf|QNaN|SNaN]
>>         if( likely(!(bDenom & MANT_MASK)) )
>>             // +/-x % +/-Inf = -/+x
>>             return dbl( sign | bCounter & ~SIGN_BIT );
>>         else
>>             // +/-x % +/-[QNaN|SNaN] = -NaN
>>             return invalid( SIGN_BIT | bDenom | QNAN_BIT );
>>     int
>>         counterExp = (bCounter & EXP_MASK) >> 52,
>>         denomExp = (bDenom & EXP_MASK) >> 52;
>>     uint64_t
>>         counterMant = (uint64_t)!!counterExp << 52 | bCounter & MANT_MASK,
>>         denomMant = (uint64_t)!!denomExp << 52 | bDenom & MANT_MASK;
>>     if( unlikely(!counterExp) )
>>         // counter is denormal
>>         if( likely(!counterMant) )
>>             // counter == +/-0.0
>>             if( likely(denomMant) )
>>                 // +/-0.0 % +/-x = -/+0.0
>>                 return dbl( sign );
>>             else
>>                 // +/-0.0 % +/-0.0 = -QNaN
>>                 return invalid( SIGN_BIT | EXP_MASK | QNAN_BIT );
>>         else
>>             // normalize counter
>>             normalize( &counterMant, &counterExp ),
>>             ++counterExp;
>>     if( unlikely(!denomExp) )
>>         // denominator is denormal
>>         if( likely(!denomMant) )
>>             // +/-x % +/-0.0 = -/+QNaN
>>             return invalid( SIGN_BIT | EXP_MASK | QNAN_BIT );
>>         else
>>             // normalize denominator
>>             normalize( &denomMant, &denomExp ),
>>             ++denomExp;
>>     int exp = counterExp;
>>     uint64_t remainderMant = counterMant;
>>     for( ; ; )
>>     {
>>         int below = remainderMant < denomMant;
>>         if( unlikely(exp - below < denomExp) )
>>             break;
>>         exp -= below;
>>         remainderMant <<= below;
>>         if( unlikely(!(remainderMant -= denomMant)) )
>>         {
>>             exp = 0;
>>             break;
>>         }
>>         normalize( &remainderMant, &exp );
>>     };
>>     if( unlikely(exp <= 0) )
>>         // denormal result
>>         remainderMant >>= -exp + 1,
>>         exp = 0;
>>     return dbl( sign | (uint64_t)exp << 52 | remainderMant & MANT_MASK );
>> }
>>
>> If I choose random pairs of doubles, the above code takes nearly
>> 40% less time on my Zen1 CPU than the current glibc code.
> This is interesting. If you could work on a patch for glibc, I can help you
> both with adjusting the patch and with reviewing it. Some remarks:
>
> 1. How are you benchmarking the code? Are you comparing a statically linked
> myFmodC against the dynamically linked libc? If so, the comparison will not
> take the PLT call overhead into account.
>
> 2. On x86_64 the fmod implementation will generate a tail call to
> __ieee754_fmod (math/w_fmod_compat.c) plus some comparisons. So glibc has
> extra overhead that is not strictly required (in recent math optimizations
> we follow what you did in your code, where errno is ignored; it requires a
> symbol version, though).
>
> 3. __builtin_clz might not be the best option on all architectures, since it
> may incur a libcall to compiler-supplied code. We have some macros in
> longlong.h that avoid it (although for a performance comparison on x86_64 it
> won't matter). Also, __builtin_clz is not correct for 32 bits (it operates
> on a 32-bit int, not the full 64-bit mantissa).
>
> So to get a better performance profile, you will need to replace the
> implementation at sysdeps/ieee754/dbl-64/e_fmod.c, rebuild glibc, run the
> math tests to check for regressions, and then profile to see whether this
> yields better performance. It would also be good to have a benchtests
> benchmark that evaluates both throughput and latency, similar to what we
> have for sin/cos/etc.
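
A minimal standalone sketch of the kind of throughput/latency measurement the
last paragraph describes; this is not the glibc benchtests harness, and the
clock_gettime timing, the input generation, and the iteration counts are
illustrative assumptions only:

/* Sketch: measuring fmod() throughput vs. latency.
   Assumes POSIX clock_gettime; inputs and counts are arbitrary.  */
#include <math.h>
#include <stdio.h>
#include <time.h>

#define N 4096
#define ROUNDS 1000

static double inputs[N];

static double
now_ns (void)
{
  struct timespec ts;
  clock_gettime (CLOCK_MONOTONIC, &ts);
  return ts.tv_sec * 1e9 + ts.tv_nsec;
}

int
main (void)
{
  /* Fill inputs with finite values in [1, 2) (simple LCG, illustration only).  */
  unsigned long long s = 0x9e3779b97f4a7c15ull;
  for (int i = 0; i < N; i++)
    {
      s = s * 6364136223846793005ull + 1442695040888963407ull;
      inputs[i] = 1.0 + (double) (s >> 11) / (1ull << 53);
    }

  /* Throughput: independent calls, results only accumulated.  */
  double sum = 0.0, t0 = now_ns ();
  for (int r = 0; r < ROUNDS; r++)
    for (int i = 0; i < N; i++)
      sum += fmod (inputs[i], inputs[(i + 1) % N]);
  double tput = (now_ns () - t0) / ((double) N * ROUNDS);

  /* Latency: each call depends on the previous result, so the calls
     cannot overlap in the pipeline.  */
  double x = inputs[0], t1 = now_ns ();
  for (int r = 0; r < ROUNDS; r++)
    for (int i = 0; i < N; i++)
      x = fmod (x + inputs[i], inputs[(i + 1) % N]);
  double lat = (now_ns () - t1) / ((double) N * ROUNDS);

  printf ("throughput: %.2f ns/call, latency: %.2f ns/call (sum=%g, x=%g)\n",
          tput, lat, sum, x);
  return 0;
}

The throughput loop issues independent calls, while the latency loop chains
each call on the previous result; a real benchmark would be wired into
benchtests/ so the numbers can be compared with the existing sin/cos
benchmarks.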