From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124]) by sourceware.org (Postfix) with ESMTPS id B73F438518B5 for ; Thu, 24 Nov 2022 09:29:04 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org B73F438518B5 Authentication-Results: sourceware.org; dmarc=pass (p=none dis=none) header.from=redhat.com Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=redhat.com DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1669282144; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=mx1xUOGwUwRxlSog7kXROLCSgFy/N65NabND476s8WQ=; b=DlnKC1e1fCYdzi6VdWKc8vMOiCUNx9ZSSIRBKq3AeKTS5FPk9sbJ9r0bMVqOHSVMoXFvyQ +6RBeHUSGy9zeQtC/vKbZLk35edpsayErJFoJs4Uc6xnDhM6Ay+EsxyQreYz4kelTMSiSK XqfmBMzPw3FXkkSddZXmxdHx6A19D2M= Received: from mail-ed1-f69.google.com (mail-ed1-f69.google.com [209.85.208.69]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_128_GCM_SHA256) id us-mta-48-fx8xh4HJNg6d52BCV68Amw-1; Thu, 24 Nov 2022 04:29:03 -0500 X-MC-Unique: fx8xh4HJNg6d52BCV68Amw-1 Received: by mail-ed1-f69.google.com with SMTP id m7-20020a056402430700b0045daff6ee5dso660277edc.10 for ; Thu, 24 Nov 2022 01:29:02 -0800 (PST) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:cc:to:subject:message-id:date:from :in-reply-to:references:mime-version:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=mx1xUOGwUwRxlSog7kXROLCSgFy/N65NabND476s8WQ=; b=pYZkXOKgschB3Ptihh5eBycKW/h7YNyPqakhi//8Ad3Qvfrot0hGUllW9wm/QaCR8p 7xfwstVDlKEp99gUy9Nco3lu3aex+Sbv9H9FbUn0JSQjKO9GT7AOuHiW7x4dwMPfrEzH 758EMx364rysC4846+eaa1ciSRTzI2Cc1bSE6SQi1hLwfYlSBxeCogupCNgpGfoDkO2w 
5Du/flKCYOOKpHqEqn9jyafTz5ddu5EXAWNiC5N7aa6rxxuX6VUdX4/+fmzXJ+m54/Lw 95dLRuMIdE1q2+m0eHIL/qOMAoaynJjgvhmjHowVra1kRJGjVB7O14L7wHl7wyENl23z i3iw== X-Gm-Message-State: ANoB5pkkxajlbBonpTm3xUzEOhDgEYIDm2mjJCVM4sbRpmP9pffqZhzH /2X5/KtmiE3HZB81tgtGHBUwTJq+o95sn+ILRjev+f0nb5A9xdrcCrpuEP7BB0m2Jdsit0mXK8y SWZLvYw1ZdmgUr9cx8qcAwC2rxiBZxaQ= X-Received: by 2002:a17:906:c02:b0:7ae:ca2f:171d with SMTP id s2-20020a1709060c0200b007aeca2f171dmr26921208ejf.353.1669282141965; Thu, 24 Nov 2022 01:29:01 -0800 (PST) X-Google-Smtp-Source: AA0mqf6F8pDMdLbODBYuqRcvFud/M6qDxAv5/jLNbea9SIqiK1wDBMTe5xEEPqqjznNGDuV2QZN2v/EMS8sJyRhs44Y= X-Received: by 2002:a17:906:c02:b0:7ae:ca2f:171d with SMTP id s2-20020a1709060c0200b007aeca2f171dmr26921193ejf.353.1669282141618; Thu, 24 Nov 2022 01:29:01 -0800 (PST) MIME-Version: 1.0 References: In-Reply-To: From: Jonathan Wakely Date: Thu, 24 Nov 2022 09:28:50 +0000 Message-ID: Subject: Re: [PATCH] libstdc++: Another merge from fast_float upstream [PR107468] To: Jakub Jelinek Cc: libstdc++@gcc.gnu.org, gcc-patches@gcc.gnu.org X-Mimecast-Spam-Score: 0 X-Mimecast-Originator: redhat.com Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Spam-Status: No, score=-6.3 required=5.0 tests=BAYES_00,DKIMWL_WL_HIGH,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,KAM_INFOUSMEBIZ,RCVD_IN_DNSWL_NONE,RCVD_IN_MSPIKE_H2,SPF_HELO_NONE,SPF_NONE,TXREP autolearn=no autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: On Thu, 24 Nov 2022 at 09:23, Jakub Jelinek wrote: > > Hi! > > Upstream fast_float came up with a cheaper test for > fegetround () == FE_TONEAREST using one float addition, one subtraction > and one comparison. If we know we are rounding to nearest, we can use > fast path in more cases than before. > The following patch merges those changes into libstdc++. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? OK, thanks. 
> > 2022-11-24 Jakub Jelinek > > PR libstdc++/107468 > * src/c++17/fast_float/MERGE: Adjust for merge from upstream. > * src/c++17/fast_float/fast_float.h: Merge from fast_float > 2ef9abbcf6a11958b6fa685a89d0150022e82e78 commit. > > --- libstdc++-v3/src/c++17/fast_float/MERGE.jj 2022-11-07 15:17:14.03507= 1694 +0100 > +++ libstdc++-v3/src/c++17/fast_float/MERGE 2022-11-23 17:09:20.94086= 6070 +0100 > @@ -1,4 +1,4 @@ > -662497742fea7055f0e0ee27e5a7ddc382c2c38e > +2ef9abbcf6a11958b6fa685a89d0150022e82e78 > > The first line of this file holds the git revision number of the > last merge done from the master library sources. > --- libstdc++-v3/src/c++17/fast_float/fast_float.h.jj 2022-11-07 15:17:= 14.066071268 +0100 > +++ libstdc++-v3/src/c++17/fast_float/fast_float.h 2022-11-23 17:19:= 41.735693122 +0100 > @@ -99,11 +99,11 @@ from_chars_result from_chars_advanced(co > || defined(__MINGW64__) = \ > || defined(__s390x__) = \ > || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le= __) || defined(__PPC64LE__)) ) > -#define FASTFLOAT_64BIT > +#define FASTFLOAT_64BIT 1 > #elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) \ > || defined(__arm__) || defined(_M_ARM) \ > || defined(__MINGW32__) || defined(__EMSCRIPTEN__)) > -#define FASTFLOAT_32BIT > +#define FASTFLOAT_32BIT 1 > #else > // Need to check incrementally, since SIZE_MAX is a size_t, avoid over= flow. > // We can never tell the register width, but the SIZE_MAX is a good ap= proximation. > @@ -111,9 +111,9 @@ from_chars_result from_chars_advanced(co > #if SIZE_MAX =3D=3D 0xffff > #error Unknown platform (16-bit, unsupported) > #elif SIZE_MAX =3D=3D 0xffffffff > - #define FASTFLOAT_32BIT > + #define FASTFLOAT_32BIT 1 > #elif SIZE_MAX =3D=3D 0xffffffffffffffff > - #define FASTFLOAT_64BIT > + #define FASTFLOAT_64BIT 1 > #else > #error Unknown platform (not 32-bit, not 64-bit?) 
> #endif > @@ -359,10 +359,12 @@ template struct binary_form > static inline constexpr int minimum_exponent(); > static inline constexpr int infinite_power(); > static inline constexpr int sign_index(); > + static inline constexpr int min_exponent_fast_path(); // used when feg= etround() =3D=3D FE_TONEAREST > static inline constexpr int max_exponent_fast_path(); > static inline constexpr int max_exponent_round_to_even(); > static inline constexpr int min_exponent_round_to_even(); > static inline constexpr uint64_t max_mantissa_fast_path(int64_t power)= ; > + static inline constexpr uint64_t max_mantissa_fast_path(); // used whe= n fegetround() =3D=3D FE_TONEAREST > static inline constexpr int largest_power_of_ten(); > static inline constexpr int smallest_power_of_ten(); > static inline constexpr T exact_power_of_ten(int64_t power); > @@ -372,6 +374,22 @@ template struct binary_form > static inline constexpr equiv_uint hidden_bit_mask(); > }; > > +template <> inline constexpr int binary_format::min_exponent_fas= t_path() { > +#if (FLT_EVAL_METHOD !=3D 1) && (FLT_EVAL_METHOD !=3D 0) > + return 0; > +#else > + return -22; > +#endif > +} > + > +template <> inline constexpr int binary_format::min_exponent_fast= _path() { > +#if (FLT_EVAL_METHOD !=3D 1) && (FLT_EVAL_METHOD !=3D 0) > + return 0; > +#else > + return -10; > +#endif > +} > + > template <> inline constexpr int binary_format::mantissa_explici= t_bits() { > return 52; > } > @@ -418,13 +436,18 @@ template <> inline constexpr int binary_ > template <> inline constexpr int binary_format::max_exponent_fast= _path() { > return 10; > } > - > +template <> inline constexpr uint64_t binary_format::max_mantiss= a_fast_path() { > + return uint64_t(2) << mantissa_explicit_bits(); > +} > template <> inline constexpr uint64_t binary_format::max_mantiss= a_fast_path(int64_t power) { > // caller is responsible to ensure that > // power >=3D 0 && power <=3D 22 > // > return max_mantissa_double[power]; > } > +template <> inline 
constexpr uint64_t binary_format::max_mantissa= _fast_path() { > + return uint64_t(2) << mantissa_explicit_bits(); > +} > template <> inline constexpr uint64_t binary_format::max_mantissa= _fast_path(int64_t power) { > // caller is responsible to ensure that > // power >=3D 0 && power <=3D 10 > @@ -619,10 +642,6 @@ parsed_number_string parse_number_string > > uint64_t i =3D 0; // an unsigned int avoids signed overflows (which ar= e bad) > > - while ((std::distance(p, pend) >=3D 8) && is_made_of_eight_digits_fast= (p)) { > - i =3D i * 100000000 + parse_eight_digits_unrolled(p); // in rare cas= es, this will overflow, but that's ok > - p +=3D 8; > - } > while ((p !=3D pend) && is_integer(*p)) { > // a multiplication by 10 is cheaper than an arbitrary integer > // multiplication > @@ -1640,7 +1659,7 @@ namespace fast_float { > // we might have platforms where `CHAR_BIT` is not 8, so let's avoid > // doing `8 * sizeof(limb)`. > #if defined(FASTFLOAT_64BIT) && !defined(__sparc) > -#define FASTFLOAT_64BIT_LIMB > +#define FASTFLOAT_64BIT_LIMB 1 > typedef uint64_t limb; > constexpr size_t limb_bits =3D 64; > #else > @@ -2314,10 +2333,6 @@ parsed_number_string parse_number_string > > uint64_t i =3D 0; // an unsigned int avoids signed overflows (which ar= e bad) > > - while ((std::distance(p, pend) >=3D 8) && is_made_of_eight_digits_fast= (p)) { > - i =3D i * 100000000 + parse_eight_digits_unrolled(p); // in rare cas= es, this will overflow, but that's ok > - p +=3D 8; > - } > while ((p !=3D pend) && is_integer(*p)) { > // a multiplication by 10 is cheaper than an arbitrary integer > // multiplication > @@ -2892,6 +2907,48 @@ from_chars_result parse_infnan(const cha > return answer; > } > > +/** > + * Returns true if the floating-pointing rounding mode is to 'nearest'. > + * It is the default on most system. This function is meant to be inexpe= nsive. 
> + * Credit : @mwalcott3 > + */ > +fastfloat_really_inline bool rounds_to_nearest() noexcept { > + // See > + // A fast function to check your floating-point rounding mode > + // https://lemire.me/blog/2022/11/16/a-fast-function-to-check-your-flo= ating-point-rounding-mode/ > + // > + // This function is meant to be equivalent to : > + // prior: #include > + // return fegetround() =3D=3D FE_TONEAREST; > + // However, it is expected to be much faster than the fegetround() > + // function call. > + // > + // The volatile keywoard prevents the compiler from computing the func= tion > + // at compile-time. > + // There might be other ways to prevent compile-time optimizations (e.= g., asm). > + // The value does not need to be std::numeric_limits::min(), an= y small > + // value so that 1 + x should round to 1 would do (after accounting fo= r excess > + // precision, as in 387 instructions). > + static volatile float fmin =3D std::numeric_limits::min(); > + float fmini =3D fmin; // we copy it so that it gets loaded at most onc= e. > + // > + // Explanation: > + // Only when fegetround() =3D=3D FE_TONEAREST do we have that > + // fmin + 1.0f =3D=3D 1.0f - fmin. > + // > + // FE_UPWARD: > + // fmin + 1.0f > 1 > + // 1.0f - fmin =3D=3D 1 > + // > + // FE_DOWNWARD or FE_TOWARDZERO: > + // fmin + 1.0f =3D=3D 1 > + // 1.0f - fmin < 1 > + // > + // Note: This may fail to be accurate if fast-math has been > + // enabled, as rounding conventions may not apply. 
> + return (fmini + 1.0f =3D=3D 1.0f - fmini); > +} > + > } // namespace detail > > template > @@ -2919,12 +2976,45 @@ from_chars_result from_chars_advanced(co > } > answer.ec =3D std::errc(); // be optimistic > answer.ptr =3D pns.lastmatch; > - // Next is a modified Clinger's fast path, inspired by Jakub Jel=C3=AD= nek's proposal > - if (pns.exponent >=3D 0 && pns.exponent <=3D binary_format::max_exp= onent_fast_path() && pns.mantissa <=3Dbinary_format::max_mantissa_fast_p= ath(pns.exponent) && !pns.too_many_digits) { > - value =3D T(pns.mantissa); > - value =3D value * binary_format::exact_power_of_ten(pns.exponent)= ; > - if (pns.negative) { value =3D -value; } > - return answer; > + // The implementation of the Clinger's fast path is convoluted because > + // we want round-to-nearest in all cases, irrespective of the rounding= mode > + // selected on the thread. > + // We proceed optimistically, assuming that detail::rounds_to_nearest(= ) returns > + // true. > + if (binary_format::min_exponent_fast_path() <=3D pns.exponent && pn= s.exponent <=3D binary_format::max_exponent_fast_path() && !pns.too_many= _digits) { > + // Unfortunately, the conventional Clinger's fast path is only possi= ble > + // when the system rounds to the nearest float. > + // > + // We expect the next branch to almost always be selected. > + // We could check it first (before the previous branch), but > + // there might be performance advantages at having the check > + // be last. > + if(detail::rounds_to_nearest()) { > + // We have that fegetround() =3D=3D FE_TONEAREST. > + // Next is Clinger's fast path. 
> + if (pns.mantissa <=3Dbinary_format::max_mantissa_fast_path()) { > + value =3D T(pns.mantissa); > + if (pns.exponent < 0) { value =3D value / binary_format::exac= t_power_of_ten(-pns.exponent); } > + else { value =3D value * binary_format::exact_power_of_ten(pn= s.exponent); } > + if (pns.negative) { value =3D -value; } > + return answer; > + } > + } else { > + // We do not have that fegetround() =3D=3D FE_TONEAREST. > + // Next is a modified Clinger's fast path, inspired by Jakub Jel= =C3=ADnek's proposal > + if (pns.exponent >=3D 0 && pns.mantissa <=3Dbinary_format::max_= mantissa_fast_path(pns.exponent)) { > +#if (defined(_WIN32) && defined(__clang__)) > + // ClangCL may map 0 to -0.0 when fegetround() =3D=3D FE_DOWNWAR= D > + if(pns.mantissa =3D=3D 0) { > + value =3D 0; > + return answer; > + } > +#endif > + value =3D T(pns.mantissa) * binary_format::exact_power_of_ten= (pns.exponent); > + if (pns.negative) { value =3D -value; } > + return answer; > + } > + } > } > adjusted_mantissa am =3D compute_float>(pns.exponent,= pns.mantissa); > if(pns.too_many_digits && am.power2 >=3D 0) { > > Jakub >