From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <jakub@sourceware.org>
Received: by sourceware.org (Postfix, from userid 2153)
	id B1B713853D6C; Thu, 24 Nov 2022 09:39:10 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org B1B713853D6C
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1669282750;
	bh=MiM0Igsnf3gqk/mdu6Q84+7vcDEyo5/arDz8Hb65msU=;
	h=From:To:Subject:Date:From;
	b=XwsFPV0WnDvRm7sCjd94ewkzI8oZsGqzvz3b1pDN+rUWwlisKNF8Ms7jmK+dVgiTa
	 N+aqTXcvnZat3GuUAU9PRxd4lTDQzT/yxC/VrviyedeMH0X4K+6PMXqYq9emj0mCYq
	 4zrL5VBWwTzggN2bYHoIOxVEYs0+SJmewq597E84=
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Content-Type: text/plain; charset="utf-8"
From: Jakub Jelinek <jakub@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org, libstdc++-cvs@gcc.gnu.org
Subject: [gcc r13-4285] libstdc++: Another merge from fast_float upstream
 [PR107468]
X-Act-Checkin: gcc
X-Git-Author: Jakub Jelinek <jakub@redhat.com>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: d1389be011f0fac422e98e795c55156052c4d960
X-Git-Newrev: ec73b55c75baa16c1cf7482fa65928a8d45598d4
Message-Id: <20221124093910.B1B713853D6C@sourceware.org>
Date: Thu, 24 Nov 2022 09:39:10 +0000 (GMT)
List-Id: <libstdc++-cvs.sourceware.org>

https://gcc.gnu.org/g:ec73b55c75baa16c1cf7482fa65928a8d45598d4

commit r13-4285-gec73b55c75baa16c1cf7482fa65928a8d45598d4
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Thu Nov 24 10:38:42 2022 +0100

    libstdc++: Another merge from fast_float upstream [PR107468]
    
    Upstream fast_float came up with a cheaper test for
    fegetround () == FE_TONEAREST using one float addition, one subtraction
    and one comparison.  If we know we are rounding to nearest, we can use
    the fast path in more cases than before.
    The following patch merges those changes into libstdc++.
    
    2022-11-24  Jakub Jelinek  <jakub@redhat.com>
    
            PR libstdc++/107468
            * src/c++17/fast_float/MERGE: Adjust for merge from upstream.
            * src/c++17/fast_float/fast_float.h: Merge from fast_float
            2ef9abbcf6a11958b6fa685a89d0150022e82e78 commit.

Diff:
---
 libstdc++-v3/src/c++17/fast_float/MERGE        |   2 +-
 libstdc++-v3/src/c++17/fast_float/fast_float.h | 130 +++++++++++++++++++++----
 2 files changed, 111 insertions(+), 21 deletions(-)

diff --git a/libstdc++-v3/src/c++17/fast_float/MERGE b/libstdc++-v3/src/c++17/fast_float/MERGE
index 20eae9d710f..82cca63704c 100644
--- a/libstdc++-v3/src/c++17/fast_float/MERGE
+++ b/libstdc++-v3/src/c++17/fast_float/MERGE
@@ -1,4 +1,4 @@
-662497742fea7055f0e0ee27e5a7ddc382c2c38e
+2ef9abbcf6a11958b6fa685a89d0150022e82e78
 
 The first line of this file holds the git revision number of the
 last merge done from the master library sources.
diff --git a/libstdc++-v3/src/c++17/fast_float/fast_float.h b/libstdc++-v3/src/c++17/fast_float/fast_float.h
index 5da55e2fe0a..7551c4f89ef 100644
--- a/libstdc++-v3/src/c++17/fast_float/fast_float.h
+++ b/libstdc++-v3/src/c++17/fast_float/fast_float.h
@@ -99,11 +99,11 @@ from_chars_result from_chars_advanced(const char *first, const char *last,
        || defined(__MINGW64__)                                          \
        || defined(__s390x__)                                            \
        || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)) )
-#define FASTFLOAT_64BIT
+#define FASTFLOAT_64BIT 1
 #elif (defined(__i386) || defined(__i386__) || defined(_M_IX86)   \
      || defined(__arm__) || defined(_M_ARM)                   \
      || defined(__MINGW32__) || defined(__EMSCRIPTEN__))
-#define FASTFLOAT_32BIT
+#define FASTFLOAT_32BIT 1
 #else
   // Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow.
   // We can never tell the register width, but the SIZE_MAX is a good approximation.
@@ -111,9 +111,9 @@ from_chars_result from_chars_advanced(const char *first, const char *last,
   #if SIZE_MAX == 0xffff
     #error Unknown platform (16-bit, unsupported)
   #elif SIZE_MAX == 0xffffffff
-    #define FASTFLOAT_32BIT
+    #define FASTFLOAT_32BIT 1
   #elif SIZE_MAX == 0xffffffffffffffff
-    #define FASTFLOAT_64BIT
+    #define FASTFLOAT_64BIT 1
   #else
     #error Unknown platform (not 32-bit, not 64-bit?)
   #endif
@@ -359,10 +359,12 @@ template <typename T> struct binary_format {
   static inline constexpr int minimum_exponent();
   static inline constexpr int infinite_power();
   static inline constexpr int sign_index();
+  static inline constexpr int min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST
   static inline constexpr int max_exponent_fast_path();
   static inline constexpr int max_exponent_round_to_even();
   static inline constexpr int min_exponent_round_to_even();
   static inline constexpr uint64_t max_mantissa_fast_path(int64_t power);
+  static inline constexpr uint64_t max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST
   static inline constexpr int largest_power_of_ten();
   static inline constexpr int smallest_power_of_ten();
   static inline constexpr T exact_power_of_ten(int64_t power);
@@ -372,6 +374,22 @@ template <typename T> struct binary_format {
   static inline constexpr equiv_uint hidden_bit_mask();
 };
 
+template <> inline constexpr int binary_format<double>::min_exponent_fast_path() {
+#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
+  return 0;
+#else
+  return -22;
+#endif
+}
+
+template <> inline constexpr int binary_format<float>::min_exponent_fast_path() {
+#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
+  return 0;
+#else
+  return -10;
+#endif
+}
+
 template <> inline constexpr int binary_format<double>::mantissa_explicit_bits() {
   return 52;
 }
@@ -418,13 +436,18 @@ template <> inline constexpr int binary_format<double>::max_exponent_fast_path()
 template <> inline constexpr int binary_format<float>::max_exponent_fast_path() {
   return 10;
 }
-
+template <> inline constexpr uint64_t binary_format<double>::max_mantissa_fast_path() {
+  return uint64_t(2) << mantissa_explicit_bits();
+}
 template <> inline constexpr uint64_t binary_format<double>::max_mantissa_fast_path(int64_t power) {
   // caller is responsible to ensure that
   // power >= 0 && power <= 22
   //
   return max_mantissa_double[power];
 }
+template <> inline constexpr uint64_t binary_format<float>::max_mantissa_fast_path() {
+  return uint64_t(2) << mantissa_explicit_bits();
+}
 template <> inline constexpr uint64_t binary_format<float>::max_mantissa_fast_path(int64_t power) {
   // caller is responsible to ensure that
   // power >= 0 && power <= 10
@@ -619,10 +642,6 @@ parsed_number_string parse_number_string(const char *p, const char *pend, parse_
 
   uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
 
-  while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
-    i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
-    p += 8;
-  }
   while ((p != pend) && is_integer(*p)) {
     // a multiplication by 10 is cheaper than an arbitrary integer
     // multiplication
@@ -1640,7 +1659,7 @@ namespace fast_float {
 // we might have platforms where `CHAR_BIT` is not 8, so let's avoid
 // doing `8 * sizeof(limb)`.
 #if defined(FASTFLOAT_64BIT) && !defined(__sparc)
-#define FASTFLOAT_64BIT_LIMB
+#define FASTFLOAT_64BIT_LIMB 1
 typedef uint64_t limb;
 constexpr size_t limb_bits = 64;
 #else
@@ -2314,10 +2333,6 @@ parsed_number_string parse_number_string(const char *p, const char *pend, parse_
 
   uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
 
-  while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
-    i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
-    p += 8;
-  }
   while ((p != pend) && is_integer(*p)) {
     // a multiplication by 10 is cheaper than an arbitrary integer
     // multiplication
@@ -2892,6 +2907,48 @@ from_chars_result parse_infnan(const char *first, const char *last, T &value)  n
   return answer;
 }
 
+/**
+ * Returns true if the floating-point rounding mode is to 'nearest'.
+ * It is the default on most systems. This function is meant to be inexpensive.
+ * Credit : @mwalcott3
+ */
+fastfloat_really_inline bool rounds_to_nearest() noexcept {
+  // See
+  // A fast function to check your floating-point rounding mode
+  // https://lemire.me/blog/2022/11/16/a-fast-function-to-check-your-floating-point-rounding-mode/
+  //
+  // This function is meant to be equivalent to :
+  // prior: #include <cfenv>
+  //  return fegetround() == FE_TONEAREST;
+  // However, it is expected to be much faster than the fegetround()
+  // function call.
+  //
+  // The volatile keyword prevents the compiler from computing the function
+  // at compile-time.
+  // There might be other ways to prevent compile-time optimizations (e.g., asm).
+  // The value does not need to be std::numeric_limits<float>::min(); any small
+  // value x such that 1 + x rounds to 1 would do (after accounting for excess
+  // precision, as in 387 instructions).
+  static volatile float fmin = std::numeric_limits<float>::min();
+  float fmini = fmin; // we copy it so that it gets loaded at most once.
+  //
+  // Explanation:
+  // Only when fegetround() == FE_TONEAREST do we have that
+  // fmin + 1.0f == 1.0f - fmin.
+  //
+  // FE_UPWARD:
+  //  fmin + 1.0f > 1
+  //  1.0f - fmin == 1
+  //
+  // FE_DOWNWARD or  FE_TOWARDZERO:
+  //  fmin + 1.0f == 1
+  //  1.0f - fmin < 1
+  //
+  // Note: This may fail to be accurate if fast-math has been
+  // enabled, as rounding conventions may not apply.
+  return (fmini + 1.0f == 1.0f - fmini);
+}
+
 } // namespace detail
 
 template<typename T>
@@ -2919,12 +2976,45 @@ from_chars_result from_chars_advanced(const char *first, const char *last,
   }
   answer.ec = std::errc(); // be optimistic
   answer.ptr = pns.lastmatch;
-  // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal
-  if (pns.exponent >= 0 && pns.exponent <= binary_format<T>::max_exponent_fast_path() && pns.mantissa <=binary_format<T>::max_mantissa_fast_path(pns.exponent) && !pns.too_many_digits) {
-    value = T(pns.mantissa);
-    value = value * binary_format<T>::exact_power_of_ten(pns.exponent);
-    if (pns.negative) { value = -value; }
-    return answer;
+  // The implementation of the Clinger's fast path is convoluted because
+  // we want round-to-nearest in all cases, irrespective of the rounding mode
+  // selected on the thread.
+  // We proceed optimistically, assuming that detail::rounds_to_nearest() returns
+  // true.
+  if (binary_format<T>::min_exponent_fast_path() <= pns.exponent && pns.exponent <= binary_format<T>::max_exponent_fast_path() && !pns.too_many_digits) {
+    // Unfortunately, the conventional Clinger's fast path is only possible
+    // when the system rounds to the nearest float.
+    //
+    // We expect the next branch to almost always be selected.
+    // We could check it first (before the previous branch), but
+    // there might be performance advantages to having the check
+    // be last.
+    if(detail::rounds_to_nearest())  {
+      // We have that fegetround() == FE_TONEAREST.
+      // Next is Clinger's fast path.
+      if (pns.mantissa <=binary_format<T>::max_mantissa_fast_path()) {
+        value = T(pns.mantissa);
+        if (pns.exponent < 0) { value = value / binary_format<T>::exact_power_of_ten(-pns.exponent); }
+        else { value = value * binary_format<T>::exact_power_of_ten(pns.exponent); }
+        if (pns.negative) { value = -value; }
+        return answer;
+      }
+    } else {
+      // We do not have that fegetround() == FE_TONEAREST.
+      // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal
+      if (pns.exponent >= 0 && pns.mantissa <=binary_format<T>::max_mantissa_fast_path(pns.exponent)) {
+#if (defined(_WIN32) && defined(__clang__))
+        // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD
+        if(pns.mantissa == 0) {
+          value = 0;
+          return answer;
+        }
+#endif
+        value = T(pns.mantissa) * binary_format<T>::exact_power_of_ten(pns.exponent);
+        if (pns.negative) { value = -value; }
+        return answer;
+      }
+    }
   }
   adjusted_mantissa am = compute_float<binary_format<T>>(pns.exponent, pns.mantissa);
   if(pns.too_many_digits && am.power2 >= 0) {