commit dbf8bd3c2f2cd2d27ca4f0fe379bd9490273c6d7
Author: Maged Michael <maged.michael@gmail.com>
Date:   Tue Dec 7 15:20:58 2021

    libstdc++: Skip atomic instructions in shared_ptr when both counts are 1
    
    This rewrites _Sp_counted_base::_M_release to skip the two atomic
    instructions that decrement each of the use count and the weak count
    when both are 1.
    
    Benefits: Save the cost of the last atomic decrements of each of the use
    count and the weak count in _Sp_counted_base. Atomic instructions are
    significantly slower than regular loads and stores across major
    architectures.
    
    How current code works: _M_release() atomically decrements the use
    count, checks if it was 1, if so calls _M_dispose(), atomically
    decrements the weak count, checks if it was 1, and if so calls
    _M_destroy().
    
    How the proposed algorithm works: _M_release() loads both use count and
    weak count together atomically (assuming suitable alignment, discussed
    later), checks if the value corresponds to a 0x1 value in the individual
    count members, and if so calls _M_dispose() and _M_destroy().
    Otherwise, it follows the original algorithm.
    
    Why it works: When the current thread executing _M_release() finds each
    of the counts is equal to 1, then no other threads could possibly hold
    use or weak references to this control block. That is, no other threads
    could possibly access the counts or the protected object.
    
    There are two crucial high-level issues that I'd like to point out first:
    - Atomicity of access to the counts together
    - Proper alignment of the counts together
    
    The patch is intended to apply the proposed algorithm only to the case of
    64-bit mode, 4-byte counts, and 8-byte aligned _Sp_counted_base.
    
    ** Atomicity **
    - The proposed algorithm depends on the mutual atomicity among 8-byte
    atomic operations and 4-byte atomic operations on each of the 4-byte halves
    of the 8-byte aligned 8-byte block.
    - The standard does not guarantee atomicity of 8-byte operations on a pair
    of 8-byte aligned 4-byte objects.
    - To my knowledge this works in practice on systems that guarantee native
    implementation of 4-byte and 8-byte atomic operations.
    - __atomic_always_lock_free is used to check for native atomic operations.
    
    ** Alignment **
    - _Sp_counted_base is an internal base class, with a virtual destructor,
    so it has a vptr at the beginning of the class, and will be aligned to
    alignof(void*) i.e. 8 bytes.
    - The first members of the class are the 4-byte use count and 4-byte
    weak count, which will occupy 8 contiguous bytes immediately after the
    vptr, i.e. they form an 8-byte aligned 8 byte range.
    
    Other points:
    - The proposed algorithm can interact correctly with the current algorithm.
    That is, multiple threads using different versions of the code with and
    without the patch operating on the same objects should always interact
    correctly. The intent for the patch is to be ABI compatible with the
    current implementation.
    - The proposed patch involves a performance trade-off between saving the
    costs of atomic instructions when the counts are both 1 vs adding the cost
    of loading the 8-byte combined counts and comparison with {0x1, 0x1}.
    - I noticed a big difference between the code generated by GCC vs LLVM. GCC
    seems to generate noticeably more code and what seems to be redundant null
    checks and branches.
    - The patch has been in use (built using LLVM) in a large environment for
    many months. The performance gains outweigh the losses (roughly 10 to 1)
    across a large variety of workloads.
    
    Signed-off-by: Jonathan Wakely <jwakely@redhat.com>
    
    Co-authored-by: Jonathan Wakely <jwakely@redhat.com>
    
    libstdc++-v3/ChangeLog:
    
            * include/bits/c++config (_GLIBCXX_TSAN): Define macro
            indicating that TSan is in use.
            * include/bits/shared_ptr_base.h (_Sp_counted_base::_M_release):
            Replace definition in primary template with explicit
            specializations for _S_mutex and _S_atomic policies.
            (_Sp_counted_base<_S_mutex>::_M_release): New specialization.
            (_Sp_counted_base<_S_atomic>::_M_release): New specialization,
            using a single atomic load to access both reference counts at
            once.
            (_Sp_counted_base::_M_release_last_use): New member function.

diff --git a/libstdc++-v3/include/bits/c++config b/libstdc++-v3/include/bits/c++config
index 90513ccae38..f2d704f57eb 100644
--- a/libstdc++-v3/include/bits/c++config
+++ b/libstdc++-v3/include/bits/c++config
@@ -577,6 +577,15 @@ namespace std
   do { __glibcxx_constexpr_assert(cond); } while (false)
 #endif
 
+// Macro indicating that TSAN is in use.
+#if __SANITIZE_THREAD__
+#  define _GLIBCXX_TSAN 1
+#elif defined __has_feature
+# if __has_feature(thread_sanitizer)
+#  define _GLIBCXX_TSAN 1
+# endif
+#endif
+
 // Macros for race detectors.
 // _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(A) and
 // _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(A) should be used to explain
diff --git a/libstdc++-v3/include/bits/shared_ptr_base.h b/libstdc++-v3/include/bits/shared_ptr_base.h
index 3473a74280d..90ad30947b0 100644
--- a/libstdc++-v3/include/bits/shared_ptr_base.h
+++ b/libstdc++-v3/include/bits/shared_ptr_base.h
@@ -143,10 +143,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       virtual void*
       _M_get_deleter(const std::type_info&) noexcept = 0;
 
+      // Increment the use count (used when the count is greater than zero).
       void
       _M_add_ref_copy()
       { __gnu_cxx::__atomic_add_dispatch(&_M_use_count, 1); }
 
+      // Increment the use count if it is non-zero, throw otherwise.
       void
       _M_add_ref_lock()
       {
@@ -154,42 +156,51 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  __throw_bad_weak_ptr();
       }
 
+      // Increment the use count if it is non-zero.
       bool
       _M_add_ref_lock_nothrow() noexcept;
 
+      // Decrement the use count.
       void
-      _M_release() noexcept
-      {
-        // Be race-detector-friendly.  For more info see bits/c++config.
-        _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&_M_use_count);
-	if (__gnu_cxx::__exchange_and_add_dispatch(&_M_use_count, -1) == 1)
-	  {
-            _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(&_M_use_count);
-	    _M_dispose();
-	    // There must be a memory barrier between dispose() and destroy()
-	    // to ensure that the effects of dispose() are observed in the
-	    // thread that runs destroy().
-	    // See http://gcc.gnu.org/ml/libstdc++/2005-11/msg00136.html
-	    if (_Mutex_base<_Lp>::_S_need_barriers)
-	      {
-		__atomic_thread_fence (__ATOMIC_ACQ_REL);
-	      }
+      _M_release() noexcept;
 
-            // Be race-detector-friendly.  For more info see bits/c++config.
-            _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&_M_weak_count);
-	    if (__gnu_cxx::__exchange_and_add_dispatch(&_M_weak_count,
-						       -1) == 1)
-              {
-                _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(&_M_weak_count);
-	        _M_destroy();
-              }
+      // Called by _M_release() when the use count reaches zero.
+      void
+      _M_release_last_use() noexcept
+      {
+	_GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(&_M_use_count);
+	_M_dispose();
+	// There must be a memory barrier between dispose() and destroy()
+	// to ensure that the effects of dispose() are observed in the
+	// thread that runs destroy().
+	// See http://gcc.gnu.org/ml/libstdc++/2005-11/msg00136.html
+	if (_Mutex_base<_Lp>::_S_need_barriers)
+	  {
+	    __atomic_thread_fence (__ATOMIC_ACQ_REL);
+	  }
+
+	// Be race-detector-friendly.  For more info see bits/c++config.
+	_GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&_M_weak_count);
+	if (__gnu_cxx::__exchange_and_add_dispatch(&_M_weak_count,
+						   -1) == 1)
+	  {
+	    _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(&_M_weak_count);
+	    _M_destroy();
 	  }
       }
 
+      // As above, but 'noinline' to reduce code size on the cold path.
+      __attribute__((__noinline__))
+      void
+      _M_release_last_use_cold() noexcept
+      { _M_release_last_use(); }
+
+      // Increment the weak count.
       void
       _M_weak_add_ref() noexcept
       { __gnu_cxx::__atomic_add_dispatch(&_M_weak_count, 1); }
 
+      // Decrement the weak count.
       void
       _M_weak_release() noexcept
       {
@@ -286,6 +297,67 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
         }
     }
 
+  template<>
+    inline void
+    _Sp_counted_base<_S_mutex>::_M_release() noexcept
+    {
+      // Be race-detector-friendly.  For more info see bits/c++config.
+      _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&_M_use_count);
+      if (__gnu_cxx::__exchange_and_add_dispatch(&_M_use_count, -1) == 1)
+	{
+	  _M_release_last_use();
+	}
+    }
+
+  template<>
+    inline void
+    _Sp_counted_base<_S_atomic>::_M_release() noexcept
+    {
+      _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&_M_use_count);
+#if ! _GLIBCXX_TSAN
+      constexpr bool __lock_free
+	= __atomic_always_lock_free(sizeof(long long), 0)
+	&& __atomic_always_lock_free(sizeof(_Atomic_word), 0);
+      constexpr bool __double_word
+	= sizeof(long long) == 2 * sizeof(_Atomic_word);
+      // The ref-count members follow the vptr, so are aligned to
+      // alignof(void*).
+      constexpr bool __aligned = __alignof(long long) <= alignof(void*);
+      if _GLIBCXX17_CONSTEXPR (__lock_free && __double_word && __aligned)
+	{
+	  constexpr long long __unique_ref
+	    = 1LL + (1LL << (__CHAR_BIT__ * sizeof(_Atomic_word)));
+	  auto __both_counts = reinterpret_cast<long long*>(&_M_use_count);
+
+	  _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&_M_weak_count);
+	  if (__atomic_load_n(__both_counts, __ATOMIC_ACQUIRE) == __unique_ref)
+	    {
+	      // Both counts are 1, so there are no weak references and
+	      // we are releasing the last strong reference. No other
+	      // threads can observe the effects of this _M_release()
+	      // call (e.g. calling use_count()) without a data race.
+	      *(long long*)(&_M_use_count) = 0;
+	      _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(&_M_use_count);
+	      _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(&_M_weak_count);
+	      _M_dispose();
+	      _M_destroy();
+	      return;
+	    }
+	  if (__gnu_cxx::__exchange_and_add_dispatch(&_M_use_count, -1) == 1)
+	    [[__unlikely__]]
+	    {
+	      _M_release_last_use_cold();
+	      return;
+	    }
+	}
+      else
+#endif
+      if (__gnu_cxx::__exchange_and_add_dispatch(&_M_use_count, -1) == 1)
+	{
+	  _M_release_last_use();
+	}
+    }
+
   template<>
     inline void
     _Sp_counted_base<_S_single>::_M_weak_add_ref() noexcept