public inbox for gcc-cvs@sourceware.org
* [gcc(refs/users/marxin/heads/merge-libsanitizer-v6)] libsanitizer: merge from master (996ef895cd3d1313665a42fc8e20d1d4e1cf2a28)
@ 2021-11-22 18:38 Martin Liska
From: Martin Liska @ 2021-11-22 18:38 UTC
  To: gcc-cvs

https://gcc.gnu.org/g:b60adfb0920df928db5e6b38be48f6ea2f28cd78

commit b60adfb0920df928db5e6b38be48f6ea2f28cd78
Author: Martin Liska <mliska@suse.cz>
Date:   Mon Nov 22 19:20:17 2021 +0100

    libsanitizer: merge from master (996ef895cd3d1313665a42fc8e20d1d4e1cf2a28)

Diff:
---
 libsanitizer/MERGE                                 |   2 +-
 libsanitizer/asan/asan_fuchsia.cpp                 |  11 +-
 libsanitizer/asan/asan_globals.cpp                 |  19 +
 libsanitizer/asan/asan_interceptors.h              |   7 +-
 libsanitizer/asan/asan_mapping.h                   |   2 +-
 libsanitizer/asan/asan_thread.cpp                  |   3 +-
 libsanitizer/lsan/lsan_fuchsia.cpp                 |   5 +-
 libsanitizer/lsan/lsan_interceptors.cpp            |  23 +-
 libsanitizer/lsan/lsan_mac.cpp                     |   2 +-
 libsanitizer/lsan/lsan_posix.cpp                   |   2 +-
 libsanitizer/lsan/lsan_thread.cpp                  |  26 +-
 libsanitizer/lsan/lsan_thread.h                    |   5 +-
 .../sanitizer_chained_origin_depot.cpp             |   1 -
 .../sanitizer_common/sanitizer_dense_map.h         | 678 ++++++++++++++++
 .../sanitizer_common/sanitizer_dense_map_info.h    | 260 +++++++
 .../sanitizer_common/sanitizer_internal_defs.h     |   3 +-
 libsanitizer/sanitizer_common/sanitizer_linux.cpp  |   2 +-
 .../sanitizer_common/sanitizer_linux_libcdep.cpp   |   4 -
 .../sanitizer_common/sanitizer_linux_s390.cpp      |  14 +-
 libsanitizer/sanitizer_common/sanitizer_mac.cpp    |  12 +-
 libsanitizer/sanitizer_common/sanitizer_mac.h      |  20 -
 .../sanitizer_persistent_allocator.h               | 110 ---
 .../sanitizer_platform_limits_linux.cpp            |   5 +-
 .../sanitizer_platform_limits_posix.h              |   2 +-
 libsanitizer/sanitizer_common/sanitizer_printf.cpp |  10 +-
 .../sanitizer_common/sanitizer_stack_store.cpp     |  91 +++
 .../sanitizer_common/sanitizer_stack_store.h       |  50 ++
 .../sanitizer_common/sanitizer_stackdepot.cpp      |  32 +-
 .../sanitizer_common/sanitizer_stacktrace.cpp      |  17 +-
 .../sanitizer_common/sanitizer_thread_registry.cpp |  38 +-
 .../sanitizer_common/sanitizer_thread_registry.h   |   5 +
 .../sanitizer_common/sanitizer_type_traits.h       |  79 ++
 libsanitizer/tsan/tsan_defs.h                      |  23 +-
 libsanitizer/tsan/tsan_dense_alloc.h               |   9 +
 libsanitizer/tsan/tsan_flags.cpp                   |   6 -
 libsanitizer/tsan/tsan_flags.inc                   |  10 +-
 libsanitizer/tsan/tsan_interceptors_posix.cpp      |  28 +-
 libsanitizer/tsan/tsan_interface_atomic.cpp        |  87 ++-
 libsanitizer/tsan/tsan_interface_java.cpp          |   4 +-
 libsanitizer/tsan/tsan_mman.cpp                    |  22 +-
 libsanitizer/tsan/tsan_mutexset.cpp                |  54 +-
 libsanitizer/tsan/tsan_mutexset.h                  |  11 +-
 libsanitizer/tsan/tsan_platform.h                  | 173 +----
 libsanitizer/tsan/tsan_platform_linux.cpp          |  48 +-
 libsanitizer/tsan/tsan_platform_mac.cpp            |   7 -
 libsanitizer/tsan/tsan_platform_posix.cpp          |  16 +-
 libsanitizer/tsan/tsan_platform_windows.cpp        |   3 -
 libsanitizer/tsan/tsan_rtl.cpp                     | 640 ++++++++++-----
 libsanitizer/tsan/tsan_rtl.h                       | 328 ++++----
 libsanitizer/tsan/tsan_rtl_access.cpp              | 860 ++++++++++++---------
 libsanitizer/tsan/tsan_rtl_mutex.cpp               | 642 +++++++--------
 libsanitizer/tsan/tsan_rtl_ppc64.S                 |   1 -
 libsanitizer/tsan/tsan_rtl_proc.cpp                |   1 -
 libsanitizer/tsan/tsan_rtl_report.cpp              | 367 ++++-----
 libsanitizer/tsan/tsan_rtl_thread.cpp              | 224 +++---
 libsanitizer/tsan/tsan_shadow.h                    | 315 ++++----
 libsanitizer/tsan/tsan_sync.cpp                    |  82 +-
 libsanitizer/tsan/tsan_sync.h                      |  48 +-
 libsanitizer/tsan/tsan_trace.h                     | 113 ++-
 libsanitizer/tsan/tsan_update_shadow_word.inc      |  59 --
 libsanitizer/ubsan/ubsan_flags.cpp                 |   1 -
 libsanitizer/ubsan/ubsan_handlers.cpp              |  15 -
 libsanitizer/ubsan/ubsan_handlers.h                |   8 -
 libsanitizer/ubsan/ubsan_platform.h                |   2 -
 64 files changed, 3375 insertions(+), 2372 deletions(-)

diff --git a/libsanitizer/MERGE b/libsanitizer/MERGE
index 01913de5d66..1de0487b4ff 100644
--- a/libsanitizer/MERGE
+++ b/libsanitizer/MERGE
@@ -1,4 +1,4 @@
-82bc6a094e85014f1891ef9407496f44af8fe442
+996ef895cd3d1313665a42fc8e20d1d4e1cf2a28
 
 The first line of this file holds the git revision number of the
 last merge done from the master library sources.
diff --git a/libsanitizer/asan/asan_fuchsia.cpp b/libsanitizer/asan/asan_fuchsia.cpp
index b419019d137..15381d5bd0e 100644
--- a/libsanitizer/asan/asan_fuchsia.cpp
+++ b/libsanitizer/asan/asan_fuchsia.cpp
@@ -118,14 +118,12 @@ struct AsanThread::InitOptions {
 
 // Shared setup between thread creation and startup for the initial thread.
 static AsanThread *CreateAsanThread(StackTrace *stack, u32 parent_tid,
-                                    uptr user_id, bool detached,
-                                    const char *name) {
+                                    bool detached, const char *name) {
   // In lieu of AsanThread::Create.
   AsanThread *thread = (AsanThread *)MmapOrDie(AsanThreadMmapSize(), __func__);
 
   AsanThreadContext::CreateThreadContextArgs args = {thread, stack};
-  u32 tid =
-      asanThreadRegistry().CreateThread(user_id, detached, parent_tid, &args);
+  u32 tid = asanThreadRegistry().CreateThread(0, detached, parent_tid, &args);
   asanThreadRegistry().SetThreadName(tid, name);
 
   return thread;
@@ -152,7 +150,7 @@ AsanThread *CreateMainThread() {
   CHECK_NE(__sanitizer::MainThreadStackBase, 0);
   CHECK_GT(__sanitizer::MainThreadStackSize, 0);
   AsanThread *t = CreateAsanThread(
-      nullptr, 0, reinterpret_cast<uptr>(self), true,
+      nullptr, 0, true,
       _zx_object_get_property(thrd_get_zx_handle(self), ZX_PROP_NAME, name,
                               sizeof(name)) == ZX_OK
           ? name
@@ -182,8 +180,7 @@ static void *BeforeThreadCreateHook(uptr user_id, bool detached,
   GET_STACK_TRACE_THREAD;
   u32 parent_tid = GetCurrentTidOrInvalid();
 
-  AsanThread *thread =
-      CreateAsanThread(&stack, parent_tid, user_id, detached, name);
+  AsanThread *thread = CreateAsanThread(&stack, parent_tid, detached, name);
 
   // On other systems, AsanThread::Init() is called from the new
   // thread itself.  But on Fuchsia we already know the stack address
diff --git a/libsanitizer/asan/asan_globals.cpp b/libsanitizer/asan/asan_globals.cpp
index 94004877227..5f56fe6f457 100644
--- a/libsanitizer/asan/asan_globals.cpp
+++ b/libsanitizer/asan/asan_globals.cpp
@@ -154,6 +154,23 @@ static void CheckODRViolationViaIndicator(const Global *g) {
   }
 }
 
+// Check ODR violation for given global G by checking if it's already poisoned.
+// We use this method in case compiler doesn't use private aliases for global
+// variables.
+static void CheckODRViolationViaPoisoning(const Global *g) {
+  if (__asan_region_is_poisoned(g->beg, g->size_with_redzone)) {
+    // This check may not be enough: if the first global is much larger
+    // the entire redzone of the second global may be within the first global.
+    for (ListOfGlobals *l = list_of_all_globals; l; l = l->next) {
+      if (g->beg == l->g->beg &&
+          (flags()->detect_odr_violation >= 2 || g->size != l->g->size) &&
+          !IsODRViolationSuppressed(g->name))
+        ReportODRViolation(g, FindRegistrationSite(g),
+                           l->g, FindRegistrationSite(l->g));
+    }
+  }
+}
+
 // Clang provides two different ways for global variables protection:
 // it can poison the global itself or its private alias. In former
 // case we may poison same symbol multiple times, that can help us to
@@ -199,6 +216,8 @@ static void RegisterGlobal(const Global *g) {
     // where two globals with the same name are defined in different modules.
     if (UseODRIndicator(g))
       CheckODRViolationViaIndicator(g);
+    else
+      CheckODRViolationViaPoisoning(g);
   }
   if (CanPoisonMemory())
     PoisonRedZones(*g);
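
Note: the poisoning-based fallback added above targets the classic ODR scenario, where two modules in one process each define a global with the same name and the compiler did not emit private aliases or ODR indicators. A minimal reproducer sketch (file and symbol names are illustrative, not part of this commit):

  // a.cpp -> built into liba.so
  int duplicated_global[16];
  // b.cpp -> built into libb.so
  int duplicated_global[32];  // same name, different size: reported with the default setting
  // Load both ASan-instrumented libraries into one process; if the sizes were equal,
  // the report would additionally require ASAN_OPTIONS=detect_odr_violation=2,
  // matching the condition in CheckODRViolationViaPoisoning above.
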
diff --git a/libsanitizer/asan/asan_interceptors.h b/libsanitizer/asan/asan_interceptors.h
index 105c672cc24..047b044c8bf 100644
--- a/libsanitizer/asan/asan_interceptors.h
+++ b/libsanitizer/asan/asan_interceptors.h
@@ -81,12 +81,7 @@ void InitializePlatformInterceptors();
 #if ASAN_HAS_EXCEPTIONS && !SANITIZER_WINDOWS && !SANITIZER_SOLARIS && \
     !SANITIZER_NETBSD
 # define ASAN_INTERCEPT___CXA_THROW 1
-# if ! defined(ASAN_HAS_CXA_RETHROW_PRIMARY_EXCEPTION) \
-     || ASAN_HAS_CXA_RETHROW_PRIMARY_EXCEPTION
-#   define ASAN_INTERCEPT___CXA_RETHROW_PRIMARY_EXCEPTION 1
-# else
-#   define ASAN_INTERCEPT___CXA_RETHROW_PRIMARY_EXCEPTION 0
-# endif
+# define ASAN_INTERCEPT___CXA_RETHROW_PRIMARY_EXCEPTION 1
 # if defined(_GLIBCXX_SJLJ_EXCEPTIONS) || (SANITIZER_IOS && defined(__arm__))
 #  define ASAN_INTERCEPT__UNWIND_SJLJ_RAISEEXCEPTION 1
 # else
diff --git a/libsanitizer/asan/asan_mapping.h b/libsanitizer/asan/asan_mapping.h
index 4b0037fced3..e5a7f2007ae 100644
--- a/libsanitizer/asan/asan_mapping.h
+++ b/libsanitizer/asan/asan_mapping.h
@@ -165,7 +165,7 @@ static const u64 kAArch64_ShadowOffset64 = 1ULL << 36;
 static const u64 kRiscv64_ShadowOffset64 = 0xd55550000;
 static const u64 kMIPS32_ShadowOffset32 = 0x0aaa0000;
 static const u64 kMIPS64_ShadowOffset64 = 1ULL << 37;
-static const u64 kPPC64_ShadowOffset64 = 1ULL << 41;
+static const u64 kPPC64_ShadowOffset64 = 1ULL << 44;
 static const u64 kSystemZ_ShadowOffset64 = 1ULL << 52;
 static const u64 kSPARC64_ShadowOffset64 = 1ULL << 43;  // 0x80000000000
 static const u64 kFreeBSD_ShadowOffset32 = 1ULL << 30;  // 0x40000000
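
For context, ASan maps an application address to its shadow as (addr >> SHADOW_SCALE) + offset, with SHADOW_SCALE normally 3, so this hunk raises the PPC64 shadow base from 1ULL << 41 to 1ULL << 44. A minimal sketch of that mapping (not the literal macro from this header):

  static inline u64 MemToShadowSketch(u64 addr) {
    const u64 kShadowScale = 3;            // 8 application bytes per shadow byte
    const u64 kShadowOffset = 1ULL << 44;  // kPPC64_ShadowOffset64 after this change
    return (addr >> kShadowScale) + kShadowOffset;
  }
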
diff --git a/libsanitizer/asan/asan_thread.cpp b/libsanitizer/asan/asan_thread.cpp
index 8af74254cdc..930139968ec 100644
--- a/libsanitizer/asan/asan_thread.cpp
+++ b/libsanitizer/asan/asan_thread.cpp
@@ -83,8 +83,7 @@ AsanThread *AsanThread::Create(thread_callback_t start_routine, void *arg,
   thread->start_routine_ = start_routine;
   thread->arg_ = arg;
   AsanThreadContext::CreateThreadContextArgs args = {thread, stack};
-  asanThreadRegistry().CreateThread(*reinterpret_cast<uptr *>(thread), detached,
-                                    parent_tid, &args);
+  asanThreadRegistry().CreateThread(0, detached, parent_tid, &args);
 
   return thread;
 }
diff --git a/libsanitizer/lsan/lsan_fuchsia.cpp b/libsanitizer/lsan/lsan_fuchsia.cpp
index 40e65c6fb72..cb9f62a0341 100644
--- a/libsanitizer/lsan/lsan_fuchsia.cpp
+++ b/libsanitizer/lsan/lsan_fuchsia.cpp
@@ -62,7 +62,7 @@ void InitializeMainThread() {
   OnCreatedArgs args;
   __sanitizer::GetThreadStackTopAndBottom(true, &args.stack_end,
                                           &args.stack_begin);
-  u32 tid = ThreadCreate(0, GetThreadSelf(), true, &args);
+  u32 tid = ThreadCreate(kMainTid, true, &args);
   CHECK_EQ(tid, 0);
   ThreadStart(tid);
 }
@@ -86,14 +86,13 @@ void GetAllThreadAllocatorCachesLocked(InternalMmapVector<uptr> *caches) {
 void *__sanitizer_before_thread_create_hook(thrd_t thread, bool detached,
                                             const char *name, void *stack_base,
                                             size_t stack_size) {
-  uptr user_id = reinterpret_cast<uptr>(thread);
   ENSURE_LSAN_INITED;
   EnsureMainThreadIDIsCorrect();
   OnCreatedArgs args;
   args.stack_begin = reinterpret_cast<uptr>(stack_base);
   args.stack_end = args.stack_begin + stack_size;
   u32 parent_tid = GetCurrentThread();
-  u32 tid = ThreadCreate(parent_tid, user_id, detached, &args);
+  u32 tid = ThreadCreate(parent_tid, detached, &args);
   return reinterpret_cast<void *>(static_cast<uptr>(tid));
 }
 
diff --git a/libsanitizer/lsan/lsan_interceptors.cpp b/libsanitizer/lsan/lsan_interceptors.cpp
index 22999d567f6..ee723f210c9 100644
--- a/libsanitizer/lsan/lsan_interceptors.cpp
+++ b/libsanitizer/lsan/lsan_interceptors.cpp
@@ -468,8 +468,7 @@ INTERCEPTOR(int, pthread_create, void *th, void *attr,
     res = REAL(pthread_create)(th, attr, __lsan_thread_start_func, &p);
   }
   if (res == 0) {
-    int tid = ThreadCreate(GetCurrentThread(), *(uptr *)th,
-                           IsStateDetached(detached));
+    int tid = ThreadCreate(GetCurrentThread(), IsStateDetached(detached));
     CHECK_NE(tid, kMainTid);
     atomic_store(&p.tid, tid, memory_order_release);
     while (atomic_load(&p.tid, memory_order_acquire) != 0)
@@ -480,24 +479,6 @@ INTERCEPTOR(int, pthread_create, void *th, void *attr,
   return res;
 }
 
-INTERCEPTOR(int, pthread_join, void *th, void **ret) {
-  ENSURE_LSAN_INITED;
-  int tid = ThreadTid((uptr)th);
-  int res = REAL(pthread_join)(th, ret);
-  if (res == 0)
-    ThreadJoin(tid);
-  return res;
-}
-
-INTERCEPTOR(int, pthread_detach, void *th) {
-  ENSURE_LSAN_INITED;
-  int tid = ThreadTid((uptr)th);
-  int res = REAL(pthread_detach)(th);
-  if (res == 0)
-    ThreadDetach(tid);
-  return res;
-}
-
 INTERCEPTOR(void, _exit, int status) {
   if (status == 0 && HasReportedLeaks()) status = common_flags()->exitcode;
   REAL(_exit)(status);
@@ -530,8 +511,6 @@ void InitializeInterceptors() {
   LSAN_MAYBE_INTERCEPT_MALLINFO;
   LSAN_MAYBE_INTERCEPT_MALLOPT;
   INTERCEPT_FUNCTION(pthread_create);
-  INTERCEPT_FUNCTION(pthread_detach);
-  INTERCEPT_FUNCTION(pthread_join);
   INTERCEPT_FUNCTION(_exit);
 
   LSAN_MAYBE_INTERCEPT__LWP_EXIT;
diff --git a/libsanitizer/lsan/lsan_mac.cpp b/libsanitizer/lsan/lsan_mac.cpp
index b96893e2801..10a73f8fa93 100644
--- a/libsanitizer/lsan/lsan_mac.cpp
+++ b/libsanitizer/lsan/lsan_mac.cpp
@@ -68,7 +68,7 @@ typedef struct {
 ALWAYS_INLINE
 void lsan_register_worker_thread(int parent_tid) {
   if (GetCurrentThread() == kInvalidTid) {
-    u32 tid = ThreadCreate(parent_tid, 0, true);
+    u32 tid = ThreadCreate(parent_tid, true);
     ThreadStart(tid, GetTid());
     SetCurrentThread(tid);
   }
diff --git a/libsanitizer/lsan/lsan_posix.cpp b/libsanitizer/lsan/lsan_posix.cpp
index 5d1c3f6260d..77118a29f2e 100644
--- a/libsanitizer/lsan/lsan_posix.cpp
+++ b/libsanitizer/lsan/lsan_posix.cpp
@@ -75,7 +75,7 @@ bool GetThreadRangesLocked(tid_t os_id, uptr *stack_begin, uptr *stack_end,
 }
 
 void InitializeMainThread() {
-  u32 tid = ThreadCreate(kMainTid, 0, true);
+  u32 tid = ThreadCreate(kMainTid, true);
   CHECK_EQ(tid, kMainTid);
   ThreadStart(tid, GetTid());
 }
diff --git a/libsanitizer/lsan/lsan_thread.cpp b/libsanitizer/lsan/lsan_thread.cpp
index 1d224ebca69..ca3dfd03b10 100644
--- a/libsanitizer/lsan/lsan_thread.cpp
+++ b/libsanitizer/lsan/lsan_thread.cpp
@@ -44,8 +44,8 @@ void ThreadContextLsanBase::OnFinished() {
   DTLS_Destroy();
 }
 
-u32 ThreadCreate(u32 parent_tid, uptr user_id, bool detached, void *arg) {
-  return thread_registry->CreateThread(user_id, detached, parent_tid, arg);
+u32 ThreadCreate(u32 parent_tid, bool detached, void *arg) {
+  return thread_registry->CreateThread(0, detached, parent_tid, arg);
 }
 
 void ThreadContextLsanBase::ThreadStart(u32 tid, tid_t os_id,
@@ -68,28 +68,6 @@ ThreadContext *CurrentThreadContext() {
   return (ThreadContext *)thread_registry->GetThreadLocked(GetCurrentThread());
 }
 
-static bool FindThreadByUid(ThreadContextBase *tctx, void *arg) {
-  uptr uid = (uptr)arg;
-  if (tctx->user_id == uid && tctx->status != ThreadStatusInvalid) {
-    return true;
-  }
-  return false;
-}
-
-u32 ThreadTid(uptr uid) {
-  return thread_registry->FindThread(FindThreadByUid, (void *)uid);
-}
-
-void ThreadDetach(u32 tid) {
-  CHECK_NE(tid, kInvalidTid);
-  thread_registry->DetachThread(tid, /* arg */ nullptr);
-}
-
-void ThreadJoin(u32 tid) {
-  CHECK_NE(tid, kInvalidTid);
-  thread_registry->JoinThread(tid, /* arg */ nullptr);
-}
-
 void EnsureMainThreadIDIsCorrect() {
   if (GetCurrentThread() == kMainTid)
     CurrentThreadContext()->os_id = GetTid();
diff --git a/libsanitizer/lsan/lsan_thread.h b/libsanitizer/lsan/lsan_thread.h
index 36643753d01..6ab4172092a 100644
--- a/libsanitizer/lsan/lsan_thread.h
+++ b/libsanitizer/lsan/lsan_thread.h
@@ -45,11 +45,8 @@ class ThreadContext;
 void InitializeThreadRegistry();
 void InitializeMainThread();
 
-u32 ThreadCreate(u32 tid, uptr uid, bool detached, void *arg = nullptr);
+u32 ThreadCreate(u32 tid, bool detached, void *arg = nullptr);
 void ThreadFinish();
-void ThreadDetach(u32 tid);
-void ThreadJoin(u32 tid);
-u32 ThreadTid(uptr uid);
 
 u32 GetCurrentThread();
 void SetCurrentThread(u32 tid);
diff --git a/libsanitizer/sanitizer_common/sanitizer_chained_origin_depot.cpp b/libsanitizer/sanitizer_common/sanitizer_chained_origin_depot.cpp
index 626777d6943..472b83d63a0 100644
--- a/libsanitizer/sanitizer_common/sanitizer_chained_origin_depot.cpp
+++ b/libsanitizer/sanitizer_common/sanitizer_chained_origin_depot.cpp
@@ -11,7 +11,6 @@
 
 #include "sanitizer_chained_origin_depot.h"
 
-#include "sanitizer_persistent_allocator.h"
 #include "sanitizer_stackdepotbase.h"
 
 namespace __sanitizer {
diff --git a/libsanitizer/sanitizer_common/sanitizer_dense_map.h b/libsanitizer/sanitizer_common/sanitizer_dense_map.h
new file mode 100644
index 00000000000..3fa6af76ce2
--- /dev/null
+++ b/libsanitizer/sanitizer_common/sanitizer_dense_map.h
@@ -0,0 +1,678 @@
+//===- sanitizer_dense_map.h - Dense probed hash table ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is fork of llvm/ADT/DenseMap.h class with the following changes:
+//  * Use mmap to allocate.
+//  * No iterators.
+//  * Does not shrink.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SANITIZER_DENSE_MAP_H
+#define SANITIZER_DENSE_MAP_H
+
+#include "sanitizer_common.h"
+#include "sanitizer_dense_map_info.h"
+#include "sanitizer_internal_defs.h"
+#include "sanitizer_type_traits.h"
+
+namespace __sanitizer {
+
+template <typename DerivedT, typename KeyT, typename ValueT, typename KeyInfoT,
+          typename BucketT>
+class DenseMapBase {
+ public:
+  using size_type = unsigned;
+  using key_type = KeyT;
+  using mapped_type = ValueT;
+  using value_type = BucketT;
+
+  WARN_UNUSED_RESULT bool empty() const { return getNumEntries() == 0; }
+  unsigned size() const { return getNumEntries(); }
+
+  /// Grow the densemap so that it can contain at least \p NumEntries items
+  /// before resizing again.
+  void reserve(size_type NumEntries) {
+    auto NumBuckets = getMinBucketToReserveForEntries(NumEntries);
+    if (NumBuckets > getNumBuckets())
+      grow(NumBuckets);
+  }
+
+  void clear() {
+    if (getNumEntries() == 0 && getNumTombstones() == 0)
+      return;
+
+    const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
+    if (__sanitizer::is_trivially_destructible<ValueT>::value) {
+      // Use a simpler loop when values don't need destruction.
+      for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P)
+        P->getFirst() = EmptyKey;
+    } else {
+      unsigned NumEntries = getNumEntries();
+      for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) {
+        if (!KeyInfoT::isEqual(P->getFirst(), EmptyKey)) {
+          if (!KeyInfoT::isEqual(P->getFirst(), TombstoneKey)) {
+            P->getSecond().~ValueT();
+            --NumEntries;
+          }
+          P->getFirst() = EmptyKey;
+        }
+      }
+      CHECK_EQ(NumEntries, 0);
+    }
+    setNumEntries(0);
+    setNumTombstones(0);
+  }
+
+  /// Return 1 if the specified key is in the map, 0 otherwise.
+  size_type count(const KeyT &Key) const {
+    const BucketT *TheBucket;
+    return LookupBucketFor(Key, TheBucket) ? 1 : 0;
+  }
+
+  value_type *find(const KeyT &Key) {
+    BucketT *TheBucket;
+    if (LookupBucketFor(Key, TheBucket))
+      return TheBucket;
+    return nullptr;
+  }
+  const value_type *find(const KeyT &Key) const {
+    const BucketT *TheBucket;
+    if (LookupBucketFor(Key, TheBucket))
+      return TheBucket;
+    return nullptr;
+  }
+
+  /// Alternate version of find() which allows a different, and possibly
+  /// less expensive, key type.
+  /// The DenseMapInfo is responsible for supplying methods
+  /// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key
+  /// type used.
+  template <class LookupKeyT>
+  value_type *find_as(const LookupKeyT &Key) {
+    BucketT *TheBucket;
+    if (LookupBucketFor(Key, TheBucket))
+      return TheBucket;
+    return nullptr;
+  }
+  template <class LookupKeyT>
+  const value_type *find_as(const LookupKeyT &Key) const {
+    const BucketT *TheBucket;
+    if (LookupBucketFor(Key, TheBucket))
+      return TheBucket;
+    return nullptr;
+  }
+
+  /// lookup - Return the entry for the specified key, or a default
+  /// constructed value if no such entry exists.
+  ValueT lookup(const KeyT &Key) const {
+    const BucketT *TheBucket;
+    if (LookupBucketFor(Key, TheBucket))
+      return TheBucket->getSecond();
+    return ValueT();
+  }
+
+  // Inserts key,value pair into the map if the key isn't already in the map.
+  // If the key is already in the map, it returns false and doesn't update the
+  // value.
+  detail::DenseMapPair<value_type *, bool> insert(const value_type &KV) {
+    return try_emplace(KV.first, KV.second);
+  }
+
+  // Inserts key,value pair into the map if the key isn't already in the map.
+  // If the key is already in the map, it returns false and doesn't update the
+  // value.
+  detail::DenseMapPair<value_type *, bool> insert(value_type &&KV) {
+    return try_emplace(__sanitizer::move(KV.first),
+                       __sanitizer::move(KV.second));
+  }
+
+  // Inserts key,value pair into the map if the key isn't already in the map.
+  // The value is constructed in-place if the key is not in the map, otherwise
+  // it is not moved.
+  template <typename... Ts>
+  detail::DenseMapPair<value_type *, bool> try_emplace(KeyT &&Key,
+                                                       Ts &&...Args) {
+    BucketT *TheBucket;
+    if (LookupBucketFor(Key, TheBucket))
+      return {TheBucket, false};  // Already in map.
+
+    // Otherwise, insert the new element.
+    TheBucket = InsertIntoBucket(TheBucket, __sanitizer::move(Key),
+                                 __sanitizer::forward<Ts>(Args)...);
+    return {TheBucket, true};
+  }
+
+  // Inserts key,value pair into the map if the key isn't already in the map.
+  // The value is constructed in-place if the key is not in the map, otherwise
+  // it is not moved.
+  template <typename... Ts>
+  detail::DenseMapPair<value_type *, bool> try_emplace(const KeyT &Key,
+                                                       Ts &&...Args) {
+    BucketT *TheBucket;
+    if (LookupBucketFor(Key, TheBucket))
+      return {TheBucket, false};  // Already in map.
+
+    // Otherwise, insert the new element.
+    TheBucket =
+        InsertIntoBucket(TheBucket, Key, __sanitizer::forward<Ts>(Args)...);
+    return {TheBucket, true};
+  }
+
+  /// Alternate version of insert() which allows a different, and possibly
+  /// less expensive, key type.
+  /// The DenseMapInfo is responsible for supplying methods
+  /// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key
+  /// type used.
+  template <typename LookupKeyT>
+  detail::DenseMapPair<value_type *, bool> insert_as(value_type &&KV,
+                                                     const LookupKeyT &Val) {
+    BucketT *TheBucket;
+    if (LookupBucketFor(Val, TheBucket))
+      return {TheBucket, false};  // Already in map.
+
+    // Otherwise, insert the new element.
+    TheBucket =
+        InsertIntoBucketWithLookup(TheBucket, __sanitizer::move(KV.first),
+                                   __sanitizer::move(KV.second), Val);
+    return {TheBucket, true};
+  }
+
+  bool erase(const KeyT &Val) {
+    BucketT *TheBucket;
+    if (!LookupBucketFor(Val, TheBucket))
+      return false;  // not in map.
+
+    TheBucket->getSecond().~ValueT();
+    TheBucket->getFirst() = getTombstoneKey();
+    decrementNumEntries();
+    incrementNumTombstones();
+    return true;
+  }
+
+  void erase(value_type *I) {
+    CHECK_NE(I, nullptr);
+    BucketT *TheBucket = &*I;
+    TheBucket->getSecond().~ValueT();
+    TheBucket->getFirst() = getTombstoneKey();
+    decrementNumEntries();
+    incrementNumTombstones();
+  }
+
+  value_type &FindAndConstruct(const KeyT &Key) {
+    BucketT *TheBucket;
+    if (LookupBucketFor(Key, TheBucket))
+      return *TheBucket;
+
+    return *InsertIntoBucket(TheBucket, Key);
+  }
+
+  ValueT &operator[](const KeyT &Key) { return FindAndConstruct(Key).second; }
+
+  value_type &FindAndConstruct(KeyT &&Key) {
+    BucketT *TheBucket;
+    if (LookupBucketFor(Key, TheBucket))
+      return *TheBucket;
+
+    return *InsertIntoBucket(TheBucket, __sanitizer::move(Key));
+  }
+
+  ValueT &operator[](KeyT &&Key) {
+    return FindAndConstruct(__sanitizer::move(Key)).second;
+  }
+
+  /// Equality comparison for DenseMap.
+  ///
+  /// Iterates over elements of LHS confirming that each (key, value) pair in
+  /// LHS is also in RHS, and that no additional pairs are in RHS. Equivalent to
+  /// N calls to RHS.find and N value comparisons. Amortized complexity is
+  /// linear, worst case is O(N^2) (if every hash collides).
+  bool operator==(const DenseMapBase &RHS) const {
+    if (size() != RHS.size())
+      return false;
+
+    const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
+    for (auto *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) {
+      const KeyT K = P->getFirst();
+      if (!KeyInfoT::isEqual(K, EmptyKey) &&
+          !KeyInfoT::isEqual(K, TombstoneKey)) {
+        const auto *I = RHS.find(K);
+        if (!I || P->getSecond() != I->getSecond())
+          return false;
+      }
+    }
+
+    return true;
+  }
+
+ protected:
+  DenseMapBase() = default;
+
+  void destroyAll() {
+    if (getNumBuckets() == 0)  // Nothing to do.
+      return;
+
+    const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
+    for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) {
+      if (!KeyInfoT::isEqual(P->getFirst(), EmptyKey) &&
+          !KeyInfoT::isEqual(P->getFirst(), TombstoneKey))
+        P->getSecond().~ValueT();
+      P->getFirst().~KeyT();
+    }
+  }
+
+  void initEmpty() {
+    setNumEntries(0);
+    setNumTombstones(0);
+
+    CHECK_EQ((getNumBuckets() & (getNumBuckets() - 1)), 0);
+    const KeyT EmptyKey = getEmptyKey();
+    for (BucketT *B = getBuckets(), *E = getBucketsEnd(); B != E; ++B)
+      ::new (&B->getFirst()) KeyT(EmptyKey);
+  }
+
+  /// Returns the number of buckets to allocate to ensure that the DenseMap can
+  /// accommodate \p NumEntries without need to grow().
+  unsigned getMinBucketToReserveForEntries(unsigned NumEntries) {
+    // Ensure that "NumEntries * 4 < NumBuckets * 3"
+    if (NumEntries == 0)
+      return 0;
+    // +1 is required because of the strict equality.
+    // For example if NumEntries is 48, we need to return 401.
+    return RoundUpToPowerOfTwo((NumEntries * 4 / 3 + 1) + /* NextPowerOf2 */ 1);
+  }
+
+  void moveFromOldBuckets(BucketT *OldBucketsBegin, BucketT *OldBucketsEnd) {
+    initEmpty();
+
+    // Insert all the old elements.
+    const KeyT EmptyKey = getEmptyKey();
+    const KeyT TombstoneKey = getTombstoneKey();
+    for (BucketT *B = OldBucketsBegin, *E = OldBucketsEnd; B != E; ++B) {
+      if (!KeyInfoT::isEqual(B->getFirst(), EmptyKey) &&
+          !KeyInfoT::isEqual(B->getFirst(), TombstoneKey)) {
+        // Insert the key/value into the new table.
+        BucketT *DestBucket;
+        bool FoundVal = LookupBucketFor(B->getFirst(), DestBucket);
+        (void)FoundVal;  // silence warning.
+        CHECK(!FoundVal);
+        DestBucket->getFirst() = __sanitizer::move(B->getFirst());
+        ::new (&DestBucket->getSecond())
+            ValueT(__sanitizer::move(B->getSecond()));
+        incrementNumEntries();
+
+        // Free the value.
+        B->getSecond().~ValueT();
+      }
+      B->getFirst().~KeyT();
+    }
+  }
+
+  template <typename OtherBaseT>
+  void copyFrom(
+      const DenseMapBase<OtherBaseT, KeyT, ValueT, KeyInfoT, BucketT> &other) {
+    CHECK_NE(&other, this);
+    CHECK_EQ(getNumBuckets(), other.getNumBuckets());
+
+    setNumEntries(other.getNumEntries());
+    setNumTombstones(other.getNumTombstones());
+
+    if (__sanitizer::is_trivially_copyable<KeyT>::value &&
+        __sanitizer::is_trivially_copyable<ValueT>::value)
+      internal_memcpy(reinterpret_cast<void *>(getBuckets()),
+                      other.getBuckets(), getNumBuckets() * sizeof(BucketT));
+    else
+      for (uptr i = 0; i < getNumBuckets(); ++i) {
+        ::new (&getBuckets()[i].getFirst())
+            KeyT(other.getBuckets()[i].getFirst());
+        if (!KeyInfoT::isEqual(getBuckets()[i].getFirst(), getEmptyKey()) &&
+            !KeyInfoT::isEqual(getBuckets()[i].getFirst(), getTombstoneKey()))
+          ::new (&getBuckets()[i].getSecond())
+              ValueT(other.getBuckets()[i].getSecond());
+      }
+  }
+
+  static unsigned getHashValue(const KeyT &Val) {
+    return KeyInfoT::getHashValue(Val);
+  }
+
+  template <typename LookupKeyT>
+  static unsigned getHashValue(const LookupKeyT &Val) {
+    return KeyInfoT::getHashValue(Val);
+  }
+
+  static const KeyT getEmptyKey() { return KeyInfoT::getEmptyKey(); }
+
+  static const KeyT getTombstoneKey() { return KeyInfoT::getTombstoneKey(); }
+
+ private:
+  unsigned getNumEntries() const {
+    return static_cast<const DerivedT *>(this)->getNumEntries();
+  }
+
+  void setNumEntries(unsigned Num) {
+    static_cast<DerivedT *>(this)->setNumEntries(Num);
+  }
+
+  void incrementNumEntries() { setNumEntries(getNumEntries() + 1); }
+
+  void decrementNumEntries() { setNumEntries(getNumEntries() - 1); }
+
+  unsigned getNumTombstones() const {
+    return static_cast<const DerivedT *>(this)->getNumTombstones();
+  }
+
+  void setNumTombstones(unsigned Num) {
+    static_cast<DerivedT *>(this)->setNumTombstones(Num);
+  }
+
+  void incrementNumTombstones() { setNumTombstones(getNumTombstones() + 1); }
+
+  void decrementNumTombstones() { setNumTombstones(getNumTombstones() - 1); }
+
+  const BucketT *getBuckets() const {
+    return static_cast<const DerivedT *>(this)->getBuckets();
+  }
+
+  BucketT *getBuckets() { return static_cast<DerivedT *>(this)->getBuckets(); }
+
+  unsigned getNumBuckets() const {
+    return static_cast<const DerivedT *>(this)->getNumBuckets();
+  }
+
+  BucketT *getBucketsEnd() { return getBuckets() + getNumBuckets(); }
+
+  const BucketT *getBucketsEnd() const {
+    return getBuckets() + getNumBuckets();
+  }
+
+  void grow(unsigned AtLeast) { static_cast<DerivedT *>(this)->grow(AtLeast); }
+
+  template <typename KeyArg, typename... ValueArgs>
+  BucketT *InsertIntoBucket(BucketT *TheBucket, KeyArg &&Key,
+                            ValueArgs &&...Values) {
+    TheBucket = InsertIntoBucketImpl(Key, Key, TheBucket);
+
+    TheBucket->getFirst() = __sanitizer::forward<KeyArg>(Key);
+    ::new (&TheBucket->getSecond())
+        ValueT(__sanitizer::forward<ValueArgs>(Values)...);
+    return TheBucket;
+  }
+
+  template <typename LookupKeyT>
+  BucketT *InsertIntoBucketWithLookup(BucketT *TheBucket, KeyT &&Key,
+                                      ValueT &&Value, LookupKeyT &Lookup) {
+    TheBucket = InsertIntoBucketImpl(Key, Lookup, TheBucket);
+
+    TheBucket->getFirst() = __sanitizer::move(Key);
+    ::new (&TheBucket->getSecond()) ValueT(__sanitizer::move(Value));
+    return TheBucket;
+  }
+
+  template <typename LookupKeyT>
+  BucketT *InsertIntoBucketImpl(const KeyT &Key, const LookupKeyT &Lookup,
+                                BucketT *TheBucket) {
+    // If the load of the hash table is more than 3/4, or if fewer than 1/8 of
+    // the buckets are empty (meaning that many are filled with tombstones),
+    // grow the table.
+    //
+    // The later case is tricky.  For example, if we had one empty bucket with
+    // tons of tombstones, failing lookups (e.g. for insertion) would have to
+    // probe almost the entire table until it found the empty bucket.  If the
+    // table completely filled with tombstones, no lookup would ever succeed,
+    // causing infinite loops in lookup.
+    unsigned NewNumEntries = getNumEntries() + 1;
+    unsigned NumBuckets = getNumBuckets();
+    if (UNLIKELY(NewNumEntries * 4 >= NumBuckets * 3)) {
+      this->grow(NumBuckets * 2);
+      LookupBucketFor(Lookup, TheBucket);
+      NumBuckets = getNumBuckets();
+    } else if (UNLIKELY(NumBuckets - (NewNumEntries + getNumTombstones()) <=
+                        NumBuckets / 8)) {
+      this->grow(NumBuckets);
+      LookupBucketFor(Lookup, TheBucket);
+    }
+    CHECK(TheBucket);
+
+    // Only update the state after we've grown our bucket space appropriately
+    // so that when growing buckets we have self-consistent entry count.
+    incrementNumEntries();
+
+    // If we are writing over a tombstone, remember this.
+    const KeyT EmptyKey = getEmptyKey();
+    if (!KeyInfoT::isEqual(TheBucket->getFirst(), EmptyKey))
+      decrementNumTombstones();
+
+    return TheBucket;
+  }
+
+  /// LookupBucketFor - Lookup the appropriate bucket for Val, returning it in
+  /// FoundBucket.  If the bucket contains the key and a value, this returns
+  /// true, otherwise it returns a bucket with an empty marker or tombstone and
+  /// returns false.
+  template <typename LookupKeyT>
+  bool LookupBucketFor(const LookupKeyT &Val,
+                       const BucketT *&FoundBucket) const {
+    const BucketT *BucketsPtr = getBuckets();
+    const unsigned NumBuckets = getNumBuckets();
+
+    if (NumBuckets == 0) {
+      FoundBucket = nullptr;
+      return false;
+    }
+
+    // FoundTombstone - Keep track of whether we find a tombstone while probing.
+    const BucketT *FoundTombstone = nullptr;
+    const KeyT EmptyKey = getEmptyKey();
+    const KeyT TombstoneKey = getTombstoneKey();
+    CHECK(!KeyInfoT::isEqual(Val, EmptyKey));
+    CHECK(!KeyInfoT::isEqual(Val, TombstoneKey));
+
+    unsigned BucketNo = getHashValue(Val) & (NumBuckets - 1);
+    unsigned ProbeAmt = 1;
+    while (true) {
+      const BucketT *ThisBucket = BucketsPtr + BucketNo;
+      // Found Val's bucket?  If so, return it.
+      if (LIKELY(KeyInfoT::isEqual(Val, ThisBucket->getFirst()))) {
+        FoundBucket = ThisBucket;
+        return true;
+      }
+
+      // If we found an empty bucket, the key doesn't exist in the set.
+      // Insert it and return the default value.
+      if (LIKELY(KeyInfoT::isEqual(ThisBucket->getFirst(), EmptyKey))) {
+        // If we've already seen a tombstone while probing, fill it in instead
+        // of the empty bucket we eventually probed to.
+        FoundBucket = FoundTombstone ? FoundTombstone : ThisBucket;
+        return false;
+      }
+
+      // If this is a tombstone, remember it.  If Val ends up not in the map, we
+      // prefer to return it than something that would require more probing.
+      if (KeyInfoT::isEqual(ThisBucket->getFirst(), TombstoneKey) &&
+          !FoundTombstone)
+        FoundTombstone = ThisBucket;  // Remember the first tombstone found.
+
+      // Otherwise, it's a hash collision or a tombstone, continue quadratic
+      // probing.
+      BucketNo += ProbeAmt++;
+      BucketNo &= (NumBuckets - 1);
+    }
+  }
+
+  template <typename LookupKeyT>
+  bool LookupBucketFor(const LookupKeyT &Val, BucketT *&FoundBucket) {
+    const BucketT *ConstFoundBucket;
+    bool Result = const_cast<const DenseMapBase *>(this)->LookupBucketFor(
+        Val, ConstFoundBucket);
+    FoundBucket = const_cast<BucketT *>(ConstFoundBucket);
+    return Result;
+  }
+
+ public:
+  /// Return the approximate size (in bytes) of the actual map.
+  /// This is just the raw memory used by DenseMap.
+  /// If entries are pointers to objects, the size of the referenced objects
+  /// are not included.
+  uptr getMemorySize() const {
+    return RoundUpTo(getNumBuckets() * sizeof(BucketT), GetPageSizeCached());
+  }
+};
+
+/// Inequality comparison for DenseMap.
+///
+/// Equivalent to !(LHS == RHS). See operator== for performance notes.
+template <typename DerivedT, typename KeyT, typename ValueT, typename KeyInfoT,
+          typename BucketT>
+bool operator!=(
+    const DenseMapBase<DerivedT, KeyT, ValueT, KeyInfoT, BucketT> &LHS,
+    const DenseMapBase<DerivedT, KeyT, ValueT, KeyInfoT, BucketT> &RHS) {
+  return !(LHS == RHS);
+}
+
+template <typename KeyT, typename ValueT,
+          typename KeyInfoT = DenseMapInfo<KeyT>,
+          typename BucketT = detail::DenseMapPair<KeyT, ValueT>>
+class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>,
+                                     KeyT, ValueT, KeyInfoT, BucketT> {
+  friend class DenseMapBase<DenseMap, KeyT, ValueT, KeyInfoT, BucketT>;
+
+  // Lift some types from the dependent base class into this class for
+  // simplicity of referring to them.
+  using BaseT = DenseMapBase<DenseMap, KeyT, ValueT, KeyInfoT, BucketT>;
+
+  BucketT *Buckets = nullptr;
+  unsigned NumEntries = 0;
+  unsigned NumTombstones = 0;
+  unsigned NumBuckets = 0;
+
+ public:
+  /// Create a DenseMap with an optional \p InitialReserve that guarantee that
+  /// this number of elements can be inserted in the map without grow()
+  explicit DenseMap(unsigned InitialReserve) { init(InitialReserve); }
+  constexpr DenseMap() = default;
+
+  DenseMap(const DenseMap &other) : BaseT() {
+    init(0);
+    copyFrom(other);
+  }
+
+  DenseMap(DenseMap &&other) : BaseT() {
+    init(0);
+    swap(other);
+  }
+
+  ~DenseMap() {
+    this->destroyAll();
+    deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets);
+  }
+
+  void swap(DenseMap &RHS) {
+    Swap(Buckets, RHS.Buckets);
+    Swap(NumEntries, RHS.NumEntries);
+    Swap(NumTombstones, RHS.NumTombstones);
+    Swap(NumBuckets, RHS.NumBuckets);
+  }
+
+  DenseMap &operator=(const DenseMap &other) {
+    if (&other != this)
+      copyFrom(other);
+    return *this;
+  }
+
+  DenseMap &operator=(DenseMap &&other) {
+    this->destroyAll();
+    deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT));
+    init(0);
+    swap(other);
+    return *this;
+  }
+
+  void copyFrom(const DenseMap &other) {
+    this->destroyAll();
+    deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets);
+    if (allocateBuckets(other.NumBuckets)) {
+      this->BaseT::copyFrom(other);
+    } else {
+      NumEntries = 0;
+      NumTombstones = 0;
+    }
+  }
+
+  void init(unsigned InitNumEntries) {
+    auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries);
+    if (allocateBuckets(InitBuckets)) {
+      this->BaseT::initEmpty();
+    } else {
+      NumEntries = 0;
+      NumTombstones = 0;
+    }
+  }
+
+  void grow(unsigned AtLeast) {
+    unsigned OldNumBuckets = NumBuckets;
+    BucketT *OldBuckets = Buckets;
+
+    allocateBuckets(RoundUpToPowerOfTwo(Max<unsigned>(64, AtLeast)));
+    CHECK(Buckets);
+    if (!OldBuckets) {
+      this->BaseT::initEmpty();
+      return;
+    }
+
+    this->moveFromOldBuckets(OldBuckets, OldBuckets + OldNumBuckets);
+
+    // Free the old table.
+    deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets);
+  }
+
+ private:
+  unsigned getNumEntries() const { return NumEntries; }
+
+  void setNumEntries(unsigned Num) { NumEntries = Num; }
+
+  unsigned getNumTombstones() const { return NumTombstones; }
+
+  void setNumTombstones(unsigned Num) { NumTombstones = Num; }
+
+  BucketT *getBuckets() const { return Buckets; }
+
+  unsigned getNumBuckets() const { return NumBuckets; }
+
+  bool allocateBuckets(unsigned Num) {
+    NumBuckets = Num;
+    if (NumBuckets == 0) {
+      Buckets = nullptr;
+      return false;
+    }
+
+    uptr Size = sizeof(BucketT) * NumBuckets;
+    if (Size * 2 <= GetPageSizeCached()) {
+      // We always allocate at least a page, so use entire space.
+      unsigned Log2 = MostSignificantSetBitIndex(GetPageSizeCached() / Size);
+      Size <<= Log2;
+      NumBuckets <<= Log2;
+      CHECK_EQ(Size, sizeof(BucketT) * NumBuckets);
+      CHECK_GT(Size * 2, GetPageSizeCached());
+    }
+    Buckets = static_cast<BucketT *>(allocate_buffer(Size));
+    return true;
+  }
+
+  static void *allocate_buffer(uptr Size) {
+    return MmapOrDie(RoundUpTo(Size, GetPageSizeCached()), "DenseMap");
+  }
+
+  static void deallocate_buffer(void *Ptr, uptr Size) {
+    UnmapOrDie(Ptr, RoundUpTo(Size, GetPageSizeCached()));
+  }
+};
+
+}  // namespace __sanitizer
+
+#endif  // SANITIZER_DENSE_MAP_H
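
Since the new container has no iterators and never shrinks, typical use inside the runtime is keyed lookup and in-place update. A usage sketch against the interface defined above (the key/value types and the function name are illustrative only; assumes sanitizer_dense_map.h is included):

  void DenseMapUsageSketch() {
    __sanitizer::DenseMap<uptr, u32> counts;
    counts[0x1000] = 1;                  // FindAndConstruct via operator[]
    if (auto *kv = counts.find(0x1000))  // find() returns a bucket pointer or nullptr
      kv->second++;
    counts.erase(0x1000);                // leaves a tombstone; the table never shrinks
  }
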
diff --git a/libsanitizer/sanitizer_common/sanitizer_dense_map_info.h b/libsanitizer/sanitizer_common/sanitizer_dense_map_info.h
new file mode 100644
index 00000000000..85c6427906c
--- /dev/null
+++ b/libsanitizer/sanitizer_common/sanitizer_dense_map_info.h
@@ -0,0 +1,260 @@
+//===- sanitizer_dense_map_info.h - Type traits for DenseMap ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SANITIZER_DENSE_MAP_INFO_H
+#define SANITIZER_DENSE_MAP_INFO_H
+
+#include "sanitizer_common.h"
+#include "sanitizer_internal_defs.h"
+#include "sanitizer_type_traits.h"
+
+namespace __sanitizer {
+
+namespace detail {
+
+/// Simplistic combination of 32-bit hash values into 32-bit hash values.
+static inline unsigned combineHashValue(unsigned a, unsigned b) {
+  u64 key = (u64)a << 32 | (u64)b;
+  key += ~(key << 32);
+  key ^= (key >> 22);
+  key += ~(key << 13);
+  key ^= (key >> 8);
+  key += (key << 3);
+  key ^= (key >> 15);
+  key += ~(key << 27);
+  key ^= (key >> 31);
+  return (unsigned)key;
+}
+
+// We extend a pair to allow users to override the bucket type with their own
+// implementation without requiring two members.
+template <typename KeyT, typename ValueT>
+struct DenseMapPair {
+  KeyT first = {};
+  ValueT second = {};
+  DenseMapPair() = default;
+  DenseMapPair(const KeyT &f, const ValueT &s) : first(f), second(s) {}
+
+  template <typename KeyT2, typename ValueT2>
+  DenseMapPair(KeyT2 &&f, ValueT2 &&s)
+      : first(__sanitizer::forward<KeyT2>(f)),
+        second(__sanitizer::forward<ValueT2>(s)) {}
+
+  DenseMapPair(const DenseMapPair &other) = default;
+  DenseMapPair &operator=(const DenseMapPair &other) = default;
+  DenseMapPair(DenseMapPair &&other) = default;
+  DenseMapPair &operator=(DenseMapPair &&other) = default;
+
+  KeyT &getFirst() { return first; }
+  const KeyT &getFirst() const { return first; }
+  ValueT &getSecond() { return second; }
+  const ValueT &getSecond() const { return second; }
+};
+
+}  // end namespace detail
+
+template <typename T>
+struct DenseMapInfo {
+  // static inline T getEmptyKey();
+  // static inline T getTombstoneKey();
+  // static unsigned getHashValue(const T &Val);
+  // static bool isEqual(const T &LHS, const T &RHS);
+};
+
+// Provide DenseMapInfo for all pointers. Come up with sentinel pointer values
+// that are aligned to alignof(T) bytes, but try to avoid requiring T to be
+// complete. This allows clients to instantiate DenseMap<T*, ...> with forward
+// declared key types. Assume that no pointer key type requires more than 4096
+// bytes of alignment.
+template <typename T>
+struct DenseMapInfo<T *> {
+  // The following should hold, but it would require T to be complete:
+  // static_assert(alignof(T) <= (1 << Log2MaxAlign),
+  //               "DenseMap does not support pointer keys requiring more than "
+  //               "Log2MaxAlign bits of alignment");
+  static constexpr uptr Log2MaxAlign = 12;
+
+  static inline T *getEmptyKey() {
+    uptr Val = static_cast<uptr>(-1);
+    Val <<= Log2MaxAlign;
+    return reinterpret_cast<T *>(Val);
+  }
+
+  static inline T *getTombstoneKey() {
+    uptr Val = static_cast<uptr>(-2);
+    Val <<= Log2MaxAlign;
+    return reinterpret_cast<T *>(Val);
+  }
+
+  static unsigned getHashValue(const T *PtrVal) {
+    return (unsigned((uptr)PtrVal) >> 4) ^ (unsigned((uptr)PtrVal) >> 9);
+  }
+
+  static bool isEqual(const T *LHS, const T *RHS) { return LHS == RHS; }
+};
+
+// Provide DenseMapInfo for chars.
+template <>
+struct DenseMapInfo<char> {
+  static inline char getEmptyKey() { return ~0; }
+  static inline char getTombstoneKey() { return ~0 - 1; }
+  static unsigned getHashValue(const char &Val) { return Val * 37U; }
+
+  static bool isEqual(const char &LHS, const char &RHS) { return LHS == RHS; }
+};
+
+// Provide DenseMapInfo for unsigned chars.
+template <>
+struct DenseMapInfo<unsigned char> {
+  static inline unsigned char getEmptyKey() { return ~0; }
+  static inline unsigned char getTombstoneKey() { return ~0 - 1; }
+  static unsigned getHashValue(const unsigned char &Val) { return Val * 37U; }
+
+  static bool isEqual(const unsigned char &LHS, const unsigned char &RHS) {
+    return LHS == RHS;
+  }
+};
+
+// Provide DenseMapInfo for unsigned shorts.
+template <>
+struct DenseMapInfo<unsigned short> {
+  static inline unsigned short getEmptyKey() { return 0xFFFF; }
+  static inline unsigned short getTombstoneKey() { return 0xFFFF - 1; }
+  static unsigned getHashValue(const unsigned short &Val) { return Val * 37U; }
+
+  static bool isEqual(const unsigned short &LHS, const unsigned short &RHS) {
+    return LHS == RHS;
+  }
+};
+
+// Provide DenseMapInfo for unsigned ints.
+template <>
+struct DenseMapInfo<unsigned> {
+  static inline unsigned getEmptyKey() { return ~0U; }
+  static inline unsigned getTombstoneKey() { return ~0U - 1; }
+  static unsigned getHashValue(const unsigned &Val) { return Val * 37U; }
+
+  static bool isEqual(const unsigned &LHS, const unsigned &RHS) {
+    return LHS == RHS;
+  }
+};
+
+// Provide DenseMapInfo for unsigned longs.
+template <>
+struct DenseMapInfo<unsigned long> {
+  static inline unsigned long getEmptyKey() { return ~0UL; }
+  static inline unsigned long getTombstoneKey() { return ~0UL - 1L; }
+
+  static unsigned getHashValue(const unsigned long &Val) {
+    return (unsigned)(Val * 37UL);
+  }
+
+  static bool isEqual(const unsigned long &LHS, const unsigned long &RHS) {
+    return LHS == RHS;
+  }
+};
+
+// Provide DenseMapInfo for unsigned long longs.
+template <>
+struct DenseMapInfo<unsigned long long> {
+  static inline unsigned long long getEmptyKey() { return ~0ULL; }
+  static inline unsigned long long getTombstoneKey() { return ~0ULL - 1ULL; }
+
+  static unsigned getHashValue(const unsigned long long &Val) {
+    return (unsigned)(Val * 37ULL);
+  }
+
+  static bool isEqual(const unsigned long long &LHS,
+                      const unsigned long long &RHS) {
+    return LHS == RHS;
+  }
+};
+
+// Provide DenseMapInfo for shorts.
+template <>
+struct DenseMapInfo<short> {
+  static inline short getEmptyKey() { return 0x7FFF; }
+  static inline short getTombstoneKey() { return -0x7FFF - 1; }
+  static unsigned getHashValue(const short &Val) { return Val * 37U; }
+  static bool isEqual(const short &LHS, const short &RHS) { return LHS == RHS; }
+};
+
+// Provide DenseMapInfo for ints.
+template <>
+struct DenseMapInfo<int> {
+  static inline int getEmptyKey() { return 0x7fffffff; }
+  static inline int getTombstoneKey() { return -0x7fffffff - 1; }
+  static unsigned getHashValue(const int &Val) { return (unsigned)(Val * 37U); }
+
+  static bool isEqual(const int &LHS, const int &RHS) { return LHS == RHS; }
+};
+
+// Provide DenseMapInfo for longs.
+template <>
+struct DenseMapInfo<long> {
+  static inline long getEmptyKey() {
+    return (1UL << (sizeof(long) * 8 - 1)) - 1UL;
+  }
+
+  static inline long getTombstoneKey() { return getEmptyKey() - 1L; }
+
+  static unsigned getHashValue(const long &Val) {
+    return (unsigned)(Val * 37UL);
+  }
+
+  static bool isEqual(const long &LHS, const long &RHS) { return LHS == RHS; }
+};
+
+// Provide DenseMapInfo for long longs.
+template <>
+struct DenseMapInfo<long long> {
+  static inline long long getEmptyKey() { return 0x7fffffffffffffffLL; }
+  static inline long long getTombstoneKey() {
+    return -0x7fffffffffffffffLL - 1;
+  }
+
+  static unsigned getHashValue(const long long &Val) {
+    return (unsigned)(Val * 37ULL);
+  }
+
+  static bool isEqual(const long long &LHS, const long long &RHS) {
+    return LHS == RHS;
+  }
+};
+
+// Provide DenseMapInfo for all pairs whose members have info.
+template <typename T, typename U>
+struct DenseMapInfo<detail::DenseMapPair<T, U>> {
+  using Pair = detail::DenseMapPair<T, U>;
+  using FirstInfo = DenseMapInfo<T>;
+  using SecondInfo = DenseMapInfo<U>;
+
+  static inline Pair getEmptyKey() {
+    return detail::DenseMapPair<T, U>(FirstInfo::getEmptyKey(),
+                                      SecondInfo::getEmptyKey());
+  }
+
+  static inline Pair getTombstoneKey() {
+    return detail::DenseMapPair<T, U>(FirstInfo::getTombstoneKey(),
+                                      SecondInfo::getTombstoneKey());
+  }
+
+  static unsigned getHashValue(const Pair &PairVal) {
+    return detail::combineHashValue(FirstInfo::getHashValue(PairVal.first),
+                                    SecondInfo::getHashValue(PairVal.second));
+  }
+
+  static bool isEqual(const Pair &LHS, const Pair &RHS) {
+    return FirstInfo::isEqual(LHS.first, RHS.first) &&
+           SecondInfo::isEqual(LHS.second, RHS.second);
+  }
+};
+
+}  // namespace __sanitizer
+
+#endif  // SANITIZER_DENSE_MAP_INFO_H
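
Key types not covered by the specializations above need their own DenseMapInfo providing the four members listed in the primary template. A sketch for a hypothetical wrapper type (not part of this commit):

  namespace __sanitizer {
  struct ExampleId { u32 raw; };
  template <>
  struct DenseMapInfo<ExampleId> {
    static inline ExampleId getEmptyKey() { return {~0u}; }          // reserved, never a real key
    static inline ExampleId getTombstoneKey() { return {~0u - 1}; }  // second reserved value
    static unsigned getHashValue(const ExampleId &v) { return v.raw * 37U; }
    static bool isEqual(const ExampleId &a, const ExampleId &b) { return a.raw == b.raw; }
  };
  }  // namespace __sanitizer
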
diff --git a/libsanitizer/sanitizer_common/sanitizer_internal_defs.h b/libsanitizer/sanitizer_common/sanitizer_internal_defs.h
index e97cc9ac0df..d0db0129d4a 100644
--- a/libsanitizer/sanitizer_common/sanitizer_internal_defs.h
+++ b/libsanitizer/sanitizer_common/sanitizer_internal_defs.h
@@ -300,7 +300,8 @@ void NORETURN CheckFailed(const char *file, int line, const char *cond,
     }                                          \
   } while (0)
 
-#define RAW_CHECK(expr, ...) RAW_CHECK_MSG(expr, #expr "\n", __VA_ARGS__)
+#define RAW_CHECK(expr) RAW_CHECK_MSG(expr, #expr "\n", )
+#define RAW_CHECK_VA(expr, ...) RAW_CHECK_MSG(expr, #expr "\n", __VA_ARGS__)
 
 #define CHECK_IMPL(c1, op, c2) \
   do { \
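
The hunk above splits the old variadic RAW_CHECK into a plain form and an explicitly variadic RAW_CHECK_VA. A hedged usage sketch, assuming RAW_CHECK_MSG treats the trailing arguments as additional message strings (its body is not shown in this hunk):

  RAW_CHECK(ptr != nullptr);
  RAW_CHECK_VA(ptr != nullptr, "while mapping the shadow region\n");
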
diff --git a/libsanitizer/sanitizer_common/sanitizer_linux.cpp b/libsanitizer/sanitizer_common/sanitizer_linux.cpp
index aa59d9718ca..596037d7722 100644
--- a/libsanitizer/sanitizer_common/sanitizer_linux.cpp
+++ b/libsanitizer/sanitizer_common/sanitizer_linux.cpp
@@ -1380,7 +1380,7 @@ uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
 #elif defined(__aarch64__)
 uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
                     int *parent_tidptr, void *newtls, int *child_tidptr) {
-  long long res;
+  register long long res __asm__("x0");
   if (!fn || !child_stack)
     return -EINVAL;
   CHECK_EQ(0, (uptr)child_stack % 16);
diff --git a/libsanitizer/sanitizer_common/sanitizer_linux_libcdep.cpp b/libsanitizer/sanitizer_common/sanitizer_linux_libcdep.cpp
index fc5619e4b37..7ce9e25da34 100644
--- a/libsanitizer/sanitizer_common/sanitizer_linux_libcdep.cpp
+++ b/libsanitizer/sanitizer_common/sanitizer_linux_libcdep.cpp
@@ -759,13 +759,9 @@ u32 GetNumberOfCPUs() {
 #elif SANITIZER_SOLARIS
   return sysconf(_SC_NPROCESSORS_ONLN);
 #else
-#if defined(CPU_COUNT)
   cpu_set_t CPUs;
   CHECK_EQ(sched_getaffinity(0, sizeof(cpu_set_t), &CPUs), 0);
   return CPU_COUNT(&CPUs);
-#else
-  return 1;
-#endif
 #endif
 }
 
diff --git a/libsanitizer/sanitizer_common/sanitizer_linux_s390.cpp b/libsanitizer/sanitizer_common/sanitizer_linux_s390.cpp
index bb2f5b5f9f7..74db831b0aa 100644
--- a/libsanitizer/sanitizer_common/sanitizer_linux_s390.cpp
+++ b/libsanitizer/sanitizer_common/sanitizer_linux_s390.cpp
@@ -57,8 +57,10 @@ uptr internal_mmap(void *addr, uptr length, int prot, int flags, int fd,
 
 uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
                     int *parent_tidptr, void *newtls, int *child_tidptr) {
-  if (!fn || !child_stack)
-    return -EINVAL;
+  if (!fn || !child_stack) {
+    errno = EINVAL;
+    return -1;
+  }
   CHECK_EQ(0, (uptr)child_stack % 16);
   // Minimum frame size.
 #ifdef __s390x__
@@ -71,9 +73,9 @@ uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
   // And pass parameters.
   ((unsigned long *)child_stack)[1] = (uptr)fn;
   ((unsigned long *)child_stack)[2] = (uptr)arg;
-  register long res __asm__("r2");
+  register uptr res __asm__("r2");
   register void *__cstack      __asm__("r2") = child_stack;
-  register int __flags         __asm__("r3") = flags;
+  register long __flags        __asm__("r3") = flags;
   register int * __ptidptr     __asm__("r4") = parent_tidptr;
   register int * __ctidptr     __asm__("r5") = child_tidptr;
   register void * __newtls     __asm__("r6") = newtls;
@@ -113,6 +115,10 @@ uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
                          "r"(__ctidptr),
                          "r"(__newtls)
                        : "memory", "cc");
+  if (res >= (uptr)-4095) {
+    errno = -res;
+    return -1;
+  }
   return res;
 }
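
The new `res >= (uptr)-4095` test follows the usual Linux syscall return convention: raw values in the range [-4095, -1] encode a negated errno, which the hunk converts into the errno/-1 style expected by callers. A standalone sketch of that decoding (helper name is illustrative):

  // Returns true and fills *err when a raw Linux syscall return value encodes an error.
  static inline bool SyscallFailed(uptr raw, int *err) {
    if (raw >= (uptr)-4095) {  // the top 4095 values are reserved for -errno
      *err = -(long)raw;       // recover the positive errno value
      return true;
    }
    return false;
  }
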
 
diff --git a/libsanitizer/sanitizer_common/sanitizer_mac.cpp b/libsanitizer/sanitizer_common/sanitizer_mac.cpp
index a61cde891b9..b67203d4c10 100644
--- a/libsanitizer/sanitizer_common/sanitizer_mac.cpp
+++ b/libsanitizer/sanitizer_common/sanitizer_mac.cpp
@@ -37,7 +37,7 @@
 extern char **environ;
 #endif
 
-#if defined(__has_include) && __has_include(<os/trace.h>) && defined(__BLOCKS__)
+#if defined(__has_include) && __has_include(<os/trace.h>)
 #define SANITIZER_OS_TRACE 1
 #include <os/trace.h>
 #else
@@ -70,15 +70,7 @@ extern "C" {
 #include <mach/mach_time.h>
 #include <mach/vm_statistics.h>
 #include <malloc/malloc.h>
-#if defined(__has_builtin) && __has_builtin(__builtin_os_log_format)
-# include <os/log.h>
-#else
-   /* Without support for __builtin_os_log_format, fall back to the older
-      method.  */
-# define OS_LOG_DEFAULT 0
-# define os_log_error(A,B,C) \
-  asl_log(nullptr, nullptr, ASL_LEVEL_ERR, "%s", (C));
-#endif
+#include <os/log.h>
 #include <pthread.h>
 #include <sched.h>
 #include <signal.h>
diff --git a/libsanitizer/sanitizer_common/sanitizer_mac.h b/libsanitizer/sanitizer_common/sanitizer_mac.h
index 96a5986a47a..0b6af5a3c0e 100644
--- a/libsanitizer/sanitizer_common/sanitizer_mac.h
+++ b/libsanitizer/sanitizer_common/sanitizer_mac.h
@@ -14,26 +14,6 @@
 
 #include "sanitizer_common.h"
 #include "sanitizer_platform.h"
-
-/* TARGET_OS_OSX is not present in SDKs before Darwin16 (macOS 10.12) use
-   TARGET_OS_MAC (we have no support for iOS in any form for these versions,
-   so there's no ambiguity).  */
-#if !defined(TARGET_OS_OSX) && TARGET_OS_MAC
-# define TARGET_OS_OSX 1
-#endif
-
-/* Other TARGET_OS_xxx are not present on earlier versions, define them to
-   0 (we have no support for them; they are not valid targets anyway).  */
-#ifndef TARGET_OS_IOS
-#define TARGET_OS_IOS 0
-#endif
-#ifndef TARGET_OS_TV
-#define TARGET_OS_TV 0
-#endif
-#ifndef TARGET_OS_WATCH
-#define TARGET_OS_WATCH 0
-#endif
-
 #if SANITIZER_MAC
 #include "sanitizer_posix.h"
 
diff --git a/libsanitizer/sanitizer_common/sanitizer_persistent_allocator.h b/libsanitizer/sanitizer_common/sanitizer_persistent_allocator.h
deleted file mode 100644
index e18b0030567..00000000000
--- a/libsanitizer/sanitizer_common/sanitizer_persistent_allocator.h
+++ /dev/null
@@ -1,110 +0,0 @@
-//===-- sanitizer_persistent_allocator.h ------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// A fast memory allocator that does not support free() nor realloc().
-// All allocations are forever.
-//===----------------------------------------------------------------------===//
-
-#ifndef SANITIZER_PERSISTENT_ALLOCATOR_H
-#define SANITIZER_PERSISTENT_ALLOCATOR_H
-
-#include "sanitizer_internal_defs.h"
-#include "sanitizer_mutex.h"
-#include "sanitizer_atomic.h"
-#include "sanitizer_common.h"
-
-namespace __sanitizer {
-
-template <typename T>
-class PersistentAllocator {
- public:
-  T *alloc(uptr count = 1);
-  uptr allocated() const { return atomic_load_relaxed(&mapped_size); }
-
-  void TestOnlyUnmap();
-
- private:
-  T *tryAlloc(uptr count);
-  T *refillAndAlloc(uptr count);
-  mutable StaticSpinMutex mtx;  // Protects alloc of new blocks.
-  atomic_uintptr_t region_pos;  // Region allocator for Node's.
-  atomic_uintptr_t region_end;
-  atomic_uintptr_t mapped_size;
-
-  struct BlockInfo {
-    const BlockInfo *next;
-    uptr ptr;
-    uptr size;
-  };
-  const BlockInfo *curr;
-};
-
-template <typename T>
-inline T *PersistentAllocator<T>::tryAlloc(uptr count) {
-  // Optimisic lock-free allocation, essentially try to bump the region ptr.
-  for (;;) {
-    uptr cmp = atomic_load(&region_pos, memory_order_acquire);
-    uptr end = atomic_load(&region_end, memory_order_acquire);
-    uptr size = count * sizeof(T);
-    if (cmp == 0 || cmp + size > end)
-      return nullptr;
-    if (atomic_compare_exchange_weak(&region_pos, &cmp, cmp + size,
-                                     memory_order_acquire))
-      return reinterpret_cast<T *>(cmp);
-  }
-}
-
-template <typename T>
-inline T *PersistentAllocator<T>::alloc(uptr count) {
-  // First, try to allocate optimisitically.
-  T *s = tryAlloc(count);
-  if (LIKELY(s))
-    return s;
-  return refillAndAlloc(count);
-}
-
-template <typename T>
-inline T *PersistentAllocator<T>::refillAndAlloc(uptr count) {
-  // If failed, lock, retry and alloc new superblock.
-  SpinMutexLock l(&mtx);
-  for (;;) {
-    T *s = tryAlloc(count);
-    if (s)
-      return s;
-    atomic_store(&region_pos, 0, memory_order_relaxed);
-    uptr size = count * sizeof(T) + sizeof(BlockInfo);
-    uptr allocsz = RoundUpTo(Max<uptr>(size, 64u * 1024u), GetPageSizeCached());
-    uptr mem = (uptr)MmapOrDie(allocsz, "stack depot");
-    BlockInfo *new_block = (BlockInfo *)(mem + allocsz) - 1;
-    new_block->next = curr;
-    new_block->ptr = mem;
-    new_block->size = allocsz;
-    curr = new_block;
-
-    atomic_fetch_add(&mapped_size, allocsz, memory_order_relaxed);
-
-    allocsz -= sizeof(BlockInfo);
-    atomic_store(&region_end, mem + allocsz, memory_order_release);
-    atomic_store(&region_pos, mem, memory_order_release);
-  }
-}
-
-template <typename T>
-void PersistentAllocator<T>::TestOnlyUnmap() {
-  while (curr) {
-    uptr mem = curr->ptr;
-    uptr allocsz = curr->size;
-    curr = curr->next;
-    UnmapOrDie((void *)mem, allocsz);
-  }
-  internal_memset(this, 0, sizeof(*this));
-}
-
-} // namespace __sanitizer
-
-#endif // SANITIZER_PERSISTENT_ALLOCATOR_H
diff --git a/libsanitizer/sanitizer_common/sanitizer_platform_limits_linux.cpp b/libsanitizer/sanitizer_common/sanitizer_platform_limits_linux.cpp
index 2b1a2f7932c..9d577570ea1 100644
--- a/libsanitizer/sanitizer_common/sanitizer_platform_limits_linux.cpp
+++ b/libsanitizer/sanitizer_common/sanitizer_platform_limits_linux.cpp
@@ -26,10 +26,7 @@
 
 // With old kernels (and even new kernels on powerpc) asm/stat.h uses types that
 // are not defined anywhere in userspace headers. Fake them. This seems to work
-// fine with newer headers, too.  Beware that with <sys/stat.h>, struct stat
-// takes the form of struct stat64 on 32-bit platforms if _FILE_OFFSET_BITS=64.
-// Also, for some platforms (e.g. mips) there are additional members in the
-// <sys/stat.h> struct stat:s.
+// fine with newer headers, too.
 #include <linux/posix_types.h>
 #  if defined(__x86_64__) || defined(__mips__) || defined(__hexagon__)
 #    include <sys/stat.h>
diff --git a/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h b/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h
index da53b5abef2..d69b344dd61 100644
--- a/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h
+++ b/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h
@@ -83,7 +83,7 @@ const unsigned struct_kernel_stat64_sz = 104;
 #elif defined(__mips__)
 const unsigned struct_kernel_stat_sz = SANITIZER_ANDROID
                                            ? FIRST_32_SECOND_64(104, 128)
-                                           : FIRST_32_SECOND_64(144, 216);
+                                           : FIRST_32_SECOND_64(160, 216);
 const unsigned struct_kernel_stat64_sz = 104;
 #elif defined(__s390__) && !defined(__s390x__)
 const unsigned struct_kernel_stat_sz = 64;
diff --git a/libsanitizer/sanitizer_common/sanitizer_printf.cpp b/libsanitizer/sanitizer_common/sanitizer_printf.cpp
index 79aee8ba628..3a9e366d2df 100644
--- a/libsanitizer/sanitizer_common/sanitizer_printf.cpp
+++ b/libsanitizer/sanitizer_common/sanitizer_printf.cpp
@@ -191,12 +191,12 @@ int VSNPrintf(char *buff, int buff_length,
         break;
       }
       case 'p': {
-        RAW_CHECK(!have_flags, kPrintfFormatsHelp, format);
+        RAW_CHECK_VA(!have_flags, kPrintfFormatsHelp, format);
         result += AppendPointer(&buff, buff_end, va_arg(args, uptr));
         break;
       }
       case 's': {
-        RAW_CHECK(!have_length, kPrintfFormatsHelp, format);
+        RAW_CHECK_VA(!have_length, kPrintfFormatsHelp, format);
         // Only left-justified width is supported.
         CHECK(!have_width || left_justified);
         result += AppendString(&buff, buff_end, left_justified ? -width : width,
@@ -204,17 +204,17 @@ int VSNPrintf(char *buff, int buff_length,
         break;
       }
       case 'c': {
-        RAW_CHECK(!have_flags, kPrintfFormatsHelp, format);
+        RAW_CHECK_VA(!have_flags, kPrintfFormatsHelp, format);
         result += AppendChar(&buff, buff_end, va_arg(args, int));
         break;
       }
       case '%' : {
-        RAW_CHECK(!have_flags, kPrintfFormatsHelp, format);
+        RAW_CHECK_VA(!have_flags, kPrintfFormatsHelp, format);
         result += AppendChar(&buff, buff_end, '%');
         break;
       }
       default: {
-        RAW_CHECK(false, kPrintfFormatsHelp, format);
+        RAW_CHECK_VA(false, kPrintfFormatsHelp, format);
       }
     }
   }
diff --git a/libsanitizer/sanitizer_common/sanitizer_stack_store.cpp b/libsanitizer/sanitizer_common/sanitizer_stack_store.cpp
new file mode 100644
index 00000000000..ad88e2bbbef
--- /dev/null
+++ b/libsanitizer/sanitizer_common/sanitizer_stack_store.cpp
@@ -0,0 +1,91 @@
+//===-- sanitizer_stack_store.cpp -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_stack_store.h"
+
+#include "sanitizer_atomic.h"
+#include "sanitizer_common.h"
+#include "sanitizer_stacktrace.h"
+
+namespace __sanitizer {
+
+static constexpr u32 kStackSizeBits = 16;
+
+StackStore::Id StackStore::Store(const StackTrace &trace) {
+  uptr *stack_trace = Alloc(trace.size + 1);
+  CHECK_LT(trace.size, 1 << kStackSizeBits);
+  *stack_trace = trace.size + (trace.tag << kStackSizeBits);
+  internal_memcpy(stack_trace + 1, trace.trace, trace.size * sizeof(uptr));
+  return reinterpret_cast<StackStore::Id>(stack_trace);
+}
+
+StackTrace StackStore::Load(Id id) {
+  const uptr *stack_trace = reinterpret_cast<const uptr *>(id);
+  uptr size = *stack_trace & ((1 << kStackSizeBits) - 1);
+  uptr tag = *stack_trace >> kStackSizeBits;
+  return StackTrace(stack_trace + 1, size, tag);
+}
+
+uptr *StackStore::TryAlloc(uptr count) {
+  // Optimistic lock-free allocation, essentially try to bump the region ptr.
+  for (;;) {
+    uptr cmp = atomic_load(&region_pos_, memory_order_acquire);
+    uptr end = atomic_load(&region_end_, memory_order_acquire);
+    uptr size = count * sizeof(uptr);
+    if (cmp == 0 || cmp + size > end)
+      return nullptr;
+    if (atomic_compare_exchange_weak(&region_pos_, &cmp, cmp + size,
+                                     memory_order_acquire))
+      return reinterpret_cast<uptr *>(cmp);
+  }
+}
+
+uptr *StackStore::Alloc(uptr count) {
+  // First, try to allocate optimistically.
+  uptr *s = TryAlloc(count);
+  if (LIKELY(s))
+    return s;
+  return RefillAndAlloc(count);
+}
+
+uptr *StackStore::RefillAndAlloc(uptr count) {
+  // If failed, lock, retry and alloc new superblock.
+  SpinMutexLock l(&mtx_);
+  for (;;) {
+    uptr *s = TryAlloc(count);
+    if (s)
+      return s;
+    atomic_store(&region_pos_, 0, memory_order_relaxed);
+    uptr size = count * sizeof(uptr) + sizeof(BlockInfo);
+    uptr allocsz = RoundUpTo(Max<uptr>(size, 64u * 1024u), GetPageSizeCached());
+    uptr mem = (uptr)MmapOrDie(allocsz, "stack depot");
+    BlockInfo *new_block = (BlockInfo *)(mem + allocsz) - 1;
+    new_block->next = curr_;
+    new_block->ptr = mem;
+    new_block->size = allocsz;
+    curr_ = new_block;
+
+    atomic_fetch_add(&mapped_size_, allocsz, memory_order_relaxed);
+
+    allocsz -= sizeof(BlockInfo);
+    atomic_store(&region_end_, mem + allocsz, memory_order_release);
+    atomic_store(&region_pos_, mem, memory_order_release);
+  }
+}
+
+void StackStore::TestOnlyUnmap() {
+  while (curr_) {
+    uptr mem = curr_->ptr;
+    uptr allocsz = curr_->size;
+    curr_ = curr_->next;
+    UnmapOrDie((void *)mem, allocsz);
+  }
+  internal_memset(this, 0, sizeof(*this));
+}
+
+}  // namespace __sanitizer
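The TryAlloc/RefillAndAlloc pair above carries over the lock-free bump-pointer scheme of the deleted PersistentAllocator: the fast path only bumps region_pos_ with a CAS, and the slow path takes the mutex and maps a fresh superblock. A standalone sketch of the fast path using std::atomic (illustrative names, not the sanitizer atomics):

#include <atomic>
#include <cstddef>
#include <cstdint>

struct BumpRegion {
  std::atomic<uintptr_t> pos{0};  // next free byte, 0 means "no region yet"
  std::atomic<uintptr_t> end{0};  // one past the last usable byte

  void *TryAlloc(size_t size) {
    for (;;) {
      uintptr_t cur = pos.load(std::memory_order_acquire);
      uintptr_t lim = end.load(std::memory_order_acquire);
      if (cur == 0 || cur + size > lim)
        return nullptr;  // exhausted: the caller must map a new block
      if (pos.compare_exchange_weak(cur, cur + size,
                                    std::memory_order_acquire))
        return reinterpret_cast<void *>(cur);  // we own [cur, cur + size)
    }
  }
};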
diff --git a/libsanitizer/sanitizer_common/sanitizer_stack_store.h b/libsanitizer/sanitizer_common/sanitizer_stack_store.h
new file mode 100644
index 00000000000..b5bbdccc20b
--- /dev/null
+++ b/libsanitizer/sanitizer_common/sanitizer_stack_store.h
@@ -0,0 +1,50 @@
+//===-- sanitizer_stack_store.h ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SANITIZER_STACK_STORE_H
+#define SANITIZER_STACK_STORE_H
+
+#include "sanitizer_atomic.h"
+#include "sanitizer_internal_defs.h"
+#include "sanitizer_mutex.h"
+#include "sanitizer_stacktrace.h"
+
+namespace __sanitizer {
+
+class StackStore {
+ public:
+  constexpr StackStore() = default;
+
+  using Id = uptr;
+
+  Id Store(const StackTrace &trace);
+  StackTrace Load(Id id);
+  uptr Allocated() const { return atomic_load_relaxed(&mapped_size_); }
+
+  void TestOnlyUnmap();
+
+ private:
+  uptr *Alloc(uptr count = 1);
+  uptr *TryAlloc(uptr count);
+  uptr *RefillAndAlloc(uptr count);
+  mutable StaticSpinMutex mtx_ = {};  // Protects alloc of new blocks.
+  atomic_uintptr_t region_pos_ = {};  // Region allocator for Nodes.
+  atomic_uintptr_t region_end_ = {};
+  atomic_uintptr_t mapped_size_ = {};
+
+  struct BlockInfo {
+    const BlockInfo *next;
+    uptr ptr;
+    uptr size;
+  };
+  const BlockInfo *curr_ = nullptr;
+};
+
+}  // namespace __sanitizer
+
+#endif  // SANITIZER_STACK_STORE_H
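Store() prepends a single header word to each saved trace and Load() decodes it; with kStackSizeBits == 16, the low 16 bits hold the frame count and the remaining bits hold the tag. A standalone sketch of the same packing (helper names are illustrative):

#include <cassert>
#include <cstdint>

constexpr uint32_t kStackSizeBits = 16;

inline uint64_t PackHeader(uint64_t size, uint64_t tag) {
  assert(size < (1u << kStackSizeBits));          // size must fit in 16 bits
  return size + (tag << kStackSizeBits);
}

inline void UnpackHeader(uint64_t header, uint64_t *size, uint64_t *tag) {
  *size = header & ((1u << kStackSizeBits) - 1);  // low 16 bits
  *tag = header >> kStackSizeBits;                // everything above
}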
diff --git a/libsanitizer/sanitizer_common/sanitizer_stackdepot.cpp b/libsanitizer/sanitizer_common/sanitizer_stackdepot.cpp
index 02855459922..e203b2cc4c8 100644
--- a/libsanitizer/sanitizer_common/sanitizer_stackdepot.cpp
+++ b/libsanitizer/sanitizer_common/sanitizer_stackdepot.cpp
@@ -14,20 +14,17 @@
 
 #include "sanitizer_common.h"
 #include "sanitizer_hash.h"
-#include "sanitizer_persistent_allocator.h"
+#include "sanitizer_stack_store.h"
 #include "sanitizer_stackdepotbase.h"
 
 namespace __sanitizer {
 
-static PersistentAllocator<uptr> traceAllocator;
-
 struct StackDepotNode {
   using hash_type = u64;
   hash_type stack_hash;
   u32 link;
 
   static const u32 kTabSizeLog = SANITIZER_ANDROID ? 16 : 20;
-  static const u32 kStackSizeBits = 16;
 
   typedef StackTrace args_type;
   bool eq(hash_type hash, const args_type &args) const {
@@ -50,14 +47,17 @@ struct StackDepotNode {
   typedef StackDepotHandle handle_type;
 };
 
+static StackStore stackStore;
+
 // FIXME(dvyukov): this single reserved bit is used in TSan.
 typedef StackDepotBase<StackDepotNode, 1, StackDepotNode::kTabSizeLog>
     StackDepot;
 static StackDepot theDepot;
 // Keep rarely accessed stack traces out of frequently accessed nodes to improve
 // caching efficiency.
-static TwoLevelMap<uptr *, StackDepot::kNodesSize1, StackDepot::kNodesSize2>
-    tracePtrs;
+static TwoLevelMap<StackStore::Id, StackDepot::kNodesSize1,
+                   StackDepot::kNodesSize2>
+    storeIds;
 // Keep mutable data out of frequently accessed nodes to improve caching
 // efficiency.
 static TwoLevelMap<atomic_uint32_t, StackDepot::kNodesSize1,
@@ -73,26 +73,20 @@ void StackDepotHandle::inc_use_count_unsafe() {
 }
 
 uptr StackDepotNode::allocated() {
-  return traceAllocator.allocated() + tracePtrs.MemoryUsage() +
+  return stackStore.Allocated() + storeIds.MemoryUsage() +
          useCounts.MemoryUsage();
 }
 
 void StackDepotNode::store(u32 id, const args_type &args, hash_type hash) {
   stack_hash = hash;
-  uptr *stack_trace = traceAllocator.alloc(args.size + 1);
-  CHECK_LT(args.size, 1 << kStackSizeBits);
-  *stack_trace = args.size + (args.tag << kStackSizeBits);
-  internal_memcpy(stack_trace + 1, args.trace, args.size * sizeof(uptr));
-  tracePtrs[id] = stack_trace;
+  storeIds[id] = stackStore.Store(args);
 }
 
 StackDepotNode::args_type StackDepotNode::load(u32 id) const {
-  const uptr *stack_trace = tracePtrs[id];
-  if (!stack_trace)
+  StackStore::Id store_id = storeIds[id];
+  if (!store_id)
     return {};
-  uptr size = *stack_trace & ((1 << kStackSizeBits) - 1);
-  uptr tag = *stack_trace >> kStackSizeBits;
-  return args_type(stack_trace + 1, size, tag);
+  return stackStore.Load(store_id);
 }
 
 StackDepotStats StackDepotGetStats() { return theDepot.GetStats(); }
@@ -127,8 +121,8 @@ StackDepotHandle StackDepotNode::get_handle(u32 id) {
 
 void StackDepotTestOnlyUnmap() {
   theDepot.TestOnlyUnmap();
-  tracePtrs.TestOnlyUnmap();
-  traceAllocator.TestOnlyUnmap();
+  storeIds.TestOnlyUnmap();
+  stackStore.TestOnlyUnmap();
 }
 
 } // namespace __sanitizer
diff --git a/libsanitizer/sanitizer_common/sanitizer_stacktrace.cpp b/libsanitizer/sanitizer_common/sanitizer_stacktrace.cpp
index 5a12422fc6f..37e9e6dd08d 100644
--- a/libsanitizer/sanitizer_common/sanitizer_stacktrace.cpp
+++ b/libsanitizer/sanitizer_common/sanitizer_stacktrace.cpp
@@ -86,8 +86,8 @@ static inline uhwptr *GetCanonicFrame(uptr bp,
   // Nope, this does not look right either. This means the frame after next does
   // not have a valid frame pointer, but we can still extract the caller PC.
   // Unfortunately, there is no way to decide between GCC and LLVM frame
-  // layouts. Assume GCC.
-  return bp_prev - 1;
+  // layouts. Assume LLVM.
+  return bp_prev;
 #else
   return (uhwptr*)bp;
 #endif
@@ -110,21 +110,14 @@ void BufferedStackTrace::UnwindFast(uptr pc, uptr bp, uptr stack_top,
          IsAligned((uptr)frame, sizeof(*frame)) &&
          size < max_depth) {
 #ifdef __powerpc__
-    // PowerPC ABIs specify that the return address is saved on the
-    // *caller's* stack frame.  Thus we must dereference the back chain
-    // to find the caller frame before extracting it.
+    // PowerPC ABIs specify that the return address is saved at offset
+    // 16 of the *caller's* stack frame.  Thus we must dereference the
+    // back chain to find the caller frame before extracting it.
     uhwptr *caller_frame = (uhwptr*)frame[0];
     if (!IsValidFrame((uptr)caller_frame, stack_top, bottom) ||
         !IsAligned((uptr)caller_frame, sizeof(uhwptr)))
       break;
-    // For most ABIs the offset where the return address is saved is two
-    // register sizes.  The exception is the SVR4 ABI, which uses an
-    // offset of only one register size.
-#ifdef _CALL_SYSV
-    uhwptr pc1 = caller_frame[1];
-#else
     uhwptr pc1 = caller_frame[2];
-#endif
 #elif defined(__s390__)
     uhwptr pc1 = frame[14];
 #elif defined(__riscv)
diff --git a/libsanitizer/sanitizer_common/sanitizer_thread_registry.cpp b/libsanitizer/sanitizer_common/sanitizer_thread_registry.cpp
index a34b8c15aa5..2e1cd023881 100644
--- a/libsanitizer/sanitizer_common/sanitizer_thread_registry.cpp
+++ b/libsanitizer/sanitizer_common/sanitizer_thread_registry.cpp
@@ -13,6 +13,8 @@
 
 #include "sanitizer_thread_registry.h"
 
+#include "sanitizer_placement_new.h"
+
 namespace __sanitizer {
 
 ThreadContextBase::ThreadContextBase(u32 tid)
@@ -162,6 +164,12 @@ u32 ThreadRegistry::CreateThread(uptr user_id, bool detached, u32 parent_tid,
     max_alive_threads_++;
     CHECK_EQ(alive_threads_, max_alive_threads_);
   }
+  if (user_id) {
+    // Ensure that user_id is unique. If it's not the case we are screwed.
+    // Ignoring this situation may lead to very hard to debug false
+    // positives later (e.g. if we join a wrong thread).
+    CHECK(live_.try_emplace(user_id, tid).second);
+  }
   tctx->SetCreated(user_id, total_threads_++, detached,
                    parent_tid, arg);
   return tid;
@@ -221,14 +229,8 @@ void ThreadRegistry::SetThreadName(u32 tid, const char *name) {
 
 void ThreadRegistry::SetThreadNameByUserId(uptr user_id, const char *name) {
   ThreadRegistryLock l(this);
-  for (u32 tid = 0; tid < threads_.size(); tid++) {
-    ThreadContextBase *tctx = threads_[tid];
-    if (tctx != 0 && tctx->user_id == user_id &&
-        tctx->status != ThreadStatusInvalid) {
-      tctx->SetName(name);
-      return;
-    }
-  }
+  if (const auto *tid = live_.find(user_id))
+    threads_[tid->second]->SetName(name);
 }
 
 void ThreadRegistry::DetachThread(u32 tid, void *arg) {
@@ -241,6 +243,8 @@ void ThreadRegistry::DetachThread(u32 tid, void *arg) {
   }
   tctx->OnDetached(arg);
   if (tctx->status == ThreadStatusFinished) {
+    if (tctx->user_id)
+      live_.erase(tctx->user_id);
     tctx->SetDead();
     QuarantinePush(tctx);
   } else {
@@ -260,6 +264,8 @@ void ThreadRegistry::JoinThread(u32 tid, void *arg) {
         return;
       }
       if ((destroyed = tctx->GetDestroyed())) {
+        if (tctx->user_id)
+          live_.erase(tctx->user_id);
         tctx->SetJoined(arg);
         QuarantinePush(tctx);
       }
@@ -292,6 +298,8 @@ ThreadStatus ThreadRegistry::FinishThread(u32 tid) {
   }
   tctx->SetFinished();
   if (dead) {
+    if (tctx->user_id)
+      live_.erase(tctx->user_id);
     tctx->SetDead();
     QuarantinePush(tctx);
   }
@@ -333,6 +341,19 @@ ThreadContextBase *ThreadRegistry::QuarantinePop() {
   return tctx;
 }
 
+u32 ThreadRegistry::ConsumeThreadUserId(uptr user_id) {
+  ThreadRegistryLock l(this);
+  u32 tid;
+  auto *t = live_.find(user_id);
+  CHECK(t);
+  tid = t->second;
+  live_.erase(t);
+  auto *tctx = threads_[tid];
+  CHECK_EQ(tctx->user_id, user_id);
+  tctx->user_id = 0;
+  return tid;
+}
+
 void ThreadRegistry::SetThreadUserId(u32 tid, uptr user_id) {
   ThreadRegistryLock l(this);
   ThreadContextBase *tctx = threads_[tid];
@@ -341,6 +362,7 @@ void ThreadRegistry::SetThreadUserId(u32 tid, uptr user_id) {
   CHECK_NE(tctx->status, ThreadStatusDead);
   CHECK_EQ(tctx->user_id, 0);
   tctx->user_id = user_id;
+  CHECK(live_.try_emplace(user_id, tctx->tid).second);
 }
 
 }  // namespace __sanitizer
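The new live_ map gives the registry a direct user_id -> tid index: CreateThread and SetThreadUserId insert (and assert uniqueness), the detach/join/finish paths erase, and SetThreadNameByUserId/ConsumeThreadUserId look up instead of scanning every thread context. A rough standalone analogy using std::unordered_map (the patch itself uses the new sanitizer DenseMap):

#include <cassert>
#include <cstdint>
#include <unordered_map>

class LiveIndex {
  std::unordered_map<uintptr_t, uint32_t> live_;  // user_id (pthread_t) -> tid

 public:
  void Register(uintptr_t user_id, uint32_t tid) {
    bool inserted = live_.try_emplace(user_id, tid).second;
    assert(inserted && "duplicate user_id");  // mirrors the CHECK() above
    (void)inserted;
  }
  bool Find(uintptr_t user_id, uint32_t *tid) const {
    auto it = live_.find(user_id);
    if (it == live_.end()) return false;
    *tid = it->second;
    return true;
  }
  void Erase(uintptr_t user_id) { live_.erase(user_id); }
};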
diff --git a/libsanitizer/sanitizer_common/sanitizer_thread_registry.h b/libsanitizer/sanitizer_common/sanitizer_thread_registry.h
index a8a4d4d86a0..89e5fefa340 100644
--- a/libsanitizer/sanitizer_common/sanitizer_thread_registry.h
+++ b/libsanitizer/sanitizer_common/sanitizer_thread_registry.h
@@ -15,6 +15,7 @@
 #define SANITIZER_THREAD_REGISTRY_H
 
 #include "sanitizer_common.h"
+#include "sanitizer_dense_map.h"
 #include "sanitizer_list.h"
 #include "sanitizer_mutex.h"
 
@@ -103,6 +104,8 @@ class MUTEX ThreadRegistry {
     return threads_.empty() ? nullptr : threads_[tid];
   }
 
+  u32 NumThreadsLocked() const { return threads_.size(); }
+
   u32 CreateThread(uptr user_id, bool detached, u32 parent_tid, void *arg);
 
   typedef void (*ThreadCallback)(ThreadContextBase *tctx, void *arg);
@@ -127,6 +130,7 @@ class MUTEX ThreadRegistry {
   // Finishes thread and returns previous status.
   ThreadStatus FinishThread(u32 tid);
   void StartThread(u32 tid, tid_t os_id, ThreadType thread_type, void *arg);
+  u32 ConsumeThreadUserId(uptr user_id);
   void SetThreadUserId(u32 tid, uptr user_id);
 
  private:
@@ -146,6 +150,7 @@ class MUTEX ThreadRegistry {
   InternalMmapVector<ThreadContextBase *> threads_;
   IntrusiveList<ThreadContextBase> dead_threads_;
   IntrusiveList<ThreadContextBase> invalid_threads_;
+  DenseMap<uptr, Tid> live_;
 
   void QuarantinePush(ThreadContextBase *tctx);
   ThreadContextBase *QuarantinePop();
diff --git a/libsanitizer/sanitizer_common/sanitizer_type_traits.h b/libsanitizer/sanitizer_common/sanitizer_type_traits.h
index 2a58d9874d2..06a44d1b5c7 100644
--- a/libsanitizer/sanitizer_common/sanitizer_type_traits.h
+++ b/libsanitizer/sanitizer_common/sanitizer_type_traits.h
@@ -13,6 +13,8 @@
 #ifndef SANITIZER_TYPE_TRAITS_H
 #define SANITIZER_TYPE_TRAITS_H
 
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
 namespace __sanitizer {
 
 struct true_type {
@@ -57,6 +59,83 @@ struct conditional<false, T, F> {
   using type = F;
 };
 
+template <class T>
+struct remove_reference {
+  using type = T;
+};
+template <class T>
+struct remove_reference<T&> {
+  using type = T;
+};
+template <class T>
+struct remove_reference<T&&> {
+  using type = T;
+};
+
+template <class T>
+WARN_UNUSED_RESULT inline typename remove_reference<T>::type&& move(T&& t) {
+  return static_cast<typename remove_reference<T>::type&&>(t);
+}
+
+template <class T>
+WARN_UNUSED_RESULT inline constexpr T&& forward(
+    typename remove_reference<T>::type& t) {
+  return static_cast<T&&>(t);
+}
+
+template <class T>
+WARN_UNUSED_RESULT inline constexpr T&& forward(
+    typename remove_reference<T>::type&& t) {
+  return static_cast<T&&>(t);
+}
+
+template <class T, T v>
+struct integral_constant {
+  static constexpr const T value = v;
+  typedef T value_type;
+  typedef integral_constant type;
+  constexpr operator value_type() const { return value; }
+  constexpr value_type operator()() const { return value; }
+};
+
+#ifndef __has_builtin
+#  define __has_builtin(x) 0
+#endif
+
+#if __has_builtin(__is_trivially_destructible)
+
+template <class T>
+struct is_trivially_destructible
+    : public integral_constant<bool, __is_trivially_destructible(T)> {};
+
+#elif __has_builtin(__has_trivial_destructor)
+
+template <class T>
+struct is_trivially_destructible
+    : public integral_constant<bool, __has_trivial_destructor(T)> {};
+
+#else
+
+template <class T>
+struct is_trivially_destructible
+    : public integral_constant<bool, /* less efficient fallback */ false> {};
+
+#endif
+
+#if __has_builtin(__is_trivially_copyable)
+
+template <class T>
+struct is_trivially_copyable
+    : public integral_constant<bool, __is_trivially_copyable(T)> {};
+
+#else
+
+template <class T>
+struct is_trivially_copyable
+    : public integral_constant<bool, /* less efficient fallback */ false> {};
+
+#endif
+
 }  // namespace __sanitizer
 
 #endif
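The added traits are the usual building blocks: integral_constant turns a compile-time predicate into a value that generic code can branch on, e.g. to take a bitwise-copy fast path only for trivially copyable element types. A small usage sketch with the standard-library equivalents (not part of the patch):

#include <cstdio>
#include <type_traits>

template <typename T>
void MaybeFastCopy() {
  if (std::is_trivially_copyable<T>::value)
    std::printf("bitwise copy is fine\n");      // fast path
  else
    std::printf("element-wise copy needed\n");  // safe path
}

struct Pod { int x; };
struct NonPod { NonPod(const NonPod &) {} };    // user-defined copy ctor

int main() {
  MaybeFastCopy<Pod>();     // prints: bitwise copy is fine
  MaybeFastCopy<NonPod>();  // prints: element-wise copy needed
}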
diff --git a/libsanitizer/tsan/tsan_defs.h b/libsanitizer/tsan/tsan_defs.h
index fe0c1da3159..752020534ed 100644
--- a/libsanitizer/tsan/tsan_defs.h
+++ b/libsanitizer/tsan/tsan_defs.h
@@ -63,6 +63,13 @@ enum class Epoch : u16 {};
 constexpr uptr kEpochBits = 14;
 constexpr Epoch kEpochZero = static_cast<Epoch>(0);
 constexpr Epoch kEpochOver = static_cast<Epoch>(1 << kEpochBits);
+constexpr Epoch kEpochLast = static_cast<Epoch>((1 << kEpochBits) - 1);
+
+inline Epoch EpochInc(Epoch epoch) {
+  return static_cast<Epoch>(static_cast<u16>(epoch) + 1);
+}
+
+inline bool EpochOverflow(Epoch epoch) { return epoch == kEpochOver; }
 
 const int kClkBits = 42;
 const unsigned kMaxTidReuse = (1 << (64 - kClkBits)) - 1;
@@ -107,7 +114,7 @@ const uptr kShadowCnt = 4;
 const uptr kShadowCell = 8;
 
 // Single shadow value.
-typedef u64 RawShadow;
+enum class RawShadow : u32 {};
 const uptr kShadowSize = sizeof(RawShadow);
 
 // Shadow memory is kShadowMultiplier times larger than user memory.
@@ -184,10 +191,13 @@ MD5Hash md5_hash(const void *data, uptr size);
 struct Processor;
 struct ThreadState;
 class ThreadContext;
+struct TidSlot;
 struct Context;
 struct ReportStack;
 class ReportDesc;
 class RegionAlloc;
+struct Trace;
+struct TracePart;
 
 typedef uptr AccessType;
 
@@ -198,6 +208,8 @@ enum : AccessType {
   kAccessVptr = 1 << 2,  // read or write of an object virtual table pointer
   kAccessFree = 1 << 3,  // synthetic memory access during memory freeing
   kAccessExternalPC = 1 << 4,  // access PC can have kExternalPCBit set
+  kAccessCheckOnly = 1 << 5,   // check for races, but don't store
+  kAccessNoRodata = 1 << 6,    // don't check for .rodata marker
 };
 
 // Descriptor of user's memory block.
@@ -219,15 +231,18 @@ enum ExternalTag : uptr {
   // as 16-bit values, see tsan_defs.h.
 };
 
-enum MutexType {
-  MutexTypeTrace = MutexLastCommon,
-  MutexTypeReport,
+enum {
+  MutexTypeReport = MutexLastCommon,
   MutexTypeSyncVar,
   MutexTypeAnnotations,
   MutexTypeAtExit,
   MutexTypeFired,
   MutexTypeRacy,
   MutexTypeGlobalProc,
+  MutexTypeTrace,
+  MutexTypeSlot,
+  MutexTypeSlots,
+  MutexTypeMultiSlot,
 };
 
 }  // namespace __tsan
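The new Epoch helpers make the 14-bit epoch arithmetic explicit: kEpochLast is the largest usable value and kEpochOver (one past it) is the overflow sentinel that EpochOverflow() tests for. A standalone sketch mirroring the definitions above:

#include <cstdint>
#include <cstdio>

enum class Epoch : uint16_t {};
constexpr int kEpochBits = 14;
constexpr Epoch kEpochOver = static_cast<Epoch>(1 << kEpochBits);
constexpr Epoch kEpochLast = static_cast<Epoch>((1 << kEpochBits) - 1);

inline Epoch EpochInc(Epoch epoch) {
  return static_cast<Epoch>(static_cast<uint16_t>(epoch) + 1);
}
inline bool EpochOverflow(Epoch epoch) { return epoch == kEpochOver; }

int main() {
  Epoch e = kEpochLast;
  e = EpochInc(e);                                 // steps past the last epoch
  std::printf("overflow=%d\n", EpochOverflow(e));  // prints: overflow=1
}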
diff --git a/libsanitizer/tsan/tsan_dense_alloc.h b/libsanitizer/tsan/tsan_dense_alloc.h
index 9e15f74a061..7a39a39d51d 100644
--- a/libsanitizer/tsan/tsan_dense_alloc.h
+++ b/libsanitizer/tsan/tsan_dense_alloc.h
@@ -104,6 +104,15 @@ class DenseSlabAlloc {
     return atomic_load_relaxed(&fillpos_) * kL2Size * sizeof(T);
   }
 
+  template <typename Func>
+  void ForEach(Func func) {
+    SpinMutexLock lock(&mtx_);
+    uptr fillpos = atomic_load_relaxed(&fillpos_);
+    for (uptr l1 = 0; l1 < fillpos; l1++) {
+      for (IndexT l2 = l1 == 0 ? 1 : 0; l2 < kL2Size; l2++) func(&map_[l1][l2]);
+    }
+  }
+
  private:
   T *map_[kL1Size];
   SpinMutex mtx_;
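The new ForEach() lets a caller visit every slot the slab allocator has handed out while holding the allocator mutex, the kind of walk the runtime needs when it resets all objects at once. A standalone analogy of the same callback pattern over a block table (std::mutex/std::vector instead of the sanitizer primitives):

#include <mutex>
#include <vector>

template <typename T>
class SlabTable {
  std::mutex mtx_;
  std::vector<std::vector<T>> blocks_;

 public:
  void AddBlock(std::vector<T> block) {
    std::lock_guard<std::mutex> lock(mtx_);
    blocks_.push_back(std::move(block));
  }

  template <typename Func>
  void ForEach(Func func) {
    std::lock_guard<std::mutex> lock(mtx_);  // keep the table stable while walking
    for (auto &block : blocks_)
      for (auto &slot : block) func(&slot);
  }
};

// Usage: SlabTable<int> t;  t.AddBlock({1, 2, 3});
//        int sum = 0;  t.ForEach([&](int *p) { sum += *p; });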
diff --git a/libsanitizer/tsan/tsan_flags.cpp b/libsanitizer/tsan/tsan_flags.cpp
index ee89862d17b..54bed9f9a6b 100644
--- a/libsanitizer/tsan/tsan_flags.cpp
+++ b/libsanitizer/tsan/tsan_flags.cpp
@@ -110,12 +110,6 @@ void InitializeFlags(Flags *f, const char *env, const char *env_option_name) {
 
   if (common_flags()->help) parser.PrintFlagDescriptions();
 
-  if (f->history_size < 0 || f->history_size > 7) {
-    Printf("ThreadSanitizer: incorrect value for history_size"
-           " (must be [0..7])\n");
-    Die();
-  }
-
   if (f->io_sync < 0 || f->io_sync > 2) {
     Printf("ThreadSanitizer: incorrect value for io_sync"
            " (must be [0..2])\n");
diff --git a/libsanitizer/tsan/tsan_flags.inc b/libsanitizer/tsan/tsan_flags.inc
index 7954a4307fa..3df180ec68c 100644
--- a/libsanitizer/tsan/tsan_flags.inc
+++ b/libsanitizer/tsan/tsan_flags.inc
@@ -59,14 +59,10 @@ TSAN_FLAG(bool, stop_on_start, false,
           "Stops on start until __tsan_resume() is called (for debugging).")
 TSAN_FLAG(bool, running_on_valgrind, false,
           "Controls whether RunningOnValgrind() returns true or false.")
-// There are a lot of goroutines in Go, so we use smaller history.
 TSAN_FLAG(
-    int, history_size, SANITIZER_GO ? 1 : 3,
-    "Per-thread history size, controls how many previous memory accesses "
-    "are remembered per thread.  Possible values are [0..7]. "
-    "history_size=0 amounts to 32K memory accesses.  Each next value doubles "
-    "the amount of memory accesses, up to history_size=7 that amounts to "
-    "4M memory accesses.  The default value is 2 (128K memory accesses).")
+    uptr, history_size, 0,
+    "Per-thread history size,"
+    " controls how many extra previous memory accesses are remembered per thread.")
 TSAN_FLAG(int, io_sync, 1,
           "Controls level of synchronization implied by IO operations. "
           "0 - no synchronization "
diff --git a/libsanitizer/tsan/tsan_interceptors_posix.cpp b/libsanitizer/tsan/tsan_interceptors_posix.cpp
index 9a85ee00d2d..9d17e9bbdc0 100644
--- a/libsanitizer/tsan/tsan_interceptors_posix.cpp
+++ b/libsanitizer/tsan/tsan_interceptors_posix.cpp
@@ -90,6 +90,7 @@ DECLARE_REAL(int, pthread_mutexattr_gettype, void *, void *)
 DECLARE_REAL(int, fflush, __sanitizer_FILE *fp)
 DECLARE_REAL_AND_INTERCEPTOR(void *, malloc, uptr size)
 DECLARE_REAL_AND_INTERCEPTOR(void, free, void *ptr)
+extern "C" int pthread_equal(void *t1, void *t2);
 extern "C" void *pthread_self();
 extern "C" void _exit(int status);
 #if !SANITIZER_NETBSD
@@ -880,10 +881,11 @@ static int guard_acquire(ThreadState *thr, uptr pc, atomic_uint32_t *g,
   }
 }
 
-static void guard_release(ThreadState *thr, uptr pc, atomic_uint32_t *g) {
+static void guard_release(ThreadState *thr, uptr pc, atomic_uint32_t *g,
+                          u32 v) {
   if (!thr->in_ignored_lib)
     Release(thr, pc, (uptr)g);
-  u32 old = atomic_exchange(g, kGuardDone, memory_order_release);
+  u32 old = atomic_exchange(g, v, memory_order_release);
   if (old & kGuardWaiter)
     FutexWake(g, 1 << 30);
 }
@@ -913,12 +915,12 @@ STDCXX_INTERCEPTOR(int, __cxa_guard_acquire, atomic_uint32_t *g) {
 
 STDCXX_INTERCEPTOR(void, __cxa_guard_release, atomic_uint32_t *g) {
   SCOPED_INTERCEPTOR_RAW(__cxa_guard_release, g);
-  guard_release(thr, pc, g);
+  guard_release(thr, pc, g, kGuardDone);
 }
 
 STDCXX_INTERCEPTOR(void, __cxa_guard_abort, atomic_uint32_t *g) {
   SCOPED_INTERCEPTOR_RAW(__cxa_guard_abort, g);
-  atomic_store(g, kGuardInit, memory_order_relaxed);
+  guard_release(thr, pc, g, kGuardInit);
 }
 
 namespace __tsan {
@@ -1515,7 +1517,7 @@ TSAN_INTERCEPTOR(int, pthread_once, void *o, void (*f)()) {
   // result in crashes due to too little stack space.
   if (guard_acquire(thr, pc, a, !SANITIZER_MAC)) {
     (*f)();
-    guard_release(thr, pc, a);
+    guard_release(thr, pc, a, kGuardDone);
   }
   return 0;
 }
@@ -1965,6 +1967,7 @@ static void ReportErrnoSpoiling(ThreadState *thr, uptr pc) {
 static void CallUserSignalHandler(ThreadState *thr, bool sync, bool acquire,
                                   int sig, __sanitizer_siginfo *info,
                                   void *uctx) {
+  CHECK(thr->slot);
   __sanitizer_sigaction *sigactions = interceptor_ctx()->sigactions;
   if (acquire)
     Acquire(thr, 0, (uptr)&sigactions[sig]);
@@ -2132,11 +2135,11 @@ TSAN_INTERCEPTOR(int, pthread_kill, void *tid, int sig) {
   ThreadSignalContext *sctx = SigCtx(thr);
   CHECK_NE(sctx, 0);
   int prev = sctx->int_signal_send;
-  if (tid == pthread_self()) {
+  bool self = pthread_equal(tid, pthread_self());
+  if (self)
     sctx->int_signal_send = sig;
-  }
   int res = REAL(pthread_kill)(tid, sig);
-  if (tid == pthread_self()) {
+  if (self) {
     CHECK_EQ(sctx->int_signal_send, sig);
     sctx->int_signal_send = prev;
   }
@@ -2252,7 +2255,7 @@ struct dl_iterate_phdr_data {
 };
 
 static bool IsAppNotRodata(uptr addr) {
-  return IsAppMem(addr) && *MemToShadow(addr) != kShadowRodata;
+  return IsAppMem(addr) && *MemToShadow(addr) != Shadow::kRodata;
 }
 
 static int dl_iterate_phdr_cb(__sanitizer_dl_phdr_info *info, SIZE_T size,
@@ -2391,8 +2394,11 @@ static void HandleRecvmsg(ThreadState *thr, uptr pc,
 #define COMMON_INTERCEPTOR_SET_THREAD_NAME(ctx, name) \
   ThreadSetName(((TsanInterceptorContext *) ctx)->thr, name)
 
-#define COMMON_INTERCEPTOR_SET_PTHREAD_NAME(ctx, thread, name) \
-  __tsan::ctx->thread_registry.SetThreadNameByUserId(thread, name)
+#define COMMON_INTERCEPTOR_SET_PTHREAD_NAME(ctx, thread, name)         \
+  if (pthread_equal(pthread_self(), reinterpret_cast<void *>(thread))) \
+    COMMON_INTERCEPTOR_SET_THREAD_NAME(ctx, name);                     \
+  else                                                                 \
+    __tsan::ctx->thread_registry.SetThreadNameByUserId(thread, name)
 
 #define COMMON_INTERCEPTOR_BLOCK_REAL(name) BLOCK_REAL(name)
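The pthread_kill and SET_PTHREAD_NAME hunks switch from comparing pthread_t values with == to pthread_equal(): POSIX does not require pthread_t to be an arithmetic type, so direct comparison is not portable. A minimal standalone illustration:

#include <cstdio>
#include <pthread.h>

int main() {
  pthread_t self = pthread_self();
  // Portable way to ask "does this handle refer to the current thread?"
  if (pthread_equal(self, pthread_self()))
    std::printf("same thread\n");
  return 0;
}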
 
diff --git a/libsanitizer/tsan/tsan_interface_atomic.cpp b/libsanitizer/tsan/tsan_interface_atomic.cpp
index 24ba3bb1f65..f794a2fcdd0 100644
--- a/libsanitizer/tsan/tsan_interface_atomic.cpp
+++ b/libsanitizer/tsan/tsan_interface_atomic.cpp
@@ -235,8 +235,9 @@ static T AtomicLoad(ThreadState *thr, uptr pc, const volatile T *a, morder mo) {
   T v = NoTsanAtomicLoad(a, mo);
   SyncVar *s = ctx->metamap.GetSyncIfExists((uptr)a);
   if (s) {
-    ReadLock l(&s->mtx);
-    AcquireImpl(thr, pc, &s->clock);
+    SlotLocker locker(thr);
+    ReadLock lock(&s->mtx);
+    thr->clock.Acquire(s->clock);
     // Re-read under sync mutex because we need a consistent snapshot
     // of the value and the clock we acquire.
     v = NoTsanAtomicLoad(a, mo);
@@ -270,14 +271,14 @@ static void AtomicStore(ThreadState *thr, uptr pc, volatile T *a, T v,
     NoTsanAtomicStore(a, v, mo);
     return;
   }
-  __sync_synchronize();
-  SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
-  Lock l(&s->mtx);
-  thr->fast_state.IncrementEpoch();
-  // Can't increment epoch w/o writing to the trace as well.
-  TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
-  ReleaseStoreImpl(thr, pc, &s->clock);
-  NoTsanAtomicStore(a, v, mo);
+  SlotLocker locker(thr);
+  {
+    auto s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
+    Lock lock(&s->mtx);
+    thr->clock.ReleaseStore(&s->clock);
+    NoTsanAtomicStore(a, v, mo);
+  }
+  IncrementEpoch(thr);
 }
 
 template <typename T, T (*F)(volatile T *v, T op)>
@@ -285,18 +286,21 @@ static T AtomicRMW(ThreadState *thr, uptr pc, volatile T *a, T v, morder mo) {
   MemoryAccess(thr, pc, (uptr)a, AccessSize<T>(), kAccessWrite | kAccessAtomic);
   if (LIKELY(mo == mo_relaxed))
     return F(a, v);
-  SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
-  Lock l(&s->mtx);
-  thr->fast_state.IncrementEpoch();
-  // Can't increment epoch w/o writing to the trace as well.
-  TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
-  if (IsAcqRelOrder(mo))
-    AcquireReleaseImpl(thr, pc, &s->clock);
-  else if (IsReleaseOrder(mo))
-    ReleaseImpl(thr, pc, &s->clock);
-  else if (IsAcquireOrder(mo))
-    AcquireImpl(thr, pc, &s->clock);
-  return F(a, v);
+  SlotLocker locker(thr);
+  {
+    auto s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
+    RWLock lock(&s->mtx, IsReleaseOrder(mo));
+    if (IsAcqRelOrder(mo))
+      thr->clock.ReleaseAcquire(&s->clock);
+    else if (IsReleaseOrder(mo))
+      thr->clock.Release(&s->clock);
+    else if (IsAcquireOrder(mo))
+      thr->clock.Acquire(s->clock);
+    v = F(a, v);
+  }
+  if (IsReleaseOrder(mo))
+    IncrementEpoch(thr);
+  return v;
 }
 
 template<typename T>
@@ -416,27 +420,28 @@ static bool AtomicCAS(ThreadState *thr, uptr pc, volatile T *a, T *c, T v,
     *c = pr;
     return false;
   }
-
+  SlotLocker locker(thr);
   bool release = IsReleaseOrder(mo);
-  SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
-  RWLock l(&s->mtx, release);
-  T cc = *c;
-  T pr = func_cas(a, cc, v);
-  bool success = pr == cc;
-  if (!success) {
-    *c = pr;
-    mo = fmo;
+  bool success;
+  {
+    auto s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
+    RWLock lock(&s->mtx, release);
+    T cc = *c;
+    T pr = func_cas(a, cc, v);
+    success = pr == cc;
+    if (!success) {
+      *c = pr;
+      mo = fmo;
+    }
+    if (success && IsAcqRelOrder(mo))
+      thr->clock.ReleaseAcquire(&s->clock);
+    else if (success && IsReleaseOrder(mo))
+      thr->clock.Release(&s->clock);
+    else if (IsAcquireOrder(mo))
+      thr->clock.Acquire(s->clock);
   }
-  thr->fast_state.IncrementEpoch();
-  // Can't increment epoch w/o writing to the trace as well.
-  TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
-
-  if (success && IsAcqRelOrder(mo))
-    AcquireReleaseImpl(thr, pc, &s->clock);
-  else if (success && IsReleaseOrder(mo))
-    ReleaseImpl(thr, pc, &s->clock);
-  else if (IsAcquireOrder(mo))
-    AcquireImpl(thr, pc, &s->clock);
+  if (success && release)
+    IncrementEpoch(thr);
   return success;
 }
 
diff --git a/libsanitizer/tsan/tsan_interface_java.cpp b/libsanitizer/tsan/tsan_interface_java.cpp
index c090c1f08cb..7c15a163882 100644
--- a/libsanitizer/tsan/tsan_interface_java.cpp
+++ b/libsanitizer/tsan/tsan_interface_java.cpp
@@ -106,7 +106,7 @@ void __tsan_java_free(jptr ptr, jptr size) {
   DCHECK_GE(ptr, jctx->heap_begin);
   DCHECK_LE(ptr + size, jctx->heap_begin + jctx->heap_size);
 
-  ctx->metamap.FreeRange(thr->proc(), ptr, size);
+  ctx->metamap.FreeRange(thr->proc(), ptr, size, false);
 }
 
 void __tsan_java_move(jptr src, jptr dst, jptr size) {
@@ -133,7 +133,7 @@ void __tsan_java_move(jptr src, jptr dst, jptr size) {
   // support that anymore as it contains addresses of accesses.
   RawShadow *d = MemToShadow(dst);
   RawShadow *dend = MemToShadow(dst + size);
-  internal_memset(d, 0, (dend - d) * sizeof(*d));
+  ShadowSet(d, dend, Shadow::kEmpty);
 }
 
 jptr __tsan_java_find(jptr *from_ptr, jptr to) {
diff --git a/libsanitizer/tsan/tsan_mman.cpp b/libsanitizer/tsan/tsan_mman.cpp
index f1b6768c592..9217b72bbb4 100644
--- a/libsanitizer/tsan/tsan_mman.cpp
+++ b/libsanitizer/tsan/tsan_mman.cpp
@@ -219,8 +219,17 @@ void *user_reallocarray(ThreadState *thr, uptr pc, void *p, uptr size, uptr n) {
 
 void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write) {
   DPrintf("#%d: alloc(%zu) = 0x%zx\n", thr->tid, sz, p);
+  // Note: this can run before thread initialization/after finalization.
+  // As a result this is not necessarily synchronized with DoReset,
+  // which iterates over and resets all sync objects,
+  // but it is fine to create new MBlocks in this context.
   ctx->metamap.AllocBlock(thr, pc, p, sz);
-  if (write && thr->ignore_reads_and_writes == 0)
+  // If this runs before thread initialization/after finalization
+  // and we don't have trace initialized, we can't imitate writes.
+  // In that case just reset the shadow range; this is fine since
+  // it affects only a small fraction of special objects.
+  if (write && thr->ignore_reads_and_writes == 0 &&
+      atomic_load_relaxed(&thr->trace_pos))
     MemoryRangeImitateWrite(thr, pc, (uptr)p, sz);
   else
     MemoryResetRange(thr, pc, (uptr)p, sz);
@@ -228,7 +237,14 @@ void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write) {
 
 void OnUserFree(ThreadState *thr, uptr pc, uptr p, bool write) {
   CHECK_NE(p, (void*)0);
-  uptr sz = ctx->metamap.FreeBlock(thr->proc(), p);
+  if (!thr->slot) {
+    // Very early/late in thread lifetime, or during fork.
+    UNUSED uptr sz = ctx->metamap.FreeBlock(thr->proc(), p, false);
+    DPrintf("#%d: free(0x%zx, %zu) (no slot)\n", thr->tid, p, sz);
+    return;
+  }
+  SlotLocker locker(thr);
+  uptr sz = ctx->metamap.FreeBlock(thr->proc(), p, true);
   DPrintf("#%d: free(0x%zx, %zu)\n", thr->tid, p, sz);
   if (write && thr->ignore_reads_and_writes == 0)
     MemoryRangeFreed(thr, pc, (uptr)p, sz);
@@ -393,8 +409,6 @@ uptr __sanitizer_get_allocated_size(const void *p) {
 
 void __tsan_on_thread_idle() {
   ThreadState *thr = cur_thread();
-  thr->clock.ResetCached(&thr->proc()->clock_cache);
-  thr->last_sleep_clock.ResetCached(&thr->proc()->clock_cache);
   allocator()->SwallowCache(&thr->proc()->alloc_cache);
   internal_allocator()->SwallowCache(&thr->proc()->internal_alloc_cache);
   ctx->metamap.OnProcIdle(thr->proc());
diff --git a/libsanitizer/tsan/tsan_mutexset.cpp b/libsanitizer/tsan/tsan_mutexset.cpp
index 735179686ba..3a75b80ac30 100644
--- a/libsanitizer/tsan/tsan_mutexset.cpp
+++ b/libsanitizer/tsan/tsan_mutexset.cpp
@@ -19,57 +19,7 @@ namespace __tsan {
 MutexSet::MutexSet() {
 }
 
-void MutexSet::Add(u64 id, bool write, u64 epoch) {
-  // Look up existing mutex with the same id.
-  for (uptr i = 0; i < size_; i++) {
-    if (descs_[i].id == id) {
-      descs_[i].count++;
-      descs_[i].epoch = epoch;
-      return;
-    }
-  }
-  // On overflow, find the oldest mutex and drop it.
-  if (size_ == kMaxSize) {
-    u64 minepoch = (u64)-1;
-    u64 mini = (u64)-1;
-    for (uptr i = 0; i < size_; i++) {
-      if (descs_[i].epoch < minepoch) {
-        minepoch = descs_[i].epoch;
-        mini = i;
-      }
-    }
-    RemovePos(mini);
-    CHECK_EQ(size_, kMaxSize - 1);
-  }
-  // Add new mutex descriptor.
-  descs_[size_].addr = 0;
-  descs_[size_].stack_id = kInvalidStackID;
-  descs_[size_].id = id;
-  descs_[size_].write = write;
-  descs_[size_].epoch = epoch;
-  descs_[size_].seq = seq_++;
-  descs_[size_].count = 1;
-  size_++;
-}
-
-void MutexSet::Del(u64 id, bool write) {
-  for (uptr i = 0; i < size_; i++) {
-    if (descs_[i].id == id) {
-      if (--descs_[i].count == 0)
-        RemovePos(i);
-      return;
-    }
-  }
-}
-
-void MutexSet::Remove(u64 id) {
-  for (uptr i = 0; i < size_; i++) {
-    if (descs_[i].id == id) {
-      RemovePos(i);
-      return;
-    }
-  }
-}
+void MutexSet::Reset() { internal_memset(this, 0, sizeof(*this)); }
 
 void MutexSet::AddAddr(uptr addr, StackID stack_id, bool write) {
   // Look up existing mutex with the same id.
@@ -93,9 +43,7 @@ void MutexSet::AddAddr(uptr addr, StackID stack_id, bool write) {
   // Add new mutex descriptor.
   descs_[size_].addr = addr;
   descs_[size_].stack_id = stack_id;
-  descs_[size_].id = 0;
   descs_[size_].write = write;
-  descs_[size_].epoch = 0;
   descs_[size_].seq = seq_++;
   descs_[size_].count = 1;
   size_++;
diff --git a/libsanitizer/tsan/tsan_mutexset.h b/libsanitizer/tsan/tsan_mutexset.h
index 93776a66413..aabd361e6af 100644
--- a/libsanitizer/tsan/tsan_mutexset.h
+++ b/libsanitizer/tsan/tsan_mutexset.h
@@ -25,8 +25,6 @@ class MutexSet {
   struct Desc {
     uptr addr;
     StackID stack_id;
-    u64 id;
-    u64 epoch;
     u32 seq;
     u32 count;
     bool write;
@@ -40,10 +38,7 @@ class MutexSet {
   };
 
   MutexSet();
-  // The 'id' is obtained from SyncVar::GetId().
-  void Add(u64 id, bool write, u64 epoch);
-  void Del(u64 id, bool write);
-  void Remove(u64 id);  // Removes the mutex completely (if it's destroyed).
+  void Reset();
   void AddAddr(uptr addr, StackID stack_id, bool write);
   void DelAddr(uptr addr, bool destroy = false);
   uptr Size() const;
@@ -82,9 +77,7 @@ class DynamicMutexSet {
 // in different goroutine).
 #if SANITIZER_GO
 MutexSet::MutexSet() {}
-void MutexSet::Add(u64 id, bool write, u64 epoch) {}
-void MutexSet::Del(u64 id, bool write) {}
-void MutexSet::Remove(u64 id) {}
+void MutexSet::Reset() {}
 void MutexSet::AddAddr(uptr addr, StackID stack_id, bool write) {}
 void MutexSet::DelAddr(uptr addr, bool destroy) {}
 uptr MutexSet::Size() const { return 0; }
diff --git a/libsanitizer/tsan/tsan_platform.h b/libsanitizer/tsan/tsan_platform.h
index 7ff0acace8f..e28bac2457a 100644
--- a/libsanitizer/tsan/tsan_platform.h
+++ b/libsanitizer/tsan/tsan_platform.h
@@ -18,8 +18,8 @@
 # error "Only 64-bit is supported"
 #endif
 
+#include "sanitizer_common/sanitizer_common.h"
 #include "tsan_defs.h"
-#include "tsan_trace.h"
 
 namespace __tsan {
 
@@ -45,9 +45,7 @@ C/C++ on linux/x86_64 and freebsd/x86_64
 3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects)
 4000 0000 0000 - 5500 0000 0000: -
 5500 0000 0000 - 5680 0000 0000: pie binaries without ASLR or on 4.1+ kernels
-5680 0000 0000 - 6000 0000 0000: -
-6000 0000 0000 - 6200 0000 0000: traces
-6200 0000 0000 - 7d00 0000 0000: -
+5680 0000 0000 - 7d00 0000 0000: -
 7b00 0000 0000 - 7c00 0000 0000: heap
 7c00 0000 0000 - 7e80 0000 0000: -
 7e80 0000 0000 - 8000 0000 0000: modules and main thread stack
@@ -67,8 +65,6 @@ C/C++ on netbsd/amd64 can reuse the same mapping:
 struct Mapping48AddressSpace {
   static const uptr kMetaShadowBeg = 0x300000000000ull;
   static const uptr kMetaShadowEnd = 0x340000000000ull;
-  static const uptr kTraceMemBeg   = 0x600000000000ull;
-  static const uptr kTraceMemEnd   = 0x620000000000ull;
   static const uptr kShadowBeg     = 0x010000000000ull;
   static const uptr kShadowEnd     = 0x200000000000ull;
   static const uptr kHeapMemBeg    = 0x7b0000000000ull;
@@ -89,14 +85,12 @@ struct Mapping48AddressSpace {
 C/C++ on linux/mips64 (40-bit VMA)
 0000 0000 00 - 0100 0000 00: -                                           (4 GB)
 0100 0000 00 - 0200 0000 00: main binary                                 (4 GB)
-0200 0000 00 - 2000 0000 00: -                                         (120 GB)
-2000 0000 00 - 4000 0000 00: shadow                                    (128 GB)
+0200 0000 00 - 1200 0000 00: -                                         (120 GB)
+1200 0000 00 - 4000 0000 00: shadow                                    (128 GB)
 4000 0000 00 - 5000 0000 00: metainfo (memory blocks and sync objects)  (64 GB)
 5000 0000 00 - aa00 0000 00: -                                         (360 GB)
 aa00 0000 00 - ab00 0000 00: main binary (PIE)                           (4 GB)
-ab00 0000 00 - b000 0000 00: -                                          (20 GB)
-b000 0000 00 - b200 0000 00: traces                                      (8 GB)
-b200 0000 00 - fe00 0000 00: -                                         (304 GB)
+ab00 0000 00 - fe00 0000 00: -                                         (332 GB)
 fe00 0000 00 - ff00 0000 00: heap                                        (4 GB)
 ff00 0000 00 - ff80 0000 00: -                                           (2 GB)
 ff80 0000 00 - ffff ffff ff: modules and main thread stack              (<2 GB)
@@ -104,9 +98,7 @@ ff80 0000 00 - ffff ffff ff: modules and main thread stack              (<2 GB)
 struct MappingMips64_40 {
   static const uptr kMetaShadowBeg = 0x4000000000ull;
   static const uptr kMetaShadowEnd = 0x5000000000ull;
-  static const uptr kTraceMemBeg   = 0xb000000000ull;
-  static const uptr kTraceMemEnd   = 0xb200000000ull;
-  static const uptr kShadowBeg     = 0x2000000000ull;
+  static const uptr kShadowBeg = 0x1200000000ull;
   static const uptr kShadowEnd     = 0x4000000000ull;
   static const uptr kHeapMemBeg    = 0xfe00000000ull;
   static const uptr kHeapMemEnd    = 0xff00000000ull;
@@ -131,9 +123,7 @@ C/C++ on Darwin/iOS/ARM64 (36-bit VMA, 64 GB VM)
 0400 0000 00 - 0c00 0000 00: shadow memory                       (32 GB)
 0c00 0000 00 - 0d00 0000 00: -                                    (4 GB)
 0d00 0000 00 - 0e00 0000 00: metainfo                             (4 GB)
-0e00 0000 00 - 0f00 0000 00: -                                    (4 GB)
-0f00 0000 00 - 0fc0 0000 00: traces                               (3 GB)
-0fc0 0000 00 - 1000 0000 00: -
+0e00 0000 00 - 1000 0000 00: -
 */
 struct MappingAppleAarch64 {
   static const uptr kLoAppMemBeg   = 0x0100000000ull;
@@ -144,13 +134,11 @@ struct MappingAppleAarch64 {
   static const uptr kShadowEnd     = 0x0c00000000ull;
   static const uptr kMetaShadowBeg = 0x0d00000000ull;
   static const uptr kMetaShadowEnd = 0x0e00000000ull;
-  static const uptr kTraceMemBeg   = 0x0f00000000ull;
-  static const uptr kTraceMemEnd   = 0x0fc0000000ull;
   static const uptr kHiAppMemBeg   = 0x0fc0000000ull;
   static const uptr kHiAppMemEnd   = 0x0fc0000000ull;
   static const uptr kShadowMsk = 0x0ull;
   static const uptr kShadowXor = 0x0ull;
-  static const uptr kShadowAdd = 0x0ull;
+  static const uptr kShadowAdd = 0x0200000000ull;
   static const uptr kVdsoBeg       = 0x7000000000000000ull;
   static const uptr kMidAppMemBeg = 0;
   static const uptr kMidAppMemEnd = 0;
@@ -159,29 +147,25 @@ struct MappingAppleAarch64 {
 /*
 C/C++ on linux/aarch64 (39-bit VMA)
 0000 0010 00 - 0100 0000 00: main binary
-0100 0000 00 - 0800 0000 00: -
-0800 0000 00 - 2000 0000 00: shadow memory
+0100 0000 00 - 0400 0000 00: -
+0400 0000 00 - 2000 0000 00: shadow memory
 2000 0000 00 - 3100 0000 00: -
 3100 0000 00 - 3400 0000 00: metainfo
 3400 0000 00 - 5500 0000 00: -
 5500 0000 00 - 5600 0000 00: main binary (PIE)
-5600 0000 00 - 6000 0000 00: -
-6000 0000 00 - 6200 0000 00: traces
-6200 0000 00 - 7d00 0000 00: -
+5600 0000 00 - 7c00 0000 00: -
 7c00 0000 00 - 7d00 0000 00: heap
 7d00 0000 00 - 7fff ffff ff: modules and main thread stack
 */
 struct MappingAarch64_39 {
   static const uptr kLoAppMemBeg   = 0x0000001000ull;
   static const uptr kLoAppMemEnd   = 0x0100000000ull;
-  static const uptr kShadowBeg     = 0x0800000000ull;
+  static const uptr kShadowBeg = 0x0400000000ull;
   static const uptr kShadowEnd     = 0x2000000000ull;
   static const uptr kMetaShadowBeg = 0x3100000000ull;
   static const uptr kMetaShadowEnd = 0x3400000000ull;
   static const uptr kMidAppMemBeg  = 0x5500000000ull;
-  static const uptr kMidAppMemEnd  = 0x5600000000ull;
-  static const uptr kTraceMemBeg   = 0x6000000000ull;
-  static const uptr kTraceMemEnd   = 0x6200000000ull;
+  static const uptr kMidAppMemEnd = 0x5600000000ull;
   static const uptr kHeapMemBeg    = 0x7c00000000ull;
   static const uptr kHeapMemEnd    = 0x7d00000000ull;
   static const uptr kHiAppMemBeg   = 0x7e00000000ull;
@@ -195,15 +179,13 @@ struct MappingAarch64_39 {
 /*
 C/C++ on linux/aarch64 (42-bit VMA)
 00000 0010 00 - 01000 0000 00: main binary
-01000 0000 00 - 10000 0000 00: -
-10000 0000 00 - 20000 0000 00: shadow memory
+01000 0000 00 - 08000 0000 00: -
+08000 0000 00 - 20000 0000 00: shadow memory
 20000 0000 00 - 26000 0000 00: -
 26000 0000 00 - 28000 0000 00: metainfo
 28000 0000 00 - 2aa00 0000 00: -
 2aa00 0000 00 - 2ab00 0000 00: main binary (PIE)
-2ab00 0000 00 - 36200 0000 00: -
-36200 0000 00 - 36240 0000 00: traces
-36240 0000 00 - 3e000 0000 00: -
+2ab00 0000 00 - 3e000 0000 00: -
 3e000 0000 00 - 3f000 0000 00: heap
 3f000 0000 00 - 3ffff ffff ff: modules and main thread stack
 */
@@ -211,14 +193,12 @@ struct MappingAarch64_42 {
   static const uptr kBroken = kBrokenReverseMapping;
   static const uptr kLoAppMemBeg   = 0x00000001000ull;
   static const uptr kLoAppMemEnd   = 0x01000000000ull;
-  static const uptr kShadowBeg     = 0x10000000000ull;
+  static const uptr kShadowBeg = 0x08000000000ull;
   static const uptr kShadowEnd     = 0x20000000000ull;
   static const uptr kMetaShadowBeg = 0x26000000000ull;
   static const uptr kMetaShadowEnd = 0x28000000000ull;
   static const uptr kMidAppMemBeg  = 0x2aa00000000ull;
-  static const uptr kMidAppMemEnd  = 0x2ab00000000ull;
-  static const uptr kTraceMemBeg   = 0x36200000000ull;
-  static const uptr kTraceMemEnd   = 0x36400000000ull;
+  static const uptr kMidAppMemEnd = 0x2ab00000000ull;
   static const uptr kHeapMemBeg    = 0x3e000000000ull;
   static const uptr kHeapMemEnd    = 0x3f000000000ull;
   static const uptr kHiAppMemBeg   = 0x3f000000000ull;
@@ -232,14 +212,12 @@ struct MappingAarch64_42 {
 struct MappingAarch64_48 {
   static const uptr kLoAppMemBeg   = 0x0000000001000ull;
   static const uptr kLoAppMemEnd   = 0x0000200000000ull;
-  static const uptr kShadowBeg     = 0x0002000000000ull;
+  static const uptr kShadowBeg = 0x0001000000000ull;
   static const uptr kShadowEnd     = 0x0004000000000ull;
   static const uptr kMetaShadowBeg = 0x0005000000000ull;
   static const uptr kMetaShadowEnd = 0x0006000000000ull;
   static const uptr kMidAppMemBeg  = 0x0aaaa00000000ull;
-  static const uptr kMidAppMemEnd  = 0x0aaaf00000000ull;
-  static const uptr kTraceMemBeg   = 0x0f06000000000ull;
-  static const uptr kTraceMemEnd   = 0x0f06200000000ull;
+  static const uptr kMidAppMemEnd = 0x0aaaf00000000ull;
   static const uptr kHeapMemBeg    = 0x0ffff00000000ull;
   static const uptr kHeapMemEnd    = 0x0ffff00000000ull;
   static const uptr kHiAppMemBeg   = 0x0ffff00000000ull;
@@ -257,9 +235,7 @@ C/C++ on linux/powerpc64 (44-bit VMA)
 0001 0000 0000 - 0b00 0000 0000: shadow
 0b00 0000 0000 - 0b00 0000 0000: -
 0b00 0000 0000 - 0d00 0000 0000: metainfo (memory blocks and sync objects)
-0d00 0000 0000 - 0d00 0000 0000: -
-0d00 0000 0000 - 0f00 0000 0000: traces
-0f00 0000 0000 - 0f00 0000 0000: -
+0d00 0000 0000 - 0f00 0000 0000: -
 0f00 0000 0000 - 0f50 0000 0000: heap
 0f50 0000 0000 - 0f60 0000 0000: -
 0f60 0000 0000 - 1000 0000 0000: modules and main thread stack
@@ -269,8 +245,6 @@ struct MappingPPC64_44 {
       kBrokenMapping | kBrokenReverseMapping | kBrokenLinearity;
   static const uptr kMetaShadowBeg = 0x0b0000000000ull;
   static const uptr kMetaShadowEnd = 0x0d0000000000ull;
-  static const uptr kTraceMemBeg   = 0x0d0000000000ull;
-  static const uptr kTraceMemEnd   = 0x0f0000000000ull;
   static const uptr kShadowBeg     = 0x000100000000ull;
   static const uptr kShadowEnd     = 0x0b0000000000ull;
   static const uptr kLoAppMemBeg   = 0x000000000100ull;
@@ -295,8 +269,7 @@ C/C++ on linux/powerpc64 (46-bit VMA)
 1000 0000 0000 - 1000 0000 0000: -
 1000 0000 0000 - 2000 0000 0000: metainfo (memory blocks and sync objects)
 2000 0000 0000 - 2000 0000 0000: -
-2000 0000 0000 - 2200 0000 0000: traces
-2200 0000 0000 - 3d00 0000 0000: -
+1200 0000 0000 - 3d00 0000 0000: -
 3d00 0000 0000 - 3e00 0000 0000: heap
 3e00 0000 0000 - 3e80 0000 0000: -
 3e80 0000 0000 - 4000 0000 0000: modules and main thread stack
@@ -304,8 +277,6 @@ C/C++ on linux/powerpc64 (46-bit VMA)
 struct MappingPPC64_46 {
   static const uptr kMetaShadowBeg = 0x100000000000ull;
   static const uptr kMetaShadowEnd = 0x200000000000ull;
-  static const uptr kTraceMemBeg   = 0x200000000000ull;
-  static const uptr kTraceMemEnd   = 0x220000000000ull;
   static const uptr kShadowBeg     = 0x010000000000ull;
   static const uptr kShadowEnd     = 0x100000000000ull;
   static const uptr kHeapMemBeg    = 0x3d0000000000ull;
@@ -329,9 +300,7 @@ C/C++ on linux/powerpc64 (47-bit VMA)
 0100 0000 0000 - 1000 0000 0000: shadow
 1000 0000 0000 - 1000 0000 0000: -
 1000 0000 0000 - 2000 0000 0000: metainfo (memory blocks and sync objects)
-2000 0000 0000 - 2000 0000 0000: -
-2000 0000 0000 - 2200 0000 0000: traces
-2200 0000 0000 - 7d00 0000 0000: -
+2000 0000 0000 - 7d00 0000 0000: -
 7d00 0000 0000 - 7e00 0000 0000: heap
 7e00 0000 0000 - 7e80 0000 0000: -
 7e80 0000 0000 - 8000 0000 0000: modules and main thread stack
@@ -339,8 +308,6 @@ C/C++ on linux/powerpc64 (47-bit VMA)
 struct MappingPPC64_47 {
   static const uptr kMetaShadowBeg = 0x100000000000ull;
   static const uptr kMetaShadowEnd = 0x200000000000ull;
-  static const uptr kTraceMemBeg   = 0x200000000000ull;
-  static const uptr kTraceMemEnd   = 0x220000000000ull;
   static const uptr kShadowBeg     = 0x010000000000ull;
   static const uptr kShadowEnd     = 0x100000000000ull;
   static const uptr kHeapMemBeg    = 0x7d0000000000ull;
@@ -362,21 +329,17 @@ C/C++ on linux/s390x
 While the kernel provides a 64-bit address space, we have to restrict ourselves
 to 48 bits due to how e.g. SyncVar::GetId() works.
 0000 0000 1000 - 0e00 0000 0000: binary, modules, stacks - 14 TiB
-0e00 0000 0000 - 4000 0000 0000: -
-4000 0000 0000 - 8000 0000 0000: shadow - 64TiB (4 * app)
+0e00 0000 0000 - 2000 0000 0000: -
+2000 0000 0000 - 8000 0000 0000: shadow - 64TiB (4 * app)
 8000 0000 0000 - 9000 0000 0000: -
 9000 0000 0000 - 9800 0000 0000: metainfo - 8TiB (0.5 * app)
-9800 0000 0000 - a000 0000 0000: -
-a000 0000 0000 - b000 0000 0000: traces - 16TiB (max history * 128k threads)
-b000 0000 0000 - be00 0000 0000: -
+9800 0000 0000 - be00 0000 0000: -
 be00 0000 0000 - c000 0000 0000: heap - 2TiB (max supported by the allocator)
 */
 struct MappingS390x {
   static const uptr kMetaShadowBeg = 0x900000000000ull;
   static const uptr kMetaShadowEnd = 0x980000000000ull;
-  static const uptr kTraceMemBeg   = 0xa00000000000ull;
-  static const uptr kTraceMemEnd   = 0xb00000000000ull;
-  static const uptr kShadowBeg     = 0x400000000000ull;
+  static const uptr kShadowBeg = 0x200000000000ull;
   static const uptr kShadowEnd     = 0x800000000000ull;
   static const uptr kHeapMemBeg    = 0xbe0000000000ull;
   static const uptr kHeapMemEnd    = 0xc00000000000ull;
@@ -400,16 +363,12 @@ struct MappingS390x {
 2000 0000 0000 - 2380 0000 0000: shadow
 2380 0000 0000 - 3000 0000 0000: -
 3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects)
-4000 0000 0000 - 6000 0000 0000: -
-6000 0000 0000 - 6200 0000 0000: traces
-6200 0000 0000 - 8000 0000 0000: -
+4000 0000 0000 - 8000 0000 0000: -
 */
 
 struct MappingGo48 {
   static const uptr kMetaShadowBeg = 0x300000000000ull;
   static const uptr kMetaShadowEnd = 0x400000000000ull;
-  static const uptr kTraceMemBeg   = 0x600000000000ull;
-  static const uptr kTraceMemEnd   = 0x620000000000ull;
   static const uptr kShadowBeg     = 0x200000000000ull;
   static const uptr kShadowEnd     = 0x238000000000ull;
   static const uptr kLoAppMemBeg = 0x000000001000ull;
@@ -432,7 +391,7 @@ struct MappingGo48 {
 00c0 0000 0000 - 00e0 0000 0000: heap
 00e0 0000 0000 - 0100 0000 0000: -
 0100 0000 0000 - 0500 0000 0000: shadow
-0500 0000 0000 - 0700 0000 0000: traces
+0500 0000 0000 - 0700 0000 0000: -
 0700 0000 0000 - 0770 0000 0000: metainfo (memory blocks and sync objects)
 07d0 0000 0000 - 8000 0000 0000: -
 */
@@ -440,8 +399,6 @@ struct MappingGo48 {
 struct MappingGoWindows {
   static const uptr kMetaShadowBeg = 0x070000000000ull;
   static const uptr kMetaShadowEnd = 0x077000000000ull;
-  static const uptr kTraceMemBeg = 0x050000000000ull;
-  static const uptr kTraceMemEnd = 0x070000000000ull;
   static const uptr kShadowBeg     = 0x010000000000ull;
   static const uptr kShadowEnd     = 0x050000000000ull;
   static const uptr kLoAppMemBeg = 0x000000001000ull;
@@ -466,16 +423,12 @@ struct MappingGoWindows {
 2000 0000 0000 - 2380 0000 0000: shadow
 2380 0000 0000 - 2400 0000 0000: -
 2400 0000 0000 - 3400 0000 0000: metainfo (memory blocks and sync objects)
-3400 0000 0000 - 3600 0000 0000: -
-3600 0000 0000 - 3800 0000 0000: traces
-3800 0000 0000 - 4000 0000 0000: -
+3400 0000 0000 - 4000 0000 0000: -
 */
 
 struct MappingGoPPC64_46 {
   static const uptr kMetaShadowBeg = 0x240000000000ull;
   static const uptr kMetaShadowEnd = 0x340000000000ull;
-  static const uptr kTraceMemBeg   = 0x360000000000ull;
-  static const uptr kTraceMemEnd   = 0x380000000000ull;
   static const uptr kShadowBeg     = 0x200000000000ull;
   static const uptr kShadowEnd     = 0x238000000000ull;
   static const uptr kLoAppMemBeg = 0x000000001000ull;
@@ -500,16 +453,12 @@ struct MappingGoPPC64_46 {
 2000 0000 0000 - 3000 0000 0000: shadow
 3000 0000 0000 - 3000 0000 0000: -
 3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects)
-4000 0000 0000 - 6000 0000 0000: -
-6000 0000 0000 - 6200 0000 0000: traces
-6200 0000 0000 - 8000 0000 0000: -
+4000 0000 0000 - 8000 0000 0000: -
 */
 
 struct MappingGoPPC64_47 {
   static const uptr kMetaShadowBeg = 0x300000000000ull;
   static const uptr kMetaShadowEnd = 0x400000000000ull;
-  static const uptr kTraceMemBeg   = 0x600000000000ull;
-  static const uptr kTraceMemEnd   = 0x620000000000ull;
   static const uptr kShadowBeg     = 0x200000000000ull;
   static const uptr kShadowEnd     = 0x300000000000ull;
   static const uptr kLoAppMemBeg = 0x000000001000ull;
@@ -534,15 +483,11 @@ struct MappingGoPPC64_47 {
 2000 0000 0000 - 3000 0000 0000: shadow
 3000 0000 0000 - 3000 0000 0000: -
 3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects)
-4000 0000 0000 - 6000 0000 0000: -
-6000 0000 0000 - 6200 0000 0000: traces
-6200 0000 0000 - 8000 0000 0000: -
+4000 0000 0000 - 8000 0000 0000: -
 */
 struct MappingGoAarch64 {
   static const uptr kMetaShadowBeg = 0x300000000000ull;
   static const uptr kMetaShadowEnd = 0x400000000000ull;
-  static const uptr kTraceMemBeg   = 0x600000000000ull;
-  static const uptr kTraceMemEnd   = 0x620000000000ull;
   static const uptr kShadowBeg     = 0x200000000000ull;
   static const uptr kShadowEnd     = 0x300000000000ull;
   static const uptr kLoAppMemBeg = 0x000000001000ull;
@@ -568,15 +513,11 @@ Go on linux/mips64 (47-bit VMA)
 2000 0000 0000 - 3000 0000 0000: shadow
 3000 0000 0000 - 3000 0000 0000: -
 3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects)
-4000 0000 0000 - 6000 0000 0000: -
-6000 0000 0000 - 6200 0000 0000: traces
-6200 0000 0000 - 8000 0000 0000: -
+4000 0000 0000 - 8000 0000 0000: -
 */
 struct MappingGoMips64_47 {
   static const uptr kMetaShadowBeg = 0x300000000000ull;
   static const uptr kMetaShadowEnd = 0x400000000000ull;
-  static const uptr kTraceMemBeg = 0x600000000000ull;
-  static const uptr kTraceMemEnd = 0x620000000000ull;
   static const uptr kShadowBeg = 0x200000000000ull;
   static const uptr kShadowEnd = 0x300000000000ull;
   static const uptr kLoAppMemBeg = 0x000000001000ull;
@@ -600,14 +541,10 @@ Go on linux/s390x
 4000 0000 0000 - 8000 0000 0000: shadow - 64TiB (4 * app)
 8000 0000 0000 - 9000 0000 0000: -
 9000 0000 0000 - 9800 0000 0000: metainfo - 8TiB (0.5 * app)
-9800 0000 0000 - a000 0000 0000: -
-a000 0000 0000 - b000 0000 0000: traces - 16TiB (max history * 128k threads)
 */
 struct MappingGoS390x {
   static const uptr kMetaShadowBeg = 0x900000000000ull;
   static const uptr kMetaShadowEnd = 0x980000000000ull;
-  static const uptr kTraceMemBeg   = 0xa00000000000ull;
-  static const uptr kTraceMemEnd   = 0xb00000000000ull;
   static const uptr kShadowBeg     = 0x400000000000ull;
   static const uptr kShadowEnd     = 0x800000000000ull;
   static const uptr kLoAppMemBeg = 0x000000001000ull;
@@ -715,8 +652,6 @@ enum MappingType {
   kShadowEnd,
   kMetaShadowBeg,
   kMetaShadowEnd,
-  kTraceMemBeg,
-  kTraceMemEnd,
   kVdsoBeg,
 };
 
@@ -750,10 +685,6 @@ struct MappingField {
         return Mapping::kMetaShadowBeg;
       case kMetaShadowEnd:
         return Mapping::kMetaShadowEnd;
-      case kTraceMemBeg:
-        return Mapping::kTraceMemBeg;
-      case kTraceMemEnd:
-        return Mapping::kTraceMemEnd;
     }
     Die();
   }
@@ -792,11 +723,6 @@ uptr MetaShadowBeg(void) { return SelectMapping<MappingField>(kMetaShadowBeg); }
 ALWAYS_INLINE
 uptr MetaShadowEnd(void) { return SelectMapping<MappingField>(kMetaShadowEnd); }
 
-ALWAYS_INLINE
-uptr TraceMemBeg(void) { return SelectMapping<MappingField>(kTraceMemBeg); }
-ALWAYS_INLINE
-uptr TraceMemEnd(void) { return SelectMapping<MappingField>(kTraceMemEnd); }
-
 struct IsAppMemImpl {
   template <typename Mapping>
   static bool Apply(uptr mem) {
@@ -934,43 +860,10 @@ inline uptr RestoreAddr(uptr addr) {
   return SelectMapping<RestoreAddrImpl>(addr);
 }
 
-// The additional page is to catch shadow stack overflow as paging fault.
-// Windows wants 64K alignment for mmaps.
-const uptr kTotalTraceSize = (kTraceSize * sizeof(Event) + sizeof(Trace)
-    + (64 << 10) + (64 << 10) - 1) & ~((64 << 10) - 1);
-
-struct GetThreadTraceImpl {
-  template <typename Mapping>
-  static uptr Apply(uptr tid) {
-    uptr p = Mapping::kTraceMemBeg + tid * kTotalTraceSize;
-    DCHECK_LT(p, Mapping::kTraceMemEnd);
-    return p;
-  }
-};
-
-ALWAYS_INLINE
-uptr GetThreadTrace(int tid) { return SelectMapping<GetThreadTraceImpl>(tid); }
-
-struct GetThreadTraceHeaderImpl {
-  template <typename Mapping>
-  static uptr Apply(uptr tid) {
-    uptr p = Mapping::kTraceMemBeg + tid * kTotalTraceSize +
-             kTraceSize * sizeof(Event);
-    DCHECK_LT(p, Mapping::kTraceMemEnd);
-    return p;
-  }
-};
-
-ALWAYS_INLINE
-uptr GetThreadTraceHeader(int tid) {
-  return SelectMapping<GetThreadTraceHeaderImpl>(tid);
-}
-
 void InitializePlatform();
 void InitializePlatformEarly();
 void CheckAndProtect();
 void InitializeShadowMemoryPlatform();
-void FlushShadowMemory();
 void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns);
 int ExtractResolvFDs(void *state, int *fds, int nfd);
 int ExtractRecvmsgFDs(void *msg, int *fds, int nfd);
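
The Mapping* structs above describe each platform's address-space carving purely through compile-time constants, and the runtime reads them through a SelectMapping-style template dispatch (the MappingField::Apply switch earlier in this hunk is one such accessor). A minimal sketch of that pattern follows; the two mapping structs, the VMA-size check and the constants are invented for illustration, not taken from tsan.

#include <cstdint>
#include <cstdio>

using uptr = uintptr_t;

// Two hypothetical mappings exposing the same set of constants.
struct Mapping48 {
  static constexpr uptr kShadowBeg = 0x010000000000ull;
  static constexpr uptr kShadowEnd = 0x100000000000ull;
};
struct Mapping47 {
  static constexpr uptr kShadowBeg = 0x010000000000ull;
  static constexpr uptr kShadowEnd = 0x080000000000ull;
};

enum MappingType { kShadowBeg, kShadowEnd };

struct MappingField {
  template <typename Mapping>
  static uptr Apply(MappingType type) {
    switch (type) {
      case kShadowBeg: return Mapping::kShadowBeg;
      case kShadowEnd: return Mapping::kShadowEnd;
    }
    return 0;
  }
};

// Stand-in for SelectMapping: picks a mapping from a runtime property
// (a fake VMA size here) and forwards the argument to Func::Apply.
template <typename Func, typename Arg>
uptr SelectMapping(int vma_bits, Arg arg) {
  return vma_bits == 48 ? Func::template Apply<Mapping48>(arg)
                        : Func::template Apply<Mapping47>(arg);
}

int main() {
  printf("shadow end (48-bit VMA): 0x%zx\n",
         (size_t)SelectMapping<MappingField>(48, kShadowEnd));
}

Because every constant is reached through this one dispatch point, dropping the kTraceMemBeg/kTraceMemEnd fields from all mappings, as this patch does, only requires removing the corresponding enum values and switch cases.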
diff --git a/libsanitizer/tsan/tsan_platform_linux.cpp b/libsanitizer/tsan/tsan_platform_linux.cpp
index 73ec14892d2..17dbdff8a53 100644
--- a/libsanitizer/tsan/tsan_platform_linux.cpp
+++ b/libsanitizer/tsan/tsan_platform_linux.cpp
@@ -94,7 +94,6 @@ enum {
   MemMeta,
   MemFile,
   MemMmap,
-  MemTrace,
   MemHeap,
   MemOther,
   MemCount,
@@ -112,8 +111,6 @@ void FillProfileCallback(uptr p, uptr rss, bool file, uptr *mem) {
     mem[file ? MemFile : MemMmap] += rss;
   else if (p >= HeapMemBeg() && p < HeapMemEnd())
     mem[MemHeap] += rss;
-  else if (p >= TraceMemBeg() && p < TraceMemEnd())
-    mem[MemTrace] += rss;
   else
     mem[MemOther] += rss;
 }
@@ -126,42 +123,33 @@ void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) {
   StackDepotStats stacks = StackDepotGetStats();
   uptr nthread, nlive;
   ctx->thread_registry.GetNumberOfThreads(&nthread, &nlive);
+  uptr trace_mem;
+  {
+    Lock l(&ctx->slot_mtx);
+    trace_mem = ctx->trace_part_total_allocated * sizeof(TracePart);
+  }
   uptr internal_stats[AllocatorStatCount];
   internal_allocator()->GetStats(internal_stats);
   // All these are allocated from the common mmap region.
-  mem[MemMmap] -= meta.mem_block + meta.sync_obj + stacks.allocated +
-                  internal_stats[AllocatorStatMapped];
+  mem[MemMmap] -= meta.mem_block + meta.sync_obj + trace_mem +
+                  stacks.allocated + internal_stats[AllocatorStatMapped];
   if (s64(mem[MemMmap]) < 0)
     mem[MemMmap] = 0;
   internal_snprintf(
       buf, buf_size,
-      "%llus: RSS %zd MB: shadow:%zd meta:%zd file:%zd mmap:%zd"
-      " trace:%zd heap:%zd other:%zd intalloc:%zd memblocks:%zd syncobj:%zu"
-      " stacks=%zd[%zd] nthr=%zd/%zd\n",
-      uptime_ns / (1000 * 1000 * 1000), mem[MemTotal] >> 20,
-      mem[MemShadow] >> 20, mem[MemMeta] >> 20, mem[MemFile] >> 20,
-      mem[MemMmap] >> 20, mem[MemTrace] >> 20, mem[MemHeap] >> 20,
+      "==%zu== %llus [%zu]: RSS %zd MB: shadow:%zd meta:%zd file:%zd"
+      " mmap:%zd heap:%zd other:%zd intalloc:%zd memblocks:%zd syncobj:%zu"
+      " trace:%zu stacks=%zd threads=%zu/%zu\n",
+      internal_getpid(), uptime_ns / (1000 * 1000 * 1000), ctx->global_epoch,
+      mem[MemTotal] >> 20, mem[MemShadow] >> 20, mem[MemMeta] >> 20,
+      mem[MemFile] >> 20, mem[MemMmap] >> 20, mem[MemHeap] >> 20,
       mem[MemOther] >> 20, internal_stats[AllocatorStatMapped] >> 20,
-      meta.mem_block >> 20, meta.sync_obj >> 20, stacks.allocated >> 20,
-      stacks.n_uniq_ids, nlive, nthread);
-}
-
-#  if SANITIZER_LINUX
-void FlushShadowMemoryCallback(
-    const SuspendedThreadsList &suspended_threads_list,
-    void *argument) {
-  ReleaseMemoryPagesToOS(ShadowBeg(), ShadowEnd());
-}
-#endif
-
-void FlushShadowMemory() {
-#if SANITIZER_LINUX
-  StopTheWorld(FlushShadowMemoryCallback, 0);
-#endif
+      meta.mem_block >> 20, meta.sync_obj >> 20, trace_mem >> 20,
+      stacks.allocated >> 20, nlive, nthread);
 }
 
 #if !SANITIZER_GO
-// Mark shadow for .rodata sections with the special kShadowRodata marker.
+// Mark shadow for .rodata sections with the special Shadow::kRodata marker.
 // Accesses to .rodata can't race, so this saves time, memory and trace space.
 static void MapRodata() {
   // First create temp file.
@@ -182,13 +170,13 @@ static void MapRodata() {
     return;
   internal_unlink(name);  // Unlink it now, so that we can reuse the buffer.
   fd_t fd = openrv;
-  // Fill the file with kShadowRodata.
+  // Fill the file with Shadow::kRodata.
   const uptr kMarkerSize = 512 * 1024 / sizeof(RawShadow);
   InternalMmapVector<RawShadow> marker(kMarkerSize);
   // volatile to prevent insertion of memset
   for (volatile RawShadow *p = marker.data(); p < marker.data() + kMarkerSize;
        p++)
-    *p = kShadowRodata;
+    *p = Shadow::kRodata;
   internal_write(fd, marker.data(), marker.size() * sizeof(RawShadow));
   // Map the file into memory.
   uptr page = internal_mmap(0, GetPageSizeCached(), PROT_READ | PROT_WRITE,
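
MapRodata above relies on the fact that shadow for read-only data never needs to change: it fills a throw-away temporary file with the Shadow::kRodata marker and then maps that single file over the shadow of the .rodata segments, so those shadow ranges all share the same backing pages. A rough stand-alone sketch of the underlying trick, with an invented marker value and plain POSIX calls instead of the internal_* wrappers:

#include <sys/mman.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

int main() {
  char path[] = "/tmp/shadow_marker.XXXXXX";
  int fd = mkstemp(path);
  if (fd < 0) return 1;
  unlink(path);  // keep only the descriptor, as MapRodata does

  const size_t kSize = 1 << 20;
  // Fill the file with a marker value (tsan writes Shadow::kRodata here).
  std::vector<uint64_t> marker(kSize / sizeof(uint64_t), 0xabcdabcdabcdabcdull);
  if (write(fd, marker.data(), kSize) != (ssize_t)kSize) return 1;

  // Map the same file at two different addresses: both mappings are backed
  // by the same page-cache pages, so the marker data is paid for only once.
  void* a = mmap(nullptr, kSize, PROT_READ, MAP_PRIVATE, fd, 0);
  void* b = mmap(nullptr, kSize, PROT_READ, MAP_PRIVATE, fd, 0);
  printf("marker block mapped at %p and %p\n", a, b);
  close(fd);
}

The real function presumably places these mappings at the shadow addresses of the .rodata ranges; the sketch only demonstrates the shared-backing idea.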
diff --git a/libsanitizer/tsan/tsan_platform_mac.cpp b/libsanitizer/tsan/tsan_platform_mac.cpp
index 3faa2d0c619..52af76f2f28 100644
--- a/libsanitizer/tsan/tsan_platform_mac.cpp
+++ b/libsanitizer/tsan/tsan_platform_mac.cpp
@@ -112,9 +112,6 @@ void cur_thread_finalize() {
 }
 #endif
 
-void FlushShadowMemory() {
-}
-
 static void RegionMemUsage(uptr start, uptr end, uptr *res, uptr *dirty) {
   vm_address_t address = start;
   vm_address_t end_address = end;
@@ -142,10 +139,8 @@ static void RegionMemUsage(uptr start, uptr end, uptr *res, uptr *dirty) {
 void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) {
   uptr shadow_res, shadow_dirty;
   uptr meta_res, meta_dirty;
-  uptr trace_res, trace_dirty;
   RegionMemUsage(ShadowBeg(), ShadowEnd(), &shadow_res, &shadow_dirty);
   RegionMemUsage(MetaShadowBeg(), MetaShadowEnd(), &meta_res, &meta_dirty);
-  RegionMemUsage(TraceMemBeg(), TraceMemEnd(), &trace_res, &trace_dirty);
 
 #if !SANITIZER_GO
   uptr low_res, low_dirty;
@@ -166,7 +161,6 @@ void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) {
       buf, buf_size,
       "shadow   (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n"
       "meta     (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n"
-      "traces   (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n"
 #  if !SANITIZER_GO
       "low app  (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n"
       "high app (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n"
@@ -179,7 +173,6 @@ void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) {
       "------------------------------\n",
       ShadowBeg(), ShadowEnd(), shadow_res / 1024, shadow_dirty / 1024,
       MetaShadowBeg(), MetaShadowEnd(), meta_res / 1024, meta_dirty / 1024,
-      TraceMemBeg(), TraceMemEnd(), trace_res / 1024, trace_dirty / 1024,
 #  if !SANITIZER_GO
       LoAppMemBeg(), LoAppMemEnd(), low_res / 1024, low_dirty / 1024,
       HiAppMemBeg(), HiAppMemEnd(), high_res / 1024, high_dirty / 1024,
diff --git a/libsanitizer/tsan/tsan_platform_posix.cpp b/libsanitizer/tsan/tsan_platform_posix.cpp
index 763ac444377..763a533de52 100644
--- a/libsanitizer/tsan/tsan_platform_posix.cpp
+++ b/libsanitizer/tsan/tsan_platform_posix.cpp
@@ -113,24 +113,20 @@ void CheckAndProtect() {
 #    if defined(__aarch64__) && defined(__APPLE__) && SANITIZER_IOS
   ProtectRange(HeapMemEnd(), ShadowBeg());
   ProtectRange(ShadowEnd(), MetaShadowBeg());
-  ProtectRange(MetaShadowEnd(), TraceMemBeg());
-#else
+  ProtectRange(MetaShadowEnd(), HeapMemBeg());
+#    else
   ProtectRange(LoAppMemEnd(), ShadowBeg());
   ProtectRange(ShadowEnd(), MetaShadowBeg());
   if (MidAppMemBeg()) {
     ProtectRange(MetaShadowEnd(), MidAppMemBeg());
-    ProtectRange(MidAppMemEnd(), TraceMemBeg());
+    ProtectRange(MidAppMemEnd(), HeapMemBeg());
   } else {
-    ProtectRange(MetaShadowEnd(), TraceMemBeg());
+    ProtectRange(MetaShadowEnd(), HeapMemBeg());
   }
-  // Memory for traces is mapped lazily in MapThreadTrace.
-  // Protect the whole range for now, so that user does not map something here.
-  ProtectRange(TraceMemBeg(), TraceMemEnd());
-  ProtectRange(TraceMemEnd(), HeapMemBeg());
   ProtectRange(HeapEnd(), HiAppMemBeg());
-#endif
+#    endif
 
-#if defined(__s390x__)
+#    if defined(__s390x__)
   // Protect the rest of the address space.
   const uptr user_addr_max_l4 = 0x0020000000000000ull;
   const uptr user_addr_max_l5 = 0xfffffffffffff000ull;
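
CheckAndProtect above walks the gaps between the app, shadow, metainfo and heap regions and hands each gap to ProtectRange, which (as the removed comment about the trace range suggests) reserves it so nothing else can be mapped there; with the trace region gone, the stretch from the end of the metainfo up to the heap is now reserved in one piece. A stand-alone sketch of what such a helper amounts to, using plain mmap rather than the sanitizer primitives:

#include <sys/mman.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Reserve [beg, end) as an inaccessible mapping so later mmap/sbrk calls
// cannot land inside a range the runtime assumes is unused.
static void ProtectRangeSketch(uintptr_t beg, uintptr_t end) {
  if (beg == end) return;
  void* res = mmap(reinterpret_cast<void*>(beg), end - beg, PROT_NONE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
  if (res == MAP_FAILED) {
    fprintf(stderr, "failed to protect 0x%zx-0x%zx\n", (size_t)beg, (size_t)end);
    abort();
  }
}

int main() {
  const size_t kSize = 1 << 20;
  void* base = mmap(nullptr, kSize, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (base == MAP_FAILED) return 1;
  // Pretend the middle half of this block is a gap between two regions.
  uintptr_t b = reinterpret_cast<uintptr_t>(base);
  ProtectRangeSketch(b + kSize / 4, b + kSize * 3 / 4);
  printf("gap protected inside block at %p\n", base);
}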
diff --git a/libsanitizer/tsan/tsan_platform_windows.cpp b/libsanitizer/tsan/tsan_platform_windows.cpp
index fea893768c7..eb8f354742f 100644
--- a/libsanitizer/tsan/tsan_platform_windows.cpp
+++ b/libsanitizer/tsan/tsan_platform_windows.cpp
@@ -20,9 +20,6 @@
 
 namespace __tsan {
 
-void FlushShadowMemory() {
-}
-
 void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) {}
 
 void InitializePlatformEarly() {
diff --git a/libsanitizer/tsan/tsan_rtl.cpp b/libsanitizer/tsan/tsan_rtl.cpp
index 46dec04b875..c68a13d718c 100644
--- a/libsanitizer/tsan/tsan_rtl.cpp
+++ b/libsanitizer/tsan/tsan_rtl.cpp
@@ -34,6 +34,9 @@ extern "C" void __tsan_resume() {
   __tsan_resumed = 1;
 }
 
+SANITIZER_WEAK_DEFAULT_IMPL
+void __tsan_test_only_on_fork() {}
+
 namespace __tsan {
 
 #if !SANITIZER_GO
@@ -54,109 +57,351 @@ Context *ctx;
 bool OnFinalize(bool failed);
 void OnInitialize();
 #else
-#include <dlfcn.h>
 SANITIZER_WEAK_CXX_DEFAULT_IMPL
 bool OnFinalize(bool failed) {
-#if !SANITIZER_GO
+#  if !SANITIZER_GO
   if (on_finalize)
     return on_finalize(failed);
-#endif
+#  endif
   return failed;
 }
+
 SANITIZER_WEAK_CXX_DEFAULT_IMPL
 void OnInitialize() {
-#if !SANITIZER_GO
+#  if !SANITIZER_GO
   if (on_initialize)
     on_initialize();
-#endif
+#  endif
 }
 #endif
 
-static ThreadContextBase *CreateThreadContext(Tid tid) {
-  // Map thread trace when context is created.
-  char name[50];
-  internal_snprintf(name, sizeof(name), "trace %u", tid);
-  MapThreadTrace(GetThreadTrace(tid), TraceSize() * sizeof(Event), name);
-  const uptr hdr = GetThreadTraceHeader(tid);
-  internal_snprintf(name, sizeof(name), "trace header %u", tid);
-  MapThreadTrace(hdr, sizeof(Trace), name);
-  new((void*)hdr) Trace();
-  // We are going to use only a small part of the trace with the default
-  // value of history_size. However, the constructor writes to the whole trace.
-  // Release the unused part.
-  uptr hdr_end = hdr + sizeof(Trace);
-  hdr_end -= sizeof(TraceHeader) * (kTraceParts - TraceParts());
-  hdr_end = RoundUp(hdr_end, GetPageSizeCached());
-  if (hdr_end < hdr + sizeof(Trace)) {
-    ReleaseMemoryPagesToOS(hdr_end, hdr + sizeof(Trace));
-    uptr unused = hdr + sizeof(Trace) - hdr_end;
-    if (hdr_end != (uptr)MmapFixedNoAccess(hdr_end, unused)) {
-      Report("ThreadSanitizer: failed to mprotect [0x%zx-0x%zx) \n", hdr_end,
-             unused);
-      CHECK("unable to mprotect" && 0);
+static TracePart* TracePartAlloc(ThreadState* thr) {
+  TracePart* part = nullptr;
+  {
+    Lock lock(&ctx->slot_mtx);
+    uptr max_parts = Trace::kMinParts + flags()->history_size;
+    Trace* trace = &thr->tctx->trace;
+    if (trace->parts_allocated == max_parts ||
+        ctx->trace_part_finished_excess) {
+      part = ctx->trace_part_recycle.PopFront();
+      DPrintf("#%d: TracePartAlloc: part=%p\n", thr->tid, part);
+      if (part && part->trace) {
+        Trace* trace1 = part->trace;
+        Lock trace_lock(&trace1->mtx);
+        part->trace = nullptr;
+        TracePart* part1 = trace1->parts.PopFront();
+        CHECK_EQ(part, part1);
+        if (trace1->parts_allocated > trace1->parts.Size()) {
+          ctx->trace_part_finished_excess +=
+              trace1->parts_allocated - trace1->parts.Size();
+          trace1->parts_allocated = trace1->parts.Size();
+        }
+      }
+    }
+    if (trace->parts_allocated < max_parts) {
+      trace->parts_allocated++;
+      if (ctx->trace_part_finished_excess)
+        ctx->trace_part_finished_excess--;
+    }
+    if (!part)
+      ctx->trace_part_total_allocated++;
+    else if (ctx->trace_part_recycle_finished)
+      ctx->trace_part_recycle_finished--;
+  }
+  if (!part)
+    part = new (MmapOrDie(sizeof(*part), "TracePart")) TracePart();
+  return part;
+}
+
+static void TracePartFree(TracePart* part) REQUIRES(ctx->slot_mtx) {
+  DCHECK(part->trace);
+  part->trace = nullptr;
+  ctx->trace_part_recycle.PushFront(part);
+}
+
+void TraceResetForTesting() {
+  Lock lock(&ctx->slot_mtx);
+  while (auto* part = ctx->trace_part_recycle.PopFront()) {
+    if (auto trace = part->trace)
+      CHECK_EQ(trace->parts.PopFront(), part);
+    UnmapOrDie(part, sizeof(*part));
+  }
+  ctx->trace_part_total_allocated = 0;
+  ctx->trace_part_recycle_finished = 0;
+  ctx->trace_part_finished_excess = 0;
+}
+
+static void DoResetImpl(uptr epoch) {
+  ThreadRegistryLock lock0(&ctx->thread_registry);
+  Lock lock1(&ctx->slot_mtx);
+  CHECK_EQ(ctx->global_epoch, epoch);
+  ctx->global_epoch++;
+  CHECK(!ctx->resetting);
+  ctx->resetting = true;
+  for (u32 i = ctx->thread_registry.NumThreadsLocked(); i--;) {
+    ThreadContext* tctx = (ThreadContext*)ctx->thread_registry.GetThreadLocked(
+        static_cast<Tid>(i));
+    // Potentially we could purge all ThreadStatusDead threads from the
+    // registry. Since we reset all shadow, they can't race with anything
+    // anymore. However, their tid's can still be stored in some aux places
+    // (e.g. tid of thread that created something).
+    auto trace = &tctx->trace;
+    Lock lock(&trace->mtx);
+    bool attached = tctx->thr && tctx->thr->slot;
+    auto parts = &trace->parts;
+    bool local = false;
+    while (!parts->Empty()) {
+      auto part = parts->Front();
+      local = local || part == trace->local_head;
+      if (local)
+        CHECK(!ctx->trace_part_recycle.Queued(part));
+      else
+        ctx->trace_part_recycle.Remove(part);
+      if (attached && parts->Size() == 1) {
+        // The thread is running and this is the last/current part.
+        // Set the trace position to the end of the current part
+        // to force the thread to call SwitchTracePart and re-attach
+        // to a new slot and allocate a new trace part.
+        // Note: the thread is concurrently modifying the position as well,
+        // so this is only best-effort. The thread can only modify position
+        // within this part, because switching parts is protected by
+        // slot/trace mutexes that we hold here.
+        atomic_store_relaxed(
+            &tctx->thr->trace_pos,
+            reinterpret_cast<uptr>(&part->events[TracePart::kSize]));
+        break;
+      }
+      parts->Remove(part);
+      TracePartFree(part);
+    }
+    CHECK_LE(parts->Size(), 1);
+    trace->local_head = parts->Front();
+    if (tctx->thr && !tctx->thr->slot) {
+      atomic_store_relaxed(&tctx->thr->trace_pos, 0);
+      tctx->thr->trace_prev_pc = 0;
+    }
+    if (trace->parts_allocated > trace->parts.Size()) {
+      ctx->trace_part_finished_excess +=
+          trace->parts_allocated - trace->parts.Size();
+      trace->parts_allocated = trace->parts.Size();
+    }
+  }
+  while (ctx->slot_queue.PopFront()) {
+  }
+  for (auto& slot : ctx->slots) {
+    slot.SetEpoch(kEpochZero);
+    slot.journal.Reset();
+    slot.thr = nullptr;
+    ctx->slot_queue.PushBack(&slot);
+  }
+
+  DPrintf("Resetting shadow...\n");
+  if (!MmapFixedSuperNoReserve(ShadowBeg(), ShadowEnd() - ShadowBeg(),
+                               "shadow")) {
+    Printf("failed to reset shadow memory\n");
+    Die();
+  }
+  DPrintf("Resetting meta shadow...\n");
+  ctx->metamap.ResetClocks();
+  ctx->resetting = false;
+}
+
+// Clang does not understand locking all slots in the loop:
+// error: expecting mutex 'slot.mtx' to be held at start of each loop
+void DoReset(ThreadState* thr, uptr epoch) NO_THREAD_SAFETY_ANALYSIS {
+  {
+    Lock l(&ctx->multi_slot_mtx);
+    for (auto& slot : ctx->slots) {
+      slot.mtx.Lock();
+      if (UNLIKELY(epoch == 0))
+        epoch = ctx->global_epoch;
+      if (UNLIKELY(epoch != ctx->global_epoch)) {
+        // Epoch can't change once we've locked the first slot.
+        CHECK_EQ(slot.sid, 0);
+        slot.mtx.Unlock();
+        return;
+      }
+    }
+  }
+  DPrintf("#%d: DoReset epoch=%lu\n", thr ? thr->tid : -1, epoch);
+  DoResetImpl(epoch);
+  for (auto& slot : ctx->slots) slot.mtx.Unlock();
+}
+
+void FlushShadowMemory() { DoReset(nullptr, 0); }
+
+static TidSlot* FindSlotAndLock(ThreadState* thr)
+    ACQUIRE(thr->slot->mtx) NO_THREAD_SAFETY_ANALYSIS {
+  CHECK(!thr->slot);
+  TidSlot* slot = nullptr;
+  for (;;) {
+    uptr epoch;
+    {
+      Lock lock(&ctx->slot_mtx);
+      epoch = ctx->global_epoch;
+      if (slot) {
+        // This is an exhausted slot from the previous iteration.
+        if (ctx->slot_queue.Queued(slot))
+          ctx->slot_queue.Remove(slot);
+        thr->slot_locked = false;
+        slot->mtx.Unlock();
+      }
+      for (;;) {
+        slot = ctx->slot_queue.PopFront();
+        if (!slot)
+          break;
+        if (slot->epoch() != kEpochLast) {
+          ctx->slot_queue.PushBack(slot);
+          break;
+        }
+      }
+    }
+    if (!slot) {
+      DoReset(thr, epoch);
+      continue;
     }
+    slot->mtx.Lock();
+    CHECK(!thr->slot_locked);
+    thr->slot_locked = true;
+    if (slot->thr) {
+      DPrintf("#%d: preempting sid=%d tid=%d\n", thr->tid, (u32)slot->sid,
+              slot->thr->tid);
+      slot->SetEpoch(slot->thr->fast_state.epoch());
+      slot->thr = nullptr;
+    }
+    if (slot->epoch() != kEpochLast)
+      return slot;
   }
-  return New<ThreadContext>(tid);
 }
 
+void SlotAttachAndLock(ThreadState* thr) {
+  TidSlot* slot = FindSlotAndLock(thr);
+  DPrintf("#%d: SlotAttach: slot=%u\n", thr->tid, static_cast<int>(slot->sid));
+  CHECK(!slot->thr);
+  CHECK(!thr->slot);
+  slot->thr = thr;
+  thr->slot = slot;
+  Epoch epoch = EpochInc(slot->epoch());
+  CHECK(!EpochOverflow(epoch));
+  slot->SetEpoch(epoch);
+  thr->fast_state.SetSid(slot->sid);
+  thr->fast_state.SetEpoch(epoch);
+  if (thr->slot_epoch != ctx->global_epoch) {
+    thr->slot_epoch = ctx->global_epoch;
+    thr->clock.Reset();
 #if !SANITIZER_GO
-static const u32 kThreadQuarantineSize = 16;
-#else
-static const u32 kThreadQuarantineSize = 64;
+    thr->last_sleep_stack_id = kInvalidStackID;
+    thr->last_sleep_clock.Reset();
 #endif
+  }
+  thr->clock.Set(slot->sid, epoch);
+  slot->journal.PushBack({thr->tid, epoch});
+}
+
+static void SlotDetachImpl(ThreadState* thr, bool exiting) {
+  TidSlot* slot = thr->slot;
+  thr->slot = nullptr;
+  if (thr != slot->thr) {
+    slot = nullptr;  // we don't own the slot anymore
+    if (thr->slot_epoch != ctx->global_epoch) {
+      TracePart* part = nullptr;
+      auto* trace = &thr->tctx->trace;
+      {
+        Lock l(&trace->mtx);
+        auto* parts = &trace->parts;
+        // The trace can be completely empty in the unlikely event that
+        // the thread is preempted right after it acquired the slot
+        // in ThreadStart and did not trace any events yet.
+        CHECK_LE(parts->Size(), 1);
+        part = parts->PopFront();
+        thr->tctx->trace.local_head = nullptr;
+        atomic_store_relaxed(&thr->trace_pos, 0);
+        thr->trace_prev_pc = 0;
+      }
+      if (part) {
+        Lock l(&ctx->slot_mtx);
+        TracePartFree(part);
+      }
+    }
+    return;
+  }
+  CHECK(exiting || thr->fast_state.epoch() == kEpochLast);
+  slot->SetEpoch(thr->fast_state.epoch());
+  slot->thr = nullptr;
+}
+
+void SlotDetach(ThreadState* thr) {
+  Lock lock(&thr->slot->mtx);
+  SlotDetachImpl(thr, true);
+}
+
+void SlotLock(ThreadState* thr) NO_THREAD_SAFETY_ANALYSIS {
+  DCHECK(!thr->slot_locked);
+  TidSlot* slot = thr->slot;
+  slot->mtx.Lock();
+  thr->slot_locked = true;
+  if (LIKELY(thr == slot->thr && thr->fast_state.epoch() != kEpochLast))
+    return;
+  SlotDetachImpl(thr, false);
+  thr->slot_locked = false;
+  slot->mtx.Unlock();
+  SlotAttachAndLock(thr);
+}
+
+void SlotUnlock(ThreadState* thr) {
+  DCHECK(thr->slot_locked);
+  thr->slot_locked = false;
+  thr->slot->mtx.Unlock();
+}
 
 Context::Context()
     : initialized(),
       report_mtx(MutexTypeReport),
       nreported(),
-      thread_registry(CreateThreadContext, kMaxTid, kThreadQuarantineSize,
-                      kMaxTidReuse),
+      thread_registry([](Tid tid) -> ThreadContextBase* {
+        return new (Alloc(sizeof(ThreadContext))) ThreadContext(tid);
+      }),
       racy_mtx(MutexTypeRacy),
       racy_stacks(),
       racy_addresses(),
       fired_suppressions_mtx(MutexTypeFired),
-      clock_alloc(LINKER_INITIALIZED, "clock allocator") {
+      clock_alloc(LINKER_INITIALIZED, "clock allocator"),
+      slot_mtx(MutexTypeSlots),
+      multi_slot_mtx(MutexTypeMultiSlot),
+      resetting() {
   fired_suppressions.reserve(8);
+  for (uptr i = 0; i < ARRAY_SIZE(slots); i++) {
+    TidSlot* slot = &slots[i];
+    slot->sid = static_cast<Sid>(i);
+    slot_queue.PushBack(slot);
+  }
+  global_epoch = 1;
 }
 
+TidSlot::TidSlot() : mtx(MutexTypeSlot) {}
+
 // The objects are allocated in TLS, so one may rely on zero-initialization.
-ThreadState::ThreadState(Context *ctx, Tid tid, int unique_id, u64 epoch,
-                         unsigned reuse_count, uptr stk_addr, uptr stk_size,
-                         uptr tls_addr, uptr tls_size)
-    : fast_state(tid, epoch)
-      // Do not touch these, rely on zero initialization,
-      // they may be accessed before the ctor.
-      // , ignore_reads_and_writes()
-      // , ignore_interceptors()
-      ,
-      clock(tid, reuse_count)
-#if !SANITIZER_GO
-      ,
-      jmp_bufs()
-#endif
-      ,
-      tid(tid),
-      unique_id(unique_id),
-      stk_addr(stk_addr),
-      stk_size(stk_size),
-      tls_addr(tls_addr),
-      tls_size(tls_size)
-#if !SANITIZER_GO
-      ,
-      last_sleep_clock(tid)
-#endif
-{
+ThreadState::ThreadState(Tid tid)
+    // Do not touch these, rely on zero initialization,
+    // they may be accessed before the ctor.
+    // ignore_reads_and_writes()
+    // ignore_interceptors()
+    : tid(tid) {
   CHECK_EQ(reinterpret_cast<uptr>(this) % SANITIZER_CACHE_LINE_SIZE, 0);
 #if !SANITIZER_GO
-  shadow_stack_pos = shadow_stack;
-  shadow_stack_end = shadow_stack + kShadowStackSize;
+  // C/C++ uses fixed size shadow stack.
+  const int kInitStackSize = kShadowStackSize;
+  shadow_stack = static_cast<uptr*>(
+      MmapNoReserveOrDie(kInitStackSize * sizeof(uptr), "shadow stack"));
+  SetShadowRegionHugePageMode(reinterpret_cast<uptr>(shadow_stack),
+                              kInitStackSize * sizeof(uptr));
 #else
-  // Setup dynamic shadow stack.
+  // Go uses malloc-allocated shadow stack with dynamic size.
   const int kInitStackSize = 8;
-  shadow_stack = (uptr *)Alloc(kInitStackSize * sizeof(uptr));
+  shadow_stack = static_cast<uptr*>(Alloc(kInitStackSize * sizeof(uptr)));
+#endif
   shadow_stack_pos = shadow_stack;
   shadow_stack_end = shadow_stack + kInitStackSize;
-#endif
 }
 
 #if !SANITIZER_GO
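
Taken together, FindSlotAndLock, SlotAttachAndLock and DoReset above replace the old per-thread trace regions with a fixed pool of TidSlots: a thread attaches to a slot and bumps its epoch, slots whose epoch has reached kEpochLast are skipped, and when no usable slot is left DoReset locks everything, advances ctx->global_epoch, recycles the trace parts and re-mmaps the shadow. The following toy model shows only that attach-until-exhausted-then-reset shape; the slot count, the epoch limit and all names are invented:

#include <array>
#include <cstdio>

constexpr int kSlots = 4;
constexpr int kEpochLast = 3;  // tiny limit so the reset triggers quickly

struct Slot { int epoch = 0; };

struct Runtime {
  std::array<Slot, kSlots> slots;
  int global_epoch = 1;

  void Reset() {  // analogous to DoResetImpl: everything starts over
    for (auto& s : slots) s.epoch = 0;
    global_epoch++;
  }

  Slot* Attach() {  // analogous to FindSlotAndLock/SlotAttachAndLock
    for (;;) {
      for (auto& s : slots)
        if (s.epoch < kEpochLast) { s.epoch++; return &s; }
      Reset();  // all slots exhausted: global reset, then retry
    }
  }
};

int main() {
  Runtime rt;
  for (int i = 0; i < 20; i++) rt.Attach();
  printf("global_epoch after 20 attaches: %d\n", rt.global_epoch);
}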
@@ -271,7 +516,8 @@ void UnmapShadow(ThreadState *thr, uptr addr, uptr size) {
   if (size == 0) return;
   DontNeedShadowFor(addr, size);
   ScopedGlobalProcessor sgp;
-  ctx->metamap.ResetRange(thr->proc(), addr, size);
+  SlotLocker locker(thr, true);
+  ctx->metamap.ResetRange(thr->proc(), addr, size, true);
 }
 #endif
 
@@ -317,18 +563,6 @@ void MapShadow(uptr addr, uptr size) {
           addr + size, meta_begin, meta_end);
 }
 
-void MapThreadTrace(uptr addr, uptr size, const char *name) {
-  DPrintf("#0: Mapping trace at 0x%zx-0x%zx(0x%zx)\n", addr, addr + size, size);
-  CHECK_GE(addr, TraceMemBeg());
-  CHECK_LE(addr + size, TraceMemEnd());
-  CHECK_EQ(addr, addr & ~((64 << 10) - 1));  // windows wants 64K alignment
-  if (!MmapFixedSuperNoReserve(addr, size, name)) {
-    Printf("FATAL: ThreadSanitizer can not mmap thread trace (0x%zx/0x%zx)\n",
-           addr, size);
-    Die();
-  }
-}
-
 #if !SANITIZER_GO
 static void OnStackUnwind(const SignalContext &sig, const void *,
                           BufferedStackTrace *stack) {
@@ -347,8 +581,11 @@ void CheckUnwind() {
   // since we are going to die soon.
   ScopedIgnoreInterceptors ignore;
 #if !SANITIZER_GO
-  cur_thread()->ignore_sync++;
-  cur_thread()->ignore_reads_and_writes++;
+  ThreadState* thr = cur_thread();
+  thr->nomalloc = false;
+  thr->ignore_sync++;
+  thr->ignore_reads_and_writes++;
+  atomic_store_relaxed(&thr->in_signal_handler, 0);
 #endif
   PrintCurrentStackSlow(StackTrace::GetCurrentPc());
 }
@@ -403,22 +640,22 @@ void Initialize(ThreadState *thr) {
   Symbolizer::GetOrInit()->AddHooks(EnterSymbolizer, ExitSymbolizer);
 #endif
 
-  VPrintf(1, "***** Running under ThreadSanitizer v2 (pid %d) *****\n",
+  VPrintf(1, "***** Running under ThreadSanitizer v3 (pid %d) *****\n",
           (int)internal_getpid());
 
   // Initialize thread 0.
-  Tid tid = ThreadCreate(thr, 0, 0, true);
+  Tid tid = ThreadCreate(nullptr, 0, 0, true);
   CHECK_EQ(tid, kMainTid);
   ThreadStart(thr, tid, GetTid(), ThreadType::Regular);
 #if TSAN_CONTAINS_UBSAN
   __ubsan::InitAsPlugin();
 #endif
-  ctx->initialized = true;
 
 #if !SANITIZER_GO
   Symbolizer::LateInitialize();
   InitializeMemoryProfiler();
 #endif
+  ctx->initialized = true;
 
   if (flags()->stop_on_start) {
     Printf("ThreadSanitizer is suspended at startup (pid %d)."
@@ -444,7 +681,6 @@ void MaybeSpawnBackgroundThread() {
 #endif
 }
 
-
 int Finalize(ThreadState *thr) {
   bool failed = false;
 
@@ -452,12 +688,12 @@ int Finalize(ThreadState *thr) {
     DumpProcessMap();
 
   if (flags()->atexit_sleep_ms > 0 && ThreadCount(thr) > 1)
-    SleepForMillis(flags()->atexit_sleep_ms);
+    internal_usleep(u64(flags()->atexit_sleep_ms) * 1000);
 
-  // Wait for pending reports.
-  ctx->report_mtx.Lock();
-  { ScopedErrorReportLock l; }
-  ctx->report_mtx.Unlock();
+  {
+    // Wait for pending reports.
+    ScopedErrorReportLock lock;
+  }
 
 #if !SANITIZER_GO
   if (Verbosity()) AllocatorPrintStats();
@@ -484,8 +720,13 @@ int Finalize(ThreadState *thr) {
 
 #if !SANITIZER_GO
 void ForkBefore(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS {
+  // Detaching from the slot makes OnUserFree skip writing to the shadow.
+  // The slot will be locked so any attempts to use it will deadlock anyway.
+  SlotDetach(thr);
+  ctx->multi_slot_mtx.Lock();
+  for (auto& slot : ctx->slots) slot.mtx.Lock();
   ctx->thread_registry.Lock();
-  ctx->report_mtx.Lock();
+  ctx->slot_mtx.Lock();
   ScopedErrorReportLock::Lock();
   // Suppress all reports in the pthread_atfork callbacks.
   // Reports will deadlock on the report_mtx.
@@ -495,29 +736,37 @@ void ForkBefore(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS {
   thr->suppress_reports++;
   // On OS X, REAL(fork) can call intercepted functions (OSSpinLockLock), and
   // we'll assert in CheckNoLocks() unless we ignore interceptors.
+  // On OS X libSystem_atfork_prepare/parent/child callbacks are called
+  // after/before our callbacks and they call free.
   thr->ignore_interceptors++;
+  // Disables memory write in OnUserFree.
+  thr->ignore_reads_and_writes++;
+
+  __tsan_test_only_on_fork();
 }
 
-void ForkParentAfter(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS {
+static void ForkAfter(ThreadState* thr) NO_THREAD_SAFETY_ANALYSIS {
   thr->suppress_reports--;  // Enabled in ForkBefore.
   thr->ignore_interceptors--;
+  thr->ignore_reads_and_writes--;
   ScopedErrorReportLock::Unlock();
-  ctx->report_mtx.Unlock();
+  ctx->slot_mtx.Unlock();
   ctx->thread_registry.Unlock();
+  for (auto& slot : ctx->slots) slot.mtx.Unlock();
+  ctx->multi_slot_mtx.Unlock();
+  SlotAttachAndLock(thr);
+  SlotUnlock(thr);
 }
 
-void ForkChildAfter(ThreadState *thr, uptr pc,
-                    bool start_thread) NO_THREAD_SAFETY_ANALYSIS {
-  thr->suppress_reports--;  // Enabled in ForkBefore.
-  thr->ignore_interceptors--;
-  ScopedErrorReportLock::Unlock();
-  ctx->report_mtx.Unlock();
-  ctx->thread_registry.Unlock();
+void ForkParentAfter(ThreadState* thr, uptr pc) { ForkAfter(thr); }
 
-  uptr nthread = 0;
-  ctx->thread_registry.GetNumberOfThreads(0, 0, &nthread /* alive threads */);
-  VPrintf(1, "ThreadSanitizer: forked new process with pid %d,"
-      " parent had %d threads\n", (int)internal_getpid(), (int)nthread);
+void ForkChildAfter(ThreadState* thr, uptr pc, bool start_thread) {
+  ForkAfter(thr);
+  u32 nthread = ThreadCount(thr);
+  VPrintf(1,
+          "ThreadSanitizer: forked new process with pid %d,"
+          " parent had %d threads\n",
+          (int)internal_getpid(), (int)nthread);
   if (nthread == 1) {
     if (start_thread)
       StartBackgroundThread();
@@ -527,6 +776,7 @@ void ForkChildAfter(ThreadState *thr, uptr pc,
     // ignores for everything in the hope that we will exec soon.
     ctx->after_multithreaded_fork = true;
     thr->ignore_interceptors++;
+    thr->suppress_reports++;
     ThreadIgnoreBegin(thr, pc);
     ThreadIgnoreSyncBegin(thr, pc);
   }
@@ -548,8 +798,10 @@ void GrowShadowStack(ThreadState *thr) {
 #endif
 
 StackID CurrentStackId(ThreadState *thr, uptr pc) {
+#if !SANITIZER_GO
   if (!thr->is_inited)  // May happen during bootstrap.
     return kInvalidStackID;
+#endif
   if (pc != 0) {
 #if !SANITIZER_GO
     DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end);
@@ -567,53 +819,72 @@ StackID CurrentStackId(ThreadState *thr, uptr pc) {
   return id;
 }
 
-namespace v3 {
-
-NOINLINE
-void TraceSwitchPart(ThreadState *thr) {
+static bool TraceSkipGap(ThreadState* thr) {
   Trace *trace = &thr->tctx->trace;
   Event *pos = reinterpret_cast<Event *>(atomic_load_relaxed(&thr->trace_pos));
   DCHECK_EQ(reinterpret_cast<uptr>(pos + 1) & TracePart::kAlignment, 0);
   auto *part = trace->parts.Back();
-  DPrintf("TraceSwitchPart part=%p pos=%p\n", part, pos);
-  if (part) {
-    // We can get here when we still have space in the current trace part.
-    // The fast-path check in TraceAcquire has false positives in the middle of
-    // the part. Check if we are indeed at the end of the current part or not,
-    // and fill any gaps with NopEvent's.
-    Event *end = &part->events[TracePart::kSize];
-    DCHECK_GE(pos, &part->events[0]);
-    DCHECK_LE(pos, end);
-    if (pos + 1 < end) {
-      if ((reinterpret_cast<uptr>(pos) & TracePart::kAlignment) ==
-          TracePart::kAlignment)
-        *pos++ = NopEvent;
+  DPrintf("#%d: TraceSwitchPart enter trace=%p parts=%p-%p pos=%p\n", thr->tid,
+          trace, trace->parts.Front(), part, pos);
+  if (!part)
+    return false;
+  // We can get here when we still have space in the current trace part.
+  // The fast-path check in TraceAcquire has false positives in the middle of
+  // the part. Check if we are indeed at the end of the current part or not,
+  // and fill any gaps with NopEvent's.
+  Event* end = &part->events[TracePart::kSize];
+  DCHECK_GE(pos, &part->events[0]);
+  DCHECK_LE(pos, end);
+  if (pos + 1 < end) {
+    if ((reinterpret_cast<uptr>(pos) & TracePart::kAlignment) ==
+        TracePart::kAlignment)
       *pos++ = NopEvent;
-      DCHECK_LE(pos + 2, end);
-      atomic_store_relaxed(&thr->trace_pos, reinterpret_cast<uptr>(pos));
-      // Ensure we setup trace so that the next TraceAcquire
-      // won't detect trace part end.
-      Event *ev;
-      CHECK(TraceAcquire(thr, &ev));
-      return;
-    }
-    // We are indeed at the end.
-    for (; pos < end; pos++) *pos = NopEvent;
+    *pos++ = NopEvent;
+    DCHECK_LE(pos + 2, end);
+    atomic_store_relaxed(&thr->trace_pos, reinterpret_cast<uptr>(pos));
+    return true;
   }
+  // We are indeed at the end.
+  for (; pos < end; pos++) *pos = NopEvent;
+  return false;
+}
+
+NOINLINE
+void TraceSwitchPart(ThreadState* thr) {
+  if (TraceSkipGap(thr))
+    return;
 #if !SANITIZER_GO
   if (ctx->after_multithreaded_fork) {
     // We just need to survive till exec.
-    CHECK(part);
-    atomic_store_relaxed(&thr->trace_pos,
-                         reinterpret_cast<uptr>(&part->events[0]));
-    return;
+    TracePart* part = thr->tctx->trace.parts.Back();
+    if (part) {
+      atomic_store_relaxed(&thr->trace_pos,
+                           reinterpret_cast<uptr>(&part->events[0]));
+      return;
+    }
   }
 #endif
-  part = new (MmapOrDie(sizeof(TracePart), "TracePart")) TracePart();
+  TraceSwitchPartImpl(thr);
+}
+
+void TraceSwitchPartImpl(ThreadState* thr) {
+  SlotLocker locker(thr, true);
+  Trace* trace = &thr->tctx->trace;
+  TracePart* part = TracePartAlloc(thr);
   part->trace = trace;
   thr->trace_prev_pc = 0;
+  TracePart* recycle = nullptr;
+  // Keep roughly half of parts local to the thread
+  // (not queued into the recycle queue).
+  uptr local_parts = (Trace::kMinParts + flags()->history_size + 1) / 2;
   {
     Lock lock(&trace->mtx);
+    if (trace->parts.Empty())
+      trace->local_head = part;
+    if (trace->parts.Size() >= local_parts) {
+      recycle = trace->local_head;
+      trace->local_head = trace->parts.Next(recycle);
+    }
     trace->parts.PushBack(part);
     atomic_store_relaxed(&thr->trace_pos,
                          reinterpret_cast<uptr>(&part->events[0]));
@@ -621,60 +892,45 @@ void TraceSwitchPart(ThreadState *thr) {
   // Make this part self-sufficient by restoring the current stack
   // and mutex set in the beginning of the trace.
   TraceTime(thr);
-  for (uptr *pos = &thr->shadow_stack[0]; pos < thr->shadow_stack_pos; pos++)
-    CHECK(TryTraceFunc(thr, *pos));
+  {
+    // Pathologically large stacks may not fit into the part.
+    // In these cases we log only a fixed number of top frames.
+    const uptr kMaxFrames = 1000;
+    // Sanity check that kMaxFrames won't consume the whole part.
+    static_assert(kMaxFrames < TracePart::kSize / 2, "kMaxFrames is too big");
+    uptr* pos = Max(&thr->shadow_stack[0], thr->shadow_stack_pos - kMaxFrames);
+    for (; pos < thr->shadow_stack_pos; pos++) {
+      if (TryTraceFunc(thr, *pos))
+        continue;
+      CHECK(TraceSkipGap(thr));
+      CHECK(TryTraceFunc(thr, *pos));
+    }
+  }
   for (uptr i = 0; i < thr->mset.Size(); i++) {
     MutexSet::Desc d = thr->mset.Get(i);
-    TraceMutexLock(thr, d.write ? EventType::kLock : EventType::kRLock, 0,
-                   d.addr, d.stack_id);
+    for (uptr i = 0; i < d.count; i++)
+      TraceMutexLock(thr, d.write ? EventType::kLock : EventType::kRLock, 0,
+                     d.addr, d.stack_id);
   }
-}
-
-}  // namespace v3
-
-void TraceSwitch(ThreadState *thr) {
-#if !SANITIZER_GO
-  if (ctx->after_multithreaded_fork)
-    return;
-#endif
-  thr->nomalloc++;
-  Trace *thr_trace = ThreadTrace(thr->tid);
-  Lock l(&thr_trace->mtx);
-  unsigned trace = (thr->fast_state.epoch() / kTracePartSize) % TraceParts();
-  TraceHeader *hdr = &thr_trace->headers[trace];
-  hdr->epoch0 = thr->fast_state.epoch();
-  ObtainCurrentStack(thr, 0, &hdr->stack0);
-  hdr->mset0 = thr->mset;
-  thr->nomalloc--;
-}
-
-Trace *ThreadTrace(Tid tid) { return (Trace *)GetThreadTraceHeader(tid); }
-
-uptr TraceTopPC(ThreadState *thr) {
-  Event *events = (Event*)GetThreadTrace(thr->tid);
-  uptr pc = events[thr->fast_state.GetTracePos()];
-  return pc;
-}
-
-uptr TraceSize() {
-  return (uptr)(1ull << (kTracePartSizeBits + flags()->history_size + 1));
-}
-
-uptr TraceParts() {
-  return TraceSize() / kTracePartSize;
+  {
+    Lock lock(&ctx->slot_mtx);
+    ctx->slot_queue.Remove(thr->slot);
+    ctx->slot_queue.PushBack(thr->slot);
+    if (recycle)
+      ctx->trace_part_recycle.PushBack(recycle);
+  }
+  DPrintf("#%d: TraceSwitchPart exit parts=%p-%p pos=0x%zx\n", thr->tid,
+          trace->parts.Front(), trace->parts.Back(),
+          atomic_load_relaxed(&thr->trace_pos));
 }
 
 #if !SANITIZER_GO
-extern "C" void __tsan_trace_switch() {
-  TraceSwitch(cur_thread());
-}
+extern "C" void __tsan_trace_switch() {}
 
-extern "C" void __tsan_report_race() {
-  ReportRace(cur_thread());
-}
+extern "C" void __tsan_report_race() {}
 #endif
 
-void ThreadIgnoreBegin(ThreadState *thr, uptr pc) {
+void ThreadIgnoreBegin(ThreadState* thr, uptr pc) {
   DPrintf("#%d: ThreadIgnoreBegin\n", thr->tid);
   thr->ignore_reads_and_writes++;
   CHECK_GT(thr->ignore_reads_and_writes, 0);
@@ -734,7 +990,6 @@ void build_consistency_debug() {}
 #else
 void build_consistency_release() {}
 #endif
-
 }  // namespace __tsan
 
 #if SANITIZER_CHECK_DEADLOCKS
@@ -742,18 +997,27 @@ namespace __sanitizer {
 using namespace __tsan;
 MutexMeta mutex_meta[] = {
     {MutexInvalid, "Invalid", {}},
-    {MutexThreadRegistry, "ThreadRegistry", {}},
-    {MutexTypeTrace, "Trace", {MutexLeaf}},
-    {MutexTypeReport, "Report", {MutexTypeSyncVar}},
-    {MutexTypeSyncVar, "SyncVar", {}},
-    {MutexTypeAnnotations, "Annotations", {}},
-    {MutexTypeAtExit, "AtExit", {MutexTypeSyncVar}},
+    {MutexThreadRegistry,
+     "ThreadRegistry",
+     {MutexTypeSlots, MutexTypeTrace, MutexTypeReport}},
+    {MutexTypeReport, "Report", {MutexTypeTrace}},
+    {MutexTypeSyncVar, "SyncVar", {MutexTypeReport, MutexTypeTrace}},
+    {MutexTypeAnnotations, "Annotations", {MutexLeaf}},
+    {MutexTypeAtExit, "AtExit", {}},
     {MutexTypeFired, "Fired", {MutexLeaf}},
     {MutexTypeRacy, "Racy", {MutexLeaf}},
-    {MutexTypeGlobalProc, "GlobalProc", {}},
+    {MutexTypeGlobalProc, "GlobalProc", {MutexTypeSlot, MutexTypeSlots}},
+    {MutexTypeTrace, "Trace", {}},
+    {MutexTypeSlot,
+     "Slot",
+     {MutexMulti, MutexTypeTrace, MutexTypeSyncVar, MutexThreadRegistry,
+      MutexTypeSlots}},
+    {MutexTypeSlots, "Slots", {MutexTypeTrace, MutexTypeReport}},
+    {MutexTypeMultiSlot, "MultiSlot", {MutexTypeSlot, MutexTypeSlots}},
     {},
 };
 
 void PrintMutexPC(uptr pc) { StackTrace(&pc, 1).Print(); }
+
 }  // namespace __sanitizer
 #endif
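
The rewritten mutex_meta table feeds the optional SANITIZER_CHECK_DEADLOCKS build: each entry names an internal mutex type and, roughly, the set of mutex types it is allowed to be combined with, and the debug mutex implementation uses the table to flag lock-order violations at runtime (the exact semantics of the third field live in sanitizer_common's mutex code). A minimal, self-contained sketch of checking lock acquisitions against a static order table; the types, table contents and API below are invented for illustration:

#include <cassert>
#include <cstdio>
#include <vector>

enum MutexType { kRegistry, kReport, kTrace, kTypeCount };

// can_be_held_before[a][b]: holding a while acquiring b is allowed.
static bool can_be_held_before[kTypeCount][kTypeCount] = {
    /* kRegistry */ {false, true, true},
    /* kReport   */ {false, false, true},
    /* kTrace    */ {false, false, false},
};

struct LockChecker {
  std::vector<MutexType> held;
  void Lock(MutexType t) {
    for (MutexType h : held)
      if (!can_be_held_before[h][t]) {
        printf("lock order violation: %d -> %d\n", (int)h, (int)t);
        assert(false);
      }
    held.push_back(t);
  }
  void Unlock(MutexType t) {
    assert(!held.empty() && held.back() == t);
    held.pop_back();
  }
};

int main() {
  LockChecker c;
  c.Lock(kRegistry);
  c.Lock(kTrace);   // allowed by the table
  c.Unlock(kTrace);
  c.Unlock(kRegistry);
}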
diff --git a/libsanitizer/tsan/tsan_rtl.h b/libsanitizer/tsan/tsan_rtl.h
index eab83704240..3175847a880 100644
--- a/libsanitizer/tsan/tsan_rtl.h
+++ b/libsanitizer/tsan/tsan_rtl.h
@@ -38,6 +38,7 @@
 #include "tsan_defs.h"
 #include "tsan_flags.h"
 #include "tsan_ignoreset.h"
+#include "tsan_ilist.h"
 #include "tsan_mman.h"
 #include "tsan_mutexset.h"
 #include "tsan_platform.h"
@@ -46,6 +47,7 @@
 #include "tsan_stack_trace.h"
 #include "tsan_sync.h"
 #include "tsan_trace.h"
+#include "tsan_vector_clock.h"
 
 #if SANITIZER_WORDSIZE != 64
 # error "ThreadSanitizer is supported only on 64-bit platforms"
@@ -116,7 +118,6 @@ struct Processor {
 #endif
   DenseSlabAllocCache block_cache;
   DenseSlabAllocCache sync_cache;
-  DenseSlabAllocCache clock_cache;
   DDPhysicalThread *dd_pt;
 };
 
@@ -130,67 +131,85 @@ struct ScopedGlobalProcessor {
 };
 #endif
 
+struct TidEpoch {
+  Tid tid;
+  Epoch epoch;
+};
+
+struct TidSlot {
+  Mutex mtx;
+  Sid sid;
+  atomic_uint32_t raw_epoch;
+  ThreadState *thr;
+  Vector<TidEpoch> journal;
+  INode node;
+
+  Epoch epoch() const {
+    return static_cast<Epoch>(atomic_load(&raw_epoch, memory_order_relaxed));
+  }
+
+  void SetEpoch(Epoch v) {
+    atomic_store(&raw_epoch, static_cast<u32>(v), memory_order_relaxed);
+  }
+
+  TidSlot();
+} ALIGNED(SANITIZER_CACHE_LINE_SIZE);
+
 // This struct is stored in TLS.
 struct ThreadState {
   FastState fast_state;
-  // Synch epoch represents the threads's epoch before the last synchronization
-  // action. It allows to reduce number of shadow state updates.
-  // For example, fast_synch_epoch=100, last write to addr X was at epoch=150,
-  // if we are processing write to X from the same thread at epoch=200,
-  // we do nothing, because both writes happen in the same 'synch epoch'.
-  // That is, if another memory access does not race with the former write,
-  // it does not race with the latter as well.
-  // QUESTION: can we can squeeze this into ThreadState::Fast?
-  // E.g. ThreadState::Fast is a 44-bit, 32 are taken by synch_epoch and 12 are
-  // taken by epoch between synchs.
-  // This way we can save one load from tls.
-  u64 fast_synch_epoch;
+  int ignore_sync;
+#if !SANITIZER_GO
+  int ignore_interceptors;
+#endif
+  uptr *shadow_stack_pos;
+
+  // Current position in tctx->trace.Back()->events (Event*).
+  atomic_uintptr_t trace_pos;
+  // PC of the last memory access, used to compute PC deltas in the trace.
+  uptr trace_prev_pc;
+
   // Technically `current` should be a separate THREADLOCAL variable;
   // but it is placed here in order to share cache line with previous fields.
   ThreadState* current;
+
+  atomic_sint32_t pending_signals;
+
+  VectorClock clock;
+
   // This is a slow path flag. On fast path, fast_state.GetIgnoreBit() is read.
   // We do not distinguish between ignoring reads and writes
   // for better performance.
   int ignore_reads_and_writes;
-  atomic_sint32_t pending_signals;
-  int ignore_sync;
   int suppress_reports;
   // Go does not support ignores.
 #if !SANITIZER_GO
   IgnoreSet mop_ignore_set;
   IgnoreSet sync_ignore_set;
-  // C/C++ uses fixed size shadow stack.
-  uptr shadow_stack[kShadowStackSize];
-#else
-  // Go uses malloc-allocated shadow stack with dynamic size.
-  uptr *shadow_stack;
 #endif
+  uptr *shadow_stack;
   uptr *shadow_stack_end;
-  uptr *shadow_stack_pos;
-  RawShadow *racy_shadow_addr;
-  RawShadow racy_state[2];
-  MutexSet mset;
-  ThreadClock clock;
 #if !SANITIZER_GO
   Vector<JmpBuf> jmp_bufs;
-  int ignore_interceptors;
-#endif
-  const Tid tid;
-  const int unique_id;
-  bool in_symbolizer;
+  int in_symbolizer;
   bool in_ignored_lib;
   bool is_inited;
+#endif
+  MutexSet mset;
   bool is_dead;
-  bool is_freeing;
-  bool is_vptr_access;
-  const uptr stk_addr;
-  const uptr stk_size;
-  const uptr tls_addr;
-  const uptr tls_size;
+  const Tid tid;
+  uptr stk_addr;
+  uptr stk_size;
+  uptr tls_addr;
+  uptr tls_size;
   ThreadContext *tctx;
 
   DDLogicalThread *dd_lt;
 
+  TidSlot *slot;
+  uptr slot_epoch;
+  bool slot_locked;
+
   // Current wired Processor, or nullptr. Required to handle any events.
   Processor *proc1;
 #if !SANITIZER_GO
@@ -204,7 +223,7 @@ struct ThreadState {
 
 #if !SANITIZER_GO
   StackID last_sleep_stack_id;
-  ThreadClock last_sleep_clock;
+  VectorClock last_sleep_clock;
 #endif
 
   // Set in regions of runtime that must be signal-safe and fork-safe.
@@ -213,16 +232,7 @@ struct ThreadState {
 
   const ReportDesc *current_report;
 
-  // Current position in tctx->trace.Back()->events (Event*).
-  atomic_uintptr_t trace_pos;
-  // PC of the last memory access, used to compute PC deltas in the trace.
-  uptr trace_prev_pc;
-  Sid sid;
-  Epoch epoch;
-
-  explicit ThreadState(Context *ctx, Tid tid, int unique_id, u64 epoch,
-                       unsigned reuse_count, uptr stk_addr, uptr stk_size,
-                       uptr tls_addr, uptr tls_size);
+  explicit ThreadState(Tid tid);
 } ALIGNED(SANITIZER_CACHE_LINE_SIZE);
 
 #if !SANITIZER_GO
@@ -256,14 +266,9 @@ class ThreadContext final : public ThreadContextBase {
   ~ThreadContext();
   ThreadState *thr;
   StackID creation_stack_id;
-  SyncClock sync;
-  // Epoch at which the thread had started.
-  // If we see an event from the thread stamped by an older epoch,
-  // the event is from a dead thread that shared tid with this thread.
-  u64 epoch0;
-  u64 epoch1;
-
-  v3::Trace trace;
+  VectorClock *sync;
+  uptr sync_epoch;
+  Trace trace;
 
   // Override superclass callbacks.
   void OnDead() override;
@@ -323,7 +328,21 @@ struct Context {
   Flags flags;
   fd_t memprof_fd;
 
+  // The last slot index (kFreeSid) is used to denote freed memory.
+  TidSlot slots[kThreadSlotCount - 1];
+
+  // Protects global_epoch, slot_queue, trace_part_recycle.
   Mutex slot_mtx;
+  // Prevents lock order inversions when we lock more than 1 slot.
+  Mutex multi_slot_mtx;
+  uptr global_epoch;  // guarded by slot_mtx and by all slot mutexes
+  bool resetting;     // global reset is in progress
+  IList<TidSlot, &TidSlot::node> slot_queue GUARDED_BY(slot_mtx);
+  IList<TraceHeader, &TraceHeader::global, TracePart> trace_part_recycle
+      GUARDED_BY(slot_mtx);
+  uptr trace_part_total_allocated GUARDED_BY(slot_mtx);
+  uptr trace_part_recycle_finished GUARDED_BY(slot_mtx);
+  uptr trace_part_finished_excess GUARDED_BY(slot_mtx);
 };
 
 extern Context *ctx;  // The one and the only global runtime context.
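
Context now leans on Clang's thread-safety annotations: slot_queue, trace_part_recycle and the trace-part counters are GUARDED_BY(slot_mtx), TracePartFree is declared REQUIRES(ctx->slot_mtx), and the slot entry points carry ACQUIRE/RELEASE, so a -Wthread-safety build statically flags accesses made without the right lock. A small self-contained example of the same annotation style, using a plain wrapper Mutex rather than the sanitizer's:

// Build with: clang++ -Wthread-safety example.cpp
#include <mutex>

class __attribute__((capability("mutex"))) Mutex {
 public:
  void Lock() __attribute__((acquire_capability())) { m_.lock(); }
  void Unlock() __attribute__((release_capability())) { m_.unlock(); }
 private:
  std::mutex m_;
};

#define GUARDED_BY(x) __attribute__((guarded_by(x)))
#define REQUIRES(...) __attribute__((requires_capability(__VA_ARGS__)))

struct Ctx {
  Mutex slot_mtx;
  int trace_part_total_allocated GUARDED_BY(slot_mtx) = 0;
};

// Caller must hold ctx->slot_mtx, mirroring TracePartFree's REQUIRES.
void BumpCounter(Ctx* ctx) REQUIRES(ctx->slot_mtx) {
  ctx->trace_part_total_allocated++;
}

int main() {
  Ctx ctx;
  ctx.slot_mtx.Lock();
  BumpCounter(&ctx);     // fine: the lock is held
  ctx.slot_mtx.Unlock();
  // BumpCounter(&ctx);  // would be flagged by -Wthread-safety
}

NO_THREAD_SAFETY_ANALYSIS, as used on DoReset and SlotLock in this patch, is the escape hatch for functions whose locking pattern (for example, locking all slots in a loop) the analysis cannot express.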
@@ -352,14 +371,13 @@ uptr TagFromShadowStackFrame(uptr pc);
 
 class ScopedReportBase {
  public:
-  void AddMemoryAccess(uptr addr, uptr external_tag, Shadow s, StackTrace stack,
-                       const MutexSet *mset);
+  void AddMemoryAccess(uptr addr, uptr external_tag, Shadow s, Tid tid,
+                       StackTrace stack, const MutexSet *mset);
   void AddStack(StackTrace stack, bool suppressable = false);
   void AddThread(const ThreadContext *tctx, bool suppressable = false);
-  void AddThread(Tid unique_tid, bool suppressable = false);
+  void AddThread(Tid tid, bool suppressable = false);
   void AddUniqueTid(Tid unique_tid);
-  void AddMutex(const SyncVar *s);
-  u64 AddMutex(u64 id);
+  int AddMutex(uptr addr, StackID creation_stack_id);
   void AddLocation(uptr addr, uptr size);
   void AddSleep(StackID stack_id);
   void SetCount(int count);
@@ -376,8 +394,6 @@ class ScopedReportBase {
   // at best it will cause deadlocks on internal mutexes.
   ScopedIgnoreInterceptors ignore_interceptors_;
 
-  void AddDeadMutex(u64 id);
-
   ScopedReportBase(const ScopedReportBase &) = delete;
   void operator=(const ScopedReportBase &) = delete;
 };
@@ -393,8 +409,6 @@ class ScopedReport : public ScopedReportBase {
 
 bool ShouldReport(ThreadState *thr, ReportType typ);
 ThreadContext *IsThreadStackOrTls(uptr addr, bool *is_stack);
-void RestoreStack(Tid tid, const u64 epoch, VarSizeStackTrace *stk,
-                  MutexSet *mset, uptr *tag = nullptr);
 
 // The stack could look like:
 //   <start> | <main> | <foo> | tag | <bar>
@@ -442,7 +456,8 @@ void ForkBefore(ThreadState *thr, uptr pc);
 void ForkParentAfter(ThreadState *thr, uptr pc);
 void ForkChildAfter(ThreadState *thr, uptr pc, bool start_thread);
 
-void ReportRace(ThreadState *thr);
+void ReportRace(ThreadState *thr, RawShadow *shadow_mem, Shadow cur, Shadow old,
+                AccessType typ);
 bool OutputReport(ThreadState *thr, const ScopedReport &srep);
 bool IsFiredSuppression(Context *ctx, ReportType type, StackTrace trace);
 bool IsExpectedReport(uptr addr, uptr size);
@@ -472,55 +487,28 @@ int Finalize(ThreadState *thr);
 void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write);
 void OnUserFree(ThreadState *thr, uptr pc, uptr p, bool write);
 
-void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
-    int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic);
-void MemoryAccessImpl(ThreadState *thr, uptr addr,
-    int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic,
-    u64 *shadow_mem, Shadow cur);
-void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr,
-    uptr size, bool is_write);
+void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size,
+                  AccessType typ);
 void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size,
                            AccessType typ);
-
-const int kSizeLog1 = 0;
-const int kSizeLog2 = 1;
-const int kSizeLog4 = 2;
-const int kSizeLog8 = 3;
+// This creates 2 non-inlined specialized versions of MemoryAccessRange.
+template <bool is_read>
+void MemoryAccessRangeT(ThreadState *thr, uptr pc, uptr addr, uptr size);
 
 ALWAYS_INLINE
-void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size,
-                  AccessType typ) {
-  int size_log;
-  switch (size) {
-    case 1:
-      size_log = kSizeLog1;
-      break;
-    case 2:
-      size_log = kSizeLog2;
-      break;
-    case 4:
-      size_log = kSizeLog4;
-      break;
-    default:
-      DCHECK_EQ(size, 8);
-      size_log = kSizeLog8;
-      break;
-  }
-  bool is_write = !(typ & kAccessRead);
-  bool is_atomic = typ & kAccessAtomic;
-  if (typ & kAccessVptr)
-    thr->is_vptr_access = true;
-  if (typ & kAccessFree)
-    thr->is_freeing = true;
-  MemoryAccess(thr, pc, addr, size_log, is_write, is_atomic);
-  if (typ & kAccessVptr)
-    thr->is_vptr_access = false;
-  if (typ & kAccessFree)
-    thr->is_freeing = false;
+void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size,
+                       bool is_write) {
+  if (size == 0)
+    return;
+  if (is_write)
+    MemoryAccessRangeT<false>(thr, pc, addr, size);
+  else
+    MemoryAccessRangeT<true>(thr, pc, addr, size);
 }
 
-void MemoryResetRange(ThreadState *thr, uptr pc, uptr addr, uptr size);
+void ShadowSet(RawShadow *p, RawShadow *end, RawShadow v);
 void MemoryRangeFreed(ThreadState *thr, uptr pc, uptr addr, uptr size);
+void MemoryResetRange(ThreadState *thr, uptr pc, uptr addr, uptr size);
 void MemoryRangeImitateWrite(ThreadState *thr, uptr pc, uptr addr, uptr size);
 void MemoryRangeImitateWriteOrResetRange(ThreadState *thr, uptr pc, uptr addr,
                                          uptr size);
@@ -530,9 +518,6 @@ void ThreadIgnoreEnd(ThreadState *thr);
 void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc);
 void ThreadIgnoreSyncEnd(ThreadState *thr);
 
-void FuncEntry(ThreadState *thr, uptr pc);
-void FuncExit(ThreadState *thr);
-
 Tid ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached);
 void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id,
                  ThreadType thread_type);
@@ -578,11 +563,7 @@ void Release(ThreadState *thr, uptr pc, uptr addr);
 void ReleaseStoreAcquire(ThreadState *thr, uptr pc, uptr addr);
 void ReleaseStore(ThreadState *thr, uptr pc, uptr addr);
 void AfterSleep(ThreadState *thr, uptr pc);
-void AcquireImpl(ThreadState *thr, uptr pc, SyncClock *c);
-void ReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c);
-void ReleaseStoreAcquireImpl(ThreadState *thr, uptr pc, SyncClock *c);
-void ReleaseStoreImpl(ThreadState *thr, uptr pc, SyncClock *c);
-void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c);
+void IncrementEpoch(ThreadState *thr);
 
 // The hacky call uses custom calling convention and an assembly thunk.
 // It is considerably faster than a normal call for the caller
@@ -605,40 +586,19 @@ void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c);
 #define HACKY_CALL(f) f()
 #endif
 
-void TraceSwitch(ThreadState *thr);
-uptr TraceTopPC(ThreadState *thr);
-uptr TraceSize();
-uptr TraceParts();
-Trace *ThreadTrace(Tid tid);
-
-extern "C" void __tsan_trace_switch();
-void ALWAYS_INLINE TraceAddEvent(ThreadState *thr, FastState fs,
-                                        EventType typ, u64 addr) {
-  if (!kCollectHistory)
-    return;
-  DCHECK_GE((int)typ, 0);
-  DCHECK_LE((int)typ, 7);
-  DCHECK_EQ(GetLsb(addr, kEventPCBits), addr);
-  u64 pos = fs.GetTracePos();
-  if (UNLIKELY((pos % kTracePartSize) == 0)) {
-#if !SANITIZER_GO
-    HACKY_CALL(__tsan_trace_switch);
-#else
-    TraceSwitch(thr);
-#endif
-  }
-  Event *trace = (Event*)GetThreadTrace(fs.tid());
-  Event *evp = &trace[pos];
-  Event ev = (u64)addr | ((u64)typ << kEventPCBits);
-  *evp = ev;
-}
-
 #if !SANITIZER_GO
 uptr ALWAYS_INLINE HeapEnd() {
   return HeapMemEnd() + PrimaryAllocator::AdditionalSize();
 }
 #endif
 
+void SlotAttachAndLock(ThreadState *thr) ACQUIRE(thr->slot->mtx);
+void SlotDetach(ThreadState *thr);
+void SlotLock(ThreadState *thr) ACQUIRE(thr->slot->mtx);
+void SlotUnlock(ThreadState *thr) RELEASE(thr->slot->mtx);
+void DoReset(ThreadState *thr, uptr epoch);
+void FlushShadowMemory();
+
 ThreadState *FiberCreate(ThreadState *thr, uptr pc, unsigned flags);
 void FiberDestroy(ThreadState *thr, uptr pc, ThreadState *fiber);
 void FiberSwitch(ThreadState *thr, uptr pc, ThreadState *fiber, unsigned flags);
@@ -649,6 +609,53 @@ enum FiberSwitchFlags {
   FiberSwitchFlagNoSync = 1 << 0, // __tsan_switch_to_fiber_no_sync
 };
 
+class SlotPairLocker {
+ public:
+  SlotPairLocker(ThreadState *thr, Sid sid);
+  ~SlotPairLocker();
+
+ private:
+  ThreadState *thr_;
+  TidSlot *slot_;
+};
+
+class SlotLocker {
+ public:
+  ALWAYS_INLINE
+  SlotLocker(ThreadState *thr, bool recursive = false)
+      : thr_(thr), locked_(recursive ? thr->slot_locked : false) {
+    if (!locked_)
+      SlotLock(thr_);
+  }
+
+  ALWAYS_INLINE
+  ~SlotLocker() {
+    if (!locked_)
+      SlotUnlock(thr_);
+  }
+
+ private:
+  ThreadState *thr_;
+  bool locked_;
+};
+
+class SlotUnlocker {
+ public:
+  SlotUnlocker(ThreadState *thr) : thr_(thr), locked_(thr->slot_locked) {
+    if (locked_)
+      SlotUnlock(thr_);
+  }
+
+  ~SlotUnlocker() {
+    if (locked_)
+      SlotLock(thr_);
+  }
+
+ private:
+  ThreadState *thr_;
+  bool locked_;
+};
+
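A minimal usage sketch of the RAII guards defined above (illustrative only, not part of the patch; the function name and the work done inside the scopes are assumptions):

void ExampleRuntimeOperation(ThreadState *thr) {
  SlotLocker locker(thr);        // acquires thr->slot->mtx unless already held
  // ... mutate per-slot state that requires the slot lock ...
  {
    SlotUnlocker unlocker(thr);  // temporarily drops the lock if we hold it
    // ... work that must not run under the slot lock (e.g. reporting) ...
  }                              // SlotUnlocker re-acquires the lock here
}                                // SlotLocker releases the lock on scope exit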
 ALWAYS_INLINE void ProcessPendingSignals(ThreadState *thr) {
   if (UNLIKELY(atomic_load_relaxed(&thr->pending_signals)))
     ProcessPendingSignalsImpl(thr);
@@ -667,16 +674,19 @@ void LazyInitialize(ThreadState *thr) {
 #endif
 }
 
-namespace v3 {
-
+void TraceResetForTesting();
 void TraceSwitchPart(ThreadState *thr);
-bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr,
-                  uptr size, AccessType typ, VarSizeStackTrace *pstk,
+void TraceSwitchPartImpl(ThreadState *thr);
+bool RestoreStack(EventType type, Sid sid, Epoch epoch, uptr addr, uptr size,
+                  AccessType typ, Tid *ptid, VarSizeStackTrace *pstk,
                   MutexSet *pmset, uptr *ptag);
 
 template <typename EventT>
 ALWAYS_INLINE WARN_UNUSED_RESULT bool TraceAcquire(ThreadState *thr,
                                                    EventT **ev) {
+  // TraceSwitchPart accesses shadow_stack, but it's called infrequently,
+  // so we check it here proactively.
+  DCHECK(thr->shadow_stack);
   Event *pos = reinterpret_cast<Event *>(atomic_load_relaxed(&thr->trace_pos));
 #if SANITIZER_DEBUG
   // TraceSwitch acquires these mutexes,
@@ -747,20 +757,16 @@ void TraceMutexLock(ThreadState *thr, EventType type, uptr pc, uptr addr,
 void TraceMutexUnlock(ThreadState *thr, uptr addr);
 void TraceTime(ThreadState *thr);
 
-}  // namespace v3
+void TraceRestartFuncExit(ThreadState *thr);
+void TraceRestartFuncEntry(ThreadState *thr, uptr pc);
 
 void GrowShadowStack(ThreadState *thr);
 
 ALWAYS_INLINE
 void FuncEntry(ThreadState *thr, uptr pc) {
-  DPrintf2("#%d: FuncEntry %p\n", (int)thr->fast_state.tid(), (void *)pc);
-  if (kCollectHistory) {
-    thr->fast_state.IncrementEpoch();
-    TraceAddEvent(thr, thr->fast_state, EventTypeFuncEnter, pc);
-  }
-
-  // Shadow stack maintenance can be replaced with
-  // stack unwinding during trace switch (which presumably must be faster).
+  DPrintf2("#%d: FuncEntry %p\n", (int)thr->fast_state.sid(), (void *)pc);
+  if (UNLIKELY(!TryTraceFunc(thr, pc)))
+    return TraceRestartFuncEntry(thr, pc);
   DCHECK_GE(thr->shadow_stack_pos, thr->shadow_stack);
 #if !SANITIZER_GO
   DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end);
@@ -774,12 +780,9 @@ void FuncEntry(ThreadState *thr, uptr pc) {
 
 ALWAYS_INLINE
 void FuncExit(ThreadState *thr) {
-  DPrintf2("#%d: FuncExit\n", (int)thr->fast_state.tid());
-  if (kCollectHistory) {
-    thr->fast_state.IncrementEpoch();
-    TraceAddEvent(thr, thr->fast_state, EventTypeFuncExit, 0);
-  }
-
+  DPrintf2("#%d: FuncExit\n", (int)thr->fast_state.sid());
+  if (UNLIKELY(!TryTraceFunc(thr, 0)))
+    return TraceRestartFuncExit(thr);
   DCHECK_GT(thr->shadow_stack_pos, thr->shadow_stack);
 #if !SANITIZER_GO
   DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end);
@@ -791,7 +794,6 @@ void FuncExit(ThreadState *thr) {
 extern void (*on_initialize)(void);
 extern int (*on_finalize)(int);
 #endif
-
 }  // namespace __tsan
 
 #endif  // TSAN_RTL_H
diff --git a/libsanitizer/tsan/tsan_rtl_access.cpp b/libsanitizer/tsan/tsan_rtl_access.cpp
index 7365fdaa303..76e269e2ed2 100644
--- a/libsanitizer/tsan/tsan_rtl_access.cpp
+++ b/libsanitizer/tsan/tsan_rtl_access.cpp
@@ -15,15 +15,13 @@
 
 namespace __tsan {
 
-namespace v3 {
-
-ALWAYS_INLINE USED bool TryTraceMemoryAccess(ThreadState *thr, uptr pc,
+ALWAYS_INLINE USED bool TryTraceMemoryAccess(ThreadState* thr, uptr pc,
                                              uptr addr, uptr size,
                                              AccessType typ) {
   DCHECK(size == 1 || size == 2 || size == 4 || size == 8);
   if (!kCollectHistory)
     return true;
-  EventAccess *ev;
+  EventAccess* ev;
   if (UNLIKELY(!TraceAcquire(thr, &ev)))
     return false;
   u64 size_log = size == 1 ? 0 : size == 2 ? 1 : size == 4 ? 2 : 3;
@@ -40,25 +38,27 @@ ALWAYS_INLINE USED bool TryTraceMemoryAccess(ThreadState *thr, uptr pc,
     TraceRelease(thr, ev);
     return true;
   }
-  auto *evex = reinterpret_cast<EventAccessExt *>(ev);
+  auto* evex = reinterpret_cast<EventAccessExt*>(ev);
   evex->is_access = 0;
   evex->is_func = 0;
   evex->type = EventType::kAccessExt;
   evex->is_read = !!(typ & kAccessRead);
   evex->is_atomic = !!(typ & kAccessAtomic);
   evex->size_log = size_log;
+  // Note: this is important, see comment in EventAccessExt.
+  evex->_ = 0;
   evex->addr = CompressAddr(addr);
   evex->pc = pc;
   TraceRelease(thr, evex);
   return true;
 }
 
-ALWAYS_INLINE USED bool TryTraceMemoryAccessRange(ThreadState *thr, uptr pc,
-                                                  uptr addr, uptr size,
-                                                  AccessType typ) {
+ALWAYS_INLINE
+bool TryTraceMemoryAccessRange(ThreadState* thr, uptr pc, uptr addr, uptr size,
+                               AccessType typ) {
   if (!kCollectHistory)
     return true;
-  EventAccessRange *ev;
+  EventAccessRange* ev;
   if (UNLIKELY(!TraceAcquire(thr, &ev)))
     return false;
   thr->trace_prev_pc = pc;
@@ -75,7 +75,7 @@ ALWAYS_INLINE USED bool TryTraceMemoryAccessRange(ThreadState *thr, uptr pc,
   return true;
 }
 
-void TraceMemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size,
+void TraceMemoryAccessRange(ThreadState* thr, uptr pc, uptr addr, uptr size,
                             AccessType typ) {
   if (LIKELY(TryTraceMemoryAccessRange(thr, pc, addr, size, typ)))
     return;
@@ -84,7 +84,7 @@ void TraceMemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size,
   DCHECK(res);
 }
 
-void TraceFunc(ThreadState *thr, uptr pc) {
+void TraceFunc(ThreadState* thr, uptr pc) {
   if (LIKELY(TryTraceFunc(thr, pc)))
     return;
   TraceSwitchPart(thr);
@@ -92,7 +92,17 @@ void TraceFunc(ThreadState *thr, uptr pc) {
   DCHECK(res);
 }
 
-void TraceMutexLock(ThreadState *thr, EventType type, uptr pc, uptr addr,
+NOINLINE void TraceRestartFuncEntry(ThreadState* thr, uptr pc) {
+  TraceSwitchPart(thr);
+  FuncEntry(thr, pc);
+}
+
+NOINLINE void TraceRestartFuncExit(ThreadState* thr) {
+  TraceSwitchPart(thr);
+  FuncExit(thr);
+}
+
+void TraceMutexLock(ThreadState* thr, EventType type, uptr pc, uptr addr,
                     StackID stk) {
   DCHECK(type == EventType::kLock || type == EventType::kRLock);
   if (!kCollectHistory)
@@ -109,7 +119,7 @@ void TraceMutexLock(ThreadState *thr, EventType type, uptr pc, uptr addr,
   TraceEvent(thr, ev);
 }
 
-void TraceMutexUnlock(ThreadState *thr, uptr addr) {
+void TraceMutexUnlock(ThreadState* thr, uptr addr) {
   if (!kCollectHistory)
     return;
   EventUnlock ev;
@@ -121,396 +131,485 @@ void TraceMutexUnlock(ThreadState *thr, uptr addr) {
   TraceEvent(thr, ev);
 }
 
-void TraceTime(ThreadState *thr) {
+void TraceTime(ThreadState* thr) {
   if (!kCollectHistory)
     return;
+  FastState fast_state = thr->fast_state;
   EventTime ev;
   ev.is_access = 0;
   ev.is_func = 0;
   ev.type = EventType::kTime;
-  ev.sid = static_cast<u64>(thr->sid);
-  ev.epoch = static_cast<u64>(thr->epoch);
+  ev.sid = static_cast<u64>(fast_state.sid());
+  ev.epoch = static_cast<u64>(fast_state.epoch());
   ev._ = 0;
   TraceEvent(thr, ev);
 }
 
-}  // namespace v3
+ALWAYS_INLINE RawShadow LoadShadow(RawShadow* p) {
+  return static_cast<RawShadow>(
+      atomic_load((atomic_uint32_t*)p, memory_order_relaxed));
+}
 
-ALWAYS_INLINE
-Shadow LoadShadow(u64 *p) {
-  u64 raw = atomic_load((atomic_uint64_t *)p, memory_order_relaxed);
-  return Shadow(raw);
+ALWAYS_INLINE void StoreShadow(RawShadow* sp, RawShadow s) {
+  atomic_store((atomic_uint32_t*)sp, static_cast<u32>(s), memory_order_relaxed);
 }
 
-ALWAYS_INLINE
-void StoreShadow(u64 *sp, u64 s) {
-  atomic_store((atomic_uint64_t *)sp, s, memory_order_relaxed);
+NOINLINE void DoReportRace(ThreadState* thr, RawShadow* shadow_mem, Shadow cur,
+                           Shadow old,
+                           AccessType typ) NO_THREAD_SAFETY_ANALYSIS {
+  // For the free shadow markers the first element (the one that contains
+  // kFreeSid) triggers the race, but the second element contains info about
+  // the freeing thread, so take that one instead.
+  if (old.sid() == kFreeSid)
+    old = Shadow(LoadShadow(&shadow_mem[1]));
+  // This prevents trapping on this address in the future.
+  for (uptr i = 0; i < kShadowCnt; i++)
+    StoreShadow(&shadow_mem[i], i == 0 ? Shadow::kRodata : Shadow::kEmpty);
+  // See the comment in MemoryRangeFreed as to why the slot is locked
+  // for free memory accesses. ReportRace must not be called with
+  // the slot locked because of the fork. But MemoryRangeFreed is not
+  // called during fork because fork sets ignore_reads_and_writes,
+  // so simply unlocking the slot should be fine.
+  if (typ & kAccessFree)
+    SlotUnlock(thr);
+  ReportRace(thr, shadow_mem, cur, Shadow(old), typ);
+  if (typ & kAccessFree)
+    SlotLock(thr);
 }
 
+#if !TSAN_VECTORIZE
 ALWAYS_INLINE
-void StoreIfNotYetStored(u64 *sp, u64 *s) {
-  StoreShadow(sp, *s);
-  *s = 0;
+bool ContainsSameAccess(RawShadow* s, Shadow cur, int unused0, int unused1,
+                        AccessType typ) {
+  for (uptr i = 0; i < kShadowCnt; i++) {
+    auto old = LoadShadow(&s[i]);
+    if (!(typ & kAccessRead)) {
+      if (old == cur.raw())
+        return true;
+      continue;
+    }
+    auto masked = static_cast<RawShadow>(static_cast<u32>(old) |
+                                         static_cast<u32>(Shadow::kRodata));
+    if (masked == cur.raw())
+      return true;
+    if (!(typ & kAccessNoRodata) && !SANITIZER_GO) {
+      if (old == Shadow::kRodata)
+        return true;
+    }
+  }
+  return false;
 }
 
-extern "C" void __tsan_report_race();
-
 ALWAYS_INLINE
-void HandleRace(ThreadState *thr, u64 *shadow_mem, Shadow cur, Shadow old) {
-  thr->racy_state[0] = cur.raw();
-  thr->racy_state[1] = old.raw();
-  thr->racy_shadow_addr = shadow_mem;
-#if !SANITIZER_GO
-  HACKY_CALL(__tsan_report_race);
-#else
-  ReportRace(thr);
-#endif
+bool CheckRaces(ThreadState* thr, RawShadow* shadow_mem, Shadow cur,
+                int unused0, int unused1, AccessType typ) {
+  bool stored = false;
+  for (uptr idx = 0; idx < kShadowCnt; idx++) {
+    RawShadow* sp = &shadow_mem[idx];
+    Shadow old(LoadShadow(sp));
+    if (LIKELY(old.raw() == Shadow::kEmpty)) {
+      if (!(typ & kAccessCheckOnly) && !stored)
+        StoreShadow(sp, cur.raw());
+      return false;
+    }
+    if (LIKELY(!(cur.access() & old.access())))
+      continue;
+    if (LIKELY(cur.sid() == old.sid())) {
+      if (!(typ & kAccessCheckOnly) &&
+          LIKELY(cur.access() == old.access() && old.IsRWWeakerOrEqual(typ))) {
+        StoreShadow(sp, cur.raw());
+        stored = true;
+      }
+      continue;
+    }
+    if (LIKELY(old.IsBothReadsOrAtomic(typ)))
+      continue;
+    if (LIKELY(thr->clock.Get(old.sid()) >= old.epoch()))
+      continue;
+    DoReportRace(thr, shadow_mem, cur, old, typ);
+    return true;
+  }
+  // We did not find any races and had already stored
+  // the current access info, so we are done.
+  if (LIKELY(stored))
+    return false;
+  // Choose a random candidate slot and replace it.
+  uptr index =
+      atomic_load_relaxed(&thr->trace_pos) / sizeof(Event) % kShadowCnt;
+  StoreShadow(&shadow_mem[index], cur.raw());
+  return false;
 }
 
-static inline bool HappensBefore(Shadow old, ThreadState *thr) {
-  return thr->clock.get(old.TidWithIgnore()) >= old.epoch();
-}
+#  define LOAD_CURRENT_SHADOW(cur, shadow_mem) UNUSED int access = 0, shadow = 0
 
-ALWAYS_INLINE
-void MemoryAccessImpl1(ThreadState *thr, uptr addr, int kAccessSizeLog,
-                       bool kAccessIsWrite, bool kIsAtomic, u64 *shadow_mem,
-                       Shadow cur) {
-  // This potentially can live in an MMX/SSE scratch register.
-  // The required intrinsics are:
-  // __m128i _mm_move_epi64(__m128i*);
-  // _mm_storel_epi64(u64*, __m128i);
-  u64 store_word = cur.raw();
-  bool stored = false;
+#else /* !TSAN_VECTORIZE */
 
-  // scan all the shadow values and dispatch to 4 categories:
-  // same, replace, candidate and race (see comments below).
-  // we consider only 3 cases regarding access sizes:
-  // equal, intersect and not intersect. initially I considered
-  // larger and smaller as well, it allowed to replace some
-  // 'candidates' with 'same' or 'replace', but I think
-  // it's just not worth it (performance- and complexity-wise).
-
-  Shadow old(0);
-
-  // It release mode we manually unroll the loop,
-  // because empirically gcc generates better code this way.
-  // However, we can't afford unrolling in debug mode, because the function
-  // consumes almost 4K of stack. Gtest gives only 4K of stack to death test
-  // threads, which is not enough for the unrolled loop.
-#if SANITIZER_DEBUG
-  for (int idx = 0; idx < 4; idx++) {
-#  include "tsan_update_shadow_word.inc"
-  }
-#else
-  int idx = 0;
-#  include "tsan_update_shadow_word.inc"
-  idx = 1;
-  if (stored) {
-#  include "tsan_update_shadow_word.inc"
-  } else {
-#  include "tsan_update_shadow_word.inc"
-  }
-  idx = 2;
-  if (stored) {
-#  include "tsan_update_shadow_word.inc"
-  } else {
-#  include "tsan_update_shadow_word.inc"
+ALWAYS_INLINE
+bool ContainsSameAccess(RawShadow* unused0, Shadow unused1, m128 shadow,
+                        m128 access, AccessType typ) {
+  // Note: we could check if there is a larger access of the same type,
+  // e.g. we just allocated/memset-ed a block (so it contains 8-byte writes)
+  // and now do smaller reads/writes; these could also be considered the "same
+  // access". However, it would make the check more expensive, so it's unclear
+  // if it's worth it. But it would conserve trace space, so it's useful
+  // beyond the potential speedup.
+  if (!(typ & kAccessRead)) {
+    const m128 same = _mm_cmpeq_epi32(shadow, access);
+    return _mm_movemask_epi8(same);
   }
-  idx = 3;
-  if (stored) {
-#  include "tsan_update_shadow_word.inc"
-  } else {
-#  include "tsan_update_shadow_word.inc"
+  // For reads we need to OR the read bit into the stored shadow values,
+  // because a read must match both earlier reads and writes.
+  // Shadow::kRodata has only the read bit set, so it does what we want.
+  // We also reuse it for the rodata check to save a few cycles,
+  // since we already loaded Shadow::kRodata into a register.
+  // Reads from rodata can't race.
+  // Measurements show that they can be 10-20% of all memory accesses.
+  // Shadow::kRodata has epoch 0, which cannot appear in the shadow normally
+  // (thread epochs start from 1), so the same read bit mask
+  // also serves as a rodata indicator.
+  const m128 read_mask = _mm_set1_epi32(static_cast<u32>(Shadow::kRodata));
+  const m128 masked_shadow = _mm_or_si128(shadow, read_mask);
+  m128 same = _mm_cmpeq_epi32(masked_shadow, access);
+  // Range memory accesses check Shadow::kRodata before calling this,
+  // Shadow::kRodata is not possible for free memory accesses,
+  // and Go does not use Shadow::kRodata.
+  if (!(typ & kAccessNoRodata) && !SANITIZER_GO) {
+    const m128 ro = _mm_cmpeq_epi32(shadow, read_mask);
+    same = _mm_or_si128(ro, same);
   }
-#endif
+  return _mm_movemask_epi8(same);
+}
 
-  // we did not find any races and had already stored
-  // the current access info, so we are done
-  if (LIKELY(stored))
-    return;
-  // choose a random candidate slot and replace it
-  StoreShadow(shadow_mem + (cur.epoch() % kShadowCnt), store_word);
-  return;
-RACE:
-  HandleRace(thr, shadow_mem, cur, old);
-  return;
-}
-
-void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size,
-                           AccessType typ) {
-  DCHECK(!(typ & kAccessAtomic));
-  const bool kAccessIsWrite = !(typ & kAccessRead);
-  const bool kIsAtomic = false;
-  while (size) {
-    int size1 = 1;
-    int kAccessSizeLog = kSizeLog1;
-    if (size >= 8 && (addr & ~7) == ((addr + 7) & ~7)) {
-      size1 = 8;
-      kAccessSizeLog = kSizeLog8;
-    } else if (size >= 4 && (addr & ~7) == ((addr + 3) & ~7)) {
-      size1 = 4;
-      kAccessSizeLog = kSizeLog4;
-    } else if (size >= 2 && (addr & ~7) == ((addr + 1) & ~7)) {
-      size1 = 2;
-      kAccessSizeLog = kSizeLog2;
-    }
-    MemoryAccess(thr, pc, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic);
-    addr += size1;
-    size -= size1;
+NOINLINE void DoReportRaceV(ThreadState* thr, RawShadow* shadow_mem, Shadow cur,
+                            u32 race_mask, m128 shadow, AccessType typ) {
+  // race_mask indicates which of the shadow elements raced with the current
+  // access. Extract that element.
+  CHECK_NE(race_mask, 0);
+  u32 old;
+  // Note: _mm_extract_epi32 index must be a constant value.
+  switch (__builtin_ffs(race_mask) / 4) {
+    case 0:
+      old = _mm_extract_epi32(shadow, 0);
+      break;
+    case 1:
+      old = _mm_extract_epi32(shadow, 1);
+      break;
+    case 2:
+      old = _mm_extract_epi32(shadow, 2);
+      break;
+    case 3:
+      old = _mm_extract_epi32(shadow, 3);
+      break;
   }
+  Shadow prev(static_cast<RawShadow>(old));
+  // For the free shadow markers the first element (the one that contains
+  // kFreeSid) triggers the race, but the second element contains info about
+  // the freeing thread, so take that one instead.
+  if (prev.sid() == kFreeSid)
+    prev = Shadow(static_cast<RawShadow>(_mm_extract_epi32(shadow, 1)));
+  DoReportRace(thr, shadow_mem, cur, prev, typ);
 }
 
 ALWAYS_INLINE
-bool ContainsSameAccessSlow(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
-  Shadow cur(a);
-  for (uptr i = 0; i < kShadowCnt; i++) {
-    Shadow old(LoadShadow(&s[i]));
-    if (Shadow::Addr0AndSizeAreEqual(cur, old) &&
-        old.TidWithIgnore() == cur.TidWithIgnore() &&
-        old.epoch() > sync_epoch && old.IsAtomic() == cur.IsAtomic() &&
-        old.IsRead() <= cur.IsRead())
-      return true;
+bool CheckRaces(ThreadState* thr, RawShadow* shadow_mem, Shadow cur,
+                m128 shadow, m128 access, AccessType typ) {
+  // Note: empty/zero slots don't intersect with any access.
+  const m128 zero = _mm_setzero_si128();
+  const m128 mask_access = _mm_set1_epi32(0x000000ff);
+  const m128 mask_sid = _mm_set1_epi32(0x0000ff00);
+  const m128 mask_read_atomic = _mm_set1_epi32(0xc0000000);
+  const m128 access_and = _mm_and_si128(access, shadow);
+  const m128 access_xor = _mm_xor_si128(access, shadow);
+  const m128 intersect = _mm_and_si128(access_and, mask_access);
+  const m128 not_intersect = _mm_cmpeq_epi32(intersect, zero);
+  const m128 not_same_sid = _mm_and_si128(access_xor, mask_sid);
+  const m128 same_sid = _mm_cmpeq_epi32(not_same_sid, zero);
+  const m128 both_read_or_atomic = _mm_and_si128(access_and, mask_read_atomic);
+  const m128 no_race =
+      _mm_or_si128(_mm_or_si128(not_intersect, same_sid), both_read_or_atomic);
+  const int race_mask = _mm_movemask_epi8(_mm_cmpeq_epi32(no_race, zero));
+  if (UNLIKELY(race_mask))
+    goto SHARED;
+
+STORE : {
+  if (typ & kAccessCheckOnly)
+    return false;
+  // We could also replace different sids if the access is the same,
+  // rw-weaker and happens-before. However, just the access check below
+  // is not enough because we also need to check that !both_read_or_atomic
+  // (reads from different sids can be concurrent).
+  // Theoretically we could replace smaller accesses with larger accesses,
+  // but it's unclear if it's worth doing.
+  const m128 mask_access_sid = _mm_set1_epi32(0x0000ffff);
+  const m128 not_same_sid_access = _mm_and_si128(access_xor, mask_access_sid);
+  const m128 same_sid_access = _mm_cmpeq_epi32(not_same_sid_access, zero);
+  const m128 access_read_atomic =
+      _mm_set1_epi32((typ & (kAccessRead | kAccessAtomic)) << 30);
+  const m128 rw_weaker =
+      _mm_cmpeq_epi32(_mm_max_epu32(shadow, access_read_atomic), shadow);
+  const m128 rewrite = _mm_and_si128(same_sid_access, rw_weaker);
+  const int rewrite_mask = _mm_movemask_epi8(rewrite);
+  int index = __builtin_ffs(rewrite_mask);
+  if (UNLIKELY(index == 0)) {
+    const m128 empty = _mm_cmpeq_epi32(shadow, zero);
+    const int empty_mask = _mm_movemask_epi8(empty);
+    index = __builtin_ffs(empty_mask);
+    if (UNLIKELY(index == 0))
+      index = (atomic_load_relaxed(&thr->trace_pos) / 2) % 16;
   }
+  StoreShadow(&shadow_mem[index / 4], cur.raw());
+  // We could zero other slots determined by rewrite_mask.
+  // That would help other threads to evict better slots,
+  // but it's unclear if it's worth it.
   return false;
 }
 
-#if TSAN_VECTORIZE
-#  define SHUF(v0, v1, i0, i1, i2, i3)                    \
-    _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v0), \
-                                    _mm_castsi128_ps(v1), \
-                                    (i0)*1 + (i1)*4 + (i2)*16 + (i3)*64))
-ALWAYS_INLINE
-bool ContainsSameAccessFast(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
-  // This is an optimized version of ContainsSameAccessSlow.
-  // load current access into access[0:63]
-  const m128 access = _mm_cvtsi64_si128(a);
-  // duplicate high part of access in addr0:
-  // addr0[0:31]        = access[32:63]
-  // addr0[32:63]       = access[32:63]
-  // addr0[64:95]       = access[32:63]
-  // addr0[96:127]      = access[32:63]
-  const m128 addr0 = SHUF(access, access, 1, 1, 1, 1);
-  // load 4 shadow slots
-  const m128 shadow0 = _mm_load_si128((__m128i *)s);
-  const m128 shadow1 = _mm_load_si128((__m128i *)s + 1);
-  // load high parts of 4 shadow slots into addr_vect:
-  // addr_vect[0:31]    = shadow0[32:63]
-  // addr_vect[32:63]   = shadow0[96:127]
-  // addr_vect[64:95]   = shadow1[32:63]
-  // addr_vect[96:127]  = shadow1[96:127]
-  m128 addr_vect = SHUF(shadow0, shadow1, 1, 3, 1, 3);
-  if (!is_write) {
-    // set IsRead bit in addr_vect
-    const m128 rw_mask1 = _mm_cvtsi64_si128(1 << 15);
-    const m128 rw_mask = SHUF(rw_mask1, rw_mask1, 0, 0, 0, 0);
-    addr_vect = _mm_or_si128(addr_vect, rw_mask);
-  }
-  // addr0 == addr_vect?
-  const m128 addr_res = _mm_cmpeq_epi32(addr0, addr_vect);
-  // epoch1[0:63]       = sync_epoch
-  const m128 epoch1 = _mm_cvtsi64_si128(sync_epoch);
-  // epoch[0:31]        = sync_epoch[0:31]
-  // epoch[32:63]       = sync_epoch[0:31]
-  // epoch[64:95]       = sync_epoch[0:31]
-  // epoch[96:127]      = sync_epoch[0:31]
-  const m128 epoch = SHUF(epoch1, epoch1, 0, 0, 0, 0);
-  // load low parts of shadow cell epochs into epoch_vect:
-  // epoch_vect[0:31]   = shadow0[0:31]
-  // epoch_vect[32:63]  = shadow0[64:95]
-  // epoch_vect[64:95]  = shadow1[0:31]
-  // epoch_vect[96:127] = shadow1[64:95]
-  const m128 epoch_vect = SHUF(shadow0, shadow1, 0, 2, 0, 2);
-  // epoch_vect >= sync_epoch?
-  const m128 epoch_res = _mm_cmpgt_epi32(epoch_vect, epoch);
-  // addr_res & epoch_res
-  const m128 res = _mm_and_si128(addr_res, epoch_res);
-  // mask[0] = res[7]
-  // mask[1] = res[15]
-  // ...
-  // mask[15] = res[127]
-  const int mask = _mm_movemask_epi8(res);
-  return mask != 0;
+SHARED:
+  m128 thread_epochs = _mm_set1_epi32(0x7fffffff);
+  // Need to unroll this manually because _mm_extract_epi8/_mm_insert_epi32
+  // indexes must be constants.
+#  define LOAD_EPOCH(idx)                                                     \
+    if (LIKELY(race_mask & (1 << (idx * 4)))) {                               \
+      u8 sid = _mm_extract_epi8(shadow, idx * 4 + 1);                         \
+      u16 epoch = static_cast<u16>(thr->clock.Get(static_cast<Sid>(sid)));    \
+      thread_epochs = _mm_insert_epi32(thread_epochs, u32(epoch) << 16, idx); \
+    }
+  LOAD_EPOCH(0);
+  LOAD_EPOCH(1);
+  LOAD_EPOCH(2);
+  LOAD_EPOCH(3);
+#  undef LOAD_EPOCH
+  const m128 mask_epoch = _mm_set1_epi32(0x3fff0000);
+  const m128 shadow_epochs = _mm_and_si128(shadow, mask_epoch);
+  const m128 concurrent = _mm_cmplt_epi32(thread_epochs, shadow_epochs);
+  const int concurrent_mask = _mm_movemask_epi8(concurrent);
+  if (LIKELY(concurrent_mask == 0))
+    goto STORE;
+
+  DoReportRaceV(thr, shadow_mem, cur, concurrent_mask, shadow, typ);
+  return true;
 }
-#endif
 
-ALWAYS_INLINE
-bool ContainsSameAccess(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
-#if TSAN_VECTORIZE
-  bool res = ContainsSameAccessFast(s, a, sync_epoch, is_write);
-  // NOTE: this check can fail if the shadow is concurrently mutated
-  // by other threads. But it still can be useful if you modify
-  // ContainsSameAccessFast and want to ensure that it's not completely broken.
-  // DCHECK_EQ(res, ContainsSameAccessSlow(s, a, sync_epoch, is_write));
-  return res;
-#else
-  return ContainsSameAccessSlow(s, a, sync_epoch, is_write);
+#  define LOAD_CURRENT_SHADOW(cur, shadow_mem)                         \
+    const m128 access = _mm_set1_epi32(static_cast<u32>((cur).raw())); \
+    const m128 shadow = _mm_load_si128(reinterpret_cast<m128*>(shadow_mem))
 #endif
-}
 
-ALWAYS_INLINE USED void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
-                                     int kAccessSizeLog, bool kAccessIsWrite,
-                                     bool kIsAtomic) {
-  RawShadow *shadow_mem = MemToShadow(addr);
-  DPrintf2(
-      "#%d: MemoryAccess: @%p %p size=%d"
-      " is_write=%d shadow_mem=%p {%zx, %zx, %zx, %zx}\n",
-      (int)thr->fast_state.tid(), (void *)pc, (void *)addr,
-      (int)(1 << kAccessSizeLog), kAccessIsWrite, shadow_mem,
-      (uptr)shadow_mem[0], (uptr)shadow_mem[1], (uptr)shadow_mem[2],
-      (uptr)shadow_mem[3]);
-#if SANITIZER_DEBUG
-  if (!IsAppMem(addr)) {
-    Printf("Access to non app mem %zx\n", addr);
-    DCHECK(IsAppMem(addr));
-  }
-  if (!IsShadowMem(shadow_mem)) {
-    Printf("Bad shadow addr %p (%zx)\n", shadow_mem, addr);
-    DCHECK(IsShadowMem(shadow_mem));
+char* DumpShadow(char* buf, RawShadow raw) {
+  if (raw == Shadow::kEmpty) {
+    internal_snprintf(buf, 64, "0");
+    return buf;
   }
-#endif
+  Shadow s(raw);
+  AccessType typ;
+  s.GetAccess(nullptr, nullptr, &typ);
+  internal_snprintf(buf, 64, "{tid=%u@%u access=0x%x typ=%x}",
+                    static_cast<u32>(s.sid()), static_cast<u32>(s.epoch()),
+                    s.access(), static_cast<u32>(typ));
+  return buf;
+}
 
-  if (!SANITIZER_GO && !kAccessIsWrite && *shadow_mem == kShadowRodata) {
-    // Access to .rodata section, no races here.
-    // Measurements show that it can be 10-20% of all memory accesses.
-    return;
-  }
+// The TryTrace* and TraceRestart* functions make it possible to turn memory
+// access and func entry/exit callbacks into leaf functions with all associated
+// performance benefits. These hottest callbacks make only 2 slow path calls:
+// race reporting and trace part switching. Race reporting is easy to turn into
+// a tail call: we just always return from the runtime after reporting a race.
+// But trace part switching is harder because it needs to happen in the middle
+// of the callbacks. To turn it into a tail call, we immediately return after
+// the TraceRestart* functions, but the TraceRestart* functions themselves
+// recurse into the callback after switching the trace part. As a result, the
+// hottest callbacks contain only tail calls, which effectively makes them leaf
+// functions (can use all registers, no frame setup, etc).
+NOINLINE void TraceRestartMemoryAccess(ThreadState* thr, uptr pc, uptr addr,
+                                       uptr size, AccessType typ) {
+  TraceSwitchPart(thr);
+  MemoryAccess(thr, pc, addr, size, typ);
+}
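A minimal sketch of the restart pattern described in the comment above (illustrative only; HotCallback, RestartHotCallback and TryTraceSomething are placeholder names, not functions from this patch):

bool TryTraceSomething(ThreadState *thr, uptr pc);            // placeholder
NOINLINE void RestartHotCallback(ThreadState *thr, uptr pc);  // cold slow path

ALWAYS_INLINE void HotCallback(ThreadState *thr, uptr pc) {
  if (UNLIKELY(!TryTraceSomething(thr, pc)))  // current trace part is full
    return RestartHotCallback(thr, pc);       // tail call, keeps this a leaf
  // ... fast-path work, no further calls ...
}

NOINLINE void RestartHotCallback(ThreadState *thr, uptr pc) {
  TraceSwitchPart(thr);   // switch to a new trace part (the slow path)
  HotCallback(thr, pc);   // re-enter the callback; tracing now succeeds
}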
 
-  FastState fast_state = thr->fast_state;
-  if (UNLIKELY(fast_state.GetIgnoreBit())) {
-    return;
-  }
+ALWAYS_INLINE USED void MemoryAccess(ThreadState* thr, uptr pc, uptr addr,
+                                     uptr size, AccessType typ) {
+  RawShadow* shadow_mem = MemToShadow(addr);
+  UNUSED char memBuf[4][64];
+  DPrintf2("#%d: Access: %d@%d %p/%zd typ=0x%x {%s, %s, %s, %s}\n", thr->tid,
+           static_cast<int>(thr->fast_state.sid()),
+           static_cast<int>(thr->fast_state.epoch()), (void*)addr, size,
+           static_cast<int>(typ), DumpShadow(memBuf[0], shadow_mem[0]),
+           DumpShadow(memBuf[1], shadow_mem[1]),
+           DumpShadow(memBuf[2], shadow_mem[2]),
+           DumpShadow(memBuf[3], shadow_mem[3]));
 
-  Shadow cur(fast_state);
-  cur.SetAddr0AndSizeLog(addr & 7, kAccessSizeLog);
-  cur.SetWrite(kAccessIsWrite);
-  cur.SetAtomic(kIsAtomic);
+  FastState fast_state = thr->fast_state;
+  Shadow cur(fast_state, addr, size, typ);
 
-  if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(), thr->fast_synch_epoch,
-                                kAccessIsWrite))) {
+  LOAD_CURRENT_SHADOW(cur, shadow_mem);
+  if (LIKELY(ContainsSameAccess(shadow_mem, cur, shadow, access, typ)))
     return;
-  }
-
-  if (kCollectHistory) {
-    fast_state.IncrementEpoch();
-    thr->fast_state = fast_state;
-    TraceAddEvent(thr, fast_state, EventTypeMop, pc);
-    cur.IncrementEpoch();
-  }
+  if (UNLIKELY(fast_state.GetIgnoreBit()))
+    return;
+  if (!TryTraceMemoryAccess(thr, pc, addr, size, typ))
+    return TraceRestartMemoryAccess(thr, pc, addr, size, typ);
+  CheckRaces(thr, shadow_mem, cur, shadow, access, typ);
+}
 
-  MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
-                    shadow_mem, cur);
+NOINLINE
+void RestartUnalignedMemoryAccess(ThreadState* thr, uptr pc, uptr addr,
+                                  uptr size, AccessType typ) {
+  TraceSwitchPart(thr);
+  UnalignedMemoryAccess(thr, pc, addr, size, typ);
 }
 
-// Called by MemoryAccessRange in tsan_rtl_thread.cpp
-ALWAYS_INLINE USED void MemoryAccessImpl(ThreadState *thr, uptr addr,
-                                         int kAccessSizeLog,
-                                         bool kAccessIsWrite, bool kIsAtomic,
-                                         u64 *shadow_mem, Shadow cur) {
-  if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(), thr->fast_synch_epoch,
-                                kAccessIsWrite))) {
+ALWAYS_INLINE USED void UnalignedMemoryAccess(ThreadState* thr, uptr pc,
+                                              uptr addr, uptr size,
+                                              AccessType typ) {
+  DCHECK_LE(size, 8);
+  FastState fast_state = thr->fast_state;
+  if (UNLIKELY(fast_state.GetIgnoreBit()))
     return;
+  RawShadow* shadow_mem = MemToShadow(addr);
+  bool traced = false;
+  uptr size1 = Min<uptr>(size, RoundUp(addr + 1, kShadowCell) - addr);
+  {
+    Shadow cur(fast_state, addr, size1, typ);
+    LOAD_CURRENT_SHADOW(cur, shadow_mem);
+    if (LIKELY(ContainsSameAccess(shadow_mem, cur, shadow, access, typ)))
+      goto SECOND;
+    if (!TryTraceMemoryAccessRange(thr, pc, addr, size, typ))
+      return RestartUnalignedMemoryAccess(thr, pc, addr, size, typ);
+    traced = true;
+    if (UNLIKELY(CheckRaces(thr, shadow_mem, cur, shadow, access, typ)))
+      return;
   }
+SECOND:
+  uptr size2 = size - size1;
+  if (LIKELY(size2 == 0))
+    return;
+  shadow_mem += kShadowCnt;
+  Shadow cur(fast_state, 0, size2, typ);
+  LOAD_CURRENT_SHADOW(cur, shadow_mem);
+  if (LIKELY(ContainsSameAccess(shadow_mem, cur, shadow, access, typ)))
+    return;
+  if (!traced && !TryTraceMemoryAccessRange(thr, pc, addr, size, typ))
+    return RestartUnalignedMemoryAccess(thr, pc, addr, size, typ);
+  CheckRaces(thr, shadow_mem, cur, shadow, access, typ);
+}
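A worked example of the size1/size2 split computed above (a sketch; the concrete address is an assumption and kShadowCell is taken to be 8 bytes):

// addr = 0x1006, size = 4:
//   size1 = Min(4, RoundUp(0x1007, 8) - 0x1006) = Min(4, 2) = 2
//   size2 = 4 - 2 = 2
// The unaligned access is recorded as two sub-accesses of 2 bytes each,
// one at the end of the first 8-byte shadow cell and one at the start of the
// next cell (shadow_mem is advanced by kShadowCnt before the second check).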
 
-  MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
-                    shadow_mem, cur);
+void ShadowSet(RawShadow* p, RawShadow* end, RawShadow v) {
+  DCHECK_LE(p, end);
+  DCHECK(IsShadowMem(p));
+  DCHECK(IsShadowMem(end));
+  UNUSED const uptr kAlign = kShadowCnt * kShadowSize;
+  DCHECK_EQ(reinterpret_cast<uptr>(p) % kAlign, 0);
+  DCHECK_EQ(reinterpret_cast<uptr>(end) % kAlign, 0);
+#if !TSAN_VECTORIZE
+  for (; p < end; p += kShadowCnt) {
+    p[0] = v;
+    for (uptr i = 1; i < kShadowCnt; i++) p[i] = Shadow::kEmpty;
+  }
+#else
+  m128 vv = _mm_setr_epi32(
+      static_cast<u32>(v), static_cast<u32>(Shadow::kEmpty),
+      static_cast<u32>(Shadow::kEmpty), static_cast<u32>(Shadow::kEmpty));
+  m128* vp = reinterpret_cast<m128*>(p);
+  m128* vend = reinterpret_cast<m128*>(end);
+  for (; vp < vend; vp++) _mm_store_si128(vp, vv);
+#endif
 }
 
-static void MemoryRangeSet(ThreadState *thr, uptr pc, uptr addr, uptr size,
-                           u64 val) {
-  (void)thr;
-  (void)pc;
+static void MemoryRangeSet(uptr addr, uptr size, RawShadow val) {
   if (size == 0)
     return;
-  // FIXME: fix me.
-  uptr offset = addr % kShadowCell;
-  if (offset) {
-    offset = kShadowCell - offset;
-    if (size <= offset)
-      return;
-    addr += offset;
-    size -= offset;
-  }
-  DCHECK_EQ(addr % 8, 0);
+  DCHECK_EQ(addr % kShadowCell, 0);
+  DCHECK_EQ(size % kShadowCell, 0);
   // If a user passes some insane arguments (memset(0)),
   // let it just crash as usual.
   if (!IsAppMem(addr) || !IsAppMem(addr + size - 1))
     return;
+  RawShadow* begin = MemToShadow(addr);
+  RawShadow* end = begin + size / kShadowCell * kShadowCnt;
   // Don't want to touch lots of shadow memory.
   // If a program maps a 10MB stack, there is no need to reset the whole range.
-  size = (size + (kShadowCell - 1)) & ~(kShadowCell - 1);
   // UnmapOrDie/MmapFixedNoReserve does not work on Windows.
-  if (SANITIZER_WINDOWS || size < common_flags()->clear_shadow_mmap_threshold) {
-    RawShadow *p = MemToShadow(addr);
-    CHECK(IsShadowMem(p));
-    CHECK(IsShadowMem(p + size * kShadowCnt / kShadowCell - 1));
-    // FIXME: may overwrite a part outside the region
-    for (uptr i = 0; i < size / kShadowCell * kShadowCnt;) {
-      p[i++] = val;
-      for (uptr j = 1; j < kShadowCnt; j++) p[i++] = 0;
-    }
-  } else {
-    // The region is big, reset only beginning and end.
-    const uptr kPageSize = GetPageSizeCached();
-    RawShadow *begin = MemToShadow(addr);
-    RawShadow *end = begin + size / kShadowCell * kShadowCnt;
-    RawShadow *p = begin;
-    // Set at least first kPageSize/2 to page boundary.
-    while ((p < begin + kPageSize / kShadowSize / 2) || ((uptr)p % kPageSize)) {
-      *p++ = val;
-      for (uptr j = 1; j < kShadowCnt; j++) *p++ = 0;
-    }
-    // Reset middle part.
-    RawShadow *p1 = p;
-    p = RoundDown(end, kPageSize);
-    if (!MmapFixedSuperNoReserve((uptr)p1, (uptr)p - (uptr)p1))
+  if (SANITIZER_WINDOWS ||
+      size <= common_flags()->clear_shadow_mmap_threshold) {
+    ShadowSet(begin, end, val);
+    return;
+  }
+  // The region is big, reset only beginning and end.
+  const uptr kPageSize = GetPageSizeCached();
+  // Set at least first kPageSize/2 to page boundary.
+  RawShadow* mid1 =
+      Min(end, reinterpret_cast<RawShadow*>(RoundUp(
+                   reinterpret_cast<uptr>(begin) + kPageSize / 2, kPageSize)));
+  ShadowSet(begin, mid1, val);
+  // Reset middle part.
+  RawShadow* mid2 = RoundDown(end, kPageSize);
+  if (mid2 > mid1) {
+    if (!MmapFixedSuperNoReserve((uptr)mid1, (uptr)mid2 - (uptr)mid1))
       Die();
-    // Set the ending.
-    while (p < end) {
-      *p++ = val;
-      for (uptr j = 1; j < kShadowCnt; j++) *p++ = 0;
-    }
   }
+  // Set the ending.
+  ShadowSet(mid2, end, val);
 }
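A sketch of how the large-range path above carves up the shadow (illustrative; the layout follows directly from the code, the diagram itself is not part of the patch):

//   begin                 mid1                       mid2                end
//   |----- ShadowSet -----|----- mmap (zero pages) --|----- ShadowSet ---|
//   mid1 = Min(end, RoundUp(begin + kPageSize/2, kPageSize))
//   mid2 = RoundDown(end, kPageSize)
// Only the small head and tail are written with `val`; the page-aligned middle
// is remapped, which leaves it zeroed (i.e. Shadow::kEmpty) without touching
// the memory.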
 
-void MemoryResetRange(ThreadState *thr, uptr pc, uptr addr, uptr size) {
-  MemoryRangeSet(thr, pc, addr, size, 0);
+void MemoryResetRange(ThreadState* thr, uptr pc, uptr addr, uptr size) {
+  uptr addr1 = RoundDown(addr, kShadowCell);
+  uptr size1 = RoundUp(size + addr - addr1, kShadowCell);
+  MemoryRangeSet(addr1, size1, Shadow::kEmpty);
 }
 
-void MemoryRangeFreed(ThreadState *thr, uptr pc, uptr addr, uptr size) {
-  // Processing more than 1k (4k of shadow) is expensive,
+void MemoryRangeFreed(ThreadState* thr, uptr pc, uptr addr, uptr size) {
+  // Callers must lock the slot to ensure synchronization with the reset.
+  // The problem with "freed" memory is that it's not "monotonic"
+  // with respect to bug detection: freed memory is bad to access,
+  // but then if the heap block is reallocated later, it's good to access.
+  // As a result a garbage "freed" shadow can lead to a false positive
+  // if it happens to match a real free in the thread trace,
+  // but the heap block was reallocated before the current memory access,
+  // so it's actually still good to access. This is not the case with data races.
+  DCHECK(thr->slot_locked);
+  DCHECK_EQ(addr % kShadowCell, 0);
+  size = RoundUp(size, kShadowCell);
+  // Processing more than 1k (2k of shadow) is expensive,
   // can cause excessive memory consumption (the user does not necessarily touch
   // the whole range) and most likely unnecessary.
-  if (size > 1024)
-    size = 1024;
-  CHECK_EQ(thr->is_freeing, false);
-  thr->is_freeing = true;
-  MemoryAccessRange(thr, pc, addr, size, true);
-  thr->is_freeing = false;
-  if (kCollectHistory) {
-    thr->fast_state.IncrementEpoch();
-    TraceAddEvent(thr, thr->fast_state, EventTypeMop, pc);
+  size = Min<uptr>(size, 1024);
+  const AccessType typ =
+      kAccessWrite | kAccessFree | kAccessCheckOnly | kAccessNoRodata;
+  TraceMemoryAccessRange(thr, pc, addr, size, typ);
+  RawShadow* shadow_mem = MemToShadow(addr);
+  Shadow cur(thr->fast_state, 0, kShadowCell, typ);
+#if TSAN_VECTORIZE
+  const m128 access = _mm_set1_epi32(static_cast<u32>(cur.raw()));
+  const m128 freed = _mm_setr_epi32(
+      static_cast<u32>(Shadow::FreedMarker()),
+      static_cast<u32>(Shadow::FreedInfo(cur.sid(), cur.epoch())), 0, 0);
+  for (; size; size -= kShadowCell, shadow_mem += kShadowCnt) {
+    const m128 shadow = _mm_load_si128((m128*)shadow_mem);
+    if (UNLIKELY(CheckRaces(thr, shadow_mem, cur, shadow, access, typ)))
+      return;
+    _mm_store_si128((m128*)shadow_mem, freed);
   }
-  Shadow s(thr->fast_state);
-  s.ClearIgnoreBit();
-  s.MarkAsFreed();
-  s.SetWrite(true);
-  s.SetAddr0AndSizeLog(0, 3);
-  MemoryRangeSet(thr, pc, addr, size, s.raw());
-}
-
-void MemoryRangeImitateWrite(ThreadState *thr, uptr pc, uptr addr, uptr size) {
-  if (kCollectHistory) {
-    thr->fast_state.IncrementEpoch();
-    TraceAddEvent(thr, thr->fast_state, EventTypeMop, pc);
+#else
+  for (; size; size -= kShadowCell, shadow_mem += kShadowCnt) {
+    if (UNLIKELY(CheckRaces(thr, shadow_mem, cur, 0, 0, typ)))
+      return;
+    StoreShadow(&shadow_mem[0], Shadow::FreedMarker());
+    StoreShadow(&shadow_mem[1], Shadow::FreedInfo(cur.sid(), cur.epoch()));
+    StoreShadow(&shadow_mem[2], Shadow::kEmpty);
+    StoreShadow(&shadow_mem[3], Shadow::kEmpty);
   }
-  Shadow s(thr->fast_state);
-  s.ClearIgnoreBit();
-  s.SetWrite(true);
-  s.SetAddr0AndSizeLog(0, 3);
-  MemoryRangeSet(thr, pc, addr, size, s.raw());
+#endif
+}
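For reference, a sketch of the per-cell shadow layout that the loop above leaves behind for freed memory (derived from the stores above; the slot ordering is what DoReportRace relies on):

//   shadow_mem[0] = Shadow::FreedMarker()          // marks the cell as freed
//   shadow_mem[1] = Shadow::FreedInfo(sid, epoch)  // who freed it and when
//   shadow_mem[2] = Shadow::kEmpty
//   shadow_mem[3] = Shadow::kEmpty
// When a later access races with slot 0 (kFreeSid), DoReportRace reads slot 1
// to recover the freeing thread's sid/epoch for the report.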
+
+void MemoryRangeImitateWrite(ThreadState* thr, uptr pc, uptr addr, uptr size) {
+  DCHECK_EQ(addr % kShadowCell, 0);
+  size = RoundUp(size, kShadowCell);
+  TraceMemoryAccessRange(thr, pc, addr, size, kAccessWrite);
+  Shadow cur(thr->fast_state, 0, 8, kAccessWrite);
+  MemoryRangeSet(addr, size, cur.raw());
 }
 
-void MemoryRangeImitateWriteOrResetRange(ThreadState *thr, uptr pc, uptr addr,
+void MemoryRangeImitateWriteOrResetRange(ThreadState* thr, uptr pc, uptr addr,
                                          uptr size) {
   if (thr->ignore_reads_and_writes == 0)
     MemoryRangeImitateWrite(thr, pc, addr, size);
@@ -518,14 +617,29 @@ void MemoryRangeImitateWriteOrResetRange(ThreadState *thr, uptr pc, uptr addr,
     MemoryResetRange(thr, pc, addr, size);
 }
 
-void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size,
-                       bool is_write) {
-  if (size == 0)
-    return;
+ALWAYS_INLINE
+bool MemoryAccessRangeOne(ThreadState* thr, RawShadow* shadow_mem, Shadow cur,
+                          AccessType typ) {
+  LOAD_CURRENT_SHADOW(cur, shadow_mem);
+  if (LIKELY(ContainsSameAccess(shadow_mem, cur, shadow, access, typ)))
+    return false;
+  return CheckRaces(thr, shadow_mem, cur, shadow, access, typ);
+}
+
+template <bool is_read>
+NOINLINE void RestartMemoryAccessRange(ThreadState* thr, uptr pc, uptr addr,
+                                       uptr size) {
+  TraceSwitchPart(thr);
+  MemoryAccessRangeT<is_read>(thr, pc, addr, size);
+}
 
-  RawShadow *shadow_mem = MemToShadow(addr);
-  DPrintf2("#%d: MemoryAccessRange: @%p %p size=%d is_write=%d\n", thr->tid,
-           (void *)pc, (void *)addr, (int)size, is_write);
+template <bool is_read>
+void MemoryAccessRangeT(ThreadState* thr, uptr pc, uptr addr, uptr size) {
+  const AccessType typ =
+      (is_read ? kAccessRead : kAccessWrite) | kAccessNoRodata;
+  RawShadow* shadow_mem = MemToShadow(addr);
+  DPrintf2("#%d: MemoryAccessRange: @%p %p size=%d is_read=%d\n", thr->tid,
+           (void*)pc, (void*)addr, (int)size, is_read);
 
 #if SANITIZER_DEBUG
   if (!IsAppMem(addr)) {
@@ -537,65 +651,57 @@ void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size,
     DCHECK(IsAppMem(addr + size - 1));
   }
   if (!IsShadowMem(shadow_mem)) {
-    Printf("Bad shadow addr %p (%zx)\n", shadow_mem, addr);
+    Printf("Bad shadow addr %p (%zx)\n", static_cast<void*>(shadow_mem), addr);
     DCHECK(IsShadowMem(shadow_mem));
   }
-  if (!IsShadowMem(shadow_mem + size * kShadowCnt / 8 - 1)) {
-    Printf("Bad shadow addr %p (%zx)\n", shadow_mem + size * kShadowCnt / 8 - 1,
+  if (!IsShadowMem(shadow_mem + size * kShadowCnt - 1)) {
+    Printf("Bad shadow addr %p (%zx)\n",
+           static_cast<void*>(shadow_mem + size * kShadowCnt - 1),
            addr + size - 1);
-    DCHECK(IsShadowMem(shadow_mem + size * kShadowCnt / 8 - 1));
+    DCHECK(IsShadowMem(shadow_mem + size * kShadowCnt - 1));
   }
 #endif
 
-  if (*shadow_mem == kShadowRodata) {
-    DCHECK(!is_write);
-    // Access to .rodata section, no races here.
-    // Measurements show that it can be 10-20% of all memory accesses.
+  // Access to .rodata section, no races here.
+  // Measurements show that it can be 10-20% of all memory accesses.
+  if (is_read && *shadow_mem == Shadow::kRodata)
     return;
-  }
 
   FastState fast_state = thr->fast_state;
-  if (fast_state.GetIgnoreBit())
+  if (UNLIKELY(fast_state.GetIgnoreBit()))
     return;
 
-  fast_state.IncrementEpoch();
-  thr->fast_state = fast_state;
-  TraceAddEvent(thr, fast_state, EventTypeMop, pc);
+  if (!TryTraceMemoryAccessRange(thr, pc, addr, size, typ))
+    return RestartMemoryAccessRange<is_read>(thr, pc, addr, size);
 
-  bool unaligned = (addr % kShadowCell) != 0;
-
-  // Handle unaligned beginning, if any.
-  for (; addr % kShadowCell && size; addr++, size--) {
-    int const kAccessSizeLog = 0;
-    Shadow cur(fast_state);
-    cur.SetWrite(is_write);
-    cur.SetAddr0AndSizeLog(addr & (kShadowCell - 1), kAccessSizeLog);
-    MemoryAccessImpl(thr, addr, kAccessSizeLog, is_write, false, shadow_mem,
-                     cur);
-  }
-  if (unaligned)
+  if (UNLIKELY(addr % kShadowCell)) {
+    // Handle unaligned beginning, if any.
+    uptr size1 = Min(size, RoundUp(addr, kShadowCell) - addr);
+    size -= size1;
+    Shadow cur(fast_state, addr, size1, typ);
+    if (UNLIKELY(MemoryAccessRangeOne(thr, shadow_mem, cur, typ)))
+      return;
     shadow_mem += kShadowCnt;
+  }
   // Handle middle part, if any.
-  for (; size >= kShadowCell; addr += kShadowCell, size -= kShadowCell) {
-    int const kAccessSizeLog = 3;
-    Shadow cur(fast_state);
-    cur.SetWrite(is_write);
-    cur.SetAddr0AndSizeLog(0, kAccessSizeLog);
-    MemoryAccessImpl(thr, addr, kAccessSizeLog, is_write, false, shadow_mem,
-                     cur);
-    shadow_mem += kShadowCnt;
+  Shadow cur(fast_state, 0, kShadowCell, typ);
+  for (; size >= kShadowCell; size -= kShadowCell, shadow_mem += kShadowCnt) {
+    if (UNLIKELY(MemoryAccessRangeOne(thr, shadow_mem, cur, typ)))
+      return;
   }
   // Handle ending, if any.
-  for (; size; addr++, size--) {
-    int const kAccessSizeLog = 0;
-    Shadow cur(fast_state);
-    cur.SetWrite(is_write);
-    cur.SetAddr0AndSizeLog(addr & (kShadowCell - 1), kAccessSizeLog);
-    MemoryAccessImpl(thr, addr, kAccessSizeLog, is_write, false, shadow_mem,
-                     cur);
+  if (UNLIKELY(size)) {
+    Shadow cur(fast_state, 0, size, typ);
+    if (UNLIKELY(MemoryAccessRangeOne(thr, shadow_mem, cur, typ)))
+      return;
   }
 }
 
+template void MemoryAccessRangeT<true>(ThreadState* thr, uptr pc, uptr addr,
+                                       uptr size);
+template void MemoryAccessRangeT<false>(ThreadState* thr, uptr pc, uptr addr,
+                                        uptr size);
+
 }  // namespace __tsan
 
 #if !SANITIZER_GO
diff --git a/libsanitizer/tsan/tsan_rtl_mutex.cpp b/libsanitizer/tsan/tsan_rtl_mutex.cpp
index 7d6b41116aa..5ca2e4fca82 100644
--- a/libsanitizer/tsan/tsan_rtl_mutex.cpp
+++ b/libsanitizer/tsan/tsan_rtl_mutex.cpp
@@ -23,6 +23,8 @@
 namespace __tsan {
 
 void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r);
+void ReportDestroyLocked(ThreadState *thr, uptr pc, uptr addr,
+                         FastState last_lock, StackID creation_stack_id);
 
 struct Callback final : public DDCallback {
   ThreadState *thr;
@@ -36,17 +38,17 @@ struct Callback final : public DDCallback {
   }
 
   StackID Unwind() override { return CurrentStackId(thr, pc); }
-  int UniqueTid() override { return thr->unique_id; }
+  int UniqueTid() override { return thr->tid; }
 };
 
 void DDMutexInit(ThreadState *thr, uptr pc, SyncVar *s) {
   Callback cb(thr, pc);
   ctx->dd->MutexInit(&cb, &s->dd);
-  s->dd.ctx = s->GetId();
+  s->dd.ctx = s->addr;
 }
 
 static void ReportMutexMisuse(ThreadState *thr, uptr pc, ReportType typ,
-    uptr addr, u64 mid) {
+                              uptr addr, StackID creation_stack_id) {
   // In Go, these misuses are either impossible, or detected by std lib,
   // or false positives (e.g. unlock in a different thread).
   if (SANITIZER_GO)
@@ -55,7 +57,7 @@ static void ReportMutexMisuse(ThreadState *thr, uptr pc, ReportType typ,
     return;
   ThreadRegistryLock l(&ctx->thread_registry);
   ScopedReport rep(typ);
-  rep.AddMutex(mid);
+  rep.AddMutex(addr, creation_stack_id);
   VarSizeStackTrace trace;
   ObtainCurrentStack(thr, pc, &trace);
   rep.AddStack(trace, true);
@@ -63,95 +65,93 @@ static void ReportMutexMisuse(ThreadState *thr, uptr pc, ReportType typ,
   OutputReport(thr, rep);
 }
 
+static void RecordMutexLock(ThreadState *thr, uptr pc, uptr addr,
+                            StackID stack_id, bool write) {
+  auto typ = write ? EventType::kLock : EventType::kRLock;
+  // Note: it's important to trace before modifying the mutex set,
+  // because tracing can switch the trace part and we write the current
+  // mutex set at the beginning of each part.
+  // If we did it in the opposite order, we would write the already reduced
+  // mutex set at the beginning of the part and then trace the unlock again.
+  TraceMutexLock(thr, typ, pc, addr, stack_id);
+  thr->mset.AddAddr(addr, stack_id, write);
+}
+
+static void RecordMutexUnlock(ThreadState *thr, uptr addr) {
+  // See the comment in RecordMutexLock regarding the order of operations.
+  TraceMutexUnlock(thr, addr);
+  thr->mset.DelAddr(addr);
+}
+
 void MutexCreate(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
   DPrintf("#%d: MutexCreate %zx flagz=0x%x\n", thr->tid, addr, flagz);
-  if (!(flagz & MutexFlagLinkerInit) && IsAppMem(addr)) {
-    CHECK(!thr->is_freeing);
-    thr->is_freeing = true;
+  if (!(flagz & MutexFlagLinkerInit) && pc && IsAppMem(addr))
     MemoryAccess(thr, pc, addr, 1, kAccessWrite);
-    thr->is_freeing = false;
-  }
-  SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
-  Lock l(&s->mtx);
+  SlotLocker locker(thr);
+  auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
   s->SetFlags(flagz & MutexCreationFlagMask);
   // Save the stack in case the sync object was created earlier as an atomic.
-  if (!SANITIZER_GO && s->creation_stack_id == 0)
+  if (!SANITIZER_GO && s->creation_stack_id == kInvalidStackID)
     s->creation_stack_id = CurrentStackId(thr, pc);
 }
 
 void MutexDestroy(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
   DPrintf("#%d: MutexDestroy %zx\n", thr->tid, addr);
   bool unlock_locked = false;
-  u64 mid = 0;
-  u64 last_lock = 0;
+  StackID creation_stack_id;
+  FastState last_lock;
   {
-    SyncVar *s = ctx->metamap.GetSyncIfExists(addr);
-    if (s == 0)
+    auto s = ctx->metamap.GetSyncIfExists(addr);
+    if (!s)
       return;
-    Lock l(&s->mtx);
-    if ((flagz & MutexFlagLinkerInit) || s->IsFlagSet(MutexFlagLinkerInit) ||
-        ((flagz & MutexFlagNotStatic) && !s->IsFlagSet(MutexFlagNotStatic))) {
-      // Destroy is no-op for linker-initialized mutexes.
-      return;
-    }
-    if (common_flags()->detect_deadlocks) {
-      Callback cb(thr, pc);
-      ctx->dd->MutexDestroy(&cb, &s->dd);
-      ctx->dd->MutexInit(&cb, &s->dd);
-    }
-    if (flags()->report_destroy_locked && s->owner_tid != kInvalidTid &&
-        !s->IsFlagSet(MutexFlagBroken)) {
-      s->SetFlags(MutexFlagBroken);
-      unlock_locked = true;
-    }
-    mid = s->GetId();
-    last_lock = s->last_lock;
-    if (!unlock_locked)
-      s->Reset(thr->proc());  // must not reset it before the report is printed
-  }
-  if (unlock_locked && ShouldReport(thr, ReportTypeMutexDestroyLocked)) {
-    ThreadRegistryLock l(&ctx->thread_registry);
-    ScopedReport rep(ReportTypeMutexDestroyLocked);
-    rep.AddMutex(mid);
-    VarSizeStackTrace trace;
-    ObtainCurrentStack(thr, pc, &trace);
-    rep.AddStack(trace, true);
-    FastState last(last_lock);
-    RestoreStack(last.tid(), last.epoch(), &trace, 0);
-    rep.AddStack(trace, true);
-    rep.AddLocation(addr, 1);
-    OutputReport(thr, rep);
-
-    SyncVar *s = ctx->metamap.GetSyncIfExists(addr);
-    if (s != 0) {
-      Lock l(&s->mtx);
-      s->Reset(thr->proc());
+    SlotLocker locker(thr);
+    {
+      Lock lock(&s->mtx);
+      creation_stack_id = s->creation_stack_id;
+      last_lock = s->last_lock;
+      if ((flagz & MutexFlagLinkerInit) || s->IsFlagSet(MutexFlagLinkerInit) ||
+          ((flagz & MutexFlagNotStatic) && !s->IsFlagSet(MutexFlagNotStatic))) {
+        // Destroy is a no-op for linker-initialized mutexes.
+        return;
+      }
+      if (common_flags()->detect_deadlocks) {
+        Callback cb(thr, pc);
+        ctx->dd->MutexDestroy(&cb, &s->dd);
+        ctx->dd->MutexInit(&cb, &s->dd);
+      }
+      if (flags()->report_destroy_locked && s->owner_tid != kInvalidTid &&
+          !s->IsFlagSet(MutexFlagBroken)) {
+        s->SetFlags(MutexFlagBroken);
+        unlock_locked = true;
+      }
+      s->Reset();
     }
+    // Imitate a memory write to catch unlock-destroy races.
+    if (pc && IsAppMem(addr))
+      MemoryAccess(thr, pc, addr, 1, kAccessWrite | kAccessFree);
   }
-  thr->mset.Remove(mid);
-  // Imitate a memory write to catch unlock-destroy races.
-  // Do this outside of sync mutex, because it can report a race which locks
-  // sync mutexes.
-  if (IsAppMem(addr))
-    MemoryAccess(thr, pc, addr, 1, kAccessWrite | kAccessFree);
+  if (unlock_locked && ShouldReport(thr, ReportTypeMutexDestroyLocked))
+    ReportDestroyLocked(thr, pc, addr, last_lock, creation_stack_id);
+  thr->mset.DelAddr(addr, true);
   // s will be destroyed and freed in MetaMap::FreeBlock.
 }
 
 void MutexPreLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
   DPrintf("#%d: MutexPreLock %zx flagz=0x%x\n", thr->tid, addr, flagz);
-  if (!(flagz & MutexFlagTryLock) && common_flags()->detect_deadlocks) {
-    SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
-    {
-      ReadLock l(&s->mtx);
-      s->UpdateFlags(flagz);
-      if (s->owner_tid != thr->tid) {
-        Callback cb(thr, pc);
-        ctx->dd->MutexBeforeLock(&cb, &s->dd, true);
-      }
-    }
-    Callback cb(thr, pc);
-    ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
+  if (flagz & MutexFlagTryLock)
+    return;
+  if (!common_flags()->detect_deadlocks)
+    return;
+  Callback cb(thr, pc);
+  {
+    SlotLocker locker(thr);
+    auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+    ReadLock lock(&s->mtx);
+    s->UpdateFlags(flagz);
+    if (s->owner_tid != thr->tid)
+      ctx->dd->MutexBeforeLock(&cb, &s->dd, true);
   }
+  ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
 }
 
 void MutexPostLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz, int rec) {
@@ -161,48 +161,51 @@ void MutexPostLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz, int rec) {
     CHECK_GT(rec, 0);
   else
     rec = 1;
-  if (IsAppMem(addr))
+  if (pc && IsAppMem(addr))
     MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic);
-  u64 mid = 0;
+  bool report_double_lock = false;
   bool pre_lock = false;
   bool first = false;
-  bool report_double_lock = false;
+  StackID creation_stack_id = kInvalidStackID;
   {
-    SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
-    Lock l(&s->mtx);
-    s->UpdateFlags(flagz);
-    thr->fast_state.IncrementEpoch();
-    TraceAddEvent(thr, thr->fast_state, EventTypeLock, s->GetId());
-    if (s->owner_tid == kInvalidTid) {
-      CHECK_EQ(s->recursion, 0);
-      s->owner_tid = thr->tid;
-      s->last_lock = thr->fast_state.raw();
-    } else if (s->owner_tid == thr->tid) {
-      CHECK_GT(s->recursion, 0);
-    } else if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
-      s->SetFlags(MutexFlagBroken);
-      report_double_lock = true;
-    }
-    first = s->recursion == 0;
-    s->recursion += rec;
-    if (first) {
-      AcquireImpl(thr, pc, &s->clock);
-      AcquireImpl(thr, pc, &s->read_clock);
-    } else if (!s->IsFlagSet(MutexFlagWriteReentrant)) {
-    }
-    thr->mset.Add(s->GetId(), true, thr->fast_state.epoch());
-    if (first && common_flags()->detect_deadlocks) {
-      pre_lock =
-          (flagz & MutexFlagDoPreLockOnPostLock) && !(flagz & MutexFlagTryLock);
-      Callback cb(thr, pc);
-      if (pre_lock)
-        ctx->dd->MutexBeforeLock(&cb, &s->dd, true);
-      ctx->dd->MutexAfterLock(&cb, &s->dd, true, flagz & MutexFlagTryLock);
+    SlotLocker locker(thr);
+    auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+    creation_stack_id = s->creation_stack_id;
+    RecordMutexLock(thr, pc, addr, creation_stack_id, true);
+    {
+      Lock lock(&s->mtx);
+      first = s->recursion == 0;
+      s->UpdateFlags(flagz);
+      if (s->owner_tid == kInvalidTid) {
+        CHECK_EQ(s->recursion, 0);
+        s->owner_tid = thr->tid;
+        s->last_lock = thr->fast_state;
+      } else if (s->owner_tid == thr->tid) {
+        CHECK_GT(s->recursion, 0);
+      } else if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
+        s->SetFlags(MutexFlagBroken);
+        report_double_lock = true;
+      }
+      s->recursion += rec;
+      if (first) {
+        if (!thr->ignore_sync) {
+          thr->clock.Acquire(s->clock);
+          thr->clock.Acquire(s->read_clock);
+        }
+      }
+      if (first && common_flags()->detect_deadlocks) {
+        pre_lock = (flagz & MutexFlagDoPreLockOnPostLock) &&
+                   !(flagz & MutexFlagTryLock);
+        Callback cb(thr, pc);
+        if (pre_lock)
+          ctx->dd->MutexBeforeLock(&cb, &s->dd, true);
+        ctx->dd->MutexAfterLock(&cb, &s->dd, true, flagz & MutexFlagTryLock);
+      }
     }
-    mid = s->GetId();
   }
   if (report_double_lock)
-    ReportMutexMisuse(thr, pc, ReportTypeMutexDoubleLock, addr, mid);
+    ReportMutexMisuse(thr, pc, ReportTypeMutexDoubleLock, addr,
+                      creation_stack_id);
   if (first && pre_lock && common_flags()->detect_deadlocks) {
     Callback cb(thr, pc);
     ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
@@ -211,40 +214,47 @@ void MutexPostLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz, int rec) {
 
 int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
   DPrintf("#%d: MutexUnlock %zx flagz=0x%x\n", thr->tid, addr, flagz);
-  if (IsAppMem(addr))
+  if (pc && IsAppMem(addr))
     MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic);
-  u64 mid = 0;
+  StackID creation_stack_id;
+  RecordMutexUnlock(thr, addr);
   bool report_bad_unlock = false;
   int rec = 0;
   {
-    SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
-    Lock l(&s->mtx);
-    thr->fast_state.IncrementEpoch();
-    TraceAddEvent(thr, thr->fast_state, EventTypeUnlock, s->GetId());
-    if (!SANITIZER_GO && (s->recursion == 0 || s->owner_tid != thr->tid)) {
-      if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
-        s->SetFlags(MutexFlagBroken);
-        report_bad_unlock = true;
-      }
-    } else {
-      rec = (flagz & MutexFlagRecursiveUnlock) ? s->recursion : 1;
-      s->recursion -= rec;
-      if (s->recursion == 0) {
-        s->owner_tid = kInvalidTid;
-        ReleaseStoreImpl(thr, pc, &s->clock);
+    SlotLocker locker(thr);
+    auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+    bool released = false;
+    {
+      Lock lock(&s->mtx);
+      creation_stack_id = s->creation_stack_id;
+      if (!SANITIZER_GO && (s->recursion == 0 || s->owner_tid != thr->tid)) {
+        if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
+          s->SetFlags(MutexFlagBroken);
+          report_bad_unlock = true;
+        }
       } else {
+        rec = (flagz & MutexFlagRecursiveUnlock) ? s->recursion : 1;
+        s->recursion -= rec;
+        if (s->recursion == 0) {
+          s->owner_tid = kInvalidTid;
+          if (!thr->ignore_sync) {
+            thr->clock.ReleaseStore(&s->clock);
+            released = true;
+          }
+        }
+      }
+      if (common_flags()->detect_deadlocks && s->recursion == 0 &&
+          !report_bad_unlock) {
+        Callback cb(thr, pc);
+        ctx->dd->MutexBeforeUnlock(&cb, &s->dd, true);
       }
     }
-    thr->mset.Del(s->GetId(), true);
-    if (common_flags()->detect_deadlocks && s->recursion == 0 &&
-        !report_bad_unlock) {
-      Callback cb(thr, pc);
-      ctx->dd->MutexBeforeUnlock(&cb, &s->dd, true);
-    }
-    mid = s->GetId();
+    if (released)
+      IncrementEpoch(thr);
   }
   if (report_bad_unlock)
-    ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr, mid);
+    ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr,
+                      creation_stack_id);
   if (common_flags()->detect_deadlocks && !report_bad_unlock) {
     Callback cb(thr, pc);
     ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
@@ -254,53 +264,56 @@ int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
 
 void MutexPreReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
   DPrintf("#%d: MutexPreReadLock %zx flagz=0x%x\n", thr->tid, addr, flagz);
-  if (!(flagz & MutexFlagTryLock) && common_flags()->detect_deadlocks) {
-    {
-      SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
-      ReadLock l(&s->mtx);
-      s->UpdateFlags(flagz);
-      Callback cb(thr, pc);
-      ctx->dd->MutexBeforeLock(&cb, &s->dd, false);
-    }
-    Callback cb(thr, pc);
-    ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
+  if ((flagz & MutexFlagTryLock) || !common_flags()->detect_deadlocks)
+    return;
+  Callback cb(thr, pc);
+  {
+    SlotLocker locker(thr);
+    auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+    ReadLock lock(&s->mtx);
+    s->UpdateFlags(flagz);
+    ctx->dd->MutexBeforeLock(&cb, &s->dd, false);
   }
+  ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
 }
 
 void MutexPostReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
   DPrintf("#%d: MutexPostReadLock %zx flagz=0x%x\n", thr->tid, addr, flagz);
-  if (IsAppMem(addr))
+  if (pc && IsAppMem(addr))
     MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic);
-  u64 mid = 0;
   bool report_bad_lock = false;
   bool pre_lock = false;
+  StackID creation_stack_id = kInvalidStackID;
   {
-    SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
-    ReadLock l(&s->mtx);
-    s->UpdateFlags(flagz);
-    thr->fast_state.IncrementEpoch();
-    TraceAddEvent(thr, thr->fast_state, EventTypeRLock, s->GetId());
-    if (s->owner_tid != kInvalidTid) {
-      if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
-        s->SetFlags(MutexFlagBroken);
-        report_bad_lock = true;
+    SlotLocker locker(thr);
+    auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+    creation_stack_id = s->creation_stack_id;
+    RecordMutexLock(thr, pc, addr, creation_stack_id, false);
+    {
+      ReadLock lock(&s->mtx);
+      s->UpdateFlags(flagz);
+      if (s->owner_tid != kInvalidTid) {
+        if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
+          s->SetFlags(MutexFlagBroken);
+          report_bad_lock = true;
+        }
+      }
+      if (!thr->ignore_sync)
+        thr->clock.Acquire(s->clock);
+      s->last_lock = thr->fast_state;
+      if (common_flags()->detect_deadlocks) {
+        pre_lock = (flagz & MutexFlagDoPreLockOnPostLock) &&
+                   !(flagz & MutexFlagTryLock);
+        Callback cb(thr, pc);
+        if (pre_lock)
+          ctx->dd->MutexBeforeLock(&cb, &s->dd, false);
+        ctx->dd->MutexAfterLock(&cb, &s->dd, false, flagz & MutexFlagTryLock);
       }
     }
-    AcquireImpl(thr, pc, &s->clock);
-    s->last_lock = thr->fast_state.raw();
-    thr->mset.Add(s->GetId(), false, thr->fast_state.epoch());
-    if (common_flags()->detect_deadlocks) {
-      pre_lock =
-          (flagz & MutexFlagDoPreLockOnPostLock) && !(flagz & MutexFlagTryLock);
-      Callback cb(thr, pc);
-      if (pre_lock)
-        ctx->dd->MutexBeforeLock(&cb, &s->dd, false);
-      ctx->dd->MutexAfterLock(&cb, &s->dd, false, flagz & MutexFlagTryLock);
-    }
-    mid = s->GetId();
   }
   if (report_bad_lock)
-    ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadLock, addr, mid);
+    ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadLock, addr,
+                      creation_stack_id);
   if (pre_lock  && common_flags()->detect_deadlocks) {
     Callback cb(thr, pc);
     ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
@@ -309,31 +322,39 @@ void MutexPostReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
 
 void MutexReadUnlock(ThreadState *thr, uptr pc, uptr addr) {
   DPrintf("#%d: MutexReadUnlock %zx\n", thr->tid, addr);
-  if (IsAppMem(addr))
+  if (pc && IsAppMem(addr))
     MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic);
-  u64 mid = 0;
+  RecordMutexUnlock(thr, addr);
+  StackID creation_stack_id;
   bool report_bad_unlock = false;
   {
-    SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
-    Lock l(&s->mtx);
-    thr->fast_state.IncrementEpoch();
-    TraceAddEvent(thr, thr->fast_state, EventTypeRUnlock, s->GetId());
-    if (s->owner_tid != kInvalidTid) {
-      if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
-        s->SetFlags(MutexFlagBroken);
-        report_bad_unlock = true;
+    SlotLocker locker(thr);
+    auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+    bool released = false;
+    {
+      Lock lock(&s->mtx);
+      creation_stack_id = s->creation_stack_id;
+      if (s->owner_tid != kInvalidTid) {
+        if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
+          s->SetFlags(MutexFlagBroken);
+          report_bad_unlock = true;
+        }
+      }
+      if (!thr->ignore_sync) {
+        thr->clock.Release(&s->read_clock);
+        released = true;
+      }
+      if (common_flags()->detect_deadlocks && s->recursion == 0) {
+        Callback cb(thr, pc);
+        ctx->dd->MutexBeforeUnlock(&cb, &s->dd, false);
       }
     }
-    ReleaseImpl(thr, pc, &s->read_clock);
-    if (common_flags()->detect_deadlocks && s->recursion == 0) {
-      Callback cb(thr, pc);
-      ctx->dd->MutexBeforeUnlock(&cb, &s->dd, false);
-    }
-    mid = s->GetId();
+    if (released)
+      IncrementEpoch(thr);
   }
-  thr->mset.Del(mid, false);
   if (report_bad_unlock)
-    ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadUnlock, addr, mid);
+    ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadUnlock, addr,
+                      creation_stack_id);
   if (common_flags()->detect_deadlocks) {
     Callback cb(thr, pc);
     ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
@@ -342,44 +363,52 @@ void MutexReadUnlock(ThreadState *thr, uptr pc, uptr addr) {
 
 void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr) {
   DPrintf("#%d: MutexReadOrWriteUnlock %zx\n", thr->tid, addr);
-  if (IsAppMem(addr))
+  if (pc && IsAppMem(addr))
     MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic);
-  u64 mid = 0;
+  RecordMutexUnlock(thr, addr);
+  StackID creation_stack_id;
   bool report_bad_unlock = false;
+  bool write = true;
   {
-    SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
-    Lock l(&s->mtx);
-    bool write = true;
-    if (s->owner_tid == kInvalidTid) {
-      // Seems to be read unlock.
-      write = false;
-      thr->fast_state.IncrementEpoch();
-      TraceAddEvent(thr, thr->fast_state, EventTypeRUnlock, s->GetId());
-      ReleaseImpl(thr, pc, &s->read_clock);
-    } else if (s->owner_tid == thr->tid) {
-      // Seems to be write unlock.
-      thr->fast_state.IncrementEpoch();
-      TraceAddEvent(thr, thr->fast_state, EventTypeUnlock, s->GetId());
-      CHECK_GT(s->recursion, 0);
-      s->recursion--;
-      if (s->recursion == 0) {
-        s->owner_tid = kInvalidTid;
-        ReleaseStoreImpl(thr, pc, &s->clock);
-      } else {
+    SlotLocker locker(thr);
+    auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+    bool released = false;
+    {
+      Lock lock(&s->mtx);
+      creation_stack_id = s->creation_stack_id;
+      if (s->owner_tid == kInvalidTid) {
+        // Seems to be read unlock.
+        write = false;
+        if (!thr->ignore_sync) {
+          thr->clock.Release(&s->read_clock);
+          released = true;
+        }
+      } else if (s->owner_tid == thr->tid) {
+        // Seems to be write unlock.
+        CHECK_GT(s->recursion, 0);
+        s->recursion--;
+        if (s->recursion == 0) {
+          s->owner_tid = kInvalidTid;
+          if (!thr->ignore_sync) {
+            thr->clock.ReleaseStore(&s->clock);
+            released = true;
+          }
+        }
+      } else if (!s->IsFlagSet(MutexFlagBroken)) {
+        s->SetFlags(MutexFlagBroken);
+        report_bad_unlock = true;
+      }
+      if (common_flags()->detect_deadlocks && s->recursion == 0) {
+        Callback cb(thr, pc);
+        ctx->dd->MutexBeforeUnlock(&cb, &s->dd, write);
       }
-    } else if (!s->IsFlagSet(MutexFlagBroken)) {
-      s->SetFlags(MutexFlagBroken);
-      report_bad_unlock = true;
-    }
-    thr->mset.Del(s->GetId(), write);
-    if (common_flags()->detect_deadlocks && s->recursion == 0) {
-      Callback cb(thr, pc);
-      ctx->dd->MutexBeforeUnlock(&cb, &s->dd, write);
     }
-    mid = s->GetId();
+    if (released)
+      IncrementEpoch(thr);
   }
   if (report_bad_unlock)
-    ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr, mid);
+    ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr,
+                      creation_stack_id);
   if (common_flags()->detect_deadlocks) {
     Callback cb(thr, pc);
     ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
@@ -388,143 +417,112 @@ void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr) {
 
 void MutexRepair(ThreadState *thr, uptr pc, uptr addr) {
   DPrintf("#%d: MutexRepair %zx\n", thr->tid, addr);
-  SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
-  Lock l(&s->mtx);
+  SlotLocker locker(thr);
+  auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+  Lock lock(&s->mtx);
   s->owner_tid = kInvalidTid;
   s->recursion = 0;
 }
 
 void MutexInvalidAccess(ThreadState *thr, uptr pc, uptr addr) {
   DPrintf("#%d: MutexInvalidAccess %zx\n", thr->tid, addr);
-  SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
-  ReportMutexMisuse(thr, pc, ReportTypeMutexInvalidAccess, addr, s->GetId());
+  StackID creation_stack_id = kInvalidStackID;
+  {
+    SlotLocker locker(thr);
+    auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+    if (s)
+      creation_stack_id = s->creation_stack_id;
+  }
+  ReportMutexMisuse(thr, pc, ReportTypeMutexInvalidAccess, addr,
+                    creation_stack_id);
 }
 
 void Acquire(ThreadState *thr, uptr pc, uptr addr) {
   DPrintf("#%d: Acquire %zx\n", thr->tid, addr);
   if (thr->ignore_sync)
     return;
-  SyncVar *s = ctx->metamap.GetSyncIfExists(addr);
+  auto s = ctx->metamap.GetSyncIfExists(addr);
   if (!s)
     return;
-  ReadLock l(&s->mtx);
-  AcquireImpl(thr, pc, &s->clock);
-}
-
-static void UpdateClockCallback(ThreadContextBase *tctx_base, void *arg) {
-  ThreadState *thr = reinterpret_cast<ThreadState*>(arg);
-  ThreadContext *tctx = static_cast<ThreadContext*>(tctx_base);
-  u64 epoch = tctx->epoch1;
-  if (tctx->status == ThreadStatusRunning) {
-    epoch = tctx->thr->fast_state.epoch();
-    tctx->thr->clock.NoteGlobalAcquire(epoch);
-  }
-  thr->clock.set(&thr->proc()->clock_cache, tctx->tid, epoch);
+  SlotLocker locker(thr);
+  if (!s->clock)
+    return;
+  ReadLock lock(&s->mtx);
+  thr->clock.Acquire(s->clock);
 }
 
 void AcquireGlobal(ThreadState *thr) {
   DPrintf("#%d: AcquireGlobal\n", thr->tid);
   if (thr->ignore_sync)
     return;
-  ThreadRegistryLock l(&ctx->thread_registry);
-  ctx->thread_registry.RunCallbackForEachThreadLocked(UpdateClockCallback, thr);
-}
-
-void ReleaseStoreAcquire(ThreadState *thr, uptr pc, uptr addr) {
-  DPrintf("#%d: ReleaseStoreAcquire %zx\n", thr->tid, addr);
-  if (thr->ignore_sync)
-    return;
-  SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false);
-  Lock l(&s->mtx);
-  thr->fast_state.IncrementEpoch();
-  // Can't increment epoch w/o writing to the trace as well.
-  TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
-  ReleaseStoreAcquireImpl(thr, pc, &s->clock);
+  SlotLocker locker(thr);
+  for (auto &slot : ctx->slots) thr->clock.Set(slot.sid, slot.epoch());
 }
 
 void Release(ThreadState *thr, uptr pc, uptr addr) {
   DPrintf("#%d: Release %zx\n", thr->tid, addr);
   if (thr->ignore_sync)
     return;
-  SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false);
-  Lock l(&s->mtx);
-  thr->fast_state.IncrementEpoch();
-  // Can't increment epoch w/o writing to the trace as well.
-  TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
-  ReleaseImpl(thr, pc, &s->clock);
+  SlotLocker locker(thr);
+  {
+    auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false);
+    Lock lock(&s->mtx);
+    thr->clock.Release(&s->clock);
+  }
+  IncrementEpoch(thr);
 }
 
 void ReleaseStore(ThreadState *thr, uptr pc, uptr addr) {
   DPrintf("#%d: ReleaseStore %zx\n", thr->tid, addr);
   if (thr->ignore_sync)
     return;
-  SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false);
-  Lock l(&s->mtx);
-  thr->fast_state.IncrementEpoch();
-  // Can't increment epoch w/o writing to the trace as well.
-  TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
-  ReleaseStoreImpl(thr, pc, &s->clock);
-}
-
-#if !SANITIZER_GO
-static void UpdateSleepClockCallback(ThreadContextBase *tctx_base, void *arg) {
-  ThreadState *thr = reinterpret_cast<ThreadState*>(arg);
-  ThreadContext *tctx = static_cast<ThreadContext*>(tctx_base);
-  u64 epoch = tctx->epoch1;
-  if (tctx->status == ThreadStatusRunning)
-    epoch = tctx->thr->fast_state.epoch();
-  thr->last_sleep_clock.set(&thr->proc()->clock_cache, tctx->tid, epoch);
-}
-
-void AfterSleep(ThreadState *thr, uptr pc) {
-  DPrintf("#%d: AfterSleep\n", thr->tid);
-  if (thr->ignore_sync)
-    return;
-  thr->last_sleep_stack_id = CurrentStackId(thr, pc);
-  ThreadRegistryLock l(&ctx->thread_registry);
-  ctx->thread_registry.RunCallbackForEachThreadLocked(UpdateSleepClockCallback,
-                                                      thr);
-}
-#endif
-
-void AcquireImpl(ThreadState *thr, uptr pc, SyncClock *c) {
-  if (thr->ignore_sync)
-    return;
-  thr->clock.set(thr->fast_state.epoch());
-  thr->clock.acquire(&thr->proc()->clock_cache, c);
-}
-
-void ReleaseStoreAcquireImpl(ThreadState *thr, uptr pc, SyncClock *c) {
-  if (thr->ignore_sync)
-    return;
-  thr->clock.set(thr->fast_state.epoch());
-  thr->fast_synch_epoch = thr->fast_state.epoch();
-  thr->clock.releaseStoreAcquire(&thr->proc()->clock_cache, c);
+  SlotLocker locker(thr);
+  {
+    auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false);
+    Lock lock(&s->mtx);
+    thr->clock.ReleaseStore(&s->clock);
+  }
+  IncrementEpoch(thr);
 }
 
-void ReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) {
+void ReleaseStoreAcquire(ThreadState *thr, uptr pc, uptr addr) {
+  DPrintf("#%d: ReleaseStoreAcquire %zx\n", thr->tid, addr);
   if (thr->ignore_sync)
     return;
-  thr->clock.set(thr->fast_state.epoch());
-  thr->fast_synch_epoch = thr->fast_state.epoch();
-  thr->clock.release(&thr->proc()->clock_cache, c);
+  SlotLocker locker(thr);
+  {
+    auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false);
+    Lock lock(&s->mtx);
+    thr->clock.ReleaseStoreAcquire(&s->clock);
+  }
+  IncrementEpoch(thr);
 }
 
-void ReleaseStoreImpl(ThreadState *thr, uptr pc, SyncClock *c) {
-  if (thr->ignore_sync)
-    return;
-  thr->clock.set(thr->fast_state.epoch());
-  thr->fast_synch_epoch = thr->fast_state.epoch();
-  thr->clock.ReleaseStore(&thr->proc()->clock_cache, c);
+void IncrementEpoch(ThreadState *thr) {
+  DCHECK(!thr->ignore_sync);
+  DCHECK(thr->slot_locked);
+  Epoch epoch = EpochInc(thr->fast_state.epoch());
+  if (!EpochOverflow(epoch)) {
+    Sid sid = thr->fast_state.sid();
+    thr->clock.Set(sid, epoch);
+    thr->fast_state.SetEpoch(epoch);
+    thr->slot->SetEpoch(epoch);
+    TraceTime(thr);
+  }
 }
 
-void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) {
+#if !SANITIZER_GO
+void AfterSleep(ThreadState *thr, uptr pc) {
+  DPrintf("#%d: AfterSleep\n", thr->tid);
   if (thr->ignore_sync)
     return;
-  thr->clock.set(thr->fast_state.epoch());
-  thr->fast_synch_epoch = thr->fast_state.epoch();
-  thr->clock.acq_rel(&thr->proc()->clock_cache, c);
+  thr->last_sleep_stack_id = CurrentStackId(thr, pc);
+  thr->last_sleep_clock.Reset();
+  SlotLocker locker(thr);
+  for (auto &slot : ctx->slots)
+    thr->last_sleep_clock.Set(slot.sid, slot.epoch());
 }
+#endif
 
 void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) {
   if (r == 0 || !ShouldReport(thr, ReportTypeDeadlock))
@@ -532,7 +530,7 @@ void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) {
   ThreadRegistryLock l(&ctx->thread_registry);
   ScopedReport rep(ReportTypeDeadlock);
   for (int i = 0; i < r->n; i++) {
-    rep.AddMutex(r->loop[i].mtx_ctx0);
+    rep.AddMutex(r->loop[i].mtx_ctx0, r->loop[i].stk[0]);
     rep.AddUniqueTid((int)r->loop[i].thr_ctx);
     rep.AddThread((int)r->loop[i].thr_ctx);
   }
@@ -540,7 +538,7 @@ void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) {
   for (int i = 0; i < r->n; i++) {
     for (int j = 0; j < (flags()->second_deadlock_stack ? 2 : 1); j++) {
       u32 stk = r->loop[i].stk[j];
-      if (stk && stk != 0xffffffff) {
+      if (stk && stk != kInvalidStackID) {
         rep.AddStack(StackDepotGet(stk), true);
       } else {
         // Sometimes we fail to extract the stack trace (FIXME: investigate),
@@ -552,4 +550,26 @@ void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) {
   OutputReport(thr, rep);
 }
 
+void ReportDestroyLocked(ThreadState *thr, uptr pc, uptr addr,
+                         FastState last_lock, StackID creation_stack_id) {
+  SlotPairLocker locker(thr, last_lock.sid());
+  ThreadRegistryLock l0(&ctx->thread_registry);
+  Lock slots_lock(&ctx->slot_mtx);
+  ScopedReport rep(ReportTypeMutexDestroyLocked);
+  rep.AddMutex(addr, creation_stack_id);
+  VarSizeStackTrace trace;
+  ObtainCurrentStack(thr, pc, &trace);
+  rep.AddStack(trace, true);
+
+  Tid tid;
+  DynamicMutexSet mset;
+  uptr tag;
+  if (!RestoreStack(EventType::kLock, last_lock.sid(), last_lock.epoch(), addr,
+                    0, kAccessWrite, &tid, &trace, mset, &tag))
+    return;
+  rep.AddStack(trace, true);
+  rep.AddLocation(addr, 1);
+  OutputReport(thr, rep);
+}
+
 }  // namespace __tsan
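
The hunks above replace the old AcquireImpl/ReleaseImpl/ReleaseStoreImpl
helpers with direct operations on a per-thread vector clock, followed by an
explicit IncrementEpoch(thr) after each release. A minimal stand-alone sketch
of that release/acquire pattern; VectorClock, Thread, kSlots and the 16-bit
Epoch below are simplified stand-ins, not the real __tsan types:

    // Sketch: unlock releases the thread's clock into the mutex's sync clock
    // and bumps the thread's own epoch; a later lock acquires it back.
    #include <algorithm>
    #include <array>
    #include <cassert>
    #include <cstdint>

    constexpr int kSlots = 4;            // tsan uses 256 slots (Sid)
    using Epoch = uint16_t;

    struct VectorClock {
      std::array<Epoch, kSlots> clk_{};  // one epoch per slot
      void Acquire(const VectorClock* src) {        // join: element-wise max
        if (!src) return;
        for (int i = 0; i < kSlots; i++)
          clk_[i] = std::max(clk_[i], src->clk_[i]);
      }
      void ReleaseStore(VectorClock** dst) const {  // overwrite dst with ours
        if (!*dst) *dst = new VectorClock();
        (*dst)->clk_ = clk_;
      }
      void Set(int sid, Epoch e) { clk_[sid] = e; }
    };

    struct Thread {
      int sid = 0;      // slot the thread is attached to
      Epoch epoch = 0;  // fast_state epoch
      VectorClock clock;
      void IncrementEpoch() {            // as in IncrementEpoch(thr) above
        epoch = static_cast<Epoch>(epoch + 1);
        clock.Set(sid, epoch);
      }
    };

    int main() {
      Thread t0, t1;
      t0.sid = 0;
      t1.sid = 1;
      VectorClock* sync = nullptr;       // plays the role of SyncVar::clock
      t0.IncrementEpoch();               // t0 is now at epoch 1
      t0.clock.ReleaseStore(&sync);      // "MutexUnlock": publish t0's clock
      t0.IncrementEpoch();               // then advance t0's own epoch
      t1.clock.Acquire(sync);            // "MutexPostLock": acquire it in t1
      assert(t1.clock.clk_[0] == 1);     // t1 now happens-after t0's epoch 1
      delete sync;
      return 0;
    }
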
diff --git a/libsanitizer/tsan/tsan_rtl_ppc64.S b/libsanitizer/tsan/tsan_rtl_ppc64.S
index 9e533a71a9c..8285e21aa1e 100644
--- a/libsanitizer/tsan/tsan_rtl_ppc64.S
+++ b/libsanitizer/tsan/tsan_rtl_ppc64.S
@@ -1,6 +1,5 @@
 #include "tsan_ppc_regs.h"
 
-        .machine altivec
         .section .text
         .hidden __tsan_setjmp
         .globl _setjmp
diff --git a/libsanitizer/tsan/tsan_rtl_proc.cpp b/libsanitizer/tsan/tsan_rtl_proc.cpp
index def61cca14d..5acc3967208 100644
--- a/libsanitizer/tsan/tsan_rtl_proc.cpp
+++ b/libsanitizer/tsan/tsan_rtl_proc.cpp
@@ -35,7 +35,6 @@ void ProcDestroy(Processor *proc) {
 #if !SANITIZER_GO
   AllocatorProcFinish(proc);
 #endif
-  ctx->clock_alloc.FlushCache(&proc->clock_cache);
   ctx->metamap.OnProcIdle(proc);
   if (common_flags()->detect_deadlocks)
      ctx->dd->DestroyPhysicalThread(proc->dd_pt);
diff --git a/libsanitizer/tsan/tsan_rtl_report.cpp b/libsanitizer/tsan/tsan_rtl_report.cpp
index 811695d144c..e331539bcdb 100644
--- a/libsanitizer/tsan/tsan_rtl_report.cpp
+++ b/libsanitizer/tsan/tsan_rtl_report.cpp
@@ -175,22 +175,26 @@ void ScopedReportBase::AddStack(StackTrace stack, bool suppressable) {
 }
 
 void ScopedReportBase::AddMemoryAccess(uptr addr, uptr external_tag, Shadow s,
-                                       StackTrace stack, const MutexSet *mset) {
+                                       Tid tid, StackTrace stack,
+                                       const MutexSet *mset) {
+  uptr addr0, size;
+  AccessType typ;
+  s.GetAccess(&addr0, &size, &typ);
   auto *mop = New<ReportMop>();
   rep_->mops.PushBack(mop);
-  mop->tid = s.tid();
-  mop->addr = addr + s.addr0();
-  mop->size = s.size();
-  mop->write = s.IsWrite();
-  mop->atomic = s.IsAtomic();
+  mop->tid = tid;
+  mop->addr = addr + addr0;
+  mop->size = size;
+  mop->write = !(typ & kAccessRead);
+  mop->atomic = typ & kAccessAtomic;
   mop->stack = SymbolizeStack(stack);
   mop->external_tag = external_tag;
   if (mop->stack)
     mop->stack->suppressable = true;
   for (uptr i = 0; i < mset->Size(); i++) {
     MutexSet::Desc d = mset->Get(i);
-    u64 mid = this->AddMutex(d.id);
-    ReportMopMutex mtx = {mid, d.write};
+    u64 id = this->AddMutex(d.addr, d.stack_id);
+    ReportMopMutex mtx = {id, d.write};
     mop->mset.PushBack(mtx);
   }
 }
@@ -219,18 +223,6 @@ void ScopedReportBase::AddThread(const ThreadContext *tctx, bool suppressable) {
 }
 
 #if !SANITIZER_GO
-static bool FindThreadByUidLockedCallback(ThreadContextBase *tctx, void *arg) {
-  int unique_id = *(int *)arg;
-  return tctx->unique_id == (u32)unique_id;
-}
-
-static ThreadContext *FindThreadByUidLocked(Tid unique_id) {
-  ctx->thread_registry.CheckLocked();
-  return static_cast<ThreadContext *>(
-      ctx->thread_registry.FindThreadContextLocked(
-          FindThreadByUidLockedCallback, &unique_id));
-}
-
 static ThreadContext *FindThreadByTidLocked(Tid tid) {
   ctx->thread_registry.CheckLocked();
   return static_cast<ThreadContext *>(
@@ -262,55 +254,25 @@ ThreadContext *IsThreadStackOrTls(uptr addr, bool *is_stack) {
 }
 #endif
 
-void ScopedReportBase::AddThread(Tid unique_tid, bool suppressable) {
+void ScopedReportBase::AddThread(Tid tid, bool suppressable) {
 #if !SANITIZER_GO
-  if (const ThreadContext *tctx = FindThreadByUidLocked(unique_tid))
+  if (const ThreadContext *tctx = FindThreadByTidLocked(tid))
     AddThread(tctx, suppressable);
 #endif
 }
 
-void ScopedReportBase::AddMutex(const SyncVar *s) {
+int ScopedReportBase::AddMutex(uptr addr, StackID creation_stack_id) {
   for (uptr i = 0; i < rep_->mutexes.Size(); i++) {
-    if (rep_->mutexes[i]->id == s->uid)
-      return;
+    if (rep_->mutexes[i]->addr == addr)
+      return rep_->mutexes[i]->id;
   }
   auto *rm = New<ReportMutex>();
   rep_->mutexes.PushBack(rm);
-  rm->id = s->uid;
-  rm->addr = s->addr;
+  rm->id = rep_->mutexes.Size() - 1;
+  rm->addr = addr;
   rm->destroyed = false;
-  rm->stack = SymbolizeStackId(s->creation_stack_id);
-}
-
-u64 ScopedReportBase::AddMutex(u64 id) {
-  u64 uid = 0;
-  u64 mid = id;
-  uptr addr = SyncVar::SplitId(id, &uid);
-  SyncVar *s = ctx->metamap.GetSyncIfExists(addr);
-  // Check that the mutex is still alive.
-  // Another mutex can be created at the same address,
-  // so check uid as well.
-  if (s && s->CheckId(uid)) {
-    Lock l(&s->mtx);
-    mid = s->uid;
-    AddMutex(s);
-  } else {
-    AddDeadMutex(id);
-  }
-  return mid;
-}
-
-void ScopedReportBase::AddDeadMutex(u64 id) {
-  for (uptr i = 0; i < rep_->mutexes.Size(); i++) {
-    if (rep_->mutexes[i]->id == id)
-      return;
-  }
-  auto *rm = New<ReportMutex>();
-  rep_->mutexes.PushBack(rm);
-  rm->id = id;
-  rm->addr = 0;
-  rm->destroyed = true;
-  rm->stack = 0;
+  rm->stack = SymbolizeStackId(creation_stack_id);
+  return rm->id;
 }
 
 void ScopedReportBase::AddLocation(uptr addr, uptr size) {
@@ -327,7 +289,7 @@ void ScopedReportBase::AddLocation(uptr addr, uptr size) {
     loc->tid = creat_tid;
     loc->stack = SymbolizeStackId(creat_stack);
     rep_->locs.PushBack(loc);
-    ThreadContext *tctx = FindThreadByUidLocked(creat_tid);
+    ThreadContext *tctx = FindThreadByTidLocked(creat_tid);
     if (tctx)
       AddThread(tctx);
     return;
@@ -343,16 +305,15 @@ void ScopedReportBase::AddLocation(uptr addr, uptr size) {
   if (!b)
     b = JavaHeapBlock(addr, &block_begin);
   if (b != 0) {
-    ThreadContext *tctx = FindThreadByTidLocked(b->tid);
     auto *loc = New<ReportLocation>();
     loc->type = ReportLocationHeap;
     loc->heap_chunk_start = (uptr)allocator()->GetBlockBegin((void *)addr);
     loc->heap_chunk_size = b->siz;
     loc->external_tag = b->tag;
-    loc->tid = tctx ? tctx->tid : b->tid;
+    loc->tid = b->tid;
     loc->stack = SymbolizeStackId(b->stk);
     rep_->locs.PushBack(loc);
-    if (tctx)
+    if (ThreadContext *tctx = FindThreadByTidLocked(b->tid))
       AddThread(tctx);
     return;
   }
@@ -387,71 +348,6 @@ ScopedReport::ScopedReport(ReportType typ, uptr tag)
 
 ScopedReport::~ScopedReport() {}
 
-void RestoreStack(Tid tid, const u64 epoch, VarSizeStackTrace *stk,
-                  MutexSet *mset, uptr *tag) {
-  // This function restores stack trace and mutex set for the thread/epoch.
-  // It does so by getting stack trace and mutex set at the beginning of
-  // trace part, and then replaying the trace till the given epoch.
-  Trace* trace = ThreadTrace(tid);
-  ReadLock l(&trace->mtx);
-  const int partidx = (epoch / kTracePartSize) % TraceParts();
-  TraceHeader* hdr = &trace->headers[partidx];
-  if (epoch < hdr->epoch0 || epoch >= hdr->epoch0 + kTracePartSize)
-    return;
-  CHECK_EQ(RoundDown(epoch, kTracePartSize), hdr->epoch0);
-  const u64 epoch0 = RoundDown(epoch, TraceSize());
-  const u64 eend = epoch % TraceSize();
-  const u64 ebegin = RoundDown(eend, kTracePartSize);
-  DPrintf("#%d: RestoreStack epoch=%zu ebegin=%zu eend=%zu partidx=%d\n",
-          tid, (uptr)epoch, (uptr)ebegin, (uptr)eend, partidx);
-  Vector<uptr> stack;
-  stack.Resize(hdr->stack0.size + 64);
-  for (uptr i = 0; i < hdr->stack0.size; i++) {
-    stack[i] = hdr->stack0.trace[i];
-    DPrintf2("  #%02zu: pc=%zx\n", i, stack[i]);
-  }
-  if (mset)
-    *mset = hdr->mset0;
-  uptr pos = hdr->stack0.size;
-  Event *events = (Event*)GetThreadTrace(tid);
-  for (uptr i = ebegin; i <= eend; i++) {
-    Event ev = events[i];
-    EventType typ = (EventType)(ev >> kEventPCBits);
-    uptr pc = (uptr)(ev & ((1ull << kEventPCBits) - 1));
-    DPrintf2("  %zu typ=%d pc=%zx\n", i, typ, pc);
-    if (typ == EventTypeMop) {
-      stack[pos] = pc;
-    } else if (typ == EventTypeFuncEnter) {
-      if (stack.Size() < pos + 2)
-        stack.Resize(pos + 2);
-      stack[pos++] = pc;
-    } else if (typ == EventTypeFuncExit) {
-      if (pos > 0)
-        pos--;
-    }
-    if (mset) {
-      if (typ == EventTypeLock) {
-        mset->Add(pc, true, epoch0 + i);
-      } else if (typ == EventTypeUnlock) {
-        mset->Del(pc, true);
-      } else if (typ == EventTypeRLock) {
-        mset->Add(pc, false, epoch0 + i);
-      } else if (typ == EventTypeRUnlock) {
-        mset->Del(pc, false);
-      }
-    }
-    for (uptr j = 0; j <= pos; j++)
-      DPrintf2("      #%zu: %zx\n", j, stack[j]);
-  }
-  if (pos == 0 && stack[0] == 0)
-    return;
-  pos++;
-  stk->Init(&stack[0], pos);
-  ExtractTagFromStack(stk, tag);
-}
-
-namespace v3 {
-
 // Replays the trace up to last_pos position in the last part
 // or up to the provided epoch/sid (whichever is earlier)
 // and calls the provided function f for each event.
@@ -469,6 +365,7 @@ void TraceReplay(Trace *trace, TracePart *last, Event *last_pos, Sid sid,
     Event *end = &part->events[TracePart::kSize - 1];
     if (part == last)
       end = last_pos;
+    f(kFreeSid, kEpochOver, nullptr);  // notify about part start
     for (Event *evp = &part->events[0]; evp < end; evp++) {
       Event *evp0 = evp;
       if (!evp->is_access && !evp->is_func) {
@@ -528,21 +425,36 @@ static constexpr bool IsWithinAccess(uptr addr1, uptr size1, uptr addr2,
   return addr1 >= addr2 && addr1 + size1 <= addr2 + size2;
 }
 
-// Replays the trace of thread tid up to the target event identified
-// by sid/epoch/addr/size/typ and restores and returns stack, mutex set
+// Replays the trace of slot sid up to the target event identified
+// by epoch/addr/size/typ and restores and returns tid, stack, mutex set
 // and tag for that event. If there are multiple such events, it returns
 // the last one. Returns false if the event is not present in the trace.
-bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr,
-                  uptr size, AccessType typ, VarSizeStackTrace *pstk,
+bool RestoreStack(EventType type, Sid sid, Epoch epoch, uptr addr, uptr size,
+                  AccessType typ, Tid *ptid, VarSizeStackTrace *pstk,
                   MutexSet *pmset, uptr *ptag) {
   // This function restores stack trace and mutex set for the thread/epoch.
   // It does so by getting stack trace and mutex set at the beginning of
   // trace part, and then replaying the trace till the given epoch.
-  DPrintf2("RestoreStack: tid=%u sid=%u@%u addr=0x%zx/%zu typ=%x\n", tid,
+  DPrintf2("RestoreStack: sid=%u@%u addr=0x%zx/%zu typ=%x\n",
            static_cast<int>(sid), static_cast<int>(epoch), addr, size,
            static_cast<int>(typ));
   ctx->slot_mtx.CheckLocked();  // needed to prevent trace part recycling
   ctx->thread_registry.CheckLocked();
+  TidSlot *slot = &ctx->slots[static_cast<uptr>(sid)];
+  Tid tid = kInvalidTid;
+  // Need to lock the slot mutex as it protects slot->journal.
+  slot->mtx.CheckLocked();
+  for (uptr i = 0; i < slot->journal.Size(); i++) {
+    DPrintf2("  journal: epoch=%d tid=%d\n",
+             static_cast<int>(slot->journal[i].epoch), slot->journal[i].tid);
+    if (i == slot->journal.Size() - 1 || slot->journal[i + 1].epoch > epoch) {
+      tid = slot->journal[i].tid;
+      break;
+    }
+  }
+  if (tid == kInvalidTid)
+    return false;
+  *ptid = tid;
   ThreadContext *tctx =
       static_cast<ThreadContext *>(ctx->thread_registry.GetThreadLocked(tid));
   Trace *trace = &tctx->trace;
@@ -553,8 +465,10 @@ bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr,
   {
     Lock lock(&trace->mtx);
     first_part = trace->parts.Front();
-    if (!first_part)
+    if (!first_part) {
+      DPrintf2("RestoreStack: tid=%d trace=%p no trace parts\n", tid, trace);
       return false;
+    }
     last_part = trace->parts.Back();
     last_pos = trace->final_pos;
     if (tctx->thr)
@@ -567,9 +481,18 @@ bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr,
   bool is_read = typ & kAccessRead;
   bool is_atomic = typ & kAccessAtomic;
   bool is_free = typ & kAccessFree;
+  DPrintf2("RestoreStack: tid=%d parts=[%p-%p] last_pos=%p\n", tid,
+           trace->parts.Front(), last_part, last_pos);
   TraceReplay(
       trace, last_part, last_pos, sid, epoch,
       [&](Sid ev_sid, Epoch ev_epoch, Event *evp) {
+        if (evp == nullptr) {
+          // Each trace part is self-consistent, so we reset state.
+          stack.Resize(0);
+          mset->Reset();
+          prev_pc = 0;
+          return;
+        }
         bool match = ev_sid == sid && ev_epoch == epoch;
         if (evp->is_access) {
           if (evp->is_func == 0 && evp->type == EventType::kAccessExt &&
@@ -592,12 +515,15 @@ bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr,
         if (evp->is_func) {
           auto *ev = reinterpret_cast<EventFunc *>(evp);
           if (ev->pc) {
-            DPrintf2("  FuncEnter: pc=0x%llx\n", ev->pc);
+            DPrintf2(" FuncEnter: pc=0x%llx\n", ev->pc);
             stack.PushBack(ev->pc);
           } else {
-            DPrintf2("  FuncExit\n");
-            CHECK(stack.Size());
-            stack.PopBack();
+            DPrintf2(" FuncExit\n");
+            // We don't log pathologically large stacks in each part,
+            // if the stack was truncated we can have more func exits than
+            // entries.
+            if (stack.Size())
+              stack.PopBack();
           }
           return;
         }
@@ -666,8 +592,6 @@ bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr,
   return found;
 }
 
-}  // namespace v3
-
 bool RacyStacks::operator==(const RacyStacks &other) const {
   if (hash[0] == other.hash[0] && hash[1] == other.hash[1])
     return true;
@@ -758,10 +682,7 @@ bool OutputReport(ThreadState *thr, const ScopedReport &srep) {
     ctx->fired_suppressions.push_back(s);
   }
   {
-    bool old_is_freeing = thr->is_freeing;
-    thr->is_freeing = false;
     bool suppressed = OnReport(rep, pc_or_addr != 0);
-    thr->is_freeing = old_is_freeing;
     if (suppressed) {
       thr->current_report = nullptr;
       return false;
@@ -808,97 +729,91 @@ static bool IsFiredSuppression(Context *ctx, ReportType type, uptr addr) {
   return false;
 }
 
-static bool RaceBetweenAtomicAndFree(ThreadState *thr) {
-  Shadow s0(thr->racy_state[0]);
-  Shadow s1(thr->racy_state[1]);
-  CHECK(!(s0.IsAtomic() && s1.IsAtomic()));
-  if (!s0.IsAtomic() && !s1.IsAtomic())
-    return true;
-  if (s0.IsAtomic() && s1.IsFreed())
-    return true;
-  if (s1.IsAtomic() && thr->is_freeing)
-    return true;
-  return false;
+// We need to lock the target slot during RestoreStack because it protects
+// the slot journal. However, the target slot can be the slot of the current
+// thread or a different slot.
+SlotPairLocker::SlotPairLocker(ThreadState *thr,
+                               Sid sid) NO_THREAD_SAFETY_ANALYSIS : thr_(thr),
+                                                                    slot_() {
+  CHECK_NE(sid, kFreeSid);
+  Lock l(&ctx->multi_slot_mtx);
+  SlotLock(thr);
+  if (sid == thr->slot->sid)
+    return;
+  slot_ = &ctx->slots[static_cast<uptr>(sid)];
+  slot_->mtx.Lock();
 }
 
-void ReportRace(ThreadState *thr) {
+SlotPairLocker::~SlotPairLocker() NO_THREAD_SAFETY_ANALYSIS {
+  SlotUnlock(thr_);
+  if (slot_)
+    slot_->mtx.Unlock();
+}
+
+void ReportRace(ThreadState *thr, RawShadow *shadow_mem, Shadow cur, Shadow old,
+                AccessType typ0) {
   CheckedMutex::CheckNoLocks();
 
   // Symbolizer makes lots of intercepted calls. If we try to process them,
   // at best it will cause deadlocks on internal mutexes.
   ScopedIgnoreInterceptors ignore;
 
+  uptr addr = ShadowToMem(shadow_mem);
+  DPrintf("#%d: ReportRace %p\n", thr->tid, (void *)addr);
   if (!ShouldReport(thr, ReportTypeRace))
     return;
-  if (!flags()->report_atomic_races && !RaceBetweenAtomicAndFree(thr))
+  uptr addr_off0, size0;
+  cur.GetAccess(&addr_off0, &size0, nullptr);
+  uptr addr_off1, size1, typ1;
+  old.GetAccess(&addr_off1, &size1, &typ1);
+  if (!flags()->report_atomic_races &&
+      ((typ0 & kAccessAtomic) || (typ1 & kAccessAtomic)) &&
+      !(typ0 & kAccessFree) && !(typ1 & kAccessFree))
     return;
 
-  bool freed = false;
-  {
-    Shadow s(thr->racy_state[1]);
-    freed = s.GetFreedAndReset();
-    thr->racy_state[1] = s.raw();
-  }
-
-  uptr addr = ShadowToMem(thr->racy_shadow_addr);
-  uptr addr_min = 0;
-  uptr addr_max = 0;
-  {
-    uptr a0 = addr + Shadow(thr->racy_state[0]).addr0();
-    uptr a1 = addr + Shadow(thr->racy_state[1]).addr0();
-    uptr e0 = a0 + Shadow(thr->racy_state[0]).size();
-    uptr e1 = a1 + Shadow(thr->racy_state[1]).size();
-    addr_min = min(a0, a1);
-    addr_max = max(e0, e1);
-    if (IsExpectedReport(addr_min, addr_max - addr_min))
-      return;
-  }
+  const uptr kMop = 2;
+  Shadow s[kMop] = {cur, old};
+  uptr addr0 = addr + addr_off0;
+  uptr addr1 = addr + addr_off1;
+  uptr end0 = addr0 + size0;
+  uptr end1 = addr1 + size1;
+  uptr addr_min = min(addr0, addr1);
+  uptr addr_max = max(end0, end1);
+  if (IsExpectedReport(addr_min, addr_max - addr_min))
+    return;
   if (HandleRacyAddress(thr, addr_min, addr_max))
     return;
 
-  ReportType typ = ReportTypeRace;
-  if (thr->is_vptr_access && freed)
-    typ = ReportTypeVptrUseAfterFree;
-  else if (thr->is_vptr_access)
-    typ = ReportTypeVptrRace;
-  else if (freed)
-    typ = ReportTypeUseAfterFree;
+  ReportType rep_typ = ReportTypeRace;
+  if ((typ0 & kAccessVptr) && (typ1 & kAccessFree))
+    rep_typ = ReportTypeVptrUseAfterFree;
+  else if (typ0 & kAccessVptr)
+    rep_typ = ReportTypeVptrRace;
+  else if (typ1 & kAccessFree)
+    rep_typ = ReportTypeUseAfterFree;
 
-  if (IsFiredSuppression(ctx, typ, addr))
+  if (IsFiredSuppression(ctx, rep_typ, addr))
     return;
 
-  const uptr kMop = 2;
   VarSizeStackTrace traces[kMop];
-  uptr tags[kMop] = {kExternalTagNone};
-  uptr toppc = TraceTopPC(thr);
-  if (toppc >> kEventPCBits) {
-    // This is a work-around for a known issue.
-    // The scenario where this happens is rather elaborate and requires
-    // an instrumented __sanitizer_report_error_summary callback and
-    // a __tsan_symbolize_external callback and a race during a range memory
-    // access larger than 8 bytes. MemoryAccessRange adds the current PC to
-    // the trace and starts processing memory accesses. A first memory access
-    // triggers a race, we report it and call the instrumented
-    // __sanitizer_report_error_summary, which adds more stuff to the trace
-    // since it is intrumented. Then a second memory access in MemoryAccessRange
-    // also triggers a race and we get here and call TraceTopPC to get the
-    // current PC, however now it contains some unrelated events from the
-    // callback. Most likely, TraceTopPC will now return a EventTypeFuncExit
-    // event. Later we subtract -1 from it (in GetPreviousInstructionPc)
-    // and the resulting PC has kExternalPCBit set, so we pass it to
-    // __tsan_symbolize_external_ex. __tsan_symbolize_external_ex is within its
-    // rights to crash since the PC is completely bogus.
-    // test/tsan/double_race.cpp contains a test case for this.
-    toppc = 0;
-  }
-  ObtainCurrentStack(thr, toppc, &traces[0], &tags[0]);
-  if (IsFiredSuppression(ctx, typ, traces[0]))
+  Tid tids[kMop] = {thr->tid, kInvalidTid};
+  uptr tags[kMop] = {kExternalTagNone, kExternalTagNone};
+
+  ObtainCurrentStack(thr, thr->trace_prev_pc, &traces[0], &tags[0]);
+  if (IsFiredSuppression(ctx, rep_typ, traces[0]))
+    return;
+
+  DynamicMutexSet mset1;
+  MutexSet *mset[kMop] = {&thr->mset, mset1};
+
+  SlotPairLocker locker(thr, s[1].sid());
+  ThreadRegistryLock l0(&ctx->thread_registry);
+  Lock slots_lock(&ctx->slot_mtx);
+  if (!RestoreStack(EventType::kAccessExt, s[1].sid(), s[1].epoch(), addr1,
+                    size1, typ1, &tids[1], &traces[1], mset[1], &tags[1]))
     return;
 
-  DynamicMutexSet mset2;
-  Shadow s2(thr->racy_state[1]);
-  RestoreStack(s2.tid(), s2.epoch(), &traces[1], mset2, &tags[1]);
-  if (IsFiredSuppression(ctx, typ, traces[1]))
+  if (IsFiredSuppression(ctx, rep_typ, traces[1]))
     return;
 
   if (HandleRacyStacks(thr, traces))
@@ -908,39 +823,29 @@ void ReportRace(ThreadState *thr) {
   uptr tag = kExternalTagNone;
   for (uptr i = 0; i < kMop; i++) {
     if (tags[i] != kExternalTagNone) {
-      typ = ReportTypeExternalRace;
+      rep_typ = ReportTypeExternalRace;
       tag = tags[i];
       break;
     }
   }
 
-  ThreadRegistryLock l0(&ctx->thread_registry);
-  ScopedReport rep(typ, tag);
-  for (uptr i = 0; i < kMop; i++) {
-    Shadow s(thr->racy_state[i]);
-    rep.AddMemoryAccess(addr, tags[i], s, traces[i],
-                        i == 0 ? &thr->mset : mset2);
-  }
+  ScopedReport rep(rep_typ, tag);
+  for (uptr i = 0; i < kMop; i++)
+    rep.AddMemoryAccess(addr, tags[i], s[i], tids[i], traces[i], mset[i]);
 
   for (uptr i = 0; i < kMop; i++) {
-    FastState s(thr->racy_state[i]);
     ThreadContext *tctx = static_cast<ThreadContext *>(
-        ctx->thread_registry.GetThreadLocked(s.tid()));
-    if (s.epoch() < tctx->epoch0 || s.epoch() > tctx->epoch1)
-      continue;
+        ctx->thread_registry.GetThreadLocked(tids[i]));
     rep.AddThread(tctx);
   }
 
   rep.AddLocation(addr_min, addr_max - addr_min);
 
 #if !SANITIZER_GO
-  {
-    Shadow s(thr->racy_state[1]);
-    if (s.epoch() <= thr->last_sleep_clock.get(s.tid()))
-      rep.AddSleep(thr->last_sleep_stack_id);
-  }
+  if (!((typ0 | typ1) & kAccessFree) &&
+      s[1].epoch() <= thr->last_sleep_clock.Get(s[1].sid()))
+    rep.AddSleep(thr->last_sleep_stack_id);
 #endif
-
   OutputReport(thr, rep);
 }
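
RestoreStack above no longer takes a tid argument; it recovers the owning
thread from the slot journal, taking the last record whose successor starts
after the target epoch. A small sketch of that lookup; JournalEntry and
TidForEpoch are illustrative stand-ins, not the real TidSlot/Trace types:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using Epoch = uint16_t;
    using Tid = int;
    constexpr Tid kInvalidTid = -1;

    struct JournalEntry { Epoch epoch; Tid tid; };

    // Which thread held the slot at `epoch`? Mirrors the loop in RestoreStack.
    Tid TidForEpoch(const std::vector<JournalEntry>& journal, Epoch epoch) {
      for (size_t i = 0; i < journal.size(); i++) {
        if (i == journal.size() - 1 || journal[i + 1].epoch > epoch)
          return journal[i].tid;
      }
      return kInvalidTid;  // empty journal: no thread ever held the slot
    }

    int main() {
      // Slot held by tid 3 from epoch 1, tid 7 from epoch 40, tid 3 from 90.
      std::vector<JournalEntry> journal = {{1, 3}, {40, 7}, {90, 3}};
      std::printf("epoch 10 -> tid %d\n", TidForEpoch(journal, 10));  // 3
      std::printf("epoch 55 -> tid %d\n", TidForEpoch(journal, 55));  // 7
      std::printf("epoch 95 -> tid %d\n", TidForEpoch(journal, 95));  // 3
      return 0;
    }
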
 
diff --git a/libsanitizer/tsan/tsan_rtl_thread.cpp b/libsanitizer/tsan/tsan_rtl_thread.cpp
index 6e652ee8a65..fc5088c336c 100644
--- a/libsanitizer/tsan/tsan_rtl_thread.cpp
+++ b/libsanitizer/tsan/tsan_rtl_thread.cpp
@@ -21,20 +21,14 @@ namespace __tsan {
 
 // ThreadContext implementation.
 
-ThreadContext::ThreadContext(Tid tid)
-    : ThreadContextBase(tid), thr(), sync(), epoch0(), epoch1() {}
+ThreadContext::ThreadContext(Tid tid) : ThreadContextBase(tid), thr(), sync() {}
 
 #if !SANITIZER_GO
 ThreadContext::~ThreadContext() {
 }
 #endif
 
-void ThreadContext::OnReset() {
-  CHECK_EQ(sync.size(), 0);
-  uptr trace_p = GetThreadTrace(tid);
-  ReleaseMemoryPagesToOS(trace_p, trace_p + TraceSize() * sizeof(Event));
-  //!!! ReleaseMemoryToOS(GetThreadTraceHeader(tid), sizeof(Trace));
-}
+void ThreadContext::OnReset() { CHECK(!sync); }
 
 #if !SANITIZER_GO
 struct ThreadLeak {
@@ -112,30 +106,35 @@ int ThreadCount(ThreadState *thr) {
 }
 
 struct OnCreatedArgs {
-  ThreadState *thr;
-  uptr pc;
+  VectorClock *sync;
+  uptr sync_epoch;
+  StackID stack;
 };
 
 Tid ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached) {
-  OnCreatedArgs args = { thr, pc };
-  u32 parent_tid = thr ? thr->tid : kInvalidTid;  // No parent for GCD workers.
-  Tid tid = ctx->thread_registry.CreateThread(uid, detached, parent_tid, &args);
-  DPrintf("#%d: ThreadCreate tid=%d uid=%zu\n", parent_tid, tid, uid);
+  // The main thread and GCD workers don't have a parent thread.
+  Tid parent = kInvalidTid;
+  OnCreatedArgs arg = {nullptr, 0, kInvalidStackID};
+  if (thr) {
+    parent = thr->tid;
+    arg.stack = CurrentStackId(thr, pc);
+    if (!thr->ignore_sync) {
+      SlotLocker locker(thr);
+      thr->clock.ReleaseStore(&arg.sync);
+      arg.sync_epoch = ctx->global_epoch;
+      IncrementEpoch(thr);
+    }
+  }
+  Tid tid = ctx->thread_registry.CreateThread(uid, detached, parent, &arg);
+  DPrintf("#%d: ThreadCreate tid=%d uid=%zu\n", parent, tid, uid);
   return tid;
 }
 
 void ThreadContext::OnCreated(void *arg) {
-  thr = 0;
-  if (tid == kMainTid)
-    return;
   OnCreatedArgs *args = static_cast<OnCreatedArgs *>(arg);
-  if (!args->thr)  // GCD workers don't have a parent thread.
-    return;
-  args->thr->fast_state.IncrementEpoch();
-  // Can't increment epoch w/o writing to the trace as well.
-  TraceAddEvent(args->thr, args->thr->fast_state, EventTypeMop, 0);
-  ReleaseImpl(args->thr, 0, &sync);
-  creation_stack_id = CurrentStackId(args->thr, args->pc);
+  sync = args->sync;
+  sync_epoch = args->sync_epoch;
+  creation_stack_id = args->stack;
 }
 
 extern "C" void __tsan_stack_initialization() {}
@@ -150,6 +149,15 @@ struct OnStartedArgs {
 
 void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id,
                  ThreadType thread_type) {
+  ctx->thread_registry.StartThread(tid, os_id, thread_type, thr);
+  if (!thr->ignore_sync) {
+    SlotAttachAndLock(thr);
+    if (thr->tctx->sync_epoch == ctx->global_epoch)
+      thr->clock.Acquire(thr->tctx->sync);
+    SlotUnlock(thr);
+  }
+  Free(thr->tctx->sync);
+
   uptr stk_addr = 0;
   uptr stk_size = 0;
   uptr tls_addr = 0;
@@ -159,12 +167,10 @@ void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id,
     GetThreadStackAndTls(tid == kMainTid, &stk_addr, &stk_size, &tls_addr,
                          &tls_size);
 #endif
-
-  ThreadRegistry *tr = &ctx->thread_registry;
-  OnStartedArgs args = { thr, stk_addr, stk_size, tls_addr, tls_size };
-  tr->StartThread(tid, os_id, thread_type, &args);
-
-  while (!thr->tctx->trace.parts.Empty()) thr->tctx->trace.parts.PopBack();
+  thr->stk_addr = stk_addr;
+  thr->stk_size = stk_size;
+  thr->tls_addr = tls_addr;
+  thr->tls_size = tls_size;
 
 #if !SANITIZER_GO
   if (ctx->after_multithreaded_fork) {
@@ -192,69 +198,80 @@ void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id,
 }
 
 void ThreadContext::OnStarted(void *arg) {
-  OnStartedArgs *args = static_cast<OnStartedArgs *>(arg);
-  thr = args->thr;
-  // RoundUp so that one trace part does not contain events
-  // from different threads.
-  epoch0 = RoundUp(epoch1 + 1, kTracePartSize);
-  epoch1 = (u64)-1;
-  new (thr)
-      ThreadState(ctx, tid, unique_id, epoch0, reuse_count, args->stk_addr,
-                  args->stk_size, args->tls_addr, args->tls_size);
+  thr = static_cast<ThreadState *>(arg);
+  DPrintf("#%d: ThreadStart\n", tid);
+  new (thr) ThreadState(tid);
   if (common_flags()->detect_deadlocks)
-    thr->dd_lt = ctx->dd->CreateLogicalThread(unique_id);
-  thr->fast_state.SetHistorySize(flags()->history_size);
-  // Commit switch to the new part of the trace.
-  // TraceAddEvent will reset stack0/mset0 in the new part for us.
-  TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
-
-  thr->fast_synch_epoch = epoch0;
-  AcquireImpl(thr, 0, &sync);
-  sync.Reset(&thr->proc()->clock_cache);
+    thr->dd_lt = ctx->dd->CreateLogicalThread(tid);
   thr->tctx = this;
+#if !SANITIZER_GO
   thr->is_inited = true;
-  DPrintf(
-      "#%d: ThreadStart epoch=%zu stk_addr=%zx stk_size=%zx "
-      "tls_addr=%zx tls_size=%zx\n",
-      tid, (uptr)epoch0, args->stk_addr, args->stk_size, args->tls_addr,
-      args->tls_size);
+#endif
 }
 
 void ThreadFinish(ThreadState *thr) {
+  DPrintf("#%d: ThreadFinish\n", thr->tid);
   ThreadCheckIgnore(thr);
   if (thr->stk_addr && thr->stk_size)
     DontNeedShadowFor(thr->stk_addr, thr->stk_size);
   if (thr->tls_addr && thr->tls_size)
     DontNeedShadowFor(thr->tls_addr, thr->tls_size);
   thr->is_dead = true;
-  ctx->thread_registry.FinishThread(thr->tid);
-}
-
-void ThreadContext::OnFinished() {
-#if SANITIZER_GO
+#if !SANITIZER_GO
+  thr->is_inited = false;
+  thr->ignore_interceptors++;
+  PlatformCleanUpThreadState(thr);
+#endif
+  if (!thr->ignore_sync) {
+    SlotLocker locker(thr);
+    ThreadRegistryLock lock(&ctx->thread_registry);
+    // Note: detached is protected by the thread registry mutex,
+    // the thread may be detaching concurrently in another thread.
+    if (!thr->tctx->detached) {
+      thr->clock.ReleaseStore(&thr->tctx->sync);
+      thr->tctx->sync_epoch = ctx->global_epoch;
+      IncrementEpoch(thr);
+    }
+  }
+#if !SANITIZER_GO
+  UnmapOrDie(thr->shadow_stack, kShadowStackSize * sizeof(uptr));
+#else
   Free(thr->shadow_stack);
+#endif
+  thr->shadow_stack = nullptr;
   thr->shadow_stack_pos = nullptr;
   thr->shadow_stack_end = nullptr;
-#endif
-  if (!detached) {
-    thr->fast_state.IncrementEpoch();
-    // Can't increment epoch w/o writing to the trace as well.
-    TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
-    ReleaseImpl(thr, 0, &sync);
-  }
-  epoch1 = thr->fast_state.epoch();
-
   if (common_flags()->detect_deadlocks)
     ctx->dd->DestroyLogicalThread(thr->dd_lt);
-  thr->clock.ResetCached(&thr->proc()->clock_cache);
-#if !SANITIZER_GO
-  thr->last_sleep_clock.ResetCached(&thr->proc()->clock_cache);
-#endif
-#if !SANITIZER_GO
-  PlatformCleanUpThreadState(thr);
-#endif
+  SlotDetach(thr);
+  ctx->thread_registry.FinishThread(thr->tid);
   thr->~ThreadState();
-  thr = 0;
+}
+
+void ThreadContext::OnFinished() {
+  Lock lock(&ctx->slot_mtx);
+  Lock lock1(&trace.mtx);
+  // Queue all trace parts into the global recycle queue.
+  auto parts = &trace.parts;
+  while (trace.local_head) {
+    CHECK(parts->Queued(trace.local_head));
+    ctx->trace_part_recycle.PushBack(trace.local_head);
+    trace.local_head = parts->Next(trace.local_head);
+  }
+  ctx->trace_part_recycle_finished += parts->Size();
+  if (ctx->trace_part_recycle_finished > Trace::kFinishedThreadHi) {
+    ctx->trace_part_finished_excess += parts->Size();
+    trace.parts_allocated = 0;
+  } else if (ctx->trace_part_recycle_finished > Trace::kFinishedThreadLo &&
+             parts->Size() > 1) {
+    ctx->trace_part_finished_excess += parts->Size() - 1;
+    trace.parts_allocated = 1;
+  }
+  // From now on replay will use trace->final_pos.
+  trace.final_pos = (Event *)atomic_load_relaxed(&thr->trace_pos);
+  atomic_store_relaxed(&thr->trace_pos, 0);
+  thr->tctx = nullptr;
+  thr = nullptr;
 }
 
 struct ConsumeThreadContext {
@@ -262,60 +279,47 @@ struct ConsumeThreadContext {
   ThreadContextBase *tctx;
 };
 
-static bool ConsumeThreadByUid(ThreadContextBase *tctx, void *arg) {
-  ConsumeThreadContext *findCtx = (ConsumeThreadContext *)arg;
-  if (tctx->user_id == findCtx->uid && tctx->status != ThreadStatusInvalid) {
-    if (findCtx->tctx) {
-      // Ensure that user_id is unique. If it's not the case we are screwed.
-      // Something went wrong before, but now there is no way to recover.
-      // Returning a wrong thread is not an option, it may lead to very hard
-      // to debug false positives (e.g. if we join a wrong thread).
-      Report("ThreadSanitizer: dup thread with used id 0x%zx\n", findCtx->uid);
-      Die();
-    }
-    findCtx->tctx = tctx;
-    tctx->user_id = 0;
-  }
-  return false;
-}
-
 Tid ThreadConsumeTid(ThreadState *thr, uptr pc, uptr uid) {
-  ConsumeThreadContext findCtx = {uid, nullptr};
-  ctx->thread_registry.FindThread(ConsumeThreadByUid, &findCtx);
-  Tid tid = findCtx.tctx ? findCtx.tctx->tid : kInvalidTid;
-  DPrintf("#%d: ThreadTid uid=%zu tid=%d\n", thr->tid, uid, tid);
-  return tid;
+  return ctx->thread_registry.ConsumeThreadUserId(uid);
 }
 
+struct JoinArg {
+  VectorClock *sync;
+  uptr sync_epoch;
+};
+
 void ThreadJoin(ThreadState *thr, uptr pc, Tid tid) {
   CHECK_GT(tid, 0);
-  CHECK_LT(tid, kMaxTid);
   DPrintf("#%d: ThreadJoin tid=%d\n", thr->tid, tid);
-  ctx->thread_registry.JoinThread(tid, thr);
+  JoinArg arg = {};
+  ctx->thread_registry.JoinThread(tid, &arg);
+  if (!thr->ignore_sync) {
+    SlotLocker locker(thr);
+    if (arg.sync_epoch == ctx->global_epoch)
+      thr->clock.Acquire(arg.sync);
+  }
+  Free(arg.sync);
 }
 
-void ThreadContext::OnJoined(void *arg) {
-  ThreadState *caller_thr = static_cast<ThreadState *>(arg);
-  AcquireImpl(caller_thr, 0, &sync);
-  sync.Reset(&caller_thr->proc()->clock_cache);
+void ThreadContext::OnJoined(void *ptr) {
+  auto arg = static_cast<JoinArg *>(ptr);
+  arg->sync = sync;
+  arg->sync_epoch = sync_epoch;
+  sync = nullptr;
+  sync_epoch = 0;
 }
 
-void ThreadContext::OnDead() { CHECK_EQ(sync.size(), 0); }
+void ThreadContext::OnDead() { CHECK_EQ(sync, nullptr); }
 
 void ThreadDetach(ThreadState *thr, uptr pc, Tid tid) {
   CHECK_GT(tid, 0);
-  CHECK_LT(tid, kMaxTid);
   ctx->thread_registry.DetachThread(tid, thr);
 }
 
-void ThreadContext::OnDetached(void *arg) {
-  ThreadState *thr1 = static_cast<ThreadState *>(arg);
-  sync.Reset(&thr1->proc()->clock_cache);
-}
+void ThreadContext::OnDetached(void *arg) { Free(sync); }
 
 void ThreadNotJoined(ThreadState *thr, uptr pc, Tid tid, uptr uid) {
   CHECK_GT(tid, 0);
-  CHECK_LT(tid, kMaxTid);
   ctx->thread_registry.SetThreadUserId(tid, uid);
 }
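
tsan_shadow.h below encodes an access as a byte mask of the bytes it touches
within an 8-byte shadow cell, so the offset and size are recovered with
ffs/popcount rather than stored as separate addr0/size_log fields. A
self-contained sketch of just that encoding; EncodeAccess and DecodeAccess are
illustrative helpers, not part of the real Shadow class:

    #include <cassert>
    #include <cstdint>

    // Encode a `size`-byte access starting at byte (addr & 7) of its cell,
    // the byte-mask sub-expression used in the Shadow constructor below
    // (before it is shifted into place with kAccessShift).
    static inline uint8_t EncodeAccess(uintptr_t addr, unsigned size) {
      assert(size >= 1 && size <= 8);
      return static_cast<uint8_t>((((1u << size) - 1) << (addr & 0x7)) & 0xff);
    }

    // Decode it back, as Shadow::GetAccess does for ordinary accesses
    // (the kFreeAccess special case is omitted here).
    static inline void DecodeAccess(uint8_t access, unsigned* off,
                                    unsigned* size) {
      *off = access ? __builtin_ffs(access) - 1 : 0;  // lowest set bit
      *size = __builtin_popcount(access);             // bytes touched
    }

    int main() {
      unsigned off, size;
      DecodeAccess(EncodeAccess(0x1003, 2), &off, &size);  // 2 bytes at +3
      assert(off == 3 && size == 2);
      DecodeAccess(EncodeAccess(0x2000, 8), &off, &size);  // whole cell
      assert(off == 0 && size == 8);
      return 0;
    }
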
 
diff --git a/libsanitizer/tsan/tsan_shadow.h b/libsanitizer/tsan/tsan_shadow.h
index 8b7bc341713..843573ecf5d 100644
--- a/libsanitizer/tsan/tsan_shadow.h
+++ b/libsanitizer/tsan/tsan_shadow.h
@@ -10,223 +10,170 @@
 #define TSAN_SHADOW_H
 
 #include "tsan_defs.h"
-#include "tsan_trace.h"
 
 namespace __tsan {
 
-// FastState (from most significant bit):
-//   ignore          : 1
-//   tid             : kTidBits
-//   unused          : -
-//   history_size    : 3
-//   epoch           : kClkBits
 class FastState {
  public:
-  FastState(u64 tid, u64 epoch) {
-    x_ = tid << kTidShift;
-    x_ |= epoch;
-    DCHECK_EQ(tid, this->tid());
-    DCHECK_EQ(epoch, this->epoch());
-    DCHECK_EQ(GetIgnoreBit(), false);
-  }
-
-  explicit FastState(u64 x) : x_(x) {}
-
-  u64 raw() const { return x_; }
-
-  u64 tid() const {
-    u64 res = (x_ & ~kIgnoreBit) >> kTidShift;
-    return res;
-  }
-
-  u64 TidWithIgnore() const {
-    u64 res = x_ >> kTidShift;
-    return res;
-  }
-
-  u64 epoch() const {
-    u64 res = x_ & ((1ull << kClkBits) - 1);
-    return res;
-  }
+  FastState() { Reset(); }
 
-  void IncrementEpoch() {
-    u64 old_epoch = epoch();
-    x_ += 1;
-    DCHECK_EQ(old_epoch + 1, epoch());
-    (void)old_epoch;
+  void Reset() {
+    part_.unused0_ = 0;
+    part_.sid_ = static_cast<u8>(kFreeSid);
+    part_.epoch_ = static_cast<u16>(kEpochLast);
+    part_.unused1_ = 0;
+    part_.ignore_accesses_ = false;
   }
 
-  void SetIgnoreBit() { x_ |= kIgnoreBit; }
-  void ClearIgnoreBit() { x_ &= ~kIgnoreBit; }
-  bool GetIgnoreBit() const { return (s64)x_ < 0; }
+  void SetSid(Sid sid) { part_.sid_ = static_cast<u8>(sid); }
 
-  void SetHistorySize(int hs) {
-    CHECK_GE(hs, 0);
-    CHECK_LE(hs, 7);
-    x_ = (x_ & ~(kHistoryMask << kHistoryShift)) | (u64(hs) << kHistoryShift);
-  }
+  Sid sid() const { return static_cast<Sid>(part_.sid_); }
 
-  ALWAYS_INLINE
-  int GetHistorySize() const {
-    return (int)((x_ >> kHistoryShift) & kHistoryMask);
-  }
+  Epoch epoch() const { return static_cast<Epoch>(part_.epoch_); }
 
-  void ClearHistorySize() { SetHistorySize(0); }
+  void SetEpoch(Epoch epoch) { part_.epoch_ = static_cast<u16>(epoch); }
 
-  ALWAYS_INLINE
-  u64 GetTracePos() const {
-    const int hs = GetHistorySize();
-    // When hs == 0, the trace consists of 2 parts.
-    const u64 mask = (1ull << (kTracePartSizeBits + hs + 1)) - 1;
-    return epoch() & mask;
-  }
+  void SetIgnoreBit() { part_.ignore_accesses_ = 1; }
+  void ClearIgnoreBit() { part_.ignore_accesses_ = 0; }
+  bool GetIgnoreBit() const { return part_.ignore_accesses_; }
 
  private:
   friend class Shadow;
-  static const int kTidShift = 64 - kTidBits - 1;
-  static const u64 kIgnoreBit = 1ull << 63;
-  static const u64 kFreedBit = 1ull << 63;
-  static const u64 kHistoryShift = kClkBits;
-  static const u64 kHistoryMask = 7;
-  u64 x_;
+  struct Parts {
+    u32 unused0_ : 8;
+    u32 sid_ : 8;
+    u32 epoch_ : kEpochBits;
+    u32 unused1_ : 1;
+    u32 ignore_accesses_ : 1;
+  };
+  union {
+    Parts part_;
+    u32 raw_;
+  };
 };
 
-// Shadow (from most significant bit):
-//   freed           : 1
-//   tid             : kTidBits
-//   is_atomic       : 1
-//   is_read         : 1
-//   size_log        : 2
-//   addr0           : 3
-//   epoch           : kClkBits
-class Shadow : public FastState {
- public:
-  explicit Shadow(u64 x) : FastState(x) {}
+static_assert(sizeof(FastState) == kShadowSize, "bad FastState size");
 
-  explicit Shadow(const FastState &s) : FastState(s.x_) { ClearHistorySize(); }
-
-  void SetAddr0AndSizeLog(u64 addr0, unsigned kAccessSizeLog) {
-    DCHECK_EQ((x_ >> kClkBits) & 31, 0);
-    DCHECK_LE(addr0, 7);
-    DCHECK_LE(kAccessSizeLog, 3);
-    x_ |= ((kAccessSizeLog << 3) | addr0) << kClkBits;
-    DCHECK_EQ(kAccessSizeLog, size_log());
-    DCHECK_EQ(addr0, this->addr0());
-  }
-
-  void SetWrite(unsigned kAccessIsWrite) {
-    DCHECK_EQ(x_ & kReadBit, 0);
-    if (!kAccessIsWrite)
-      x_ |= kReadBit;
-    DCHECK_EQ(kAccessIsWrite, IsWrite());
-  }
-
-  void SetAtomic(bool kIsAtomic) {
-    DCHECK(!IsAtomic());
-    if (kIsAtomic)
-      x_ |= kAtomicBit;
-    DCHECK_EQ(IsAtomic(), kIsAtomic);
-  }
-
-  bool IsAtomic() const { return x_ & kAtomicBit; }
-
-  bool IsZero() const { return x_ == 0; }
-
-  static inline bool TidsAreEqual(const Shadow s1, const Shadow s2) {
-    u64 shifted_xor = (s1.x_ ^ s2.x_) >> kTidShift;
-    DCHECK_EQ(shifted_xor == 0, s1.TidWithIgnore() == s2.TidWithIgnore());
-    return shifted_xor == 0;
-  }
-
-  static ALWAYS_INLINE bool Addr0AndSizeAreEqual(const Shadow s1,
-                                                 const Shadow s2) {
-    u64 masked_xor = ((s1.x_ ^ s2.x_) >> kClkBits) & 31;
-    return masked_xor == 0;
+class Shadow {
+ public:
+  static constexpr RawShadow kEmpty = static_cast<RawShadow>(0);
+
+  Shadow(FastState state, u32 addr, u32 size, AccessType typ) {
+    raw_ = state.raw_;
+    DCHECK_GT(size, 0);
+    DCHECK_LE(size, 8);
+    UNUSED Sid sid0 = part_.sid_;
+    UNUSED u16 epoch0 = part_.epoch_;
+    raw_ |= (!!(typ & kAccessAtomic) << kIsAtomicShift) |
+            (!!(typ & kAccessRead) << kIsReadShift) |
+            (((((1u << size) - 1) << (addr & 0x7)) & 0xff) << kAccessShift);
+    // Note: we don't check kAccessAtomic because it overlaps with
+    // FastState::ignore_accesses_ and it may be set spuriously.
+    DCHECK_EQ(part_.is_read_, !!(typ & kAccessRead));
+    DCHECK_EQ(sid(), sid0);
+    DCHECK_EQ(epoch(), epoch0);
+  }
+
+  explicit Shadow(RawShadow x = Shadow::kEmpty) { raw_ = static_cast<u32>(x); }
+
+  RawShadow raw() const { return static_cast<RawShadow>(raw_); }
+  Sid sid() const { return part_.sid_; }
+  Epoch epoch() const { return static_cast<Epoch>(part_.epoch_); }
+  u8 access() const { return part_.access_; }
+
+  void GetAccess(uptr *addr, uptr *size, AccessType *typ) const {
+    DCHECK(part_.access_ != 0 || raw_ == static_cast<u32>(Shadow::kRodata));
+    if (addr)
+      *addr = part_.access_ ? __builtin_ffs(part_.access_) - 1 : 0;
+    if (size)
+      *size = part_.access_ == kFreeAccess ? kShadowCell
+                                           : __builtin_popcount(part_.access_);
+    if (typ)
+      *typ = (part_.is_read_ ? kAccessRead : kAccessWrite) |
+             (part_.is_atomic_ ? kAccessAtomic : 0) |
+             (part_.access_ == kFreeAccess ? kAccessFree : 0);
   }
 
-  static ALWAYS_INLINE bool TwoRangesIntersect(Shadow s1, Shadow s2,
-                                               unsigned kS2AccessSize) {
-    bool res = false;
-    u64 diff = s1.addr0() - s2.addr0();
-    if ((s64)diff < 0) {  // s1.addr0 < s2.addr0
-      // if (s1.addr0() + size1) > s2.addr0()) return true;
-      if (s1.size() > -diff)
-        res = true;
-    } else {
-      // if (s2.addr0() + kS2AccessSize > s1.addr0()) return true;
-      if (kS2AccessSize > diff)
-        res = true;
-    }
-    DCHECK_EQ(res, TwoRangesIntersectSlow(s1, s2));
-    DCHECK_EQ(res, TwoRangesIntersectSlow(s2, s1));
+  ALWAYS_INLINE
+  bool IsBothReadsOrAtomic(AccessType typ) const {
+    u32 is_read = !!(typ & kAccessRead);
+    u32 is_atomic = !!(typ & kAccessAtomic);
+    bool res =
+        raw_ & ((is_atomic << kIsAtomicShift) | (is_read << kIsReadShift));
+    DCHECK_EQ(res,
+              (part_.is_read_ && is_read) || (part_.is_atomic_ && is_atomic));
     return res;
   }
 
-  u64 ALWAYS_INLINE addr0() const { return (x_ >> kClkBits) & 7; }
-  u64 ALWAYS_INLINE size() const { return 1ull << size_log(); }
-  bool ALWAYS_INLINE IsWrite() const { return !IsRead(); }
-  bool ALWAYS_INLINE IsRead() const { return x_ & kReadBit; }
-
-  // The idea behind the freed bit is as follows.
-  // When the memory is freed (or otherwise unaccessible) we write to the shadow
-  // values with tid/epoch related to the free and the freed bit set.
-  // During memory accesses processing the freed bit is considered
-  // as msb of tid. So any access races with shadow with freed bit set
-  // (it is as if write from a thread with which we never synchronized before).
-  // This allows us to detect accesses to freed memory w/o additional
-  // overheads in memory access processing and at the same time restore
-  // tid/epoch of free.
-  void MarkAsFreed() { x_ |= kFreedBit; }
-
-  bool IsFreed() const { return x_ & kFreedBit; }
-
-  bool GetFreedAndReset() {
-    bool res = x_ & kFreedBit;
-    x_ &= ~kFreedBit;
+  ALWAYS_INLINE
+  bool IsRWWeakerOrEqual(AccessType typ) const {
+    u32 is_read = !!(typ & kAccessRead);
+    u32 is_atomic = !!(typ & kAccessAtomic);
+    UNUSED u32 res0 =
+        (part_.is_atomic_ > is_atomic) ||
+        (part_.is_atomic_ == is_atomic && part_.is_read_ >= is_read);
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    const u32 kAtomicReadMask = (1 << kIsAtomicShift) | (1 << kIsReadShift);
+    bool res = (raw_ & kAtomicReadMask) >=
+               ((is_atomic << kIsAtomicShift) | (is_read << kIsReadShift));
+
+    DCHECK_EQ(res, res0);
     return res;
+#else
+    return res0;
+#endif
   }
 
-  bool ALWAYS_INLINE IsBothReadsOrAtomic(bool kIsWrite, bool kIsAtomic) const {
-    bool v = x_ & ((u64(kIsWrite ^ 1) << kReadShift) |
-                   (u64(kIsAtomic) << kAtomicShift));
-    DCHECK_EQ(v, (!IsWrite() && !kIsWrite) || (IsAtomic() && kIsAtomic));
-    return v;
-  }
-
-  bool ALWAYS_INLINE IsRWNotWeaker(bool kIsWrite, bool kIsAtomic) const {
-    bool v = ((x_ >> kReadShift) & 3) <= u64((kIsWrite ^ 1) | (kIsAtomic << 1));
-    DCHECK_EQ(v, (IsAtomic() < kIsAtomic) ||
-                     (IsAtomic() == kIsAtomic && !IsWrite() <= !kIsWrite));
-    return v;
+  // The FreedMarker must not pass "the same access check" so that we don't
+  // return from the race detection algorithm early.
+  static RawShadow FreedMarker() {
+    FastState fs;
+    fs.SetSid(kFreeSid);
+    fs.SetEpoch(kEpochLast);
+    Shadow s(fs, 0, 8, kAccessWrite);
+    return s.raw();
   }
 
-  bool ALWAYS_INLINE IsRWWeakerOrEqual(bool kIsWrite, bool kIsAtomic) const {
-    bool v = ((x_ >> kReadShift) & 3) >= u64((kIsWrite ^ 1) | (kIsAtomic << 1));
-    DCHECK_EQ(v, (IsAtomic() > kIsAtomic) ||
-                     (IsAtomic() == kIsAtomic && !IsWrite() >= !kIsWrite));
-    return v;
+  static RawShadow FreedInfo(Sid sid, Epoch epoch) {
+    Shadow s;
+    s.part_.sid_ = sid;
+    s.part_.epoch_ = static_cast<u16>(epoch);
+    s.part_.access_ = kFreeAccess;
+    return s.raw();
   }
 
  private:
-  static const u64 kReadShift = 5 + kClkBits;
-  static const u64 kReadBit = 1ull << kReadShift;
-  static const u64 kAtomicShift = 6 + kClkBits;
-  static const u64 kAtomicBit = 1ull << kAtomicShift;
-
-  u64 size_log() const { return (x_ >> (3 + kClkBits)) & 3; }
-
-  static bool TwoRangesIntersectSlow(const Shadow s1, const Shadow s2) {
-    if (s1.addr0() == s2.addr0())
-      return true;
-    if (s1.addr0() < s2.addr0() && s1.addr0() + s1.size() > s2.addr0())
-      return true;
-    if (s2.addr0() < s1.addr0() && s2.addr0() + s2.size() > s1.addr0())
-      return true;
-    return false;
-  }
+  struct Parts {
+    u8 access_;
+    Sid sid_;
+    u16 epoch_ : kEpochBits;
+    u16 is_read_ : 1;
+    u16 is_atomic_ : 1;
+  };
+  union {
+    Parts part_;
+    u32 raw_;
+  };
+
+  static constexpr u8 kFreeAccess = 0x81;
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  static constexpr uptr kAccessShift = 0;
+  static constexpr uptr kIsReadShift = 30;
+  static constexpr uptr kIsAtomicShift = 31;
+#else
+  static constexpr uptr kAccessShift = 24;
+  static constexpr uptr kIsReadShift = 1;
+  static constexpr uptr kIsAtomicShift = 0;
+#endif
+
+ public:
+  // .rodata shadow marker, see MapRodata and ContainsSameAccessFast.
+  static constexpr RawShadow kRodata =
+      static_cast<RawShadow>(1 << kIsReadShift);
 };
 
-const RawShadow kShadowRodata = (RawShadow)-1;  // .rodata shadow marker
+static_assert(sizeof(Shadow) == kShadowSize, "bad Shadow size");
 
 }  // namespace __tsan
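
The new Shadow above packs each access into 32 bits: an 8-bit mask of the bytes
touched within the 8-byte shadow cell, the Sid, the epoch, and is_read/is_atomic
flags. A minimal standalone sketch of the byte-mask encoding and decoding (not
part of the patch; the helper names encode_access_mask/decode_access are
invented for illustration):

// Sketch only: mirrors Shadow's byte-mask encoding of an access within an
// 8-byte shadow cell.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Bit i of the mask is set iff byte i of the cell is touched by the access.
static uint8_t encode_access_mask(uint32_t addr, uint32_t size) {
  assert(size >= 1 && size <= 8);
  return static_cast<uint8_t>((((1u << size) - 1) << (addr & 0x7)) & 0xff);
}

// Recovers offset and size the same way Shadow::GetAccess does above.
static void decode_access(uint8_t mask, uint32_t *offset, uint32_t *size) {
  *offset = mask ? __builtin_ffs(mask) - 1 : 0;  // lowest set bit
  *size = __builtin_popcount(mask);              // number of touched bytes
}

int main() {
  uint32_t off, sz;
  uint8_t mask = encode_access_mask(/*addr=*/0x7ffe3, /*size=*/4);  // offset 3
  decode_access(mask, &off, &sz);
  printf("mask=0x%02x offset=%u size=%u\n", mask, off, sz);  // 0x78, 3, 4
  return 0;
}

A useful property of this encoding is that two accesses within the same cell
overlap exactly when the AND of their masks is non-zero.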
 
diff --git a/libsanitizer/tsan/tsan_sync.cpp b/libsanitizer/tsan/tsan_sync.cpp
index f042abab74e..09d41780d18 100644
--- a/libsanitizer/tsan/tsan_sync.cpp
+++ b/libsanitizer/tsan/tsan_sync.cpp
@@ -18,43 +18,31 @@ namespace __tsan {
 
 void DDMutexInit(ThreadState *thr, uptr pc, SyncVar *s);
 
-SyncVar::SyncVar() : mtx(MutexTypeSyncVar) { Reset(0); }
+SyncVar::SyncVar() : mtx(MutexTypeSyncVar) { Reset(); }
 
-void SyncVar::Init(ThreadState *thr, uptr pc, uptr addr, u64 uid,
-                   bool save_stack) {
+void SyncVar::Init(ThreadState *thr, uptr pc, uptr addr, bool save_stack) {
+  Reset();
   this->addr = addr;
-  this->uid = uid;
-  this->next = 0;
-
-  creation_stack_id = kInvalidStackID;
+  next = 0;
   if (save_stack && !SANITIZER_GO)  // Go does not use them
     creation_stack_id = CurrentStackId(thr, pc);
   if (common_flags()->detect_deadlocks)
     DDMutexInit(thr, pc, this);
 }
 
-void SyncVar::Reset(Processor *proc) {
-  uid = 0;
+void SyncVar::Reset() {
+  CHECK(!ctx->resetting);
   creation_stack_id = kInvalidStackID;
   owner_tid = kInvalidTid;
-  last_lock = 0;
+  last_lock.Reset();
   recursion = 0;
   atomic_store_relaxed(&flags, 0);
-
-  if (proc == 0) {
-    CHECK_EQ(clock.size(), 0);
-    CHECK_EQ(read_clock.size(), 0);
-  } else {
-    clock.Reset(&proc->clock_cache);
-    read_clock.Reset(&proc->clock_cache);
-  }
+  Free(clock);
+  Free(read_clock);
 }
 
 MetaMap::MetaMap()
-    : block_alloc_(LINKER_INITIALIZED, "heap block allocator"),
-      sync_alloc_(LINKER_INITIALIZED, "sync allocator") {
-  atomic_store(&uid_gen_, 0, memory_order_relaxed);
-}
+    : block_alloc_("heap block allocator"), sync_alloc_("sync allocator") {}
 
 void MetaMap::AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz) {
   u32 idx = block_alloc_.Alloc(&thr->proc()->block_cache);
@@ -68,16 +56,16 @@ void MetaMap::AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz) {
   *meta = idx | kFlagBlock;
 }
 
-uptr MetaMap::FreeBlock(Processor *proc, uptr p) {
+uptr MetaMap::FreeBlock(Processor *proc, uptr p, bool reset) {
   MBlock* b = GetBlock(p);
   if (b == 0)
     return 0;
   uptr sz = RoundUpTo(b->siz, kMetaShadowCell);
-  FreeRange(proc, p, sz);
+  FreeRange(proc, p, sz, reset);
   return sz;
 }
 
-bool MetaMap::FreeRange(Processor *proc, uptr p, uptr sz) {
+bool MetaMap::FreeRange(Processor *proc, uptr p, uptr sz, bool reset) {
   bool has_something = false;
   u32 *meta = MemToMeta(p);
   u32 *end = MemToMeta(p + sz);
@@ -99,7 +87,8 @@ bool MetaMap::FreeRange(Processor *proc, uptr p, uptr sz) {
         DCHECK(idx & kFlagSync);
         SyncVar *s = sync_alloc_.Map(idx & ~kFlagMask);
         u32 next = s->next;
-        s->Reset(proc);
+        if (reset)
+          s->Reset();
         sync_alloc_.Free(&proc->sync_cache, idx & ~kFlagMask);
         idx = next;
       } else {
@@ -116,30 +105,30 @@ bool MetaMap::FreeRange(Processor *proc, uptr p, uptr sz) {
 // which can be huge. The function probes pages one-by-one until it finds a page
 // without meta objects; at that point it stops freeing meta objects. Because
 // thread stacks grow top-down, we do the same starting from the end as well.
-void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz) {
+void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz, bool reset) {
   if (SANITIZER_GO) {
     // UnmapOrDie/MmapFixedNoReserve does not work on Windows,
     // so we do the optimization only for C/C++.
-    FreeRange(proc, p, sz);
+    FreeRange(proc, p, sz, reset);
     return;
   }
   const uptr kMetaRatio = kMetaShadowCell / kMetaShadowSize;
   const uptr kPageSize = GetPageSizeCached() * kMetaRatio;
   if (sz <= 4 * kPageSize) {
     // If the range is small, just do the normal free procedure.
-    FreeRange(proc, p, sz);
+    FreeRange(proc, p, sz, reset);
     return;
   }
   // First, round both ends of the range to page size.
   uptr diff = RoundUp(p, kPageSize) - p;
   if (diff != 0) {
-    FreeRange(proc, p, diff);
+    FreeRange(proc, p, diff, reset);
     p += diff;
     sz -= diff;
   }
   diff = p + sz - RoundDown(p + sz, kPageSize);
   if (diff != 0) {
-    FreeRange(proc, p + sz - diff, diff);
+    FreeRange(proc, p + sz - diff, diff, reset);
     sz -= diff;
   }
   // Now we must have a non-empty page-aligned range.
@@ -150,7 +139,7 @@ void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz) {
   const uptr sz0 = sz;
   // Probe start of the range.
   for (uptr checked = 0; sz > 0; checked += kPageSize) {
-    bool has_something = FreeRange(proc, p, kPageSize);
+    bool has_something = FreeRange(proc, p, kPageSize, reset);
     p += kPageSize;
     sz -= kPageSize;
     if (!has_something && checked > (128 << 10))
@@ -158,7 +147,7 @@ void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz) {
   }
   // Probe end of the range.
   for (uptr checked = 0; sz > 0; checked += kPageSize) {
-    bool has_something = FreeRange(proc, p + sz - kPageSize, kPageSize);
+    bool has_something = FreeRange(proc, p + sz - kPageSize, kPageSize, reset);
     sz -= kPageSize;
     // Stacks grow down, so sync objects are most likely at the end of the region
     // (if it is a stack). The very end of the stack is TLS and tsan increases
@@ -177,6 +166,27 @@ void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz) {
     Die();
 }
 
+void MetaMap::ResetClocks() {
+  // This can be called from the background thread
+  // which does not have proc/cache.
+  // The cache is too large for stack.
+  static InternalAllocatorCache cache;
+  internal_memset(&cache, 0, sizeof(cache));
+  internal_allocator()->InitCache(&cache);
+  sync_alloc_.ForEach([&](SyncVar *s) {
+    if (s->clock) {
+      InternalFree(s->clock, &cache);
+      s->clock = nullptr;
+    }
+    if (s->read_clock) {
+      InternalFree(s->read_clock, &cache);
+      s->read_clock = nullptr;
+    }
+    s->last_lock.Reset();
+  });
+  internal_allocator()->DestroyCache(&cache);
+}
+
 MBlock* MetaMap::GetBlock(uptr p) {
   u32 *meta = MemToMeta(p);
   u32 idx = *meta;
@@ -193,6 +203,7 @@ MBlock* MetaMap::GetBlock(uptr p) {
 
 SyncVar *MetaMap::GetSync(ThreadState *thr, uptr pc, uptr addr, bool create,
                           bool save_stack) {
+  DCHECK(!create || thr->slot_locked);
   u32 *meta = MemToMeta(addr);
   u32 idx0 = *meta;
   u32 myidx = 0;
@@ -203,7 +214,7 @@ SyncVar *MetaMap::GetSync(ThreadState *thr, uptr pc, uptr addr, bool create,
       SyncVar * s = sync_alloc_.Map(idx & ~kFlagMask);
       if (LIKELY(s->addr == addr)) {
         if (UNLIKELY(myidx != 0)) {
-          mys->Reset(thr->proc());
+          mys->Reset();
           sync_alloc_.Free(&thr->proc()->sync_cache, myidx);
         }
         return s;
@@ -218,10 +229,9 @@ SyncVar *MetaMap::GetSync(ThreadState *thr, uptr pc, uptr addr, bool create,
     }
 
     if (LIKELY(myidx == 0)) {
-      const u64 uid = atomic_fetch_add(&uid_gen_, 1, memory_order_relaxed);
       myidx = sync_alloc_.Alloc(&thr->proc()->sync_cache);
       mys = sync_alloc_.Map(myidx);
-      mys->Init(thr, pc, addr, uid, save_stack);
+      mys->Init(thr, pc, addr, save_stack);
     }
     mys->next = idx0;
     if (atomic_compare_exchange_strong((atomic_uint32_t*)meta, &idx0,
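
The ResetRange hunks above keep the existing page-probing strategy for huge
regions while threading the new reset flag through. A simplified, self-contained
sketch of that strategy (the names here are illustrative, not the tsan API):

#include <cstddef>
#include <functional>

// free_page(addr) frees meta objects in [addr, addr + page_size) and returns
// true if the page contained any. The range is assumed page-aligned already
// (the real code trims unaligned ends with FreeRange first).
void probe_range(size_t p, size_t sz, size_t page_size,
                 const std::function<bool(size_t)> &free_page) {
  const size_t kGiveUpAfter = 128 << 10;  // stop after ~128KB of empty pages
  // Probe from the start of the range.
  for (size_t checked = 0; sz > 0; checked += page_size) {
    bool had_meta = free_page(p);
    p += page_size;
    sz -= page_size;
    if (!had_meta && checked > kGiveUpAfter)
      break;
  }
  // Probe from the end: stacks grow down, so a stack's sync objects are
  // most likely near the end of its region.
  for (size_t checked = 0; sz > 0; checked += page_size) {
    bool had_meta = free_page(p + sz - page_size);
    sz -= page_size;
    if (!had_meta && checked > kGiveUpAfter)
      break;
  }
}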
diff --git a/libsanitizer/tsan/tsan_sync.h b/libsanitizer/tsan/tsan_sync.h
index fc8fa288a84..1e5f828349c 100644
--- a/libsanitizer/tsan/tsan_sync.h
+++ b/libsanitizer/tsan/tsan_sync.h
@@ -15,9 +15,11 @@
 #include "sanitizer_common/sanitizer_atomic.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_deadlock_detector_interface.h"
-#include "tsan_defs.h"
 #include "tsan_clock.h"
+#include "tsan_defs.h"
 #include "tsan_dense_alloc.h"
+#include "tsan_shadow.h"
+#include "tsan_vector_clock.h"
 
 namespace __tsan {
 
@@ -53,34 +55,18 @@ struct SyncVar {
 
   uptr addr;  // overwritten by DenseSlabAlloc freelist
   Mutex mtx;
-  u64 uid;  // Globally unique id.
   StackID creation_stack_id;
   Tid owner_tid;  // Set only by exclusive owners.
-  u64 last_lock;
+  FastState last_lock;
   int recursion;
   atomic_uint32_t flags;
   u32 next;  // in MetaMap
   DDMutex dd;
-  SyncClock read_clock;  // Used for rw mutexes only.
-  // The clock is placed last, so that it is situated on a different cache line
-  // with the mtx. This reduces contention for hot sync objects.
-  SyncClock clock;
+  VectorClock *read_clock;  // Used for rw mutexes only.
+  VectorClock *clock;
 
-  void Init(ThreadState *thr, uptr pc, uptr addr, u64 uid, bool save_stack);
-  void Reset(Processor *proc);
-
-  u64 GetId() const {
-    // 48 lsb is addr, then 14 bits is low part of uid, then 2 zero bits.
-    return GetLsb((u64)addr | (uid << 48), 60);
-  }
-  bool CheckId(u64 uid) const {
-    CHECK_EQ(uid, GetLsb(uid, 14));
-    return GetLsb(this->uid, 14) == uid;
-  }
-  static uptr SplitId(u64 id, u64 *uid) {
-    *uid = id >> 48;
-    return (uptr)GetLsb(id, 48);
-  }
+  void Init(ThreadState *thr, uptr pc, uptr addr, bool save_stack);
+  void Reset();
 
   bool IsFlagSet(u32 f) const {
     return atomic_load_relaxed(&flags) & f;
@@ -110,9 +96,20 @@ class MetaMap {
   MetaMap();
 
   void AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz);
-  uptr FreeBlock(Processor *proc, uptr p);
-  bool FreeRange(Processor *proc, uptr p, uptr sz);
-  void ResetRange(Processor *proc, uptr p, uptr sz);
+
+  // FreeBlock resets all sync objects in the range if reset=true and must not
+  // run concurrently with ResetClocks which resets all sync objects
+  // w/o any synchronization (as part of DoReset).
+  // If we don't have a thread slot (very early/late in thread lifetime or
+  // Go/Java callbacks) or the slot is not locked, then reset must be set to
+  // false. In that case the sync object clocks will be reset later (when the
+  // object is reused or during the next ResetClocks).
+  uptr FreeBlock(Processor *proc, uptr p, bool reset);
+  bool FreeRange(Processor *proc, uptr p, uptr sz, bool reset);
+  void ResetRange(Processor *proc, uptr p, uptr sz, bool reset);
+  // Reset vector clocks of all sync objects.
+  // Must be called when no other threads access sync objects.
+  void ResetClocks();
   MBlock* GetBlock(uptr p);
 
   SyncVar *GetSyncOrCreate(ThreadState *thr, uptr pc, uptr addr,
@@ -142,7 +139,6 @@ class MetaMap {
   typedef DenseSlabAlloc<SyncVar, 1 << 20, 1 << 10, kFlagMask> SyncAlloc;
   BlockAlloc block_alloc_;
   SyncAlloc sync_alloc_;
-  atomic_uint64_t uid_gen_;
 
   SyncVar *GetSync(ThreadState *thr, uptr pc, uptr addr, bool create,
                    bool save_stack);
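
The comment on FreeBlock/FreeRange above describes a deferred-reset contract:
callers that cannot safely reset sync objects pass reset=false and rely on a
later reuse or a global ResetClocks pass. A compilable toy model of that
contract (all types and names below are invented, not tsan code):

#include <cstdio>
#include <vector>

struct ToySync {
  std::vector<unsigned> *clock = nullptr;  // lazily allocated vector clock
  void Reset() {
    delete clock;
    clock = nullptr;
  }
};

// Mirrors the FreeRange contract: reset == false leaves the clock in place.
void toy_free(ToySync &s, bool reset) {
  if (reset)
    s.Reset();
}

// Mirrors MetaMap::ResetClocks: runs when no other thread touches sync objects.
void toy_reset_clocks(std::vector<ToySync> &all) {
  for (ToySync &s : all)
    s.Reset();
}

int main() {
  std::vector<ToySync> syncs(2);
  syncs[0].clock = new std::vector<unsigned>{1, 2, 3};
  syncs[1].clock = new std::vector<unsigned>{4};
  toy_free(syncs[0], /*reset=*/true);   // slot locked: reset immediately
  toy_free(syncs[1], /*reset=*/false);  // no locked slot: defer
  printf("deferred clock still present: %d\n", syncs[1].clock != nullptr);
  toy_reset_clocks(syncs);  // later, a quiescent global reset
  printf("after ResetClocks: %d\n", syncs[1].clock != nullptr);
  return 0;
}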
diff --git a/libsanitizer/tsan/tsan_trace.h b/libsanitizer/tsan/tsan_trace.h
index a771ad9f52f..01bb7b34f43 100644
--- a/libsanitizer/tsan/tsan_trace.h
+++ b/libsanitizer/tsan/tsan_trace.h
@@ -19,57 +19,6 @@
 
 namespace __tsan {
 
-const int kTracePartSizeBits = 13;
-const int kTracePartSize = 1 << kTracePartSizeBits;
-const int kTraceParts = 2 * 1024 * 1024 / kTracePartSize;
-const int kTraceSize = kTracePartSize * kTraceParts;
-
-// Must fit into 3 bits.
-enum EventType {
-  EventTypeMop,
-  EventTypeFuncEnter,
-  EventTypeFuncExit,
-  EventTypeLock,
-  EventTypeUnlock,
-  EventTypeRLock,
-  EventTypeRUnlock
-};
-
-// Represents a thread event (from most significant bit):
-// u64 typ  : 3;   // EventType.
-// u64 addr : 61;  // Associated pc.
-typedef u64 Event;
-
-const uptr kEventPCBits = 61;
-
-struct TraceHeader {
-#if !SANITIZER_GO
-  BufferedStackTrace stack0;  // Start stack for the trace.
-#else
-  VarSizeStackTrace stack0;
-#endif
-  u64        epoch0;  // Start epoch for the trace.
-  MutexSet   mset0;
-
-  TraceHeader() : stack0(), epoch0() {}
-};
-
-struct Trace {
-  Mutex mtx;
-#if !SANITIZER_GO
-  // Must be last to catch overflow as paging fault.
-  // Go shadow stack is dynamically allocated.
-  uptr shadow_stack[kShadowStackSize];
-#endif
-  // Must be the last field, because we unmap the unused part in
-  // CreateThreadContext.
-  TraceHeader headers[kTraceParts];
-
-  Trace() : mtx(MutexTypeTrace) {}
-};
-
-namespace v3 {
-
 enum class EventType : u64 {
   kAccessExt,
   kAccessRange,
@@ -99,6 +48,8 @@ static constexpr Event NopEvent = {1, 0, EventType::kAccessExt, 0};
 // close enough to each other. Otherwise we fall back to EventAccessExt.
 struct EventAccess {
   static constexpr uptr kPCBits = 15;
+  static_assert(kPCBits + kCompressedAddrBits + 5 == 64,
+                "unused bits in EventAccess");
 
   u64 is_access : 1;  // = 1
   u64 is_read : 1;
@@ -119,13 +70,23 @@ static_assert(sizeof(EventFunc) == 8, "bad EventFunc size");
 
 // Extended memory access with full PC.
 struct EventAccessExt {
+  // Note: precisely specifying the unused parts of the bitfield is critical for
+  // performance. If we don't specify them, the compiler generates code to load
+  // the old value and shuffle it to extract the unused bits to apply to the new
+  // value. If we specify the unused part and store 0 in there, all that
+  // unnecessary code goes away (store of the 0 const is combined with other
+  // constant parts).
+  static constexpr uptr kUnusedBits = 11;
+  static_assert(kCompressedAddrBits + kUnusedBits + 9 == 64,
+                "unused bits in EventAccessExt");
+
   u64 is_access : 1;   // = 0
   u64 is_func : 1;     // = 0
   EventType type : 3;  // = EventType::kAccessExt
   u64 is_read : 1;
   u64 is_atomic : 1;
   u64 size_log : 2;
-  u64 _ : 11;
+  u64 _ : kUnusedBits;
   u64 addr : kCompressedAddrBits;
   u64 pc;
 };
@@ -134,6 +95,8 @@ static_assert(sizeof(EventAccessExt) == 16, "bad EventAccessExt size");
 // Access to a memory range.
 struct EventAccessRange {
   static constexpr uptr kSizeLoBits = 13;
+  static_assert(kCompressedAddrBits + kSizeLoBits + 7 == 64,
+                "unused bits in EventAccessRange");
 
   u64 is_access : 1;   // = 0
   u64 is_func : 1;     // = 0
@@ -150,6 +113,13 @@ static_assert(sizeof(EventAccessRange) == 16, "bad EventAccessRange size");
 // Mutex lock.
 struct EventLock {
   static constexpr uptr kStackIDLoBits = 15;
+  static constexpr uptr kStackIDHiBits =
+      sizeof(StackID) * kByteBits - kStackIDLoBits;
+  static constexpr uptr kUnusedBits = 3;
+  static_assert(kCompressedAddrBits + kStackIDLoBits + 5 == 64,
+                "unused bits in EventLock");
+  static_assert(kCompressedAddrBits + kStackIDHiBits + kUnusedBits == 64,
+                "unused bits in EventLock");
 
   u64 is_access : 1;   // = 0
   u64 is_func : 1;     // = 0
@@ -157,29 +127,37 @@ struct EventLock {
   u64 pc : kCompressedAddrBits;
   u64 stack_lo : kStackIDLoBits;
   u64 stack_hi : sizeof(StackID) * kByteBits - kStackIDLoBits;
-  u64 _ : 3;
+  u64 _ : kUnusedBits;
   u64 addr : kCompressedAddrBits;
 };
 static_assert(sizeof(EventLock) == 16, "bad EventLock size");
 
 // Mutex unlock.
 struct EventUnlock {
+  static constexpr uptr kUnusedBits = 15;
+  static_assert(kCompressedAddrBits + kUnusedBits + 5 == 64,
+                "unused bits in EventUnlock");
+
   u64 is_access : 1;   // = 0
   u64 is_func : 1;     // = 0
   EventType type : 3;  // = EventType::kUnlock
-  u64 _ : 15;
+  u64 _ : kUnusedBits;
   u64 addr : kCompressedAddrBits;
 };
 static_assert(sizeof(EventUnlock) == 8, "bad EventUnlock size");
 
 // Time change event.
 struct EventTime {
+  static constexpr uptr kUnusedBits = 37;
+  static_assert(kUnusedBits + sizeof(Sid) * kByteBits + kEpochBits + 5 == 64,
+                "unused bits in EventTime");
+
   u64 is_access : 1;   // = 0
   u64 is_func : 1;     // = 0
   EventType type : 3;  // = EventType::kTime
   u64 sid : sizeof(Sid) * kByteBits;
   u64 epoch : kEpochBits;
-  u64 _ : 64 - 5 - sizeof(Sid) * kByteBits - kEpochBits;
+  u64 _ : kUnusedBits;
 };
 static_assert(sizeof(EventTime) == 8, "bad EventTime size");
 
@@ -188,10 +166,12 @@ struct Trace;
 struct TraceHeader {
   Trace* trace = nullptr;  // back-pointer to Trace containing this part
   INode trace_parts;       // in Trace::parts
+  INode global;            // in Context::trace_part_recycle
 };
 
 struct TracePart : TraceHeader {
-  static constexpr uptr kByteSize = 256 << 10;
+  // There are a lot of goroutines in Go, so we use smaller parts.
+  static constexpr uptr kByteSize = (SANITIZER_GO ? 128 : 256) << 10;
   static constexpr uptr kSize =
       (kByteSize - sizeof(TraceHeader)) / sizeof(Event);
   // TraceAcquire does a fast event pointer overflow check by comparing
@@ -209,13 +189,26 @@ static_assert(sizeof(TracePart) == TracePart::kByteSize, "bad TracePart size");
 struct Trace {
   Mutex mtx;
   IList<TraceHeader, &TraceHeader::trace_parts, TracePart> parts;
-  Event* final_pos =
-      nullptr;  // final position in the last part for finished threads
+  // First node not queued into ctx->trace_part_recycle.
+  TracePart* local_head;
+  // Final position in the last part for finished threads.
+  Event* final_pos = nullptr;
+  // Number of trace parts allocated on behalf of this trace specifically.
+  // Total number of parts in this trace can be larger if we retake some
+  // parts from other traces.
+  uptr parts_allocated = 0;
 
   Trace() : mtx(MutexTypeTrace) {}
-};
 
-}  // namespace v3
+  // We need at least 3 parts per thread, because we want to keep at least
+  // 2 parts per thread that are not queued into ctx->trace_part_recycle
+  // (the current one being filled and one full part that ensures that
+  // we always have at least one part worth of previous memory accesses).
+  static constexpr uptr kMinParts = 3;
+
+  static constexpr uptr kFinishedThreadLo = 16;
+  static constexpr uptr kFinishedThreadHi = 64;
+};
 
 }  // namespace __tsan
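
Per the note in EventAccessExt, the event structs above now spell out their
unused bits and assert that all widths sum to 64. A standalone sketch of that
pattern with an invented layout (not the tsan events):

#include <cstdint>

struct ToyEvent {
  static constexpr uint64_t kAddrBits = 44;
  static constexpr uint64_t kUnusedBits = 64 - 1 - 3 - 2 - kAddrBits;
  static_assert(1 + 3 + 2 + kUnusedBits + kAddrBits == 64,
                "unused bits in ToyEvent");

  uint64_t is_access : 1;
  uint64_t type : 3;
  uint64_t size_log : 2;
  uint64_t _ : kUnusedBits;  // named and written as 0 on purpose
  uint64_t addr : kAddrBits;
};
static_assert(sizeof(ToyEvent) == 8, "bad ToyEvent size");

// Initializing every field, including the unused one, lets the compiler fold
// the constant parts into a single store instead of a read-modify-write of
// the old event word.
void write_event(ToyEvent *slot, uint64_t addr) {
  *slot = ToyEvent{1, 2, 3, 0, addr & ((uint64_t(1) << ToyEvent::kAddrBits) - 1)};
}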
 
diff --git a/libsanitizer/tsan/tsan_update_shadow_word.inc b/libsanitizer/tsan/tsan_update_shadow_word.inc
deleted file mode 100644
index a58ef0f17ef..00000000000
--- a/libsanitizer/tsan/tsan_update_shadow_word.inc
+++ /dev/null
@@ -1,59 +0,0 @@
-//===-- tsan_update_shadow_word.inc -----------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of ThreadSanitizer (TSan), a race detector.
-//
-// Body of the hottest inner loop.
-// If we wrap this body into a function, compilers (both gcc and clang)
-// produce sligtly less efficient code.
-//===----------------------------------------------------------------------===//
-do {
-  const unsigned kAccessSize = 1 << kAccessSizeLog;
-  u64 *sp = &shadow_mem[idx];
-  old = LoadShadow(sp);
-  if (LIKELY(old.IsZero())) {
-    if (!stored) {
-      StoreIfNotYetStored(sp, &store_word);
-      stored = true;
-    }
-    break;
-  }
-  // is the memory access equal to the previous?
-  if (LIKELY(Shadow::Addr0AndSizeAreEqual(cur, old))) {
-    // same thread?
-    if (LIKELY(Shadow::TidsAreEqual(old, cur))) {
-      if (LIKELY(old.IsRWWeakerOrEqual(kAccessIsWrite, kIsAtomic))) {
-        StoreIfNotYetStored(sp, &store_word);
-        stored = true;
-      }
-      break;
-    }
-    if (HappensBefore(old, thr)) {
-      if (old.IsRWWeakerOrEqual(kAccessIsWrite, kIsAtomic)) {
-        StoreIfNotYetStored(sp, &store_word);
-        stored = true;
-      }
-      break;
-    }
-    if (LIKELY(old.IsBothReadsOrAtomic(kAccessIsWrite, kIsAtomic)))
-      break;
-    goto RACE;
-  }
-  // Do the memory access intersect?
-  if (Shadow::TwoRangesIntersect(old, cur, kAccessSize)) {
-    if (Shadow::TidsAreEqual(old, cur))
-      break;
-    if (old.IsBothReadsOrAtomic(kAccessIsWrite, kIsAtomic))
-      break;
-    if (LIKELY(HappensBefore(old, thr)))
-      break;
-    goto RACE;
-  }
-  // The accesses do not intersect.
-  break;
-} while (0);
diff --git a/libsanitizer/ubsan/ubsan_flags.cpp b/libsanitizer/ubsan/ubsan_flags.cpp
index 9a66bd37518..25cefd46ce2 100644
--- a/libsanitizer/ubsan/ubsan_flags.cpp
+++ b/libsanitizer/ubsan/ubsan_flags.cpp
@@ -50,7 +50,6 @@ void InitializeFlags() {
   {
     CommonFlags cf;
     cf.CopyFrom(*common_flags());
-    cf.print_summary = false;
     cf.external_symbolizer_path = GetFlag("UBSAN_SYMBOLIZER_PATH");
     OverrideCommonFlags(cf);
   }
diff --git a/libsanitizer/ubsan/ubsan_handlers.cpp b/libsanitizer/ubsan/ubsan_handlers.cpp
index 2184625aa6e..e201e6bba22 100644
--- a/libsanitizer/ubsan/ubsan_handlers.cpp
+++ b/libsanitizer/ubsan/ubsan_handlers.cpp
@@ -894,21 +894,6 @@ void __ubsan_handle_cfi_bad_type(CFICheckFailData *Data, ValueHandle Vtable,
 
 }  // namespace __ubsan
 
-void __ubsan::__ubsan_handle_cfi_bad_icall(CFIBadIcallData *CallData,
-                                           ValueHandle Function) {
-  GET_REPORT_OPTIONS(false);
-  CFICheckFailData Data = {CFITCK_ICall, CallData->Loc, CallData->Type};
-  handleCFIBadIcall(&Data, Function, Opts);
-}
-
-void __ubsan::__ubsan_handle_cfi_bad_icall_abort(CFIBadIcallData *CallData,
-                                                 ValueHandle Function) {
-  GET_REPORT_OPTIONS(true);
-  CFICheckFailData Data = {CFITCK_ICall, CallData->Loc, CallData->Type};
-  handleCFIBadIcall(&Data, Function, Opts);
-  Die();
-}
-
 void __ubsan::__ubsan_handle_cfi_check_fail(CFICheckFailData *Data,
                                             ValueHandle Value,
                                             uptr ValidVtable) {
diff --git a/libsanitizer/ubsan/ubsan_handlers.h b/libsanitizer/ubsan/ubsan_handlers.h
index 9f412353fc0..219fb15de55 100644
--- a/libsanitizer/ubsan/ubsan_handlers.h
+++ b/libsanitizer/ubsan/ubsan_handlers.h
@@ -215,20 +215,12 @@ enum CFITypeCheckKind : unsigned char {
   CFITCK_VMFCall,
 };
 
-struct CFIBadIcallData {
-  SourceLocation Loc;
-  const TypeDescriptor &Type;
-};
-
 struct CFICheckFailData {
   CFITypeCheckKind CheckKind;
   SourceLocation Loc;
   const TypeDescriptor &Type;
 };
 
-/// \brief Handle control flow integrity failure for indirect function calls.
-RECOVERABLE(cfi_bad_icall, CFIBadIcallData *Data, ValueHandle Function)
-
 /// \brief Handle control flow integrity failures.
 RECOVERABLE(cfi_check_fail, CFICheckFailData *Data, ValueHandle Function,
             uptr VtableIsValid)
diff --git a/libsanitizer/ubsan/ubsan_platform.h b/libsanitizer/ubsan/ubsan_platform.h
index ad3e883f0f3..d2cc2e10bd2 100644
--- a/libsanitizer/ubsan/ubsan_platform.h
+++ b/libsanitizer/ubsan/ubsan_platform.h
@@ -12,7 +12,6 @@
 #ifndef UBSAN_PLATFORM_H
 #define UBSAN_PLATFORM_H
 
-#ifndef CAN_SANITIZE_UB
 // Other platforms should be easy to add, and probably work as-is.
 #if defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__) ||        \
     defined(__NetBSD__) || defined(__DragonFly__) ||                           \
@@ -22,6 +21,5 @@
 #else
 # define CAN_SANITIZE_UB 0
 #endif
-#endif //CAN_SANITIZE_UB
 
 #endif



This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).