public inbox for gcc-patches@gcc.gnu.org
* [PATCH v4] eliminate mutex in fast path of __register_frame
@ 2022-09-16 10:19 Thomas Neumann
From: Thomas Neumann @ 2022-09-16 10:19 UTC (permalink / raw)
  To: gcc-patches, Jason Merrill
  Cc: Florian Weimer, Jakub Jelinek, Jonathan Wakely, Thomas Rodgers

The __register_frame/__deregister_frame functions are used to register
unwinding frames from JITed code in a sorted list. That list itself
is protected by object_mutex, which leads to terrible performance
in multi-threaded code and is somewhat expensive even when single-threaded.
There was already a fast path that avoided taking the mutex if no
frame was registered at all.
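
As a minimal illustration (not part of the patch), this is roughly how a
JIT consumes these entry points. Only __register_frame and
__deregister_frame are real libgcc symbols; the helper names and the
origin of the .eh_frame buffer are made up:

extern void __register_frame (void *);
extern void __deregister_frame (void *);

/* EH_FRAME points to the in-memory .eh_frame data emitted for the JITed
   code.  It must stay valid until it is deregistered.  */
static void
jit_install_unwind_info (void *eh_frame)
{
  /* After this call the unwinder can find FDEs for the JITed code; with
     this patch the lookup goes through the lock-free btree instead of
     the mutex-protected list.  */
  __register_frame (eh_frame);
}

static void
jit_remove_unwind_info (void *eh_frame)
{
  /* Must happen before the JITed code and its unwind info are freed.  */
  __deregister_frame (eh_frame);
}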

This commit eliminates both the mutex and the sorted list from
the atomic fast path, and replaces them with a btree that uses
optimistic lock coupling during lookup. This allows for fully parallel
unwinding and is essential to scale exception handling to large
core counts.
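
For reference, here is a minimal sketch (not taken from the patch) of the
optimistic read protocol the btree lookup builds on: readers copy the data
they need and then validate an unchanged version word instead of taking a
lock. The struct and function names below are illustrative only:

#include <stdbool.h>
#include <stdint.h>

struct versioned_node
{
  uintptr_t version;  /* Bit 0: exclusively locked, upper bits: version.  */
  uintptr_t payload;
};

/* Return true and store the payload in *OUT if no writer interfered,
   false if the caller has to retry.  */
static bool
optimistic_read (const struct versioned_node *n, uintptr_t *out)
{
  uintptr_t v = __atomic_load_n (&n->version, __ATOMIC_ACQUIRE);
  if (v & 1)
    return false;  /* A writer holds the node exclusively.  */
  *out = __atomic_load_n (&n->payload, __ATOMIC_RELAXED);
  /* Keep the payload load from being reordered past the recheck, cf.
     Boehm, "Can Seqlocks Get Along with Programming Language Memory
     Models?".  */
  __atomic_thread_fence (__ATOMIC_ACQUIRE);
  return __atomic_load_n (&n->version, __ATOMIC_ACQUIRE) == v;
}

A caller simply retries until optimistic_read succeeds; a writer sets the
lock bit, mutates the node, and then clears the lock bits while bumping
the version, which is what version_lock_unlock_exclusive in the patch does.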

Changes since v3:
- Avoid code duplication by adding query mode to classify_object_over_fdes
- Adjust all comments as requested

libgcc/ChangeLog:

         * unwind-dw2-fde.c (release_registered_frames): Cleanup at shutdown.
         (__register_frame_info_table_bases): Use btree in atomic fast path.
         (__deregister_frame_info_bases): Likewise.
         (_Unwind_Find_FDE): Likewise.
         (base_from_object): Make parameter const.
         (classify_object_over_fdes): Add query-only mode.
         (get_pc_range): Compute PC range for lookup.
         * unwind-dw2-fde.h (last_fde): Make parameter const.
         * unwind-dw2-btree.h: New file.
---
  libgcc/unwind-dw2-btree.h | 953 ++++++++++++++++++++++++++++++++++++++
  libgcc/unwind-dw2-fde.c   | 195 ++++++--
  libgcc/unwind-dw2-fde.h   |   2 +-
  3 files changed, 1098 insertions(+), 52 deletions(-)
  create mode 100644 libgcc/unwind-dw2-btree.h

diff --git a/libgcc/unwind-dw2-btree.h b/libgcc/unwind-dw2-btree.h
new file mode 100644
index 00000000000..8853f0eab48
--- /dev/null
+++ b/libgcc/unwind-dw2-btree.h
@@ -0,0 +1,953 @@
+/* Lock-free btree for manually registered unwind frames.  */
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   Contributed by Thomas Neumann
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_UNWIND_DW2_BTREE_H
+#define GCC_UNWIND_DW2_BTREE_H
+
+#include <stdbool.h>
+
+// Common logic for version locks.
+struct version_lock
+{
+  // The lock itself. The lowest bit indicates an exclusive lock,
+  // the second bit indicates waiting threads. All other bits are
+  // used as a counter to recognize changes.
+  // Overflows are okay here; we must only prevent overflow to the
+  // same value within one lock_optimistic/validate
+  // range. Even on 32 bit platforms that would require 1 billion
+  // frame registrations within the time span of a few assembler
+  // instructions.
+  uintptr_t version_lock;
+};
+
+#ifdef __GTHREAD_HAS_COND
+// We should never get contention within the tree as it rarely changes.
+// But if we ever do get contention we use these for waiting.
+static __gthread_mutex_t version_lock_mutex = __GTHREAD_MUTEX_INIT;
+static __gthread_cond_t version_lock_cond = __GTHREAD_COND_INIT;
+#endif
+
+// Initialize in locked state.
+static inline void
+version_lock_initialize_locked_exclusive (struct version_lock *vl)
+{
+  vl->version_lock = 1;
+}
+
+// Try to lock the node exclusive.
+static inline bool
+version_lock_try_lock_exclusive (struct version_lock *vl)
+{
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  if (state & 1)
+    return false;
+  return __atomic_compare_exchange_n (&(vl->version_lock), &state, state | 1,
+				      false, __ATOMIC_SEQ_CST,
+				      __ATOMIC_SEQ_CST);
+}
+
+// Lock the node exclusive, blocking as needed.
+static void
+version_lock_lock_exclusive (struct version_lock *vl)
+{
+#ifndef __GTHREAD_HAS_COND
+restart:
+#endif
+
+  // We should virtually never get contention here, as frame
+  // changes are rare.
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  if (!(state & 1))
+    {
+      if (__atomic_compare_exchange_n (&(vl->version_lock), &state, state | 1,
+				       false, __ATOMIC_SEQ_CST,
+				       __ATOMIC_SEQ_CST))
+	return;
+    }
+
+    // We did get contention, wait properly.
+#ifdef __GTHREAD_HAS_COND
+  __gthread_mutex_lock (&version_lock_mutex);
+  state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  while (true)
+    {
+      // Check if the lock is still held.
+      if (!(state & 1))
+	{
+	  if (__atomic_compare_exchange_n (&(vl->version_lock), &state,
+					   state | 1, false, __ATOMIC_SEQ_CST,
+					   __ATOMIC_SEQ_CST))
+	    {
+	      __gthread_mutex_unlock (&version_lock_mutex);
+	      return;
+	    }
+	  else
+	    {
+	      continue;
+	    }
+	}
+
+      // Register waiting thread.
+      if (!(state & 2))
+	{
+	  if (!__atomic_compare_exchange_n (&(vl->version_lock), &state,
+					    state | 2, false, __ATOMIC_SEQ_CST,
+					    __ATOMIC_SEQ_CST))
+	    continue;
+	}
+
+      // And sleep.
+      __gthread_cond_wait (&version_lock_cond, &version_lock_mutex);
+      state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+    }
+#else
+  // Spin if we do not have condition variables available.
+  // We expect no contention here, spinning should be okay.
+  goto restart;
+#endif
+}
+
+// Release a locked node and increase the version lock.
+static void
+version_lock_unlock_exclusive (struct version_lock *vl)
+{
+  // increase version, reset exclusive lock bits
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  uintptr_t ns = (state + 4) & (~((uintptr_t) 3));
+  state = __atomic_exchange_n (&(vl->version_lock), ns, __ATOMIC_SEQ_CST);
+
+#ifdef __GTHREAD_HAS_COND
+  if (state & 2)
+    {
+      // Wake up waiting threads. This should be extremely rare.
+      __gthread_mutex_lock (&version_lock_mutex);
+      __gthread_cond_broadcast (&version_lock_cond);
+      __gthread_mutex_unlock (&version_lock_mutex);
+    }
+#endif
+}
+
+// Acquire an optimistic "lock". Note that this does not lock at all, it
+// only allows for validation later.
+static inline bool
+version_lock_lock_optimistic (const struct version_lock *vl, uintptr_t *lock)
+{
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  *lock = state;
+
+  // Acquiring the lock fails when there is currently an exclusive lock.
+  return !(state & 1);
+}
+
+// Validate a previously acquired "lock".
+static inline bool
+version_lock_validate (const struct version_lock *vl, uintptr_t lock)
+{
+  // Prevent the reordering of non-atomic loads behind the atomic load.
+  // Hans Boehm, Can Seqlocks Get Along with Programming Language Memory
+  // Models?, Section 4.
+  __atomic_thread_fence (__ATOMIC_ACQUIRE);
+
+  // Check that the node is still in the same state.
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  return (state == lock);
+}
+
+// The largest possible separator value.
+static const uintptr_t max_separator = ~((uintptr_t) (0));
+
+struct btree_node;
+
+// Inner entry. The child tree contains all entries <= separator.
+struct inner_entry
+{
+  uintptr_t separator;
+  struct btree_node *child;
+};
+
+// Leaf entry. Stores an object entry.
+struct leaf_entry
+{
+  uintptr_t base, size;
+  struct object *ob;
+};
+
+// Node types.
+enum node_type
+{
+  btree_node_inner,
+  btree_node_leaf,
+  btree_node_free
+};
+
+// Node sizes. Chosen such that the result size is roughly 256 bytes.
+#define max_fanout_inner 15
+#define max_fanout_leaf 10
+
+// A btree node.
+struct btree_node
+{
+  // The version lock used for optimistic lock coupling.
+  struct version_lock version_lock;
+  // The number of entries.
+  unsigned entry_count;
+  // The type.
+  enum node_type type;
+  // The payload.
+  union
+  {
+    // The inner nodes have fence keys, i.e., the right-most entry includes a
+    // separator.
+    struct inner_entry children[max_fanout_inner];
+    struct leaf_entry entries[max_fanout_leaf];
+  } content;
+};
+
+// Is an inner node?
+static inline bool
+btree_node_is_inner (const struct btree_node *n)
+{
+  return n->type == btree_node_inner;
+}
+
+// Is a leaf node?
+static inline bool
+btree_node_is_leaf (const struct btree_node *n)
+{
+  return n->type == btree_node_leaf;
+}
+
+// Should the node be merged?
+static inline bool
+btree_node_needs_merge (const struct btree_node *n)
+{
+  return n->entry_count < (btree_node_is_inner (n) ? (max_fanout_inner / 2)
+						   : (max_fanout_leaf / 2));
+}
+
+// Get the fence key for inner nodes.
+static inline uintptr_t
+btree_node_get_fence_key (const struct btree_node *n)
+{
+  // For inner nodes we just return our right-most entry.
+  return n->content.children[n->entry_count - 1].separator;
+}
+
+// Find the position for a slot in an inner node.
+static unsigned
+btree_node_find_inner_slot (const struct btree_node *n, uintptr_t value)
+{
+  for (unsigned index = 0, ec = n->entry_count; index != ec; ++index)
+    if (n->content.children[index].separator >= value)
+      return index;
+  return n->entry_count;
+}
+
+// Find the position for a slot in a leaf node.
+static unsigned
+btree_node_find_leaf_slot (const struct btree_node *n, uintptr_t value)
+{
+  for (unsigned index = 0, ec = n->entry_count; index != ec; ++index)
+    if (n->content.entries[index].base + n->content.entries[index].size > value)
+      return index;
+  return n->entry_count;
+}
+
+// Try to lock the node exclusive.
+static inline bool
+btree_node_try_lock_exclusive (struct btree_node *n)
+{
+  return version_lock_try_lock_exclusive (&(n->version_lock));
+}
+
+// Lock the node exclusive, blocking as needed.
+static inline void
+btree_node_lock_exclusive (struct btree_node *n)
+{
+  version_lock_lock_exclusive (&(n->version_lock));
+}
+
+// Release a locked node and increase the version lock.
+static inline void
+btree_node_unlock_exclusive (struct btree_node *n)
+{
+  version_lock_unlock_exclusive (&(n->version_lock));
+}
+
+// Acquire an optimistic "lock". Note that this does not lock at all, it
+// only allows for validation later.
+static inline bool
+btree_node_lock_optimistic (const struct btree_node *n, uintptr_t *lock)
+{
+  return version_lock_lock_optimistic (&(n->version_lock), lock);
+}
+
+// Validate a previously acquired lock.
+static inline bool
+btree_node_validate (const struct btree_node *n, uintptr_t lock)
+{
+  return version_lock_validate (&(n->version_lock), lock);
+}
+
+// Insert a new separator after splitting.
+static void
+btree_node_update_separator_after_split (struct btree_node *n,
+					 uintptr_t old_separator,
+					 uintptr_t new_separator,
+					 struct btree_node *new_right)
+{
+  unsigned slot = btree_node_find_inner_slot (n, old_separator);
+  for (unsigned index = n->entry_count; index > slot; --index)
+    n->content.children[index] = n->content.children[index - 1];
+  n->content.children[slot].separator = new_separator;
+  n->content.children[slot + 1].child = new_right;
+  n->entry_count++;
+}
+
+// A btree. Suitable for static initialization, all members are zero at the
+// beginning.
+struct btree
+{
+  // The root of the btree.
+  struct btree_node *root;
+  // The free list of released nodes.
+  struct btree_node *free_list;
+  // The version lock used to protect the root.
+  struct version_lock root_lock;
+};
+
+// Initialize a btree. Not actually used, just for exposition.
+static inline void
+btree_init (struct btree *t)
+{
+  t->root = NULL;
+  t->free_list = NULL;
+  t->root_lock.version_lock = 0;
+};
+
+static void
+btree_release_tree_recursively (struct btree *t, struct btree_node *n);
+
+// Destroy a tree and release all nodes.
+static void
+btree_destroy (struct btree *t)
+{
+  // Disable the mechanism before cleaning up.
+  struct btree_node *old_root
+    = __atomic_exchange_n (&(t->root), NULL, __ATOMIC_SEQ_CST);
+  if (old_root)
+    btree_release_tree_recursively (t, old_root);
+
+  // Release all free nodes.
+  while (t->free_list)
+    {
+      struct btree_node *next = t->free_list->content.children[0].child;
+      free (t->free_list);
+      t->free_list = next;
+    }
+}
+
+// Allocate a node. This node will be returned in locked exclusive state.
+static struct btree_node *
+btree_allocate_node (struct btree *t, bool inner)
+{
+  while (true)
+    {
+      // Try the free list first.
+      struct btree_node *next_free
+	= __atomic_load_n (&(t->free_list), __ATOMIC_SEQ_CST);
+      if (next_free)
+	{
+	  if (!btree_node_try_lock_exclusive (next_free))
+	    continue;
+	  // The node might no longer be free, check that again after acquiring
+	  // the exclusive lock.
+	  if (next_free->type == btree_node_free)
+	    {
+	      struct btree_node *ex = next_free;
+	      if (__atomic_compare_exchange_n (
+		    &(t->free_list), &ex, next_free->content.children[0].child,
+		    false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+		{
+		  next_free->entry_count = 0;
+		  next_free->type = inner ? btree_node_inner : btree_node_leaf;
+		  return next_free;
+		}
+	    }
+	  btree_node_unlock_exclusive (next_free);
+	  continue;
+	}
+
+      // No free node available, allocate a new one.
+      struct btree_node *new_node
+	= (struct btree_node *) (malloc (sizeof (struct btree_node)));
+      version_lock_initialize_locked_exclusive (
+	&(new_node->version_lock)); // initialize the node in locked state.
+      new_node->entry_count = 0;
+      new_node->type = inner ? btree_node_inner : btree_node_leaf;
+      return new_node;
+    }
+}
+
+// Release a node. This node must be currently locked exclusively and will
+// be placed in the free list.
+static void
+btree_release_node (struct btree *t, struct btree_node *node)
+{
+  // We cannot release the memory immediately because there might still be
+  // concurrent readers on that node. Put it in the free list instead.
+  node->type = btree_node_free;
+  struct btree_node *next_free
+    = __atomic_load_n (&(t->free_list), __ATOMIC_SEQ_CST);
+  do
+    {
+      node->content.children[0].child = next_free;
+  } while (!__atomic_compare_exchange_n (&(t->free_list), &next_free, node,
+					 false, __ATOMIC_SEQ_CST,
+					 __ATOMIC_SEQ_CST));
+  btree_node_unlock_exclusive (node);
+}
+
+// Recursively release a tree. The btree is by design very shallow, thus
+// we can risk recursion here.
+static void
+btree_release_tree_recursively (struct btree *t, struct btree_node *node)
+{
+  btree_node_lock_exclusive (node);
+  if (btree_node_is_inner (node))
+    {
+      for (unsigned index = 0; index < node->entry_count; ++index)
+	btree_release_tree_recursively (t, node->content.children[index].child);
+    }
+  btree_release_node (t, node);
+}
+
+// Check if we are splitting the root.
+static void
+btree_handle_root_split (struct btree *t, struct btree_node **node,
+			 struct btree_node **parent)
+{
+  // We want to keep the root pointer stable to allow for contention
+  // free reads. Thus, we split the root by first moving the content
+  // of the root node to a new node, and then split that new node.
+  if (!*parent)
+    {
+      // Allocate a new node, this guarantees us that we will have a parent
+      // afterwards.
+      struct btree_node *new_node
+	= btree_allocate_node (t, btree_node_is_inner (*node));
+      struct btree_node *old_node = *node;
+      new_node->entry_count = old_node->entry_count;
+      new_node->content = old_node->content;
+      old_node->content.children[0].separator = max_separator;
+      old_node->content.children[0].child = new_node;
+      old_node->entry_count = 1;
+      old_node->type = btree_node_inner;
+
+      *parent = old_node;
+      *node = new_node;
+    }
+}
+
+// Split an inner node.
+static void
+btree_split_inner (struct btree *t, struct btree_node **inner,
+		   struct btree_node **parent, uintptr_t target)
+{
+  // Check for the root.
+  btree_handle_root_split (t, inner, parent);
+
+  // Create two inner nodes.
+  uintptr_t right_fence = btree_node_get_fence_key (*inner);
+  struct btree_node *left_inner = *inner;
+  struct btree_node *right_inner = btree_allocate_node (t, true);
+  unsigned split = left_inner->entry_count / 2;
+  right_inner->entry_count = left_inner->entry_count - split;
+  for (unsigned index = 0; index < right_inner->entry_count; ++index)
+    right_inner->content.children[index]
+      = left_inner->content.children[split + index];
+  left_inner->entry_count = split;
+  uintptr_t left_fence = btree_node_get_fence_key (left_inner);
+  btree_node_update_separator_after_split (*parent, right_fence, left_fence,
+					   right_inner);
+  if (target <= left_fence)
+    {
+      *inner = left_inner;
+      btree_node_unlock_exclusive (right_inner);
+    }
+  else
+    {
+      *inner = right_inner;
+      btree_node_unlock_exclusive (left_inner);
+    }
+}
+
+// Split a leaf node.
+static void
+btree_split_leaf (struct btree *t, struct btree_node **leaf,
+		  struct btree_node **parent, uintptr_t fence, uintptr_t target)
+{
+  // Check for the root.
+  btree_handle_root_split (t, leaf, parent);
+
+  // Create two leaf nodes.
+  uintptr_t right_fence = fence;
+  struct btree_node *left_leaf = *leaf;
+  struct btree_node *right_leaf = btree_allocate_node (t, false);
+  unsigned split = left_leaf->entry_count / 2;
+  right_leaf->entry_count = left_leaf->entry_count - split;
+  for (unsigned index = 0; index != right_leaf->entry_count; ++index)
+    right_leaf->content.entries[index]
+      = left_leaf->content.entries[split + index];
+  left_leaf->entry_count = split;
+  uintptr_t left_fence = right_leaf->content.entries[0].base - 1;
+  btree_node_update_separator_after_split (*parent, right_fence, left_fence,
+					   right_leaf);
+  if (target <= left_fence)
+    {
+      *leaf = left_leaf;
+      btree_node_unlock_exclusive (right_leaf);
+    }
+  else
+    {
+      *leaf = right_leaf;
+      btree_node_unlock_exclusive (left_leaf);
+    }
+}
+
+// Merge (or balance) child nodes.
+static struct btree_node *
+btree_merge_node (struct btree *t, unsigned child_slot,
+		  struct btree_node *parent, uintptr_t target)
+{
+  // Choose the emptiest neighbor and lock both. The target child is already
+  // locked.
+  unsigned left_slot;
+  struct btree_node *left_node, *right_node;
+  if ((child_slot == 0)
+      || (((child_slot + 1) < parent->entry_count)
+	  && (parent->content.children[child_slot + 1].child->entry_count
+	      < parent->content.children[child_slot - 1].child->entry_count)))
+    {
+      left_slot = child_slot;
+      left_node = parent->content.children[left_slot].child;
+      right_node = parent->content.children[left_slot + 1].child;
+      btree_node_lock_exclusive (right_node);
+    }
+  else
+    {
+      left_slot = child_slot - 1;
+      left_node = parent->content.children[left_slot].child;
+      right_node = parent->content.children[left_slot + 1].child;
+      btree_node_lock_exclusive (left_node);
+    }
+
+  // Can we merge both nodes into one node?
+  unsigned total_count = left_node->entry_count + right_node->entry_count;
+  unsigned max_count
+    = btree_node_is_inner (left_node) ? max_fanout_inner : max_fanout_leaf;
+  if (total_count <= max_count)
+    {
+      // Merge into the parent?
+      if (parent->entry_count == 2)
+	{
+	  // Merge children into parent. This can only happen at the root.
+	  if (btree_node_is_inner (left_node))
+	    {
+	      for (unsigned index = 0; index != left_node->entry_count; ++index)
+		parent->content.children[index]
+		  = left_node->content.children[index];
+	      for (unsigned index = 0; index != right_node->entry_count;
+		   ++index)
+		parent->content.children[index + left_node->entry_count]
+		  = right_node->content.children[index];
+	    }
+	  else
+	    {
+	      parent->type = btree_node_leaf;
+	      for (unsigned index = 0; index != left_node->entry_count; ++index)
+		parent->content.entries[index]
+		  = left_node->content.entries[index];
+	      for (unsigned index = 0; index != right_node->entry_count;
+		   ++index)
+		parent->content.entries[index + left_node->entry_count]
+		  = right_node->content.entries[index];
+	    }
+	  parent->entry_count = total_count;
+	  btree_release_node (t, left_node);
+	  btree_release_node (t, right_node);
+	  return parent;
+	}
+      else
+	{
+	  // Regular merge.
+	  if (btree_node_is_inner (left_node))
+	    {
+	      for (unsigned index = 0; index != right_node->entry_count;
+		   ++index)
+		left_node->content.children[left_node->entry_count++]
+		  = right_node->content.children[index];
+	    }
+	  else
+	    {
+	      for (unsigned index = 0; index != right_node->entry_count;
+		   ++index)
+		left_node->content.entries[left_node->entry_count++]
+		  = right_node->content.entries[index];
+	    }
+	  parent->content.children[left_slot].separator
+	    = parent->content.children[left_slot + 1].separator;
+	  for (unsigned index = left_slot + 1; index + 1 < parent->entry_count;
+	       ++index)
+	    parent->content.children[index]
+	      = parent->content.children[index + 1];
+	  parent->entry_count--;
+	  btree_release_node (t, right_node);
+	  btree_node_unlock_exclusive (parent);
+	  return left_node;
+	}
+    }
+
+  // No merge possible, rebalance instead.
+  if (left_node->entry_count > right_node->entry_count)
+    {
+      // Shift from left to right.
+      unsigned to_shift
+	= (left_node->entry_count - right_node->entry_count) / 2;
+      if (btree_node_is_inner (left_node))
+	{
+	  for (unsigned index = 0; index != right_node->entry_count; ++index)
+	    {
+	      unsigned pos = right_node->entry_count - 1 - index;
+	      right_node->content.children[pos + to_shift]
+		= right_node->content.children[pos];
+	    }
+	  for (unsigned index = 0; index != to_shift; ++index)
+	    right_node->content.children[index]
+	      = left_node->content
+		  .children[left_node->entry_count - to_shift + index];
+	}
+      else
+	{
+	  for (unsigned index = 0; index != right_node->entry_count; ++index)
+	    {
+	      unsigned pos = right_node->entry_count - 1 - index;
+	      right_node->content.entries[pos + to_shift]
+		= right_node->content.entries[pos];
+	    }
+	  for (unsigned index = 0; index != to_shift; ++index)
+	    right_node->content.entries[index]
+	      = left_node->content
+		  .entries[left_node->entry_count - to_shift + index];
+	}
+      left_node->entry_count -= to_shift;
+      right_node->entry_count += to_shift;
+    }
+  else
+    {
+      // Shift from right to left.
+      unsigned to_shift
+	= (right_node->entry_count - left_node->entry_count) / 2;
+      if (btree_node_is_inner (left_node))
+	{
+	  for (unsigned index = 0; index != to_shift; ++index)
+	    left_node->content.children[left_node->entry_count + index]
+	      = right_node->content.children[index];
+	  for (unsigned index = 0; index != right_node->entry_count - to_shift;
+	       ++index)
+	    right_node->content.children[index]
+	      = right_node->content.children[index + to_shift];
+	}
+      else
+	{
+	  for (unsigned index = 0; index != to_shift; ++index)
+	    left_node->content.entries[left_node->entry_count + index]
+	      = right_node->content.entries[index];
+	  for (unsigned index = 0; index != right_node->entry_count - to_shift;
+	       ++index)
+	    right_node->content.entries[index]
+	      = right_node->content.entries[index + to_shift];
+	}
+      left_node->entry_count += to_shift;
+      right_node->entry_count -= to_shift;
+    }
+  uintptr_t left_fence;
+  if (btree_node_is_leaf (left_node))
+    {
+      left_fence = right_node->content.entries[0].base - 1;
+    }
+  else
+    {
+      left_fence = btree_node_get_fence_key (left_node);
+    }
+  parent->content.children[left_slot].separator = left_fence;
+  btree_node_unlock_exclusive (parent);
+  if (target <= left_fence)
+    {
+      btree_node_unlock_exclusive (right_node);
+      return left_node;
+    }
+  else
+    {
+      btree_node_unlock_exclusive (left_node);
+      return right_node;
+    }
+}
+
+// Insert an entry.
+static bool
+btree_insert (struct btree *t, uintptr_t base, uintptr_t size,
+	      struct object *ob)
+{
+  // Sanity check.
+  if (!size)
+    return false;
+
+  // Access the root.
+  struct btree_node *iter, *parent = NULL;
+  {
+    version_lock_lock_exclusive (&(t->root_lock));
+    iter = t->root;
+    if (iter)
+      {
+	btree_node_lock_exclusive (iter);
+      }
+    else
+      {
+	t->root = iter = btree_allocate_node (t, false);
+      }
+    version_lock_unlock_exclusive (&(t->root_lock));
+  }
+
+  // Walk down the btree with classic lock coupling and eager splits.
+  // Strictly speaking this is not optimal for performance; we could use
+  // optimistic lock coupling until we hit a node that has to be modified.
+  // But that is more difficult to implement and frame registration is
+  // rare anyway, so we use simple locking for now.
+
+  uintptr_t fence = max_separator;
+  while (btree_node_is_inner (iter))
+    {
+      // Use eager splits to avoid lock coupling up.
+      if (iter->entry_count == max_fanout_inner)
+	btree_split_inner (t, &iter, &parent, base);
+
+      unsigned slot = btree_node_find_inner_slot (iter, base);
+      if (parent)
+	btree_node_unlock_exclusive (parent);
+      parent = iter;
+      fence = iter->content.children[slot].separator;
+      iter = iter->content.children[slot].child;
+      btree_node_lock_exclusive (iter);
+    }
+
+  // Make sure we have space.
+  if (iter->entry_count == max_fanout_leaf)
+    btree_split_leaf (t, &iter, &parent, fence, base);
+  if (parent)
+    btree_node_unlock_exclusive (parent);
+
+  // Insert in node.
+  unsigned slot = btree_node_find_leaf_slot (iter, base);
+  if ((slot < iter->entry_count) && (iter->content.entries[slot].base == base))
+    {
+      // Duplicate entry, this should never happen.
+      btree_node_unlock_exclusive (iter);
+      return false;
+    }
+  for (unsigned index = iter->entry_count; index > slot; --index)
+    iter->content.entries[index] = iter->content.entries[index - 1];
+  struct leaf_entry *e = &(iter->content.entries[slot]);
+  e->base = base;
+  e->size = size;
+  e->ob = ob;
+  iter->entry_count++;
+  btree_node_unlock_exclusive (iter);
+  return true;
+}
+
+// Remove an entry.
+static struct object *
+btree_remove (struct btree *t, uintptr_t base)
+{
+  // Access the root.
+  version_lock_lock_exclusive (&(t->root_lock));
+  struct btree_node *iter = t->root;
+  if (iter)
+    btree_node_lock_exclusive (iter);
+  version_lock_unlock_exclusive (&(t->root_lock));
+  if (!iter)
+    return NULL;
+
+  // Same strategy as with insert, walk down with lock coupling and
+  // merge eagerly.
+  while (btree_node_is_inner (iter))
+    {
+      unsigned slot = btree_node_find_inner_slot (iter, base);
+      struct btree_node *next = iter->content.children[slot].child;
+      btree_node_lock_exclusive (next);
+      if (btree_node_needs_merge (next))
+	{
+	  // Use eager merges to avoid lock coupling up.
+	  iter = btree_merge_node (t, slot, iter, base);
+	}
+      else
+	{
+	  btree_node_unlock_exclusive (iter);
+	  iter = next;
+	}
+    }
+
+  // Remove existing entry.
+  unsigned slot = btree_node_find_leaf_slot (iter, base);
+  if ((slot >= iter->entry_count) || (iter->content.entries[slot].base != base))
+    {
+      // Not found, this should never happen.
+      btree_node_unlock_exclusive (iter);
+      return NULL;
+    }
+  struct object *ob = iter->content.entries[slot].ob;
+  for (unsigned index = slot; index + 1 < iter->entry_count; ++index)
+    iter->content.entries[index] = iter->content.entries[index + 1];
+  iter->entry_count--;
+  btree_node_unlock_exclusive (iter);
+  return ob;
+}
+
+// Find the corresponding entry for the given address.
+static struct object *
+btree_lookup (const struct btree *t, uintptr_t target_addr)
+{
+  // Within this function many loads are relaxed atomic loads.
+  // Use a macro to keep the code reasonable.
+#define RLOAD(x) __atomic_load_n (&(x), __ATOMIC_RELAXED)
+
+  // For targets where unwind info is usually not registered through these
+  // APIs anymore, avoid any sequentially consistent atomics.
+  // Use relaxed MO here, it is up to the app to ensure that the library
+  // loading/initialization happens-before using that library in other
+  // threads (in particular unwinding with that library's functions
+  // appearing in the backtraces).  Calling that library's functions
+  // without waiting for the library to initialize would be racy.
+  if (__builtin_expect (!RLOAD (t->root), 1))
+    return NULL;
+
+  // The unwinding tables are mostly static, they only change when
+  // frames are added or removed. This makes it extremely unlikely that they
+  // change during a given unwinding sequence. Thus, we optimize for the
+  // contention free case and use optimistic lock coupling. This does not
+  // require any writes to shared state, instead we validate every read. It is
+  // important that we do not trust any value that we have read until we call
+  // validate again. Data can change at arbitrary points in time, thus we always
+  // copy something into a local variable and validate again before acting on
+  // the read. In the unlikely event that we encounter a concurrent change we
+  // simply restart and try again.
+
+restart:
+  struct btree_node *iter;
+  uintptr_t lock;
+  {
+    // Accessing the root node requires defending against concurrent pointer
+    // changes. Thus we couple root_lock -> lock on root node -> validate root_lock.
+    if (!version_lock_lock_optimistic (&(t->root_lock), &lock))
+      goto restart;
+    iter = RLOAD (t->root);
+    if (!version_lock_validate (&(t->root_lock), lock))
+      goto restart;
+    if (!iter)
+      return NULL;
+    uintptr_t child_lock;
+    if ((!btree_node_lock_optimistic (iter, &child_lock))
+	|| (!version_lock_validate (&(t->root_lock), lock)))
+      goto restart;
+    lock = child_lock;
+  }
+
+  // Now we can walk down towards the right leaf node.
+  while (true)
+    {
+      enum node_type type = RLOAD (iter->type);
+      unsigned entry_count = RLOAD (iter->entry_count);
+      if (!btree_node_validate (iter, lock))
+	goto restart;
+      if (!entry_count)
+	return NULL;
+
+      if (type == btree_node_inner)
+	{
+	  // We cannot call find_inner_slot here because we need (relaxed)
+	  // atomic reads here.
+	  unsigned slot = 0;
+	  while (
+	    ((slot + 1) < entry_count)
+	    && (RLOAD (iter->content.children[slot].separator) < target_addr))
+	    ++slot;
+	  struct btree_node *child = RLOAD (iter->content.children[slot].child);
+	  if (!btree_node_validate (iter, lock))
+	    goto restart;
+
+	  // The node content can change at any point in time, thus we must
+	  // interleave parent and child checks.
+	  uintptr_t child_lock;
+	  if (!btree_node_lock_optimistic (child, &child_lock))
+	    goto restart;
+	  if (!btree_node_validate (iter, lock))
+	    goto restart; // make sure we still point to the correct node after
+			  // acquiring the optimistic lock.
+
+	  // Go down
+	  iter = child;
+	  lock = child_lock;
+	}
+      else
+	{
+	  // We cannot call find_leaf_slot here because we need (relaxed)
+	  // atomic reads here.
+	  unsigned slot = 0;
+	  while (((slot + 1) < entry_count)
+		 && (RLOAD (iter->content.entries[slot].base)
+		       + RLOAD (iter->content.entries[slot].size)
+		     <= target_addr))
+	    ++slot;
+	  struct leaf_entry entry;
+	  entry.base = RLOAD (iter->content.entries[slot].base);
+	  entry.size = RLOAD (iter->content.entries[slot].size);
+	  entry.ob = RLOAD (iter->content.entries[slot].ob);
+	  if (!btree_node_validate (iter, lock))
+	    goto restart;
+
+	  // Check if we have a hit.
+	  if ((entry.base <= target_addr)
+	      && (target_addr < entry.base + entry.size))
+	    {
+	      return entry.ob;
+	    }
+	  return NULL;
+	}
+    }
+#undef RLOAD
+}
+
+#endif /* unwind-dw2-btree.h */
diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 8ee55be5675..1165be0c6df 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -42,15 +42,34 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  #endif
  #endif
  
+#ifdef ATOMIC_FDE_FAST_PATH
+#include "unwind-dw2-btree.h"
+
+static struct btree registered_frames;
+
+static void
+release_registered_frames (void) __attribute__ ((destructor (110)));
+static void
+release_registered_frames (void)
+{
+  /* Release the b-tree and all frames. Frame releases that happen later are
+   * silently ignored.  */
+  btree_destroy (&registered_frames);
+}
+
+static void
+get_pc_range (const struct object *ob, uintptr_t *range);
+static void
+init_object (struct object *ob);
+
+#else
+
  /* The unseen_objects list contains objects that have been registered
     but not yet categorized in any way.  The seen_objects list has had
     its pc_begin and count fields initialized at minimum, and is sorted
     by decreasing value of pc_begin.  */
  static struct object *unseen_objects;
  static struct object *seen_objects;
-#ifdef ATOMIC_FDE_FAST_PATH
-static int any_objects_registered;
-#endif
  
  #ifdef __GTHREAD_MUTEX_INIT
  static __gthread_mutex_t object_mutex = __GTHREAD_MUTEX_INIT;
@@ -78,6 +97,7 @@ init_object_mutex_once (void)
  static __gthread_mutex_t object_mutex;
  #endif
  #endif
+#endif
  
  /* Called from crtbegin.o to register the unwind info for an object.  */
  
@@ -99,23 +119,23 @@ __register_frame_info_bases (const void *begin, struct object *ob,
    ob->fde_end = NULL;
  #endif
  
+#ifdef ATOMIC_FDE_FAST_PATH
+  // Initialize eagerly to avoid locking later.
+  init_object (ob);
+
+  // And register the frame
+  uintptr_t range[2];
+  get_pc_range (ob, range);
+  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
+#else
    init_object_mutex_once ();
    __gthread_mutex_lock (&object_mutex);
  
    ob->next = unseen_objects;
    unseen_objects = ob;
-#ifdef ATOMIC_FDE_FAST_PATH
-  /* Set flag that at least one library has registered FDEs.
-     Use relaxed MO here, it is up to the app to ensure that the library
-     loading/initialization happens-before using that library in other
-     threads (in particular unwinding with that library's functions
-     appearing in the backtraces).  Calling that library's functions
-     without waiting for the library to initialize would be racy.  */
-  if (!any_objects_registered)
-    __atomic_store_n (&any_objects_registered, 1, __ATOMIC_RELAXED);
-#endif
  
    __gthread_mutex_unlock (&object_mutex);
+#endif
  }
  
  void
@@ -153,23 +173,23 @@ __register_frame_info_table_bases (void *begin, struct object *ob,
    ob->s.b.from_array = 1;
    ob->s.b.encoding = DW_EH_PE_omit;
  
+#ifdef ATOMIC_FDE_FAST_PATH
+  // Initialize eagerly to avoid locking later.
+  init_object (ob);
+
+  // And register the frame
+  uintptr_t range[2];
+  get_pc_range (ob, range);
+  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
+#else
    init_object_mutex_once ();
    __gthread_mutex_lock (&object_mutex);
  
    ob->next = unseen_objects;
    unseen_objects = ob;
-#ifdef ATOMIC_FDE_FAST_PATH
-  /* Set flag that at least one library has registered FDEs.
-     Use relaxed MO here, it is up to the app to ensure that the library
-     loading/initialization happens-before using that library in other
-     threads (in particular unwinding with that library's functions
-     appearing in the backtraces).  Calling that library's functions
-     without waiting for the library to initialize would be racy.  */
-  if (!any_objects_registered)
-    __atomic_store_n (&any_objects_registered, 1, __ATOMIC_RELAXED);
-#endif
  
    __gthread_mutex_unlock (&object_mutex);
+#endif
  }
  
  void
@@ -200,16 +220,33 @@ __register_frame_table (void *begin)
  void *
  __deregister_frame_info_bases (const void *begin)
  {
-  struct object **p;
    struct object *ob = 0;
  
    /* If .eh_frame is empty, we haven't registered.  */
    if ((const uword *) begin == 0 || *(const uword *) begin == 0)
      return ob;
  
+#ifdef ATOMIC_FDE_FAST_PATH
+  // Find the corresponding PC range
+  struct object lookupob;
+  lookupob.tbase = 0;
+  lookupob.dbase = 0;
+  lookupob.u.single = begin;
+  lookupob.s.i = 0;
+  lookupob.s.b.encoding = DW_EH_PE_omit;
+#ifdef DWARF2_OBJECT_END_PTR_EXTENSION
+  lookupob.fde_end = NULL;
+#endif
+  uintptr_t range[2];
+  get_pc_range (&lookupob, range);
+
+  // And remove
+  ob = btree_remove (&registered_frames, range[0]);
+#else
    init_object_mutex_once ();
    __gthread_mutex_lock (&object_mutex);
  
+  struct object **p;
    for (p = &unseen_objects; *p ; p = &(*p)->next)
      if ((*p)->u.single == begin)
        {
@@ -241,6 +278,8 @@ __deregister_frame_info_bases (const void *begin)
  
   out:
    __gthread_mutex_unlock (&object_mutex);
+#endif
+
    gcc_assert (ob);
    return (void *) ob;
  }
@@ -264,7 +303,7 @@ __deregister_frame (void *begin)
     instead of an _Unwind_Context.  */
  
  static _Unwind_Ptr
-base_from_object (unsigned char encoding, struct object *ob)
+base_from_object (unsigned char encoding, const struct object *ob)
  {
    if (encoding == DW_EH_PE_omit)
      return 0;
@@ -628,13 +667,17 @@ end_fde_sort (struct object *ob, struct fde_accumulator *accu, size_t count)
      }
  }
  
-\f
-/* Update encoding, mixed_encoding, and pc_begin for OB for the
-   fde array beginning at THIS_FDE.  Return the number of fdes
-   encountered along the way.  */
+/* Inspect the fde array beginning at THIS_FDE. This
+   function can be used either in query mode (RANGE is
+   not null, OB is const), or in update mode (RANGE is
+   null, OB is modified). In query mode the function computes
+   the range of PC values and stores it in RANGE. In
+   update mode it updates encoding, mixed_encoding, and pc_begin
+   for OB. Return the number of fdes encountered along the way. */
  
  static size_t
-classify_object_over_fdes (struct object *ob, const fde *this_fde)
+classify_object_over_fdes (struct object *ob, const fde *this_fde,
+			   uintptr_t *range)
  {
    const struct dwarf_cie *last_cie = 0;
    size_t count = 0;
@@ -644,7 +687,7 @@ classify_object_over_fdes (struct object *ob, const fde *this_fde)
    for (; ! last_fde (ob, this_fde); this_fde = next_fde (this_fde))
      {
        const struct dwarf_cie *this_cie;
-      _Unwind_Ptr mask, pc_begin;
+      _Unwind_Ptr mask, pc_begin, pc_range;
  
        /* Skip CIEs.  */
        if (this_fde->CIE_delta == 0)
@@ -660,14 +703,19 @@ classify_object_over_fdes (struct object *ob, const fde *this_fde)
  	  if (encoding == DW_EH_PE_omit)
  	    return -1;
  	  base = base_from_object (encoding, ob);
-	  if (ob->s.b.encoding == DW_EH_PE_omit)
-	    ob->s.b.encoding = encoding;
-	  else if (ob->s.b.encoding != encoding)
-	    ob->s.b.mixed_encoding = 1;
+	  if (!range)
+	    {
+	      if (ob->s.b.encoding == DW_EH_PE_omit)
+		ob->s.b.encoding = encoding;
+	      else if (ob->s.b.encoding != encoding)
+		ob->s.b.mixed_encoding = 1;
+	    }
  	}
  
-      read_encoded_value_with_base (encoding, base, this_fde->pc_begin,
-				    &pc_begin);
+      const unsigned char *p;
+      p = read_encoded_value_with_base (encoding, base, this_fde->pc_begin,
+					&pc_begin);
+      read_encoded_value_with_base (encoding & 0x0F, 0, p, &pc_range);
  
        /* Take care to ignore link-once functions that were removed.
  	 In these cases, the function address will be NULL, but if
@@ -683,8 +731,27 @@ classify_object_over_fdes (struct object *ob, const fde *this_fde)
  	continue;
  
        count += 1;
-      if ((void *) pc_begin < ob->pc_begin)
-	ob->pc_begin = (void *) pc_begin;
+      if (range)
+	{
+	  _Unwind_Ptr pc_end = pc_begin + pc_range;
+	  if ((!range[0]) && (!range[1]))
+	    {
+	      range[0] = pc_begin;
+	      range[1] = pc_end;
+	    }
+	  else
+	    {
+	      if (pc_begin < range[0])
+		range[0] = pc_begin;
+	      if (pc_end > range[1])
+		range[1] = pc_end;
+	    }
+	}
+      else
+	{
+	  if ((void *) pc_begin < ob->pc_begin)
+	    ob->pc_begin = (void *) pc_begin;
+	}
      }
  
    return count;
@@ -769,7 +836,7 @@ init_object (struct object* ob)
  	  fde **p = ob->u.array;
  	  for (count = 0; *p; ++p)
  	    {
-	      size_t cur_count = classify_object_over_fdes (ob, *p);
+	      size_t cur_count = classify_object_over_fdes (ob, *p, NULL);
  	      if (cur_count == (size_t) -1)
  		goto unhandled_fdes;
  	      count += cur_count;
@@ -777,7 +844,7 @@ init_object (struct object* ob)
  	}
        else
  	{
-	  count = classify_object_over_fdes (ob, ob->u.single);
+	  count = classify_object_over_fdes (ob, ob->u.single, NULL);
  	  if (count == (size_t) -1)
  	    {
  	      static const fde terminator;
@@ -821,6 +888,32 @@ init_object (struct object* ob)
    ob->s.b.sorted = 1;
  }
  
+#ifdef ATOMIC_FDE_FAST_PATH
+/* Get the PC range for lookup */
+static void
+get_pc_range (const struct object *ob, uintptr_t *range)
+{
+  // It is safe to cast to non-const object* here as
+  // classify_object_over_fdes does not modify ob in query mode.
+  struct object *ncob = (struct object *) (uintptr_t) ob;
+  range[0] = range[1] = 0;
+  if (ob->s.b.sorted)
+    {
+      classify_object_over_fdes (ncob, ob->u.sort->orig_data, range);
+    }
+  else if (ob->s.b.from_array)
+    {
+      fde **p = ob->u.array;
+      for (; *p; ++p)
+	classify_object_over_fdes (ncob, *p, range);
+    }
+  else
+    {
+      classify_object_over_fdes (ncob, ob->u.single, range);
+    }
+}
+#endif
+
  /* A linear search through a set of FDEs for the given PC.  This is
     used when there was insufficient memory to allocate and sort an
     array.  */
@@ -985,6 +1078,9 @@ binary_search_mixed_encoding_fdes (struct object *ob, void *pc)
  static const fde *
  search_object (struct object* ob, void *pc)
  {
+  /* The fast path initializes objects eagerly to avoid locking.
+   * On the slow path we initialize them now.  */
+#ifndef ATOMIC_FDE_FAST_PATH
    /* If the data hasn't been sorted, try to do this now.  We may have
       more memory available than last time we tried.  */
    if (! ob->s.b.sorted)
@@ -997,6 +1093,7 @@ search_object (struct object* ob, void *pc)
        if (pc < ob->pc_begin)
  	return NULL;
      }
+#endif
  
    if (ob->s.b.sorted)
      {
@@ -1033,17 +1130,12 @@ _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
    const fde *f = NULL;
  
  #ifdef ATOMIC_FDE_FAST_PATH
-  /* For targets where unwind info is usually not registered through these
-     APIs anymore, avoid taking a global lock.
-     Use relaxed MO here, it is up to the app to ensure that the library
-     loading/initialization happens-before using that library in other
-     threads (in particular unwinding with that library's functions
-     appearing in the backtraces).  Calling that library's functions
-     without waiting for the library to initialize would be racy.  */
-  if (__builtin_expect (!__atomic_load_n (&any_objects_registered,
-					  __ATOMIC_RELAXED), 1))
+  ob = btree_lookup (&registered_frames, (uintptr_t) pc);
+  if (!ob)
      return NULL;
-#endif
+
+  f = search_object (ob, pc);
+#else
  
    init_object_mutex_once ();
    __gthread_mutex_lock (&object_mutex);
@@ -1081,6 +1173,7 @@ _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
  
   fini:
    __gthread_mutex_unlock (&object_mutex);
+#endif
  
    if (f)
      {
diff --git a/libgcc/unwind-dw2-fde.h b/libgcc/unwind-dw2-fde.h
index 8a011c358b4..77c2caa4f5a 100644
--- a/libgcc/unwind-dw2-fde.h
+++ b/libgcc/unwind-dw2-fde.h
@@ -166,7 +166,7 @@ next_fde (const fde *f)
  extern const fde * _Unwind_Find_FDE (void *, struct dwarf_eh_bases *);
  
  static inline int
-last_fde (struct object *obj __attribute__ ((__unused__)), const fde *f)
+last_fde (const struct object *obj __attribute__ ((__unused__)), const fde *f)
  {
  #ifdef DWARF2_OBJECT_END_PTR_EXTENSION
    return f == (const fde *) obj->fde_end || f->length == 0;
-- 
2.34.1


* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
@ 2022-09-16 14:49 ` Jason Merrill
From: Jason Merrill @ 2022-09-16 14:49 UTC (permalink / raw)
  To: Thomas Neumann, gcc-patches
  Cc: Florian Weimer, Jakub Jelinek, Jonathan Wakely, Thomas Rodgers

On 9/16/22 06:19, Thomas Neumann wrote:
> The __register_frame/__deregister_frame functions are used to register
> unwinding frames from JITed code in a sorted list. That list itself
> is protected by object_mutex, which leads to terrible performance
> in multi-threaded code and is somewhat expensive even if single-threaded.
> There was already a fast-path that avoided taking the mutex if no
> frame was registered at all.
> 
> This commit eliminates both the mutex and the sorted list from
> the atomic fast path, and replaces it with a btree that uses
> optimistic lock coupling during lookup. This allows for fully parallel
> unwinding and is essential to scale exception handling to large
> core counts.
> 
> Changes since v3:
> - Avoid code duplication by adding query mode to classify_object_over_fdes
> - Adjust all comments as requested
> 
> libgcc/ChangeLog:
> 
>          * unwind-dw2-fde.c (release_registered_frames): Cleanup at 
> shutdown.
>          (__register_frame_info_table_bases): Use btree in atomic fast 
> path.
>          (__deregister_frame_info_bases): Likewise.
>          (_Unwind_Find_FDE): Likewise.
>          (base_from_object): Make parameter const.
>          (classify_object_over_fdes): Add query-only mode.
>          (get_pc_range): Compute PC range for lookup.
>          * unwind-dw2-fde.h (last_fde): Make parameter const.
>          * unwind-dw2-btree.h: New file.
> ---
>   libgcc/unwind-dw2-btree.h | 953 ++++++++++++++++++++++++++++++++++++++
>   libgcc/unwind-dw2-fde.c   | 195 ++++++--
>   libgcc/unwind-dw2-fde.h   |   2 +-
>   3 files changed, 1098 insertions(+), 52 deletions(-)
>   create mode 100644 libgcc/unwind-dw2-btree.h
> 
> diff --git a/libgcc/unwind-dw2-btree.h b/libgcc/unwind-dw2-btree.h
> new file mode 100644
> index 00000000000..8853f0eab48
> --- /dev/null
> +++ b/libgcc/unwind-dw2-btree.h
> @@ -0,0 +1,953 @@
> +/* Lock-free btree for manually registered unwind frames.  */
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +   Contributed by Thomas Neumann
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify it under
> +the terms of the GNU General Public License as published by the Free
> +Software Foundation; either version 3, or (at your option) any later
> +version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> +for more details.
> +
> +Under Section 7 of GPL version 3, you are granted additional
> +permissions described in the GCC Runtime Library Exception, version
> +3.1, as published by the Free Software Foundation.
> +
> +You should have received a copy of the GNU General Public License and
> +a copy of the GCC Runtime Library Exception along with this program;
> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_UNWIND_DW2_BTREE_H
> +#define GCC_UNWIND_DW2_BTREE_H
> +
> +#include <stdbool.h>
> +
> +// Common logic for version locks.
> +struct version_lock
> +{
> +  // The lock itself. The lowest bit indicates an exclusive lock,
> +  // the second bit indicates waiting threads. All other bits are
> +  // used as counter to recognize changes.
> +  // Overflows are okay here, we must only prevent overflow to the
> +  // same value within one lock_optimistic/validate
> +  // range. Even on 32 bit platforms that would require 1 billion
> +  // frame registrations within the time span of a few assembler
> +  // instructions.
> +  uintptr_t version_lock;
> +};
> +
> +#ifdef __GTHREAD_HAS_COND
> +// We should never get contention within the tree as it rarely changes.
> +// But if we ever do get contention we use these for waiting.
> +static __gthread_mutex_t version_lock_mutex = __GTHREAD_MUTEX_INIT;
> +static __gthread_cond_t version_lock_cond = __GTHREAD_COND_INIT;
> +#endif
> +
> +// Initialize in locked state.
> +static inline void
> +version_lock_initialize_locked_exclusive (struct version_lock *vl)
> +{
> +  vl->version_lock = 1;
> +}
> +
> +// Try to lock the node exclusive.
> +static inline bool
> +version_lock_try_lock_exclusive (struct version_lock *vl)
> +{
> +  uintptr_t state = __atomic_load_n (&(vl->version_lock), 
> __ATOMIC_SEQ_CST);
> +  if (state & 1)
> +    return false;
> +  return __atomic_compare_exchange_n (&(vl->version_lock), &state, 
> state | 1,
> +                      false, __ATOMIC_SEQ_CST,
> +                      __ATOMIC_SEQ_CST);
> +}
> +
> +// Lock the node exclusive, blocking as needed.
> +static void
> +version_lock_lock_exclusive (struct version_lock *vl)
> +{
> +#ifndef __GTHREAD_HAS_COND
> +restart:
> +#endif
> +
> +  // We should virtually never get contention here, as frame
> +  // changes are rare.
> +  uintptr_t state = __atomic_load_n (&(vl->version_lock), 
> __ATOMIC_SEQ_CST);
> +  if (!(state & 1))
> +    {
> +      if (__atomic_compare_exchange_n (&(vl->version_lock), &state, 
> state | 1,
> +                       false, __ATOMIC_SEQ_CST,
> +                       __ATOMIC_SEQ_CST))
> +    return;
> +    }
> +
> +    // We did get contention, wait properly.
> +#ifdef __GTHREAD_HAS_COND
> +  __gthread_mutex_lock (&version_lock_mutex);
> +  state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
> +  while (true)
> +    {
> +      // Check if the lock is still held.
> +      if (!(state & 1))
> +    {
> +      if (__atomic_compare_exchange_n (&(vl->version_lock), &state,
> +                       state | 1, false, __ATOMIC_SEQ_CST,
> +                       __ATOMIC_SEQ_CST))
> +        {
> +          __gthread_mutex_unlock (&version_lock_mutex);
> +          return;
> +        }
> +      else
> +        {
> +          continue;
> +        }
> +    }
> +
> +      // Register waiting thread.
> +      if (!(state & 2))
> +    {
> +      if (!__atomic_compare_exchange_n (&(vl->version_lock), &state,
> +                        state | 2, false, __ATOMIC_SEQ_CST,
> +                        __ATOMIC_SEQ_CST))
> +        continue;
> +    }
> +
> +      // And sleep.
> +      __gthread_cond_wait (&version_lock_cond, &version_lock_mutex);
> +      state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
> +    }
> +#else
> +  // Spin if we do not have condition variables available.
> +  // We expect no contention here, spinning should be okay.
> +  goto restart;
> +#endif
> +}
> +
> +// Release a locked node and increase the version lock.
> +static void
> +version_lock_unlock_exclusive (struct version_lock *vl)
> +{
> +  // increase version, reset exclusive lock bits
> +  uintptr_t state = __atomic_load_n (&(vl->version_lock), 
> __ATOMIC_SEQ_CST);
> +  uintptr_t ns = (state + 4) & (~((uintptr_t) 3));
> +  state = __atomic_exchange_n (&(vl->version_lock), ns, __ATOMIC_SEQ_CST);
> +
> +#ifdef __GTHREAD_HAS_COND
> +  if (state & 2)
> +    {
> +      // Wake up waiting threads. This should be extremely rare.
> +      __gthread_mutex_lock (&version_lock_mutex);
> +      __gthread_cond_broadcast (&version_lock_cond);
> +      __gthread_mutex_unlock (&version_lock_mutex);
> +    }
> +#endif
> +}
> +
> +// Acquire an optimistic "lock". Note that this does not lock at all, it
> +// only allows for validation later.
> +static inline bool
> +version_lock_lock_optimistic (const struct version_lock *vl, uintptr_t 
> *lock)
> +{
> +  uintptr_t state = __atomic_load_n (&(vl->version_lock), 
> __ATOMIC_SEQ_CST);
> +  *lock = state;
> +
> +  // Acquiring the lock fails when there is currently an exclusive lock.
> +  return !(state & 1);
> +}
> +
> +// Validate a previously acquired "lock".
> +static inline bool
> +version_lock_validate (const struct version_lock *vl, uintptr_t lock)
> +{
> +  // Prevent the reordering of non-atomic loads behind the atomic load.
> +  // Hans Boehm, Can Seqlocks Get Along with Programming Language Memory
> +  // Models?, Section 4.
> +  __atomic_thread_fence (__ATOMIC_ACQUIRE);
> +
> +  // Check that the node is still in the same state.
> +  uintptr_t state = __atomic_load_n (&(vl->version_lock), 
> __ATOMIC_SEQ_CST);
> +  return (state == lock);
> +}
> +
> +// The largest possible separator value.
> +static const uintptr_t max_separator = ~((uintptr_t) (0));
> +
> +struct btree_node;
> +
> +// Inner entry. The child tree contains all entries <= separator.
> +struct inner_entry
> +{
> +  uintptr_t separator;
> +  struct btree_node *child;
> +};
> +
> +// Leaf entry. Stores an object entry.
> +struct leaf_entry
> +{
> +  uintptr_t base, size;
> +  struct object *ob;
> +};
> +
> +// Node types.
> +enum node_type
> +{
> +  btree_node_inner,
> +  btree_node_leaf,
> +  btree_node_free
> +};
> +
> +// Node sizes. Chosen such that the result size is roughly 256 bytes.
> +#define max_fanout_inner 15
> +#define max_fanout_leaf 10
> +
> +// A btree node.
> +struct btree_node
> +{
> +  // The version lock used for optimistic lock coupling.
> +  struct version_lock version_lock;
> +  // The number of entries.
> +  unsigned entry_count;
> +  // The type.
> +  enum node_type type;
> +  // The payload.
> +  union
> +  {
> +    // The inner nodes have fence keys, i.e., the right-most entry 
> includes a
> +    // separator.
> +    struct inner_entry children[max_fanout_inner];
> +    struct leaf_entry entries[max_fanout_leaf];
> +  } content;
> +};
> +
> +// Is an inner node?
> +static inline bool
> +btree_node_is_inner (const struct btree_node *n)
> +{
> +  return n->type == btree_node_inner;
> +}
> +
> +// Is a leaf node?
> +static inline bool
> +btree_node_is_leaf (const struct btree_node *n)
> +{
> +  return n->type == btree_node_leaf;
> +}
> +
> +// Should the node be merged?
> +static inline bool
> +btree_node_needs_merge (const struct btree_node *n)
> +{
> +  return n->entry_count < (btree_node_is_inner (n) ? (max_fanout_inner 
> / 2)
> +                           : (max_fanout_leaf / 2));
> +}
> +
> +// Get the fence key for inner nodes.
> +static inline uintptr_t
> +btree_node_get_fence_key (const struct btree_node *n)
> +{
> +  // For inner nodes we just return our right-most entry.
> +  return n->content.children[n->entry_count - 1].separator;
> +}
> +
> +// Find the position for a slot in an inner node.
> +static unsigned
> +btree_node_find_inner_slot (const struct btree_node *n, uintptr_t value)
> +{
> +  for (unsigned index = 0, ec = n->entry_count; index != ec; ++index)
> +    if (n->content.children[index].separator >= value)
> +      return index;
> +  return n->entry_count;
> +}
> +
> +// Find the position for a slot in a leaf node.
> +static unsigned
> +btree_node_find_leaf_slot (const struct btree_node *n, uintptr_t value)
> +{
> +  for (unsigned index = 0, ec = n->entry_count; index != ec; ++index)
> +    if (n->content.entries[index].base + n->content.entries[index].size 
>  > value)
> +      return index;
> +  return n->entry_count;
> +}
> +
> +// Try to lock the node exclusive.
> +static inline bool
> +btree_node_try_lock_exclusive (struct btree_node *n)
> +{
> +  return version_lock_try_lock_exclusive (&(n->version_lock));
> +}
> +
> +// Lock the node exclusive, blocking as needed.
> +static inline void
> +btree_node_lock_exclusive (struct btree_node *n)
> +{
> +  version_lock_lock_exclusive (&(n->version_lock));
> +}
> +
> +// Release a locked node and increase the version lock.
> +static inline void
> +btree_node_unlock_exclusive (struct btree_node *n)
> +{
> +  version_lock_unlock_exclusive (&(n->version_lock));
> +}
> +
> +// Acquire an optimistic "lock". Note that this does not lock at all, it
> +// only allows for validation later.
> +static inline bool
> +btree_node_lock_optimistic (const struct btree_node *n, uintptr_t *lock)
> +{
> +  return version_lock_lock_optimistic (&(n->version_lock), lock);
> +}
> +
> +// Validate a previously acquired lock.
> +static inline bool
> +btree_node_validate (const struct btree_node *n, uintptr_t lock)
> +{
> +  return version_lock_validate (&(n->version_lock), lock);
> +}
> +
> +// Insert a new separator after splitting.
> +static void
> +btree_node_update_separator_after_split (struct btree_node *n,
> +                     uintptr_t old_separator,
> +                     uintptr_t new_separator,
> +                     struct btree_node *new_right)
> +{
> +  unsigned slot = btree_node_find_inner_slot (n, old_separator);
> +  for (unsigned index = n->entry_count; index > slot; --index)
> +    n->content.children[index] = n->content.children[index - 1];
> +  n->content.children[slot].separator = new_separator;
> +  n->content.children[slot + 1].child = new_right;
> +  n->entry_count++;
> +}
> +
> +// A btree. Suitable for static initialization, all members are zero at the
> +// beginning.
> +struct btree
> +{
> +  // The root of the btree.
> +  struct btree_node *root;
> +  // The free list of released nodes.
> +  struct btree_node *free_list;
> +  // The version lock used to protect the root.
> +  struct version_lock root_lock;
> +};
> +
> +// Initialize a btree. Not actually used, just for exposition.
> +static inline void
> +btree_init (struct btree *t)
> +{
> +  t->root = NULL;
> +  t->free_list = NULL;
> +  t->root_lock.version_lock = 0;
> +};
> +
> +static void
> +btree_release_tree_recursively (struct btree *t, struct btree_node *n);
> +
> +// Destroy a tree and release all nodes.
> +static void
> +btree_destroy (struct btree *t)
> +{
> +  // Disable the mechanism before cleaning up.
> +  struct btree_node *old_root
> +    = __atomic_exchange_n (&(t->root), NULL, __ATOMIC_SEQ_CST);
> +  if (old_root)
> +    btree_release_tree_recursively (t, old_root);
> +
> +  // Release all free nodes.
> +  while (t->free_list)
> +    {
> +      struct btree_node *next = t->free_list->content.children[0].child;
> +      free (t->free_list);
> +      t->free_list = next;
> +    }
> +}
> +
> +// Allocate a node. This node will be returned in locked exclusive state.
> +static struct btree_node *
> +btree_allocate_node (struct btree *t, bool inner)
> +{
> +  while (true)
> +    {
> +      // Try the free list first.
> +      struct btree_node *next_free
> +    = __atomic_load_n (&(t->free_list), __ATOMIC_SEQ_CST);
> +      if (next_free)
> +    {
> +      if (!btree_node_try_lock_exclusive (next_free))
> +        continue;
> +      // The node might no longer be free, check that again after acquiring
> +      // the exclusive lock.
> +      if (next_free->type == btree_node_free)
> +        {
> +          struct btree_node *ex = next_free;
> +          if (__atomic_compare_exchange_n (
> +            &(t->free_list), &ex, next_free->content.children[0].child,
> +            false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
> +        {
> +          next_free->entry_count = 0;
> +          next_free->type = inner ? btree_node_inner : btree_node_leaf;
> +          return next_free;
> +        }
> +        }
> +      btree_node_unlock_exclusive (next_free);
> +      continue;
> +    }
> +
> +      // No free node available, allocate a new one.
> +      struct btree_node *new_node
> +    = (struct btree_node *) (malloc (sizeof (struct btree_node)));
> +      version_lock_initialize_locked_exclusive (
> +    &(new_node->version_lock)); // initialize the node in locked state.
> +      new_node->entry_count = 0;
> +      new_node->type = inner ? btree_node_inner : btree_node_leaf;
> +      return new_node;
> +    }
> +}
> +
> +// Release a node. This node must be currently locked exclusively and will
> +// be placed in the free list.
> +static void
> +btree_release_node (struct btree *t, struct btree_node *node)
> +{
> +  // We cannot release the memory immediately because there might still be
> +  // concurrent readers on that node. Put it in the free list instead.
> +  node->type = btree_node_free;
> +  struct btree_node *next_free
> +    = __atomic_load_n (&(t->free_list), __ATOMIC_SEQ_CST);
> +  do
> +    {
> +      node->content.children[0].child = next_free;
> +  } while (!__atomic_compare_exchange_n (&(t->free_list), &next_free, node,
> +                     false, __ATOMIC_SEQ_CST,
> +                     __ATOMIC_SEQ_CST));
> +  btree_node_unlock_exclusive (node);
> +}
> +
> +// Recursively release a tree. The btree is by design very shallow, thus
> +// we can risk recursion here.
> +static void
> +btree_release_tree_recursively (struct btree *t, struct btree_node *node)
> +{
> +  btree_node_lock_exclusive (node);
> +  if (btree_node_is_inner (node))
> +    {
> +      for (unsigned index = 0; index < node->entry_count; ++index)
> +    btree_release_tree_recursively (t, node->content.children[index].child);
> +    }
> +  btree_release_node (t, node);
> +}
> +
> +// Check if we are splitting the root.
> +static void
> +btree_handle_root_split (struct btree *t, struct btree_node **node,
> +             struct btree_node **parent)
> +{
> +  // We want to keep the root pointer stable to allow for contention
> +  // free reads. Thus, we split the root by first moving the content
> +  // of the root node to a new node, and then split that new node.
> +  if (!*parent)
> +    {
> +      // Allocate a new node, this guarantees us that we will have a parent
> +      // afterwards.
> +      struct btree_node *new_node
> +    = btree_allocate_node (t, btree_node_is_inner (*node));
> +      struct btree_node *old_node = *node;
> +      new_node->entry_count = old_node->entry_count;
> +      new_node->content = old_node->content;
> +      old_node->content.children[0].separator = max_separator;
> +      old_node->content.children[0].child = new_node;
> +      old_node->entry_count = 1;
> +      old_node->type = btree_node_inner;
> +
> +      *parent = old_node;
> +      *node = new_node;
> +    }
> +}
> +
> +// Split an inner node.
> +static void
> +btree_split_inner (struct btree *t, struct btree_node **inner,
> +           struct btree_node **parent, uintptr_t target)
> +{
> +  // Check for the root.
> +  btree_handle_root_split (t, inner, parent);
> +
> +  // Create two inner nodes.
> +  uintptr_t right_fence = btree_node_get_fence_key (*inner);
> +  struct btree_node *left_inner = *inner;
> +  struct btree_node *right_inner = btree_allocate_node (t, true);
> +  unsigned split = left_inner->entry_count / 2;
> +  right_inner->entry_count = left_inner->entry_count - split;
> +  for (unsigned index = 0; index < right_inner->entry_count; ++index)
> +    right_inner->content.children[index]
> +      = left_inner->content.children[split + index];
> +  left_inner->entry_count = split;
> +  uintptr_t left_fence = btree_node_get_fence_key (left_inner);
> +  btree_node_update_separator_after_split (*parent, right_fence, left_fence,
> +                       right_inner);
> +  if (target <= left_fence)
> +    {
> +      *inner = left_inner;
> +      btree_node_unlock_exclusive (right_inner);
> +    }
> +  else
> +    {
> +      *inner = right_inner;
> +      btree_node_unlock_exclusive (left_inner);
> +    }
> +}
> +
> +// Split a leaf node.
> +static void
> +btree_split_leaf (struct btree *t, struct btree_node **leaf,
> +          struct btree_node **parent, uintptr_t fence, uintptr_t target)
> +{
> +  // Check for the root.
> +  btree_handle_root_split (t, leaf, parent);
> +
> +  // Create two leaf nodes.
> +  uintptr_t right_fence = fence;
> +  struct btree_node *left_leaf = *leaf;
> +  struct btree_node *right_leaf = btree_allocate_node (t, false);
> +  unsigned split = left_leaf->entry_count / 2;
> +  right_leaf->entry_count = left_leaf->entry_count - split;
> +  for (unsigned index = 0; index != right_leaf->entry_count; ++index)
> +    right_leaf->content.entries[index]
> +      = left_leaf->content.entries[split + index];
> +  left_leaf->entry_count = split;
> +  uintptr_t left_fence = right_leaf->content.entries[0].base - 1;
> +  btree_node_update_separator_after_split (*parent, right_fence, left_fence,
> +                       right_leaf);
> +  if (target <= left_fence)
> +    {
> +      *leaf = left_leaf;
> +      btree_node_unlock_exclusive (right_leaf);
> +    }
> +  else
> +    {
> +      *leaf = right_leaf;
> +      btree_node_unlock_exclusive (left_leaf);
> +    }
> +}
> +
> +// Merge (or balance) child nodes.
> +static struct btree_node *
> +btree_merge_node (struct btree *t, unsigned child_slot,
> +          struct btree_node *parent, uintptr_t target)
> +{
> +  // Choose the emptiest neighbor and lock both. The target child is already
> +  // locked.
> +  unsigned left_slot;
> +  struct btree_node *left_node, *right_node;
> +  if ((child_slot == 0)
> +      || (((child_slot + 1) < parent->entry_count)
> +      && (parent->content.children[child_slot + 1].child->entry_count
> +          < parent->content.children[child_slot - 1].child->entry_count)))
> +    {
> +      left_slot = child_slot;
> +      left_node = parent->content.children[left_slot].child;
> +      right_node = parent->content.children[left_slot + 1].child;
> +      btree_node_lock_exclusive (right_node);
> +    }
> +  else
> +    {
> +      left_slot = child_slot - 1;
> +      left_node = parent->content.children[left_slot].child;
> +      right_node = parent->content.children[left_slot + 1].child;
> +      btree_node_lock_exclusive (left_node);
> +    }
> +
> +  // Can we merge both nodes into one node?
> +  unsigned total_count = left_node->entry_count + right_node->entry_count;
> +  unsigned max_count
> +    = btree_node_is_inner (left_node) ? max_fanout_inner : max_fanout_leaf;
> +  if (total_count <= max_count)
> +    {
> +      // Merge into the parent?
> +      if (parent->entry_count == 2)
> +    {
> +      // Merge children into parent. This can only happen at the root.
> +      if (btree_node_is_inner (left_node))
> +        {
> +          for (unsigned index = 0; index != left_node->entry_count; ++index)
> +        parent->content.children[index]
> +          = left_node->content.children[index];
> +          for (unsigned index = 0; index != right_node->entry_count;
> +           ++index)
> +        parent->content.children[index + left_node->entry_count]
> +          = right_node->content.children[index];
> +        }
> +      else
> +        {
> +          parent->type = btree_node_leaf;
> +          for (unsigned index = 0; index != left_node->entry_count; ++index)
> +        parent->content.entries[index]
> +          = left_node->content.entries[index];
> +          for (unsigned index = 0; index != right_node->entry_count;
> +           ++index)
> +        parent->content.entries[index + left_node->entry_count]
> +          = right_node->content.entries[index];
> +        }
> +      parent->entry_count = total_count;
> +      btree_release_node (t, left_node);
> +      btree_release_node (t, right_node);
> +      return parent;
> +    }
> +      else
> +    {
> +      // Regular merge.
> +      if (btree_node_is_inner (left_node))
> +        {
> +          for (unsigned index = 0; index != right_node->entry_count;
> +           ++index)
> +        left_node->content.children[left_node->entry_count++]
> +          = right_node->content.children[index];
> +        }
> +      else
> +        {
> +          for (unsigned index = 0; index != right_node->entry_count;
> +           ++index)
> +        left_node->content.entries[left_node->entry_count++]
> +          = right_node->content.entries[index];
> +        }
> +      parent->content.children[left_slot].separator
> +        = parent->content.children[left_slot + 1].separator;
> +      for (unsigned index = left_slot + 1; index + 1 < parent->entry_count;
> +           ++index)
> +        parent->content.children[index]
> +          = parent->content.children[index + 1];
> +      parent->entry_count--;
> +      btree_release_node (t, right_node);
> +      btree_node_unlock_exclusive (parent);
> +      return left_node;
> +    }
> +    }
> +
> +  // No merge possible, rebalance instead.
> +  if (left_node->entry_count > right_node->entry_count)
> +    {
> +      // Shift from left to right.
> +      unsigned to_shift
> +    = (left_node->entry_count - right_node->entry_count) / 2;
> +      if (btree_node_is_inner (left_node))
> +    {
> +      for (unsigned index = 0; index != right_node->entry_count; ++index)
> +        {
> +          unsigned pos = right_node->entry_count - 1 - index;
> +          right_node->content.children[pos + to_shift]
> +        = right_node->content.children[pos];
> +        }
> +      for (unsigned index = 0; index != to_shift; ++index)
> +        right_node->content.children[index]
> +          = left_node->content
> +          .children[left_node->entry_count - to_shift + index];
> +    }
> +      else
> +    {
> +      for (unsigned index = 0; index != right_node->entry_count; ++index)
> +        {
> +          unsigned pos = right_node->entry_count - 1 - index;
> +          right_node->content.entries[pos + to_shift]
> +        = right_node->content.entries[pos];
> +        }
> +      for (unsigned index = 0; index != to_shift; ++index)
> +        right_node->content.entries[index]
> +          = left_node->content
> +          .entries[left_node->entry_count - to_shift + index];
> +    }
> +      left_node->entry_count -= to_shift;
> +      right_node->entry_count += to_shift;
> +    }
> +  else
> +    {
> +      // Shift from right to left.
> +      unsigned to_shift
> +    = (right_node->entry_count - left_node->entry_count) / 2;
> +      if (btree_node_is_inner (left_node))
> +    {
> +      for (unsigned index = 0; index != to_shift; ++index)
> +        left_node->content.children[left_node->entry_count + index]
> +          = right_node->content.children[index];
> +      for (unsigned index = 0; index != right_node->entry_count - to_shift;
> +           ++index)
> +        right_node->content.children[index]
> +          = right_node->content.children[index + to_shift];
> +    }
> +      else
> +    {
> +      for (unsigned index = 0; index != to_shift; ++index)
> +        left_node->content.entries[left_node->entry_count + index]
> +          = right_node->content.entries[index];
> +      for (unsigned index = 0; index != right_node->entry_count - to_shift;
> +           ++index)
> +        right_node->content.entries[index]
> +          = right_node->content.entries[index + to_shift];
> +    }
> +      left_node->entry_count += to_shift;
> +      right_node->entry_count -= to_shift;
> +    }
> +  uintptr_t left_fence;
> +  if (btree_node_is_leaf (left_node))
> +    {
> +      left_fence = right_node->content.entries[0].base - 1;
> +    }
> +  else
> +    {
> +      left_fence = btree_node_get_fence_key (left_node);
> +    }
> +  parent->content.children[left_slot].separator = left_fence;
> +  btree_node_unlock_exclusive (parent);
> +  if (target <= left_fence)
> +    {
> +      btree_node_unlock_exclusive (right_node);
> +      return left_node;
> +    }
> +  else
> +    {
> +      btree_node_unlock_exclusive (left_node);
> +      return right_node;
> +    }
> +}
> +
> +// Insert an entry.
> +static bool
> +btree_insert (struct btree *t, uintptr_t base, uintptr_t size,
> +          struct object *ob)
> +{
> +  // Sanity check.
> +  if (!size)
> +    return false;
> +
> +  // Access the root.
> +  struct btree_node *iter, *parent = NULL;
> +  {
> +    version_lock_lock_exclusive (&(t->root_lock));
> +    iter = t->root;
> +    if (iter)
> +      {
> +    btree_node_lock_exclusive (iter);
> +      }
> +    else
> +      {
> +    t->root = iter = btree_allocate_node (t, false);
> +      }
> +    version_lock_unlock_exclusive (&(t->root_lock));
> +  }
> +
> +  // Walk down the btree with classic lock coupling and eager splits.
> +  // Strictly speaking this is not performance optimal, we could use
> +  // optimistic lock coupling until we hit a node that has to be modified.
> +  // But that is more difficult to implement and frame registration is
> +  // rare anyway, we use simple locking for now.
> +
> +  uintptr_t fence = max_separator;
> +  while (btree_node_is_inner (iter))
> +    {
> +      // Use eager splits to avoid lock coupling up.
> +      if (iter->entry_count == max_fanout_inner)
> +    btree_split_inner (t, &iter, &parent, base);
> +
> +      unsigned slot = btree_node_find_inner_slot (iter, base);
> +      if (parent)
> +    btree_node_unlock_exclusive (parent);
> +      parent = iter;
> +      fence = iter->content.children[slot].separator;
> +      iter = iter->content.children[slot].child;
> +      btree_node_lock_exclusive (iter);
> +    }
> +
> +  // Make sure we have space.
> +  if (iter->entry_count == max_fanout_leaf)
> +    btree_split_leaf (t, &iter, &parent, fence, base);
> +  if (parent)
> +    btree_node_unlock_exclusive (parent);
> +
> +  // Insert in node.
> +  unsigned slot = btree_node_find_leaf_slot (iter, base);
> +  if ((slot < iter->entry_count) && (iter->content.entries[slot].base == base))
> +    {
> +      // Duplicate entry, this should never happen.
> +      btree_node_unlock_exclusive (iter);
> +      return false;
> +    }
> +  for (unsigned index = iter->entry_count; index > slot; --index)
> +    iter->content.entries[index] = iter->content.entries[index - 1];
> +  struct leaf_entry *e = &(iter->content.entries[slot]);
> +  e->base = base;
> +  e->size = size;
> +  e->ob = ob;
> +  iter->entry_count++;
> +  btree_node_unlock_exclusive (iter);
> +  return true;
> +}
> +
> +// Remove an entry.
> +static struct object *
> +btree_remove (struct btree *t, uintptr_t base)
> +{
> +  // Access the root.
> +  version_lock_lock_exclusive (&(t->root_lock));
> +  struct btree_node *iter = t->root;
> +  if (iter)
> +    btree_node_lock_exclusive (iter);
> +  version_lock_unlock_exclusive (&(t->root_lock));
> +  if (!iter)
> +    return NULL;
> +
> +  // Same strategy as with insert, walk down with lock coupling and
> +  // merge eagerly.
> +  while (btree_node_is_inner (iter))
> +    {
> +      unsigned slot = btree_node_find_inner_slot (iter, base);
> +      struct btree_node *next = iter->content.children[slot].child;
> +      btree_node_lock_exclusive (next);
> +      if (btree_node_needs_merge (next))
> +    {
> +      // Use eager merges to avoid lock coupling up.
> +      iter = btree_merge_node (t, slot, iter, base);
> +    }
> +      else
> +    {
> +      btree_node_unlock_exclusive (iter);
> +      iter = next;
> +    }
> +    }
> +
> +  // Remove existing entry.
> +  unsigned slot = btree_node_find_leaf_slot (iter, base);
> +  if ((slot >= iter->entry_count) || (iter->content.entries[slot].base != base))
> +    {
> +      // Not found, this should never happen.
> +      btree_node_unlock_exclusive (iter);
> +      return NULL;
> +    }
> +  struct object *ob = iter->content.entries[slot].ob;
> +  for (unsigned index = slot; index + 1 < iter->entry_count; ++index)
> +    iter->content.entries[index] = iter->content.entries[index + 1];
> +  iter->entry_count--;
> +  btree_node_unlock_exclusive (iter);
> +  return ob;
> +}
> +
> +// Find the corresponding entry for the given address.
> +static struct object *
> +btree_lookup (const struct btree *t, uintptr_t target_addr)
> +{
> +  // Within this function many loads are relaxed atomic loads.
> +  // Use a macro to keep the code reasonable.
> +#define RLOAD(x) __atomic_load_n (&(x), __ATOMIC_RELAXED)
> +
> +  // For targets where unwind info is usually not registered through these
> +  // APIs anymore, avoid any sequential consistent atomics.
> +  // Use relaxed MO here, it is up to the app to ensure that the library
> +  // loading/initialization happens-before using that library in other
> +  // threads (in particular unwinding with that library's functions
> +  // appearing in the backtraces).  Calling that library's functions
> +  // without waiting for the library to initialize would be racy.
> +  if (__builtin_expect (!RLOAD (t->root), 1))
> +    return NULL;
> +
> +  // The unwinding tables are mostly static, they only change when
> +  // frames are added or removed. This makes it extremely unlikely that they
> +  // change during a given unwinding sequence. Thus, we optimize for the
> +  // contention free case and use optimistic lock coupling. This does not
> +  // require any writes to shared state, instead we validate every read. It is
> +  // important that we do not trust any value that we have read until we call
> +  // validate again. Data can change at arbitrary points in time, thus we always
> +  // copy something into a local variable and validate again before acting on
> +  // the read. In the unlikely event that we encounter a concurrent change we
> +  // simply restart and try again.
> +
> +restart:
> +  struct btree_node *iter;
> +  uintptr_t lock;
> +  {
> +    // Accessing the root node requires defending against concurrent pointer
> +    // changes. Thus we couple rootLock -> lock on root node -> validate rootLock.
> +    if (!version_lock_lock_optimistic (&(t->root_lock), &lock))
> +      goto restart;
> +    iter = RLOAD (t->root);
> +    if (!version_lock_validate (&(t->root_lock), lock))
> +      goto restart;
> +    if (!iter)
> +      return NULL;
> +    uintptr_t child_lock;
> +    if ((!btree_node_lock_optimistic (iter, &child_lock))
> +    || (!version_lock_validate (&(t->root_lock), lock)))
> +      goto restart;
> +    lock = child_lock;
> +  }
> +
> +  // Now we can walk down towards the right leaf node.
> +  while (true)
> +    {
> +      enum node_type type = RLOAD (iter->type);
> +      unsigned entry_count = RLOAD (iter->entry_count);
> +      if (!btree_node_validate (iter, lock))
> +    goto restart;
> +      if (!entry_count)
> +    return NULL;
> +
> +      if (type == btree_node_inner)
> +    {
> +      // We cannot call find_inner_slot here because we need (relaxed)
> +      // atomic reads here.
> +      unsigned slot = 0;
> +      while (
> +        ((slot + 1) < entry_count)
> +        && (RLOAD (iter->content.children[slot].separator) < target_addr))
> +        ++slot;
> +      struct btree_node *child = RLOAD (iter->content.children[slot].child);
> +      if (!btree_node_validate (iter, lock))
> +        goto restart;
> +
> +      // The node content can change at any point in time, thus we must
> +      // interleave parent and child checks.
> +      uintptr_t child_lock;
> +      if (!btree_node_lock_optimistic (child, &child_lock))
> +        goto restart;
> +      if (!btree_node_validate (iter, lock))
> +        goto restart; // make sure we still point to the correct node after
> +              // acquiring the optimistic lock.
> +
> +      // Go down
> +      iter = child;
> +      lock = child_lock;
> +    }
> +      else
> +    {
> +      // We cannot call find_leaf_slot here because we need (relaxed)
> +      // atomic reads here.
> +      unsigned slot = 0;
> +      while (((slot + 1) < entry_count)
> +         && (RLOAD (iter->content.entries[slot].base)
> +               + RLOAD (iter->content.entries[slot].size)
> +             <= target_addr))
> +        ++slot;
> +      struct leaf_entry entry;
> +      entry.base = RLOAD (iter->content.entries[slot].base);
> +      entry.size = RLOAD (iter->content.entries[slot].size);
> +      entry.ob = RLOAD (iter->content.entries[slot].ob);
> +      if (!btree_node_validate (iter, lock))
> +        goto restart;
> +
> +      // Check if we have a hit.
> +      if ((entry.base <= target_addr)
> +          && (target_addr < entry.base + entry.size))
> +        {
> +          return entry.ob;
> +        }
> +      return NULL;
> +    }
> +    }
> +#undef RLOAD
> +}
> +
> +#endif /* unwind-dw2-btree.h */
> diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
> index 8ee55be5675..1165be0c6df 100644
> --- a/libgcc/unwind-dw2-fde.c
> +++ b/libgcc/unwind-dw2-fde.c
> @@ -42,15 +42,34 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>   #endif
>   #endif
> 
> +#ifdef ATOMIC_FDE_FAST_PATH
> +#include "unwind-dw2-btree.h"
> +
> +static struct btree registered_frames;
> +
> +static void
> +release_registered_frames (void) __attribute__ ((destructor (110)));
> +static void
> +release_registered_frames (void)
> +{
> +  /* Release the b-tree and all frames. Frame releases that happen later are
> +   * silently ignored */
> +  btree_destroy (&registered_frames);
> +}
> +
> +static void
> +get_pc_range (const struct object *ob, uintptr_t *range);
> +static void
> +init_object (struct object *ob);
> +
> +#else
> +
>   /* The unseen_objects list contains objects that have been registered
>      but not yet categorized in any way.  The seen_objects list has had
>      its pc_begin and count fields initialized at minimum, and is sorted
>      by decreasing value of pc_begin.  */
>   static struct object *unseen_objects;
>   static struct object *seen_objects;
> -#ifdef ATOMIC_FDE_FAST_PATH
> -static int any_objects_registered;
> -#endif
> 
>   #ifdef __GTHREAD_MUTEX_INIT
>   static __gthread_mutex_t object_mutex = __GTHREAD_MUTEX_INIT;
> @@ -78,6 +97,7 @@ init_object_mutex_once (void)
>   static __gthread_mutex_t object_mutex;
>   #endif
>   #endif
> +#endif
> 
>   /* Called from crtbegin.o to register the unwind info for an object.  */
> 
> @@ -99,23 +119,23 @@ __register_frame_info_bases (const void *begin, struct object *ob,
>     ob->fde_end = NULL;
>   #endif
> 
> +#ifdef ATOMIC_FDE_FAST_PATH
> +  // Initialize eagerly to avoid locking later
> +  init_object (ob);
> +
> +  // And register the frame
> +  uintptr_t range[2];
> +  get_pc_range (ob, range);
> +  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
> +#else
>     init_object_mutex_once ();
>     __gthread_mutex_lock (&object_mutex);
> 
>     ob->next = unseen_objects;
>     unseen_objects = ob;
> -#ifdef ATOMIC_FDE_FAST_PATH
> -  /* Set flag that at least one library has registered FDEs.
> -     Use relaxed MO here, it is up to the app to ensure that the library
> -     loading/initialization happens-before using that library in other
> -     threads (in particular unwinding with that library's functions
> -     appearing in the backtraces).  Calling that library's functions
> -     without waiting for the library to initialize would be racy.  */
> -  if (!any_objects_registered)
> -    __atomic_store_n (&any_objects_registered, 1, __ATOMIC_RELAXED);
> -#endif
> 
>     __gthread_mutex_unlock (&object_mutex);
> +#endif
>   }
> 
>   void
> @@ -153,23 +173,23 @@ __register_frame_info_table_bases (void *begin, struct object *ob,
>     ob->s.b.from_array = 1;
>     ob->s.b.encoding = DW_EH_PE_omit;
> 
> +#ifdef ATOMIC_FDE_FAST_PATH
> +  // Initialize eagerly to avoid locking later
> +  init_object (ob);
> +
> +  // And register the frame
> +  uintptr_t range[2];
> +  get_pc_range (ob, range);
> +  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
> +#else
>     init_object_mutex_once ();
>     __gthread_mutex_lock (&object_mutex);
> 
>     ob->next = unseen_objects;
>     unseen_objects = ob;
> -#ifdef ATOMIC_FDE_FAST_PATH
> -  /* Set flag that at least one library has registered FDEs.
> -     Use relaxed MO here, it is up to the app to ensure that the library
> -     loading/initialization happens-before using that library in other
> -     threads (in particular unwinding with that library's functions
> -     appearing in the backtraces).  Calling that library's functions
> -     without waiting for the library to initialize would be racy.  */
> -  if (!any_objects_registered)
> -    __atomic_store_n (&any_objects_registered, 1, __ATOMIC_RELAXED);
> -#endif
> 
>     __gthread_mutex_unlock (&object_mutex);
> +#endif
>   }
> 
>   void
> @@ -200,16 +220,33 @@ __register_frame_table (void *begin)
>   void *
>   __deregister_frame_info_bases (const void *begin)
>   {
> -  struct object **p;
>     struct object *ob = 0;
> 
>     /* If .eh_frame is empty, we haven't registered.  */
>     if ((const uword *) begin == 0 || *(const uword *) begin == 0)
>       return ob;
> 
> +#ifdef ATOMIC_FDE_FAST_PATH
> +  // Find the corresponding PC range
> +  struct object lookupob;
> +  lookupob.tbase = 0;
> +  lookupob.dbase = 0;
> +  lookupob.u.single = begin;
> +  lookupob.s.i = 0;
> +  lookupob.s.b.encoding = DW_EH_PE_omit;
> +#ifdef DWARF2_OBJECT_END_PTR_EXTENSION
> +  lookupob.fde_end = NULL;
> +#endif
> +  uintptr_t range[2];
> +  get_pc_range (&lookupob, range);
> +
> +  // And remove
> +  ob = btree_remove (&registered_frames, range[0]);
> +#else
>     init_object_mutex_once ();
>     __gthread_mutex_lock (&object_mutex);
> 
> +  struct object **p;
>     for (p = &unseen_objects; *p ; p = &(*p)->next)
>       if ((*p)->u.single == begin)
>         {
> @@ -241,6 +278,8 @@ __deregister_frame_info_bases (const void *begin)
> 
>    out:
>     __gthread_mutex_unlock (&object_mutex);
> +#endif
> +
>     gcc_assert (ob);
>     return (void *) ob;
>   }
> @@ -264,7 +303,7 @@ __deregister_frame (void *begin)
>      instead of an _Unwind_Context.  */
> 
>   static _Unwind_Ptr
> -base_from_object (unsigned char encoding, struct object *ob)
> +base_from_object (unsigned char encoding, const struct object *ob)
>   {
>     if (encoding == DW_EH_PE_omit)
>       return 0;
> @@ -628,13 +667,17 @@ end_fde_sort (struct object *ob, struct fde_accumulator *accu, size_t count)
>       }
>   }
> 
> -\f
> -/* Update encoding, mixed_encoding, and pc_begin for OB for the
> -   fde array beginning at THIS_FDE.  Return the number of fdes
> -   encountered along the way.  */
> +/* Inspect the fde array beginning at this_fde. This
> +   function can be used either in query mode (RANGE is
> +   not null, OB is const), or in update mode (RANGE is
> +   null, OB is modified). In query mode the function computes
> +   the range of PC values and stores it in rANGE. In

s/rANGE/RANGE/

> +   update mode it updates encoding, mixed_encoding, and pc_begin
> +   for OB. Return the number of fdes encountered along the way. */
> 
>   static size_t
> -classify_object_over_fdes (struct object *ob, const fde *this_fde)
> +classify_object_over_fdes (struct object *ob, const fde *this_fde,
> +               uintptr_t *range)
>   {
>     const struct dwarf_cie *last_cie = 0;
>     size_t count = 0;
> @@ -644,7 +687,7 @@ classify_object_over_fdes (struct object *ob, const fde *this_fde)
>     for (; ! last_fde (ob, this_fde); this_fde = next_fde (this_fde))
>       {
>         const struct dwarf_cie *this_cie;
> -      _Unwind_Ptr mask, pc_begin;
> +      _Unwind_Ptr mask, pc_begin, pc_range;
> 
>         /* Skip CIEs.  */
>         if (this_fde->CIE_delta == 0)
> @@ -660,14 +703,19 @@ classify_object_over_fdes (struct object *ob, const fde *this_fde)
>         if (encoding == DW_EH_PE_omit)
>           return -1;
>         base = base_from_object (encoding, ob);
> -      if (ob->s.b.encoding == DW_EH_PE_omit)
> -        ob->s.b.encoding = encoding;
> -      else if (ob->s.b.encoding != encoding)
> -        ob->s.b.mixed_encoding = 1;
> +      if (!range)
> +        {
> +          if (ob->s.b.encoding == DW_EH_PE_omit)
> +        ob->s.b.encoding = encoding;
> +          else if (ob->s.b.encoding != encoding)
> +        ob->s.b.mixed_encoding = 1;
> +        }
>       }
> 
> -      read_encoded_value_with_base (encoding, base, this_fde->pc_begin,
> -                    &pc_begin);
> +      const unsigned char *p;
> +      p = read_encoded_value_with_base (encoding, base, this_fde->pc_begin,
> +                    &pc_begin);
> +      read_encoded_value_with_base (encoding & 0x0F, 0, p, &pc_range);

It looks like pc_range is only used in query mode, so how about moving 
this read and the declaration of pc_range...

>         /* Take care to ignore link-once functions that were removed.
>        In these cases, the function address will be NULL, but if
> @@ -683,8 +731,27 @@ classify_object_over_fdes (struct object *ob, const fde *this_fde)
>       continue;
> 
>         count += 1;
> -      if ((void *) pc_begin < ob->pc_begin)
> -    ob->pc_begin = (void *) pc_begin;
> +      if (range)
> +    {

...in here?  OK either way.  Thanks again, this is impressive work!
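
For concreteness, a rough sketch of that rearrangement (only a sketch of the
suggestion, reusing the names from this hunk; untested):

      if (range)
        {
          /* pc_range is only needed in query mode, so read it here.  */
          _Unwind_Ptr pc_range, pc_end;
          read_encoded_value_with_base (encoding & 0x0F, 0, p, &pc_range);
          pc_end = pc_begin + pc_range;
          if (!range[0] && !range[1])
            {
              range[0] = pc_begin;
              range[1] = pc_end;
            }
          else
            {
              if (pc_begin < range[0])
                range[0] = pc_begin;
              if (pc_end > range[1])
                range[1] = pc_end;
            }
        }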

> +      _Unwind_Ptr pc_end = pc_begin + pc_range;
> +      if ((!range[0]) && (!range[1]))
> +        {
> +          range[0] = pc_begin;
> +          range[1] = pc_end;
> +        }
> +      else
> +        {
> +          if (pc_begin < range[0])
> +        range[0] = pc_begin;
> +          if (pc_end > range[1])
> +        range[1] = pc_end;
> +        }
> +    }
> +      else
> +    {
> +      if ((void *) pc_begin < ob->pc_begin)
> +        ob->pc_begin = (void *) pc_begin;
> +    }
>       }
> 
>     return count;
> @@ -769,7 +836,7 @@ init_object (struct object* ob)
>         fde **p = ob->u.array;
>         for (count = 0; *p; ++p)
>           {
> -          size_t cur_count = classify_object_over_fdes (ob, *p);
> +          size_t cur_count = classify_object_over_fdes (ob, *p, NULL);
>             if (cur_count == (size_t) -1)
>           goto unhandled_fdes;
>             count += cur_count;
> @@ -777,7 +844,7 @@ init_object (struct object* ob)
>       }
>         else
>       {
> -      count = classify_object_over_fdes (ob, ob->u.single);
> +      count = classify_object_over_fdes (ob, ob->u.single, NULL);
>         if (count == (size_t) -1)
>           {
>             static const fde terminator;
> @@ -821,6 +888,32 @@ init_object (struct object* ob)
>     ob->s.b.sorted = 1;
>   }
> 
> +#ifdef ATOMIC_FDE_FAST_PATH
> +/* Get the PC range for lookup */
> +static void
> +get_pc_range (const struct object *ob, uintptr_t *range)
> +{
> +  // It is safe to cast to non-const object* here as
> +  // classify_object_over_fdes does not modify ob in query mode.
> +  struct object *ncob = (struct object *) (uintptr_t) ob;
> +  range[0] = range[1] = 0;
> +  if (ob->s.b.sorted)
> +    {
> +      classify_object_over_fdes (ncob, ob->u.sort->orig_data, range);
> +    }
> +  else if (ob->s.b.from_array)
> +    {
> +      fde **p = ob->u.array;
> +      for (; *p; ++p)
> +    classify_object_over_fdes (ncob, *p, range);
> +    }
> +  else
> +    {
> +      classify_object_over_fdes (ncob, ob->u.single, range);
> +    }
> +}
> +#endif
> +
>   /* A linear search through a set of FDEs for the given PC.  This is
>      used when there was insufficient memory to allocate and sort an
>      array.  */
> @@ -985,6 +1078,9 @@ binary_search_mixed_encoding_fdes (struct object *ob, void *pc)
>   static const fde *
>   search_object (struct object* ob, void *pc)
>   {
> +  /* The fast path initializes objects eagerly to avoid locking.
> +   * On the slow path we initialize them now */
> +#ifndef ATOMIC_FDE_FAST_PATH
>     /* If the data hasn't been sorted, try to do this now.  We may have
>        more memory available than last time we tried.  */
>     if (! ob->s.b.sorted)
> @@ -997,6 +1093,7 @@ search_object (struct object* ob, void *pc)
>         if (pc < ob->pc_begin)
>       return NULL;
>       }
> +#endif
> 
>     if (ob->s.b.sorted)
>       {
> @@ -1033,17 +1130,12 @@ _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
>     const fde *f = NULL;
> 
>   #ifdef ATOMIC_FDE_FAST_PATH
> -  /* For targets where unwind info is usually not registered through these
> -     APIs anymore, avoid taking a global lock.
> -     Use relaxed MO here, it is up to the app to ensure that the library
> -     loading/initialization happens-before using that library in other
> -     threads (in particular unwinding with that library's functions
> -     appearing in the backtraces).  Calling that library's functions
> -     without waiting for the library to initialize would be racy.  */
> -  if (__builtin_expect (!__atomic_load_n (&any_objects_registered,
> -                      __ATOMIC_RELAXED), 1))
> +  ob = btree_lookup (&registered_frames, (uintptr_t) pc);
> +  if (!ob)
>       return NULL;
> -#endif
> +
> +  f = search_object (ob, pc);
> +#else
> 
>     init_object_mutex_once ();
>     __gthread_mutex_lock (&object_mutex);
> @@ -1081,6 +1173,7 @@ _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
> 
>    fini:
>     __gthread_mutex_unlock (&object_mutex);
> +#endif
> 
>     if (f)
>       {
> diff --git a/libgcc/unwind-dw2-fde.h b/libgcc/unwind-dw2-fde.h
> index 8a011c358b4..77c2caa4f5a 100644
> --- a/libgcc/unwind-dw2-fde.h
> +++ b/libgcc/unwind-dw2-fde.h
> @@ -166,7 +166,7 @@ next_fde (const fde *f)
>   extern const fde * _Unwind_Find_FDE (void *, struct dwarf_eh_bases *);
> 
>   static inline int
> -last_fde (struct object *obj __attribute__ ((__unused__)), const fde *f)
> +last_fde (const struct object *obj __attribute__ ((__unused__)), const fde *f)
>   {
>   #ifdef DWARF2_OBJECT_END_PTR_EXTENSION
>     return f == (const fde *) obj->fde_end || f->length == 0;


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-09-16 10:19 [PATCH v4] eliminate mutex in fast path of __register_frame Thomas Neumann
  2022-09-16 14:49 ` Jason Merrill
@ 2022-09-18  8:59 ` Dimitar Dimitrov
  2022-09-18  9:20   ` Thomas Neumann
  2022-09-18 10:02   ` Thomas Neumann
  2022-09-19 13:46 ` Stephan Bergmann
  2022-11-21 11:14 ` Tamar Christina
  3 siblings, 2 replies; 33+ messages in thread
From: Dimitar Dimitrov @ 2022-09-18  8:59 UTC (permalink / raw)
  To: Thomas Neumann
  Cc: gcc-patches, Jason Merrill, Florian Weimer, Jakub Jelinek,
	Jonathan Wakely

On Fri, Sep 16, 2022 at 12:19:36PM +0200, Thomas Neumann via Gcc-patches wrote:
> The __register_frame/__deregister_frame functions are used to register
> unwinding frames from JITed code in a sorted list. That list itself
> is protected by object_mutex, which leads to terrible performance
> in multi-threaded code and is somewhat expensive even if single-threaded.
> There was already a fast-path that avoided taking the mutex if no
> frame was registered at all.
> 
> This commit eliminates both the mutex and the sorted list from
> the atomic fast path, and replaces it with a btree that uses
> optimistic lock coupling during lookup. This allows for fully parallel
> unwinding and is essential to scale exception handling to large
> core counts.
> 
> Changes since v3:
> - Avoid code duplication by adding query mode to classify_object_over_fdes
> - Adjust all comments as requested
> 
> libgcc/ChangeLog:
> 
>         * unwind-dw2-fde.c (release_registered_frames): Cleanup at shutdown.
>         (__register_frame_info_table_bases): Use btree in atomic fast path.
>         (__deregister_frame_info_bases): Likewise.
>         (_Unwind_Find_FDE): Likewise.
>         (base_from_object): Make parameter const.
>         (classify_object_over_fdes): Add query-only mode.
>         (get_pc_range): Compute PC range for lookup.
>         * unwind-dw2-fde.h (last_fde): Make parameter const.
>         * unwind-dw2-btree.h: New file.
> ---
>  libgcc/unwind-dw2-btree.h | 953 ++++++++++++++++++++++++++++++++++++++
>  libgcc/unwind-dw2-fde.c   | 195 ++++++--
>  libgcc/unwind-dw2-fde.h   |   2 +-
>  3 files changed, 1098 insertions(+), 52 deletions(-)
>  create mode 100644 libgcc/unwind-dw2-btree.h
> 

Hi Thomas,

This patch broke avr and pru-elf cross builds:
  gcc/libgcc/unwind-dw2-fde.c:680:28: error: unknown type name ‘uintptr_t’
  680 |                            uintptr_t *range)

Should uintptr_t be replaced with __UINTPTR_TYPE__? Such change fixes the
above broken builds for me. But I'm not sure how valid it is for that
part of libgcc.

Other embedded targets like arm-none-eabi are not broken because they
overwrite LIB2ADDEH, and consequently unwind-dw2-fde.c is not built for
them.

Regards,
Dimitar

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-09-18  8:59 ` Dimitar Dimitrov
@ 2022-09-18  9:20   ` Thomas Neumann
  2022-09-18 10:02   ` Thomas Neumann
  1 sibling, 0 replies; 33+ messages in thread
From: Thomas Neumann @ 2022-09-18  9:20 UTC (permalink / raw)
  To: Dimitar Dimitrov
  Cc: gcc-patches, Jason Merrill, Florian Weimer, Jakub Jelinek,
	Jonathan Wakely

Hi Dimitar,

> This patch broke avr and pru-elf cross builds:
>    gcc/libgcc/unwind-dw2-fde.c:680:28: error: unknown type name ‘uintptr_t’
>    680 |                            uintptr_t *range)
> 
> Should uintptr_t be replaced with __UINTPTR_TYPE__? Such change fixes the
> above broken builds for me. But I'm not sure how valid it is for that
> part of libgcc.

thanks for notifying me, I was not aware that uintptr_t is not available 
on all platforms. After looking at the existing code I think 
__UINTPTR_TYPE__ is the correct substitute. I will do some more testing 
and commit an s/uintptr_t/__UINTPTR_TYPE__/ patch soon. (I will probably 
use a typedef, as seen in generic-morestack.c, to avoid uglifying the code).
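
Roughly along these lines (a sketch only, the actual patch may differ in
details):

   /* libgcc should not rely on <stdint.h> here, so use the
      compiler-provided built-in type under a readable name.  */
   typedef __UINTPTR_TYPE__ uintptr_type;

   /* ... and s/uintptr_t/uintptr_type/ in the new code, e.g.: */
   static size_t
   classify_object_over_fdes (struct object *ob, const fde *this_fde,
                              uintptr_type *range);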

Best

Thomas

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-09-18  8:59 ` Dimitar Dimitrov
  2022-09-18  9:20   ` Thomas Neumann
@ 2022-09-18 10:02   ` Thomas Neumann
  1 sibling, 0 replies; 33+ messages in thread
From: Thomas Neumann @ 2022-09-18 10:02 UTC (permalink / raw)
  To: Dimitar Dimitrov
  Cc: gcc-patches, Jason Merrill, Florian Weimer, Jakub Jelinek,
	Jonathan Wakely

Hi Dimitar,

> This patch broke avr and pru-elf cross builds:
>    gcc/libgcc/unwind-dw2-fde.c:680:28: error: unknown type name ‘uintptr_t’
>    680 |                            uintptr_t *range)
> 
> Should uintptr_t be replaced with __UINTPTR_TYPE__? Such change fixes the
> above broken builds for me. But I'm not sure how valid it is for that
> part of libgcc.

I have fixed that by using a typedef for __UINTPTR_TYPE__.

Best

Thomas

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-09-16 10:19 [PATCH v4] eliminate mutex in fast path of __register_frame Thomas Neumann
  2022-09-16 14:49 ` Jason Merrill
  2022-09-18  8:59 ` Dimitar Dimitrov
@ 2022-09-19 13:46 ` Stephan Bergmann
  2022-09-19 13:55   ` Thomas Neumann
  2022-09-19 15:33   ` Thomas Neumann
  2022-11-21 11:14 ` Tamar Christina
  3 siblings, 2 replies; 33+ messages in thread
From: Stephan Bergmann @ 2022-09-19 13:46 UTC (permalink / raw)
  To: Thomas Neumann, gcc-patches

On 16/09/2022 12:19, Thomas Neumann via Gcc-patches wrote:
> The __register_frame/__deregister_frame functions are used to register
> unwinding frames from JITed code in a sorted list. That list itself
> is protected by object_mutex, which leads to terrible performance
> in multi-threaded code and is somewhat expensive even if single-threaded.
> There was already a fast-path that avoided taking the mutex if no
> frame was registered at all.
> 
> This commit eliminates both the mutex and the sorted list from
> the atomic fast path, and replaces it with a btree that uses
> optimistic lock coupling during lookup. This allows for fully parallel
> unwinding and is essential to scale exception handling to large
> core counts.

I haven't debugged this in any way, nor checked whether it only impacts 
exactly my below scenario, but noticed the following:

At least when building LibreOffice with Clang (16 trunk) with ASan and 
UBsan enabled against libstdc++ (with --gcc-toolchain and 
LD_LIBRARY_PATH to pick up a libstdc++ trunk build including this change 
at build and run-time), at least one of the LibreOffice tests executed 
during the build started to fail with

> Thread 1 "cppunittester" received signal SIGABRT, Aborted.
> __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0)Downloading 0.00 MB source file /usr/src/debug/glibc-2.36-4.fc37.x86_64/nptl/pthread_kill.c
>  at ~/.cache/debuginfod_client/a6572cd46182057d3dbacf1685a12edab0e2eda1/source##usr##src##debug##glibc-2.36-4.fc37.x86_64##nptl##pthread_kill.c:44
> 44	      return INTERNAL_SYSCALL_ERROR_P (ret) ? INTERNAL_SYSCALL_ERRNO (ret) : 0;
> (gdb) bt
> #0  __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at ~/.cache/debuginfod_client/a6572cd46182057d3dbacf1685a12edab0e2eda1/source##usr##src##debug##glibc-2.36-4.fc37.x86_64##nptl##pthread_kill.c:44
> #1  0x00007ffff6dcdd33 in __pthread_kill_internal (signo=6, threadid=<optimized out>) at ~/.cache/debuginfod_client/a6572cd46182057d3dbacf1685a12edab0e2eda1/source##usr##src##debug##glibc-2.36-4.fc37.x86_64##nptl##pthread_kill.c:78
> #2  0x00007ffff6d7daa6 in __GI_raise (sig=sig@entry=6) at ~/.cache/debuginfod_client/a6572cd46182057d3dbacf1685a12edab0e2eda1/source##usr##src##debug##glibc-2.36-4.fc37.x86_64##signal##..##sysdeps##posix##raise.c:26
> #3  0x00007ffff6d677fc in __GI_abort () at ~/.cache/debuginfod_client/a6572cd46182057d3dbacf1685a12edab0e2eda1/source##usr##src##debug##glibc-2.36-4.fc37.x86_64##stdlib##abort.c:79
> #4  0x00007ffff6f377e8 in __deregister_frame_info_bases (begin=<optimized out>) at ~/gcc/trunk/src/libgcc/unwind-dw2-fde.c:285
> #5  __deregister_frame_info_bases (begin=<optimized out>) at ~/gcc/trunk/src/libgcc/unwind-dw2-fde.c:223
> #6  0x00007fffc7c3b53f in __do_fini () at ~/lo/core/instdir/program/libcairo.so.2
> #7  0x00007ffff7fcda9e in _dl_fini () at ~/.cache/debuginfod_client/653dfb54d6e6d9c27c349f698a8af1ab86d5501d/source##usr##src##debug##glibc-2.36-4.fc37.x86_64##elf##dl-fini.c:142
> #8  0x00007ffff6d7ff35 in __run_exit_handlers (status=0, listp=0x7ffff6f13840 <__exit_funcs>, run_list_atexit=run_list_atexit@entry=true, run_dtors=run_dtors@entry=true) at ~/.cache/debuginfod_client/a6572cd46182057d3dbacf1685a12edab0e2eda1/source##usr##src##debug##glibc-2.36-4.fc37.x86_64##stdlib##exit.c:113
> #9  0x00007ffff6d800b0 in __GI_exit (status=<optimized out>) at ~/.cache/debuginfod_client/a6572cd46182057d3dbacf1685a12edab0e2eda1/source##usr##src##debug##glibc-2.36-4.fc37.x86_64##stdlib##exit.c:143
> #10 0x00007ffff6d68517 in __libc_start_call_main (main=main@entry=0x5555556c9ef0 <main(int, char**)>, argc=argc@entry=24, argv=argv@entry=0x7ffffffefbf8) at ~/.cache/debuginfod_client/a6572cd46182057d3dbacf1685a12edab0e2eda1/source##usr##src##debug##glibc-2.36-4.fc37.x86_64##csu##..##sysdeps##nptl##libc_start_call_main.h:74
> #11 0x00007ffff6d685c9 in __libc_start_main_impl (main=0x5555556c9ef0 <main(int, char**)>, argc=24, argv=0x7ffffffefbf8, init=<optimized out>, fini=<optimized out>, rtld_fini=<optimized out>, stack_end=0x7ffffffefbe8) at ~/.cache/debuginfod_client/a6572cd46182057d3dbacf1685a12edab0e2eda1/source##usr##src##debug##glibc-2.36-4.fc37.x86_64##csu##..##csu##libc-start.c:381
> #12 0x00005555555f1575 in _start ()

and which went away again when locally reverting this 
<https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=6e80a1d164d1f996ad08a512c000025a7c2ca893> 
"eliminate mutex in fast path of __register_frame".


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-09-19 13:46 ` Stephan Bergmann
@ 2022-09-19 13:55   ` Thomas Neumann
  2022-09-19 14:00     ` Stephan Bergmann
  2022-09-19 15:33   ` Thomas Neumann
  1 sibling, 1 reply; 33+ messages in thread
From: Thomas Neumann @ 2022-09-19 13:55 UTC (permalink / raw)
  To: Stephan Bergmann, gcc-patches

> I haven't debugged this in any way, nor checked whether it only impacts 
> exactly my below scenario, but noticed the following:
> 
> At least when building LibreOffice with Clang (16 trunk) with ASan and 
> UBsan enabled against libstdc++ (with --gcc-toolchain and 
> LD_LIBRARY_PATH to pick up a libstdc++ trunk build including this change 
> at build and run-time), at least one of the LibreOffice tests executed 
> during the build started to fail with

Apparently a registered frame is not found when deregistering, which 
triggers an assert. I will debug this. Could you send me a script or a 
description on how to reproduce the issue?

Best

Thomas

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-09-19 13:55   ` Thomas Neumann
@ 2022-09-19 14:00     ` Stephan Bergmann
  0 siblings, 0 replies; 33+ messages in thread
From: Stephan Bergmann @ 2022-09-19 14:00 UTC (permalink / raw)
  To: Thomas Neumann; +Cc: gcc-patches

On 19/09/2022 15:55, Thomas Neumann wrote:
> Apparently a registered frame is not found when deregistering, which 
> triggers an assert. I will debug this. Could you send me a script or a 
> description on how to reproduce the issue?

Thanks a lot!  I'm in the process of checking whether a more generic 
LibreOffice build scenario will also exhibit this, and will let you know 
about a (hopefully less demanding) reproducer scenario.


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-09-19 13:46 ` Stephan Bergmann
  2022-09-19 13:55   ` Thomas Neumann
@ 2022-09-19 15:33   ` Thomas Neumann
  2022-09-20  5:39     ` Stephan Bergmann
  1 sibling, 1 reply; 33+ messages in thread
From: Thomas Neumann @ 2022-09-19 15:33 UTC (permalink / raw)
  To: Stephan Bergmann, gcc-patches


> At least when building LibreOffice with Clang (16 trunk) with ASan and 
> UBsan enabled against libstdc++ (with --gcc-toolchain and 
> LD_LIBRARY_PATH to pick up a libstdc++ trunk build including this change 
> at build and run-time), at least one of the LibreOffice tests executed 
> during the build started to fail with

I could not reproduce the issue when building LibreOffice with gcc, but 
after reading the compiler-rt version of crtbegin.c I think the problem 
is the destruction order in compiler-rt. It calls 
__deregister_frame_info_bases after our lookup structure has already 
been destroyed.

Can you try if the patch below fixes the problem? It keeps the data 
structures alive at shutdown, though, which will probably make some leak 
detectors unhappy.

Alternatively we could simply remove the gcc_assert (ob) in line 285 of 
that file. As far as I can see in crt-begin nothing bad happens if we 
return nullptr at shutdown.
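
The second variant would be little more than this at the end of
__deregister_frame_info_bases (a sketch only, not the patch below):

   #ifdef ATOMIC_FDE_FAST_PATH
     /* A late deregistration after the b-tree has been destroyed at
        shutdown simply finds nothing; returning NULL instead of
        asserting should be harmless for the crtbegin-style callers.  */
     if (!ob)
       return NULL;
   #endif
     gcc_assert (ob);
     return (void *) ob;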

Best

Thomas


diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 919abfe0664..d427318280c 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -49,16 +49,6 @@ typedef __UINTPTR_TYPE__ uintptr_type;

  static struct btree registered_frames;

-static void
-release_registered_frames (void) __attribute__ ((destructor (110)));
-static void
-release_registered_frames (void)
-{
-  /* Release the b-tree and all frames. Frame releases that happen 
later are
-   * silently ignored */
-  btree_destroy (&registered_frames);
-}
-
  static void
  get_pc_range (const struct object *ob, uintptr_type *range);
  static void



^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-09-19 15:33   ` Thomas Neumann
@ 2022-09-20  5:39     ` Stephan Bergmann
  0 siblings, 0 replies; 33+ messages in thread
From: Stephan Bergmann @ 2022-09-20  5:39 UTC (permalink / raw)
  To: Thomas Neumann; +Cc: gcc-patches

On 19/09/2022 17:33, Thomas Neumann wrote:
> Can you try if the patch below fixes the problem? It keeps the data 
> structures alive at shutdown, though, which will probably make some leak 
> detectors unhappy.
> 
> Alternatively we could simply remove the gcc_assert (ob) in line 285 of 
> that file. As far as I can see in crt-begin nothing bad happens if we 
> return nullptr at shutdown.

Yes, thanks, each of those two alternative approaches would appear to 
fix that LibreOffice build of mine.


^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-09-16 10:19 [PATCH v4] eliminate mutex in fast path of __register_frame Thomas Neumann
                   ` (2 preceding siblings ...)
  2022-09-19 13:46 ` Stephan Bergmann
@ 2022-11-21 11:14 ` Tamar Christina
  2022-11-21 11:22   ` Thomas Neumann
  3 siblings, 1 reply; 33+ messages in thread
From: Tamar Christina @ 2022-11-21 11:14 UTC (permalink / raw)
  To: Thomas Neumann, gcc-patches, Jason Merrill
  Cc: Florian Weimer, Jakub Jelinek, Jonathan Wakely

Hi,

I couldn't find a Bugzilla account for you Thomas,

I've bisected a slowdown in startup speed for C++ to this change; see
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107675

When dynamically linking, a fast enough machine hides the latency, but when
statically linking or on slower devices this change caused a 5x increase in
instruction count and a 2x increase in cycle count before getting to main.

This has been quite noticeable on smaller devices.  Is there a reason the btree
can't be initialized lazily? It seems a bit harsh to pay the cost of unwinding at
startup even when you don't throw exceptions.
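
Something like the following is roughly what I mean by "lazily" (a very rough
sketch only; it invents a lazy_objects list and a drain_lazy_objects helper
and ignores the synchronization a real patch would need):

   /* Registration would just queue the object...  */
   static struct object *lazy_objects;

   /* ...and the expensive FDE scan would run on first use, e.g. at the
      top of _Unwind_Find_FDE, instead of before main.  */
   static void
   drain_lazy_objects (void)
   {
     struct object *ob = lazy_objects;
     lazy_objects = NULL;
     for (; ob; ob = ob->next)
       {
         uintptr_type range[2];
         init_object (ob);
         get_pc_range (ob, range);
         btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
       }
   }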

Cheers,
Tamar


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-11-21 11:14 ` Tamar Christina
@ 2022-11-21 11:22   ` Thomas Neumann
  2022-11-21 11:48     ` Jakub Jelinek
  2022-11-21 11:49     ` [PATCH v4] eliminate mutex in fast path of __register_frame Tamar Christina
  0 siblings, 2 replies; 33+ messages in thread
From: Thomas Neumann @ 2022-11-21 11:22 UTC (permalink / raw)
  To: Tamar Christina, gcc-patches, Jason Merrill
  Cc: Florian Weimer, Jakub Jelinek, Jonathan Wakely

Hi,

> When dynamically linking a fast enough machine hides the latency, but when
> Statically linking or on slower devices this change caused a 5x increase in
> Instruction count and 2x increase in cycle count before getting to main.
> 
> This has been quite noticeable on smaller devices.  Is there a reason the btree
> can't be initialized lazily? It seems a bit harsh to pay the cost of unwinding at
> startup even when you don't throw exceptions..

we cannot easily do that lazily, because lazy initialization would again
need a mutex, which is exactly what we wanted to get rid of.
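
Just to illustrate the point (this is not code from the patch, and all names
here are made up): lazy initialization would look roughly like the classic
double-checked pattern below, and its slow path reintroduces exactly the kind
of lock the patch removes. Plain pthreads are used for brevity; libgcc itself
would go through the __gthread wrappers.

/* Illustrative sketch only, not part of the patch.  */
#include <pthread.h>
#include <stdatomic.h>

static atomic_int frames_initialized;          /* 0 = not yet built */
static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;

static void
ensure_frames_initialized (void)
{
  /* Fast path: already built, one acquire load.  */
  if (atomic_load_explicit (&frames_initialized, memory_order_acquire))
    return;

  /* Slow path: the mutex we wanted to get rid of comes back here.  */
  pthread_mutex_lock (&init_mutex);
  if (!atomic_load_explicit (&frames_initialized, memory_order_relaxed))
    {
      /* ... build the lookup structure here ... */
      atomic_store_explicit (&frames_initialized, 1, memory_order_release);
    }
  pthread_mutex_unlock (&init_mutex);
}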

Having said that, I am surprised that you saw a noticeable difference. 
On most platforms there should not be dynamic frame registration at all, 
as the regular frames are directly read from the ELF data.

Can you please send me a precise description of how to reproduce the 
issue? (Platform, tools, and a VM if you have one would be great.) I will 
then debug this to improve the startup time.

Best

Thomas


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-11-21 11:22   ` Thomas Neumann
@ 2022-11-21 11:48     ` Jakub Jelinek
  2022-11-21 17:13       ` H.J. Lu
  2022-11-21 11:49     ` [PATCH v4] eliminate mutex in fast path of __register_frame Tamar Christina
  1 sibling, 1 reply; 33+ messages in thread
From: Jakub Jelinek @ 2022-11-21 11:48 UTC (permalink / raw)
  To: Thomas Neumann
  Cc: Tamar Christina, gcc-patches, Jason Merrill, Florian Weimer,
	Jonathan Wakely

On Mon, Nov 21, 2022 at 12:22:32PM +0100, Thomas Neumann via Gcc-patches wrote:
> > When dynamically linking a fast enough machine hides the latency, but when
> > Statically linking or on slower devices this change caused a 5x increase in
> > Instruction count and 2x increase in cycle count before getting to main.
> > 
> > This has been quite noticeable on smaller devices.  Is there a reason the btree
> > can't be initialized lazily? It seems a bit harsh to pay the cost of unwinding at
> > startup even when you don't throw exceptions..
> 
> we cannot easily do that lazily because otherwise we need a mutex for lazy
> initialization, which is exactly what we wanted to get rid of.
> 
> Having said that, I am surprised that you saw a noticeable difference. On
> most platforms there should not be dynamic frame registration at all, as the
> regular frames are directly read from the ELF data.
> 
> Can you please send me an precise description on how to reproduce the issue?
> (Platform, tools, a VM if you have one would be great). I will then debug
> this to improve the startup time.

I can see it being called as well for -static linked binaries.
-static links in crtbeginT.o, which is libgcc/crtstuff.c built with the
CRTSTUFFT_O macro defined (among other things), and that disables
USE_PT_GNU_EH_FRAME:
#if defined(OBJECT_FORMAT_ELF) \
    && !defined(OBJECT_FORMAT_FLAT) \
    && defined(HAVE_LD_EH_FRAME_HDR) \
    && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
    && defined(__GLIBC__) && __GLIBC__ >= 2
#include <link.h>
/* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h.
   But it doesn't use PT_GNU_EH_FRAME ELF segment currently.  */
# if !defined(__UCLIBC__) \
     && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 2) \
     || (__GLIBC__ == 2 && __GLIBC_MINOR__ == 2 && defined(DT_CONFIG)))
#  define USE_PT_GNU_EH_FRAME
# endif
#endif

I think .eh_frame_hdr was never used for statically linked programs;
see already https://gcc.gnu.org/legacy-ml/gcc-patches/2001-12/msg01383.html
We don't pass --eh-frame-hdr when linking statically, and dl_iterate_phdr
doesn't handle those binaries.
Now, if -static -Wl,--eh-frame-hdr is passed to the driver when linking,
an .eh_frame_hdr section is created and the __GNU_EH_FRAME_HDR symbol points
to the start of that section, so at least that section could be found
if something in the crt files and libgcc is adjusted.  But on e.g.
i?86, nios2, frv and bfin we also need to find the GOT.  Also, would it
work even for static PIEs?

	Jakub


^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-11-21 11:22   ` Thomas Neumann
  2022-11-21 11:48     ` Jakub Jelinek
@ 2022-11-21 11:49     ` Tamar Christina
  2022-11-21 11:53       ` Thomas Neumann
  1 sibling, 1 reply; 33+ messages in thread
From: Tamar Christina @ 2022-11-21 11:49 UTC (permalink / raw)
  To: Thomas Neumann, gcc-patches, Jason Merrill
  Cc: Florian Weimer, Jakub Jelinek, Jonathan Wakely

> -----Original Message-----
> From: Thomas Neumann <thomas.neumann@in.tum.de>
> Sent: Monday, November 21, 2022 11:23 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org;
> Jason Merrill <jason@redhat.com>
> Cc: Florian Weimer <fweimer@redhat.com>; Jakub Jelinek
> <jakub@redhat.com>; Jonathan Wakely <jwakely.gcc@gmail.com>
> Subject: Re: [PATCH v4] eliminate mutex in fast path of __register_frame
> 
> Hi,
> 
> > When dynamically linking a fast enough machine hides the latency, but
> > when Statically linking or on slower devices this change caused a 5x
> > increase in Instruction count and 2x increase in cycle count before getting
> to main.
> >
> > This has been quite noticeable on smaller devices.  Is there a reason
> > the btree can't be initialized lazily? It seems a bit harsh to pay the
> > cost of unwinding at startup even when you don't throw exceptions..
> 
> we cannot easily do that lazily because otherwise we need a mutex for lazy
> initialization, which is exactly what we wanted to get rid of.
> 
> Having said that, I am surprised that you saw a noticeable difference.
> On most platforms there should not be dynamic frame registration at all, as
> the regular frames are directly read from the ELF data.
> 
> Can you please send me an precise description on how to reproduce the
> issue? (Platform, tools, a VM if you have one would be great). I will then
> debug this to improve the startup time.

It's easy to reproduce on x86 as well.

As a testcase:

#include <cstdio>

int main(int argc, char** argv) {
    return 0;
}

And just compile with: g++ -O1 hello.cpp -static -o hello.exe.

Before this change on x86 I got:

> perf stat -r 200 ./hello.exe

 Performance counter stats for './hello.exe' (200 runs):

              0.32 msec task-clock                #    0.326 CPUs utilized            ( +-  0.34% )
                 0      context-switches          #    0.000 K/sec
                 0      cpu-migrations            #    0.000 K/sec
                22      page-faults               #    0.070 M/sec                    ( +-  0.13% )
           310,194      cycles                    #    0.984 GHz                      ( +-  0.33% )
           317,310      instructions              #    1.02  insn per cycle           ( +-  0.18% )
            58,885      branches                  #  186.710 M/sec                    ( +-  0.12% )
               931      branch-misses             #    1.58% of all branches          ( +-  2.57% )

        0.00096799 +- 0.00000374 seconds time elapsed  ( +-  0.39% )

And after this change:

> perf stat -r 200 ./hello.exe

 Performance counter stats for './hello.exe' (200 runs):

              1.03 msec task-clock                #    0.580 CPUs utilized            ( +-  0.23% )
                 0      context-switches          #    0.000 K/sec
                 0      cpu-migrations            #    0.000 K/sec
                27      page-faults               #    0.026 M/sec                    ( +-  0.10% )
         1,034,038      cycles                    #    1.002 GHz                      ( +-  0.11% )
         2,485,983      instructions              #    2.40  insn per cycle           ( +-  0.02% )
           557,567      branches                  #  540.215 M/sec                    ( +-  0.01% )
             4,843      branch-misses             #    0.87% of all branches          ( +-  0.53% )

        0.00178093 +- 0.00000456 seconds time elapsed  ( +-  0.26% )

Regards,
Tamar


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-11-21 11:49     ` [PATCH v4] eliminate mutex in fast path of __register_frame Tamar Christina
@ 2022-11-21 11:53       ` Thomas Neumann
  0 siblings, 0 replies; 33+ messages in thread
From: Thomas Neumann @ 2022-11-21 11:53 UTC (permalink / raw)
  To: Tamar Christina, gcc-patches, Jason Merrill
  Cc: Florian Weimer, Jakub Jelinek, Jonathan Wakely

> 
> It's easy to reproduce on x86 as well.
> 
> As a testcase:
> 
> #include <cstdio>
> 
> int main(int argc, char** argv) {
>      return 0;
> }
> 
> And just compile with: g++ -O1 hello.cpp -static -o hello.exe.

Thanks, I will take a look.

Best

Thomas


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-11-21 11:48     ` Jakub Jelinek
@ 2022-11-21 17:13       ` H.J. Lu
  2022-11-22  0:31         ` Thomas Neumann
  2022-11-22  8:00         ` [PATCH] speed up end_fde_sort using radix sort Thomas Neumann
  0 siblings, 2 replies; 33+ messages in thread
From: H.J. Lu @ 2022-11-21 17:13 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Thomas Neumann, Tamar Christina, gcc-patches, Jason Merrill,
	Florian Weimer, Jonathan Wakely

On Mon, Nov 21, 2022 at 3:49 AM Jakub Jelinek via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Mon, Nov 21, 2022 at 12:22:32PM +0100, Thomas Neumann via Gcc-patches wrote:
> > > When dynamically linking a fast enough machine hides the latency, but when
> > > Statically linking or on slower devices this change caused a 5x increase in
> > > Instruction count and 2x increase in cycle count before getting to main.
> > >
> > > This has been quite noticeable on smaller devices.  Is there a reason the btree
> > > can't be initialized lazily? It seems a bit harsh to pay the cost of unwinding at
> > > startup even when you don't throw exceptions..
> >
> > we cannot easily do that lazily because otherwise we need a mutex for lazy
> > initialization, which is exactly what we wanted to get rid of.
> >
> > Having said that, I am surprised that you saw a noticeable difference. On
> > most platforms there should not be dynamic frame registration at all, as the
> > regular frames are directly read from the ELF data.
> >
> > Can you please send me an precise description on how to reproduce the issue?
> > (Platform, tools, a VM if you have one would be great). I will then debug
> > this to improve the startup time.
>
> I can see it being called as well for -static linked binaries.
> -static links in crtbeginT.o which is libgcc/crtstuff.c built with
> CRTSTUFFT_O macro being defined among other things, and that disables
> USE_PT_GNU_EH_FRAME:
> #if defined(OBJECT_FORMAT_ELF) \
>     && !defined(OBJECT_FORMAT_FLAT) \
>     && defined(HAVE_LD_EH_FRAME_HDR) \
>     && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
>     && defined(__GLIBC__) && __GLIBC__ >= 2
> #include <link.h>
> /* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h.
>    But it doesn't use PT_GNU_EH_FRAME ELF segment currently.  */
> # if !defined(__UCLIBC__) \
>      && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 2) \
>      || (__GLIBC__ == 2 && __GLIBC_MINOR__ == 2 && defined(DT_CONFIG)))
> #  define USE_PT_GNU_EH_FRAME
> # endif
> #endif
>
> I think .eh_frame_hdr was never used for statically linked programs,
> see already https://gcc.gnu.org/legacy-ml/gcc-patches/2001-12/msg01383.html
> We don't pass --eh-frame-hdr when linking statically and dl_iterate_phdr
> doesn't handle those.
> Now, if -static -Wl,--eh-frame-hdr is passed when linking to the driver,
> .eh_frame_hdr section is created and __GNU_EH_FRAME_HDR symbol points to
> the start of that section, so at least that section could be found
> if something in the crt files and libgcc is adjusted.  But e.g. on
> i?86, nios2, frv and bfin we also need to find the GOT.  Also, would it
> work even for static PIEs?
>
>         Jakub
>

There is

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54568

-- 
H.J.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-11-21 17:13       ` H.J. Lu
@ 2022-11-22  0:31         ` Thomas Neumann
  2022-11-22  8:20           ` Florian Weimer
  2022-11-22  8:00         ` [PATCH] speed up end_fde_sort using radix sort Thomas Neumann
  1 sibling, 1 reply; 33+ messages in thread
From: Thomas Neumann @ 2022-11-22  0:31 UTC (permalink / raw)
  To: H.J. Lu, Jakub Jelinek
  Cc: Tamar Christina, gcc-patches, Jason Merrill, Florian Weimer,
	Jonathan Wakely

Hi,

>>>> When dynamically linking a fast enough machine hides the latency, but when
>>>> Statically linking or on slower devices this change caused a 5x increase in
>>>> Instruction count and 2x increase in cycle count before getting to main.

I have looked at ways to fix that. The problem is that with static 
linking unwinding tables are registered dynamically, and with my patch 
that registration triggers an eager sort of the fde lists, whereas 
previously the lists were sorted only when the first exception was 
thrown. If an application throws at least one exception there is no 
downside to eager sorting, but if the application never throws there 
is overhead.

The obvious way to improve the situation is to make sorting faster. When 
replacing the split+sort+merge logic with a radix sort (which can be 
done without additional memory) we get the following timings for your
#include <cstdio>
int main() {}
example (with perf stat -r 200):

# pre-patch version, fast
               0,06 msec task-clock
            272.286      cycles
            464.754      instructions
# post-patch version, slow
               0,21 msec task-clock
            972.876      cycles
          3.079.515      instructions
# +radix sort, in the middle
               0,13 msec task-clock
            604.697      cycles
          1.702.930      instructions

The radix sort clearly improves things, but it does not fully eliminate 
the overhead.

The question is, how much do we care about that situation (i.e., static 
linking, exceptions registered but never thrown)? I could change the 
code to recognize three states instead of two: no exceptions registered, 
exceptions registered but never thrown, and full exception mode. But that 
would increase code complexity and it would pessimize applications that 
do throw, as we then need more checks to guard against concurrent changes.

That makes the code more complex and a bit slower, which is why I am 
hesitant. But I can implement it if you think we need it. Or we just 
replace the sort, which is probably a good idea anyway.
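
For illustration only, the three states could be modeled roughly like 
this (all names below are hypothetical and not from any patch):

/* Hypothetical sketch of the three-state idea; none of these names
   exist in libgcc.  */
enum dynamic_frame_state
{
  FRAMES_NONE,        /* no dynamic frames registered at all        */
  FRAMES_REGISTERED,  /* frames registered, no exception thrown yet */
  FRAMES_ACTIVE       /* at least one exception has been thrown     */
};

/* Every unwinder lookup would have to check this state atomically
   before touching the (possibly still unsorted) fde lists, which is
   the extra cost mentioned above.  */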

Best

Thomas

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH] speed up end_fde_sort using radix sort
  2022-11-21 17:13       ` H.J. Lu
  2022-11-22  0:31         ` Thomas Neumann
@ 2022-11-22  8:00         ` Thomas Neumann
  2022-12-16 18:02           ` Jason Merrill
  1 sibling, 1 reply; 33+ messages in thread
From: Thomas Neumann @ 2022-11-22  8:00 UTC (permalink / raw)
  To: gcc-patches
  Cc: Tamar Christina, Jason Merrill, Florian Weimer, Jonathan Wakely,
	H.J. Lu, Jakub Jelinek

When registering a dynamic unwinding frame the fde list is sorted.
Previously, we split the list into a sorted and an unsorted part,
sorted the latter using heap sort, and merged both. That can be
quite slow due to the large number of (expensive) comparisons.

This patch replaces that logic with a radix sort instead. The
radix sort uses the same amount of memory as the old logic,
using the second list as auxiliary space, and it includes two
techniques to speed up sorting: First, it computes the pointer
addresses for blocks of values, reducing the decoding overhead.
And it recognizes when the data has reached a sorted state,
allowing for early termination. When running out of memory
we fall back to pure heap sort, as before.

For this test program

#include <cstdio>
int main(int argc, char** argv) {
      return 0;
}

compiled with g++ -O -o hello -static hello.c we get with
perf stat -r 200 on a 5950X the following performance numbers:

old logic:

               0,20 msec task-clock
            930.834      cycles
          3.079.765      instructions
         0,00030478 +- 0,00000237 seconds time elapsed

new logic:

               0,10 msec task-clock
            473.269      cycles
          1.239.077      instructions
         0,00021119 +- 0,00000168 seconds time elapsed

libgcc/ChangeLog:
         * unwind-dw2-fde.c: Use radix sort instead of split+sort+merge.
---
  libgcc/unwind-dw2-fde.c | 234 +++++++++++++++++++++++-----------------
  1 file changed, 134 insertions(+), 100 deletions(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 3c0cc654ec0..b81540c41a4 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -456,22 +456,52 @@ fde_mixed_encoding_compare (struct object *ob, const fde *x, const fde *y)
  
  typedef int (*fde_compare_t) (struct object *, const fde *, const fde *);
  
+// The extractor functions compute the pointer values for a block of
+// fdes. The block processing hides the call overhead.
  
-/* This is a special mix of insertion sort and heap sort, optimized for
-   the data sets that actually occur. They look like
-   101 102 103 127 128 105 108 110 190 111 115 119 125 160 126 129 130.
-   I.e. a linearly increasing sequence (coming from functions in the text
-   section), with additionally a few unordered elements (coming from functions
-   in gnu_linkonce sections) whose values are higher than the values in the
-   surrounding linear sequence (but not necessarily higher than the values
-   at the end of the linear sequence!).
-   The worst-case total run time is O(N) + O(n log (n)), where N is the
-   total number of FDEs and n is the number of erratic ones.  */
+static void
+fde_unencoded_extract (struct object *ob __attribute__ ((unused)),
+		       _Unwind_Ptr *target, const fde **x, int count)
+{
+  for (int index = 0; index < count; ++index)
+    memcpy (target + index, x[index]->pc_begin, sizeof (_Unwind_Ptr));
+}
+
+static void
+fde_single_encoding_extract (struct object *ob, _Unwind_Ptr *target,
+			     const fde **x, int count)
+{
+  _Unwind_Ptr base;
+
+  base = base_from_object (ob->s.b.encoding, ob);
+  for (int index = 0; index < count; ++index)
+    read_encoded_value_with_base (ob->s.b.encoding, base, x[index]->pc_begin,
+				  target + index);
+}
+
+static void
+fde_mixed_encoding_extract (struct object *ob, _Unwind_Ptr *target,
+			    const fde **x, int count)
+{
+  for (int index = 0; index < count; ++index)
+    {
+      int encoding = get_fde_encoding (x[index]);
+      read_encoded_value_with_base (encoding, base_from_object (encoding, ob),
+				    x[index]->pc_begin, target + index);
+    }
+}
+
+typedef void (*fde_extractor_t) (struct object *, _Unwind_Ptr *, const fde **,
+				 int);
+
+// Data is is sorted using radix sort if possible, using an temporary
+// auxiliary data structure of the same size as the input. When running
+// out of memory to in-place heap sort.
  
  struct fde_accumulator
  {
    struct fde_vector *linear;
-  struct fde_vector *erratic;
+  struct fde_vector *aux;
  };
  
  static inline int
@@ -485,8 +515,8 @@ start_fde_sort (struct fde_accumulator *accu, size_t count)
    if ((accu->linear = malloc (size)))
      {
        accu->linear->count = 0;
-      if ((accu->erratic = malloc (size)))
-	accu->erratic->count = 0;
+      if ((accu->aux = malloc (size)))
+	accu->aux->count = 0;
        return 1;
      }
    else
@@ -500,59 +530,6 @@ fde_insert (struct fde_accumulator *accu, const fde *this_fde)
      accu->linear->array[accu->linear->count++] = this_fde;
  }
  
-/* Split LINEAR into a linear sequence with low values and an erratic
-   sequence with high values, put the linear one (of longest possible
-   length) into LINEAR and the erratic one into ERRATIC. This is O(N).
-
-   Because the longest linear sequence we are trying to locate within the
-   incoming LINEAR array can be interspersed with (high valued) erratic
-   entries.  We construct a chain indicating the sequenced entries.
-   To avoid having to allocate this chain, we overlay it onto the space of
-   the ERRATIC array during construction.  A final pass iterates over the
-   chain to determine what should be placed in the ERRATIC array, and
-   what is the linear sequence.  This overlay is safe from aliasing.  */
-
-static inline void
-fde_split (struct object *ob, fde_compare_t fde_compare,
-	   struct fde_vector *linear, struct fde_vector *erratic)
-{
-  static const fde *marker;
-  size_t count = linear->count;
-  const fde *const *chain_end = &marker;
-  size_t i, j, k;
-
-  /* This should optimize out, but it is wise to make sure this assumption
-     is correct. Should these have different sizes, we cannot cast between
-     them and the overlaying onto ERRATIC will not work.  */
-  gcc_assert (sizeof (const fde *) == sizeof (const fde **));
-
-  for (i = 0; i < count; i++)
-    {
-      const fde *const *probe;
-
-      for (probe = chain_end;
-	   probe != &marker && fde_compare (ob, linear->array[i], *probe) < 0;
-	   probe = chain_end)
-	{
-	  chain_end = (const fde *const*) erratic->array[probe - linear->array];
-	  erratic->array[probe - linear->array] = NULL;
-	}
-      erratic->array[i] = (const fde *) chain_end;
-      chain_end = &linear->array[i];
-    }
-
-  /* Each entry in LINEAR which is part of the linear sequence we have
-     discovered will correspond to a non-NULL entry in the chain we built in
-     the ERRATIC array.  */
-  for (i = j = k = 0; i < count; i++)
-    if (erratic->array[i])
-      linear->array[j++] = linear->array[i];
-    else
-      erratic->array[k++] = linear->array[i];
-  linear->count = j;
-  erratic->count = k;
-}
-
  #define SWAP(x,y) do { const fde * tmp = x; x = y; y = tmp; } while (0)
  
  /* Convert a semi-heap to a heap.  A semi-heap is a heap except possibly
@@ -615,59 +592,116 @@ frame_heapsort (struct object *ob, fde_compare_t fde_compare,
  #undef SWAP
  }
  
-/* Merge V1 and V2, both sorted, and put the result into V1.  */
+// Radix sort data in V1 using V2 as aux memory. Runtime O(n).
  static inline void
-fde_merge (struct object *ob, fde_compare_t fde_compare,
-	   struct fde_vector *v1, struct fde_vector *v2)
+fde_radixsort (struct object *ob, fde_extractor_t fde_extractor,
+	       struct fde_vector *v1, struct fde_vector *v2)
  {
-  size_t i1, i2;
-  const fde * fde2;
-
-  i2 = v2->count;
-  if (i2 > 0)
+#define FANOUTBITS 8
+#define FANOUT (1 << FANOUTBITS)
+#define BLOCKSIZE 128
+  const unsigned rounds
+    = (__CHAR_BIT__ * sizeof (_Unwind_Ptr) + FANOUTBITS - 1) / FANOUTBITS;
+  const fde **a1 = v1->array, **a2 = v2->array;
+  _Unwind_Ptr ptrs[BLOCKSIZE + 1];
+  unsigned n = v1->count;
+  for (unsigned round = 0; round != rounds; ++round)
      {
-      i1 = v1->count;
-      do
+      unsigned counts[FANOUT] = {0};
+      unsigned violations = 0;
+
+      // Count the number of elements per bucket and check if we are already
+      // sorted.
+      _Unwind_Ptr last = 0;
+      for (unsigned i = 0; i < n;)
  	{
-	  i2--;
-	  fde2 = v2->array[i2];
-	  while (i1 > 0 && fde_compare (ob, v1->array[i1-1], fde2) > 0)
+	  unsigned chunk = ((n - i) <= BLOCKSIZE) ? (n - i) : BLOCKSIZE;
+	  fde_extractor (ob, ptrs + 1, a1 + i, chunk);
+	  ptrs[0] = last;
+	  for (unsigned j = 0; j < chunk; ++j)
  	    {
-	      v1->array[i1+i2] = v1->array[i1-1];
-	      i1--;
+	      unsigned b = (ptrs[j + 1] >> (round * FANOUTBITS)) & (FANOUT - 1);
+	      counts[b]++;
+	      // Use summation instead of an if to eliminate branches.
+	      violations += ptrs[j + 1] < ptrs[j];
  	    }
-	  v1->array[i1+i2] = fde2;
+	  i += chunk;
+	  last = ptrs[chunk];
  	}
-      while (i2 > 0);
-      v1->count += v2->count;
+
+      // Stop if we are already sorted.
+      if (!violations)
+	{
+	  // The sorted data is in a1 now.
+	  a2 = a1;
+	  break;
+	}
+
+      // Compute the prefix sum.
+      unsigned sum = 0;
+      for (unsigned i = 0; i != FANOUT; ++i)
+	{
+	  unsigned s = sum;
+	  sum += counts[i];
+	  counts[i] = s;
+	}
+
+      // Place all elements.
+      for (unsigned i = 0; i < n;)
+	{
+	  unsigned chunk = ((n - i) <= BLOCKSIZE) ? (n - i) : BLOCKSIZE;
+	  fde_extractor (ob, ptrs, a1 + i, chunk);
+	  for (unsigned j = 0; j < chunk; ++j)
+	    {
+	      unsigned b = (ptrs[j] >> (round * FANOUTBITS)) & (FANOUT - 1);
+	      a2[counts[b]++] = a1[i + j];
+	    }
+	  i += chunk;
+	}
+
+      // Swap a1 and a2.
+      const fde **tmp = a1;
+      a1 = a2;
+      a2 = tmp;
      }
+#undef BLOCKSIZE
+#undef FANOUT
+#undef FANOUTBITS
+
+  // The data is in a2 now, move in place if needed.
+  if (a2 != v1->array)
+    memcpy (v1->array, a2, sizeof (const fde *) * n);
  }
  
  static inline void
  end_fde_sort (struct object *ob, struct fde_accumulator *accu, size_t count)
  {
-  fde_compare_t fde_compare;
-
    gcc_assert (!accu->linear || accu->linear->count == count);
  
-  if (ob->s.b.mixed_encoding)
-    fde_compare = fde_mixed_encoding_compare;
-  else if (ob->s.b.encoding == DW_EH_PE_absptr)
-    fde_compare = fde_unencoded_compare;
-  else
-    fde_compare = fde_single_encoding_compare;
-
-  if (accu->erratic)
+  if (accu->aux)
      {
-      fde_split (ob, fde_compare, accu->linear, accu->erratic);
-      gcc_assert (accu->linear->count + accu->erratic->count == count);
-      frame_heapsort (ob, fde_compare, accu->erratic);
-      fde_merge (ob, fde_compare, accu->linear, accu->erratic);
-      free (accu->erratic);
+      fde_extractor_t fde_extractor;
+      if (ob->s.b.mixed_encoding)
+	fde_extractor = fde_mixed_encoding_extract;
+      else if (ob->s.b.encoding == DW_EH_PE_absptr)
+	fde_extractor = fde_unencoded_extract;
+      else
+	fde_extractor = fde_single_encoding_extract;
+
+      fde_radixsort (ob, fde_extractor, accu->linear, accu->aux);
+      free (accu->aux);
      }
    else
      {
-      /* We've not managed to malloc an erratic array,
+      fde_compare_t fde_compare;
+      if (ob->s.b.mixed_encoding)
+	fde_compare = fde_mixed_encoding_compare;
+      else if (ob->s.b.encoding == DW_EH_PE_absptr)
+	fde_compare = fde_unencoded_compare;
+      else
+	fde_compare = fde_single_encoding_compare;
+
+      /* We've not managed to malloc an aux array,
  	 so heap sort in the linear one.  */
        frame_heapsort (ob, fde_compare, accu->linear);
      }
-- 
2.37.2


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-11-22  0:31         ` Thomas Neumann
@ 2022-11-22  8:20           ` Florian Weimer
  2022-11-22  9:12             ` Thomas Neumann
                               ` (5 more replies)
  0 siblings, 6 replies; 33+ messages in thread
From: Florian Weimer @ 2022-11-22  8:20 UTC (permalink / raw)
  To: Thomas Neumann
  Cc: H.J. Lu, Jakub Jelinek, Tamar Christina, gcc-patches,
	Jason Merrill, Jonathan Wakely

* Thomas Neumann:

> Hi,
>
>>>>> When dynamically linking a fast enough machine hides the latency, but when
>>>>> Statically linking or on slower devices this change caused a 5x increase in
>>>>> Instruction count and 2x increase in cycle count before getting to main.
>
> I have looked at ways to fix that. The problem is that with static
> linking unwinding tables are registered dynamically, and with my patch 
> that registration triggers an eager sort of fde lists. While
> previously the lists were sorted when the first exception was
> thrown. If an application throws at least one exception there is no
> downside in eager sorting, but if the application never throws there
> is overhead.

Would it be possible to trigger lazy registration if the version is read
as a zero?  This would not introduce any additional atomic instructions
on the fast path.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4] eliminate mutex in fast path of __register_frame
  2022-11-22  8:20           ` Florian Weimer
@ 2022-11-22  9:12             ` Thomas Neumann
  2022-12-09 17:34             ` [PATCH] initialize fde objects lazily Thomas Neumann
                               ` (4 subsequent siblings)
  5 siblings, 0 replies; 33+ messages in thread
From: Thomas Neumann @ 2022-11-22  9:12 UTC (permalink / raw)
  To: Florian Weimer
  Cc: H.J. Lu, Jakub Jelinek, Tamar Christina, gcc-patches,
	Jason Merrill, Jonathan Wakely

> Would it be possible to trigger lazy registration if the version is read
> as a zero?  This would not introduce any additional atomic instructions
> on the fast path.

yes, that is possible. The main problem is the transition from lazy to 
non-lazy mode when the first exception is thrown. We must somehow stop 
the world for that without introducing an additional mutex. But I have 
thought about that some more, and that is possible too, by encoding a 
magic value as the version during the transition, which causes the other 
threads to block. A bit ugly, but manageable. I will implement that in a 
few days.
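
A minimal sketch of that transition, under the assumption of a reserved 
sentinel value (all names and values below are made up for illustration 
and are not from any posted patch):

#include <stdint.h>

/* Hypothetical sketch: a reserved "magic" version marks the
   lazy-to-eager transition; concurrent readers spin until a real
   version is published.  None of these names exist in libgcc.  */
#define TRANSITION_MAGIC ((uintptr_t) -1)

/* 0 = lazy mode, TRANSITION_MAGIC = transition in flight.  */
static uintptr_t table_version;

static void
wait_for_transition (void)
{
  uintptr_t v;
  do
    v = __atomic_load_n (&table_version, __ATOMIC_ACQUIRE);
  while (v == TRANSITION_MAGIC);  /* other threads block (spin) here */
}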

Independent of that I think we should improve the sort logic, as we 
still have to sort, even in lazy mode, at the latest when the first 
exception is thrown. I have sent a patch that significantly improves 
that step.

Best

Thomas


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH] initialize fde objects lazily
  2022-11-22  8:20           ` Florian Weimer
  2022-11-22  9:12             ` Thomas Neumann
@ 2022-12-09 17:34             ` Thomas Neumann
  2022-12-15 16:11               ` Tamar Christina
  2022-12-16 17:25               ` Jason Merrill
  2023-05-02 14:32             ` [PATCH] release the sorted FDE array when deregistering a frame [PR109685] Thomas Neumann
                               ` (3 subsequent siblings)
  5 siblings, 2 replies; 33+ messages in thread
From: Thomas Neumann @ 2022-12-09 17:34 UTC (permalink / raw)
  To: gcc-patches
  Cc: H.J. Lu, Jakub Jelinek, Tamar Christina, Jason Merrill,
	Jonathan Wakely, Florian Weimer

When registering an unwind frame with __register_frame_info_bases
we currently initialize that fde object eagerly. This has the
advantage that it is immutable afterwards and we can safely
access it from multiple threads, but it has the disadvantage
that we pay the initialization cost even if the application
never throws an exception.

This commit changes the logic to initialize the objects lazily.
The objects themselves are inserted into the b-tree when
registering the frame, but the sorted fde_vector is
not constructed yet. Only the first time that an
exception tries to pass through the registered code is the
object initialized. We detect that with double-checked locking,
first doing a relaxed load of the sorted bit and then re-checking
under a mutex when the object was not initialized yet.

Note that the check is implicitly safe with regard to a concurrent
frame deregistration, as trying to deregister a frame that is
on the unwinding path of a concurrent exception is inherently racy.

libgcc/ChangeLog:
         * unwind-dw2-fde.c: Initialize fde object lazily when
         the first exception tries to pass through.
---
  libgcc/unwind-dw2-fde.c | 52 ++++++++++++++++++++++++++++++++---------
  1 file changed, 41 insertions(+), 11 deletions(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 3c0cc654ec0..6f69c20ff4b 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -63,8 +63,6 @@ release_registered_frames (void)
  
  static void
  get_pc_range (const struct object *ob, uintptr_type *range);
-static void
-init_object (struct object *ob);
  
  #else
  /* Without fast path frame deregistration must always succeed.  */
@@ -76,6 +74,7 @@ static const int in_shutdown = 0;
     by decreasing value of pc_begin.  */
  static struct object *unseen_objects;
  static struct object *seen_objects;
+#endif
  
  #ifdef __GTHREAD_MUTEX_INIT
  static __gthread_mutex_t object_mutex = __GTHREAD_MUTEX_INIT;
@@ -103,7 +102,6 @@ init_object_mutex_once (void)
  static __gthread_mutex_t object_mutex;
  #endif
  #endif
-#endif
  
  /* Called from crtbegin.o to register the unwind info for an object.  */
  
@@ -126,10 +124,7 @@ __register_frame_info_bases (const void *begin, struct object *ob,
  #endif
  
  #ifdef ATOMIC_FDE_FAST_PATH
-  // Initialize eagerly to avoid locking later
-  init_object (ob);
-
-  // And register the frame
+  // Register the frame in the b-tree
    uintptr_type range[2];
    get_pc_range (ob, range);
    btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
@@ -180,10 +175,7 @@ __register_frame_info_table_bases (void *begin, struct object *ob,
    ob->s.b.encoding = DW_EH_PE_omit;
  
  #ifdef ATOMIC_FDE_FAST_PATH
-  // Initialize eagerly to avoid locking later
-  init_object (ob);
-
-  // And register the frame
+  // Register the frame in the b-tree
    uintptr_type range[2];
    get_pc_range (ob, range);
    btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
@@ -892,7 +884,15 @@ init_object (struct object* ob)
    accu.linear->orig_data = ob->u.single;
    ob->u.sort = accu.linear;
  
+#ifdef ATOMIC_FDE_FAST_PATH
+  // We must update the sorted bit with an atomic operation
+  struct object tmp;
+  tmp.s.b = ob->s.b;
+  tmp.s.b.sorted = 1;
+  __atomic_store (&(ob->s.b), &(tmp.s.b), __ATOMIC_SEQ_CST);
+#else
    ob->s.b.sorted = 1;
+#endif
  }
  
  #ifdef ATOMIC_FDE_FAST_PATH
@@ -1130,6 +1130,21 @@ search_object (struct object* ob, void *pc)
      }
  }
  
+#ifdef ATOMIC_FDE_FAST_PATH
+
+// Check if the object was already initialized
+static inline bool
+is_object_initialized (struct object *ob)
+{
+  // We have to use relaxed atomics for the read, which
+  // is a bit involved as we read from a bitfield
+  struct object tmp;
+  __atomic_load (&(ob->s.b), &(tmp.s.b), __ATOMIC_RELAXED);
+  return tmp.s.b.sorted;
+}
+
+#endif
+
  const fde *
  _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
  {
@@ -1141,6 +1156,21 @@ _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
    if (!ob)
      return NULL;
  
+  // Initialize the object lazily
+  if (!is_object_initialized (ob))
+    {
+      // Check again under mutex
+      init_object_mutex_once ();
+      __gthread_mutex_lock (&object_mutex);
+
+      if (!ob->s.b.sorted)
+	{
+	  init_object (ob);
+	}
+
+      __gthread_mutex_unlock (&object_mutex);
+    }
+
    f = search_object (ob, pc);
  #else
  
-- 
2.37.2


^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCH] initialize fde objects lazily
  2022-12-09 17:34             ` [PATCH] initialize fde objects lazily Thomas Neumann
@ 2022-12-15 16:11               ` Tamar Christina
  2022-12-16 17:25               ` Jason Merrill
  1 sibling, 0 replies; 33+ messages in thread
From: Tamar Christina @ 2022-12-15 16:11 UTC (permalink / raw)
  To: Thomas Neumann, gcc-patches
  Cc: H.J. Lu, Jakub Jelinek, Jason Merrill, Jonathan Wakely, Florian Weimer

Thanks Thomas!

We've tested it and this brings the startup time back down to a reasonable level!

We'd quite like to see this get into GCC 13.

Regards,
Tamar

> -----Original Message-----
> From: Thomas Neumann <thomas.neumann@in.tum.de>
> Sent: Friday, December 9, 2022 5:34 PM
> To: gcc-patches@gcc.gnu.org
> Cc: H.J. Lu <hjl.tools@gmail.com>; Jakub Jelinek <jakub@redhat.com>;
> Tamar Christina <Tamar.Christina@arm.com>; Jason Merrill
> <jason@redhat.com>; Jonathan Wakely <jwakely.gcc@gmail.com>; Florian
> Weimer <fweimer@redhat.com>
> Subject: [PATCH] initialize fde objects lazily
> 
> When registering an unwind frame with __register_frame_info_bases we
> currently initialize that fde object eagerly. This has the advantage that it is
> immutable afterwards and we can safely access it from multiple threads, but
> it has the disadvantage that we pay the initialization cost even if the
> application never throws an exception.
> 
> This commit changes the logic to initialize the objects lazily.
> The objects themselves are inserted into the b-tree when registering the
> frame, but the sorted fde_vector is not constructed yet. Only on the first
> time that an exception tries to pass through the registered code the object is
> initialized. We notice that with a double checking, first doing a relaxed load of
> the sorted bit and then re-checking under a mutex when the object was not
> initialized yet.
> 
> Note that the check must implicitly be safe concering a concurrent frame
> deregistration, as trying the deregister a frame that is on the unwinding path
> of a concurrent exception is inherently racy.
> 
> libgcc/ChangeLog:
>          * unwind-dw2-fde.c: Initialize fde object lazily when
>          the first exception tries to pass through.
> ---
>   libgcc/unwind-dw2-fde.c | 52 ++++++++++++++++++++++++++++++++-----
> ----
>   1 file changed, 41 insertions(+), 11 deletions(-)
> 
> diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c index
> 3c0cc654ec0..6f69c20ff4b 100644
> --- a/libgcc/unwind-dw2-fde.c
> +++ b/libgcc/unwind-dw2-fde.c
> @@ -63,8 +63,6 @@ release_registered_frames (void)
> 
>   static void
>   get_pc_range (const struct object *ob, uintptr_type *range); -static void -
> init_object (struct object *ob);
> 
>   #else
>   /* Without fast path frame deregistration must always succeed.  */ @@ -
> 76,6 +74,7 @@ static const int in_shutdown = 0;
>      by decreasing value of pc_begin.  */
>   static struct object *unseen_objects;
>   static struct object *seen_objects;
> +#endif
> 
>   #ifdef __GTHREAD_MUTEX_INIT
>   static __gthread_mutex_t object_mutex = __GTHREAD_MUTEX_INIT; @@
> -103,7 +102,6 @@ init_object_mutex_once (void)
>   static __gthread_mutex_t object_mutex;
>   #endif
>   #endif
> -#endif
> 
>   /* Called from crtbegin.o to register the unwind info for an object.  */
> 
> @@ -126,10 +124,7 @@ __register_frame_info_bases (const void *begin,
> struct object *ob,
>   #endif
> 
>   #ifdef ATOMIC_FDE_FAST_PATH
> -  // Initialize eagerly to avoid locking later
> -  init_object (ob);
> -
> -  // And register the frame
> +  // Register the frame in the b-tree
>     uintptr_type range[2];
>     get_pc_range (ob, range);
>     btree_insert (&registered_frames, range[0], range[1] - range[0], ob); @@
> -180,10 +175,7 @@ __register_frame_info_table_bases (void *begin, struct
> object *ob,
>     ob->s.b.encoding = DW_EH_PE_omit;
> 
>   #ifdef ATOMIC_FDE_FAST_PATH
> -  // Initialize eagerly to avoid locking later
> -  init_object (ob);
> -
> -  // And register the frame
> +  // Register the frame in the b-tree
>     uintptr_type range[2];
>     get_pc_range (ob, range);
>     btree_insert (&registered_frames, range[0], range[1] - range[0], ob); @@
> -892,7 +884,15 @@ init_object (struct object* ob)
>     accu.linear->orig_data = ob->u.single;
>     ob->u.sort = accu.linear;
> 
> +#ifdef ATOMIC_FDE_FAST_PATH
> +  // We must update the sorted bit with an atomic operation
> +  struct object tmp;
> +  tmp.s.b = ob->s.b;
> +  tmp.s.b.sorted = 1;
> +  __atomic_store (&(ob->s.b), &(tmp.s.b), __ATOMIC_SEQ_CST); #else
>     ob->s.b.sorted = 1;
> +#endif
>   }
> 
>   #ifdef ATOMIC_FDE_FAST_PATH
> @@ -1130,6 +1130,21 @@ search_object (struct object* ob, void *pc)
>       }
>   }
> 
> +#ifdef ATOMIC_FDE_FAST_PATH
> +
> +// Check if the object was already initialized static inline bool
> +is_object_initialized (struct object *ob) {
> +  // We have to use relaxed atomics for the read, which
> +  // is a bit involved as we read from a bitfield
> +  struct object tmp;
> +  __atomic_load (&(ob->s.b), &(tmp.s.b), __ATOMIC_RELAXED);
> +  return tmp.s.b.sorted;
> +}
> +
> +#endif
> +
>   const fde *
>   _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
>   {
> @@ -1141,6 +1156,21 @@ _Unwind_Find_FDE (void *pc, struct
> dwarf_eh_bases *bases)
>     if (!ob)
>       return NULL;
> 
> +  // Initialize the object lazily
> +  if (!is_object_initialized (ob))
> +    {
> +      // Check again under mutex
> +      init_object_mutex_once ();
> +      __gthread_mutex_lock (&object_mutex);
> +
> +      if (!ob->s.b.sorted)
> +	{
> +	  init_object (ob);
> +	}
> +
> +      __gthread_mutex_unlock (&object_mutex);
> +    }
> +
>     f = search_object (ob, pc);
>   #else
> 
> --
> 2.37.2


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] initialize fde objects lazily
  2022-12-09 17:34             ` [PATCH] initialize fde objects lazily Thomas Neumann
  2022-12-15 16:11               ` Tamar Christina
@ 2022-12-16 17:25               ` Jason Merrill
  1 sibling, 0 replies; 33+ messages in thread
From: Jason Merrill @ 2022-12-16 17:25 UTC (permalink / raw)
  To: Thomas Neumann, gcc-patches
  Cc: H.J. Lu, Jakub Jelinek, Tamar Christina, Jonathan Wakely, Florian Weimer

On 12/9/22 12:34, Thomas Neumann wrote:
> When registering an unwind frame with __register_frame_info_bases
> we currently initialize that fde object eagerly. This has the
> advantage that it is immutable afterwards and we can safely
> access it from multiple threads, but it has the disadvantage
> that we pay the initialization cost even if the application
> never throws an exception.
> 
> This commit changes the logic to initialize the objects lazily.
> The objects themselves are inserted into the b-tree when
> registering the frame, but the sorted fde_vector is
> not constructed yet. Only on the first time that an
> exception tries to pass through the registered code the
> object is initialized. We notice that with a double checking,
> first doing a relaxed load of the sorted bit and then re-checking
> under a mutex when the object was not initialized yet.
> 
> Note that the check must implicitly be safe concering a concurrent
> frame deregistration, as trying the deregister a frame that is
> on the unwinding path of a concurrent exception is inherently racy.

OK, thanks.

> libgcc/ChangeLog:
>          * unwind-dw2-fde.c: Initialize fde object lazily when
>          the first exception tries to pass through.
> ---
>   libgcc/unwind-dw2-fde.c | 52 ++++++++++++++++++++++++++++++++---------
>   1 file changed, 41 insertions(+), 11 deletions(-)
> 
> diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
> index 3c0cc654ec0..6f69c20ff4b 100644
> --- a/libgcc/unwind-dw2-fde.c
> +++ b/libgcc/unwind-dw2-fde.c
> @@ -63,8 +63,6 @@ release_registered_frames (void)
> 
>   static void
>   get_pc_range (const struct object *ob, uintptr_type *range);
> -static void
> -init_object (struct object *ob);
> 
>   #else
>   /* Without fast path frame deregistration must always succeed.  */
> @@ -76,6 +74,7 @@ static const int in_shutdown = 0;
>      by decreasing value of pc_begin.  */
>   static struct object *unseen_objects;
>   static struct object *seen_objects;
> +#endif
> 
>   #ifdef __GTHREAD_MUTEX_INIT
>   static __gthread_mutex_t object_mutex = __GTHREAD_MUTEX_INIT;
> @@ -103,7 +102,6 @@ init_object_mutex_once (void)
>   static __gthread_mutex_t object_mutex;
>   #endif
>   #endif
> -#endif
> 
>   /* Called from crtbegin.o to register the unwind info for an object.  */
> 
> @@ -126,10 +124,7 @@ __register_frame_info_bases (const void *begin, 
> struct object *ob,
>   #endif
> 
>   #ifdef ATOMIC_FDE_FAST_PATH
> -  // Initialize eagerly to avoid locking later
> -  init_object (ob);
> -
> -  // And register the frame
> +  // Register the frame in the b-tree
>     uintptr_type range[2];
>     get_pc_range (ob, range);
>     btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
> @@ -180,10 +175,7 @@ __register_frame_info_table_bases (void *begin, 
> struct object *ob,
>     ob->s.b.encoding = DW_EH_PE_omit;
> 
>   #ifdef ATOMIC_FDE_FAST_PATH
> -  // Initialize eagerly to avoid locking later
> -  init_object (ob);
> -
> -  // And register the frame
> +  // Register the frame in the b-tree
>     uintptr_type range[2];
>     get_pc_range (ob, range);
>     btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
> @@ -892,7 +884,15 @@ init_object (struct object* ob)
>     accu.linear->orig_data = ob->u.single;
>     ob->u.sort = accu.linear;
> 
> +#ifdef ATOMIC_FDE_FAST_PATH
> +  // We must update the sorted bit with an atomic operation
> +  struct object tmp;
> +  tmp.s.b = ob->s.b;
> +  tmp.s.b.sorted = 1;
> +  __atomic_store (&(ob->s.b), &(tmp.s.b), __ATOMIC_SEQ_CST);
> +#else
>     ob->s.b.sorted = 1;
> +#endif
>   }
> 
>   #ifdef ATOMIC_FDE_FAST_PATH
> @@ -1130,6 +1130,21 @@ search_object (struct object* ob, void *pc)
>       }
>   }
> 
> +#ifdef ATOMIC_FDE_FAST_PATH
> +
> +// Check if the object was already initialized
> +static inline bool
> +is_object_initialized (struct object *ob)
> +{
> +  // We have to use relaxed atomics for the read, which
> +  // is a bit involved as we read from a bitfield
> +  struct object tmp;
> +  __atomic_load (&(ob->s.b), &(tmp.s.b), __ATOMIC_RELAXED);
> +  return tmp.s.b.sorted;
> +}
> +
> +#endif
> +
>   const fde *
>   _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
>   {
> @@ -1141,6 +1156,21 @@ _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases 
> *bases)
>     if (!ob)
>       return NULL;
> 
> +  // Initialize the object lazily
> +  if (!is_object_initialized (ob))
> +    {
> +      // Check again under mutex
> +      init_object_mutex_once ();
> +      __gthread_mutex_lock (&object_mutex);
> +
> +      if (!ob->s.b.sorted)
> +    {
> +      init_object (ob);
> +    }
> +
> +      __gthread_mutex_unlock (&object_mutex);
> +    }
> +
>     f = search_object (ob, pc);
>   #else
> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] speed up end_fde_sort using radix sort
  2022-11-22  8:00         ` [PATCH] speed up end_fde_sort using radix sort Thomas Neumann
@ 2022-12-16 18:02           ` Jason Merrill
  0 siblings, 0 replies; 33+ messages in thread
From: Jason Merrill @ 2022-12-16 18:02 UTC (permalink / raw)
  To: Thomas Neumann, gcc-patches
  Cc: Tamar Christina, Florian Weimer, Jonathan Wakely, H.J. Lu, Jakub Jelinek

On 11/22/22 03:00, Thomas Neumann wrote:
> When registering a dynamic unwinding frame the fde list is sorted.
> Previously, we split the list into a sorted and an unsorted part,
> sorted the later using heap sort, and merged both. That can be
> quite slow due to the large number of (expensive) comparisons.
> 
> This patch replaces that logic with a radix sort instead. The
> radix sort uses the same amount of memory as the old logic,
> using the second list as auxiliary space, and it includes two
> techniques to speed up sorting: First, it computes the pointer
> addresses for blocks of values, reducing the decoding overhead.
> And it recognizes when the data has reached a sorted state,
> allowing for early termination. When running out of memory
> we fall back to pure heap sort, as before.
> 
> For this test program
> 
> #include <cstdio>
> int main(int argc, char** argv) {
>       return 0;
> }
> 
> compiled with g++ -O -o hello -static hello.c we get with
> perf stat -r 200 on a 5950X the following performance numbers:
> 
> old logic:
> 
>                0,20 msec task-clock
>             930.834      cycles
>           3.079.765      instructions
>          0,00030478 +- 0,00000237 seconds time elapsed
> 
> new logic:
> 
>                0,10 msec task-clock
>             473.269      cycles
>           1.239.077      instructions
>          0,00021119 +- 0,00000168 seconds time elapsed
> 
> libgcc/ChangeLog:
>          * unwind-dw2-fde.c: Use radix sort instead of split+sort+merge.
> ---
>   libgcc/unwind-dw2-fde.c | 234 +++++++++++++++++++++++-----------------
>   1 file changed, 134 insertions(+), 100 deletions(-)
> 
> diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
> index 3c0cc654ec0..b81540c41a4 100644
> --- a/libgcc/unwind-dw2-fde.c
> +++ b/libgcc/unwind-dw2-fde.c
> @@ -456,22 +456,52 @@ fde_mixed_encoding_compare (struct object *ob, 
> const fde *x, const fde *y)
> 
>   typedef int (*fde_compare_t) (struct object *, const fde *, const fde *);
> 
> +// The extractor functions compute the pointer values for a block of
> +// fdes. The block processing hides the call overhead.
> 
> -/* This is a special mix of insertion sort and heap sort, optimized for
> -   the data sets that actually occur. They look like
> -   101 102 103 127 128 105 108 110 190 111 115 119 125 160 126 129 130.
> -   I.e. a linearly increasing sequence (coming from functions in the text
> -   section), with additionally a few unordered elements (coming from 
> functions
> -   in gnu_linkonce sections) whose values are higher than the values in 
> the
> -   surrounding linear sequence (but not necessarily higher than the values
> -   at the end of the linear sequence!).
> -   The worst-case total run time is O(N) + O(n log (n)), where N is the
> -   total number of FDEs and n is the number of erratic ones.  */
> +static void
> +fde_unencoded_extract (struct object *ob __attribute__ ((unused)),
> +               _Unwind_Ptr *target, const fde **x, int count)
> +{
> +  for (int index = 0; index < count; ++index)
> +    memcpy (target + index, x[index]->pc_begin, sizeof (_Unwind_Ptr));
> +}
> +
> +static void
> +fde_single_encoding_extract (struct object *ob, _Unwind_Ptr *target,
> +                 const fde **x, int count)
> +{
> +  _Unwind_Ptr base;
> +
> +  base = base_from_object (ob->s.b.encoding, ob);
> +  for (int index = 0; index < count; ++index)
> +    read_encoded_value_with_base (ob->s.b.encoding, base, 
> x[index]->pc_begin,
> +                  target + index);
> +}
> +
> +static void
> +fde_mixed_encoding_extract (struct object *ob, _Unwind_Ptr *target,
> +                const fde **x, int count)
> +{
> +  for (int index = 0; index < count; ++index)
> +    {
> +      int encoding = get_fde_encoding (x[index]);
> +      read_encoded_value_with_base (encoding, base_from_object 
> (encoding, ob),
> +                    x[index]->pc_begin, target + index);
> +    }
> +}
> +
> +typedef void (*fde_extractor_t) (struct object *, _Unwind_Ptr *, const 
> fde **,
> +                 int);
> +
> +// Data is is sorted using radix sort if possible, using an temporary
> +// auxiliary data structure of the same size as the input. When running
> +// out of memory to in-place heap sort.

s/to/do/ ?

OK with that change or other fix to that sentence.

>   struct fde_accumulator
>   {
>     struct fde_vector *linear;
> -  struct fde_vector *erratic;
> +  struct fde_vector *aux;
>   };
> 
>   static inline int
> @@ -485,8 +515,8 @@ start_fde_sort (struct fde_accumulator *accu, size_t 
> count)
>     if ((accu->linear = malloc (size)))
>       {
>         accu->linear->count = 0;
> -      if ((accu->erratic = malloc (size)))
> -    accu->erratic->count = 0;
> +      if ((accu->aux = malloc (size)))
> +    accu->aux->count = 0;
>         return 1;
>       }
>     else
> @@ -500,59 +530,6 @@ fde_insert (struct fde_accumulator *accu, const fde 
> *this_fde)
>       accu->linear->array[accu->linear->count++] = this_fde;
>   }
> 
> -/* Split LINEAR into a linear sequence with low values and an erratic
> -   sequence with high values, put the linear one (of longest possible
> -   length) into LINEAR and the erratic one into ERRATIC. This is O(N).
> -
> -   Because the longest linear sequence we are trying to locate within the
> -   incoming LINEAR array can be interspersed with (high valued) erratic
> -   entries.  We construct a chain indicating the sequenced entries.
> -   To avoid having to allocate this chain, we overlay it onto the space of
> -   the ERRATIC array during construction.  A final pass iterates over the
> -   chain to determine what should be placed in the ERRATIC array, and
> -   what is the linear sequence.  This overlay is safe from aliasing.  */
> -
> -static inline void
> -fde_split (struct object *ob, fde_compare_t fde_compare,
> -       struct fde_vector *linear, struct fde_vector *erratic)
> -{
> -  static const fde *marker;
> -  size_t count = linear->count;
> -  const fde *const *chain_end = &marker;
> -  size_t i, j, k;
> -
> -  /* This should optimize out, but it is wise to make sure this assumption
> -     is correct. Should these have different sizes, we cannot cast between
> -     them and the overlaying onto ERRATIC will not work.  */
> -  gcc_assert (sizeof (const fde *) == sizeof (const fde **));
> -
> -  for (i = 0; i < count; i++)
> -    {
> -      const fde *const *probe;
> -
> -      for (probe = chain_end;
> -       probe != &marker && fde_compare (ob, linear->array[i], *probe) < 0;
> -       probe = chain_end)
> -    {
> -      chain_end = (const fde *const*) erratic->array[probe - 
> linear->array];
> -      erratic->array[probe - linear->array] = NULL;
> -    }
> -      erratic->array[i] = (const fde *) chain_end;
> -      chain_end = &linear->array[i];
> -    }
> -
> -  /* Each entry in LINEAR which is part of the linear sequence we have
> -     discovered will correspond to a non-NULL entry in the chain we 
> built in
> -     the ERRATIC array.  */
> -  for (i = j = k = 0; i < count; i++)
> -    if (erratic->array[i])
> -      linear->array[j++] = linear->array[i];
> -    else
> -      erratic->array[k++] = linear->array[i];
> -  linear->count = j;
> -  erratic->count = k;
> -}
> -
>   #define SWAP(x,y) do { const fde * tmp = x; x = y; y = tmp; } while (0)
> 
>   /* Convert a semi-heap to a heap.  A semi-heap is a heap except possibly
> @@ -615,59 +592,116 @@ frame_heapsort (struct object *ob, fde_compare_t 
> fde_compare,
>   #undef SWAP
>   }
> 
> -/* Merge V1 and V2, both sorted, and put the result into V1.  */
> +// Radix sort data in V1 using V2 as aux memory. Runtime O(n).
>   static inline void
> -fde_merge (struct object *ob, fde_compare_t fde_compare,
> -       struct fde_vector *v1, struct fde_vector *v2)
> +fde_radixsort (struct object *ob, fde_extractor_t fde_extractor,
> +           struct fde_vector *v1, struct fde_vector *v2)
>   {
> -  size_t i1, i2;
> -  const fde * fde2;
> -
> -  i2 = v2->count;
> -  if (i2 > 0)
> +#define FANOUTBITS 8
> +#define FANOUT (1 << FANOUTBITS)
> +#define BLOCKSIZE 128
> +  const unsigned rounds
> +    = (__CHAR_BIT__ * sizeof (_Unwind_Ptr) + FANOUTBITS - 1) / FANOUTBITS;
> +  const fde **a1 = v1->array, **a2 = v2->array;
> +  _Unwind_Ptr ptrs[BLOCKSIZE + 1];
> +  unsigned n = v1->count;
> +  for (unsigned round = 0; round != rounds; ++round)
>       {
> -      i1 = v1->count;
> -      do
> +      unsigned counts[FANOUT] = {0};
> +      unsigned violations = 0;
> +
> +      // Count the number of elements per bucket and check if we are 
> already
> +      // sorted.
> +      _Unwind_Ptr last = 0;
> +      for (unsigned i = 0; i < n;)
>       {
> -      i2--;
> -      fde2 = v2->array[i2];
> -      while (i1 > 0 && fde_compare (ob, v1->array[i1-1], fde2) > 0)
> +      unsigned chunk = ((n - i) <= BLOCKSIZE) ? (n - i) : BLOCKSIZE;
> +      fde_extractor (ob, ptrs + 1, a1 + i, chunk);
> +      ptrs[0] = last;
> +      for (unsigned j = 0; j < chunk; ++j)
>           {
> -          v1->array[i1+i2] = v1->array[i1-1];
> -          i1--;
> +          unsigned b = (ptrs[j + 1] >> (round * FANOUTBITS)) & (FANOUT 
> - 1);
> +          counts[b]++;
> +          // Use summation instead of an if to eliminate branches.
> +          violations += ptrs[j + 1] < ptrs[j];
>           }
> -      v1->array[i1+i2] = fde2;
> +      i += chunk;
> +      last = ptrs[chunk];
>       }
> -      while (i2 > 0);
> -      v1->count += v2->count;
> +
> +      // Stop if we are already sorted.
> +      if (!violations)
> +    {
> +      // The sorted data is in a1 now.
> +      a2 = a1;
> +      break;
> +    }
> +
> +      // Compute the prefix sum.
> +      unsigned sum = 0;
> +      for (unsigned i = 0; i != FANOUT; ++i)
> +    {
> +      unsigned s = sum;
> +      sum += counts[i];
> +      counts[i] = s;
> +    }
> +
> +      // Place all elements.
> +      for (unsigned i = 0; i < n;)
> +    {
> +      unsigned chunk = ((n - i) <= BLOCKSIZE) ? (n - i) : BLOCKSIZE;
> +      fde_extractor (ob, ptrs, a1 + i, chunk);
> +      for (unsigned j = 0; j < chunk; ++j)
> +        {
> +          unsigned b = (ptrs[j] >> (round * FANOUTBITS)) & (FANOUT - 1);
> +          a2[counts[b]++] = a1[i + j];
> +        }
> +      i += chunk;
> +    }
> +
> +      // Swap a1 and a2.
> +      const fde **tmp = a1;
> +      a1 = a2;
> +      a2 = tmp;
>       }
> +#undef BLOCKSIZE
> +#undef FANOUT
> +#undef FANOUTBITS
> +
> +  // The data is in a2 now, move in place if needed.
> +  if (a2 != v1->array)
> +    memcpy (v1->array, a2, sizeof (const fde *) * n);
>   }
> 
>   static inline void
>   end_fde_sort (struct object *ob, struct fde_accumulator *accu, size_t 
> count)
>   {
> -  fde_compare_t fde_compare;
> -
>     gcc_assert (!accu->linear || accu->linear->count == count);
> 
> -  if (ob->s.b.mixed_encoding)
> -    fde_compare = fde_mixed_encoding_compare;
> -  else if (ob->s.b.encoding == DW_EH_PE_absptr)
> -    fde_compare = fde_unencoded_compare;
> -  else
> -    fde_compare = fde_single_encoding_compare;
> -
> -  if (accu->erratic)
> +  if (accu->aux)
>       {
> -      fde_split (ob, fde_compare, accu->linear, accu->erratic);
> -      gcc_assert (accu->linear->count + accu->erratic->count == count);
> -      frame_heapsort (ob, fde_compare, accu->erratic);
> -      fde_merge (ob, fde_compare, accu->linear, accu->erratic);
> -      free (accu->erratic);
> +      fde_extractor_t fde_extractor;
> +      if (ob->s.b.mixed_encoding)
> +    fde_extractor = fde_mixed_encoding_extract;
> +      else if (ob->s.b.encoding == DW_EH_PE_absptr)
> +    fde_extractor = fde_unencoded_extract;
> +      else
> +    fde_extractor = fde_single_encoding_extract;
> +
> +      fde_radixsort (ob, fde_extractor, accu->linear, accu->aux);
> +      free (accu->aux);
>       }
>     else
>       {
> -      /* We've not managed to malloc an erratic array,
> +      fde_compare_t fde_compare;
> +      if (ob->s.b.mixed_encoding)
> +    fde_compare = fde_mixed_encoding_compare;
> +      else if (ob->s.b.encoding == DW_EH_PE_absptr)
> +    fde_compare = fde_unencoded_compare;
> +      else
> +    fde_compare = fde_single_encoding_compare;
> +
> +      /* We've not managed to malloc an aux array,
>        so heap sort in the linear one.  */
>         frame_heapsort (ob, fde_compare, accu->linear);
>       }


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH] release the sorted FDE array when deregistering a frame [PR109685]
  2022-11-22  8:20           ` Florian Weimer
  2022-11-22  9:12             ` Thomas Neumann
  2022-12-09 17:34             ` [PATCH] initialize fde objects lazily Thomas Neumann
@ 2023-05-02 14:32             ` Thomas Neumann
  2023-05-10 10:49             ` [PATCH] fix radix sort on 32bit platforms [PR109670] Thomas Neumann
                               ` (2 subsequent siblings)
  5 siblings, 0 replies; 33+ messages in thread
From: Thomas Neumann @ 2023-05-02 14:32 UTC (permalink / raw)
  To: gcc-patches; +Cc: Jakub Jelinek, markus.boeck02

The atomic fastpath bypasses the code that releases the sort
array which was lazily allocated during unwinding. We now
check after deregistering if there is an array to free.

libgcc/ChangeLog:
	* unwind-dw2-fde.c: Free sort array in atomic fast path.
---
  libgcc/unwind-dw2-fde.c | 6 ++++++
  1 file changed, 6 insertions(+)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 7b74c391ced..4d2737ff9f7 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -240,6 +240,12 @@ __deregister_frame_info_bases (const void *begin)

    // And remove
    ob = btree_remove (&registered_frames, range[0]);
+
+  // Deallocate the sort array if any.
+  if (ob && ob->s.b.sorted)
+    {
+      free (ob->u.sort);
+    }
  #else
    init_object_mutex_once ();
    __gthread_mutex_lock (&object_mutex);
-- 
2.39.2


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH] fix radix sort on 32bit platforms [PR109670]
  2022-11-22  8:20           ` Florian Weimer
                               ` (2 preceding siblings ...)
  2023-05-02 14:32             ` [PATCH] release the sorted FDE array when deregistering a frame [PR109685] Thomas Neumann
@ 2023-05-10 10:49             ` Thomas Neumann
  2023-08-10 11:33             ` [PATCH] preserve base pointer for __deregister_frame [PR110956] Thomas Neumann
  2024-03-15 10:29             ` [PATCH] handle unwind tables that are embedded within unwinding code, [PR111731] Thomas Neumann
  5 siblings, 0 replies; 33+ messages in thread
From: Thomas Neumann @ 2023-05-10 10:49 UTC (permalink / raw)
  To: gcc-patches; +Cc: Jakub Jelinek, Eric Botcazou

The radix sort uses two buffers, a1 for input and a2 for output.
After every digit the role of the two buffers is swapped.
When terminating the sort early the code made sure the output
was in a2.  However, when we run out of bits, as can happen on
32bit platforms, the sorted result was in a1, as we had just
swapped a1 and a2.
This patch fixes the problem by unconditionally having a1 as
output after every loop iteration.

This bug manifested itself only on 32bit platforms and even then
only in some circumstances, as it needs frames where a swap
is required due to differences in the top-most byte, which is
affected by ASLR. The new logic was validated by exhaustive
search over 32bit input values.

libgcc/ChangeLog:
         * unwind-dw2-fde.c: Fix radix sort buffer management.
---
  libgcc/unwind-dw2-fde.c | 8 +++-----
  1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 7b74c391ced..31a3834156b 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -624,8 +624,6 @@ fde_radixsort (struct object *ob, fde_extractor_t 
fde_extractor,
        // Stop if we are already sorted.
        if (!violations)
  	{
-	  // The sorted data is in a1 now.
-	  a2 = a1;
  	  break;
  	}

@@ -660,9 +658,9 @@ fde_radixsort (struct object *ob, fde_extractor_t 
fde_extractor,
  #undef FANOUT
  #undef FANOUTBITS

-  // The data is in a2 now, move in place if needed.
-  if (a2 != v1->array)
-    memcpy (v1->array, a2, sizeof (const fde *) * n);
+  // The data is in a1 now, move in place if needed.
+  if (a1 != v1->array)
+    memcpy (v1->array, a1, sizeof (const fde *) * n);
  }

  static inline void
-- 
2.39.2


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH] preserve base pointer for __deregister_frame [PR110956]
  2022-11-22  8:20           ` Florian Weimer
                               ` (3 preceding siblings ...)
  2023-05-10 10:49             ` [PATCH] fix radix sort on 32bit platforms [PR109670] Thomas Neumann
@ 2023-08-10 11:33             ` Thomas Neumann
  2023-08-11 15:21               ` Jeff Law
  2024-03-15 10:29             ` [PATCH] handle unwind tables that are embedded within unwinding code, [PR111731] Thomas Neumann
  5 siblings, 1 reply; 33+ messages in thread
From: Thomas Neumann @ 2023-08-10 11:33 UTC (permalink / raw)
  To: gcc-patches; +Cc: Jakub Jelinek, Rainer Orth

Original bug report: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110956
Rainer Orth successfully tested the patch on Solaris with a full bootstrap.



Some uncommon unwinding table encodings need to access the base pointer
for address computations. We do not have that information in calls to
__deregister_frame_info_bases, and previously simply used nullptr as
base pointer. That is usually fine, but for some Solaris i386 shared
libraries that results in wrong address computations.

To fix this problem we now associate the unwinding object with
the table pointer itself, which is always known, in addition to
the PC range. When deregistering a frame, we first locate the object
using the table pointer, and then use the base pointer stored within
the object to compute the PC range.

libgcc/ChangeLog:
	PR libgcc/110956
	* unwind-dw2-fde.c: Associate object with address of unwinding
	table.
---
  libgcc/unwind-dw2-fde.c | 34 +++++++++++++++++-----------------
  1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index d7c4a467754..ae4530179f3 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -124,6 +124,9 @@ __register_frame_info_bases (const void *begin, struct object *ob,
  #endif
  
  #ifdef ATOMIC_FDE_FAST_PATH
+  // Register the object itself to know the base pointer on deregistration.
+  btree_insert (&registered_frames, (uintptr_type) begin, 1, ob);
+
    // Register the frame in the b-tree
    uintptr_type range[2];
    get_pc_range (ob, range);
@@ -175,6 +178,9 @@ __register_frame_info_table_bases (void *begin, struct object *ob,
    ob->s.b.encoding = DW_EH_PE_omit;
  
  #ifdef ATOMIC_FDE_FAST_PATH
+  // Register the object itself to know the base pointer on deregistration.
+  btree_insert (&registered_frames, (uintptr_type) begin, 1, ob);
+
    // Register the frame in the b-tree
    uintptr_type range[2];
    get_pc_range (ob, range);
@@ -225,22 +231,17 @@ __deregister_frame_info_bases (const void *begin)
      return ob;
  
  #ifdef ATOMIC_FDE_FAST_PATH
-  // Find the corresponding PC range
-  struct object lookupob;
-  lookupob.tbase = 0;
-  lookupob.dbase = 0;
-  lookupob.u.single = begin;
-  lookupob.s.i = 0;
-  lookupob.s.b.encoding = DW_EH_PE_omit;
-#ifdef DWARF2_OBJECT_END_PTR_EXTENSION
-  lookupob.fde_end = NULL;
-#endif
-  uintptr_type range[2];
-  get_pc_range (&lookupob, range);
+  // Find the originally registered object to get the base pointer.
+  ob = btree_remove (&registered_frames, (uintptr_type) begin);
  
-  // And remove
-  ob = btree_remove (&registered_frames, range[0]);
-  bool empty_table = (range[1] - range[0]) == 0;
+  // Remove the corresponding PC range.
+  if (ob)
+    {
+      uintptr_type range[2];
+      get_pc_range (ob, range);
+      if (range[0] != range[1])
+	btree_remove (&registered_frames, range[0]);
+    }
  
    // Deallocate the sort array if any.
    if (ob && ob->s.b.sorted)
@@ -283,12 +284,11 @@ __deregister_frame_info_bases (const void *begin)
  
   out:
    __gthread_mutex_unlock (&object_mutex);
-  const int empty_table = 0; // The non-atomic path stores all tables.
  #endif
  
    // If we didn't find anything in the lookup data structures then they
    // were either already destroyed or we tried to remove an empty range.
-  gcc_assert (in_shutdown || (empty_table || ob));
+  gcc_assert (in_shutdown || ob);
    return (void *) ob;
  }
  
-- 
2.39.2


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] preserve base pointer for __deregister_frame [PR110956]
  2023-08-10 11:33             ` [PATCH] preserve base pointer for __deregister_frame [PR110956] Thomas Neumann
@ 2023-08-11 15:21               ` Jeff Law
  0 siblings, 0 replies; 33+ messages in thread
From: Jeff Law @ 2023-08-11 15:21 UTC (permalink / raw)
  To: Thomas Neumann, gcc-patches; +Cc: Jakub Jelinek, Rainer Orth



On 8/10/23 05:33, Thomas Neumann via Gcc-patches wrote:
> Original bug report: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110956
> Rainer Orth successfully tested the patch on Solaris with a full bootstrap.
> 
> 
> 
> Some uncommon unwinding table encodings need to access the base pointer
> for address computations. We do not have that information in calls to
> __deregister_frame_info_bases, and previously simply used nullptr as
> base pointer. That is usually fine, but for some Solaris i386 shared
> libraries that results in wrong address computations.
> 
> To fix this problem we now associate the unwinding object with
> the table pointer itself, which is always known, in addition to
> the PC range. When deregistering a frame, we first locate the object
> using the table pointer, and then use the base pointer stored within
> the object to compute the PC range.
> 
> libgcc/ChangeLog:
>      PR libgcc/110956
>      * unwind-dw2-fde.c: Associate object with address of unwinding
>      table.
Pushed to the trunk.  Thanks.

Jeff


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH] handle unwind tables that are embedded within unwinding code, [PR111731]
  2022-11-22  8:20           ` Florian Weimer
                               ` (4 preceding siblings ...)
  2023-08-10 11:33             ` [PATCH] preserve base pointer for __deregister_frame [PR110956] Thomas Neumann
@ 2024-03-15 10:29             ` Thomas Neumann
  2024-03-20  8:25               ` Richard Biener
                                 ` (2 more replies)
  5 siblings, 3 replies; 33+ messages in thread
From: Thomas Neumann @ 2024-03-15 10:29 UTC (permalink / raw)
  To: gcc-patches; +Cc: Jakub Jelinek

Original bug report: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111731
Given that this is a regression, is this okay for gcc 13 and mainline?

The unwinding mechanism registers both the code range and the unwind
table itself within a b-tree lookup structure. That data structure
assumes that is consists of non-overlappping intervals. This
becomes a problem if the unwinding table is embedded within the
code itself, as now the intervals do overlap.

To fix this problem we now keep the unwind tables in a separate
b-tree, which prevents the overlap.

libgcc/ChangeLog:
	PR libgcc/111731
	* unwind-dw2-fde.c: Split unwind ranges if they contain the
	unwind table.
---
  libgcc/unwind-dw2-fde.c | 37 +++++++++++++++++++++----------------
  1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 61a578d097e..9d503545677 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -48,6 +48,7 @@ typedef __UINTPTR_TYPE__ uintptr_type;
  #include "unwind-dw2-btree.h"
  
  static struct btree registered_frames;
+static struct btree registered_objects;
  static bool in_shutdown;
  
  static void
@@ -58,6 +59,7 @@ release_registered_frames (void)
    /* Release the b-tree and all frames. Frame releases that happen later are
     * silently ignored */
    btree_destroy (&registered_frames);
+  btree_destroy (&registered_objects);
    in_shutdown = true;
  }
  
@@ -103,6 +105,21 @@ static __gthread_mutex_t object_mutex;
  #endif
  #endif
  
+#ifdef ATOMIC_FDE_FAST_PATH
+// Register the pc range for a given object in the lookup structure.
+static void
+register_pc_range_for_object (uintptr_type begin, struct object *ob)
+{
+  // Register the object itself to know the base pointer on deregistration.
+  btree_insert (&registered_objects, begin, 1, ob);
+
+  // Register the frame in the b-tree
+  uintptr_type range[2];
+  get_pc_range (ob, range);
+  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
+}
+#endif
+
  /* Called from crtbegin.o to register the unwind info for an object.  */
  
  void
@@ -124,13 +141,7 @@ __register_frame_info_bases (const void *begin, struct object *ob,
  #endif
  
  #ifdef ATOMIC_FDE_FAST_PATH
-  // Register the object itself to know the base pointer on deregistration.
-  btree_insert (&registered_frames, (uintptr_type) begin, 1, ob);
-
-  // Register the frame in the b-tree
-  uintptr_type range[2];
-  get_pc_range (ob, range);
-  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
+  register_pc_range_for_object ((uintptr_type) begin, ob);
  #else
    init_object_mutex_once ();
    __gthread_mutex_lock (&object_mutex);
@@ -178,13 +189,7 @@ __register_frame_info_table_bases (void *begin, struct object *ob,
    ob->s.b.encoding = DW_EH_PE_omit;
  
  #ifdef ATOMIC_FDE_FAST_PATH
-  // Register the object itself to know the base pointer on deregistration.
-  btree_insert (&registered_frames, (uintptr_type) begin, 1, ob);
-
-  // Register the frame in the b-tree
-  uintptr_type range[2];
-  get_pc_range (ob, range);
-  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
+  register_pc_range_for_object ((uintptr_type) begin, ob);
  #else
    init_object_mutex_once ();
    __gthread_mutex_lock (&object_mutex);
@@ -232,7 +237,7 @@ __deregister_frame_info_bases (const void *begin)
  
  #ifdef ATOMIC_FDE_FAST_PATH
    // Find the originally registered object to get the base pointer.
-  ob = btree_remove (&registered_frames, (uintptr_type) begin);
+  ob = btree_remove (&registered_objects, (uintptr_type) begin);
  
    // Remove the corresponding PC range.
    if (ob)
@@ -240,7 +245,7 @@ __deregister_frame_info_bases (const void *begin)
        uintptr_type range[2];
        get_pc_range (ob, range);
        if (range[0] != range[1])
-    btree_remove (&registered_frames, range[0]);
+	btree_remove (&registered_frames, range[0]);
      }
  
    // Deallocate the sort array if any.
-- 
2.43.0
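
A standalone toy model of the problem this patch addresses: a lookup structure of non-overlapping intervals cannot hold both a code range and an unwind table that lies inside that range, while splitting the entries across two structures removes the conflict. The toy_* names, the linear-scan interval set and the addresses are illustrative assumptions; the real code uses the lock-free b-tree from unwind-dw2-btree.h.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct toy_interval
{
  uintptr_t base, size;
};

/* Toy interval set that, like the unwinder's lookup structure, relies on
   its entries never overlapping; insertion fails otherwise.  */
struct toy_set
{
  struct toy_interval iv[8];
  size_t n;
};

static bool
toy_insert (struct toy_set *s, uintptr_t base, uintptr_t size)
{
  for (size_t i = 0; i < s->n; i++)
    if (base < s->iv[i].base + s->iv[i].size
	&& s->iv[i].base < base + size)
      return false;  /* would overlap an existing entry */
  if (s->n == 8)
    return false;    /* toy capacity limit */
  s->iv[s->n].base = base;
  s->iv[s->n].size = size;
  s->n++;
  return true;
}

int
main (void)
{
  /* JITed code at [0x1000, 0x3000) whose unwind table sits at 0x1800,
     i.e. inside the code range itself.  */
  uintptr_t code = 0x1000, code_size = 0x2000, table = 0x1800;

  struct toy_set one = { .n = 0 };
  bool code_ok = toy_insert (&one, code, code_size);
  bool table_ok = toy_insert (&one, table, 1);
  printf ("single structure: code %d, table %d\n", code_ok, table_ok);
  /* prints "code 1, table 0": the embedded table entry is rejected */

  struct toy_set frames = { .n = 0 }, objects = { .n = 0 };
  code_ok = toy_insert (&frames, code, code_size);
  table_ok = toy_insert (&objects, table, 1);
  printf ("split structures: code %d, table %d\n", code_ok, table_ok);
  /* prints "code 1, table 1": neither structure sees an overlap */
  return 0;
}

This mirrors the registered_frames/registered_objects split above: each tree on its own still contains only non-overlapping intervals, so the PC range of code with an embedded table can coexist with the table-address entry used for deregistration.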



* Re: [PATCH] handle unwind tables that are embedded within unwinding code, [PR111731]
  2024-03-15 10:29             ` [PATCH] handle unwind tables that are embedded within unwinding code, [PR111731] Thomas Neumann
@ 2024-03-20  8:25               ` Richard Biener
  2024-03-22 13:35               ` Jeff Law
  2024-03-22 13:36               ` Jeff Law
  2 siblings, 0 replies; 33+ messages in thread
From: Richard Biener @ 2024-03-20  8:25 UTC (permalink / raw)
  To: Thomas Neumann, Jason Merrill, Florian Weimer; +Cc: gcc-patches, Jakub Jelinek

On Fri, Mar 15, 2024 at 11:31 AM Thomas Neumann
<thomas.neumann@in.tum.de> wrote:
>
> Original bug report: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111731
> Given that this is a regression, is this okay for gcc 13 and mainline?

It does look straightforward but I hope Jason or Florian can provide the ACK.

Thanks,
Richard.

> The unwinding mechanism registers both the code range and the unwind
> table itself within a b-tree lookup structure. That data structure
> assumes that it consists of non-overlapping intervals. This
> becomes a problem if the unwinding table is embedded within the
> code itself, as now the intervals do overlap.
>
> To fix this problem we now keep the unwind tables in a separate
> b-tree, which prevents the overlap.
>
> libgcc/ChangeLog:
>         PR libgcc/111731
>         * unwind-dw2-fde.c: Split unwind ranges if they contain the
>         unwind table.
> ---
>   libgcc/unwind-dw2-fde.c | 37 +++++++++++++++++++++----------------
>   1 file changed, 21 insertions(+), 16 deletions(-)
>
> diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
> index 61a578d097e..9d503545677 100644
> --- a/libgcc/unwind-dw2-fde.c
> +++ b/libgcc/unwind-dw2-fde.c
> @@ -48,6 +48,7 @@ typedef __UINTPTR_TYPE__ uintptr_type;
>   #include "unwind-dw2-btree.h"
>
>   static struct btree registered_frames;
> +static struct btree registered_objects;
>   static bool in_shutdown;
>
>   static void
> @@ -58,6 +59,7 @@ release_registered_frames (void)
>     /* Release the b-tree and all frames. Frame releases that happen later are
>      * silently ignored */
>     btree_destroy (&registered_frames);
> +  btree_destroy (&registered_objects);
>     in_shutdown = true;
>   }
>
> @@ -103,6 +105,21 @@ static __gthread_mutex_t object_mutex;
>   #endif
>   #endif
>
> +#ifdef ATOMIC_FDE_FAST_PATH
> +// Register the pc range for a given object in the lookup structure.
> +static void
> +register_pc_range_for_object (uintptr_type begin, struct object *ob)
> +{
> +  // Register the object itself to know the base pointer on deregistration.
> +  btree_insert (&registered_objects, begin, 1, ob);
> +
> +  // Register the frame in the b-tree
> +  uintptr_type range[2];
> +  get_pc_range (ob, range);
> +  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
> +}
> +#endif
> +
>   /* Called from crtbegin.o to register the unwind info for an object.  */
>
>   void
> @@ -124,13 +141,7 @@ __register_frame_info_bases (const void *begin, struct object *ob,
>   #endif
>
>   #ifdef ATOMIC_FDE_FAST_PATH
> -  // Register the object itself to know the base pointer on deregistration.
> -  btree_insert (&registered_frames, (uintptr_type) begin, 1, ob);
> -
> -  // Register the frame in the b-tree
> -  uintptr_type range[2];
> -  get_pc_range (ob, range);
> -  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
> +  register_pc_range_for_object ((uintptr_type) begin, ob);
>   #else
>     init_object_mutex_once ();
>     __gthread_mutex_lock (&object_mutex);
> @@ -178,13 +189,7 @@ __register_frame_info_table_bases (void *begin, struct object *ob,
>     ob->s.b.encoding = DW_EH_PE_omit;
>
>   #ifdef ATOMIC_FDE_FAST_PATH
> -  // Register the object itself to know the base pointer on deregistration.
> -  btree_insert (&registered_frames, (uintptr_type) begin, 1, ob);
> -
> -  // Register the frame in the b-tree
> -  uintptr_type range[2];
> -  get_pc_range (ob, range);
> -  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
> +  register_pc_range_for_object ((uintptr_type) begin, ob);
>   #else
>     init_object_mutex_once ();
>     __gthread_mutex_lock (&object_mutex);
> @@ -232,7 +237,7 @@ __deregister_frame_info_bases (const void *begin)
>
>   #ifdef ATOMIC_FDE_FAST_PATH
>     // Find the originally registered object to get the base pointer.
> -  ob = btree_remove (&registered_frames, (uintptr_type) begin);
> +  ob = btree_remove (&registered_objects, (uintptr_type) begin);
>
>     // Remove the corresponding PC range.
>     if (ob)
> @@ -240,7 +245,7 @@ __deregister_frame_info_bases (const void *begin)
>         uintptr_type range[2];
>         get_pc_range (ob, range);
>         if (range[0] != range[1])
> -    btree_remove (&registered_frames, range[0]);
> +       btree_remove (&registered_frames, range[0]);
>       }
>
>     // Deallocate the sort array if any.
> --
> 2.43.0
>


* Re: [PATCH] handle unwind tables that are embedded within unwinding code, [PR111731]
  2024-03-15 10:29             ` [PATCH] handle unwind tables that are embedded within unwinding code, [PR111731] Thomas Neumann
  2024-03-20  8:25               ` Richard Biener
@ 2024-03-22 13:35               ` Jeff Law
  2024-03-22 13:36               ` Jeff Law
  2 siblings, 0 replies; 33+ messages in thread
From: Jeff Law @ 2024-03-22 13:35 UTC (permalink / raw)
  To: Thomas Neumann, gcc-patches; +Cc: Jakub Jelinek



On 3/15/24 4:29 AM, Thomas Neumann wrote:
> Original bug report: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111731
> Given that this is a regression, is this okay for gcc 13 and mainline?
> 
> The unwinding mechanism registers both the code range and the unwind
> table itself within a b-tree lookup structure. That data structure
> assumes that it consists of non-overlapping intervals. This
> becomes a problem if the unwinding table is embedded within the
> code itself, as now the intervals do overlap.
> 
> To fix this problem we now keep the unwind tables in a separate
> b-tree, which prevents the overlap.
> 
> libgcc/ChangeLog:
>      PR libgcc/111731
>      * unwind-dw2-fde.c: Split unwind ranges if they contain the
>      unwind table.
I'll go ahead and give the final OK here :-)

Jeff



* Re: [PATCH] handle unwind tables that are embedded within unwinding code, [PR111731]
  2024-03-15 10:29             ` [PATCH] handle unwind tables that are embedded within unwinding code, [PR111731] Thomas Neumann
  2024-03-20  8:25               ` Richard Biener
  2024-03-22 13:35               ` Jeff Law
@ 2024-03-22 13:36               ` Jeff Law
  2024-03-22 14:43                 ` Thomas Neumann
  2 siblings, 1 reply; 33+ messages in thread
From: Jeff Law @ 2024-03-22 13:36 UTC (permalink / raw)
  To: Thomas Neumann, gcc-patches; +Cc: Jakub Jelinek



On 3/15/24 4:29 AM, Thomas Neumann wrote:
> Original bug report: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111731
> Given that this is a regression, is this okay for gcc 13 and mainline?
> 
> The unwinding mechanism registers both the code range and the unwind
> table itself within a b-tree lookup structure. That data structure
> assumes that it consists of non-overlapping intervals. This
> becomes a problem if the unwinding table is embedded within the
> code itself, as now the intervals do overlap.
> 
> To fix this problem we now keep the unwind tables in a separate
> b-tree, which prevents the overlap.
> 
> libgcc/ChangeLog:
>      PR libgcc/111731
>      * unwind-dw2-fde.c: Split unwind ranges if they contain the
>      unwind table.
And what I'd suggest is committing to the trunk now, then waiting a week 
or two before backporting to gcc-13.

jeff



* Re: [PATCH] handle unwind tables that are embedded within unwinding code, [PR111731]
  2024-03-22 13:36               ` Jeff Law
@ 2024-03-22 14:43                 ` Thomas Neumann
  0 siblings, 0 replies; 33+ messages in thread
From: Thomas Neumann @ 2024-03-22 14:43 UTC (permalink / raw)
  To: Jeff Law, gcc-patches; +Cc: Jakub Jelinek

>> libgcc/ChangeLog:
>>      PR libgcc/111731
>>      * unwind-dw2-fde.c: Split unwind ranges if they contain the
>>      unwind table.
> And what I'd suggest is committing to the trunk now, then waiting a week 
> or two before backporting to gcc-13.

I will do that, thanks for looking at the patch.

Best

Thomas



end of thread, other threads:[~2024-03-22 14:43 UTC | newest]

Thread overview: 33+ messages
2022-09-16 10:19 [PATCH v4] eliminate mutex in fast path of __register_frame Thomas Neumann
2022-09-16 14:49 ` Jason Merrill
2022-09-18  8:59 ` Dimitar Dimitrov
2022-09-18  9:20   ` Thomas Neumann
2022-09-18 10:02   ` Thomas Neumann
2022-09-19 13:46 ` Stephan Bergmann
2022-09-19 13:55   ` Thomas Neumann
2022-09-19 14:00     ` Stephan Bergmann
2022-09-19 15:33   ` Thomas Neumann
2022-09-20  5:39     ` Stephan Bergmann
2022-11-21 11:14 ` Tamar Christina
2022-11-21 11:22   ` Thomas Neumann
2022-11-21 11:48     ` Jakub Jelinek
2022-11-21 17:13       ` H.J. Lu
2022-11-22  0:31         ` Thomas Neumann
2022-11-22  8:20           ` Florian Weimer
2022-11-22  9:12             ` Thomas Neumann
2022-12-09 17:34             ` [PATCH] initialize fde objects lazily Thomas Neumann
2022-12-15 16:11               ` Tamar Christina
2022-12-16 17:25               ` Jason Merrill
2023-05-02 14:32             ` [PATCH] release the sorted FDE array when deregistering a frame [PR109685] Thomas Neumann
2023-05-10 10:49             ` [PATCH] fix radix sort on 32bit platforms [PR109670] Thomas Neumann
2023-08-10 11:33             ` [PATCH] preserve base pointer for __deregister_frame [PR110956] Thomas Neumann
2023-08-11 15:21               ` Jeff Law
2024-03-15 10:29             ` [PATCH] handle unwind tables that are embedded within unwinding code, [PR111731] Thomas Neumann
2024-03-20  8:25               ` Richard Biener
2024-03-22 13:35               ` Jeff Law
2024-03-22 13:36               ` Jeff Law
2024-03-22 14:43                 ` Thomas Neumann
2022-11-22  8:00         ` [PATCH] speed up end_fde_sort using radix sort Thomas Neumann
2022-12-16 18:02           ` Jason Merrill
2022-11-21 11:49     ` [PATCH v4] eliminate mutex in fast path of __register_frame Tamar Christina
2022-11-21 11:53       ` Thomas Neumann
