From: Mark Wielaard
To: elfutils-devel@sourceware.org
Cc: Jonathon Anderson
Subject: [PATCH 2/2] libdw: Rewrite the memory handler to be more robust.
Date: Tue, 29 Oct 2019 21:15:00 -0000
Message-Id: <20191029211437.3268-2-mark@klomp.org>
In-Reply-To: <20191029211437.3268-1-mark@klomp.org>
References: <1572380520.19948.0@rice.edu> <20191029211437.3268-1-mark@klomp.org>

From: Jonathon Anderson

Pthread's thread-local variables are highly limited, which makes
it difficult to use many Dwarfs.  This replaces that with a less
efficient (or elegant) but more robust method.

Signed-off-by: Jonathon Anderson
---
 lib/atomics.h           |  2 ++
 libdw/ChangeLog         |  9 ++++++
 libdw/dwarf_begin_elf.c |  7 +++--
 libdw/dwarf_end.c       | 24 +++++++-------
 libdw/libdwP.h          | 67 +++++++++++++++++++--------------------
 libdw/libdw_alloc.c     | 69 ++++++++++++++++++++++++++++++++++++++---
 6 files changed, 125 insertions(+), 53 deletions(-)

diff --git a/lib/atomics.h b/lib/atomics.h
index ffd12f87..e3773759 100644
--- a/lib/atomics.h
+++ b/lib/atomics.h
@@ -31,7 +31,9 @@
 #if HAVE_STDATOMIC_H
 /* If possible, use the compiler's preferred atomics.  */
 # include <stdatomic.h>
+# include <threads.h>
 #else
 /* Otherwise, try to use the builtins provided by this compiler.  */
 # include "stdatomic-fbsd.h"
+# define thread_local __thread
 #endif /* HAVE_STDATOMIC_H */
diff --git a/libdw/ChangeLog b/libdw/ChangeLog
index 394c0df2..8de8a837 100644
--- a/libdw/ChangeLog
+++ b/libdw/ChangeLog
@@ -1,3 +1,12 @@
+2019-10-28  Jonathon Anderson
+
+	* libdw_alloc.c: Added __libdw_alloc_tail.
+	(__libdw_allocate): Switch to use the mem_tails array.
+	* libdwP.h (Dwarf): Likewise.
+	* dwarf_begin_elf.c (dwarf_begin_elf): Support for above.
+	* dwarf_end.c (dwarf_end): Likewise.
+	* atomics.h: Add support for thread_local.
+
 2019-08-26  Jonathon Anderson
 
 	* libdw_alloc.c (__libdw_allocate): Added thread-safe stack allocator.
diff --git a/libdw/dwarf_begin_elf.c b/libdw/dwarf_begin_elf.c
index 8d137414..eadff13b 100644
--- a/libdw/dwarf_begin_elf.c
+++ b/libdw/dwarf_begin_elf.c
@@ -418,13 +418,14 @@ dwarf_begin_elf (Elf *elf, Dwarf_Cmd cmd, Elf_Scn *scngrp)
      actual allocation.  */
   result->mem_default_size = mem_default_size;
   result->oom_handler = __libdw_oom;
-  if (pthread_key_create (&result->mem_key, NULL) != 0)
+  if (pthread_rwlock_init(&result->mem_rwl, NULL) != 0)
     {
       free (result);
-      __libdw_seterrno (DWARF_E_NOMEM); /* no memory or max pthread keys.  */
+      __libdw_seterrno (DWARF_E_NOMEM); /* no memory.  */
       return NULL;
     }
-  atomic_init (&result->mem_tail, (uintptr_t)NULL);
+  result->mem_stacks = 0;
+  result->mem_tails = NULL;
 
   if (cmd == DWARF_C_READ || cmd == DWARF_C_RDWR)
     {
diff --git a/libdw/dwarf_end.c b/libdw/dwarf_end.c
index a2e94436..3fd2836d 100644
--- a/libdw/dwarf_end.c
+++ b/libdw/dwarf_end.c
@@ -95,17 +95,19 @@ dwarf_end (Dwarf *dwarf)
       tdestroy (dwarf->split_tree, noop_free);
 
       /* Free the internally allocated memory.  */
-      struct libdw_memblock *memp;
-      memp = (struct libdw_memblock *) (atomic_load_explicit
-                                        (&dwarf->mem_tail,
-                                         memory_order_relaxed));
-      while (memp != NULL)
-        {
-          struct libdw_memblock *prevp = memp->prev;
-          free (memp);
-          memp = prevp;
-        }
-      pthread_key_delete (dwarf->mem_key);
+      for (size_t i = 0; i < dwarf->mem_stacks; i++)
+        {
+          struct libdw_memblock *memp = dwarf->mem_tails[i];
+          while (memp != NULL)
+            {
+              struct libdw_memblock *prevp = memp->prev;
+              free (memp);
+              memp = prevp;
+            }
+        }
+      if (dwarf->mem_tails != NULL)
+        free (dwarf->mem_tails);
+      pthread_rwlock_destroy (&dwarf->mem_rwl);
 
       /* Free the pubnames helper structure.  */
       free (dwarf->pubnames_sets);
diff --git a/libdw/libdwP.h b/libdw/libdwP.h
index ad2599eb..3e1ef59b 100644
--- a/libdw/libdwP.h
+++ b/libdw/libdwP.h
@@ -149,17 +149,6 @@ enum
 
 #include "dwarf_sig8_hash.h"
 
-/* Structure for internal memory handling.  This is basically a simplified
-   reimplementation of obstacks.  Unfortunately the standard obstack
-   implementation is not usable in libraries.  */
-struct libdw_memblock
-{
-  size_t size;
-  size_t remaining;
-  struct libdw_memblock *prev;
-  char mem[0];
-};
-
 /* This is the structure representing the debugging state.  */
 struct Dwarf
 {
@@ -231,11 +220,22 @@ struct Dwarf
   /* Similar for addrx/constx, which will come from .debug_addr section.  */
   struct Dwarf_CU *fake_addr_cu;
 
-  /* Internal memory handling.  Each thread allocates separately and only
-     allocates from its own blocks, while all the blocks are pushed atomically
-     onto a unified stack for easy deallocation.  */
-  pthread_key_t mem_key;
-  atomic_uintptr_t mem_tail;
+  /* Supporting lock for internal memory handling.  Ensures threads that have
+     an entry in the mem_tails array are not disturbed by new threads doing
+     allocations for this Dwarf.  */
+  pthread_rwlock_t mem_rwl;
+
+  /* Internal memory handling.  This is basically a simplified thread-local
+     reimplementation of obstacks.  Unfortunately the standard obstack
+     implementation is not usable in libraries.  */
+  size_t mem_stacks;
+  struct libdw_memblock
+  {
+    size_t size;
+    size_t remaining;
+    struct libdw_memblock *prev;
+    char mem[0];
+  } **mem_tails;
 
   /* Default size of allocated memory blocks.  */
   size_t mem_default_size;
@@ -578,34 +578,31 @@ libdw_valid_user_form (int form)
 extern void __libdw_seterrno (int value) internal_function;
 
 
-/* Memory handling, the easy parts.  This macro does not do nor need to do any
-   locking for proper concurrent operation.  */
+/* Memory handling, the easy parts.  */
 #define libdw_alloc(dbg, type, tsize, cnt) \
-  ({ struct libdw_memblock *_tail = pthread_getspecific (dbg->mem_key);      \
-     size_t _req = (tsize) * (cnt);                                          \
-     type *_result;                                                          \
-     if (unlikely (_tail == NULL))                                           \
-       _result = (type *) __libdw_allocate (dbg, _req, __alignof (type));    \
+  ({ struct libdw_memblock *_tail = __libdw_alloc_tail(dbg);                 \
+     size_t _required = (tsize) * (cnt);                                     \
+     type *_result = (type *) (_tail->mem + (_tail->size - _tail->remaining));\
+     size_t _padding = ((__alignof (type)                                    \
+                         - ((uintptr_t) _result & (__alignof (type) - 1)))   \
+                        & (__alignof (type) - 1));                           \
+     if (unlikely (_tail->remaining < _required + _padding))                 \
+       _result = (type *) __libdw_allocate (dbg, _required, __alignof (type));\
      else                                                                    \
       {                                                                      \
-        _result = (type *) (_tail->mem + (_tail->size - _tail->remaining));  \
-        size_t _padding = ((__alignof (type)                                 \
-                            - ((uintptr_t) _result & (__alignof (type) - 1))) \
-                           & (__alignof (type) - 1));                        \
-        if (unlikely (_tail->remaining < _req + _padding))                   \
-          _result = (type *) __libdw_allocate (dbg, _req, __alignof (type)); \
-        else                                                                 \
-          {                                                                  \
-            _req += _padding;                                                \
-            _result = (type *) ((char *) _result + _padding);                \
-            _tail->remaining -= _req;                                        \
-          }                                                                  \
+        _required += _padding;                                               \
+        _result = (type *) ((char *) _result + _padding);                    \
+        _tail->remaining -= _required;                                       \
       }                                                                      \
      _result; })
 
 #define libdw_typed_alloc(dbg, type) \
   libdw_alloc (dbg, type, sizeof (type), 1)
 
+/* Callback to choose a thread-local memory allocation stack.  */
+extern struct libdw_memblock *__libdw_alloc_tail (Dwarf* dbg)
+     __nonnull_attribute__ (1);
+
 /* Callback to allocate more.  */
 extern void *__libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
      __attribute__ ((__malloc__)) __nonnull_attribute__ (1);
diff --git a/libdw/libdw_alloc.c b/libdw/libdw_alloc.c
index f2e74d18..86ca7032 100644
--- a/libdw/libdw_alloc.c
+++ b/libdw/libdw_alloc.c
@@ -35,7 +35,68 @@
 #include <stdlib.h>
 #include "libdwP.h"
 #include "system.h"
+#include "atomics.h"
+#if USE_VG_ANNOTATIONS == 1
+#include
+#include
+#else
+#define ANNOTATE_HAPPENS_BEFORE(X)
+#define ANNOTATE_HAPPENS_AFTER(X)
+#endif
+#define THREAD_ID_UNSET ((size_t) -1)
+static thread_local size_t thread_id = THREAD_ID_UNSET;
+static atomic_size_t next_id = ATOMIC_VAR_INIT(0);
+
+struct libdw_memblock *
+__libdw_alloc_tail (Dwarf *dbg)
+{
+  if (thread_id == THREAD_ID_UNSET)
+    thread_id = atomic_fetch_add (&next_id, 1);
+
+  pthread_rwlock_rdlock (&dbg->mem_rwl);
+  if (thread_id >= dbg->mem_stacks)
+    {
+      pthread_rwlock_unlock (&dbg->mem_rwl);
+      pthread_rwlock_wrlock (&dbg->mem_rwl);
+
+      /* Another thread may have already reallocated. In theory using an
+         atomic would be faster, but given that this only happens once per
+         thread per Dwarf, some minor slowdown should be fine.  */
+      if (thread_id >= dbg->mem_stacks)
+        {
+          dbg->mem_tails = realloc (dbg->mem_tails, (thread_id+1)
+                                    * sizeof (struct libdw_memblock *));
+          if (dbg->mem_tails == NULL)
+            {
+              pthread_rwlock_unlock (&dbg->mem_rwl);
+              dbg->oom_handler();
+            }
+          for (size_t i = dbg->mem_stacks; i <= thread_id; i++)
+            dbg->mem_tails[i] = NULL;
+          dbg->mem_stacks = thread_id + 1;
+          ANNOTATE_HAPPENS_BEFORE (&dbg->mem_tails);
+        }
+
+      pthread_rwlock_unlock (&dbg->mem_rwl);
+      pthread_rwlock_rdlock (&dbg->mem_rwl);
+    }
+
+  /* At this point, we have an entry in the tail array.  */
+  ANNOTATE_HAPPENS_AFTER (&dbg->mem_tails);
+  struct libdw_memblock *result = dbg->mem_tails[thread_id];
+  if (result == NULL)
+    {
+      result = malloc (dbg->mem_default_size);
+      result->size = dbg->mem_default_size
+                     - offsetof (struct libdw_memblock, mem);
+      result->remaining = result->size;
+      result->prev = NULL;
+      dbg->mem_tails[thread_id] = result;
+    }
+  pthread_rwlock_unlock (&dbg->mem_rwl);
+  return result;
+}
 
 
 void *
 __libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
@@ -52,10 +113,10 @@ __libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
   newp->size = size - offsetof (struct libdw_memblock, mem);
   newp->remaining = (uintptr_t) newp + size - (result + minsize);
 
-  newp->prev = (struct libdw_memblock*)atomic_exchange_explicit(
-    &dbg->mem_tail, (uintptr_t)newp, memory_order_relaxed);
-  if (pthread_setspecific (dbg->mem_key, newp) != 0)
-    dbg->oom_handler ();
+  pthread_rwlock_rdlock (&dbg->mem_rwl);
+  newp->prev = dbg->mem_tails[thread_id];
+  dbg->mem_tails[thread_id] = newp;
+  pthread_rwlock_unlock (&dbg->mem_rwl);
 
   return (void *) result;
 }
-- 
2.20.1
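
For readers who want to see the allocation scheme above in isolation, the following
is a minimal standalone sketch in plain C11; it is not elfutils code and not part of
the patch.  It shows the same idea: a global atomic counter hands each thread a
stable index, and a per-object read/write lock protects growth of an array that
holds one stack per thread.  All names here (toy_dwarf, ensure_slot, push) are made
up for illustration, error handling is reduced to abort(), and it builds with
something like "cc -std=c11 -pthread sketch.c".

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define ID_UNSET ((size_t) -1)

/* Lazily assigned per-thread index, handed out by a global atomic counter.  */
static _Thread_local size_t thread_idx = ID_UNSET;
static atomic_size_t next_idx;

/* Toy per-object state: a growable array with one block stack per thread.  */
struct node { struct node *prev; int payload; };
struct toy_dwarf
{
  pthread_rwlock_t lock;  /* guards growth of the tails array */
  size_t nstacks;         /* current length of tails */
  struct node **tails;    /* tails[i] = top of thread i's stack */
};

/* Make sure tails[] has a slot for the calling thread: take the read lock for
   the common case, upgrade to the write lock only when a new thread has to
   grow the array, re-check, then drop back to normal operation.  */
static void
ensure_slot (struct toy_dwarf *d)
{
  if (thread_idx == ID_UNSET)
    thread_idx = atomic_fetch_add (&next_idx, 1);

  pthread_rwlock_rdlock (&d->lock);
  if (thread_idx >= d->nstacks)
    {
      pthread_rwlock_unlock (&d->lock);
      pthread_rwlock_wrlock (&d->lock);
      if (thread_idx >= d->nstacks)   /* another thread may have grown it */
        {
          struct node **p = realloc (d->tails, (thread_idx + 1) * sizeof *p);
          if (p == NULL)
            abort ();
          for (size_t i = d->nstacks; i <= thread_idx; i++)
            p[i] = NULL;
          d->tails = p;
          d->nstacks = thread_idx + 1;
        }
      pthread_rwlock_unlock (&d->lock);
      pthread_rwlock_rdlock (&d->lock);
    }
  pthread_rwlock_unlock (&d->lock);
}

/* Push onto the calling thread's private stack.  Only the owning thread ever
   writes its slot, so holding the read lock is enough to keep realloc away.  */
static void
push (struct toy_dwarf *d, int payload)
{
  ensure_slot (d);
  struct node *n = malloc (sizeof *n);
  if (n == NULL)
    abort ();
  n->payload = payload;
  pthread_rwlock_rdlock (&d->lock);
  n->prev = d->tails[thread_idx];
  d->tails[thread_idx] = n;
  pthread_rwlock_unlock (&d->lock);
}

static void *
worker (void *arg)
{
  struct toy_dwarf *d = arg;
  for (int i = 0; i < 1000; i++)
    push (d, i);
  return NULL;
}

int
main (void)
{
  struct toy_dwarf d = { .nstacks = 0, .tails = NULL };
  pthread_rwlock_init (&d.lock, NULL);

  pthread_t t[8];
  for (int i = 0; i < 8; i++)
    pthread_create (&t[i], NULL, worker, &d);
  for (int i = 0; i < 8; i++)
    pthread_join (t[i], NULL);

  /* Single-threaded teardown, as in dwarf_end: walk every stack and free.  */
  size_t total = 0;
  for (size_t i = 0; i < d.nstacks; i++)
    for (struct node *n = d.tails[i]; n != NULL; total++)
      {
        struct node *prev = n->prev;
        free (n);
        n = prev;
      }
  free (d.tails);
  pthread_rwlock_destroy (&d.lock);
  printf ("freed %zu nodes from %zu stacks\n", total, d.nstacks);
  return 0;
}

The read lock suffices on the fast path because a thread only ever touches its own
slot; the write lock is taken once per new thread per object, which matches the
patch's expectation that the rwlock is essentially uncontended in practice.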