From: Eric Wong <normalperson@yhbt.net>
To: libc-alpha@sourceware.org
Subject: [RFC/PoC] malloc: use wfcqueue to speed up remote frees
Date: Tue, 31 Jul 2018 08:49:00 -0000
Message-ID: <20180731084936.g4yw6wnvt677miti@dcvr>
The goal is to reduce contention and improve locality of cross-thread
malloc/free traffic common to IPC systems (including Userspace-RCU) and
some garbage-collected runtimes.
Very rough benchmarks using `xthr'[1], a small URCU test program
I wrote years ago, show big improvements in both time and space:
$ /usr/bin/time ./before.sh ./xthr -a 2 -m 2500 -i $((1024 * 1024 * 5))
2.46user 3.51system 0:05.50elapsed 108%CPU (0avgtext+0avgdata 3352592maxresident)k
0inputs+0outputs (17major+838014minor)pagefaults 0swaps
$ /usr/bin/time ./after.sh ./xthr -a 2 -m 2500 -i $((1024 * 1024 * 5))
2.68user 0.48system 0:02.55elapsed 123%CPU (0avgtext+0avgdata 532904maxresident)k
0inputs+0outputs (0major+174304minor)pagefaults 0swaps
Here, before.sh and after.sh are wrapper scripts around ld-linux
for the unpatched and patched glibc installations, respectively:
#!/bin/sh
exec /tmp/6/lib/ld-linux-x86-64.so.2 --library-path /tmp/6/lib "$@"
[1] xthr.c: https://80x24.org/spew/20180731082205.vykyunsm5xg7ml3e@dcvr/
The patch avoids lock contention by deferring cross-thread
`_int_free' calls to points where the arena lock is already
acquired.  Three functions are added:
* remote_free_begin - Producer - enqueues the allocation into an arena
designated to another thread. This is wait-free,
branchless, and only modifies the last (cold)
cacheline of the arena belonging to another thread
* remote_free_step - Consumer - grabs everything enqueued by
remote_free_begin and calls `_int_free' locally
without acquiring extra locks. Returns `true'
if it did any work, as other threads may have
called `remote_free_begin' in the meantime.
* remote_free_finish - Consumer - calls remote_free_step in a loop until
there is no more work to do. It runs before most
calls to malloc_consolidate.
wfcqueue is the LGPL-2.1+ Wait-Free Concurrent Queue distributed
with Userspace-RCU <http://liburcu.org/>.  wfcqueue does not
depend on RCU itself (only on atomics), but forms the basis of the
workqueue and call_rcu primitives within liburcu.
The wfcqueue functions I'm using are implemented as static inlines
in the header files, so the patch adds no extra linkage at runtime.
Note: Debian users can `apt-get install liburcu-dev' to get
wfcqueue.h; I expect it's available for other distros.
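To make the pattern concrete, here is a minimal standalone sketch
(not part of the patch) modeling `remote_free_begin' and
`remote_free_step' with the same wfcqueue calls: a producer thread
enqueues nodes wait-free, then the owner splices the whole queue
into a private copy and drains it without further atomics.  It
mirrors the patch's usage, so it assumes a liburcu new enough to
accept `struct __cds_wfcq_head' in these calls; something like
`gcc -pthread demo.c' should build it (exact linkage may vary by
liburcu version):

#define _LGPL_SOURCE /* use liburcu's inline implementations */
#include <urcu/wfcqueue.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* One queue, analogous to a per-arena remote_free_head/tail pair. */
static struct __cds_wfcq_head head;
static struct cds_wfcq_tail tail;

/* Producer: the analogue of remote_free_begin. */
static void *
producer (void *unused)
{
  for (int i = 0; i < 100; i++)
    {
      struct cds_wfcq_node *node = malloc (sizeof (*node));
      cds_wfcq_node_init (node);
      cds_wfcq_enqueue (&head, &tail, node); /* wait-free */
    }
  return NULL;
}

int
main (void)
{
  pthread_t thr;
  struct __cds_wfcq_head tmp_head;
  struct cds_wfcq_tail tmp_tail;
  struct cds_wfcq_node *node, *n;
  int drained = 0;

  __cds_wfcq_init (&head, &tail);
  pthread_create (&thr, NULL, producer, NULL);
  pthread_join (&thr, NULL);

  /* Consumer: the analogue of remote_free_step.  Splice the shared
     queue into a private one, then walk it with no more atomics. */
  __cds_wfcq_init (&tmp_head, &tmp_tail);
  if (__cds_wfcq_splice_nonblocking (&tmp_head, &tmp_tail,
                                     &head, &tail)
      == CDS_WFCQ_RET_DEST_EMPTY)
    __cds_wfcq_for_each_blocking_safe (&tmp_head, &tmp_tail, node, n)
      {
        free (node); /* the patch calls _int_free here */
        drained++;
      }
  printf ("drained %d nodes\n", drained);
  return 0;
}

In the patch the splice runs concurrently with enqueueing threads,
which is why remote_free_step reports whether it made progress and
remote_free_finish loops until nothing is left.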
If this proof-of-concept is found acceptable, I can work on
making wfcqueue use the atomics provided by gcc/glibc instead
of the `uatomic` headers of URCU so it can be bundled with
glibc. But maybe adding liburcu as a build-time dependency
is acceptable :)
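For reference, the wfcqueue enqueue path boils down to an atomic
exchange on the tail pointer plus a store linking the old tail, so
a gcc/glibc-atomics version might look roughly like the following
sketch (my translation using C11 atomics, not URCU code; the
memory-order choices would need careful review):

#include <stdatomic.h>
#include <stddef.h>

struct node { _Atomic (struct node *) next; };

struct queue
{
  struct node stub;            /* the queue is never logically empty */
  _Atomic (struct node *) tail;
};

static void
queue_init (struct queue *q)
{
  atomic_store_explicit (&q->stub.next, NULL, memory_order_relaxed);
  atomic_store_explicit (&q->tail, &q->stub, memory_order_relaxed);
}

/* Wait-free enqueue: one exchange plus one store, no loops. */
static void
enqueue (struct queue *q, struct node *n)
{
  struct node *prev;

  atomic_store_explicit (&n->next, NULL, memory_order_relaxed);
  /* Swing the tail to the new node; producers serialize here
     without any of them ever waiting on another. */
  prev = atomic_exchange_explicit (&q->tail, n, memory_order_acq_rel);
  /* Publish the link from the old tail to the new node. */
  atomic_store_explicit (&prev->next, n, memory_order_release);
}

The consumer side is where the subtlety lives: between the exchange
and the linking store, a reader can observe a NULL next pointer,
which is why wfcqueue offers blocking and nonblocking variants of
its dequeue/splice operations.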
Note: I haven't gotten "make -j4 check" close to passing, even
without this patch, on commit 98864ed0e055583707e37cdb7d41a9cdeac4473b.
It's likely a problem on my end; I'm on a fairly common Debian 9
x86-64 system, but I haven't built glibc in years.
On the other hand, with the exception of the fiddle (dl-dependent)
and tz tests, most of the "test-all" suite passes for Ruby when
using either the before.sh or after.sh glibc wrapper
(but I haven't done much testing otherwise):
make test-all TESTS='-x fiddle -x time_tz' \
RUNRUBYOPT=--precommand=/path/to/after.sh
---
malloc/malloc.c | 108 +++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 103 insertions(+), 5 deletions(-)
diff --git a/malloc/malloc.c b/malloc/malloc.c
index e247c77b7d..40d61e45db 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -247,6 +247,9 @@
/* For SINGLE_THREAD_P. */
#include <sysdep-cancel.h>
+#define _LGPL_SOURCE /* allows inlines */
+#include <urcu/wfcqueue.h>
+
/*
Debugging:
@@ -1660,6 +1663,9 @@ struct malloc_state
/* Serialize access. */
__libc_lock_define (, mutex);
+  /* Only the owner of this arena writes to the head.  */
+ struct __cds_wfcq_head remote_free_head;
+
/* Flags (formerly in max_fast). */
int flags;
@@ -1697,6 +1703,11 @@ struct malloc_state
/* Memory allocated from the system in this arena. */
INTERNAL_SIZE_T system_mem;
INTERNAL_SIZE_T max_system_mem;
+
+  /* remote_free_tail is written to by a thread other than the owner of
+     this arena, so we want this on a different cacheline than
+     remote_free_head.  */
+ struct cds_wfcq_tail remote_free_tail;
};
struct malloc_par
@@ -1794,6 +1805,7 @@ malloc_init_state (mstate av)
int i;
mbinptr bin;
+ __cds_wfcq_init (&av->remote_free_head, &av->remote_free_tail);
/* Establish circular links for normal bins */
for (i = 1; i < NBINS; ++i)
{
@@ -3007,6 +3019,67 @@ tcache_thread_shutdown (void)
#endif /* !USE_TCACHE */
+_Static_assert (MINSIZE >= sizeof (struct cds_wfcq_node),
+ "struct cds_wfcq_node too big");
+/* Wait-free enqueue of MEM to the remote arena AV.  */
+static void
+remote_free_begin (mstate av, void *mem)
+{
+ struct cds_wfcq_node *node = mem;
+
+ cds_wfcq_node_init (node);
+ cds_wfcq_enqueue (&av->remote_free_head, &av->remote_free_tail, node);
+ /* other thread calls remote_free_step */
+}
+
+/*
+ * Process the remote free queue; the caller must have locked AV.
+ * Returns true if any work was done.
+ */
+static bool
+remote_free_step (mstate av)
+{
+ struct cds_wfcq_node *node, *n;
+ struct __cds_wfcq_head tmp_head;
+ struct cds_wfcq_tail tmp_tail;
+ enum cds_wfcq_ret ret;
+
+ if (__glibc_unlikely (av == NULL))
+ {
+ return false;
+ }
+
+ __cds_wfcq_init (&tmp_head, &tmp_tail);
+ ret = __cds_wfcq_splice_nonblocking (&tmp_head, &tmp_tail,
+ &av->remote_free_head,
+ &av->remote_free_tail);
+
+ if (__glibc_unlikely (ret == CDS_WFCQ_RET_DEST_EMPTY))
+ {
+ MAYBE_INIT_TCACHE ();
+ __cds_wfcq_for_each_blocking_safe (&tmp_head, &tmp_tail, node, n)
+ {
+ _int_free (av, mem2chunk(node), 1);
+ }
+
+      /*
+       * Tell the caller we did some work; other threads may have
+       * enqueued more work for us while we were busy.
+       */
+ return true;
+ }
+
+ assert (ret != CDS_WFCQ_RET_DEST_NON_EMPTY);
+
+ return false; /* did nothing */
+}
+
+static void
+remote_free_finish (mstate av)
+{
+ while (remote_free_step (av)) ;
+}
+
void *
__libc_malloc (size_t bytes)
{
@@ -3045,6 +3118,7 @@ __libc_malloc (size_t bytes)
}
arena_get (ar_ptr, bytes);
+ remote_free_step (ar_ptr);
victim = _int_malloc (ar_ptr, bytes);
/* Retry with another arena only if we were able to find a usable arena
@@ -3053,6 +3127,7 @@ __libc_malloc (size_t bytes)
{
LIBC_PROBE (memory_malloc_retry, 1, bytes);
ar_ptr = arena_get_retry (ar_ptr, bytes);
+ remote_free_step (ar_ptr);
victim = _int_malloc (ar_ptr, bytes);
}
@@ -3102,10 +3177,16 @@ __libc_free (void *mem)
return;
}
- MAYBE_INIT_TCACHE ();
-
ar_ptr = arena_for_chunk (p);
- _int_free (ar_ptr, p, 0);
+ if (thread_arena == ar_ptr) /* thread_arena may be NULL */
+ {
+ MAYBE_INIT_TCACHE (); /* XXX is this needed if thread_arena == ar_ptr? */
+ _int_free (ar_ptr, p, 0);
+ }
+ else
+ {
+ remote_free_begin (ar_ptr, mem);
+ }
}
libc_hidden_def (__libc_free)
@@ -3211,6 +3292,8 @@ __libc_realloc (void *oldmem, size_t bytes)
__libc_lock_lock (ar_ptr->mutex);
+ remote_free_step (ar_ptr);
+
newp = _int_realloc (ar_ptr, oldp, oldsize, nb);
__libc_lock_unlock (ar_ptr->mutex);
@@ -3225,7 +3308,14 @@ __libc_realloc (void *oldmem, size_t bytes)
if (newp != NULL)
{
memcpy (newp, oldmem, oldsize - SIZE_SZ);
- _int_free (ar_ptr, oldp, 0);
+ if (thread_arena == ar_ptr)
+ {
+ _int_free (ar_ptr, oldp, 0);
+ }
+ else /* don't lock again */
+ {
+ remote_free_begin (ar_ptr, oldmem);
+ }
}
}
@@ -3294,12 +3384,14 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
}
arena_get (ar_ptr, bytes + alignment + MINSIZE);
+  remote_free_step (ar_ptr);
p = _int_memalign (ar_ptr, alignment, bytes);
if (!p && ar_ptr != NULL)
{
LIBC_PROBE (memory_memalign_retry, 2, bytes, alignment);
ar_ptr = arena_get_retry (ar_ptr, bytes);
+      remote_free_step (ar_ptr);
p = _int_memalign (ar_ptr, alignment, bytes);
}
@@ -3388,7 +3480,10 @@ __libc_calloc (size_t n, size_t elem_size)
if (SINGLE_THREAD_P)
av = &main_arena;
else
- arena_get (av, sz);
+ {
+ arena_get (av, sz);
+ remote_free_step (av);
+ }
if (av)
{
@@ -3428,6 +3523,7 @@ __libc_calloc (size_t n, size_t elem_size)
{
LIBC_PROBE (memory_calloc_retry, 1, sz);
av = arena_get_retry (av, sz);
+      remote_free_step (av);
mem = _int_malloc (av, sz);
}
@@ -4750,6 +4846,7 @@ static int
mtrim (mstate av, size_t pad)
{
/* Ensure all blocks are consolidated. */
+ remote_free_finish (av);
malloc_consolidate (av);
const size_t ps = GLRO (dl_pagesize);
@@ -5133,6 +5230,7 @@ __libc_mallopt (int param_number, int value)
/* We must consolidate main arena before changing max_fast
(see definition of set_max_fast). */
+ remote_free_finish (av);
malloc_consolidate (av);
switch (param_number)
--
EW