public inbox for cygwin-patches@cygwin.com
 help / color / mirror / Atom feed
* [PATCH] Cygwin: malloc tune-up
@ 2020-08-24  4:59 Mark Geisert
  2020-08-24  9:48 ` Corinna Vinschen
  0 siblings, 1 reply; 4+ messages in thread
From: Mark Geisert @ 2020-08-24  4:59 UTC (permalink / raw)
  To: cygwin-patches

1. Replace global malloc lock with finer-grained internal locks.
2. Enable MSPACES, i.e. thread-specific memory pools.

Porting and testing of several dlmalloc-related malloc implementations
(ptmalloc, ptmalloc2, ptmalloc3, nedalloc) shows that they are no faster
than the current dlmalloc used by Cygwin, when the latter is tuned.  This
could be because the others are forks of earlier dlmalloc versions.  For
the record, I could not get jemalloc or tcmalloc running at all due to
chicken-egg issues, nor could I get a Win32 Heap-based malloc to work
across fork(), which was expected.

I think I can see a way to get Win32 Heap-based malloc to work across
fork()s, but it would depend on undocumented info and would likely be
subject to breakage in future Windows versions.  Too bad, because that
form of malloc package would be 2 to 8 times faster in practice.

---
 winsup/cygwin/cygmalloc.h       |  21 +++-
 winsup/cygwin/fork.cc           |   7 --
 winsup/cygwin/malloc_wrapper.cc | 163 +++++++++++++++++++-------------
 3 files changed, 113 insertions(+), 78 deletions(-)

diff --git a/winsup/cygwin/cygmalloc.h b/winsup/cygwin/cygmalloc.h
index 84bad824c..302ce59c8 100644
--- a/winsup/cygwin/cygmalloc.h
+++ b/winsup/cygwin/cygmalloc.h
@@ -33,15 +33,30 @@ void *mmap64 (void *, size_t, int, int, int, off_t);
 # define mmap mmap64
 # define MALLOC_FAILURE_ACTION	__set_ENOMEM ()
 # define USE_DL_PREFIX 1
+# define USE_LOCKS 1
+# define MSPACES 1
+# define FOOTERS 1
 
 #elif defined (__INSIDE_CYGWIN__)
 
-# define __malloc_lock() mallock.acquire ()
-# define __malloc_unlock() mallock.release ()
-extern muto mallock;
+# define MSPACES 0
 
 #endif
 
+#if MSPACES
+void __reg2 *create_mspace (size_t, int);
+void __reg2 mspace_free (void *m, void *p);
+void __reg2 *mspace_malloc (void *m, size_t size);
+void __reg3 *mspace_realloc (void *m, void *p, size_t size);
+void __reg3 *mspace_calloc (void *m, size_t nmemb, size_t size);
+void __reg3 *mspace_memalign (void *m, size_t alignment, size_t bytes);
+void __reg2 *mspace_valloc (void *m, size_t bytes);
+size_t __reg1 mspace_usable_size (const void *p);
+int __reg2 mspace_malloc_trim (void *m, size_t);
+int __reg2 mspace_mallopt (int p, int v);
+void __reg1 mspace_malloc_stats (void *m);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/winsup/cygwin/fork.cc b/winsup/cygwin/fork.cc
index 38172ca1e..82f95dafe 100644
--- a/winsup/cygwin/fork.cc
+++ b/winsup/cygwin/fork.cc
@@ -296,8 +296,6 @@ frok::parent (volatile char * volatile stack_here)
   si.lpReserved2 = (LPBYTE) &ch;
   si.cbReserved2 = sizeof (ch);
 
-  bool locked = __malloc_lock ();
-
   /* Remove impersonation */
   cygheap->user.deimpersonate ();
   fix_impersonation = true;
@@ -448,9 +446,6 @@ frok::parent (volatile char * volatile stack_here)
 		   "stack", stack_here, ch.stackbase,
 		   impure, impure_beg, impure_end,
 		   NULL);
-
-  __malloc_unlock ();
-  locked = false;
   if (!rc)
     {
       this_errno = get_errno ();
@@ -533,8 +528,6 @@ cleanup:
 
   if (fix_impersonation)
     cygheap->user.reimpersonate ();
-  if (locked)
-    __malloc_unlock ();
 
   /* Remember to de-allocate the fd table. */
   if (hchild)
diff --git a/winsup/cygwin/malloc_wrapper.cc b/winsup/cygwin/malloc_wrapper.cc
index 3b245800a..f1c23b4a8 100644
--- a/winsup/cygwin/malloc_wrapper.cc
+++ b/winsup/cygwin/malloc_wrapper.cc
@@ -27,6 +27,33 @@ extern "C" struct mallinfo dlmallinfo ();
 static bool use_internal = true;
 static bool internal_malloc_determined;
 
+/* If MSPACES (thread-specific memory pools) are enabled, use a
+   thread-local variable to store a pointer to that thread's mspace.
+ */
+#if MSPACES
+static DWORD tls_mspace; // index into thread's TLS array
+#define MSPACE_SIZE (8 * 1024 * 1024)
+
+static void *
+get_current_mspace ()
+{
+  if (unlikely(tls_mspace == 0))
+    api_fatal ("a malloc-related function was called before malloc_init");
+
+  void *m = TlsGetValue (tls_mspace);
+  if (unlikely(m == 0))
+    {
+      m = create_mspace (MSPACE_SIZE, 0);
+      if (!m)
+        api_fatal ("unable to create mspace");
+      TlsSetValue (tls_mspace, m);
+    }
+  return m;
+}
+
+#define MSPACE get_current_mspace()
+#endif
+
 /* These routines are used by the application if it
    doesn't provide its own malloc. */
 
@@ -37,11 +64,11 @@ free (void *p)
   if (!use_internal)
     user_data->free (p);
   else
-    {
-      __malloc_lock ();
-      dlfree (p);
-      __malloc_unlock ();
-    }
+#if MSPACES
+    mspace_free (MSPACE, p);
+#else
+    dlfree (p);
+#endif
 }
 
 extern "C" void *
@@ -51,11 +78,11 @@ malloc (size_t size)
   if (!use_internal)
     res = user_data->malloc (size);
   else
-    {
-      __malloc_lock ();
-      res = dlmalloc (size);
-      __malloc_unlock ();
-    }
+#if MSPACES
+    res = mspace_malloc (MSPACE, size);
+#else
+    res = dlmalloc (size);
+#endif
   malloc_printf ("(%ld) = %p, called by %p", size, res,
 					     caller_return_address ());
   return res;
@@ -68,11 +95,11 @@ realloc (void *p, size_t size)
   if (!use_internal)
     res = user_data->realloc (p, size);
   else
-    {
-      __malloc_lock ();
-      res = dlrealloc (p, size);
-      __malloc_unlock ();
-    }
+#if MSPACES
+    res = mspace_realloc (MSPACE, p, size);
+#else
+    res = dlrealloc (p, size);
+#endif
   malloc_printf ("(%p, %ld) = %p, called by %p", p, size, res,
 						 caller_return_address ());
   return res;
@@ -96,11 +123,11 @@ calloc (size_t nmemb, size_t size)
   if (!use_internal)
     res = user_data->calloc (nmemb, size);
   else
-    {
-      __malloc_lock ();
-      res = dlcalloc (nmemb, size);
-      __malloc_unlock ();
-    }
+#if MSPACES
+    res = memspace_calloc (MSPACE, nmemb, size);
+#else
+    res = dlcalloc (nmemb, size);
+#endif
   malloc_printf ("(%ld, %ld) = %p, called by %p", nmemb, size, res,
 						  caller_return_address ());
   return res;
@@ -116,9 +143,11 @@ posix_memalign (void **memptr, size_t alignment, size_t bytes)
     return user_data->posix_memalign (memptr, alignment, bytes);
   if ((alignment & (alignment - 1)) != 0)
     return EINVAL;
-  __malloc_lock ();
+#if MSPACES
+  res = mspace_memalign (MSPACE, alignment, bytes);
+#else
   res = dlmemalign (alignment, bytes);
-  __malloc_unlock ();
+#endif
   if (!res)
     return ENOMEM;
   *memptr = res;
@@ -135,11 +164,11 @@ memalign (size_t alignment, size_t bytes)
       res = NULL;
     }
   else
-    {
-      __malloc_lock ();
-      res = dlmemalign (alignment, bytes);
-      __malloc_unlock ();
-    }
+#if MSPACES
+    res = mspace_memalign (MSPACE, alignment, bytes);
+#else
+    res = dlmemalign (alignment, bytes);
+#endif
 
   return res;
 }
@@ -154,11 +183,11 @@ valloc (size_t bytes)
       res = NULL;
     }
   else
-    {
-      __malloc_lock ();
-      res = dlvalloc (bytes);
-      __malloc_unlock ();
-    }
+#if MSPACES
+    res = mspace_valloc (MSPACE, bytes);
+#else
+    res = dlvalloc (bytes);
+#endif
 
   return res;
 }
@@ -173,11 +202,11 @@ malloc_usable_size (void *p)
       res = 0;
     }
   else
-    {
-      __malloc_lock ();
-      res = dlmalloc_usable_size (p);
-      __malloc_unlock ();
-    }
+#if MSPACES
+    res = memspace_usable_size (p);
+#else
+    res = dlmalloc_usable_size (p);
+#endif
 
   return res;
 }
@@ -192,11 +221,11 @@ malloc_trim (size_t pad)
       res = 0;
     }
   else
-    {
-      __malloc_lock ();
-      res = dlmalloc_trim (pad);
-      __malloc_unlock ();
-    }
+#if MSPACES
+    res = mspace_trim (MSPACE, pad);
+#else
+    res = dlmalloc_trim (pad);
+#endif
 
   return res;
 }
@@ -211,11 +240,11 @@ mallopt (int p, int v)
       res = 0;
     }
   else
-    {
-      __malloc_lock ();
-      res = dlmallopt (p, v);
-      __malloc_unlock ();
-    }
+#if MSPACES
+    res = mspace_mallopt (p, v);
+#else
+    res = dlmallopt (p, v);
+#endif
 
   return res;
 }
@@ -226,11 +255,11 @@ malloc_stats ()
   if (!use_internal)
     set_errno (ENOSYS);
   else
-    {
-      __malloc_lock ();
-      dlmalloc_stats ();
-      __malloc_unlock ();
-    }
+#if MSPACES
+    memspace_stats (MSPACE);
+#else
+    dlmalloc_stats ();
+#endif
 }
 
 extern "C" struct mallinfo
@@ -243,11 +272,11 @@ mallinfo ()
       set_errno (ENOSYS);
     }
   else
-    {
-      __malloc_lock ();
-      m = dlmallinfo ();
-      __malloc_unlock ();
-    }
+#if MSPACES
+    m = mspace_mallinfo (MSPACE);
+#else
+    m = dlmallinfo ();
+#endif
 
   return m;
 }
@@ -262,20 +291,9 @@ strdup (const char *s)
   return p;
 }
 
-/* We use a critical section to lock access to the malloc data
-   structures.  This permits malloc to be called from different
-   threads.  Note that it does not make malloc reentrant, and it does
-   not permit a signal handler to call malloc.  The malloc code in
-   newlib will call __malloc_lock and __malloc_unlock at appropriate
-   times.  */
-
-muto NO_COPY mallock;
-
 void
 malloc_init ()
 {
-  mallock.init ("mallock");
-
   /* Check if malloc is provided by application. If so, redirect all
      calls to malloc/free/realloc to application provided. This may
      happen if some other dll calls cygwin's malloc, but main code provides
@@ -290,6 +308,15 @@ malloc_init ()
       malloc_printf ("using %s malloc", use_internal ? "internal" : "external");
       internal_malloc_determined = true;
     }
+
+#if MSPACES
+  if (use_internal)
+    {
+      tls_mspace = TlsAlloc ();
+      if (tls_mspace == 0)
+        api_fatal ("malloc_init couldn't init tls_mspace");
+    }
+#endif
 }
 
 extern "C" void
-- 
2.28.0


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] Cygwin: malloc tune-up
  2020-08-24  4:59 [PATCH] Cygwin: malloc tune-up Mark Geisert
@ 2020-08-24  9:48 ` Corinna Vinschen
  2020-08-25  7:29   ` Mark Geisert
  0 siblings, 1 reply; 4+ messages in thread
From: Corinna Vinschen @ 2020-08-24  9:48 UTC (permalink / raw)
  To: cygwin-patches

Hi Mark,

On Aug 23 21:59, Mark Geisert wrote:
> 1. Replace global malloc lock with finer-grained internal locks.
> 2. Enable MSPACES, i.e. thread-specific memory pools.
> 
> Porting and testing of several dlmalloc-related malloc implementations
> (ptmalloc, ptmalloc2, ptmalloc3, nedalloc) shows that they are no faster
> than the current dlmalloc used by Cygwin, when the latter is tuned.  This
> could be because the others are forks of earlier dlmalloc versions.  For

That also means it may be worth updating dlmalloc to a newer
version at some point.  If you feel like doing so...

> the record, I could not get jemalloc or tcmalloc running at all due to
> chicken-egg issues, nor could I get a Win32 Heap-based malloc to work
> across fork(), which was expected.
> 
> I think I can see a way to get Win32 Heap-based malloc to work across
> fork()s, but it would depend on undocumented info and would likely be
> subject to breakage in future Windows versions.  Too bad, because that
> form of malloc package would be 2 to 8 times faster in practice.

Which is kind of strange, given malloc is all about heap management.
Given a sufficiently big heap, how can a well-written malloc be slower
than Win32 HeapAlloc?  Puzzling...

> ---
>  winsup/cygwin/cygmalloc.h       |  21 +++-
>  winsup/cygwin/fork.cc           |   7 --
>  winsup/cygwin/malloc_wrapper.cc | 163 +++++++++++++++++++-------------
>  3 files changed, 113 insertions(+), 78 deletions(-)

I think this is a great idea.  I have a few nits, though:

> diff --git a/winsup/cygwin/cygmalloc.h b/winsup/cygwin/cygmalloc.h
> index 84bad824c..302ce59c8 100644
> --- a/winsup/cygwin/cygmalloc.h
> +++ b/winsup/cygwin/cygmalloc.h
> @@ -33,15 +33,30 @@ void *mmap64 (void *, size_t, int, int, int, off_t);
>  # define mmap mmap64
>  # define MALLOC_FAILURE_ACTION	__set_ENOMEM ()
>  # define USE_DL_PREFIX 1
> +# define USE_LOCKS 1
> +# define MSPACES 1
> +# define FOOTERS 1
>  
>  #elif defined (__INSIDE_CYGWIN__)
>  
> -# define __malloc_lock() mallock.acquire ()
> -# define __malloc_unlock() mallock.release ()
> -extern muto mallock;
> +# define MSPACES 0

That means MSPACES is 0 in malloc_wrapper.cc.  This code is not building
the MSPACES stuff into malloc_wrapper.cc, but the now non-thread-safe
direct dlfoo calls.  Just add `#error foo' to some of the `#if MSPACES'
code and you'll see you're (probably) not building what you think you're
building.

>  #endif
>  
> +#if MSPACES
> +void __reg2 *create_mspace (size_t, int);
> +void __reg2 mspace_free (void *m, void *p);
> +void __reg2 *mspace_malloc (void *m, size_t size);
> +void __reg3 *mspace_realloc (void *m, void *p, size_t size);
> +void __reg3 *mspace_calloc (void *m, size_t nmemb, size_t size);
> +void __reg3 *mspace_memalign (void *m, size_t alignment, size_t bytes);
> +void __reg2 *mspace_valloc (void *m, size_t bytes);
> +size_t __reg1 mspace_usable_size (const void *p);
> +int __reg2 mspace_malloc_trim (void *m, size_t);
> +int __reg2 mspace_mallopt (int p, int v);
> +void __reg1 mspace_malloc_stats (void *m);
> +#endif
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/winsup/cygwin/fork.cc b/winsup/cygwin/fork.cc
> index 38172ca1e..82f95dafe 100644
> --- a/winsup/cygwin/fork.cc
> +++ b/winsup/cygwin/fork.cc
> @@ -296,8 +296,6 @@ frok::parent (volatile char * volatile stack_here)
>    si.lpReserved2 = (LPBYTE) &ch;
>    si.cbReserved2 = sizeof (ch);
>  
> -  bool locked = __malloc_lock ();
> -

You're guarding the new code below with #if MSPACES and keep the calling
dlmalloc and friends here in fork as well as in the #else branches.  But
these code branches would need the old __malloc_lock/__malloc_unlock stuff,
otherwise they are broken.

Either we can go entirely without the #if MSPACES bracketing, or
the code must keep using the global lock in the #if !MSPACES case.

>    /* Remove impersonation */
>    cygheap->user.deimpersonate ();
>    fix_impersonation = true;
> @@ -448,9 +446,6 @@ frok::parent (volatile char * volatile stack_here)
>  		   "stack", stack_here, ch.stackbase,
>  		   impure, impure_beg, impure_end,
>  		   NULL);
> -
> -  __malloc_unlock ();
> -  locked = false;
>    if (!rc)
>      {
>        this_errno = get_errno ();
> @@ -533,8 +528,6 @@ cleanup:
>  
>    if (fix_impersonation)
>      cygheap->user.reimpersonate ();
> -  if (locked)
> -    __malloc_unlock ();
>  
>    /* Remember to de-allocate the fd table. */
>    if (hchild)
> diff --git a/winsup/cygwin/malloc_wrapper.cc b/winsup/cygwin/malloc_wrapper.cc
> index 3b245800a..f1c23b4a8 100644
> --- a/winsup/cygwin/malloc_wrapper.cc
> +++ b/winsup/cygwin/malloc_wrapper.cc
> @@ -27,6 +27,33 @@ extern "C" struct mallinfo dlmallinfo ();
>  static bool use_internal = true;
>  static bool internal_malloc_determined;
>  
> +/* If MSPACES (thread-specific memory pools) are enabled, use a
> +   thread-local variable to store a pointer to that thread's mspace.
> + */
> +#if MSPACES
> +static DWORD tls_mspace; // index into thread's TLS array
> +#define MSPACE_SIZE (8 * 1024 * 1024)
> +
> +static void *
> +get_current_mspace ()
> +{
> +  if (unlikely(tls_mspace == 0))
> +    api_fatal ("a malloc-related function was called before malloc_init");
> +
> +  void *m = TlsGetValue (tls_mspace);

Why do we need a Win32 TLS value here?  We usually have our own tls
area as defined in cygtls.h.  You could just add a

  void *mspace_ptr;

or so to class _cygtls.  The only downside is that the first build will
be very likely broken.  It regenerates tlsoffsets{64}.h, but then newlib
potentially uses wrong offsets.  Therefore, after changing class _cygtls
and regenerating the tls offsets once, a full rebuild should be
performed to be on the safe side.

In terms of using TlsAlloc and friends I have a problem with fork
safety.  The _cygtls area is on the thread's stack, and the stack of the
forking thread is copied over to the child, so the value of mspace_ptr
is intact after fork.  This isn't the case for the TlsAlloc'ed void
pointer, given the TlsAlloc is called anew in every process.  So it
looks like the forked process is losing a pointer.


Thanks,
Corinna

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] Cygwin: malloc tune-up
  2020-08-24  9:48 ` Corinna Vinschen
@ 2020-08-25  7:29   ` Mark Geisert
  2020-08-25 11:02     ` Corinna Vinschen
  0 siblings, 1 reply; 4+ messages in thread
From: Mark Geisert @ 2020-08-25  7:29 UTC (permalink / raw)
  To: cygwin-patches

Hi Corinna,
Well, this patch turned out to be half-baked.  Locking is working correctly 
because USE_LOCKS was set to 1 for malloc.cc's compilation.  The torture test I 
run validated that.  OTOH as you said, MSPACES was set 1 for malloc.cc but 0 for 
malloc_wrapper.cc.  So this patch yields a malloc facility like pre-3.2 but 
using internal locking on data structures instead of function-level locking.  An 
improvement, but not the whole package that I'm attempting to deliver because 
there's still thread contention on the internal locks.  Properly operating 
mspaces should get rid of that or at least lower it significantly.

I appreciate your comments on TLS variables and fork safety.  I will investigate 
this area further.  The mspaces should/will survive a fork but obviously the 
threads that created them won't.  Memory blocks can be freed by any thread; 
there's no need to create threads in the child process just to manage mspaces.

I have more work to do then I'll submit a v2 of the patch.
Thanks & Regards,

..mark

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] Cygwin: malloc tune-up
  2020-08-25  7:29   ` Mark Geisert
@ 2020-08-25 11:02     ` Corinna Vinschen
  0 siblings, 0 replies; 4+ messages in thread
From: Corinna Vinschen @ 2020-08-25 11:02 UTC (permalink / raw)
  To: cygwin-patches

Hi Mark,

On Aug 25 00:29, Mark Geisert wrote:
> Hi Corinna,
> Well, this patch turned out to be half-baked.  Locking is working correctly
> because USE_LOCKS was set to 1 for malloc.cc's compilation.  The torture
> test I run validated that.  OTOH as you said, MSPACES was set 1 for
> malloc.cc but 0 for malloc_wrapper.cc.  So this patch yields a malloc
> facility like pre-3.2 but using internal locking on data structures instead
> of function-level locking.  An improvement, but not the whole package that
> I'm attempting to deliver because there's still thread contention on the
> internal locks.  Properly operating mspaces should get rid of that or at
> least lower it significantly.
> 
> I appreciate your comments on TLS variables and fork safety.  I will
> investigate this area further.  The mspaces should/will survive a fork but
> obviously the threads that created them won't.  Memory blocks can be freed
> by any thread; there's no need to create threads in the child process just
> to manage mspaces.

I was not worried about the malloced spaces themselves, but about the
TLS pointer of the thread calling fork().  The TLS pointer itself
(the one returned by TlsGetValue) is not propagated to the child process.
Therefore, this pointer is wrong in the child.  If that's not a problem,
fine.  But if it has to be preserved over fork, then the cygtls area
is definitely the way to propagate thread-specific info via fork.


Corinna

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2020-08-25 11:02 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-24  4:59 [PATCH] Cygwin: malloc tune-up Mark Geisert
2020-08-24  9:48 ` Corinna Vinschen
2020-08-25  7:29   ` Mark Geisert
2020-08-25 11:02     ` Corinna Vinschen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).