public inbox for cygwin-patches@cygwin.com
 help / color / mirror / Atom feed
* [PATCH] Cygwin: Interim malloc speedup
@ 2020-12-22  4:53 Mark Geisert
  2020-12-24  8:28 ` [PATCH] Cygwin: Interim malloc speedup -- addendum Mark Geisert
  2021-01-11 12:18 ` [PATCH] Cygwin: Interim malloc speedup Corinna Vinschen
  0 siblings, 2 replies; 5+ messages in thread
From: Mark Geisert @ 2020-12-22  4:53 UTC (permalink / raw)
  To: cygwin-patches

Replaces function-level lock with data-level lock provided by existing
dlmalloc.  Sets up to enable dlmalloc's MSPACES, but does not yet enable
them due to visible but uninvestigated issues.

Single-thread applications may or may not see a performance gain,
depending on how heavily it uses the malloc functions.  Multi-thread
apps will likely see a performance gain.

---
 winsup/cygwin/cygmalloc.h       |  28 +++-
 winsup/cygwin/fork.cc           |   8 -
 winsup/cygwin/malloc_wrapper.cc | 274 +++++++++++++++++++++-----------
 3 files changed, 202 insertions(+), 108 deletions(-)

diff --git a/winsup/cygwin/cygmalloc.h b/winsup/cygwin/cygmalloc.h
index 84bad824c..67a9f3b3f 100644
--- a/winsup/cygwin/cygmalloc.h
+++ b/winsup/cygwin/cygmalloc.h
@@ -26,20 +26,36 @@ void dlmalloc_stats ();
 #define MALLOC_ALIGNMENT ((size_t)16U)
 #endif
 
+/* As of Cygwin 3.2.0 we could enable dlmalloc's MSPACES */
+#define MSPACES 0 // DO NOT ENABLE: cygserver, XWin, etc will malfunction
+
 #if defined (DLMALLOC_VERSION)	/* Building malloc.cc */
 
 extern "C" void __set_ENOMEM ();
 void *mmap64 (void *, size_t, int, int, int, off_t);
 # define mmap mmap64
+
+/* These defines tune the dlmalloc implementation in malloc.cc */
 # define MALLOC_FAILURE_ACTION	__set_ENOMEM ()
 # define USE_DL_PREFIX 1
+# define USE_LOCKS 1
+# define LOCK_AT_FORK 0
+# define FOOTERS MSPACES
+#endif
 
-#elif defined (__INSIDE_CYGWIN__)
-
-# define __malloc_lock() mallock.acquire ()
-# define __malloc_unlock() mallock.release ()
-extern muto mallock;
-
+#if MSPACES
+# define MSPACE_SIZE (512 * 1024)
+void __reg2 *create_mspace (size_t, int);
+void __reg2 mspace_free (void *m, void *p);
+void __reg2 *mspace_malloc (void *m, size_t size);
+void __reg3 *mspace_realloc (void *m, void *p, size_t size);
+void __reg3 *mspace_calloc (void *m, size_t nmemb, size_t size);
+void __reg3 *mspace_memalign (void *m, size_t alignment, size_t bytes);
+size_t __reg1 mspace_usable_size (const void *p);
+int __reg2 mspace_trim (void *m, size_t);
+int __reg2 mspace_mallopt (int p, int v);
+void __reg1 mspace_malloc_stats (void *m);
+struct mallinfo mspace_mallinfo (void *m);
 #endif
 
 #ifdef __cplusplus
diff --git a/winsup/cygwin/fork.cc b/winsup/cygwin/fork.cc
index 719217856..8f9ca45d1 100644
--- a/winsup/cygwin/fork.cc
+++ b/winsup/cygwin/fork.cc
@@ -22,7 +22,6 @@ details. */
 #include "tls_pbuf.h"
 #include "shared_info.h"
 #include "dll_init.h"
-#include "cygmalloc.h"
 #include "ntdll.h"
 
 #define NPIDS_HELD 4
@@ -296,8 +295,6 @@ frok::parent (volatile char * volatile stack_here)
   si.lpReserved2 = (LPBYTE) &ch;
   si.cbReserved2 = sizeof (ch);
 
-  bool locked = __malloc_lock ();
-
   /* Remove impersonation */
   cygheap->user.deimpersonate ();
   fix_impersonation = true;
@@ -448,9 +445,6 @@ frok::parent (volatile char * volatile stack_here)
 		   "stack", stack_here, ch.stackbase,
 		   impure, impure_beg, impure_end,
 		   NULL);
-
-  __malloc_unlock ();
-  locked = false;
   if (!rc)
     {
       this_errno = get_errno ();
@@ -533,8 +527,6 @@ cleanup:
 
   if (fix_impersonation)
     cygheap->user.reimpersonate ();
-  if (locked)
-    __malloc_unlock ();
 
   /* Remember to de-allocate the fd table. */
   if (hchild)
diff --git a/winsup/cygwin/malloc_wrapper.cc b/winsup/cygwin/malloc_wrapper.cc
index 3b245800a..51dbf8a59 100644
--- a/winsup/cygwin/malloc_wrapper.cc
+++ b/winsup/cygwin/malloc_wrapper.cc
@@ -27,6 +27,34 @@ extern "C" struct mallinfo dlmallinfo ();
 static bool use_internal = true;
 static bool internal_malloc_determined;
 
+#if MSPACES
+/* If mspaces (thread-specific memory pools) are enabled, use a thread-
+   local variable to store a pointer to the calling thread's mspace.
+
+   On any use of a malloc-family function, if the appropriate mspace cannot
+   be determined, the general (non-mspace) form of the corresponding malloc
+   function is substituted.  This is not expected to happen often.
+*/
+static NO_COPY DWORD tls_mspace; // index into thread's TLS array
+
+static void *
+get_current_mspace ()
+{
+  if (unlikely (tls_mspace == 0))
+    return 0;
+
+  void *m = TlsGetValue (tls_mspace);
+  if (unlikely (m == 0))
+    {
+      m = create_mspace (MSPACE_SIZE, 0);
+      if (!m)
+        return 0;
+      TlsSetValue (tls_mspace, m);
+    }
+  return m;
+}
+#endif
+
 /* These routines are used by the application if it
    doesn't provide its own malloc. */
 
@@ -34,28 +62,40 @@ extern "C" void
 free (void *p)
 {
   malloc_printf ("(%p), called by %p", p, caller_return_address ());
-  if (!use_internal)
-    user_data->free (p);
-  else
+  if (likely (use_internal))
     {
-      __malloc_lock ();
+#if MSPACES
+      void *m = get_current_mspace ();
+      if (likely (m))
+	mspace_free (m, p);
+      else
+	dlfree (p);
+#else
       dlfree (p);
-      __malloc_unlock ();
+#endif
     }
+  else
+    user_data->free (p);
 }
 
 extern "C" void *
 malloc (size_t size)
 {
   void *res;
-  if (!use_internal)
-    res = user_data->malloc (size);
-  else
+  if (likely (use_internal))
     {
-      __malloc_lock ();
+#if MSPACES
+      void *m = get_current_mspace ();
+      if (likely (m))
+	res = mspace_malloc (m, size);
+      else
+	res = dlmalloc (size);
+#else
       res = dlmalloc (size);
-      __malloc_unlock ();
+#endif
     }
+  else
+    res = user_data->malloc (size);
   malloc_printf ("(%ld) = %p, called by %p", size, res,
 					     caller_return_address ());
   return res;
@@ -65,14 +105,20 @@ extern "C" void *
 realloc (void *p, size_t size)
 {
   void *res;
-  if (!use_internal)
-    res = user_data->realloc (p, size);
-  else
+  if (likely (use_internal))
     {
-      __malloc_lock ();
+#if MSPACES
+      void *m = get_current_mspace ();
+      if (likely (m))
+	res = mspace_realloc (m, p, size);
+      else
+        res = dlrealloc (p, size);
+#else
       res = dlrealloc (p, size);
-      __malloc_unlock ();
+#endif
     }
+  else
+    res = user_data->realloc (p, size);
   malloc_printf ("(%p, %ld) = %p, called by %p", p, size, res,
 						 caller_return_address ());
   return res;
@@ -93,14 +139,20 @@ extern "C" void *
 calloc (size_t nmemb, size_t size)
 {
   void *res;
-  if (!use_internal)
-    res = user_data->calloc (nmemb, size);
-  else
+  if (likely (use_internal))
     {
-      __malloc_lock ();
+#if MSPACES
+      void *m = get_current_mspace ();
+      if (likely (m))
+	res = mspace_calloc (m, nmemb, size);
+      else
+        res = dlcalloc (nmemb, size);
+#else
       res = dlcalloc (nmemb, size);
-      __malloc_unlock ();
+#endif
     }
+  else
+    res = user_data->calloc (nmemb, size);
   malloc_printf ("(%ld, %ld) = %p, called by %p", nmemb, size, res,
 						  caller_return_address ());
   return res;
@@ -109,38 +161,50 @@ calloc (size_t nmemb, size_t size)
 extern "C" int
 posix_memalign (void **memptr, size_t alignment, size_t bytes)
 {
-  save_errno save;
-
   void *res;
-  if (!use_internal)
+  if (likely (use_internal))
+    {
+      if ((alignment & (alignment - 1)) != 0)
+	return EINVAL;
+#if MSPACES
+      void *m = get_current_mspace ();
+      if (likely (m))
+	res = mspace_memalign (m, alignment, bytes);
+      else
+	res = dlmemalign (alignment, bytes);
+#else
+      res = dlmemalign (alignment, bytes);
+#endif
+      if (!res)
+	return ENOMEM;
+      *memptr = res;
+      return 0;
+    }
+  else
     return user_data->posix_memalign (memptr, alignment, bytes);
-  if ((alignment & (alignment - 1)) != 0)
-    return EINVAL;
-  __malloc_lock ();
-  res = dlmemalign (alignment, bytes);
-  __malloc_unlock ();
-  if (!res)
-    return ENOMEM;
-  *memptr = res;
-  return 0;
 }
 
 extern "C" void *
 memalign (size_t alignment, size_t bytes)
 {
   void *res;
-  if (!use_internal)
+  if (likely (use_internal))
     {
-      set_errno (ENOSYS);
-      res = NULL;
+#if MSPACES
+      void *m = get_current_mspace ();
+      if (likely (m))
+	res = mspace_memalign (m, alignment, bytes);
+      else
+        res = dlmemalign (alignment, bytes);
+#else
+      res = dlmemalign (alignment, bytes);
+#endif
     }
   else
     {
-      __malloc_lock ();
-      res = dlmemalign (alignment, bytes);
-      __malloc_unlock ();
+      set_errno (ENOSYS);
+      res = NULL;
     }
-
   return res;
 }
 
@@ -148,18 +212,28 @@ extern "C" void *
 valloc (size_t bytes)
 {
   void *res;
-  if (!use_internal)
+  if (likely (use_internal))
     {
-      set_errno (ENOSYS);
-      res = NULL;
+#if MSPACES
+      static size_t syspagesize = 0;
+      if (unlikely (syspagesize == 0))
+	syspagesize = wincap.allocation_granularity ();
+
+      /* there is no mspace_valloc(), so fake it with *memalign() */
+      void *m = get_current_mspace ();
+      if (likely (m))
+	res = mspace_memalign (m, syspagesize, bytes);
+      else
+	res = dlmemalign (syspagesize, bytes);
+#else
+      res = dlvalloc (bytes);
+#endif
     }
   else
     {
-      __malloc_lock ();
-      res = dlvalloc (bytes);
-      __malloc_unlock ();
+      set_errno (ENOSYS);
+      res = NULL;
     }
-
   return res;
 }
 
@@ -167,18 +241,17 @@ extern "C" size_t
 malloc_usable_size (void *p)
 {
   size_t res;
-  if (!use_internal)
+  if (likely (use_internal))
+#if MSPACES
+    res = mspace_usable_size (p);
+#else
+    res = dlmalloc_usable_size (p);
+#endif
+  else
     {
       set_errno (ENOSYS);
       res = 0;
     }
-  else
-    {
-      __malloc_lock ();
-      res = dlmalloc_usable_size (p);
-      __malloc_unlock ();
-    }
-
   return res;
 }
 
@@ -186,18 +259,23 @@ extern "C" int
 malloc_trim (size_t pad)
 {
   size_t res;
-  if (!use_internal)
+  if (likely (use_internal))
     {
-      set_errno (ENOSYS);
-      res = 0;
+#if MSPACES
+      void *m = get_current_mspace ();
+      if (likely (m))
+	res = mspace_trim (m, pad);
+      else
+        res = dlmalloc_trim (pad);
+#else
+      res = dlmalloc_trim (pad);
+#endif
     }
   else
     {
-      __malloc_lock ();
-      res = dlmalloc_trim (pad);
-      __malloc_unlock ();
+      set_errno (ENOSYS);
+      res = 0;
     }
-
   return res;
 }
 
@@ -205,51 +283,61 @@ extern "C" int
 mallopt (int p, int v)
 {
   int res;
-  if (!use_internal)
+  if (likely (use_internal))
+#if MSPACES
+    res = mspace_mallopt (p, v);
+#else
+    res = dlmallopt (p, v);
+#endif
+  else
     {
       set_errno (ENOSYS);
       res = 0;
     }
-  else
-    {
-      __malloc_lock ();
-      res = dlmallopt (p, v);
-      __malloc_unlock ();
-    }
-
   return res;
 }
 
 extern "C" void
 malloc_stats ()
 {
-  if (!use_internal)
-    set_errno (ENOSYS);
-  else
+  if (likely (use_internal))
     {
-      __malloc_lock ();
+#if MSPACES
+      void *m = get_current_mspace ();
+      if (likely (m))
+	mspace_malloc_stats (m);
+      else
+        dlmalloc_stats ();
+#else
       dlmalloc_stats ();
-      __malloc_unlock ();
+#endif
     }
+  else
+    set_errno (ENOSYS);
 }
 
 extern "C" struct mallinfo
 mallinfo ()
 {
-  struct mallinfo m;
-  if (!use_internal)
+  struct mallinfo mal;
+  if (likely (use_internal))
     {
-      memset (&m, 0, sizeof m);
-      set_errno (ENOSYS);
+#if MSPACES
+      void *m = get_current_mspace ();
+      if (likely (m))
+	mal = mspace_mallinfo (m);
+      else
+        mal = dlmallinfo ();
+#else
+      mal = dlmallinfo ();
+#endif
     }
   else
     {
-      __malloc_lock ();
-      m = dlmallinfo ();
-      __malloc_unlock ();
+      memset (&mal, 0, sizeof mal);
+      set_errno (ENOSYS);
     }
-
-  return m;
+  return mal;
 }
 
 extern "C" char *
@@ -262,20 +350,9 @@ strdup (const char *s)
   return p;
 }
 
-/* We use a critical section to lock access to the malloc data
-   structures.  This permits malloc to be called from different
-   threads.  Note that it does not make malloc reentrant, and it does
-   not permit a signal handler to call malloc.  The malloc code in
-   newlib will call __malloc_lock and __malloc_unlock at appropriate
-   times.  */
-
-muto NO_COPY mallock;
-
 void
 malloc_init ()
 {
-  mallock.init ("mallock");
-
   /* Check if malloc is provided by application. If so, redirect all
      calls to malloc/free/realloc to application provided. This may
      happen if some other dll calls cygwin's malloc, but main code provides
@@ -290,6 +367,15 @@ malloc_init ()
       malloc_printf ("using %s malloc", use_internal ? "internal" : "external");
       internal_malloc_determined = true;
     }
+
+#if MSPACES
+  if (use_internal)
+    {
+      tls_mspace = TlsAlloc ();
+      if (tls_mspace == 0)
+	api_fatal ("malloc_init() couldn't init tls_mspace");
+    }
+#endif
 }
 
 extern "C" void
-- 
2.29.2


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Cygwin: Interim malloc speedup -- addendum
  2020-12-22  4:53 [PATCH] Cygwin: Interim malloc speedup Mark Geisert
@ 2020-12-24  8:28 ` Mark Geisert
  2021-01-11 12:18 ` [PATCH] Cygwin: Interim malloc speedup Corinna Vinschen
  1 sibling, 0 replies; 5+ messages in thread
From: Mark Geisert @ 2020-12-24  8:28 UTC (permalink / raw)
  To: cygwin-patches

I could easily provide an updated patch without the MSPACES stuff, given that that 
aspect is not really functional at this point.  Just let me know.

..mark

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Cygwin: Interim malloc speedup
  2020-12-22  4:53 [PATCH] Cygwin: Interim malloc speedup Mark Geisert
  2020-12-24  8:28 ` [PATCH] Cygwin: Interim malloc speedup -- addendum Mark Geisert
@ 2021-01-11 12:18 ` Corinna Vinschen
  2021-01-18  6:47   ` Mark Geisert
  1 sibling, 1 reply; 5+ messages in thread
From: Corinna Vinschen @ 2021-01-11 12:18 UTC (permalink / raw)
  To: cygwin-patches

Hi Mark,

Happy New Year!

On Dec 21 20:53, Mark Geisert wrote:
> Replaces function-level lock with data-level lock provided by existing
> dlmalloc.  Sets up to enable dlmalloc's MSPACES, but does not yet enable
> them due to visible but uninvestigated issues.
> 
> Single-thread applications may or may not see a performance gain,
> depending on how heavily it uses the malloc functions.  Multi-thread
> apps will likely see a performance gain.
> 
> ---
>  winsup/cygwin/cygmalloc.h       |  28 +++-
>  winsup/cygwin/fork.cc           |   8 -
>  winsup/cygwin/malloc_wrapper.cc | 274 +++++++++++++++++++++-----------
>  3 files changed, 202 insertions(+), 108 deletions(-)
> 
> diff --git a/winsup/cygwin/cygmalloc.h b/winsup/cygwin/cygmalloc.h
> index 84bad824c..67a9f3b3f 100644
> --- a/winsup/cygwin/cygmalloc.h
> +++ b/winsup/cygwin/cygmalloc.h
> @@ -26,20 +26,36 @@ void dlmalloc_stats ();
>  #define MALLOC_ALIGNMENT ((size_t)16U)
>  #endif
>  
> +/* As of Cygwin 3.2.0 we could enable dlmalloc's MSPACES */
> +#define MSPACES 0 // DO NOT ENABLE: cygserver, XWin, etc will malfunction
> +
>  #if defined (DLMALLOC_VERSION)	/* Building malloc.cc */
>  
>  extern "C" void __set_ENOMEM ();
>  void *mmap64 (void *, size_t, int, int, int, off_t);
>  # define mmap mmap64
> +
> +/* These defines tune the dlmalloc implementation in malloc.cc */
>  # define MALLOC_FAILURE_ACTION	__set_ENOMEM ()
>  # define USE_DL_PREFIX 1
> +# define USE_LOCKS 1

Just enabling USE_LOCKS looks wrong to me.  Before enabling USE_LOCKS,
you should check how the actual locking is performed.  For non WIN32,
that will be pthread_mutex_lock/unlock, which may not be feasible,
because it may break expectations during fork.

What you may want to do is setting USE_LOCKS to 2, and defining your own
MLOCK_T/ACQUIRE_LOCK/... macros (in the `#if USE_LOCKS > 1' branch of
the malloc source, see lines 1798ff), using a type which is non-critical
during forking, as well as during process initialization.  Win32 fast
R/W Locks come to mind and adding them should be pretty straight-forward.
This may also allow MSPACES to work OOTB.

> +# define LOCK_AT_FORK 0

This looks dangerous.  You're removing the locking from fork entirely
*and* the lock isn't re-initialized in the child.  This reinitializing
was no problem before because mallock was NO_COPY, but it's a problem
now because the global malloc_state _gm_ isn't (and mustn't).  The
current implementation calling

  #if LOCK_AT_FORK
      pthread_atfork(&pre_fork, &post_fork_parent, &post_fork_child);
  #endif

should do the trick, assuming the USE_LOCKS stuff is working as desired.

> [...]
> +#if MSPACES
> +/* If mspaces (thread-specific memory pools) are enabled, use a thread-
> +   local variable to store a pointer to the calling thread's mspace.
> +
> +   On any use of a malloc-family function, if the appropriate mspace cannot
> +   be determined, the general (non-mspace) form of the corresponding malloc
> +   function is substituted.  This is not expected to happen often.
> +*/
> +static NO_COPY DWORD tls_mspace; // index into thread's TLS array
> +
> +static void *
> +get_current_mspace ()
> +{
> +  if (unlikely (tls_mspace == 0))
> +    return 0;
> +
> +  void *m = TlsGetValue (tls_mspace);
> +  if (unlikely (m == 0))
> +    {
> +      m = create_mspace (MSPACE_SIZE, 0);
> +      if (!m)
> +        return 0;
> +      TlsSetValue (tls_mspace, m);
> +    }
> +  return m;
> +}
> +#endif

Please define a new slot in _cygtls keeping the memory address returned
by create_mspace.  You don't have to call TlsGetValue/TlsSetValue.


Thanks,
Corinna

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Cygwin: Interim malloc speedup
  2021-01-11 12:18 ` [PATCH] Cygwin: Interim malloc speedup Corinna Vinschen
@ 2021-01-18  6:47   ` Mark Geisert
  2021-01-18 12:50     ` Corinna Vinschen
  0 siblings, 1 reply; 5+ messages in thread
From: Mark Geisert @ 2021-01-18  6:47 UTC (permalink / raw)
  To: cygwin-patches

Hi Corinna,
Happy New Year back at you!  I'm very glad to see you posting again!

Corinna Vinschen via Cygwin-patches wrote:
> Hi Mark,
> 
> Happy New Year!
> 
> On Dec 21 20:53, Mark Geisert wrote:
>> Replaces function-level lock with data-level lock provided by existing
>> dlmalloc.  Sets up to enable dlmalloc's MSPACES, but does not yet enable
>> them due to visible but uninvestigated issues.
>>
>> Single-thread applications may or may not see a performance gain,
>> depending on how heavily it uses the malloc functions.  Multi-thread
>> apps will likely see a performance gain.
[...]
>> diff --git a/winsup/cygwin/cygmalloc.h b/winsup/cygwin/cygmalloc.h
>> index 84bad824c..67a9f3b3f 100644
>> --- a/winsup/cygwin/cygmalloc.h
>> +++ b/winsup/cygwin/cygmalloc.h
[...]
>> +/* These defines tune the dlmalloc implementation in malloc.cc */
>>   # define MALLOC_FAILURE_ACTION	__set_ENOMEM ()
>>   # define USE_DL_PREFIX 1
>> +# define USE_LOCKS 1
> 
> Just enabling USE_LOCKS looks wrong to me.  Before enabling USE_LOCKS,
> you should check how the actual locking is performed.  For non WIN32,
> that will be pthread_mutex_lock/unlock, which may not be feasible,
> because it may break expectations during fork.

I did investigate this before setting it, and I've been running with '#define 
USE_LOCKS 1' for many weeks and haven't seen any memory issues of any kind. 
Malloc multi-thread stress testing, fork() stress testing, Cygwin DLL builds, 
Python and binutils builds, routine X usage; all OK.  (Once I straightened out 
sped-up mkimport to actually do what Jon T suggested, blush.)

> What you may want to do is setting USE_LOCKS to 2, and defining your own
> MLOCK_T/ACQUIRE_LOCK/... macros (in the `#if USE_LOCKS > 1' branch of
> the malloc source, see lines 1798ff), using a type which is non-critical
> during forking, as well as during process initialization.  Win32 fast
> R/W Locks come to mind and adding them should be pretty straight-forward.
> This may also allow MSPACES to work OOTB.

With '#define USE_LOCKS 1' the tangled mess of #if-logic in malloc.cc resolves on 
Cygwin to using pthread_mutex_locks, so that seems to be OK as-is unless what 
you're suggesting is preferable for speed (or MSPACES when I get to that).
>> +# define LOCK_AT_FORK 0
> 
> This looks dangerous.  You're removing the locking from fork entirely
> *and* the lock isn't re-initialized in the child.  This reinitializing
> was no problem before because mallock was NO_COPY, but it's a problem
> now because the global malloc_state _gm_ isn't (and mustn't).  The
> current implementation calling
> 
>    #if LOCK_AT_FORK
>        pthread_atfork(&pre_fork, &post_fork_parent, &post_fork_child);
>    #endif
> 
> should do the trick, assuming the USE_LOCKS stuff is working as desired.

I don't remember what led me to #define LOCK_AT_FORK 0, but in the new light of 
this year it's obviously wrong.  I've #define'd it 1.

>> [...]
>> +#if MSPACES
>> +/* If mspaces (thread-specific memory pools) are enabled, use a thread-
>> +   local variable to store a pointer to the calling thread's mspace.
>> +
>> +   On any use of a malloc-family function, if the appropriate mspace cannot
>> +   be determined, the general (non-mspace) form of the corresponding malloc
>> +   function is substituted.  This is not expected to happen often.
>> +*/
>> +static NO_COPY DWORD tls_mspace; // index into thread's TLS array
>> +
>> +static void *
>> +get_current_mspace ()
>> +{
>> +  if (unlikely (tls_mspace == 0))
>> +    return 0;
>> +
>> +  void *m = TlsGetValue (tls_mspace);
>> +  if (unlikely (m == 0))
>> +    {
>> +      m = create_mspace (MSPACE_SIZE, 0);
>> +      if (!m)
>> +        return 0;
>> +      TlsSetValue (tls_mspace, m);
>> +    }
>> +  return m;
>> +}
>> +#endif
> 
> Please define a new slot in _cygtls keeping the memory address returned
> by create_mspace.  You don't have to call TlsGetValue/TlsSetValue.

Thank you for repeating this suggestion.  I now understand why it's better.

I'm going to delay submitting the v2 patch until I see where the investigation of 
Achim's malloc testcase (running zstd on 1600 files, for instance) leads.  I'm 
about to respond to his thread in cygwin-apps.
Thanks & Regards,

..mark

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Cygwin: Interim malloc speedup
  2021-01-18  6:47   ` Mark Geisert
@ 2021-01-18 12:50     ` Corinna Vinschen
  0 siblings, 0 replies; 5+ messages in thread
From: Corinna Vinschen @ 2021-01-18 12:50 UTC (permalink / raw)
  To: cygwin-patches

Hi Mark,

On Jan 17 22:47, Mark Geisert wrote:
> Hi Corinna,
> Happy New Year back at you!  I'm very glad to see you posting again!

Yeah, I took a longer timeout over the holiday season.

> Corinna Vinschen via Cygwin-patches wrote:
> > Hi Mark,
> > 
> > Happy New Year!
> > 
> > On Dec 21 20:53, Mark Geisert wrote:
> > > Replaces function-level lock with data-level lock provided by existing
> > > dlmalloc.  Sets up to enable dlmalloc's MSPACES, but does not yet enable
> > > them due to visible but uninvestigated issues.
> > > 
> > > Single-thread applications may or may not see a performance gain,
> > > depending on how heavily it uses the malloc functions.  Multi-thread
> > > apps will likely see a performance gain.
> [...]
> > > diff --git a/winsup/cygwin/cygmalloc.h b/winsup/cygwin/cygmalloc.h
> > > index 84bad824c..67a9f3b3f 100644
> > > --- a/winsup/cygwin/cygmalloc.h
> > > +++ b/winsup/cygwin/cygmalloc.h
> [...]
> > > +/* These defines tune the dlmalloc implementation in malloc.cc */
> > >   # define MALLOC_FAILURE_ACTION	__set_ENOMEM ()
> > >   # define USE_DL_PREFIX 1
> > > +# define USE_LOCKS 1
> > 
> > Just enabling USE_LOCKS looks wrong to me.  Before enabling USE_LOCKS,
> > you should check how the actual locking is performed.  For non WIN32,
> > that will be pthread_mutex_lock/unlock, which may not be feasible,
> > because it may break expectations during fork.
> 
> I did investigate this before setting it, and I've been running with
> '#define USE_LOCKS 1' for many weeks and haven't seen any memory issues of
> any kind. Malloc multi-thread stress testing, fork() stress testing, Cygwin
> DLL builds, Python and binutils builds, routine X usage; all OK.  (Once I
> straightened out sped-up mkimport to actually do what Jon T suggested,
> blush.)
> 
> > What you may want to do is setting USE_LOCKS to 2, and defining your own
> > MLOCK_T/ACQUIRE_LOCK/... macros (in the `#if USE_LOCKS > 1' branch of
> > the malloc source, see lines 1798ff), using a type which is non-critical
> > during forking, as well as during process initialization.  Win32 fast
> > R/W Locks come to mind and adding them should be pretty straight-forward.
> > This may also allow MSPACES to work OOTB.
> 
> With '#define USE_LOCKS 1' the tangled mess of #if-logic in malloc.cc
> resolves on Cygwin to using pthread_mutex_locks, so that seems to be OK
> as-is unless what you're suggesting is preferable for speed (or MSPACES when
> I get to that).

Admittedly, I'm not sure if pthread mutexes pose a problem here, I'm
just cautious.

Malloc locking is single-process only and pthread mutexes are adding some
unnecessary overhead (Event object, bookkeeping list, fixup_after_fork
handling).  Win32 SRW Locks, especially the exclusive type, is much
faster and also easy to use, unless you need recursive locking.


Thanks,
Corinna

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-01-18 12:50 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-22  4:53 [PATCH] Cygwin: Interim malloc speedup Mark Geisert
2020-12-24  8:28 ` [PATCH] Cygwin: Interim malloc speedup -- addendum Mark Geisert
2021-01-11 12:18 ` [PATCH] Cygwin: Interim malloc speedup Corinna Vinschen
2021-01-18  6:47   ` Mark Geisert
2021-01-18 12:50     ` Corinna Vinschen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).