public inbox for libc-alpha@sourceware.org
* [PATCH v1 1/1] memalign: Support scanning for aligned chunks.
@ 2022-07-14  3:58 DJ Delorie
  2022-07-19  2:54 ` Carlos O'Donell
  0 siblings, 1 reply; 38+ messages in thread
From: DJ Delorie @ 2022-07-14  3:58 UTC (permalink / raw)
  To: libc-alpha


This patch adds a chunk scanning algorithm to the _int_memalign code
path that reduces heap fragmentation by reusing already aligned chunks
instead of always looking for chunks of larger sizes and splitting
them.

The goal is to fix the pathological use cases where heaps grow
continuously in workloads that are heavy users of memalign.
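
For illustration only (the sizes, counts, and workload shape below are
invented for this note, not taken from an actual report), a
memalign-heavy loop of roughly this shape keeps freeing chunks that
are already aligned, yet without the scanning step each new request is
still satisfied by splitting a larger chunk:

#include <malloc.h>
#include <stdlib.h>

int
main (void)
{
  void *keep[1000];

  for (int i = 0; i < 1000; i++)
    {
      /* Without aligned-chunk reuse, this request tends to be carved
         out of a fresh, larger chunk even though earlier iterations
         freed already-aligned chunks of the same size.  */
      void *a = memalign (64, 1000);

      /* A small allocation between the fragments keeps the freed
         pieces from coalescing back into larger chunks.  */
      keep[i] = malloc (32);

      free (a);
    }

  for (int i = 0; i < 1000; i++)
    free (keep[i]);

  return 0;
}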

diff --git a/malloc/Makefile b/malloc/Makefile
index 4e32de2a0b..7a25f2d781 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -43,6 +43,7 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc tst-obstack \
 	 tst-tcfree1 tst-tcfree2 tst-tcfree3 \
 	 tst-safe-linking \
 	 tst-mallocalign1 \
+	 tst-memalign-2
 
 tests-static := \
 	 tst-interpose-static-nothread \
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 12908b8f97..219707ebec 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3557,6 +3557,32 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
       alignment = a;
     }
 
+#if USE_TCACHE
+  {
+    size_t tbytes;
+    tbytes = checked_request2size (bytes);
+    if (tbytes == 0)
+      {
+	__set_errno (ENOMEM);
+	return NULL;
+      }
+    size_t tc_idx = csize2tidx (tbytes);
+
+    MAYBE_INIT_TCACHE ();
+
+    if (tc_idx < mp_.tcache_bins
+	&& tcache
+	&& tcache->counts[tc_idx] > 0
+	&& ((intptr_t)tcache->entries[tc_idx] & (alignment - 1)) == 0)
+      {
+	void *victim = tcache_get (tc_idx);
+	if (__glibc_unlikely (misaligned_chunk (victim)))
+	  malloc_printerr ("_mid_memalign(): unaligned tcache chunk detected");
+	return tag_new_usable (victim);
+      }
+  }
+#endif
+
   if (SINGLE_THREAD_P)
     {
       p = _int_memalign (&main_arena, alignment, bytes);
@@ -4937,6 +4963,43 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
    ------------------------------ memalign ------------------------------
  */
 
+/* Returns 0 if the chunk is not and does not contain the requested
+   aligned sub-chunk, else returns the amount of "waste" from
+   trimming.  BYTES is the *user* byte size, not the chunk byte
+   size.  */
+static int
+chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
+{
+  void *m = chunk2mem (p);
+  INTERNAL_SIZE_T size = memsize (p);
+  void *aligned_m = m;
+
+  if (__glibc_unlikely (misaligned_chunk (p)))
+    malloc_printerr ("_int_memalign(): unaligned chunk detected");
+
+  aligned_m = PTR_ALIGN_UP (m, alignment);
+
+  INTERNAL_SIZE_T front_extra = (intptr_t) aligned_m - (intptr_t) m;
+
+  /* We can't trim off the front as it's too small.  */
+  if (front_extra > 0 && front_extra < MINSIZE)
+    return 0;
+
+  /* If it's a perfect fit, it's an exception to the return value rule
+     (we would return zero waste, which looks like "not usable"), so
+     handle it here by returning a small non-zero value instead.  */
+  if (size == bytes && front_extra == 0)
+    return 1;
+
+  /* If the block we need fits in the chunk, calculate total waste.  */
+  if (size > bytes + front_extra)
+    return size - bytes;
+
+  /* Can't use this chunk.  */ 
+  return 0;
+}
+
+/* BYTES is user requested bytes, not requested chunksize bytes.  */
 static void *
 _int_memalign (mstate av, size_t alignment, size_t bytes)
 {
@@ -4950,8 +5013,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
   mchunkptr remainder;            /* spare room at end to split off */
   unsigned long remainder_size;   /* its size */
   INTERNAL_SIZE_T size;
-
-
+  mchunkptr victim;
 
   nb = checked_request2size (bytes);
   if (nb == 0)
@@ -4960,29 +5022,142 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
       return NULL;
     }
 
-  /*
-     Strategy: find a spot within that chunk that meets the alignment
+  /* We can't check tcache here because we hold the arena lock, which
+     tcache doesn't expect.  We expect it has been checked
+     earlier.  */
+
+  /* Strategy: search the bins looking for an existing block that
+     meets our needs.  We scan a range of bins from "exact size" to
+     "just under 2x", spanning the small/large barrier if needed.  If
+     we don't find anything in those bins, the common malloc code will
+     scan starting at 2x.  */
+
+  /* This will be set if we found a candidate chunk.  */
+  victim = NULL;
+
+  /* Fast bins are singly-linked, hard to remove a chunk from the middle
+     and unlikely to meet our alignment requirements.  We have not done
+     any experimentation with searching for aligned fastbins.  */
+
+  int first_bin_index;
+  int first_largebin_index;
+  int last_bin_index;
+
+  if (in_smallbin_range (nb))
+    first_bin_index = smallbin_index (nb);
+  else
+    first_bin_index = largebin_index (nb);
+
+  if (in_smallbin_range (nb * 2))
+    last_bin_index = smallbin_index (nb * 2);
+  else
+    last_bin_index = largebin_index (nb * 2);
+
+  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
+
+  int victim_index;                 /* its bin index */
+
+  for (victim_index = first_bin_index;
+       victim_index < last_bin_index;
+       victim_index ++)
+    {
+      victim = NULL;
+
+      if (victim_index < first_largebin_index)
+    {
+      /* Check small bins.  Small bin chunks are doubly-linked despite
+	 being the same size.  */
+
+      mchunkptr fwd;                    /* misc temp for linking */
+      mchunkptr bck;                    /* misc temp for linking */
+
+      bck = bin_at (av, victim_index);
+      fwd = bck->fd;
+      while (fwd != bck)
+	{
+	  if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
+	    {
+	      victim = fwd;
+
+	      /* Unlink it */
+	      victim->fd->bk = victim->bk;
+	      victim->bk->fd = victim->fd;
+	      break;
+	    }
+
+	  fwd = fwd->fd;
+	}
+    }
+  else
+    {
+      /* Check large bins.  */
+      mchunkptr fwd;                    /* misc temp for linking */
+      mchunkptr bck;                    /* misc temp for linking */
+      mchunkptr best = NULL;
+      size_t best_size = 0;
+
+      bck = bin_at (av, victim_index);
+      fwd = bck->fd;
+
+      while (fwd != bck)
+	{
+	  int extra;
+
+	  if (chunksize (fwd) < nb)
+	      break;
+	  extra = chunk_ok_for_memalign (fwd, alignment, bytes);
+	  if (extra > 0
+	      && (extra <= best_size || best == NULL))
+	    {
+	      best = fwd;
+	      best_size = extra;
+	    }
+
+	  fwd = fwd->fd;
+	}
+      victim = best;
+
+      if (victim != NULL)
+	{
+	  unlink_chunk (av, victim);
+	  break;
+	}
+    }
+
+      if (victim != NULL)
+	break;
+    }
+
+  /* Strategy: find a spot within that chunk that meets the alignment
      request, and then possibly free the leading and trailing space.
-   */
+     This strategy is incredibly costly and can lead to external
+     fragmentation if header and footer chunks are unused.  */
 
-  /* Call malloc with worst case padding to hit alignment. */
+  if (victim != NULL)
+    {
+      p = victim;
+      m = chunk2mem (p);
+      set_inuse (p);
+    }
+  else
+    {
+      /* Call malloc with worst case padding to hit alignment. */
 
-  m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
+      m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
 
-  if (m == 0)
-    return 0;           /* propagate failure */
+      if (m == 0)
+	return 0;           /* propagate failure */
 
-  p = mem2chunk (m);
+      p = mem2chunk (m);
+    }
 
   if ((((unsigned long) (m)) % alignment) != 0)   /* misaligned */
-
-    { /*
-                Find an aligned spot inside chunk.  Since we need to give back
-                leading space in a chunk of at least MINSIZE, if the first
-                calculation places us at a spot with less than MINSIZE leader,
-                we can move to the next aligned spot -- we've allocated enough
-                total room so that this is always possible.
-                 */
+    {
+      /* Find an aligned spot inside chunk.  Since we need to give back
+         leading space in a chunk of at least MINSIZE, if the first
+         calculation places us at a spot with less than MINSIZE leader,
+         we can move to the next aligned spot -- we've allocated enough
+         total room so that this is always possible.  */
       brk = (char *) mem2chunk (((unsigned long) (m + alignment - 1)) &
                                 - ((signed long) alignment));
       if ((unsigned long) (brk - (char *) (p)) < MINSIZE)
diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
new file mode 100644
index 0000000000..04d42a2da2
--- /dev/null
+++ b/malloc/tst-memalign-2.c
@@ -0,0 +1,115 @@
+/* Test for memalign chunk reuse
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <array_length.h>
+
+#include <support/check.h>
+
+typedef struct TestCase {
+  size_t size;
+  size_t alignment;
+  void *ptr1;
+  void *ptr2;
+} TestCase;
+
+static TestCase tcache_allocs[] = {
+  { 24, 8, NULL, NULL },
+  { 24, 16, NULL, NULL },
+  { 128, 32, NULL, NULL }
+};
+#define TN array_length (tcache_allocs)
+
+static TestCase large_allocs[] = {
+  { 23450, 64, NULL, NULL },
+  { 23450, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL }
+};
+#define LN array_length (large_allocs)
+
+void *p;
+
+static int
+do_test (void)
+{
+  int i, j;
+  int count;
+
+  /* TCache test.  */
+
+  for (i = 0; i < TN; ++ i)
+    {
+      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      free (tcache_allocs[i].ptr1);
+      /* This should return the same chunk as was just free'd.  */
+      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      free (tcache_allocs[i].ptr2);
+
+      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
+    }
+
+  /* Large bins test.  */
+
+  for (i = 0; i < LN; ++ i)
+    {
+      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+      /* Keep chunks from combining by fragmenting the heap.  */
+      p = malloc (512);
+    }
+
+  for (i = 0; i < LN; ++ i)
+    free (large_allocs[i].ptr1);
+
+  /* Force the unsorted bins to be scanned and moved to small/large
+     bins.  */
+  p = malloc (60000);
+
+  for (i = 0; i < LN; ++ i)
+    large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+
+  count = 0;
+  for (i = 0; i < LN; ++ i)
+    {
+      int ok = 0;
+      for (j = 0; j < LN; ++ j)
+	if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
+	  ok = 1;
+      if (ok == 1)
+	count ++;
+    }
+
+  /* The allocation algorithm is complicated outside of the memalign
+     logic, so just make sure it's working for most of the
+     allocations.  This avoids possible boundary conditions with
+     empty/full heaps.  */
+  TEST_VERIFY (count > LN / 2);
+
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"



* Re: [PATCH v1 1/1] memalign: Support scanning for aligned chunks.
  2022-07-14  3:58 [PATCH v1 1/1] memalign: Support scanning for aligned chunks DJ Delorie
@ 2022-07-19  2:54 ` Carlos O'Donell
  2022-07-19  3:57   ` [PATCH v2 " DJ Delorie
  0 siblings, 1 reply; 38+ messages in thread
From: Carlos O'Donell @ 2022-07-19  2:54 UTC (permalink / raw)
  To: DJ Delorie, libc-alpha

On 7/13/22 23:58, DJ Delorie via Libc-alpha wrote:
> This patch adds a chunk scanning algorithm to the _int_memalign code
> path that reduces heap fragmentation by reusing already aligned chunks
> instead of always looking for chunks of larger sizes and splitting
> them.
> 
> The goal is to fix the pathological use cases where heaps grow
> continuously in workloads that are heavy users of memalign.

Fails pre-commit CI. Please review :-)

https://www.delorie.com/trybots/32bit/10936/
 

-- 
Cheers,
Carlos.



* [PATCH v2 1/1] memalign: Support scanning for aligned chunks.
  2022-07-19  2:54 ` Carlos O'Donell
@ 2022-07-19  3:57   ` DJ Delorie
  2022-07-19  9:19     ` Florian Weimer
  0 siblings, 1 reply; 38+ messages in thread
From: DJ Delorie @ 2022-07-19  3:57 UTC (permalink / raw)
  To: libc-alpha


[v2: drop the chunk alignment check in the tcache path, since tcache
entries are user pointers rather than chunk pointers; exclude the test
from malloc-check runs because malloc-check bypasses tcache]

This patch adds a chunk scanning algorithm to the _int_memalign code
path that reduces heap fragmentation by reusing already aligned chunks
instead of always looking for chunks of larger sizes and splitting
them.

The goal is to fix the pathological use cases where heaps grow
continuously in workloads that are heavy users of memalign.

Note that tst-memalign-2 checks for tcache operation, which
malloc-check bypasses.

diff --git a/malloc/Makefile b/malloc/Makefile
index 4e32de2a0b..084c408ac7 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -43,6 +43,7 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc tst-obstack \
 	 tst-tcfree1 tst-tcfree2 tst-tcfree3 \
 	 tst-safe-linking \
 	 tst-mallocalign1 \
+	 tst-memalign-2
 
 tests-static := \
 	 tst-interpose-static-nothread \
@@ -72,7 +73,7 @@ test-srcs = tst-mtrace
 # with MALLOC_CHECK_=3 because they expect a specific failure.
 tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
 	tst-mxfast tst-safe-linking \
-	tst-compathooks-off tst-compathooks-on
+	tst-compathooks-off tst-compathooks-on tst-memalign-2
 
 # Run all tests with MALLOC_CHECK_=3
 tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 12908b8f97..14ee98dfa2 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3557,6 +3557,30 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
       alignment = a;
     }
 
+#if USE_TCACHE
+  {
+    size_t tbytes;
+    tbytes = checked_request2size (bytes);
+    if (tbytes == 0)
+      {
+	__set_errno (ENOMEM);
+	return NULL;
+      }
+    size_t tc_idx = csize2tidx (tbytes);
+
+    MAYBE_INIT_TCACHE ();
+
+    if (tc_idx < mp_.tcache_bins
+	&& tcache
+	&& tcache->counts[tc_idx] > 0
+	&& ((intptr_t)tcache->entries[tc_idx] & (alignment - 1)) == 0)
+      {
+	void *victim = tcache_get (tc_idx);
+	return tag_new_usable (victim);
+      }
+  }
+#endif
+
   if (SINGLE_THREAD_P)
     {
       p = _int_memalign (&main_arena, alignment, bytes);
@@ -4937,6 +4961,43 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
    ------------------------------ memalign ------------------------------
  */
 
+/* Returns 0 if the chunk is not and does not contain the requested
+   aligned sub-chunk, else returns the amount of "waste" from
+   trimming.  BYTES is the *user* byte size, not the chunk byte
+   size.  */
+static int
+chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
+{
+  void *m = chunk2mem (p);
+  INTERNAL_SIZE_T size = memsize (p);
+  void *aligned_m = m;
+
+  if (__glibc_unlikely (misaligned_chunk (p)))
+    malloc_printerr ("_int_memalign(): unaligned chunk detected");
+
+  aligned_m = PTR_ALIGN_UP (m, alignment);
+
+  INTERNAL_SIZE_T front_extra = (intptr_t) aligned_m - (intptr_t) m;
+
+  /* We can't trim off the front as it's too small.  */
+  if (front_extra > 0 && front_extra < MINSIZE)
+    return 0;
+
+  /* If it's a perfect fit, it's an exception to the return value rule
+     (we would return zero waste, which looks like "not usable"), so
+     handle it here by returning a small non-zero value instead.  */
+  if (size == bytes && front_extra == 0)
+    return 1;
+
+  /* If the block we need fits in the chunk, calculate total waste.  */
+  if (size > bytes + front_extra)
+    return size - bytes;
+
+  /* Can't use this chunk.  */ 
+  return 0;
+}
+
+/* BYTES is user requested bytes, not requested chunksize bytes.  */
 static void *
 _int_memalign (mstate av, size_t alignment, size_t bytes)
 {
@@ -4950,8 +5011,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
   mchunkptr remainder;            /* spare room at end to split off */
   unsigned long remainder_size;   /* its size */
   INTERNAL_SIZE_T size;
-
-
+  mchunkptr victim;
 
   nb = checked_request2size (bytes);
   if (nb == 0)
@@ -4960,29 +5020,142 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
       return NULL;
     }
 
-  /*
-     Strategy: find a spot within that chunk that meets the alignment
+  /* We can't check tcache here because we hold the arena lock, which
+     tcache doesn't expect.  We expect it has been checked
+     earlier.  */
+
+  /* Strategy: search the bins looking for an existing block that
+     meets our needs.  We scan a range of bins from "exact size" to
+     "just under 2x", spanning the small/large barrier if needed.  If
+     we don't find anything in those bins, the common malloc code will
+     scan starting at 2x.  */
+
+  /* This will be set if we found a candidate chunk.  */
+  victim = NULL;
+
+  /* Fast bins are singly-linked, hard to remove a chunk from the middle
+     and unlikely to meet our alignment requirements.  We have not done
+     any experimentation with searching for aligned fastbins.  */
+
+  int first_bin_index;
+  int first_largebin_index;
+  int last_bin_index;
+
+  if (in_smallbin_range (nb))
+    first_bin_index = smallbin_index (nb);
+  else
+    first_bin_index = largebin_index (nb);
+
+  if (in_smallbin_range (nb * 2))
+    last_bin_index = smallbin_index (nb * 2);
+  else
+    last_bin_index = largebin_index (nb * 2);
+
+  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
+
+  int victim_index;                 /* its bin index */
+
+  for (victim_index = first_bin_index;
+       victim_index < last_bin_index;
+       victim_index ++)
+    {
+      victim = NULL;
+
+      if (victim_index < first_largebin_index)
+    {
+      /* Check small bins.  Small bin chunks are doubly-linked despite
+	 being the same size.  */
+
+      mchunkptr fwd;                    /* misc temp for linking */
+      mchunkptr bck;                    /* misc temp for linking */
+
+      bck = bin_at (av, victim_index);
+      fwd = bck->fd;
+      while (fwd != bck)
+	{
+	  if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
+	    {
+	      victim = fwd;
+
+	      /* Unlink it */
+	      victim->fd->bk = victim->bk;
+	      victim->bk->fd = victim->fd;
+	      break;
+	    }
+
+	  fwd = fwd->fd;
+	}
+    }
+  else
+    {
+      /* Check large bins.  */
+      mchunkptr fwd;                    /* misc temp for linking */
+      mchunkptr bck;                    /* misc temp for linking */
+      mchunkptr best = NULL;
+      size_t best_size = 0;
+
+      bck = bin_at (av, victim_index);
+      fwd = bck->fd;
+
+      while (fwd != bck)
+	{
+	  int extra;
+
+	  if (chunksize (fwd) < nb)
+	      break;
+	  extra = chunk_ok_for_memalign (fwd, alignment, bytes);
+	  if (extra > 0
+	      && (extra <= best_size || best == NULL))
+	    {
+	      best = fwd;
+	      best_size = extra;
+	    }
+
+	  fwd = fwd->fd;
+	}
+      victim = best;
+
+      if (victim != NULL)
+	{
+	  unlink_chunk (av, victim);
+	  break;
+	}
+    }
+
+      if (victim != NULL)
+	break;
+    }
+
+  /* Strategy: find a spot within that chunk that meets the alignment
      request, and then possibly free the leading and trailing space.
-   */
+     This strategy is incredibly costly and can lead to external
+     fragmentation if header and footer chunks are unused.  */
 
-  /* Call malloc with worst case padding to hit alignment. */
+  if (victim != NULL)
+    {
+      p = victim;
+      m = chunk2mem (p);
+      set_inuse (p);
+    }
+  else
+    {
+      /* Call malloc with worst case padding to hit alignment. */
 
-  m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
+      m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
 
-  if (m == 0)
-    return 0;           /* propagate failure */
+      if (m == 0)
+	return 0;           /* propagate failure */
 
-  p = mem2chunk (m);
+      p = mem2chunk (m);
+    }
 
   if ((((unsigned long) (m)) % alignment) != 0)   /* misaligned */
-
-    { /*
-                Find an aligned spot inside chunk.  Since we need to give back
-                leading space in a chunk of at least MINSIZE, if the first
-                calculation places us at a spot with less than MINSIZE leader,
-                we can move to the next aligned spot -- we've allocated enough
-                total room so that this is always possible.
-                 */
+    {
+      /* Find an aligned spot inside chunk.  Since we need to give back
+         leading space in a chunk of at least MINSIZE, if the first
+         calculation places us at a spot with less than MINSIZE leader,
+         we can move to the next aligned spot -- we've allocated enough
+         total room so that this is always possible.  */
       brk = (char *) mem2chunk (((unsigned long) (m + alignment - 1)) &
                                 - ((signed long) alignment));
       if ((unsigned long) (brk - (char *) (p)) < MINSIZE)
diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
new file mode 100644
index 0000000000..04d42a2da2
--- /dev/null
+++ b/malloc/tst-memalign-2.c
@@ -0,0 +1,115 @@
+/* Test for memalign chunk reuse
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <array_length.h>
+
+#include <support/check.h>
+
+typedef struct TestCase {
+  size_t size;
+  size_t alignment;
+  void *ptr1;
+  void *ptr2;
+} TestCase;
+
+static TestCase tcache_allocs[] = {
+  { 24, 8, NULL, NULL },
+  { 24, 16, NULL, NULL },
+  { 128, 32, NULL, NULL }
+};
+#define TN array_length (tcache_allocs)
+
+static TestCase large_allocs[] = {
+  { 23450, 64, NULL, NULL },
+  { 23450, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL }
+};
+#define LN array_length (large_allocs)
+
+void *p;
+
+static int
+do_test (void)
+{
+  int i, j;
+  int count;
+
+  /* TCache test.  */
+
+  for (i = 0; i < TN; ++ i)
+    {
+      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      free (tcache_allocs[i].ptr1);
+      /* This should return the same chunk as was just free'd.  */
+      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      free (tcache_allocs[i].ptr2);
+
+      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
+    }
+
+  /* Large bins test.  */
+
+  for (i = 0; i < LN; ++ i)
+    {
+      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+      /* Keep chunks from combining by fragmenting the heap.  */
+      p = malloc (512);
+    }
+
+  for (i = 0; i < LN; ++ i)
+    free (large_allocs[i].ptr1);
+
+  /* Force the unsorted bins to be scanned and moved to small/large
+     bins.  */
+  p = malloc (60000);
+
+  for (i = 0; i < LN; ++ i)
+    large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+
+  count = 0;
+  for (i = 0; i < LN; ++ i)
+    {
+      int ok = 0;
+      for (j = 0; j < LN; ++ j)
+	if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
+	  ok = 1;
+      if (ok == 1)
+	count ++;
+    }
+
+  /* The allocation algorithm is complicated outside of the memalign
+     logic, so just make sure it's working for most of the
+     allocations.  This avoids possible boundary conditions with
+     empty/full heaps.  */
+  TEST_VERIFY (count > LN / 2);
+
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"



* Re: [PATCH v2 1/1] memalign: Support scanning for aligned chunks.
  2022-07-19  3:57   ` [PATCH v2 " DJ Delorie
@ 2022-07-19  9:19     ` Florian Weimer
  2022-07-19 17:32       ` DJ Delorie
  2022-07-20  0:32       ` [PATCH v3 " DJ Delorie
  0 siblings, 2 replies; 38+ messages in thread
From: Florian Weimer @ 2022-07-19  9:19 UTC (permalink / raw)
  To: DJ Delorie via Libc-alpha

* DJ Delorie via Libc-alpha:

> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 12908b8f97..14ee98dfa2 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3557,6 +3557,30 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
>        alignment = a;
>      }
>  
> +#if USE_TCACHE
> +  {
> +    size_t tbytes;
> +    tbytes = checked_request2size (bytes);
> +    if (tbytes == 0)
> +      {
> +	__set_errno (ENOMEM);
> +	return NULL;
> +      }
> +    size_t tc_idx = csize2tidx (tbytes);
> +
> +    MAYBE_INIT_TCACHE ();
> +
> +    if (tc_idx < mp_.tcache_bins
> +	&& tcache
> +	&& tcache->counts[tc_idx] > 0
> +	&& ((intptr_t)tcache->entries[tc_idx] & (alignment - 1)) == 0)
> +      {
> +	void *victim = tcache_get (tc_idx);
> +	return tag_new_usable (victim);
> +      }
> +  }
> +#endif

MAYBE_INIT_TCACHE does not seem necessary.  If the tcache is empty,
there's no need to initialize it and scane it.  And why not scan the
entire chain here?  It's another source of missed chunk reuse.

The tcache null check should be written as tcache != NULL.

Thanks,
Florian



* Re: [PATCH v2 1/1] memalign: Support scanning for aligned chunks.
  2022-07-19  9:19     ` Florian Weimer
@ 2022-07-19 17:32       ` DJ Delorie
  2022-07-20  0:32       ` [PATCH v3 " DJ Delorie
  1 sibling, 0 replies; 38+ messages in thread
From: DJ Delorie @ 2022-07-19 17:32 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha

Florian Weimer <fweimer@redhat.com> writes:
> MAYBE_INIT_TCACHE does not seem necessary.  If the tcache is empty,
> there's no need to initialize it and scan it.
> The tcache null check should be written as tcache != NULL.

Literal cut-n-paste from elsewhere in the code ;-)

> And why not scan the entire chain here?

Speed.  The tcache list is not designed to be easy to access in the
middle.  I know that's a weak excuse, but keeping everything going
through tcache_get() and tcache_put() both simplifies and robustifies
the code.



* [PATCH v3 1/1] memalign: Support scanning for aligned chunks.
  2022-07-19  9:19     ` Florian Weimer
  2022-07-19 17:32       ` DJ Delorie
@ 2022-07-20  0:32       ` DJ Delorie
  2022-07-22 20:21         ` DJ Delorie
  2022-07-22 20:28         ` Joseph Myers
  1 sibling, 2 replies; 38+ messages in thread
From: DJ Delorie @ 2022-07-20  0:32 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha


Florian Weimer <fweimer@redhat.com> writes:
> MAYBE_INIT_TCACHE does not seem necessary.  If the tcache is empty,
> there's no need to initialize it and scan it.  And why not scan the
> entire chain here?  It's another source of missed chunk reuse.
>
> The tcache null check should be written as tcache != NULL.

[v3: cleaned up all the != NULL cases, added messy logic to remove
non-head tcache chunks]

This patch adds a chunk scanning algorithm to the _int_memalign code
path that reduces heap fragmentation by reusing already aligned chunks
instead of always looking for chunks of larger sizes and splitting
them.  The tcache macros are extended to allow removing a chunk from
the middle of the list.

The goal is to fix the pathological use cases where heaps grow
continuously in workloads that are heavy users of memalign.

Note that tst-memalign-2 checks for tcache operation, which
malloc-check bypasses.
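
As a rough sketch of what removing a non-head entry looks like (the
struct entry and remove_matching names are made up for this sketch;
the real tcache list additionally protects/reveals the next pointers
for safe-linking, which is omitted here), walking a singly-linked list
through a pointer-to-pointer lets head and middle removals share one
path:

#include <stddef.h>

struct entry
{
  struct entry *next;
};

/* Unlink and return the first entry for which MATCH returns nonzero.
   EP always points at the link that points at the current entry, so
   updating *EP splices the entry out wherever it sits in the list.  */
static struct entry *
remove_matching (struct entry **head, int (*match) (struct entry *))
{
  struct entry **ep = head;

  while (*ep != NULL && !match (*ep))
    ep = &(*ep)->next;

  if (*ep == NULL)
    return NULL;

  struct entry *e = *ep;
  *ep = e->next;
  return e;
}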

diff --git a/malloc/Makefile b/malloc/Makefile
index 4e32de2a0b..084c408ac7 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -43,6 +43,7 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc tst-obstack \
 	 tst-tcfree1 tst-tcfree2 tst-tcfree3 \
 	 tst-safe-linking \
 	 tst-mallocalign1 \
+	 tst-memalign-2
 
 tests-static := \
 	 tst-interpose-static-nothread \
@@ -72,7 +73,7 @@ test-srcs = tst-mtrace
 # with MALLOC_CHECK_=3 because they expect a specific failure.
 tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
 	tst-mxfast tst-safe-linking \
-	tst-compathooks-off tst-compathooks-on
+	tst-compathooks-off tst-compathooks-on tst-memalign-2
 
 # Run all tests with MALLOC_CHECK_=3
 tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 12908b8f97..364125e6b4 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3185,19 +3185,44 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
 }
 
 /* Caller must ensure that we know tc_idx is valid and there's
-   available chunks to remove.  */
+   available chunks to remove.  Removes chunk from the middle of the
+   list.  */
 static __always_inline void *
-tcache_get (size_t tc_idx)
+tcache_get_n (size_t tc_idx, tcache_entry **ep)
 {
-  tcache_entry *e = tcache->entries[tc_idx];
+  tcache_entry *e;
+  if (ep == &(tcache->entries[tc_idx]))
+    e = *ep;
+  else
+    e = REVEAL_PTR (*ep);
+
   if (__glibc_unlikely (!aligned_OK (e)))
     malloc_printerr ("malloc(): unaligned tcache chunk detected");
-  tcache->entries[tc_idx] = REVEAL_PTR (e->next);
+
+  if (ep == &(tcache->entries[tc_idx]))
+      *ep = REVEAL_PTR (e->next);
+  else
+    *ep = PROTECT_PTR (ep, REVEAL_PTR (e->next));
+
   --(tcache->counts[tc_idx]);
   e->key = 0;
   return (void *) e;
 }
 
+/* Like the above, but removes from the head of the list.  */
+static __always_inline void *
+tcache_get (size_t tc_idx)
+{
+  return tcache_get_n (tc_idx, & tcache->entries[tc_idx]);
+}
+
+/* Iterates through the tcache linked list.  */
+static __always_inline void *
+tcache_next (tcache_entry *e)
+{
+  return (tcache_entry *) REVEAL_PTR (e->next);
+}
+
 static void
 tcache_thread_shutdown (void)
 {
@@ -3306,7 +3331,7 @@ __libc_malloc (size_t bytes)
 
   DIAG_PUSH_NEEDS_COMMENT;
   if (tc_idx < mp_.tcache_bins
-      && tcache
+      && tcache != NULL
       && tcache->counts[tc_idx] > 0)
     {
       victim = tcache_get (tc_idx);
@@ -3557,6 +3582,38 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
       alignment = a;
     }
 
+#if USE_TCACHE
+  {
+    size_t tbytes;
+    tbytes = checked_request2size (bytes);
+    if (tbytes == 0)
+      {
+	__set_errno (ENOMEM);
+	return NULL;
+      }
+    size_t tc_idx = csize2tidx (tbytes);
+
+    if (tc_idx < mp_.tcache_bins
+	&& tcache != NULL
+	&& tcache->counts[tc_idx] > 0)
+      {
+	/* The tcache itself isn't encoded, but the chain is.  */
+	tcache_entry **tep = & tcache->entries[tc_idx];
+	tcache_entry *te = *tep;
+	while (te != NULL && ((intptr_t)te & (alignment - 1)) != 0)
+	  {
+	    tep = & (te->next);
+	    te = tcache_next (te);
+	  }
+	if (te != NULL)
+	  {
+	    void *victim = tcache_get_n (tc_idx, tep);
+	    return tag_new_usable (victim);
+	  }
+      }
+  }
+#endif
+
   if (SINGLE_THREAD_P)
     {
       p = _int_memalign (&main_arena, alignment, bytes);
@@ -3862,7 +3919,7 @@ _int_malloc (mstate av, size_t bytes)
 	      /* While we're here, if we see other chunks of the same size,
 		 stash them in the tcache.  */
 	      size_t tc_idx = csize2tidx (nb);
-	      if (tcache && tc_idx < mp_.tcache_bins)
+	      if (tcache != NULL && tc_idx < mp_.tcache_bins)
 		{
 		  mchunkptr tc_victim;
 
@@ -3920,7 +3977,7 @@ _int_malloc (mstate av, size_t bytes)
 	  /* While we're here, if we see other chunks of the same size,
 	     stash them in the tcache.  */
 	  size_t tc_idx = csize2tidx (nb);
-	  if (tcache && tc_idx < mp_.tcache_bins)
+	  if (tcache != NULL && tc_idx < mp_.tcache_bins)
 	    {
 	      mchunkptr tc_victim;
 
@@ -3982,7 +4039,7 @@ _int_malloc (mstate av, size_t bytes)
 #if USE_TCACHE
   INTERNAL_SIZE_T tcache_nb = 0;
   size_t tc_idx = csize2tidx (nb);
-  if (tcache && tc_idx < mp_.tcache_bins)
+  if (tcache != NULL && tc_idx < mp_.tcache_bins)
     tcache_nb = nb;
   int return_cached = 0;
 
@@ -4064,7 +4121,7 @@ _int_malloc (mstate av, size_t bytes)
 #if USE_TCACHE
 	      /* Fill cache first, return to user only if cache fills.
 		 We may return one of these chunks later.  */
-	      if (tcache_nb
+	      if (tcache_nb > 0
 		  && tcache->counts[tc_idx] < mp_.tcache_count)
 		{
 		  tcache_put (victim, tc_idx);
@@ -4937,6 +4994,43 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
    ------------------------------ memalign ------------------------------
  */
 
+/* Returns 0 if the chunk is not and does not contain the requested
+   aligned sub-chunk, else returns the amount of "waste" from
+   trimming.  BYTES is the *user* byte size, not the chunk byte
+   size.  */
+static int
+chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
+{
+  void *m = chunk2mem (p);
+  INTERNAL_SIZE_T size = memsize (p);
+  void *aligned_m = m;
+
+  if (__glibc_unlikely (misaligned_chunk (p)))
+    malloc_printerr ("_int_memalign(): unaligned chunk detected");
+
+  aligned_m = PTR_ALIGN_UP (m, alignment);
+
+  INTERNAL_SIZE_T front_extra = (intptr_t) aligned_m - (intptr_t) m;
+
+  /* We can't trim off the front as it's too small.  */
+  if (front_extra > 0 && front_extra < MINSIZE)
+    return 0;
+
+  /* If it's a perfect fit, it's an exception to the return value rule
+     (we would return zero waste, which looks like "not usable"), so
+     handle it here by returning a small non-zero value instead.  */
+  if (size == bytes && front_extra == 0)
+    return 1;
+
+  /* If the block we need fits in the chunk, calculate total waste.  */
+  if (size > bytes + front_extra)
+    return size - bytes;
+
+  /* Can't use this chunk.  */ 
+  return 0;
+}
+
+/* BYTES is user requested bytes, not requested chunksize bytes.  */
 static void *
 _int_memalign (mstate av, size_t alignment, size_t bytes)
 {
@@ -4950,8 +5044,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
   mchunkptr remainder;            /* spare room at end to split off */
   unsigned long remainder_size;   /* its size */
   INTERNAL_SIZE_T size;
-
-
+  mchunkptr victim;
 
   nb = checked_request2size (bytes);
   if (nb == 0)
@@ -4960,29 +5053,142 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
       return NULL;
     }
 
-  /*
-     Strategy: find a spot within that chunk that meets the alignment
+  /* We can't check tcache here because we hold the arena lock, which
+     tcache doesn't expect.  We expect it has been checked
+     earlier.  */
+
+  /* Strategy: search the bins looking for an existing block that
+     meets our needs.  We scan a range of bins from "exact size" to
+     "just under 2x", spanning the small/large barrier if needed.  If
+     we don't find anything in those bins, the common malloc code will
+     scan starting at 2x.  */
+
+  /* This will be set if we found a candidate chunk.  */
+  victim = NULL;
+
+  /* Fast bins are singly-linked, hard to remove a chunk from the middle
+     and unlikely to meet our alignment requirements.  We have not done
+     any experimentation with searching for aligned fastbins.  */
+
+  int first_bin_index;
+  int first_largebin_index;
+  int last_bin_index;
+
+  if (in_smallbin_range (nb))
+    first_bin_index = smallbin_index (nb);
+  else
+    first_bin_index = largebin_index (nb);
+
+  if (in_smallbin_range (nb * 2))
+    last_bin_index = smallbin_index (nb * 2);
+  else
+    last_bin_index = largebin_index (nb * 2);
+
+  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
+
+  int victim_index;                 /* its bin index */
+
+  for (victim_index = first_bin_index;
+       victim_index < last_bin_index;
+       victim_index ++)
+    {
+      victim = NULL;
+
+      if (victim_index < first_largebin_index)
+    {
+      /* Check small bins.  Small bin chunks are doubly-linked despite
+	 being the same size.  */
+
+      mchunkptr fwd;                    /* misc temp for linking */
+      mchunkptr bck;                    /* misc temp for linking */
+
+      bck = bin_at (av, victim_index);
+      fwd = bck->fd;
+      while (fwd != bck)
+	{
+	  if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
+	    {
+	      victim = fwd;
+
+	      /* Unlink it */
+	      victim->fd->bk = victim->bk;
+	      victim->bk->fd = victim->fd;
+	      break;
+	    }
+
+	  fwd = fwd->fd;
+	}
+    }
+  else
+    {
+      /* Check large bins.  */
+      mchunkptr fwd;                    /* misc temp for linking */
+      mchunkptr bck;                    /* misc temp for linking */
+      mchunkptr best = NULL;
+      size_t best_size = 0;
+
+      bck = bin_at (av, victim_index);
+      fwd = bck->fd;
+
+      while (fwd != bck)
+	{
+	  int extra;
+
+	  if (chunksize (fwd) < nb)
+	      break;
+	  extra = chunk_ok_for_memalign (fwd, alignment, bytes);
+	  if (extra > 0
+	      && (extra <= best_size || best == NULL))
+	    {
+	      best = fwd;
+	      best_size = extra;
+	    }
+
+	  fwd = fwd->fd;
+	}
+      victim = best;
+
+      if (victim != NULL)
+	{
+	  unlink_chunk (av, victim);
+	  break;
+	}
+    }
+
+      if (victim != NULL)
+	break;
+    }
+
+  /* Strategy: find a spot within that chunk that meets the alignment
      request, and then possibly free the leading and trailing space.
-   */
+     This strategy is incredibly costly and can lead to external
+     fragmentation if header and footer chunks are unused.  */
 
-  /* Call malloc with worst case padding to hit alignment. */
+  if (victim != NULL)
+    {
+      p = victim;
+      m = chunk2mem (p);
+      set_inuse (p);
+    }
+  else
+    {
+      /* Call malloc with worst case padding to hit alignment. */
 
-  m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
+      m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
 
-  if (m == 0)
-    return 0;           /* propagate failure */
+      if (m == 0)
+	return 0;           /* propagate failure */
 
-  p = mem2chunk (m);
+      p = mem2chunk (m);
+    }
 
   if ((((unsigned long) (m)) % alignment) != 0)   /* misaligned */
-
-    { /*
-                Find an aligned spot inside chunk.  Since we need to give back
-                leading space in a chunk of at least MINSIZE, if the first
-                calculation places us at a spot with less than MINSIZE leader,
-                we can move to the next aligned spot -- we've allocated enough
-                total room so that this is always possible.
-                 */
+    {
+      /* Find an aligned spot inside chunk.  Since we need to give back
+         leading space in a chunk of at least MINSIZE, if the first
+         calculation places us at a spot with less than MINSIZE leader,
+         we can move to the next aligned spot -- we've allocated enough
+         total room so that this is always possible.  */
       brk = (char *) mem2chunk (((unsigned long) (m + alignment - 1)) &
                                 - ((signed long) alignment));
       if ((unsigned long) (brk - (char *) (p)) < MINSIZE)
diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
new file mode 100644
index 0000000000..432a37bd8b
--- /dev/null
+++ b/malloc/tst-memalign-2.c
@@ -0,0 +1,137 @@
+/* Test for memalign chunk reuse
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <array_length.h>
+
+#include <support/check.h>
+
+typedef struct TestCase {
+  size_t size;
+  size_t alignment;
+  void *ptr1;
+  void *ptr2;
+} TestCase;
+
+static TestCase tcache_allocs[] = {
+  { 24, 8, NULL, NULL },
+  { 24, 16, NULL, NULL },
+  { 128, 32, NULL, NULL }
+};
+#define TN array_length (tcache_allocs)
+
+static TestCase large_allocs[] = {
+  { 23450, 64, NULL, NULL },
+  { 23450, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL }
+};
+#define LN array_length (large_allocs)
+
+void *p;
+
+static int
+do_test (void)
+{
+  int i, j;
+  int count;
+  void *ptr[10];
+  void *p;
+
+  /* TCache test.  */
+
+  for (i = 0; i < TN; ++ i)
+    {
+      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      free (tcache_allocs[i].ptr1);
+      /* This should return the same chunk as was just free'd.  */
+      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      free (tcache_allocs[i].ptr2);
+
+      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
+    }
+
+  /* Test for non-head tcache hits.  */
+  for (i = 0; i < 10; ++ i)
+    {
+      if (i == 4)
+	ptr[i] = memalign (64, 256);
+      else
+	ptr[i] = malloc (256);
+    }
+  for (i = 0; i < 10; ++ i)
+    free (ptr[i]);
+
+  p = memalign (64, 256);
+
+  count = 0;
+  for (i = 0; i < 10; ++ i)
+    if (ptr[i] == p)
+      ++ count;
+  free (p);
+  TEST_VERIFY (count > 0);
+
+  /* Large bins test.  */
+
+  for (i = 0; i < LN; ++ i)
+    {
+      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+      /* Keep chunks from combining by fragmenting the heap.  */
+      p = malloc (512);
+    }
+
+  for (i = 0; i < LN; ++ i)
+    free (large_allocs[i].ptr1);
+
+  /* Force the unsorted bins to be scanned and moved to small/large
+     bins.  */
+  p = malloc (60000);
+
+  for (i = 0; i < LN; ++ i)
+    large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+
+  count = 0;
+  for (i = 0; i < LN; ++ i)
+    {
+      int ok = 0;
+      for (j = 0; j < LN; ++ j)
+	if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
+	  ok = 1;
+      if (ok == 1)
+	count ++;
+    }
+
+  /* The allocation algorithm is complicated outside of the memalign
+     logic, so just make sure it's working for most of the
+     allocations.  This avoids possible boundary conditions with
+     empty/full heaps.  */
+  TEST_VERIFY (count > LN / 2);
+
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"



* Re: [PATCH v3 1/1] memalign: Support scanning for aligned chunks.
  2022-07-20  0:32       ` [PATCH v3 " DJ Delorie
@ 2022-07-22 20:21         ` DJ Delorie
  2022-07-22 20:28         ` Joseph Myers
  1 sibling, 0 replies; 38+ messages in thread
From: DJ Delorie @ 2022-07-22 20:21 UTC (permalink / raw)
  To: fweimer, libc-alpha


Sidebar - I ran the malloc benchtests on before/after builds, and the
variation between the two seemed about the same as the variation
between two runs of the "before" build.

That run-to-run variation, just from repeating the same tests, seems
larger than the gains/losses we typically get from malloc tweaks,
though.  Computers are just too fast and unpredictable these days ;-)

I also have a Python script that compares malloc benchtest runs (below).
Run it like this:

 $ compare-malloc.py ~/tools/upstream/glibc.pristine.build/benchtests ~/tools/upstream/glibc.memalign.build/benchtests `ls -1d bench-malloc-simple-*.out | nsort` > /tmp/malloc2.csv

If there's interest, I can add this to benchtests/scripts.

---------- 8< ----------
#!/usr/bin/python

import sys
import json

def do_keys (pattern, bd, ad):
    # Print one CSV row per benchmark key containing PATTERN.  BD and AD
    # map key -> value for the "before" and "after" runs.  Printing lags
    # one row behind so the final row can also carry the average ratio.
    avg = 0
    count = 0
    print_it = False
    for b in sorted(bd.keys()):
        if pattern in b:
            if print_it:
                print ("%s,%s,%s,%s" % (save_b, bv, av, imp))
            save_b = b
            bv = bd[b]
            av = ad[b]
            imp = av / bv
            avg += imp
            count += 1
            print_it = True

    if print_it:
        print ("%s,%s,%s,%s,%s" % (save_b, bv, av, imp, avg / count))

def compare_files(fn1, fn2, t):
    # Compare one before/after pair of benchtest JSON output files.
    with open (fn1, 'r') as b, open (fn2, 'r') as a:
        before = json.load(b)
        after = json.load(a)

    print (t)
    print ("Test,Before,After,Ratio,Average")
    do_keys ('max_rss', before['functions']['malloc'][''], after['functions']['malloc'][''])
    do_keys ('main_arena', before['functions']['malloc'][''], after['functions']['malloc'][''])
    do_keys ('thread_arena', before['functions']['malloc'][''], after['functions']['malloc'][''])
    print ("")

def main(argv):
    before_dir = argv[1]
    after_dir = argv[2]
    files = argv[3:]

    for f in files:
        bf = before_dir + "/" + f
        af = after_dir + "/" + f
        compare_files (bf, af, f)

if __name__ == '__main__':
    main(sys.argv)
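
For what it's worth, the output is one CSV block per input file: a
header line "Test,Before,After,Ratio,Average" followed by the max_rss,
main_arena, and thread_arena rows.  Ratio is after/before (lower is
better for both the timings and max_rss, so ratios below 1.0 favor the
patched build), and the Average column on the last row of each group
is the mean ratio for that group.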



* Re: [PATCH v3 1/1] memalign: Support scanning for aligned chunks.
  2022-07-20  0:32       ` [PATCH v3 " DJ Delorie
  2022-07-22 20:21         ` DJ Delorie
@ 2022-07-22 20:28         ` Joseph Myers
  2022-07-28 19:50           ` [PATCH v4 " DJ Delorie
  1 sibling, 1 reply; 38+ messages in thread
From: Joseph Myers @ 2022-07-22 20:28 UTC (permalink / raw)
  To: DJ Delorie; +Cc: Florian Weimer, libc-alpha

On Tue, 19 Jul 2022, DJ Delorie via Libc-alpha wrote:

> +#define TEST_FUNCTION do_test ()
> +#include "../test-skeleton.c"

New tests should be written to use <support/test-driver.c> rather than the 
old test-skeleton.c wrapper round it.

-- 
Joseph S. Myers
joseph@codesourcery.com


* [PATCH v4 1/1] memalign: Support scanning for aligned chunks.
  2022-07-22 20:28         ` Joseph Myers
@ 2022-07-28 19:50           ` DJ Delorie
  2022-08-17 19:00             ` DJ Delorie
  2023-03-28 19:07             ` Adhemerval Zanella Netto
  0 siblings, 2 replies; 38+ messages in thread
From: DJ Delorie @ 2022-07-28 19:50 UTC (permalink / raw)
  To: libc-alpha; +Cc: fweimer, Joseph Myers


[v4: updated testcase to new driver]

[note that I am not pushing this patch for this release; the timing is
coincidental]

This patch adds a chunk scanning algorithm to the _int_memalign code
path that reduces heap fragmentation by reusing already aligned chunks
instead of always looking for chunks of larger sizes and splitting
them.  The tcache macros are extended to allow removing a chunk from
the middle of the list.

The goal is to fix the pathological use cases where heaps grow
continuously in workloads that are heavy users of memalign.

Note that tst-memalign-2 checks for tcache operation, which
malloc-check bypasses.
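
To give a feel for the approach outside the allocator internals, here
is a tiny standalone sketch of the core idea: scan an existing free
list for an entry whose address is already aligned and unlink it,
possibly from the middle of the list, rather than always
over-allocating and splitting.  The list type and helper below are
invented for the example only; they are not glibc's chunk or tcache
structures.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct free_node
{
  struct free_node *next;
};

/* Unlink and return the first node whose address is ALIGNMENT-aligned
   (ALIGNMENT must be a power of two), or NULL if none qualifies.  */
static struct free_node *
take_aligned (struct free_node **head, size_t alignment)
{
  for (struct free_node **ep = head; *ep != NULL; ep = &(*ep)->next)
    if (((uintptr_t) *ep & (alignment - 1)) == 0)
      {
        struct free_node *hit = *ep;
        *ep = hit->next;  /* Splice it out, possibly mid-list.  */
        return hit;
      }
  return NULL;
}

int
main (void)
{
  struct free_node *head = NULL;

  /* Build a small free list; entry 2 is forced to be 64-byte aligned
     so the scan has something to find.  */
  for (int i = 0; i < 4; i++)
    {
      struct free_node *n
        = (i == 2) ? aligned_alloc (64, 64) : malloc (sizeof *n);
      if (n == NULL)
        return 1;
      n->next = head;
      head = n;
    }

  struct free_node *hit = take_aligned (&head, 64);
  printf ("reused %p, 64-byte aligned: %s\n", (void *) hit,
          hit != NULL && ((uintptr_t) hit & 63) == 0 ? "yes" : "no");

  free (hit);
  while (head != NULL)
    {
      struct free_node *next = head->next;
      free (head);
      head = next;
    }
  return 0;
}

The real patch applies the same kind of search to the tcache chain and
to the small and large bins, preferring the least-wasteful usable
chunk when scanning the large bins.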

diff --git a/malloc/Makefile b/malloc/Makefile
index 4e32de2a0b..084c408ac7 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -43,6 +43,7 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc tst-obstack \
 	 tst-tcfree1 tst-tcfree2 tst-tcfree3 \
 	 tst-safe-linking \
 	 tst-mallocalign1 \
+	 tst-memalign-2
 
 tests-static := \
 	 tst-interpose-static-nothread \
@@ -72,7 +73,7 @@ test-srcs = tst-mtrace
 # with MALLOC_CHECK_=3 because they expect a specific failure.
 tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
 	tst-mxfast tst-safe-linking \
-	tst-compathooks-off tst-compathooks-on
+	tst-compathooks-off tst-compathooks-on tst-memalign-2
 
 # Run all tests with MALLOC_CHECK_=3
 tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
diff --git a/malloc/malloc.c b/malloc/malloc.c
index bd3c76ed31..3b31d6ae0f 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3180,19 +3180,44 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
 }
 
 /* Caller must ensure that we know tc_idx is valid and there's
-   available chunks to remove.  */
+   available chunks to remove.  Removes chunk from the middle of the
+   list.  */
 static __always_inline void *
-tcache_get (size_t tc_idx)
+tcache_get_n (size_t tc_idx, tcache_entry **ep)
 {
-  tcache_entry *e = tcache->entries[tc_idx];
+  tcache_entry *e;
+  if (ep == &(tcache->entries[tc_idx]))
+    e = *ep;
+  else
+    e = REVEAL_PTR (*ep);
+
   if (__glibc_unlikely (!aligned_OK (e)))
     malloc_printerr ("malloc(): unaligned tcache chunk detected");
-  tcache->entries[tc_idx] = REVEAL_PTR (e->next);
+
+  if (ep == &(tcache->entries[tc_idx]))
+      *ep = REVEAL_PTR (e->next);
+  else
+    *ep = PROTECT_PTR (ep, REVEAL_PTR (e->next));
+
   --(tcache->counts[tc_idx]);
   e->key = 0;
   return (void *) e;
 }
 
+/* Like the above, but removes from the head of the list.  */
+static __always_inline void *
+tcache_get (size_t tc_idx)
+{
+  return tcache_get_n (tc_idx, & tcache->entries[tc_idx]);
+}
+
+/* Iterates through the tcache linked list.  */
+static __always_inline void *
+tcache_next (tcache_entry *e)
+{
+  return (tcache_entry *) REVEAL_PTR (e->next);
+}
+
 static void
 tcache_thread_shutdown (void)
 {
@@ -3301,7 +3326,7 @@ __libc_malloc (size_t bytes)
 
   DIAG_PUSH_NEEDS_COMMENT;
   if (tc_idx < mp_.tcache_bins
-      && tcache
+      && tcache != NULL
       && tcache->counts[tc_idx] > 0)
     {
       victim = tcache_get (tc_idx);
@@ -3552,6 +3577,38 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
       alignment = a;
     }
 
+#if USE_TCACHE
+  {
+    size_t tbytes;
+    tbytes = checked_request2size (bytes);
+    if (tbytes == 0)
+      {
+	__set_errno (ENOMEM);
+	return NULL;
+      }
+    size_t tc_idx = csize2tidx (tbytes);
+
+    if (tc_idx < mp_.tcache_bins
+	&& tcache != NULL
+	&& tcache->counts[tc_idx] > 0)
+      {
+	/* The tcache itself isn't encoded, but the chain is.  */
+	tcache_entry **tep = & tcache->entries[tc_idx];
+	tcache_entry *te = *tep;
+	while (te != NULL && ((intptr_t)te & (alignment - 1)) != 0)
+	  {
+	    tep = & (te->next);
+	    te = tcache_next (te);
+	  }
+	if (te != NULL)
+	  {
+	    void *victim = tcache_get_n (tc_idx, tep);
+	    return tag_new_usable (victim);
+	  }
+      }
+  }
+#endif
+
   if (SINGLE_THREAD_P)
     {
       p = _int_memalign (&main_arena, alignment, bytes);
@@ -3857,7 +3914,7 @@ _int_malloc (mstate av, size_t bytes)
 	      /* While we're here, if we see other chunks of the same size,
 		 stash them in the tcache.  */
 	      size_t tc_idx = csize2tidx (nb);
-	      if (tcache && tc_idx < mp_.tcache_bins)
+	      if (tcache != NULL && tc_idx < mp_.tcache_bins)
 		{
 		  mchunkptr tc_victim;
 
@@ -3915,7 +3972,7 @@ _int_malloc (mstate av, size_t bytes)
 	  /* While we're here, if we see other chunks of the same size,
 	     stash them in the tcache.  */
 	  size_t tc_idx = csize2tidx (nb);
-	  if (tcache && tc_idx < mp_.tcache_bins)
+	  if (tcache != NULL && tc_idx < mp_.tcache_bins)
 	    {
 	      mchunkptr tc_victim;
 
@@ -3977,7 +4034,7 @@ _int_malloc (mstate av, size_t bytes)
 #if USE_TCACHE
   INTERNAL_SIZE_T tcache_nb = 0;
   size_t tc_idx = csize2tidx (nb);
-  if (tcache && tc_idx < mp_.tcache_bins)
+  if (tcache != NULL && tc_idx < mp_.tcache_bins)
     tcache_nb = nb;
   int return_cached = 0;
 
@@ -4059,7 +4116,7 @@ _int_malloc (mstate av, size_t bytes)
 #if USE_TCACHE
 	      /* Fill cache first, return to user only if cache fills.
 		 We may return one of these chunks later.  */
-	      if (tcache_nb
+	      if (tcache_nb > 0
 		  && tcache->counts[tc_idx] < mp_.tcache_count)
 		{
 		  tcache_put (victim, tc_idx);
@@ -4932,6 +4989,43 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
    ------------------------------ memalign ------------------------------
  */
 
+/* Returns 0 if the chunk is not and does not contain the requested
+   aligned sub-chunk, else returns the amount of "waste" from
+   trimming.  BYTES is the *user* byte size, not the chunk byte
+   size.  */
+static int
+chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
+{
+  void *m = chunk2mem (p);
+  INTERNAL_SIZE_T size = memsize (p);
+  void *aligned_m = m;
+
+  if (__glibc_unlikely (misaligned_chunk (p)))
+    malloc_printerr ("_int_memalign(): unaligned chunk detected");
+
+  aligned_m = PTR_ALIGN_UP (m, alignment);
+
+  INTERNAL_SIZE_T front_extra = (intptr_t) aligned_m - (intptr_t) m;
+
+  /* We can't trim off the front as it's too small.  */
+  if (front_extra > 0 && front_extra < MINSIZE)
+    return 0;
+
+  /* If it's a perfect fit, it's an exception to the return value rule
+     (we would return zero waste, which looks like "not usable"), so
+     handle it here by returning a small non-zero value instead.  */
+  if (size == bytes && front_extra == 0)
+    return 1;
+
+  /* If the block we need fits in the chunk, calculate total waste.  */
+  if (size > bytes + front_extra)
+    return size - bytes;
+
+  /* Can't use this chunk.  */ 
+  return 0;
+}
+
+/* BYTES is user requested bytes, not requested chunksize bytes.  */
 static void *
 _int_memalign (mstate av, size_t alignment, size_t bytes)
 {
@@ -4945,8 +5039,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
   mchunkptr remainder;            /* spare room at end to split off */
   unsigned long remainder_size;   /* its size */
   INTERNAL_SIZE_T size;
-
-
+  mchunkptr victim;
 
   nb = checked_request2size (bytes);
   if (nb == 0)
@@ -4955,29 +5048,142 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
       return NULL;
     }
 
-  /*
-     Strategy: find a spot within that chunk that meets the alignment
+  /* We can't check tcache here because we hold the arena lock, which
+     tcache doesn't expect.  We expect it has been checked
+     earlier.  */
+
+  /* Strategy: search the bins looking for an existing block that
+     meets our needs.  We scan a range of bins from "exact size" to
+     "just under 2x", spanning the small/large barrier if needed.  If
+     we don't find anything in those bins, the common malloc code will
+     scan starting at 2x.  */
+
+  /* This will be set if we found a candidate chunk.  */
+  victim = NULL;
+
+  /* Fast bins are singly-linked, hard to remove a chunk from the middle
+     and unlikely to meet our alignment requirements.  We have not done
+     any experimentation with searching for aligned fastbins.  */
+
+  int first_bin_index;
+  int first_largebin_index;
+  int last_bin_index;
+
+  if (in_smallbin_range (nb))
+    first_bin_index = smallbin_index (nb);
+  else
+    first_bin_index = largebin_index (nb);
+
+  if (in_smallbin_range (nb * 2))
+    last_bin_index = smallbin_index (nb * 2);
+  else
+    last_bin_index = largebin_index (nb * 2);
+
+  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
+
+  int victim_index;                 /* its bin index */
+
+  for (victim_index = first_bin_index;
+       victim_index < last_bin_index;
+       victim_index ++)
+    {
+      victim = NULL;
+
+      if (victim_index < first_largebin_index)
+    {
+      /* Check small bins.  Small bin chunks are doubly-linked despite
+	 being the same size.  */
+
+      mchunkptr fwd;                    /* misc temp for linking */
+      mchunkptr bck;                    /* misc temp for linking */
+
+      bck = bin_at (av, victim_index);
+      fwd = bck->fd;
+      while (fwd != bck)
+	{
+	  if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
+	    {
+	      victim = fwd;
+
+	      /* Unlink it */
+	      victim->fd->bk = victim->bk;
+	      victim->bk->fd = victim->fd;
+	      break;
+	    }
+
+	  fwd = fwd->fd;
+	}
+    }
+  else
+    {
+      /* Check large bins.  */
+      mchunkptr fwd;                    /* misc temp for linking */
+      mchunkptr bck;                    /* misc temp for linking */
+      mchunkptr best = NULL;
+      size_t best_size = 0;
+
+      bck = bin_at (av, victim_index);
+      fwd = bck->fd;
+
+      while (fwd != bck)
+	{
+	  int extra;
+
+	  if (chunksize (fwd) < nb)
+	      break;
+	  extra = chunk_ok_for_memalign (fwd, alignment, bytes);
+	  if (extra > 0
+	      && (extra <= best_size || best == NULL))
+	    {
+	      best = fwd;
+	      best_size = extra;
+	    }
+
+	  fwd = fwd->fd;
+	}
+      victim = best;
+
+      if (victim != NULL)
+	{
+	  unlink_chunk (av, victim);
+	  break;
+	}
+    }
+
+      if (victim != NULL)
+	break;
+    }
+
+  /* Strategy: find a spot within that chunk that meets the alignment
      request, and then possibly free the leading and trailing space.
-   */
+     This strategy is incredibly costly and can lead to external
+     fragmentation if header and footer chunks are unused.  */
 
-  /* Call malloc with worst case padding to hit alignment. */
+  if (victim != NULL)
+    {
+      p = victim;
+      m = chunk2mem (p);
+      set_inuse (p);
+    }
+  else
+    {
+      /* Call malloc with worst case padding to hit alignment. */
 
-  m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
+      m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
 
-  if (m == 0)
-    return 0;           /* propagate failure */
+      if (m == 0)
+	return 0;           /* propagate failure */
 
-  p = mem2chunk (m);
+      p = mem2chunk (m);
+    }
 
   if ((((unsigned long) (m)) % alignment) != 0)   /* misaligned */
-
-    { /*
-                Find an aligned spot inside chunk.  Since we need to give back
-                leading space in a chunk of at least MINSIZE, if the first
-                calculation places us at a spot with less than MINSIZE leader,
-                we can move to the next aligned spot -- we've allocated enough
-                total room so that this is always possible.
-                 */
+    {
+      /* Find an aligned spot inside chunk.  Since we need to give back
+         leading space in a chunk of at least MINSIZE, if the first
+         calculation places us at a spot with less than MINSIZE leader,
+         we can move to the next aligned spot -- we've allocated enough
+         total room so that this is always possible.  */
       brk = (char *) mem2chunk (((unsigned long) (m + alignment - 1)) &
                                 - ((signed long) alignment));
       if ((unsigned long) (brk - (char *) (p)) < MINSIZE)
diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
new file mode 100644
index 0000000000..ed3660959a
--- /dev/null
+++ b/malloc/tst-memalign-2.c
@@ -0,0 +1,136 @@
+/* Test for memalign chunk reuse
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <array_length.h>
+
+#include <support/check.h>
+
+typedef struct TestCase {
+  size_t size;
+  size_t alignment;
+  void *ptr1;
+  void *ptr2;
+} TestCase;
+
+static TestCase tcache_allocs[] = {
+  { 24, 8, NULL, NULL },
+  { 24, 16, NULL, NULL },
+  { 128, 32, NULL, NULL }
+};
+#define TN array_length (tcache_allocs)
+
+static TestCase large_allocs[] = {
+  { 23450, 64, NULL, NULL },
+  { 23450, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL }
+};
+#define LN array_length (large_allocs)
+
+void *p;
+
+static int
+do_test (void)
+{
+  int i, j;
+  int count;
+  void *ptr[10];
+  void *p;
+
+  /* TCache test.  */
+
+  for (i = 0; i < TN; ++ i)
+    {
+      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      free (tcache_allocs[i].ptr1);
+      /* This should return the same chunk as was just free'd.  */
+      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      free (tcache_allocs[i].ptr2);
+
+      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
+    }
+
+  /* Test for non-head tcache hits.  */
+  for (i = 0; i < 10; ++ i)
+    {
+      if (i == 4)
+	ptr[i] = memalign (64, 256);
+      else
+	ptr[i] = malloc (256);
+    }
+  for (i = 0; i < 10; ++ i)
+    free (ptr[i]);
+
+  p = memalign (64, 256);
+
+  count = 0;
+  for (i = 0; i < 10; ++ i)
+    if (ptr[i] == p)
+      ++ count;
+  free (p);
+  TEST_VERIFY (count > 0);
+
+  /* Large bins test.  */
+
+  for (i = 0; i < LN; ++ i)
+    {
+      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+      /* Keep chunks from combining by fragmenting the heap.  */
+      p = malloc (512);
+    }
+
+  for (i = 0; i < LN; ++ i)
+    free (large_allocs[i].ptr1);
+
+  /* Force the unsorted bins to be scanned and moved to small/large
+     bins.  */
+  p = malloc (60000);
+
+  for (i = 0; i < LN; ++ i)
+    large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+
+  count = 0;
+  for (i = 0; i < LN; ++ i)
+    {
+      int ok = 0;
+      for (j = 0; j < LN; ++ j)
+	if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
+	  ok = 1;
+      if (ok == 1)
+	count ++;
+    }
+
+  /* The allocation algorithm is complicated outside of the memalign
+     logic, so just make sure it's working for most of the
+     allocations.  This avoids possible boundary conditions with
+     empty/full heaps.  */
+  TEST_VERIFY (count > LN / 2);
+
+  return 0;
+}
+
+#include <support/test-driver.c>



* Re: [PATCH v4 1/1] memalign: Support scanning for aligned chunks.
  2022-07-28 19:50           ` [PATCH v4 " DJ Delorie
@ 2022-08-17 19:00             ` DJ Delorie
  2022-11-10 21:40               ` Ping^2: " DJ Delorie
  2023-03-28 19:07             ` Adhemerval Zanella Netto
  1 sibling, 1 reply; 38+ messages in thread
From: DJ Delorie @ 2022-08-17 19:00 UTC (permalink / raw)
  To: libc-alpha


Ping?

https://patchwork.sourceware.org/project/glibc/patch/xn4jz19fts.fsf@greed.delorie.com/
(I can't find it in the archives though, sent July 28th)




* Re: Ping^2: [PATCH v4 1/1] memalign: Support scanning for aligned chunks.
  2022-08-17 19:00             ` DJ Delorie
@ 2022-11-10 21:40               ` DJ Delorie
  2023-03-20 21:49                 ` Ping^3: " DJ Delorie
  0 siblings, 1 reply; 38+ messages in thread
From: DJ Delorie @ 2022-11-10 21:40 UTC (permalink / raw)
  To: libc-alpha


Ping^2 ?

https://patchwork.sourceware.org/project/glibc/patch/xn4jz19fts.fsf@greed.delorie.com/
https://sourceware.org/pipermail/libc-alpha/2022-July/141117.html




* Re: Ping^3: [PATCH v4 1/1] memalign: Support scanning for aligned chunks.
  2022-11-10 21:40               ` Ping^2: " DJ Delorie
@ 2023-03-20 21:49                 ` DJ Delorie
  0 siblings, 0 replies; 38+ messages in thread
From: DJ Delorie @ 2023-03-20 21:49 UTC (permalink / raw)
  To: libc-alpha


Ping^3 ?

https://patchwork.sourceware.org/project/glibc/patch/xn4jz19fts.fsf@greed.delorie.com/
https://sourceware.org/pipermail/libc-alpha/2022-July/141117.html

>   */
>  
> +/* Returns 0 if the chunk is not and does not contain the requested
> +   aligned sub-chunk, else returns the amount of "waste" from
> +   trimming.  BYTES is the *user* byte size, not the chunk byte
> +   size.  */
> +static int
> +chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
> +{
> +  void *m = chunk2mem (p);
> +  INTERNAL_SIZE_T size = memsize (p);
> +  void *aligned_m = m;
> +
> +  if (__glibc_unlikely (misaligned_chunk (p)))
> +    malloc_printerr ("_int_memalign(): unaligned chunk detected");
> +
> +  aligned_m = PTR_ALIGN_UP (m, alignment);
> +
> +  INTERNAL_SIZE_T front_extra = (intptr_t) aligned_m - (intptr_t) m;
> +
> +  /* We can't trim off the front as it's too small.  */
> +  if (front_extra > 0 && front_extra < MINSIZE)
> +    return 0;
> +
> +  /* If it's a perfect fit, it's an exception to the return value rule
> +     (we would return zero waste, which looks like "not usable"), so
> +     handle it here by returning a small non-zero value instead.  */
> +  if (size == bytes && front_extra == 0)
> +    return 1;
> +
> +  /* If the block we need fits in the chunk, calculate total waste.  */
> +  if (size > bytes + front_extra)
> +    return size - bytes;
> +
> +  /* Can't use this chunk.  */ 
> +  return 0;
> +}
> +
> +/* BYTES is user requested bytes, not requested chunksize bytes.  */
>  static void *
>  _int_memalign (mstate av, size_t alignment, size_t bytes)
>  {
> @@ -4945,8 +5039,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
>    mchunkptr remainder;            /* spare room at end to split off */
>    unsigned long remainder_size;   /* its size */
>    INTERNAL_SIZE_T size;
> -
> -
> +  mchunkptr victim;
>  
>    nb = checked_request2size (bytes);
>    if (nb == 0)
> @@ -4955,29 +5048,142 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
>        return NULL;
>      }
>  
> -  /*
> -     Strategy: find a spot within that chunk that meets the alignment
> +  /* We can't check tcache here because we hold the arena lock, which
> +     tcache doesn't expect.  We expect it has been checked
> +     earlier.  */
> +
> +  /* Strategy: search the bins looking for an existing block that
> +     meets our needs.  We scan a range of bins from "exact size" to
> +     "just under 2x", spanning the small/large barrier if needed.  If
> +     we don't find anything in those bins, the common malloc code will
> +     scan starting at 2x.  */
> +
> +  /* This will be set if we found a candidate chunk.  */
> +  victim = NULL;
> +
> +  /* Fast bins are singly-linked, hard to remove a chunk from the middle
> +     and unlikely to meet our alignment requirements.  We have not done
> +     any experimentation with searching for aligned fastbins.  */
> +
> +  int first_bin_index;
> +  int first_largebin_index;
> +  int last_bin_index;
> +
> +  if (in_smallbin_range (nb))
> +    first_bin_index = smallbin_index (nb);
> +  else
> +    first_bin_index = largebin_index (nb);
> +
> +  if (in_smallbin_range (nb * 2))
> +    last_bin_index = smallbin_index (nb * 2);
> +  else
> +    last_bin_index = largebin_index (nb * 2);
> +
> +  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
> +
> +  int victim_index;                 /* its bin index */
> +
> +  for (victim_index = first_bin_index;
> +       victim_index < last_bin_index;
> +       victim_index ++)
> +    {
> +      victim = NULL;
> +
> +      if (victim_index < first_largebin_index)
> +    {
> +      /* Check small bins.  Small bin chunks are doubly-linked despite
> +	 being the same size.  */
> +
> +      mchunkptr fwd;                    /* misc temp for linking */
> +      mchunkptr bck;                    /* misc temp for linking */
> +
> +      bck = bin_at (av, victim_index);
> +      fwd = bck->fd;
> +      while (fwd != bck)
> +	{
> +	  if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
> +	    {
> +	      victim = fwd;
> +
> +	      /* Unlink it */
> +	      victim->fd->bk = victim->bk;
> +	      victim->bk->fd = victim->fd;
> +	      break;
> +	    }
> +
> +	  fwd = fwd->fd;
> +	}
> +    }
> +  else
> +    {
> +      /* Check large bins.  */
> +      mchunkptr fwd;                    /* misc temp for linking */
> +      mchunkptr bck;                    /* misc temp for linking */
> +      mchunkptr best = NULL;
> +      size_t best_size = 0;
> +
> +      bck = bin_at (av, victim_index);
> +      fwd = bck->fd;
> +
> +      while (fwd != bck)
> +	{
> +	  int extra;
> +
> +	  if (chunksize (fwd) < nb)
> +	      break;
> +	  extra = chunk_ok_for_memalign (fwd, alignment, bytes);
> +	  if (extra > 0
> +	      && (extra <= best_size || best == NULL))
> +	    {
> +	      best = fwd;
> +	      best_size = extra;
> +	    }
> +
> +	  fwd = fwd->fd;
> +	}
> +      victim = best;
> +
> +      if (victim != NULL)
> +	{
> +	  unlink_chunk (av, victim);
> +	  break;
> +	}
> +    }
> +
> +      if (victim != NULL)
> +	break;
> +    }
> +
> +  /* Strategy: find a spot within that chunk that meets the alignment
>       request, and then possibly free the leading and trailing space.
> -   */
> +     This strategy is incredibly costly and can lead to external
> +     fragmentation if header and footer chunks are unused.  */
>  
> -  /* Call malloc with worst case padding to hit alignment. */
> +  if (victim != NULL)
> +    {
> +      p = victim;
> +      m = chunk2mem (p);
> +      set_inuse (p);
> +    }
> +  else
> +    {
> +      /* Call malloc with worst case padding to hit alignment. */
>  
> -  m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
> +      m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
>  
> -  if (m == 0)
> -    return 0;           /* propagate failure */
> +      if (m == 0)
> +	return 0;           /* propagate failure */
>  
> -  p = mem2chunk (m);
> +      p = mem2chunk (m);
> +    }
>  
>    if ((((unsigned long) (m)) % alignment) != 0)   /* misaligned */
> -
> -    { /*
> -                Find an aligned spot inside chunk.  Since we need to give back
> -                leading space in a chunk of at least MINSIZE, if the first
> -                calculation places us at a spot with less than MINSIZE leader,
> -                we can move to the next aligned spot -- we've allocated enough
> -                total room so that this is always possible.
> -                 */
> +    {
> +      /* Find an aligned spot inside chunk.  Since we need to give back
> +         leading space in a chunk of at least MINSIZE, if the first
> +         calculation places us at a spot with less than MINSIZE leader,
> +         we can move to the next aligned spot -- we've allocated enough
> +         total room so that this is always possible.  */
>        brk = (char *) mem2chunk (((unsigned long) (m + alignment - 1)) &
>                                  - ((signed long) alignment));
>        if ((unsigned long) (brk - (char *) (p)) < MINSIZE)
> diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
> new file mode 100644
> index 0000000000..ed3660959a
> --- /dev/null
> +++ b/malloc/tst-memalign-2.c
> @@ -0,0 +1,136 @@
> +/* Test for memalign chunk reuse
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <errno.h>
> +#include <malloc.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <array_length.h>
> +
> +#include <support/check.h>
> +
> +typedef struct TestCase {
> +  size_t size;
> +  size_t alignment;
> +  void *ptr1;
> +  void *ptr2;
> +} TestCase;
> +
> +static TestCase tcache_allocs[] = {
> +  { 24, 8, NULL, NULL },
> +  { 24, 16, NULL, NULL },
> +  { 128, 32, NULL, NULL }
> +};
> +#define TN array_length (tcache_allocs)
> +
> +static TestCase large_allocs[] = {
> +  { 23450, 64, NULL, NULL },
> +  { 23450, 64, NULL, NULL },
> +  { 23550, 64, NULL, NULL },
> +  { 23550, 64, NULL, NULL },
> +  { 23650, 64, NULL, NULL },
> +  { 23650, 64, NULL, NULL },
> +  { 33650, 64, NULL, NULL },
> +  { 33650, 64, NULL, NULL }
> +};
> +#define LN array_length (large_allocs)
> +
> +void *p;
> +
> +static int
> +do_test (void)
> +{
> +  int i, j;
> +  int count;
> +  void *ptr[10];
> +  void *p;
> +
> +  /* TCache test.  */
> +
> +  for (i = 0; i < TN; ++ i)
> +    {
> +      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
> +      free (tcache_allocs[i].ptr1);
> +      /* This should return the same chunk as was just free'd.  */
> +      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
> +      free (tcache_allocs[i].ptr2);
> +
> +      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
> +    }
> +
> +  /* Test for non-head tcache hits.  */
> +  for (i = 0; i < 10; ++ i)
> +    {
> +      if (i == 4)
> +	ptr[i] = memalign (64, 256);
> +      else
> +	ptr[i] = malloc (256);
> +    }
> +  for (i = 0; i < 10; ++ i)
> +    free (ptr[i]);
> +
> +  p = memalign (64, 256);
> +
> +  count = 0;
> +  for (i = 0; i < 10; ++ i)
> +    if (ptr[i] == p)
> +      ++ count;
> +  free (p);
> +  TEST_VERIFY (count > 0);
> +
> +  /* Large bins test.  */
> +
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
> +      /* Keep chunks from combining by fragmenting the heap.  */
> +      p = malloc (512);
> +    }
> +
> +  for (i = 0; i < LN; ++ i)
> +    free (large_allocs[i].ptr1);
> +
> +  /* Force the unsorted bins to be scanned and moved to small/large
> +     bins.  */
> +  p = malloc (60000);
> +
> +  for (i = 0; i < LN; ++ i)
> +    large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
> +
> +  count = 0;
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      int ok = 0;
> +      for (j = 0; j < LN; ++ j)
> +	if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
> +	  ok = 1;
> +      if (ok == 1)
> +	count ++;
> +    }
> +
> +  /* The allocation algorithm is complicated outside of the memalign
> +     logic, so just make sure it's working for most of the
> +     allocations.  This avoids possible boundary conditions with
> +     empty/full heaps.  */
> +  TEST_VERIFY (count > LN / 2);
> +
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v4 1/1] memalign: Support scanning for aligned chunks.
  2022-07-28 19:50           ` [PATCH v4 " DJ Delorie
  2022-08-17 19:00             ` DJ Delorie
@ 2023-03-28 19:07             ` Adhemerval Zanella Netto
  2023-03-29  4:20               ` [PATCH v5 " DJ Delorie
  1 sibling, 1 reply; 38+ messages in thread
From: Adhemerval Zanella Netto @ 2023-03-28 19:07 UTC (permalink / raw)
  To: libc-alpha, DJ Delorie



On 28/07/22 16:50, DJ Delorie via Libc-alpha wrote:
> 
> [v4: updated testcase to new driver]
> 
> [note that I am not pushing this patch for this release, the timing is
> coincidence]
> 
> This patch adds a chunk scanning algorithm to the _int_memalign code
> path that reduces heap fragmentation by reusing already aligned chunks
> instead of always looking for chunks of larger sizes and splitting
> them.  The tcache macros are extended to allow removing a chunk from
> the middle of the list.
> 
> The goal is to fix the pathological use cases where heaps grow
> continuously in workloads that are heavy users of memalign.
> 
> Note that tst-memalign-2 checks for tcache operation, which
> malloc-check bypasses.

Hi DJ (I think I got it right now).  Patch looks good; some comments below.

> 
> diff --git a/malloc/Makefile b/malloc/Makefile
> index 4e32de2a0b..084c408ac7 100644
> --- a/malloc/Makefile
> +++ b/malloc/Makefile
> @@ -43,6 +43,7 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc tst-obstack \
>  	 tst-tcfree1 tst-tcfree2 tst-tcfree3 \
>  	 tst-safe-linking \
>  	 tst-mallocalign1 \
> +	 tst-memalign-2
>  
>  tests-static := \
>  	 tst-interpose-static-nothread \
> @@ -72,7 +73,7 @@ test-srcs = tst-mtrace
>  # with MALLOC_CHECK_=3 because they expect a specific failure.
>  tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
>  	tst-mxfast tst-safe-linking \
> -	tst-compathooks-off tst-compathooks-on
> +	tst-compathooks-off tst-compathooks-on tst-memalign-2
>  
>  # Run all tests with MALLOC_CHECK_=3
>  tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index bd3c76ed31..3b31d6ae0f 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3180,19 +3180,44 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
>  }
>  
>  /* Caller must ensure that we know tc_idx is valid and there's
> -   available chunks to remove.  */
> +   available chunks to remove.  Removes chunk from the middle of the
> +   list.  */
>  static __always_inline void *
> -tcache_get (size_t tc_idx)
> +tcache_get_n (size_t tc_idx, tcache_entry **ep)
>  {
> -  tcache_entry *e = tcache->entries[tc_idx];
> +  tcache_entry *e;
> +  if (ep == &(tcache->entries[tc_idx]))
> +    e = *ep;
> +  else
> +    e = REVEAL_PTR (*ep);
> +
>    if (__glibc_unlikely (!aligned_OK (e)))
>      malloc_printerr ("malloc(): unaligned tcache chunk detected");
> -  tcache->entries[tc_idx] = REVEAL_PTR (e->next);
> +
> +  if (ep == &(tcache->entries[tc_idx]))
> +      *ep = REVEAL_PTR (e->next);
> +  else
> +    *ep = PROTECT_PTR (ep, REVEAL_PTR (e->next));
> +
>    --(tcache->counts[tc_idx]);
>    e->key = 0;
>    return (void *) e;
>  }
>  
> +/* Like the above, but removes from the head of the list.  */
> +static __always_inline void *
> +tcache_get (size_t tc_idx)
> +{
> +  return tcache_get_n (tc_idx, & tcache->entries[tc_idx]);
> +}
> +
> +/* Iterates through the tcache linked list.  */
> +static __always_inline void *

Why not use 'tcache_entry *' as return type here?

> +tcache_next (tcache_entry *e)
> +{
> +  return (tcache_entry *) REVEAL_PTR (e->next);
> +}
> +
>  static void
>  tcache_thread_shutdown (void)
>  {
> @@ -3301,7 +3326,7 @@ __libc_malloc (size_t bytes)
>  
>    DIAG_PUSH_NEEDS_COMMENT;
>    if (tc_idx < mp_.tcache_bins
> -      && tcache
> +      && tcache != NULL
>        && tcache->counts[tc_idx] > 0)
>      {
>        victim = tcache_get (tc_idx);

I think the style change should be on a different patch.

> @@ -3552,6 +3577,38 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
>        alignment = a;
>      }
>  
> +#if USE_TCACHE
> +  {
> +    size_t tbytes;
> +    tbytes = checked_request2size (bytes);
> +    if (tbytes == 0)
> +      {
> +	__set_errno (ENOMEM);
> +	return NULL;
> +      }
> +    size_t tc_idx = csize2tidx (tbytes);
> +
> +    if (tc_idx < mp_.tcache_bins
> +	&& tcache != NULL
> +	&& tcache->counts[tc_idx] > 0)
> +      {
> +	/* The tcache itself isn't encoded, but the chain is.  */
> +	tcache_entry **tep = & tcache->entries[tc_idx];
> +	tcache_entry *te = *tep;
> +	while (te != NULL && ((intptr_t)te & (alignment - 1)) != 0)

Maybe use '!PTR_IS_ALIGNED (te, alignment)' here?
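
(For reference -- a minimal sketch of what such an alignment macro boils
down to for power-of-two alignments; this is an illustration, not the
actual libc-pointer-arith.h definition:

  #define IS_ALIGNED_SKETCH(p, align) \
    ((((uintptr_t) (p)) & ((align) - 1)) == 0)

so the loop condition '((intptr_t)te & (alignment - 1)) != 0' above is
just '!PTR_IS_ALIGNED (te, alignment)' in that spelling, and a bit
easier to read.)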

> +	  {
> +	    tep = & (te->next);
> +	    te = tcache_next (te);
> +	  }
> +	if (te != NULL)
> +	  {
> +	    void *victim = tcache_get_n (tc_idx, tep);
> +	    return tag_new_usable (victim);
> +	  }
> +      }
> +  }
> +#endif
> +
>    if (SINGLE_THREAD_P)
>      {
>        p = _int_memalign (&main_arena, alignment, bytes);
> @@ -3857,7 +3914,7 @@ _int_malloc (mstate av, size_t bytes)
>  	      /* While we're here, if we see other chunks of the same size,
>  		 stash them in the tcache.  */
>  	      size_t tc_idx = csize2tidx (nb);
> -	      if (tcache && tc_idx < mp_.tcache_bins)
> +	      if (tcache != NULL && tc_idx < mp_.tcache_bins)
>  		{
>  		  mchunkptr tc_victim;
>  

I think the style change should be on a different patch.

> @@ -3915,7 +3972,7 @@ _int_malloc (mstate av, size_t bytes)
>  	  /* While we're here, if we see other chunks of the same size,
>  	     stash them in the tcache.  */
>  	  size_t tc_idx = csize2tidx (nb);
> -	  if (tcache && tc_idx < mp_.tcache_bins)
> +	  if (tcache != NULL && tc_idx < mp_.tcache_bins)
>  	    {
>  	      mchunkptr tc_victim;
>  

Ditto.

> @@ -3977,7 +4034,7 @@ _int_malloc (mstate av, size_t bytes)
>  #if USE_TCACHE
>    INTERNAL_SIZE_T tcache_nb = 0;
>    size_t tc_idx = csize2tidx (nb);
> -  if (tcache && tc_idx < mp_.tcache_bins)
> +  if (tcache != NULL && tc_idx < mp_.tcache_bins)
>      tcache_nb = nb;
>    int return_cached = 0;
>  

Ditto.

> @@ -4059,7 +4116,7 @@ _int_malloc (mstate av, size_t bytes)
>  #if USE_TCACHE
>  	      /* Fill cache first, return to user only if cache fills.
>  		 We may return one of these chunks later.  */
> -	      if (tcache_nb
> +	      if (tcache_nb > 0
>  		  && tcache->counts[tc_idx] < mp_.tcache_count)
>  		{
>  		  tcache_put (victim, tc_idx);

Ditto.

> @@ -4932,6 +4989,43 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
>     ------------------------------ memalign ------------------------------
>   */
>  
> +/* Returns 0 if the chunk is not and does not contain the requested
> +   aligned sub-chunk, else returns the amount of "waste" from
> +   trimming.  BYTES is the *user* byte size, not the chunk byte
> +   size.  */
> +static int

Shouldn't it return a size_t here?
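
(With an int return type the computed waste can stop fitting for very
large chunks; a contrived sketch, not taken from any reported workload:

  size_t size  = (size_t) INT_MAX + 4096;  /* huge free chunk */
  size_t bytes = 64;                       /* user request */
  int waste = size - bytes;                /* no longer fits in an int,
                                              typically ends up negative */

so a size_t return keeps the later "waste" comparisons well-defined.)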

> +chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
> +{
> +  void *m = chunk2mem (p);
> +  INTERNAL_SIZE_T size = memsize (p);
> +  void *aligned_m = m;
> +
> +  if (__glibc_unlikely (misaligned_chunk (p)))
> +    malloc_printerr ("_int_memalign(): unaligned chunk detected");
> +
> +  aligned_m = PTR_ALIGN_UP (m, alignment);
> +
> +  INTERNAL_SIZE_T front_extra = (intptr_t) aligned_m - (intptr_t) m;
> +
> +  /* We can't trim off the front as it's too small.  */
> +  if (front_extra > 0 && front_extra < MINSIZE)
> +    return 0;
> +
> +  /* If it's a perfect fit, it's an exception to the return value rule
> +     (we would return zero waste, which looks like "not usable"), so
> +     handle it here by returning a small non-zero value instead.  */
> +  if (size == bytes && front_extra == 0)
> +    return 1;
> +
> +  /* If the block we need fits in the chunk, calculate total waste.  */
> +  if (size > bytes + front_extra)
> +    return size - bytes;
> +
> +  /* Can't use this chunk.  */ 
> +  return 0;
> +}
> +
> +/* BYTES is user requested bytes, not requested chunksize bytes.  */
>  static void *
>  _int_memalign (mstate av, size_t alignment, size_t bytes)
>  {
> @@ -4945,8 +5039,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
>    mchunkptr remainder;            /* spare room at end to split off */
>    unsigned long remainder_size;   /* its size */
>    INTERNAL_SIZE_T size;
> -
> -

Spurious extra new lines?

> +  mchunkptr victim;
>  
>    nb = checked_request2size (bytes);
>    if (nb == 0)
> @@ -4955,29 +5048,142 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
>        return NULL;
>      }
>  
> -  /*
> -     Strategy: find a spot within that chunk that meets the alignment
> +  /* We can't check tcache here because we hold the arena lock, which
> +     tcache doesn't expect.  We expect it has been checked
> +     earlier.  */
> +
> +  /* Strategy: search the bins looking for an existing block that
> +     meets our needs.  We scan a range of bins from "exact size" to
> +     "just under 2x", spanning the small/large barrier if needed.  If
> +     we don't find anything in those bins, the common malloc code will
> +     scan starting at 2x.  */
> +
> +  /* This will be set if we found a candidate chunk.  */
> +  victim = NULL;
> +
> +  /* Fast bins are singly-linked, hard to remove a chunk from the middle
> +     and unlikely to meet our alignment requirements.  We have not done
> +     any experimentation with searching for aligned fastbins.  */
> +
> +  int first_bin_index;
> +  int first_largebin_index;
> +  int last_bin_index;
> +
> +  if (in_smallbin_range (nb))
> +    first_bin_index = smallbin_index (nb);
> +  else
> +    first_bin_index = largebin_index (nb);
> +
> +  if (in_smallbin_range (nb * 2))
> +    last_bin_index = smallbin_index (nb * 2);
> +  else
> +    last_bin_index = largebin_index (nb * 2);
> +
> +  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
> +
> +  int victim_index;                 /* its bin index */
> +
> +  for (victim_index = first_bin_index;
> +       victim_index < last_bin_index;
> +       victim_index ++)
> +    {
> +      victim = NULL;
> +
> +      if (victim_index < first_largebin_index)
> +    {
> +      /* Check small bins.  Small bin chunks are doubly-linked despite
> +	 being the same size.  */
> +
> +      mchunkptr fwd;                    /* misc temp for linking */
> +      mchunkptr bck;                    /* misc temp for linking */
> +
> +      bck = bin_at (av, victim_index);
> +      fwd = bck->fd;
> +      while (fwd != bck)
> +	{
> +	  if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
> +	    {
> +	      victim = fwd;
> +
> +	      /* Unlink it */
> +	      victim->fd->bk = victim->bk;
> +	      victim->bk->fd = victim->fd;
> +	      break;
> +	    }
> +
> +	  fwd = fwd->fd;
> +	}
> +    }
> +  else
> +    {
> +      /* Check large bins.  */
> +      mchunkptr fwd;                    /* misc temp for linking */
> +      mchunkptr bck;                    /* misc temp for linking */
> +      mchunkptr best = NULL;
> +      size_t best_size = 0;
> +
> +      bck = bin_at (av, victim_index);
> +      fwd = bck->fd;
> +
> +      while (fwd != bck)
> +	{
> +	  int extra;
> +
> +	  if (chunksize (fwd) < nb)
> +	      break;
> +	  extra = chunk_ok_for_memalign (fwd, alignment, bytes);
> +	  if (extra > 0
> +	      && (extra <= best_size || best == NULL))
> +	    {
> +	      best = fwd;
> +	      best_size = extra;
> +	    }
> +
> +	  fwd = fwd->fd;
> +	}
> +      victim = best;
> +
> +      if (victim != NULL)
> +	{
> +	  unlink_chunk (av, victim);
> +	  break;
> +	}
> +    }
> +
> +      if (victim != NULL)
> +	break;
> +    }
> +
> +  /* Strategy: find a spot within that chunk that meets the alignment
>       request, and then possibly free the leading and trailing space.
> -   */
> +     This strategy is incredibly costly and can lead to external
> +     fragmentation if header and footer chunks are unused.  */
>  
> -  /* Call malloc with worst case padding to hit alignment. */
> +  if (victim != NULL)
> +    {
> +      p = victim;
> +      m = chunk2mem (p);
> +      set_inuse (p);
> +    }
> +  else
> +    {
> +      /* Call malloc with worst case padding to hit alignment. */
>  
> -  m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
> +      m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
>  
> -  if (m == 0)
> -    return 0;           /* propagate failure */
> +      if (m == 0)
> +	return 0;           /* propagate failure */
>  
> -  p = mem2chunk (m);
> +      p = mem2chunk (m);
> +    }
>  
>    if ((((unsigned long) (m)) % alignment) != 0)   /* misaligned */
> -
> -    { /*
> -                Find an aligned spot inside chunk.  Since we need to give back
> -                leading space in a chunk of at least MINSIZE, if the first
> -                calculation places us at a spot with less than MINSIZE leader,
> -                we can move to the next aligned spot -- we've allocated enough
> -                total room so that this is always possible.
> -                 */
> +    {
> +      /* Find an aligned spot inside chunk.  Since we need to give back
> +         leading space in a chunk of at least MINSIZE, if the first
> +         calculation places us at a spot with less than MINSIZE leader,
> +         we can move to the next aligned spot -- we've allocated enough
> +         total room so that this is always possible.  */
>        brk = (char *) mem2chunk (((unsigned long) (m + alignment - 1)) &
>                                  - ((signed long) alignment));
>        if ((unsigned long) (brk - (char *) (p)) < MINSIZE)
> diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
> new file mode 100644
> index 0000000000..ed3660959a
> --- /dev/null
> +++ b/malloc/tst-memalign-2.c
> @@ -0,0 +1,136 @@
> +/* Test for memalign chunk reuse

Missing period.

> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <errno.h>
> +#include <malloc.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <array_length.h>
> +
> +#include <support/check.h>
> +
> +typedef struct TestCase {
> +  size_t size;
> +  size_t alignment;
> +  void *ptr1;
> +  void *ptr2;
> +} TestCase;
> +
> +static TestCase tcache_allocs[] = {
> +  { 24, 8, NULL, NULL },
> +  { 24, 16, NULL, NULL },
> +  { 128, 32, NULL, NULL }
> +};
> +#define TN array_length (tcache_allocs)
> +
> +static TestCase large_allocs[] = {
> +  { 23450, 64, NULL, NULL },
> +  { 23450, 64, NULL, NULL },
> +  { 23550, 64, NULL, NULL },
> +  { 23550, 64, NULL, NULL },
> +  { 23650, 64, NULL, NULL },
> +  { 23650, 64, NULL, NULL },
> +  { 33650, 64, NULL, NULL },
> +  { 33650, 64, NULL, NULL }
> +};
> +#define LN array_length (large_allocs)
> +
> +void *p;
> +
> +static int
> +do_test (void)
> +{
> +  int i, j;
> +  int count;
> +  void *ptr[10];
> +  void *p;
> +
> +  /* TCache test.  */
> +
> +  for (i = 0; i < TN; ++ i)
> +    {
> +      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
> +      free (tcache_allocs[i].ptr1);
> +      /* This should return the same chunk as was just free'd.  */
> +      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
> +      free (tcache_allocs[i].ptr2);

Should we also check for non NULL and return alignment as sanity checks here?
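
(Something along these lines would do, just as a shape suggestion -- the
macro name here is made up:

  #define CHECK_ALLOC(p, align)                                  \
    do                                                           \
      {                                                          \
        TEST_VERIFY_EXIT ((p) != NULL);                          \
        TEST_VERIFY_EXIT ((((uintptr_t) (p)) % (align)) == 0);   \
      }                                                          \
    while (0)

invoked right after each memalign/malloc call in the test.)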

> +
> +      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
> +    }
> +
> +  /* Test for non-head tcache hits.  */
> +  for (i = 0; i < 10; ++ i)

Maybe use array_length (ptr) here.

> +    {
> +      if (i == 4)
> +	ptr[i] = memalign (64, 256);
> +      else
> +	ptr[i] = malloc (256);
> +    }
> +  for (i = 0; i < 10; ++ i)
> +    free (ptr[i]);
> +
> +  p = memalign (64, 256);
> +
> +  count = 0;
> +  for (i = 0; i < 10; ++ i)
> +    if (ptr[i] == p)
> +      ++ count;
> +  free (p);
> +  TEST_VERIFY (count > 0);
> +
> +  /* Large bins test.  */
> +
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
> +      /* Keep chunks from combining by fragmenting the heap.  */
> +      p = malloc (512);
> +    }
> +
> +  for (i = 0; i < LN; ++ i)
> +    free (large_allocs[i].ptr1);
> +
> +  /* Force the unsorted bins to be scanned and moved to small/large
> +     bins.  */
> +  p = malloc (60000);
> +
> +  for (i = 0; i < LN; ++ i)
> +    large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
> +
> +  count = 0;
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      int ok = 0;
> +      for (j = 0; j < LN; ++ j)
> +	if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
> +	  ok = 1;
> +      if (ok == 1)
> +	count ++;
> +    }
> +
> +  /* The allocation algorithm is complicated outside of the memalign
> +     logic, so just make sure it's working for most of the
> +     allocations.  This avoids possible boundary conditions with
> +     empty/full heaps.  */
> +  TEST_VERIFY (count > LN / 2);
> +
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> 

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-03-28 19:07             ` Adhemerval Zanella Netto
@ 2023-03-29  4:20               ` DJ Delorie
  2023-03-29 19:41                 ` Adhemerval Zanella Netto
  2023-03-31 15:39                 ` Adhemerval Zanella Netto
  0 siblings, 2 replies; 38+ messages in thread
From: DJ Delorie @ 2023-03-29  4:20 UTC (permalink / raw)
  To: Adhemerval Zanella Netto; +Cc: libc-alpha

Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> writes:
> Hi DJ (I think I got it right now),

Yup!

> patch looks good, some comments below.

v5 attached with changes as noted.

>> +/* Iterates through the tcache linked list.  */
>> +static __always_inline void *
>
> Why not use 'tcache_entry *' as return type here?
>
>> +tcache_next (tcache_entry *e)

IIRC I copied tcache_get(), which returns that.
Fixed.

>> +	while (te != NULL && ((intptr_t)te & (alignment - 1)) != 0)
>
> Maybe use '!PTR_IS_ALIGNED (te, alignment)' here?

Yup.

>> +	  {
>> +	    tep = & (te->next);
>> +	    te = tcache_next (te);
>> +	  }
>> +	if (te != NULL)
>> +	  {
>> +	    void *victim = tcache_get_n (tc_idx, tep);
>> +	    return tag_new_usable (victim);
>> +	  }
>> +      }
>> +  }
>> +#endif
>> +
>>    if (SINGLE_THREAD_P)
>>      {
>>        p = _int_memalign (&main_arena, alignment, bytes);
>> @@ -3857,7 +3914,7 @@ _int_malloc (mstate av, size_t bytes)
>>  	      /* While we're here, if we see other chunks of the same size,
>>  		 stash them in the tcache.  */
>>  	      size_t tc_idx = csize2tidx (nb);
>> -	      if (tcache && tc_idx < mp_.tcache_bins)
>> +	      if (tcache != NULL && tc_idx < mp_.tcache_bins)
>>  		{
>>  		  mchunkptr tc_victim;
>>  
>
> I think the style chance should be on a different patch.

Perhaps, but IIRC I needed those to get the warnings down to zero, so I'd
prefer to leave them in.  Too much effort to split them out.

>> +/* Returns 0 if the chunk is not and does not contain the requested
>> +   aligned sub-chunk, else returns the amount of "waste" from
>> +   trimming.  BYTES is the *user* byte size, not the chunk byte
>> +   size.  */
>> +static int
>
> Shouldn't it return a size_t here?

Fixed.

>> +chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
>>  _int_memalign (mstate av, size_t alignment, size_t bytes)
>>  {
>> @@ -4945,8 +5039,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
>>    mchunkptr remainder;            /* spare room at end to split off */
>>    unsigned long remainder_size;   /* its size */
>>    INTERNAL_SIZE_T size;
>> -
>> -
>
> Spurious extra new lines?

The original had three blank lines there for some reason.  I wouldn't
have bothered if I didn't have to add a new decl there anyway.

>> diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
>> new file mode 100644
>> index 0000000000..ed3660959a
>> --- /dev/null
>> +++ b/malloc/tst-memalign-2.c
>> @@ -0,0 +1,136 @@
>> +/* Test for memalign chunk reuse
>
> Missing period.

Fixed.

>> +  for (i = 0; i < TN; ++ i)
>> +    {
>> +      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
>> +      free (tcache_allocs[i].ptr1);
>> +      /* This should return the same chunk as was just free'd.  */
>> +      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
>> +      free (tcache_allocs[i].ptr2);
>
> Should we also check for non NULL and return alignment as sanity checks here?

Done.

>> +
>> +      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
>> +    }
>> +
>> +  /* Test for non-head tcache hits.  */
>> +  for (i = 0; i < 10; ++ i)
>
> Maybe use array_length (ptr) here.

Done.
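
For anyone who wants to poke at this locally, this is roughly the shape
of workload the patch is aimed at (the sizes and loop count are made up
for illustration; it is not one of the reported reproducers):

  #include <stdlib.h>
  #include <malloc.h>

  #define N 1000

  int
  main (void)
  {
    /* Small allocations that stay live until exit, so freed aligned
       chunks cannot coalesce with their neighbours.  */
    static void *pins[N];

    /* Aligned blocks interleaved with the pinning allocations.
       Compare malloc_stats () output with and without the patch to
       see whether the arena keeps growing.  */
    for (int i = 0; i < N; i++)
      {
        void *a = memalign (64, 23450);
        pins[i] = malloc (512);
        free (a);
      }
    malloc_stats ();
    return 0;
  }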


From e32abda27e5c0aa82f4b736fdca35d56bf665cce Mon Sep 17 00:00:00 2001
From: DJ Delorie via Libc-alpha <libc-alpha@sourceware.org>
Date: Wed, 29 Mar 2023 00:18:40 -0400
Subject: memalign: Support scanning for aligned chunks.

This patch adds a chunk scanning algorithm to the _int_memalign code
path that reduces heap fragmentation by reusing already aligned chunks
instead of always looking for chunks of larger sizes and splitting
them.  The tcache macros are extended to allow removing a chunk from
the middle of the list.

The goal is to fix the pathological use cases where heaps grow
continuously in workloads that are heavy users of memalign.

Note that tst-memalign-2 checks for tcache operation, which
malloc-check bypasses.

diff --git a/malloc/Makefile b/malloc/Makefile
index dfb51d344c..79178c4905 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -43,6 +43,7 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc tst-obstack \
 	 tst-tcfree1 tst-tcfree2 tst-tcfree3 \
 	 tst-safe-linking \
 	 tst-mallocalign1 \
+	 tst-memalign-2
 
 tests-static := \
 	 tst-interpose-static-nothread \
@@ -72,7 +73,7 @@ test-srcs = tst-mtrace
 # with MALLOC_CHECK_=3 because they expect a specific failure.
 tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
 	tst-mxfast tst-safe-linking \
-	tst-compathooks-off tst-compathooks-on
+	tst-compathooks-off tst-compathooks-on tst-memalign-2
 
 # Run all tests with MALLOC_CHECK_=3
 tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 76c50e3f58..8ebc4372bc 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3162,19 +3162,44 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
 }
 
 /* Caller must ensure that we know tc_idx is valid and there's
-   available chunks to remove.  */
+   available chunks to remove.  Removes chunk from the middle of the
+   list.  */
 static __always_inline void *
-tcache_get (size_t tc_idx)
+tcache_get_n (size_t tc_idx, tcache_entry **ep)
 {
-  tcache_entry *e = tcache->entries[tc_idx];
+  tcache_entry *e;
+  if (ep == &(tcache->entries[tc_idx]))
+    e = *ep;
+  else
+    e = REVEAL_PTR (*ep);
+
   if (__glibc_unlikely (!aligned_OK (e)))
     malloc_printerr ("malloc(): unaligned tcache chunk detected");
-  tcache->entries[tc_idx] = REVEAL_PTR (e->next);
+
+  if (ep == &(tcache->entries[tc_idx]))
+      *ep = REVEAL_PTR (e->next);
+  else
+    *ep = PROTECT_PTR (ep, REVEAL_PTR (e->next));
+
   --(tcache->counts[tc_idx]);
   e->key = 0;
   return (void *) e;
 }
 
+/* Like the above, but removes from the head of the list.  */
+static __always_inline void *
+tcache_get (size_t tc_idx)
+{
+  return tcache_get_n (tc_idx, & tcache->entries[tc_idx]);
+}
+
+/* Iterates through the tcache linked list.  */
+static __always_inline tcache_entry *
+tcache_next (tcache_entry *e)
+{
+  return (tcache_entry *) REVEAL_PTR (e->next);
+}
+
 static void
 tcache_thread_shutdown (void)
 {
@@ -3283,7 +3308,7 @@ __libc_malloc (size_t bytes)
 
   DIAG_PUSH_NEEDS_COMMENT;
   if (tc_idx < mp_.tcache_bins
-      && tcache
+      && tcache != NULL
       && tcache->counts[tc_idx] > 0)
     {
       victim = tcache_get (tc_idx);
@@ -3542,6 +3567,38 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
       alignment = a;
     }
 
+#if USE_TCACHE
+  {
+    size_t tbytes;
+    tbytes = checked_request2size (bytes);
+    if (tbytes == 0)
+      {
+	__set_errno (ENOMEM);
+	return NULL;
+      }
+    size_t tc_idx = csize2tidx (tbytes);
+
+    if (tc_idx < mp_.tcache_bins
+	&& tcache != NULL
+	&& tcache->counts[tc_idx] > 0)
+      {
+	/* The tcache itself isn't encoded, but the chain is.  */
+	tcache_entry **tep = & tcache->entries[tc_idx];
+	tcache_entry *te = *tep;
+	while (te != NULL && !PTR_IS_ALIGNED (te, alignment))
+	  {
+	    tep = & (te->next);
+	    te = tcache_next (te);
+	  }
+	if (te != NULL)
+	  {
+	    void *victim = tcache_get_n (tc_idx, tep);
+	    return tag_new_usable (victim);
+	  }
+      }
+  }
+#endif
+
   if (SINGLE_THREAD_P)
     {
       p = _int_memalign (&main_arena, alignment, bytes);
@@ -3847,7 +3904,7 @@ _int_malloc (mstate av, size_t bytes)
 	      /* While we're here, if we see other chunks of the same size,
 		 stash them in the tcache.  */
 	      size_t tc_idx = csize2tidx (nb);
-	      if (tcache && tc_idx < mp_.tcache_bins)
+	      if (tcache != NULL && tc_idx < mp_.tcache_bins)
 		{
 		  mchunkptr tc_victim;
 
@@ -3905,7 +3962,7 @@ _int_malloc (mstate av, size_t bytes)
 	  /* While we're here, if we see other chunks of the same size,
 	     stash them in the tcache.  */
 	  size_t tc_idx = csize2tidx (nb);
-	  if (tcache && tc_idx < mp_.tcache_bins)
+	  if (tcache != NULL && tc_idx < mp_.tcache_bins)
 	    {
 	      mchunkptr tc_victim;
 
@@ -3967,7 +4024,7 @@ _int_malloc (mstate av, size_t bytes)
 #if USE_TCACHE
   INTERNAL_SIZE_T tcache_nb = 0;
   size_t tc_idx = csize2tidx (nb);
-  if (tcache && tc_idx < mp_.tcache_bins)
+  if (tcache != NULL && tc_idx < mp_.tcache_bins)
     tcache_nb = nb;
   int return_cached = 0;
 
@@ -4047,7 +4104,7 @@ _int_malloc (mstate av, size_t bytes)
 #if USE_TCACHE
 	      /* Fill cache first, return to user only if cache fills.
 		 We may return one of these chunks later.  */
-	      if (tcache_nb
+	      if (tcache_nb > 0
 		  && tcache->counts[tc_idx] < mp_.tcache_count)
 		{
 		  tcache_put (victim, tc_idx);
@@ -4921,6 +4978,43 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
    ------------------------------ memalign ------------------------------
  */
 
+/* Returns 0 if the chunk is not and does not contain the requested
+   aligned sub-chunk, else returns the amount of "waste" from
+   trimming.  BYTES is the *user* byte size, not the chunk byte
+   size.  */
+static size_t
+chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
+{
+  void *m = chunk2mem (p);
+  INTERNAL_SIZE_T size = memsize (p);
+  void *aligned_m = m;
+
+  if (__glibc_unlikely (misaligned_chunk (p)))
+    malloc_printerr ("_int_memalign(): unaligned chunk detected");
+
+  aligned_m = PTR_ALIGN_UP (m, alignment);
+
+  INTERNAL_SIZE_T front_extra = (intptr_t) aligned_m - (intptr_t) m;
+
+  /* We can't trim off the front as it's too small.  */
+  if (front_extra > 0 && front_extra < MINSIZE)
+    return 0;
+
+  /* If it's a perfect fit, it's an exception to the return value rule
+     (we would return zero waste, which looks like "not usable"), so
+     handle it here by returning a small non-zero value instead.  */
+  if (size == bytes && front_extra == 0)
+    return 1;
+
+  /* If the block we need fits in the chunk, calculate total waste.  */
+  if (size > bytes + front_extra)
+    return size - bytes;
+
+  /* Can't use this chunk.  */ 
+  return 0;
+}
+
+/* BYTES is user requested bytes, not requested chunksize bytes.  */
 static void *
 _int_memalign (mstate av, size_t alignment, size_t bytes)
 {
@@ -4934,8 +5028,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
   mchunkptr remainder;            /* spare room at end to split off */
   unsigned long remainder_size;   /* its size */
   INTERNAL_SIZE_T size;
-
-
+  mchunkptr victim;
 
   nb = checked_request2size (bytes);
   if (nb == 0)
@@ -4944,29 +5037,142 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
       return NULL;
     }
 
-  /*
-     Strategy: find a spot within that chunk that meets the alignment
+  /* We can't check tcache here because we hold the arena lock, which
+     tcache doesn't expect.  We expect it has been checked
+     earlier.  */
+
+  /* Strategy: search the bins looking for an existing block that
+     meets our needs.  We scan a range of bins from "exact size" to
+     "just under 2x", spanning the small/large barrier if needed.  If
+     we don't find anything in those bins, the common malloc code will
+     scan starting at 2x.  */
+
+  /* This will be set if we found a candidate chunk.  */
+  victim = NULL;
+
+  /* Fast bins are singly-linked, hard to remove a chunk from the middle
+     and unlikely to meet our alignment requirements.  We have not done
+     any experimentation with searching for aligned fastbins.  */
+
+  int first_bin_index;
+  int first_largebin_index;
+  int last_bin_index;
+
+  if (in_smallbin_range (nb))
+    first_bin_index = smallbin_index (nb);
+  else
+    first_bin_index = largebin_index (nb);
+
+  if (in_smallbin_range (nb * 2))
+    last_bin_index = smallbin_index (nb * 2);
+  else
+    last_bin_index = largebin_index (nb * 2);
+
+  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
+
+  int victim_index;                 /* its bin index */
+
+  for (victim_index = first_bin_index;
+       victim_index < last_bin_index;
+       victim_index ++)
+    {
+      victim = NULL;
+
+      if (victim_index < first_largebin_index)
+    {
+      /* Check small bins.  Small bin chunks are doubly-linked despite
+	 being the same size.  */
+
+      mchunkptr fwd;                    /* misc temp for linking */
+      mchunkptr bck;                    /* misc temp for linking */
+
+      bck = bin_at (av, victim_index);
+      fwd = bck->fd;
+      while (fwd != bck)
+	{
+	  if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
+	    {
+	      victim = fwd;
+
+	      /* Unlink it */
+	      victim->fd->bk = victim->bk;
+	      victim->bk->fd = victim->fd;
+	      break;
+	    }
+
+	  fwd = fwd->fd;
+	}
+    }
+  else
+    {
+      /* Check large bins.  */
+      mchunkptr fwd;                    /* misc temp for linking */
+      mchunkptr bck;                    /* misc temp for linking */
+      mchunkptr best = NULL;
+      size_t best_size = 0;
+
+      bck = bin_at (av, victim_index);
+      fwd = bck->fd;
+
+      while (fwd != bck)
+	{
+	  size_t extra;
+
+	  if (chunksize (fwd) < nb)
+	      break;
+	  extra = chunk_ok_for_memalign (fwd, alignment, bytes);
+	  if (extra > 0
+	      && (extra <= best_size || best == NULL))
+	    {
+	      best = fwd;
+	      best_size = extra;
+	    }
+
+	  fwd = fwd->fd;
+	}
+      victim = best;
+
+      if (victim != NULL)
+	{
+	  unlink_chunk (av, victim);
+	  break;
+	}
+    }
+
+      if (victim != NULL)
+	break;
+    }
+
+  /* Strategy: find a spot within that chunk that meets the alignment
      request, and then possibly free the leading and trailing space.
-   */
+     This strategy is incredibly costly and can lead to external
+     fragmentation if header and footer chunks are unused.  */
 
-  /* Call malloc with worst case padding to hit alignment. */
+  if (victim != NULL)
+    {
+      p = victim;
+      m = chunk2mem (p);
+      set_inuse (p);
+    }
+  else
+    {
+      /* Call malloc with worst case padding to hit alignment. */
 
-  m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
+      m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
 
-  if (m == 0)
-    return 0;           /* propagate failure */
+      if (m == 0)
+	return 0;           /* propagate failure */
 
-  p = mem2chunk (m);
+      p = mem2chunk (m);
+    }
 
   if ((((unsigned long) (m)) % alignment) != 0)   /* misaligned */
-
-    { /*
-                Find an aligned spot inside chunk.  Since we need to give back
-                leading space in a chunk of at least MINSIZE, if the first
-                calculation places us at a spot with less than MINSIZE leader,
-                we can move to the next aligned spot -- we've allocated enough
-                total room so that this is always possible.
-                 */
+    {
+      /* Find an aligned spot inside chunk.  Since we need to give back
+         leading space in a chunk of at least MINSIZE, if the first
+         calculation places us at a spot with less than MINSIZE leader,
+         we can move to the next aligned spot -- we've allocated enough
+         total room so that this is always possible.  */
       brk = (char *) mem2chunk (((unsigned long) (m + alignment - 1)) &
                                 - ((signed long) alignment));
       if ((unsigned long) (brk - (char *) (p)) < MINSIZE)
diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
new file mode 100644
index 0000000000..4996578e9f
--- /dev/null
+++ b/malloc/tst-memalign-2.c
@@ -0,0 +1,155 @@
+/* Test for memalign chunk reuse.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <array_length.h>
+#include <libc-pointer-arith.h>
+#include <support/check.h>
+
+typedef struct TestCase {
+  size_t size;
+  size_t alignment;
+  void *ptr1;
+  void *ptr2;
+} TestCase;
+
+static TestCase tcache_allocs[] = {
+  { 24, 8, NULL, NULL },
+  { 24, 16, NULL, NULL },
+  { 128, 32, NULL, NULL }
+};
+#define TN array_length (tcache_allocs)
+
+static TestCase large_allocs[] = {
+  { 23450, 64, NULL, NULL },
+  { 23450, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL }
+};
+#define LN array_length (large_allocs)
+
+void *p;
+
+/* Sanity checks, ancillary to the actual test.  */
+#define CHECK(p,a) \
+  if (p == NULL || !PTR_IS_ALIGNED (p, a)) \
+    FAIL_EXIT1 ("NULL or misaligned memory detected.\n");
+
+static int
+do_test (void)
+{
+  int i, j;
+  int count;
+  void *ptr[10];
+  void *p;
+
+  /* TCache test.  */
+
+  for (i = 0; i < TN; ++ i)
+    {
+      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      CHECK (tcache_allocs[i].ptr1, tcache_allocs[i].alignment);
+      free (tcache_allocs[i].ptr1);
+      /* This should return the same chunk as was just free'd.  */
+      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      CHECK (tcache_allocs[i].ptr2, tcache_allocs[i].alignment);
+      free (tcache_allocs[i].ptr2);
+
+      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
+    }
+
+  /* Test for non-head tcache hits.  */
+  for (i = 0; i < array_length (ptr); ++ i)
+    {
+      if (i == 4)
+	{
+	  ptr[i] = memalign (64, 256);
+	  CHECK (ptr[i], 64);
+	}
+      else
+	{
+	  ptr[i] = malloc (256);
+	  CHECK (ptr[i], 4);
+	}
+    }
+  for (i = 0; i < array_length (ptr); ++ i)
+    free (ptr[i]);
+
+  p = memalign (64, 256);
+  CHECK (p, 64);
+
+  count = 0;
+  for (i = 0; i < 10; ++ i)
+    if (ptr[i] == p)
+      ++ count;
+  free (p);
+  TEST_VERIFY (count > 0);
+
+  /* Large bins test.  */
+
+  for (i = 0; i < LN; ++ i)
+    {
+      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+      CHECK (large_allocs[i].ptr1, large_allocs[i].alignment);
+      /* Keep chunks from combining by fragmenting the heap.  */
+      p = malloc (512);
+      CHECK (p, 4);
+    }
+
+  for (i = 0; i < LN; ++ i)
+    free (large_allocs[i].ptr1);
+
+  /* Force the unsorted bins to be scanned and moved to small/large
+     bins.  */
+  p = malloc (60000);
+
+  for (i = 0; i < LN; ++ i)
+    {
+      large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+      CHECK (large_allocs[i].ptr2, large_allocs[i].alignment);
+    }
+
+  count = 0;
+  for (i = 0; i < LN; ++ i)
+    {
+      int ok = 0;
+      for (j = 0; j < LN; ++ j)
+	if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
+	  ok = 1;
+      if (ok == 1)
+	count ++;
+    }
+
+  /* The allocation algorithm is complicated outside of the memalign
+     logic, so just make sure it's working for most of the
+     allocations.  This avoids possible boundary conditions with
+     empty/full heaps.  */
+  TEST_VERIFY (count > LN / 2);
+
+  return 0;
+}
+
+#include <support/test-driver.c>


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-03-29  4:20               ` [PATCH v5 " DJ Delorie
@ 2023-03-29 19:41                 ` Adhemerval Zanella Netto
  2023-03-29 20:36                   ` DJ Delorie
  2023-03-31 15:39                 ` Adhemerval Zanella Netto
  1 sibling, 1 reply; 38+ messages in thread
From: Adhemerval Zanella Netto @ 2023-03-29 19:41 UTC (permalink / raw)
  To: DJ Delorie; +Cc: libc-alpha



On 29/03/23 01:20, DJ Delorie wrote:
> From e32abda27e5c0aa82f4b736fdca35d56bf665cce Mon Sep 17 00:00:00 2001
> From: DJ Delorie via Libc-alpha <libc-alpha@sourceware.org>
> Date: Wed, 29 Mar 2023 00:18:40 -0400
> Subject: memalign: Support scanning for aligned chunks.
> 
> This patch adds a chunk scanning algorithm to the _int_memalign code
> path that reduces heap fragmentation by reusing already aligned chunks
> instead of always looking for chunks of larger sizes and splitting
> them.  The tcache macros are extended to allow removing a chunk from
> the middle of the list.
> 
> The goal is to fix the pathological use cases where heaps grow
> continuously in workloads that are heavy users of memalign.
> 
> Note that tst-memalign-2 checks for tcache operation, which
> malloc-check bypasses.

LGTM, thanks.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

> 
> diff --git a/malloc/Makefile b/malloc/Makefile
> index dfb51d344c..79178c4905 100644
> --- a/malloc/Makefile
> +++ b/malloc/Makefile
> @@ -43,6 +43,7 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc tst-obstack \
>  	 tst-tcfree1 tst-tcfree2 tst-tcfree3 \
>  	 tst-safe-linking \
>  	 tst-mallocalign1 \
> +	 tst-memalign-2
>  
>  tests-static := \
>  	 tst-interpose-static-nothread \
> @@ -72,7 +73,7 @@ test-srcs = tst-mtrace
>  # with MALLOC_CHECK_=3 because they expect a specific failure.
>  tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
>  	tst-mxfast tst-safe-linking \
> -	tst-compathooks-off tst-compathooks-on
> +	tst-compathooks-off tst-compathooks-on tst-memalign-2
>  
>  # Run all tests with MALLOC_CHECK_=3
>  tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 76c50e3f58..8ebc4372bc 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3162,19 +3162,44 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
>  }
>  
>  /* Caller must ensure that we know tc_idx is valid and there's
> -   available chunks to remove.  */
> +   available chunks to remove.  Removes chunk from the middle of the
> +   list.  */
>  static __always_inline void *
> -tcache_get (size_t tc_idx)
> +tcache_get_n (size_t tc_idx, tcache_entry **ep)
>  {
> -  tcache_entry *e = tcache->entries[tc_idx];
> +  tcache_entry *e;
> +  if (ep == &(tcache->entries[tc_idx]))
> +    e = *ep;
> +  else
> +    e = REVEAL_PTR (*ep);
> +
>    if (__glibc_unlikely (!aligned_OK (e)))
>      malloc_printerr ("malloc(): unaligned tcache chunk detected");
> -  tcache->entries[tc_idx] = REVEAL_PTR (e->next);
> +
> +  if (ep == &(tcache->entries[tc_idx]))
> +      *ep = REVEAL_PTR (e->next);
> +  else
> +    *ep = PROTECT_PTR (ep, REVEAL_PTR (e->next));
> +
>    --(tcache->counts[tc_idx]);
>    e->key = 0;
>    return (void *) e;
>  }
>  
> +/* Like the above, but removes from the head of the list.  */
> +static __always_inline void *
> +tcache_get (size_t tc_idx)
> +{
> +  return tcache_get_n (tc_idx, & tcache->entries[tc_idx]);
> +}
> +
> +/* Iterates through the tcache linked list.  */
> +static __always_inline tcache_entry *
> +tcache_next (tcache_entry *e)
> +{
> +  return (tcache_entry *) REVEAL_PTR (e->next);
> +}
> +
>  static void
>  tcache_thread_shutdown (void)
>  {
> @@ -3283,7 +3308,7 @@ __libc_malloc (size_t bytes)
>  
>    DIAG_PUSH_NEEDS_COMMENT;
>    if (tc_idx < mp_.tcache_bins
> -      && tcache
> +      && tcache != NULL
>        && tcache->counts[tc_idx] > 0)
>      {
>        victim = tcache_get (tc_idx);
> @@ -3542,6 +3567,38 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
>        alignment = a;
>      }
>  
> +#if USE_TCACHE
> +  {
> +    size_t tbytes;
> +    tbytes = checked_request2size (bytes);
> +    if (tbytes == 0)
> +      {
> +	__set_errno (ENOMEM);
> +	return NULL;
> +      }
> +    size_t tc_idx = csize2tidx (tbytes);
> +
> +    if (tc_idx < mp_.tcache_bins
> +	&& tcache != NULL
> +	&& tcache->counts[tc_idx] > 0)
> +      {
> +	/* The tcache itself isn't encoded, but the chain is.  */
> +	tcache_entry **tep = & tcache->entries[tc_idx];
> +	tcache_entry *te = *tep;
> +	while (te != NULL && !PTR_IS_ALIGNED (te, alignment))
> +	  {
> +	    tep = & (te->next);
> +	    te = tcache_next (te);
> +	  }
> +	if (te != NULL)
> +	  {
> +	    void *victim = tcache_get_n (tc_idx, tep);
> +	    return tag_new_usable (victim);
> +	  }
> +      }
> +  }
> +#endif
> +
>    if (SINGLE_THREAD_P)
>      {
>        p = _int_memalign (&main_arena, alignment, bytes);
> @@ -3847,7 +3904,7 @@ _int_malloc (mstate av, size_t bytes)
>  	      /* While we're here, if we see other chunks of the same size,
>  		 stash them in the tcache.  */
>  	      size_t tc_idx = csize2tidx (nb);
> -	      if (tcache && tc_idx < mp_.tcache_bins)
> +	      if (tcache != NULL && tc_idx < mp_.tcache_bins)
>  		{
>  		  mchunkptr tc_victim;
>  
> @@ -3905,7 +3962,7 @@ _int_malloc (mstate av, size_t bytes)
>  	  /* While we're here, if we see other chunks of the same size,
>  	     stash them in the tcache.  */
>  	  size_t tc_idx = csize2tidx (nb);
> -	  if (tcache && tc_idx < mp_.tcache_bins)
> +	  if (tcache != NULL && tc_idx < mp_.tcache_bins)
>  	    {
>  	      mchunkptr tc_victim;
>  
> @@ -3967,7 +4024,7 @@ _int_malloc (mstate av, size_t bytes)
>  #if USE_TCACHE
>    INTERNAL_SIZE_T tcache_nb = 0;
>    size_t tc_idx = csize2tidx (nb);
> -  if (tcache && tc_idx < mp_.tcache_bins)
> +  if (tcache != NULL && tc_idx < mp_.tcache_bins)
>      tcache_nb = nb;
>    int return_cached = 0;
>  
> @@ -4047,7 +4104,7 @@ _int_malloc (mstate av, size_t bytes)
>  #if USE_TCACHE
>  	      /* Fill cache first, return to user only if cache fills.
>  		 We may return one of these chunks later.  */
> -	      if (tcache_nb
> +	      if (tcache_nb > 0
>  		  && tcache->counts[tc_idx] < mp_.tcache_count)
>  		{
>  		  tcache_put (victim, tc_idx);
> @@ -4921,6 +4978,43 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
>     ------------------------------ memalign ------------------------------
>   */
>  
> +/* Returns 0 if the chunk is not and does not contain the requested
> +   aligned sub-chunk, else returns the amount of "waste" from
> +   trimming.  BYTES is the *user* byte size, not the chunk byte
> +   size.  */
> +static size_t
> +chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
> +{
> +  void *m = chunk2mem (p);
> +  INTERNAL_SIZE_T size = memsize (p);
> +  void *aligned_m = m;
> +
> +  if (__glibc_unlikely (misaligned_chunk (p)))
> +    malloc_printerr ("_int_memalign(): unaligned chunk detected");
> +
> +  aligned_m = PTR_ALIGN_UP (m, alignment);
> +
> +  INTERNAL_SIZE_T front_extra = (intptr_t) aligned_m - (intptr_t) m;
> +
> +  /* We can't trim off the front as it's too small.  */
> +  if (front_extra > 0 && front_extra < MINSIZE)
> +    return 0;
> +
> +  /* If it's a perfect fit, it's an exception to the return value rule
> +     (we would return zero waste, which looks like "not usable"), so
> +     handle it here by returning a small non-zero value instead.  */
> +  if (size == bytes && front_extra == 0)
> +    return 1;
> +
> +  /* If the block we need fits in the chunk, calculate total waste.  */
> +  if (size > bytes + front_extra)
> +    return size - bytes;
> +
> +  /* Can't use this chunk.  */ 
> +  return 0;
> +}
> +
> +/* BYTES is user requested bytes, not requested chunksize bytes.  */
>  static void *
>  _int_memalign (mstate av, size_t alignment, size_t bytes)
>  {
> @@ -4934,8 +5028,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
>    mchunkptr remainder;            /* spare room at end to split off */
>    unsigned long remainder_size;   /* its size */
>    INTERNAL_SIZE_T size;
> -
> -
> +  mchunkptr victim;
>  
>    nb = checked_request2size (bytes);
>    if (nb == 0)
> @@ -4944,29 +5037,142 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
>        return NULL;
>      }
>  
> -  /*
> -     Strategy: find a spot within that chunk that meets the alignment
> +  /* We can't check tcache here because we hold the arena lock, which
> +     tcache doesn't expect.  We expect it has been checked
> +     earlier.  */
> +
> +  /* Strategy: search the bins looking for an existing block that
> +     meets our needs.  We scan a range of bins from "exact size" to
> +     "just under 2x", spanning the small/large barrier if needed.  If
> +     we don't find anything in those bins, the common malloc code will
> +     scan starting at 2x.  */
> +
> +  /* This will be set if we found a candidate chunk.  */
> +  victim = NULL;
> +
> +  /* Fast bins are singly-linked, hard to remove a chunk from the middle
> +     and unlikely to meet our alignment requirements.  We have not done
> +     any experimentation with searching for aligned fastbins.  */
> +
> +  int first_bin_index;
> +  int first_largebin_index;
> +  int last_bin_index;
> +
> +  if (in_smallbin_range (nb))
> +    first_bin_index = smallbin_index (nb);
> +  else
> +    first_bin_index = largebin_index (nb);
> +
> +  if (in_smallbin_range (nb * 2))
> +    last_bin_index = smallbin_index (nb * 2);
> +  else
> +    last_bin_index = largebin_index (nb * 2);
> +
> +  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
> +
> +  int victim_index;                 /* its bin index */
> +
> +  for (victim_index = first_bin_index;
> +       victim_index < last_bin_index;
> +       victim_index ++)
> +    {
> +      victim = NULL;
> +
> +      if (victim_index < first_largebin_index)
> +    {
> +      /* Check small bins.  Small bin chunks are doubly-linked despite
> +	 being the same size.  */
> +
> +      mchunkptr fwd;                    /* misc temp for linking */
> +      mchunkptr bck;                    /* misc temp for linking */
> +
> +      bck = bin_at (av, victim_index);
> +      fwd = bck->fd;
> +      while (fwd != bck)
> +	{
> +	  if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
> +	    {
> +	      victim = fwd;
> +
> +	      /* Unlink it */
> +	      victim->fd->bk = victim->bk;
> +	      victim->bk->fd = victim->fd;
> +	      break;
> +	    }
> +
> +	  fwd = fwd->fd;
> +	}
> +    }
> +  else
> +    {
> +      /* Check large bins.  */
> +      mchunkptr fwd;                    /* misc temp for linking */
> +      mchunkptr bck;                    /* misc temp for linking */
> +      mchunkptr best = NULL;
> +      size_t best_size = 0;
> +
> +      bck = bin_at (av, victim_index);
> +      fwd = bck->fd;
> +
> +      while (fwd != bck)
> +	{
> +	  int extra;
> +
> +	  if (chunksize (fwd) < nb)
> +	      break;
> +	  extra = chunk_ok_for_memalign (fwd, alignment, bytes);
> +	  if (extra > 0
> +	      && (extra <= best_size || best == NULL))
> +	    {
> +	      best = fwd;
> +	      best_size = extra;
> +	    }
> +
> +	  fwd = fwd->fd;
> +	}
> +      victim = best;
> +
> +      if (victim != NULL)
> +	{
> +	  unlink_chunk (av, victim);
> +	  break;
> +	}
> +    }
> +
> +      if (victim != NULL)
> +	break;
> +    }
> +
> +  /* Strategy: find a spot within that chunk that meets the alignment
>       request, and then possibly free the leading and trailing space.
> -   */
> +     This strategy is incredibly costly and can lead to external
> +     fragmentation if header and footer chunks are unused.  */
>  
> -  /* Call malloc with worst case padding to hit alignment. */
> +  if (victim != NULL)
> +    {
> +      p = victim;
> +      m = chunk2mem (p);
> +      set_inuse (p);
> +    }
> +  else
> +    {
> +      /* Call malloc with worst case padding to hit alignment. */
>  
> -  m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
> +      m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
>  
> -  if (m == 0)
> -    return 0;           /* propagate failure */
> +      if (m == 0)
> +	return 0;           /* propagate failure */
>  
> -  p = mem2chunk (m);
> +      p = mem2chunk (m);
> +    }
>  
>    if ((((unsigned long) (m)) % alignment) != 0)   /* misaligned */
> -
> -    { /*
> -                Find an aligned spot inside chunk.  Since we need to give back
> -                leading space in a chunk of at least MINSIZE, if the first
> -                calculation places us at a spot with less than MINSIZE leader,
> -                we can move to the next aligned spot -- we've allocated enough
> -                total room so that this is always possible.
> -                 */
> +    {
> +      /* Find an aligned spot inside chunk.  Since we need to give back
> +         leading space in a chunk of at least MINSIZE, if the first
> +         calculation places us at a spot with less than MINSIZE leader,
> +         we can move to the next aligned spot -- we've allocated enough
> +         total room so that this is always possible.  */
>        brk = (char *) mem2chunk (((unsigned long) (m + alignment - 1)) &
>                                  - ((signed long) alignment));
>        if ((unsigned long) (brk - (char *) (p)) < MINSIZE)
> diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
> new file mode 100644
> index 0000000000..4996578e9f
> --- /dev/null
> +++ b/malloc/tst-memalign-2.c
> @@ -0,0 +1,155 @@
> +/* Test for memalign chunk reuse.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <errno.h>
> +#include <malloc.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <array_length.h>
> +#include <libc-pointer-arith.h>
> +#include <support/check.h>
> +
> +typedef struct TestCase {
> +  size_t size;
> +  size_t alignment;
> +  void *ptr1;
> +  void *ptr2;
> +} TestCase;
> +
> +static TestCase tcache_allocs[] = {
> +  { 24, 8, NULL, NULL },
> +  { 24, 16, NULL, NULL },
> +  { 128, 32, NULL, NULL }
> +};
> +#define TN array_length (tcache_allocs)
> +
> +static TestCase large_allocs[] = {
> +  { 23450, 64, NULL, NULL },
> +  { 23450, 64, NULL, NULL },
> +  { 23550, 64, NULL, NULL },
> +  { 23550, 64, NULL, NULL },
> +  { 23650, 64, NULL, NULL },
> +  { 23650, 64, NULL, NULL },
> +  { 33650, 64, NULL, NULL },
> +  { 33650, 64, NULL, NULL }
> +};
> +#define LN array_length (large_allocs)
> +
> +void *p;
> +
> +/* Sanity checks, ancillary to the actual test.  */
> +#define CHECK(p,a) \
> +  if (p == NULL || !PTR_IS_ALIGNED (p, a)) \
> +    FAIL_EXIT1 ("NULL or misaligned memory detected.\n");
> +
> +static int
> +do_test (void)
> +{
> +  int i, j;
> +  int count;
> +  void *ptr[10];
> +  void *p;
> +
> +  /* TCache test.  */
> +
> +  for (i = 0; i < TN; ++ i)
> +    {
> +      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
> +      CHECK (tcache_allocs[i].ptr1, tcache_allocs[i].alignment);
> +      free (tcache_allocs[i].ptr1);
> +      /* This should return the same chunk as was just free'd.  */
> +      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
> +      CHECK (tcache_allocs[i].ptr2, tcache_allocs[i].alignment);
> +      free (tcache_allocs[i].ptr2);
> +
> +      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
> +    }
> +
> +  /* Test for non-head tcache hits.  */
> +  for (i = 0; i < array_length (ptr); ++ i)
> +    {
> +      if (i == 4)
> +	{
> +	  ptr[i] = memalign (64, 256);
> +	  CHECK (ptr[i], 64);
> +	}
> +      else
> +	{
> +	  ptr[i] = malloc (256);
> +	  CHECK (ptr[i], 4);
> +	}
> +    }
> +  for (i = 0; i < array_length (ptr); ++ i)
> +    free (ptr[i]);
> +
> +  p = memalign (64, 256);
> +  CHECK (p, 64);
> +
> +  count = 0;
> +  for (i = 0; i < 10; ++ i)
> +    if (ptr[i] == p)
> +      ++ count;
> +  free (p);
> +  TEST_VERIFY (count > 0);
> +
> +  /* Large bins test.  */
> +
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
> +      CHECK (large_allocs[i].ptr1, large_allocs[i].alignment);
> +      /* Keep chunks from combining by fragmenting the heap.  */
> +      p = malloc (512);
> +      CHECK (p, 4);
> +    }
> +
> +  for (i = 0; i < LN; ++ i)
> +    free (large_allocs[i].ptr1);
> +
> +  /* Force the unsorted bins to be scanned and moved to small/large
> +     bins.  */
> +  p = malloc (60000);
> +
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
> +      CHECK (large_allocs[i].ptr2, large_allocs[i].alignment);
> +    }
> +
> +  count = 0;
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      int ok = 0;
> +      for (j = 0; j < LN; ++ j)
> +	if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
> +	  ok = 1;
> +      if (ok == 1)
> +	count ++;
> +    }
> +
> +  /* The allocation algorithm is complicated outside of the memalign
> +     logic, so just make sure it's working for most of the
> +     allocations.  This avoids possible boundary conditions with
> +     empty/full heaps.  */
> +  TEST_VERIFY (count > LN / 2);
> +
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> 

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-03-29 19:41                 ` Adhemerval Zanella Netto
@ 2023-03-29 20:36                   ` DJ Delorie
  2023-03-30 10:04                     ` Cristian Rodríguez
  2023-04-05 14:07                     ` Stefan Liebler
  0 siblings, 2 replies; 38+ messages in thread
From: DJ Delorie @ 2023-03-29 20:36 UTC (permalink / raw)
  To: Adhemerval Zanella Netto; +Cc: libc-alpha


Thanks!  Pushed.


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-03-29 20:36                   ` DJ Delorie
@ 2023-03-30 10:04                     ` Cristian Rodríguez
  2023-03-30 10:50                       ` Adhemerval Zanella Netto
  2023-04-05 14:07                     ` Stefan Liebler
  1 sibling, 1 reply; 38+ messages in thread
From: Cristian Rodríguez @ 2023-03-30 10:04 UTC (permalink / raw)
  To: DJ Delorie; +Cc: Adhemerval Zanella Netto, libc-alpha

On Wed, Mar 29, 2023 at 5:36 PM DJ Delorie via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
>
> Thanks!  Pushed.

Crashes previously working Rust code.

for example "ripgrep" (command rg)

#rg FIND_PA
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p
|| chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk
(mem2chunk (p))

Will see if anything else died ;)

[1]    2123 IOT instruction (core dumped)  rg FIND_PA

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-03-30 10:04                     ` Cristian Rodríguez
@ 2023-03-30 10:50                       ` Adhemerval Zanella Netto
  2023-03-30 21:43                         ` Cristian Rodríguez
  0 siblings, 1 reply; 38+ messages in thread
From: Adhemerval Zanella Netto @ 2023-03-30 10:50 UTC (permalink / raw)
  To: Cristian Rodríguez, DJ Delorie; +Cc: libc-alpha



On 30/03/23 07:04, Cristian Rodríguez wrote:
> On Wed, Mar 29, 2023 at 5:36 PM DJ Delorie via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>>
>> Thanks!  Pushed.
> 
> Crashes previously working Rust code.
> 
> for example "ripgrep" (command rg)
> 
> #rg FIND_PA
> Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p
> || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk
> (mem2chunk (p))
> 
> Will see if anything else died ;)
> 
> [1]    2123 IOT instruction (core dumped)  rg FIND_PA

Do you have any testcase that triggers it? 

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-03-30 10:50                       ` Adhemerval Zanella Netto
@ 2023-03-30 21:43                         ` Cristian Rodríguez
  2023-04-12 17:04                           ` Xi Ruoyao
  0 siblings, 1 reply; 38+ messages in thread
From: Cristian Rodríguez @ 2023-03-30 21:43 UTC (permalink / raw)
  To: Adhemerval Zanella Netto; +Cc: DJ Delorie, libc-alpha


On Thu, Mar 30, 2023 at 7:50 AM Adhemerval Zanella Netto <
adhemerval.zanella@linaro.org> wrote:

> Do you have any testcase that triggers it?

I'll try.  I cannot compile ripgrep with debug info either, because this
commit also makes rustc crash with the same message. 😕

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-03-29  4:20               ` [PATCH v5 " DJ Delorie
  2023-03-29 19:41                 ` Adhemerval Zanella Netto
@ 2023-03-31 15:39                 ` Adhemerval Zanella Netto
  1 sibling, 0 replies; 38+ messages in thread
From: Adhemerval Zanella Netto @ 2023-03-31 15:39 UTC (permalink / raw)
  To: DJ Delorie; +Cc: libc-alpha, Cristian Rodríguez



On 29/03/23 01:20, DJ Delorie wrote:
> From e32abda27e5c0aa82f4b736fdca35d56bf665cce Mon Sep 17 00:00:00 2001
> From: DJ Delorie via Libc-alpha <libc-alpha@sourceware.org>
> Date: Wed, 29 Mar 2023 00:18:40 -0400
> Subject: memalign: Support scanning for aligned chunks.
> 
> This patch adds a chunk scanning algorithm to the _int_memalign code
> path that reduces heap fragmentation by reusing already aligned chunks
> instead of always looking for chunks of larger sizes and splitting
> them.  The tcache macros are extended to allow removing a chunk from
> the middle of the list.
> 
> The goal is to fix the pathological use cases where heaps grow
> continuously in workloads that are heavy users of memalign.
> 
> Note that tst-memalign-2 checks for tcache operation, which
> malloc-check bypasses.

So it seems this patch does trigger a regression.  I am seeing a failure on a
speccpu2017 benchmark (cam4_s):

****************************************
Contents of cam4_s_base.gcc-64.err
****************************************
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Program received signal SIGABRT: Process abort signal.
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))
Fatal glibc error: malloc.c:3617 (_mid_memalign): assertion failed: !p || chunk_is_mmapped (mem2chunk (p)) || ar_ptr == arena_for_chunk (mem2chunk (p))

I have not yet isolated the malloc call patterns, but I would like to give you
a heads-up that this does seem to be an issue with at least one reproducer.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-03-29 20:36                   ` DJ Delorie
  2023-03-30 10:04                     ` Cristian Rodríguez
@ 2023-04-05 14:07                     ` Stefan Liebler
  2023-04-05 17:58                       ` DJ Delorie
  1 sibling, 1 reply; 38+ messages in thread
From: Stefan Liebler @ 2023-04-05 14:07 UTC (permalink / raw)
  To: libc-alpha

On 29.03.23 22:36, DJ Delorie via Libc-alpha wrote:
> 
> Thanks!  Pushed.
> 
On s390 (31bit), I see the test fail:
FAIL: malloc/tst-memalign-2-mcheck

After adding those printfs ...:
diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
index 4996578e9f..adfebf8384 100644
--- a/malloc/tst-memalign-2.c
+++ b/malloc/tst-memalign-2.c
@@ -72,10 +72,12 @@ do_test (void)
     {
       tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
       CHECK (tcache_allocs[i].ptr1, tcache_allocs[i].alignment);
+      printf ("%d# ptr1=%p\n", i, tcache_allocs[i].ptr1);
       free (tcache_allocs[i].ptr1);
       /* This should return the same chunk as was just free'd.  */
       tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
       CHECK (tcache_allocs[i].ptr2, tcache_allocs[i].alignment);
+      printf ("%d# ptr2=%p\n", i, tcache_allocs[i].ptr2);
       free (tcache_allocs[i].ptr2);

       TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);


... I've got this output:
0# ptr1=0x55bc71b8
0# ptr2=0x55bc71b8
1# ptr1=0x55bc7210
1# ptr2=0x55bc7260
error: tst-memalign-2.c:83: not true: tcache_allocs[i].ptr1 ==
tcache_allocs[i].ptr2
2# ptr1=0x55bc72e0
2# ptr2=0x55bc72e0
error: 1 test failures


malloc/tst-memalign-2 (without mcheck) is passing.
PASS: malloc/tst-memalign-2
original exit status 0
0# ptr1=0x55c0e190
0# ptr2=0x55c0e190
1# ptr1=0x55c0e190
1# ptr2=0x55c0e190
2# ptr1=0x55c0e1c0
2# ptr2=0x55c0e1c0

Can you please help?

Thanks,
Stefan

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-04-05 14:07                     ` Stefan Liebler
@ 2023-04-05 17:58                       ` DJ Delorie
  2023-04-11 11:40                         ` Stefan Liebler
  0 siblings, 1 reply; 38+ messages in thread
From: DJ Delorie @ 2023-04-05 17:58 UTC (permalink / raw)
  To: Stefan Liebler; +Cc: libc-alpha

Stefan Liebler via Libc-alpha <libc-alpha@sourceware.org> writes:
> On s390 (31bit), I see the test fail:
> FAIL: malloc/tst-memalign-2-mcheck

Please try
https://sourceware.org/pipermail/libc-alpha/2023-April/146959.html

I fixed that test there, hopefully you're seeing the same thing I saw ;-)


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-04-05 17:58                       ` DJ Delorie
@ 2023-04-11 11:40                         ` Stefan Liebler
  2023-04-12 11:23                           ` Stefan Liebler
  0 siblings, 1 reply; 38+ messages in thread
From: Stefan Liebler @ 2023-04-11 11:40 UTC (permalink / raw)
  To: DJ Delorie; +Cc: libc-alpha

On 05.04.23 19:58, DJ Delorie wrote:
> Stefan Liebler via Libc-alpha <libc-alpha@sourceware.org> writes:
>> On s390 (31bit), I see the test fail:
>> FAIL: malloc/tst-memalign-2-mcheck
> 
> Please try
> https://sourceware.org/pipermail/libc-alpha/2023-April/146959.html
> 
> I fixed that test there, hopefully you're seeing the same thing I saw ;-)
> 
Hi DJ,

I've applied your patch
"[patch v2] malloc: set NON_MAIN_ARENA flag for reclaimed memalign chunk
(BZ #30101)"
on top of
"hurd: Don't leak __hurd_reply_port0"
commit cd019ddd892e182277fadd6aedccc57fa3923c8d

Now I get these failures:
on s390x (64bit) and x86_64:
- malloc/tst-memalign-2-mcheck.out
error: tst-memalign-2.c:114: not true: count > 0
error: 1 test failures

- malloc/tst-memalign-3-mcheck.out
error: tst-memalign-3.c:89: not true: tcache_allocs[i].ptr1 ==
tcache_allocs[i].ptr2
error: tst-memalign-3.c:89: not true: tcache_allocs[i].ptr1 ==
tcache_allocs[i].ptr2
error: tst-memalign-3.c:89: not true: tcache_allocs[i].ptr1 ==
tcache_allocs[i].ptr2
error: 3 test failures


on s390 (31bit):
- malloc/tst-memalign-2-mcheck.out
error: tst-memalign-2.c:86: not true: tcache_allocs[i].ptr1 ==
tcache_allocs[i].ptr2
error: 1 test failures

- malloc/tst-memalign-3-mcheck.out
error: tst-memalign-3.c:89: not true: tcache_allocs[i].ptr1 ==
tcache_allocs[i].ptr2
error: tst-memalign-3.c:89: not true: tcache_allocs[i].ptr1 ==
tcache_allocs[i].ptr2
error: tst-memalign-3.c:117: not true: count > 0
error: 3 test failures


Do you also see these failures on x86_64?

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-04-11 11:40                         ` Stefan Liebler
@ 2023-04-12 11:23                           ` Stefan Liebler
  0 siblings, 0 replies; 38+ messages in thread
From: Stefan Liebler @ 2023-04-12 11:23 UTC (permalink / raw)
  To: libc-alpha

On 11.04.23 13:40, Stefan Liebler via Libc-alpha wrote:
> On 05.04.23 19:58, DJ Delorie wrote:
>> Stefan Liebler via Libc-alpha <libc-alpha@sourceware.org> writes:
>>> On s390 (31bit), I see the test fail:
>>> FAIL: malloc/tst-memalign-2-mcheck
>>
>> Please try
>> https://sourceware.org/pipermail/libc-alpha/2023-April/146959.html
>>
>> I fixed that test there, hopefully you're seeing the same thing I saw ;-)
>>
> Hi DJ,
> 
> I've applied your patch
> "[patch v2] malloc: set NON_MAIN_ARENA flag for reclaimed memalign chunk
> (BZ #30101)"
> on top of
> "hurd: Don't leak __hurd_reply_port0"
> commit cd019ddd892e182277fadd6aedccc57fa3923c8d
> 
> Now I get these failures:
> on s390x (64bit) and x86_64:
> - malloc/tst-memalign-2-mcheck.out
> error: tst-memalign-2.c:114: not true: count > 0
> error: 1 test failures
> 
> - malloc/tst-memalign-3-mcheck.out
> error: tst-memalign-3.c:89: not true: tcache_allocs[i].ptr1 ==
> tcache_allocs[i].ptr2
> error: tst-memalign-3.c:89: not true: tcache_allocs[i].ptr1 ==
> tcache_allocs[i].ptr2
> error: tst-memalign-3.c:89: not true: tcache_allocs[i].ptr1 ==
> tcache_allocs[i].ptr2
> error: 3 test failures
> 
> 
> on s390 (31bit):
> - malloc/tst-memalign-2-mcheck.out
> error: tst-memalign-2.c:86: not true: tcache_allocs[i].ptr1 ==
> tcache_allocs[i].ptr2
> error: 1 test failures
> 
> - malloc/tst-memalign-3-mcheck.out
> error: tst-memalign-3.c:89: not true: tcache_allocs[i].ptr1 ==
> tcache_allocs[i].ptr2
> error: tst-memalign-3.c:89: not true: tcache_allocs[i].ptr1 ==
> tcache_allocs[i].ptr2
> error: tst-memalign-3.c:117: not true: count > 0
> error: 3 test failures
> 
> 
> Do you also see these failures on x86_64?

Just for information: I also see the same failures as described above with
"[patch v3] malloc: set NON_MAIN_ARENA flag for reclaimed memalign chunk
(BZ #30101)"
https://sourceware.org/pipermail/libc-alpha/2023-April/147181.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-03-30 21:43                         ` Cristian Rodríguez
@ 2023-04-12 17:04                           ` Xi Ruoyao
  2023-04-12 17:16                             ` DJ Delorie
  2023-04-12 17:33                             ` [PATCH v5 1/1] " Adhemerval Zanella Netto
  0 siblings, 2 replies; 38+ messages in thread
From: Xi Ruoyao @ 2023-04-12 17:04 UTC (permalink / raw)
  To: Cristian Rodríguez, Adhemerval Zanella Netto; +Cc: DJ Delorie, libc-alpha

On Thu, 2023-03-30 at 18:43 -0300, Cristian Rodríguez via Libc-alpha
wrote:
> On Thu, Mar 30, 2023 at 7:50 AM Adhemerval Zanella Netto <
> adhemerval.zanella@linaro.org> wrote:
> 
> > Do you have any testcase that triggers it?
> 
> I'll try.  I cannot compile ripgrep with debug info either, because this
> commit also makes rustc crash with the same message. 😕

On LoongArch (with a GCC 12.2 but LoongArch backend patched to match GCC
trunk), this causes an almost deterministic failure of malloc/tst-
malloc-thread-fail-malloc-hugetlb2:

FAIL: malloc/tst-malloc-thread-fail-malloc-hugetlb2
original exit status 1
error: exit status 139 from child process

-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-04-12 17:04                           ` Xi Ruoyao
@ 2023-04-12 17:16                             ` DJ Delorie
  2023-04-12 17:26                               ` Xi Ruoyao
  2023-04-12 17:33                             ` [PATCH v5 1/1] " Adhemerval Zanella Netto
  1 sibling, 1 reply; 38+ messages in thread
From: DJ Delorie @ 2023-04-12 17:16 UTC (permalink / raw)
  To: Xi Ruoyao; +Cc: libc-alpha

Xi Ruoyao <xry111@xry111.site> writes:
> On LoongArch (with a GCC 12.2 but LoongArch backend patched to match GCC
> trunk), this causes an almost deterministic failure of malloc/tst-
> malloc-thread-fail-malloc-hugetlb2:

(1) What's in malloc/tst-malloc-thread-fail-malloc-hugetlb2.out ?

(2) Have you tried with "export TIMEOUTFACTOR=20" ? some tests fail
    reliably on slower systems due to timeouts.


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-04-12 17:16                             ` DJ Delorie
@ 2023-04-12 17:26                               ` Xi Ruoyao
  2023-04-13  1:52                                 ` [PATCH v6 " DJ Delorie
  0 siblings, 1 reply; 38+ messages in thread
From: Xi Ruoyao @ 2023-04-12 17:26 UTC (permalink / raw)
  To: DJ Delorie; +Cc: libc-alpha

On Wed, 2023-04-12 at 13:16 -0400, DJ Delorie wrote:
> Xi Ruoyao <xry111@xry111.site> writes:
> > On LoongArch (with a GCC 12.2 but LoongArch backend patched to match
> > GCC
> > trunk), this causes an almost deterministic failure of malloc/tst-
> > malloc-thread-fail-malloc-hugetlb2:
> 
> (1) What's in malloc/tst-malloc-thread-fail-malloc-hugetlb2.out ?
> 
> (2) Have you tried with "export TIMEOUTFACTOR=20" ? some tests fail
>     reliably on slower systems due to timeouts.

No, it's not a timeout, but a segmentation fault.  The output is just "error:
exit status 139 from child process".

I'm not an expert with ptmalloc, but there is some code like:

  arena_get (ar_ptr, bytes + alignment + MINSIZE);

  p = _int_memalign (ar_ptr, alignment, bytes);
  if (!p && ar_ptr != NULL)
    {
      LIBC_PROBE (memory_memalign_retry, 2, bytes, alignment);
      ar_ptr = arena_get_retry (ar_ptr, bytes);
      p = _int_memalign (ar_ptr, alignment, bytes);
    }

arena_get can set ar_ptr to NULL (at least when the system does not have
enough memory).

Then _int_memalign dereferences ar_ptr without any NULL check (bin_at
looks like a "fancy" dereference operation to me).

Then we test ar_ptr != NULL in the if statement.

Now it looks like a notorious "NULL check after dereferencing" pattern.
So I added a nullity check:

diff --git a/malloc/malloc.c b/malloc/malloc.c
index 0315ac5d16..ed10b6b0e3 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -5025,7 +5025,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
   mchunkptr victim;
 
   nb = checked_request2size (bytes);
-  if (nb == 0)
+  if (nb == 0 || !av)
     {
       __set_errno (ENOMEM);
       return NULL;

And it indeed fixed the test for me.  But I'm not sure whether it's the
correct solution, and I haven't run the complete test suite with the
change yet.
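
To make the pattern concrete, here is a minimal, self-contained
illustration (invented names; this is only the shape of the bug, not the
glibc code):

#include <stddef.h>

struct toy_arena { int bins[8]; };

static int
toy_scan (struct toy_arena *ar)
{
  /* The dereference happens first (compare bin_at (av, i) above)...  */
  int first = ar->bins[0];

  /* ...so by the time this check runs, a NULL ar has already crashed.  */
  if (ar != NULL)
    return first;
  return -1;
}

int
main (void)
{
  struct toy_arena a = { { 0 } };
  return toy_scan (&a);   /* toy_scan (NULL) would segfault before the check */
}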


-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-04-12 17:04                           ` Xi Ruoyao
  2023-04-12 17:16                             ` DJ Delorie
@ 2023-04-12 17:33                             ` Adhemerval Zanella Netto
  2023-04-12 17:40                               ` DJ Delorie
  1 sibling, 1 reply; 38+ messages in thread
From: Adhemerval Zanella Netto @ 2023-04-12 17:33 UTC (permalink / raw)
  To: Xi Ruoyao, Cristian Rodríguez; +Cc: DJ Delorie, libc-alpha



On 12/04/23 14:04, Xi Ruoyao wrote:
> On Thu, 2023-03-30 at 18:43 -0300, Cristian Rodríguez via Libc-alpha
> wrote:
>> On Thu, Mar 30, 2023 at 7:50 AM Adhemerval Zanella Netto <
>> adhemerval.zanella@linaro.org> wrote:
>>
>>> Do you have any testcase that triggers it?
>>
>> I'll try.. cannot compile ripgrep with debug info either ..because this
>> commit also makes rustc crash with the same message.😕
> 
> On LoongArch (with a GCC 12.2 but LoongArch backend patched to match GCC
> trunk), this causes an almost deterministic failure of malloc/tst-
> malloc-thread-fail-malloc-hugetlb2:
> 
> FAIL: malloc/tst-malloc-thread-fail-malloc-hugetlb2
> original exit status 1
> error: exit status 139 from child process
> 

I can reproduce on x86_64 as well in a non-deterministic manner (with multiple
'rm malloc/*.out && make malloc/tests -j24' runs).  The core file shows internal
metadata corruption:

$ gdb elf/ld.so core
[...]
#0  __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0)
    at pthread_kill.c:44
44            return INTERNAL_SYSCALL_ERROR_P (ret) ? INTERNAL_SYSCALL_ERRNO (ret) : 0;
(gdb) bt
#0  __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0)
    at pthread_kill.c:44
#1  0x00007fae15090017 in __pthread_kill_internal (signo=6, threadid=<optimized out>) at pthread_kill.c:78
#2  0x00007fae1503e326 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26
#3  0x00007fae150264b1 in __GI_abort () at abort.c:79
#4  0x00007fae150272cc in __libc_message (fmt=fmt@entry=0x7fae151b0632 "%s\n") at ../sysdeps/posix/libc_fatal.c:150
#5  0x00007fae1509b873 in malloc_printerr (str=str@entry=0x7fae151b3458 "free(): double free detected in tcache 2")
    at malloc.c:5847
#6  0x00007fae1509dd5b in _int_free (av=0x7fae1524cac0 <main_arena>, p=0x7fae14e00290, have_lock=0) at malloc.c:4508
#7  0x00007fae150a07bb in __GI___libc_free (mem=<optimized out>) at malloc.c:3386
#8  0x00007fae15445594 in ?? ()
#9  0x00007fae14e002a0 in ?? ()
#10 0x73afe8780d804600 in ?? ()
#11 0x00007ffec090cc68 in ?? ()
#12 0x00007fae1544622b in ?? ()
#13 0x0000000000000000 in ?? ()

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-04-12 17:33                             ` [PATCH v5 1/1] " Adhemerval Zanella Netto
@ 2023-04-12 17:40                               ` DJ Delorie
  2023-04-12 18:01                                 ` Adhemerval Zanella Netto
  0 siblings, 1 reply; 38+ messages in thread
From: DJ Delorie @ 2023-04-12 17:40 UTC (permalink / raw)
  To: Adhemerval Zanella Netto; +Cc: libc-alpha

Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> writes:
> I can reproduce on x86_64 as well in a non-deterministic manner (with multiple

Also malloc/tst-malloc-thread-fail-malloc-hugetlb2 ?


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-04-12 17:40                               ` DJ Delorie
@ 2023-04-12 18:01                                 ` Adhemerval Zanella Netto
  2023-04-13  1:57                                   ` DJ Delorie
  0 siblings, 1 reply; 38+ messages in thread
From: Adhemerval Zanella Netto @ 2023-04-12 18:01 UTC (permalink / raw)
  To: DJ Delorie; +Cc: libc-alpha



On 12/04/23 14:40, DJ Delorie wrote:
> Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> writes:
>> I can reproduce on x86_64 as well in a non-deterministic manner (with multiple
> 
> Also malloc/tst-malloc-thread-fail-malloc-hugetlb2 ?
> 

Yes, and with malloc/tst-malloc-thread-fail-malloc-hugetlb1 as well.  But
the hugetlb1 and hugetlb2 variants essentially just enable hugepages with extra
mprotect/madvise calls.  I don't think they play a role in this issue
(they won't affect any metadata settings, just the timings of the tests).

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [PATCH v6 1/1] memalign: Support scanning for aligned chunks.
  2023-04-12 17:26                               ` Xi Ruoyao
@ 2023-04-13  1:52                                 ` DJ Delorie
  2023-04-13  5:51                                   ` Xi Ruoyao
  2023-04-17 21:48                                   ` Carlos O'Donell
  0 siblings, 2 replies; 38+ messages in thread
From: DJ Delorie @ 2023-04-13  1:52 UTC (permalink / raw)
  To: Xi Ruoyao; +Cc: libc-alpha

Xi Ruoyao <xry111@xry111.site> writes:
> Then we test ar_ptr != NULL in the if statement.

I haven't reproduced the tcache failure (it might be unrelated), but this
should fix the ar_ptr case (most of the malloc.c patch just indents a
bunch of code to make it conditional).  We don't want to just fail on
ar_ptr==NULL because that prevents us from calling sysmalloc() to get
more plain heap.
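
To make the intent concrete before the patch itself, here is a rough,
self-contained sketch of the control flow (toy names only, not the real
code; the actual hunks follow below):

#include <stdlib.h>

struct toy_arena { void *aligned_chunk; };

/* Stands in for the ordinary _int_malloc/sysmalloc fallback, which
   still works with a NULL arena.  */
static void *
toy_fallback (size_t bytes)
{
  return malloc (bytes);
}

static void *
toy_memalign (struct toy_arena *av, size_t bytes)
{
  void *victim = NULL;

  /* Only scan the arena's bins when there is an arena to scan.  */
  if (av != NULL)
    victim = av->aligned_chunk;   /* stands in for the bin scan */

  if (victim != NULL)
    return victim;

  /* Otherwise fall through to the normal path instead of failing, so
     more plain heap can still be obtained.  */
  return toy_fallback (bytes);
}

int
main (void)
{
  void *p = toy_memalign (NULL, 64);   /* no arena: must not crash */
  free (p);
  return 0;
}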

From 250e31ff15d92d20e6c66689e34baeab8daa034d Mon Sep 17 00:00:00 2001
From: DJ Delorie <dj@redhat.com>
Date: Mon, 3 Apr 2023 17:33:03 -0400
Subject: malloc: set NON_MAIN_ARENA flag for reclaimed memalign chunk (BZ
 #30101)

Based on these comments in malloc.c:

   size field is or'ed with NON_MAIN_ARENA if the chunk was obtained
   from a non-main arena.  This is only set immediately before handing
   the chunk to the user, if necessary.

   The NON_MAIN_ARENA flag is never set for unsorted chunks, so it
   does not have to be taken into account in size comparisons.

When we pull a chunk off the unsorted list (or any list) we need to
make sure that flag is set properly before returning the chunk.

Use the rounded-up size for chunk_ok_for_memalign()

Do not scan the arena for reusable chunks if there's no arena.

diff --git a/malloc/Makefile b/malloc/Makefile
index f49675845e..e66247ed01 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -43,7 +43,8 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc tst-obstack \
 	 tst-tcfree1 tst-tcfree2 tst-tcfree3 \
 	 tst-safe-linking \
 	 tst-mallocalign1 \
-	 tst-memalign-2
+	 tst-memalign-2 \
+	 tst-memalign-3
 
 tests-static := \
 	 tst-interpose-static-nothread \
@@ -71,7 +72,7 @@ test-srcs = tst-mtrace
 # with MALLOC_CHECK_=3 because they expect a specific failure.
 tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
 	tst-mxfast tst-safe-linking \
-	tst-compathooks-off tst-compathooks-on tst-memalign-2
+	tst-compathooks-off tst-compathooks-on tst-memalign-2 tst-memalign-3
 
 # Run all tests with MALLOC_CHECK_=3
 tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 0315ac5d16..7afc02a759 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -5048,95 +5048,98 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
      and unlikely to meet our alignment requirements.  We have not done
      any experimentation with searching for aligned fastbins.  */
 
-  int first_bin_index;
-  int first_largebin_index;
-  int last_bin_index;
+  if (av != NULL)
+    {
+      int first_bin_index;
+      int first_largebin_index;
+      int last_bin_index;
 
-  if (in_smallbin_range (nb))
-    first_bin_index = smallbin_index (nb);
-  else
-    first_bin_index = largebin_index (nb);
+      if (in_smallbin_range (nb))
+	first_bin_index = smallbin_index (nb);
+      else
+	first_bin_index = largebin_index (nb);
 
-  if (in_smallbin_range (nb * 2))
-    last_bin_index = smallbin_index (nb * 2);
-  else
-    last_bin_index = largebin_index (nb * 2);
+      if (in_smallbin_range (nb * 2))
+	last_bin_index = smallbin_index (nb * 2);
+      else
+	last_bin_index = largebin_index (nb * 2);
 
-  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
+      first_largebin_index = largebin_index (MIN_LARGE_SIZE);
 
-  int victim_index;                 /* its bin index */
+      int victim_index;                 /* its bin index */
 
-  for (victim_index = first_bin_index;
-       victim_index < last_bin_index;
-       victim_index ++)
-    {
-      victim = NULL;
+      for (victim_index = first_bin_index;
+	   victim_index < last_bin_index;
+	   victim_index ++)
+	{
+	  victim = NULL;
 
-      if (victim_index < first_largebin_index)
-    {
-      /* Check small bins.  Small bin chunks are doubly-linked despite
-	 being the same size.  */
+	  if (victim_index < first_largebin_index)
+	    {
+	      /* Check small bins.  Small bin chunks are doubly-linked despite
+		 being the same size.  */
 
-      mchunkptr fwd;                    /* misc temp for linking */
-      mchunkptr bck;                    /* misc temp for linking */
+	      mchunkptr fwd;                    /* misc temp for linking */
+	      mchunkptr bck;                    /* misc temp for linking */
 
-      bck = bin_at (av, victim_index);
-      fwd = bck->fd;
-      while (fwd != bck)
-	{
-	  if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
-	    {
-	      victim = fwd;
+	      bck = bin_at (av, victim_index);
+	      fwd = bck->fd;
+	      while (fwd != bck)
+		{
+		  if (chunk_ok_for_memalign (fwd, alignment, nb) > 0)
+		    {
+		      victim = fwd;
 
-	      /* Unlink it */
-	      victim->fd->bk = victim->bk;
-	      victim->bk->fd = victim->fd;
-	      break;
+		      /* Unlink it */
+		      victim->fd->bk = victim->bk;
+		      victim->bk->fd = victim->fd;
+		      break;
+		    }
+
+		  fwd = fwd->fd;
+		}
 	    }
+	  else
+	    {
+	      /* Check large bins.  */
+	      mchunkptr fwd;                    /* misc temp for linking */
+	      mchunkptr bck;                    /* misc temp for linking */
+	      mchunkptr best = NULL;
+	      size_t best_size = 0;
 
-	  fwd = fwd->fd;
-	}
-    }
-  else
-    {
-      /* Check large bins.  */
-      mchunkptr fwd;                    /* misc temp for linking */
-      mchunkptr bck;                    /* misc temp for linking */
-      mchunkptr best = NULL;
-      size_t best_size = 0;
+	      bck = bin_at (av, victim_index);
+	      fwd = bck->fd;
 
-      bck = bin_at (av, victim_index);
-      fwd = bck->fd;
+	      while (fwd != bck)
+		{
+		  int extra;
 
-      while (fwd != bck)
-	{
-	  int extra;
+		  if (chunksize (fwd) < nb)
+		    break;
+		  extra = chunk_ok_for_memalign (fwd, alignment, nb);
+		  if (extra > 0
+		      && (extra <= best_size || best == NULL))
+		    {
+		      best = fwd;
+		      best_size = extra;
+		    }
 
-	  if (chunksize (fwd) < nb)
-	      break;
-	  extra = chunk_ok_for_memalign (fwd, alignment, bytes);
-	  if (extra > 0
-	      && (extra <= best_size || best == NULL))
-	    {
-	      best = fwd;
-	      best_size = extra;
-	    }
+		  fwd = fwd->fd;
+		}
+	      victim = best;
 
-	  fwd = fwd->fd;
-	}
-      victim = best;
+	      if (victim != NULL)
+		{
+		  unlink_chunk (av, victim);
+		  break;
+		}
+	    }
 
-      if (victim != NULL)
-	{
-	  unlink_chunk (av, victim);
-	  break;
+	  if (victim != NULL)
+	    break;
 	}
     }
 
-      if (victim != NULL)
-	break;
-    }
-
   /* Strategy: find a spot within that chunk that meets the alignment
      request, and then possibly free the leading and trailing space.
      This strategy is incredibly costly and can lead to external
@@ -5147,6 +5150,8 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
       p = victim;
       m = chunk2mem (p);
       set_inuse (p);
+      if (av != &main_arena)
+	set_non_main_arena (p);
     }
   else
     {
diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
index 4996578e9f..f229283dbf 100644
--- a/malloc/tst-memalign-2.c
+++ b/malloc/tst-memalign-2.c
@@ -33,9 +33,10 @@ typedef struct TestCase {
 } TestCase;
 
 static TestCase tcache_allocs[] = {
-  { 24, 8, NULL, NULL },
-  { 24, 16, NULL, NULL },
-  { 128, 32, NULL, NULL }
+  { 24, 32, NULL, NULL },
+  { 24, 64, NULL, NULL },
+  { 128, 128, NULL, NULL },
+  { 500, 128, NULL, NULL }
 };
 #define TN array_length (tcache_allocs)
 
@@ -70,11 +71,15 @@ do_test (void)
 
   for (i = 0; i < TN; ++ i)
     {
+      size_t sz2;
+
       tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
       CHECK (tcache_allocs[i].ptr1, tcache_allocs[i].alignment);
+      sz2 = malloc_usable_size (tcache_allocs[i].ptr1);
       free (tcache_allocs[i].ptr1);
+
       /* This should return the same chunk as was just free'd.  */
-      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, sz2);
       CHECK (tcache_allocs[i].ptr2, tcache_allocs[i].alignment);
       free (tcache_allocs[i].ptr2);
 
diff --git a/malloc/tst-memalign-3.c b/malloc/tst-memalign-3.c
new file mode 100644
index 0000000000..ab90d6ca9b
--- /dev/null
+++ b/malloc/tst-memalign-3.c
@@ -0,0 +1,173 @@
+/* Test for memalign chunk reuse.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <string.h>
+#include <unistd.h>
+#include <array_length.h>
+#include <libc-pointer-arith.h>
+#include <support/check.h>
+#include <support/xthread.h>
+
+
+typedef struct TestCase {
+  size_t size;
+  size_t alignment;
+  void *ptr1;
+  void *ptr2;
+} TestCase;
+
+static TestCase tcache_allocs[] = {
+  { 24, 32, NULL, NULL },
+  { 24, 64, NULL, NULL },
+  { 128, 128, NULL, NULL },
+  { 500, 128, NULL, NULL }
+};
+#define TN array_length (tcache_allocs)
+
+static TestCase large_allocs[] = {
+  { 23450, 64, NULL, NULL },
+  { 23450, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL }
+};
+#define LN array_length (large_allocs)
+
+void *p;
+
+/* Sanity checks, ancillary to the actual test.  */
+#define CHECK(p,a) \
+  if (p == NULL || !PTR_IS_ALIGNED (p, a)) \
+    FAIL_EXIT1 ("NULL or misaligned memory detected.\n");
+
+static void *
+mem_test (void *closure)
+{
+  int i;
+  int j;
+  int count;
+  void *ptr[10];
+  void *p;
+
+  /* TCache test.  */
+  for (i = 0; i < TN; ++ i)
+    {
+      size_t sz2;
+
+      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      CHECK (tcache_allocs[i].ptr1, tcache_allocs[i].alignment);
+      sz2 = malloc_usable_size (tcache_allocs[i].ptr1);
+      free (tcache_allocs[i].ptr1);
+
+      /* This should return the same chunk as was just free'd.  */
+      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, sz2);
+      CHECK (tcache_allocs[i].ptr2, tcache_allocs[i].alignment);
+      free (tcache_allocs[i].ptr2);
+
+      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
+    }
+
+  /* Test for non-head tcache hits.  */
+  for (i = 0; i < array_length (ptr); ++ i)
+    {
+      if (i == 4)
+	{
+	  ptr[i] = memalign (64, 256);
+	  CHECK (ptr[i], 64);
+	}
+      else
+	{
+	  ptr[i] = malloc (256);
+	  CHECK (ptr[i], 4);
+	}
+    }
+  for (i = 0; i < array_length (ptr); ++ i)
+    free (ptr[i]);
+
+  p = memalign (64, 256);
+  CHECK (p, 64);
+
+  count = 0;
+  for (i = 0; i < 10; ++ i)
+    if (ptr[i] == p)
+      ++ count;
+  free (p);
+  TEST_VERIFY (count > 0);
+
+  /* Large bins test.  */
+
+  for (i = 0; i < LN; ++ i)
+    {
+      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+      CHECK (large_allocs[i].ptr1, large_allocs[i].alignment);
+      /* Keep chunks from combining by fragmenting the heap.  */
+      p = malloc (512);
+      CHECK (p, 4);
+    }
+
+  for (i = 0; i < LN; ++ i)
+    free (large_allocs[i].ptr1);
+
+  /* Force the unsorted bins to be scanned and moved to small/large
+     bins.  */
+  p = malloc (60000);
+
+  for (i = 0; i < LN; ++ i)
+    {
+      large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+      CHECK (large_allocs[i].ptr2, large_allocs[i].alignment);
+    }
+
+  count = 0;
+  for (i = 0; i < LN; ++ i)
+    {
+      int ok = 0;
+      for (j = 0; j < LN; ++ j)
+	if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
+	  ok = 1;
+      if (ok == 1)
+	count ++;
+    }
+
+  /* The allocation algorithm is complicated outside of the memalign
+     logic, so just make sure it's working for most of the
+     allocations.  This avoids possible boundary conditions with
+     empty/full heaps.  */
+  TEST_VERIFY (count > LN / 2);
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  pthread_t p;
+
+  p = xpthread_create (NULL, mem_test, NULL);
+  xpthread_join (p);
+  return 0;
+}
+
+#include <support/test-driver.c>


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-04-12 18:01                                 ` Adhemerval Zanella Netto
@ 2023-04-13  1:57                                   ` DJ Delorie
  2023-04-13 10:46                                     ` Adhemerval Zanella Netto
  0 siblings, 1 reply; 38+ messages in thread
From: DJ Delorie @ 2023-04-13  1:57 UTC (permalink / raw)
  To: Adhemerval Zanella Netto; +Cc: libc-alpha

Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> writes:
> Yes, and with malloc/tst-malloc-thread-fail-malloc-hugetlb1 as well.  But
> the hugetlb1 and hugetlb2 variants essentially just enable hugepages with extra
> mprotect/madvise calls.  I don't think they play a role in this issue
> (they won't affect any metadata settings, just the timings of the tests).

I still can't reproduce this; how much free memory is on that machine?
Have you tested this particular setup without my patch?  I inspected all
the tcache code changes and didn't find anything...


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v6 1/1] memalign: Support scanning for aligned chunks.
  2023-04-13  1:52                                 ` [PATCH v6 " DJ Delorie
@ 2023-04-13  5:51                                   ` Xi Ruoyao
  2023-04-17 21:48                                   ` Carlos O'Donell
  1 sibling, 0 replies; 38+ messages in thread
From: Xi Ruoyao @ 2023-04-13  5:51 UTC (permalink / raw)
  To: DJ Delorie; +Cc: libc-alpha

On Wed, 2023-04-12 at 21:52 -0400, DJ Delorie wrote:
> Xi Ruoyao <xry111@xry111.site> writes:
> > Then we test ar_ptr != NULL in the if statement.
> 
> I haven't reproduced the tcache failure (it might be unrelated), but this
> should fix the ar_ptr case (most of the malloc.c patch just indents a
> bunch of code to make it conditional).  We don't want to just fail on
> ar_ptr==NULL because that prevents us from calling sysmalloc() to get
> more plain heap.
> 
> From 250e31ff15d92d20e6c66689e34baeab8daa034d Mon Sep 17 00:00:00 2001
> From: DJ Delorie <dj@redhat.com>
> Date: Mon, 3 Apr 2023 17:33:03 -0400
> Subject: malloc: set NON_MAIN_ARENA flag for reclaimed memalign chunk
> (BZ
>  #30101)

Works for me.

> Based on these comments in malloc.c:
> 
>    size field is or'ed with NON_MAIN_ARENA if the chunk was obtained
>    from a non-main arena.  This is only set immediately before handing
>    the chunk to the user, if necessary.
> 
>    The NON_MAIN_ARENA flag is never set for unsorted chunks, so it
>    does not have to be taken into account in size comparisons.
> 
> When we pull a chunk off the unsorted list (or any list) we need to
> make sure that flag is set properly before returning the chunk.
> 
> Use the rounded-up size for chunk_ok_for_memalign()
> 
> Do not scan the arena for reusable chunks if there's no arena.
> 
> diff --git a/malloc/Makefile b/malloc/Makefile
> index f49675845e..e66247ed01 100644
> --- a/malloc/Makefile
> +++ b/malloc/Makefile
> @@ -43,7 +43,8 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc
> tst-obstack \
>          tst-tcfree1 tst-tcfree2 tst-tcfree3 \
>          tst-safe-linking \
>          tst-mallocalign1 \
> -        tst-memalign-2
> +        tst-memalign-2 \
> +        tst-memalign-3
>  
>  tests-static := \
>          tst-interpose-static-nothread \
> @@ -71,7 +72,7 @@ test-srcs = tst-mtrace
>  # with MALLOC_CHECK_=3 because they expect a specific failure.
>  tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
>         tst-mxfast tst-safe-linking \
> -       tst-compathooks-off tst-compathooks-on tst-memalign-2
> +       tst-compathooks-off tst-compathooks-on tst-memalign-2 tst-
> memalign-3
>  
>  # Run all tests with MALLOC_CHECK_=3
>  tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 0315ac5d16..7afc02a759 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -5048,95 +5048,98 @@ _int_memalign (mstate av, size_t alignment,
> size_t bytes)
>       and unlikely to meet our alignment requirements.  We have not
> done
>       any experimentation with searching for aligned fastbins.  */
>  
> -  int first_bin_index;
> -  int first_largebin_index;
> -  int last_bin_index;
> +  if (av != NULL)
> +    {
> +      int first_bin_index;
> +      int first_largebin_index;
> +      int last_bin_index;
>  
> -  if (in_smallbin_range (nb))
> -    first_bin_index = smallbin_index (nb);
> -  else
> -    first_bin_index = largebin_index (nb);
> +      if (in_smallbin_range (nb))
> +       first_bin_index = smallbin_index (nb);
> +      else
> +       first_bin_index = largebin_index (nb);
>  
> -  if (in_smallbin_range (nb * 2))
> -    last_bin_index = smallbin_index (nb * 2);
> -  else
> -    last_bin_index = largebin_index (nb * 2);
> +      if (in_smallbin_range (nb * 2))
> +       last_bin_index = smallbin_index (nb * 2);
> +      else
> +       last_bin_index = largebin_index (nb * 2);
>  
> -  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
> +      first_largebin_index = largebin_index (MIN_LARGE_SIZE);
>  
> -  int victim_index;                 /* its bin index */
> +      int victim_index;                 /* its bin index */
>  
> -  for (victim_index = first_bin_index;
> -       victim_index < last_bin_index;
> -       victim_index ++)
> -    {
> -      victim = NULL;
> +      for (victim_index = first_bin_index;
> +          victim_index < last_bin_index;
> +          victim_index ++)
> +       {
> +         victim = NULL;
>  
> -      if (victim_index < first_largebin_index)
> -    {
> -      /* Check small bins.  Small bin chunks are doubly-linked despite
> -        being the same size.  */
> +         if (victim_index < first_largebin_index)
> +           {
> +             /* Check small bins.  Small bin chunks are doubly-linked despite
> +                being the same size.  */
>  
> -      mchunkptr fwd;                    /* misc temp for linking */
> -      mchunkptr bck;                    /* misc temp for linking */
> +             mchunkptr fwd;                    /* misc temp for linking */
> +             mchunkptr bck;                    /* misc temp for linking */
>  
> -      bck = bin_at (av, victim_index);
> -      fwd = bck->fd;
> -      while (fwd != bck)
> -       {
> -         if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
> -           {
> -             victim = fwd;
> +             bck = bin_at (av, victim_index);
> +             fwd = bck->fd;
> +             while (fwd != bck)
> +               {
> +                 if (chunk_ok_for_memalign (fwd, alignment, nb) > 0)
> +                   {
> +                     victim = fwd;
>  
> -             /* Unlink it */
> -             victim->fd->bk = victim->bk;
> -             victim->bk->fd = victim->fd;
> -             break;
> +                     /* Unlink it */
> +                     victim->fd->bk = victim->bk;
> +                     victim->bk->fd = victim->fd;
> +                     break;
> +                   }
> +
> +                 fwd = fwd->fd;
> +               }
>             }
> +         else
> +           {
> +             /* Check large bins.  */
> +             mchunkptr fwd;                    /* misc temp for linking */
> +             mchunkptr bck;                    /* misc temp for linking */
> +             mchunkptr best = NULL;
> +             size_t best_size = 0;
>  
> -         fwd = fwd->fd;
> -       }
> -    }
> -  else
> -    {
> -      /* Check large bins.  */
> -      mchunkptr fwd;                    /* misc temp for linking */
> -      mchunkptr bck;                    /* misc temp for linking */
> -      mchunkptr best = NULL;
> -      size_t best_size = 0;
> +             bck = bin_at (av, victim_index);
> +             fwd = bck->fd;
>  
> -      bck = bin_at (av, victim_index);
> -      fwd = bck->fd;
> +             while (fwd != bck)
> +               {
> +                 int extra;
>  
> -      while (fwd != bck)
> -       {
> -         int extra;
> +                 if (chunksize (fwd) < nb)
> +                   break;
> +                 extra = chunk_ok_for_memalign (fwd, alignment, nb);
> +                 if (extra > 0
> +                     && (extra <= best_size || best == NULL))
> +                   {
> +                     best = fwd;
> +                     best_size = extra;
> +                   }
>  
> -         if (chunksize (fwd) < nb)
> -             break;
> -         extra = chunk_ok_for_memalign (fwd, alignment, bytes);
> -         if (extra > 0
> -             && (extra <= best_size || best == NULL))
> -           {
> -             best = fwd;
> -             best_size = extra;
> -           }
> +                 fwd = fwd->fd;
> +               }
> +             victim = best;
>  
> -         fwd = fwd->fd;
> -       }
> -      victim = best;
> +             if (victim != NULL)
> +               {
> +                 unlink_chunk (av, victim);
> +                 break;
> +               }
> +           }
>  
> -      if (victim != NULL)
> -       {
> -         unlink_chunk (av, victim);
> -         break;
> +         if (victim != NULL)
> +           break;
>         }
>      }
>  
> -      if (victim != NULL)
> -       break;
> -    }
> -
>    /* Strategy: find a spot within that chunk that meets the alignment
>       request, and then possibly free the leading and trailing space.
>       This strategy is incredibly costly and can lead to external
> @@ -5147,6 +5150,8 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
>        p = victim;
>        m = chunk2mem (p);
>        set_inuse (p);
> +      if (av != &main_arena)
> +       set_non_main_arena (p);
>      }
>    else
>      {
> diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
> index 4996578e9f..f229283dbf 100644
> --- a/malloc/tst-memalign-2.c
> +++ b/malloc/tst-memalign-2.c
> @@ -33,9 +33,10 @@ typedef struct TestCase {
>  } TestCase;
>  
>  static TestCase tcache_allocs[] = {
> -  { 24, 8, NULL, NULL },
> -  { 24, 16, NULL, NULL },
> -  { 128, 32, NULL, NULL }
> +  { 24, 32, NULL, NULL },
> +  { 24, 64, NULL, NULL },
> +  { 128, 128, NULL, NULL },
> +  { 500, 128, NULL, NULL }
>  };
>  #define TN array_length (tcache_allocs)
>  
> @@ -70,11 +71,15 @@ do_test (void)
>  
>    for (i = 0; i < TN; ++ i)
>      {
> +      size_t sz2;
> +
>        tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
>        CHECK (tcache_allocs[i].ptr1, tcache_allocs[i].alignment);
> +      sz2 = malloc_usable_size (tcache_allocs[i].ptr1);
>        free (tcache_allocs[i].ptr1);
> +
>        /* This should return the same chunk as was just free'd.  */
> -      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
> +      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, sz2);
>        CHECK (tcache_allocs[i].ptr2, tcache_allocs[i].alignment);
>        free (tcache_allocs[i].ptr2);
>  
> diff --git a/malloc/tst-memalign-3.c b/malloc/tst-memalign-3.c
> new file mode 100644
> index 0000000000..ab90d6ca9b
> --- /dev/null
> +++ b/malloc/tst-memalign-3.c
> @@ -0,0 +1,173 @@
> +/* Test for memalign chunk reuse.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <errno.h>
> +#include <malloc.h>
> +#include <stdio.h>
> +#include <pthread.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <array_length.h>
> +#include <libc-pointer-arith.h>
> +#include <support/check.h>
> +#include <support/xthread.h>
> +
> +
> +typedef struct TestCase {
> +  size_t size;
> +  size_t alignment;
> +  void *ptr1;
> +  void *ptr2;
> +} TestCase;
> +
> +static TestCase tcache_allocs[] = {
> +  { 24, 32, NULL, NULL },
> +  { 24, 64, NULL, NULL },
> +  { 128, 128, NULL, NULL },
> +  { 500, 128, NULL, NULL }
> +};
> +#define TN array_length (tcache_allocs)
> +
> +static TestCase large_allocs[] = {
> +  { 23450, 64, NULL, NULL },
> +  { 23450, 64, NULL, NULL },
> +  { 23550, 64, NULL, NULL },
> +  { 23550, 64, NULL, NULL },
> +  { 23650, 64, NULL, NULL },
> +  { 23650, 64, NULL, NULL },
> +  { 33650, 64, NULL, NULL },
> +  { 33650, 64, NULL, NULL }
> +};
> +#define LN array_length (large_allocs)
> +
> +void *p;
> +
> +/* Sanity checks, ancillary to the actual test.  */
> +#define CHECK(p,a) \
> +  if (p == NULL || !PTR_IS_ALIGNED (p, a)) \
> +    FAIL_EXIT1 ("NULL or misaligned memory detected.\n");
> +
> +static void *
> +mem_test (void *closure)
> +{
> +  int i;
> +  int j;
> +  int count;
> +  void *ptr[10];
> +  void *p;
> +
> +  /* TCache test.  */
> +  for (i = 0; i < TN; ++ i)
> +    {
> +      size_t sz2;
> +
> +      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
> +      CHECK (tcache_allocs[i].ptr1, tcache_allocs[i].alignment);
> +      sz2 = malloc_usable_size (tcache_allocs[i].ptr1);
> +      free (tcache_allocs[i].ptr1);
> +
> +      /* This should return the same chunk as was just free'd.  */
> +      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, sz2);
> +      CHECK (tcache_allocs[i].ptr2, tcache_allocs[i].alignment);
> +      free (tcache_allocs[i].ptr2);
> +
> +      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
> +    }
> +
> +  /* Test for non-head tcache hits.  */
> +  for (i = 0; i < array_length (ptr); ++ i)
> +    {
> +      if (i == 4)
> +       {
> +         ptr[i] = memalign (64, 256);
> +         CHECK (ptr[i], 64);
> +       }
> +      else
> +       {
> +         ptr[i] = malloc (256);
> +         CHECK (ptr[i], 4);
> +       }
> +    }
> +  for (i = 0; i < array_length (ptr); ++ i)
> +    free (ptr[i]);
> +
> +  p = memalign (64, 256);
> +  CHECK (p, 64);
> +
> +  count = 0;
> +  for (i = 0; i < 10; ++ i)
> +    if (ptr[i] == p)
> +      ++ count;
> +  free (p);
> +  TEST_VERIFY (count > 0);
> +
> +  /* Large bins test.  */
> +
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
> +      CHECK (large_allocs[i].ptr1, large_allocs[i].alignment);
> +      /* Keep chunks from combining by fragmenting the heap.  */
> +      p = malloc (512);
> +      CHECK (p, 4);
> +    }
> +
> +  for (i = 0; i < LN; ++ i)
> +    free (large_allocs[i].ptr1);
> +
> +  /* Force the unsorted bins to be scanned and moved to small/large
> +     bins.  */
> +  p = malloc (60000);
> +
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
> +      CHECK (large_allocs[i].ptr2, large_allocs[i].alignment);
> +    }
> +
> +  count = 0;
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      int ok = 0;
> +      for (j = 0; j < LN; ++ j)
> +       if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
> +         ok = 1;
> +      if (ok == 1)
> +       count ++;
> +    }
> +
> +  /* The allocation algorithm is complicated outside of the memalign
> +     logic, so just make sure it's working for most of the
> +     allocations.  This avoids possible boundary conditions with
> +     empty/full heaps.  */
> +  TEST_VERIFY (count > LN / 2);
> +
> +  return 0;
> +}
> +
> +static int
> +do_test (void)
> +{
> +  pthread_t p;
> +
> +  p = xpthread_create (NULL, mem_test, NULL);
> +  xpthread_join (p);
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> 

-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v5 1/1] memalign: Support scanning for aligned chunks.
  2023-04-13  1:57                                   ` DJ Delorie
@ 2023-04-13 10:46                                     ` Adhemerval Zanella Netto
  0 siblings, 0 replies; 38+ messages in thread
From: Adhemerval Zanella Netto @ 2023-04-13 10:46 UTC (permalink / raw)
  To: DJ Delorie; +Cc: libc-alpha



On 12/04/23 22:57, DJ Delorie wrote:
> Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> writes:
>> Yes, and with malloc/tst-malloc-thread-fail-malloc-hugetlb1 as well. But
>> the hugetlb1 and hugetlb2 just essentially enable hugepages with extra 
>> mprotect/madvise calls.  I don't think they play a role in this issue
>> (they won't affect any metadata setting, just the timings on the tests).
> 
> I still can't reproduce this; how much free memory is on that machine?

About 64GB:

$ free
               total        used        free      shared  buff/cache   available
Mem:        65760296     6847324    41000244       17540    17912728    58182972
Swap:        8388604           0     8388604

> Have you tested this particular setup without my patch?  I inspected all
> the tcache code changes and didn't find anything...
> 

I will check this patch, thanks.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v6 1/1] memalign: Support scanning for aligned chunks.
  2023-04-13  1:52                                 ` [PATCH v6 " DJ Delorie
  2023-04-13  5:51                                   ` Xi Ruoyao
@ 2023-04-17 21:48                                   ` Carlos O'Donell
  2023-04-18  1:25                                     ` [PATCH v7] " DJ Delorie
  1 sibling, 1 reply; 38+ messages in thread
From: Carlos O'Donell @ 2023-04-17 21:48 UTC (permalink / raw)
  To: DJ Delorie, Xi Ruoyao; +Cc: libc-alpha

On 4/12/23 21:52, DJ Delorie via Libc-alpha wrote:
> Xi Ruoyao <xry111@xry111.site> writes:
>> Then we test ar_ptr != NULL in the if statement.
> 
> I haven't reproduced the tcache fail (it might be unrelated) but this
> should fix the ar_ptr case (most of the malloc.c patch just indents a
> bunch of code, to make it conditional).  We don't want to just fail on
> ar_ptr==NULL because that prevents us from calling sysmalloc() to get
> more plain heap.
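
As a minimal stand-alone illustration of that point (editorial sketch using
only the public memalign interface, not part of the patch): even the very
first aligned request in a process finds nothing in the bins to reuse, so an
empty scan (or a missing arena) must fall through to the normal allocation
path that can grow the heap, rather than failing outright.

  #include <malloc.h>
  #include <stdio.h>
  #include <stdlib.h>

  int
  main (void)
  {
    /* Fresh process: the bins are empty, so the aligned-chunk scan
       cannot find a victim.  The request must still succeed, which is
       why the scan falls through to the regular allocation path
       instead of returning NULL.  */
    void *p = memalign (64, 1024);
    if (p == NULL)
      return 1;
    printf ("aligned allocation at %p\n", p);
    free (p);
    return 0;
  }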

DJ and I sat down to review v6 in detail.

We need a v7 for three reasons:

(a) chunk_ok_for_memalign() is commented as taking *user* bytes but you pass
    chunksize in this change and the comment needs adjusting. You might as
    well change `size_t bytes` to `size_t nb` to be consistent.

(b) If we are now passing chunksize then the second line of the function which
    computes size, should use, not memsize(p), but chunksize(p), to correctly
    account for the header bytes (otherwise we're conservative).

(c) We need to exclude the test from mcheck runs via tests-exclude-mcheck since
    the expected chunk results won't work when mcheck is in effect.

	FAIL: malloc/tst-memalign-2-mcheck
	FAIL: malloc/tst-memalign-3-mcheck
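
A minimal sketch of the distinction behind (a) and (b), using only public
interfaces (illustrative only, not the patch itself): the helper is now handed
a chunk size, i.e. the request rounded up plus the chunk header, so it must
compare against chunksize(p); comparing against memsize(p), which is roughly
what malloc_usable_size reports, undercounts by the header and makes
otherwise-usable chunks look too small.

  #include <malloc.h>
  #include <stdio.h>
  #include <stdlib.h>

  int
  main (void)
  {
    size_t request = 24;
    void *p = malloc (request);
    if (p == NULL)
      return 1;

    /* Usable bytes correspond roughly to memsize(p); the chunk backing
       this allocation is larger by the header overhead, which is what
       chunksize(p) accounts for.  */
    size_t usable = malloc_usable_size (p);
    printf ("requested %zu bytes, usable %zu bytes\n", request, usable);

    free (p);
    return 0;
  }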

> From 250e31ff15d92d20e6c66689e34baeab8daa034d Mon Sep 17 00:00:00 2001
> From: DJ Delorie <dj@redhat.com>
> Date: Mon, 3 Apr 2023 17:33:03 -0400
> Subject: malloc: set NON_MAIN_ARENA flag for reclaimed memalign chunk (BZ
>  #30101)
> 
> Based on these comments in malloc.c:
> 
>    size field is or'ed with NON_MAIN_ARENA if the chunk was obtained
>    from a non-main arena.  This is only set immediately before handing
>    the chunk to the user, if necessary.
> 
>    The NON_MAIN_ARENA flag is never set for unsorted chunks, so it
>    does not have to be taken into account in size comparisons.
> 
> When we pull a chunk off the unsorted list (or any list) we need to
> make sure that flag is set properly before returning the chunk.
> 
> Use the rounded-up size for chunk_ok_for_memalign()
> 
> Do not scan the arena for reusable chunks if there's no arena.
> 
> diff --git a/malloc/Makefile b/malloc/Makefile
> index f49675845e..e66247ed01 100644
> --- a/malloc/Makefile
> +++ b/malloc/Makefile
> @@ -43,7 +43,8 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc tst-obstack \
>  	 tst-tcfree1 tst-tcfree2 tst-tcfree3 \
>  	 tst-safe-linking \
>  	 tst-mallocalign1 \
> -	 tst-memalign-2
> +	 tst-memalign-2 \
> +	 tst-memalign-3

OK. Adds a new test.

>  
>  tests-static := \
>  	 tst-interpose-static-nothread \
> @@ -71,7 +72,7 @@ test-srcs = tst-mtrace
>  # with MALLOC_CHECK_=3 because they expect a specific failure.
>  tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
>  	tst-mxfast tst-safe-linking \
> -	tst-compathooks-off tst-compathooks-on tst-memalign-2
> +	tst-compathooks-off tst-compathooks-on tst-memalign-2 tst-memalign-3

OK. Adds a new test.

>  
>  # Run all tests with MALLOC_CHECK_=3
>  tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 0315ac5d16..7afc02a759 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -5048,95 +5048,98 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
>       and unlikely to meet our alignment requirements.  We have not done
>       any experimentation with searching for aligned fastbins.  */
>  
> -  int first_bin_index;
> -  int first_largebin_index;
> -  int last_bin_index;
> +  if (av != NULL)

OK. If we don't have an arena, then fall through (discovered issue).

> +    {
> +      int first_bin_index;
> +      int first_largebin_index;
> +      int last_bin_index;
>  
> -  if (in_smallbin_range (nb))
> -    first_bin_index = smallbin_index (nb);
> -  else
> -    first_bin_index = largebin_index (nb);
> +      if (in_smallbin_range (nb))
> +	first_bin_index = smallbin_index (nb);
> +      else
> +	first_bin_index = largebin_index (nb);

OK.

>  
> -  if (in_smallbin_range (nb * 2))
> -    last_bin_index = smallbin_index (nb * 2);
> -  else
> -    last_bin_index = largebin_index (nb * 2);
> +      if (in_smallbin_range (nb * 2))
> +	last_bin_index = smallbin_index (nb * 2);
> +      else
> +	last_bin_index = largebin_index (nb * 2);

OK.

>  
> -  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
> +      first_largebin_index = largebin_index (MIN_LARGE_SIZE);
>  
> -  int victim_index;                 /* its bin index */
> +      int victim_index;                 /* its bin index */
>  
> -  for (victim_index = first_bin_index;
> -       victim_index < last_bin_index;
> -       victim_index ++)
> -    {
> -      victim = NULL;
> +      for (victim_index = first_bin_index;
> +	   victim_index < last_bin_index;
> +	   victim_index ++)
> +	{
> +	  victim = NULL;
>  
> -      if (victim_index < first_largebin_index)
> -    {
> -      /* Check small bins.  Small bin chunks are doubly-linked despite
> -	 being the same size.  */
> +	  if (victim_index < first_largebin_index)
> +	    {
> +	      /* Check small bins.  Small bin chunks are doubly-linked despite
> +		 being the same size.  */
>  
> -      mchunkptr fwd;                    /* misc temp for linking */
> -      mchunkptr bck;                    /* misc temp for linking */
> +	      mchunkptr fwd;                    /* misc temp for linking */
> +	      mchunkptr bck;                    /* misc temp for linking */
>  
> -      bck = bin_at (av, victim_index);
> -      fwd = bck->fd;
> -      while (fwd != bck)
> -	{
> -	  if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
> -	    {
> -	      victim = fwd;
> +	      bck = bin_at (av, victim_index);
> +	      fwd = bck->fd;
> +	      while (fwd != bck)
> +		{
> +		  if (chunk_ok_for_memalign (fwd, alignment, nb) > 0)

OK, but needs a follow-up change.

The function chunk_ok_for_memalign() needs to be adjusted to take chunksize.

> +		    {
> +		      victim = fwd;
>  
> -	      /* Unlink it */
> -	      victim->fd->bk = victim->bk;
> -	      victim->bk->fd = victim->fd;
> -	      break;
> +		      /* Unlink it */
> +		      victim->fd->bk = victim->bk;
> +		      victim->bk->fd = victim->fd;
> +		      break;
> +		    }
> +
> +		  fwd = fwd->fd;
> +		}
>  	    }
> +	  else
> +	    {
> +	      /* Check large bins.  */
> +	      mchunkptr fwd;                    /* misc temp for linking */
> +	      mchunkptr bck;                    /* misc temp for linking */
> +	      mchunkptr best = NULL;
> +	      size_t best_size = 0;
>  
> -	  fwd = fwd->fd;
> -	}
> -    }
> -  else
> -    {
> -      /* Check large bins.  */
> -      mchunkptr fwd;                    /* misc temp for linking */
> -      mchunkptr bck;                    /* misc temp for linking */
> -      mchunkptr best = NULL;
> -      size_t best_size = 0;
> +	      bck = bin_at (av, victim_index);
> +	      fwd = bck->fd;
>  
> -      bck = bin_at (av, victim_index);
> -      fwd = bck->fd;
> +	      while (fwd != bck)
> +		{
> +		  int extra;
>  
> -      while (fwd != bck)
> -	{
> -	  int extra;
> +		  if (chunksize (fwd) < nb)
> +		    break;
> +		  extra = chunk_ok_for_memalign (fwd, alignment, nb);

OK.

> +		  if (extra > 0
> +		      && (extra <= best_size || best == NULL))
> +		    {
> +		      best = fwd;
> +		      best_size = extra;
> +		    }
>  
> -	  if (chunksize (fwd) < nb)
> -	      break;
> -	  extra = chunk_ok_for_memalign (fwd, alignment, bytes);
> -	  if (extra > 0
> -	      && (extra <= best_size || best == NULL))
> -	    {
> -	      best = fwd;
> -	      best_size = extra;
> -	    }
> +		  fwd = fwd->fd;
> +		}
> +	      victim = best;
>  
> -	  fwd = fwd->fd;
> -	}
> -      victim = best;
> +	      if (victim != NULL)
> +		{
> +		  unlink_chunk (av, victim);
> +		  break;
> +		}
> +	    }
>  
> -      if (victim != NULL)
> -	{
> -	  unlink_chunk (av, victim);
> -	  break;
> +	  if (victim != NULL)
> +	    break;
>  	}
>      }
>  
> -      if (victim != NULL)
> -	break;
> -    }

OK.

> -
>    /* Strategy: find a spot within that chunk that meets the alignment
>       request, and then possibly free the leading and trailing space.
>       This strategy is incredibly costly and can lead to external
> @@ -5147,6 +5150,8 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
>        p = victim;
>        m = chunk2mem (p);
>        set_inuse (p);
> +      if (av != &main_arena)
> +	set_non_main_arena (p);

OK. Set the non-main-arena bit as expected (the original bug).

>      }
>    else
>      {
> diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
> index 4996578e9f..f229283dbf 100644
> --- a/malloc/tst-memalign-2.c
> +++ b/malloc/tst-memalign-2.c
> @@ -33,9 +33,10 @@ typedef struct TestCase {
>  } TestCase;
>  
>  static TestCase tcache_allocs[] = {
> -  { 24, 8, NULL, NULL },
> -  { 24, 16, NULL, NULL },
> -  { 128, 32, NULL, NULL }
> +  { 24, 32, NULL, NULL },
> +  { 24, 64, NULL, NULL },
> +  { 128, 128, NULL, NULL },
> +  { 500, 128, NULL, NULL }

OK. Raised alignments for testing.

>  };
>  #define TN array_length (tcache_allocs)
>  
> @@ -70,11 +71,15 @@ do_test (void)
>  
>    for (i = 0; i < TN; ++ i)
>      {
> +      size_t sz2;
> +
>        tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
>        CHECK (tcache_allocs[i].ptr1, tcache_allocs[i].alignment);
> +      sz2 = malloc_usable_size (tcache_allocs[i].ptr1);
>        free (tcache_allocs[i].ptr1);
> +
>        /* This should return the same chunk as was just free'd.  */
> -      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
> +      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, sz2);
>        CHECK (tcache_allocs[i].ptr2, tcache_allocs[i].alignment);
>        free (tcache_allocs[i].ptr2);
>  
> diff --git a/malloc/tst-memalign-3.c b/malloc/tst-memalign-3.c
> new file mode 100644
> index 0000000000..ab90d6ca9b
> --- /dev/null
> +++ b/malloc/tst-memalign-3.c
> @@ -0,0 +1,173 @@
> +/* Test for memalign chunk reuse.

OK. Heuristic test for reuse of aligned chunks.

> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <errno.h>
> +#include <malloc.h>
> +#include <stdio.h>
> +#include <pthread.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <array_length.h>
> +#include <libc-pointer-arith.h>
> +#include <support/check.h>
> +#include <support/xthread.h>
> +
> +
> +typedef struct TestCase {
> +  size_t size;
> +  size_t alignment;
> +  void *ptr1;
> +  void *ptr2;
> +} TestCase;
> +
> +static TestCase tcache_allocs[] = {
> +  { 24, 32, NULL, NULL },
> +  { 24, 64, NULL, NULL },
> +  { 128, 128, NULL, NULL },
> +  { 500, 128, NULL, NULL }

OK.

> +};
> +#define TN array_length (tcache_allocs)
> +
> +static TestCase large_allocs[] = {
> +  { 23450, 64, NULL, NULL },
> +  { 23450, 64, NULL, NULL },
> +  { 23550, 64, NULL, NULL },
> +  { 23550, 64, NULL, NULL },
> +  { 23650, 64, NULL, NULL },
> +  { 23650, 64, NULL, NULL },
> +  { 33650, 64, NULL, NULL },
> +  { 33650, 64, NULL, NULL }

OK.

> +};
> +#define LN array_length (large_allocs)
> +
> +void *p;
> +
> +/* Sanity checks, ancillary to the actual test.  */
> +#define CHECK(p,a) \
> +  if (p == NULL || !PTR_IS_ALIGNED (p, a)) \
> +    FAIL_EXIT1 ("NULL or misaligned memory detected.\n");
> +
> +static void *
> +mem_test (void *closure)
> +{
> +  int i;
> +  int j;
> +  int count;
> +  void *ptr[10];
> +  void *p;
> +
> +  /* TCache test.  */
> +  for (i = 0; i < TN; ++ i)
> +    {
> +      size_t sz2;
> +
> +      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
> +      CHECK (tcache_allocs[i].ptr1, tcache_allocs[i].alignment);
> +      sz2 = malloc_usable_size (tcache_allocs[i].ptr1);
> +      free (tcache_allocs[i].ptr1);
> +
> +      /* This should return the same chunk as was just free'd.  */
> +      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, sz2);
> +      CHECK (tcache_allocs[i].ptr2, tcache_allocs[i].alignment);
> +      free (tcache_allocs[i].ptr2);
> +
> +      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);

Works only for non-mcheck case.

> +    }
> +
> +  /* Test for non-head tcache hits.  */
> +  for (i = 0; i < array_length (ptr); ++ i)
> +    {
> +      if (i == 4)
> +	{
> +	  ptr[i] = memalign (64, 256);
> +	  CHECK (ptr[i], 64);
> +	}
> +      else
> +	{
> +	  ptr[i] = malloc (256);
> +	  CHECK (ptr[i], 4);
> +	}
> +    }
> +  for (i = 0; i < array_length (ptr); ++ i)
> +    free (ptr[i]);
> +
> +  p = memalign (64, 256);
> +  CHECK (p, 64);
> +
> +  count = 0;
> +  for (i = 0; i < 10; ++ i)
> +    if (ptr[i] == p)
> +      ++ count;
> +  free (p);
> +  TEST_VERIFY (count > 0);
> +
> +  /* Large bins test.  */
> +
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
> +      CHECK (large_allocs[i].ptr1, large_allocs[i].alignment);
> +      /* Keep chunks from combining by fragmenting the heap.  */
> +      p = malloc (512);
> +      CHECK (p, 4);
> +    }
> +
> +  for (i = 0; i < LN; ++ i)
> +    free (large_allocs[i].ptr1);
> +
> +  /* Force the unsorted bins to be scanned and moved to small/large
> +     bins.  */
> +  p = malloc (60000);
> +
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
> +      CHECK (large_allocs[i].ptr2, large_allocs[i].alignment);
> +    }
> +
> +  count = 0;
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      int ok = 0;
> +      for (j = 0; j < LN; ++ j)
> +	if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
> +	  ok = 1;
> +      if (ok == 1)
> +	count ++;
> +    }
> +
> +  /* The allocation algorithm is complicated outside of the memalign
> +     logic, so just make sure it's working for most of the
> +     allocations.  This avoids possible boundary conditions with
> +     empty/full heaps.  */
> +  TEST_VERIFY (count > LN / 2);
> +
> +  return 0;
> +}
> +
> +static int
> +do_test (void)
> +{
> +  pthread_t p;
> +
> +  p = xpthread_create (NULL, mem_test, NULL);
> +  xpthread_join (p);
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> 

-- 
Cheers,
Carlos.


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [PATCH v7] memalign: Support scanning for aligned chunks.
  2023-04-17 21:48                                   ` Carlos O'Donell
@ 2023-04-18  1:25                                     ` DJ Delorie
  2023-04-18 13:58                                       ` Carlos O'Donell
  0 siblings, 1 reply; 38+ messages in thread
From: DJ Delorie @ 2023-04-18  1:25 UTC (permalink / raw)
  To: Carlos O'Donell; +Cc: libc-alpha

"Carlos O'Donell" <carlos@redhat.com> writes:
> DJ and I sat down to review v6 in detail.
>
> We need a v7 for three reasons:
>
> (a) chunk_ok_for_memalign() is commented as taking *user* bytes but you pass
>     chunksize in this change and the comment needs adjusting. You might as
>     well change `size_t bytes` to `size_t nb` to be consistent.
>
> (b) If we are now passing chunksize then the second line of the function which
>     computes size, should use, not memsize(p), but chunksize(p), to correctly
>     account for the header bytes (otherwise we're conservative).
>
> (c) We need to exclude the test from mcheck runs via tests-exclude-mcheck since
>     the expected chunk results won't work when mcheck is in effect.
>
> 	FAIL: malloc/tst-memalign-2-mcheck
> 	FAIL: malloc/tst-memalign-3-mcheck

Adjusted as requested.  As noted previously, a large chunk of the
malloc.c diff is merely indentation change (with the exception of the
calls to chunk_ok_for_memalign).

From fee79d5ab6d385817ea88e5097254a9559c35878 Mon Sep 17 00:00:00 2001
From: DJ Delorie <dj@redhat.com>
Date: Mon, 3 Apr 2023 17:33:03 -0400
Subject: malloc: set NON_MAIN_ARENA flag for reclaimed memalign chunk (BZ #30101)

Based on these comments in malloc.c:

   size field is or'ed with NON_MAIN_ARENA if the chunk was obtained
   from a non-main arena.  This is only set immediately before handing
   the chunk to the user, if necessary.

   The NON_MAIN_ARENA flag is never set for unsorted chunks, so it
   does not have to be taken into account in size comparisons.

When we pull a chunk off the unsorted list (or any list) we need to
make sure that flag is set properly before returning the chunk.

Use the rounded-up size for chunk_ok_for_memalign()

Do not scan the arena for reusable chunks if there's no arena.

Account for chunk overhead when determining if a chunk is a reuse
candidate.

mcheck interferes with memalign, so skip mcheck variants of
memalign tests.

diff --git a/malloc/Makefile b/malloc/Makefile
index f49675845e..071dfdb9d8 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -43,7 +43,8 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc tst-obstack \
 	 tst-tcfree1 tst-tcfree2 tst-tcfree3 \
 	 tst-safe-linking \
 	 tst-mallocalign1 \
-	 tst-memalign-2
+	 tst-memalign-2 \
+	 tst-memalign-3
 
 tests-static := \
 	 tst-interpose-static-nothread \
@@ -71,7 +72,7 @@ test-srcs = tst-mtrace
 # with MALLOC_CHECK_=3 because they expect a specific failure.
 tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
 	tst-mxfast tst-safe-linking \
-	tst-compathooks-off tst-compathooks-on tst-memalign-2
+	tst-compathooks-off tst-compathooks-on tst-memalign-2 tst-memalign-3
 
 # Run all tests with MALLOC_CHECK_=3
 tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
@@ -116,6 +117,8 @@ tests-exclude-mcheck = tst-mallocstate \
 	tst-malloc-usable-tunables \
 	tst-malloc_info \
 	tst-compathooks-off tst-compathooks-on \
+	tst-memalign-2 \
+	tst-memalign-3 \
 	tst-mxfast
 
 tests-mcheck = $(filter-out $(tests-exclude-mcheck) $(tests-static), $(tests))
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 0315ac5d16..e33ed665db 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -4974,13 +4974,13 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
 
 /* Returns 0 if the chunk is not and does not contain the requested
    aligned sub-chunk, else returns the amount of "waste" from
-   trimming.  BYTES is the *user* byte size, not the chunk byte
+   trimming.  NB is the *chunk* byte size, not the user byte
    size.  */
 static size_t
-chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
+chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t nb)
 {
   void *m = chunk2mem (p);
-  INTERNAL_SIZE_T size = memsize (p);
+  INTERNAL_SIZE_T size = chunksize (p);
   void *aligned_m = m;
 
   if (__glibc_unlikely (misaligned_chunk (p)))
@@ -4997,12 +4997,12 @@ chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
   /* If it's a perfect fit, it's an exception to the return value rule
      (we would return zero waste, which looks like "not usable"), so
      handle it here by returning a small non-zero value instead.  */
-  if (size == bytes && front_extra == 0)
+  if (size == nb && front_extra == 0)
     return 1;
 
   /* If the block we need fits in the chunk, calculate total waste.  */
-  if (size > bytes + front_extra)
-    return size - bytes;
+  if (size > nb + front_extra)
+    return size - nb;
 
   /* Can't use this chunk.  */
   return 0;
@@ -5048,95 +5048,98 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
      and unlikely to meet our alignment requirements.  We have not done
      any experimentation with searching for aligned fastbins.  */
 
-  int first_bin_index;
-  int first_largebin_index;
-  int last_bin_index;
+  if (av != NULL)
+    {
+      int first_bin_index;
+      int first_largebin_index;
+      int last_bin_index;
 
-  if (in_smallbin_range (nb))
-    first_bin_index = smallbin_index (nb);
-  else
-    first_bin_index = largebin_index (nb);
+      if (in_smallbin_range (nb))
+	first_bin_index = smallbin_index (nb);
+      else
+	first_bin_index = largebin_index (nb);
 
-  if (in_smallbin_range (nb * 2))
-    last_bin_index = smallbin_index (nb * 2);
-  else
-    last_bin_index = largebin_index (nb * 2);
+      if (in_smallbin_range (nb * 2))
+	last_bin_index = smallbin_index (nb * 2);
+      else
+	last_bin_index = largebin_index (nb * 2);
 
-  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
+      first_largebin_index = largebin_index (MIN_LARGE_SIZE);
 
-  int victim_index;                 /* its bin index */
+      int victim_index;                 /* its bin index */
 
-  for (victim_index = first_bin_index;
-       victim_index < last_bin_index;
-       victim_index ++)
-    {
-      victim = NULL;
+      for (victim_index = first_bin_index;
+	   victim_index < last_bin_index;
+	   victim_index ++)
+	{
+	  victim = NULL;
 
-      if (victim_index < first_largebin_index)
-    {
-      /* Check small bins.  Small bin chunks are doubly-linked despite
-	 being the same size.  */
+	  if (victim_index < first_largebin_index)
+	    {
+	      /* Check small bins.  Small bin chunks are doubly-linked despite
+		 being the same size.  */
 
-      mchunkptr fwd;                    /* misc temp for linking */
-      mchunkptr bck;                    /* misc temp for linking */
+	      mchunkptr fwd;                    /* misc temp for linking */
+	      mchunkptr bck;                    /* misc temp for linking */
 
-      bck = bin_at (av, victim_index);
-      fwd = bck->fd;
-      while (fwd != bck)
-	{
-	  if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
-	    {
-	      victim = fwd;
+	      bck = bin_at (av, victim_index);
+	      fwd = bck->fd;
+	      while (fwd != bck)
+		{
+		  if (chunk_ok_for_memalign (fwd, alignment, nb) > 0)
+		    {
+		      victim = fwd;
 
-	      /* Unlink it */
-	      victim->fd->bk = victim->bk;
-	      victim->bk->fd = victim->fd;
-	      break;
+		      /* Unlink it */
+		      victim->fd->bk = victim->bk;
+		      victim->bk->fd = victim->fd;
+		      break;
+		    }
+
+		  fwd = fwd->fd;
+		}
 	    }
+	  else
+	    {
+	      /* Check large bins.  */
+	      mchunkptr fwd;                    /* misc temp for linking */
+	      mchunkptr bck;                    /* misc temp for linking */
+	      mchunkptr best = NULL;
+	      size_t best_size = 0;
 
-	  fwd = fwd->fd;
-	}
-    }
-  else
-    {
-      /* Check large bins.  */
-      mchunkptr fwd;                    /* misc temp for linking */
-      mchunkptr bck;                    /* misc temp for linking */
-      mchunkptr best = NULL;
-      size_t best_size = 0;
+	      bck = bin_at (av, victim_index);
+	      fwd = bck->fd;
 
-      bck = bin_at (av, victim_index);
-      fwd = bck->fd;
+	      while (fwd != bck)
+		{
+		  int extra;
 
-      while (fwd != bck)
-	{
-	  int extra;
+		  if (chunksize (fwd) < nb)
+		    break;
+		  extra = chunk_ok_for_memalign (fwd, alignment, nb);
+		  if (extra > 0
+		      && (extra <= best_size || best == NULL))
+		    {
+		      best = fwd;
+		      best_size = extra;
+		    }
 
-	  if (chunksize (fwd) < nb)
-	      break;
-	  extra = chunk_ok_for_memalign (fwd, alignment, bytes);
-	  if (extra > 0
-	      && (extra <= best_size || best == NULL))
-	    {
-	      best = fwd;
-	      best_size = extra;
-	    }
+		  fwd = fwd->fd;
+		}
+	      victim = best;
 
-	  fwd = fwd->fd;
-	}
-      victim = best;
+	      if (victim != NULL)
+		{
+		  unlink_chunk (av, victim);
+		  break;
+		}
+	    }
 
-      if (victim != NULL)
-	{
-	  unlink_chunk (av, victim);
-	  break;
+	  if (victim != NULL)
+	    break;
 	}
     }
 
-      if (victim != NULL)
-	break;
-    }
-
   /* Strategy: find a spot within that chunk that meets the alignment
      request, and then possibly free the leading and trailing space.
      This strategy is incredibly costly and can lead to external
@@ -5147,6 +5150,8 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
       p = victim;
       m = chunk2mem (p);
       set_inuse (p);
+      if (av != &main_arena)
+	set_non_main_arena (p);
     }
   else
     {
diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
index 4996578e9f..f229283dbf 100644
--- a/malloc/tst-memalign-2.c
+++ b/malloc/tst-memalign-2.c
@@ -33,9 +33,10 @@ typedef struct TestCase {
 } TestCase;
 
 static TestCase tcache_allocs[] = {
-  { 24, 8, NULL, NULL },
-  { 24, 16, NULL, NULL },
-  { 128, 32, NULL, NULL }
+  { 24, 32, NULL, NULL },
+  { 24, 64, NULL, NULL },
+  { 128, 128, NULL, NULL },
+  { 500, 128, NULL, NULL }
 };
 #define TN array_length (tcache_allocs)
 
@@ -70,11 +71,15 @@ do_test (void)
 
   for (i = 0; i < TN; ++ i)
     {
+      size_t sz2;
+
       tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
       CHECK (tcache_allocs[i].ptr1, tcache_allocs[i].alignment);
+      sz2 = malloc_usable_size (tcache_allocs[i].ptr1);
       free (tcache_allocs[i].ptr1);
+
       /* This should return the same chunk as was just free'd.  */
-      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, sz2);
       CHECK (tcache_allocs[i].ptr2, tcache_allocs[i].alignment);
       free (tcache_allocs[i].ptr2);
 
diff --git a/malloc/tst-memalign-3.c b/malloc/tst-memalign-3.c
new file mode 100644
index 0000000000..ab90d6ca9b
--- /dev/null
+++ b/malloc/tst-memalign-3.c
@@ -0,0 +1,173 @@
+/* Test for memalign chunk reuse.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <string.h>
+#include <unistd.h>
+#include <array_length.h>
+#include <libc-pointer-arith.h>
+#include <support/check.h>
+#include <support/xthread.h>
+
+
+typedef struct TestCase {
+  size_t size;
+  size_t alignment;
+  void *ptr1;
+  void *ptr2;
+} TestCase;
+
+static TestCase tcache_allocs[] = {
+  { 24, 32, NULL, NULL },
+  { 24, 64, NULL, NULL },
+  { 128, 128, NULL, NULL },
+  { 500, 128, NULL, NULL }
+};
+#define TN array_length (tcache_allocs)
+
+static TestCase large_allocs[] = {
+  { 23450, 64, NULL, NULL },
+  { 23450, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL }
+};
+#define LN array_length (large_allocs)
+
+void *p;
+
+/* Sanity checks, ancillary to the actual test.  */
+#define CHECK(p,a) \
+  if (p == NULL || !PTR_IS_ALIGNED (p, a)) \
+    FAIL_EXIT1 ("NULL or misaligned memory detected.\n");
+
+static void *
+mem_test (void *closure)
+{
+  int i;
+  int j;
+  int count;
+  void *ptr[10];
+  void *p;
+
+  /* TCache test.  */
+  for (i = 0; i < TN; ++ i)
+    {
+      size_t sz2;
+
+      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      CHECK (tcache_allocs[i].ptr1, tcache_allocs[i].alignment);
+      sz2 = malloc_usable_size (tcache_allocs[i].ptr1);
+      free (tcache_allocs[i].ptr1);
+
+      /* This should return the same chunk as was just free'd.  */
+      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, sz2);
+      CHECK (tcache_allocs[i].ptr2, tcache_allocs[i].alignment);
+      free (tcache_allocs[i].ptr2);
+
+      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
+    }
+
+  /* Test for non-head tcache hits.  */
+  for (i = 0; i < array_length (ptr); ++ i)
+    {
+      if (i == 4)
+	{
+	  ptr[i] = memalign (64, 256);
+	  CHECK (ptr[i], 64);
+	}
+      else
+	{
+	  ptr[i] = malloc (256);
+	  CHECK (ptr[i], 4);
+	}
+    }
+  for (i = 0; i < array_length (ptr); ++ i)
+    free (ptr[i]);
+
+  p = memalign (64, 256);
+  CHECK (p, 64);
+
+  count = 0;
+  for (i = 0; i < 10; ++ i)
+    if (ptr[i] == p)
+      ++ count;
+  free (p);
+  TEST_VERIFY (count > 0);
+
+  /* Large bins test.  */
+
+  for (i = 0; i < LN; ++ i)
+    {
+      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+      CHECK (large_allocs[i].ptr1, large_allocs[i].alignment);
+      /* Keep chunks from combining by fragmenting the heap.  */
+      p = malloc (512);
+      CHECK (p, 4);
+    }
+
+  for (i = 0; i < LN; ++ i)
+    free (large_allocs[i].ptr1);
+
+  /* Force the unsorted bins to be scanned and moved to small/large
+     bins.  */
+  p = malloc (60000);
+
+  for (i = 0; i < LN; ++ i)
+    {
+      large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+      CHECK (large_allocs[i].ptr2, large_allocs[i].alignment);
+    }
+
+  count = 0;
+  for (i = 0; i < LN; ++ i)
+    {
+      int ok = 0;
+      for (j = 0; j < LN; ++ j)
+	if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
+	  ok = 1;
+      if (ok == 1)
+	count ++;
+    }
+
+  /* The allocation algorithm is complicated outside of the memalign
+     logic, so just make sure it's working for most of the
+     allocations.  This avoids possible boundary conditions with
+     empty/full heaps.  */
+  TEST_VERIFY (count > LN / 2);
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  pthread_t p;
+
+  p = xpthread_create (NULL, mem_test, NULL);
+  xpthread_join (p);
+  return 0;
+}
+
+#include <support/test-driver.c>


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v7] memalign: Support scanning for aligned chunks.
  2023-04-18  1:25                                     ` [PATCH v7] " DJ Delorie
@ 2023-04-18 13:58                                       ` Carlos O'Donell
  2023-04-18 15:02                                         ` DJ Delorie
  0 siblings, 1 reply; 38+ messages in thread
From: Carlos O'Donell @ 2023-04-18 13:58 UTC (permalink / raw)
  To: DJ Delorie; +Cc: libc-alpha

On 4/17/23 21:25, DJ Delorie wrote:
> "Carlos O'Donell" <carlos@redhat.com> writes:
>> DJ and I sat down to review v6 in detail.
>>
>> We need a v7 for three reasons:
>>
>> (a) chunk_ok_for_memalign() is commented as taking *user* bytes but you pass
>>     chunksize in this change and the comment needs adjusting. You might as
>>     well change `size_t bytes` to `size_t nb` to be consistent.
>>
>> (b) If we are now passing chunksize then the second line of the function which
>>     computes size, should use, not memsize(p), but chunksize(p), to correctly
>>     account for the header bytes (otherwise we're conservative).
>>
>> (c) We need to exclude the test from mcheck runs via tests-exclude-mcheck since
>>     the expected chunk results won't work when mcheck is in effect.
>>
>> 	FAIL: malloc/tst-memalign-2-mcheck
>> 	FAIL: malloc/tst-memalign-3-mcheck
> 
> Adjusted as requested.  As noted previously, a large chunk of the
> malloc.c diff is merely indentation change (with the exception of the
> calls to chunk_ok_for_memalign).
> 
> From fee79d5ab6d385817ea88e5097254a9559c35878 Mon Sep 17 00:00:00 2001
> From: DJ Delorie <dj@redhat.com>
> Date: Mon, 3 Apr 2023 17:33:03 -0400
> Subject: malloc: set NON_MAIN_ARENA flag for reclaimed memalign chunk (BZ #30101)
> 
> Based on these comments in malloc.c:
> 
>    size field is or'ed with NON_MAIN_ARENA if the chunk was obtained
>    from a non-main arena.  This is only set immediately before handing
>    the chunk to the user, if necessary.
> 
>    The NON_MAIN_ARENA flag is never set for unsorted chunks, so it
>    does not have to be taken into account in size comparisons.
> 
> When we pull a chunk off the unsorted list (or any list) we need to
> make sure that flag is set properly before returning the chunk.
> 
> Use the rounded-up size for chunk_ok_for_memalign()
> 
> Do not scan the arena for reusable chunks if there's no arena.
> 
> Account for chunk overhead when determining if a chunk is a reuse
> candidate.
> 
> mcheck interferes with memalign, so skip mcheck variants of
> memalign tests.

v7 LGTM and addresses all previous issues.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>

> 
> diff --git a/malloc/Makefile b/malloc/Makefile
> index f49675845e..071dfdb9d8 100644
> --- a/malloc/Makefile
> +++ b/malloc/Makefile
> @@ -43,7 +43,8 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc tst-obstack \
>  	 tst-tcfree1 tst-tcfree2 tst-tcfree3 \
>  	 tst-safe-linking \
>  	 tst-mallocalign1 \
> -	 tst-memalign-2
> +	 tst-memalign-2 \
> +	 tst-memalign-3

OK.

>  
>  tests-static := \
>  	 tst-interpose-static-nothread \
> @@ -71,7 +72,7 @@ test-srcs = tst-mtrace
>  # with MALLOC_CHECK_=3 because they expect a specific failure.
>  tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
>  	tst-mxfast tst-safe-linking \
> -	tst-compathooks-off tst-compathooks-on tst-memalign-2
> +	tst-compathooks-off tst-compathooks-on tst-memalign-2 tst-memalign-3

OK.

>  
>  # Run all tests with MALLOC_CHECK_=3
>  tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
> @@ -116,6 +117,8 @@ tests-exclude-mcheck = tst-mallocstate \
>  	tst-malloc-usable-tunables \
>  	tst-malloc_info \
>  	tst-compathooks-off tst-compathooks-on \
> +	tst-memalign-2 \
> +	tst-memalign-3 \

OK.

>  	tst-mxfast
>  
>  tests-mcheck = $(filter-out $(tests-exclude-mcheck) $(tests-static), $(tests))
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 0315ac5d16..e33ed665db 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -4974,13 +4974,13 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
>  
>  /* Returns 0 if the chunk is not and does not contain the requested
>     aligned sub-chunk, else returns the amount of "waste" from
> -   trimming.  BYTES is the *user* byte size, not the chunk byte
> +   trimming.  NB is the *chunk* byte size, not the user byte

OK. Corrects the comment to use chunksize as required for the back-chunk size.

>     size.  */
>  static size_t
> -chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
> +chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t nb)
>  {
>    void *m = chunk2mem (p);
> -  INTERNAL_SIZE_T size = memsize (p);
> +  INTERNAL_SIZE_T size = chunksize (p);

OK. Use the correct tight boundary e.g. chunksize.

>    void *aligned_m = m;
>  
>    if (__glibc_unlikely (misaligned_chunk (p)))
> @@ -4997,12 +4997,12 @@ chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
>    /* If it's a perfect fit, it's an exception to the return value rule
>       (we would return zero waste, which looks like "not usable"), so
>       handle it here by returning a small non-zero value instead.  */
> -  if (size == bytes && front_extra == 0)
> +  if (size == nb && front_extra == 0)

OK.

>      return 1;
>  
>    /* If the block we need fits in the chunk, calculate total waste.  */
> -  if (size > bytes + front_extra)
> -    return size - bytes;
> +  if (size > nb + front_extra)
> +    return size - nb;

OK.

>  
>    /* Can't use this chunk.  */
>    return 0;
> @@ -5048,95 +5048,98 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
>       and unlikely to meet our alignment requirements.  We have not done
>       any experimentation with searching for aligned fastbins.  */
>  
> -  int first_bin_index;
> -  int first_largebin_index;
> -  int last_bin_index;
> +  if (av != NULL)
> +    {
> +      int first_bin_index;
> +      int first_largebin_index;
> +      int last_bin_index;
>  
> -  if (in_smallbin_range (nb))
> -    first_bin_index = smallbin_index (nb);
> -  else
> -    first_bin_index = largebin_index (nb);
> +      if (in_smallbin_range (nb))
> +	first_bin_index = smallbin_index (nb);
> +      else
> +	first_bin_index = largebin_index (nb);
>  
> -  if (in_smallbin_range (nb * 2))
> -    last_bin_index = smallbin_index (nb * 2);
> -  else
> -    last_bin_index = largebin_index (nb * 2);
> +      if (in_smallbin_range (nb * 2))
> +	last_bin_index = smallbin_index (nb * 2);
> +      else
> +	last_bin_index = largebin_index (nb * 2);
>  
> -  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
> +      first_largebin_index = largebin_index (MIN_LARGE_SIZE);
>  
> -  int victim_index;                 /* its bin index */
> +      int victim_index;                 /* its bin index */
>  
> -  for (victim_index = first_bin_index;
> -       victim_index < last_bin_index;
> -       victim_index ++)
> -    {
> -      victim = NULL;
> +      for (victim_index = first_bin_index;
> +	   victim_index < last_bin_index;
> +	   victim_index ++)
> +	{
> +	  victim = NULL;
>  
> -      if (victim_index < first_largebin_index)
> -    {
> -      /* Check small bins.  Small bin chunks are doubly-linked despite
> -	 being the same size.  */
> +	  if (victim_index < first_largebin_index)
> +	    {
> +	      /* Check small bins.  Small bin chunks are doubly-linked despite
> +		 being the same size.  */
>  
> -      mchunkptr fwd;                    /* misc temp for linking */
> -      mchunkptr bck;                    /* misc temp for linking */
> +	      mchunkptr fwd;                    /* misc temp for linking */
> +	      mchunkptr bck;                    /* misc temp for linking */
>  
> -      bck = bin_at (av, victim_index);
> -      fwd = bck->fd;
> -      while (fwd != bck)
> -	{
> -	  if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
> -	    {
> -	      victim = fwd;
> +	      bck = bin_at (av, victim_index);
> +	      fwd = bck->fd;
> +	      while (fwd != bck)
> +		{
> +		  if (chunk_ok_for_memalign (fwd, alignment, nb) > 0)

OK. Use nb.

> +		    {
> +		      victim = fwd;
>  
> -	      /* Unlink it */
> -	      victim->fd->bk = victim->bk;
> -	      victim->bk->fd = victim->fd;
> -	      break;
> +		      /* Unlink it */
> +		      victim->fd->bk = victim->bk;
> +		      victim->bk->fd = victim->fd;
> +		      break;
> +		    }
> +
> +		  fwd = fwd->fd;
> +		}
>  	    }
> +	  else
> +	    {
> +	      /* Check large bins.  */
> +	      mchunkptr fwd;                    /* misc temp for linking */
> +	      mchunkptr bck;                    /* misc temp for linking */
> +	      mchunkptr best = NULL;
> +	      size_t best_size = 0;
>  
> -	  fwd = fwd->fd;
> -	}
> -    }
> -  else
> -    {
> -      /* Check large bins.  */
> -      mchunkptr fwd;                    /* misc temp for linking */
> -      mchunkptr bck;                    /* misc temp for linking */
> -      mchunkptr best = NULL;
> -      size_t best_size = 0;
> +	      bck = bin_at (av, victim_index);
> +	      fwd = bck->fd;
>  
> -      bck = bin_at (av, victim_index);
> -      fwd = bck->fd;
> +	      while (fwd != bck)
> +		{
> +		  int extra;
>  
> -      while (fwd != bck)
> -	{
> -	  int extra;
> +		  if (chunksize (fwd) < nb)
> +		    break;
> +		  extra = chunk_ok_for_memalign (fwd, alignment, nb);

OK. Use nb.

> +		  if (extra > 0
> +		      && (extra <= best_size || best == NULL))
> +		    {
> +		      best = fwd;
> +		      best_size = extra;
> +		    }
>  
> -	  if (chunksize (fwd) < nb)
> -	      break;
> -	  extra = chunk_ok_for_memalign (fwd, alignment, bytes);
> -	  if (extra > 0
> -	      && (extra <= best_size || best == NULL))
> -	    {
> -	      best = fwd;
> -	      best_size = extra;
> -	    }
> +		  fwd = fwd->fd;
> +		}
> +	      victim = best;
>  
> -	  fwd = fwd->fd;
> -	}
> -      victim = best;
> +	      if (victim != NULL)
> +		{
> +		  unlink_chunk (av, victim);
> +		  break;
> +		}
> +	    }
>  
> -      if (victim != NULL)
> -	{
> -	  unlink_chunk (av, victim);
> -	  break;
> +	  if (victim != NULL)
> +	    break;
>  	}
>      }
>  
> -      if (victim != NULL)
> -	break;
> -    }
> -
>    /* Strategy: find a spot within that chunk that meets the alignment
>       request, and then possibly free the leading and trailing space.
>       This strategy is incredibly costly and can lead to external
> @@ -5147,6 +5150,8 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
>        p = victim;
>        m = chunk2mem (p);
>        set_inuse (p);
> +      if (av != &main_arena)
> +	set_non_main_arena (p);

OK. Set M bit.

>      }
>    else
>      {
> diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
> index 4996578e9f..f229283dbf 100644
> --- a/malloc/tst-memalign-2.c
> +++ b/malloc/tst-memalign-2.c
> @@ -33,9 +33,10 @@ typedef struct TestCase {
>  } TestCase;
>  
>  static TestCase tcache_allocs[] = {
> -  { 24, 8, NULL, NULL },
> -  { 24, 16, NULL, NULL },
> -  { 128, 32, NULL, NULL }
> +  { 24, 32, NULL, NULL },
> +  { 24, 64, NULL, NULL },
> +  { 128, 128, NULL, NULL },
> +  { 500, 128, NULL, NULL }

OK. Use higher alignments.

>  };
>  #define TN array_length (tcache_allocs)
>  
> @@ -70,11 +71,15 @@ do_test (void)
>  
>    for (i = 0; i < TN; ++ i)
>      {
> +      size_t sz2;
> +
>        tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
>        CHECK (tcache_allocs[i].ptr1, tcache_allocs[i].alignment);
> +      sz2 = malloc_usable_size (tcache_allocs[i].ptr1);
>        free (tcache_allocs[i].ptr1);
> +
>        /* This should return the same chunk as was just free'd.  */
> -      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
> +      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, sz2);
>        CHECK (tcache_allocs[i].ptr2, tcache_allocs[i].alignment);
>        free (tcache_allocs[i].ptr2);
>  
> diff --git a/malloc/tst-memalign-3.c b/malloc/tst-memalign-3.c
> new file mode 100644
> index 0000000000..ab90d6ca9b
> --- /dev/null
> +++ b/malloc/tst-memalign-3.c
> @@ -0,0 +1,173 @@
> +/* Test for memalign chunk reuse.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <errno.h>
> +#include <malloc.h>
> +#include <stdio.h>
> +#include <pthread.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <array_length.h>
> +#include <libc-pointer-arith.h>
> +#include <support/check.h>
> +#include <support/xthread.h>
> +
> +
> +typedef struct TestCase {
> +  size_t size;
> +  size_t alignment;
> +  void *ptr1;
> +  void *ptr2;
> +} TestCase;
> +
> +static TestCase tcache_allocs[] = {
> +  { 24, 32, NULL, NULL },
> +  { 24, 64, NULL, NULL },
> +  { 128, 128, NULL, NULL },
> +  { 500, 128, NULL, NULL }
> +};
> +#define TN array_length (tcache_allocs)
> +
> +static TestCase large_allocs[] = {
> +  { 23450, 64, NULL, NULL },
> +  { 23450, 64, NULL, NULL },
> +  { 23550, 64, NULL, NULL },
> +  { 23550, 64, NULL, NULL },
> +  { 23650, 64, NULL, NULL },
> +  { 23650, 64, NULL, NULL },
> +  { 33650, 64, NULL, NULL },
> +  { 33650, 64, NULL, NULL }
> +};
> +#define LN array_length (large_allocs)
> +
> +void *p;
> +
> +/* Sanity checks, ancillary to the actual test.  */
> +#define CHECK(p,a) \
> +  if (p == NULL || !PTR_IS_ALIGNED (p, a)) \
> +    FAIL_EXIT1 ("NULL or misaligned memory detected.\n");
> +
> +static void *
> +mem_test (void *closure)
> +{
> +  int i;
> +  int j;
> +  int count;
> +  void *ptr[10];
> +  void *p;
> +
> +  /* TCache test.  */
> +  for (i = 0; i < TN; ++ i)
> +    {
> +      size_t sz2;
> +
> +      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
> +      CHECK (tcache_allocs[i].ptr1, tcache_allocs[i].alignment);
> +      sz2 = malloc_usable_size (tcache_allocs[i].ptr1);
> +      free (tcache_allocs[i].ptr1);
> +
> +      /* This should return the same chunk as was just free'd.  */
> +      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, sz2);
> +      CHECK (tcache_allocs[i].ptr2, tcache_allocs[i].alignment);
> +      free (tcache_allocs[i].ptr2);
> +
> +      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
> +    }
> +
> +  /* Test for non-head tcache hits.  */
> +  for (i = 0; i < array_length (ptr); ++ i)
> +    {
> +      if (i == 4)
> +	{
> +	  ptr[i] = memalign (64, 256);
> +	  CHECK (ptr[i], 64);
> +	}
> +      else
> +	{
> +	  ptr[i] = malloc (256);
> +	  CHECK (ptr[i], 4);
> +	}
> +    }
> +  for (i = 0; i < array_length (ptr); ++ i)
> +    free (ptr[i]);
> +
> +  p = memalign (64, 256);
> +  CHECK (p, 64);
> +
> +  count = 0;
> +  for (i = 0; i < 10; ++ i)
> +    if (ptr[i] == p)
> +      ++ count;
> +  free (p);
> +  TEST_VERIFY (count > 0);
> +
> +  /* Large bins test.  */
> +
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
> +      CHECK (large_allocs[i].ptr1, large_allocs[i].alignment);
> +      /* Keep chunks from combining by fragmenting the heap.  */
> +      p = malloc (512);
> +      CHECK (p, 4);
> +    }
> +
> +  for (i = 0; i < LN; ++ i)
> +    free (large_allocs[i].ptr1);
> +
> +  /* Force the unsorted bins to be scanned and moved to small/large
> +     bins.  */
> +  p = malloc (60000);
> +
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
> +      CHECK (large_allocs[i].ptr2, large_allocs[i].alignment);
> +    }
> +
> +  count = 0;
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      int ok = 0;
> +      for (j = 0; j < LN; ++ j)
> +	if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
> +	  ok = 1;
> +      if (ok == 1)
> +	count ++;
> +    }
> +
> +  /* The allocation algorithm is complicated outside of the memalign
> +     logic, so just make sure it's working for most of the
> +     allocations.  This avoids possible boundary conditions with
> +     empty/full heaps.  */
> +  TEST_VERIFY (count > LN / 2);
> +
> +  return 0;
> +}
> +
> +static int
> +do_test (void)
> +{
> +  pthread_t p;
> +
> +  p = xpthread_create (NULL, mem_test, NULL);
> +  xpthread_join (p);
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> 

-- 
Cheers,
Carlos.



* Re: [PATCH v7] memalign: Support scanning for aligned chunks.
  2023-04-18 13:58                                       ` Carlos O'Donell
@ 2023-04-18 15:02                                         ` DJ Delorie
  0 siblings, 0 replies; 38+ messages in thread
From: DJ Delorie @ 2023-04-18 15:02 UTC (permalink / raw)
  To: Carlos O'Donell; +Cc: libc-alpha


Thanks!  Pushed.  WHEW.



end of thread, other threads:[~2023-04-18 15:02 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
2022-07-14  3:58 [PATCH v1 1/1] memalign: Support scanning for aligned chunks DJ Delorie
2022-07-19  2:54 ` Carlos O'Donell
2022-07-19  3:57   ` [PATCH v2 " DJ Delorie
2022-07-19  9:19     ` Florian Weimer
2022-07-19 17:32       ` DJ Delorie
2022-07-20  0:32       ` [PATCH v3 " DJ Delorie
2022-07-22 20:21         ` DJ Delorie
2022-07-22 20:28         ` Joseph Myers
2022-07-28 19:50           ` [PATCH v4 " DJ Delorie
2022-08-17 19:00             ` DJ Delorie
2022-11-10 21:40               ` Ping^2: " DJ Delorie
2023-03-20 21:49                 ` Ping^3: " DJ Delorie
2023-03-28 19:07             ` Adhemerval Zanella Netto
2023-03-29  4:20               ` [PATCH v5 " DJ Delorie
2023-03-29 19:41                 ` Adhemerval Zanella Netto
2023-03-29 20:36                   ` DJ Delorie
2023-03-30 10:04                     ` Cristian Rodríguez
2023-03-30 10:50                       ` Adhemerval Zanella Netto
2023-03-30 21:43                         ` Cristian Rodríguez
2023-04-12 17:04                           ` Xi Ruoyao
2023-04-12 17:16                             ` DJ Delorie
2023-04-12 17:26                               ` Xi Ruoyao
2023-04-13  1:52                                 ` [PATCH v6 " DJ Delorie
2023-04-13  5:51                                   ` Xi Ruoyao
2023-04-17 21:48                                   ` Carlos O'Donell
2023-04-18  1:25                                     ` [PATCH v7] " DJ Delorie
2023-04-18 13:58                                       ` Carlos O'Donell
2023-04-18 15:02                                         ` DJ Delorie
2023-04-12 17:33                             ` [PATCH v5 1/1] " Adhemerval Zanella Netto
2023-04-12 17:40                               ` DJ Delorie
2023-04-12 18:01                                 ` Adhemerval Zanella Netto
2023-04-13  1:57                                   ` DJ Delorie
2023-04-13 10:46                                     ` Adhemerval Zanella Netto
2023-04-05 14:07                     ` Stefan Liebler
2023-04-05 17:58                       ` DJ Delorie
2023-04-11 11:40                         ` Stefan Liebler
2023-04-12 11:23                           ` Stefan Liebler
2023-03-31 15:39                 ` Adhemerval Zanella Netto
