public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH v4 0/6] Use introsort for qsort
@ 2023-07-11 19:07 Adhemerval Zanella
  2023-07-11 19:07 ` [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305) Adhemerval Zanella
                   ` (5 more replies)
  0 siblings, 6 replies; 24+ messages in thread
From: Adhemerval Zanella @ 2023-07-11 19:07 UTC (permalink / raw)
  To: libc-alpha

The main motivation to use introsort is to make it fully AS-Safe and
AC-Safe, with a limited stack size requirement, to remove the use
of malloc (which is troublesome since it seems some programs do longjmp
from the comparison callback), and keep the worst-case scenario
bounded to O(n*ln(n)) (instead of potentially quadratic as for
quicksort).

The implementation does not aim to be the state-of-the-art sort
algorithm, rather I used a well understood introsort (used on
libstdc++, for instance) and leverage the current quicksort
implementation along with a heapsort one from Linux kernel.

Performance-wise, the introsort does fall short compared to
mergesort [1].  I have not added the benchmark because I think it
should not be the focus of this patch.  I have added some simple
optimizations, also used by the Linux kernel heapsort.

[1] https://sourceware.org/pipermail/libc-alpha/2021-September/130698.html

Adhemerval Zanella (6):
  stdlib: Optimization qsort{_r} swap implementation (BZ 19305)
  stdlib: Move insertion sort out qsort
  stdlib: qsort: Move some macros to inline function
  stdlib: Implement introsort with qsort
  stdlib: Remove use of mergesort on qsort (BZ 21719)
  stdlib: Add more qsort{_r} coverage

 manual/argp.texi    |   2 +-
 manual/locale.texi  |   3 +-
 manual/search.texi  |   7 +-
 stdlib/Makefile     |   3 +-
 stdlib/msort.c      | 309 ----------------------------------------
 stdlib/qsort.c      | 338 ++++++++++++++++++++++++++++++++++----------
 stdlib/tst-qsort3.c | 298 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 566 insertions(+), 394 deletions(-)
 delete mode 100644 stdlib/msort.c
 create mode 100644 stdlib/tst-qsort3.c

-- 
2.34.1


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305)
  2023-07-11 19:07 [PATCH v4 0/6] Use introsort for qsort Adhemerval Zanella
@ 2023-07-11 19:07 ` Adhemerval Zanella
  2023-07-11 23:40   ` Noah Goldstein
  2023-07-12 21:04   ` Paul Eggert
  2023-07-11 19:07 ` [PATCH v4 2/6] stdlib: Move insertion sort out qsort Adhemerval Zanella
                   ` (4 subsequent siblings)
  5 siblings, 2 replies; 24+ messages in thread
From: Adhemerval Zanella @ 2023-07-11 19:07 UTC (permalink / raw)
  To: libc-alpha

It optimizes by taking into consideration that the most common element
sizes are either 32 or 64 bits and that inputs are aligned to the word
boundary.  This is similar to the optimization done on lib/sort.c
from Linux.

This patch adds an optimized swap operation on qsort based on the
previous msort one.  Instead of byte operations, three variants are provided:

  1. Using uint32_t loads and stores.
  2. Using uint64_t loads and stores.
  3. Generic one with a temporary buffer and memcpy/mempcpy.

The 1. and 2. options are selected only if base pointer is aligned
to required type.

It also fixes BZ#19305 by checking the input size against a number of
elements of 1 in addition to 0.

Checked on x86_64-linux-gnu.
---
 stdlib/qsort.c | 113 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 95 insertions(+), 18 deletions(-)

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 728a0ed370..8a3331fdb4 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -23,20 +23,89 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdbool.h>
+#include <libc-pointer-arith.h>
 
-/* Byte-wise swap two items of size SIZE. */
-#define SWAP(a, b, size)						      \
-  do									      \
-    {									      \
-      size_t __size = (size);						      \
-      char *__a = (a), *__b = (b);					      \
-      do								      \
-	{								      \
-	  char __tmp = *__a;						      \
-	  *__a++ = *__b;						      \
-	  *__b++ = __tmp;						      \
-	} while (--__size > 0);						      \
-    } while (0)
+/* Swap SIZE bytes between addresses A and B.  These helpers are provided
+   along the generic one as an optimization.  */
+
+typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
+
+#define SWAP_WORDS_64 (swap_func_t)0
+#define SWAP_WORDS_32 (swap_func_t)1
+#define SWAP_BYTES    (swap_func_t)2
+
+/* Returns true if elements can be copied using word loads and stores.
+   The SIZE and BASE must be a multiple of the ALIGN.  */
+__attribute_const__ __always_inline
+static bool
+is_aligned (const void *base, size_t size, unsigned char align)
+{
+  unsigned char lsbits = (unsigned char) size;
+
+  lsbits |= (unsigned char)(uintptr_t) base;
+  return (lsbits & (align - 1)) == 0;
+}
+
+static void
+swap_words_64 (void * restrict a, void * restrict b, size_t n)
+{
+  do
+   {
+     n -= 8;
+     uint64_t t = *(uint64_t *)(a + n);
+     *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
+     *(uint64_t *)(b + n) = t;
+   } while (n);
+}
+
+static void
+swap_words_32 (void * restrict a, void * restrict b, size_t n)
+{
+  do
+   {
+     n -= 4;
+     uint32_t t = *(uint32_t *)(a + n);
+     *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
+     *(uint32_t *)(b + n) = t;
+   } while (n);
+}
+
+static void
+swap_bytes (void * restrict a, void * restrict b, size_t n)
+{
+  /* Use multiple small memcpys with constant size to enable inlining
+     on most targets.  */
+  enum { SWAP_GENERIC_SIZE = 32 };
+  unsigned char tmp[SWAP_GENERIC_SIZE];
+  while (n > SWAP_GENERIC_SIZE)
+    {
+      __builtin_memcpy (tmp, a, SWAP_GENERIC_SIZE);
+      a = __mempcpy (a, b, SWAP_GENERIC_SIZE);
+      b = __mempcpy (b, tmp, SWAP_GENERIC_SIZE);
+      n -= SWAP_GENERIC_SIZE;
+    }
+  while (n > 0)
+    {
+      unsigned char t = ((unsigned char *)a)[--n];
+      ((unsigned char *)a)[n] = ((unsigned char *)b)[n];
+      ((unsigned char *)b)[n] = t;
+    }
+}
+
+/* Replace the indirect call with a serie of if statements.  It should help
+   the branch predictor.  */
+static void
+do_swap (void * restrict a, void * restrict b, size_t size,
+	 swap_func_t swap_func)
+{
+  if (swap_func == SWAP_WORDS_64)
+    swap_words_64 (a, b, size);
+  else if (swap_func == SWAP_WORDS_32)
+    swap_words_32 (a, b, size);
+  else
+    swap_bytes (a, b, size);
+}
 
 /* Discontinue quicksort algorithm when partition gets below this size.
    This particular magic number was chosen to work best on a Sun 4/260. */
@@ -96,6 +165,14 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
+  swap_func_t swap_func;
+  if (is_aligned (pbase, size, sizeof (uint64_t)))
+    swap_func = SWAP_WORDS_64;
+  else if (is_aligned (pbase, size, sizeof (uint32_t)))
+    swap_func = SWAP_WORDS_32;
+  else
+    swap_func = SWAP_BYTES;
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -119,13 +196,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
 	  char *mid = lo + size * ((hi - lo) / size >> 1);
 
 	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    SWAP (mid, lo, size);
+	    do_swap (mid, lo, size, swap_func);
 	  if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
-	    SWAP (mid, hi, size);
+	    do_swap (mid, hi, size, swap_func);
 	  else
 	    goto jump_over;
 	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    SWAP (mid, lo, size);
+	    do_swap (mid, lo, size, swap_func);
 	jump_over:;
 
 	  left_ptr  = lo + size;
@@ -144,7 +221,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
 
 	      if (left_ptr < right_ptr)
 		{
-		  SWAP (left_ptr, right_ptr, size);
+		  do_swap (left_ptr, right_ptr, size, swap_func);
 		  if (mid == left_ptr)
 		    mid = right_ptr;
 		  else if (mid == right_ptr)
@@ -216,7 +293,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
         tmp_ptr = run_ptr;
 
     if (tmp_ptr != base_ptr)
-      SWAP (tmp_ptr, base_ptr, size);
+      do_swap (tmp_ptr, base_ptr, size, swap_func);
 
     /* Insertion sort, running from left-hand-side up to right-hand-side.  */
 
-- 
2.34.1


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v4 2/6] stdlib: Move insertion sort out qsort
  2023-07-11 19:07 [PATCH v4 0/6] Use introsort for qsort Adhemerval Zanella
  2023-07-11 19:07 ` [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305) Adhemerval Zanella
@ 2023-07-11 19:07 ` Adhemerval Zanella
  2023-07-11 23:46   ` Noah Goldstein
  2023-07-11 19:07 ` [PATCH v4 3/6] stdlib: qsort: Move some macros to inline function Adhemerval Zanella
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 24+ messages in thread
From: Adhemerval Zanella @ 2023-07-11 19:07 UTC (permalink / raw)
  To: libc-alpha

---
 stdlib/qsort.c | 100 ++++++++++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 47 deletions(-)

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 8a3331fdb4..00637208ab 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -153,6 +153,58 @@ typedef struct
       smaller partition.  This *guarantees* no more than log (total_elems)
       stack size is needed (actually O(1) in this case)!  */
 
+static inline void
+insertion_sort (void *const pbase, size_t total_elems, size_t size,
+                swap_func_t swap_func,
+	        __compar_d_fn_t cmp, void *arg)
+{
+  char *base_ptr = (char *) pbase;
+  char *const end_ptr = &base_ptr[size * (total_elems - 1)];
+  char *tmp_ptr = base_ptr;
+#define min(x, y) ((x) < (y) ? (x) : (y))
+  const size_t max_thresh = MAX_THRESH * size;
+  char *thresh = min(end_ptr, base_ptr + max_thresh);
+  char *run_ptr;
+
+  /* Find smallest element in first threshold and place it at the
+     array's beginning.  This is the smallest array element,
+     and the operation speeds up insertion sort's inner loop. */
+
+  for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
+    if (cmp (run_ptr, tmp_ptr, arg) < 0)
+      tmp_ptr = run_ptr;
+
+  if (tmp_ptr != base_ptr)
+    do_swap (tmp_ptr, base_ptr, size, swap_func);
+
+  /* Insertion sort, running from left-hand-side up to right-hand-side.  */
+
+  run_ptr = base_ptr + size;
+  while ((run_ptr += size) <= end_ptr)
+    {
+      tmp_ptr = run_ptr - size;
+      while (cmp (run_ptr, tmp_ptr, arg) < 0)
+        tmp_ptr -= size;
+
+      tmp_ptr += size;
+      if (tmp_ptr != run_ptr)
+        {
+          char *trav;
+
+          trav = run_ptr + size;
+          while (--trav >= run_ptr)
+            {
+              char c = *trav;
+              char *hi, *lo;
+
+              for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
+                *hi = *lo;
+              *hi = c;
+            }
+        }
+    }
+}
+
 void
 _quicksort (void *const pbase, size_t total_elems, size_t size,
 	    __compar_d_fn_t cmp, void *arg)
@@ -275,51 +327,5 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
      for partitions below MAX_THRESH size. BASE_PTR points to the beginning
      of the array to sort, and END_PTR points at the very last element in
      the array (*not* one beyond it!). */
-
-#define min(x, y) ((x) < (y) ? (x) : (y))
-
-  {
-    char *const end_ptr = &base_ptr[size * (total_elems - 1)];
-    char *tmp_ptr = base_ptr;
-    char *thresh = min(end_ptr, base_ptr + max_thresh);
-    char *run_ptr;
-
-    /* Find smallest element in first threshold and place it at the
-       array's beginning.  This is the smallest array element,
-       and the operation speeds up insertion sort's inner loop. */
-
-    for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
-      if ((*cmp) ((void *) run_ptr, (void *) tmp_ptr, arg) < 0)
-        tmp_ptr = run_ptr;
-
-    if (tmp_ptr != base_ptr)
-      do_swap (tmp_ptr, base_ptr, size, swap_func);
-
-    /* Insertion sort, running from left-hand-side up to right-hand-side.  */
-
-    run_ptr = base_ptr + size;
-    while ((run_ptr += size) <= end_ptr)
-      {
-	tmp_ptr = run_ptr - size;
-	while ((*cmp) ((void *) run_ptr, (void *) tmp_ptr, arg) < 0)
-	  tmp_ptr -= size;
-
-	tmp_ptr += size;
-        if (tmp_ptr != run_ptr)
-          {
-            char *trav;
-
-	    trav = run_ptr + size;
-	    while (--trav >= run_ptr)
-              {
-                char c = *trav;
-                char *hi, *lo;
-
-                for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
-                  *hi = *lo;
-                *hi = c;
-              }
-          }
-      }
-  }
+  insertion_sort (pbase, total_elems, size, swap_func, cmp, arg);
 }
-- 
2.34.1


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v4 3/6] stdlib: qsort: Move some macros to inline function
  2023-07-11 19:07 [PATCH v4 0/6] Use introsort for qsort Adhemerval Zanella
  2023-07-11 19:07 ` [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305) Adhemerval Zanella
  2023-07-11 19:07 ` [PATCH v4 2/6] stdlib: Move insertion sort out qsort Adhemerval Zanella
@ 2023-07-11 19:07 ` Adhemerval Zanella
  2023-07-11 23:44   ` Noah Goldstein
  2023-07-11 19:07 ` [PATCH v4 4/6] stdlib: Implement introsort with qsort Adhemerval Zanella
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 24+ messages in thread
From: Adhemerval Zanella @ 2023-07-11 19:07 UTC (permalink / raw)
  To: libc-alpha

---
 stdlib/qsort.c | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 00637208ab..134e784bd1 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -118,15 +118,28 @@ typedef struct
     char *hi;
   } stack_node;
 
-/* The next 4 #defines implement a very fast in-line stack abstraction. */
 /* The stack needs log (total_elements) entries (we could even subtract
    log(MAX_THRESH)).  Since total_elements has type size_t, we get as
    upper bound for log (total_elements):
    bits per byte (CHAR_BIT) * sizeof(size_t).  */
-#define STACK_SIZE	(CHAR_BIT * sizeof (size_t))
-#define PUSH(low, high)	((void) ((top->lo = (low)), (top->hi = (high)), ++top))
-#define	POP(low, high)	((void) (--top, (low = top->lo), (high = top->hi)))
-#define	STACK_NOT_EMPTY	(stack < top)
+enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
+
+static inline stack_node *
+push (stack_node *top, char *lo, char *hi)
+{
+  top->lo = lo;
+  top->hi = hi;
+  return ++top;
+}
+
+static inline stack_node *
+pop (stack_node *top, char **lo, char **hi)
+{
+  --top;
+  *lo = top->lo;
+  *hi = top->hi;
+  return top;
+}
 
 
 /* Order size using quicksort.  This implementation incorporates
@@ -232,9 +245,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
       stack_node stack[STACK_SIZE];
       stack_node *top = stack;
 
-      PUSH (NULL, NULL);
+      top = push (top, NULL, NULL);
 
-      while (STACK_NOT_EMPTY)
+      while (stack < top)
         {
           char *left_ptr;
           char *right_ptr;
@@ -299,7 +312,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
             {
               if ((size_t) (hi - left_ptr) <= max_thresh)
 		/* Ignore both small partitions. */
-                POP (lo, hi);
+                top = pop (top, &lo, &hi);
               else
 		/* Ignore small left partition. */
                 lo = left_ptr;
@@ -310,13 +323,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
           else if ((right_ptr - lo) > (hi - left_ptr))
             {
 	      /* Push larger left partition indices. */
-              PUSH (lo, right_ptr);
+              top = push (top, lo, right_ptr);
               lo = left_ptr;
             }
           else
             {
 	      /* Push larger right partition indices. */
-              PUSH (left_ptr, hi);
+              top = push (top, left_ptr, hi);
               hi = right_ptr;
             }
         }
-- 
2.34.1


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v4 4/6] stdlib: Implement introsort with qsort
  2023-07-11 19:07 [PATCH v4 0/6] Use introsort for qsort Adhemerval Zanella
                   ` (2 preceding siblings ...)
  2023-07-11 19:07 ` [PATCH v4 3/6] stdlib: qsort: Move some macros to inline function Adhemerval Zanella
@ 2023-07-11 19:07 ` Adhemerval Zanella
  2023-07-12  5:39   ` Alexander Monakov
  2023-07-12 21:55   ` Paul Eggert
  2023-07-11 19:07 ` [PATCH v4 5/6] stdlib: Remove use of mergesort on qsort (BZ 21719) Adhemerval Zanella
  2023-07-11 19:07 ` [PATCH v4 6/6] stdlib: Add more qsort{_r} coverage Adhemerval Zanella
  5 siblings, 2 replies; 24+ messages in thread
From: Adhemerval Zanella @ 2023-07-11 19:07 UTC (permalink / raw)
  To: libc-alpha

This patch adds an introsort implementation to qsort to avoid the
worst-case performance of quicksort by bounding it to O(n log n).  The
heapsort fallback used is based on the Linux implementation (commit
22a241ccb2c19962a).  As a side note, the introsort implementation is
similar to the one used on libstdc++ for std::sort.

Checked on x86_64-linux-gnu.
---
 stdlib/qsort.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 86 insertions(+), 7 deletions(-)

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 134e784bd1..f08640b5d9 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -116,6 +116,7 @@ typedef struct
   {
     char *lo;
     char *hi;
+    size_t depth;
   } stack_node;
 
 /* The stack needs log (total_elements) entries (we could even subtract
@@ -125,22 +126,90 @@ typedef struct
 enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
 
 static inline stack_node *
-push (stack_node *top, char *lo, char *hi)
+push (stack_node *top, char *lo, char *hi, size_t depth)
 {
   top->lo = lo;
   top->hi = hi;
+  top->depth = depth;
   return ++top;
 }
 
 static inline stack_node *
-pop (stack_node *top, char **lo, char **hi)
+pop (stack_node *top, char **lo, char **hi, size_t *depth)
 {
   --top;
   *lo = top->lo;
   *hi = top->hi;
+  *depth = top->depth;
   return top;
 }
 
+/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux
+   lib/sort.c.  Used on introsort implementation as a fallback routine with
+   worst-case performance of O(nlog n) and worst-case space complexity of
+   O(1).  */
+
+static inline size_t
+parent (size_t i, unsigned int lsbit, size_t size)
+{
+  i -= size;
+  i -= size & -(i & lsbit);
+  return i / 2;
+}
+
+static void
+heapsort_r (void *base, void *end, size_t size, swap_func_t swap_func,
+	    __compar_d_fn_t cmp, void *arg)
+{
+  size_t num = ((uintptr_t) end - (uintptr_t) base) / size;
+  size_t n = num * size, a = (num/2) * size;
+  /* Used to find parent  */
+  const unsigned int lsbit = size & -size;
+
+  /* num < 2 || size == 0.  */
+  if (a == 0)
+    return;
+
+  for (;;)
+    {
+      size_t b, c, d;
+
+      if (a != 0)
+	/* Building heap: sift down --a */
+	a -= size;
+      else if (n -= size)
+	/* Sorting: Extract root to --n */
+	do_swap (base, base + n, size, swap_func);
+      else
+	break;
+
+      /* Sift element at "a" down into heap.  This is the "bottom-up" variant,
+	 which significantly reduces calls to cmp_func(): we find the sift-down
+	 path all the way to the leaves (one compare per level), then backtrack
+	 to find where to insert the target element.
+
+	 Because elements tend to sift down close to the leaves, this uses fewer
+	 compares than doing two per level on the way down.  (A bit more than
+	 half as many on average, 3/4 worst-case.).  */
+      for (b = a; c = 2 * b + size, (d = c + size) < n;)
+	b = cmp (base + c, base + d, arg) >= 0 ? c : d;
+      if (d == n)
+	/* Special case last leaf with no sibling.  */
+	b = c;
+
+      /* Now backtrack from "b" to the correct location for "a".  */
+      while (b != a && cmp (base + a, base + b, arg) >= 0)
+	b = parent (b, lsbit, size);
+      /* Where "a" belongs.  */
+      c = b;
+      while (b != a)
+	{
+	  /* Shift it into place.  */
+	  b = parent (b, lsbit, size);
+          do_swap (base + b, base + c, size, swap_func);
+        }
+    }
+}
 
 /* Order size using quicksort.  This implementation incorporates
    four optimizations discussed in Sedgewick:
@@ -226,7 +295,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
 
   const size_t max_thresh = MAX_THRESH * size;
 
-  if (total_elems == 0)
+  if (total_elems <= 0)
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
@@ -238,6 +307,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
   else
     swap_func = SWAP_BYTES;
 
+  /* Maximum depth before quicksort switches to heapsort.  */
+  size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -245,10 +317,17 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
       stack_node stack[STACK_SIZE];
       stack_node *top = stack;
 
-      top = push (top, NULL, NULL);
+      top = push (top, NULL, NULL, depth);
 
       while (stack < top)
         {
+          if (depth == 0)
+            {
+              heapsort_r (lo, hi, size, swap_func, cmp, arg);
+              top = pop (top, &lo, &hi, &depth);
+              continue;
+            }
+
           char *left_ptr;
           char *right_ptr;
 
@@ -312,7 +391,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
             {
               if ((size_t) (hi - left_ptr) <= max_thresh)
 		/* Ignore both small partitions. */
-                top = pop (top, &lo, &hi);
+                top = pop (top, &lo, &hi, &depth);
               else
 		/* Ignore small left partition. */
                 lo = left_ptr;
@@ -323,13 +402,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
           else if ((right_ptr - lo) > (hi - left_ptr))
             {
 	      /* Push larger left partition indices. */
-              top = push (top, lo, right_ptr);
+              top = push (top, lo, right_ptr, depth - 1);
               lo = left_ptr;
             }
           else
             {
 	      /* Push larger right partition indices. */
-              top = push (top, left_ptr, hi);
+              top = push (top, left_ptr, hi, depth - 1);
               hi = right_ptr;
             }
         }
-- 
2.34.1


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v4 5/6] stdlib: Remove use of mergesort on qsort (BZ 21719)
  2023-07-11 19:07 [PATCH v4 0/6] Use introsort for qsort Adhemerval Zanella
                   ` (3 preceding siblings ...)
  2023-07-11 19:07 ` [PATCH v4 4/6] stdlib: Implement introsort with qsort Adhemerval Zanella
@ 2023-07-11 19:07 ` Adhemerval Zanella
  2023-07-12 22:04   ` Paul Eggert
  2023-07-11 19:07 ` [PATCH v4 6/6] stdlib: Add more qsort{_r} coverage Adhemerval Zanella
  5 siblings, 1 reply; 24+ messages in thread
From: Adhemerval Zanella @ 2023-07-11 19:07 UTC (permalink / raw)
  To: libc-alpha

This patch removes the mergesort optimization on the qsort implementation
and uses the quicksort instead.  The mergesort implementation has some
issues:

  - It is as-safe only for certain type sizes (if the total size is less
    than 1 KB, with large element sizes also forcing memory allocation),
    which contradicts the function documentation.  Although not required
    by the C standard, it is preferable and doable to have an O(1) space
    implementation.

  - The malloc for certain element sizes and element counts adds
    arbitrary latency (might even be worse if malloc is interposed).

  - To avoid triggering swap from memory allocation the implementation
    relies on system information that might be virtualized (for instance
    VMs with overcommit memory), which might lead to potential use of
    swap even if the system advertises more memory than it actually
    has.  The check also has the downside of issuing syscalls where none
    is expected (although only once per execution).

  - The mergesort is suboptimal on an already sorted array (BZ#21719).

The quicksort implementation is already optimized to use constant extra
space (due to the limit of total number of elements from maximum VM
size) and thus can be used to avoid the malloc usage issues.

Resulting performance is slower due to the usage of quicksort, especially
in the worst-case scenario (partially or fully sorted arrays) and due to
the fact that mergesort uses slightly improved swap operations.

This change also renders the BZ#21719 fix unrequired (since it is meant
to fix the sorted input performance degradation for mergesort).  The
manual is also updated to indicate the function is now async-cancel
safe.

Checked on x86_64-linux-gnu.
---
 manual/argp.texi   |   2 +-
 manual/locale.texi |   3 +-
 manual/search.texi |   7 +-
 stdlib/Makefile    |   2 -
 stdlib/msort.c     | 309 ---------------------------------------------
 stdlib/qsort.c     |  19 ++-
 6 files changed, 20 insertions(+), 322 deletions(-)
 delete mode 100644 stdlib/msort.c

diff --git a/manual/argp.texi b/manual/argp.texi
index 0023441812..b77ad68285 100644
--- a/manual/argp.texi
+++ b/manual/argp.texi
@@ -735,7 +735,7 @@ for options, bad phase of the moon, etc.
 @c  hol_set_group ok
 @c   hol_find_entry ok
 @c  hol_sort @mtslocale @acucorrupt
-@c   qsort dup @acucorrupt
+@c   qsort dup
 @c    hol_entry_qcmp @mtslocale
 @c     hol_entry_cmp @mtslocale
 @c      group_cmp ok
diff --git a/manual/locale.texi b/manual/locale.texi
index 720e0ca952..f6afa5dc44 100644
--- a/manual/locale.texi
+++ b/manual/locale.texi
@@ -253,7 +253,7 @@ The symbols in this section are defined in the header file @file{locale.h}.
 @c    calculate_head_size ok
 @c    __munmap ok
 @c    compute_hashval ok
-@c    qsort dup @acucorrupt
+@c    qsort dup
 @c     rangecmp ok
 @c    malloc @ascuheap @acsmem
 @c    strdup @ascuheap @acsmem
@@ -275,7 +275,6 @@ The symbols in this section are defined in the header file @file{locale.h}.
 @c      realloc @ascuheap @acsmem
 @c     realloc @ascuheap @acsmem
 @c     fclose @ascuheap @asulock @acsmem @acsfd @aculock
-@c     qsort @ascuheap @acsmem
 @c      alias_compare dup
 @c    libc_lock_unlock @aculock
 @c   _nl_explode_name @ascuheap @acsmem
diff --git a/manual/search.texi b/manual/search.texi
index 5691bf2f2b..a550858478 100644
--- a/manual/search.texi
+++ b/manual/search.texi
@@ -159,7 +159,7 @@ To sort an array using an arbitrary comparison function, use the
 
 @deftypefun void qsort (void *@var{array}, size_t @var{count}, size_t @var{size}, comparison_fn_t @var{compare})
 @standards{ISO, stdlib.h}
-@safety{@prelim{}@mtsafe{}@assafe{}@acunsafe{@acucorrupt{}}}
+@safety{@prelim{}@mtsafe{}@assafe{}@acsafe{}}
 The @code{qsort} function sorts the array @var{array}.  The array
 contains @var{count} elements, each of which is of size @var{size}.
 
@@ -199,9 +199,8 @@ Functions}):
 The @code{qsort} function derives its name from the fact that it was
 originally implemented using the ``quick sort'' algorithm.
 
-The implementation of @code{qsort} in this library might not be an
-in-place sort and might thereby use an extra amount of memory to store
-the array.
+The implementation of @code{qsort} in this library is an in-place sort
+and uses a constant extra space (allocated on the stack).
 @end deftypefun
 
 @node Search/Sort Example
diff --git a/stdlib/Makefile b/stdlib/Makefile
index 25e42a77e7..095518eef4 100644
--- a/stdlib/Makefile
+++ b/stdlib/Makefile
@@ -96,7 +96,6 @@ routines := \
   mbtowc \
   mrand48 \
   mrand48_r \
-  msort \
   nrand48 \
   nrand48_r \
   old_atexit  \
@@ -380,7 +379,6 @@ generated += \
   # generated
 
 CFLAGS-bsearch.c += $(uses-callbacks)
-CFLAGS-msort.c += $(uses-callbacks)
 CFLAGS-qsort.c += $(uses-callbacks)
 CFLAGS-system.c += -fexceptions
 CFLAGS-system.os = -fomit-frame-pointer
diff --git a/stdlib/msort.c b/stdlib/msort.c
deleted file mode 100644
index bbaa5e9f82..0000000000
--- a/stdlib/msort.c
+++ /dev/null
@@ -1,309 +0,0 @@
-/* An alternative to qsort, with an identical interface.
-   This file is part of the GNU C Library.
-   Copyright (C) 1992-2023 Free Software Foundation, Inc.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <alloca.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <memcopy.h>
-#include <errno.h>
-#include <atomic.h>
-
-struct msort_param
-{
-  size_t s;
-  size_t var;
-  __compar_d_fn_t cmp;
-  void *arg;
-  char *t;
-};
-static void msort_with_tmp (const struct msort_param *p, void *b, size_t n);
-
-static void
-msort_with_tmp (const struct msort_param *p, void *b, size_t n)
-{
-  char *b1, *b2;
-  size_t n1, n2;
-
-  if (n <= 1)
-    return;
-
-  n1 = n / 2;
-  n2 = n - n1;
-  b1 = b;
-  b2 = (char *) b + (n1 * p->s);
-
-  msort_with_tmp (p, b1, n1);
-  msort_with_tmp (p, b2, n2);
-
-  char *tmp = p->t;
-  const size_t s = p->s;
-  __compar_d_fn_t cmp = p->cmp;
-  void *arg = p->arg;
-  switch (p->var)
-    {
-    case 0:
-      while (n1 > 0 && n2 > 0)
-	{
-	  if ((*cmp) (b1, b2, arg) <= 0)
-	    {
-	      *(uint32_t *) tmp = *(uint32_t *) b1;
-	      b1 += sizeof (uint32_t);
-	      --n1;
-	    }
-	  else
-	    {
-	      *(uint32_t *) tmp = *(uint32_t *) b2;
-	      b2 += sizeof (uint32_t);
-	      --n2;
-	    }
-	  tmp += sizeof (uint32_t);
-	}
-      break;
-    case 1:
-      while (n1 > 0 && n2 > 0)
-	{
-	  if ((*cmp) (b1, b2, arg) <= 0)
-	    {
-	      *(uint64_t *) tmp = *(uint64_t *) b1;
-	      b1 += sizeof (uint64_t);
-	      --n1;
-	    }
-	  else
-	    {
-	      *(uint64_t *) tmp = *(uint64_t *) b2;
-	      b2 += sizeof (uint64_t);
-	      --n2;
-	    }
-	  tmp += sizeof (uint64_t);
-	}
-      break;
-    case 2:
-      while (n1 > 0 && n2 > 0)
-	{
-	  unsigned long *tmpl = (unsigned long *) tmp;
-	  unsigned long *bl;
-
-	  tmp += s;
-	  if ((*cmp) (b1, b2, arg) <= 0)
-	    {
-	      bl = (unsigned long *) b1;
-	      b1 += s;
-	      --n1;
-	    }
-	  else
-	    {
-	      bl = (unsigned long *) b2;
-	      b2 += s;
-	      --n2;
-	    }
-	  while (tmpl < (unsigned long *) tmp)
-	    *tmpl++ = *bl++;
-	}
-      break;
-    case 3:
-      while (n1 > 0 && n2 > 0)
-	{
-	  if ((*cmp) (*(const void **) b1, *(const void **) b2, arg) <= 0)
-	    {
-	      *(void **) tmp = *(void **) b1;
-	      b1 += sizeof (void *);
-	      --n1;
-	    }
-	  else
-	    {
-	      *(void **) tmp = *(void **) b2;
-	      b2 += sizeof (void *);
-	      --n2;
-	    }
-	  tmp += sizeof (void *);
-	}
-      break;
-    default:
-      while (n1 > 0 && n2 > 0)
-	{
-	  if ((*cmp) (b1, b2, arg) <= 0)
-	    {
-	      tmp = (char *) __mempcpy (tmp, b1, s);
-	      b1 += s;
-	      --n1;
-	    }
-	  else
-	    {
-	      tmp = (char *) __mempcpy (tmp, b2, s);
-	      b2 += s;
-	      --n2;
-	    }
-	}
-      break;
-    }
-
-  if (n1 > 0)
-    memcpy (tmp, b1, n1 * s);
-  memcpy (b, p->t, (n - n2) * s);
-}
-
-
-void
-__qsort_r (void *b, size_t n, size_t s, __compar_d_fn_t cmp, void *arg)
-{
-  size_t size = n * s;
-  char *tmp = NULL;
-  struct msort_param p;
-
-  /* For large object sizes use indirect sorting.  */
-  if (s > 32)
-    size = 2 * n * sizeof (void *) + s;
-
-  if (size < 1024)
-    /* The temporary array is small, so put it on the stack.  */
-    p.t = __alloca (size);
-  else
-    {
-      /* We should avoid allocating too much memory since this might
-	 have to be backed up by swap space.  */
-      static long int phys_pages;
-      static int pagesize;
-
-      if (pagesize == 0)
-	{
-	  phys_pages = __sysconf (_SC_PHYS_PAGES);
-
-	  if (phys_pages == -1)
-	    /* Error while determining the memory size.  So let's
-	       assume there is enough memory.  Otherwise the
-	       implementer should provide a complete implementation of
-	       the `sysconf' function.  */
-	    phys_pages = (long int) (~0ul >> 1);
-
-	  /* The following determines that we will never use more than
-	     a quarter of the physical memory.  */
-	  phys_pages /= 4;
-
-	  /* Make sure phys_pages is written to memory.  */
-	  atomic_write_barrier ();
-
-	  pagesize = __sysconf (_SC_PAGESIZE);
-	}
-
-      /* Just a comment here.  We cannot compute
-	   phys_pages * pagesize
-	   and compare the needed amount of memory against this value.
-	   The problem is that some systems might have more physical
-	   memory then can be represented with a `size_t' value (when
-	   measured in bytes.  */
-
-      /* If the memory requirements are too high don't allocate memory.  */
-      if (size / pagesize > (size_t) phys_pages)
-	{
-	  _quicksort (b, n, s, cmp, arg);
-	  return;
-	}
-
-      /* It's somewhat large, so malloc it.  */
-      int save = errno;
-      tmp = malloc (size);
-      __set_errno (save);
-      if (tmp == NULL)
-	{
-	  /* Couldn't get space, so use the slower algorithm
-	     that doesn't need a temporary array.  */
-	  _quicksort (b, n, s, cmp, arg);
-	  return;
-	}
-      p.t = tmp;
-    }
-
-  p.s = s;
-  p.var = 4;
-  p.cmp = cmp;
-  p.arg = arg;
-
-  if (s > 32)
-    {
-      /* Indirect sorting.  */
-      char *ip = (char *) b;
-      void **tp = (void **) (p.t + n * sizeof (void *));
-      void **t = tp;
-      void *tmp_storage = (void *) (tp + n);
-
-      while ((void *) t < tmp_storage)
-	{
-	  *t++ = ip;
-	  ip += s;
-	}
-      p.s = sizeof (void *);
-      p.var = 3;
-      msort_with_tmp (&p, p.t + n * sizeof (void *), n);
-
-      /* tp[0] .. tp[n - 1] is now sorted, copy around entries of
-	 the original array.  Knuth vol. 3 (2nd ed.) exercise 5.2-10.  */
-      char *kp;
-      size_t i;
-      for (i = 0, ip = (char *) b; i < n; i++, ip += s)
-	if ((kp = tp[i]) != ip)
-	  {
-	    size_t j = i;
-	    char *jp = ip;
-	    memcpy (tmp_storage, ip, s);
-
-	    do
-	      {
-		size_t k = (kp - (char *) b) / s;
-		tp[j] = jp;
-		memcpy (jp, kp, s);
-		j = k;
-		jp = kp;
-		kp = tp[k];
-	      }
-	    while (kp != ip);
-
-	    tp[j] = jp;
-	    memcpy (jp, tmp_storage, s);
-	  }
-    }
-  else
-    {
-      if ((s & (sizeof (uint32_t) - 1)) == 0
-	  && ((uintptr_t) b) % __alignof__ (uint32_t) == 0)
-	{
-	  if (s == sizeof (uint32_t))
-	    p.var = 0;
-	  else if (s == sizeof (uint64_t)
-		   && ((uintptr_t) b) % __alignof__ (uint64_t) == 0)
-	    p.var = 1;
-	  else if ((s & (sizeof (unsigned long) - 1)) == 0
-		   && ((uintptr_t) b)
-		      % __alignof__ (unsigned long) == 0)
-	    p.var = 2;
-	}
-      msort_with_tmp (&p, b, n);
-    }
-  free (tmp);
-}
-libc_hidden_def (__qsort_r)
-weak_alias (__qsort_r, qsort_r)
-
-
-void
-qsort (void *b, size_t n, size_t s, __compar_fn_t cmp)
-{
-  return __qsort_r (b, n, s, (__compar_d_fn_t) cmp, NULL);
-}
-libc_hidden_def (qsort)
diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index f08640b5d9..414ee4e93e 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -19,7 +19,6 @@
    Engineering a sort function; Jon Bentley and M. Douglas McIlroy;
    Software - Practice and Experience; Vol. 23 (11), 1249-1265, 1993.  */
 
-#include <alloca.h>
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
@@ -152,6 +151,7 @@ pop (stack_node *top, char **lo, char **hi, size_t *depth)
 static inline size_t
 parent (size_t i, unsigned int lsbit, size_t size)
 {
+  /* The below is logically equivalent to 'if (i & lsbit) i -= size'.  */
   i -= size;
   i -= size & -(i & lsbit);
   return i / 2;
@@ -288,8 +288,8 @@ insertion_sort (void *const pbase, size_t total_elems, size_t size,
 }
 
 void
-_quicksort (void *const pbase, size_t total_elems, size_t size,
-	    __compar_d_fn_t cmp, void *arg)
+__qsort_r (void *const pbase, size_t total_elems, size_t size,
+	   __compar_d_fn_t cmp, void *arg)
 {
   char *base_ptr = (char *) pbase;
 
@@ -308,7 +308,8 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
     swap_func = SWAP_BYTES;
 
   /* Maximum depth before quicksort switches to heapsort.  */
-  size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));
+  size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1
+                 - __builtin_clzl (total_elems));
 
   if (total_elems > MAX_THRESH)
     {
@@ -421,3 +422,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
      the array (*not* one beyond it!). */
   insertion_sort (pbase, total_elems, size, swap_func, cmp, arg);
 }
+
+libc_hidden_def (__qsort_r)
+weak_alias (__qsort_r, qsort_r)
+
+void
+qsort (void *b, size_t n, size_t s, __compar_fn_t cmp)
+{
+  return __qsort_r (b, n, s, (__compar_d_fn_t) cmp, NULL);
+}
+libc_hidden_def (qsort)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v4 6/6] stdlib: Add more qsort{_r} coverage
  2023-07-11 19:07 [PATCH v4 0/6] Use introsort for qsort Adhemerval Zanella
                   ` (4 preceding siblings ...)
  2023-07-11 19:07 ` [PATCH v4 5/6] stdlib: Remove use of mergesort on qsort (BZ 21719) Adhemerval Zanella
@ 2023-07-11 19:07 ` Adhemerval Zanella
  5 siblings, 0 replies; 24+ messages in thread
From: Adhemerval Zanella @ 2023-07-11 19:07 UTC (permalink / raw)
  To: libc-alpha

This patch adds tests for qsort and qsort_r (which glibc currently
lacks coverage for).  The tests are done with random input, different
internal types (uint8_t, uint16_t, uint32_t, and uint64_t), and with
different sets of element numbers.

Checked on x86_64-linux-gnu.
---
 stdlib/Makefile     |   1 +
 stdlib/tst-qsort3.c | 298 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 299 insertions(+)
 create mode 100644 stdlib/tst-qsort3.c

diff --git a/stdlib/Makefile b/stdlib/Makefile
index 095518eef4..6af606136e 100644
--- a/stdlib/Makefile
+++ b/stdlib/Makefile
@@ -214,6 +214,7 @@ tests := \
   tst-on_exit \
   tst-qsort \
   tst-qsort2 \
+  tst-qsort3 \
   tst-quick_exit \
   tst-rand48 \
   tst-rand48-2 \
diff --git a/stdlib/tst-qsort3.c b/stdlib/tst-qsort3.c
new file mode 100644
index 0000000000..86d054ff38
--- /dev/null
+++ b/stdlib/tst-qsort3.c
@@ -0,0 +1,298 @@
+/* qsort(_r) generic tests.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <array_length.h>
+#include <errno.h>
+#include <getopt.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/test-driver.h>
+
+typedef enum
+{
+  Sorted,
+  Random,
+  Repeated,
+  Bitonic
+} arraytype_t;
+
+/* Ratio of total of elements which will be repeated.  */
+static const double RepeatedRatio = 0.2;
+
+struct array_t
+{
+  arraytype_t type;
+  const char *name;
+} static const arraytypes[] =
+{
+  { Sorted,       "Sorted" },
+  { Random,       "Random" },
+  { Repeated,     "Repeated" },
+  { Bitonic,      "Bitonic" },
+};
+
+/* Return the index of BASE as interpreted as an array of elements
+   of size SIZE.  */
+static inline void *
+arr (void *base, size_t idx, size_t size)
+{
+  return (void*)((uintptr_t)base + (idx * size));
+}
+
+/* Functions used to check qsort.  */
+static int
+uint8_t_cmp (const void *a, const void *b)
+{
+  uint8_t ia = *(uint8_t*)a;
+  uint8_t ib = *(uint8_t*)b;
+  return (ia > ib) - (ia < ib);
+}
+
+static int
+uint16_t_cmp (const void *a, const void *b)
+{
+  uint16_t ia = *(uint16_t*)a;
+  uint16_t ib = *(uint16_t*)b;
+  return (ia > ib) - (ia < ib);
+}
+
+static int
+uint32_t_cmp (const void *a, const void *b)
+{
+  uint32_t ia = *(uint32_t*)a;
+  uint32_t ib = *(uint32_t*)b;
+  return (ia > ib) - (ia < ib);
+}
+
+static int
+uint64_t_cmp (const void *a, const void *b)
+{
+  uint64_t ia = *(uint64_t*)a;
+  uint64_t ib = *(uint64_t*)b;
+  return (ia > ib) - (ia < ib);
+}
+
+#define LARGE_SIZE 48
+
+static int
+large_cmp (const void *a, const void *b)
+{
+  return memcmp (a, b, LARGE_SIZE);
+}
+
+/* Function used to check qsort_r.  */
+typedef enum
+{
+  UINT8_CMP_T,
+  UINT16_CMP_T,
+  UINT32_CMP_T,
+  UINT64_CMP_T,
+  LARGE_CMP_T
+} type_cmp_t;
+
+static type_cmp_t
+uint_t_cmp_type (size_t sz)
+{
+  switch (sz)
+    {
+      case sizeof (uint8_t):  return UINT8_CMP_T;
+      case sizeof (uint16_t): return UINT16_CMP_T;
+      case sizeof (uint64_t): return UINT64_CMP_T;
+      case sizeof (uint32_t): return UINT32_CMP_T;
+      default:                return LARGE_CMP_T;
+    }
+}
+
+static int
+uint_t_cmp (const void *a, const void *b, void *arg)
+{
+  type_cmp_t type = *(type_cmp_t*) arg;
+  switch (type)
+    {
+    case UINT8_CMP_T:  return uint8_t_cmp (a, b);
+    case UINT32_CMP_T: return uint32_t_cmp (a, b);
+    case UINT16_CMP_T: return uint16_t_cmp (a, b);
+    case UINT64_CMP_T: return uint64_t_cmp (a, b);
+    default:           return large_cmp (a, b);
+    }
+}
+
+static void
+seq (void *elem, size_t type_size, int value)
+{
+  if (type_size == sizeof (uint8_t))
+    *(uint8_t*)elem = value;
+  else if (type_size == sizeof (uint16_t))
+    *(uint16_t*)elem = value;
+  else if (type_size == sizeof (uint32_t))
+    *(uint32_t*)elem = value;
+  else if (type_size == sizeof (uint64_t))
+    *(uint64_t*)elem = value;
+  else
+    memset (elem, value, type_size);
+}
+
+static void *
+create_array (size_t nmemb, size_t type_size, arraytype_t type)
+{
+  size_t size = nmemb * type_size;
+  void *array = xmalloc (size);
+
+  switch (type)
+    {
+    case Sorted:
+      for (size_t i = 0; i < nmemb; i++)
+	seq (arr (array, i, type_size), type_size, i);
+      break;
+
+    case Random:
+      arc4random_buf (array, size);
+      break;
+
+    case Repeated:
+      {
+        arc4random_buf (array, size);
+
+	void *randelem = xmalloc (type_size);
+	arc4random_buf (randelem, type_size);
+
+	/* Repeat REPEATED elements (based on RepeatRatio ratio) in the random
+	   array.  */
+        size_t repeated = (size_t)(nmemb * RepeatedRatio);
+	for (size_t i = 0; i < repeated; i++)
+	  {
+	    size_t pos = arc4random_uniform (nmemb - 1);
+	    memcpy (arr (array, pos, type_size), randelem, type_size);
+	  }
+	free (randelem);
+      }
+      break;
+
+    case Bitonic:
+      {
+	size_t i;
+        for (i = 0; i < nmemb / 2; i++)
+	  seq (arr (array, i, type_size), type_size, i);
+        for (     ; i < nmemb;     i++)
+	  seq (arr (array, i, type_size), type_size, (nmemb - 1) - i);
+      }
+      break;
+    }
+
+  return array;
+}
+
+typedef int (*cmpfunc_t)(const void *, const void *);
+
+/* Check if ARRAY of total NMEMB element of size SIZE is sorted
+   based on CMPFUNC.  */
+static void
+check_array (void *array, size_t nmemb, size_t type_size,
+	     cmpfunc_t cmpfunc)
+{
+  for (size_t i = 1; i < nmemb; i++)
+    {
+      int ret = cmpfunc (arr (array, i,   type_size),
+			 arr (array, i-1, type_size));
+      TEST_VERIFY_EXIT (ret >= 0);
+    }
+}
+
+static void
+check_qsort (size_t nelem, size_t type_size, arraytype_t type,
+	     cmpfunc_t cmpfunc)
+{
+  void *array = create_array (nelem, type_size, type);
+
+  qsort (array, nelem, type_size, cmpfunc);
+
+  check_array (array, nelem, type_size, cmpfunc);
+
+  free (array);
+}
+
+static void
+check_qsort_r (size_t nelem, size_t type_size, arraytype_t type,
+	       cmpfunc_t cmpfunc)
+{
+  void *array = create_array (nelem, type_size, type);
+
+  type_cmp_t typecmp = uint_t_cmp_type (type_size);
+  qsort_r (array, nelem, type_size, uint_t_cmp, &typecmp);
+
+  check_array (array, nelem, type_size, cmpfunc);
+
+  free (array);
+}
+
+static int
+do_test (void)
+{
+  /* Some random sizes.  */
+  const size_t nelems[] = { 0, 1, 7, 20, 32, 100, 256, 1024, 8560, 17384 };
+
+  struct test_t
+    {
+      size_t type_size;
+      cmpfunc_t cmpfunc;
+    }
+  const tests[] =
+    {
+      { sizeof (uint8_t),  uint8_t_cmp },
+      { sizeof (uint16_t), uint16_t_cmp },
+      { sizeof (uint32_t), uint32_t_cmp },
+      { sizeof (uint64_t), uint64_t_cmp },
+      /* Test swap with large elements.  */
+      { LARGE_SIZE,        large_cmp },
+    };
+
+  for (const struct test_t *test = tests; test < array_end (tests); ++test)
+    {
+      if (test_verbose > 0)
+	printf ("info: testing qsort with type_size=%zu\n", test->type_size);
+      for (const struct array_t *arraytype = arraytypes;
+	   arraytype < array_end (arraytypes);
+	   ++arraytype)
+	{
+	  if (test_verbose > 0)
+            printf ("  distribution=%s\n", arraytype->name);
+	  for (const size_t *nelem = nelems;
+	       nelem < array_end (nelems);
+	       ++nelem)
+	    {
+	      if (test_verbose > 0)
+		printf ("  i  nelem=%zu, total size=%zu\n", *nelem,
+			*nelem * test->type_size);
+
+	      check_qsort (*nelem, test->type_size, arraytype->type,
+			   test->cmpfunc);
+	      check_qsort_r (*nelem, test->type_size, arraytype->type,
+			     test->cmpfunc);
+	   }
+	}
+    }
+
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305)
  2023-07-11 19:07 ` [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305) Adhemerval Zanella
@ 2023-07-11 23:40   ` Noah Goldstein
  2023-07-12 19:40     ` Adhemerval Zanella Netto
  2023-07-12 21:04   ` Paul Eggert
  1 sibling, 1 reply; 24+ messages in thread
From: Noah Goldstein @ 2023-07-11 23:40 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

On Tue, Jul 11, 2023 at 2:08 PM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> It optimizes by taking into consideration that the most common elements
> are either 32 or 64 bits in size and that inputs are aligned to the word
> boundary.  This is similar to the optimization done on lib/sort.c
> from Linux.
>
> This patch adds an optimized swap operation to qsort based on the previous
> msort one.  Instead of a byte-wise operation, three variants are provided:
>
>   1. Using uint32_t loads and stores.
>   2. Using uint64_t loads and stores.
>   3. Generic one with a temporary buffer and memcpy/mempcpy.
>
> The 1. and 2. options are selected only if base pointer is aligned
> to required type.
>
> It also fixes BZ#19305 by checking input size against number of
> elements 1 besides 0.
>
> Checked on x86_64-linux-gnu.
> ---
>  stdlib/qsort.c | 113 +++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 95 insertions(+), 18 deletions(-)
>
> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> index 728a0ed370..8a3331fdb4 100644
> --- a/stdlib/qsort.c
> +++ b/stdlib/qsort.c
> @@ -23,20 +23,89 @@
>  #include <limits.h>
>  #include <stdlib.h>
>  #include <string.h>
> +#include <stdbool.h>
> +#include <libc-pointer-arith.h>
>
> -/* Byte-wise swap two items of size SIZE. */
> -#define SWAP(a, b, size)                                                     \
> -  do                                                                         \
> -    {                                                                        \
> -      size_t __size = (size);                                                \
> -      char *__a = (a), *__b = (b);                                           \
> -      do                                                                     \
> -       {                                                                     \
> -         char __tmp = *__a;                                                  \
> -         *__a++ = *__b;                                                      \
> -         *__b++ = __tmp;                                                     \
> -       } while (--__size > 0);                                               \
> -    } while (0)
> +/* Swap SIZE bytes between addresses A and B.  These helpers are provided
> +   along the generic one as an optimization.  */
> +
> +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
> +
> +#define SWAP_WORDS_64 (swap_func_t)0
> +#define SWAP_WORDS_32 (swap_func_t)1
> +#define SWAP_BYTES    (swap_func_t)2
> +

Why bother making this a function pointer? Why not just make it an enum?
> +/* Returns true if elements can be copied using word loads and stores.
> +   The SIZE and BASE must be a multiple of the ALIGN.  */
> +__attribute_const__ __always_inline
> +static bool
> +is_aligned (const void *base, size_t size, unsigned char align)
> +{
> +  unsigned char lsbits = (unsigned char) size;
> +
> +  lsbits |= (unsigned char)(uintptr_t) base;
> +  return (lsbits & (align - 1)) == 0;
> +}
> +
> +static void
> +swap_words_64 (void * restrict a, void * restrict b, size_t n)
> +{
> +  do
> +   {
> +     n -= 8;
> +     uint64_t t = *(uint64_t *)(a + n);
> +     *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
> +     *(uint64_t *)(b + n) = t;
> +   } while (n);
> +}
> +
> +static void
> +swap_words_32 (void * restrict a, void * restrict b, size_t n)
> +{
> +  do
> +   {
> +     n -= 4;
> +     uint32_t t = *(uint32_t *)(a + n);
> +     *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
> +     *(uint32_t *)(b + n) = t;
> +   } while (n);
> +}
> +
> +static void
> +swap_bytes (void * restrict a, void * restrict b, size_t n)
> +{
> +  /* Use multiple small memcpys with constant size to enable inlining
> +     on most targets.  */
> +  enum { SWAP_GENERIC_SIZE = 32 };
> +  unsigned char tmp[SWAP_GENERIC_SIZE];
> +  while (n > SWAP_GENERIC_SIZE)
> +    {
> +      __builtin_memcpy (tmp, a, SWAP_GENERIC_SIZE);
> +      a = __mempcpy (a, b, SWAP_GENERIC_SIZE);
> +      b = __mempcpy (b, tmp, SWAP_GENERIC_SIZE);
> +      n -= SWAP_GENERIC_SIZE;
> +    }
> +  while (n > 0)
> +    {
> +      unsigned char t = ((unsigned char *)a)[--n];
> +      ((unsigned char *)a)[n] = ((unsigned char *)b)[n];
> +      ((unsigned char *)b)[n] = t;
> +    }
> +}
> +
> +/* Replace the indirect call with a serie of if statements.  It should help
> +   the branch predictor.  */
> +static void
> +do_swap (void * restrict a, void * restrict b, size_t size,
> +        swap_func_t swap_func)
> +{
> +  if (swap_func == SWAP_WORDS_64)
> +    swap_words_64 (a, b, size);
> +  else if (swap_func == SWAP_WORDS_32)
> +    swap_words_32 (a, b, size);
> +  else
> +    swap_bytes (a, b, size);
> +}
>
>  /* Discontinue quicksort algorithm when partition gets below this size.
>     This particular magic number was chosen to work best on a Sun 4/260. */
> @@ -96,6 +165,14 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>      /* Avoid lossage with unsigned arithmetic below.  */
>      return;
>
> +  swap_func_t swap_func;
> +  if (is_aligned (pbase, size, sizeof (uint64_t)))
> +    swap_func = SWAP_WORDS_64;
> +  else if (is_aligned (pbase, size, sizeof (uint32_t)))
> +    swap_func = SWAP_WORDS_32;
> +  else
> +    swap_func = SWAP_BYTES;
> +
>    if (total_elems > MAX_THRESH)
>      {
>        char *lo = base_ptr;
> @@ -119,13 +196,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>           char *mid = lo + size * ((hi - lo) / size >> 1);
>
>           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
> -           SWAP (mid, lo, size);
> +           do_swap (mid, lo, size, swap_func);
>           if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
> -           SWAP (mid, hi, size);
> +           do_swap (mid, hi, size, swap_func);
>           else
>             goto jump_over;
>           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
> -           SWAP (mid, lo, size);
> +           do_swap (mid, lo, size, swap_func);
>         jump_over:;
>
>           left_ptr  = lo + size;
> @@ -144,7 +221,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>
>               if (left_ptr < right_ptr)
>                 {
> -                 SWAP (left_ptr, right_ptr, size);
> +                 do_swap (left_ptr, right_ptr, size, swap_func);
>                   if (mid == left_ptr)
>                     mid = right_ptr;
>                   else if (mid == right_ptr)
> @@ -216,7 +293,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>          tmp_ptr = run_ptr;
>
>      if (tmp_ptr != base_ptr)
> -      SWAP (tmp_ptr, base_ptr, size);
> +      do_swap (tmp_ptr, base_ptr, size, swap_func);
>
>      /* Insertion sort, running from left-hand-side up to right-hand-side.  */
>
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 3/6] stdlib: qsort: Move some macros to inline function
  2023-07-11 19:07 ` [PATCH v4 3/6] stdlib: qsort: Move some macros to inline function Adhemerval Zanella
@ 2023-07-11 23:44   ` Noah Goldstein
  2023-07-12 20:45     ` Adhemerval Zanella Netto
  0 siblings, 1 reply; 24+ messages in thread
From: Noah Goldstein @ 2023-07-11 23:44 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

On Tue, Jul 11, 2023 at 2:10 PM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> ---
>  stdlib/qsort.c | 33 +++++++++++++++++++++++----------
>  1 file changed, 23 insertions(+), 10 deletions(-)
>
> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> index 00637208ab..134e784bd1 100644
> --- a/stdlib/qsort.c
> +++ b/stdlib/qsort.c
> @@ -118,15 +118,28 @@ typedef struct
>      char *hi;
>    } stack_node;
>
> -/* The next 4 #defines implement a very fast in-line stack abstraction. */
>  /* The stack needs log (total_elements) entries (we could even subtract
>     log(MAX_THRESH)).  Since total_elements has type size_t, we get as
>     upper bound for log (total_elements):
>     bits per byte (CHAR_BIT) * sizeof(size_t).  */
> -#define STACK_SIZE     (CHAR_BIT * sizeof (size_t))
> -#define PUSH(low, high)        ((void) ((top->lo = (low)), (top->hi = (high)), ++top))
> -#define        POP(low, high)  ((void) (--top, (low = top->lo), (high = top->hi)))
> -#define        STACK_NOT_EMPTY (stack < top)
> +enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
> +
> +static inline stack_node *
> +push (stack_node *top, char *lo, char *hi)
> +{
> +  top->lo = lo;
> +  top->hi = hi;
> +  return ++top;
> +}
> +
> +static inline stack_node *
> +pop (stack_node *top, char **lo, char **hi)
> +{
> +  --top;
> +  *lo = top->lo;
> +  *hi = top->hi;
> +  return top;
> +}
>
>
>  /* Order size using quicksort.  This implementation incorporates
> @@ -232,9 +245,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>        stack_node stack[STACK_SIZE];
>        stack_node *top = stack;
>
> -      PUSH (NULL, NULL);
> +      top = push (top, NULL, NULL);
>
You could also drop this push and just set `top = stack + 1;`
> -      while (STACK_NOT_EMPTY)
> +      while (stack < top)
>          {
>            char *left_ptr;
>            char *right_ptr;
> @@ -299,7 +312,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>              {
>                if ((size_t) (hi - left_ptr) <= max_thresh)
>                 /* Ignore both small partitions. */
> -                POP (lo, hi);
> +                top = pop (top, &lo, &hi);
>                else
>                 /* Ignore small left partition. */
>                  lo = left_ptr;
> @@ -310,13 +323,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>            else if ((right_ptr - lo) > (hi - left_ptr))
>              {
>               /* Push larger left partition indices. */
> -              PUSH (lo, right_ptr);
> +              top = push (top, lo, right_ptr);
>                lo = left_ptr;
>              }
>            else
>              {
>               /* Push larger right partition indices. */
> -              PUSH (left_ptr, hi);
> +              top = push (top, left_ptr, hi);
>                hi = right_ptr;
>              }
>          }
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 2/6] stdlib: Move insertion sort out qsort
  2023-07-11 19:07 ` [PATCH v4 2/6] stdlib: Move insertion sort out qsort Adhemerval Zanella
@ 2023-07-11 23:46   ` Noah Goldstein
  2023-07-12  7:28     ` Andreas Schwab
  2023-07-12 20:35     ` Adhemerval Zanella Netto
  0 siblings, 2 replies; 24+ messages in thread
From: Noah Goldstein @ 2023-07-11 23:46 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

On Tue, Jul 11, 2023 at 2:09 PM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> ---
>  stdlib/qsort.c | 100 ++++++++++++++++++++++++++-----------------------
>  1 file changed, 53 insertions(+), 47 deletions(-)
>
> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> index 8a3331fdb4..00637208ab 100644
> --- a/stdlib/qsort.c
> +++ b/stdlib/qsort.c
> @@ -153,6 +153,58 @@ typedef struct
>        smaller partition.  This *guarantees* no more than log (total_elems)
>        stack size is needed (actually O(1) in this case)!  */
>
> +static inline void
> +insertion_sort (void *const pbase, size_t total_elems, size_t size,

Maybe a less generic name would be better. Something like:
"insertion_sort_qsort_partions" to indicate this is not just standard
insertion sort but a helper for qsort. Just thinking about a reader
grepping around in a year or so.
> +                swap_func_t swap_func,
> +               __compar_d_fn_t cmp, void *arg)
> +{
> +  char *base_ptr = (char *) pbase;
> +  char *const end_ptr = &base_ptr[size * (total_elems - 1)];
> +  char *tmp_ptr = base_ptr;
> +#define min(x, y) ((x) < (y) ? (x) : (y))
> +  const size_t max_thresh = MAX_THRESH * size;
> +  char *thresh = min(end_ptr, base_ptr + max_thresh);
> +  char *run_ptr;
> +
> +  /* Find smallest element in first threshold and place it at the
> +     array's beginning.  This is the smallest array element,
> +     and the operation speeds up insertion sort's inner loop. */
> +
> +  for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
> +    if (cmp (run_ptr, tmp_ptr, arg) < 0)
> +      tmp_ptr = run_ptr;
> +
> +  if (tmp_ptr != base_ptr)
> +    do_swap (tmp_ptr, base_ptr, size, swap_func);
> +
> +  /* Insertion sort, running from left-hand-side up to right-hand-side.  */
> +
> +  run_ptr = base_ptr + size;
> +  while ((run_ptr += size) <= end_ptr)
> +    {
> +      tmp_ptr = run_ptr - size;
> +      while (cmp (run_ptr, tmp_ptr, arg) < 0)
> +        tmp_ptr -= size;
> +
> +      tmp_ptr += size;
> +      if (tmp_ptr != run_ptr)
> +        {
> +          char *trav;
> +
> +          trav = run_ptr + size;
> +          while (--trav >= run_ptr)
> +            {
> +              char c = *trav;
> +              char *hi, *lo;
> +
> +              for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
> +                *hi = *lo;
> +              *hi = c;
> +            }
> +        }
> +    }
> +}
> +
>  void
>  _quicksort (void *const pbase, size_t total_elems, size_t size,
>             __compar_d_fn_t cmp, void *arg)
> @@ -275,51 +327,5 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>       for partitions below MAX_THRESH size. BASE_PTR points to the beginning
>       of the array to sort, and END_PTR points at the very last element in
>       the array (*not* one beyond it!). */
> -
> -#define min(x, y) ((x) < (y) ? (x) : (y))
> -
> -  {
> -    char *const end_ptr = &base_ptr[size * (total_elems - 1)];
> -    char *tmp_ptr = base_ptr;
> -    char *thresh = min(end_ptr, base_ptr + max_thresh);
> -    char *run_ptr;
> -
> -    /* Find smallest element in first threshold and place it at the
> -       array's beginning.  This is the smallest array element,
> -       and the operation speeds up insertion sort's inner loop. */
> -
> -    for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
> -      if ((*cmp) ((void *) run_ptr, (void *) tmp_ptr, arg) < 0)
> -        tmp_ptr = run_ptr;
> -
> -    if (tmp_ptr != base_ptr)
> -      do_swap (tmp_ptr, base_ptr, size, swap_func);
> -
> -    /* Insertion sort, running from left-hand-side up to right-hand-side.  */
> -
> -    run_ptr = base_ptr + size;
> -    while ((run_ptr += size) <= end_ptr)
> -      {
> -       tmp_ptr = run_ptr - size;
> -       while ((*cmp) ((void *) run_ptr, (void *) tmp_ptr, arg) < 0)
> -         tmp_ptr -= size;
> -
> -       tmp_ptr += size;
> -        if (tmp_ptr != run_ptr)
> -          {
> -            char *trav;
> -
> -           trav = run_ptr + size;
> -           while (--trav >= run_ptr)
> -              {
> -                char c = *trav;
> -                char *hi, *lo;
> -
> -                for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
> -                  *hi = *lo;
> -                *hi = c;
> -              }
> -          }
> -      }
> -  }
> +  insertion_sort (pbase, total_elems, size, swap_func, cmp, arg);
>  }
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 4/6] stdlib: Implement introsort with qsort
  2023-07-11 19:07 ` [PATCH v4 4/6] stdlib: Implement introsort with qsort Adhemerval Zanella
@ 2023-07-12  5:39   ` Alexander Monakov
  2023-07-12 20:55     ` Adhemerval Zanella Netto
  2023-07-12 21:55   ` Paul Eggert
  1 sibling, 1 reply; 24+ messages in thread
From: Alexander Monakov @ 2023-07-12  5:39 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

Hi,

On Tue, 11 Jul 2023, Adhemerval Zanella via Libc-alpha wrote:

> @@ -226,7 +295,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>  
>    const size_t max_thresh = MAX_THRESH * size;
>  
> -  if (total_elems == 0)
> +  if (total_elems <= 0)

The previous version of the patchset had '<= 1' here, which is more appropriate.
I am surprised at the change.

>      /* Avoid lossage with unsigned arithmetic below.  */
>      return;
>  
> @@ -238,6 +307,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>    else
>      swap_func = SWAP_BYTES;
>  
> +  /* Maximum depth before quicksort switches to heapsort.  */
> +  size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));
> +

This still carries the arithmetic issue I pointed out when this was previously
posted: the expression in parenthesis is usually negative, making 'depth' huge
after unsigned wraparound:
https://inbox.sourceware.org/libc-alpha/c16f6331-c0ce-4362-718c-8b7d8e8cbbe2@linaro.org/

Alexander

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 2/6] stdlib: Move insertion sort out qsort
  2023-07-11 23:46   ` Noah Goldstein
@ 2023-07-12  7:28     ` Andreas Schwab
  2023-07-12 20:35     ` Adhemerval Zanella Netto
  1 sibling, 0 replies; 24+ messages in thread
From: Andreas Schwab @ 2023-07-12  7:28 UTC (permalink / raw)
  To: Noah Goldstein via Libc-alpha; +Cc: Adhemerval Zanella, Noah Goldstein

On Jul 11 2023, Noah Goldstein via Libc-alpha wrote:

> Maybe a less generic name would be better. Something like:
> "insertion_sort_qsort_partions" to indicate this is not just standard

s/partions/partitions/

-- 
Andreas Schwab, SUSE Labs, schwab@suse.de
GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE  1748 E4D4 88E3 0EEA B9D7
"And now for something completely different."

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305)
  2023-07-11 23:40   ` Noah Goldstein
@ 2023-07-12 19:40     ` Adhemerval Zanella Netto
  0 siblings, 0 replies; 24+ messages in thread
From: Adhemerval Zanella Netto @ 2023-07-12 19:40 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha



On 11/07/23 20:40, Noah Goldstein wrote:
> On Tue, Jul 11, 2023 at 2:08 PM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> It optimizes take in consideration both the most common elements are
>> either 32 or 64 bit in size and inputs are aligned to the word
>> boundary.  This is similar to the optimization done on lib/sort.c
>> from Linux.
>>
>> This patchs adds an optimized swap operation on qsort based in previous
>> msort one.  Instead of byte operation, three variants are provided:
>>
>>   1. Using uint32_t loads and stores.
>>   2. Using uint64_t loads and stores.
>>   3. Generic one with a temporary buffer and memcpy/mempcpy.
>>
>> The 1. and 2. options are selected only if base pointer is aligned
>> to required type.
>>
>> It also fixes BZ#19305 by checking input size against number of
>> elements 1 besides 0.
>>
>> Checked on x86_64-linux-gnu.
>> ---
>>  stdlib/qsort.c | 113 +++++++++++++++++++++++++++++++++++++++++--------
>>  1 file changed, 95 insertions(+), 18 deletions(-)
>>
>> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
>> index 728a0ed370..8a3331fdb4 100644
>> --- a/stdlib/qsort.c
>> +++ b/stdlib/qsort.c
>> @@ -23,20 +23,89 @@
>>  #include <limits.h>
>>  #include <stdlib.h>
>>  #include <string.h>
>> +#include <stdbool.h>
>> +#include <libc-pointer-arith.h>
>>
>> -/* Byte-wise swap two items of size SIZE. */
>> -#define SWAP(a, b, size)                                                     \
>> -  do                                                                         \
>> -    {                                                                        \
>> -      size_t __size = (size);                                                \
>> -      char *__a = (a), *__b = (b);                                           \
>> -      do                                                                     \
>> -       {                                                                     \
>> -         char __tmp = *__a;                                                  \
>> -         *__a++ = *__b;                                                      \
>> -         *__b++ = __tmp;                                                     \
>> -       } while (--__size > 0);                                               \
>> -    } while (0)
>> +/* Swap SIZE bytes between addresses A and B.  These helpers are provided
>> +   along the generic one as an optimization.  */
>> +
>> +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
>> +
>> +#define SWAP_WORDS_64 (swap_func_t)0
>> +#define SWAP_WORDS_32 (swap_func_t)1
>> +#define SWAP_BYTES    (swap_func_t)2
>> +
> 
> Why both making this a function pointer? Why not just make it an enum?

I used the kernel's lib/sort.c as a base, and it supports the caller
passing a function as a wrapper; but since that does not make sense for
this patch, an enum makes more sense.  I will adjust to use it.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 2/6] stdlib: Move insertion sort out qsort
  2023-07-11 23:46   ` Noah Goldstein
  2023-07-12  7:28     ` Andreas Schwab
@ 2023-07-12 20:35     ` Adhemerval Zanella Netto
  1 sibling, 0 replies; 24+ messages in thread
From: Adhemerval Zanella Netto @ 2023-07-12 20:35 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha



On 11/07/23 20:46, Noah Goldstein wrote:
> On Tue, Jul 11, 2023 at 2:09 PM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> ---
>>  stdlib/qsort.c | 100 ++++++++++++++++++++++++++-----------------------
>>  1 file changed, 53 insertions(+), 47 deletions(-)
>>
>> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
>> index 8a3331fdb4..00637208ab 100644
>> --- a/stdlib/qsort.c
>> +++ b/stdlib/qsort.c
>> @@ -153,6 +153,58 @@ typedef struct
>>        smaller partition.  This *guarantees* no more than log (total_elems)
>>        stack size is needed (actually O(1) in this case)!  */
>>
>> +static inline void
>> +insertion_sort (void *const pbase, size_t total_elems, size_t size,
> 
> Maybe a less generic name would be better. Something like:
> "insertion_sort_qsort_partitions" to indicate this is not just standard
> insertion sort but a helper for qsort. Just thinking about a reader
> grepping around in a year or so.

Fair enough, I will change it.

>> +                swap_func_t swap_func,
>> +               __compar_d_fn_t cmp, void *arg)
>> +{
>> +  char *base_ptr = (char *) pbase;
>> +  char *const end_ptr = &base_ptr[size * (total_elems - 1)];
>> +  char *tmp_ptr = base_ptr;
>> +#define min(x, y) ((x) < (y) ? (x) : (y))
>> +  const size_t max_thresh = MAX_THRESH * size;
>> +  char *thresh = min(end_ptr, base_ptr + max_thresh);
>> +  char *run_ptr;
>> +
>> +  /* Find smallest element in first threshold and place it at the
>> +     array's beginning.  This is the smallest array element,
>> +     and the operation speeds up insertion sort's inner loop. */
>> +
>> +  for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
>> +    if (cmp (run_ptr, tmp_ptr, arg) < 0)
>> +      tmp_ptr = run_ptr;
>> +
>> +  if (tmp_ptr != base_ptr)
>> +    do_swap (tmp_ptr, base_ptr, size, swap_func);
>> +
>> +  /* Insertion sort, running from left-hand-side up to right-hand-side.  */
>> +
>> +  run_ptr = base_ptr + size;
>> +  while ((run_ptr += size) <= end_ptr)
>> +    {
>> +      tmp_ptr = run_ptr - size;
>> +      while (cmp (run_ptr, tmp_ptr, arg) < 0)
>> +        tmp_ptr -= size;
>> +
>> +      tmp_ptr += size;
>> +      if (tmp_ptr != run_ptr)
>> +        {
>> +          char *trav;
>> +
>> +          trav = run_ptr + size;
>> +          while (--trav >= run_ptr)
>> +            {
>> +              char c = *trav;
>> +              char *hi, *lo;
>> +
>> +              for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
>> +                *hi = *lo;
>> +              *hi = c;
>> +            }
>> +        }
>> +    }
>> +}
>> +
>>  void
>>  _quicksort (void *const pbase, size_t total_elems, size_t size,
>>             __compar_d_fn_t cmp, void *arg)
>> @@ -275,51 +327,5 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>>       for partitions below MAX_THRESH size. BASE_PTR points to the beginning
>>       of the array to sort, and END_PTR points at the very last element in
>>       the array (*not* one beyond it!). */
>> -
>> -#define min(x, y) ((x) < (y) ? (x) : (y))
>> -
>> -  {
>> -    char *const end_ptr = &base_ptr[size * (total_elems - 1)];
>> -    char *tmp_ptr = base_ptr;
>> -    char *thresh = min(end_ptr, base_ptr + max_thresh);
>> -    char *run_ptr;
>> -
>> -    /* Find smallest element in first threshold and place it at the
>> -       array's beginning.  This is the smallest array element,
>> -       and the operation speeds up insertion sort's inner loop. */
>> -
>> -    for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
>> -      if ((*cmp) ((void *) run_ptr, (void *) tmp_ptr, arg) < 0)
>> -        tmp_ptr = run_ptr;
>> -
>> -    if (tmp_ptr != base_ptr)
>> -      do_swap (tmp_ptr, base_ptr, size, swap_func);
>> -
>> -    /* Insertion sort, running from left-hand-side up to right-hand-side.  */
>> -
>> -    run_ptr = base_ptr + size;
>> -    while ((run_ptr += size) <= end_ptr)
>> -      {
>> -       tmp_ptr = run_ptr - size;
>> -       while ((*cmp) ((void *) run_ptr, (void *) tmp_ptr, arg) < 0)
>> -         tmp_ptr -= size;
>> -
>> -       tmp_ptr += size;
>> -        if (tmp_ptr != run_ptr)
>> -          {
>> -            char *trav;
>> -
>> -           trav = run_ptr + size;
>> -           while (--trav >= run_ptr)
>> -              {
>> -                char c = *trav;
>> -                char *hi, *lo;
>> -
>> -                for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
>> -                  *hi = *lo;
>> -                *hi = c;
>> -              }
>> -          }
>> -      }
>> -  }
>> +  insertion_sort (pbase, total_elems, size, swap_func, cmp, arg);
>>  }
>> --
>> 2.34.1
>>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 3/6] stdlib: qsort: Move some macros to inline function
  2023-07-11 23:44   ` Noah Goldstein
@ 2023-07-12 20:45     ` Adhemerval Zanella Netto
  0 siblings, 0 replies; 24+ messages in thread
From: Adhemerval Zanella Netto @ 2023-07-12 20:45 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha



On 11/07/23 20:44, Noah Goldstein wrote:
> On Tue, Jul 11, 2023 at 2:10 PM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> ---
>>  stdlib/qsort.c | 33 +++++++++++++++++++++++----------
>>  1 file changed, 23 insertions(+), 10 deletions(-)
>>
>> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
>> index 00637208ab..134e784bd1 100644
>> --- a/stdlib/qsort.c
>> +++ b/stdlib/qsort.c
>> @@ -118,15 +118,28 @@ typedef struct
>>      char *hi;
>>    } stack_node;
>>
>> -/* The next 4 #defines implement a very fast in-line stack abstraction. */
>>  /* The stack needs log (total_elements) entries (we could even subtract
>>     log(MAX_THRESH)).  Since total_elements has type size_t, we get as
>>     upper bound for log (total_elements):
>>     bits per byte (CHAR_BIT) * sizeof(size_t).  */
>> -#define STACK_SIZE     (CHAR_BIT * sizeof (size_t))
>> -#define PUSH(low, high)        ((void) ((top->lo = (low)), (top->hi = (high)), ++top))
>> -#define        POP(low, high)  ((void) (--top, (low = top->lo), (high = top->hi)))
>> -#define        STACK_NOT_EMPTY (stack < top)
>> +enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
>> +
>> +static inline stack_node *
>> +push (stack_node *top, char *lo, char *hi)
>> +{
>> +  top->lo = lo;
>> +  top->hi = hi;
>> +  return ++top;
>> +}
>> +
>> +static inline stack_node *
>> +pop (stack_node *top, char **lo, char **hi)
>> +{
>> +  --top;
>> +  *lo = top->lo;
>> +  *hi = top->hi;
>> +  return top;
>> +}
>>
>>
>>  /* Order size using quicksort.  This implementation incorporates
>> @@ -232,9 +245,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>>        stack_node stack[STACK_SIZE];
>>        stack_node *top = stack;
>>
>> -      PUSH (NULL, NULL);
>> +      top = push (top, NULL, NULL);
>>
> You could also drop this push and just set `top = stack + 1;`

Ack.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 4/6] stdlib: Implement introsort with qsort
  2023-07-12  5:39   ` Alexander Monakov
@ 2023-07-12 20:55     ` Adhemerval Zanella Netto
  0 siblings, 0 replies; 24+ messages in thread
From: Adhemerval Zanella Netto @ 2023-07-12 20:55 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: libc-alpha



On 12/07/23 02:39, Alexander Monakov wrote:
> Hi,
> 
> On Tue, 11 Jul 2023, Adhemerval Zanella via Libc-alpha wrote:
> 
>> @@ -226,7 +295,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>>  
>>    const size_t max_thresh = MAX_THRESH * size;
>>  
>> -  if (total_elems == 0)
>> +  if (total_elems <= 0)
> 
> The previous version of the patchset had '<= 1' here, which is more appropriate.
> I am surprised at the change.

Indeed, I am not sure why I have changed it. I will fix it.

> 
>>      /* Avoid lossage with unsigned arithmetic below.  */
>>      return;
>>  
>> @@ -238,6 +307,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>>    else
>>      swap_func = SWAP_BYTES;
>>  
>> +  /* Maximum depth before quicksort switches to heapsort.  */
>> +  size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));
>> +
> 
> This still carries the arithmetic issue I pointed out when this was previously
> posted: the expression in parenthesis is usually negative, making 'depth' huge
> after unsigned wraparound:
> https://inbox.sourceware.org/libc-alpha/c16f6331-c0ce-4362-718c-8b7d8e8cbbe2@linaro.org/

I wrongly added the fix to the next patch in the set [1].  I will move it back to
this patch.

[1] https://patchwork.sourceware.org/project/glibc/patch/20230711190722.4028821-6-adhemerval.zanella@linaro.org/

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305)
  2023-07-11 19:07 ` [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305) Adhemerval Zanella
  2023-07-11 23:40   ` Noah Goldstein
@ 2023-07-12 21:04   ` Paul Eggert
  2023-07-13 11:07     ` Adhemerval Zanella Netto
  1 sibling, 1 reply; 24+ messages in thread
From: Paul Eggert @ 2023-07-12 21:04 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

On 2023-07-11 12:07, Adhemerval Zanella via Libc-alpha wrote:
> +/* Returns true if elements can be copied using word loads and stores.
> +   The SIZE and BASE must be a multiple of the ALIGN.  */
> +__attribute_const__ __always_inline
> +static bool
> +is_aligned (const void *base, size_t size, unsigned char align)
> +{
> +  unsigned char lsbits = (unsigned char) size;
> +
> +  lsbits |= (unsigned char)(uintptr_t) base;
> +  return (lsbits & (align - 1)) == 0;
> +}

Doesn't the following source code generate better machine code? It does 
for me, anyway (x86-64). Even if the code were the same, the simplicity 
would be a win.

   static bool
   is_aligned (const void *base, size_t size, size_t align)
   {
     return (((uintptr_t) base | size) & (align - 1)) == 0;
   }


> +  if (is_aligned (pbase, size, sizeof (uint64_t)))
> +    swap_func = SWAP_WORDS_64;

alignof not sizeof, in contexts like these that are talking about 
alignment not size.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 4/6] stdlib: Implement introsort with qsort
  2023-07-11 19:07 ` [PATCH v4 4/6] stdlib: Implement introsort with qsort Adhemerval Zanella
  2023-07-12  5:39   ` Alexander Monakov
@ 2023-07-12 21:55   ` Paul Eggert
  2023-07-13 11:53     ` Adhemerval Zanella Netto
  1 sibling, 1 reply; 24+ messages in thread
From: Paul Eggert @ 2023-07-12 21:55 UTC (permalink / raw)
  To: Adhemerval Zanella, libc-alpha

On 2023-07-11 12:07, Adhemerval Zanella via Libc-alpha wrote:
> +  /* Used to find parent  */
> +  const unsigned int lsbit = size & -size;

SIZE is of type size_t, so it would be clearer (and maybe safer?) if 
lsbit were size_t too, here and elsewhere.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 5/6] stdlib: Remove use of mergesort on qsort (BZ 21719)
  2023-07-11 19:07 ` [PATCH v4 5/6] stdlib: Remove use of mergesort on qsort (BZ 21719) Adhemerval Zanella
@ 2023-07-12 22:04   ` Paul Eggert
  2023-07-13 11:55     ` Adhemerval Zanella Netto
  0 siblings, 1 reply; 24+ messages in thread
From: Paul Eggert @ 2023-07-12 22:04 UTC (permalink / raw)
  To: Adhemerval Zanella, libc-alpha

On 2023-07-11 12:07, Adhemerval Zanella via Libc-alpha wrote:
> @@ -152,6 +151,7 @@ pop (stack_node *top, char **lo, char **hi, size_t *depth)
>   static inline size_t
>   parent (size_t i, unsigned int lsbit, size_t size)
>   {
> +  /* The below is logically equivalent to 'if (i & lsbit) i -= size'.  */
>     i -= size;
>     i -= size & -(i & lsbit);
>     return i / 2;

Doesn't the comment belong to the patch that introduced this code, not 
to patch 5?

Also, it's not clear which of the three lines the phrase "The below" 
refers to. I assume it refers to the line "i -= size & -(i & lsbit);". 
If so, this should be made clearer, e.g.:

     i -= size;
     i -= size & -(i & lsbit); /* i.e., 'if (i & lsbit) i -= size;' */
     return i / 2;


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305)
  2023-07-12 21:04   ` Paul Eggert
@ 2023-07-13 11:07     ` Adhemerval Zanella Netto
  2023-07-13 13:13       ` Alexander Monakov
  0 siblings, 1 reply; 24+ messages in thread
From: Adhemerval Zanella Netto @ 2023-07-13 11:07 UTC (permalink / raw)
  To: Paul Eggert; +Cc: libc-alpha



On 12/07/23 18:04, Paul Eggert wrote:
> On 2023-07-11 12:07, Adhemerval Zanella via Libc-alpha wrote:
>> +/* Returns true if elements can be copied using word loads and stores.
>> +   The SIZE and BASE must be a multiple of the ALIGN.  */
>> +__attribute_const__ __always_inline
>> +static bool
>> +is_aligned (const void *base, size_t size, unsigned char align)
>> +{
>> +  unsigned char lsbits = (unsigned char) size;
>> +
>> +  lsbits |= (unsigned char)(uintptr_t) base;
>> +  return (lsbits & (align - 1)) == 0;
>> +}
> 
> Doesn't the following source code generate better machine code? It does for me, anyway (x86-64). Even if the code were the same, the simplicity would be a win.
> 
>   static bool
>   is_aligned (const void *base, size_t size, size_t align)
>   {
>     return (((uintptr_t) base | size) & (align - 1)) == 0;
>   }
> 

Ok.

> 
>> +  if (is_aligned (pbase, size, sizeof (uint64_t)))
>> +    swap_func = SWAP_WORDS_64;
> 
> alignof not sizeof, in contexts like these that are talking about alignment not size.

Indeed, I will fix it.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 4/6] stdlib: Implement introsort with qsort
  2023-07-12 21:55   ` Paul Eggert
@ 2023-07-13 11:53     ` Adhemerval Zanella Netto
  0 siblings, 0 replies; 24+ messages in thread
From: Adhemerval Zanella Netto @ 2023-07-13 11:53 UTC (permalink / raw)
  To: Paul Eggert, libc-alpha



On 12/07/23 18:55, Paul Eggert wrote:
> On 2023-07-11 12:07, Adhemerval Zanella via Libc-alpha wrote:
>> +  /* Used to find parent  */
>> +  const unsigned int lsbit = size & -size;
> 
> SIZE is of type size_t, so it would be clearer (and maybe safer?) if lsbit were size_t too, here and elsewhere.

Ack.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 5/6] stdlib: Remove use of mergesort on qsort (BZ 21719)
  2023-07-12 22:04   ` Paul Eggert
@ 2023-07-13 11:55     ` Adhemerval Zanella Netto
  0 siblings, 0 replies; 24+ messages in thread
From: Adhemerval Zanella Netto @ 2023-07-13 11:55 UTC (permalink / raw)
  To: Paul Eggert, libc-alpha



On 12/07/23 19:04, Paul Eggert wrote:
> On 2023-07-11 12:07, Adhemerval Zanella via Libc-alpha wrote:
>> @@ -152,6 +151,7 @@ pop (stack_node *top, char **lo, char **hi, size_t *depth)
>>   static inline size_t
>>   parent (size_t i, unsigned int lsbit, size_t size)
>>   {
>> +  /* The below is logically equivalent to 'if (i & lsbit) i -= size'.  */
>>     i -= size;
>>     i -= size & -(i & lsbit);
>>     return i / 2;
> 
> Doesn't the comment belong to the patch that introduced this code, not to patch 5?

Indeed, I realized it after Alexander's comment.

> 
> Also, it's not clear which of the three lines the phrase "The below" refers to. I assume it refers to the line "i -= size & -(i & lsbit);". If so, this should be made clearer, e.g.:
> 
>     i -= size;
>     i -= size & -(i & lsbit); /* i.e., 'if (i & lsbit) i -= size;' */
>     return i / 2;
> 

Ack.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305)
  2023-07-13 11:07     ` Adhemerval Zanella Netto
@ 2023-07-13 13:13       ` Alexander Monakov
  2023-07-13 13:29         ` Adhemerval Zanella Netto
  0 siblings, 1 reply; 24+ messages in thread
From: Alexander Monakov @ 2023-07-13 13:13 UTC (permalink / raw)
  To: Adhemerval Zanella Netto; +Cc: Paul Eggert, libc-alpha

[-- Attachment #1: Type: text/plain, Size: 662 bytes --]


On Thu, 13 Jul 2023, Adhemerval Zanella Netto via Libc-alpha wrote:

> >> +  if (is_aligned (pbase, size, sizeof (uint64_t)))
> >> +    swap_func = SWAP_WORDS_64;
> > 
> > alignof not sizeof, in contexts like these that are talking about alignment not size.
> 
> Indeed, I will fix it.

Are you going to use GNU __alignof, or C11 _Alignof? One is safe. The other
makes the code go *boom* on 32-bit x86 when 'size' is 4 modulo 8, coming
this || close to a sneaky singular out-of-bounds write, saved only by the
fact that gcc doesn't (yet) do high-level transforms in swap_words_64.

(the code is not, in fact, talking solely about alignment there)

Alexander

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305)
  2023-07-13 13:13       ` Alexander Monakov
@ 2023-07-13 13:29         ` Adhemerval Zanella Netto
  0 siblings, 0 replies; 24+ messages in thread
From: Adhemerval Zanella Netto @ 2023-07-13 13:29 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: Paul Eggert, libc-alpha



On 13/07/23 10:13, Alexander Monakov wrote:
> 
> On Thu, 13 Jul 2023, Adhemerval Zanella Netto via Libc-alpha wrote:
> 
>>>> +  if (is_aligned (pbase, size, sizeof (uint64_t)))
>>>> +    swap_func = SWAP_WORDS_64;
>>>
>>> alignof not sizeof, in contexts like these that are talking about alignment not size.
>>
>> Indeed, I will fix it.
> 
> Are you going to use GNU __alignof, or C11 _Alignof? One is safe. The other
> makes the code go *boom* on 32-bit x86 when 'size' is 4 modulo 8, coming
> this || close to a sneaky singular out-of-bounds write, saved only by the
> fact that gcc doesn't (yet) do high-level transforms in swap_words_64.
> 
> (the code is not, in fact, talking solely about alignment there)

Yes, I noted it fails on i686; I will use an explicit value instead
(as Linux lib/sort.c does).

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2023-07-13 13:29 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-11 19:07 [PATCH v4 0/6] Use introsort for qsort Adhemerval Zanella
2023-07-11 19:07 ` [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305) Adhemerval Zanella
2023-07-11 23:40   ` Noah Goldstein
2023-07-12 19:40     ` Adhemerval Zanella Netto
2023-07-12 21:04   ` Paul Eggert
2023-07-13 11:07     ` Adhemerval Zanella Netto
2023-07-13 13:13       ` Alexander Monakov
2023-07-13 13:29         ` Adhemerval Zanella Netto
2023-07-11 19:07 ` [PATCH v4 2/6] stdlib: Move insertion sort out qsort Adhemerval Zanella
2023-07-11 23:46   ` Noah Goldstein
2023-07-12  7:28     ` Andreas Schwab
2023-07-12 20:35     ` Adhemerval Zanella Netto
2023-07-11 19:07 ` [PATCH v4 3/6] stdlib: qsort: Move some macros to inline function Adhemerval Zanella
2023-07-11 23:44   ` Noah Goldstein
2023-07-12 20:45     ` Adhemerval Zanella Netto
2023-07-11 19:07 ` [PATCH v4 4/6] stdlib: Implement introsort with qsort Adhemerval Zanella
2023-07-12  5:39   ` Alexander Monakov
2023-07-12 20:55     ` Adhemerval Zanella Netto
2023-07-12 21:55   ` Paul Eggert
2023-07-13 11:53     ` Adhemerval Zanella Netto
2023-07-11 19:07 ` [PATCH v4 5/6] stdlib: Remove use of mergesort on qsort (BZ 21719) Adhemerval Zanella
2023-07-12 22:04   ` Paul Eggert
2023-07-13 11:55     ` Adhemerval Zanella Netto
2023-07-11 19:07 ` [PATCH v4 6/6] stdlib: Add more qsort{_r} coverage Adhemerval Zanella

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).