public inbox for glibc-cvs@sourceware.org
help / color / mirror / Atom feed
* [glibc/azanella/qsort-fixes] stdlib: Implement introsort with qsort
@ 2021-09-06 18:22 Adhemerval Zanella
  0 siblings, 0 replies; 2+ messages in thread
From: Adhemerval Zanella @ 2021-09-06 18:22 UTC (permalink / raw)
  To: glibc-cvs

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=407bd69b3403b3f9f973fa38330c838e24396d32

commit 407bd69b3403b3f9f973fa38330c838e24396d32
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date:   Thu Sep 2 16:25:23 2021 -0300

    stdlib: Implement introsort with qsort
    
    This patch adds a introsort implementation on qsort to avoid worse-case
    performance of quicksort to O(nlog n).  The heapsort fallback used is a
    heapsort based on Linux implementation (commit 22a241ccb2c19962a).  As a
    side note the introsort implementation is similar the one used on
    libstdc++ for std::sort.
    
    Checked on x86_64-linux-gnu.

Diff:
---
 stdlib/qsort.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 87 insertions(+), 7 deletions(-)

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 60ca3b48b0..28824cd8bb 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -112,6 +112,7 @@ typedef struct
   {
     char *lo;
     char *hi;
+    size_t depth;
   } stack_node;
 
 /* The stack needs log (total_elements) entries (we could even subtract
@@ -121,23 +122,92 @@ typedef struct
 enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
 
 static inline stack_node *
-push (stack_node *top, char *lo, char *hi)
+push (stack_node *top, char *lo, char *hi, size_t depth)
 {
   top->lo = lo;
   top->hi = hi;
+  top->depth = depth;
   return ++top;
 }
 
 static inline stack_node *
-pop (stack_node *top, char **lo, char **hi)
+pop (stack_node *top, char **lo, char **hi, size_t *depth)
 {
   --top;
   *lo = top->lo;
   *hi = top->hi;
+  *depth = top->depth;
   return top;
 }
 
 
+/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux
+   lib/sort.c.  Used on introsort implementation as a fallback routine with
+   worst-case performance of O(nlog n) and worst-case space complexity of
+   O(1).  */
+
+static inline size_t
+parent (size_t i, unsigned int lsbit, size_t size)
+{
+  i -= size;
+  i -= size & -(i & lsbit);
+  return i / 2;
+}
+
+static void
+heapsort_r (void *base, void *end, size_t size, swap_func_t swap_func,
+	    __compar_d_fn_t cmp, void *arg)
+{
+  size_t num = ((uintptr_t) end - (uintptr_t) base) / size;
+  size_t n = num * size, a = (num/2) * size;
+  /* Used to find parent  */
+  const unsigned int lsbit = size & -size;
+
+  /* num < 2 || size == 0.  */
+  if (a == 0)
+    return;
+
+  for (;;)
+    {
+      size_t b, c, d;
+
+      if (a != 0)
+	/* Building heap: sift down --a */
+	a -= size;
+      else if (n -= size)
+	/* Sorting: Extract root to --n */
+	do_swap (base, base + n, size, swap_func);
+      else
+	break;
+
+      /* Sift element at "a" down into heap.  This is the "bottom-up" variant,
+	 which significantly reduces calls to cmp_func(): we find the sift-down
+	 path all the way to the leaves (one compare per level), then backtrack
+	 to find where to insert the target element.
+
+	 Because elements tend to sift down close to the leaves, this uses fewer
+	 compares than doing two per level on the way down.  (A bit more than
+	 half as many on average, 3/4 worst-case.).  */
+      for (b = a; c = 2 * b + size, (d = c + size) < n;)
+	b = cmp (base + c, base + d, arg) >= 0 ? c : d;
+      if (d == n)
+	/* Special case last leaf with no sibling.  */
+	b = c;
+
+      /* Now backtrack from "b" to the correct location for "a".  */
+      while (b != a && cmp (base + a, base + b, arg) >= 0)
+	b = parent (b, lsbit, size);
+      /* Where "a" belongs.  */
+      c = b;
+      while (b != a)
+	{
+	  /* Shift it into place.  */
+	  b = parent (b, lsbit, size);
+          do_swap (base + b, base + c, size, swap_func);
+        }
+    }
+}
+
 /* Order size using quicksort.  This implementation incorporates
    four optimizations discussed in Sedgewick:
 
@@ -222,7 +292,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
 
   const size_t max_thresh = MAX_THRESH * size;
 
-  if (total_elems == 0)
+  if (total_elems <= 1)
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
@@ -234,6 +304,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
   else
     swap_func = SWAP_BYTES;
 
+  /* Maximum depth before quicksort switches to heapsort.  */
+  size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -241,10 +314,17 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
       stack_node stack[STACK_SIZE];
       stack_node *top = stack;
 
-      top = push (top, NULL, NULL);
+      top = push (top, NULL, NULL, depth);
 
       while (stack < top)
         {
+	  if (depth == 0)
+	    {
+	      heapsort_r (lo, hi, size, swap_func, cmp, arg);
+              top = pop (top, &lo, &hi, &depth);
+	      continue;
+	    }
+
           char *left_ptr;
           char *right_ptr;
 
@@ -308,7 +388,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
             {
               if ((size_t) (hi - left_ptr) <= max_thresh)
 		/* Ignore both small partitions. */
-		top = pop (top, &lo, &hi);
+		top = pop (top, &lo, &hi, &depth);
               else
 		/* Ignore small left partition. */
                 lo = left_ptr;
@@ -319,13 +399,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
           else if ((right_ptr - lo) > (hi - left_ptr))
             {
 	      /* Push larger left partition indices. */
-	      top = push (top, lo, right_ptr);
+	      top = push (top, lo, right_ptr, depth - 1);
               lo = left_ptr;
             }
           else
             {
 	      /* Push larger right partition indices. */
-	      top = push (top, left_ptr, hi);
+	      top = push (top, left_ptr, hi, depth - 1);
               hi = right_ptr;
             }
         }


^ permalink raw reply	[flat|nested] 2+ messages in thread

* [glibc/azanella/qsort-fixes] stdlib: Implement introsort with qsort
@ 2021-09-06 18:45 Adhemerval Zanella
  0 siblings, 0 replies; 2+ messages in thread
From: Adhemerval Zanella @ 2021-09-06 18:45 UTC (permalink / raw)
  To: glibc-cvs

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=57788bd80c352d972e678fdcc00e758f7b03ee92

commit 57788bd80c352d972e678fdcc00e758f7b03ee92
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date:   Thu Sep 2 16:25:23 2021 -0300

    stdlib: Implement introsort with qsort
    
    This patch adds a introsort implementation on qsort to avoid worse-case
    performance of quicksort to O(nlog n).  The heapsort fallback used is a
    heapsort based on Linux implementation (commit 22a241ccb2c19962a).  As a
    side note the introsort implementation is similar the one used on
    libstdc++ for std::sort.
    
    Checked on x86_64-linux-gnu.

Diff:
---
 stdlib/qsort.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 88 insertions(+), 7 deletions(-)

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 60ca3b48b0..f00f6054dd 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -112,6 +112,7 @@ typedef struct
   {
     char *lo;
     char *hi;
+    size_t depth;
   } stack_node;
 
 /* The stack needs log (total_elements) entries (we could even subtract
@@ -121,23 +122,92 @@ typedef struct
 enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
 
 static inline stack_node *
-push (stack_node *top, char *lo, char *hi)
+push (stack_node *top, char *lo, char *hi, size_t depth)
 {
   top->lo = lo;
   top->hi = hi;
+  top->depth = depth;
   return ++top;
 }
 
 static inline stack_node *
-pop (stack_node *top, char **lo, char **hi)
+pop (stack_node *top, char **lo, char **hi, size_t *depth)
 {
   --top;
   *lo = top->lo;
   *hi = top->hi;
+  *depth = top->depth;
   return top;
 }
 
 
+/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux
+   lib/sort.c.  Used on introsort implementation as a fallback routine with
+   worst-case performance of O(nlog n) and worst-case space complexity of
+   O(1).  */
+
+static inline size_t
+parent (size_t i, unsigned int lsbit, size_t size)
+{
+  i -= size;
+  i -= size & -(i & lsbit);
+  return i / 2;
+}
+
+static void
+heapsort_r (void *base, void *end, size_t size, swap_func_t swap_func,
+	    __compar_d_fn_t cmp, void *arg)
+{
+  size_t num = ((uintptr_t) end - (uintptr_t) base) / size;
+  size_t n = num * size, a = (num/2) * size;
+  /* Used to find parent  */
+  const unsigned int lsbit = size & -size;
+
+  /* num < 2 || size == 0.  */
+  if (a == 0)
+    return;
+
+  for (;;)
+    {
+      size_t b, c, d;
+
+      if (a != 0)
+	/* Building heap: sift down --a */
+	a -= size;
+      else if (n -= size)
+	/* Sorting: Extract root to --n */
+	do_swap (base, base + n, size, swap_func);
+      else
+	break;
+
+      /* Sift element at "a" down into heap.  This is the "bottom-up" variant,
+	 which significantly reduces calls to cmp_func(): we find the sift-down
+	 path all the way to the leaves (one compare per level), then backtrack
+	 to find where to insert the target element.
+
+	 Because elements tend to sift down close to the leaves, this uses fewer
+	 compares than doing two per level on the way down.  (A bit more than
+	 half as many on average, 3/4 worst-case.).  */
+      for (b = a; c = 2 * b + size, (d = c + size) < n;)
+	b = cmp (base + c, base + d, arg) >= 0 ? c : d;
+      if (d == n)
+	/* Special case last leaf with no sibling.  */
+	b = c;
+
+      /* Now backtrack from "b" to the correct location for "a".  */
+      while (b != a && cmp (base + a, base + b, arg) >= 0)
+	b = parent (b, lsbit, size);
+      /* Where "a" belongs.  */
+      c = b;
+      while (b != a)
+	{
+	  /* Shift it into place.  */
+	  b = parent (b, lsbit, size);
+          do_swap (base + b, base + c, size, swap_func);
+        }
+    }
+}
+
 /* Order size using quicksort.  This implementation incorporates
    four optimizations discussed in Sedgewick:
 
@@ -222,7 +292,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
 
   const size_t max_thresh = MAX_THRESH * size;
 
-  if (total_elems == 0)
+  if (total_elems <= 1)
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
@@ -234,6 +304,10 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
   else
     swap_func = SWAP_BYTES;
 
+  /* Maximum depth before quicksort switches to heapsort.  */
+  size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1
+		      - __builtin_clzl (total_elems));
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -241,10 +315,17 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
       stack_node stack[STACK_SIZE];
       stack_node *top = stack;
 
-      top = push (top, NULL, NULL);
+      top = push (top, NULL, NULL, depth);
 
       while (stack < top)
         {
+	  if (depth == 0)
+	    {
+	      heapsort_r (lo, hi, size, swap_func, cmp, arg);
+              top = pop (top, &lo, &hi, &depth);
+	      continue;
+	    }
+
           char *left_ptr;
           char *right_ptr;
 
@@ -308,7 +389,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
             {
               if ((size_t) (hi - left_ptr) <= max_thresh)
 		/* Ignore both small partitions. */
-		top = pop (top, &lo, &hi);
+		top = pop (top, &lo, &hi, &depth);
               else
 		/* Ignore small left partition. */
                 lo = left_ptr;
@@ -319,13 +400,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
           else if ((right_ptr - lo) > (hi - left_ptr))
             {
 	      /* Push larger left partition indices. */
-	      top = push (top, lo, right_ptr);
+	      top = push (top, lo, right_ptr, depth - 1);
               lo = left_ptr;
             }
           else
             {
 	      /* Push larger right partition indices. */
-	      top = push (top, left_ptr, hi);
+	      top = push (top, left_ptr, hi, depth - 1);
               hi = right_ptr;
             }
         }


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2021-09-06 18:45 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-06 18:22 [glibc/azanella/qsort-fixes] stdlib: Implement introsort with qsort Adhemerval Zanella
2021-09-06 18:45 Adhemerval Zanella

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).