From: Noah Goldstein
Date: Tue, 12 Oct 2021 23:53:07 -0400
Subject: Re: [PATCH v3 6/7] stdlib: Implement introsort with qsort
To: Adhemerval Zanella
Cc: GNU C Library
In-Reply-To: <20210903171144.952737-7-adhemerval.zanella@linaro.org>
References: <20210903171144.952737-1-adhemerval.zanella@linaro.org>
 <20210903171144.952737-7-adhemerval.zanella@linaro.org>
List-Id: Libc-alpha mailing list

On Fri, Sep 3, 2021 at 1:16 PM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:

> This patch adds an introsort implementation to qsort to bound the
> worst-case performance of quicksort to O(n log n).  The heapsort
> fallback used is based on the Linux kernel implementation (commit
> 22a241ccb2c19962a).  As a side note, the introsort implementation is
> similar to the one used by libstdc++ for std::sort.
>
> Checked on x86_64-linux-gnu.
> ---
>  stdlib/qsort.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 87 insertions(+), 7 deletions(-)
>
> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> index 5df640362d..8368576aae 100644
> --- a/stdlib/qsort.c
> +++ b/stdlib/qsort.c
> @@ -113,6 +113,7 @@ typedef struct
>  {
>    char *lo;
>    char *hi;
> +  size_t depth;

Why do a depth tracker per stack_node as opposed to one for the entire
call?
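For context, the recursive formulation I'm comparing against looks
roughly like the toy below (untested, sorts ints, plain qsort stands in
for the heapsort fallback; none of this is the patch's code).  There the
depth budget is an argument of each subcall, and the question is whether
the iterative version needs to mirror that per stack_node or whether one
counter for the whole call suffices:

/* Toy introsort: each subproblem carries its own remaining depth.  */
#include <stdlib.h>

static int
cmp_int (const void *a, const void *b)
{
  return (*(const int *) a > *(const int *) b)
         - (*(const int *) a < *(const int *) b);
}

static void
introsort_ints (int *a, size_t n, size_t depth)
{
  if (n < 2)
    return;
  if (depth == 0)
    {
      /* Depth budget exhausted: fall back to an O(n log n) sort.  */
      qsort (a, n, sizeof (int), cmp_int);
      return;
    }

  /* Hoare partition around a[0]; guarantees 0 <= j <= n - 2, so both
     sides shrink.  */
  int pivot = a[0];
  long i = -1, j = (long) n;
  for (;;)
    {
      do i++; while (a[i] < pivot);
      do j--; while (a[j] > pivot);
      if (i >= j)
        break;
      int tmp = a[i]; a[i] = a[j]; a[j] = tmp;
    }

  /* Each child owns depth - 1; a single per-call counter would see
     whatever value is current at pop time, not the value a deferred
     partition was pushed with.  */
  introsort_ints (a, (size_t) j + 1, depth - 1);
  introsort_ints (a + j + 1, n - (size_t) j - 1, depth - 1);
}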
>  } stack_node;
>
>  /* The stack needs log (total_elements) entries (we could even subtract
> @@ -122,23 +123,92 @@ typedef struct
>  enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
>
>  static inline stack_node *
> -push (stack_node *top, char *lo, char *hi)
> +push (stack_node *top, char *lo, char *hi, size_t depth)
>  {
>    top->lo = lo;
>    top->hi = hi;
> +  top->depth = depth;
>    return ++top;
>  }
>
>  static inline stack_node *
> -pop (stack_node *top, char **lo, char **hi)
> +pop (stack_node *top, char **lo, char **hi, size_t *depth)
>  {
>    --top;
>    *lo = top->lo;
>    *hi = top->hi;
> +  *depth = top->depth;
>    return top;
>  }
>
>
> +/* A fast, small, non-recursive O(n log n) heapsort, adapted from Linux
> +   lib/sort.c.  Used in the introsort implementation as a fallback
> +   routine, with worst-case time complexity of O(n log n) and worst-case
> +   space complexity of O(1).  */
> +
> +static inline size_t
> +parent (size_t i, unsigned int lsbit, size_t size)
> +{
> +  i -= size;
> +  i -= size & -(i & lsbit);
> +  return i / 2;
> +}
> +
> +static void
> +heapsort_r (void *base, void *end, size_t size, swap_func_t swap_func,
> +            __compar_d_fn_t cmp, void *arg)
> +{
> +  size_t num = ((uintptr_t) end - (uintptr_t) base) / size;
> +  size_t n = num * size, a = (num / 2) * size;
> +  /* Used to find parent.  */
> +  const unsigned int lsbit = size & -size;
> +
> +  /* num < 2 || size == 0.  */
> +  if (a == 0)
> +    return;
> +
> +  for (;;)
> +    {
> +      size_t b, c, d;
> +
> +      if (a != 0)
> +        /* Building heap: sift down --a.  */
> +        a -= size;
> +      else if (n -= size)
> +        /* Sorting: extract root to --n.  */
> +        do_swap (base, base + n, size, swap_func);
> +      else
> +        break;
> +
> +      /* Sift element at "a" down into heap.  This is the "bottom-up"
> +         variant, which significantly reduces calls to cmp (): we find
> +         the sift-down path all the way to the leaves (one compare per
> +         level), then backtrack to find where to insert the target
> +         element.
> +
> +         Because elements tend to sift down close to the leaves, this
> +         uses fewer compares than doing two per level on the way down.
> +         (A bit more than half as many on average, 3/4 worst-case.)  */
> +      for (b = a; c = 2 * b + size, (d = c + size) < n;)
> +        b = cmp (base + c, base + d, arg) >= 0 ? c : d;
> +      if (d == n)
> +        /* Special case last leaf with no sibling.  */
> +        b = c;
> +
> +      /* Now backtrack from "b" to the correct location for "a".  */
> +      while (b != a && cmp (base + a, base + b, arg) >= 0)
> +        b = parent (b, lsbit, size);
> +      /* Where "a" belongs.  */
> +      c = b;
> +      while (b != a)
> +        {
> +          /* Shift it into place.  */
> +          b = parent (b, lsbit, size);
> +          do_swap (base + b, base + c, size, swap_func);
> +        }
> +    }
> +}
> +
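The parent () arithmetic took me a minute, so spelling it out for the
archive (my own check harness, not part of the patch): nodes live at
byte offsets that are multiples of size, the children of the node at
offset b sit at 2 * b + size and 2 * b + 2 * size, and lsbit tells a
left child apart (even multiple of size after subtracting size) from a
right one (odd multiple):

/* Standalone check of the parent () offset math for a non-power-of-two
   element size; parent () is copied from the patch.  */
#include <assert.h>
#include <stddef.h>

static size_t
parent (size_t i, unsigned int lsbit, size_t size)
{
  i -= size;
  i -= size & -(i & lsbit);
  return i / 2;
}

int
main (void)
{
  const size_t size = 12;                   /* element size (12 = 4 * 3) */
  const unsigned int lsbit = size & -size;  /* lowest set bit: 4 */
  for (size_t k = 0; k < 1000; k++)
    {
      size_t b = k * size;       /* offset of node k */
      size_t c = 2 * b + size;   /* left child, node 2k + 1 */
      size_t d = c + size;       /* right child, node 2k + 2 */
      assert (parent (c, lsbit, size) == b);
      assert (parent (d, lsbit, size) == b);
    }
  return 0;
}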
>  /* Order size using quicksort.  This implementation incorporates
>     four optimizations discussed in Sedgewick:
>
> @@ -223,7 +293,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>
>    const size_t max_thresh = MAX_THRESH * size;
>
> -  if (total_elems == 0)
> +  if (total_elems <= 1)
>      /* Avoid lossage with unsigned arithmetic below.  */
>      return;
>
> @@ -235,6 +305,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>    else
>      swap_func = SWAP_BYTES;
>
> +  /* Maximum depth before quicksort switches to heapsort.  */
> +  size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1
> +                      - __builtin_clzl (total_elems));
> +
>    if (total_elems > MAX_THRESH)
>      {
>        char *lo = base_ptr;
> @@ -242,10 +315,17 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>        stack_node stack[STACK_SIZE];
>        stack_node *top = stack;
>
> -      top = push (top, NULL, NULL);
> +      top = push (top, NULL, NULL, depth);
>
>        while (stack < top)
>          {
> +          if (depth == 0)
> +            {
> +              heapsort_r (lo, hi, size, swap_func, cmp, arg);
> +              top = pop (top, &lo, &hi, &depth);
> +              continue;
> +            }
> +
>            char *left_ptr;
>            char *right_ptr;
>
> @@ -309,7 +389,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>              {
>                if ((size_t) (hi - left_ptr) <= max_thresh)
>                  /* Ignore both small partitions. */
> -                top = pop (top, &lo, &hi);
> +                top = pop (top, &lo, &hi, &depth);
>                else
>                  /* Ignore small left partition. */
>                  lo = left_ptr;
> @@ -320,13 +400,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>            else if ((right_ptr - lo) > (hi - left_ptr))

Since we now have a depth counter, is it still faster to select the
bigger region to push onto the stack, or can we remove this branch and
just pick a fixed direction?  We should be able to bound the size of the
stack structure with the depth limit.  A sketch of what I mean is at the
bottom of this mail.

>              {
>                /* Push larger left partition indices. */
> -              top = push (top, lo, right_ptr);
> +              top = push (top, lo, right_ptr, depth - 1);
>                lo = left_ptr;
>              }
>            else
>              {
>                /* Push larger right partition indices. */
> -              top = push (top, left_ptr, hi);
> +              top = push (top, left_ptr, hi, depth - 1);
>                hi = right_ptr;
>              }
>          }
> --
> 2.30.2
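Concretely, the direction-free variant I have in mind looks like this
(untested fragment reusing the patch's push/pop/heapsort_r; the
small-partition handling is elided, so it is a sketch rather than a
drop-in replacement).  Since each level consumes one unit of depth and
pushing stops once depth reaches 0, the stack can never hold more than
the initial budget plus the sentinel, so it can be sized from the word
size alone:

  /* At most 2 * log2 (SIZE_MAX) live entries, no push-larger needed.  */
  stack_node stack[2 * CHAR_BIT * sizeof (size_t)];
  stack_node *top = push (stack, NULL, NULL, depth);

  while (stack < top)
    {
      if (depth == 0)
        {
          heapsort_r (lo, hi, size, swap_func, cmp, arg);
          top = pop (top, &lo, &hi, &depth);
          continue;
        }

      /* ... partition [lo, hi] into [lo, right_ptr] and
         [left_ptr, hi] as before ... */

      /* Unconditionally defer the right partition and iterate on the
         left; both children get depth - 1, no size comparison.  */
      top = push (top, left_ptr, hi, depth - 1);
      hi = right_ptr;
      depth--;
    }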