[PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305)

public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed

From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
To: libc-alpha@sourceware.org
Subject: [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305)
Date: Tue, 11 Jul 2023 16:07:17 -0300	[thread overview]
Message-ID: <20230711190722.4028821-2-adhemerval.zanella@linaro.org> (raw)
In-Reply-To: <20230711190722.4028821-1-adhemerval.zanella@linaro.org>

It optimizes take in consideration both the most common elements are
either 32 or 64 bit in size and inputs are aligned to the word
boundary.  This is similar to the optimization done on lib/sort.c
from Linux.

This patchs adds an optimized swap operation on qsort based in previous
msort one.  Instead of byte operation, three variants are provided:

  1. Using uint32_t loads and stores.
  2. Using uint64_t loads and stores.
  3. Generic one with a temporary buffer and memcpy/mempcpy.

The 1. and 2. options are selected only if base pointer is aligned
to required type.

It also fixes BZ#19305 by checking input size against number of
elements 1 besides 0.

Checked on x86_64-linux-gnu.
---
 stdlib/qsort.c | 113 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 95 insertions(+), 18 deletions(-)

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 728a0ed370..8a3331fdb4 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -23,20 +23,89 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdbool.h>
+#include <libc-pointer-arith.h>
 
-/* Byte-wise swap two items of size SIZE. */
-#define SWAP(a, b, size)						      \
-  do									      \
-    {									      \
-      size_t __size = (size);						      \
-      char *__a = (a), *__b = (b);					      \
-      do								      \
-	{								      \
-	  char __tmp = *__a;						      \
-	  *__a++ = *__b;						      \
-	  *__b++ = __tmp;						      \
-	} while (--__size > 0);						      \
-    } while (0)
+/* Swap SIZE bytes between addresses A and B.  These helpers are provided
+   along the generic one as an optimization.  */
+
+typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
+
+#define SWAP_WORDS_64 (swap_func_t)0
+#define SWAP_WORDS_32 (swap_func_t)1
+#define SWAP_BYTES    (swap_func_t)2
+
+/* Returns true if elements can be copied using word loads and stores.
+   The SIZE and BASE must be a multiple of the ALIGN.  */
+__attribute_const__ __always_inline
+static bool
+is_aligned (const void *base, size_t size, unsigned char align)
+{
+  unsigned char lsbits = (unsigned char) size;
+
+  lsbits |= (unsigned char)(uintptr_t) base;
+  return (lsbits & (align - 1)) == 0;
+}
+
+static void
+swap_words_64 (void * restrict a, void * restrict b, size_t n)
+{
+  do
+   {
+     n -= 8;
+     uint64_t t = *(uint64_t *)(a + n);
+     *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
+     *(uint64_t *)(b + n) = t;
+   } while (n);
+}
+
+static void
+swap_words_32 (void * restrict a, void * restrict b, size_t n)
+{
+  do
+   {
+     n -= 4;
+     uint32_t t = *(uint32_t *)(a + n);
+     *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
+     *(uint32_t *)(b + n) = t;
+   } while (n);
+}
+
+static void
+swap_bytes (void * restrict a, void * restrict b, size_t n)
+{
+  /* Use multiple small memcpys with constant size to enable inlining
+     on most targets.  */
+  enum { SWAP_GENERIC_SIZE = 32 };
+  unsigned char tmp[SWAP_GENERIC_SIZE];
+  while (n > SWAP_GENERIC_SIZE)
+    {
+      __builtin_memcpy (tmp, a, SWAP_GENERIC_SIZE);
+      a = __mempcpy (a, b, SWAP_GENERIC_SIZE);
+      b = __mempcpy (b, tmp, SWAP_GENERIC_SIZE);
+      n -= SWAP_GENERIC_SIZE;
+    }
+  while (n > 0)
+    {
+      unsigned char t = ((unsigned char *)a)[--n];
+      ((unsigned char *)a)[n] = ((unsigned char *)b)[n];
+      ((unsigned char *)b)[n] = t;
+    }
+}
+
+/* Replace the indirect call with a serie of if statements.  It should help
+   the branch predictor.  */
+static void
+do_swap (void * restrict a, void * restrict b, size_t size,
+	 swap_func_t swap_func)
+{
+  if (swap_func == SWAP_WORDS_64)
+    swap_words_64 (a, b, size);
+  else if (swap_func == SWAP_WORDS_32)
+    swap_words_32 (a, b, size);
+  else
+    swap_bytes (a, b, size);
+}
 
 /* Discontinue quicksort algorithm when partition gets below this size.
    This particular magic number was chosen to work best on a Sun 4/260. */
@@ -96,6 +165,14 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
+  swap_func_t swap_func;
+  if (is_aligned (pbase, size, sizeof (uint64_t)))
+    swap_func = SWAP_WORDS_64;
+  else if (is_aligned (pbase, size, sizeof (uint32_t)))
+    swap_func = SWAP_WORDS_32;
+  else
+    swap_func = SWAP_BYTES;
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -119,13 +196,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
 	  char *mid = lo + size * ((hi - lo) / size >> 1);
 
 	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    SWAP (mid, lo, size);
+	    do_swap (mid, lo, size, swap_func);
 	  if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
-	    SWAP (mid, hi, size);
+	    do_swap (mid, hi, size, swap_func);
 	  else
 	    goto jump_over;
 	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    SWAP (mid, lo, size);
+	    do_swap (mid, lo, size, swap_func);
 	jump_over:;
 
 	  left_ptr  = lo + size;
@@ -144,7 +221,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
 
 	      if (left_ptr < right_ptr)
 		{
-		  SWAP (left_ptr, right_ptr, size);
+		  do_swap (left_ptr, right_ptr, size, swap_func);
 		  if (mid == left_ptr)
 		    mid = right_ptr;
 		  else if (mid == right_ptr)
@@ -216,7 +293,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
         tmp_ptr = run_ptr;
 
     if (tmp_ptr != base_ptr)
-      SWAP (tmp_ptr, base_ptr, size);
+      do_swap (tmp_ptr, base_ptr, size, swap_func);
 
     /* Insertion sort, running from left-hand-side up to right-hand-side.  */
 
-- 
2.34.1

next prev parent reply	other threads:[~2023-07-11 19:07 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-07-11 19:07 [PATCH v4 0/6] Use introsort for qsort Adhemerval Zanella
2023-07-11 19:07 ` Adhemerval Zanella [this message]
2023-07-11 23:40   ` [PATCH v4 1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305) Noah Goldstein
2023-07-12 19:40     ` Adhemerval Zanella Netto
2023-07-12 21:04   ` Paul Eggert
2023-07-13 11:07     ` Adhemerval Zanella Netto
2023-07-13 13:13       ` Alexander Monakov
2023-07-13 13:29         ` Adhemerval Zanella Netto
2023-07-11 19:07 ` [PATCH v4 2/6] stdlib: Move insertion sort out qsort Adhemerval Zanella
2023-07-11 23:46   ` Noah Goldstein
2023-07-12  7:28     ` Andreas Schwab
2023-07-12 20:35     ` Adhemerval Zanella Netto
2023-07-11 19:07 ` [PATCH v4 3/6] stdlib: qsort: Move some macros to inline function Adhemerval Zanella
2023-07-11 23:44   ` Noah Goldstein
2023-07-12 20:45     ` Adhemerval Zanella Netto
2023-07-11 19:07 ` [PATCH v4 4/6] stdlib: Implement introsort with qsort Adhemerval Zanella
2023-07-12  5:39   ` Alexander Monakov
2023-07-12 20:55     ` Adhemerval Zanella Netto
2023-07-12 21:55   ` Paul Eggert
2023-07-13 11:53     ` Adhemerval Zanella Netto
2023-07-11 19:07 ` [PATCH v4 5/6] stdlib: Remove use of mergesort on qsort (BZ 21719) Adhemerval Zanella
2023-07-12 22:04   ` Paul Eggert
2023-07-13 11:55     ` Adhemerval Zanella Netto
2023-07-11 19:07 ` [PATCH v4 6/6] stdlib: Add more qsort{_r} coverage Adhemerval Zanella

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230711190722.4028821-2-adhemerval.zanella@linaro.org \
    --to=adhemerval.zanella@linaro.org \
    --cc=libc-alpha@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).