From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <azanella@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1791)
	id 481FF3858000; Tue, 31 Oct 2023 17:23:45 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 481FF3858000
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org;
	s=default; t=1698773025;
	bh=Bptep8X7unJXAmBXZItuz9xxEtGA3VmGW6jQJVgUQXk=;
	h=From:To:Subject:Date:From;
	b=g8eGnctcyPBRS+VVChfqDz0DDjikdH3NUaJMEVqVxJgOLZr4v/ibntDDa8ah+v57z
	 FXE2berlaahufHTi6CiU39Q4UpEm8PodhmE3CteB/J4vx+8vvg3azDL2KHnxPNSL6p
	 NH6RRZPgfvjtQ7IgY/E5TGQMKWK2BFe3mjTdNx3Y=
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Adhemerval Zanella <azanella@sourceware.org>
To: glibc-cvs@sourceware.org
Subject: [glibc] stdlib: Optimization qsort{_r} swap implementation
X-Act-Checkin: glibc
X-Git-Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: fccf38c51746e0817c2409bb361398f9465e0760
X-Git-Newrev: 21d30c774c7f9f5878f0bf9438736c702b0a58a3
Message-Id: <20231031172345.481FF3858000@sourceware.org>
Date: Tue, 31 Oct 2023 17:23:45 +0000 (GMT)
List-Id: <glibc-cvs.sourceware.org>

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=21d30c774c7f9f5878f0bf9438736c702b0a58a3

commit 21d30c774c7f9f5878f0bf9438736c702b0a58a3
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date:   Tue Oct 3 09:22:46 2023 -0300

    stdlib: Optimization qsort{_r} swap implementation
    
    The optimization takes in consideration both the most common elements
    are either 32 or 64 bit in size and inputs are aligned to the word
    boundary.  This is similar to what msort does.
    
    For large buffer the swap operation uses memcpy/mempcpy with a
    small fixed size buffer (so compiler might inline the operations).
    
    Checked on x86_64-linux-gnu.
    Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>

Diff:
---
 stdlib/qsort.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 77 insertions(+), 18 deletions(-)

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 728a0ed370..072ccdfb95 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -21,22 +21,73 @@
 
 #include <alloca.h>
 #include <limits.h>
+#include <memswap.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdbool.h>
 
-/* Byte-wise swap two items of size SIZE. */
-#define SWAP(a, b, size)						      \
-  do									      \
-    {									      \
-      size_t __size = (size);						      \
-      char *__a = (a), *__b = (b);					      \
-      do								      \
-	{								      \
-	  char __tmp = *__a;						      \
-	  *__a++ = *__b;						      \
-	  *__b++ = __tmp;						      \
-	} while (--__size > 0);						      \
-    } while (0)
+/* Swap SIZE bytes between addresses A and B.  These helpers are provided
+   along the generic one as an optimization.  */
+
+enum swap_type_t
+  {
+    SWAP_WORDS_64,
+    SWAP_WORDS_32,
+    SWAP_BYTES
+  };
+
+/* If this function returns true, elements can be safely copied using word
+   loads and stores.  Otherwise, it might not be safe.  BASE (as an integer)
+   must be a multiple of the word alignment.  SIZE must be a multiple of
+   WORDSIZE.  Since WORDSIZE must be a multiple of the word alignment, and
+   WORDSIZE is a power of two on all supported platforms, this function for
+   speed merely checks that BASE and SIZE are both multiples of the word
+   size.  */
+static inline bool
+is_aligned (const void *base, size_t size, size_t wordsize)
+{
+  return (((uintptr_t) base | size) & (wordsize - 1)) == 0;
+}
+
+static inline void
+swap_words_64 (void * restrict a, void * restrict b, size_t n)
+{
+  typedef uint64_t __attribute__ ((__may_alias__)) u64_alias_t;
+  do
+   {
+     n -= 8;
+     u64_alias_t t = *(u64_alias_t *)(a + n);
+     *(u64_alias_t *)(a + n) = *(u64_alias_t *)(b + n);
+     *(u64_alias_t *)(b + n) = t;
+   } while (n);
+}
+
+static inline void
+swap_words_32 (void * restrict a, void * restrict b, size_t n)
+{
+  typedef uint32_t __attribute__ ((__may_alias__)) u32_alias_t;
+  do
+   {
+     n -= 4;
+     u32_alias_t t = *(u32_alias_t *)(a + n);
+     *(u32_alias_t *)(a + n) = *(u32_alias_t *)(b + n);
+     *(u32_alias_t *)(b + n) = t;
+   } while (n);
+}
+
+/* Replace the indirect call with a serie of if statements.  It should help
+   the branch predictor.  */
+static void
+do_swap (void * restrict a, void * restrict b, size_t size,
+	 enum swap_type_t swap_type)
+{
+  if (swap_type == SWAP_WORDS_64)
+    swap_words_64 (a, b, size);
+  else if (swap_type == SWAP_WORDS_32)
+    swap_words_32 (a, b, size);
+  else
+    __memswap (a, b, size);
+}
 
 /* Discontinue quicksort algorithm when partition gets below this size.
    This particular magic number was chosen to work best on a Sun 4/260. */
@@ -96,6 +147,14 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
+  enum swap_type_t swap_type;
+  if (is_aligned (pbase, size, 8))
+    swap_type = SWAP_WORDS_64;
+  else if (is_aligned (pbase, size, 4))
+    swap_type = SWAP_WORDS_32;
+  else
+    swap_type = SWAP_BYTES;
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -119,13 +178,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
 	  char *mid = lo + size * ((hi - lo) / size >> 1);
 
 	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    SWAP (mid, lo, size);
+	    do_swap (mid, lo, size, swap_type);
 	  if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
-	    SWAP (mid, hi, size);
+	    do_swap (mid, hi, size, swap_type);
 	  else
 	    goto jump_over;
 	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    SWAP (mid, lo, size);
+	    do_swap (mid, lo, size, swap_type);
 	jump_over:;
 
 	  left_ptr  = lo + size;
@@ -144,7 +203,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
 
 	      if (left_ptr < right_ptr)
 		{
-		  SWAP (left_ptr, right_ptr, size);
+		  do_swap (left_ptr, right_ptr, size, swap_type);
 		  if (mid == left_ptr)
 		    mid = right_ptr;
 		  else if (mid == right_ptr)
@@ -216,7 +275,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
         tmp_ptr = run_ptr;
 
     if (tmp_ptr != base_ptr)
-      SWAP (tmp_ptr, base_ptr, size);
+      do_swap (tmp_ptr, base_ptr, size, swap_type);
 
     /* Insertion sort, running from left-hand-side up to right-hand-side.  */