public inbox for libc-alpha@sourceware.org
* [PATCH v1 1/5] benchtests: Improve bench-strrchr
@ 2022-04-21  3:14 Noah Goldstein
  2022-04-21  3:14 ` [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
                   ` (7 more replies)
  0 siblings, 8 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21  3:14 UTC (permalink / raw)
  To: libc-alpha

1. Use json-lib for printing results.
2. Expose all parameters (before, pos, seek_char, and max_char were
   not printed).
3. Add benchmarks that test multiple occurrences of seek_char in the
   string (see the sketch below).
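
As a rough illustration of the new freq parameter, here is a
hypothetical standalone sketch (not the benchmark code itself; the
real do_test works on the bench-string.h buffers and also randomizes
the filler characters).  It spreads freq evenly spaced copies of
seek_char over the first pos characters, with the final copy at pos:

#include <stdio.h>
#include <string.h>

/* Standalone sketch of the placement logic added to do_test: put
   `freq' evenly spaced copies of seek_char in the first `pos'
   characters of the string.  */
static void
fill_with_freq (char *buf, size_t len, size_t pos, char seek_char,
		size_t freq)
{
  size_t i, chunk = freq ? pos / freq : pos;
  memset (buf, 'a', len);
  if (chunk == 0 && pos)
    chunk = 1;
  for (i = chunk; i < pos && i < len; i += chunk)
    buf[i] = seek_char;		/* Earlier occurrences...  */
  if (pos < len)
    buf[pos] = seek_char;	/* ...and the final one strrchr returns.  */
  buf[len] = '\0';
}

int
main (void)
{
  char buf[65];
  fill_with_freq (buf, 64, 32, '#', 4);
  puts (buf);			/* '#' roughly every pos/freq characters.  */
  return 0;
}
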
---
 benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
 1 file changed, 82 insertions(+), 44 deletions(-)

diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
index abdae60c51..cceea77e1b 100644
--- a/benchtests/bench-strrchr.c
+++ b/benchtests/bench-strrchr.c
@@ -23,6 +23,7 @@
 # define TEST_NAME "strrchr"
 #endif
 #include "bench-string.h"
+#include "json-lib.h"
 
 #define BIG_CHAR MAX_CHAR
 
@@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
 }
 
 static void
-do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
+	     CHAR *exp_res)
 {
   CHAR *res = CALL (impl, s, c);
   size_t i, iters = INNER_LOOP_ITERS8;
@@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
 
   if (res != exp_res)
     {
-      error (0, 0, "Wrong result in function %s %p %p", impl->name,
-	     res, exp_res);
+      error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
+	     exp_res);
       ret = 1;
       return;
     }
@@ -72,24 +74,28 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
     {
       CALL (impl, s, c);
     }
-  TIMING_NOW (stop);
 
+  TIMING_NOW (stop);
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
+  return;
 }
 
 static void
-do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
+	 int seek_char, int max_char, size_t freq)
 /* For wcsrchr: align here means align not in bytes,
    but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
    len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
 {
   size_t i;
+  size_t pos_chunk_sz = freq ? (pos / freq) : pos;
+  size_t last_pos = len;
   CHAR *result;
   CHAR *buf = (CHAR *) buf1;
 
-  align &= 7;
+  align &= (getpagesize () - 1);
   if ((align + len) * sizeof (CHAR) >= page_size)
     return;
 
@@ -103,6 +109,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
       if ((i > pos || pos >= len) && buf[align + i] == seek_char)
 	buf[align + i] = seek_char + 10 + (random () & 15);
     }
+
+  if (pos_chunk_sz == 0 && pos)
+    pos_chunk_sz = 1;
+
+  for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
+    {
+      buf[align + i] = seek_char;
+      last_pos = i;
+    }
+
   buf[align + len] = 0;
 
   if (pos < len)
@@ -110,66 +126,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
       buf[align + pos] = seek_char;
       result = (CHAR *) (buf + align + pos);
     }
+  else if (last_pos < len)
+    result = (CHAR *) (buf + align + last_pos);
   else if (seek_char == 0)
     result = (CHAR *) (buf + align + len);
   else
     result = NULL;
 
-  printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "len", len);
+  json_attr_uint (json_ctx, "pos", pos);
+  json_attr_uint (json_ctx, "align", align);
+  json_attr_uint (json_ctx, "freq", freq);
+  json_attr_uint (json_ctx, "seek", seek_char);
+  json_attr_uint (json_ctx, "max_char", max_char);
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
+    do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
-  size_t i;
+  json_ctx_t json_ctx;
+  size_t i, j;
+  int seek;
 
   test_init ();
+  json_init (&json_ctx, 0, stdout);
 
-  printf ("%20s", "");
-  FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
 
-  for (i = 1; i < 8; ++i)
-    {
-      do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
-      do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
-    }
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
 
-  for (i = 1; i < 8; ++i)
-    {
-      do_test (i, 64, 256, 23, SMALL_CHAR);
-      do_test (i, 64, 256, 23, BIG_CHAR);
-    }
-
-  for (i = 0; i < 32; ++i)
-    {
-      do_test (0, i, i + 1, 23, SMALL_CHAR);
-      do_test (0, i, i + 1, 23, BIG_CHAR);
-    }
+  json_array_begin (&json_ctx, "ifuncs");
+  FOR_EACH_IMPL (impl, 0)
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
 
-  for (i = 1; i < 8; ++i)
-    {
-      do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
-      do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
-    }
+  json_array_begin (&json_ctx, "results");
 
-  for (i = 1; i < 8; ++i)
+  for (seek = 0; seek <= 23; seek += 23)
     {
-      do_test (i, 64, 256, 0, SMALL_CHAR);
-      do_test (i, 64, 256, 0, BIG_CHAR);
+      for (j = 1; j < 32; j += j)
+	{
+	  for (i = 1; i < 9; ++i)
+	    {
+	      do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
+	    }
+
+	  for (i = 1; i < 8; ++i)
+	    {
+	      do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
+
+	      do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
+	    }
+
+	  for (i = 0; i < 32; ++i)
+	    {
+	      do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
+	    }
+	  if (seek == 0)
+	    {
+	      break;
+	    }
+	}
     }
 
-  for (i = 0; i < 32; ++i)
-    {
-      do_test (0, i, i + 1, 0, SMALL_CHAR);
-      do_test (0, i, i + 1, 0, BIG_CHAR);
-    }
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
 
   return ret;
 }
-- 
2.25.1


* [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2
  2022-04-21  3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
@ 2022-04-21  3:14 ` Noah Goldstein
  2022-04-21 20:26   ` H.J. Lu
  2022-04-21  3:14 ` [PATCH v1 3/5] x86: Add wcsrchr optimized with SSE4_1 in wcsrchr-sse4_1.S Noah Goldstein
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21  3:14 UTC (permalink / raw)
  To: libc-alpha

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
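
At a high level the new loop structure looks roughly like the
following C model.  This is illustrative only (names such as
strrchr_model and BLK are made up for the sketch, and it assumes the
buffer stays readable up to the next 2*BLK boundary, which the real
code's aligned 16-byte loads guarantee):

#include <stddef.h>

#define BLK 16	/* stands in for one 16-byte vector */

/* Two blocks are handled per iteration with a single combined exit
   test, and search-CHAR matches are only fully resolved once the
   null terminator has been seen.  */
static char *
strrchr_model (const char *s, int c)
{
  const char *last_blk = NULL;	/* start of last block with a match */
  unsigned last_cmask = 0;	/* its search-char mask */

  for (;; s += 2 * BLK)
    {
      unsigned cmask = 0, zmask = 0;
      for (size_t i = 0; i < 2 * BLK; i++)
	{
	  cmask |= (unsigned) (s[i] == (char) c) << i;
	  zmask |= (unsigned) (s[i] == '\0') << i;
	}
      if ((cmask | zmask) == 0)	/* the one test in the hot loop */
	continue;
      if (zmask == 0)
	{
	  /* CHAR seen but no null yet: just remember this block.  */
	  last_blk = s;
	  last_cmask = cmask;
	  continue;
	}
      /* Null found: drop matches after the first null, take the
	 highest remaining match, else fall back to the saved block.  */
      cmask &= zmask ^ (zmask - 1);
      if (cmask)
	return (char *) s + (31 - __builtin_clz (cmask));
      if (last_blk)
	return (char *) last_blk + (31 - __builtin_clz (last_cmask));
      return NULL;
    }
}
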

Geometric Mean of all benchmarks New / Old: 0.741
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
---
Results For: strrchr

Geometric Mean of N=30 runs.

Geometric Mean of all benchmarks New / Old: 0.741
Benchmarks performance on Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

 len, align,  pos, seek, max_char, freq, New Time / Old Time
2048,     0,   32,    0,      127,    1,               0.647
2048,     1,   32,    0,      127,    1,               0.621
2048,     0,   64,    0,      127,    1,               0.661
2048,     2,   64,    0,      127,    1,               0.655
2048,     0,  128,    0,      127,    1,                0.69
2048,     3,  128,    0,      127,    1,               0.689
2048,     0,  256,    0,      127,    1,               0.718
2048,     4,  256,    0,      127,    1,               0.718
2048,     0,  512,    0,      127,    1,               0.758
2048,     5,  512,    0,      127,    1,               0.754
2048,     0, 1024,    0,      127,    1,               1.029
2048,     6, 1024,    0,      127,    1,               1.032
2048,     0, 2048,    0,      127,    1,               0.826
2048,     7, 2048,    0,      127,    1,               0.834
2048,     0, 4096,    0,      127,    1,               0.825
2048,     8, 4096,    0,      127,    1,                0.83
 256,     1,   64,    0,      127,    1,               0.657
 256,    15,   64,    0,      127,    1,               0.657
 256,     2,   64,    0,      127,    1,               0.657
 256,    30,   64,    0,      127,    1,               0.523
 256,     3,   64,    0,      127,    1,               0.657
 256,    45,   64,    0,      127,    1,               0.654
 256,     4,   64,    0,      127,    1,               0.657
 256,    60,   64,    0,      127,    1,               0.526
 256,     5,   64,    0,      127,    1,               0.658
 256,    75,   64,    0,      127,    1,               0.658
 256,     6,   64,    0,      127,    1,               0.655
 256,    90,   64,    0,      127,    1,               0.523
 256,     7,   64,    0,      127,    1,               0.655
 256,   105,   64,    0,      127,    1,               0.654
   1,     0,    0,    0,      127,    1,                0.98
   2,     0,    1,    0,      127,    1,               0.978
   3,     0,    2,    0,      127,    1,               0.975
   4,     0,    3,    0,      127,    1,               0.976
   5,     0,    4,    0,      127,    1,               0.977
   6,     0,    5,    0,      127,    1,               0.981
   7,     0,    6,    0,      127,    1,               0.982
   8,     0,    7,    0,      127,    1,                0.98
   9,     0,    8,    0,      127,    1,               0.978
  10,     0,    9,    0,      127,    1,               0.981
  11,     0,   10,    0,      127,    1,               0.984
  12,     0,   11,    0,      127,    1,               0.982
  13,     0,   12,    0,      127,    1,                0.98
  14,     0,   13,    0,      127,    1,               0.978
  15,     0,   14,    0,      127,    1,               0.979
  16,     0,   15,    0,      127,    1,               0.986
  17,     0,   16,    0,      127,    1,               0.529
  18,     0,   17,    0,      127,    1,               0.566
  19,     0,   18,    0,      127,    1,               0.575
  20,     0,   19,    0,      127,    1,               0.573
  21,     0,   20,    0,      127,    1,               0.579
  22,     0,   21,    0,      127,    1,               0.595
  23,     0,   22,    0,      127,    1,               0.585
  24,     0,   23,    0,      127,    1,               0.586
  25,     0,   24,    0,      127,    1,               0.587
  26,     0,   25,    0,      127,    1,               0.592
  27,     0,   26,    0,      127,    1,               0.595
  28,     0,   27,    0,      127,    1,               0.592
  29,     0,   28,    0,      127,    1,                 0.6
  30,     0,   29,    0,      127,    1,               0.598
  31,     0,   30,    0,      127,    1,               0.595
  32,     0,   31,    0,      127,    1,               0.592
2048,     0,   32,   23,      127,    1,               0.827
2048,     1,   32,   23,      127,    1,               0.826
2048,     0,   64,   23,      127,    1,               0.824
2048,     2,   64,   23,      127,    1,               0.825
2048,     0,  128,   23,      127,    1,               0.829
2048,     3,  128,   23,      127,    1,               0.824
2048,     0,  256,   23,      127,    1,               0.832
2048,     4,  256,   23,      127,    1,               0.825
2048,     0,  512,   23,      127,    1,               0.831
2048,     5,  512,   23,      127,    1,               0.837
2048,     0, 1024,   23,      127,    1,               0.721
2048,     6, 1024,   23,      127,    1,               0.757
2048,     0, 2048,   23,      127,    1,               0.825
2048,     7, 2048,   23,      127,    1,               0.824
2048,     0, 4096,   23,      127,    1,               0.828
2048,     8, 4096,   23,      127,    1,               0.823
 256,     1,   64,   23,      127,    1,               0.665
 256,    15,   64,   23,      127,    1,               0.661
 256,     2,   64,   23,      127,    1,               0.674
 256,    30,   64,   23,      127,    1,               0.605
 256,     3,   64,   23,      127,    1,               0.668
 256,    45,   64,   23,      127,    1,               0.661
 256,     4,   64,   23,      127,    1,               0.657
 256,    60,   64,   23,      127,    1,               0.594
 256,     5,   64,   23,      127,    1,               0.654
 256,    75,   64,   23,      127,    1,               0.673
 256,     6,   64,   23,      127,    1,               0.688
 256,    90,   64,   23,      127,    1,                 0.6
 256,     7,   64,   23,      127,    1,                0.66
 256,   105,   64,   23,      127,    1,               0.654
   1,     0,    0,   23,      127,    1,               0.981
   2,     0,    1,   23,      127,    1,               0.976
   3,     0,    2,   23,      127,    1,               0.983
   4,     0,    3,   23,      127,    1,               0.984
   5,     0,    4,   23,      127,    1,               0.973
   6,     0,    5,   23,      127,    1,               0.987
   7,     0,    6,   23,      127,    1,               0.977
   8,     0,    7,   23,      127,    1,               0.979
   9,     0,    8,   23,      127,    1,               0.981
  10,     0,    9,   23,      127,    1,                0.98
  11,     0,   10,   23,      127,    1,               0.983
  12,     0,   11,   23,      127,    1,                0.98
  13,     0,   12,   23,      127,    1,                0.98
  14,     0,   13,   23,      127,    1,               0.977
  15,     0,   14,   23,      127,    1,               0.982
  16,     0,   15,   23,      127,    1,               0.581
  17,     0,   16,   23,      127,    1,               0.551
  18,     0,   17,   23,      127,    1,               0.555
  19,     0,   18,   23,      127,    1,               0.586
  20,     0,   19,   23,      127,    1,               0.585
  21,     0,   20,   23,      127,    1,               0.582
  22,     0,   21,   23,      127,    1,               0.571
  23,     0,   22,   23,      127,    1,               0.576
  24,     0,   23,   23,      127,    1,               0.581
  25,     0,   24,   23,      127,    1,               0.589
  26,     0,   25,   23,      127,    1,               0.593
  27,     0,   26,   23,      127,    1,               0.595
  28,     0,   27,   23,      127,    1,               0.583
  29,     0,   28,   23,      127,    1,               0.595
  30,     0,   29,   23,      127,    1,                0.58
  31,     0,   30,   23,      127,    1,               0.594
  32,     0,   31,   23,      127,    1,               0.665
2048,     0,   32,   23,      127,    2,               0.825
2048,     1,   32,   23,      127,    2,               0.818
2048,     0,   64,   23,      127,    2,               0.829
2048,     2,   64,   23,      127,    2,               0.828
2048,     0,  128,   23,      127,    2,               0.823
2048,     3,  128,   23,      127,    2,               0.825
2048,     0,  256,   23,      127,    2,               0.819
2048,     4,  256,   23,      127,    2,               0.828
2048,     0,  512,   23,      127,    2,               0.824
2048,     5,  512,   23,      127,    2,               0.827
2048,     0, 1024,   23,      127,    2,               0.813
2048,     6, 1024,   23,      127,    2,               0.834
2048,     0, 2048,   23,      127,    2,               0.927
2048,     7, 2048,   23,      127,    2,               0.923
2048,     0, 4096,   23,      127,    2,               0.818
2048,     8, 4096,   23,      127,    2,                0.82
 256,     1,   64,   23,      127,    2,               0.693
 256,    15,   64,   23,      127,    2,               0.686
 256,     2,   64,   23,      127,    2,                0.69
 256,    30,   64,   23,      127,    2,               0.611
 256,     3,   64,   23,      127,    2,               0.692
 256,    45,   64,   23,      127,    2,               0.685
 256,     4,   64,   23,      127,    2,               0.688
 256,    60,   64,   23,      127,    2,                 0.6
 256,     5,   64,   23,      127,    2,                0.69
 256,    75,   64,   23,      127,    2,               0.689
 256,     6,   64,   23,      127,    2,               0.688
 256,    90,   64,   23,      127,    2,               0.611
 256,     7,   64,   23,      127,    2,                0.69
 256,   105,   64,   23,      127,    2,               0.686
   1,     0,    0,   23,      127,    2,               0.982
   2,     0,    1,   23,      127,    2,               0.987
   3,     0,    2,   23,      127,    2,               0.978
   4,     0,    3,   23,      127,    2,               0.977
   5,     0,    4,   23,      127,    2,               0.979
   6,     0,    5,   23,      127,    2,               0.985
   7,     0,    6,   23,      127,    2,               0.975
   8,     0,    7,   23,      127,    2,               0.981
   9,     0,    8,   23,      127,    2,               0.984
  10,     0,    9,   23,      127,    2,               0.983
  11,     0,   10,   23,      127,    2,               0.982
  12,     0,   11,   23,      127,    2,               0.976
  13,     0,   12,   23,      127,    2,               0.985
  14,     0,   13,   23,      127,    2,               0.984
  15,     0,   14,   23,      127,    2,                0.98
  16,     0,   15,   23,      127,    2,               0.583
  17,     0,   16,   23,      127,    2,               0.552
  18,     0,   17,   23,      127,    2,               0.564
  19,     0,   18,   23,      127,    2,               0.585
  20,     0,   19,   23,      127,    2,               0.578
  21,     0,   20,   23,      127,    2,               0.578
  22,     0,   21,   23,      127,    2,               0.571
  23,     0,   22,   23,      127,    2,               0.587
  24,     0,   23,   23,      127,    2,               0.589
  25,     0,   24,   23,      127,    2,               0.593
  26,     0,   25,   23,      127,    2,               0.589
  27,     0,   26,   23,      127,    2,               0.588
  28,     0,   27,   23,      127,    2,               0.593
  29,     0,   28,   23,      127,    2,               0.579
  30,     0,   29,   23,      127,    2,               0.572
  31,     0,   30,   23,      127,    2,               0.582
  32,     0,   31,   23,      127,    2,               0.659
2048,     0,   32,   23,      127,    4,               0.822
2048,     1,   32,   23,      127,    4,               0.818
2048,     0,   64,   23,      127,    4,               0.826
2048,     2,   64,   23,      127,    4,               0.824
2048,     0,  128,   23,      127,    4,               0.833
2048,     3,  128,   23,      127,    4,               0.831
2048,     0,  256,   23,      127,    4,               0.826
2048,     4,  256,   23,      127,    4,               0.831
2048,     0,  512,   23,      127,    4,               0.834
2048,     5,  512,   23,      127,    4,                0.83
2048,     0, 1024,   23,      127,    4,               0.836
2048,     6, 1024,   23,      127,    4,               0.844
2048,     0, 2048,   23,      127,    4,               0.696
2048,     7, 2048,   23,      127,    4,               0.704
2048,     0, 4096,   23,      127,    4,               0.936
2048,     8, 4096,   23,      127,    4,               0.925
 256,     1,   64,   23,      127,    4,               0.694
 256,    15,   64,   23,      127,    4,                0.69
 256,     2,   64,   23,      127,    4,               0.687
 256,    30,   64,   23,      127,    4,               0.612
 256,     3,   64,   23,      127,    4,               0.685
 256,    45,   64,   23,      127,    4,               0.685
 256,     4,   64,   23,      127,    4,               0.684
 256,    60,   64,   23,      127,    4,               0.606
 256,     5,   64,   23,      127,    4,                0.69
 256,    75,   64,   23,      127,    4,               0.688
 256,     6,   64,   23,      127,    4,                0.69
 256,    90,   64,   23,      127,    4,               0.615
 256,     7,   64,   23,      127,    4,               0.691
 256,   105,   64,   23,      127,    4,               0.688
   1,     0,    0,   23,      127,    4,               0.982
   2,     0,    1,   23,      127,    4,               0.983
   3,     0,    2,   23,      127,    4,               0.981
   4,     0,    3,   23,      127,    4,               0.984
   5,     0,    4,   23,      127,    4,               0.963
   6,     0,    5,   23,      127,    4,               0.978
   7,     0,    6,   23,      127,    4,               0.985
   8,     0,    7,   23,      127,    4,               0.986
   9,     0,    8,   23,      127,    4,               0.978
  10,     0,    9,   23,      127,    4,               0.985
  11,     0,   10,   23,      127,    4,               0.986
  12,     0,   11,   23,      127,    4,               0.983
  13,     0,   12,   23,      127,    4,               0.986
  14,     0,   13,   23,      127,    4,                0.98
  15,     0,   14,   23,      127,    4,               0.979
  16,     0,   15,   23,      127,    4,               0.582
  17,     0,   16,   23,      127,    4,               0.542
  18,     0,   17,   23,      127,    4,               0.564
  19,     0,   18,   23,      127,    4,               0.571
  20,     0,   19,   23,      127,    4,               0.582
  21,     0,   20,   23,      127,    4,               0.573
  22,     0,   21,   23,      127,    4,               0.575
  23,     0,   22,   23,      127,    4,               0.578
  24,     0,   23,   23,      127,    4,                0.58
  25,     0,   24,   23,      127,    4,               0.592
  26,     0,   25,   23,      127,    4,               0.588
  27,     0,   26,   23,      127,    4,               0.574
  28,     0,   27,   23,      127,    4,               0.589
  29,     0,   28,   23,      127,    4,                0.56
  30,     0,   29,   23,      127,    4,               0.587
  31,     0,   30,   23,      127,    4,               0.584
  32,     0,   31,   23,      127,    4,               0.664
2048,     0,   32,   23,      127,    8,               0.826
2048,     1,   32,   23,      127,    8,               0.821
2048,     0,   64,   23,      127,    8,               0.828
2048,     2,   64,   23,      127,    8,               0.827
2048,     0,  128,   23,      127,    8,               0.833
2048,     3,  128,   23,      127,    8,                0.83
2048,     0,  256,   23,      127,    8,               0.855
2048,     4,  256,   23,      127,    8,               0.849
2048,     0,  512,   23,      127,    8,               0.849
2048,     5,  512,   23,      127,    8,               0.851
2048,     0, 1024,   23,      127,    8,               0.856
2048,     6, 1024,   23,      127,    8,               0.862
2048,     0, 2048,   23,      127,    8,               0.709
2048,     7, 2048,   23,      127,    8,               0.712
2048,     0, 4096,   23,      127,    8,               0.702
2048,     8, 4096,   23,      127,    8,               0.701
 256,     1,   64,   23,      127,    8,               0.689
 256,    15,   64,   23,      127,    8,               0.688
 256,     2,   64,   23,      127,    8,               0.691
 256,    30,   64,   23,      127,    8,               0.612
 256,     3,   64,   23,      127,    8,               0.688
 256,    45,   64,   23,      127,    8,               0.686
 256,     4,   64,   23,      127,    8,               0.694
 256,    60,   64,   23,      127,    8,               0.609
 256,     5,   64,   23,      127,    8,                0.69
 256,    75,   64,   23,      127,    8,                0.69
 256,     6,   64,   23,      127,    8,               0.691
 256,    90,   64,   23,      127,    8,               0.612
 256,     7,   64,   23,      127,    8,               0.689
 256,   105,   64,   23,      127,    8,               0.688
   1,     0,    0,   23,      127,    8,                0.98
   2,     0,    1,   23,      127,    8,               0.978
   3,     0,    2,   23,      127,    8,                0.98
   4,     0,    3,   23,      127,    8,               0.978
   5,     0,    4,   23,      127,    8,               0.977
   6,     0,    5,   23,      127,    8,               0.984
   7,     0,    6,   23,      127,    8,               0.982
   8,     0,    7,   23,      127,    8,               0.983
   9,     0,    8,   23,      127,    8,               0.987
  10,     0,    9,   23,      127,    8,               0.979
  11,     0,   10,   23,      127,    8,               0.985
  12,     0,   11,   23,      127,    8,               0.981
  13,     0,   12,   23,      127,    8,                0.98
  14,     0,   13,   23,      127,    8,               0.982
  15,     0,   14,   23,      127,    8,               0.981
  16,     0,   15,   23,      127,    8,               0.579
  17,     0,   16,   23,      127,    8,               0.531
  18,     0,   17,   23,      127,    8,               0.577
  19,     0,   18,   23,      127,    8,               0.588
  20,     0,   19,   23,      127,    8,               0.571
  21,     0,   20,   23,      127,    8,               0.576
  22,     0,   21,   23,      127,    8,                0.59
  23,     0,   22,   23,      127,    8,               0.574
  24,     0,   23,   23,      127,    8,               0.583
  25,     0,   24,   23,      127,    8,               0.581
  26,     0,   25,   23,      127,    8,               0.592
  27,     0,   26,   23,      127,    8,               0.586
  28,     0,   27,   23,      127,    8,               0.588
  29,     0,   28,   23,      127,    8,               0.578
  30,     0,   29,   23,      127,    8,               0.573
  31,     0,   30,   23,      127,    8,               0.588
  32,     0,   31,   23,      127,    8,               0.664
2048,     0,   32,   23,      127,   16,               0.825
2048,     1,   32,   23,      127,   16,               0.823
2048,     0,   64,   23,      127,   16,               0.831
2048,     2,   64,   23,      127,   16,               0.822
2048,     0,  128,   23,      127,   16,               0.831
2048,     3,  128,   23,      127,   16,               0.831
2048,     0,  256,   23,      127,   16,               0.849
2048,     4,  256,   23,      127,   16,                0.85
2048,     0,  512,   23,      127,   16,               0.751
2048,     5,  512,   23,      127,   16,                0.75
2048,     0, 1024,   23,      127,   16,               0.913
2048,     6, 1024,   23,      127,   16,               0.895
2048,     0, 2048,   23,      127,   16,               0.736
2048,     7, 2048,   23,      127,   16,               0.741
2048,     0, 4096,   23,      127,   16,               0.712
2048,     8, 4096,   23,      127,   16,               0.711
 256,     1,   64,   23,      127,   16,               0.758
 256,    15,   64,   23,      127,   16,               0.692
 256,     2,   64,   23,      127,   16,               0.692
 256,    30,   64,   23,      127,   16,               0.613
 256,     3,   64,   23,      127,   16,                0.69
 256,    45,   64,   23,      127,   16,               0.687
 256,     4,   64,   23,      127,   16,                0.69
 256,    60,   64,   23,      127,   16,               0.604
 256,     5,   64,   23,      127,   16,               0.687
 256,    75,   64,   23,      127,   16,               0.687
 256,     6,   64,   23,      127,   16,                0.69
 256,    90,   64,   23,      127,   16,                0.61
 256,     7,   64,   23,      127,   16,                0.69
 256,   105,   64,   23,      127,   16,               0.685
   1,     0,    0,   23,      127,   16,               0.981
   2,     0,    1,   23,      127,   16,               0.985
   3,     0,    2,   23,      127,   16,               0.985
   4,     0,    3,   23,      127,   16,               0.981
   5,     0,    4,   23,      127,   16,               0.979
   6,     0,    5,   23,      127,   16,               0.986
   7,     0,    6,   23,      127,   16,               0.986
   8,     0,    7,   23,      127,   16,               0.982
   9,     0,    8,   23,      127,   16,               0.982
  10,     0,    9,   23,      127,   16,                0.98
  11,     0,   10,   23,      127,   16,               0.983
  12,     0,   11,   23,      127,   16,               0.982
  13,     0,   12,   23,      127,   16,               0.982
  14,     0,   13,   23,      127,   16,               0.982
  15,     0,   14,   23,      127,   16,               0.982
  16,     0,   15,   23,      127,   16,               0.582
  17,     0,   16,   23,      127,   16,               0.542
  18,     0,   17,   23,      127,   16,               0.554
  19,     0,   18,   23,      127,   16,               0.562
  20,     0,   19,   23,      127,   16,               0.587
  21,     0,   20,   23,      127,   16,               0.584
  22,     0,   21,   23,      127,   16,               0.587
  23,     0,   22,   23,      127,   16,               0.594
  24,     0,   23,   23,      127,   16,               0.581
  25,     0,   24,   23,      127,   16,               0.577
  26,     0,   25,   23,      127,   16,               0.588
  27,     0,   26,   23,      127,   16,               0.589
  28,     0,   27,   23,      127,   16,               0.596
  29,     0,   28,   23,      127,   16,               0.591
  30,     0,   29,   23,      127,   16,               0.585
  31,     0,   30,   23,      127,   16,                0.59
  32,     0,   31,   23,      127,   16,               0.669

 sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
 sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
 sysdeps/x86_64/strrchr.S                | 505 +++++++++++++++---------
 sysdeps/x86_64/wcsrchr.S                | 268 +------------
 4 files changed, 334 insertions(+), 444 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index db1b44c23c..866396e947 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
 
 # undef weak_alias
 # define weak_alias(strrchr, rindex)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index 78d1ca6553..69d2f3cdb1 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR	__wcsrchr_sse2
 #endif
-
 #include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index 50d886713e..94449ad806 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,355 @@
 
 #include <sysdep.h>
 
+#ifndef STRRCHR
+# define STRRCHR	strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ	pcmpeqd
+# define CHAR_SIZE	4
+# define PMINU	pminud
+#else
+# define PCMPEQ	pcmpeqb
+# define CHAR_SIZE	1
+# define PMINU	pminub
+#endif
+
+#define PAGE_SIZE	4096
+#define VEC_SIZE	16
+
 	.text
-ENTRY (strrchr)
-	movd	%esi, %xmm1
+ENTRY(STRRCHR)
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	andl	$4095, %eax
-	punpcklbw	%xmm1, %xmm1
-	cmpq	$4032, %rax
-	punpcklwd	%xmm1, %xmm1
-	pshufd	$0, %xmm1, %xmm1
+	andl	$(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+#endif
+	pshufd	$0, %xmm0, %xmm0
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page)
-	movdqu	(%rdi), %xmm0
+
+L(cross_page_continue):
+	movups	(%rdi), %xmm1
 	pxor	%xmm2, %xmm2
-	movdqa	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb	%xmm0, %ecx
-	pmovmskb	%xmm3, %edx
-	testq	%rdx, %rdx
-	je	L(next_48_bytes)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rcx, %rax
-	je	L(exit)
-	bsrq	%rax, %rax
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret0):
 	ret
 
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
 	.p2align 4
-L(next_48_bytes):
-	movdqu	16(%rdi), %xmm4
-	movdqa	%xmm4, %xmm5
-	movdqu	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm2, %xmm5
-	movdqu	48(%rdi), %xmm0
-	pmovmskb	%xmm5, %edx
-	movdqa	%xmm3, %xmm5
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm5
-	pcmpeqb	%xmm0, %xmm2
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r8d
-	pmovmskb	%xmm5, %eax
-	pmovmskb	%xmm2, %esi
-	salq	$32, %r8
-	salq	$32, %rax
-	pcmpeqb	%xmm1, %xmm0
-	orq	%rdx, %rax
-	movq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	salq	$48, %rdx
-	salq	$16, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
-	pmovmskb	%xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rsi
-	orq	%rdx, %rax
-	je	L(loop_header2)
-	leaq	-1(%rax), %rcx
-	xorq	%rax, %rcx
-	andq	%rcx, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rsi
-	leaq	(%rdi,%rsi), %rax
+L(first_vec_x0_test):
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
-L(loop_header2):
-	testq	%rsi, %rsi
-	movq	%rdi, %rcx
-	je	L(no_c_found)
-L(loop_header):
-	addq	$64, %rdi
-	pxor	%xmm7, %xmm7
-	andq	$-64, %rdi
-	jmp	L(loop_entry)
+L(first_vec_x1):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
 
 	.p2align 4
-L(loop64):
-	testq	%rdx, %rdx
-	cmovne	%rdx, %rsi
-	cmovne	%rdi, %rcx
-	addq	$64, %rdi
-L(loop_entry):
-	movdqa	32(%rdi), %xmm3
-	pxor	%xmm6, %xmm6
-	movdqa	48(%rdi), %xmm2
-	movdqa	%xmm3, %xmm0
-	movdqa	16(%rdi), %xmm4
-	pminub	%xmm2, %xmm0
-	movdqa	(%rdi), %xmm5
-	pminub	%xmm4, %xmm0
-	pminub	%xmm5, %xmm0
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm0, %eax
-	movdqa	%xmm5, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %r9d
-	movdqa	%xmm4, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %edx
-	movdqa	%xmm3, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm0, %r10d
-	movdqa	%xmm2, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$32, %r10
-	orq	%r10, %rdx
-	pmovmskb	%xmm0, %r8d
-	orq	%r9, %rdx
-	salq	$48, %r8
-	orq	%r8, %rdx
+L(first_vec_x1_test):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
 	testl	%eax, %eax
-	je	L(loop64)
-	pcmpeqb	%xmm6, %xmm4
-	pcmpeqb	%xmm6, %xmm3
-	pcmpeqb	%xmm6, %xmm5
-	pmovmskb	%xmm4, %eax
-	pmovmskb	%xmm3, %r10d
-	pcmpeqb	%xmm6, %xmm2
-	pmovmskb	%xmm5, %r9d
-	salq	$32, %r10
-	salq	$16, %rax
-	pmovmskb	%xmm2, %r8d
-	orq	%r10, %rax
-	orq	%r9, %rax
-	salq	$48, %r8
-	orq	%r8, %rax
-	leaq	-1(%rax), %r8
-	xorq	%rax, %r8
-	andq	%r8, %rdx
-	cmovne	%rdi, %rcx
-	cmovne	%rdx, %rsi
-	bsrq	%rsi, %rsi
-	leaq	(%rcx,%rsi), %rax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+	andq	$-VEC_SIZE, %rdi
+
+	movaps	VEC_SIZE(%rdi), %xmm2
+	pxor	%xmm3, %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pmovmskb %xmm3, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
+
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
+	pxor	%xmm4, %xmm4
+	PCMPEQ	%xmm3, %xmm4
+	pmovmskb %xmm4, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+
+	addq	$VEC_SIZE, %rdi
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 2), %rdi
+	.p2align 4
+L(first_loop):
+	/* Do 2x VEC at a time.  */
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* SSE2 has no pminud.  */
+#ifdef NO_PMINU
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef NO_PMINU
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
+	   macro-fuse with `jz`.  */
+	addl	%ecx, %eax
+	jz	L(first_loop)
+
+	/* Check if there is zero match.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+	/* Check if there was a match in last iteration.  */
+	subl	%ecx, %eax
+	jnz	L(new_match)
+
+L(first_loop_old_match):
+	PCMPEQ	%xmm0, %xmm2
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	addl	%eax, %ecx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	sall	$16, %eax
+	orl	%ecx, %eax
+
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons because we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), so we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
+	/* Save minimum state for getting most recent match. We can
+	   throw out all previous work.  */
 	.p2align 4
-L(no_c_found):
-	movl	$1, %esi
-	xorl	%ecx, %ecx
-	jmp	L(loop_header)
+L(second_loop_match):
+	movq	%rdi, %rsi
+	movaps	%xmm4, %xmm2
+	movaps	%xmm7, %xmm3
 
 	.p2align 4
-L(exit):
-	xorl	%eax, %eax
+L(second_loop):
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+#ifdef NO_PMINU
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef NO_PMINU
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Either null term or new occurrence of CHAR.  */
+	addl	%ecx, %eax
+	jz	L(second_loop)
+
+	/* No null term so must be new occurrence of CHAR.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+
+	subl	%ecx, %eax
+	jnz	L(second_loop_new_match)
+
+L(second_loop_old_match):
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	sall	$16, %eax
+	orl	%ecx, %eax
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
+L(second_loop_new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons because we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), so we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(second_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4,, 4
 L(cross_page):
-	movq	%rdi, %rax
-	pxor	%xmm0, %xmm0
-	andq	$-64, %rax
-	movdqu	(%rax), %xmm5
-	movdqa	%xmm5, %xmm6
-	movdqu	16(%rax), %xmm4
-	pcmpeqb	%xmm1, %xmm5
-	pcmpeqb	%xmm0, %xmm6
-	movdqu	32(%rax), %xmm3
-	pmovmskb	%xmm6, %esi
-	movdqa	%xmm4, %xmm6
-	movdqu	48(%rax), %xmm2
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm0, %xmm6
-	pmovmskb	%xmm6, %edx
-	movdqa	%xmm3, %xmm6
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm0, %xmm6
-	pcmpeqb	%xmm2, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r9d
-	pmovmskb	%xmm6, %r8d
-	pmovmskb	%xmm0, %ecx
-	salq	$32, %r9
-	salq	$32, %r8
-	pcmpeqb	%xmm1, %xmm2
-	orq	%r8, %rdx
-	salq	$48, %rcx
-	pmovmskb	%xmm5, %r8d
-	orq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	orq	%rcx, %rdx
-	pmovmskb	%xmm2, %ecx
-	salq	$16, %rsi
-	salq	$48, %rcx
-	orq	%r9, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	movaps	(%rsi), %xmm1
+	pxor	%xmm2, %xmm2
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
 	movl	%edi, %ecx
-	subl	%eax, %ecx
-	shrq	%cl, %rdx
-	shrq	%cl, %rsi
-	testq	%rdx, %rdx
-	je	L(loop_header2)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rax, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rax
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	%cl, %edx
+	jz	L(cross_page_continue)
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	sarl	%cl, %eax
+	leal	-1(%rdx), %ecx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret1):
 	ret
-END (strrchr)
+END(STRRCHR)
 
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+	weak_alias (STRRCHR, rindex)
+	libc_hidden_builtin_def (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 61552954de..2b80efc5ef 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -1,4 +1,4 @@
-/* wcsrchr with SSSE3
+/* wcsrchr optimized with SSE2.
    Copyright (C) 2011-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,266 +16,12 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
 
-	.text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR	1
+#define NO_PMINU	1
 
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
-	punpckldq %xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	punpckldq %xmm1, %xmm1
-	and	$63, %rcx
-	cmp	$48, %rcx
-	ja	L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR	wcsrchr
+#endif
 
-	movdqu	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match1)
-
-	test	%rcx, %rcx
-	jnz	L(return_null)
-
-	and	$-16, %rdi
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match1):
-	test	%rcx, %rcx
-	jnz	L(prolog_find_zero_1)
-
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	and	$-16, %rdi
-	jmp	L(loop)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	pxor	%xmm3, %xmm3
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm3
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm3, %rdx
-	pmovmskb %xmm0, %rax
-	shr	%cl, %rdx
-	shr	%cl, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match)
-
-	test	%rdx, %rdx
-	jnz	L(return_null)
-
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match):
-	test	%rdx, %rdx
-	jnz	L(prolog_find_zero)
-
-	mov	%rax, %r8
-	lea	(%rdi, %rcx), %rsi
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqd	%xmm3, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm3
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm3, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqd	%xmm4, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm4
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm4, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm5
-	pcmpeqd	%xmm5, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm5
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm5, %rax
-	or	%rax, %rcx
-	jz	L(loop)
-
-	.p2align 4
-L(matches):
-	test	%rax, %rax
-	jnz	L(match)
-L(return_value):
-	test	%r8, %r8
-	jz	L(return_null)
-	mov	%r8, %rax
-	mov	%rsi, %rdi
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match):
-	pmovmskb %xmm2, %rcx
-	test	%rcx, %rcx
-	jnz	L(find_zero)
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	jmp	L(loop)
-
-	.p2align 4
-L(find_zero):
-	test	$15, %cl
-	jnz	L(find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_value)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_value)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero):
-	add	%rcx, %rdi
-	mov     %rdx, %rcx
-L(prolog_find_zero_1):
-	test	$15, %cl
-	jnz	L(prolog_find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(prolog_find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(prolog_find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_null)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_null)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_second_wchar):
-	lea	-12(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_third_wchar):
-	lea	-8(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_fourth_wchar):
-	lea	-4(%rdi), %rax
-	ret
-
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-END (wcsrchr)
+#include "../strrchr.S"
-- 
2.25.1


* [PATCH v1 3/5] x86: Add wcsrchr optimized with SSE4_1 in wcsrchr-sse4_1.S
  2022-04-21  3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
  2022-04-21  3:14 ` [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
@ 2022-04-21  3:14 ` Noah Goldstein
  2022-04-21  3:14 ` [PATCH v1 4/5] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21  3:14 UTC (permalink / raw)
  To: libc-alpha

wcsrchr-sse2 can't use `pminud`, which can speed up the main loop:

len, align, pos, seek,   max_char, freq, New Time / Old Time
256,     1,  64,   23,       1273,    1,               1.082
256,     1,  64,   23, 2147483647,    1,               1.076
256,    15,  64,   23,       1273,    1,               1.061
256,    15,  64,   23, 2147483647,    1,               1.075
256,     2,  64,   23,       1273,    1,               1.108
256,     2,  64,   23, 2147483647,    1,               1.109
256,    30,  64,   23,       1273,    1,               1.072
256,    30,  64,   23, 2147483647,    1,               1.077
256,     3,  64,   23,       1273,    1,               1.108
256,     3,  64,   23, 2147483647,    1,               1.103
256,    45,  64,   23,       1273,    1,               1.076
256,    45,  64,   23, 2147483647,    1,               1.079
256,     4,  64,   23,       1273,    1,               1.119
256,     4,  64,   23, 2147483647,    1,               1.112
256,    60,  64,   23,       1273,    1,               1.117
256,    60,  64,   23, 2147483647,    1,               1.112
256,     5,  64,   23,       1273,    1,                1.21
256,     5,  64,   23, 2147483647,    1,               1.194
256,    75,  64,   23,       1273,    1,               1.055
256,    75,  64,   23, 2147483647,    1,               1.045
256,     6,  64,   23,       1273,    1,               1.264
256,     6,  64,   23, 2147483647,    1,                 1.3
256,    90,  64,   23,       1273,    1,               1.022
256,    90,  64,   23, 2147483647,    1,               1.026
256,     7,  64,   23,       1273,    1,               1.316
256,     7,  64,   23, 2147483647,    1,               1.325

Overall this leads to a 5% performance improvement in the benchmark
suite.
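
The difference can be sketched with intrinsics (illustrative only;
the actual change just builds the shared strrchr.S assembly without
NO_PMINU, and the helper names below are made up for the sketch):

#include <smmintrin.h>	/* SSE4.1 */

/* Does either 4 x 32-bit vector contain a zero (null wchar_t) lane?
   With SSE4.1 a single unsigned minimum folds the check for two
   vectors into one compare; plain SSE2 needs two compares plus an
   OR, which is what the NO_PMINU path in strrchr.S does.  */
static inline int
has_zero_lane_sse41 (__m128i v1, __m128i v2)
{
  __m128i m = _mm_min_epu32 (v1, v2);		  /* pminud */
  m = _mm_cmpeq_epi32 (m, _mm_setzero_si128 ());  /* one pcmpeqd */
  return _mm_movemask_epi8 (m);
}

static inline int
has_zero_lane_sse2 (__m128i v1, __m128i v2)
{
  __m128i z = _mm_setzero_si128 ();
  __m128i m1 = _mm_cmpeq_epi32 (v1, z);		  /* two pcmpeqd... */
  __m128i m2 = _mm_cmpeq_epi32 (v2, z);
  return _mm_movemask_epi8 (_mm_or_si128 (m1, m2));  /* ...plus por */
}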

Full xcheck passes on x86_64 with and without multiarch enabled.
---
 sysdeps/x86_64/multiarch/Makefile          |  1 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |  3 +++
 sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S  | 21 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/wcsrchr.c         |  3 ++-
 4 files changed, 27 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 0400ea332b..5ad7bc8c25 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -154,6 +154,7 @@ sysdep_routines += \
   wcsrchr-avx2-rtm \
   wcsrchr-evex \
   wcsrchr-sse2 \
+  wcsrchr-sse4_1 \
   wmemchr-avx2 \
   wmemchr-avx2-rtm \
   wmemchr-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a8afcf81bb..1cbb6938c8 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -685,6 +685,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcsrchr_evex)
+	      IFUNC_IMPL_ADD (array, i, wcsrchr,
+			      CPU_FEATURE_USABLE (SSE4_1),
+			      __wcsrchr_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wcscmp.c.  */
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S b/sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S
new file mode 100644
index 0000000000..34b92d28eb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S
@@ -0,0 +1,21 @@
+/* wcsrchr optimized with SSE4.1.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define USE_AS_WCSRCHR	1
+#define STRRCHR	__wcsrchr_sse4_1
+#include "../strrchr.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr.c b/sysdeps/x86_64/multiarch/wcsrchr.c
index 8b30c06f2e..eb18038eec 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr.c
+++ b/sysdeps/x86_64/multiarch/wcsrchr.c
@@ -23,7 +23,8 @@
 # undef wcsrchr
 
 # define SYMBOL_NAME wcsrchr
-# include "ifunc-avx2.h"
+
+# include "ifunc-wcslen.h"
 
 libc_ifunc_redirected (__redirect_wcsrchr, wcsrchr, IFUNC_SELECTOR ());
 #endif
-- 
2.25.1


* [PATCH v1 4/5] x86: Optimize {str|wcs}rchr-avx2
  2022-04-21  3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
  2022-04-21  3:14 ` [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
  2022-04-21  3:14 ` [PATCH v1 3/5] x86: Add wcsrchr optimized with SSE4_1 in wcsrchr-sse4_1.S Noah Goldstein
@ 2022-04-21  3:14 ` Noah Goldstein
  2022-04-21  3:14 ` [PATCH v1 5/5] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21  3:14 UTC (permalink / raw)
  To: libc-alpha

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.832
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
---
Results For: strrchr

Geometric Mean of N=30 runs.

Geometric Mean of all benchmarks New / Old: 0.832
Benchmarks performance on Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

 len, align,  pos, seek, max_char, freq, New Time / Old Time
2048,     0,   32,    0,      127,    1,               0.673
2048,     1,   32,    0,      127,    1,                0.68
2048,     0,   64,    0,      127,    1,               0.566
2048,     2,   64,    0,      127,    1,               0.574
2048,     0,  128,    0,      127,    1,               0.976
2048,     3,  128,    0,      127,    1,               0.967
2048,     0,  256,    0,      127,    1,               0.931
2048,     4,  256,    0,      127,    1,               0.921
2048,     0,  512,    0,      127,    1,               0.792
2048,     5,  512,    0,      127,    1,                0.78
2048,     0, 1024,    0,      127,    1,               0.733
2048,     6, 1024,    0,      127,    1,               0.729
2048,     0, 2048,    0,      127,    1,               0.795
2048,     7, 2048,    0,      127,    1,               0.805
2048,     0, 4096,    0,      127,    1,               0.803
2048,     8, 4096,    0,      127,    1,               0.794
 256,     1,   64,    0,      127,    1,               0.584
 256,    15,   64,    0,      127,    1,               0.587
 256,     2,   64,    0,      127,    1,               0.586
 256,    30,   64,    0,      127,    1,               0.592
 256,     3,   64,    0,      127,    1,               0.586
 256,    45,   64,    0,      127,    1,               0.505
 256,     4,   64,    0,      127,    1,                0.59
 256,    60,   64,    0,      127,    1,               0.501
 256,     5,   64,    0,      127,    1,               0.595
 256,    75,   64,    0,      127,    1,               0.588
 256,     6,   64,    0,      127,    1,               0.593
 256,    90,   64,    0,      127,    1,               0.594
 256,     7,   64,    0,      127,    1,               0.596
 256,   105,   64,    0,      127,    1,               0.506
   1,     0,    0,    0,      127,    1,               0.872
   2,     0,    1,    0,      127,    1,               0.861
   3,     0,    2,    0,      127,    1,               0.862
   4,     0,    3,    0,      127,    1,               0.884
   5,     0,    4,    0,      127,    1,               0.869
   6,     0,    5,    0,      127,    1,               0.861
   7,     0,    6,    0,      127,    1,               0.865
   8,     0,    7,    0,      127,    1,               0.884
   9,     0,    8,    0,      127,    1,               0.862
  10,     0,    9,    0,      127,    1,               0.889
  11,     0,   10,    0,      127,    1,                 0.9
  12,     0,   11,    0,      127,    1,               0.897
  13,     0,   12,    0,      127,    1,               0.909
  14,     0,   13,    0,      127,    1,               0.885
  15,     0,   14,    0,      127,    1,               0.929
  16,     0,   15,    0,      127,    1,               0.871
  17,     0,   16,    0,      127,    1,               0.875
  18,     0,   17,    0,      127,    1,               0.878
  19,     0,   18,    0,      127,    1,               0.889
  20,     0,   19,    0,      127,    1,                0.89
  21,     0,   20,    0,      127,    1,               0.901
  22,     0,   21,    0,      127,    1,                0.91
  23,     0,   22,    0,      127,    1,               0.912
  24,     0,   23,    0,      127,    1,               0.907
  25,     0,   24,    0,      127,    1,               0.947
  26,     0,   25,    0,      127,    1,               0.904
  27,     0,   26,    0,      127,    1,               0.921
  28,     0,   27,    0,      127,    1,               0.899
  29,     0,   28,    0,      127,    1,               0.923
  30,     0,   29,    0,      127,    1,               0.918
  31,     0,   30,    0,      127,    1,               0.943
  32,     0,   31,    0,      127,    1,               0.914
2048,     0,   32,   23,      127,    1,               0.815
2048,     1,   32,   23,      127,    1,               0.829
2048,     0,   64,   23,      127,    1,               0.884
2048,     2,   64,   23,      127,    1,               0.882
2048,     0,  128,   23,      127,    1,               0.884
2048,     3,  128,   23,      127,    1,               0.851
2048,     0,  256,   23,      127,    1,               0.843
2048,     4,  256,   23,      127,    1,               0.867
2048,     0,  512,   23,      127,    1,               0.746
2048,     5,  512,   23,      127,    1,               0.863
2048,     0, 1024,   23,      127,    1,               0.662
2048,     6, 1024,   23,      127,    1,               0.683
2048,     0, 2048,   23,      127,    1,               0.852
2048,     7, 2048,   23,      127,    1,               0.837
2048,     0, 4096,   23,      127,    1,               0.837
2048,     8, 4096,   23,      127,    1,               0.829
 256,     1,   64,   23,      127,    1,               0.934
 256,    15,   64,   23,      127,    1,               0.936
 256,     2,   64,   23,      127,    1,               0.931
 256,    30,   64,   23,      127,    1,               0.938
 256,     3,   64,   23,      127,    1,               0.927
 256,    45,   64,   23,      127,    1,               0.863
 256,     4,   64,   23,      127,    1,               0.939
 256,    60,   64,   23,      127,    1,               0.871
 256,     5,   64,   23,      127,    1,                0.94
 256,    75,   64,   23,      127,    1,               0.933
 256,     6,   64,   23,      127,    1,               0.915
 256,    90,   64,   23,      127,    1,               0.934
 256,     7,   64,   23,      127,    1,               0.938
 256,   105,   64,   23,      127,    1,               0.871
   1,     0,    0,   23,      127,    1,               0.865
   2,     0,    1,   23,      127,    1,                0.87
   3,     0,    2,   23,      127,    1,               0.882
   4,     0,    3,   23,      127,    1,               0.901
   5,     0,    4,   23,      127,    1,               0.879
   6,     0,    5,   23,      127,    1,               0.934
   7,     0,    6,   23,      127,    1,               0.874
   8,     0,    7,   23,      127,    1,               0.895
   9,     0,    8,   23,      127,    1,               0.873
  10,     0,    9,   23,      127,    1,               0.861
  11,     0,   10,   23,      127,    1,               0.865
  12,     0,   11,   23,      127,    1,               0.875
  13,     0,   12,   23,      127,    1,               0.878
  14,     0,   13,   23,      127,    1,                0.86
  15,     0,   14,   23,      127,    1,               0.889
  16,     0,   15,   23,      127,    1,               0.875
  17,     0,   16,   23,      127,    1,               0.911
  18,     0,   17,   23,      127,    1,               0.891
  19,     0,   18,   23,      127,    1,               0.921
  20,     0,   19,   23,      127,    1,               0.898
  21,     0,   20,   23,      127,    1,               0.895
  22,     0,   21,   23,      127,    1,               0.906
  23,     0,   22,   23,      127,    1,               0.911
  24,     0,   23,   23,      127,    1,               0.877
  25,     0,   24,   23,      127,    1,                 0.9
  26,     0,   25,   23,      127,    1,               0.911
  27,     0,   26,   23,      127,    1,               0.926
  28,     0,   27,   23,      127,    1,               0.918
  29,     0,   28,   23,      127,    1,               0.952
  30,     0,   29,   23,      127,    1,               0.943
  31,     0,   30,   23,      127,    1,               0.934
  32,     0,   31,   23,      127,    1,                 0.8
2048,     0,   32,   23,      127,    2,               0.872
2048,     1,   32,   23,      127,    2,               0.819
2048,     0,   64,   23,      127,    2,               0.815
2048,     2,   64,   23,      127,    2,               0.805
2048,     0,  128,   23,      127,    2,               0.884
2048,     3,  128,   23,      127,    2,               0.852
2048,     0,  256,   23,      127,    2,               0.873
2048,     4,  256,   23,      127,    2,               0.871
2048,     0,  512,   23,      127,    2,               0.654
2048,     5,  512,   23,      127,    2,               0.762
2048,     0, 1024,   23,      127,    2,               0.646
2048,     6, 1024,   23,      127,    2,               0.665
2048,     0, 2048,   23,      127,    2,               0.678
2048,     7, 2048,   23,      127,    2,               0.675
2048,     0, 4096,   23,      127,    2,               0.849
2048,     8, 4096,   23,      127,    2,               0.835
 256,     1,   64,   23,      127,    2,               0.917
 256,    15,   64,   23,      127,    2,               0.915
 256,     2,   64,   23,      127,    2,               0.911
 256,    30,   64,   23,      127,    2,               0.907
 256,     3,   64,   23,      127,    2,                 0.9
 256,    45,   64,   23,      127,    2,               0.816
 256,     4,   64,   23,      127,    2,               0.912
 256,    60,   64,   23,      127,    2,                0.81
 256,     5,   64,   23,      127,    2,               0.904
 256,    75,   64,   23,      127,    2,               0.911
 256,     6,   64,   23,      127,    2,               0.898
 256,    90,   64,   23,      127,    2,               0.912
 256,     7,   64,   23,      127,    2,               0.909
 256,   105,   64,   23,      127,    2,                0.81
   1,     0,    0,   23,      127,    2,               0.858
   2,     0,    1,   23,      127,    2,                0.89
   3,     0,    2,   23,      127,    2,               0.877
   4,     0,    3,   23,      127,    2,               0.863
   5,     0,    4,   23,      127,    2,               0.863
   6,     0,    5,   23,      127,    2,               0.889
   7,     0,    6,   23,      127,    2,               0.898
   8,     0,    7,   23,      127,    2,               0.885
   9,     0,    8,   23,      127,    2,               0.863
  10,     0,    9,   23,      127,    2,               0.902
  11,     0,   10,   23,      127,    2,               0.865
  12,     0,   11,   23,      127,    2,               0.864
  13,     0,   12,   23,      127,    2,                0.87
  14,     0,   13,   23,      127,    2,               0.862
  15,     0,   14,   23,      127,    2,               0.861
  16,     0,   15,   23,      127,    2,               0.859
  17,     0,   16,   23,      127,    2,                0.87
  18,     0,   17,   23,      127,    2,               0.892
  19,     0,   18,   23,      127,    2,               0.874
  20,     0,   19,   23,      127,    2,               0.866
  21,     0,   20,   23,      127,    2,               0.877
  22,     0,   21,   23,      127,    2,               0.868
  23,     0,   22,   23,      127,    2,               0.884
  24,     0,   23,   23,      127,    2,               0.881
  25,     0,   24,   23,      127,    2,               0.872
  26,     0,   25,   23,      127,    2,               0.866
  27,     0,   26,   23,      127,    2,               0.881
  28,     0,   27,   23,      127,    2,                0.93
  29,     0,   28,   23,      127,    2,               0.886
  30,     0,   29,   23,      127,    2,               0.869
  31,     0,   30,   23,      127,    2,               0.869
  32,     0,   31,   23,      127,    2,               0.667
2048,     0,   32,   23,      127,    4,               0.858
2048,     1,   32,   23,      127,    4,               0.858
2048,     0,   64,   23,      127,    4,               0.838
2048,     2,   64,   23,      127,    4,               0.834
2048,     0,  128,   23,      127,    4,                0.85
2048,     3,  128,   23,      127,    4,               0.762
2048,     0,  256,   23,      127,    4,               0.874
2048,     4,  256,   23,      127,    4,               0.796
2048,     0,  512,   23,      127,    4,               0.691
2048,     5,  512,   23,      127,    4,               0.755
2048,     0, 1024,   23,      127,    4,               0.676
2048,     6, 1024,   23,      127,    4,               0.661
2048,     0, 2048,   23,      127,    4,               0.678
2048,     7, 2048,   23,      127,    4,               0.678
2048,     0, 4096,   23,      127,    4,               0.676
2048,     8, 4096,   23,      127,    4,               0.677
 256,     1,   64,   23,      127,    4,               0.875
 256,    15,   64,   23,      127,    4,               0.877
 256,     2,   64,   23,      127,    4,               0.875
 256,    30,   64,   23,      127,    4,               0.875
 256,     3,   64,   23,      127,    4,               0.878
 256,    45,   64,   23,      127,    4,               0.829
 256,     4,   64,   23,      127,    4,               0.876
 256,    60,   64,   23,      127,    4,               0.807
 256,     5,   64,   23,      127,    4,               0.874
 256,    75,   64,   23,      127,    4,               0.872
 256,     6,   64,   23,      127,    4,               0.874
 256,    90,   64,   23,      127,    4,               0.874
 256,     7,   64,   23,      127,    4,               0.873
 256,   105,   64,   23,      127,    4,               0.826
   1,     0,    0,   23,      127,    4,               0.863
   2,     0,    1,   23,      127,    4,               0.861
   3,     0,    2,   23,      127,    4,               0.863
   4,     0,    3,   23,      127,    4,               0.867
   5,     0,    4,   23,      127,    4,               0.866
   6,     0,    5,   23,      127,    4,               0.873
   7,     0,    6,   23,      127,    4,               0.873
   8,     0,    7,   23,      127,    4,               0.866
   9,     0,    8,   23,      127,    4,               0.861
  10,     0,    9,   23,      127,    4,               0.861
  11,     0,   10,   23,      127,    4,               0.857
  12,     0,   11,   23,      127,    4,               0.864
  13,     0,   12,   23,      127,    4,                0.86
  14,     0,   13,   23,      127,    4,               0.859
  15,     0,   14,   23,      127,    4,               0.854
  16,     0,   15,   23,      127,    4,               0.857
  17,     0,   16,   23,      127,    4,               0.881
  18,     0,   17,   23,      127,    4,               0.863
  19,     0,   18,   23,      127,    4,                0.86
  20,     0,   19,   23,      127,    4,               0.906
  21,     0,   20,   23,      127,    4,               0.924
  22,     0,   21,   23,      127,    4,               0.885
  23,     0,   22,   23,      127,    4,               0.861
  24,     0,   23,   23,      127,    4,               0.907
  25,     0,   24,   23,      127,    4,               0.909
  26,     0,   25,   23,      127,    4,               0.863
  27,     0,   26,   23,      127,    4,               0.862
  28,     0,   27,   23,      127,    4,               0.887
  29,     0,   28,   23,      127,    4,               0.879
  30,     0,   29,   23,      127,    4,               0.932
  31,     0,   30,   23,      127,    4,               0.895
  32,     0,   31,   23,      127,    4,               0.666
2048,     0,   32,   23,      127,    8,               0.865
2048,     1,   32,   23,      127,    8,               0.892
2048,     0,   64,   23,      127,    8,                0.85
2048,     2,   64,   23,      127,    8,               0.834
2048,     0,  128,   23,      127,    8,               0.823
2048,     3,  128,   23,      127,    8,               0.809
2048,     0,  256,   23,      127,    8,                0.84
2048,     4,  256,   23,      127,    8,               0.738
2048,     0,  512,   23,      127,    8,               0.656
2048,     5,  512,   23,      127,    8,               0.644
2048,     0, 1024,   23,      127,    8,               0.705
2048,     6, 1024,   23,      127,    8,               0.708
2048,     0, 2048,   23,      127,    8,               0.701
2048,     7, 2048,   23,      127,    8,                 0.7
2048,     0, 4096,   23,      127,    8,                0.68
2048,     8, 4096,   23,      127,    8,               0.678
 256,     1,   64,   23,      127,    8,               0.881
 256,    15,   64,   23,      127,    8,               0.879
 256,     2,   64,   23,      127,    8,               0.878
 256,    30,   64,   23,      127,    8,               0.877
 256,     3,   64,   23,      127,    8,                0.88
 256,    45,   64,   23,      127,    8,               0.829
 256,     4,   64,   23,      127,    8,               0.883
 256,    60,   64,   23,      127,    8,               0.808
 256,     5,   64,   23,      127,    8,               0.875
 256,    75,   64,   23,      127,    8,               0.877
 256,     6,   64,   23,      127,    8,               0.874
 256,    90,   64,   23,      127,    8,               0.874
 256,     7,   64,   23,      127,    8,               0.874
 256,   105,   64,   23,      127,    8,                0.83
   1,     0,    0,   23,      127,    8,               0.862
   2,     0,    1,   23,      127,    8,               0.865
   3,     0,    2,   23,      127,    8,               0.866
   4,     0,    3,   23,      127,    8,               0.863
   5,     0,    4,   23,      127,    8,               0.874
   6,     0,    5,   23,      127,    8,                0.87
   7,     0,    6,   23,      127,    8,                0.87
   8,     0,    7,   23,      127,    8,               0.864
   9,     0,    8,   23,      127,    8,                0.87
  10,     0,    9,   23,      127,    8,               0.861
  11,     0,   10,   23,      127,    8,               0.862
  12,     0,   11,   23,      127,    8,                0.87
  13,     0,   12,   23,      127,    8,               0.858
  14,     0,   13,   23,      127,    8,                0.86
  15,     0,   14,   23,      127,    8,               0.863
  16,     0,   15,   23,      127,    8,               0.866
  17,     0,   16,   23,      127,    8,                0.86
  18,     0,   17,   23,      127,    8,               0.887
  19,     0,   18,   23,      127,    8,               0.858
  20,     0,   19,   23,      127,    8,               0.891
  21,     0,   20,   23,      127,    8,               0.874
  22,     0,   21,   23,      127,    8,               0.891
  23,     0,   22,   23,      127,    8,               0.873
  24,     0,   23,   23,      127,    8,               0.895
  25,     0,   24,   23,      127,    8,               0.884
  26,     0,   25,   23,      127,    8,               0.878
  27,     0,   26,   23,      127,    8,               0.878
  28,     0,   27,   23,      127,    8,               0.891
  29,     0,   28,   23,      127,    8,                0.91
  30,     0,   29,   23,      127,    8,               0.881
  31,     0,   30,   23,      127,    8,               0.917
  32,     0,   31,   23,      127,    8,               0.667
2048,     0,   32,   23,      127,   16,                0.86
2048,     1,   32,   23,      127,   16,               0.847
2048,     0,   64,   23,      127,   16,               0.846
2048,     2,   64,   23,      127,   16,               0.852
2048,     0,  128,   23,      127,   16,                0.82
2048,     3,  128,   23,      127,   16,               0.751
2048,     0,  256,   23,      127,   16,               0.788
2048,     4,  256,   23,      127,   16,               0.712
2048,     0,  512,   23,      127,   16,               0.524
2048,     5,  512,   23,      127,   16,               0.517
2048,     0, 1024,   23,      127,   16,               0.583
2048,     6, 1024,   23,      127,   16,               0.682
2048,     0, 2048,   23,      127,   16,                0.77
2048,     7, 2048,   23,      127,   16,               0.659
2048,     0, 4096,   23,      127,   16,                 0.7
2048,     8, 4096,   23,      127,   16,                 0.7
 256,     1,   64,   23,      127,   16,               0.798
 256,    15,   64,   23,      127,   16,               0.873
 256,     2,   64,   23,      127,   16,               0.875
 256,    30,   64,   23,      127,   16,               0.877
 256,     3,   64,   23,      127,   16,               0.875
 256,    45,   64,   23,      127,   16,               0.834
 256,     4,   64,   23,      127,   16,               0.873
 256,    60,   64,   23,      127,   16,               0.809
 256,     5,   64,   23,      127,   16,               0.879
 256,    75,   64,   23,      127,   16,               0.884
 256,     6,   64,   23,      127,   16,               0.874
 256,    90,   64,   23,      127,   16,               0.876
 256,     7,   64,   23,      127,   16,               0.876
 256,   105,   64,   23,      127,   16,               0.827
   1,     0,    0,   23,      127,   16,               0.859
   2,     0,    1,   23,      127,   16,               0.864
   3,     0,    2,   23,      127,   16,               0.871
   4,     0,    3,   23,      127,   16,               0.869
   5,     0,    4,   23,      127,   16,               0.881
   6,     0,    5,   23,      127,   16,               0.869
   7,     0,    6,   23,      127,   16,               0.867
   8,     0,    7,   23,      127,   16,               0.877
   9,     0,    8,   23,      127,   16,               0.862
  10,     0,    9,   23,      127,   16,               0.861
  11,     0,   10,   23,      127,   16,               0.859
  12,     0,   11,   23,      127,   16,               0.858
  13,     0,   12,   23,      127,   16,               0.867
  14,     0,   13,   23,      127,   16,               0.857
  15,     0,   14,   23,      127,   16,               0.858
  16,     0,   15,   23,      127,   16,               0.857
  17,     0,   16,   23,      127,   16,               0.858
  18,     0,   17,   23,      127,   16,               0.867
  19,     0,   18,   23,      127,   16,               0.875
  20,     0,   19,   23,      127,   16,               0.868
  21,     0,   20,   23,      127,   16,               0.861
  22,     0,   21,   23,      127,   16,               0.868
  23,     0,   22,   23,      127,   16,               0.866
  24,     0,   23,   23,      127,   16,               0.858
  25,     0,   24,   23,      127,   16,               0.859
  26,     0,   25,   23,      127,   16,               0.857
  27,     0,   26,   23,      127,   16,               0.866
  28,     0,   27,   23,      127,   16,               0.875
  29,     0,   28,   23,      127,   16,               0.896
  30,     0,   29,   23,      127,   16,               0.889
  31,     0,   30,   23,      127,   16,               0.903
  32,     0,   31,   23,      127,   16,               0.667

 sysdeps/x86_64/multiarch/strrchr-avx2.S | 415 +++++++++++++++---------
 1 file changed, 258 insertions(+), 157 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 1df2adfad0..9d1e45defc 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
 # ifdef USE_AS_WCSRCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
@@ -41,196 +45,293 @@
 # endif
 
 # define VEC_SIZE	32
+# define PAGE_SIZE	4096
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
-	movd	%esi, %xmm4
-	movl	%edi, %ecx
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+	movd	%esi, %xmm7
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMM4.  */
-	VPBROADCAST %xmm4, %ymm4
+	VPBROADCAST %xmm7, %ymm7
 	vpxor	%xmm0, %xmm0, %xmm0
 
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Shift here instead of `andl` to save code size (saves a fetch
+	   block).  */
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(cross_page)
 
+L(page_cross_continue):
 	vmovdqu	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	addq	$VEC_SIZE, %rdi
+	/* Check end of string match.  */
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	/* Only check match with search CHAR if needed.  */
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Check if match before first zero.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
+	.p2align 4,, 10
+L(first_vec_x1):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	.p2align 4,, 4
+L(first_vec_x0_test):
+	VPCMPEQ	%ymm1, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	testl	%eax, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret1):
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 10
+L(first_vec_x0_x1_test):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec)
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
-	testl	%ecx, %ecx
-	jnz	L(return_null)
 
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
+
 
 	.p2align 4
-L(first_vec):
-	/* Check if there is a nul CHAR.  */
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+
+	/* Align src.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqu	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
 	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
+	jnz	L(first_vec_x1)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
+	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
+	VPCMPEQ	%ymm3, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	addq	$(VEC_SIZE + 1), %rdi
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %edx
-	vpmovmskb %ymm3, %eax
-	shrl	%cl, %edx
-	shrl	%cl, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
+L(first_aligned_loop):
+	/* Do 2x VEC at a time. Any more and the cost of finding the
+	   match outweighs loop benefit.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm8
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm8, %ymm0, %ymm8
+	vpor	%ymm5, %ymm8, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
+	/* No zero or search CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
+	jz	L(first_aligned_loop)
 
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
+	/* If no zero CHAR then go to second loop (this allows us to
+	   throw away all prior work).  */
+	vpmovmskb %ymm8, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_prep)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	/* Search char could be zero so we need to get the true match.
+	 */
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(first_aligned_loop_return)
 
-	.p2align 4
-L(aligned_loop):
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	add	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
+	.p2align 4,, 4
+L(first_vec_x1_or_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm3
+	VPCMPEQ	%ymm2, %ymm7, %ymm2
 	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
-
-	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a nul CHAR in a loop.  */
-	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	vpmovmskb %ymm2, %edx
+	/* Use add for macro-fusion.  */
+	addq	%rax, %rdx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	salq	$32, %rax
+	addq	%rdx, %rax
+	bsrq	%rax, %rax
+	leaq	1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(first_aligned_loop_return):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(first_vec_x1_or_x2)
+
+	bsrq	%rax, %rax
+	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
 # ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %eax
+	andq	$-CHAR_SIZE, %rax
 # endif
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
 
+	/* Search char cannot be zero.  */
 	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a nul CHAR.  */
-	vpmovmskb %ymm2, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+L(second_aligned_loop_set_furthest_match):
+	/* Save VEC and pointer from most recent match.  */
+L(second_aligned_loop_prep):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	vmovdqu	%ymm6, %ymm2
+	vmovdqu	%ymm10, %ymm3
 
 	.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+L(second_aligned_loop):
+	/* Search 2x at a time.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm1
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpor	%ymm5, %ymm1, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
 	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a nul CHAR.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+	jz	L(second_aligned_loop)
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_set_furthest_match)
+	vpmovmskb %ymm5, %eax
 	testl	%eax, %eax
-	/* Return null pointer if the nul CHAR comes first.  */
-	jz	L(return_null)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	jnz	L(return_new_match)
+
+	/* This is the hot path. We know CHAR is in bounds and that
+	   ymm3/ymm2 have the latest match.  */
+	.p2align 4,, 4
+L(return_old_match):
+	vpmovmskb %ymm3, %eax
+	vpmovmskb %ymm2, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
+	/* Last iteration also potentially has a match.  */
+	.p2align 4,, 8
+L(return_new_match):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(return_old_match)
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
 	VZEROUPPER_RETURN
 
-END (STRRCHR)
+	.p2align 4,, 4
+L(cross_page):
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	shrxl	%edi, %eax, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret2)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret2):
+	VZEROUPPER_RETURN
+END(STRRCHR)
 #endif
-- 
2.25.1


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v1 5/5] x86: Optimize {str|wcs}rchr-evex
  2022-04-21  3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
                   ` (2 preceding siblings ...)
  2022-04-21  3:14 ` [PATCH v1 4/5] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
@ 2022-04-21  3:14 ` Noah Goldstein
  2022-04-21 20:12 ` [PATCH v1 1/5] benchtests: Improve bench-strrchr H.J. Lu
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21  3:14 UTC (permalink / raw)
  To: libc-alpha

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the number of comparisons against the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.755
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
---
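One building block both the avx2 and evex versions lean on in the
first-vector paths is filtering the CHAR-match mask down to matches at
or before the first null with blsmsk before taking bsr.  Below is a
minimal C sketch of that bit trick; the function name and the
__builtin_clz stand-in for bsr are illustrative, not from the patch.

#include <stdint.h>

/* cmask: bit i set if element i equals the search CHAR.
   zmask: bit i set if element i is the null terminator.
   Returns the index of the last match at or before the first null,
   or -1 if there is none.  */
static inline int
last_match_before_nul (uint32_t cmask, uint32_t zmask)
{
  /* blsmsk (zmask) == zmask ^ (zmask - 1): every bit up to and
     including the lowest set bit.  If zmask == 0 this is all ones,
     i.e. no null limits the matches.  */
  uint32_t limit = zmask ^ (zmask - 1);
  uint32_t valid = cmask & limit;
  if (valid == 0)
    return -1;
  /* Equivalent of bsr: index of the highest set bit.  */
  return 31 - __builtin_clz (valid);
}
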
Results For: strrchr

Geometric Mean of N=30 runs.

Geometric Mean of all benchmarks New / Old: 0.755
Benchmark performance on Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

 len, align,  pos, seek, max_char, freq, New Time / Old Time
2048,     0,   32,    0,      127,    1,               0.669
2048,     1,   32,    0,      127,    1,               0.672
2048,     0,   64,    0,      127,    1,               0.579
2048,     2,   64,    0,      127,    1,               0.579
2048,     0,  128,    0,      127,    1,               0.828
2048,     3,  128,    0,      127,    1,               0.827
2048,     0,  256,    0,      127,    1,               0.693
2048,     4,  256,    0,      127,    1,               0.692
2048,     0,  512,    0,      127,    1,               0.619
2048,     5,  512,    0,      127,    1,               0.622
2048,     0, 1024,    0,      127,    1,               0.626
2048,     6, 1024,    0,      127,    1,               0.627
2048,     0, 2048,    0,      127,    1,                0.85
2048,     7, 2048,    0,      127,    1,               0.855
2048,     0, 4096,    0,      127,    1,               0.849
2048,     8, 4096,    0,      127,    1,               0.848
 256,     1,   64,    0,      127,    1,               0.579
 256,    15,   64,    0,      127,    1,               0.579
 256,     2,   64,    0,      127,    1,               0.579
 256,    30,   64,    0,      127,    1,               0.579
 256,     3,   64,    0,      127,    1,               0.579
 256,    45,   64,    0,      127,    1,               0.551
 256,     4,   64,    0,      127,    1,               0.579
 256,    60,   64,    0,      127,    1,               0.553
 256,     5,   64,    0,      127,    1,               0.579
 256,    75,   64,    0,      127,    1,               0.578
 256,     6,   64,    0,      127,    1,               0.578
 256,    90,   64,    0,      127,    1,               0.579
 256,     7,   64,    0,      127,    1,               0.579
 256,   105,   64,    0,      127,    1,                0.55
   1,     0,    0,    0,      127,    1,               0.795
   2,     0,    1,    0,      127,    1,               0.797
   3,     0,    2,    0,      127,    1,               0.796
   4,     0,    3,    0,      127,    1,               0.792
   5,     0,    4,    0,      127,    1,               0.789
   6,     0,    5,    0,      127,    1,               0.791
   7,     0,    6,    0,      127,    1,               0.793
   8,     0,    7,    0,      127,    1,               0.789
   9,     0,    8,    0,      127,    1,               0.797
  10,     0,    9,    0,      127,    1,               0.788
  11,     0,   10,    0,      127,    1,               0.796
  12,     0,   11,    0,      127,    1,               0.793
  13,     0,   12,    0,      127,    1,               0.797
  14,     0,   13,    0,      127,    1,               0.795
  15,     0,   14,    0,      127,    1,               0.795
  16,     0,   15,    0,      127,    1,               0.791
  17,     0,   16,    0,      127,    1,               0.798
  18,     0,   17,    0,      127,    1,                 0.8
  19,     0,   18,    0,      127,    1,               0.797
  20,     0,   19,    0,      127,    1,               0.798
  21,     0,   20,    0,      127,    1,               0.797
  22,     0,   21,    0,      127,    1,               0.796
  23,     0,   22,    0,      127,    1,               0.792
  24,     0,   23,    0,      127,    1,               0.791
  25,     0,   24,    0,      127,    1,               0.794
  26,     0,   25,    0,      127,    1,               0.797
  27,     0,   26,    0,      127,    1,               0.793
  28,     0,   27,    0,      127,    1,                0.79
  29,     0,   28,    0,      127,    1,                0.79
  30,     0,   29,    0,      127,    1,               0.791
  31,     0,   30,    0,      127,    1,               0.791
  32,     0,   31,    0,      127,    1,                0.79
2048,     0,   32,   23,      127,    1,               0.734
2048,     1,   32,   23,      127,    1,               0.748
2048,     0,   64,   23,      127,    1,               0.759
2048,     2,   64,   23,      127,    1,               0.753
2048,     0,  128,   23,      127,    1,               0.834
2048,     3,  128,   23,      127,    1,               0.835
2048,     0,  256,   23,      127,    1,               0.789
2048,     4,  256,   23,      127,    1,               0.791
2048,     0,  512,   23,      127,    1,               0.882
2048,     5,  512,   23,      127,    1,               0.861
2048,     0, 1024,   23,      127,    1,               0.643
2048,     6, 1024,   23,      127,    1,               0.643
2048,     0, 2048,   23,      127,    1,               0.931
2048,     7, 2048,   23,      127,    1,               0.929
2048,     0, 4096,   23,      127,    1,               0.922
2048,     8, 4096,   23,      127,    1,               0.934
 256,     1,   64,   23,      127,    1,                0.73
 256,    15,   64,   23,      127,    1,               0.729
 256,     2,   64,   23,      127,    1,               0.725
 256,    30,   64,   23,      127,    1,               0.728
 256,     3,   64,   23,      127,    1,               0.727
 256,    45,   64,   23,      127,    1,               0.749
 256,     4,   64,   23,      127,    1,                0.73
 256,    60,   64,   23,      127,    1,               0.752
 256,     5,   64,   23,      127,    1,               0.729
 256,    75,   64,   23,      127,    1,               0.727
 256,     6,   64,   23,      127,    1,               0.693
 256,    90,   64,   23,      127,    1,                0.73
 256,     7,   64,   23,      127,    1,                0.73
 256,   105,   64,   23,      127,    1,               0.751
   1,     0,    0,   23,      127,    1,               0.797
   2,     0,    1,   23,      127,    1,               0.794
   3,     0,    2,   23,      127,    1,               0.797
   4,     0,    3,   23,      127,    1,               0.792
   5,     0,    4,   23,      127,    1,               0.781
   6,     0,    5,   23,      127,    1,               0.783
   7,     0,    6,   23,      127,    1,                0.79
   8,     0,    7,   23,      127,    1,               0.791
   9,     0,    8,   23,      127,    1,               0.794
  10,     0,    9,   23,      127,    1,               0.795
  11,     0,   10,   23,      127,    1,               0.795
  12,     0,   11,   23,      127,    1,               0.795
  13,     0,   12,   23,      127,    1,               0.794
  14,     0,   13,   23,      127,    1,               0.792
  15,     0,   14,   23,      127,    1,                0.79
  16,     0,   15,   23,      127,    1,               0.793
  17,     0,   16,   23,      127,    1,               0.795
  18,     0,   17,   23,      127,    1,               0.797
  19,     0,   18,   23,      127,    1,               0.796
  20,     0,   19,   23,      127,    1,               0.796
  21,     0,   20,   23,      127,    1,               0.794
  22,     0,   21,   23,      127,    1,               0.794
  23,     0,   22,   23,      127,    1,               0.793
  24,     0,   23,   23,      127,    1,               0.792
  25,     0,   24,   23,      127,    1,               0.795
  26,     0,   25,   23,      127,    1,               0.792
  27,     0,   26,   23,      127,    1,               0.789
  28,     0,   27,   23,      127,    1,               0.794
  29,     0,   28,   23,      127,    1,               0.793
  30,     0,   29,   23,      127,    1,               0.795
  31,     0,   30,   23,      127,    1,               0.797
  32,     0,   31,   23,      127,    1,               0.775
2048,     0,   32,   23,      127,    2,               0.736
2048,     1,   32,   23,      127,    2,               0.738
2048,     0,   64,   23,      127,    2,               0.895
2048,     2,   64,   23,      127,    2,               0.897
2048,     0,  128,   23,      127,    2,               0.852
2048,     3,  128,   23,      127,    2,               0.845
2048,     0,  256,   23,      127,    2,               0.755
2048,     4,  256,   23,      127,    2,               0.712
2048,     0,  512,   23,      127,    2,               0.857
2048,     5,  512,   23,      127,    2,               0.849
2048,     0, 1024,   23,      127,    2,               0.626
2048,     6, 1024,   23,      127,    2,               0.661
2048,     0, 2048,   23,      127,    2,                0.67
2048,     7, 2048,   23,      127,    2,                0.67
2048,     0, 4096,   23,      127,    2,               0.928
2048,     8, 4096,   23,      127,    2,               0.935
 256,     1,   64,   23,      127,    2,               0.693
 256,    15,   64,   23,      127,    2,               0.692
 256,     2,   64,   23,      127,    2,               0.693
 256,    30,   64,   23,      127,    2,               0.692
 256,     3,   64,   23,      127,    2,               0.692
 256,    45,   64,   23,      127,    2,               0.701
 256,     4,   64,   23,      127,    2,               0.692
 256,    60,   64,   23,      127,    2,               0.701
 256,     5,   64,   23,      127,    2,                0.69
 256,    75,   64,   23,      127,    2,               0.693
 256,     6,   64,   23,      127,    2,               0.691
 256,    90,   64,   23,      127,    2,               0.692
 256,     7,   64,   23,      127,    2,               0.693
 256,   105,   64,   23,      127,    2,               0.701
   1,     0,    0,   23,      127,    2,               0.797
   2,     0,    1,   23,      127,    2,               0.787
   3,     0,    2,   23,      127,    2,               0.797
   4,     0,    3,   23,      127,    2,               0.793
   5,     0,    4,   23,      127,    2,               0.792
   6,     0,    5,   23,      127,    2,               0.795
   7,     0,    6,   23,      127,    2,               0.791
   8,     0,    7,   23,      127,    2,               0.792
   9,     0,    8,   23,      127,    2,               0.796
  10,     0,    9,   23,      127,    2,               0.797
  11,     0,   10,   23,      127,    2,               0.797
  12,     0,   11,   23,      127,    2,               0.798
  13,     0,   12,   23,      127,    2,               0.799
  14,     0,   13,   23,      127,    2,               0.796
  15,     0,   14,   23,      127,    2,               0.796
  16,     0,   15,   23,      127,    2,               0.794
  17,     0,   16,   23,      127,    2,               0.795
  18,     0,   17,   23,      127,    2,               0.797
  19,     0,   18,   23,      127,    2,               0.793
  20,     0,   19,   23,      127,    2,               0.795
  21,     0,   20,   23,      127,    2,               0.794
  22,     0,   21,   23,      127,    2,               0.794
  23,     0,   22,   23,      127,    2,               0.796
  24,     0,   23,   23,      127,    2,               0.794
  25,     0,   24,   23,      127,    2,               0.794
  26,     0,   25,   23,      127,    2,               0.794
  27,     0,   26,   23,      127,    2,               0.788
  28,     0,   27,   23,      127,    2,               0.791
  29,     0,   28,   23,      127,    2,               0.791
  30,     0,   29,   23,      127,    2,               0.793
  31,     0,   30,   23,      127,    2,               0.796
  32,     0,   31,   23,      127,    2,               0.628
2048,     0,   32,   23,      127,    4,               0.742
2048,     1,   32,   23,      127,    4,               0.742
2048,     0,   64,   23,      127,    4,               0.899
2048,     2,   64,   23,      127,    4,               0.912
2048,     0,  128,   23,      127,    4,               0.783
2048,     3,  128,   23,      127,    4,               0.815
2048,     0,  256,   23,      127,    4,               0.854
2048,     4,  256,   23,      127,    4,               0.858
2048,     0,  512,   23,      127,    4,               0.907
2048,     5,  512,   23,      127,    4,               0.873
2048,     0, 1024,   23,      127,    4,               0.657
2048,     6, 1024,   23,      127,    4,               0.653
2048,     0, 2048,   23,      127,    4,               0.666
2048,     7, 2048,   23,      127,    4,               0.667
2048,     0, 4096,   23,      127,    4,                0.67
2048,     8, 4096,   23,      127,    4,                0.67
 256,     1,   64,   23,      127,    4,               0.686
 256,    15,   64,   23,      127,    4,               0.687
 256,     2,   64,   23,      127,    4,               0.687
 256,    30,   64,   23,      127,    4,               0.687
 256,     3,   64,   23,      127,    4,               0.687
 256,    45,   64,   23,      127,    4,               0.672
 256,     4,   64,   23,      127,    4,               0.687
 256,    60,   64,   23,      127,    4,               0.701
 256,     5,   64,   23,      127,    4,               0.687
 256,    75,   64,   23,      127,    4,               0.686
 256,     6,   64,   23,      127,    4,               0.687
 256,    90,   64,   23,      127,    4,               0.686
 256,     7,   64,   23,      127,    4,                0.69
 256,   105,   64,   23,      127,    4,               0.672
   1,     0,    0,   23,      127,    4,               0.798
   2,     0,    1,   23,      127,    4,               0.791
   3,     0,    2,   23,      127,    4,               0.792
   4,     0,    3,   23,      127,    4,               0.795
   5,     0,    4,   23,      127,    4,               0.791
   6,     0,    5,   23,      127,    4,               0.793
   7,     0,    6,   23,      127,    4,                0.78
   8,     0,    7,   23,      127,    4,               0.791
   9,     0,    8,   23,      127,    4,               0.788
  10,     0,    9,   23,      127,    4,               0.798
  11,     0,   10,   23,      127,    4,               0.796
  12,     0,   11,   23,      127,    4,               0.794
  13,     0,   12,   23,      127,    4,               0.795
  14,     0,   13,   23,      127,    4,               0.793
  15,     0,   14,   23,      127,    4,                 0.8
  16,     0,   15,   23,      127,    4,               0.796
  17,     0,   16,   23,      127,    4,               0.796
  18,     0,   17,   23,      127,    4,               0.796
  19,     0,   18,   23,      127,    4,               0.798
  20,     0,   19,   23,      127,    4,               0.796
  21,     0,   20,   23,      127,    4,               0.796
  22,     0,   21,   23,      127,    4,               0.796
  23,     0,   22,   23,      127,    4,               0.801
  24,     0,   23,   23,      127,    4,               0.799
  25,     0,   24,   23,      127,    4,               0.795
  26,     0,   25,   23,      127,    4,               0.793
  27,     0,   26,   23,      127,    4,               0.796
  28,     0,   27,   23,      127,    4,               0.794
  29,     0,   28,   23,      127,    4,               0.798
  30,     0,   29,   23,      127,    4,               0.795
  31,     0,   30,   23,      127,    4,               0.797
  32,     0,   31,   23,      127,    4,               0.628
2048,     0,   32,   23,      127,    8,               0.738
2048,     1,   32,   23,      127,    8,               0.747
2048,     0,   64,   23,      127,    8,               0.905
2048,     2,   64,   23,      127,    8,               0.906
2048,     0,  128,   23,      127,    8,               0.822
2048,     3,  128,   23,      127,    8,               0.827
2048,     0,  256,   23,      127,    8,               0.825
2048,     4,  256,   23,      127,    8,               0.825
2048,     0,  512,   23,      127,    8,               0.851
2048,     5,  512,   23,      127,    8,               0.855
2048,     0, 1024,   23,      127,    8,               0.653
2048,     6, 1024,   23,      127,    8,               0.651
2048,     0, 2048,   23,      127,    8,               0.644
2048,     7, 2048,   23,      127,    8,               0.643
2048,     0, 4096,   23,      127,    8,                0.67
2048,     8, 4096,   23,      127,    8,                0.67
 256,     1,   64,   23,      127,    8,               0.686
 256,    15,   64,   23,      127,    8,               0.686
 256,     2,   64,   23,      127,    8,               0.686
 256,    30,   64,   23,      127,    8,               0.687
 256,     3,   64,   23,      127,    8,               0.686
 256,    45,   64,   23,      127,    8,               0.671
 256,     4,   64,   23,      127,    8,                0.69
 256,    60,   64,   23,      127,    8,               0.705
 256,     5,   64,   23,      127,    8,               0.688
 256,    75,   64,   23,      127,    8,               0.687
 256,     6,   64,   23,      127,    8,               0.692
 256,    90,   64,   23,      127,    8,               0.689
 256,     7,   64,   23,      127,    8,                0.69
 256,   105,   64,   23,      127,    8,               0.674
   1,     0,    0,   23,      127,    8,               0.798
   2,     0,    1,   23,      127,    8,               0.798
   3,     0,    2,   23,      127,    8,               0.797
   4,     0,    3,   23,      127,    8,               0.792
   5,     0,    4,   23,      127,    8,               0.795
   6,     0,    5,   23,      127,    8,               0.792
   7,     0,    6,   23,      127,    8,               0.792
   8,     0,    7,   23,      127,    8,               0.795
   9,     0,    8,   23,      127,    8,               0.799
  10,     0,    9,   23,      127,    8,               0.798
  11,     0,   10,   23,      127,    8,               0.795
  12,     0,   11,   23,      127,    8,               0.795
  13,     0,   12,   23,      127,    8,               0.797
  14,     0,   13,   23,      127,    8,               0.796
  15,     0,   14,   23,      127,    8,               0.795
  16,     0,   15,   23,      127,    8,               0.796
  17,     0,   16,   23,      127,    8,               0.798
  18,     0,   17,   23,      127,    8,               0.798
  19,     0,   18,   23,      127,    8,               0.795
  20,     0,   19,   23,      127,    8,               0.797
  21,     0,   20,   23,      127,    8,               0.797
  22,     0,   21,   23,      127,    8,               0.793
  23,     0,   22,   23,      127,    8,               0.797
  24,     0,   23,   23,      127,    8,                 0.8
  25,     0,   24,   23,      127,    8,               0.796
  26,     0,   25,   23,      127,    8,               0.796
  27,     0,   26,   23,      127,    8,               0.791
  28,     0,   27,   23,      127,    8,               0.795
  29,     0,   28,   23,      127,    8,               0.786
  30,     0,   29,   23,      127,    8,               0.797
  31,     0,   30,   23,      127,    8,               0.791
  32,     0,   31,   23,      127,    8,               0.628
2048,     0,   32,   23,      127,   16,               0.736
2048,     1,   32,   23,      127,   16,               0.737
2048,     0,   64,   23,      127,   16,               0.905
2048,     2,   64,   23,      127,   16,               0.908
2048,     0,  128,   23,      127,   16,               0.829
2048,     3,  128,   23,      127,   16,               0.824
2048,     0,  256,   23,      127,   16,               0.827
2048,     4,  256,   23,      127,   16,               0.825
2048,     0,  512,   23,      127,   16,               0.694
2048,     5,  512,   23,      127,   16,               0.687
2048,     0, 1024,   23,      127,   16,               0.568
2048,     6, 1024,   23,      127,   16,               0.667
2048,     0, 2048,   23,      127,   16,               0.766
2048,     7, 2048,   23,      127,   16,               0.781
2048,     0, 4096,   23,      127,   16,               0.646
2048,     8, 4096,   23,      127,   16,               0.646
 256,     1,   64,   23,      127,   16,               0.697
 256,    15,   64,   23,      127,   16,               0.686
 256,     2,   64,   23,      127,   16,               0.687
 256,    30,   64,   23,      127,   16,               0.687
 256,     3,   64,   23,      127,   16,               0.686
 256,    45,   64,   23,      127,   16,               0.672
 256,     4,   64,   23,      127,   16,               0.686
 256,    60,   64,   23,      127,   16,               0.701
 256,     5,   64,   23,      127,   16,               0.686
 256,    75,   64,   23,      127,   16,               0.686
 256,     6,   64,   23,      127,   16,               0.691
 256,    90,   64,   23,      127,   16,               0.687
 256,     7,   64,   23,      127,   16,               0.688
 256,   105,   64,   23,      127,   16,               0.674
   1,     0,    0,   23,      127,   16,               0.797
   2,     0,    1,   23,      127,   16,               0.798
   3,     0,    2,   23,      127,   16,               0.786
   4,     0,    3,   23,      127,   16,               0.792
   5,     0,    4,   23,      127,   16,               0.792
   6,     0,    5,   23,      127,   16,               0.795
   7,     0,    6,   23,      127,   16,               0.796
   8,     0,    7,   23,      127,   16,               0.798
   9,     0,    8,   23,      127,   16,               0.795
  10,     0,    9,   23,      127,   16,               0.797
  11,     0,   10,   23,      127,   16,               0.797
  12,     0,   11,   23,      127,   16,               0.797
  13,     0,   12,   23,      127,   16,               0.799
  14,     0,   13,   23,      127,   16,               0.798
  15,     0,   14,   23,      127,   16,               0.798
  16,     0,   15,   23,      127,   16,               0.796
  17,     0,   16,   23,      127,   16,               0.798
  18,     0,   17,   23,      127,   16,               0.796
  19,     0,   18,   23,      127,   16,               0.797
  20,     0,   19,   23,      127,   16,               0.797
  21,     0,   20,   23,      127,   16,               0.798
  22,     0,   21,   23,      127,   16,               0.797
  23,     0,   22,   23,      127,   16,               0.797
  24,     0,   23,   23,      127,   16,               0.797
  25,     0,   24,   23,      127,   16,               0.798
  26,     0,   25,   23,      127,   16,               0.794
  27,     0,   26,   23,      127,   16,               0.796
  28,     0,   27,   23,      127,   16,               0.796
  29,     0,   28,   23,      127,   16,               0.792
  30,     0,   29,   23,      127,   16,               0.788
  31,     0,   30,   23,      127,   16,                0.79
  32,     0,   31,   23,      127,   16,               0.628

 sysdeps/x86_64/multiarch/strrchr-evex.S | 441 ++++++++++++++----------
 1 file changed, 259 insertions(+), 182 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index adeddaed32..5cf9a8315b 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,319 @@
 #  define STRRCHR	__strrchr_evex
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+# define VMOVU	vmovdqu64
+# define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSRCHR
+#  define SHIFT_REG	esi
+
+#  define kunpck	kunpckbw
+#  define kmov_2x	kmovd
+#  define maskz_2x	ecx
+#  define maskm_2x	eax
+#  define CHAR_SIZE	4
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
-#  define SHIFT_REG	r8d
+#  define VPCMP	vpcmpd
 # else
+#  define SHIFT_REG	edi
+
+#  define kunpck	kunpckdq
+#  define kmov_2x	kmovq
+#  define maskz_2x	rcx
+#  define maskm_2x	rax
+
+#  define CHAR_SIZE	1
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
-#  define SHIFT_REG	ecx
+#  define VPCMP	vpcmpb
 # endif
 
 # define XMMZERO	xmm16
 # define YMMZERO	ymm16
 # define YMMMATCH	ymm17
-# define YMM1		ymm18
+# define YMMSAVE	ymm18
+
+# define YMM1	ymm19
+# define YMM2	ymm20
+# define YMM3	ymm21
+# define YMM4	ymm22
+# define YMM5	ymm23
+# define YMM6	ymm24
+# define YMM7	ymm25
+# define YMM8	ymm26
 
-# define VEC_SIZE	32
 
-	.section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
-	movl	%edi, %ecx
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+	.section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
 
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(cross_page_boundary)
 
+L(page_cross_continue):
 	VMOVU	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	VPTESTN	%YMM1, %YMM1, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-
-	addq	$VEC_SIZE, %rdi
-
-	testl	%eax, %eax
-	jnz	L(first_vec)
-
 	testl	%ecx, %ecx
-	jnz	L(return_null)
-
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(first_vec):
-	/* Check if there is a null byte.  */
-	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-
+	jz	L(aligned_more)
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
-	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
+L(ret0):
+	ret
 
-	VMOVA	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
+	/* Returns for first vec x1/x2/x3 have a hard-coded backward
+	   search path for earlier matches.  */
+	.p2align 4,, 6
+L(first_vec_x1):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+	.p2align 4,, 4
+L(first_vec_x0_test):
 	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %edx
 	kmovd	%k1, %eax
-
-	shrxl	%SHIFT_REG, %edx, %edx
-	shrxl	%SHIFT_REG, %eax, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
-
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	jz	L(ret1)
+	bsrl	%eax, %eax
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rsi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rsi, %rax
+# endif
+L(ret1):
+	ret
 
-	.p2align 4
-L(aligned_loop):
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	.p2align 4,, 10
+L(first_vec_x1_or_x2):
+	VPCMP	$0, %YMM3, %YMMMATCH, %k3
+	VPCMP	$0, %YMM2, %YMMMATCH, %k2
+	kortestd %k2, %k3
+	jz	L(first_vec_x0_test)
+
+	kunpck	%k2, %k3, %k3
+	kmovq	%k3, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 6
+L(first_vec_x3):
+	VPCMP	$0, %YMMMATCH, %YMM4, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_or_x2)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	add	$VEC_SIZE, %rdi
+	.p2align 4,, 6
+L(first_vec_x0_x1_test):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMP	$0, %YMMMATCH, %YMM3, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	.p2align 4
+L(aligned_more):
+	/* Need to keep original pointer in case YMM1 has last match.  */
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rdi
+	VMOVU	VEC_SIZE(%rdi), %YMM2
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
+	VPTESTN	%YMM3, %YMM3, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
+	VPTESTN	%YMM4, %YMM4, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
+	movq	%rdi, %r8
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x3)
 
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a null byte in a loop.  */
+L(first_aligned_loop):
+	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
+	   they don't store a match.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
+	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
+
+	VPCMP	$0, %YMM5, %YMMMATCH, %k2
+	vpxord	%YMM6, %YMMMATCH, %YMM7
+
+	VPMIN	%YMM5, %YMM6, %YMM8
+	VPMIN	%YMM8, %YMM7, %YMM7
+
+	VPTESTN	%YMM7, %YMM7, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(first_aligned_loop)
+
+	VPCMP	$0, %YMM6, %YMMMATCH, %k3
+	VPTESTN	%YMM8, %YMM8, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_prep)
+
+	kortestd %k2, %k3
+	jnz	L(return_first_aligned_loop)
+
+	.p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+	VPCMP	$0, %YMM4, %YMMMATCH, %k4
+	kmovd	%k4, %eax
 	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	jz	L(first_vec_x1_or_x2)
 	bsrl	%eax, %eax
-# ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-# endif
+	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a null byte.  */
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
+	.p2align 4,, 8
+L(return_first_aligned_loop):
+	VPTESTN	%YMM5, %YMM5, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(first_vec_x1_or_x2_or_x3)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+	.p2align 4
+	/* We can throw away the work done for the first 4x checks here
+	   as we have a later match.  This is the 'fast' path, so to
+	   speak.  */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	kunpck	%k2, %k3, %k4
 
 	.p2align 4
-L(find_nul):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
+L(second_aligned_loop):
+	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
+	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
+
+	VPCMP	$0, %YMM1, %YMMMATCH, %k2
+	vpxord	%YMM2, %YMMMATCH, %YMM3
+
+	VPMIN	%YMM1, %YMM2, %YMM4
+	VPMIN	%YMM3, %YMM4, %YMM3
+
+	VPTESTN	%YMM3, %YMM3, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(second_aligned_loop)
+
+	VPCMP	$0, %YMM2, %YMMMATCH, %k3
+	VPTESTN	%YMM4, %YMM4, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_set_furthest_match)
+
+	kortestd %k2, %k3
+	/* Branch here because there is a significant advantage in terms
+	   of the output dependency chain in using edx.  */
+	jnz	L(return_new_match)
+L(return_old_match):
+	kmovq	%k4, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(return_new_match):
+	VPTESTN	%YMM1, %YMM1, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(return_old_match)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	/* This block is horribly aligned (% 16 == 15).  This is
+	   intentional.  The L(cross_page_boundary) block is exactly
+	   32 bytes of code size.  Ultimately this is a cold case, so we
+	   save code size by leaving it misaligned.  */
+L(cross_page_boundary):
+	xorq	%rdi, %rax
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+	VPTESTN	%YMM1, %YMM1, %k0
+	kmovd	%k0, %ecx
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	movl	%edi, %esi
+	andl	$(VEC_SIZE - 1), %esi
+	shrl	$2, %esi
 # endif
-	ret
+	shrxl	%SHIFT_REG, %ecx, %ecx
 
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a null byte.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* Return null pointer if the null byte comes first.  */
-	jz	L(return_null)
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	shrxl	%SHIFT_REG, %eax, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret3)
 	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	addq	%rdi, %rax
 # endif
+L(ret3):
 	ret
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
-	ret
-
-END (STRRCHR)
+END(STRRCHR)
 #endif
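
A note for readers following the new return paths above: they all rely on the
same bit trick -- take the bitmask of null positions, form the mask of all bits
up to and including the first null (blsmsk), AND it with the bitmask of CHAR
matches, and bsr the result to get the last match that precedes the
terminator.  Below is a rough C sketch of that single step; it is purely
illustrative, not part of the patch, and the real code additionally scales the
index by CHAR_SIZE for wcsrchr.

/* Sketch only: last occurrence of the search CHAR at or before the first
   null inside one chunk, given the two bitmasks produced by the vector
   compares.  Assumes null_mask != 0, i.e. the chunk holds the terminator.  */
static inline const char *
last_match_before_null (const char *chunk, unsigned int match_mask,
                        unsigned int null_mask)
{
  unsigned int upto_null = null_mask ^ (null_mask - 1); /* blsmsk  */
  match_mask &= upto_null;
  if (match_mask == 0)
    return NULL;                        /* No CHAR before the null.  */
  return chunk + (31 - __builtin_clz (match_mask));     /* bsr  */
}
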
-- 
2.25.1


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v1 1/5] benchtests: Improve bench-strrchr
  2022-04-21  3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
                   ` (3 preceding siblings ...)
  2022-04-21  3:14 ` [PATCH v1 5/5] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
@ 2022-04-21 20:12 ` H.J. Lu
  2022-04-21 22:07   ` Noah Goldstein
  2022-04-21 22:22 ` [PATCH v2 1/4] " Noah Goldstein
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-21 20:12 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1. Use json-lib for printing results.
> 2. Expose all parameters (before pos, seek_char, and max_char were
>    not printed).
> 3. Add benchmarks that test multiple occurrences of seek_char in the
>    string.
> ---
>  benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
>  1 file changed, 82 insertions(+), 44 deletions(-)
>
> diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> index abdae60c51..cceea77e1b 100644
> --- a/benchtests/bench-strrchr.c
> +++ b/benchtests/bench-strrchr.c
> @@ -23,6 +23,7 @@
>  # define TEST_NAME "strrchr"
>  #endif
>  #include "bench-string.h"
> +#include "json-lib.h"
>
>  #define BIG_CHAR MAX_CHAR
>
> @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
>  }
>
>  static void
> -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> +            CHAR *exp_res)
>  {
>    CHAR *res = CALL (impl, s, c);
>    size_t i, iters = INNER_LOOP_ITERS8;
> @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
>
>    if (res != exp_res)
>      {
> -      error (0, 0, "Wrong result in function %s %p %p", impl->name,
> -            res, exp_res);
> +      error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
> +            exp_res);

These changes aren't needed.

>        ret = 1;
>        return;
>      }
> @@ -72,24 +74,28 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
>      {
>        CALL (impl, s, c);
>      }
> -  TIMING_NOW (stop);
>
> +  TIMING_NOW (stop);

Not needed.

>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
> +  return;

Return isn't needed.

>  }
>
>  static void
> -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> +        int seek_char, int max_char, size_t freq)
>  /* For wcsrchr: align here means align not in bytes,
>     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
>     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
>  {
>    size_t i;
> +  size_t pos_chunk_sz = freq ? (pos / freq) : pos;
> +  size_t last_pos = len;
>    CHAR *result;
>    CHAR *buf = (CHAR *) buf1;
>
> -  align &= 7;
> +  align &= (getpagesize () - 1);

If we have such large alignments, the tests may be skipped.
Should we change it to 127 instead?
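
To make the concern concrete (a sketch of the effect, not an exact count):
with the alignments the current callers pass, the largest is i * 15, i.e.
105, so the getpagesize () mask changes nothing today.  The difference shows
up if someone later passes a large alignment, because the very next check in
this function,

  if ((align + len) * sizeof (CHAR) >= page_size)
    return;   /* The case is generated but never timed.  */

silently drops the call instead of wrapping the alignment.  A small cap such
as `align &= 127;` would at least keep the alignment itself from being what
pushes a case over that limit.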

>    if ((align + len) * sizeof (CHAR) >= page_size)
>      return;
>
> @@ -103,6 +109,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
>        if ((i > pos || pos >= len) && buf[align + i] == seek_char)
>         buf[align + i] = seek_char + 10 + (random () & 15);
>      }
> +
> +  if (pos_chunk_sz == 0 && pos)
> +    pos_chunk_sz = 1;
> +
> +  for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
> +    {
> +      buf[align + i] = seek_char;
> +      last_pos = i;
> +    }
> +
>    buf[align + len] = 0;
>
>    if (pos < len)
> @@ -110,66 +126,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
>        buf[align + pos] = seek_char;
>        result = (CHAR *) (buf + align + pos);
>      }
> +  else if (last_pos < len)
> +    result = (CHAR *) (buf + align + last_pos);
>    else if (seek_char == 0)
>      result = (CHAR *) (buf + align + len);
>    else
>      result = NULL;
>
> -  printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "len", len);
> +  json_attr_uint (json_ctx, "pos", pos);
> +  json_attr_uint (json_ctx, "align", align);
> +  json_attr_uint (json_ctx, "freq", freq);
> +  json_attr_uint (json_ctx, "seek", seek_char);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
> +    do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  test_main (void)
>  {
> -  size_t i;
> +  json_ctx_t json_ctx;
> +  size_t i, j;
> +  int seek;
>
>    test_init ();
> +  json_init (&json_ctx, 0, stdout);
>
> -  printf ("%20s", "");
> -  FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
>
> -  for (i = 1; i < 8; ++i)
> -    {
> -      do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
> -      do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
> -    }
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
>
> -  for (i = 1; i < 8; ++i)
> -    {
> -      do_test (i, 64, 256, 23, SMALL_CHAR);
> -      do_test (i, 64, 256, 23, BIG_CHAR);
> -    }
> -
> -  for (i = 0; i < 32; ++i)
> -    {
> -      do_test (0, i, i + 1, 23, SMALL_CHAR);
> -      do_test (0, i, i + 1, 23, BIG_CHAR);
> -    }
> +  json_array_begin (&json_ctx, "ifuncs");
> +  FOR_EACH_IMPL (impl, 0)
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
>
> -  for (i = 1; i < 8; ++i)
> -    {
> -      do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
> -      do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
> -    }
> +  json_array_begin (&json_ctx, "results");
>
> -  for (i = 1; i < 8; ++i)
> +  for (seek = 0; seek <= 23; seek += 23)
>      {
> -      do_test (i, 64, 256, 0, SMALL_CHAR);
> -      do_test (i, 64, 256, 0, BIG_CHAR);
> +      for (j = 1; j < 32; j += j)
> +       {
> +         for (i = 1; i < 9; ++i)
> +           {
> +             do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
> +           }
> +
> +         for (i = 1; i < 8; ++i)
> +           {
> +             do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
> +
> +             do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
> +           }
> +
> +         for (i = 0; i < 32; ++i)
> +           {
> +             do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
> +           }
> +         if (seek == 0)
> +           {
> +             break;
> +           }
> +       }
>      }
>
> -  for (i = 0; i < 32; ++i)
> -    {
> -      do_test (0, i, i + 1, 0, SMALL_CHAR);
> -      do_test (0, i, i + 1, 0, BIG_CHAR);
> -    }
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
>
>    return ret;
>  }
> --
> 2.25.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2
  2022-04-21  3:14 ` [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
@ 2022-04-21 20:26   ` H.J. Lu
  2022-04-21 20:57     ` Noah Goldstein
  0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-21 20:26 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code unrolls the main loop slightly without adding too much
> overhead and minimizes the comparisons for the search CHAR.
>
> Geometric Mean of all benchmarks New / Old: 0.741
> See email for all results.
>
> Full xcheck passes on x86_64 with and without multiarch enabled.
> ---
> Results For: strrchr
>
> Geometric Mean of N=30 runs.
>
> Geometric Mean of all benchmarks New / Old: 0.741
> Benchmarks performance on Tigerlake:
> https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
>
>  len, align,  pos, seek, max_char, freq, New Time / Old Time
> 2048,     0,   32,    0,      127,    1,               0.647
> 2048,     1,   32,    0,      127,    1,               0.621
> 2048,     0,   64,    0,      127,    1,               0.661
> 2048,     2,   64,    0,      127,    1,               0.655
> 2048,     0,  128,    0,      127,    1,                0.69
> 2048,     3,  128,    0,      127,    1,               0.689
> 2048,     0,  256,    0,      127,    1,               0.718
> 2048,     4,  256,    0,      127,    1,               0.718
> 2048,     0,  512,    0,      127,    1,               0.758
> 2048,     5,  512,    0,      127,    1,               0.754
> 2048,     0, 1024,    0,      127,    1,               1.029
> 2048,     6, 1024,    0,      127,    1,               1.032
> 2048,     0, 2048,    0,      127,    1,               0.826
> 2048,     7, 2048,    0,      127,    1,               0.834
> 2048,     0, 4096,    0,      127,    1,               0.825
> 2048,     8, 4096,    0,      127,    1,                0.83
>  256,     1,   64,    0,      127,    1,               0.657
>  256,    15,   64,    0,      127,    1,               0.657
>  256,     2,   64,    0,      127,    1,               0.657
>  256,    30,   64,    0,      127,    1,               0.523
>  256,     3,   64,    0,      127,    1,               0.657
>  256,    45,   64,    0,      127,    1,               0.654
>  256,     4,   64,    0,      127,    1,               0.657
>  256,    60,   64,    0,      127,    1,               0.526
>  256,     5,   64,    0,      127,    1,               0.658
>  256,    75,   64,    0,      127,    1,               0.658
>  256,     6,   64,    0,      127,    1,               0.655
>  256,    90,   64,    0,      127,    1,               0.523
>  256,     7,   64,    0,      127,    1,               0.655
>  256,   105,   64,    0,      127,    1,               0.654
>    1,     0,    0,    0,      127,    1,                0.98
>    2,     0,    1,    0,      127,    1,               0.978
>    3,     0,    2,    0,      127,    1,               0.975
>    4,     0,    3,    0,      127,    1,               0.976
>    5,     0,    4,    0,      127,    1,               0.977
>    6,     0,    5,    0,      127,    1,               0.981
>    7,     0,    6,    0,      127,    1,               0.982
>    8,     0,    7,    0,      127,    1,                0.98
>    9,     0,    8,    0,      127,    1,               0.978
>   10,     0,    9,    0,      127,    1,               0.981
>   11,     0,   10,    0,      127,    1,               0.984
>   12,     0,   11,    0,      127,    1,               0.982
>   13,     0,   12,    0,      127,    1,                0.98
>   14,     0,   13,    0,      127,    1,               0.978
>   15,     0,   14,    0,      127,    1,               0.979
>   16,     0,   15,    0,      127,    1,               0.986
>   17,     0,   16,    0,      127,    1,               0.529
>   18,     0,   17,    0,      127,    1,               0.566
>   19,     0,   18,    0,      127,    1,               0.575
>   20,     0,   19,    0,      127,    1,               0.573
>   21,     0,   20,    0,      127,    1,               0.579
>   22,     0,   21,    0,      127,    1,               0.595
>   23,     0,   22,    0,      127,    1,               0.585
>   24,     0,   23,    0,      127,    1,               0.586
>   25,     0,   24,    0,      127,    1,               0.587
>   26,     0,   25,    0,      127,    1,               0.592
>   27,     0,   26,    0,      127,    1,               0.595
>   28,     0,   27,    0,      127,    1,               0.592
>   29,     0,   28,    0,      127,    1,                 0.6
>   30,     0,   29,    0,      127,    1,               0.598
>   31,     0,   30,    0,      127,    1,               0.595
>   32,     0,   31,    0,      127,    1,               0.592
> 2048,     0,   32,   23,      127,    1,               0.827
> 2048,     1,   32,   23,      127,    1,               0.826
> 2048,     0,   64,   23,      127,    1,               0.824
> 2048,     2,   64,   23,      127,    1,               0.825
> 2048,     0,  128,   23,      127,    1,               0.829
> 2048,     3,  128,   23,      127,    1,               0.824
> 2048,     0,  256,   23,      127,    1,               0.832
> 2048,     4,  256,   23,      127,    1,               0.825
> 2048,     0,  512,   23,      127,    1,               0.831
> 2048,     5,  512,   23,      127,    1,               0.837
> 2048,     0, 1024,   23,      127,    1,               0.721
> 2048,     6, 1024,   23,      127,    1,               0.757
> 2048,     0, 2048,   23,      127,    1,               0.825
> 2048,     7, 2048,   23,      127,    1,               0.824
> 2048,     0, 4096,   23,      127,    1,               0.828
> 2048,     8, 4096,   23,      127,    1,               0.823
>  256,     1,   64,   23,      127,    1,               0.665
>  256,    15,   64,   23,      127,    1,               0.661
>  256,     2,   64,   23,      127,    1,               0.674
>  256,    30,   64,   23,      127,    1,               0.605
>  256,     3,   64,   23,      127,    1,               0.668
>  256,    45,   64,   23,      127,    1,               0.661
>  256,     4,   64,   23,      127,    1,               0.657
>  256,    60,   64,   23,      127,    1,               0.594
>  256,     5,   64,   23,      127,    1,               0.654
>  256,    75,   64,   23,      127,    1,               0.673
>  256,     6,   64,   23,      127,    1,               0.688
>  256,    90,   64,   23,      127,    1,                 0.6
>  256,     7,   64,   23,      127,    1,                0.66
>  256,   105,   64,   23,      127,    1,               0.654
>    1,     0,    0,   23,      127,    1,               0.981
>    2,     0,    1,   23,      127,    1,               0.976
>    3,     0,    2,   23,      127,    1,               0.983
>    4,     0,    3,   23,      127,    1,               0.984
>    5,     0,    4,   23,      127,    1,               0.973
>    6,     0,    5,   23,      127,    1,               0.987
>    7,     0,    6,   23,      127,    1,               0.977
>    8,     0,    7,   23,      127,    1,               0.979
>    9,     0,    8,   23,      127,    1,               0.981
>   10,     0,    9,   23,      127,    1,                0.98
>   11,     0,   10,   23,      127,    1,               0.983
>   12,     0,   11,   23,      127,    1,                0.98
>   13,     0,   12,   23,      127,    1,                0.98
>   14,     0,   13,   23,      127,    1,               0.977
>   15,     0,   14,   23,      127,    1,               0.982
>   16,     0,   15,   23,      127,    1,               0.581
>   17,     0,   16,   23,      127,    1,               0.551
>   18,     0,   17,   23,      127,    1,               0.555
>   19,     0,   18,   23,      127,    1,               0.586
>   20,     0,   19,   23,      127,    1,               0.585
>   21,     0,   20,   23,      127,    1,               0.582
>   22,     0,   21,   23,      127,    1,               0.571
>   23,     0,   22,   23,      127,    1,               0.576
>   24,     0,   23,   23,      127,    1,               0.581
>   25,     0,   24,   23,      127,    1,               0.589
>   26,     0,   25,   23,      127,    1,               0.593
>   27,     0,   26,   23,      127,    1,               0.595
>   28,     0,   27,   23,      127,    1,               0.583
>   29,     0,   28,   23,      127,    1,               0.595
>   30,     0,   29,   23,      127,    1,                0.58
>   31,     0,   30,   23,      127,    1,               0.594
>   32,     0,   31,   23,      127,    1,               0.665
> 2048,     0,   32,   23,      127,    2,               0.825
> 2048,     1,   32,   23,      127,    2,               0.818
> 2048,     0,   64,   23,      127,    2,               0.829
> 2048,     2,   64,   23,      127,    2,               0.828
> 2048,     0,  128,   23,      127,    2,               0.823
> 2048,     3,  128,   23,      127,    2,               0.825
> 2048,     0,  256,   23,      127,    2,               0.819
> 2048,     4,  256,   23,      127,    2,               0.828
> 2048,     0,  512,   23,      127,    2,               0.824
> 2048,     5,  512,   23,      127,    2,               0.827
> 2048,     0, 1024,   23,      127,    2,               0.813
> 2048,     6, 1024,   23,      127,    2,               0.834
> 2048,     0, 2048,   23,      127,    2,               0.927
> 2048,     7, 2048,   23,      127,    2,               0.923
> 2048,     0, 4096,   23,      127,    2,               0.818
> 2048,     8, 4096,   23,      127,    2,                0.82
>  256,     1,   64,   23,      127,    2,               0.693
>  256,    15,   64,   23,      127,    2,               0.686
>  256,     2,   64,   23,      127,    2,                0.69
>  256,    30,   64,   23,      127,    2,               0.611
>  256,     3,   64,   23,      127,    2,               0.692
>  256,    45,   64,   23,      127,    2,               0.685
>  256,     4,   64,   23,      127,    2,               0.688
>  256,    60,   64,   23,      127,    2,                 0.6
>  256,     5,   64,   23,      127,    2,                0.69
>  256,    75,   64,   23,      127,    2,               0.689
>  256,     6,   64,   23,      127,    2,               0.688
>  256,    90,   64,   23,      127,    2,               0.611
>  256,     7,   64,   23,      127,    2,                0.69
>  256,   105,   64,   23,      127,    2,               0.686
>    1,     0,    0,   23,      127,    2,               0.982
>    2,     0,    1,   23,      127,    2,               0.987
>    3,     0,    2,   23,      127,    2,               0.978
>    4,     0,    3,   23,      127,    2,               0.977
>    5,     0,    4,   23,      127,    2,               0.979
>    6,     0,    5,   23,      127,    2,               0.985
>    7,     0,    6,   23,      127,    2,               0.975
>    8,     0,    7,   23,      127,    2,               0.981
>    9,     0,    8,   23,      127,    2,               0.984
>   10,     0,    9,   23,      127,    2,               0.983
>   11,     0,   10,   23,      127,    2,               0.982
>   12,     0,   11,   23,      127,    2,               0.976
>   13,     0,   12,   23,      127,    2,               0.985
>   14,     0,   13,   23,      127,    2,               0.984
>   15,     0,   14,   23,      127,    2,                0.98
>   16,     0,   15,   23,      127,    2,               0.583
>   17,     0,   16,   23,      127,    2,               0.552
>   18,     0,   17,   23,      127,    2,               0.564
>   19,     0,   18,   23,      127,    2,               0.585
>   20,     0,   19,   23,      127,    2,               0.578
>   21,     0,   20,   23,      127,    2,               0.578
>   22,     0,   21,   23,      127,    2,               0.571
>   23,     0,   22,   23,      127,    2,               0.587
>   24,     0,   23,   23,      127,    2,               0.589
>   25,     0,   24,   23,      127,    2,               0.593
>   26,     0,   25,   23,      127,    2,               0.589
>   27,     0,   26,   23,      127,    2,               0.588
>   28,     0,   27,   23,      127,    2,               0.593
>   29,     0,   28,   23,      127,    2,               0.579
>   30,     0,   29,   23,      127,    2,               0.572
>   31,     0,   30,   23,      127,    2,               0.582
>   32,     0,   31,   23,      127,    2,               0.659
> 2048,     0,   32,   23,      127,    4,               0.822
> 2048,     1,   32,   23,      127,    4,               0.818
> 2048,     0,   64,   23,      127,    4,               0.826
> 2048,     2,   64,   23,      127,    4,               0.824
> 2048,     0,  128,   23,      127,    4,               0.833
> 2048,     3,  128,   23,      127,    4,               0.831
> 2048,     0,  256,   23,      127,    4,               0.826
> 2048,     4,  256,   23,      127,    4,               0.831
> 2048,     0,  512,   23,      127,    4,               0.834
> 2048,     5,  512,   23,      127,    4,                0.83
> 2048,     0, 1024,   23,      127,    4,               0.836
> 2048,     6, 1024,   23,      127,    4,               0.844
> 2048,     0, 2048,   23,      127,    4,               0.696
> 2048,     7, 2048,   23,      127,    4,               0.704
> 2048,     0, 4096,   23,      127,    4,               0.936
> 2048,     8, 4096,   23,      127,    4,               0.925
>  256,     1,   64,   23,      127,    4,               0.694
>  256,    15,   64,   23,      127,    4,                0.69
>  256,     2,   64,   23,      127,    4,               0.687
>  256,    30,   64,   23,      127,    4,               0.612
>  256,     3,   64,   23,      127,    4,               0.685
>  256,    45,   64,   23,      127,    4,               0.685
>  256,     4,   64,   23,      127,    4,               0.684
>  256,    60,   64,   23,      127,    4,               0.606
>  256,     5,   64,   23,      127,    4,                0.69
>  256,    75,   64,   23,      127,    4,               0.688
>  256,     6,   64,   23,      127,    4,                0.69
>  256,    90,   64,   23,      127,    4,               0.615
>  256,     7,   64,   23,      127,    4,               0.691
>  256,   105,   64,   23,      127,    4,               0.688
>    1,     0,    0,   23,      127,    4,               0.982
>    2,     0,    1,   23,      127,    4,               0.983
>    3,     0,    2,   23,      127,    4,               0.981
>    4,     0,    3,   23,      127,    4,               0.984
>    5,     0,    4,   23,      127,    4,               0.963
>    6,     0,    5,   23,      127,    4,               0.978
>    7,     0,    6,   23,      127,    4,               0.985
>    8,     0,    7,   23,      127,    4,               0.986
>    9,     0,    8,   23,      127,    4,               0.978
>   10,     0,    9,   23,      127,    4,               0.985
>   11,     0,   10,   23,      127,    4,               0.986
>   12,     0,   11,   23,      127,    4,               0.983
>   13,     0,   12,   23,      127,    4,               0.986
>   14,     0,   13,   23,      127,    4,                0.98
>   15,     0,   14,   23,      127,    4,               0.979
>   16,     0,   15,   23,      127,    4,               0.582
>   17,     0,   16,   23,      127,    4,               0.542
>   18,     0,   17,   23,      127,    4,               0.564
>   19,     0,   18,   23,      127,    4,               0.571
>   20,     0,   19,   23,      127,    4,               0.582
>   21,     0,   20,   23,      127,    4,               0.573
>   22,     0,   21,   23,      127,    4,               0.575
>   23,     0,   22,   23,      127,    4,               0.578
>   24,     0,   23,   23,      127,    4,                0.58
>   25,     0,   24,   23,      127,    4,               0.592
>   26,     0,   25,   23,      127,    4,               0.588
>   27,     0,   26,   23,      127,    4,               0.574
>   28,     0,   27,   23,      127,    4,               0.589
>   29,     0,   28,   23,      127,    4,                0.56
>   30,     0,   29,   23,      127,    4,               0.587
>   31,     0,   30,   23,      127,    4,               0.584
>   32,     0,   31,   23,      127,    4,               0.664
> 2048,     0,   32,   23,      127,    8,               0.826
> 2048,     1,   32,   23,      127,    8,               0.821
> 2048,     0,   64,   23,      127,    8,               0.828
> 2048,     2,   64,   23,      127,    8,               0.827
> 2048,     0,  128,   23,      127,    8,               0.833
> 2048,     3,  128,   23,      127,    8,                0.83
> 2048,     0,  256,   23,      127,    8,               0.855
> 2048,     4,  256,   23,      127,    8,               0.849
> 2048,     0,  512,   23,      127,    8,               0.849
> 2048,     5,  512,   23,      127,    8,               0.851
> 2048,     0, 1024,   23,      127,    8,               0.856
> 2048,     6, 1024,   23,      127,    8,               0.862
> 2048,     0, 2048,   23,      127,    8,               0.709
> 2048,     7, 2048,   23,      127,    8,               0.712
> 2048,     0, 4096,   23,      127,    8,               0.702
> 2048,     8, 4096,   23,      127,    8,               0.701
>  256,     1,   64,   23,      127,    8,               0.689
>  256,    15,   64,   23,      127,    8,               0.688
>  256,     2,   64,   23,      127,    8,               0.691
>  256,    30,   64,   23,      127,    8,               0.612
>  256,     3,   64,   23,      127,    8,               0.688
>  256,    45,   64,   23,      127,    8,               0.686
>  256,     4,   64,   23,      127,    8,               0.694
>  256,    60,   64,   23,      127,    8,               0.609
>  256,     5,   64,   23,      127,    8,                0.69
>  256,    75,   64,   23,      127,    8,                0.69
>  256,     6,   64,   23,      127,    8,               0.691
>  256,    90,   64,   23,      127,    8,               0.612
>  256,     7,   64,   23,      127,    8,               0.689
>  256,   105,   64,   23,      127,    8,               0.688
>    1,     0,    0,   23,      127,    8,                0.98
>    2,     0,    1,   23,      127,    8,               0.978
>    3,     0,    2,   23,      127,    8,                0.98
>    4,     0,    3,   23,      127,    8,               0.978
>    5,     0,    4,   23,      127,    8,               0.977
>    6,     0,    5,   23,      127,    8,               0.984
>    7,     0,    6,   23,      127,    8,               0.982
>    8,     0,    7,   23,      127,    8,               0.983
>    9,     0,    8,   23,      127,    8,               0.987
>   10,     0,    9,   23,      127,    8,               0.979
>   11,     0,   10,   23,      127,    8,               0.985
>   12,     0,   11,   23,      127,    8,               0.981
>   13,     0,   12,   23,      127,    8,                0.98
>   14,     0,   13,   23,      127,    8,               0.982
>   15,     0,   14,   23,      127,    8,               0.981
>   16,     0,   15,   23,      127,    8,               0.579
>   17,     0,   16,   23,      127,    8,               0.531
>   18,     0,   17,   23,      127,    8,               0.577
>   19,     0,   18,   23,      127,    8,               0.588
>   20,     0,   19,   23,      127,    8,               0.571
>   21,     0,   20,   23,      127,    8,               0.576
>   22,     0,   21,   23,      127,    8,                0.59
>   23,     0,   22,   23,      127,    8,               0.574
>   24,     0,   23,   23,      127,    8,               0.583
>   25,     0,   24,   23,      127,    8,               0.581
>   26,     0,   25,   23,      127,    8,               0.592
>   27,     0,   26,   23,      127,    8,               0.586
>   28,     0,   27,   23,      127,    8,               0.588
>   29,     0,   28,   23,      127,    8,               0.578
>   30,     0,   29,   23,      127,    8,               0.573
>   31,     0,   30,   23,      127,    8,               0.588
>   32,     0,   31,   23,      127,    8,               0.664
> 2048,     0,   32,   23,      127,   16,               0.825
> 2048,     1,   32,   23,      127,   16,               0.823
> 2048,     0,   64,   23,      127,   16,               0.831
> 2048,     2,   64,   23,      127,   16,               0.822
> 2048,     0,  128,   23,      127,   16,               0.831
> 2048,     3,  128,   23,      127,   16,               0.831
> 2048,     0,  256,   23,      127,   16,               0.849
> 2048,     4,  256,   23,      127,   16,                0.85
> 2048,     0,  512,   23,      127,   16,               0.751
> 2048,     5,  512,   23,      127,   16,                0.75
> 2048,     0, 1024,   23,      127,   16,               0.913
> 2048,     6, 1024,   23,      127,   16,               0.895
> 2048,     0, 2048,   23,      127,   16,               0.736
> 2048,     7, 2048,   23,      127,   16,               0.741
> 2048,     0, 4096,   23,      127,   16,               0.712
> 2048,     8, 4096,   23,      127,   16,               0.711
>  256,     1,   64,   23,      127,   16,               0.758
>  256,    15,   64,   23,      127,   16,               0.692
>  256,     2,   64,   23,      127,   16,               0.692
>  256,    30,   64,   23,      127,   16,               0.613
>  256,     3,   64,   23,      127,   16,                0.69
>  256,    45,   64,   23,      127,   16,               0.687
>  256,     4,   64,   23,      127,   16,                0.69
>  256,    60,   64,   23,      127,   16,               0.604
>  256,     5,   64,   23,      127,   16,               0.687
>  256,    75,   64,   23,      127,   16,               0.687
>  256,     6,   64,   23,      127,   16,                0.69
>  256,    90,   64,   23,      127,   16,                0.61
>  256,     7,   64,   23,      127,   16,                0.69
>  256,   105,   64,   23,      127,   16,               0.685
>    1,     0,    0,   23,      127,   16,               0.981
>    2,     0,    1,   23,      127,   16,               0.985
>    3,     0,    2,   23,      127,   16,               0.985
>    4,     0,    3,   23,      127,   16,               0.981
>    5,     0,    4,   23,      127,   16,               0.979
>    6,     0,    5,   23,      127,   16,               0.986
>    7,     0,    6,   23,      127,   16,               0.986
>    8,     0,    7,   23,      127,   16,               0.982
>    9,     0,    8,   23,      127,   16,               0.982
>   10,     0,    9,   23,      127,   16,                0.98
>   11,     0,   10,   23,      127,   16,               0.983
>   12,     0,   11,   23,      127,   16,               0.982
>   13,     0,   12,   23,      127,   16,               0.982
>   14,     0,   13,   23,      127,   16,               0.982
>   15,     0,   14,   23,      127,   16,               0.982
>   16,     0,   15,   23,      127,   16,               0.582
>   17,     0,   16,   23,      127,   16,               0.542
>   18,     0,   17,   23,      127,   16,               0.554
>   19,     0,   18,   23,      127,   16,               0.562
>   20,     0,   19,   23,      127,   16,               0.587
>   21,     0,   20,   23,      127,   16,               0.584
>   22,     0,   21,   23,      127,   16,               0.587
>   23,     0,   22,   23,      127,   16,               0.594
>   24,     0,   23,   23,      127,   16,               0.581
>   25,     0,   24,   23,      127,   16,               0.577
>   26,     0,   25,   23,      127,   16,               0.588
>   27,     0,   26,   23,      127,   16,               0.589
>   28,     0,   27,   23,      127,   16,               0.596
>   29,     0,   28,   23,      127,   16,               0.591
>   30,     0,   29,   23,      127,   16,               0.585
>   31,     0,   30,   23,      127,   16,                0.59
>   32,     0,   31,   23,      127,   16,               0.669
>
>  sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
>  sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
>  sysdeps/x86_64/strrchr.S                | 505 +++++++++++++++---------
>  sysdeps/x86_64/wcsrchr.S                | 268 +------------
>  4 files changed, 334 insertions(+), 444 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> index db1b44c23c..866396e947 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> @@ -17,7 +17,7 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -# define strrchr __strrchr_sse2
> +# define STRRCHR __strrchr_sse2
>
>  # undef weak_alias
>  # define weak_alias(strrchr, rindex)
> diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> index 78d1ca6553..69d2f3cdb1 100644
> --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> @@ -17,7 +17,6 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -# define wcsrchr __wcsrchr_sse2
> +# define STRRCHR       __wcsrchr_sse2
>  #endif
> -
>  #include "../wcsrchr.S"
> diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> index 50d886713e..94449ad806 100644
> --- a/sysdeps/x86_64/strrchr.S
> +++ b/sysdeps/x86_64/strrchr.S
> @@ -19,210 +19,355 @@
>
>  #include <sysdep.h>
>
> +#ifndef STRRCHR
> +# define STRRCHR       strrchr
> +#endif
> +
> +#ifdef USE_AS_WCSRCHR
> +# define PCMPEQ        pcmpeqd
> +# define CHAR_SIZE     4
> +# define PMINU pminud
> +#else
> +# define PCMPEQ        pcmpeqb
> +# define CHAR_SIZE     1
> +# define PMINU pminub
> +#endif
> +
> +#define PAGE_SIZE      4096
> +#define VEC_SIZE       16
> +
>         .text
> -ENTRY (strrchr)
> -       movd    %esi, %xmm1
> +ENTRY(STRRCHR)
> +       movd    %esi, %xmm0
>         movq    %rdi, %rax
> -       andl    $4095, %eax
> -       punpcklbw       %xmm1, %xmm1
> -       cmpq    $4032, %rax
> -       punpcklwd       %xmm1, %xmm1
> -       pshufd  $0, %xmm1, %xmm1
> +       andl    $(PAGE_SIZE - 1), %eax
> +#ifndef USE_AS_WCSRCHR
> +       punpcklbw %xmm0, %xmm0
> +       punpcklwd %xmm0, %xmm0
> +#endif
> +       pshufd  $0, %xmm0, %xmm0
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         ja      L(cross_page)
> -       movdqu  (%rdi), %xmm0
> +
> +L(cross_page_continue):
> +       movups  (%rdi), %xmm1
>         pxor    %xmm2, %xmm2
> -       movdqa  %xmm0, %xmm3
> -       pcmpeqb %xmm1, %xmm0
> -       pcmpeqb %xmm2, %xmm3
> -       pmovmskb        %xmm0, %ecx
> -       pmovmskb        %xmm3, %edx
> -       testq   %rdx, %rdx
> -       je      L(next_48_bytes)
> -       leaq    -1(%rdx), %rax
> -       xorq    %rdx, %rax
> -       andq    %rcx, %rax
> -       je      L(exit)
> -       bsrq    %rax, %rax
> +       PCMPEQ  %xmm1, %xmm2
> +       pmovmskb %xmm2, %ecx
> +       testl   %ecx, %ecx
> +       jz      L(aligned_more)
> +
> +       PCMPEQ  %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +       leal    -1(%rcx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(ret0)
> +       bsrl    %eax, %eax
>         addq    %rdi, %rax
> +       /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> +          search CHAR is zero we are correct. Either way `andq
> +          -CHAR_SIZE, %rax` gets the correct result.  */
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +L(ret0):
>         ret
>
> +       /* Returns for first vec x1/x2 have a hard-coded backward
> +          search path for earlier matches.  */
>         .p2align 4
> -L(next_48_bytes):
> -       movdqu  16(%rdi), %xmm4
> -       movdqa  %xmm4, %xmm5
> -       movdqu  32(%rdi), %xmm3
> -       pcmpeqb %xmm1, %xmm4
> -       pcmpeqb %xmm2, %xmm5
> -       movdqu  48(%rdi), %xmm0
> -       pmovmskb        %xmm5, %edx
> -       movdqa  %xmm3, %xmm5
> -       pcmpeqb %xmm1, %xmm3
> -       pcmpeqb %xmm2, %xmm5
> -       pcmpeqb %xmm0, %xmm2
> -       salq    $16, %rdx
> -       pmovmskb        %xmm3, %r8d
> -       pmovmskb        %xmm5, %eax
> -       pmovmskb        %xmm2, %esi
> -       salq    $32, %r8
> -       salq    $32, %rax
> -       pcmpeqb %xmm1, %xmm0
> -       orq     %rdx, %rax
> -       movq    %rsi, %rdx
> -       pmovmskb        %xmm4, %esi
> -       salq    $48, %rdx
> -       salq    $16, %rsi
> -       orq     %r8, %rsi
> -       orq     %rcx, %rsi
> -       pmovmskb        %xmm0, %ecx
> -       salq    $48, %rcx
> -       orq     %rcx, %rsi
> -       orq     %rdx, %rax
> -       je      L(loop_header2)
> -       leaq    -1(%rax), %rcx
> -       xorq    %rax, %rcx
> -       andq    %rcx, %rsi
> -       je      L(exit)
> -       bsrq    %rsi, %rsi
> -       leaq    (%rdi,%rsi), %rax
> +L(first_vec_x0_test):
> +       PCMPEQ  %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +       testl   %eax, %eax
> +       jz      L(ret0)
> +       bsrl    %eax, %eax
> +       addq    %r8, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
>         ret
>
>         .p2align 4
> -L(loop_header2):
> -       testq   %rsi, %rsi
> -       movq    %rdi, %rcx
> -       je      L(no_c_found)
> -L(loop_header):
> -       addq    $64, %rdi
> -       pxor    %xmm7, %xmm7
> -       andq    $-64, %rdi
> -       jmp     L(loop_entry)
> +L(first_vec_x1):
> +       PCMPEQ  %xmm0, %xmm2
> +       pmovmskb %xmm2, %eax
> +       leal    -1(%rcx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(first_vec_x0_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
>
>         .p2align 4
> -L(loop64):
> -       testq   %rdx, %rdx
> -       cmovne  %rdx, %rsi
> -       cmovne  %rdi, %rcx
> -       addq    $64, %rdi
> -L(loop_entry):
> -       movdqa  32(%rdi), %xmm3
> -       pxor    %xmm6, %xmm6
> -       movdqa  48(%rdi), %xmm2
> -       movdqa  %xmm3, %xmm0
> -       movdqa  16(%rdi), %xmm4
> -       pminub  %xmm2, %xmm0
> -       movdqa  (%rdi), %xmm5
> -       pminub  %xmm4, %xmm0
> -       pminub  %xmm5, %xmm0
> -       pcmpeqb %xmm7, %xmm0
> -       pmovmskb        %xmm0, %eax
> -       movdqa  %xmm5, %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       pmovmskb        %xmm0, %r9d
> -       movdqa  %xmm4, %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       pmovmskb        %xmm0, %edx
> -       movdqa  %xmm3, %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       salq    $16, %rdx
> -       pmovmskb        %xmm0, %r10d
> -       movdqa  %xmm2, %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       salq    $32, %r10
> -       orq     %r10, %rdx
> -       pmovmskb        %xmm0, %r8d
> -       orq     %r9, %rdx
> -       salq    $48, %r8
> -       orq     %r8, %rdx
> +L(first_vec_x1_test):
> +       PCMPEQ  %xmm0, %xmm2
> +       pmovmskb %xmm2, %eax
>         testl   %eax, %eax
> -       je      L(loop64)
> -       pcmpeqb %xmm6, %xmm4
> -       pcmpeqb %xmm6, %xmm3
> -       pcmpeqb %xmm6, %xmm5
> -       pmovmskb        %xmm4, %eax
> -       pmovmskb        %xmm3, %r10d
> -       pcmpeqb %xmm6, %xmm2
> -       pmovmskb        %xmm5, %r9d
> -       salq    $32, %r10
> -       salq    $16, %rax
> -       pmovmskb        %xmm2, %r8d
> -       orq     %r10, %rax
> -       orq     %r9, %rax
> -       salq    $48, %r8
> -       orq     %r8, %rax
> -       leaq    -1(%rax), %r8
> -       xorq    %rax, %r8
> -       andq    %r8, %rdx
> -       cmovne  %rdi, %rcx
> -       cmovne  %rdx, %rsi
> -       bsrq    %rsi, %rsi
> -       leaq    (%rcx,%rsi), %rax
> +       jz      L(first_vec_x0_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(first_vec_x2):
> +       PCMPEQ  %xmm0, %xmm3
> +       pmovmskb %xmm3, %eax
> +       leal    -1(%rcx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(first_vec_x1_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(aligned_more):
> +       /* Save original pointer if match was in VEC 0.  */
> +       movq    %rdi, %r8
> +       andq    $-VEC_SIZE, %rdi
> +
> +       movaps  VEC_SIZE(%rdi), %xmm2
> +       pxor    %xmm3, %xmm3
> +       PCMPEQ  %xmm2, %xmm3
> +       pmovmskb %xmm3, %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x1)
> +
> +       movaps  (VEC_SIZE * 2)(%rdi), %xmm3
> +       pxor    %xmm4, %xmm4
> +       PCMPEQ  %xmm3, %xmm4
> +       pmovmskb %xmm4, %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x2)
> +
> +       addq    $VEC_SIZE, %rdi
> +       /* Save pointer again before realigning.  */
> +       movq    %rdi, %rsi
> +       andq    $-(VEC_SIZE * 2), %rdi
> +       .p2align 4
> +L(first_loop):
> +       /* Do 2x VEC at a time.  */
> +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> +       /* Plain SSE2 has no pminud.  */
> +#ifdef NO_PMINU

Do we really need SSE4.1 wcsrchr?  I think we should focus on AVX2 and
above.
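
For context: the only thing the NO_PMINU branches below change is how the
loop detects a zero CHAR across a pair of vectors.  A rough intrinsics sketch
of the two alternatives (illustrative only, not taken from the patch; build
the first helper with -msse4.1):

#include <emmintrin.h>          /* SSE2 */
#include <smmintrin.h>          /* SSE4.1, for _mm_min_epu32 */

/* Nonzero iff either 4-wchar_t vector contains a zero element.  */
static inline int
pair_has_zero_sse41 (__m128i v0, __m128i v1)
{
  __m128i min = _mm_min_epu32 (v0, v1);                 /* pminud */
  return _mm_movemask_epi8 (_mm_cmpeq_epi32 (min, _mm_setzero_si128 ()));
}

static inline int
pair_has_zero_sse2 (__m128i v0, __m128i v1)             /* NO_PMINU path */
{
  __m128i z = _mm_setzero_si128 ();
  return _mm_movemask_epi8 (_mm_or_si128 (_mm_cmpeq_epi32 (v0, z),
                                          _mm_cmpeq_epi32 (v1, z)));
}

That extra compare-and-or is the entire difference between the pminud path
and the plain-SSE2 fallback.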

> +       movaps  %xmm5, %xmm6
> +       pxor    %xmm8, %xmm8
> +
> +       PCMPEQ  %xmm8, %xmm5
> +       PCMPEQ  %xmm4, %xmm8
> +       por     %xmm5, %xmm8
> +#else
> +       movaps  %xmm5, %xmm6
> +       PMINU   %xmm4, %xmm5
> +#endif
> +
> +       movaps  %xmm4, %xmm9
> +       PCMPEQ  %xmm0, %xmm4
> +       PCMPEQ  %xmm0, %xmm6
> +       movaps  %xmm6, %xmm7
> +       por     %xmm4, %xmm6
> +#ifndef NO_PMINU
> +       pxor    %xmm8, %xmm8
> +       PCMPEQ  %xmm5, %xmm8
> +#endif
> +       pmovmskb %xmm8, %ecx
> +       pmovmskb %xmm6, %eax
> +
> +       addq    $(VEC_SIZE * 2), %rdi
> +       /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> +          macro-fuse with `jz`.  */
> +       addl    %ecx, %eax
> +       jz      L(first_loop)
> +
> +       /* Check if there is zero match.  */
> +       testl   %ecx, %ecx
> +       jz      L(second_loop_match)
> +
> +       /* Check if there was a match in last iteration.  */
> +       subl    %ecx, %eax
> +       jnz     L(new_match)
> +
> +L(first_loop_old_match):
> +       PCMPEQ  %xmm0, %xmm2
> +       PCMPEQ  %xmm0, %xmm3
> +       pmovmskb %xmm2, %ecx
> +       pmovmskb %xmm3, %eax
> +       addl    %eax, %ecx
> +       jz      L(first_vec_x0_test)
> +       /* NB: We could move this shift to before the branch and save a
> +          bit of code size / performance on the fall through. The
> +          branch leads to the null case which generally seems hotter
> +          than char in first 3x VEC.  */
> +       sall    $16, %eax
> +       orl     %ecx, %eax
> +
> +       bsrl    %eax, %eax
> +       addq    %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(new_match):
> +       pxor    %xmm6, %xmm6
> +       PCMPEQ  %xmm9, %xmm6
> +       pmovmskb %xmm6, %eax
> +       sall    $16, %ecx
> +       orl     %eax, %ecx
> +
> +       /* We can't reuse either of the old comparisons: since we mask
> +          off zeros after the first zero (instead of using the full
> +          comparison) we can't guarantee no interference between a
> +          match after the end of the string and a valid match.  */
> +       pmovmskb %xmm4, %eax
> +       pmovmskb %xmm7, %edx
> +       sall    $16, %edx
> +       orl     %edx, %eax
> +
> +       leal    -1(%ecx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(first_loop_old_match)
> +       bsrl    %eax, %eax
> +       addq    %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
>         ret
>
> +       /* Save minimum state for getting most recent match. We can
> +          throw out all previous work.  */
>         .p2align 4
> -L(no_c_found):
> -       movl    $1, %esi
> -       xorl    %ecx, %ecx
> -       jmp     L(loop_header)
> +L(second_loop_match):
> +       movq    %rdi, %rsi
> +       movaps  %xmm4, %xmm2
> +       movaps  %xmm7, %xmm3
>
>         .p2align 4
> -L(exit):
> -       xorl    %eax, %eax
> +L(second_loop):
> +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> +#ifdef NO_PMINU
> +       movaps  %xmm5, %xmm6
> +       pxor    %xmm8, %xmm8
> +
> +       PCMPEQ  %xmm8, %xmm5
> +       PCMPEQ  %xmm4, %xmm8
> +       por     %xmm5, %xmm8
> +#else
> +       movaps  %xmm5, %xmm6
> +       PMINU   %xmm4, %xmm5
> +#endif
> +
> +       movaps  %xmm4, %xmm9
> +       PCMPEQ  %xmm0, %xmm4
> +       PCMPEQ  %xmm0, %xmm6
> +       movaps  %xmm6, %xmm7
> +       por     %xmm4, %xmm6
> +#ifndef NO_PMINU
> +       pxor    %xmm8, %xmm8
> +       PCMPEQ  %xmm5, %xmm8
> +#endif
> +
> +       pmovmskb %xmm8, %ecx
> +       pmovmskb %xmm6, %eax
> +
> +       addq    $(VEC_SIZE * 2), %rdi
> +       /* Either null term or new occurrence of CHAR.  */
> +       addl    %ecx, %eax
> +       jz      L(second_loop)
> +
> +       /* No null term, so it must be a new occurrence of CHAR.  */
> +       testl   %ecx, %ecx
> +       jz      L(second_loop_match)
> +
> +
> +       subl    %ecx, %eax
> +       jnz     L(second_loop_new_match)
> +
> +L(second_loop_old_match):
> +       pmovmskb %xmm2, %ecx
> +       pmovmskb %xmm3, %eax
> +       sall    $16, %eax
> +       orl     %ecx, %eax
> +       bsrl    %eax, %eax
> +       addq    %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
>         ret
>
>         .p2align 4
> +L(second_loop_new_match):
> +       pxor    %xmm6, %xmm6
> +       PCMPEQ  %xmm9, %xmm6
> +       pmovmskb %xmm6, %eax
> +       sall    $16, %ecx
> +       orl     %eax, %ecx
> +
> +       /* We can't reuse either of the old comparisons: since we mask
> +          off zeros after the first zero (instead of using the full
> +          comparison) we can't guarantee no interference between a
> +          match after the end of the string and a valid match.  */
> +       pmovmskb %xmm4, %eax
> +       pmovmskb %xmm7, %edx
> +       sall    $16, %edx
> +       orl     %edx, %eax
> +
> +       leal    -1(%ecx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(second_loop_old_match)
> +       bsrl    %eax, %eax
> +       addq    %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4,, 4
>  L(cross_page):
> -       movq    %rdi, %rax
> -       pxor    %xmm0, %xmm0
> -       andq    $-64, %rax
> -       movdqu  (%rax), %xmm5
> -       movdqa  %xmm5, %xmm6
> -       movdqu  16(%rax), %xmm4
> -       pcmpeqb %xmm1, %xmm5
> -       pcmpeqb %xmm0, %xmm6
> -       movdqu  32(%rax), %xmm3
> -       pmovmskb        %xmm6, %esi
> -       movdqa  %xmm4, %xmm6
> -       movdqu  48(%rax), %xmm2
> -       pcmpeqb %xmm1, %xmm4
> -       pcmpeqb %xmm0, %xmm6
> -       pmovmskb        %xmm6, %edx
> -       movdqa  %xmm3, %xmm6
> -       pcmpeqb %xmm1, %xmm3
> -       pcmpeqb %xmm0, %xmm6
> -       pcmpeqb %xmm2, %xmm0
> -       salq    $16, %rdx
> -       pmovmskb        %xmm3, %r9d
> -       pmovmskb        %xmm6, %r8d
> -       pmovmskb        %xmm0, %ecx
> -       salq    $32, %r9
> -       salq    $32, %r8
> -       pcmpeqb %xmm1, %xmm2
> -       orq     %r8, %rdx
> -       salq    $48, %rcx
> -       pmovmskb        %xmm5, %r8d
> -       orq     %rsi, %rdx
> -       pmovmskb        %xmm4, %esi
> -       orq     %rcx, %rdx
> -       pmovmskb        %xmm2, %ecx
> -       salq    $16, %rsi
> -       salq    $48, %rcx
> -       orq     %r9, %rsi
> -       orq     %r8, %rsi
> -       orq     %rcx, %rsi
> +       movq    %rdi, %rsi
> +       andq    $-VEC_SIZE, %rsi
> +       movaps  (%rsi), %xmm1
> +       pxor    %xmm2, %xmm2
> +       PCMPEQ  %xmm1, %xmm2
> +       pmovmskb %xmm2, %edx
>         movl    %edi, %ecx
> -       subl    %eax, %ecx
> -       shrq    %cl, %rdx
> -       shrq    %cl, %rsi
> -       testq   %rdx, %rdx
> -       je      L(loop_header2)
> -       leaq    -1(%rdx), %rax
> -       xorq    %rdx, %rax
> -       andq    %rax, %rsi
> -       je      L(exit)
> -       bsrq    %rsi, %rax
> +       andl    $(VEC_SIZE - 1), %ecx
> +       sarl    %cl, %edx
> +       jz      L(cross_page_continue)
> +       PCMPEQ  %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +       sarl    %cl, %eax
> +       leal    -1(%rdx), %ecx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(ret1)
> +       bsrl    %eax, %eax
>         addq    %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +L(ret1):
>         ret
> -END (strrchr)
> +END(STRRCHR)
>
> -weak_alias (strrchr, rindex)
> -libc_hidden_builtin_def (strrchr)
> +#ifndef USE_AS_WCSRCHR
> +       weak_alias (STRRCHR, rindex)
> +       libc_hidden_builtin_def (STRRCHR)
> +#endif
> diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> index 61552954de..2b80efc5ef 100644
> --- a/sysdeps/x86_64/wcsrchr.S
> +++ b/sysdeps/x86_64/wcsrchr.S
> @@ -1,4 +1,4 @@
> -/* wcsrchr with SSSE3
> +/* wcsrchr optimized with SSE2.
>     Copyright (C) 2011-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
> @@ -16,266 +16,12 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <sysdep.h>
>
> -       .text
> -ENTRY (wcsrchr)
> +#define USE_AS_WCSRCHR 1
> +#define NO_PMINU       1
>
> -       movd    %rsi, %xmm1
> -       mov     %rdi, %rcx
> -       punpckldq %xmm1, %xmm1
> -       pxor    %xmm2, %xmm2
> -       punpckldq %xmm1, %xmm1
> -       and     $63, %rcx
> -       cmp     $48, %rcx
> -       ja      L(crosscache)
> +#ifndef STRRCHR
> +# define STRRCHR       wcsrchr
> +#endif
>
> -       movdqu  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm2
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm0, %rax
> -       add     $16, %rdi
> -
> -       test    %rax, %rax
> -       jnz     L(unaligned_match1)
> -
> -       test    %rcx, %rcx
> -       jnz     L(return_null)
> -
> -       and     $-16, %rdi
> -       xor     %r8, %r8
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(unaligned_match1):
> -       test    %rcx, %rcx
> -       jnz     L(prolog_find_zero_1)
> -
> -       mov     %rax, %r8
> -       mov     %rdi, %rsi
> -       and     $-16, %rdi
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(crosscache):
> -       and     $15, %rcx
> -       and     $-16, %rdi
> -       pxor    %xmm3, %xmm3
> -       movdqa  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm3
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm3, %rdx
> -       pmovmskb %xmm0, %rax
> -       shr     %cl, %rdx
> -       shr     %cl, %rax
> -       add     $16, %rdi
> -
> -       test    %rax, %rax
> -       jnz     L(unaligned_match)
> -
> -       test    %rdx, %rdx
> -       jnz     L(return_null)
> -
> -       xor     %r8, %r8
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(unaligned_match):
> -       test    %rdx, %rdx
> -       jnz     L(prolog_find_zero)
> -
> -       mov     %rax, %r8
> -       lea     (%rdi, %rcx), %rsi
> -
> -/* Loop start on aligned string.  */
> -       .p2align 4
> -L(loop):
> -       movdqa  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm0, %rax
> -       or      %rax, %rcx
> -       jnz     L(matches)
> -
> -       movdqa  (%rdi), %xmm3
> -       pcmpeqd %xmm3, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm3
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm3, %rax
> -       or      %rax, %rcx
> -       jnz     L(matches)
> -
> -       movdqa  (%rdi), %xmm4
> -       pcmpeqd %xmm4, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm4
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm4, %rax
> -       or      %rax, %rcx
> -       jnz     L(matches)
> -
> -       movdqa  (%rdi), %xmm5
> -       pcmpeqd %xmm5, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm5
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm5, %rax
> -       or      %rax, %rcx
> -       jz      L(loop)
> -
> -       .p2align 4
> -L(matches):
> -       test    %rax, %rax
> -       jnz     L(match)
> -L(return_value):
> -       test    %r8, %r8
> -       jz      L(return_null)
> -       mov     %r8, %rax
> -       mov     %rsi, %rdi
> -
> -       test    $15 << 4, %ah
> -       jnz     L(match_fourth_wchar)
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(match):
> -       pmovmskb %xmm2, %rcx
> -       test    %rcx, %rcx
> -       jnz     L(find_zero)
> -       mov     %rax, %r8
> -       mov     %rdi, %rsi
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(find_zero):
> -       test    $15, %cl
> -       jnz     L(find_zero_in_first_wchar)
> -       test    %cl, %cl
> -       jnz     L(find_zero_in_second_wchar)
> -       test    $15, %ch
> -       jnz     L(find_zero_in_third_wchar)
> -
> -       and     $1 << 13 - 1, %rax
> -       jz      L(return_value)
> -
> -       test    $15 << 4, %ah
> -       jnz     L(match_fourth_wchar)
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(find_zero_in_first_wchar):
> -       test    $1, %rax
> -       jz      L(return_value)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(find_zero_in_second_wchar):
> -       and     $1 << 5 - 1, %rax
> -       jz      L(return_value)
> -
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(find_zero_in_third_wchar):
> -       and     $1 << 9 - 1, %rax
> -       jz      L(return_value)
> -
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(prolog_find_zero):
> -       add     %rcx, %rdi
> -       mov     %rdx, %rcx
> -L(prolog_find_zero_1):
> -       test    $15, %cl
> -       jnz     L(prolog_find_zero_in_first_wchar)
> -       test    %cl, %cl
> -       jnz     L(prolog_find_zero_in_second_wchar)
> -       test    $15, %ch
> -       jnz     L(prolog_find_zero_in_third_wchar)
> -
> -       and     $1 << 13 - 1, %rax
> -       jz      L(return_null)
> -
> -       test    $15 << 4, %ah
> -       jnz     L(match_fourth_wchar)
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(prolog_find_zero_in_first_wchar):
> -       test    $1, %rax
> -       jz      L(return_null)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(prolog_find_zero_in_second_wchar):
> -       and     $1 << 5 - 1, %rax
> -       jz      L(return_null)
> -
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(prolog_find_zero_in_third_wchar):
> -       and     $1 << 9 - 1, %rax
> -       jz      L(return_null)
> -
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(match_second_wchar):
> -       lea     -12(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(match_third_wchar):
> -       lea     -8(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(match_fourth_wchar):
> -       lea     -4(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(return_null):
> -       xor     %rax, %rax
> -       ret
> -
> -END (wcsrchr)
> +#include "../strrchr.S"
> --
> 2.25.1
>


--
H.J.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2
  2022-04-21 20:26   ` H.J. Lu
@ 2022-04-21 20:57     ` Noah Goldstein
  2022-04-21 21:48       ` H.J. Lu
  0 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 20:57 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 21, 2022 at 3:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The new code unrolls the main loop slightly without adding too much
> > overhead and minimizes the comparisons for the search CHAR.
> >
> > Geometric Mean of all benchmarks New / Old: 0.741
> > See email for all results.
> >
> > Full xcheck passes on x86_64 with and without multiarch enabled.
> > ---
> > Results For: strrchr
> >
> > Geometric Mean of N=30 runs.
> >
> > Geometric Mean of all benchmarks New / Old: 0.741
> > Benchmarks performance on Tigerlake:
> > https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> >
> >  len, align,  pos, seek, max_char, freq, New Time / Old Time
> > 2048,     0,   32,    0,      127,    1,               0.647
> > 2048,     1,   32,    0,      127,    1,               0.621
> > 2048,     0,   64,    0,      127,    1,               0.661
> > 2048,     2,   64,    0,      127,    1,               0.655
> > 2048,     0,  128,    0,      127,    1,                0.69
> > 2048,     3,  128,    0,      127,    1,               0.689
> > 2048,     0,  256,    0,      127,    1,               0.718
> > 2048,     4,  256,    0,      127,    1,               0.718
> > 2048,     0,  512,    0,      127,    1,               0.758
> > 2048,     5,  512,    0,      127,    1,               0.754
> > 2048,     0, 1024,    0,      127,    1,               1.029
> > 2048,     6, 1024,    0,      127,    1,               1.032
> > 2048,     0, 2048,    0,      127,    1,               0.826
> > 2048,     7, 2048,    0,      127,    1,               0.834
> > 2048,     0, 4096,    0,      127,    1,               0.825
> > 2048,     8, 4096,    0,      127,    1,                0.83
> >  256,     1,   64,    0,      127,    1,               0.657
> >  256,    15,   64,    0,      127,    1,               0.657
> >  256,     2,   64,    0,      127,    1,               0.657
> >  256,    30,   64,    0,      127,    1,               0.523
> >  256,     3,   64,    0,      127,    1,               0.657
> >  256,    45,   64,    0,      127,    1,               0.654
> >  256,     4,   64,    0,      127,    1,               0.657
> >  256,    60,   64,    0,      127,    1,               0.526
> >  256,     5,   64,    0,      127,    1,               0.658
> >  256,    75,   64,    0,      127,    1,               0.658
> >  256,     6,   64,    0,      127,    1,               0.655
> >  256,    90,   64,    0,      127,    1,               0.523
> >  256,     7,   64,    0,      127,    1,               0.655
> >  256,   105,   64,    0,      127,    1,               0.654
> >    1,     0,    0,    0,      127,    1,                0.98
> >    2,     0,    1,    0,      127,    1,               0.978
> >    3,     0,    2,    0,      127,    1,               0.975
> >    4,     0,    3,    0,      127,    1,               0.976
> >    5,     0,    4,    0,      127,    1,               0.977
> >    6,     0,    5,    0,      127,    1,               0.981
> >    7,     0,    6,    0,      127,    1,               0.982
> >    8,     0,    7,    0,      127,    1,                0.98
> >    9,     0,    8,    0,      127,    1,               0.978
> >   10,     0,    9,    0,      127,    1,               0.981
> >   11,     0,   10,    0,      127,    1,               0.984
> >   12,     0,   11,    0,      127,    1,               0.982
> >   13,     0,   12,    0,      127,    1,                0.98
> >   14,     0,   13,    0,      127,    1,               0.978
> >   15,     0,   14,    0,      127,    1,               0.979
> >   16,     0,   15,    0,      127,    1,               0.986
> >   17,     0,   16,    0,      127,    1,               0.529
> >   18,     0,   17,    0,      127,    1,               0.566
> >   19,     0,   18,    0,      127,    1,               0.575
> >   20,     0,   19,    0,      127,    1,               0.573
> >   21,     0,   20,    0,      127,    1,               0.579
> >   22,     0,   21,    0,      127,    1,               0.595
> >   23,     0,   22,    0,      127,    1,               0.585
> >   24,     0,   23,    0,      127,    1,               0.586
> >   25,     0,   24,    0,      127,    1,               0.587
> >   26,     0,   25,    0,      127,    1,               0.592
> >   27,     0,   26,    0,      127,    1,               0.595
> >   28,     0,   27,    0,      127,    1,               0.592
> >   29,     0,   28,    0,      127,    1,                 0.6
> >   30,     0,   29,    0,      127,    1,               0.598
> >   31,     0,   30,    0,      127,    1,               0.595
> >   32,     0,   31,    0,      127,    1,               0.592
> > 2048,     0,   32,   23,      127,    1,               0.827
> > 2048,     1,   32,   23,      127,    1,               0.826
> > 2048,     0,   64,   23,      127,    1,               0.824
> > 2048,     2,   64,   23,      127,    1,               0.825
> > 2048,     0,  128,   23,      127,    1,               0.829
> > 2048,     3,  128,   23,      127,    1,               0.824
> > 2048,     0,  256,   23,      127,    1,               0.832
> > 2048,     4,  256,   23,      127,    1,               0.825
> > 2048,     0,  512,   23,      127,    1,               0.831
> > 2048,     5,  512,   23,      127,    1,               0.837
> > 2048,     0, 1024,   23,      127,    1,               0.721
> > 2048,     6, 1024,   23,      127,    1,               0.757
> > 2048,     0, 2048,   23,      127,    1,               0.825
> > 2048,     7, 2048,   23,      127,    1,               0.824
> > 2048,     0, 4096,   23,      127,    1,               0.828
> > 2048,     8, 4096,   23,      127,    1,               0.823
> >  256,     1,   64,   23,      127,    1,               0.665
> >  256,    15,   64,   23,      127,    1,               0.661
> >  256,     2,   64,   23,      127,    1,               0.674
> >  256,    30,   64,   23,      127,    1,               0.605
> >  256,     3,   64,   23,      127,    1,               0.668
> >  256,    45,   64,   23,      127,    1,               0.661
> >  256,     4,   64,   23,      127,    1,               0.657
> >  256,    60,   64,   23,      127,    1,               0.594
> >  256,     5,   64,   23,      127,    1,               0.654
> >  256,    75,   64,   23,      127,    1,               0.673
> >  256,     6,   64,   23,      127,    1,               0.688
> >  256,    90,   64,   23,      127,    1,                 0.6
> >  256,     7,   64,   23,      127,    1,                0.66
> >  256,   105,   64,   23,      127,    1,               0.654
> >    1,     0,    0,   23,      127,    1,               0.981
> >    2,     0,    1,   23,      127,    1,               0.976
> >    3,     0,    2,   23,      127,    1,               0.983
> >    4,     0,    3,   23,      127,    1,               0.984
> >    5,     0,    4,   23,      127,    1,               0.973
> >    6,     0,    5,   23,      127,    1,               0.987
> >    7,     0,    6,   23,      127,    1,               0.977
> >    8,     0,    7,   23,      127,    1,               0.979
> >    9,     0,    8,   23,      127,    1,               0.981
> >   10,     0,    9,   23,      127,    1,                0.98
> >   11,     0,   10,   23,      127,    1,               0.983
> >   12,     0,   11,   23,      127,    1,                0.98
> >   13,     0,   12,   23,      127,    1,                0.98
> >   14,     0,   13,   23,      127,    1,               0.977
> >   15,     0,   14,   23,      127,    1,               0.982
> >   16,     0,   15,   23,      127,    1,               0.581
> >   17,     0,   16,   23,      127,    1,               0.551
> >   18,     0,   17,   23,      127,    1,               0.555
> >   19,     0,   18,   23,      127,    1,               0.586
> >   20,     0,   19,   23,      127,    1,               0.585
> >   21,     0,   20,   23,      127,    1,               0.582
> >   22,     0,   21,   23,      127,    1,               0.571
> >   23,     0,   22,   23,      127,    1,               0.576
> >   24,     0,   23,   23,      127,    1,               0.581
> >   25,     0,   24,   23,      127,    1,               0.589
> >   26,     0,   25,   23,      127,    1,               0.593
> >   27,     0,   26,   23,      127,    1,               0.595
> >   28,     0,   27,   23,      127,    1,               0.583
> >   29,     0,   28,   23,      127,    1,               0.595
> >   30,     0,   29,   23,      127,    1,                0.58
> >   31,     0,   30,   23,      127,    1,               0.594
> >   32,     0,   31,   23,      127,    1,               0.665
> > 2048,     0,   32,   23,      127,    2,               0.825
> > 2048,     1,   32,   23,      127,    2,               0.818
> > 2048,     0,   64,   23,      127,    2,               0.829
> > 2048,     2,   64,   23,      127,    2,               0.828
> > 2048,     0,  128,   23,      127,    2,               0.823
> > 2048,     3,  128,   23,      127,    2,               0.825
> > 2048,     0,  256,   23,      127,    2,               0.819
> > 2048,     4,  256,   23,      127,    2,               0.828
> > 2048,     0,  512,   23,      127,    2,               0.824
> > 2048,     5,  512,   23,      127,    2,               0.827
> > 2048,     0, 1024,   23,      127,    2,               0.813
> > 2048,     6, 1024,   23,      127,    2,               0.834
> > 2048,     0, 2048,   23,      127,    2,               0.927
> > 2048,     7, 2048,   23,      127,    2,               0.923
> > 2048,     0, 4096,   23,      127,    2,               0.818
> > 2048,     8, 4096,   23,      127,    2,                0.82
> >  256,     1,   64,   23,      127,    2,               0.693
> >  256,    15,   64,   23,      127,    2,               0.686
> >  256,     2,   64,   23,      127,    2,                0.69
> >  256,    30,   64,   23,      127,    2,               0.611
> >  256,     3,   64,   23,      127,    2,               0.692
> >  256,    45,   64,   23,      127,    2,               0.685
> >  256,     4,   64,   23,      127,    2,               0.688
> >  256,    60,   64,   23,      127,    2,                 0.6
> >  256,     5,   64,   23,      127,    2,                0.69
> >  256,    75,   64,   23,      127,    2,               0.689
> >  256,     6,   64,   23,      127,    2,               0.688
> >  256,    90,   64,   23,      127,    2,               0.611
> >  256,     7,   64,   23,      127,    2,                0.69
> >  256,   105,   64,   23,      127,    2,               0.686
> >    1,     0,    0,   23,      127,    2,               0.982
> >    2,     0,    1,   23,      127,    2,               0.987
> >    3,     0,    2,   23,      127,    2,               0.978
> >    4,     0,    3,   23,      127,    2,               0.977
> >    5,     0,    4,   23,      127,    2,               0.979
> >    6,     0,    5,   23,      127,    2,               0.985
> >    7,     0,    6,   23,      127,    2,               0.975
> >    8,     0,    7,   23,      127,    2,               0.981
> >    9,     0,    8,   23,      127,    2,               0.984
> >   10,     0,    9,   23,      127,    2,               0.983
> >   11,     0,   10,   23,      127,    2,               0.982
> >   12,     0,   11,   23,      127,    2,               0.976
> >   13,     0,   12,   23,      127,    2,               0.985
> >   14,     0,   13,   23,      127,    2,               0.984
> >   15,     0,   14,   23,      127,    2,                0.98
> >   16,     0,   15,   23,      127,    2,               0.583
> >   17,     0,   16,   23,      127,    2,               0.552
> >   18,     0,   17,   23,      127,    2,               0.564
> >   19,     0,   18,   23,      127,    2,               0.585
> >   20,     0,   19,   23,      127,    2,               0.578
> >   21,     0,   20,   23,      127,    2,               0.578
> >   22,     0,   21,   23,      127,    2,               0.571
> >   23,     0,   22,   23,      127,    2,               0.587
> >   24,     0,   23,   23,      127,    2,               0.589
> >   25,     0,   24,   23,      127,    2,               0.593
> >   26,     0,   25,   23,      127,    2,               0.589
> >   27,     0,   26,   23,      127,    2,               0.588
> >   28,     0,   27,   23,      127,    2,               0.593
> >   29,     0,   28,   23,      127,    2,               0.579
> >   30,     0,   29,   23,      127,    2,               0.572
> >   31,     0,   30,   23,      127,    2,               0.582
> >   32,     0,   31,   23,      127,    2,               0.659
> > 2048,     0,   32,   23,      127,    4,               0.822
> > 2048,     1,   32,   23,      127,    4,               0.818
> > 2048,     0,   64,   23,      127,    4,               0.826
> > 2048,     2,   64,   23,      127,    4,               0.824
> > 2048,     0,  128,   23,      127,    4,               0.833
> > 2048,     3,  128,   23,      127,    4,               0.831
> > 2048,     0,  256,   23,      127,    4,               0.826
> > 2048,     4,  256,   23,      127,    4,               0.831
> > 2048,     0,  512,   23,      127,    4,               0.834
> > 2048,     5,  512,   23,      127,    4,                0.83
> > 2048,     0, 1024,   23,      127,    4,               0.836
> > 2048,     6, 1024,   23,      127,    4,               0.844
> > 2048,     0, 2048,   23,      127,    4,               0.696
> > 2048,     7, 2048,   23,      127,    4,               0.704
> > 2048,     0, 4096,   23,      127,    4,               0.936
> > 2048,     8, 4096,   23,      127,    4,               0.925
> >  256,     1,   64,   23,      127,    4,               0.694
> >  256,    15,   64,   23,      127,    4,                0.69
> >  256,     2,   64,   23,      127,    4,               0.687
> >  256,    30,   64,   23,      127,    4,               0.612
> >  256,     3,   64,   23,      127,    4,               0.685
> >  256,    45,   64,   23,      127,    4,               0.685
> >  256,     4,   64,   23,      127,    4,               0.684
> >  256,    60,   64,   23,      127,    4,               0.606
> >  256,     5,   64,   23,      127,    4,                0.69
> >  256,    75,   64,   23,      127,    4,               0.688
> >  256,     6,   64,   23,      127,    4,                0.69
> >  256,    90,   64,   23,      127,    4,               0.615
> >  256,     7,   64,   23,      127,    4,               0.691
> >  256,   105,   64,   23,      127,    4,               0.688
> >    1,     0,    0,   23,      127,    4,               0.982
> >    2,     0,    1,   23,      127,    4,               0.983
> >    3,     0,    2,   23,      127,    4,               0.981
> >    4,     0,    3,   23,      127,    4,               0.984
> >    5,     0,    4,   23,      127,    4,               0.963
> >    6,     0,    5,   23,      127,    4,               0.978
> >    7,     0,    6,   23,      127,    4,               0.985
> >    8,     0,    7,   23,      127,    4,               0.986
> >    9,     0,    8,   23,      127,    4,               0.978
> >   10,     0,    9,   23,      127,    4,               0.985
> >   11,     0,   10,   23,      127,    4,               0.986
> >   12,     0,   11,   23,      127,    4,               0.983
> >   13,     0,   12,   23,      127,    4,               0.986
> >   14,     0,   13,   23,      127,    4,                0.98
> >   15,     0,   14,   23,      127,    4,               0.979
> >   16,     0,   15,   23,      127,    4,               0.582
> >   17,     0,   16,   23,      127,    4,               0.542
> >   18,     0,   17,   23,      127,    4,               0.564
> >   19,     0,   18,   23,      127,    4,               0.571
> >   20,     0,   19,   23,      127,    4,               0.582
> >   21,     0,   20,   23,      127,    4,               0.573
> >   22,     0,   21,   23,      127,    4,               0.575
> >   23,     0,   22,   23,      127,    4,               0.578
> >   24,     0,   23,   23,      127,    4,                0.58
> >   25,     0,   24,   23,      127,    4,               0.592
> >   26,     0,   25,   23,      127,    4,               0.588
> >   27,     0,   26,   23,      127,    4,               0.574
> >   28,     0,   27,   23,      127,    4,               0.589
> >   29,     0,   28,   23,      127,    4,                0.56
> >   30,     0,   29,   23,      127,    4,               0.587
> >   31,     0,   30,   23,      127,    4,               0.584
> >   32,     0,   31,   23,      127,    4,               0.664
> > 2048,     0,   32,   23,      127,    8,               0.826
> > 2048,     1,   32,   23,      127,    8,               0.821
> > 2048,     0,   64,   23,      127,    8,               0.828
> > 2048,     2,   64,   23,      127,    8,               0.827
> > 2048,     0,  128,   23,      127,    8,               0.833
> > 2048,     3,  128,   23,      127,    8,                0.83
> > 2048,     0,  256,   23,      127,    8,               0.855
> > 2048,     4,  256,   23,      127,    8,               0.849
> > 2048,     0,  512,   23,      127,    8,               0.849
> > 2048,     5,  512,   23,      127,    8,               0.851
> > 2048,     0, 1024,   23,      127,    8,               0.856
> > 2048,     6, 1024,   23,      127,    8,               0.862
> > 2048,     0, 2048,   23,      127,    8,               0.709
> > 2048,     7, 2048,   23,      127,    8,               0.712
> > 2048,     0, 4096,   23,      127,    8,               0.702
> > 2048,     8, 4096,   23,      127,    8,               0.701
> >  256,     1,   64,   23,      127,    8,               0.689
> >  256,    15,   64,   23,      127,    8,               0.688
> >  256,     2,   64,   23,      127,    8,               0.691
> >  256,    30,   64,   23,      127,    8,               0.612
> >  256,     3,   64,   23,      127,    8,               0.688
> >  256,    45,   64,   23,      127,    8,               0.686
> >  256,     4,   64,   23,      127,    8,               0.694
> >  256,    60,   64,   23,      127,    8,               0.609
> >  256,     5,   64,   23,      127,    8,                0.69
> >  256,    75,   64,   23,      127,    8,                0.69
> >  256,     6,   64,   23,      127,    8,               0.691
> >  256,    90,   64,   23,      127,    8,               0.612
> >  256,     7,   64,   23,      127,    8,               0.689
> >  256,   105,   64,   23,      127,    8,               0.688
> >    1,     0,    0,   23,      127,    8,                0.98
> >    2,     0,    1,   23,      127,    8,               0.978
> >    3,     0,    2,   23,      127,    8,                0.98
> >    4,     0,    3,   23,      127,    8,               0.978
> >    5,     0,    4,   23,      127,    8,               0.977
> >    6,     0,    5,   23,      127,    8,               0.984
> >    7,     0,    6,   23,      127,    8,               0.982
> >    8,     0,    7,   23,      127,    8,               0.983
> >    9,     0,    8,   23,      127,    8,               0.987
> >   10,     0,    9,   23,      127,    8,               0.979
> >   11,     0,   10,   23,      127,    8,               0.985
> >   12,     0,   11,   23,      127,    8,               0.981
> >   13,     0,   12,   23,      127,    8,                0.98
> >   14,     0,   13,   23,      127,    8,               0.982
> >   15,     0,   14,   23,      127,    8,               0.981
> >   16,     0,   15,   23,      127,    8,               0.579
> >   17,     0,   16,   23,      127,    8,               0.531
> >   18,     0,   17,   23,      127,    8,               0.577
> >   19,     0,   18,   23,      127,    8,               0.588
> >   20,     0,   19,   23,      127,    8,               0.571
> >   21,     0,   20,   23,      127,    8,               0.576
> >   22,     0,   21,   23,      127,    8,                0.59
> >   23,     0,   22,   23,      127,    8,               0.574
> >   24,     0,   23,   23,      127,    8,               0.583
> >   25,     0,   24,   23,      127,    8,               0.581
> >   26,     0,   25,   23,      127,    8,               0.592
> >   27,     0,   26,   23,      127,    8,               0.586
> >   28,     0,   27,   23,      127,    8,               0.588
> >   29,     0,   28,   23,      127,    8,               0.578
> >   30,     0,   29,   23,      127,    8,               0.573
> >   31,     0,   30,   23,      127,    8,               0.588
> >   32,     0,   31,   23,      127,    8,               0.664
> > 2048,     0,   32,   23,      127,   16,               0.825
> > 2048,     1,   32,   23,      127,   16,               0.823
> > 2048,     0,   64,   23,      127,   16,               0.831
> > 2048,     2,   64,   23,      127,   16,               0.822
> > 2048,     0,  128,   23,      127,   16,               0.831
> > 2048,     3,  128,   23,      127,   16,               0.831
> > 2048,     0,  256,   23,      127,   16,               0.849
> > 2048,     4,  256,   23,      127,   16,                0.85
> > 2048,     0,  512,   23,      127,   16,               0.751
> > 2048,     5,  512,   23,      127,   16,                0.75
> > 2048,     0, 1024,   23,      127,   16,               0.913
> > 2048,     6, 1024,   23,      127,   16,               0.895
> > 2048,     0, 2048,   23,      127,   16,               0.736
> > 2048,     7, 2048,   23,      127,   16,               0.741
> > 2048,     0, 4096,   23,      127,   16,               0.712
> > 2048,     8, 4096,   23,      127,   16,               0.711
> >  256,     1,   64,   23,      127,   16,               0.758
> >  256,    15,   64,   23,      127,   16,               0.692
> >  256,     2,   64,   23,      127,   16,               0.692
> >  256,    30,   64,   23,      127,   16,               0.613
> >  256,     3,   64,   23,      127,   16,                0.69
> >  256,    45,   64,   23,      127,   16,               0.687
> >  256,     4,   64,   23,      127,   16,                0.69
> >  256,    60,   64,   23,      127,   16,               0.604
> >  256,     5,   64,   23,      127,   16,               0.687
> >  256,    75,   64,   23,      127,   16,               0.687
> >  256,     6,   64,   23,      127,   16,                0.69
> >  256,    90,   64,   23,      127,   16,                0.61
> >  256,     7,   64,   23,      127,   16,                0.69
> >  256,   105,   64,   23,      127,   16,               0.685
> >    1,     0,    0,   23,      127,   16,               0.981
> >    2,     0,    1,   23,      127,   16,               0.985
> >    3,     0,    2,   23,      127,   16,               0.985
> >    4,     0,    3,   23,      127,   16,               0.981
> >    5,     0,    4,   23,      127,   16,               0.979
> >    6,     0,    5,   23,      127,   16,               0.986
> >    7,     0,    6,   23,      127,   16,               0.986
> >    8,     0,    7,   23,      127,   16,               0.982
> >    9,     0,    8,   23,      127,   16,               0.982
> >   10,     0,    9,   23,      127,   16,                0.98
> >   11,     0,   10,   23,      127,   16,               0.983
> >   12,     0,   11,   23,      127,   16,               0.982
> >   13,     0,   12,   23,      127,   16,               0.982
> >   14,     0,   13,   23,      127,   16,               0.982
> >   15,     0,   14,   23,      127,   16,               0.982
> >   16,     0,   15,   23,      127,   16,               0.582
> >   17,     0,   16,   23,      127,   16,               0.542
> >   18,     0,   17,   23,      127,   16,               0.554
> >   19,     0,   18,   23,      127,   16,               0.562
> >   20,     0,   19,   23,      127,   16,               0.587
> >   21,     0,   20,   23,      127,   16,               0.584
> >   22,     0,   21,   23,      127,   16,               0.587
> >   23,     0,   22,   23,      127,   16,               0.594
> >   24,     0,   23,   23,      127,   16,               0.581
> >   25,     0,   24,   23,      127,   16,               0.577
> >   26,     0,   25,   23,      127,   16,               0.588
> >   27,     0,   26,   23,      127,   16,               0.589
> >   28,     0,   27,   23,      127,   16,               0.596
> >   29,     0,   28,   23,      127,   16,               0.591
> >   30,     0,   29,   23,      127,   16,               0.585
> >   31,     0,   30,   23,      127,   16,                0.59
> >   32,     0,   31,   23,      127,   16,               0.669
> >
> >  sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
> >  sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
> >  sysdeps/x86_64/strrchr.S                | 505 +++++++++++++++---------
> >  sysdeps/x86_64/wcsrchr.S                | 268 +------------
> >  4 files changed, 334 insertions(+), 444 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > index db1b44c23c..866396e947 100644
> > --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > @@ -17,7 +17,7 @@
> >     <https://www.gnu.org/licenses/>.  */
> >
> >  #if IS_IN (libc)
> > -# define strrchr __strrchr_sse2
> > +# define STRRCHR __strrchr_sse2
> >
> >  # undef weak_alias
> >  # define weak_alias(strrchr, rindex)
> > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > index 78d1ca6553..69d2f3cdb1 100644
> > --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > @@ -17,7 +17,6 @@
> >     <https://www.gnu.org/licenses/>.  */
> >
> >  #if IS_IN (libc)
> > -# define wcsrchr __wcsrchr_sse2
> > +# define STRRCHR       __wcsrchr_sse2
> >  #endif
> > -
> >  #include "../wcsrchr.S"
> > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> > index 50d886713e..94449ad806 100644
> > --- a/sysdeps/x86_64/strrchr.S
> > +++ b/sysdeps/x86_64/strrchr.S
> > @@ -19,210 +19,355 @@
> >
> >  #include <sysdep.h>
> >
> > +#ifndef STRRCHR
> > +# define STRRCHR       strrchr
> > +#endif
> > +
> > +#ifdef USE_AS_WCSRCHR
> > +# define PCMPEQ        pcmpeqd
> > +# define CHAR_SIZE     4
> > +# define PMINU pminud
> > +#else
> > +# define PCMPEQ        pcmpeqb
> > +# define CHAR_SIZE     1
> > +# define PMINU pminub
> > +#endif
> > +
> > +#define PAGE_SIZE      4096
> > +#define VEC_SIZE       16
> > +
> >         .text
> > -ENTRY (strrchr)
> > -       movd    %esi, %xmm1
> > +ENTRY(STRRCHR)
> > +       movd    %esi, %xmm0
> >         movq    %rdi, %rax
> > -       andl    $4095, %eax
> > -       punpcklbw       %xmm1, %xmm1
> > -       cmpq    $4032, %rax
> > -       punpcklwd       %xmm1, %xmm1
> > -       pshufd  $0, %xmm1, %xmm1
> > +       andl    $(PAGE_SIZE - 1), %eax
> > +#ifndef USE_AS_WCSRCHR
> > +       punpcklbw %xmm0, %xmm0
> > +       punpcklwd %xmm0, %xmm0
> > +#endif
> > +       pshufd  $0, %xmm0, %xmm0
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> >         ja      L(cross_page)
> > -       movdqu  (%rdi), %xmm0
> > +
> > +L(cross_page_continue):
> > +       movups  (%rdi), %xmm1
> >         pxor    %xmm2, %xmm2
> > -       movdqa  %xmm0, %xmm3
> > -       pcmpeqb %xmm1, %xmm0
> > -       pcmpeqb %xmm2, %xmm3
> > -       pmovmskb        %xmm0, %ecx
> > -       pmovmskb        %xmm3, %edx
> > -       testq   %rdx, %rdx
> > -       je      L(next_48_bytes)
> > -       leaq    -1(%rdx), %rax
> > -       xorq    %rdx, %rax
> > -       andq    %rcx, %rax
> > -       je      L(exit)
> > -       bsrq    %rax, %rax
> > +       PCMPEQ  %xmm1, %xmm2
> > +       pmovmskb %xmm2, %ecx
> > +       testl   %ecx, %ecx
> > +       jz      L(aligned_more)
> > +
> > +       PCMPEQ  %xmm0, %xmm1
> > +       pmovmskb %xmm1, %eax
> > +       leal    -1(%rcx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(ret0)
> > +       bsrl    %eax, %eax
> >         addq    %rdi, %rax
> > +       /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > +          search CHAR is zero we are correct. Either way `andq
> > +          -CHAR_SIZE, %rax` gets the correct result.  */
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +L(ret0):
> >         ret
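(Side note, not from the patch: a C model of the rounding in the comment
above.  pcmpeqd sets all four bytes of a matching wchar_t and pmovmskb
reports one bit per byte, so bsr can land up to 3 bytes past the start of
the matching element; masking the final pointer down to a CHAR_SIZE
boundary recovers the element start, assuming the input is wchar_t-aligned.
Names here are illustrative only.)

  #include <stddef.h>
  #include <stdint.h>

  static inline wchar_t *
  byte_hit_to_wchar (char *base, unsigned int byte_mask)
  {
    /* byte_mask must be non-zero; 31 - clz is the same as bsrl.  */
    unsigned int hit = 31 - __builtin_clz (byte_mask);
    uintptr_t p = (uintptr_t) (base + hit);	/* may be +3 off */
    return (wchar_t *) (p & -(uintptr_t) sizeof (wchar_t));
  }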
> >
> > +       /* Returns for first vec x1/x2 have hard coded backward search
> > +          path for earlier matches.  */
> >         .p2align 4
> > -L(next_48_bytes):
> > -       movdqu  16(%rdi), %xmm4
> > -       movdqa  %xmm4, %xmm5
> > -       movdqu  32(%rdi), %xmm3
> > -       pcmpeqb %xmm1, %xmm4
> > -       pcmpeqb %xmm2, %xmm5
> > -       movdqu  48(%rdi), %xmm0
> > -       pmovmskb        %xmm5, %edx
> > -       movdqa  %xmm3, %xmm5
> > -       pcmpeqb %xmm1, %xmm3
> > -       pcmpeqb %xmm2, %xmm5
> > -       pcmpeqb %xmm0, %xmm2
> > -       salq    $16, %rdx
> > -       pmovmskb        %xmm3, %r8d
> > -       pmovmskb        %xmm5, %eax
> > -       pmovmskb        %xmm2, %esi
> > -       salq    $32, %r8
> > -       salq    $32, %rax
> > -       pcmpeqb %xmm1, %xmm0
> > -       orq     %rdx, %rax
> > -       movq    %rsi, %rdx
> > -       pmovmskb        %xmm4, %esi
> > -       salq    $48, %rdx
> > -       salq    $16, %rsi
> > -       orq     %r8, %rsi
> > -       orq     %rcx, %rsi
> > -       pmovmskb        %xmm0, %ecx
> > -       salq    $48, %rcx
> > -       orq     %rcx, %rsi
> > -       orq     %rdx, %rax
> > -       je      L(loop_header2)
> > -       leaq    -1(%rax), %rcx
> > -       xorq    %rax, %rcx
> > -       andq    %rcx, %rsi
> > -       je      L(exit)
> > -       bsrq    %rsi, %rsi
> > -       leaq    (%rdi,%rsi), %rax
> > +L(first_vec_x0_test):
> > +       PCMPEQ  %xmm0, %xmm1
> > +       pmovmskb %xmm1, %eax
> > +       testl   %eax, %eax
> > +       jz      L(ret0)
> > +       bsrl    %eax, %eax
> > +       addq    %r8, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> >         ret
> >
> >         .p2align 4
> > -L(loop_header2):
> > -       testq   %rsi, %rsi
> > -       movq    %rdi, %rcx
> > -       je      L(no_c_found)
> > -L(loop_header):
> > -       addq    $64, %rdi
> > -       pxor    %xmm7, %xmm7
> > -       andq    $-64, %rdi
> > -       jmp     L(loop_entry)
> > +L(first_vec_x1):
> > +       PCMPEQ  %xmm0, %xmm2
> > +       pmovmskb %xmm2, %eax
> > +       leal    -1(%rcx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(first_vec_x0_test)
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> >
> >         .p2align 4
> > -L(loop64):
> > -       testq   %rdx, %rdx
> > -       cmovne  %rdx, %rsi
> > -       cmovne  %rdi, %rcx
> > -       addq    $64, %rdi
> > -L(loop_entry):
> > -       movdqa  32(%rdi), %xmm3
> > -       pxor    %xmm6, %xmm6
> > -       movdqa  48(%rdi), %xmm2
> > -       movdqa  %xmm3, %xmm0
> > -       movdqa  16(%rdi), %xmm4
> > -       pminub  %xmm2, %xmm0
> > -       movdqa  (%rdi), %xmm5
> > -       pminub  %xmm4, %xmm0
> > -       pminub  %xmm5, %xmm0
> > -       pcmpeqb %xmm7, %xmm0
> > -       pmovmskb        %xmm0, %eax
> > -       movdqa  %xmm5, %xmm0
> > -       pcmpeqb %xmm1, %xmm0
> > -       pmovmskb        %xmm0, %r9d
> > -       movdqa  %xmm4, %xmm0
> > -       pcmpeqb %xmm1, %xmm0
> > -       pmovmskb        %xmm0, %edx
> > -       movdqa  %xmm3, %xmm0
> > -       pcmpeqb %xmm1, %xmm0
> > -       salq    $16, %rdx
> > -       pmovmskb        %xmm0, %r10d
> > -       movdqa  %xmm2, %xmm0
> > -       pcmpeqb %xmm1, %xmm0
> > -       salq    $32, %r10
> > -       orq     %r10, %rdx
> > -       pmovmskb        %xmm0, %r8d
> > -       orq     %r9, %rdx
> > -       salq    $48, %r8
> > -       orq     %r8, %rdx
> > +L(first_vec_x1_test):
> > +       PCMPEQ  %xmm0, %xmm2
> > +       pmovmskb %xmm2, %eax
> >         testl   %eax, %eax
> > -       je      L(loop64)
> > -       pcmpeqb %xmm6, %xmm4
> > -       pcmpeqb %xmm6, %xmm3
> > -       pcmpeqb %xmm6, %xmm5
> > -       pmovmskb        %xmm4, %eax
> > -       pmovmskb        %xmm3, %r10d
> > -       pcmpeqb %xmm6, %xmm2
> > -       pmovmskb        %xmm5, %r9d
> > -       salq    $32, %r10
> > -       salq    $16, %rax
> > -       pmovmskb        %xmm2, %r8d
> > -       orq     %r10, %rax
> > -       orq     %r9, %rax
> > -       salq    $48, %r8
> > -       orq     %r8, %rax
> > -       leaq    -1(%rax), %r8
> > -       xorq    %rax, %r8
> > -       andq    %r8, %rdx
> > -       cmovne  %rdi, %rcx
> > -       cmovne  %rdx, %rsi
> > -       bsrq    %rsi, %rsi
> > -       leaq    (%rcx,%rsi), %rax
> > +       jz      L(first_vec_x0_test)
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> > +
> > +       .p2align 4
> > +L(first_vec_x2):
> > +       PCMPEQ  %xmm0, %xmm3
> > +       pmovmskb %xmm3, %eax
> > +       leal    -1(%rcx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(first_vec_x1_test)
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> > +
> > +       .p2align 4
> > +L(aligned_more):
> > +       /* Save original pointer if match was in VEC 0.  */
> > +       movq    %rdi, %r8
> > +       andq    $-VEC_SIZE, %rdi
> > +
> > +       movaps  VEC_SIZE(%rdi), %xmm2
> > +       pxor    %xmm3, %xmm3
> > +       PCMPEQ  %xmm2, %xmm3
> > +       pmovmskb %xmm3, %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(first_vec_x1)
> > +
> > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm3
> > +       pxor    %xmm4, %xmm4
> > +       PCMPEQ  %xmm3, %xmm4
> > +       pmovmskb %xmm4, %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(first_vec_x2)
> > +
> > +       addq    $VEC_SIZE, %rdi
> > +       /* Save pointer again before realigning.  */
> > +       movq    %rdi, %rsi
> > +       andq    $-(VEC_SIZE * 2), %rdi
> > +       .p2align 4
> > +L(first_loop):
> > +       /* Do 2x VEC at a time.  */
> > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> > +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> > +       /* If SSE2 no pminud.  */
> > +#ifdef NO_PMINU
>
> Do we really need SSE4.1 wcsrchr?  I think we should focus on AVX2 and
> above.

It seems like free performance that can make a difference in the loop
cases (see the SSE4.1 commit for numbers).

IMO there is little harm, but if you feel strongly I'll drop it.  (In V2 I
will change the .text section for SSE4_1.)

What do you think?
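
(For illustration only, not part of the patch: roughly what the NO_PMINU
toggle changes in the wcsrchr loop, written as intrinsics.  With SSE4.1 a
single pminud folds the two per-vector zero checks into one compare.)

  #include <emmintrin.h>	/* SSE2 */
  #include <smmintrin.h>	/* SSE4.1: _mm_min_epu32 (pminud) */

  /* Both return a non-zero byte mask iff either vector holds a zero
     wchar_t.  */
  static inline int
  has_zero_sse2 (__m128i v0, __m128i v1)
  {
    __m128i z = _mm_setzero_si128 ();
    /* 2x pcmpeqd + por, as in the NO_PMINU path.  */
    return _mm_movemask_epi8 (_mm_or_si128 (_mm_cmpeq_epi32 (v0, z),
					    _mm_cmpeq_epi32 (v1, z)));
  }

  static inline int
  has_zero_sse4_1 (__m128i v0, __m128i v1)
  {
    __m128i z = _mm_setzero_si128 ();
    /* pminud + pcmpeqd: a lane of the unsigned min is zero iff that
       lane is zero in v0 or v1.  */
    return _mm_movemask_epi8 (_mm_cmpeq_epi32 (_mm_min_epu32 (v0, v1), z));
  }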
>
> > +       movaps  %xmm5, %xmm6
> > +       pxor    %xmm8, %xmm8
> > +
> > +       PCMPEQ  %xmm8, %xmm5
> > +       PCMPEQ  %xmm4, %xmm8
> > +       por     %xmm5, %xmm8
> > +#else
> > +       movaps  %xmm5, %xmm6
> > +       PMINU   %xmm4, %xmm5
> > +#endif
> > +
> > +       movaps  %xmm4, %xmm9
> > +       PCMPEQ  %xmm0, %xmm4
> > +       PCMPEQ  %xmm0, %xmm6
> > +       movaps  %xmm6, %xmm7
> > +       por     %xmm4, %xmm6
> > +#ifndef NO_PMINU
> > +       pxor    %xmm8, %xmm8
> > +       PCMPEQ  %xmm5, %xmm8
> > +#endif
> > +       pmovmskb %xmm8, %ecx
> > +       pmovmskb %xmm6, %eax
> > +
> > +       addq    $(VEC_SIZE * 2), %rdi
> > +       /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> > +          macro-fuse with `jz`.  */
> > +       addl    %ecx, %eax
> > +       jz      L(first_loop)
> > +
> > +       /* Check if there is zero match.  */
> > +       testl   %ecx, %ecx
> > +       jz      L(second_loop_match)
> > +
> > +       /* Check if there was a match in last iteration.  */
> > +       subl    %ecx, %eax
> > +       jnz     L(new_match)
> > +
> > +L(first_loop_old_match):
> > +       PCMPEQ  %xmm0, %xmm2
> > +       PCMPEQ  %xmm0, %xmm3
> > +       pmovmskb %xmm2, %ecx
> > +       pmovmskb %xmm3, %eax
> > +       addl    %eax, %ecx
> > +       jz      L(first_vec_x0_test)
> > +       /* NB: We could move this shift to before the branch and save a
> > +          bit of code size / performance on the fall through. The
> > +          branch leads to the null case which generally seems hotter
> > +          than char in first 3x VEC.  */
> > +       sall    $16, %eax
> > +       orl     %ecx, %eax
> > +
> > +       bsrl    %eax, %eax
> > +       addq    %rsi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> > +
> > +       .p2align 4
> > +L(new_match):
> > +       pxor    %xmm6, %xmm6
> > +       PCMPEQ  %xmm9, %xmm6
> > +       pmovmskb %xmm6, %eax
> > +       sall    $16, %ecx
> > +       orl     %eax, %ecx
> > +
> > +       /* We can't reuse either of the old comparisons: since we mask
> > +          off zeros after the first zero (instead of using the full
> > +          comparison), we can't guarantee no interference between a
> > +          match after the end of the string and a valid match.  */
> > +       pmovmskb %xmm4, %eax
> > +       pmovmskb %xmm7, %edx
> > +       sall    $16, %edx
> > +       orl     %edx, %eax
> > +
> > +       leal    -1(%ecx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(first_loop_old_match)
> > +       bsrl    %eax, %eax
> > +       addq    %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> >         ret
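
(Illustrative only, not part of the patch: the lea -1 / xor / and sequence
above in C.  zero_mask ^ (zero_mask - 1) keeps the bits at or below the
first null byte, so CHAR matches that lie past the end of the string are
discarded before the bsr.)

  /* Keep only CHAR matches at or before the first null terminator.  */
  static inline unsigned int
  matches_before_null (unsigned int char_mask, unsigned int zero_mask)
  {
    unsigned int valid = zero_mask ^ (zero_mask - 1);
    return char_mask & valid;
  }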
> >
> > +       /* Save minimum state for getting most recent match. We can
> > +          throw out all previous work.  */
> >         .p2align 4
> > -L(no_c_found):
> > -       movl    $1, %esi
> > -       xorl    %ecx, %ecx
> > -       jmp     L(loop_header)
> > +L(second_loop_match):
> > +       movq    %rdi, %rsi
> > +       movaps  %xmm4, %xmm2
> > +       movaps  %xmm7, %xmm3
> >
> >         .p2align 4
> > -L(exit):
> > -       xorl    %eax, %eax
> > +L(second_loop):
> > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> > +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> > +#ifdef NO_PMINU
> > +       movaps  %xmm5, %xmm6
> > +       pxor    %xmm8, %xmm8
> > +
> > +       PCMPEQ  %xmm8, %xmm5
> > +       PCMPEQ  %xmm4, %xmm8
> > +       por     %xmm5, %xmm8
> > +#else
> > +       movaps  %xmm5, %xmm6
> > +       PMINU   %xmm4, %xmm5
> > +#endif
> > +
> > +       movaps  %xmm4, %xmm9
> > +       PCMPEQ  %xmm0, %xmm4
> > +       PCMPEQ  %xmm0, %xmm6
> > +       movaps  %xmm6, %xmm7
> > +       por     %xmm4, %xmm6
> > +#ifndef NO_PMINU
> > +       pxor    %xmm8, %xmm8
> > +       PCMPEQ  %xmm5, %xmm8
> > +#endif
> > +
> > +       pmovmskb %xmm8, %ecx
> > +       pmovmskb %xmm6, %eax
> > +
> > +       addq    $(VEC_SIZE * 2), %rdi
> > +       /* Either null term or new occurrence of CHAR.  */
> > +       addl    %ecx, %eax
> > +       jz      L(second_loop)
> > +
> > +       /* No null term so it must be a new occurrence of CHAR.  */
> > +       testl   %ecx, %ecx
> > +       jz      L(second_loop_match)
> > +
> > +
> > +       subl    %ecx, %eax
> > +       jnz     L(second_loop_new_match)
> > +
> > +L(second_loop_old_match):
> > +       pmovmskb %xmm2, %ecx
> > +       pmovmskb %xmm3, %eax
> > +       sall    $16, %eax
> > +       orl     %ecx, %eax
> > +       bsrl    %eax, %eax
> > +       addq    %rsi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> >         ret
> >
> >         .p2align 4
> > +L(second_loop_new_match):
> > +       pxor    %xmm6, %xmm6
> > +       PCMPEQ  %xmm9, %xmm6
> > +       pmovmskb %xmm6, %eax
> > +       sall    $16, %ecx
> > +       orl     %eax, %ecx
> > +
> > +       /* We can't reuse either of the old comparisons: since we mask
> > +          off zeros after the first zero (instead of using the full
> > +          comparison), we can't guarantee no interference between a
> > +          match after the end of the string and a valid match.  */
> > +       pmovmskb %xmm4, %eax
> > +       pmovmskb %xmm7, %edx
> > +       sall    $16, %edx
> > +       orl     %edx, %eax
> > +
> > +       leal    -1(%ecx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(second_loop_old_match)
> > +       bsrl    %eax, %eax
> > +       addq    %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> > +
> > +       .p2align 4,, 4
> >  L(cross_page):
> > -       movq    %rdi, %rax
> > -       pxor    %xmm0, %xmm0
> > -       andq    $-64, %rax
> > -       movdqu  (%rax), %xmm5
> > -       movdqa  %xmm5, %xmm6
> > -       movdqu  16(%rax), %xmm4
> > -       pcmpeqb %xmm1, %xmm5
> > -       pcmpeqb %xmm0, %xmm6
> > -       movdqu  32(%rax), %xmm3
> > -       pmovmskb        %xmm6, %esi
> > -       movdqa  %xmm4, %xmm6
> > -       movdqu  48(%rax), %xmm2
> > -       pcmpeqb %xmm1, %xmm4
> > -       pcmpeqb %xmm0, %xmm6
> > -       pmovmskb        %xmm6, %edx
> > -       movdqa  %xmm3, %xmm6
> > -       pcmpeqb %xmm1, %xmm3
> > -       pcmpeqb %xmm0, %xmm6
> > -       pcmpeqb %xmm2, %xmm0
> > -       salq    $16, %rdx
> > -       pmovmskb        %xmm3, %r9d
> > -       pmovmskb        %xmm6, %r8d
> > -       pmovmskb        %xmm0, %ecx
> > -       salq    $32, %r9
> > -       salq    $32, %r8
> > -       pcmpeqb %xmm1, %xmm2
> > -       orq     %r8, %rdx
> > -       salq    $48, %rcx
> > -       pmovmskb        %xmm5, %r8d
> > -       orq     %rsi, %rdx
> > -       pmovmskb        %xmm4, %esi
> > -       orq     %rcx, %rdx
> > -       pmovmskb        %xmm2, %ecx
> > -       salq    $16, %rsi
> > -       salq    $48, %rcx
> > -       orq     %r9, %rsi
> > -       orq     %r8, %rsi
> > -       orq     %rcx, %rsi
> > +       movq    %rdi, %rsi
> > +       andq    $-VEC_SIZE, %rsi
> > +       movaps  (%rsi), %xmm1
> > +       pxor    %xmm2, %xmm2
> > +       PCMPEQ  %xmm1, %xmm2
> > +       pmovmskb %xmm2, %edx
> >         movl    %edi, %ecx
> > -       subl    %eax, %ecx
> > -       shrq    %cl, %rdx
> > -       shrq    %cl, %rsi
> > -       testq   %rdx, %rdx
> > -       je      L(loop_header2)
> > -       leaq    -1(%rdx), %rax
> > -       xorq    %rdx, %rax
> > -       andq    %rax, %rsi
> > -       je      L(exit)
> > -       bsrq    %rsi, %rax
> > +       andl    $(VEC_SIZE - 1), %ecx
> > +       sarl    %cl, %edx
> > +       jz      L(cross_page_continue)
> > +       PCMPEQ  %xmm0, %xmm1
> > +       pmovmskb %xmm1, %eax
> > +       sarl    %cl, %eax
> > +       leal    -1(%rdx), %ecx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(ret1)
> > +       bsrl    %eax, %eax
> >         addq    %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +L(ret1):
> >         ret
> > -END (strrchr)
> > +END(STRRCHR)
> >
> > -weak_alias (strrchr, rindex)
> > -libc_hidden_builtin_def (strrchr)
> > +#ifndef USE_AS_WCSRCHR
> > +       weak_alias (STRRCHR, rindex)
> > +       libc_hidden_builtin_def (STRRCHR)
> > +#endif
> > diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> > index 61552954de..2b80efc5ef 100644
> > --- a/sysdeps/x86_64/wcsrchr.S
> > +++ b/sysdeps/x86_64/wcsrchr.S
> > @@ -1,4 +1,4 @@
> > -/* wcsrchr with SSSE3
> > +/* wcsrchr optimized with SSE2.
> >     Copyright (C) 2011-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> > @@ -16,266 +16,12 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#include <sysdep.h>
> >
> > -       .text
> > -ENTRY (wcsrchr)
> > +#define USE_AS_WCSRCHR 1
> > +#define NO_PMINU       1
> >
> > -       movd    %rsi, %xmm1
> > -       mov     %rdi, %rcx
> > -       punpckldq %xmm1, %xmm1
> > -       pxor    %xmm2, %xmm2
> > -       punpckldq %xmm1, %xmm1
> > -       and     $63, %rcx
> > -       cmp     $48, %rcx
> > -       ja      L(crosscache)
> > +#ifndef STRRCHR
> > +# define STRRCHR       wcsrchr
> > +#endif
> >
> > -       movdqu  (%rdi), %xmm0
> > -       pcmpeqd %xmm0, %xmm2
> > -       pcmpeqd %xmm1, %xmm0
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm0, %rax
> > -       add     $16, %rdi
> > -
> > -       test    %rax, %rax
> > -       jnz     L(unaligned_match1)
> > -
> > -       test    %rcx, %rcx
> > -       jnz     L(return_null)
> > -
> > -       and     $-16, %rdi
> > -       xor     %r8, %r8
> > -       jmp     L(loop)
> > -
> > -       .p2align 4
> > -L(unaligned_match1):
> > -       test    %rcx, %rcx
> > -       jnz     L(prolog_find_zero_1)
> > -
> > -       mov     %rax, %r8
> > -       mov     %rdi, %rsi
> > -       and     $-16, %rdi
> > -       jmp     L(loop)
> > -
> > -       .p2align 4
> > -L(crosscache):
> > -       and     $15, %rcx
> > -       and     $-16, %rdi
> > -       pxor    %xmm3, %xmm3
> > -       movdqa  (%rdi), %xmm0
> > -       pcmpeqd %xmm0, %xmm3
> > -       pcmpeqd %xmm1, %xmm0
> > -       pmovmskb %xmm3, %rdx
> > -       pmovmskb %xmm0, %rax
> > -       shr     %cl, %rdx
> > -       shr     %cl, %rax
> > -       add     $16, %rdi
> > -
> > -       test    %rax, %rax
> > -       jnz     L(unaligned_match)
> > -
> > -       test    %rdx, %rdx
> > -       jnz     L(return_null)
> > -
> > -       xor     %r8, %r8
> > -       jmp     L(loop)
> > -
> > -       .p2align 4
> > -L(unaligned_match):
> > -       test    %rdx, %rdx
> > -       jnz     L(prolog_find_zero)
> > -
> > -       mov     %rax, %r8
> > -       lea     (%rdi, %rcx), %rsi
> > -
> > -/* Loop start on aligned string.  */
> > -       .p2align 4
> > -L(loop):
> > -       movdqa  (%rdi), %xmm0
> > -       pcmpeqd %xmm0, %xmm2
> > -       add     $16, %rdi
> > -       pcmpeqd %xmm1, %xmm0
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm0, %rax
> > -       or      %rax, %rcx
> > -       jnz     L(matches)
> > -
> > -       movdqa  (%rdi), %xmm3
> > -       pcmpeqd %xmm3, %xmm2
> > -       add     $16, %rdi
> > -       pcmpeqd %xmm1, %xmm3
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm3, %rax
> > -       or      %rax, %rcx
> > -       jnz     L(matches)
> > -
> > -       movdqa  (%rdi), %xmm4
> > -       pcmpeqd %xmm4, %xmm2
> > -       add     $16, %rdi
> > -       pcmpeqd %xmm1, %xmm4
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm4, %rax
> > -       or      %rax, %rcx
> > -       jnz     L(matches)
> > -
> > -       movdqa  (%rdi), %xmm5
> > -       pcmpeqd %xmm5, %xmm2
> > -       add     $16, %rdi
> > -       pcmpeqd %xmm1, %xmm5
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm5, %rax
> > -       or      %rax, %rcx
> > -       jz      L(loop)
> > -
> > -       .p2align 4
> > -L(matches):
> > -       test    %rax, %rax
> > -       jnz     L(match)
> > -L(return_value):
> > -       test    %r8, %r8
> > -       jz      L(return_null)
> > -       mov     %r8, %rax
> > -       mov     %rsi, %rdi
> > -
> > -       test    $15 << 4, %ah
> > -       jnz     L(match_fourth_wchar)
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(match):
> > -       pmovmskb %xmm2, %rcx
> > -       test    %rcx, %rcx
> > -       jnz     L(find_zero)
> > -       mov     %rax, %r8
> > -       mov     %rdi, %rsi
> > -       jmp     L(loop)
> > -
> > -       .p2align 4
> > -L(find_zero):
> > -       test    $15, %cl
> > -       jnz     L(find_zero_in_first_wchar)
> > -       test    %cl, %cl
> > -       jnz     L(find_zero_in_second_wchar)
> > -       test    $15, %ch
> > -       jnz     L(find_zero_in_third_wchar)
> > -
> > -       and     $1 << 13 - 1, %rax
> > -       jz      L(return_value)
> > -
> > -       test    $15 << 4, %ah
> > -       jnz     L(match_fourth_wchar)
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(find_zero_in_first_wchar):
> > -       test    $1, %rax
> > -       jz      L(return_value)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(find_zero_in_second_wchar):
> > -       and     $1 << 5 - 1, %rax
> > -       jz      L(return_value)
> > -
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(find_zero_in_third_wchar):
> > -       and     $1 << 9 - 1, %rax
> > -       jz      L(return_value)
> > -
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(prolog_find_zero):
> > -       add     %rcx, %rdi
> > -       mov     %rdx, %rcx
> > -L(prolog_find_zero_1):
> > -       test    $15, %cl
> > -       jnz     L(prolog_find_zero_in_first_wchar)
> > -       test    %cl, %cl
> > -       jnz     L(prolog_find_zero_in_second_wchar)
> > -       test    $15, %ch
> > -       jnz     L(prolog_find_zero_in_third_wchar)
> > -
> > -       and     $1 << 13 - 1, %rax
> > -       jz      L(return_null)
> > -
> > -       test    $15 << 4, %ah
> > -       jnz     L(match_fourth_wchar)
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(prolog_find_zero_in_first_wchar):
> > -       test    $1, %rax
> > -       jz      L(return_null)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(prolog_find_zero_in_second_wchar):
> > -       and     $1 << 5 - 1, %rax
> > -       jz      L(return_null)
> > -
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(prolog_find_zero_in_third_wchar):
> > -       and     $1 << 9 - 1, %rax
> > -       jz      L(return_null)
> > -
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(match_second_wchar):
> > -       lea     -12(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(match_third_wchar):
> > -       lea     -8(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(match_fourth_wchar):
> > -       lea     -4(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(return_null):
> > -       xor     %rax, %rax
> > -       ret
> > -
> > -END (wcsrchr)
> > +#include "../strrchr.S"
> > --
> > 2.25.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2
  2022-04-21 20:57     ` Noah Goldstein
@ 2022-04-21 21:48       ` H.J. Lu
  2022-04-21 22:23         ` Noah Goldstein
  0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-21 21:48 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 21, 2022 at 1:57 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 3:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > The new code unrolls the main loop slightly without adding too much
> > > overhead and minimizes the comparisons for the search CHAR.
> > >
> > > Geometric Mean of all benchmarks New / Old: 0.741
> > > See email for all results.
> > >
> > > Full xcheck passes on x86_64 with and without multiarch enabled.
> > > ---
> > > Results For: strrchr
> > >
> > > Geometric Mean of N=30 runs.
> > >
> > > Geometric Mean of all benchmarks New / Old: 0.741
> > > Benchmarks performance on Tigerlake:
> > > https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> > >
> > >  len, align,  pos, seek, max_char, freq, New Time / Old Time
> > > 2048,     0,   32,    0,      127,    1,               0.647
> > > 2048,     1,   32,    0,      127,    1,               0.621
> > > 2048,     0,   64,    0,      127,    1,               0.661
> > > 2048,     2,   64,    0,      127,    1,               0.655
> > > 2048,     0,  128,    0,      127,    1,                0.69
> > > 2048,     3,  128,    0,      127,    1,               0.689
> > > 2048,     0,  256,    0,      127,    1,               0.718
> > > 2048,     4,  256,    0,      127,    1,               0.718
> > > 2048,     0,  512,    0,      127,    1,               0.758
> > > 2048,     5,  512,    0,      127,    1,               0.754
> > > 2048,     0, 1024,    0,      127,    1,               1.029
> > > 2048,     6, 1024,    0,      127,    1,               1.032
> > > 2048,     0, 2048,    0,      127,    1,               0.826
> > > 2048,     7, 2048,    0,      127,    1,               0.834
> > > 2048,     0, 4096,    0,      127,    1,               0.825
> > > 2048,     8, 4096,    0,      127,    1,                0.83
> > >  256,     1,   64,    0,      127,    1,               0.657
> > >  256,    15,   64,    0,      127,    1,               0.657
> > >  256,     2,   64,    0,      127,    1,               0.657
> > >  256,    30,   64,    0,      127,    1,               0.523
> > >  256,     3,   64,    0,      127,    1,               0.657
> > >  256,    45,   64,    0,      127,    1,               0.654
> > >  256,     4,   64,    0,      127,    1,               0.657
> > >  256,    60,   64,    0,      127,    1,               0.526
> > >  256,     5,   64,    0,      127,    1,               0.658
> > >  256,    75,   64,    0,      127,    1,               0.658
> > >  256,     6,   64,    0,      127,    1,               0.655
> > >  256,    90,   64,    0,      127,    1,               0.523
> > >  256,     7,   64,    0,      127,    1,               0.655
> > >  256,   105,   64,    0,      127,    1,               0.654
> > >    1,     0,    0,    0,      127,    1,                0.98
> > >    2,     0,    1,    0,      127,    1,               0.978
> > >    3,     0,    2,    0,      127,    1,               0.975
> > >    4,     0,    3,    0,      127,    1,               0.976
> > >    5,     0,    4,    0,      127,    1,               0.977
> > >    6,     0,    5,    0,      127,    1,               0.981
> > >    7,     0,    6,    0,      127,    1,               0.982
> > >    8,     0,    7,    0,      127,    1,                0.98
> > >    9,     0,    8,    0,      127,    1,               0.978
> > >   10,     0,    9,    0,      127,    1,               0.981
> > >   11,     0,   10,    0,      127,    1,               0.984
> > >   12,     0,   11,    0,      127,    1,               0.982
> > >   13,     0,   12,    0,      127,    1,                0.98
> > >   14,     0,   13,    0,      127,    1,               0.978
> > >   15,     0,   14,    0,      127,    1,               0.979
> > >   16,     0,   15,    0,      127,    1,               0.986
> > >   17,     0,   16,    0,      127,    1,               0.529
> > >   18,     0,   17,    0,      127,    1,               0.566
> > >   19,     0,   18,    0,      127,    1,               0.575
> > >   20,     0,   19,    0,      127,    1,               0.573
> > >   21,     0,   20,    0,      127,    1,               0.579
> > >   22,     0,   21,    0,      127,    1,               0.595
> > >   23,     0,   22,    0,      127,    1,               0.585
> > >   24,     0,   23,    0,      127,    1,               0.586
> > >   25,     0,   24,    0,      127,    1,               0.587
> > >   26,     0,   25,    0,      127,    1,               0.592
> > >   27,     0,   26,    0,      127,    1,               0.595
> > >   28,     0,   27,    0,      127,    1,               0.592
> > >   29,     0,   28,    0,      127,    1,                 0.6
> > >   30,     0,   29,    0,      127,    1,               0.598
> > >   31,     0,   30,    0,      127,    1,               0.595
> > >   32,     0,   31,    0,      127,    1,               0.592
> > > 2048,     0,   32,   23,      127,    1,               0.827
> > > 2048,     1,   32,   23,      127,    1,               0.826
> > > 2048,     0,   64,   23,      127,    1,               0.824
> > > 2048,     2,   64,   23,      127,    1,               0.825
> > > 2048,     0,  128,   23,      127,    1,               0.829
> > > 2048,     3,  128,   23,      127,    1,               0.824
> > > 2048,     0,  256,   23,      127,    1,               0.832
> > > 2048,     4,  256,   23,      127,    1,               0.825
> > > 2048,     0,  512,   23,      127,    1,               0.831
> > > 2048,     5,  512,   23,      127,    1,               0.837
> > > 2048,     0, 1024,   23,      127,    1,               0.721
> > > 2048,     6, 1024,   23,      127,    1,               0.757
> > > 2048,     0, 2048,   23,      127,    1,               0.825
> > > 2048,     7, 2048,   23,      127,    1,               0.824
> > > 2048,     0, 4096,   23,      127,    1,               0.828
> > > 2048,     8, 4096,   23,      127,    1,               0.823
> > >  256,     1,   64,   23,      127,    1,               0.665
> > >  256,    15,   64,   23,      127,    1,               0.661
> > >  256,     2,   64,   23,      127,    1,               0.674
> > >  256,    30,   64,   23,      127,    1,               0.605
> > >  256,     3,   64,   23,      127,    1,               0.668
> > >  256,    45,   64,   23,      127,    1,               0.661
> > >  256,     4,   64,   23,      127,    1,               0.657
> > >  256,    60,   64,   23,      127,    1,               0.594
> > >  256,     5,   64,   23,      127,    1,               0.654
> > >  256,    75,   64,   23,      127,    1,               0.673
> > >  256,     6,   64,   23,      127,    1,               0.688
> > >  256,    90,   64,   23,      127,    1,                 0.6
> > >  256,     7,   64,   23,      127,    1,                0.66
> > >  256,   105,   64,   23,      127,    1,               0.654
> > >    1,     0,    0,   23,      127,    1,               0.981
> > >    2,     0,    1,   23,      127,    1,               0.976
> > >    3,     0,    2,   23,      127,    1,               0.983
> > >    4,     0,    3,   23,      127,    1,               0.984
> > >    5,     0,    4,   23,      127,    1,               0.973
> > >    6,     0,    5,   23,      127,    1,               0.987
> > >    7,     0,    6,   23,      127,    1,               0.977
> > >    8,     0,    7,   23,      127,    1,               0.979
> > >    9,     0,    8,   23,      127,    1,               0.981
> > >   10,     0,    9,   23,      127,    1,                0.98
> > >   11,     0,   10,   23,      127,    1,               0.983
> > >   12,     0,   11,   23,      127,    1,                0.98
> > >   13,     0,   12,   23,      127,    1,                0.98
> > >   14,     0,   13,   23,      127,    1,               0.977
> > >   15,     0,   14,   23,      127,    1,               0.982
> > >   16,     0,   15,   23,      127,    1,               0.581
> > >   17,     0,   16,   23,      127,    1,               0.551
> > >   18,     0,   17,   23,      127,    1,               0.555
> > >   19,     0,   18,   23,      127,    1,               0.586
> > >   20,     0,   19,   23,      127,    1,               0.585
> > >   21,     0,   20,   23,      127,    1,               0.582
> > >   22,     0,   21,   23,      127,    1,               0.571
> > >   23,     0,   22,   23,      127,    1,               0.576
> > >   24,     0,   23,   23,      127,    1,               0.581
> > >   25,     0,   24,   23,      127,    1,               0.589
> > >   26,     0,   25,   23,      127,    1,               0.593
> > >   27,     0,   26,   23,      127,    1,               0.595
> > >   28,     0,   27,   23,      127,    1,               0.583
> > >   29,     0,   28,   23,      127,    1,               0.595
> > >   30,     0,   29,   23,      127,    1,                0.58
> > >   31,     0,   30,   23,      127,    1,               0.594
> > >   32,     0,   31,   23,      127,    1,               0.665
> > > 2048,     0,   32,   23,      127,    2,               0.825
> > > 2048,     1,   32,   23,      127,    2,               0.818
> > > 2048,     0,   64,   23,      127,    2,               0.829
> > > 2048,     2,   64,   23,      127,    2,               0.828
> > > 2048,     0,  128,   23,      127,    2,               0.823
> > > 2048,     3,  128,   23,      127,    2,               0.825
> > > 2048,     0,  256,   23,      127,    2,               0.819
> > > 2048,     4,  256,   23,      127,    2,               0.828
> > > 2048,     0,  512,   23,      127,    2,               0.824
> > > 2048,     5,  512,   23,      127,    2,               0.827
> > > 2048,     0, 1024,   23,      127,    2,               0.813
> > > 2048,     6, 1024,   23,      127,    2,               0.834
> > > 2048,     0, 2048,   23,      127,    2,               0.927
> > > 2048,     7, 2048,   23,      127,    2,               0.923
> > > 2048,     0, 4096,   23,      127,    2,               0.818
> > > 2048,     8, 4096,   23,      127,    2,                0.82
> > >  256,     1,   64,   23,      127,    2,               0.693
> > >  256,    15,   64,   23,      127,    2,               0.686
> > >  256,     2,   64,   23,      127,    2,                0.69
> > >  256,    30,   64,   23,      127,    2,               0.611
> > >  256,     3,   64,   23,      127,    2,               0.692
> > >  256,    45,   64,   23,      127,    2,               0.685
> > >  256,     4,   64,   23,      127,    2,               0.688
> > >  256,    60,   64,   23,      127,    2,                 0.6
> > >  256,     5,   64,   23,      127,    2,                0.69
> > >  256,    75,   64,   23,      127,    2,               0.689
> > >  256,     6,   64,   23,      127,    2,               0.688
> > >  256,    90,   64,   23,      127,    2,               0.611
> > >  256,     7,   64,   23,      127,    2,                0.69
> > >  256,   105,   64,   23,      127,    2,               0.686
> > >    1,     0,    0,   23,      127,    2,               0.982
> > >    2,     0,    1,   23,      127,    2,               0.987
> > >    3,     0,    2,   23,      127,    2,               0.978
> > >    4,     0,    3,   23,      127,    2,               0.977
> > >    5,     0,    4,   23,      127,    2,               0.979
> > >    6,     0,    5,   23,      127,    2,               0.985
> > >    7,     0,    6,   23,      127,    2,               0.975
> > >    8,     0,    7,   23,      127,    2,               0.981
> > >    9,     0,    8,   23,      127,    2,               0.984
> > >   10,     0,    9,   23,      127,    2,               0.983
> > >   11,     0,   10,   23,      127,    2,               0.982
> > >   12,     0,   11,   23,      127,    2,               0.976
> > >   13,     0,   12,   23,      127,    2,               0.985
> > >   14,     0,   13,   23,      127,    2,               0.984
> > >   15,     0,   14,   23,      127,    2,                0.98
> > >   16,     0,   15,   23,      127,    2,               0.583
> > >   17,     0,   16,   23,      127,    2,               0.552
> > >   18,     0,   17,   23,      127,    2,               0.564
> > >   19,     0,   18,   23,      127,    2,               0.585
> > >   20,     0,   19,   23,      127,    2,               0.578
> > >   21,     0,   20,   23,      127,    2,               0.578
> > >   22,     0,   21,   23,      127,    2,               0.571
> > >   23,     0,   22,   23,      127,    2,               0.587
> > >   24,     0,   23,   23,      127,    2,               0.589
> > >   25,     0,   24,   23,      127,    2,               0.593
> > >   26,     0,   25,   23,      127,    2,               0.589
> > >   27,     0,   26,   23,      127,    2,               0.588
> > >   28,     0,   27,   23,      127,    2,               0.593
> > >   29,     0,   28,   23,      127,    2,               0.579
> > >   30,     0,   29,   23,      127,    2,               0.572
> > >   31,     0,   30,   23,      127,    2,               0.582
> > >   32,     0,   31,   23,      127,    2,               0.659
> > > 2048,     0,   32,   23,      127,    4,               0.822
> > > 2048,     1,   32,   23,      127,    4,               0.818
> > > 2048,     0,   64,   23,      127,    4,               0.826
> > > 2048,     2,   64,   23,      127,    4,               0.824
> > > 2048,     0,  128,   23,      127,    4,               0.833
> > > 2048,     3,  128,   23,      127,    4,               0.831
> > > 2048,     0,  256,   23,      127,    4,               0.826
> > > 2048,     4,  256,   23,      127,    4,               0.831
> > > 2048,     0,  512,   23,      127,    4,               0.834
> > > 2048,     5,  512,   23,      127,    4,                0.83
> > > 2048,     0, 1024,   23,      127,    4,               0.836
> > > 2048,     6, 1024,   23,      127,    4,               0.844
> > > 2048,     0, 2048,   23,      127,    4,               0.696
> > > 2048,     7, 2048,   23,      127,    4,               0.704
> > > 2048,     0, 4096,   23,      127,    4,               0.936
> > > 2048,     8, 4096,   23,      127,    4,               0.925
> > >  256,     1,   64,   23,      127,    4,               0.694
> > >  256,    15,   64,   23,      127,    4,                0.69
> > >  256,     2,   64,   23,      127,    4,               0.687
> > >  256,    30,   64,   23,      127,    4,               0.612
> > >  256,     3,   64,   23,      127,    4,               0.685
> > >  256,    45,   64,   23,      127,    4,               0.685
> > >  256,     4,   64,   23,      127,    4,               0.684
> > >  256,    60,   64,   23,      127,    4,               0.606
> > >  256,     5,   64,   23,      127,    4,                0.69
> > >  256,    75,   64,   23,      127,    4,               0.688
> > >  256,     6,   64,   23,      127,    4,                0.69
> > >  256,    90,   64,   23,      127,    4,               0.615
> > >  256,     7,   64,   23,      127,    4,               0.691
> > >  256,   105,   64,   23,      127,    4,               0.688
> > >    1,     0,    0,   23,      127,    4,               0.982
> > >    2,     0,    1,   23,      127,    4,               0.983
> > >    3,     0,    2,   23,      127,    4,               0.981
> > >    4,     0,    3,   23,      127,    4,               0.984
> > >    5,     0,    4,   23,      127,    4,               0.963
> > >    6,     0,    5,   23,      127,    4,               0.978
> > >    7,     0,    6,   23,      127,    4,               0.985
> > >    8,     0,    7,   23,      127,    4,               0.986
> > >    9,     0,    8,   23,      127,    4,               0.978
> > >   10,     0,    9,   23,      127,    4,               0.985
> > >   11,     0,   10,   23,      127,    4,               0.986
> > >   12,     0,   11,   23,      127,    4,               0.983
> > >   13,     0,   12,   23,      127,    4,               0.986
> > >   14,     0,   13,   23,      127,    4,                0.98
> > >   15,     0,   14,   23,      127,    4,               0.979
> > >   16,     0,   15,   23,      127,    4,               0.582
> > >   17,     0,   16,   23,      127,    4,               0.542
> > >   18,     0,   17,   23,      127,    4,               0.564
> > >   19,     0,   18,   23,      127,    4,               0.571
> > >   20,     0,   19,   23,      127,    4,               0.582
> > >   21,     0,   20,   23,      127,    4,               0.573
> > >   22,     0,   21,   23,      127,    4,               0.575
> > >   23,     0,   22,   23,      127,    4,               0.578
> > >   24,     0,   23,   23,      127,    4,                0.58
> > >   25,     0,   24,   23,      127,    4,               0.592
> > >   26,     0,   25,   23,      127,    4,               0.588
> > >   27,     0,   26,   23,      127,    4,               0.574
> > >   28,     0,   27,   23,      127,    4,               0.589
> > >   29,     0,   28,   23,      127,    4,                0.56
> > >   30,     0,   29,   23,      127,    4,               0.587
> > >   31,     0,   30,   23,      127,    4,               0.584
> > >   32,     0,   31,   23,      127,    4,               0.664
> > > 2048,     0,   32,   23,      127,    8,               0.826
> > > 2048,     1,   32,   23,      127,    8,               0.821
> > > 2048,     0,   64,   23,      127,    8,               0.828
> > > 2048,     2,   64,   23,      127,    8,               0.827
> > > 2048,     0,  128,   23,      127,    8,               0.833
> > > 2048,     3,  128,   23,      127,    8,                0.83
> > > 2048,     0,  256,   23,      127,    8,               0.855
> > > 2048,     4,  256,   23,      127,    8,               0.849
> > > 2048,     0,  512,   23,      127,    8,               0.849
> > > 2048,     5,  512,   23,      127,    8,               0.851
> > > 2048,     0, 1024,   23,      127,    8,               0.856
> > > 2048,     6, 1024,   23,      127,    8,               0.862
> > > 2048,     0, 2048,   23,      127,    8,               0.709
> > > 2048,     7, 2048,   23,      127,    8,               0.712
> > > 2048,     0, 4096,   23,      127,    8,               0.702
> > > 2048,     8, 4096,   23,      127,    8,               0.701
> > >  256,     1,   64,   23,      127,    8,               0.689
> > >  256,    15,   64,   23,      127,    8,               0.688
> > >  256,     2,   64,   23,      127,    8,               0.691
> > >  256,    30,   64,   23,      127,    8,               0.612
> > >  256,     3,   64,   23,      127,    8,               0.688
> > >  256,    45,   64,   23,      127,    8,               0.686
> > >  256,     4,   64,   23,      127,    8,               0.694
> > >  256,    60,   64,   23,      127,    8,               0.609
> > >  256,     5,   64,   23,      127,    8,                0.69
> > >  256,    75,   64,   23,      127,    8,                0.69
> > >  256,     6,   64,   23,      127,    8,               0.691
> > >  256,    90,   64,   23,      127,    8,               0.612
> > >  256,     7,   64,   23,      127,    8,               0.689
> > >  256,   105,   64,   23,      127,    8,               0.688
> > >    1,     0,    0,   23,      127,    8,                0.98
> > >    2,     0,    1,   23,      127,    8,               0.978
> > >    3,     0,    2,   23,      127,    8,                0.98
> > >    4,     0,    3,   23,      127,    8,               0.978
> > >    5,     0,    4,   23,      127,    8,               0.977
> > >    6,     0,    5,   23,      127,    8,               0.984
> > >    7,     0,    6,   23,      127,    8,               0.982
> > >    8,     0,    7,   23,      127,    8,               0.983
> > >    9,     0,    8,   23,      127,    8,               0.987
> > >   10,     0,    9,   23,      127,    8,               0.979
> > >   11,     0,   10,   23,      127,    8,               0.985
> > >   12,     0,   11,   23,      127,    8,               0.981
> > >   13,     0,   12,   23,      127,    8,                0.98
> > >   14,     0,   13,   23,      127,    8,               0.982
> > >   15,     0,   14,   23,      127,    8,               0.981
> > >   16,     0,   15,   23,      127,    8,               0.579
> > >   17,     0,   16,   23,      127,    8,               0.531
> > >   18,     0,   17,   23,      127,    8,               0.577
> > >   19,     0,   18,   23,      127,    8,               0.588
> > >   20,     0,   19,   23,      127,    8,               0.571
> > >   21,     0,   20,   23,      127,    8,               0.576
> > >   22,     0,   21,   23,      127,    8,                0.59
> > >   23,     0,   22,   23,      127,    8,               0.574
> > >   24,     0,   23,   23,      127,    8,               0.583
> > >   25,     0,   24,   23,      127,    8,               0.581
> > >   26,     0,   25,   23,      127,    8,               0.592
> > >   27,     0,   26,   23,      127,    8,               0.586
> > >   28,     0,   27,   23,      127,    8,               0.588
> > >   29,     0,   28,   23,      127,    8,               0.578
> > >   30,     0,   29,   23,      127,    8,               0.573
> > >   31,     0,   30,   23,      127,    8,               0.588
> > >   32,     0,   31,   23,      127,    8,               0.664
> > > 2048,     0,   32,   23,      127,   16,               0.825
> > > 2048,     1,   32,   23,      127,   16,               0.823
> > > 2048,     0,   64,   23,      127,   16,               0.831
> > > 2048,     2,   64,   23,      127,   16,               0.822
> > > 2048,     0,  128,   23,      127,   16,               0.831
> > > 2048,     3,  128,   23,      127,   16,               0.831
> > > 2048,     0,  256,   23,      127,   16,               0.849
> > > 2048,     4,  256,   23,      127,   16,                0.85
> > > 2048,     0,  512,   23,      127,   16,               0.751
> > > 2048,     5,  512,   23,      127,   16,                0.75
> > > 2048,     0, 1024,   23,      127,   16,               0.913
> > > 2048,     6, 1024,   23,      127,   16,               0.895
> > > 2048,     0, 2048,   23,      127,   16,               0.736
> > > 2048,     7, 2048,   23,      127,   16,               0.741
> > > 2048,     0, 4096,   23,      127,   16,               0.712
> > > 2048,     8, 4096,   23,      127,   16,               0.711
> > >  256,     1,   64,   23,      127,   16,               0.758
> > >  256,    15,   64,   23,      127,   16,               0.692
> > >  256,     2,   64,   23,      127,   16,               0.692
> > >  256,    30,   64,   23,      127,   16,               0.613
> > >  256,     3,   64,   23,      127,   16,                0.69
> > >  256,    45,   64,   23,      127,   16,               0.687
> > >  256,     4,   64,   23,      127,   16,                0.69
> > >  256,    60,   64,   23,      127,   16,               0.604
> > >  256,     5,   64,   23,      127,   16,               0.687
> > >  256,    75,   64,   23,      127,   16,               0.687
> > >  256,     6,   64,   23,      127,   16,                0.69
> > >  256,    90,   64,   23,      127,   16,                0.61
> > >  256,     7,   64,   23,      127,   16,                0.69
> > >  256,   105,   64,   23,      127,   16,               0.685
> > >    1,     0,    0,   23,      127,   16,               0.981
> > >    2,     0,    1,   23,      127,   16,               0.985
> > >    3,     0,    2,   23,      127,   16,               0.985
> > >    4,     0,    3,   23,      127,   16,               0.981
> > >    5,     0,    4,   23,      127,   16,               0.979
> > >    6,     0,    5,   23,      127,   16,               0.986
> > >    7,     0,    6,   23,      127,   16,               0.986
> > >    8,     0,    7,   23,      127,   16,               0.982
> > >    9,     0,    8,   23,      127,   16,               0.982
> > >   10,     0,    9,   23,      127,   16,                0.98
> > >   11,     0,   10,   23,      127,   16,               0.983
> > >   12,     0,   11,   23,      127,   16,               0.982
> > >   13,     0,   12,   23,      127,   16,               0.982
> > >   14,     0,   13,   23,      127,   16,               0.982
> > >   15,     0,   14,   23,      127,   16,               0.982
> > >   16,     0,   15,   23,      127,   16,               0.582
> > >   17,     0,   16,   23,      127,   16,               0.542
> > >   18,     0,   17,   23,      127,   16,               0.554
> > >   19,     0,   18,   23,      127,   16,               0.562
> > >   20,     0,   19,   23,      127,   16,               0.587
> > >   21,     0,   20,   23,      127,   16,               0.584
> > >   22,     0,   21,   23,      127,   16,               0.587
> > >   23,     0,   22,   23,      127,   16,               0.594
> > >   24,     0,   23,   23,      127,   16,               0.581
> > >   25,     0,   24,   23,      127,   16,               0.577
> > >   26,     0,   25,   23,      127,   16,               0.588
> > >   27,     0,   26,   23,      127,   16,               0.589
> > >   28,     0,   27,   23,      127,   16,               0.596
> > >   29,     0,   28,   23,      127,   16,               0.591
> > >   30,     0,   29,   23,      127,   16,               0.585
> > >   31,     0,   30,   23,      127,   16,                0.59
> > >   32,     0,   31,   23,      127,   16,               0.669
> > >
> > >  sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
> > >  sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
> > >  sysdeps/x86_64/strrchr.S                | 505 +++++++++++++++---------
> > >  sysdeps/x86_64/wcsrchr.S                | 268 +------------
> > >  4 files changed, 334 insertions(+), 444 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > index db1b44c23c..866396e947 100644
> > > --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > @@ -17,7 +17,7 @@
> > >     <https://www.gnu.org/licenses/>.  */
> > >
> > >  #if IS_IN (libc)
> > > -# define strrchr __strrchr_sse2
> > > +# define STRRCHR __strrchr_sse2
> > >
> > >  # undef weak_alias
> > >  # define weak_alias(strrchr, rindex)
> > > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > index 78d1ca6553..69d2f3cdb1 100644
> > > --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > @@ -17,7 +17,6 @@
> > >     <https://www.gnu.org/licenses/>.  */
> > >
> > >  #if IS_IN (libc)
> > > -# define wcsrchr __wcsrchr_sse2
> > > +# define STRRCHR       __wcsrchr_sse2
> > >  #endif
> > > -
> > >  #include "../wcsrchr.S"
> > > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> > > index 50d886713e..94449ad806 100644
> > > --- a/sysdeps/x86_64/strrchr.S
> > > +++ b/sysdeps/x86_64/strrchr.S
> > > @@ -19,210 +19,355 @@
> > >
> > >  #include <sysdep.h>
> > >
> > > +#ifndef STRRCHR
> > > +# define STRRCHR       strrchr
> > > +#endif
> > > +
> > > +#ifdef USE_AS_WCSRCHR
> > > +# define PCMPEQ        pcmpeqd
> > > +# define CHAR_SIZE     4
> > > +# define PMINU pminud
> > > +#else
> > > +# define PCMPEQ        pcmpeqb
> > > +# define CHAR_SIZE     1
> > > +# define PMINU pminub
> > > +#endif
> > > +
> > > +#define PAGE_SIZE      4096
> > > +#define VEC_SIZE       16
> > > +
> > >         .text
> > > -ENTRY (strrchr)
> > > -       movd    %esi, %xmm1
> > > +ENTRY(STRRCHR)
> > > +       movd    %esi, %xmm0
> > >         movq    %rdi, %rax
> > > -       andl    $4095, %eax
> > > -       punpcklbw       %xmm1, %xmm1
> > > -       cmpq    $4032, %rax
> > > -       punpcklwd       %xmm1, %xmm1
> > > -       pshufd  $0, %xmm1, %xmm1
> > > +       andl    $(PAGE_SIZE - 1), %eax
> > > +#ifndef USE_AS_WCSRCHR
> > > +       punpcklbw %xmm0, %xmm0
> > > +       punpcklwd %xmm0, %xmm0
> > > +#endif
> > > +       pshufd  $0, %xmm0, %xmm0
> > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > >         ja      L(cross_page)
> > > -       movdqu  (%rdi), %xmm0
> > > +
> > > +L(cross_page_continue):
> > > +       movups  (%rdi), %xmm1
> > >         pxor    %xmm2, %xmm2
> > > -       movdqa  %xmm0, %xmm3
> > > -       pcmpeqb %xmm1, %xmm0
> > > -       pcmpeqb %xmm2, %xmm3
> > > -       pmovmskb        %xmm0, %ecx
> > > -       pmovmskb        %xmm3, %edx
> > > -       testq   %rdx, %rdx
> > > -       je      L(next_48_bytes)
> > > -       leaq    -1(%rdx), %rax
> > > -       xorq    %rdx, %rax
> > > -       andq    %rcx, %rax
> > > -       je      L(exit)
> > > -       bsrq    %rax, %rax
> > > +       PCMPEQ  %xmm1, %xmm2
> > > +       pmovmskb %xmm2, %ecx
> > > +       testl   %ecx, %ecx
> > > +       jz      L(aligned_more)
> > > +
> > > +       PCMPEQ  %xmm0, %xmm1
> > > +       pmovmskb %xmm1, %eax
> > > +       leal    -1(%rcx), %edx
> > > +       xorl    %edx, %ecx
> > > +       andl    %ecx, %eax
> > > +       jz      L(ret0)
> > > +       bsrl    %eax, %eax
> > >         addq    %rdi, %rax
> > > +       /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > > +          search CHAR is zero we are correct. Either way `andq
> > > +          $-CHAR_SIZE, %rax` gets the correct result.  */
> > > +#ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +#endif
> > > +L(ret0):
> > >         ret
> > >
> > > +       /* Returns for first vec x1/x2 have hard coded backward search
> > > +          path for earlier matches.  */
> > >         .p2align 4
> > > -L(next_48_bytes):
> > > -       movdqu  16(%rdi), %xmm4
> > > -       movdqa  %xmm4, %xmm5
> > > -       movdqu  32(%rdi), %xmm3
> > > -       pcmpeqb %xmm1, %xmm4
> > > -       pcmpeqb %xmm2, %xmm5
> > > -       movdqu  48(%rdi), %xmm0
> > > -       pmovmskb        %xmm5, %edx
> > > -       movdqa  %xmm3, %xmm5
> > > -       pcmpeqb %xmm1, %xmm3
> > > -       pcmpeqb %xmm2, %xmm5
> > > -       pcmpeqb %xmm0, %xmm2
> > > -       salq    $16, %rdx
> > > -       pmovmskb        %xmm3, %r8d
> > > -       pmovmskb        %xmm5, %eax
> > > -       pmovmskb        %xmm2, %esi
> > > -       salq    $32, %r8
> > > -       salq    $32, %rax
> > > -       pcmpeqb %xmm1, %xmm0
> > > -       orq     %rdx, %rax
> > > -       movq    %rsi, %rdx
> > > -       pmovmskb        %xmm4, %esi
> > > -       salq    $48, %rdx
> > > -       salq    $16, %rsi
> > > -       orq     %r8, %rsi
> > > -       orq     %rcx, %rsi
> > > -       pmovmskb        %xmm0, %ecx
> > > -       salq    $48, %rcx
> > > -       orq     %rcx, %rsi
> > > -       orq     %rdx, %rax
> > > -       je      L(loop_header2)
> > > -       leaq    -1(%rax), %rcx
> > > -       xorq    %rax, %rcx
> > > -       andq    %rcx, %rsi
> > > -       je      L(exit)
> > > -       bsrq    %rsi, %rsi
> > > -       leaq    (%rdi,%rsi), %rax
> > > +L(first_vec_x0_test):
> > > +       PCMPEQ  %xmm0, %xmm1
> > > +       pmovmskb %xmm1, %eax
> > > +       testl   %eax, %eax
> > > +       jz      L(ret0)
> > > +       bsrl    %eax, %eax
> > > +       addq    %r8, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +#endif
> > >         ret
> > >
> > >         .p2align 4
> > > -L(loop_header2):
> > > -       testq   %rsi, %rsi
> > > -       movq    %rdi, %rcx
> > > -       je      L(no_c_found)
> > > -L(loop_header):
> > > -       addq    $64, %rdi
> > > -       pxor    %xmm7, %xmm7
> > > -       andq    $-64, %rdi
> > > -       jmp     L(loop_entry)
> > > +L(first_vec_x1):
> > > +       PCMPEQ  %xmm0, %xmm2
> > > +       pmovmskb %xmm2, %eax
> > > +       leal    -1(%rcx), %edx
> > > +       xorl    %edx, %ecx
> > > +       andl    %ecx, %eax
> > > +       jz      L(first_vec_x0_test)
> > > +       bsrl    %eax, %eax
> > > +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +#endif
> > > +       ret
> > >
> > >         .p2align 4
> > > -L(loop64):
> > > -       testq   %rdx, %rdx
> > > -       cmovne  %rdx, %rsi
> > > -       cmovne  %rdi, %rcx
> > > -       addq    $64, %rdi
> > > -L(loop_entry):
> > > -       movdqa  32(%rdi), %xmm3
> > > -       pxor    %xmm6, %xmm6
> > > -       movdqa  48(%rdi), %xmm2
> > > -       movdqa  %xmm3, %xmm0
> > > -       movdqa  16(%rdi), %xmm4
> > > -       pminub  %xmm2, %xmm0
> > > -       movdqa  (%rdi), %xmm5
> > > -       pminub  %xmm4, %xmm0
> > > -       pminub  %xmm5, %xmm0
> > > -       pcmpeqb %xmm7, %xmm0
> > > -       pmovmskb        %xmm0, %eax
> > > -       movdqa  %xmm5, %xmm0
> > > -       pcmpeqb %xmm1, %xmm0
> > > -       pmovmskb        %xmm0, %r9d
> > > -       movdqa  %xmm4, %xmm0
> > > -       pcmpeqb %xmm1, %xmm0
> > > -       pmovmskb        %xmm0, %edx
> > > -       movdqa  %xmm3, %xmm0
> > > -       pcmpeqb %xmm1, %xmm0
> > > -       salq    $16, %rdx
> > > -       pmovmskb        %xmm0, %r10d
> > > -       movdqa  %xmm2, %xmm0
> > > -       pcmpeqb %xmm1, %xmm0
> > > -       salq    $32, %r10
> > > -       orq     %r10, %rdx
> > > -       pmovmskb        %xmm0, %r8d
> > > -       orq     %r9, %rdx
> > > -       salq    $48, %r8
> > > -       orq     %r8, %rdx
> > > +L(first_vec_x1_test):
> > > +       PCMPEQ  %xmm0, %xmm2
> > > +       pmovmskb %xmm2, %eax
> > >         testl   %eax, %eax
> > > -       je      L(loop64)
> > > -       pcmpeqb %xmm6, %xmm4
> > > -       pcmpeqb %xmm6, %xmm3
> > > -       pcmpeqb %xmm6, %xmm5
> > > -       pmovmskb        %xmm4, %eax
> > > -       pmovmskb        %xmm3, %r10d
> > > -       pcmpeqb %xmm6, %xmm2
> > > -       pmovmskb        %xmm5, %r9d
> > > -       salq    $32, %r10
> > > -       salq    $16, %rax
> > > -       pmovmskb        %xmm2, %r8d
> > > -       orq     %r10, %rax
> > > -       orq     %r9, %rax
> > > -       salq    $48, %r8
> > > -       orq     %r8, %rax
> > > -       leaq    -1(%rax), %r8
> > > -       xorq    %rax, %r8
> > > -       andq    %r8, %rdx
> > > -       cmovne  %rdi, %rcx
> > > -       cmovne  %rdx, %rsi
> > > -       bsrq    %rsi, %rsi
> > > -       leaq    (%rcx,%rsi), %rax
> > > +       jz      L(first_vec_x0_test)
> > > +       bsrl    %eax, %eax
> > > +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +#endif
> > > +       ret
> > > +
> > > +       .p2align 4
> > > +L(first_vec_x2):
> > > +       PCMPEQ  %xmm0, %xmm3
> > > +       pmovmskb %xmm3, %eax
> > > +       leal    -1(%rcx), %edx
> > > +       xorl    %edx, %ecx
> > > +       andl    %ecx, %eax
> > > +       jz      L(first_vec_x1_test)
> > > +       bsrl    %eax, %eax
> > > +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +#endif
> > > +       ret
> > > +
> > > +       .p2align 4
> > > +L(aligned_more):
> > > +       /* Save original pointer if match was in VEC 0.  */
> > > +       movq    %rdi, %r8
> > > +       andq    $-VEC_SIZE, %rdi
> > > +
> > > +       movaps  VEC_SIZE(%rdi), %xmm2
> > > +       pxor    %xmm3, %xmm3
> > > +       PCMPEQ  %xmm2, %xmm3
> > > +       pmovmskb %xmm3, %ecx
> > > +       testl   %ecx, %ecx
> > > +       jnz     L(first_vec_x1)
> > > +
> > > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm3
> > > +       pxor    %xmm4, %xmm4
> > > +       PCMPEQ  %xmm3, %xmm4
> > > +       pmovmskb %xmm4, %ecx
> > > +       testl   %ecx, %ecx
> > > +       jnz     L(first_vec_x2)
> > > +
> > > +       addq    $VEC_SIZE, %rdi
> > > +       /* Save pointer again before realigning.  */
> > > +       movq    %rdi, %rsi
> > > +       andq    $-(VEC_SIZE * 2), %rdi
> > > +       .p2align 4
> > > +L(first_loop):
> > > +       /* Do 2x VEC at a time.  */
> > > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> > > +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> > > +       /* If SSE2 no pminud.  */
> > > +#ifdef NO_PMINU
> >
> > Do we really need SSE4.1 wcsrchr?  I think we should focus on AVX2 and
> > above.
>
> It seems like freebie performance that can make a difference in the loop
> cases. (see the SSE4.1 commit for numbers).

But these numbers are on Tiger Lake.  I think we should continue to
improve the SSE2 version and optimize AVX2/AVX512.  I don't think we
should increase code size for SSE4.
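
(For reference, the two zero-detection strategies under discussion look
roughly like this in C intrinsics.  This is only an illustrative sketch,
not code from the patch; the helper names and the test values in main
are made up.  pminub already covers the byte case on plain SSE2, which
is why only the wcsrchr/pminud variant raises the SSE4.1 question.)

    #include <emmintrin.h>   /* SSE2 */
    #include <smmintrin.h>   /* SSE4.1, for _mm_min_epu32 */
    #include <stdio.h>

    /* SSE2-only path (NO_PMINU): two compares plus an OR to detect a
       zero wchar_t in either of two vectors.  */
    static inline int
    has_zero_sse2 (__m128i v0, __m128i v1)
    {
      __m128i zero = _mm_setzero_si128 ();
      __m128i z0 = _mm_cmpeq_epi32 (v0, zero);
      __m128i z1 = _mm_cmpeq_epi32 (v1, zero);
      return _mm_movemask_epi8 (_mm_or_si128 (z0, z1));
    }

    /* SSE4.1 path: pminud folds the two vectors first, so a single
       compare against zero suffices (per element, min (a, b) == 0 iff
       a == 0 or b == 0 for unsigned values).  */
    static inline int
    has_zero_sse41 (__m128i v0, __m128i v1)
    {
      __m128i zero = _mm_setzero_si128 ();
      __m128i m = _mm_min_epu32 (v0, v1);
      return _mm_movemask_epi8 (_mm_cmpeq_epi32 (m, zero));
    }

    int
    main (void)
    {
      /* The second vector contains a zero element, so both helpers
         should report a nonzero mask.  Build with -msse4.1.  */
      __m128i a = _mm_set_epi32 (5, 6, 7, 8);
      __m128i b = _mm_set_epi32 (1, 0, 3, 4);
      printf ("%#x %#x\n", has_zero_sse2 (a, b), has_zero_sse41 (a, b));
      return 0;
    }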

> IMO there is little harm, but if you feel strongly I'll drop it.  (In
> V2 I will change the .text section for SSE4_1).
>
> What do you think?
> >
> > > +       movaps  %xmm5, %xmm6
> > > +       pxor    %xmm8, %xmm8
> > > +
> > > +       PCMPEQ  %xmm8, %xmm5
> > > +       PCMPEQ  %xmm4, %xmm8
> > > +       por     %xmm5, %xmm8
> > > +#else
> > > +       movaps  %xmm5, %xmm6
> > > +       PMINU   %xmm4, %xmm5
> > > +#endif
> > > +
> > > +       movaps  %xmm4, %xmm9
> > > +       PCMPEQ  %xmm0, %xmm4
> > > +       PCMPEQ  %xmm0, %xmm6
> > > +       movaps  %xmm6, %xmm7
> > > +       por     %xmm4, %xmm6
> > > +#ifndef NO_PMINU
> > > +       pxor    %xmm8, %xmm8
> > > +       PCMPEQ  %xmm5, %xmm8
> > > +#endif
> > > +       pmovmskb %xmm8, %ecx
> > > +       pmovmskb %xmm6, %eax
> > > +
> > > +       addq    $(VEC_SIZE * 2), %rdi
> > > +       /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> > > +          macro-fuse with `jz`.  */
> > > +       addl    %ecx, %eax
> > > +       jz      L(first_loop)
> > > +
> > > +       /* Check if there is a zero match.  */
> > > +       testl   %ecx, %ecx
> > > +       jz      L(second_loop_match)
> > > +
> > > +       /* Check if there was a match in last iteration.  */
> > > +       subl    %ecx, %eax
> > > +       jnz     L(new_match)
> > > +
> > > +L(first_loop_old_match):
> > > +       PCMPEQ  %xmm0, %xmm2
> > > +       PCMPEQ  %xmm0, %xmm3
> > > +       pmovmskb %xmm2, %ecx
> > > +       pmovmskb %xmm3, %eax
> > > +       addl    %eax, %ecx
> > > +       jz      L(first_vec_x0_test)
> > > +       /* NB: We could move this shift to before the branch and save a
> > > +          bit of code size / performance on the fall through. The
> > > +          branch leads to the null case which generally seems hotter
> > > +          than char in first 3x VEC.  */
> > > +       sall    $16, %eax
> > > +       orl     %ecx, %eax
> > > +
> > > +       bsrl    %eax, %eax
> > > +       addq    %rsi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +#endif
> > > +       ret
> > > +
> > > +       .p2align 4
> > > +L(new_match):
> > > +       pxor    %xmm6, %xmm6
> > > +       PCMPEQ  %xmm9, %xmm6
> > > +       pmovmskb %xmm6, %eax
> > > +       sall    $16, %ecx
> > > +       orl     %eax, %ecx
> > > +
> > > +       /* We can't reuse either of the old comparisons: since we mask
> > > +          off zeros after the first zero (instead of using the full
> > > +          comparison) we can't guarantee no interference between a match
> > > +          after the end of the string and a valid match.  */
> > > +       pmovmskb %xmm4, %eax
> > > +       pmovmskb %xmm7, %edx
> > > +       sall    $16, %edx
> > > +       orl     %edx, %eax
> > > +
> > > +       leal    -1(%ecx), %edx
> > > +       xorl    %edx, %ecx
> > > +       andl    %ecx, %eax
> > > +       jz      L(first_loop_old_match)
> > > +       bsrl    %eax, %eax
> > > +       addq    %rdi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +#endif
> > >         ret
> > >
> > > +       /* Save minimum state for getting most recent match. We can
> > > +          throw out all previous work.  */
> > >         .p2align 4
> > > -L(no_c_found):
> > > -       movl    $1, %esi
> > > -       xorl    %ecx, %ecx
> > > -       jmp     L(loop_header)
> > > +L(second_loop_match):
> > > +       movq    %rdi, %rsi
> > > +       movaps  %xmm4, %xmm2
> > > +       movaps  %xmm7, %xmm3
> > >
> > >         .p2align 4
> > > -L(exit):
> > > -       xorl    %eax, %eax
> > > +L(second_loop):
> > > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> > > +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> > > +#ifdef NO_PMINU
> > > +       movaps  %xmm5, %xmm6
> > > +       pxor    %xmm8, %xmm8
> > > +
> > > +       PCMPEQ  %xmm8, %xmm5
> > > +       PCMPEQ  %xmm4, %xmm8
> > > +       por     %xmm5, %xmm8
> > > +#else
> > > +       movaps  %xmm5, %xmm6
> > > +       PMINU   %xmm4, %xmm5
> > > +#endif
> > > +
> > > +       movaps  %xmm4, %xmm9
> > > +       PCMPEQ  %xmm0, %xmm4
> > > +       PCMPEQ  %xmm0, %xmm6
> > > +       movaps  %xmm6, %xmm7
> > > +       por     %xmm4, %xmm6
> > > +#ifndef NO_PMINU
> > > +       pxor    %xmm8, %xmm8
> > > +       PCMPEQ  %xmm5, %xmm8
> > > +#endif
> > > +
> > > +       pmovmskb %xmm8, %ecx
> > > +       pmovmskb %xmm6, %eax
> > > +
> > > +       addq    $(VEC_SIZE * 2), %rdi
> > > +       /* Either null term or new occurrence of CHAR.  */
> > > +       addl    %ecx, %eax
> > > +       jz      L(second_loop)
> > > +
> > > +       /* No null term, so it must be a new occurrence of CHAR.  */
> > > +       testl   %ecx, %ecx
> > > +       jz      L(second_loop_match)
> > > +
> > > +
> > > +       subl    %ecx, %eax
> > > +       jnz     L(second_loop_new_match)
> > > +
> > > +L(second_loop_old_match):
> > > +       pmovmskb %xmm2, %ecx
> > > +       pmovmskb %xmm3, %eax
> > > +       sall    $16, %eax
> > > +       orl     %ecx, %eax
> > > +       bsrl    %eax, %eax
> > > +       addq    %rsi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +#endif
> > >         ret
> > >
> > >         .p2align 4
> > > +L(second_loop_new_match):
> > > +       pxor    %xmm6, %xmm6
> > > +       PCMPEQ  %xmm9, %xmm6
> > > +       pmovmskb %xmm6, %eax
> > > +       sall    $16, %ecx
> > > +       orl     %eax, %ecx
> > > +
> > > +       /* We can't reuse either of the old comparisons: since we mask
> > > +          off zeros after the first zero (instead of using the full
> > > +          comparison) we can't guarantee no interference between a match
> > > +          after the end of the string and a valid match.  */
> > > +       pmovmskb %xmm4, %eax
> > > +       pmovmskb %xmm7, %edx
> > > +       sall    $16, %edx
> > > +       orl     %edx, %eax
> > > +
> > > +       leal    -1(%ecx), %edx
> > > +       xorl    %edx, %ecx
> > > +       andl    %ecx, %eax
> > > +       jz      L(second_loop_old_match)
> > > +       bsrl    %eax, %eax
> > > +       addq    %rdi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +#endif
> > > +       ret
> > > +
> > > +       .p2align 4,, 4
> > >  L(cross_page):
> > > -       movq    %rdi, %rax
> > > -       pxor    %xmm0, %xmm0
> > > -       andq    $-64, %rax
> > > -       movdqu  (%rax), %xmm5
> > > -       movdqa  %xmm5, %xmm6
> > > -       movdqu  16(%rax), %xmm4
> > > -       pcmpeqb %xmm1, %xmm5
> > > -       pcmpeqb %xmm0, %xmm6
> > > -       movdqu  32(%rax), %xmm3
> > > -       pmovmskb        %xmm6, %esi
> > > -       movdqa  %xmm4, %xmm6
> > > -       movdqu  48(%rax), %xmm2
> > > -       pcmpeqb %xmm1, %xmm4
> > > -       pcmpeqb %xmm0, %xmm6
> > > -       pmovmskb        %xmm6, %edx
> > > -       movdqa  %xmm3, %xmm6
> > > -       pcmpeqb %xmm1, %xmm3
> > > -       pcmpeqb %xmm0, %xmm6
> > > -       pcmpeqb %xmm2, %xmm0
> > > -       salq    $16, %rdx
> > > -       pmovmskb        %xmm3, %r9d
> > > -       pmovmskb        %xmm6, %r8d
> > > -       pmovmskb        %xmm0, %ecx
> > > -       salq    $32, %r9
> > > -       salq    $32, %r8
> > > -       pcmpeqb %xmm1, %xmm2
> > > -       orq     %r8, %rdx
> > > -       salq    $48, %rcx
> > > -       pmovmskb        %xmm5, %r8d
> > > -       orq     %rsi, %rdx
> > > -       pmovmskb        %xmm4, %esi
> > > -       orq     %rcx, %rdx
> > > -       pmovmskb        %xmm2, %ecx
> > > -       salq    $16, %rsi
> > > -       salq    $48, %rcx
> > > -       orq     %r9, %rsi
> > > -       orq     %r8, %rsi
> > > -       orq     %rcx, %rsi
> > > +       movq    %rdi, %rsi
> > > +       andq    $-VEC_SIZE, %rsi
> > > +       movaps  (%rsi), %xmm1
> > > +       pxor    %xmm2, %xmm2
> > > +       PCMPEQ  %xmm1, %xmm2
> > > +       pmovmskb %xmm2, %edx
> > >         movl    %edi, %ecx
> > > -       subl    %eax, %ecx
> > > -       shrq    %cl, %rdx
> > > -       shrq    %cl, %rsi
> > > -       testq   %rdx, %rdx
> > > -       je      L(loop_header2)
> > > -       leaq    -1(%rdx), %rax
> > > -       xorq    %rdx, %rax
> > > -       andq    %rax, %rsi
> > > -       je      L(exit)
> > > -       bsrq    %rsi, %rax
> > > +       andl    $(VEC_SIZE - 1), %ecx
> > > +       sarl    %cl, %edx
> > > +       jz      L(cross_page_continue)
> > > +       PCMPEQ  %xmm0, %xmm1
> > > +       pmovmskb %xmm1, %eax
> > > +       sarl    %cl, %eax
> > > +       leal    -1(%rdx), %ecx
> > > +       xorl    %edx, %ecx
> > > +       andl    %ecx, %eax
> > > +       jz      L(ret1)
> > > +       bsrl    %eax, %eax
> > >         addq    %rdi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +#endif
> > > +L(ret1):
> > >         ret
> > > -END (strrchr)
> > > +END(STRRCHR)
> > >
> > > -weak_alias (strrchr, rindex)
> > > -libc_hidden_builtin_def (strrchr)
> > > +#ifndef USE_AS_WCSRCHR
> > > +       weak_alias (STRRCHR, rindex)
> > > +       libc_hidden_builtin_def (STRRCHR)
> > > +#endif
> > > diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> > > index 61552954de..2b80efc5ef 100644
> > > --- a/sysdeps/x86_64/wcsrchr.S
> > > +++ b/sysdeps/x86_64/wcsrchr.S
> > > @@ -1,4 +1,4 @@
> > > -/* wcsrchr with SSSE3
> > > +/* wcsrchr optimized with SSE2.
> > >     Copyright (C) 2011-2022 Free Software Foundation, Inc.
> > >     This file is part of the GNU C Library.
> > >
> > > @@ -16,266 +16,12 @@
> > >     License along with the GNU C Library; if not, see
> > >     <https://www.gnu.org/licenses/>.  */
> > >
> > > -#include <sysdep.h>
> > >
> > > -       .text
> > > -ENTRY (wcsrchr)
> > > +#define USE_AS_WCSRCHR 1
> > > +#define NO_PMINU       1
> > >
> > > -       movd    %rsi, %xmm1
> > > -       mov     %rdi, %rcx
> > > -       punpckldq %xmm1, %xmm1
> > > -       pxor    %xmm2, %xmm2
> > > -       punpckldq %xmm1, %xmm1
> > > -       and     $63, %rcx
> > > -       cmp     $48, %rcx
> > > -       ja      L(crosscache)
> > > +#ifndef STRRCHR
> > > +# define STRRCHR       wcsrchr
> > > +#endif
> > >
> > > -       movdqu  (%rdi), %xmm0
> > > -       pcmpeqd %xmm0, %xmm2
> > > -       pcmpeqd %xmm1, %xmm0
> > > -       pmovmskb %xmm2, %rcx
> > > -       pmovmskb %xmm0, %rax
> > > -       add     $16, %rdi
> > > -
> > > -       test    %rax, %rax
> > > -       jnz     L(unaligned_match1)
> > > -
> > > -       test    %rcx, %rcx
> > > -       jnz     L(return_null)
> > > -
> > > -       and     $-16, %rdi
> > > -       xor     %r8, %r8
> > > -       jmp     L(loop)
> > > -
> > > -       .p2align 4
> > > -L(unaligned_match1):
> > > -       test    %rcx, %rcx
> > > -       jnz     L(prolog_find_zero_1)
> > > -
> > > -       mov     %rax, %r8
> > > -       mov     %rdi, %rsi
> > > -       and     $-16, %rdi
> > > -       jmp     L(loop)
> > > -
> > > -       .p2align 4
> > > -L(crosscache):
> > > -       and     $15, %rcx
> > > -       and     $-16, %rdi
> > > -       pxor    %xmm3, %xmm3
> > > -       movdqa  (%rdi), %xmm0
> > > -       pcmpeqd %xmm0, %xmm3
> > > -       pcmpeqd %xmm1, %xmm0
> > > -       pmovmskb %xmm3, %rdx
> > > -       pmovmskb %xmm0, %rax
> > > -       shr     %cl, %rdx
> > > -       shr     %cl, %rax
> > > -       add     $16, %rdi
> > > -
> > > -       test    %rax, %rax
> > > -       jnz     L(unaligned_match)
> > > -
> > > -       test    %rdx, %rdx
> > > -       jnz     L(return_null)
> > > -
> > > -       xor     %r8, %r8
> > > -       jmp     L(loop)
> > > -
> > > -       .p2align 4
> > > -L(unaligned_match):
> > > -       test    %rdx, %rdx
> > > -       jnz     L(prolog_find_zero)
> > > -
> > > -       mov     %rax, %r8
> > > -       lea     (%rdi, %rcx), %rsi
> > > -
> > > -/* Loop start on aligned string.  */
> > > -       .p2align 4
> > > -L(loop):
> > > -       movdqa  (%rdi), %xmm0
> > > -       pcmpeqd %xmm0, %xmm2
> > > -       add     $16, %rdi
> > > -       pcmpeqd %xmm1, %xmm0
> > > -       pmovmskb %xmm2, %rcx
> > > -       pmovmskb %xmm0, %rax
> > > -       or      %rax, %rcx
> > > -       jnz     L(matches)
> > > -
> > > -       movdqa  (%rdi), %xmm3
> > > -       pcmpeqd %xmm3, %xmm2
> > > -       add     $16, %rdi
> > > -       pcmpeqd %xmm1, %xmm3
> > > -       pmovmskb %xmm2, %rcx
> > > -       pmovmskb %xmm3, %rax
> > > -       or      %rax, %rcx
> > > -       jnz     L(matches)
> > > -
> > > -       movdqa  (%rdi), %xmm4
> > > -       pcmpeqd %xmm4, %xmm2
> > > -       add     $16, %rdi
> > > -       pcmpeqd %xmm1, %xmm4
> > > -       pmovmskb %xmm2, %rcx
> > > -       pmovmskb %xmm4, %rax
> > > -       or      %rax, %rcx
> > > -       jnz     L(matches)
> > > -
> > > -       movdqa  (%rdi), %xmm5
> > > -       pcmpeqd %xmm5, %xmm2
> > > -       add     $16, %rdi
> > > -       pcmpeqd %xmm1, %xmm5
> > > -       pmovmskb %xmm2, %rcx
> > > -       pmovmskb %xmm5, %rax
> > > -       or      %rax, %rcx
> > > -       jz      L(loop)
> > > -
> > > -       .p2align 4
> > > -L(matches):
> > > -       test    %rax, %rax
> > > -       jnz     L(match)
> > > -L(return_value):
> > > -       test    %r8, %r8
> > > -       jz      L(return_null)
> > > -       mov     %r8, %rax
> > > -       mov     %rsi, %rdi
> > > -
> > > -       test    $15 << 4, %ah
> > > -       jnz     L(match_fourth_wchar)
> > > -       test    %ah, %ah
> > > -       jnz     L(match_third_wchar)
> > > -       test    $15 << 4, %al
> > > -       jnz     L(match_second_wchar)
> > > -       lea     -16(%rdi), %rax
> > > -       ret
> > > -
> > > -       .p2align 4
> > > -L(match):
> > > -       pmovmskb %xmm2, %rcx
> > > -       test    %rcx, %rcx
> > > -       jnz     L(find_zero)
> > > -       mov     %rax, %r8
> > > -       mov     %rdi, %rsi
> > > -       jmp     L(loop)
> > > -
> > > -       .p2align 4
> > > -L(find_zero):
> > > -       test    $15, %cl
> > > -       jnz     L(find_zero_in_first_wchar)
> > > -       test    %cl, %cl
> > > -       jnz     L(find_zero_in_second_wchar)
> > > -       test    $15, %ch
> > > -       jnz     L(find_zero_in_third_wchar)
> > > -
> > > -       and     $1 << 13 - 1, %rax
> > > -       jz      L(return_value)
> > > -
> > > -       test    $15 << 4, %ah
> > > -       jnz     L(match_fourth_wchar)
> > > -       test    %ah, %ah
> > > -       jnz     L(match_third_wchar)
> > > -       test    $15 << 4, %al
> > > -       jnz     L(match_second_wchar)
> > > -       lea     -16(%rdi), %rax
> > > -       ret
> > > -
> > > -       .p2align 4
> > > -L(find_zero_in_first_wchar):
> > > -       test    $1, %rax
> > > -       jz      L(return_value)
> > > -       lea     -16(%rdi), %rax
> > > -       ret
> > > -
> > > -       .p2align 4
> > > -L(find_zero_in_second_wchar):
> > > -       and     $1 << 5 - 1, %rax
> > > -       jz      L(return_value)
> > > -
> > > -       test    $15 << 4, %al
> > > -       jnz     L(match_second_wchar)
> > > -       lea     -16(%rdi), %rax
> > > -       ret
> > > -
> > > -       .p2align 4
> > > -L(find_zero_in_third_wchar):
> > > -       and     $1 << 9 - 1, %rax
> > > -       jz      L(return_value)
> > > -
> > > -       test    %ah, %ah
> > > -       jnz     L(match_third_wchar)
> > > -       test    $15 << 4, %al
> > > -       jnz     L(match_second_wchar)
> > > -       lea     -16(%rdi), %rax
> > > -       ret
> > > -
> > > -       .p2align 4
> > > -L(prolog_find_zero):
> > > -       add     %rcx, %rdi
> > > -       mov     %rdx, %rcx
> > > -L(prolog_find_zero_1):
> > > -       test    $15, %cl
> > > -       jnz     L(prolog_find_zero_in_first_wchar)
> > > -       test    %cl, %cl
> > > -       jnz     L(prolog_find_zero_in_second_wchar)
> > > -       test    $15, %ch
> > > -       jnz     L(prolog_find_zero_in_third_wchar)
> > > -
> > > -       and     $1 << 13 - 1, %rax
> > > -       jz      L(return_null)
> > > -
> > > -       test    $15 << 4, %ah
> > > -       jnz     L(match_fourth_wchar)
> > > -       test    %ah, %ah
> > > -       jnz     L(match_third_wchar)
> > > -       test    $15 << 4, %al
> > > -       jnz     L(match_second_wchar)
> > > -       lea     -16(%rdi), %rax
> > > -       ret
> > > -
> > > -       .p2align 4
> > > -L(prolog_find_zero_in_first_wchar):
> > > -       test    $1, %rax
> > > -       jz      L(return_null)
> > > -       lea     -16(%rdi), %rax
> > > -       ret
> > > -
> > > -       .p2align 4
> > > -L(prolog_find_zero_in_second_wchar):
> > > -       and     $1 << 5 - 1, %rax
> > > -       jz      L(return_null)
> > > -
> > > -       test    $15 << 4, %al
> > > -       jnz     L(match_second_wchar)
> > > -       lea     -16(%rdi), %rax
> > > -       ret
> > > -
> > > -       .p2align 4
> > > -L(prolog_find_zero_in_third_wchar):
> > > -       and     $1 << 9 - 1, %rax
> > > -       jz      L(return_null)
> > > -
> > > -       test    %ah, %ah
> > > -       jnz     L(match_third_wchar)
> > > -       test    $15 << 4, %al
> > > -       jnz     L(match_second_wchar)
> > > -       lea     -16(%rdi), %rax
> > > -       ret
> > > -
> > > -       .p2align 4
> > > -L(match_second_wchar):
> > > -       lea     -12(%rdi), %rax
> > > -       ret
> > > -
> > > -       .p2align 4
> > > -L(match_third_wchar):
> > > -       lea     -8(%rdi), %rax
> > > -       ret
> > > -
> > > -       .p2align 4
> > > -L(match_fourth_wchar):
> > > -       lea     -4(%rdi), %rax
> > > -       ret
> > > -
> > > -       .p2align 4
> > > -L(return_null):
> > > -       xor     %rax, %rax
> > > -       ret
> > > -
> > > -END (wcsrchr)
> > > +#include "../strrchr.S"
> > > --
> > > 2.25.1
> > >
> >
> >
> > --
> > H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v1 1/5] benchtests: Improve bench-strrchr
  2022-04-21 20:12 ` [PATCH v1 1/5] benchtests: Improve bench-strrchr H.J. Lu
@ 2022-04-21 22:07   ` Noah Goldstein
  2022-04-21 23:49     ` H.J. Lu
  0 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 22:07 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 21, 2022 at 3:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > 1. Use json-lib for printing results.
> > 2. Expose all parameters (before pos, seek_char, and max_char where
> >    not printed).
> > 3. Add benchmarks that test multiple occurence of seek_char in the
> >    string.
> > ---
> >  benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
> >  1 file changed, 82 insertions(+), 44 deletions(-)
> >
> > diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> > index abdae60c51..cceea77e1b 100644
> > --- a/benchtests/bench-strrchr.c
> > +++ b/benchtests/bench-strrchr.c
> > @@ -23,6 +23,7 @@
> >  # define TEST_NAME "strrchr"
> >  #endif
> >  #include "bench-string.h"
> > +#include "json-lib.h"
> >
> >  #define BIG_CHAR MAX_CHAR
> >
> > @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
> >  }
> >
> >  static void
> > -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> > +            CHAR *exp_res)
> >  {
> >    CHAR *res = CALL (impl, s, c);
> >    size_t i, iters = INNER_LOOP_ITERS8;
> > @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> >
> >    if (res != exp_res)
> >      {
> > -      error (0, 0, "Wrong result in function %s %p %p", impl->name,
> > -            res, exp_res);
> > +      error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
> > +            exp_res);
>
> These changes aren't needed.
>
> >        ret = 1;
> >        return;
> >      }
> > @@ -72,24 +74,28 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> >      {
> >        CALL (impl, s, c);
> >      }
> > -  TIMING_NOW (stop);
> >
> > +  TIMING_NOW (stop);
>
> Not needed.

Will fix in V2.
>
> >    TIMING_DIFF (cur, start, stop);
> >
> > -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> > +  json_element_double (json_ctx, (double) cur / (double) iters);
> > +  return;
>
> Return isn't needed.

Will fix in V2.
>
> >  }
> >
> >  static void
> > -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> > +        int seek_char, int max_char, size_t freq)
> >  /* For wcsrchr: align here means align not in bytes,
> >     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> >     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
> >  {
> >    size_t i;
> > +  size_t pos_chunk_sz = freq ? (pos / freq) : pos;
> > +  size_t last_pos = len;
> >    CHAR *result;
> >    CHAR *buf = (CHAR *) buf1;
> >
> > -  align &= 7;
> > +  align &= (getpagesize () - 1);
>
> If we have such large alignments, the tests may be skipped.
> Should we change it to 127 instead?

There is logic around page-cross cases in the x86_64 versions, so I think
it makes sense to support benchmarking them.

Also, I think that would tend to give the previous version a bit of
an unfair disadvantage, as the slow aligning case will never be
tested in the new version.
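
For reference, a rough model of the case the wider alignment range is
meant to hit; the helper name, the 4 KiB page size, and the 16-byte
SSE2 vector size here are assumptions for illustration:

#include <stdbool.h>
#include <stddef.h>

/* With align &= getpagesize () - 1 the string's start offset within the
   page can get close enough to the page end that the implementation's
   first unaligned vector load touches the next page, so the page-cross
   path actually gets benchmarked.  The old `align &= 7' never produced
   such offsets.  */
static inline bool
first_load_crosses_page (size_t offset_in_page)
{
  const size_t page_size = 4096, vec_size = 16;
  return (offset_in_page & (page_size - 1)) > page_size - vec_size;
}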

>
> >    if ((align + len) * sizeof (CHAR) >= page_size)
> >      return;
> >
> > @@ -103,6 +109,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> >        if ((i > pos || pos >= len) && buf[align + i] == seek_char)
> >         buf[align + i] = seek_char + 10 + (random () & 15);
> >      }
> > +
> > +  if (pos_chunk_sz == 0 && pos)
> > +    pos_chunk_sz = 1;
> > +
> > +  for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
> > +    {
> > +      buf[align + i] = seek_char;
> > +      last_pos = i;
> > +    }
> > +
> >    buf[align + len] = 0;
> >
> >    if (pos < len)
> > @@ -110,66 +126,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> >        buf[align + pos] = seek_char;
> >        result = (CHAR *) (buf + align + pos);
> >      }
> > +  else if (last_pos < len)
> > +    result = (CHAR *) (buf + align + last_pos);
> >    else if (seek_char == 0)
> >      result = (CHAR *) (buf + align + len);
> >    else
> >      result = NULL;
> >
> > -  printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
> > +  json_element_object_begin (json_ctx);
> > +  json_attr_uint (json_ctx, "len", len);
> > +  json_attr_uint (json_ctx, "pos", pos);
> > +  json_attr_uint (json_ctx, "align", align);
> > +  json_attr_uint (json_ctx, "freq", freq);
> > +  json_attr_uint (json_ctx, "seek", seek_char);
> > +  json_attr_uint (json_ctx, "max_char", max_char);
> > +  json_array_begin (json_ctx, "timings");
> >
> >    FOR_EACH_IMPL (impl, 0)
> > -    do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
> > +    do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
> >
> > -  putchar ('\n');
> > +  json_array_end (json_ctx);
> > +  json_element_object_end (json_ctx);
> >  }
> >
> >  int
> >  test_main (void)
> >  {
> > -  size_t i;
> > +  json_ctx_t json_ctx;
> > +  size_t i, j;
> > +  int seek;
> >
> >    test_init ();
> > +  json_init (&json_ctx, 0, stdout);
> >
> > -  printf ("%20s", "");
> > -  FOR_EACH_IMPL (impl, 0)
> > -    printf ("\t%s", impl->name);
> > -  putchar ('\n');
> > +  json_document_begin (&json_ctx);
> > +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> >
> > -  for (i = 1; i < 8; ++i)
> > -    {
> > -      do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
> > -      do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
> > -    }
> > +  json_attr_object_begin (&json_ctx, "functions");
> > +  json_attr_object_begin (&json_ctx, TEST_NAME);
> > +  json_attr_string (&json_ctx, "bench-variant", "");
> >
> > -  for (i = 1; i < 8; ++i)
> > -    {
> > -      do_test (i, 64, 256, 23, SMALL_CHAR);
> > -      do_test (i, 64, 256, 23, BIG_CHAR);
> > -    }
> > -
> > -  for (i = 0; i < 32; ++i)
> > -    {
> > -      do_test (0, i, i + 1, 23, SMALL_CHAR);
> > -      do_test (0, i, i + 1, 23, BIG_CHAR);
> > -    }
> > +  json_array_begin (&json_ctx, "ifuncs");
> > +  FOR_EACH_IMPL (impl, 0)
> > +    json_element_string (&json_ctx, impl->name);
> > +  json_array_end (&json_ctx);
> >
> > -  for (i = 1; i < 8; ++i)
> > -    {
> > -      do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
> > -      do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
> > -    }
> > +  json_array_begin (&json_ctx, "results");
> >
> > -  for (i = 1; i < 8; ++i)
> > +  for (seek = 0; seek <= 23; seek += 23)
> >      {
> > -      do_test (i, 64, 256, 0, SMALL_CHAR);
> > -      do_test (i, 64, 256, 0, BIG_CHAR);
> > +      for (j = 1; j < 32; j += j)
> > +       {
> > +         for (i = 1; i < 9; ++i)
> > +           {
> > +             do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
> > +             do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
> > +           }
> > +
> > +         for (i = 1; i < 8; ++i)
> > +           {
> > +             do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
> > +             do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
> > +
> > +             do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
> > +             do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
> > +           }
> > +
> > +         for (i = 0; i < 32; ++i)
> > +           {
> > +             do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
> > +             do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
> > +           }
> > +         if (seek == 0)
> > +           {
> > +             break;
> > +           }
> > +       }
> >      }
> >
> > -  for (i = 0; i < 32; ++i)
> > -    {
> > -      do_test (0, i, i + 1, 0, SMALL_CHAR);
> > -      do_test (0, i, i + 1, 0, BIG_CHAR);
> > -    }
> > +  json_array_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_document_end (&json_ctx);
> >
> >    return ret;
> >  }
> > --
> > 2.25.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v2 1/4] benchtests: Improve bench-strrchr
  2022-04-21  3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
                   ` (4 preceding siblings ...)
  2022-04-21 20:12 ` [PATCH v1 1/5] benchtests: Improve bench-strrchr H.J. Lu
@ 2022-04-21 22:22 ` Noah Goldstein
  2022-04-21 22:22   ` [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
                     ` (2 more replies)
  2022-04-22  1:52 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr Noah Goldstein
  2022-04-22 19:11 ` [PATCH v4 " Noah Goldstein
  7 siblings, 3 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 22:22 UTC (permalink / raw)
  To: libc-alpha

1. Use json-lib for printing results.
2. Expose all parameters (previously pos, seek_char, and max_char were
   not printed).
3. Add benchmarks that test multiple occurrences of seek_char in the
   string (see the sketch below).
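
A rough sketch of what the new freq parameter does; the standalone
helper below is illustrative only (the real change lives in do_test and
also applies the align offset):

#include <stddef.h>

/* Seed roughly `freq' occurrences of seek_char before the final one at
   `pos'.  Mirrors the loop added to do_test; when pos < len, the match
   at pos stays the expected strrchr result.  */
static void
seed_occurrences (char *buf, size_t pos, size_t len, int seek_char,
                  size_t freq)
{
  size_t chunk = freq ? pos / freq : pos;
  size_t i;

  if (chunk == 0 && pos)
    chunk = 1;
  for (i = chunk; i < pos && i < len; i += chunk)
    buf[i] = seek_char;
}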
---
 benchtests/bench-strrchr.c | 124 ++++++++++++++++++++++++-------------
 1 file changed, 80 insertions(+), 44 deletions(-)

diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
index abdae60c51..ce4307a098 100644
--- a/benchtests/bench-strrchr.c
+++ b/benchtests/bench-strrchr.c
@@ -23,6 +23,7 @@
 # define TEST_NAME "strrchr"
 #endif
 #include "bench-string.h"
+#include "json-lib.h"
 
 #define BIG_CHAR MAX_CHAR
 
@@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
 }
 
 static void
-do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
+	     CHAR *exp_res)
 {
   CHAR *res = CALL (impl, s, c);
   size_t i, iters = INNER_LOOP_ITERS8;
@@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
 
   if (res != exp_res)
     {
-      error (0, 0, "Wrong result in function %s %p %p", impl->name,
-	     res, exp_res);
+      error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
+	     exp_res);
       ret = 1;
       return;
     }
@@ -73,23 +75,25 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
       CALL (impl, s, c);
     }
   TIMING_NOW (stop);
-
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
+	 int seek_char, int max_char, size_t freq)
 /* For wcsrchr: align here means align not in bytes,
    but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
    len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
 {
   size_t i;
+  size_t pos_chunk_sz = freq ? (pos / freq) : pos;
+  size_t last_pos = len;
   CHAR *result;
   CHAR *buf = (CHAR *) buf1;
 
-  align &= 7;
+  align &= (getpagesize () - 1);
   if ((align + len) * sizeof (CHAR) >= page_size)
     return;
 
@@ -103,6 +107,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
       if ((i > pos || pos >= len) && buf[align + i] == seek_char)
 	buf[align + i] = seek_char + 10 + (random () & 15);
     }
+
+  if (pos_chunk_sz == 0 && pos)
+    pos_chunk_sz = 1;
+
+  for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
+    {
+      buf[align + i] = seek_char;
+      last_pos = i;
+    }
+
   buf[align + len] = 0;
 
   if (pos < len)
@@ -110,66 +124,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
       buf[align + pos] = seek_char;
       result = (CHAR *) (buf + align + pos);
     }
+  else if (last_pos < len)
+    result = (CHAR *) (buf + align + last_pos);
   else if (seek_char == 0)
     result = (CHAR *) (buf + align + len);
   else
     result = NULL;
 
-  printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "len", len);
+  json_attr_uint (json_ctx, "pos", pos);
+  json_attr_uint (json_ctx, "align", align);
+  json_attr_uint (json_ctx, "freq", freq);
+  json_attr_uint (json_ctx, "seek", seek_char);
+  json_attr_uint (json_ctx, "max_char", max_char);
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
+    do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
-  size_t i;
+  json_ctx_t json_ctx;
+  size_t i, j;
+  int seek;
 
   test_init ();
+  json_init (&json_ctx, 0, stdout);
 
-  printf ("%20s", "");
-  FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
-
-  for (i = 1; i < 8; ++i)
-    {
-      do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
-      do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
-    }
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
 
-  for (i = 1; i < 8; ++i)
-    {
-      do_test (i, 64, 256, 23, SMALL_CHAR);
-      do_test (i, 64, 256, 23, BIG_CHAR);
-    }
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
 
-  for (i = 0; i < 32; ++i)
-    {
-      do_test (0, i, i + 1, 23, SMALL_CHAR);
-      do_test (0, i, i + 1, 23, BIG_CHAR);
-    }
+  json_array_begin (&json_ctx, "ifuncs");
+  FOR_EACH_IMPL (impl, 0)
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
 
-  for (i = 1; i < 8; ++i)
-    {
-      do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
-      do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
-    }
+  json_array_begin (&json_ctx, "results");
 
-  for (i = 1; i < 8; ++i)
+  for (seek = 0; seek <= 23; seek += 23)
     {
-      do_test (i, 64, 256, 0, SMALL_CHAR);
-      do_test (i, 64, 256, 0, BIG_CHAR);
+      for (j = 1; j < 32; j += j)
+	{
+	  for (i = 1; i < 9; ++i)
+	    {
+	      do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
+	    }
+
+	  for (i = 1; i < 8; ++i)
+	    {
+	      do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
+
+	      do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
+	    }
+
+	  for (i = 0; i < 32; ++i)
+	    {
+	      do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
+	    }
+	  if (seek == 0)
+	    {
+	      break;
+	    }
+	}
     }
 
-  for (i = 0; i < 32; ++i)
-    {
-      do_test (0, i, i + 1, 0, SMALL_CHAR);
-      do_test (0, i, i + 1, 0, BIG_CHAR);
-    }
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
 
   return ret;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2
  2022-04-21 22:22 ` [PATCH v2 1/4] " Noah Goldstein
@ 2022-04-21 22:22   ` Noah Goldstein
  2022-04-21 23:46     ` H.J. Lu
  2022-04-21 22:22   ` [PATCH v2 3/4] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
  2022-04-21 22:22   ` [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
  2 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 22:22 UTC (permalink / raw)
  To: libc-alpha

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.741
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
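
For reference, a rough C model of the mask trick the new code uses to
keep only matches at or before the first null in a chunk (the
`leal -1; xorl; andl' sequence below, blsmsk in the avx2/evex
versions).  The helper name and the 32-bit mask width are illustrative
only:

#include <stdint.h>

/* zero_mask / char_mask stand for the pmovmskb results of comparing a
   chunk against the null terminator and the search CHAR.  */
static inline uint32_t
matches_before_null (uint32_t zero_mask, uint32_t char_mask)
{
  if (zero_mask == 0)
    return char_mask;   /* No terminator in this chunk.  */
  /* zero_mask ^ (zero_mask - 1) sets every bit up to and including the
     lowest set bit, i.e. all positions at or before the first null.
     When the search CHAR is the null terminator itself, the bit at the
     null position is the desired match, so including it is correct.  */
  return char_mask & (zero_mask ^ (zero_mask - 1));
}

The highest set bit of the returned mask (bsr in the assembly) then
gives the rightmost qualifying match.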
---
 sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
 sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
 sysdeps/x86_64/strrchr.S                | 510 +++++++++++++++---------
 sysdeps/x86_64/wcsrchr.S                | 268 +------------
 4 files changed, 339 insertions(+), 444 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index db1b44c23c..866396e947 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
 
 # undef weak_alias
 # define weak_alias(strrchr, rindex)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index 78d1ca6553..69d2f3cdb1 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR	__wcsrchr_sse2
 #endif
-
 #include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index 50d886713e..6efb25c880 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,360 @@
 
 #include <sysdep.h>
 
+#ifndef STRRCHR
+# define STRRCHR	strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ	pcmpeqd
+# define CHAR_SIZE	4
+# define PMINU	pminud
+#else
+# define PCMPEQ	pcmpeqb
+# define CHAR_SIZE	1
+# define PMINU	pminub
+#endif
+
+#define PAGE_SIZE	4096
+#define VEC_SIZE	16
+
 	.text
-ENTRY (strrchr)
-	movd	%esi, %xmm1
+ENTRY(STRRCHR)
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	andl	$4095, %eax
-	punpcklbw	%xmm1, %xmm1
-	cmpq	$4032, %rax
-	punpcklwd	%xmm1, %xmm1
-	pshufd	$0, %xmm1, %xmm1
+	andl	$(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+#endif
+	pshufd	$0, %xmm0, %xmm0
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page)
-	movdqu	(%rdi), %xmm0
+
+L(cross_page_continue):
+	movups	(%rdi), %xmm1
 	pxor	%xmm2, %xmm2
-	movdqa	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb	%xmm0, %ecx
-	pmovmskb	%xmm3, %edx
-	testq	%rdx, %rdx
-	je	L(next_48_bytes)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rcx, %rax
-	je	L(exit)
-	bsrq	%rax, %rax
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret0):
 	ret
 
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
 	.p2align 4
-L(next_48_bytes):
-	movdqu	16(%rdi), %xmm4
-	movdqa	%xmm4, %xmm5
-	movdqu	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm2, %xmm5
-	movdqu	48(%rdi), %xmm0
-	pmovmskb	%xmm5, %edx
-	movdqa	%xmm3, %xmm5
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm5
-	pcmpeqb	%xmm0, %xmm2
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r8d
-	pmovmskb	%xmm5, %eax
-	pmovmskb	%xmm2, %esi
-	salq	$32, %r8
-	salq	$32, %rax
-	pcmpeqb	%xmm1, %xmm0
-	orq	%rdx, %rax
-	movq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	salq	$48, %rdx
-	salq	$16, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
-	pmovmskb	%xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rsi
-	orq	%rdx, %rax
-	je	L(loop_header2)
-	leaq	-1(%rax), %rcx
-	xorq	%rax, %rcx
-	andq	%rcx, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rsi
-	leaq	(%rdi,%rsi), %rax
+L(first_vec_x0_test):
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
-L(loop_header2):
-	testq	%rsi, %rsi
-	movq	%rdi, %rcx
-	je	L(no_c_found)
-L(loop_header):
-	addq	$64, %rdi
-	pxor	%xmm7, %xmm7
-	andq	$-64, %rdi
-	jmp	L(loop_entry)
+L(first_vec_x1):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
 
 	.p2align 4
-L(loop64):
-	testq	%rdx, %rdx
-	cmovne	%rdx, %rsi
-	cmovne	%rdi, %rcx
-	addq	$64, %rdi
-L(loop_entry):
-	movdqa	32(%rdi), %xmm3
-	pxor	%xmm6, %xmm6
-	movdqa	48(%rdi), %xmm2
-	movdqa	%xmm3, %xmm0
-	movdqa	16(%rdi), %xmm4
-	pminub	%xmm2, %xmm0
-	movdqa	(%rdi), %xmm5
-	pminub	%xmm4, %xmm0
-	pminub	%xmm5, %xmm0
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm0, %eax
-	movdqa	%xmm5, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %r9d
-	movdqa	%xmm4, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %edx
-	movdqa	%xmm3, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm0, %r10d
-	movdqa	%xmm2, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$32, %r10
-	orq	%r10, %rdx
-	pmovmskb	%xmm0, %r8d
-	orq	%r9, %rdx
-	salq	$48, %r8
-	orq	%r8, %rdx
+L(first_vec_x1_test):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
 	testl	%eax, %eax
-	je	L(loop64)
-	pcmpeqb	%xmm6, %xmm4
-	pcmpeqb	%xmm6, %xmm3
-	pcmpeqb	%xmm6, %xmm5
-	pmovmskb	%xmm4, %eax
-	pmovmskb	%xmm3, %r10d
-	pcmpeqb	%xmm6, %xmm2
-	pmovmskb	%xmm5, %r9d
-	salq	$32, %r10
-	salq	$16, %rax
-	pmovmskb	%xmm2, %r8d
-	orq	%r10, %rax
-	orq	%r9, %rax
-	salq	$48, %r8
-	orq	%r8, %rax
-	leaq	-1(%rax), %r8
-	xorq	%rax, %r8
-	andq	%r8, %rdx
-	cmovne	%rdi, %rcx
-	cmovne	%rdx, %rsi
-	bsrq	%rsi, %rsi
-	leaq	(%rcx,%rsi), %rax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+	andq	$-VEC_SIZE, %rdi
+
+	movaps	VEC_SIZE(%rdi), %xmm2
+	pxor	%xmm3, %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pmovmskb %xmm3, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
+
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
+	pxor	%xmm4, %xmm4
+	PCMPEQ	%xmm3, %xmm4
+	pmovmskb %xmm4, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+
+	addq	$VEC_SIZE, %rdi
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 2), %rdi
+	.p2align 4
+L(first_loop):
+	/* Do 2x VEC at a time.  */
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* SSE2 has no pminud so wcsrchr needs separate logic for
+	   detecting zero. Note if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
+	   macro-fuse with `jz`.  */
+	addl	%ecx, %eax
+	jz	L(first_loop)
+
+	/* Check if there is a null CHAR match.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+	/* Check if there was a match in the last iteration.  */
+	subl	%ecx, %eax
+	jnz	L(new_match)
+
+L(first_loop_old_match):
+	PCMPEQ	%xmm0, %xmm2
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	addl	%eax, %ecx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	sall	$16, %eax
+	orl	%ecx, %eax
+
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons: since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison) we can't guarantee no interference between a match
+	   after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
+	/* Save minimum state for getting most recent match. We can
+	   throw out all previous work.  */
 	.p2align 4
-L(no_c_found):
-	movl	$1, %esi
-	xorl	%ecx, %ecx
-	jmp	L(loop_header)
+L(second_loop_match):
+	movq	%rdi, %rsi
+	movaps	%xmm4, %xmm2
+	movaps	%xmm7, %xmm3
 
 	.p2align 4
-L(exit):
-	xorl	%eax, %eax
+L(second_loop):
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* SSE2 has no pminud so wcsrchr needs separate logic for
+	   detecting zero. Note if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Either null term or new occurrence of CHAR.  */
+	addl	%ecx, %eax
+	jz	L(second_loop)
+
+	/* No null term so must be a new occurrence of CHAR.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+
+	subl	%ecx, %eax
+	jnz	L(second_loop_new_match)
+
+L(second_loop_old_match):
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	sall	$16, %eax
+	orl	%ecx, %eax
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
+L(second_loop_new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons: since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison) we can't guarantee no interference between a match
+	   after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(second_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4,, 4
 L(cross_page):
-	movq	%rdi, %rax
-	pxor	%xmm0, %xmm0
-	andq	$-64, %rax
-	movdqu	(%rax), %xmm5
-	movdqa	%xmm5, %xmm6
-	movdqu	16(%rax), %xmm4
-	pcmpeqb	%xmm1, %xmm5
-	pcmpeqb	%xmm0, %xmm6
-	movdqu	32(%rax), %xmm3
-	pmovmskb	%xmm6, %esi
-	movdqa	%xmm4, %xmm6
-	movdqu	48(%rax), %xmm2
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm0, %xmm6
-	pmovmskb	%xmm6, %edx
-	movdqa	%xmm3, %xmm6
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm0, %xmm6
-	pcmpeqb	%xmm2, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r9d
-	pmovmskb	%xmm6, %r8d
-	pmovmskb	%xmm0, %ecx
-	salq	$32, %r9
-	salq	$32, %r8
-	pcmpeqb	%xmm1, %xmm2
-	orq	%r8, %rdx
-	salq	$48, %rcx
-	pmovmskb	%xmm5, %r8d
-	orq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	orq	%rcx, %rdx
-	pmovmskb	%xmm2, %ecx
-	salq	$16, %rsi
-	salq	$48, %rcx
-	orq	%r9, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	movaps	(%rsi), %xmm1
+	pxor	%xmm2, %xmm2
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
 	movl	%edi, %ecx
-	subl	%eax, %ecx
-	shrq	%cl, %rdx
-	shrq	%cl, %rsi
-	testq	%rdx, %rdx
-	je	L(loop_header2)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rax, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rax
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	%cl, %edx
+	jz	L(cross_page_continue)
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	sarl	%cl, %eax
+	leal	-1(%rdx), %ecx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret1):
 	ret
-END (strrchr)
+END(STRRCHR)
 
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+	weak_alias (STRRCHR, rindex)
+	libc_hidden_builtin_def (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 61552954de..2b80efc5ef 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -1,4 +1,4 @@
-/* wcsrchr with SSSE3
+/* wcsrchr optimized with SSE2.
    Copyright (C) 2011-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,266 +16,12 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
 
-	.text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR	1
+#define NO_PMINU	1
 
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
-	punpckldq %xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	punpckldq %xmm1, %xmm1
-	and	$63, %rcx
-	cmp	$48, %rcx
-	ja	L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR	wcsrchr
+#endif
 
-	movdqu	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match1)
-
-	test	%rcx, %rcx
-	jnz	L(return_null)
-
-	and	$-16, %rdi
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match1):
-	test	%rcx, %rcx
-	jnz	L(prolog_find_zero_1)
-
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	and	$-16, %rdi
-	jmp	L(loop)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	pxor	%xmm3, %xmm3
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm3
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm3, %rdx
-	pmovmskb %xmm0, %rax
-	shr	%cl, %rdx
-	shr	%cl, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match)
-
-	test	%rdx, %rdx
-	jnz	L(return_null)
-
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match):
-	test	%rdx, %rdx
-	jnz	L(prolog_find_zero)
-
-	mov	%rax, %r8
-	lea	(%rdi, %rcx), %rsi
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqd	%xmm3, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm3
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm3, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqd	%xmm4, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm4
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm4, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm5
-	pcmpeqd	%xmm5, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm5
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm5, %rax
-	or	%rax, %rcx
-	jz	L(loop)
-
-	.p2align 4
-L(matches):
-	test	%rax, %rax
-	jnz	L(match)
-L(return_value):
-	test	%r8, %r8
-	jz	L(return_null)
-	mov	%r8, %rax
-	mov	%rsi, %rdi
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match):
-	pmovmskb %xmm2, %rcx
-	test	%rcx, %rcx
-	jnz	L(find_zero)
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	jmp	L(loop)
-
-	.p2align 4
-L(find_zero):
-	test	$15, %cl
-	jnz	L(find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_value)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_value)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero):
-	add	%rcx, %rdi
-	mov     %rdx, %rcx
-L(prolog_find_zero_1):
-	test	$15, %cl
-	jnz	L(prolog_find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(prolog_find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(prolog_find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_null)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_null)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_second_wchar):
-	lea	-12(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_third_wchar):
-	lea	-8(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_fourth_wchar):
-	lea	-4(%rdi), %rax
-	ret
-
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-END (wcsrchr)
+#include "../strrchr.S"
-- 
2.25.1


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v2 3/4] x86: Optimize {str|wcs}rchr-avx2
  2022-04-21 22:22 ` [PATCH v2 1/4] " Noah Goldstein
  2022-04-21 22:22   ` [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
@ 2022-04-21 22:22   ` Noah Goldstein
  2022-04-21 22:22   ` [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
  2 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 22:22 UTC (permalink / raw)
  To: libc-alpha

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.832
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
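
One small entry-path tweak worth calling out: the page-cross check is
now done with a shift instead of `andl' (see the "Shift here instead of
`andl`" comment below).  A rough C model of why the two forms agree,
assuming 4 KiB pages and the 32-byte AVX2 vector; the helper name is
illustrative only:

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 32

/* Shifting the low 32 bits of the pointer left by 20 discards
   everything above the page offset, so an unsigned compare against
   (PAGE_SIZE - VEC_SIZE) << 20 is the same test as comparing the
   in-page offset directly.  */
static inline bool
load_may_cross_page (const void *p)
{
  uint32_t shifted = (uint32_t) (uintptr_t) p << 20;
  return shifted > ((uint32_t) (PAGE_SIZE - VEC_SIZE) << 20);
  /* Equivalent: ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE.  */
}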
---
 sysdeps/x86_64/multiarch/strrchr-avx2.S | 415 +++++++++++++++---------
 1 file changed, 258 insertions(+), 157 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 1df2adfad0..9d1e45defc 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
 # ifdef USE_AS_WCSRCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
@@ -41,196 +45,293 @@
 # endif
 
 # define VEC_SIZE	32
+# define PAGE_SIZE	4096
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
-	movd	%esi, %xmm4
-	movl	%edi, %ecx
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+	movd	%esi, %xmm7
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMM4.  */
-	VPBROADCAST %xmm4, %ymm4
+	VPBROADCAST %xmm7, %ymm7
 	vpxor	%xmm0, %xmm0, %xmm0
 
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Shift here instead of `andl` to save code size (saves a fetch
+	   block).  */
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(cross_page)
 
+L(page_cross_continue):
 	vmovdqu	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	addq	$VEC_SIZE, %rdi
+	/* Check end of string match.  */
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	/* Only check match with search CHAR if needed.  */
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Check if match before first zero.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
+	.p2align 4,, 10
+L(first_vec_x1):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	.p2align 4,, 4
+L(first_vec_x0_test):
+	VPCMPEQ	%ymm1, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	testl	%eax, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret1):
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 10
+L(first_vec_x0_x1_test):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec)
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
-	testl	%ecx, %ecx
-	jnz	L(return_null)
 
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
+
 
 	.p2align 4
-L(first_vec):
-	/* Check if there is a nul CHAR.  */
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+
+	/* Align src.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqu	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
 	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
+	jnz	L(first_vec_x1)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
+	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
+	VPCMPEQ	%ymm3, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	addq	$(VEC_SIZE + 1), %rdi
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %edx
-	vpmovmskb %ymm3, %eax
-	shrl	%cl, %edx
-	shrl	%cl, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
+L(first_aligned_loop):
+	/* Do 2x VEC at a time. Any more and the cost of finding the
+	   match outweighs the loop benefit.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm8
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm8, %ymm0, %ymm8
+	vpor	%ymm5, %ymm8, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
+	/* No zero or search CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
+	jz	L(first_aligned_loop)
 
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
+	/* If no zero CHAR then go to second loop (this allows us to
+	   throw away all prior work).  */
+	vpmovmskb %ymm8, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_prep)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	/* Search char could be zero so we need to get the true match.
+	 */
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(first_aligned_loop_return)
 
-	.p2align 4
-L(aligned_loop):
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	add	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
+	.p2align 4,, 4
+L(first_vec_x1_or_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm3
+	VPCMPEQ	%ymm2, %ymm7, %ymm2
 	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
-
-	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a nul CHAR in a loop.  */
-	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	vpmovmskb %ymm2, %edx
+	/* Use add for macro-fusion.  */
+	addq	%rax, %rdx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	salq	$32, %rax
+	addq	%rdx, %rax
+	bsrq	%rax, %rax
+	leaq	1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(first_aligned_loop_return):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(first_vec_x1_or_x2)
+
+	bsrq	%rax, %rax
+	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
 # ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %eax
+	andq	$-CHAR_SIZE, %rax
 # endif
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
 
+	/* Search char cannot be zero.  */
 	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a nul CHAR.  */
-	vpmovmskb %ymm2, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+L(second_aligned_loop_set_furthest_match):
+	/* Save VEC and pointer from most recent match.  */
+L(second_aligned_loop_prep):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	vmovdqu	%ymm6, %ymm2
+	vmovdqu	%ymm10, %ymm3
 
 	.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+L(second_aligned_loop):
+	/* Search 2x at at time.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm1
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpor	%ymm5, %ymm1, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
 	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a nul CHAR.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+	jz	L(second_aligned_loop)
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_set_furthest_match)
+	vpmovmskb %ymm5, %eax
 	testl	%eax, %eax
-	/* Return null pointer if the nul CHAR comes first.  */
-	jz	L(return_null)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	jnz	L(return_new_match)
+
+	/* This is the hot path. We know CHAR is in bounds and that
+	   ymm3/ymm2 have the latest match.  */
+	.p2align 4,, 4
+L(return_old_match):
+	vpmovmskb %ymm3, %eax
+	vpmovmskb %ymm2, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
+	/* Last iteration also potentially has a match.  */
+	.p2align 4,, 8
+L(return_new_match):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(return_old_match)
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
 	VZEROUPPER_RETURN
 
-END (STRRCHR)
+	.p2align 4,, 4
+L(cross_page):
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	shrxl	%edi, %eax, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret2)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret2):
+	VZEROUPPER_RETURN
+END(STRRCHR)
 #endif
-- 
2.25.1


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex
  2022-04-21 22:22 ` [PATCH v2 1/4] " Noah Goldstein
  2022-04-21 22:22   ` [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
  2022-04-21 22:22   ` [PATCH v2 3/4] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
@ 2022-04-21 22:22   ` Noah Goldstein
  2022-04-21 23:59     ` H.J. Lu
  2 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 22:22 UTC (permalink / raw)
  To: libc-alpha

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.755
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
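
A rough C model of how this version picks the rightmost match across a
pair of vectors: the two mask-register results are concatenated
(kunpckdq / kunpckbw below) and a single bsr over the combined mask
gives the offset.  The mask widths and the helper name are illustrative
only:

#include <stdint.h>

/* mask_vec0 / mask_vec1 stand for the k-register match masks of two
   consecutive vectors, vec0 being the earlier one.  */
static inline int
last_match_across_two_vecs (uint32_t mask_vec0, uint32_t mask_vec1)
{
  uint64_t combined = ((uint64_t) mask_vec1 << 32) | mask_vec0;
  if (combined == 0)
    return -1;   /* No match in either vector.  */
  return 63 - __builtin_clzll (combined);   /* Index of highest set bit (bsr).  */
}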
---
 sysdeps/x86_64/multiarch/strrchr-evex.S | 441 ++++++++++++++----------
 1 file changed, 259 insertions(+), 182 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index adeddaed32..5cf9a8315b 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,319 @@
 #  define STRRCHR	__strrchr_evex
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+# define VMOVU	vmovdqu64
+# define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSRCHR
+#  define SHIFT_REG	esi
+
+#  define kunpck	kunpckbw
+#  define kmov_2x	kmovd
+#  define maskz_2x	ecx
+#  define maskm_2x	eax
+#  define CHAR_SIZE	4
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
-#  define SHIFT_REG	r8d
+#  define VPCMP	vpcmpd
 # else
+#  define SHIFT_REG	edi
+
+#  define kunpck	kunpckdq
+#  define kmov_2x	kmovq
+#  define maskz_2x	rcx
+#  define maskm_2x	rax
+
+#  define CHAR_SIZE	1
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
-#  define SHIFT_REG	ecx
+#  define VPCMP	vpcmpb
 # endif
 
 # define XMMZERO	xmm16
 # define YMMZERO	ymm16
 # define YMMMATCH	ymm17
-# define YMM1		ymm18
+# define YMMSAVE	ymm18
+
+# define YMM1	ymm19
+# define YMM2	ymm20
+# define YMM3	ymm21
+# define YMM4	ymm22
+# define YMM5	ymm23
+# define YMM6	ymm24
+# define YMM7	ymm25
+# define YMM8	ymm26
 
-# define VEC_SIZE	32
 
-	.section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
-	movl	%edi, %ecx
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+	.section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
 
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(cross_page_boundary)
 
+L(page_cross_continue):
 	VMOVU	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	VPTESTN	%YMM1, %YMM1, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-
-	addq	$VEC_SIZE, %rdi
-
-	testl	%eax, %eax
-	jnz	L(first_vec)
-
 	testl	%ecx, %ecx
-	jnz	L(return_null)
-
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(first_vec):
-	/* Check if there is a null byte.  */
-	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-
+	jz	L(aligned_more)
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
-	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
+L(ret0):
+	ret
 
-	VMOVA	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
+	/* Returns for first vec x1/x2/x3 have hard coded backward
+	   search path for earlier matches.  */
+	.p2align 4,, 6
+L(first_vec_x1):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+	.p2align 4,, 4
+L(first_vec_x0_test):
 	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %edx
 	kmovd	%k1, %eax
-
-	shrxl	%SHIFT_REG, %edx, %edx
-	shrxl	%SHIFT_REG, %eax, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
-
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	jz	L(ret1)
+	bsrl	%eax, %eax
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rsi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rsi, %rax
+# endif
+L(ret1):
+	ret
 
-	.p2align 4
-L(aligned_loop):
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	.p2align 4,, 10
+L(first_vec_x1_or_x2):
+	VPCMP	$0, %YMM3, %YMMMATCH, %k3
+	VPCMP	$0, %YMM2, %YMMMATCH, %k2
+	kortestd %k2, %k3
+	jz	L(first_vec_x0_test)
+
+	kunpck	%k2, %k3, %k3
+	kmovq	%k3, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 6
+L(first_vec_x3):
+	VPCMP	$0, %YMMMATCH, %YMM4, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_or_x2)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	add	$VEC_SIZE, %rdi
+	.p2align 4,, 6
+L(first_vec_x0_x1_test):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMP	$0, %YMMMATCH, %YMM3, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	.p2align 4
+L(aligned_more):
+	/* Need to keep original pointer in case YMM1 has the last match.  */
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rdi
+	VMOVU	VEC_SIZE(%rdi), %YMM2
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
+	VPTESTN	%YMM3, %YMM3, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
+	VPTESTN	%YMM4, %YMM4, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
+	movq	%rdi, %r8
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x3)
 
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a null byte in a loop.  */
+L(first_aligned_loop):
+	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
+	   they don't store a match.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
+	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
+
+	VPCMP	$0, %YMM5, %YMMMATCH, %k2
+	vpxord	%YMM6, %YMMMATCH, %YMM7
+
+	VPMIN	%YMM5, %YMM6, %YMM8
+	VPMIN	%YMM8, %YMM7, %YMM7
+
+	VPTESTN	%YMM7, %YMM7, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(first_aligned_loop)
+
+	VPCMP	$0, %YMM6, %YMMMATCH, %k3
+	VPTESTN	%YMM8, %YMM8, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_prep)
+
+	kortestd %k2, %k3
+	jnz	L(return_first_aligned_loop)
+
+	.p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+	VPCMP	$0, %YMM4, %YMMMATCH, %k4
+	kmovd	%k4, %eax
 	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	jz	L(first_vec_x1_or_x2)
 	bsrl	%eax, %eax
-# ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-# endif
+	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a null byte.  */
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
+	.p2align 4,, 8
+L(return_first_aligned_loop):
+	VPTESTN	%YMM5, %YMM5, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(first_vec_x1_or_x2_or_x3)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+	.p2align 4
+	/* We can throw away the work done for the first 4x checks here
+	   as we have a later match.  This is the 'fast' path per se.  */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	kunpck	%k2, %k3, %k4
 
 	.p2align 4
-L(find_nul):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
+L(second_aligned_loop):
+	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
+	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
+
+	VPCMP	$0, %YMM1, %YMMMATCH, %k2
+	vpxord	%YMM2, %YMMMATCH, %YMM3
+
+	VPMIN	%YMM1, %YMM2, %YMM4
+	VPMIN	%YMM3, %YMM4, %YMM3
+
+	VPTESTN	%YMM3, %YMM3, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(second_aligned_loop)
+
+	VPCMP	$0, %YMM2, %YMMMATCH, %k3
+	VPTESTN	%YMM4, %YMM4, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_set_furthest_match)
+
+	kortestd %k2, %k3
+	/* Branch here because there is a significant advantage in terms
+	   of output dependency chains in using edx.  */
+	jnz	L(return_new_match)
+L(return_old_match):
+	kmovq	%k4, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(return_new_match):
+	VPTESTN	%YMM1, %YMM1, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(return_old_match)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	/* This block is horribly aligned (% 16 == 15).  This is
+	   intentional.  The L(cross_page_boundary) block is exactly
+	   32 bytes of code size.  Ultimately this is a cold case, so
+	   save code size by leaving it misaligned.  */
+L(cross_page_boundary):
+	xorq	%rdi, %rax
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+	VPTESTN	%YMM1, %YMM1, %k0
+	kmovd	%k0, %ecx
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	movl	%edi, %esi
+	andl	$(VEC_SIZE - 1), %esi
+	shrl	$2, %esi
 # endif
-	ret
+	shrxl	%SHIFT_REG, %ecx, %ecx
 
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a null byte.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* Return null pointer if the null byte comes first.  */
-	jz	L(return_null)
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	shrxl	%SHIFT_REG, %eax, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret3)
 	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	addq	%rdi, %rax
 # endif
+L(ret3):
 	ret
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
-	ret
-
-END (STRRCHR)
+END(STRRCHR)
 #endif
-- 
2.25.1
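
The rewritten code above leans on one bit trick to pick the last match that
does not lie past the null terminator: blsmsk turns the null mask into a mask
covering every bit up to and including the first null, the CHAR-match mask is
ANDed with that, and bsr selects the highest surviving bit. A rough C model of
that step (illustrative only, not the committed code):

    /* MATCH_MASK has a bit set for every CHAR match in the vector,
       NULL_MASK for every null byte.  Return the position of the last
       match at or before the first null, or -1 if there is none.  */
    static int
    last_match_before_null (unsigned int match_mask, unsigned int null_mask)
    {
      /* blsmsk: set every bit up to and including the lowest set bit.
         With no null in the vector this stays all-ones.  */
      unsigned int keep = null_mask ^ (null_mask - 1);
      match_mask &= keep;
      if (match_mask == 0)
        return -1;
      /* bsr: index of the highest remaining match bit.  */
      return 31 - __builtin_clz (match_mask);
    }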


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2
  2022-04-21 21:48       ` H.J. Lu
@ 2022-04-21 22:23         ` Noah Goldstein
  0 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 22:23 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 21, 2022 at 4:49 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 1:57 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Apr 21, 2022 at 3:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > The new code unrolls the main loop slightly without adding too much
> > > > overhead and minimizes the comparisons for the search CHAR.
> > > >
> > > > Geometric Mean of all benchmarks New / Old: 0.741
> > > > See email for all results.
> > > >
> > > > Full xcheck passes on x86_64 with and without multiarch enabled.
> > > > ---
> > > > Results For: strrchr
> > > >
> > > > Geometric Mean of N=30 runs.
> > > >
> > > > Geometric Mean of all benchmarks New / Old: 0.741
> > > > Benchmarks performance on Tigerlake:
> > > > https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> > > >
> > > >  len, align,  pos, seek, max_char, freq, New Time / Old Time
> > > > 2048,     0,   32,    0,      127,    1,               0.647
> > > > 2048,     1,   32,    0,      127,    1,               0.621
> > > > 2048,     0,   64,    0,      127,    1,               0.661
> > > > 2048,     2,   64,    0,      127,    1,               0.655
> > > > 2048,     0,  128,    0,      127,    1,                0.69
> > > > 2048,     3,  128,    0,      127,    1,               0.689
> > > > 2048,     0,  256,    0,      127,    1,               0.718
> > > > 2048,     4,  256,    0,      127,    1,               0.718
> > > > 2048,     0,  512,    0,      127,    1,               0.758
> > > > 2048,     5,  512,    0,      127,    1,               0.754
> > > > 2048,     0, 1024,    0,      127,    1,               1.029
> > > > 2048,     6, 1024,    0,      127,    1,               1.032
> > > > 2048,     0, 2048,    0,      127,    1,               0.826
> > > > 2048,     7, 2048,    0,      127,    1,               0.834
> > > > 2048,     0, 4096,    0,      127,    1,               0.825
> > > > 2048,     8, 4096,    0,      127,    1,                0.83
> > > >  256,     1,   64,    0,      127,    1,               0.657
> > > >  256,    15,   64,    0,      127,    1,               0.657
> > > >  256,     2,   64,    0,      127,    1,               0.657
> > > >  256,    30,   64,    0,      127,    1,               0.523
> > > >  256,     3,   64,    0,      127,    1,               0.657
> > > >  256,    45,   64,    0,      127,    1,               0.654
> > > >  256,     4,   64,    0,      127,    1,               0.657
> > > >  256,    60,   64,    0,      127,    1,               0.526
> > > >  256,     5,   64,    0,      127,    1,               0.658
> > > >  256,    75,   64,    0,      127,    1,               0.658
> > > >  256,     6,   64,    0,      127,    1,               0.655
> > > >  256,    90,   64,    0,      127,    1,               0.523
> > > >  256,     7,   64,    0,      127,    1,               0.655
> > > >  256,   105,   64,    0,      127,    1,               0.654
> > > >    1,     0,    0,    0,      127,    1,                0.98
> > > >    2,     0,    1,    0,      127,    1,               0.978
> > > >    3,     0,    2,    0,      127,    1,               0.975
> > > >    4,     0,    3,    0,      127,    1,               0.976
> > > >    5,     0,    4,    0,      127,    1,               0.977
> > > >    6,     0,    5,    0,      127,    1,               0.981
> > > >    7,     0,    6,    0,      127,    1,               0.982
> > > >    8,     0,    7,    0,      127,    1,                0.98
> > > >    9,     0,    8,    0,      127,    1,               0.978
> > > >   10,     0,    9,    0,      127,    1,               0.981
> > > >   11,     0,   10,    0,      127,    1,               0.984
> > > >   12,     0,   11,    0,      127,    1,               0.982
> > > >   13,     0,   12,    0,      127,    1,                0.98
> > > >   14,     0,   13,    0,      127,    1,               0.978
> > > >   15,     0,   14,    0,      127,    1,               0.979
> > > >   16,     0,   15,    0,      127,    1,               0.986
> > > >   17,     0,   16,    0,      127,    1,               0.529
> > > >   18,     0,   17,    0,      127,    1,               0.566
> > > >   19,     0,   18,    0,      127,    1,               0.575
> > > >   20,     0,   19,    0,      127,    1,               0.573
> > > >   21,     0,   20,    0,      127,    1,               0.579
> > > >   22,     0,   21,    0,      127,    1,               0.595
> > > >   23,     0,   22,    0,      127,    1,               0.585
> > > >   24,     0,   23,    0,      127,    1,               0.586
> > > >   25,     0,   24,    0,      127,    1,               0.587
> > > >   26,     0,   25,    0,      127,    1,               0.592
> > > >   27,     0,   26,    0,      127,    1,               0.595
> > > >   28,     0,   27,    0,      127,    1,               0.592
> > > >   29,     0,   28,    0,      127,    1,                 0.6
> > > >   30,     0,   29,    0,      127,    1,               0.598
> > > >   31,     0,   30,    0,      127,    1,               0.595
> > > >   32,     0,   31,    0,      127,    1,               0.592
> > > > 2048,     0,   32,   23,      127,    1,               0.827
> > > > 2048,     1,   32,   23,      127,    1,               0.826
> > > > 2048,     0,   64,   23,      127,    1,               0.824
> > > > 2048,     2,   64,   23,      127,    1,               0.825
> > > > 2048,     0,  128,   23,      127,    1,               0.829
> > > > 2048,     3,  128,   23,      127,    1,               0.824
> > > > 2048,     0,  256,   23,      127,    1,               0.832
> > > > 2048,     4,  256,   23,      127,    1,               0.825
> > > > 2048,     0,  512,   23,      127,    1,               0.831
> > > > 2048,     5,  512,   23,      127,    1,               0.837
> > > > 2048,     0, 1024,   23,      127,    1,               0.721
> > > > 2048,     6, 1024,   23,      127,    1,               0.757
> > > > 2048,     0, 2048,   23,      127,    1,               0.825
> > > > 2048,     7, 2048,   23,      127,    1,               0.824
> > > > 2048,     0, 4096,   23,      127,    1,               0.828
> > > > 2048,     8, 4096,   23,      127,    1,               0.823
> > > >  256,     1,   64,   23,      127,    1,               0.665
> > > >  256,    15,   64,   23,      127,    1,               0.661
> > > >  256,     2,   64,   23,      127,    1,               0.674
> > > >  256,    30,   64,   23,      127,    1,               0.605
> > > >  256,     3,   64,   23,      127,    1,               0.668
> > > >  256,    45,   64,   23,      127,    1,               0.661
> > > >  256,     4,   64,   23,      127,    1,               0.657
> > > >  256,    60,   64,   23,      127,    1,               0.594
> > > >  256,     5,   64,   23,      127,    1,               0.654
> > > >  256,    75,   64,   23,      127,    1,               0.673
> > > >  256,     6,   64,   23,      127,    1,               0.688
> > > >  256,    90,   64,   23,      127,    1,                 0.6
> > > >  256,     7,   64,   23,      127,    1,                0.66
> > > >  256,   105,   64,   23,      127,    1,               0.654
> > > >    1,     0,    0,   23,      127,    1,               0.981
> > > >    2,     0,    1,   23,      127,    1,               0.976
> > > >    3,     0,    2,   23,      127,    1,               0.983
> > > >    4,     0,    3,   23,      127,    1,               0.984
> > > >    5,     0,    4,   23,      127,    1,               0.973
> > > >    6,     0,    5,   23,      127,    1,               0.987
> > > >    7,     0,    6,   23,      127,    1,               0.977
> > > >    8,     0,    7,   23,      127,    1,               0.979
> > > >    9,     0,    8,   23,      127,    1,               0.981
> > > >   10,     0,    9,   23,      127,    1,                0.98
> > > >   11,     0,   10,   23,      127,    1,               0.983
> > > >   12,     0,   11,   23,      127,    1,                0.98
> > > >   13,     0,   12,   23,      127,    1,                0.98
> > > >   14,     0,   13,   23,      127,    1,               0.977
> > > >   15,     0,   14,   23,      127,    1,               0.982
> > > >   16,     0,   15,   23,      127,    1,               0.581
> > > >   17,     0,   16,   23,      127,    1,               0.551
> > > >   18,     0,   17,   23,      127,    1,               0.555
> > > >   19,     0,   18,   23,      127,    1,               0.586
> > > >   20,     0,   19,   23,      127,    1,               0.585
> > > >   21,     0,   20,   23,      127,    1,               0.582
> > > >   22,     0,   21,   23,      127,    1,               0.571
> > > >   23,     0,   22,   23,      127,    1,               0.576
> > > >   24,     0,   23,   23,      127,    1,               0.581
> > > >   25,     0,   24,   23,      127,    1,               0.589
> > > >   26,     0,   25,   23,      127,    1,               0.593
> > > >   27,     0,   26,   23,      127,    1,               0.595
> > > >   28,     0,   27,   23,      127,    1,               0.583
> > > >   29,     0,   28,   23,      127,    1,               0.595
> > > >   30,     0,   29,   23,      127,    1,                0.58
> > > >   31,     0,   30,   23,      127,    1,               0.594
> > > >   32,     0,   31,   23,      127,    1,               0.665
> > > > 2048,     0,   32,   23,      127,    2,               0.825
> > > > 2048,     1,   32,   23,      127,    2,               0.818
> > > > 2048,     0,   64,   23,      127,    2,               0.829
> > > > 2048,     2,   64,   23,      127,    2,               0.828
> > > > 2048,     0,  128,   23,      127,    2,               0.823
> > > > 2048,     3,  128,   23,      127,    2,               0.825
> > > > 2048,     0,  256,   23,      127,    2,               0.819
> > > > 2048,     4,  256,   23,      127,    2,               0.828
> > > > 2048,     0,  512,   23,      127,    2,               0.824
> > > > 2048,     5,  512,   23,      127,    2,               0.827
> > > > 2048,     0, 1024,   23,      127,    2,               0.813
> > > > 2048,     6, 1024,   23,      127,    2,               0.834
> > > > 2048,     0, 2048,   23,      127,    2,               0.927
> > > > 2048,     7, 2048,   23,      127,    2,               0.923
> > > > 2048,     0, 4096,   23,      127,    2,               0.818
> > > > 2048,     8, 4096,   23,      127,    2,                0.82
> > > >  256,     1,   64,   23,      127,    2,               0.693
> > > >  256,    15,   64,   23,      127,    2,               0.686
> > > >  256,     2,   64,   23,      127,    2,                0.69
> > > >  256,    30,   64,   23,      127,    2,               0.611
> > > >  256,     3,   64,   23,      127,    2,               0.692
> > > >  256,    45,   64,   23,      127,    2,               0.685
> > > >  256,     4,   64,   23,      127,    2,               0.688
> > > >  256,    60,   64,   23,      127,    2,                 0.6
> > > >  256,     5,   64,   23,      127,    2,                0.69
> > > >  256,    75,   64,   23,      127,    2,               0.689
> > > >  256,     6,   64,   23,      127,    2,               0.688
> > > >  256,    90,   64,   23,      127,    2,               0.611
> > > >  256,     7,   64,   23,      127,    2,                0.69
> > > >  256,   105,   64,   23,      127,    2,               0.686
> > > >    1,     0,    0,   23,      127,    2,               0.982
> > > >    2,     0,    1,   23,      127,    2,               0.987
> > > >    3,     0,    2,   23,      127,    2,               0.978
> > > >    4,     0,    3,   23,      127,    2,               0.977
> > > >    5,     0,    4,   23,      127,    2,               0.979
> > > >    6,     0,    5,   23,      127,    2,               0.985
> > > >    7,     0,    6,   23,      127,    2,               0.975
> > > >    8,     0,    7,   23,      127,    2,               0.981
> > > >    9,     0,    8,   23,      127,    2,               0.984
> > > >   10,     0,    9,   23,      127,    2,               0.983
> > > >   11,     0,   10,   23,      127,    2,               0.982
> > > >   12,     0,   11,   23,      127,    2,               0.976
> > > >   13,     0,   12,   23,      127,    2,               0.985
> > > >   14,     0,   13,   23,      127,    2,               0.984
> > > >   15,     0,   14,   23,      127,    2,                0.98
> > > >   16,     0,   15,   23,      127,    2,               0.583
> > > >   17,     0,   16,   23,      127,    2,               0.552
> > > >   18,     0,   17,   23,      127,    2,               0.564
> > > >   19,     0,   18,   23,      127,    2,               0.585
> > > >   20,     0,   19,   23,      127,    2,               0.578
> > > >   21,     0,   20,   23,      127,    2,               0.578
> > > >   22,     0,   21,   23,      127,    2,               0.571
> > > >   23,     0,   22,   23,      127,    2,               0.587
> > > >   24,     0,   23,   23,      127,    2,               0.589
> > > >   25,     0,   24,   23,      127,    2,               0.593
> > > >   26,     0,   25,   23,      127,    2,               0.589
> > > >   27,     0,   26,   23,      127,    2,               0.588
> > > >   28,     0,   27,   23,      127,    2,               0.593
> > > >   29,     0,   28,   23,      127,    2,               0.579
> > > >   30,     0,   29,   23,      127,    2,               0.572
> > > >   31,     0,   30,   23,      127,    2,               0.582
> > > >   32,     0,   31,   23,      127,    2,               0.659
> > > > 2048,     0,   32,   23,      127,    4,               0.822
> > > > 2048,     1,   32,   23,      127,    4,               0.818
> > > > 2048,     0,   64,   23,      127,    4,               0.826
> > > > 2048,     2,   64,   23,      127,    4,               0.824
> > > > 2048,     0,  128,   23,      127,    4,               0.833
> > > > 2048,     3,  128,   23,      127,    4,               0.831
> > > > 2048,     0,  256,   23,      127,    4,               0.826
> > > > 2048,     4,  256,   23,      127,    4,               0.831
> > > > 2048,     0,  512,   23,      127,    4,               0.834
> > > > 2048,     5,  512,   23,      127,    4,                0.83
> > > > 2048,     0, 1024,   23,      127,    4,               0.836
> > > > 2048,     6, 1024,   23,      127,    4,               0.844
> > > > 2048,     0, 2048,   23,      127,    4,               0.696
> > > > 2048,     7, 2048,   23,      127,    4,               0.704
> > > > 2048,     0, 4096,   23,      127,    4,               0.936
> > > > 2048,     8, 4096,   23,      127,    4,               0.925
> > > >  256,     1,   64,   23,      127,    4,               0.694
> > > >  256,    15,   64,   23,      127,    4,                0.69
> > > >  256,     2,   64,   23,      127,    4,               0.687
> > > >  256,    30,   64,   23,      127,    4,               0.612
> > > >  256,     3,   64,   23,      127,    4,               0.685
> > > >  256,    45,   64,   23,      127,    4,               0.685
> > > >  256,     4,   64,   23,      127,    4,               0.684
> > > >  256,    60,   64,   23,      127,    4,               0.606
> > > >  256,     5,   64,   23,      127,    4,                0.69
> > > >  256,    75,   64,   23,      127,    4,               0.688
> > > >  256,     6,   64,   23,      127,    4,                0.69
> > > >  256,    90,   64,   23,      127,    4,               0.615
> > > >  256,     7,   64,   23,      127,    4,               0.691
> > > >  256,   105,   64,   23,      127,    4,               0.688
> > > >    1,     0,    0,   23,      127,    4,               0.982
> > > >    2,     0,    1,   23,      127,    4,               0.983
> > > >    3,     0,    2,   23,      127,    4,               0.981
> > > >    4,     0,    3,   23,      127,    4,               0.984
> > > >    5,     0,    4,   23,      127,    4,               0.963
> > > >    6,     0,    5,   23,      127,    4,               0.978
> > > >    7,     0,    6,   23,      127,    4,               0.985
> > > >    8,     0,    7,   23,      127,    4,               0.986
> > > >    9,     0,    8,   23,      127,    4,               0.978
> > > >   10,     0,    9,   23,      127,    4,               0.985
> > > >   11,     0,   10,   23,      127,    4,               0.986
> > > >   12,     0,   11,   23,      127,    4,               0.983
> > > >   13,     0,   12,   23,      127,    4,               0.986
> > > >   14,     0,   13,   23,      127,    4,                0.98
> > > >   15,     0,   14,   23,      127,    4,               0.979
> > > >   16,     0,   15,   23,      127,    4,               0.582
> > > >   17,     0,   16,   23,      127,    4,               0.542
> > > >   18,     0,   17,   23,      127,    4,               0.564
> > > >   19,     0,   18,   23,      127,    4,               0.571
> > > >   20,     0,   19,   23,      127,    4,               0.582
> > > >   21,     0,   20,   23,      127,    4,               0.573
> > > >   22,     0,   21,   23,      127,    4,               0.575
> > > >   23,     0,   22,   23,      127,    4,               0.578
> > > >   24,     0,   23,   23,      127,    4,                0.58
> > > >   25,     0,   24,   23,      127,    4,               0.592
> > > >   26,     0,   25,   23,      127,    4,               0.588
> > > >   27,     0,   26,   23,      127,    4,               0.574
> > > >   28,     0,   27,   23,      127,    4,               0.589
> > > >   29,     0,   28,   23,      127,    4,                0.56
> > > >   30,     0,   29,   23,      127,    4,               0.587
> > > >   31,     0,   30,   23,      127,    4,               0.584
> > > >   32,     0,   31,   23,      127,    4,               0.664
> > > > 2048,     0,   32,   23,      127,    8,               0.826
> > > > 2048,     1,   32,   23,      127,    8,               0.821
> > > > 2048,     0,   64,   23,      127,    8,               0.828
> > > > 2048,     2,   64,   23,      127,    8,               0.827
> > > > 2048,     0,  128,   23,      127,    8,               0.833
> > > > 2048,     3,  128,   23,      127,    8,                0.83
> > > > 2048,     0,  256,   23,      127,    8,               0.855
> > > > 2048,     4,  256,   23,      127,    8,               0.849
> > > > 2048,     0,  512,   23,      127,    8,               0.849
> > > > 2048,     5,  512,   23,      127,    8,               0.851
> > > > 2048,     0, 1024,   23,      127,    8,               0.856
> > > > 2048,     6, 1024,   23,      127,    8,               0.862
> > > > 2048,     0, 2048,   23,      127,    8,               0.709
> > > > 2048,     7, 2048,   23,      127,    8,               0.712
> > > > 2048,     0, 4096,   23,      127,    8,               0.702
> > > > 2048,     8, 4096,   23,      127,    8,               0.701
> > > >  256,     1,   64,   23,      127,    8,               0.689
> > > >  256,    15,   64,   23,      127,    8,               0.688
> > > >  256,     2,   64,   23,      127,    8,               0.691
> > > >  256,    30,   64,   23,      127,    8,               0.612
> > > >  256,     3,   64,   23,      127,    8,               0.688
> > > >  256,    45,   64,   23,      127,    8,               0.686
> > > >  256,     4,   64,   23,      127,    8,               0.694
> > > >  256,    60,   64,   23,      127,    8,               0.609
> > > >  256,     5,   64,   23,      127,    8,                0.69
> > > >  256,    75,   64,   23,      127,    8,                0.69
> > > >  256,     6,   64,   23,      127,    8,               0.691
> > > >  256,    90,   64,   23,      127,    8,               0.612
> > > >  256,     7,   64,   23,      127,    8,               0.689
> > > >  256,   105,   64,   23,      127,    8,               0.688
> > > >    1,     0,    0,   23,      127,    8,                0.98
> > > >    2,     0,    1,   23,      127,    8,               0.978
> > > >    3,     0,    2,   23,      127,    8,                0.98
> > > >    4,     0,    3,   23,      127,    8,               0.978
> > > >    5,     0,    4,   23,      127,    8,               0.977
> > > >    6,     0,    5,   23,      127,    8,               0.984
> > > >    7,     0,    6,   23,      127,    8,               0.982
> > > >    8,     0,    7,   23,      127,    8,               0.983
> > > >    9,     0,    8,   23,      127,    8,               0.987
> > > >   10,     0,    9,   23,      127,    8,               0.979
> > > >   11,     0,   10,   23,      127,    8,               0.985
> > > >   12,     0,   11,   23,      127,    8,               0.981
> > > >   13,     0,   12,   23,      127,    8,                0.98
> > > >   14,     0,   13,   23,      127,    8,               0.982
> > > >   15,     0,   14,   23,      127,    8,               0.981
> > > >   16,     0,   15,   23,      127,    8,               0.579
> > > >   17,     0,   16,   23,      127,    8,               0.531
> > > >   18,     0,   17,   23,      127,    8,               0.577
> > > >   19,     0,   18,   23,      127,    8,               0.588
> > > >   20,     0,   19,   23,      127,    8,               0.571
> > > >   21,     0,   20,   23,      127,    8,               0.576
> > > >   22,     0,   21,   23,      127,    8,                0.59
> > > >   23,     0,   22,   23,      127,    8,               0.574
> > > >   24,     0,   23,   23,      127,    8,               0.583
> > > >   25,     0,   24,   23,      127,    8,               0.581
> > > >   26,     0,   25,   23,      127,    8,               0.592
> > > >   27,     0,   26,   23,      127,    8,               0.586
> > > >   28,     0,   27,   23,      127,    8,               0.588
> > > >   29,     0,   28,   23,      127,    8,               0.578
> > > >   30,     0,   29,   23,      127,    8,               0.573
> > > >   31,     0,   30,   23,      127,    8,               0.588
> > > >   32,     0,   31,   23,      127,    8,               0.664
> > > > 2048,     0,   32,   23,      127,   16,               0.825
> > > > 2048,     1,   32,   23,      127,   16,               0.823
> > > > 2048,     0,   64,   23,      127,   16,               0.831
> > > > 2048,     2,   64,   23,      127,   16,               0.822
> > > > 2048,     0,  128,   23,      127,   16,               0.831
> > > > 2048,     3,  128,   23,      127,   16,               0.831
> > > > 2048,     0,  256,   23,      127,   16,               0.849
> > > > 2048,     4,  256,   23,      127,   16,                0.85
> > > > 2048,     0,  512,   23,      127,   16,               0.751
> > > > 2048,     5,  512,   23,      127,   16,                0.75
> > > > 2048,     0, 1024,   23,      127,   16,               0.913
> > > > 2048,     6, 1024,   23,      127,   16,               0.895
> > > > 2048,     0, 2048,   23,      127,   16,               0.736
> > > > 2048,     7, 2048,   23,      127,   16,               0.741
> > > > 2048,     0, 4096,   23,      127,   16,               0.712
> > > > 2048,     8, 4096,   23,      127,   16,               0.711
> > > >  256,     1,   64,   23,      127,   16,               0.758
> > > >  256,    15,   64,   23,      127,   16,               0.692
> > > >  256,     2,   64,   23,      127,   16,               0.692
> > > >  256,    30,   64,   23,      127,   16,               0.613
> > > >  256,     3,   64,   23,      127,   16,                0.69
> > > >  256,    45,   64,   23,      127,   16,               0.687
> > > >  256,     4,   64,   23,      127,   16,                0.69
> > > >  256,    60,   64,   23,      127,   16,               0.604
> > > >  256,     5,   64,   23,      127,   16,               0.687
> > > >  256,    75,   64,   23,      127,   16,               0.687
> > > >  256,     6,   64,   23,      127,   16,                0.69
> > > >  256,    90,   64,   23,      127,   16,                0.61
> > > >  256,     7,   64,   23,      127,   16,                0.69
> > > >  256,   105,   64,   23,      127,   16,               0.685
> > > >    1,     0,    0,   23,      127,   16,               0.981
> > > >    2,     0,    1,   23,      127,   16,               0.985
> > > >    3,     0,    2,   23,      127,   16,               0.985
> > > >    4,     0,    3,   23,      127,   16,               0.981
> > > >    5,     0,    4,   23,      127,   16,               0.979
> > > >    6,     0,    5,   23,      127,   16,               0.986
> > > >    7,     0,    6,   23,      127,   16,               0.986
> > > >    8,     0,    7,   23,      127,   16,               0.982
> > > >    9,     0,    8,   23,      127,   16,               0.982
> > > >   10,     0,    9,   23,      127,   16,                0.98
> > > >   11,     0,   10,   23,      127,   16,               0.983
> > > >   12,     0,   11,   23,      127,   16,               0.982
> > > >   13,     0,   12,   23,      127,   16,               0.982
> > > >   14,     0,   13,   23,      127,   16,               0.982
> > > >   15,     0,   14,   23,      127,   16,               0.982
> > > >   16,     0,   15,   23,      127,   16,               0.582
> > > >   17,     0,   16,   23,      127,   16,               0.542
> > > >   18,     0,   17,   23,      127,   16,               0.554
> > > >   19,     0,   18,   23,      127,   16,               0.562
> > > >   20,     0,   19,   23,      127,   16,               0.587
> > > >   21,     0,   20,   23,      127,   16,               0.584
> > > >   22,     0,   21,   23,      127,   16,               0.587
> > > >   23,     0,   22,   23,      127,   16,               0.594
> > > >   24,     0,   23,   23,      127,   16,               0.581
> > > >   25,     0,   24,   23,      127,   16,               0.577
> > > >   26,     0,   25,   23,      127,   16,               0.588
> > > >   27,     0,   26,   23,      127,   16,               0.589
> > > >   28,     0,   27,   23,      127,   16,               0.596
> > > >   29,     0,   28,   23,      127,   16,               0.591
> > > >   30,     0,   29,   23,      127,   16,               0.585
> > > >   31,     0,   30,   23,      127,   16,                0.59
> > > >   32,     0,   31,   23,      127,   16,               0.669
> > > >
> > > >  sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
> > > >  sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
> > > >  sysdeps/x86_64/strrchr.S                | 505 +++++++++++++++---------
> > > >  sysdeps/x86_64/wcsrchr.S                | 268 +------------
> > > >  4 files changed, 334 insertions(+), 444 deletions(-)
> > > >
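The new strrchr.S in the diff that follows is organized as a two-phase scan: a
first loop looks only for a null byte or a CHAR match, and once a match is seen
a second loop keeps overwriting the remembered match until the null terminator
is found. As a scalar statement of what the vector code computes (illustrative
only; the real loops process two 16-byte vectors per iteration and defer the
per-vector disambiguation until something is found):

    static char *
    strrchr_model (const char *s, int c)
    {
      const char *last = NULL;
      for (;; s++)
        {
          if (*s == (char) c)
            last = s;               /* Remember the furthest match.  */
          if (*s == '\0')
            return (char *) (c == 0 ? s : last);
        }
    }
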
> > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > > index db1b44c23c..866396e947 100644
> > > > --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > > +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > > @@ -17,7 +17,7 @@
> > > >     <https://www.gnu.org/licenses/>.  */
> > > >
> > > >  #if IS_IN (libc)
> > > > -# define strrchr __strrchr_sse2
> > > > +# define STRRCHR __strrchr_sse2
> > > >
> > > >  # undef weak_alias
> > > >  # define weak_alias(strrchr, rindex)
> > > > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > > index 78d1ca6553..69d2f3cdb1 100644
> > > > --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > > @@ -17,7 +17,6 @@
> > > >     <https://www.gnu.org/licenses/>.  */
> > > >
> > > >  #if IS_IN (libc)
> > > > -# define wcsrchr __wcsrchr_sse2
> > > > +# define STRRCHR       __wcsrchr_sse2
> > > >  #endif
> > > > -
> > > >  #include "../wcsrchr.S"
> > > > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> > > > index 50d886713e..94449ad806 100644
> > > > --- a/sysdeps/x86_64/strrchr.S
> > > > +++ b/sysdeps/x86_64/strrchr.S
> > > > @@ -19,210 +19,355 @@
> > > >
> > > >  #include <sysdep.h>
> > > >
> > > > +#ifndef STRRCHR
> > > > +# define STRRCHR       strrchr
> > > > +#endif
> > > > +
> > > > +#ifdef USE_AS_WCSRCHR
> > > > +# define PCMPEQ        pcmpeqd
> > > > +# define CHAR_SIZE     4
> > > > +# define PMINU pminud
> > > > +#else
> > > > +# define PCMPEQ        pcmpeqb
> > > > +# define CHAR_SIZE     1
> > > > +# define PMINU pminub
> > > > +#endif
> > > > +
> > > > +#define PAGE_SIZE      4096
> > > > +#define VEC_SIZE       16
> > > > +
> > > >         .text
> > > > -ENTRY (strrchr)
> > > > -       movd    %esi, %xmm1
> > > > +ENTRY(STRRCHR)
> > > > +       movd    %esi, %xmm0
> > > >         movq    %rdi, %rax
> > > > -       andl    $4095, %eax
> > > > -       punpcklbw       %xmm1, %xmm1
> > > > -       cmpq    $4032, %rax
> > > > -       punpcklwd       %xmm1, %xmm1
> > > > -       pshufd  $0, %xmm1, %xmm1
> > > > +       andl    $(PAGE_SIZE - 1), %eax
> > > > +#ifndef USE_AS_WCSRCHR
> > > > +       punpcklbw %xmm0, %xmm0
> > > > +       punpcklwd %xmm0, %xmm0
> > > > +#endif
> > > > +       pshufd  $0, %xmm0, %xmm0
> > > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > >         ja      L(cross_page)
> > > > -       movdqu  (%rdi), %xmm0
> > > > +
> > > > +L(cross_page_continue):
> > > > +       movups  (%rdi), %xmm1
> > > >         pxor    %xmm2, %xmm2
> > > > -       movdqa  %xmm0, %xmm3
> > > > -       pcmpeqb %xmm1, %xmm0
> > > > -       pcmpeqb %xmm2, %xmm3
> > > > -       pmovmskb        %xmm0, %ecx
> > > > -       pmovmskb        %xmm3, %edx
> > > > -       testq   %rdx, %rdx
> > > > -       je      L(next_48_bytes)
> > > > -       leaq    -1(%rdx), %rax
> > > > -       xorq    %rdx, %rax
> > > > -       andq    %rcx, %rax
> > > > -       je      L(exit)
> > > > -       bsrq    %rax, %rax
> > > > +       PCMPEQ  %xmm1, %xmm2
> > > > +       pmovmskb %xmm2, %ecx
> > > > +       testl   %ecx, %ecx
> > > > +       jz      L(aligned_more)
> > > > +
> > > > +       PCMPEQ  %xmm0, %xmm1
> > > > +       pmovmskb %xmm1, %eax
> > > > +       leal    -1(%rcx), %edx
> > > > +       xorl    %edx, %ecx
> > > > +       andl    %ecx, %eax
> > > > +       jz      L(ret0)
> > > > +       bsrl    %eax, %eax
> > > >         addq    %rdi, %rax
> > > > +       /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > > > +          search CHAR is zero we are correct. Either way `andq
> > > > +          -CHAR_SIZE, %rax` gets the correct result.  */
> > > > +#ifdef USE_AS_WCSRCHR
> > > > +       andq    $-CHAR_SIZE, %rax
> > > > +#endif
> > > > +L(ret0):
> > > >         ret
> > > >
> > > > +       /* Returns for first vec x1/x2 have hard coded backward search
> > > > +          path for earlier matches.  */
> > > >         .p2align 4
> > > > -L(next_48_bytes):
> > > > -       movdqu  16(%rdi), %xmm4
> > > > -       movdqa  %xmm4, %xmm5
> > > > -       movdqu  32(%rdi), %xmm3
> > > > -       pcmpeqb %xmm1, %xmm4
> > > > -       pcmpeqb %xmm2, %xmm5
> > > > -       movdqu  48(%rdi), %xmm0
> > > > -       pmovmskb        %xmm5, %edx
> > > > -       movdqa  %xmm3, %xmm5
> > > > -       pcmpeqb %xmm1, %xmm3
> > > > -       pcmpeqb %xmm2, %xmm5
> > > > -       pcmpeqb %xmm0, %xmm2
> > > > -       salq    $16, %rdx
> > > > -       pmovmskb        %xmm3, %r8d
> > > > -       pmovmskb        %xmm5, %eax
> > > > -       pmovmskb        %xmm2, %esi
> > > > -       salq    $32, %r8
> > > > -       salq    $32, %rax
> > > > -       pcmpeqb %xmm1, %xmm0
> > > > -       orq     %rdx, %rax
> > > > -       movq    %rsi, %rdx
> > > > -       pmovmskb        %xmm4, %esi
> > > > -       salq    $48, %rdx
> > > > -       salq    $16, %rsi
> > > > -       orq     %r8, %rsi
> > > > -       orq     %rcx, %rsi
> > > > -       pmovmskb        %xmm0, %ecx
> > > > -       salq    $48, %rcx
> > > > -       orq     %rcx, %rsi
> > > > -       orq     %rdx, %rax
> > > > -       je      L(loop_header2)
> > > > -       leaq    -1(%rax), %rcx
> > > > -       xorq    %rax, %rcx
> > > > -       andq    %rcx, %rsi
> > > > -       je      L(exit)
> > > > -       bsrq    %rsi, %rsi
> > > > -       leaq    (%rdi,%rsi), %rax
> > > > +L(first_vec_x0_test):
> > > > +       PCMPEQ  %xmm0, %xmm1
> > > > +       pmovmskb %xmm1, %eax
> > > > +       testl   %eax, %eax
> > > > +       jz      L(ret0)
> > > > +       bsrl    %eax, %eax
> > > > +       addq    %r8, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > +       andq    $-CHAR_SIZE, %rax
> > > > +#endif
> > > >         ret
> > > >
> > > >         .p2align 4
> > > > -L(loop_header2):
> > > > -       testq   %rsi, %rsi
> > > > -       movq    %rdi, %rcx
> > > > -       je      L(no_c_found)
> > > > -L(loop_header):
> > > > -       addq    $64, %rdi
> > > > -       pxor    %xmm7, %xmm7
> > > > -       andq    $-64, %rdi
> > > > -       jmp     L(loop_entry)
> > > > +L(first_vec_x1):
> > > > +       PCMPEQ  %xmm0, %xmm2
> > > > +       pmovmskb %xmm2, %eax
> > > > +       leal    -1(%rcx), %edx
> > > > +       xorl    %edx, %ecx
> > > > +       andl    %ecx, %eax
> > > > +       jz      L(first_vec_x0_test)
> > > > +       bsrl    %eax, %eax
> > > > +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > +       andq    $-CHAR_SIZE, %rax
> > > > +#endif
> > > > +       ret
> > > >
> > > >         .p2align 4
> > > > -L(loop64):
> > > > -       testq   %rdx, %rdx
> > > > -       cmovne  %rdx, %rsi
> > > > -       cmovne  %rdi, %rcx
> > > > -       addq    $64, %rdi
> > > > -L(loop_entry):
> > > > -       movdqa  32(%rdi), %xmm3
> > > > -       pxor    %xmm6, %xmm6
> > > > -       movdqa  48(%rdi), %xmm2
> > > > -       movdqa  %xmm3, %xmm0
> > > > -       movdqa  16(%rdi), %xmm4
> > > > -       pminub  %xmm2, %xmm0
> > > > -       movdqa  (%rdi), %xmm5
> > > > -       pminub  %xmm4, %xmm0
> > > > -       pminub  %xmm5, %xmm0
> > > > -       pcmpeqb %xmm7, %xmm0
> > > > -       pmovmskb        %xmm0, %eax
> > > > -       movdqa  %xmm5, %xmm0
> > > > -       pcmpeqb %xmm1, %xmm0
> > > > -       pmovmskb        %xmm0, %r9d
> > > > -       movdqa  %xmm4, %xmm0
> > > > -       pcmpeqb %xmm1, %xmm0
> > > > -       pmovmskb        %xmm0, %edx
> > > > -       movdqa  %xmm3, %xmm0
> > > > -       pcmpeqb %xmm1, %xmm0
> > > > -       salq    $16, %rdx
> > > > -       pmovmskb        %xmm0, %r10d
> > > > -       movdqa  %xmm2, %xmm0
> > > > -       pcmpeqb %xmm1, %xmm0
> > > > -       salq    $32, %r10
> > > > -       orq     %r10, %rdx
> > > > -       pmovmskb        %xmm0, %r8d
> > > > -       orq     %r9, %rdx
> > > > -       salq    $48, %r8
> > > > -       orq     %r8, %rdx
> > > > +L(first_vec_x1_test):
> > > > +       PCMPEQ  %xmm0, %xmm2
> > > > +       pmovmskb %xmm2, %eax
> > > >         testl   %eax, %eax
> > > > -       je      L(loop64)
> > > > -       pcmpeqb %xmm6, %xmm4
> > > > -       pcmpeqb %xmm6, %xmm3
> > > > -       pcmpeqb %xmm6, %xmm5
> > > > -       pmovmskb        %xmm4, %eax
> > > > -       pmovmskb        %xmm3, %r10d
> > > > -       pcmpeqb %xmm6, %xmm2
> > > > -       pmovmskb        %xmm5, %r9d
> > > > -       salq    $32, %r10
> > > > -       salq    $16, %rax
> > > > -       pmovmskb        %xmm2, %r8d
> > > > -       orq     %r10, %rax
> > > > -       orq     %r9, %rax
> > > > -       salq    $48, %r8
> > > > -       orq     %r8, %rax
> > > > -       leaq    -1(%rax), %r8
> > > > -       xorq    %rax, %r8
> > > > -       andq    %r8, %rdx
> > > > -       cmovne  %rdi, %rcx
> > > > -       cmovne  %rdx, %rsi
> > > > -       bsrq    %rsi, %rsi
> > > > -       leaq    (%rcx,%rsi), %rax
> > > > +       jz      L(first_vec_x0_test)
> > > > +       bsrl    %eax, %eax
> > > > +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > +       andq    $-CHAR_SIZE, %rax
> > > > +#endif
> > > > +       ret
> > > > +
> > > > +       .p2align 4
> > > > +L(first_vec_x2):
> > > > +       PCMPEQ  %xmm0, %xmm3
> > > > +       pmovmskb %xmm3, %eax
> > > > +       leal    -1(%rcx), %edx
> > > > +       xorl    %edx, %ecx
> > > > +       andl    %ecx, %eax
> > > > +       jz      L(first_vec_x1_test)
> > > > +       bsrl    %eax, %eax
> > > > +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > +       andq    $-CHAR_SIZE, %rax
> > > > +#endif
> > > > +       ret
> > > > +
> > > > +       .p2align 4
> > > > +L(aligned_more):
> > > > +       /* Save original pointer if match was in VEC 0.  */
> > > > +       movq    %rdi, %r8
> > > > +       andq    $-VEC_SIZE, %rdi
> > > > +
> > > > +       movaps  VEC_SIZE(%rdi), %xmm2
> > > > +       pxor    %xmm3, %xmm3
> > > > +       PCMPEQ  %xmm2, %xmm3
> > > > +       pmovmskb %xmm3, %ecx
> > > > +       testl   %ecx, %ecx
> > > > +       jnz     L(first_vec_x1)
> > > > +
> > > > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm3
> > > > +       pxor    %xmm4, %xmm4
> > > > +       PCMPEQ  %xmm3, %xmm4
> > > > +       pmovmskb %xmm4, %ecx
> > > > +       testl   %ecx, %ecx
> > > > +       jnz     L(first_vec_x2)
> > > > +
> > > > +       addq    $VEC_SIZE, %rdi
> > > > +       /* Save pointer again before realigning.  */
> > > > +       movq    %rdi, %rsi
> > > > +       andq    $-(VEC_SIZE * 2), %rdi
> > > > +       .p2align 4
> > > > +L(first_loop):
> > > > +       /* Do 2x VEC at a time.  */
> > > > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> > > > +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> > > > +       /* If SSE2 no pminud.  */
> > > > +#ifdef NO_PMINU
> > >
> > > Do we really need SSE4.1 wcsrchr?  I think we should focus on AVX2 and
> > > above.
> >
> > It seems like freebie performance that can make a difference in the loop
> > cases. (see the SSE4.1 commit for numbers).
>
> But these numbers are on Tiger Lake.  I think we should continue to
> improve the SSE2 version and optimize AVX2/AVX512.  I don't think we
> should increase code size for SSE4.

Fair enough. I removed the SSE4 version but added a comment suggesting it
as an optimization if the need arises.
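
For context on the pminu question: the trick folds the null check for two
vectors into a single compare, because the unsigned element-wise minimum of two
vectors is zero exactly where either input is zero. A sketch with intrinsics
(illustrative; the byte form used by strrchr is plain SSE2, while the 4-byte
form wcsrchr would need is SSE4.1's pminud, which is what the NO_PMINU
fallback avoids):

    #include <emmintrin.h>   /* SSE2 */
    #include <smmintrin.h>   /* SSE4.1, only for the wchar_t variant */

    /* strrchr: pminub (SSE2) lets one compare test two vectors for a
       null byte.  */
    static int
    has_null_2x_bytes (__m128i a, __m128i b)
    {
      __m128i min = _mm_min_epu8 (a, b);
      return _mm_movemask_epi8 (_mm_cmpeq_epi8 (min, _mm_setzero_si128 ()));
    }

    /* wcsrchr: the same trick on 4-byte chars needs pminud (SSE4.1).
       The NO_PMINU path instead compares each vector against zero and
       ORs the two results.  */
    static int
    has_null_2x_wchars (__m128i a, __m128i b)
    {
      __m128i min = _mm_min_epu32 (a, b);
      return _mm_movemask_epi8 (_mm_cmpeq_epi32 (min, _mm_setzero_si128 ()));
    }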
>
> > Imo there is little harm, but if you feel strongly I'll drop it. (In V2 I
> > will change the .text section for SSE4_1.)
> >
> > What do you think?
> > >
> > > > +       movaps  %xmm5, %xmm6
> > > > +       pxor    %xmm8, %xmm8
> > > > +
> > > > +       PCMPEQ  %xmm8, %xmm5
> > > > +       PCMPEQ  %xmm4, %xmm8
> > > > +       por     %xmm5, %xmm8
> > > > +#else
> > > > +       movaps  %xmm5, %xmm6
> > > > +       PMINU   %xmm4, %xmm5
> > > > +#endif
> > > > +
> > > > +       movaps  %xmm4, %xmm9
> > > > +       PCMPEQ  %xmm0, %xmm4
> > > > +       PCMPEQ  %xmm0, %xmm6
> > > > +       movaps  %xmm6, %xmm7
> > > > +       por     %xmm4, %xmm6
> > > > +#ifndef NO_PMINU
> > > > +       pxor    %xmm8, %xmm8
> > > > +       PCMPEQ  %xmm5, %xmm8
> > > > +#endif
> > > > +       pmovmskb %xmm8, %ecx
> > > > +       pmovmskb %xmm6, %eax
> > > > +
> > > > +       addq    $(VEC_SIZE * 2), %rdi
> > > > +       /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> > > > +          macro-fuse with `jz`.  */
> > > > +       addl    %ecx, %eax
> > > > +       jz      L(first_loop)
> > > > +
> > > > +       /* Check if there is zero match.  */
> > > > +       testl   %ecx, %ecx
> > > > +       jz      L(second_loop_match)
> > > > +
> > > > +       /* Check if there was a match in last iteration.  */
> > > > +       subl    %ecx, %eax
> > > > +       jnz     L(new_match)
> > > > +
> > > > +L(first_loop_old_match):
> > > > +       PCMPEQ  %xmm0, %xmm2
> > > > +       PCMPEQ  %xmm0, %xmm3
> > > > +       pmovmskb %xmm2, %ecx
> > > > +       pmovmskb %xmm3, %eax
> > > > +       addl    %eax, %ecx
> > > > +       jz      L(first_vec_x0_test)
> > > > +       /* NB: We could move this shift to before the branch and save a
> > > > +          bit of code size / performance on the fall through. The
> > > > +          branch leads to the null case which generally seems hotter
> > > > +          than char in first 3x VEC.  */
> > > > +       sall    $16, %eax
> > > > +       orl     %ecx, %eax
> > > > +
> > > > +       bsrl    %eax, %eax
> > > > +       addq    %rsi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > +       andq    $-CHAR_SIZE, %rax
> > > > +#endif
> > > > +       ret
> > > > +
> > > > +       .p2align 4
> > > > +L(new_match):
> > > > +       pxor    %xmm6, %xmm6
> > > > +       PCMPEQ  %xmm9, %xmm6
> > > > +       pmovmskb %xmm6, %eax
> > > > +       sall    $16, %ecx
> > > > +       orl     %eax, %ecx
> > > > +
> > > > +       /* We can't reuse either of the old comparisons as since we mask
> > > > +          of zeros after first zero (instead of using the full
> > > > +          comparison) we can't gurantee no interference between match
> > > > +          after end of string and valid match.  */
> > > > +       pmovmskb %xmm4, %eax
> > > > +       pmovmskb %xmm7, %edx
> > > > +       sall    $16, %edx
> > > > +       orl     %edx, %eax
> > > > +
> > > > +       leal    -1(%ecx), %edx
> > > > +       xorl    %edx, %ecx
> > > > +       andl    %ecx, %eax
> > > > +       jz      L(first_loop_old_match)
> > > > +       bsrl    %eax, %eax
> > > > +       addq    %rdi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > +       andq    $-CHAR_SIZE, %rax
> > > > +#endif
> > > >         ret
> > > >
> > > > +       /* Save minimum state for getting most recent match. We can
> > > > +          throw out all previous work.  */
> > > >         .p2align 4
> > > > -L(no_c_found):
> > > > -       movl    $1, %esi
> > > > -       xorl    %ecx, %ecx
> > > > -       jmp     L(loop_header)
> > > > +L(second_loop_match):
> > > > +       movq    %rdi, %rsi
> > > > +       movaps  %xmm4, %xmm2
> > > > +       movaps  %xmm7, %xmm3
> > > >
> > > >         .p2align 4
> > > > -L(exit):
> > > > -       xorl    %eax, %eax
> > > > +L(second_loop):
> > > > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> > > > +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> > > > +#ifdef NO_PMINU
> > > > +       movaps  %xmm5, %xmm6
> > > > +       pxor    %xmm8, %xmm8
> > > > +
> > > > +       PCMPEQ  %xmm8, %xmm5
> > > > +       PCMPEQ  %xmm4, %xmm8
> > > > +       por     %xmm5, %xmm8
> > > > +#else
> > > > +       movaps  %xmm5, %xmm6
> > > > +       PMINU   %xmm4, %xmm5
> > > > +#endif
> > > > +
> > > > +       movaps  %xmm4, %xmm9
> > > > +       PCMPEQ  %xmm0, %xmm4
> > > > +       PCMPEQ  %xmm0, %xmm6
> > > > +       movaps  %xmm6, %xmm7
> > > > +       por     %xmm4, %xmm6
> > > > +#ifndef NO_PMINU
> > > > +       pxor    %xmm8, %xmm8
> > > > +       PCMPEQ  %xmm5, %xmm8
> > > > +#endif
> > > > +
> > > > +       pmovmskb %xmm8, %ecx
> > > > +       pmovmskb %xmm6, %eax
> > > > +
> > > > +       addq    $(VEC_SIZE * 2), %rdi
> > > > +       /* Either null term or new occurence of CHAR.  */
> > > > +       addl    %ecx, %eax
> > > > +       jz      L(second_loop)
> > > > +
> > > > +       /* No null term so much be new occurence of CHAR.  */
> > > > +       testl   %ecx, %ecx
> > > > +       jz      L(second_loop_match)
> > > > +
> > > > +
> > > > +       subl    %ecx, %eax
> > > > +       jnz     L(second_loop_new_match)
> > > > +
> > > > +L(second_loop_old_match):
> > > > +       pmovmskb %xmm2, %ecx
> > > > +       pmovmskb %xmm3, %eax
> > > > +       sall    $16, %eax
> > > > +       orl     %ecx, %eax
> > > > +       bsrl    %eax, %eax
> > > > +       addq    %rsi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > +       andq    $-CHAR_SIZE, %rax
> > > > +#endif
> > > >         ret
> > > >
> > > >         .p2align 4
> > > > +L(second_loop_new_match):
> > > > +       pxor    %xmm6, %xmm6
> > > > +       PCMPEQ  %xmm9, %xmm6
> > > > +       pmovmskb %xmm6, %eax
> > > > +       sall    $16, %ecx
> > > > +       orl     %eax, %ecx
> > > > +
> > > > +       /* We can't reuse either of the old comparisons as since we mask
> > > > +          of zeros after first zero (instead of using the full
> > > > +          comparison) we can't gurantee no interference between match
> > > > +          after end of string and valid match.  */
> > > > +       pmovmskb %xmm4, %eax
> > > > +       pmovmskb %xmm7, %edx
> > > > +       sall    $16, %edx
> > > > +       orl     %edx, %eax
> > > > +
> > > > +       leal    -1(%ecx), %edx
> > > > +       xorl    %edx, %ecx
> > > > +       andl    %ecx, %eax
> > > > +       jz      L(second_loop_old_match)
> > > > +       bsrl    %eax, %eax
> > > > +       addq    %rdi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > +       andq    $-CHAR_SIZE, %rax
> > > > +#endif
> > > > +       ret
> > > > +
> > > > +       .p2align 4,, 4
> > > >  L(cross_page):
> > > > -       movq    %rdi, %rax
> > > > -       pxor    %xmm0, %xmm0
> > > > -       andq    $-64, %rax
> > > > -       movdqu  (%rax), %xmm5
> > > > -       movdqa  %xmm5, %xmm6
> > > > -       movdqu  16(%rax), %xmm4
> > > > -       pcmpeqb %xmm1, %xmm5
> > > > -       pcmpeqb %xmm0, %xmm6
> > > > -       movdqu  32(%rax), %xmm3
> > > > -       pmovmskb        %xmm6, %esi
> > > > -       movdqa  %xmm4, %xmm6
> > > > -       movdqu  48(%rax), %xmm2
> > > > -       pcmpeqb %xmm1, %xmm4
> > > > -       pcmpeqb %xmm0, %xmm6
> > > > -       pmovmskb        %xmm6, %edx
> > > > -       movdqa  %xmm3, %xmm6
> > > > -       pcmpeqb %xmm1, %xmm3
> > > > -       pcmpeqb %xmm0, %xmm6
> > > > -       pcmpeqb %xmm2, %xmm0
> > > > -       salq    $16, %rdx
> > > > -       pmovmskb        %xmm3, %r9d
> > > > -       pmovmskb        %xmm6, %r8d
> > > > -       pmovmskb        %xmm0, %ecx
> > > > -       salq    $32, %r9
> > > > -       salq    $32, %r8
> > > > -       pcmpeqb %xmm1, %xmm2
> > > > -       orq     %r8, %rdx
> > > > -       salq    $48, %rcx
> > > > -       pmovmskb        %xmm5, %r8d
> > > > -       orq     %rsi, %rdx
> > > > -       pmovmskb        %xmm4, %esi
> > > > -       orq     %rcx, %rdx
> > > > -       pmovmskb        %xmm2, %ecx
> > > > -       salq    $16, %rsi
> > > > -       salq    $48, %rcx
> > > > -       orq     %r9, %rsi
> > > > -       orq     %r8, %rsi
> > > > -       orq     %rcx, %rsi
> > > > +       movq    %rdi, %rsi
> > > > +       andq    $-VEC_SIZE, %rsi
> > > > +       movaps  (%rsi), %xmm1
> > > > +       pxor    %xmm2, %xmm2
> > > > +       PCMPEQ  %xmm1, %xmm2
> > > > +       pmovmskb %xmm2, %edx
> > > >         movl    %edi, %ecx
> > > > -       subl    %eax, %ecx
> > > > -       shrq    %cl, %rdx
> > > > -       shrq    %cl, %rsi
> > > > -       testq   %rdx, %rdx
> > > > -       je      L(loop_header2)
> > > > -       leaq    -1(%rdx), %rax
> > > > -       xorq    %rdx, %rax
> > > > -       andq    %rax, %rsi
> > > > -       je      L(exit)
> > > > -       bsrq    %rsi, %rax
> > > > +       andl    $(VEC_SIZE - 1), %ecx
> > > > +       sarl    %cl, %edx
> > > > +       jz      L(cross_page_continue)
> > > > +       PCMPEQ  %xmm0, %xmm1
> > > > +       pmovmskb %xmm1, %eax
> > > > +       sarl    %cl, %eax
> > > > +       leal    -1(%rdx), %ecx
> > > > +       xorl    %edx, %ecx
> > > > +       andl    %ecx, %eax
> > > > +       jz      L(ret1)
> > > > +       bsrl    %eax, %eax
> > > >         addq    %rdi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > +       andq    $-CHAR_SIZE, %rax
> > > > +#endif
> > > > +L(ret1):
> > > >         ret
> > > > -END (strrchr)
> > > > +END(STRRCHR)
> > > >
> > > > -weak_alias (strrchr, rindex)
> > > > -libc_hidden_builtin_def (strrchr)
> > > > +#ifndef USE_AS_WCSRCHR
> > > > +       weak_alias (STRRCHR, rindex)
> > > > +       libc_hidden_builtin_def (STRRCHR)
> > > > +#endif
> > > > diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> > > > index 61552954de..2b80efc5ef 100644
> > > > --- a/sysdeps/x86_64/wcsrchr.S
> > > > +++ b/sysdeps/x86_64/wcsrchr.S
> > > > @@ -1,4 +1,4 @@
> > > > -/* wcsrchr with SSSE3
> > > > +/* wcsrchr optimized with SSE2.
> > > >     Copyright (C) 2011-2022 Free Software Foundation, Inc.
> > > >     This file is part of the GNU C Library.
> > > >
> > > > @@ -16,266 +16,12 @@
> > > >     License along with the GNU C Library; if not, see
> > > >     <https://www.gnu.org/licenses/>.  */
> > > >
> > > > -#include <sysdep.h>
> > > >
> > > > -       .text
> > > > -ENTRY (wcsrchr)
> > > > +#define USE_AS_WCSRCHR 1
> > > > +#define NO_PMINU       1
> > > >
> > > > -       movd    %rsi, %xmm1
> > > > -       mov     %rdi, %rcx
> > > > -       punpckldq %xmm1, %xmm1
> > > > -       pxor    %xmm2, %xmm2
> > > > -       punpckldq %xmm1, %xmm1
> > > > -       and     $63, %rcx
> > > > -       cmp     $48, %rcx
> > > > -       ja      L(crosscache)
> > > > +#ifndef STRRCHR
> > > > +# define STRRCHR       wcsrchr
> > > > +#endif
> > > >
> > > > -       movdqu  (%rdi), %xmm0
> > > > -       pcmpeqd %xmm0, %xmm2
> > > > -       pcmpeqd %xmm1, %xmm0
> > > > -       pmovmskb %xmm2, %rcx
> > > > -       pmovmskb %xmm0, %rax
> > > > -       add     $16, %rdi
> > > > -
> > > > -       test    %rax, %rax
> > > > -       jnz     L(unaligned_match1)
> > > > -
> > > > -       test    %rcx, %rcx
> > > > -       jnz     L(return_null)
> > > > -
> > > > -       and     $-16, %rdi
> > > > -       xor     %r8, %r8
> > > > -       jmp     L(loop)
> > > > -
> > > > -       .p2align 4
> > > > -L(unaligned_match1):
> > > > -       test    %rcx, %rcx
> > > > -       jnz     L(prolog_find_zero_1)
> > > > -
> > > > -       mov     %rax, %r8
> > > > -       mov     %rdi, %rsi
> > > > -       and     $-16, %rdi
> > > > -       jmp     L(loop)
> > > > -
> > > > -       .p2align 4
> > > > -L(crosscache):
> > > > -       and     $15, %rcx
> > > > -       and     $-16, %rdi
> > > > -       pxor    %xmm3, %xmm3
> > > > -       movdqa  (%rdi), %xmm0
> > > > -       pcmpeqd %xmm0, %xmm3
> > > > -       pcmpeqd %xmm1, %xmm0
> > > > -       pmovmskb %xmm3, %rdx
> > > > -       pmovmskb %xmm0, %rax
> > > > -       shr     %cl, %rdx
> > > > -       shr     %cl, %rax
> > > > -       add     $16, %rdi
> > > > -
> > > > -       test    %rax, %rax
> > > > -       jnz     L(unaligned_match)
> > > > -
> > > > -       test    %rdx, %rdx
> > > > -       jnz     L(return_null)
> > > > -
> > > > -       xor     %r8, %r8
> > > > -       jmp     L(loop)
> > > > -
> > > > -       .p2align 4
> > > > -L(unaligned_match):
> > > > -       test    %rdx, %rdx
> > > > -       jnz     L(prolog_find_zero)
> > > > -
> > > > -       mov     %rax, %r8
> > > > -       lea     (%rdi, %rcx), %rsi
> > > > -
> > > > -/* Loop start on aligned string.  */
> > > > -       .p2align 4
> > > > -L(loop):
> > > > -       movdqa  (%rdi), %xmm0
> > > > -       pcmpeqd %xmm0, %xmm2
> > > > -       add     $16, %rdi
> > > > -       pcmpeqd %xmm1, %xmm0
> > > > -       pmovmskb %xmm2, %rcx
> > > > -       pmovmskb %xmm0, %rax
> > > > -       or      %rax, %rcx
> > > > -       jnz     L(matches)
> > > > -
> > > > -       movdqa  (%rdi), %xmm3
> > > > -       pcmpeqd %xmm3, %xmm2
> > > > -       add     $16, %rdi
> > > > -       pcmpeqd %xmm1, %xmm3
> > > > -       pmovmskb %xmm2, %rcx
> > > > -       pmovmskb %xmm3, %rax
> > > > -       or      %rax, %rcx
> > > > -       jnz     L(matches)
> > > > -
> > > > -       movdqa  (%rdi), %xmm4
> > > > -       pcmpeqd %xmm4, %xmm2
> > > > -       add     $16, %rdi
> > > > -       pcmpeqd %xmm1, %xmm4
> > > > -       pmovmskb %xmm2, %rcx
> > > > -       pmovmskb %xmm4, %rax
> > > > -       or      %rax, %rcx
> > > > -       jnz     L(matches)
> > > > -
> > > > -       movdqa  (%rdi), %xmm5
> > > > -       pcmpeqd %xmm5, %xmm2
> > > > -       add     $16, %rdi
> > > > -       pcmpeqd %xmm1, %xmm5
> > > > -       pmovmskb %xmm2, %rcx
> > > > -       pmovmskb %xmm5, %rax
> > > > -       or      %rax, %rcx
> > > > -       jz      L(loop)
> > > > -
> > > > -       .p2align 4
> > > > -L(matches):
> > > > -       test    %rax, %rax
> > > > -       jnz     L(match)
> > > > -L(return_value):
> > > > -       test    %r8, %r8
> > > > -       jz      L(return_null)
> > > > -       mov     %r8, %rax
> > > > -       mov     %rsi, %rdi
> > > > -
> > > > -       test    $15 << 4, %ah
> > > > -       jnz     L(match_fourth_wchar)
> > > > -       test    %ah, %ah
> > > > -       jnz     L(match_third_wchar)
> > > > -       test    $15 << 4, %al
> > > > -       jnz     L(match_second_wchar)
> > > > -       lea     -16(%rdi), %rax
> > > > -       ret
> > > > -
> > > > -       .p2align 4
> > > > -L(match):
> > > > -       pmovmskb %xmm2, %rcx
> > > > -       test    %rcx, %rcx
> > > > -       jnz     L(find_zero)
> > > > -       mov     %rax, %r8
> > > > -       mov     %rdi, %rsi
> > > > -       jmp     L(loop)
> > > > -
> > > > -       .p2align 4
> > > > -L(find_zero):
> > > > -       test    $15, %cl
> > > > -       jnz     L(find_zero_in_first_wchar)
> > > > -       test    %cl, %cl
> > > > -       jnz     L(find_zero_in_second_wchar)
> > > > -       test    $15, %ch
> > > > -       jnz     L(find_zero_in_third_wchar)
> > > > -
> > > > -       and     $1 << 13 - 1, %rax
> > > > -       jz      L(return_value)
> > > > -
> > > > -       test    $15 << 4, %ah
> > > > -       jnz     L(match_fourth_wchar)
> > > > -       test    %ah, %ah
> > > > -       jnz     L(match_third_wchar)
> > > > -       test    $15 << 4, %al
> > > > -       jnz     L(match_second_wchar)
> > > > -       lea     -16(%rdi), %rax
> > > > -       ret
> > > > -
> > > > -       .p2align 4
> > > > -L(find_zero_in_first_wchar):
> > > > -       test    $1, %rax
> > > > -       jz      L(return_value)
> > > > -       lea     -16(%rdi), %rax
> > > > -       ret
> > > > -
> > > > -       .p2align 4
> > > > -L(find_zero_in_second_wchar):
> > > > -       and     $1 << 5 - 1, %rax
> > > > -       jz      L(return_value)
> > > > -
> > > > -       test    $15 << 4, %al
> > > > -       jnz     L(match_second_wchar)
> > > > -       lea     -16(%rdi), %rax
> > > > -       ret
> > > > -
> > > > -       .p2align 4
> > > > -L(find_zero_in_third_wchar):
> > > > -       and     $1 << 9 - 1, %rax
> > > > -       jz      L(return_value)
> > > > -
> > > > -       test    %ah, %ah
> > > > -       jnz     L(match_third_wchar)
> > > > -       test    $15 << 4, %al
> > > > -       jnz     L(match_second_wchar)
> > > > -       lea     -16(%rdi), %rax
> > > > -       ret
> > > > -
> > > > -       .p2align 4
> > > > -L(prolog_find_zero):
> > > > -       add     %rcx, %rdi
> > > > -       mov     %rdx, %rcx
> > > > -L(prolog_find_zero_1):
> > > > -       test    $15, %cl
> > > > -       jnz     L(prolog_find_zero_in_first_wchar)
> > > > -       test    %cl, %cl
> > > > -       jnz     L(prolog_find_zero_in_second_wchar)
> > > > -       test    $15, %ch
> > > > -       jnz     L(prolog_find_zero_in_third_wchar)
> > > > -
> > > > -       and     $1 << 13 - 1, %rax
> > > > -       jz      L(return_null)
> > > > -
> > > > -       test    $15 << 4, %ah
> > > > -       jnz     L(match_fourth_wchar)
> > > > -       test    %ah, %ah
> > > > -       jnz     L(match_third_wchar)
> > > > -       test    $15 << 4, %al
> > > > -       jnz     L(match_second_wchar)
> > > > -       lea     -16(%rdi), %rax
> > > > -       ret
> > > > -
> > > > -       .p2align 4
> > > > -L(prolog_find_zero_in_first_wchar):
> > > > -       test    $1, %rax
> > > > -       jz      L(return_null)
> > > > -       lea     -16(%rdi), %rax
> > > > -       ret
> > > > -
> > > > -       .p2align 4
> > > > -L(prolog_find_zero_in_second_wchar):
> > > > -       and     $1 << 5 - 1, %rax
> > > > -       jz      L(return_null)
> > > > -
> > > > -       test    $15 << 4, %al
> > > > -       jnz     L(match_second_wchar)
> > > > -       lea     -16(%rdi), %rax
> > > > -       ret
> > > > -
> > > > -       .p2align 4
> > > > -L(prolog_find_zero_in_third_wchar):
> > > > -       and     $1 << 9 - 1, %rax
> > > > -       jz      L(return_null)
> > > > -
> > > > -       test    %ah, %ah
> > > > -       jnz     L(match_third_wchar)
> > > > -       test    $15 << 4, %al
> > > > -       jnz     L(match_second_wchar)
> > > > -       lea     -16(%rdi), %rax
> > > > -       ret
> > > > -
> > > > -       .p2align 4
> > > > -L(match_second_wchar):
> > > > -       lea     -12(%rdi), %rax
> > > > -       ret
> > > > -
> > > > -       .p2align 4
> > > > -L(match_third_wchar):
> > > > -       lea     -8(%rdi), %rax
> > > > -       ret
> > > > -
> > > > -       .p2align 4
> > > > -L(match_fourth_wchar):
> > > > -       lea     -4(%rdi), %rax
> > > > -       ret
> > > > -
> > > > -       .p2align 4
> > > > -L(return_null):
> > > > -       xor     %rax, %rax
> > > > -       ret
> > > > -
> > > > -END (wcsrchr)
> > > > +#include "../strrchr.S"
> > > > --
> > > > 2.25.1
> > > >
> > >
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.


* Re: [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2
  2022-04-21 22:22   ` [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
@ 2022-04-21 23:46     ` H.J. Lu
  2022-04-22  1:54       ` Noah Goldstein
  0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-21 23:46 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 21, 2022 at 3:22 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code unrolls the main loop slightly without adding too much
> overhead and minimizes the comparisons for the search CHAR.
>
> Geometric Mean of all benchmarks New / Old: 0.741
> See email for all results.
>
> Full xcheck passes on x86_64 with and without multiarch enabled.
> ---
>  sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
>  sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
>  sysdeps/x86_64/strrchr.S                | 510 +++++++++++++++---------
>  sysdeps/x86_64/wcsrchr.S                | 268 +------------
>  4 files changed, 339 insertions(+), 444 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> index db1b44c23c..866396e947 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> @@ -17,7 +17,7 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -# define strrchr __strrchr_sse2
> +# define STRRCHR __strrchr_sse2
>
>  # undef weak_alias
>  # define weak_alias(strrchr, rindex)
> diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> index 78d1ca6553..69d2f3cdb1 100644
> --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> @@ -17,7 +17,6 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -# define wcsrchr __wcsrchr_sse2
> +# define STRRCHR       __wcsrchr_sse2
>  #endif
> -
>  #include "../wcsrchr.S"
> diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> index 50d886713e..6efb25c880 100644
> --- a/sysdeps/x86_64/strrchr.S
> +++ b/sysdeps/x86_64/strrchr.S
> @@ -19,210 +19,360 @@
>
>  #include <sysdep.h>
>
> +#ifndef STRRCHR
> +# define STRRCHR       strrchr
> +#endif
> +
> +#ifdef USE_AS_WCSRCHR
> +# define PCMPEQ        pcmpeqd
> +# define CHAR_SIZE     4
> +# define PMINU pminud
> +#else
> +# define PCMPEQ        pcmpeqb
> +# define CHAR_SIZE     1
> +# define PMINU pminub
> +#endif
> +
> +#define PAGE_SIZE      4096
> +#define VEC_SIZE       16
> +
>         .text
> -ENTRY (strrchr)
> -       movd    %esi, %xmm1
> +ENTRY(STRRCHR)
> +       movd    %esi, %xmm0
>         movq    %rdi, %rax
> -       andl    $4095, %eax
> -       punpcklbw       %xmm1, %xmm1
> -       cmpq    $4032, %rax
> -       punpcklwd       %xmm1, %xmm1
> -       pshufd  $0, %xmm1, %xmm1
> +       andl    $(PAGE_SIZE - 1), %eax
> +#ifndef USE_AS_WCSRCHR
> +       punpcklbw %xmm0, %xmm0
> +       punpcklwd %xmm0, %xmm0
> +#endif
> +       pshufd  $0, %xmm0, %xmm0
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         ja      L(cross_page)
> -       movdqu  (%rdi), %xmm0
> +
> +L(cross_page_continue):
> +       movups  (%rdi), %xmm1
>         pxor    %xmm2, %xmm2
> -       movdqa  %xmm0, %xmm3
> -       pcmpeqb %xmm1, %xmm0
> -       pcmpeqb %xmm2, %xmm3
> -       pmovmskb        %xmm0, %ecx
> -       pmovmskb        %xmm3, %edx
> -       testq   %rdx, %rdx
> -       je      L(next_48_bytes)
> -       leaq    -1(%rdx), %rax
> -       xorq    %rdx, %rax
> -       andq    %rcx, %rax
> -       je      L(exit)
> -       bsrq    %rax, %rax
> +       PCMPEQ  %xmm1, %xmm2
> +       pmovmskb %xmm2, %ecx
> +       testl   %ecx, %ecx
> +       jz      L(aligned_more)
> +
> +       PCMPEQ  %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +       leal    -1(%rcx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(ret0)
> +       bsrl    %eax, %eax
>         addq    %rdi, %rax
> +       /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> +          search CHAR is zero we are correct. Either way `andq
> +          -CHAR_SIZE, %rax` gets the correct result.  */
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +L(ret0):
>         ret
>
> +       /* Returns for first vec x1/x2 have hard coded backward search
> +          path for earlier matches.  */
>         .p2align 4
> -L(next_48_bytes):
> -       movdqu  16(%rdi), %xmm4
> -       movdqa  %xmm4, %xmm5
> -       movdqu  32(%rdi), %xmm3
> -       pcmpeqb %xmm1, %xmm4
> -       pcmpeqb %xmm2, %xmm5
> -       movdqu  48(%rdi), %xmm0
> -       pmovmskb        %xmm5, %edx
> -       movdqa  %xmm3, %xmm5
> -       pcmpeqb %xmm1, %xmm3
> -       pcmpeqb %xmm2, %xmm5
> -       pcmpeqb %xmm0, %xmm2
> -       salq    $16, %rdx
> -       pmovmskb        %xmm3, %r8d
> -       pmovmskb        %xmm5, %eax
> -       pmovmskb        %xmm2, %esi
> -       salq    $32, %r8
> -       salq    $32, %rax
> -       pcmpeqb %xmm1, %xmm0
> -       orq     %rdx, %rax
> -       movq    %rsi, %rdx
> -       pmovmskb        %xmm4, %esi
> -       salq    $48, %rdx
> -       salq    $16, %rsi
> -       orq     %r8, %rsi
> -       orq     %rcx, %rsi
> -       pmovmskb        %xmm0, %ecx
> -       salq    $48, %rcx
> -       orq     %rcx, %rsi
> -       orq     %rdx, %rax
> -       je      L(loop_header2)
> -       leaq    -1(%rax), %rcx
> -       xorq    %rax, %rcx
> -       andq    %rcx, %rsi
> -       je      L(exit)
> -       bsrq    %rsi, %rsi
> -       leaq    (%rdi,%rsi), %rax
> +L(first_vec_x0_test):
> +       PCMPEQ  %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +       testl   %eax, %eax
> +       jz      L(ret0)
> +       bsrl    %eax, %eax
> +       addq    %r8, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
>         ret
>
>         .p2align 4
> -L(loop_header2):
> -       testq   %rsi, %rsi
> -       movq    %rdi, %rcx
> -       je      L(no_c_found)
> -L(loop_header):
> -       addq    $64, %rdi
> -       pxor    %xmm7, %xmm7
> -       andq    $-64, %rdi
> -       jmp     L(loop_entry)
> +L(first_vec_x1):
> +       PCMPEQ  %xmm0, %xmm2
> +       pmovmskb %xmm2, %eax
> +       leal    -1(%rcx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(first_vec_x0_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
>
>         .p2align 4
> -L(loop64):
> -       testq   %rdx, %rdx
> -       cmovne  %rdx, %rsi
> -       cmovne  %rdi, %rcx
> -       addq    $64, %rdi
> -L(loop_entry):
> -       movdqa  32(%rdi), %xmm3
> -       pxor    %xmm6, %xmm6
> -       movdqa  48(%rdi), %xmm2
> -       movdqa  %xmm3, %xmm0
> -       movdqa  16(%rdi), %xmm4
> -       pminub  %xmm2, %xmm0
> -       movdqa  (%rdi), %xmm5
> -       pminub  %xmm4, %xmm0
> -       pminub  %xmm5, %xmm0
> -       pcmpeqb %xmm7, %xmm0
> -       pmovmskb        %xmm0, %eax
> -       movdqa  %xmm5, %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       pmovmskb        %xmm0, %r9d
> -       movdqa  %xmm4, %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       pmovmskb        %xmm0, %edx
> -       movdqa  %xmm3, %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       salq    $16, %rdx
> -       pmovmskb        %xmm0, %r10d
> -       movdqa  %xmm2, %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       salq    $32, %r10
> -       orq     %r10, %rdx
> -       pmovmskb        %xmm0, %r8d
> -       orq     %r9, %rdx
> -       salq    $48, %r8
> -       orq     %r8, %rdx
> +L(first_vec_x1_test):
> +       PCMPEQ  %xmm0, %xmm2
> +       pmovmskb %xmm2, %eax
>         testl   %eax, %eax
> -       je      L(loop64)
> -       pcmpeqb %xmm6, %xmm4
> -       pcmpeqb %xmm6, %xmm3
> -       pcmpeqb %xmm6, %xmm5
> -       pmovmskb        %xmm4, %eax
> -       pmovmskb        %xmm3, %r10d
> -       pcmpeqb %xmm6, %xmm2
> -       pmovmskb        %xmm5, %r9d
> -       salq    $32, %r10
> -       salq    $16, %rax
> -       pmovmskb        %xmm2, %r8d
> -       orq     %r10, %rax
> -       orq     %r9, %rax
> -       salq    $48, %r8
> -       orq     %r8, %rax
> -       leaq    -1(%rax), %r8
> -       xorq    %rax, %r8
> -       andq    %r8, %rdx
> -       cmovne  %rdi, %rcx
> -       cmovne  %rdx, %rsi
> -       bsrq    %rsi, %rsi
> -       leaq    (%rcx,%rsi), %rax
> +       jz      L(first_vec_x0_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(first_vec_x2):
> +       PCMPEQ  %xmm0, %xmm3
> +       pmovmskb %xmm3, %eax
> +       leal    -1(%rcx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(first_vec_x1_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(aligned_more):
> +       /* Save original pointer if match was in VEC 0.  */
> +       movq    %rdi, %r8
> +       andq    $-VEC_SIZE, %rdi
> +
> +       movaps  VEC_SIZE(%rdi), %xmm2
> +       pxor    %xmm3, %xmm3
> +       PCMPEQ  %xmm2, %xmm3
> +       pmovmskb %xmm3, %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x1)
> +
> +       movaps  (VEC_SIZE * 2)(%rdi), %xmm3
> +       pxor    %xmm4, %xmm4
> +       PCMPEQ  %xmm3, %xmm4
> +       pmovmskb %xmm4, %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x2)
> +
> +       addq    $VEC_SIZE, %rdi
> +       /* Save pointer again before realigning.  */
> +       movq    %rdi, %rsi
> +       andq    $-(VEC_SIZE * 2), %rdi
> +       .p2align 4
> +L(first_loop):
> +       /* Do 2x VEC at a time.  */
> +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> +       /* If SSE2 no pminud so wcsrchr needs separate logic for
              Did you mean "Since", instead of "If"?

> +          detecting zero. Note if this is found to be a bottleneck it
> +          may be worth adding an SSE4.1 wcsrchr implementation.  */
> +#ifdef USE_AS_WCSRCHR
> +       movaps  %xmm5, %xmm6
> +       pxor    %xmm8, %xmm8
> +
> +       PCMPEQ  %xmm8, %xmm5
> +       PCMPEQ  %xmm4, %xmm8
> +       por     %xmm5, %xmm8
> +#else
> +       movaps  %xmm5, %xmm6
> +       PMINU   %xmm4, %xmm5
> +#endif
> +
> +       movaps  %xmm4, %xmm9
> +       PCMPEQ  %xmm0, %xmm4
> +       PCMPEQ  %xmm0, %xmm6
> +       movaps  %xmm6, %xmm7
> +       por     %xmm4, %xmm6
> +#ifndef USE_AS_WCSRCHR
> +       pxor    %xmm8, %xmm8
> +       PCMPEQ  %xmm5, %xmm8
> +#endif
> +       pmovmskb %xmm8, %ecx
> +       pmovmskb %xmm6, %eax
> +
> +       addq    $(VEC_SIZE * 2), %rdi
> +       /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> +          macro-fuse with `jz`.  */
> +       addl    %ecx, %eax
> +       jz      L(first_loop)
> +
> +       /* Check if there is zero match.  */
> +       testl   %ecx, %ecx
> +       jz      L(second_loop_match)
> +
> +       /* Check if there was a match in last iteration.  */
> +       subl    %ecx, %eax
> +       jnz     L(new_match)
> +
> +L(first_loop_old_match):
> +       PCMPEQ  %xmm0, %xmm2
> +       PCMPEQ  %xmm0, %xmm3
> +       pmovmskb %xmm2, %ecx
> +       pmovmskb %xmm3, %eax
> +       addl    %eax, %ecx
> +       jz      L(first_vec_x0_test)
> +       /* NB: We could move this shift to before the branch and save a
> +          bit of code size / performance on the fall through. The
> +          branch leads to the null case which generally seems hotter
> +          than char in first 3x VEC.  */
> +       sall    $16, %eax
> +       orl     %ecx, %eax
> +
> +       bsrl    %eax, %eax
> +       addq    %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(new_match):
> +       pxor    %xmm6, %xmm6
> +       PCMPEQ  %xmm9, %xmm6
> +       pmovmskb %xmm6, %eax
> +       sall    $16, %ecx
> +       orl     %eax, %ecx
> +
> +       /* We can't reuse either of the old comparisons: since we mask
> +          off zeros after the first zero (instead of using the full
> +          comparison) we can't guarantee no interference between a
> +          match after the end of the string and a valid match.  */
> +       pmovmskb %xmm4, %eax
> +       pmovmskb %xmm7, %edx
> +       sall    $16, %edx
> +       orl     %edx, %eax
> +
> +       leal    -1(%ecx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(first_loop_old_match)
> +       bsrl    %eax, %eax
> +       addq    %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
>         ret
>
> +       /* Save minimum state for getting most recent match. We can
> +          throw out all previous work.  */
>         .p2align 4
> -L(no_c_found):
> -       movl    $1, %esi
> -       xorl    %ecx, %ecx
> -       jmp     L(loop_header)
> +L(second_loop_match):
> +       movq    %rdi, %rsi
> +       movaps  %xmm4, %xmm2
> +       movaps  %xmm7, %xmm3
>
>         .p2align 4
> -L(exit):
> -       xorl    %eax, %eax
> +L(second_loop):
> +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> +       /* If SSE2 no pminud so wcsrchr needs separate logic for
                Did you mean "Since", instead of "If"?

> +          detecting zero. Note if this is found to be a bottleneck it
> +          may be worth adding an SSE4.1 wcsrchr implementation.  */
> +#ifdef USE_AS_WCSRCHR
> +       movaps  %xmm5, %xmm6
> +       pxor    %xmm8, %xmm8
> +
> +       PCMPEQ  %xmm8, %xmm5
> +       PCMPEQ  %xmm4, %xmm8
> +       por     %xmm5, %xmm8
> +#else
> +       movaps  %xmm5, %xmm6
> +       PMINU   %xmm4, %xmm5
> +#endif
> +
> +       movaps  %xmm4, %xmm9
> +       PCMPEQ  %xmm0, %xmm4
> +       PCMPEQ  %xmm0, %xmm6
> +       movaps  %xmm6, %xmm7
> +       por     %xmm4, %xmm6
> +#ifndef USE_AS_WCSRCHR
> +       pxor    %xmm8, %xmm8
> +       PCMPEQ  %xmm5, %xmm8
> +#endif
> +
> +       pmovmskb %xmm8, %ecx
> +       pmovmskb %xmm6, %eax
> +
> +       addq    $(VEC_SIZE * 2), %rdi
> +       /* Either null term or a new occurrence of CHAR.  */
> +       addl    %ecx, %eax
> +       jz      L(second_loop)
> +
> +       /* No null term so it must be a new occurrence of CHAR.  */
> +       testl   %ecx, %ecx
> +       jz      L(second_loop_match)
> +
> +
> +       subl    %ecx, %eax
> +       jnz     L(second_loop_new_match)
> +
> +L(second_loop_old_match):
> +       pmovmskb %xmm2, %ecx
> +       pmovmskb %xmm3, %eax
> +       sall    $16, %eax
> +       orl     %ecx, %eax
> +       bsrl    %eax, %eax
> +       addq    %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
>         ret
>
>         .p2align 4
> +L(second_loop_new_match):
> +       pxor    %xmm6, %xmm6
> +       PCMPEQ  %xmm9, %xmm6
> +       pmovmskb %xmm6, %eax
> +       sall    $16, %ecx
> +       orl     %eax, %ecx
> +
> +       /* We can't reuse either of the old comparisons: since we mask
> +          off zeros after the first zero (instead of using the full
> +          comparison) we can't guarantee no interference between a
> +          match after the end of the string and a valid match.  */
> +       pmovmskb %xmm4, %eax
> +       pmovmskb %xmm7, %edx
> +       sall    $16, %edx
> +       orl     %edx, %eax
> +
> +       leal    -1(%ecx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(second_loop_old_match)
> +       bsrl    %eax, %eax
> +       addq    %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4,, 4
>  L(cross_page):
> -       movq    %rdi, %rax
> -       pxor    %xmm0, %xmm0
> -       andq    $-64, %rax
> -       movdqu  (%rax), %xmm5
> -       movdqa  %xmm5, %xmm6
> -       movdqu  16(%rax), %xmm4
> -       pcmpeqb %xmm1, %xmm5
> -       pcmpeqb %xmm0, %xmm6
> -       movdqu  32(%rax), %xmm3
> -       pmovmskb        %xmm6, %esi
> -       movdqa  %xmm4, %xmm6
> -       movdqu  48(%rax), %xmm2
> -       pcmpeqb %xmm1, %xmm4
> -       pcmpeqb %xmm0, %xmm6
> -       pmovmskb        %xmm6, %edx
> -       movdqa  %xmm3, %xmm6
> -       pcmpeqb %xmm1, %xmm3
> -       pcmpeqb %xmm0, %xmm6
> -       pcmpeqb %xmm2, %xmm0
> -       salq    $16, %rdx
> -       pmovmskb        %xmm3, %r9d
> -       pmovmskb        %xmm6, %r8d
> -       pmovmskb        %xmm0, %ecx
> -       salq    $32, %r9
> -       salq    $32, %r8
> -       pcmpeqb %xmm1, %xmm2
> -       orq     %r8, %rdx
> -       salq    $48, %rcx
> -       pmovmskb        %xmm5, %r8d
> -       orq     %rsi, %rdx
> -       pmovmskb        %xmm4, %esi
> -       orq     %rcx, %rdx
> -       pmovmskb        %xmm2, %ecx
> -       salq    $16, %rsi
> -       salq    $48, %rcx
> -       orq     %r9, %rsi
> -       orq     %r8, %rsi
> -       orq     %rcx, %rsi
> +       movq    %rdi, %rsi
> +       andq    $-VEC_SIZE, %rsi
> +       movaps  (%rsi), %xmm1
> +       pxor    %xmm2, %xmm2
> +       PCMPEQ  %xmm1, %xmm2
> +       pmovmskb %xmm2, %edx
>         movl    %edi, %ecx
> -       subl    %eax, %ecx
> -       shrq    %cl, %rdx
> -       shrq    %cl, %rsi
> -       testq   %rdx, %rdx
> -       je      L(loop_header2)
> -       leaq    -1(%rdx), %rax
> -       xorq    %rdx, %rax
> -       andq    %rax, %rsi
> -       je      L(exit)
> -       bsrq    %rsi, %rax
> +       andl    $(VEC_SIZE - 1), %ecx
> +       sarl    %cl, %edx
> +       jz      L(cross_page_continue)
> +       PCMPEQ  %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +       sarl    %cl, %eax
> +       leal    -1(%rdx), %ecx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(ret1)
> +       bsrl    %eax, %eax
>         addq    %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +L(ret1):
>         ret
> -END (strrchr)
> +END(STRRCHR)
>
> -weak_alias (strrchr, rindex)
> -libc_hidden_builtin_def (strrchr)
> +#ifndef USE_AS_WCSRCHR
> +       weak_alias (STRRCHR, rindex)
> +       libc_hidden_builtin_def (STRRCHR)
> +#endif
> diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> index 61552954de..2b80efc5ef 100644
> --- a/sysdeps/x86_64/wcsrchr.S
> +++ b/sysdeps/x86_64/wcsrchr.S
> @@ -1,4 +1,4 @@
> -/* wcsrchr with SSSE3
> +/* wcsrchr optimized with SSE2.
>     Copyright (C) 2011-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
> @@ -16,266 +16,12 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <sysdep.h>
>
> -       .text
> -ENTRY (wcsrchr)
> +#define USE_AS_WCSRCHR 1
> +#define NO_PMINU       1
>
> -       movd    %rsi, %xmm1
> -       mov     %rdi, %rcx
> -       punpckldq %xmm1, %xmm1
> -       pxor    %xmm2, %xmm2
> -       punpckldq %xmm1, %xmm1
> -       and     $63, %rcx
> -       cmp     $48, %rcx
> -       ja      L(crosscache)
> +#ifndef STRRCHR
> +# define STRRCHR       wcsrchr
> +#endif
>
> -       movdqu  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm2
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm0, %rax
> -       add     $16, %rdi
> -
> -       test    %rax, %rax
> -       jnz     L(unaligned_match1)
> -
> -       test    %rcx, %rcx
> -       jnz     L(return_null)
> -
> -       and     $-16, %rdi
> -       xor     %r8, %r8
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(unaligned_match1):
> -       test    %rcx, %rcx
> -       jnz     L(prolog_find_zero_1)
> -
> -       mov     %rax, %r8
> -       mov     %rdi, %rsi
> -       and     $-16, %rdi
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(crosscache):
> -       and     $15, %rcx
> -       and     $-16, %rdi
> -       pxor    %xmm3, %xmm3
> -       movdqa  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm3
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm3, %rdx
> -       pmovmskb %xmm0, %rax
> -       shr     %cl, %rdx
> -       shr     %cl, %rax
> -       add     $16, %rdi
> -
> -       test    %rax, %rax
> -       jnz     L(unaligned_match)
> -
> -       test    %rdx, %rdx
> -       jnz     L(return_null)
> -
> -       xor     %r8, %r8
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(unaligned_match):
> -       test    %rdx, %rdx
> -       jnz     L(prolog_find_zero)
> -
> -       mov     %rax, %r8
> -       lea     (%rdi, %rcx), %rsi
> -
> -/* Loop start on aligned string.  */
> -       .p2align 4
> -L(loop):
> -       movdqa  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm0, %rax
> -       or      %rax, %rcx
> -       jnz     L(matches)
> -
> -       movdqa  (%rdi), %xmm3
> -       pcmpeqd %xmm3, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm3
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm3, %rax
> -       or      %rax, %rcx
> -       jnz     L(matches)
> -
> -       movdqa  (%rdi), %xmm4
> -       pcmpeqd %xmm4, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm4
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm4, %rax
> -       or      %rax, %rcx
> -       jnz     L(matches)
> -
> -       movdqa  (%rdi), %xmm5
> -       pcmpeqd %xmm5, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm5
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm5, %rax
> -       or      %rax, %rcx
> -       jz      L(loop)
> -
> -       .p2align 4
> -L(matches):
> -       test    %rax, %rax
> -       jnz     L(match)
> -L(return_value):
> -       test    %r8, %r8
> -       jz      L(return_null)
> -       mov     %r8, %rax
> -       mov     %rsi, %rdi
> -
> -       test    $15 << 4, %ah
> -       jnz     L(match_fourth_wchar)
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(match):
> -       pmovmskb %xmm2, %rcx
> -       test    %rcx, %rcx
> -       jnz     L(find_zero)
> -       mov     %rax, %r8
> -       mov     %rdi, %rsi
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(find_zero):
> -       test    $15, %cl
> -       jnz     L(find_zero_in_first_wchar)
> -       test    %cl, %cl
> -       jnz     L(find_zero_in_second_wchar)
> -       test    $15, %ch
> -       jnz     L(find_zero_in_third_wchar)
> -
> -       and     $1 << 13 - 1, %rax
> -       jz      L(return_value)
> -
> -       test    $15 << 4, %ah
> -       jnz     L(match_fourth_wchar)
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(find_zero_in_first_wchar):
> -       test    $1, %rax
> -       jz      L(return_value)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(find_zero_in_second_wchar):
> -       and     $1 << 5 - 1, %rax
> -       jz      L(return_value)
> -
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(find_zero_in_third_wchar):
> -       and     $1 << 9 - 1, %rax
> -       jz      L(return_value)
> -
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(prolog_find_zero):
> -       add     %rcx, %rdi
> -       mov     %rdx, %rcx
> -L(prolog_find_zero_1):
> -       test    $15, %cl
> -       jnz     L(prolog_find_zero_in_first_wchar)
> -       test    %cl, %cl
> -       jnz     L(prolog_find_zero_in_second_wchar)
> -       test    $15, %ch
> -       jnz     L(prolog_find_zero_in_third_wchar)
> -
> -       and     $1 << 13 - 1, %rax
> -       jz      L(return_null)
> -
> -       test    $15 << 4, %ah
> -       jnz     L(match_fourth_wchar)
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(prolog_find_zero_in_first_wchar):
> -       test    $1, %rax
> -       jz      L(return_null)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(prolog_find_zero_in_second_wchar):
> -       and     $1 << 5 - 1, %rax
> -       jz      L(return_null)
> -
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(prolog_find_zero_in_third_wchar):
> -       and     $1 << 9 - 1, %rax
> -       jz      L(return_null)
> -
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(match_second_wchar):
> -       lea     -12(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(match_third_wchar):
> -       lea     -8(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(match_fourth_wchar):
> -       lea     -4(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(return_null):
> -       xor     %rax, %rax
> -       ret
> -
> -END (wcsrchr)
> +#include "../strrchr.S"
> --
> 2.25.1
>


-- 
H.J.
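
For readers following the bitmask tricks in this rewrite, a minimal C sketch
of the scalar logic behind the L(ret0) return path may help (illustrative
only: the helper name and standalone form are not part of the patch or of
glibc).  It keeps only CHAR matches at or before the first null byte, takes
the highest remaining bit, and rounds the pointer down to a CHAR boundary the
way `andq $-CHAR_SIZE, %rax` does for wcsrchr:

#include <stdint.h>
#include <stddef.h>

/* Masks are assumed to come from pmovmskb: one bit per byte of a
   16-byte vector starting at s.  */
static inline const char *
ret0_path (const char *s, uint32_t match_mask, uint32_t null_mask,
           size_t char_size)   /* 1 for strrchr, 4 for wcsrchr.  */
{
  if (null_mask != 0)
    /* All bits up to and including the first null byte; this is what
       `leal -1(%rcx), %edx; xorl %edx, %ecx` computes.  */
    match_mask &= null_mask ^ (null_mask - 1);
  if (match_mask == 0)
    return NULL;
  unsigned int byte_idx = 31 - __builtin_clz (match_mask);  /* bsrl  */
  uintptr_t p = (uintptr_t) (s + byte_idx);
  /* For wcsrchr the highest set bit lands on the last byte of the
     matching wchar_t ("off by 3"); rounding down fixes both cases.  */
  return (const char *) (p & ~((uintptr_t) char_size - 1));
}

The same x ^ (x - 1) idea is what blsmskl provides directly in the EVEX
variant later in this series.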


* Re: [PATCH v1 1/5] benchtests: Improve bench-strrchr
  2022-04-21 22:07   ` Noah Goldstein
@ 2022-04-21 23:49     ` H.J. Lu
  2022-04-22  1:11       ` Noah Goldstein
  0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-21 23:49 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 21, 2022 at 3:08 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 3:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > 1. Use json-lib for printing results.
> > > 2. Expose all parameters (before pos, seek_char, and max_char where
> > >    not printed).
> > > 3. Add benchmarks that test multiple occurence of seek_char in the
> > >    string.
> > > ---
> > >  benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
> > >  1 file changed, 82 insertions(+), 44 deletions(-)
> > >
> > > diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> > > index abdae60c51..cceea77e1b 100644
> > > --- a/benchtests/bench-strrchr.c
> > > +++ b/benchtests/bench-strrchr.c
> > > @@ -23,6 +23,7 @@
> > >  # define TEST_NAME "strrchr"
> > >  #endif
> > >  #include "bench-string.h"
> > > +#include "json-lib.h"
> > >
> > >  #define BIG_CHAR MAX_CHAR
> > >
> > > @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
> > >  }
> > >
> > >  static void
> > > -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> > > +            CHAR *exp_res)
> > >  {
> > >    CHAR *res = CALL (impl, s, c);
> > >    size_t i, iters = INNER_LOOP_ITERS8;
> > > @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > >
> > >    if (res != exp_res)
> > >      {
> > > -      error (0, 0, "Wrong result in function %s %p %p", impl->name,
> > > -            res, exp_res);
> > > +      error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
> > > +            exp_res);
> >
> > These changes aren't needed.
> >
> > >        ret = 1;
> > >        return;
> > >      }
> > > @@ -72,24 +74,28 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > >      {
> > >        CALL (impl, s, c);
> > >      }
> > > -  TIMING_NOW (stop);
> > >
> > > +  TIMING_NOW (stop);
> >
> > Not needed.
>
> Will fix in V2
> >
> > >    TIMING_DIFF (cur, start, stop);
> > >
> > > -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> > > +  json_element_double (json_ctx, (double) cur / (double) iters);
> > > +  return;
> >
> > Return isn't needed.
>
> Will fix in V2.
> >
> > >  }
> > >
> > >  static void
> > > -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > > +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> > > +        int seek_char, int max_char, size_t freq)
> > >  /* For wcsrchr: align here means align not in bytes,
> > >     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> > >     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
> > >  {
> > >    size_t i;
> > > +  size_t pos_chunk_sz = freq ? (pos / freq) : pos;
> > > +  size_t last_pos = len;
> > >    CHAR *result;
> > >    CHAR *buf = (CHAR *) buf1;
> > >
> > > -  align &= 7;
> > > +  align &= (getpagesize () - 1);
> >
> > If we have such large alignments, the tests may be skipped.
> > Should we change it to 127 instead?
>
> There is logic around page cross cases in the x86_64 versions so I think it
> makes sense to support benchmarking it.
>
> Also I think that would tend to give the previous version a bit of
> an unfair disadvantage as the slow aligning case will never be
> tested in the new version.

If "align" is close to the page size, will it trigger

 if ((align + len) * sizeof (CHAR) >= page_size)
    return;

and skip page cross cases?

> >
> > >    if ((align + len) * sizeof (CHAR) >= page_size)
> > >      return;
> > >
> > > @@ -103,6 +109,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > >        if ((i > pos || pos >= len) && buf[align + i] == seek_char)
> > >         buf[align + i] = seek_char + 10 + (random () & 15);
> > >      }
> > > +
> > > +  if (pos_chunk_sz == 0 && pos)
> > > +    pos_chunk_sz = 1;
> > > +
> > > +  for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
> > > +    {
> > > +      buf[align + i] = seek_char;
> > > +      last_pos = i;
> > > +    }
> > > +
> > >    buf[align + len] = 0;
> > >
> > >    if (pos < len)
> > > @@ -110,66 +126,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > >        buf[align + pos] = seek_char;
> > >        result = (CHAR *) (buf + align + pos);
> > >      }
> > > +  else if (last_pos < len)
> > > +    result = (CHAR *) (buf + align + last_pos);
> > >    else if (seek_char == 0)
> > >      result = (CHAR *) (buf + align + len);
> > >    else
> > >      result = NULL;
> > >
> > > -  printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
> > > +  json_element_object_begin (json_ctx);
> > > +  json_attr_uint (json_ctx, "len", len);
> > > +  json_attr_uint (json_ctx, "pos", pos);
> > > +  json_attr_uint (json_ctx, "align", align);
> > > +  json_attr_uint (json_ctx, "freq", freq);
> > > +  json_attr_uint (json_ctx, "seek", seek_char);
> > > +  json_attr_uint (json_ctx, "max_char", max_char);
> > > +  json_array_begin (json_ctx, "timings");
> > >
> > >    FOR_EACH_IMPL (impl, 0)
> > > -    do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
> > > +    do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
> > >
> > > -  putchar ('\n');
> > > +  json_array_end (json_ctx);
> > > +  json_element_object_end (json_ctx);
> > >  }
> > >
> > >  int
> > >  test_main (void)
> > >  {
> > > -  size_t i;
> > > +  json_ctx_t json_ctx;
> > > +  size_t i, j;
> > > +  int seek;
> > >
> > >    test_init ();
> > > +  json_init (&json_ctx, 0, stdout);
> > >
> > > -  printf ("%20s", "");
> > > -  FOR_EACH_IMPL (impl, 0)
> > > -    printf ("\t%s", impl->name);
> > > -  putchar ('\n');
> > > +  json_document_begin (&json_ctx);
> > > +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> > >
> > > -  for (i = 1; i < 8; ++i)
> > > -    {
> > > -      do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
> > > -      do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
> > > -    }
> > > +  json_attr_object_begin (&json_ctx, "functions");
> > > +  json_attr_object_begin (&json_ctx, TEST_NAME);
> > > +  json_attr_string (&json_ctx, "bench-variant", "");
> > >
> > > -  for (i = 1; i < 8; ++i)
> > > -    {
> > > -      do_test (i, 64, 256, 23, SMALL_CHAR);
> > > -      do_test (i, 64, 256, 23, BIG_CHAR);
> > > -    }
> > > -
> > > -  for (i = 0; i < 32; ++i)
> > > -    {
> > > -      do_test (0, i, i + 1, 23, SMALL_CHAR);
> > > -      do_test (0, i, i + 1, 23, BIG_CHAR);
> > > -    }
> > > +  json_array_begin (&json_ctx, "ifuncs");
> > > +  FOR_EACH_IMPL (impl, 0)
> > > +    json_element_string (&json_ctx, impl->name);
> > > +  json_array_end (&json_ctx);
> > >
> > > -  for (i = 1; i < 8; ++i)
> > > -    {
> > > -      do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
> > > -      do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
> > > -    }
> > > +  json_array_begin (&json_ctx, "results");
> > >
> > > -  for (i = 1; i < 8; ++i)
> > > +  for (seek = 0; seek <= 23; seek += 23)
> > >      {
> > > -      do_test (i, 64, 256, 0, SMALL_CHAR);
> > > -      do_test (i, 64, 256, 0, BIG_CHAR);
> > > +      for (j = 1; j < 32; j += j)
> > > +       {
> > > +         for (i = 1; i < 9; ++i)
> > > +           {
> > > +             do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
> > > +             do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
> > > +           }
> > > +
> > > +         for (i = 1; i < 8; ++i)
> > > +           {
> > > +             do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
> > > +             do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
> > > +
> > > +             do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
> > > +             do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
> > > +           }
> > > +
> > > +         for (i = 0; i < 32; ++i)
> > > +           {
> > > +             do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
> > > +             do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
> > > +           }
> > > +         if (seek == 0)
> > > +           {
> > > +             break;
> > > +           }
> > > +       }
> > >      }
> > >
> > > -  for (i = 0; i < 32; ++i)
> > > -    {
> > > -      do_test (0, i, i + 1, 0, SMALL_CHAR);
> > > -      do_test (0, i, i + 1, 0, BIG_CHAR);
> > > -    }
> > > +  json_array_end (&json_ctx);
> > > +  json_attr_object_end (&json_ctx);
> > > +  json_attr_object_end (&json_ctx);
> > > +  json_document_end (&json_ctx);
> > >
> > >    return ret;
> > >  }
> > > --
> > > 2.25.1
> > >
> >
> >
> > --
> > H.J.



-- 
H.J.
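
To make the new `freq` parameter concrete, a small standalone program along
these lines (illustrative only, not the benchmark itself) shows how do_test
now spreads occurrences of seek_char: one roughly every pos/freq characters
before pos, plus the final one at pos when pos < len, so the last occurrence
still sits at pos but the implementation has to step past several earlier
matches:

#include <stdio.h>
#include <string.h>

int
main (void)
{
  char buf[65];
  size_t len = 64, pos = 48, freq = 4;
  int seek_char = '#';

  memset (buf, 'a', len);
  buf[len] = '\0';

  /* Same placement logic as the patched do_test.  */
  size_t chunk = freq ? pos / freq : pos;
  if (chunk == 0 && pos)
    chunk = 1;
  for (size_t i = chunk; i < pos && i < len; i += chunk)
    buf[i] = (char) seek_char;
  if (pos < len)
    buf[pos] = (char) seek_char;

  printf ("%s\n", buf);
  printf ("strrchr -> index %td\n", strrchr (buf, seek_char) - buf);
  return 0;
}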


* Re: [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex
  2022-04-21 22:22   ` [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
@ 2022-04-21 23:59     ` H.J. Lu
  2022-04-22  1:53       ` Noah Goldstein
  0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-21 23:59 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 21, 2022 at 3:22 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code unrolls the main loop slightly without adding too much
> overhead and minimizes the comparisons for the search CHAR.
>
> Geometric Mean of all benchmarks New / Old: 0.755
> See email for all results.
>
> Full xcheck passes on x86_64 with and without multiarch enabled.
> ---
>  sysdeps/x86_64/multiarch/strrchr-evex.S | 441 ++++++++++++++----------
>  1 file changed, 259 insertions(+), 182 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
> index adeddaed32..5cf9a8315b 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
> @@ -24,242 +24,319 @@
>  #  define STRRCHR      __strrchr_evex
>  # endif
>
> -# define VMOVU         vmovdqu64
> -# define VMOVA         vmovdqa64
> +# define VMOVU vmovdqu64
> +# define VMOVA vmovdqa64
>
>  # ifdef USE_AS_WCSRCHR
> +#  define SHIFT_REG    esi
> +
> +#  define kunpck       kunpckbw
> +#  define kmov_2x      kmovd
> +#  define maskz_2x     ecx
> +#  define maskm_2x     eax
> +#  define CHAR_SIZE    4
> +#  define VPMIN        vpminud
> +#  define VPTESTN      vptestnmd
>  #  define VPBROADCAST  vpbroadcastd
> -#  define VPCMP                vpcmpd
> -#  define SHIFT_REG    r8d
> +#  define VPCMP        vpcmpd
>  # else
> +#  define SHIFT_REG    edi
> +
> +#  define kunpck       kunpckdq
> +#  define kmov_2x      kmovq
> +#  define maskz_2x     rcx
> +#  define maskm_2x     rax
> +
> +#  define CHAR_SIZE    1
> +#  define VPMIN        vpminub
> +#  define VPTESTN      vptestnmb
>  #  define VPBROADCAST  vpbroadcastb
> -#  define VPCMP                vpcmpb
> -#  define SHIFT_REG    ecx
> +#  define VPCMP        vpcmpb
>  # endif
>
>  # define XMMZERO       xmm16
>  # define YMMZERO       ymm16
>  # define YMMMATCH      ymm17
> -# define YMM1          ymm18
> +# define YMMSAVE       ymm18
> +
> +# define YMM1  ymm19
> +# define YMM2  ymm20
> +# define YMM3  ymm21
> +# define YMM4  ymm22
> +# define YMM5  ymm23
> +# define YMM6  ymm24
> +# define YMM7  ymm25
> +# define YMM8  ymm26
>
> -# define VEC_SIZE      32
>
> -       .section .text.evex,"ax",@progbits
> -ENTRY (STRRCHR)
> -       movl    %edi, %ecx
> +# define VEC_SIZE      32
> +# define PAGE_SIZE     4096
> +       .section .text.evex, "ax", @progbits
> +ENTRY(STRRCHR)
> +       movl    %edi, %eax
>         /* Broadcast CHAR to YMMMATCH.  */
>         VPBROADCAST %esi, %YMMMATCH
>
> -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> -
> -       /* Check if we may cross page boundary with one vector load.  */
> -       andl    $(2 * VEC_SIZE - 1), %ecx
> -       cmpl    $VEC_SIZE, %ecx
> -       ja      L(cros_page_boundary)
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       jg      L(cross_page_boundary)
>
> +L(page_cross_continue):
>         VMOVU   (%rdi), %YMM1
> -
> -       /* Each bit in K0 represents a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> +       VPTESTN %YMM1, %YMM1, %k0

Please add some comments describing the mask registers set by the VPTESTN tests.

>         kmovd   %k0, %ecx
> -       kmovd   %k1, %eax
> -
> -       addq    $VEC_SIZE, %rdi
> -
> -       testl   %eax, %eax
> -       jnz     L(first_vec)
> -
>         testl   %ecx, %ecx
> -       jnz     L(return_null)
> -
> -       andq    $-VEC_SIZE, %rdi
> -       xorl    %edx, %edx
> -       jmp     L(aligned_loop)
> -
> -       .p2align 4
> -L(first_vec):
> -       /* Check if there is a null byte.  */
> -       testl   %ecx, %ecx
> -       jnz     L(char_and_nul_in_first_vec)
> -
> -       /* Remember the match and keep searching.  */
> -       movl    %eax, %edx
> -       movq    %rdi, %rsi
> -       andq    $-VEC_SIZE, %rdi
> -       jmp     L(aligned_loop)
> -
> -       .p2align 4
> -L(cros_page_boundary):
> -       andl    $(VEC_SIZE - 1), %ecx
> -       andq    $-VEC_SIZE, %rdi
> -
> +       jz      L(aligned_more)
> +       VPCMP   $0, %YMMMATCH, %YMM1, %k1

Please add some comments.

> +       kmovd   %k1, %eax
> +       blsmskl %ecx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(ret0)
> +       bsrl    %eax, %eax
>  # ifdef USE_AS_WCSRCHR
> -       /* NB: Divide shift count by 4 since each bit in K1 represent 4
> -          bytes.  */
> -       movl    %ecx, %SHIFT_REG
> -       sarl    $2, %SHIFT_REG
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       addq    %rdi, %rax
>  # endif
> +L(ret0):
> +       ret
>
> -       VMOVA   (%rdi), %YMM1
> -
> -       /* Each bit in K0 represents a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> +       /* Returns for first vec x1/x2/x3 have hard coded backward
> +          search path for earlier matches.  */
> +       .p2align 4,, 6
> +L(first_vec_x1):
> +       VPCMP   $0, %YMMMATCH, %YMM2, %k1
> +       kmovd   %k1, %eax
> +       blsmskl %ecx, %ecx
> +       andl    %ecx, %eax
> +       jnz     L(first_vec_x1_return)
> +       .p2align 4,, 4
> +L(first_vec_x0_test):
>         VPCMP   $0, %YMMMATCH, %YMM1, %k1
> -       kmovd   %k0, %edx
>         kmovd   %k1, %eax
> -
> -       shrxl   %SHIFT_REG, %edx, %edx
> -       shrxl   %SHIFT_REG, %eax, %eax
> -       addq    $VEC_SIZE, %rdi
> -
> -       /* Check if there is a CHAR.  */
>         testl   %eax, %eax
> -       jnz     L(found_char)
> -
> -       testl   %edx, %edx
> -       jnz     L(return_null)
> -
> -       jmp     L(aligned_loop)
> -
> -       .p2align 4
> -L(found_char):
> -       testl   %edx, %edx
> -       jnz     L(char_and_nul)
> -
> -       /* Remember the match and keep searching.  */
> -       movl    %eax, %edx
> -       leaq    (%rdi, %rcx), %rsi
> +       jz      L(ret1)
> +       bsrl    %eax, %eax
> +# ifdef USE_AS_WCSRCHR
> +       leaq    (%rsi, %rax, CHAR_SIZE), %rax
> +# else
> +       addq    %rsi, %rax
> +# endif
> +L(ret1):
> +       ret
>
> -       .p2align 4
> -L(aligned_loop):
> -       VMOVA   (%rdi), %YMM1
> -       addq    $VEC_SIZE, %rdi
> +       .p2align 4,, 10
> +L(first_vec_x1_or_x2):
> +       VPCMP   $0, %YMM3, %YMMMATCH, %k3
> +       VPCMP   $0, %YMM2, %YMMMATCH, %k2
> +       kortestd %k2, %k3
> +       jz      L(first_vec_x0_test)
> +
> +       kunpck  %k2, %k3, %k3
> +       kmovq   %k3, %rax
> +       bsrq    %rax, %rax
> +       leaq    (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
> +       ret
>
> -       /* Each bit in K0 represents a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> -       kmovd   %k0, %ecx
> +       .p2align 4,, 6
> +L(first_vec_x3):
> +       VPCMP   $0, %YMMMATCH, %YMM4, %k1
>         kmovd   %k1, %eax
> -       orl     %eax, %ecx
> -       jnz     L(char_nor_null)
> +       blsmskl %ecx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(first_vec_x1_or_x2)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
>
> -       VMOVA   (%rdi), %YMM1
> -       add     $VEC_SIZE, %rdi
> +       .p2align 4,, 6
> +L(first_vec_x0_x1_test):
> +       VPCMP   $0, %YMMMATCH, %YMM2, %k1
> +       kmovd   %k1, %eax
> +       testl   %eax, %eax
> +       jz      L(first_vec_x0_test)
> +       .p2align 4,, 4
> +L(first_vec_x1_return):
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
>
> -       /* Each bit in K0 represents a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> -       kmovd   %k0, %ecx
> +       .p2align 4,, 10
> +L(first_vec_x2):
> +       VPCMP   $0, %YMMMATCH, %YMM3, %k1
>         kmovd   %k1, %eax
> -       orl     %eax, %ecx
> -       jnz     L(char_nor_null)
> +       blsmskl %ecx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(first_vec_x0_x1_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
>
> -       VMOVA   (%rdi), %YMM1
> -       addq    $VEC_SIZE, %rdi
>
> -       /* Each bit in K0 represents a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> +       .p2align 4
> +L(aligned_more):
> +       /* Need to keep original pointer in case YMM1 has last match.  */
> +       movq    %rdi, %rsi
> +       andq    $-VEC_SIZE, %rdi
> +       VMOVU   VEC_SIZE(%rdi), %YMM2
> +       VPTESTN %YMM2, %YMM2, %k0
>         kmovd   %k0, %ecx
> -       kmovd   %k1, %eax
> -       orl     %eax, %ecx
> -       jnz     L(char_nor_null)
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x1)
>
> -       VMOVA   (%rdi), %YMM1
> -       addq    $VEC_SIZE, %rdi
> +       VMOVU   (VEC_SIZE * 2)(%rdi), %YMM3
> +       VPTESTN %YMM3, %YMM3, %k0
> +       kmovd   %k0, %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x2)
>
> -       /* Each bit in K0 represents a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> +       VMOVU   (VEC_SIZE * 3)(%rdi), %YMM4
> +       VPTESTN %YMM4, %YMM4, %k0
>         kmovd   %k0, %ecx
> -       kmovd   %k1, %eax
> -       orl     %eax, %ecx
> -       jz      L(aligned_loop)
> +       movq    %rdi, %r8
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x3)
>
> +       andq    $-(VEC_SIZE * 2), %rdi
>         .p2align 4
> -L(char_nor_null):
> -       /* Find a CHAR or a null byte in a loop.  */
> +L(first_aligned_loop):
> +       /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
> +          they don't store a match.  */
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM5
> +       VMOVA   (VEC_SIZE * 5)(%rdi), %YMM6
> +
> +       VPCMP   $0, %YMM5, %YMMMATCH, %k2
> +       vpxord  %YMM6, %YMMMATCH, %YMM7
> +
> +       VPMIN   %YMM5, %YMM6, %YMM8
> +       VPMIN   %YMM8, %YMM7, %YMM7
> +
> +       VPTESTN %YMM7, %YMM7, %k1
> +       subq    $(VEC_SIZE * -2), %rdi
> +       kortestd %k1, %k2
> +       jz      L(first_aligned_loop)
> +
> +       VPCMP   $0, %YMM6, %YMMMATCH, %k3
> +       VPTESTN %YMM8, %YMM8, %k1
> +       ktestd  %k1, %k1
> +       jz      L(second_aligned_loop_prep)
> +
> +       kortestd %k2, %k3
> +       jnz     L(return_first_aligned_loop)
> +
> +       .p2align 4,, 6
> +L(first_vec_x1_or_x2_or_x3):
> +       VPCMP   $0, %YMM4, %YMMMATCH, %k4
> +       kmovd   %k4, %eax
>         testl   %eax, %eax
> -       jnz     L(match)
> -L(return_value):
> -       testl   %edx, %edx
> -       jz      L(return_null)
> -       movl    %edx, %eax
> -       movq    %rsi, %rdi
> +       jz      L(first_vec_x1_or_x2)
>         bsrl    %eax, %eax
> -# ifdef USE_AS_WCSRCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       leaq    -VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> -# endif
> +       leaq    (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4
> -L(match):
> -       /* Find a CHAR.  Check if there is a null byte.  */
> -       kmovd   %k0, %ecx
> -       testl   %ecx, %ecx
> -       jnz     L(find_nul)
> +       .p2align 4,, 8
> +L(return_first_aligned_loop):
> +       VPTESTN %YMM5, %YMM5, %k0
> +       kunpck  %k0, %k1, %k0
> +       kmov_2x %k0, %maskz_2x
> +
> +       blsmsk  %maskz_2x, %maskz_2x
> +       kunpck  %k2, %k3, %k3
> +       kmov_2x %k3, %maskm_2x
> +       and     %maskz_2x, %maskm_2x
> +       jz      L(first_vec_x1_or_x2_or_x3)
> +
> +       bsr     %maskm_2x, %maskm_2x
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
>
> -       /* Remember the match and keep searching.  */
> -       movl    %eax, %edx
> +       .p2align 4
> +       /* We can throw away the work done for the first 4x checks here
> +          as we have a later match. This is the 'fast' path, so to speak.
> +        */
> +L(second_aligned_loop_prep):
> +L(second_aligned_loop_set_furthest_match):
>         movq    %rdi, %rsi
> -       jmp     L(aligned_loop)
> +       kunpck  %k2, %k3, %k4
>
>         .p2align 4
> -L(find_nul):
> -       /* Mask out any matching bits after the null byte.  */
> -       movl    %ecx, %r8d
> -       subl    $1, %r8d
> -       xorl    %ecx, %r8d
> -       andl    %r8d, %eax
> -       testl   %eax, %eax
> -       /* If there is no CHAR here, return the remembered one.  */
> -       jz      L(return_value)
> -       bsrl    %eax, %eax
> +L(second_aligned_loop):
> +       VMOVU   (VEC_SIZE * 4)(%rdi), %YMM1
> +       VMOVU   (VEC_SIZE * 5)(%rdi), %YMM2
> +
> +       VPCMP   $0, %YMM1, %YMMMATCH, %k2
> +       vpxord  %YMM2, %YMMMATCH, %YMM3
> +
> +       VPMIN   %YMM1, %YMM2, %YMM4
> +       VPMIN   %YMM3, %YMM4, %YMM3
> +
> +       VPTESTN %YMM3, %YMM3, %k1
> +       subq    $(VEC_SIZE * -2), %rdi
> +       kortestd %k1, %k2
> +       jz      L(second_aligned_loop)
> +
> +       VPCMP   $0, %YMM2, %YMMMATCH, %k3
> +       VPTESTN %YMM4, %YMM4, %k1
> +       ktestd  %k1, %k1
> +       jz      L(second_aligned_loop_set_furthest_match)
> +
> +       kortestd %k2, %k3
> +       /* Branch here because there is a significant advantage in terms
> +          of the output dependency chain in using edx.  */
> +       jnz     L(return_new_match)
> +L(return_old_match):
> +       kmovq   %k4, %rax
> +       bsrq    %rax, %rax
> +       leaq    (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
> +       ret
> +
> +L(return_new_match):
> +       VPTESTN %YMM1, %YMM1, %k0
> +       kunpck  %k0, %k1, %k0
> +       kmov_2x %k0, %maskz_2x
> +
> +       blsmsk  %maskz_2x, %maskz_2x
> +       kunpck  %k2, %k3, %k3
> +       kmov_2x %k3, %maskm_2x
> +       and     %maskz_2x, %maskm_2x
> +       jz      L(return_old_match)
> +
> +       bsr     %maskm_2x, %maskm_2x
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
> +
> +       /* This block is horribly aligned (% 16 == 15). This is
> +          intentional. The L(cross_page_boundary) block is exactly
> +          32-bytes of code size. Ultimately this is a cold case so
> +          save the code size by leaving misaligned.  */
> +L(cross_page_boundary):
> +       xorq    %rdi, %rax
> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
> +       VPTESTN %YMM1, %YMM1, %k0
> +       kmovd   %k0, %ecx
>  # ifdef USE_AS_WCSRCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       leaq    -VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> +       movl    %edi, %esi
> +       andl    $(VEC_SIZE - 1), %esi
> +       shrl    $2, %esi
>  # endif
> -       ret
> +       shrxl   %SHIFT_REG, %ecx, %ecx
>
> -       .p2align 4
> -L(char_and_nul):
> -       /* Find both a CHAR and a null byte.  */
> -       addq    %rcx, %rdi
> -       movl    %edx, %ecx
> -L(char_and_nul_in_first_vec):
> -       /* Mask out any matching bits after the null byte.  */
> -       movl    %ecx, %r8d
> -       subl    $1, %r8d
> -       xorl    %ecx, %r8d
> -       andl    %r8d, %eax
> -       testl   %eax, %eax
> -       /* Return null pointer if the null byte comes first.  */
> -       jz      L(return_null)
> +       testl   %ecx, %ecx
> +       jz      L(page_cross_continue)
> +       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> +       kmovd   %k1, %eax
> +       shrxl   %SHIFT_REG, %eax, %eax
> +       blsmskl %ecx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(ret3)
>         bsrl    %eax, %eax
>  # ifdef USE_AS_WCSRCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       leaq    -VEC_SIZE(%rdi, %rax, 4), %rax
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>  # else
> -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> +       addq    %rdi, %rax
>  # endif
> +L(ret3):
>         ret
>
> -       .p2align 4
> -L(return_null):
> -       xorl    %eax, %eax
> -       ret
> -
> -END (STRRCHR)
> +END(STRRCHR)
>  #endif
> --
> 2.25.1
>


-- 
H.J.
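
One step worth spelling out here is the kunpck/bsrq pair in
L(first_vec_x1_or_x2): the two per-vector match masks are concatenated into a
single 64-bit value so that one bsr picks the last match across both vectors.
A rough scalar model (names are illustrative, byte-sized CHAR only, not the
actual glibc code):

#include <stdint.h>
#include <stddef.h>

static inline const char *
last_match_in_two_vecs (const char *vec1_base,  /* start of the first VEC  */
                        uint32_t match_mask_vec1,
                        uint32_t match_mask_vec2)
{
  /* kunpckdq-style combine: low half from the first vector, high half
     from the second, so bit positions follow memory order.  */
  uint64_t combined
    = ((uint64_t) match_mask_vec2 << 32) | match_mask_vec1;
  if (combined == 0)
    return NULL;   /* kortest already rules this out in the asm.  */
  size_t idx = 63 - __builtin_clzll (combined);   /* bsrq  */
  return vec1_base + idx;
}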


* Re: [PATCH v1 1/5] benchtests: Improve bench-strrchr
  2022-04-21 23:49     ` H.J. Lu
@ 2022-04-22  1:11       ` Noah Goldstein
  0 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22  1:11 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 21, 2022 at 6:50 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 3:08 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Apr 21, 2022 at 3:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > 1. Use json-lib for printing results.
> > > > 2. Expose all parameters (before pos, seek_char, and max_char where
> > > >    not printed).
> > > > 3. Add benchmarks that test multiple occurence of seek_char in the
> > > >    string.
> > > > ---
> > > >  benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
> > > >  1 file changed, 82 insertions(+), 44 deletions(-)
> > > >
> > > > diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> > > > index abdae60c51..cceea77e1b 100644
> > > > --- a/benchtests/bench-strrchr.c
> > > > +++ b/benchtests/bench-strrchr.c
> > > > @@ -23,6 +23,7 @@
> > > >  # define TEST_NAME "strrchr"
> > > >  #endif
> > > >  #include "bench-string.h"
> > > > +#include "json-lib.h"
> > > >
> > > >  #define BIG_CHAR MAX_CHAR
> > > >
> > > > @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
> > > >  }
> > > >
> > > >  static void
> > > > -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > > > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> > > > +            CHAR *exp_res)
> > > >  {
> > > >    CHAR *res = CALL (impl, s, c);
> > > >    size_t i, iters = INNER_LOOP_ITERS8;
> > > > @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > > >
> > > >    if (res != exp_res)
> > > >      {
> > > > -      error (0, 0, "Wrong result in function %s %p %p", impl->name,
> > > > -            res, exp_res);
> > > > +      error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
> > > > +            exp_res);
> > >
> > > These changes aren't needed.
> > >
> > > >        ret = 1;
> > > >        return;
> > > >      }
> > > > @@ -72,24 +74,28 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > > >      {
> > > >        CALL (impl, s, c);
> > > >      }
> > > > -  TIMING_NOW (stop);
> > > >
> > > > +  TIMING_NOW (stop);
> > >
> > > Not needed.
> >
> > Will fix in V2
> > >
> > > >    TIMING_DIFF (cur, start, stop);
> > > >
> > > > -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> > > > +  json_element_double (json_ctx, (double) cur / (double) iters);
> > > > +  return;
> > >
> > > Return isn't needed.
> >
> > Will fix in V2.
> > >
> > > >  }
> > > >
> > > >  static void
> > > > -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > > > +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> > > > +        int seek_char, int max_char, size_t freq)
> > > >  /* For wcsrchr: align here means align not in bytes,
> > > >     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> > > >     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
> > > >  {
> > > >    size_t i;
> > > > +  size_t pos_chunk_sz = freq ? (pos / freq) : pos;
> > > > +  size_t last_pos = len;
> > > >    CHAR *result;
> > > >    CHAR *buf = (CHAR *) buf1;
> > > >
> > > > -  align &= 7;
> > > > +  align &= (getpagesize () - 1);
> > >
> > > If we have such large alignments, the tests may be skipped.
> > > Should we change it to 127 instead?
> >
> > There is logic around page cross cases in the x86_64 versions, so I
> > think it makes sense to support benchmarking it.
> >
> > Also I think that would tend to give the previous version a bit of
> > an unfair disadvantage, as the slow aligning case will never be
> > tested in the new version.
>
> If "align" is close to the page size, will it trigger
>
>  if ((align + len) * sizeof (CHAR) >= page_size)
>     return;
>
> and skip page cross cases?

https://sourceware.org/git/?p=glibc.git;a=blob;f=benchtests/bench-string.h;h=5339ff47ffd9c9082c7bce038da00f9c48472c7f;hb=HEAD#l244

So for med/small sizes we will be fine?
>
> > >
> > > >    if ((align + len) * sizeof (CHAR) >= page_size)
> > > >      return;
> > > >
> > > > @@ -103,6 +109,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > > >        if ((i > pos || pos >= len) && buf[align + i] == seek_char)
> > > >         buf[align + i] = seek_char + 10 + (random () & 15);
> > > >      }
> > > > +
> > > > +  if (pos_chunk_sz == 0 && pos)
> > > > +    pos_chunk_sz = 1;
> > > > +
> > > > +  for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
> > > > +    {
> > > > +      buf[align + i] = seek_char;
> > > > +      last_pos = i;
> > > > +    }
> > > > +
> > > >    buf[align + len] = 0;
> > > >
> > > >    if (pos < len)
> > > > @@ -110,66 +126,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > > >        buf[align + pos] = seek_char;
> > > >        result = (CHAR *) (buf + align + pos);
> > > >      }
> > > > +  else if (last_pos < len)
> > > > +    result = (CHAR *) (buf + align + last_pos);
> > > >    else if (seek_char == 0)
> > > >      result = (CHAR *) (buf + align + len);
> > > >    else
> > > >      result = NULL;
> > > >
> > > > -  printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
> > > > +  json_element_object_begin (json_ctx);
> > > > +  json_attr_uint (json_ctx, "len", len);
> > > > +  json_attr_uint (json_ctx, "pos", pos);
> > > > +  json_attr_uint (json_ctx, "align", align);
> > > > +  json_attr_uint (json_ctx, "freq", freq);
> > > > +  json_attr_uint (json_ctx, "seek", seek_char);
> > > > +  json_attr_uint (json_ctx, "max_char", max_char);
> > > > +  json_array_begin (json_ctx, "timings");
> > > >
> > > >    FOR_EACH_IMPL (impl, 0)
> > > > -    do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
> > > > +    do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
> > > >
> > > > -  putchar ('\n');
> > > > +  json_array_end (json_ctx);
> > > > +  json_element_object_end (json_ctx);
> > > >  }
> > > >
> > > >  int
> > > >  test_main (void)
> > > >  {
> > > > -  size_t i;
> > > > +  json_ctx_t json_ctx;
> > > > +  size_t i, j;
> > > > +  int seek;
> > > >
> > > >    test_init ();
> > > > +  json_init (&json_ctx, 0, stdout);
> > > >
> > > > -  printf ("%20s", "");
> > > > -  FOR_EACH_IMPL (impl, 0)
> > > > -    printf ("\t%s", impl->name);
> > > > -  putchar ('\n');
> > > > +  json_document_begin (&json_ctx);
> > > > +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> > > >
> > > > -  for (i = 1; i < 8; ++i)
> > > > -    {
> > > > -      do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
> > > > -      do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
> > > > -    }
> > > > +  json_attr_object_begin (&json_ctx, "functions");
> > > > +  json_attr_object_begin (&json_ctx, TEST_NAME);
> > > > +  json_attr_string (&json_ctx, "bench-variant", "");
> > > >
> > > > -  for (i = 1; i < 8; ++i)
> > > > -    {
> > > > -      do_test (i, 64, 256, 23, SMALL_CHAR);
> > > > -      do_test (i, 64, 256, 23, BIG_CHAR);
> > > > -    }
> > > > -
> > > > -  for (i = 0; i < 32; ++i)
> > > > -    {
> > > > -      do_test (0, i, i + 1, 23, SMALL_CHAR);
> > > > -      do_test (0, i, i + 1, 23, BIG_CHAR);
> > > > -    }
> > > > +  json_array_begin (&json_ctx, "ifuncs");
> > > > +  FOR_EACH_IMPL (impl, 0)
> > > > +    json_element_string (&json_ctx, impl->name);
> > > > +  json_array_end (&json_ctx);
> > > >
> > > > -  for (i = 1; i < 8; ++i)
> > > > -    {
> > > > -      do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
> > > > -      do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
> > > > -    }
> > > > +  json_array_begin (&json_ctx, "results");
> > > >
> > > > -  for (i = 1; i < 8; ++i)
> > > > +  for (seek = 0; seek <= 23; seek += 23)
> > > >      {
> > > > -      do_test (i, 64, 256, 0, SMALL_CHAR);
> > > > -      do_test (i, 64, 256, 0, BIG_CHAR);
> > > > +      for (j = 1; j < 32; j += j)
> > > > +       {
> > > > +         for (i = 1; i < 9; ++i)
> > > > +           {
> > > > +             do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
> > > > +             do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
> > > > +           }
> > > > +
> > > > +         for (i = 1; i < 8; ++i)
> > > > +           {
> > > > +             do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
> > > > +             do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
> > > > +
> > > > +             do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
> > > > +             do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
> > > > +           }
> > > > +
> > > > +         for (i = 0; i < 32; ++i)
> > > > +           {
> > > > +             do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
> > > > +             do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
> > > > +           }
> > > > +         if (seek == 0)
> > > > +           {
> > > > +             break;
> > > > +           }
> > > > +       }
> > > >      }
> > > >
> > > > -  for (i = 0; i < 32; ++i)
> > > > -    {
> > > > -      do_test (0, i, i + 1, 0, SMALL_CHAR);
> > > > -      do_test (0, i, i + 1, 0, BIG_CHAR);
> > > > -    }
> > > > +  json_array_end (&json_ctx);
> > > > +  json_attr_object_end (&json_ctx);
> > > > +  json_attr_object_end (&json_ctx);
> > > > +  json_document_end (&json_ctx);
> > > >
> > > >    return ret;
> > > >  }
> > > > --
> > > > 2.25.1
> > > >
> > >
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v3 1/4] benchtests: Improve bench-strrchr
  2022-04-21  3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
                   ` (5 preceding siblings ...)
  2022-04-21 22:22 ` [PATCH v2 1/4] " Noah Goldstein
@ 2022-04-22  1:52 ` Noah Goldstein
  2022-04-22  1:52   ` [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
                     ` (3 more replies)
  2022-04-22 19:11 ` [PATCH v4 " Noah Goldstein
  7 siblings, 4 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22  1:52 UTC (permalink / raw)
  To: libc-alpha

1. Use json-lib for printing results.
2. Expose all parameters (before, pos, seek_char, and max_char were
   not printed).
3. Add benchmarks that test multiple occurrences of seek_char in the
   string (see the sketch below).
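
For reference, the effect of the new freq parameter can be shown with a
small standalone C sketch (not part of the patch; the buffer size, pos,
and freq values are arbitrary illustrations): seek_char is planted
roughly every pos / freq characters before pos, so the expected strrchr
result is still the occurrence at pos.

  #include <stdio.h>
  #include <string.h>

  int
  main (void)
  {
    /* Illustrative values only; the benchmark sweeps many combinations.  */
    char buf[2048 + 1];
    size_t pos = 512, len = 2048, freq = 4;
    size_t chunk = freq ? pos / freq : pos;

    memset (buf, 'a', len);
    if (chunk == 0 && pos)
      chunk = 1;
    /* Repeated occurrences of seek_char before pos, as do_test now does.  */
    for (size_t i = chunk; i < pos && i < len; i += chunk)
      buf[i] = 'x';
    buf[pos] = 'x';   /* The expected (last) occurrence.  */
    buf[len] = '\0';

    printf ("expected offset: %zu, strrchr offset: %td\n",
            pos, strrchr (buf, 'x') - buf);
    return 0;
  }
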
---
 benchtests/bench-strrchr.c | 124 ++++++++++++++++++++++++-------------
 1 file changed, 80 insertions(+), 44 deletions(-)

diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
index abdae60c51..ce4307a098 100644
--- a/benchtests/bench-strrchr.c
+++ b/benchtests/bench-strrchr.c
@@ -23,6 +23,7 @@
 # define TEST_NAME "strrchr"
 #endif
 #include "bench-string.h"
+#include "json-lib.h"
 
 #define BIG_CHAR MAX_CHAR
 
@@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
 }
 
 static void
-do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
+	     CHAR *exp_res)
 {
   CHAR *res = CALL (impl, s, c);
   size_t i, iters = INNER_LOOP_ITERS8;
@@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
 
   if (res != exp_res)
     {
-      error (0, 0, "Wrong result in function %s %p %p", impl->name,
-	     res, exp_res);
+      error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
+	     exp_res);
       ret = 1;
       return;
     }
@@ -73,23 +75,25 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
       CALL (impl, s, c);
     }
   TIMING_NOW (stop);
-
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
+	 int seek_char, int max_char, size_t freq)
 /* For wcsrchr: align here means align not in bytes,
    but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
    len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
 {
   size_t i;
+  size_t pos_chunk_sz = freq ? (pos / freq) : pos;
+  size_t last_pos = len;
   CHAR *result;
   CHAR *buf = (CHAR *) buf1;
 
-  align &= 7;
+  align &= (getpagesize () - 1);
   if ((align + len) * sizeof (CHAR) >= page_size)
     return;
 
@@ -103,6 +107,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
       if ((i > pos || pos >= len) && buf[align + i] == seek_char)
 	buf[align + i] = seek_char + 10 + (random () & 15);
     }
+
+  if (pos_chunk_sz == 0 && pos)
+    pos_chunk_sz = 1;
+
+  for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
+    {
+      buf[align + i] = seek_char;
+      last_pos = i;
+    }
+
   buf[align + len] = 0;
 
   if (pos < len)
@@ -110,66 +124,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
       buf[align + pos] = seek_char;
       result = (CHAR *) (buf + align + pos);
     }
+  else if (last_pos < len)
+    result = (CHAR *) (buf + align + last_pos);
   else if (seek_char == 0)
     result = (CHAR *) (buf + align + len);
   else
     result = NULL;
 
-  printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "len", len);
+  json_attr_uint (json_ctx, "pos", pos);
+  json_attr_uint (json_ctx, "align", align);
+  json_attr_uint (json_ctx, "freq", freq);
+  json_attr_uint (json_ctx, "seek", seek_char);
+  json_attr_uint (json_ctx, "max_char", max_char);
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
+    do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
-  size_t i;
+  json_ctx_t json_ctx;
+  size_t i, j;
+  int seek;
 
   test_init ();
+  json_init (&json_ctx, 0, stdout);
 
-  printf ("%20s", "");
-  FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
-
-  for (i = 1; i < 8; ++i)
-    {
-      do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
-      do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
-    }
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
 
-  for (i = 1; i < 8; ++i)
-    {
-      do_test (i, 64, 256, 23, SMALL_CHAR);
-      do_test (i, 64, 256, 23, BIG_CHAR);
-    }
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
 
-  for (i = 0; i < 32; ++i)
-    {
-      do_test (0, i, i + 1, 23, SMALL_CHAR);
-      do_test (0, i, i + 1, 23, BIG_CHAR);
-    }
+  json_array_begin (&json_ctx, "ifuncs");
+  FOR_EACH_IMPL (impl, 0)
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
 
-  for (i = 1; i < 8; ++i)
-    {
-      do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
-      do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
-    }
+  json_array_begin (&json_ctx, "results");
 
-  for (i = 1; i < 8; ++i)
+  for (seek = 0; seek <= 23; seek += 23)
     {
-      do_test (i, 64, 256, 0, SMALL_CHAR);
-      do_test (i, 64, 256, 0, BIG_CHAR);
+      for (j = 1; j < 32; j += j)
+	{
+	  for (i = 1; i < 9; ++i)
+	    {
+	      do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
+	    }
+
+	  for (i = 1; i < 8; ++i)
+	    {
+	      do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
+
+	      do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
+	    }
+
+	  for (i = 0; i < 32; ++i)
+	    {
+	      do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
+	    }
+	  if (seek == 0)
+	    {
+	      break;
+	    }
+	}
     }
 
-  for (i = 0; i < 32; ++i)
-    {
-      do_test (0, i, i + 1, 0, SMALL_CHAR);
-      do_test (0, i, i + 1, 0, BIG_CHAR);
-    }
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
 
   return ret;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2
  2022-04-22  1:52 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr Noah Goldstein
@ 2022-04-22  1:52   ` Noah Goldstein
  2022-04-22 19:06     ` H.J. Lu
  2022-04-22  1:52   ` [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22  1:52 UTC (permalink / raw)
  To: libc-alpha

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.741
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
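
At a high level the new structure can be sketched in C with 32-bit block
masks standing in for the pmovmskb results (purely illustrative: the
names are made up, BLOCK stands in for the 2x VEC_SIZE handled per
iteration, and the sketch ignores the alignment and page-cross handling
that makes the vector over-reads safe in the assembly):

  #include <stdint.h>
  #include <stddef.h>

  #define BLOCK 32

  /* Bit i set when s[i] == c, mirroring PCMPEQ + pmovmskb.  */
  static uint32_t
  block_mask (const char *s, char c)
  {
    uint32_t m = 0;
    for (size_t i = 0; i < BLOCK; i++)
      m |= (uint32_t) (s[i] == c) << i;
    return m;
  }

  static const char *
  strrchr_sketch (const char *s, int c)
  {
    const char *last_s = NULL;   /* Block holding the most recent match.  */
    uint32_t last_match = 0;     /* Its match mask.  */

    for (;; s += BLOCK)
      {
        uint32_t match = block_mask (s, (char) c);
        uint32_t zero = block_mask (s, '\0');

        if (zero == 0)
          {
            /* No null yet: remember only the most recent matching block;
               the exact position is not computed until the null shows up.  */
            if (match != 0)
              {
                last_s = s;
                last_match = match;
              }
            continue;
          }

        /* Null found: keep only matches up to and including it.
           zero ^ (zero - 1) sets all bits through the lowest set bit,
           matching the `leal -1; xorl` sequence in the patch.  */
        match &= zero ^ (zero - 1);
        if (match != 0)
          return s + (31 - __builtin_clz (match));   /* bsr equivalent.  */
        return last_s != NULL
               ? last_s + (31 - __builtin_clz (last_match)) : NULL;
      }
  }

This keeps the hot loop down to two block loads plus cheap mask tests;
the match-position work is deferred until a null terminator is seen.
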
---
 sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
 sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
 sysdeps/x86_64/strrchr.S                | 510 +++++++++++++++---------
 sysdeps/x86_64/wcsrchr.S                | 268 +------------
 4 files changed, 339 insertions(+), 444 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index db1b44c23c..866396e947 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
 
 # undef weak_alias
 # define weak_alias(strrchr, rindex)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index 78d1ca6553..69d2f3cdb1 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR	__wcsrchr_sse2
 #endif
-
 #include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index 50d886713e..4d7ba4ceb2 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,360 @@
 
 #include <sysdep.h>
 
+#ifndef STRRCHR
+# define STRRCHR	strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ	pcmpeqd
+# define CHAR_SIZE	4
+# define PMINU	pminud
+#else
+# define PCMPEQ	pcmpeqb
+# define CHAR_SIZE	1
+# define PMINU	pminub
+#endif
+
+#define PAGE_SIZE	4096
+#define VEC_SIZE	16
+
 	.text
-ENTRY (strrchr)
-	movd	%esi, %xmm1
+ENTRY(STRRCHR)
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	andl	$4095, %eax
-	punpcklbw	%xmm1, %xmm1
-	cmpq	$4032, %rax
-	punpcklwd	%xmm1, %xmm1
-	pshufd	$0, %xmm1, %xmm1
+	andl	$(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+#endif
+	pshufd	$0, %xmm0, %xmm0
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page)
-	movdqu	(%rdi), %xmm0
+
+L(cross_page_continue):
+	movups	(%rdi), %xmm1
 	pxor	%xmm2, %xmm2
-	movdqa	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb	%xmm0, %ecx
-	pmovmskb	%xmm3, %edx
-	testq	%rdx, %rdx
-	je	L(next_48_bytes)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rcx, %rax
-	je	L(exit)
-	bsrq	%rax, %rax
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret0):
 	ret
 
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
 	.p2align 4
-L(next_48_bytes):
-	movdqu	16(%rdi), %xmm4
-	movdqa	%xmm4, %xmm5
-	movdqu	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm2, %xmm5
-	movdqu	48(%rdi), %xmm0
-	pmovmskb	%xmm5, %edx
-	movdqa	%xmm3, %xmm5
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm5
-	pcmpeqb	%xmm0, %xmm2
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r8d
-	pmovmskb	%xmm5, %eax
-	pmovmskb	%xmm2, %esi
-	salq	$32, %r8
-	salq	$32, %rax
-	pcmpeqb	%xmm1, %xmm0
-	orq	%rdx, %rax
-	movq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	salq	$48, %rdx
-	salq	$16, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
-	pmovmskb	%xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rsi
-	orq	%rdx, %rax
-	je	L(loop_header2)
-	leaq	-1(%rax), %rcx
-	xorq	%rax, %rcx
-	andq	%rcx, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rsi
-	leaq	(%rdi,%rsi), %rax
+L(first_vec_x0_test):
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
-L(loop_header2):
-	testq	%rsi, %rsi
-	movq	%rdi, %rcx
-	je	L(no_c_found)
-L(loop_header):
-	addq	$64, %rdi
-	pxor	%xmm7, %xmm7
-	andq	$-64, %rdi
-	jmp	L(loop_entry)
+L(first_vec_x1):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
 
 	.p2align 4
-L(loop64):
-	testq	%rdx, %rdx
-	cmovne	%rdx, %rsi
-	cmovne	%rdi, %rcx
-	addq	$64, %rdi
-L(loop_entry):
-	movdqa	32(%rdi), %xmm3
-	pxor	%xmm6, %xmm6
-	movdqa	48(%rdi), %xmm2
-	movdqa	%xmm3, %xmm0
-	movdqa	16(%rdi), %xmm4
-	pminub	%xmm2, %xmm0
-	movdqa	(%rdi), %xmm5
-	pminub	%xmm4, %xmm0
-	pminub	%xmm5, %xmm0
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm0, %eax
-	movdqa	%xmm5, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %r9d
-	movdqa	%xmm4, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %edx
-	movdqa	%xmm3, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm0, %r10d
-	movdqa	%xmm2, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$32, %r10
-	orq	%r10, %rdx
-	pmovmskb	%xmm0, %r8d
-	orq	%r9, %rdx
-	salq	$48, %r8
-	orq	%r8, %rdx
+L(first_vec_x1_test):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
 	testl	%eax, %eax
-	je	L(loop64)
-	pcmpeqb	%xmm6, %xmm4
-	pcmpeqb	%xmm6, %xmm3
-	pcmpeqb	%xmm6, %xmm5
-	pmovmskb	%xmm4, %eax
-	pmovmskb	%xmm3, %r10d
-	pcmpeqb	%xmm6, %xmm2
-	pmovmskb	%xmm5, %r9d
-	salq	$32, %r10
-	salq	$16, %rax
-	pmovmskb	%xmm2, %r8d
-	orq	%r10, %rax
-	orq	%r9, %rax
-	salq	$48, %r8
-	orq	%r8, %rax
-	leaq	-1(%rax), %r8
-	xorq	%rax, %r8
-	andq	%r8, %rdx
-	cmovne	%rdi, %rcx
-	cmovne	%rdx, %rsi
-	bsrq	%rsi, %rsi
-	leaq	(%rcx,%rsi), %rax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+	andq	$-VEC_SIZE, %rdi
+
+	movaps	VEC_SIZE(%rdi), %xmm2
+	pxor	%xmm3, %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pmovmskb %xmm3, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
+
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
+	pxor	%xmm4, %xmm4
+	PCMPEQ	%xmm3, %xmm4
+	pmovmskb %xmm4, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+
+	addq	$VEC_SIZE, %rdi
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 2), %rdi
+	.p2align 4
+L(first_loop):
+	/* Do 2x VEC at a time.  */
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
+	   detecting zero. Note if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
+	   macro-fuse with `jz`.  */
+	addl	%ecx, %eax
+	jz	L(first_loop)
+
+	/* Check if there is zero match.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+	/* Check if there was a match in last iteration.  */
+	subl	%ecx, %eax
+	jnz	L(new_match)
+
+L(first_loop_old_match):
+	PCMPEQ	%xmm0, %xmm2
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	addl	%eax, %ecx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	sall	$16, %eax
+	orl	%ecx, %eax
+
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), so we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
+	/* Save minimum state for getting most recent match. We can
+	   throw out all previous work.  */
 	.p2align 4
-L(no_c_found):
-	movl	$1, %esi
-	xorl	%ecx, %ecx
-	jmp	L(loop_header)
+L(second_loop_match):
+	movq	%rdi, %rsi
+	movaps	%xmm4, %xmm2
+	movaps	%xmm7, %xmm3
 
 	.p2align 4
-L(exit):
-	xorl	%eax, %eax
+L(second_loop):
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
+	   detecting zero. Note if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Either null term or new occurrence of CHAR.  */
+	addl	%ecx, %eax
+	jz	L(second_loop)
+
+	/* No null term so must be new occurrence of CHAR.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+
+	subl	%ecx, %eax
+	jnz	L(second_loop_new_match)
+
+L(second_loop_old_match):
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	sall	$16, %eax
+	orl	%ecx, %eax
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
+L(second_loop_new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), so we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(second_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4,, 4
 L(cross_page):
-	movq	%rdi, %rax
-	pxor	%xmm0, %xmm0
-	andq	$-64, %rax
-	movdqu	(%rax), %xmm5
-	movdqa	%xmm5, %xmm6
-	movdqu	16(%rax), %xmm4
-	pcmpeqb	%xmm1, %xmm5
-	pcmpeqb	%xmm0, %xmm6
-	movdqu	32(%rax), %xmm3
-	pmovmskb	%xmm6, %esi
-	movdqa	%xmm4, %xmm6
-	movdqu	48(%rax), %xmm2
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm0, %xmm6
-	pmovmskb	%xmm6, %edx
-	movdqa	%xmm3, %xmm6
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm0, %xmm6
-	pcmpeqb	%xmm2, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r9d
-	pmovmskb	%xmm6, %r8d
-	pmovmskb	%xmm0, %ecx
-	salq	$32, %r9
-	salq	$32, %r8
-	pcmpeqb	%xmm1, %xmm2
-	orq	%r8, %rdx
-	salq	$48, %rcx
-	pmovmskb	%xmm5, %r8d
-	orq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	orq	%rcx, %rdx
-	pmovmskb	%xmm2, %ecx
-	salq	$16, %rsi
-	salq	$48, %rcx
-	orq	%r9, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	movaps	(%rsi), %xmm1
+	pxor	%xmm2, %xmm2
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
 	movl	%edi, %ecx
-	subl	%eax, %ecx
-	shrq	%cl, %rdx
-	shrq	%cl, %rsi
-	testq	%rdx, %rdx
-	je	L(loop_header2)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rax, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rax
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	%cl, %edx
+	jz	L(cross_page_continue)
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	sarl	%cl, %eax
+	leal	-1(%rdx), %ecx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret1):
 	ret
-END (strrchr)
+END(STRRCHR)
 
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+	weak_alias (STRRCHR, rindex)
+	libc_hidden_builtin_def (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 61552954de..2b80efc5ef 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -1,4 +1,4 @@
-/* wcsrchr with SSSE3
+/* wcsrchr optimized with SSE2.
    Copyright (C) 2011-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,266 +16,12 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
 
-	.text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR	1
+#define NO_PMINU	1
 
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
-	punpckldq %xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	punpckldq %xmm1, %xmm1
-	and	$63, %rcx
-	cmp	$48, %rcx
-	ja	L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR	wcsrchr
+#endif
 
-	movdqu	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match1)
-
-	test	%rcx, %rcx
-	jnz	L(return_null)
-
-	and	$-16, %rdi
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match1):
-	test	%rcx, %rcx
-	jnz	L(prolog_find_zero_1)
-
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	and	$-16, %rdi
-	jmp	L(loop)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	pxor	%xmm3, %xmm3
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm3
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm3, %rdx
-	pmovmskb %xmm0, %rax
-	shr	%cl, %rdx
-	shr	%cl, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match)
-
-	test	%rdx, %rdx
-	jnz	L(return_null)
-
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match):
-	test	%rdx, %rdx
-	jnz	L(prolog_find_zero)
-
-	mov	%rax, %r8
-	lea	(%rdi, %rcx), %rsi
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqd	%xmm3, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm3
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm3, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqd	%xmm4, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm4
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm4, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm5
-	pcmpeqd	%xmm5, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm5
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm5, %rax
-	or	%rax, %rcx
-	jz	L(loop)
-
-	.p2align 4
-L(matches):
-	test	%rax, %rax
-	jnz	L(match)
-L(return_value):
-	test	%r8, %r8
-	jz	L(return_null)
-	mov	%r8, %rax
-	mov	%rsi, %rdi
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match):
-	pmovmskb %xmm2, %rcx
-	test	%rcx, %rcx
-	jnz	L(find_zero)
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	jmp	L(loop)
-
-	.p2align 4
-L(find_zero):
-	test	$15, %cl
-	jnz	L(find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_value)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_value)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero):
-	add	%rcx, %rdi
-	mov     %rdx, %rcx
-L(prolog_find_zero_1):
-	test	$15, %cl
-	jnz	L(prolog_find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(prolog_find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(prolog_find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_null)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_null)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_second_wchar):
-	lea	-12(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_third_wchar):
-	lea	-8(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_fourth_wchar):
-	lea	-4(%rdi), %rax
-	ret
-
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-END (wcsrchr)
+#include "../strrchr.S"
-- 
2.25.1


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2
  2022-04-22  1:52 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr Noah Goldstein
  2022-04-22  1:52   ` [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
@ 2022-04-22  1:52   ` Noah Goldstein
  2022-04-22 19:03     ` H.J. Lu
  2022-04-22  1:52   ` [PATCH v3 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
  2022-04-22 18:29   ` [PATCH v3 1/4] benchtests: Improve bench-strrchr H.J. Lu
  3 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22  1:52 UTC (permalink / raw)
  To: libc-alpha

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.832
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
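
One recurring idiom in this version is combining the two per-vector
vpmovmskb results into a single 64-bit value so that one bsrq finds the
last match across both 32-byte vectors (the salq $32 / orq / bsrq
sequences). A minimal C sketch of that idiom, with illustrative names
and assuming at least one bit is set in the combined mask:

  #include <stdint.h>

  /* Low 32 bits come from the first vector's mask, high 32 bits from
     the second; the highest set bit is then the last match across both.  */
  static inline unsigned int
  last_match_offset (uint32_t mask_first_vec, uint32_t mask_second_vec)
  {
    uint64_t combined = ((uint64_t) mask_second_vec << 32) | mask_first_vec;
    return 63 - __builtin_clzll (combined);   /* bsrq equivalent.  */
  }
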
---
 sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
 1 file changed, 269 insertions(+), 157 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 1df2adfad0..bd26ba80d5 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
 # ifdef USE_AS_WCSRCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
@@ -41,196 +45,304 @@
 # endif
 
 # define VEC_SIZE	32
+# define PAGE_SIZE	4096
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
-	movd	%esi, %xmm4
-	movl	%edi, %ecx
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+	movd	%esi, %xmm7
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMM4.  */
-	VPBROADCAST %xmm4, %ymm4
+	VPBROADCAST %xmm7, %ymm7
 	vpxor	%xmm0, %xmm0, %xmm0
 
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Shift here instead of `andl` to save code size (saves a fetch
+	   block).  */
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(cross_page)
 
+L(page_cross_continue):
 	vmovdqu	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	addq	$VEC_SIZE, %rdi
+	/* Check end of string match.  */
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	/* Only check match with search CHAR if needed.  */
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Check if match before first zero.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
+	.p2align 4,, 10
+L(first_vec_x1):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	.p2align 4,, 4
+L(first_vec_x0_test):
+	VPCMPEQ	%ymm1, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	testl	%eax, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret1):
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 10
+L(first_vec_x0_x1_test):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	/* Check ymm2 for search CHAR match. If no match then check ymm1
+	   before returning.  */
 	testl	%eax, %eax
-	jnz	L(first_vec)
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
-	testl	%ecx, %ecx
-	jnz	L(return_null)
 
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	/* If no in-range search CHAR match in ymm3 then need to check
+	   ymm1/ymm2 for an earlier match (we delay checking search
+	   CHAR matches until needed).  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
+
 
 	.p2align 4
-L(first_vec):
-	/* Check if there is a nul CHAR.  */
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+
+	/* Align src.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqu	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
 	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
+	jnz	L(first_vec_x1)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
+	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
+	VPCMPEQ	%ymm3, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	addq	$(VEC_SIZE + 1), %rdi
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %edx
-	vpmovmskb %ymm3, %eax
-	shrl	%cl, %edx
-	shrl	%cl, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
+L(first_aligned_loop):
+	/* Do 2x VEC at a time. Any more and the cost of finding the
+	   match outweighs the loop benefit.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm8
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm8, %ymm0, %ymm8
+	vpor	%ymm5, %ymm8, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
+	/* No zero or search CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
+	jz	L(first_aligned_loop)
 
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
+	/* If no zero CHAR then go to second loop (this allows us to
+	   throw away all prior work).  */
+	vpmovmskb %ymm8, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_prep)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	/* Search char could be zero so we need to get the true match.
+	 */
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(first_aligned_loop_return)
 
-	.p2align 4
-L(aligned_loop):
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	add	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
+	.p2align 4,, 4
+L(first_vec_x1_or_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm3
+	VPCMPEQ	%ymm2, %ymm7, %ymm2
 	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
-
-	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a nul CHAR in a loop.  */
-	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	vpmovmskb %ymm2, %edx
+	/* Use add for macro-fusion.  */
+	addq	%rax, %rdx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	salq	$32, %rax
+	addq	%rdx, %rax
+	bsrq	%rax, %rax
+	leaq	1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(first_aligned_loop_return):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(first_vec_x1_or_x2)
+
+	bsrq	%rax, %rax
+	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
 # ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %eax
+	andq	$-CHAR_SIZE, %rax
 # endif
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
 
+	/* Search char cannot be zero.  */
 	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a nul CHAR.  */
-	vpmovmskb %ymm2, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+L(second_aligned_loop_set_furthest_match):
+	/* Save VEC and pointer from most recent match.  */
+L(second_aligned_loop_prep):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	vmovdqu	%ymm6, %ymm2
+	vmovdqu	%ymm10, %ymm3
 
 	.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+L(second_aligned_loop):
+	/* Search 2x at at time.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm1
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpor	%ymm5, %ymm1, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
 	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a nul CHAR.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+	jz	L(second_aligned_loop)
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_set_furthest_match)
+	vpmovmskb %ymm5, %eax
 	testl	%eax, %eax
-	/* Return null pointer if the nul CHAR comes first.  */
-	jz	L(return_null)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	jnz	L(return_new_match)
+
+	/* This is the hot path. We know CHAR is in bounds and that
+	   ymm3/ymm2 have the latest match.  */
+	.p2align 4,, 4
+L(return_old_match):
+	vpmovmskb %ymm3, %eax
+	vpmovmskb %ymm2, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
+	/* Last iteration also potentially has a match.  */
+	.p2align 4,, 8
+L(return_new_match):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(return_old_match)
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
 	VZEROUPPER_RETURN
 
-END (STRRCHR)
+	.p2align 4,, 4
+L(cross_page):
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrxl	%edi, %eax, %eax
+	blsmskl	%ecx, %ecx
+	/* Check if any search CHAR match in range.  */
+	andl	%ecx, %eax
+	jz	L(ret2)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret2):
+	VZEROUPPER_RETURN
+END(STRRCHR)
 #endif
-- 
2.25.1


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v3 4/4] x86: Optimize {str|wcs}rchr-evex
  2022-04-22  1:52 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr Noah Goldstein
  2022-04-22  1:52   ` [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
  2022-04-22  1:52   ` [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
@ 2022-04-22  1:52   ` Noah Goldstein
  2022-04-22 19:04     ` H.J. Lu
  2022-04-22 18:29   ` [PATCH v3 1/4] benchtests: Improve bench-strrchr H.J. Lu
  3 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22  1:52 UTC (permalink / raw)
  To: libc-alpha

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.755
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
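
The main loop folds "null in either vector, or search CHAR in the
second vector" into a single VPTESTN by taking unsigned minimums first
(the vpxord plus the two VPMINs). A per-lane C sketch of that combined
test, purely illustrative and using byte lanes as in strrchr:

  #include <stdint.h>

  /* A lane of min (min (v5, v6), v6 ^ match_char) is zero exactly when
     that lane of v5 or v6 is zero, or that lane of v6 equals the search
     CHAR.  Together with the separate v5 match mask (k2), one kortest
     decides whether the loop may continue.  */
  static inline int
  lane_stops_loop (uint8_t v5_lane, uint8_t v6_lane, uint8_t match_char)
  {
    uint8_t m = v5_lane < v6_lane ? v5_lane : v6_lane;  /* VPMIN of the two vectors  */
    uint8_t x = (uint8_t) (v6_lane ^ match_char);       /* vpxord with YMMMATCH  */
    uint8_t t = m < x ? m : x;                          /* second VPMIN  */
    return t == 0 || v5_lane == match_char;             /* VPTESTN + kortest with k2  */
  }
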
---
 sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
 1 file changed, 290 insertions(+), 181 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index adeddaed32..8014c285b3 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,351 @@
 #  define STRRCHR	__strrchr_evex
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+# define VMOVU	vmovdqu64
+# define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSRCHR
+#  define SHIFT_REG	esi
+
+#  define kunpck	kunpckbw
+#  define kmov_2x	kmovd
+#  define maskz_2x	ecx
+#  define maskm_2x	eax
+#  define CHAR_SIZE	4
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
-#  define SHIFT_REG	r8d
+#  define VPCMP	vpcmpd
 # else
+#  define SHIFT_REG	edi
+
+#  define kunpck	kunpckdq
+#  define kmov_2x	kmovq
+#  define maskz_2x	rcx
+#  define maskm_2x	rax
+
+#  define CHAR_SIZE	1
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
-#  define SHIFT_REG	ecx
+#  define VPCMP	vpcmpb
 # endif
 
 # define XMMZERO	xmm16
 # define YMMZERO	ymm16
 # define YMMMATCH	ymm17
-# define YMM1		ymm18
+# define YMMSAVE	ymm18
+
+# define YMM1	ymm19
+# define YMM2	ymm20
+# define YMM3	ymm21
+# define YMM4	ymm22
+# define YMM5	ymm23
+# define YMM6	ymm24
+# define YMM7	ymm25
+# define YMM8	ymm26
 
-# define VEC_SIZE	32
 
-	.section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
-	movl	%edi, %ecx
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+	.section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
 
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(cross_page_boundary)
 
+L(page_cross_continue):
 	VMOVU	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	/* k0 has a 1 for each zero CHAR in YMM1.  */
+	VPTESTN	%YMM1, %YMM1, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-
-	addq	$VEC_SIZE, %rdi
-
-	testl	%eax, %eax
-	jnz	L(first_vec)
-
 	testl	%ecx, %ecx
-	jnz	L(return_null)
-
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(first_vec):
-	/* Check if there is a null byte.  */
-	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	jz	L(aligned_more)
+	/* fallthrough: zero CHAR in first VEC.  */
 
+	/* K1 has a 1 for each search CHAR match in YMM1.  */
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	/* Build mask up until first zero CHAR (used to mask off
+	   potential search CHAR matches past the end of the string).
+	 */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	/* Get last match (the `andl` removed any out of bounds
+	   matches).  */
+	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
-	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
+L(ret0):
+	ret
 
-	VMOVA	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
+	/* Returns for first vec x1/x2/x3 have hard coded backward
+	   search path for earlier matches.  */
+	.p2align 4,, 6
+L(first_vec_x1):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	/* eax non-zero if search CHAR in range.  */
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	/* fallthrough: no match in YMM2 so need to check for earlier
+	   matches (in YMM1).  */
+	.p2align 4,, 4
+L(first_vec_x0_test):
 	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %edx
 	kmovd	%k1, %eax
-
-	shrxl	%SHIFT_REG, %edx, %edx
-	shrxl	%SHIFT_REG, %eax, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
-
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	jz	L(ret1)
+	bsrl	%eax, %eax
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rsi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rsi, %rax
+# endif
+L(ret1):
+	ret
 
-	.p2align 4
-L(aligned_loop):
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	.p2align 4,, 10
+L(first_vec_x1_or_x2):
+	VPCMP	$0, %YMM3, %YMMMATCH, %k3
+	VPCMP	$0, %YMM2, %YMMMATCH, %k2
+	/* K2 and K3 have 1 for any search CHAR match. Test if there
+	   are any matches in either of them. Otherwise check YMM1.  */
+	kortestd %k2, %k3
+	jz	L(first_vec_x0_test)
+
+	/* Guaranteed that YMM2 and YMM3 are within range so merge the
+	   two bitmasks then get last result.  */
+	kunpck	%k2, %k3, %k3
+	kmovq	%k3, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 6
+L(first_vec_x3):
+	VPCMP	$0, %YMMMATCH, %YMM4, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_or_x2)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	add	$VEC_SIZE, %rdi
+	.p2align 4,, 6
+L(first_vec_x0_x1_test):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check YMM2 for last match first. If no match try YMM1.  */
+	testl	%eax, %eax
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMP	$0, %YMMMATCH, %YMM3, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	/* Check YMM3 for last match first. If no match try YMM2/YMM1.
+	 */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	.p2align 4
+L(aligned_more):
+	/* Need to keep original pointer in case YMM1 has last match.  */
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rdi
+	VMOVU	VEC_SIZE(%rdi), %YMM2
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
+	VPTESTN	%YMM3, %YMM3, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
+	VPTESTN	%YMM4, %YMM4, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
+	movq	%rdi, %r8
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x3)
 
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a null byte in a loop.  */
+L(first_aligned_loop):
+	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
+	   they don't store a match.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
+	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
+
+	VPCMP	$0, %YMM5, %YMMMATCH, %k2
+	vpxord	%YMM6, %YMMMATCH, %YMM7
+
+	VPMIN	%YMM5, %YMM6, %YMM8
+	VPMIN	%YMM8, %YMM7, %YMM7
+
+	VPTESTN	%YMM7, %YMM7, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(first_aligned_loop)
+
+	VPCMP	$0, %YMM6, %YMMMATCH, %k3
+	VPTESTN	%YMM8, %YMM8, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_prep)
+
+	kortestd %k2, %k3
+	jnz	L(return_first_aligned_loop)
+
+	.p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+	VPCMP	$0, %YMM4, %YMMMATCH, %k4
+	kmovd	%k4, %eax
 	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	jz	L(first_vec_x1_or_x2)
 	bsrl	%eax, %eax
-# ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-# endif
+	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a null byte.  */
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
+	.p2align 4,, 8
+L(return_first_aligned_loop):
+	VPTESTN	%YMM5, %YMM5, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(first_vec_x1_or_x2_or_x3)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+	/* We can throw away the work done for the first 4x checks here
+	   as we have a later match. This is the 'fast' path, so to
+	   speak.  */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	kunpck	%k2, %k3, %k4
 
 	.p2align 4
-L(find_nul):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
+L(second_aligned_loop):
+	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
+	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
+
+	VPCMP	$0, %YMM1, %YMMMATCH, %k2
+	vpxord	%YMM2, %YMMMATCH, %YMM3
+
+	VPMIN	%YMM1, %YMM2, %YMM4
+	VPMIN	%YMM3, %YMM4, %YMM3
+
+	VPTESTN	%YMM3, %YMM3, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(second_aligned_loop)
+
+	VPCMP	$0, %YMM2, %YMMMATCH, %k3
+	VPTESTN	%YMM4, %YMM4, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_set_furthest_match)
+
+	kortestd %k2, %k3
+	/* Branch here because there is a significant advantage in
+	   terms of the output dependency chain in using edx.  */
+	jnz	L(return_new_match)
+L(return_old_match):
+	kmovq	%k4, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(return_new_match):
+	VPTESTN	%YMM1, %YMM1, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(return_old_match)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets the pointer with all page offset bits cleared, so
+	   an offset of (PAGE_SIZE - VEC_SIZE) will get the last aligned
+	   VEC before the page cross (guaranteed to be safe to read).
+	   Doing this as opposed to `movq %rdi, %rax; andq $-VEC_SIZE,
+	   %rax` saves a bit of code size.  */
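+	/* Hypothetical example with VEC_SIZE == 32: rdi == 0x7fff12345ff0
+	   gives eax == 0xff0, the xor below leaves rax == 0x7fff12345000
+	   (the page base), and the load then reads the 32 bytes at
+	   0x7fff12345fe0, the last aligned VEC of the page containing
+	   rdi.  */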
+	xorq	%rdi, %rax
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+	VPTESTN	%YMM1, %YMM1, %k0
+	kmovd	%k0, %ecx
+
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi).  */
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	movl	%edi, %esi
+	andl	$(VEC_SIZE - 1), %esi
+	shrl	$2, %esi
 # endif
-	ret
+	shrxl	%SHIFT_REG, %ecx, %ecx
 
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a null byte.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* Return null pointer if the null byte comes first.  */
-	jz	L(return_null)
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrxl	%SHIFT_REG, %eax, %eax
+
+	/* Check if any search CHAR match in range.  */
+	blsmskl	%ecx, %ecx
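+	/* blsmskl sets every bit of ecx up to and including its lowest
+	   set bit (the first null CHAR), so the `and` below discards
+	   search CHAR matches that fall after the null terminator.  */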
+	andl	%ecx, %eax
+	jz	L(ret3)
 	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	addq	%rdi, %rax
 # endif
+L(ret3):
 	ret
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
-	ret
-
-END (STRRCHR)
+END(STRRCHR)
 #endif
-- 
2.25.1
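
For readers following the mask tricks in the loop above, a rough scalar C
sketch of the strategy is below: scan a fixed-size chunk, fold the null test
and the search-CHAR test into one early-exit check, and only pick out the
last match once an exit condition is seen. The chunk size and the helper
name are illustrative assumptions, not part of the patch; the real code does
this with vector compares and mask registers rather than byte loops.

#include <stddef.h>

/* Illustrative sketch only: a scalar model of the chunked strrchr
   strategy used in strrchr-evex.S.  CHUNK stands in for VEC_SIZE;
   the helper name strrchr_sketch is made up for this example.  */
#define CHUNK 32

static char *
strrchr_sketch (const char *s, int c)
{
  const char *last = NULL;

  for (;;)
    {
      int has_nul = 0, has_match = 0;
      size_t i;

      /* One combined pass over the chunk plays the role of the
	 VPCMP/VPMIN/VPTESTN sequence: a single test decides whether
	 the loop may continue.  */
      for (i = 0; i < CHUNK; i++)
	{
	  if (s[i] == (char) c)
	    has_match = 1;
	  if (s[i] == '\0')
	    {
	      has_nul = 1;
	      break;
	    }
	}

      if (has_match)
	{
	  /* Like the bsr over the masked match bits: keep only the
	     last match that is not past the null terminator.  */
	  size_t lim = has_nul ? i + 1 : CHUNK;
	  size_t j;
	  for (j = 0; j < lim; j++)
	    if (s[j] == (char) c)
	      last = s + j;
	}

      if (has_nul)
	return (char *) last;
      s += CHUNK;
    }
}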


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex
  2022-04-21 23:59     ` H.J. Lu
@ 2022-04-22  1:53       ` Noah Goldstein
  0 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22  1:53 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 21, 2022 at 7:00 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 3:22 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The new code unrolls the main loop slightly without adding too much
> > overhead and minimizes the comparisons for the search CHAR.
> >
> > Geometric Mean of all benchmarks New / Old: 0.755
> > See email for all results.
> >
> > Full xcheck passes on x86_64 with and without multiarch enabled.
> > ---
> >  sysdeps/x86_64/multiarch/strrchr-evex.S | 441 ++++++++++++++----------
> >  1 file changed, 259 insertions(+), 182 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
> > index adeddaed32..5cf9a8315b 100644
> > --- a/sysdeps/x86_64/multiarch/strrchr-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
> > @@ -24,242 +24,319 @@
> >  #  define STRRCHR      __strrchr_evex
> >  # endif
> >
> > -# define VMOVU         vmovdqu64
> > -# define VMOVA         vmovdqa64
> > +# define VMOVU vmovdqu64
> > +# define VMOVA vmovdqa64
> >
> >  # ifdef USE_AS_WCSRCHR
> > +#  define SHIFT_REG    esi
> > +
> > +#  define kunpck       kunpckbw
> > +#  define kmov_2x      kmovd
> > +#  define maskz_2x     ecx
> > +#  define maskm_2x     eax
> > +#  define CHAR_SIZE    4
> > +#  define VPMIN        vpminud
> > +#  define VPTESTN      vptestnmd
> >  #  define VPBROADCAST  vpbroadcastd
> > -#  define VPCMP                vpcmpd
> > -#  define SHIFT_REG    r8d
> > +#  define VPCMP        vpcmpd
> >  # else
> > +#  define SHIFT_REG    edi
> > +
> > +#  define kunpck       kunpckdq
> > +#  define kmov_2x      kmovq
> > +#  define maskz_2x     rcx
> > +#  define maskm_2x     rax
> > +
> > +#  define CHAR_SIZE    1
> > +#  define VPMIN        vpminub
> > +#  define VPTESTN      vptestnmb
> >  #  define VPBROADCAST  vpbroadcastb
> > -#  define VPCMP                vpcmpb
> > -#  define SHIFT_REG    ecx
> > +#  define VPCMP        vpcmpb
> >  # endif
> >
> >  # define XMMZERO       xmm16
> >  # define YMMZERO       ymm16
> >  # define YMMMATCH      ymm17
> > -# define YMM1          ymm18
> > +# define YMMSAVE       ymm18
> > +
> > +# define YMM1  ymm19
> > +# define YMM2  ymm20
> > +# define YMM3  ymm21
> > +# define YMM4  ymm22
> > +# define YMM5  ymm23
> > +# define YMM6  ymm24
> > +# define YMM7  ymm25
> > +# define YMM8  ymm26
> >
> > -# define VEC_SIZE      32
> >
> > -       .section .text.evex,"ax",@progbits
> > -ENTRY (STRRCHR)
> > -       movl    %edi, %ecx
> > +# define VEC_SIZE      32
> > +# define PAGE_SIZE     4096
> > +       .section .text.evex, "ax", @progbits
> > +ENTRY(STRRCHR)
> > +       movl    %edi, %eax
> >         /* Broadcast CHAR to YMMMATCH.  */
> >         VPBROADCAST %esi, %YMMMATCH
> >
> > -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> > -
> > -       /* Check if we may cross page boundary with one vector load.  */
> > -       andl    $(2 * VEC_SIZE - 1), %ecx
> > -       cmpl    $VEC_SIZE, %ecx
> > -       ja      L(cros_page_boundary)
> > +       andl    $(PAGE_SIZE - 1), %eax
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +       jg      L(cross_page_boundary)
> >
> > +L(page_cross_continue):
> >         VMOVU   (%rdi), %YMM1
> > -
> > -       /* Each bit in K0 represents a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> > -       /* Each bit in K1 represents a CHAR in YMM1.  */
> > -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > +       VPTESTN %YMM1, %YMM1, %k0
>
> Please add some comments for mask register on VPTESTN tests.

Added in V3.
>
> >         kmovd   %k0, %ecx
> > -       kmovd   %k1, %eax
> > -
> > -       addq    $VEC_SIZE, %rdi
> > -
> > -       testl   %eax, %eax
> > -       jnz     L(first_vec)
> > -
> >         testl   %ecx, %ecx
> > -       jnz     L(return_null)
> > -
> > -       andq    $-VEC_SIZE, %rdi
> > -       xorl    %edx, %edx
> > -       jmp     L(aligned_loop)
> > -
> > -       .p2align 4
> > -L(first_vec):
> > -       /* Check if there is a null byte.  */
> > -       testl   %ecx, %ecx
> > -       jnz     L(char_and_nul_in_first_vec)
> > -
> > -       /* Remember the match and keep searching.  */
> > -       movl    %eax, %edx
> > -       movq    %rdi, %rsi
> > -       andq    $-VEC_SIZE, %rdi
> > -       jmp     L(aligned_loop)
> > -
> > -       .p2align 4
> > -L(cros_page_boundary):
> > -       andl    $(VEC_SIZE - 1), %ecx
> > -       andq    $-VEC_SIZE, %rdi
> > -
> > +       jz      L(aligned_more)
> > +       VPCMP   $0, %YMMMATCH, %YMM1, %k1
>
> Please add some comments.

Added in V3.
>
> > +       kmovd   %k1, %eax
> > +       blsmskl %ecx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(ret0)
> > +       bsrl    %eax, %eax
> >  # ifdef USE_AS_WCSRCHR
> > -       /* NB: Divide shift count by 4 since each bit in K1 represent 4
> > -          bytes.  */
> > -       movl    %ecx, %SHIFT_REG
> > -       sarl    $2, %SHIFT_REG
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > +       addq    %rdi, %rax
> >  # endif
> > +L(ret0):
> > +       ret
> >
> > -       VMOVA   (%rdi), %YMM1
> > -
> > -       /* Each bit in K0 represents a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> > -       /* Each bit in K1 represents a CHAR in YMM1.  */
> > +       /* Returns for first vec x1/x2/x3 have hard coded backward
> > +          search path for earlier matches.  */
> > +       .p2align 4,, 6
> > +L(first_vec_x1):
> > +       VPCMP   $0, %YMMMATCH, %YMM2, %k1
> > +       kmovd   %k1, %eax
> > +       blsmskl %ecx, %ecx
> > +       andl    %ecx, %eax
> > +       jnz     L(first_vec_x1_return)
> > +       .p2align 4,, 4
> > +L(first_vec_x0_test):
> >         VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > -       kmovd   %k0, %edx
> >         kmovd   %k1, %eax
> > -
> > -       shrxl   %SHIFT_REG, %edx, %edx
> > -       shrxl   %SHIFT_REG, %eax, %eax
> > -       addq    $VEC_SIZE, %rdi
> > -
> > -       /* Check if there is a CHAR.  */
> >         testl   %eax, %eax
> > -       jnz     L(found_char)
> > -
> > -       testl   %edx, %edx
> > -       jnz     L(return_null)
> > -
> > -       jmp     L(aligned_loop)
> > -
> > -       .p2align 4
> > -L(found_char):
> > -       testl   %edx, %edx
> > -       jnz     L(char_and_nul)
> > -
> > -       /* Remember the match and keep searching.  */
> > -       movl    %eax, %edx
> > -       leaq    (%rdi, %rcx), %rsi
> > +       jz      L(ret1)
> > +       bsrl    %eax, %eax
> > +# ifdef USE_AS_WCSRCHR
> > +       leaq    (%rsi, %rax, CHAR_SIZE), %rax
> > +# else
> > +       addq    %rsi, %rax
> > +# endif
> > +L(ret1):
> > +       ret
> >
> > -       .p2align 4
> > -L(aligned_loop):
> > -       VMOVA   (%rdi), %YMM1
> > -       addq    $VEC_SIZE, %rdi
> > +       .p2align 4,, 10
> > +L(first_vec_x1_or_x2):
> > +       VPCMP   $0, %YMM3, %YMMMATCH, %k3
> > +       VPCMP   $0, %YMM2, %YMMMATCH, %k2
> > +       kortestd %k2, %k3
> > +       jz      L(first_vec_x0_test)
> > +
> > +       kunpck  %k2, %k3, %k3
> > +       kmovq   %k3, %rax
> > +       bsrq    %rax, %rax
> > +       leaq    (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
> > +       ret
> >
> > -       /* Each bit in K0 represents a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> > -       /* Each bit in K1 represents a CHAR in YMM1.  */
> > -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > -       kmovd   %k0, %ecx
> > +       .p2align 4,, 6
> > +L(first_vec_x3):
> > +       VPCMP   $0, %YMMMATCH, %YMM4, %k1
> >         kmovd   %k1, %eax
> > -       orl     %eax, %ecx
> > -       jnz     L(char_nor_null)
> > +       blsmskl %ecx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(first_vec_x1_or_x2)
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> >
> > -       VMOVA   (%rdi), %YMM1
> > -       add     $VEC_SIZE, %rdi
> > +       .p2align 4,, 6
> > +L(first_vec_x0_x1_test):
> > +       VPCMP   $0, %YMMMATCH, %YMM2, %k1
> > +       kmovd   %k1, %eax
> > +       testl   %eax, %eax
> > +       jz      L(first_vec_x0_test)
> > +       .p2align 4,, 4
> > +L(first_vec_x1_return):
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> >
> > -       /* Each bit in K0 represents a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> > -       /* Each bit in K1 represents a CHAR in YMM1.  */
> > -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > -       kmovd   %k0, %ecx
> > +       .p2align 4,, 10
> > +L(first_vec_x2):
> > +       VPCMP   $0, %YMMMATCH, %YMM3, %k1
> >         kmovd   %k1, %eax
> > -       orl     %eax, %ecx
> > -       jnz     L(char_nor_null)
> > +       blsmskl %ecx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(first_vec_x0_x1_test)
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> >
> > -       VMOVA   (%rdi), %YMM1
> > -       addq    $VEC_SIZE, %rdi
> >
> > -       /* Each bit in K0 represents a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> > -       /* Each bit in K1 represents a CHAR in YMM1.  */
> > -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > +       .p2align 4
> > +L(aligned_more):
> > +       /* Need to keep original pointer incase YMM1 has last match.  */
> > +       movq    %rdi, %rsi
> > +       andq    $-VEC_SIZE, %rdi
> > +       VMOVU   VEC_SIZE(%rdi), %YMM2
> > +       VPTESTN %YMM2, %YMM2, %k0
> >         kmovd   %k0, %ecx
> > -       kmovd   %k1, %eax
> > -       orl     %eax, %ecx
> > -       jnz     L(char_nor_null)
> > +       testl   %ecx, %ecx
> > +       jnz     L(first_vec_x1)
> >
> > -       VMOVA   (%rdi), %YMM1
> > -       addq    $VEC_SIZE, %rdi
> > +       VMOVU   (VEC_SIZE * 2)(%rdi), %YMM3
> > +       VPTESTN %YMM3, %YMM3, %k0
> > +       kmovd   %k0, %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(first_vec_x2)
> >
> > -       /* Each bit in K0 represents a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> > -       /* Each bit in K1 represents a CHAR in YMM1.  */
> > -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > +       VMOVU   (VEC_SIZE * 3)(%rdi), %YMM4
> > +       VPTESTN %YMM4, %YMM4, %k0
> >         kmovd   %k0, %ecx
> > -       kmovd   %k1, %eax
> > -       orl     %eax, %ecx
> > -       jz      L(aligned_loop)
> > +       movq    %rdi, %r8
> > +       testl   %ecx, %ecx
> > +       jnz     L(first_vec_x3)
> >
> > +       andq    $-(VEC_SIZE * 2), %rdi
> >         .p2align 4
> > -L(char_nor_null):
> > -       /* Find a CHAR or a null byte in a loop.  */
> > +L(first_aligned_loop):
> > +       /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
> > +          they don't store a match.  */
> > +       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM5
> > +       VMOVA   (VEC_SIZE * 5)(%rdi), %YMM6
> > +
> > +       VPCMP   $0, %YMM5, %YMMMATCH, %k2
> > +       vpxord  %YMM6, %YMMMATCH, %YMM7
> > +
> > +       VPMIN   %YMM5, %YMM6, %YMM8
> > +       VPMIN   %YMM8, %YMM7, %YMM7
> > +
> > +       VPTESTN %YMM7, %YMM7, %k1
> > +       subq    $(VEC_SIZE * -2), %rdi
> > +       kortestd %k1, %k2
> > +       jz      L(first_aligned_loop)
> > +
> > +       VPCMP   $0, %YMM6, %YMMMATCH, %k3
> > +       VPTESTN %YMM8, %YMM8, %k1
> > +       ktestd  %k1, %k1
> > +       jz      L(second_aligned_loop_prep)
> > +
> > +       kortestd %k2, %k3
> > +       jnz     L(return_first_aligned_loop)
> > +
> > +       .p2align 4,, 6
> > +L(first_vec_x1_or_x2_or_x3):
> > +       VPCMP   $0, %YMM4, %YMMMATCH, %k4
> > +       kmovd   %k4, %eax
> >         testl   %eax, %eax
> > -       jnz     L(match)
> > -L(return_value):
> > -       testl   %edx, %edx
> > -       jz      L(return_null)
> > -       movl    %edx, %eax
> > -       movq    %rsi, %rdi
> > +       jz      L(first_vec_x1_or_x2)
> >         bsrl    %eax, %eax
> > -# ifdef USE_AS_WCSRCHR
> > -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -       leaq    -VEC_SIZE(%rdi, %rax, 4), %rax
> > -# else
> > -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> > -# endif
> > +       leaq    (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
> >         ret
> >
> > -       .p2align 4
> > -L(match):
> > -       /* Find a CHAR.  Check if there is a null byte.  */
> > -       kmovd   %k0, %ecx
> > -       testl   %ecx, %ecx
> > -       jnz     L(find_nul)
> > +       .p2align 4,, 8
> > +L(return_first_aligned_loop):
> > +       VPTESTN %YMM5, %YMM5, %k0
> > +       kunpck  %k0, %k1, %k0
> > +       kmov_2x %k0, %maskz_2x
> > +
> > +       blsmsk  %maskz_2x, %maskz_2x
> > +       kunpck  %k2, %k3, %k3
> > +       kmov_2x %k3, %maskm_2x
> > +       and     %maskz_2x, %maskm_2x
> > +       jz      L(first_vec_x1_or_x2_or_x3)
> > +
> > +       bsr     %maskm_2x, %maskm_2x
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> >
> > -       /* Remember the match and keep searching.  */
> > -       movl    %eax, %edx
> > +       .p2align 4
> > +       /* We can throw away the work done for the first 4x checks here
> > +          as we have a later match. This is the 'fast' path persay.
> > +        */
> > +L(second_aligned_loop_prep):
> > +L(second_aligned_loop_set_furthest_match):
> >         movq    %rdi, %rsi
> > -       jmp     L(aligned_loop)
> > +       kunpck  %k2, %k3, %k4
> >
> >         .p2align 4
> > -L(find_nul):
> > -       /* Mask out any matching bits after the null byte.  */
> > -       movl    %ecx, %r8d
> > -       subl    $1, %r8d
> > -       xorl    %ecx, %r8d
> > -       andl    %r8d, %eax
> > -       testl   %eax, %eax
> > -       /* If there is no CHAR here, return the remembered one.  */
> > -       jz      L(return_value)
> > -       bsrl    %eax, %eax
> > +L(second_aligned_loop):
> > +       VMOVU   (VEC_SIZE * 4)(%rdi), %YMM1
> > +       VMOVU   (VEC_SIZE * 5)(%rdi), %YMM2
> > +
> > +       VPCMP   $0, %YMM1, %YMMMATCH, %k2
> > +       vpxord  %YMM2, %YMMMATCH, %YMM3
> > +
> > +       VPMIN   %YMM1, %YMM2, %YMM4
> > +       VPMIN   %YMM3, %YMM4, %YMM3
> > +
> > +       VPTESTN %YMM3, %YMM3, %k1
> > +       subq    $(VEC_SIZE * -2), %rdi
> > +       kortestd %k1, %k2
> > +       jz      L(second_aligned_loop)
> > +
> > +       VPCMP   $0, %YMM2, %YMMMATCH, %k3
> > +       VPTESTN %YMM4, %YMM4, %k1
> > +       ktestd  %k1, %k1
> > +       jz      L(second_aligned_loop_set_furthest_match)
> > +
> > +       kortestd %k2, %k3
> > +       /* branch here because there is a significant advantage interms
> > +          of output dependency chance in using edx.  */
> > +       jnz     L(return_new_match)
> > +L(return_old_match):
> > +       kmovq   %k4, %rax
> > +       bsrq    %rax, %rax
> > +       leaq    (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
> > +       ret
> > +
> > +L(return_new_match):
> > +       VPTESTN %YMM1, %YMM1, %k0
> > +       kunpck  %k0, %k1, %k0
> > +       kmov_2x %k0, %maskz_2x
> > +
> > +       blsmsk  %maskz_2x, %maskz_2x
> > +       kunpck  %k2, %k3, %k3
> > +       kmov_2x %k3, %maskm_2x
> > +       and     %maskz_2x, %maskm_2x
> > +       jz      L(return_old_match)
> > +
> > +       bsr     %maskm_2x, %maskm_2x
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> > +
> > +       /* This block is horribly aligned (% 16 == 15). This is
> > +          intentional. The L(cross_page_boundary) block is exactly
> > +          32-bytes of code size. Ultimately this is a cold case so
> > +          save the code size by leaving misaligned.  */
> > +L(cross_page_boundary):
> > +       xorq    %rdi, %rax
> > +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
> > +       VPTESTN %YMM1, %YMM1, %k0
> > +       kmovd   %k0, %ecx
> >  # ifdef USE_AS_WCSRCHR
> > -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -       leaq    -VEC_SIZE(%rdi, %rax, 4), %rax
> > -# else
> > -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> > +       movl    %edi, %esi
> > +       andl    $(VEC_SIZE - 1), %esi
> > +       shrl    $2, %esi
> >  # endif
> > -       ret
> > +       shrxl   %SHIFT_REG, %ecx, %ecx
> >
> > -       .p2align 4
> > -L(char_and_nul):
> > -       /* Find both a CHAR and a null byte.  */
> > -       addq    %rcx, %rdi
> > -       movl    %edx, %ecx
> > -L(char_and_nul_in_first_vec):
> > -       /* Mask out any matching bits after the null byte.  */
> > -       movl    %ecx, %r8d
> > -       subl    $1, %r8d
> > -       xorl    %ecx, %r8d
> > -       andl    %r8d, %eax
> > -       testl   %eax, %eax
> > -       /* Return null pointer if the null byte comes first.  */
> > -       jz      L(return_null)
> > +       testl   %ecx, %ecx
> > +       jz      L(page_cross_continue)
> > +       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > +       kmovd   %k1, %eax
> > +       shrxl   %SHIFT_REG, %eax, %eax
> > +       blsmskl %ecx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(ret3)
> >         bsrl    %eax, %eax
> >  # ifdef USE_AS_WCSRCHR
> > -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -       leaq    -VEC_SIZE(%rdi, %rax, 4), %rax
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> >  # else
> > -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> > +       addq    %rdi, %rax
> >  # endif
> > +L(ret3):
> >         ret
> >
> > -       .p2align 4
> > -L(return_null):
> > -       xorl    %eax, %eax
> > -       ret
> > -
> > -END (STRRCHR)
> > +END(STRRCHR)
> >  #endif
> > --
> > 2.25.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2
  2022-04-21 23:46     ` H.J. Lu
@ 2022-04-22  1:54       ` Noah Goldstein
  0 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22  1:54 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 21, 2022 at 6:46 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 3:22 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The new code unrolls the main loop slightly without adding too much
> > overhead and minimizes the comparisons for the search CHAR.
> >
> > Geometric Mean of all benchmarks New / Old: 0.741
> > See email for all results.
> >
> > Full xcheck passes on x86_64 with and without multiarch enabled.
> > ---
> >  sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
> >  sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
> >  sysdeps/x86_64/strrchr.S                | 510 +++++++++++++++---------
> >  sysdeps/x86_64/wcsrchr.S                | 268 +------------
> >  4 files changed, 339 insertions(+), 444 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > index db1b44c23c..866396e947 100644
> > --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > @@ -17,7 +17,7 @@
> >     <https://www.gnu.org/licenses/>.  */
> >
> >  #if IS_IN (libc)
> > -# define strrchr __strrchr_sse2
> > +# define STRRCHR __strrchr_sse2
> >
> >  # undef weak_alias
> >  # define weak_alias(strrchr, rindex)
> > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > index 78d1ca6553..69d2f3cdb1 100644
> > --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > @@ -17,7 +17,6 @@
> >     <https://www.gnu.org/licenses/>.  */
> >
> >  #if IS_IN (libc)
> > -# define wcsrchr __wcsrchr_sse2
> > +# define STRRCHR       __wcsrchr_sse2
> >  #endif
> > -
> >  #include "../wcsrchr.S"
> > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> > index 50d886713e..6efb25c880 100644
> > --- a/sysdeps/x86_64/strrchr.S
> > +++ b/sysdeps/x86_64/strrchr.S
> > @@ -19,210 +19,360 @@
> >
> >  #include <sysdep.h>
> >
> > +#ifndef STRRCHR
> > +# define STRRCHR       strrchr
> > +#endif
> > +
> > +#ifdef USE_AS_WCSRCHR
> > +# define PCMPEQ        pcmpeqd
> > +# define CHAR_SIZE     4
> > +# define PMINU pminud
> > +#else
> > +# define PCMPEQ        pcmpeqb
> > +# define CHAR_SIZE     1
> > +# define PMINU pminub
> > +#endif
> > +
> > +#define PAGE_SIZE      4096
> > +#define VEC_SIZE       16
> > +
> >         .text
> > -ENTRY (strrchr)
> > -       movd    %esi, %xmm1
> > +ENTRY(STRRCHR)
> > +       movd    %esi, %xmm0
> >         movq    %rdi, %rax
> > -       andl    $4095, %eax
> > -       punpcklbw       %xmm1, %xmm1
> > -       cmpq    $4032, %rax
> > -       punpcklwd       %xmm1, %xmm1
> > -       pshufd  $0, %xmm1, %xmm1
> > +       andl    $(PAGE_SIZE - 1), %eax
> > +#ifndef USE_AS_WCSRCHR
> > +       punpcklbw %xmm0, %xmm0
> > +       punpcklwd %xmm0, %xmm0
> > +#endif
> > +       pshufd  $0, %xmm0, %xmm0
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> >         ja      L(cross_page)
> > -       movdqu  (%rdi), %xmm0
> > +
> > +L(cross_page_continue):
> > +       movups  (%rdi), %xmm1
> >         pxor    %xmm2, %xmm2
> > -       movdqa  %xmm0, %xmm3
> > -       pcmpeqb %xmm1, %xmm0
> > -       pcmpeqb %xmm2, %xmm3
> > -       pmovmskb        %xmm0, %ecx
> > -       pmovmskb        %xmm3, %edx
> > -       testq   %rdx, %rdx
> > -       je      L(next_48_bytes)
> > -       leaq    -1(%rdx), %rax
> > -       xorq    %rdx, %rax
> > -       andq    %rcx, %rax
> > -       je      L(exit)
> > -       bsrq    %rax, %rax
> > +       PCMPEQ  %xmm1, %xmm2
> > +       pmovmskb %xmm2, %ecx
> > +       testl   %ecx, %ecx
> > +       jz      L(aligned_more)
> > +
> > +       PCMPEQ  %xmm0, %xmm1
> > +       pmovmskb %xmm1, %eax
> > +       leal    -1(%rcx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(ret0)
> > +       bsrl    %eax, %eax
> >         addq    %rdi, %rax
> > +       /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > +          search CHAR is zero we are correct. Either way `andq
> > +          -CHAR_SIZE, %rax` gets the correct result.  */
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +L(ret0):
> >         ret
> >
> > +       /* Returns for first vec x1/x2 have hard coded backward search
> > +          path for earlier matches.  */
> >         .p2align 4
> > -L(next_48_bytes):
> > -       movdqu  16(%rdi), %xmm4
> > -       movdqa  %xmm4, %xmm5
> > -       movdqu  32(%rdi), %xmm3
> > -       pcmpeqb %xmm1, %xmm4
> > -       pcmpeqb %xmm2, %xmm5
> > -       movdqu  48(%rdi), %xmm0
> > -       pmovmskb        %xmm5, %edx
> > -       movdqa  %xmm3, %xmm5
> > -       pcmpeqb %xmm1, %xmm3
> > -       pcmpeqb %xmm2, %xmm5
> > -       pcmpeqb %xmm0, %xmm2
> > -       salq    $16, %rdx
> > -       pmovmskb        %xmm3, %r8d
> > -       pmovmskb        %xmm5, %eax
> > -       pmovmskb        %xmm2, %esi
> > -       salq    $32, %r8
> > -       salq    $32, %rax
> > -       pcmpeqb %xmm1, %xmm0
> > -       orq     %rdx, %rax
> > -       movq    %rsi, %rdx
> > -       pmovmskb        %xmm4, %esi
> > -       salq    $48, %rdx
> > -       salq    $16, %rsi
> > -       orq     %r8, %rsi
> > -       orq     %rcx, %rsi
> > -       pmovmskb        %xmm0, %ecx
> > -       salq    $48, %rcx
> > -       orq     %rcx, %rsi
> > -       orq     %rdx, %rax
> > -       je      L(loop_header2)
> > -       leaq    -1(%rax), %rcx
> > -       xorq    %rax, %rcx
> > -       andq    %rcx, %rsi
> > -       je      L(exit)
> > -       bsrq    %rsi, %rsi
> > -       leaq    (%rdi,%rsi), %rax
> > +L(first_vec_x0_test):
> > +       PCMPEQ  %xmm0, %xmm1
> > +       pmovmskb %xmm1, %eax
> > +       testl   %eax, %eax
> > +       jz      L(ret0)
> > +       bsrl    %eax, %eax
> > +       addq    %r8, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> >         ret
> >
> >         .p2align 4
> > -L(loop_header2):
> > -       testq   %rsi, %rsi
> > -       movq    %rdi, %rcx
> > -       je      L(no_c_found)
> > -L(loop_header):
> > -       addq    $64, %rdi
> > -       pxor    %xmm7, %xmm7
> > -       andq    $-64, %rdi
> > -       jmp     L(loop_entry)
> > +L(first_vec_x1):
> > +       PCMPEQ  %xmm0, %xmm2
> > +       pmovmskb %xmm2, %eax
> > +       leal    -1(%rcx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(first_vec_x0_test)
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> >
> >         .p2align 4
> > -L(loop64):
> > -       testq   %rdx, %rdx
> > -       cmovne  %rdx, %rsi
> > -       cmovne  %rdi, %rcx
> > -       addq    $64, %rdi
> > -L(loop_entry):
> > -       movdqa  32(%rdi), %xmm3
> > -       pxor    %xmm6, %xmm6
> > -       movdqa  48(%rdi), %xmm2
> > -       movdqa  %xmm3, %xmm0
> > -       movdqa  16(%rdi), %xmm4
> > -       pminub  %xmm2, %xmm0
> > -       movdqa  (%rdi), %xmm5
> > -       pminub  %xmm4, %xmm0
> > -       pminub  %xmm5, %xmm0
> > -       pcmpeqb %xmm7, %xmm0
> > -       pmovmskb        %xmm0, %eax
> > -       movdqa  %xmm5, %xmm0
> > -       pcmpeqb %xmm1, %xmm0
> > -       pmovmskb        %xmm0, %r9d
> > -       movdqa  %xmm4, %xmm0
> > -       pcmpeqb %xmm1, %xmm0
> > -       pmovmskb        %xmm0, %edx
> > -       movdqa  %xmm3, %xmm0
> > -       pcmpeqb %xmm1, %xmm0
> > -       salq    $16, %rdx
> > -       pmovmskb        %xmm0, %r10d
> > -       movdqa  %xmm2, %xmm0
> > -       pcmpeqb %xmm1, %xmm0
> > -       salq    $32, %r10
> > -       orq     %r10, %rdx
> > -       pmovmskb        %xmm0, %r8d
> > -       orq     %r9, %rdx
> > -       salq    $48, %r8
> > -       orq     %r8, %rdx
> > +L(first_vec_x1_test):
> > +       PCMPEQ  %xmm0, %xmm2
> > +       pmovmskb %xmm2, %eax
> >         testl   %eax, %eax
> > -       je      L(loop64)
> > -       pcmpeqb %xmm6, %xmm4
> > -       pcmpeqb %xmm6, %xmm3
> > -       pcmpeqb %xmm6, %xmm5
> > -       pmovmskb        %xmm4, %eax
> > -       pmovmskb        %xmm3, %r10d
> > -       pcmpeqb %xmm6, %xmm2
> > -       pmovmskb        %xmm5, %r9d
> > -       salq    $32, %r10
> > -       salq    $16, %rax
> > -       pmovmskb        %xmm2, %r8d
> > -       orq     %r10, %rax
> > -       orq     %r9, %rax
> > -       salq    $48, %r8
> > -       orq     %r8, %rax
> > -       leaq    -1(%rax), %r8
> > -       xorq    %rax, %r8
> > -       andq    %r8, %rdx
> > -       cmovne  %rdi, %rcx
> > -       cmovne  %rdx, %rsi
> > -       bsrq    %rsi, %rsi
> > -       leaq    (%rcx,%rsi), %rax
> > +       jz      L(first_vec_x0_test)
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> > +
> > +       .p2align 4
> > +L(first_vec_x2):
> > +       PCMPEQ  %xmm0, %xmm3
> > +       pmovmskb %xmm3, %eax
> > +       leal    -1(%rcx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(first_vec_x1_test)
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> > +
> > +       .p2align 4
> > +L(aligned_more):
> > +       /* Save original pointer if match was in VEC 0.  */
> > +       movq    %rdi, %r8
> > +       andq    $-VEC_SIZE, %rdi
> > +
> > +       movaps  VEC_SIZE(%rdi), %xmm2
> > +       pxor    %xmm3, %xmm3
> > +       PCMPEQ  %xmm2, %xmm3
> > +       pmovmskb %xmm3, %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(first_vec_x1)
> > +
> > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm3
> > +       pxor    %xmm4, %xmm4
> > +       PCMPEQ  %xmm3, %xmm4
> > +       pmovmskb %xmm4, %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(first_vec_x2)
> > +
> > +       addq    $VEC_SIZE, %rdi
> > +       /* Save pointer again before realigning.  */
> > +       movq    %rdi, %rsi
> > +       andq    $-(VEC_SIZE * 2), %rdi
> > +       .p2align 4
> > +L(first_loop):
> > +       /* Do 2x VEC at a time.  */
> > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> > +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> > +       /* If SSE2 no pminud so wcsrchr needs seperate logic for
>               Did you mean "Since", instead of "If"?

Fixed in V3.
>
> > +          detecting zero. Note if this is found to be a bottleneck it
> > +          may be worth adding an SSE4.1 wcsrchr implementation.  */
> > +#ifdef USE_AS_WCSRCHR
> > +       movaps  %xmm5, %xmm6
> > +       pxor    %xmm8, %xmm8
> > +
> > +       PCMPEQ  %xmm8, %xmm5
> > +       PCMPEQ  %xmm4, %xmm8
> > +       por     %xmm5, %xmm8
> > +#else
> > +       movaps  %xmm5, %xmm6
> > +       PMINU   %xmm4, %xmm5
> > +#endif
> > +
> > +       movaps  %xmm4, %xmm9
> > +       PCMPEQ  %xmm0, %xmm4
> > +       PCMPEQ  %xmm0, %xmm6
> > +       movaps  %xmm6, %xmm7
> > +       por     %xmm4, %xmm6
> > +#ifndef USE_AS_WCSRCHR
> > +       pxor    %xmm8, %xmm8
> > +       PCMPEQ  %xmm5, %xmm8
> > +#endif
> > +       pmovmskb %xmm8, %ecx
> > +       pmovmskb %xmm6, %eax
> > +
> > +       addq    $(VEC_SIZE * 2), %rdi
> > +       /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> > +          macro-fuse with `jz`.  */
> > +       addl    %ecx, %eax
> > +       jz      L(first_loop)
> > +
> > +       /* Check if there is zero match.  */
> > +       testl   %ecx, %ecx
> > +       jz      L(second_loop_match)
> > +
> > +       /* Check if there was a match in last iteration.  */
> > +       subl    %ecx, %eax
> > +       jnz     L(new_match)
> > +
> > +L(first_loop_old_match):
> > +       PCMPEQ  %xmm0, %xmm2
> > +       PCMPEQ  %xmm0, %xmm3
> > +       pmovmskb %xmm2, %ecx
> > +       pmovmskb %xmm3, %eax
> > +       addl    %eax, %ecx
> > +       jz      L(first_vec_x0_test)
> > +       /* NB: We could move this shift to before the branch and save a
> > +          bit of code size / performance on the fall through. The
> > +          branch leads to the null case which generally seems hotter
> > +          than char in first 3x VEC.  */
> > +       sall    $16, %eax
> > +       orl     %ecx, %eax
> > +
> > +       bsrl    %eax, %eax
> > +       addq    %rsi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> > +
> > +       .p2align 4
> > +L(new_match):
> > +       pxor    %xmm6, %xmm6
> > +       PCMPEQ  %xmm9, %xmm6
> > +       pmovmskb %xmm6, %eax
> > +       sall    $16, %ecx
> > +       orl     %eax, %ecx
> > +
> > +       /* We can't reuse either of the old comparisons as since we mask
> > +          of zeros after first zero (instead of using the full
> > +          comparison) we can't gurantee no interference between match
> > +          after end of string and valid match.  */
> > +       pmovmskb %xmm4, %eax
> > +       pmovmskb %xmm7, %edx
> > +       sall    $16, %edx
> > +       orl     %edx, %eax
> > +
> > +       leal    -1(%ecx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(first_loop_old_match)
> > +       bsrl    %eax, %eax
> > +       addq    %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> >         ret
> >
> > +       /* Save minimum state for getting most recent match. We can
> > +          throw out all previous work.  */
> >         .p2align 4
> > -L(no_c_found):
> > -       movl    $1, %esi
> > -       xorl    %ecx, %ecx
> > -       jmp     L(loop_header)
> > +L(second_loop_match):
> > +       movq    %rdi, %rsi
> > +       movaps  %xmm4, %xmm2
> > +       movaps  %xmm7, %xmm3
> >
> >         .p2align 4
> > -L(exit):
> > -       xorl    %eax, %eax
> > +L(second_loop):
> > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> > +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> > +       /* If SSE2 no pminud so wcsrchr needs seperate logic for
>                 Did you mean "Since", instead of "If"?
>
> > +          detecting zero. Note if this is found to be a bottleneck it
> > +          may be worth adding an SSE4.1 wcsrchr implementation.  */
> > +#ifdef USE_AS_WCSRCHR
> > +       movaps  %xmm5, %xmm6
> > +       pxor    %xmm8, %xmm8
> > +
> > +       PCMPEQ  %xmm8, %xmm5
> > +       PCMPEQ  %xmm4, %xmm8
> > +       por     %xmm5, %xmm8
> > +#else
> > +       movaps  %xmm5, %xmm6
> > +       PMINU   %xmm4, %xmm5
> > +#endif
> > +
> > +       movaps  %xmm4, %xmm9
> > +       PCMPEQ  %xmm0, %xmm4
> > +       PCMPEQ  %xmm0, %xmm6
> > +       movaps  %xmm6, %xmm7
> > +       por     %xmm4, %xmm6
> > +#ifndef USE_AS_WCSRCHR
> > +       pxor    %xmm8, %xmm8
> > +       PCMPEQ  %xmm5, %xmm8
> > +#endif
> > +
> > +       pmovmskb %xmm8, %ecx
> > +       pmovmskb %xmm6, %eax
> > +
> > +       addq    $(VEC_SIZE * 2), %rdi
> > +       /* Either null term or new occurence of CHAR.  */
> > +       addl    %ecx, %eax
> > +       jz      L(second_loop)
> > +
> > +       /* No null term so much be new occurence of CHAR.  */
> > +       testl   %ecx, %ecx
> > +       jz      L(second_loop_match)
> > +
> > +
> > +       subl    %ecx, %eax
> > +       jnz     L(second_loop_new_match)
> > +
> > +L(second_loop_old_match):
> > +       pmovmskb %xmm2, %ecx
> > +       pmovmskb %xmm3, %eax
> > +       sall    $16, %eax
> > +       orl     %ecx, %eax
> > +       bsrl    %eax, %eax
> > +       addq    %rsi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> >         ret
> >
> >         .p2align 4
> > +L(second_loop_new_match):
> > +       pxor    %xmm6, %xmm6
> > +       PCMPEQ  %xmm9, %xmm6
> > +       pmovmskb %xmm6, %eax
> > +       sall    $16, %ecx
> > +       orl     %eax, %ecx
> > +
> > +       /* We can't reuse either of the old comparisons as since we mask
> > +          of zeros after first zero (instead of using the full
> > +          comparison) we can't gurantee no interference between match
> > +          after end of string and valid match.  */
> > +       pmovmskb %xmm4, %eax
> > +       pmovmskb %xmm7, %edx
> > +       sall    $16, %edx
> > +       orl     %edx, %eax
> > +
> > +       leal    -1(%ecx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(second_loop_old_match)
> > +       bsrl    %eax, %eax
> > +       addq    %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> > +
> > +       .p2align 4,, 4
> >  L(cross_page):
> > -       movq    %rdi, %rax
> > -       pxor    %xmm0, %xmm0
> > -       andq    $-64, %rax
> > -       movdqu  (%rax), %xmm5
> > -       movdqa  %xmm5, %xmm6
> > -       movdqu  16(%rax), %xmm4
> > -       pcmpeqb %xmm1, %xmm5
> > -       pcmpeqb %xmm0, %xmm6
> > -       movdqu  32(%rax), %xmm3
> > -       pmovmskb        %xmm6, %esi
> > -       movdqa  %xmm4, %xmm6
> > -       movdqu  48(%rax), %xmm2
> > -       pcmpeqb %xmm1, %xmm4
> > -       pcmpeqb %xmm0, %xmm6
> > -       pmovmskb        %xmm6, %edx
> > -       movdqa  %xmm3, %xmm6
> > -       pcmpeqb %xmm1, %xmm3
> > -       pcmpeqb %xmm0, %xmm6
> > -       pcmpeqb %xmm2, %xmm0
> > -       salq    $16, %rdx
> > -       pmovmskb        %xmm3, %r9d
> > -       pmovmskb        %xmm6, %r8d
> > -       pmovmskb        %xmm0, %ecx
> > -       salq    $32, %r9
> > -       salq    $32, %r8
> > -       pcmpeqb %xmm1, %xmm2
> > -       orq     %r8, %rdx
> > -       salq    $48, %rcx
> > -       pmovmskb        %xmm5, %r8d
> > -       orq     %rsi, %rdx
> > -       pmovmskb        %xmm4, %esi
> > -       orq     %rcx, %rdx
> > -       pmovmskb        %xmm2, %ecx
> > -       salq    $16, %rsi
> > -       salq    $48, %rcx
> > -       orq     %r9, %rsi
> > -       orq     %r8, %rsi
> > -       orq     %rcx, %rsi
> > +       movq    %rdi, %rsi
> > +       andq    $-VEC_SIZE, %rsi
> > +       movaps  (%rsi), %xmm1
> > +       pxor    %xmm2, %xmm2
> > +       PCMPEQ  %xmm1, %xmm2
> > +       pmovmskb %xmm2, %edx
> >         movl    %edi, %ecx
> > -       subl    %eax, %ecx
> > -       shrq    %cl, %rdx
> > -       shrq    %cl, %rsi
> > -       testq   %rdx, %rdx
> > -       je      L(loop_header2)
> > -       leaq    -1(%rdx), %rax
> > -       xorq    %rdx, %rax
> > -       andq    %rax, %rsi
> > -       je      L(exit)
> > -       bsrq    %rsi, %rax
> > +       andl    $(VEC_SIZE - 1), %ecx
> > +       sarl    %cl, %edx
> > +       jz      L(cross_page_continue)
> > +       PCMPEQ  %xmm0, %xmm1
> > +       pmovmskb %xmm1, %eax
> > +       sarl    %cl, %eax
> > +       leal    -1(%rdx), %ecx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(ret1)
> > +       bsrl    %eax, %eax
> >         addq    %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +L(ret1):
> >         ret
> > -END (strrchr)
> > +END(STRRCHR)
> >
> > -weak_alias (strrchr, rindex)
> > -libc_hidden_builtin_def (strrchr)
> > +#ifndef USE_AS_WCSRCHR
> > +       weak_alias (STRRCHR, rindex)
> > +       libc_hidden_builtin_def (STRRCHR)
> > +#endif
> > diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> > index 61552954de..2b80efc5ef 100644
> > --- a/sysdeps/x86_64/wcsrchr.S
> > +++ b/sysdeps/x86_64/wcsrchr.S
> > @@ -1,4 +1,4 @@
> > -/* wcsrchr with SSSE3
> > +/* wcsrchr optimized with SSE2.
> >     Copyright (C) 2011-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> > @@ -16,266 +16,12 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#include <sysdep.h>
> >
> > -       .text
> > -ENTRY (wcsrchr)
> > +#define USE_AS_WCSRCHR 1
> > +#define NO_PMINU       1
> >
> > -       movd    %rsi, %xmm1
> > -       mov     %rdi, %rcx
> > -       punpckldq %xmm1, %xmm1
> > -       pxor    %xmm2, %xmm2
> > -       punpckldq %xmm1, %xmm1
> > -       and     $63, %rcx
> > -       cmp     $48, %rcx
> > -       ja      L(crosscache)
> > +#ifndef STRRCHR
> > +# define STRRCHR       wcsrchr
> > +#endif
> >
> > -       movdqu  (%rdi), %xmm0
> > -       pcmpeqd %xmm0, %xmm2
> > -       pcmpeqd %xmm1, %xmm0
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm0, %rax
> > -       add     $16, %rdi
> > -
> > -       test    %rax, %rax
> > -       jnz     L(unaligned_match1)
> > -
> > -       test    %rcx, %rcx
> > -       jnz     L(return_null)
> > -
> > -       and     $-16, %rdi
> > -       xor     %r8, %r8
> > -       jmp     L(loop)
> > -
> > -       .p2align 4
> > -L(unaligned_match1):
> > -       test    %rcx, %rcx
> > -       jnz     L(prolog_find_zero_1)
> > -
> > -       mov     %rax, %r8
> > -       mov     %rdi, %rsi
> > -       and     $-16, %rdi
> > -       jmp     L(loop)
> > -
> > -       .p2align 4
> > -L(crosscache):
> > -       and     $15, %rcx
> > -       and     $-16, %rdi
> > -       pxor    %xmm3, %xmm3
> > -       movdqa  (%rdi), %xmm0
> > -       pcmpeqd %xmm0, %xmm3
> > -       pcmpeqd %xmm1, %xmm0
> > -       pmovmskb %xmm3, %rdx
> > -       pmovmskb %xmm0, %rax
> > -       shr     %cl, %rdx
> > -       shr     %cl, %rax
> > -       add     $16, %rdi
> > -
> > -       test    %rax, %rax
> > -       jnz     L(unaligned_match)
> > -
> > -       test    %rdx, %rdx
> > -       jnz     L(return_null)
> > -
> > -       xor     %r8, %r8
> > -       jmp     L(loop)
> > -
> > -       .p2align 4
> > -L(unaligned_match):
> > -       test    %rdx, %rdx
> > -       jnz     L(prolog_find_zero)
> > -
> > -       mov     %rax, %r8
> > -       lea     (%rdi, %rcx), %rsi
> > -
> > -/* Loop start on aligned string.  */
> > -       .p2align 4
> > -L(loop):
> > -       movdqa  (%rdi), %xmm0
> > -       pcmpeqd %xmm0, %xmm2
> > -       add     $16, %rdi
> > -       pcmpeqd %xmm1, %xmm0
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm0, %rax
> > -       or      %rax, %rcx
> > -       jnz     L(matches)
> > -
> > -       movdqa  (%rdi), %xmm3
> > -       pcmpeqd %xmm3, %xmm2
> > -       add     $16, %rdi
> > -       pcmpeqd %xmm1, %xmm3
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm3, %rax
> > -       or      %rax, %rcx
> > -       jnz     L(matches)
> > -
> > -       movdqa  (%rdi), %xmm4
> > -       pcmpeqd %xmm4, %xmm2
> > -       add     $16, %rdi
> > -       pcmpeqd %xmm1, %xmm4
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm4, %rax
> > -       or      %rax, %rcx
> > -       jnz     L(matches)
> > -
> > -       movdqa  (%rdi), %xmm5
> > -       pcmpeqd %xmm5, %xmm2
> > -       add     $16, %rdi
> > -       pcmpeqd %xmm1, %xmm5
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm5, %rax
> > -       or      %rax, %rcx
> > -       jz      L(loop)
> > -
> > -       .p2align 4
> > -L(matches):
> > -       test    %rax, %rax
> > -       jnz     L(match)
> > -L(return_value):
> > -       test    %r8, %r8
> > -       jz      L(return_null)
> > -       mov     %r8, %rax
> > -       mov     %rsi, %rdi
> > -
> > -       test    $15 << 4, %ah
> > -       jnz     L(match_fourth_wchar)
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(match):
> > -       pmovmskb %xmm2, %rcx
> > -       test    %rcx, %rcx
> > -       jnz     L(find_zero)
> > -       mov     %rax, %r8
> > -       mov     %rdi, %rsi
> > -       jmp     L(loop)
> > -
> > -       .p2align 4
> > -L(find_zero):
> > -       test    $15, %cl
> > -       jnz     L(find_zero_in_first_wchar)
> > -       test    %cl, %cl
> > -       jnz     L(find_zero_in_second_wchar)
> > -       test    $15, %ch
> > -       jnz     L(find_zero_in_third_wchar)
> > -
> > -       and     $1 << 13 - 1, %rax
> > -       jz      L(return_value)
> > -
> > -       test    $15 << 4, %ah
> > -       jnz     L(match_fourth_wchar)
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(find_zero_in_first_wchar):
> > -       test    $1, %rax
> > -       jz      L(return_value)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(find_zero_in_second_wchar):
> > -       and     $1 << 5 - 1, %rax
> > -       jz      L(return_value)
> > -
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(find_zero_in_third_wchar):
> > -       and     $1 << 9 - 1, %rax
> > -       jz      L(return_value)
> > -
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(prolog_find_zero):
> > -       add     %rcx, %rdi
> > -       mov     %rdx, %rcx
> > -L(prolog_find_zero_1):
> > -       test    $15, %cl
> > -       jnz     L(prolog_find_zero_in_first_wchar)
> > -       test    %cl, %cl
> > -       jnz     L(prolog_find_zero_in_second_wchar)
> > -       test    $15, %ch
> > -       jnz     L(prolog_find_zero_in_third_wchar)
> > -
> > -       and     $1 << 13 - 1, %rax
> > -       jz      L(return_null)
> > -
> > -       test    $15 << 4, %ah
> > -       jnz     L(match_fourth_wchar)
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(prolog_find_zero_in_first_wchar):
> > -       test    $1, %rax
> > -       jz      L(return_null)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(prolog_find_zero_in_second_wchar):
> > -       and     $1 << 5 - 1, %rax
> > -       jz      L(return_null)
> > -
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(prolog_find_zero_in_third_wchar):
> > -       and     $1 << 9 - 1, %rax
> > -       jz      L(return_null)
> > -
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(match_second_wchar):
> > -       lea     -12(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(match_third_wchar):
> > -       lea     -8(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(match_fourth_wchar):
> > -       lea     -4(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(return_null):
> > -       xor     %rax, %rax
> > -       ret
> > -
> > -END (wcsrchr)
> > +#include "../strrchr.S"
> > --
> > 2.25.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/4] benchtests: Improve bench-strrchr
  2022-04-22  1:52 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-04-22  1:52   ` [PATCH v3 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
@ 2022-04-22 18:29   ` H.J. Lu
  2022-04-22 19:12     ` Noah Goldstein
  3 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-22 18:29 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1. Use json-lib for printing results.
> 2. Expose all parameters (before pos, seek_char, and max_char where
>    not printed).
> 3. Add benchmarks that test multiple occurence of seek_char in the
>    string.
> ---
>  benchtests/bench-strrchr.c | 124 ++++++++++++++++++++++++-------------
>  1 file changed, 80 insertions(+), 44 deletions(-)
>
> diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> index abdae60c51..ce4307a098 100644
> --- a/benchtests/bench-strrchr.c
> +++ b/benchtests/bench-strrchr.c
> @@ -23,6 +23,7 @@
>  # define TEST_NAME "strrchr"
>  #endif
>  #include "bench-string.h"
> +#include "json-lib.h"
>
>  #define BIG_CHAR MAX_CHAR
>
> @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
>  }
>
>  static void
> -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> +            CHAR *exp_res)
>  {
>    CHAR *res = CALL (impl, s, c);
>    size_t i, iters = INNER_LOOP_ITERS8;
> @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
>
>    if (res != exp_res)
>      {
> -      error (0, 0, "Wrong result in function %s %p %p", impl->name,
> -            res, exp_res);
> +      error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
> +            exp_res);
>        ret = 1;
>        return;
>      }
> @@ -73,23 +75,25 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
>        CALL (impl, s, c);
>      }
>    TIMING_NOW (stop);
> -
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> +        int seek_char, int max_char, size_t freq)
>  /* For wcsrchr: align here means align not in bytes,
>     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
>     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
>  {
>    size_t i;
> +  size_t pos_chunk_sz = freq ? (pos / freq) : pos;
> +  size_t last_pos = len;
>    CHAR *result;
>    CHAR *buf = (CHAR *) buf1;
>
> -  align &= 7;
> +  align &= (getpagesize () - 1);

Should we add some tests for page boundary cross?

>    if ((align + len) * sizeof (CHAR) >= page_size)
>      return;
>
> @@ -103,6 +107,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
>        if ((i > pos || pos >= len) && buf[align + i] == seek_char)
>         buf[align + i] = seek_char + 10 + (random () & 15);
>      }
> +
> +  if (pos_chunk_sz == 0 && pos)
> +    pos_chunk_sz = 1;
> +
> +  for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
> +    {
> +      buf[align + i] = seek_char;
> +      last_pos = i;
> +    }
> +
>    buf[align + len] = 0;
>
>    if (pos < len)
> @@ -110,66 +124,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
>        buf[align + pos] = seek_char;
>        result = (CHAR *) (buf + align + pos);
>      }
> +  else if (last_pos < len)
> +    result = (CHAR *) (buf + align + last_pos);
>    else if (seek_char == 0)
>      result = (CHAR *) (buf + align + len);
>    else
>      result = NULL;
>
> -  printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "len", len);
> +  json_attr_uint (json_ctx, "pos", pos);
> +  json_attr_uint (json_ctx, "align", align);
> +  json_attr_uint (json_ctx, "freq", freq);
> +  json_attr_uint (json_ctx, "seek", seek_char);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
> +    do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  test_main (void)
>  {
> -  size_t i;
> +  json_ctx_t json_ctx;
> +  size_t i, j;
> +  int seek;
>
>    test_init ();
> +  json_init (&json_ctx, 0, stdout);
>
> -  printf ("%20s", "");
> -  FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> -
> -  for (i = 1; i < 8; ++i)
> -    {
> -      do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
> -      do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
> -    }
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
>
> -  for (i = 1; i < 8; ++i)
> -    {
> -      do_test (i, 64, 256, 23, SMALL_CHAR);
> -      do_test (i, 64, 256, 23, BIG_CHAR);
> -    }
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
>
> -  for (i = 0; i < 32; ++i)
> -    {
> -      do_test (0, i, i + 1, 23, SMALL_CHAR);
> -      do_test (0, i, i + 1, 23, BIG_CHAR);
> -    }
> +  json_array_begin (&json_ctx, "ifuncs");
> +  FOR_EACH_IMPL (impl, 0)
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
>
> -  for (i = 1; i < 8; ++i)
> -    {
> -      do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
> -      do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
> -    }
> +  json_array_begin (&json_ctx, "results");
>
> -  for (i = 1; i < 8; ++i)
> +  for (seek = 0; seek <= 23; seek += 23)
>      {
> -      do_test (i, 64, 256, 0, SMALL_CHAR);
> -      do_test (i, 64, 256, 0, BIG_CHAR);
> +      for (j = 1; j < 32; j += j)
> +       {
> +         for (i = 1; i < 9; ++i)
> +           {
> +             do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
> +           }
> +
> +         for (i = 1; i < 8; ++i)
> +           {
> +             do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
> +
> +             do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
> +           }
> +
> +         for (i = 0; i < 32; ++i)
> +           {
> +             do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
> +           }
> +         if (seek == 0)
> +           {
> +             break;
> +           }
> +       }
>      }
>
> -  for (i = 0; i < 32; ++i)
> -    {
> -      do_test (0, i, i + 1, 0, SMALL_CHAR);
> -      do_test (0, i, i + 1, 0, BIG_CHAR);
> -    }
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
>
>    return ret;
>  }
> --
> 2.25.1
>
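
Side note: with this scheme each do_test call in the sweep above becomes
one element of the "results" array, with one entry in "timings" per
ifunc listed in "ifuncs"; roughly (field names from the patch, numeric
values invented purely for illustration):

  { "len": 256, "pos": 64, "align": 1, "freq": 4, "seek": 23,
    "max_char": 127, "timings": [1234.5, 1101.2] }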


-- 
H.J.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2
  2022-04-22  1:52   ` [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
@ 2022-04-22 19:03     ` H.J. Lu
  2022-05-12 20:14       ` Sunil Pandey
  0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-22 19:03 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code unrolls the main loop slightly without adding too much
> overhead and minimizes the comparisons for the search CHAR.
>
> Geometric Mean of all benchmarks New / Old: 0.832
> See email for all results.
>
> Full xcheck passes on x86_64 with and without multiarch enabled.
> ---
>  sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
>  1 file changed, 269 insertions(+), 157 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
> index 1df2adfad0..bd26ba80d5 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
> @@ -27,9 +27,13 @@
>  # ifdef USE_AS_WCSRCHR
>  #  define VPBROADCAST  vpbroadcastd
>  #  define VPCMPEQ      vpcmpeqd
> +#  define VPMIN        vpminud
> +#  define CHAR_SIZE    4
>  # else
>  #  define VPBROADCAST  vpbroadcastb
>  #  define VPCMPEQ      vpcmpeqb
> +#  define VPMIN        vpminub
> +#  define CHAR_SIZE    1
>  # endif
>
>  # ifndef VZEROUPPER
> @@ -41,196 +45,304 @@
>  # endif
>
>  # define VEC_SIZE      32
> +# define PAGE_SIZE     4096
>
> -       .section SECTION(.text),"ax",@progbits
> -ENTRY (STRRCHR)
> -       movd    %esi, %xmm4
> -       movl    %edi, %ecx
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY(STRRCHR)
> +       movd    %esi, %xmm7
> +       movl    %edi, %eax
>         /* Broadcast CHAR to YMM4.  */
> -       VPBROADCAST %xmm4, %ymm4
> +       VPBROADCAST %xmm7, %ymm7
>         vpxor   %xmm0, %xmm0, %xmm0
>
> -       /* Check if we may cross page boundary with one vector load.  */
> -       andl    $(2 * VEC_SIZE - 1), %ecx
> -       cmpl    $VEC_SIZE, %ecx
> -       ja      L(cros_page_boundary)
> +       /* Shift here instead of `andl` to save code size (saves a fetch
> +          block).  */
> +       sall    $20, %eax
> +       cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> +       ja      L(cross_page)
>
> +L(page_cross_continue):
>         vmovdqu (%rdi), %ymm1
> -       VPCMPEQ %ymm1, %ymm0, %ymm2
> -       VPCMPEQ %ymm1, %ymm4, %ymm3
> -       vpmovmskb %ymm2, %ecx
> -       vpmovmskb %ymm3, %eax
> -       addq    $VEC_SIZE, %rdi
> +       /* Check end of string match.  */
> +       VPCMPEQ %ymm1, %ymm0, %ymm6
> +       vpmovmskb %ymm6, %ecx
> +       testl   %ecx, %ecx
> +       jz      L(aligned_more)
> +
> +       /* Only check match with search CHAR if needed.  */
> +       VPCMPEQ %ymm1, %ymm7, %ymm1
> +       vpmovmskb %ymm1, %eax
> +       /* Check if match before first zero.  */
> +       blsmskl %ecx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(ret0)
> +       bsrl    %eax, %eax
> +       addq    %rdi, %rax
> +       /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> +          search CHAR is zero we are correct. Either way `andq
> +          -CHAR_SIZE, %rax` gets the correct result.  */
> +# ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +# endif
> +L(ret0):
> +L(return_vzeroupper):
> +       ZERO_UPPER_VEC_REGISTERS_RETURN
> +
> +       /* Returns for first vec x1/x2 have hard coded backward search
> +          path for earlier matches.  */
> +       .p2align 4,, 10
> +L(first_vec_x1):
> +       VPCMPEQ %ymm2, %ymm7, %ymm6
> +       vpmovmskb %ymm6, %eax
> +       blsmskl %ecx, %ecx
> +       andl    %ecx, %eax
> +       jnz     L(first_vec_x1_return)
> +
> +       .p2align 4,, 4
> +L(first_vec_x0_test):
> +       VPCMPEQ %ymm1, %ymm7, %ymm6
> +       vpmovmskb %ymm6, %eax
> +       testl   %eax, %eax
> +       jz      L(ret1)
> +       bsrl    %eax, %eax
> +       addq    %r8, %rax
> +# ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +# endif
> +L(ret1):
> +       VZEROUPPER_RETURN
>
> +       .p2align 4,, 10
> +L(first_vec_x0_x1_test):
> +       VPCMPEQ %ymm2, %ymm7, %ymm6
> +       vpmovmskb %ymm6, %eax
> +       /* Check ymm2 for search CHAR match. If no match then check ymm1
> +          before returning.  */
>         testl   %eax, %eax
> -       jnz     L(first_vec)
> +       jz      L(first_vec_x0_test)
> +       .p2align 4,, 4
> +L(first_vec_x1_return):
> +       bsrl    %eax, %eax
> +       leaq    1(%rdi, %rax), %rax
> +# ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +# endif
> +       VZEROUPPER_RETURN
>
> -       testl   %ecx, %ecx
> -       jnz     L(return_null)
>
> -       andq    $-VEC_SIZE, %rdi
> -       xorl    %edx, %edx
> -       jmp     L(aligned_loop)
> +       .p2align 4,, 10
> +L(first_vec_x2):
> +       VPCMPEQ %ymm3, %ymm7, %ymm6
> +       vpmovmskb %ymm6, %eax
> +       blsmskl %ecx, %ecx
> +       /* If no in-range search CHAR match in ymm3 then need to check
> +          ymm1/ymm2 for an earlier match (we delay checking search
> +          CHAR matches until needed).  */
> +       andl    %ecx, %eax
> +       jz      L(first_vec_x0_x1_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE + 1)(%rdi, %rax), %rax
> +# ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +# endif
> +       VZEROUPPER_RETURN
> +
>
>         .p2align 4
> -L(first_vec):
> -       /* Check if there is a nul CHAR.  */
> +L(aligned_more):
> +       /* Save original pointer if match was in VEC 0.  */
> +       movq    %rdi, %r8
> +
> +       /* Align src.  */
> +       orq     $(VEC_SIZE - 1), %rdi
> +       vmovdqu 1(%rdi), %ymm2
> +       VPCMPEQ %ymm2, %ymm0, %ymm6
> +       vpmovmskb %ymm6, %ecx
>         testl   %ecx, %ecx
> -       jnz     L(char_and_nul_in_first_vec)
> +       jnz     L(first_vec_x1)
>
> -       /* Remember the match and keep searching.  */
> -       movl    %eax, %edx
> -       movq    %rdi, %rsi
> -       andq    $-VEC_SIZE, %rdi
> -       jmp     L(aligned_loop)
> +       vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
> +       VPCMPEQ %ymm3, %ymm0, %ymm6
> +       vpmovmskb %ymm6, %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x2)
>
> +       /* Save pointer again before realigning.  */
> +       movq    %rdi, %rsi
> +       addq    $(VEC_SIZE + 1), %rdi
> +       andq    $-(VEC_SIZE * 2), %rdi
>         .p2align 4
> -L(cros_page_boundary):
> -       andl    $(VEC_SIZE - 1), %ecx
> -       andq    $-VEC_SIZE, %rdi
> -       vmovdqa (%rdi), %ymm1
> -       VPCMPEQ %ymm1, %ymm0, %ymm2
> -       VPCMPEQ %ymm1, %ymm4, %ymm3
> -       vpmovmskb %ymm2, %edx
> -       vpmovmskb %ymm3, %eax
> -       shrl    %cl, %edx
> -       shrl    %cl, %eax
> -       addq    $VEC_SIZE, %rdi
> -
> -       /* Check if there is a CHAR.  */
> +L(first_aligned_loop):
> +       /* Do 2x VEC at a time. Any more and the cost of finding the
> +          match outweighs the loop benefit.  */
> +       vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
> +       vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
> +
> +       VPCMPEQ %ymm4, %ymm7, %ymm6
> +       VPMIN   %ymm4, %ymm5, %ymm8
> +       VPCMPEQ %ymm5, %ymm7, %ymm10
> +       vpor    %ymm6, %ymm10, %ymm5
> +       VPCMPEQ %ymm8, %ymm0, %ymm8
> +       vpor    %ymm5, %ymm8, %ymm9
> +
> +       vpmovmskb %ymm9, %eax
> +       addq    $(VEC_SIZE * 2), %rdi
> +       /* No zero or search CHAR.  */
>         testl   %eax, %eax
> -       jnz     L(found_char)
> -
> -       testl   %edx, %edx
> -       jnz     L(return_null)
> +       jz      L(first_aligned_loop)
>
> -       jmp     L(aligned_loop)
> -
> -       .p2align 4
> -L(found_char):
> -       testl   %edx, %edx
> -       jnz     L(char_and_nul)
> +       /* If no zero CHAR then go to second loop (this allows us to
> +          throw away all prior work).  */
> +       vpmovmskb %ymm8, %ecx
> +       testl   %ecx, %ecx
> +       jz      L(second_aligned_loop_prep)
>
> -       /* Remember the match and keep searching.  */
> -       movl    %eax, %edx
> -       leaq    (%rdi, %rcx), %rsi
> +       /* Search char could be zero so we need to get the true match.
> +        */
> +       vpmovmskb %ymm5, %eax
> +       testl   %eax, %eax
> +       jnz     L(first_aligned_loop_return)
>
> -       .p2align 4
> -L(aligned_loop):
> -       vmovdqa (%rdi), %ymm1
> -       VPCMPEQ %ymm1, %ymm0, %ymm2
> -       addq    $VEC_SIZE, %rdi
> -       VPCMPEQ %ymm1, %ymm4, %ymm3
> -       vpmovmskb %ymm2, %ecx
> -       vpmovmskb %ymm3, %eax
> -       orl     %eax, %ecx
> -       jnz     L(char_nor_null)
> -
> -       vmovdqa (%rdi), %ymm1
> -       VPCMPEQ %ymm1, %ymm0, %ymm2
> -       add     $VEC_SIZE, %rdi
> -       VPCMPEQ %ymm1, %ymm4, %ymm3
> -       vpmovmskb %ymm2, %ecx
> +       .p2align 4,, 4
> +L(first_vec_x1_or_x2):
> +       VPCMPEQ %ymm3, %ymm7, %ymm3
> +       VPCMPEQ %ymm2, %ymm7, %ymm2
>         vpmovmskb %ymm3, %eax
> -       orl     %eax, %ecx
> -       jnz     L(char_nor_null)
> -
> -       vmovdqa (%rdi), %ymm1
> -       VPCMPEQ %ymm1, %ymm0, %ymm2
> -       addq    $VEC_SIZE, %rdi
> -       VPCMPEQ %ymm1, %ymm4, %ymm3
> -       vpmovmskb %ymm2, %ecx
> -       vpmovmskb %ymm3, %eax
> -       orl     %eax, %ecx
> -       jnz     L(char_nor_null)
> -
> -       vmovdqa (%rdi), %ymm1
> -       VPCMPEQ %ymm1, %ymm0, %ymm2
> -       addq    $VEC_SIZE, %rdi
> -       VPCMPEQ %ymm1, %ymm4, %ymm3
> -       vpmovmskb %ymm2, %ecx
> -       vpmovmskb %ymm3, %eax
> -       orl     %eax, %ecx
> -       jz      L(aligned_loop)
> -
> -       .p2align 4
> -L(char_nor_null):
> -       /* Find a CHAR or a nul CHAR in a loop.  */
> -       testl   %eax, %eax
> -       jnz     L(match)
> -L(return_value):
> -       testl   %edx, %edx
> -       jz      L(return_null)
> -       movl    %edx, %eax
> -       movq    %rsi, %rdi
> +       vpmovmskb %ymm2, %edx
> +       /* Use add for macro-fusion.  */
> +       addq    %rax, %rdx
> +       jz      L(first_vec_x0_test)
> +       /* NB: We could move this shift to before the branch and save a
> +          bit of code size / performance on the fall through. The
> +          branch leads to the null case which generally seems hotter
> +          than char in first 3x VEC.  */
> +       salq    $32, %rax
> +       addq    %rdx, %rax
> +       bsrq    %rax, %rax
> +       leaq    1(%rsi, %rax), %rax
> +# ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +# endif
> +       VZEROUPPER_RETURN
>
> +       .p2align 4,, 8
> +L(first_aligned_loop_return):
> +       VPCMPEQ %ymm4, %ymm0, %ymm4
> +       vpmovmskb %ymm4, %edx
> +       salq    $32, %rcx
> +       orq     %rdx, %rcx
> +
> +       vpmovmskb %ymm10, %eax
> +       vpmovmskb %ymm6, %edx
> +       salq    $32, %rax
> +       orq     %rdx, %rax
> +       blsmskq %rcx, %rcx
> +       andq    %rcx, %rax
> +       jz      L(first_vec_x1_or_x2)
> +
> +       bsrq    %rax, %rax
> +       leaq    -(VEC_SIZE * 2)(%rdi, %rax), %rax
>  # ifdef USE_AS_WCSRCHR
> -       /* Keep the first bit for each matching CHAR for bsr.  */
> -       andl    $0x11111111, %eax
> +       andq    $-CHAR_SIZE, %rax
>  # endif
> -       bsrl    %eax, %eax
> -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> -L(return_vzeroupper):
> -       ZERO_UPPER_VEC_REGISTERS_RETURN
> +       VZEROUPPER_RETURN
>
> +       /* Search char cannot be zero.  */
>         .p2align 4
> -L(match):
> -       /* Find a CHAR.  Check if there is a nul CHAR.  */
> -       vpmovmskb %ymm2, %ecx
> -       testl   %ecx, %ecx
> -       jnz     L(find_nul)
> -
> -       /* Remember the match and keep searching.  */
> -       movl    %eax, %edx
> +L(second_aligned_loop_set_furthest_match):
> +       /* Save VEC and pointer from most recent match.  */
> +L(second_aligned_loop_prep):
>         movq    %rdi, %rsi
> -       jmp     L(aligned_loop)
> +       vmovdqu %ymm6, %ymm2
> +       vmovdqu %ymm10, %ymm3
>
>         .p2align 4
> -L(find_nul):
> -# ifdef USE_AS_WCSRCHR
> -       /* Keep the first bit for each matching CHAR for bsr.  */
> -       andl    $0x11111111, %ecx
> -       andl    $0x11111111, %eax
> -# endif
> -       /* Mask out any matching bits after the nul CHAR.  */
> -       movl    %ecx, %r8d
> -       subl    $1, %r8d
> -       xorl    %ecx, %r8d
> -       andl    %r8d, %eax
> +L(second_aligned_loop):
> +       /* Search 2x at at time.  */
> +       vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
> +       vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
> +
> +       VPCMPEQ %ymm4, %ymm7, %ymm6
> +       VPMIN   %ymm4, %ymm5, %ymm1
> +       VPCMPEQ %ymm5, %ymm7, %ymm10
> +       vpor    %ymm6, %ymm10, %ymm5
> +       VPCMPEQ %ymm1, %ymm0, %ymm1
> +       vpor    %ymm5, %ymm1, %ymm9
> +
> +       vpmovmskb %ymm9, %eax
> +       addq    $(VEC_SIZE * 2), %rdi
>         testl   %eax, %eax
> -       /* If there is no CHAR here, return the remembered one.  */
> -       jz      L(return_value)
> -       bsrl    %eax, %eax
> -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(char_and_nul):
> -       /* Find both a CHAR and a nul CHAR.  */
> -       addq    %rcx, %rdi
> -       movl    %edx, %ecx
> -L(char_and_nul_in_first_vec):
> -# ifdef USE_AS_WCSRCHR
> -       /* Keep the first bit for each matching CHAR for bsr.  */
> -       andl    $0x11111111, %ecx
> -       andl    $0x11111111, %eax
> -# endif
> -       /* Mask out any matching bits after the nul CHAR.  */
> -       movl    %ecx, %r8d
> -       subl    $1, %r8d
> -       xorl    %ecx, %r8d
> -       andl    %r8d, %eax
> +       jz      L(second_aligned_loop)
> +       vpmovmskb %ymm1, %ecx
> +       testl   %ecx, %ecx
> +       jz      L(second_aligned_loop_set_furthest_match)
> +       vpmovmskb %ymm5, %eax
>         testl   %eax, %eax
> -       /* Return null pointer if the nul CHAR comes first.  */
> -       jz      L(return_null)
> -       bsrl    %eax, %eax
> -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> +       jnz     L(return_new_match)
> +
> +       /* This is the hot path. We know CHAR is in bounds and that
> +          ymm3/ymm2 have the latest match.  */
> +       .p2align 4,, 4
> +L(return_old_match):
> +       vpmovmskb %ymm3, %eax
> +       vpmovmskb %ymm2, %edx
> +       salq    $32, %rax
> +       orq     %rdx, %rax
> +       bsrq    %rax, %rax
> +       /* Search char cannot be zero so safe to just use lea for
> +          wcsrchr.  */
> +       leaq    (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
>         VZEROUPPER_RETURN
>
> -       .p2align 4
> -L(return_null):
> -       xorl    %eax, %eax
> +       /* Last iteration also potentially has a match.  */
> +       .p2align 4,, 8
> +L(return_new_match):
> +       VPCMPEQ %ymm4, %ymm0, %ymm4
> +       vpmovmskb %ymm4, %edx
> +       salq    $32, %rcx
> +       orq     %rdx, %rcx
> +
> +       vpmovmskb %ymm10, %eax
> +       vpmovmskb %ymm6, %edx
> +       salq    $32, %rax
> +       orq     %rdx, %rax
> +       blsmskq %rcx, %rcx
> +       andq    %rcx, %rax
> +       jz      L(return_old_match)
> +       bsrq    %rax, %rax
> +       /* Search char cannot be zero so safe to just use lea for
> +          wcsrchr.  */
> +       leaq    (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
>         VZEROUPPER_RETURN
>
> -END (STRRCHR)
> +       .p2align 4,, 4
> +L(cross_page):
> +       movq    %rdi, %rsi
> +       andq    $-VEC_SIZE, %rsi
> +       vmovdqu (%rsi), %ymm1
> +       VPCMPEQ %ymm1, %ymm0, %ymm6
> +       vpmovmskb %ymm6, %ecx
> +       /* Shift out zero CHAR matches that are before the beginning of
> +          src (rdi).  */
> +       shrxl   %edi, %ecx, %ecx
> +       testl   %ecx, %ecx
> +       jz      L(page_cross_continue)
> +       VPCMPEQ %ymm1, %ymm7, %ymm1
> +       vpmovmskb %ymm1, %eax
> +
> +       /* Shift out search CHAR matches that are before the beginning of
> +          src (rdi).  */
> +       shrxl   %edi, %eax, %eax
> +       blsmskl %ecx, %ecx
> +       /* Check if any search CHAR match in range.  */
> +       andl    %ecx, %eax
> +       jz      L(ret2)
> +       bsrl    %eax, %eax
> +       addq    %rdi, %rax
> +# ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +# endif
> +L(ret2):
> +       VZEROUPPER_RETURN
> +END(STRRCHR)
>  #endif
> --
> 2.25.1
>
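
Side note on the match selection: the blsmsk/and/bsr sequence that shows
up in all of the return paths above is easier to see in C.  A rough
sketch for a single 32-byte vector, using illustrative names rather than
anything from the patch (GCC builtins stand in for bsr):

#include <stdint.h>
#include <stddef.h>

/* bit i of zero_mask is set if byte i is NUL, bit i of char_mask is set
   if byte i equals the search CHAR (i.e. the two vpmovmskb results).
   Return the right-most in-bounds match in this vector, or NULL.  */
static inline const char *
last_match_in_vec (const char *vec_base, uint32_t char_mask,
                   uint32_t zero_mask)
{
  /* blsmsk: all bits up to and including the first NUL; all ones when
     the vector has no NUL at all.  */
  uint32_t in_bounds = zero_mask ^ (zero_mask - 1);
  char_mask &= in_bounds;
  if (char_mask == 0)
    return NULL;
  /* bsr: highest set bit == match closest to the end of the string.  */
  return vec_base + (31 - __builtin_clz (char_mask));
}

That is all the `blsmskl %ecx, %ecx; andl %ecx, %eax; jz ...; bsrl`
blocks are doing.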

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 4/4] x86: Optimize {str|wcs}rchr-evex
  2022-04-22  1:52   ` [PATCH v3 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
@ 2022-04-22 19:04     ` H.J. Lu
  2022-05-12 20:16       ` Sunil Pandey
  0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-22 19:04 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code unrolls the main loop slightly without adding too much
> overhead and minimizes the comparisons for the search CHAR.
>
> Geometric Mean of all benchmarks New / Old: 0.755
> See email for all results.
>
> Full xcheck passes on x86_64 with and without multiarch enabled.
> ---
>  sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
>  1 file changed, 290 insertions(+), 181 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
> index adeddaed32..8014c285b3 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
> @@ -24,242 +24,351 @@
>  #  define STRRCHR      __strrchr_evex
>  # endif
>
> -# define VMOVU         vmovdqu64
> -# define VMOVA         vmovdqa64
> +# define VMOVU vmovdqu64
> +# define VMOVA vmovdqa64
>
>  # ifdef USE_AS_WCSRCHR
> +#  define SHIFT_REG    esi
> +
> +#  define kunpck       kunpckbw
> +#  define kmov_2x      kmovd
> +#  define maskz_2x     ecx
> +#  define maskm_2x     eax
> +#  define CHAR_SIZE    4
> +#  define VPMIN        vpminud
> +#  define VPTESTN      vptestnmd
>  #  define VPBROADCAST  vpbroadcastd
> -#  define VPCMP                vpcmpd
> -#  define SHIFT_REG    r8d
> +#  define VPCMP        vpcmpd
>  # else
> +#  define SHIFT_REG    edi
> +
> +#  define kunpck       kunpckdq
> +#  define kmov_2x      kmovq
> +#  define maskz_2x     rcx
> +#  define maskm_2x     rax
> +
> +#  define CHAR_SIZE    1
> +#  define VPMIN        vpminub
> +#  define VPTESTN      vptestnmb
>  #  define VPBROADCAST  vpbroadcastb
> -#  define VPCMP                vpcmpb
> -#  define SHIFT_REG    ecx
> +#  define VPCMP        vpcmpb
>  # endif
>
>  # define XMMZERO       xmm16
>  # define YMMZERO       ymm16
>  # define YMMMATCH      ymm17
> -# define YMM1          ymm18
> +# define YMMSAVE       ymm18
> +
> +# define YMM1  ymm19
> +# define YMM2  ymm20
> +# define YMM3  ymm21
> +# define YMM4  ymm22
> +# define YMM5  ymm23
> +# define YMM6  ymm24
> +# define YMM7  ymm25
> +# define YMM8  ymm26
>
> -# define VEC_SIZE      32
>
> -       .section .text.evex,"ax",@progbits
> -ENTRY (STRRCHR)
> -       movl    %edi, %ecx
> +# define VEC_SIZE      32
> +# define PAGE_SIZE     4096
> +       .section .text.evex, "ax", @progbits
> +ENTRY(STRRCHR)
> +       movl    %edi, %eax
>         /* Broadcast CHAR to YMMMATCH.  */
>         VPBROADCAST %esi, %YMMMATCH
>
> -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> -
> -       /* Check if we may cross page boundary with one vector load.  */
> -       andl    $(2 * VEC_SIZE - 1), %ecx
> -       cmpl    $VEC_SIZE, %ecx
> -       ja      L(cros_page_boundary)
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       jg      L(cross_page_boundary)
>
> +L(page_cross_continue):
>         VMOVU   (%rdi), %YMM1
> -
> -       /* Each bit in K0 represents a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> +       /* k0 has a 1 for each zero CHAR in YMM1.  */
> +       VPTESTN %YMM1, %YMM1, %k0
>         kmovd   %k0, %ecx
> -       kmovd   %k1, %eax
> -
> -       addq    $VEC_SIZE, %rdi
> -
> -       testl   %eax, %eax
> -       jnz     L(first_vec)
> -
>         testl   %ecx, %ecx
> -       jnz     L(return_null)
> -
> -       andq    $-VEC_SIZE, %rdi
> -       xorl    %edx, %edx
> -       jmp     L(aligned_loop)
> -
> -       .p2align 4
> -L(first_vec):
> -       /* Check if there is a null byte.  */
> -       testl   %ecx, %ecx
> -       jnz     L(char_and_nul_in_first_vec)
> -
> -       /* Remember the match and keep searching.  */
> -       movl    %eax, %edx
> -       movq    %rdi, %rsi
> -       andq    $-VEC_SIZE, %rdi
> -       jmp     L(aligned_loop)
> -
> -       .p2align 4
> -L(cros_page_boundary):
> -       andl    $(VEC_SIZE - 1), %ecx
> -       andq    $-VEC_SIZE, %rdi
> +       jz      L(aligned_more)
> +       /* fallthrough: zero CHAR in first VEC.  */
>
> +       /* K1 has a 1 for each search CHAR match in YMM1.  */
> +       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> +       kmovd   %k1, %eax
> +       /* Build mask up until first zero CHAR (used to mask off
> +          potential search CHAR matches past the end of the string).
> +        */
> +       blsmskl %ecx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(ret0)
> +       /* Get last match (the `andl` removed any out of bounds
> +          matches).  */
> +       bsrl    %eax, %eax
>  # ifdef USE_AS_WCSRCHR
> -       /* NB: Divide shift count by 4 since each bit in K1 represent 4
> -          bytes.  */
> -       movl    %ecx, %SHIFT_REG
> -       sarl    $2, %SHIFT_REG
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       addq    %rdi, %rax
>  # endif
> +L(ret0):
> +       ret
>
> -       VMOVA   (%rdi), %YMM1
> -
> -       /* Each bit in K0 represents a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> +       /* Returns for first vec x1/x2/x3 have hard coded backward
> +          search path for earlier matches.  */
> +       .p2align 4,, 6
> +L(first_vec_x1):
> +       VPCMP   $0, %YMMMATCH, %YMM2, %k1
> +       kmovd   %k1, %eax
> +       blsmskl %ecx, %ecx
> +       /* eax non-zero if search CHAR in range.  */
> +       andl    %ecx, %eax
> +       jnz     L(first_vec_x1_return)
> +
> +       /* fallthrough: no match in YMM2 then need to check for earlier
> +          matches (in YMM1).  */
> +       .p2align 4,, 4
> +L(first_vec_x0_test):
>         VPCMP   $0, %YMMMATCH, %YMM1, %k1
> -       kmovd   %k0, %edx
>         kmovd   %k1, %eax
> -
> -       shrxl   %SHIFT_REG, %edx, %edx
> -       shrxl   %SHIFT_REG, %eax, %eax
> -       addq    $VEC_SIZE, %rdi
> -
> -       /* Check if there is a CHAR.  */
>         testl   %eax, %eax
> -       jnz     L(found_char)
> -
> -       testl   %edx, %edx
> -       jnz     L(return_null)
> -
> -       jmp     L(aligned_loop)
> -
> -       .p2align 4
> -L(found_char):
> -       testl   %edx, %edx
> -       jnz     L(char_and_nul)
> -
> -       /* Remember the match and keep searching.  */
> -       movl    %eax, %edx
> -       leaq    (%rdi, %rcx), %rsi
> +       jz      L(ret1)
> +       bsrl    %eax, %eax
> +# ifdef USE_AS_WCSRCHR
> +       leaq    (%rsi, %rax, CHAR_SIZE), %rax
> +# else
> +       addq    %rsi, %rax
> +# endif
> +L(ret1):
> +       ret
>
> -       .p2align 4
> -L(aligned_loop):
> -       VMOVA   (%rdi), %YMM1
> -       addq    $VEC_SIZE, %rdi
> +       .p2align 4,, 10
> +L(first_vec_x1_or_x2):
> +       VPCMP   $0, %YMM3, %YMMMATCH, %k3
> +       VPCMP   $0, %YMM2, %YMMMATCH, %k2
> +       /* K2 and K3 have 1 for any search CHAR match. Test if there
> +          are any matches in either of them. Otherwise check YMM1.  */
> +       kortestd %k2, %k3
> +       jz      L(first_vec_x0_test)
> +
> +       /* Guaranteed that YMM2 and YMM3 are within range so merge the
> +          two bitmasks then get last result.  */
> +       kunpck  %k2, %k3, %k3
> +       kmovq   %k3, %rax
> +       bsrq    %rax, %rax
> +       leaq    (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
> +       ret
>
> -       /* Each bit in K0 represents a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> -       kmovd   %k0, %ecx
> +       .p2align 4,, 6
> +L(first_vec_x3):
> +       VPCMP   $0, %YMMMATCH, %YMM4, %k1
>         kmovd   %k1, %eax
> -       orl     %eax, %ecx
> -       jnz     L(char_nor_null)
> +       blsmskl %ecx, %ecx
> +       /* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
> +       andl    %ecx, %eax
> +       jz      L(first_vec_x1_or_x2)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
>
> -       VMOVA   (%rdi), %YMM1
> -       add     $VEC_SIZE, %rdi
> +       .p2align 4,, 6
> +L(first_vec_x0_x1_test):
> +       VPCMP   $0, %YMMMATCH, %YMM2, %k1
> +       kmovd   %k1, %eax
> +       /* Check YMM2 for last match first. If no match try YMM1.  */
> +       testl   %eax, %eax
> +       jz      L(first_vec_x0_test)
> +       .p2align 4,, 4
> +L(first_vec_x1_return):
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
>
> -       /* Each bit in K0 represents a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> -       kmovd   %k0, %ecx
> +       .p2align 4,, 10
> +L(first_vec_x2):
> +       VPCMP   $0, %YMMMATCH, %YMM3, %k1
>         kmovd   %k1, %eax
> -       orl     %eax, %ecx
> -       jnz     L(char_nor_null)
> +       blsmskl %ecx, %ecx
> +       /* Check YMM3 for last match first. If no match try YMM2/YMM1.
> +        */
> +       andl    %ecx, %eax
> +       jz      L(first_vec_x0_x1_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
>
> -       VMOVA   (%rdi), %YMM1
> -       addq    $VEC_SIZE, %rdi
>
> -       /* Each bit in K0 represents a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> +       .p2align 4
> +L(aligned_more):
> +       /* Need to keep original pointer in case YMM1 has last match.  */
> +       movq    %rdi, %rsi
> +       andq    $-VEC_SIZE, %rdi
> +       VMOVU   VEC_SIZE(%rdi), %YMM2
> +       VPTESTN %YMM2, %YMM2, %k0
>         kmovd   %k0, %ecx
> -       kmovd   %k1, %eax
> -       orl     %eax, %ecx
> -       jnz     L(char_nor_null)
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x1)
>
> -       VMOVA   (%rdi), %YMM1
> -       addq    $VEC_SIZE, %rdi
> +       VMOVU   (VEC_SIZE * 2)(%rdi), %YMM3
> +       VPTESTN %YMM3, %YMM3, %k0
> +       kmovd   %k0, %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x2)
>
> -       /* Each bit in K0 represents a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> +       VMOVU   (VEC_SIZE * 3)(%rdi), %YMM4
> +       VPTESTN %YMM4, %YMM4, %k0
>         kmovd   %k0, %ecx
> -       kmovd   %k1, %eax
> -       orl     %eax, %ecx
> -       jz      L(aligned_loop)
> +       movq    %rdi, %r8
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x3)
>
> +       andq    $-(VEC_SIZE * 2), %rdi
>         .p2align 4
> -L(char_nor_null):
> -       /* Find a CHAR or a null byte in a loop.  */
> +L(first_aligned_loop):
> +       /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
> +          they don't store a match.  */
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM5
> +       VMOVA   (VEC_SIZE * 5)(%rdi), %YMM6
> +
> +       VPCMP   $0, %YMM5, %YMMMATCH, %k2
> +       vpxord  %YMM6, %YMMMATCH, %YMM7
> +
> +       VPMIN   %YMM5, %YMM6, %YMM8
> +       VPMIN   %YMM8, %YMM7, %YMM7
> +
> +       VPTESTN %YMM7, %YMM7, %k1
> +       subq    $(VEC_SIZE * -2), %rdi
> +       kortestd %k1, %k2
> +       jz      L(first_aligned_loop)
> +
> +       VPCMP   $0, %YMM6, %YMMMATCH, %k3
> +       VPTESTN %YMM8, %YMM8, %k1
> +       ktestd  %k1, %k1
> +       jz      L(second_aligned_loop_prep)
> +
> +       kortestd %k2, %k3
> +       jnz     L(return_first_aligned_loop)
> +
> +       .p2align 4,, 6
> +L(first_vec_x1_or_x2_or_x3):
> +       VPCMP   $0, %YMM4, %YMMMATCH, %k4
> +       kmovd   %k4, %eax
>         testl   %eax, %eax
> -       jnz     L(match)
> -L(return_value):
> -       testl   %edx, %edx
> -       jz      L(return_null)
> -       movl    %edx, %eax
> -       movq    %rsi, %rdi
> +       jz      L(first_vec_x1_or_x2)
>         bsrl    %eax, %eax
> -# ifdef USE_AS_WCSRCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       leaq    -VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> -# endif
> +       leaq    (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4
> -L(match):
> -       /* Find a CHAR.  Check if there is a null byte.  */
> -       kmovd   %k0, %ecx
> -       testl   %ecx, %ecx
> -       jnz     L(find_nul)
> +       .p2align 4,, 8
> +L(return_first_aligned_loop):
> +       VPTESTN %YMM5, %YMM5, %k0
> +       kunpck  %k0, %k1, %k0
> +       kmov_2x %k0, %maskz_2x
> +
> +       blsmsk  %maskz_2x, %maskz_2x
> +       kunpck  %k2, %k3, %k3
> +       kmov_2x %k3, %maskm_2x
> +       and     %maskz_2x, %maskm_2x
> +       jz      L(first_vec_x1_or_x2_or_x3)
>
> -       /* Remember the match and keep searching.  */
> -       movl    %eax, %edx
> +       bsr     %maskm_2x, %maskm_2x
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
> +
> +       .p2align 4
> +       /* We can throw away the work done for the first 4x checks here
> +          as we have a later match. This is the 'fast' path, so to speak.
> +        */
> +L(second_aligned_loop_prep):
> +L(second_aligned_loop_set_furthest_match):
>         movq    %rdi, %rsi
> -       jmp     L(aligned_loop)
> +       kunpck  %k2, %k3, %k4
>
>         .p2align 4
> -L(find_nul):
> -       /* Mask out any matching bits after the null byte.  */
> -       movl    %ecx, %r8d
> -       subl    $1, %r8d
> -       xorl    %ecx, %r8d
> -       andl    %r8d, %eax
> -       testl   %eax, %eax
> -       /* If there is no CHAR here, return the remembered one.  */
> -       jz      L(return_value)
> -       bsrl    %eax, %eax
> +L(second_aligned_loop):
> +       VMOVU   (VEC_SIZE * 4)(%rdi), %YMM1
> +       VMOVU   (VEC_SIZE * 5)(%rdi), %YMM2
> +
> +       VPCMP   $0, %YMM1, %YMMMATCH, %k2
> +       vpxord  %YMM2, %YMMMATCH, %YMM3
> +
> +       VPMIN   %YMM1, %YMM2, %YMM4
> +       VPMIN   %YMM3, %YMM4, %YMM3
> +
> +       VPTESTN %YMM3, %YMM3, %k1
> +       subq    $(VEC_SIZE * -2), %rdi
> +       kortestd %k1, %k2
> +       jz      L(second_aligned_loop)
> +
> +       VPCMP   $0, %YMM2, %YMMMATCH, %k3
> +       VPTESTN %YMM4, %YMM4, %k1
> +       ktestd  %k1, %k1
> +       jz      L(second_aligned_loop_set_furthest_match)
> +
> +       kortestd %k2, %k3
> +       /* Branch here because there is a significant advantage in
> +          terms of the output dependency chain in using edx.  */
> +       jnz     L(return_new_match)
> +L(return_old_match):
> +       kmovq   %k4, %rax
> +       bsrq    %rax, %rax
> +       leaq    (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
> +       ret
> +
> +L(return_new_match):
> +       VPTESTN %YMM1, %YMM1, %k0
> +       kunpck  %k0, %k1, %k0
> +       kmov_2x %k0, %maskz_2x
> +
> +       blsmsk  %maskz_2x, %maskz_2x
> +       kunpck  %k2, %k3, %k3
> +       kmov_2x %k3, %maskm_2x
> +       and     %maskz_2x, %maskm_2x
> +       jz      L(return_old_match)
> +
> +       bsr     %maskm_2x, %maskm_2x
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
> +
> +L(cross_page_boundary):
> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> +          rax` gives a pointer with all page offset bits cleared, so an
> +          offset of (PAGE_SIZE - VEC_SIZE) gets the last aligned VEC
> +          before the page cross (guaranteed to be safe to read). Doing this
> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> +          a bit of code size.  */
> +       xorq    %rdi, %rax
> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
> +       VPTESTN %YMM1, %YMM1, %k0
> +       kmovd   %k0, %ecx
> +
> +       /* Shift out zero CHAR matches that are before the beginning of
> +          src (rdi).  */
>  # ifdef USE_AS_WCSRCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       leaq    -VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> +       movl    %edi, %esi
> +       andl    $(VEC_SIZE - 1), %esi
> +       shrl    $2, %esi
>  # endif
> -       ret
> +       shrxl   %SHIFT_REG, %ecx, %ecx
>
> -       .p2align 4
> -L(char_and_nul):
> -       /* Find both a CHAR and a null byte.  */
> -       addq    %rcx, %rdi
> -       movl    %edx, %ecx
> -L(char_and_nul_in_first_vec):
> -       /* Mask out any matching bits after the null byte.  */
> -       movl    %ecx, %r8d
> -       subl    $1, %r8d
> -       xorl    %ecx, %r8d
> -       andl    %r8d, %eax
> -       testl   %eax, %eax
> -       /* Return null pointer if the null byte comes first.  */
> -       jz      L(return_null)
> +       testl   %ecx, %ecx
> +       jz      L(page_cross_continue)
> +
> +       /* Found zero CHAR so need to test for search CHAR.  */
> +       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> +       kmovd   %k1, %eax
> +       /* Shift out search CHAR matches that are before the beginning of
> +          src (rdi).  */
> +       shrxl   %SHIFT_REG, %eax, %eax
> +
> +       /* Check if any search CHAR match in range.  */
> +       blsmskl %ecx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(ret3)
>         bsrl    %eax, %eax
>  # ifdef USE_AS_WCSRCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       leaq    -VEC_SIZE(%rdi, %rax, 4), %rax
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>  # else
> -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> +       addq    %rdi, %rax
>  # endif
> +L(ret3):
>         ret
>
> -       .p2align 4
> -L(return_null):
> -       xorl    %eax, %eax
> -       ret
> -
> -END (STRRCHR)
> +END(STRRCHR)
>  #endif
> --
> 2.25.1
>
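
Side note: the kunpck + kmovq + bsrq sequence here (and the equivalent
salq $32 / orq / bsrq in the AVX2 version) just builds one 64-bit mask
out of the two per-vector match masks so that a single bsr picks the
right-most match across the whole 2x-VEC window.  A rough C model for
the byte (strrchr) case, with illustrative names:

#include <stdint.h>

/* lo_vec_mask / hi_vec_mask are the match masks of two consecutive
   32-byte vectors (lower / higher addresses).  Return the byte offset
   of the last match within the 64-byte window, or -1 if none.  */
static inline int
last_match_offset (uint32_t lo_vec_mask, uint32_t hi_vec_mask)
{
  uint64_t combined = ((uint64_t) hi_vec_mask << 32) | lo_vec_mask;
  if (combined == 0)
    return -1;
  /* Highest set bit == highest address == right-most match.  */
  return 63 - __builtin_clzll (combined);
}

For wcsrchr the per-vector masks are narrower (one bit per wchar_t),
hence the kunpckbw / kmovd variants of the same idea.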

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2
  2022-04-22  1:52   ` [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
@ 2022-04-22 19:06     ` H.J. Lu
  2022-05-12 20:13       ` Sunil Pandey
  0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-22 19:06 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code unrolls the main loop slightly without adding too much
> overhead and minimizes the comparisons for the search CHAR.
>
> Geometric Mean of all benchmarks New / Old: 0.741
> See email for all results.
>
> Full xcheck passes on x86_64 with and without multiarch enabled.
> ---
>  sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
>  sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
>  sysdeps/x86_64/strrchr.S                | 510 +++++++++++++++---------
>  sysdeps/x86_64/wcsrchr.S                | 268 +------------
>  4 files changed, 339 insertions(+), 444 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> index db1b44c23c..866396e947 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> @@ -17,7 +17,7 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -# define strrchr __strrchr_sse2
> +# define STRRCHR __strrchr_sse2
>
>  # undef weak_alias
>  # define weak_alias(strrchr, rindex)
> diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> index 78d1ca6553..69d2f3cdb1 100644
> --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> @@ -17,7 +17,6 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -# define wcsrchr __wcsrchr_sse2
> +# define STRRCHR       __wcsrchr_sse2
>  #endif
> -
>  #include "../wcsrchr.S"
> diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> index 50d886713e..4d7ba4ceb2 100644
> --- a/sysdeps/x86_64/strrchr.S
> +++ b/sysdeps/x86_64/strrchr.S
> @@ -19,210 +19,360 @@
>
>  #include <sysdep.h>
>
> +#ifndef STRRCHR
> +# define STRRCHR       strrchr
> +#endif
> +
> +#ifdef USE_AS_WCSRCHR
> +# define PCMPEQ        pcmpeqd
> +# define CHAR_SIZE     4
> +# define PMINU pminud
> +#else
> +# define PCMPEQ        pcmpeqb
> +# define CHAR_SIZE     1
> +# define PMINU pminub
> +#endif
> +
> +#define PAGE_SIZE      4096
> +#define VEC_SIZE       16
> +
>         .text
> -ENTRY (strrchr)
> -       movd    %esi, %xmm1
> +ENTRY(STRRCHR)
> +       movd    %esi, %xmm0
>         movq    %rdi, %rax
> -       andl    $4095, %eax
> -       punpcklbw       %xmm1, %xmm1
> -       cmpq    $4032, %rax
> -       punpcklwd       %xmm1, %xmm1
> -       pshufd  $0, %xmm1, %xmm1
> +       andl    $(PAGE_SIZE - 1), %eax
> +#ifndef USE_AS_WCSRCHR
> +       punpcklbw %xmm0, %xmm0
> +       punpcklwd %xmm0, %xmm0
> +#endif
> +       pshufd  $0, %xmm0, %xmm0
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         ja      L(cross_page)
> -       movdqu  (%rdi), %xmm0
> +
> +L(cross_page_continue):
> +       movups  (%rdi), %xmm1
>         pxor    %xmm2, %xmm2
> -       movdqa  %xmm0, %xmm3
> -       pcmpeqb %xmm1, %xmm0
> -       pcmpeqb %xmm2, %xmm3
> -       pmovmskb        %xmm0, %ecx
> -       pmovmskb        %xmm3, %edx
> -       testq   %rdx, %rdx
> -       je      L(next_48_bytes)
> -       leaq    -1(%rdx), %rax
> -       xorq    %rdx, %rax
> -       andq    %rcx, %rax
> -       je      L(exit)
> -       bsrq    %rax, %rax
> +       PCMPEQ  %xmm1, %xmm2
> +       pmovmskb %xmm2, %ecx
> +       testl   %ecx, %ecx
> +       jz      L(aligned_more)
> +
> +       PCMPEQ  %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +       leal    -1(%rcx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(ret0)
> +       bsrl    %eax, %eax
>         addq    %rdi, %rax
> +       /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> +          search CHAR is zero we are correct. Either way `andq
> +          -CHAR_SIZE, %rax` gets the correct result.  */
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +L(ret0):
>         ret
>
> +       /* Returns for first vec x1/x2 have hard coded backward search
> +          path for earlier matches.  */
>         .p2align 4
> -L(next_48_bytes):
> -       movdqu  16(%rdi), %xmm4
> -       movdqa  %xmm4, %xmm5
> -       movdqu  32(%rdi), %xmm3
> -       pcmpeqb %xmm1, %xmm4
> -       pcmpeqb %xmm2, %xmm5
> -       movdqu  48(%rdi), %xmm0
> -       pmovmskb        %xmm5, %edx
> -       movdqa  %xmm3, %xmm5
> -       pcmpeqb %xmm1, %xmm3
> -       pcmpeqb %xmm2, %xmm5
> -       pcmpeqb %xmm0, %xmm2
> -       salq    $16, %rdx
> -       pmovmskb        %xmm3, %r8d
> -       pmovmskb        %xmm5, %eax
> -       pmovmskb        %xmm2, %esi
> -       salq    $32, %r8
> -       salq    $32, %rax
> -       pcmpeqb %xmm1, %xmm0
> -       orq     %rdx, %rax
> -       movq    %rsi, %rdx
> -       pmovmskb        %xmm4, %esi
> -       salq    $48, %rdx
> -       salq    $16, %rsi
> -       orq     %r8, %rsi
> -       orq     %rcx, %rsi
> -       pmovmskb        %xmm0, %ecx
> -       salq    $48, %rcx
> -       orq     %rcx, %rsi
> -       orq     %rdx, %rax
> -       je      L(loop_header2)
> -       leaq    -1(%rax), %rcx
> -       xorq    %rax, %rcx
> -       andq    %rcx, %rsi
> -       je      L(exit)
> -       bsrq    %rsi, %rsi
> -       leaq    (%rdi,%rsi), %rax
> +L(first_vec_x0_test):
> +       PCMPEQ  %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +       testl   %eax, %eax
> +       jz      L(ret0)
> +       bsrl    %eax, %eax
> +       addq    %r8, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
>         ret
>
>         .p2align 4
> -L(loop_header2):
> -       testq   %rsi, %rsi
> -       movq    %rdi, %rcx
> -       je      L(no_c_found)
> -L(loop_header):
> -       addq    $64, %rdi
> -       pxor    %xmm7, %xmm7
> -       andq    $-64, %rdi
> -       jmp     L(loop_entry)
> +L(first_vec_x1):
> +       PCMPEQ  %xmm0, %xmm2
> +       pmovmskb %xmm2, %eax
> +       leal    -1(%rcx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(first_vec_x0_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
>
>         .p2align 4
> -L(loop64):
> -       testq   %rdx, %rdx
> -       cmovne  %rdx, %rsi
> -       cmovne  %rdi, %rcx
> -       addq    $64, %rdi
> -L(loop_entry):
> -       movdqa  32(%rdi), %xmm3
> -       pxor    %xmm6, %xmm6
> -       movdqa  48(%rdi), %xmm2
> -       movdqa  %xmm3, %xmm0
> -       movdqa  16(%rdi), %xmm4
> -       pminub  %xmm2, %xmm0
> -       movdqa  (%rdi), %xmm5
> -       pminub  %xmm4, %xmm0
> -       pminub  %xmm5, %xmm0
> -       pcmpeqb %xmm7, %xmm0
> -       pmovmskb        %xmm0, %eax
> -       movdqa  %xmm5, %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       pmovmskb        %xmm0, %r9d
> -       movdqa  %xmm4, %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       pmovmskb        %xmm0, %edx
> -       movdqa  %xmm3, %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       salq    $16, %rdx
> -       pmovmskb        %xmm0, %r10d
> -       movdqa  %xmm2, %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       salq    $32, %r10
> -       orq     %r10, %rdx
> -       pmovmskb        %xmm0, %r8d
> -       orq     %r9, %rdx
> -       salq    $48, %r8
> -       orq     %r8, %rdx
> +L(first_vec_x1_test):
> +       PCMPEQ  %xmm0, %xmm2
> +       pmovmskb %xmm2, %eax
>         testl   %eax, %eax
> -       je      L(loop64)
> -       pcmpeqb %xmm6, %xmm4
> -       pcmpeqb %xmm6, %xmm3
> -       pcmpeqb %xmm6, %xmm5
> -       pmovmskb        %xmm4, %eax
> -       pmovmskb        %xmm3, %r10d
> -       pcmpeqb %xmm6, %xmm2
> -       pmovmskb        %xmm5, %r9d
> -       salq    $32, %r10
> -       salq    $16, %rax
> -       pmovmskb        %xmm2, %r8d
> -       orq     %r10, %rax
> -       orq     %r9, %rax
> -       salq    $48, %r8
> -       orq     %r8, %rax
> -       leaq    -1(%rax), %r8
> -       xorq    %rax, %r8
> -       andq    %r8, %rdx
> -       cmovne  %rdi, %rcx
> -       cmovne  %rdx, %rsi
> -       bsrq    %rsi, %rsi
> -       leaq    (%rcx,%rsi), %rax
> +       jz      L(first_vec_x0_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(first_vec_x2):
> +       PCMPEQ  %xmm0, %xmm3
> +       pmovmskb %xmm3, %eax
> +       leal    -1(%rcx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(first_vec_x1_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(aligned_more):
> +       /* Save original pointer if match was in VEC 0.  */
> +       movq    %rdi, %r8
> +       andq    $-VEC_SIZE, %rdi
> +
> +       movaps  VEC_SIZE(%rdi), %xmm2
> +       pxor    %xmm3, %xmm3
> +       PCMPEQ  %xmm2, %xmm3
> +       pmovmskb %xmm3, %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x1)
> +
> +       movaps  (VEC_SIZE * 2)(%rdi), %xmm3
> +       pxor    %xmm4, %xmm4
> +       PCMPEQ  %xmm3, %xmm4
> +       pmovmskb %xmm4, %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x2)
> +
> +       addq    $VEC_SIZE, %rdi
> +       /* Save pointer again before realigning.  */
> +       movq    %rdi, %rsi
> +       andq    $-(VEC_SIZE * 2), %rdi
> +       .p2align 4
> +L(first_loop):
> +       /* Do 2x VEC at a time.  */
> +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> +       /* Since SSE2 has no pminud, wcsrchr needs separate logic for
> +          detecting zero. Note if this is found to be a bottleneck it
> +          may be worth adding an SSE4.1 wcsrchr implementation.  */
> +#ifdef USE_AS_WCSRCHR
> +       movaps  %xmm5, %xmm6
> +       pxor    %xmm8, %xmm8
> +
> +       PCMPEQ  %xmm8, %xmm5
> +       PCMPEQ  %xmm4, %xmm8
> +       por     %xmm5, %xmm8
> +#else
> +       movaps  %xmm5, %xmm6
> +       PMINU   %xmm4, %xmm5
> +#endif
> +
> +       movaps  %xmm4, %xmm9
> +       PCMPEQ  %xmm0, %xmm4
> +       PCMPEQ  %xmm0, %xmm6
> +       movaps  %xmm6, %xmm7
> +       por     %xmm4, %xmm6
> +#ifndef USE_AS_WCSRCHR
> +       pxor    %xmm8, %xmm8
> +       PCMPEQ  %xmm5, %xmm8
> +#endif
> +       pmovmskb %xmm8, %ecx
> +       pmovmskb %xmm6, %eax
> +
> +       addq    $(VEC_SIZE * 2), %rdi
> +       /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> +          macro-fuse with `jz`.  */
> +       addl    %ecx, %eax
> +       jz      L(first_loop)
> +
> +       /* Check if there is zero match.  */
> +       testl   %ecx, %ecx
> +       jz      L(second_loop_match)
> +
> +       /* Check if there was a match in last iteration.  */
> +       subl    %ecx, %eax
> +       jnz     L(new_match)
> +
> +L(first_loop_old_match):
> +       PCMPEQ  %xmm0, %xmm2
> +       PCMPEQ  %xmm0, %xmm3
> +       pmovmskb %xmm2, %ecx
> +       pmovmskb %xmm3, %eax
> +       addl    %eax, %ecx
> +       jz      L(first_vec_x0_test)
> +       /* NB: We could move this shift to before the branch and save a
> +          bit of code size / performance on the fall through. The
> +          branch leads to the null case which generally seems hotter
> +          than char in first 3x VEC.  */
> +       sall    $16, %eax
> +       orl     %ecx, %eax
> +
> +       bsrl    %eax, %eax
> +       addq    %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(new_match):
> +       pxor    %xmm6, %xmm6
> +       PCMPEQ  %xmm9, %xmm6
> +       pmovmskb %xmm6, %eax
> +       sall    $16, %ecx
> +       orl     %eax, %ecx
> +
> +       /* We can't reuse either of the old comparisons: since we mask
> +          off zeros after the first zero (instead of using the full
> +          comparison) we can't guarantee no interference between a match
> +          after the end of the string and a valid match.  */
> +       pmovmskb %xmm4, %eax
> +       pmovmskb %xmm7, %edx
> +       sall    $16, %edx
> +       orl     %edx, %eax
> +
> +       leal    -1(%ecx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(first_loop_old_match)
> +       bsrl    %eax, %eax
> +       addq    %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
>         ret
>
> +       /* Save minimum state for getting most recent match. We can
> +          throw out all previous work.  */
>         .p2align 4
> -L(no_c_found):
> -       movl    $1, %esi
> -       xorl    %ecx, %ecx
> -       jmp     L(loop_header)
> +L(second_loop_match):
> +       movq    %rdi, %rsi
> +       movaps  %xmm4, %xmm2
> +       movaps  %xmm7, %xmm3
>
>         .p2align 4
> -L(exit):
> -       xorl    %eax, %eax
> +L(second_loop):
> +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> +       /* Since SSE2 has no pminud, wcsrchr needs separate logic for
> +          detecting zero. Note if this is found to be a bottleneck it
> +          may be worth adding an SSE4.1 wcsrchr implementation.  */
> +#ifdef USE_AS_WCSRCHR
> +       movaps  %xmm5, %xmm6
> +       pxor    %xmm8, %xmm8
> +
> +       PCMPEQ  %xmm8, %xmm5
> +       PCMPEQ  %xmm4, %xmm8
> +       por     %xmm5, %xmm8
> +#else
> +       movaps  %xmm5, %xmm6
> +       PMINU   %xmm4, %xmm5
> +#endif
> +
> +       movaps  %xmm4, %xmm9
> +       PCMPEQ  %xmm0, %xmm4
> +       PCMPEQ  %xmm0, %xmm6
> +       movaps  %xmm6, %xmm7
> +       por     %xmm4, %xmm6
> +#ifndef USE_AS_WCSRCHR
> +       pxor    %xmm8, %xmm8
> +       PCMPEQ  %xmm5, %xmm8
> +#endif
> +
> +       pmovmskb %xmm8, %ecx
> +       pmovmskb %xmm6, %eax
> +
> +       addq    $(VEC_SIZE * 2), %rdi
> +       /* Either null term or new occurrence of CHAR.  */
> +       addl    %ecx, %eax
> +       jz      L(second_loop)
> +
> +       /* No null term so it must be a new occurrence of CHAR.  */
> +       testl   %ecx, %ecx
> +       jz      L(second_loop_match)
> +
> +
> +       subl    %ecx, %eax
> +       jnz     L(second_loop_new_match)
> +
> +L(second_loop_old_match):
> +       pmovmskb %xmm2, %ecx
> +       pmovmskb %xmm3, %eax
> +       sall    $16, %eax
> +       orl     %ecx, %eax
> +       bsrl    %eax, %eax
> +       addq    %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
>         ret
>
>         .p2align 4
> +L(second_loop_new_match):
> +       pxor    %xmm6, %xmm6
> +       PCMPEQ  %xmm9, %xmm6
> +       pmovmskb %xmm6, %eax
> +       sall    $16, %ecx
> +       orl     %eax, %ecx
> +
> +       /* We can't reuse either of the old comparisons: since we mask
> +          off zeros after the first zero (instead of using the full
> +          comparison) we can't guarantee no interference between a match
> +          after the end of the string and a valid match.  */
> +       pmovmskb %xmm4, %eax
> +       pmovmskb %xmm7, %edx
> +       sall    $16, %edx
> +       orl     %edx, %eax
> +
> +       leal    -1(%ecx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(second_loop_old_match)
> +       bsrl    %eax, %eax
> +       addq    %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4,, 4
>  L(cross_page):
> -       movq    %rdi, %rax
> -       pxor    %xmm0, %xmm0
> -       andq    $-64, %rax
> -       movdqu  (%rax), %xmm5
> -       movdqa  %xmm5, %xmm6
> -       movdqu  16(%rax), %xmm4
> -       pcmpeqb %xmm1, %xmm5
> -       pcmpeqb %xmm0, %xmm6
> -       movdqu  32(%rax), %xmm3
> -       pmovmskb        %xmm6, %esi
> -       movdqa  %xmm4, %xmm6
> -       movdqu  48(%rax), %xmm2
> -       pcmpeqb %xmm1, %xmm4
> -       pcmpeqb %xmm0, %xmm6
> -       pmovmskb        %xmm6, %edx
> -       movdqa  %xmm3, %xmm6
> -       pcmpeqb %xmm1, %xmm3
> -       pcmpeqb %xmm0, %xmm6
> -       pcmpeqb %xmm2, %xmm0
> -       salq    $16, %rdx
> -       pmovmskb        %xmm3, %r9d
> -       pmovmskb        %xmm6, %r8d
> -       pmovmskb        %xmm0, %ecx
> -       salq    $32, %r9
> -       salq    $32, %r8
> -       pcmpeqb %xmm1, %xmm2
> -       orq     %r8, %rdx
> -       salq    $48, %rcx
> -       pmovmskb        %xmm5, %r8d
> -       orq     %rsi, %rdx
> -       pmovmskb        %xmm4, %esi
> -       orq     %rcx, %rdx
> -       pmovmskb        %xmm2, %ecx
> -       salq    $16, %rsi
> -       salq    $48, %rcx
> -       orq     %r9, %rsi
> -       orq     %r8, %rsi
> -       orq     %rcx, %rsi
> +       movq    %rdi, %rsi
> +       andq    $-VEC_SIZE, %rsi
> +       movaps  (%rsi), %xmm1
> +       pxor    %xmm2, %xmm2
> +       PCMPEQ  %xmm1, %xmm2
> +       pmovmskb %xmm2, %edx
>         movl    %edi, %ecx
> -       subl    %eax, %ecx
> -       shrq    %cl, %rdx
> -       shrq    %cl, %rsi
> -       testq   %rdx, %rdx
> -       je      L(loop_header2)
> -       leaq    -1(%rdx), %rax
> -       xorq    %rdx, %rax
> -       andq    %rax, %rsi
> -       je      L(exit)
> -       bsrq    %rsi, %rax
> +       andl    $(VEC_SIZE - 1), %ecx
> +       sarl    %cl, %edx
> +       jz      L(cross_page_continue)
> +       PCMPEQ  %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +       sarl    %cl, %eax
> +       leal    -1(%rdx), %ecx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(ret1)
> +       bsrl    %eax, %eax
>         addq    %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +L(ret1):
>         ret
> -END (strrchr)
> +END(STRRCHR)
>
> -weak_alias (strrchr, rindex)
> -libc_hidden_builtin_def (strrchr)
> +#ifndef USE_AS_WCSRCHR
> +       weak_alias (STRRCHR, rindex)
> +       libc_hidden_builtin_def (STRRCHR)
> +#endif
> diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> index 61552954de..2b80efc5ef 100644
> --- a/sysdeps/x86_64/wcsrchr.S
> +++ b/sysdeps/x86_64/wcsrchr.S
> @@ -1,4 +1,4 @@
> -/* wcsrchr with SSSE3
> +/* wcsrchr optimized with SSE2.
>     Copyright (C) 2011-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
> @@ -16,266 +16,12 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <sysdep.h>
>
> -       .text
> -ENTRY (wcsrchr)
> +#define USE_AS_WCSRCHR 1
> +#define NO_PMINU       1
>
> -       movd    %rsi, %xmm1
> -       mov     %rdi, %rcx
> -       punpckldq %xmm1, %xmm1
> -       pxor    %xmm2, %xmm2
> -       punpckldq %xmm1, %xmm1
> -       and     $63, %rcx
> -       cmp     $48, %rcx
> -       ja      L(crosscache)
> +#ifndef STRRCHR
> +# define STRRCHR       wcsrchr
> +#endif
>
> -       movdqu  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm2
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm0, %rax
> -       add     $16, %rdi
> -
> -       test    %rax, %rax
> -       jnz     L(unaligned_match1)
> -
> -       test    %rcx, %rcx
> -       jnz     L(return_null)
> -
> -       and     $-16, %rdi
> -       xor     %r8, %r8
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(unaligned_match1):
> -       test    %rcx, %rcx
> -       jnz     L(prolog_find_zero_1)
> -
> -       mov     %rax, %r8
> -       mov     %rdi, %rsi
> -       and     $-16, %rdi
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(crosscache):
> -       and     $15, %rcx
> -       and     $-16, %rdi
> -       pxor    %xmm3, %xmm3
> -       movdqa  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm3
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm3, %rdx
> -       pmovmskb %xmm0, %rax
> -       shr     %cl, %rdx
> -       shr     %cl, %rax
> -       add     $16, %rdi
> -
> -       test    %rax, %rax
> -       jnz     L(unaligned_match)
> -
> -       test    %rdx, %rdx
> -       jnz     L(return_null)
> -
> -       xor     %r8, %r8
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(unaligned_match):
> -       test    %rdx, %rdx
> -       jnz     L(prolog_find_zero)
> -
> -       mov     %rax, %r8
> -       lea     (%rdi, %rcx), %rsi
> -
> -/* Loop start on aligned string.  */
> -       .p2align 4
> -L(loop):
> -       movdqa  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm0, %rax
> -       or      %rax, %rcx
> -       jnz     L(matches)
> -
> -       movdqa  (%rdi), %xmm3
> -       pcmpeqd %xmm3, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm3
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm3, %rax
> -       or      %rax, %rcx
> -       jnz     L(matches)
> -
> -       movdqa  (%rdi), %xmm4
> -       pcmpeqd %xmm4, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm4
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm4, %rax
> -       or      %rax, %rcx
> -       jnz     L(matches)
> -
> -       movdqa  (%rdi), %xmm5
> -       pcmpeqd %xmm5, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm5
> -       pmovmskb %xmm2, %rcx
> -       pmovmskb %xmm5, %rax
> -       or      %rax, %rcx
> -       jz      L(loop)
> -
> -       .p2align 4
> -L(matches):
> -       test    %rax, %rax
> -       jnz     L(match)
> -L(return_value):
> -       test    %r8, %r8
> -       jz      L(return_null)
> -       mov     %r8, %rax
> -       mov     %rsi, %rdi
> -
> -       test    $15 << 4, %ah
> -       jnz     L(match_fourth_wchar)
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(match):
> -       pmovmskb %xmm2, %rcx
> -       test    %rcx, %rcx
> -       jnz     L(find_zero)
> -       mov     %rax, %r8
> -       mov     %rdi, %rsi
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(find_zero):
> -       test    $15, %cl
> -       jnz     L(find_zero_in_first_wchar)
> -       test    %cl, %cl
> -       jnz     L(find_zero_in_second_wchar)
> -       test    $15, %ch
> -       jnz     L(find_zero_in_third_wchar)
> -
> -       and     $1 << 13 - 1, %rax
> -       jz      L(return_value)
> -
> -       test    $15 << 4, %ah
> -       jnz     L(match_fourth_wchar)
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(find_zero_in_first_wchar):
> -       test    $1, %rax
> -       jz      L(return_value)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(find_zero_in_second_wchar):
> -       and     $1 << 5 - 1, %rax
> -       jz      L(return_value)
> -
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(find_zero_in_third_wchar):
> -       and     $1 << 9 - 1, %rax
> -       jz      L(return_value)
> -
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(prolog_find_zero):
> -       add     %rcx, %rdi
> -       mov     %rdx, %rcx
> -L(prolog_find_zero_1):
> -       test    $15, %cl
> -       jnz     L(prolog_find_zero_in_first_wchar)
> -       test    %cl, %cl
> -       jnz     L(prolog_find_zero_in_second_wchar)
> -       test    $15, %ch
> -       jnz     L(prolog_find_zero_in_third_wchar)
> -
> -       and     $1 << 13 - 1, %rax
> -       jz      L(return_null)
> -
> -       test    $15 << 4, %ah
> -       jnz     L(match_fourth_wchar)
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(prolog_find_zero_in_first_wchar):
> -       test    $1, %rax
> -       jz      L(return_null)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(prolog_find_zero_in_second_wchar):
> -       and     $1 << 5 - 1, %rax
> -       jz      L(return_null)
> -
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(prolog_find_zero_in_third_wchar):
> -       and     $1 << 9 - 1, %rax
> -       jz      L(return_null)
> -
> -       test    %ah, %ah
> -       jnz     L(match_third_wchar)
> -       test    $15 << 4, %al
> -       jnz     L(match_second_wchar)
> -       lea     -16(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(match_second_wchar):
> -       lea     -12(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(match_third_wchar):
> -       lea     -8(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(match_fourth_wchar):
> -       lea     -4(%rdi), %rax
> -       ret
> -
> -       .p2align 4
> -L(return_null):
> -       xor     %rax, %rax
> -       ret
> -
> -END (wcsrchr)
> +#include "../strrchr.S"
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v4 1/4] benchtests: Improve bench-strrchr
  2022-04-21  3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
                   ` (6 preceding siblings ...)
  2022-04-22  1:52 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr Noah Goldstein
@ 2022-04-22 19:11 ` Noah Goldstein
  2022-04-23  1:53   ` H.J. Lu
  7 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22 19:11 UTC (permalink / raw)
  To: libc-alpha

1. Use json-lib for printing results.
2. Expose all parameters (before this change, pos, seek_char, and
   max_char were not printed).
3. Add benchmarks that test multiple occurrences of seek_char in the
   string.
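
For point 3, the idea is to salt the string with extra copies of
seek_char at a regular interval derived from the new freq parameter,
so the last occurrence still lands at pos but implementations also see
earlier hits.  A rough standalone sketch of that placement (the helper
name and the plain char buffer are illustrative only, and alignment
handling is omitted; the real change is the pos_chunk_sz loop in the
diff below):

#include <stddef.h>

/* Illustrative only: spread roughly `freq' occurrences of seek_char
   through buf[0..pos), mirroring the pos_chunk_sz loop added here.  */
static void
place_occurrences (char *buf, size_t pos, size_t len, int seek_char,
		   size_t freq)
{
  size_t chunk = freq ? (pos / freq) : pos;
  if (chunk == 0 && pos)
    chunk = 1;
  for (size_t i = chunk; i < pos && i < len; i += chunk)
    buf[i] = seek_char;
}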
---
 benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
 1 file changed, 82 insertions(+), 44 deletions(-)

diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
index abdae60c51..7cd2a15484 100644
--- a/benchtests/bench-strrchr.c
+++ b/benchtests/bench-strrchr.c
@@ -23,6 +23,7 @@
 # define TEST_NAME "strrchr"
 #endif
 #include "bench-string.h"
+#include "json-lib.h"
 
 #define BIG_CHAR MAX_CHAR
 
@@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
 }
 
 static void
-do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
+	     CHAR *exp_res)
 {
   CHAR *res = CALL (impl, s, c);
   size_t i, iters = INNER_LOOP_ITERS8;
@@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
 
   if (res != exp_res)
     {
-      error (0, 0, "Wrong result in function %s %p %p", impl->name,
-	     res, exp_res);
+      error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
+	     exp_res);
       ret = 1;
       return;
     }
@@ -73,23 +75,25 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
       CALL (impl, s, c);
     }
   TIMING_NOW (stop);
-
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
+	 int seek_char, int max_char, size_t freq)
 /* For wcsrchr: align here means align not in bytes,
    but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
    len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
 {
   size_t i;
+  size_t pos_chunk_sz = freq ? (pos / freq) : pos;
+  size_t last_pos = len;
   CHAR *result;
   CHAR *buf = (CHAR *) buf1;
 
-  align &= 7;
+  align &= (getpagesize () - 1);
   if ((align + len) * sizeof (CHAR) >= page_size)
     return;
 
@@ -103,6 +107,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
       if ((i > pos || pos >= len) && buf[align + i] == seek_char)
 	buf[align + i] = seek_char + 10 + (random () & 15);
     }
+
+  if (pos_chunk_sz == 0 && pos)
+    pos_chunk_sz = 1;
+
+  for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
+    {
+      buf[align + i] = seek_char;
+      last_pos = i;
+    }
+
   buf[align + len] = 0;
 
   if (pos < len)
@@ -110,66 +124,90 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
       buf[align + pos] = seek_char;
       result = (CHAR *) (buf + align + pos);
     }
+  else if (last_pos < len)
+    result = (CHAR *) (buf + align + last_pos);
   else if (seek_char == 0)
     result = (CHAR *) (buf + align + len);
   else
     result = NULL;
 
-  printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "len", len);
+  json_attr_uint (json_ctx, "pos", pos);
+  json_attr_uint (json_ctx, "align", align);
+  json_attr_uint (json_ctx, "freq", freq);
+  json_attr_uint (json_ctx, "seek", seek_char);
+  json_attr_uint (json_ctx, "max_char", max_char);
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
+    do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
-  size_t i;
+  json_ctx_t json_ctx;
+  size_t i, j;
+  int seek;
 
   test_init ();
+  json_init (&json_ctx, 0, stdout);
 
-  printf ("%20s", "");
-  FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
-
-  for (i = 1; i < 8; ++i)
-    {
-      do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
-      do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
-    }
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
 
-  for (i = 1; i < 8; ++i)
-    {
-      do_test (i, 64, 256, 23, SMALL_CHAR);
-      do_test (i, 64, 256, 23, BIG_CHAR);
-    }
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
 
-  for (i = 0; i < 32; ++i)
-    {
-      do_test (0, i, i + 1, 23, SMALL_CHAR);
-      do_test (0, i, i + 1, 23, BIG_CHAR);
-    }
+  json_array_begin (&json_ctx, "ifuncs");
+  FOR_EACH_IMPL (impl, 0)
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
 
-  for (i = 1; i < 8; ++i)
-    {
-      do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
-      do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
-    }
+  json_array_begin (&json_ctx, "results");
 
-  for (i = 1; i < 8; ++i)
+  for (seek = 0; seek <= 23; seek += 23)
     {
-      do_test (i, 64, 256, 0, SMALL_CHAR);
-      do_test (i, 64, 256, 0, BIG_CHAR);
+      for (j = 1; j < 32; j += j)
+	{
+	  for (i = 1; i < 9; ++i)
+	    {
+	      do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
+	    }
+
+	  for (i = 1; i < 8; ++i)
+	    {
+	      do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
+
+	      do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
+	    }
+
+	  for (i = 0; i < 32; ++i)
+	    {
+	      do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
+	      do_test (&json_ctx, getpagesize () - i / 2 - 1, i, i + 1, seek,
+		       SMALL_CHAR, j);
+	    }
+	  if (seek == 0)
+	    {
+	      break;
+	    }
+	}
     }
 
-  for (i = 0; i < 32; ++i)
-    {
-      do_test (0, i, i + 1, 0, SMALL_CHAR);
-      do_test (0, i, i + 1, 0, BIG_CHAR);
-    }
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
 
   return ret;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/4] benchtests: Improve bench-strrchr
  2022-04-22 18:29   ` [PATCH v3 1/4] benchtests: Improve bench-strrchr H.J. Lu
@ 2022-04-22 19:12     ` Noah Goldstein
  0 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22 19:12 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Fri, Apr 22, 2022 at 1:30 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > 1. Use json-lib for printing results.
> > 2. Expose all parameters (before this change, pos, seek_char, and
> >    max_char were not printed).
> > 3. Add benchmarks that test multiple occurrences of seek_char in
> >    the string.
> > ---
> >  benchtests/bench-strrchr.c | 124 ++++++++++++++++++++++++-------------
> >  1 file changed, 80 insertions(+), 44 deletions(-)
> >
> > diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> > index abdae60c51..ce4307a098 100644
> > --- a/benchtests/bench-strrchr.c
> > +++ b/benchtests/bench-strrchr.c
> > @@ -23,6 +23,7 @@
> >  # define TEST_NAME "strrchr"
> >  #endif
> >  #include "bench-string.h"
> > +#include "json-lib.h"
> >
> >  #define BIG_CHAR MAX_CHAR
> >
> > @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
> >  }
> >
> >  static void
> > -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> > +            CHAR *exp_res)
> >  {
> >    CHAR *res = CALL (impl, s, c);
> >    size_t i, iters = INNER_LOOP_ITERS8;
> > @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> >
> >    if (res != exp_res)
> >      {
> > -      error (0, 0, "Wrong result in function %s %p %p", impl->name,
> > -            res, exp_res);
> > +      error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
> > +            exp_res);
> >        ret = 1;
> >        return;
> >      }
> > @@ -73,23 +75,25 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> >        CALL (impl, s, c);
> >      }
> >    TIMING_NOW (stop);
> > -
> >    TIMING_DIFF (cur, start, stop);
> >
> > -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> > +  json_element_double (json_ctx, (double) cur / (double) iters);
> >  }
> >
> >  static void
> > -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> > +        int seek_char, int max_char, size_t freq)
> >  /* For wcsrchr: align here means align not in bytes,
> >     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> >     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
> >  {
> >    size_t i;
> > +  size_t pos_chunk_sz = freq ? (pos / freq) : pos;
> > +  size_t last_pos = len;
> >    CHAR *result;
> >    CHAR *buf = (CHAR *) buf1;
> >
> > -  align &= 7;
> > +  align &= (getpagesize () - 1);
>
> Should we add some tests for page boundary cross?

Added a few in V4.
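
Concretely, the new page-cross cases in V4 start the string just short
of a page boundary so that the implementation's first load straddles
it.  One iteration of that loop looks like this (i, seek and j are
supplied by the surrounding loops in the patch):

	  /* Start the string close to the end of a page so that
	     reading it crosses into the next page.  */
	  do_test (&json_ctx, getpagesize () - i / 2 - 1, i, i + 1, seek,
		   SMALL_CHAR, j);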
>
> >    if ((align + len) * sizeof (CHAR) >= page_size)
> >      return;
> >
> > @@ -103,6 +107,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> >        if ((i > pos || pos >= len) && buf[align + i] == seek_char)
> >         buf[align + i] = seek_char + 10 + (random () & 15);
> >      }
> > +
> > +  if (pos_chunk_sz == 0 && pos)
> > +    pos_chunk_sz = 1;
> > +
> > +  for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
> > +    {
> > +      buf[align + i] = seek_char;
> > +      last_pos = i;
> > +    }
> > +
> >    buf[align + len] = 0;
> >
> >    if (pos < len)
> > @@ -110,66 +124,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> >        buf[align + pos] = seek_char;
> >        result = (CHAR *) (buf + align + pos);
> >      }
> > +  else if (last_pos < len)
> > +    result = (CHAR *) (buf + align + last_pos);
> >    else if (seek_char == 0)
> >      result = (CHAR *) (buf + align + len);
> >    else
> >      result = NULL;
> >
> > -  printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
> > +  json_element_object_begin (json_ctx);
> > +  json_attr_uint (json_ctx, "len", len);
> > +  json_attr_uint (json_ctx, "pos", pos);
> > +  json_attr_uint (json_ctx, "align", align);
> > +  json_attr_uint (json_ctx, "freq", freq);
> > +  json_attr_uint (json_ctx, "seek", seek_char);
> > +  json_attr_uint (json_ctx, "max_char", max_char);
> > +  json_array_begin (json_ctx, "timings");
> >
> >    FOR_EACH_IMPL (impl, 0)
> > -    do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
> > +    do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
> >
> > -  putchar ('\n');
> > +  json_array_end (json_ctx);
> > +  json_element_object_end (json_ctx);
> >  }
> >
> >  int
> >  test_main (void)
> >  {
> > -  size_t i;
> > +  json_ctx_t json_ctx;
> > +  size_t i, j;
> > +  int seek;
> >
> >    test_init ();
> > +  json_init (&json_ctx, 0, stdout);
> >
> > -  printf ("%20s", "");
> > -  FOR_EACH_IMPL (impl, 0)
> > -    printf ("\t%s", impl->name);
> > -  putchar ('\n');
> > -
> > -  for (i = 1; i < 8; ++i)
> > -    {
> > -      do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
> > -      do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
> > -    }
> > +  json_document_begin (&json_ctx);
> > +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> >
> > -  for (i = 1; i < 8; ++i)
> > -    {
> > -      do_test (i, 64, 256, 23, SMALL_CHAR);
> > -      do_test (i, 64, 256, 23, BIG_CHAR);
> > -    }
> > +  json_attr_object_begin (&json_ctx, "functions");
> > +  json_attr_object_begin (&json_ctx, TEST_NAME);
> > +  json_attr_string (&json_ctx, "bench-variant", "");
> >
> > -  for (i = 0; i < 32; ++i)
> > -    {
> > -      do_test (0, i, i + 1, 23, SMALL_CHAR);
> > -      do_test (0, i, i + 1, 23, BIG_CHAR);
> > -    }
> > +  json_array_begin (&json_ctx, "ifuncs");
> > +  FOR_EACH_IMPL (impl, 0)
> > +    json_element_string (&json_ctx, impl->name);
> > +  json_array_end (&json_ctx);
> >
> > -  for (i = 1; i < 8; ++i)
> > -    {
> > -      do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
> > -      do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
> > -    }
> > +  json_array_begin (&json_ctx, "results");
> >
> > -  for (i = 1; i < 8; ++i)
> > +  for (seek = 0; seek <= 23; seek += 23)
> >      {
> > -      do_test (i, 64, 256, 0, SMALL_CHAR);
> > -      do_test (i, 64, 256, 0, BIG_CHAR);
> > +      for (j = 1; j < 32; j += j)
> > +       {
> > +         for (i = 1; i < 9; ++i)
> > +           {
> > +             do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
> > +             do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
> > +           }
> > +
> > +         for (i = 1; i < 8; ++i)
> > +           {
> > +             do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
> > +             do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
> > +
> > +             do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
> > +             do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
> > +           }
> > +
> > +         for (i = 0; i < 32; ++i)
> > +           {
> > +             do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
> > +             do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
> > +           }
> > +         if (seek == 0)
> > +           {
> > +             break;
> > +           }
> > +       }
> >      }
> >
> > -  for (i = 0; i < 32; ++i)
> > -    {
> > -      do_test (0, i, i + 1, 0, SMALL_CHAR);
> > -      do_test (0, i, i + 1, 0, BIG_CHAR);
> > -    }
> > +  json_array_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_document_end (&json_ctx);
> >
> >    return ret;
> >  }
> > --
> > 2.25.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v4 1/4] benchtests: Improve bench-strrchr
  2022-04-22 19:11 ` [PATCH v4 " Noah Goldstein
@ 2022-04-23  1:53   ` H.J. Lu
  0 siblings, 0 replies; 36+ messages in thread
From: H.J. Lu @ 2022-04-23  1:53 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Fri, Apr 22, 2022 at 12:12 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1. Use json-lib for printing results.
> 2. Expose all parameters (before this change, pos, seek_char, and
>    max_char were not printed).
> 3. Add benchmarks that test multiple occurrences of seek_char in the
>    string.
> ---
>  benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
>  1 file changed, 82 insertions(+), 44 deletions(-)
>
> diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> index abdae60c51..7cd2a15484 100644
> --- a/benchtests/bench-strrchr.c
> +++ b/benchtests/bench-strrchr.c
> @@ -23,6 +23,7 @@
>  # define TEST_NAME "strrchr"
>  #endif
>  #include "bench-string.h"
> +#include "json-lib.h"
>
>  #define BIG_CHAR MAX_CHAR
>
> @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
>  }
>
>  static void
> -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> +            CHAR *exp_res)
>  {
>    CHAR *res = CALL (impl, s, c);
>    size_t i, iters = INNER_LOOP_ITERS8;
> @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
>
>    if (res != exp_res)
>      {
> -      error (0, 0, "Wrong result in function %s %p %p", impl->name,
> -            res, exp_res);
> +      error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
> +            exp_res);
>        ret = 1;
>        return;
>      }
> @@ -73,23 +75,25 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
>        CALL (impl, s, c);
>      }
>    TIMING_NOW (stop);
> -
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> +        int seek_char, int max_char, size_t freq)
>  /* For wcsrchr: align here means align not in bytes,
>     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
>     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
>  {
>    size_t i;
> +  size_t pos_chunk_sz = freq ? (pos / freq) : pos;
> +  size_t last_pos = len;
>    CHAR *result;
>    CHAR *buf = (CHAR *) buf1;
>
> -  align &= 7;
> +  align &= (getpagesize () - 1);
>    if ((align + len) * sizeof (CHAR) >= page_size)

page_size == 2 * getpagesize ()
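
That is, with page_size covering two pages and the new
`align &= (getpagesize () - 1)', the existing guard is what decides
whether a given (align, len) pair still fits; anything that would run
past the usable buffer is simply skipped (sketch of the arithmetic,
not new code):

  /* align is now at most getpagesize () - 1 CHARs, so only
     combinations with (align + len) * sizeof (CHAR) < 2 * getpagesize ()
     are actually benchmarked; the rest return early.  */
  if ((align + len) * sizeof (CHAR) >= page_size)
    return;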

>      return;
>
> @@ -103,6 +107,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
>        if ((i > pos || pos >= len) && buf[align + i] == seek_char)
>         buf[align + i] = seek_char + 10 + (random () & 15);
>      }
> +
> +  if (pos_chunk_sz == 0 && pos)
> +    pos_chunk_sz = 1;
> +
> +  for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
> +    {
> +      buf[align + i] = seek_char;
> +      last_pos = i;
> +    }
> +
>    buf[align + len] = 0;
>
>    if (pos < len)
> @@ -110,66 +124,90 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
>        buf[align + pos] = seek_char;
>        result = (CHAR *) (buf + align + pos);
>      }
> +  else if (last_pos < len)
> +    result = (CHAR *) (buf + align + last_pos);
>    else if (seek_char == 0)
>      result = (CHAR *) (buf + align + len);
>    else
>      result = NULL;
>
> -  printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "len", len);
> +  json_attr_uint (json_ctx, "pos", pos);
> +  json_attr_uint (json_ctx, "align", align);
> +  json_attr_uint (json_ctx, "freq", freq);
> +  json_attr_uint (json_ctx, "seek", seek_char);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
> +    do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  test_main (void)
>  {
> -  size_t i;
> +  json_ctx_t json_ctx;
> +  size_t i, j;
> +  int seek;
>
>    test_init ();
> +  json_init (&json_ctx, 0, stdout);
>
> -  printf ("%20s", "");
> -  FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> -
> -  for (i = 1; i < 8; ++i)
> -    {
> -      do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
> -      do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
> -    }
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
>
> -  for (i = 1; i < 8; ++i)
> -    {
> -      do_test (i, 64, 256, 23, SMALL_CHAR);
> -      do_test (i, 64, 256, 23, BIG_CHAR);
> -    }
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
>
> -  for (i = 0; i < 32; ++i)
> -    {
> -      do_test (0, i, i + 1, 23, SMALL_CHAR);
> -      do_test (0, i, i + 1, 23, BIG_CHAR);
> -    }
> +  json_array_begin (&json_ctx, "ifuncs");
> +  FOR_EACH_IMPL (impl, 0)
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
>
> -  for (i = 1; i < 8; ++i)
> -    {
> -      do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
> -      do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
> -    }
> +  json_array_begin (&json_ctx, "results");
>
> -  for (i = 1; i < 8; ++i)
> +  for (seek = 0; seek <= 23; seek += 23)
>      {
> -      do_test (i, 64, 256, 0, SMALL_CHAR);
> -      do_test (i, 64, 256, 0, BIG_CHAR);
> +      for (j = 1; j < 32; j += j)
> +       {
> +         for (i = 1; i < 9; ++i)
> +           {
> +             do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
> +           }
> +
> +         for (i = 1; i < 8; ++i)
> +           {
> +             do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
> +
> +             do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
> +           }
> +
> +         for (i = 0; i < 32; ++i)
> +           {
> +             do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
> +             do_test (&json_ctx, getpagesize () - i / 2 - 1, i, i + 1, seek,
> +                      SMALL_CHAR, j);
> +           }
> +         if (seek == 0)
> +           {
> +             break;
> +           }
> +       }
>      }
>
> -  for (i = 0; i < 32; ++i)
> -    {
> -      do_test (0, i, i + 1, 0, SMALL_CHAR);
> -      do_test (0, i, i + 1, 0, BIG_CHAR);
> -    }
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
>
>    return ret;
>  }
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks

-- 
H.J.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2
  2022-04-22 19:06     ` H.J. Lu
@ 2022-05-12 20:13       ` Sunil Pandey
  0 siblings, 0 replies; 36+ messages in thread
From: Sunil Pandey @ 2022-05-12 20:13 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

On Fri, Apr 22, 2022 at 12:09 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The new code unrolls the main loop slightly without adding too much
> > overhead and minimizes the comparisons for the search CHAR.
> >
> > Geometric Mean of all benchmarks New / Old: 0.741
> > See email for all results.
> >
> > Full xcheck passes on x86_64 with and without multiarch enabled.
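
For readers who want the strategy in one place before wading into the
assembly, here is a scalar sketch of what the rewritten loops compute
(illustrative only, not the committed code: it ignores the unaligned
prologue and page-cross handling, inspects a whole block at a time the
way the aligned vector loads do, and uses a byte loop plus
__builtin_clz where the assembly uses pmovmskb and bsr):

#include <stddef.h>

/* Scalar model of the new loop structure: per 16-byte block build a
   zero mask and a CHAR-match mask, remember only the most recent
   block that matched, and resolve the exact position once the first
   null byte is seen.  */
static char *
strrchr_sketch (const char *s, int c)
{
  enum { BLOCK = 16 };
  const char *last_block = NULL;	/* Last block containing CHAR.  */
  unsigned int last_mask = 0;		/* Its per-byte match mask.  */

  for (;; s += BLOCK)
    {
      unsigned int zero_mask = 0, char_mask = 0;
      for (int i = 0; i < BLOCK; i++)
	{
	  if (s[i] == (char) c)
	    char_mask |= 1u << i;
	  if (s[i] == '\0')
	    zero_mask |= 1u << i;
	}

      if (zero_mask == 0)
	{
	  /* No terminator yet: remember a matching block, but do not
	     pin down which byte matched.  */
	  if (char_mask != 0)
	    {
	      last_block = s;
	      last_mask = char_mask;
	    }
	  continue;
	}

      /* Keep only matches at or before the first null byte (the
	 lea/xor/and idiom in the assembly).  */
      char_mask &= zero_mask ^ (zero_mask - 1);
      if (char_mask == 0)
	{
	  if (last_block == NULL)
	    return NULL;
	  s = last_block;
	  char_mask = last_mask;
	}
      /* Highest set bit == right-most match (bsr in the assembly).  */
      return (char *) s + (31 - __builtin_clz (char_mask));
    }
}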
> > ---
> >  sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
> >  sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
> >  sysdeps/x86_64/strrchr.S                | 510 +++++++++++++++---------
> >  sysdeps/x86_64/wcsrchr.S                | 268 +------------
> >  4 files changed, 339 insertions(+), 444 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > index db1b44c23c..866396e947 100644
> > --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > @@ -17,7 +17,7 @@
> >     <https://www.gnu.org/licenses/>.  */
> >
> >  #if IS_IN (libc)
> > -# define strrchr __strrchr_sse2
> > +# define STRRCHR __strrchr_sse2
> >
> >  # undef weak_alias
> >  # define weak_alias(strrchr, rindex)
> > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > index 78d1ca6553..69d2f3cdb1 100644
> > --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > @@ -17,7 +17,6 @@
> >     <https://www.gnu.org/licenses/>.  */
> >
> >  #if IS_IN (libc)
> > -# define wcsrchr __wcsrchr_sse2
> > +# define STRRCHR       __wcsrchr_sse2
> >  #endif
> > -
> >  #include "../wcsrchr.S"
> > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> > index 50d886713e..4d7ba4ceb2 100644
> > --- a/sysdeps/x86_64/strrchr.S
> > +++ b/sysdeps/x86_64/strrchr.S
> > @@ -19,210 +19,360 @@
> >
> >  #include <sysdep.h>
> >
> > +#ifndef STRRCHR
> > +# define STRRCHR       strrchr
> > +#endif
> > +
> > +#ifdef USE_AS_WCSRCHR
> > +# define PCMPEQ        pcmpeqd
> > +# define CHAR_SIZE     4
> > +# define PMINU pminud
> > +#else
> > +# define PCMPEQ        pcmpeqb
> > +# define CHAR_SIZE     1
> > +# define PMINU pminub
> > +#endif
> > +
> > +#define PAGE_SIZE      4096
> > +#define VEC_SIZE       16
> > +
> >         .text
> > -ENTRY (strrchr)
> > -       movd    %esi, %xmm1
> > +ENTRY(STRRCHR)
> > +       movd    %esi, %xmm0
> >         movq    %rdi, %rax
> > -       andl    $4095, %eax
> > -       punpcklbw       %xmm1, %xmm1
> > -       cmpq    $4032, %rax
> > -       punpcklwd       %xmm1, %xmm1
> > -       pshufd  $0, %xmm1, %xmm1
> > +       andl    $(PAGE_SIZE - 1), %eax
> > +#ifndef USE_AS_WCSRCHR
> > +       punpcklbw %xmm0, %xmm0
> > +       punpcklwd %xmm0, %xmm0
> > +#endif
> > +       pshufd  $0, %xmm0, %xmm0
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> >         ja      L(cross_page)
> > -       movdqu  (%rdi), %xmm0
> > +
> > +L(cross_page_continue):
> > +       movups  (%rdi), %xmm1
> >         pxor    %xmm2, %xmm2
> > -       movdqa  %xmm0, %xmm3
> > -       pcmpeqb %xmm1, %xmm0
> > -       pcmpeqb %xmm2, %xmm3
> > -       pmovmskb        %xmm0, %ecx
> > -       pmovmskb        %xmm3, %edx
> > -       testq   %rdx, %rdx
> > -       je      L(next_48_bytes)
> > -       leaq    -1(%rdx), %rax
> > -       xorq    %rdx, %rax
> > -       andq    %rcx, %rax
> > -       je      L(exit)
> > -       bsrq    %rax, %rax
> > +       PCMPEQ  %xmm1, %xmm2
> > +       pmovmskb %xmm2, %ecx
> > +       testl   %ecx, %ecx
> > +       jz      L(aligned_more)
> > +
> > +       PCMPEQ  %xmm0, %xmm1
> > +       pmovmskb %xmm1, %eax
> > +       leal    -1(%rcx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(ret0)
> > +       bsrl    %eax, %eax
> >         addq    %rdi, %rax
> > +       /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > +          search CHAR is zero we are correct. Either way `andq
> > +          -CHAR_SIZE, %rax` gets the correct result.  */
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +L(ret0):
> >         ret
> >
> > +       /* Returns for first vec x1/x2 have hard coded backward search
> > +          path for earlier matches.  */
> >         .p2align 4
> > -L(next_48_bytes):
> > -       movdqu  16(%rdi), %xmm4
> > -       movdqa  %xmm4, %xmm5
> > -       movdqu  32(%rdi), %xmm3
> > -       pcmpeqb %xmm1, %xmm4
> > -       pcmpeqb %xmm2, %xmm5
> > -       movdqu  48(%rdi), %xmm0
> > -       pmovmskb        %xmm5, %edx
> > -       movdqa  %xmm3, %xmm5
> > -       pcmpeqb %xmm1, %xmm3
> > -       pcmpeqb %xmm2, %xmm5
> > -       pcmpeqb %xmm0, %xmm2
> > -       salq    $16, %rdx
> > -       pmovmskb        %xmm3, %r8d
> > -       pmovmskb        %xmm5, %eax
> > -       pmovmskb        %xmm2, %esi
> > -       salq    $32, %r8
> > -       salq    $32, %rax
> > -       pcmpeqb %xmm1, %xmm0
> > -       orq     %rdx, %rax
> > -       movq    %rsi, %rdx
> > -       pmovmskb        %xmm4, %esi
> > -       salq    $48, %rdx
> > -       salq    $16, %rsi
> > -       orq     %r8, %rsi
> > -       orq     %rcx, %rsi
> > -       pmovmskb        %xmm0, %ecx
> > -       salq    $48, %rcx
> > -       orq     %rcx, %rsi
> > -       orq     %rdx, %rax
> > -       je      L(loop_header2)
> > -       leaq    -1(%rax), %rcx
> > -       xorq    %rax, %rcx
> > -       andq    %rcx, %rsi
> > -       je      L(exit)
> > -       bsrq    %rsi, %rsi
> > -       leaq    (%rdi,%rsi), %rax
> > +L(first_vec_x0_test):
> > +       PCMPEQ  %xmm0, %xmm1
> > +       pmovmskb %xmm1, %eax
> > +       testl   %eax, %eax
> > +       jz      L(ret0)
> > +       bsrl    %eax, %eax
> > +       addq    %r8, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> >         ret
> >
> >         .p2align 4
> > -L(loop_header2):
> > -       testq   %rsi, %rsi
> > -       movq    %rdi, %rcx
> > -       je      L(no_c_found)
> > -L(loop_header):
> > -       addq    $64, %rdi
> > -       pxor    %xmm7, %xmm7
> > -       andq    $-64, %rdi
> > -       jmp     L(loop_entry)
> > +L(first_vec_x1):
> > +       PCMPEQ  %xmm0, %xmm2
> > +       pmovmskb %xmm2, %eax
> > +       leal    -1(%rcx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(first_vec_x0_test)
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> >
> >         .p2align 4
> > -L(loop64):
> > -       testq   %rdx, %rdx
> > -       cmovne  %rdx, %rsi
> > -       cmovne  %rdi, %rcx
> > -       addq    $64, %rdi
> > -L(loop_entry):
> > -       movdqa  32(%rdi), %xmm3
> > -       pxor    %xmm6, %xmm6
> > -       movdqa  48(%rdi), %xmm2
> > -       movdqa  %xmm3, %xmm0
> > -       movdqa  16(%rdi), %xmm4
> > -       pminub  %xmm2, %xmm0
> > -       movdqa  (%rdi), %xmm5
> > -       pminub  %xmm4, %xmm0
> > -       pminub  %xmm5, %xmm0
> > -       pcmpeqb %xmm7, %xmm0
> > -       pmovmskb        %xmm0, %eax
> > -       movdqa  %xmm5, %xmm0
> > -       pcmpeqb %xmm1, %xmm0
> > -       pmovmskb        %xmm0, %r9d
> > -       movdqa  %xmm4, %xmm0
> > -       pcmpeqb %xmm1, %xmm0
> > -       pmovmskb        %xmm0, %edx
> > -       movdqa  %xmm3, %xmm0
> > -       pcmpeqb %xmm1, %xmm0
> > -       salq    $16, %rdx
> > -       pmovmskb        %xmm0, %r10d
> > -       movdqa  %xmm2, %xmm0
> > -       pcmpeqb %xmm1, %xmm0
> > -       salq    $32, %r10
> > -       orq     %r10, %rdx
> > -       pmovmskb        %xmm0, %r8d
> > -       orq     %r9, %rdx
> > -       salq    $48, %r8
> > -       orq     %r8, %rdx
> > +L(first_vec_x1_test):
> > +       PCMPEQ  %xmm0, %xmm2
> > +       pmovmskb %xmm2, %eax
> >         testl   %eax, %eax
> > -       je      L(loop64)
> > -       pcmpeqb %xmm6, %xmm4
> > -       pcmpeqb %xmm6, %xmm3
> > -       pcmpeqb %xmm6, %xmm5
> > -       pmovmskb        %xmm4, %eax
> > -       pmovmskb        %xmm3, %r10d
> > -       pcmpeqb %xmm6, %xmm2
> > -       pmovmskb        %xmm5, %r9d
> > -       salq    $32, %r10
> > -       salq    $16, %rax
> > -       pmovmskb        %xmm2, %r8d
> > -       orq     %r10, %rax
> > -       orq     %r9, %rax
> > -       salq    $48, %r8
> > -       orq     %r8, %rax
> > -       leaq    -1(%rax), %r8
> > -       xorq    %rax, %r8
> > -       andq    %r8, %rdx
> > -       cmovne  %rdi, %rcx
> > -       cmovne  %rdx, %rsi
> > -       bsrq    %rsi, %rsi
> > -       leaq    (%rcx,%rsi), %rax
> > +       jz      L(first_vec_x0_test)
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> > +
> > +       .p2align 4
> > +L(first_vec_x2):
> > +       PCMPEQ  %xmm0, %xmm3
> > +       pmovmskb %xmm3, %eax
> > +       leal    -1(%rcx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(first_vec_x1_test)
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> > +
> > +       .p2align 4
> > +L(aligned_more):
> > +       /* Save original pointer if match was in VEC 0.  */
> > +       movq    %rdi, %r8
> > +       andq    $-VEC_SIZE, %rdi
> > +
> > +       movaps  VEC_SIZE(%rdi), %xmm2
> > +       pxor    %xmm3, %xmm3
> > +       PCMPEQ  %xmm2, %xmm3
> > +       pmovmskb %xmm3, %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(first_vec_x1)
> > +
> > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm3
> > +       pxor    %xmm4, %xmm4
> > +       PCMPEQ  %xmm3, %xmm4
> > +       pmovmskb %xmm4, %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(first_vec_x2)
> > +
> > +       addq    $VEC_SIZE, %rdi
> > +       /* Save pointer again before realigning.  */
> > +       movq    %rdi, %rsi
> > +       andq    $-(VEC_SIZE * 2), %rdi
> > +       .p2align 4
> > +L(first_loop):
> > +       /* Do 2x VEC at a time.  */
> > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> > +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> > +       /* Since SSE2 has no pminud, wcsrchr needs separate logic
> > +          for detecting zero. Note that if this is found to be a
> > +          bottleneck it may be worth adding an SSE4.1 wcsrchr
> > +          implementation.  */
> > +#ifdef USE_AS_WCSRCHR
> > +       movaps  %xmm5, %xmm6
> > +       pxor    %xmm8, %xmm8
> > +
> > +       PCMPEQ  %xmm8, %xmm5
> > +       PCMPEQ  %xmm4, %xmm8
> > +       por     %xmm5, %xmm8
> > +#else
> > +       movaps  %xmm5, %xmm6
> > +       PMINU   %xmm4, %xmm5
> > +#endif
> > +
> > +       movaps  %xmm4, %xmm9
> > +       PCMPEQ  %xmm0, %xmm4
> > +       PCMPEQ  %xmm0, %xmm6
> > +       movaps  %xmm6, %xmm7
> > +       por     %xmm4, %xmm6
> > +#ifndef USE_AS_WCSRCHR
> > +       pxor    %xmm8, %xmm8
> > +       PCMPEQ  %xmm5, %xmm8
> > +#endif
> > +       pmovmskb %xmm8, %ecx
> > +       pmovmskb %xmm6, %eax
> > +
> > +       addq    $(VEC_SIZE * 2), %rdi
> > +       /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> > +          macro-fuse with `jz`.  */
> > +       addl    %ecx, %eax
> > +       jz      L(first_loop)
> > +
> > +       /* Check if there is zero match.  */
> > +       testl   %ecx, %ecx
> > +       jz      L(second_loop_match)
> > +
> > +       /* Check if there was a match in last iteration.  */
> > +       subl    %ecx, %eax
> > +       jnz     L(new_match)
> > +
> > +L(first_loop_old_match):
> > +       PCMPEQ  %xmm0, %xmm2
> > +       PCMPEQ  %xmm0, %xmm3
> > +       pmovmskb %xmm2, %ecx
> > +       pmovmskb %xmm3, %eax
> > +       addl    %eax, %ecx
> > +       jz      L(first_vec_x0_test)
> > +       /* NB: We could move this shift to before the branch and save a
> > +          bit of code size / performance on the fall through. The
> > +          branch leads to the null case which generally seems hotter
> > +          than char in first 3x VEC.  */
> > +       sall    $16, %eax
> > +       orl     %ecx, %eax
> > +
> > +       bsrl    %eax, %eax
> > +       addq    %rsi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> > +
> > +       .p2align 4
> > +L(new_match):
> > +       pxor    %xmm6, %xmm6
> > +       PCMPEQ  %xmm9, %xmm6
> > +       pmovmskb %xmm6, %eax
> > +       sall    $16, %ecx
> > +       orl     %eax, %ecx
> > +
> > +       /* We can't reuse either of the old comparisons since we
> > +          mask off zeros after the first zero (instead of using the
> > +          full comparison), so we can't guarantee no interference
> > +          between a match after the end of the string and a valid
> > +          match.  */
> > +       pmovmskb %xmm4, %eax
> > +       pmovmskb %xmm7, %edx
> > +       sall    $16, %edx
> > +       orl     %edx, %eax
> > +
> > +       leal    -1(%ecx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(first_loop_old_match)
> > +       bsrl    %eax, %eax
> > +       addq    %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> >         ret
> >
> > +       /* Save minimum state for getting most recent match. We can
> > +          throw out all previous work.  */
> >         .p2align 4
> > -L(no_c_found):
> > -       movl    $1, %esi
> > -       xorl    %ecx, %ecx
> > -       jmp     L(loop_header)
> > +L(second_loop_match):
> > +       movq    %rdi, %rsi
> > +       movaps  %xmm4, %xmm2
> > +       movaps  %xmm7, %xmm3
> >
> >         .p2align 4
> > -L(exit):
> > -       xorl    %eax, %eax
> > +L(second_loop):
> > +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> > +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> > +       /* Since SSE2 has no pminud, wcsrchr needs separate logic
> > +          for detecting zero. Note that if this is found to be a
> > +          bottleneck it may be worth adding an SSE4.1 wcsrchr
> > +          implementation.  */
> > +#ifdef USE_AS_WCSRCHR
> > +       movaps  %xmm5, %xmm6
> > +       pxor    %xmm8, %xmm8
> > +
> > +       PCMPEQ  %xmm8, %xmm5
> > +       PCMPEQ  %xmm4, %xmm8
> > +       por     %xmm5, %xmm8
> > +#else
> > +       movaps  %xmm5, %xmm6
> > +       PMINU   %xmm4, %xmm5
> > +#endif
> > +
> > +       movaps  %xmm4, %xmm9
> > +       PCMPEQ  %xmm0, %xmm4
> > +       PCMPEQ  %xmm0, %xmm6
> > +       movaps  %xmm6, %xmm7
> > +       por     %xmm4, %xmm6
> > +#ifndef USE_AS_WCSRCHR
> > +       pxor    %xmm8, %xmm8
> > +       PCMPEQ  %xmm5, %xmm8
> > +#endif
> > +
> > +       pmovmskb %xmm8, %ecx
> > +       pmovmskb %xmm6, %eax
> > +
> > +       addq    $(VEC_SIZE * 2), %rdi
> > +       /* Either null term or new occurrence of CHAR.  */
> > +       addl    %ecx, %eax
> > +       jz      L(second_loop)
> > +
> > +       /* No null term, so it must be a new occurrence of CHAR.  */
> > +       testl   %ecx, %ecx
> > +       jz      L(second_loop_match)
> > +
> > +
> > +       subl    %ecx, %eax
> > +       jnz     L(second_loop_new_match)
> > +
> > +L(second_loop_old_match):
> > +       pmovmskb %xmm2, %ecx
> > +       pmovmskb %xmm3, %eax
> > +       sall    $16, %eax
> > +       orl     %ecx, %eax
> > +       bsrl    %eax, %eax
> > +       addq    %rsi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> >         ret
> >
> >         .p2align 4
> > +L(second_loop_new_match):
> > +       pxor    %xmm6, %xmm6
> > +       PCMPEQ  %xmm9, %xmm6
> > +       pmovmskb %xmm6, %eax
> > +       sall    $16, %ecx
> > +       orl     %eax, %ecx
> > +
> > +       /* We can't reuse either of the old comparisons since we
> > +          mask off zeros after the first zero (instead of using the
> > +          full comparison), so we can't guarantee no interference
> > +          between a match after the end of the string and a valid
> > +          match.  */
> > +       pmovmskb %xmm4, %eax
> > +       pmovmskb %xmm7, %edx
> > +       sall    $16, %edx
> > +       orl     %edx, %eax
> > +
> > +       leal    -1(%ecx), %edx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(second_loop_old_match)
> > +       bsrl    %eax, %eax
> > +       addq    %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +       ret
> > +
> > +       .p2align 4,, 4
> >  L(cross_page):
> > -       movq    %rdi, %rax
> > -       pxor    %xmm0, %xmm0
> > -       andq    $-64, %rax
> > -       movdqu  (%rax), %xmm5
> > -       movdqa  %xmm5, %xmm6
> > -       movdqu  16(%rax), %xmm4
> > -       pcmpeqb %xmm1, %xmm5
> > -       pcmpeqb %xmm0, %xmm6
> > -       movdqu  32(%rax), %xmm3
> > -       pmovmskb        %xmm6, %esi
> > -       movdqa  %xmm4, %xmm6
> > -       movdqu  48(%rax), %xmm2
> > -       pcmpeqb %xmm1, %xmm4
> > -       pcmpeqb %xmm0, %xmm6
> > -       pmovmskb        %xmm6, %edx
> > -       movdqa  %xmm3, %xmm6
> > -       pcmpeqb %xmm1, %xmm3
> > -       pcmpeqb %xmm0, %xmm6
> > -       pcmpeqb %xmm2, %xmm0
> > -       salq    $16, %rdx
> > -       pmovmskb        %xmm3, %r9d
> > -       pmovmskb        %xmm6, %r8d
> > -       pmovmskb        %xmm0, %ecx
> > -       salq    $32, %r9
> > -       salq    $32, %r8
> > -       pcmpeqb %xmm1, %xmm2
> > -       orq     %r8, %rdx
> > -       salq    $48, %rcx
> > -       pmovmskb        %xmm5, %r8d
> > -       orq     %rsi, %rdx
> > -       pmovmskb        %xmm4, %esi
> > -       orq     %rcx, %rdx
> > -       pmovmskb        %xmm2, %ecx
> > -       salq    $16, %rsi
> > -       salq    $48, %rcx
> > -       orq     %r9, %rsi
> > -       orq     %r8, %rsi
> > -       orq     %rcx, %rsi
> > +       movq    %rdi, %rsi
> > +       andq    $-VEC_SIZE, %rsi
> > +       movaps  (%rsi), %xmm1
> > +       pxor    %xmm2, %xmm2
> > +       PCMPEQ  %xmm1, %xmm2
> > +       pmovmskb %xmm2, %edx
> >         movl    %edi, %ecx
> > -       subl    %eax, %ecx
> > -       shrq    %cl, %rdx
> > -       shrq    %cl, %rsi
> > -       testq   %rdx, %rdx
> > -       je      L(loop_header2)
> > -       leaq    -1(%rdx), %rax
> > -       xorq    %rdx, %rax
> > -       andq    %rax, %rsi
> > -       je      L(exit)
> > -       bsrq    %rsi, %rax
> > +       andl    $(VEC_SIZE - 1), %ecx
> > +       sarl    %cl, %edx
> > +       jz      L(cross_page_continue)
> > +       PCMPEQ  %xmm0, %xmm1
> > +       pmovmskb %xmm1, %eax
> > +       sarl    %cl, %eax
> > +       leal    -1(%rdx), %ecx
> > +       xorl    %edx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(ret1)
> > +       bsrl    %eax, %eax
> >         addq    %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +#endif
> > +L(ret1):
> >         ret
> > -END (strrchr)
> > +END(STRRCHR)
> >
> > -weak_alias (strrchr, rindex)
> > -libc_hidden_builtin_def (strrchr)
> > +#ifndef USE_AS_WCSRCHR
> > +       weak_alias (STRRCHR, rindex)
> > +       libc_hidden_builtin_def (STRRCHR)
> > +#endif
> > diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> > index 61552954de..2b80efc5ef 100644
> > --- a/sysdeps/x86_64/wcsrchr.S
> > +++ b/sysdeps/x86_64/wcsrchr.S
> > @@ -1,4 +1,4 @@
> > -/* wcsrchr with SSSE3
> > +/* wcsrchr optimized with SSE2.
> >     Copyright (C) 2011-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> > @@ -16,266 +16,12 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#include <sysdep.h>
> >
> > -       .text
> > -ENTRY (wcsrchr)
> > +#define USE_AS_WCSRCHR 1
> > +#define NO_PMINU       1
> >
> > -       movd    %rsi, %xmm1
> > -       mov     %rdi, %rcx
> > -       punpckldq %xmm1, %xmm1
> > -       pxor    %xmm2, %xmm2
> > -       punpckldq %xmm1, %xmm1
> > -       and     $63, %rcx
> > -       cmp     $48, %rcx
> > -       ja      L(crosscache)
> > +#ifndef STRRCHR
> > +# define STRRCHR       wcsrchr
> > +#endif
> >
> > -       movdqu  (%rdi), %xmm0
> > -       pcmpeqd %xmm0, %xmm2
> > -       pcmpeqd %xmm1, %xmm0
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm0, %rax
> > -       add     $16, %rdi
> > -
> > -       test    %rax, %rax
> > -       jnz     L(unaligned_match1)
> > -
> > -       test    %rcx, %rcx
> > -       jnz     L(return_null)
> > -
> > -       and     $-16, %rdi
> > -       xor     %r8, %r8
> > -       jmp     L(loop)
> > -
> > -       .p2align 4
> > -L(unaligned_match1):
> > -       test    %rcx, %rcx
> > -       jnz     L(prolog_find_zero_1)
> > -
> > -       mov     %rax, %r8
> > -       mov     %rdi, %rsi
> > -       and     $-16, %rdi
> > -       jmp     L(loop)
> > -
> > -       .p2align 4
> > -L(crosscache):
> > -       and     $15, %rcx
> > -       and     $-16, %rdi
> > -       pxor    %xmm3, %xmm3
> > -       movdqa  (%rdi), %xmm0
> > -       pcmpeqd %xmm0, %xmm3
> > -       pcmpeqd %xmm1, %xmm0
> > -       pmovmskb %xmm3, %rdx
> > -       pmovmskb %xmm0, %rax
> > -       shr     %cl, %rdx
> > -       shr     %cl, %rax
> > -       add     $16, %rdi
> > -
> > -       test    %rax, %rax
> > -       jnz     L(unaligned_match)
> > -
> > -       test    %rdx, %rdx
> > -       jnz     L(return_null)
> > -
> > -       xor     %r8, %r8
> > -       jmp     L(loop)
> > -
> > -       .p2align 4
> > -L(unaligned_match):
> > -       test    %rdx, %rdx
> > -       jnz     L(prolog_find_zero)
> > -
> > -       mov     %rax, %r8
> > -       lea     (%rdi, %rcx), %rsi
> > -
> > -/* Loop start on aligned string.  */
> > -       .p2align 4
> > -L(loop):
> > -       movdqa  (%rdi), %xmm0
> > -       pcmpeqd %xmm0, %xmm2
> > -       add     $16, %rdi
> > -       pcmpeqd %xmm1, %xmm0
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm0, %rax
> > -       or      %rax, %rcx
> > -       jnz     L(matches)
> > -
> > -       movdqa  (%rdi), %xmm3
> > -       pcmpeqd %xmm3, %xmm2
> > -       add     $16, %rdi
> > -       pcmpeqd %xmm1, %xmm3
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm3, %rax
> > -       or      %rax, %rcx
> > -       jnz     L(matches)
> > -
> > -       movdqa  (%rdi), %xmm4
> > -       pcmpeqd %xmm4, %xmm2
> > -       add     $16, %rdi
> > -       pcmpeqd %xmm1, %xmm4
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm4, %rax
> > -       or      %rax, %rcx
> > -       jnz     L(matches)
> > -
> > -       movdqa  (%rdi), %xmm5
> > -       pcmpeqd %xmm5, %xmm2
> > -       add     $16, %rdi
> > -       pcmpeqd %xmm1, %xmm5
> > -       pmovmskb %xmm2, %rcx
> > -       pmovmskb %xmm5, %rax
> > -       or      %rax, %rcx
> > -       jz      L(loop)
> > -
> > -       .p2align 4
> > -L(matches):
> > -       test    %rax, %rax
> > -       jnz     L(match)
> > -L(return_value):
> > -       test    %r8, %r8
> > -       jz      L(return_null)
> > -       mov     %r8, %rax
> > -       mov     %rsi, %rdi
> > -
> > -       test    $15 << 4, %ah
> > -       jnz     L(match_fourth_wchar)
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(match):
> > -       pmovmskb %xmm2, %rcx
> > -       test    %rcx, %rcx
> > -       jnz     L(find_zero)
> > -       mov     %rax, %r8
> > -       mov     %rdi, %rsi
> > -       jmp     L(loop)
> > -
> > -       .p2align 4
> > -L(find_zero):
> > -       test    $15, %cl
> > -       jnz     L(find_zero_in_first_wchar)
> > -       test    %cl, %cl
> > -       jnz     L(find_zero_in_second_wchar)
> > -       test    $15, %ch
> > -       jnz     L(find_zero_in_third_wchar)
> > -
> > -       and     $1 << 13 - 1, %rax
> > -       jz      L(return_value)
> > -
> > -       test    $15 << 4, %ah
> > -       jnz     L(match_fourth_wchar)
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(find_zero_in_first_wchar):
> > -       test    $1, %rax
> > -       jz      L(return_value)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(find_zero_in_second_wchar):
> > -       and     $1 << 5 - 1, %rax
> > -       jz      L(return_value)
> > -
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(find_zero_in_third_wchar):
> > -       and     $1 << 9 - 1, %rax
> > -       jz      L(return_value)
> > -
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(prolog_find_zero):
> > -       add     %rcx, %rdi
> > -       mov     %rdx, %rcx
> > -L(prolog_find_zero_1):
> > -       test    $15, %cl
> > -       jnz     L(prolog_find_zero_in_first_wchar)
> > -       test    %cl, %cl
> > -       jnz     L(prolog_find_zero_in_second_wchar)
> > -       test    $15, %ch
> > -       jnz     L(prolog_find_zero_in_third_wchar)
> > -
> > -       and     $1 << 13 - 1, %rax
> > -       jz      L(return_null)
> > -
> > -       test    $15 << 4, %ah
> > -       jnz     L(match_fourth_wchar)
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(prolog_find_zero_in_first_wchar):
> > -       test    $1, %rax
> > -       jz      L(return_null)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(prolog_find_zero_in_second_wchar):
> > -       and     $1 << 5 - 1, %rax
> > -       jz      L(return_null)
> > -
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(prolog_find_zero_in_third_wchar):
> > -       and     $1 << 9 - 1, %rax
> > -       jz      L(return_null)
> > -
> > -       test    %ah, %ah
> > -       jnz     L(match_third_wchar)
> > -       test    $15 << 4, %al
> > -       jnz     L(match_second_wchar)
> > -       lea     -16(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(match_second_wchar):
> > -       lea     -12(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(match_third_wchar):
> > -       lea     -8(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(match_fourth_wchar):
> > -       lea     -4(%rdi), %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(return_null):
> > -       xor     %rax, %rax
> > -       ret
> > -
> > -END (wcsrchr)
> > +#include "../strrchr.S"
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2
  2022-04-22 19:03     ` H.J. Lu
@ 2022-05-12 20:14       ` Sunil Pandey
  2022-07-20 15:33         ` Noah Goldstein
  0 siblings, 1 reply; 36+ messages in thread
From: Sunil Pandey @ 2022-05-12 20:14 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

On Fri, Apr 22, 2022 at 12:08 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The new code unrolls the main loop slightly without adding too much
> > overhead and minimizes the comparisons for the search CHAR.
> >
> > Geometric Mean of all benchmarks New / Old: 0.832
> > See email for all results.
> >
> > Full xcheck passes on x86_64 with and without multiarch enabled.
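
One small reading aid for the diff below (an illustrative note, not
part of the patch): the `blsmskl %ecx, %ecx' / `andl %ecx, %eax' pairs
compute "matches at or before the first null in this vector"; the SSE2
version spells the same idiom as lea/xor/and.  In C terms:

/* blsmsk (x) == x ^ (x - 1): every bit up to and including the lowest
   set bit of x.  ANDing the CHAR-match mask with blsmsk (zero_mask)
   drops any match that lies beyond the terminating null byte.  Only
   meaningful when zero_mask != 0, which the code checks first.  */
static unsigned int
matches_before_first_null (unsigned int char_mask, unsigned int zero_mask)
{
  return char_mask & (zero_mask ^ (zero_mask - 1));
}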
> > ---
> >  sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
> >  1 file changed, 269 insertions(+), 157 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
> > index 1df2adfad0..bd26ba80d5 100644
> > --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
> > @@ -27,9 +27,13 @@
> >  # ifdef USE_AS_WCSRCHR
> >  #  define VPBROADCAST  vpbroadcastd
> >  #  define VPCMPEQ      vpcmpeqd
> > +#  define VPMIN        vpminud
> > +#  define CHAR_SIZE    4
> >  # else
> >  #  define VPBROADCAST  vpbroadcastb
> >  #  define VPCMPEQ      vpcmpeqb
> > +#  define VPMIN        vpminub
> > +#  define CHAR_SIZE    1
> >  # endif
> >
> >  # ifndef VZEROUPPER
> > @@ -41,196 +45,304 @@
> >  # endif
> >
> >  # define VEC_SIZE      32
> > +# define PAGE_SIZE     4096
> >
> > -       .section SECTION(.text),"ax",@progbits
> > -ENTRY (STRRCHR)
> > -       movd    %esi, %xmm4
> > -       movl    %edi, %ecx
> > +       .section SECTION(.text), "ax", @progbits
> > +ENTRY(STRRCHR)
> > +       movd    %esi, %xmm7
> > +       movl    %edi, %eax
> >         /* Broadcast CHAR to YMM4.  */
> > -       VPBROADCAST %xmm4, %ymm4
> > +       VPBROADCAST %xmm7, %ymm7
> >         vpxor   %xmm0, %xmm0, %xmm0
> >
> > -       /* Check if we may cross page boundary with one vector load.  */
> > -       andl    $(2 * VEC_SIZE - 1), %ecx
> > -       cmpl    $VEC_SIZE, %ecx
> > -       ja      L(cros_page_boundary)
> > +       /* Shift here instead of `andl` to save code size (saves a fetch
> > +          block).  */
> > +       sall    $20, %eax
> > +       cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> > +       ja      L(cross_page)
> >
> > +L(page_cross_continue):
> >         vmovdqu (%rdi), %ymm1
> > -       VPCMPEQ %ymm1, %ymm0, %ymm2
> > -       VPCMPEQ %ymm1, %ymm4, %ymm3
> > -       vpmovmskb %ymm2, %ecx
> > -       vpmovmskb %ymm3, %eax
> > -       addq    $VEC_SIZE, %rdi
> > +       /* Check end of string match.  */
> > +       VPCMPEQ %ymm1, %ymm0, %ymm6
> > +       vpmovmskb %ymm6, %ecx
> > +       testl   %ecx, %ecx
> > +       jz      L(aligned_more)
> > +
> > +       /* Only check match with search CHAR if needed.  */
> > +       VPCMPEQ %ymm1, %ymm7, %ymm1
> > +       vpmovmskb %ymm1, %eax
> > +       /* Check if match before first zero.  */
> > +       blsmskl %ecx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(ret0)
> > +       bsrl    %eax, %eax
> > +       addq    %rdi, %rax
> > +       /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > +          search CHAR is zero we are correct. Either way `andq
> > +          -CHAR_SIZE, %rax` gets the correct result.  */
> > +# ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +# endif
> > +L(ret0):
> > +L(return_vzeroupper):
> > +       ZERO_UPPER_VEC_REGISTERS_RETURN
> > +
> > +       /* Returns for first vec x1/x2 have hard coded backward search
> > +          path for earlier matches.  */
> > +       .p2align 4,, 10
> > +L(first_vec_x1):
> > +       VPCMPEQ %ymm2, %ymm7, %ymm6
> > +       vpmovmskb %ymm6, %eax
> > +       blsmskl %ecx, %ecx
> > +       andl    %ecx, %eax
> > +       jnz     L(first_vec_x1_return)
> > +
> > +       .p2align 4,, 4
> > +L(first_vec_x0_test):
> > +       VPCMPEQ %ymm1, %ymm7, %ymm6
> > +       vpmovmskb %ymm6, %eax
> > +       testl   %eax, %eax
> > +       jz      L(ret1)
> > +       bsrl    %eax, %eax
> > +       addq    %r8, %rax
> > +# ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +# endif
> > +L(ret1):
> > +       VZEROUPPER_RETURN
> >
> > +       .p2align 4,, 10
> > +L(first_vec_x0_x1_test):
> > +       VPCMPEQ %ymm2, %ymm7, %ymm6
> > +       vpmovmskb %ymm6, %eax
> > +       /* Check ymm2 for search CHAR match. If no match then check ymm1
> > +          before returning.  */
> >         testl   %eax, %eax
> > -       jnz     L(first_vec)
> > +       jz      L(first_vec_x0_test)
> > +       .p2align 4,, 4
> > +L(first_vec_x1_return):
> > +       bsrl    %eax, %eax
> > +       leaq    1(%rdi, %rax), %rax
> > +# ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +# endif
> > +       VZEROUPPER_RETURN
> >
> > -       testl   %ecx, %ecx
> > -       jnz     L(return_null)
> >
> > -       andq    $-VEC_SIZE, %rdi
> > -       xorl    %edx, %edx
> > -       jmp     L(aligned_loop)
> > +       .p2align 4,, 10
> > +L(first_vec_x2):
> > +       VPCMPEQ %ymm3, %ymm7, %ymm6
> > +       vpmovmskb %ymm6, %eax
> > +       blsmskl %ecx, %ecx
> > +       /* If no in-range search CHAR match in ymm3 then need to check
> > +          ymm1/ymm2 for an earlier match (we delay checking search
> > +          CHAR matches until needed).  */
> > +       andl    %ecx, %eax
> > +       jz      L(first_vec_x0_x1_test)
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE + 1)(%rdi, %rax), %rax
> > +# ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +# endif
> > +       VZEROUPPER_RETURN
> > +
> >
> >         .p2align 4
> > -L(first_vec):
> > -       /* Check if there is a nul CHAR.  */
> > +L(aligned_more):
> > +       /* Save original pointer if match was in VEC 0.  */
> > +       movq    %rdi, %r8
> > +
> > +       /* Align src.  */
> > +       orq     $(VEC_SIZE - 1), %rdi
> > +       vmovdqu 1(%rdi), %ymm2
> > +       VPCMPEQ %ymm2, %ymm0, %ymm6
> > +       vpmovmskb %ymm6, %ecx
> >         testl   %ecx, %ecx
> > -       jnz     L(char_and_nul_in_first_vec)
> > +       jnz     L(first_vec_x1)
> >
> > -       /* Remember the match and keep searching.  */
> > -       movl    %eax, %edx
> > -       movq    %rdi, %rsi
> > -       andq    $-VEC_SIZE, %rdi
> > -       jmp     L(aligned_loop)
> > +       vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
> > +       VPCMPEQ %ymm3, %ymm0, %ymm6
> > +       vpmovmskb %ymm6, %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(first_vec_x2)
> >
> > +       /* Save pointer again before realigning.  */
> > +       movq    %rdi, %rsi
> > +       addq    $(VEC_SIZE + 1), %rdi
> > +       andq    $-(VEC_SIZE * 2), %rdi
> >         .p2align 4
> > -L(cros_page_boundary):
> > -       andl    $(VEC_SIZE - 1), %ecx
> > -       andq    $-VEC_SIZE, %rdi
> > -       vmovdqa (%rdi), %ymm1
> > -       VPCMPEQ %ymm1, %ymm0, %ymm2
> > -       VPCMPEQ %ymm1, %ymm4, %ymm3
> > -       vpmovmskb %ymm2, %edx
> > -       vpmovmskb %ymm3, %eax
> > -       shrl    %cl, %edx
> > -       shrl    %cl, %eax
> > -       addq    $VEC_SIZE, %rdi
> > -
> > -       /* Check if there is a CHAR.  */
> > +L(first_aligned_loop):
> > +       /* Do 2x VEC at a time. Any more and the cost of finding the
> > +          match outweighs the loop benefit.  */
> > +       vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
> > +       vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
> > +
> > +       VPCMPEQ %ymm4, %ymm7, %ymm6
> > +       VPMIN   %ymm4, %ymm5, %ymm8
> > +       VPCMPEQ %ymm5, %ymm7, %ymm10
> > +       vpor    %ymm6, %ymm10, %ymm5
> > +       VPCMPEQ %ymm8, %ymm0, %ymm8
> > +       vpor    %ymm5, %ymm8, %ymm9
> > +
> > +       vpmovmskb %ymm9, %eax
> > +       addq    $(VEC_SIZE * 2), %rdi
> > +       /* No zero or search CHAR.  */
> >         testl   %eax, %eax
> > -       jnz     L(found_char)
> > -
> > -       testl   %edx, %edx
> > -       jnz     L(return_null)
> > +       jz      L(first_aligned_loop)
> >
> > -       jmp     L(aligned_loop)
> > -
> > -       .p2align 4
> > -L(found_char):
> > -       testl   %edx, %edx
> > -       jnz     L(char_and_nul)
> > +       /* If no zero CHAR then go to second loop (this allows us to
> > +          throw away all prior work).  */
> > +       vpmovmskb %ymm8, %ecx
> > +       testl   %ecx, %ecx
> > +       jz      L(second_aligned_loop_prep)
> >
> > -       /* Remember the match and keep searching.  */
> > -       movl    %eax, %edx
> > -       leaq    (%rdi, %rcx), %rsi
> > +       /* Search char could be zero so we need to get the true match.
> > +        */
> > +       vpmovmskb %ymm5, %eax
> > +       testl   %eax, %eax
> > +       jnz     L(first_aligned_loop_return)
> >
> > -       .p2align 4
> > -L(aligned_loop):
> > -       vmovdqa (%rdi), %ymm1
> > -       VPCMPEQ %ymm1, %ymm0, %ymm2
> > -       addq    $VEC_SIZE, %rdi
> > -       VPCMPEQ %ymm1, %ymm4, %ymm3
> > -       vpmovmskb %ymm2, %ecx
> > -       vpmovmskb %ymm3, %eax
> > -       orl     %eax, %ecx
> > -       jnz     L(char_nor_null)
> > -
> > -       vmovdqa (%rdi), %ymm1
> > -       VPCMPEQ %ymm1, %ymm0, %ymm2
> > -       add     $VEC_SIZE, %rdi
> > -       VPCMPEQ %ymm1, %ymm4, %ymm3
> > -       vpmovmskb %ymm2, %ecx
> > +       .p2align 4,, 4
> > +L(first_vec_x1_or_x2):
> > +       VPCMPEQ %ymm3, %ymm7, %ymm3
> > +       VPCMPEQ %ymm2, %ymm7, %ymm2
> >         vpmovmskb %ymm3, %eax
> > -       orl     %eax, %ecx
> > -       jnz     L(char_nor_null)
> > -
> > -       vmovdqa (%rdi), %ymm1
> > -       VPCMPEQ %ymm1, %ymm0, %ymm2
> > -       addq    $VEC_SIZE, %rdi
> > -       VPCMPEQ %ymm1, %ymm4, %ymm3
> > -       vpmovmskb %ymm2, %ecx
> > -       vpmovmskb %ymm3, %eax
> > -       orl     %eax, %ecx
> > -       jnz     L(char_nor_null)
> > -
> > -       vmovdqa (%rdi), %ymm1
> > -       VPCMPEQ %ymm1, %ymm0, %ymm2
> > -       addq    $VEC_SIZE, %rdi
> > -       VPCMPEQ %ymm1, %ymm4, %ymm3
> > -       vpmovmskb %ymm2, %ecx
> > -       vpmovmskb %ymm3, %eax
> > -       orl     %eax, %ecx
> > -       jz      L(aligned_loop)
> > -
> > -       .p2align 4
> > -L(char_nor_null):
> > -       /* Find a CHAR or a nul CHAR in a loop.  */
> > -       testl   %eax, %eax
> > -       jnz     L(match)
> > -L(return_value):
> > -       testl   %edx, %edx
> > -       jz      L(return_null)
> > -       movl    %edx, %eax
> > -       movq    %rsi, %rdi
> > +       vpmovmskb %ymm2, %edx
> > +       /* Use add for macro-fusion.  */
> > +       addq    %rax, %rdx
> > +       jz      L(first_vec_x0_test)
> > +       /* NB: We could move this shift to before the branch and save a
> > +          bit of code size / performance on the fall through. The
> > +          branch leads to the null case which generally seems hotter
> > +          than char in first 3x VEC.  */
> > +       salq    $32, %rax
> > +       addq    %rdx, %rax
> > +       bsrq    %rax, %rax
> > +       leaq    1(%rsi, %rax), %rax
> > +# ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +# endif
> > +       VZEROUPPER_RETURN
> >
> > +       .p2align 4,, 8
> > +L(first_aligned_loop_return):
> > +       VPCMPEQ %ymm4, %ymm0, %ymm4
> > +       vpmovmskb %ymm4, %edx
> > +       salq    $32, %rcx
> > +       orq     %rdx, %rcx
> > +
> > +       vpmovmskb %ymm10, %eax
> > +       vpmovmskb %ymm6, %edx
> > +       salq    $32, %rax
> > +       orq     %rdx, %rax
> > +       blsmskq %rcx, %rcx
> > +       andq    %rcx, %rax
> > +       jz      L(first_vec_x1_or_x2)
> > +
> > +       bsrq    %rax, %rax
> > +       leaq    -(VEC_SIZE * 2)(%rdi, %rax), %rax
> >  # ifdef USE_AS_WCSRCHR
> > -       /* Keep the first bit for each matching CHAR for bsr.  */
> > -       andl    $0x11111111, %eax
> > +       andq    $-CHAR_SIZE, %rax
> >  # endif
> > -       bsrl    %eax, %eax
> > -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> > -L(return_vzeroupper):
> > -       ZERO_UPPER_VEC_REGISTERS_RETURN
> > +       VZEROUPPER_RETURN
> >
> > +       /* Search char cannot be zero.  */
> >         .p2align 4
> > -L(match):
> > -       /* Find a CHAR.  Check if there is a nul CHAR.  */
> > -       vpmovmskb %ymm2, %ecx
> > -       testl   %ecx, %ecx
> > -       jnz     L(find_nul)
> > -
> > -       /* Remember the match and keep searching.  */
> > -       movl    %eax, %edx
> > +L(second_aligned_loop_set_furthest_match):
> > +       /* Save VEC and pointer from most recent match.  */
> > +L(second_aligned_loop_prep):
> >         movq    %rdi, %rsi
> > -       jmp     L(aligned_loop)
> > +       vmovdqu %ymm6, %ymm2
> > +       vmovdqu %ymm10, %ymm3
> >
> >         .p2align 4
> > -L(find_nul):
> > -# ifdef USE_AS_WCSRCHR
> > -       /* Keep the first bit for each matching CHAR for bsr.  */
> > -       andl    $0x11111111, %ecx
> > -       andl    $0x11111111, %eax
> > -# endif
> > -       /* Mask out any matching bits after the nul CHAR.  */
> > -       movl    %ecx, %r8d
> > -       subl    $1, %r8d
> > -       xorl    %ecx, %r8d
> > -       andl    %r8d, %eax
> > +L(second_aligned_loop):
> > +       /* Search 2x at a time.  */
> > +       vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
> > +       vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
> > +
> > +       VPCMPEQ %ymm4, %ymm7, %ymm6
> > +       VPMIN   %ymm4, %ymm5, %ymm1
> > +       VPCMPEQ %ymm5, %ymm7, %ymm10
> > +       vpor    %ymm6, %ymm10, %ymm5
> > +       VPCMPEQ %ymm1, %ymm0, %ymm1
> > +       vpor    %ymm5, %ymm1, %ymm9
> > +
> > +       vpmovmskb %ymm9, %eax
> > +       addq    $(VEC_SIZE * 2), %rdi
> >         testl   %eax, %eax
> > -       /* If there is no CHAR here, return the remembered one.  */
> > -       jz      L(return_value)
> > -       bsrl    %eax, %eax
> > -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(char_and_nul):
> > -       /* Find both a CHAR and a nul CHAR.  */
> > -       addq    %rcx, %rdi
> > -       movl    %edx, %ecx
> > -L(char_and_nul_in_first_vec):
> > -# ifdef USE_AS_WCSRCHR
> > -       /* Keep the first bit for each matching CHAR for bsr.  */
> > -       andl    $0x11111111, %ecx
> > -       andl    $0x11111111, %eax
> > -# endif
> > -       /* Mask out any matching bits after the nul CHAR.  */
> > -       movl    %ecx, %r8d
> > -       subl    $1, %r8d
> > -       xorl    %ecx, %r8d
> > -       andl    %r8d, %eax
> > +       jz      L(second_aligned_loop)
> > +       vpmovmskb %ymm1, %ecx
> > +       testl   %ecx, %ecx
> > +       jz      L(second_aligned_loop_set_furthest_match)
> > +       vpmovmskb %ymm5, %eax
> >         testl   %eax, %eax
> > -       /* Return null pointer if the nul CHAR comes first.  */
> > -       jz      L(return_null)
> > -       bsrl    %eax, %eax
> > -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> > +       jnz     L(return_new_match)
> > +
> > +       /* This is the hot path. We know CHAR is in bounds and that
> > +          ymm3/ymm2 have the latest match.  */
> > +       .p2align 4,, 4
> > +L(return_old_match):
> > +       vpmovmskb %ymm3, %eax
> > +       vpmovmskb %ymm2, %edx
> > +       salq    $32, %rax
> > +       orq     %rdx, %rax
> > +       bsrq    %rax, %rax
> > +       /* Search char cannot be zero so safe to just use lea for
> > +          wcsrchr.  */
> > +       leaq    (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
> >         VZEROUPPER_RETURN
> >
> > -       .p2align 4
> > -L(return_null):
> > -       xorl    %eax, %eax
> > +       /* Last iteration also potentially has a match.  */
> > +       .p2align 4,, 8
> > +L(return_new_match):
> > +       VPCMPEQ %ymm4, %ymm0, %ymm4
> > +       vpmovmskb %ymm4, %edx
> > +       salq    $32, %rcx
> > +       orq     %rdx, %rcx
> > +
> > +       vpmovmskb %ymm10, %eax
> > +       vpmovmskb %ymm6, %edx
> > +       salq    $32, %rax
> > +       orq     %rdx, %rax
> > +       blsmskq %rcx, %rcx
> > +       andq    %rcx, %rax
> > +       jz      L(return_old_match)
> > +       bsrq    %rax, %rax
> > +       /* Search char cannot be zero so safe to just use lea for
> > +          wcsrchr.  */
> > +       leaq    (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
> >         VZEROUPPER_RETURN
> >
> > -END (STRRCHR)
> > +       .p2align 4,, 4
> > +L(cross_page):
> > +       movq    %rdi, %rsi
> > +       andq    $-VEC_SIZE, %rsi
> > +       vmovdqu (%rsi), %ymm1
> > +       VPCMPEQ %ymm1, %ymm0, %ymm6
> > +       vpmovmskb %ymm6, %ecx
> > +       /* Shift out zero CHAR matches that are before the beginning of
> > +          src (rdi).  */
> > +       shrxl   %edi, %ecx, %ecx
> > +       testl   %ecx, %ecx
> > +       jz      L(page_cross_continue)
> > +       VPCMPEQ %ymm1, %ymm7, %ymm1
> > +       vpmovmskb %ymm1, %eax
> > +
> > +       /* Shift out search CHAR matches that are before the beginning of
> > +          src (rdi).  */
> > +       shrxl   %edi, %eax, %eax
> > +       blsmskl %ecx, %ecx
> > +       /* Check if any search CHAR match in range.  */
> > +       andl    %ecx, %eax
> > +       jz      L(ret2)
> > +       bsrl    %eax, %eax
> > +       addq    %rdi, %rax
> > +# ifdef USE_AS_WCSRCHR
> > +       andq    $-CHAR_SIZE, %rax
> > +# endif
> > +L(ret2):
> > +       VZEROUPPER_RETURN
> > +END(STRRCHR)
> >  #endif
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 36+ messages in thread
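
For readers skimming the quoted assembly: the strategy in both aligned loops
is simply "remember the furthest match of the search CHAR seen so far and
stop at the first null terminator".  A minimal scalar C sketch of that
behavior (an illustrative model only, not code from the patch):

#include <stddef.h>

/* Scalar model of strrchr: keep the most recent (furthest) match of C
   and stop at the first null terminator.  */
static char *
strrchr_model (const char *s, int c)
{
  const char *last = NULL;
  for (;; s++)
    {
      if (*s == (char) c)
        last = s;
      if (*s == '\0')
        return (char *) last;
    }
}

The vectorized versions do the same thing 32 bytes at a time, which is why
the second aligned loop saves a "furthest match" vector and pointer instead
of a single byte position.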

* Re: [PATCH v3 4/4] x86: Optimize {str|wcs}rchr-evex
  2022-04-22 19:04     ` H.J. Lu
@ 2022-05-12 20:16       ` Sunil Pandey
  0 siblings, 0 replies; 36+ messages in thread
From: Sunil Pandey @ 2022-05-12 20:16 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

On Fri, Apr 22, 2022 at 12:08 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The new code unrolls the main loop slightly without adding too much
> > overhead and minimizes the comparisons for the search CHAR.
> >
> > Geometric Mean of all benchmarks New / Old: 0.755
> > See email for all results.
> >
> > Full xcheck passes on x86_64 with and without multiarch enabled.
> > ---
> >  sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
> >  1 file changed, 290 insertions(+), 181 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
> > index adeddaed32..8014c285b3 100644
> > --- a/sysdeps/x86_64/multiarch/strrchr-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
> > @@ -24,242 +24,351 @@
> >  #  define STRRCHR      __strrchr_evex
> >  # endif
> >
> > -# define VMOVU         vmovdqu64
> > -# define VMOVA         vmovdqa64
> > +# define VMOVU vmovdqu64
> > +# define VMOVA vmovdqa64
> >
> >  # ifdef USE_AS_WCSRCHR
> > +#  define SHIFT_REG    esi
> > +
> > +#  define kunpck       kunpckbw
> > +#  define kmov_2x      kmovd
> > +#  define maskz_2x     ecx
> > +#  define maskm_2x     eax
> > +#  define CHAR_SIZE    4
> > +#  define VPMIN        vpminud
> > +#  define VPTESTN      vptestnmd
> >  #  define VPBROADCAST  vpbroadcastd
> > -#  define VPCMP                vpcmpd
> > -#  define SHIFT_REG    r8d
> > +#  define VPCMP        vpcmpd
> >  # else
> > +#  define SHIFT_REG    edi
> > +
> > +#  define kunpck       kunpckdq
> > +#  define kmov_2x      kmovq
> > +#  define maskz_2x     rcx
> > +#  define maskm_2x     rax
> > +
> > +#  define CHAR_SIZE    1
> > +#  define VPMIN        vpminub
> > +#  define VPTESTN      vptestnmb
> >  #  define VPBROADCAST  vpbroadcastb
> > -#  define VPCMP                vpcmpb
> > -#  define SHIFT_REG    ecx
> > +#  define VPCMP        vpcmpb
> >  # endif
> >
> >  # define XMMZERO       xmm16
> >  # define YMMZERO       ymm16
> >  # define YMMMATCH      ymm17
> > -# define YMM1          ymm18
> > +# define YMMSAVE       ymm18
> > +
> > +# define YMM1  ymm19
> > +# define YMM2  ymm20
> > +# define YMM3  ymm21
> > +# define YMM4  ymm22
> > +# define YMM5  ymm23
> > +# define YMM6  ymm24
> > +# define YMM7  ymm25
> > +# define YMM8  ymm26
> >
> > -# define VEC_SIZE      32
> >
> > -       .section .text.evex,"ax",@progbits
> > -ENTRY (STRRCHR)
> > -       movl    %edi, %ecx
> > +# define VEC_SIZE      32
> > +# define PAGE_SIZE     4096
> > +       .section .text.evex, "ax", @progbits
> > +ENTRY(STRRCHR)
> > +       movl    %edi, %eax
> >         /* Broadcast CHAR to YMMMATCH.  */
> >         VPBROADCAST %esi, %YMMMATCH
> >
> > -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> > -
> > -       /* Check if we may cross page boundary with one vector load.  */
> > -       andl    $(2 * VEC_SIZE - 1), %ecx
> > -       cmpl    $VEC_SIZE, %ecx
> > -       ja      L(cros_page_boundary)
> > +       andl    $(PAGE_SIZE - 1), %eax
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +       jg      L(cross_page_boundary)
> >
> > +L(page_cross_continue):
> >         VMOVU   (%rdi), %YMM1
> > -
> > -       /* Each bit in K0 represents a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> > -       /* Each bit in K1 represents a CHAR in YMM1.  */
> > -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > +       /* k0 has a 1 for each zero CHAR in YMM1.  */
> > +       VPTESTN %YMM1, %YMM1, %k0
> >         kmovd   %k0, %ecx
> > -       kmovd   %k1, %eax
> > -
> > -       addq    $VEC_SIZE, %rdi
> > -
> > -       testl   %eax, %eax
> > -       jnz     L(first_vec)
> > -
> >         testl   %ecx, %ecx
> > -       jnz     L(return_null)
> > -
> > -       andq    $-VEC_SIZE, %rdi
> > -       xorl    %edx, %edx
> > -       jmp     L(aligned_loop)
> > -
> > -       .p2align 4
> > -L(first_vec):
> > -       /* Check if there is a null byte.  */
> > -       testl   %ecx, %ecx
> > -       jnz     L(char_and_nul_in_first_vec)
> > -
> > -       /* Remember the match and keep searching.  */
> > -       movl    %eax, %edx
> > -       movq    %rdi, %rsi
> > -       andq    $-VEC_SIZE, %rdi
> > -       jmp     L(aligned_loop)
> > -
> > -       .p2align 4
> > -L(cros_page_boundary):
> > -       andl    $(VEC_SIZE - 1), %ecx
> > -       andq    $-VEC_SIZE, %rdi
> > +       jz      L(aligned_more)
> > +       /* fallthrough: zero CHAR in first VEC.  */
> >
> > +       /* K1 has a 1 for each search CHAR match in YMM1.  */
> > +       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > +       kmovd   %k1, %eax
> > +       /* Build mask up until first zero CHAR (used to mask off
> > +          potential search CHAR matches past the end of the string).
> > +        */
> > +       blsmskl %ecx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(ret0)
> > +       /* Get last match (the `andl` removed any out of bounds
> > +          matches).  */
> > +       bsrl    %eax, %eax
> >  # ifdef USE_AS_WCSRCHR
> > -       /* NB: Divide shift count by 4 since each bit in K1 represent 4
> > -          bytes.  */
> > -       movl    %ecx, %SHIFT_REG
> > -       sarl    $2, %SHIFT_REG
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > +       addq    %rdi, %rax
> >  # endif
> > +L(ret0):
> > +       ret
> >
> > -       VMOVA   (%rdi), %YMM1
> > -
> > -       /* Each bit in K0 represents a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> > -       /* Each bit in K1 represents a CHAR in YMM1.  */
> > +       /* Returns for first vec x1/x2/x3 have hard coded backward
> > +          search path for earlier matches.  */
> > +       .p2align 4,, 6
> > +L(first_vec_x1):
> > +       VPCMP   $0, %YMMMATCH, %YMM2, %k1
> > +       kmovd   %k1, %eax
> > +       blsmskl %ecx, %ecx
> > +       /* eax non-zero if search CHAR in range.  */
> > +       andl    %ecx, %eax
> > +       jnz     L(first_vec_x1_return)
> > +
> > +       /* fallthrough: no match in YMM2 then need to check for earlier
> > +          matches (in YMM1).  */
> > +       .p2align 4,, 4
> > +L(first_vec_x0_test):
> >         VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > -       kmovd   %k0, %edx
> >         kmovd   %k1, %eax
> > -
> > -       shrxl   %SHIFT_REG, %edx, %edx
> > -       shrxl   %SHIFT_REG, %eax, %eax
> > -       addq    $VEC_SIZE, %rdi
> > -
> > -       /* Check if there is a CHAR.  */
> >         testl   %eax, %eax
> > -       jnz     L(found_char)
> > -
> > -       testl   %edx, %edx
> > -       jnz     L(return_null)
> > -
> > -       jmp     L(aligned_loop)
> > -
> > -       .p2align 4
> > -L(found_char):
> > -       testl   %edx, %edx
> > -       jnz     L(char_and_nul)
> > -
> > -       /* Remember the match and keep searching.  */
> > -       movl    %eax, %edx
> > -       leaq    (%rdi, %rcx), %rsi
> > +       jz      L(ret1)
> > +       bsrl    %eax, %eax
> > +# ifdef USE_AS_WCSRCHR
> > +       leaq    (%rsi, %rax, CHAR_SIZE), %rax
> > +# else
> > +       addq    %rsi, %rax
> > +# endif
> > +L(ret1):
> > +       ret
> >
> > -       .p2align 4
> > -L(aligned_loop):
> > -       VMOVA   (%rdi), %YMM1
> > -       addq    $VEC_SIZE, %rdi
> > +       .p2align 4,, 10
> > +L(first_vec_x1_or_x2):
> > +       VPCMP   $0, %YMM3, %YMMMATCH, %k3
> > +       VPCMP   $0, %YMM2, %YMMMATCH, %k2
> > +       /* K2 and K3 have 1 for any search CHAR match. Test if any
> > +          matches between either of them. Otherwise check YMM1.  */
> > +       kortestd %k2, %k3
> > +       jz      L(first_vec_x0_test)
> > +
> > +       /* Guaranteed that YMM2 and YMM3 are within range so merge the
> > +          two bitmasks then get last result.  */
> > +       kunpck  %k2, %k3, %k3
> > +       kmovq   %k3, %rax
> > +       bsrq    %rax, %rax
> > +       leaq    (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
> > +       ret
> >
> > -       /* Each bit in K0 represents a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> > -       /* Each bit in K1 represents a CHAR in YMM1.  */
> > -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > -       kmovd   %k0, %ecx
> > +       .p2align 4,, 6
> > +L(first_vec_x3):
> > +       VPCMP   $0, %YMMMATCH, %YMM4, %k1
> >         kmovd   %k1, %eax
> > -       orl     %eax, %ecx
> > -       jnz     L(char_nor_null)
> > +       blsmskl %ecx, %ecx
> > +       /* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
> > +       andl    %ecx, %eax
> > +       jz      L(first_vec_x1_or_x2)
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> >
> > -       VMOVA   (%rdi), %YMM1
> > -       add     $VEC_SIZE, %rdi
> > +       .p2align 4,, 6
> > +L(first_vec_x0_x1_test):
> > +       VPCMP   $0, %YMMMATCH, %YMM2, %k1
> > +       kmovd   %k1, %eax
> > +       /* Check YMM2 for last match first. If no match try YMM1.  */
> > +       testl   %eax, %eax
> > +       jz      L(first_vec_x0_test)
> > +       .p2align 4,, 4
> > +L(first_vec_x1_return):
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> >
> > -       /* Each bit in K0 represents a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> > -       /* Each bit in K1 represents a CHAR in YMM1.  */
> > -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > -       kmovd   %k0, %ecx
> > +       .p2align 4,, 10
> > +L(first_vec_x2):
> > +       VPCMP   $0, %YMMMATCH, %YMM3, %k1
> >         kmovd   %k1, %eax
> > -       orl     %eax, %ecx
> > -       jnz     L(char_nor_null)
> > +       blsmskl %ecx, %ecx
> > +       /* Check YMM3 for last match first. If no match try YMM2/YMM1.
> > +        */
> > +       andl    %ecx, %eax
> > +       jz      L(first_vec_x0_x1_test)
> > +       bsrl    %eax, %eax
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> >
> > -       VMOVA   (%rdi), %YMM1
> > -       addq    $VEC_SIZE, %rdi
> >
> > -       /* Each bit in K0 represents a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> > -       /* Each bit in K1 represents a CHAR in YMM1.  */
> > -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > +       .p2align 4
> > +L(aligned_more):
> > +       /* Need to keep original pointer in case YMM1 has last match.  */
> > +       movq    %rdi, %rsi
> > +       andq    $-VEC_SIZE, %rdi
> > +       VMOVU   VEC_SIZE(%rdi), %YMM2
> > +       VPTESTN %YMM2, %YMM2, %k0
> >         kmovd   %k0, %ecx
> > -       kmovd   %k1, %eax
> > -       orl     %eax, %ecx
> > -       jnz     L(char_nor_null)
> > +       testl   %ecx, %ecx
> > +       jnz     L(first_vec_x1)
> >
> > -       VMOVA   (%rdi), %YMM1
> > -       addq    $VEC_SIZE, %rdi
> > +       VMOVU   (VEC_SIZE * 2)(%rdi), %YMM3
> > +       VPTESTN %YMM3, %YMM3, %k0
> > +       kmovd   %k0, %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(first_vec_x2)
> >
> > -       /* Each bit in K0 represents a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> > -       /* Each bit in K1 represents a CHAR in YMM1.  */
> > -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > +       VMOVU   (VEC_SIZE * 3)(%rdi), %YMM4
> > +       VPTESTN %YMM4, %YMM4, %k0
> >         kmovd   %k0, %ecx
> > -       kmovd   %k1, %eax
> > -       orl     %eax, %ecx
> > -       jz      L(aligned_loop)
> > +       movq    %rdi, %r8
> > +       testl   %ecx, %ecx
> > +       jnz     L(first_vec_x3)
> >
> > +       andq    $-(VEC_SIZE * 2), %rdi
> >         .p2align 4
> > -L(char_nor_null):
> > -       /* Find a CHAR or a null byte in a loop.  */
> > +L(first_aligned_loop):
> > +       /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
> > +          they don't store a match.  */
> > +       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM5
> > +       VMOVA   (VEC_SIZE * 5)(%rdi), %YMM6
> > +
> > +       VPCMP   $0, %YMM5, %YMMMATCH, %k2
> > +       vpxord  %YMM6, %YMMMATCH, %YMM7
> > +
> > +       VPMIN   %YMM5, %YMM6, %YMM8
> > +       VPMIN   %YMM8, %YMM7, %YMM7
> > +
> > +       VPTESTN %YMM7, %YMM7, %k1
> > +       subq    $(VEC_SIZE * -2), %rdi
> > +       kortestd %k1, %k2
> > +       jz      L(first_aligned_loop)
> > +
> > +       VPCMP   $0, %YMM6, %YMMMATCH, %k3
> > +       VPTESTN %YMM8, %YMM8, %k1
> > +       ktestd  %k1, %k1
> > +       jz      L(second_aligned_loop_prep)
> > +
> > +       kortestd %k2, %k3
> > +       jnz     L(return_first_aligned_loop)
> > +
> > +       .p2align 4,, 6
> > +L(first_vec_x1_or_x2_or_x3):
> > +       VPCMP   $0, %YMM4, %YMMMATCH, %k4
> > +       kmovd   %k4, %eax
> >         testl   %eax, %eax
> > -       jnz     L(match)
> > -L(return_value):
> > -       testl   %edx, %edx
> > -       jz      L(return_null)
> > -       movl    %edx, %eax
> > -       movq    %rsi, %rdi
> > +       jz      L(first_vec_x1_or_x2)
> >         bsrl    %eax, %eax
> > -# ifdef USE_AS_WCSRCHR
> > -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -       leaq    -VEC_SIZE(%rdi, %rax, 4), %rax
> > -# else
> > -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> > -# endif
> > +       leaq    (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
> >         ret
> >
> > -       .p2align 4
> > -L(match):
> > -       /* Find a CHAR.  Check if there is a null byte.  */
> > -       kmovd   %k0, %ecx
> > -       testl   %ecx, %ecx
> > -       jnz     L(find_nul)
> > +       .p2align 4,, 8
> > +L(return_first_aligned_loop):
> > +       VPTESTN %YMM5, %YMM5, %k0
> > +       kunpck  %k0, %k1, %k0
> > +       kmov_2x %k0, %maskz_2x
> > +
> > +       blsmsk  %maskz_2x, %maskz_2x
> > +       kunpck  %k2, %k3, %k3
> > +       kmov_2x %k3, %maskm_2x
> > +       and     %maskz_2x, %maskm_2x
> > +       jz      L(first_vec_x1_or_x2_or_x3)
> >
> > -       /* Remember the match and keep searching.  */
> > -       movl    %eax, %edx
> > +       bsr     %maskm_2x, %maskm_2x
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> > +
> > +       .p2align 4
> > +       /* We can throw away the work done for the first 4x checks here
> > +          as we have a later match. This is the 'fast' path, so to speak.
> > +        */
> > +L(second_aligned_loop_prep):
> > +L(second_aligned_loop_set_furthest_match):
> >         movq    %rdi, %rsi
> > -       jmp     L(aligned_loop)
> > +       kunpck  %k2, %k3, %k4
> >
> >         .p2align 4
> > -L(find_nul):
> > -       /* Mask out any matching bits after the null byte.  */
> > -       movl    %ecx, %r8d
> > -       subl    $1, %r8d
> > -       xorl    %ecx, %r8d
> > -       andl    %r8d, %eax
> > -       testl   %eax, %eax
> > -       /* If there is no CHAR here, return the remembered one.  */
> > -       jz      L(return_value)
> > -       bsrl    %eax, %eax
> > +L(second_aligned_loop):
> > +       VMOVU   (VEC_SIZE * 4)(%rdi), %YMM1
> > +       VMOVU   (VEC_SIZE * 5)(%rdi), %YMM2
> > +
> > +       VPCMP   $0, %YMM1, %YMMMATCH, %k2
> > +       vpxord  %YMM2, %YMMMATCH, %YMM3
> > +
> > +       VPMIN   %YMM1, %YMM2, %YMM4
> > +       VPMIN   %YMM3, %YMM4, %YMM3
> > +
> > +       VPTESTN %YMM3, %YMM3, %k1
> > +       subq    $(VEC_SIZE * -2), %rdi
> > +       kortestd %k1, %k2
> > +       jz      L(second_aligned_loop)
> > +
> > +       VPCMP   $0, %YMM2, %YMMMATCH, %k3
> > +       VPTESTN %YMM4, %YMM4, %k1
> > +       ktestd  %k1, %k1
> > +       jz      L(second_aligned_loop_set_furthest_match)
> > +
> > +       kortestd %k2, %k3
> > +       /* branch here because there is a significant advantage in terms
> > +          of output dependency chains in using edx.  */
> > +       jnz     L(return_new_match)
> > +L(return_old_match):
> > +       kmovq   %k4, %rax
> > +       bsrq    %rax, %rax
> > +       leaq    (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
> > +       ret
> > +
> > +L(return_new_match):
> > +       VPTESTN %YMM1, %YMM1, %k0
> > +       kunpck  %k0, %k1, %k0
> > +       kmov_2x %k0, %maskz_2x
> > +
> > +       blsmsk  %maskz_2x, %maskz_2x
> > +       kunpck  %k2, %k3, %k3
> > +       kmov_2x %k3, %maskm_2x
> > +       and     %maskz_2x, %maskm_2x
> > +       jz      L(return_old_match)
> > +
> > +       bsr     %maskm_2x, %maskm_2x
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> > +
> > +L(cross_page_boundary):
> > +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> > +          rax` sets the pointer with all page offset bits cleared, so an
> > +          offset of (PAGE_SIZE - VEC_SIZE) will get the last aligned VEC
> > +          before the page cross (guaranteed to be safe to read). Doing this
> > +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> > +          a bit of code size.  */
> > +       xorq    %rdi, %rax
> > +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
> > +       VPTESTN %YMM1, %YMM1, %k0
> > +       kmovd   %k0, %ecx
> > +
> > +       /* Shift out zero CHAR matches that are before the beginning of
> > +          src (rdi).  */
> >  # ifdef USE_AS_WCSRCHR
> > -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -       leaq    -VEC_SIZE(%rdi, %rax, 4), %rax
> > -# else
> > -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> > +       movl    %edi, %esi
> > +       andl    $(VEC_SIZE - 1), %esi
> > +       shrl    $2, %esi
> >  # endif
> > -       ret
> > +       shrxl   %SHIFT_REG, %ecx, %ecx
> >
> > -       .p2align 4
> > -L(char_and_nul):
> > -       /* Find both a CHAR and a null byte.  */
> > -       addq    %rcx, %rdi
> > -       movl    %edx, %ecx
> > -L(char_and_nul_in_first_vec):
> > -       /* Mask out any matching bits after the null byte.  */
> > -       movl    %ecx, %r8d
> > -       subl    $1, %r8d
> > -       xorl    %ecx, %r8d
> > -       andl    %r8d, %eax
> > -       testl   %eax, %eax
> > -       /* Return null pointer if the null byte comes first.  */
> > -       jz      L(return_null)
> > +       testl   %ecx, %ecx
> > +       jz      L(page_cross_continue)
> > +
> > +       /* Found zero CHAR so need to test for search CHAR.  */
> > +       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> > +       kmovd   %k1, %eax
> > +       /* Shift out search CHAR matches that are before the beginning of
> > +          src (rdi).  */
> > +       shrxl   %SHIFT_REG, %eax, %eax
> > +
> > +       /* Check if any search CHAR match in range.  */
> > +       blsmskl %ecx, %ecx
> > +       andl    %ecx, %eax
> > +       jz      L(ret3)
> >         bsrl    %eax, %eax
> >  # ifdef USE_AS_WCSRCHR
> > -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -       leaq    -VEC_SIZE(%rdi, %rax, 4), %rax
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> >  # else
> > -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> > +       addq    %rdi, %rax
> >  # endif
> > +L(ret3):
> >         ret
> >
> > -       .p2align 4
> > -L(return_null):
> > -       xorl    %eax, %eax
> > -       ret
> > -
> > -END (STRRCHR)
> > +END(STRRCHR)
> >  #endif
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 36+ messages in thread
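
A side note on the mask arithmetic that both the AVX2 and EVEX versions lean
on: blsmsk builds a mask of every bit up to and including the first zero-CHAR
bit, and bsr then selects the last in-bounds match.  A small C sketch of the
equivalent bit manipulation (hypothetical helper, not taken from the patch):

#include <stdint.h>

/* ZERO_MASK has one bit per position holding the null terminator;
   MATCH_MASK has one bit per position matching the search CHAR.
   Returns the offset of the last match at or before the first zero,
   or -1 if there is none.  Assumes ZERO_MASK != 0.  */
static int
last_match_before_zero (uint32_t zero_mask, uint32_t match_mask)
{
  /* blsmsk equivalent: all bits up to and including the lowest set bit.  */
  uint32_t in_bounds = match_mask & (zero_mask ^ (zero_mask - 1));
  if (in_bounds == 0)
    return -1;
  /* bsr equivalent: index of the highest set bit.  */
  return 31 - __builtin_clz (in_bounds);
}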

* Re: [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2
  2022-05-12 20:14       ` Sunil Pandey
@ 2022-07-20 15:33         ` Noah Goldstein
  0 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-07-20 15:33 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: H.J. Lu, Libc-stable Mailing List, GNU C Library

On Fri, May 13, 2022 at 4:15 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Fri, Apr 22, 2022 at 12:08 PM H.J. Lu via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > The new code unrolls the main loop slightly without adding too much
> > > overhead and minimizes the comparisons for the search CHAR.
> > >
> > > Geometric Mean of all benchmarks New / Old: 0.832
> > > See email for all results.
> > >
> > > Full xcheck passes on x86_64 with and without multiarch enabled.
> > > ---
> > >  sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
> > >  1 file changed, 269 insertions(+), 157 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
> > > index 1df2adfad0..bd26ba80d5 100644
> > > --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
> > > +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
> > > @@ -27,9 +27,13 @@
> > >  # ifdef USE_AS_WCSRCHR
> > >  #  define VPBROADCAST  vpbroadcastd
> > >  #  define VPCMPEQ      vpcmpeqd
> > > +#  define VPMIN        vpminud
> > > +#  define CHAR_SIZE    4
> > >  # else
> > >  #  define VPBROADCAST  vpbroadcastb
> > >  #  define VPCMPEQ      vpcmpeqb
> > > +#  define VPMIN        vpminub
> > > +#  define CHAR_SIZE    1
> > >  # endif
> > >
> > >  # ifndef VZEROUPPER
> > > @@ -41,196 +45,304 @@
> > >  # endif
> > >
> > >  # define VEC_SIZE      32
> > > +# define PAGE_SIZE     4096
> > >
> > > -       .section SECTION(.text),"ax",@progbits
> > > -ENTRY (STRRCHR)
> > > -       movd    %esi, %xmm4
> > > -       movl    %edi, %ecx
> > > +       .section SECTION(.text), "ax", @progbits
> > > +ENTRY(STRRCHR)
> > > +       movd    %esi, %xmm7
> > > +       movl    %edi, %eax
> > >         /* Broadcast CHAR to YMM4.  */
> > > -       VPBROADCAST %xmm4, %ymm4
> > > +       VPBROADCAST %xmm7, %ymm7
> > >         vpxor   %xmm0, %xmm0, %xmm0
> > >
> > > -       /* Check if we may cross page boundary with one vector load.  */
> > > -       andl    $(2 * VEC_SIZE - 1), %ecx
> > > -       cmpl    $VEC_SIZE, %ecx
> > > -       ja      L(cros_page_boundary)
> > > +       /* Shift here instead of `andl` to save code size (saves a fetch
> > > +          block).  */
> > > +       sall    $20, %eax
> > > +       cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> > > +       ja      L(cross_page)
> > >
> > > +L(page_cross_continue):
> > >         vmovdqu (%rdi), %ymm1
> > > -       VPCMPEQ %ymm1, %ymm0, %ymm2
> > > -       VPCMPEQ %ymm1, %ymm4, %ymm3
> > > -       vpmovmskb %ymm2, %ecx
> > > -       vpmovmskb %ymm3, %eax
> > > -       addq    $VEC_SIZE, %rdi
> > > +       /* Check end of string match.  */
> > > +       VPCMPEQ %ymm1, %ymm0, %ymm6
> > > +       vpmovmskb %ymm6, %ecx
> > > +       testl   %ecx, %ecx
> > > +       jz      L(aligned_more)
> > > +
> > > +       /* Only check match with search CHAR if needed.  */
> > > +       VPCMPEQ %ymm1, %ymm7, %ymm1
> > > +       vpmovmskb %ymm1, %eax
> > > +       /* Check if match before first zero.  */
> > > +       blsmskl %ecx, %ecx
> > > +       andl    %ecx, %eax
> > > +       jz      L(ret0)
> > > +       bsrl    %eax, %eax
> > > +       addq    %rdi, %rax
> > > +       /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > > +          search CHAR is zero we are correct. Either way `andq
> > > +          -CHAR_SIZE, %rax` gets the correct result.  */
> > > +# ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +# endif
> > > +L(ret0):
> > > +L(return_vzeroupper):
> > > +       ZERO_UPPER_VEC_REGISTERS_RETURN
> > > +
> > > +       /* Returns for first vec x1/x2 have hard coded backward search
> > > +          path for earlier matches.  */
> > > +       .p2align 4,, 10
> > > +L(first_vec_x1):
> > > +       VPCMPEQ %ymm2, %ymm7, %ymm6
> > > +       vpmovmskb %ymm6, %eax
> > > +       blsmskl %ecx, %ecx
> > > +       andl    %ecx, %eax
> > > +       jnz     L(first_vec_x1_return)
> > > +
> > > +       .p2align 4,, 4
> > > +L(first_vec_x0_test):
> > > +       VPCMPEQ %ymm1, %ymm7, %ymm6
> > > +       vpmovmskb %ymm6, %eax
> > > +       testl   %eax, %eax
> > > +       jz      L(ret1)
> > > +       bsrl    %eax, %eax
> > > +       addq    %r8, %rax
> > > +# ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +# endif
> > > +L(ret1):
> > > +       VZEROUPPER_RETURN
> > >
> > > +       .p2align 4,, 10
> > > +L(first_vec_x0_x1_test):
> > > +       VPCMPEQ %ymm2, %ymm7, %ymm6
> > > +       vpmovmskb %ymm6, %eax
> > > +       /* Check ymm2 for search CHAR match. If no match then check ymm1
> > > +          before returning.  */
> > >         testl   %eax, %eax
> > > -       jnz     L(first_vec)
> > > +       jz      L(first_vec_x0_test)
> > > +       .p2align 4,, 4
> > > +L(first_vec_x1_return):
> > > +       bsrl    %eax, %eax
> > > +       leaq    1(%rdi, %rax), %rax
> > > +# ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +# endif
> > > +       VZEROUPPER_RETURN
> > >
> > > -       testl   %ecx, %ecx
> > > -       jnz     L(return_null)
> > >
> > > -       andq    $-VEC_SIZE, %rdi
> > > -       xorl    %edx, %edx
> > > -       jmp     L(aligned_loop)
> > > +       .p2align 4,, 10
> > > +L(first_vec_x2):
> > > +       VPCMPEQ %ymm3, %ymm7, %ymm6
> > > +       vpmovmskb %ymm6, %eax
> > > +       blsmskl %ecx, %ecx
> > > +       /* If no in-range search CHAR match in ymm3 then need to check
> > > +          ymm1/ymm2 for an earlier match (we delay checking search
> > > +          CHAR matches until needed).  */
> > > +       andl    %ecx, %eax
> > > +       jz      L(first_vec_x0_x1_test)
> > > +       bsrl    %eax, %eax
> > > +       leaq    (VEC_SIZE + 1)(%rdi, %rax), %rax
> > > +# ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +# endif
> > > +       VZEROUPPER_RETURN
> > > +
> > >
> > >         .p2align 4
> > > -L(first_vec):
> > > -       /* Check if there is a nul CHAR.  */
> > > +L(aligned_more):
> > > +       /* Save original pointer if match was in VEC 0.  */
> > > +       movq    %rdi, %r8
> > > +
> > > +       /* Align src.  */
> > > +       orq     $(VEC_SIZE - 1), %rdi
> > > +       vmovdqu 1(%rdi), %ymm2
> > > +       VPCMPEQ %ymm2, %ymm0, %ymm6
> > > +       vpmovmskb %ymm6, %ecx
> > >         testl   %ecx, %ecx
> > > -       jnz     L(char_and_nul_in_first_vec)
> > > +       jnz     L(first_vec_x1)
> > >
> > > -       /* Remember the match and keep searching.  */
> > > -       movl    %eax, %edx
> > > -       movq    %rdi, %rsi
> > > -       andq    $-VEC_SIZE, %rdi
> > > -       jmp     L(aligned_loop)
> > > +       vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
> > > +       VPCMPEQ %ymm3, %ymm0, %ymm6
> > > +       vpmovmskb %ymm6, %ecx
> > > +       testl   %ecx, %ecx
> > > +       jnz     L(first_vec_x2)
> > >
> > > +       /* Save pointer again before realigning.  */
> > > +       movq    %rdi, %rsi
> > > +       addq    $(VEC_SIZE + 1), %rdi
> > > +       andq    $-(VEC_SIZE * 2), %rdi
> > >         .p2align 4
> > > -L(cros_page_boundary):
> > > -       andl    $(VEC_SIZE - 1), %ecx
> > > -       andq    $-VEC_SIZE, %rdi
> > > -       vmovdqa (%rdi), %ymm1
> > > -       VPCMPEQ %ymm1, %ymm0, %ymm2
> > > -       VPCMPEQ %ymm1, %ymm4, %ymm3
> > > -       vpmovmskb %ymm2, %edx
> > > -       vpmovmskb %ymm3, %eax
> > > -       shrl    %cl, %edx
> > > -       shrl    %cl, %eax
> > > -       addq    $VEC_SIZE, %rdi
> > > -
> > > -       /* Check if there is a CHAR.  */
> > > +L(first_aligned_loop):
> > > +       /* Do 2x VEC at a time. Any more and the cost of finding the
> > > +          match outweighs the loop benefit.  */
> > > +       vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
> > > +       vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
> > > +
> > > +       VPCMPEQ %ymm4, %ymm7, %ymm6
> > > +       VPMIN   %ymm4, %ymm5, %ymm8
> > > +       VPCMPEQ %ymm5, %ymm7, %ymm10
> > > +       vpor    %ymm6, %ymm10, %ymm5
> > > +       VPCMPEQ %ymm8, %ymm0, %ymm8
> > > +       vpor    %ymm5, %ymm8, %ymm9
> > > +
> > > +       vpmovmskb %ymm9, %eax
> > > +       addq    $(VEC_SIZE * 2), %rdi
> > > +       /* No zero or search CHAR.  */
> > >         testl   %eax, %eax
> > > -       jnz     L(found_char)
> > > -
> > > -       testl   %edx, %edx
> > > -       jnz     L(return_null)
> > > +       jz      L(first_aligned_loop)
> > >
> > > -       jmp     L(aligned_loop)
> > > -
> > > -       .p2align 4
> > > -L(found_char):
> > > -       testl   %edx, %edx
> > > -       jnz     L(char_and_nul)
> > > +       /* If no zero CHAR then go to second loop (this allows us to
> > > +          throw away all prior work).  */
> > > +       vpmovmskb %ymm8, %ecx
> > > +       testl   %ecx, %ecx
> > > +       jz      L(second_aligned_loop_prep)
> > >
> > > -       /* Remember the match and keep searching.  */
> > > -       movl    %eax, %edx
> > > -       leaq    (%rdi, %rcx), %rsi
> > > +       /* Search char could be zero so we need to get the true match.
> > > +        */
> > > +       vpmovmskb %ymm5, %eax
> > > +       testl   %eax, %eax
> > > +       jnz     L(first_aligned_loop_return)
> > >
> > > -       .p2align 4
> > > -L(aligned_loop):
> > > -       vmovdqa (%rdi), %ymm1
> > > -       VPCMPEQ %ymm1, %ymm0, %ymm2
> > > -       addq    $VEC_SIZE, %rdi
> > > -       VPCMPEQ %ymm1, %ymm4, %ymm3
> > > -       vpmovmskb %ymm2, %ecx
> > > -       vpmovmskb %ymm3, %eax
> > > -       orl     %eax, %ecx
> > > -       jnz     L(char_nor_null)
> > > -
> > > -       vmovdqa (%rdi), %ymm1
> > > -       VPCMPEQ %ymm1, %ymm0, %ymm2
> > > -       add     $VEC_SIZE, %rdi
> > > -       VPCMPEQ %ymm1, %ymm4, %ymm3
> > > -       vpmovmskb %ymm2, %ecx
> > > +       .p2align 4,, 4
> > > +L(first_vec_x1_or_x2):
> > > +       VPCMPEQ %ymm3, %ymm7, %ymm3
> > > +       VPCMPEQ %ymm2, %ymm7, %ymm2
> > >         vpmovmskb %ymm3, %eax
> > > -       orl     %eax, %ecx
> > > -       jnz     L(char_nor_null)
> > > -
> > > -       vmovdqa (%rdi), %ymm1
> > > -       VPCMPEQ %ymm1, %ymm0, %ymm2
> > > -       addq    $VEC_SIZE, %rdi
> > > -       VPCMPEQ %ymm1, %ymm4, %ymm3
> > > -       vpmovmskb %ymm2, %ecx
> > > -       vpmovmskb %ymm3, %eax
> > > -       orl     %eax, %ecx
> > > -       jnz     L(char_nor_null)
> > > -
> > > -       vmovdqa (%rdi), %ymm1
> > > -       VPCMPEQ %ymm1, %ymm0, %ymm2
> > > -       addq    $VEC_SIZE, %rdi
> > > -       VPCMPEQ %ymm1, %ymm4, %ymm3
> > > -       vpmovmskb %ymm2, %ecx
> > > -       vpmovmskb %ymm3, %eax
> > > -       orl     %eax, %ecx
> > > -       jz      L(aligned_loop)
> > > -
> > > -       .p2align 4
> > > -L(char_nor_null):
> > > -       /* Find a CHAR or a nul CHAR in a loop.  */
> > > -       testl   %eax, %eax
> > > -       jnz     L(match)
> > > -L(return_value):
> > > -       testl   %edx, %edx
> > > -       jz      L(return_null)
> > > -       movl    %edx, %eax
> > > -       movq    %rsi, %rdi
> > > +       vpmovmskb %ymm2, %edx
> > > +       /* Use add for macro-fusion.  */
> > > +       addq    %rax, %rdx
> > > +       jz      L(first_vec_x0_test)
> > > +       /* NB: We could move this shift to before the branch and save a
> > > +          bit of code size / performance on the fall through. The
> > > +          branch leads to the null case which generally seems hotter
> > > +          than char in first 3x VEC.  */
> > > +       salq    $32, %rax
> > > +       addq    %rdx, %rax
> > > +       bsrq    %rax, %rax
> > > +       leaq    1(%rsi, %rax), %rax
> > > +# ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +# endif
> > > +       VZEROUPPER_RETURN
> > >
> > > +       .p2align 4,, 8
> > > +L(first_aligned_loop_return):
> > > +       VPCMPEQ %ymm4, %ymm0, %ymm4
> > > +       vpmovmskb %ymm4, %edx
> > > +       salq    $32, %rcx
> > > +       orq     %rdx, %rcx
> > > +
> > > +       vpmovmskb %ymm10, %eax
> > > +       vpmovmskb %ymm6, %edx
> > > +       salq    $32, %rax
> > > +       orq     %rdx, %rax
> > > +       blsmskq %rcx, %rcx
> > > +       andq    %rcx, %rax
> > > +       jz      L(first_vec_x1_or_x2)
> > > +
> > > +       bsrq    %rax, %rax
> > > +       leaq    -(VEC_SIZE * 2)(%rdi, %rax), %rax
> > >  # ifdef USE_AS_WCSRCHR
> > > -       /* Keep the first bit for each matching CHAR for bsr.  */
> > > -       andl    $0x11111111, %eax
> > > +       andq    $-CHAR_SIZE, %rax
> > >  # endif
> > > -       bsrl    %eax, %eax
> > > -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> > > -L(return_vzeroupper):
> > > -       ZERO_UPPER_VEC_REGISTERS_RETURN
> > > +       VZEROUPPER_RETURN
> > >
> > > +       /* Search char cannot be zero.  */
> > >         .p2align 4
> > > -L(match):
> > > -       /* Find a CHAR.  Check if there is a nul CHAR.  */
> > > -       vpmovmskb %ymm2, %ecx
> > > -       testl   %ecx, %ecx
> > > -       jnz     L(find_nul)
> > > -
> > > -       /* Remember the match and keep searching.  */
> > > -       movl    %eax, %edx
> > > +L(second_aligned_loop_set_furthest_match):
> > > +       /* Save VEC and pointer from most recent match.  */
> > > +L(second_aligned_loop_prep):
> > >         movq    %rdi, %rsi
> > > -       jmp     L(aligned_loop)
> > > +       vmovdqu %ymm6, %ymm2
> > > +       vmovdqu %ymm10, %ymm3
> > >
> > >         .p2align 4
> > > -L(find_nul):
> > > -# ifdef USE_AS_WCSRCHR
> > > -       /* Keep the first bit for each matching CHAR for bsr.  */
> > > -       andl    $0x11111111, %ecx
> > > -       andl    $0x11111111, %eax
> > > -# endif
> > > -       /* Mask out any matching bits after the nul CHAR.  */
> > > -       movl    %ecx, %r8d
> > > -       subl    $1, %r8d
> > > -       xorl    %ecx, %r8d
> > > -       andl    %r8d, %eax
> > > +L(second_aligned_loop):
> > > +       /* Search 2x at a time.  */
> > > +       vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
> > > +       vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
> > > +
> > > +       VPCMPEQ %ymm4, %ymm7, %ymm6
> > > +       VPMIN   %ymm4, %ymm5, %ymm1
> > > +       VPCMPEQ %ymm5, %ymm7, %ymm10
> > > +       vpor    %ymm6, %ymm10, %ymm5
> > > +       VPCMPEQ %ymm1, %ymm0, %ymm1
> > > +       vpor    %ymm5, %ymm1, %ymm9
> > > +
> > > +       vpmovmskb %ymm9, %eax
> > > +       addq    $(VEC_SIZE * 2), %rdi
> > >         testl   %eax, %eax
> > > -       /* If there is no CHAR here, return the remembered one.  */
> > > -       jz      L(return_value)
> > > -       bsrl    %eax, %eax
> > > -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> > > -       VZEROUPPER_RETURN
> > > -
> > > -       .p2align 4
> > > -L(char_and_nul):
> > > -       /* Find both a CHAR and a nul CHAR.  */
> > > -       addq    %rcx, %rdi
> > > -       movl    %edx, %ecx
> > > -L(char_and_nul_in_first_vec):
> > > -# ifdef USE_AS_WCSRCHR
> > > -       /* Keep the first bit for each matching CHAR for bsr.  */
> > > -       andl    $0x11111111, %ecx
> > > -       andl    $0x11111111, %eax
> > > -# endif
> > > -       /* Mask out any matching bits after the nul CHAR.  */
> > > -       movl    %ecx, %r8d
> > > -       subl    $1, %r8d
> > > -       xorl    %ecx, %r8d
> > > -       andl    %r8d, %eax
> > > +       jz      L(second_aligned_loop)
> > > +       vpmovmskb %ymm1, %ecx
> > > +       testl   %ecx, %ecx
> > > +       jz      L(second_aligned_loop_set_furthest_match)
> > > +       vpmovmskb %ymm5, %eax
> > >         testl   %eax, %eax
> > > -       /* Return null pointer if the nul CHAR comes first.  */
> > > -       jz      L(return_null)
> > > -       bsrl    %eax, %eax
> > > -       leaq    -VEC_SIZE(%rdi, %rax), %rax
> > > +       jnz     L(return_new_match)
> > > +
> > > +       /* This is the hot path. We know CHAR is in bounds and that
> > > +          ymm3/ymm2 have the latest match.  */
> > > +       .p2align 4,, 4
> > > +L(return_old_match):
> > > +       vpmovmskb %ymm3, %eax
> > > +       vpmovmskb %ymm2, %edx
> > > +       salq    $32, %rax
> > > +       orq     %rdx, %rax
> > > +       bsrq    %rax, %rax
> > > +       /* Search char cannot be zero so safe to just use lea for
> > > +          wcsrchr.  */
> > > +       leaq    (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
> > >         VZEROUPPER_RETURN
> > >
> > > -       .p2align 4
> > > -L(return_null):
> > > -       xorl    %eax, %eax
> > > +       /* Last iteration also potentially has a match.  */
> > > +       .p2align 4,, 8
> > > +L(return_new_match):
> > > +       VPCMPEQ %ymm4, %ymm0, %ymm4
> > > +       vpmovmskb %ymm4, %edx
> > > +       salq    $32, %rcx
> > > +       orq     %rdx, %rcx
> > > +
> > > +       vpmovmskb %ymm10, %eax
> > > +       vpmovmskb %ymm6, %edx
> > > +       salq    $32, %rax
> > > +       orq     %rdx, %rax
> > > +       blsmskq %rcx, %rcx
> > > +       andq    %rcx, %rax
> > > +       jz      L(return_old_match)
> > > +       bsrq    %rax, %rax
> > > +       /* Search char cannot be zero so safe to just use lea for
> > > +          wcsrchr.  */
> > > +       leaq    (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
> > >         VZEROUPPER_RETURN
> > >
> > > -END (STRRCHR)
> > > +       .p2align 4,, 4
> > > +L(cross_page):
> > > +       movq    %rdi, %rsi
> > > +       andq    $-VEC_SIZE, %rsi
> > > +       vmovdqu (%rsi), %ymm1
> > > +       VPCMPEQ %ymm1, %ymm0, %ymm6
> > > +       vpmovmskb %ymm6, %ecx
> > > +       /* Shift out zero CHAR matches that are before the beginning of
> > > +          src (rdi).  */
> > > +       shrxl   %edi, %ecx, %ecx
> > > +       testl   %ecx, %ecx
> > > +       jz      L(page_cross_continue)
> > > +       VPCMPEQ %ymm1, %ymm7, %ymm1
> > > +       vpmovmskb %ymm1, %eax
> > > +
> > > +       /* Shift out search CHAR matches that are before the beginning of
> > > +          src (rdi).  */
> > > +       shrxl   %edi, %eax, %eax
> > > +       blsmskl %ecx, %ecx
> > > +       /* Check if any search CHAR match in range.  */
> > > +       andl    %ecx, %eax
> > > +       jz      L(ret2)
> > > +       bsrl    %eax, %eax
> > > +       addq    %rdi, %rax
> > > +# ifdef USE_AS_WCSRCHR
> > > +       andq    $-CHAR_SIZE, %rax
> > > +# endif
> > > +L(ret2):
> > > +       VZEROUPPER_RETURN
> > > +END(STRRCHR)
> > >  #endif
> > > --
> > > 2.25.1
> > >
> >
> > LGTM.
> >
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> >
> > Thanks.
> >
> > --
> > H.J.
>
> I would like to backport this patch to release branches.
> Any comments or objections?

Sorry, I should have mentioned this earlier, but we should probably
get the strrchr-avx2.S changes from:
https://sourceware.org/git/?p=glibc.git;a=commit;h=3079f652d7cc34456aefb412677c01e758922527
>
> --Sunil

^ permalink raw reply	[flat|nested] 36+ messages in thread
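
On the L(cross_page) entry in this patch: when an unaligned 32-byte load
could touch the next page, the code instead loads the aligned vector that
contains the start of the string (which cannot cross the page) and shifts
the resulting byte mask right so bits for bytes before the string are
discarded.  A rough sketch with AVX2 intrinsics (hypothetical helper names,
assumes GCC/Clang with -mavx2; not code from the patch):

#include <immintrin.h>
#include <stdint.h>

#define VEC_SIZE  32
#define PAGE_SIZE 4096

/* True if an unaligned VEC_SIZE-byte load at S could cross a page.  */
static inline int
load_may_cross_page (const char *s)
{
  return ((uintptr_t) s & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
}

/* Safe variant: load the aligned vector containing S and shift out the
   mask bits that belong to bytes before S.  */
static inline uint32_t
zero_mask_first_vec (const char *s)
{
  const __m256i *aligned
    = (const __m256i *) ((uintptr_t) s & ~(uintptr_t) (VEC_SIZE - 1));
  __m256i v = _mm256_load_si256 (aligned);
  uint32_t m = (uint32_t) _mm256_movemask_epi8
    (_mm256_cmpeq_epi8 (v, _mm256_setzero_si256 ()));
  return m >> ((uintptr_t) s & (VEC_SIZE - 1));
}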

end of thread, other threads:[~2022-07-20 15:33 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-21  3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
2022-04-21  3:14 ` [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
2022-04-21 20:26   ` H.J. Lu
2022-04-21 20:57     ` Noah Goldstein
2022-04-21 21:48       ` H.J. Lu
2022-04-21 22:23         ` Noah Goldstein
2022-04-21  3:14 ` [PATCH v1 3/5] x86: Add wcsrchr optimized with SSE4_1 in wcsrchr-sse4_1.S Noah Goldstein
2022-04-21  3:14 ` [PATCH v1 4/5] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
2022-04-21  3:14 ` [PATCH v1 5/5] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
2022-04-21 20:12 ` [PATCH v1 1/5] benchtests: Improve bench-strrchr H.J. Lu
2022-04-21 22:07   ` Noah Goldstein
2022-04-21 23:49     ` H.J. Lu
2022-04-22  1:11       ` Noah Goldstein
2022-04-21 22:22 ` [PATCH v2 1/4] " Noah Goldstein
2022-04-21 22:22   ` [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
2022-04-21 23:46     ` H.J. Lu
2022-04-22  1:54       ` Noah Goldstein
2022-04-21 22:22   ` [PATCH v2 3/4] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
2022-04-21 22:22   ` [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
2022-04-21 23:59     ` H.J. Lu
2022-04-22  1:53       ` Noah Goldstein
2022-04-22  1:52 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr Noah Goldstein
2022-04-22  1:52   ` [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
2022-04-22 19:06     ` H.J. Lu
2022-05-12 20:13       ` Sunil Pandey
2022-04-22  1:52   ` [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
2022-04-22 19:03     ` H.J. Lu
2022-05-12 20:14       ` Sunil Pandey
2022-07-20 15:33         ` Noah Goldstein
2022-04-22  1:52   ` [PATCH v3 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
2022-04-22 19:04     ` H.J. Lu
2022-05-12 20:16       ` Sunil Pandey
2022-04-22 18:29   ` [PATCH v3 1/4] benchtests: Improve bench-strrchr H.J. Lu
2022-04-22 19:12     ` Noah Goldstein
2022-04-22 19:11 ` [PATCH v4 " Noah Goldstein
2022-04-23  1:53   ` H.J. Lu
