public inbox for libc-alpha@sourceware.org
* [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c
@ 2022-03-23 21:57 Noah Goldstein
  2022-03-23 21:57 ` [PATCH v1 02/23] benchtests: Add random benchmark " Noah Goldstein
                   ` (22 more replies)
  0 siblings, 23 replies; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Just a QOL change to make parsing the output of the benchtests more
consistent.
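
For illustration, the converted benchmark emits JSON shaped roughly
like the following (field names taken from the diff below; the
timing_type string and the timing values here are made up, not
measured):

{
 "timing_type": "hp_timing",
 "functions": {
  "strchr": {
   "bench-variant": "",
   "ifuncs": ["simple_strchr", "strchr"],
   "results": [
    {"length": 2048, "pos": 32, "seek_char": 23, "max_char": 127,
     "alignment": 0, "timings": [12.3, 10.1]},
    ...]
  }
 }
}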
---
 benchtests/bench-strchr.c | 94 ++++++++++++++++++++++++++-------------
 1 file changed, 64 insertions(+), 30 deletions(-)

diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
index 821bc615b0..203900d4ad 100644
--- a/benchtests/bench-strchr.c
+++ b/benchtests/bench-strchr.c
@@ -32,6 +32,7 @@
 #endif /* WIDE */
 #include "bench-string.h"
 
+#include "json-lib.h"
 #define BIG_CHAR MAX_CHAR
 
 #ifndef WIDE
@@ -74,10 +75,19 @@ IMPL (simple_STRCHR, 0)
 IMPL (STRCHR, 1)
 
 static void
-do_one_test (impl_t *impl, const CHAR *s, int c, const CHAR *exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
+             const CHAR *exp_res)
 {
   size_t i, iters = INNER_LOOP_ITERS_LARGE;
   timing_t start, stop, cur;
+  const CHAR *res = CALL (impl, s, c);
+  if (res != exp_res)
+    {
+      error (0, 0, "Wrong result in function %s %p != %p", impl->name, res,
+             exp_res);
+      ret = 1;
+      return;
+    }
 
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
@@ -88,11 +98,12 @@ do_one_test (impl_t *impl, const CHAR *s, int c, const CHAR *exp_res)
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double)cur / (double)iters);
 }
 
 static void
-do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
+         int seek_char, int max_char)
/* For wcschr: align here means align not in bytes but in wchar_ts;
   in bytes it will equal align * sizeof (wchar_t).  Likewise len for
   wcschr isn't in bytes but is the number of wchar_t symbols.  */
@@ -124,87 +135,110 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
   else
     result = NULLRET (buf + align + len);
 
-  printf ("Length %4zd, alignment in bytes %2zd:",
-	  pos, align * sizeof (CHAR));
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_uint (json_ctx, "pos", pos);
+  json_attr_uint (json_ctx, "seek_char", seek_char);
+  json_attr_uint (json_ctx, "max_char", max_char);
+  json_attr_uint (json_ctx, "alignment", align);
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, buf + align, seek_char, result);
+    do_one_test (json_ctx, impl, buf + align, seek_char, result);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
+  json_ctx_t json_ctx;
   size_t i;
 
   test_init ();
 
-  printf ("%20s", "");
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
-      do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
+      do_test (&json_ctx, 0, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
+      do_test (&json_ctx, i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
-      do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+      do_test (&json_ctx, 0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+      do_test (&json_ctx, i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
-      do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
+      do_test (&json_ctx, i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
+      do_test (&json_ctx, i, 64, 256, SMALL_CHAR, BIG_CHAR);
     }
 
   for (i = 0; i < 8; ++i)
     {
-      do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
-      do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
+      do_test (&json_ctx, 16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
+      do_test (&json_ctx, 16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
     }
 
   for (i = 0; i < 32; ++i)
     {
-      do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
-      do_test (0, i, i + 1, SMALL_CHAR, BIG_CHAR);
+      do_test (&json_ctx, 0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
+      do_test (&json_ctx, 0, i, i + 1, SMALL_CHAR, BIG_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 16 << i, 2048, 0, MIDDLE_CHAR);
-      do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
+      do_test (&json_ctx, 0, 16 << i, 2048, 0, MIDDLE_CHAR);
+      do_test (&json_ctx, i, 16 << i, 2048, 0, MIDDLE_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
-      do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
+      do_test (&json_ctx, 0, 16 << i, 4096, 0, MIDDLE_CHAR);
+      do_test (&json_ctx, i, 16 << i, 4096, 0, MIDDLE_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (i, 64, 256, 0, MIDDLE_CHAR);
-      do_test (i, 64, 256, 0, BIG_CHAR);
+      do_test (&json_ctx, i, 64, 256, 0, MIDDLE_CHAR);
+      do_test (&json_ctx, i, 64, 256, 0, BIG_CHAR);
     }
 
   for (i = 0; i < 8; ++i)
     {
-      do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
-      do_test (16 * i, 256, 512, 0, BIG_CHAR);
+      do_test (&json_ctx, 16 * i, 256, 512, 0, MIDDLE_CHAR);
+      do_test (&json_ctx, 16 * i, 256, 512, 0, BIG_CHAR);
     }
 
   for (i = 0; i < 32; ++i)
     {
-      do_test (0, i, i + 1, 0, MIDDLE_CHAR);
-      do_test (0, i, i + 1, 0, BIG_CHAR);
+      do_test (&json_ctx, 0, i, i + 1, 0, MIDDLE_CHAR);
+      do_test (&json_ctx, 0, i, i + 1, 0, BIG_CHAR);
     }
 
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
 
-- 
2.25.1



* [PATCH v1 02/23] benchtests: Add random benchmark in bench-strchr.c
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 18:44   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch Noah Goldstein
                   ` (21 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Add a benchmark that randomizes whether the return should be NULL or
a pointer to CHAR. The rationale is that on many architectures there
is a choice between predicated execution (e.g. cmovcc on x86) and a
branch.

On x86 the results for cmovcc vs branch are something along the lines
of the following:

perc-zero, Br On Result, Time Br / Time cmov
     0.10,            1,               0.983
     0.10,            0,               1.246
     0.25,            1,               1.035
     0.25,            0,               1.490
     0.33,            1,               1.016
     0.33,            0,               1.579
     0.50,            1,               1.228
     0.50,            0,               1.739
     0.66,            1,               1.039
     0.66,            0,               1.764
     0.75,            1,               0.996
     0.75,            0,               1.642
     0.90,            1,               1.071
     0.90,            0,               1.409
     1.00,            1,               0.937
     1.00,            0,               0.999
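
For intuition, the two strategies roughly correspond to the following
C sketch (illustrative only, not the benchmark code; whether a
compiler actually emits cmovcc for the first form depends on the
compiler and flags):

#include <stddef.h>

/* Branchless selection of the return value; on x86 a compiler will
   often lower this to cmovcc.  */
char *
ret_cmov (char *match, int found)
{
  return found ? match : NULL;
}

/* The same selection via a conditional branch: cheap when the
   outcome is predictable, costly when it is effectively random.  */
char *
ret_branch (char *match, int found)
{
  if (!found)
    return NULL;
  return match;
}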
---
 benchtests/bench-strchr.c | 143 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)

diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
index 203900d4ad..54640bde7e 100644
--- a/benchtests/bench-strchr.c
+++ b/benchtests/bench-strchr.c
@@ -53,6 +53,11 @@
 # define SMALL_CHAR 851
 #endif /* WIDE */
 
+#ifdef USE_FOR_STRCHRNUL
+# define DO_RAND_TEST(...)
+#else
+# define DO_RAND_TEST(...) do_rand_test(__VA_ARGS__)
+#endif
 #ifdef USE_FOR_STRCHRNUL
 # define NULLRET(endptr) endptr
 #else
@@ -74,6 +79,133 @@ simple_STRCHR (const CHAR *s, int c)
 IMPL (simple_STRCHR, 0)
 IMPL (STRCHR, 1)
 
+#ifndef USE_FOR_STRCHRNUL
+/* Random benchmarks for strchr (if return is CHAR or NULL).  The
+   rationale for the benchmark is that returning NULL/CHAR can be done
+   with predicated execution (i.e. cmovcc on x86) or a branch. */
+
+
+/* Large enough that full history can't be stored in BHT. */
+#define NUM_SEARCH_CHARS 2048
+
+/* The expectation is that use cases of strchr check the return;
+   otherwise strchrnul would almost always be better. Since another
+   branch is coming we want to test the case where a potential branch
+   in strchr can be used to skip a later mispredict because of the
+   relationship between the two branches. */
+static void __attribute__ ((noinline, noclone))
+do_one_rand_plus_branch_test (json_ctx_t *json_ctx, impl_t *impl,
+                              const CHAR *s, const CHAR *c)
+{
+  size_t i, iters = INNER_LOOP_ITERS_LARGE;
+  int must_execute = 0;
+  timing_t start, stop, cur;
+  TIMING_NOW (start);
+  for (i = 0; i < iters; ++i)
+    {
+      if (CALL (impl, s, c[i % NUM_SEARCH_CHARS]))
+        {
+          /* We just need something that will force the compiler to
+             emit a branch instead of conditional execution. */
+          ++must_execute;
+          asm volatile("" : : :);
+        }
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  json_element_double (json_ctx, (double)cur / (double)iters);
+}
+
+static void __attribute__ ((noinline, noclone))
+do_one_rand_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s,
+                  const CHAR *c)
+{
+  size_t i, iters = INNER_LOOP_ITERS_LARGE;
+  timing_t start, stop, cur;
+  TIMING_NOW (start);
+  for (i = 0; i < iters; ++i)
+    {
+      CALL (impl, s, c[i % NUM_SEARCH_CHARS]);
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  json_element_double (json_ctx, (double)cur / (double)iters);
+}
+
+static void
+do_rand_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
+              float perc_zero)
+{
+  size_t i;
+  int perc_zero_int;
+  CHAR *buf = (CHAR *)buf1;
+  CHAR *c = (CHAR *)buf2;
+  align &= 127;
+  if ((align + len) * sizeof (CHAR) >= page_size)
+    return;
+
+  /* Test is only interesting if we can hit both cases. */
+  if (pos >= len)
+    return;
+
+  /* Would segfault if we ran the test. */
+  if (NUM_SEARCH_CHARS * sizeof (CHAR) > page_size)
+    return;
+
+  for (i = 0; i < len; ++i)
+    {
+      buf[align + i] = 2;
+    }
+  buf[align + len] = 0;
+  buf[align + pos] = 1;
+
+  perc_zero_int = perc_zero * RAND_MAX;
+  for (i = 0; i < NUM_SEARCH_CHARS; ++i)
+    {
+      if (rand () > perc_zero_int)
+        c[i] = 0;
+      else
+        c[i] = 1;
+    }
+  {
+    json_element_object_begin (json_ctx);
+    json_attr_uint (json_ctx, "rand", 1);
+    json_attr_uint (json_ctx, "branch", 1);
+    json_attr_double (json_ctx, "perc-zero", perc_zero);
+    json_attr_uint (json_ctx, "length", len);
+    json_attr_uint (json_ctx, "pos", pos);
+    json_attr_uint (json_ctx, "alignment", align);
+    json_array_begin (json_ctx, "timings");
+
+    FOR_EACH_IMPL (impl, 0)
+      do_one_rand_plus_branch_test (json_ctx, impl, buf + align, c);
+
+    json_array_end (json_ctx);
+    json_element_object_end (json_ctx);
+  }
+  {
+    json_element_object_begin (json_ctx);
+    json_attr_uint (json_ctx, "rand", 1);
+    json_attr_uint (json_ctx, "branch", 0);
+    json_attr_double (json_ctx, "perc-zero", perc_zero);
+    json_attr_uint (json_ctx, "length", len);
+    json_attr_uint (json_ctx, "pos", pos);
+    json_attr_uint (json_ctx, "alignment", align);
+    json_array_begin (json_ctx, "timings");
+
+    FOR_EACH_IMPL (impl, 0)
+      do_one_rand_test (json_ctx, impl, buf + align, c);
+
+    json_array_end (json_ctx);
+    json_element_object_end (json_ctx);
+  }
+}
+#endif
+
 static void
 do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
              const CHAR *exp_res)
@@ -136,6 +268,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
     result = NULLRET (buf + align + len);
 
   json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "rand", 0);
   json_attr_uint (json_ctx, "length", len);
   json_attr_uint (json_ctx, "pos", pos);
   json_attr_uint (json_ctx, "seek_char", seek_char);
@@ -234,6 +367,16 @@ test_main (void)
       do_test (&json_ctx, 0, i, i + 1, 0, BIG_CHAR);
     }
 
+  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.0);
+  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.1);
+  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.25);
+  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.33);
+  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.5);
+  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.66);
+  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.75);
+  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.9);
+  DO_RAND_TEST(&json_ctx, 0, 15, 16, 1.0);
+
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
   json_attr_object_end (&json_ctx);
-- 
2.25.1



* [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
  2022-03-23 21:57 ` [PATCH v1 02/23] benchtests: Add random benchmark " Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 18:53   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 04/23] x86: Code cleanup in strchr-evex " Noah Goldstein
                   ` (20 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Small code cleanup for size: -53 bytes.

Add a comment justifying using a branch for the NULL/non-null return.

All string/memory tests pass and no regressions in benchtests.

geometric_mean(N=20) of all benchmarks Original / New: 1.00
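
The usage pattern the branch is being tuned for is caller code that
immediately tests the result. A minimal illustrative sketch (handle()
is a hypothetical callee, not anything in glibc):

#include <string.h>

/* Hypothetical callee standing in for whatever the caller does with
   a match.  */
extern void handle (const char *p);

/* Typical strchr call site: the caller branches on NULL.  If strchr
   itself selects its return value with a branch, that internal
   branch is perfectly correlated with this one, so a miss inside
   strchr can absorb what would otherwise be a miss here.  */
void
example (const char *s, int c)
{
  const char *p = strchr (s, c);
  if (p != NULL)
    handle (p);
}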
---
Geometric Mean N=20 runs; All functions page aligned
length, alignment,  pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
  2048,         0,   32,    0,               23,                127,               1.033
  2048,         1,   32,    0,               23,                127,               1.006
  2048,         0,   64,    0,               23,                127,                1.02
  2048,         2,   64,    0,               23,                127,               0.992
  2048,         0,  128,    0,               23,                127,               0.996
  2048,         3,  128,    0,               23,                127,               0.966
  2048,         0,  256,    0,               23,                127,               0.996
  2048,         4,  256,    0,               23,                127,               0.998
  2048,         0,  512,    0,               23,                127,               0.991
  2048,         5,  512,    0,               23,                127,               0.991
  2048,         0, 1024,    0,               23,                127,               0.993
  2048,         6, 1024,    0,               23,                127,               0.992
  2048,         0, 2048,    0,               23,                127,               0.992
  2048,         7, 2048,    0,               23,                127,               0.976
  4096,         0,   32,    0,               23,                127,               0.983
  4096,         1,   32,    0,               23,                127,               0.994
  4096,         0,   64,    0,               23,                127,               0.968
  4096,         2,   64,    0,               23,                127,               1.018
  4096,         0,  128,    0,               23,                127,                0.99
  4096,         3,  128,    0,               23,                127,               1.001
  4096,         0,  256,    0,               23,                127,                 1.0
  4096,         4,  256,    0,               23,                127,               1.001
  4096,         0,  512,    0,               23,                127,               0.989
  4096,         5,  512,    0,               23,                127,               0.988
  4096,         0, 1024,    0,               23,                127,               0.994
  4096,         6, 1024,    0,               23,                127,               0.993
  4096,         0, 2048,    0,               23,                127,               0.987
  4096,         7, 2048,    0,               23,                127,               0.996
   256,         1,   64,    0,               23,                127,               1.004
   256,         2,   64,    0,               23,                127,               1.004
   256,         3,   64,    0,               23,                127,               0.992
   256,         4,   64,    0,               23,                127,               1.001
   256,         5,   64,    0,               23,                127,               1.001
   256,         6,   64,    0,               23,                127,               0.998
   256,         7,   64,    0,               23,                127,               0.994
   512,         0,  256,    0,               23,                127,               0.999
   512,        16,  256,    0,               23,                127,               1.002
   512,        32,  256,    0,               23,                127,               0.994
   512,        48,  256,    0,               23,                127,               0.991
   512,        64,  256,    0,               23,                127,               0.994
   512,        80,  256,    0,               23,                127,               0.994
   512,        96,  256,    0,               23,                127,               0.996
   512,       112,  256,    0,               23,                127,               0.999
     1,         0,    0,    0,               23,                127,               0.978
     2,         0,    1,    0,               23,                127,               0.981
     3,         0,    2,    0,               23,                127,               0.993
     4,         0,    3,    0,               23,                127,               1.004
     5,         0,    4,    0,               23,                127,               1.002
     6,         0,    5,    0,               23,                127,               0.991
     7,         0,    6,    0,               23,                127,                0.99
     8,         0,    7,    0,               23,                127,               1.012
     9,         0,    8,    0,               23,                127,               0.994
    10,         0,    9,    0,               23,                127,               1.003
    11,         0,   10,    0,               23,                127,               0.999
    12,         0,   11,    0,               23,                127,               1.007
    13,         0,   12,    0,               23,                127,                 1.0
    14,         0,   13,    0,               23,                127,               0.997
    15,         0,   14,    0,               23,                127,               0.996
    16,         0,   15,    0,               23,                127,               0.993
    17,         0,   16,    0,               23,                127,               1.002
    18,         0,   17,    0,               23,                127,               0.997
    19,         0,   18,    0,               23,                127,               0.998
    20,         0,   19,    0,               23,                127,               0.994
    21,         0,   20,    0,               23,                127,                0.99
    22,         0,   21,    0,               23,                127,               0.992
    23,         0,   22,    0,               23,                127,               0.996
    24,         0,   23,    0,               23,                127,               0.991
    25,         0,   24,    0,               23,                127,               0.997
    26,         0,   25,    0,               23,                127,               1.011
    27,         0,   26,    0,               23,                127,               1.013
    28,         0,   27,    0,               23,                127,               0.996
    29,         0,   28,    0,               23,                127,               0.993
    30,         0,   29,    0,               23,                127,               1.009
    31,         0,   30,    0,               23,                127,               1.009
    32,         0,   31,    0,               23,                127,               1.008
  2048,         0,   32,    0,                0,                127,                 1.0
  2048,         1,   32,    0,                0,                127,                1.01
  2048,         0,   64,    0,                0,                127,               0.997
  2048,         2,   64,    0,                0,                127,               1.002
  2048,         0,  128,    0,                0,                127,               0.986
  2048,         3,  128,    0,                0,                127,               0.997
  2048,         0,  256,    0,                0,                127,               1.002
  2048,         4,  256,    0,                0,                127,               0.999
  2048,         0,  512,    0,                0,                127,               0.991
  2048,         5,  512,    0,                0,                127,               0.984
  2048,         0, 1024,    0,                0,                127,               0.994
  2048,         6, 1024,    0,                0,                127,               0.993
  2048,         0, 2048,    0,                0,                127,               0.951
  2048,         7, 2048,    0,                0,                127,               0.989
  4096,         0,   32,    0,                0,                127,               0.993
  4096,         1,   32,    0,                0,                127,               0.997
  4096,         0,   64,    0,                0,                127,               1.004
  4096,         2,   64,    0,                0,                127,               1.016
  4096,         0,  128,    0,                0,                127,               0.973
  4096,         3,  128,    0,                0,                127,               1.001
  4096,         0,  256,    0,                0,                127,               0.999
  4096,         4,  256,    0,                0,                127,               0.998
  4096,         0,  512,    0,                0,                127,                0.99
  4096,         5,  512,    0,                0,                127,               0.985
  4096,         0, 1024,    0,                0,                127,               0.993
  4096,         6, 1024,    0,                0,                127,               0.997
  4096,         0, 2048,    0,                0,                127,               0.995
  4096,         7, 2048,    0,                0,                127,               0.996
   256,         1,   64,    0,                0,                127,                1.01
   256,         2,   64,    0,                0,                127,               1.024
   256,         3,   64,    0,                0,                127,                1.03
   256,         4,   64,    0,                0,                127,               1.004
   256,         5,   64,    0,                0,                127,               0.998
   256,         6,   64,    0,                0,                127,               0.998
   256,         7,   64,    0,                0,                127,               0.997
   512,         0,  256,    0,                0,                127,               0.996
   512,        16,  256,    0,                0,                127,               0.995
   512,        32,  256,    0,                0,                127,               0.996
   512,        48,  256,    0,                0,                127,               0.992
   512,        64,  256,    0,                0,                127,               0.999
   512,        80,  256,    0,                0,                127,               1.002
   512,        96,  256,    0,                0,                127,               0.999
   512,       112,  256,    0,                0,                127,               0.998
     1,         0,    0,    0,                0,                127,               1.016
     2,         0,    1,    0,                0,                127,               0.998
     3,         0,    2,    0,                0,                127,                1.02
     4,         0,    3,    0,                0,                127,               1.004
     5,         0,    4,    0,                0,                127,               1.021
     6,         0,    5,    0,                0,                127,               1.014
     7,         0,    6,    0,                0,                127,               1.007
     8,         0,    7,    0,                0,                127,               1.016
     9,         0,    8,    0,                0,                127,               1.003
    10,         0,    9,    0,                0,                127,               1.004
    11,         0,   10,    0,                0,                127,               0.995
    12,         0,   11,    0,                0,                127,               1.009
    13,         0,   12,    0,                0,                127,               1.005
    14,         0,   13,    0,                0,                127,               0.987
    15,         0,   14,    0,                0,                127,               0.998
    16,         0,   15,    0,                0,                127,               1.004
    17,         0,   16,    0,                0,                127,                1.01
    18,         0,   17,    0,                0,                127,                1.01
    19,         0,   18,    0,                0,                127,               1.006
    20,         0,   19,    0,                0,                127,               1.012
    21,         0,   20,    0,                0,                127,               0.999
    22,         0,   21,    0,                0,                127,               1.004
    23,         0,   22,    0,                0,                127,               0.988
    24,         0,   23,    0,                0,                127,               0.993
    25,         0,   24,    0,                0,                127,               1.004
    26,         0,   25,    0,                0,                127,                0.99
    27,         0,   26,    0,                0,                127,               1.016
    28,         0,   27,    0,                0,                127,               0.987
    29,         0,   28,    0,                0,                127,               0.989
    30,         0,   29,    0,                0,                127,               0.998
    31,         0,   30,    0,                0,                127,               1.005
    32,         0,   31,    0,                0,                127,               0.993

    16,         0,   15,    1,                1,                  0,               1.002
    16,         0,   15,    1,                0,                  0,                 1.0
    16,         0,   15,    1,                1,                0.1,               1.034
    16,         0,   15,    1,                0,                0.1,                1.03
    16,         0,   15,    1,                1,               0.25,               0.993
    16,         0,   15,    1,                0,               0.25,               1.081
    16,         0,   15,    1,                1,               0.33,               0.959
    16,         0,   15,    1,                0,               0.33,               1.142
    16,         0,   15,    1,                1,                0.5,               0.929
    16,         0,   15,    1,                0,                0.5,               1.072
    16,         0,   15,    1,                1,               0.66,               0.984
    16,         0,   15,    1,                0,               0.66,               1.069
    16,         0,   15,    1,                1,               0.75,               0.969
    16,         0,   15,    1,                0,               0.75,               1.059
    16,         0,   15,    1,                1,                0.9,                0.98
    16,         0,   15,    1,                0,                0.9,               0.994
    16,         0,   15,    1,                1,                  1,               0.993
    16,         0,   15,    1,                0,                  1,               0.996

 sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------
 1 file changed, 107 insertions(+), 97 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 086cabf76a..1a916cc951 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -48,13 +48,13 @@
 # define PAGE_SIZE 4096
 
 	.section SECTION(.text),"ax",@progbits
-ENTRY (STRCHR)
+ENTRY_P2ALIGN (STRCHR, 5)
 	/* Broadcast CHAR to YMM0.	*/
 	vmovd	%esi, %xmm0
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 	VPBROADCAST	%xmm0, %ymm0
-	vpxor	%xmm9, %xmm9, %xmm9
+	vpxor	%xmm1, %xmm1, %xmm1
 
 	/* Check if we cross page boundary with one vector load.  */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
@@ -62,37 +62,29 @@ ENTRY (STRCHR)
 
 	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
 	   null byte.  */
-	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqu	(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jz	L(aligned_more)
 	tzcntl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(%rdi, %rax), %CHAR_REG
-	jne	L(zero)
-# endif
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	/* .p2align 5 helps keep performance more consistent if ENTRY()
-	   alignment % 32 was either 16 or 0. As well this makes the
-	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
-	   easier.  */
-	.p2align 5
-L(first_vec_x4):
-	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 3 + 1), %rdi
-# ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
+	/* Found CHAR or the null byte.  */
 	cmp	(%rdi, %rax), %CHAR_REG
+	/* NB: Use a branch instead of cmovcc here. The expectation is
+	   that with strchr the user will branch based on the return
+	   being null. Since this branch is 100% correlated with the
+	   user branch, a branch miss here should save what would
+	   otherwise be a branch miss in the user code. As well, using
+	   a branch 1) saves code size and 2) is faster in highly
+	   predictable environments.  */
 	jne	L(zero)
 # endif
 	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 # ifndef USE_AS_STRCHRNUL
 L(zero):
@@ -103,7 +95,8 @@ L(zero):
 
 	.p2align 4
 L(first_vec_x1):
-	tzcntl	%eax, %eax
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
 	incq	%rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
@@ -113,9 +106,10 @@ L(first_vec_x1):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
+	.p2align 4,, 10
 L(first_vec_x2):
-	tzcntl	%eax, %eax
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
 	addq	$(VEC_SIZE + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
@@ -125,9 +119,10 @@ L(first_vec_x2):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
+	.p2align 4,, 8
 L(first_vec_x3):
-	tzcntl	%eax, %eax
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
 	addq	$(VEC_SIZE * 2 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
@@ -137,6 +132,21 @@ L(first_vec_x3):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
+	.p2align 4,, 10
+L(first_vec_x4):
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+
+
 	.p2align 4
 L(aligned_more):
 	/* Align data to VEC_SIZE - 1. This is the same number of
@@ -146,90 +156,92 @@ L(aligned_more):
 L(cross_page_continue):
 	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	vmovdqa	1(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x4)
-	/* Align data to VEC_SIZE * 4 - 1.	*/
-	addq	$(VEC_SIZE * 4 + 1), %rdi
-	andq	$-(VEC_SIZE * 4), %rdi
+	/* Align data to VEC_SIZE * 4 - 1.  */
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
 	.p2align 4
 L(loop_4x_vec):
 	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(%rdi), %ymm5
-	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+	vmovdqa	1(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm7
 
 	/* Leaves only CHARS matching esi as 0.	 */
-	vpxor	%ymm5, %ymm0, %ymm1
 	vpxor	%ymm6, %ymm0, %ymm2
 	vpxor	%ymm7, %ymm0, %ymm3
-	vpxor	%ymm8, %ymm0, %ymm4
 
-	VPMINU	%ymm1, %ymm5, %ymm1
 	VPMINU	%ymm2, %ymm6, %ymm2
 	VPMINU	%ymm3, %ymm7, %ymm3
-	VPMINU	%ymm4, %ymm8, %ymm4
 
-	VPMINU	%ymm1, %ymm2, %ymm5
-	VPMINU	%ymm3, %ymm4, %ymm6
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm7
+
+	vpxor	%ymm6, %ymm0, %ymm4
+	vpxor	%ymm7, %ymm0, %ymm5
+
+	VPMINU	%ymm4, %ymm6, %ymm4
+	VPMINU	%ymm5, %ymm7, %ymm5
 
-	VPMINU	%ymm5, %ymm6, %ymm6
+	VPMINU	%ymm2, %ymm3, %ymm6
+	VPMINU	%ymm4, %ymm5, %ymm7
 
-	VPCMPEQ	%ymm6, %ymm9, %ymm6
-	vpmovmskb %ymm6, %ecx
+	VPMINU	%ymm6, %ymm7, %ymm7
+
+	VPCMPEQ	%ymm7, %ymm1, %ymm7
+	vpmovmskb %ymm7, %ecx
 	subq	$-(VEC_SIZE * 4), %rdi
 	testl	%ecx, %ecx
 	jz	L(loop_4x_vec)
 
-
-	VPCMPEQ	%ymm1, %ymm9, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x0)
 
 
-	VPCMPEQ	%ymm5, %ymm9, %ymm2
-	vpmovmskb %ymm2, %eax
+	VPCMPEQ	%ymm3, %ymm1, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x1)
 
-	VPCMPEQ	%ymm3, %ymm9, %ymm3
-	vpmovmskb %ymm3, %eax
+	VPCMPEQ	%ymm4, %ymm1, %ymm4
+	vpmovmskb %ymm4, %eax
 	/* rcx has combined result from all 4 VEC. It will only be used
 	   if the first 3 other VEC all did not contain a match.  */
 	salq	$32, %rcx
 	orq	%rcx, %rax
 	tzcntq	%rax, %rax
-	subq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2 - 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
 	cmp	(%rdi, %rax), %CHAR_REG
@@ -239,10 +251,11 @@ L(loop_4x_vec):
 	VZEROUPPER_RETURN
 
 
-	.p2align 4
+	.p2align 4,, 10
 L(last_vec_x0):
-	tzcntl	%eax, %eax
-	addq	$-(VEC_SIZE * 4), %rdi
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+	addq	$-(VEC_SIZE * 4 - 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
 	cmp	(%rdi, %rax), %CHAR_REG
@@ -251,16 +264,11 @@ L(last_vec_x0):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-# ifndef USE_AS_STRCHRNUL
-L(zero_end):
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
-# endif
 
-	.p2align 4
+	.p2align 4,, 10
 L(last_vec_x1):
 	tzcntl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdi
+	subq	$(VEC_SIZE * 3 - 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
 	cmp	(%rdi, %rax), %CHAR_REG
@@ -269,18 +277,23 @@ L(last_vec_x1):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
 
 	/* Cold case for crossing page with first load.	 */
-	.p2align 4
+	.p2align 4,, 8
 L(cross_page_boundary):
 	movq	%rdi, %rdx
 	/* Align rdi to VEC_SIZE - 1.  */
 	orq	$(VEC_SIZE - 1), %rdi
-	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
 	   so no need to manually mod edx.  */
 	sarxl	%edx, %eax, %eax
@@ -291,13 +304,10 @@ L(cross_page_boundary):
 	xorl	%ecx, %ecx
 	/* Found CHAR or the null byte.	 */
 	cmp	(%rdx, %rax), %CHAR_REG
-	leaq	(%rdx, %rax), %rax
-	cmovne	%rcx, %rax
-# else
-	addq	%rdx, %rax
+	jne	L(zero_end)
 # endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	addq	%rdx, %rax
+	VZEROUPPER_RETURN
 
 END (STRCHR)
-# endif
+#endif
-- 
2.25.1



* [PATCH v1 04/23] x86: Code cleanup in strchr-evex and comment justifying branch
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
  2022-03-23 21:57 ` [PATCH v1 02/23] benchtests: Add random benchmark " Noah Goldstein
  2022-03-23 21:57 ` [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 18:54   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c Noah Goldstein
                   ` (19 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Small code cleanup for size: -81 bytes.

Add a comment justifying using a branch for the NULL/non-null return.

All string/memory tests pass and no regressions in benchtests.

geometric_mean(N=20) of all benchmarks New / Original: .985
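
Here geometric_mean is the usual exp-of-mean-of-logs over the
per-benchmark time ratios. A minimal sketch, with made-up inputs
rather than the measured data:

#include <math.h>
#include <stdio.h>

/* Geometric mean of per-benchmark time ratios: exp of the mean of
   the logs.  */
static double
geometric_mean (const double *ratios, size_t n)
{
  double sum_logs = 0.0;
  for (size_t i = 0; i < n; ++i)
    sum_logs += log (ratios[i]);
  return exp (sum_logs / n);
}

int
main (void)
{
  /* Made-up example ratios, not measured data.  */
  double ratios[] = { 0.878, 0.997, 1.001, 0.973 };
  printf ("%.3f\n", geometric_mean (ratios, 4));
  return 0;
}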
---
Geometric Mean N=20 runs; All functions page aligned
length, alignment,  pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
  2048,         0,   32,    0,               23,                127,               0.878
  2048,         1,   32,    0,               23,                127,                0.88
  2048,         0,   64,    0,               23,                127,               0.997
  2048,         2,   64,    0,               23,                127,               1.001
  2048,         0,  128,    0,               23,                127,               0.973
  2048,         3,  128,    0,               23,                127,               0.971
  2048,         0,  256,    0,               23,                127,               0.976
  2048,         4,  256,    0,               23,                127,               0.973
  2048,         0,  512,    0,               23,                127,               1.001
  2048,         5,  512,    0,               23,                127,               1.004
  2048,         0, 1024,    0,               23,                127,               1.005
  2048,         6, 1024,    0,               23,                127,               1.007
  2048,         0, 2048,    0,               23,                127,               1.035
  2048,         7, 2048,    0,               23,                127,                1.03
  4096,         0,   32,    0,               23,                127,               0.889
  4096,         1,   32,    0,               23,                127,               0.891
  4096,         0,   64,    0,               23,                127,               1.012
  4096,         2,   64,    0,               23,                127,               1.017
  4096,         0,  128,    0,               23,                127,               0.975
  4096,         3,  128,    0,               23,                127,               0.974
  4096,         0,  256,    0,               23,                127,               0.974
  4096,         4,  256,    0,               23,                127,               0.972
  4096,         0,  512,    0,               23,                127,               1.002
  4096,         5,  512,    0,               23,                127,               1.016
  4096,         0, 1024,    0,               23,                127,               1.009
  4096,         6, 1024,    0,               23,                127,               1.008
  4096,         0, 2048,    0,               23,                127,               1.003
  4096,         7, 2048,    0,               23,                127,               1.004
   256,         1,   64,    0,               23,                127,               0.993
   256,         2,   64,    0,               23,                127,               0.999
   256,         3,   64,    0,               23,                127,               0.992
   256,         4,   64,    0,               23,                127,                0.99
   256,         5,   64,    0,               23,                127,                0.99
   256,         6,   64,    0,               23,                127,               0.994
   256,         7,   64,    0,               23,                127,               0.991
   512,         0,  256,    0,               23,                127,               0.971
   512,        16,  256,    0,               23,                127,               0.971
   512,        32,  256,    0,               23,                127,               1.005
   512,        48,  256,    0,               23,                127,               0.998
   512,        64,  256,    0,               23,                127,               1.001
   512,        80,  256,    0,               23,                127,               1.002
   512,        96,  256,    0,               23,                127,               1.005
   512,       112,  256,    0,               23,                127,               1.012
     1,         0,    0,    0,               23,                127,               1.024
     2,         0,    1,    0,               23,                127,               0.991
     3,         0,    2,    0,               23,                127,               0.997
     4,         0,    3,    0,               23,                127,               0.984
     5,         0,    4,    0,               23,                127,               0.993
     6,         0,    5,    0,               23,                127,               0.985
     7,         0,    6,    0,               23,                127,               0.979
     8,         0,    7,    0,               23,                127,               0.975
     9,         0,    8,    0,               23,                127,               0.965
    10,         0,    9,    0,               23,                127,               0.957
    11,         0,   10,    0,               23,                127,               0.979
    12,         0,   11,    0,               23,                127,               0.987
    13,         0,   12,    0,               23,                127,               1.023
    14,         0,   13,    0,               23,                127,               0.997
    15,         0,   14,    0,               23,                127,               0.983
    16,         0,   15,    0,               23,                127,               0.987
    17,         0,   16,    0,               23,                127,               0.993
    18,         0,   17,    0,               23,                127,               0.985
    19,         0,   18,    0,               23,                127,               0.999
    20,         0,   19,    0,               23,                127,               0.998
    21,         0,   20,    0,               23,                127,               0.983
    22,         0,   21,    0,               23,                127,               0.983
    23,         0,   22,    0,               23,                127,               1.002
    24,         0,   23,    0,               23,                127,                 1.0
    25,         0,   24,    0,               23,                127,               1.002
    26,         0,   25,    0,               23,                127,               0.984
    27,         0,   26,    0,               23,                127,               0.994
    28,         0,   27,    0,               23,                127,               0.995
    29,         0,   28,    0,               23,                127,               1.017
    30,         0,   29,    0,               23,                127,               1.009
    31,         0,   30,    0,               23,                127,               1.001
    32,         0,   31,    0,               23,                127,               1.021
  2048,         0,   32,    0,                0,                127,               0.899
  2048,         1,   32,    0,                0,                127,                0.93
  2048,         0,   64,    0,                0,                127,               1.009
  2048,         2,   64,    0,                0,                127,               1.023
  2048,         0,  128,    0,                0,                127,               0.973
  2048,         3,  128,    0,                0,                127,               0.975
  2048,         0,  256,    0,                0,                127,               0.974
  2048,         4,  256,    0,                0,                127,                0.97
  2048,         0,  512,    0,                0,                127,               0.999
  2048,         5,  512,    0,                0,                127,               1.004
  2048,         0, 1024,    0,                0,                127,               1.008
  2048,         6, 1024,    0,                0,                127,               1.008
  2048,         0, 2048,    0,                0,                127,               0.996
  2048,         7, 2048,    0,                0,                127,               1.002
  4096,         0,   32,    0,                0,                127,               0.872
  4096,         1,   32,    0,                0,                127,               0.881
  4096,         0,   64,    0,                0,                127,               1.006
  4096,         2,   64,    0,                0,                127,               1.005
  4096,         0,  128,    0,                0,                127,               0.973
  4096,         3,  128,    0,                0,                127,               0.974
  4096,         0,  256,    0,                0,                127,               0.969
  4096,         4,  256,    0,                0,                127,               0.971
  4096,         0,  512,    0,                0,                127,                 1.0
  4096,         5,  512,    0,                0,                127,               1.005
  4096,         0, 1024,    0,                0,                127,               1.007
  4096,         6, 1024,    0,                0,                127,               1.009
  4096,         0, 2048,    0,                0,                127,               1.005
  4096,         7, 2048,    0,                0,                127,               1.007
   256,         1,   64,    0,                0,                127,               0.994
   256,         2,   64,    0,                0,                127,               1.008
   256,         3,   64,    0,                0,                127,               1.019
   256,         4,   64,    0,                0,                127,               0.991
   256,         5,   64,    0,                0,                127,               0.992
   256,         6,   64,    0,                0,                127,               0.991
   256,         7,   64,    0,                0,                127,               0.988
   512,         0,  256,    0,                0,                127,               0.971
   512,        16,  256,    0,                0,                127,               0.967
   512,        32,  256,    0,                0,                127,               1.005
   512,        48,  256,    0,                0,                127,               1.001
   512,        64,  256,    0,                0,                127,               1.009
   512,        80,  256,    0,                0,                127,               1.008
   512,        96,  256,    0,                0,                127,               1.009
   512,       112,  256,    0,                0,                127,               1.016
     1,         0,    0,    0,                0,                127,               1.038
     2,         0,    1,    0,                0,                127,               1.009
     3,         0,    2,    0,                0,                127,               0.992
     4,         0,    3,    0,                0,                127,               1.004
     5,         0,    4,    0,                0,                127,               0.966
     6,         0,    5,    0,                0,                127,               0.968
     7,         0,    6,    0,                0,                127,               1.004
     8,         0,    7,    0,                0,                127,                0.99
     9,         0,    8,    0,                0,                127,               0.958
    10,         0,    9,    0,                0,                127,                0.96
    11,         0,   10,    0,                0,                127,               0.948
    12,         0,   11,    0,                0,                127,               0.984
    13,         0,   12,    0,                0,                127,               0.967
    14,         0,   13,    0,                0,                127,               0.993
    15,         0,   14,    0,                0,                127,               0.991
    16,         0,   15,    0,                0,                127,                 1.0
    17,         0,   16,    0,                0,                127,               0.982
    18,         0,   17,    0,                0,                127,               0.977
    19,         0,   18,    0,                0,                127,               0.987
    20,         0,   19,    0,                0,                127,               0.978
    21,         0,   20,    0,                0,                127,                 1.0
    22,         0,   21,    0,                0,                127,                0.99
    23,         0,   22,    0,                0,                127,               0.988
    24,         0,   23,    0,                0,                127,               0.997
    25,         0,   24,    0,                0,                127,               1.003
    26,         0,   25,    0,                0,                127,               1.004
    27,         0,   26,    0,                0,                127,               0.982
    28,         0,   27,    0,                0,                127,               0.972
    29,         0,   28,    0,                0,                127,               0.978
    30,         0,   29,    0,                0,                127,               0.992
    31,         0,   30,    0,                0,                127,               0.986
    32,         0,   31,    0,                0,                127,                 1.0

    16,         0,   15,    1,                1,                  0,               0.997
    16,         0,   15,    1,                0,                  0,               1.001
    16,         0,   15,    1,                1,                0.1,               0.984
    16,         0,   15,    1,                0,                0.1,               0.999
    16,         0,   15,    1,                1,               0.25,               0.929
    16,         0,   15,    1,                0,               0.25,               1.001
    16,         0,   15,    1,                1,               0.33,               0.892
    16,         0,   15,    1,                0,               0.33,               0.996
    16,         0,   15,    1,                1,                0.5,               0.897
    16,         0,   15,    1,                0,                0.5,               1.009
    16,         0,   15,    1,                1,               0.66,               0.882
    16,         0,   15,    1,                0,               0.66,               0.967
    16,         0,   15,    1,                1,               0.75,               0.919
    16,         0,   15,    1,                0,               0.75,               1.027
    16,         0,   15,    1,                1,                0.9,               0.949
    16,         0,   15,    1,                0,                0.9,               1.021
    16,         0,   15,    1,                1,                  1,               0.998
    16,         0,   15,    1,                0,                  1,               0.999

 sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++-----------
 1 file changed, 80 insertions(+), 66 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index f62cd9d144..ec739fb8f9 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -30,6 +30,7 @@
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMP		vpcmpd
+#  define VPTESTN	vptestnmd
 #  define VPMINU	vpminud
 #  define CHAR_REG	esi
 #  define SHIFT_REG	ecx
@@ -37,6 +38,7 @@
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMP		vpcmpb
+#  define VPTESTN	vptestnmb
 #  define VPMINU	vpminub
 #  define CHAR_REG	sil
 #  define SHIFT_REG	edx
@@ -61,13 +63,11 @@
 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 
 	.section .text.evex,"ax",@progbits
-ENTRY (STRCHR)
+ENTRY_P2ALIGN (STRCHR, 5)
 	/* Broadcast CHAR to YMM0.	*/
 	VPBROADCAST	%esi, %YMM0
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-
 	/* Check if we cross page boundary with one vector load.
 	   Otherwise it is safe to use an unaligned load.  */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
@@ -81,49 +81,35 @@ ENTRY (STRCHR)
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jz	L(aligned_more)
 	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.  */
+	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	/* NB: Use a branch instead of cmovcc here. The expectation is
+	   that with strchr the user will branch based on the return
+	   being null. Since this branch is 100% correlated with the
+	   user branch, a branch miss here should save what would
+	   otherwise be a branch miss in the user code. As well, using
+	   a branch 1) saves code size and 2) is faster in highly
+	   predictable environments.  */
+	jne	L(zero)
+# endif
 # ifdef USE_AS_WCSCHR
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
 	 */
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
 	addq	%rdi, %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(%rax), %CHAR_REG
-	jne	L(zero)
 # endif
 	ret
 
-	/* .p2align 5 helps keep performance more consistent if ENTRY()
-	   alignment % 32 was either 16 or 0. As well this makes the
-	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
-	   easier.  */
-	.p2align 5
-L(first_vec_x3):
-	tzcntl	%eax, %eax
-# ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
-	jne	L(zero)
-# endif
-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
-	   bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
 
-# ifndef USE_AS_STRCHRNUL
-L(zero):
-	xorl	%eax, %eax
-	ret
-# endif
 
-	.p2align 4
+	.p2align 4,, 10
 L(first_vec_x4):
 # ifndef USE_AS_STRCHRNUL
 	/* Check to see if first match was CHAR (k0) or null (k1).  */
@@ -144,9 +130,18 @@ L(first_vec_x4):
 	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+# ifndef USE_AS_STRCHRNUL
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+
 	.p2align 4
 L(first_vec_x1):
-	tzcntl	%eax, %eax
+	/* Use bsf here to save 1 byte, keeping the block in 1x
+	   fetch block. eax is guaranteed non-zero.  */
+	bsfl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
 	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -158,7 +153,7 @@ L(first_vec_x1):
 	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
+	.p2align 4,, 10
 L(first_vec_x2):
 # ifndef USE_AS_STRCHRNUL
 	/* Check to see if first match was CHAR (k0) or null (k1).  */
@@ -179,6 +174,21 @@ L(first_vec_x2):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+	.p2align 4,, 10
+L(first_vec_x3):
+	/* Use bsf here to save 1 byte, keeping the block in 1x
+	   fetch block. eax is guaranteed non-zero.  */
+	bsfl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
 	.p2align 4
 L(aligned_more):
 	/* Align data to VEC_SIZE.  */
@@ -195,7 +205,7 @@ L(cross_page_continue):
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
@@ -206,7 +216,7 @@ L(cross_page_continue):
 	/* Each bit in K0 represents a CHAR in YMM1.  */
 	VPCMP	$0, %YMM1, %YMM0, %k0
 	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	VPTESTN	%YMM1, %YMM1, %k1
 	kortestd	%k0, %k1
 	jnz	L(first_vec_x2)
 
@@ -215,7 +225,7 @@ L(cross_page_continue):
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
@@ -224,7 +234,7 @@ L(cross_page_continue):
 	/* Each bit in K0 represents a CHAR in YMM1.  */
 	VPCMP	$0, %YMM1, %YMM0, %k0
 	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	VPTESTN	%YMM1, %YMM1, %k1
 	kortestd	%k0, %k1
 	jnz	L(first_vec_x4)
 
@@ -265,33 +275,33 @@ L(loop_4x_vec):
 	VPMINU	%YMM3, %YMM4, %YMM4
 	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
 
-	VPCMP	$0, %YMMZERO, %YMM4, %k1
+	VPTESTN	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
 	subq	$-(VEC_SIZE * 4), %rdi
 	testl	%ecx, %ecx
 	jz	L(loop_4x_vec)
 
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+	VPTESTN	%YMM1, %YMM1, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x1)
 
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x2)
 
-	VPCMP	$0, %YMMZERO, %YMM3, %k0
+	VPTESTN	%YMM3, %YMM3, %k0
 	kmovd	%k0, %eax
 	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
 # ifdef USE_AS_WCSCHR
 	sall	$8, %ecx
 	orl	%ecx, %eax
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 # else
 	salq	$32, %rcx
 	orq	%rcx, %rax
-	tzcntq	%rax, %rax
+	bsfq	%rax, %rax
 # endif
 # ifndef USE_AS_STRCHRNUL
 	/* Check if match was CHAR or null.  */
@@ -303,28 +313,28 @@ L(loop_4x_vec):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-# ifndef USE_AS_STRCHRNUL
-L(zero_end):
-	xorl	%eax, %eax
-	ret
+	.p2align 4,, 8
+L(last_vec_x1):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	   */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
 
-	.p2align 4
-L(last_vec_x1):
-	tzcntl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
-	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	cmp	(%rax), %CHAR_REG
 	jne	L(zero_end)
 # endif
-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
-	   bytes.  */
-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
 L(last_vec_x2):
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
 	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -336,7 +346,7 @@ L(last_vec_x2):
 	ret
 
 	/* Cold case for crossing page with first load.	 */
-	.p2align 4
+	.p2align 4,, 8
 L(cross_page_boundary):
 	movq	%rdi, %rdx
 	/* Align rdi.  */
@@ -346,9 +356,9 @@ L(cross_page_boundary):
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
-	/* Remove the leading bits.	 */
+	/* Remove the leading bits.  */
 # ifdef USE_AS_WCSCHR
 	movl	%edx, %SHIFT_REG
 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
@@ -360,20 +370,24 @@ L(cross_page_boundary):
 	/* If eax is zero continue.  */
 	testl	%eax, %eax
 	jz	L(cross_page_continue)
-	tzcntl	%eax, %eax
-# ifndef USE_AS_STRCHRNUL
-	/* Check to see if match was CHAR or null.  */
-	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
-	jne	L(zero_end)
-# endif
+	bsfl	%eax, %eax
+
 # ifdef USE_AS_WCSCHR
 	/* NB: Multiply wchar_t count by 4 to get the number of
 	   bytes.  */
 	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 # else
 	addq	%rdx, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if match was CHAR or null.  */
+	cmp	(%rax), %CHAR_REG
+	je	L(cross_page_ret)
+L(zero_end):
+	xorl	%eax, %eax
+L(cross_page_ret):
 # endif
 	ret
 
 END (STRCHR)
-# endif
+#endif
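
As an aside (not part of the patch), a minimal sketch of the caller
pattern the branch-vs-cmovcc comment above assumes; scan and
handle_match are hypothetical names:

#include <string.h>

/* Typical strchr usage: the caller immediately branches on NULL.  An
   internal found-CHAR-vs-found-null branch resolves the same way as
   this user branch, so the predictor learns both together.  */
void
scan (const char *s, int needle, void (*handle_match) (const char *))
{
  const char *p = strchr (s, needle);
  if (p != NULL)
    handle_match (p);
}
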
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (2 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 04/23] x86: Code cleanup in strchr-evex " Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 18:54   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c Noah Goldstein
                   ` (18 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Just a QOL change to make parsing the output of the benchtests more
consistent.
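
For illustration (not part of the patch), a minimal sketch of the
json-lib call sequence these benchtest conversions share. It uses
glibc's internal "json-lib.h", so it only builds in the benchtests
tree, and the "hp_timing"/"strpbrk"/6/12.5 values are made-up
placeholders:

#include <stdio.h>
#include "json-lib.h"  /* glibc-internal header.  */

int
main (void)
{
  json_ctx_t ctx;

  json_init (&ctx, 0, stdout);
  json_document_begin (&ctx);
  json_attr_string (&ctx, "timing_type", "hp_timing");
  json_attr_object_begin (&ctx, "functions");
  json_attr_object_begin (&ctx, "strpbrk");
  json_array_begin (&ctx, "results");

  /* One measurement point: its parameters, then one timing per ifunc.  */
  json_element_object_begin (&ctx);
  json_attr_uint (&ctx, "len", 6);
  json_array_begin (&ctx, "timings");
  json_element_double (&ctx, 12.5);
  json_array_end (&ctx);
  json_element_object_end (&ctx);

  json_array_end (&ctx);
  json_attr_object_end (&ctx);
  json_attr_object_end (&ctx);
  json_document_end (&ctx);
  return 0;
}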
---
 benchtests/bench-strpbrk.c | 81 ++++++++++++++++++++++++++++----------
 1 file changed, 61 insertions(+), 20 deletions(-)

diff --git a/benchtests/bench-strpbrk.c b/benchtests/bench-strpbrk.c
index d46bf9c0e2..a7522a76e6 100644
--- a/benchtests/bench-strpbrk.c
+++ b/benchtests/bench-strpbrk.c
@@ -62,11 +62,14 @@ SIMPLE_STRPBRK (const CHAR *s, const CHAR *rej)
 
 #endif /* !STRPBRK_RESULT */
 
+#include "json-lib.h"
+
 static void
-do_one_test (impl_t *impl, const CHAR *s, const CHAR *rej, RES_TYPE exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s,
+             const CHAR *rej, RES_TYPE exp_res)
 {
   RES_TYPE res = CALL (impl, s, rej);
-  size_t i, iters = INNER_LOOP_ITERS_MEDIUM;
+  size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
 
   if (res != exp_res)
@@ -86,23 +89,26 @@ do_one_test (impl_t *impl, const CHAR *s, const CHAR *rej, RES_TYPE exp_res)
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double)cur / (double)iters);
 }
 
 static void
-do_test (size_t align, size_t pos, size_t len)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t pos,
+         size_t len)
 {
   size_t i;
   int c;
   RES_TYPE result;
   CHAR *rej, *s;
 
-  align &= 7;
-  if ((align + pos + 10) * sizeof (CHAR) >= page_size || len > 240)
+  align1 &= 7;
+  if ((align1 + pos + 10) * sizeof (CHAR) >= page_size || len > 240)
+    return;
+  if ((align2 + len) * sizeof (CHAR) >= page_size)
     return;
 
-  rej = (CHAR *) (buf2) + (random () & 255);
-  s = (CHAR *) (buf1) + align;
+  rej = (CHAR *) (buf2) + align2;
+  s = (CHAR *) (buf1) + align1;
 
   for (i = 0; i < len; ++i)
     {
@@ -136,43 +142,78 @@ do_test (size_t align, size_t pos, size_t len)
     }
   result = STRPBRK_RESULT (s, pos);
 
-  printf ("Length %4zd, alignment %2zd, rej len %2zd:", pos, align, len);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "len", len);
+  json_attr_uint (json_ctx, "pos", pos);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s, rej, result);
+    do_one_test (json_ctx, impl, s, rej, result);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
+  json_ctx_t json_ctx;
   size_t i;
 
   test_init ();
 
-  printf ("%32s", "");
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
+
 
   for (i = 0; i < 32; ++i)
     {
-      do_test (0, 512, i);
-      do_test (i, 512, i);
+      do_test (&json_ctx, 0, 0, 512, i);
+      do_test (&json_ctx, i, 0, 512, i);
+      do_test (&json_ctx, 0, i, 512, i);
+      do_test (&json_ctx, i, i, 512, i);
+
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 16 << i, 4);
-      do_test (i, 16 << i, 4);
+      do_test (&json_ctx, 0, 0, 16 << i, 4);
+      do_test (&json_ctx, i, 0, 16 << i, 4);
+      do_test (&json_ctx, 0, i, 16 << i, 4);
+      do_test (&json_ctx, i, i, 16 << i, 4);
     }
 
   for (i = 1; i < 8; ++i)
-    do_test (i, 64, 10);
+  {
+    do_test (&json_ctx, i, 0, 64, 10);
+    do_test (&json_ctx, i, i, 64, 10);
+  }
 
   for (i = 0; i < 64; ++i)
-    do_test (0, i, 6);
+  {
+    do_test (&json_ctx, 0, 0, i, 6);
+    do_test (&json_ctx, 0, i, i, 6);
+  }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
 
   return ret;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (3 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 18:54   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c Noah Goldstein
                   ` (17 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Just a QOL change to make parsing the output of the benchtests more
consistent.
---
 benchtests/bench-strspn.c | 78 +++++++++++++++++++++++++++++----------
 1 file changed, 58 insertions(+), 20 deletions(-)

diff --git a/benchtests/bench-strspn.c b/benchtests/bench-strspn.c
index d79c36fae6..061e90c54d 100644
--- a/benchtests/bench-strspn.c
+++ b/benchtests/bench-strspn.c
@@ -23,6 +23,7 @@
 # define TEST_NAME "wcsspn"
 #endif /* WIDE */
 #include "bench-string.h"
+#include "json-lib.h"
 
 #define BIG_CHAR MAX_CHAR
 
@@ -58,9 +59,10 @@ SIMPLE_STRSPN (const CHAR *s, const CHAR *acc)
 }
 
 static void
-do_one_test (impl_t *impl, const CHAR *s, const CHAR *acc, size_t exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s,
+             const CHAR *acc, size_t exp_res)
 {
-  size_t res = CALL (impl, s, acc), i, iters = INNER_LOOP_ITERS_MEDIUM;
+  size_t res = CALL (impl, s, acc), i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
 
   if (res != exp_res)
@@ -80,21 +82,24 @@ do_one_test (impl_t *impl, const CHAR *s, const CHAR *acc, size_t exp_res)
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double)cur / (double)iters);
 }
 
 static void
-do_test (size_t align, size_t pos, size_t len)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t pos,
+         size_t len)
 {
   size_t i;
   CHAR *acc, *s;
 
-  align &= 7;
-  if ((align + pos + 10) * sizeof (CHAR) >= page_size || len > 240 || ! len)
+  align1 &= 7;
+  if ((align1 + pos + 10) * sizeof (CHAR) >= page_size || len > 240 || !len)
+    return;
+  if ((align2 + len) * sizeof (CHAR) >= page_size)
     return;
 
-  acc = (CHAR *) (buf2) + (random () & 255);
-  s = (CHAR *) (buf1) + align;
+  acc = (CHAR *) (buf2) + align2;
+  s = (CHAR *) (buf1) + align1;
 
   for (i = 0; i < len; ++i)
     {
@@ -118,43 +123,76 @@ do_test (size_t align, size_t pos, size_t len)
       s[i] = '\0';
     }
 
-  printf ("Length %4zd, alignment %2zd, acc len %2zd:", pos, align, len);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "len", len);
+  json_attr_uint (json_ctx, "pos", pos);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s, acc, pos);
+    do_one_test (json_ctx, impl, s, acc, pos);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
+  json_ctx_t json_ctx;
   size_t i;
 
   test_init ();
 
-  printf ("%32s", "");
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
   for (i = 0; i < 32; ++i)
     {
-      do_test (0, 512, i);
-      do_test (i, 512, i);
+      do_test (&json_ctx, 0, 0, 512, i);
+      do_test (&json_ctx, i, 0, 512, i);
+      do_test (&json_ctx, 0, i, 512, i);
+      do_test (&json_ctx, i, i, 512, i);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 16 << i, 4);
-      do_test (i, 16 << i, 4);
+      do_test (&json_ctx, 0, 0, 16 << i, 4);
+      do_test (&json_ctx, i, 0, 16 << i, 4);
+      do_test (&json_ctx, 0, i, 16 << i, 4);
+      do_test (&json_ctx, i, i, 16 << i, 4);
     }
 
   for (i = 1; i < 8; ++i)
-    do_test (i, 64, 10);
+    {
+      do_test (&json_ctx, i, 0, 64, 10);
+      do_test (&json_ctx, i, i, 64, 10);
+    }
 
   for (i = 0; i < 64; ++i)
-    do_test (0, i, 6);
+    {
+      do_test (&json_ctx, 0, 0, i, 6);
+      do_test (&json_ctx, 0, i, i, 6);
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
 
   return ret;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (4 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 18:55   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c Noah Goldstein
                   ` (16 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get the string length
instead of _mm_cmpistri. Also change offset to unsigned to avoid
unnecessary sign extensions.
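
For reference, a standalone sketch (not the patch itself) of the
_mm_cmpeq_epi8/_mm_movemask_epi8 scan that replaces _mm_cmpistri for
finding the null terminator; first_zero_index is a made-up name:

#include <emmintrin.h>  /* SSE2 intrinsics.  */

/* Index of the first zero byte in the 16-byte block at P (which must
   be 16-byte aligned), or 16 if the block contains no zero byte.  */
static unsigned int
first_zero_index (const char *p)
{
  __m128i v = _mm_load_si128 ((const __m128i *) p);
  __m128i z = _mm_cmpeq_epi8 (v, _mm_set1_epi8 (0));
  unsigned int bits = _mm_movemask_epi8 (z);
  return bits ? (unsigned int) __builtin_ctz (bits) : 16;
}

int
main (void)
{
  _Alignas (16) char buf[16] = "abc";
  return first_zero_index (buf) == 3 ? 0 : 1;
}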

geometric_mean(N=20) of all benchmarks that don't fall back on
sse2/strlen; New / Original: .928

All string/memory tests pass.
---
Geometric Mean N=20 runs; All functions page aligned
len, align1, align2,  pos, New Time / Old Time
  0,      0,      0,  512,               1.207
  1,      0,      0,  512,               1.039
  1,      1,      0,  512,               0.997
  1,      0,      1,  512,               0.981
  1,      1,      1,  512,               0.977
  2,      0,      0,  512,                1.02
  2,      2,      0,  512,               0.979
  2,      0,      2,  512,               0.902
  2,      2,      2,  512,               0.958
  3,      0,      0,  512,               0.978
  3,      3,      0,  512,               0.988
  3,      0,      3,  512,               0.979
  3,      3,      3,  512,               0.955
  4,      0,      0,  512,               0.969
  4,      4,      0,  512,               0.991
  4,      0,      4,  512,                0.94
  4,      4,      4,  512,               0.958
  5,      0,      0,  512,               0.963
  5,      5,      0,  512,               1.004
  5,      0,      5,  512,               0.948
  5,      5,      5,  512,               0.971
  6,      0,      0,  512,               0.933
  6,      6,      0,  512,               1.007
  6,      0,      6,  512,               0.921
  6,      6,      6,  512,               0.969
  7,      0,      0,  512,               0.928
  7,      7,      0,  512,               0.976
  7,      0,      7,  512,               0.932
  7,      7,      7,  512,               0.995
  8,      0,      0,  512,               0.931
  8,      0,      8,  512,               0.766
  9,      0,      0,  512,               0.965
  9,      1,      0,  512,               0.999
  9,      0,      9,  512,               0.765
  9,      1,      9,  512,                0.97
 10,      0,      0,  512,               0.976
 10,      2,      0,  512,               0.991
 10,      0,     10,  512,               0.768
 10,      2,     10,  512,               0.926
 11,      0,      0,  512,               0.958
 11,      3,      0,  512,               1.006
 11,      0,     11,  512,               0.768
 11,      3,     11,  512,               0.908
 12,      0,      0,  512,               0.945
 12,      4,      0,  512,               0.896
 12,      0,     12,  512,               0.764
 12,      4,     12,  512,               0.785
 13,      0,      0,  512,               0.957
 13,      5,      0,  512,               1.019
 13,      0,     13,  512,                0.76
 13,      5,     13,  512,               0.785
 14,      0,      0,  512,               0.918
 14,      6,      0,  512,               1.004
 14,      0,     14,  512,                0.78
 14,      6,     14,  512,               0.711
 15,      0,      0,  512,               0.855
 15,      7,      0,  512,               0.985
 15,      0,     15,  512,               0.779
 15,      7,     15,  512,               0.772
 16,      0,      0,  512,               0.987
 16,      0,     16,  512,                0.99
 17,      0,      0,  512,               0.996
 17,      1,      0,  512,               0.979
 17,      0,     17,  512,               1.001
 17,      1,     17,  512,                1.03
 18,      0,      0,  512,               0.976
 18,      2,      0,  512,               0.989
 18,      0,     18,  512,               0.976
 18,      2,     18,  512,               0.992
 19,      0,      0,  512,               0.991
 19,      3,      0,  512,               0.988
 19,      0,     19,  512,               1.009
 19,      3,     19,  512,               1.018
 20,      0,      0,  512,               0.999
 20,      4,      0,  512,               1.005
 20,      0,     20,  512,               0.993
 20,      4,     20,  512,               0.983
 21,      0,      0,  512,               0.982
 21,      5,      0,  512,               0.988
 21,      0,     21,  512,               0.978
 21,      5,     21,  512,               0.984
 22,      0,      0,  512,               0.988
 22,      6,      0,  512,               0.979
 22,      0,     22,  512,               0.984
 22,      6,     22,  512,               0.983
 23,      0,      0,  512,               0.996
 23,      7,      0,  512,               0.998
 23,      0,     23,  512,               0.979
 23,      7,     23,  512,               0.987
 24,      0,      0,  512,                0.99
 24,      0,     24,  512,               0.979
 25,      0,      0,  512,               0.985
 25,      1,      0,  512,               0.988
 25,      0,     25,  512,                0.99
 25,      1,     25,  512,               0.986
 26,      0,      0,  512,               1.005
 26,      2,      0,  512,               0.995
 26,      0,     26,  512,               0.992
 26,      2,     26,  512,               0.983
 27,      0,      0,  512,               0.986
 27,      3,      0,  512,               0.978
 27,      0,     27,  512,               0.986
 27,      3,     27,  512,               0.973
 28,      0,      0,  512,               0.995
 28,      4,      0,  512,               0.993
 28,      0,     28,  512,               0.983
 28,      4,     28,  512,               1.005
 29,      0,      0,  512,               0.983
 29,      5,      0,  512,               0.982
 29,      0,     29,  512,               0.984
 29,      5,     29,  512,               1.005
 30,      0,      0,  512,               0.978
 30,      6,      0,  512,               0.985
 30,      0,     30,  512,               0.994
 30,      6,     30,  512,               0.993
 31,      0,      0,  512,               0.984
 31,      7,      0,  512,               0.983
 31,      0,     31,  512,                 1.0
 31,      7,     31,  512,               1.031
  4,      0,      0,   32,               0.916
  4,      1,      0,   32,               0.952
  4,      0,      1,   32,               0.927
  4,      1,      1,   32,               0.969
  4,      0,      0,   64,               0.961
  4,      2,      0,   64,               0.955
  4,      0,      2,   64,               0.975
  4,      2,      2,   64,               0.972
  4,      0,      0,  128,               0.971
  4,      3,      0,  128,               0.982
  4,      0,      3,  128,               0.945
  4,      3,      3,  128,               0.971
  4,      0,      0,  256,               1.004
  4,      4,      0,  256,               0.966
  4,      0,      4,  256,               0.961
  4,      4,      4,  256,               0.971
  4,      5,      0,  512,               0.929
  4,      0,      5,  512,               0.969
  4,      5,      5,  512,               0.985
  4,      0,      0, 1024,               1.003
  4,      6,      0, 1024,               1.009
  4,      0,      6, 1024,               1.005
  4,      6,      6, 1024,               0.999
  4,      0,      0, 2048,               0.917
  4,      7,      0, 2048,               1.015
  4,      0,      7, 2048,               1.011
  4,      7,      7, 2048,               0.907
 10,      1,      0,   64,               0.964
 10,      1,      1,   64,               0.966
 10,      2,      0,   64,               0.953
 10,      2,      2,   64,               0.972
 10,      3,      0,   64,               0.962
 10,      3,      3,   64,               0.969
 10,      4,      0,   64,               0.957
 10,      4,      4,   64,               0.969
 10,      5,      0,   64,               0.961
 10,      5,      5,   64,               0.965
 10,      6,      0,   64,               0.949
 10,      6,      6,   64,                 0.9
 10,      7,      0,   64,               0.957
 10,      7,      7,   64,               0.897
  6,      0,      0,    0,               0.991
  6,      0,      0,    1,               1.011
  6,      0,      1,    1,               0.939
  6,      0,      0,    2,               1.016
  6,      0,      2,    2,                0.94
  6,      0,      0,    3,               1.019
  6,      0,      3,    3,               0.941
  6,      0,      0,    4,               1.056
  6,      0,      4,    4,               0.884
  6,      0,      0,    5,               0.977
  6,      0,      5,    5,               0.934
  6,      0,      0,    6,               0.954
  6,      0,      6,    6,                0.93
  6,      0,      0,    7,               0.963
  6,      0,      7,    7,               0.916
  6,      0,      0,    8,               0.963
  6,      0,      8,    8,               0.945
  6,      0,      0,    9,               1.028
  6,      0,      9,    9,               0.942
  6,      0,      0,   10,               0.955
  6,      0,     10,   10,               0.831
  6,      0,      0,   11,               0.948
  6,      0,     11,   11,                0.82
  6,      0,      0,   12,               1.033
  6,      0,     12,   12,               0.873
  6,      0,      0,   13,               0.983
  6,      0,     13,   13,               0.852
  6,      0,      0,   14,               0.984
  6,      0,     14,   14,               0.853
  6,      0,      0,   15,               0.984
  6,      0,     15,   15,               0.882
  6,      0,      0,   16,               0.971
  6,      0,     16,   16,               0.958
  6,      0,      0,   17,               0.938
  6,      0,     17,   17,               0.947
  6,      0,      0,   18,                0.96
  6,      0,     18,   18,               0.938
  6,      0,      0,   19,               0.903
  6,      0,     19,   19,               0.943
  6,      0,      0,   20,               0.947
  6,      0,     20,   20,               0.951
  6,      0,      0,   21,               0.948
  6,      0,     21,   21,                0.96
  6,      0,      0,   22,               0.926
  6,      0,     22,   22,               0.951
  6,      0,      0,   23,               0.923
  6,      0,     23,   23,               0.959
  6,      0,      0,   24,               0.918
  6,      0,     24,   24,               0.952
  6,      0,      0,   25,                0.97
  6,      0,     25,   25,               0.952
  6,      0,      0,   26,               0.871
  6,      0,     26,   26,               0.869
  6,      0,      0,   27,               0.935
  6,      0,     27,   27,               0.836
  6,      0,      0,   28,               0.936
  6,      0,     28,   28,               0.857
  6,      0,      0,   29,               0.876
  6,      0,     29,   29,               0.859
  6,      0,      0,   30,               0.934
  6,      0,     30,   30,               0.857
  6,      0,      0,   31,               0.962
  6,      0,     31,   31,                0.86
  6,      0,      0,   32,               0.912
  6,      0,     32,   32,                0.94
  6,      0,      0,   33,               0.903
  6,      0,     33,   33,               0.968
  6,      0,      0,   34,               0.913
  6,      0,     34,   34,               0.896
  6,      0,      0,   35,               0.904
  6,      0,     35,   35,               0.913
  6,      0,      0,   36,               0.905
  6,      0,     36,   36,               0.907
  6,      0,      0,   37,               0.899
  6,      0,     37,   37,                 0.9
  6,      0,      0,   38,               0.912
  6,      0,     38,   38,               0.919
  6,      0,      0,   39,               0.925
  6,      0,     39,   39,               0.927
  6,      0,      0,   40,               0.923
  6,      0,     40,   40,               0.972
  6,      0,      0,   41,                0.92
  6,      0,     41,   41,               0.966
  6,      0,      0,   42,               0.915
  6,      0,     42,   42,               0.834
  6,      0,      0,   43,                0.92
  6,      0,     43,   43,               0.856
  6,      0,      0,   44,               0.908
  6,      0,     44,   44,               0.858
  6,      0,      0,   45,               0.932
  6,      0,     45,   45,               0.847
  6,      0,      0,   46,               0.927
  6,      0,     46,   46,               0.859
  6,      0,      0,   47,               0.902
  6,      0,     47,   47,               0.855
  6,      0,      0,   48,               0.949
  6,      0,     48,   48,               0.934
  6,      0,      0,   49,               0.907
  6,      0,     49,   49,               0.943
  6,      0,      0,   50,               0.934
  6,      0,     50,   50,               0.943
  6,      0,      0,   51,               0.933
  6,      0,     51,   51,               0.939
  6,      0,      0,   52,               0.944
  6,      0,     52,   52,               0.944
  6,      0,      0,   53,               0.939
  6,      0,     53,   53,               0.938
  6,      0,      0,   54,                 0.9
  6,      0,     54,   54,               0.923
  6,      0,      0,   55,                 0.9
  6,      0,     55,   55,               0.927
  6,      0,      0,   56,                 0.9
  6,      0,     56,   56,               0.917
  6,      0,      0,   57,                 0.9
  6,      0,     57,   57,               0.916
  6,      0,      0,   58,               0.914
  6,      0,     58,   58,               0.784
  6,      0,      0,   59,               0.863
  6,      0,     59,   59,               0.846
  6,      0,      0,   60,                0.88
  6,      0,     60,   60,               0.827
  6,      0,      0,   61,               0.896
  6,      0,     61,   61,               0.847
  6,      0,      0,   62,               0.894
  6,      0,     62,   62,               0.865
  6,      0,      0,   63,               0.934
  6,      0,     63,   63,               0.866

 sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++---------------
 1 file changed, 37 insertions(+), 46 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
index 013aebf797..c312fab8b1 100644
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
     RETURN (NULL, strlen (s));
 
   const char *aligned;
-  __m128i mask;
-  int offset = (int) ((size_t) a & 15);
+  __m128i mask, maskz, zero;
+  unsigned int maskz_bits;
+  unsigned int offset = (unsigned int) ((size_t) a & 15);
+  zero = _mm_set1_epi8 (0);
   if (offset != 0)
     {
       /* Load masks.  */
       aligned = (const char *) ((size_t) a & -16L);
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
-      mask = __m128i_shift_right (mask0, offset);
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
 
       /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16 - offset)
-	{
-	  /* There is no NULL terminator.  */
-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
-	  length += index;
-
-	  /* Don't use SSE4.2 if the length of A > 16.  */
-	  if (length > 16)
-	    return STRCSPN_SSE2 (s, a);
-
-	  if (index != 0)
-	    {
-	      /* Combine mask0 and mask1.  We could play games with
-		 palignr, but frankly this data should be in L1 now
-		 so do the merge via an unaligned load.  */
-	      mask = _mm_loadu_si128 ((__m128i *) a);
-	    }
-	}
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
+        {
+          mask = __m128i_shift_right (mask0, offset);
+          offset = (unsigned int) ((size_t) s & 15);
+          if (offset)
+            goto start_unaligned;
+
+          aligned = s;
+          goto start_loop;
+        }
     }
-  else
-    {
-      /* A is aligned.  */
-      mask = _mm_load_si128 ((__m128i *) a);
 
-      /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16)
-	{
-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
-	     of A > 16.  */
-	  if (a[16] != 0)
-	    return STRCSPN_SSE2 (s, a);
-	}
+  /* A is aligned, or the partial first chunk had no null byte.  */
+  mask = _mm_loadu_si128 ((__m128i *) a);
+  /* Find where the NULL terminator is.  */
+  maskz = _mm_cmpeq_epi8 (mask, zero);
+  maskz_bits = _mm_movemask_epi8 (maskz);
+  if (maskz_bits == 0)
+    {
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
+         of A > 16.  */
+      if (a[16] != 0)
+        return STRCSPN_SSE2 (s, a);
     }
 
-  offset = (int) ((size_t) s & 15);
+  aligned = s;
+  offset = (unsigned int) ((size_t) s & 15);
   if (offset != 0)
     {
+    start_unaligned:
       /* Check partial string.  */
       aligned = (const char *) ((size_t) s & -16L);
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
 
       value = __m128i_shift_right (value, offset);
 
-      int length = _mm_cmpistri (mask, value, 0x2);
+      unsigned int length = _mm_cmpistri (mask, value, 0x2);
       /* No need to check ZFlag since ZFlag is always 1.  */
-      int cflag = _mm_cmpistrc (mask, value, 0x2);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
       if (cflag)
 	RETURN ((char *) (s + length), length);
       /* Find where the NULL terminator is.  */
-      int index = _mm_cmpistri (value, value, 0x3a);
+      unsigned int index = _mm_cmpistri (value, value, 0x3a);
       if (index < 16 - offset)
 	RETURN (NULL, index);
       aligned += 16;
     }
-  else
-    aligned = s;
 
+start_loop:
   while (1)
     {
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
-      int index = _mm_cmpistri (mask, value, 0x2);
-      int cflag = _mm_cmpistrc (mask, value, 0x2);
-      int zflag = _mm_cmpistrz (mask, value, 0x2);
+      unsigned int index = _mm_cmpistri (mask, value, 0x2);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
+      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
       if (cflag)
 	RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
       if (zflag)
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (5 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 18:56   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation Noah Goldstein
                   ` (15 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get the string length
instead of _mm_cmpistri. Also change offset to unsigned to avoid
unnecessary sign extensions.
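
On the unsigned-offset point, a small illustrative sketch (not from
the patch): on x86-64 a signed 32-bit index generally needs an
explicit sign extension (movslq) before 64-bit addressing, whereas
32-bit operations already zero-extend into the full register:

/* Hypothetical helpers, only to show the codegen difference.  */
char
load_signed (const char *base, int off)
{
  return base[off];  /* Typically emits movslq before the load.  */
}

char
load_unsigned (const char *base, unsigned int off)
{
  return base[off];  /* Zero-extension is implicit in 32-bit ops.  */
}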

geometric_mean(N=20) of all benchmarks that don't fall back on
sse2; New / Original: .901

All string/memory tests pass.
---
Geometric Mean N=20 runs; All functions page aligned
len, align1, align2,  pos, New Time / Old Time
  1,      0,      0,  512,               0.768
  1,      1,      0,  512,               0.666
  1,      0,      1,  512,               1.193
  1,      1,      1,  512,               0.872
  2,      0,      0,  512,               0.698
  2,      2,      0,  512,               0.687
  2,      0,      2,  512,               1.393
  2,      2,      2,  512,               0.944
  3,      0,      0,  512,               0.691
  3,      3,      0,  512,               0.676
  3,      0,      3,  512,               1.388
  3,      3,      3,  512,               0.948
  4,      0,      0,  512,                0.74
  4,      4,      0,  512,               0.678
  4,      0,      4,  512,               1.421
  4,      4,      4,  512,               0.943
  5,      0,      0,  512,               0.691
  5,      5,      0,  512,               0.675
  5,      0,      5,  512,               1.348
  5,      5,      5,  512,               0.952
  6,      0,      0,  512,               0.685
  6,      6,      0,  512,                0.67
  6,      0,      6,  512,               1.333
  6,      6,      6,  512,                0.95
  7,      0,      0,  512,               0.688
  7,      7,      0,  512,               0.675
  7,      0,      7,  512,               1.344
  7,      7,      7,  512,               0.919
  8,      0,      0,  512,               0.716
  8,      0,      8,  512,               0.935
  9,      0,      0,  512,               0.716
  9,      1,      0,  512,               0.712
  9,      0,      9,  512,               0.956
  9,      1,      9,  512,               0.992
 10,      0,      0,  512,               0.699
 10,      2,      0,  512,                0.68
 10,      0,     10,  512,               0.952
 10,      2,     10,  512,               0.932
 11,      0,      0,  512,               0.705
 11,      3,      0,  512,               0.685
 11,      0,     11,  512,               0.956
 11,      3,     11,  512,               0.927
 12,      0,      0,  512,               0.695
 12,      4,      0,  512,               0.675
 12,      0,     12,  512,               0.948
 12,      4,     12,  512,               0.928
 13,      0,      0,  512,                 0.7
 13,      5,      0,  512,               0.678
 13,      0,     13,  512,               0.944
 13,      5,     13,  512,               0.931
 14,      0,      0,  512,               0.703
 14,      6,      0,  512,               0.678
 14,      0,     14,  512,               0.949
 14,      6,     14,  512,                0.93
 15,      0,      0,  512,               0.694
 15,      7,      0,  512,               0.678
 15,      0,     15,  512,               0.953
 15,      7,     15,  512,               0.924
 16,      0,      0,  512,               1.021
 16,      0,     16,  512,               1.067
 17,      0,      0,  512,               0.991
 17,      1,      0,  512,               0.984
 17,      0,     17,  512,               0.979
 17,      1,     17,  512,               0.993
 18,      0,      0,  512,               0.992
 18,      2,      0,  512,               1.008
 18,      0,     18,  512,               1.016
 18,      2,     18,  512,               0.993
 19,      0,      0,  512,               0.984
 19,      3,      0,  512,               0.985
 19,      0,     19,  512,               1.007
 19,      3,     19,  512,               1.006
 20,      0,      0,  512,               0.969
 20,      4,      0,  512,               0.968
 20,      0,     20,  512,               0.975
 20,      4,     20,  512,               0.975
 21,      0,      0,  512,               0.992
 21,      5,      0,  512,               0.992
 21,      0,     21,  512,                0.98
 21,      5,     21,  512,                0.97
 22,      0,      0,  512,               0.989
 22,      6,      0,  512,               0.987
 22,      0,     22,  512,                0.99
 22,      6,     22,  512,               0.985
 23,      0,      0,  512,               0.989
 23,      7,      0,  512,                0.98
 23,      0,     23,  512,                 1.0
 23,      7,     23,  512,               0.993
 24,      0,      0,  512,                0.99
 24,      0,     24,  512,               0.998
 25,      0,      0,  512,                1.01
 25,      1,      0,  512,                 1.0
 25,      0,     25,  512,                0.97
 25,      1,     25,  512,               0.967
 26,      0,      0,  512,               1.009
 26,      2,      0,  512,               0.986
 26,      0,     26,  512,               0.997
 26,      2,     26,  512,               0.993
 27,      0,      0,  512,               0.984
 27,      3,      0,  512,               0.997
 27,      0,     27,  512,               0.989
 27,      3,     27,  512,               0.976
 28,      0,      0,  512,               0.991
 28,      4,      0,  512,               1.003
 28,      0,     28,  512,               0.986
 28,      4,     28,  512,               0.989
 29,      0,      0,  512,               0.986
 29,      5,      0,  512,               0.985
 29,      0,     29,  512,               0.984
 29,      5,     29,  512,               0.977
 30,      0,      0,  512,               0.991
 30,      6,      0,  512,               0.987
 30,      0,     30,  512,               0.979
 30,      6,     30,  512,               0.974
 31,      0,      0,  512,               0.995
 31,      7,      0,  512,               0.995
 31,      0,     31,  512,               0.994
 31,      7,     31,  512,               0.984
  4,      0,      0,   32,               0.861
  4,      1,      0,   32,               0.864
  4,      0,      1,   32,               0.962
  4,      1,      1,   32,               0.967
  4,      0,      0,   64,               0.884
  4,      2,      0,   64,               0.818
  4,      0,      2,   64,               0.889
  4,      2,      2,   64,               0.918
  4,      0,      0,  128,               0.942
  4,      3,      0,  128,               0.884
  4,      0,      3,  128,               0.931
  4,      3,      3,  128,               0.883
  4,      0,      0,  256,               0.964
  4,      4,      0,  256,               0.922
  4,      0,      4,  256,               0.956
  4,      4,      4,  256,                0.93
  4,      5,      0,  512,               0.833
  4,      0,      5,  512,               1.027
  4,      5,      5,  512,               0.929
  4,      0,      0, 1024,               0.998
  4,      6,      0, 1024,               0.986
  4,      0,      6, 1024,               0.984
  4,      6,      6, 1024,               0.977
  4,      0,      0, 2048,               0.991
  4,      7,      0, 2048,               0.987
  4,      0,      7, 2048,               0.996
  4,      7,      7, 2048,                0.98
 10,      1,      0,   64,               0.826
 10,      1,      1,   64,               0.907
 10,      2,      0,   64,               0.829
 10,      2,      2,   64,                0.91
 10,      3,      0,   64,                0.83
 10,      3,      3,   64,               0.915
 10,      4,      0,   64,                0.83
 10,      4,      4,   64,               0.911
 10,      5,      0,   64,               0.828
 10,      5,      5,   64,               0.905
 10,      6,      0,   64,               0.828
 10,      6,      6,   64,               0.812
 10,      7,      0,   64,                0.83
 10,      7,      7,   64,               0.819
  6,      0,      0,    0,               1.261
  6,      0,      0,    1,               1.252
  6,      0,      1,    1,               0.845
  6,      0,      0,    2,                1.27
  6,      0,      2,    2,                0.85
  6,      0,      0,    3,               1.269
  6,      0,      3,    3,               0.845
  6,      0,      0,    4,               1.287
  6,      0,      4,    4,               0.852
  6,      0,      0,    5,               1.278
  6,      0,      5,    5,               0.851
  6,      0,      0,    6,               1.269
  6,      0,      6,    6,               0.841
  6,      0,      0,    7,               1.268
  6,      0,      7,    7,               0.851
  6,      0,      0,    8,               1.291
  6,      0,      8,    8,               0.837
  6,      0,      0,    9,               1.283
  6,      0,      9,    9,               0.831
  6,      0,      0,   10,               1.252
  6,      0,     10,   10,               0.997
  6,      0,      0,   11,               1.295
  6,      0,     11,   11,               1.046
  6,      0,      0,   12,               1.296
  6,      0,     12,   12,               1.038
  6,      0,      0,   13,               1.287
  6,      0,     13,   13,               1.082
  6,      0,      0,   14,               1.284
  6,      0,     14,   14,               1.001
  6,      0,      0,   15,               1.286
  6,      0,     15,   15,               1.002
  6,      0,      0,   16,               0.894
  6,      0,     16,   16,               0.874
  6,      0,      0,   17,               0.892
  6,      0,     17,   17,               0.974
  6,      0,      0,   18,               0.907
  6,      0,     18,   18,               0.993
  6,      0,      0,   19,               0.909
  6,      0,     19,   19,                0.99
  6,      0,      0,   20,               0.894
  6,      0,     20,   20,               0.978
  6,      0,      0,   21,                0.89
  6,      0,     21,   21,               0.958
  6,      0,      0,   22,               0.893
  6,      0,     22,   22,                0.99
  6,      0,      0,   23,               0.899
  6,      0,     23,   23,               0.986
  6,      0,      0,   24,               0.893
  6,      0,     24,   24,               0.989
  6,      0,      0,   25,               0.889
  6,      0,     25,   25,               0.982
  6,      0,      0,   26,               0.889
  6,      0,     26,   26,               0.852
  6,      0,      0,   27,                0.89
  6,      0,     27,   27,               0.832
  6,      0,      0,   28,                0.89
  6,      0,     28,   28,               0.831
  6,      0,      0,   29,                0.89
  6,      0,     29,   29,               0.838
  6,      0,      0,   30,               0.907
  6,      0,     30,   30,               0.833
  6,      0,      0,   31,               0.888
  6,      0,     31,   31,               0.837
  6,      0,      0,   32,               0.853
  6,      0,     32,   32,               0.828
  6,      0,      0,   33,               0.857
  6,      0,     33,   33,               0.947
  6,      0,      0,   34,               0.847
  6,      0,     34,   34,               0.954
  6,      0,      0,   35,               0.841
  6,      0,     35,   35,                0.94
  6,      0,      0,   36,               0.854
  6,      0,     36,   36,               0.958
  6,      0,      0,   37,               0.856
  6,      0,     37,   37,               0.957
  6,      0,      0,   38,               0.839
  6,      0,     38,   38,               0.962
  6,      0,      0,   39,               0.866
  6,      0,     39,   39,               0.945
  6,      0,      0,   40,               0.845
  6,      0,     40,   40,               0.961
  6,      0,      0,   41,               0.858
  6,      0,     41,   41,               0.961
  6,      0,      0,   42,               0.862
  6,      0,     42,   42,               0.825
  6,      0,      0,   43,               0.864
  6,      0,     43,   43,                0.82
  6,      0,      0,   44,               0.843
  6,      0,     44,   44,                0.81
  6,      0,      0,   45,               0.859
  6,      0,     45,   45,               0.816
  6,      0,      0,   46,               0.866
  6,      0,     46,   46,                0.81
  6,      0,      0,   47,               0.858
  6,      0,     47,   47,               0.807
  6,      0,      0,   48,                0.87
  6,      0,     48,   48,                0.87
  6,      0,      0,   49,               0.871
  6,      0,     49,   49,               0.874
  6,      0,      0,   50,                0.87
  6,      0,     50,   50,               0.881
  6,      0,      0,   51,               0.868
  6,      0,     51,   51,               0.875
  6,      0,      0,   52,               0.873
  6,      0,     52,   52,               0.871
  6,      0,      0,   53,               0.866
  6,      0,     53,   53,               0.882
  6,      0,      0,   54,               0.863
  6,      0,     54,   54,               0.876
  6,      0,      0,   55,               0.851
  6,      0,     55,   55,               0.871
  6,      0,      0,   56,               0.867
  6,      0,     56,   56,               0.888
  6,      0,      0,   57,               0.862
  6,      0,     57,   57,               0.899
  6,      0,      0,   58,               0.873
  6,      0,     58,   58,               0.798
  6,      0,      0,   59,               0.881
  6,      0,     59,   59,               0.785
  6,      0,      0,   60,               0.867
  6,      0,     60,   60,               0.797
  6,      0,      0,   61,               0.872
  6,      0,     61,   61,               0.791
  6,      0,      0,   62,               0.859
  6,      0,     62,   62,                0.79
  6,      0,      0,   63,                0.87
  6,      0,     63,   63,               0.796

 sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++----------------
 1 file changed, 39 insertions(+), 47 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
index 8fb3aba64d..6124033ceb 100644
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -62,81 +62,73 @@ __strspn_sse42 (const char *s, const char *a)
     return 0;
 
   const char *aligned;
-  __m128i mask;
-  int offset = (int) ((size_t) a & 15);
+  __m128i mask, maskz, zero;
+  unsigned int maskz_bits;
+  unsigned int offset = (unsigned int) ((size_t) a & 15);
+  zero = _mm_set1_epi8 (0);
   if (offset != 0)
     {
       /* Load masks.  */
       aligned = (const char *) ((size_t) a & -16L);
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
-      mask = __m128i_shift_right (mask0, offset);
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
 
       /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16 - offset)
-	{
-	  /* There is no NULL terminator.  */
-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
-	  length += index;
-
-	  /* Don't use SSE4.2 if the length of A > 16.  */
-	  if (length > 16)
-	    return __strspn_sse2 (s, a);
-
-	  if (index != 0)
-	    {
-	      /* Combine mask0 and mask1.  We could play games with
-		 palignr, but frankly this data should be in L1 now
-		 so do the merge via an unaligned load.  */
-	      mask = _mm_loadu_si128 ((__m128i *) a);
-	    }
-	}
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
+        {
+          mask = __m128i_shift_right (mask0, offset);
+          offset = (unsigned int) ((size_t) s & 15);
+          if (offset)
+            goto start_unaligned;
+
+          aligned = s;
+          goto start_loop;
+        }
     }
-  else
-    {
-      /* A is aligned.  */
-      mask = _mm_load_si128 ((__m128i *) a);
 
-      /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16)
-	{
-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
-	     of A > 16.  */
-	  if (a[16] != 0)
-	    return __strspn_sse2 (s, a);
-	}
+  /* A is aligned, or the partial first chunk had no null byte.  */
+  mask = _mm_loadu_si128 ((__m128i *) a);
+
+  /* Find where the NULL terminator is.  */
+  maskz = _mm_cmpeq_epi8 (mask, zero);
+  maskz_bits = _mm_movemask_epi8 (maskz);
+  if (maskz_bits == 0)
+    {
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
+         of A > 16.  */
+      if (a[16] != 0)
+        return __strspn_sse2 (s, a);
     }
+  aligned = s;
+  offset = (unsigned int) ((size_t) s & 15);
 
-  offset = (int) ((size_t) s & 15);
   if (offset != 0)
     {
+    start_unaligned:
       /* Check partial string.  */
       aligned = (const char *) ((size_t) s & -16L);
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
+      __m128i adj_value = __m128i_shift_right (value, offset);
 
-      value = __m128i_shift_right (value, offset);
-
-      int length = _mm_cmpistri (mask, value, 0x12);
+      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
       /* No need to check CFlag since it is always 1.  */
       if (length < 16 - offset)
 	return length;
       /* Find where the NULL terminator is.  */
-      int index = _mm_cmpistri (value, value, 0x3a);
-      if (index < 16 - offset)
+      maskz = _mm_cmpeq_epi8 (value, zero);
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
 	return length;
       aligned += 16;
     }
-  else
-    aligned = s;
 
+start_loop:
   while (1)
     {
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
-      int index = _mm_cmpistri (mask, value, 0x12);
-      int cflag = _mm_cmpistrc (mask, value, 0x12);
+      unsigned int index = _mm_cmpistri (mask, value, 0x12);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
       if (cflag)
 	return (size_t) (aligned + index - s);
       aligned += 16;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (6 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 18:57   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 10/23] x86: Remove strpbrk-sse2.S " Noah Goldstein
                   ` (14 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

The generic implementation is faster.

geometric_mean(N=20) of all benchmarks; New / Original: .678

All string/memory tests pass.
---
Geometric Mean N=20 runs; All functions page aligned
len, align1, align2,  pos, New Time / Old Time
  0,      0,      0,  512,               0.054
  1,      0,      0,  512,               0.055
  1,      1,      0,  512,               0.051
  1,      0,      1,  512,               0.054
  1,      1,      1,  512,               0.054
  2,      0,      0,  512,               0.861
  2,      2,      0,  512,               0.861
  2,      0,      2,  512,               0.861
  2,      2,      2,  512,               0.864
  3,      0,      0,  512,               0.854
  3,      3,      0,  512,               0.848
  3,      0,      3,  512,               0.845
  3,      3,      3,  512,                0.85
  4,      0,      0,  512,               0.851
  4,      4,      0,  512,                0.85
  4,      0,      4,  512,               0.852
  4,      4,      4,  512,               0.849
  5,      0,      0,  512,               0.938
  5,      5,      0,  512,                0.94
  5,      0,      5,  512,               0.864
  5,      5,      5,  512,                0.86
  6,      0,      0,  512,               0.858
  6,      6,      0,  512,               0.869
  6,      0,      6,  512,               0.847
  6,      6,      6,  512,               0.868
  7,      0,      0,  512,               0.867
  7,      7,      0,  512,               0.861
  7,      0,      7,  512,               0.864
  7,      7,      7,  512,               0.863
  8,      0,      0,  512,               0.884
  8,      0,      8,  512,               0.884
  9,      0,      0,  512,               0.886
  9,      1,      0,  512,               0.894
  9,      0,      9,  512,               0.889
  9,      1,      9,  512,               0.886
 10,      0,      0,  512,               0.859
 10,      2,      0,  512,               0.859
 10,      0,     10,  512,               0.862
 10,      2,     10,  512,               0.861
 11,      0,      0,  512,               0.846
 11,      3,      0,  512,               0.865
 11,      0,     11,  512,               0.859
 11,      3,     11,  512,               0.862
 12,      0,      0,  512,               0.858
 12,      4,      0,  512,               0.857
 12,      0,     12,  512,               0.964
 12,      4,     12,  512,               0.876
 13,      0,      0,  512,               0.827
 13,      5,      0,  512,               0.805
 13,      0,     13,  512,               0.821
 13,      5,     13,  512,               0.825
 14,      0,      0,  512,               0.786
 14,      6,      0,  512,               0.786
 14,      0,     14,  512,               0.803
 14,      6,     14,  512,               0.783
 15,      0,      0,  512,               0.778
 15,      7,      0,  512,               0.792
 15,      0,     15,  512,               0.796
 15,      7,     15,  512,               0.799
 16,      0,      0,  512,               0.803
 16,      0,     16,  512,               0.815
 17,      0,      0,  512,               0.812
 17,      1,      0,  512,               0.826
 17,      0,     17,  512,               0.803
 17,      1,     17,  512,               0.856
 18,      0,      0,  512,               0.801
 18,      2,      0,  512,               0.886
 18,      0,     18,  512,               0.805
 18,      2,     18,  512,               0.807
 19,      0,      0,  512,               0.814
 19,      3,      0,  512,               0.804
 19,      0,     19,  512,               0.813
 19,      3,     19,  512,               0.814
 20,      0,      0,  512,               0.885
 20,      4,      0,  512,               0.799
 20,      0,     20,  512,               0.826
 20,      4,     20,  512,               0.808
 21,      0,      0,  512,               0.816
 21,      5,      0,  512,               0.824
 21,      0,     21,  512,               0.819
 21,      5,     21,  512,               0.826
 22,      0,      0,  512,               0.814
 22,      6,      0,  512,               0.824
 22,      0,     22,  512,                0.81
 22,      6,     22,  512,               0.806
 23,      0,      0,  512,               0.825
 23,      7,      0,  512,               0.829
 23,      0,     23,  512,               0.809
 23,      7,     23,  512,               0.823
 24,      0,      0,  512,               0.829
 24,      0,     24,  512,               0.823
 25,      0,      0,  512,               0.864
 25,      1,      0,  512,               0.895
 25,      0,     25,  512,                0.88
 25,      1,     25,  512,               0.848
 26,      0,      0,  512,               0.903
 26,      2,      0,  512,               0.888
 26,      0,     26,  512,               0.894
 26,      2,     26,  512,                0.89
 27,      0,      0,  512,               0.914
 27,      3,      0,  512,               0.917
 27,      0,     27,  512,               0.902
 27,      3,     27,  512,               0.887
 28,      0,      0,  512,               0.887
 28,      4,      0,  512,               0.877
 28,      0,     28,  512,               0.893
 28,      4,     28,  512,               0.866
 29,      0,      0,  512,               0.885
 29,      5,      0,  512,               0.907
 29,      0,     29,  512,               0.894
 29,      5,     29,  512,               0.906
 30,      0,      0,  512,                0.88
 30,      6,      0,  512,               0.898
 30,      0,     30,  512,                 0.9
 30,      6,     30,  512,               0.895
 31,      0,      0,  512,               0.893
 31,      7,      0,  512,               0.874
 31,      0,     31,  512,               0.894
 31,      7,     31,  512,               0.899
  4,      0,      0,   32,               0.618
  4,      1,      0,   32,               0.627
  4,      0,      1,   32,               0.625
  4,      1,      1,   32,               0.613
  4,      0,      0,   64,               0.913
  4,      2,      0,   64,               0.801
  4,      0,      2,   64,               0.759
  4,      2,      2,   64,               0.761
  4,      0,      0,  128,               0.822
  4,      3,      0,  128,               0.863
  4,      0,      3,  128,               0.867
  4,      3,      3,  128,               0.917
  4,      0,      0,  256,               0.816
  4,      4,      0,  256,               0.812
  4,      0,      4,  256,               0.803
  4,      4,      4,  256,               0.811
  4,      5,      0,  512,               0.848
  4,      0,      5,  512,               0.843
  4,      5,      5,  512,               0.857
  4,      0,      0, 1024,               0.886
  4,      6,      0, 1024,               0.887
  4,      0,      6, 1024,               0.881
  4,      6,      6, 1024,               0.873
  4,      0,      0, 2048,               0.892
  4,      7,      0, 2048,               0.894
  4,      0,      7, 2048,                0.89
  4,      7,      7, 2048,               0.874
 10,      1,      0,   64,               0.946
 10,      1,      1,   64,                0.81
 10,      2,      0,   64,               0.804
 10,      2,      2,   64,                0.82
 10,      3,      0,   64,               0.772
 10,      3,      3,   64,               0.772
 10,      4,      0,   64,               0.748
 10,      4,      4,   64,               0.751
 10,      5,      0,   64,                0.76
 10,      5,      5,   64,                0.76
 10,      6,      0,   64,               0.726
 10,      6,      6,   64,               0.718
 10,      7,      0,   64,               0.724
 10,      7,      7,   64,                0.72
  6,      0,      0,    0,               0.415
  6,      0,      0,    1,               0.423
  6,      0,      1,    1,               0.412
  6,      0,      0,    2,               0.433
  6,      0,      2,    2,               0.434
  6,      0,      0,    3,               0.427
  6,      0,      3,    3,               0.428
  6,      0,      0,    4,               0.465
  6,      0,      4,    4,               0.466
  6,      0,      0,    5,               0.463
  6,      0,      5,    5,               0.468
  6,      0,      0,    6,               0.435
  6,      0,      6,    6,               0.444
  6,      0,      0,    7,                0.41
  6,      0,      7,    7,                0.42
  6,      0,      0,    8,               0.474
  6,      0,      8,    8,               0.501
  6,      0,      0,    9,               0.471
  6,      0,      9,    9,               0.489
  6,      0,      0,   10,               0.462
  6,      0,     10,   10,                0.46
  6,      0,      0,   11,               0.459
  6,      0,     11,   11,               0.458
  6,      0,      0,   12,               0.516
  6,      0,     12,   12,                0.51
  6,      0,      0,   13,               0.494
  6,      0,     13,   13,               0.524
  6,      0,      0,   14,               0.486
  6,      0,     14,   14,                 0.5
  6,      0,      0,   15,                0.48
  6,      0,     15,   15,               0.501
  6,      0,      0,   16,                0.54
  6,      0,     16,   16,               0.538
  6,      0,      0,   17,               0.503
  6,      0,     17,   17,               0.541
  6,      0,      0,   18,               0.537
  6,      0,     18,   18,               0.549
  6,      0,      0,   19,               0.527
  6,      0,     19,   19,               0.537
  6,      0,      0,   20,               0.539
  6,      0,     20,   20,               0.554
  6,      0,      0,   21,               0.558
  6,      0,     21,   21,               0.541
  6,      0,      0,   22,               0.546
  6,      0,     22,   22,               0.561
  6,      0,      0,   23,                0.54
  6,      0,     23,   23,               0.536
  6,      0,      0,   24,               0.565
  6,      0,     24,   24,               0.584
  6,      0,      0,   25,               0.563
  6,      0,     25,   25,                0.58
  6,      0,      0,   26,               0.555
  6,      0,     26,   26,               0.584
  6,      0,      0,   27,               0.569
  6,      0,     27,   27,               0.587
  6,      0,      0,   28,               0.612
  6,      0,     28,   28,               0.623
  6,      0,      0,   29,               0.604
  6,      0,     29,   29,               0.621
  6,      0,      0,   30,                0.59
  6,      0,     30,   30,               0.609
  6,      0,      0,   31,               0.577
  6,      0,     31,   31,               0.588
  6,      0,      0,   32,               0.621
  6,      0,     32,   32,               0.608
  6,      0,      0,   33,               0.601
  6,      0,     33,   33,               0.623
  6,      0,      0,   34,               0.614
  6,      0,     34,   34,               0.615
  6,      0,      0,   35,               0.598
  6,      0,     35,   35,               0.608
  6,      0,      0,   36,               0.626
  6,      0,     36,   36,               0.634
  6,      0,      0,   37,                0.62
  6,      0,     37,   37,               0.634
  6,      0,      0,   38,               0.612
  6,      0,     38,   38,               0.637
  6,      0,      0,   39,               0.627
  6,      0,     39,   39,               0.612
  6,      0,      0,   40,               0.661
  6,      0,     40,   40,               0.674
  6,      0,      0,   41,               0.633
  6,      0,     41,   41,               0.643
  6,      0,      0,   42,               0.634
  6,      0,     42,   42,               0.636
  6,      0,      0,   43,               0.619
  6,      0,     43,   43,               0.625
  6,      0,      0,   44,               0.654
  6,      0,     44,   44,               0.654
  6,      0,      0,   45,               0.647
  6,      0,     45,   45,               0.649
  6,      0,      0,   46,               0.651
  6,      0,     46,   46,               0.651
  6,      0,      0,   47,               0.646
  6,      0,     47,   47,               0.648
  6,      0,      0,   48,               0.662
  6,      0,     48,   48,               0.664
  6,      0,      0,   49,                0.68
  6,      0,     49,   49,               0.667
  6,      0,      0,   50,               0.654
  6,      0,     50,   50,               0.659
  6,      0,      0,   51,               0.638
  6,      0,     51,   51,               0.639
  6,      0,      0,   52,               0.665
  6,      0,     52,   52,               0.669
  6,      0,      0,   53,               0.658
  6,      0,     53,   53,               0.656
  6,      0,      0,   54,               0.669
  6,      0,     54,   54,                0.67
  6,      0,      0,   55,               0.668
  6,      0,     55,   55,               0.664
  6,      0,      0,   56,               0.701
  6,      0,     56,   56,               0.695
  6,      0,      0,   57,               0.687
  6,      0,     57,   57,               0.696
  6,      0,      0,   58,               0.693
  6,      0,     58,   58,               0.704
  6,      0,      0,   59,               0.695
  6,      0,     59,   59,               0.708
  6,      0,      0,   60,               0.708
  6,      0,     60,   60,               0.728
  6,      0,      0,   61,               0.708
  6,      0,     61,   61,                0.71
  6,      0,      0,   62,               0.715
  6,      0,     62,   62,               0.705
  6,      0,      0,   63,               0.677
  6,      0,     63,   63,               0.702

 .../{strcspn-sse2.S => strcspn-sse2.c}        |   8 +-
 sysdeps/x86_64/strcspn.S                      | 119 ------------------
 2 files changed, 4 insertions(+), 123 deletions(-)
 rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strcspn-sse2.c} (85%)
 delete mode 100644 sysdeps/x86_64/strcspn.S

diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c
similarity index 85%
rename from sysdeps/x86_64/multiarch/strcspn-sse2.S
rename to sysdeps/x86_64/multiarch/strcspn-sse2.c
index f97e856e1f..3a04bb39fc 100644
--- a/sysdeps/x86_64/multiarch/strcspn-sse2.S
+++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c
@@ -1,4 +1,4 @@
-/* strcspn optimized with SSE2.
+/* strcspn.
    Copyright (C) 2017-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -19,10 +19,10 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
-# define strcspn __strcspn_sse2
+# define STRCSPN __strcspn_sse2
 
 # undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strcspn)
+# define libc_hidden_builtin_def(STRCSPN)
 #endif
 
-#include <sysdeps/x86_64/strcspn.S>
+#include <string/strcspn.c>
diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S
deleted file mode 100644
index f3cd86c606..0000000000
--- a/sysdeps/x86_64/strcspn.S
+++ /dev/null
@@ -1,119 +0,0 @@
-/* strcspn (str, ss) -- Return the length of the initial segment of STR
-			which contains no characters from SS.
-   For AMD x86-64.
-   Copyright (C) 1994-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-	.text
-ENTRY (strcspn)
-
-	movq %rdi, %rdx		/* Save SRC.  */
-
-	/* First we create a table with flags for all possible characters.
-	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
-	   supported by the C string functions we have 256 characters.
-	   Before inserting marks for the stop characters we clear the whole
-	   table.  */
-	movq %rdi, %r8			/* Save value.  */
-	subq $256, %rsp			/* Make space for 256 bytes.  */
-	cfi_adjust_cfa_offset(256)
-	movl $32,  %ecx			/* 32*8 bytes = 256 bytes.  */
-	movq %rsp, %rdi
-	xorl %eax, %eax			/* We store 0s.  */
-	cld
-	rep
-	stosq
-
-	movq %rsi, %rax			/* Setup skipset.  */
-
-/* For understanding the following code remember that %rcx == 0 now.
-   Although all the following instruction only modify %cl we always
-   have a correct zero-extended 64-bit value in %rcx.  */
-
-	.p2align 4
-L(2):	movb (%rax), %cl	/* get byte from skipset */
-	testb %cl, %cl		/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
-
-	movb 1(%rax), %cl	/* get byte from skipset */
-	testb $0xff, %cl	/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
-
-	movb 2(%rax), %cl	/* get byte from skipset */
-	testb $0xff, %cl	/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
-
-	movb 3(%rax), %cl	/* get byte from skipset */
-	addq $4, %rax		/* increment skipset pointer */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
-	testb $0xff, %cl	/* is NUL char? */
-	jnz L(2)		/* no => process next dword from skipset */
-
-L(1):	leaq -4(%rdx), %rax	/* prepare loop */
-
-	/* We use a neat trick for the following loop.  Normally we would
-	   have to test for two termination conditions
-	   1. a character in the skipset was found
-	   and
-	   2. the end of the string was found
-	   But as a sign that the character is in the skipset we store its
-	   value in the table.  But the value of NUL is NUL so the loop
-	   terminates for NUL in every case.  */
-
-	.p2align 4
-L(3):	addq $4, %rax		/* adjust pointer for full loop round */
-
-	movb (%rax), %cl	/* get byte from string */
-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	je L(4)			/* yes => return */
-
-	movb 1(%rax), %cl	/* get byte from string */
-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	je L(5)			/* yes => return */
-
-	movb 2(%rax), %cl	/* get byte from string */
-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jz L(6)			/* yes => return */
-
-	movb 3(%rax), %cl	/* get byte from string */
-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jne L(3)		/* no => start loop again */
-
-	incq %rax		/* adjust pointer */
-L(6):	incq %rax
-L(5):	incq %rax
-
-L(4):	addq $256, %rsp		/* remove skipset */
-	cfi_adjust_cfa_offset(-256)
-#ifdef USE_AS_STRPBRK
-	xorl %edx,%edx
-	orb %cl, %cl		/* was last character NUL? */
-	cmovzq %rdx, %rax	/* Yes:	return NULL */
-#else
-	subq %rdx, %rax		/* we have to return the number of valid
-				   characters, so compute distance to first
-				   non-valid character */
-#endif
-	ret
-END (strcspn)
-libc_hidden_builtin_def (strcspn)
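
For anyone comparing against the deleted assembly: the generic
string/strcspn.c pulled in here is built on the same idea the
assembly's comments describe -- mark the reject set in a 256-entry
table, with NUL always marked so a single loop test covers both
termination conditions.  A minimal C sketch of that technique
(simplified, not the verbatim glibc source):

  #include <stddef.h>

  size_t
  strcspn_sketch (const char *str, const char *reject)
  {
    unsigned char table[256] = { 0 };
    const unsigned char *r = (const unsigned char *) reject;
    const unsigned char *s = (const unsigned char *) str;

    table[0] = 1;                /* NUL terminates the scan too.  */
    while (*r != '\0')
      table[*r++] = 1;

    size_t i = 0;
    while (!table[s[i]])         /* Stops at NUL or a reject byte.  */
      i++;
    return i;
  }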
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 10/23] x86: Remove strpbrk-sse2.S and use the generic implementation
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (7 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 18:57   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 11/23] x86: Remove strspn-sse2.S " Noah Goldstein
                   ` (13 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

The generic implementation is faster (see strcspn commit).

All string/memory tests pass.
---
 .../x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c}  | 9 ++++-----
 sysdeps/x86_64/strpbrk.S                                 | 3 ---
 2 files changed, 4 insertions(+), 8 deletions(-)
 rename sysdeps/x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} (84%)
 delete mode 100644 sysdeps/x86_64/strpbrk.S

diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
similarity index 84%
rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S
rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c
index d537b6c27b..d03214c4fb 100644
--- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S
+++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
@@ -1,4 +1,4 @@
-/* strpbrk optimized with SSE2.
+/* strpbrk.
    Copyright (C) 2017-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -19,11 +19,10 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
-# define strcspn __strpbrk_sse2
+# define STRPBRK __strpbrk_sse2
 
 # undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strpbrk)
+# define libc_hidden_builtin_def(STRPBRK)
 #endif
 
-#define USE_AS_STRPBRK
-#include <sysdeps/x86_64/strcspn.S>
+#include <string/strpbrk.c>
diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S
deleted file mode 100644
index 21888a5b92..0000000000
--- a/sysdeps/x86_64/strpbrk.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define strcspn strpbrk
-#define USE_AS_STRPBRK
-#include <sysdeps/x86_64/strcspn.S>
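
The generic strpbrk being pulled in is conceptually just strcspn plus a
NULL check at the stopping byte, so it inherits whatever speedup
strcspn gets.  A sketch of the idea:

  #include <string.h>

  char *
  strpbrk_sketch (const char *s, const char *accept)
  {
    s += strcspn (s, accept);    /* Skip bytes not in ACCEPT.  */
    return *s != '\0' ? (char *) s : NULL;
  }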
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 11/23] x86: Remove strspn-sse2.S and use the generic implementation
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (8 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 10/23] x86: Remove strpbrk-sse2.S " Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 18:57   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] Noah Goldstein
                   ` (12 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

The generic implementation is faster.

geometric_mean(N=20) of all benchmarks New / Original: .710

All string/memory tests pass.
---
Geometric Mean N=20 runs; All functions page aligned
len, align1, align2,  pos, New Time / Old Time
  1,      0,      0,  512,               0.824
  1,      1,      0,  512,               1.018
  1,      0,      1,  512,               0.986
  1,      1,      1,  512,               1.092
  2,      0,      0,  512,                0.86
  2,      2,      0,  512,               0.868
  2,      0,      2,  512,               0.858
  2,      2,      2,  512,               0.857
  3,      0,      0,  512,               0.836
  3,      3,      0,  512,               0.849
  3,      0,      3,  512,                0.84
  3,      3,      3,  512,                0.85
  4,      0,      0,  512,               0.843
  4,      4,      0,  512,               0.837
  4,      0,      4,  512,               0.835
  4,      4,      4,  512,               0.846
  5,      0,      0,  512,               0.852
  5,      5,      0,  512,               0.848
  5,      0,      5,  512,                0.85
  5,      5,      5,  512,                0.85
  6,      0,      0,  512,               0.853
  6,      6,      0,  512,               0.855
  6,      0,      6,  512,               0.853
  6,      6,      6,  512,               0.853
  7,      0,      0,  512,               0.857
  7,      7,      0,  512,               0.861
  7,      0,      7,  512,                0.94
  7,      7,      7,  512,               0.856
  8,      0,      0,  512,               0.927
  8,      0,      8,  512,               0.965
  9,      0,      0,  512,               0.967
  9,      1,      0,  512,               0.976
  9,      0,      9,  512,               0.887
  9,      1,      9,  512,               0.881
 10,      0,      0,  512,               0.853
 10,      2,      0,  512,               0.846
 10,      0,     10,  512,               0.855
 10,      2,     10,  512,               0.849
 11,      0,      0,  512,               0.854
 11,      3,      0,  512,               0.855
 11,      0,     11,  512,                0.85
 11,      3,     11,  512,               0.854
 12,      0,      0,  512,               0.864
 12,      4,      0,  512,               0.864
 12,      0,     12,  512,               0.867
 12,      4,     12,  512,                0.87
 13,      0,      0,  512,               0.853
 13,      5,      0,  512,               0.841
 13,      0,     13,  512,               0.837
 13,      5,     13,  512,                0.85
 14,      0,      0,  512,               0.838
 14,      6,      0,  512,               0.842
 14,      0,     14,  512,               0.818
 14,      6,     14,  512,               0.845
 15,      0,      0,  512,               0.799
 15,      7,      0,  512,               0.847
 15,      0,     15,  512,               0.787
 15,      7,     15,  512,                0.84
 16,      0,      0,  512,               0.824
 16,      0,     16,  512,               0.827
 17,      0,      0,  512,               0.817
 17,      1,      0,  512,               0.823
 17,      0,     17,  512,                0.82
 17,      1,     17,  512,               0.814
 18,      0,      0,  512,                0.81
 18,      2,      0,  512,               0.833
 18,      0,     18,  512,               0.811
 18,      2,     18,  512,               0.842
 19,      0,      0,  512,               0.823
 19,      3,      0,  512,               0.818
 19,      0,     19,  512,               0.821
 19,      3,     19,  512,               0.824
 20,      0,      0,  512,               0.814
 20,      4,      0,  512,               0.818
 20,      0,     20,  512,               0.806
 20,      4,     20,  512,               0.802
 21,      0,      0,  512,               0.835
 21,      5,      0,  512,               0.839
 21,      0,     21,  512,               0.842
 21,      5,     21,  512,                0.82
 22,      0,      0,  512,               0.824
 22,      6,      0,  512,               0.831
 22,      0,     22,  512,               0.819
 22,      6,     22,  512,               0.824
 23,      0,      0,  512,               0.816
 23,      7,      0,  512,               0.856
 23,      0,     23,  512,               0.808
 23,      7,     23,  512,               0.848
 24,      0,      0,  512,                0.88
 24,      0,     24,  512,               0.846
 25,      0,      0,  512,               0.929
 25,      1,      0,  512,               0.917
 25,      0,     25,  512,               0.884
 25,      1,     25,  512,               0.859
 26,      0,      0,  512,               0.919
 26,      2,      0,  512,               0.867
 26,      0,     26,  512,               0.914
 26,      2,     26,  512,               0.845
 27,      0,      0,  512,               0.919
 27,      3,      0,  512,               0.864
 27,      0,     27,  512,               0.917
 27,      3,     27,  512,               0.847
 28,      0,      0,  512,               0.905
 28,      4,      0,  512,               0.896
 28,      0,     28,  512,               0.898
 28,      4,     28,  512,               0.871
 29,      0,      0,  512,               0.911
 29,      5,      0,  512,                0.91
 29,      0,     29,  512,               0.905
 29,      5,     29,  512,               0.884
 30,      0,      0,  512,               0.907
 30,      6,      0,  512,               0.802
 30,      0,     30,  512,               0.906
 30,      6,     30,  512,               0.818
 31,      0,      0,  512,               0.907
 31,      7,      0,  512,               0.821
 31,      0,     31,  512,                0.89
 31,      7,     31,  512,               0.787
  4,      0,      0,   32,               0.623
  4,      1,      0,   32,               0.606
  4,      0,      1,   32,                 0.6
  4,      1,      1,   32,               0.603
  4,      0,      0,   64,               0.731
  4,      2,      0,   64,               0.733
  4,      0,      2,   64,               0.734
  4,      2,      2,   64,               0.755
  4,      0,      0,  128,               0.822
  4,      3,      0,  128,               0.873
  4,      0,      3,  128,                0.89
  4,      3,      3,  128,               0.907
  4,      0,      0,  256,               0.827
  4,      4,      0,  256,               0.811
  4,      0,      4,  256,               0.794
  4,      4,      4,  256,               0.814
  4,      5,      0,  512,               0.841
  4,      0,      5,  512,               0.831
  4,      5,      5,  512,               0.845
  4,      0,      0, 1024,               0.861
  4,      6,      0, 1024,               0.857
  4,      0,      6, 1024,                 0.9
  4,      6,      6, 1024,               0.861
  4,      0,      0, 2048,               0.879
  4,      7,      0, 2048,               0.875
  4,      0,      7, 2048,               0.883
  4,      7,      7, 2048,                0.88
 10,      1,      0,   64,               0.747
 10,      1,      1,   64,               0.743
 10,      2,      0,   64,               0.732
 10,      2,      2,   64,               0.729
 10,      3,      0,   64,               0.747
 10,      3,      3,   64,               0.733
 10,      4,      0,   64,                0.74
 10,      4,      4,   64,               0.751
 10,      5,      0,   64,               0.735
 10,      5,      5,   64,               0.746
 10,      6,      0,   64,               0.735
 10,      6,      6,   64,               0.733
 10,      7,      0,   64,               0.734
 10,      7,      7,   64,                0.74
  6,      0,      0,    0,               0.377
  6,      0,      0,    1,               0.369
  6,      0,      1,    1,               0.383
  6,      0,      0,    2,               0.391
  6,      0,      2,    2,               0.394
  6,      0,      0,    3,               0.416
  6,      0,      3,    3,               0.411
  6,      0,      0,    4,               0.475
  6,      0,      4,    4,               0.483
  6,      0,      0,    5,               0.473
  6,      0,      5,    5,               0.476
  6,      0,      0,    6,               0.459
  6,      0,      6,    6,               0.445
  6,      0,      0,    7,               0.433
  6,      0,      7,    7,               0.432
  6,      0,      0,    8,               0.492
  6,      0,      8,    8,               0.494
  6,      0,      0,    9,               0.476
  6,      0,      9,    9,               0.483
  6,      0,      0,   10,                0.46
  6,      0,     10,   10,               0.476
  6,      0,      0,   11,               0.463
  6,      0,     11,   11,               0.463
  6,      0,      0,   12,               0.511
  6,      0,     12,   12,               0.515
  6,      0,      0,   13,               0.506
  6,      0,     13,   13,               0.536
  6,      0,      0,   14,               0.496
  6,      0,     14,   14,               0.484
  6,      0,      0,   15,               0.473
  6,      0,     15,   15,               0.475
  6,      0,      0,   16,               0.534
  6,      0,     16,   16,               0.534
  6,      0,      0,   17,               0.525
  6,      0,     17,   17,               0.523
  6,      0,      0,   18,               0.522
  6,      0,     18,   18,               0.524
  6,      0,      0,   19,               0.512
  6,      0,     19,   19,               0.514
  6,      0,      0,   20,               0.535
  6,      0,     20,   20,                0.54
  6,      0,      0,   21,               0.543
  6,      0,     21,   21,               0.536
  6,      0,      0,   22,               0.542
  6,      0,     22,   22,               0.542
  6,      0,      0,   23,               0.529
  6,      0,     23,   23,                0.53
  6,      0,      0,   24,               0.596
  6,      0,     24,   24,               0.589
  6,      0,      0,   25,               0.583
  6,      0,     25,   25,                0.58
  6,      0,      0,   26,               0.574
  6,      0,     26,   26,                0.58
  6,      0,      0,   27,               0.575
  6,      0,     27,   27,               0.558
  6,      0,      0,   28,               0.606
  6,      0,     28,   28,               0.606
  6,      0,      0,   29,               0.589
  6,      0,     29,   29,               0.595
  6,      0,      0,   30,               0.592
  6,      0,     30,   30,               0.585
  6,      0,      0,   31,               0.585
  6,      0,     31,   31,               0.579
  6,      0,      0,   32,               0.625
  6,      0,     32,   32,               0.615
  6,      0,      0,   33,               0.615
  6,      0,     33,   33,                0.61
  6,      0,      0,   34,               0.604
  6,      0,     34,   34,                 0.6
  6,      0,      0,   35,               0.602
  6,      0,     35,   35,               0.608
  6,      0,      0,   36,               0.644
  6,      0,     36,   36,               0.644
  6,      0,      0,   37,               0.658
  6,      0,     37,   37,               0.651
  6,      0,      0,   38,               0.644
  6,      0,     38,   38,               0.649
  6,      0,      0,   39,               0.626
  6,      0,     39,   39,               0.632
  6,      0,      0,   40,               0.662
  6,      0,     40,   40,               0.661
  6,      0,      0,   41,               0.656
  6,      0,     41,   41,               0.655
  6,      0,      0,   42,               0.643
  6,      0,     42,   42,               0.637
  6,      0,      0,   43,               0.622
  6,      0,     43,   43,               0.628
  6,      0,      0,   44,               0.673
  6,      0,     44,   44,               0.687
  6,      0,      0,   45,               0.661
  6,      0,     45,   45,               0.659
  6,      0,      0,   46,               0.657
  6,      0,     46,   46,               0.653
  6,      0,      0,   47,               0.658
  6,      0,     47,   47,                0.65
  6,      0,      0,   48,               0.678
  6,      0,     48,   48,               0.683
  6,      0,      0,   49,               0.676
  6,      0,     49,   49,               0.661
  6,      0,      0,   50,               0.672
  6,      0,     50,   50,               0.662
  6,      0,      0,   51,               0.656
  6,      0,     51,   51,               0.659
  6,      0,      0,   52,               0.682
  6,      0,     52,   52,               0.686
  6,      0,      0,   53,                0.67
  6,      0,     53,   53,               0.674
  6,      0,      0,   54,               0.663
  6,      0,     54,   54,               0.675
  6,      0,      0,   55,               0.662
  6,      0,     55,   55,               0.665
  6,      0,      0,   56,               0.681
  6,      0,     56,   56,               0.697
  6,      0,      0,   57,               0.686
  6,      0,     57,   57,               0.687
  6,      0,      0,   58,               0.701
  6,      0,     58,   58,               0.693
  6,      0,      0,   59,               0.709
  6,      0,     59,   59,               0.698
  6,      0,      0,   60,               0.708
  6,      0,     60,   60,               0.708
  6,      0,      0,   61,               0.709
  6,      0,     61,   61,               0.716
  6,      0,      0,   62,               0.709
  6,      0,     62,   62,               0.707
  6,      0,      0,   63,               0.703
  6,      0,     63,   63,               0.716

 .../{strspn-sse2.S => strspn-sse2.c}          |   8 +-
 sysdeps/x86_64/strspn.S                       | 112 ------------------
 2 files changed, 4 insertions(+), 116 deletions(-)
 rename sysdeps/x86_64/multiarch/{strspn-sse2.S => strspn-sse2.c} (86%)
 delete mode 100644 sysdeps/x86_64/strspn.S

diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c
similarity index 86%
rename from sysdeps/x86_64/multiarch/strspn-sse2.S
rename to sysdeps/x86_64/multiarch/strspn-sse2.c
index e0a095f25a..61cc6cb0a5 100644
--- a/sysdeps/x86_64/multiarch/strspn-sse2.S
+++ b/sysdeps/x86_64/multiarch/strspn-sse2.c
@@ -1,4 +1,4 @@
-/* strspn optimized with SSE2.
+/* strspn.
    Copyright (C) 2017-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -19,10 +19,10 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
-# define strspn __strspn_sse2
+# define STRSPN __strspn_sse2
 
 # undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strspn)
+# define libc_hidden_builtin_def(STRSPN)
 #endif
 
-#include <sysdeps/x86_64/strspn.S>
+#include <string/strspn.c>
diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S
deleted file mode 100644
index 61b76ee0a1..0000000000
--- a/sysdeps/x86_64/strspn.S
+++ /dev/null
@@ -1,112 +0,0 @@
-/* strspn (str, ss) -- Return the length of the initial segment of STR
-			which contains only characters from SS.
-   For AMD x86-64.
-   Copyright (C) 1994-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-	.text
-ENTRY (strspn)
-
-	movq %rdi, %rdx		/* Save SRC.  */
-
-	/* First we create a table with flags for all possible characters.
-	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
-	   supported by the C string functions we have 256 characters.
-	   Before inserting marks for the stop characters we clear the whole
-	   table.  */
-	movq %rdi, %r8			/* Save value.  */
-	subq $256, %rsp			/* Make space for 256 bytes.  */
-	cfi_adjust_cfa_offset(256)
-	movl $32,  %ecx			/* 32*8 bytes = 256 bytes.  */
-	movq %rsp, %rdi
-	xorl %eax, %eax			/* We store 0s.  */
-	cld
-	rep
-	stosq
-
-	movq %rsi, %rax			/* Setup stopset.  */
-
-/* For understanding the following code remember that %rcx == 0 now.
-   Although all the following instruction only modify %cl we always
-   have a correct zero-extended 64-bit value in %rcx.  */
-
-	.p2align 4
-L(2):	movb (%rax), %cl	/* get byte from stopset */
-	testb %cl, %cl		/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
-
-	movb 1(%rax), %cl	/* get byte from stopset */
-	testb $0xff, %cl	/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
-
-	movb 2(%rax), %cl	/* get byte from stopset */
-	testb $0xff, %cl	/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
-
-	movb 3(%rax), %cl	/* get byte from stopset */
-	addq $4, %rax		/* increment stopset pointer */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
-	testb $0xff, %cl	/* is NUL char? */
-	jnz L(2)		/* no => process next dword from stopset */
-
-L(1):	leaq -4(%rdx), %rax	/* prepare loop */
-
-	/* We use a neat trick for the following loop.  Normally we would
-	   have to test for two termination conditions
-	   1. a character in the stopset was found
-	   and
-	   2. the end of the string was found
-	   But as a sign that the character is in the stopset we store its
-	   value in the table.  But the value of NUL is NUL so the loop
-	   terminates for NUL in every case.  */
-
-	.p2align 4
-L(3):	addq $4, %rax		/* adjust pointer for full loop round */
-
-	movb (%rax), %cl	/* get byte from string */
-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jz L(4)			/* no => return */
-
-	movb 1(%rax), %cl	/* get byte from string */
-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jz L(5)			/* no => return */
-
-	movb 2(%rax), %cl	/* get byte from string */
-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jz L(6)			/* no => return */
-
-	movb 3(%rax), %cl	/* get byte from string */
-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jnz L(3)		/* yes => start loop again */
-
-	incq %rax		/* adjust pointer */
-L(6):	incq %rax
-L(5):	incq %rax
-
-L(4):	addq $256, %rsp		/* remove stopset */
-	cfi_adjust_cfa_offset(-256)
-	subq %rdx, %rax		/* we have to return the number of valid
-				   characters, so compute distance to first
-				   non-valid character */
-	ret
-END (strspn)
-libc_hidden_builtin_def (strspn)
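
As with strcspn earlier in the series, the generic strspn uses the same
256-entry table technique with the loop sense inverted: mark the accept
set, then count leading marked bytes; NUL is never marked, so the scan
self-terminates.  Minimal sketch (not the verbatim source):

  #include <stddef.h>

  size_t
  strspn_sketch (const char *str, const char *accept)
  {
    unsigned char table[256] = { 0 };
    const unsigned char *a = (const unsigned char *) accept;
    const unsigned char *s = (const unsigned char *) str;

    while (*a != '\0')
      table[*a++] = 1;

    size_t i = 0;
    while (table[s[i]])          /* table[0] == 0, so NUL stops us.  */
      i++;
    return i;
  }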
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (9 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 11/23] x86: Remove strspn-sse2.S " Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 18:59   ` H.J. Lu
  2022-03-24 20:50   ` [PATCH v2 12/31] " Noah Goldstein
  2022-03-23 21:57 ` [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c Noah Goldstein
                   ` (11 subsequent siblings)
  22 siblings, 2 replies; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

The overflow fallback for __wcsncmp_avx2_rtm should be
__wcscmp_avx2_rtm, not __wcscmp_avx2.

All string/memory tests pass.
---
 sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 52ff5ad724..86a86b68e3 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -122,7 +122,7 @@ ENTRY(STRCMP)
 	   are cases where length is large enough that it can never be a
 	   bound on valid memory so just use wcscmp.  */
 	shrq	$56, %rcx
-	jnz	__wcscmp_avx2
+	jnz	OVERFLOW_STRCMP
 
 	leaq	(, %rdx, 4), %rdx
 #  endif
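
OVERFLOW_STRCMP is presumably defined per build variant, so each entry
point falls back within its own family -- this matters for the _rtm
variants, which must not tail-call code that is not transaction-safe.
Something like the following, with the exact locations assumed rather
than quoted:

  /* In the plain wcsncmp-avx2 build (assumed):  */
  #define OVERFLOW_STRCMP __wcscmp_avx2

  /* In the wcsncmp-avx2-rtm build (assumed):  */
  #define OVERFLOW_STRCMP __wcscmp_avx2_rtm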
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (10 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 19:00   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c Noah Goldstein
                   ` (10 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Just a QOL change to make parsing the output of the benchtests more
consistent.
---
 benchtests/bench-strcasecmp.c | 77 +++++++++++++++++++++++------------
 1 file changed, 51 insertions(+), 26 deletions(-)

diff --git a/benchtests/bench-strcasecmp.c b/benchtests/bench-strcasecmp.c
index daccf1d245..855f2db2ad 100644
--- a/benchtests/bench-strcasecmp.c
+++ b/benchtests/bench-strcasecmp.c
@@ -20,6 +20,7 @@
 #define TEST_MAIN
 #define TEST_NAME "strcasecmp"
 #include "bench-string.h"
+#include "json-lib.h"
 
 typedef int (*proto_t) (const char *, const char *);
 static int simple_strcasecmp (const char *, const char *);
@@ -40,7 +41,8 @@ simple_strcasecmp (const char *s1, const char *s2)
 }
 
 static void
-do_one_test (impl_t *impl, const char *s1, const char *s2, int exp_result)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const char *s1,
+             const char *s2, int exp_result)
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
@@ -64,12 +66,12 @@ do_one_test (impl_t *impl, const char *s1, const char *s2, int exp_result)
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align1, size_t align2, size_t len, int max_char,
-	 int exp_result)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
+         int max_char, int exp_result)
 {
   size_t i;
   char *s1, *s2;
@@ -85,6 +87,13 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
   if (align2 + len + 1 >= page_size)
     return;
 
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_attr_uint (json_ctx, "max_char", max_char);
+  json_array_begin (json_ctx, "timings");
+
   s1 = (char *) (buf1 + align1);
   s2 = (char *) (buf2 + align2);
 
@@ -103,53 +112,69 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
   else
     s2[len - 1] -= exp_result;
 
-  printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
-
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s1, s2, exp_result);
+    do_one_test (json_ctx, impl, s1, s2, exp_result);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
+  json_ctx_t json_ctx;
   size_t i;
 
   test_init ();
 
-  printf ("%23s", "");
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
   for (i = 1; i < 16; ++i)
     {
-      do_test (i, i, i, 127, 0);
-      do_test (i, i, i, 127, 1);
-      do_test (i, i, i, 127, -1);
+      do_test (&json_ctx, i, i, i, 127, 0);
+      do_test (&json_ctx, i, i, i, 127, 1);
+      do_test (&json_ctx, i, i, i, 127, -1);
     }
 
   for (i = 1; i < 10; ++i)
     {
-      do_test (0, 0, 2 << i, 127, 0);
-      do_test (0, 0, 2 << i, 254, 0);
-      do_test (0, 0, 2 << i, 127, 1);
-      do_test (0, 0, 2 << i, 254, 1);
-      do_test (0, 0, 2 << i, 127, -1);
-      do_test (0, 0, 2 << i, 254, -1);
+      do_test (&json_ctx, 0, 0, 2 << i, 127, 0);
+      do_test (&json_ctx, 0, 0, 2 << i, 254, 0);
+      do_test (&json_ctx, 0, 0, 2 << i, 127, 1);
+      do_test (&json_ctx, 0, 0, 2 << i, 254, 1);
+      do_test (&json_ctx, 0, 0, 2 << i, 127, -1);
+      do_test (&json_ctx, 0, 0, 2 << i, 254, -1);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (i, 2 * i, 8 << i, 127, 0);
-      do_test (2 * i, i, 8 << i, 254, 0);
-      do_test (i, 2 * i, 8 << i, 127, 1);
-      do_test (2 * i, i, 8 << i, 254, 1);
-      do_test (i, 2 * i, 8 << i, 127, -1);
-      do_test (2 * i, i, 8 << i, 254, -1);
+      do_test (&json_ctx, i, 2 * i, 8 << i, 127, 0);
+      do_test (&json_ctx, 2 * i, i, 8 << i, 254, 0);
+      do_test (&json_ctx, i, 2 * i, 8 << i, 127, 1);
+      do_test (&json_ctx, 2 * i, i, 8 << i, 254, 1);
+      do_test (&json_ctx, i, 2 * i, 8 << i, 127, -1);
+      do_test (&json_ctx, 2 * i, i, 8 << i, 254, -1);
     }
 
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
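
With this conversion each run emits a single JSON document whose shape
follows directly from the json_* calls above -- roughly as below, where
the ifunc names, the timing_type string, and every numeric value are
invented purely for illustration:

  {
    "timing_type": "hp_timing",
    "functions": {
      "strcasecmp": {
        "bench-variant": "",
        "ifuncs": ["simple_strcasecmp", "strcasecmp"],
        "results": [
          { "length": 1, "align1": 1, "align2": 1, "max_char": 127,
            "timings": [42.1, 17.9] },
          ...
        ]
      }
    }
  }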
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (11 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 19:00   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c Noah Goldstein
                   ` (9 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Just a QOL change to make parsing the output of the benchtests more
consistent.
---
 benchtests/bench-strncasecmp.c | 113 ++++++++++++++++++++-------------
 1 file changed, 69 insertions(+), 44 deletions(-)

diff --git a/benchtests/bench-strncasecmp.c b/benchtests/bench-strncasecmp.c
index a9819efc73..91f49cc8d3 100644
--- a/benchtests/bench-strncasecmp.c
+++ b/benchtests/bench-strncasecmp.c
@@ -20,6 +20,7 @@
 #define TEST_MAIN
 #define TEST_NAME "strncasecmp"
 #include "bench-string.h"
+#include "json-lib.h"
 
 typedef int (*proto_t) (const char *, const char *, size_t);
 static int simple_strncasecmp (const char *, const char *, size_t);
@@ -47,8 +48,8 @@ simple_strncasecmp (const char *s1, const char *s2, size_t n)
 }
 
 static void
-do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n,
-	     int exp_result)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const char *s1,
+             const char *s2, size_t n, int exp_result)
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
@@ -62,12 +63,12 @@ do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n,
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
-	 int exp_result)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t n,
+         size_t len, int max_char, int exp_result)
 {
   size_t i;
   char *s1, *s2;
@@ -101,83 +102,107 @@ do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
   else
     s2[len - 1] -= exp_result;
 
-  printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_uint (json_ctx, "n", n);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_attr_uint (json_ctx, "max_char", max_char);
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s1, s2, n, exp_result);
+    do_one_test (json_ctx, impl, s1, s2, n, exp_result);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
+  json_ctx_t json_ctx;
   size_t i;
 
   test_init ();
 
-  printf ("%23s", "");
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
   for (i = 1; i < 16; ++i)
     {
-      do_test (i, i, i - 1, i, 127, 0);
+      do_test (&json_ctx, i, i, i - 1, i, 127, 0);
 
-      do_test (i, i, i, i, 127, 0);
-      do_test (i, i, i, i, 127, 1);
-      do_test (i, i, i, i, 127, -1);
+      do_test (&json_ctx, i, i, i, i, 127, 0);
+      do_test (&json_ctx, i, i, i, i, 127, 1);
+      do_test (&json_ctx, i, i, i, i, 127, -1);
 
-      do_test (i, i, i + 1, i, 127, 0);
-      do_test (i, i, i + 1, i, 127, 1);
-      do_test (i, i, i + 1, i, 127, -1);
+      do_test (&json_ctx, i, i, i + 1, i, 127, 0);
+      do_test (&json_ctx, i, i, i + 1, i, 127, 1);
+      do_test (&json_ctx, i, i, i + 1, i, 127, -1);
     }
 
   for (i = 1; i < 10; ++i)
     {
-      do_test (0, 0, (2 << i) - 1, 2 << i, 127, 0);
-      do_test (0, 0, 2 << i, 2 << i, 254, 0);
-      do_test (0, 0, (2 << i) + 1, 2 << i, 127, 0);
+      do_test (&json_ctx, 0, 0, (2 << i) - 1, 2 << i, 127, 0);
+      do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 254, 0);
+      do_test (&json_ctx, 0, 0, (2 << i) + 1, 2 << i, 127, 0);
 
-      do_test (0, 0, (2 << i) + 1, 2 << i, 254, 0);
+      do_test (&json_ctx, 0, 0, (2 << i) + 1, 2 << i, 254, 0);
 
-      do_test (0, 0, 2 << i, 2 << i, 127, 1);
-      do_test (0, 0, (2 << i) + 10, 2 << i, 127, 1);
+      do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 127, 1);
+      do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 127, 1);
 
-      do_test (0, 0, 2 << i, 2 << i, 254, 1);
-      do_test (0, 0, (2 << i) + 10, 2 << i, 254, 1);
+      do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 254, 1);
+      do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 254, 1);
 
-      do_test (0, 0, 2 << i, 2 << i, 127, -1);
-      do_test (0, 0, (2 << i) + 10, 2 << i, 127, -1);
+      do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 127, -1);
+      do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 127, -1);
 
-      do_test (0, 0, 2 << i, 2 << i, 254, -1);
-      do_test (0, 0, (2 << i) + 10, 2 << i, 254, -1);
+      do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 254, -1);
+      do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 254, -1);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (i, 2 * i, (8 << i) - 1, 8 << i, 127, 0);
-      do_test (i, 2 * i, 8 << i, 8 << i, 127, 0);
-      do_test (i, 2 * i, (8 << i) + 100, 8 << i, 127, 0);
+      do_test (&json_ctx, i, 2 * i, (8 << i) - 1, 8 << i, 127, 0);
+      do_test (&json_ctx, i, 2 * i, 8 << i, 8 << i, 127, 0);
+      do_test (&json_ctx, i, 2 * i, (8 << i) + 100, 8 << i, 127, 0);
 
-      do_test (2 * i, i, (8 << i) - 1, 8 << i, 254, 0);
-      do_test (2 * i, i, 8 << i, 8 << i, 254, 0);
-      do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, 0);
+      do_test (&json_ctx, 2 * i, i, (8 << i) - 1, 8 << i, 254, 0);
+      do_test (&json_ctx, 2 * i, i, 8 << i, 8 << i, 254, 0);
+      do_test (&json_ctx, 2 * i, i, (8 << i) + 100, 8 << i, 254, 0);
 
-      do_test (i, 2 * i, 8 << i, 8 << i, 127, 1);
-      do_test (i, 2 * i, (8 << i) + 100, 8 << i, 127, 1);
+      do_test (&json_ctx, i, 2 * i, 8 << i, 8 << i, 127, 1);
+      do_test (&json_ctx, i, 2 * i, (8 << i) + 100, 8 << i, 127, 1);
 
-      do_test (2 * i, i, 8 << i, 8 << i, 254, 1);
-      do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, 1);
+      do_test (&json_ctx, 2 * i, i, 8 << i, 8 << i, 254, 1);
+      do_test (&json_ctx, 2 * i, i, (8 << i) + 100, 8 << i, 254, 1);
 
-      do_test (i, 2 * i, 8 << i, 8 << i, 127, -1);
-      do_test (i, 2 * i, (8 << i) + 100, 8 << i, 127, -1);
+      do_test (&json_ctx, i, 2 * i, 8 << i, 8 << i, 127, -1);
+      do_test (&json_ctx, i, 2 * i, (8 << i) + 100, 8 << i, 127, -1);
 
-      do_test (2 * i, i, 8 << i, 8 << i, 254, -1);
-      do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, -1);
+      do_test (&json_ctx, 2 * i, i, 8 << i, 8 << i, 254, -1);
+      do_test (&json_ctx, 2 * i, i, (8 << i) + 100, 8 << i, 254, -1);
     }
 
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
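
(The document emitted here has the same shape as in the strcasecmp
conversion, plus the extra "n" attribute in each result entry.)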
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (12 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 19:01   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c Noah Goldstein
                   ` (8 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Add more robust tests that cover all the page-cross edge cases.
---
 string/test-strcasecmp.c | 112 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 100 insertions(+), 12 deletions(-)

diff --git a/string/test-strcasecmp.c b/string/test-strcasecmp.c
index 3d994f9d64..438a9713ac 100644
--- a/string/test-strcasecmp.c
+++ b/string/test-strcasecmp.c
@@ -18,6 +18,10 @@
 
 #include <locale.h>
 #include <ctype.h>
+#include <assert.h>
+#define TEST_LEN (getpagesize () * 3)
+#define MIN_PAGE_SIZE (TEST_LEN + 2 * getpagesize ())
+
 #define TEST_MAIN
 #define TEST_NAME "strcasecmp"
 #include "test-string.h"
@@ -85,12 +89,13 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
   if (len == 0)
     return;
 
-  align1 &= 7;
-  if (align1 + len + 1 >= page_size)
+
+  align1 &= getpagesize () - 1;
+  if (align1 + (len + 1) >= page_size)
     return;
 
-  align2 &= 7;
-  if (align2 + len + 1 >= page_size)
+  align2 &= getpagesize () - 1;
+  if (align2 + (len + 1) >= page_size)
     return;
 
   s1 = (char *) (buf1 + align1);
@@ -105,12 +110,33 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
   s1[len] = s2[len] = 0;
   s1[len + 1] = 23;
   s2[len + 1] = 24 + exp_result;
+
   if ((s2[len - 1] == 'z' && exp_result == -1)
       || (s2[len - 1] == 'a' && exp_result == 1))
     s1[len - 1] += exp_result;
+  else if ((s1[len - 1] == 'Z' + 1 && exp_result == 1)
+           || (s1[len - 1] == 'A' - 1 && exp_result == -1))
+    s1[len - 1] = tolower (s2[len - 1]) + exp_result;
   else
     s2[len - 1] -= exp_result;
 
+  /* For some locales the setup above does not yet guarantee the
+     expected result, so fix up the final bytes if needed.  */
+  if (tolower (s1[len - 1]) - tolower (s2[len - 1]) != exp_result)
+    {
+      if (exp_result == -1)
+        {
+          s1[len - 1] = tolower ('a');
+          s2[len - 1] = toupper (tolower ('a') - 1);
+        }
+      else if (exp_result == 0)
+        s1[len - 1] = toupper (s2[len - 1]);
+      else
+        {
+          s1[len - 1] = tolower ('a');
+          s2[len - 1] = toupper (tolower ('a') + 1);
+        }
+    }
+
   FOR_EACH_IMPL (impl, 0)
     do_one_test (impl, s1, s2, exp_result);
 }
@@ -207,10 +233,10 @@ do_random_tests (void)
 }
 
 static void
-test_locale (const char *locale)
+test_locale (const char *locale, int extra_tests)
 {
-  size_t i;
-
+  size_t i, j, k;
+  const size_t test_len = MIN(TEST_LEN, 3 * 4096);
   if (setlocale (LC_CTYPE, locale) == NULL)
     {
       error (0, 0, "cannot set locale \"%s\"", locale);
@@ -249,6 +275,68 @@ test_locale (const char *locale)
       do_test (2 * i, i, 8 << i, 254, -1);
     }
 
+  for (j = 0; extra_tests && j < 160; ++j)
+    {
+      for (i = 0; i < test_len;)
+        {
+          do_test (getpagesize () - j - 1, 0, i, 127, 0);
+          do_test (getpagesize () - j - 1, 0, i, 127, 1);
+          do_test (getpagesize () - j - 1, 0, i, 127, -1);
+
+          do_test (getpagesize () - j - 1, j, i, 127, 0);
+          do_test (getpagesize () - j - 1, j, i, 127, 1);
+          do_test (getpagesize () - j - 1, j, i, 127, -1);
+
+          do_test (0, getpagesize () - j - 1, i, 127, 0);
+          do_test (0, getpagesize () - j - 1, i, 127, 1);
+          do_test (0, getpagesize () - j - 1, i, 127, -1);
+
+          do_test (j, getpagesize () - j - 1, i, 127, 0);
+          do_test (j, getpagesize () - j - 1, i, 127, 1);
+          do_test (j, getpagesize () - j - 1, i, 127, -1);
+
+          for (k = 2; k <= 128; k += k)
+            {
+              do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 0);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
+                       0);
+              do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 1);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
+                       1);
+              do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, -1);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
+                       -1);
+            }
+
+          if (i < 32)
+            {
+              i += 1;
+            }
+          else if (i < 161)
+            {
+              i += 7;
+            }
+          else if (i + 161 < test_len)
+            {
+              i += 31;
+              i *= 17;
+              i /= 16;
+              if (i + 161 > test_len)
+                {
+                  i = test_len - 160;
+                }
+            }
+          else if (i + 32 < test_len)
+            {
+              i += 7;
+            }
+          else
+            {
+              i += 1;
+            }
+        }
+    }
+
   do_random_tests ();
 }
 
@@ -257,11 +345,11 @@ test_main (void)
 {
   test_init ();
 
-  test_locale ("C");
-  test_locale ("en_US.ISO-8859-1");
-  test_locale ("en_US.UTF-8");
-  test_locale ("tr_TR.ISO-8859-9");
-  test_locale ("tr_TR.UTF-8");
+  test_locale ("C", 1);
+  test_locale ("en_US.ISO-8859-1", 0);
+  test_locale ("en_US.UTF-8", 0);
+  test_locale ("tr_TR.ISO-8859-9", 0);
+  test_locale ("tr_TR.UTF-8", 0);
 
   return ret;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (13 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 19:01   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S Noah Goldstein
                   ` (7 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Add more robust tests that cover all the page cross edge cases.
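
On top of the same placements, the strncasecmp tests sweep the length
bound n just below, at, and above the string length, plus
near-overflow values.  A rough sketch of the n sweep for one
(alignment, length) pair (the stub stands in for do_test in
test-strncasecmp.c):

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical stand-in for do_test in test-strncasecmp.c.  */
static void
do_test (size_t align1, size_t align2, size_t n, size_t len,
         int max_char, int exp_result)
{
  printf ("align1=%zu align2=%zu n=%zu len=%zu exp=%d\n",
          align1, align2, n, len, exp_result);
  (void) max_char;
}

int
main (void)
{
  size_t page = getpagesize (), i = 100, j = 16;
  /* Bound one past, at, and one short of the string length.  */
  do_test (page - j - 1, 0, i + 1, i, 127, 0);
  do_test (page - j - 1, 0, i, i, 127, 0);
  do_test (page - j - 1, 0, i - 1, i, 127, 0);
  /* Near-overflow bounds exercise implementations that derive an
     end pointer from n.  */
  do_test (page - j - 1, 0, ULONG_MAX, i, 127, 0);
  do_test (page - j - 1, 0, ULONG_MAX - i, i, 127, 0);
  return 0;
}
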
---
 string/test-strncasecmp.c | 166 +++++++++++++++++++++++++++++++++++---
 1 file changed, 154 insertions(+), 12 deletions(-)

diff --git a/string/test-strncasecmp.c b/string/test-strncasecmp.c
index a3c848165a..b86c630bf6 100644
--- a/string/test-strncasecmp.c
+++ b/string/test-strncasecmp.c
@@ -18,6 +18,10 @@
 
 #include <locale.h>
 #include <ctype.h>
+
+#define TEST_LEN (getpagesize () * 3)
+#define MIN_PAGE_SIZE (TEST_LEN + 2 * getpagesize ())
+
 #define TEST_MAIN
 #define TEST_NAME "strncasecmp"
 #include "test-string.h"
@@ -106,14 +110,15 @@ do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
   if (len == 0)
     return;
 
-  align1 &= 7;
-  if (align1 + len + 1 >= page_size)
+  align1 &= getpagesize () - 1;
+  if (align1 + (len + 2) >= page_size)
     return;
 
-  align2 &= 7;
-  if (align2 + len + 1 >= page_size)
+  align2 &= getpagesize () - 1;
+  if (align2 + (len + 2) >= page_size)
     return;
 
+
   s1 = (char *) (buf1 + align1);
   s2 = (char *) (buf2 + align2);
 
@@ -126,12 +131,33 @@ do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
   s1[len] = s2[len] = 0;
   s1[len + 1] = 23;
   s2[len + 1] = 24 + exp_result;
+
   if ((s2[len - 1] == 'z' && exp_result == -1)
       || (s2[len - 1] == 'a' && exp_result == 1))
     s1[len - 1] += exp_result;
+  else if ((s1[len - 1] == 'Z' + 1 && exp_result == 1)
+           || (s1[len - 1] == 'A' - 1 && exp_result == -1))
+    s1[len - 1] = tolower (s2[len - 1]) + exp_result;
   else
     s2[len - 1] -= exp_result;
 
+  /* For some locales this is not guaranteed yet.  */
+  if (tolower (s1[len - 1]) - tolower (s2[len - 1]) != exp_result)
+    {
+      if (exp_result == -1)
+        {
+          s1[len - 1] = tolower ('a');
+          s2[len - 1] = toupper (tolower ('a') - 1);
+        }
+      else if (exp_result == 0)
+        s1[len - 1] = toupper (s2[len - 1]);
+      else
+        {
+          s1[len - 1] = tolower ('a');
+          s2[len - 1] = toupper (tolower ('a') + 1);
+        }
+    }
+
   FOR_EACH_IMPL (impl, 0)
     do_one_test (impl, s1, s2, n, exp_result);
 }
@@ -299,10 +325,10 @@ bz14195 (void)
 }
 
 static void
-test_locale (const char *locale)
+test_locale (const char *locale, int extra_tests)
 {
-  size_t i;
-
+  size_t i, j, k;
+  const size_t test_len = MIN(TEST_LEN, 3 * 4096);
   if (setlocale (LC_CTYPE, locale) == NULL)
     {
       error (0, 0, "cannot set locale \"%s\"", locale);
@@ -374,6 +400,122 @@ test_locale (const char *locale)
       do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, -1);
     }
 
+  for (j = 0; extra_tests && j < 160; ++j)
+    {
+      for (i = 0; i < test_len;)
+        {
+            do_test (getpagesize () - j - 1, 0, i + 1, i, 127, 0);
+            do_test (getpagesize () - j - 1, 0, i + 1, i, 127, 1);
+            do_test (getpagesize () - j - 1, 0, i + 1, i, 127, -1);
+
+            do_test (getpagesize () - j - 1, 0, i, i, 127, 0);
+            do_test (getpagesize () - j - 1, 0, i - 1, i, 127, 0);
+
+            do_test (getpagesize () - j - 1, 0, ULONG_MAX, i, 127, 0);
+            do_test (getpagesize () - j - 1, 0, ULONG_MAX, i, 127, 1);
+            do_test (getpagesize () - j - 1, 0, ULONG_MAX, i, 127, -1);
+
+            do_test (getpagesize () - j - 1, 0, ULONG_MAX - i, i, 127, 0);
+            do_test (getpagesize () - j - 1, 0, ULONG_MAX - i, i, 127, 1);
+            do_test (getpagesize () - j - 1, 0, ULONG_MAX - i, i, 127, -1);
+
+            do_test (getpagesize () - j - 1, j, i + 1, i, 127, 0);
+            do_test (getpagesize () - j - 1, j, i + 1, i, 127, 1);
+            do_test (getpagesize () - j - 1, j, i + 1, i, 127, -1);
+
+            do_test (getpagesize () - j - 1, j, i, i, 127, 0);
+            do_test (getpagesize () - j - 1, j, i - 1, i, 127, 0);
+
+            do_test (getpagesize () - j - 1, j, ULONG_MAX, i, 127, 0);
+            do_test (getpagesize () - j - 1, j, ULONG_MAX, i, 127, 1);
+            do_test (getpagesize () - j - 1, j, ULONG_MAX, i, 127, -1);
+
+            do_test (getpagesize () - j - 1, j, ULONG_MAX - i, i, 127, 0);
+            do_test (getpagesize () - j - 1, j, ULONG_MAX - i, i, 127, 1);
+            do_test (getpagesize () - j - 1, j, ULONG_MAX - i, i, 127, -1);
+
+            do_test (0, getpagesize () - j - 1, i + 1, i, 127, 0);
+            do_test (0, getpagesize () - j - 1, i + 1, i, 127, 1);
+            do_test (0, getpagesize () - j - 1, i + 1, i, 127, -1);
+
+            do_test (0, getpagesize () - j - 1, i, i, 127, 0);
+            do_test (0, getpagesize () - j - 1, i - 1, i, 127, 0);
+
+            do_test (0, getpagesize () - j - 1, ULONG_MAX, i, 127, 0);
+            do_test (0, getpagesize () - j - 1, ULONG_MAX, i, 127, 1);
+            do_test (0, getpagesize () - j - 1, ULONG_MAX, i, 127, -1);
+
+            do_test (0, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 0);
+            do_test (0, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 1);
+            do_test (0, getpagesize () - j - 1, ULONG_MAX - i, i, 127, -1);
+
+            do_test (j, getpagesize () - j - 1, i + 1, i, 127, 0);
+            do_test (j, getpagesize () - j - 1, i + 1, i, 127, 1);
+            do_test (j, getpagesize () - j - 1, i + 1, i, 127, -1);
+
+            do_test (j, getpagesize () - j - 1, i, i, 127, 0);
+            do_test (j, getpagesize () - j - 1, i - 1, i, 127, 0);
+
+            do_test (j, getpagesize () - j - 1, ULONG_MAX, i, 127, 0);
+            do_test (j, getpagesize () - j - 1, ULONG_MAX, i, 127, 1);
+            do_test (j, getpagesize () - j - 1, ULONG_MAX, i, 127, -1);
+
+            do_test (j, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 0);
+            do_test (j, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 1);
+            do_test (j, getpagesize () - j - 1, ULONG_MAX - i, i, 127, -1);
+
+          for (k = 2; k <= 128; k += k)
+            {
+              do_test (getpagesize () - k, getpagesize () - j - 1, i - 1, i,
+                       127, 0);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i - 1,
+                       i, 127, 0);
+              do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
+                       127, 0);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
+                       i, 127, 0);
+              do_test (getpagesize () - k, getpagesize () - j - 1, i, i, 127,
+                       0);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, i,
+                       127, 0);
+              do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
+                       127, -1);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
+                       i, 127, -1);
+              do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
+                       127, 1);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
+                       i, 127, 1);
+            }
+          if (i < 32)
+            {
+              i += 1;
+            }
+          else if (i < 161)
+            {
+              i += 7;
+            }
+          else if (i + 161 < test_len)
+            {
+              i += 31;
+              i *= 17;
+              i /= 16;
+              if (i + 161 > test_len)
+                {
+                  i = test_len - 160;
+                }
+            }
+          else if (i + 32 < test_len)
+            {
+              i += 7;
+            }
+          else
+            {
+              i += 1;
+            }
+        }
+    }
+
   do_random_tests ();
   do_page_tests ();
 }
@@ -383,11 +525,11 @@ test_main (void)
 {
   test_init ();
 
-  test_locale ("C");
-  test_locale ("en_US.ISO-8859-1");
-  test_locale ("en_US.UTF-8");
-  test_locale ("tr_TR.ISO-8859-9");
-  test_locale ("tr_TR.UTF-8");
+  test_locale ("C", 1);
+  test_locale ("en_US.ISO-8859-1", 0);
+  test_locale ("en_US.UTF-8", 0);
+  test_locale ("tr_TR.ISO-8859-9", 0);
+  test_locale ("tr_TR.UTF-8", 0);
 
   return ret;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (14 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 19:02   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S Noah Goldstein
                   ` (6 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Slightly faster method of doing TOLOWER that saves an
instruction.

Also replace the hard-coded 5-byte NOP with .p2align 4. On builds with
CET enabled this misaligned the entry to strcasecmp.

geometric_mean(N=40) of all benchmarks New / Original: .894

All string/memory tests pass.
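
As a scalar model of the new sequence: adding 0x3f wraps 'A'..'Z'
into the most-negative signed byte range, a single signed compare
against 0x99 then isolates the uppercase bytes, and an and-not
selects the 0x20 case bit.  A sketch in C (constants are the
lcase_min/lcase_max/case_add values from the diff; the function name
is illustrative):

#include <stdint.h>
#include <stdio.h>

static uint8_t
tolower_branchless (uint8_t b)
{
  /* paddb: 'A'..'Z' (0x41..0x5a) wrap to -128..-103.  */
  int8_t t = (int8_t) (b + 0x3f);
  /* pcmpgtb against lcase_max (0x99 == -103): all-ones for bytes
     that are NOT uppercase letters.  */
  uint8_t mask = t > (int8_t) 0x99 ? 0xff : 0x00;
  /* pandn with case_add: keep 0x20 only for uppercase letters.  */
  return b + (uint8_t) (~mask & 0x20);
}

int
main (void)
{
  /* Print only the bytes that get converted.  */
  for (int c = '@'; c <= '~'; ++c)
    if (tolower_branchless ((uint8_t) c) != (uint8_t) c)
      printf ("%c -> %c\n", c, tolower_branchless ((uint8_t) c));
  return 0;
}
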
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, New Time / Old Time
     1,      1,      1,      127,               0.903
     2,      2,      2,      127,               0.905
     3,      3,      3,      127,               0.877
     4,      4,      4,      127,               0.888
     5,      5,      5,      127,               0.901
     6,      6,      6,      127,               0.954
     7,      7,      7,      127,               0.932
     8,      0,      0,      127,               0.918
     9,      1,      1,      127,               0.914
    10,      2,      2,      127,               0.877
    11,      3,      3,      127,               0.909
    12,      4,      4,      127,               0.876
    13,      5,      5,      127,               0.886
    14,      6,      6,      127,               0.914
    15,      7,      7,      127,               0.939
     4,      0,      0,      127,               0.963
     4,      0,      0,      254,               0.943
     8,      0,      0,      254,               0.927
    16,      0,      0,      127,               0.876
    16,      0,      0,      254,               0.865
    32,      0,      0,      127,               0.865
    32,      0,      0,      254,               0.862
    64,      0,      0,      127,               0.863
    64,      0,      0,      254,               0.896
   128,      0,      0,      127,               0.885
   128,      0,      0,      254,               0.882
   256,      0,      0,      127,                0.87
   256,      0,      0,      254,               0.869
   512,      0,      0,      127,               0.832
   512,      0,      0,      254,               0.848
  1024,      0,      0,      127,               0.835
  1024,      0,      0,      254,               0.843
    16,      1,      2,      127,               0.914
    16,      2,      1,      254,               0.949
    32,      2,      4,      127,               0.955
    32,      4,      2,      254,               1.004
    64,      3,      6,      127,               0.844
    64,      6,      3,      254,               0.905
   128,      4,      0,      127,               0.889
   128,      0,      4,      254,               0.845
   256,      5,      2,      127,               0.929
   256,      2,      5,      254,               0.907
   512,      6,      4,      127,               0.837
   512,      4,      6,      254,               0.862
  1024,      7,      6,      127,               0.895
  1024,      6,      7,      254,                0.89

 sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 35 deletions(-)

diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index e2ab59c555..99d8b36f1d 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp)
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RDX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
 END2 (__strcasecmp)
 # ifndef NO_NOLOCALE_ALIAS
 weak_alias (__strcasecmp, strcasecmp)
@@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp)
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RCX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
 END2 (__strncasecmp)
 # ifndef NO_NOLOCALE_ALIAS
 weak_alias (__strncasecmp, strncasecmp)
@@ -146,22 +144,22 @@ ENTRY (STRCMP)
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
-.Lbelowupper:
-	.quad	0x4040404040404040
-	.quad	0x4040404040404040
-.Ltopupper:
-	.quad	0x5b5b5b5b5b5b5b5b
-	.quad	0x5b5b5b5b5b5b5b5b
-.Ltouppermask:
+.Llcase_min:
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+.Llcase_max:
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+.Lcase_add:
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.previous
-	movdqa	.Lbelowupper(%rip), %xmm5
-# define UCLOW_reg %xmm5
-	movdqa	.Ltopupper(%rip), %xmm6
-# define UCHIGH_reg %xmm6
-	movdqa	.Ltouppermask(%rip), %xmm7
-# define LCQWORD_reg %xmm7
+	movdqa	.Llcase_min(%rip), %xmm5
+# define LCASE_MIN_reg %xmm5
+	movdqa	.Llcase_max(%rip), %xmm6
+# define LCASE_MAX_reg %xmm6
+	movdqa	.Lcase_add(%rip), %xmm7
+# define CASE_ADD_reg %xmm7
 #endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
@@ -172,22 +170,18 @@ ENTRY (STRCMP)
 	movhpd	8(%rdi), %xmm1
 	movhpd	8(%rsi), %xmm2
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# define TOLOWER(reg1, reg2) \
-	movdqa	reg1, %xmm8;					\
-	movdqa	UCHIGH_reg, %xmm9;				\
-	movdqa	reg2, %xmm10;					\
-	movdqa	UCHIGH_reg, %xmm11;				\
-	pcmpgtb	UCLOW_reg, %xmm8;				\
-	pcmpgtb	reg1, %xmm9;					\
-	pcmpgtb	UCLOW_reg, %xmm10;				\
-	pcmpgtb	reg2, %xmm11;					\
-	pand	%xmm9, %xmm8;					\
-	pand	%xmm11, %xmm10;					\
-	pand	LCQWORD_reg, %xmm8;				\
-	pand	LCQWORD_reg, %xmm10;				\
-	por	%xmm8, reg1;					\
-	por	%xmm10, reg2
-	TOLOWER (%xmm1, %xmm2)
+#  define TOLOWER(reg1, reg2) \
+	movdqa	LCASE_MIN_reg, %xmm8;					\
+	movdqa	LCASE_MIN_reg, %xmm9;					\
+	paddb	reg1, %xmm8;					\
+	paddb	reg2, %xmm9;					\
+	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
+	pcmpgtb	LCASE_MAX_reg, %xmm9;				\
+	pandn	CASE_ADD_reg, %xmm8;					\
+	pandn	CASE_ADD_reg, %xmm9;					\
+	paddb	%xmm8, reg1;					\
+	paddb	%xmm9, reg2
+	TOLOWER	(%xmm1, %xmm2)
 #else
 # define TOLOWER(reg1, reg2)
 #endif
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (15 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 19:02   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c Noah Goldstein
                   ` (5 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Slightly faster method of doing TOLOWER that saves an
instruction.

Also replace the hard-coded 5-byte NOP with .p2align 4. On builds with
CET enabled this misaligned the entry to strcasecmp.

geometric_mean(N=40) of all benchmarks New / Original: .920

All string/memory tests pass.
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, New Time / Old Time
     1,      1,      1,      127,               0.914
     2,      2,      2,      127,               0.952
     3,      3,      3,      127,               0.924
     4,      4,      4,      127,               0.995
     5,      5,      5,      127,               0.985
     6,      6,      6,      127,               1.017
     7,      7,      7,      127,               1.031
     8,      0,      0,      127,               0.967
     9,      1,      1,      127,               0.969
    10,      2,      2,      127,               0.951
    11,      3,      3,      127,               0.938
    12,      4,      4,      127,               0.937
    13,      5,      5,      127,               0.967
    14,      6,      6,      127,               0.941
    15,      7,      7,      127,               0.951
     4,      0,      0,      127,               0.959
     4,      0,      0,      254,                0.98
     8,      0,      0,      254,               0.959
    16,      0,      0,      127,               0.895
    16,      0,      0,      254,               0.901
    32,      0,      0,      127,                0.85
    32,      0,      0,      254,               0.851
    64,      0,      0,      127,               0.897
    64,      0,      0,      254,               0.895
   128,      0,      0,      127,               0.944
   128,      0,      0,      254,               0.935
   256,      0,      0,      127,               0.922
   256,      0,      0,      254,               0.913
   512,      0,      0,      127,               0.921
   512,      0,      0,      254,               0.914
  1024,      0,      0,      127,               0.845
  1024,      0,      0,      254,                0.84
    16,      1,      2,      127,               0.923
    16,      2,      1,      254,               0.955
    32,      2,      4,      127,               0.979
    32,      4,      2,      254,               0.957
    64,      3,      6,      127,               0.866
    64,      6,      3,      254,               0.849
   128,      4,      0,      127,               0.882
   128,      0,      4,      254,               0.876
   256,      5,      2,      127,               0.877
   256,      2,      5,      254,               0.882
   512,      6,      4,      127,               0.822
   512,      4,      6,      254,               0.862
  1024,      7,      6,      127,               0.903
  1024,      6,      7,      254,               0.908

 sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++--------------
 1 file changed, 35 insertions(+), 48 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index 580feb90e9..7805ae9d41 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp))
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RDX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
 END (GLABEL(__strcasecmp))
 	/* FALLTHROUGH to strcasecmp_l.  */
 #endif
@@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp))
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RCX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
 END (GLABEL(__strncasecmp))
 	/* FALLTHROUGH to strncasecmp_l.  */
 #endif
@@ -169,27 +167,22 @@ STRCMP_SSE42:
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
-LABEL(belowupper):
-	.quad	0x4040404040404040
-	.quad	0x4040404040404040
-LABEL(topupper):
-# ifdef USE_AVX
-	.quad	0x5a5a5a5a5a5a5a5a
-	.quad	0x5a5a5a5a5a5a5a5a
-# else
-	.quad	0x5b5b5b5b5b5b5b5b
-	.quad	0x5b5b5b5b5b5b5b5b
-# endif
-LABEL(touppermask):
+LABEL(lcase_min):
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+LABEL(lcase_max):
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+LABEL(case_add):
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.previous
-	movdqa	LABEL(belowupper)(%rip), %xmm4
-# define UCLOW_reg %xmm4
-	movdqa	LABEL(topupper)(%rip), %xmm5
-# define UCHIGH_reg %xmm5
-	movdqa	LABEL(touppermask)(%rip), %xmm6
-# define LCQWORD_reg %xmm6
+	movdqa	LABEL(lcase_min)(%rip), %xmm4
+# define LCASE_MIN_reg %xmm4
+	movdqa	LABEL(lcase_max)(%rip), %xmm5
+# define LCASE_MAX_reg %xmm5
+	movdqa	LABEL(case_add)(%rip), %xmm6
+# define CASE_ADD_reg %xmm6
 #endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
@@ -200,32 +193,26 @@ LABEL(touppermask):
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 # ifdef USE_AVX
 #  define TOLOWER(reg1, reg2) \
-	vpcmpgtb UCLOW_reg, reg1, %xmm7;			\
-	vpcmpgtb UCHIGH_reg, reg1, %xmm8;			\
-	vpcmpgtb UCLOW_reg, reg2, %xmm9;			\
-	vpcmpgtb UCHIGH_reg, reg2, %xmm10;			\
-	vpandn	%xmm7, %xmm8, %xmm8;					\
-	vpandn	%xmm9, %xmm10, %xmm10;					\
-	vpand	LCQWORD_reg, %xmm8, %xmm8;				\
-	vpand	LCQWORD_reg, %xmm10, %xmm10;				\
-	vpor	reg1, %xmm8, reg1;					\
-	vpor	reg2, %xmm10, reg2
+	vpaddb	LCASE_MIN_reg, reg1, %xmm7;					\
+	vpaddb	LCASE_MIN_reg, reg2, %xmm8;					\
+	vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7;					\
+	vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8;					\
+	vpandn	CASE_ADD_reg, %xmm7, %xmm7;					\
+	vpandn	CASE_ADD_reg, %xmm8, %xmm8;					\
+	vpaddb	%xmm7, reg1, reg1;					\
+	vpaddb	%xmm8, reg2, reg2
 # else
 #  define TOLOWER(reg1, reg2) \
-	movdqa	reg1, %xmm7;					\
-	movdqa	UCHIGH_reg, %xmm8;				\
-	movdqa	reg2, %xmm9;					\
-	movdqa	UCHIGH_reg, %xmm10;				\
-	pcmpgtb	UCLOW_reg, %xmm7;				\
-	pcmpgtb	reg1, %xmm8;					\
-	pcmpgtb	UCLOW_reg, %xmm9;				\
-	pcmpgtb	reg2, %xmm10;					\
-	pand	%xmm8, %xmm7;					\
-	pand	%xmm10, %xmm9;					\
-	pand	LCQWORD_reg, %xmm7;				\
-	pand	LCQWORD_reg, %xmm9;				\
-	por	%xmm7, reg1;					\
-	por	%xmm9, reg2
+	movdqa	LCASE_MIN_reg, %xmm7;					\
+	movdqa	LCASE_MIN_reg, %xmm8;					\
+	paddb	reg1, %xmm7;					\
+	paddb	reg2, %xmm8;					\
+	pcmpgtb	LCASE_MAX_reg, %xmm7;				\
+	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
+	pandn	CASE_ADD_reg, %xmm7;					\
+	pandn	CASE_ADD_reg, %xmm8;					\
+	paddb	%xmm7, reg1;					\
+	paddb	%xmm8, reg2
 # endif
 	TOLOWER (%xmm1, %xmm2)
 #else
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (16 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 19:02   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c Noah Goldstein
                   ` (4 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Test cases for when both `s1` and `s2` are near the end of a page
were previously missing.
---
 string/test-strcmp.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/string/test-strcmp.c b/string/test-strcmp.c
index 0abce769d0..ece03c6d0b 100644
--- a/string/test-strcmp.c
+++ b/string/test-strcmp.c
@@ -392,7 +392,7 @@ check3 (void)
 int
 test_main (void)
 {
-  size_t i, j;
+  size_t i, j, k;
   const size_t test_len = MIN(TEST_LEN, 3 * 4096);
   test_init ();
   check();
@@ -453,6 +453,19 @@ test_main (void)
           do_test (j, getpagesize () - j - 1, i, 127, 1);
           do_test (j, getpagesize () - j - 1, i, 127, -1);
 
+          for (k = 2; k <= 128; k += k)
+            {
+              do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 0);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
+                       0);
+              do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 1);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
+                       1);
+              do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, -1);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
+                       -1);
+            }
+
           if (i < 32)
             {
               i += 1;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (17 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 19:02   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
                   ` (3 subsequent siblings)
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

Test cases for when both `s1` and `s2` are near the end of a page
were previously missing.
---
 string/test-strncmp.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/string/test-strncmp.c b/string/test-strncmp.c
index 1a87f0e73e..bba9e3d2dc 100644
--- a/string/test-strncmp.c
+++ b/string/test-strncmp.c
@@ -573,7 +573,7 @@ check_overflow (void)
 int
 test_main (void)
 {
-  size_t i, j;
+  size_t i, j, k;
   const size_t test_len = MIN(TEST_LEN, 3 * 4096);
   test_init ();
 
@@ -705,6 +705,31 @@ test_main (void)
           do_test_n (j, getpagesize () - j - 1, i, ULONG_MAX - i, 0, 127, 0);
           do_test_n (j, getpagesize () - j - 1, i, ULONG_MAX - i, 0, 127, 1);
           do_test_n (j, getpagesize () - j - 1, i, ULONG_MAX - i, 0, 127, -1);
+
+          for (k = 2; k <= 128; k += k)
+            {
+              do_test (getpagesize () - k, getpagesize () - j - 1, i - 1, i,
+                       127, 0);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i - 1,
+                       i, 127, 0);
+              do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
+                       127, 0);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
+                       i, 127, 0);
+              do_test (getpagesize () - k, getpagesize () - j - 1, i, i, 127,
+                       0);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, i,
+                       127, 0);
+              do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
+                       127, -1);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
+                       i, 127, -1);
+              do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
+                       127, 1);
+              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
+                       i, 127, 1);
+            }
+
           if (i < 32)
             {
               i += 1;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (18 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 19:03   ` H.J. Lu
                     ` (3 more replies)
  2022-03-23 21:57 ` [PATCH v1 22/23] x86: Add EVEX " Noah Goldstein
                   ` (2 subsequent siblings)
  22 siblings, 4 replies; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702

All string/memory tests pass.
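
The case conversion is the same add/compare/and-not/add sequence as
the SSE versions, widened to ymm registers.  A rough intrinsics
rendering of the TOLOWER_ymm macro below (compile with -mavx2; names
are illustrative, not glibc internals):

#include <immintrin.h>
#include <stdio.h>

static __m256i
tolower_ymm (__m256i v)
{
  const __m256i lcase_min = _mm256_set1_epi8 (0x3f);
  const __m256i lcase_max = _mm256_set1_epi8 ((char) 0x99);
  const __m256i case_add = _mm256_set1_epi8 (0x20);
  __m256i t = _mm256_add_epi8 (v, lcase_min);     /* vpaddb */
  __m256i gt = _mm256_cmpgt_epi8 (t, lcase_max);  /* vpcmpgtb */
  /* vpandn: keep 0x20 where the byte was 'A'..'Z', else 0.  */
  __m256i add = _mm256_andnot_si256 (gt, case_add);
  return _mm256_add_epi8 (v, add);                /* vpaddb */
}

int
main (void)
{
  char buf[32] = "Hello, WORLD! Mixed CASE bytes..";
  __m256i v = _mm256_loadu_si256 ((const __m256i *) buf);
  _mm256_storeu_si256 ((__m256i *) buf, tolower_ymm (v));
  fwrite (buf, 1, 32, stdout);
  putchar ('\n');
  return 0;
}
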
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, AVX2 Time / SSE42 Time
     1,      1,      1,      127,                  1.032
     2,      2,      2,      127,                  1.006
     3,      3,      3,      127,                  1.009
     4,      4,      4,      127,                  0.964
     5,      5,      5,      127,                  0.929
     6,      6,      6,      127,                   0.94
     7,      7,      7,      127,                  0.958
     8,      0,      0,      127,                  0.988
     9,      1,      1,      127,                   0.99
    10,      2,      2,      127,                  0.995
    11,      3,      3,      127,                  0.991
    12,      4,      4,      127,                  0.975
    13,      5,      5,      127,                  0.943
    14,      6,      6,      127,                  0.955
    15,      7,      7,      127,                  0.988
     4,      0,      0,      127,                  0.983
     4,      0,      0,      254,                  0.978
     8,      0,      0,      254,                  0.989
    16,      0,      0,      127,                  0.792
    16,      0,      0,      254,                  0.774
    32,      0,      0,      127,                  0.568
    32,      0,      0,      254,                  0.555
    64,      0,      0,      127,                  0.561
    64,      0,      0,      254,                  0.561
   128,      0,      0,      127,                  0.574
   128,      0,      0,      254,                  0.577
   256,      0,      0,      127,                  0.561
   256,      0,      0,      254,                  0.552
   512,      0,      0,      127,                   0.59
   512,      0,      0,      254,                  0.594
  1024,      0,      0,      127,                  0.528
  1024,      0,      0,      254,                  0.517
    16,      1,      2,      127,                  0.758
    16,      2,      1,      254,                  0.748
    32,      2,      4,      127,                  0.419
    32,      4,      2,      254,                  0.428
    64,      3,      6,      127,                  0.472
    64,      6,      3,      254,                  0.464
   128,      4,      0,      127,                  0.534
   128,      0,      4,      254,                   0.53
   256,      5,      2,      127,                  0.679
   256,      2,      5,      254,                  0.676
   512,      6,      4,      127,                  0.525
   512,      4,      6,      254,                  0.523
  1024,      7,      6,      127,                  0.518
  1024,      6,      7,      254,                  0.505

 sysdeps/x86_64/multiarch/Makefile             |   4 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  28 +++
 sysdeps/x86_64/multiarch/ifunc-strcasecmp.h   |  12 +
 .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S  |  15 ++
 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S  |  23 ++
 sysdeps/x86_64/multiarch/strcmp-avx2.S        | 230 +++++++++++++++---
 .../x86_64/multiarch/strncase_l-avx2-rtm.S    |  16 ++
 sysdeps/x86_64/multiarch/strncase_l-avx2.S    |  27 ++
 8 files changed, 324 insertions(+), 31 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7b413edad..06e1848823 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -55,6 +55,8 @@ sysdep_routines += \
   stpncpy-sse2-unaligned \
   stpncpy-ssse3 \
   strcasecmp_l-avx \
+  strcasecmp_l-avx2 \
+  strcasecmp_l-avx2-rtm \
   strcasecmp_l-sse2 \
   strcasecmp_l-sse4_2 \
   strcasecmp_l-ssse3 \
@@ -93,6 +95,8 @@ sysdep_routines += \
   strlen-evex \
   strlen-sse2 \
   strncase_l-avx \
+  strncase_l-avx2 \
+  strncase_l-avx2-rtm \
   strncase_l-sse2 \
   strncase_l-sse4_2 \
   strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a594f4176e..3c556d07ac 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strcasecmp_avx)
@@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strcasecmp_l_avx)
@@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strncasecmp_avx)
@@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strncasecmp_l_avx)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 9e3cc61ac0..c4de111fd0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+        return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+        return OPTIMIZE (avx2);
+    }
+
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
     return OPTIMIZE (avx);
 
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
new file mode 100644
index 0000000000..09957fc3c5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
@@ -0,0 +1,15 @@
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
new file mode 100644
index 0000000000..e2762f2a22
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 86a86b68e3..eeb90a0da6 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -20,6 +20,10 @@
 
 # include <sysdep.h>
 
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
+
 # ifndef STRCMP
 #  define STRCMP	__strcmp_avx2
 # endif
@@ -74,13 +78,88 @@
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_avx2
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_avx2
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
 # define xmmZERO	xmm15
 # define ymmZERO	ymm15
 
+# define LCASE_MIN_ymm	%ymm10
+# define LCASE_MAX_ymm	%ymm11
+# define CASE_ADD_ymm	%ymm12
+
+# define LCASE_MIN_xmm	%xmm10
+# define LCASE_MAX_xmm	%xmm11
+# define CASE_ADD_xmm	%xmm12
+
+	/* r11 is never used elsewhere so this is safe to maintain.  */
+# define TOLOWER_BASE	%r11
+
 # ifndef SECTION
 #  define SECTION(p)	p##.avx
 # endif
 
+# ifdef USE_AS_STRCASECMP_L
+#  define REG(x, y) x ## y
+#  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext)			\
+	vpaddb	REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8);				\
+	vpaddb	REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9);				\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpaddb	REG(%ext, 8), reg1_in, reg1_out;							\
+	vpaddb	REG(%ext, 9), reg2_in, reg2_out
+
+#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_ymm(...)	TOLOWER(__VA_ARGS__, ymm)
+#  define TOLOWER_xmm(...)	TOLOWER(__VA_ARGS__, xmm)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext)			\
+	TOLOWER	(s1_reg, scratch_reg, s2_reg, s2_reg, ext);					\
+	VPCMPEQ	scratch_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext)			\
+	VMOVU	s2_mem, reg_out;											\
+	CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+#  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+#  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_ymm(...)
+#  define TOLOWER_xmm(...)
+
+#  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out)			\
+	VPCMPEQ	s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+#  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
            strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -102,7 +181,45 @@
    returned.  */
 
 	.section SECTION(.text), "ax", @progbits
-ENTRY(STRCMP)
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifndef GLABEL
+#  define GLABEL(...)	__VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
+END (GLABEL(STRCASECMP))
+	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
+#  endif
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
 # ifdef USE_AS_STRNCMP
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
@@ -128,6 +245,30 @@ ENTRY(STRCMP)
 #  endif
 # endif
 	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa	L(lcase_min)(%rip), LCASE_MIN_ymm
+	vmovdqa	L(lcase_max)(%rip), LCASE_MAX_ymm
+	vmovdqa	L(case_add)(%rip), CASE_ADD_ymm
+# endif
 	movl	%edi, %eax
 	orl	%esi, %eax
 	sall	$20, %eax
@@ -138,8 +279,10 @@ ENTRY(STRCMP)
 L(no_page_cross):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	(%rdi), %ymm0
-	/* 1s where s1 and s2 equal.  */
-	VPCMPEQ	(%rsi), %ymm0, %ymm1
+	/* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
+	   Otherwise converts ymm0 and the load from rsi to lowercase.
+	   ymm2 is scratch and ymm1 is the return.  */
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
 	/* 1s at null CHAR.  */
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	/* 1s where s1 and s2 equal AND not null CHAR.  */
@@ -172,6 +315,8 @@ L(return_vec_0):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret0):
@@ -207,6 +352,8 @@ L(one_or_less):
 #  else
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret1):
@@ -234,6 +381,8 @@ L(return_vec_1):
 # else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret2):
@@ -265,6 +414,8 @@ L(return_vec_2):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret3):
@@ -285,6 +436,8 @@ L(return_vec_3):
 #  else
 	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret4):
@@ -295,7 +448,7 @@ L(ret4):
 L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	VEC_SIZE(%rdi), %ymm0
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -308,7 +461,7 @@ L(more_3x_vec):
 # endif
 
 	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -316,7 +469,7 @@ L(more_3x_vec):
 	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -391,12 +544,10 @@ L(loop_skip_page_cross_check):
 	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
 
 	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
-	VPCMPEQ	(VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
-
-	VPCMPEQ	(VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
-
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+	CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
 
 	/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
 	   zero.  */
@@ -465,6 +616,8 @@ L(return_vec_2_3_end):
 # else
 	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
 	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -508,6 +661,8 @@ L(return_vec_0_end):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -530,6 +685,8 @@ L(return_vec_1_end):
 #  else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -556,6 +713,8 @@ L(return_vec_2_end):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -583,7 +742,7 @@ L(page_cross_during_loop):
 	jle	L(less_1x_vec_till_page_cross)
 
 	VMOVA	(%rdi), %ymm0
-	VPCMPEQ	(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -605,7 +764,7 @@ L(less_1x_vec_till_page_cross):
 	   here, it means the previous page (rdi - VEC_SIZE) has already
 	   been loaded earlier so must be valid.  */
 	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
-	VPCMPEQ	-VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -647,6 +806,8 @@ L(return_page_cross_cmp_mem):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -673,7 +834,7 @@ L(more_2x_vec_till_page_cross):
 	   iteration here.  */
 
 	VMOVU	VEC_SIZE(%rdi), %ymm0
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -689,7 +850,7 @@ L(more_2x_vec_till_page_cross):
 
 	/* Safe to include comparisons from lower bytes.  */
 	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
-	VPCMPEQ	-(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -697,7 +858,7 @@ L(more_2x_vec_till_page_cross):
 	jnz	L(return_vec_page_cross_0)
 
 	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
-	VPCMPEQ	-(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -715,8 +876,8 @@ L(more_2x_vec_till_page_cross):
 	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
 	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
 
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
 	vpand	%ymm4, %ymm5, %ymm5
 	vpand	%ymm6, %ymm7, %ymm7
 	VPMINU	%ymm5, %ymm7, %ymm7
@@ -767,6 +928,8 @@ L(return_vec_page_cross_1):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -822,7 +985,7 @@ L(page_cross):
 L(page_cross_loop):
 
 	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -840,11 +1003,11 @@ L(page_cross_loop):
 	subl	%eax, %OFFSET_REG
 	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
 	   to not cross page so is safe to load. Since we have already
-	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
-	 */
+	   loaded at least 1 VEC from rsi it is also guaranteed to be
+	   safe.  */
 
 	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -877,6 +1040,8 @@ L(ret_vec_page_cross_cont):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -930,7 +1095,7 @@ L(less_1x_vec_till_page):
 	ja	L(less_16_till_page)
 
 	VMOVU	(%rdi), %xmm0
-	VPCMPEQ	(%rsi), %xmm0, %xmm1
+	CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
@@ -948,7 +1113,7 @@ L(less_1x_vec_till_page):
 # endif
 
 	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %xmm0, %xmm1
+	CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
@@ -986,7 +1151,7 @@ L(less_16_till_page):
 	vmovq	(%rdi), %xmm0
 	vmovq	(%rsi), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	incb	%cl
@@ -1006,7 +1171,7 @@ L(less_16_till_page):
 	vmovq	(%rdi, %OFFSET_REG64), %xmm0
 	vmovq	(%rsi, %OFFSET_REG64), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	incb	%cl
@@ -1062,7 +1227,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi), %xmm0
 	vmovd	(%rsi), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	subl	$0xf, %ecx
@@ -1081,7 +1246,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi, %OFFSET_REG64), %xmm0
 	vmovd	(%rsi, %OFFSET_REG64), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	subl	$0xf, %ecx
@@ -1115,7 +1280,9 @@ L(less_4_till_page):
 L(less_4_loop):
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi, %rdi), %ecx
-	subl	%ecx, %eax
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
 	jnz	L(ret_less_4_loop)
 	testl	%ecx, %ecx
 	jz	L(ret_zero_4_loop)
@@ -1142,5 +1309,6 @@ L(ret_less_4_loop):
 	subl	%r8d, %eax
 	ret
 # endif
-END(STRCMP)
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
new file mode 100644
index 0000000000..e194936c36
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
@@ -0,0 +1,16 @@
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+#define OVERFLOW_STRCMP	__strcasecmp_avx2_rtm
+
+#include "strncase_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
new file mode 100644
index 0000000000..29afccbcc5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
@@ -0,0 +1,27 @@
+/* strncasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP	__strcasecmp_avx2
+#endif
+#include "strcmp-avx2.S"
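
The wrapper above (and its RTM twin) shows how one template serves the
whole family: everything is selected with preprocessor defines, and
OVERFLOW_STRCMP names the unbounded routine that the bounded entry
tail-calls when the length argument is too large to bound any real
buffer. A minimal scalar model of that dispatch, with hypothetical
helper names and plain C-locale tolower in place of the locale table
lookup the assembly performs:

#include <ctype.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-in for the unbounded routine (OVERFLOW_STRCMP).  */
static int
strcasecmp_model (const char *s1, const char *s2)
{
  unsigned char c1, c2;
  do
    {
      c1 = tolower ((unsigned char) *s1++);
      c2 = tolower ((unsigned char) *s2++);
    }
  while (c1 != '\0' && c1 == c2);
  return c1 - c2;
}

static int
strncasecmp_model (const char *s1, const char *s2, size_t n)
{
  if (n == 0)
    return 0;
  /* A length with the sign bit set cannot bound an object, so the
     assembly tail-calls the unbounded variant ('jnbe OVERFLOW_STRCMP'
     behind the signed 'jle L(one_or_less)' filter).  */
  if (n > (SIZE_MAX >> 1))
    return strcasecmp_model (s1, s2);
  unsigned char c1, c2;
  do
    {
      c1 = tolower ((unsigned char) *s1++);
      c2 = tolower ((unsigned char) *s2++);
    }
  while (--n != 0 && c1 != '\0' && c1 == c2);
  return c1 - c2;
}

int
main (void)
{
  /* Huge n takes the unbounded path; equal strings return 0.  */
  return strncasecmp_model ("GLIBC", "glibc", SIZE_MAX);
}

Dispatching huge lengths this way is safe because no object can span
half the address space, so the bounded and unbounded results must
agree, and it keeps the length bookkeeping out of the hot loop.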
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 22/23] x86: Add EVEX optimized str{n}casecmp
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (19 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 19:04   ` H.J. Lu
  2022-03-23 21:57 ` [PATCH v1 23/23] x86: Remove AVX str{n}casecmp Noah Goldstein
  2022-03-24 18:43 ` [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c H.J. Lu
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

geometric_mean(N=40) of all benchmarks EVEX / SSE42: 0.621

All string/memory tests pass.
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, EVEX Time / SSE42 Time
     1,      1,      1,      127,                  0.871
     2,      2,      2,      127,                  0.833
     3,      3,      3,      127,                  0.851
     4,      4,      4,      127,                  0.824
     5,      5,      5,      127,                  0.791
     6,      6,      6,      127,                  0.789
     7,      7,      7,      127,                  0.804
     8,      0,      0,      127,                  0.838
     9,      1,      1,      127,                  0.837
    10,      2,      2,      127,                  0.834
    11,      3,      3,      127,                  0.839
    12,      4,      4,      127,                  0.844
    13,      5,      5,      127,                  0.796
    14,      6,      6,      127,                  0.811
    15,      7,      7,      127,                  0.838
     4,      0,      0,      127,                   0.84
     4,      0,      0,      254,                  0.823
     8,      0,      0,      254,                  0.838
    16,      0,      0,      127,                  0.669
    16,      0,      0,      254,                  0.656
    32,      0,      0,      127,                  0.488
    32,      0,      0,      254,                  0.484
    64,      0,      0,      127,                  0.492
    64,      0,      0,      254,                  0.502
   128,      0,      0,      127,                  0.508
   128,      0,      0,      254,                  0.497
   256,      0,      0,      127,                  0.574
   256,      0,      0,      254,                  0.581
   512,      0,      0,      127,                  0.573
   512,      0,      0,      254,                  0.577
  1024,      0,      0,      127,                  0.489
  1024,      0,      0,      254,                  0.485
    16,      1,      2,      127,                  0.655
    16,      2,      1,      254,                  0.646
    32,      2,      4,      127,                  0.368
    32,      4,      2,      254,                  0.376
    64,      3,      6,      127,                  0.428
    64,      6,      3,      254,                  0.426
   128,      4,      0,      127,                  0.478
   128,      0,      4,      254,                  0.473
   256,      5,      2,      127,                   0.65
   256,      2,      5,      254,                  0.654
   512,      6,      4,      127,                  0.492
   512,      4,      6,      254,                  0.489
  1024,      7,      6,      127,                  0.463
  1024,      6,      7,      254,                  0.457
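
Both the 0.621 headline and each row above are ratios of mean times,
so a geometric mean is the natural aggregate for them. An illustrative
computation of such a summary in plain C (not the actual scripts used
to produce these numbers; link with -lm):

#include <math.h>
#include <stddef.h>
#include <stdio.h>

/* exp(mean(log r_i)) is the nth root of the product: the right
   average for multiplicative data such as time ratios.  */
static double
geometric_mean (const double *r, size_t n)
{
  double sum = 0.0;
  for (size_t i = 0; i < n; i++)
    sum += log (r[i]);
  return exp (sum / (double) n);
}

int
main (void)
{
  /* First few EVEX Time / SSE42 Time rows from the table above.  */
  double r[] = { 0.871, 0.833, 0.851, 0.824, 0.791 };
  printf ("%.3f\n", geometric_mean (r, sizeof r / sizeof r[0]));
  return 0;
}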

 sysdeps/x86_64/multiarch/Makefile            |   2 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  16 ++
 sysdeps/x86_64/multiarch/ifunc-strcasecmp.h  |   5 +
 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S |  23 ++
 sysdeps/x86_64/multiarch/strcmp-evex.S       | 280 ++++++++++++++++---
 sysdeps/x86_64/multiarch/strncase_l-evex.S   |  25 ++
 6 files changed, 314 insertions(+), 37 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 06e1848823..35d80dc2ff 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -57,6 +57,7 @@ sysdep_routines += \
   strcasecmp_l-avx \
   strcasecmp_l-avx2 \
   strcasecmp_l-avx2-rtm \
+  strcasecmp_l-evex \
   strcasecmp_l-sse2 \
   strcasecmp_l-sse4_2 \
   strcasecmp_l-ssse3 \
@@ -97,6 +98,7 @@ sysdep_routines += \
   strncase_l-avx \
   strncase_l-avx2 \
   strncase_l-avx2-rtm \
+  strncase_l-evex \
   strncase_l-sse2 \
   strncase_l-sse4_2 \
   strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 3c556d07ac..f1a4d3dac2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcasecmp_evex)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strcasecmp_avx2)
@@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcasecmp_l_evex)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strcasecmp_l_avx2)
@@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncasecmp_evex)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strncasecmp_avx2)
@@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncasecmp_l_evex)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strncasecmp_l_avx2)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index c4de111fd0..bf0d146e7f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
@@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+        return OPTIMIZE (evex);
+
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
         return OPTIMIZE (avx2_rtm);
 
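
The net effect on dispatch: once the AVX2 preconditions hold,
AVX512VL plus AVX512BW now selects the EVEX build ahead of both AVX2
paths. A paraphrase of the selector's shape in plain C (illustrative
only: the real header uses glibc's CPU_FEATURE_USABLE_P and
CPU_FEATURES_ARCH_P macros, and until the last patch in this series it
still has an AVX branch between the AVX2 block and SSE4.2):

#include <stdio.h>

/* Flags model CPU feature bits; names are illustrative.  */
static const char *
pick_strcasecmp (int avx2, int fast_unaligned_load, int avx512vl,
                 int avx512bw, int rtm, int sse4_2, int slow_sse4_2,
                 int ssse3)
{
  if (avx2 && fast_unaligned_load)
    {
      if (avx512vl && avx512bw)
        return "__strcasecmp_evex";      /* new in this patch */
      if (rtm)
        return "__strcasecmp_avx2_rtm";
      return "__strcasecmp_avx2";
    }
  if (sse4_2 && !slow_sse4_2)
    return "__strcasecmp_sse42";
  if (ssse3)
    return "__strcasecmp_ssse3";
  return "__strcasecmp_sse2";
}

int
main (void)
{
  /* A Skylake-X-like feature set picks the EVEX variant.  */
  puts (pick_strcasecmp (1, 1, 1, 1, 0, 1, 0, 1));
  return 0;
}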
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
new file mode 100644
index 0000000000..58642db748
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with EVEX.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_evex
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 56d8c118e4..85afd6535f 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -19,6 +19,9 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
 
 # ifndef STRCMP
 #  define STRCMP	__strcmp_evex
@@ -34,19 +37,29 @@
 # define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSCMP
-#  define TESTEQ	subl	$0xff,
+#  ifndef OVERFLOW_STRCMP
+#   define OVERFLOW_STRCMP	__wcscmp_evex
+#  endif
+
+#  define TESTEQ	subl $0xff,
 	/* Compare packed dwords.  */
 #  define VPCMP	vpcmpd
 #  define VPMINU	vpminud
 #  define VPTESTM	vptestmd
+#  define VPTESTNM	vptestnmd
 	/* 1 dword char == 4 bytes.  */
 #  define SIZE_OF_CHAR	4
 # else
+#  ifndef OVERFLOW_STRCMP
+#   define OVERFLOW_STRCMP	__strcmp_evex
+#  endif
+
 #  define TESTEQ	incl
 	/* Compare packed bytes.  */
 #  define VPCMP	vpcmpb
 #  define VPMINU	vpminub
 #  define VPTESTM	vptestmb
+#  define VPTESTNM	vptestnmb
 	/* 1 byte char == 1 byte.  */
 #  define SIZE_OF_CHAR	1
 # endif
@@ -73,11 +86,16 @@
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
-# define XMMZERO	xmm16
 # define XMM0	xmm17
 # define XMM1	xmm18
 
-# define YMMZERO	ymm16
+# define XMM10	xmm27
+# define XMM11	xmm28
+# define XMM12	xmm29
+# define XMM13	xmm30
+# define XMM14	xmm31
+
+
 # define YMM0	ymm17
 # define YMM1	ymm18
 # define YMM2	ymm19
@@ -89,6 +107,87 @@
 # define YMM8	ymm25
 # define YMM9	ymm26
 # define YMM10	ymm27
+# define YMM11	ymm28
+# define YMM12	ymm29
+# define YMM13	ymm30
+# define YMM14	ymm31
+
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_evex
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_evex
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
+# define LCASE_MIN_YMM	%YMM12
+# define LCASE_MAX_YMM	%YMM13
+# define CASE_ADD_YMM	%YMM14
+
+# define LCASE_MIN_XMM	%XMM12
+# define LCASE_MAX_XMM	%XMM13
+# define CASE_ADD_XMM	%XMM14
+
+	/* NB: wcsncmp uses r11 but strcasecmp is never used in
+	   conjunction with wcscmp.  */
+# define TOLOWER_BASE	%r11
+
+# ifdef USE_AS_STRCASECMP_L
+#  define _REG(x, y) x ## y
+#  define REG(x, y) _REG(x, y)
+#  define TOLOWER(reg1, reg2, ext)										\
+	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);					\
+	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);					\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;				\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;				\
+	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};						\
+	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}
+
+#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM)
+#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)						\
+	TOLOWER	(s1_reg, s2_reg, ext);										\
+	VPCMP	$0, s1_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)				\
+	VMOVU	s2_mem, s2_reg;												\
+	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
+
+#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
+#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
+
+#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
+#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_YMM(...)
+#  define TOLOWER_XMM(...)
+
+#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)						\
+	VPCMP	$0, s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+
+#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)				\
+	VPCMP	$0, s2_mem, s1_reg, reg_out
+
+#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+# endif
 
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
@@ -112,7 +211,41 @@
    returned.  */
 
 	.section .text.evex, "ax", @progbits
-ENTRY(STRCMP)
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (STRCASECMP)
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
+END (STRCASECMP)
+	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
+#  endif
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
 # ifdef USE_AS_STRNCMP
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
@@ -125,6 +258,32 @@ ENTRY(STRCMP)
 	   actually bound the buffer.  */
 	jle	L(one_or_less)
 # endif
+
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+L(lcase_max):
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
+	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
+	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
+# endif
+
 	movl	%edi, %eax
 	orl	%esi, %eax
 	/* Shift out the bits irrelevant to page boundary ([63:12]).  */
@@ -139,7 +298,7 @@ L(no_page_cross):
 	VPTESTM	%YMM0, %YMM0, %k2
 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 	   in YMM0 and 32 bytes at (%rsi).  */
-	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_STRNCMP
 	cmpq	$CHAR_PER_VEC, %rdx
@@ -169,6 +328,8 @@ L(return_vec_0):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret0):
@@ -192,7 +353,7 @@ L(one_or_less):
 #  ifdef USE_AS_WCSCMP
 	/* 'nbe' covers the case where length is negative (large
 	   unsigned).  */
-	jnbe	__wcscmp_evex
+	jnbe	OVERFLOW_STRCMP
 	movl	(%rdi), %edx
 	xorl	%eax, %eax
 	cmpl	(%rsi), %edx
@@ -203,9 +364,11 @@ L(one_or_less):
 #  else
 	/* 'nbe' covers the case where length is negative (large
 	   unsigned).  */
-	jnbe	__strcmp_evex
+	jnbe	OVERFLOW_STRCMP
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret1):
@@ -233,6 +396,8 @@ L(return_vec_1):
 # else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret2):
@@ -270,6 +435,8 @@ L(return_vec_2):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret3):
@@ -290,6 +457,8 @@ L(return_vec_3):
 #  else
 	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret4):
@@ -303,7 +472,7 @@ L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	(VEC_SIZE)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1)
@@ -315,14 +484,14 @@ L(more_3x_vec):
 
 	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_3)
@@ -381,7 +550,6 @@ L(prepare_loop_aligned):
 	subl	%esi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 
-	vpxorq	%YMMZERO, %YMMZERO, %YMMZERO
 
 	/* Loop 4x comparisons at a time.  */
 	.p2align 4
@@ -413,22 +581,35 @@ L(loop_skip_page_cross_check):
 	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
 	VPMINU	%YMM8, %YMM9, %YMM9
 
-	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
+	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
 	VPTESTM	%YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
 	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
 	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
 	   oring with YMM1. Result is stored in YMM6.  */
 	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
-
+# else
+	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
+	TOLOWER_YMM (%YMM0, %YMM1)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
+	TOLOWER_YMM (%YMM2, %YMM3)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
+	TOLOWER_YMM (%YMM4, %YMM5)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
+	TOLOWER_YMM (%YMM6, %YMM7)
+	vpxorq	%YMM0, %YMM1, %YMM1
+	vpxorq	%YMM2, %YMM3, %YMM3
+	vpxorq	%YMM4, %YMM5, %YMM5
+	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+# endif
 	/* Or together YMM3, YMM5, and YMM6.  */
 	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
 
 
 	/* A non-zero CHAR in YMM6 represents a mismatch.  */
-	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
+	VPTESTNM %YMM6, %YMM6, %k0{%k1}
 	kmovd	%k0, %LOOP_REG
 
 	TESTEQ	%LOOP_REG
@@ -437,13 +618,13 @@ L(loop_skip_page_cross_check):
 
 	/* Find which VEC has the mismatch of end of string.  */
 	VPTESTM	%YMM0, %YMM0, %k1
-	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
+	VPTESTNM %YMM1, %YMM1, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_0_end)
 
 	VPTESTM	%YMM2, %YMM2, %k1
-	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
+	VPTESTNM %YMM3, %YMM3, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1_end)
@@ -457,7 +638,7 @@ L(return_vec_2_3_end):
 # endif
 
 	VPTESTM	%YMM4, %YMM4, %k1
-	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
+	VPTESTNM %YMM5, %YMM5, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 # if CHAR_PER_VEC <= 16
@@ -493,6 +674,8 @@ L(return_vec_3_end):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -545,6 +728,8 @@ L(return_vec_0_end):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
 	   logic. Subtract `r8d` after xor for zero case.  */
@@ -569,6 +754,8 @@ L(return_vec_1_end):
 #  else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -598,7 +785,7 @@ L(page_cross_during_loop):
 
 	VMOVA	(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_0_end)
@@ -619,8 +806,7 @@ L(less_1x_vec_till_page_cross):
 	   been loaded earlier so must be valid.  */
 	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
-
+	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
 	/* Mask of potentially valid bits. The lower bits can come from
 	   out-of-range comparisons (but are safe regarding page crosses).  */
 
@@ -642,6 +828,8 @@ L(less_1x_vec_till_page_cross):
 
 # ifdef USE_AS_STRNCMP
 #  ifdef USE_AS_WCSCMP
+	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
+	   safe.  */
 	movl	%eax, %r11d
 	shrl	$2, %r11d
 	cmpq	%r11, %rdx
@@ -679,6 +867,8 @@ L(return_page_cross_cmp_mem):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -709,7 +899,7 @@ L(more_2x_vec_till_page_cross):
 
 	VMOVA	VEC_SIZE(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1_end)
@@ -724,14 +914,14 @@ L(more_2x_vec_till_page_cross):
 	/* Safe to include comparisons from lower bytes.  */
 	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_page_cross_0)
 
 	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_page_cross_1)
@@ -740,6 +930,8 @@ L(more_2x_vec_till_page_cross):
 	/* Must check length here as length might preclude reading next
 	   page.  */
 #  ifdef USE_AS_WCSCMP
+	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
+	   safe.  */
 	movl	%eax, %r11d
 	shrl	$2, %r11d
 	cmpq	%r11, %rdx
@@ -754,12 +946,19 @@ L(more_2x_vec_till_page_cross):
 	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
 	VPMINU	%YMM4, %YMM6, %YMM9
 	VPTESTM	%YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
 	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
 	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
 	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
-
-	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
+# else
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
+	TOLOWER_YMM (%YMM4, %YMM5)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
+	TOLOWER_YMM (%YMM6, %YMM7)
+	vpxorq	%YMM4, %YMM5, %YMM5
+	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
+# endif
+	VPTESTNM %YMM6, %YMM6, %k0{%k1}
 	kmovd	%k0, %LOOP_REG
 	TESTEQ	%LOOP_REG
 	jnz	L(return_vec_2_3_end)
@@ -815,6 +1014,8 @@ L(return_vec_page_cross_1):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -871,7 +1072,7 @@ L(page_cross):
 L(page_cross_loop):
 	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -895,7 +1096,7 @@ L(page_cross_loop):
 	 */
 	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
 
 	kmovd	%k1, %ecx
 # ifdef USE_AS_STRNCMP
@@ -930,6 +1131,8 @@ L(ret_vec_page_cross_cont):
 # else
 	movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax
 	movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -989,7 +1192,7 @@ L(less_1x_vec_till_page):
 	/* Use 16 byte comparison.  */
 	vmovdqu	(%rdi), %xmm0
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, (%rsi), %xmm0, %k1{%k2}
+	CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0xf, %ecx
@@ -1009,7 +1212,7 @@ L(less_1x_vec_till_page):
 # endif
 	vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
+	CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0xf, %ecx
@@ -1048,7 +1251,7 @@ L(less_16_till_page):
 	vmovq	(%rdi), %xmm0
 	vmovq	(%rsi), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0x3, %ecx
@@ -1068,7 +1271,7 @@ L(less_16_till_page):
 	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0x3, %ecx
@@ -1128,7 +1331,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi), %xmm0
 	vmovd	(%rsi), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 	subl	$0xf, %ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -1143,7 +1346,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 	subl	$0xf, %ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -1176,7 +1379,9 @@ L(less_4_till_page):
 L(less_4_loop):
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi, %rdi), %ecx
-	subl	%ecx, %eax
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
 	jnz	L(ret_less_4_loop)
 	testl	%ecx, %ecx
 	jz	L(ret_zero_4_loop)
@@ -1203,5 +1408,6 @@ L(ret_less_4_loop):
 	subl	%r8d, %eax
 	ret
 # endif
-END(STRCMP)
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
 #endif
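
The heart of the case-insensitive path above is the branchless ASCII
fold behind TOLOWER: vpsubb subtracts 0x41 ('A'), vpcmpub with
predicate 1 (unsigned less-than) against 0x1a (26) builds a mask of
the lanes holding 'A'..'Z', and a mask-merged vpaddb adds 0x20 only in
those lanes. A scalar model of what each byte lane computes (the
scalar tails instead index the _nl_C_LC_CTYPE_tolower table through
TOLOWER_gpr):

#include <stdio.h>

/* One unsigned compare covers both bounds: (c - 'A') wraps around for
   anything below 'A', so only 'A'..'Z' lands in [0, 26).  */
static unsigned char
tolower_ascii (unsigned char c)
{
  if ((unsigned char) (c - 0x41) < 0x1a)
    c += 0x20;
  return c;
}

int
main (void)
{
  /* 'A' -> 'a'; 'z' and '@' pass through unchanged.  */
  printf ("%c %c %c\n", tolower_ascii ('A'), tolower_ascii ('z'),
          tolower_ascii ('@'));
  return 0;
}

The loop body also swaps the old VPCMP-against-zero for VPTESTNM and
leans on vpternlogd: reading the operands as dst, src1, src2,
immediate 0xde evaluates src1 | (dst ^ src2) per bit, folding a
compare-accumulate into one instruction, while 0xfe is the plain OR of
all three.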
diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
new file mode 100644
index 0000000000..b0808c1b21
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
@@ -0,0 +1,25 @@
+/* strncasecmp_l optimized with EVEX.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_evex
+#endif
+#define OVERFLOW_STRCMP	__strcasecmp_evex
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#include "strcmp-evex.S"
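
One detail of strcmp-evex.S worth spelling out: the SIMD path can only
fold ASCII, so the entry sequence tests a flag in the locale's
LC_CTYPE data (_NL_CTYPE_NONASCII_CASE) and bails out to the generic C
routine first. A rough model of that gate; the struct and function
names below are invented to mirror the shape of the check, not
glibc's real internals:

#include <string.h>

struct ctype_info_model { int nonascii_case; };
struct locale_model { struct ctype_info_model *ctype; };

/* Placeholder for __strcasecmp_l_nonascii; the real routine folds
   case through the locale's tables.  */
static int
nonascii_fallback (const char *s1, const char *s2, struct locale_model *l)
{
  (void) l;
  return strcmp (s1, s2);
}

static int
strcasecmp_l_model (const char *s1, const char *s2, struct locale_model *l)
{
  /* Models: testl $1, ...; jne STRCASECMP_NONASCII.  */
  if (l->ctype->nonascii_case)
    return nonascii_fallback (s1, s2, l);
  /* ASCII-only vector path runs here; strcmp stands in for it.  */
  return strcmp (s1, s2);
}

int
main (void)
{
  struct ctype_info_model ct = { 0 };
  struct locale_model loc = { &ct };
  return strcasecmp_l_model ("abc", "abc", &loc);
}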
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v1 23/23] x86: Remove AVX str{n}casecmp
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (20 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 22/23] x86: Add EVEX " Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
  2022-03-24 19:04   ` H.J. Lu
  2022-03-24 18:43 ` [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c H.J. Lu
  22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
  To: libc-alpha

The rationale is:

1. SSE42 has nearly identical logic, so any benefit is minimal (3.4%
   regression on Tigerlake using SSE42 versus AVX across the
   benchtest suite).
2. The AVX2 version covers the majority of targets that previously
   preferred it.
3. The targets where AVX would still be best (SnB and IVB) are
   becoming outdated.

All in all, the code size savings are worth it.

All string/memory tests pass.
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, AVX Time / SSE42 Time
     1,      1,      1,      127,                 0.928
     2,      2,      2,      127,                 0.934
     3,      3,      3,      127,                 0.975
     4,      4,      4,      127,                  0.96
     5,      5,      5,      127,                 0.935
     6,      6,      6,      127,                 0.929
     7,      7,      7,      127,                 0.959
     8,      0,      0,      127,                 0.955
     9,      1,      1,      127,                 0.944
    10,      2,      2,      127,                 0.975
    11,      3,      3,      127,                 0.935
    12,      4,      4,      127,                 0.931
    13,      5,      5,      127,                 0.926
    14,      6,      6,      127,                 0.901
    15,      7,      7,      127,                 0.951
     4,      0,      0,      127,                 0.958
     4,      0,      0,      254,                 0.956
     8,      0,      0,      254,                 0.977
    16,      0,      0,      127,                 0.955
    16,      0,      0,      254,                 0.953
    32,      0,      0,      127,                 0.943
    32,      0,      0,      254,                 0.941
    64,      0,      0,      127,                 0.941
    64,      0,      0,      254,                 0.955
   128,      0,      0,      127,                 0.972
   128,      0,      0,      254,                 0.975
   256,      0,      0,      127,                 0.996
   256,      0,      0,      254,                 0.993
   512,      0,      0,      127,                 0.992
   512,      0,      0,      254,                 0.986
  1024,      0,      0,      127,                 0.994
  1024,      0,      0,      254,                 0.993
    16,      1,      2,      127,                 0.933
    16,      2,      1,      254,                 0.953
    32,      2,      4,      127,                 0.927
    32,      4,      2,      254,                 0.986
    64,      3,      6,      127,                 0.991
    64,      6,      3,      254,                 1.014
   128,      4,      0,      127,                 1.001
   128,      0,      4,      254,                 0.991
   256,      5,      2,      127,                 1.011
   256,      2,      5,      254,                 1.013
   512,      6,      4,      127,                 1.056
   512,      4,      6,      254,                 0.916
  1024,      7,      6,      127,                 1.059
  1024,      6,      7,      254,                 1.043

 sysdeps/x86_64/multiarch/Makefile           |   2 -
 sysdeps/x86_64/multiarch/ifunc-impl-list.c  |  12 -
 sysdeps/x86_64/multiarch/ifunc-strcasecmp.h |   4 -
 sysdeps/x86_64/multiarch/strcasecmp_l-avx.S |  22 --
 sysdeps/x86_64/multiarch/strcmp-sse42.S     | 240 +++++++++-----------
 sysdeps/x86_64/multiarch/strncase_l-avx.S   |  22 --
 6 files changed, 105 insertions(+), 197 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
 delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 35d80dc2ff..6507d1b7fa 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -54,7 +54,6 @@ sysdep_routines += \
   stpncpy-evex \
   stpncpy-sse2-unaligned \
   stpncpy-ssse3 \
-  strcasecmp_l-avx \
   strcasecmp_l-avx2 \
   strcasecmp_l-avx2-rtm \
   strcasecmp_l-evex \
@@ -95,7 +94,6 @@ sysdep_routines += \
   strlen-avx2-rtm \
   strlen-evex \
   strlen-sse2 \
-  strncase_l-avx \
   strncase_l-avx2 \
   strncase_l-avx2-rtm \
   strncase_l-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f1a4d3dac2..40cc6cc49e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -447,9 +447,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      (CPU_FEATURE_USABLE (AVX2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strcasecmp_avx2_rtm)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp,
-			      CPU_FEATURE_USABLE (AVX),
-			      __strcasecmp_avx)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strcasecmp_sse42)
@@ -471,9 +468,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      (CPU_FEATURE_USABLE (AVX2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strcasecmp_l_avx2_rtm)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
-			      CPU_FEATURE_USABLE (AVX),
-			      __strcasecmp_l_avx)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strcasecmp_l_sse42)
@@ -609,9 +603,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      (CPU_FEATURE_USABLE (AVX2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strncasecmp_avx2_rtm)
-	      IFUNC_IMPL_ADD (array, i, strncasecmp,
-			      CPU_FEATURE_USABLE (AVX),
-			      __strncasecmp_avx)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strncasecmp_sse42)
@@ -634,9 +625,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      (CPU_FEATURE_USABLE (AVX2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strncasecmp_l_avx2_rtm)
-	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
-			      CPU_FEATURE_USABLE (AVX),
-			      __strncasecmp_l_avx)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strncasecmp_l_sse42)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index bf0d146e7f..766539c241 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -22,7 +22,6 @@
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -46,9 +45,6 @@ IFUNC_SELECTOR (void)
         return OPTIMIZE (avx2);
     }
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
-    return OPTIMIZE (avx);
-
   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
       && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
     return OPTIMIZE (sse42);
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
deleted file mode 100644
index 7ec7c21b5a..0000000000
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* strcasecmp_l optimized with AVX.
-   Copyright (C) 2017-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#define STRCMP_SSE42 __strcasecmp_l_avx
-#define USE_AVX 1
-#define USE_AS_STRCASECMP_L
-#include "strcmp-sse42.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index 7805ae9d41..a9178ad25c 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -41,13 +41,8 @@
 # define UPDATE_STRNCMP_COUNTER
 #endif
 
-#ifdef USE_AVX
-# define SECTION	avx
-# define GLABEL(l)	l##_avx
-#else
-# define SECTION	sse4.2
-# define GLABEL(l)	l##_sse42
-#endif
+#define SECTION	sse4.2
+#define GLABEL(l)	l##_sse42
 
 #define LABEL(l)	.L##l
 
@@ -105,21 +100,7 @@ END (GLABEL(__strncasecmp))
 #endif
 
 
-#ifdef USE_AVX
-# define movdqa vmovdqa
-# define movdqu vmovdqu
-# define pmovmskb vpmovmskb
-# define pcmpistri vpcmpistri
-# define psubb vpsubb
-# define pcmpeqb vpcmpeqb
-# define psrldq vpsrldq
-# define pslldq vpslldq
-# define palignr vpalignr
-# define pxor vpxor
-# define D(arg) arg, arg
-#else
-# define D(arg) arg
-#endif
+#define arg arg
 
 STRCMP_SSE42:
 	cfi_startproc
@@ -191,18 +172,7 @@ LABEL(case_add):
 	movdqu	(%rdi), %xmm1
 	movdqu	(%rsi), %xmm2
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# ifdef USE_AVX
-#  define TOLOWER(reg1, reg2) \
-	vpaddb	LCASE_MIN_reg, reg1, %xmm7;					\
-	vpaddb	LCASE_MIN_reg, reg2, %xmm8;					\
-	vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7;					\
-	vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8;					\
-	vpandn	CASE_ADD_reg, %xmm7, %xmm7;					\
-	vpandn	CASE_ADD_reg, %xmm8, %xmm8;					\
-	vpaddb	%xmm7, reg1, reg1;					\
-	vpaddb	%xmm8, reg2, reg2
-# else
-#  define TOLOWER(reg1, reg2) \
+# define TOLOWER(reg1, reg2) \
 	movdqa	LCASE_MIN_reg, %xmm7;					\
 	movdqa	LCASE_MIN_reg, %xmm8;					\
 	paddb	reg1, %xmm7;					\
@@ -213,15 +183,15 @@ LABEL(case_add):
 	pandn	CASE_ADD_reg, %xmm8;					\
 	paddb	%xmm7, reg1;					\
 	paddb	%xmm8, reg2
-# endif
+
 	TOLOWER (%xmm1, %xmm2)
 #else
 # define TOLOWER(reg1, reg2)
 #endif
-	pxor	%xmm0, D(%xmm0)		/* clear %xmm0 for null char checks */
-	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
-	pcmpeqb	%xmm2, D(%xmm1)		/* compare first 16 bytes for equality */
-	psubb	%xmm0, D(%xmm1)		/* packed sub of comparison results*/
+	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
+	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
+	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
 	jnz	LABEL(less16bytes)/* If not, find different value or null char */
@@ -245,7 +215,7 @@ LABEL(crosscache):
 	xor	%r8d, %r8d
 	and	$0xf, %ecx		/* offset of rsi */
 	and	$0xf, %eax		/* offset of rdi */
-	pxor	%xmm0, D(%xmm0)		/* clear %xmm0 for null char check */
+	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char check */
 	cmp	%eax, %ecx
 	je	LABEL(ashr_0)		/* rsi and rdi relative offset same */
 	ja	LABEL(bigger)
@@ -259,7 +229,7 @@ LABEL(bigger):
 	sub	%rcx, %r9
 	lea	LABEL(unaligned_table)(%rip), %r10
 	movslq	(%r10, %r9,4), %r9
-	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
+	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
 	lea	(%r10, %r9), %r10
 	_CET_NOTRACK jmp *%r10		/* jump to corresponding case */
 
@@ -272,15 +242,15 @@ LABEL(bigger):
 LABEL(ashr_0):
 
 	movdqa	(%rsi), %xmm1
-	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
+	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
-	pcmpeqb	(%rdi), D(%xmm1)	/* compare 16 bytes for equality */
+	pcmpeqb	(%rdi), %xmm1		/* compare 16 bytes for equality */
 #else
 	movdqa	(%rdi), %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm2, D(%xmm1)		/* compare 16 bytes for equality */
+	pcmpeqb	%xmm2, %xmm1		/* compare 16 bytes for equality */
 #endif
-	psubb	%xmm0, D(%xmm1)		/* packed sub of comparison results*/
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
 	pmovmskb %xmm1, %r9d
 	shr	%cl, %edx		/* adjust 0xffff for offset */
 	shr	%cl, %r9d		/* adjust for 16-byte offset */
@@ -360,10 +330,10 @@ LABEL(ashr_0_exit_use):
  */
 	.p2align 4
 LABEL(ashr_1):
-	pslldq	$15, D(%xmm2)		/* shift first string to align with second */
+	pslldq	$15, %xmm2		/* shift first string to align with second */
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)		/* compare 16 bytes for equality */
-	psubb	%xmm0, D(%xmm2)		/* packed sub of comparison results*/
+	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx		/* adjust 0xffff for offset */
 	shr	%cl, %r9d		/* adjust for 16-byte offset */
@@ -391,7 +361,7 @@ LABEL(loop_ashr_1_use):
 
 LABEL(nibble_ashr_1_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $1, -16(%rdi, %rdx), D(%xmm0)
+	palignr $1, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -410,7 +380,7 @@ LABEL(nibble_ashr_1_restart_use):
 	jg	LABEL(nibble_ashr_1_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $1, -16(%rdi, %rdx), D(%xmm0)
+	palignr $1, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -430,7 +400,7 @@ LABEL(nibble_ashr_1_restart_use):
 LABEL(nibble_ashr_1_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$1, D(%xmm0)
+	psrldq	$1, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -448,10 +418,10 @@ LABEL(nibble_ashr_1_use):
  */
 	.p2align 4
 LABEL(ashr_2):
-	pslldq	$14, D(%xmm2)
+	pslldq	$14, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -479,7 +449,7 @@ LABEL(loop_ashr_2_use):
 
 LABEL(nibble_ashr_2_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $2, -16(%rdi, %rdx), D(%xmm0)
+	palignr $2, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -498,7 +468,7 @@ LABEL(nibble_ashr_2_restart_use):
 	jg	LABEL(nibble_ashr_2_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $2, -16(%rdi, %rdx), D(%xmm0)
+	palignr $2, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -518,7 +488,7 @@ LABEL(nibble_ashr_2_restart_use):
 LABEL(nibble_ashr_2_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$2, D(%xmm0)
+	psrldq	$2, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -536,10 +506,10 @@ LABEL(nibble_ashr_2_use):
  */
 	.p2align 4
 LABEL(ashr_3):
-	pslldq	$13, D(%xmm2)
+	pslldq	$13, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -567,7 +537,7 @@ LABEL(loop_ashr_3_use):
 
 LABEL(nibble_ashr_3_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $3, -16(%rdi, %rdx), D(%xmm0)
+	palignr $3, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -586,7 +556,7 @@ LABEL(nibble_ashr_3_restart_use):
 	jg	LABEL(nibble_ashr_3_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $3, -16(%rdi, %rdx), D(%xmm0)
+	palignr $3, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -606,7 +576,7 @@ LABEL(nibble_ashr_3_restart_use):
 LABEL(nibble_ashr_3_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$3, D(%xmm0)
+	psrldq	$3, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -624,10 +594,10 @@ LABEL(nibble_ashr_3_use):
  */
 	.p2align 4
 LABEL(ashr_4):
-	pslldq	$12, D(%xmm2)
+	pslldq	$12, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -656,7 +626,7 @@ LABEL(loop_ashr_4_use):
 
 LABEL(nibble_ashr_4_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $4, -16(%rdi, %rdx), D(%xmm0)
+	palignr $4, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -675,7 +645,7 @@ LABEL(nibble_ashr_4_restart_use):
 	jg	LABEL(nibble_ashr_4_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $4, -16(%rdi, %rdx), D(%xmm0)
+	palignr $4, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -695,7 +665,7 @@ LABEL(nibble_ashr_4_restart_use):
 LABEL(nibble_ashr_4_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$4, D(%xmm0)
+	psrldq	$4, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -713,10 +683,10 @@ LABEL(nibble_ashr_4_use):
  */
 	.p2align 4
 LABEL(ashr_5):
-	pslldq	$11, D(%xmm2)
+	pslldq	$11, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -745,7 +715,7 @@ LABEL(loop_ashr_5_use):
 
 LABEL(nibble_ashr_5_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $5, -16(%rdi, %rdx), D(%xmm0)
+	palignr $5, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -765,7 +735,7 @@ LABEL(nibble_ashr_5_restart_use):
 
 	movdqa	(%rdi, %rdx), %xmm0
 
-	palignr $5, -16(%rdi, %rdx), D(%xmm0)
+	palignr $5, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -785,7 +755,7 @@ LABEL(nibble_ashr_5_restart_use):
 LABEL(nibble_ashr_5_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$5, D(%xmm0)
+	psrldq	$5, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -803,10 +773,10 @@ LABEL(nibble_ashr_5_use):
  */
 	.p2align 4
 LABEL(ashr_6):
-	pslldq	$10, D(%xmm2)
+	pslldq	$10, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -835,7 +805,7 @@ LABEL(loop_ashr_6_use):
 
 LABEL(nibble_ashr_6_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $6, -16(%rdi, %rdx), D(%xmm0)
+	palignr $6, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -854,7 +824,7 @@ LABEL(nibble_ashr_6_restart_use):
 	jg	LABEL(nibble_ashr_6_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $6, -16(%rdi, %rdx), D(%xmm0)
+	palignr $6, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -874,7 +844,7 @@ LABEL(nibble_ashr_6_restart_use):
 LABEL(nibble_ashr_6_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$6, D(%xmm0)
+	psrldq	$6, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -892,10 +862,10 @@ LABEL(nibble_ashr_6_use):
  */
 	.p2align 4
 LABEL(ashr_7):
-	pslldq	$9, D(%xmm2)
+	pslldq	$9, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -924,7 +894,7 @@ LABEL(loop_ashr_7_use):
 
 LABEL(nibble_ashr_7_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $7, -16(%rdi, %rdx), D(%xmm0)
+	palignr $7, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -943,7 +913,7 @@ LABEL(nibble_ashr_7_restart_use):
 	jg	LABEL(nibble_ashr_7_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $7, -16(%rdi, %rdx), D(%xmm0)
+	palignr $7, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -963,7 +933,7 @@ LABEL(nibble_ashr_7_restart_use):
 LABEL(nibble_ashr_7_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$7, D(%xmm0)
+	psrldq	$7, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -981,10 +951,10 @@ LABEL(nibble_ashr_7_use):
  */
 	.p2align 4
 LABEL(ashr_8):
-	pslldq	$8, D(%xmm2)
+	pslldq	$8, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1013,7 +983,7 @@ LABEL(loop_ashr_8_use):
 
 LABEL(nibble_ashr_8_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $8, -16(%rdi, %rdx), D(%xmm0)
+	palignr $8, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1032,7 +1002,7 @@ LABEL(nibble_ashr_8_restart_use):
 	jg	LABEL(nibble_ashr_8_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $8, -16(%rdi, %rdx), D(%xmm0)
+	palignr $8, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1052,7 +1022,7 @@ LABEL(nibble_ashr_8_restart_use):
 LABEL(nibble_ashr_8_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$8, D(%xmm0)
+	psrldq	$8, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1070,10 +1040,10 @@ LABEL(nibble_ashr_8_use):
  */
 	.p2align 4
 LABEL(ashr_9):
-	pslldq	$7, D(%xmm2)
+	pslldq	$7, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1103,7 +1073,7 @@ LABEL(loop_ashr_9_use):
 LABEL(nibble_ashr_9_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
 
-	palignr $9, -16(%rdi, %rdx), D(%xmm0)
+	palignr $9, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1122,7 +1092,7 @@ LABEL(nibble_ashr_9_restart_use):
 	jg	LABEL(nibble_ashr_9_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $9, -16(%rdi, %rdx), D(%xmm0)
+	palignr $9, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1142,7 +1112,7 @@ LABEL(nibble_ashr_9_restart_use):
 LABEL(nibble_ashr_9_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$9, D(%xmm0)
+	psrldq	$9, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1160,10 +1130,10 @@ LABEL(nibble_ashr_9_use):
  */
 	.p2align 4
 LABEL(ashr_10):
-	pslldq	$6, D(%xmm2)
+	pslldq	$6, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1192,7 +1162,7 @@ LABEL(loop_ashr_10_use):
 
 LABEL(nibble_ashr_10_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $10, -16(%rdi, %rdx), D(%xmm0)
+	palignr $10, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1211,7 +1181,7 @@ LABEL(nibble_ashr_10_restart_use):
 	jg	LABEL(nibble_ashr_10_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $10, -16(%rdi, %rdx), D(%xmm0)
+	palignr $10, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1231,7 +1201,7 @@ LABEL(nibble_ashr_10_restart_use):
 LABEL(nibble_ashr_10_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$10, D(%xmm0)
+	psrldq	$10, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1249,10 +1219,10 @@ LABEL(nibble_ashr_10_use):
  */
 	.p2align 4
 LABEL(ashr_11):
-	pslldq	$5, D(%xmm2)
+	pslldq	$5, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1281,7 +1251,7 @@ LABEL(loop_ashr_11_use):
 
 LABEL(nibble_ashr_11_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $11, -16(%rdi, %rdx), D(%xmm0)
+	palignr $11, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1300,7 +1270,7 @@ LABEL(nibble_ashr_11_restart_use):
 	jg	LABEL(nibble_ashr_11_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $11, -16(%rdi, %rdx), D(%xmm0)
+	palignr $11, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1320,7 +1290,7 @@ LABEL(nibble_ashr_11_restart_use):
 LABEL(nibble_ashr_11_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$11, D(%xmm0)
+	psrldq	$11, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1338,10 +1308,10 @@ LABEL(nibble_ashr_11_use):
  */
 	.p2align 4
 LABEL(ashr_12):
-	pslldq	$4, D(%xmm2)
+	pslldq	$4, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1370,7 +1340,7 @@ LABEL(loop_ashr_12_use):
 
 LABEL(nibble_ashr_12_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $12, -16(%rdi, %rdx), D(%xmm0)
+	palignr $12, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1389,7 +1359,7 @@ LABEL(nibble_ashr_12_restart_use):
 	jg	LABEL(nibble_ashr_12_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $12, -16(%rdi, %rdx), D(%xmm0)
+	palignr $12, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1409,7 +1379,7 @@ LABEL(nibble_ashr_12_restart_use):
 LABEL(nibble_ashr_12_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$12, D(%xmm0)
+	psrldq	$12, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1427,10 +1397,10 @@ LABEL(nibble_ashr_12_use):
  */
 	.p2align 4
 LABEL(ashr_13):
-	pslldq	$3, D(%xmm2)
+	pslldq	$3, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1460,7 +1430,7 @@ LABEL(loop_ashr_13_use):
 
 LABEL(nibble_ashr_13_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $13, -16(%rdi, %rdx), D(%xmm0)
+	palignr $13, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1479,7 +1449,7 @@ LABEL(nibble_ashr_13_restart_use):
 	jg	LABEL(nibble_ashr_13_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $13, -16(%rdi, %rdx), D(%xmm0)
+	palignr $13, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1499,7 +1469,7 @@ LABEL(nibble_ashr_13_restart_use):
 LABEL(nibble_ashr_13_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$13, D(%xmm0)
+	psrldq	$13, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1517,10 +1487,10 @@ LABEL(nibble_ashr_13_use):
  */
 	.p2align 4
 LABEL(ashr_14):
-	pslldq  $2, D(%xmm2)
+	pslldq  $2, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1550,7 +1520,7 @@ LABEL(loop_ashr_14_use):
 
 LABEL(nibble_ashr_14_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $14, -16(%rdi, %rdx), D(%xmm0)
+	palignr $14, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1569,7 +1539,7 @@ LABEL(nibble_ashr_14_restart_use):
 	jg	LABEL(nibble_ashr_14_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $14, -16(%rdi, %rdx), D(%xmm0)
+	palignr $14, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1589,7 +1559,7 @@ LABEL(nibble_ashr_14_restart_use):
 LABEL(nibble_ashr_14_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$14, D(%xmm0)
+	psrldq	$14, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1607,10 +1577,10 @@ LABEL(nibble_ashr_14_use):
  */
 	.p2align 4
 LABEL(ashr_15):
-	pslldq	$1, D(%xmm2)
+	pslldq	$1, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1642,7 +1612,7 @@ LABEL(loop_ashr_15_use):
 
 LABEL(nibble_ashr_15_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $15, -16(%rdi, %rdx), D(%xmm0)
+	palignr $15, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1661,7 +1631,7 @@ LABEL(nibble_ashr_15_restart_use):
 	jg	LABEL(nibble_ashr_15_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $15, -16(%rdi, %rdx), D(%xmm0)
+	palignr $15, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1681,7 +1651,7 @@ LABEL(nibble_ashr_15_restart_use):
 LABEL(nibble_ashr_15_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$15, D(%xmm0)
+	psrldq	$15, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S
deleted file mode 100644
index b51b86d223..0000000000
--- a/sysdeps/x86_64/multiarch/strncase_l-avx.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* strncasecmp_l optimized with AVX.
-   Copyright (C) 2017-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#define STRCMP_SSE42 __strncasecmp_l_avx
-#define USE_AVX 1
-#define USE_AS_STRNCASECMP_L
-#include "strcmp-sse42.S"
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c
  2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
                   ` (21 preceding siblings ...)
  2022-03-23 21:57 ` [PATCH v1 23/23] x86: Remove AVX str{n}casecmp Noah Goldstein
@ 2022-03-24 18:43 ` H.J. Lu
  22 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:43 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Just QOL change to make parsing the output of the benchtests more
> consistent.
> ---
>  benchtests/bench-strchr.c | 94 ++++++++++++++++++++++++++-------------
>  1 file changed, 64 insertions(+), 30 deletions(-)
>
> diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
> index 821bc615b0..203900d4ad 100644
> --- a/benchtests/bench-strchr.c
> +++ b/benchtests/bench-strchr.c
> @@ -32,6 +32,7 @@
>  #endif /* WIDE */
>  #include "bench-string.h"
>
> +#include "json-lib.h"
>  #define BIG_CHAR MAX_CHAR
>
>  #ifndef WIDE
> @@ -74,10 +75,19 @@ IMPL (simple_STRCHR, 0)
>  IMPL (STRCHR, 1)
>
>  static void
> -do_one_test (impl_t *impl, const CHAR *s, int c, const CHAR *exp_res)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> +             const CHAR *exp_res)
>  {
>    size_t i, iters = INNER_LOOP_ITERS_LARGE;
>    timing_t start, stop, cur;
> +  const CHAR *res = CALL (impl, s, c);
> +  if (res != exp_res)
> +    {
> +      error (0, 0, "Wrong result in function %s %p != %p", impl->name, res,
> +             exp_res);
> +      ret = 1;
> +      return;
> +    }
>
>    TIMING_NOW (start);
>    for (i = 0; i < iters; ++i)
> @@ -88,11 +98,12 @@ do_one_test (impl_t *impl, const CHAR *s, int c, const CHAR *exp_res)
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double)cur / (double)iters);
>  }
>
>  static void
> -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> +         int seek_char, int max_char)
>  /* For wcschr: align here means align not in bytes,
>     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
>     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
> @@ -124,87 +135,110 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
>    else
>      result = NULLRET (buf + align + len);
>
> -  printf ("Length %4zd, alignment in bytes %2zd:",
> -         pos, align * sizeof (CHAR));
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "length", len);
> +  json_attr_uint (json_ctx, "pos", pos);
> +  json_attr_uint (json_ctx, "seek_char", seek_char);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +  json_attr_uint (json_ctx, "alignment", align);
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, buf + align, seek_char, result);
> +    do_one_test (json_ctx, impl, buf + align, seek_char, result);
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  test_main (void)
>  {
> +  json_ctx_t json_ctx;
>    size_t i;
>
>    test_init ();
>
> -  printf ("%20s", "");
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> -      do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> +      do_test (&json_ctx, 0, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> +      do_test (&json_ctx, i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> -      do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> +      do_test (&json_ctx, 0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> +      do_test (&json_ctx, i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
> -      do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
> +      do_test (&json_ctx, i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
> +      do_test (&json_ctx, i, 64, 256, SMALL_CHAR, BIG_CHAR);
>      }
>
>    for (i = 0; i < 8; ++i)
>      {
> -      do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
> -      do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
> +      do_test (&json_ctx, 16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
> +      do_test (&json_ctx, 16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
>      }
>
>    for (i = 0; i < 32; ++i)
>      {
> -      do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
> -      do_test (0, i, i + 1, SMALL_CHAR, BIG_CHAR);
> +      do_test (&json_ctx, 0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
> +      do_test (&json_ctx, 0, i, i + 1, SMALL_CHAR, BIG_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 16 << i, 2048, 0, MIDDLE_CHAR);
> -      do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
> +      do_test (&json_ctx, 0, 16 << i, 2048, 0, MIDDLE_CHAR);
> +      do_test (&json_ctx, i, 16 << i, 2048, 0, MIDDLE_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
> -      do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
> +      do_test (&json_ctx, 0, 16 << i, 4096, 0, MIDDLE_CHAR);
> +      do_test (&json_ctx, i, 16 << i, 4096, 0, MIDDLE_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (i, 64, 256, 0, MIDDLE_CHAR);
> -      do_test (i, 64, 256, 0, BIG_CHAR);
> +      do_test (&json_ctx, i, 64, 256, 0, MIDDLE_CHAR);
> +      do_test (&json_ctx, i, 64, 256, 0, BIG_CHAR);
>      }
>
>    for (i = 0; i < 8; ++i)
>      {
> -      do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
> -      do_test (16 * i, 256, 512, 0, BIG_CHAR);
> +      do_test (&json_ctx, 16 * i, 256, 512, 0, MIDDLE_CHAR);
> +      do_test (&json_ctx, 16 * i, 256, 512, 0, BIG_CHAR);
>      }
>
>    for (i = 0; i < 32; ++i)
>      {
> -      do_test (0, i, i + 1, 0, MIDDLE_CHAR);
> -      do_test (0, i, i + 1, 0, BIG_CHAR);
> +      do_test (&json_ctx, 0, i, i + 1, 0, MIDDLE_CHAR);
> +      do_test (&json_ctx, 0, i, i + 1, 0, BIG_CHAR);
>      }
>
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
>
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread
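
For anyone updating scripts that parse the benchmark output: the json_*
calls in the patch above emit a document roughly of the following shape,
with one object per do_test call in the "results" array (the
timing_type value, ifunc names, and numbers here are illustrative, not
from a real run):

  {
    "timing_type": "hp_timing",
    "functions": {
      "strchr": {
        "bench-variant": "",
        "ifuncs": ["simple_strchr", "strchr"],
        "results": [
          { "length": 2048, "pos": 32, "seek_char": 23, "max_char": 127,
            "alignment": 0, "timings": [25.1, 11.7] }
        ]
      }
    }
  }

This is the same layout the existing benchtests/scripts helpers
(e.g. compare_strings.py) already consume for the other string
benchmarks.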

* Re: [PATCH v1 02/23] benchtests: Add random benchmark in bench-strchr.c
  2022-03-23 21:57 ` [PATCH v1 02/23] benchtests: Add random benchmark " Noah Goldstein
@ 2022-03-24 18:44   ` H.J. Lu
  0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:44 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Add a benchmark that randomizes whether the return should be NULL or a
> pointer to CHAR. The rationale is that on many architectures there is a
> choice between predicated execution (e.g. cmovcc on x86) and a branch.
>
> On x86 the results for cmovcc vs branch are something along the lines
> of the following:
>
> perc-zero, Br On Result, Time Br / Time cmov
>      0.10,            1,               0.983
>      0.10,            0,               1.246
>      0.25,            1,               1.035
>      0.25,            0,               1.49
>      0.33,            1,               1.016
>      0.33,            0,               1.579
>      0.50,            1,               1.228
>      0.50,            0,               1.739
>      0.66,            1,               1.039
>      0.66,            0,               1.764
>      0.75,            1,               0.996
>      0.75,            0,               1.642
>      0.90,            1,               1.071
>      0.90,            0,               1.409
>      1.00,            1,               0.937
>      1.00,            0,               0.999
> ---
>  benchtests/bench-strchr.c | 143 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 143 insertions(+)
>
> diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
> index 203900d4ad..54640bde7e 100644
> --- a/benchtests/bench-strchr.c
> +++ b/benchtests/bench-strchr.c
> @@ -53,6 +53,11 @@
>  # define SMALL_CHAR 851
>  #endif /* WIDE */
>
> +#ifdef USE_FOR_STRCHRNUL
> +# define DO_RAND_TEST(...)
> +#else
> +# define DO_RAND_TEST(...) do_rand_test(__VA_ARGS__)
> +#endif
>  #ifdef USE_FOR_STRCHRNUL
>  # define NULLRET(endptr) endptr
>  #else
> @@ -74,6 +79,133 @@ simple_STRCHR (const CHAR *s, int c)
>  IMPL (simple_STRCHR, 0)
>  IMPL (STRCHR, 1)
>
> +#ifndef USE_FOR_STRCHRNUL
> +/* Random benchmarks for strchr (if return is CHAR or NULL).  The
> +   rationale for the benchmark is that returning NULL/char can be
> +   done with predicated execution (e.g. cmovcc on x86) or a branch. */
> +
> +
> +/* Large enough that full history can't be stored in BHT. */
> +#define NUM_SEARCH_CHARS 2048
> +
> +/* The expectation is that use cases of strchr check the return
> +   value; otherwise strchrnul would almost always be better. Since
> +   another branch is coming, we want to test the case where a
> +   potential branch in strchr can be used to skip a later mispredict
> +   because of the relationship between the two branches. */
> +static void __attribute__ ((noinline, noclone))
> +do_one_rand_plus_branch_test (json_ctx_t *json_ctx, impl_t *impl,
> +                              const CHAR *s, const CHAR *c)
> +{
> +  size_t i, iters = INNER_LOOP_ITERS_LARGE;
> +  int must_execute = 0;
> +  timing_t start, stop, cur;
> +  TIMING_NOW (start);
> +  for (i = 0; i < iters; ++i)
> +    {
> +      if (CALL (impl, s, c[i % NUM_SEARCH_CHARS]))
> +        {
> +          /* We just need something that will force the compiler to
> +             emit a branch instead of conditional execution. */
> +          ++must_execute;
> +          asm volatile("" : : :);
> +        }
> +    }
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  json_element_double (json_ctx, (double)cur / (double)iters);
> +}
> +
> +static void __attribute__ ((noinline, noclone))
> +do_one_rand_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s,
> +                  const CHAR *c)
> +{
> +  size_t i, iters = INNER_LOOP_ITERS_LARGE;
> +  timing_t start, stop, cur;
> +  TIMING_NOW (start);
> +  for (i = 0; i < iters; ++i)
> +    {
> +      CALL (impl, s, c[i % NUM_SEARCH_CHARS]);
> +    }
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  json_element_double (json_ctx, (double)cur / (double)iters);
> +}
> +
> +static void
> +do_rand_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> +              float perc_zero)
> +{
> +  size_t i;
> +  int perc_zero_int;
> +  CHAR *buf = (CHAR *)buf1;
> +  CHAR *c = (CHAR *)buf2;
> +  align &= 127;
> +  if ((align + len) * sizeof (CHAR) >= page_size)
> +    return;
> +
> +  /* The test is only interesting if we can hit both cases. */
> +  if (pos >= len)
> +    return;
> +
> +  /* Running the test would segfault. */
> +  if (NUM_SEARCH_CHARS * sizeof (CHAR) > page_size)
> +    return;
> +
> +  for (i = 0; i < len; ++i)
> +    {
> +      buf[align + i] = 2;
> +    }
> +  buf[align + len] = 0;
> +  buf[align + pos] = 1;
> +
> +  perc_zero_int = perc_zero * RAND_MAX;
> +  for (i = 0; i < NUM_SEARCH_CHARS; ++i)
> +    {
> +      if (rand () > perc_zero_int)
> +        c[i] = 0;
> +      else
> +        c[i] = 1;
> +    }
> +  {
> +    json_element_object_begin (json_ctx);
> +    json_attr_uint (json_ctx, "rand", 1);
> +    json_attr_uint (json_ctx, "branch", 1);
> +    json_attr_double (json_ctx, "perc-zero", perc_zero);
> +    json_attr_uint (json_ctx, "length", len);
> +    json_attr_uint (json_ctx, "pos", pos);
> +    json_attr_uint (json_ctx, "alignment", align);
> +    json_array_begin (json_ctx, "timings");
> +
> +    FOR_EACH_IMPL (impl, 0)
> +      do_one_rand_plus_branch_test (json_ctx, impl, buf + align, c);
> +
> +    json_array_end (json_ctx);
> +    json_element_object_end (json_ctx);
> +  }
> +  {
> +    json_element_object_begin (json_ctx);
> +    json_attr_uint (json_ctx, "rand", 1);
> +    json_attr_uint (json_ctx, "branch", 0);
> +    json_attr_double (json_ctx, "perc-zero", perc_zero);
> +    json_attr_uint (json_ctx, "length", len);
> +    json_attr_uint (json_ctx, "pos", pos);
> +    json_attr_uint (json_ctx, "alignment", align);
> +    json_array_begin (json_ctx, "timings");
> +
> +    FOR_EACH_IMPL (impl, 0)
> +      do_one_rand_test (json_ctx, impl, buf + align, c);
> +
> +    json_array_end (json_ctx);
> +    json_element_object_end (json_ctx);
> +  }
> +}
> +#endif
> +
>  static void
>  do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
>               const CHAR *exp_res)
> @@ -136,6 +268,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
>      result = NULLRET (buf + align + len);
>
>    json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "rand", 0);
>    json_attr_uint (json_ctx, "length", len);
>    json_attr_uint (json_ctx, "pos", pos);
>    json_attr_uint (json_ctx, "seek_char", seek_char);
> @@ -234,6 +367,16 @@ test_main (void)
>        do_test (&json_ctx, 0, i, i + 1, 0, BIG_CHAR);
>      }
>
> +  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.0);
> +  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.1);
> +  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.25);
> +  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.33);
> +  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.5);
> +  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.66);
> +  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.75);
> +  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.9);
> +  DO_RAND_TEST(&json_ctx, 0, 15, 16, 1.0);
> +
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread
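
To make the branch-vs-cmov tradeoff behind the table above concrete,
here is a rough user-level analogue of the patch's two timing loops
(do_one_rand_plus_branch_test vs. do_one_rand_test); this is only a
sketch, not part of the patch, and the helper names are made up:

  #include <stddef.h>
  #include <string.h>

  /* Branchy consumer: compare-and-jump on the result.  Nearly free
     while the outcome stays predictable, expensive as it approaches
     a coin flip.  */
  static size_t
  count_hits_branch (const char *s, const char *needles, size_t n)
  {
    size_t hits = 0;
    for (size_t i = 0; i < n; i++)
      if (strchr (s, needles[i]) != NULL)
        hits++;
    return hits;
  }

  /* Branch-free consumer: compilers usually lower this to setcc or
     cmovcc, a small fixed cost independent of predictability.  */
  static size_t
  count_hits_branchless (const char *s, const char *needles, size_t n)
  {
    size_t hits = 0;
    for (size_t i = 0; i < n; i++)
      hits += strchr (s, needles[i]) != NULL;
    return hits;
  }

The same tradeoff applies inside strchr itself, which is what the table
measures: the branch variant is at its worst near perc-zero 0.5 (a coin
flip), close to parity at 0 and 1, and the penalty largely disappears
when the caller branches on the result anyway (Br On Result = 1).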

* Re: [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch
  2022-03-23 21:57 ` [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch Noah Goldstein
@ 2022-03-24 18:53   ` H.J. Lu
  2022-03-24 19:20     ` Noah Goldstein
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:53 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Small code cleanup for size: -53 bytes.
>
> Add a comment justifying the use of a branch for the NULL/non-NULL
> return.


Do you have follow-up patches to improve its performance?  We are
backporting all x86-64 improvements to Intel release branches:

https://gitlab.com/x86-glibc/glibc/-/wikis/home

Patches without performance improvements are undesirable.

> All string/memory tests pass and no regressions in benchtests.
>
> geometric_mean(N=20) of all benchmarks Original / New: 1.00
> ---
> Geometric Mean N=20 runs; All functions page aligned
> length, alignment,  pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
>   2048,         0,   32,    0,               23,                127,               1.033
>   2048,         1,   32,    0,               23,                127,               1.006
>   2048,         0,   64,    0,               23,                127,                1.02
>   2048,         2,   64,    0,               23,                127,               0.992
>   2048,         0,  128,    0,               23,                127,               0.996
>   2048,         3,  128,    0,               23,                127,               0.966
>   2048,         0,  256,    0,               23,                127,               0.996
>   2048,         4,  256,    0,               23,                127,               0.998
>   2048,         0,  512,    0,               23,                127,               0.991
>   2048,         5,  512,    0,               23,                127,               0.991
>   2048,         0, 1024,    0,               23,                127,               0.993
>   2048,         6, 1024,    0,               23,                127,               0.992
>   2048,         0, 2048,    0,               23,                127,               0.992
>   2048,         7, 2048,    0,               23,                127,               0.976
>   4096,         0,   32,    0,               23,                127,               0.983
>   4096,         1,   32,    0,               23,                127,               0.994
>   4096,         0,   64,    0,               23,                127,               0.968
>   4096,         2,   64,    0,               23,                127,               1.018
>   4096,         0,  128,    0,               23,                127,                0.99
>   4096,         3,  128,    0,               23,                127,               1.001
>   4096,         0,  256,    0,               23,                127,                 1.0
>   4096,         4,  256,    0,               23,                127,               1.001
>   4096,         0,  512,    0,               23,                127,               0.989
>   4096,         5,  512,    0,               23,                127,               0.988
>   4096,         0, 1024,    0,               23,                127,               0.994
>   4096,         6, 1024,    0,               23,                127,               0.993
>   4096,         0, 2048,    0,               23,                127,               0.987
>   4096,         7, 2048,    0,               23,                127,               0.996
>    256,         1,   64,    0,               23,                127,               1.004
>    256,         2,   64,    0,               23,                127,               1.004
>    256,         3,   64,    0,               23,                127,               0.992
>    256,         4,   64,    0,               23,                127,               1.001
>    256,         5,   64,    0,               23,                127,               1.001
>    256,         6,   64,    0,               23,                127,               0.998
>    256,         7,   64,    0,               23,                127,               0.994
>    512,         0,  256,    0,               23,                127,               0.999
>    512,        16,  256,    0,               23,                127,               1.002
>    512,        32,  256,    0,               23,                127,               0.994
>    512,        48,  256,    0,               23,                127,               0.991
>    512,        64,  256,    0,               23,                127,               0.994
>    512,        80,  256,    0,               23,                127,               0.994
>    512,        96,  256,    0,               23,                127,               0.996
>    512,       112,  256,    0,               23,                127,               0.999
>      1,         0,    0,    0,               23,                127,               0.978
>      2,         0,    1,    0,               23,                127,               0.981
>      3,         0,    2,    0,               23,                127,               0.993
>      4,         0,    3,    0,               23,                127,               1.004
>      5,         0,    4,    0,               23,                127,               1.002
>      6,         0,    5,    0,               23,                127,               0.991
>      7,         0,    6,    0,               23,                127,                0.99
>      8,         0,    7,    0,               23,                127,               1.012
>      9,         0,    8,    0,               23,                127,               0.994
>     10,         0,    9,    0,               23,                127,               1.003
>     11,         0,   10,    0,               23,                127,               0.999
>     12,         0,   11,    0,               23,                127,               1.007
>     13,         0,   12,    0,               23,                127,                 1.0
>     14,         0,   13,    0,               23,                127,               0.997
>     15,         0,   14,    0,               23,                127,               0.996
>     16,         0,   15,    0,               23,                127,               0.993
>     17,         0,   16,    0,               23,                127,               1.002
>     18,         0,   17,    0,               23,                127,               0.997
>     19,         0,   18,    0,               23,                127,               0.998
>     20,         0,   19,    0,               23,                127,               0.994
>     21,         0,   20,    0,               23,                127,                0.99
>     22,         0,   21,    0,               23,                127,               0.992
>     23,         0,   22,    0,               23,                127,               0.996
>     24,         0,   23,    0,               23,                127,               0.991
>     25,         0,   24,    0,               23,                127,               0.997
>     26,         0,   25,    0,               23,                127,               1.011
>     27,         0,   26,    0,               23,                127,               1.013
>     28,         0,   27,    0,               23,                127,               0.996
>     29,         0,   28,    0,               23,                127,               0.993
>     30,         0,   29,    0,               23,                127,               1.009
>     31,         0,   30,    0,               23,                127,               1.009
>     32,         0,   31,    0,               23,                127,               1.008
>   2048,         0,   32,    0,                0,                127,                 1.0
>   2048,         1,   32,    0,                0,                127,                1.01
>   2048,         0,   64,    0,                0,                127,               0.997
>   2048,         2,   64,    0,                0,                127,               1.002
>   2048,         0,  128,    0,                0,                127,               0.986
>   2048,         3,  128,    0,                0,                127,               0.997
>   2048,         0,  256,    0,                0,                127,               1.002
>   2048,         4,  256,    0,                0,                127,               0.999
>   2048,         0,  512,    0,                0,                127,               0.991
>   2048,         5,  512,    0,                0,                127,               0.984
>   2048,         0, 1024,    0,                0,                127,               0.994
>   2048,         6, 1024,    0,                0,                127,               0.993
>   2048,         0, 2048,    0,                0,                127,               0.951
>   2048,         7, 2048,    0,                0,                127,               0.989
>   4096,         0,   32,    0,                0,                127,               0.993
>   4096,         1,   32,    0,                0,                127,               0.997
>   4096,         0,   64,    0,                0,                127,               1.004
>   4096,         2,   64,    0,                0,                127,               1.016
>   4096,         0,  128,    0,                0,                127,               0.973
>   4096,         3,  128,    0,                0,                127,               1.001
>   4096,         0,  256,    0,                0,                127,               0.999
>   4096,         4,  256,    0,                0,                127,               0.998
>   4096,         0,  512,    0,                0,                127,                0.99
>   4096,         5,  512,    0,                0,                127,               0.985
>   4096,         0, 1024,    0,                0,                127,               0.993
>   4096,         6, 1024,    0,                0,                127,               0.997
>   4096,         0, 2048,    0,                0,                127,               0.995
>   4096,         7, 2048,    0,                0,                127,               0.996
>    256,         1,   64,    0,                0,                127,                1.01
>    256,         2,   64,    0,                0,                127,               1.024
>    256,         3,   64,    0,                0,                127,                1.03
>    256,         4,   64,    0,                0,                127,               1.004
>    256,         5,   64,    0,                0,                127,               0.998
>    256,         6,   64,    0,                0,                127,               0.998
>    256,         7,   64,    0,                0,                127,               0.997
>    512,         0,  256,    0,                0,                127,               0.996
>    512,        16,  256,    0,                0,                127,               0.995
>    512,        32,  256,    0,                0,                127,               0.996
>    512,        48,  256,    0,                0,                127,               0.992
>    512,        64,  256,    0,                0,                127,               0.999
>    512,        80,  256,    0,                0,                127,               1.002
>    512,        96,  256,    0,                0,                127,               0.999
>    512,       112,  256,    0,                0,                127,               0.998
>      1,         0,    0,    0,                0,                127,               1.016
>      2,         0,    1,    0,                0,                127,               0.998
>      3,         0,    2,    0,                0,                127,                1.02
>      4,         0,    3,    0,                0,                127,               1.004
>      5,         0,    4,    0,                0,                127,               1.021
>      6,         0,    5,    0,                0,                127,               1.014
>      7,         0,    6,    0,                0,                127,               1.007
>      8,         0,    7,    0,                0,                127,               1.016
>      9,         0,    8,    0,                0,                127,               1.003
>     10,         0,    9,    0,                0,                127,               1.004
>     11,         0,   10,    0,                0,                127,               0.995
>     12,         0,   11,    0,                0,                127,               1.009
>     13,         0,   12,    0,                0,                127,               1.005
>     14,         0,   13,    0,                0,                127,               0.987
>     15,         0,   14,    0,                0,                127,               0.998
>     16,         0,   15,    0,                0,                127,               1.004
>     17,         0,   16,    0,                0,                127,                1.01
>     18,         0,   17,    0,                0,                127,                1.01
>     19,         0,   18,    0,                0,                127,               1.006
>     20,         0,   19,    0,                0,                127,               1.012
>     21,         0,   20,    0,                0,                127,               0.999
>     22,         0,   21,    0,                0,                127,               1.004
>     23,         0,   22,    0,                0,                127,               0.988
>     24,         0,   23,    0,                0,                127,               0.993
>     25,         0,   24,    0,                0,                127,               1.004
>     26,         0,   25,    0,                0,                127,                0.99
>     27,         0,   26,    0,                0,                127,               1.016
>     28,         0,   27,    0,                0,                127,               0.987
>     29,         0,   28,    0,                0,                127,               0.989
>     30,         0,   29,    0,                0,                127,               0.998
>     31,         0,   30,    0,                0,                127,               1.005
>     32,         0,   31,    0,                0,                127,               0.993
>
>     16,         0,   15,    1,                1,                  0,               1.002
>     16,         0,   15,    1,                0,                  0,                 1.0
>     16,         0,   15,    1,                1,                0.1,               1.034
>     16,         0,   15,    1,                0,                0.1,                1.03
>     16,         0,   15,    1,                1,               0.25,               0.993
>     16,         0,   15,    1,                0,               0.25,               1.081
>     16,         0,   15,    1,                1,               0.33,               0.959
>     16,         0,   15,    1,                0,               0.33,               1.142
>     16,         0,   15,    1,                1,                0.5,               0.929
>     16,         0,   15,    1,                0,                0.5,               1.072
>     16,         0,   15,    1,                1,               0.66,               0.984
>     16,         0,   15,    1,                0,               0.66,               1.069
>     16,         0,   15,    1,                1,               0.75,               0.969
>     16,         0,   15,    1,                0,               0.75,               1.059
>     16,         0,   15,    1,                1,                0.9,                0.98
>     16,         0,   15,    1,                0,                0.9,               0.994
>     16,         0,   15,    1,                1,                  1,               0.993
>     16,         0,   15,    1,                0,                  1,               0.996
>
>  sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------
>  1 file changed, 107 insertions(+), 97 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> index 086cabf76a..1a916cc951 100644
> --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> @@ -48,13 +48,13 @@
>  # define PAGE_SIZE 4096
>
>         .section SECTION(.text),"ax",@progbits
> -ENTRY (STRCHR)
> +ENTRY_P2ALIGN (STRCHR, 5)
>         /* Broadcast CHAR to YMM0.      */
>         vmovd   %esi, %xmm0
>         movl    %edi, %eax
>         andl    $(PAGE_SIZE - 1), %eax
>         VPBROADCAST     %xmm0, %ymm0
> -       vpxor   %xmm9, %xmm9, %xmm9
> +       vpxor   %xmm1, %xmm1, %xmm1
>
>         /* Check if we cross page boundary with one vector load.  */
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> @@ -62,37 +62,29 @@ ENTRY (STRCHR)
>
>         /* Check the first VEC_SIZE bytes.      Search for both CHAR and the
>            null byte.  */
> -       vmovdqu (%rdi), %ymm8
> -       VPCMPEQ %ymm8, %ymm0, %ymm1
> -       VPCMPEQ %ymm8, %ymm9, %ymm2
> -       vpor    %ymm1, %ymm2, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       vmovdqu (%rdi), %ymm2
> +       VPCMPEQ %ymm2, %ymm0, %ymm3
> +       VPCMPEQ %ymm2, %ymm1, %ymm2
> +       vpor    %ymm3, %ymm2, %ymm3
> +       vpmovmskb %ymm3, %eax
>         testl   %eax, %eax
>         jz      L(aligned_more)
>         tzcntl  %eax, %eax
>  # ifndef USE_AS_STRCHRNUL
> -       /* Found CHAR or the null byte.  */
> -       cmp     (%rdi, %rax), %CHAR_REG
> -       jne     L(zero)
> -# endif
> -       addq    %rdi, %rax
> -       VZEROUPPER_RETURN
> -
> -       /* .p2align 5 helps keep performance more consistent if ENTRY()
> -          alignment % 32 was either 16 or 0. As well this makes the
> -          alignment % 32 of the loop_4x_vec fixed which makes tuning it
> -          easier.  */
> -       .p2align 5
> -L(first_vec_x4):
> -       tzcntl  %eax, %eax
> -       addq    $(VEC_SIZE * 3 + 1), %rdi
> -# ifndef USE_AS_STRCHRNUL
> -       /* Found CHAR or the null byte.  */
> +       /* Found CHAR or the null byte.  */
>         cmp     (%rdi, %rax), %CHAR_REG
> +       /* NB: Use a branch instead of cmovcc here. The expectation is
> +          that with strchr the user will branch based on the return
> +          value being null. Since this branch is 100% predictive of
> +          the user branch, a branch miss here should save what would
> +          otherwise be a branch miss in the user code. Otherwise,
> +          using a branch 1) saves code size and 2) is faster in
> +          highly predictable environments.  */
>         jne     L(zero)
>  # endif
>         addq    %rdi, %rax
> -       VZEROUPPER_RETURN
> +L(return_vzeroupper):
> +       ZERO_UPPER_VEC_REGISTERS_RETURN
>
>  # ifndef USE_AS_STRCHRNUL
>  L(zero):
> @@ -103,7 +95,8 @@ L(zero):
>
>         .p2align 4
>  L(first_vec_x1):
> -       tzcntl  %eax, %eax
> +       /* Use bsf to save code size.  */
> +       bsfl    %eax, %eax
>         incq    %rdi
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
> @@ -113,9 +106,10 @@ L(first_vec_x1):
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
>
> -       .p2align 4
> +       .p2align 4,, 10
>  L(first_vec_x2):
> -       tzcntl  %eax, %eax
> +       /* Use bsf to save code size.  */
> +       bsfl    %eax, %eax
>         addq    $(VEC_SIZE + 1), %rdi
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
> @@ -125,9 +119,10 @@ L(first_vec_x2):
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
>
> -       .p2align 4
> +       .p2align 4,, 8
>  L(first_vec_x3):
> -       tzcntl  %eax, %eax
> +       /* Use bsf to save code size.  */
> +       bsfl    %eax, %eax
>         addq    $(VEC_SIZE * 2 + 1), %rdi
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
> @@ -137,6 +132,21 @@ L(first_vec_x3):
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
>
> +       .p2align 4,, 10
> +L(first_vec_x4):
> +       /* Use bsf to save code size.  */
> +       bsfl    %eax, %eax
> +       addq    $(VEC_SIZE * 3 + 1), %rdi
> +# ifndef USE_AS_STRCHRNUL
> +       /* Found CHAR or the null byte.  */
> +       cmp     (%rdi, %rax), %CHAR_REG
> +       jne     L(zero)
> +# endif
> +       addq    %rdi, %rax
> +       VZEROUPPER_RETURN
> +
> +
> +
>         .p2align 4
>  L(aligned_more):
>         /* Align data to VEC_SIZE - 1. This is the same number of
> @@ -146,90 +156,92 @@ L(aligned_more):
>  L(cross_page_continue):
>         /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>            since data is only aligned to VEC_SIZE.  */
> -       vmovdqa 1(%rdi), %ymm8
> -       VPCMPEQ %ymm8, %ymm0, %ymm1
> -       VPCMPEQ %ymm8, %ymm9, %ymm2
> -       vpor    %ymm1, %ymm2, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       vmovdqa 1(%rdi), %ymm2
> +       VPCMPEQ %ymm2, %ymm0, %ymm3
> +       VPCMPEQ %ymm2, %ymm1, %ymm2
> +       vpor    %ymm3, %ymm2, %ymm3
> +       vpmovmskb %ymm3, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x1)
>
> -       vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
> -       VPCMPEQ %ymm8, %ymm0, %ymm1
> -       VPCMPEQ %ymm8, %ymm9, %ymm2
> -       vpor    %ymm1, %ymm2, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2
> +       VPCMPEQ %ymm2, %ymm0, %ymm3
> +       VPCMPEQ %ymm2, %ymm1, %ymm2
> +       vpor    %ymm3, %ymm2, %ymm3
> +       vpmovmskb %ymm3, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x2)
>
> -       vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
> -       VPCMPEQ %ymm8, %ymm0, %ymm1
> -       VPCMPEQ %ymm8, %ymm9, %ymm2
> -       vpor    %ymm1, %ymm2, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2
> +       VPCMPEQ %ymm2, %ymm0, %ymm3
> +       VPCMPEQ %ymm2, %ymm1, %ymm2
> +       vpor    %ymm3, %ymm2, %ymm3
> +       vpmovmskb %ymm3, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x3)
>
> -       vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
> -       VPCMPEQ %ymm8, %ymm0, %ymm1
> -       VPCMPEQ %ymm8, %ymm9, %ymm2
> -       vpor    %ymm1, %ymm2, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2
> +       VPCMPEQ %ymm2, %ymm0, %ymm3
> +       VPCMPEQ %ymm2, %ymm1, %ymm2
> +       vpor    %ymm3, %ymm2, %ymm3
> +       vpmovmskb %ymm3, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x4)
> -       /* Align data to VEC_SIZE * 4 - 1.      */
> -       addq    $(VEC_SIZE * 4 + 1), %rdi
> -       andq    $-(VEC_SIZE * 4), %rdi
> +       /* Align data to VEC_SIZE * 4 - 1.  */
> +       incq    %rdi
> +       orq     $(VEC_SIZE * 4 - 1), %rdi
>         .p2align 4
>  L(loop_4x_vec):
>         /* Compare 4 * VEC at a time forward.  */
> -       vmovdqa (%rdi), %ymm5
> -       vmovdqa (VEC_SIZE)(%rdi), %ymm6
> -       vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> -       vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> +       vmovdqa 1(%rdi), %ymm6
> +       vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7
>
>         /* Leaves only CHARS matching esi as 0.  */
> -       vpxor   %ymm5, %ymm0, %ymm1
>         vpxor   %ymm6, %ymm0, %ymm2
>         vpxor   %ymm7, %ymm0, %ymm3
> -       vpxor   %ymm8, %ymm0, %ymm4
>
> -       VPMINU  %ymm1, %ymm5, %ymm1
>         VPMINU  %ymm2, %ymm6, %ymm2
>         VPMINU  %ymm3, %ymm7, %ymm3
> -       VPMINU  %ymm4, %ymm8, %ymm4
>
> -       VPMINU  %ymm1, %ymm2, %ymm5
> -       VPMINU  %ymm3, %ymm4, %ymm6
> +       vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6
> +       vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7
> +
> +       vpxor   %ymm6, %ymm0, %ymm4
> +       vpxor   %ymm7, %ymm0, %ymm5
> +
> +       VPMINU  %ymm4, %ymm6, %ymm4
> +       VPMINU  %ymm5, %ymm7, %ymm5
>
> -       VPMINU  %ymm5, %ymm6, %ymm6
> +       VPMINU  %ymm2, %ymm3, %ymm6
> +       VPMINU  %ymm4, %ymm5, %ymm7
>
> -       VPCMPEQ %ymm6, %ymm9, %ymm6
> -       vpmovmskb %ymm6, %ecx
> +       VPMINU  %ymm6, %ymm7, %ymm7
> +
> +       VPCMPEQ %ymm7, %ymm1, %ymm7
> +       vpmovmskb %ymm7, %ecx
>         subq    $-(VEC_SIZE * 4), %rdi
>         testl   %ecx, %ecx
>         jz      L(loop_4x_vec)
>
> -
> -       VPCMPEQ %ymm1, %ymm9, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       VPCMPEQ %ymm2, %ymm1, %ymm2
> +       vpmovmskb %ymm2, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_x0)
>
>
> -       VPCMPEQ %ymm5, %ymm9, %ymm2
> -       vpmovmskb %ymm2, %eax
> +       VPCMPEQ %ymm3, %ymm1, %ymm3
> +       vpmovmskb %ymm3, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_x1)
>
> -       VPCMPEQ %ymm3, %ymm9, %ymm3
> -       vpmovmskb %ymm3, %eax
> +       VPCMPEQ %ymm4, %ymm1, %ymm4
> +       vpmovmskb %ymm4, %eax
>         /* rcx has combined result from all 4 VEC. It will only be used
>            if the first 3 other VEC all did not contain a match.  */
>         salq    $32, %rcx
>         orq     %rcx, %rax
>         tzcntq  %rax, %rax
> -       subq    $(VEC_SIZE * 2), %rdi
> +       subq    $(VEC_SIZE * 2 - 1), %rdi
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
>         cmp     (%rdi, %rax), %CHAR_REG
> @@ -239,10 +251,11 @@ L(loop_4x_vec):
>         VZEROUPPER_RETURN
>
>
> -       .p2align 4
> +       .p2align 4,, 10
>  L(last_vec_x0):
> -       tzcntl  %eax, %eax
> -       addq    $-(VEC_SIZE * 4), %rdi
> +       /* Use bsf to save code size.  */
> +       bsfl    %eax, %eax
> +       addq    $-(VEC_SIZE * 4 - 1), %rdi
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
>         cmp     (%rdi, %rax), %CHAR_REG
> @@ -251,16 +264,11 @@ L(last_vec_x0):
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
>
> -# ifndef USE_AS_STRCHRNUL
> -L(zero_end):
> -       xorl    %eax, %eax
> -       VZEROUPPER_RETURN
> -# endif
>
> -       .p2align 4
> +       .p2align 4,, 10
>  L(last_vec_x1):
>         tzcntl  %eax, %eax
> -       subq    $(VEC_SIZE * 3), %rdi
> +       subq    $(VEC_SIZE * 3 - 1), %rdi
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
>         cmp     (%rdi, %rax), %CHAR_REG
> @@ -269,18 +277,23 @@ L(last_vec_x1):
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
>
> +# ifndef USE_AS_STRCHRNUL
> +L(zero_end):
> +       xorl    %eax, %eax
> +       VZEROUPPER_RETURN
> +# endif
>
>         /* Cold case for crossing page with first load.  */
> -       .p2align 4
> +       .p2align 4,, 8
>  L(cross_page_boundary):
>         movq    %rdi, %rdx
>         /* Align rdi to VEC_SIZE - 1.  */
>         orq     $(VEC_SIZE - 1), %rdi
> -       vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
> -       VPCMPEQ %ymm8, %ymm0, %ymm1
> -       VPCMPEQ %ymm8, %ymm9, %ymm2
> -       vpor    %ymm1, %ymm2, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2
> +       VPCMPEQ %ymm2, %ymm0, %ymm3
> +       VPCMPEQ %ymm2, %ymm1, %ymm2
> +       vpor    %ymm3, %ymm2, %ymm3
> +       vpmovmskb %ymm3, %eax
>         /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
>            so no need to manually mod edx.  */
>         sarxl   %edx, %eax, %eax
> @@ -291,13 +304,10 @@ L(cross_page_boundary):
>         xorl    %ecx, %ecx
>         /* Found CHAR or the null byte.  */
>         cmp     (%rdx, %rax), %CHAR_REG
> -       leaq    (%rdx, %rax), %rax
> -       cmovne  %rcx, %rax
> -# else
> -       addq    %rdx, %rax
> +       jne     L(zero_end)
>  # endif
> -L(return_vzeroupper):
> -       ZERO_UPPER_VEC_REGISTERS_RETURN
> +       addq    %rdx, %rax
> +       VZEROUPPER_RETURN
>
>  END (STRCHR)
> -# endif
> +#endif
> --
> 2.25.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread
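
The correlated-branch argument in the new comment is easiest to see at
a typical call site; a sketch (print_suffix is a hypothetical user
function, not from glibc):

  #include <stdio.h>
  #include <string.h>

  static void
  print_suffix (const char *path)
  {
    const char *p = strchr (path, '/');
    /* This branch goes the same way as the NULL/non-NULL decision
       made inside strchr.  If strchr resolves that decision with a
       branch, the two are perfectly correlated, so a mispredict
       inside strchr tends to replace, rather than add to, a
       mispredict here.  */
    if (p == NULL)
      puts ("no separator");
    else
      printf ("suffix: %s\n", p + 1);
  }

With cmovcc, strchr always pays the cmov latency and the caller still
eats its own mispredict; with a branch, a well-predicted case costs
nothing extra, which is the tradeoff the numbers above are probing.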

* Re: [PATCH v1 04/23] x86: Code cleanup in strchr-evex and comment justifying branch
  2022-03-23 21:57 ` [PATCH v1 04/23] x86: Code cleanup in strchr-evex " Noah Goldstein
@ 2022-03-24 18:54   ` H.J. Lu
  2022-05-12 19:32     ` Sunil Pandey
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:54 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Small code cleanup for size: -81 bytes.
>
> Add a comment justifying the use of a branch for the NULL/non-NULL
> return.
>
> All string/memory tests pass and no regressions in benchtests.
>
> geometric_mean(N=20) of all benchmarks New / Original: .985
> ---
> Geometric Mean N=20 runs; All functions page aligned
> length, alignment,  pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
>   2048,         0,   32,    0,               23,                127,               0.878
>   2048,         1,   32,    0,               23,                127,                0.88
>   2048,         0,   64,    0,               23,                127,               0.997
>   2048,         2,   64,    0,               23,                127,               1.001
>   2048,         0,  128,    0,               23,                127,               0.973
>   2048,         3,  128,    0,               23,                127,               0.971
>   2048,         0,  256,    0,               23,                127,               0.976
>   2048,         4,  256,    0,               23,                127,               0.973
>   2048,         0,  512,    0,               23,                127,               1.001
>   2048,         5,  512,    0,               23,                127,               1.004
>   2048,         0, 1024,    0,               23,                127,               1.005
>   2048,         6, 1024,    0,               23,                127,               1.007
>   2048,         0, 2048,    0,               23,                127,               1.035
>   2048,         7, 2048,    0,               23,                127,                1.03
>   4096,         0,   32,    0,               23,                127,               0.889
>   4096,         1,   32,    0,               23,                127,               0.891
>   4096,         0,   64,    0,               23,                127,               1.012
>   4096,         2,   64,    0,               23,                127,               1.017
>   4096,         0,  128,    0,               23,                127,               0.975
>   4096,         3,  128,    0,               23,                127,               0.974
>   4096,         0,  256,    0,               23,                127,               0.974
>   4096,         4,  256,    0,               23,                127,               0.972
>   4096,         0,  512,    0,               23,                127,               1.002
>   4096,         5,  512,    0,               23,                127,               1.016
>   4096,         0, 1024,    0,               23,                127,               1.009
>   4096,         6, 1024,    0,               23,                127,               1.008
>   4096,         0, 2048,    0,               23,                127,               1.003
>   4096,         7, 2048,    0,               23,                127,               1.004
>    256,         1,   64,    0,               23,                127,               0.993
>    256,         2,   64,    0,               23,                127,               0.999
>    256,         3,   64,    0,               23,                127,               0.992
>    256,         4,   64,    0,               23,                127,                0.99
>    256,         5,   64,    0,               23,                127,                0.99
>    256,         6,   64,    0,               23,                127,               0.994
>    256,         7,   64,    0,               23,                127,               0.991
>    512,         0,  256,    0,               23,                127,               0.971
>    512,        16,  256,    0,               23,                127,               0.971
>    512,        32,  256,    0,               23,                127,               1.005
>    512,        48,  256,    0,               23,                127,               0.998
>    512,        64,  256,    0,               23,                127,               1.001
>    512,        80,  256,    0,               23,                127,               1.002
>    512,        96,  256,    0,               23,                127,               1.005
>    512,       112,  256,    0,               23,                127,               1.012
>      1,         0,    0,    0,               23,                127,               1.024
>      2,         0,    1,    0,               23,                127,               0.991
>      3,         0,    2,    0,               23,                127,               0.997
>      4,         0,    3,    0,               23,                127,               0.984
>      5,         0,    4,    0,               23,                127,               0.993
>      6,         0,    5,    0,               23,                127,               0.985
>      7,         0,    6,    0,               23,                127,               0.979
>      8,         0,    7,    0,               23,                127,               0.975
>      9,         0,    8,    0,               23,                127,               0.965
>     10,         0,    9,    0,               23,                127,               0.957
>     11,         0,   10,    0,               23,                127,               0.979
>     12,         0,   11,    0,               23,                127,               0.987
>     13,         0,   12,    0,               23,                127,               1.023
>     14,         0,   13,    0,               23,                127,               0.997
>     15,         0,   14,    0,               23,                127,               0.983
>     16,         0,   15,    0,               23,                127,               0.987
>     17,         0,   16,    0,               23,                127,               0.993
>     18,         0,   17,    0,               23,                127,               0.985
>     19,         0,   18,    0,               23,                127,               0.999
>     20,         0,   19,    0,               23,                127,               0.998
>     21,         0,   20,    0,               23,                127,               0.983
>     22,         0,   21,    0,               23,                127,               0.983
>     23,         0,   22,    0,               23,                127,               1.002
>     24,         0,   23,    0,               23,                127,                 1.0
>     25,         0,   24,    0,               23,                127,               1.002
>     26,         0,   25,    0,               23,                127,               0.984
>     27,         0,   26,    0,               23,                127,               0.994
>     28,         0,   27,    0,               23,                127,               0.995
>     29,         0,   28,    0,               23,                127,               1.017
>     30,         0,   29,    0,               23,                127,               1.009
>     31,         0,   30,    0,               23,                127,               1.001
>     32,         0,   31,    0,               23,                127,               1.021
>   2048,         0,   32,    0,                0,                127,               0.899
>   2048,         1,   32,    0,                0,                127,                0.93
>   2048,         0,   64,    0,                0,                127,               1.009
>   2048,         2,   64,    0,                0,                127,               1.023
>   2048,         0,  128,    0,                0,                127,               0.973
>   2048,         3,  128,    0,                0,                127,               0.975
>   2048,         0,  256,    0,                0,                127,               0.974
>   2048,         4,  256,    0,                0,                127,                0.97
>   2048,         0,  512,    0,                0,                127,               0.999
>   2048,         5,  512,    0,                0,                127,               1.004
>   2048,         0, 1024,    0,                0,                127,               1.008
>   2048,         6, 1024,    0,                0,                127,               1.008
>   2048,         0, 2048,    0,                0,                127,               0.996
>   2048,         7, 2048,    0,                0,                127,               1.002
>   4096,         0,   32,    0,                0,                127,               0.872
>   4096,         1,   32,    0,                0,                127,               0.881
>   4096,         0,   64,    0,                0,                127,               1.006
>   4096,         2,   64,    0,                0,                127,               1.005
>   4096,         0,  128,    0,                0,                127,               0.973
>   4096,         3,  128,    0,                0,                127,               0.974
>   4096,         0,  256,    0,                0,                127,               0.969
>   4096,         4,  256,    0,                0,                127,               0.971
>   4096,         0,  512,    0,                0,                127,                 1.0
>   4096,         5,  512,    0,                0,                127,               1.005
>   4096,         0, 1024,    0,                0,                127,               1.007
>   4096,         6, 1024,    0,                0,                127,               1.009
>   4096,         0, 2048,    0,                0,                127,               1.005
>   4096,         7, 2048,    0,                0,                127,               1.007
>    256,         1,   64,    0,                0,                127,               0.994
>    256,         2,   64,    0,                0,                127,               1.008
>    256,         3,   64,    0,                0,                127,               1.019
>    256,         4,   64,    0,                0,                127,               0.991
>    256,         5,   64,    0,                0,                127,               0.992
>    256,         6,   64,    0,                0,                127,               0.991
>    256,         7,   64,    0,                0,                127,               0.988
>    512,         0,  256,    0,                0,                127,               0.971
>    512,        16,  256,    0,                0,                127,               0.967
>    512,        32,  256,    0,                0,                127,               1.005
>    512,        48,  256,    0,                0,                127,               1.001
>    512,        64,  256,    0,                0,                127,               1.009
>    512,        80,  256,    0,                0,                127,               1.008
>    512,        96,  256,    0,                0,                127,               1.009
>    512,       112,  256,    0,                0,                127,               1.016
>      1,         0,    0,    0,                0,                127,               1.038
>      2,         0,    1,    0,                0,                127,               1.009
>      3,         0,    2,    0,                0,                127,               0.992
>      4,         0,    3,    0,                0,                127,               1.004
>      5,         0,    4,    0,                0,                127,               0.966
>      6,         0,    5,    0,                0,                127,               0.968
>      7,         0,    6,    0,                0,                127,               1.004
>      8,         0,    7,    0,                0,                127,                0.99
>      9,         0,    8,    0,                0,                127,               0.958
>     10,         0,    9,    0,                0,                127,                0.96
>     11,         0,   10,    0,                0,                127,               0.948
>     12,         0,   11,    0,                0,                127,               0.984
>     13,         0,   12,    0,                0,                127,               0.967
>     14,         0,   13,    0,                0,                127,               0.993
>     15,         0,   14,    0,                0,                127,               0.991
>     16,         0,   15,    0,                0,                127,                 1.0
>     17,         0,   16,    0,                0,                127,               0.982
>     18,         0,   17,    0,                0,                127,               0.977
>     19,         0,   18,    0,                0,                127,               0.987
>     20,         0,   19,    0,                0,                127,               0.978
>     21,         0,   20,    0,                0,                127,                 1.0
>     22,         0,   21,    0,                0,                127,                0.99
>     23,         0,   22,    0,                0,                127,               0.988
>     24,         0,   23,    0,                0,                127,               0.997
>     25,         0,   24,    0,                0,                127,               1.003
>     26,         0,   25,    0,                0,                127,               1.004
>     27,         0,   26,    0,                0,                127,               0.982
>     28,         0,   27,    0,                0,                127,               0.972
>     29,         0,   28,    0,                0,                127,               0.978
>     30,         0,   29,    0,                0,                127,               0.992
>     31,         0,   30,    0,                0,                127,               0.986
>     32,         0,   31,    0,                0,                127,                 1.0
>
>     16,         0,   15,    1,                1,                  0,               0.997
>     16,         0,   15,    1,                0,                  0,               1.001
>     16,         0,   15,    1,                1,                0.1,               0.984
>     16,         0,   15,    1,                0,                0.1,               0.999
>     16,         0,   15,    1,                1,               0.25,               0.929
>     16,         0,   15,    1,                0,               0.25,               1.001
>     16,         0,   15,    1,                1,               0.33,               0.892
>     16,         0,   15,    1,                0,               0.33,               0.996
>     16,         0,   15,    1,                1,                0.5,               0.897
>     16,         0,   15,    1,                0,                0.5,               1.009
>     16,         0,   15,    1,                1,               0.66,               0.882
>     16,         0,   15,    1,                0,               0.66,               0.967
>     16,         0,   15,    1,                1,               0.75,               0.919
>     16,         0,   15,    1,                0,               0.75,               1.027
>     16,         0,   15,    1,                1,                0.9,               0.949
>     16,         0,   15,    1,                0,                0.9,               1.021
>     16,         0,   15,    1,                1,                  1,               0.998
>     16,         0,   15,    1,                0,                  1,               0.999
>
>  sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++-----------
>  1 file changed, 80 insertions(+), 66 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
> index f62cd9d144..ec739fb8f9 100644
> --- a/sysdeps/x86_64/multiarch/strchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/strchr-evex.S
> @@ -30,6 +30,7 @@
>  # ifdef USE_AS_WCSCHR
>  #  define VPBROADCAST  vpbroadcastd
>  #  define VPCMP                vpcmpd
> +#  define VPTESTN      vptestnmd
>  #  define VPMINU       vpminud
>  #  define CHAR_REG     esi
>  #  define SHIFT_REG    ecx
> @@ -37,6 +38,7 @@
>  # else
>  #  define VPBROADCAST  vpbroadcastb
>  #  define VPCMP                vpcmpb
> +#  define VPTESTN      vptestnmb
>  #  define VPMINU       vpminub
>  #  define CHAR_REG     sil
>  #  define SHIFT_REG    edx
> @@ -61,13 +63,11 @@
>  # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
>
>         .section .text.evex,"ax",@progbits
> -ENTRY (STRCHR)
> +ENTRY_P2ALIGN (STRCHR, 5)
>         /* Broadcast CHAR to YMM0.      */
>         VPBROADCAST     %esi, %YMM0
>         movl    %edi, %eax
>         andl    $(PAGE_SIZE - 1), %eax
> -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> -
>         /* Check if we cross page boundary with one vector load.
>            Otherwise it is safe to use an unaligned load.  */
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> @@ -81,49 +81,35 @@ ENTRY (STRCHR)
>         vpxorq  %YMM1, %YMM0, %YMM2
>         VPMINU  %YMM2, %YMM1, %YMM2
>         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> +       VPTESTN %YMM2, %YMM2, %k0
>         kmovd   %k0, %eax
>         testl   %eax, %eax
>         jz      L(aligned_more)
>         tzcntl  %eax, %eax
> +# ifndef USE_AS_STRCHRNUL
> +       /* Found CHAR or the null byte.  */
> +       cmp     (%rdi, %rax, CHAR_SIZE), %CHAR_REG
> +       /* NB: Use a branch instead of cmovcc here. The expectation is
> +          that with strchr the user will branch based on the input
> +          being null. Since this branch is 100% predictive of the
> +          user branch, a branch miss here should save what would
> +          otherwise be a branch miss in the user code. As well, using
> +          a branch 1) saves code size and 2) is faster in highly
> +          predictable environments.  */
> +       jne     L(zero)
> +# endif
>  # ifdef USE_AS_WCSCHR
>         /* NB: Multiply wchar_t count by 4 to get the number of bytes.
>          */
>         leaq    (%rdi, %rax, CHAR_SIZE), %rax
>  # else
>         addq    %rdi, %rax
> -# endif
> -# ifndef USE_AS_STRCHRNUL
> -       /* Found CHAR or the null byte.  */
> -       cmp     (%rax), %CHAR_REG
> -       jne     L(zero)
>  # endif
>         ret
>
> -       /* .p2align 5 helps keep performance more consistent if ENTRY()
> -          alignment % 32 was either 16 or 0. As well this makes the
> -          alignment % 32 of the loop_4x_vec fixed which makes tuning it
> -          easier.  */
> -       .p2align 5
> -L(first_vec_x3):
> -       tzcntl  %eax, %eax
> -# ifndef USE_AS_STRCHRNUL
> -       /* Found CHAR or the null byte.  */
> -       cmp     (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> -       jne     L(zero)
> -# endif
> -       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> -          bytes.  */
> -       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> -       ret
>
> -# ifndef USE_AS_STRCHRNUL
> -L(zero):
> -       xorl    %eax, %eax
> -       ret
> -# endif
>
> -       .p2align 4
> +       .p2align 4,, 10
>  L(first_vec_x4):
>  # ifndef USE_AS_STRCHRNUL
>         /* Check to see if first match was CHAR (k0) or null (k1).  */
> @@ -144,9 +130,18 @@ L(first_vec_x4):
>         leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> +# ifndef USE_AS_STRCHRNUL
> +L(zero):
> +       xorl    %eax, %eax
> +       ret
> +# endif
> +
> +
>         .p2align 4
>  L(first_vec_x1):
> -       tzcntl  %eax, %eax
> +       /* Use bsf here to save 1 byte, keeping the block in a single
> +          fetch block. eax is guaranteed non-zero.  */
> +       bsfl    %eax, %eax
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
>         cmp     (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> @@ -158,7 +153,7 @@ L(first_vec_x1):
>         leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4
> +       .p2align 4,, 10
>  L(first_vec_x2):
>  # ifndef USE_AS_STRCHRNUL
>         /* Check to see if first match was CHAR (k0) or null (k1).  */
> @@ -179,6 +174,21 @@ L(first_vec_x2):
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> +       .p2align 4,, 10
> +L(first_vec_x3):
> +       /* Use bsf here to save 1 byte, keeping the block in a single
> +          fetch block. eax is guaranteed non-zero.  */
> +       bsfl    %eax, %eax
> +# ifndef USE_AS_STRCHRNUL
> +       /* Found CHAR or the null byte.  */
> +       cmp     (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> +       jne     L(zero)
> +# endif
> +       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> +          bytes.  */
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
> +
>         .p2align 4
>  L(aligned_more):
>         /* Align data to VEC_SIZE.  */
> @@ -195,7 +205,7 @@ L(cross_page_continue):
>         vpxorq  %YMM1, %YMM0, %YMM2
>         VPMINU  %YMM2, %YMM1, %YMM2
>         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> +       VPTESTN %YMM2, %YMM2, %k0
>         kmovd   %k0, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x1)
> @@ -206,7 +216,7 @@ L(cross_page_continue):
>         /* Each bit in K0 represents a CHAR in YMM1.  */
>         VPCMP   $0, %YMM1, %YMM0, %k0
>         /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMM1, %YMMZERO, %k1
> +       VPTESTN %YMM1, %YMM1, %k1
>         kortestd        %k0, %k1
>         jnz     L(first_vec_x2)
>
> @@ -215,7 +225,7 @@ L(cross_page_continue):
>         vpxorq  %YMM1, %YMM0, %YMM2
>         VPMINU  %YMM2, %YMM1, %YMM2
>         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> +       VPTESTN %YMM2, %YMM2, %k0
>         kmovd   %k0, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x3)
> @@ -224,7 +234,7 @@ L(cross_page_continue):
>         /* Each bit in K0 represents a CHAR in YMM1.  */
>         VPCMP   $0, %YMM1, %YMM0, %k0
>         /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMM1, %YMMZERO, %k1
> +       VPTESTN %YMM1, %YMM1, %k1
>         kortestd        %k0, %k1
>         jnz     L(first_vec_x4)
>
> @@ -265,33 +275,33 @@ L(loop_4x_vec):
>         VPMINU  %YMM3, %YMM4, %YMM4
>         VPMINU  %YMM2, %YMM4, %YMM4{%k4}{z}
>
> -       VPCMP   $0, %YMMZERO, %YMM4, %k1
> +       VPTESTN %YMM4, %YMM4, %k1
>         kmovd   %k1, %ecx
>         subq    $-(VEC_SIZE * 4), %rdi
>         testl   %ecx, %ecx
>         jz      L(loop_4x_vec)
>
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> +       VPTESTN %YMM1, %YMM1, %k0
>         kmovd   %k0, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_x1)
>
> -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> +       VPTESTN %YMM2, %YMM2, %k0
>         kmovd   %k0, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_x2)
>
> -       VPCMP   $0, %YMMZERO, %YMM3, %k0
> +       VPTESTN %YMM3, %YMM3, %k0
>         kmovd   %k0, %eax
>         /* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
>  # ifdef USE_AS_WCSCHR
>         sall    $8, %ecx
>         orl     %ecx, %eax
> -       tzcntl  %eax, %eax
> +       bsfl    %eax, %eax
>  # else
>         salq    $32, %rcx
>         orq     %rcx, %rax
> -       tzcntq  %rax, %rax
> +       bsfq    %rax, %rax
>  # endif
>  # ifndef USE_AS_STRCHRNUL
>         /* Check if match was CHAR or null.  */
> @@ -303,28 +313,28 @@ L(loop_4x_vec):
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> -# ifndef USE_AS_STRCHRNUL
> -L(zero_end):
> -       xorl    %eax, %eax
> -       ret
> +       .p2align 4,, 8
> +L(last_vec_x1):
> +       bsfl    %eax, %eax
> +# ifdef USE_AS_WCSCHR
> +       /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> +          */
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       addq    %rdi, %rax
>  # endif
>
> -       .p2align 4
> -L(last_vec_x1):
> -       tzcntl  %eax, %eax
>  # ifndef USE_AS_STRCHRNUL
>         /* Check if match was null.  */
> -       cmp     (%rdi, %rax, CHAR_SIZE), %CHAR_REG
> +       cmp     (%rax), %CHAR_REG
>         jne     L(zero_end)
>  # endif
> -       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> -          bytes.  */
> -       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +
>         ret
>
> -       .p2align 4
> +       .p2align 4,, 8
>  L(last_vec_x2):
> -       tzcntl  %eax, %eax
> +       bsfl    %eax, %eax
>  # ifndef USE_AS_STRCHRNUL
>         /* Check if match was null.  */
>         cmp     (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> @@ -336,7 +346,7 @@ L(last_vec_x2):
>         ret
>
>         /* Cold case for crossing page with first load.  */
> -       .p2align 4
> +       .p2align 4,, 8
>  L(cross_page_boundary):
>         movq    %rdi, %rdx
>         /* Align rdi.  */
> @@ -346,9 +356,9 @@ L(cross_page_boundary):
>         vpxorq  %YMM1, %YMM0, %YMM2
>         VPMINU  %YMM2, %YMM1, %YMM2
>         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> +       VPTESTN %YMM2, %YMM2, %k0
>         kmovd   %k0, %eax
> -       /* Remove the leading bits.      */
> +       /* Remove the leading bits.  */
>  # ifdef USE_AS_WCSCHR
>         movl    %edx, %SHIFT_REG
>         /* NB: Divide shift count by 4 since each bit in K1 represent 4
> @@ -360,20 +370,24 @@ L(cross_page_boundary):
>         /* If eax is zero continue.  */
>         testl   %eax, %eax
>         jz      L(cross_page_continue)
> -       tzcntl  %eax, %eax
> -# ifndef USE_AS_STRCHRNUL
> -       /* Check to see if match was CHAR or null.  */
> -       cmp     (%rdx, %rax, CHAR_SIZE), %CHAR_REG
> -       jne     L(zero_end)
> -# endif
> +       bsfl    %eax, %eax
> +
>  # ifdef USE_AS_WCSCHR
>         /* NB: Multiply wchar_t count by 4 to get the number of
>            bytes.  */
>         leaq    (%rdx, %rax, CHAR_SIZE), %rax
>  # else
>         addq    %rdx, %rax
> +# endif
> +# ifndef USE_AS_STRCHRNUL
> +       /* Check to see if match was CHAR or null.  */
> +       cmp     (%rax), %CHAR_REG
> +       je      L(cross_page_ret)
> +L(zero_end):
> +       xorl    %eax, %eax
> +L(cross_page_ret):
>  # endif
>         ret
>
>  END (STRCHR)
> -# endif
> +#endif
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c
  2022-03-23 21:57 ` [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c Noah Goldstein
@ 2022-03-24 18:54   ` H.J. Lu
  0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:54 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Just a QOL change to make parsing the output of the benchtests more
> consistent.
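>
> With json-lib the benchmark emits a JSON document of roughly the
> following shape (a sketch inferred from the json_* calls in this
> patch; the timing_type, ifunc names, and timing values are
> placeholders, not real output):
>
>   {
>    "timing_type": "hp_timing",
>    "functions": {
>     "strpbrk": {
>      "bench-variant": "",
>      "ifuncs": ["generic_strpbrk", "strpbrk_sse42"],
>      "results": [
>       {"len": 6, "pos": 512, "align1": 0, "align2": 0,
>        "timings": [14.4, 16.8]},
>       ...
>      ]
>     }
>    }
>   }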
> ---
>  benchtests/bench-strpbrk.c | 81 ++++++++++++++++++++++++++++----------
>  1 file changed, 61 insertions(+), 20 deletions(-)
>
> diff --git a/benchtests/bench-strpbrk.c b/benchtests/bench-strpbrk.c
> index d46bf9c0e2..a7522a76e6 100644
> --- a/benchtests/bench-strpbrk.c
> +++ b/benchtests/bench-strpbrk.c
> @@ -62,11 +62,14 @@ SIMPLE_STRPBRK (const CHAR *s, const CHAR *rej)
>
>  #endif /* !STRPBRK_RESULT */
>
> +#include "json-lib.h"
> +
>  static void
> -do_one_test (impl_t *impl, const CHAR *s, const CHAR *rej, RES_TYPE exp_res)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s,
> +             const CHAR *rej, RES_TYPE exp_res)
>  {
>    RES_TYPE res = CALL (impl, s, rej);
> -  size_t i, iters = INNER_LOOP_ITERS_MEDIUM;
> +  size_t i, iters = INNER_LOOP_ITERS;
>    timing_t start, stop, cur;
>
>    if (res != exp_res)
> @@ -86,23 +89,26 @@ do_one_test (impl_t *impl, const CHAR *s, const CHAR *rej, RES_TYPE exp_res)
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double)cur / (double)iters);
>  }
>
>  static void
> -do_test (size_t align, size_t pos, size_t len)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t pos,
> +         size_t len)
>  {
>    size_t i;
>    int c;
>    RES_TYPE result;
>    CHAR *rej, *s;
>
> -  align &= 7;
> -  if ((align + pos + 10) * sizeof (CHAR) >= page_size || len > 240)
> +  align1 &= 7;
> +  if ((align1 + pos + 10) * sizeof (CHAR) >= page_size || len > 240)
> +    return;
> +  if ((align2 + len) * sizeof (CHAR) >= page_size)
>      return;
>
> -  rej = (CHAR *) (buf2) + (random () & 255);
> -  s = (CHAR *) (buf1) + align;
> +  rej = (CHAR *) (buf2) + align2;
> +  s = (CHAR *) (buf1) + align1;
>
>    for (i = 0; i < len; ++i)
>      {
> @@ -136,43 +142,78 @@ do_test (size_t align, size_t pos, size_t len)
>      }
>    result = STRPBRK_RESULT (s, pos);
>
> -  printf ("Length %4zd, alignment %2zd, rej len %2zd:", pos, align, len);
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "len", len);
> +  json_attr_uint (json_ctx, "pos", pos);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, s, rej, result);
> +    do_one_test (json_ctx, impl, s, rej, result);
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  test_main (void)
>  {
> +  json_ctx_t json_ctx;
>    size_t i;
>
>    test_init ();
>
> -  printf ("%32s", "");
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
> +
>
>    for (i = 0; i < 32; ++i)
>      {
> -      do_test (0, 512, i);
> -      do_test (i, 512, i);
> +      do_test (&json_ctx, 0, 0, 512, i);
> +      do_test (&json_ctx, i, 0, 512, i);
> +      do_test (&json_ctx, 0, i, 512, i);
> +      do_test (&json_ctx, i, i, 512, i);
> +
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 16 << i, 4);
> -      do_test (i, 16 << i, 4);
> +      do_test (&json_ctx, 0, 0, 16 << i, 4);
> +      do_test (&json_ctx, i, 0, 16 << i, 4);
> +      do_test (&json_ctx, 0, i, 16 << i, 4);
> +      do_test (&json_ctx, i, i, 16 << i, 4);
>      }
>
>    for (i = 1; i < 8; ++i)
> -    do_test (i, 64, 10);
> +  {
> +    do_test (&json_ctx, i, 0, 64, 10);
> +    do_test (&json_ctx, i, i, 64, 10);
> +  }
>
>    for (i = 0; i < 64; ++i)
> -    do_test (0, i, 6);
> +  {
> +    do_test (&json_ctx, 0, 0, i, 6);
> +    do_test (&json_ctx, 0, i, i, 6);
> +  }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
>
>    return ret;
>  }
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c
  2022-03-23 21:57 ` [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c Noah Goldstein
@ 2022-03-24 18:54   ` H.J. Lu
  0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:54 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Just a QOL change to make parsing the output of the benchtests more
> consistent.
> ---
>  benchtests/bench-strspn.c | 78 +++++++++++++++++++++++++++++----------
>  1 file changed, 58 insertions(+), 20 deletions(-)
>
> diff --git a/benchtests/bench-strspn.c b/benchtests/bench-strspn.c
> index d79c36fae6..061e90c54d 100644
> --- a/benchtests/bench-strspn.c
> +++ b/benchtests/bench-strspn.c
> @@ -23,6 +23,7 @@
>  # define TEST_NAME "wcsspn"
>  #endif /* WIDE */
>  #include "bench-string.h"
> +#include "json-lib.h"
>
>  #define BIG_CHAR MAX_CHAR
>
> @@ -58,9 +59,10 @@ SIMPLE_STRSPN (const CHAR *s, const CHAR *acc)
>  }
>
>  static void
> -do_one_test (impl_t *impl, const CHAR *s, const CHAR *acc, size_t exp_res)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s,
> +             const CHAR *acc, size_t exp_res)
>  {
> -  size_t res = CALL (impl, s, acc), i, iters = INNER_LOOP_ITERS_MEDIUM;
> +  size_t res = CALL (impl, s, acc), i, iters = INNER_LOOP_ITERS;
>    timing_t start, stop, cur;
>
>    if (res != exp_res)
> @@ -80,21 +82,24 @@ do_one_test (impl_t *impl, const CHAR *s, const CHAR *acc, size_t exp_res)
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double)cur / (double)iters);
>  }
>
>  static void
> -do_test (size_t align, size_t pos, size_t len)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t pos,
> +         size_t len)
>  {
>    size_t i;
>    CHAR *acc, *s;
>
> -  align &= 7;
> -  if ((align + pos + 10) * sizeof (CHAR) >= page_size || len > 240 || ! len)
> +  align1 &= 7;
> +  if ((align1 + pos + 10) * sizeof (CHAR) >= page_size || len > 240 || !len)
> +    return;
> +  if ((align2 + len) * sizeof (CHAR) >= page_size)
>      return;
>
> -  acc = (CHAR *) (buf2) + (random () & 255);
> -  s = (CHAR *) (buf1) + align;
> +  acc = (CHAR *) (buf2) + align2;
> +  s = (CHAR *) (buf1) + align1;
>
>    for (i = 0; i < len; ++i)
>      {
> @@ -118,43 +123,76 @@ do_test (size_t align, size_t pos, size_t len)
>        s[i] = '\0';
>      }
>
> -  printf ("Length %4zd, alignment %2zd, acc len %2zd:", pos, align, len);
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "len", len);
> +  json_attr_uint (json_ctx, "pos", pos);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, s, acc, pos);
> +    do_one_test (json_ctx, impl, s, acc, pos);
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  test_main (void)
>  {
> +  json_ctx_t json_ctx;
>    size_t i;
>
>    test_init ();
>
> -  printf ("%32s", "");
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
>    for (i = 0; i < 32; ++i)
>      {
> -      do_test (0, 512, i);
> -      do_test (i, 512, i);
> +      do_test (&json_ctx, 0, 0, 512, i);
> +      do_test (&json_ctx, i, 0, 512, i);
> +      do_test (&json_ctx, 0, i, 512, i);
> +      do_test (&json_ctx, i, i, 512, i);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 16 << i, 4);
> -      do_test (i, 16 << i, 4);
> +      do_test (&json_ctx, 0, 0, 16 << i, 4);
> +      do_test (&json_ctx, i, 0, 16 << i, 4);
> +      do_test (&json_ctx, 0, i, 16 << i, 4);
> +      do_test (&json_ctx, i, i, 16 << i, 4);
>      }
>
>    for (i = 1; i < 8; ++i)
> -    do_test (i, 64, 10);
> +    {
> +      do_test (&json_ctx, i, 0, 64, 10);
> +      do_test (&json_ctx, i, i, 64, 10);
> +    }
>
>    for (i = 0; i < 64; ++i)
> -    do_test (0, i, 6);
> +    {
> +      do_test (&json_ctx, 0, 0, i, 6);
> +      do_test (&json_ctx, 0, i, i, 6);
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
>
>    return ret;
>  }
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c
  2022-03-23 21:57 ` [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c Noah Goldstein
@ 2022-03-24 18:55   ` H.J. Lu
  2022-05-12 19:34     ` Sunil Pandey
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:55 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
> _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
> sign extensions.
>
> geometric_mean(N=20) of all benchmarks that don't fall back on
> sse2/strlen; New / Original: .928
>
> All string/memory tests pass.
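>
> As a standalone illustration of the change (first_zero_index is a
> hypothetical helper for this sketch, not code from the patch), the
> terminator scan that previously went through _mm_cmpistri can be
> done with two SSE2 intrinsics and a bit scan:
>
>   #include <emmintrin.h>
>
>   /* Return the index of the first NULL byte in a 16-byte aligned
>      chunk, or 16 if the chunk contains none.  */
>   static inline unsigned int
>   first_zero_index (const void *chunk)
>   {
>     __m128i v = _mm_load_si128 ((const __m128i *) chunk);
>     /* 0xff in each byte position that holds a NULL byte.  */
>     __m128i eq = _mm_cmpeq_epi8 (v, _mm_set1_epi8 (0));
>     /* One bit per byte; the lowest set bit is the first NULL.  */
>     unsigned int bits = _mm_movemask_epi8 (eq);
>     return bits ? (unsigned int) __builtin_ctz (bits) : 16;
>   }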
> ---
> Geometric Mean N=20 runs; All functions page aligned
> len, align1, align2,  pos, New Time / Old Time
>   0,      0,      0,  512,               1.207
>   1,      0,      0,  512,               1.039
>   1,      1,      0,  512,               0.997
>   1,      0,      1,  512,               0.981
>   1,      1,      1,  512,               0.977
>   2,      0,      0,  512,                1.02
>   2,      2,      0,  512,               0.979
>   2,      0,      2,  512,               0.902
>   2,      2,      2,  512,               0.958
>   3,      0,      0,  512,               0.978
>   3,      3,      0,  512,               0.988
>   3,      0,      3,  512,               0.979
>   3,      3,      3,  512,               0.955
>   4,      0,      0,  512,               0.969
>   4,      4,      0,  512,               0.991
>   4,      0,      4,  512,                0.94
>   4,      4,      4,  512,               0.958
>   5,      0,      0,  512,               0.963
>   5,      5,      0,  512,               1.004
>   5,      0,      5,  512,               0.948
>   5,      5,      5,  512,               0.971
>   6,      0,      0,  512,               0.933
>   6,      6,      0,  512,               1.007
>   6,      0,      6,  512,               0.921
>   6,      6,      6,  512,               0.969
>   7,      0,      0,  512,               0.928
>   7,      7,      0,  512,               0.976
>   7,      0,      7,  512,               0.932
>   7,      7,      7,  512,               0.995
>   8,      0,      0,  512,               0.931
>   8,      0,      8,  512,               0.766
>   9,      0,      0,  512,               0.965
>   9,      1,      0,  512,               0.999
>   9,      0,      9,  512,               0.765
>   9,      1,      9,  512,                0.97
>  10,      0,      0,  512,               0.976
>  10,      2,      0,  512,               0.991
>  10,      0,     10,  512,               0.768
>  10,      2,     10,  512,               0.926
>  11,      0,      0,  512,               0.958
>  11,      3,      0,  512,               1.006
>  11,      0,     11,  512,               0.768
>  11,      3,     11,  512,               0.908
>  12,      0,      0,  512,               0.945
>  12,      4,      0,  512,               0.896
>  12,      0,     12,  512,               0.764
>  12,      4,     12,  512,               0.785
>  13,      0,      0,  512,               0.957
>  13,      5,      0,  512,               1.019
>  13,      0,     13,  512,                0.76
>  13,      5,     13,  512,               0.785
>  14,      0,      0,  512,               0.918
>  14,      6,      0,  512,               1.004
>  14,      0,     14,  512,                0.78
>  14,      6,     14,  512,               0.711
>  15,      0,      0,  512,               0.855
>  15,      7,      0,  512,               0.985
>  15,      0,     15,  512,               0.779
>  15,      7,     15,  512,               0.772
>  16,      0,      0,  512,               0.987
>  16,      0,     16,  512,                0.99
>  17,      0,      0,  512,               0.996
>  17,      1,      0,  512,               0.979
>  17,      0,     17,  512,               1.001
>  17,      1,     17,  512,                1.03
>  18,      0,      0,  512,               0.976
>  18,      2,      0,  512,               0.989
>  18,      0,     18,  512,               0.976
>  18,      2,     18,  512,               0.992
>  19,      0,      0,  512,               0.991
>  19,      3,      0,  512,               0.988
>  19,      0,     19,  512,               1.009
>  19,      3,     19,  512,               1.018
>  20,      0,      0,  512,               0.999
>  20,      4,      0,  512,               1.005
>  20,      0,     20,  512,               0.993
>  20,      4,     20,  512,               0.983
>  21,      0,      0,  512,               0.982
>  21,      5,      0,  512,               0.988
>  21,      0,     21,  512,               0.978
>  21,      5,     21,  512,               0.984
>  22,      0,      0,  512,               0.988
>  22,      6,      0,  512,               0.979
>  22,      0,     22,  512,               0.984
>  22,      6,     22,  512,               0.983
>  23,      0,      0,  512,               0.996
>  23,      7,      0,  512,               0.998
>  23,      0,     23,  512,               0.979
>  23,      7,     23,  512,               0.987
>  24,      0,      0,  512,                0.99
>  24,      0,     24,  512,               0.979
>  25,      0,      0,  512,               0.985
>  25,      1,      0,  512,               0.988
>  25,      0,     25,  512,                0.99
>  25,      1,     25,  512,               0.986
>  26,      0,      0,  512,               1.005
>  26,      2,      0,  512,               0.995
>  26,      0,     26,  512,               0.992
>  26,      2,     26,  512,               0.983
>  27,      0,      0,  512,               0.986
>  27,      3,      0,  512,               0.978
>  27,      0,     27,  512,               0.986
>  27,      3,     27,  512,               0.973
>  28,      0,      0,  512,               0.995
>  28,      4,      0,  512,               0.993
>  28,      0,     28,  512,               0.983
>  28,      4,     28,  512,               1.005
>  29,      0,      0,  512,               0.983
>  29,      5,      0,  512,               0.982
>  29,      0,     29,  512,               0.984
>  29,      5,     29,  512,               1.005
>  30,      0,      0,  512,               0.978
>  30,      6,      0,  512,               0.985
>  30,      0,     30,  512,               0.994
>  30,      6,     30,  512,               0.993
>  31,      0,      0,  512,               0.984
>  31,      7,      0,  512,               0.983
>  31,      0,     31,  512,                 1.0
>  31,      7,     31,  512,               1.031
>   4,      0,      0,   32,               0.916
>   4,      1,      0,   32,               0.952
>   4,      0,      1,   32,               0.927
>   4,      1,      1,   32,               0.969
>   4,      0,      0,   64,               0.961
>   4,      2,      0,   64,               0.955
>   4,      0,      2,   64,               0.975
>   4,      2,      2,   64,               0.972
>   4,      0,      0,  128,               0.971
>   4,      3,      0,  128,               0.982
>   4,      0,      3,  128,               0.945
>   4,      3,      3,  128,               0.971
>   4,      0,      0,  256,               1.004
>   4,      4,      0,  256,               0.966
>   4,      0,      4,  256,               0.961
>   4,      4,      4,  256,               0.971
>   4,      5,      0,  512,               0.929
>   4,      0,      5,  512,               0.969
>   4,      5,      5,  512,               0.985
>   4,      0,      0, 1024,               1.003
>   4,      6,      0, 1024,               1.009
>   4,      0,      6, 1024,               1.005
>   4,      6,      6, 1024,               0.999
>   4,      0,      0, 2048,               0.917
>   4,      7,      0, 2048,               1.015
>   4,      0,      7, 2048,               1.011
>   4,      7,      7, 2048,               0.907
>  10,      1,      0,   64,               0.964
>  10,      1,      1,   64,               0.966
>  10,      2,      0,   64,               0.953
>  10,      2,      2,   64,               0.972
>  10,      3,      0,   64,               0.962
>  10,      3,      3,   64,               0.969
>  10,      4,      0,   64,               0.957
>  10,      4,      4,   64,               0.969
>  10,      5,      0,   64,               0.961
>  10,      5,      5,   64,               0.965
>  10,      6,      0,   64,               0.949
>  10,      6,      6,   64,                 0.9
>  10,      7,      0,   64,               0.957
>  10,      7,      7,   64,               0.897
>   6,      0,      0,    0,               0.991
>   6,      0,      0,    1,               1.011
>   6,      0,      1,    1,               0.939
>   6,      0,      0,    2,               1.016
>   6,      0,      2,    2,                0.94
>   6,      0,      0,    3,               1.019
>   6,      0,      3,    3,               0.941
>   6,      0,      0,    4,               1.056
>   6,      0,      4,    4,               0.884
>   6,      0,      0,    5,               0.977
>   6,      0,      5,    5,               0.934
>   6,      0,      0,    6,               0.954
>   6,      0,      6,    6,                0.93
>   6,      0,      0,    7,               0.963
>   6,      0,      7,    7,               0.916
>   6,      0,      0,    8,               0.963
>   6,      0,      8,    8,               0.945
>   6,      0,      0,    9,               1.028
>   6,      0,      9,    9,               0.942
>   6,      0,      0,   10,               0.955
>   6,      0,     10,   10,               0.831
>   6,      0,      0,   11,               0.948
>   6,      0,     11,   11,                0.82
>   6,      0,      0,   12,               1.033
>   6,      0,     12,   12,               0.873
>   6,      0,      0,   13,               0.983
>   6,      0,     13,   13,               0.852
>   6,      0,      0,   14,               0.984
>   6,      0,     14,   14,               0.853
>   6,      0,      0,   15,               0.984
>   6,      0,     15,   15,               0.882
>   6,      0,      0,   16,               0.971
>   6,      0,     16,   16,               0.958
>   6,      0,      0,   17,               0.938
>   6,      0,     17,   17,               0.947
>   6,      0,      0,   18,                0.96
>   6,      0,     18,   18,               0.938
>   6,      0,      0,   19,               0.903
>   6,      0,     19,   19,               0.943
>   6,      0,      0,   20,               0.947
>   6,      0,     20,   20,               0.951
>   6,      0,      0,   21,               0.948
>   6,      0,     21,   21,                0.96
>   6,      0,      0,   22,               0.926
>   6,      0,     22,   22,               0.951
>   6,      0,      0,   23,               0.923
>   6,      0,     23,   23,               0.959
>   6,      0,      0,   24,               0.918
>   6,      0,     24,   24,               0.952
>   6,      0,      0,   25,                0.97
>   6,      0,     25,   25,               0.952
>   6,      0,      0,   26,               0.871
>   6,      0,     26,   26,               0.869
>   6,      0,      0,   27,               0.935
>   6,      0,     27,   27,               0.836
>   6,      0,      0,   28,               0.936
>   6,      0,     28,   28,               0.857
>   6,      0,      0,   29,               0.876
>   6,      0,     29,   29,               0.859
>   6,      0,      0,   30,               0.934
>   6,      0,     30,   30,               0.857
>   6,      0,      0,   31,               0.962
>   6,      0,     31,   31,                0.86
>   6,      0,      0,   32,               0.912
>   6,      0,     32,   32,                0.94
>   6,      0,      0,   33,               0.903
>   6,      0,     33,   33,               0.968
>   6,      0,      0,   34,               0.913
>   6,      0,     34,   34,               0.896
>   6,      0,      0,   35,               0.904
>   6,      0,     35,   35,               0.913
>   6,      0,      0,   36,               0.905
>   6,      0,     36,   36,               0.907
>   6,      0,      0,   37,               0.899
>   6,      0,     37,   37,                 0.9
>   6,      0,      0,   38,               0.912
>   6,      0,     38,   38,               0.919
>   6,      0,      0,   39,               0.925
>   6,      0,     39,   39,               0.927
>   6,      0,      0,   40,               0.923
>   6,      0,     40,   40,               0.972
>   6,      0,      0,   41,                0.92
>   6,      0,     41,   41,               0.966
>   6,      0,      0,   42,               0.915
>   6,      0,     42,   42,               0.834
>   6,      0,      0,   43,                0.92
>   6,      0,     43,   43,               0.856
>   6,      0,      0,   44,               0.908
>   6,      0,     44,   44,               0.858
>   6,      0,      0,   45,               0.932
>   6,      0,     45,   45,               0.847
>   6,      0,      0,   46,               0.927
>   6,      0,     46,   46,               0.859
>   6,      0,      0,   47,               0.902
>   6,      0,     47,   47,               0.855
>   6,      0,      0,   48,               0.949
>   6,      0,     48,   48,               0.934
>   6,      0,      0,   49,               0.907
>   6,      0,     49,   49,               0.943
>   6,      0,      0,   50,               0.934
>   6,      0,     50,   50,               0.943
>   6,      0,      0,   51,               0.933
>   6,      0,     51,   51,               0.939
>   6,      0,      0,   52,               0.944
>   6,      0,     52,   52,               0.944
>   6,      0,      0,   53,               0.939
>   6,      0,     53,   53,               0.938
>   6,      0,      0,   54,                 0.9
>   6,      0,     54,   54,               0.923
>   6,      0,      0,   55,                 0.9
>   6,      0,     55,   55,               0.927
>   6,      0,      0,   56,                 0.9
>   6,      0,     56,   56,               0.917
>   6,      0,      0,   57,                 0.9
>   6,      0,     57,   57,               0.916
>   6,      0,      0,   58,               0.914
>   6,      0,     58,   58,               0.784
>   6,      0,      0,   59,               0.863
>   6,      0,     59,   59,               0.846
>   6,      0,      0,   60,                0.88
>   6,      0,     60,   60,               0.827
>   6,      0,      0,   61,               0.896
>   6,      0,     61,   61,               0.847
>   6,      0,      0,   62,               0.894
>   6,      0,     62,   62,               0.865
>   6,      0,      0,   63,               0.934
>   6,      0,     63,   63,               0.866
>
>  sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++---------------
>  1 file changed, 37 insertions(+), 46 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
> index 013aebf797..c312fab8b1 100644
> --- a/sysdeps/x86_64/multiarch/strcspn-c.c
> +++ b/sysdeps/x86_64/multiarch/strcspn-c.c
> @@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
>      RETURN (NULL, strlen (s));
>
>    const char *aligned;
> -  __m128i mask;
> -  int offset = (int) ((size_t) a & 15);
> +  __m128i mask, maskz, zero;
> +  unsigned int maskz_bits;
> +  unsigned int offset = (unsigned int) ((size_t) a & 15);
> +  zero = _mm_set1_epi8 (0);
>    if (offset != 0)
>      {
>        /* Load masks.  */
>        aligned = (const char *) ((size_t) a & -16L);
>        __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> -
> -      mask = __m128i_shift_right (mask0, offset);
> +      maskz = _mm_cmpeq_epi8 (mask0, zero);
>
>        /* Find where the NULL terminator is.  */
> -      int length = _mm_cmpistri (mask, mask, 0x3a);
> -      if (length == 16 - offset)
> -       {
> -         /* There is no NULL terminator.  */
> -         __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
> -         int index = _mm_cmpistri (mask1, mask1, 0x3a);
> -         length += index;
> -
> -         /* Don't use SSE4.2 if the length of A > 16.  */
> -         if (length > 16)
> -           return STRCSPN_SSE2 (s, a);
> -
> -         if (index != 0)
> -           {
> -             /* Combine mask0 and mask1.  We could play games with
> -                palignr, but frankly this data should be in L1 now
> -                so do the merge via an unaligned load.  */
> -             mask = _mm_loadu_si128 ((__m128i *) a);
> -           }
> -       }
> +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> +      if (maskz_bits != 0)
> +        {
> +          mask = __m128i_shift_right (mask0, offset);
> +          offset = (unsigned int) ((size_t) s & 15);
> +          if (offset)
> +            goto start_unaligned;
> +
> +          aligned = s;
> +          goto start_loop;
> +        }
>      }
> -  else
> -    {
> -      /* A is aligned.  */
> -      mask = _mm_load_si128 ((__m128i *) a);
>
> -      /* Find where the NULL terminator is.  */
> -      int length = _mm_cmpistri (mask, mask, 0x3a);
> -      if (length == 16)
> -       {
> -         /* There is no NULL terminator.  Don't use SSE4.2 if the length
> -            of A > 16.  */
> -         if (a[16] != 0)
> -           return STRCSPN_SSE2 (s, a);
> -       }
> +  /* A is aligned, or its first chunk had no NULL; load A unaligned.  */
> +  mask = _mm_loadu_si128 ((__m128i *) a);
> +  /* Find where the NULL terminator is.  */
> +  maskz = _mm_cmpeq_epi8 (mask, zero);
> +  maskz_bits = _mm_movemask_epi8 (maskz);
> +  if (maskz_bits == 0)
> +    {
> +      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> +         of A > 16.  */
> +      if (a[16] != 0)
> +        return STRCSPN_SSE2 (s, a);
>      }
>
> -  offset = (int) ((size_t) s & 15);
> +  aligned = s;
> +  offset = (unsigned int) ((size_t) s & 15);
>    if (offset != 0)
>      {
> +    start_unaligned:
>        /* Check partial string.  */
>        aligned = (const char *) ((size_t) s & -16L);
>        __m128i value = _mm_load_si128 ((__m128i *) aligned);
>
>        value = __m128i_shift_right (value, offset);
>
> -      int length = _mm_cmpistri (mask, value, 0x2);
> +      unsigned int length = _mm_cmpistri (mask, value, 0x2);
>        /* No need to check ZFlag since ZFlag is always 1.  */
> -      int cflag = _mm_cmpistrc (mask, value, 0x2);
> +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
>        if (cflag)
>         RETURN ((char *) (s + length), length);
>        /* Find where the NULL terminator is.  */
> -      int index = _mm_cmpistri (value, value, 0x3a);
> +      unsigned int index = _mm_cmpistri (value, value, 0x3a);
>        if (index < 16 - offset)
>         RETURN (NULL, index);
>        aligned += 16;
>      }
> -  else
> -    aligned = s;
>
> +start_loop:
>    while (1)
>      {
>        __m128i value = _mm_load_si128 ((__m128i *) aligned);
> -      int index = _mm_cmpistri (mask, value, 0x2);
> -      int cflag = _mm_cmpistrc (mask, value, 0x2);
> -      int zflag = _mm_cmpistrz (mask, value, 0x2);
> +      unsigned int index = _mm_cmpistri (mask, value, 0x2);
> +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> +      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
>        if (cflag)
>         RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
>        if (zflag)
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c
  2022-03-23 21:57 ` [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c Noah Goldstein
@ 2022-03-24 18:56   ` H.J. Lu
  2022-05-12 19:39     ` Sunil Pandey
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:56 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
> _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
> sign extensions.
>
> geometric_mean(N=20) of all benchmarks that don't fall back on
> sse2; New / Original: .901
>
> All string/memory tests pass.
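>
> On the offset type change, a minimal sketch of the sign-extension
> point (hypothetical helpers, not code from the patch): the match
> pointer is computed as base + offset, and on x86-64 a signed 32-bit
> offset needs a movslq sign extension before the add, while a 32-bit
> unsigned offset is usually free to extend because 32-bit operations
> already clear the upper half of the register:
>
>   static const char *
>   match_signed (const char *base, int off)
>   {
>     return base + off;   /* typically movslq + add */
>   }
>
>   static const char *
>   match_unsigned (const char *base, unsigned int off)
>   {
>     return base + off;   /* typically just an add */
>   }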
> ---
> Geometric Mean N=20 runs; All functions page aligned
> len, align1, align2,  pos, New Time / Old Time
>   1,      0,      0,  512,               0.768
>   1,      1,      0,  512,               0.666
>   1,      0,      1,  512,               1.193
>   1,      1,      1,  512,               0.872
>   2,      0,      0,  512,               0.698
>   2,      2,      0,  512,               0.687
>   2,      0,      2,  512,               1.393
>   2,      2,      2,  512,               0.944
>   3,      0,      0,  512,               0.691
>   3,      3,      0,  512,               0.676
>   3,      0,      3,  512,               1.388
>   3,      3,      3,  512,               0.948
>   4,      0,      0,  512,                0.74
>   4,      4,      0,  512,               0.678
>   4,      0,      4,  512,               1.421
>   4,      4,      4,  512,               0.943
>   5,      0,      0,  512,               0.691
>   5,      5,      0,  512,               0.675
>   5,      0,      5,  512,               1.348
>   5,      5,      5,  512,               0.952
>   6,      0,      0,  512,               0.685
>   6,      6,      0,  512,                0.67
>   6,      0,      6,  512,               1.333
>   6,      6,      6,  512,                0.95
>   7,      0,      0,  512,               0.688
>   7,      7,      0,  512,               0.675
>   7,      0,      7,  512,               1.344
>   7,      7,      7,  512,               0.919
>   8,      0,      0,  512,               0.716
>   8,      0,      8,  512,               0.935
>   9,      0,      0,  512,               0.716
>   9,      1,      0,  512,               0.712
>   9,      0,      9,  512,               0.956
>   9,      1,      9,  512,               0.992
>  10,      0,      0,  512,               0.699
>  10,      2,      0,  512,                0.68
>  10,      0,     10,  512,               0.952
>  10,      2,     10,  512,               0.932
>  11,      0,      0,  512,               0.705
>  11,      3,      0,  512,               0.685
>  11,      0,     11,  512,               0.956
>  11,      3,     11,  512,               0.927
>  12,      0,      0,  512,               0.695
>  12,      4,      0,  512,               0.675
>  12,      0,     12,  512,               0.948
>  12,      4,     12,  512,               0.928
>  13,      0,      0,  512,                 0.7
>  13,      5,      0,  512,               0.678
>  13,      0,     13,  512,               0.944
>  13,      5,     13,  512,               0.931
>  14,      0,      0,  512,               0.703
>  14,      6,      0,  512,               0.678
>  14,      0,     14,  512,               0.949
>  14,      6,     14,  512,                0.93
>  15,      0,      0,  512,               0.694
>  15,      7,      0,  512,               0.678
>  15,      0,     15,  512,               0.953
>  15,      7,     15,  512,               0.924
>  16,      0,      0,  512,               1.021
>  16,      0,     16,  512,               1.067
>  17,      0,      0,  512,               0.991
>  17,      1,      0,  512,               0.984
>  17,      0,     17,  512,               0.979
>  17,      1,     17,  512,               0.993
>  18,      0,      0,  512,               0.992
>  18,      2,      0,  512,               1.008
>  18,      0,     18,  512,               1.016
>  18,      2,     18,  512,               0.993
>  19,      0,      0,  512,               0.984
>  19,      3,      0,  512,               0.985
>  19,      0,     19,  512,               1.007
>  19,      3,     19,  512,               1.006
>  20,      0,      0,  512,               0.969
>  20,      4,      0,  512,               0.968
>  20,      0,     20,  512,               0.975
>  20,      4,     20,  512,               0.975
>  21,      0,      0,  512,               0.992
>  21,      5,      0,  512,               0.992
>  21,      0,     21,  512,                0.98
>  21,      5,     21,  512,                0.97
>  22,      0,      0,  512,               0.989
>  22,      6,      0,  512,               0.987
>  22,      0,     22,  512,                0.99
>  22,      6,     22,  512,               0.985
>  23,      0,      0,  512,               0.989
>  23,      7,      0,  512,                0.98
>  23,      0,     23,  512,                 1.0
>  23,      7,     23,  512,               0.993
>  24,      0,      0,  512,                0.99
>  24,      0,     24,  512,               0.998
>  25,      0,      0,  512,                1.01
>  25,      1,      0,  512,                 1.0
>  25,      0,     25,  512,                0.97
>  25,      1,     25,  512,               0.967
>  26,      0,      0,  512,               1.009
>  26,      2,      0,  512,               0.986
>  26,      0,     26,  512,               0.997
>  26,      2,     26,  512,               0.993
>  27,      0,      0,  512,               0.984
>  27,      3,      0,  512,               0.997
>  27,      0,     27,  512,               0.989
>  27,      3,     27,  512,               0.976
>  28,      0,      0,  512,               0.991
>  28,      4,      0,  512,               1.003
>  28,      0,     28,  512,               0.986
>  28,      4,     28,  512,               0.989
>  29,      0,      0,  512,               0.986
>  29,      5,      0,  512,               0.985
>  29,      0,     29,  512,               0.984
>  29,      5,     29,  512,               0.977
>  30,      0,      0,  512,               0.991
>  30,      6,      0,  512,               0.987
>  30,      0,     30,  512,               0.979
>  30,      6,     30,  512,               0.974
>  31,      0,      0,  512,               0.995
>  31,      7,      0,  512,               0.995
>  31,      0,     31,  512,               0.994
>  31,      7,     31,  512,               0.984
>   4,      0,      0,   32,               0.861
>   4,      1,      0,   32,               0.864
>   4,      0,      1,   32,               0.962
>   4,      1,      1,   32,               0.967
>   4,      0,      0,   64,               0.884
>   4,      2,      0,   64,               0.818
>   4,      0,      2,   64,               0.889
>   4,      2,      2,   64,               0.918
>   4,      0,      0,  128,               0.942
>   4,      3,      0,  128,               0.884
>   4,      0,      3,  128,               0.931
>   4,      3,      3,  128,               0.883
>   4,      0,      0,  256,               0.964
>   4,      4,      0,  256,               0.922
>   4,      0,      4,  256,               0.956
>   4,      4,      4,  256,                0.93
>   4,      5,      0,  512,               0.833
>   4,      0,      5,  512,               1.027
>   4,      5,      5,  512,               0.929
>   4,      0,      0, 1024,               0.998
>   4,      6,      0, 1024,               0.986
>   4,      0,      6, 1024,               0.984
>   4,      6,      6, 1024,               0.977
>   4,      0,      0, 2048,               0.991
>   4,      7,      0, 2048,               0.987
>   4,      0,      7, 2048,               0.996
>   4,      7,      7, 2048,                0.98
>  10,      1,      0,   64,               0.826
>  10,      1,      1,   64,               0.907
>  10,      2,      0,   64,               0.829
>  10,      2,      2,   64,                0.91
>  10,      3,      0,   64,                0.83
>  10,      3,      3,   64,               0.915
>  10,      4,      0,   64,                0.83
>  10,      4,      4,   64,               0.911
>  10,      5,      0,   64,               0.828
>  10,      5,      5,   64,               0.905
>  10,      6,      0,   64,               0.828
>  10,      6,      6,   64,               0.812
>  10,      7,      0,   64,                0.83
>  10,      7,      7,   64,               0.819
>   6,      0,      0,    0,               1.261
>   6,      0,      0,    1,               1.252
>   6,      0,      1,    1,               0.845
>   6,      0,      0,    2,                1.27
>   6,      0,      2,    2,                0.85
>   6,      0,      0,    3,               1.269
>   6,      0,      3,    3,               0.845
>   6,      0,      0,    4,               1.287
>   6,      0,      4,    4,               0.852
>   6,      0,      0,    5,               1.278
>   6,      0,      5,    5,               0.851
>   6,      0,      0,    6,               1.269
>   6,      0,      6,    6,               0.841
>   6,      0,      0,    7,               1.268
>   6,      0,      7,    7,               0.851
>   6,      0,      0,    8,               1.291
>   6,      0,      8,    8,               0.837
>   6,      0,      0,    9,               1.283
>   6,      0,      9,    9,               0.831
>   6,      0,      0,   10,               1.252
>   6,      0,     10,   10,               0.997
>   6,      0,      0,   11,               1.295
>   6,      0,     11,   11,               1.046
>   6,      0,      0,   12,               1.296
>   6,      0,     12,   12,               1.038
>   6,      0,      0,   13,               1.287
>   6,      0,     13,   13,               1.082
>   6,      0,      0,   14,               1.284
>   6,      0,     14,   14,               1.001
>   6,      0,      0,   15,               1.286
>   6,      0,     15,   15,               1.002
>   6,      0,      0,   16,               0.894
>   6,      0,     16,   16,               0.874
>   6,      0,      0,   17,               0.892
>   6,      0,     17,   17,               0.974
>   6,      0,      0,   18,               0.907
>   6,      0,     18,   18,               0.993
>   6,      0,      0,   19,               0.909
>   6,      0,     19,   19,                0.99
>   6,      0,      0,   20,               0.894
>   6,      0,     20,   20,               0.978
>   6,      0,      0,   21,                0.89
>   6,      0,     21,   21,               0.958
>   6,      0,      0,   22,               0.893
>   6,      0,     22,   22,                0.99
>   6,      0,      0,   23,               0.899
>   6,      0,     23,   23,               0.986
>   6,      0,      0,   24,               0.893
>   6,      0,     24,   24,               0.989
>   6,      0,      0,   25,               0.889
>   6,      0,     25,   25,               0.982
>   6,      0,      0,   26,               0.889
>   6,      0,     26,   26,               0.852
>   6,      0,      0,   27,                0.89
>   6,      0,     27,   27,               0.832
>   6,      0,      0,   28,                0.89
>   6,      0,     28,   28,               0.831
>   6,      0,      0,   29,                0.89
>   6,      0,     29,   29,               0.838
>   6,      0,      0,   30,               0.907
>   6,      0,     30,   30,               0.833
>   6,      0,      0,   31,               0.888
>   6,      0,     31,   31,               0.837
>   6,      0,      0,   32,               0.853
>   6,      0,     32,   32,               0.828
>   6,      0,      0,   33,               0.857
>   6,      0,     33,   33,               0.947
>   6,      0,      0,   34,               0.847
>   6,      0,     34,   34,               0.954
>   6,      0,      0,   35,               0.841
>   6,      0,     35,   35,                0.94
>   6,      0,      0,   36,               0.854
>   6,      0,     36,   36,               0.958
>   6,      0,      0,   37,               0.856
>   6,      0,     37,   37,               0.957
>   6,      0,      0,   38,               0.839
>   6,      0,     38,   38,               0.962
>   6,      0,      0,   39,               0.866
>   6,      0,     39,   39,               0.945
>   6,      0,      0,   40,               0.845
>   6,      0,     40,   40,               0.961
>   6,      0,      0,   41,               0.858
>   6,      0,     41,   41,               0.961
>   6,      0,      0,   42,               0.862
>   6,      0,     42,   42,               0.825
>   6,      0,      0,   43,               0.864
>   6,      0,     43,   43,                0.82
>   6,      0,      0,   44,               0.843
>   6,      0,     44,   44,                0.81
>   6,      0,      0,   45,               0.859
>   6,      0,     45,   45,               0.816
>   6,      0,      0,   46,               0.866
>   6,      0,     46,   46,                0.81
>   6,      0,      0,   47,               0.858
>   6,      0,     47,   47,               0.807
>   6,      0,      0,   48,                0.87
>   6,      0,     48,   48,                0.87
>   6,      0,      0,   49,               0.871
>   6,      0,     49,   49,               0.874
>   6,      0,      0,   50,                0.87
>   6,      0,     50,   50,               0.881
>   6,      0,      0,   51,               0.868
>   6,      0,     51,   51,               0.875
>   6,      0,      0,   52,               0.873
>   6,      0,     52,   52,               0.871
>   6,      0,      0,   53,               0.866
>   6,      0,     53,   53,               0.882
>   6,      0,      0,   54,               0.863
>   6,      0,     54,   54,               0.876
>   6,      0,      0,   55,               0.851
>   6,      0,     55,   55,               0.871
>   6,      0,      0,   56,               0.867
>   6,      0,     56,   56,               0.888
>   6,      0,      0,   57,               0.862
>   6,      0,     57,   57,               0.899
>   6,      0,      0,   58,               0.873
>   6,      0,     58,   58,               0.798
>   6,      0,      0,   59,               0.881
>   6,      0,     59,   59,               0.785
>   6,      0,      0,   60,               0.867
>   6,      0,     60,   60,               0.797
>   6,      0,      0,   61,               0.872
>   6,      0,     61,   61,               0.791
>   6,      0,      0,   62,               0.859
>   6,      0,     62,   62,                0.79
>   6,      0,      0,   63,                0.87
>   6,      0,     63,   63,               0.796
>
>  sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++----------------
>  1 file changed, 39 insertions(+), 47 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
> index 8fb3aba64d..6124033ceb 100644
> --- a/sysdeps/x86_64/multiarch/strspn-c.c
> +++ b/sysdeps/x86_64/multiarch/strspn-c.c
> @@ -62,81 +62,73 @@ __strspn_sse42 (const char *s, const char *a)
>      return 0;
>
>    const char *aligned;
> -  __m128i mask;
> -  int offset = (int) ((size_t) a & 15);
> +  __m128i mask, maskz, zero;
> +  unsigned int maskz_bits;
> +  unsigned int offset = (int) ((size_t) a & 15);
> +  zero = _mm_set1_epi8 (0);
>    if (offset != 0)
>      {
>        /* Load masks.  */
>        aligned = (const char *) ((size_t) a & -16L);
>        __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> -
> -      mask = __m128i_shift_right (mask0, offset);
> +      maskz = _mm_cmpeq_epi8 (mask0, zero);
>
>        /* Find where the NULL terminator is.  */
> -      int length = _mm_cmpistri (mask, mask, 0x3a);
> -      if (length == 16 - offset)
> -       {
> -         /* There is no NULL terminator.  */
> -         __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
> -         int index = _mm_cmpistri (mask1, mask1, 0x3a);
> -         length += index;
> -
> -         /* Don't use SSE4.2 if the length of A > 16.  */
> -         if (length > 16)
> -           return __strspn_sse2 (s, a);
> -
> -         if (index != 0)
> -           {
> -             /* Combine mask0 and mask1.  We could play games with
> -                palignr, but frankly this data should be in L1 now
> -                so do the merge via an unaligned load.  */
> -             mask = _mm_loadu_si128 ((__m128i *) a);
> -           }
> -       }
> +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> +      if (maskz_bits != 0)
> +        {
> +          mask = __m128i_shift_right (mask0, offset);
> +          offset = (unsigned int) ((size_t) s & 15);
> +          if (offset)
> +            goto start_unaligned;
> +
> +          aligned = s;
> +          goto start_loop;
> +        }
>      }
> -  else
> -    {
> -      /* A is aligned.  */
> -      mask = _mm_load_si128 ((__m128i *) a);
>
> -      /* Find where the NULL terminator is.  */
> -      int length = _mm_cmpistri (mask, mask, 0x3a);
> -      if (length == 16)
> -       {
> -         /* There is no NULL terminator.  Don't use SSE4.2 if the length
> -            of A > 16.  */
> -         if (a[16] != 0)
> -           return __strspn_sse2 (s, a);
> -       }
> +  /* A is aligned.  */
> +  mask = _mm_loadu_si128 ((__m128i *) a);
> +
> +  /* Find where the NULL terminator is.  */
> +  maskz = _mm_cmpeq_epi8 (mask, zero);
> +  maskz_bits = _mm_movemask_epi8 (maskz);
> +  if (maskz_bits == 0)
> +    {
> +      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> +         of A > 16.  */
> +      if (a[16] != 0)
> +        return __strspn_sse2 (s, a);
>      }
> +  aligned = s;
> +  offset = (unsigned int) ((size_t) s & 15);
>
> -  offset = (int) ((size_t) s & 15);
>    if (offset != 0)
>      {
> +    start_unaligned:
>        /* Check partial string.  */
>        aligned = (const char *) ((size_t) s & -16L);
>        __m128i value = _mm_load_si128 ((__m128i *) aligned);
> +      __m128i adj_value = __m128i_shift_right (value, offset);
>
> -      value = __m128i_shift_right (value, offset);
> -
> -      int length = _mm_cmpistri (mask, value, 0x12);
> +      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
>        /* No need to check CFlag since it is always 1.  */
>        if (length < 16 - offset)
>         return length;
>        /* Find where the NULL terminator is.  */
> -      int index = _mm_cmpistri (value, value, 0x3a);
> -      if (index < 16 - offset)
> +      maskz = _mm_cmpeq_epi8 (value, zero);
> +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> +      if (maskz_bits != 0)
>         return length;
>        aligned += 16;
>      }
> -  else
> -    aligned = s;
>
> +start_loop:
>    while (1)
>      {
>        __m128i value = _mm_load_si128 ((__m128i *) aligned);
> -      int index = _mm_cmpistri (mask, value, 0x12);
> -      int cflag = _mm_cmpistrc (mask, value, 0x12);
> +      unsigned int index = _mm_cmpistri (mask, value, 0x12);
> +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
>        if (cflag)
>         return (size_t) (aligned + index - s);
>        aligned += 16;
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread
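
A note on the quoted strspn-c.c patch: its key change is how the NULL
terminator is located.  Instead of PCMPISTRI (_mm_cmpistri with mode 0x3a),
the new code compares the 16-byte block against zero and extracts a bitmask
with PMOVMSKB, which can then be shifted right by the misalignment offset
(as done with maskz_bits above).  A minimal standalone sketch of that
primitive, with an invented helper name, assuming only SSE2:

  #include <emmintrin.h>

  /* Bit i of the returned mask is set iff p[i] == 0 within the 16-byte
     block at P (P must be 16-byte aligned).  The patched __strspn_sse42
     computes the same value into maskz_bits and shifts it right by the
     alignment offset to ignore bytes before the true start of the
     string.  */
  unsigned int
  nul_mask16 (const char *p)
  {
    const __m128i zero = _mm_set1_epi8 (0);
    const __m128i value = _mm_load_si128 ((const __m128i *) p);
    return (unsigned int) _mm_movemask_epi8 (_mm_cmpeq_epi8 (value, zero));
  }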

* Re: [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation
  2022-03-23 21:57 ` [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation Noah Goldstein
@ 2022-03-24 18:57   ` H.J. Lu
  2022-05-12 19:40     ` Sunil Pandey
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:57 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:00 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The generic implementation is faster.
>
> geometric_mean(N=20) of all benchmarks New / Original: .678
>
> All string/memory tests pass.
> ---
> Geometric Mean N=20 runs; All functions page aligned
> len, align1, align2,  pos, New Time / Old Time
>   0,      0,      0,  512,               0.054
>   1,      0,      0,  512,               0.055
>   1,      1,      0,  512,               0.051
>   1,      0,      1,  512,               0.054
>   1,      1,      1,  512,               0.054
>   2,      0,      0,  512,               0.861
>   2,      2,      0,  512,               0.861
>   2,      0,      2,  512,               0.861
>   2,      2,      2,  512,               0.864
>   3,      0,      0,  512,               0.854
>   3,      3,      0,  512,               0.848
>   3,      0,      3,  512,               0.845
>   3,      3,      3,  512,                0.85
>   4,      0,      0,  512,               0.851
>   4,      4,      0,  512,                0.85
>   4,      0,      4,  512,               0.852
>   4,      4,      4,  512,               0.849
>   5,      0,      0,  512,               0.938
>   5,      5,      0,  512,                0.94
>   5,      0,      5,  512,               0.864
>   5,      5,      5,  512,                0.86
>   6,      0,      0,  512,               0.858
>   6,      6,      0,  512,               0.869
>   6,      0,      6,  512,               0.847
>   6,      6,      6,  512,               0.868
>   7,      0,      0,  512,               0.867
>   7,      7,      0,  512,               0.861
>   7,      0,      7,  512,               0.864
>   7,      7,      7,  512,               0.863
>   8,      0,      0,  512,               0.884
>   8,      0,      8,  512,               0.884
>   9,      0,      0,  512,               0.886
>   9,      1,      0,  512,               0.894
>   9,      0,      9,  512,               0.889
>   9,      1,      9,  512,               0.886
>  10,      0,      0,  512,               0.859
>  10,      2,      0,  512,               0.859
>  10,      0,     10,  512,               0.862
>  10,      2,     10,  512,               0.861
>  11,      0,      0,  512,               0.846
>  11,      3,      0,  512,               0.865
>  11,      0,     11,  512,               0.859
>  11,      3,     11,  512,               0.862
>  12,      0,      0,  512,               0.858
>  12,      4,      0,  512,               0.857
>  12,      0,     12,  512,               0.964
>  12,      4,     12,  512,               0.876
>  13,      0,      0,  512,               0.827
>  13,      5,      0,  512,               0.805
>  13,      0,     13,  512,               0.821
>  13,      5,     13,  512,               0.825
>  14,      0,      0,  512,               0.786
>  14,      6,      0,  512,               0.786
>  14,      0,     14,  512,               0.803
>  14,      6,     14,  512,               0.783
>  15,      0,      0,  512,               0.778
>  15,      7,      0,  512,               0.792
>  15,      0,     15,  512,               0.796
>  15,      7,     15,  512,               0.799
>  16,      0,      0,  512,               0.803
>  16,      0,     16,  512,               0.815
>  17,      0,      0,  512,               0.812
>  17,      1,      0,  512,               0.826
>  17,      0,     17,  512,               0.803
>  17,      1,     17,  512,               0.856
>  18,      0,      0,  512,               0.801
>  18,      2,      0,  512,               0.886
>  18,      0,     18,  512,               0.805
>  18,      2,     18,  512,               0.807
>  19,      0,      0,  512,               0.814
>  19,      3,      0,  512,               0.804
>  19,      0,     19,  512,               0.813
>  19,      3,     19,  512,               0.814
>  20,      0,      0,  512,               0.885
>  20,      4,      0,  512,               0.799
>  20,      0,     20,  512,               0.826
>  20,      4,     20,  512,               0.808
>  21,      0,      0,  512,               0.816
>  21,      5,      0,  512,               0.824
>  21,      0,     21,  512,               0.819
>  21,      5,     21,  512,               0.826
>  22,      0,      0,  512,               0.814
>  22,      6,      0,  512,               0.824
>  22,      0,     22,  512,                0.81
>  22,      6,     22,  512,               0.806
>  23,      0,      0,  512,               0.825
>  23,      7,      0,  512,               0.829
>  23,      0,     23,  512,               0.809
>  23,      7,     23,  512,               0.823
>  24,      0,      0,  512,               0.829
>  24,      0,     24,  512,               0.823
>  25,      0,      0,  512,               0.864
>  25,      1,      0,  512,               0.895
>  25,      0,     25,  512,                0.88
>  25,      1,     25,  512,               0.848
>  26,      0,      0,  512,               0.903
>  26,      2,      0,  512,               0.888
>  26,      0,     26,  512,               0.894
>  26,      2,     26,  512,                0.89
>  27,      0,      0,  512,               0.914
>  27,      3,      0,  512,               0.917
>  27,      0,     27,  512,               0.902
>  27,      3,     27,  512,               0.887
>  28,      0,      0,  512,               0.887
>  28,      4,      0,  512,               0.877
>  28,      0,     28,  512,               0.893
>  28,      4,     28,  512,               0.866
>  29,      0,      0,  512,               0.885
>  29,      5,      0,  512,               0.907
>  29,      0,     29,  512,               0.894
>  29,      5,     29,  512,               0.906
>  30,      0,      0,  512,                0.88
>  30,      6,      0,  512,               0.898
>  30,      0,     30,  512,                 0.9
>  30,      6,     30,  512,               0.895
>  31,      0,      0,  512,               0.893
>  31,      7,      0,  512,               0.874
>  31,      0,     31,  512,               0.894
>  31,      7,     31,  512,               0.899
>   4,      0,      0,   32,               0.618
>   4,      1,      0,   32,               0.627
>   4,      0,      1,   32,               0.625
>   4,      1,      1,   32,               0.613
>   4,      0,      0,   64,               0.913
>   4,      2,      0,   64,               0.801
>   4,      0,      2,   64,               0.759
>   4,      2,      2,   64,               0.761
>   4,      0,      0,  128,               0.822
>   4,      3,      0,  128,               0.863
>   4,      0,      3,  128,               0.867
>   4,      3,      3,  128,               0.917
>   4,      0,      0,  256,               0.816
>   4,      4,      0,  256,               0.812
>   4,      0,      4,  256,               0.803
>   4,      4,      4,  256,               0.811
>   4,      5,      0,  512,               0.848
>   4,      0,      5,  512,               0.843
>   4,      5,      5,  512,               0.857
>   4,      0,      0, 1024,               0.886
>   4,      6,      0, 1024,               0.887
>   4,      0,      6, 1024,               0.881
>   4,      6,      6, 1024,               0.873
>   4,      0,      0, 2048,               0.892
>   4,      7,      0, 2048,               0.894
>   4,      0,      7, 2048,                0.89
>   4,      7,      7, 2048,               0.874
>  10,      1,      0,   64,               0.946
>  10,      1,      1,   64,                0.81
>  10,      2,      0,   64,               0.804
>  10,      2,      2,   64,                0.82
>  10,      3,      0,   64,               0.772
>  10,      3,      3,   64,               0.772
>  10,      4,      0,   64,               0.748
>  10,      4,      4,   64,               0.751
>  10,      5,      0,   64,                0.76
>  10,      5,      5,   64,                0.76
>  10,      6,      0,   64,               0.726
>  10,      6,      6,   64,               0.718
>  10,      7,      0,   64,               0.724
>  10,      7,      7,   64,                0.72
>   6,      0,      0,    0,               0.415
>   6,      0,      0,    1,               0.423
>   6,      0,      1,    1,               0.412
>   6,      0,      0,    2,               0.433
>   6,      0,      2,    2,               0.434
>   6,      0,      0,    3,               0.427
>   6,      0,      3,    3,               0.428
>   6,      0,      0,    4,               0.465
>   6,      0,      4,    4,               0.466
>   6,      0,      0,    5,               0.463
>   6,      0,      5,    5,               0.468
>   6,      0,      0,    6,               0.435
>   6,      0,      6,    6,               0.444
>   6,      0,      0,    7,                0.41
>   6,      0,      7,    7,                0.42
>   6,      0,      0,    8,               0.474
>   6,      0,      8,    8,               0.501
>   6,      0,      0,    9,               0.471
>   6,      0,      9,    9,               0.489
>   6,      0,      0,   10,               0.462
>   6,      0,     10,   10,                0.46
>   6,      0,      0,   11,               0.459
>   6,      0,     11,   11,               0.458
>   6,      0,      0,   12,               0.516
>   6,      0,     12,   12,                0.51
>   6,      0,      0,   13,               0.494
>   6,      0,     13,   13,               0.524
>   6,      0,      0,   14,               0.486
>   6,      0,     14,   14,                 0.5
>   6,      0,      0,   15,                0.48
>   6,      0,     15,   15,               0.501
>   6,      0,      0,   16,                0.54
>   6,      0,     16,   16,               0.538
>   6,      0,      0,   17,               0.503
>   6,      0,     17,   17,               0.541
>   6,      0,      0,   18,               0.537
>   6,      0,     18,   18,               0.549
>   6,      0,      0,   19,               0.527
>   6,      0,     19,   19,               0.537
>   6,      0,      0,   20,               0.539
>   6,      0,     20,   20,               0.554
>   6,      0,      0,   21,               0.558
>   6,      0,     21,   21,               0.541
>   6,      0,      0,   22,               0.546
>   6,      0,     22,   22,               0.561
>   6,      0,      0,   23,                0.54
>   6,      0,     23,   23,               0.536
>   6,      0,      0,   24,               0.565
>   6,      0,     24,   24,               0.584
>   6,      0,      0,   25,               0.563
>   6,      0,     25,   25,                0.58
>   6,      0,      0,   26,               0.555
>   6,      0,     26,   26,               0.584
>   6,      0,      0,   27,               0.569
>   6,      0,     27,   27,               0.587
>   6,      0,      0,   28,               0.612
>   6,      0,     28,   28,               0.623
>   6,      0,      0,   29,               0.604
>   6,      0,     29,   29,               0.621
>   6,      0,      0,   30,                0.59
>   6,      0,     30,   30,               0.609
>   6,      0,      0,   31,               0.577
>   6,      0,     31,   31,               0.588
>   6,      0,      0,   32,               0.621
>   6,      0,     32,   32,               0.608
>   6,      0,      0,   33,               0.601
>   6,      0,     33,   33,               0.623
>   6,      0,      0,   34,               0.614
>   6,      0,     34,   34,               0.615
>   6,      0,      0,   35,               0.598
>   6,      0,     35,   35,               0.608
>   6,      0,      0,   36,               0.626
>   6,      0,     36,   36,               0.634
>   6,      0,      0,   37,                0.62
>   6,      0,     37,   37,               0.634
>   6,      0,      0,   38,               0.612
>   6,      0,     38,   38,               0.637
>   6,      0,      0,   39,               0.627
>   6,      0,     39,   39,               0.612
>   6,      0,      0,   40,               0.661
>   6,      0,     40,   40,               0.674
>   6,      0,      0,   41,               0.633
>   6,      0,     41,   41,               0.643
>   6,      0,      0,   42,               0.634
>   6,      0,     42,   42,               0.636
>   6,      0,      0,   43,               0.619
>   6,      0,     43,   43,               0.625
>   6,      0,      0,   44,               0.654
>   6,      0,     44,   44,               0.654
>   6,      0,      0,   45,               0.647
>   6,      0,     45,   45,               0.649
>   6,      0,      0,   46,               0.651
>   6,      0,     46,   46,               0.651
>   6,      0,      0,   47,               0.646
>   6,      0,     47,   47,               0.648
>   6,      0,      0,   48,               0.662
>   6,      0,     48,   48,               0.664
>   6,      0,      0,   49,                0.68
>   6,      0,     49,   49,               0.667
>   6,      0,      0,   50,               0.654
>   6,      0,     50,   50,               0.659
>   6,      0,      0,   51,               0.638
>   6,      0,     51,   51,               0.639
>   6,      0,      0,   52,               0.665
>   6,      0,     52,   52,               0.669
>   6,      0,      0,   53,               0.658
>   6,      0,     53,   53,               0.656
>   6,      0,      0,   54,               0.669
>   6,      0,     54,   54,                0.67
>   6,      0,      0,   55,               0.668
>   6,      0,     55,   55,               0.664
>   6,      0,      0,   56,               0.701
>   6,      0,     56,   56,               0.695
>   6,      0,      0,   57,               0.687
>   6,      0,     57,   57,               0.696
>   6,      0,      0,   58,               0.693
>   6,      0,     58,   58,               0.704
>   6,      0,      0,   59,               0.695
>   6,      0,     59,   59,               0.708
>   6,      0,      0,   60,               0.708
>   6,      0,     60,   60,               0.728
>   6,      0,      0,   61,               0.708
>   6,      0,     61,   61,                0.71
>   6,      0,      0,   62,               0.715
>   6,      0,     62,   62,               0.705
>   6,      0,      0,   63,               0.677
>   6,      0,     63,   63,               0.702
>
>  .../{strcspn-sse2.S => strcspn-sse2.c}        |   8 +-
>  sysdeps/x86_64/strcspn.S                      | 119 ------------------
>  2 files changed, 4 insertions(+), 123 deletions(-)
>  rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strcspn-sse2.c} (85%)
>  delete mode 100644 sysdeps/x86_64/strcspn.S
>
> diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c
> similarity index 85%
> rename from sysdeps/x86_64/multiarch/strcspn-sse2.S
> rename to sysdeps/x86_64/multiarch/strcspn-sse2.c
> index f97e856e1f..3a04bb39fc 100644
> --- a/sysdeps/x86_64/multiarch/strcspn-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c
> @@ -1,4 +1,4 @@
> -/* strcspn optimized with SSE2.
> +/* strcspn.
>     Copyright (C) 2017-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
> @@ -19,10 +19,10 @@
>  #if IS_IN (libc)
>
>  # include <sysdep.h>
> -# define strcspn __strcspn_sse2
> +# define STRCSPN __strcspn_sse2
>
>  # undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strcspn)
> +# define libc_hidden_builtin_def(STRCSPN)
>  #endif
>
> -#include <sysdeps/x86_64/strcspn.S>
> +#include <string/strcspn.c>
> diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S
> deleted file mode 100644
> index f3cd86c606..0000000000
> --- a/sysdeps/x86_64/strcspn.S
> +++ /dev/null
> @@ -1,119 +0,0 @@
> -/* strcspn (str, ss) -- Return the length of the initial segment of STR
> -                       which contains no characters from SS.
> -   For AMD x86-64.
> -   Copyright (C) 1994-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include <sysdep.h>
> -#include "asm-syntax.h"
> -
> -       .text
> -ENTRY (strcspn)
> -
> -       movq %rdi, %rdx         /* Save SRC.  */
> -
> -       /* First we create a table with flags for all possible characters.
> -          For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
> -          supported by the C string functions we have 256 characters.
> -          Before inserting marks for the stop characters we clear the whole
> -          table.  */
> -       movq %rdi, %r8                  /* Save value.  */
> -       subq $256, %rsp                 /* Make space for 256 bytes.  */
> -       cfi_adjust_cfa_offset(256)
> -       movl $32,  %ecx                 /* 32*8 bytes = 256 bytes.  */
> -       movq %rsp, %rdi
> -       xorl %eax, %eax                 /* We store 0s.  */
> -       cld
> -       rep
> -       stosq
> -
> -       movq %rsi, %rax                 /* Setup skipset.  */
> -
> -/* For understanding the following code remember that %rcx == 0 now.
> -   Although all the following instruction only modify %cl we always
> -   have a correct zero-extended 64-bit value in %rcx.  */
> -
> -       .p2align 4
> -L(2):  movb (%rax), %cl        /* get byte from skipset */
> -       testb %cl, %cl          /* is NUL char? */
> -       jz L(1)                 /* yes => start compare loop */
> -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in skipset table */
> -
> -       movb 1(%rax), %cl       /* get byte from skipset */
> -       testb $0xff, %cl        /* is NUL char? */
> -       jz L(1)                 /* yes => start compare loop */
> -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in skipset table */
> -
> -       movb 2(%rax), %cl       /* get byte from skipset */
> -       testb $0xff, %cl        /* is NUL char? */
> -       jz L(1)                 /* yes => start compare loop */
> -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in skipset table */
> -
> -       movb 3(%rax), %cl       /* get byte from skipset */
> -       addq $4, %rax           /* increment skipset pointer */
> -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in skipset table */
> -       testb $0xff, %cl        /* is NUL char? */
> -       jnz L(2)                /* no => process next dword from skipset */
> -
> -L(1):  leaq -4(%rdx), %rax     /* prepare loop */
> -
> -       /* We use a neat trick for the following loop.  Normally we would
> -          have to test for two termination conditions
> -          1. a character in the skipset was found
> -          and
> -          2. the end of the string was found
> -          But as a sign that the character is in the skipset we store its
> -          value in the table.  But the value of NUL is NUL so the loop
> -          terminates for NUL in every case.  */
> -
> -       .p2align 4
> -L(3):  addq $4, %rax           /* adjust pointer for full loop round */
> -
> -       movb (%rax), %cl        /* get byte from string */
> -       cmpb %cl, (%rsp,%rcx)   /* is it contained in skipset? */
> -       je L(4)                 /* yes => return */
> -
> -       movb 1(%rax), %cl       /* get byte from string */
> -       cmpb %cl, (%rsp,%rcx)   /* is it contained in skipset? */
> -       je L(5)                 /* yes => return */
> -
> -       movb 2(%rax), %cl       /* get byte from string */
> -       cmpb %cl, (%rsp,%rcx)   /* is it contained in skipset? */
> -       jz L(6)                 /* yes => return */
> -
> -       movb 3(%rax), %cl       /* get byte from string */
> -       cmpb %cl, (%rsp,%rcx)   /* is it contained in skipset? */
> -       jne L(3)                /* no => start loop again */
> -
> -       incq %rax               /* adjust pointer */
> -L(6):  incq %rax
> -L(5):  incq %rax
> -
> -L(4):  addq $256, %rsp         /* remove skipset */
> -       cfi_adjust_cfa_offset(-256)
> -#ifdef USE_AS_STRPBRK
> -       xorl %edx,%edx
> -       orb %cl, %cl            /* was last character NUL? */
> -       cmovzq %rdx, %rax       /* Yes: return NULL */
> -#else
> -       subq %rdx, %rax         /* we have to return the number of valid
> -                                  characters, so compute distance to first
> -                                  non-valid character */
> -#endif
> -       ret
> -END (strcspn)
> -libc_hidden_builtin_def (strcspn)
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread
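
The deleted strcspn.S above documents the trick that the table-based
approach relies on: build a 256-entry table marking every byte of the
skipset, and because NUL terminates the skipset as well, a single table
lookup per string byte covers both termination conditions at once.  A
simplified C sketch of that approach (illustrative only; glibc's actual
generic string/strcspn.c is further unrolled and optimized):

  #include <stddef.h>

  /* Length of the initial segment of S containing no byte from REJECT.  */
  size_t
  table_strcspn (const char *s, const char *reject)
  {
    unsigned char table[256] = { 0 };
    table[0] = 1;                     /* NUL always ends the scan.  */
    for (const unsigned char *r = (const unsigned char *) reject; *r != 0; ++r)
      table[*r] = 1;

    const unsigned char *p = (const unsigned char *) s;
    while (table[*p] == 0)
      ++p;
    return (size_t) (p - (const unsigned char *) s);
  }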

* Re: [PATCH v1 10/23] x86: Remove strpbrk-sse2.S and use the generic implementation
  2022-03-23 21:57 ` [PATCH v1 10/23] x86: Remove strpbrk-sse2.S " Noah Goldstein
@ 2022-03-24 18:57   ` H.J. Lu
  2022-05-12 19:41     ` Sunil Pandey
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:57 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:00 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The generic implementation is faster (see strcspn commit).
>
> All string/memory tests pass.
> ---
>  .../x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c}  | 9 ++++-----
>  sysdeps/x86_64/strpbrk.S                                 | 3 ---
>  2 files changed, 4 insertions(+), 8 deletions(-)
>  rename sysdeps/x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} (84%)
>  delete mode 100644 sysdeps/x86_64/strpbrk.S
>
> diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
> similarity index 84%
> rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S
> rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c
> index d537b6c27b..d03214c4fb 100644
> --- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
> @@ -1,4 +1,4 @@
> -/* strpbrk optimized with SSE2.
> +/* strpbrk.
>     Copyright (C) 2017-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
> @@ -19,11 +19,10 @@
>  #if IS_IN (libc)
>
>  # include <sysdep.h>
> -# define strcspn __strpbrk_sse2
> +# define STRPBRK __strpbrk_sse2
>
>  # undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strpbrk)
> +# define libc_hidden_builtin_def(STRPBRK)
>  #endif
>
> -#define USE_AS_STRPBRK
> -#include <sysdeps/x86_64/strcspn.S>
> +#include <string/strpbrk.c>
> diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S
> deleted file mode 100644
> index 21888a5b92..0000000000
> --- a/sysdeps/x86_64/strpbrk.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define strcspn strpbrk
> -#define USE_AS_STRPBRK
> -#include <sysdeps/x86_64/strcspn.S>
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread
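
As the deleted three-line strpbrk.S shows, strpbrk is strcspn with a
different return value: a pointer to the first byte from ACCEPT rather than
the length of the rejected prefix.  A sketch of the generic formulation the
patch switches to (not a verbatim copy of string/strpbrk.c):

  #include <string.h>

  /* strpbrk in terms of strcspn: skip the rejected prefix, then either
     return the match or NULL at the terminator.  */
  char *
  table_strpbrk (const char *s, const char *accept)
  {
    s += strcspn (s, accept);
    return *s != '\0' ? (char *) s : NULL;
  }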

* Re: [PATCH v1 11/23] x86: Remove strspn-sse2.S and use the generic implementation
  2022-03-23 21:57 ` [PATCH v1 11/23] x86: Remove strspn-sse2.S " Noah Goldstein
@ 2022-03-24 18:57   ` H.J. Lu
  2022-05-12 19:42     ` Sunil Pandey
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:57 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The generic implementation is faster.
>
> geometric_mean(N=20) of all benchmarks New / Original: .710
>
> All string/memory tests pass.
> ---
> Geometric Mean N=20 runs; All functions page aligned
> len, align1, align2,  pos, New Time / Old Time
>   1,      0,      0,  512,               0.824
>   1,      1,      0,  512,               1.018
>   1,      0,      1,  512,               0.986
>   1,      1,      1,  512,               1.092
>   2,      0,      0,  512,                0.86
>   2,      2,      0,  512,               0.868
>   2,      0,      2,  512,               0.858
>   2,      2,      2,  512,               0.857
>   3,      0,      0,  512,               0.836
>   3,      3,      0,  512,               0.849
>   3,      0,      3,  512,                0.84
>   3,      3,      3,  512,                0.85
>   4,      0,      0,  512,               0.843
>   4,      4,      0,  512,               0.837
>   4,      0,      4,  512,               0.835
>   4,      4,      4,  512,               0.846
>   5,      0,      0,  512,               0.852
>   5,      5,      0,  512,               0.848
>   5,      0,      5,  512,                0.85
>   5,      5,      5,  512,                0.85
>   6,      0,      0,  512,               0.853
>   6,      6,      0,  512,               0.855
>   6,      0,      6,  512,               0.853
>   6,      6,      6,  512,               0.853
>   7,      0,      0,  512,               0.857
>   7,      7,      0,  512,               0.861
>   7,      0,      7,  512,                0.94
>   7,      7,      7,  512,               0.856
>   8,      0,      0,  512,               0.927
>   8,      0,      8,  512,               0.965
>   9,      0,      0,  512,               0.967
>   9,      1,      0,  512,               0.976
>   9,      0,      9,  512,               0.887
>   9,      1,      9,  512,               0.881
>  10,      0,      0,  512,               0.853
>  10,      2,      0,  512,               0.846
>  10,      0,     10,  512,               0.855
>  10,      2,     10,  512,               0.849
>  11,      0,      0,  512,               0.854
>  11,      3,      0,  512,               0.855
>  11,      0,     11,  512,                0.85
>  11,      3,     11,  512,               0.854
>  12,      0,      0,  512,               0.864
>  12,      4,      0,  512,               0.864
>  12,      0,     12,  512,               0.867
>  12,      4,     12,  512,                0.87
>  13,      0,      0,  512,               0.853
>  13,      5,      0,  512,               0.841
>  13,      0,     13,  512,               0.837
>  13,      5,     13,  512,                0.85
>  14,      0,      0,  512,               0.838
>  14,      6,      0,  512,               0.842
>  14,      0,     14,  512,               0.818
>  14,      6,     14,  512,               0.845
>  15,      0,      0,  512,               0.799
>  15,      7,      0,  512,               0.847
>  15,      0,     15,  512,               0.787
>  15,      7,     15,  512,                0.84
>  16,      0,      0,  512,               0.824
>  16,      0,     16,  512,               0.827
>  17,      0,      0,  512,               0.817
>  17,      1,      0,  512,               0.823
>  17,      0,     17,  512,                0.82
>  17,      1,     17,  512,               0.814
>  18,      0,      0,  512,                0.81
>  18,      2,      0,  512,               0.833
>  18,      0,     18,  512,               0.811
>  18,      2,     18,  512,               0.842
>  19,      0,      0,  512,               0.823
>  19,      3,      0,  512,               0.818
>  19,      0,     19,  512,               0.821
>  19,      3,     19,  512,               0.824
>  20,      0,      0,  512,               0.814
>  20,      4,      0,  512,               0.818
>  20,      0,     20,  512,               0.806
>  20,      4,     20,  512,               0.802
>  21,      0,      0,  512,               0.835
>  21,      5,      0,  512,               0.839
>  21,      0,     21,  512,               0.842
>  21,      5,     21,  512,                0.82
>  22,      0,      0,  512,               0.824
>  22,      6,      0,  512,               0.831
>  22,      0,     22,  512,               0.819
>  22,      6,     22,  512,               0.824
>  23,      0,      0,  512,               0.816
>  23,      7,      0,  512,               0.856
>  23,      0,     23,  512,               0.808
>  23,      7,     23,  512,               0.848
>  24,      0,      0,  512,                0.88
>  24,      0,     24,  512,               0.846
>  25,      0,      0,  512,               0.929
>  25,      1,      0,  512,               0.917
>  25,      0,     25,  512,               0.884
>  25,      1,     25,  512,               0.859
>  26,      0,      0,  512,               0.919
>  26,      2,      0,  512,               0.867
>  26,      0,     26,  512,               0.914
>  26,      2,     26,  512,               0.845
>  27,      0,      0,  512,               0.919
>  27,      3,      0,  512,               0.864
>  27,      0,     27,  512,               0.917
>  27,      3,     27,  512,               0.847
>  28,      0,      0,  512,               0.905
>  28,      4,      0,  512,               0.896
>  28,      0,     28,  512,               0.898
>  28,      4,     28,  512,               0.871
>  29,      0,      0,  512,               0.911
>  29,      5,      0,  512,                0.91
>  29,      0,     29,  512,               0.905
>  29,      5,     29,  512,               0.884
>  30,      0,      0,  512,               0.907
>  30,      6,      0,  512,               0.802
>  30,      0,     30,  512,               0.906
>  30,      6,     30,  512,               0.818
>  31,      0,      0,  512,               0.907
>  31,      7,      0,  512,               0.821
>  31,      0,     31,  512,                0.89
>  31,      7,     31,  512,               0.787
>   4,      0,      0,   32,               0.623
>   4,      1,      0,   32,               0.606
>   4,      0,      1,   32,                 0.6
>   4,      1,      1,   32,               0.603
>   4,      0,      0,   64,               0.731
>   4,      2,      0,   64,               0.733
>   4,      0,      2,   64,               0.734
>   4,      2,      2,   64,               0.755
>   4,      0,      0,  128,               0.822
>   4,      3,      0,  128,               0.873
>   4,      0,      3,  128,                0.89
>   4,      3,      3,  128,               0.907
>   4,      0,      0,  256,               0.827
>   4,      4,      0,  256,               0.811
>   4,      0,      4,  256,               0.794
>   4,      4,      4,  256,               0.814
>   4,      5,      0,  512,               0.841
>   4,      0,      5,  512,               0.831
>   4,      5,      5,  512,               0.845
>   4,      0,      0, 1024,               0.861
>   4,      6,      0, 1024,               0.857
>   4,      0,      6, 1024,                 0.9
>   4,      6,      6, 1024,               0.861
>   4,      0,      0, 2048,               0.879
>   4,      7,      0, 2048,               0.875
>   4,      0,      7, 2048,               0.883
>   4,      7,      7, 2048,                0.88
>  10,      1,      0,   64,               0.747
>  10,      1,      1,   64,               0.743
>  10,      2,      0,   64,               0.732
>  10,      2,      2,   64,               0.729
>  10,      3,      0,   64,               0.747
>  10,      3,      3,   64,               0.733
>  10,      4,      0,   64,                0.74
>  10,      4,      4,   64,               0.751
>  10,      5,      0,   64,               0.735
>  10,      5,      5,   64,               0.746
>  10,      6,      0,   64,               0.735
>  10,      6,      6,   64,               0.733
>  10,      7,      0,   64,               0.734
>  10,      7,      7,   64,                0.74
>   6,      0,      0,    0,               0.377
>   6,      0,      0,    1,               0.369
>   6,      0,      1,    1,               0.383
>   6,      0,      0,    2,               0.391
>   6,      0,      2,    2,               0.394
>   6,      0,      0,    3,               0.416
>   6,      0,      3,    3,               0.411
>   6,      0,      0,    4,               0.475
>   6,      0,      4,    4,               0.483
>   6,      0,      0,    5,               0.473
>   6,      0,      5,    5,               0.476
>   6,      0,      0,    6,               0.459
>   6,      0,      6,    6,               0.445
>   6,      0,      0,    7,               0.433
>   6,      0,      7,    7,               0.432
>   6,      0,      0,    8,               0.492
>   6,      0,      8,    8,               0.494
>   6,      0,      0,    9,               0.476
>   6,      0,      9,    9,               0.483
>   6,      0,      0,   10,                0.46
>   6,      0,     10,   10,               0.476
>   6,      0,      0,   11,               0.463
>   6,      0,     11,   11,               0.463
>   6,      0,      0,   12,               0.511
>   6,      0,     12,   12,               0.515
>   6,      0,      0,   13,               0.506
>   6,      0,     13,   13,               0.536
>   6,      0,      0,   14,               0.496
>   6,      0,     14,   14,               0.484
>   6,      0,      0,   15,               0.473
>   6,      0,     15,   15,               0.475
>   6,      0,      0,   16,               0.534
>   6,      0,     16,   16,               0.534
>   6,      0,      0,   17,               0.525
>   6,      0,     17,   17,               0.523
>   6,      0,      0,   18,               0.522
>   6,      0,     18,   18,               0.524
>   6,      0,      0,   19,               0.512
>   6,      0,     19,   19,               0.514
>   6,      0,      0,   20,               0.535
>   6,      0,     20,   20,                0.54
>   6,      0,      0,   21,               0.543
>   6,      0,     21,   21,               0.536
>   6,      0,      0,   22,               0.542
>   6,      0,     22,   22,               0.542
>   6,      0,      0,   23,               0.529
>   6,      0,     23,   23,                0.53
>   6,      0,      0,   24,               0.596
>   6,      0,     24,   24,               0.589
>   6,      0,      0,   25,               0.583
>   6,      0,     25,   25,                0.58
>   6,      0,      0,   26,               0.574
>   6,      0,     26,   26,                0.58
>   6,      0,      0,   27,               0.575
>   6,      0,     27,   27,               0.558
>   6,      0,      0,   28,               0.606
>   6,      0,     28,   28,               0.606
>   6,      0,      0,   29,               0.589
>   6,      0,     29,   29,               0.595
>   6,      0,      0,   30,               0.592
>   6,      0,     30,   30,               0.585
>   6,      0,      0,   31,               0.585
>   6,      0,     31,   31,               0.579
>   6,      0,      0,   32,               0.625
>   6,      0,     32,   32,               0.615
>   6,      0,      0,   33,               0.615
>   6,      0,     33,   33,                0.61
>   6,      0,      0,   34,               0.604
>   6,      0,     34,   34,                 0.6
>   6,      0,      0,   35,               0.602
>   6,      0,     35,   35,               0.608
>   6,      0,      0,   36,               0.644
>   6,      0,     36,   36,               0.644
>   6,      0,      0,   37,               0.658
>   6,      0,     37,   37,               0.651
>   6,      0,      0,   38,               0.644
>   6,      0,     38,   38,               0.649
>   6,      0,      0,   39,               0.626
>   6,      0,     39,   39,               0.632
>   6,      0,      0,   40,               0.662
>   6,      0,     40,   40,               0.661
>   6,      0,      0,   41,               0.656
>   6,      0,     41,   41,               0.655
>   6,      0,      0,   42,               0.643
>   6,      0,     42,   42,               0.637
>   6,      0,      0,   43,               0.622
>   6,      0,     43,   43,               0.628
>   6,      0,      0,   44,               0.673
>   6,      0,     44,   44,               0.687
>   6,      0,      0,   45,               0.661
>   6,      0,     45,   45,               0.659
>   6,      0,      0,   46,               0.657
>   6,      0,     46,   46,               0.653
>   6,      0,      0,   47,               0.658
>   6,      0,     47,   47,                0.65
>   6,      0,      0,   48,               0.678
>   6,      0,     48,   48,               0.683
>   6,      0,      0,   49,               0.676
>   6,      0,     49,   49,               0.661
>   6,      0,      0,   50,               0.672
>   6,      0,     50,   50,               0.662
>   6,      0,      0,   51,               0.656
>   6,      0,     51,   51,               0.659
>   6,      0,      0,   52,               0.682
>   6,      0,     52,   52,               0.686
>   6,      0,      0,   53,                0.67
>   6,      0,     53,   53,               0.674
>   6,      0,      0,   54,               0.663
>   6,      0,     54,   54,               0.675
>   6,      0,      0,   55,               0.662
>   6,      0,     55,   55,               0.665
>   6,      0,      0,   56,               0.681
>   6,      0,     56,   56,               0.697
>   6,      0,      0,   57,               0.686
>   6,      0,     57,   57,               0.687
>   6,      0,      0,   58,               0.701
>   6,      0,     58,   58,               0.693
>   6,      0,      0,   59,               0.709
>   6,      0,     59,   59,               0.698
>   6,      0,      0,   60,               0.708
>   6,      0,     60,   60,               0.708
>   6,      0,      0,   61,               0.709
>   6,      0,     61,   61,               0.716
>   6,      0,      0,   62,               0.709
>   6,      0,     62,   62,               0.707
>   6,      0,      0,   63,               0.703
>   6,      0,     63,   63,               0.716
>
>  .../{strspn-sse2.S => strspn-sse2.c}          |   8 +-
>  sysdeps/x86_64/strspn.S                       | 112 ------------------
>  2 files changed, 4 insertions(+), 116 deletions(-)
>  rename sysdeps/x86_64/multiarch/{strspn-sse2.S => strspn-sse2.c} (86%)
>  delete mode 100644 sysdeps/x86_64/strspn.S
>
> diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c
> similarity index 86%
> rename from sysdeps/x86_64/multiarch/strspn-sse2.S
> rename to sysdeps/x86_64/multiarch/strspn-sse2.c
> index e0a095f25a..61cc6cb0a5 100644
> --- a/sysdeps/x86_64/multiarch/strspn-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strspn-sse2.c
> @@ -1,4 +1,4 @@
> -/* strspn optimized with SSE2.
> +/* strspn.
>     Copyright (C) 2017-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
> @@ -19,10 +19,10 @@
>  #if IS_IN (libc)
>
>  # include <sysdep.h>
> -# define strspn __strspn_sse2
> +# define STRSPN __strspn_sse2
>
>  # undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strspn)
> +# define libc_hidden_builtin_def(STRSPN)
>  #endif
>
> -#include <sysdeps/x86_64/strspn.S>
> +#include <string/strspn.c>
> diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S
> deleted file mode 100644
> index 61b76ee0a1..0000000000
> --- a/sysdeps/x86_64/strspn.S
> +++ /dev/null
> @@ -1,112 +0,0 @@
> -/* strspn (str, ss) -- Return the length of the initial segment of STR
> -                       which contains only characters from SS.
> -   For AMD x86-64.
> -   Copyright (C) 1994-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include <sysdep.h>
> -
> -       .text
> -ENTRY (strspn)
> -
> -       movq %rdi, %rdx         /* Save SRC.  */
> -
> -       /* First we create a table with flags for all possible characters.
> -          For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
> -          supported by the C string functions we have 256 characters.
> -          Before inserting marks for the stop characters we clear the whole
> -          table.  */
> -       movq %rdi, %r8                  /* Save value.  */
> -       subq $256, %rsp                 /* Make space for 256 bytes.  */
> -       cfi_adjust_cfa_offset(256)
> -       movl $32,  %ecx                 /* 32*8 bytes = 256 bytes.  */
> -       movq %rsp, %rdi
> -       xorl %eax, %eax                 /* We store 0s.  */
> -       cld
> -       rep
> -       stosq
> -
> -       movq %rsi, %rax                 /* Setup stopset.  */
> -
> -/* For understanding the following code remember that %rcx == 0 now.
> -   Although all the following instruction only modify %cl we always
> -   have a correct zero-extended 64-bit value in %rcx.  */
> -
> -       .p2align 4
> -L(2):  movb (%rax), %cl        /* get byte from stopset */
> -       testb %cl, %cl          /* is NUL char? */
> -       jz L(1)                 /* yes => start compare loop */
> -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in stopset table */
> -
> -       movb 1(%rax), %cl       /* get byte from stopset */
> -       testb $0xff, %cl        /* is NUL char? */
> -       jz L(1)                 /* yes => start compare loop */
> -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in stopset table */
> -
> -       movb 2(%rax), %cl       /* get byte from stopset */
> -       testb $0xff, %cl        /* is NUL char? */
> -       jz L(1)                 /* yes => start compare loop */
> -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in stopset table */
> -
> -       movb 3(%rax), %cl       /* get byte from stopset */
> -       addq $4, %rax           /* increment stopset pointer */
> -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in stopset table */
> -       testb $0xff, %cl        /* is NUL char? */
> -       jnz L(2)                /* no => process next dword from stopset */
> -
> -L(1):  leaq -4(%rdx), %rax     /* prepare loop */
> -
> -       /* We use a neat trick for the following loop.  Normally we would
> -          have to test for two termination conditions
> -          1. a character in the stopset was found
> -          and
> -          2. the end of the string was found
> -          But as a sign that the character is in the stopset we store its
> -          value in the table.  But the value of NUL is NUL so the loop
> -          terminates for NUL in every case.  */
> -
> -       .p2align 4
> -L(3):  addq $4, %rax           /* adjust pointer for full loop round */
> -
> -       movb (%rax), %cl        /* get byte from string */
> -       testb %cl, (%rsp,%rcx)  /* is it contained in skipset? */
> -       jz L(4)                 /* no => return */
> -
> -       movb 1(%rax), %cl       /* get byte from string */
> -       testb %cl, (%rsp,%rcx)  /* is it contained in skipset? */
> -       jz L(5)                 /* no => return */
> -
> -       movb 2(%rax), %cl       /* get byte from string */
> -       testb %cl, (%rsp,%rcx)  /* is it contained in skipset? */
> -       jz L(6)                 /* no => return */
> -
> -       movb 3(%rax), %cl       /* get byte from string */
> -       testb %cl, (%rsp,%rcx)  /* is it contained in skipset? */
> -       jnz L(3)                /* yes => start loop again */
> -
> -       incq %rax               /* adjust pointer */
> -L(6):  incq %rax
> -L(5):  incq %rax
> -
> -L(4):  addq $256, %rsp         /* remove stopset */
> -       cfi_adjust_cfa_offset(-256)
> -       subq %rdx, %rax         /* we have to return the number of valid
> -                                  characters, so compute distance to first
> -                                  non-valid character */
> -       ret
> -END (strspn)
> -libc_hidden_builtin_def (strspn)
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread
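
strspn inverts the test in the same table scheme as strcspn: mark the
accept set and advance while the current byte is marked; NUL is never
marked, so the terminator always stops the loop.  A simplified sketch,
parallel to the strcspn one earlier (illustrative only; glibc's generic
string/strspn.c is unrolled further):

  #include <stddef.h>

  /* Length of the initial segment of S consisting only of bytes from
     ACCEPT.  */
  size_t
  table_strspn (const char *s, const char *accept)
  {
    unsigned char table[256] = { 0 };
    for (const unsigned char *a = (const unsigned char *) accept; *a != 0; ++a)
      table[*a] = 1;

    const unsigned char *p = (const unsigned char *) s;
    while (table[*p] != 0)
      ++p;
    return (size_t) (p - (const unsigned char *) s);
  }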

* Re: [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
  2022-03-23 21:57 ` [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] Noah Goldstein
@ 2022-03-24 18:59   ` H.J. Lu
  2022-03-24 19:18     ` Noah Goldstein
  2022-03-24 20:50   ` [PATCH v2 12/31] " Noah Goldstein
  1 sibling, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:59 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
> __wcscmp_avx2.
>
> All string/memory tests pass.
> ---
>  sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> index 52ff5ad724..86a86b68e3 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> @@ -122,7 +122,7 @@ ENTRY(STRCMP)
>            are cases where length is large enough that it can never be a
>            bound on valid memory so just use wcscmp.  */
>         shrq    $56, %rcx
> -       jnz     __wcscmp_avx2
> +       jnz     OVERFLOW_STRCMP
>
>         leaq    (, %rdx, 4), %rdx
>  #  endif
> --
> 2.25.1
>

Isn't this a bug?  Is there a glibc bug report for it?  Should this also be
fixed on the release branches?

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c
  2022-03-23 21:57 ` [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c Noah Goldstein
@ 2022-03-24 19:00   ` H.J. Lu
  0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:00 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Just a QOL change to make parsing the output of the benchtests more
> consistent.
> ---
>  benchtests/bench-strcasecmp.c | 77 +++++++++++++++++++++++------------
>  1 file changed, 51 insertions(+), 26 deletions(-)
>
> diff --git a/benchtests/bench-strcasecmp.c b/benchtests/bench-strcasecmp.c
> index daccf1d245..855f2db2ad 100644
> --- a/benchtests/bench-strcasecmp.c
> +++ b/benchtests/bench-strcasecmp.c
> @@ -20,6 +20,7 @@
>  #define TEST_MAIN
>  #define TEST_NAME "strcasecmp"
>  #include "bench-string.h"
> +#include "json-lib.h"
>
>  typedef int (*proto_t) (const char *, const char *);
>  static int simple_strcasecmp (const char *, const char *);
> @@ -40,7 +41,8 @@ simple_strcasecmp (const char *s1, const char *s2)
>  }
>
>  static void
> -do_one_test (impl_t *impl, const char *s1, const char *s2, int exp_result)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const char *s1,
> +             const char *s2, int exp_result)
>  {
>    size_t i, iters = INNER_LOOP_ITERS;
>    timing_t start, stop, cur;
> @@ -64,12 +66,12 @@ do_one_test (impl_t *impl, const char *s1, const char *s2, int exp_result)
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align1, size_t align2, size_t len, int max_char,
> -        int exp_result)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
> +         int max_char, int exp_result)
>  {
>    size_t i;
>    char *s1, *s2;
> @@ -85,6 +87,13 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
>    if (align2 + len + 1 >= page_size)
>      return;
>
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "length", len);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +  json_array_begin (json_ctx, "timings");
> +
>    s1 = (char *) (buf1 + align1);
>    s2 = (char *) (buf2 + align2);
>
> @@ -103,53 +112,69 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
>    else
>      s2[len - 1] -= exp_result;
>
> -  printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
> -
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, s1, s2, exp_result);
> +    do_one_test (json_ctx, impl, s1, s2, exp_result);
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  test_main (void)
>  {
> +  json_ctx_t json_ctx;
>    size_t i;
>
>    test_init ();
>
> -  printf ("%23s", "");
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
>    for (i = 1; i < 16; ++i)
>      {
> -      do_test (i, i, i, 127, 0);
> -      do_test (i, i, i, 127, 1);
> -      do_test (i, i, i, 127, -1);
> +      do_test (&json_ctx, i, i, i, 127, 0);
> +      do_test (&json_ctx, i, i, i, 127, 1);
> +      do_test (&json_ctx, i, i, i, 127, -1);
>      }
>
>    for (i = 1; i < 10; ++i)
>      {
> -      do_test (0, 0, 2 << i, 127, 0);
> -      do_test (0, 0, 2 << i, 254, 0);
> -      do_test (0, 0, 2 << i, 127, 1);
> -      do_test (0, 0, 2 << i, 254, 1);
> -      do_test (0, 0, 2 << i, 127, -1);
> -      do_test (0, 0, 2 << i, 254, -1);
> +      do_test (&json_ctx, 0, 0, 2 << i, 127, 0);
> +      do_test (&json_ctx, 0, 0, 2 << i, 254, 0);
> +      do_test (&json_ctx, 0, 0, 2 << i, 127, 1);
> +      do_test (&json_ctx, 0, 0, 2 << i, 254, 1);
> +      do_test (&json_ctx, 0, 0, 2 << i, 127, -1);
> +      do_test (&json_ctx, 0, 0, 2 << i, 254, -1);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (i, 2 * i, 8 << i, 127, 0);
> -      do_test (2 * i, i, 8 << i, 254, 0);
> -      do_test (i, 2 * i, 8 << i, 127, 1);
> -      do_test (2 * i, i, 8 << i, 254, 1);
> -      do_test (i, 2 * i, 8 << i, 127, -1);
> -      do_test (2 * i, i, 8 << i, 254, -1);
> +      do_test (&json_ctx, i, 2 * i, 8 << i, 127, 0);
> +      do_test (&json_ctx, 2 * i, i, 8 << i, 254, 0);
> +      do_test (&json_ctx, i, 2 * i, 8 << i, 127, 1);
> +      do_test (&json_ctx, 2 * i, i, 8 << i, 254, 1);
> +      do_test (&json_ctx, i, 2 * i, 8 << i, 127, -1);
> +      do_test (&json_ctx, 2 * i, i, 8 << i, 254, -1);
>      }
>
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
>
> --
> 2.25.1
>
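
For anyone updating scripts that parse the benchmark output: the calls
above nest into a single JSON document of roughly the shape below.  The
attribute names come straight from the json_attr_* / json_array_begin
calls; the timing_type value, ifunc names and numbers are invented for
illustration.

  {
    "timing_type": "hp_timing",
    "functions": {
      "strcasecmp": {
        "bench-variant": "",
        "ifuncs": ["simple_strcasecmp", "__strcasecmp_avx"],
        "results": [
          {"length": 1, "align1": 1, "align2": 1, "max_char": 127,
           "timings": [12.3, 10.9]}
        ]
      }
    }
  }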

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c
  2022-03-23 21:57 ` [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c Noah Goldstein
@ 2022-03-24 19:00   ` H.J. Lu
  0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:00 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Just a QOL change to make parsing the output of the benchtests more
> consistent.
> ---
>  benchtests/bench-strncasecmp.c | 113 ++++++++++++++++++++-------------
>  1 file changed, 69 insertions(+), 44 deletions(-)
>
> diff --git a/benchtests/bench-strncasecmp.c b/benchtests/bench-strncasecmp.c
> index a9819efc73..91f49cc8d3 100644
> --- a/benchtests/bench-strncasecmp.c
> +++ b/benchtests/bench-strncasecmp.c
> @@ -20,6 +20,7 @@
>  #define TEST_MAIN
>  #define TEST_NAME "strncasecmp"
>  #include "bench-string.h"
> +#include "json-lib.h"
>
>  typedef int (*proto_t) (const char *, const char *, size_t);
>  static int simple_strncasecmp (const char *, const char *, size_t);
> @@ -47,8 +48,8 @@ simple_strncasecmp (const char *s1, const char *s2, size_t n)
>  }
>
>  static void
> -do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n,
> -            int exp_result)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const char *s1,
> +             const char *s2, size_t n, int exp_result)
>  {
>    size_t i, iters = INNER_LOOP_ITERS;
>    timing_t start, stop, cur;
> @@ -62,12 +63,12 @@ do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n,
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
> -        int exp_result)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t n,
> +         size_t len, int max_char, int exp_result)
>  {
>    size_t i;
>    char *s1, *s2;
> @@ -101,83 +102,107 @@ do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
>    else
>      s2[len - 1] -= exp_result;
>
> -  printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "length", len);
> +  json_attr_uint (json_ctx, "n", n);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, s1, s2, n, exp_result);
> +    do_one_test (json_ctx, impl, s1, s2, n, exp_result);
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  test_main (void)
>  {
> +  json_ctx_t json_ctx;
>    size_t i;
>
>    test_init ();
>
> -  printf ("%23s", "");
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
>    for (i = 1; i < 16; ++i)
>      {
> -      do_test (i, i, i - 1, i, 127, 0);
> +      do_test (&json_ctx, i, i, i - 1, i, 127, 0);
>
> -      do_test (i, i, i, i, 127, 0);
> -      do_test (i, i, i, i, 127, 1);
> -      do_test (i, i, i, i, 127, -1);
> +      do_test (&json_ctx, i, i, i, i, 127, 0);
> +      do_test (&json_ctx, i, i, i, i, 127, 1);
> +      do_test (&json_ctx, i, i, i, i, 127, -1);
>
> -      do_test (i, i, i + 1, i, 127, 0);
> -      do_test (i, i, i + 1, i, 127, 1);
> -      do_test (i, i, i + 1, i, 127, -1);
> +      do_test (&json_ctx, i, i, i + 1, i, 127, 0);
> +      do_test (&json_ctx, i, i, i + 1, i, 127, 1);
> +      do_test (&json_ctx, i, i, i + 1, i, 127, -1);
>      }
>
>    for (i = 1; i < 10; ++i)
>      {
> -      do_test (0, 0, (2 << i) - 1, 2 << i, 127, 0);
> -      do_test (0, 0, 2 << i, 2 << i, 254, 0);
> -      do_test (0, 0, (2 << i) + 1, 2 << i, 127, 0);
> +      do_test (&json_ctx, 0, 0, (2 << i) - 1, 2 << i, 127, 0);
> +      do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 254, 0);
> +      do_test (&json_ctx, 0, 0, (2 << i) + 1, 2 << i, 127, 0);
>
> -      do_test (0, 0, (2 << i) + 1, 2 << i, 254, 0);
> +      do_test (&json_ctx, 0, 0, (2 << i) + 1, 2 << i, 254, 0);
>
> -      do_test (0, 0, 2 << i, 2 << i, 127, 1);
> -      do_test (0, 0, (2 << i) + 10, 2 << i, 127, 1);
> +      do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 127, 1);
> +      do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 127, 1);
>
> -      do_test (0, 0, 2 << i, 2 << i, 254, 1);
> -      do_test (0, 0, (2 << i) + 10, 2 << i, 254, 1);
> +      do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 254, 1);
> +      do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 254, 1);
>
> -      do_test (0, 0, 2 << i, 2 << i, 127, -1);
> -      do_test (0, 0, (2 << i) + 10, 2 << i, 127, -1);
> +      do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 127, -1);
> +      do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 127, -1);
>
> -      do_test (0, 0, 2 << i, 2 << i, 254, -1);
> -      do_test (0, 0, (2 << i) + 10, 2 << i, 254, -1);
> +      do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 254, -1);
> +      do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 254, -1);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (i, 2 * i, (8 << i) - 1, 8 << i, 127, 0);
> -      do_test (i, 2 * i, 8 << i, 8 << i, 127, 0);
> -      do_test (i, 2 * i, (8 << i) + 100, 8 << i, 127, 0);
> +      do_test (&json_ctx, i, 2 * i, (8 << i) - 1, 8 << i, 127, 0);
> +      do_test (&json_ctx, i, 2 * i, 8 << i, 8 << i, 127, 0);
> +      do_test (&json_ctx, i, 2 * i, (8 << i) + 100, 8 << i, 127, 0);
>
> -      do_test (2 * i, i, (8 << i) - 1, 8 << i, 254, 0);
> -      do_test (2 * i, i, 8 << i, 8 << i, 254, 0);
> -      do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, 0);
> +      do_test (&json_ctx, 2 * i, i, (8 << i) - 1, 8 << i, 254, 0);
> +      do_test (&json_ctx, 2 * i, i, 8 << i, 8 << i, 254, 0);
> +      do_test (&json_ctx, 2 * i, i, (8 << i) + 100, 8 << i, 254, 0);
>
> -      do_test (i, 2 * i, 8 << i, 8 << i, 127, 1);
> -      do_test (i, 2 * i, (8 << i) + 100, 8 << i, 127, 1);
> +      do_test (&json_ctx, i, 2 * i, 8 << i, 8 << i, 127, 1);
> +      do_test (&json_ctx, i, 2 * i, (8 << i) + 100, 8 << i, 127, 1);
>
> -      do_test (2 * i, i, 8 << i, 8 << i, 254, 1);
> -      do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, 1);
> +      do_test (&json_ctx, 2 * i, i, 8 << i, 8 << i, 254, 1);
> +      do_test (&json_ctx, 2 * i, i, (8 << i) + 100, 8 << i, 254, 1);
>
> -      do_test (i, 2 * i, 8 << i, 8 << i, 127, -1);
> -      do_test (i, 2 * i, (8 << i) + 100, 8 << i, 127, -1);
> +      do_test (&json_ctx, i, 2 * i, 8 << i, 8 << i, 127, -1);
> +      do_test (&json_ctx, i, 2 * i, (8 << i) + 100, 8 << i, 127, -1);
>
> -      do_test (2 * i, i, 8 << i, 8 << i, 254, -1);
> -      do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, -1);
> +      do_test (&json_ctx, 2 * i, i, 8 << i, 8 << i, 254, -1);
> +      do_test (&json_ctx, 2 * i, i, (8 << i) + 100, 8 << i, 254, -1);
>      }
>
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
>
> --
> 2.25.1
>
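
One detail the new (i - 1, i, i + 1) bound cases exercise: when n stops
short of the difference byte, the strings must compare equal no matter
what follows.  A tiny standalone illustration of that semantic:

  #include <stdio.h>
  #include <strings.h>

  int
  main (void)
  {
    /* The difference sits at index 3, so n == 3 hides it.  */
    printf ("%d\n", strncasecmp ("abcX", "abcY", 3) == 0);  /* 1 */
    printf ("%d\n", strncasecmp ("abcX", "abcY", 4) == 0);  /* 0 */
    return 0;
  }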

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c
  2022-03-23 21:57 ` [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c Noah Goldstein
@ 2022-03-24 19:01   ` H.J. Lu
  0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:01 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Add more robust tests that cover all the page cross edge cases.
> ---
>  string/test-strcasecmp.c | 112 ++++++++++++++++++++++++++++++++++-----
>  1 file changed, 100 insertions(+), 12 deletions(-)
>
> diff --git a/string/test-strcasecmp.c b/string/test-strcasecmp.c
> index 3d994f9d64..438a9713ac 100644
> --- a/string/test-strcasecmp.c
> +++ b/string/test-strcasecmp.c
> @@ -18,6 +18,10 @@
>
>  #include <locale.h>
>  #include <ctype.h>
> +#include <assert.h>
> +#define TEST_LEN (getpagesize () * 3)
> +#define MIN_PAGE_SIZE (TEST_LEN + 2 * getpagesize ())
> +
>  #define TEST_MAIN
>  #define TEST_NAME "strcasecmp"
>  #include "test-string.h"
> @@ -85,12 +89,13 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
>    if (len == 0)
>      return;
>
> -  align1 &= 7;
> -  if (align1 + len + 1 >= page_size)
> +
> +  align1 &= getpagesize () - 1;
> +  if (align1 + (len + 1) >= page_size)
>      return;
>
> -  align2 &= 7;
> -  if (align2 + len + 1 >= page_size)
> +  align2 &= getpagesize () - 1;
> +  if (align2 + (len + 1) >= page_size)
>      return;
>
>    s1 = (char *) (buf1 + align1);
> @@ -105,12 +110,33 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
>    s1[len] = s2[len] = 0;
>    s1[len + 1] = 23;
>    s2[len + 1] = 24 + exp_result;
> +
>    if ((s2[len - 1] == 'z' && exp_result == -1)
>        || (s2[len - 1] == 'a' && exp_result == 1))
>      s1[len - 1] += exp_result;
> +  else if ((s1[len - 1] == 'Z' + 1 && exp_result == 1)
> +           || (s1[len - 1] == 'A' - 1 && exp_result == -1))
> +    s1[len - 1] = tolower (s2[len - 1]) + exp_result;
>    else
>      s2[len - 1] -= exp_result;
>
> +  /* For some locales this is not guaranteed yet.  */
> +  if (tolower (s1[len - 1]) - tolower (s2[len - 1]) != exp_result)
> +    {
> +      if (exp_result == -1)
> +        {
> +          s1[len - 1] = tolower ('a');
> +          s2[len - 1] = toupper (tolower ('a') - 1);
> +        }
> +      else if (exp_result == 0)
> +        s1[len - 1] = toupper (s2[len - 1]);
> +      else
> +        {
> +          s1[len - 1] = tolower ('a');
> +          s2[len - 1] = toupper (tolower ('a') + 1);
> +        }
> +    }
> +
>    FOR_EACH_IMPL (impl, 0)
>      do_one_test (impl, s1, s2, exp_result);
>  }
> @@ -207,10 +233,10 @@ do_random_tests (void)
>  }
>
>  static void
> -test_locale (const char *locale)
> +test_locale (const char *locale, int extra_tests)
>  {
> -  size_t i;
> -
> +  size_t i, j, k;
> +  const size_t test_len = MIN(TEST_LEN, 3 * 4096);
>    if (setlocale (LC_CTYPE, locale) == NULL)
>      {
>        error (0, 0, "cannot set locale \"%s\"", locale);
> @@ -249,6 +275,68 @@ test_locale (const char *locale)
>        do_test (2 * i, i, 8 << i, 254, -1);
>      }
>
> +  for (j = 0; extra_tests && j < 160; ++j)
> +    {
> +      for (i = 0; i < test_len;)
> +        {
> +          do_test (getpagesize () - j - 1, 0, i, 127, 0);
> +          do_test (getpagesize () - j - 1, 0, i, 127, 1);
> +          do_test (getpagesize () - j - 1, 0, i, 127, -1);
> +
> +          do_test (getpagesize () - j - 1, j, i, 127, 0);
> +          do_test (getpagesize () - j - 1, j, i, 127, 1);
> +          do_test (getpagesize () - j - 1, j, i, 127, -1);
> +
> +          do_test (0, getpagesize () - j - 1, i, 127, 0);
> +          do_test (0, getpagesize () - j - 1, i, 127, 1);
> +          do_test (0, getpagesize () - j - 1, i, 127, -1);
> +
> +          do_test (j, getpagesize () - j - 1, i, 127, 0);
> +          do_test (j, getpagesize () - j - 1, i, 127, 1);
> +          do_test (j, getpagesize () - j - 1, i, 127, -1);
> +
> +          for (k = 2; k <= 128; k += k)
> +            {
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 0);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
> +                       0);
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 1);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
> +                       1);
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, -1);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
> +                       -1);
> +            }
> +
> +          if (i < 32)
> +            {
> +              i += 1;
> +            }
> +          else if (i < 161)
> +            {
> +              i += 7;
> +            }
> +          else if (i + 161 < test_len)
> +            {
> +              i += 31;
> +              i *= 17;
> +              i /= 16;
> +              if (i + 161 > test_len)
> +                {
> +                  i = test_len - 160;
> +                }
> +            }
> +          else if (i + 32 < test_len)
> +            {
> +              i += 7;
> +            }
> +          else
> +            {
> +              i += 1;
> +            }
> +        }
> +    }
> +
>    do_random_tests ();
>  }
>
> @@ -257,11 +345,11 @@ test_main (void)
>  {
>    test_init ();
>
> -  test_locale ("C");
> -  test_locale ("en_US.ISO-8859-1");
> -  test_locale ("en_US.UTF-8");
> -  test_locale ("tr_TR.ISO-8859-9");
> -  test_locale ("tr_TR.UTF-8");
> +  test_locale ("C", 1);
> +  test_locale ("en_US.ISO-8859-1", 0);
> +  test_locale ("en_US.UTF-8", 0);
> +  test_locale ("tr_TR.ISO-8859-9", 0);
> +  test_locale ("tr_TR.UTF-8", 0);
>
>    return ret;
>  }
> --
> 2.25.1
>
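
The stepping logic at the bottom of the new loop is worth unpacking: it
tries every length up to 32, strides of 7 up to about 160, roughly 6%
geometric growth through the middle, and then densifies again for the
last 160 bytes before test_len.  A sketch that prints the same schedule
(assuming test_len = 3 * 4096):

  #include <stdio.h>

  int
  main (void)
  {
    size_t test_len = 3 * 4096, i = 0;
    while (i < test_len)
      {
        printf ("%zu\n", i);
        if (i < 32)
          i += 1;
        else if (i < 161)
          i += 7;
        else if (i + 161 < test_len)
          {
            i += 31;
            i = i * 17 / 16;        /* ~6% geometric growth */
            if (i + 161 > test_len)
              i = test_len - 160;   /* jump into the dense tail */
          }
        else if (i + 32 < test_len)
          i += 7;
        else
          i += 1;
      }
    return 0;
  }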

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c
  2022-03-23 21:57 ` [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c Noah Goldstein
@ 2022-03-24 19:01   ` H.J. Lu
  0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:01 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Add more robust tests that cover all the page cross edge cases.
> ---
>  string/test-strncasecmp.c | 166 +++++++++++++++++++++++++++++++++++---
>  1 file changed, 154 insertions(+), 12 deletions(-)
>
> diff --git a/string/test-strncasecmp.c b/string/test-strncasecmp.c
> index a3c848165a..b86c630bf6 100644
> --- a/string/test-strncasecmp.c
> +++ b/string/test-strncasecmp.c
> @@ -18,6 +18,10 @@
>
>  #include <locale.h>
>  #include <ctype.h>
> +
> +#define TEST_LEN (getpagesize () * 3)
> +#define MIN_PAGE_SIZE (TEST_LEN + 2 * getpagesize ())
> +
>  #define TEST_MAIN
>  #define TEST_NAME "strncasecmp"
>  #include "test-string.h"
> @@ -106,14 +110,15 @@ do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
>    if (len == 0)
>      return;
>
> -  align1 &= 7;
> -  if (align1 + len + 1 >= page_size)
> +  align1 &= getpagesize () - 1;
> +  if (align1 + (len + 2) >= page_size)
>      return;
>
> -  align2 &= 7;
> -  if (align2 + len + 1 >= page_size)
> +  align2 &= getpagesize () - 1;
> +  if (align2 + (len + 2) >= page_size)
>      return;
>
> +
>    s1 = (char *) (buf1 + align1);
>    s2 = (char *) (buf2 + align2);
>
> @@ -126,12 +131,33 @@ do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
>    s1[len] = s2[len] = 0;
>    s1[len + 1] = 23;
>    s2[len + 1] = 24 + exp_result;
> +
>    if ((s2[len - 1] == 'z' && exp_result == -1)
>        || (s2[len - 1] == 'a' && exp_result == 1))
>      s1[len - 1] += exp_result;
> +  else if ((s1[len - 1] == 'Z' + 1 && exp_result == 1)
> +           || (s1[len - 1] == 'A' - 1 && exp_result == -1))
> +    s1[len - 1] = tolower (s2[len - 1]) + exp_result;
>    else
>      s2[len - 1] -= exp_result;
>
> +  /* For some locales this is not guaranteed yet.  */
> +  if (tolower (s1[len - 1]) - tolower (s2[len - 1]) != exp_result)
> +    {
> +      if (exp_result == -1)
> +        {
> +          s1[len - 1] = tolower ('a');
> +          s2[len - 1] = toupper (tolower ('a') - 1);
> +        }
> +      else if (exp_result == 0)
> +        s1[len - 1] = toupper (s2[len - 1]);
> +      else
> +        {
> +          s1[len - 1] = tolower ('a');
> +          s2[len - 1] = toupper (tolower ('a') + 1);
> +        }
> +    }
> +
>    FOR_EACH_IMPL (impl, 0)
>      do_one_test (impl, s1, s2, n, exp_result);
>  }
> @@ -299,10 +325,10 @@ bz14195 (void)
>  }
>
>  static void
> -test_locale (const char *locale)
> +test_locale (const char *locale, int extra_tests)
>  {
> -  size_t i;
> -
> +  size_t i, j, k;
> +  const size_t test_len = MIN(TEST_LEN, 3 * 4096);
>    if (setlocale (LC_CTYPE, locale) == NULL)
>      {
>        error (0, 0, "cannot set locale \"%s\"", locale);
> @@ -374,6 +400,122 @@ test_locale (const char *locale)
>        do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, -1);
>      }
>
> +  for (j = 0; extra_tests && j < 160; ++j)
> +    {
> +      for (i = 0; i < test_len;)
> +        {
> +            do_test (getpagesize () - j - 1, 0, i + 1, i, 127, 0);
> +            do_test (getpagesize () - j - 1, 0, i + 1, i, 127, 1);
> +            do_test (getpagesize () - j - 1, 0, i + 1, i, 127, -1);
> +
> +            do_test (getpagesize () - j - 1, 0, i, i, 127, 0);
> +            do_test (getpagesize () - j - 1, 0, i - 1, i, 127, 0);
> +
> +            do_test (getpagesize () - j - 1, 0, ULONG_MAX, i, 127, 0);
> +            do_test (getpagesize () - j - 1, 0, ULONG_MAX, i, 127, 1);
> +            do_test (getpagesize () - j - 1, 0, ULONG_MAX, i, 127, -1);
> +
> +            do_test (getpagesize () - j - 1, 0, ULONG_MAX - i, i, 127, 0);
> +            do_test (getpagesize () - j - 1, 0, ULONG_MAX - i, i, 127, 1);
> +            do_test (getpagesize () - j - 1, 0, ULONG_MAX - i, i, 127, -1);
> +
> +            do_test (getpagesize () - j - 1, j, i + 1, i, 127, 0);
> +            do_test (getpagesize () - j - 1, j, i + 1, i, 127, 1);
> +            do_test (getpagesize () - j - 1, j, i + 1, i, 127, -1);
> +
> +            do_test (getpagesize () - j - 1, j, i, i, 127, 0);
> +            do_test (getpagesize () - j - 1, j, i - 1, i, 127, 0);
> +
> +            do_test (getpagesize () - j - 1, j, ULONG_MAX, i, 127, 0);
> +            do_test (getpagesize () - j - 1, j, ULONG_MAX, i, 127, 1);
> +            do_test (getpagesize () - j - 1, j, ULONG_MAX, i, 127, -1);
> +
> +            do_test (getpagesize () - j - 1, j, ULONG_MAX - i, i, 127, 0);
> +            do_test (getpagesize () - j - 1, j, ULONG_MAX - i, i, 127, 1);
> +            do_test (getpagesize () - j - 1, j, ULONG_MAX - i, i, 127, -1);
> +
> +            do_test (0, getpagesize () - j - 1, i + 1, i, 127, 0);
> +            do_test (0, getpagesize () - j - 1, i + 1, i, 127, 1);
> +            do_test (0, getpagesize () - j - 1, i + 1, i, 127, -1);
> +
> +            do_test (0, getpagesize () - j - 1, i, i, 127, 0);
> +            do_test (0, getpagesize () - j - 1, i - 1, i, 127, 0);
> +
> +            do_test (0, getpagesize () - j - 1, ULONG_MAX, i, 127, 0);
> +            do_test (0, getpagesize () - j - 1, ULONG_MAX, i, 127, 1);
> +            do_test (0, getpagesize () - j - 1, ULONG_MAX, i, 127, -1);
> +
> +            do_test (0, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 0);
> +            do_test (0, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 1);
> +            do_test (0, getpagesize () - j - 1, ULONG_MAX - i, i, 127, -1);
> +
> +            do_test (j, getpagesize () - j - 1, i + 1, i, 127, 0);
> +            do_test (j, getpagesize () - j - 1, i + 1, i, 127, 1);
> +            do_test (j, getpagesize () - j - 1, i + 1, i, 127, -1);
> +
> +            do_test (j, getpagesize () - j - 1, i, i, 127, 0);
> +            do_test (j, getpagesize () - j - 1, i - 1, i, 127, 0);
> +
> +            do_test (j, getpagesize () - j - 1, ULONG_MAX, i, 127, 0);
> +            do_test (j, getpagesize () - j - 1, ULONG_MAX, i, 127, 1);
> +            do_test (j, getpagesize () - j - 1, ULONG_MAX, i, 127, -1);
> +
> +            do_test (j, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 0);
> +            do_test (j, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 1);
> +            do_test (j, getpagesize () - j - 1, ULONG_MAX - i, i, 127, -1);
> +
> +          for (k = 2; k <= 128; k += k)
> +            {
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i - 1, i,
> +                       127, 0);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i - 1,
> +                       i, 127, 0);
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
> +                       127, 0);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
> +                       i, 127, 0);
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i, i, 127,
> +                       0);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, i,
> +                       127, 0);
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
> +                       127, -1);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
> +                       i, 127, -1);
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
> +                       127, 1);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
> +                       i, 127, 1);
> +            }
> +          if (i < 32)
> +            {
> +              i += 1;
> +            }
> +          else if (i < 161)
> +            {
> +              i += 7;
> +            }
> +          else if (i + 161 < test_len)
> +            {
> +              i += 31;
> +              i *= 17;
> +              i /= 16;
> +              if (i + 161 > test_len)
> +                {
> +                  i = test_len - 160;
> +                }
> +            }
> +          else if (i + 32 < test_len)
> +            {
> +              i += 7;
> +            }
> +          else
> +            {
> +              i += 1;
> +            }
> +        }
> +    }
> +
>    do_random_tests ();
>    do_page_tests ();
>  }
> @@ -383,11 +525,11 @@ test_main (void)
>  {
>    test_init ();
>
> -  test_locale ("C");
> -  test_locale ("en_US.ISO-8859-1");
> -  test_locale ("en_US.UTF-8");
> -  test_locale ("tr_TR.ISO-8859-9");
> -  test_locale ("tr_TR.UTF-8");
> +  test_locale ("C", 1);
> +  test_locale ("en_US.ISO-8859-1", 0);
> +  test_locale ("en_US.UTF-8", 0);
> +  test_locale ("tr_TR.ISO-8859-9", 0);
> +  test_locale ("tr_TR.UTF-8", 0);
>
>    return ret;
>  }
> --
> 2.25.1
>
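
The reason MIN_PAGE_SIZE reserves two extra pages: the harness can then
start strings so close to the end of a page that a wide load in an
over-reading implementation touches the next, inaccessible page and
faults.  A sketch of the general technique (an assumption about the
setup, not the actual test-string.h internals):

  #include <stddef.h>
  #include <sys/mman.h>
  #include <unistd.h>

  static char *
  make_guarded_buffer (size_t npages)
  {
    size_t ps = getpagesize ();
    char *p = mmap (NULL, (npages + 1) * ps, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED || mprotect (p + npages * ps, ps, PROT_NONE) != 0)
      return NULL;
    /* Any load past p + npages * ps now faults instead of silently
       reading another object.  */
    return p;
  }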

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S
  2022-03-23 21:57 ` [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S Noah Goldstein
@ 2022-03-24 19:02   ` H.J. Lu
  2022-05-12 19:44     ` Sunil Pandey
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:02 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Use a slightly faster method of doing TOLOWER that saves an
> instruction.
>
> Also replace the hard coded 5-byte nop with .p2align 4.  On builds with
> CET enabled the old padding misaligned the entry to strcasecmp.
>
> geometric_mean(N=40) of all benchmarks New / Original: .894
>
> All string/memory tests pass.
> ---
> Geometric Mean, N=40 runs; all functions page aligned
> length, align1, align2, max_char, New Time / Old Time
>      1,      1,      1,      127,               0.903
>      2,      2,      2,      127,               0.905
>      3,      3,      3,      127,               0.877
>      4,      4,      4,      127,               0.888
>      5,      5,      5,      127,               0.901
>      6,      6,      6,      127,               0.954
>      7,      7,      7,      127,               0.932
>      8,      0,      0,      127,               0.918
>      9,      1,      1,      127,               0.914
>     10,      2,      2,      127,               0.877
>     11,      3,      3,      127,               0.909
>     12,      4,      4,      127,               0.876
>     13,      5,      5,      127,               0.886
>     14,      6,      6,      127,               0.914
>     15,      7,      7,      127,               0.939
>      4,      0,      0,      127,               0.963
>      4,      0,      0,      254,               0.943
>      8,      0,      0,      254,               0.927
>     16,      0,      0,      127,               0.876
>     16,      0,      0,      254,               0.865
>     32,      0,      0,      127,               0.865
>     32,      0,      0,      254,               0.862
>     64,      0,      0,      127,               0.863
>     64,      0,      0,      254,               0.896
>    128,      0,      0,      127,               0.885
>    128,      0,      0,      254,               0.882
>    256,      0,      0,      127,                0.87
>    256,      0,      0,      254,               0.869
>    512,      0,      0,      127,               0.832
>    512,      0,      0,      254,               0.848
>   1024,      0,      0,      127,               0.835
>   1024,      0,      0,      254,               0.843
>     16,      1,      2,      127,               0.914
>     16,      2,      1,      254,               0.949
>     32,      2,      4,      127,               0.955
>     32,      4,      2,      254,               1.004
>     64,      3,      6,      127,               0.844
>     64,      6,      3,      254,               0.905
>    128,      4,      0,      127,               0.889
>    128,      0,      4,      254,               0.845
>    256,      5,      2,      127,               0.929
>    256,      2,      5,      254,               0.907
>    512,      6,      4,      127,               0.837
>    512,      4,      6,      254,               0.862
>   1024,      7,      6,      127,               0.895
>   1024,      6,      7,      254,                0.89
>
>  sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
>  1 file changed, 29 insertions(+), 35 deletions(-)
>
> diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
> index e2ab59c555..99d8b36f1d 100644
> --- a/sysdeps/x86_64/strcmp.S
> +++ b/sysdeps/x86_64/strcmp.S
> @@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp)
>         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
>         mov     %fs:(%rax),%RDX_LP
>
> -       // XXX 5 byte should be before the function
> -       /* 5-byte NOP.  */
> -       .byte   0x0f,0x1f,0x44,0x00,0x00
> +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> +       .p2align 4
>  END2 (__strcasecmp)
>  # ifndef NO_NOLOCALE_ALIAS
>  weak_alias (__strcasecmp, strcasecmp)
> @@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp)
>         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
>         mov     %fs:(%rax),%RCX_LP
>
> -       // XXX 5 byte should be before the function
> -       /* 5-byte NOP.  */
> -       .byte   0x0f,0x1f,0x44,0x00,0x00
> +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> +       .p2align 4
>  END2 (__strncasecmp)
>  # ifndef NO_NOLOCALE_ALIAS
>  weak_alias (__strncasecmp, strncasecmp)
> @@ -146,22 +144,22 @@ ENTRY (STRCMP)
>  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
>         .section .rodata.cst16,"aM",@progbits,16
>         .align 16
> -.Lbelowupper:
> -       .quad   0x4040404040404040
> -       .quad   0x4040404040404040
> -.Ltopupper:
> -       .quad   0x5b5b5b5b5b5b5b5b
> -       .quad   0x5b5b5b5b5b5b5b5b
> -.Ltouppermask:
> +.Llcase_min:
> +       .quad   0x3f3f3f3f3f3f3f3f
> +       .quad   0x3f3f3f3f3f3f3f3f
> +.Llcase_max:
> +       .quad   0x9999999999999999
> +       .quad   0x9999999999999999
> +.Lcase_add:
>         .quad   0x2020202020202020
>         .quad   0x2020202020202020
>         .previous
> -       movdqa  .Lbelowupper(%rip), %xmm5
> -# define UCLOW_reg %xmm5
> -       movdqa  .Ltopupper(%rip), %xmm6
> -# define UCHIGH_reg %xmm6
> -       movdqa  .Ltouppermask(%rip), %xmm7
> -# define LCQWORD_reg %xmm7
> +       movdqa  .Llcase_min(%rip), %xmm5
> +# define LCASE_MIN_reg %xmm5
> +       movdqa  .Llcase_max(%rip), %xmm6
> +# define LCASE_MAX_reg %xmm6
> +       movdqa  .Lcase_add(%rip), %xmm7
> +# define CASE_ADD_reg %xmm7
>  #endif
>         cmp     $0x30, %ecx
>         ja      LABEL(crosscache)       /* rsi: 16-byte load will cross cache line */
> @@ -172,22 +170,18 @@ ENTRY (STRCMP)
>         movhpd  8(%rdi), %xmm1
>         movhpd  8(%rsi), %xmm2
>  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> -# define TOLOWER(reg1, reg2) \
> -       movdqa  reg1, %xmm8;                                    \
> -       movdqa  UCHIGH_reg, %xmm9;                              \
> -       movdqa  reg2, %xmm10;                                   \
> -       movdqa  UCHIGH_reg, %xmm11;                             \
> -       pcmpgtb UCLOW_reg, %xmm8;                               \
> -       pcmpgtb reg1, %xmm9;                                    \
> -       pcmpgtb UCLOW_reg, %xmm10;                              \
> -       pcmpgtb reg2, %xmm11;                                   \
> -       pand    %xmm9, %xmm8;                                   \
> -       pand    %xmm11, %xmm10;                                 \
> -       pand    LCQWORD_reg, %xmm8;                             \
> -       pand    LCQWORD_reg, %xmm10;                            \
> -       por     %xmm8, reg1;                                    \
> -       por     %xmm10, reg2
> -       TOLOWER (%xmm1, %xmm2)
> +#  define TOLOWER(reg1, reg2) \
> +       movdqa  LCASE_MIN_reg, %xmm8;                                   \
> +       movdqa  LCASE_MIN_reg, %xmm9;                                   \
> +       paddb   reg1, %xmm8;                                    \
> +       paddb   reg2, %xmm9;                                    \
> +       pcmpgtb LCASE_MAX_reg, %xmm8;                           \
> +       pcmpgtb LCASE_MAX_reg, %xmm9;                           \
> +       pandn   CASE_ADD_reg, %xmm8;                                    \
> +       pandn   CASE_ADD_reg, %xmm9;                                    \
> +       paddb   %xmm8, reg1;                                    \
> +       paddb   %xmm9, reg2
> +       TOLOWER (%xmm1, %xmm2)
>  #else
>  # define TOLOWER(reg1, reg2)
>  #endif
> --
> 2.25.1
>
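
The arithmetic behind the new constants, per byte: adding 0x3f maps
'A'..'Z' (0x41..0x5a) onto 0x80..0x99, which as signed bytes are the
only values not greater than (signed) 0x99, so one signed compare
isolates the uppercase range and pandn turns the result into a +0x20
adjustment.  A scalar C model of one lane (tolower_byte is a
hypothetical name):

  #include <stdint.h>
  #include <stdio.h>

  static uint8_t
  tolower_byte (uint8_t c)
  {
    int8_t shifted = (int8_t) (uint8_t) (c + 0x3f);  /* paddb lcase_min */
    /* pcmpgtb against lcase_max, then pandn with case_add.  */
    uint8_t add = shifted > (int8_t) 0x99 ? 0 : 0x20;
    return c + add;                                  /* paddb case_add */
  }

  int
  main (void)
  {
    /* Prints exactly A -> a through Z -> z.  */
    for (int c = 1; c < 256; ++c)
      if (tolower_byte ((uint8_t) c) != (uint8_t) c)
        printf ("%c -> %c\n", c, tolower_byte ((uint8_t) c));
    return 0;
  }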

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
  2022-03-23 21:57 ` [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S Noah Goldstein
@ 2022-03-24 19:02   ` H.J. Lu
  2022-05-12 19:45     ` Sunil Pandey
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:02 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Use a slightly faster method of doing TOLOWER that saves an
> instruction.
>
> Also replace the hard coded 5-byte nop with .p2align 4.  On builds with
> CET enabled the old padding misaligned the entry to strcasecmp.
>
> geometric_mean(N=40) of all benchmarks New / Original: .920
>
> All string/memory tests pass.
> ---
> Geometric Mean, N=40 runs; all functions page aligned
> length, align1, align2, max_char, New Time / Old Time
>      1,      1,      1,      127,               0.914
>      2,      2,      2,      127,               0.952
>      3,      3,      3,      127,               0.924
>      4,      4,      4,      127,               0.995
>      5,      5,      5,      127,               0.985
>      6,      6,      6,      127,               1.017
>      7,      7,      7,      127,               1.031
>      8,      0,      0,      127,               0.967
>      9,      1,      1,      127,               0.969
>     10,      2,      2,      127,               0.951
>     11,      3,      3,      127,               0.938
>     12,      4,      4,      127,               0.937
>     13,      5,      5,      127,               0.967
>     14,      6,      6,      127,               0.941
>     15,      7,      7,      127,               0.951
>      4,      0,      0,      127,               0.959
>      4,      0,      0,      254,                0.98
>      8,      0,      0,      254,               0.959
>     16,      0,      0,      127,               0.895
>     16,      0,      0,      254,               0.901
>     32,      0,      0,      127,                0.85
>     32,      0,      0,      254,               0.851
>     64,      0,      0,      127,               0.897
>     64,      0,      0,      254,               0.895
>    128,      0,      0,      127,               0.944
>    128,      0,      0,      254,               0.935
>    256,      0,      0,      127,               0.922
>    256,      0,      0,      254,               0.913
>    512,      0,      0,      127,               0.921
>    512,      0,      0,      254,               0.914
>   1024,      0,      0,      127,               0.845
>   1024,      0,      0,      254,                0.84
>     16,      1,      2,      127,               0.923
>     16,      2,      1,      254,               0.955
>     32,      2,      4,      127,               0.979
>     32,      4,      2,      254,               0.957
>     64,      3,      6,      127,               0.866
>     64,      6,      3,      254,               0.849
>    128,      4,      0,      127,               0.882
>    128,      0,      4,      254,               0.876
>    256,      5,      2,      127,               0.877
>    256,      2,      5,      254,               0.882
>    512,      6,      4,      127,               0.822
>    512,      4,      6,      254,               0.862
>   1024,      7,      6,      127,               0.903
>   1024,      6,      7,      254,               0.908
>
>  sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++--------------
>  1 file changed, 35 insertions(+), 48 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> index 580feb90e9..7805ae9d41 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> @@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp))
>         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
>         mov     %fs:(%rax),%RDX_LP
>
> -       // XXX 5 byte should be before the function
> -       /* 5-byte NOP.  */
> -       .byte   0x0f,0x1f,0x44,0x00,0x00
> +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> +       .p2align 4
>  END (GLABEL(__strcasecmp))
>         /* FALLTHROUGH to strcasecmp_l.  */
>  #endif
> @@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp))
>         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
>         mov     %fs:(%rax),%RCX_LP
>
> -       // XXX 5 byte should be before the function
> -       /* 5-byte NOP.  */
> -       .byte   0x0f,0x1f,0x44,0x00,0x00
> +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> +       .p2align 4
>  END (GLABEL(__strncasecmp))
>         /* FALLTHROUGH to strncasecmp_l.  */
>  #endif
> @@ -169,27 +167,22 @@ STRCMP_SSE42:
>  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
>         .section .rodata.cst16,"aM",@progbits,16
>         .align 16
> -LABEL(belowupper):
> -       .quad   0x4040404040404040
> -       .quad   0x4040404040404040
> -LABEL(topupper):
> -# ifdef USE_AVX
> -       .quad   0x5a5a5a5a5a5a5a5a
> -       .quad   0x5a5a5a5a5a5a5a5a
> -# else
> -       .quad   0x5b5b5b5b5b5b5b5b
> -       .quad   0x5b5b5b5b5b5b5b5b
> -# endif
> -LABEL(touppermask):
> +LABEL(lcase_min):
> +       .quad   0x3f3f3f3f3f3f3f3f
> +       .quad   0x3f3f3f3f3f3f3f3f
> +LABEL(lcase_max):
> +       .quad   0x9999999999999999
> +       .quad   0x9999999999999999
> +LABEL(case_add):
>         .quad   0x2020202020202020
>         .quad   0x2020202020202020
>         .previous
> -       movdqa  LABEL(belowupper)(%rip), %xmm4
> -# define UCLOW_reg %xmm4
> -       movdqa  LABEL(topupper)(%rip), %xmm5
> -# define UCHIGH_reg %xmm5
> -       movdqa  LABEL(touppermask)(%rip), %xmm6
> -# define LCQWORD_reg %xmm6
> +       movdqa  LABEL(lcase_min)(%rip), %xmm4
> +# define LCASE_MIN_reg %xmm4
> +       movdqa  LABEL(lcase_max)(%rip), %xmm5
> +# define LCASE_MAX_reg %xmm5
> +       movdqa  LABEL(case_add)(%rip), %xmm6
> +# define CASE_ADD_reg %xmm6
>  #endif
>         cmp     $0x30, %ecx
>         ja      LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
> @@ -200,32 +193,26 @@ LABEL(touppermask):
>  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
>  # ifdef USE_AVX
>  #  define TOLOWER(reg1, reg2) \
> -       vpcmpgtb UCLOW_reg, reg1, %xmm7;                        \
> -       vpcmpgtb UCHIGH_reg, reg1, %xmm8;                       \
> -       vpcmpgtb UCLOW_reg, reg2, %xmm9;                        \
> -       vpcmpgtb UCHIGH_reg, reg2, %xmm10;                      \
> -       vpandn  %xmm7, %xmm8, %xmm8;                                    \
> -       vpandn  %xmm9, %xmm10, %xmm10;                                  \
> -       vpand   LCQWORD_reg, %xmm8, %xmm8;                              \
> -       vpand   LCQWORD_reg, %xmm10, %xmm10;                            \
> -       vpor    reg1, %xmm8, reg1;                                      \
> -       vpor    reg2, %xmm10, reg2
> +       vpaddb  LCASE_MIN_reg, reg1, %xmm7;                                     \
> +       vpaddb  LCASE_MIN_reg, reg2, %xmm8;                                     \
> +       vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7;                                   \
> +       vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8;                                   \
> +       vpandn  CASE_ADD_reg, %xmm7, %xmm7;                                     \
> +       vpandn  CASE_ADD_reg, %xmm8, %xmm8;                                     \
> +       vpaddb  %xmm7, reg1, reg1;                                      \
> +       vpaddb  %xmm8, reg2, reg2
>  # else
>  #  define TOLOWER(reg1, reg2) \
> -       movdqa  reg1, %xmm7;                                    \
> -       movdqa  UCHIGH_reg, %xmm8;                              \
> -       movdqa  reg2, %xmm9;                                    \
> -       movdqa  UCHIGH_reg, %xmm10;                             \
> -       pcmpgtb UCLOW_reg, %xmm7;                               \
> -       pcmpgtb reg1, %xmm8;                                    \
> -       pcmpgtb UCLOW_reg, %xmm9;                               \
> -       pcmpgtb reg2, %xmm10;                                   \
> -       pand    %xmm8, %xmm7;                                   \
> -       pand    %xmm10, %xmm9;                                  \
> -       pand    LCQWORD_reg, %xmm7;                             \
> -       pand    LCQWORD_reg, %xmm9;                             \
> -       por     %xmm7, reg1;                                    \
> -       por     %xmm9, reg2
> +       movdqa  LCASE_MIN_reg, %xmm7;                                   \
> +       movdqa  LCASE_MIN_reg, %xmm8;                                   \
> +       paddb   reg1, %xmm7;                                    \
> +       paddb   reg2, %xmm8;                                    \
> +       pcmpgtb LCASE_MAX_reg, %xmm7;                           \
> +       pcmpgtb LCASE_MAX_reg, %xmm8;                           \
> +       pandn   CASE_ADD_reg, %xmm7;                                    \
> +       pandn   CASE_ADD_reg, %xmm8;                                    \
> +       paddb   %xmm7, reg1;                                    \
> +       paddb   %xmm8, reg2
>  # endif
>         TOLOWER (%xmm1, %xmm2)
>  #else
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c
  2022-03-23 21:57 ` [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c Noah Goldstein
@ 2022-03-24 19:02   ` H.J. Lu
  0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:02 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Test cases for when both `s1` and `s2` are near the end of a page
> were previously missing.
> ---
>  string/test-strcmp.c | 15 ++++++++++++++-
>  1 file changed, 14 insertions(+), 1 deletion(-)
>
> diff --git a/string/test-strcmp.c b/string/test-strcmp.c
> index 0abce769d0..ece03c6d0b 100644
> --- a/string/test-strcmp.c
> +++ b/string/test-strcmp.c
> @@ -392,7 +392,7 @@ check3 (void)
>  int
>  test_main (void)
>  {
> -  size_t i, j;
> +  size_t i, j, k;
>    const size_t test_len = MIN(TEST_LEN, 3 * 4096);
>    test_init ();
>    check();
> @@ -453,6 +453,19 @@ test_main (void)
>            do_test (j, getpagesize () - j - 1, i, 127, 1);
>            do_test (j, getpagesize () - j - 1, i, 127, -1);
>
> +          for (k = 2; k <= 128; k += k)
> +            {
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 0);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
> +                       0);
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 1);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
> +                       1);
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, -1);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
> +                       -1);
> +            }
> +
>            if (i < 32)
>              {
>                i += 1;
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c
  2022-03-23 21:57 ` [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c Noah Goldstein
@ 2022-03-24 19:02   ` H.J. Lu
  0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:02 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Test cases for when both `s1` and `s2` are near the end of a page
> were previously missing.
> ---
>  string/test-strncmp.c | 27 ++++++++++++++++++++++++++-
>  1 file changed, 26 insertions(+), 1 deletion(-)
>
> diff --git a/string/test-strncmp.c b/string/test-strncmp.c
> index 1a87f0e73e..bba9e3d2dc 100644
> --- a/string/test-strncmp.c
> +++ b/string/test-strncmp.c
> @@ -573,7 +573,7 @@ check_overflow (void)
>  int
>  test_main (void)
>  {
> -  size_t i, j;
> +  size_t i, j, k;
>    const size_t test_len = MIN(TEST_LEN, 3 * 4096);
>    test_init ();
>
> @@ -705,6 +705,31 @@ test_main (void)
>            do_test_n (j, getpagesize () - j - 1, i, ULONG_MAX - i, 0, 127, 0);
>            do_test_n (j, getpagesize () - j - 1, i, ULONG_MAX - i, 0, 127, 1);
>            do_test_n (j, getpagesize () - j - 1, i, ULONG_MAX - i, 0, 127, -1);
> +
> +          for (k = 2; k <= 128; k += k)
> +            {
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i - 1, i,
> +                       127, 0);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i - 1,
> +                       i, 127, 0);
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
> +                       127, 0);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
> +                       i, 127, 0);
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i, i, 127,
> +                       0);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, i,
> +                       127, 0);
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
> +                       127, -1);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
> +                       i, 127, -1);
> +              do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
> +                       127, 1);
> +              do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
> +                       i, 127, 1);
> +            }
> +
>            if (i < 32)
>              {
>                i += 1;
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp
  2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
@ 2022-03-24 19:03   ` H.J. Lu
  2022-03-24 22:41   ` [PATCH v3 " Noah Goldstein
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:03 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
>
> All string/memory tests pass.
> ---
> Geometric Mean, N=40 runs; all functions page aligned
> length, align1, align2, max_char, AVX2 Time / SSE42 Time
>      1,      1,      1,      127,                  1.032
>      2,      2,      2,      127,                  1.006
>      3,      3,      3,      127,                  1.009
>      4,      4,      4,      127,                  0.964
>      5,      5,      5,      127,                  0.929
>      6,      6,      6,      127,                   0.94
>      7,      7,      7,      127,                  0.958
>      8,      0,      0,      127,                  0.988
>      9,      1,      1,      127,                   0.99
>     10,      2,      2,      127,                  0.995
>     11,      3,      3,      127,                  0.991
>     12,      4,      4,      127,                  0.975
>     13,      5,      5,      127,                  0.943
>     14,      6,      6,      127,                  0.955
>     15,      7,      7,      127,                  0.988
>      4,      0,      0,      127,                  0.983
>      4,      0,      0,      254,                  0.978
>      8,      0,      0,      254,                  0.989
>     16,      0,      0,      127,                  0.792
>     16,      0,      0,      254,                  0.774
>     32,      0,      0,      127,                  0.568
>     32,      0,      0,      254,                  0.555
>     64,      0,      0,      127,                  0.561
>     64,      0,      0,      254,                  0.561
>    128,      0,      0,      127,                  0.574
>    128,      0,      0,      254,                  0.577
>    256,      0,      0,      127,                  0.561
>    256,      0,      0,      254,                  0.552
>    512,      0,      0,      127,                   0.59
>    512,      0,      0,      254,                  0.594
>   1024,      0,      0,      127,                  0.528
>   1024,      0,      0,      254,                  0.517
>     16,      1,      2,      127,                  0.758
>     16,      2,      1,      254,                  0.748
>     32,      2,      4,      127,                  0.419
>     32,      4,      2,      254,                  0.428
>     64,      3,      6,      127,                  0.472
>     64,      6,      3,      254,                  0.464
>    128,      4,      0,      127,                  0.534
>    128,      0,      4,      254,                   0.53
>    256,      5,      2,      127,                  0.679
>    256,      2,      5,      254,                  0.676
>    512,      6,      4,      127,                  0.525
>    512,      4,      6,      254,                  0.523
>   1024,      7,      6,      127,                  0.518
>   1024,      6,      7,      254,                  0.505
>
>  sysdeps/x86_64/multiarch/Makefile             |   4 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  28 +++
>  sysdeps/x86_64/multiarch/ifunc-strcasecmp.h   |  12 +
>  .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S  |  15 ++
>  sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S  |  23 ++
>  sysdeps/x86_64/multiarch/strcmp-avx2.S        | 230 +++++++++++++++---
>  .../x86_64/multiarch/strncase_l-avx2-rtm.S    |  16 ++
>  sysdeps/x86_64/multiarch/strncase_l-avx2.S    |  27 ++
>  8 files changed, 324 insertions(+), 31 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
>  create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
>  create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index e7b413edad..06e1848823 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -55,6 +55,8 @@ sysdep_routines += \
>    stpncpy-sse2-unaligned \
>    stpncpy-ssse3 \
>    strcasecmp_l-avx \
> +  strcasecmp_l-avx2 \
> +  strcasecmp_l-avx2-rtm \
>    strcasecmp_l-sse2 \
>    strcasecmp_l-sse4_2 \
>    strcasecmp_l-ssse3 \
> @@ -93,6 +95,8 @@ sysdep_routines += \
>    strlen-evex \
>    strlen-sse2 \
>    strncase_l-avx \
> +  strncase_l-avx2 \
> +  strncase_l-avx2-rtm \
>    strncase_l-sse2 \
>    strncase_l-sse4_2 \
>    strncase_l-ssse3 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index a594f4176e..3c556d07ac 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
>    IFUNC_IMPL (i, name, strcasecmp,
> +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> +                             CPU_FEATURE_USABLE (AVX2),
> +                             __strcasecmp_avx2)
> +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (RTM)),
> +                             __strcasecmp_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, strcasecmp,
>                               CPU_FEATURE_USABLE (AVX),
>                               __strcasecmp_avx)
> @@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
>    IFUNC_IMPL (i, name, strcasecmp_l,
> +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> +                             CPU_FEATURE_USABLE (AVX2),
> +                             __strcasecmp_l_avx2)
> +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (RTM)),
> +                             __strcasecmp_l_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, strcasecmp_l,
>                               CPU_FEATURE_USABLE (AVX),
>                               __strcasecmp_l_avx)
> @@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
>    IFUNC_IMPL (i, name, strncasecmp,
> +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> +                             CPU_FEATURE_USABLE (AVX2),
> +                             __strncasecmp_avx2)
> +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (RTM)),
> +                             __strncasecmp_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, strncasecmp,
>                               CPU_FEATURE_USABLE (AVX),
>                               __strncasecmp_avx)
> @@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
>    IFUNC_IMPL (i, name, strncasecmp_l,
> +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> +                             CPU_FEATURE_USABLE (AVX2),
> +                             __strncasecmp_l_avx2)
> +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (RTM)),
> +                             __strncasecmp_l_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, strncasecmp_l,
>                               CPU_FEATURE_USABLE (AVX),
>                               __strncasecmp_l_avx)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index 9e3cc61ac0..c4de111fd0 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
>
>  static inline void *
>  IFUNC_SELECTOR (void)
>  {
>    const struct cpu_features* cpu_features = __get_cpu_features ();
>
> +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> +        return OPTIMIZE (avx2_rtm);
> +
> +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> +        return OPTIMIZE (avx2);
> +    }
> +
>    if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
>      return OPTIMIZE (avx);
>
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> new file mode 100644
> index 0000000000..09957fc3c5
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> @@ -0,0 +1,15 @@
> +#ifndef STRCMP
> +# define STRCMP        __strcasecmp_l_avx2_rtm
> +#endif
> +
> +#define _GLABEL(x)     x ## _rtm
> +#define GLABEL(x)      _GLABEL(x)
> +
> +#define ZERO_UPPER_VEC_REGISTERS_RETURN        \
> +       ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +
> +#define VZEROUPPER_RETURN      jmp L(return_vzeroupper)
> +
> +#define SECTION(p)     p##.avx.rtm
> +
> +#include "strcasecmp_l-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> new file mode 100644
> index 0000000000..e2762f2a22
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> @@ -0,0 +1,23 @@
> +/* strcasecmp_l optimized with AVX2.
> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef STRCMP
> +# define STRCMP        __strcasecmp_l_avx2
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#include "strcmp-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> index 86a86b68e3..eeb90a0da6 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> @@ -20,6 +20,10 @@
>
>  # include <sysdep.h>
>
> +# if defined USE_AS_STRCASECMP_L
> +#  include "locale-defines.h"
> +# endif
> +
>  # ifndef STRCMP
>  #  define STRCMP       __strcmp_avx2
>  # endif
> @@ -74,13 +78,88 @@
>  #  define VEC_OFFSET   (-VEC_SIZE)
>  # endif
>
> +# ifdef USE_AS_STRCASECMP_L
> +#  define BYTE_LOOP_REG        OFFSET_REG
> +# else
> +#  define BYTE_LOOP_REG        ecx
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +#  ifdef USE_AS_STRNCMP
> +#   define STRCASECMP  __strncasecmp_avx2
> +#   define LOCALE_REG  rcx
> +#   define LOCALE_REG_LP       RCX_LP
> +#   define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> +#  else
> +#   define STRCASECMP  __strcasecmp_avx2
> +#   define LOCALE_REG  rdx
> +#   define LOCALE_REG_LP       RDX_LP
> +#   define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> +#  endif
> +# endif
> +
>  # define xmmZERO       xmm15
>  # define ymmZERO       ymm15
>
> +# define LCASE_MIN_ymm %ymm10
> +# define LCASE_MAX_ymm %ymm11
> +# define CASE_ADD_ymm  %ymm12
> +
> +# define LCASE_MIN_xmm %xmm10
> +# define LCASE_MAX_xmm %xmm11
> +# define CASE_ADD_xmm  %xmm12
> +
> +       /* r11 is never used elsewhere so this is safe to maintain.  */
> +# define TOLOWER_BASE  %r11
> +
>  # ifndef SECTION
>  #  define SECTION(p)   p##.avx
>  # endif
>
> +# ifdef USE_AS_STRCASECMP_L
> +#  define REG(x, y) x ## y
> +#  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext)                   \
> +       vpaddb  REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8);                            \
> +       vpaddb  REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9);                            \
> +       vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8);                      \
> +       vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9);                      \
> +       vpandn  REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8);                        \
> +       vpandn  REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9);                        \
> +       vpaddb  REG(%ext, 8), reg1_in, reg1_out;                                                        \
> +       vpaddb  REG(%ext, 9), reg2_in, reg2_out
> +
> +#  define TOLOWER_gpr(src, dst)        movl (TOLOWER_BASE, src, 4), dst
> +#  define TOLOWER_ymm(...)     TOLOWER(__VA_ARGS__, ymm)
> +#  define TOLOWER_xmm(...)     TOLOWER(__VA_ARGS__, xmm)
> +
> +#  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext)                 \
> +       TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext);                                     \
> +       VPCMPEQ scratch_reg, s2_reg, reg_out
> +
> +#  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext)                 \
> +       VMOVU   s2_mem, reg_out;                                                                                        \
> +       CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
> +
> +#  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
> +#  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
> +
> +#  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
> +#  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
> +
> +# else
> +#  define TOLOWER_gpr(...)
> +#  define TOLOWER_ymm(...)
> +#  define TOLOWER_xmm(...)
> +
> +#  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out)                  \
> +       VPCMPEQ s2_reg, s1_reg, reg_out
> +
> +#  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
> +
> +#  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
> +#  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
> +# endif
> +
>  /* Warning!
>             wcscmp/wcsncmp have to use SIGNED comparison for elements.
>             strcmp/strncmp have to use UNSIGNED comparison for elements.
> @@ -102,7 +181,45 @@
>     returned.  */
>
>         .section SECTION(.text), "ax", @progbits
> -ENTRY(STRCMP)
> +       .align  16
> +       .type   STRCMP, @function
> +       .globl  STRCMP
> +       .hidden STRCMP
> +
> +# ifndef GLABEL
> +#  define GLABEL(...)  __VA_ARGS__
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +ENTRY (GLABEL(STRCASECMP))
> +       movq    __libc_tsd_LOCALE@gottpoff(%rip), %rax
> +       mov     %fs:(%rax), %LOCALE_REG_LP
> +
> +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> +       .p2align 4
> +END (GLABEL(STRCASECMP))
> +       /* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
> +# endif
> +
> +       .p2align 4
> +STRCMP:
> +       cfi_startproc
> +       _CET_ENDBR
> +       CALL_MCOUNT
> +
> +# if defined USE_AS_STRCASECMP_L
> +       /* We have to fall back on the C implementation for locales
> +          whose single-byte encodings do not match ASCII.  */
> +#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> +       mov     LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> +#  else
> +       mov     (%LOCALE_REG), %RAX_LP
> +#  endif
> +       testl   $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> +       jne     STRCASECMP_NONASCII
> +       leaq    _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> +# endif
> +
>  # ifdef USE_AS_STRNCMP
>  #  ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
> @@ -128,6 +245,30 @@ ENTRY(STRCMP)
>  #  endif
>  # endif
>         vpxor   %xmmZERO, %xmmZERO, %xmmZERO
> +# if defined USE_AS_STRCASECMP_L
> +       .section .rodata.cst32, "aM", @progbits, 32
> +       .align  32
> +L(lcase_min):
> +       .quad   0x3f3f3f3f3f3f3f3f
> +       .quad   0x3f3f3f3f3f3f3f3f
> +       .quad   0x3f3f3f3f3f3f3f3f
> +       .quad   0x3f3f3f3f3f3f3f3f
> +L(lcase_max):
> +       .quad   0x9999999999999999
> +       .quad   0x9999999999999999
> +       .quad   0x9999999999999999
> +       .quad   0x9999999999999999
> +L(case_add):
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +       .previous
> +
> +       vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
> +       vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
> +       vmovdqa L(case_add)(%rip), CASE_ADD_ymm
> +# endif
>         movl    %edi, %eax
>         orl     %esi, %eax
>         sall    $20, %eax
> @@ -138,8 +279,10 @@ ENTRY(STRCMP)
>  L(no_page_cross):
>         /* Safe to compare 4x vectors.  */
>         VMOVU   (%rdi), %ymm0
> -       /* 1s where s1 and s2 equal.  */
> -       VPCMPEQ (%rsi), %ymm0, %ymm1
> +       /* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
> +          Otherwise converts ymm0 and the load from rsi to lowercase.
> +          ymm2 is scratch and ymm1 is the return.  */
> +       CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
>         /* 1s at null CHAR.  */
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         /* 1s where s1 and s2 equal AND not null CHAR.  */
> @@ -172,6 +315,8 @@ L(return_vec_0):
>  # else
>         movzbl  (%rdi, %rcx), %eax
>         movzbl  (%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  # endif
>  L(ret0):
> @@ -207,6 +352,8 @@ L(one_or_less):
>  #  else
>         movzbl  (%rdi), %eax
>         movzbl  (%rsi), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  #  endif
>  L(ret1):
> @@ -234,6 +381,8 @@ L(return_vec_1):
>  # else
>         movzbl  VEC_SIZE(%rdi, %rcx), %eax
>         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  # endif
>  L(ret2):
> @@ -265,6 +414,8 @@ L(return_vec_2):
>  # else
>         movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
>         movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  # endif
>  L(ret3):
> @@ -285,6 +436,8 @@ L(return_vec_3):
>  #  else
>         movzbl  (VEC_SIZE * 3)(%rdi, %rcx), %eax
>         movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  #  endif
>  L(ret4):
> @@ -295,7 +448,7 @@ L(ret4):
>  L(more_3x_vec):
>         /* Safe to compare 4x vectors.  */
>         VMOVU   VEC_SIZE(%rdi), %ymm0
> -       VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -308,7 +461,7 @@ L(more_3x_vec):
>  # endif
>
>         VMOVU   (VEC_SIZE * 2)(%rdi), %ymm0
> -       VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -316,7 +469,7 @@ L(more_3x_vec):
>         jnz     L(return_vec_2)
>
>         VMOVU   (VEC_SIZE * 3)(%rdi), %ymm0
> -       VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -391,12 +544,10 @@ L(loop_skip_page_cross_check):
>         VMOVA   (VEC_SIZE * 3)(%rdi), %ymm6
>
>         /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
> -       VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
> -
> -       VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
> -       VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
> -       VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
> -
> +       CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
> +       CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
> +       CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
> +       CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
>
>         /* If any mismatch or null CHAR is present, the result is a 0
>            CHAR, otherwise non-zero.  */
> @@ -465,6 +616,8 @@ L(return_vec_2_3_end):
>  # else
>         movzbl  (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
>         movzbl  (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -508,6 +661,8 @@ L(return_vec_0_end):
>  # else
>         movzbl  (%rdi, %rcx), %eax
>         movzbl  (%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -530,6 +685,8 @@ L(return_vec_1_end):
>  #  else
>         movzbl  VEC_SIZE(%rdi, %rcx), %eax
>         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -556,6 +713,8 @@ L(return_vec_2_end):
>  # else
>         movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
>         movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -583,7 +742,7 @@ L(page_cross_during_loop):
>         jle     L(less_1x_vec_till_page_cross)
>
>         VMOVA   (%rdi), %ymm0
> -       VPCMPEQ (%rsi), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -605,7 +764,7 @@ L(less_1x_vec_till_page_cross):
>            here, it means the previous page (rdi - VEC_SIZE) has already
>            been loaded earlier so must be valid.  */
>         VMOVU   -VEC_SIZE(%rdi, %rax), %ymm0
> -       VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -647,6 +806,8 @@ L(return_page_cross_cmp_mem):
>  # else
>         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
>         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -673,7 +834,7 @@ L(more_2x_vec_till_page_cross):
>            iteration here.  */
>
>         VMOVU   VEC_SIZE(%rdi), %ymm0
> -       VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -689,7 +850,7 @@ L(more_2x_vec_till_page_cross):
>
>         /* Safe to include comparisons from lower bytes.  */
>         VMOVU   -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
> -       VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -697,7 +858,7 @@ L(more_2x_vec_till_page_cross):
>         jnz     L(return_vec_page_cross_0)
>
>         VMOVU   -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
> -       VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -715,8 +876,8 @@ L(more_2x_vec_till_page_cross):
>         VMOVA   (VEC_SIZE * 2)(%rdi), %ymm4
>         VMOVA   (VEC_SIZE * 3)(%rdi), %ymm6
>
> -       VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
> -       VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
> +       CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
> +       CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
>         vpand   %ymm4, %ymm5, %ymm5
>         vpand   %ymm6, %ymm7, %ymm7
>         VPMINU  %ymm5, %ymm7, %ymm7
> @@ -767,6 +928,8 @@ L(return_vec_page_cross_1):
>  # else
>         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
>         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -822,7 +985,7 @@ L(page_cross):
>  L(page_cross_loop):
>
>         VMOVU   (%rdi, %OFFSET_REG64), %ymm0
> -       VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -840,11 +1003,11 @@ L(page_cross_loop):
>         subl    %eax, %OFFSET_REG
>         /* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
>            to not cross page so is safe to load. Since we have already
> -          loaded at least 1 VEC from rsi it is also guaranteed to be safe.
> -        */
> +          loaded at least 1 VEC from rsi it is also guaranteed to be
> +          safe.  */
>
>         VMOVU   (%rdi, %OFFSET_REG64), %ymm0
> -       VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -877,6 +1040,8 @@ L(ret_vec_page_cross_cont):
>  # else
>         movzbl  (%rdi, %rcx), %eax
>         movzbl  (%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -930,7 +1095,7 @@ L(less_1x_vec_till_page):
>         ja      L(less_16_till_page)
>
>         VMOVU   (%rdi), %xmm0
> -       VPCMPEQ (%rsi), %xmm0, %xmm1
> +       CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
>         VPCMPEQ %xmm0, %xmmZERO, %xmm2
>         vpandn  %xmm1, %xmm2, %xmm1
>         vpmovmskb %ymm1, %ecx
> @@ -948,7 +1113,7 @@ L(less_1x_vec_till_page):
>  # endif
>
>         VMOVU   (%rdi, %OFFSET_REG64), %xmm0
> -       VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1
> +       CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
>         VPCMPEQ %xmm0, %xmmZERO, %xmm2
>         vpandn  %xmm1, %xmm2, %xmm1
>         vpmovmskb %ymm1, %ecx
> @@ -986,7 +1151,7 @@ L(less_16_till_page):
>         vmovq   (%rdi), %xmm0
>         vmovq   (%rsi), %xmm1
>         VPCMPEQ %xmm0, %xmmZERO, %xmm2
> -       VPCMPEQ %xmm1, %xmm0, %xmm1
> +       CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
>         vpandn  %xmm1, %xmm2, %xmm1
>         vpmovmskb %ymm1, %ecx
>         incb    %cl
> @@ -1006,7 +1171,7 @@ L(less_16_till_page):
>         vmovq   (%rdi, %OFFSET_REG64), %xmm0
>         vmovq   (%rsi, %OFFSET_REG64), %xmm1
>         VPCMPEQ %xmm0, %xmmZERO, %xmm2
> -       VPCMPEQ %xmm1, %xmm0, %xmm1
> +       CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
>         vpandn  %xmm1, %xmm2, %xmm1
>         vpmovmskb %ymm1, %ecx
>         incb    %cl
> @@ -1062,7 +1227,7 @@ L(ret_less_8_wcs):
>         vmovd   (%rdi), %xmm0
>         vmovd   (%rsi), %xmm1
>         VPCMPEQ %xmm0, %xmmZERO, %xmm2
> -       VPCMPEQ %xmm1, %xmm0, %xmm1
> +       CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
>         vpandn  %xmm1, %xmm2, %xmm1
>         vpmovmskb %ymm1, %ecx
>         subl    $0xf, %ecx
> @@ -1081,7 +1246,7 @@ L(ret_less_8_wcs):
>         vmovd   (%rdi, %OFFSET_REG64), %xmm0
>         vmovd   (%rsi, %OFFSET_REG64), %xmm1
>         VPCMPEQ %xmm0, %xmmZERO, %xmm2
> -       VPCMPEQ %xmm1, %xmm0, %xmm1
> +       CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
>         vpandn  %xmm1, %xmm2, %xmm1
>         vpmovmskb %ymm1, %ecx
>         subl    $0xf, %ecx
> @@ -1115,7 +1280,9 @@ L(less_4_till_page):
>  L(less_4_loop):
>         movzbl  (%rdi), %eax
>         movzbl  (%rsi, %rdi), %ecx
> -       subl    %ecx, %eax
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> +       subl    %BYTE_LOOP_REG, %eax
>         jnz     L(ret_less_4_loop)
>         testl   %ecx, %ecx
>         jz      L(ret_zero_4_loop)
> @@ -1142,5 +1309,6 @@ L(ret_less_4_loop):
>         subl    %r8d, %eax
>         ret
>  # endif
> -END(STRCMP)
> +       cfi_endproc
> +       .size   STRCMP, .-STRCMP
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> new file mode 100644
> index 0000000000..e194936c36
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> @@ -0,0 +1,16 @@
> +#ifndef STRCMP
> +# define STRCMP        __strncasecmp_l_avx2_rtm
> +#endif
> +
> +#define _GLABEL(x)     x ## _rtm
> +#define GLABEL(x)      _GLABEL(x)
> +
> +#define ZERO_UPPER_VEC_REGISTERS_RETURN        \
> +       ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +
> +#define VZEROUPPER_RETURN      jmp L(return_vzeroupper)
> +
> +#define SECTION(p)     p##.avx.rtm
> +#define OVERFLOW_STRCMP        __strcasecmp_avx2_rtm
> +
> +#include "strncase_l-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> new file mode 100644
> index 0000000000..29afccbcc5
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> @@ -0,0 +1,27 @@
> +/* strncasecmp_l optimized with AVX2.
> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef STRCMP
> +# define STRCMP        __strncasecmp_l_avx2
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#define USE_AS_STRNCMP
> +#ifndef OVERFLOW_STRCMP
> +# define OVERFLOW_STRCMP       __strcasecmp_avx2
> +#endif
> +#include "strcmp-avx2.S"
> --
> 2.25.1
>
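A note for readers of the TOLOWER/TOLOWER_ymm macro above: the three
32-byte constants L(lcase_min) = 0x3f.., L(lcase_max) = 0x99.. and
L(case_add) = 0x20.. implement a branchless ASCII tolower via a signed
byte range check.  A minimal scalar sketch of what each lane of the
vpaddb/vpcmpgtb/vpandn/vpaddb sequence computes (the helper name is
hypothetical, not part of the patch):

    #include <stdint.h>

    /* Scalar model of one byte lane of TOLOWER_ymm.  Adding 0x3f maps
       'A'..'Z' (0x41..0x5a) onto the signed range -128..-103; every
       other byte value lands strictly above -103 (0x99 as a signed
       byte).  */
    static inline uint8_t
    lane_tolower (uint8_t c)
    {
      int8_t shifted = (int8_t) (uint8_t) (c + 0x3f); /* vpaddb  lcase_min */
      uint8_t not_upper = (shifted > (int8_t) 0x99)   /* vpcmpgtb lcase_max */
                            ? 0xff : 0x00;
      uint8_t add = (uint8_t) (~not_upper & 0x20);    /* vpandn  case_add  */
      return (uint8_t) (c + add);                     /* vpaddb            */
    }

The scalar TOLOWER_gpr path instead indexes the 32-bit-per-entry
_nl_C_LC_CTYPE_tolower table, which is biased by 128 entries; that is
why TOLOWER_BASE is loaded with _nl_C_LC_CTYPE_tolower + 128 * 4.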

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 22/23] x86: Add EVEX optimized str{n}casecmp
  2022-03-23 21:57 ` [PATCH v1 22/23] x86: Add EVEX " Noah Goldstein
@ 2022-03-24 19:04   ` H.J. Lu
  0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:04 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
>
> All string/memory tests pass.
> ---
> Geometric Mean N=40 runs; All functions page aligned
> length, align1, align2, max_char, EVEX Time / SSE42 Time
>      1,      1,      1,      127,                  0.871
>      2,      2,      2,      127,                  0.833
>      3,      3,      3,      127,                  0.851
>      4,      4,      4,      127,                  0.824
>      5,      5,      5,      127,                  0.791
>      6,      6,      6,      127,                  0.789
>      7,      7,      7,      127,                  0.804
>      8,      0,      0,      127,                  0.838
>      9,      1,      1,      127,                  0.837
>     10,      2,      2,      127,                  0.834
>     11,      3,      3,      127,                  0.839
>     12,      4,      4,      127,                  0.844
>     13,      5,      5,      127,                  0.796
>     14,      6,      6,      127,                  0.811
>     15,      7,      7,      127,                  0.838
>      4,      0,      0,      127,                   0.84
>      4,      0,      0,      254,                  0.823
>      8,      0,      0,      254,                  0.838
>     16,      0,      0,      127,                  0.669
>     16,      0,      0,      254,                  0.656
>     32,      0,      0,      127,                  0.488
>     32,      0,      0,      254,                  0.484
>     64,      0,      0,      127,                  0.492
>     64,      0,      0,      254,                  0.502
>    128,      0,      0,      127,                  0.508
>    128,      0,      0,      254,                  0.497
>    256,      0,      0,      127,                  0.574
>    256,      0,      0,      254,                  0.581
>    512,      0,      0,      127,                  0.573
>    512,      0,      0,      254,                  0.577
>   1024,      0,      0,      127,                  0.489
>   1024,      0,      0,      254,                  0.485
>     16,      1,      2,      127,                  0.655
>     16,      2,      1,      254,                  0.646
>     32,      2,      4,      127,                  0.368
>     32,      4,      2,      254,                  0.376
>     64,      3,      6,      127,                  0.428
>     64,      6,      3,      254,                  0.426
>    128,      4,      0,      127,                  0.478
>    128,      0,      4,      254,                  0.473
>    256,      5,      2,      127,                   0.65
>    256,      2,      5,      254,                  0.654
>    512,      6,      4,      127,                  0.492
>    512,      4,      6,      254,                  0.489
>   1024,      7,      6,      127,                  0.463
>   1024,      6,      7,      254,                  0.457
>
>  sysdeps/x86_64/multiarch/Makefile            |   2 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  16 ++
>  sysdeps/x86_64/multiarch/ifunc-strcasecmp.h  |   5 +
>  sysdeps/x86_64/multiarch/strcasecmp_l-evex.S |  23 ++
>  sysdeps/x86_64/multiarch/strcmp-evex.S       | 280 ++++++++++++++++---
>  sysdeps/x86_64/multiarch/strncase_l-evex.S   |  25 ++
>  6 files changed, 314 insertions(+), 37 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 06e1848823..35d80dc2ff 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -57,6 +57,7 @@ sysdep_routines += \
>    strcasecmp_l-avx \
>    strcasecmp_l-avx2 \
>    strcasecmp_l-avx2-rtm \
> +  strcasecmp_l-evex \
>    strcasecmp_l-sse2 \
>    strcasecmp_l-sse4_2 \
>    strcasecmp_l-ssse3 \
> @@ -97,6 +98,7 @@ sysdep_routines += \
>    strncase_l-avx \
>    strncase_l-avx2 \
>    strncase_l-avx2-rtm \
> +  strncase_l-evex \
>    strncase_l-sse2 \
>    strncase_l-sse4_2 \
>    strncase_l-ssse3 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 3c556d07ac..f1a4d3dac2 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
>    IFUNC_IMPL (i, name, strcasecmp,
> +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)),
> +                             __strcasecmp_evex)
>               IFUNC_IMPL_ADD (array, i, strcasecmp,
>                               CPU_FEATURE_USABLE (AVX2),
>                               __strcasecmp_avx2)
> @@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
>    IFUNC_IMPL (i, name, strcasecmp_l,
> +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)),
> +                             __strcasecmp_l_evex)
>               IFUNC_IMPL_ADD (array, i, strcasecmp,
>                               CPU_FEATURE_USABLE (AVX2),
>                               __strcasecmp_l_avx2)
> @@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
>    IFUNC_IMPL (i, name, strncasecmp,
> +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)),
> +                             __strncasecmp_evex)
>               IFUNC_IMPL_ADD (array, i, strncasecmp,
>                               CPU_FEATURE_USABLE (AVX2),
>                               __strncasecmp_avx2)
> @@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
>    IFUNC_IMPL (i, name, strncasecmp_l,
> +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)),
> +                             __strncasecmp_l_evex)
>               IFUNC_IMPL_ADD (array, i, strncasecmp,
>                               CPU_FEATURE_USABLE (AVX2),
>                               __strncasecmp_l_avx2)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index c4de111fd0..bf0d146e7f 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
>
>  static inline void *
>  IFUNC_SELECTOR (void)
> @@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
>    if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
>        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
>      {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> +        return OPTIMIZE (evex);
> +
>        if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
>          return OPTIMIZE (avx2_rtm);
>
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> new file mode 100644
> index 0000000000..58642db748
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> @@ -0,0 +1,23 @@
> +/* strcasecmp_l optimized with EVEX.
> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef STRCMP
> +# define STRCMP        __strcasecmp_l_evex
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#include "strcmp-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
> index 56d8c118e4..85afd6535f 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
> @@ -19,6 +19,9 @@
>  #if IS_IN (libc)
>
>  # include <sysdep.h>
> +# if defined USE_AS_STRCASECMP_L
> +#  include "locale-defines.h"
> +# endif
>
>  # ifndef STRCMP
>  #  define STRCMP       __strcmp_evex
> @@ -34,19 +37,29 @@
>  # define VMOVA vmovdqa64
>
>  # ifdef USE_AS_WCSCMP
> -#  define TESTEQ       subl    $0xff,
> +#  ifndef OVERFLOW_STRCMP
> +#   define OVERFLOW_STRCMP     __wcscmp_evex
> +#  endif
> +
> +#  define TESTEQ       subl $0xff,
>         /* Compare packed dwords.  */
>  #  define VPCMP        vpcmpd
>  #  define VPMINU       vpminud
>  #  define VPTESTM      vptestmd
> +#  define VPTESTNM     vptestnmd
>         /* 1 dword char == 4 bytes.  */
>  #  define SIZE_OF_CHAR 4
>  # else
> +#  ifndef OVERFLOW_STRCMP
> +#   define OVERFLOW_STRCMP     __strcmp_evex
> +#  endif
> +
>  #  define TESTEQ       incl
>         /* Compare packed bytes.  */
>  #  define VPCMP        vpcmpb
>  #  define VPMINU       vpminub
>  #  define VPTESTM      vptestmb
> +#  define VPTESTNM     vptestnmb
>         /* 1 byte char == 1 byte.  */
>  #  define SIZE_OF_CHAR 1
>  # endif
> @@ -73,11 +86,16 @@
>  #  define VEC_OFFSET   (-VEC_SIZE)
>  # endif
>
> -# define XMMZERO       xmm16
>  # define XMM0  xmm17
>  # define XMM1  xmm18
>
> -# define YMMZERO       ymm16
> +# define XMM10 xmm27
> +# define XMM11 xmm28
> +# define XMM12 xmm29
> +# define XMM13 xmm30
> +# define XMM14 xmm31
> +
> +
>  # define YMM0  ymm17
>  # define YMM1  ymm18
>  # define YMM2  ymm19
> @@ -89,6 +107,87 @@
>  # define YMM8  ymm25
>  # define YMM9  ymm26
>  # define YMM10 ymm27
> +# define YMM11 ymm28
> +# define YMM12 ymm29
> +# define YMM13 ymm30
> +# define YMM14 ymm31
> +
> +# ifdef USE_AS_STRCASECMP_L
> +#  define BYTE_LOOP_REG        OFFSET_REG
> +# else
> +#  define BYTE_LOOP_REG        ecx
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +#  ifdef USE_AS_STRNCMP
> +#   define STRCASECMP  __strncasecmp_evex
> +#   define LOCALE_REG  rcx
> +#   define LOCALE_REG_LP       RCX_LP
> +#   define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> +#  else
> +#   define STRCASECMP  __strcasecmp_evex
> +#   define LOCALE_REG  rdx
> +#   define LOCALE_REG_LP       RDX_LP
> +#   define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> +#  endif
> +# endif
> +
> +# define LCASE_MIN_YMM %YMM12
> +# define LCASE_MAX_YMM %YMM13
> +# define CASE_ADD_YMM  %YMM14
> +
> +# define LCASE_MIN_XMM %XMM12
> +# define LCASE_MAX_XMM %XMM13
> +# define CASE_ADD_XMM  %XMM14
> +
> +       /* NB: wcsncmp uses r11 but strcasecmp is never used in
> +          conjunction with wcscmp.  */
> +# define TOLOWER_BASE  %r11
> +
> +# ifdef USE_AS_STRCASECMP_L
> +#  define _REG(x, y) x ## y
> +#  define REG(x, y) _REG(x, y)
> +#  define TOLOWER(reg1, reg2, ext)                                                                             \
> +       vpsubb  REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);                                      \
> +       vpsubb  REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);                                      \
> +       vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;                           \
> +       vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;                           \
> +       vpaddb  reg1, REG(CASE_ADD_, ext), reg1{%k5};                                           \
> +       vpaddb  reg2, REG(CASE_ADD_, ext), reg2{%k6}
> +
> +#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> +#  define TOLOWER_YMM(...)     TOLOWER(__VA_ARGS__, YMM)
> +#  define TOLOWER_XMM(...)     TOLOWER(__VA_ARGS__, XMM)
> +
> +#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)                                              \
> +       TOLOWER (s1_reg, s2_reg, ext);                                                                          \
> +       VPCMP   $0, s1_reg, s2_reg, reg_out
> +
> +#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)                              \
> +       VMOVU   s2_mem, s2_reg;                                                                                         \
> +       CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
> +
> +#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
> +#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
> +
> +#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
> +#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
> +
> +# else
> +#  define TOLOWER_gpr(...)
> +#  define TOLOWER_YMM(...)
> +#  define TOLOWER_XMM(...)
> +
> +#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)                                               \
> +       VPCMP   $0, s2_reg, s1_reg, reg_out
> +
> +#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
> +
> +#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)                               \
> +       VPCMP   $0, s2_mem, s1_reg, reg_out
> +
> +#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
> +# endif
>
>  /* Warning!
>             wcscmp/wcsncmp have to use SIGNED comparison for elements.
> @@ -112,7 +211,41 @@
>     returned.  */
>
>         .section .text.evex, "ax", @progbits
> -ENTRY(STRCMP)
> +       .align  16
> +       .type   STRCMP, @function
> +       .globl  STRCMP
> +       .hidden STRCMP
> +
> +# ifdef USE_AS_STRCASECMP_L
> +ENTRY (STRCASECMP)
> +       movq    __libc_tsd_LOCALE@gottpoff(%rip), %rax
> +       mov     %fs:(%rax), %LOCALE_REG_LP
> +
> +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> +       .p2align 4
> +END (STRCASECMP)
> +       /* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
> +# endif
> +
> +       .p2align 4
> +STRCMP:
> +       cfi_startproc
> +       _CET_ENDBR
> +       CALL_MCOUNT
> +
> +# if defined USE_AS_STRCASECMP_L
> +       /* We have to fall back on the C implementation for locales
> +          whose single-byte encodings do not match ASCII.  */
> +#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> +       mov     LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> +#  else
> +       mov     (%LOCALE_REG), %RAX_LP
> +#  endif
> +       testl   $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> +       jne     STRCASECMP_NONASCII
> +       leaq    _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> +# endif
> +
>  # ifdef USE_AS_STRNCMP
>  #  ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
> @@ -125,6 +258,32 @@ ENTRY(STRCMP)
>            actually bound the buffer.  */
>         jle     L(one_or_less)
>  # endif
> +
> +# if defined USE_AS_STRCASECMP_L
> +       .section .rodata.cst32, "aM", @progbits, 32
> +       .align  32
> +L(lcase_min):
> +       .quad   0x4141414141414141
> +       .quad   0x4141414141414141
> +       .quad   0x4141414141414141
> +       .quad   0x4141414141414141
> +L(lcase_max):
> +       .quad   0x1a1a1a1a1a1a1a1a
> +       .quad   0x1a1a1a1a1a1a1a1a
> +       .quad   0x1a1a1a1a1a1a1a1a
> +       .quad   0x1a1a1a1a1a1a1a1a
> +L(case_add):
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +       .previous
> +
> +       vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
> +       vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
> +       vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
> +# endif
> +
>         movl    %edi, %eax
>         orl     %esi, %eax
>         /* Shift out the bits irrelevant to page boundary ([63:12]).  */
> @@ -139,7 +298,7 @@ L(no_page_cross):
>         VPTESTM %YMM0, %YMM0, %k2
>         /* Each bit cleared in K1 represents a mismatch or a null CHAR
>            in YMM0 and 32 bytes at (%rsi).  */
> -       VPCMP   $0, (%rsi), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>  # ifdef USE_AS_STRNCMP
>         cmpq    $CHAR_PER_VEC, %rdx
> @@ -169,6 +328,8 @@ L(return_vec_0):
>  # else
>         movzbl  (%rdi, %rcx), %eax
>         movzbl  (%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  # endif
>  L(ret0):
> @@ -192,7 +353,7 @@ L(one_or_less):
>  #  ifdef USE_AS_WCSCMP
>         /* 'nbe' covers the case where length is negative (large
>            unsigned).  */
> -       jnbe    __wcscmp_evex
> +       jnbe    OVERFLOW_STRCMP
>         movl    (%rdi), %edx
>         xorl    %eax, %eax
>         cmpl    (%rsi), %edx
> @@ -203,9 +364,11 @@ L(one_or_less):
>  #  else
>         /* 'nbe' covers the case where length is negative (large
>            unsigned).  */
> -       jnbe    __strcmp_evex
> +       jnbe    OVERFLOW_STRCMP
>         movzbl  (%rdi), %eax
>         movzbl  (%rsi), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  #  endif
>  L(ret1):
> @@ -233,6 +396,8 @@ L(return_vec_1):
>  # else
>         movzbl  VEC_SIZE(%rdi, %rcx), %eax
>         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  # endif
>  L(ret2):
> @@ -270,6 +435,8 @@ L(return_vec_2):
>  # else
>         movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
>         movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  # endif
>  L(ret3):
> @@ -290,6 +457,8 @@ L(return_vec_3):
>  #  else
>         movzbl  (VEC_SIZE * 3)(%rdi, %rcx), %eax
>         movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  #  endif
>  L(ret4):
> @@ -303,7 +472,7 @@ L(more_3x_vec):
>         /* Safe to compare 4x vectors.  */
>         VMOVU   (VEC_SIZE)(%rdi), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_1)
> @@ -315,14 +484,14 @@ L(more_3x_vec):
>
>         VMOVU   (VEC_SIZE * 2)(%rdi), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_2)
>
>         VMOVU   (VEC_SIZE * 3)(%rdi), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_3)
> @@ -381,7 +550,6 @@ L(prepare_loop_aligned):
>         subl    %esi, %eax
>         andl    $(PAGE_SIZE - 1), %eax
>
> -       vpxorq  %YMMZERO, %YMMZERO, %YMMZERO
>
>         /* Loop 4x comparisons at a time.  */
>         .p2align 4
> @@ -413,22 +581,35 @@ L(loop_skip_page_cross_check):
>         /* A zero CHAR in YMM9 means that there is a null CHAR.  */
>         VPMINU  %YMM8, %YMM9, %YMM9
>
> -       /* Each bit set in K1 represents a non-null CHAR in YMM8.  */
> +       /* Each bit set in K1 represents a non-null CHAR in YMM9.  */
>         VPTESTM %YMM9, %YMM9, %k1
> -
> +# ifndef USE_AS_STRCASECMP_L
>         vpxorq  (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
>         vpxorq  (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
>         vpxorq  (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
>         /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
>            oring with YMM1. Result is stored in YMM6.  */
>         vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
> -
> +# else
> +       VMOVU   (VEC_SIZE * 0)(%rsi), %YMM1
> +       TOLOWER_YMM (%YMM0, %YMM1)
> +       VMOVU   (VEC_SIZE * 1)(%rsi), %YMM3
> +       TOLOWER_YMM (%YMM2, %YMM3)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM5
> +       TOLOWER_YMM (%YMM4, %YMM5)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM7
> +       TOLOWER_YMM (%YMM6, %YMM7)
> +       vpxorq  %YMM0, %YMM1, %YMM1
> +       vpxorq  %YMM2, %YMM3, %YMM3
> +       vpxorq  %YMM4, %YMM5, %YMM5
> +       vpternlogd $0xde, %YMM7, %YMM1, %YMM6
> +# endif
>         /* Or together YMM3, YMM5, and YMM6.  */
>         vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
>
>
>         /* A non-zero CHAR in YMM6 represents a mismatch.  */
> -       VPCMP   $0, %YMMZERO, %YMM6, %k0{%k1}
> +       VPTESTNM %YMM6, %YMM6, %k0{%k1}
>         kmovd   %k0, %LOOP_REG
>
>         TESTEQ  %LOOP_REG
> @@ -437,13 +618,13 @@ L(loop_skip_page_cross_check):
>
>         /* Find which VEC has the mismatch of end of string.  */
>         VPTESTM %YMM0, %YMM0, %k1
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0{%k1}
> +       VPTESTNM %YMM1, %YMM1, %k0{%k1}
>         kmovd   %k0, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_0_end)
>
>         VPTESTM %YMM2, %YMM2, %k1
> -       VPCMP   $0, %YMMZERO, %YMM3, %k0{%k1}
> +       VPTESTNM %YMM3, %YMM3, %k0{%k1}
>         kmovd   %k0, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_1_end)
> @@ -457,7 +638,7 @@ L(return_vec_2_3_end):
>  # endif
>
>         VPTESTM %YMM4, %YMM4, %k1
> -       VPCMP   $0, %YMMZERO, %YMM5, %k0{%k1}
> +       VPTESTNM %YMM5, %YMM5, %k0{%k1}
>         kmovd   %k0, %ecx
>         TESTEQ  %ecx
>  # if CHAR_PER_VEC <= 16
> @@ -493,6 +674,8 @@ L(return_vec_3_end):
>  # else
>         movzbl  (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
>         movzbl  (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -545,6 +728,8 @@ L(return_vec_0_end):
>  # else
>         movzbl  (%rdi, %rcx), %eax
>         movzbl  (%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         /* Flip `eax` if `rdi` and `rsi` were swapped in page cross
>            logic. Subtract `r8d` after xor for zero case.  */
> @@ -569,6 +754,8 @@ L(return_vec_1_end):
>  #  else
>         movzbl  VEC_SIZE(%rdi, %rcx), %eax
>         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -598,7 +785,7 @@ L(page_cross_during_loop):
>
>         VMOVA   (%rdi), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, (%rsi), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_0_end)
> @@ -619,8 +806,7 @@ L(less_1x_vec_till_page_cross):
>            been loaded earlier so must be valid.  */
>         VMOVU   -VEC_SIZE(%rdi, %rax), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
> -
> +       CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
>         /* Mask of potentially valid bits. The lower bits can come from
>            out-of-range comparisons (but are safe regarding page crosses).  */
>
> @@ -642,6 +828,8 @@ L(less_1x_vec_till_page_cross):
>
>  # ifdef USE_AS_STRNCMP
>  #  ifdef USE_AS_WCSCMP
> +       /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> +          safe.  */
>         movl    %eax, %r11d
>         shrl    $2, %r11d
>         cmpq    %r11, %rdx
> @@ -679,6 +867,8 @@ L(return_page_cross_cmp_mem):
>  # else
>         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
>         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -709,7 +899,7 @@ L(more_2x_vec_till_page_cross):
>
>         VMOVA   VEC_SIZE(%rdi), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_1_end)
> @@ -724,14 +914,14 @@ L(more_2x_vec_till_page_cross):
>         /* Safe to include comparisons from lower bytes.  */
>         VMOVU   -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_page_cross_0)
>
>         VMOVU   -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_page_cross_1)
> @@ -740,6 +930,8 @@ L(more_2x_vec_till_page_cross):
>         /* Must check length here as length might preclude reading next
>            page.  */
>  #  ifdef USE_AS_WCSCMP
> +       /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> +          safe.  */
>         movl    %eax, %r11d
>         shrl    $2, %r11d
>         cmpq    %r11, %rdx
> @@ -754,12 +946,19 @@ L(more_2x_vec_till_page_cross):
>         VMOVA   (VEC_SIZE * 3)(%rdi), %YMM6
>         VPMINU  %YMM4, %YMM6, %YMM9
>         VPTESTM %YMM9, %YMM9, %k1
> -
> +# ifndef USE_AS_STRCASECMP_L
>         vpxorq  (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
>         /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
>         vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
> -
> -       VPCMP   $0, %YMMZERO, %YMM6, %k0{%k1}
> +# else
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM5
> +       TOLOWER_YMM (%YMM4, %YMM5)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM7
> +       TOLOWER_YMM (%YMM6, %YMM7)
> +       vpxorq  %YMM4, %YMM5, %YMM5
> +       vpternlogd $0xde, %YMM7, %YMM5, %YMM6
> +# endif
> +       VPTESTNM %YMM6, %YMM6, %k0{%k1}
>         kmovd   %k0, %LOOP_REG
>         TESTEQ  %LOOP_REG
>         jnz     L(return_vec_2_3_end)
> @@ -815,6 +1014,8 @@ L(return_vec_page_cross_1):
>  # else
>         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
>         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -871,7 +1072,7 @@ L(page_cross):
>  L(page_cross_loop):
>         VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(check_ret_vec_page_cross)
> @@ -895,7 +1096,7 @@ L(page_cross_loop):
>          */
>         VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
>
>         kmovd   %k1, %ecx
>  # ifdef USE_AS_STRNCMP
> @@ -930,6 +1131,8 @@ L(ret_vec_page_cross_cont):
>  # else
>         movzbl  (%rdi, %rcx, SIZE_OF_CHAR), %eax
>         movzbl  (%rsi, %rcx, SIZE_OF_CHAR), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -989,7 +1192,7 @@ L(less_1x_vec_till_page):
>         /* Use 16 byte comparison.  */
>         vmovdqu (%rdi), %xmm0
>         VPTESTM %xmm0, %xmm0, %k2
> -       VPCMP   $0, (%rsi), %xmm0, %k1{%k2}
> +       CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
>         kmovd   %k1, %ecx
>  # ifdef USE_AS_WCSCMP
>         subl    $0xf, %ecx
> @@ -1009,7 +1212,7 @@ L(less_1x_vec_till_page):
>  # endif
>         vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
>         VPTESTM %xmm0, %xmm0, %k2
> -       VPCMP   $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
> +       CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
>         kmovd   %k1, %ecx
>  # ifdef USE_AS_WCSCMP
>         subl    $0xf, %ecx
> @@ -1048,7 +1251,7 @@ L(less_16_till_page):
>         vmovq   (%rdi), %xmm0
>         vmovq   (%rsi), %xmm1
>         VPTESTM %xmm0, %xmm0, %k2
> -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
>         kmovd   %k1, %ecx
>  # ifdef USE_AS_WCSCMP
>         subl    $0x3, %ecx
> @@ -1068,7 +1271,7 @@ L(less_16_till_page):
>         vmovq   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
>         vmovq   (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
>         VPTESTM %xmm0, %xmm0, %k2
> -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
>         kmovd   %k1, %ecx
>  # ifdef USE_AS_WCSCMP
>         subl    $0x3, %ecx
> @@ -1128,7 +1331,7 @@ L(ret_less_8_wcs):
>         vmovd   (%rdi), %xmm0
>         vmovd   (%rsi), %xmm1
>         VPTESTM %xmm0, %xmm0, %k2
> -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
>         kmovd   %k1, %ecx
>         subl    $0xf, %ecx
>         jnz     L(check_ret_vec_page_cross)
> @@ -1143,7 +1346,7 @@ L(ret_less_8_wcs):
>         vmovd   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
>         vmovd   (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
>         VPTESTM %xmm0, %xmm0, %k2
> -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
>         kmovd   %k1, %ecx
>         subl    $0xf, %ecx
>         jnz     L(check_ret_vec_page_cross)
> @@ -1176,7 +1379,9 @@ L(less_4_till_page):
>  L(less_4_loop):
>         movzbl  (%rdi), %eax
>         movzbl  (%rsi, %rdi), %ecx
> -       subl    %ecx, %eax
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> +       subl    %BYTE_LOOP_REG, %eax
>         jnz     L(ret_less_4_loop)
>         testl   %ecx, %ecx
>         jz      L(ret_zero_4_loop)
> @@ -1203,5 +1408,6 @@ L(ret_less_4_loop):
>         subl    %r8d, %eax
>         ret
>  # endif
> -END(STRCMP)
> +       cfi_endproc
> +       .size   STRCMP, .-STRCMP
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> new file mode 100644
> index 0000000000..b0808c1b21
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> @@ -0,0 +1,25 @@
> +/* strncasecmp_l optimized with EVEX.
> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef STRCMP
> +# define STRCMP        __strncasecmp_l_evex
> +#endif
> +#define OVERFLOW_STRCMP        __strcasecmp_evex
> +#define USE_AS_STRCASECMP_L
> +#define USE_AS_STRNCMP
> +#include "strcmp-evex.S"
> --
> 2.25.1
>
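One idiom worth spelling out from the hunks above: VPTESTM sets a bit in
%k2 for every lane where the s1 vector holds a non-NUL byte, and the
(case-folding) compare then runs zero-masked under %k2, so a clear bit in
%k1 means either a terminator or a mismatch -- which is exactly what the
TESTEQ/jnz pair detects in one shot.  A scalar sketch of the same logic,
assuming plain ASCII folding; the helper names here are hypothetical, not
glibc's:

    #include <stdint.h>

    /* Branchless ASCII case fold; stands in for the per-byte effect of
       TOLOWER_YMM (hypothetical helper).  */
    static unsigned char
    tolower_sketch (unsigned char c)
    {
      unsigned char biased = c - 'A';
      return c + (biased <= 'Z' - 'A' ? 0x20 : 0);
    }

    /* Scalar model of VPTESTM + CMP_R1_S2_YMM under the {%k2} mask.
       Returns a 32-bit mask with bit i set iff s1[i] is non-NUL and
       equal, case-insensitively, to s2[i]; any clear bit therefore
       marks either a NUL terminator or a genuine difference.  */
    static uint32_t
    masked_cmp_sketch (const unsigned char *s1, const unsigned char *s2)
    {
      uint32_t k1 = 0;
      for (int i = 0; i < 32; i++)
        {
          int k2 = s1[i] != 0;                              /* VPTESTM */
          int eq = tolower_sketch (s1[i]) == tolower_sketch (s2[i]);
          k1 |= (uint32_t) (k2 && eq) << i;            /* masked VPCMP */
        }
      return k1;
    }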

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 23/23] x86: Remove AVX str{n}casecmp
  2022-03-23 21:57 ` [PATCH v1 23/23] x86: Remove AVX str{n}casecmp Noah Goldstein
@ 2022-03-24 19:04   ` H.J. Lu
  2022-05-12 19:54     ` Sunil Pandey
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:04 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Mar 23, 2022 at 3:03 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The rationale is:
>
> 1. SSE42 has nearly identical logic, so any benefit is minimal (3.4%
>    regression on Tigerlake using SSE42 versus AVX across the
>    benchtest suite).
> 2. The AVX2 version covers the majority of targets that previously
>    preferred it.
> 3. The targets where AVX would still be best (SnB and IVB) are
>    becoming outdated.
>
> All in all, the code size savings are worth it.
>
> All string/memory tests pass.
> ---
> Geometric Mean N=40 runs; all functions page aligned
> length, align1, align2, max_char, AVX Time / SSE42 Time
>      1,      1,      1,      127,                 0.928
>      2,      2,      2,      127,                 0.934
>      3,      3,      3,      127,                 0.975
>      4,      4,      4,      127,                  0.96
>      5,      5,      5,      127,                 0.935
>      6,      6,      6,      127,                 0.929
>      7,      7,      7,      127,                 0.959
>      8,      0,      0,      127,                 0.955
>      9,      1,      1,      127,                 0.944
>     10,      2,      2,      127,                 0.975
>     11,      3,      3,      127,                 0.935
>     12,      4,      4,      127,                 0.931
>     13,      5,      5,      127,                 0.926
>     14,      6,      6,      127,                 0.901
>     15,      7,      7,      127,                 0.951
>      4,      0,      0,      127,                 0.958
>      4,      0,      0,      254,                 0.956
>      8,      0,      0,      254,                 0.977
>     16,      0,      0,      127,                 0.955
>     16,      0,      0,      254,                 0.953
>     32,      0,      0,      127,                 0.943
>     32,      0,      0,      254,                 0.941
>     64,      0,      0,      127,                 0.941
>     64,      0,      0,      254,                 0.955
>    128,      0,      0,      127,                 0.972
>    128,      0,      0,      254,                 0.975
>    256,      0,      0,      127,                 0.996
>    256,      0,      0,      254,                 0.993
>    512,      0,      0,      127,                 0.992
>    512,      0,      0,      254,                 0.986
>   1024,      0,      0,      127,                 0.994
>   1024,      0,      0,      254,                 0.993
>     16,      1,      2,      127,                 0.933
>     16,      2,      1,      254,                 0.953
>     32,      2,      4,      127,                 0.927
>     32,      4,      2,      254,                 0.986
>     64,      3,      6,      127,                 0.991
>     64,      6,      3,      254,                 1.014
>    128,      4,      0,      127,                 1.001
>    128,      0,      4,      254,                 0.991
>    256,      5,      2,      127,                 1.011
>    256,      2,      5,      254,                 1.013
>    512,      6,      4,      127,                 1.056
>    512,      4,      6,      254,                 0.916
>   1024,      7,      6,      127,                 1.059
>   1024,      6,      7,      254,                 1.043
>
>  sysdeps/x86_64/multiarch/Makefile           |   2 -
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c  |  12 -
>  sysdeps/x86_64/multiarch/ifunc-strcasecmp.h |   4 -
>  sysdeps/x86_64/multiarch/strcasecmp_l-avx.S |  22 --
>  sysdeps/x86_64/multiarch/strcmp-sse42.S     | 240 +++++++++-----------
>  sysdeps/x86_64/multiarch/strncase_l-avx.S   |  22 --
>  6 files changed, 105 insertions(+), 197 deletions(-)
>  delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
>  delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 35d80dc2ff..6507d1b7fa 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -54,7 +54,6 @@ sysdep_routines += \
>    stpncpy-evex \
>    stpncpy-sse2-unaligned \
>    stpncpy-ssse3 \
> -  strcasecmp_l-avx \
>    strcasecmp_l-avx2 \
>    strcasecmp_l-avx2-rtm \
>    strcasecmp_l-evex \
> @@ -95,7 +94,6 @@ sysdep_routines += \
>    strlen-avx2-rtm \
>    strlen-evex \
>    strlen-sse2 \
> -  strncase_l-avx \
>    strncase_l-avx2 \
>    strncase_l-avx2-rtm \
>    strncase_l-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index f1a4d3dac2..40cc6cc49e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -447,9 +447,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               (CPU_FEATURE_USABLE (AVX2)
>                                && CPU_FEATURE_USABLE (RTM)),
>                               __strcasecmp_avx2_rtm)
> -             IFUNC_IMPL_ADD (array, i, strcasecmp,
> -                             CPU_FEATURE_USABLE (AVX),
> -                             __strcasecmp_avx)
>               IFUNC_IMPL_ADD (array, i, strcasecmp,
>                               CPU_FEATURE_USABLE (SSE4_2),
>                               __strcasecmp_sse42)
> @@ -471,9 +468,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               (CPU_FEATURE_USABLE (AVX2)
>                                && CPU_FEATURE_USABLE (RTM)),
>                               __strcasecmp_l_avx2_rtm)
> -             IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> -                             CPU_FEATURE_USABLE (AVX),
> -                             __strcasecmp_l_avx)
>               IFUNC_IMPL_ADD (array, i, strcasecmp_l,
>                               CPU_FEATURE_USABLE (SSE4_2),
>                               __strcasecmp_l_sse42)
> @@ -609,9 +603,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               (CPU_FEATURE_USABLE (AVX2)
>                                && CPU_FEATURE_USABLE (RTM)),
>                               __strncasecmp_avx2_rtm)
> -             IFUNC_IMPL_ADD (array, i, strncasecmp,
> -                             CPU_FEATURE_USABLE (AVX),
> -                             __strncasecmp_avx)
>               IFUNC_IMPL_ADD (array, i, strncasecmp,
>                               CPU_FEATURE_USABLE (SSE4_2),
>                               __strncasecmp_sse42)
> @@ -634,9 +625,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               (CPU_FEATURE_USABLE (AVX2)
>                                && CPU_FEATURE_USABLE (RTM)),
>                               __strncasecmp_l_avx2_rtm)
> -             IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> -                             CPU_FEATURE_USABLE (AVX),
> -                             __strncasecmp_l_avx)
>               IFUNC_IMPL_ADD (array, i, strncasecmp_l,
>                               CPU_FEATURE_USABLE (SSE4_2),
>                               __strncasecmp_l_sse42)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index bf0d146e7f..766539c241 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -22,7 +22,6 @@
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> @@ -46,9 +45,6 @@ IFUNC_SELECTOR (void)
>          return OPTIMIZE (avx2);
>      }
>
> -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
> -    return OPTIMIZE (avx);
> -
>    if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
>        && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
>      return OPTIMIZE (sse42);
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
> deleted file mode 100644
> index 7ec7c21b5a..0000000000
> --- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
> +++ /dev/null
> @@ -1,22 +0,0 @@
> -/* strcasecmp_l optimized with AVX.
> -   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#define STRCMP_SSE42 __strcasecmp_l_avx
> -#define USE_AVX 1
> -#define USE_AS_STRCASECMP_L
> -#include "strcmp-sse42.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> index 7805ae9d41..a9178ad25c 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> @@ -41,13 +41,8 @@
>  # define UPDATE_STRNCMP_COUNTER
>  #endif
>
> -#ifdef USE_AVX
> -# define SECTION       avx
> -# define GLABEL(l)     l##_avx
> -#else
> -# define SECTION       sse4.2
> -# define GLABEL(l)     l##_sse42
> -#endif
> +#define SECTION        sse4.2
> +#define GLABEL(l)      l##_sse42
>
>  #define LABEL(l)       .L##l
>
> @@ -105,21 +100,7 @@ END (GLABEL(__strncasecmp))
>  #endif
>
>
> -#ifdef USE_AVX
> -# define movdqa vmovdqa
> -# define movdqu vmovdqu
> -# define pmovmskb vpmovmskb
> -# define pcmpistri vpcmpistri
> -# define psubb vpsubb
> -# define pcmpeqb vpcmpeqb
> -# define psrldq vpsrldq
> -# define pslldq vpslldq
> -# define palignr vpalignr
> -# define pxor vpxor
> -# define D(arg) arg, arg
> -#else
> -# define D(arg) arg
> -#endif
> +#define arg arg
>
>  STRCMP_SSE42:
>         cfi_startproc
> @@ -191,18 +172,7 @@ LABEL(case_add):
>         movdqu  (%rdi), %xmm1
>         movdqu  (%rsi), %xmm2
>  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> -# ifdef USE_AVX
> -#  define TOLOWER(reg1, reg2) \
> -       vpaddb  LCASE_MIN_reg, reg1, %xmm7;                                     \
> -       vpaddb  LCASE_MIN_reg, reg2, %xmm8;                                     \
> -       vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7;                                   \
> -       vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8;                                   \
> -       vpandn  CASE_ADD_reg, %xmm7, %xmm7;                                     \
> -       vpandn  CASE_ADD_reg, %xmm8, %xmm8;                                     \
> -       vpaddb  %xmm7, reg1, reg1;                                      \
> -       vpaddb  %xmm8, reg2, reg2
> -# else
> -#  define TOLOWER(reg1, reg2) \
> +# define TOLOWER(reg1, reg2) \
>         movdqa  LCASE_MIN_reg, %xmm7;                                   \
>         movdqa  LCASE_MIN_reg, %xmm8;                                   \
>         paddb   reg1, %xmm7;                                    \
> @@ -213,15 +183,15 @@ LABEL(case_add):
>         pandn   CASE_ADD_reg, %xmm8;                                    \
>         paddb   %xmm7, reg1;                                    \
>         paddb   %xmm8, reg2
> -# endif
> +
>         TOLOWER (%xmm1, %xmm2)
>  #else
>  # define TOLOWER(reg1, reg2)
>  #endif
> -       pxor    %xmm0, D(%xmm0)         /* clear %xmm0 for null char checks */
> -       pcmpeqb %xmm1, D(%xmm0)         /* Any null chars? */
> -       pcmpeqb %xmm2, D(%xmm1)         /* compare first 16 bytes for equality */
> -       psubb   %xmm0, D(%xmm1)         /* packed sub of comparison results*/
> +       pxor    %xmm0, %xmm0            /* clear %xmm0 for null char checks */
> +       pcmpeqb %xmm1, %xmm0            /* Any null chars? */
> +       pcmpeqb %xmm2, %xmm1            /* compare first 16 bytes for equality */
> +       psubb   %xmm0, %xmm1            /* packed sub of comparison results*/
>         pmovmskb %xmm1, %edx
>         sub     $0xffff, %edx           /* if first 16 bytes are same, edx == 0xffff */
>         jnz     LABEL(less16bytes)/* If not, find different value or null char */
> @@ -245,7 +215,7 @@ LABEL(crosscache):
>         xor     %r8d, %r8d
>         and     $0xf, %ecx              /* offset of rsi */
>         and     $0xf, %eax              /* offset of rdi */
> -       pxor    %xmm0, D(%xmm0)         /* clear %xmm0 for null char check */
> +       pxor    %xmm0, %xmm0            /* clear %xmm0 for null char check */
>         cmp     %eax, %ecx
>         je      LABEL(ashr_0)           /* rsi and rdi relative offset same */
>         ja      LABEL(bigger)
> @@ -259,7 +229,7 @@ LABEL(bigger):
>         sub     %rcx, %r9
>         lea     LABEL(unaligned_table)(%rip), %r10
>         movslq  (%r10, %r9,4), %r9
> -       pcmpeqb %xmm1, D(%xmm0)         /* Any null chars? */
> +       pcmpeqb %xmm1, %xmm0            /* Any null chars? */
>         lea     (%r10, %r9), %r10
>         _CET_NOTRACK jmp *%r10          /* jump to corresponding case */
>
> @@ -272,15 +242,15 @@ LABEL(bigger):
>  LABEL(ashr_0):
>
>         movdqa  (%rsi), %xmm1
> -       pcmpeqb %xmm1, D(%xmm0)         /* Any null chars? */
> +       pcmpeqb %xmm1, %xmm0            /* Any null chars? */
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> -       pcmpeqb (%rdi), D(%xmm1)        /* compare 16 bytes for equality */
> +       pcmpeqb (%rdi), %xmm1           /* compare 16 bytes for equality */
>  #else
>         movdqa  (%rdi), %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm2, D(%xmm1)         /* compare 16 bytes for equality */
> +       pcmpeqb %xmm2, %xmm1            /* compare 16 bytes for equality */
>  #endif
> -       psubb   %xmm0, D(%xmm1)         /* packed sub of comparison results*/
> +       psubb   %xmm0, %xmm1            /* packed sub of comparison results*/
>         pmovmskb %xmm1, %r9d
>         shr     %cl, %edx               /* adjust 0xffff for offset */
>         shr     %cl, %r9d               /* adjust for 16-byte offset */
> @@ -360,10 +330,10 @@ LABEL(ashr_0_exit_use):
>   */
>         .p2align 4
>  LABEL(ashr_1):
> -       pslldq  $15, D(%xmm2)           /* shift first string to align with second */
> +       pslldq  $15, %xmm2              /* shift first string to align with second */
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)         /* compare 16 bytes for equality */
> -       psubb   %xmm0, D(%xmm2)         /* packed sub of comparison results*/
> +       pcmpeqb %xmm1, %xmm2            /* compare 16 bytes for equality */
> +       psubb   %xmm0, %xmm2            /* packed sub of comparison results*/
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx               /* adjust 0xffff for offset */
>         shr     %cl, %r9d               /* adjust for 16-byte offset */
> @@ -391,7 +361,7 @@ LABEL(loop_ashr_1_use):
>
>  LABEL(nibble_ashr_1_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $1, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $1, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
>  #else
> @@ -410,7 +380,7 @@ LABEL(nibble_ashr_1_restart_use):
>         jg      LABEL(nibble_ashr_1_use)
>
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $1, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $1, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
>  #else
> @@ -430,7 +400,7 @@ LABEL(nibble_ashr_1_restart_use):
>  LABEL(nibble_ashr_1_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $1, D(%xmm0)
> +       psrldq  $1, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> @@ -448,10 +418,10 @@ LABEL(nibble_ashr_1_use):
>   */
>         .p2align 4
>  LABEL(ashr_2):
> -       pslldq  $14, D(%xmm2)
> +       pslldq  $14, %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)
> -       psubb   %xmm0, D(%xmm2)
> +       pcmpeqb %xmm1, %xmm2
> +       psubb   %xmm0, %xmm2
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx
>         shr     %cl, %r9d
> @@ -479,7 +449,7 @@ LABEL(loop_ashr_2_use):
>
>  LABEL(nibble_ashr_2_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $2, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $2, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
>  #else
> @@ -498,7 +468,7 @@ LABEL(nibble_ashr_2_restart_use):
>         jg      LABEL(nibble_ashr_2_use)
>
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $2, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $2, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
>  #else
> @@ -518,7 +488,7 @@ LABEL(nibble_ashr_2_restart_use):
>  LABEL(nibble_ashr_2_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $2, D(%xmm0)
> +       psrldq  $2, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> @@ -536,10 +506,10 @@ LABEL(nibble_ashr_2_use):
>   */
>         .p2align 4
>  LABEL(ashr_3):
> -       pslldq  $13, D(%xmm2)
> +       pslldq  $13, %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)
> -       psubb   %xmm0, D(%xmm2)
> +       pcmpeqb %xmm1, %xmm2
> +       psubb   %xmm0, %xmm2
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx
>         shr     %cl, %r9d
> @@ -567,7 +537,7 @@ LABEL(loop_ashr_3_use):
>
>  LABEL(nibble_ashr_3_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $3, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $3, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
>  #else
> @@ -586,7 +556,7 @@ LABEL(nibble_ashr_3_restart_use):
>         jg      LABEL(nibble_ashr_3_use)
>
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $3, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $3, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
>  #else
> @@ -606,7 +576,7 @@ LABEL(nibble_ashr_3_restart_use):
>  LABEL(nibble_ashr_3_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $3, D(%xmm0)
> +       psrldq  $3, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> @@ -624,10 +594,10 @@ LABEL(nibble_ashr_3_use):
>   */
>         .p2align 4
>  LABEL(ashr_4):
> -       pslldq  $12, D(%xmm2)
> +       pslldq  $12, %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)
> -       psubb   %xmm0, D(%xmm2)
> +       pcmpeqb %xmm1, %xmm2
> +       psubb   %xmm0, %xmm2
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx
>         shr     %cl, %r9d
> @@ -656,7 +626,7 @@ LABEL(loop_ashr_4_use):
>
>  LABEL(nibble_ashr_4_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $4, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $4, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
>  #else
> @@ -675,7 +645,7 @@ LABEL(nibble_ashr_4_restart_use):
>         jg      LABEL(nibble_ashr_4_use)
>
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $4, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $4, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
>  #else
> @@ -695,7 +665,7 @@ LABEL(nibble_ashr_4_restart_use):
>  LABEL(nibble_ashr_4_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $4, D(%xmm0)
> +       psrldq  $4, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> @@ -713,10 +683,10 @@ LABEL(nibble_ashr_4_use):
>   */
>         .p2align 4
>  LABEL(ashr_5):
> -       pslldq  $11, D(%xmm2)
> +       pslldq  $11, %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)
> -       psubb   %xmm0, D(%xmm2)
> +       pcmpeqb %xmm1, %xmm2
> +       psubb   %xmm0, %xmm2
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx
>         shr     %cl, %r9d
> @@ -745,7 +715,7 @@ LABEL(loop_ashr_5_use):
>
>  LABEL(nibble_ashr_5_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $5, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $5, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
>  #else
> @@ -765,7 +735,7 @@ LABEL(nibble_ashr_5_restart_use):
>
>         movdqa  (%rdi, %rdx), %xmm0
>
> -       palignr $5, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $5, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
>  #else
> @@ -785,7 +755,7 @@ LABEL(nibble_ashr_5_restart_use):
>  LABEL(nibble_ashr_5_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $5, D(%xmm0)
> +       psrldq  $5, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> @@ -803,10 +773,10 @@ LABEL(nibble_ashr_5_use):
>   */
>         .p2align 4
>  LABEL(ashr_6):
> -       pslldq  $10, D(%xmm2)
> +       pslldq  $10, %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)
> -       psubb   %xmm0, D(%xmm2)
> +       pcmpeqb %xmm1, %xmm2
> +       psubb   %xmm0, %xmm2
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx
>         shr     %cl, %r9d
> @@ -835,7 +805,7 @@ LABEL(loop_ashr_6_use):
>
>  LABEL(nibble_ashr_6_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $6, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $6, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a,(%rsi,%rdx), %xmm0
>  #else
> @@ -854,7 +824,7 @@ LABEL(nibble_ashr_6_restart_use):
>         jg      LABEL(nibble_ashr_6_use)
>
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $6, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $6, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a,(%rsi,%rdx), %xmm0
>  #else
> @@ -874,7 +844,7 @@ LABEL(nibble_ashr_6_restart_use):
>  LABEL(nibble_ashr_6_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $6, D(%xmm0)
> +       psrldq  $6, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> @@ -892,10 +862,10 @@ LABEL(nibble_ashr_6_use):
>   */
>         .p2align 4
>  LABEL(ashr_7):
> -       pslldq  $9, D(%xmm2)
> +       pslldq  $9, %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)
> -       psubb   %xmm0, D(%xmm2)
> +       pcmpeqb %xmm1, %xmm2
> +       psubb   %xmm0, %xmm2
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx
>         shr     %cl, %r9d
> @@ -924,7 +894,7 @@ LABEL(loop_ashr_7_use):
>
>  LABEL(nibble_ashr_7_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $7, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $7, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
>  #else
> @@ -943,7 +913,7 @@ LABEL(nibble_ashr_7_restart_use):
>         jg      LABEL(nibble_ashr_7_use)
>
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $7, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $7, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
>  #else
> @@ -963,7 +933,7 @@ LABEL(nibble_ashr_7_restart_use):
>  LABEL(nibble_ashr_7_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $7, D(%xmm0)
> +       psrldq  $7, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> @@ -981,10 +951,10 @@ LABEL(nibble_ashr_7_use):
>   */
>         .p2align 4
>  LABEL(ashr_8):
> -       pslldq  $8, D(%xmm2)
> +       pslldq  $8, %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)
> -       psubb   %xmm0, D(%xmm2)
> +       pcmpeqb %xmm1, %xmm2
> +       psubb   %xmm0, %xmm2
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx
>         shr     %cl, %r9d
> @@ -1013,7 +983,7 @@ LABEL(loop_ashr_8_use):
>
>  LABEL(nibble_ashr_8_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $8, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $8, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1032,7 +1002,7 @@ LABEL(nibble_ashr_8_restart_use):
>         jg      LABEL(nibble_ashr_8_use)
>
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $8, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $8, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1052,7 +1022,7 @@ LABEL(nibble_ashr_8_restart_use):
>  LABEL(nibble_ashr_8_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $8, D(%xmm0)
> +       psrldq  $8, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> @@ -1070,10 +1040,10 @@ LABEL(nibble_ashr_8_use):
>   */
>         .p2align 4
>  LABEL(ashr_9):
> -       pslldq  $7, D(%xmm2)
> +       pslldq  $7, %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)
> -       psubb   %xmm0, D(%xmm2)
> +       pcmpeqb %xmm1, %xmm2
> +       psubb   %xmm0, %xmm2
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx
>         shr     %cl, %r9d
> @@ -1103,7 +1073,7 @@ LABEL(loop_ashr_9_use):
>  LABEL(nibble_ashr_9_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
>
> -       palignr $9, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $9, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1122,7 +1092,7 @@ LABEL(nibble_ashr_9_restart_use):
>         jg      LABEL(nibble_ashr_9_use)
>
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $9, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $9, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1142,7 +1112,7 @@ LABEL(nibble_ashr_9_restart_use):
>  LABEL(nibble_ashr_9_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $9, D(%xmm0)
> +       psrldq  $9, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> @@ -1160,10 +1130,10 @@ LABEL(nibble_ashr_9_use):
>   */
>         .p2align 4
>  LABEL(ashr_10):
> -       pslldq  $6, D(%xmm2)
> +       pslldq  $6, %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)
> -       psubb   %xmm0, D(%xmm2)
> +       pcmpeqb %xmm1, %xmm2
> +       psubb   %xmm0, %xmm2
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx
>         shr     %cl, %r9d
> @@ -1192,7 +1162,7 @@ LABEL(loop_ashr_10_use):
>
>  LABEL(nibble_ashr_10_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $10, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $10, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1211,7 +1181,7 @@ LABEL(nibble_ashr_10_restart_use):
>         jg      LABEL(nibble_ashr_10_use)
>
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $10, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $10, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1231,7 +1201,7 @@ LABEL(nibble_ashr_10_restart_use):
>  LABEL(nibble_ashr_10_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $10, D(%xmm0)
> +       psrldq  $10, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> @@ -1249,10 +1219,10 @@ LABEL(nibble_ashr_10_use):
>   */
>         .p2align 4
>  LABEL(ashr_11):
> -       pslldq  $5, D(%xmm2)
> +       pslldq  $5, %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)
> -       psubb   %xmm0, D(%xmm2)
> +       pcmpeqb %xmm1, %xmm2
> +       psubb   %xmm0, %xmm2
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx
>         shr     %cl, %r9d
> @@ -1281,7 +1251,7 @@ LABEL(loop_ashr_11_use):
>
>  LABEL(nibble_ashr_11_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $11, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $11, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1300,7 +1270,7 @@ LABEL(nibble_ashr_11_restart_use):
>         jg      LABEL(nibble_ashr_11_use)
>
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $11, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $11, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1320,7 +1290,7 @@ LABEL(nibble_ashr_11_restart_use):
>  LABEL(nibble_ashr_11_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $11, D(%xmm0)
> +       psrldq  $11, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> @@ -1338,10 +1308,10 @@ LABEL(nibble_ashr_11_use):
>   */
>         .p2align 4
>  LABEL(ashr_12):
> -       pslldq  $4, D(%xmm2)
> +       pslldq  $4, %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)
> -       psubb   %xmm0, D(%xmm2)
> +       pcmpeqb %xmm1, %xmm2
> +       psubb   %xmm0, %xmm2
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx
>         shr     %cl, %r9d
> @@ -1370,7 +1340,7 @@ LABEL(loop_ashr_12_use):
>
>  LABEL(nibble_ashr_12_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $12, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $12, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1389,7 +1359,7 @@ LABEL(nibble_ashr_12_restart_use):
>         jg      LABEL(nibble_ashr_12_use)
>
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $12, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $12, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1409,7 +1379,7 @@ LABEL(nibble_ashr_12_restart_use):
>  LABEL(nibble_ashr_12_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $12, D(%xmm0)
> +       psrldq  $12, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> @@ -1427,10 +1397,10 @@ LABEL(nibble_ashr_12_use):
>   */
>         .p2align 4
>  LABEL(ashr_13):
> -       pslldq  $3, D(%xmm2)
> +       pslldq  $3, %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)
> -       psubb   %xmm0, D(%xmm2)
> +       pcmpeqb %xmm1, %xmm2
> +       psubb   %xmm0, %xmm2
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx
>         shr     %cl, %r9d
> @@ -1460,7 +1430,7 @@ LABEL(loop_ashr_13_use):
>
>  LABEL(nibble_ashr_13_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $13, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $13, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1479,7 +1449,7 @@ LABEL(nibble_ashr_13_restart_use):
>         jg      LABEL(nibble_ashr_13_use)
>
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $13, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $13, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1499,7 +1469,7 @@ LABEL(nibble_ashr_13_restart_use):
>  LABEL(nibble_ashr_13_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $13, D(%xmm0)
> +       psrldq  $13, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> @@ -1517,10 +1487,10 @@ LABEL(nibble_ashr_13_use):
>   */
>         .p2align 4
>  LABEL(ashr_14):
> -       pslldq  $2, D(%xmm2)
> +       pslldq  $2, %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)
> -       psubb   %xmm0, D(%xmm2)
> +       pcmpeqb %xmm1, %xmm2
> +       psubb   %xmm0, %xmm2
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx
>         shr     %cl, %r9d
> @@ -1550,7 +1520,7 @@ LABEL(loop_ashr_14_use):
>
>  LABEL(nibble_ashr_14_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $14, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $14, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1569,7 +1539,7 @@ LABEL(nibble_ashr_14_restart_use):
>         jg      LABEL(nibble_ashr_14_use)
>
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $14, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $14, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1589,7 +1559,7 @@ LABEL(nibble_ashr_14_restart_use):
>  LABEL(nibble_ashr_14_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $14, D(%xmm0)
> +       psrldq  $14, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> @@ -1607,10 +1577,10 @@ LABEL(nibble_ashr_14_use):
>   */
>         .p2align 4
>  LABEL(ashr_15):
> -       pslldq  $1, D(%xmm2)
> +       pslldq  $1, %xmm2
>         TOLOWER (%xmm1, %xmm2)
> -       pcmpeqb %xmm1, D(%xmm2)
> -       psubb   %xmm0, D(%xmm2)
> +       pcmpeqb %xmm1, %xmm2
> +       psubb   %xmm0, %xmm2
>         pmovmskb %xmm2, %r9d
>         shr     %cl, %edx
>         shr     %cl, %r9d
> @@ -1642,7 +1612,7 @@ LABEL(loop_ashr_15_use):
>
>  LABEL(nibble_ashr_15_restart_use):
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $15, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $15, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1661,7 +1631,7 @@ LABEL(nibble_ashr_15_restart_use):
>         jg      LABEL(nibble_ashr_15_use)
>
>         movdqa  (%rdi, %rdx), %xmm0
> -       palignr $15, -16(%rdi, %rdx), D(%xmm0)
> +       palignr $15, -16(%rdi, %rdx), %xmm0
>  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
>         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
>  #else
> @@ -1681,7 +1651,7 @@ LABEL(nibble_ashr_15_restart_use):
>  LABEL(nibble_ashr_15_use):
>         sub     $0x1000, %r10
>         movdqa  -16(%rdi, %rdx), %xmm0
> -       psrldq  $15, D(%xmm0)
> +       psrldq  $15, %xmm0
>         pcmpistri      $0x3a,%xmm0, %xmm0
>  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
>         cmp     %r11, %rcx
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S
> deleted file mode 100644
> index b51b86d223..0000000000
> --- a/sysdeps/x86_64/multiarch/strncase_l-avx.S
> +++ /dev/null
> @@ -1,22 +0,0 @@
> -/* strncasecmp_l optimized with AVX.
> -   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#define STRCMP_SSE42 __strncasecmp_l_avx
> -#define USE_AVX 1
> -#define USE_AS_STRNCASECMP_L
> -#include "strcmp-sse42.S"
> --
> 2.25.1
>
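A note on the TOLOWER macro this patch keeps (now SSE-only): it is a
branchless ASCII case fold done 16 bytes at a time.  paddb biases each
byte by LCASE_MIN so that a single signed pcmpgtb against LCASE_MAX
isolates 'A'..'Z', pandn then keeps the CASE_ADD constant (0x20) only in
those lanes, and the final paddb folds them to lowercase.  Per byte the
effect is just this (a sketch, not glibc code):

    /* Scalar model of the retained TOLOWER macro; the real code does
       this for 16 bytes at once with paddb/pcmpgtb/pandn/paddb and
       preloaded LCASE_MIN/LCASE_MAX/CASE_ADD constants.  */
    static unsigned char
    tolower_byte (unsigned char c)
    {
      unsigned char biased = c - 'A';          /* paddb  LCASE_MIN      */
      int is_upper = biased <= 'Z' - 'A';      /* pcmpgtb LCASE_MAX,
                                                  inverted by the pandn */
      return c + (is_upper ? 0x20 : 0);        /* pandn CASE_ADD; paddb */
    }

Nothing in the transform needs the three-operand AVX encodings, which is
why the removal is mechanical: the destructive two-operand SSE forms
(hence dropping the D() operand wrapper) compute the same thing.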

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
  2022-03-24 18:59   ` H.J. Lu
@ 2022-03-24 19:18     ` Noah Goldstein
  2022-03-24 19:34       ` H.J. Lu
  0 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 19:18 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Mar 24, 2022 at 2:00 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The overflow fallback for __wcsncmp_avx2_rtm should be
> > __wcscmp_avx2_rtm, not __wcscmp_avx2.
> >
> > All string/memory tests pass.
> > ---
> >  sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > index 52ff5ad724..86a86b68e3 100644
> > --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > @@ -122,7 +122,7 @@ ENTRY(STRCMP)
> >            are cases where length is large enough that it can never be a
> >            bound on valid memory so just use wcscmp.  */
> >         shrq    $56, %rcx
> > -       jnz     __wcscmp_avx2
> > +       jnz     OVERFLOW_STRCMP
> >
> >         leaq    (, %rdx, 4), %rdx
> >  #  endif
> > --
> > 2.25.1
> >
>
> Isn't it a bug?  Is there a glibc bug report?  Should this also be fixed
> on release branches?

It is a bug, but there is no need for a backport: the wrong fallback still
returns correct results; at worst it can abort an RTM transaction, so the
impact is performance only.
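
For context, the guard being fixed works like this: the wcsncmp length is
counted in wchar_t units and must be scaled by 4 before the byte-oriented
vector loop, so a count with any of its top 8 bits set could wrap when
scaled -- and since such a count can never bound real memory anyway, the
routine simply defers to the unbounded wcscmp.  The bug was deferring to
the non-RTM variant.  A C sketch of the dispatch, with hypothetical names:

    #include <stddef.h>
    #include <wchar.h>

    /* Sketch of the overflow guard in strcmp-avx2.S.  On x86-64 a
       wchar_t count >= 2^56 cannot bound valid memory, so the
       n-bounded compare defers to the unbounded one -- and it must
       defer to the *matching* variant: the _rtm build has to call the
       _rtm wcscmp.  */
    static int
    wcsncmp_sketch (const wchar_t *a, const wchar_t *b, size_t n)
    {
      if (n >> 56)              /* shrq $56, %rcx; jnz OVERFLOW_STRCMP */
        return wcscmp (a, b);   /* stands in for OVERFLOW_STRCMP       */

      /* leaq (,%rdx,4), %rdx scales n to bytes for the vector loop;
         a plain scalar loop stands in for it here.  */
      for (size_t i = 0; i < n; i++)
        {
          if (a[i] != b[i])
            return a[i] < b[i] ? -1 : 1;
          if (a[i] == L'\0')
            break;
        }
      return 0;
    }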
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch
  2022-03-24 18:53   ` H.J. Lu
@ 2022-03-24 19:20     ` Noah Goldstein
  2022-03-24 19:36       ` H.J. Lu
  0 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 19:20 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Mar 24, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Small code cleanup for size: -53 bytes.
> >
> > Add a comment justifying the use of a branch for the NULL/non-null
> > return.
>
>
> Do you have follow-up patches to improve its performance?  We are
> backporting all x86-64 improvements to Intel release branches:
>
> https://gitlab.com/x86-glibc/glibc/-/wikis/home
>
> Patches without performance improvements are undesirable.

No further changes are planned at the moment; the code size savings
seem worth it for master, though. I'm also in favor of adding the
comment, as I think it's non-intuitive.
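
Concretely, the branch being documented is the final NULL/non-null
selection: the vector loop stops at the first byte equal to either CHAR
or the terminator, and the tail must turn that position into a pointer
or NULL.  A sketch of the two candidate tails, with hypothetical names:

    #include <stdint.h>

    /* `match' points at the first byte found equal to either c or
       '\0'.  The branchy form: callers overwhelmingly search for
       characters that are present, so this predicts well and the
       common path is a compare plus fallthrough.  */
    static char *
    strchr_tail_branch (char *match, int c)
    {
      if (*match != (char) c)
        return NULL;
      return match;
    }

    /* The branchless alternative pays for a mask on every call.  */
    static char *
    strchr_tail_branchless (char *match, int c)
    {
      uintptr_t keep = -(uintptr_t) (*match == (char) c); /* all-ones or 0 */
      return (char *) ((uintptr_t) match & keep);
    }

That prediction argument is what the added comment records: the almost
always correctly predicted branch beats unconditionally materializing
the mask.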

>
> > All string/memory tests pass and no regressions in benchtests.
> >
> > geometric_mean(N=20) of all benchmarks Original / New: 1.00
> > ---
> Geometric Mean N=20 runs; all functions page aligned
> > length, alignment,  pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
> >   2048,         0,   32,    0,               23,                127,               1.033
> >   2048,         1,   32,    0,               23,                127,               1.006
> >   2048,         0,   64,    0,               23,                127,                1.02
> >   2048,         2,   64,    0,               23,                127,               0.992
> >   2048,         0,  128,    0,               23,                127,               0.996
> >   2048,         3,  128,    0,               23,                127,               0.966
> >   2048,         0,  256,    0,               23,                127,               0.996
> >   2048,         4,  256,    0,               23,                127,               0.998
> >   2048,         0,  512,    0,               23,                127,               0.991
> >   2048,         5,  512,    0,               23,                127,               0.991
> >   2048,         0, 1024,    0,               23,                127,               0.993
> >   2048,         6, 1024,    0,               23,                127,               0.992
> >   2048,         0, 2048,    0,               23,                127,               0.992
> >   2048,         7, 2048,    0,               23,                127,               0.976
> >   4096,         0,   32,    0,               23,                127,               0.983
> >   4096,         1,   32,    0,               23,                127,               0.994
> >   4096,         0,   64,    0,               23,                127,               0.968
> >   4096,         2,   64,    0,               23,                127,               1.018
> >   4096,         0,  128,    0,               23,                127,                0.99
> >   4096,         3,  128,    0,               23,                127,               1.001
> >   4096,         0,  256,    0,               23,                127,                 1.0
> >   4096,         4,  256,    0,               23,                127,               1.001
> >   4096,         0,  512,    0,               23,                127,               0.989
> >   4096,         5,  512,    0,               23,                127,               0.988
> >   4096,         0, 1024,    0,               23,                127,               0.994
> >   4096,         6, 1024,    0,               23,                127,               0.993
> >   4096,         0, 2048,    0,               23,                127,               0.987
> >   4096,         7, 2048,    0,               23,                127,               0.996
> >    256,         1,   64,    0,               23,                127,               1.004
> >    256,         2,   64,    0,               23,                127,               1.004
> >    256,         3,   64,    0,               23,                127,               0.992
> >    256,         4,   64,    0,               23,                127,               1.001
> >    256,         5,   64,    0,               23,                127,               1.001
> >    256,         6,   64,    0,               23,                127,               0.998
> >    256,         7,   64,    0,               23,                127,               0.994
> >    512,         0,  256,    0,               23,                127,               0.999
> >    512,        16,  256,    0,               23,                127,               1.002
> >    512,        32,  256,    0,               23,                127,               0.994
> >    512,        48,  256,    0,               23,                127,               0.991
> >    512,        64,  256,    0,               23,                127,               0.994
> >    512,        80,  256,    0,               23,                127,               0.994
> >    512,        96,  256,    0,               23,                127,               0.996
> >    512,       112,  256,    0,               23,                127,               0.999
> >      1,         0,    0,    0,               23,                127,               0.978
> >      2,         0,    1,    0,               23,                127,               0.981
> >      3,         0,    2,    0,               23,                127,               0.993
> >      4,         0,    3,    0,               23,                127,               1.004
> >      5,         0,    4,    0,               23,                127,               1.002
> >      6,         0,    5,    0,               23,                127,               0.991
> >      7,         0,    6,    0,               23,                127,                0.99
> >      8,         0,    7,    0,               23,                127,               1.012
> >      9,         0,    8,    0,               23,                127,               0.994
> >     10,         0,    9,    0,               23,                127,               1.003
> >     11,         0,   10,    0,               23,                127,               0.999
> >     12,         0,   11,    0,               23,                127,               1.007
> >     13,         0,   12,    0,               23,                127,                 1.0
> >     14,         0,   13,    0,               23,                127,               0.997
> >     15,         0,   14,    0,               23,                127,               0.996
> >     16,         0,   15,    0,               23,                127,               0.993
> >     17,         0,   16,    0,               23,                127,               1.002
> >     18,         0,   17,    0,               23,                127,               0.997
> >     19,         0,   18,    0,               23,                127,               0.998
> >     20,         0,   19,    0,               23,                127,               0.994
> >     21,         0,   20,    0,               23,                127,                0.99
> >     22,         0,   21,    0,               23,                127,               0.992
> >     23,         0,   22,    0,               23,                127,               0.996
> >     24,         0,   23,    0,               23,                127,               0.991
> >     25,         0,   24,    0,               23,                127,               0.997
> >     26,         0,   25,    0,               23,                127,               1.011
> >     27,         0,   26,    0,               23,                127,               1.013
> >     28,         0,   27,    0,               23,                127,               0.996
> >     29,         0,   28,    0,               23,                127,               0.993
> >     30,         0,   29,    0,               23,                127,               1.009
> >     31,         0,   30,    0,               23,                127,               1.009
> >     32,         0,   31,    0,               23,                127,               1.008
> >   2048,         0,   32,    0,                0,                127,                 1.0
> >   2048,         1,   32,    0,                0,                127,                1.01
> >   2048,         0,   64,    0,                0,                127,               0.997
> >   2048,         2,   64,    0,                0,                127,               1.002
> >   2048,         0,  128,    0,                0,                127,               0.986
> >   2048,         3,  128,    0,                0,                127,               0.997
> >   2048,         0,  256,    0,                0,                127,               1.002
> >   2048,         4,  256,    0,                0,                127,               0.999
> >   2048,         0,  512,    0,                0,                127,               0.991
> >   2048,         5,  512,    0,                0,                127,               0.984
> >   2048,         0, 1024,    0,                0,                127,               0.994
> >   2048,         6, 1024,    0,                0,                127,               0.993
> >   2048,         0, 2048,    0,                0,                127,               0.951
> >   2048,         7, 2048,    0,                0,                127,               0.989
> >   4096,         0,   32,    0,                0,                127,               0.993
> >   4096,         1,   32,    0,                0,                127,               0.997
> >   4096,         0,   64,    0,                0,                127,               1.004
> >   4096,         2,   64,    0,                0,                127,               1.016
> >   4096,         0,  128,    0,                0,                127,               0.973
> >   4096,         3,  128,    0,                0,                127,               1.001
> >   4096,         0,  256,    0,                0,                127,               0.999
> >   4096,         4,  256,    0,                0,                127,               0.998
> >   4096,         0,  512,    0,                0,                127,                0.99
> >   4096,         5,  512,    0,                0,                127,               0.985
> >   4096,         0, 1024,    0,                0,                127,               0.993
> >   4096,         6, 1024,    0,                0,                127,               0.997
> >   4096,         0, 2048,    0,                0,                127,               0.995
> >   4096,         7, 2048,    0,                0,                127,               0.996
> >    256,         1,   64,    0,                0,                127,                1.01
> >    256,         2,   64,    0,                0,                127,               1.024
> >    256,         3,   64,    0,                0,                127,                1.03
> >    256,         4,   64,    0,                0,                127,               1.004
> >    256,         5,   64,    0,                0,                127,               0.998
> >    256,         6,   64,    0,                0,                127,               0.998
> >    256,         7,   64,    0,                0,                127,               0.997
> >    512,         0,  256,    0,                0,                127,               0.996
> >    512,        16,  256,    0,                0,                127,               0.995
> >    512,        32,  256,    0,                0,                127,               0.996
> >    512,        48,  256,    0,                0,                127,               0.992
> >    512,        64,  256,    0,                0,                127,               0.999
> >    512,        80,  256,    0,                0,                127,               1.002
> >    512,        96,  256,    0,                0,                127,               0.999
> >    512,       112,  256,    0,                0,                127,               0.998
> >      1,         0,    0,    0,                0,                127,               1.016
> >      2,         0,    1,    0,                0,                127,               0.998
> >      3,         0,    2,    0,                0,                127,                1.02
> >      4,         0,    3,    0,                0,                127,               1.004
> >      5,         0,    4,    0,                0,                127,               1.021
> >      6,         0,    5,    0,                0,                127,               1.014
> >      7,         0,    6,    0,                0,                127,               1.007
> >      8,         0,    7,    0,                0,                127,               1.016
> >      9,         0,    8,    0,                0,                127,               1.003
> >     10,         0,    9,    0,                0,                127,               1.004
> >     11,         0,   10,    0,                0,                127,               0.995
> >     12,         0,   11,    0,                0,                127,               1.009
> >     13,         0,   12,    0,                0,                127,               1.005
> >     14,         0,   13,    0,                0,                127,               0.987
> >     15,         0,   14,    0,                0,                127,               0.998
> >     16,         0,   15,    0,                0,                127,               1.004
> >     17,         0,   16,    0,                0,                127,                1.01
> >     18,         0,   17,    0,                0,                127,                1.01
> >     19,         0,   18,    0,                0,                127,               1.006
> >     20,         0,   19,    0,                0,                127,               1.012
> >     21,         0,   20,    0,                0,                127,               0.999
> >     22,         0,   21,    0,                0,                127,               1.004
> >     23,         0,   22,    0,                0,                127,               0.988
> >     24,         0,   23,    0,                0,                127,               0.993
> >     25,         0,   24,    0,                0,                127,               1.004
> >     26,         0,   25,    0,                0,                127,                0.99
> >     27,         0,   26,    0,                0,                127,               1.016
> >     28,         0,   27,    0,                0,                127,               0.987
> >     29,         0,   28,    0,                0,                127,               0.989
> >     30,         0,   29,    0,                0,                127,               0.998
> >     31,         0,   30,    0,                0,                127,               1.005
> >     32,         0,   31,    0,                0,                127,               0.993
> >
> >     16,         0,   15,    1,                1,                  0,               1.002
> >     16,         0,   15,    1,                0,                  0,                 1.0
> >     16,         0,   15,    1,                1,                0.1,               1.034
> >     16,         0,   15,    1,                0,                0.1,                1.03
> >     16,         0,   15,    1,                1,               0.25,               0.993
> >     16,         0,   15,    1,                0,               0.25,               1.081
> >     16,         0,   15,    1,                1,               0.33,               0.959
> >     16,         0,   15,    1,                0,               0.33,               1.142
> >     16,         0,   15,    1,                1,                0.5,               0.929
> >     16,         0,   15,    1,                0,                0.5,               1.072
> >     16,         0,   15,    1,                1,               0.66,               0.984
> >     16,         0,   15,    1,                0,               0.66,               1.069
> >     16,         0,   15,    1,                1,               0.75,               0.969
> >     16,         0,   15,    1,                0,               0.75,               1.059
> >     16,         0,   15,    1,                1,                0.9,                0.98
> >     16,         0,   15,    1,                0,                0.9,               0.994
> >     16,         0,   15,    1,                1,                  1,               0.993
> >     16,         0,   15,    1,                0,                  1,               0.996
> >
> >  sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------
> >  1 file changed, 107 insertions(+), 97 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > index 086cabf76a..1a916cc951 100644
> > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > @@ -48,13 +48,13 @@
> >  # define PAGE_SIZE 4096
> >
> >         .section SECTION(.text),"ax",@progbits
> > -ENTRY (STRCHR)
> > +ENTRY_P2ALIGN (STRCHR, 5)
> >         /* Broadcast CHAR to YMM0.      */
> >         vmovd   %esi, %xmm0
> >         movl    %edi, %eax
> >         andl    $(PAGE_SIZE - 1), %eax
> >         VPBROADCAST     %xmm0, %ymm0
> > -       vpxor   %xmm9, %xmm9, %xmm9
> > +       vpxor   %xmm1, %xmm1, %xmm1
> >
> >         /* Check if we cross page boundary with one vector load.  */
> >         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > @@ -62,37 +62,29 @@ ENTRY (STRCHR)
> >
> >         /* Check the first VEC_SIZE bytes.      Search for both CHAR and the
> >            null byte.  */
> > -       vmovdqu (%rdi), %ymm8
> > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > -       vpor    %ymm1, %ymm2, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > +       vmovdqu (%rdi), %ymm2
> > +       VPCMPEQ %ymm2, %ymm0, %ymm3
> > +       VPCMPEQ %ymm2, %ymm1, %ymm2
> > +       vpor    %ymm3, %ymm2, %ymm3
> > +       vpmovmskb %ymm3, %eax
> >         testl   %eax, %eax
> >         jz      L(aligned_more)
> >         tzcntl  %eax, %eax
> >  # ifndef USE_AS_STRCHRNUL
> > -       /* Found CHAR or the null byte.  */
> > -       cmp     (%rdi, %rax), %CHAR_REG
> > -       jne     L(zero)
> > -# endif
> > -       addq    %rdi, %rax
> > -       VZEROUPPER_RETURN
> > -
> > -       /* .p2align 5 helps keep performance more consistent if ENTRY()
> > -          alignment % 32 was either 16 or 0. As well this makes the
> > -          alignment % 32 of the loop_4x_vec fixed which makes tuning it
> > -          easier.  */
> > -       .p2align 5
> > -L(first_vec_x4):
> > -       tzcntl  %eax, %eax
> > -       addq    $(VEC_SIZE * 3 + 1), %rdi
> > -# ifndef USE_AS_STRCHRNUL
> > -       /* Found CHAR or the null byte.  */
> > +       /* Found CHAR or the null byte.  */
> >         cmp     (%rdi, %rax), %CHAR_REG
> > +       /* NB: Use a branch instead of cmovcc here. The expectation is
> > +          that with strchr the user will branch based on input being
> > +          null. Since this branch will be 100% predictive of the user
> > +          branch a branch miss here should save what otherwise would
> > +          be branch miss in the user code. Otherwise using a branch 1)
> > +          saves code size and 2) is faster in highly predictable
> > +          environments.  */
> >         jne     L(zero)
> >  # endif
> >         addq    %rdi, %rax
> > -       VZEROUPPER_RETURN
> > +L(return_vzeroupper):
> > +       ZERO_UPPER_VEC_REGISTERS_RETURN
> >
> >  # ifndef USE_AS_STRCHRNUL
> >  L(zero):
> > @@ -103,7 +95,8 @@ L(zero):
> >
> >         .p2align 4
> >  L(first_vec_x1):
> > -       tzcntl  %eax, %eax
> > +       /* Use bsf to save code size.  */
> > +       bsfl    %eax, %eax
> >         incq    %rdi
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Found CHAR or the null byte.  */
> > @@ -113,9 +106,10 @@ L(first_vec_x1):
> >         addq    %rdi, %rax
> >         VZEROUPPER_RETURN
> >
> > -       .p2align 4
> > +       .p2align 4,, 10
> >  L(first_vec_x2):
> > -       tzcntl  %eax, %eax
> > +       /* Use bsf to save code size.  */
> > +       bsfl    %eax, %eax
> >         addq    $(VEC_SIZE + 1), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Found CHAR or the null byte.  */
> > @@ -125,9 +119,10 @@ L(first_vec_x2):
> >         addq    %rdi, %rax
> >         VZEROUPPER_RETURN
> >
> > -       .p2align 4
> > +       .p2align 4,, 8
> >  L(first_vec_x3):
> > -       tzcntl  %eax, %eax
> > +       /* Use bsf to save code size.  */
> > +       bsfl    %eax, %eax
> >         addq    $(VEC_SIZE * 2 + 1), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Found CHAR or the null byte.  */
> > @@ -137,6 +132,21 @@ L(first_vec_x3):
> >         addq    %rdi, %rax
> >         VZEROUPPER_RETURN
> >
> > +       .p2align 4,, 10
> > +L(first_vec_x4):
> > +       /* Use bsf to save code size.  */
> > +       bsfl    %eax, %eax
> > +       addq    $(VEC_SIZE * 3 + 1), %rdi
> > +# ifndef USE_AS_STRCHRNUL
> > +       /* Found CHAR or the null byte.  */
> > +       cmp     (%rdi, %rax), %CHAR_REG
> > +       jne     L(zero)
> > +# endif
> > +       addq    %rdi, %rax
> > +       VZEROUPPER_RETURN
> > +
> > +
> > +
> >         .p2align 4
> >  L(aligned_more):
> >         /* Align data to VEC_SIZE - 1. This is the same number of
> > @@ -146,90 +156,92 @@ L(aligned_more):
> >  L(cross_page_continue):
> >         /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> >            since data is only aligned to VEC_SIZE.  */
> > -       vmovdqa 1(%rdi), %ymm8
> > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > -       vpor    %ymm1, %ymm2, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > +       vmovdqa 1(%rdi), %ymm2
> > +       VPCMPEQ %ymm2, %ymm0, %ymm3
> > +       VPCMPEQ %ymm2, %ymm1, %ymm2
> > +       vpor    %ymm3, %ymm2, %ymm3
> > +       vpmovmskb %ymm3, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x1)
> >
> > -       vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
> > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > -       vpor    %ymm1, %ymm2, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > +       vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2
> > +       VPCMPEQ %ymm2, %ymm0, %ymm3
> > +       VPCMPEQ %ymm2, %ymm1, %ymm2
> > +       vpor    %ymm3, %ymm2, %ymm3
> > +       vpmovmskb %ymm3, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x2)
> >
> > -       vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
> > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > -       vpor    %ymm1, %ymm2, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > +       vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2
> > +       VPCMPEQ %ymm2, %ymm0, %ymm3
> > +       VPCMPEQ %ymm2, %ymm1, %ymm2
> > +       vpor    %ymm3, %ymm2, %ymm3
> > +       vpmovmskb %ymm3, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x3)
> >
> > -       vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
> > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > -       vpor    %ymm1, %ymm2, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > +       vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2
> > +       VPCMPEQ %ymm2, %ymm0, %ymm3
> > +       VPCMPEQ %ymm2, %ymm1, %ymm2
> > +       vpor    %ymm3, %ymm2, %ymm3
> > +       vpmovmskb %ymm3, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x4)
> > -       /* Align data to VEC_SIZE * 4 - 1.      */
> > -       addq    $(VEC_SIZE * 4 + 1), %rdi
> > -       andq    $-(VEC_SIZE * 4), %rdi
> > +       /* Align data to VEC_SIZE * 4 - 1.  */
> > +       incq    %rdi
> > +       orq     $(VEC_SIZE * 4 - 1), %rdi
> >         .p2align 4
> >  L(loop_4x_vec):
> >         /* Compare 4 * VEC at a time forward.  */
> > -       vmovdqa (%rdi), %ymm5
> > -       vmovdqa (VEC_SIZE)(%rdi), %ymm6
> > -       vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > -       vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > +       vmovdqa 1(%rdi), %ymm6
> > +       vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7
> >
> >         /* Leaves only CHARS matching esi as 0.  */
> > -       vpxor   %ymm5, %ymm0, %ymm1
> >         vpxor   %ymm6, %ymm0, %ymm2
> >         vpxor   %ymm7, %ymm0, %ymm3
> > -       vpxor   %ymm8, %ymm0, %ymm4
> >
> > -       VPMINU  %ymm1, %ymm5, %ymm1
> >         VPMINU  %ymm2, %ymm6, %ymm2
> >         VPMINU  %ymm3, %ymm7, %ymm3
> > -       VPMINU  %ymm4, %ymm8, %ymm4
> >
> > -       VPMINU  %ymm1, %ymm2, %ymm5
> > -       VPMINU  %ymm3, %ymm4, %ymm6
> > +       vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6
> > +       vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7
> > +
> > +       vpxor   %ymm6, %ymm0, %ymm4
> > +       vpxor   %ymm7, %ymm0, %ymm5
> > +
> > +       VPMINU  %ymm4, %ymm6, %ymm4
> > +       VPMINU  %ymm5, %ymm7, %ymm5
> >
> > -       VPMINU  %ymm5, %ymm6, %ymm6
> > +       VPMINU  %ymm2, %ymm3, %ymm6
> > +       VPMINU  %ymm4, %ymm5, %ymm7
> >
> > -       VPCMPEQ %ymm6, %ymm9, %ymm6
> > -       vpmovmskb %ymm6, %ecx
> > +       VPMINU  %ymm6, %ymm7, %ymm7
> > +
> > +       VPCMPEQ %ymm7, %ymm1, %ymm7
> > +       vpmovmskb %ymm7, %ecx
> >         subq    $-(VEC_SIZE * 4), %rdi
> >         testl   %ecx, %ecx
> >         jz      L(loop_4x_vec)
> >
> > -
> > -       VPCMPEQ %ymm1, %ymm9, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > +       VPCMPEQ %ymm2, %ymm1, %ymm2
> > +       vpmovmskb %ymm2, %eax
> >         testl   %eax, %eax
> >         jnz     L(last_vec_x0)
> >
> >
> > -       VPCMPEQ %ymm5, %ymm9, %ymm2
> > -       vpmovmskb %ymm2, %eax
> > +       VPCMPEQ %ymm3, %ymm1, %ymm3
> > +       vpmovmskb %ymm3, %eax
> >         testl   %eax, %eax
> >         jnz     L(last_vec_x1)
> >
> > -       VPCMPEQ %ymm3, %ymm9, %ymm3
> > -       vpmovmskb %ymm3, %eax
> > +       VPCMPEQ %ymm4, %ymm1, %ymm4
> > +       vpmovmskb %ymm4, %eax
> >         /* rcx has combined result from all 4 VEC. It will only be used
> >            if the first 3 other VEC all did not contain a match.  */
> >         salq    $32, %rcx
> >         orq     %rcx, %rax
> >         tzcntq  %rax, %rax
> > -       subq    $(VEC_SIZE * 2), %rdi
> > +       subq    $(VEC_SIZE * 2 - 1), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Found CHAR or the null byte.  */
> >         cmp     (%rdi, %rax), %CHAR_REG
> > @@ -239,10 +251,11 @@ L(loop_4x_vec):
> >         VZEROUPPER_RETURN
> >
> >
> > -       .p2align 4
> > +       .p2align 4,, 10
> >  L(last_vec_x0):
> > -       tzcntl  %eax, %eax
> > -       addq    $-(VEC_SIZE * 4), %rdi
> > +       /* Use bsf to save code size.  */
> > +       bsfl    %eax, %eax
> > +       addq    $-(VEC_SIZE * 4 - 1), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Found CHAR or the null byte.  */
> >         cmp     (%rdi, %rax), %CHAR_REG
> > @@ -251,16 +264,11 @@ L(last_vec_x0):
> >         addq    %rdi, %rax
> >         VZEROUPPER_RETURN
> >
> > -# ifndef USE_AS_STRCHRNUL
> > -L(zero_end):
> > -       xorl    %eax, %eax
> > -       VZEROUPPER_RETURN
> > -# endif
> >
> > -       .p2align 4
> > +       .p2align 4,, 10
> >  L(last_vec_x1):
> >         tzcntl  %eax, %eax
> > -       subq    $(VEC_SIZE * 3), %rdi
> > +       subq    $(VEC_SIZE * 3 - 1), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Found CHAR or the null byte.  */
> >         cmp     (%rdi, %rax), %CHAR_REG
> > @@ -269,18 +277,23 @@ L(last_vec_x1):
> >         addq    %rdi, %rax
> >         VZEROUPPER_RETURN
> >
> > +# ifndef USE_AS_STRCHRNUL
> > +L(zero_end):
> > +       xorl    %eax, %eax
> > +       VZEROUPPER_RETURN
> > +# endif
> >
> >         /* Cold case for crossing page with first load.  */
> > -       .p2align 4
> > +       .p2align 4,, 8
> >  L(cross_page_boundary):
> >         movq    %rdi, %rdx
> >         /* Align rdi to VEC_SIZE - 1.  */
> >         orq     $(VEC_SIZE - 1), %rdi
> > -       vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
> > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > -       vpor    %ymm1, %ymm2, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > +       vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2
> > +       VPCMPEQ %ymm2, %ymm0, %ymm3
> > +       VPCMPEQ %ymm2, %ymm1, %ymm2
> > +       vpor    %ymm3, %ymm2, %ymm3
> > +       vpmovmskb %ymm3, %eax
> >         /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> >            so no need to manually mod edx.  */
> >         sarxl   %edx, %eax, %eax
> > @@ -291,13 +304,10 @@ L(cross_page_boundary):
> >         xorl    %ecx, %ecx
> >         /* Found CHAR or the null byte.  */
> >         cmp     (%rdx, %rax), %CHAR_REG
> > -       leaq    (%rdx, %rax), %rax
> > -       cmovne  %rcx, %rax
> > -# else
> > -       addq    %rdx, %rax
> > +       jne     L(zero_end)
> >  # endif
> > -L(return_vzeroupper):
> > -       ZERO_UPPER_VEC_REGISTERS_RETURN
> > +       addq    %rdx, %rax
> > +       VZEROUPPER_RETURN
> >
> >  END (STRCHR)
> > -# endif
> > +#endif
> > --
> > 2.25.1
> >
>
>
> --
> H.J.
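
A minimal C sketch of the calling pattern the branch-vs-cmovcc comment
in the quoted diff assumes (an illustration, not part of the patch):
nearly every strchr caller immediately branches on whether the result
is NULL, so the library-side jne L(zero) is predicted together with
that user branch rather than leaving an extra cmov data dependency.

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical caller.  The NULL check below is the "user branch"
       the comment refers to; a library-side branch resolving found vs.
       not-found mispredicts at most together with it.  */
    static void
    show_first_match (const char *s, int c)
    {
      const char *p = strchr (s, c);
      if (p == NULL)            /* mirrors the jne L(zero) in the diff */
        puts ("not found");
      else
        printf ("found at offset %td\n", p - s);
    }

    int
    main (void)
    {
      show_first_match ("glibc", 'i');
      show_first_match ("glibc", 'z');
      return 0;
    }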

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
  2022-03-24 19:18     ` Noah Goldstein
@ 2022-03-24 19:34       ` H.J. Lu
  2022-03-24 19:39         ` Noah Goldstein
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:34 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Mar 24, 2022 at 12:18 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Mar 24, 2022 at 2:00 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
> > > __wcscmp_avx2.
> > >
> > > All string/memory tests pass.
> > > ---
> > >  sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > index 52ff5ad724..86a86b68e3 100644
> > > --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > @@ -122,7 +122,7 @@ ENTRY(STRCMP)
> > >            are cases where length is large enough that it can never be a
> > >            bound on valid memory so just use wcscmp.  */
> > >         shrq    $56, %rcx
> > > -       jnz     __wcscmp_avx2
> > > +       jnz     OVERFLOW_STRCMP
> > >
> > >         leaq    (, %rdx, 4), %rdx
> > >  #  endif
> > > --
> > > 2.25.1
> > >
> >
> > Isn't it a bug?  Is there a glibc bug? Should this also be fixed on release
> > branches?
>
> It is a bug but no need for backport.

Why no need for backport? Is there a testcase?

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch
  2022-03-24 19:20     ` Noah Goldstein
@ 2022-03-24 19:36       ` H.J. Lu
  2022-05-12 19:31         ` Sunil Pandey
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:36 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Mar 24, 2022 at 12:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Mar 24, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > Small code cleanup for size: -53 bytes.
> > >
> > > Add comment justifying using a branch to do NULL/non-null return.
> >
> >
> > Do you have followup patches to improve its performance?  We are
> > backporting all x86-64 improvements to Intel release branches:
> >
> > https://gitlab.com/x86-glibc/glibc/-/wikis/home
> >
> > Patches without performance improvements are undesirable.
>
> No further changes planned at the moment; the code size savings
> seem worth it for master though. Also in favor of adding the comment,
> as I think it's non-intuitive.
>

LGTM.
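
One concrete part of the code-size saving discussed here: the patch
replaces tzcnt with bsf in the first_vec_x{1,2,3,4} blocks.  tzcnt is
encoded as bsf with an extra F3 prefix byte, and on those paths the
match mask is already known to be nonzero, where the two instructions
return the same index.  A small C sketch of that mask-to-index step
(an illustration, not glibc code):

    #include <stdio.h>

    /* mask models the vpmovmskb result and is nonzero on this path, so
       __builtin_ctz (which compiles to bsf or tzcnt) is well defined
       and the shorter bsf encoding changes nothing.  */
    static unsigned
    first_match_index (unsigned mask)
    {
      return (unsigned) __builtin_ctz (mask);
    }

    int
    main (void)
    {
      printf ("%u\n", first_match_index (0x50));   /* prints 4 */
      return 0;
    }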

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
  2022-03-24 19:34       ` H.J. Lu
@ 2022-03-24 19:39         ` Noah Goldstein
  0 siblings, 0 replies; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 19:39 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Mar 24, 2022 at 2:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Mar 24, 2022 at 12:18 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 2:00 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
> > > > __wcscmp_avx2.
> > > >
> > > > All string/memory tests pass.
> > > > ---
> > > >  sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
> > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > > index 52ff5ad724..86a86b68e3 100644
> > > > --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > > +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > > @@ -122,7 +122,7 @@ ENTRY(STRCMP)
> > > >            are cases where length is large enough that it can never be a
> > > >            bound on valid memory so just use wcscmp.  */
> > > >         shrq    $56, %rcx
> > > > -       jnz     __wcscmp_avx2
> > > > +       jnz     OVERFLOW_STRCMP
> > > >
> > > >         leaq    (, %rdx, 4), %rdx
> > > >  #  endif
> > > > --
> > > > 2.25.1
> > > >
> > >
> > > Isn't it a bug?  Is there a glibc bug? Should this also be fixed on release
> > > branches?
> >
> > It is a bug but no need for backport.
>
> Why no need for backport? Is there a testcase?

Oh no, you're right. It needs to be backported. Had thought it was a
different commit that introduced it.

Sorry, I'll update the commit message with more info, ping on the bugzilla,
and add a test case.

Going to push the rest of the patchset, will add v2 for this shortly.
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v2 12/31] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
  2022-03-23 21:57 ` [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] Noah Goldstein
  2022-03-24 18:59   ` H.J. Lu
@ 2022-03-24 20:50   ` Noah Goldstein
  2022-03-24 21:26     ` H.J. Lu
  1 sibling, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 20:50 UTC (permalink / raw)
  To: libc-alpha

The overflow fallback for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm,
not __wcscmp_avx2.

commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Sun Jan 9 16:02:21 2022 -0600

    x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]

That commit set the wrong fallback function for `__wcsncmp_avx2_rtm`:
it fell back to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm`, which
can cause spurious aborts.

This change will need to be backported.

All string/memory tests pass.
---
 sysdeps/x86/tst-strncmp-rtm.c          | 15 +++++++++++++++
 sysdeps/x86_64/multiarch/strcmp-avx2.S |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
index 300bc8c281..a3b14e72ff 100644
--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -70,6 +70,16 @@ function_overflow (void)
     return 1;
 }
 
+__attribute__ ((noinline, noclone))
+static int
+function_overflow2 (void)
+{
+  if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
+    return 0;
+  else
+    return 1;
+}
+
 static int
 do_test (void)
 {
@@ -77,5 +87,10 @@ do_test (void)
   if (status != EXIT_SUCCESS)
     return status;
   status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
+  if (status != EXIT_SUCCESS)
+    return status;
+  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
+  if (status != EXIT_SUCCESS)
+    return status;
   return status;
 }
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 52ff5ad724..86a86b68e3 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -122,7 +122,7 @@ ENTRY(STRCMP)
 	   are cases where length is large enough that it can never be a
 	   bound on valid memory so just use wcscmp.  */
 	shrq	$56, %rcx
-	jnz	__wcscmp_avx2
+	jnz	OVERFLOW_STRCMP
 
 	leaq	(, %rdx, 4), %rdx
 #  endif
-- 
2.25.1
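
A C model of the dispatch this patch corrects (a sketch, not glibc
code; it assumes a 64-bit size_t): a length with any of its top eight
bits set cannot bound valid memory once scaled by sizeof (wchar_t), so
the bounded compare is equivalent to an unbounded wcscmp, and from the
_rtm entry point that fallback must itself be the _rtm variant.

    #include <stddef.h>
    #include <wchar.h>

    static int
    wcsncmp_model (const wchar_t *a, const wchar_t *b, size_t n)
    {
      /* Models "shrq $56, %rcx; jnz OVERFLOW_STRCMP": n >= 2^56 can
         never be a real bound, so fall back to the unbounded compare.
         The bug was that the _rtm entry dispatched here to the non-RTM
         __wcscmp_avx2, which can cause spurious aborts inside
         transactions.  */
      if (n >> 56)
        return wcscmp (a, b);
      for (; n != 0; a++, b++, n--)
        {
          if (*a != *b)
            return *a < *b ? -1 : 1;
          if (*a == L'\0')
            return 0;
        }
      return 0;
    }

    int
    main (void)
    {
      /* An absurd bound takes the fallback path.  */
      return wcsncmp_model (L"abc", L"abd", (size_t) -1) < 0 ? 0 : 1;
    }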


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v2 12/31] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
  2022-03-24 20:50   ` [PATCH v2 12/31] " Noah Goldstein
@ 2022-03-24 21:26     ` H.J. Lu
  2022-03-24 21:43       ` Noah Goldstein
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 21:26 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Mar 24, 2022 at 1:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The overflow fallback for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm,
> not __wcscmp_avx2.
>
> commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
> Author: Noah Goldstein <goldstein.w.n@gmail.com>
> Date:   Sun Jan 9 16:02:21 2022 -0600
>
>     x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
>
> That commit set the wrong fallback function for `__wcsncmp_avx2_rtm`:
> it fell back to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm`, which
> can cause spurious aborts.
>
> This change will need to be backported.
>
> All string/memory tests pass.
> ---
>  sysdeps/x86/tst-strncmp-rtm.c          | 15 +++++++++++++++
>  sysdeps/x86_64/multiarch/strcmp-avx2.S |  2 +-
>  2 files changed, 16 insertions(+), 1 deletion(-)
>
> diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
> index 300bc8c281..a3b14e72ff 100644
> --- a/sysdeps/x86/tst-strncmp-rtm.c
> +++ b/sysdeps/x86/tst-strncmp-rtm.c
> @@ -70,6 +70,16 @@ function_overflow (void)
>      return 1;
>  }
>
> +__attribute__ ((noinline, noclone))
> +static int
> +function_overflow2 (void)
> +{
> +  if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
> +    return 0;
> +  else
> +    return 1;
> +}
> +
>  static int
>  do_test (void)
>  {
> @@ -77,5 +87,10 @@ do_test (void)
>    if (status != EXIT_SUCCESS)
>      return status;
>    status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
> +  if (status != EXIT_SUCCESS)
> +    return status;
> +  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
> +  if (status != EXIT_SUCCESS)
> +    return status;
>    return status;
>  }
> diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> index 52ff5ad724..86a86b68e3 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> @@ -122,7 +122,7 @@ ENTRY(STRCMP)
>            are cases where length is large enough that it can never be a
>            bound on valid memory so just use wcscmp.  */
>         shrq    $56, %rcx
> -       jnz     __wcscmp_avx2
> +       jnz     OVERFLOW_STRCMP
>
>         leaq    (, %rdx, 4), %rdx
>  #  endif
> --
> 2.25.1
>

LGTM.   Verified on an AVX2 machine with RTM.  Without the fix,
the new testcase failed.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v2 12/31] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
  2022-03-24 21:26     ` H.J. Lu
@ 2022-03-24 21:43       ` Noah Goldstein
  2022-03-24 21:58         ` H.J. Lu
  0 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 21:43 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Mar 24, 2022 at 4:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Mar 24, 2022 at 1:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The overflow fallback for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm,
> > not __wcscmp_avx2.
> >
> > commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
> > Author: Noah Goldstein <goldstein.w.n@gmail.com>
> > Date:   Sun Jan 9 16:02:21 2022 -0600
> >
> >     x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
> >
> > That commit set the wrong fallback function for `__wcsncmp_avx2_rtm`:
> > it fell back to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm`, which
> > can cause spurious aborts.
> >
> > This change will need to be backported.
> >
> > All string/memory tests pass.
> > ---
> >  sysdeps/x86/tst-strncmp-rtm.c          | 15 +++++++++++++++
> >  sysdeps/x86_64/multiarch/strcmp-avx2.S |  2 +-
> >  2 files changed, 16 insertions(+), 1 deletion(-)
> >
> > diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
> > index 300bc8c281..a3b14e72ff 100644
> > --- a/sysdeps/x86/tst-strncmp-rtm.c
> > +++ b/sysdeps/x86/tst-strncmp-rtm.c
> > @@ -70,6 +70,16 @@ function_overflow (void)
> >      return 1;
> >  }
> >
> > +__attribute__ ((noinline, noclone))
> > +static int
> > +function_overflow2 (void)
> > +{
> > +  if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
> > +    return 0;
> > +  else
> > +    return 1;
> > +}
> > +
> >  static int
> >  do_test (void)
> >  {
> > @@ -77,5 +87,10 @@ do_test (void)
> >    if (status != EXIT_SUCCESS)
> >      return status;
> >    status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
> > +  if (status != EXIT_SUCCESS)
> > +    return status;
> > +  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
> > +  if (status != EXIT_SUCCESS)
> > +    return status;
> >    return status;
> >  }
> > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > index 52ff5ad724..86a86b68e3 100644
> > --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > @@ -122,7 +122,7 @@ ENTRY(STRCMP)
> >            are cases where length is large enough that it can never be a
> >            bound on valid memory so just use wcscmp.  */
> >         shrq    $56, %rcx
> > -       jnz     __wcscmp_avx2
> > +       jnz     OVERFLOW_STRCMP
> >
> >         leaq    (, %rdx, 4), %rdx
> >  #  endif
> > --
> > 2.25.1
> >
>
> LGTM.   Verified on an AVX2 machine with RTM.  Without the fix,
> the new testcase failed.

And that w/ the fix it passes?
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v2 12/31] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
  2022-03-24 21:43       ` Noah Goldstein
@ 2022-03-24 21:58         ` H.J. Lu
  2022-05-04  6:05           ` Sunil Pandey
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 21:58 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Mar 24, 2022 at 2:43 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Mar 24, 2022 at 4:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 1:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > The overflow fallback for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm,
> > > not __wcscmp_avx2.
> > >
> > > commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
> > > Author: Noah Goldstein <goldstein.w.n@gmail.com>
> > > Date:   Sun Jan 9 16:02:21 2022 -0600
> > >
> > >     x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
> > >
> > > That commit set the wrong fallback function for `__wcsncmp_avx2_rtm`:
> > > it fell back to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm`, which
> > > can cause spurious aborts.
> > >
> > > This change will need to be backported.
> > >
> > > All string/memory tests pass.
> > > ---
> > >  sysdeps/x86/tst-strncmp-rtm.c          | 15 +++++++++++++++
> > >  sysdeps/x86_64/multiarch/strcmp-avx2.S |  2 +-
> > >  2 files changed, 16 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
> > > index 300bc8c281..a3b14e72ff 100644
> > > --- a/sysdeps/x86/tst-strncmp-rtm.c
> > > +++ b/sysdeps/x86/tst-strncmp-rtm.c
> > > @@ -70,6 +70,16 @@ function_overflow (void)
> > >      return 1;
> > >  }
> > >
> > > +__attribute__ ((noinline, noclone))
> > > +static int
> > > +function_overflow2 (void)
> > > +{
> > > +  if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
> > > +    return 0;
> > > +  else
> > > +    return 1;
> > > +}
> > > +
> > >  static int
> > >  do_test (void)
> > >  {
> > > @@ -77,5 +87,10 @@ do_test (void)
> > >    if (status != EXIT_SUCCESS)
> > >      return status;
> > >    status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
> > > +  if (status != EXIT_SUCCESS)
> > > +    return status;
> > > +  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
> > > +  if (status != EXIT_SUCCESS)
> > > +    return status;
> > >    return status;
> > >  }
> > > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > index 52ff5ad724..86a86b68e3 100644
> > > --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > @@ -122,7 +122,7 @@ ENTRY(STRCMP)
> > >            are cases where length is large enough that it can never be a
> > >            bound on valid memory so just use wcscmp.  */
> > >         shrq    $56, %rcx
> > > -       jnz     __wcscmp_avx2
> > > +       jnz     OVERFLOW_STRCMP
> > >
> > >         leaq    (, %rdx, 4), %rdx
> > >  #  endif
> > > --
> > > 2.25.1
> > >
> >
> > LGTM.   Verified on an AVX2 machine with RTM.  Without the fix,
> > the new testcase failed.
>
> And that w/ the fix it passes?

Yes.

> >
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> >
> > Thanks.
> >
> > --
> > H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v3 21/23] x86: Add AVX2 optimized str{n}casecmp
  2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
  2022-03-24 19:03   ` H.J. Lu
@ 2022-03-24 22:41   ` Noah Goldstein
  2022-03-24 22:41   ` [PATCH v3 22/23] x86: Add EVEX " Noah Goldstein
  2022-03-24 23:56   ` [PATCH v4 21/23] x86: Add AVX2 " Noah Goldstein
  3 siblings, 0 replies; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 22:41 UTC (permalink / raw)
  To: libc-alpha

geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702

All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
Double checked: strcasecmp_l-* is the proper
fallback for strncasecmp*. Also added a comment that
LOCALE_REG needs to be preserved until we finish
the fallback logic.
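
In C terms the constraint reads roughly as below (a sketch under
hypothetical naming; length_cannot_bound_memory stands in for the real
check, which differs between the narrow and wide variants):

    #include <stddef.h>
    #include <strings.h>
    #include <locale.h>

    /* Model only: for degenerate lengths, strncasecmp_l hands off to
       the unbounded strcasecmp_l, which still consumes the locale
       argument, so the register holding it (LOCALE_REG) must stay
       live until this dispatch is resolved.  */
    static int
    length_cannot_bound_memory (size_t n)
    {
      return (n >> 56) != 0;    /* assumes a 64-bit size_t */
    }

    static int
    strncasecmp_l_model (const char *a, const char *b, size_t n,
                         locale_t loc)
    {
      if (n == 0)
        return 0;                             /* L(one_or_less)-style exit */
      if (length_cannot_bound_memory (n))
        return strcasecmp_l (a, b, loc);      /* loc is still needed here */
      return strncasecmp_l (a, b, n, loc);    /* bounded path, delegated */
    }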

 sysdeps/x86_64/multiarch/Makefile             |   4 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  28 +++
 sysdeps/x86_64/multiarch/ifunc-strcasecmp.h   |  12 +
 .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S  |  15 ++
 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S  |  23 ++
 sysdeps/x86_64/multiarch/strcmp-avx2.S        | 233 +++++++++++++++---
 .../x86_64/multiarch/strncase_l-avx2-rtm.S    |  16 ++
 sysdeps/x86_64/multiarch/strncase_l-avx2.S    |  27 ++
 8 files changed, 327 insertions(+), 31 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7b413edad..06e1848823 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -55,6 +55,8 @@ sysdep_routines += \
   stpncpy-sse2-unaligned \
   stpncpy-ssse3 \
   strcasecmp_l-avx \
+  strcasecmp_l-avx2 \
+  strcasecmp_l-avx2-rtm \
   strcasecmp_l-sse2 \
   strcasecmp_l-sse4_2 \
   strcasecmp_l-ssse3 \
@@ -93,6 +95,8 @@ sysdep_routines += \
   strlen-evex \
   strlen-sse2 \
   strncase_l-avx \
+  strncase_l-avx2 \
+  strncase_l-avx2-rtm \
   strncase_l-sse2 \
   strncase_l-sse4_2 \
   strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a594f4176e..3c556d07ac 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strcasecmp_avx)
@@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strcasecmp_l_avx)
@@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strncasecmp_avx)
@@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strncasecmp_l_avx)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 9e3cc61ac0..c4de111fd0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+        return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+        return OPTIMIZE (avx2);
+    }
+
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
     return OPTIMIZE (avx);
 
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
new file mode 100644
index 0000000000..09957fc3c5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
@@ -0,0 +1,15 @@
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
new file mode 100644
index 0000000000..e2762f2a22
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 86a86b68e3..e16cc2378c 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -20,6 +20,10 @@
 
 # include <sysdep.h>
 
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
+
 # ifndef STRCMP
 #  define STRCMP	__strcmp_avx2
 # endif
@@ -74,13 +78,88 @@
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_avx2
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_avx2
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
 # define xmmZERO	xmm15
 # define ymmZERO	ymm15
 
+# define LCASE_MIN_ymm	%ymm10
+# define LCASE_MAX_ymm	%ymm11
+# define CASE_ADD_ymm	%ymm12
+
+# define LCASE_MIN_xmm	%xmm10
+# define LCASE_MAX_xmm	%xmm11
+# define CASE_ADD_xmm	%xmm12
+
+	/* r11 is never used elsewhere so this is safe to maintain.  */
+# define TOLOWER_BASE	%r11
+
 # ifndef SECTION
 #  define SECTION(p)	p##.avx
 # endif
 
+# ifdef USE_AS_STRCASECMP_L
+#  define REG(x, y) x ## y
+#  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext)			\
+	vpaddb	REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8);				\
+	vpaddb	REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9);				\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpaddb	REG(%ext, 8), reg1_in, reg1_out;							\
+	vpaddb	REG(%ext, 9), reg2_in, reg2_out
+
+#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_ymm(...)	TOLOWER(__VA_ARGS__, ymm)
+#  define TOLOWER_xmm(...)	TOLOWER(__VA_ARGS__, xmm)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext)			\
+	TOLOWER	(s1_reg, scratch_reg, s2_reg, s2_reg, ext);					\
+	VPCMPEQ	scratch_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext)			\
+	VMOVU	s2_mem, reg_out;											\
+	CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+#  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+#  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_ymm(...)
+#  define TOLOWER_xmm(...)
+
+#  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out)			\
+	VPCMPEQ	s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+#  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
            strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -102,8 +181,49 @@
    returned.  */
 
 	.section SECTION(.text), "ax", @progbits
-ENTRY(STRCMP)
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifndef GLABEL
+#  define GLABEL(...)	__VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
+END (GLABEL(STRCASECMP))
+	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
+#  endif
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
 # ifdef USE_AS_STRNCMP
+	/* Don't overwrite LOCALE_REG (rcx) until we have passed
+	   L(one_or_less). Otherwise we might use the wrong locale in
+	   the OVERFLOW_STRCMP (strcasecmp_l).  */
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -128,6 +248,30 @@ ENTRY(STRCMP)
 #  endif
 # endif
 	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa	L(lcase_min)(%rip), LCASE_MIN_ymm
+	vmovdqa	L(lcase_max)(%rip), LCASE_MAX_ymm
+	vmovdqa	L(case_add)(%rip), CASE_ADD_ymm
+# endif
 	movl	%edi, %eax
 	orl	%esi, %eax
 	sall	$20, %eax
@@ -138,8 +282,10 @@ ENTRY(STRCMP)
 L(no_page_cross):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	(%rdi), %ymm0
-	/* 1s where s1 and s2 equal.  */
-	VPCMPEQ	(%rsi), %ymm0, %ymm1
+	/* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
+	   Otherwise converts ymm0 and the load from rsi to lowercase. ymm2 is
+	   scratch and ymm1 is the return.  */
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
 	/* 1s at null CHAR.  */
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	/* 1s where s1 and s2 equal AND not null CHAR.  */
@@ -172,6 +318,8 @@ L(return_vec_0):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret0):
@@ -207,6 +355,8 @@ L(one_or_less):
 #  else
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret1):
@@ -234,6 +384,8 @@ L(return_vec_1):
 # else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret2):
@@ -265,6 +417,8 @@ L(return_vec_2):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret3):
@@ -285,6 +439,8 @@ L(return_vec_3):
 #  else
 	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret4):
@@ -295,7 +451,7 @@ L(ret4):
 L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	VEC_SIZE(%rdi), %ymm0
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -308,7 +464,7 @@ L(more_3x_vec):
 # endif
 
 	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -316,7 +472,7 @@ L(more_3x_vec):
 	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -391,12 +547,10 @@ L(loop_skip_page_cross_check):
 	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
 
 	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
-	VPCMPEQ	(VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
-
-	VPCMPEQ	(VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
-
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+	CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
 
 	/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
 	   zero.  */
@@ -465,6 +619,8 @@ L(return_vec_2_3_end):
 # else
 	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
 	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -508,6 +664,8 @@ L(return_vec_0_end):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -530,6 +688,8 @@ L(return_vec_1_end):
 #  else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -556,6 +716,8 @@ L(return_vec_2_end):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -583,7 +745,7 @@ L(page_cross_during_loop):
 	jle	L(less_1x_vec_till_page_cross)
 
 	VMOVA	(%rdi), %ymm0
-	VPCMPEQ	(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -605,7 +767,7 @@ L(less_1x_vec_till_page_cross):
 	   here, it means the previous page (rdi - VEC_SIZE) has already
 	   been loaded earlier so must be valid.  */
 	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
-	VPCMPEQ	-VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -647,6 +809,8 @@ L(return_page_cross_cmp_mem):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -673,7 +837,7 @@ L(more_2x_vec_till_page_cross):
 	   iteration here.  */
 
 	VMOVU	VEC_SIZE(%rdi), %ymm0
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -689,7 +853,7 @@ L(more_2x_vec_till_page_cross):
 
 	/* Safe to include comparisons from lower bytes.  */
 	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
-	VPCMPEQ	-(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -697,7 +861,7 @@ L(more_2x_vec_till_page_cross):
 	jnz	L(return_vec_page_cross_0)
 
 	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
-	VPCMPEQ	-(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -715,8 +879,8 @@ L(more_2x_vec_till_page_cross):
 	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
 	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
 
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
 	vpand	%ymm4, %ymm5, %ymm5
 	vpand	%ymm6, %ymm7, %ymm7
 	VPMINU	%ymm5, %ymm7, %ymm7
@@ -767,6 +931,8 @@ L(return_vec_page_cross_1):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -822,7 +988,7 @@ L(page_cross):
 L(page_cross_loop):
 
 	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -840,11 +1006,11 @@ L(page_cross_loop):
 	subl	%eax, %OFFSET_REG
 	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
 	   to not cross page so is safe to load. Since we have already
-	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
-	 */
+	   loaded at least 1 VEC from rsi it is also guranteed to be
+	   safe.  */
 
 	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -877,6 +1043,8 @@ L(ret_vec_page_cross_cont):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -930,7 +1098,7 @@ L(less_1x_vec_till_page):
 	ja	L(less_16_till_page)
 
 	VMOVU	(%rdi), %xmm0
-	VPCMPEQ	(%rsi), %xmm0, %xmm1
+	CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
@@ -948,7 +1116,7 @@ L(less_1x_vec_till_page):
 # endif
 
 	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %xmm0, %xmm1
+	CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
@@ -986,7 +1154,7 @@ L(less_16_till_page):
 	vmovq	(%rdi), %xmm0
 	vmovq	(%rsi), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	incb	%cl
@@ -1006,7 +1174,7 @@ L(less_16_till_page):
 	vmovq	(%rdi, %OFFSET_REG64), %xmm0
 	vmovq	(%rsi, %OFFSET_REG64), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	incb	%cl
@@ -1062,7 +1230,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi), %xmm0
 	vmovd	(%rsi), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	subl	$0xf, %ecx
@@ -1081,7 +1249,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi, %OFFSET_REG64), %xmm0
 	vmovd	(%rsi, %OFFSET_REG64), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	subl	$0xf, %ecx
@@ -1115,7 +1283,9 @@ L(less_4_till_page):
 L(less_4_loop):
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi, %rdi), %ecx
-	subl	%ecx, %eax
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
 	jnz	L(ret_less_4_loop)
 	testl	%ecx, %ecx
 	jz	L(ret_zero_4_loop)
@@ -1142,5 +1312,6 @@ L(ret_less_4_loop):
 	subl	%r8d, %eax
 	ret
 # endif
-END(STRCMP)
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
new file mode 100644
index 0000000000..58c05dcfb8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
@@ -0,0 +1,16 @@
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+#define OVERFLOW_STRCMP	__strcasecmp_l_avx2_rtm
+
+#include "strncase_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
new file mode 100644
index 0000000000..48c0aa21f8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
@@ -0,0 +1,27 @@
+/* strncasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP	__strcasecmp_l_avx2
+#endif
+#include "strcmp-avx2.S"
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v3 22/23] x86: Add EVEX optimized str{n}casecmp
  2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
  2022-03-24 19:03   ` H.J. Lu
  2022-03-24 22:41   ` [PATCH v3 " Noah Goldstein
@ 2022-03-24 22:41   ` Noah Goldstein
  2022-03-24 23:56   ` [PATCH v4 21/23] x86: Add AVX2 " Noah Goldstein
  3 siblings, 0 replies; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 22:41 UTC (permalink / raw)
  To: libc-alpha

geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621

All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/multiarch/Makefile            |   2 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  16 ++
 sysdeps/x86_64/multiarch/ifunc-strcasecmp.h  |   5 +
 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S |  23 ++
 sysdeps/x86_64/multiarch/strcmp-evex.S       | 286 ++++++++++++++++---
 sysdeps/x86_64/multiarch/strncase_l-evex.S   |  25 ++
 6 files changed, 317 insertions(+), 40 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 06e1848823..35d80dc2ff 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -57,6 +57,7 @@ sysdep_routines += \
   strcasecmp_l-avx \
   strcasecmp_l-avx2 \
   strcasecmp_l-avx2-rtm \
+  strcasecmp_l-evex \
   strcasecmp_l-sse2 \
   strcasecmp_l-sse4_2 \
   strcasecmp_l-ssse3 \
@@ -97,6 +98,7 @@ sysdep_routines += \
   strncase_l-avx \
   strncase_l-avx2 \
   strncase_l-avx2-rtm \
+  strncase_l-evex \
   strncase_l-sse2 \
   strncase_l-sse4_2 \
   strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 3c556d07ac..f1a4d3dac2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcasecmp_evex)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strcasecmp_avx2)
@@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcasecmp_l_evex)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strcasecmp_l_avx2)
@@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncasecmp_evex)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strncasecmp_avx2)
@@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncasecmp_l_evex)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strncasecmp_l_avx2)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index c4de111fd0..bf0d146e7f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
@@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+        return OPTIMIZE (evex);
+
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
         return OPTIMIZE (avx2_rtm);
 
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
new file mode 100644
index 0000000000..58642db748
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with EVEX.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_evex
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 56d8c118e4..002dd600ed 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -19,6 +19,9 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
 
 # ifndef STRCMP
 #  define STRCMP	__strcmp_evex
@@ -34,19 +37,29 @@
 # define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSCMP
-#  define TESTEQ	subl	$0xff,
+#  ifndef OVERFLOW_STRCMP
+#   define OVERFLOW_STRCMP	__wcscmp_evex
+#  endif
+
+#  define TESTEQ	subl $0xff,
 	/* Compare packed dwords.  */
 #  define VPCMP	vpcmpd
 #  define VPMINU	vpminud
 #  define VPTESTM	vptestmd
+#  define VPTESTNM	vptestnmd
 	/* 1 dword char == 4 bytes.  */
 #  define SIZE_OF_CHAR	4
 # else
+#  ifndef OVERFLOW_STRCMP
+#   define OVERFLOW_STRCMP	__strcmp_evex
+#  endif
+
 #  define TESTEQ	incl
 	/* Compare packed bytes.  */
 #  define VPCMP	vpcmpb
 #  define VPMINU	vpminub
 #  define VPTESTM	vptestmb
+#  define VPTESTNM	vptestnmb
 	/* 1 byte char == 1 byte.  */
 #  define SIZE_OF_CHAR	1
 # endif
@@ -73,11 +86,16 @@
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
-# define XMMZERO	xmm16
 # define XMM0	xmm17
 # define XMM1	xmm18
 
-# define YMMZERO	ymm16
+# define XMM10	xmm27
+# define XMM11	xmm28
+# define XMM12	xmm29
+# define XMM13	xmm30
+# define XMM14	xmm31
+
+
 # define YMM0	ymm17
 # define YMM1	ymm18
 # define YMM2	ymm19
@@ -89,6 +107,87 @@
 # define YMM8	ymm25
 # define YMM9	ymm26
 # define YMM10	ymm27
+# define YMM11	ymm28
+# define YMM12	ymm29
+# define YMM13	ymm30
+# define YMM14	ymm31
+
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_evex
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_evex
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
+# define LCASE_MIN_YMM	%YMM12
+# define LCASE_MAX_YMM	%YMM13
+# define CASE_ADD_YMM	%YMM14
+
+# define LCASE_MIN_XMM	%XMM12
+# define LCASE_MAX_XMM	%XMM13
+# define CASE_ADD_XMM	%XMM14
+
+	/* NB: wcsncmp uses r11 but strcasecmp is never used in
+	   conjunction with wcscmp.  */
+# define TOLOWER_BASE	%r11
+
+# ifdef USE_AS_STRCASECMP_L
+#  define _REG(x, y) x ## y
+#  define REG(x, y) _REG(x, y)
+#  define TOLOWER(reg1, reg2, ext)										\
+	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);					\
+	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);					\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;				\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;				\
+	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};						\
+	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}
+
+#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM)
+#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)						\
+	TOLOWER	(s1_reg, s2_reg, ext);										\
+	VPCMP	$0, s1_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)				\
+	VMOVU	s2_mem, s2_reg;												\
+	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
+
+#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
+#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
+
+#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
+#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_YMM(...)
+#  define TOLOWER_XMM(...)
+
+#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)						\
+	VPCMP	$0, s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+
+#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)				\
+	VPCMP	$0, s2_mem, s1_reg, reg_out
+
+#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+# endif
 
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
@@ -112,8 +211,45 @@
    returned.  */
 
 	.section .text.evex, "ax", @progbits
-ENTRY(STRCMP)
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (STRCASECMP)
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
+END (STRCASECMP)
+	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
+#  endif
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
 # ifdef USE_AS_STRNCMP
+	/* Don't overwrite LOCALE_REG (rcx) until we have passed
+	   L(one_or_less). Otherwise we might use the wrong locale in
+	   the OVERFLOW_STRCMP call (strcasecmp_l).  */
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -125,6 +261,32 @@ ENTRY(STRCMP)
 	   actually bound the buffer.  */
 	jle	L(one_or_less)
 # endif
+
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+L(lcase_max):
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
+	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
+	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
+# endif
+
 	movl	%edi, %eax
 	orl	%esi, %eax
 	/* Shift out the bits irrelevant to page boundary ([63:12]).  */
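
Taken together, L(lcase_min) (0x41 == 'A'), L(lcase_max) (0x1a == 26) and
L(case_add) (0x20) implement the usual ASCII case-fold range check, only
vectorized through a mask register.  A scalar C model of one byte (a
sketch, not the glibc source):

  static inline unsigned char
  tolower_ascii_evex (unsigned char c)
  {
    /* vpsubb L(lcase_min), then vpcmpub $1 (unsigned less-than)
       against L(lcase_max) builds the "is uppercase" mask in %k5/%k6.  */
    if ((unsigned char) (c - 0x41) < 0x1a)
      /* vpaddb of L(case_add), performed only under that mask.  */
      c += 0x20;
    return c;
  }
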
@@ -139,7 +301,7 @@ L(no_page_cross):
 	VPTESTM	%YMM0, %YMM0, %k2
 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 	   in YMM0 and 32 bytes at (%rsi).  */
-	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_STRNCMP
 	cmpq	$CHAR_PER_VEC, %rdx
@@ -169,6 +331,8 @@ L(return_vec_0):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret0):
@@ -189,10 +353,10 @@ L(ret_zero):
 	.p2align 4,, 5
 L(one_or_less):
 	jb	L(ret_zero)
-#  ifdef USE_AS_WCSCMP
 	/* 'nbe' covers the case where length is negative (large
 	   unsigned).  */
-	jnbe	__wcscmp_evex
+	jnbe	OVERFLOW_STRCMP
+#  ifdef USE_AS_WCSCMP
 	movl	(%rdi), %edx
 	xorl	%eax, %eax
 	cmpl	(%rsi), %edx
@@ -201,11 +365,10 @@ L(one_or_less):
 	negl	%eax
 	orl	$1, %eax
 #  else
-	/* 'nbe' covers the case where length is negative (large
-	   unsigned).  */
-	jnbe	__strcmp_evex
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret1):
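
The jnbe OVERFLOW_STRCMP dispatch above exists because a length that is
negative as a signed value can never actually bound the buffers, so the
bounded routine degrades to the unbounded one (the wrapper files define
OVERFLOW_STRCMP, e.g. __strcasecmp_l_evex for strncasecmp_l).  A C model
of L(one_or_less), using plain strcasecmp in place of the locale-aware
fallback (a sketch):

  #include <ctype.h>
  #include <stddef.h>
  #include <strings.h>

  /* Reached only when the length is <= 1 as a signed value.  */
  static int
  one_or_less (const char *s1, const char *s2, size_t n)
  {
    if (n == 0)
      return 0;				/* jb L(ret_zero) */
    if (n > 1)				/* here n >= 2^63: 'nbe' */
      return strcasecmp (s1, s2);	/* jnbe OVERFLOW_STRCMP */
    /* n == 1: one byte through the tolower table.  */
    int a = tolower ((unsigned char) s1[0]);
    int b = tolower ((unsigned char) s2[0]);
    return a - b;
  }
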
@@ -233,6 +396,8 @@ L(return_vec_1):
 # else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret2):
@@ -270,6 +435,8 @@ L(return_vec_2):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret3):
@@ -290,6 +457,8 @@ L(return_vec_3):
 #  else
 	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret4):
@@ -303,7 +472,7 @@ L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	(VEC_SIZE)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1)
@@ -315,14 +484,14 @@ L(more_3x_vec):
 
 	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_3)
@@ -381,7 +550,6 @@ L(prepare_loop_aligned):
 	subl	%esi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 
-	vpxorq	%YMMZERO, %YMMZERO, %YMMZERO
 
 	/* Loop 4x comparisons at a time.  */
 	.p2align 4
@@ -413,22 +581,35 @@ L(loop_skip_page_cross_check):
 	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
 	VPMINU	%YMM8, %YMM9, %YMM9
 
-	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
+	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
 	VPTESTM	%YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
 	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
 	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
 	   oring with YMM1. Result is stored in YMM6.  */
 	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
-
+# else
+	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
+	TOLOWER_YMM (%YMM0, %YMM1)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
+	TOLOWER_YMM (%YMM2, %YMM3)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
+	TOLOWER_YMM (%YMM4, %YMM5)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
+	TOLOWER_YMM (%YMM6, %YMM7)
+	vpxorq	%YMM0, %YMM1, %YMM1
+	vpxorq	%YMM2, %YMM3, %YMM3
+	vpxorq	%YMM4, %YMM5, %YMM5
+	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+# endif
 	/* Or together YMM3, YMM5, and YMM6.  */
 	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
 
 
 	/* A non-zero CHAR in YMM6 represents a mismatch.  */
-	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
+	VPTESTNM %YMM6, %YMM6, %k0{%k1}
 	kmovd	%k0, %LOOP_REG
 
 	TESTEQ	%LOOP_REG
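
Both vpternlogd immediates are plain 3-input truth tables: for every bit
position the result is imm[(a << 2) | (b << 1) | c], where a is the
destination operand and b/c the two sources.  A scalar model (a sketch):

  #include <stdint.h>

  static inline uint32_t
  ternlog (uint8_t imm, uint32_t a, uint32_t b, uint32_t c)
  {
    uint32_t r = 0;
    for (int i = 0; i < 32; i++)
      {
	unsigned idx = (((a >> i) & 1) << 2) | (((b >> i) & 1) << 1)
		       | ((c >> i) & 1);
	r |= (uint32_t) ((imm >> idx) & 1) << i;
      }
    return r;
  }

  /* $0xde is (a ^ c) | b: xor YMM6 with the s2 load while or-ing in
     YMM1.  $0xfe is a | b | c: or YMM6, YMM5 and YMM3 together.  */
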
@@ -437,13 +618,13 @@ L(loop_skip_page_cross_check):
 
 	/* Find which VEC has the mismatch of end of string.  */
 	VPTESTM	%YMM0, %YMM0, %k1
-	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
+	VPTESTNM %YMM1, %YMM1, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_0_end)
 
 	VPTESTM	%YMM2, %YMM2, %k1
-	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
+	VPTESTNM %YMM3, %YMM3, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1_end)
@@ -457,7 +638,7 @@ L(return_vec_2_3_end):
 # endif
 
 	VPTESTM	%YMM4, %YMM4, %k1
-	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
+	VPTESTNM %YMM5, %YMM5, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 # if CHAR_PER_VEC <= 16
@@ -493,6 +674,8 @@ L(return_vec_3_end):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -545,6 +728,8 @@ L(return_vec_0_end):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
 	   logic. Subtract `r8d` after xor for zero case.  */
@@ -569,6 +754,8 @@ L(return_vec_1_end):
 #  else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -598,7 +785,7 @@ L(page_cross_during_loop):
 
 	VMOVA	(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_0_end)
@@ -619,8 +806,7 @@ L(less_1x_vec_till_page_cross):
 	   been loaded earlier so must be valid.  */
 	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
-
+	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
 	/* Mask of potentially valid bits. The lower bits can be out of
 	   range comparisons (but safe regarding page crosses).  */
 
@@ -642,6 +828,8 @@ L(less_1x_vec_till_page_cross):
 
 # ifdef USE_AS_STRNCMP
 #  ifdef USE_AS_WCSCMP
+	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
+	   safe.  */
 	movl	%eax, %r11d
 	shrl	$2, %r11d
 	cmpq	%r11, %rdx
@@ -679,6 +867,8 @@ L(return_page_cross_cmp_mem):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -709,7 +899,7 @@ L(more_2x_vec_till_page_cross):
 
 	VMOVA	VEC_SIZE(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1_end)
@@ -724,14 +914,14 @@ L(more_2x_vec_till_page_cross):
 	/* Safe to include comparisons from lower bytes.  */
 	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_page_cross_0)
 
 	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_page_cross_1)
@@ -740,6 +930,8 @@ L(more_2x_vec_till_page_cross):
 	/* Must check length here as length might preclude reading next
 	   page.  */
 #  ifdef USE_AS_WCSCMP
+	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
+	   safe.  */
 	movl	%eax, %r11d
 	shrl	$2, %r11d
 	cmpq	%r11, %rdx
@@ -754,12 +946,19 @@ L(more_2x_vec_till_page_cross):
 	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
 	VPMINU	%YMM4, %YMM6, %YMM9
 	VPTESTM	%YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
 	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
 	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
 	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
-
-	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
+# else
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
+	TOLOWER_YMM (%YMM4, %YMM5)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
+	TOLOWER_YMM (%YMM6, %YMM7)
+	vpxorq	%YMM4, %YMM5, %YMM5
+	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
+# endif
+	VPTESTNM %YMM6, %YMM6, %k0{%k1}
 	kmovd	%k0, %LOOP_REG
 	TESTEQ	%LOOP_REG
 	jnz	L(return_vec_2_3_end)
@@ -815,6 +1014,8 @@ L(return_vec_page_cross_1):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -871,7 +1072,7 @@ L(page_cross):
 L(page_cross_loop):
 	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -895,7 +1096,7 @@ L(page_cross_loop):
 	 */
 	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
 
 	kmovd	%k1, %ecx
 # ifdef USE_AS_STRNCMP
@@ -930,6 +1131,8 @@ L(ret_vec_page_cross_cont):
 # else
 	movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax
 	movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -989,7 +1192,7 @@ L(less_1x_vec_till_page):
 	/* Use 16 byte comparison.  */
 	vmovdqu	(%rdi), %xmm0
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, (%rsi), %xmm0, %k1{%k2}
+	CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0xf, %ecx
@@ -1009,7 +1212,7 @@ L(less_1x_vec_till_page):
 # endif
 	vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
+	CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0xf, %ecx
@@ -1048,7 +1251,7 @@ L(less_16_till_page):
 	vmovq	(%rdi), %xmm0
 	vmovq	(%rsi), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0x3, %ecx
@@ -1068,7 +1271,7 @@ L(less_16_till_page):
 	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0x3, %ecx
@@ -1128,7 +1331,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi), %xmm0
 	vmovd	(%rsi), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 	subl	$0xf, %ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -1143,7 +1346,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 	subl	$0xf, %ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -1176,7 +1379,9 @@ L(less_4_till_page):
 L(less_4_loop):
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi, %rdi), %ecx
-	subl	%ecx, %eax
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
 	jnz	L(ret_less_4_loop)
 	testl	%ecx, %ecx
 	jz	L(ret_zero_4_loop)
@@ -1203,5 +1408,6 @@ L(ret_less_4_loop):
 	subl	%r8d, %eax
 	ret
 # endif
-END(STRCMP)
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
new file mode 100644
index 0000000000..8a5af3695c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
@@ -0,0 +1,25 @@
+/* strncasecmp_l optimized with EVEX.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_evex
+#endif
+#define OVERFLOW_STRCMP	__strcasecmp_l_evex
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#include "strcmp-evex.S"
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v4 21/23] x86: Add AVX2 optimized str{n}casecmp
  2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-03-24 22:41   ` [PATCH v3 22/23] x86: Add EVEX " Noah Goldstein
@ 2022-03-24 23:56   ` Noah Goldstein
  2022-03-24 23:56     ` [PATCH v4 22/23] x86: Add EVEX " Noah Goldstein
  2022-03-25 18:14     ` [PATCH v4 21/23] x86: Add AVX2 " H.J. Lu
  3 siblings, 2 replies; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 23:56 UTC (permalink / raw)
  To: libc-alpha

geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702

All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/multiarch/Makefile             |   4 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  28 +++
 sysdeps/x86_64/multiarch/ifunc-strcasecmp.h   |  12 +
 .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S  |  15 ++
 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S  |  23 ++
 sysdeps/x86_64/multiarch/strcmp-avx2.S        | 237 +++++++++++++++---
 .../x86_64/multiarch/strncase_l-avx2-rtm.S    |  16 ++
 sysdeps/x86_64/multiarch/strncase_l-avx2.S    |  27 ++
 8 files changed, 331 insertions(+), 31 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7b413edad..06e1848823 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -55,6 +55,8 @@ sysdep_routines += \
   stpncpy-sse2-unaligned \
   stpncpy-ssse3 \
   strcasecmp_l-avx \
+  strcasecmp_l-avx2 \
+  strcasecmp_l-avx2-rtm \
   strcasecmp_l-sse2 \
   strcasecmp_l-sse4_2 \
   strcasecmp_l-ssse3 \
@@ -93,6 +95,8 @@ sysdep_routines += \
   strlen-evex \
   strlen-sse2 \
   strncase_l-avx \
+  strncase_l-avx2 \
+  strncase_l-avx2-rtm \
   strncase_l-sse2 \
   strncase_l-sse4_2 \
   strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a594f4176e..3c556d07ac 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strcasecmp_avx)
@@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strcasecmp_l_avx)
@@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strncasecmp_avx)
@@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strncasecmp_l_avx)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 9e3cc61ac0..c4de111fd0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+        return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+        return OPTIMIZE (avx2);
+    }
+
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
     return OPTIMIZE (avx);
 
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
new file mode 100644
index 0000000000..09957fc3c5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
@@ -0,0 +1,15 @@
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
new file mode 100644
index 0000000000..e2762f2a22
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 86a86b68e3..8da09bd86d 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -20,6 +20,10 @@
 
 # include <sysdep.h>
 
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
+
 # ifndef STRCMP
 #  define STRCMP	__strcmp_avx2
 # endif
@@ -74,13 +78,88 @@
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_avx2
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_avx2
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
 # define xmmZERO	xmm15
 # define ymmZERO	ymm15
 
+# define LCASE_MIN_ymm	%ymm10
+# define LCASE_MAX_ymm	%ymm11
+# define CASE_ADD_ymm	%ymm12
+
+# define LCASE_MIN_xmm	%xmm10
+# define LCASE_MAX_xmm	%xmm11
+# define CASE_ADD_xmm	%xmm12
+
+	/* r11 is never used elsewhere so it is safe to keep the
+	   tolower table base there.  */
+# define TOLOWER_BASE	%r11
+
 # ifndef SECTION
 #  define SECTION(p)	p##.avx
 # endif
 
+# ifdef USE_AS_STRCASECMP_L
+#  define REG(x, y) x ## y
+#  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext)			\
+	vpaddb	REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8);				\
+	vpaddb	REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9);				\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpaddb	REG(%ext, 8), reg1_in, reg1_out;							\
+	vpaddb	REG(%ext, 9), reg2_in, reg2_out
+
+#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_ymm(...)	TOLOWER(__VA_ARGS__, ymm)
+#  define TOLOWER_xmm(...)	TOLOWER(__VA_ARGS__, xmm)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext)			\
+	TOLOWER	(s1_reg, scratch_reg, s2_reg, s2_reg, ext);					\
+	VPCMPEQ	scratch_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext)			\
+	VMOVU	s2_mem, reg_out;											\
+	CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+#  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+#  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_ymm(...)
+#  define TOLOWER_xmm(...)
+
+#  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out)			\
+	VPCMPEQ	s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+#  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
            strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -102,8 +181,49 @@
    returned.  */
 
 	.section SECTION(.text), "ax", @progbits
-ENTRY(STRCMP)
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifndef GLABEL
+#  define GLABEL(...)	__VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
+END (GLABEL(STRCASECMP))
+	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
+#  endif
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
 # ifdef USE_AS_STRNCMP
+	/* Don't overwrite LOCALE_REG (rcx) until we have passed
+	   L(one_or_less). Otherwise we might use the wrong locale in
+	   the OVERFLOW_STRCMP call (strcasecmp_l).  */
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -128,6 +248,30 @@ ENTRY(STRCMP)
 #  endif
 # endif
 	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa	L(lcase_min)(%rip), LCASE_MIN_ymm
+	vmovdqa	L(lcase_max)(%rip), LCASE_MAX_ymm
+	vmovdqa	L(case_add)(%rip), CASE_ADD_ymm
+# endif
 	movl	%edi, %eax
 	orl	%esi, %eax
 	sall	$20, %eax
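
Unlike the EVEX version, AVX2 has neither an unsigned byte compare nor
mask registers, so L(lcase_min)/L(lcase_max)/L(case_add) here encode a
signed-range trick: vpaddb with 0x3f biases 'A'..'Z' (0x41..0x5a) to
0x80..0x99, the only byte values that are not greater than
(signed char) 0x99.  A scalar C model (a sketch; the byte casts wrap the
same way vpaddb does):

  static inline unsigned char
  tolower_ascii_avx2 (unsigned char c)
  {
    signed char biased = (signed char) (unsigned char) (c + 0x3f);
    int not_upper = biased > (signed char) 0x99;     /* vpcmpgtb */
    unsigned char addend = not_upper ? 0x00 : 0x20;  /* vpandn */
    return c + addend;				     /* vpaddb */
  }
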
@@ -138,8 +282,10 @@ ENTRY(STRCMP)
 L(no_page_cross):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	(%rdi), %ymm0
-	/* 1s where s1 and s2 equal.  */
-	VPCMPEQ	(%rsi), %ymm0, %ymm1
+	/* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
+	   Otherwise converts ymm0 and the load from rsi to lowercase
+	   first. ymm2 is scratch and ymm1 is the return.  */
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
 	/* 1s at null CHAR.  */
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	/* 1s where s1 and s2 equal AND not null CHAR.  */
@@ -172,6 +318,8 @@ L(return_vec_0):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret0):
@@ -192,6 +340,10 @@ L(ret_zero):
 
 	.p2align 4,, 5
 L(one_or_less):
+#  ifdef USE_AS_STRCASECMP_L
+	/* Set locale argument for strcasecmp.  */
+	movq	%LOCALE_REG, %rdx
+#  endif
 	jb	L(ret_zero)
 	/* 'nbe' covers the case where length is negative (large
 	   unsigned).  */
@@ -207,6 +359,8 @@ L(one_or_less):
 #  else
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret1):
@@ -234,6 +388,8 @@ L(return_vec_1):
 # else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret2):
@@ -265,6 +421,8 @@ L(return_vec_2):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret3):
@@ -285,6 +443,8 @@ L(return_vec_3):
 #  else
 	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret4):
@@ -295,7 +455,7 @@ L(ret4):
 L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	VEC_SIZE(%rdi), %ymm0
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -308,7 +468,7 @@ L(more_3x_vec):
 # endif
 
 	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -316,7 +476,7 @@ L(more_3x_vec):
 	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -391,12 +551,10 @@ L(loop_skip_page_cross_check):
 	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
 
 	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
-	VPCMPEQ	(VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
-
-	VPCMPEQ	(VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
-
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+	CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
 
 	/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
 	   zero.  */
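
The combine that follows (vpand of the match mask with the source data,
then a VPMINU tree) works because the mask is all-ones exactly on equal
bytes: the AND leaves a zero byte where there was a mismatch or a NUL,
and the minimum funnels all four vectors into one zero test.  A
flattened scalar model of an iteration (a sketch; the TOLOWER transform
for strcasecmp is folded into the equality test):

  #include <stddef.h>
  #include <stdint.h>

  /* Nonzero iff this 4-vector iteration must leave the loop.  */
  static int
  loop_iteration_event (const uint8_t *s1, const uint8_t *s2,
			size_t nbytes /* 4 * VEC_SIZE */)
  {
    uint8_t min = 0xff;
    for (size_t i = 0; i < nbytes; i++)
      {
	uint8_t eq = s1[i] == s2[i] ? 0xff : 0x00;  /* CMP_R1_S2_ymm */
	uint8_t v = s1[i] & eq;			    /* vpand */
	if (v < min)				    /* VPMINU */
	  min = v;
      }
    return min == 0;	/* VPCMPEQ against ymmZERO + vpmovmskb */
  }
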
@@ -465,6 +623,8 @@ L(return_vec_2_3_end):
 # else
 	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
 	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -508,6 +668,8 @@ L(return_vec_0_end):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -530,6 +692,8 @@ L(return_vec_1_end):
 #  else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -556,6 +720,8 @@ L(return_vec_2_end):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -583,7 +749,7 @@ L(page_cross_during_loop):
 	jle	L(less_1x_vec_till_page_cross)
 
 	VMOVA	(%rdi), %ymm0
-	VPCMPEQ	(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -605,7 +771,7 @@ L(less_1x_vec_till_page_cross):
 	   here, it means the previous page (rdi - VEC_SIZE) has already
 	   been loaded earlier so must be valid.  */
 	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
-	VPCMPEQ	-VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -647,6 +813,8 @@ L(return_page_cross_cmp_mem):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -673,7 +841,7 @@ L(more_2x_vec_till_page_cross):
 	   iteration here.  */
 
 	VMOVU	VEC_SIZE(%rdi), %ymm0
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -689,7 +857,7 @@ L(more_2x_vec_till_page_cross):
 
 	/* Safe to include comparisons from lower bytes.  */
 	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
-	VPCMPEQ	-(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -697,7 +865,7 @@ L(more_2x_vec_till_page_cross):
 	jnz	L(return_vec_page_cross_0)
 
 	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
-	VPCMPEQ	-(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -715,8 +883,8 @@ L(more_2x_vec_till_page_cross):
 	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
 	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
 
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
 	vpand	%ymm4, %ymm5, %ymm5
 	vpand	%ymm6, %ymm7, %ymm7
 	VPMINU	%ymm5, %ymm7, %ymm7
@@ -767,6 +935,8 @@ L(return_vec_page_cross_1):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -822,7 +992,7 @@ L(page_cross):
 L(page_cross_loop):
 
 	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -840,11 +1010,11 @@ L(page_cross_loop):
 	subl	%eax, %OFFSET_REG
 	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
 	   to not cross page so is safe to load. Since we have already
-	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
-	 */
+	   loaded at least 1 VEC from rsi it is also guaranteed to be
+	   safe.  */
 
 	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -877,6 +1047,8 @@ L(ret_vec_page_cross_cont):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -930,7 +1102,7 @@ L(less_1x_vec_till_page):
 	ja	L(less_16_till_page)
 
 	VMOVU	(%rdi), %xmm0
-	VPCMPEQ	(%rsi), %xmm0, %xmm1
+	CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
@@ -948,7 +1120,7 @@ L(less_1x_vec_till_page):
 # endif
 
 	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %xmm0, %xmm1
+	CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
@@ -986,7 +1158,7 @@ L(less_16_till_page):
 	vmovq	(%rdi), %xmm0
 	vmovq	(%rsi), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	incb	%cl
@@ -1006,7 +1178,7 @@ L(less_16_till_page):
 	vmovq	(%rdi, %OFFSET_REG64), %xmm0
 	vmovq	(%rsi, %OFFSET_REG64), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	incb	%cl
@@ -1062,7 +1234,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi), %xmm0
 	vmovd	(%rsi), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	subl	$0xf, %ecx
@@ -1081,7 +1253,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi, %OFFSET_REG64), %xmm0
 	vmovd	(%rsi, %OFFSET_REG64), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	subl	$0xf, %ecx
@@ -1115,7 +1287,9 @@ L(less_4_till_page):
 L(less_4_loop):
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi, %rdi), %ecx
-	subl	%ecx, %eax
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
 	jnz	L(ret_less_4_loop)
 	testl	%ecx, %ecx
 	jz	L(ret_zero_4_loop)
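
L(less_4_loop) above is the plain scalar tail used near a page boundary:
compare one byte at a time through the tolower table until a difference
or the terminating NUL.  In C (a sketch; the iteration bound and locale
setup are omitted, and tolower_tab stands in for
_nl_C_LC_CTYPE_tolower + 128):

  static int
  less_4_loop (const unsigned char *s1, const unsigned char *s2,
	       const int *tolower_tab)
  {
    for (;;)
      {
	int a = tolower_tab[*s1];   /* TOLOWER_gpr (%rax, %eax) */
	int b = tolower_tab[*s2];   /* TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) */
	if (a != b)
	  return a - b;		    /* jnz L(ret_less_4_loop) */
	if (*s1 == '\0')
	  return 0;		    /* jz L(ret_zero_4_loop) */
	s1++, s2++;
      }
  }
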
@@ -1142,5 +1316,6 @@ L(ret_less_4_loop):
 	subl	%r8d, %eax
 	ret
 # endif
-END(STRCMP)
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
new file mode 100644
index 0000000000..58c05dcfb8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
@@ -0,0 +1,16 @@
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+#define OVERFLOW_STRCMP	__strcasecmp_l_avx2_rtm
+
+#include "strncase_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
new file mode 100644
index 0000000000..48c0aa21f8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
@@ -0,0 +1,27 @@
+/* strncasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP	__strcasecmp_l_avx2
+#endif
+#include "strcmp-avx2.S"
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v4 22/23] x86: Add EVEX optimized str{n}casecmp
  2022-03-24 23:56   ` [PATCH v4 21/23] x86: Add AVX2 " Noah Goldstein
@ 2022-03-24 23:56     ` Noah Goldstein
  2022-03-25 18:15       ` H.J. Lu
  2022-03-25 18:14     ` [PATCH v4 21/23] x86: Add AVX2 " H.J. Lu
  1 sibling, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 23:56 UTC (permalink / raw)
  To: libc-alpha

geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621

All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/multiarch/Makefile            |   2 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  16 +
 sysdeps/x86_64/multiarch/ifunc-strcasecmp.h  |   5 +
 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S |  23 ++
 sysdeps/x86_64/multiarch/strcmp-evex.S       | 290 ++++++++++++++++---
 sysdeps/x86_64/multiarch/strncase_l-evex.S   |  25 ++
 6 files changed, 321 insertions(+), 40 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 06e1848823..35d80dc2ff 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -57,6 +57,7 @@ sysdep_routines += \
   strcasecmp_l-avx \
   strcasecmp_l-avx2 \
   strcasecmp_l-avx2-rtm \
+  strcasecmp_l-evex \
   strcasecmp_l-sse2 \
   strcasecmp_l-sse4_2 \
   strcasecmp_l-ssse3 \
@@ -97,6 +98,7 @@ sysdep_routines += \
   strncase_l-avx \
   strncase_l-avx2 \
   strncase_l-avx2-rtm \
+  strncase_l-evex \
   strncase_l-sse2 \
   strncase_l-sse4_2 \
   strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 3c556d07ac..f1a4d3dac2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcasecmp_evex)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strcasecmp_avx2)
@@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcasecmp_l_evex)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strcasecmp_l_avx2)
@@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncasecmp_evex)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strncasecmp_avx2)
@@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncasecmp_l_evex)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strncasecmp_l_avx2)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index c4de111fd0..bf0d146e7f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
@@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+        return OPTIMIZE (evex);
+
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
         return OPTIMIZE (avx2_rtm);
 
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
new file mode 100644
index 0000000000..58642db748
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with EVEX.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_evex
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 56d8c118e4..2a5b3ce037 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -19,6 +19,9 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
 
 # ifndef STRCMP
 #  define STRCMP	__strcmp_evex
@@ -34,19 +37,29 @@
 # define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSCMP
-#  define TESTEQ	subl	$0xff,
+#  ifndef OVERFLOW_STRCMP
+#   define OVERFLOW_STRCMP	__wcscmp_evex
+#  endif
+
+#  define TESTEQ	subl $0xff,
 	/* Compare packed dwords.  */
 #  define VPCMP	vpcmpd
 #  define VPMINU	vpminud
 #  define VPTESTM	vptestmd
+#  define VPTESTNM	vptestnmd
 	/* 1 dword char == 4 bytes.  */
 #  define SIZE_OF_CHAR	4
 # else
+#  ifndef OVERFLOW_STRCMP
+#   define OVERFLOW_STRCMP	__strcmp_evex
+#  endif
+
 #  define TESTEQ	incl
 	/* Compare packed bytes.  */
 #  define VPCMP	vpcmpb
 #  define VPMINU	vpminub
 #  define VPTESTM	vptestmb
+#  define VPTESTNM	vptestnmb
 	/* 1 byte char == 1 byte.  */
 #  define SIZE_OF_CHAR	1
 # endif
@@ -73,11 +86,16 @@
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
-# define XMMZERO	xmm16
 # define XMM0	xmm17
 # define XMM1	xmm18
 
-# define YMMZERO	ymm16
+# define XMM10	xmm27
+# define XMM11	xmm28
+# define XMM12	xmm29
+# define XMM13	xmm30
+# define XMM14	xmm31
+
+
 # define YMM0	ymm17
 # define YMM1	ymm18
 # define YMM2	ymm19
@@ -89,6 +107,87 @@
 # define YMM8	ymm25
 # define YMM9	ymm26
 # define YMM10	ymm27
+# define YMM11	ymm28
+# define YMM12	ymm29
+# define YMM13	ymm30
+# define YMM14	ymm31
+
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_evex
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_evex
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
+# define LCASE_MIN_YMM	%YMM12
+# define LCASE_MAX_YMM	%YMM13
+# define CASE_ADD_YMM	%YMM14
+
+# define LCASE_MIN_XMM	%XMM12
+# define LCASE_MAX_XMM	%XMM13
+# define CASE_ADD_XMM	%XMM14
+
+	/* NB: wcsncmp uses r11 but strcasecmp is never used in
+	   conjunction with wcscmp.  */
+# define TOLOWER_BASE	%r11
+
+# ifdef USE_AS_STRCASECMP_L
+#  define _REG(x, y) x ## y
+#  define REG(x, y) _REG(x, y)
+#  define TOLOWER(reg1, reg2, ext)										\
+	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);					\
+	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);					\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;				\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;				\
+	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};						\
+	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}
+
+#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM)
+#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)						\
+	TOLOWER	(s1_reg, s2_reg, ext);										\
+	VPCMP	$0, s1_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)				\
+	VMOVU	s2_mem, s2_reg;												\
+	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
+
+#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
+#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
+
+#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
+#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_YMM(...)
+#  define TOLOWER_XMM(...)
+
+#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)						\
+	VPCMP	$0, s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+
+#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)				\
+	VPCMP	$0, s2_mem, s1_reg, reg_out
+
+#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+# endif
 
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
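
The CMP_R1_R2 / CMP_R1_S2 macros above let one assembly body serve both
builds: with USE_AS_STRCASECMP_L both inputs are lowered before the
compare, otherwise they collapse to a plain VPCMP. A minimal C sketch
of the per-lane semantics, with illustrative names only (not glibc API):

  #include <stdint.h>

  /* Scalar model of what one vector lane does.  */
  static inline uint8_t
  ascii_tolower_model (uint8_t c)
  {
    /* Same trick as the EVEX constants below: subtract 'A' (0x41),
       unsigned compare against 26 (0x1a), add 0x20 only for upper
       case.  */
    return (uint8_t) (c - 0x41) < 0x1a ? (uint8_t) (c + 0x20) : c;
  }

  static inline int
  lanes_equal_model (uint8_t a, uint8_t b, int icase)
  {
    return icase ? ascii_tolower_model (a) == ascii_tolower_model (b)
                 : a == b;
  }
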
@@ -112,8 +211,45 @@
    returned.  */
 
 	.section .text.evex, "ax", @progbits
-ENTRY(STRCMP)
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (STRCASECMP)
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
+END (STRCASECMP)
+	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
+#  endif
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
 # ifdef USE_AS_STRNCMP
+	/* Don't overwrite LOCALE_REG (rcx) until we have passed
+	   L(one_or_less). Otherwise we might use the wrong locale in
+	   the OVERFLOW_STRCMP (strcasecmp_l).  */
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
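
The prologue stays in assembly only for locales whose single-byte case
mapping is plain ASCII; everything else is punted to the existing C
implementation. Roughly, in C (a sketch; the real field offsets come
from locale-defines.h, so the names here are approximations):

  struct __locale_data *ctype = loc->__locales[LC_CTYPE];
  if (ctype->values[_NL_CTYPE_NONASCII_CASE].word & 1)
    return STRCASECMP_NONASCII (s1, s2, loc);   /* C fallback */
  /* Otherwise TOLOWER_gpr (src, dst) is a plain table load:
     dst = _nl_C_LC_CTYPE_tolower[128 + src];  */
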
@@ -125,6 +261,32 @@ ENTRY(STRCMP)
 	   actually bound the buffer.  */
 	jle	L(one_or_less)
 # endif
+
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+L(lcase_max):
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
+	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
+	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
+# endif
+
 	movl	%edi, %eax
 	orl	%esi, %eax
 	/* Shift out the bits irrelevant to page boundary ([63:12]).  */
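
The .section/.previous pair parks these tables in .rodata.cst32 without
breaking the instruction stream. A few worked bytes for the constants
(a sketch, hex values):

  /* lcase_min = 0x41 ('A'), lcase_max = 0x1a (26), case_add = 0x20:
     'A' (0x41): 0x41 - 0x41 = 0x00, 0x00 < 0x1a -> +0x20 -> 'a' (0x61)
     'Z' (0x5a): 0x5a - 0x41 = 0x19, 0x19 < 0x1a -> +0x20 -> 'z' (0x7a)
     '[' (0x5b): 0x5b - 0x41 = 0x1a, not < 0x1a  -> unchanged
     'a' (0x61): 0x61 - 0x41 = 0x20, not < 0x1a  -> unchanged
     vpcmpub $1 is an unsigned less-than; it produces the %k5/%k6 masks
     that gate the masked vpaddb of case_add in TOLOWER.  */
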
@@ -139,7 +301,7 @@ L(no_page_cross):
 	VPTESTM	%YMM0, %YMM0, %k2
 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 	   in YMM0 and 32 bytes at (%rsi).  */
-	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_STRNCMP
 	cmpq	$CHAR_PER_VEC, %rdx
@@ -169,6 +331,8 @@ L(return_vec_0):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret0):
@@ -188,11 +352,15 @@ L(ret_zero):
 
 	.p2align 4,, 5
 L(one_or_less):
+#  ifdef USE_AS_STRCASECMP_L
+	/* Set locale argument for strcasecmp.  */
+	movq	%LOCALE_REG, %rdx
+#  endif
 	jb	L(ret_zero)
-#  ifdef USE_AS_WCSCMP
 	/* 'nbe' covers the case where length is negative (large
 	   unsigned).  */
-	jnbe	__wcscmp_evex
+	jnbe	OVERFLOW_STRCMP
+#  ifdef USE_AS_WCSCMP
 	movl	(%rdi), %edx
 	xorl	%eax, %eax
 	cmpl	(%rsi), %edx
@@ -201,11 +369,10 @@ L(one_or_less):
 	negl	%eax
 	orl	$1, %eax
 #  else
-	/* 'nbe' covers the case where length is negative (large
-	   unsigned).  */
-	jnbe	__strcmp_evex
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret1):
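
For the strncasecmp build, the L(one_or_less) path above doubles as the
overflow guard. Roughly, in C (a sketch; OVERFLOW_STRCMP is the
length-less variant selected by the wrapper files):

  if (n == 0)
    return 0;                          /* L(ret_zero) */
  /* This path is reached with (ssize_t) n <= 1, so an unsigned n > 1
     ('nbe') means the sign bit is set and n cannot bound the buffer:
     drop the length and keep the locale argument live in %rdx.  */
  if (n > 1)
    return OVERFLOW_STRCMP (s1, s2, loc);
  /* else n == 1: compare a single CHAR.  */
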
@@ -233,6 +400,8 @@ L(return_vec_1):
 # else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret2):
@@ -270,6 +439,8 @@ L(return_vec_2):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret3):
@@ -290,6 +461,8 @@ L(return_vec_3):
 #  else
 	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret4):
@@ -303,7 +476,7 @@ L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	(VEC_SIZE)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1)
@@ -315,14 +488,14 @@ L(more_3x_vec):
 
 	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_3)
@@ -381,7 +554,6 @@ L(prepare_loop_aligned):
 	subl	%esi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 
-	vpxorq	%YMMZERO, %YMMZERO, %YMMZERO
 
 	/* Loop 4x comparisons at a time.  */
 	.p2align 4
@@ -413,22 +585,35 @@ L(loop_skip_page_cross_check):
 	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
 	VPMINU	%YMM8, %YMM9, %YMM9
 
-	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
+	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
 	VPTESTM	%YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
 	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
 	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
 	   oring with YMM1. Result is stored in YMM6.  */
 	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
-
+# else
+	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
+	TOLOWER_YMM (%YMM0, %YMM1)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
+	TOLOWER_YMM (%YMM2, %YMM3)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
+	TOLOWER_YMM (%YMM4, %YMM5)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
+	TOLOWER_YMM (%YMM6, %YMM7)
+	vpxorq	%YMM0, %YMM1, %YMM1
+	vpxorq	%YMM2, %YMM3, %YMM3
+	vpxorq	%YMM4, %YMM5, %YMM5
+	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+# endif
 	/* Or together YMM3, YMM5, and YMM6.  */
 	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
 
 
 	/* A non-zero CHAR in YMM6 represents a mismatch.  */
-	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
+	VPTESTNM %YMM6, %YMM6, %k0{%k1}
 	kmovd	%k0, %LOOP_REG
 
 	TESTEQ	%LOOP_REG
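
The vpternlogd immediates in this loop are 3-input truth tables: bit
((a << 2) | (b << 1) | c) of the immediate holds f(a, b, c), with a the
destination, b the second source and c the third. The switch from
VPCMP-against-zero to VPTESTNM is what lets YMMZERO go away, since
vptestnm sets a mask bit where (a & b) == 0 with no zero register
needed. A small C sketch (illustrative only) of deriving the
immediates:

  /* Returns the vpternlogd imm8 implementing f.  */
  static unsigned
  ternlog_imm (unsigned (*f) (unsigned, unsigned, unsigned))
  {
    unsigned imm = 0;
    for (unsigned i = 0; i < 8; i++)
      imm |= f ((i >> 2) & 1, (i >> 1) & 1, i & 1) << i;
    return imm;
  }
  /* (a ^ c) | b  -> 0xde: xor with memory while or-ing in YMM1.
     a | b | c    -> 0xfe: three-way or of the mismatch vectors.  */
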
@@ -437,13 +622,13 @@ L(loop_skip_page_cross_check):
 
 	/* Find which VEC has the mismatch of end of string.  */
 	VPTESTM	%YMM0, %YMM0, %k1
-	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
+	VPTESTNM %YMM1, %YMM1, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_0_end)
 
 	VPTESTM	%YMM2, %YMM2, %k1
-	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
+	VPTESTNM %YMM3, %YMM3, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1_end)
@@ -457,7 +642,7 @@ L(return_vec_2_3_end):
 # endif
 
 	VPTESTM	%YMM4, %YMM4, %k1
-	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
+	VPTESTNM %YMM5, %YMM5, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 # if CHAR_PER_VEC <= 16
@@ -493,6 +678,8 @@ L(return_vec_3_end):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -545,6 +732,8 @@ L(return_vec_0_end):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
 	   logic. Subtract `r8d` after xor for zero case.  */
@@ -569,6 +758,8 @@ L(return_vec_1_end):
 #  else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -598,7 +789,7 @@ L(page_cross_during_loop):
 
 	VMOVA	(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_0_end)
@@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross):
 	   been loaded earlier so must be valid.  */
 	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
-
+	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
 	/* Mask of potentially valid bits. The lower bits can be out of
 	   range comparisons (but safe regarding page crosses).  */
 
@@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross):
 
 # ifdef USE_AS_STRNCMP
 #  ifdef USE_AS_WCSCMP
+	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
+	   safe.  */
 	movl	%eax, %r11d
 	shrl	$2, %r11d
 	cmpq	%r11, %rdx
@@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross):
 
 	VMOVA	VEC_SIZE(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1_end)
@@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross):
 	/* Safe to include comparisons from lower bytes.  */
 	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_page_cross_0)
 
 	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_page_cross_1)
@@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross):
 	/* Must check length here as length might preclude reading next
 	   page.  */
 #  ifdef USE_AS_WCSCMP
+	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
+	   safe.  */
 	movl	%eax, %r11d
 	shrl	$2, %r11d
 	cmpq	%r11, %rdx
@@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross):
 	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
 	VPMINU	%YMM4, %YMM6, %YMM9
 	VPTESTM	%YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
 	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
 	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
 	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
-
-	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
+# else
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
+	TOLOWER_YMM (%YMM4, %YMM5)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
+	TOLOWER_YMM (%YMM6, %YMM7)
+	vpxorq	%YMM4, %YMM5, %YMM5
+	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
+# endif
+	VPTESTNM %YMM6, %YMM6, %k0{%k1}
 	kmovd	%k0, %LOOP_REG
 	TESTEQ	%LOOP_REG
 	jnz	L(return_vec_2_3_end)
@@ -815,6 +1018,8 @@ L(return_vec_page_cross_1):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -871,7 +1076,7 @@ L(page_cross):
 L(page_cross_loop):
 	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -895,7 +1100,7 @@ L(page_cross_loop):
 	 */
 	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
 
 	kmovd	%k1, %ecx
 # ifdef USE_AS_STRNCMP
@@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont):
 # else
 	movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax
 	movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -989,7 +1196,7 @@ L(less_1x_vec_till_page):
 	/* Use 16 byte comparison.  */
 	vmovdqu	(%rdi), %xmm0
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, (%rsi), %xmm0, %k1{%k2}
+	CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0xf, %ecx
@@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page):
 # endif
 	vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
+	CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0xf, %ecx
@@ -1048,7 +1255,7 @@ L(less_16_till_page):
 	vmovq	(%rdi), %xmm0
 	vmovq	(%rsi), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0x3, %ecx
@@ -1068,7 +1275,7 @@ L(less_16_till_page):
 	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0x3, %ecx
@@ -1128,7 +1335,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi), %xmm0
 	vmovd	(%rsi), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 	subl	$0xf, %ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -1143,7 +1350,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 	subl	$0xf, %ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -1176,7 +1383,9 @@ L(less_4_till_page):
 L(less_4_loop):
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi, %rdi), %ecx
-	subl	%ecx, %eax
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
 	jnz	L(ret_less_4_loop)
 	testl	%ecx, %ecx
 	jz	L(ret_zero_4_loop)
@@ -1203,5 +1412,6 @@ L(ret_less_4_loop):
 	subl	%r8d, %eax
 	ret
 # endif
-END(STRCMP)
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
new file mode 100644
index 0000000000..8a5af3695c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
@@ -0,0 +1,25 @@
+/* strncasecmp_l optimized with EVEX.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_evex
+#endif
+#define OVERFLOW_STRCMP	__strcasecmp_l_evex
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#include "strcmp-evex.S"
-- 
2.25.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v4 21/23] x86: Add AVX2 optimized str{n}casecmp
  2022-03-24 23:56   ` [PATCH v4 21/23] x86: Add AVX2 " Noah Goldstein
  2022-03-24 23:56     ` [PATCH v4 22/23] x86: Add EVEX " Noah Goldstein
@ 2022-03-25 18:14     ` H.J. Lu
  2022-05-12 19:52       ` Sunil Pandey
  1 sibling, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-25 18:14 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Mar 24, 2022 at 4:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
>
> All string/memory tests pass.
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/Makefile             |   4 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  28 +++
>  sysdeps/x86_64/multiarch/ifunc-strcasecmp.h   |  12 +
>  .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S  |  15 ++
>  sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S  |  23 ++
>  sysdeps/x86_64/multiarch/strcmp-avx2.S        | 237 +++++++++++++++---
>  .../x86_64/multiarch/strncase_l-avx2-rtm.S    |  16 ++
>  sysdeps/x86_64/multiarch/strncase_l-avx2.S    |  27 ++
>  8 files changed, 331 insertions(+), 31 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
>  create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
>  create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index e7b413edad..06e1848823 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -55,6 +55,8 @@ sysdep_routines += \
>    stpncpy-sse2-unaligned \
>    stpncpy-ssse3 \
>    strcasecmp_l-avx \
> +  strcasecmp_l-avx2 \
> +  strcasecmp_l-avx2-rtm \
>    strcasecmp_l-sse2 \
>    strcasecmp_l-sse4_2 \
>    strcasecmp_l-ssse3 \
> @@ -93,6 +95,8 @@ sysdep_routines += \
>    strlen-evex \
>    strlen-sse2 \
>    strncase_l-avx \
> +  strncase_l-avx2 \
> +  strncase_l-avx2-rtm \
>    strncase_l-sse2 \
>    strncase_l-sse4_2 \
>    strncase_l-ssse3 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index a594f4176e..3c556d07ac 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
>    IFUNC_IMPL (i, name, strcasecmp,
> +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> +                             CPU_FEATURE_USABLE (AVX2),
> +                             __strcasecmp_avx2)
> +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (RTM)),
> +                             __strcasecmp_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, strcasecmp,
>                               CPU_FEATURE_USABLE (AVX),
>                               __strcasecmp_avx)
> @@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
>    IFUNC_IMPL (i, name, strcasecmp_l,
> +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> +                             CPU_FEATURE_USABLE (AVX2),
> +                             __strcasecmp_l_avx2)
> +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (RTM)),
> +                             __strcasecmp_l_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, strcasecmp_l,
>                               CPU_FEATURE_USABLE (AVX),
>                               __strcasecmp_l_avx)
> @@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
>    IFUNC_IMPL (i, name, strncasecmp,
> +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> +                             CPU_FEATURE_USABLE (AVX2),
> +                             __strncasecmp_avx2)
> +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (RTM)),
> +                             __strncasecmp_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, strncasecmp,
>                               CPU_FEATURE_USABLE (AVX),
>                               __strncasecmp_avx)
> @@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
>    IFUNC_IMPL (i, name, strncasecmp_l,
> +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> +                             CPU_FEATURE_USABLE (AVX2),
> +                             __strncasecmp_l_avx2)
> +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (RTM)),
> +                             __strncasecmp_l_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, strncasecmp_l,
>                               CPU_FEATURE_USABLE (AVX),
>                               __strncasecmp_l_avx)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index 9e3cc61ac0..c4de111fd0 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
>
>  static inline void *
>  IFUNC_SELECTOR (void)
>  {
>    const struct cpu_features* cpu_features = __get_cpu_features ();
>
> +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> +        return OPTIMIZE (avx2_rtm);
> +
> +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> +        return OPTIMIZE (avx2);
> +    }
> +
>    if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
>      return OPTIMIZE (avx);
>
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> new file mode 100644
> index 0000000000..09957fc3c5
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> @@ -0,0 +1,15 @@
> +#ifndef STRCMP
> +# define STRCMP        __strcasecmp_l_avx2_rtm
> +#endif
> +
> +#define _GLABEL(x)     x ## _rtm
> +#define GLABEL(x)      _GLABEL(x)
> +
> +#define ZERO_UPPER_VEC_REGISTERS_RETURN        \
> +       ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +
> +#define VZEROUPPER_RETURN      jmp L(return_vzeroupper)
> +
> +#define SECTION(p)     p##.avx.rtm
> +
> +#include "strcasecmp_l-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> new file mode 100644
> index 0000000000..e2762f2a22
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> @@ -0,0 +1,23 @@
> +/* strcasecmp_l optimized with AVX2.
> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef STRCMP
> +# define STRCMP        __strcasecmp_l_avx2
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#include "strcmp-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> index 86a86b68e3..8da09bd86d 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> @@ -20,6 +20,10 @@
>
>  # include <sysdep.h>
>
> +# if defined USE_AS_STRCASECMP_L
> +#  include "locale-defines.h"
> +# endif
> +
>  # ifndef STRCMP
>  #  define STRCMP       __strcmp_avx2
>  # endif
> @@ -74,13 +78,88 @@
>  #  define VEC_OFFSET   (-VEC_SIZE)
>  # endif
>
> +# ifdef USE_AS_STRCASECMP_L
> +#  define BYTE_LOOP_REG        OFFSET_REG
> +# else
> +#  define BYTE_LOOP_REG        ecx
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +#  ifdef USE_AS_STRNCMP
> +#   define STRCASECMP  __strncasecmp_avx2
> +#   define LOCALE_REG  rcx
> +#   define LOCALE_REG_LP       RCX_LP
> +#   define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> +#  else
> +#   define STRCASECMP  __strcasecmp_avx2
> +#   define LOCALE_REG  rdx
> +#   define LOCALE_REG_LP       RDX_LP
> +#   define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> +#  endif
> +# endif
> +
>  # define xmmZERO       xmm15
>  # define ymmZERO       ymm15
>
> +# define LCASE_MIN_ymm %ymm10
> +# define LCASE_MAX_ymm %ymm11
> +# define CASE_ADD_ymm  %ymm12
> +
> +# define LCASE_MIN_xmm %xmm10
> +# define LCASE_MAX_xmm %xmm11
> +# define CASE_ADD_xmm  %xmm12
> +
> +       /* r11 is never used elsewhere so this is safe to maintain.  */
> +# define TOLOWER_BASE  %r11
> +
>  # ifndef SECTION
>  #  define SECTION(p)   p##.avx
>  # endif
>
> +# ifdef USE_AS_STRCASECMP_L
> +#  define REG(x, y) x ## y
> +#  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext)                   \
> +       vpaddb  REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8);                            \
> +       vpaddb  REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9);                            \
> +       vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8);                      \
> +       vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9);                      \
> +       vpandn  REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8);                        \
> +       vpandn  REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9);                        \
> +       vpaddb  REG(%ext, 8), reg1_in, reg1_out;                                                        \
> +       vpaddb  REG(%ext, 9), reg2_in, reg2_out
> +
> +#  define TOLOWER_gpr(src, dst)        movl (TOLOWER_BASE, src, 4), dst
> +#  define TOLOWER_ymm(...)     TOLOWER(__VA_ARGS__, ymm)
> +#  define TOLOWER_xmm(...)     TOLOWER(__VA_ARGS__, xmm)
> +
> +#  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext)                 \
> +       TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext);                                     \
> +       VPCMPEQ scratch_reg, s2_reg, reg_out
> +
> +#  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext)                 \
> +       VMOVU   s2_mem, reg_out;                                                                                        \
> +       CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
> +
> +#  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
> +#  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
> +
> +#  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
> +#  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
> +
> +# else
> +#  define TOLOWER_gpr(...)
> +#  define TOLOWER_ymm(...)
> +#  define TOLOWER_xmm(...)
> +
> +#  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out)                  \
> +       VPCMPEQ s2_reg, s1_reg, reg_out
> +
> +#  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
> +
> +#  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
> +#  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
> +# endif
> +
>  /* Warning!
>             wcscmp/wcsncmp have to use SIGNED comparison for elements.
>             strcmp/strncmp have to use UNSIGNED comparison for elements.
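
Unlike the EVEX version there are no mask registers here, so TOLOWER
does the range check with a signed compare on biased bytes plus vpandn.
A scalar C model of one lane (illustrative; assumes the usual
two's-complement narrowing):

  #include <stdint.h>

  static inline uint8_t
  avx2_tolower_model (uint8_t c)
  {
    /* vpaddb with 0x3f biases 'A'..'Z' into -128..-103 (signed).  */
    int8_t biased = (int8_t) (uint8_t) (c + 0x3f);
    /* vpcmpgtb against 0x99 (-103): all-ones for NON-uppercase.  */
    uint8_t not_upper = biased > (int8_t) 0x99 ? 0xff : 0x00;
    /* vpandn keeps case_add only where the mask is zero.  */
    return (uint8_t) (c + (~not_upper & 0x20));
  }
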
> @@ -102,8 +181,49 @@
>     returned.  */
>
>         .section SECTION(.text), "ax", @progbits
> -ENTRY(STRCMP)
> +       .align  16
> +       .type   STRCMP, @function
> +       .globl  STRCMP
> +       .hidden STRCMP
> +
> +# ifndef GLABEL
> +#  define GLABEL(...)  __VA_ARGS__
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +ENTRY (GLABEL(STRCASECMP))
> +       movq    __libc_tsd_LOCALE@gottpoff(%rip), %rax
> +       mov     %fs:(%rax), %LOCALE_REG_LP
> +
> +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> +       .p2align 4
> +END (GLABEL(STRCASECMP))
> +       /* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
> +# endif
> +
> +       .p2align 4
> +STRCMP:
> +       cfi_startproc
> +       _CET_ENDBR
> +       CALL_MCOUNT
> +
> +# if defined USE_AS_STRCASECMP_L
> +       /* We have to fall back on the C implementation for locales with
> +          encodings not matching ASCII for single bytes.  */
> +#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> +       mov     LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> +#  else
> +       mov     (%LOCALE_REG), %RAX_LP
> +#  endif
> +       testl   $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> +       jne     STRCASECMP_NONASCII
> +       leaq    _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> +# endif
> +
>  # ifdef USE_AS_STRNCMP
> +       /* Don't overwrite LOCALE_REG (rcx) until we have passed
> +          L(one_or_less). Otherwise we might use the wrong locale in
> +          the OVERFLOW_STRCMP (strcasecmp_l).  */
>  #  ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         movl    %edx, %edx
> @@ -128,6 +248,30 @@ ENTRY(STRCMP)
>  #  endif
>  # endif
>         vpxor   %xmmZERO, %xmmZERO, %xmmZERO
> +# if defined USE_AS_STRCASECMP_L
> +       .section .rodata.cst32, "aM", @progbits, 32
> +       .align  32
> +L(lcase_min):
> +       .quad   0x3f3f3f3f3f3f3f3f
> +       .quad   0x3f3f3f3f3f3f3f3f
> +       .quad   0x3f3f3f3f3f3f3f3f
> +       .quad   0x3f3f3f3f3f3f3f3f
> +L(lcase_max):
> +       .quad   0x9999999999999999
> +       .quad   0x9999999999999999
> +       .quad   0x9999999999999999
> +       .quad   0x9999999999999999
> +L(case_add):
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +       .previous
> +
> +       vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
> +       vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
> +       vmovdqa L(case_add)(%rip), CASE_ADD_ymm
> +# endif
>         movl    %edi, %eax
>         orl     %esi, %eax
>         sall    $20, %eax
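
Worked bytes for these constants (a sketch; signed view after the
bias):

  /* lcase_min = 0x3f, lcase_max = 0x99 (-103), case_add = 0x20:
     'A' (0x41) + 0x3f = 0x80 = -128 -> not > -103 -> upper, +0x20
     'Z' (0x5a) + 0x3f = 0x99 = -103 -> not > -103 -> upper, +0x20
     '[' (0x5b) + 0x3f = 0x9a = -102 ->     > -103 -> unchanged
     'a' (0x61) + 0x3f = 0xa0 =  -96 ->     > -103 -> unchanged  */
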
> @@ -138,8 +282,10 @@ ENTRY(STRCMP)
>  L(no_page_cross):
>         /* Safe to compare 4x vectors.  */
>         VMOVU   (%rdi), %ymm0
> -       /* 1s where s1 and s2 equal.  */
> -       VPCMPEQ (%rsi), %ymm0, %ymm1
> +       /* 1s where s1 and s2 equal. Just VPCMPEQ if it is not
> +          strcasecmp. Otherwise converts ymm0 and the data loaded from
> +          rsi to lowercase. ymm2 is scratch and ymm1 is the return.  */
> +       CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
>         /* 1s at null CHAR.  */
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         /* 1s where s1 and s2 equal AND not null CHAR.  */
> @@ -172,6 +318,8 @@ L(return_vec_0):
>  # else
>         movzbl  (%rdi, %rcx), %eax
>         movzbl  (%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  # endif
>  L(ret0):
> @@ -192,6 +340,10 @@ L(ret_zero):
>
>         .p2align 4,, 5
>  L(one_or_less):
> +#  ifdef USE_AS_STRCASECMP_L
> +       /* Set locale argument for strcasecmp.  */
> +       movq    %LOCALE_REG, %rdx
> +#  endif
>         jb      L(ret_zero)
>         /* 'nbe' covers the case where length is negative (large
>            unsigned).  */
> @@ -207,6 +359,8 @@ L(one_or_less):
>  #  else
>         movzbl  (%rdi), %eax
>         movzbl  (%rsi), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  #  endif
>  L(ret1):
> @@ -234,6 +388,8 @@ L(return_vec_1):
>  # else
>         movzbl  VEC_SIZE(%rdi, %rcx), %eax
>         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  # endif
>  L(ret2):
> @@ -265,6 +421,8 @@ L(return_vec_2):
>  # else
>         movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
>         movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  # endif
>  L(ret3):
> @@ -285,6 +443,8 @@ L(return_vec_3):
>  #  else
>         movzbl  (VEC_SIZE * 3)(%rdi, %rcx), %eax
>         movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  #  endif
>  L(ret4):
> @@ -295,7 +455,7 @@ L(ret4):
>  L(more_3x_vec):
>         /* Safe to compare 4x vectors.  */
>         VMOVU   VEC_SIZE(%rdi), %ymm0
> -       VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -308,7 +468,7 @@ L(more_3x_vec):
>  # endif
>
>         VMOVU   (VEC_SIZE * 2)(%rdi), %ymm0
> -       VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -316,7 +476,7 @@ L(more_3x_vec):
>         jnz     L(return_vec_2)
>
>         VMOVU   (VEC_SIZE * 3)(%rdi), %ymm0
> -       VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -391,12 +551,10 @@ L(loop_skip_page_cross_check):
>         VMOVA   (VEC_SIZE * 3)(%rdi), %ymm6
>
>         /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
> -       VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
> -
> -       VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
> -       VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
> -       VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
> -
> +       CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
> +       CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
> +       CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
> +       CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
>
>         /* If any mismatches or null CHAR then 0 CHAR, otherwise non-
>            zero.  */
> @@ -465,6 +623,8 @@ L(return_vec_2_3_end):
>  # else
>         movzbl  (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
>         movzbl  (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -508,6 +668,8 @@ L(return_vec_0_end):
>  # else
>         movzbl  (%rdi, %rcx), %eax
>         movzbl  (%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -530,6 +692,8 @@ L(return_vec_1_end):
>  #  else
>         movzbl  VEC_SIZE(%rdi, %rcx), %eax
>         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -556,6 +720,8 @@ L(return_vec_2_end):
>  # else
>         movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
>         movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -583,7 +749,7 @@ L(page_cross_during_loop):
>         jle     L(less_1x_vec_till_page_cross)
>
>         VMOVA   (%rdi), %ymm0
> -       VPCMPEQ (%rsi), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -605,7 +771,7 @@ L(less_1x_vec_till_page_cross):
>            here, it means the previous page (rdi - VEC_SIZE) has already
>            been loaded earlier so must be valid.  */
>         VMOVU   -VEC_SIZE(%rdi, %rax), %ymm0
> -       VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -647,6 +813,8 @@ L(return_page_cross_cmp_mem):
>  # else
>         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
>         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -673,7 +841,7 @@ L(more_2x_vec_till_page_cross):
>            iteration here.  */
>
>         VMOVU   VEC_SIZE(%rdi), %ymm0
> -       VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -689,7 +857,7 @@ L(more_2x_vec_till_page_cross):
>
>         /* Safe to include comparisons from lower bytes.  */
>         VMOVU   -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
> -       VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -697,7 +865,7 @@ L(more_2x_vec_till_page_cross):
>         jnz     L(return_vec_page_cross_0)
>
>         VMOVU   -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
> -       VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -715,8 +883,8 @@ L(more_2x_vec_till_page_cross):
>         VMOVA   (VEC_SIZE * 2)(%rdi), %ymm4
>         VMOVA   (VEC_SIZE * 3)(%rdi), %ymm6
>
> -       VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
> -       VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
> +       CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
> +       CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
>         vpand   %ymm4, %ymm5, %ymm5
>         vpand   %ymm6, %ymm7, %ymm7
>         VPMINU  %ymm5, %ymm7, %ymm7
> @@ -767,6 +935,8 @@ L(return_vec_page_cross_1):
>  # else
>         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
>         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -822,7 +992,7 @@ L(page_cross):
>  L(page_cross_loop):
>
>         VMOVU   (%rdi, %OFFSET_REG64), %ymm0
> -       VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -840,11 +1010,11 @@ L(page_cross_loop):
>         subl    %eax, %OFFSET_REG
>         /* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
>            to not cross page so is safe to load. Since we have already
> -          loaded at least 1 VEC from rsi it is also guranteed to be safe.
> -        */
> +          loaded at least 1 VEC from rsi it is also guaranteed to be
> +          safe.  */
>
>         VMOVU   (%rdi, %OFFSET_REG64), %ymm0
> -       VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
> +       CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
>         VPCMPEQ %ymm0, %ymmZERO, %ymm2
>         vpandn  %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %ecx
> @@ -877,6 +1047,8 @@ L(ret_vec_page_cross_cont):
>  # else
>         movzbl  (%rdi, %rcx), %eax
>         movzbl  (%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -930,7 +1102,7 @@ L(less_1x_vec_till_page):
>         ja      L(less_16_till_page)
>
>         VMOVU   (%rdi), %xmm0
> -       VPCMPEQ (%rsi), %xmm0, %xmm1
> +       CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
>         VPCMPEQ %xmm0, %xmmZERO, %xmm2
>         vpandn  %xmm1, %xmm2, %xmm1
>         vpmovmskb %ymm1, %ecx
> @@ -948,7 +1120,7 @@ L(less_1x_vec_till_page):
>  # endif
>
>         VMOVU   (%rdi, %OFFSET_REG64), %xmm0
> -       VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1
> +       CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
>         VPCMPEQ %xmm0, %xmmZERO, %xmm2
>         vpandn  %xmm1, %xmm2, %xmm1
>         vpmovmskb %ymm1, %ecx
> @@ -986,7 +1158,7 @@ L(less_16_till_page):
>         vmovq   (%rdi), %xmm0
>         vmovq   (%rsi), %xmm1
>         VPCMPEQ %xmm0, %xmmZERO, %xmm2
> -       VPCMPEQ %xmm1, %xmm0, %xmm1
> +       CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
>         vpandn  %xmm1, %xmm2, %xmm1
>         vpmovmskb %ymm1, %ecx
>         incb    %cl
> @@ -1006,7 +1178,7 @@ L(less_16_till_page):
>         vmovq   (%rdi, %OFFSET_REG64), %xmm0
>         vmovq   (%rsi, %OFFSET_REG64), %xmm1
>         VPCMPEQ %xmm0, %xmmZERO, %xmm2
> -       VPCMPEQ %xmm1, %xmm0, %xmm1
> +       CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
>         vpandn  %xmm1, %xmm2, %xmm1
>         vpmovmskb %ymm1, %ecx
>         incb    %cl
> @@ -1062,7 +1234,7 @@ L(ret_less_8_wcs):
>         vmovd   (%rdi), %xmm0
>         vmovd   (%rsi), %xmm1
>         VPCMPEQ %xmm0, %xmmZERO, %xmm2
> -       VPCMPEQ %xmm1, %xmm0, %xmm1
> +       CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
>         vpandn  %xmm1, %xmm2, %xmm1
>         vpmovmskb %ymm1, %ecx
>         subl    $0xf, %ecx
> @@ -1081,7 +1253,7 @@ L(ret_less_8_wcs):
>         vmovd   (%rdi, %OFFSET_REG64), %xmm0
>         vmovd   (%rsi, %OFFSET_REG64), %xmm1
>         VPCMPEQ %xmm0, %xmmZERO, %xmm2
> -       VPCMPEQ %xmm1, %xmm0, %xmm1
> +       CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
>         vpandn  %xmm1, %xmm2, %xmm1
>         vpmovmskb %ymm1, %ecx
>         subl    $0xf, %ecx
> @@ -1115,7 +1287,9 @@ L(less_4_till_page):
>  L(less_4_loop):
>         movzbl  (%rdi), %eax
>         movzbl  (%rsi, %rdi), %ecx
> -       subl    %ecx, %eax
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> +       subl    %BYTE_LOOP_REG, %eax
>         jnz     L(ret_less_4_loop)
>         testl   %ecx, %ecx
>         jz      L(ret_zero_4_loop)
> @@ -1142,5 +1316,6 @@ L(ret_less_4_loop):
>         subl    %r8d, %eax
>         ret
>  # endif
> -END(STRCMP)
> +       cfi_endproc
> +       .size   STRCMP, .-STRCMP
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> new file mode 100644
> index 0000000000..58c05dcfb8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> @@ -0,0 +1,16 @@
> +#ifndef STRCMP
> +# define STRCMP        __strncasecmp_l_avx2_rtm
> +#endif
> +
> +#define _GLABEL(x)     x ## _rtm
> +#define GLABEL(x)      _GLABEL(x)
> +
> +#define ZERO_UPPER_VEC_REGISTERS_RETURN        \
> +       ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +
> +#define VZEROUPPER_RETURN      jmp L(return_vzeroupper)
> +
> +#define SECTION(p)     p##.avx.rtm
> +#define OVERFLOW_STRCMP        __strcasecmp_l_avx2_rtm
> +
> +#include "strncase_l-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> new file mode 100644
> index 0000000000..48c0aa21f8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> @@ -0,0 +1,27 @@
> +/* strncasecmp_l optimized with AVX2.
> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef STRCMP
> +# define STRCMP        __strncasecmp_l_avx2
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#define USE_AS_STRNCMP
> +#ifndef OVERFLOW_STRCMP
> +# define OVERFLOW_STRCMP       __strcasecmp_l_avx2
> +#endif
> +#include "strcmp-avx2.S"
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v4 22/23] x86: Add EVEX optimized str{n}casecmp
  2022-03-24 23:56     ` [PATCH v4 22/23] x86: Add EVEX " Noah Goldstein
@ 2022-03-25 18:15       ` H.J. Lu
  2022-03-25 18:18         ` Noah Goldstein
  0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-25 18:15 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Mar 24, 2022 at 4:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
>
> All string/memory tests pass.
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/Makefile            |   2 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  16 +
>  sysdeps/x86_64/multiarch/ifunc-strcasecmp.h  |   5 +
>  sysdeps/x86_64/multiarch/strcasecmp_l-evex.S |  23 ++
>  sysdeps/x86_64/multiarch/strcmp-evex.S       | 290 ++++++++++++++++---
>  sysdeps/x86_64/multiarch/strncase_l-evex.S   |  25 ++
>  6 files changed, 321 insertions(+), 40 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 06e1848823..35d80dc2ff 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -57,6 +57,7 @@ sysdep_routines += \
>    strcasecmp_l-avx \
>    strcasecmp_l-avx2 \
>    strcasecmp_l-avx2-rtm \
> +  strcasecmp_l-evex \
>    strcasecmp_l-sse2 \
>    strcasecmp_l-sse4_2 \
>    strcasecmp_l-ssse3 \
> @@ -97,6 +98,7 @@ sysdep_routines += \
>    strncase_l-avx \
>    strncase_l-avx2 \
>    strncase_l-avx2-rtm \
> +  strncase_l-evex \
>    strncase_l-sse2 \
>    strncase_l-sse4_2 \
>    strncase_l-ssse3 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 3c556d07ac..f1a4d3dac2 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
>    IFUNC_IMPL (i, name, strcasecmp,
> +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)),
> +                             __strcasecmp_evex)
>               IFUNC_IMPL_ADD (array, i, strcasecmp,
>                               CPU_FEATURE_USABLE (AVX2),
>                               __strcasecmp_avx2)
> @@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
>    IFUNC_IMPL (i, name, strcasecmp_l,
> +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)),
> +                             __strcasecmp_l_evex)
>               IFUNC_IMPL_ADD (array, i, strcasecmp,
>                               CPU_FEATURE_USABLE (AVX2),
>                               __strcasecmp_l_avx2)
> @@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
>    IFUNC_IMPL (i, name, strncasecmp,
> +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)),
> +                             __strncasecmp_evex)
>               IFUNC_IMPL_ADD (array, i, strncasecmp,
>                               CPU_FEATURE_USABLE (AVX2),
>                               __strncasecmp_avx2)
> @@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
>    IFUNC_IMPL (i, name, strncasecmp_l,
> +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)),
> +                             __strncasecmp_l_evex)
>               IFUNC_IMPL_ADD (array, i, strncasecmp,
>                               CPU_FEATURE_USABLE (AVX2),
>                               __strncasecmp_l_avx2)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index c4de111fd0..bf0d146e7f 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
>
>  static inline void *
>  IFUNC_SELECTOR (void)
> @@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
>    if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
>        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
>      {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> +        return OPTIMIZE (evex);
> +
>        if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
>          return OPTIMIZE (avx2_rtm);
>
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> new file mode 100644
> index 0000000000..58642db748
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> @@ -0,0 +1,23 @@
> +/* strcasecmp_l optimized with EVEX.
> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef STRCMP
> +# define STRCMP        __strcasecmp_l_evex
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#include "strcmp-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
> index 56d8c118e4..2a5b3ce037 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
> @@ -19,6 +19,9 @@
>  #if IS_IN (libc)
>
>  # include <sysdep.h>
> +# if defined USE_AS_STRCASECMP_L
> +#  include "locale-defines.h"
> +# endif
>
>  # ifndef STRCMP
>  #  define STRCMP       __strcmp_evex
> @@ -34,19 +37,29 @@
>  # define VMOVA vmovdqa64
>
>  # ifdef USE_AS_WCSCMP
> -#  define TESTEQ       subl    $0xff,
> +#  ifndef OVERFLOW_STRCMP
> +#   define OVERFLOW_STRCMP     __wcscmp_evex
> +#  endif
> +
> +#  define TESTEQ       subl $0xff,
>         /* Compare packed dwords.  */
>  #  define VPCMP        vpcmpd
>  #  define VPMINU       vpminud
>  #  define VPTESTM      vptestmd
> +#  define VPTESTNM     vptestnmd
>         /* 1 dword char == 4 bytes.  */
>  #  define SIZE_OF_CHAR 4
>  # else
> +#  ifndef OVERFLOW_STRCMP
> +#   define OVERFLOW_STRCMP     __strcmp_evex
> +#  endif
> +
>  #  define TESTEQ       incl
>         /* Compare packed bytes.  */
>  #  define VPCMP        vpcmpb
>  #  define VPMINU       vpminub
>  #  define VPTESTM      vptestmb
> +#  define VPTESTNM     vptestnmb
>         /* 1 byte char == 1 byte.  */
>  #  define SIZE_OF_CHAR 1
>  # endif
> @@ -73,11 +86,16 @@
>  #  define VEC_OFFSET   (-VEC_SIZE)
>  # endif
>
> -# define XMMZERO       xmm16
>  # define XMM0  xmm17
>  # define XMM1  xmm18
>
> -# define YMMZERO       ymm16
> +# define XMM10 xmm27
> +# define XMM11 xmm28
> +# define XMM12 xmm29
> +# define XMM13 xmm30
> +# define XMM14 xmm31
> +
> +
>  # define YMM0  ymm17
>  # define YMM1  ymm18
>  # define YMM2  ymm19
> @@ -89,6 +107,87 @@
>  # define YMM8  ymm25
>  # define YMM9  ymm26
>  # define YMM10 ymm27
> +# define YMM11 ymm28
> +# define YMM12 ymm29
> +# define YMM13 ymm30
> +# define YMM14 ymm31
> +
> +# ifdef USE_AS_STRCASECMP_L
> +#  define BYTE_LOOP_REG        OFFSET_REG
> +# else
> +#  define BYTE_LOOP_REG        ecx
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +#  ifdef USE_AS_STRNCMP
> +#   define STRCASECMP  __strncasecmp_evex
> +#   define LOCALE_REG  rcx
> +#   define LOCALE_REG_LP       RCX_LP
> +#   define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> +#  else
> +#   define STRCASECMP  __strcasecmp_evex
> +#   define LOCALE_REG  rdx
> +#   define LOCALE_REG_LP       RDX_LP
> +#   define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> +#  endif
> +# endif
> +
> +# define LCASE_MIN_YMM %YMM12
> +# define LCASE_MAX_YMM %YMM13
> +# define CASE_ADD_YMM  %YMM14
> +
> +# define LCASE_MIN_XMM %XMM12
> +# define LCASE_MAX_XMM %XMM13
> +# define CASE_ADD_XMM  %XMM14
> +
> +       /* NB: wcsncmp uses r11 but strcasecmp is never used in
> +          conjunction with wcscmp.  */
> +# define TOLOWER_BASE  %r11
> +
> +# ifdef USE_AS_STRCASECMP_L
> +#  define _REG(x, y) x ## y
> +#  define REG(x, y) _REG(x, y)
> +#  define TOLOWER(reg1, reg2, ext)                                                                             \
> +       vpsubb  REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);                                      \
> +       vpsubb  REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);                                      \
> +       vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;                           \
> +       vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;                           \
> +       vpaddb  reg1, REG(CASE_ADD_, ext), reg1{%k5};                                           \
> +       vpaddb  reg2, REG(CASE_ADD_, ext), reg2{%k6}
> +
> +#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> +#  define TOLOWER_YMM(...)     TOLOWER(__VA_ARGS__, YMM)
> +#  define TOLOWER_XMM(...)     TOLOWER(__VA_ARGS__, XMM)
> +
> +#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)                                              \
> +       TOLOWER (s1_reg, s2_reg, ext);                                                                          \
> +       VPCMP   $0, s1_reg, s2_reg, reg_out
> +
> +#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)                              \
> +       VMOVU   s2_mem, s2_reg;                                                                                         \
> +       CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
> +
> +#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
> +#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
> +
> +#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
> +#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
> +
> +# else
> +#  define TOLOWER_gpr(...)
> +#  define TOLOWER_YMM(...)
> +#  define TOLOWER_XMM(...)
> +
> +#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)                                               \
> +       VPCMP   $0, s2_reg, s1_reg, reg_out
> +
> +#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
> +
> +#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)                               \
> +       VPCMP   $0, s2_mem, s1_reg, reg_out
> +
> +#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
> +# endif
>
>  /* Warning!
>             wcscmp/wcsncmp have to use SIGNED comparison for elements.
> @@ -112,8 +211,45 @@
>     returned.  */
>
>         .section .text.evex, "ax", @progbits
> -ENTRY(STRCMP)
> +       .align  16
> +       .type   STRCMP, @function
> +       .globl  STRCMP
> +       .hidden STRCMP
> +
> +# ifdef USE_AS_STRCASECMP_L
> +ENTRY (STRCASECMP)
> +       movq    __libc_tsd_LOCALE@gottpoff(%rip), %rax
> +       mov     %fs:(%rax), %LOCALE_REG_LP
> +
> +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> +       .p2align 4
> +END (STRCASECMP)
> +       /* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
> +# endif
> +
> +       .p2align 4
> +STRCMP:
> +       cfi_startproc
> +       _CET_ENDBR
> +       CALL_MCOUNT
> +
> +# if defined USE_AS_STRCASECMP_L
> +       /* We have to fall back on the C implementation for locales with
> +          encodings not matching ASCII for single bytes.  */
> +#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> +       mov     LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> +#  else
> +       mov     (%LOCALE_REG), %RAX_LP
> +#  endif
> +       testl   $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> +       jne     STRCASECMP_NONASCII
> +       leaq    _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> +# endif
> +
>  # ifdef USE_AS_STRNCMP
> +       /* Don't overwrite LOCALE_REG (rcx) until we have passed
> +          L(one_or_less). Otherwise we might use the wrong locale in
> +          the OVERFLOW_STRCMP (strcasecmp_l).  */
>  #  ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         movl    %edx, %edx
> @@ -125,6 +261,32 @@ ENTRY(STRCMP)
>            actually bound the buffer.  */
>         jle     L(one_or_less)
>  # endif
> +
> +# if defined USE_AS_STRCASECMP_L
> +       .section .rodata.cst32, "aM", @progbits, 32
> +       .align  32
> +L(lcase_min):
> +       .quad   0x4141414141414141
> +       .quad   0x4141414141414141
> +       .quad   0x4141414141414141
> +       .quad   0x4141414141414141
> +L(lcase_max):
> +       .quad   0x1a1a1a1a1a1a1a1a
> +       .quad   0x1a1a1a1a1a1a1a1a
> +       .quad   0x1a1a1a1a1a1a1a1a
> +       .quad   0x1a1a1a1a1a1a1a1a
> +L(case_add):
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +       .previous
> +
> +       vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
> +       vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
> +       vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
> +# endif
> +
>         movl    %edi, %eax
>         orl     %esi, %eax
>         /* Shift out the bits irrelevant to the page boundary ([63:12]).  */
> @@ -139,7 +301,7 @@ L(no_page_cross):
>         VPTESTM %YMM0, %YMM0, %k2
>         /* Each bit cleared in K1 represents a mismatch or a null CHAR
>            in YMM0 and 32 bytes at (%rsi).  */
> -       VPCMP   $0, (%rsi), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>  # ifdef USE_AS_STRNCMP
>         cmpq    $CHAR_PER_VEC, %rdx
> @@ -169,6 +331,8 @@ L(return_vec_0):
>  # else
>         movzbl  (%rdi, %rcx), %eax
>         movzbl  (%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  # endif
>  L(ret0):
> @@ -188,11 +352,15 @@ L(ret_zero):
>
>         .p2align 4,, 5
>  L(one_or_less):
> +#  ifdef USE_AS_STRCASECMP_L
> +       /* Set locale argument for strcasecmp.  */
> +       movq    %LOCALE_REG, %rdx
> +#  endif
>         jb      L(ret_zero)
> -#  ifdef USE_AS_WCSCMP
>         /* 'nbe' covers the case where length is negative (large
>            unsigned).  */
> -       jnbe    __wcscmp_evex
> +       jnbe    OVERFLOW_STRCMP
> +#  ifdef USE_AS_WCSCMP
>         movl    (%rdi), %edx
>         xorl    %eax, %eax
>         cmpl    (%rsi), %edx
> @@ -201,11 +369,10 @@ L(one_or_less):
>         negl    %eax
>         orl     $1, %eax
>  #  else
> -       /* 'nbe' covers the case where length is negative (large
> -          unsigned).  */
> -       jnbe    __strcmp_evex
>         movzbl  (%rdi), %eax
>         movzbl  (%rsi), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  #  endif
>  L(ret1):
> @@ -233,6 +400,8 @@ L(return_vec_1):
>  # else
>         movzbl  VEC_SIZE(%rdi, %rcx), %eax
>         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  # endif
>  L(ret2):
> @@ -270,6 +439,8 @@ L(return_vec_2):
>  # else
>         movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
>         movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  # endif
>  L(ret3):
> @@ -290,6 +461,8 @@ L(return_vec_3):
>  #  else
>         movzbl  (VEC_SIZE * 3)(%rdi, %rcx), %eax
>         movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>  #  endif
>  L(ret4):
> @@ -303,7 +476,7 @@ L(more_3x_vec):
>         /* Safe to compare 4x vectors.  */
>         VMOVU   (VEC_SIZE)(%rdi), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_1)
> @@ -315,14 +488,14 @@ L(more_3x_vec):
>
>         VMOVU   (VEC_SIZE * 2)(%rdi), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_2)
>
>         VMOVU   (VEC_SIZE * 3)(%rdi), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_3)
> @@ -381,7 +554,6 @@ L(prepare_loop_aligned):
>         subl    %esi, %eax
>         andl    $(PAGE_SIZE - 1), %eax
>
> -       vpxorq  %YMMZERO, %YMMZERO, %YMMZERO
>
>         /* Loop 4x comparisons at a time.  */
>         .p2align 4
> @@ -413,22 +585,35 @@ L(loop_skip_page_cross_check):
>         /* A zero CHAR in YMM9 means that there is a null CHAR.  */
>         VPMINU  %YMM8, %YMM9, %YMM9
>
> -       /* Each bit set in K1 represents a non-null CHAR in YMM8.  */
> +       /* Each bit set in K1 represents a non-null CHAR in YMM9.  */
>         VPTESTM %YMM9, %YMM9, %k1
> -
> +# ifndef USE_AS_STRCASECMP_L
>         vpxorq  (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
>         vpxorq  (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
>         vpxorq  (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
>         /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
>            oring with YMM1. Result is stored in YMM6.  */
>         vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
> -
> +# else
> +       VMOVU   (VEC_SIZE * 0)(%rsi), %YMM1
> +       TOLOWER_YMM (%YMM0, %YMM1)
> +       VMOVU   (VEC_SIZE * 1)(%rsi), %YMM3
> +       TOLOWER_YMM (%YMM2, %YMM3)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM5
> +       TOLOWER_YMM (%YMM4, %YMM5)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM7
> +       TOLOWER_YMM (%YMM6, %YMM7)
> +       vpxorq  %YMM0, %YMM1, %YMM1
> +       vpxorq  %YMM2, %YMM3, %YMM3
> +       vpxorq  %YMM4, %YMM5, %YMM5
> +       vpternlogd $0xde, %YMM7, %YMM1, %YMM6
> +# endif
>         /* Or together YMM3, YMM5, and YMM6.  */
>         vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
>
>
>         /* A non-zero CHAR in YMM6 represents a mismatch.  */
> -       VPCMP   $0, %YMMZERO, %YMM6, %k0{%k1}
> +       VPTESTNM %YMM6, %YMM6, %k0{%k1}
>         kmovd   %k0, %LOOP_REG
>
>         TESTEQ  %LOOP_REG
> @@ -437,13 +622,13 @@ L(loop_skip_page_cross_check):
>
>         /* Find which VEC has the mismatch of end of string.  */
>         VPTESTM %YMM0, %YMM0, %k1
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0{%k1}
> +       VPTESTNM %YMM1, %YMM1, %k0{%k1}
>         kmovd   %k0, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_0_end)
>
>         VPTESTM %YMM2, %YMM2, %k1
> -       VPCMP   $0, %YMMZERO, %YMM3, %k0{%k1}
> +       VPTESTNM %YMM3, %YMM3, %k0{%k1}
>         kmovd   %k0, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_1_end)
> @@ -457,7 +642,7 @@ L(return_vec_2_3_end):
>  # endif
>
>         VPTESTM %YMM4, %YMM4, %k1
> -       VPCMP   $0, %YMMZERO, %YMM5, %k0{%k1}
> +       VPTESTNM %YMM5, %YMM5, %k0{%k1}
>         kmovd   %k0, %ecx
>         TESTEQ  %ecx
>  # if CHAR_PER_VEC <= 16
> @@ -493,6 +678,8 @@ L(return_vec_3_end):
>  # else
>         movzbl  (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
>         movzbl  (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -545,6 +732,8 @@ L(return_vec_0_end):
>  # else
>         movzbl  (%rdi, %rcx), %eax
>         movzbl  (%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         /* Flip `eax` if `rdi` and `rsi` were swapped in the page-cross
>            logic. Subtract `r8d` after xor for the zero case.  */
> @@ -569,6 +758,8 @@ L(return_vec_1_end):
>  #  else
>         movzbl  VEC_SIZE(%rdi, %rcx), %eax
>         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -598,7 +789,7 @@ L(page_cross_during_loop):
>
>         VMOVA   (%rdi), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, (%rsi), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_0_end)
> @@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross):
>            been loaded earlier so must be valid.  */
>         VMOVU   -VEC_SIZE(%rdi, %rax), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
> -
> +       CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
>         /* Mask of potentially valid bits. The lower bits can come from
>            out-of-range comparisons (but are safe regarding page crosses).  */
>
> @@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross):
>
>  # ifdef USE_AS_STRNCMP
>  #  ifdef USE_AS_WCSCMP
> +       /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> +          safe.  */
>         movl    %eax, %r11d
>         shrl    $2, %r11d
>         cmpq    %r11, %rdx
> @@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem):
>  # else
>         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
>         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross):
>
>         VMOVA   VEC_SIZE(%rdi), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_1_end)
> @@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross):
>         /* Safe to include comparisons from lower bytes.  */
>         VMOVU   -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_page_cross_0)
>
>         VMOVU   -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(return_vec_page_cross_1)
> @@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross):
>         /* Must check length here as length might preclude reading the
>            next page.  */
>  #  ifdef USE_AS_WCSCMP
> +       /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> +          safe.  */
>         movl    %eax, %r11d
>         shrl    $2, %r11d
>         cmpq    %r11, %rdx
> @@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross):
>         VMOVA   (VEC_SIZE * 3)(%rdi), %YMM6
>         VPMINU  %YMM4, %YMM6, %YMM9
>         VPTESTM %YMM9, %YMM9, %k1
> -
> +# ifndef USE_AS_STRCASECMP_L
>         vpxorq  (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
>         /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
>         vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
> -
> -       VPCMP   $0, %YMMZERO, %YMM6, %k0{%k1}
> +# else
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM5
> +       TOLOWER_YMM (%YMM4, %YMM5)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM7
> +       TOLOWER_YMM (%YMM6, %YMM7)
> +       vpxorq  %YMM4, %YMM5, %YMM5
> +       vpternlogd $0xde, %YMM7, %YMM5, %YMM6
> +# endif
> +       VPTESTNM %YMM6, %YMM6, %k0{%k1}
>         kmovd   %k0, %LOOP_REG
>         TESTEQ  %LOOP_REG
>         jnz     L(return_vec_2_3_end)
> @@ -815,6 +1018,8 @@ L(return_vec_page_cross_1):
>  # else
>         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
>         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -871,7 +1076,7 @@ L(page_cross):
>  L(page_cross_loop):
>         VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
>         kmovd   %k1, %ecx
>         TESTEQ  %ecx
>         jnz     L(check_ret_vec_page_cross)
> @@ -895,7 +1100,7 @@ L(page_cross_loop):
>          */
>         VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
>         VPTESTM %YMM0, %YMM0, %k2
> -       VPCMP   $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> +       CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
>
>         kmovd   %k1, %ecx
>  # ifdef USE_AS_STRNCMP
> @@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont):
>  # else
>         movzbl  (%rdi, %rcx, SIZE_OF_CHAR), %eax
>         movzbl  (%rsi, %rcx, SIZE_OF_CHAR), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> @@ -989,7 +1196,7 @@ L(less_1x_vec_till_page):
>         /* Use 16 byte comparison.  */
>         vmovdqu (%rdi), %xmm0
>         VPTESTM %xmm0, %xmm0, %k2
> -       VPCMP   $0, (%rsi), %xmm0, %k1{%k2}
> +       CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
>         kmovd   %k1, %ecx
>  # ifdef USE_AS_WCSCMP
>         subl    $0xf, %ecx
> @@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page):
>  # endif
>         vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
>         VPTESTM %xmm0, %xmm0, %k2
> -       VPCMP   $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
> +       CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
>         kmovd   %k1, %ecx
>  # ifdef USE_AS_WCSCMP
>         subl    $0xf, %ecx
> @@ -1048,7 +1255,7 @@ L(less_16_till_page):
>         vmovq   (%rdi), %xmm0
>         vmovq   (%rsi), %xmm1
>         VPTESTM %xmm0, %xmm0, %k2
> -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
>         kmovd   %k1, %ecx
>  # ifdef USE_AS_WCSCMP
>         subl    $0x3, %ecx
> @@ -1068,7 +1275,7 @@ L(less_16_till_page):
>         vmovq   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
>         vmovq   (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
>         VPTESTM %xmm0, %xmm0, %k2
> -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
>         kmovd   %k1, %ecx
>  # ifdef USE_AS_WCSCMP
>         subl    $0x3, %ecx
> @@ -1128,7 +1335,7 @@ L(ret_less_8_wcs):
>         vmovd   (%rdi), %xmm0
>         vmovd   (%rsi), %xmm1
>         VPTESTM %xmm0, %xmm0, %k2
> -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
>         kmovd   %k1, %ecx
>         subl    $0xf, %ecx
>         jnz     L(check_ret_vec_page_cross)
> @@ -1143,7 +1350,7 @@ L(ret_less_8_wcs):
>         vmovd   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
>         vmovd   (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
>         VPTESTM %xmm0, %xmm0, %k2
> -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
>         kmovd   %k1, %ecx
>         subl    $0xf, %ecx
>         jnz     L(check_ret_vec_page_cross)
> @@ -1176,7 +1383,9 @@ L(less_4_till_page):
>  L(less_4_loop):
>         movzbl  (%rdi), %eax
>         movzbl  (%rsi, %rdi), %ecx
> -       subl    %ecx, %eax
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> +       subl    %BYTE_LOOP_REG, %eax
>         jnz     L(ret_less_4_loop)
>         testl   %ecx, %ecx
>         jz      L(ret_zero_4_loop)
> @@ -1203,5 +1412,6 @@ L(ret_less_4_loop):
>         subl    %r8d, %eax
>         ret
>  # endif
> -END(STRCMP)
> +       cfi_endproc
> +       .size   STRCMP, .-STRCMP
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> new file mode 100644
> index 0000000000..8a5af3695c
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> @@ -0,0 +1,25 @@
> +/* strncasecmp_l optimized with EVEX.
> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef STRCMP
> +# define STRCMP        __strncasecmp_l_evex
> +#endif
> +#define OVERFLOW_STRCMP        __strcasecmp_l_evex
> +#define USE_AS_STRCASECMP_L
> +#define USE_AS_STRNCMP
> +#include "strcmp-evex.S"
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread
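
A note on the case-conversion trick used in the patch above: the three
32-byte constants L(lcase_min) (0x41 == 'A'), L(lcase_max) (0x1a == 26)
and L(case_add) (0x20) implement a branchless per-byte ASCII case fold.
A minimal scalar C sketch of the same transform (the function name is
illustrative, not part of the patch):

    /* Scalar model of the vector TOLOWER macro: subtract 'A', use an
       unsigned compare against 26 to detect 'A'..'Z', and add 0x20 to
       reach 'a'..'z'.  The assembly does this 32 bytes at a time with
       vpsubb/vpcmpub/vpaddb under mask registers %k5/%k6.  */
    static unsigned char
    ascii_tolower_byte (unsigned char c)
    {
      /* (c - 0x41) < 0x1a is true exactly for 'A'..'Z'; vpcmpub $1
         (unsigned less-than) computes the same predicate per byte.  */
      if ((unsigned char) (c - 0x41) < 0x1a)
        c += 0x20;  /* case_add: 'A' + 0x20 == 'a'.  */
      return c;
    }

Relatedly, the VPTESTNM substitution removes the need for a dedicated
zero register: vptestnm{b,d} sets a mask bit where the AND of its two
sources is zero, so testing a vector against itself flags exactly its
zero (null CHAR) elements, replacing the old VPCMP $0 against %YMMZERO.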

* Re: [PATCH v4 22/23] x86: Add EVEX optimized str{n}casecmp
  2022-03-25 18:15       ` H.J. Lu
@ 2022-03-25 18:18         ` Noah Goldstein
  2022-05-12 19:47           ` Sunil Pandey
  0 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-25 18:18 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Fri, Mar 25, 2022 at 1:15 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Mar 24, 2022 at 4:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
> >
> > All string/memory tests pass.
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> >  sysdeps/x86_64/multiarch/Makefile            |   2 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  16 +
> >  sysdeps/x86_64/multiarch/ifunc-strcasecmp.h  |   5 +
> >  sysdeps/x86_64/multiarch/strcasecmp_l-evex.S |  23 ++
> >  sysdeps/x86_64/multiarch/strcmp-evex.S       | 290 ++++++++++++++++---
> >  sysdeps/x86_64/multiarch/strncase_l-evex.S   |  25 ++
> >  6 files changed, 321 insertions(+), 40 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks, pushed the patchset.
>
> Thanks.
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 76+ messages in thread
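
For context on the ".621" figure in the commit message above: it is the
geometric mean, over the N=40 benchmarks, of the time ratio
EVEX / SSE4.2, so the EVEX variant takes roughly 62% of the SSE4.2 time
on average.  A hedged sketch of how such a summary can be computed (the
helper name and inputs are illustrative, not from the glibc benchtests):

    #include <math.h>
    #include <stddef.h>

    /* Geometric mean of n per-benchmark time ratios, computed in log
       space so that multiplying many ratios together cannot overflow
       or underflow.  */
    static double
    geometric_mean (const double *ratios, size_t n)
    {
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (ratios[i]);  /* ratio = evex_time / sse42_time */
      return exp (log_sum / (double) n);
    }

The geometric mean is the conventional choice for averaging ratios: it
is symmetric under inversion, so a 2x speedup and a 2x slowdown cancel
out instead of biasing the average upward.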

* Re: [PATCH v2 12/31] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
  2022-03-24 21:58         ` H.J. Lu
@ 2022-05-04  6:05           ` Sunil Pandey
  0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-04  6:05 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

On Thu, Mar 24, 2022 at 2:59 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Mar 24, 2022 at 2:43 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 4:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Thu, Mar 24, 2022 at 1:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > The overflow case for __wcsncmp_avx2_rtm should fall back to
> > > > __wcscmp_avx2_rtm, not __wcscmp_avx2.
> > > >
> > > > commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
> > > > Author: Noah Goldstein <goldstein.w.n@gmail.com>
> > > > Date:   Sun Jan 9 16:02:21 2022 -0600
> > > >
> > > >     x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
> > > >
> > > > That commit set the wrong fallback function for `__wcsncmp_avx2_rtm`:
> > > > it falls back to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm`, which
> > > > can cause spurious aborts.
> > > >
> > > > This change will need to be backported.
> > > >
> > > > All string/memory tests pass.
> > > > ---
> > > >  sysdeps/x86/tst-strncmp-rtm.c          | 15 +++++++++++++++
> > > >  sysdeps/x86_64/multiarch/strcmp-avx2.S |  2 +-
> > > >  2 files changed, 16 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
> > > > index 300bc8c281..a3b14e72ff 100644
> > > > --- a/sysdeps/x86/tst-strncmp-rtm.c
> > > > +++ b/sysdeps/x86/tst-strncmp-rtm.c
> > > > @@ -70,6 +70,16 @@ function_overflow (void)
> > > >      return 1;
> > > >  }
> > > >
> > > > +__attribute__ ((noinline, noclone))
> > > > +static int
> > > > +function_overflow2 (void)
> > > > +{
> > > > +  if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
> > > > +    return 0;
> > > > +  else
> > > > +    return 1;
> > > > +}
> > > > +
> > > >  static int
> > > >  do_test (void)
> > > >  {
> > > > @@ -77,5 +87,10 @@ do_test (void)
> > > >    if (status != EXIT_SUCCESS)
> > > >      return status;
> > > >    status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
> > > > +  if (status != EXIT_SUCCESS)
> > > > +    return status;
> > > > +  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
> > > > +  if (status != EXIT_SUCCESS)
> > > > +    return status;
> > > >    return status;
> > > >  }
> > > > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > > index 52ff5ad724..86a86b68e3 100644
> > > > --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > > +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > > @@ -122,7 +122,7 @@ ENTRY(STRCMP)
> > > >            are cases where length is large enough that it can never be a
> > > >            bound on valid memory so just use wcscmp.  */
> > > >         shrq    $56, %rcx
> > > > -       jnz     __wcscmp_avx2
> > > > +       jnz     OVERFLOW_STRCMP
> > > >
> > > >         leaq    (, %rdx, 4), %rdx
> > > >  #  endif
> > > > --
> > > > 2.25.1
> > > >
> > >
> > > LGTM.  Verified on an AVX2 machine with RTM.  Without the fix,
> > > the new testcase failed.
> >
> > And that w/ the fix it passes?
>
> Yes.
>
> > >
> > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > >
> > > Thanks.
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
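
The underlying guard is easier to see in C than in the hunk. For wide
characters the length is in wchar_t units and gets scaled by 4, so any
length with bits set in its top byte could never bound real memory; the
implementation therefore treats such calls as unbounded, and the bug was
that the RTM build delegated outside the RTM family. A minimal sketch of
the idea (my illustration, assuming 64-bit size_t; not the glibc source):

#include <stddef.h>
#include <wchar.h>

static int
wcsncmp_overflow_sketch (const wchar_t *s1, const wchar_t *s2, size_t n)
{
  /* Mirrors `shrq $56, %rcx; jnz OVERFLOW_STRCMP': if the top byte of
     N is non-zero, N * sizeof (wchar_t) cannot be a valid bound, so
     behave exactly like wcscmp.  In the RTM build OVERFLOW_STRCMP must
     resolve to __wcscmp_avx2_rtm -- that is the fix above.  */
  if (n >> 56)
    return wcscmp (s1, s2);
  return wcsncmp (s1, s2, n);
}

The new function_overflow2 test pins this down: SIZE_MAX >> 4 still has
bits 56-59 set, so it takes the overflow path with a different bit
pattern from the one the existing function_overflow test covers.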


* Re: [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch
  2022-03-24 19:36       ` H.J. Lu
@ 2022-05-12 19:31         ` Sunil Pandey
  0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:31 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

On Thu, Mar 24, 2022 at 12:37 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Mar 24, 2022 at 12:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > Small code cleanup for size: -53 bytes.
> > > >
> > > > Add comment justifying using a branch to do NULL/non-null return.
> > >
> > >
> > > Do you have follow-up patches to improve its performance?  We are
> > > backporting all x86-64 improvements to Intel release branches:
> > >
> > > https://gitlab.com/x86-glibc/glibc/-/wikis/home
> > >
> > > Patches without performance improvements are undesirable.
> >
> > No further changes planned at the moment; the code size savings
> > seem worth it for master, though. Also in favor of adding the comment,
> > as I think it's non-intuitive.
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
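
The comment being added describes a branch-correlation effect worth
spelling out: callers of strchr almost always branch on the result, so a
NULL/non-NULL branch inside the implementation is trained by, and
predicts, the caller's own branch, whereas cmovcc would add a data
dependency on every call. A typical caller shape, for illustration only:

#include <stdio.h>
#include <string.h>

static void
use_strchr (const char *s)
{
  const char *p = strchr (s, '/');
  if (p == NULL)   /* correlates with the branch inside strchr */
    puts ("no separator");
  else
    printf ("suffix: %s\n", p);
}

On a miss, the cost tends to move from the caller's `if' into strchr
rather than being paid twice, which is the argument the new comment
makes.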


* Re: [PATCH v1 04/23] x86: Code cleanup in strchr-evex and comment justifying branch
  2022-03-24 18:54   ` H.J. Lu
@ 2022-05-12 19:32     ` Sunil Pandey
  0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:32 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

On Thu, Mar 24, 2022 at 11:55 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Small code cleanup for size: -81 bytes.
> >
> > Add comment justifying using a branch to do NULL/non-null return.
> >
> > All string/memory tests pass and no regressions in benchtests.
> >
> > geometric_mean(N=20) of all benchmarks New / Original: .985
> > ---
> > Geometric Mean N=20 runs; All functions page aligned
> > length, alignment,  pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
> >   2048,         0,   32,    0,               23,                127,               0.878
> >   2048,         1,   32,    0,               23,                127,                0.88
> >   2048,         0,   64,    0,               23,                127,               0.997
> >   2048,         2,   64,    0,               23,                127,               1.001
> >   2048,         0,  128,    0,               23,                127,               0.973
> >   2048,         3,  128,    0,               23,                127,               0.971
> >   2048,         0,  256,    0,               23,                127,               0.976
> >   2048,         4,  256,    0,               23,                127,               0.973
> >   2048,         0,  512,    0,               23,                127,               1.001
> >   2048,         5,  512,    0,               23,                127,               1.004
> >   2048,         0, 1024,    0,               23,                127,               1.005
> >   2048,         6, 1024,    0,               23,                127,               1.007
> >   2048,         0, 2048,    0,               23,                127,               1.035
> >   2048,         7, 2048,    0,               23,                127,                1.03
> >   4096,         0,   32,    0,               23,                127,               0.889
> >   4096,         1,   32,    0,               23,                127,               0.891
> >   4096,         0,   64,    0,               23,                127,               1.012
> >   4096,         2,   64,    0,               23,                127,               1.017
> >   4096,         0,  128,    0,               23,                127,               0.975
> >   4096,         3,  128,    0,               23,                127,               0.974
> >   4096,         0,  256,    0,               23,                127,               0.974
> >   4096,         4,  256,    0,               23,                127,               0.972
> >   4096,         0,  512,    0,               23,                127,               1.002
> >   4096,         5,  512,    0,               23,                127,               1.016
> >   4096,         0, 1024,    0,               23,                127,               1.009
> >   4096,         6, 1024,    0,               23,                127,               1.008
> >   4096,         0, 2048,    0,               23,                127,               1.003
> >   4096,         7, 2048,    0,               23,                127,               1.004
> >    256,         1,   64,    0,               23,                127,               0.993
> >    256,         2,   64,    0,               23,                127,               0.999
> >    256,         3,   64,    0,               23,                127,               0.992
> >    256,         4,   64,    0,               23,                127,                0.99
> >    256,         5,   64,    0,               23,                127,                0.99
> >    256,         6,   64,    0,               23,                127,               0.994
> >    256,         7,   64,    0,               23,                127,               0.991
> >    512,         0,  256,    0,               23,                127,               0.971
> >    512,        16,  256,    0,               23,                127,               0.971
> >    512,        32,  256,    0,               23,                127,               1.005
> >    512,        48,  256,    0,               23,                127,               0.998
> >    512,        64,  256,    0,               23,                127,               1.001
> >    512,        80,  256,    0,               23,                127,               1.002
> >    512,        96,  256,    0,               23,                127,               1.005
> >    512,       112,  256,    0,               23,                127,               1.012
> >      1,         0,    0,    0,               23,                127,               1.024
> >      2,         0,    1,    0,               23,                127,               0.991
> >      3,         0,    2,    0,               23,                127,               0.997
> >      4,         0,    3,    0,               23,                127,               0.984
> >      5,         0,    4,    0,               23,                127,               0.993
> >      6,         0,    5,    0,               23,                127,               0.985
> >      7,         0,    6,    0,               23,                127,               0.979
> >      8,         0,    7,    0,               23,                127,               0.975
> >      9,         0,    8,    0,               23,                127,               0.965
> >     10,         0,    9,    0,               23,                127,               0.957
> >     11,         0,   10,    0,               23,                127,               0.979
> >     12,         0,   11,    0,               23,                127,               0.987
> >     13,         0,   12,    0,               23,                127,               1.023
> >     14,         0,   13,    0,               23,                127,               0.997
> >     15,         0,   14,    0,               23,                127,               0.983
> >     16,         0,   15,    0,               23,                127,               0.987
> >     17,         0,   16,    0,               23,                127,               0.993
> >     18,         0,   17,    0,               23,                127,               0.985
> >     19,         0,   18,    0,               23,                127,               0.999
> >     20,         0,   19,    0,               23,                127,               0.998
> >     21,         0,   20,    0,               23,                127,               0.983
> >     22,         0,   21,    0,               23,                127,               0.983
> >     23,         0,   22,    0,               23,                127,               1.002
> >     24,         0,   23,    0,               23,                127,                 1.0
> >     25,         0,   24,    0,               23,                127,               1.002
> >     26,         0,   25,    0,               23,                127,               0.984
> >     27,         0,   26,    0,               23,                127,               0.994
> >     28,         0,   27,    0,               23,                127,               0.995
> >     29,         0,   28,    0,               23,                127,               1.017
> >     30,         0,   29,    0,               23,                127,               1.009
> >     31,         0,   30,    0,               23,                127,               1.001
> >     32,         0,   31,    0,               23,                127,               1.021
> >   2048,         0,   32,    0,                0,                127,               0.899
> >   2048,         1,   32,    0,                0,                127,                0.93
> >   2048,         0,   64,    0,                0,                127,               1.009
> >   2048,         2,   64,    0,                0,                127,               1.023
> >   2048,         0,  128,    0,                0,                127,               0.973
> >   2048,         3,  128,    0,                0,                127,               0.975
> >   2048,         0,  256,    0,                0,                127,               0.974
> >   2048,         4,  256,    0,                0,                127,                0.97
> >   2048,         0,  512,    0,                0,                127,               0.999
> >   2048,         5,  512,    0,                0,                127,               1.004
> >   2048,         0, 1024,    0,                0,                127,               1.008
> >   2048,         6, 1024,    0,                0,                127,               1.008
> >   2048,         0, 2048,    0,                0,                127,               0.996
> >   2048,         7, 2048,    0,                0,                127,               1.002
> >   4096,         0,   32,    0,                0,                127,               0.872
> >   4096,         1,   32,    0,                0,                127,               0.881
> >   4096,         0,   64,    0,                0,                127,               1.006
> >   4096,         2,   64,    0,                0,                127,               1.005
> >   4096,         0,  128,    0,                0,                127,               0.973
> >   4096,         3,  128,    0,                0,                127,               0.974
> >   4096,         0,  256,    0,                0,                127,               0.969
> >   4096,         4,  256,    0,                0,                127,               0.971
> >   4096,         0,  512,    0,                0,                127,                 1.0
> >   4096,         5,  512,    0,                0,                127,               1.005
> >   4096,         0, 1024,    0,                0,                127,               1.007
> >   4096,         6, 1024,    0,                0,                127,               1.009
> >   4096,         0, 2048,    0,                0,                127,               1.005
> >   4096,         7, 2048,    0,                0,                127,               1.007
> >    256,         1,   64,    0,                0,                127,               0.994
> >    256,         2,   64,    0,                0,                127,               1.008
> >    256,         3,   64,    0,                0,                127,               1.019
> >    256,         4,   64,    0,                0,                127,               0.991
> >    256,         5,   64,    0,                0,                127,               0.992
> >    256,         6,   64,    0,                0,                127,               0.991
> >    256,         7,   64,    0,                0,                127,               0.988
> >    512,         0,  256,    0,                0,                127,               0.971
> >    512,        16,  256,    0,                0,                127,               0.967
> >    512,        32,  256,    0,                0,                127,               1.005
> >    512,        48,  256,    0,                0,                127,               1.001
> >    512,        64,  256,    0,                0,                127,               1.009
> >    512,        80,  256,    0,                0,                127,               1.008
> >    512,        96,  256,    0,                0,                127,               1.009
> >    512,       112,  256,    0,                0,                127,               1.016
> >      1,         0,    0,    0,                0,                127,               1.038
> >      2,         0,    1,    0,                0,                127,               1.009
> >      3,         0,    2,    0,                0,                127,               0.992
> >      4,         0,    3,    0,                0,                127,               1.004
> >      5,         0,    4,    0,                0,                127,               0.966
> >      6,         0,    5,    0,                0,                127,               0.968
> >      7,         0,    6,    0,                0,                127,               1.004
> >      8,         0,    7,    0,                0,                127,                0.99
> >      9,         0,    8,    0,                0,                127,               0.958
> >     10,         0,    9,    0,                0,                127,                0.96
> >     11,         0,   10,    0,                0,                127,               0.948
> >     12,         0,   11,    0,                0,                127,               0.984
> >     13,         0,   12,    0,                0,                127,               0.967
> >     14,         0,   13,    0,                0,                127,               0.993
> >     15,         0,   14,    0,                0,                127,               0.991
> >     16,         0,   15,    0,                0,                127,                 1.0
> >     17,         0,   16,    0,                0,                127,               0.982
> >     18,         0,   17,    0,                0,                127,               0.977
> >     19,         0,   18,    0,                0,                127,               0.987
> >     20,         0,   19,    0,                0,                127,               0.978
> >     21,         0,   20,    0,                0,                127,                 1.0
> >     22,         0,   21,    0,                0,                127,                0.99
> >     23,         0,   22,    0,                0,                127,               0.988
> >     24,         0,   23,    0,                0,                127,               0.997
> >     25,         0,   24,    0,                0,                127,               1.003
> >     26,         0,   25,    0,                0,                127,               1.004
> >     27,         0,   26,    0,                0,                127,               0.982
> >     28,         0,   27,    0,                0,                127,               0.972
> >     29,         0,   28,    0,                0,                127,               0.978
> >     30,         0,   29,    0,                0,                127,               0.992
> >     31,         0,   30,    0,                0,                127,               0.986
> >     32,         0,   31,    0,                0,                127,                 1.0
> >
> >     16,         0,   15,    1,                1,                  0,               0.997
> >     16,         0,   15,    1,                0,                  0,               1.001
> >     16,         0,   15,    1,                1,                0.1,               0.984
> >     16,         0,   15,    1,                0,                0.1,               0.999
> >     16,         0,   15,    1,                1,               0.25,               0.929
> >     16,         0,   15,    1,                0,               0.25,               1.001
> >     16,         0,   15,    1,                1,               0.33,               0.892
> >     16,         0,   15,    1,                0,               0.33,               0.996
> >     16,         0,   15,    1,                1,                0.5,               0.897
> >     16,         0,   15,    1,                0,                0.5,               1.009
> >     16,         0,   15,    1,                1,               0.66,               0.882
> >     16,         0,   15,    1,                0,               0.66,               0.967
> >     16,         0,   15,    1,                1,               0.75,               0.919
> >     16,         0,   15,    1,                0,               0.75,               1.027
> >     16,         0,   15,    1,                1,                0.9,               0.949
> >     16,         0,   15,    1,                0,                0.9,               1.021
> >     16,         0,   15,    1,                1,                  1,               0.998
> >     16,         0,   15,    1,                0,                  1,               0.999
> >
> >  sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++-----------
> >  1 file changed, 80 insertions(+), 66 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
> > index f62cd9d144..ec739fb8f9 100644
> > --- a/sysdeps/x86_64/multiarch/strchr-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strchr-evex.S
> > @@ -30,6 +30,7 @@
> >  # ifdef USE_AS_WCSCHR
> >  #  define VPBROADCAST  vpbroadcastd
> >  #  define VPCMP                vpcmpd
> > +#  define VPTESTN      vptestnmd
> >  #  define VPMINU       vpminud
> >  #  define CHAR_REG     esi
> >  #  define SHIFT_REG    ecx
> > @@ -37,6 +38,7 @@
> >  # else
> >  #  define VPBROADCAST  vpbroadcastb
> >  #  define VPCMP                vpcmpb
> > +#  define VPTESTN      vptestnmb
> >  #  define VPMINU       vpminub
> >  #  define CHAR_REG     sil
> >  #  define SHIFT_REG    edx
> > @@ -61,13 +63,11 @@
> >  # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> >
> >         .section .text.evex,"ax",@progbits
> > -ENTRY (STRCHR)
> > +ENTRY_P2ALIGN (STRCHR, 5)
> >         /* Broadcast CHAR to YMM0.      */
> >         VPBROADCAST     %esi, %YMM0
> >         movl    %edi, %eax
> >         andl    $(PAGE_SIZE - 1), %eax
> > -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> > -
> >         /* Check if we cross page boundary with one vector load.
> >            Otherwise it is safe to use an unaligned load.  */
> >         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > @@ -81,49 +81,35 @@ ENTRY (STRCHR)
> >         vpxorq  %YMM1, %YMM0, %YMM2
> >         VPMINU  %YMM2, %YMM1, %YMM2
> >         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> > +       VPTESTN %YMM2, %YMM2, %k0
> >         kmovd   %k0, %eax
> >         testl   %eax, %eax
> >         jz      L(aligned_more)
> >         tzcntl  %eax, %eax
> > +# ifndef USE_AS_STRCHRNUL
> > +       /* Found CHAR or the null byte.  */
> > +       cmp     (%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > +       /* NB: Use a branch instead of cmovcc here. The expectation is
> > +          that with strchr the user will branch based on the result
> > +          being null. Since this branch will be 100% predictive of the
> > +          user branch, a branch miss here should save what otherwise
> > +          would be a branch miss in the user code. Either way, using a
> > +          branch 1) saves code size and 2) is faster in highly
> > +          predictable environments.  */
> > +       jne     L(zero)
> > +# endif
> >  # ifdef USE_AS_WCSCHR
> >         /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> >          */
> >         leaq    (%rdi, %rax, CHAR_SIZE), %rax
> >  # else
> >         addq    %rdi, %rax
> > -# endif
> > -# ifndef USE_AS_STRCHRNUL
> > -       /* Found CHAR or the null byte.  */
> > -       cmp     (%rax), %CHAR_REG
> > -       jne     L(zero)
> >  # endif
> >         ret
> >
> > -       /* .p2align 5 helps keep performance more consistent if ENTRY()
> > -          alignment % 32 was either 16 or 0. As well this makes the
> > -          alignment % 32 of the loop_4x_vec fixed which makes tuning it
> > -          easier.  */
> > -       .p2align 5
> > -L(first_vec_x3):
> > -       tzcntl  %eax, %eax
> > -# ifndef USE_AS_STRCHRNUL
> > -       /* Found CHAR or the null byte.  */
> > -       cmp     (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > -       jne     L(zero)
> > -# endif
> > -       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> > -          bytes.  */
> > -       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > -       ret
> >
> > -# ifndef USE_AS_STRCHRNUL
> > -L(zero):
> > -       xorl    %eax, %eax
> > -       ret
> > -# endif
> >
> > -       .p2align 4
> > +       .p2align 4,, 10
> >  L(first_vec_x4):
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Check to see if first match was CHAR (k0) or null (k1).  */
> > @@ -144,9 +130,18 @@ L(first_vec_x4):
> >         leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> >         ret
> >
> > +# ifndef USE_AS_STRCHRNUL
> > +L(zero):
> > +       xorl    %eax, %eax
> > +       ret
> > +# endif
> > +
> > +
> >         .p2align 4
> >  L(first_vec_x1):
> > -       tzcntl  %eax, %eax
> > +       /* Use bsf here to save 1 byte, keeping the block in 1x
> > +          fetch block. eax guaranteed non-zero.  */
> > +       bsfl    %eax, %eax
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Found CHAR or the null byte.  */
> >         cmp     (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > @@ -158,7 +153,7 @@ L(first_vec_x1):
> >         leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> >         ret
> >
> > -       .p2align 4
> > +       .p2align 4,, 10
> >  L(first_vec_x2):
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Check to see if first match was CHAR (k0) or null (k1).  */
> > @@ -179,6 +174,21 @@ L(first_vec_x2):
> >         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >         ret
> >
> > +       .p2align 4,, 10
> > +L(first_vec_x3):
> > +       /* Use bsf here to save 1 byte, keeping the block in 1x
> > +          fetch block. eax guaranteed non-zero.  */
> > +       bsfl    %eax, %eax
> > +# ifndef USE_AS_STRCHRNUL
> > +       /* Found CHAR or the null byte.  */
> > +       cmp     (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > +       jne     L(zero)
> > +# endif
> > +       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> > +          bytes.  */
> > +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> > +
> >         .p2align 4
> >  L(aligned_more):
> >         /* Align data to VEC_SIZE.  */
> > @@ -195,7 +205,7 @@ L(cross_page_continue):
> >         vpxorq  %YMM1, %YMM0, %YMM2
> >         VPMINU  %YMM2, %YMM1, %YMM2
> >         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> > +       VPTESTN %YMM2, %YMM2, %k0
> >         kmovd   %k0, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x1)
> > @@ -206,7 +216,7 @@ L(cross_page_continue):
> >         /* Each bit in K0 represents a CHAR in YMM1.  */
> >         VPCMP   $0, %YMM1, %YMM0, %k0
> >         /* Each bit in K1 represents a CHAR in YMM1.  */
> > -       VPCMP   $0, %YMM1, %YMMZERO, %k1
> > +       VPTESTN %YMM1, %YMM1, %k1
> >         kortestd        %k0, %k1
> >         jnz     L(first_vec_x2)
> >
> > @@ -215,7 +225,7 @@ L(cross_page_continue):
> >         vpxorq  %YMM1, %YMM0, %YMM2
> >         VPMINU  %YMM2, %YMM1, %YMM2
> >         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> > +       VPTESTN %YMM2, %YMM2, %k0
> >         kmovd   %k0, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x3)
> > @@ -224,7 +234,7 @@ L(cross_page_continue):
> >         /* Each bit in K0 represents a CHAR in YMM1.  */
> >         VPCMP   $0, %YMM1, %YMM0, %k0
> >         /* Each bit in K1 represents a CHAR in YMM1.  */
> > -       VPCMP   $0, %YMM1, %YMMZERO, %k1
> > +       VPTESTN %YMM1, %YMM1, %k1
> >         kortestd        %k0, %k1
> >         jnz     L(first_vec_x4)
> >
> > @@ -265,33 +275,33 @@ L(loop_4x_vec):
> >         VPMINU  %YMM3, %YMM4, %YMM4
> >         VPMINU  %YMM2, %YMM4, %YMM4{%k4}{z}
> >
> > -       VPCMP   $0, %YMMZERO, %YMM4, %k1
> > +       VPTESTN %YMM4, %YMM4, %k1
> >         kmovd   %k1, %ecx
> >         subq    $-(VEC_SIZE * 4), %rdi
> >         testl   %ecx, %ecx
> >         jz      L(loop_4x_vec)
> >
> > -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> > +       VPTESTN %YMM1, %YMM1, %k0
> >         kmovd   %k0, %eax
> >         testl   %eax, %eax
> >         jnz     L(last_vec_x1)
> >
> > -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> > +       VPTESTN %YMM2, %YMM2, %k0
> >         kmovd   %k0, %eax
> >         testl   %eax, %eax
> >         jnz     L(last_vec_x2)
> >
> > -       VPCMP   $0, %YMMZERO, %YMM3, %k0
> > +       VPTESTN %YMM3, %YMM3, %k0
> >         kmovd   %k0, %eax
> >         /* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
> >  # ifdef USE_AS_WCSCHR
> >         sall    $8, %ecx
> >         orl     %ecx, %eax
> > -       tzcntl  %eax, %eax
> > +       bsfl    %eax, %eax
> >  # else
> >         salq    $32, %rcx
> >         orq     %rcx, %rax
> > -       tzcntq  %rax, %rax
> > +       bsfq    %rax, %rax
> >  # endif
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Check if match was CHAR or null.  */
> > @@ -303,28 +313,28 @@ L(loop_4x_vec):
> >         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >         ret
> >
> > -# ifndef USE_AS_STRCHRNUL
> > -L(zero_end):
> > -       xorl    %eax, %eax
> > -       ret
> > +       .p2align 4,, 8
> > +L(last_vec_x1):
> > +       bsfl    %eax, %eax
> > +# ifdef USE_AS_WCSCHR
> > +       /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> > +          */
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > +       addq    %rdi, %rax
> >  # endif
> >
> > -       .p2align 4
> > -L(last_vec_x1):
> > -       tzcntl  %eax, %eax
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Check if match was null.  */
> > -       cmp     (%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > +       cmp     (%rax), %CHAR_REG
> >         jne     L(zero_end)
> >  # endif
> > -       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> > -          bytes.  */
> > -       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +
> >         ret
> >
> > -       .p2align 4
> > +       .p2align 4,, 8
> >  L(last_vec_x2):
> > -       tzcntl  %eax, %eax
> > +       bsfl    %eax, %eax
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Check if match was null.  */
> >         cmp     (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > @@ -336,7 +346,7 @@ L(last_vec_x2):
> >         ret
> >
> >         /* Cold case for crossing page with first load.  */
> > -       .p2align 4
> > +       .p2align 4,, 8
> >  L(cross_page_boundary):
> >         movq    %rdi, %rdx
> >         /* Align rdi.  */
> > @@ -346,9 +356,9 @@ L(cross_page_boundary):
> >         vpxorq  %YMM1, %YMM0, %YMM2
> >         VPMINU  %YMM2, %YMM1, %YMM2
> >         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> > +       VPTESTN %YMM2, %YMM2, %k0
> >         kmovd   %k0, %eax
> > -       /* Remove the leading bits.      */
> > +       /* Remove the leading bits.  */
> >  # ifdef USE_AS_WCSCHR
> >         movl    %edx, %SHIFT_REG
> >         /* NB: Divide shift count by 4 since each bit in K1 represent 4
> > @@ -360,20 +370,24 @@ L(cross_page_boundary):
> >         /* If eax is zero continue.  */
> >         testl   %eax, %eax
> >         jz      L(cross_page_continue)
> > -       tzcntl  %eax, %eax
> > -# ifndef USE_AS_STRCHRNUL
> > -       /* Check to see if match was CHAR or null.  */
> > -       cmp     (%rdx, %rax, CHAR_SIZE), %CHAR_REG
> > -       jne     L(zero_end)
> > -# endif
> > +       bsfl    %eax, %eax
> > +
> >  # ifdef USE_AS_WCSCHR
> >         /* NB: Multiply wchar_t count by 4 to get the number of
> >            bytes.  */
> >         leaq    (%rdx, %rax, CHAR_SIZE), %rax
> >  # else
> >         addq    %rdx, %rax
> > +# endif
> > +# ifndef USE_AS_STRCHRNUL
> > +       /* Check to see if match was CHAR or null.  */
> > +       cmp     (%rax), %CHAR_REG
> > +       je      L(cross_page_ret)
> > +L(zero_end):
> > +       xorl    %eax, %eax
> > +L(cross_page_ret):
> >  # endif
> >         ret
> >
> >  END (STRCHR)
> > -# endif
> > +#endif
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
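
The mechanical rewrite running through this file -- VPCMP $0 against a
dedicated zero register becoming VPTESTN of a vector with itself -- is
clearest with intrinsics. A small sketch (my illustration, assuming
AVX512BW+AVX512VL; not the glibc source):

#include <immintrin.h>

__mmask32
zero_mask_old (__m256i v)
{
  /* Needs the all-zero register the patch removes (YMMZERO).  */
  return _mm256_cmpeq_epi8_mask (v, _mm256_setzero_si256 ());
}

__mmask32
zero_mask_new (__m256i v)
{
  /* vptestnmb: bit i is set iff (v[i] & v[i]) == 0, i.e. v[i] == 0;
     no zero register needed.  */
  return _mm256_testn_epi8_mask (v, v);
}

Dropping the vpxorq that materialized YMMZERO, plus using bsf where the
operand is known non-zero (tzcnt is the F3-prefixed, one-byte-longer
encoding of bsf), is where the -81 bytes come from.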


* Re: [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c
  2022-03-24 18:55   ` H.J. Lu
@ 2022-05-12 19:34     ` Sunil Pandey
  0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:34 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

On Thu, Mar 24, 2022 at 11:57 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
> > _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
> > sign extensions.
> >
> > geometric_mean(N=20) of all benchmarks that don't fall back on
> > sse2/strlen; New / Original: .928
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=20 runs; All functions page aligned
> > len, align1, align2,  pos, New Time / Old Time
> >   0,      0,      0,  512,               1.207
> >   1,      0,      0,  512,               1.039
> >   1,      1,      0,  512,               0.997
> >   1,      0,      1,  512,               0.981
> >   1,      1,      1,  512,               0.977
> >   2,      0,      0,  512,                1.02
> >   2,      2,      0,  512,               0.979
> >   2,      0,      2,  512,               0.902
> >   2,      2,      2,  512,               0.958
> >   3,      0,      0,  512,               0.978
> >   3,      3,      0,  512,               0.988
> >   3,      0,      3,  512,               0.979
> >   3,      3,      3,  512,               0.955
> >   4,      0,      0,  512,               0.969
> >   4,      4,      0,  512,               0.991
> >   4,      0,      4,  512,                0.94
> >   4,      4,      4,  512,               0.958
> >   5,      0,      0,  512,               0.963
> >   5,      5,      0,  512,               1.004
> >   5,      0,      5,  512,               0.948
> >   5,      5,      5,  512,               0.971
> >   6,      0,      0,  512,               0.933
> >   6,      6,      0,  512,               1.007
> >   6,      0,      6,  512,               0.921
> >   6,      6,      6,  512,               0.969
> >   7,      0,      0,  512,               0.928
> >   7,      7,      0,  512,               0.976
> >   7,      0,      7,  512,               0.932
> >   7,      7,      7,  512,               0.995
> >   8,      0,      0,  512,               0.931
> >   8,      0,      8,  512,               0.766
> >   9,      0,      0,  512,               0.965
> >   9,      1,      0,  512,               0.999
> >   9,      0,      9,  512,               0.765
> >   9,      1,      9,  512,                0.97
> >  10,      0,      0,  512,               0.976
> >  10,      2,      0,  512,               0.991
> >  10,      0,     10,  512,               0.768
> >  10,      2,     10,  512,               0.926
> >  11,      0,      0,  512,               0.958
> >  11,      3,      0,  512,               1.006
> >  11,      0,     11,  512,               0.768
> >  11,      3,     11,  512,               0.908
> >  12,      0,      0,  512,               0.945
> >  12,      4,      0,  512,               0.896
> >  12,      0,     12,  512,               0.764
> >  12,      4,     12,  512,               0.785
> >  13,      0,      0,  512,               0.957
> >  13,      5,      0,  512,               1.019
> >  13,      0,     13,  512,                0.76
> >  13,      5,     13,  512,               0.785
> >  14,      0,      0,  512,               0.918
> >  14,      6,      0,  512,               1.004
> >  14,      0,     14,  512,                0.78
> >  14,      6,     14,  512,               0.711
> >  15,      0,      0,  512,               0.855
> >  15,      7,      0,  512,               0.985
> >  15,      0,     15,  512,               0.779
> >  15,      7,     15,  512,               0.772
> >  16,      0,      0,  512,               0.987
> >  16,      0,     16,  512,                0.99
> >  17,      0,      0,  512,               0.996
> >  17,      1,      0,  512,               0.979
> >  17,      0,     17,  512,               1.001
> >  17,      1,     17,  512,                1.03
> >  18,      0,      0,  512,               0.976
> >  18,      2,      0,  512,               0.989
> >  18,      0,     18,  512,               0.976
> >  18,      2,     18,  512,               0.992
> >  19,      0,      0,  512,               0.991
> >  19,      3,      0,  512,               0.988
> >  19,      0,     19,  512,               1.009
> >  19,      3,     19,  512,               1.018
> >  20,      0,      0,  512,               0.999
> >  20,      4,      0,  512,               1.005
> >  20,      0,     20,  512,               0.993
> >  20,      4,     20,  512,               0.983
> >  21,      0,      0,  512,               0.982
> >  21,      5,      0,  512,               0.988
> >  21,      0,     21,  512,               0.978
> >  21,      5,     21,  512,               0.984
> >  22,      0,      0,  512,               0.988
> >  22,      6,      0,  512,               0.979
> >  22,      0,     22,  512,               0.984
> >  22,      6,     22,  512,               0.983
> >  23,      0,      0,  512,               0.996
> >  23,      7,      0,  512,               0.998
> >  23,      0,     23,  512,               0.979
> >  23,      7,     23,  512,               0.987
> >  24,      0,      0,  512,                0.99
> >  24,      0,     24,  512,               0.979
> >  25,      0,      0,  512,               0.985
> >  25,      1,      0,  512,               0.988
> >  25,      0,     25,  512,                0.99
> >  25,      1,     25,  512,               0.986
> >  26,      0,      0,  512,               1.005
> >  26,      2,      0,  512,               0.995
> >  26,      0,     26,  512,               0.992
> >  26,      2,     26,  512,               0.983
> >  27,      0,      0,  512,               0.986
> >  27,      3,      0,  512,               0.978
> >  27,      0,     27,  512,               0.986
> >  27,      3,     27,  512,               0.973
> >  28,      0,      0,  512,               0.995
> >  28,      4,      0,  512,               0.993
> >  28,      0,     28,  512,               0.983
> >  28,      4,     28,  512,               1.005
> >  29,      0,      0,  512,               0.983
> >  29,      5,      0,  512,               0.982
> >  29,      0,     29,  512,               0.984
> >  29,      5,     29,  512,               1.005
> >  30,      0,      0,  512,               0.978
> >  30,      6,      0,  512,               0.985
> >  30,      0,     30,  512,               0.994
> >  30,      6,     30,  512,               0.993
> >  31,      0,      0,  512,               0.984
> >  31,      7,      0,  512,               0.983
> >  31,      0,     31,  512,                 1.0
> >  31,      7,     31,  512,               1.031
> >   4,      0,      0,   32,               0.916
> >   4,      1,      0,   32,               0.952
> >   4,      0,      1,   32,               0.927
> >   4,      1,      1,   32,               0.969
> >   4,      0,      0,   64,               0.961
> >   4,      2,      0,   64,               0.955
> >   4,      0,      2,   64,               0.975
> >   4,      2,      2,   64,               0.972
> >   4,      0,      0,  128,               0.971
> >   4,      3,      0,  128,               0.982
> >   4,      0,      3,  128,               0.945
> >   4,      3,      3,  128,               0.971
> >   4,      0,      0,  256,               1.004
> >   4,      4,      0,  256,               0.966
> >   4,      0,      4,  256,               0.961
> >   4,      4,      4,  256,               0.971
> >   4,      5,      0,  512,               0.929
> >   4,      0,      5,  512,               0.969
> >   4,      5,      5,  512,               0.985
> >   4,      0,      0, 1024,               1.003
> >   4,      6,      0, 1024,               1.009
> >   4,      0,      6, 1024,               1.005
> >   4,      6,      6, 1024,               0.999
> >   4,      0,      0, 2048,               0.917
> >   4,      7,      0, 2048,               1.015
> >   4,      0,      7, 2048,               1.011
> >   4,      7,      7, 2048,               0.907
> >  10,      1,      0,   64,               0.964
> >  10,      1,      1,   64,               0.966
> >  10,      2,      0,   64,               0.953
> >  10,      2,      2,   64,               0.972
> >  10,      3,      0,   64,               0.962
> >  10,      3,      3,   64,               0.969
> >  10,      4,      0,   64,               0.957
> >  10,      4,      4,   64,               0.969
> >  10,      5,      0,   64,               0.961
> >  10,      5,      5,   64,               0.965
> >  10,      6,      0,   64,               0.949
> >  10,      6,      6,   64,                 0.9
> >  10,      7,      0,   64,               0.957
> >  10,      7,      7,   64,               0.897
> >   6,      0,      0,    0,               0.991
> >   6,      0,      0,    1,               1.011
> >   6,      0,      1,    1,               0.939
> >   6,      0,      0,    2,               1.016
> >   6,      0,      2,    2,                0.94
> >   6,      0,      0,    3,               1.019
> >   6,      0,      3,    3,               0.941
> >   6,      0,      0,    4,               1.056
> >   6,      0,      4,    4,               0.884
> >   6,      0,      0,    5,               0.977
> >   6,      0,      5,    5,               0.934
> >   6,      0,      0,    6,               0.954
> >   6,      0,      6,    6,                0.93
> >   6,      0,      0,    7,               0.963
> >   6,      0,      7,    7,               0.916
> >   6,      0,      0,    8,               0.963
> >   6,      0,      8,    8,               0.945
> >   6,      0,      0,    9,               1.028
> >   6,      0,      9,    9,               0.942
> >   6,      0,      0,   10,               0.955
> >   6,      0,     10,   10,               0.831
> >   6,      0,      0,   11,               0.948
> >   6,      0,     11,   11,                0.82
> >   6,      0,      0,   12,               1.033
> >   6,      0,     12,   12,               0.873
> >   6,      0,      0,   13,               0.983
> >   6,      0,     13,   13,               0.852
> >   6,      0,      0,   14,               0.984
> >   6,      0,     14,   14,               0.853
> >   6,      0,      0,   15,               0.984
> >   6,      0,     15,   15,               0.882
> >   6,      0,      0,   16,               0.971
> >   6,      0,     16,   16,               0.958
> >   6,      0,      0,   17,               0.938
> >   6,      0,     17,   17,               0.947
> >   6,      0,      0,   18,                0.96
> >   6,      0,     18,   18,               0.938
> >   6,      0,      0,   19,               0.903
> >   6,      0,     19,   19,               0.943
> >   6,      0,      0,   20,               0.947
> >   6,      0,     20,   20,               0.951
> >   6,      0,      0,   21,               0.948
> >   6,      0,     21,   21,                0.96
> >   6,      0,      0,   22,               0.926
> >   6,      0,     22,   22,               0.951
> >   6,      0,      0,   23,               0.923
> >   6,      0,     23,   23,               0.959
> >   6,      0,      0,   24,               0.918
> >   6,      0,     24,   24,               0.952
> >   6,      0,      0,   25,                0.97
> >   6,      0,     25,   25,               0.952
> >   6,      0,      0,   26,               0.871
> >   6,      0,     26,   26,               0.869
> >   6,      0,      0,   27,               0.935
> >   6,      0,     27,   27,               0.836
> >   6,      0,      0,   28,               0.936
> >   6,      0,     28,   28,               0.857
> >   6,      0,      0,   29,               0.876
> >   6,      0,     29,   29,               0.859
> >   6,      0,      0,   30,               0.934
> >   6,      0,     30,   30,               0.857
> >   6,      0,      0,   31,               0.962
> >   6,      0,     31,   31,                0.86
> >   6,      0,      0,   32,               0.912
> >   6,      0,     32,   32,                0.94
> >   6,      0,      0,   33,               0.903
> >   6,      0,     33,   33,               0.968
> >   6,      0,      0,   34,               0.913
> >   6,      0,     34,   34,               0.896
> >   6,      0,      0,   35,               0.904
> >   6,      0,     35,   35,               0.913
> >   6,      0,      0,   36,               0.905
> >   6,      0,     36,   36,               0.907
> >   6,      0,      0,   37,               0.899
> >   6,      0,     37,   37,                 0.9
> >   6,      0,      0,   38,               0.912
> >   6,      0,     38,   38,               0.919
> >   6,      0,      0,   39,               0.925
> >   6,      0,     39,   39,               0.927
> >   6,      0,      0,   40,               0.923
> >   6,      0,     40,   40,               0.972
> >   6,      0,      0,   41,                0.92
> >   6,      0,     41,   41,               0.966
> >   6,      0,      0,   42,               0.915
> >   6,      0,     42,   42,               0.834
> >   6,      0,      0,   43,                0.92
> >   6,      0,     43,   43,               0.856
> >   6,      0,      0,   44,               0.908
> >   6,      0,     44,   44,               0.858
> >   6,      0,      0,   45,               0.932
> >   6,      0,     45,   45,               0.847
> >   6,      0,      0,   46,               0.927
> >   6,      0,     46,   46,               0.859
> >   6,      0,      0,   47,               0.902
> >   6,      0,     47,   47,               0.855
> >   6,      0,      0,   48,               0.949
> >   6,      0,     48,   48,               0.934
> >   6,      0,      0,   49,               0.907
> >   6,      0,     49,   49,               0.943
> >   6,      0,      0,   50,               0.934
> >   6,      0,     50,   50,               0.943
> >   6,      0,      0,   51,               0.933
> >   6,      0,     51,   51,               0.939
> >   6,      0,      0,   52,               0.944
> >   6,      0,     52,   52,               0.944
> >   6,      0,      0,   53,               0.939
> >   6,      0,     53,   53,               0.938
> >   6,      0,      0,   54,                 0.9
> >   6,      0,     54,   54,               0.923
> >   6,      0,      0,   55,                 0.9
> >   6,      0,     55,   55,               0.927
> >   6,      0,      0,   56,                 0.9
> >   6,      0,     56,   56,               0.917
> >   6,      0,      0,   57,                 0.9
> >   6,      0,     57,   57,               0.916
> >   6,      0,      0,   58,               0.914
> >   6,      0,     58,   58,               0.784
> >   6,      0,      0,   59,               0.863
> >   6,      0,     59,   59,               0.846
> >   6,      0,      0,   60,                0.88
> >   6,      0,     60,   60,               0.827
> >   6,      0,      0,   61,               0.896
> >   6,      0,     61,   61,               0.847
> >   6,      0,      0,   62,               0.894
> >   6,      0,     62,   62,               0.865
> >   6,      0,      0,   63,               0.934
> >   6,      0,     63,   63,               0.866
> >
> >  sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++---------------
> >  1 file changed, 37 insertions(+), 46 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
> > index 013aebf797..c312fab8b1 100644
> > --- a/sysdeps/x86_64/multiarch/strcspn-c.c
> > +++ b/sysdeps/x86_64/multiarch/strcspn-c.c
> > @@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
> >      RETURN (NULL, strlen (s));
> >
> >    const char *aligned;
> > -  __m128i mask;
> > -  int offset = (int) ((size_t) a & 15);
> > +  __m128i mask, maskz, zero;
> > +  unsigned int maskz_bits;
> > +  unsigned int offset = (unsigned int) ((size_t) a & 15);
> > +  zero = _mm_set1_epi8 (0);
> >    if (offset != 0)
> >      {
> >        /* Load masks.  */
> >        aligned = (const char *) ((size_t) a & -16L);
> >        __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > -
> > -      mask = __m128i_shift_right (mask0, offset);
> > +      maskz = _mm_cmpeq_epi8 (mask0, zero);
> >
> >        /* Find where the NULL terminator is.  */
> > -      int length = _mm_cmpistri (mask, mask, 0x3a);
> > -      if (length == 16 - offset)
> > -       {
> > -         /* There is no NULL terminator.  */
> > -         __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
> > -         int index = _mm_cmpistri (mask1, mask1, 0x3a);
> > -         length += index;
> > -
> > -         /* Don't use SSE4.2 if the length of A > 16.  */
> > -         if (length > 16)
> > -           return STRCSPN_SSE2 (s, a);
> > -
> > -         if (index != 0)
> > -           {
> > -             /* Combine mask0 and mask1.  We could play games with
> > -                palignr, but frankly this data should be in L1 now
> > -                so do the merge via an unaligned load.  */
> > -             mask = _mm_loadu_si128 ((__m128i *) a);
> > -           }
> > -       }
> > +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > +      if (maskz_bits != 0)
> > +        {
> > +          mask = __m128i_shift_right (mask0, offset);
> > +          offset = (unsigned int) ((size_t) s & 15);
> > +          if (offset)
> > +            goto start_unaligned;
> > +
> > +          aligned = s;
> > +          goto start_loop;
> > +        }
> >      }
> > -  else
> > -    {
> > -      /* A is aligned.  */
> > -      mask = _mm_load_si128 ((__m128i *) a);
> >
> > -      /* Find where the NULL terminator is.  */
> > -      int length = _mm_cmpistri (mask, mask, 0x3a);
> > -      if (length == 16)
> > -       {
> > -         /* There is no NULL terminator.  Don't use SSE4.2 if the length
> > -            of A > 16.  */
> > -         if (a[16] != 0)
> > -           return STRCSPN_SSE2 (s, a);
> > -       }
> > +  /* A may be unaligned here (also reached when an unaligned A's first chunk had no NULL).  */
> > +  mask = _mm_loadu_si128 ((__m128i *) a);
> > +  /* Find where the NULL terminator is.  */
> > +  maskz = _mm_cmpeq_epi8 (mask, zero);
> > +  maskz_bits = _mm_movemask_epi8 (maskz);
> > +  if (maskz_bits == 0)
> > +    {
> > +      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> > +         of A > 16.  */
> > +      if (a[16] != 0)
> > +        return STRCSPN_SSE2 (s, a);
> >      }
> >
> > -  offset = (int) ((size_t) s & 15);
> > +  aligned = s;
> > +  offset = (unsigned int) ((size_t) s & 15);
> >    if (offset != 0)
> >      {
> > +    start_unaligned:
> >        /* Check partial string.  */
> >        aligned = (const char *) ((size_t) s & -16L);
> >        __m128i value = _mm_load_si128 ((__m128i *) aligned);
> >
> >        value = __m128i_shift_right (value, offset);
> >
> > -      int length = _mm_cmpistri (mask, value, 0x2);
> > +      unsigned int length = _mm_cmpistri (mask, value, 0x2);
> >        /* No need to check ZFlag since ZFlag is always 1.  */
> > -      int cflag = _mm_cmpistrc (mask, value, 0x2);
> > +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> >        if (cflag)
> >         RETURN ((char *) (s + length), length);
> >        /* Find where the NULL terminator is.  */
> > -      int index = _mm_cmpistri (value, value, 0x3a);
> > +      unsigned int index = _mm_cmpistri (value, value, 0x3a);
> >        if (index < 16 - offset)
> >         RETURN (NULL, index);
> >        aligned += 16;
> >      }
> > -  else
> > -    aligned = s;
> >
> > +start_loop:
> >    while (1)
> >      {
> >        __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > -      int index = _mm_cmpistri (mask, value, 0x2);
> > -      int cflag = _mm_cmpistrc (mask, value, 0x2);
> > -      int zflag = _mm_cmpistrz (mask, value, 0x2);
> > +      unsigned int index = _mm_cmpistri (mask, value, 0x2);
> > +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> > +      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
> >        if (cflag)
> >         RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
> >        if (zflag)
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
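
The heart of this patch is replacing pcmpistri for terminator detection
with a plain SSE2 compare and movemask. In isolation the new idiom looks
like this (an illustrative sketch, not the glibc source; __builtin_ctz is
the GCC builtin):

#include <emmintrin.h>

/* Byte index of the first NUL in the 16 bytes of V, or 16 if none.  */
static unsigned int
first_nul_index (__m128i v)
{
  unsigned int bits = (unsigned int)
    _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, _mm_setzero_si128 ()));
  return bits ? (unsigned int) __builtin_ctz (bits) : 16;
}

Unlike pcmpistri, which has multi-cycle latency on most cores,
pcmpeqb + pmovmskb cheaply yields a reusable bitmask; and keeping the
offsets unsigned lets the compiler drop sign extensions in the 64-bit
address arithmetic. Together these are where the reported ~7%
geometric-mean improvement comes from.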


* Re: [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c
  2022-03-24 18:56   ` H.J. Lu
@ 2022-05-12 19:39     ` Sunil Pandey
  0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:39 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

On Thu, Mar 24, 2022 at 11:58 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
> > _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
> > sign extensions.
> >
> > geometric_mean(N=20) of all benchmarks that don't fall back on
> > sse2; New / Original: .901
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=20 runs; All functions page aligned
> > len, align1, align2,  pos, New Time / Old Time
> >   1,      0,      0,  512,               0.768
> >   1,      1,      0,  512,               0.666
> >   1,      0,      1,  512,               1.193
> >   1,      1,      1,  512,               0.872
> >   2,      0,      0,  512,               0.698
> >   2,      2,      0,  512,               0.687
> >   2,      0,      2,  512,               1.393
> >   2,      2,      2,  512,               0.944
> >   3,      0,      0,  512,               0.691
> >   3,      3,      0,  512,               0.676
> >   3,      0,      3,  512,               1.388
> >   3,      3,      3,  512,               0.948
> >   4,      0,      0,  512,                0.74
> >   4,      4,      0,  512,               0.678
> >   4,      0,      4,  512,               1.421
> >   4,      4,      4,  512,               0.943
> >   5,      0,      0,  512,               0.691
> >   5,      5,      0,  512,               0.675
> >   5,      0,      5,  512,               1.348
> >   5,      5,      5,  512,               0.952
> >   6,      0,      0,  512,               0.685
> >   6,      6,      0,  512,                0.67
> >   6,      0,      6,  512,               1.333
> >   6,      6,      6,  512,                0.95
> >   7,      0,      0,  512,               0.688
> >   7,      7,      0,  512,               0.675
> >   7,      0,      7,  512,               1.344
> >   7,      7,      7,  512,               0.919
> >   8,      0,      0,  512,               0.716
> >   8,      0,      8,  512,               0.935
> >   9,      0,      0,  512,               0.716
> >   9,      1,      0,  512,               0.712
> >   9,      0,      9,  512,               0.956
> >   9,      1,      9,  512,               0.992
> >  10,      0,      0,  512,               0.699
> >  10,      2,      0,  512,                0.68
> >  10,      0,     10,  512,               0.952
> >  10,      2,     10,  512,               0.932
> >  11,      0,      0,  512,               0.705
> >  11,      3,      0,  512,               0.685
> >  11,      0,     11,  512,               0.956
> >  11,      3,     11,  512,               0.927
> >  12,      0,      0,  512,               0.695
> >  12,      4,      0,  512,               0.675
> >  12,      0,     12,  512,               0.948
> >  12,      4,     12,  512,               0.928
> >  13,      0,      0,  512,                 0.7
> >  13,      5,      0,  512,               0.678
> >  13,      0,     13,  512,               0.944
> >  13,      5,     13,  512,               0.931
> >  14,      0,      0,  512,               0.703
> >  14,      6,      0,  512,               0.678
> >  14,      0,     14,  512,               0.949
> >  14,      6,     14,  512,                0.93
> >  15,      0,      0,  512,               0.694
> >  15,      7,      0,  512,               0.678
> >  15,      0,     15,  512,               0.953
> >  15,      7,     15,  512,               0.924
> >  16,      0,      0,  512,               1.021
> >  16,      0,     16,  512,               1.067
> >  17,      0,      0,  512,               0.991
> >  17,      1,      0,  512,               0.984
> >  17,      0,     17,  512,               0.979
> >  17,      1,     17,  512,               0.993
> >  18,      0,      0,  512,               0.992
> >  18,      2,      0,  512,               1.008
> >  18,      0,     18,  512,               1.016
> >  18,      2,     18,  512,               0.993
> >  19,      0,      0,  512,               0.984
> >  19,      3,      0,  512,               0.985
> >  19,      0,     19,  512,               1.007
> >  19,      3,     19,  512,               1.006
> >  20,      0,      0,  512,               0.969
> >  20,      4,      0,  512,               0.968
> >  20,      0,     20,  512,               0.975
> >  20,      4,     20,  512,               0.975
> >  21,      0,      0,  512,               0.992
> >  21,      5,      0,  512,               0.992
> >  21,      0,     21,  512,                0.98
> >  21,      5,     21,  512,                0.97
> >  22,      0,      0,  512,               0.989
> >  22,      6,      0,  512,               0.987
> >  22,      0,     22,  512,                0.99
> >  22,      6,     22,  512,               0.985
> >  23,      0,      0,  512,               0.989
> >  23,      7,      0,  512,                0.98
> >  23,      0,     23,  512,                 1.0
> >  23,      7,     23,  512,               0.993
> >  24,      0,      0,  512,                0.99
> >  24,      0,     24,  512,               0.998
> >  25,      0,      0,  512,                1.01
> >  25,      1,      0,  512,                 1.0
> >  25,      0,     25,  512,                0.97
> >  25,      1,     25,  512,               0.967
> >  26,      0,      0,  512,               1.009
> >  26,      2,      0,  512,               0.986
> >  26,      0,     26,  512,               0.997
> >  26,      2,     26,  512,               0.993
> >  27,      0,      0,  512,               0.984
> >  27,      3,      0,  512,               0.997
> >  27,      0,     27,  512,               0.989
> >  27,      3,     27,  512,               0.976
> >  28,      0,      0,  512,               0.991
> >  28,      4,      0,  512,               1.003
> >  28,      0,     28,  512,               0.986
> >  28,      4,     28,  512,               0.989
> >  29,      0,      0,  512,               0.986
> >  29,      5,      0,  512,               0.985
> >  29,      0,     29,  512,               0.984
> >  29,      5,     29,  512,               0.977
> >  30,      0,      0,  512,               0.991
> >  30,      6,      0,  512,               0.987
> >  30,      0,     30,  512,               0.979
> >  30,      6,     30,  512,               0.974
> >  31,      0,      0,  512,               0.995
> >  31,      7,      0,  512,               0.995
> >  31,      0,     31,  512,               0.994
> >  31,      7,     31,  512,               0.984
> >   4,      0,      0,   32,               0.861
> >   4,      1,      0,   32,               0.864
> >   4,      0,      1,   32,               0.962
> >   4,      1,      1,   32,               0.967
> >   4,      0,      0,   64,               0.884
> >   4,      2,      0,   64,               0.818
> >   4,      0,      2,   64,               0.889
> >   4,      2,      2,   64,               0.918
> >   4,      0,      0,  128,               0.942
> >   4,      3,      0,  128,               0.884
> >   4,      0,      3,  128,               0.931
> >   4,      3,      3,  128,               0.883
> >   4,      0,      0,  256,               0.964
> >   4,      4,      0,  256,               0.922
> >   4,      0,      4,  256,               0.956
> >   4,      4,      4,  256,                0.93
> >   4,      5,      0,  512,               0.833
> >   4,      0,      5,  512,               1.027
> >   4,      5,      5,  512,               0.929
> >   4,      0,      0, 1024,               0.998
> >   4,      6,      0, 1024,               0.986
> >   4,      0,      6, 1024,               0.984
> >   4,      6,      6, 1024,               0.977
> >   4,      0,      0, 2048,               0.991
> >   4,      7,      0, 2048,               0.987
> >   4,      0,      7, 2048,               0.996
> >   4,      7,      7, 2048,                0.98
> >  10,      1,      0,   64,               0.826
> >  10,      1,      1,   64,               0.907
> >  10,      2,      0,   64,               0.829
> >  10,      2,      2,   64,                0.91
> >  10,      3,      0,   64,                0.83
> >  10,      3,      3,   64,               0.915
> >  10,      4,      0,   64,                0.83
> >  10,      4,      4,   64,               0.911
> >  10,      5,      0,   64,               0.828
> >  10,      5,      5,   64,               0.905
> >  10,      6,      0,   64,               0.828
> >  10,      6,      6,   64,               0.812
> >  10,      7,      0,   64,                0.83
> >  10,      7,      7,   64,               0.819
> >   6,      0,      0,    0,               1.261
> >   6,      0,      0,    1,               1.252
> >   6,      0,      1,    1,               0.845
> >   6,      0,      0,    2,                1.27
> >   6,      0,      2,    2,                0.85
> >   6,      0,      0,    3,               1.269
> >   6,      0,      3,    3,               0.845
> >   6,      0,      0,    4,               1.287
> >   6,      0,      4,    4,               0.852
> >   6,      0,      0,    5,               1.278
> >   6,      0,      5,    5,               0.851
> >   6,      0,      0,    6,               1.269
> >   6,      0,      6,    6,               0.841
> >   6,      0,      0,    7,               1.268
> >   6,      0,      7,    7,               0.851
> >   6,      0,      0,    8,               1.291
> >   6,      0,      8,    8,               0.837
> >   6,      0,      0,    9,               1.283
> >   6,      0,      9,    9,               0.831
> >   6,      0,      0,   10,               1.252
> >   6,      0,     10,   10,               0.997
> >   6,      0,      0,   11,               1.295
> >   6,      0,     11,   11,               1.046
> >   6,      0,      0,   12,               1.296
> >   6,      0,     12,   12,               1.038
> >   6,      0,      0,   13,               1.287
> >   6,      0,     13,   13,               1.082
> >   6,      0,      0,   14,               1.284
> >   6,      0,     14,   14,               1.001
> >   6,      0,      0,   15,               1.286
> >   6,      0,     15,   15,               1.002
> >   6,      0,      0,   16,               0.894
> >   6,      0,     16,   16,               0.874
> >   6,      0,      0,   17,               0.892
> >   6,      0,     17,   17,               0.974
> >   6,      0,      0,   18,               0.907
> >   6,      0,     18,   18,               0.993
> >   6,      0,      0,   19,               0.909
> >   6,      0,     19,   19,                0.99
> >   6,      0,      0,   20,               0.894
> >   6,      0,     20,   20,               0.978
> >   6,      0,      0,   21,                0.89
> >   6,      0,     21,   21,               0.958
> >   6,      0,      0,   22,               0.893
> >   6,      0,     22,   22,                0.99
> >   6,      0,      0,   23,               0.899
> >   6,      0,     23,   23,               0.986
> >   6,      0,      0,   24,               0.893
> >   6,      0,     24,   24,               0.989
> >   6,      0,      0,   25,               0.889
> >   6,      0,     25,   25,               0.982
> >   6,      0,      0,   26,               0.889
> >   6,      0,     26,   26,               0.852
> >   6,      0,      0,   27,                0.89
> >   6,      0,     27,   27,               0.832
> >   6,      0,      0,   28,                0.89
> >   6,      0,     28,   28,               0.831
> >   6,      0,      0,   29,                0.89
> >   6,      0,     29,   29,               0.838
> >   6,      0,      0,   30,               0.907
> >   6,      0,     30,   30,               0.833
> >   6,      0,      0,   31,               0.888
> >   6,      0,     31,   31,               0.837
> >   6,      0,      0,   32,               0.853
> >   6,      0,     32,   32,               0.828
> >   6,      0,      0,   33,               0.857
> >   6,      0,     33,   33,               0.947
> >   6,      0,      0,   34,               0.847
> >   6,      0,     34,   34,               0.954
> >   6,      0,      0,   35,               0.841
> >   6,      0,     35,   35,                0.94
> >   6,      0,      0,   36,               0.854
> >   6,      0,     36,   36,               0.958
> >   6,      0,      0,   37,               0.856
> >   6,      0,     37,   37,               0.957
> >   6,      0,      0,   38,               0.839
> >   6,      0,     38,   38,               0.962
> >   6,      0,      0,   39,               0.866
> >   6,      0,     39,   39,               0.945
> >   6,      0,      0,   40,               0.845
> >   6,      0,     40,   40,               0.961
> >   6,      0,      0,   41,               0.858
> >   6,      0,     41,   41,               0.961
> >   6,      0,      0,   42,               0.862
> >   6,      0,     42,   42,               0.825
> >   6,      0,      0,   43,               0.864
> >   6,      0,     43,   43,                0.82
> >   6,      0,      0,   44,               0.843
> >   6,      0,     44,   44,                0.81
> >   6,      0,      0,   45,               0.859
> >   6,      0,     45,   45,               0.816
> >   6,      0,      0,   46,               0.866
> >   6,      0,     46,   46,                0.81
> >   6,      0,      0,   47,               0.858
> >   6,      0,     47,   47,               0.807
> >   6,      0,      0,   48,                0.87
> >   6,      0,     48,   48,                0.87
> >   6,      0,      0,   49,               0.871
> >   6,      0,     49,   49,               0.874
> >   6,      0,      0,   50,                0.87
> >   6,      0,     50,   50,               0.881
> >   6,      0,      0,   51,               0.868
> >   6,      0,     51,   51,               0.875
> >   6,      0,      0,   52,               0.873
> >   6,      0,     52,   52,               0.871
> >   6,      0,      0,   53,               0.866
> >   6,      0,     53,   53,               0.882
> >   6,      0,      0,   54,               0.863
> >   6,      0,     54,   54,               0.876
> >   6,      0,      0,   55,               0.851
> >   6,      0,     55,   55,               0.871
> >   6,      0,      0,   56,               0.867
> >   6,      0,     56,   56,               0.888
> >   6,      0,      0,   57,               0.862
> >   6,      0,     57,   57,               0.899
> >   6,      0,      0,   58,               0.873
> >   6,      0,     58,   58,               0.798
> >   6,      0,      0,   59,               0.881
> >   6,      0,     59,   59,               0.785
> >   6,      0,      0,   60,               0.867
> >   6,      0,     60,   60,               0.797
> >   6,      0,      0,   61,               0.872
> >   6,      0,     61,   61,               0.791
> >   6,      0,      0,   62,               0.859
> >   6,      0,     62,   62,                0.79
> >   6,      0,      0,   63,                0.87
> >   6,      0,     63,   63,               0.796
> >
> >  sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++----------------
> >  1 file changed, 39 insertions(+), 47 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
> > index 8fb3aba64d..6124033ceb 100644
> > --- a/sysdeps/x86_64/multiarch/strspn-c.c
> > +++ b/sysdeps/x86_64/multiarch/strspn-c.c
> > @@ -62,81 +62,73 @@ __strspn_sse42 (const char *s, const char *a)
> >      return 0;
> >
> >    const char *aligned;
> > -  __m128i mask;
> > -  int offset = (int) ((size_t) a & 15);
> > +  __m128i mask, maskz, zero;
> > +  unsigned int maskz_bits;
> > +  unsigned int offset = (int) ((size_t) a & 15);
> > +  zero = _mm_set1_epi8 (0);
> >    if (offset != 0)
> >      {
> >        /* Load masks.  */
> >        aligned = (const char *) ((size_t) a & -16L);
> >        __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > -
> > -      mask = __m128i_shift_right (mask0, offset);
> > +      maskz = _mm_cmpeq_epi8 (mask0, zero);
> >
> >        /* Find where the NULL terminator is.  */
> > -      int length = _mm_cmpistri (mask, mask, 0x3a);
> > -      if (length == 16 - offset)
> > -       {
> > -         /* There is no NULL terminator.  */
> > -         __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
> > -         int index = _mm_cmpistri (mask1, mask1, 0x3a);
> > -         length += index;
> > -
> > -         /* Don't use SSE4.2 if the length of A > 16.  */
> > -         if (length > 16)
> > -           return __strspn_sse2 (s, a);
> > -
> > -         if (index != 0)
> > -           {
> > -             /* Combine mask0 and mask1.  We could play games with
> > -                palignr, but frankly this data should be in L1 now
> > -                so do the merge via an unaligned load.  */
> > -             mask = _mm_loadu_si128 ((__m128i *) a);
> > -           }
> > -       }
> > +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > +      if (maskz_bits != 0)
> > +        {
> > +          mask = __m128i_shift_right (mask0, offset);
> > +          offset = (unsigned int) ((size_t) s & 15);
> > +          if (offset)
> > +            goto start_unaligned;
> > +
> > +          aligned = s;
> > +          goto start_loop;
> > +        }
> >      }
> > -  else
> > -    {
> > -      /* A is aligned.  */
> > -      mask = _mm_load_si128 ((__m128i *) a);
> >
> > -      /* Find where the NULL terminator is.  */
> > -      int length = _mm_cmpistri (mask, mask, 0x3a);
> > -      if (length == 16)
> > -       {
> > -         /* There is no NULL terminator.  Don't use SSE4.2 if the length
> > -            of A > 16.  */
> > -         if (a[16] != 0)
> > -           return __strspn_sse2 (s, a);
> > -       }
> > +  /* A is aligned.  */
> > +  mask = _mm_loadu_si128 ((__m128i *) a);
> > +
> > +  /* Find where the NULL terminator is.  */
> > +  maskz = _mm_cmpeq_epi8 (mask, zero);
> > +  maskz_bits = _mm_movemask_epi8 (maskz);
> > +  if (maskz_bits == 0)
> > +    {
> > +      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> > +         of A > 16.  */
> > +      if (a[16] != 0)
> > +        return __strspn_sse2 (s, a);
> >      }
> > +  aligned = s;
> > +  offset = (unsigned int) ((size_t) s & 15);
> >
> > -  offset = (int) ((size_t) s & 15);
> >    if (offset != 0)
> >      {
> > +    start_unaligned:
> >        /* Check partial string.  */
> >        aligned = (const char *) ((size_t) s & -16L);
> >        __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > +      __m128i adj_value = __m128i_shift_right (value, offset);
> >
> > -      value = __m128i_shift_right (value, offset);
> > -
> > -      int length = _mm_cmpistri (mask, value, 0x12);
> > +      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
> >        /* No need to check CFlag since it is always 1.  */
> >        if (length < 16 - offset)
> >         return length;
> >        /* Find where the NULL terminator is.  */
> > -      int index = _mm_cmpistri (value, value, 0x3a);
> > -      if (index < 16 - offset)
> > +      maskz = _mm_cmpeq_epi8 (value, zero);
> > +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > +      if (maskz_bits != 0)
> >         return length;
> >        aligned += 16;
> >      }
> > -  else
> > -    aligned = s;
> >
> > +start_loop:
> >    while (1)
> >      {
> >        __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > -      int index = _mm_cmpistri (mask, value, 0x12);
> > -      int cflag = _mm_cmpistrc (mask, value, 0x12);
> > +      unsigned int index = _mm_cmpistri (mask, value, 0x12);
> > +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
> >        if (cflag)
> >         return (size_t) (aligned + index - s);
> >        aligned += 16;
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 76+ messages in thread
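
The core of the change above is replacing SSE4.2 _mm_cmpistri with a
plain SSE2 NUL scan: compare a 16-byte block against zero and turn the
per-byte result into a bitmask. A minimal sketch of that idiom follows
(the helper name is invented for illustration; this is not the patch's
code):

#include <emmintrin.h>  /* SSE2 intrinsics.  */

/* Illustrative helper: return a 16-bit mask whose bit i is set iff
   block[i] == 0.  A nonzero mask means the block contains a NUL
   terminator.  */
static unsigned int
null_mask_of_block (const void *block)
{
  __m128i value = _mm_load_si128 ((const __m128i *) block);
  __m128i zeros = _mm_cmpeq_epi8 (value, _mm_set1_epi8 (0));
  return (unsigned int) _mm_movemask_epi8 (zeros);  /* One bit per byte.  */
}

For a block loaded from below an unaligned pointer, shifting this mask
right by the misalignment (maskz_bits >> offset in the diff) discards
the bytes before the string's start; keeping offset unsigned lets that
be a plain logical shift with no sign extension, which is the second
point of the commit message.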

* Re: [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation
  2022-03-24 18:57   ` H.J. Lu
@ 2022-05-12 19:40     ` Sunil Pandey
  0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:40 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

On Thu, Mar 24, 2022 at 11:59 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:00 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The generic implementation is faster.
> >
> > geometric_mean(N=20) of all benchmarks, New / Original: .678
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean, N=20 runs; all functions page aligned
> > len, align1, align2,  pos, New Time / Old Time
> >   0,      0,      0,  512,               0.054
> >   1,      0,      0,  512,               0.055
> >   1,      1,      0,  512,               0.051
> >   1,      0,      1,  512,               0.054
> >   1,      1,      1,  512,               0.054
> >   2,      0,      0,  512,               0.861
> >   2,      2,      0,  512,               0.861
> >   2,      0,      2,  512,               0.861
> >   2,      2,      2,  512,               0.864
> >   3,      0,      0,  512,               0.854
> >   3,      3,      0,  512,               0.848
> >   3,      0,      3,  512,               0.845
> >   3,      3,      3,  512,                0.85
> >   4,      0,      0,  512,               0.851
> >   4,      4,      0,  512,                0.85
> >   4,      0,      4,  512,               0.852
> >   4,      4,      4,  512,               0.849
> >   5,      0,      0,  512,               0.938
> >   5,      5,      0,  512,                0.94
> >   5,      0,      5,  512,               0.864
> >   5,      5,      5,  512,                0.86
> >   6,      0,      0,  512,               0.858
> >   6,      6,      0,  512,               0.869
> >   6,      0,      6,  512,               0.847
> >   6,      6,      6,  512,               0.868
> >   7,      0,      0,  512,               0.867
> >   7,      7,      0,  512,               0.861
> >   7,      0,      7,  512,               0.864
> >   7,      7,      7,  512,               0.863
> >   8,      0,      0,  512,               0.884
> >   8,      0,      8,  512,               0.884
> >   9,      0,      0,  512,               0.886
> >   9,      1,      0,  512,               0.894
> >   9,      0,      9,  512,               0.889
> >   9,      1,      9,  512,               0.886
> >  10,      0,      0,  512,               0.859
> >  10,      2,      0,  512,               0.859
> >  10,      0,     10,  512,               0.862
> >  10,      2,     10,  512,               0.861
> >  11,      0,      0,  512,               0.846
> >  11,      3,      0,  512,               0.865
> >  11,      0,     11,  512,               0.859
> >  11,      3,     11,  512,               0.862
> >  12,      0,      0,  512,               0.858
> >  12,      4,      0,  512,               0.857
> >  12,      0,     12,  512,               0.964
> >  12,      4,     12,  512,               0.876
> >  13,      0,      0,  512,               0.827
> >  13,      5,      0,  512,               0.805
> >  13,      0,     13,  512,               0.821
> >  13,      5,     13,  512,               0.825
> >  14,      0,      0,  512,               0.786
> >  14,      6,      0,  512,               0.786
> >  14,      0,     14,  512,               0.803
> >  14,      6,     14,  512,               0.783
> >  15,      0,      0,  512,               0.778
> >  15,      7,      0,  512,               0.792
> >  15,      0,     15,  512,               0.796
> >  15,      7,     15,  512,               0.799
> >  16,      0,      0,  512,               0.803
> >  16,      0,     16,  512,               0.815
> >  17,      0,      0,  512,               0.812
> >  17,      1,      0,  512,               0.826
> >  17,      0,     17,  512,               0.803
> >  17,      1,     17,  512,               0.856
> >  18,      0,      0,  512,               0.801
> >  18,      2,      0,  512,               0.886
> >  18,      0,     18,  512,               0.805
> >  18,      2,     18,  512,               0.807
> >  19,      0,      0,  512,               0.814
> >  19,      3,      0,  512,               0.804
> >  19,      0,     19,  512,               0.813
> >  19,      3,     19,  512,               0.814
> >  20,      0,      0,  512,               0.885
> >  20,      4,      0,  512,               0.799
> >  20,      0,     20,  512,               0.826
> >  20,      4,     20,  512,               0.808
> >  21,      0,      0,  512,               0.816
> >  21,      5,      0,  512,               0.824
> >  21,      0,     21,  512,               0.819
> >  21,      5,     21,  512,               0.826
> >  22,      0,      0,  512,               0.814
> >  22,      6,      0,  512,               0.824
> >  22,      0,     22,  512,                0.81
> >  22,      6,     22,  512,               0.806
> >  23,      0,      0,  512,               0.825
> >  23,      7,      0,  512,               0.829
> >  23,      0,     23,  512,               0.809
> >  23,      7,     23,  512,               0.823
> >  24,      0,      0,  512,               0.829
> >  24,      0,     24,  512,               0.823
> >  25,      0,      0,  512,               0.864
> >  25,      1,      0,  512,               0.895
> >  25,      0,     25,  512,                0.88
> >  25,      1,     25,  512,               0.848
> >  26,      0,      0,  512,               0.903
> >  26,      2,      0,  512,               0.888
> >  26,      0,     26,  512,               0.894
> >  26,      2,     26,  512,                0.89
> >  27,      0,      0,  512,               0.914
> >  27,      3,      0,  512,               0.917
> >  27,      0,     27,  512,               0.902
> >  27,      3,     27,  512,               0.887
> >  28,      0,      0,  512,               0.887
> >  28,      4,      0,  512,               0.877
> >  28,      0,     28,  512,               0.893
> >  28,      4,     28,  512,               0.866
> >  29,      0,      0,  512,               0.885
> >  29,      5,      0,  512,               0.907
> >  29,      0,     29,  512,               0.894
> >  29,      5,     29,  512,               0.906
> >  30,      0,      0,  512,                0.88
> >  30,      6,      0,  512,               0.898
> >  30,      0,     30,  512,                 0.9
> >  30,      6,     30,  512,               0.895
> >  31,      0,      0,  512,               0.893
> >  31,      7,      0,  512,               0.874
> >  31,      0,     31,  512,               0.894
> >  31,      7,     31,  512,               0.899
> >   4,      0,      0,   32,               0.618
> >   4,      1,      0,   32,               0.627
> >   4,      0,      1,   32,               0.625
> >   4,      1,      1,   32,               0.613
> >   4,      0,      0,   64,               0.913
> >   4,      2,      0,   64,               0.801
> >   4,      0,      2,   64,               0.759
> >   4,      2,      2,   64,               0.761
> >   4,      0,      0,  128,               0.822
> >   4,      3,      0,  128,               0.863
> >   4,      0,      3,  128,               0.867
> >   4,      3,      3,  128,               0.917
> >   4,      0,      0,  256,               0.816
> >   4,      4,      0,  256,               0.812
> >   4,      0,      4,  256,               0.803
> >   4,      4,      4,  256,               0.811
> >   4,      5,      0,  512,               0.848
> >   4,      0,      5,  512,               0.843
> >   4,      5,      5,  512,               0.857
> >   4,      0,      0, 1024,               0.886
> >   4,      6,      0, 1024,               0.887
> >   4,      0,      6, 1024,               0.881
> >   4,      6,      6, 1024,               0.873
> >   4,      0,      0, 2048,               0.892
> >   4,      7,      0, 2048,               0.894
> >   4,      0,      7, 2048,                0.89
> >   4,      7,      7, 2048,               0.874
> >  10,      1,      0,   64,               0.946
> >  10,      1,      1,   64,                0.81
> >  10,      2,      0,   64,               0.804
> >  10,      2,      2,   64,                0.82
> >  10,      3,      0,   64,               0.772
> >  10,      3,      3,   64,               0.772
> >  10,      4,      0,   64,               0.748
> >  10,      4,      4,   64,               0.751
> >  10,      5,      0,   64,                0.76
> >  10,      5,      5,   64,                0.76
> >  10,      6,      0,   64,               0.726
> >  10,      6,      6,   64,               0.718
> >  10,      7,      0,   64,               0.724
> >  10,      7,      7,   64,                0.72
> >   6,      0,      0,    0,               0.415
> >   6,      0,      0,    1,               0.423
> >   6,      0,      1,    1,               0.412
> >   6,      0,      0,    2,               0.433
> >   6,      0,      2,    2,               0.434
> >   6,      0,      0,    3,               0.427
> >   6,      0,      3,    3,               0.428
> >   6,      0,      0,    4,               0.465
> >   6,      0,      4,    4,               0.466
> >   6,      0,      0,    5,               0.463
> >   6,      0,      5,    5,               0.468
> >   6,      0,      0,    6,               0.435
> >   6,      0,      6,    6,               0.444
> >   6,      0,      0,    7,                0.41
> >   6,      0,      7,    7,                0.42
> >   6,      0,      0,    8,               0.474
> >   6,      0,      8,    8,               0.501
> >   6,      0,      0,    9,               0.471
> >   6,      0,      9,    9,               0.489
> >   6,      0,      0,   10,               0.462
> >   6,      0,     10,   10,                0.46
> >   6,      0,      0,   11,               0.459
> >   6,      0,     11,   11,               0.458
> >   6,      0,      0,   12,               0.516
> >   6,      0,     12,   12,                0.51
> >   6,      0,      0,   13,               0.494
> >   6,      0,     13,   13,               0.524
> >   6,      0,      0,   14,               0.486
> >   6,      0,     14,   14,                 0.5
> >   6,      0,      0,   15,                0.48
> >   6,      0,     15,   15,               0.501
> >   6,      0,      0,   16,                0.54
> >   6,      0,     16,   16,               0.538
> >   6,      0,      0,   17,               0.503
> >   6,      0,     17,   17,               0.541
> >   6,      0,      0,   18,               0.537
> >   6,      0,     18,   18,               0.549
> >   6,      0,      0,   19,               0.527
> >   6,      0,     19,   19,               0.537
> >   6,      0,      0,   20,               0.539
> >   6,      0,     20,   20,               0.554
> >   6,      0,      0,   21,               0.558
> >   6,      0,     21,   21,               0.541
> >   6,      0,      0,   22,               0.546
> >   6,      0,     22,   22,               0.561
> >   6,      0,      0,   23,                0.54
> >   6,      0,     23,   23,               0.536
> >   6,      0,      0,   24,               0.565
> >   6,      0,     24,   24,               0.584
> >   6,      0,      0,   25,               0.563
> >   6,      0,     25,   25,                0.58
> >   6,      0,      0,   26,               0.555
> >   6,      0,     26,   26,               0.584
> >   6,      0,      0,   27,               0.569
> >   6,      0,     27,   27,               0.587
> >   6,      0,      0,   28,               0.612
> >   6,      0,     28,   28,               0.623
> >   6,      0,      0,   29,               0.604
> >   6,      0,     29,   29,               0.621
> >   6,      0,      0,   30,                0.59
> >   6,      0,     30,   30,               0.609
> >   6,      0,      0,   31,               0.577
> >   6,      0,     31,   31,               0.588
> >   6,      0,      0,   32,               0.621
> >   6,      0,     32,   32,               0.608
> >   6,      0,      0,   33,               0.601
> >   6,      0,     33,   33,               0.623
> >   6,      0,      0,   34,               0.614
> >   6,      0,     34,   34,               0.615
> >   6,      0,      0,   35,               0.598
> >   6,      0,     35,   35,               0.608
> >   6,      0,      0,   36,               0.626
> >   6,      0,     36,   36,               0.634
> >   6,      0,      0,   37,                0.62
> >   6,      0,     37,   37,               0.634
> >   6,      0,      0,   38,               0.612
> >   6,      0,     38,   38,               0.637
> >   6,      0,      0,   39,               0.627
> >   6,      0,     39,   39,               0.612
> >   6,      0,      0,   40,               0.661
> >   6,      0,     40,   40,               0.674
> >   6,      0,      0,   41,               0.633
> >   6,      0,     41,   41,               0.643
> >   6,      0,      0,   42,               0.634
> >   6,      0,     42,   42,               0.636
> >   6,      0,      0,   43,               0.619
> >   6,      0,     43,   43,               0.625
> >   6,      0,      0,   44,               0.654
> >   6,      0,     44,   44,               0.654
> >   6,      0,      0,   45,               0.647
> >   6,      0,     45,   45,               0.649
> >   6,      0,      0,   46,               0.651
> >   6,      0,     46,   46,               0.651
> >   6,      0,      0,   47,               0.646
> >   6,      0,     47,   47,               0.648
> >   6,      0,      0,   48,               0.662
> >   6,      0,     48,   48,               0.664
> >   6,      0,      0,   49,                0.68
> >   6,      0,     49,   49,               0.667
> >   6,      0,      0,   50,               0.654
> >   6,      0,     50,   50,               0.659
> >   6,      0,      0,   51,               0.638
> >   6,      0,     51,   51,               0.639
> >   6,      0,      0,   52,               0.665
> >   6,      0,     52,   52,               0.669
> >   6,      0,      0,   53,               0.658
> >   6,      0,     53,   53,               0.656
> >   6,      0,      0,   54,               0.669
> >   6,      0,     54,   54,                0.67
> >   6,      0,      0,   55,               0.668
> >   6,      0,     55,   55,               0.664
> >   6,      0,      0,   56,               0.701
> >   6,      0,     56,   56,               0.695
> >   6,      0,      0,   57,               0.687
> >   6,      0,     57,   57,               0.696
> >   6,      0,      0,   58,               0.693
> >   6,      0,     58,   58,               0.704
> >   6,      0,      0,   59,               0.695
> >   6,      0,     59,   59,               0.708
> >   6,      0,      0,   60,               0.708
> >   6,      0,     60,   60,               0.728
> >   6,      0,      0,   61,               0.708
> >   6,      0,     61,   61,                0.71
> >   6,      0,      0,   62,               0.715
> >   6,      0,     62,   62,               0.705
> >   6,      0,      0,   63,               0.677
> >   6,      0,     63,   63,               0.702
> >
> >  .../{strcspn-sse2.S => strcspn-sse2.c}        |   8 +-
> >  sysdeps/x86_64/strcspn.S                      | 119 ------------------
> >  2 files changed, 4 insertions(+), 123 deletions(-)
> >  rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strcspn-sse2.c} (85%)
> >  delete mode 100644 sysdeps/x86_64/strcspn.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c
> > similarity index 85%
> > rename from sysdeps/x86_64/multiarch/strcspn-sse2.S
> > rename to sysdeps/x86_64/multiarch/strcspn-sse2.c
> > index f97e856e1f..3a04bb39fc 100644
> > --- a/sysdeps/x86_64/multiarch/strcspn-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c
> > @@ -1,4 +1,4 @@
> > -/* strcspn optimized with SSE2.
> > +/* strcspn.
> >     Copyright (C) 2017-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> > @@ -19,10 +19,10 @@
> >  #if IS_IN (libc)
> >
> >  # include <sysdep.h>
> > -# define strcspn __strcspn_sse2
> > +# define STRCSPN __strcspn_sse2
> >
> >  # undef libc_hidden_builtin_def
> > -# define libc_hidden_builtin_def(strcspn)
> > +# define libc_hidden_builtin_def(STRCSPN)
> >  #endif
> >
> > -#include <sysdeps/x86_64/strcspn.S>
> > +#include <string/strcspn.c>
> > diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S
> > deleted file mode 100644
> > index f3cd86c606..0000000000
> > --- a/sysdeps/x86_64/strcspn.S
> > +++ /dev/null
> > @@ -1,119 +0,0 @@
> > -/* strcspn (str, ss) -- Return the length of the initial segment of STR
> > -                       which contains no characters from SS.
> > -   For AMD x86-64.
> > -   Copyright (C) 1994-2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -#include <sysdep.h>
> > -#include "asm-syntax.h"
> > -
> > -       .text
> > -ENTRY (strcspn)
> > -
> > -       movq %rdi, %rdx         /* Save SRC.  */
> > -
> > -       /* First we create a table with flags for all possible characters.
> > -          For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
> > -          supported by the C string functions we have 256 characters.
> > -          Before inserting marks for the stop characters we clear the whole
> > -          table.  */
> > -       movq %rdi, %r8                  /* Save value.  */
> > -       subq $256, %rsp                 /* Make space for 256 bytes.  */
> > -       cfi_adjust_cfa_offset(256)
> > -       movl $32,  %ecx                 /* 32*8 bytes = 256 bytes.  */
> > -       movq %rsp, %rdi
> > -       xorl %eax, %eax                 /* We store 0s.  */
> > -       cld
> > -       rep
> > -       stosq
> > -
> > -       movq %rsi, %rax                 /* Setup skipset.  */
> > -
> > -/* For understanding the following code remember that %rcx == 0 now.
> > -   Although all the following instruction only modify %cl we always
> > -   have a correct zero-extended 64-bit value in %rcx.  */
> > -
> > -       .p2align 4
> > -L(2):  movb (%rax), %cl        /* get byte from skipset */
> > -       testb %cl, %cl          /* is NUL char? */
> > -       jz L(1)                 /* yes => start compare loop */
> > -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in skipset table */
> > -
> > -       movb 1(%rax), %cl       /* get byte from skipset */
> > -       testb $0xff, %cl        /* is NUL char? */
> > -       jz L(1)                 /* yes => start compare loop */
> > -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in skipset table */
> > -
> > -       movb 2(%rax), %cl       /* get byte from skipset */
> > -       testb $0xff, %cl        /* is NUL char? */
> > -       jz L(1)                 /* yes => start compare loop */
> > -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in skipset table */
> > -
> > -       movb 3(%rax), %cl       /* get byte from skipset */
> > -       addq $4, %rax           /* increment skipset pointer */
> > -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in skipset table */
> > -       testb $0xff, %cl        /* is NUL char? */
> > -       jnz L(2)                /* no => process next dword from skipset */
> > -
> > -L(1):  leaq -4(%rdx), %rax     /* prepare loop */
> > -
> > -       /* We use a neat trick for the following loop.  Normally we would
> > -          have to test for two termination conditions
> > -          1. a character in the skipset was found
> > -          and
> > -          2. the end of the string was found
> > -          But as a sign that the character is in the skipset we store its
> > -          value in the table.  But the value of NUL is NUL so the loop
> > -          terminates for NUL in every case.  */
> > -
> > -       .p2align 4
> > -L(3):  addq $4, %rax           /* adjust pointer for full loop round */
> > -
> > -       movb (%rax), %cl        /* get byte from string */
> > -       cmpb %cl, (%rsp,%rcx)   /* is it contained in skipset? */
> > -       je L(4)                 /* yes => return */
> > -
> > -       movb 1(%rax), %cl       /* get byte from string */
> > -       cmpb %cl, (%rsp,%rcx)   /* is it contained in skipset? */
> > -       je L(5)                 /* yes => return */
> > -
> > -       movb 2(%rax), %cl       /* get byte from string */
> > -       cmpb %cl, (%rsp,%rcx)   /* is it contained in skipset? */
> > -       jz L(6)                 /* yes => return */
> > -
> > -       movb 3(%rax), %cl       /* get byte from string */
> > -       cmpb %cl, (%rsp,%rcx)   /* is it contained in skipset? */
> > -       jne L(3)                /* no => start loop again */
> > -
> > -       incq %rax               /* adjust pointer */
> > -L(6):  incq %rax
> > -L(5):  incq %rax
> > -
> > -L(4):  addq $256, %rsp         /* remove skipset */
> > -       cfi_adjust_cfa_offset(-256)
> > -#ifdef USE_AS_STRPBRK
> > -       xorl %edx,%edx
> > -       orb %cl, %cl            /* was last character NUL? */
> > -       cmovzq %rdx, %rax       /* Yes: return NULL */
> > -#else
> > -       subq %rdx, %rax         /* we have to return the number of valid
> > -                                  characters, so compute distance to first
> > -                                  non-valid character */
> > -#endif
> > -       ret
> > -END (strcspn)
> > -libc_hidden_builtin_def (strcspn)
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 76+ messages in thread
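
The assembly removed above spells out the technique the generic C
implementation relies on: build a 256-entry table flagging every byte
in the reject set, flag NUL as well, and then a single lookup per input
byte terminates the scan on either a reject byte or the end of the
string. A minimal C sketch of that table trick (an illustrative
rewrite, not the actual string/strcspn.c):

#include <stddef.h>

/* Table-driven strcspn: flag the reject set once, then scan with one
   lookup per byte.  stop[0] is set so the NUL terminator ends the
   loop without a separate end-of-string test.  */
size_t
strcspn_table (const char *str, const char *reject)
{
  unsigned char stop[256] = { 0 };
  const unsigned char *r = (const unsigned char *) reject;
  const unsigned char *s = (const unsigned char *) str;

  stop[0] = 1;                  /* NUL always stops the scan.  */
  while (*r != 0)
    stop[*r++] = 1;

  while (!stop[*s])
    ++s;
  return (size_t) (s - (const unsigned char *) str);
}

strspn (patch 11 in this series) is the mirror image: flag the accept
set instead and scan while the lookup succeeds, so the unflagged NUL
again ends the loop.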

* Re: [PATCH v1 10/23] x86: Remove strpbrk-sse2.S and use the generic implementation
  2022-03-24 18:57   ` H.J. Lu
@ 2022-05-12 19:41     ` Sunil Pandey
  0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:41 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

On Thu, Mar 24, 2022 at 12:00 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:00 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The generic implementation is faster (see strcspn commit).
> >
> > All string/memory tests pass.
> > ---
> >  .../x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c}  | 9 ++++-----
> >  sysdeps/x86_64/strpbrk.S                                 | 3 ---
> >  2 files changed, 4 insertions(+), 8 deletions(-)
> >  rename sysdeps/x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} (84%)
> >  delete mode 100644 sysdeps/x86_64/strpbrk.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
> > similarity index 84%
> > rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S
> > rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c
> > index d537b6c27b..d03214c4fb 100644
> > --- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
> > @@ -1,4 +1,4 @@
> > -/* strpbrk optimized with SSE2.
> > +/* strpbrk.
> >     Copyright (C) 2017-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> > @@ -19,11 +19,10 @@
> >  #if IS_IN (libc)
> >
> >  # include <sysdep.h>
> > -# define strcspn __strpbrk_sse2
> > +# define STRPBRK __strpbrk_sse2
> >
> >  # undef libc_hidden_builtin_def
> > -# define libc_hidden_builtin_def(strpbrk)
> > +# define libc_hidden_builtin_def(STRPBRK)
> >  #endif
> >
> > -#define USE_AS_STRPBRK
> > -#include <sysdeps/x86_64/strcspn.S>
> > +#include <string/strpbrk.c>
> > diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S
> > deleted file mode 100644
> > index 21888a5b92..0000000000
> > --- a/sysdeps/x86_64/strpbrk.S
> > +++ /dev/null
> > @@ -1,3 +0,0 @@
> > -#define strcspn strpbrk
> > -#define USE_AS_STRPBRK
> > -#include <sysdeps/x86_64/strcspn.S>
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 76+ messages in thread
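
The deleted strpbrk.S built strpbrk out of strcspn.S under
USE_AS_STRPBRK, and the same relationship holds in C: strpbrk is
strcspn with a pointer-valued result. A small sketch of that
equivalence (illustrative only, not glibc's code):

#include <string.h>

/* strpbrk in terms of strcspn: skip the longest prefix containing no
   accept-set byte, then return the match, or NULL if the scan stopped
   at the terminating NUL.  */
char *
strpbrk_via_strcspn (const char *s, const char *accept)
{
  s += strcspn (s, accept);
  return *s != '\0' ? (char *) s : NULL;
}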

* Re: [PATCH v1 11/23] x86: Remove strspn-sse2.S and use the generic implementation
  2022-03-24 18:57   ` H.J. Lu
@ 2022-05-12 19:42     ` Sunil Pandey
  0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:42 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

On Thu, Mar 24, 2022 at 12:00 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The generic implementation is faster.
> >
> > geometric_mean(N=20) of all benchmarks, New / Original: .710
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean, N=20 runs; all functions page aligned
> > len, align1, align2,  pos, New Time / Old Time
> >   1,      0,      0,  512,               0.824
> >   1,      1,      0,  512,               1.018
> >   1,      0,      1,  512,               0.986
> >   1,      1,      1,  512,               1.092
> >   2,      0,      0,  512,                0.86
> >   2,      2,      0,  512,               0.868
> >   2,      0,      2,  512,               0.858
> >   2,      2,      2,  512,               0.857
> >   3,      0,      0,  512,               0.836
> >   3,      3,      0,  512,               0.849
> >   3,      0,      3,  512,                0.84
> >   3,      3,      3,  512,                0.85
> >   4,      0,      0,  512,               0.843
> >   4,      4,      0,  512,               0.837
> >   4,      0,      4,  512,               0.835
> >   4,      4,      4,  512,               0.846
> >   5,      0,      0,  512,               0.852
> >   5,      5,      0,  512,               0.848
> >   5,      0,      5,  512,                0.85
> >   5,      5,      5,  512,                0.85
> >   6,      0,      0,  512,               0.853
> >   6,      6,      0,  512,               0.855
> >   6,      0,      6,  512,               0.853
> >   6,      6,      6,  512,               0.853
> >   7,      0,      0,  512,               0.857
> >   7,      7,      0,  512,               0.861
> >   7,      0,      7,  512,                0.94
> >   7,      7,      7,  512,               0.856
> >   8,      0,      0,  512,               0.927
> >   8,      0,      8,  512,               0.965
> >   9,      0,      0,  512,               0.967
> >   9,      1,      0,  512,               0.976
> >   9,      0,      9,  512,               0.887
> >   9,      1,      9,  512,               0.881
> >  10,      0,      0,  512,               0.853
> >  10,      2,      0,  512,               0.846
> >  10,      0,     10,  512,               0.855
> >  10,      2,     10,  512,               0.849
> >  11,      0,      0,  512,               0.854
> >  11,      3,      0,  512,               0.855
> >  11,      0,     11,  512,                0.85
> >  11,      3,     11,  512,               0.854
> >  12,      0,      0,  512,               0.864
> >  12,      4,      0,  512,               0.864
> >  12,      0,     12,  512,               0.867
> >  12,      4,     12,  512,                0.87
> >  13,      0,      0,  512,               0.853
> >  13,      5,      0,  512,               0.841
> >  13,      0,     13,  512,               0.837
> >  13,      5,     13,  512,                0.85
> >  14,      0,      0,  512,               0.838
> >  14,      6,      0,  512,               0.842
> >  14,      0,     14,  512,               0.818
> >  14,      6,     14,  512,               0.845
> >  15,      0,      0,  512,               0.799
> >  15,      7,      0,  512,               0.847
> >  15,      0,     15,  512,               0.787
> >  15,      7,     15,  512,                0.84
> >  16,      0,      0,  512,               0.824
> >  16,      0,     16,  512,               0.827
> >  17,      0,      0,  512,               0.817
> >  17,      1,      0,  512,               0.823
> >  17,      0,     17,  512,                0.82
> >  17,      1,     17,  512,               0.814
> >  18,      0,      0,  512,                0.81
> >  18,      2,      0,  512,               0.833
> >  18,      0,     18,  512,               0.811
> >  18,      2,     18,  512,               0.842
> >  19,      0,      0,  512,               0.823
> >  19,      3,      0,  512,               0.818
> >  19,      0,     19,  512,               0.821
> >  19,      3,     19,  512,               0.824
> >  20,      0,      0,  512,               0.814
> >  20,      4,      0,  512,               0.818
> >  20,      0,     20,  512,               0.806
> >  20,      4,     20,  512,               0.802
> >  21,      0,      0,  512,               0.835
> >  21,      5,      0,  512,               0.839
> >  21,      0,     21,  512,               0.842
> >  21,      5,     21,  512,                0.82
> >  22,      0,      0,  512,               0.824
> >  22,      6,      0,  512,               0.831
> >  22,      0,     22,  512,               0.819
> >  22,      6,     22,  512,               0.824
> >  23,      0,      0,  512,               0.816
> >  23,      7,      0,  512,               0.856
> >  23,      0,     23,  512,               0.808
> >  23,      7,     23,  512,               0.848
> >  24,      0,      0,  512,                0.88
> >  24,      0,     24,  512,               0.846
> >  25,      0,      0,  512,               0.929
> >  25,      1,      0,  512,               0.917
> >  25,      0,     25,  512,               0.884
> >  25,      1,     25,  512,               0.859
> >  26,      0,      0,  512,               0.919
> >  26,      2,      0,  512,               0.867
> >  26,      0,     26,  512,               0.914
> >  26,      2,     26,  512,               0.845
> >  27,      0,      0,  512,               0.919
> >  27,      3,      0,  512,               0.864
> >  27,      0,     27,  512,               0.917
> >  27,      3,     27,  512,               0.847
> >  28,      0,      0,  512,               0.905
> >  28,      4,      0,  512,               0.896
> >  28,      0,     28,  512,               0.898
> >  28,      4,     28,  512,               0.871
> >  29,      0,      0,  512,               0.911
> >  29,      5,      0,  512,                0.91
> >  29,      0,     29,  512,               0.905
> >  29,      5,     29,  512,               0.884
> >  30,      0,      0,  512,               0.907
> >  30,      6,      0,  512,               0.802
> >  30,      0,     30,  512,               0.906
> >  30,      6,     30,  512,               0.818
> >  31,      0,      0,  512,               0.907
> >  31,      7,      0,  512,               0.821
> >  31,      0,     31,  512,                0.89
> >  31,      7,     31,  512,               0.787
> >   4,      0,      0,   32,               0.623
> >   4,      1,      0,   32,               0.606
> >   4,      0,      1,   32,                 0.6
> >   4,      1,      1,   32,               0.603
> >   4,      0,      0,   64,               0.731
> >   4,      2,      0,   64,               0.733
> >   4,      0,      2,   64,               0.734
> >   4,      2,      2,   64,               0.755
> >   4,      0,      0,  128,               0.822
> >   4,      3,      0,  128,               0.873
> >   4,      0,      3,  128,                0.89
> >   4,      3,      3,  128,               0.907
> >   4,      0,      0,  256,               0.827
> >   4,      4,      0,  256,               0.811
> >   4,      0,      4,  256,               0.794
> >   4,      4,      4,  256,               0.814
> >   4,      5,      0,  512,               0.841
> >   4,      0,      5,  512,               0.831
> >   4,      5,      5,  512,               0.845
> >   4,      0,      0, 1024,               0.861
> >   4,      6,      0, 1024,               0.857
> >   4,      0,      6, 1024,                 0.9
> >   4,      6,      6, 1024,               0.861
> >   4,      0,      0, 2048,               0.879
> >   4,      7,      0, 2048,               0.875
> >   4,      0,      7, 2048,               0.883
> >   4,      7,      7, 2048,                0.88
> >  10,      1,      0,   64,               0.747
> >  10,      1,      1,   64,               0.743
> >  10,      2,      0,   64,               0.732
> >  10,      2,      2,   64,               0.729
> >  10,      3,      0,   64,               0.747
> >  10,      3,      3,   64,               0.733
> >  10,      4,      0,   64,                0.74
> >  10,      4,      4,   64,               0.751
> >  10,      5,      0,   64,               0.735
> >  10,      5,      5,   64,               0.746
> >  10,      6,      0,   64,               0.735
> >  10,      6,      6,   64,               0.733
> >  10,      7,      0,   64,               0.734
> >  10,      7,      7,   64,                0.74
> >   6,      0,      0,    0,               0.377
> >   6,      0,      0,    1,               0.369
> >   6,      0,      1,    1,               0.383
> >   6,      0,      0,    2,               0.391
> >   6,      0,      2,    2,               0.394
> >   6,      0,      0,    3,               0.416
> >   6,      0,      3,    3,               0.411
> >   6,      0,      0,    4,               0.475
> >   6,      0,      4,    4,               0.483
> >   6,      0,      0,    5,               0.473
> >   6,      0,      5,    5,               0.476
> >   6,      0,      0,    6,               0.459
> >   6,      0,      6,    6,               0.445
> >   6,      0,      0,    7,               0.433
> >   6,      0,      7,    7,               0.432
> >   6,      0,      0,    8,               0.492
> >   6,      0,      8,    8,               0.494
> >   6,      0,      0,    9,               0.476
> >   6,      0,      9,    9,               0.483
> >   6,      0,      0,   10,                0.46
> >   6,      0,     10,   10,               0.476
> >   6,      0,      0,   11,               0.463
> >   6,      0,     11,   11,               0.463
> >   6,      0,      0,   12,               0.511
> >   6,      0,     12,   12,               0.515
> >   6,      0,      0,   13,               0.506
> >   6,      0,     13,   13,               0.536
> >   6,      0,      0,   14,               0.496
> >   6,      0,     14,   14,               0.484
> >   6,      0,      0,   15,               0.473
> >   6,      0,     15,   15,               0.475
> >   6,      0,      0,   16,               0.534
> >   6,      0,     16,   16,               0.534
> >   6,      0,      0,   17,               0.525
> >   6,      0,     17,   17,               0.523
> >   6,      0,      0,   18,               0.522
> >   6,      0,     18,   18,               0.524
> >   6,      0,      0,   19,               0.512
> >   6,      0,     19,   19,               0.514
> >   6,      0,      0,   20,               0.535
> >   6,      0,     20,   20,                0.54
> >   6,      0,      0,   21,               0.543
> >   6,      0,     21,   21,               0.536
> >   6,      0,      0,   22,               0.542
> >   6,      0,     22,   22,               0.542
> >   6,      0,      0,   23,               0.529
> >   6,      0,     23,   23,                0.53
> >   6,      0,      0,   24,               0.596
> >   6,      0,     24,   24,               0.589
> >   6,      0,      0,   25,               0.583
> >   6,      0,     25,   25,                0.58
> >   6,      0,      0,   26,               0.574
> >   6,      0,     26,   26,                0.58
> >   6,      0,      0,   27,               0.575
> >   6,      0,     27,   27,               0.558
> >   6,      0,      0,   28,               0.606
> >   6,      0,     28,   28,               0.606
> >   6,      0,      0,   29,               0.589
> >   6,      0,     29,   29,               0.595
> >   6,      0,      0,   30,               0.592
> >   6,      0,     30,   30,               0.585
> >   6,      0,      0,   31,               0.585
> >   6,      0,     31,   31,               0.579
> >   6,      0,      0,   32,               0.625
> >   6,      0,     32,   32,               0.615
> >   6,      0,      0,   33,               0.615
> >   6,      0,     33,   33,                0.61
> >   6,      0,      0,   34,               0.604
> >   6,      0,     34,   34,                 0.6
> >   6,      0,      0,   35,               0.602
> >   6,      0,     35,   35,               0.608
> >   6,      0,      0,   36,               0.644
> >   6,      0,     36,   36,               0.644
> >   6,      0,      0,   37,               0.658
> >   6,      0,     37,   37,               0.651
> >   6,      0,      0,   38,               0.644
> >   6,      0,     38,   38,               0.649
> >   6,      0,      0,   39,               0.626
> >   6,      0,     39,   39,               0.632
> >   6,      0,      0,   40,               0.662
> >   6,      0,     40,   40,               0.661
> >   6,      0,      0,   41,               0.656
> >   6,      0,     41,   41,               0.655
> >   6,      0,      0,   42,               0.643
> >   6,      0,     42,   42,               0.637
> >   6,      0,      0,   43,               0.622
> >   6,      0,     43,   43,               0.628
> >   6,      0,      0,   44,               0.673
> >   6,      0,     44,   44,               0.687
> >   6,      0,      0,   45,               0.661
> >   6,      0,     45,   45,               0.659
> >   6,      0,      0,   46,               0.657
> >   6,      0,     46,   46,               0.653
> >   6,      0,      0,   47,               0.658
> >   6,      0,     47,   47,                0.65
> >   6,      0,      0,   48,               0.678
> >   6,      0,     48,   48,               0.683
> >   6,      0,      0,   49,               0.676
> >   6,      0,     49,   49,               0.661
> >   6,      0,      0,   50,               0.672
> >   6,      0,     50,   50,               0.662
> >   6,      0,      0,   51,               0.656
> >   6,      0,     51,   51,               0.659
> >   6,      0,      0,   52,               0.682
> >   6,      0,     52,   52,               0.686
> >   6,      0,      0,   53,                0.67
> >   6,      0,     53,   53,               0.674
> >   6,      0,      0,   54,               0.663
> >   6,      0,     54,   54,               0.675
> >   6,      0,      0,   55,               0.662
> >   6,      0,     55,   55,               0.665
> >   6,      0,      0,   56,               0.681
> >   6,      0,     56,   56,               0.697
> >   6,      0,      0,   57,               0.686
> >   6,      0,     57,   57,               0.687
> >   6,      0,      0,   58,               0.701
> >   6,      0,     58,   58,               0.693
> >   6,      0,      0,   59,               0.709
> >   6,      0,     59,   59,               0.698
> >   6,      0,      0,   60,               0.708
> >   6,      0,     60,   60,               0.708
> >   6,      0,      0,   61,               0.709
> >   6,      0,     61,   61,               0.716
> >   6,      0,      0,   62,               0.709
> >   6,      0,     62,   62,               0.707
> >   6,      0,      0,   63,               0.703
> >   6,      0,     63,   63,               0.716
> >
> >  .../{strspn-sse2.S => strspn-sse2.c}          |   8 +-
> >  sysdeps/x86_64/strspn.S                       | 112 ------------------
> >  2 files changed, 4 insertions(+), 116 deletions(-)
> >  rename sysdeps/x86_64/multiarch/{strspn-sse2.S => strspn-sse2.c} (86%)
> >  delete mode 100644 sysdeps/x86_64/strspn.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c
> > similarity index 86%
> > rename from sysdeps/x86_64/multiarch/strspn-sse2.S
> > rename to sysdeps/x86_64/multiarch/strspn-sse2.c
> > index e0a095f25a..61cc6cb0a5 100644
> > --- a/sysdeps/x86_64/multiarch/strspn-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strspn-sse2.c
> > @@ -1,4 +1,4 @@
> > -/* strspn optimized with SSE2.
> > +/* strspn.
> >     Copyright (C) 2017-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> > @@ -19,10 +19,10 @@
> >  #if IS_IN (libc)
> >
> >  # include <sysdep.h>
> > -# define strspn __strspn_sse2
> > +# define STRSPN __strspn_sse2
> >
> >  # undef libc_hidden_builtin_def
> > -# define libc_hidden_builtin_def(strspn)
> > +# define libc_hidden_builtin_def(STRSPN)
> >  #endif
> >
> > -#include <sysdeps/x86_64/strspn.S>
> > +#include <string/strspn.c>
> > diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S
> > deleted file mode 100644
> > index 61b76ee0a1..0000000000
> > --- a/sysdeps/x86_64/strspn.S
> > +++ /dev/null
> > @@ -1,112 +0,0 @@
> > -/* strspn (str, ss) -- Return the length of the initial segment of STR
> > -                       which contains only characters from SS.
> > -   For AMD x86-64.
> > -   Copyright (C) 1994-2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -#include <sysdep.h>
> > -
> > -       .text
> > -ENTRY (strspn)
> > -
> > -       movq %rdi, %rdx         /* Save SRC.  */
> > -
> > -       /* First we create a table with flags for all possible characters.
> > -          For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
> > -          supported by the C string functions we have 256 characters.
> > -          Before inserting marks for the stop characters we clear the whole
> > -          table.  */
> > -       movq %rdi, %r8                  /* Save value.  */
> > -       subq $256, %rsp                 /* Make space for 256 bytes.  */
> > -       cfi_adjust_cfa_offset(256)
> > -       movl $32,  %ecx                 /* 32*8 bytes = 256 bytes.  */
> > -       movq %rsp, %rdi
> > -       xorl %eax, %eax                 /* We store 0s.  */
> > -       cld
> > -       rep
> > -       stosq
> > -
> > -       movq %rsi, %rax                 /* Setup stopset.  */
> > -
> > -/* For understanding the following code remember that %rcx == 0 now.
> > -   Although all the following instruction only modify %cl we always
> > -   have a correct zero-extended 64-bit value in %rcx.  */
> > -
> > -       .p2align 4
> > -L(2):  movb (%rax), %cl        /* get byte from stopset */
> > -       testb %cl, %cl          /* is NUL char? */
> > -       jz L(1)                 /* yes => start compare loop */
> > -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in stopset table */
> > -
> > -       movb 1(%rax), %cl       /* get byte from stopset */
> > -       testb $0xff, %cl        /* is NUL char? */
> > -       jz L(1)                 /* yes => start compare loop */
> > -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in stopset table */
> > -
> > -       movb 2(%rax), %cl       /* get byte from stopset */
> > -       testb $0xff, %cl        /* is NUL char? */
> > -       jz L(1)                 /* yes => start compare loop */
> > -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in stopset table */
> > -
> > -       movb 3(%rax), %cl       /* get byte from stopset */
> > -       addq $4, %rax           /* increment stopset pointer */
> > -       movb %cl, (%rsp,%rcx)   /* set corresponding byte in stopset table */
> > -       testb $0xff, %cl        /* is NUL char? */
> > -       jnz L(2)                /* no => process next dword from stopset */
> > -
> > -L(1):  leaq -4(%rdx), %rax     /* prepare loop */
> > -
> > -       /* We use a neat trick for the following loop.  Normally we would
> > -          have to test for two termination conditions
> > -          1. a character in the stopset was found
> > -          and
> > -          2. the end of the string was found
> > -          But as a sign that the character is in the stopset we store its
> > -          value in the table.  But the value of NUL is NUL so the loop
> > -          terminates for NUL in every case.  */
> > -
> > -       .p2align 4
> > -L(3):  addq $4, %rax           /* adjust pointer for full loop round */
> > -
> > -       movb (%rax), %cl        /* get byte from string */
> > -       testb %cl, (%rsp,%rcx)  /* is it contained in skipset? */
> > -       jz L(4)                 /* no => return */
> > -
> > -       movb 1(%rax), %cl       /* get byte from string */
> > -       testb %cl, (%rsp,%rcx)  /* is it contained in skipset? */
> > -       jz L(5)                 /* no => return */
> > -
> > -       movb 2(%rax), %cl       /* get byte from string */
> > -       testb %cl, (%rsp,%rcx)  /* is it contained in skipset? */
> > -       jz L(6)                 /* no => return */
> > -
> > -       movb 3(%rax), %cl       /* get byte from string */
> > -       testb %cl, (%rsp,%rcx)  /* is it contained in skipset? */
> > -       jnz L(3)                /* yes => start loop again */
> > -
> > -       incq %rax               /* adjust pointer */
> > -L(6):  incq %rax
> > -L(5):  incq %rax
> > -
> > -L(4):  addq $256, %rsp         /* remove stopset */
> > -       cfi_adjust_cfa_offset(-256)
> > -       subq %rdx, %rax         /* we have to return the number of valid
> > -                                  characters, so compute distance to first
> > -                                  non-valid character */
> > -       ret
> > -END (strspn)
> > -libc_hidden_builtin_def (strspn)
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 76+ messages in thread
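
For reference, the table-driven algorithm that the deleted
sysdeps/x86_64/strspn.S implemented corresponds roughly to the
following C sketch (an illustration of the technique only, not the
generic string/strspn.c that replaces it; the function name is ours):

#include <stddef.h>

/* Build a 256-entry membership table for the accept set, then scan.
   Storing the character value itself keeps table[0] == 0, so a NUL
   in the string and a non-accept byte both read back as "not set"
   and a single test terminates the scan loop, as the deleted
   assembly's comments describe.  */
static size_t
strspn_sketch (const char *s, const char *accept)
{
  unsigned char table[256] = { 0 };
  const unsigned char *a = (const unsigned char *) accept;
  const unsigned char *p = (const unsigned char *) s;

  for (; *a != '\0'; ++a)
    table[*a] = *a;

  while (table[*p] != 0)
    ++p;

  return (size_t) (p - (const unsigned char *) s);
}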

* Re: [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S
  2022-03-24 19:02   ` H.J. Lu
@ 2022-05-12 19:44     ` Sunil Pandey
  0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:44 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

On Thu, Mar 24, 2022 at 12:05 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Slightly faster method of doing TOLOWER that saves an
> > instruction.
> >
> > Also replace the hard-coded 5-byte nop with .p2align 4. On builds with
> > CET enabled this misaligned the entry to strcasecmp.
> >
> > geometric_mean(N=40) of all benchmarks New / Original: .894
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=40 runs; All functions page aligned
> > length, align1, align2, max_char, New Time / Old Time
> >      1,      1,      1,      127,               0.903
> >      2,      2,      2,      127,               0.905
> >      3,      3,      3,      127,               0.877
> >      4,      4,      4,      127,               0.888
> >      5,      5,      5,      127,               0.901
> >      6,      6,      6,      127,               0.954
> >      7,      7,      7,      127,               0.932
> >      8,      0,      0,      127,               0.918
> >      9,      1,      1,      127,               0.914
> >     10,      2,      2,      127,               0.877
> >     11,      3,      3,      127,               0.909
> >     12,      4,      4,      127,               0.876
> >     13,      5,      5,      127,               0.886
> >     14,      6,      6,      127,               0.914
> >     15,      7,      7,      127,               0.939
> >      4,      0,      0,      127,               0.963
> >      4,      0,      0,      254,               0.943
> >      8,      0,      0,      254,               0.927
> >     16,      0,      0,      127,               0.876
> >     16,      0,      0,      254,               0.865
> >     32,      0,      0,      127,               0.865
> >     32,      0,      0,      254,               0.862
> >     64,      0,      0,      127,               0.863
> >     64,      0,      0,      254,               0.896
> >    128,      0,      0,      127,               0.885
> >    128,      0,      0,      254,               0.882
> >    256,      0,      0,      127,                0.87
> >    256,      0,      0,      254,               0.869
> >    512,      0,      0,      127,               0.832
> >    512,      0,      0,      254,               0.848
> >   1024,      0,      0,      127,               0.835
> >   1024,      0,      0,      254,               0.843
> >     16,      1,      2,      127,               0.914
> >     16,      2,      1,      254,               0.949
> >     32,      2,      4,      127,               0.955
> >     32,      4,      2,      254,               1.004
> >     64,      3,      6,      127,               0.844
> >     64,      6,      3,      254,               0.905
> >    128,      4,      0,      127,               0.889
> >    128,      0,      4,      254,               0.845
> >    256,      5,      2,      127,               0.929
> >    256,      2,      5,      254,               0.907
> >    512,      6,      4,      127,               0.837
> >    512,      4,      6,      254,               0.862
> >   1024,      7,      6,      127,               0.895
> >   1024,      6,      7,      254,                0.89
> >
> >  sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
> >  1 file changed, 29 insertions(+), 35 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
> > index e2ab59c555..99d8b36f1d 100644
> > --- a/sysdeps/x86_64/strcmp.S
> > +++ b/sysdeps/x86_64/strcmp.S
> > @@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp)
> >         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
> >         mov     %fs:(%rax),%RDX_LP
> >
> > -       // XXX 5 byte should be before the function
> > -       /* 5-byte NOP.  */
> > -       .byte   0x0f,0x1f,0x44,0x00,0x00
> > +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> > +       .p2align 4
> >  END2 (__strcasecmp)
> >  # ifndef NO_NOLOCALE_ALIAS
> >  weak_alias (__strcasecmp, strcasecmp)
> > @@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp)
> >         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
> >         mov     %fs:(%rax),%RCX_LP
> >
> > -       // XXX 5 byte should be before the function
> > -       /* 5-byte NOP.  */
> > -       .byte   0x0f,0x1f,0x44,0x00,0x00
> > +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> > +       .p2align 4
> >  END2 (__strncasecmp)
> >  # ifndef NO_NOLOCALE_ALIAS
> >  weak_alias (__strncasecmp, strncasecmp)
> > @@ -146,22 +144,22 @@ ENTRY (STRCMP)
> >  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> >         .section .rodata.cst16,"aM",@progbits,16
> >         .align 16
> > -.Lbelowupper:
> > -       .quad   0x4040404040404040
> > -       .quad   0x4040404040404040
> > -.Ltopupper:
> > -       .quad   0x5b5b5b5b5b5b5b5b
> > -       .quad   0x5b5b5b5b5b5b5b5b
> > -.Ltouppermask:
> > +.Llcase_min:
> > +       .quad   0x3f3f3f3f3f3f3f3f
> > +       .quad   0x3f3f3f3f3f3f3f3f
> > +.Llcase_max:
> > +       .quad   0x9999999999999999
> > +       .quad   0x9999999999999999
> > +.Lcase_add:
> >         .quad   0x2020202020202020
> >         .quad   0x2020202020202020
> >         .previous
> > -       movdqa  .Lbelowupper(%rip), %xmm5
> > -# define UCLOW_reg %xmm5
> > -       movdqa  .Ltopupper(%rip), %xmm6
> > -# define UCHIGH_reg %xmm6
> > -       movdqa  .Ltouppermask(%rip), %xmm7
> > -# define LCQWORD_reg %xmm7
> > +       movdqa  .Llcase_min(%rip), %xmm5
> > +# define LCASE_MIN_reg %xmm5
> > +       movdqa  .Llcase_max(%rip), %xmm6
> > +# define LCASE_MAX_reg %xmm6
> > +       movdqa  .Lcase_add(%rip), %xmm7
> > +# define CASE_ADD_reg %xmm7
> >  #endif
> >         cmp     $0x30, %ecx
> >         ja      LABEL(crosscache)       /* rsi: 16-byte load will cross cache line */
> > @@ -172,22 +170,18 @@ ENTRY (STRCMP)
> >         movhpd  8(%rdi), %xmm1
> >         movhpd  8(%rsi), %xmm2
> >  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> > -# define TOLOWER(reg1, reg2) \
> > -       movdqa  reg1, %xmm8;                                    \
> > -       movdqa  UCHIGH_reg, %xmm9;                              \
> > -       movdqa  reg2, %xmm10;                                   \
> > -       movdqa  UCHIGH_reg, %xmm11;                             \
> > -       pcmpgtb UCLOW_reg, %xmm8;                               \
> > -       pcmpgtb reg1, %xmm9;                                    \
> > -       pcmpgtb UCLOW_reg, %xmm10;                              \
> > -       pcmpgtb reg2, %xmm11;                                   \
> > -       pand    %xmm9, %xmm8;                                   \
> > -       pand    %xmm11, %xmm10;                                 \
> > -       pand    LCQWORD_reg, %xmm8;                             \
> > -       pand    LCQWORD_reg, %xmm10;                            \
> > -       por     %xmm8, reg1;                                    \
> > -       por     %xmm10, reg2
> > -       TOLOWER (%xmm1, %xmm2)
> > +#  define TOLOWER(reg1, reg2) \
> > +       movdqa  LCASE_MIN_reg, %xmm8;                                   \
> > +       movdqa  LCASE_MIN_reg, %xmm9;                                   \
> > +       paddb   reg1, %xmm8;                                    \
> > +       paddb   reg2, %xmm9;                                    \
> > +       pcmpgtb LCASE_MAX_reg, %xmm8;                           \
> > +       pcmpgtb LCASE_MAX_reg, %xmm9;                           \
> > +       pandn   CASE_ADD_reg, %xmm8;                                    \
> > +       pandn   CASE_ADD_reg, %xmm9;                                    \
> > +       paddb   %xmm8, reg1;                                    \
> > +       paddb   %xmm9, reg2
> > +       TOLOWER (%xmm1, %xmm2)
> >  #else
> >  # define TOLOWER(reg1, reg2)
> >  #endif
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 76+ messages in thread
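
The constants introduced by this patch encode a single signed range
check per byte.  A scalar model of the new SSE2 TOLOWER sequence in
the diff above (an illustrative sketch; the function name is ours,
and it assumes two's-complement narrowing, as on x86):

/* Adding 0x3f (lcase_min) maps exactly 'A'..'Z' (0x41..0x5a) onto
   the signed-byte range 0x80..0x99, so one signed compare against
   0x99 (lcase_max) decides whether to add 0x20 (case_add),
   replacing the old two-compare-and-mask sequence.  */
static unsigned char
tolower_bias (unsigned char c)
{
  signed char biased = (signed char) ((unsigned char) (c + 0x3f));
  if (biased <= (signed char) 0x99)
    c = (unsigned char) (c + 0x20);
  return c;
}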

* Re: [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
  2022-03-24 19:02   ` H.J. Lu
@ 2022-05-12 19:45     ` Sunil Pandey
  0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:45 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

On Thu, Mar 24, 2022 at 12:05 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Slightly faster method of doing TOLOWER that saves an
> > instruction.
> >
> > Also replace the hard-coded 5-byte nop with .p2align 4. On builds with
> > CET enabled this misaligned the entry to strcasecmp.
> >
> > geometric_mean(N=40) of all benchmarks New / Original: .920
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=40 runs; All functions page aligned
> > length, align1, align2, max_char, New Time / Old Time
> >      1,      1,      1,      127,               0.914
> >      2,      2,      2,      127,               0.952
> >      3,      3,      3,      127,               0.924
> >      4,      4,      4,      127,               0.995
> >      5,      5,      5,      127,               0.985
> >      6,      6,      6,      127,               1.017
> >      7,      7,      7,      127,               1.031
> >      8,      0,      0,      127,               0.967
> >      9,      1,      1,      127,               0.969
> >     10,      2,      2,      127,               0.951
> >     11,      3,      3,      127,               0.938
> >     12,      4,      4,      127,               0.937
> >     13,      5,      5,      127,               0.967
> >     14,      6,      6,      127,               0.941
> >     15,      7,      7,      127,               0.951
> >      4,      0,      0,      127,               0.959
> >      4,      0,      0,      254,                0.98
> >      8,      0,      0,      254,               0.959
> >     16,      0,      0,      127,               0.895
> >     16,      0,      0,      254,               0.901
> >     32,      0,      0,      127,                0.85
> >     32,      0,      0,      254,               0.851
> >     64,      0,      0,      127,               0.897
> >     64,      0,      0,      254,               0.895
> >    128,      0,      0,      127,               0.944
> >    128,      0,      0,      254,               0.935
> >    256,      0,      0,      127,               0.922
> >    256,      0,      0,      254,               0.913
> >    512,      0,      0,      127,               0.921
> >    512,      0,      0,      254,               0.914
> >   1024,      0,      0,      127,               0.845
> >   1024,      0,      0,      254,                0.84
> >     16,      1,      2,      127,               0.923
> >     16,      2,      1,      254,               0.955
> >     32,      2,      4,      127,               0.979
> >     32,      4,      2,      254,               0.957
> >     64,      3,      6,      127,               0.866
> >     64,      6,      3,      254,               0.849
> >    128,      4,      0,      127,               0.882
> >    128,      0,      4,      254,               0.876
> >    256,      5,      2,      127,               0.877
> >    256,      2,      5,      254,               0.882
> >    512,      6,      4,      127,               0.822
> >    512,      4,      6,      254,               0.862
> >   1024,      7,      6,      127,               0.903
> >   1024,      6,      7,      254,               0.908
> >
> >  sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++--------------
> >  1 file changed, 35 insertions(+), 48 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > index 580feb90e9..7805ae9d41 100644
> > --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > @@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp))
> >         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
> >         mov     %fs:(%rax),%RDX_LP
> >
> > -       // XXX 5 byte should be before the function
> > -       /* 5-byte NOP.  */
> > -       .byte   0x0f,0x1f,0x44,0x00,0x00
> > +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> > +       .p2align 4
> >  END (GLABEL(__strcasecmp))
> >         /* FALLTHROUGH to strcasecmp_l.  */
> >  #endif
> > @@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp))
> >         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
> >         mov     %fs:(%rax),%RCX_LP
> >
> > -       // XXX 5 byte should be before the function
> > -       /* 5-byte NOP.  */
> > -       .byte   0x0f,0x1f,0x44,0x00,0x00
> > +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> > +       .p2align 4
> >  END (GLABEL(__strncasecmp))
> >         /* FALLTHROUGH to strncasecmp_l.  */
> >  #endif
> > @@ -169,27 +167,22 @@ STRCMP_SSE42:
> >  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> >         .section .rodata.cst16,"aM",@progbits,16
> >         .align 16
> > -LABEL(belowupper):
> > -       .quad   0x4040404040404040
> > -       .quad   0x4040404040404040
> > -LABEL(topupper):
> > -# ifdef USE_AVX
> > -       .quad   0x5a5a5a5a5a5a5a5a
> > -       .quad   0x5a5a5a5a5a5a5a5a
> > -# else
> > -       .quad   0x5b5b5b5b5b5b5b5b
> > -       .quad   0x5b5b5b5b5b5b5b5b
> > -# endif
> > -LABEL(touppermask):
> > +LABEL(lcase_min):
> > +       .quad   0x3f3f3f3f3f3f3f3f
> > +       .quad   0x3f3f3f3f3f3f3f3f
> > +LABEL(lcase_max):
> > +       .quad   0x9999999999999999
> > +       .quad   0x9999999999999999
> > +LABEL(case_add):
> >         .quad   0x2020202020202020
> >         .quad   0x2020202020202020
> >         .previous
> > -       movdqa  LABEL(belowupper)(%rip), %xmm4
> > -# define UCLOW_reg %xmm4
> > -       movdqa  LABEL(topupper)(%rip), %xmm5
> > -# define UCHIGH_reg %xmm5
> > -       movdqa  LABEL(touppermask)(%rip), %xmm6
> > -# define LCQWORD_reg %xmm6
> > +       movdqa  LABEL(lcase_min)(%rip), %xmm4
> > +# define LCASE_MIN_reg %xmm4
> > +       movdqa  LABEL(lcase_max)(%rip), %xmm5
> > +# define LCASE_MAX_reg %xmm5
> > +       movdqa  LABEL(case_add)(%rip), %xmm6
> > +# define CASE_ADD_reg %xmm6
> >  #endif
> >         cmp     $0x30, %ecx
> >         ja      LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
> > @@ -200,32 +193,26 @@ LABEL(touppermask):
> >  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> >  # ifdef USE_AVX
> >  #  define TOLOWER(reg1, reg2) \
> > -       vpcmpgtb UCLOW_reg, reg1, %xmm7;                        \
> > -       vpcmpgtb UCHIGH_reg, reg1, %xmm8;                       \
> > -       vpcmpgtb UCLOW_reg, reg2, %xmm9;                        \
> > -       vpcmpgtb UCHIGH_reg, reg2, %xmm10;                      \
> > -       vpandn  %xmm7, %xmm8, %xmm8;                                    \
> > -       vpandn  %xmm9, %xmm10, %xmm10;                                  \
> > -       vpand   LCQWORD_reg, %xmm8, %xmm8;                              \
> > -       vpand   LCQWORD_reg, %xmm10, %xmm10;                            \
> > -       vpor    reg1, %xmm8, reg1;                                      \
> > -       vpor    reg2, %xmm10, reg2
> > +       vpaddb  LCASE_MIN_reg, reg1, %xmm7;                                     \
> > +       vpaddb  LCASE_MIN_reg, reg2, %xmm8;                                     \
> > +       vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7;                                   \
> > +       vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8;                                   \
> > +       vpandn  CASE_ADD_reg, %xmm7, %xmm7;                                     \
> > +       vpandn  CASE_ADD_reg, %xmm8, %xmm8;                                     \
> > +       vpaddb  %xmm7, reg1, reg1;                                      \
> > +       vpaddb  %xmm8, reg2, reg2
> >  # else
> >  #  define TOLOWER(reg1, reg2) \
> > -       movdqa  reg1, %xmm7;                                    \
> > -       movdqa  UCHIGH_reg, %xmm8;                              \
> > -       movdqa  reg2, %xmm9;                                    \
> > -       movdqa  UCHIGH_reg, %xmm10;                             \
> > -       pcmpgtb UCLOW_reg, %xmm7;                               \
> > -       pcmpgtb reg1, %xmm8;                                    \
> > -       pcmpgtb UCLOW_reg, %xmm9;                               \
> > -       pcmpgtb reg2, %xmm10;                                   \
> > -       pand    %xmm8, %xmm7;                                   \
> > -       pand    %xmm10, %xmm9;                                  \
> > -       pand    LCQWORD_reg, %xmm7;                             \
> > -       pand    LCQWORD_reg, %xmm9;                             \
> > -       por     %xmm7, reg1;                                    \
> > -       por     %xmm9, reg2
> > +       movdqa  LCASE_MIN_reg, %xmm7;                                   \
> > +       movdqa  LCASE_MIN_reg, %xmm8;                                   \
> > +       paddb   reg1, %xmm7;                                    \
> > +       paddb   reg2, %xmm8;                                    \
> > +       pcmpgtb LCASE_MAX_reg, %xmm7;                           \
> > +       pcmpgtb LCASE_MAX_reg, %xmm8;                           \
> > +       pandn   CASE_ADD_reg, %xmm7;                                    \
> > +       pandn   CASE_ADD_reg, %xmm8;                                    \
> > +       paddb   %xmm7, reg1;                                    \
> > +       paddb   %xmm8, reg2
> >  # endif
> >         TOLOWER (%xmm1, %xmm2)
> >  #else
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 76+ messages in thread
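
Because patches 17 and 18 change only the per-byte case conversion,
the transformation is easy to check exhaustively in isolation.  A
standalone sketch of such a check (not part of the glibc test suite):

#include <stdio.h>

/* Verify the 0x3f/0x99/0x20 bias trick against the ASCII definition
   of tolower for all 256 byte values.  */
int
main (void)
{
  int mismatches = 0;
  for (int i = 0; i < 256; i++)
    {
      unsigned char c = (unsigned char) i;
      signed char biased = (signed char) ((unsigned char) (c + 0x3f));
      unsigned char got = (biased <= (signed char) 0x99)
                          ? (unsigned char) (c + 0x20) : c;
      unsigned char want = (c >= 'A' && c <= 'Z')
                           ? (unsigned char) (c + 0x20) : c;
      if (got != want)
        {
          printf ("mismatch at 0x%02x: got 0x%02x, want 0x%02x\n",
                  c, got, want);
          mismatches++;
        }
    }
  if (mismatches == 0)
    printf ("all 256 byte values match\n");
  return mismatches != 0;
}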

* Re: [PATCH v4 22/23] x86: Add EVEX optimized str{n}casecmp
  2022-03-25 18:18         ` Noah Goldstein
@ 2022-05-12 19:47           ` Sunil Pandey
  2022-05-12 19:52             ` Sunil Pandey
  0 siblings, 1 reply; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:47 UTC (permalink / raw)
  To: Noah Goldstein, Libc-stable Mailing List; +Cc: H.J. Lu, GNU C Library

[-- Attachment #1: Type: text/plain, Size: 34190 bytes --]

On Fri, Mar 25, 2022 at 11:20 AM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Fri, Mar 25, 2022 at 1:15 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 4:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
> > >
> > > All string/memory tests pass.
> > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > > ---
> > >  sysdeps/x86_64/multiarch/Makefile            |   2 +
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  16 +
> > >  sysdeps/x86_64/multiarch/ifunc-strcasecmp.h  |   5 +
> > >  sysdeps/x86_64/multiarch/strcasecmp_l-evex.S |  23 ++
> > >  sysdeps/x86_64/multiarch/strcmp-evex.S       | 290 ++++++++++++++++---
> > >  sysdeps/x86_64/multiarch/strncase_l-evex.S   |  25 ++
> > >  6 files changed, 321 insertions(+), 40 deletions(-)
> > >  create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > index 06e1848823..35d80dc2ff 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -57,6 +57,7 @@ sysdep_routines += \
> > >    strcasecmp_l-avx \
> > >    strcasecmp_l-avx2 \
> > >    strcasecmp_l-avx2-rtm \
> > > +  strcasecmp_l-evex \
> > >    strcasecmp_l-sse2 \
> > >    strcasecmp_l-sse4_2 \
> > >    strcasecmp_l-ssse3 \
> > > @@ -97,6 +98,7 @@ sysdep_routines += \
> > >    strncase_l-avx \
> > >    strncase_l-avx2 \
> > >    strncase_l-avx2-rtm \
> > > +  strncase_l-evex \
> > >    strncase_l-sse2 \
> > >    strncase_l-sse4_2 \
> > >    strncase_l-ssse3 \
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index 3c556d07ac..f1a4d3dac2 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >
> > >    /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
> > >    IFUNC_IMPL (i, name, strcasecmp,
> > > +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > +                              && CPU_FEATURE_USABLE (AVX512BW)),
> > > +                             __strcasecmp_evex)
> > >               IFUNC_IMPL_ADD (array, i, strcasecmp,
> > >                               CPU_FEATURE_USABLE (AVX2),
> > >                               __strcasecmp_avx2)
> > > @@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >
> > >    /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
> > >    IFUNC_IMPL (i, name, strcasecmp_l,
> > > +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > +                              && CPU_FEATURE_USABLE (AVX512BW)),
> > > +                             __strcasecmp_l_evex)
> > >               IFUNC_IMPL_ADD (array, i, strcasecmp,
> > >                               CPU_FEATURE_USABLE (AVX2),
> > >                               __strcasecmp_l_avx2)
> > > @@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >
> > >    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
> > >    IFUNC_IMPL (i, name, strncasecmp,
> > > +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > +                              && CPU_FEATURE_USABLE (AVX512BW)),
> > > +                             __strncasecmp_evex)
> > >               IFUNC_IMPL_ADD (array, i, strncasecmp,
> > >                               CPU_FEATURE_USABLE (AVX2),
> > >                               __strncasecmp_avx2)
> > > @@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >
> > >    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
> > >    IFUNC_IMPL (i, name, strncasecmp_l,
> > > +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > +                              && CPU_FEATURE_USABLE (AVX512BW)),
> > > +                             __strncasecmp_l_evex)
> > >               IFUNC_IMPL_ADD (array, i, strncasecmp,
> > >                               CPU_FEATURE_USABLE (AVX2),
> > >                               __strncasecmp_l_avx2)
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > index c4de111fd0..bf0d146e7f 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > @@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > >
> > >  static inline void *
> > >  IFUNC_SELECTOR (void)
> > > @@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
> > >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > >        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > >      {
> > > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > > +        return OPTIMIZE (evex);
> > > +
> > >        if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > >          return OPTIMIZE (avx2_rtm);
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > > new file mode 100644
> > > index 0000000000..58642db748
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > > @@ -0,0 +1,23 @@
> > > +/* strcasecmp_l optimized with EVEX.
> > > +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#ifndef STRCMP
> > > +# define STRCMP        __strcasecmp_l_evex
> > > +#endif
> > > +#define USE_AS_STRCASECMP_L
> > > +#include "strcmp-evex.S"
> > > diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
> > > index 56d8c118e4..2a5b3ce037 100644
> > > --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
> > > +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
> > > @@ -19,6 +19,9 @@
> > >  #if IS_IN (libc)
> > >
> > >  # include <sysdep.h>
> > > +# if defined USE_AS_STRCASECMP_L
> > > +#  include "locale-defines.h"
> > > +# endif
> > >
> > >  # ifndef STRCMP
> > >  #  define STRCMP       __strcmp_evex
> > > @@ -34,19 +37,29 @@
> > >  # define VMOVA vmovdqa64
> > >
> > >  # ifdef USE_AS_WCSCMP
> > > -#  define TESTEQ       subl    $0xff,
> > > +#  ifndef OVERFLOW_STRCMP
> > > +#   define OVERFLOW_STRCMP     __wcscmp_evex
> > > +#  endif
> > > +
> > > +#  define TESTEQ       subl $0xff,
> > >         /* Compare packed dwords.  */
> > >  #  define VPCMP        vpcmpd
> > >  #  define VPMINU       vpminud
> > >  #  define VPTESTM      vptestmd
> > > +#  define VPTESTNM     vptestnmd
> > >         /* 1 dword char == 4 bytes.  */
> > >  #  define SIZE_OF_CHAR 4
> > >  # else
> > > +#  ifndef OVERFLOW_STRCMP
> > > +#   define OVERFLOW_STRCMP     __strcmp_evex
> > > +#  endif
> > > +
> > >  #  define TESTEQ       incl
> > >         /* Compare packed bytes.  */
> > >  #  define VPCMP        vpcmpb
> > >  #  define VPMINU       vpminub
> > >  #  define VPTESTM      vptestmb
> > > +#  define VPTESTNM     vptestnmb
> > >         /* 1 byte char == 1 byte.  */
> > >  #  define SIZE_OF_CHAR 1
> > >  # endif
> > > @@ -73,11 +86,16 @@
> > >  #  define VEC_OFFSET   (-VEC_SIZE)
> > >  # endif
> > >
> > > -# define XMMZERO       xmm16
> > >  # define XMM0  xmm17
> > >  # define XMM1  xmm18
> > >
> > > -# define YMMZERO       ymm16
> > > +# define XMM10 xmm27
> > > +# define XMM11 xmm28
> > > +# define XMM12 xmm29
> > > +# define XMM13 xmm30
> > > +# define XMM14 xmm31
> > > +
> > > +
> > >  # define YMM0  ymm17
> > >  # define YMM1  ymm18
> > >  # define YMM2  ymm19
> > > @@ -89,6 +107,87 @@
> > >  # define YMM8  ymm25
> > >  # define YMM9  ymm26
> > >  # define YMM10 ymm27
> > > +# define YMM11 ymm28
> > > +# define YMM12 ymm29
> > > +# define YMM13 ymm30
> > > +# define YMM14 ymm31
> > > +
> > > +# ifdef USE_AS_STRCASECMP_L
> > > +#  define BYTE_LOOP_REG        OFFSET_REG
> > > +# else
> > > +#  define BYTE_LOOP_REG        ecx
> > > +# endif
> > > +
> > > +# ifdef USE_AS_STRCASECMP_L
> > > +#  ifdef USE_AS_STRNCMP
> > > +#   define STRCASECMP  __strncasecmp_evex
> > > +#   define LOCALE_REG  rcx
> > > +#   define LOCALE_REG_LP       RCX_LP
> > > +#   define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> > > +#  else
> > > +#   define STRCASECMP  __strcasecmp_evex
> > > +#   define LOCALE_REG  rdx
> > > +#   define LOCALE_REG_LP       RDX_LP
> > > +#   define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> > > +#  endif
> > > +# endif
> > > +
> > > +# define LCASE_MIN_YMM %YMM12
> > > +# define LCASE_MAX_YMM %YMM13
> > > +# define CASE_ADD_YMM  %YMM14
> > > +
> > > +# define LCASE_MIN_XMM %XMM12
> > > +# define LCASE_MAX_XMM %XMM13
> > > +# define CASE_ADD_XMM  %XMM14
> > > +
> > > +       /* NB: wcsncmp uses r11 but strcasecmp is never used in
> > > +          conjunction with wcscmp.  */
> > > +# define TOLOWER_BASE  %r11
> > > +
> > > +# ifdef USE_AS_STRCASECMP_L
> > > +#  define _REG(x, y) x ## y
> > > +#  define REG(x, y) _REG(x, y)
> > > +#  define TOLOWER(reg1, reg2, ext)                                                                             \
> > > +       vpsubb  REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);                                      \
> > > +       vpsubb  REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);                                      \
> > > +       vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;                           \
> > > +       vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;                           \
> > > +       vpaddb  reg1, REG(CASE_ADD_, ext), reg1{%k5};                                           \
> > > +       vpaddb  reg2, REG(CASE_ADD_, ext), reg2{%k6}
> > > +
> > > +#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> > > +#  define TOLOWER_YMM(...)     TOLOWER(__VA_ARGS__, YMM)
> > > +#  define TOLOWER_XMM(...)     TOLOWER(__VA_ARGS__, XMM)
> > > +
> > > +#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)                                              \
> > > +       TOLOWER (s1_reg, s2_reg, ext);                                                                          \
> > > +       VPCMP   $0, s1_reg, s2_reg, reg_out
> > > +
> > > +#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)                              \
> > > +       VMOVU   s2_mem, s2_reg;                                                                                         \
> > > +       CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
> > > +
> > > +#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
> > > +#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
> > > +
> > > +#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
> > > +#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
> > > +
> > > +# else
> > > +#  define TOLOWER_gpr(...)
> > > +#  define TOLOWER_YMM(...)
> > > +#  define TOLOWER_XMM(...)
> > > +
> > > +#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)                                               \
> > > +       VPCMP   $0, s2_reg, s1_reg, reg_out
> > > +
> > > +#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
> > > +
> > > +#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)                               \
> > > +       VPCMP   $0, s2_mem, s1_reg, reg_out
> > > +
> > > +#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
> > > +# endif
> > >
> > >  /* Warning!
> > >             wcscmp/wcsncmp have to use SIGNED comparison for elements.
> > > @@ -112,8 +211,45 @@
> > >     returned.  */
> > >
> > >         .section .text.evex, "ax", @progbits
> > > -ENTRY(STRCMP)
> > > +       .align  16
> > > +       .type   STRCMP, @function
> > > +       .globl  STRCMP
> > > +       .hidden STRCMP
> > > +
> > > +# ifdef USE_AS_STRCASECMP_L
> > > +ENTRY (STRCASECMP)
> > > +       movq    __libc_tsd_LOCALE@gottpoff(%rip), %rax
> > > +       mov     %fs:(%rax), %LOCALE_REG_LP
> > > +
> > > +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> > > +       .p2align 4
> > > +END (STRCASECMP)
> > > +       /* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
> > > +# endif
> > > +
> > > +       .p2align 4
> > > +STRCMP:
> > > +       cfi_startproc
> > > +       _CET_ENDBR
> > > +       CALL_MCOUNT
> > > +
> > > +# if defined USE_AS_STRCASECMP_L
> > > +       /* We have to fall back on the C implementation for locales with
> > > +          encodings not matching ASCII for single bytes.  */
> > > +#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> > > +       mov     LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> > > +#  else
> > > +       mov     (%LOCALE_REG), %RAX_LP
> > > +#  endif
> > > +       testl   $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> > > +       jne     STRCASECMP_NONASCII
> > > +       leaq    _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> > > +# endif
> > > +
> > >  # ifdef USE_AS_STRNCMP
> > > +       /* Don't overwrite LOCALE_REG (rcx) until we have passed
> > > +          L(one_or_less). Otherwise we might use the wrong locale in
> > > +          the OVERFLOW_STRCMP (strcasecmp_l).  */
> > >  #  ifdef __ILP32__
> > >         /* Clear the upper 32 bits.  */
> > >         movl    %edx, %edx
> > > @@ -125,6 +261,32 @@ ENTRY(STRCMP)
> > >            actually bound the buffer.  */
> > >         jle     L(one_or_less)
> > >  # endif
> > > +
> > > +# if defined USE_AS_STRCASECMP_L
> > > +       .section .rodata.cst32, "aM", @progbits, 32
> > > +       .align  32
> > > +L(lcase_min):
> > > +       .quad   0x4141414141414141
> > > +       .quad   0x4141414141414141
> > > +       .quad   0x4141414141414141
> > > +       .quad   0x4141414141414141
> > > +L(lcase_max):
> > > +       .quad   0x1a1a1a1a1a1a1a1a
> > > +       .quad   0x1a1a1a1a1a1a1a1a
> > > +       .quad   0x1a1a1a1a1a1a1a1a
> > > +       .quad   0x1a1a1a1a1a1a1a1a
> > > +L(case_add):
> > > +       .quad   0x2020202020202020
> > > +       .quad   0x2020202020202020
> > > +       .quad   0x2020202020202020
> > > +       .quad   0x2020202020202020
> > > +       .previous
> > > +
> > > +       vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
> > > +       vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
> > > +       vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
> > > +# endif
> > > +
> > >         movl    %edi, %eax
> > >         orl     %esi, %eax
> > >         /* Shift out the bits irrelevant to page boundary ([63:12]).  */
> > > @@ -139,7 +301,7 @@ L(no_page_cross):
> > >         VPTESTM %YMM0, %YMM0, %k2
> > >         /* Each bit cleared in K1 represents a mismatch or a null CHAR
> > >            in YMM0 and 32 bytes at (%rsi).  */
> > > -       VPCMP   $0, (%rsi), %YMM0, %k1{%k2}
> > > +       CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >  # ifdef USE_AS_STRNCMP
> > >         cmpq    $CHAR_PER_VEC, %rdx
> > > @@ -169,6 +331,8 @@ L(return_vec_0):
> > >  # else
> > >         movzbl  (%rdi, %rcx), %eax
> > >         movzbl  (%rsi, %rcx), %ecx
> > > +       TOLOWER_gpr (%rax, %eax)
> > > +       TOLOWER_gpr (%rcx, %ecx)
> > >         subl    %ecx, %eax
> > >  # endif
> > >  L(ret0):
> > > @@ -188,11 +352,15 @@ L(ret_zero):
> > >
> > >         .p2align 4,, 5
> > >  L(one_or_less):
> > > +#  ifdef USE_AS_STRCASECMP_L
> > > +       /* Set locale argument for strcasecmp.  */
> > > +       movq    %LOCALE_REG, %rdx
> > > +#  endif
> > >         jb      L(ret_zero)
> > > -#  ifdef USE_AS_WCSCMP
> > >         /* 'nbe' covers the case where length is negative (large
> > >            unsigned).  */
> > > -       jnbe    __wcscmp_evex
> > > +       jnbe    OVERFLOW_STRCMP
> > > +#  ifdef USE_AS_WCSCMP
> > >         movl    (%rdi), %edx
> > >         xorl    %eax, %eax
> > >         cmpl    (%rsi), %edx
> > > @@ -201,11 +369,10 @@ L(one_or_less):
> > >         negl    %eax
> > >         orl     $1, %eax
> > >  #  else
> > > -       /* 'nbe' covers the case where length is negative (large
> > > -          unsigned).  */
> > > -       jnbe    __strcmp_evex
> > >         movzbl  (%rdi), %eax
> > >         movzbl  (%rsi), %ecx
> > > +       TOLOWER_gpr (%rax, %eax)
> > > +       TOLOWER_gpr (%rcx, %ecx)
> > >         subl    %ecx, %eax
> > >  #  endif
> > >  L(ret1):
> > > @@ -233,6 +400,8 @@ L(return_vec_1):
> > >  # else
> > >         movzbl  VEC_SIZE(%rdi, %rcx), %eax
> > >         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
> > > +       TOLOWER_gpr (%rax, %eax)
> > > +       TOLOWER_gpr (%rcx, %ecx)
> > >         subl    %ecx, %eax
> > >  # endif
> > >  L(ret2):
> > > @@ -270,6 +439,8 @@ L(return_vec_2):
> > >  # else
> > >         movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
> > >         movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> > > +       TOLOWER_gpr (%rax, %eax)
> > > +       TOLOWER_gpr (%rcx, %ecx)
> > >         subl    %ecx, %eax
> > >  # endif
> > >  L(ret3):
> > > @@ -290,6 +461,8 @@ L(return_vec_3):
> > >  #  else
> > >         movzbl  (VEC_SIZE * 3)(%rdi, %rcx), %eax
> > >         movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> > > +       TOLOWER_gpr (%rax, %eax)
> > > +       TOLOWER_gpr (%rcx, %ecx)
> > >         subl    %ecx, %eax
> > >  #  endif
> > >  L(ret4):
> > > @@ -303,7 +476,7 @@ L(more_3x_vec):
> > >         /* Safe to compare 4x vectors.  */
> > >         VMOVU   (VEC_SIZE)(%rdi), %YMM0
> > >         VPTESTM %YMM0, %YMM0, %k2
> > > -       VPCMP   $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
> > > +       CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >         TESTEQ  %ecx
> > >         jnz     L(return_vec_1)
> > > @@ -315,14 +488,14 @@ L(more_3x_vec):
> > >
> > >         VMOVU   (VEC_SIZE * 2)(%rdi), %YMM0
> > >         VPTESTM %YMM0, %YMM0, %k2
> > > -       VPCMP   $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
> > > +       CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >         TESTEQ  %ecx
> > >         jnz     L(return_vec_2)
> > >
> > >         VMOVU   (VEC_SIZE * 3)(%rdi), %YMM0
> > >         VPTESTM %YMM0, %YMM0, %k2
> > > -       VPCMP   $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
> > > +       CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >         TESTEQ  %ecx
> > >         jnz     L(return_vec_3)
> > > @@ -381,7 +554,6 @@ L(prepare_loop_aligned):
> > >         subl    %esi, %eax
> > >         andl    $(PAGE_SIZE - 1), %eax
> > >
> > > -       vpxorq  %YMMZERO, %YMMZERO, %YMMZERO
> > >
> > >         /* Loop 4x comparisons at a time.  */
> > >         .p2align 4
> > > @@ -413,22 +585,35 @@ L(loop_skip_page_cross_check):
> > >         /* A zero CHAR in YMM9 means that there is a null CHAR.  */
> > >         VPMINU  %YMM8, %YMM9, %YMM9
> > >
> > > -       /* Each bit set in K1 represents a non-null CHAR in YMM8.  */
> > > +       /* Each bit set in K1 represents a non-null CHAR in YMM9.  */
> > >         VPTESTM %YMM9, %YMM9, %k1
> > > -
> > > +# ifndef USE_AS_STRCASECMP_L
> > >         vpxorq  (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
> > >         vpxorq  (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
> > >         vpxorq  (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> > >         /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
> > >            oring with YMM1. Result is stored in YMM6.  */
> > >         vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
> > > -
> > > +# else
> > > +       VMOVU   (VEC_SIZE * 0)(%rsi), %YMM1
> > > +       TOLOWER_YMM (%YMM0, %YMM1)
> > > +       VMOVU   (VEC_SIZE * 1)(%rsi), %YMM3
> > > +       TOLOWER_YMM (%YMM2, %YMM3)
> > > +       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM5
> > > +       TOLOWER_YMM (%YMM4, %YMM5)
> > > +       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM7
> > > +       TOLOWER_YMM (%YMM6, %YMM7)
> > > +       vpxorq  %YMM0, %YMM1, %YMM1
> > > +       vpxorq  %YMM2, %YMM3, %YMM3
> > > +       vpxorq  %YMM4, %YMM5, %YMM5
> > > +       vpternlogd $0xde, %YMM7, %YMM1, %YMM6
> > > +# endif
> > >         /* Or together YMM3, YMM5, and YMM6.  */
> > >         vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
> > >
> > >
> > >         /* A non-zero CHAR in YMM6 represents a mismatch.  */
> > > -       VPCMP   $0, %YMMZERO, %YMM6, %k0{%k1}
> > > +       VPTESTNM %YMM6, %YMM6, %k0{%k1}
> > >         kmovd   %k0, %LOOP_REG
> > >
> > >         TESTEQ  %LOOP_REG
> > > @@ -437,13 +622,13 @@ L(loop_skip_page_cross_check):
> > >
> > >         /* Find which VEC has the mismatch of end of string.  */
> > >         VPTESTM %YMM0, %YMM0, %k1
> > > -       VPCMP   $0, %YMMZERO, %YMM1, %k0{%k1}
> > > +       VPTESTNM %YMM1, %YMM1, %k0{%k1}
> > >         kmovd   %k0, %ecx
> > >         TESTEQ  %ecx
> > >         jnz     L(return_vec_0_end)
> > >
> > >         VPTESTM %YMM2, %YMM2, %k1
> > > -       VPCMP   $0, %YMMZERO, %YMM3, %k0{%k1}
> > > +       VPTESTNM %YMM3, %YMM3, %k0{%k1}
> > >         kmovd   %k0, %ecx
> > >         TESTEQ  %ecx
> > >         jnz     L(return_vec_1_end)
> > > @@ -457,7 +642,7 @@ L(return_vec_2_3_end):
> > >  # endif
> > >
> > >         VPTESTM %YMM4, %YMM4, %k1
> > > -       VPCMP   $0, %YMMZERO, %YMM5, %k0{%k1}
> > > +       VPTESTNM %YMM5, %YMM5, %k0{%k1}
> > >         kmovd   %k0, %ecx
> > >         TESTEQ  %ecx
> > >  # if CHAR_PER_VEC <= 16
> > > @@ -493,6 +678,8 @@ L(return_vec_3_end):
> > >  # else
> > >         movzbl  (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
> > >         movzbl  (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
> > > +       TOLOWER_gpr (%rax, %eax)
> > > +       TOLOWER_gpr (%rcx, %ecx)
> > >         subl    %ecx, %eax
> > >         xorl    %r8d, %eax
> > >         subl    %r8d, %eax
> > > @@ -545,6 +732,8 @@ L(return_vec_0_end):
> > >  # else
> > >         movzbl  (%rdi, %rcx), %eax
> > >         movzbl  (%rsi, %rcx), %ecx
> > > +       TOLOWER_gpr (%rax, %eax)
> > > +       TOLOWER_gpr (%rcx, %ecx)
> > >         subl    %ecx, %eax
> > >         /* Flip `eax` if `rdi` and `rsi` were swapped in page cross
> > >            logic. Subtract `r8d` after xor for zero case.  */
> > > @@ -569,6 +758,8 @@ L(return_vec_1_end):
> > >  #  else
> > >         movzbl  VEC_SIZE(%rdi, %rcx), %eax
> > >         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
> > > +       TOLOWER_gpr (%rax, %eax)
> > > +       TOLOWER_gpr (%rcx, %ecx)
> > >         subl    %ecx, %eax
> > >         xorl    %r8d, %eax
> > >         subl    %r8d, %eax
> > > @@ -598,7 +789,7 @@ L(page_cross_during_loop):
> > >
> > >         VMOVA   (%rdi), %YMM0
> > >         VPTESTM %YMM0, %YMM0, %k2
> > > -       VPCMP   $0, (%rsi), %YMM0, %k1{%k2}
> > > +       CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >         TESTEQ  %ecx
> > >         jnz     L(return_vec_0_end)
> > > @@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross):
> > >            been loaded earlier so must be valid.  */
> > >         VMOVU   -VEC_SIZE(%rdi, %rax), %YMM0
> > >         VPTESTM %YMM0, %YMM0, %k2
> > > -       VPCMP   $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
> > > -
> > > +       CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
> > >         /* Mask of potentially valid bits. The lower bits can be out of
> > >            range comparisons (but safe regarding page crosses).  */
> > >
> > > @@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross):
> > >
> > >  # ifdef USE_AS_STRNCMP
> > >  #  ifdef USE_AS_WCSCMP
> > > +       /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> > > +          safe.  */
> > >         movl    %eax, %r11d
> > >         shrl    $2, %r11d
> > >         cmpq    %r11, %rdx
> > > @@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem):
> > >  # else
> > >         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
> > >         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
> > > +       TOLOWER_gpr (%rax, %eax)
> > > +       TOLOWER_gpr (%rcx, %ecx)
> > >         subl    %ecx, %eax
> > >         xorl    %r8d, %eax
> > >         subl    %r8d, %eax
> > > @@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross):
> > >
> > >         VMOVA   VEC_SIZE(%rdi), %YMM0
> > >         VPTESTM %YMM0, %YMM0, %k2
> > > -       VPCMP   $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
> > > +       CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >         TESTEQ  %ecx
> > >         jnz     L(return_vec_1_end)
> > > @@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross):
> > >         /* Safe to include comparisons from lower bytes.  */
> > >         VMOVU   -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
> > >         VPTESTM %YMM0, %YMM0, %k2
> > > -       VPCMP   $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
> > > +       CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >         TESTEQ  %ecx
> > >         jnz     L(return_vec_page_cross_0)
> > >
> > >         VMOVU   -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
> > >         VPTESTM %YMM0, %YMM0, %k2
> > > -       VPCMP   $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
> > > +       CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >         TESTEQ  %ecx
> > >         jnz     L(return_vec_page_cross_1)
> > > @@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross):
> > >         /* Must check length here as length might preclude reading next
> > >            page.  */
> > >  #  ifdef USE_AS_WCSCMP
> > > +       /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> > > +          safe.  */
> > >         movl    %eax, %r11d
> > >         shrl    $2, %r11d
> > >         cmpq    %r11, %rdx
> > > @@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross):
> > >         VMOVA   (VEC_SIZE * 3)(%rdi), %YMM6
> > >         VPMINU  %YMM4, %YMM6, %YMM9
> > >         VPTESTM %YMM9, %YMM9, %k1
> > > -
> > > +# ifndef USE_AS_STRCASECMP_L
> > >         vpxorq  (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> > >         /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
> > >         vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
> > > -
> > > -       VPCMP   $0, %YMMZERO, %YMM6, %k0{%k1}
> > > +# else
> > > +       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM5
> > > +       TOLOWER_YMM (%YMM4, %YMM5)
> > > +       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM7
> > > +       TOLOWER_YMM (%YMM6, %YMM7)
> > > +       vpxorq  %YMM4, %YMM5, %YMM5
> > > +       vpternlogd $0xde, %YMM7, %YMM5, %YMM6
> > > +# endif
> > > +       VPTESTNM %YMM6, %YMM6, %k0{%k1}
> > >         kmovd   %k0, %LOOP_REG
> > >         TESTEQ  %LOOP_REG
> > >         jnz     L(return_vec_2_3_end)
> > > @@ -815,6 +1018,8 @@ L(return_vec_page_cross_1):
> > >  # else
> > >         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
> > >         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
> > > +       TOLOWER_gpr (%rax, %eax)
> > > +       TOLOWER_gpr (%rcx, %ecx)
> > >         subl    %ecx, %eax
> > >         xorl    %r8d, %eax
> > >         subl    %r8d, %eax
> > > @@ -871,7 +1076,7 @@ L(page_cross):
> > >  L(page_cross_loop):
> > >         VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> > >         VPTESTM %YMM0, %YMM0, %k2
> > > -       VPCMP   $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> > > +       CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >         TESTEQ  %ecx
> > >         jnz     L(check_ret_vec_page_cross)
> > > @@ -895,7 +1100,7 @@ L(page_cross_loop):
> > >          */
> > >         VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> > >         VPTESTM %YMM0, %YMM0, %k2
> > > -       VPCMP   $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> > > +       CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> > >
> > >         kmovd   %k1, %ecx
> > >  # ifdef USE_AS_STRNCMP
> > > @@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont):
> > >  # else
> > >         movzbl  (%rdi, %rcx, SIZE_OF_CHAR), %eax
> > >         movzbl  (%rsi, %rcx, SIZE_OF_CHAR), %ecx
> > > +       TOLOWER_gpr (%rax, %eax)
> > > +       TOLOWER_gpr (%rcx, %ecx)
> > >         subl    %ecx, %eax
> > >         xorl    %r8d, %eax
> > >         subl    %r8d, %eax
> > > @@ -989,7 +1196,7 @@ L(less_1x_vec_till_page):
> > >         /* Use 16 byte comparison.  */
> > >         vmovdqu (%rdi), %xmm0
> > >         VPTESTM %xmm0, %xmm0, %k2
> > > -       VPCMP   $0, (%rsi), %xmm0, %k1{%k2}
> > > +       CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >  # ifdef USE_AS_WCSCMP
> > >         subl    $0xf, %ecx
> > > @@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page):
> > >  # endif
> > >         vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > >         VPTESTM %xmm0, %xmm0, %k2
> > > -       VPCMP   $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
> > > +       CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >  # ifdef USE_AS_WCSCMP
> > >         subl    $0xf, %ecx
> > > @@ -1048,7 +1255,7 @@ L(less_16_till_page):
> > >         vmovq   (%rdi), %xmm0
> > >         vmovq   (%rsi), %xmm1
> > >         VPTESTM %xmm0, %xmm0, %k2
> > > -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> > > +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >  # ifdef USE_AS_WCSCMP
> > >         subl    $0x3, %ecx
> > > @@ -1068,7 +1275,7 @@ L(less_16_till_page):
> > >         vmovq   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > >         vmovq   (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> > >         VPTESTM %xmm0, %xmm0, %k2
> > > -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> > > +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >  # ifdef USE_AS_WCSCMP
> > >         subl    $0x3, %ecx
> > > @@ -1128,7 +1335,7 @@ L(ret_less_8_wcs):
> > >         vmovd   (%rdi), %xmm0
> > >         vmovd   (%rsi), %xmm1
> > >         VPTESTM %xmm0, %xmm0, %k2
> > > -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> > > +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >         subl    $0xf, %ecx
> > >         jnz     L(check_ret_vec_page_cross)
> > > @@ -1143,7 +1350,7 @@ L(ret_less_8_wcs):
> > >         vmovd   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > >         vmovd   (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> > >         VPTESTM %xmm0, %xmm0, %k2
> > > -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> > > +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > >         kmovd   %k1, %ecx
> > >         subl    $0xf, %ecx
> > >         jnz     L(check_ret_vec_page_cross)
> > > @@ -1176,7 +1383,9 @@ L(less_4_till_page):
> > >  L(less_4_loop):
> > >         movzbl  (%rdi), %eax
> > >         movzbl  (%rsi, %rdi), %ecx
> > > -       subl    %ecx, %eax
> > > +       TOLOWER_gpr (%rax, %eax)
> > > +       TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> > > +       subl    %BYTE_LOOP_REG, %eax
> > >         jnz     L(ret_less_4_loop)
> > >         testl   %ecx, %ecx
> > >         jz      L(ret_zero_4_loop)
> > > @@ -1203,5 +1412,6 @@ L(ret_less_4_loop):
> > >         subl    %r8d, %eax
> > >         ret
> > >  # endif
> > > -END(STRCMP)
> > > +       cfi_endproc
> > > +       .size   STRCMP, .-STRCMP
> > >  #endif
> > > diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> > > new file mode 100644
> > > index 0000000000..8a5af3695c
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> > > @@ -0,0 +1,25 @@
> > > +/* strncasecmp_l optimized with EVEX.
> > > +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#ifndef STRCMP
> > > +# define STRCMP        __strncasecmp_l_evex
> > > +#endif
> > > +#define OVERFLOW_STRCMP        __strcasecmp_l_evex
> > > +#define USE_AS_STRCASECMP_L
> > > +#define USE_AS_STRNCMP
> > > +#include "strcmp-evex.S"
> > > --
> > > 2.25.1
> > >
> >
> > LGTM.
> >
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks, pushed the patchset.
> >
> > Thanks.
> >
> > --
> > H.J.
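
For reference, the vpternlogd $0xde step in the loop quoted above
folds two vector compares into one result so that a single VPTESTNM
can check both vectors at once. A minimal C model of one 32-bit lane
(illustrative only, not glibc code; dst doubles as the first input):

#include <stdint.h>

/* dst = src1 | (dst ^ src2): any lane where either compare saw a
   difference ends up non-zero.  */
static inline uint32_t
ternlog_0xde (uint32_t dst, uint32_t src1, uint32_t src2)
{
  return src1 | (dst ^ src2);
}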

I would like to backport this patch to release branches.
Any comments or objections?

Conflict resolution patch attached.

--Sunil

[-- Attachment #2: 0015-x86-Add-AVX2-optimized-str-n-casecmp.patch --]
[-- Type: application/octet-stream, Size: 24673 bytes --]

From b382e4caf50dfee62e170f9b6617b470b1289dcb Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 24 Mar 2022 18:56:12 -0500
Subject: [PATCH 15/26] x86: Add AVX2 optimized str{n}casecmp

geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702

All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit bbf81222343fed5cd704001a2ae0d86c71544151)
---
 sysdeps/x86_64/multiarch/Makefile             |   4 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  28 +++
 sysdeps/x86_64/multiarch/ifunc-strcasecmp.h   |  12 +
 .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S  |  15 ++
 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S  |  23 ++
 sysdeps/x86_64/multiarch/strcmp-avx2.S        | 237 +++++++++++++++---
 .../x86_64/multiarch/strncase_l-avx2-rtm.S    |  16 ++
 sysdeps/x86_64/multiarch/strncase_l-avx2.S    |  27 ++
 8 files changed, 331 insertions(+), 31 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S

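As a sanity check on the headline number: a geometric-mean ratio of
0.702 (AVX2 / SSE4.2) means the AVX2 variants take roughly 70% of
the SSE4.2 time across the 40-benchmark set, i.e. about a 1.4x
speedup (1 / 0.702 ~= 1.42).
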
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7b413edad..06e1848823 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -55,6 +55,8 @@ sysdep_routines += \
   stpncpy-sse2-unaligned \
   stpncpy-ssse3 \
   strcasecmp_l-avx \
+  strcasecmp_l-avx2 \
+  strcasecmp_l-avx2-rtm \
   strcasecmp_l-sse2 \
   strcasecmp_l-sse4_2 \
   strcasecmp_l-ssse3 \
@@ -93,6 +95,8 @@ sysdep_routines += \
   strlen-evex \
   strlen-sse2 \
   strncase_l-avx \
+  strncase_l-avx2 \
+  strncase_l-avx2-rtm \
   strncase_l-sse2 \
   strncase_l-sse4_2 \
   strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a594f4176e..3c556d07ac 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strcasecmp_avx)
@@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strcasecmp_l_avx)
@@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strncasecmp_avx)
@@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strncasecmp_l_avx)
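
The IFUNC_IMPL_ADD entries above are what the string tests and
benchtests enumerate when they loop over implementations (the
FOR_EACH_IMPL pattern). A rough consumer sketch using the
glibc-internal list interface; run_one here is a hypothetical
harness hook, not an existing function:

#include <ifunc-impl-list.h>

extern void run_one (const char *name, void *fn);  /* hypothetical */

static void
run_all_strcasecmp (void)
{
  struct libc_ifunc_impl impls[32];
  size_t n = __libc_ifunc_impl_list ("strcasecmp", impls, 32);
  for (size_t i = 0; i < n; i++)
    if (impls[i].usable)
      run_one (impls[i].name, impls[i].fn);
}
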
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 9e3cc61ac0..c4de111fd0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+        return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+        return OPTIMIZE (avx2);
+    }
+
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
     return OPTIMIZE (avx);
 
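
This selector is shared by all four entry points (strcasecmp,
strcasecmp_l, strncasecmp, strncasecmp_l); each dispatch file wires
it up roughly like this (an abbreviated sketch of the existing glibc
convention, not part of this patch):

/* sysdeps/x86_64/multiarch/strcasecmp_l.c, roughly.  */
#define strcasecmp_l __redirect_strcasecmp_l
#include <string.h>
#undef strcasecmp_l

#define SYMBOL_NAME strcasecmp_l
#include "ifunc-strcasecmp.h"

libc_ifunc_redirected (__redirect_strcasecmp_l, __strcasecmp_l,
                       IFUNC_SELECTOR ());
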
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
new file mode 100644
index 0000000000..09957fc3c5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
@@ -0,0 +1,15 @@
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
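
A note on the RTM overrides above: vzeroupper aborts an active RTM
transaction, so ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST makes the
return path test with xtest and clear the upper vector state with
vzeroall while a transaction is running, using vzeroupper only on
the non-transactional path. The SECTION override keeps the variant
in the dedicated .text.avx.rtm subsection alongside the other *-rtm
implementations.
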
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
new file mode 100644
index 0000000000..e2762f2a22
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-avx2.S"
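
The whole str{n}casecmp family is stamped out of the strcmp-avx2.S
template below; the small wrapper files just set the knobs before
including it. Spelled out (matching the #defines in these files):

  USE_AS_STRCASECMP_L            -> case-fold both inputs
  USE_AS_STRNCMP                 -> honor the length argument
  STRCMP = __strcasecmp_l_avx2   -> label for the _l entry point
  STRCASECMP = __strcasecmp_avx2 -> extra entry that loads the
                                    thread's locale and falls
                                    through to the _l code
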
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index b94fc5c39a..3366d0b083 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -20,6 +20,10 @@
 
 # include <sysdep.h>
 
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
+
 # ifndef STRCMP
 #  define STRCMP	__strcmp_avx2
 # endif
@@ -74,13 +78,88 @@
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_avx2
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_avx2
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
 # define xmmZERO	xmm15
 # define ymmZERO	ymm15
 
+# define LCASE_MIN_ymm	%ymm10
+# define LCASE_MAX_ymm	%ymm11
+# define CASE_ADD_ymm	%ymm12
+
+# define LCASE_MIN_xmm	%xmm10
+# define LCASE_MAX_xmm	%xmm11
+# define CASE_ADD_xmm	%xmm12
+
+	/* r11 is never used elsewhere so this is safe to maintain.  */
+# define TOLOWER_BASE	%r11
+
 # ifndef SECTION
 #  define SECTION(p)	p##.avx
 # endif
 
+# ifdef USE_AS_STRCASECMP_L
+#  define REG(x, y) x ## y
+#  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext)			\
+	vpaddb	REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8);				\
+	vpaddb	REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9);				\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpaddb	REG(%ext, 8), reg1_in, reg1_out;							\
+	vpaddb	REG(%ext, 9), reg2_in, reg2_out
+
+#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_ymm(...)	TOLOWER(__VA_ARGS__, ymm)
+#  define TOLOWER_xmm(...)	TOLOWER(__VA_ARGS__, xmm)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext)			\
+	TOLOWER	(s1_reg, scratch_reg, s2_reg, s2_reg, ext);					\
+	VPCMPEQ	scratch_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext)			\
+	VMOVU	s2_mem, reg_out;											\
+	CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+#  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+#  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_ymm(...)
+#  define TOLOWER_xmm(...)
+
+#  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out)			\
+	VPCMPEQ	s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+#  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
            strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -102,8 +181,49 @@
    returned.  */
 
 	.section SECTION(.text), "ax", @progbits
-ENTRY(STRCMP)
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifndef GLABEL
+#  define GLABEL(...)	__VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
+END (GLABEL(STRCASECMP))
+	/* FALLTHROUGH to strcasecmp_l/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
+#  endif
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
 # ifdef USE_AS_STRNCMP
+	/* Don't overwrite LOCALE_REG (rcx) until we have passed
+	   L(one_or_less). Otherwise we might use the wrong locale in
+	   the OVERFLOW_STRCMP (strcasecmp_l).  */
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -128,6 +248,30 @@ ENTRY(STRCMP)
 #  endif
 # endif
 	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa	L(lcase_min)(%rip), LCASE_MIN_ymm
+	vmovdqa	L(lcase_max)(%rip), LCASE_MAX_ymm
+	vmovdqa	L(case_add)(%rip), CASE_ADD_ymm
+# endif
 	movl	%edi, %eax
 	orl	%esi, %eax
 	sall	$20, %eax
@@ -138,8 +282,10 @@ ENTRY(STRCMP)
 L(no_page_cross):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	(%rdi), %ymm0
-	/* 1s where s1 and s2 equal.  */
-	VPCMPEQ	(%rsi), %ymm0, %ymm1
+	/* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
+	   Otherwise convert ymm0 and the load from rsi to lowercase
+	   first. ymm2 is scratch and ymm1 holds the result.  */
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
 	/* 1s at null CHAR.  */
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	/* 1s where s1 and s2 equal AND not null CHAR.  */
@@ -172,6 +318,8 @@ L(return_vec_0):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret0):
@@ -192,6 +340,10 @@ L(ret_zero):
 
 	.p2align 4,, 5
 L(one_or_less):
+#  ifdef USE_AS_STRCASECMP_L
+	/* Set locale argument for strcasecmp.  */
+	movq	%LOCALE_REG, %rdx
+#  endif
 	jb	L(ret_zero)
 #  ifdef USE_AS_WCSCMP
 	/* 'nbe' covers the case where length is negative (large
@@ -211,6 +363,8 @@ L(one_or_less):
 	jnbe	__strcmp_avx2
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret1):
@@ -238,6 +392,8 @@ L(return_vec_1):
 # else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret2):
@@ -269,6 +425,8 @@ L(return_vec_2):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret3):
@@ -289,6 +447,8 @@ L(return_vec_3):
 #  else
 	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret4):
@@ -299,7 +459,7 @@ L(ret4):
 L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	VEC_SIZE(%rdi), %ymm0
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -312,7 +472,7 @@ L(more_3x_vec):
 # endif
 
 	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -320,7 +480,7 @@ L(more_3x_vec):
 	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -395,12 +555,10 @@ L(loop_skip_page_cross_check):
 	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
 
 	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
-	VPCMPEQ	(VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
-
-	VPCMPEQ	(VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
-
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+	CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
 
 	/* A mismatch or null CHAR yields a 0 CHAR in that lane,
 	   otherwise the lane stays non-zero.  */
@@ -469,6 +627,8 @@ L(return_vec_2_3_end):
 # else
 	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
 	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -512,6 +672,8 @@ L(return_vec_0_end):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -534,6 +696,8 @@ L(return_vec_1_end):
 #  else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -560,6 +724,8 @@ L(return_vec_2_end):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -587,7 +753,7 @@ L(page_cross_during_loop):
 	jle	L(less_1x_vec_till_page_cross)
 
 	VMOVA	(%rdi), %ymm0
-	VPCMPEQ	(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -609,7 +775,7 @@ L(less_1x_vec_till_page_cross):
 	   here, it means the previous page (rdi - VEC_SIZE) has already
 	   been loaded earlier so must be valid.  */
 	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
-	VPCMPEQ	-VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -651,6 +817,8 @@ L(return_page_cross_cmp_mem):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -677,7 +845,7 @@ L(more_2x_vec_till_page_cross):
 	   iteration here.  */
 
 	VMOVU	VEC_SIZE(%rdi), %ymm0
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -693,7 +861,7 @@ L(more_2x_vec_till_page_cross):
 
 	/* Safe to include comparisons from lower bytes.  */
 	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
-	VPCMPEQ	-(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -701,7 +869,7 @@ L(more_2x_vec_till_page_cross):
 	jnz	L(return_vec_page_cross_0)
 
 	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
-	VPCMPEQ	-(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -719,8 +887,8 @@ L(more_2x_vec_till_page_cross):
 	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
 	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
 
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
 	vpand	%ymm4, %ymm5, %ymm5
 	vpand	%ymm6, %ymm7, %ymm7
 	VPMINU	%ymm5, %ymm7, %ymm7
@@ -771,6 +939,8 @@ L(return_vec_page_cross_1):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -826,7 +996,7 @@ L(page_cross):
 L(page_cross_loop):
 
 	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -844,11 +1014,11 @@ L(page_cross_loop):
 	subl	%eax, %OFFSET_REG
 	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
 	   to not cross page so is safe to load. Since we have already
-	   loaded at least 1 VEC from rsi it is also guaranteed to be safe.
-	 */
+	   loaded at least 1 VEC from rsi it is also guaranteed to be
+	   safe.  */
 
 	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -881,6 +1051,8 @@ L(ret_vec_page_cross_cont):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -934,7 +1106,7 @@ L(less_1x_vec_till_page):
 	ja	L(less_16_till_page)
 
 	VMOVU	(%rdi), %xmm0
-	VPCMPEQ	(%rsi), %xmm0, %xmm1
+	CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
@@ -952,7 +1124,7 @@ L(less_1x_vec_till_page):
 # endif
 
 	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %xmm0, %xmm1
+	CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
@@ -990,7 +1162,7 @@ L(less_16_till_page):
 	vmovq	(%rdi), %xmm0
 	vmovq	(%rsi), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	incb	%cl
@@ -1010,7 +1182,7 @@ L(less_16_till_page):
 	vmovq	(%rdi, %OFFSET_REG64), %xmm0
 	vmovq	(%rsi, %OFFSET_REG64), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	incb	%cl
@@ -1066,7 +1238,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi), %xmm0
 	vmovd	(%rsi), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	subl	$0xf, %ecx
@@ -1085,7 +1257,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi, %OFFSET_REG64), %xmm0
 	vmovd	(%rsi, %OFFSET_REG64), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	subl	$0xf, %ecx
@@ -1119,7 +1291,9 @@ L(less_4_till_page):
 L(less_4_loop):
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi, %rdi), %ecx
-	subl	%ecx, %eax
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
 	jnz	L(ret_less_4_loop)
 	testl	%ecx, %ecx
 	jz	L(ret_zero_4_loop)
@@ -1146,5 +1320,6 @@ L(ret_less_4_loop):
 	subl	%r8d, %eax
 	ret
 # endif
-END(STRCMP)
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
 #endif
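
The L(lcase_min)/L(lcase_max)/L(case_add) constants above implement
a branchless ASCII tolower: adding 0x3f shifts 'A'..'Z' (0x41..0x5a)
into the signed-byte range 0x80..0x99, one signed compare against
0x99 then isolates exactly the uppercase bytes, and vpandn turns
that mask into a per-byte 0x20 to add. A scalar C model of one byte
(assuming ASCII input, which holds because non-ASCII locales were
already diverted to STRCASECMP_NONASCII at entry):

#include <stdint.h>

static inline uint8_t
tolower_ascii (uint8_t c)
{
  int8_t shifted = (int8_t) (c + 0x3f);     /* vpaddb   L(lcase_min) */
  int not_upper = shifted > (int8_t) 0x99;  /* vpcmpgtb L(lcase_max) */
  uint8_t add = not_upper ? 0 : 0x20;       /* vpandn   L(case_add)  */
  return (uint8_t) (c + add);               /* vpaddb                */
}

The scalar tail (TOLOWER_gpr) instead indexes the C locale's tolower
table through the _nl_C_LC_CTYPE_tolower base set up at entry; the
"+ 128 * 4" bias lets plain 0..255 byte values index a table that
also covers the negative char/EOF slots.
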
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
new file mode 100644
index 0000000000..58c05dcfb8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
@@ -0,0 +1,16 @@
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+#define OVERFLOW_STRCMP	__strcasecmp_l_avx2_rtm
+
+#include "strncase_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
new file mode 100644
index 0000000000..48c0aa21f8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
@@ -0,0 +1,27 @@
+/* strncasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP	__strcasecmp_l_avx2
+#endif
+#include "strcmp-avx2.S"
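
OVERFLOW_STRCMP exists for the strncasecmp entry points: a length
that is "negative" as a signed value (e.g. (size_t) -1) cannot bound
the comparison, so that path tail-calls the unbounded strcasecmp_l
code instead, which is also why the assembly keeps the locale
register intact until past L(one_or_less). In rough C terms
(illustrative sketch only; the real check is a flags test in the
assembly):

#include <stddef.h>
#include <locale.h>

extern int __strcasecmp_l_avx2 (const char *, const char *, locale_t);

static int
strncase_entry_sketch (const char *s1, const char *s2, size_t n,
                       locale_t loc)
{
  if (n == 0)
    return 0;
  if ((ptrdiff_t) n < 0)  /* huge unsigned n: effectively unbounded */
    return __strcasecmp_l_avx2 (s1, s2, loc);  /* OVERFLOW_STRCMP */
  /* ... bounded, case-folded comparison ... */
  return 0;
}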
-- 
2.35.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v4 21/23] x86: Add AVX2 optimized str{n}casecmp
  2022-03-25 18:14     ` [PATCH v4 21/23] x86: Add AVX2 " H.J. Lu
@ 2022-05-12 19:52       ` Sunil Pandey
  0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:52 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

[-- Attachment #1: Type: text/plain, Size: 31021 bytes --]

On Fri, Mar 25, 2022 at 11:15 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Mar 24, 2022 at 4:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
> >
> > All string/memory tests pass.
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> >  sysdeps/x86_64/multiarch/Makefile             |   4 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  28 +++
> >  sysdeps/x86_64/multiarch/ifunc-strcasecmp.h   |  12 +
> >  .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S  |  15 ++
> >  sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S  |  23 ++
> >  sysdeps/x86_64/multiarch/strcmp-avx2.S        | 237 +++++++++++++++---
> >  .../x86_64/multiarch/strncase_l-avx2-rtm.S    |  16 ++
> >  sysdeps/x86_64/multiarch/strncase_l-avx2.S    |  27 ++
> >  8 files changed, 331 insertions(+), 31 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index e7b413edad..06e1848823 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -55,6 +55,8 @@ sysdep_routines += \
> >    stpncpy-sse2-unaligned \
> >    stpncpy-ssse3 \
> >    strcasecmp_l-avx \
> > +  strcasecmp_l-avx2 \
> > +  strcasecmp_l-avx2-rtm \
> >    strcasecmp_l-sse2 \
> >    strcasecmp_l-sse4_2 \
> >    strcasecmp_l-ssse3 \
> > @@ -93,6 +95,8 @@ sysdep_routines += \
> >    strlen-evex \
> >    strlen-sse2 \
> >    strncase_l-avx \
> > +  strncase_l-avx2 \
> > +  strncase_l-avx2-rtm \
> >    strncase_l-sse2 \
> >    strncase_l-sse4_2 \
> >    strncase_l-ssse3 \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index a594f4176e..3c556d07ac 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> >    /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
> >    IFUNC_IMPL (i, name, strcasecmp,
> > +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> > +                             CPU_FEATURE_USABLE (AVX2),
> > +                             __strcasecmp_avx2)
> > +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> > +                             (CPU_FEATURE_USABLE (AVX2)
> > +                              && CPU_FEATURE_USABLE (RTM)),
> > +                             __strcasecmp_avx2_rtm)
> >               IFUNC_IMPL_ADD (array, i, strcasecmp,
> >                               CPU_FEATURE_USABLE (AVX),
> >                               __strcasecmp_avx)
> > @@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> >    /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
> >    IFUNC_IMPL (i, name, strcasecmp_l,
> > +             IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> > +                             CPU_FEATURE_USABLE (AVX2),
> > +                             __strcasecmp_l_avx2)
> > +             IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> > +                             (CPU_FEATURE_USABLE (AVX2)
> > +                              && CPU_FEATURE_USABLE (RTM)),
> > +                             __strcasecmp_l_avx2_rtm)
> >               IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> >                               CPU_FEATURE_USABLE (AVX),
> >                               __strcasecmp_l_avx)
> > @@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> >    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
> >    IFUNC_IMPL (i, name, strncasecmp,
> > +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> > +                             CPU_FEATURE_USABLE (AVX2),
> > +                             __strncasecmp_avx2)
> > +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> > +                             (CPU_FEATURE_USABLE (AVX2)
> > +                              && CPU_FEATURE_USABLE (RTM)),
> > +                             __strncasecmp_avx2_rtm)
> >               IFUNC_IMPL_ADD (array, i, strncasecmp,
> >                               CPU_FEATURE_USABLE (AVX),
> >                               __strncasecmp_avx)
> > @@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> >    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
> >    IFUNC_IMPL (i, name, strncasecmp_l,
> > +             IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> > +                             CPU_FEATURE_USABLE (AVX2),
> > +                             __strncasecmp_l_avx2)
> > +             IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> > +                             (CPU_FEATURE_USABLE (AVX2)
> > +                              && CPU_FEATURE_USABLE (RTM)),
> > +                             __strncasecmp_l_avx2_rtm)
> >               IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> >                               CPU_FEATURE_USABLE (AVX),
> >                               __strncasecmp_l_avx)
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > index 9e3cc61ac0..c4de111fd0 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > @@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> >
> >  static inline void *
> >  IFUNC_SELECTOR (void)
> >  {
> >    const struct cpu_features* cpu_features = __get_cpu_features ();
> >
> > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > +    {
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > +        return OPTIMIZE (avx2_rtm);
> > +
> > +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > +        return OPTIMIZE (avx2);
> > +    }
> > +
> >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
> >      return OPTIMIZE (avx);
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> > new file mode 100644
> > index 0000000000..09957fc3c5
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> > @@ -0,0 +1,15 @@
> > +#ifndef STRCMP
> > +# define STRCMP        __strcasecmp_l_avx2_rtm
> > +#endif
> > +
> > +#define _GLABEL(x)     x ## _rtm
> > +#define GLABEL(x)      _GLABEL(x)
> > +
> > +#define ZERO_UPPER_VEC_REGISTERS_RETURN        \
> > +       ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> > +
> > +#define VZEROUPPER_RETURN      jmp L(return_vzeroupper)
> > +
> > +#define SECTION(p)     p##.avx.rtm
> > +
> > +#include "strcasecmp_l-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> > new file mode 100644
> > index 0000000000..e2762f2a22
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> > @@ -0,0 +1,23 @@
> > +/* strcasecmp_l optimized with AVX2.
> > +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef STRCMP
> > +# define STRCMP        __strcasecmp_l_avx2
> > +#endif
> > +#define USE_AS_STRCASECMP_L
> > +#include "strcmp-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > index 86a86b68e3..8da09bd86d 100644
> > --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > @@ -20,6 +20,10 @@
> >
> >  # include <sysdep.h>
> >
> > +# if defined USE_AS_STRCASECMP_L
> > +#  include "locale-defines.h"
> > +# endif
> > +
> >  # ifndef STRCMP
> >  #  define STRCMP       __strcmp_avx2
> >  # endif
> > @@ -74,13 +78,88 @@
> >  #  define VEC_OFFSET   (-VEC_SIZE)
> >  # endif
> >
> > +# ifdef USE_AS_STRCASECMP_L
> > +#  define BYTE_LOOP_REG        OFFSET_REG
> > +# else
> > +#  define BYTE_LOOP_REG        ecx
> > +# endif
> > +
> > +# ifdef USE_AS_STRCASECMP_L
> > +#  ifdef USE_AS_STRNCMP
> > +#   define STRCASECMP  __strncasecmp_avx2
> > +#   define LOCALE_REG  rcx
> > +#   define LOCALE_REG_LP       RCX_LP
> > +#   define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> > +#  else
> > +#   define STRCASECMP  __strcasecmp_avx2
> > +#   define LOCALE_REG  rdx
> > +#   define LOCALE_REG_LP       RDX_LP
> > +#   define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> > +#  endif
> > +# endif
> > +
> >  # define xmmZERO       xmm15
> >  # define ymmZERO       ymm15
> >
> > +# define LCASE_MIN_ymm %ymm10
> > +# define LCASE_MAX_ymm %ymm11
> > +# define CASE_ADD_ymm  %ymm12
> > +
> > +# define LCASE_MIN_xmm %xmm10
> > +# define LCASE_MAX_xmm %xmm11
> > +# define CASE_ADD_xmm  %xmm12
> > +
> > +       /* r11 is never used elsewhere so this is safe to maintain.  */
> > +# define TOLOWER_BASE  %r11
> > +
> >  # ifndef SECTION
> >  #  define SECTION(p)   p##.avx
> >  # endif
> >
> > +# ifdef USE_AS_STRCASECMP_L
> > +#  define REG(x, y) x ## y
> > +#  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext)                   \
> > +       vpaddb  REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8);                            \
> > +       vpaddb  REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9);                            \
> > +       vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8);                      \
> > +       vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9);                      \
> > +       vpandn  REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8);                        \
> > +       vpandn  REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9);                        \
> > +       vpaddb  REG(%ext, 8), reg1_in, reg1_out;                                                        \
> > +       vpaddb  REG(%ext, 9), reg2_in, reg2_out
> > +
> > +#  define TOLOWER_gpr(src, dst)        movl (TOLOWER_BASE, src, 4), dst
> > +#  define TOLOWER_ymm(...)     TOLOWER(__VA_ARGS__, ymm)
> > +#  define TOLOWER_xmm(...)     TOLOWER(__VA_ARGS__, xmm)
> > +
> > +#  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext)                 \
> > +       TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext);                                     \
> > +       VPCMPEQ scratch_reg, s2_reg, reg_out
> > +
> > +#  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext)                 \
> > +       VMOVU   s2_mem, reg_out;                                                                                        \
> > +       CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
> > +
> > +#  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
> > +#  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
> > +
> > +#  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
> > +#  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
> > +
> > +# else
> > +#  define TOLOWER_gpr(...)
> > +#  define TOLOWER_ymm(...)
> > +#  define TOLOWER_xmm(...)
> > +
> > +#  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out)                  \
> > +       VPCMPEQ s2_reg, s1_reg, reg_out
> > +
> > +#  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
> > +
> > +#  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
> > +#  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
> > +# endif
> > +
> >  /* Warning!
> >             wcscmp/wcsncmp have to use SIGNED comparison for elements.
> >             strcmp/strncmp have to use UNSIGNED comparison for elements.
> > @@ -102,8 +181,49 @@
> >     returned.  */
> >
> >         .section SECTION(.text), "ax", @progbits
> > -ENTRY(STRCMP)
> > +       .align  16
> > +       .type   STRCMP, @function
> > +       .globl  STRCMP
> > +       .hidden STRCMP
> > +
> > +# ifndef GLABEL
> > +#  define GLABEL(...)  __VA_ARGS__
> > +# endif
> > +
> > +# ifdef USE_AS_STRCASECMP_L
> > +ENTRY (GLABEL(STRCASECMP))
> > +       movq    __libc_tsd_LOCALE@gottpoff(%rip), %rax
> > +       mov     %fs:(%rax), %LOCALE_REG_LP
> > +
> > +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> > +       .p2align 4
> > +END (GLABEL(STRCASECMP))
> > +       /* FALLTHROUGH to strcasecmp_l/strncasecmp_l.  */
> > +# endif
> > +
> > +       .p2align 4
> > +STRCMP:
> > +       cfi_startproc
> > +       _CET_ENDBR
> > +       CALL_MCOUNT
> > +
> > +# if defined USE_AS_STRCASECMP_L
> > +       /* We have to fall back on the C implementation for locales with
> > +          encodings not matching ASCII for single bytes.  */
> > +#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> > +       mov     LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> > +#  else
> > +       mov     (%LOCALE_REG), %RAX_LP
> > +#  endif
> > +       testl   $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> > +       jne     STRCASECMP_NONASCII
> > +       leaq    _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> > +# endif
> > +
> >  # ifdef USE_AS_STRNCMP
> > +       /* Don't overwrite LOCALE_REG (rcx) until we have passed
> > +          L(one_or_less). Otherwise we might use the wrong locale in
> > +          the OVERFLOW_STRCMP (strcasecmp_l).  */
> >  #  ifdef __ILP32__
> >         /* Clear the upper 32 bits.  */
> >         movl    %edx, %edx
> > @@ -128,6 +248,30 @@ ENTRY(STRCMP)
> >  #  endif
> >  # endif
> >         vpxor   %xmmZERO, %xmmZERO, %xmmZERO
> > +# if defined USE_AS_STRCASECMP_L
> > +       .section .rodata.cst32, "aM", @progbits, 32
> > +       .align  32
> > +L(lcase_min):
> > +       .quad   0x3f3f3f3f3f3f3f3f
> > +       .quad   0x3f3f3f3f3f3f3f3f
> > +       .quad   0x3f3f3f3f3f3f3f3f
> > +       .quad   0x3f3f3f3f3f3f3f3f
> > +L(lcase_max):
> > +       .quad   0x9999999999999999
> > +       .quad   0x9999999999999999
> > +       .quad   0x9999999999999999
> > +       .quad   0x9999999999999999
> > +L(case_add):
> > +       .quad   0x2020202020202020
> > +       .quad   0x2020202020202020
> > +       .quad   0x2020202020202020
> > +       .quad   0x2020202020202020
> > +       .previous
> > +
> > +       vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
> > +       vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
> > +       vmovdqa L(case_add)(%rip), CASE_ADD_ymm
> > +# endif
> >         movl    %edi, %eax
> >         orl     %esi, %eax
> >         sall    $20, %eax
> > @@ -138,8 +282,10 @@ ENTRY(STRCMP)
> >  L(no_page_cross):
> >         /* Safe to compare 4x vectors.  */
> >         VMOVU   (%rdi), %ymm0
> > -       /* 1s where s1 and s2 equal.  */
> > -       VPCMPEQ (%rsi), %ymm0, %ymm1
> > +       /* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
> > +          Otherwise convert ymm0 and the load from rsi to lowercase
> > +          first. ymm2 is scratch and ymm1 holds the result.  */
> > +       CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
> >         /* 1s at null CHAR.  */
> >         VPCMPEQ %ymm0, %ymmZERO, %ymm2
> >         /* 1s where s1 and s2 equal AND not null CHAR.  */
> > @@ -172,6 +318,8 @@ L(return_vec_0):
> >  # else
> >         movzbl  (%rdi, %rcx), %eax
> >         movzbl  (%rsi, %rcx), %ecx
> > +       TOLOWER_gpr (%rax, %eax)
> > +       TOLOWER_gpr (%rcx, %ecx)
> >         subl    %ecx, %eax
> >  # endif
> >  L(ret0):
> > @@ -192,6 +340,10 @@ L(ret_zero):
> >
> >         .p2align 4,, 5
> >  L(one_or_less):
> > +#  ifdef USE_AS_STRCASECMP_L
> > +       /* Set locale argument for strcasecmp.  */
> > +       movq    %LOCALE_REG, %rdx
> > +#  endif
> >         jb      L(ret_zero)
> >         /* 'nbe' covers the case where length is negative (large
> >            unsigned).  */
> > @@ -207,6 +359,8 @@ L(one_or_less):
> >  #  else
> >         movzbl  (%rdi), %eax
> >         movzbl  (%rsi), %ecx
> > +       TOLOWER_gpr (%rax, %eax)
> > +       TOLOWER_gpr (%rcx, %ecx)
> >         subl    %ecx, %eax
> >  #  endif
> >  L(ret1):
> > @@ -234,6 +388,8 @@ L(return_vec_1):
> >  # else
> >         movzbl  VEC_SIZE(%rdi, %rcx), %eax
> >         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
> > +       TOLOWER_gpr (%rax, %eax)
> > +       TOLOWER_gpr (%rcx, %ecx)
> >         subl    %ecx, %eax
> >  # endif
> >  L(ret2):
> > @@ -265,6 +421,8 @@ L(return_vec_2):
> >  # else
> >         movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
> >         movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> > +       TOLOWER_gpr (%rax, %eax)
> > +       TOLOWER_gpr (%rcx, %ecx)
> >         subl    %ecx, %eax
> >  # endif
> >  L(ret3):
> > @@ -285,6 +443,8 @@ L(return_vec_3):
> >  #  else
> >         movzbl  (VEC_SIZE * 3)(%rdi, %rcx), %eax
> >         movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> > +       TOLOWER_gpr (%rax, %eax)
> > +       TOLOWER_gpr (%rcx, %ecx)
> >         subl    %ecx, %eax
> >  #  endif
> >  L(ret4):
> > @@ -295,7 +455,7 @@ L(ret4):
> >  L(more_3x_vec):
> >         /* Safe to compare 4x vectors.  */
> >         VMOVU   VEC_SIZE(%rdi), %ymm0
> > -       VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
> > +       CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
> >         VPCMPEQ %ymm0, %ymmZERO, %ymm2
> >         vpandn  %ymm1, %ymm2, %ymm1
> >         vpmovmskb %ymm1, %ecx
> > @@ -308,7 +468,7 @@ L(more_3x_vec):
> >  # endif
> >
> >         VMOVU   (VEC_SIZE * 2)(%rdi), %ymm0
> > -       VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
> > +       CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
> >         VPCMPEQ %ymm0, %ymmZERO, %ymm2
> >         vpandn  %ymm1, %ymm2, %ymm1
> >         vpmovmskb %ymm1, %ecx
> > @@ -316,7 +476,7 @@ L(more_3x_vec):
> >         jnz     L(return_vec_2)
> >
> >         VMOVU   (VEC_SIZE * 3)(%rdi), %ymm0
> > -       VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
> > +       CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
> >         VPCMPEQ %ymm0, %ymmZERO, %ymm2
> >         vpandn  %ymm1, %ymm2, %ymm1
> >         vpmovmskb %ymm1, %ecx
> > @@ -391,12 +551,10 @@ L(loop_skip_page_cross_check):
> >         VMOVA   (VEC_SIZE * 3)(%rdi), %ymm6
> >
> >         /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
> > -       VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
> > -
> > -       VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
> > -       VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
> > -       VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
> > -
> > +       CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
> > +       CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
> > +       CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
> > +       CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
> >
> >         /* A mismatch or null CHAR yields a 0 CHAR in that lane,
> >            otherwise the lane stays non-zero.  */
> > @@ -465,6 +623,8 @@ L(return_vec_2_3_end):
> >  # else
> >         movzbl  (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
> >         movzbl  (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
> > +       TOLOWER_gpr (%rax, %eax)
> > +       TOLOWER_gpr (%rcx, %ecx)
> >         subl    %ecx, %eax
> >         xorl    %r8d, %eax
> >         subl    %r8d, %eax
> > @@ -508,6 +668,8 @@ L(return_vec_0_end):
> >  # else
> >         movzbl  (%rdi, %rcx), %eax
> >         movzbl  (%rsi, %rcx), %ecx
> > +       TOLOWER_gpr (%rax, %eax)
> > +       TOLOWER_gpr (%rcx, %ecx)
> >         subl    %ecx, %eax
> >         xorl    %r8d, %eax
> >         subl    %r8d, %eax
> > @@ -530,6 +692,8 @@ L(return_vec_1_end):
> >  #  else
> >         movzbl  VEC_SIZE(%rdi, %rcx), %eax
> >         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
> > +       TOLOWER_gpr (%rax, %eax)
> > +       TOLOWER_gpr (%rcx, %ecx)
> >         subl    %ecx, %eax
> >         xorl    %r8d, %eax
> >         subl    %r8d, %eax
> > @@ -556,6 +720,8 @@ L(return_vec_2_end):
> >  # else
> >         movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
> >         movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> > +       TOLOWER_gpr (%rax, %eax)
> > +       TOLOWER_gpr (%rcx, %ecx)
> >         subl    %ecx, %eax
> >         xorl    %r8d, %eax
> >         subl    %r8d, %eax
> > @@ -583,7 +749,7 @@ L(page_cross_during_loop):
> >         jle     L(less_1x_vec_till_page_cross)
> >
> >         VMOVA   (%rdi), %ymm0
> > -       VPCMPEQ (%rsi), %ymm0, %ymm1
> > +       CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
> >         VPCMPEQ %ymm0, %ymmZERO, %ymm2
> >         vpandn  %ymm1, %ymm2, %ymm1
> >         vpmovmskb %ymm1, %ecx
> > @@ -605,7 +771,7 @@ L(less_1x_vec_till_page_cross):
> >            here, it means the previous page (rdi - VEC_SIZE) has already
> >            been loaded earlier so must be valid.  */
> >         VMOVU   -VEC_SIZE(%rdi, %rax), %ymm0
> > -       VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
> > +       CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
> >         VPCMPEQ %ymm0, %ymmZERO, %ymm2
> >         vpandn  %ymm1, %ymm2, %ymm1
> >         vpmovmskb %ymm1, %ecx
> > @@ -647,6 +813,8 @@ L(return_page_cross_cmp_mem):
> >  # else
> >         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
> >         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
> > +       TOLOWER_gpr (%rax, %eax)
> > +       TOLOWER_gpr (%rcx, %ecx)
> >         subl    %ecx, %eax
> >         xorl    %r8d, %eax
> >         subl    %r8d, %eax
> > @@ -673,7 +841,7 @@ L(more_2x_vec_till_page_cross):
> >            iteration here.  */
> >
> >         VMOVU   VEC_SIZE(%rdi), %ymm0
> > -       VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
> > +       CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
> >         VPCMPEQ %ymm0, %ymmZERO, %ymm2
> >         vpandn  %ymm1, %ymm2, %ymm1
> >         vpmovmskb %ymm1, %ecx
> > @@ -689,7 +857,7 @@ L(more_2x_vec_till_page_cross):
> >
> >         /* Safe to include comparisons from lower bytes.  */
> >         VMOVU   -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
> > -       VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
> > +       CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
> >         VPCMPEQ %ymm0, %ymmZERO, %ymm2
> >         vpandn  %ymm1, %ymm2, %ymm1
> >         vpmovmskb %ymm1, %ecx
> > @@ -697,7 +865,7 @@ L(more_2x_vec_till_page_cross):
> >         jnz     L(return_vec_page_cross_0)
> >
> >         VMOVU   -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
> > -       VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
> > +       CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
> >         VPCMPEQ %ymm0, %ymmZERO, %ymm2
> >         vpandn  %ymm1, %ymm2, %ymm1
> >         vpmovmskb %ymm1, %ecx
> > @@ -715,8 +883,8 @@ L(more_2x_vec_till_page_cross):
> >         VMOVA   (VEC_SIZE * 2)(%rdi), %ymm4
> >         VMOVA   (VEC_SIZE * 3)(%rdi), %ymm6
> >
> > -       VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
> > -       VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
> > +       CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
> > +       CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
> >         vpand   %ymm4, %ymm5, %ymm5
> >         vpand   %ymm6, %ymm7, %ymm7
> >         VPMINU  %ymm5, %ymm7, %ymm7
> > @@ -767,6 +935,8 @@ L(return_vec_page_cross_1):
> >  # else
> >         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
> >         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
> > +       TOLOWER_gpr (%rax, %eax)
> > +       TOLOWER_gpr (%rcx, %ecx)
> >         subl    %ecx, %eax
> >         xorl    %r8d, %eax
> >         subl    %r8d, %eax
> > @@ -822,7 +992,7 @@ L(page_cross):
> >  L(page_cross_loop):
> >
> >         VMOVU   (%rdi, %OFFSET_REG64), %ymm0
> > -       VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
> > +       CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
> >         VPCMPEQ %ymm0, %ymmZERO, %ymm2
> >         vpandn  %ymm1, %ymm2, %ymm1
> >         vpmovmskb %ymm1, %ecx
> > @@ -840,11 +1010,11 @@ L(page_cross_loop):
> >         subl    %eax, %OFFSET_REG
> >         /* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
> >            to not cross page so is safe to load. Since we have already
> > -          loaded at least 1 VEC from rsi it is also guaranteed to be safe.
> > -        */
> > +          loaded at least 1 VEC from rsi it is also guaranteed to be
> > +          safe.  */
> >
> >         VMOVU   (%rdi, %OFFSET_REG64), %ymm0
> > -       VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
> > +       CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
> >         VPCMPEQ %ymm0, %ymmZERO, %ymm2
> >         vpandn  %ymm1, %ymm2, %ymm1
> >         vpmovmskb %ymm1, %ecx
> > @@ -877,6 +1047,8 @@ L(ret_vec_page_cross_cont):
> >  # else
> >         movzbl  (%rdi, %rcx), %eax
> >         movzbl  (%rsi, %rcx), %ecx
> > +       TOLOWER_gpr (%rax, %eax)
> > +       TOLOWER_gpr (%rcx, %ecx)
> >         subl    %ecx, %eax
> >         xorl    %r8d, %eax
> >         subl    %r8d, %eax
> > @@ -930,7 +1102,7 @@ L(less_1x_vec_till_page):
> >         ja      L(less_16_till_page)
> >
> >         VMOVU   (%rdi), %xmm0
> > -       VPCMPEQ (%rsi), %xmm0, %xmm1
> > +       CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
> >         VPCMPEQ %xmm0, %xmmZERO, %xmm2
> >         vpandn  %xmm1, %xmm2, %xmm1
> >         vpmovmskb %ymm1, %ecx
> > @@ -948,7 +1120,7 @@ L(less_1x_vec_till_page):
> >  # endif
> >
> >         VMOVU   (%rdi, %OFFSET_REG64), %xmm0
> > -       VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1
> > +       CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
> >         VPCMPEQ %xmm0, %xmmZERO, %xmm2
> >         vpandn  %xmm1, %xmm2, %xmm1
> >         vpmovmskb %ymm1, %ecx
> > @@ -986,7 +1158,7 @@ L(less_16_till_page):
> >         vmovq   (%rdi), %xmm0
> >         vmovq   (%rsi), %xmm1
> >         VPCMPEQ %xmm0, %xmmZERO, %xmm2
> > -       VPCMPEQ %xmm1, %xmm0, %xmm1
> > +       CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> >         vpandn  %xmm1, %xmm2, %xmm1
> >         vpmovmskb %ymm1, %ecx
> >         incb    %cl
> > @@ -1006,7 +1178,7 @@ L(less_16_till_page):
> >         vmovq   (%rdi, %OFFSET_REG64), %xmm0
> >         vmovq   (%rsi, %OFFSET_REG64), %xmm1
> >         VPCMPEQ %xmm0, %xmmZERO, %xmm2
> > -       VPCMPEQ %xmm1, %xmm0, %xmm1
> > +       CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> >         vpandn  %xmm1, %xmm2, %xmm1
> >         vpmovmskb %ymm1, %ecx
> >         incb    %cl
> > @@ -1062,7 +1234,7 @@ L(ret_less_8_wcs):
> >         vmovd   (%rdi), %xmm0
> >         vmovd   (%rsi), %xmm1
> >         VPCMPEQ %xmm0, %xmmZERO, %xmm2
> > -       VPCMPEQ %xmm1, %xmm0, %xmm1
> > +       CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> >         vpandn  %xmm1, %xmm2, %xmm1
> >         vpmovmskb %ymm1, %ecx
> >         subl    $0xf, %ecx
> > @@ -1081,7 +1253,7 @@ L(ret_less_8_wcs):
> >         vmovd   (%rdi, %OFFSET_REG64), %xmm0
> >         vmovd   (%rsi, %OFFSET_REG64), %xmm1
> >         VPCMPEQ %xmm0, %xmmZERO, %xmm2
> > -       VPCMPEQ %xmm1, %xmm0, %xmm1
> > +       CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> >         vpandn  %xmm1, %xmm2, %xmm1
> >         vpmovmskb %ymm1, %ecx
> >         subl    $0xf, %ecx
> > @@ -1115,7 +1287,9 @@ L(less_4_till_page):
> >  L(less_4_loop):
> >         movzbl  (%rdi), %eax
> >         movzbl  (%rsi, %rdi), %ecx
> > -       subl    %ecx, %eax
> > +       TOLOWER_gpr (%rax, %eax)
> > +       TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> > +       subl    %BYTE_LOOP_REG, %eax
> >         jnz     L(ret_less_4_loop)
> >         testl   %ecx, %ecx
> >         jz      L(ret_zero_4_loop)
> > @@ -1142,5 +1316,6 @@ L(ret_less_4_loop):
> >         subl    %r8d, %eax
> >         ret
> >  # endif
> > -END(STRCMP)
> > +       cfi_endproc
> > +       .size   STRCMP, .-STRCMP
> >  #endif
> > diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> > new file mode 100644
> > index 0000000000..58c05dcfb8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> > @@ -0,0 +1,16 @@
> > +#ifndef STRCMP
> > +# define STRCMP        __strncasecmp_l_avx2_rtm
> > +#endif
> > +
> > +#define _GLABEL(x)     x ## _rtm
> > +#define GLABEL(x)      _GLABEL(x)
> > +
> > +#define ZERO_UPPER_VEC_REGISTERS_RETURN        \
> > +       ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> > +
> > +#define VZEROUPPER_RETURN      jmp L(return_vzeroupper)
> > +
> > +#define SECTION(p)     p##.avx.rtm
> > +#define OVERFLOW_STRCMP        __strcasecmp_l_avx2_rtm
> > +
> > +#include "strncase_l-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> > new file mode 100644
> > index 0000000000..48c0aa21f8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> > @@ -0,0 +1,27 @@
> > +/* strncasecmp_l optimized with AVX2.
> > +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef STRCMP
> > +# define STRCMP        __strncasecmp_l_avx2
> > +#endif
> > +#define USE_AS_STRCASECMP_L
> > +#define USE_AS_STRNCMP
> > +#ifndef OVERFLOW_STRCMP
> > +# define OVERFLOW_STRCMP       __strcasecmp_l_avx2
> > +#endif
> > +#include "strcmp-avx2.S"
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

Conflict resolution patch attached.

--Sunil

[-- Attachment #2: 0015-x86-Add-AVX2-optimized-str-n-casecmp.patch --]
[-- Type: application/octet-stream, Size: 24673 bytes --]

From b382e4caf50dfee62e170f9b6617b470b1289dcb Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 24 Mar 2022 18:56:12 -0500
Subject: [PATCH 15/26] x86: Add AVX2 optimized str{n}casecmp

geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702

All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit bbf81222343fed5cd704001a2ae0d86c71544151)
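
For reference, "geometric_mean(N=40)" is the geometric mean of the 40
per-benchmark runtime ratios (AVX2 time / SSE42 time), i.e. the exp of
the mean of the logs. A minimal sketch of the computation:

    #include <math.h>

    /* Geometric mean of n ratios; ratio[i] = t_avx2[i] / t_sse42[i].  */
    static double
    geometric_mean (const double *ratio, int n)
    {
      double log_sum = 0.0;
      for (int i = 0; i < n; i++)
        log_sum += log (ratio[i]);
      return exp (log_sum / n);
    }
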
---
 sysdeps/x86_64/multiarch/Makefile             |   4 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  28 +++
 sysdeps/x86_64/multiarch/ifunc-strcasecmp.h   |  12 +
 .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S  |  15 ++
 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S  |  23 ++
 sysdeps/x86_64/multiarch/strcmp-avx2.S        | 237 +++++++++++++++---
 .../x86_64/multiarch/strncase_l-avx2-rtm.S    |  16 ++
 sysdeps/x86_64/multiarch/strncase_l-avx2.S    |  27 ++
 8 files changed, 331 insertions(+), 31 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7b413edad..06e1848823 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -55,6 +55,8 @@ sysdep_routines += \
   stpncpy-sse2-unaligned \
   stpncpy-ssse3 \
   strcasecmp_l-avx \
+  strcasecmp_l-avx2 \
+  strcasecmp_l-avx2-rtm \
   strcasecmp_l-sse2 \
   strcasecmp_l-sse4_2 \
   strcasecmp_l-ssse3 \
@@ -93,6 +95,8 @@ sysdep_routines += \
   strlen-evex \
   strlen-sse2 \
   strncase_l-avx \
+  strncase_l-avx2 \
+  strncase_l-avx2-rtm \
   strncase_l-sse2 \
   strncase_l-sse4_2 \
   strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a594f4176e..3c556d07ac 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strcasecmp_avx)
@@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strcasecmp_l_avx)
@@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strncasecmp_avx)
@@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strncasecmp_l_avx)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 9e3cc61ac0..c4de111fd0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+        return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+        return OPTIMIZE (avx2);
+    }
+
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
     return OPTIMIZE (avx);
 
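
The selector above reads, in rough C terms, like the sketch below. The
has()/has_arch() probes and the extern variants are illustrative
stand-ins only; the real code uses glibc's CPU_FEATURE_USABLE_P and
CPU_FEATURES_ARCH_P against struct cpu_features:

    #include <stdbool.h>

    enum feature { AVX, AVX2, RTM };
    enum arch_pref { AVX_Fast_Unaligned_Load, Prefer_No_VZEROUPPER };

    extern bool has (enum feature f);         /* assumed CPUID probe */
    extern bool has_arch (enum arch_pref p);  /* assumed tuning bit */

    typedef int (*cmp_fn) (const char *, const char *);
    extern int strcasecmp_sse2 (const char *, const char *);
    extern int strcasecmp_avx (const char *, const char *);
    extern int strcasecmp_avx2 (const char *, const char *);
    extern int strcasecmp_avx2_rtm (const char *, const char *);

    static cmp_fn
    select_strcasecmp (void)
    {
      if (has (AVX2) && has_arch (AVX_Fast_Unaligned_Load))
        {
          if (has (RTM))
            return strcasecmp_avx2_rtm;   /* RTM-safe return path */
          if (!has_arch (Prefer_No_VZEROUPPER))
            return strcasecmp_avx2;
        }
      if (has (AVX))
        return strcasecmp_avx;
      return strcasecmp_sse2;             /* baseline fallback */
    }

Note that the RTM variant is checked before Prefer_No_VZEROUPPER: its
return path goes through ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST rather
than a bare VZEROUPPER, as set up in the *-rtm.S wrappers below.
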
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
new file mode 100644
index 0000000000..09957fc3c5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
@@ -0,0 +1,15 @@
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
new file mode 100644
index 0000000000..e2762f2a22
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index b94fc5c39a..3366d0b083 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -20,6 +20,10 @@
 
 # include <sysdep.h>
 
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
+
 # ifndef STRCMP
 #  define STRCMP	__strcmp_avx2
 # endif
@@ -74,13 +78,88 @@
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_avx2
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_avx2
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
 # define xmmZERO	xmm15
 # define ymmZERO	ymm15
 
+# define LCASE_MIN_ymm	%ymm10
+# define LCASE_MAX_ymm	%ymm11
+# define CASE_ADD_ymm	%ymm12
+
+# define LCASE_MIN_xmm	%xmm10
+# define LCASE_MAX_xmm	%xmm11
+# define CASE_ADD_xmm	%xmm12
+
+	/* r11 is never used elsewhere, so this is safe to maintain.  */
+# define TOLOWER_BASE	%r11
+
 # ifndef SECTION
 #  define SECTION(p)	p##.avx
 # endif
 
+# ifdef USE_AS_STRCASECMP_L
+#  define REG(x, y) x ## y
+#  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext)			\
+	vpaddb	REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8);				\
+	vpaddb	REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9);				\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpaddb	REG(%ext, 8), reg1_in, reg1_out;							\
+	vpaddb	REG(%ext, 9), reg2_in, reg2_out
+
+#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_ymm(...)	TOLOWER(__VA_ARGS__, ymm)
+#  define TOLOWER_xmm(...)	TOLOWER(__VA_ARGS__, xmm)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext)			\
+	TOLOWER	(s1_reg, scratch_reg, s2_reg, s2_reg, ext);					\
+	VPCMPEQ	scratch_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext)			\
+	VMOVU	s2_mem, reg_out;											\
+	CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+#  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+#  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_ymm(...)
+#  define TOLOWER_xmm(...)
+
+#  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out)			\
+	VPCMPEQ	s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+#  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
            strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -102,8 +181,49 @@
    returned.  */
 
 	.section SECTION(.text), "ax", @progbits
-ENTRY(STRCMP)
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifndef GLABEL
+#  define GLABEL(...)	__VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
+END (GLABEL(STRCASECMP))
+	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
+#  endif
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
 # ifdef USE_AS_STRNCMP
+	/* Don't overwrite LOCALE_REG (rcx) until we have passed
+	   L(one_or_less). Otherwise we might use the wrong locale in
+	   the OVERFLOW_STRCMP (strcasecmp_l).  */
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
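
A loose C paraphrase of the case-insensitive entry sequence above; the
helper names here are hypothetical stand-ins for glibc locale
internals, shown only to make the control flow explicit:

    #include <stdint.h>

    struct locale_model;                       /* stand-in type */

    extern struct locale_model *thread_locale (void);          /* assumed */
    extern int nonascii_case (const struct locale_model *);    /* assumed */
    extern const int32_t *tolower_table (const struct locale_model *);
    extern int fallback_nonascii (const char *, const char *,
                                  struct locale_model *);      /* assumed */

    static int
    entry_model (const char *s1, const char *s2)
    {
      struct locale_model *loc = thread_locale (); /* %fs:__libc_tsd_LOCALE */
      if (nonascii_case (loc))            /* _NL_CTYPE_NONASCII_CASE set */
        return fallback_nonascii (s1, s2, loc);    /* C implementation */
      const int32_t *tab = tolower_table (loc) + 128;  /* TOLOWER_BASE */
      (void) tab;   /* the SIMD path folds case via this table */
      return 0;     /* placeholder for the vector comparison */
    }
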
@@ -128,6 +248,30 @@ ENTRY(STRCMP)
 #  endif
 # endif
 	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa	L(lcase_min)(%rip), LCASE_MIN_ymm
+	vmovdqa	L(lcase_max)(%rip), LCASE_MAX_ymm
+	vmovdqa	L(case_add)(%rip), CASE_ADD_ymm
+# endif
 	movl	%edi, %eax
 	orl	%esi, %eax
 	sall	$20, %eax
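
Given L(lcase_min) = 0x3f..., L(lcase_max) = 0x99... and L(case_add) =
0x20..., the TOLOWER macro's vpaddb/vpcmpgtb/vpandn/vpaddb sequence
reduces, per byte, to the scalar sketch below: adding 0x3f maps
'A'..'Z' (0x41..0x5a) onto 0x80..0x99, exactly the bytes that are NOT
signed-greater than 0x99, and vpandn then keeps the 0x20 addend for
those lanes only.

    /* Scalar model of TOLOWER_ymm/TOLOWER_xmm (ASCII fast path).  */
    static inline unsigned char
    tolower_branchless (unsigned char c)
    {
      unsigned char shifted = (unsigned char) (c + 0x3f); /* L(lcase_min) */
      int keep = (signed char) shifted > (signed char) 0x99; /* vpcmpgtb */
      return (unsigned char) (c + (keep ? 0 : 0x20)); /* vpandn + vpaddb */
    }
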
@@ -138,8 +282,10 @@ ENTRY(STRCMP)
 L(no_page_cross):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	(%rdi), %ymm0
-	/* 1s where s1 and s2 equal.  */
-	VPCMPEQ	(%rsi), %ymm0, %ymm1
+	/* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
+	   Otherwise converts ymm0 and the load from rsi to lowercase;
+	   ymm2 is scratch and ymm1 is the result.  */
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
 	/* 1s at null CHAR.  */
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	/* 1s where s1 and s2 equal AND not null CHAR.  */
@@ -172,6 +318,8 @@ L(return_vec_0):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret0):
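
TOLOWER_gpr above is an int-sized table load, movl (TOLOWER_BASE, src,
4), dst, into the locale tolower table located at entry and biased by
128 entries so small negative indices stay in range. The byte-granular
return path is therefore equivalent to this sketch, with the standard
tolower() standing in for the locale table:

    #include <ctype.h>
    #include <stddef.h>

    /* Model of L(return_vec_0): case-fold the two bytes at the first
       mismatching offset i and return their difference.  */
    static int
    return_byte_diff (const unsigned char *s1, const unsigned char *s2,
                      size_t i)
    {
      return tolower (s1[i]) - tolower (s2[i]);
    }
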
@@ -192,6 +340,10 @@ L(ret_zero):
 
 	.p2align 4,, 5
 L(one_or_less):
+#  ifdef USE_AS_STRCASECMP_L
+	/* Set locale argument for strcasecmp.  */
+	movq	%LOCALE_REG, %rdx
+#  endif
 	jb	L(ret_zero)
 #  ifdef USE_AS_WCSCMP
 	/* 'nbe' covers the case where length is negative (large
@@ -211,6 +363,8 @@ L(one_or_less):
 	jnbe	__strcmp_avx2
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret1):
@@ -238,6 +392,8 @@ L(return_vec_1):
 # else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret2):
@@ -269,6 +425,8 @@ L(return_vec_2):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret3):
@@ -289,6 +447,8 @@ L(return_vec_3):
 #  else
 	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret4):
@@ -299,7 +459,7 @@ L(ret4):
 L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	VEC_SIZE(%rdi), %ymm0
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -312,7 +472,7 @@ L(more_3x_vec):
 # endif
 
 	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -320,7 +480,7 @@ L(more_3x_vec):
 	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -395,12 +555,10 @@ L(loop_skip_page_cross_check):
 	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
 
 	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
-	VPCMPEQ	(VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
-
-	VPCMPEQ	(VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
-
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+	CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
 
 	/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
 	   zero.  */
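
The loop's combine step lets one branch catch both conditions: in the
unchanged context, each CMP_R1_S2_ymm result is AND'ed with the source
bytes (vpand), so a lane becomes zero iff the case-folded bytes differ
or s1 has a null there, and VPMINU funnels all four vectors into a
single test. In scalar terms, one iteration behaves like this sketch:

    #include <ctype.h>
    #include <stddef.h>

    /* One 4*VEC_SIZE block: return the offset of the first lane that
       would be zero (mismatch or NUL), or i + block to keep looping.  */
    static size_t
    loop_block_model (const unsigned char *s1, const unsigned char *s2,
                      size_t i, size_t block)
    {
      for (size_t j = 0; j < block; j++)
        {
          unsigned char a = (unsigned char) tolower (s1[i + j]);
          unsigned char b = (unsigned char) tolower (s2[i + j]);
          if (a != b || a == '\0')
            return i + j;          /* handed off to return_vec_*  */
        }
      return i + block;
    }
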
@@ -469,6 +627,8 @@ L(return_vec_2_3_end):
 # else
 	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
 	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -512,6 +672,8 @@ L(return_vec_0_end):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -534,6 +696,8 @@ L(return_vec_1_end):
 #  else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -560,6 +724,8 @@ L(return_vec_2_end):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -587,7 +753,7 @@ L(page_cross_during_loop):
 	jle	L(less_1x_vec_till_page_cross)
 
 	VMOVA	(%rdi), %ymm0
-	VPCMPEQ	(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -609,7 +775,7 @@ L(less_1x_vec_till_page_cross):
 	   here, it means the previous page (rdi - VEC_SIZE) has already
 	   been loaded earlier so must be valid.  */
 	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
-	VPCMPEQ	-VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
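
The page-cross handling here relies on overlapping, page-safe loads:
when a full vector load from the current offset could touch the next
page, the window is shifted back so the load ends exactly at the
boundary; the re-compared low bytes were already checked, so only the
new high bytes matter. A scalar sketch of the address computation:

    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE 32

    /* If [p, p + VEC_SIZE) would cross a page, back the load up so it
       ends on the boundary (bytes below p are re-checked, safely).  */
    static const unsigned char *
    page_safe_base (const unsigned char *p)
    {
      uintptr_t to_edge = PAGE_SIZE - ((uintptr_t) p & (PAGE_SIZE - 1));
      return to_edge < VEC_SIZE ? p + to_edge - VEC_SIZE : p;
    }
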
@@ -651,6 +817,8 @@ L(return_page_cross_cmp_mem):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -677,7 +845,7 @@ L(more_2x_vec_till_page_cross):
 	   iteration here.  */
 
 	VMOVU	VEC_SIZE(%rdi), %ymm0
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -693,7 +861,7 @@ L(more_2x_vec_till_page_cross):
 
 	/* Safe to include comparisons from lower bytes.  */
 	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
-	VPCMPEQ	-(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -701,7 +869,7 @@ L(more_2x_vec_till_page_cross):
 	jnz	L(return_vec_page_cross_0)
 
 	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
-	VPCMPEQ	-(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -719,8 +887,8 @@ L(more_2x_vec_till_page_cross):
 	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
 	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
 
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
 	vpand	%ymm4, %ymm5, %ymm5
 	vpand	%ymm6, %ymm7, %ymm7
 	VPMINU	%ymm5, %ymm7, %ymm7
@@ -771,6 +939,8 @@ L(return_vec_page_cross_1):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -826,7 +996,7 @@ L(page_cross):
 L(page_cross_loop):
 
 	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -844,11 +1014,11 @@ L(page_cross_loop):
 	subl	%eax, %OFFSET_REG
 	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
 	   to not cross page so is safe to load. Since we have already
-	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
-	 */
+	   loaded at least 1 VEC from rsi it is also guaranteed to be
+	   safe.  */
 
 	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -881,6 +1051,8 @@ L(ret_vec_page_cross_cont):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -934,7 +1106,7 @@ L(less_1x_vec_till_page):
 	ja	L(less_16_till_page)
 
 	VMOVU	(%rdi), %xmm0
-	VPCMPEQ	(%rsi), %xmm0, %xmm1
+	CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
@@ -952,7 +1124,7 @@ L(less_1x_vec_till_page):
 # endif
 
 	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %xmm0, %xmm1
+	CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
@@ -990,7 +1162,7 @@ L(less_16_till_page):
 	vmovq	(%rdi), %xmm0
 	vmovq	(%rsi), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	incb	%cl
@@ -1010,7 +1182,7 @@ L(less_16_till_page):
 	vmovq	(%rdi, %OFFSET_REG64), %xmm0
 	vmovq	(%rsi, %OFFSET_REG64), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	incb	%cl
@@ -1066,7 +1238,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi), %xmm0
 	vmovd	(%rsi), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	subl	$0xf, %ecx
@@ -1085,7 +1257,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi, %OFFSET_REG64), %xmm0
 	vmovd	(%rsi, %OFFSET_REG64), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	subl	$0xf, %ecx
@@ -1119,7 +1291,9 @@ L(less_4_till_page):
 L(less_4_loop):
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi, %rdi), %ecx
-	subl	%ecx, %eax
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
 	jnz	L(ret_less_4_loop)
 	testl	%ecx, %ecx
 	jz	L(ret_zero_4_loop)
@@ -1146,5 +1320,6 @@ L(ret_less_4_loop):
 	subl	%r8d, %eax
 	ret
 # endif
-END(STRCMP)
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
new file mode 100644
index 0000000000..58c05dcfb8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
@@ -0,0 +1,16 @@
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+#define OVERFLOW_STRCMP	__strcasecmp_l_avx2_rtm
+
+#include "strncase_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
new file mode 100644
index 0000000000..48c0aa21f8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
@@ -0,0 +1,27 @@
+/* strncasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP	__strcasecmp_l_avx2
+#endif
+#include "strcmp-avx2.S"
-- 
2.35.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v4 22/23] x86: Add EVEX optimized str{n}casecmp
  2022-05-12 19:47           ` Sunil Pandey
@ 2022-05-12 19:52             ` Sunil Pandey
  0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:52 UTC (permalink / raw)
  To: Noah Goldstein, Libc-stable Mailing List; +Cc: H.J. Lu, GNU C Library

On Thu, May 12, 2022 at 12:47 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Fri, Mar 25, 2022 at 11:20 AM Noah Goldstein via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > On Fri, Mar 25, 2022 at 1:15 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Thu, Mar 24, 2022 at 4:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
> > > >
> > > > All string/memory tests pass.
> > > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > > > ---
> > > >  sysdeps/x86_64/multiarch/Makefile            |   2 +
> > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  16 +
> > > >  sysdeps/x86_64/multiarch/ifunc-strcasecmp.h  |   5 +
> > > >  sysdeps/x86_64/multiarch/strcasecmp_l-evex.S |  23 ++
> > > >  sysdeps/x86_64/multiarch/strcmp-evex.S       | 290 ++++++++++++++++---
> > > >  sysdeps/x86_64/multiarch/strncase_l-evex.S   |  25 ++
> > > >  6 files changed, 321 insertions(+), 40 deletions(-)
> > > >  create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > > >  create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > > index 06e1848823..35d80dc2ff 100644
> > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > @@ -57,6 +57,7 @@ sysdep_routines += \
> > > >    strcasecmp_l-avx \
> > > >    strcasecmp_l-avx2 \
> > > >    strcasecmp_l-avx2-rtm \
> > > > +  strcasecmp_l-evex \
> > > >    strcasecmp_l-sse2 \
> > > >    strcasecmp_l-sse4_2 \
> > > >    strcasecmp_l-ssse3 \
> > > > @@ -97,6 +98,7 @@ sysdep_routines += \
> > > >    strncase_l-avx \
> > > >    strncase_l-avx2 \
> > > >    strncase_l-avx2-rtm \
> > > > +  strncase_l-evex \
> > > >    strncase_l-sse2 \
> > > >    strncase_l-sse4_2 \
> > > >    strncase_l-ssse3 \
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > index 3c556d07ac..f1a4d3dac2 100644
> > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > @@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >
> > > >    /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
> > > >    IFUNC_IMPL (i, name, strcasecmp,
> > > > +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> > > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > > +                              && CPU_FEATURE_USABLE (AVX512BW)),
> > > > +                             __strcasecmp_evex)
> > > >               IFUNC_IMPL_ADD (array, i, strcasecmp,
> > > >                               CPU_FEATURE_USABLE (AVX2),
> > > >                               __strcasecmp_avx2)
> > > > @@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >
> > > >    /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
> > > >    IFUNC_IMPL (i, name, strcasecmp_l,
> > > > +             IFUNC_IMPL_ADD (array, i, strcasecmp,
> > > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > > +                              && CPU_FEATURE_USABLE (AVX512BW)),
> > > > +                             __strcasecmp_l_evex)
> > > >               IFUNC_IMPL_ADD (array, i, strcasecmp,
> > > >                               CPU_FEATURE_USABLE (AVX2),
> > > >                               __strcasecmp_l_avx2)
> > > > @@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >
> > > >    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
> > > >    IFUNC_IMPL (i, name, strncasecmp,
> > > > +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> > > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > > +                              && CPU_FEATURE_USABLE (AVX512BW)),
> > > > +                             __strncasecmp_evex)
> > > >               IFUNC_IMPL_ADD (array, i, strncasecmp,
> > > >                               CPU_FEATURE_USABLE (AVX2),
> > > >                               __strncasecmp_avx2)
> > > > @@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >
> > > >    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
> > > >    IFUNC_IMPL (i, name, strncasecmp_l,
> > > > +             IFUNC_IMPL_ADD (array, i, strncasecmp,
> > > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > > +                              && CPU_FEATURE_USABLE (AVX512BW)),
> > > > +                             __strncasecmp_l_evex)
> > > >               IFUNC_IMPL_ADD (array, i, strncasecmp,
> > > >                               CPU_FEATURE_USABLE (AVX2),
> > > >                               __strncasecmp_l_avx2)
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > > index c4de111fd0..bf0d146e7f 100644
> > > > --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > > @@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > > >
> > > >  static inline void *
> > > >  IFUNC_SELECTOR (void)
> > > > @@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
> > > >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > >        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > >      {
> > > > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > > +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > > > +        return OPTIMIZE (evex);
> > > > +
> > > >        if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > > >          return OPTIMIZE (avx2_rtm);
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > > > new file mode 100644
> > > > index 0000000000..58642db748
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > > > @@ -0,0 +1,23 @@
> > > > +/* strcasecmp_l optimized with EVEX.
> > > > +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#ifndef STRCMP
> > > > +# define STRCMP        __strcasecmp_l_evex
> > > > +#endif
> > > > +#define USE_AS_STRCASECMP_L
> > > > +#include "strcmp-evex.S"
> > > > diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
> > > > index 56d8c118e4..2a5b3ce037 100644
> > > > --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
> > > > +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
> > > > @@ -19,6 +19,9 @@
> > > >  #if IS_IN (libc)
> > > >
> > > >  # include <sysdep.h>
> > > > +# if defined USE_AS_STRCASECMP_L
> > > > +#  include "locale-defines.h"
> > > > +# endif
> > > >
> > > >  # ifndef STRCMP
> > > >  #  define STRCMP       __strcmp_evex
> > > > @@ -34,19 +37,29 @@
> > > >  # define VMOVA vmovdqa64
> > > >
> > > >  # ifdef USE_AS_WCSCMP
> > > > -#  define TESTEQ       subl    $0xff,
> > > > +#  ifndef OVERFLOW_STRCMP
> > > > +#   define OVERFLOW_STRCMP     __wcscmp_evex
> > > > +#  endif
> > > > +
> > > > +#  define TESTEQ       subl $0xff,
> > > >         /* Compare packed dwords.  */
> > > >  #  define VPCMP        vpcmpd
> > > >  #  define VPMINU       vpminud
> > > >  #  define VPTESTM      vptestmd
> > > > +#  define VPTESTNM     vptestnmd
> > > >         /* 1 dword char == 4 bytes.  */
> > > >  #  define SIZE_OF_CHAR 4
> > > >  # else
> > > > +#  ifndef OVERFLOW_STRCMP
> > > > +#   define OVERFLOW_STRCMP     __strcmp_evex
> > > > +#  endif
> > > > +
> > > >  #  define TESTEQ       incl
> > > >         /* Compare packed bytes.  */
> > > >  #  define VPCMP        vpcmpb
> > > >  #  define VPMINU       vpminub
> > > >  #  define VPTESTM      vptestmb
> > > > +#  define VPTESTNM     vptestnmb
> > > >         /* 1 byte char == 1 byte.  */
> > > >  #  define SIZE_OF_CHAR 1
> > > >  # endif
> > > > @@ -73,11 +86,16 @@
> > > >  #  define VEC_OFFSET   (-VEC_SIZE)
> > > >  # endif
> > > >
> > > > -# define XMMZERO       xmm16
> > > >  # define XMM0  xmm17
> > > >  # define XMM1  xmm18
> > > >
> > > > -# define YMMZERO       ymm16
> > > > +# define XMM10 xmm27
> > > > +# define XMM11 xmm28
> > > > +# define XMM12 xmm29
> > > > +# define XMM13 xmm30
> > > > +# define XMM14 xmm31
> > > > +
> > > > +
> > > >  # define YMM0  ymm17
> > > >  # define YMM1  ymm18
> > > >  # define YMM2  ymm19
> > > > @@ -89,6 +107,87 @@
> > > >  # define YMM8  ymm25
> > > >  # define YMM9  ymm26
> > > >  # define YMM10 ymm27
> > > > +# define YMM11 ymm28
> > > > +# define YMM12 ymm29
> > > > +# define YMM13 ymm30
> > > > +# define YMM14 ymm31
> > > > +
> > > > +# ifdef USE_AS_STRCASECMP_L
> > > > +#  define BYTE_LOOP_REG        OFFSET_REG
> > > > +# else
> > > > +#  define BYTE_LOOP_REG        ecx
> > > > +# endif
> > > > +
> > > > +# ifdef USE_AS_STRCASECMP_L
> > > > +#  ifdef USE_AS_STRNCMP
> > > > +#   define STRCASECMP  __strncasecmp_evex
> > > > +#   define LOCALE_REG  rcx
> > > > +#   define LOCALE_REG_LP       RCX_LP
> > > > +#   define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> > > > +#  else
> > > > +#   define STRCASECMP  __strcasecmp_evex
> > > > +#   define LOCALE_REG  rdx
> > > > +#   define LOCALE_REG_LP       RDX_LP
> > > > +#   define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> > > > +#  endif
> > > > +# endif
> > > > +
> > > > +# define LCASE_MIN_YMM %YMM12
> > > > +# define LCASE_MAX_YMM %YMM13
> > > > +# define CASE_ADD_YMM  %YMM14
> > > > +
> > > > +# define LCASE_MIN_XMM %XMM12
> > > > +# define LCASE_MAX_XMM %XMM13
> > > > +# define CASE_ADD_XMM  %XMM14
> > > > +
> > > > +       /* NB: wcsncmp uses r11 but strcasecmp is never used in
> > > > +          conjunction with wcscmp.  */
> > > > +# define TOLOWER_BASE  %r11
> > > > +
> > > > +# ifdef USE_AS_STRCASECMP_L
> > > > +#  define _REG(x, y) x ## y
> > > > +#  define REG(x, y) _REG(x, y)
> > > > +#  define TOLOWER(reg1, reg2, ext)                                                                             \
> > > > +       vpsubb  REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);                                      \
> > > > +       vpsubb  REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);                                      \
> > > > +       vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;                           \
> > > > +       vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;                           \
> > > > +       vpaddb  reg1, REG(CASE_ADD_, ext), reg1{%k5};                                           \
> > > > +       vpaddb  reg2, REG(CASE_ADD_, ext), reg2{%k6}
> > > > +
> > > > +#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> > > > +#  define TOLOWER_YMM(...)     TOLOWER(__VA_ARGS__, YMM)
> > > > +#  define TOLOWER_XMM(...)     TOLOWER(__VA_ARGS__, XMM)
> > > > +
> > > > +#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)                                              \
> > > > +       TOLOWER (s1_reg, s2_reg, ext);                                                                          \
> > > > +       VPCMP   $0, s1_reg, s2_reg, reg_out
> > > > +
> > > > +#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)                              \
> > > > +       VMOVU   s2_mem, s2_reg;                                                                                         \
> > > > +       CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
> > > > +
> > > > +#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
> > > > +#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
> > > > +
> > > > +#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
> > > > +#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
> > > > +
> > > > +# else
> > > > +#  define TOLOWER_gpr(...)
> > > > +#  define TOLOWER_YMM(...)
> > > > +#  define TOLOWER_XMM(...)
> > > > +
> > > > +#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)                                               \
> > > > +       VPCMP   $0, s2_reg, s1_reg, reg_out
> > > > +
> > > > +#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
> > > > +
> > > > +#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)                               \
> > > > +       VPCMP   $0, s2_mem, s1_reg, reg_out
> > > > +
> > > > +#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
> > > > +# endif
> > > >
> > > >  /* Warning!
> > > >             wcscmp/wcsncmp have to use SIGNED comparison for elements.
> > > > @@ -112,8 +211,45 @@
> > > >     returned.  */
> > > >
> > > >         .section .text.evex, "ax", @progbits
> > > > -ENTRY(STRCMP)
> > > > +       .align  16
> > > > +       .type   STRCMP, @function
> > > > +       .globl  STRCMP
> > > > +       .hidden STRCMP
> > > > +
> > > > +# ifdef USE_AS_STRCASECMP_L
> > > > +ENTRY (STRCASECMP)
> > > > +       movq    __libc_tsd_LOCALE@gottpoff(%rip), %rax
> > > > +       mov     %fs:(%rax), %LOCALE_REG_LP
> > > > +
> > > > +       /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> > > > +       .p2align 4
> > > > +END (STRCASECMP)
> > > > +       /* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
> > > > +# endif
> > > > +
> > > > +       .p2align 4
> > > > +STRCMP:
> > > > +       cfi_startproc
> > > > +       _CET_ENDBR
> > > > +       CALL_MCOUNT
> > > > +
> > > > +# if defined USE_AS_STRCASECMP_L
> > > > +       /* We have to fall back on the C implementation for locales with
> > > > +          encodings not matching ASCII for single bytes.  */
> > > > +#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> > > > +       mov     LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> > > > +#  else
> > > > +       mov     (%LOCALE_REG), %RAX_LP
> > > > +#  endif
> > > > +       testl   $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> > > > +       jne     STRCASECMP_NONASCII
> > > > +       leaq    _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> > > > +# endif
> > > > +
> > > >  # ifdef USE_AS_STRNCMP
> > > > +       /* Don't overwrite LOCALE_REG (rcx) until we have passed
> > > > +          L(one_or_less). Otherwise we might use the wrong locale in
> > > > +          the OVERFLOW_STRCMP (strcasecmp_l).  */
> > > >  #  ifdef __ILP32__
> > > >         /* Clear the upper 32 bits.  */
> > > >         movl    %edx, %edx
> > > > @@ -125,6 +261,32 @@ ENTRY(STRCMP)
> > > >            actually bound the buffer.  */
> > > >         jle     L(one_or_less)
> > > >  # endif
> > > > +
> > > > +# if defined USE_AS_STRCASECMP_L
> > > > +       .section .rodata.cst32, "aM", @progbits, 32
> > > > +       .align  32
> > > > +L(lcase_min):
> > > > +       .quad   0x4141414141414141
> > > > +       .quad   0x4141414141414141
> > > > +       .quad   0x4141414141414141
> > > > +       .quad   0x4141414141414141
> > > > +L(lcase_max):
> > > > +       .quad   0x1a1a1a1a1a1a1a1a
> > > > +       .quad   0x1a1a1a1a1a1a1a1a
> > > > +       .quad   0x1a1a1a1a1a1a1a1a
> > > > +       .quad   0x1a1a1a1a1a1a1a1a
> > > > +L(case_add):
> > > > +       .quad   0x2020202020202020
> > > > +       .quad   0x2020202020202020
> > > > +       .quad   0x2020202020202020
> > > > +       .quad   0x2020202020202020
> > > > +       .previous
> > > > +
> > > > +       vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
> > > > +       vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
> > > > +       vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
> > > > +# endif
> > > > +
> > > >         movl    %edi, %eax
> > > >         orl     %esi, %eax
> > > >         /* Shift out the bits irrelevant to page boundary ([63:12]).  */
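
The EVEX case-fold uses different constants from the AVX2 version:
vpsubb by L(lcase_min) = 0x41 maps 'A'..'Z' to 0..25, vpcmpub $1
(unsigned less-than) against L(lcase_max) = 0x1a builds the %k mask,
and the masked vpaddb adds L(case_add) = 0x20 in those lanes only. Per
byte, a scalar sketch:

    /* Scalar model of the EVEX masked TOLOWER above.  */
    static inline unsigned char
    tolower_evex_model (unsigned char c)
    {
      unsigned char idx = (unsigned char) (c - 0x41);  /* vpsubb */
      if (idx < 0x1a)                  /* vpcmpub $1: unsigned '<' 26 */
        c += 0x20;                     /* vpaddb under %k5/%k6 mask */
      return c;
    }
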
> > > > @@ -139,7 +301,7 @@ L(no_page_cross):
> > > >         VPTESTM %YMM0, %YMM0, %k2
> > > >         /* Each bit cleared in K1 represents a mismatch or a null CHAR
> > > >            in YMM0 and 32 bytes at (%rsi).  */
> > > > -       VPCMP   $0, (%rsi), %YMM0, %k1{%k2}
> > > > +       CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >  # ifdef USE_AS_STRNCMP
> > > >         cmpq    $CHAR_PER_VEC, %rdx
> > > > @@ -169,6 +331,8 @@ L(return_vec_0):
> > > >  # else
> > > >         movzbl  (%rdi, %rcx), %eax
> > > >         movzbl  (%rsi, %rcx), %ecx
> > > > +       TOLOWER_gpr (%rax, %eax)
> > > > +       TOLOWER_gpr (%rcx, %ecx)
> > > >         subl    %ecx, %eax
> > > >  # endif
> > > >  L(ret0):
> > > > @@ -188,11 +352,15 @@ L(ret_zero):
> > > >
> > > >         .p2align 4,, 5
> > > >  L(one_or_less):
> > > > +#  ifdef USE_AS_STRCASECMP_L
> > > > +       /* Set locale argument for strcasecmp.  */
> > > > +       movq    %LOCALE_REG, %rdx
> > > > +#  endif
> > > >         jb      L(ret_zero)
> > > > -#  ifdef USE_AS_WCSCMP
> > > >         /* 'nbe' covers the case where length is negative (large
> > > >            unsigned).  */
> > > > -       jnbe    __wcscmp_evex
> > > > +       jnbe    OVERFLOW_STRCMP
> > > > +#  ifdef USE_AS_WCSCMP
> > > >         movl    (%rdi), %edx
> > > >         xorl    %eax, %eax
> > > >         cmpl    (%rsi), %edx
> > > > @@ -201,11 +369,10 @@ L(one_or_less):
> > > >         negl    %eax
> > > >         orl     $1, %eax
> > > >  #  else
> > > > -       /* 'nbe' covers the case where length is negative (large
> > > > -          unsigned).  */
> > > > -       jnbe    __strcmp_evex
> > > >         movzbl  (%rdi), %eax
> > > >         movzbl  (%rsi), %ecx
> > > > +       TOLOWER_gpr (%rax, %eax)
> > > > +       TOLOWER_gpr (%rcx, %ecx)
> > > >         subl    %ecx, %eax
> > > >  #  endif
> > > >  L(ret1):
> > > > @@ -233,6 +400,8 @@ L(return_vec_1):
> > > >  # else
> > > >         movzbl  VEC_SIZE(%rdi, %rcx), %eax
> > > >         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
> > > > +       TOLOWER_gpr (%rax, %eax)
> > > > +       TOLOWER_gpr (%rcx, %ecx)
> > > >         subl    %ecx, %eax
> > > >  # endif
> > > >  L(ret2):
> > > > @@ -270,6 +439,8 @@ L(return_vec_2):
> > > >  # else
> > > >         movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
> > > >         movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> > > > +       TOLOWER_gpr (%rax, %eax)
> > > > +       TOLOWER_gpr (%rcx, %ecx)
> > > >         subl    %ecx, %eax
> > > >  # endif
> > > >  L(ret3):
> > > > @@ -290,6 +461,8 @@ L(return_vec_3):
> > > >  #  else
> > > >         movzbl  (VEC_SIZE * 3)(%rdi, %rcx), %eax
> > > >         movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> > > > +       TOLOWER_gpr (%rax, %eax)
> > > > +       TOLOWER_gpr (%rcx, %ecx)
> > > >         subl    %ecx, %eax
> > > >  #  endif
> > > >  L(ret4):
> > > > @@ -303,7 +476,7 @@ L(more_3x_vec):
> > > >         /* Safe to compare 4x vectors.  */
> > > >         VMOVU   (VEC_SIZE)(%rdi), %YMM0
> > > >         VPTESTM %YMM0, %YMM0, %k2
> > > > -       VPCMP   $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
> > > > +       CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >         TESTEQ  %ecx
> > > >         jnz     L(return_vec_1)
> > > > @@ -315,14 +488,14 @@ L(more_3x_vec):
> > > >
> > > >         VMOVU   (VEC_SIZE * 2)(%rdi), %YMM0
> > > >         VPTESTM %YMM0, %YMM0, %k2
> > > > -       VPCMP   $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
> > > > +       CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >         TESTEQ  %ecx
> > > >         jnz     L(return_vec_2)
> > > >
> > > >         VMOVU   (VEC_SIZE * 3)(%rdi), %YMM0
> > > >         VPTESTM %YMM0, %YMM0, %k2
> > > > -       VPCMP   $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
> > > > +       CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >         TESTEQ  %ecx
> > > >         jnz     L(return_vec_3)
> > > > @@ -381,7 +554,6 @@ L(prepare_loop_aligned):
> > > >         subl    %esi, %eax
> > > >         andl    $(PAGE_SIZE - 1), %eax
> > > >
> > > > -       vpxorq  %YMMZERO, %YMMZERO, %YMMZERO
> > > >
> > > >         /* Loop 4x comparisons at a time.  */
> > > >         .p2align 4
> > > > @@ -413,22 +585,35 @@ L(loop_skip_page_cross_check):
> > > >         /* A zero CHAR in YMM9 means that there is a null CHAR.  */
> > > >         VPMINU  %YMM8, %YMM9, %YMM9
> > > >
> > > > -       /* Each bit set in K1 represents a non-null CHAR in YMM8.  */
> > > > +       /* Each bit set in K1 represents a non-null CHAR in YMM9.  */
> > > >         VPTESTM %YMM9, %YMM9, %k1
> > > > -
> > > > +# ifndef USE_AS_STRCASECMP_L
> > > >         vpxorq  (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
> > > >         vpxorq  (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
> > > >         vpxorq  (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> > > >         /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
> > > >            oring with YMM1. Result is stored in YMM6.  */
> > > >         vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
> > > > -
> > > > +# else
> > > > +       VMOVU   (VEC_SIZE * 0)(%rsi), %YMM1
> > > > +       TOLOWER_YMM (%YMM0, %YMM1)
> > > > +       VMOVU   (VEC_SIZE * 1)(%rsi), %YMM3
> > > > +       TOLOWER_YMM (%YMM2, %YMM3)
> > > > +       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM5
> > > > +       TOLOWER_YMM (%YMM4, %YMM5)
> > > > +       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM7
> > > > +       TOLOWER_YMM (%YMM6, %YMM7)
> > > > +       vpxorq  %YMM0, %YMM1, %YMM1
> > > > +       vpxorq  %YMM2, %YMM3, %YMM3
> > > > +       vpxorq  %YMM4, %YMM5, %YMM5
> > > > +       vpternlogd $0xde, %YMM7, %YMM1, %YMM6
> > > > +# endif
> > > >         /* Or together YMM3, YMM5, and YMM6.  */
> > > >         vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
> > > >
> > > >
> > > >         /* A non-zero CHAR in YMM6 represents a mismatch.  */
> > > > -       VPCMP   $0, %YMMZERO, %YMM6, %k0{%k1}
> > > > +       VPTESTNM %YMM6, %YMM6, %k0{%k1}
> > > >         kmovd   %k0, %LOOP_REG
> > > >
> > > >         TESTEQ  %LOOP_REG
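
The two vpternlogd immediates above encode three-input boolean
functions, with a = the destination's old value and b, c the two
sources: 0xde is b | (a ^ c), matching the "xor ... while oring"
comment, and 0xfe is plain a | b | c. A quick self-check of both truth
tables:

    #include <assert.h>

    int
    main (void)
    {
      /* imm8 bit i of vpternlogd gives f(a,b,c) for the input triple
         (a,b,c) = (bit 2, bit 1, bit 0) of i.  */
      for (int i = 0; i < 8; i++)
        {
          int a = (i >> 2) & 1, b = (i >> 1) & 1, c = i & 1;
          assert (((0xde >> i) & 1) == (b | (a ^ c)));
          assert (((0xfe >> i) & 1) == (a | b | c));
        }
      return 0;
    }
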
> > > > @@ -437,13 +622,13 @@ L(loop_skip_page_cross_check):
> > > >
> > > >         /* Find which VEC has the mismatch of end of string.  */
> > > >         VPTESTM %YMM0, %YMM0, %k1
> > > > -       VPCMP   $0, %YMMZERO, %YMM1, %k0{%k1}
> > > > +       VPTESTNM %YMM1, %YMM1, %k0{%k1}
> > > >         kmovd   %k0, %ecx
> > > >         TESTEQ  %ecx
> > > >         jnz     L(return_vec_0_end)
> > > >
> > > >         VPTESTM %YMM2, %YMM2, %k1
> > > > -       VPCMP   $0, %YMMZERO, %YMM3, %k0{%k1}
> > > > +       VPTESTNM %YMM3, %YMM3, %k0{%k1}
> > > >         kmovd   %k0, %ecx
> > > >         TESTEQ  %ecx
> > > >         jnz     L(return_vec_1_end)
> > > > @@ -457,7 +642,7 @@ L(return_vec_2_3_end):
> > > >  # endif
> > > >
> > > >         VPTESTM %YMM4, %YMM4, %k1
> > > > -       VPCMP   $0, %YMMZERO, %YMM5, %k0{%k1}
> > > > +       VPTESTNM %YMM5, %YMM5, %k0{%k1}
> > > >         kmovd   %k0, %ecx
> > > >         TESTEQ  %ecx
> > > >  # if CHAR_PER_VEC <= 16
> > > > @@ -493,6 +678,8 @@ L(return_vec_3_end):
> > > >  # else
> > > >         movzbl  (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
> > > >         movzbl  (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
> > > > +       TOLOWER_gpr (%rax, %eax)
> > > > +       TOLOWER_gpr (%rcx, %ecx)
> > > >         subl    %ecx, %eax
> > > >         xorl    %r8d, %eax
> > > >         subl    %r8d, %eax
> > > > @@ -545,6 +732,8 @@ L(return_vec_0_end):
> > > >  # else
> > > >         movzbl  (%rdi, %rcx), %eax
> > > >         movzbl  (%rsi, %rcx), %ecx
> > > > +       TOLOWER_gpr (%rax, %eax)
> > > > +       TOLOWER_gpr (%rcx, %ecx)
> > > >         subl    %ecx, %eax
> > > >         /* Flip `eax` if `rdi` and `rsi` were swapped in page cross
> > > >            logic. Subtract `r8d` after xor for zero case.  */
> > > > @@ -569,6 +758,8 @@ L(return_vec_1_end):
> > > >  #  else
> > > >         movzbl  VEC_SIZE(%rdi, %rcx), %eax
> > > >         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
> > > > +       TOLOWER_gpr (%rax, %eax)
> > > > +       TOLOWER_gpr (%rcx, %ecx)
> > > >         subl    %ecx, %eax
> > > >         xorl    %r8d, %eax
> > > >         subl    %r8d, %eax
> > > > @@ -598,7 +789,7 @@ L(page_cross_during_loop):
> > > >
> > > >         VMOVA   (%rdi), %YMM0
> > > >         VPTESTM %YMM0, %YMM0, %k2
> > > > -       VPCMP   $0, (%rsi), %YMM0, %k1{%k2}
> > > > +       CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >         TESTEQ  %ecx
> > > >         jnz     L(return_vec_0_end)
> > > > @@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross):
> > > >            been loaded earlier so must be valid.  */
> > > >         VMOVU   -VEC_SIZE(%rdi, %rax), %YMM0
> > > >         VPTESTM %YMM0, %YMM0, %k2
> > > > -       VPCMP   $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
> > > > -
> > > > +       CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
> > > >         /* Mask of potentially valid bits. The lower bits can be out of
> > > >            range comparisons (but safe regarding page crosses).  */
> > > >
> > > > @@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross):
> > > >
> > > >  # ifdef USE_AS_STRNCMP
> > > >  #  ifdef USE_AS_WCSCMP
> > > > +       /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> > > > +          safe.  */
> > > >         movl    %eax, %r11d
> > > >         shrl    $2, %r11d
> > > >         cmpq    %r11, %rdx
> > > > @@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem):
> > > >  # else
> > > >         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
> > > >         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
> > > > +       TOLOWER_gpr (%rax, %eax)
> > > > +       TOLOWER_gpr (%rcx, %ecx)
> > > >         subl    %ecx, %eax
> > > >         xorl    %r8d, %eax
> > > >         subl    %r8d, %eax
> > > > @@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross):
> > > >
> > > >         VMOVA   VEC_SIZE(%rdi), %YMM0
> > > >         VPTESTM %YMM0, %YMM0, %k2
> > > > -       VPCMP   $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
> > > > +       CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >         TESTEQ  %ecx
> > > >         jnz     L(return_vec_1_end)
> > > > @@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross):
> > > >         /* Safe to include comparisons from lower bytes.  */
> > > >         VMOVU   -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
> > > >         VPTESTM %YMM0, %YMM0, %k2
> > > > -       VPCMP   $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
> > > > +       CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >         TESTEQ  %ecx
> > > >         jnz     L(return_vec_page_cross_0)
> > > >
> > > >         VMOVU   -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
> > > >         VPTESTM %YMM0, %YMM0, %k2
> > > > -       VPCMP   $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
> > > > +       CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >         TESTEQ  %ecx
> > > >         jnz     L(return_vec_page_cross_1)
> > > > @@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross):
> > > >         /* Must check length here as length might preclude reading next
> > > >            page.  */
> > > >  #  ifdef USE_AS_WCSCMP
> > > > +       /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> > > > +          safe.  */
> > > >         movl    %eax, %r11d
> > > >         shrl    $2, %r11d
> > > >         cmpq    %r11, %rdx
> > > > @@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross):
> > > >         VMOVA   (VEC_SIZE * 3)(%rdi), %YMM6
> > > >         VPMINU  %YMM4, %YMM6, %YMM9
> > > >         VPTESTM %YMM9, %YMM9, %k1
> > > > -
> > > > +# ifndef USE_AS_STRCASECMP_L
> > > >         vpxorq  (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> > > >         /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
> > > >         vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
> > > > -
> > > > -       VPCMP   $0, %YMMZERO, %YMM6, %k0{%k1}
> > > > +# else
> > > > +       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM5
> > > > +       TOLOWER_YMM (%YMM4, %YMM5)
> > > > +       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM7
> > > > +       TOLOWER_YMM (%YMM6, %YMM7)
> > > > +       vpxorq  %YMM4, %YMM5, %YMM5
> > > > +       vpternlogd $0xde, %YMM7, %YMM5, %YMM6
> > > > +# endif
> > > > +       VPTESTNM %YMM6, %YMM6, %k0{%k1}
> > > >         kmovd   %k0, %LOOP_REG
> > > >         TESTEQ  %LOOP_REG
> > > >         jnz     L(return_vec_2_3_end)
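A note on the vpternlogd $0xde step above: the immediate encodes the three-operand boolean b | (c ^ a), which ORs the two xor-difference vectors into one so that a single VPTESTNM can check both blocks for equality at once. A one-line C model (names are mine):

#include <stdint.h>

/* vpternlogd $0xde, c, b, a  computes  a = b | (c ^ a)  bitwise.
   With the operands holding the xors of the two s1/s2 block pairs,
   the result is zero exactly where both pairs match.  */
static uint64_t
ternlog_0xde (uint64_t a, uint64_t b, uint64_t c)
{
  return b | (c ^ a);
}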
> > > > @@ -815,6 +1018,8 @@ L(return_vec_page_cross_1):
> > > >  # else
> > > >         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
> > > >         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
> > > > +       TOLOWER_gpr (%rax, %eax)
> > > > +       TOLOWER_gpr (%rcx, %ecx)
> > > >         subl    %ecx, %eax
> > > >         xorl    %r8d, %eax
> > > >         subl    %r8d, %eax
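The xorl/subl pair that closes these return paths is a branchless conditional negate: as I read the code, %r8d is zero on the straight path and all-ones when the page-cross setup swapped the source operands, so the sign of the result is flipped back. A C model of the trick (variable names invented):

/* (x ^ 0) - 0 == x;  (x ^ -1) - (-1) == ~x + 1 == -x.  */
static int
maybe_negate (int diff, int swap_mask /* 0 normally, -1 if the
                                         operands were exchanged */)
{
  return (diff ^ swap_mask) - swap_mask;
}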
> > > > @@ -871,7 +1076,7 @@ L(page_cross):
> > > >  L(page_cross_loop):
> > > >         VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> > > >         VPTESTM %YMM0, %YMM0, %k2
> > > > -       VPCMP   $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> > > > +       CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >         TESTEQ  %ecx
> > > >         jnz     L(check_ret_vec_page_cross)
> > > > @@ -895,7 +1100,7 @@ L(page_cross_loop):
> > > >          */
> > > >         VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> > > >         VPTESTM %YMM0, %YMM0, %k2
> > > > -       VPCMP   $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> > > > +       CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> > > >
> > > >         kmovd   %k1, %ecx
> > > >  # ifdef USE_AS_STRNCMP
> > > > @@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont):
> > > >  # else
> > > >         movzbl  (%rdi, %rcx, SIZE_OF_CHAR), %eax
> > > >         movzbl  (%rsi, %rcx, SIZE_OF_CHAR), %ecx
> > > > +       TOLOWER_gpr (%rax, %eax)
> > > > +       TOLOWER_gpr (%rcx, %ecx)
> > > >         subl    %ecx, %eax
> > > >         xorl    %r8d, %eax
> > > >         subl    %r8d, %eax
> > > > @@ -989,7 +1196,7 @@ L(less_1x_vec_till_page):
> > > >         /* Use 16 byte comparison.  */
> > > >         vmovdqu (%rdi), %xmm0
> > > >         VPTESTM %xmm0, %xmm0, %k2
> > > > -       VPCMP   $0, (%rsi), %xmm0, %k1{%k2}
> > > > +       CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >  # ifdef USE_AS_WCSCMP
> > > >         subl    $0xf, %ecx
> > > > @@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page):
> > > >  # endif
> > > >         vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > > >         VPTESTM %xmm0, %xmm0, %k2
> > > > -       VPCMP   $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
> > > > +       CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >  # ifdef USE_AS_WCSCMP
> > > >         subl    $0xf, %ecx
> > > > @@ -1048,7 +1255,7 @@ L(less_16_till_page):
> > > >         vmovq   (%rdi), %xmm0
> > > >         vmovq   (%rsi), %xmm1
> > > >         VPTESTM %xmm0, %xmm0, %k2
> > > > -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> > > > +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >  # ifdef USE_AS_WCSCMP
> > > >         subl    $0x3, %ecx
> > > > @@ -1068,7 +1275,7 @@ L(less_16_till_page):
> > > >         vmovq   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > > >         vmovq   (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> > > >         VPTESTM %xmm0, %xmm0, %k2
> > > > -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> > > > +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >  # ifdef USE_AS_WCSCMP
> > > >         subl    $0x3, %ecx
> > > > @@ -1128,7 +1335,7 @@ L(ret_less_8_wcs):
> > > >         vmovd   (%rdi), %xmm0
> > > >         vmovd   (%rsi), %xmm1
> > > >         VPTESTM %xmm0, %xmm0, %k2
> > > > -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> > > > +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >         subl    $0xf, %ecx
> > > >         jnz     L(check_ret_vec_page_cross)
> > > > @@ -1143,7 +1350,7 @@ L(ret_less_8_wcs):
> > > >         vmovd   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > > >         vmovd   (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> > > >         VPTESTM %xmm0, %xmm0, %k2
> > > > -       VPCMP   $0, %xmm1, %xmm0, %k1{%k2}
> > > > +       CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > > >         kmovd   %k1, %ecx
> > > >         subl    $0xf, %ecx
> > > >         jnz     L(check_ret_vec_page_cross)
> > > > @@ -1176,7 +1383,9 @@ L(less_4_till_page):
> > > >  L(less_4_loop):
> > > >         movzbl  (%rdi), %eax
> > > >         movzbl  (%rsi, %rdi), %ecx
> > > > -       subl    %ecx, %eax
> > > > +       TOLOWER_gpr (%rax, %eax)
> > > > +       TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> > > > +       subl    %BYTE_LOOP_REG, %eax
> > > >         jnz     L(ret_less_4_loop)
> > > >         testl   %ecx, %ecx
> > > >         jz      L(ret_zero_4_loop)
> > > > @@ -1203,5 +1412,6 @@ L(ret_less_4_loop):
> > > >         subl    %r8d, %eax
> > > >         ret
> > > >  # endif
> > > > -END(STRCMP)
> > > > +       cfi_endproc
> > > > +       .size   STRCMP, .-STRCMP
> > > >  #endif
> > > > diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> > > > new file mode 100644
> > > > index 0000000000..8a5af3695c
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> > > > @@ -0,0 +1,25 @@
> > > > +/* strncasecmp_l optimized with EVEX.
> > > > +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#ifndef STRCMP
> > > > +# define STRCMP        __strncasecmp_l_evex
> > > > +#endif
> > > > +#define OVERFLOW_STRCMP        __strcasecmp_l_evex
> > > > +#define USE_AS_STRCASECMP_L
> > > > +#define USE_AS_STRNCMP
> > > > +#include "strcmp-evex.S"
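The new file is the standard glibc multiarch skeleton: a handful of macro definitions followed by an include of the shared body. A toy C rendering of the same shape, with made-up names, purely for illustration:

/* cmp-body.h: hypothetical shared implementation, parameterized by
   macros the including file sets first (mirroring STRCMP,
   USE_AS_STRCASECMP_L and USE_AS_STRNCMP above).  */
#ifndef CMP_NAME
# define CMP_NAME cmp_default
#endif
int
CMP_NAME (const char *a, const char *b)
{
#ifdef USE_CASEFOLD
  /* The case-insensitive variant would fold each byte here.  */
#endif
  while (*a != '\0' && *a == *b)
    a++, b++;
  return (unsigned char) *a - (unsigned char) *b;
}

/* A wrapper translation unit then reads, in full:
     #define CMP_NAME my_casecmp
     #define USE_CASEFOLD
     #include "cmp-body.h"
   which is exactly what strncase_l-evex.S does with strcmp-evex.S.  */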
> > > > --
> > > > 2.25.1
> > > >
> > >
> > > LGTM.
> > >
> > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> >
> > Thanks, pushed the patchset.
> > >
> > > Thanks.
> > >
> > > --
> > > H.J.
>
> I would like to backport this patch to release branches.
> Any comments or objections?
>
> Conflict resolution patch attached.
>
> --Sunil

Please ignore, this patch doesn't have any conflict.

--Sunil

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v1 23/23] x86: Remove AVX str{n}casecmp
  2022-03-24 19:04   ` H.J. Lu
@ 2022-05-12 19:54     ` Sunil Pandey
  0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:54 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

On Thu, Mar 24, 2022 at 12:09 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:03 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The rationale is:
> >
> > 1. SSE42 has nearly identical logic, so any benefit is minimal (3.4%
> >    regression on Tigerlake using SSE42 versus AVX across the
> >    benchtest suite).
> > 2. The AVX2 version covers the majority of targets that previously
> >    preferred it.
> > 3. The targets where AVX would still be best (SnB and IVB) are
> >    becoming outdated.
> >
> > All in all, the code-size saving is worth it.
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=40 runs; All functions page aligned
> > length, align1, align2, max_char, AVX Time / SSE42 Time
> >      1,      1,      1,      127,                 0.928
> >      2,      2,      2,      127,                 0.934
> >      3,      3,      3,      127,                 0.975
> >      4,      4,      4,      127,                  0.96
> >      5,      5,      5,      127,                 0.935
> >      6,      6,      6,      127,                 0.929
> >      7,      7,      7,      127,                 0.959
> >      8,      0,      0,      127,                 0.955
> >      9,      1,      1,      127,                 0.944
> >     10,      2,      2,      127,                 0.975
> >     11,      3,      3,      127,                 0.935
> >     12,      4,      4,      127,                 0.931
> >     13,      5,      5,      127,                 0.926
> >     14,      6,      6,      127,                 0.901
> >     15,      7,      7,      127,                 0.951
> >      4,      0,      0,      127,                 0.958
> >      4,      0,      0,      254,                 0.956
> >      8,      0,      0,      254,                 0.977
> >     16,      0,      0,      127,                 0.955
> >     16,      0,      0,      254,                 0.953
> >     32,      0,      0,      127,                 0.943
> >     32,      0,      0,      254,                 0.941
> >     64,      0,      0,      127,                 0.941
> >     64,      0,      0,      254,                 0.955
> >    128,      0,      0,      127,                 0.972
> >    128,      0,      0,      254,                 0.975
> >    256,      0,      0,      127,                 0.996
> >    256,      0,      0,      254,                 0.993
> >    512,      0,      0,      127,                 0.992
> >    512,      0,      0,      254,                 0.986
> >   1024,      0,      0,      127,                 0.994
> >   1024,      0,      0,      254,                 0.993
> >     16,      1,      2,      127,                 0.933
> >     16,      2,      1,      254,                 0.953
> >     32,      2,      4,      127,                 0.927
> >     32,      4,      2,      254,                 0.986
> >     64,      3,      6,      127,                 0.991
> >     64,      6,      3,      254,                 1.014
> >    128,      4,      0,      127,                 1.001
> >    128,      0,      4,      254,                 0.991
> >    256,      5,      2,      127,                 1.011
> >    256,      2,      5,      254,                 1.013
> >    512,      6,      4,      127,                 1.056
> >    512,      4,      6,      254,                 0.916
> >   1024,      7,      6,      127,                 1.059
> >   1024,      6,      7,      254,                 1.043
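On reading the table: each row is an AVX-time/SSE42-time ratio for one (length, align1, align2, max_char) configuration, and the headline number is a geometric mean over N=40 runs, the natural aggregate for ratios. A self-contained C sketch of that aggregation (glibc's real reporting lives in the benchtests scripts and may differ):

#include <math.h>
#include <stdio.h>

/* Geometric mean: exp of the arithmetic mean of logs.  Unlike the
   plain mean, it treats a 0.5x and a 2.0x ratio as cancelling out.
   Build with -lm.  */
static double
geomean (const double *r, int n)
{
  double s = 0.0;
  for (int i = 0; i < n; i++)
    s += log (r[i]);
  return exp (s / n);
}

int
main (void)
{
  /* First rows of the table above.  */
  double r[] = { 0.928, 0.934, 0.975, 0.96, 0.935, 0.929 };
  printf ("%.3f\n", geomean (r, 6));
  return 0;
}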
> >
> >  sysdeps/x86_64/multiarch/Makefile           |   2 -
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c  |  12 -
> >  sysdeps/x86_64/multiarch/ifunc-strcasecmp.h |   4 -
> >  sysdeps/x86_64/multiarch/strcasecmp_l-avx.S |  22 --
> >  sysdeps/x86_64/multiarch/strcmp-sse42.S     | 240 +++++++++-----------
> >  sysdeps/x86_64/multiarch/strncase_l-avx.S   |  22 --
> >  6 files changed, 105 insertions(+), 197 deletions(-)
> >  delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
> >  delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index 35d80dc2ff..6507d1b7fa 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -54,7 +54,6 @@ sysdep_routines += \
> >    stpncpy-evex \
> >    stpncpy-sse2-unaligned \
> >    stpncpy-ssse3 \
> > -  strcasecmp_l-avx \
> >    strcasecmp_l-avx2 \
> >    strcasecmp_l-avx2-rtm \
> >    strcasecmp_l-evex \
> > @@ -95,7 +94,6 @@ sysdep_routines += \
> >    strlen-avx2-rtm \
> >    strlen-evex \
> >    strlen-sse2 \
> > -  strncase_l-avx \
> >    strncase_l-avx2 \
> >    strncase_l-avx2-rtm \
> >    strncase_l-evex \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index f1a4d3dac2..40cc6cc49e 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -447,9 +447,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                               (CPU_FEATURE_USABLE (AVX2)
> >                                && CPU_FEATURE_USABLE (RTM)),
> >                               __strcasecmp_avx2_rtm)
> > -             IFUNC_IMPL_ADD (array, i, strcasecmp,
> > -                             CPU_FEATURE_USABLE (AVX),
> > -                             __strcasecmp_avx)
> >               IFUNC_IMPL_ADD (array, i, strcasecmp,
> >                               CPU_FEATURE_USABLE (SSE4_2),
> >                               __strcasecmp_sse42)
> > @@ -471,9 +468,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                               (CPU_FEATURE_USABLE (AVX2)
> >                                && CPU_FEATURE_USABLE (RTM)),
> >                               __strcasecmp_l_avx2_rtm)
> > -             IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> > -                             CPU_FEATURE_USABLE (AVX),
> > -                             __strcasecmp_l_avx)
> >               IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> >                               CPU_FEATURE_USABLE (SSE4_2),
> >                               __strcasecmp_l_sse42)
> > @@ -609,9 +603,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                               (CPU_FEATURE_USABLE (AVX2)
> >                                && CPU_FEATURE_USABLE (RTM)),
> >                               __strncasecmp_avx2_rtm)
> > -             IFUNC_IMPL_ADD (array, i, strncasecmp,
> > -                             CPU_FEATURE_USABLE (AVX),
> > -                             __strncasecmp_avx)
> >               IFUNC_IMPL_ADD (array, i, strncasecmp,
> >                               CPU_FEATURE_USABLE (SSE4_2),
> >                               __strncasecmp_sse42)
> > @@ -634,9 +625,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                               (CPU_FEATURE_USABLE (AVX2)
> >                                && CPU_FEATURE_USABLE (RTM)),
> >                               __strncasecmp_l_avx2_rtm)
> > -             IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> > -                             CPU_FEATURE_USABLE (AVX),
> > -                             __strncasecmp_l_avx)
> >               IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> >                               CPU_FEATURE_USABLE (SSE4_2),
> >                               __strncasecmp_l_sse42)
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > index bf0d146e7f..766539c241 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > @@ -22,7 +22,6 @@
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > @@ -46,9 +45,6 @@ IFUNC_SELECTOR (void)
> >          return OPTIMIZE (avx2);
> >      }
> >
> > -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > -    return OPTIMIZE (avx);
> > -
> >    if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
> >        && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
> >      return OPTIMIZE (sse42);
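Net effect on dispatch: with the AVX tier deleted, the selector falls straight from the AVX2/EVEX family to SSE4.2. A self-contained sketch of the resulting order (plain booleans stand in for CPU_FEATURE_USABLE_P, and the EVEX/RTM refinements inside the AVX2 branch are elided):

#include <stdio.h>

struct features { int avx2, sse4_2, slow_sse4_2, ssse3; };

static const char *
pick_strcasecmp (struct features f)
{
  if (f.avx2)
    return "avx2 family (evex / avx2_rtm / avx2)";
  /* The AVX-only branch removed by this patch sat here.  */
  if (f.sse4_2 && !f.slow_sse4_2)
    return "sse42";
  if (f.ssse3)
    return "ssse3";
  return "sse2";
}

int
main (void)
{
  /* Sandy/Ivy Bridge: AVX but no AVX2 -- now served by SSE4.2.  */
  struct features snb = { 0, 1, 0, 1 };
  printf ("%s\n", pick_strcasecmp (snb));
  return 0;
}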
> > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
> > deleted file mode 100644
> > index 7ec7c21b5a..0000000000
> > --- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
> > +++ /dev/null
> > @@ -1,22 +0,0 @@
> > -/* strcasecmp_l optimized with AVX.
> > -   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -#define STRCMP_SSE42 __strcasecmp_l_avx
> > -#define USE_AVX 1
> > -#define USE_AS_STRCASECMP_L
> > -#include "strcmp-sse42.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > index 7805ae9d41..a9178ad25c 100644
> > --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > @@ -41,13 +41,8 @@
> >  # define UPDATE_STRNCMP_COUNTER
> >  #endif
> >
> > -#ifdef USE_AVX
> > -# define SECTION       avx
> > -# define GLABEL(l)     l##_avx
> > -#else
> > -# define SECTION       sse4.2
> > -# define GLABEL(l)     l##_sse42
> > -#endif
> > +#define SECTION        sse4.2
> > +#define GLABEL(l)      l##_sse42
> >
> >  #define LABEL(l)       .L##l
> >
> > @@ -105,21 +100,7 @@ END (GLABEL(__strncasecmp))
> >  #endif
> >
> >
> > -#ifdef USE_AVX
> > -# define movdqa vmovdqa
> > -# define movdqu vmovdqu
> > -# define pmovmskb vpmovmskb
> > -# define pcmpistri vpcmpistri
> > -# define psubb vpsubb
> > -# define pcmpeqb vpcmpeqb
> > -# define psrldq vpsrldq
> > -# define pslldq vpslldq
> > -# define palignr vpalignr
> > -# define pxor vpxor
> > -# define D(arg) arg, arg
> > -#else
> > -# define D(arg) arg
> > -#endif
> > +#define arg arg
> >
> >  STRCMP_SSE42:
> >         cfi_startproc
> > @@ -191,18 +172,7 @@ LABEL(case_add):
> >         movdqu  (%rdi), %xmm1
> >         movdqu  (%rsi), %xmm2
> >  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> > -# ifdef USE_AVX
> > -#  define TOLOWER(reg1, reg2) \
> > -       vpaddb  LCASE_MIN_reg, reg1, %xmm7;                                     \
> > -       vpaddb  LCASE_MIN_reg, reg2, %xmm8;                                     \
> > -       vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7;                                   \
> > -       vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8;                                   \
> > -       vpandn  CASE_ADD_reg, %xmm7, %xmm7;                                     \
> > -       vpandn  CASE_ADD_reg, %xmm8, %xmm8;                                     \
> > -       vpaddb  %xmm7, reg1, reg1;                                      \
> > -       vpaddb  %xmm8, reg2, reg2
> > -# else
> > -#  define TOLOWER(reg1, reg2) \
> > +# define TOLOWER(reg1, reg2) \
> >         movdqa  LCASE_MIN_reg, %xmm7;                                   \
> >         movdqa  LCASE_MIN_reg, %xmm8;                                   \
> >         paddb   reg1, %xmm7;                                    \
> > @@ -213,15 +183,15 @@ LABEL(case_add):
> >         pandn   CASE_ADD_reg, %xmm8;                                    \
> >         paddb   %xmm7, reg1;                                    \
> >         paddb   %xmm8, reg2
> > -# endif
> > +
> >         TOLOWER (%xmm1, %xmm2)
> >  #else
> >  # define TOLOWER(reg1, reg2)
> >  #endif
> > -       pxor    %xmm0, D(%xmm0)         /* clear %xmm0 for null char checks */
> > -       pcmpeqb %xmm1, D(%xmm0)         /* Any null chars? */
> > -       pcmpeqb %xmm2, D(%xmm1)         /* compare first 16 bytes for equality */
> > -       psubb   %xmm0, D(%xmm1)         /* packed sub of comparison results*/
> > +       pxor    %xmm0, %xmm0            /* clear %xmm0 for null char checks */
> > +       pcmpeqb %xmm1, %xmm0            /* Any null chars? */
> > +       pcmpeqb %xmm2, %xmm1            /* compare first 16 bytes for equality */
> > +       psubb   %xmm0, %xmm1            /* packed sub of comparison results*/
> >         pmovmskb %xmm1, %edx
> >         sub     $0xffff, %edx           /* if first 16 bytes are same, edx == 0xffff */
> >         jnz     LABEL(less16bytes)/* If not, find different value or null char */
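The TOLOWER macro above is the classic branchless SIMD case fold: bias each byte so 'A'..'Z' lands at the bottom of the signed range, do one signed compare, and mask in the 0x20 case bit. A scalar C model of one lane follows; the LCASE_MIN/LCASE_MAX/CASE_ADD constants live in rodata outside this hunk, so the values below are my reconstruction:

#include <stdint.h>

/* Assumed constants: LCASE_MIN = 0x80 - 'A' = 0x3f (biases 'A' to the
   signed minimum -128), LCASE_MAX = 0x3f + 'Z' = 0x99 (-103 signed),
   CASE_ADD = 'a' - 'A' = 0x20.  */
static uint8_t
tolower_lane (uint8_t c)
{
  int8_t biased = (int8_t) (c + 0x3f);                 /* paddb   LCASE_MIN_reg */
  uint8_t gt = biased > (int8_t) 0x99 ? 0xff : 0x00;   /* pcmpgtb LCASE_MAX_reg */
  uint8_t add = (uint8_t) ~gt & 0x20;                  /* pandn   CASE_ADD_reg  */
  return c + add;                                      /* paddb                 */
}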
> > @@ -245,7 +215,7 @@ LABEL(crosscache):
> >         xor     %r8d, %r8d
> >         and     $0xf, %ecx              /* offset of rsi */
> >         and     $0xf, %eax              /* offset of rdi */
> > -       pxor    %xmm0, D(%xmm0)         /* clear %xmm0 for null char check */
> > +       pxor    %xmm0, %xmm0            /* clear %xmm0 for null char check */
> >         cmp     %eax, %ecx
> >         je      LABEL(ashr_0)           /* rsi and rdi relative offset same */
> >         ja      LABEL(bigger)
> > @@ -259,7 +229,7 @@ LABEL(bigger):
> >         sub     %rcx, %r9
> >         lea     LABEL(unaligned_table)(%rip), %r10
> >         movslq  (%r10, %r9,4), %r9
> > -       pcmpeqb %xmm1, D(%xmm0)         /* Any null chars? */
> > +       pcmpeqb %xmm1, %xmm0            /* Any null chars? */
> >         lea     (%r10, %r9), %r10
> >         _CET_NOTRACK jmp *%r10          /* jump to corresponding case */
> >
> > @@ -272,15 +242,15 @@ LABEL(bigger):
> >  LABEL(ashr_0):
> >
> >         movdqa  (%rsi), %xmm1
> > -       pcmpeqb %xmm1, D(%xmm0)         /* Any null chars? */
> > +       pcmpeqb %xmm1, %xmm0            /* Any null chars? */
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > -       pcmpeqb (%rdi), D(%xmm1)        /* compare 16 bytes for equality */
> > +       pcmpeqb (%rdi), %xmm1           /* compare 16 bytes for equality */
> >  #else
> >         movdqa  (%rdi), %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm2, D(%xmm1)         /* compare 16 bytes for equality */
> > +       pcmpeqb %xmm2, %xmm1            /* compare 16 bytes for equality */
> >  #endif
> > -       psubb   %xmm0, D(%xmm1)         /* packed sub of comparison results*/
> > +       psubb   %xmm0, %xmm1            /* packed sub of comparison results*/
> >         pmovmskb %xmm1, %r9d
> >         shr     %cl, %edx               /* adjust 0xffff for offset */
> >         shr     %cl, %r9d               /* adjust for 16-byte offset */
> > @@ -360,10 +330,10 @@ LABEL(ashr_0_exit_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_1):
> > -       pslldq  $15, D(%xmm2)           /* shift first string to align with second */
> > +       pslldq  $15, %xmm2              /* shift first string to align with second */
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)         /* compare 16 bytes for equality */
> > -       psubb   %xmm0, D(%xmm2)         /* packed sub of comparison results*/
> > +       pcmpeqb %xmm1, %xmm2            /* compare 16 bytes for equality */
> > +       psubb   %xmm0, %xmm2            /* packed sub of comparison results*/
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx               /* adjust 0xffff for offset */
> >         shr     %cl, %r9d               /* adjust for 16-byte offset */
> > @@ -391,7 +361,7 @@ LABEL(loop_ashr_1_use):
> >
> >  LABEL(nibble_ashr_1_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $1, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $1, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
> >  #else
> > @@ -410,7 +380,7 @@ LABEL(nibble_ashr_1_restart_use):
> >         jg      LABEL(nibble_ashr_1_use)
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $1, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $1, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
> >  #else
> > @@ -430,7 +400,7 @@ LABEL(nibble_ashr_1_restart_use):
> >  LABEL(nibble_ashr_1_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $1, D(%xmm0)
> > +       psrldq  $1, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > @@ -448,10 +418,10 @@ LABEL(nibble_ashr_1_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_2):
> > -       pslldq  $14, D(%xmm2)
> > +       pslldq  $14, %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)
> > -       psubb   %xmm0, D(%xmm2)
> > +       pcmpeqb %xmm1, %xmm2
> > +       psubb   %xmm0, %xmm2
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx
> >         shr     %cl, %r9d
> > @@ -479,7 +449,7 @@ LABEL(loop_ashr_2_use):
> >
> >  LABEL(nibble_ashr_2_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $2, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $2, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
> >  #else
> > @@ -498,7 +468,7 @@ LABEL(nibble_ashr_2_restart_use):
> >         jg      LABEL(nibble_ashr_2_use)
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $2, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $2, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
> >  #else
> > @@ -518,7 +488,7 @@ LABEL(nibble_ashr_2_restart_use):
> >  LABEL(nibble_ashr_2_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $2, D(%xmm0)
> > +       psrldq  $2, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > @@ -536,10 +506,10 @@ LABEL(nibble_ashr_2_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_3):
> > -       pslldq  $13, D(%xmm2)
> > +       pslldq  $13, %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)
> > -       psubb   %xmm0, D(%xmm2)
> > +       pcmpeqb %xmm1, %xmm2
> > +       psubb   %xmm0, %xmm2
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx
> >         shr     %cl, %r9d
> > @@ -567,7 +537,7 @@ LABEL(loop_ashr_3_use):
> >
> >  LABEL(nibble_ashr_3_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $3, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $3, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
> >  #else
> > @@ -586,7 +556,7 @@ LABEL(nibble_ashr_3_restart_use):
> >         jg      LABEL(nibble_ashr_3_use)
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $3, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $3, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
> >  #else
> > @@ -606,7 +576,7 @@ LABEL(nibble_ashr_3_restart_use):
> >  LABEL(nibble_ashr_3_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $3, D(%xmm0)
> > +       psrldq  $3, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > @@ -624,10 +594,10 @@ LABEL(nibble_ashr_3_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_4):
> > -       pslldq  $12, D(%xmm2)
> > +       pslldq  $12, %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)
> > -       psubb   %xmm0, D(%xmm2)
> > +       pcmpeqb %xmm1, %xmm2
> > +       psubb   %xmm0, %xmm2
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx
> >         shr     %cl, %r9d
> > @@ -656,7 +626,7 @@ LABEL(loop_ashr_4_use):
> >
> >  LABEL(nibble_ashr_4_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $4, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $4, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
> >  #else
> > @@ -675,7 +645,7 @@ LABEL(nibble_ashr_4_restart_use):
> >         jg      LABEL(nibble_ashr_4_use)
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $4, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $4, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
> >  #else
> > @@ -695,7 +665,7 @@ LABEL(nibble_ashr_4_restart_use):
> >  LABEL(nibble_ashr_4_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $4, D(%xmm0)
> > +       psrldq  $4, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > @@ -713,10 +683,10 @@ LABEL(nibble_ashr_4_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_5):
> > -       pslldq  $11, D(%xmm2)
> > +       pslldq  $11, %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)
> > -       psubb   %xmm0, D(%xmm2)
> > +       pcmpeqb %xmm1, %xmm2
> > +       psubb   %xmm0, %xmm2
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx
> >         shr     %cl, %r9d
> > @@ -745,7 +715,7 @@ LABEL(loop_ashr_5_use):
> >
> >  LABEL(nibble_ashr_5_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $5, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $5, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
> >  #else
> > @@ -765,7 +735,7 @@ LABEL(nibble_ashr_5_restart_use):
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> >
> > -       palignr $5, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $5, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
> >  #else
> > @@ -785,7 +755,7 @@ LABEL(nibble_ashr_5_restart_use):
> >  LABEL(nibble_ashr_5_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $5, D(%xmm0)
> > +       psrldq  $5, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > @@ -803,10 +773,10 @@ LABEL(nibble_ashr_5_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_6):
> > -       pslldq  $10, D(%xmm2)
> > +       pslldq  $10, %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)
> > -       psubb   %xmm0, D(%xmm2)
> > +       pcmpeqb %xmm1, %xmm2
> > +       psubb   %xmm0, %xmm2
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx
> >         shr     %cl, %r9d
> > @@ -835,7 +805,7 @@ LABEL(loop_ashr_6_use):
> >
> >  LABEL(nibble_ashr_6_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $6, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $6, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> >  #else
> > @@ -854,7 +824,7 @@ LABEL(nibble_ashr_6_restart_use):
> >         jg      LABEL(nibble_ashr_6_use)
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $6, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $6, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> >  #else
> > @@ -874,7 +844,7 @@ LABEL(nibble_ashr_6_restart_use):
> >  LABEL(nibble_ashr_6_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $6, D(%xmm0)
> > +       psrldq  $6, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > @@ -892,10 +862,10 @@ LABEL(nibble_ashr_6_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_7):
> > -       pslldq  $9, D(%xmm2)
> > +       pslldq  $9, %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)
> > -       psubb   %xmm0, D(%xmm2)
> > +       pcmpeqb %xmm1, %xmm2
> > +       psubb   %xmm0, %xmm2
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx
> >         shr     %cl, %r9d
> > @@ -924,7 +894,7 @@ LABEL(loop_ashr_7_use):
> >
> >  LABEL(nibble_ashr_7_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $7, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $7, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
> >  #else
> > @@ -943,7 +913,7 @@ LABEL(nibble_ashr_7_restart_use):
> >         jg      LABEL(nibble_ashr_7_use)
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $7, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $7, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri       $0x1a,(%rsi,%rdx), %xmm0
> >  #else
> > @@ -963,7 +933,7 @@ LABEL(nibble_ashr_7_restart_use):
> >  LABEL(nibble_ashr_7_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $7, D(%xmm0)
> > +       psrldq  $7, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > @@ -981,10 +951,10 @@ LABEL(nibble_ashr_7_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_8):
> > -       pslldq  $8, D(%xmm2)
> > +       pslldq  $8, %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)
> > -       psubb   %xmm0, D(%xmm2)
> > +       pcmpeqb %xmm1, %xmm2
> > +       psubb   %xmm0, %xmm2
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx
> >         shr     %cl, %r9d
> > @@ -1013,7 +983,7 @@ LABEL(loop_ashr_8_use):
> >
> >  LABEL(nibble_ashr_8_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $8, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $8, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1032,7 +1002,7 @@ LABEL(nibble_ashr_8_restart_use):
> >         jg      LABEL(nibble_ashr_8_use)
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $8, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $8, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1052,7 +1022,7 @@ LABEL(nibble_ashr_8_restart_use):
> >  LABEL(nibble_ashr_8_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $8, D(%xmm0)
> > +       psrldq  $8, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > @@ -1070,10 +1040,10 @@ LABEL(nibble_ashr_8_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_9):
> > -       pslldq  $7, D(%xmm2)
> > +       pslldq  $7, %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)
> > -       psubb   %xmm0, D(%xmm2)
> > +       pcmpeqb %xmm1, %xmm2
> > +       psubb   %xmm0, %xmm2
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx
> >         shr     %cl, %r9d
> > @@ -1103,7 +1073,7 @@ LABEL(loop_ashr_9_use):
> >  LABEL(nibble_ashr_9_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> >
> > -       palignr $9, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $9, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1122,7 +1092,7 @@ LABEL(nibble_ashr_9_restart_use):
> >         jg      LABEL(nibble_ashr_9_use)
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $9, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $9, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1142,7 +1112,7 @@ LABEL(nibble_ashr_9_restart_use):
> >  LABEL(nibble_ashr_9_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $9, D(%xmm0)
> > +       psrldq  $9, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > @@ -1160,10 +1130,10 @@ LABEL(nibble_ashr_9_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_10):
> > -       pslldq  $6, D(%xmm2)
> > +       pslldq  $6, %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)
> > -       psubb   %xmm0, D(%xmm2)
> > +       pcmpeqb %xmm1, %xmm2
> > +       psubb   %xmm0, %xmm2
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx
> >         shr     %cl, %r9d
> > @@ -1192,7 +1162,7 @@ LABEL(loop_ashr_10_use):
> >
> >  LABEL(nibble_ashr_10_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $10, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $10, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1211,7 +1181,7 @@ LABEL(nibble_ashr_10_restart_use):
> >         jg      LABEL(nibble_ashr_10_use)
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $10, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $10, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1231,7 +1201,7 @@ LABEL(nibble_ashr_10_restart_use):
> >  LABEL(nibble_ashr_10_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $10, D(%xmm0)
> > +       psrldq  $10, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > @@ -1249,10 +1219,10 @@ LABEL(nibble_ashr_10_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_11):
> > -       pslldq  $5, D(%xmm2)
> > +       pslldq  $5, %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)
> > -       psubb   %xmm0, D(%xmm2)
> > +       pcmpeqb %xmm1, %xmm2
> > +       psubb   %xmm0, %xmm2
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx
> >         shr     %cl, %r9d
> > @@ -1281,7 +1251,7 @@ LABEL(loop_ashr_11_use):
> >
> >  LABEL(nibble_ashr_11_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $11, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $11, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1300,7 +1270,7 @@ LABEL(nibble_ashr_11_restart_use):
> >         jg      LABEL(nibble_ashr_11_use)
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $11, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $11, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1320,7 +1290,7 @@ LABEL(nibble_ashr_11_restart_use):
> >  LABEL(nibble_ashr_11_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $11, D(%xmm0)
> > +       psrldq  $11, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > @@ -1338,10 +1308,10 @@ LABEL(nibble_ashr_11_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_12):
> > -       pslldq  $4, D(%xmm2)
> > +       pslldq  $4, %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)
> > -       psubb   %xmm0, D(%xmm2)
> > +       pcmpeqb %xmm1, %xmm2
> > +       psubb   %xmm0, %xmm2
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx
> >         shr     %cl, %r9d
> > @@ -1370,7 +1340,7 @@ LABEL(loop_ashr_12_use):
> >
> >  LABEL(nibble_ashr_12_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $12, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $12, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1389,7 +1359,7 @@ LABEL(nibble_ashr_12_restart_use):
> >         jg      LABEL(nibble_ashr_12_use)
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $12, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $12, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1409,7 +1379,7 @@ LABEL(nibble_ashr_12_restart_use):
> >  LABEL(nibble_ashr_12_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $12, D(%xmm0)
> > +       psrldq  $12, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > @@ -1427,10 +1397,10 @@ LABEL(nibble_ashr_12_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_13):
> > -       pslldq  $3, D(%xmm2)
> > +       pslldq  $3, %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)
> > -       psubb   %xmm0, D(%xmm2)
> > +       pcmpeqb %xmm1, %xmm2
> > +       psubb   %xmm0, %xmm2
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx
> >         shr     %cl, %r9d
> > @@ -1460,7 +1430,7 @@ LABEL(loop_ashr_13_use):
> >
> >  LABEL(nibble_ashr_13_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $13, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $13, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1479,7 +1449,7 @@ LABEL(nibble_ashr_13_restart_use):
> >         jg      LABEL(nibble_ashr_13_use)
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $13, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $13, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1499,7 +1469,7 @@ LABEL(nibble_ashr_13_restart_use):
> >  LABEL(nibble_ashr_13_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $13, D(%xmm0)
> > +       psrldq  $13, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > @@ -1517,10 +1487,10 @@ LABEL(nibble_ashr_13_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_14):
> > -       pslldq  $2, D(%xmm2)
> > +       pslldq  $2, %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)
> > -       psubb   %xmm0, D(%xmm2)
> > +       pcmpeqb %xmm1, %xmm2
> > +       psubb   %xmm0, %xmm2
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx
> >         shr     %cl, %r9d
> > @@ -1550,7 +1520,7 @@ LABEL(loop_ashr_14_use):
> >
> >  LABEL(nibble_ashr_14_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $14, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $14, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1569,7 +1539,7 @@ LABEL(nibble_ashr_14_restart_use):
> >         jg      LABEL(nibble_ashr_14_use)
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $14, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $14, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1589,7 +1559,7 @@ LABEL(nibble_ashr_14_restart_use):
> >  LABEL(nibble_ashr_14_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $14, D(%xmm0)
> > +       psrldq  $14, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > @@ -1607,10 +1577,10 @@ LABEL(nibble_ashr_14_use):
> >   */
> >         .p2align 4
> >  LABEL(ashr_15):
> > -       pslldq  $1, D(%xmm2)
> > +       pslldq  $1, %xmm2
> >         TOLOWER (%xmm1, %xmm2)
> > -       pcmpeqb %xmm1, D(%xmm2)
> > -       psubb   %xmm0, D(%xmm2)
> > +       pcmpeqb %xmm1, %xmm2
> > +       psubb   %xmm0, %xmm2
> >         pmovmskb %xmm2, %r9d
> >         shr     %cl, %edx
> >         shr     %cl, %r9d
> > @@ -1642,7 +1612,7 @@ LABEL(loop_ashr_15_use):
> >
> >  LABEL(nibble_ashr_15_restart_use):
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $15, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $15, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1661,7 +1631,7 @@ LABEL(nibble_ashr_15_restart_use):
> >         jg      LABEL(nibble_ashr_15_use)
> >
> >         movdqa  (%rdi, %rdx), %xmm0
> > -       palignr $15, -16(%rdi, %rdx), D(%xmm0)
> > +       palignr $15, -16(%rdi, %rdx), %xmm0
> >  #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> >         pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> >  #else
> > @@ -1681,7 +1651,7 @@ LABEL(nibble_ashr_15_restart_use):
> >  LABEL(nibble_ashr_15_use):
> >         sub     $0x1000, %r10
> >         movdqa  -16(%rdi, %rdx), %xmm0
> > -       psrldq  $15, D(%xmm0)
> > +       psrldq  $15, %xmm0
> >         pcmpistri      $0x3a,%xmm0, %xmm0
> >  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> >         cmp     %r11, %rcx
> > diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S
> > deleted file mode 100644
> > index b51b86d223..0000000000
> > --- a/sysdeps/x86_64/multiarch/strncase_l-avx.S
> > +++ /dev/null
> > @@ -1,22 +0,0 @@
> > -/* strncasecmp_l optimized with AVX.
> > -   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -#define STRCMP_SSE42 __strncasecmp_l_avx
> > -#define USE_AVX 1
> > -#define USE_AS_STRNCASECMP_L
> > -#include "strcmp-sse42.S"
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 76+ messages in thread

end of thread, other threads:[~2022-05-12 19:54 UTC | newest]

Thread overview: 76+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
2022-03-23 21:57 ` [PATCH v1 02/23] benchtests: Add random benchmark " Noah Goldstein
2022-03-24 18:44   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch Noah Goldstein
2022-03-24 18:53   ` H.J. Lu
2022-03-24 19:20     ` Noah Goldstein
2022-03-24 19:36       ` H.J. Lu
2022-05-12 19:31         ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 04/23] x86: Code cleanup in strchr-evex " Noah Goldstein
2022-03-24 18:54   ` H.J. Lu
2022-05-12 19:32     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c Noah Goldstein
2022-03-24 18:54   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c Noah Goldstein
2022-03-24 18:54   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c Noah Goldstein
2022-03-24 18:55   ` H.J. Lu
2022-05-12 19:34     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c Noah Goldstein
2022-03-24 18:56   ` H.J. Lu
2022-05-12 19:39     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation Noah Goldstein
2022-03-24 18:57   ` H.J. Lu
2022-05-12 19:40     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 10/23] x86: Remove strpbrk-sse2.S " Noah Goldstein
2022-03-24 18:57   ` H.J. Lu
2022-05-12 19:41     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 11/23] x86: Remove strspn-sse2.S " Noah Goldstein
2022-03-24 18:57   ` H.J. Lu
2022-05-12 19:42     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] Noah Goldstein
2022-03-24 18:59   ` H.J. Lu
2022-03-24 19:18     ` Noah Goldstein
2022-03-24 19:34       ` H.J. Lu
2022-03-24 19:39         ` Noah Goldstein
2022-03-24 20:50   ` [PATCH v2 12/31] " Noah Goldstein
2022-03-24 21:26     ` H.J. Lu
2022-03-24 21:43       ` Noah Goldstein
2022-03-24 21:58         ` H.J. Lu
2022-05-04  6:05           ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c Noah Goldstein
2022-03-24 19:00   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c Noah Goldstein
2022-03-24 19:00   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c Noah Goldstein
2022-03-24 19:01   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c Noah Goldstein
2022-03-24 19:01   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S Noah Goldstein
2022-03-24 19:02   ` H.J. Lu
2022-05-12 19:44     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S Noah Goldstein
2022-03-24 19:02   ` H.J. Lu
2022-05-12 19:45     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c Noah Goldstein
2022-03-24 19:02   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c Noah Goldstein
2022-03-24 19:02   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
2022-03-24 19:03   ` H.J. Lu
2022-03-24 22:41   ` [PATCH v3 " Noah Goldstein
2022-03-24 22:41   ` [PATCH v3 22/23] x86: Add EVEX " Noah Goldstein
2022-03-24 23:56   ` [PATCH v4 21/23] x86: Add AVX2 " Noah Goldstein
2022-03-24 23:56     ` [PATCH v4 22/23] x86: Add EVEX " Noah Goldstein
2022-03-25 18:15       ` H.J. Lu
2022-03-25 18:18         ` Noah Goldstein
2022-05-12 19:47           ` Sunil Pandey
2022-05-12 19:52             ` Sunil Pandey
2022-03-25 18:14     ` [PATCH v4 21/23] x86: Add AVX2 " H.J. Lu
2022-05-12 19:52       ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 22/23] x86: Add EVEX " Noah Goldstein
2022-03-24 19:04   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 23/23] x86: Remove AVX str{n}casecmp Noah Goldstein
2022-03-24 19:04   ` H.J. Lu
2022-05-12 19:54     ` Sunil Pandey
2022-03-24 18:43 ` [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c H.J. Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).