* [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c
@ 2022-03-23 21:57 Noah Goldstein
2022-03-23 21:57 ` [PATCH v1 02/23] benchtests: Add random benchmark " Noah Goldstein
` (22 more replies)
0 siblings, 23 replies; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Just QOL change to make parsing the output of the benchtests more
consistent.
---
benchtests/bench-strchr.c | 94 ++++++++++++++++++++++++++-------------
1 file changed, 64 insertions(+), 30 deletions(-)
diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
index 821bc615b0..203900d4ad 100644
--- a/benchtests/bench-strchr.c
+++ b/benchtests/bench-strchr.c
@@ -32,6 +32,7 @@
#endif /* WIDE */
#include "bench-string.h"
+#include "json-lib.h"
#define BIG_CHAR MAX_CHAR
#ifndef WIDE
@@ -74,10 +75,19 @@ IMPL (simple_STRCHR, 0)
IMPL (STRCHR, 1)
static void
-do_one_test (impl_t *impl, const CHAR *s, int c, const CHAR *exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
+ const CHAR *exp_res)
{
size_t i, iters = INNER_LOOP_ITERS_LARGE;
timing_t start, stop, cur;
+ const CHAR *res = CALL (impl, s, c);
+ if (res != exp_res)
+ {
+ error (0, 0, "Wrong result in function %s %p != %p", impl->name, res,
+ exp_res);
+ ret = 1;
+ return;
+ }
TIMING_NOW (start);
for (i = 0; i < iters; ++i)
@@ -88,11 +98,12 @@ do_one_test (impl_t *impl, const CHAR *s, int c, const CHAR *exp_res)
TIMING_DIFF (cur, start, stop);
- TIMING_PRINT_MEAN ((double) cur, (double) iters);
+ json_element_double (json_ctx, (double)cur / (double)iters);
}
static void
-do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
+ int seek_char, int max_char)
/* For wcschr: align here means align not in bytes,
but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
@@ -124,87 +135,110 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
else
result = NULLRET (buf + align + len);
- printf ("Length %4zd, alignment in bytes %2zd:",
- pos, align * sizeof (CHAR));
+ json_element_object_begin (json_ctx);
+ json_attr_uint (json_ctx, "length", len);
+ json_attr_uint (json_ctx, "pos", pos);
+ json_attr_uint (json_ctx, "seek_char", seek_char);
+ json_attr_uint (json_ctx, "max_char", max_char);
+ json_attr_uint (json_ctx, "alignment", align);
+ json_array_begin (json_ctx, "timings");
FOR_EACH_IMPL (impl, 0)
- do_one_test (impl, buf + align, seek_char, result);
+ do_one_test (json_ctx, impl, buf + align, seek_char, result);
- putchar ('\n');
+ json_array_end (json_ctx);
+ json_element_object_end (json_ctx);
}
int
test_main (void)
{
+ json_ctx_t json_ctx;
size_t i;
test_init ();
- printf ("%20s", "");
+ json_init (&json_ctx, 0, stdout);
+
+ json_document_begin (&json_ctx);
+ json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+ json_attr_object_begin (&json_ctx, "functions");
+ json_attr_object_begin (&json_ctx, TEST_NAME);
+ json_attr_string (&json_ctx, "bench-variant", "");
+
+ json_array_begin (&json_ctx, "ifuncs");
FOR_EACH_IMPL (impl, 0)
- printf ("\t%s", impl->name);
- putchar ('\n');
+ json_element_string (&json_ctx, impl->name);
+ json_array_end (&json_ctx);
+
+ json_array_begin (&json_ctx, "results");
for (i = 1; i < 8; ++i)
{
- do_test (0, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
- do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (&json_ctx, 0, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (&json_ctx, i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
}
for (i = 1; i < 8; ++i)
{
- do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
- do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (&json_ctx, 0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (&json_ctx, i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
}
for (i = 1; i < 8; ++i)
{
- do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
- do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
+ do_test (&json_ctx, i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (&json_ctx, i, 64, 256, SMALL_CHAR, BIG_CHAR);
}
for (i = 0; i < 8; ++i)
{
- do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
- do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
+ do_test (&json_ctx, 16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (&json_ctx, 16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
}
for (i = 0; i < 32; ++i)
{
- do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
- do_test (0, i, i + 1, SMALL_CHAR, BIG_CHAR);
+ do_test (&json_ctx, 0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (&json_ctx, 0, i, i + 1, SMALL_CHAR, BIG_CHAR);
}
for (i = 1; i < 8; ++i)
{
- do_test (0, 16 << i, 2048, 0, MIDDLE_CHAR);
- do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
+ do_test (&json_ctx, 0, 16 << i, 2048, 0, MIDDLE_CHAR);
+ do_test (&json_ctx, i, 16 << i, 2048, 0, MIDDLE_CHAR);
}
for (i = 1; i < 8; ++i)
{
- do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
- do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
+ do_test (&json_ctx, 0, 16 << i, 4096, 0, MIDDLE_CHAR);
+ do_test (&json_ctx, i, 16 << i, 4096, 0, MIDDLE_CHAR);
}
for (i = 1; i < 8; ++i)
{
- do_test (i, 64, 256, 0, MIDDLE_CHAR);
- do_test (i, 64, 256, 0, BIG_CHAR);
+ do_test (&json_ctx, i, 64, 256, 0, MIDDLE_CHAR);
+ do_test (&json_ctx, i, 64, 256, 0, BIG_CHAR);
}
for (i = 0; i < 8; ++i)
{
- do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
- do_test (16 * i, 256, 512, 0, BIG_CHAR);
+ do_test (&json_ctx, 16 * i, 256, 512, 0, MIDDLE_CHAR);
+ do_test (&json_ctx, 16 * i, 256, 512, 0, BIG_CHAR);
}
for (i = 0; i < 32; ++i)
{
- do_test (0, i, i + 1, 0, MIDDLE_CHAR);
- do_test (0, i, i + 1, 0, BIG_CHAR);
+ do_test (&json_ctx, 0, i, i + 1, 0, MIDDLE_CHAR);
+ do_test (&json_ctx, 0, i, i + 1, 0, BIG_CHAR);
}
+ json_array_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_document_end (&json_ctx);
+
return ret;
}
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 02/23] benchtests: Add random benchmark in bench-strchr.c
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 18:44 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch Noah Goldstein
` (21 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Add benchmark that randomizes whether return should be NULL or pointer
to CHAR. The rationale is that on many architectures there is a choice
between a predicated execution option (i.e. cmovcc on x86) or a branch.
On x86 the results for cmovcc vs branch are something along the lines
of the following:
perc-zero, Br On Result, Time Br / Time cmov
0.10, 1, ,0.983
0.10, 0, ,1.246
0.25, 1, ,1.035
0.25, 0, ,1.49
0.33, 1, ,1.016
0.33, 0, ,1.579
0.50, 1, ,1.228
0.50, 0, ,1.739
0.66, 1, ,1.039
0.66, 0, ,1.764
0.75, 1, ,0.996
0.75, 0, ,1.642
0.90, 1, ,1.071
0.90, 0, ,1.409
1.00, 1, ,0.937
1.00, 0, ,0.999
---
benchtests/bench-strchr.c | 143 ++++++++++++++++++++++++++++++++++++++
1 file changed, 143 insertions(+)
diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
index 203900d4ad..54640bde7e 100644
--- a/benchtests/bench-strchr.c
+++ b/benchtests/bench-strchr.c
@@ -53,6 +53,11 @@
# define SMALL_CHAR 851
#endif /* WIDE */
+#ifdef USE_FOR_STRCHRNUL
+# define DO_RAND_TEST(...)
+#else
+# define DO_RAND_TEST(...) do_rand_test(__VA_ARGS__)
+#endif
#ifdef USE_FOR_STRCHRNUL
# define NULLRET(endptr) endptr
#else
@@ -74,6 +79,133 @@ simple_STRCHR (const CHAR *s, int c)
IMPL (simple_STRCHR, 0)
IMPL (STRCHR, 1)
+#ifndef USE_FOR_STRCHRNUL
+/* Random benchmarks for strchr (if return is CHAR or NULL). The
+ rationale for the benchmark is returning null/char can be done with
+ predicated execution (i.e. cmovcc on x86) or a branch. */
+
+
+/* Large enough that full history can't be stored in BHT. */
+#define NUM_SEARCH_CHARS 2048
+
+/* Expectation is usecases of strchr check the return. Otherwise
+ strchrnul would almost always be better. Since there is another
+ branch coming we want to test the case where a potential branch in
+ strchr can be used to skip a later mispredict because of the
+ relationship between the two branches. */
+static void __attribute__ ((noinline, noclone))
+do_one_rand_plus_branch_test (json_ctx_t *json_ctx, impl_t *impl,
+ const CHAR *s, const CHAR *c)
+{
+ size_t i, iters = INNER_LOOP_ITERS_LARGE;
+ int must_execute = 0;
+ timing_t start, stop, cur;
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ if (CALL (impl, s, c[i % NUM_SEARCH_CHARS]))
+ {
+ /* We just need something that will force compiler to emit
+ a branch instead of conditional execution. */
+ ++must_execute;
+ asm volatile("" : : :);
+ }
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ json_element_double (json_ctx, (double)cur / (double)iters);
+}
+
+static void __attribute__ ((noinline, noclone))
+do_one_rand_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s,
+ const CHAR *c)
+{
+ size_t i, iters = INNER_LOOP_ITERS_LARGE;
+ timing_t start, stop, cur;
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, s, c[i % NUM_SEARCH_CHARS]);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ json_element_double (json_ctx, (double)cur / (double)iters);
+}
+
+static void
+do_rand_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
+ float perc_zero)
+{
+ size_t i;
+ int perc_zero_int;
+ CHAR *buf = (CHAR *)buf1;
+ CHAR *c = (CHAR *)buf2;
+ align &= 127;
+ if ((align + len) * sizeof (CHAR) >= page_size)
+ return;
+
+ /* Test is only interesting if we can hit both cases. */
+ if (pos >= len)
+ return;
+
+ /* Running the test would segfault; skip it. */
+ if (NUM_SEARCH_CHARS * sizeof (CHAR) > page_size)
+ return;
+
+ for (i = 0; i < len; ++i)
+ {
+ buf[align + i] = 2;
+ }
+ buf[align + len] = 0;
+ buf[align + pos] = 1;
+
+ perc_zero_int = perc_zero * RAND_MAX;
+ for (i = 0; i < NUM_SEARCH_CHARS; ++i)
+ {
+ if (rand () > perc_zero_int)
+ c[i] = 0;
+ else
+ c[i] = 1;
+ }
+ {
+ json_element_object_begin (json_ctx);
+ json_attr_uint (json_ctx, "rand", 1);
+ json_attr_uint (json_ctx, "branch", 1);
+ json_attr_double (json_ctx, "perc-zero", perc_zero);
+ json_attr_uint (json_ctx, "length", len);
+ json_attr_uint (json_ctx, "pos", pos);
+ json_attr_uint (json_ctx, "alignment", align);
+ json_array_begin (json_ctx, "timings");
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_rand_plus_branch_test (json_ctx, impl, buf + align, c);
+
+ json_array_end (json_ctx);
+ json_element_object_end (json_ctx);
+ }
+ {
+ json_element_object_begin (json_ctx);
+ json_attr_uint (json_ctx, "rand", 1);
+ json_attr_uint (json_ctx, "branch", 0);
+ json_attr_double (json_ctx, "perc-zero", perc_zero);
+ json_attr_uint (json_ctx, "length", len);
+ json_attr_uint (json_ctx, "pos", pos);
+ json_attr_uint (json_ctx, "alignment", align);
+ json_array_begin (json_ctx, "timings");
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_rand_test (json_ctx, impl, buf + align, c);
+
+ json_array_end (json_ctx);
+ json_element_object_end (json_ctx);
+ }
+}
+#endif
+
static void
do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
const CHAR *exp_res)
@@ -136,6 +268,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
result = NULLRET (buf + align + len);
json_element_object_begin (json_ctx);
+ json_attr_uint (json_ctx, "rand", 0);
json_attr_uint (json_ctx, "length", len);
json_attr_uint (json_ctx, "pos", pos);
json_attr_uint (json_ctx, "seek_char", seek_char);
@@ -234,6 +367,16 @@ test_main (void)
do_test (&json_ctx, 0, i, i + 1, 0, BIG_CHAR);
}
+ DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.0);
+ DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.1);
+ DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.25);
+ DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.33);
+ DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.5);
+ DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.66);
+ DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.75);
+ DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.9);
+ DO_RAND_TEST(&json_ctx, 0, 15, 16, 1.0);
+
json_array_end (&json_ctx);
json_attr_object_end (&json_ctx);
json_attr_object_end (&json_ctx);
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
2022-03-23 21:57 ` [PATCH v1 02/23] benchtests: Add random benchmark " Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 18:53 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 04/23] x86: Code cleanup in strchr-evex " Noah Goldstein
` (20 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Small code cleanup for size: -53 bytes.
Add comment justifying using a branch to do NULL/non-null return.
All string/memory tests pass and no regressions in benchtests.
geometric_mean(N=20) of all benchmarks Original / New: 1.00
---
Geometric Mean N=20 runs; All functions page aligned
length, alignment, pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
2048, 0, 32, 0, 23, 127, 1.033
2048, 1, 32, 0, 23, 127, 1.006
2048, 0, 64, 0, 23, 127, 1.02
2048, 2, 64, 0, 23, 127, 0.992
2048, 0, 128, 0, 23, 127, 0.996
2048, 3, 128, 0, 23, 127, 0.966
2048, 0, 256, 0, 23, 127, 0.996
2048, 4, 256, 0, 23, 127, 0.998
2048, 0, 512, 0, 23, 127, 0.991
2048, 5, 512, 0, 23, 127, 0.991
2048, 0, 1024, 0, 23, 127, 0.993
2048, 6, 1024, 0, 23, 127, 0.992
2048, 0, 2048, 0, 23, 127, 0.992
2048, 7, 2048, 0, 23, 127, 0.976
4096, 0, 32, 0, 23, 127, 0.983
4096, 1, 32, 0, 23, 127, 0.994
4096, 0, 64, 0, 23, 127, 0.968
4096, 2, 64, 0, 23, 127, 1.018
4096, 0, 128, 0, 23, 127, 0.99
4096, 3, 128, 0, 23, 127, 1.001
4096, 0, 256, 0, 23, 127, 1.0
4096, 4, 256, 0, 23, 127, 1.001
4096, 0, 512, 0, 23, 127, 0.989
4096, 5, 512, 0, 23, 127, 0.988
4096, 0, 1024, 0, 23, 127, 0.994
4096, 6, 1024, 0, 23, 127, 0.993
4096, 0, 2048, 0, 23, 127, 0.987
4096, 7, 2048, 0, 23, 127, 0.996
256, 1, 64, 0, 23, 127, 1.004
256, 2, 64, 0, 23, 127, 1.004
256, 3, 64, 0, 23, 127, 0.992
256, 4, 64, 0, 23, 127, 1.001
256, 5, 64, 0, 23, 127, 1.001
256, 6, 64, 0, 23, 127, 0.998
256, 7, 64, 0, 23, 127, 0.994
512, 0, 256, 0, 23, 127, 0.999
512, 16, 256, 0, 23, 127, 1.002
512, 32, 256, 0, 23, 127, 0.994
512, 48, 256, 0, 23, 127, 0.991
512, 64, 256, 0, 23, 127, 0.994
512, 80, 256, 0, 23, 127, 0.994
512, 96, 256, 0, 23, 127, 0.996
512, 112, 256, 0, 23, 127, 0.999
1, 0, 0, 0, 23, 127, 0.978
2, 0, 1, 0, 23, 127, 0.981
3, 0, 2, 0, 23, 127, 0.993
4, 0, 3, 0, 23, 127, 1.004
5, 0, 4, 0, 23, 127, 1.002
6, 0, 5, 0, 23, 127, 0.991
7, 0, 6, 0, 23, 127, 0.99
8, 0, 7, 0, 23, 127, 1.012
9, 0, 8, 0, 23, 127, 0.994
10, 0, 9, 0, 23, 127, 1.003
11, 0, 10, 0, 23, 127, 0.999
12, 0, 11, 0, 23, 127, 1.007
13, 0, 12, 0, 23, 127, 1.0
14, 0, 13, 0, 23, 127, 0.997
15, 0, 14, 0, 23, 127, 0.996
16, 0, 15, 0, 23, 127, 0.993
17, 0, 16, 0, 23, 127, 1.002
18, 0, 17, 0, 23, 127, 0.997
19, 0, 18, 0, 23, 127, 0.998
20, 0, 19, 0, 23, 127, 0.994
21, 0, 20, 0, 23, 127, 0.99
22, 0, 21, 0, 23, 127, 0.992
23, 0, 22, 0, 23, 127, 0.996
24, 0, 23, 0, 23, 127, 0.991
25, 0, 24, 0, 23, 127, 0.997
26, 0, 25, 0, 23, 127, 1.011
27, 0, 26, 0, 23, 127, 1.013
28, 0, 27, 0, 23, 127, 0.996
29, 0, 28, 0, 23, 127, 0.993
30, 0, 29, 0, 23, 127, 1.009
31, 0, 30, 0, 23, 127, 1.009
32, 0, 31, 0, 23, 127, 1.008
2048, 0, 32, 0, 0, 127, 1.0
2048, 1, 32, 0, 0, 127, 1.01
2048, 0, 64, 0, 0, 127, 0.997
2048, 2, 64, 0, 0, 127, 1.002
2048, 0, 128, 0, 0, 127, 0.986
2048, 3, 128, 0, 0, 127, 0.997
2048, 0, 256, 0, 0, 127, 1.002
2048, 4, 256, 0, 0, 127, 0.999
2048, 0, 512, 0, 0, 127, 0.991
2048, 5, 512, 0, 0, 127, 0.984
2048, 0, 1024, 0, 0, 127, 0.994
2048, 6, 1024, 0, 0, 127, 0.993
2048, 0, 2048, 0, 0, 127, 0.951
2048, 7, 2048, 0, 0, 127, 0.989
4096, 0, 32, 0, 0, 127, 0.993
4096, 1, 32, 0, 0, 127, 0.997
4096, 0, 64, 0, 0, 127, 1.004
4096, 2, 64, 0, 0, 127, 1.016
4096, 0, 128, 0, 0, 127, 0.973
4096, 3, 128, 0, 0, 127, 1.001
4096, 0, 256, 0, 0, 127, 0.999
4096, 4, 256, 0, 0, 127, 0.998
4096, 0, 512, 0, 0, 127, 0.99
4096, 5, 512, 0, 0, 127, 0.985
4096, 0, 1024, 0, 0, 127, 0.993
4096, 6, 1024, 0, 0, 127, 0.997
4096, 0, 2048, 0, 0, 127, 0.995
4096, 7, 2048, 0, 0, 127, 0.996
256, 1, 64, 0, 0, 127, 1.01
256, 2, 64, 0, 0, 127, 1.024
256, 3, 64, 0, 0, 127, 1.03
256, 4, 64, 0, 0, 127, 1.004
256, 5, 64, 0, 0, 127, 0.998
256, 6, 64, 0, 0, 127, 0.998
256, 7, 64, 0, 0, 127, 0.997
512, 0, 256, 0, 0, 127, 0.996
512, 16, 256, 0, 0, 127, 0.995
512, 32, 256, 0, 0, 127, 0.996
512, 48, 256, 0, 0, 127, 0.992
512, 64, 256, 0, 0, 127, 0.999
512, 80, 256, 0, 0, 127, 1.002
512, 96, 256, 0, 0, 127, 0.999
512, 112, 256, 0, 0, 127, 0.998
1, 0, 0, 0, 0, 127, 1.016
2, 0, 1, 0, 0, 127, 0.998
3, 0, 2, 0, 0, 127, 1.02
4, 0, 3, 0, 0, 127, 1.004
5, 0, 4, 0, 0, 127, 1.021
6, 0, 5, 0, 0, 127, 1.014
7, 0, 6, 0, 0, 127, 1.007
8, 0, 7, 0, 0, 127, 1.016
9, 0, 8, 0, 0, 127, 1.003
10, 0, 9, 0, 0, 127, 1.004
11, 0, 10, 0, 0, 127, 0.995
12, 0, 11, 0, 0, 127, 1.009
13, 0, 12, 0, 0, 127, 1.005
14, 0, 13, 0, 0, 127, 0.987
15, 0, 14, 0, 0, 127, 0.998
16, 0, 15, 0, 0, 127, 1.004
17, 0, 16, 0, 0, 127, 1.01
18, 0, 17, 0, 0, 127, 1.01
19, 0, 18, 0, 0, 127, 1.006
20, 0, 19, 0, 0, 127, 1.012
21, 0, 20, 0, 0, 127, 0.999
22, 0, 21, 0, 0, 127, 1.004
23, 0, 22, 0, 0, 127, 0.988
24, 0, 23, 0, 0, 127, 0.993
25, 0, 24, 0, 0, 127, 1.004
26, 0, 25, 0, 0, 127, 0.99
27, 0, 26, 0, 0, 127, 1.016
28, 0, 27, 0, 0, 127, 0.987
29, 0, 28, 0, 0, 127, 0.989
30, 0, 29, 0, 0, 127, 0.998
31, 0, 30, 0, 0, 127, 1.005
32, 0, 31, 0, 0, 127, 0.993
16, 0, 15, 1, 1, 0, 1.002
16, 0, 15, 1, 0, 0, 1.0
16, 0, 15, 1, 1, 0.1, 1.034
16, 0, 15, 1, 0, 0.1, 1.03
16, 0, 15, 1, 1, 0.25, 0.993
16, 0, 15, 1, 0, 0.25, 1.081
16, 0, 15, 1, 1, 0.33, 0.959
16, 0, 15, 1, 0, 0.33, 1.142
16, 0, 15, 1, 1, 0.5, 0.929
16, 0, 15, 1, 0, 0.5, 1.072
16, 0, 15, 1, 1, 0.66, 0.984
16, 0, 15, 1, 0, 0.66, 1.069
16, 0, 15, 1, 1, 0.75, 0.969
16, 0, 15, 1, 0, 0.75, 1.059
16, 0, 15, 1, 1, 0.9, 0.98
16, 0, 15, 1, 0, 0.9, 0.994
16, 0, 15, 1, 1, 1, 0.993
16, 0, 15, 1, 0, 1, 0.996
sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------
1 file changed, 107 insertions(+), 97 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 086cabf76a..1a916cc951 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -48,13 +48,13 @@
# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
-ENTRY (STRCHR)
+ENTRY_P2ALIGN (STRCHR, 5)
/* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
movl %edi, %eax
andl $(PAGE_SIZE - 1), %eax
VPBROADCAST %xmm0, %ymm0
- vpxor %xmm9, %xmm9, %xmm9
+ vpxor %xmm1, %xmm1, %xmm1
/* Check if we cross page boundary with one vector load. */
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
@@ -62,37 +62,29 @@ ENTRY (STRCHR)
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
null byte. */
- vmovdqu (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqu (%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jz L(aligned_more)
tzcntl %eax, %eax
# ifndef USE_AS_STRCHRNUL
- /* Found CHAR or the null byte. */
- cmp (%rdi, %rax), %CHAR_REG
- jne L(zero)
-# endif
- addq %rdi, %rax
- VZEROUPPER_RETURN
-
- /* .p2align 5 helps keep performance more consistent if ENTRY()
- alignment % 32 was either 16 or 0. As well this makes the
- alignment % 32 of the loop_4x_vec fixed which makes tuning it
- easier. */
- .p2align 5
-L(first_vec_x4):
- tzcntl %eax, %eax
- addq $(VEC_SIZE * 3 + 1), %rdi
-# ifndef USE_AS_STRCHRNUL
- /* Found CHAR or the null byte. */
+ /* Found CHAR or the null byte. */
cmp (%rdi, %rax), %CHAR_REG
+ /* NB: Use a branch instead of cmovcc here. The expectation is
+ that with strchr the user will branch based on input being
+ null. Since this branch will be 100% predictive of the user
+ branch a branch miss here should save what otherwise would
+ be branch miss in the user code. Otherwise using a branch 1)
+ saves code size and 2) is faster in highly predictable
+ environments. */
jne L(zero)
# endif
addq %rdi, %rax
- VZEROUPPER_RETURN
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
# ifndef USE_AS_STRCHRNUL
L(zero):
@@ -103,7 +95,8 @@ L(zero):
.p2align 4
L(first_vec_x1):
- tzcntl %eax, %eax
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
incq %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
@@ -113,9 +106,10 @@ L(first_vec_x1):
addq %rdi, %rax
VZEROUPPER_RETURN
- .p2align 4
+ .p2align 4,, 10
L(first_vec_x2):
- tzcntl %eax, %eax
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
addq $(VEC_SIZE + 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
@@ -125,9 +119,10 @@ L(first_vec_x2):
addq %rdi, %rax
VZEROUPPER_RETURN
- .p2align 4
+ .p2align 4,, 8
L(first_vec_x3):
- tzcntl %eax, %eax
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
addq $(VEC_SIZE * 2 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
@@ -137,6 +132,21 @@ L(first_vec_x3):
addq %rdi, %rax
VZEROUPPER_RETURN
+ .p2align 4,, 10
+L(first_vec_x4):
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
+ addq $(VEC_SIZE * 3 + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero)
+# endif
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+
+
+
.p2align 4
L(aligned_more):
/* Align data to VEC_SIZE - 1. This is the same number of
@@ -146,90 +156,92 @@ L(aligned_more):
L(cross_page_continue):
/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- vmovdqa 1(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa 1(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(first_vec_x4)
- /* Align data to VEC_SIZE * 4 - 1. */
- addq $(VEC_SIZE * 4 + 1), %rdi
- andq $-(VEC_SIZE * 4), %rdi
+ /* Align data to VEC_SIZE * 4 - 1. */
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
.p2align 4
L(loop_4x_vec):
/* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm5
- vmovdqa (VEC_SIZE)(%rdi), %ymm6
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+ vmovdqa 1(%rdi), %ymm6
+ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7
/* Leaves only CHARS matching esi as 0. */
- vpxor %ymm5, %ymm0, %ymm1
vpxor %ymm6, %ymm0, %ymm2
vpxor %ymm7, %ymm0, %ymm3
- vpxor %ymm8, %ymm0, %ymm4
- VPMINU %ymm1, %ymm5, %ymm1
VPMINU %ymm2, %ymm6, %ymm2
VPMINU %ymm3, %ymm7, %ymm3
- VPMINU %ymm4, %ymm8, %ymm4
- VPMINU %ymm1, %ymm2, %ymm5
- VPMINU %ymm3, %ymm4, %ymm6
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7
+
+ vpxor %ymm6, %ymm0, %ymm4
+ vpxor %ymm7, %ymm0, %ymm5
+
+ VPMINU %ymm4, %ymm6, %ymm4
+ VPMINU %ymm5, %ymm7, %ymm5
- VPMINU %ymm5, %ymm6, %ymm6
+ VPMINU %ymm2, %ymm3, %ymm6
+ VPMINU %ymm4, %ymm5, %ymm7
- VPCMPEQ %ymm6, %ymm9, %ymm6
- vpmovmskb %ymm6, %ecx
+ VPMINU %ymm6, %ymm7, %ymm7
+
+ VPCMPEQ %ymm7, %ymm1, %ymm7
+ vpmovmskb %ymm7, %ecx
subq $-(VEC_SIZE * 4), %rdi
testl %ecx, %ecx
jz L(loop_4x_vec)
-
- VPCMPEQ %ymm1, %ymm9, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(last_vec_x0)
- VPCMPEQ %ymm5, %ymm9, %ymm2
- vpmovmskb %ymm2, %eax
+ VPCMPEQ %ymm3, %ymm1, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(last_vec_x1)
- VPCMPEQ %ymm3, %ymm9, %ymm3
- vpmovmskb %ymm3, %eax
+ VPCMPEQ %ymm4, %ymm1, %ymm4
+ vpmovmskb %ymm4, %eax
/* rcx has combined result from all 4 VEC. It will only be used
if the first 3 other VEC all did not contain a match. */
salq $32, %rcx
orq %rcx, %rax
tzcntq %rax, %rax
- subq $(VEC_SIZE * 2), %rdi
+ subq $(VEC_SIZE * 2 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
cmp (%rdi, %rax), %CHAR_REG
@@ -239,10 +251,11 @@ L(loop_4x_vec):
VZEROUPPER_RETURN
- .p2align 4
+ .p2align 4,, 10
L(last_vec_x0):
- tzcntl %eax, %eax
- addq $-(VEC_SIZE * 4), %rdi
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
+ addq $-(VEC_SIZE * 4 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
cmp (%rdi, %rax), %CHAR_REG
@@ -251,16 +264,11 @@ L(last_vec_x0):
addq %rdi, %rax
VZEROUPPER_RETURN
-# ifndef USE_AS_STRCHRNUL
-L(zero_end):
- xorl %eax, %eax
- VZEROUPPER_RETURN
-# endif
- .p2align 4
+ .p2align 4,, 10
L(last_vec_x1):
tzcntl %eax, %eax
- subq $(VEC_SIZE * 3), %rdi
+ subq $(VEC_SIZE * 3 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
cmp (%rdi, %rax), %CHAR_REG
@@ -269,18 +277,23 @@ L(last_vec_x1):
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
+# endif
/* Cold case for crossing page with first load. */
- .p2align 4
+ .p2align 4,, 8
L(cross_page_boundary):
movq %rdi, %rdx
/* Align rdi to VEC_SIZE - 1. */
orq $(VEC_SIZE - 1), %rdi
- vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
so no need to manually mod edx. */
sarxl %edx, %eax, %eax
@@ -291,13 +304,10 @@ L(cross_page_boundary):
xorl %ecx, %ecx
/* Found CHAR or the null byte. */
cmp (%rdx, %rax), %CHAR_REG
- leaq (%rdx, %rax), %rax
- cmovne %rcx, %rax
-# else
- addq %rdx, %rax
+ jne L(zero_end)
# endif
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
+ addq %rdx, %rax
+ VZEROUPPER_RETURN
END (STRCHR)
-# endif
+#endif
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 04/23] x86: Code cleanup in strchr-evex and comment justifying branch
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
2022-03-23 21:57 ` [PATCH v1 02/23] benchtests: Add random benchmark " Noah Goldstein
2022-03-23 21:57 ` [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 18:54 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c Noah Goldstein
` (19 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Small code cleanup for size: -81 bytes.
Add comment justifying using a branch to do NULL/non-null return.
All string/memory tests pass and no regressions in benchtests.
geometric_mean(N=20) of all benchmarks New / Original: .985
---
Geometric Mean N=20 runs; All functions page aligned
length, alignment, pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
2048, 0, 32, 0, 23, 127, 0.878
2048, 1, 32, 0, 23, 127, 0.88
2048, 0, 64, 0, 23, 127, 0.997
2048, 2, 64, 0, 23, 127, 1.001
2048, 0, 128, 0, 23, 127, 0.973
2048, 3, 128, 0, 23, 127, 0.971
2048, 0, 256, 0, 23, 127, 0.976
2048, 4, 256, 0, 23, 127, 0.973
2048, 0, 512, 0, 23, 127, 1.001
2048, 5, 512, 0, 23, 127, 1.004
2048, 0, 1024, 0, 23, 127, 1.005
2048, 6, 1024, 0, 23, 127, 1.007
2048, 0, 2048, 0, 23, 127, 1.035
2048, 7, 2048, 0, 23, 127, 1.03
4096, 0, 32, 0, 23, 127, 0.889
4096, 1, 32, 0, 23, 127, 0.891
4096, 0, 64, 0, 23, 127, 1.012
4096, 2, 64, 0, 23, 127, 1.017
4096, 0, 128, 0, 23, 127, 0.975
4096, 3, 128, 0, 23, 127, 0.974
4096, 0, 256, 0, 23, 127, 0.974
4096, 4, 256, 0, 23, 127, 0.972
4096, 0, 512, 0, 23, 127, 1.002
4096, 5, 512, 0, 23, 127, 1.016
4096, 0, 1024, 0, 23, 127, 1.009
4096, 6, 1024, 0, 23, 127, 1.008
4096, 0, 2048, 0, 23, 127, 1.003
4096, 7, 2048, 0, 23, 127, 1.004
256, 1, 64, 0, 23, 127, 0.993
256, 2, 64, 0, 23, 127, 0.999
256, 3, 64, 0, 23, 127, 0.992
256, 4, 64, 0, 23, 127, 0.99
256, 5, 64, 0, 23, 127, 0.99
256, 6, 64, 0, 23, 127, 0.994
256, 7, 64, 0, 23, 127, 0.991
512, 0, 256, 0, 23, 127, 0.971
512, 16, 256, 0, 23, 127, 0.971
512, 32, 256, 0, 23, 127, 1.005
512, 48, 256, 0, 23, 127, 0.998
512, 64, 256, 0, 23, 127, 1.001
512, 80, 256, 0, 23, 127, 1.002
512, 96, 256, 0, 23, 127, 1.005
512, 112, 256, 0, 23, 127, 1.012
1, 0, 0, 0, 23, 127, 1.024
2, 0, 1, 0, 23, 127, 0.991
3, 0, 2, 0, 23, 127, 0.997
4, 0, 3, 0, 23, 127, 0.984
5, 0, 4, 0, 23, 127, 0.993
6, 0, 5, 0, 23, 127, 0.985
7, 0, 6, 0, 23, 127, 0.979
8, 0, 7, 0, 23, 127, 0.975
9, 0, 8, 0, 23, 127, 0.965
10, 0, 9, 0, 23, 127, 0.957
11, 0, 10, 0, 23, 127, 0.979
12, 0, 11, 0, 23, 127, 0.987
13, 0, 12, 0, 23, 127, 1.023
14, 0, 13, 0, 23, 127, 0.997
15, 0, 14, 0, 23, 127, 0.983
16, 0, 15, 0, 23, 127, 0.987
17, 0, 16, 0, 23, 127, 0.993
18, 0, 17, 0, 23, 127, 0.985
19, 0, 18, 0, 23, 127, 0.999
20, 0, 19, 0, 23, 127, 0.998
21, 0, 20, 0, 23, 127, 0.983
22, 0, 21, 0, 23, 127, 0.983
23, 0, 22, 0, 23, 127, 1.002
24, 0, 23, 0, 23, 127, 1.0
25, 0, 24, 0, 23, 127, 1.002
26, 0, 25, 0, 23, 127, 0.984
27, 0, 26, 0, 23, 127, 0.994
28, 0, 27, 0, 23, 127, 0.995
29, 0, 28, 0, 23, 127, 1.017
30, 0, 29, 0, 23, 127, 1.009
31, 0, 30, 0, 23, 127, 1.001
32, 0, 31, 0, 23, 127, 1.021
2048, 0, 32, 0, 0, 127, 0.899
2048, 1, 32, 0, 0, 127, 0.93
2048, 0, 64, 0, 0, 127, 1.009
2048, 2, 64, 0, 0, 127, 1.023
2048, 0, 128, 0, 0, 127, 0.973
2048, 3, 128, 0, 0, 127, 0.975
2048, 0, 256, 0, 0, 127, 0.974
2048, 4, 256, 0, 0, 127, 0.97
2048, 0, 512, 0, 0, 127, 0.999
2048, 5, 512, 0, 0, 127, 1.004
2048, 0, 1024, 0, 0, 127, 1.008
2048, 6, 1024, 0, 0, 127, 1.008
2048, 0, 2048, 0, 0, 127, 0.996
2048, 7, 2048, 0, 0, 127, 1.002
4096, 0, 32, 0, 0, 127, 0.872
4096, 1, 32, 0, 0, 127, 0.881
4096, 0, 64, 0, 0, 127, 1.006
4096, 2, 64, 0, 0, 127, 1.005
4096, 0, 128, 0, 0, 127, 0.973
4096, 3, 128, 0, 0, 127, 0.974
4096, 0, 256, 0, 0, 127, 0.969
4096, 4, 256, 0, 0, 127, 0.971
4096, 0, 512, 0, 0, 127, 1.0
4096, 5, 512, 0, 0, 127, 1.005
4096, 0, 1024, 0, 0, 127, 1.007
4096, 6, 1024, 0, 0, 127, 1.009
4096, 0, 2048, 0, 0, 127, 1.005
4096, 7, 2048, 0, 0, 127, 1.007
256, 1, 64, 0, 0, 127, 0.994
256, 2, 64, 0, 0, 127, 1.008
256, 3, 64, 0, 0, 127, 1.019
256, 4, 64, 0, 0, 127, 0.991
256, 5, 64, 0, 0, 127, 0.992
256, 6, 64, 0, 0, 127, 0.991
256, 7, 64, 0, 0, 127, 0.988
512, 0, 256, 0, 0, 127, 0.971
512, 16, 256, 0, 0, 127, 0.967
512, 32, 256, 0, 0, 127, 1.005
512, 48, 256, 0, 0, 127, 1.001
512, 64, 256, 0, 0, 127, 1.009
512, 80, 256, 0, 0, 127, 1.008
512, 96, 256, 0, 0, 127, 1.009
512, 112, 256, 0, 0, 127, 1.016
1, 0, 0, 0, 0, 127, 1.038
2, 0, 1, 0, 0, 127, 1.009
3, 0, 2, 0, 0, 127, 0.992
4, 0, 3, 0, 0, 127, 1.004
5, 0, 4, 0, 0, 127, 0.966
6, 0, 5, 0, 0, 127, 0.968
7, 0, 6, 0, 0, 127, 1.004
8, 0, 7, 0, 0, 127, 0.99
9, 0, 8, 0, 0, 127, 0.958
10, 0, 9, 0, 0, 127, 0.96
11, 0, 10, 0, 0, 127, 0.948
12, 0, 11, 0, 0, 127, 0.984
13, 0, 12, 0, 0, 127, 0.967
14, 0, 13, 0, 0, 127, 0.993
15, 0, 14, 0, 0, 127, 0.991
16, 0, 15, 0, 0, 127, 1.0
17, 0, 16, 0, 0, 127, 0.982
18, 0, 17, 0, 0, 127, 0.977
19, 0, 18, 0, 0, 127, 0.987
20, 0, 19, 0, 0, 127, 0.978
21, 0, 20, 0, 0, 127, 1.0
22, 0, 21, 0, 0, 127, 0.99
23, 0, 22, 0, 0, 127, 0.988
24, 0, 23, 0, 0, 127, 0.997
25, 0, 24, 0, 0, 127, 1.003
26, 0, 25, 0, 0, 127, 1.004
27, 0, 26, 0, 0, 127, 0.982
28, 0, 27, 0, 0, 127, 0.972
29, 0, 28, 0, 0, 127, 0.978
30, 0, 29, 0, 0, 127, 0.992
31, 0, 30, 0, 0, 127, 0.986
32, 0, 31, 0, 0, 127, 1.0
16, 0, 15, 1, 1, 0, 0.997
16, 0, 15, 1, 0, 0, 1.001
16, 0, 15, 1, 1, 0.1, 0.984
16, 0, 15, 1, 0, 0.1, 0.999
16, 0, 15, 1, 1, 0.25, 0.929
16, 0, 15, 1, 0, 0.25, 1.001
16, 0, 15, 1, 1, 0.33, 0.892
16, 0, 15, 1, 0, 0.33, 0.996
16, 0, 15, 1, 1, 0.5, 0.897
16, 0, 15, 1, 0, 0.5, 1.009
16, 0, 15, 1, 1, 0.66, 0.882
16, 0, 15, 1, 0, 0.66, 0.967
16, 0, 15, 1, 1, 0.75, 0.919
16, 0, 15, 1, 0, 0.75, 1.027
16, 0, 15, 1, 1, 0.9, 0.949
16, 0, 15, 1, 0, 0.9, 1.021
16, 0, 15, 1, 1, 1, 0.998
16, 0, 15, 1, 0, 1, 0.999
sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++-----------
1 file changed, 80 insertions(+), 66 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index f62cd9d144..ec739fb8f9 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -30,6 +30,7 @@
# ifdef USE_AS_WCSCHR
# define VPBROADCAST vpbroadcastd
# define VPCMP vpcmpd
+# define VPTESTN vptestnmd
# define VPMINU vpminud
# define CHAR_REG esi
# define SHIFT_REG ecx
@@ -37,6 +38,7 @@
# else
# define VPBROADCAST vpbroadcastb
# define VPCMP vpcmpb
+# define VPTESTN vptestnmb
# define VPMINU vpminub
# define CHAR_REG sil
# define SHIFT_REG edx
@@ -61,13 +63,11 @@
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section .text.evex,"ax",@progbits
-ENTRY (STRCHR)
+ENTRY_P2ALIGN (STRCHR, 5)
/* Broadcast CHAR to YMM0. */
VPBROADCAST %esi, %YMM0
movl %edi, %eax
andl $(PAGE_SIZE - 1), %eax
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
-
/* Check if we cross page boundary with one vector load.
Otherwise it is safe to use an unaligned load. */
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
@@ -81,49 +81,35 @@ ENTRY (STRCHR)
vpxorq %YMM1, %YMM0, %YMM2
VPMINU %YMM2, %YMM1, %YMM2
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
testl %eax, %eax
jz L(aligned_more)
tzcntl %eax, %eax
+# ifndef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ /* NB: Use a branch instead of cmovcc here. The expectation is
+ that with strchr the user will branch based on input being
+ null. Since this branch will be 100% predictive of the user
+ branch a branch miss here should save what otherwise would
+ be branch miss in the user code. Otherwise using a branch 1)
+ saves code size and 2) is faster in highly predictable
+ environments. */
+ jne L(zero)
+# endif
# ifdef USE_AS_WCSCHR
/* NB: Multiply wchar_t count by 4 to get the number of bytes.
*/
leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
addq %rdi, %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
- /* Found CHAR or the null byte. */
- cmp (%rax), %CHAR_REG
- jne L(zero)
# endif
ret
- /* .p2align 5 helps keep performance more consistent if ENTRY()
- alignment % 32 was either 16 or 0. As well this makes the
- alignment % 32 of the loop_4x_vec fixed which makes tuning it
- easier. */
- .p2align 5
-L(first_vec_x3):
- tzcntl %eax, %eax
-# ifndef USE_AS_STRCHRNUL
- /* Found CHAR or the null byte. */
- cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
- jne L(zero)
-# endif
- /* NB: Multiply sizeof char type (1 or 4) to get the number of
- bytes. */
- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
- ret
-# ifndef USE_AS_STRCHRNUL
-L(zero):
- xorl %eax, %eax
- ret
-# endif
- .p2align 4
+ .p2align 4,, 10
L(first_vec_x4):
# ifndef USE_AS_STRCHRNUL
/* Check to see if first match was CHAR (k0) or null (k1). */
@@ -144,9 +130,18 @@ L(first_vec_x4):
leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
ret
+# ifndef USE_AS_STRCHRNUL
+L(zero):
+ xorl %eax, %eax
+ ret
+# endif
+
+
.p2align 4
L(first_vec_x1):
- tzcntl %eax, %eax
+ /* Use bsf here to save 1-byte keeping the block in 1x
+ fetch block. eax guaranteed non-zero. */
+ bsfl %eax, %eax
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -158,7 +153,7 @@ L(first_vec_x1):
leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
ret
- .p2align 4
+ .p2align 4,, 10
L(first_vec_x2):
# ifndef USE_AS_STRCHRNUL
/* Check to see if first match was CHAR (k0) or null (k1). */
@@ -179,6 +174,21 @@ L(first_vec_x2):
leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
+ .p2align 4,, 10
+L(first_vec_x3):
+ /* Use bsf here to save 1-byte keeping the block in 1x
+ fetch block. eax guaranteed non-zero. */
+ bsfl %eax, %eax
+# ifndef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
+ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ jne L(zero)
+# endif
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
+ bytes. */
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
.p2align 4
L(aligned_more):
/* Align data to VEC_SIZE. */
@@ -195,7 +205,7 @@ L(cross_page_continue):
vpxorq %YMM1, %YMM0, %YMM2
VPMINU %YMM2, %YMM1, %YMM2
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x1)
@@ -206,7 +216,7 @@ L(cross_page_continue):
/* Each bit in K0 represents a CHAR in YMM1. */
VPCMP $0, %YMM1, %YMM0, %k0
/* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMM1, %YMMZERO, %k1
+ VPTESTN %YMM1, %YMM1, %k1
kortestd %k0, %k1
jnz L(first_vec_x2)
@@ -215,7 +225,7 @@ L(cross_page_continue):
vpxorq %YMM1, %YMM0, %YMM2
VPMINU %YMM2, %YMM1, %YMM2
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x3)
@@ -224,7 +234,7 @@ L(cross_page_continue):
/* Each bit in K0 represents a CHAR in YMM1. */
VPCMP $0, %YMM1, %YMM0, %k0
/* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMM1, %YMMZERO, %k1
+ VPTESTN %YMM1, %YMM1, %k1
kortestd %k0, %k1
jnz L(first_vec_x4)
@@ -265,33 +275,33 @@ L(loop_4x_vec):
VPMINU %YMM3, %YMM4, %YMM4
VPMINU %YMM2, %YMM4, %YMM4{%k4}{z}
- VPCMP $0, %YMMZERO, %YMM4, %k1
+ VPTESTN %YMM4, %YMM4, %k1
kmovd %k1, %ecx
subq $-(VEC_SIZE * 4), %rdi
testl %ecx, %ecx
jz L(loop_4x_vec)
- VPCMP $0, %YMMZERO, %YMM1, %k0
+ VPTESTN %YMM1, %YMM1, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(last_vec_x1)
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(last_vec_x2)
- VPCMP $0, %YMMZERO, %YMM3, %k0
+ VPTESTN %YMM3, %YMM3, %k0
kmovd %k0, %eax
/* Combine YMM3 matches (eax) with YMM4 matches (ecx). */
# ifdef USE_AS_WCSCHR
sall $8, %ecx
orl %ecx, %eax
- tzcntl %eax, %eax
+ bsfl %eax, %eax
# else
salq $32, %rcx
orq %rcx, %rax
- tzcntq %rax, %rax
+ bsfq %rax, %rax
# endif
# ifndef USE_AS_STRCHRNUL
/* Check if match was CHAR or null. */
@@ -303,28 +313,28 @@ L(loop_4x_vec):
leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
-# ifndef USE_AS_STRCHRNUL
-L(zero_end):
- xorl %eax, %eax
- ret
+ .p2align 4,, 8
+L(last_vec_x1):
+ bsfl %eax, %eax
+# ifdef USE_AS_WCSCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes.
+ */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
# endif
- .p2align 4
-L(last_vec_x1):
- tzcntl %eax, %eax
# ifndef USE_AS_STRCHRNUL
/* Check if match was null. */
- cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ cmp (%rax), %CHAR_REG
jne L(zero_end)
# endif
- /* NB: Multiply sizeof char type (1 or 4) to get the number of
- bytes. */
- leaq (%rdi, %rax, CHAR_SIZE), %rax
+
ret
- .p2align 4
+ .p2align 4,, 8
L(last_vec_x2):
- tzcntl %eax, %eax
+ bsfl %eax, %eax
# ifndef USE_AS_STRCHRNUL
/* Check if match was null. */
cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -336,7 +346,7 @@ L(last_vec_x2):
ret
/* Cold case for crossing page with first load. */
- .p2align 4
+ .p2align 4,, 8
L(cross_page_boundary):
movq %rdi, %rdx
/* Align rdi. */
@@ -346,9 +356,9 @@ L(cross_page_boundary):
vpxorq %YMM1, %YMM0, %YMM2
VPMINU %YMM2, %YMM1, %YMM2
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
- /* Remove the leading bits. */
+ /* Remove the leading bits. */
# ifdef USE_AS_WCSCHR
movl %edx, %SHIFT_REG
/* NB: Divide shift count by 4 since each bit in K1 represent 4
@@ -360,20 +370,24 @@ L(cross_page_boundary):
/* If eax is zero continue. */
testl %eax, %eax
jz L(cross_page_continue)
- tzcntl %eax, %eax
-# ifndef USE_AS_STRCHRNUL
- /* Check to see if match was CHAR or null. */
- cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG
- jne L(zero_end)
-# endif
+ bsfl %eax, %eax
+
# ifdef USE_AS_WCSCHR
/* NB: Multiply wchar_t count by 4 to get the number of
bytes. */
leaq (%rdx, %rax, CHAR_SIZE), %rax
# else
addq %rdx, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+ /* Check to see if match was CHAR or null. */
+ cmp (%rax), %CHAR_REG
+ je L(cross_page_ret)
+L(zero_end):
+ xorl %eax, %eax
+L(cross_page_ret):
# endif
ret
END (STRCHR)
-# endif
+#endif
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (2 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 04/23] x86: Code cleanup in strchr-evex " Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 18:54 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c Noah Goldstein
` (18 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Just QOL change to make parsing the output of the benchtests more
consistent.
---
benchtests/bench-strpbrk.c | 81 ++++++++++++++++++++++++++++----------
1 file changed, 61 insertions(+), 20 deletions(-)
diff --git a/benchtests/bench-strpbrk.c b/benchtests/bench-strpbrk.c
index d46bf9c0e2..a7522a76e6 100644
--- a/benchtests/bench-strpbrk.c
+++ b/benchtests/bench-strpbrk.c
@@ -62,11 +62,14 @@ SIMPLE_STRPBRK (const CHAR *s, const CHAR *rej)
#endif /* !STRPBRK_RESULT */
+#include "json-lib.h"
+
static void
-do_one_test (impl_t *impl, const CHAR *s, const CHAR *rej, RES_TYPE exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s,
+ const CHAR *rej, RES_TYPE exp_res)
{
RES_TYPE res = CALL (impl, s, rej);
- size_t i, iters = INNER_LOOP_ITERS_MEDIUM;
+ size_t i, iters = INNER_LOOP_ITERS;
timing_t start, stop, cur;
if (res != exp_res)
@@ -86,23 +89,26 @@ do_one_test (impl_t *impl, const CHAR *s, const CHAR *rej, RES_TYPE exp_res)
TIMING_DIFF (cur, start, stop);
- TIMING_PRINT_MEAN ((double) cur, (double) iters);
+ json_element_double (json_ctx, (double)cur / (double)iters);
}
static void
-do_test (size_t align, size_t pos, size_t len)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t pos,
+ size_t len)
{
size_t i;
int c;
RES_TYPE result;
CHAR *rej, *s;
- align &= 7;
- if ((align + pos + 10) * sizeof (CHAR) >= page_size || len > 240)
+ align1 &= 7;
+ if ((align1 + pos + 10) * sizeof (CHAR) >= page_size || len > 240)
+ return;
+ if ((align2 + len) * sizeof (CHAR) >= page_size)
return;
- rej = (CHAR *) (buf2) + (random () & 255);
- s = (CHAR *) (buf1) + align;
+ rej = (CHAR *) (buf2) + align2;
+ s = (CHAR *) (buf1) + align1;
for (i = 0; i < len; ++i)
{
@@ -136,43 +142,78 @@ do_test (size_t align, size_t pos, size_t len)
}
result = STRPBRK_RESULT (s, pos);
- printf ("Length %4zd, alignment %2zd, rej len %2zd:", pos, align, len);
+ json_element_object_begin (json_ctx);
+ json_attr_uint (json_ctx, "len", len);
+ json_attr_uint (json_ctx, "pos", pos);
+ json_attr_uint (json_ctx, "align1", align1);
+ json_attr_uint (json_ctx, "align2", align2);
+ json_array_begin (json_ctx, "timings");
FOR_EACH_IMPL (impl, 0)
- do_one_test (impl, s, rej, result);
+ do_one_test (json_ctx, impl, s, rej, result);
- putchar ('\n');
+ json_array_end (json_ctx);
+ json_element_object_end (json_ctx);
}
int
test_main (void)
{
+ json_ctx_t json_ctx;
size_t i;
test_init ();
- printf ("%32s", "");
+ json_init (&json_ctx, 0, stdout);
+
+ json_document_begin (&json_ctx);
+ json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+ json_attr_object_begin (&json_ctx, "functions");
+ json_attr_object_begin (&json_ctx, TEST_NAME);
+ json_attr_string (&json_ctx, "bench-variant", "");
+
+ json_array_begin (&json_ctx, "ifuncs");
FOR_EACH_IMPL (impl, 0)
- printf ("\t%s", impl->name);
- putchar ('\n');
+ json_element_string (&json_ctx, impl->name);
+ json_array_end (&json_ctx);
+
+ json_array_begin (&json_ctx, "results");
+
for (i = 0; i < 32; ++i)
{
- do_test (0, 512, i);
- do_test (i, 512, i);
+ do_test (&json_ctx, 0, 0, 512, i);
+ do_test (&json_ctx, i, 0, 512, i);
+ do_test (&json_ctx, 0, i, 512, i);
+ do_test (&json_ctx, i, i, 512, i);
+
}
for (i = 1; i < 8; ++i)
{
- do_test (0, 16 << i, 4);
- do_test (i, 16 << i, 4);
+ do_test (&json_ctx, 0, 0, 16 << i, 4);
+ do_test (&json_ctx, i, 0, 16 << i, 4);
+ do_test (&json_ctx, 0, i, 16 << i, 4);
+ do_test (&json_ctx, i, i, 16 << i, 4);
}
for (i = 1; i < 8; ++i)
- do_test (i, 64, 10);
+ {
+ do_test (&json_ctx, i, 0, 64, 10);
+ do_test (&json_ctx, i, i, 64, 10);
+ }
for (i = 0; i < 64; ++i)
- do_test (0, i, 6);
+ {
+ do_test (&json_ctx, 0, 0, i, 6);
+ do_test (&json_ctx, 0, i, i, 6);
+ }
+
+ json_array_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_document_end (&json_ctx);
return ret;
}
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (3 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 18:54 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c Noah Goldstein
` (17 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Just QOL change to make parsing the output of the benchtests more
consistent.
---
benchtests/bench-strspn.c | 78 +++++++++++++++++++++++++++++----------
1 file changed, 58 insertions(+), 20 deletions(-)
diff --git a/benchtests/bench-strspn.c b/benchtests/bench-strspn.c
index d79c36fae6..061e90c54d 100644
--- a/benchtests/bench-strspn.c
+++ b/benchtests/bench-strspn.c
@@ -23,6 +23,7 @@
# define TEST_NAME "wcsspn"
#endif /* WIDE */
#include "bench-string.h"
+#include "json-lib.h"
#define BIG_CHAR MAX_CHAR
@@ -58,9 +59,10 @@ SIMPLE_STRSPN (const CHAR *s, const CHAR *acc)
}
static void
-do_one_test (impl_t *impl, const CHAR *s, const CHAR *acc, size_t exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s,
+ const CHAR *acc, size_t exp_res)
{
- size_t res = CALL (impl, s, acc), i, iters = INNER_LOOP_ITERS_MEDIUM;
+ size_t res = CALL (impl, s, acc), i, iters = INNER_LOOP_ITERS;
timing_t start, stop, cur;
if (res != exp_res)
@@ -80,21 +82,24 @@ do_one_test (impl_t *impl, const CHAR *s, const CHAR *acc, size_t exp_res)
TIMING_DIFF (cur, start, stop);
- TIMING_PRINT_MEAN ((double) cur, (double) iters);
+ json_element_double (json_ctx, (double)cur / (double)iters);
}
static void
-do_test (size_t align, size_t pos, size_t len)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t pos,
+ size_t len)
{
size_t i;
CHAR *acc, *s;
- align &= 7;
- if ((align + pos + 10) * sizeof (CHAR) >= page_size || len > 240 || ! len)
+ align1 &= 7;
+ if ((align1 + pos + 10) * sizeof (CHAR) >= page_size || len > 240 || !len)
+ return;
+ if ((align2 + len) * sizeof (CHAR) >= page_size)
return;
- acc = (CHAR *) (buf2) + (random () & 255);
- s = (CHAR *) (buf1) + align;
+ acc = (CHAR *) (buf2) + align2;
+ s = (CHAR *) (buf1) + align1;
for (i = 0; i < len; ++i)
{
@@ -118,43 +123,76 @@ do_test (size_t align, size_t pos, size_t len)
s[i] = '\0';
}
- printf ("Length %4zd, alignment %2zd, acc len %2zd:", pos, align, len);
+ json_element_object_begin (json_ctx);
+ json_attr_uint (json_ctx, "len", len);
+ json_attr_uint (json_ctx, "pos", pos);
+ json_attr_uint (json_ctx, "align1", align1);
+ json_attr_uint (json_ctx, "align2", align2);
+ json_array_begin (json_ctx, "timings");
FOR_EACH_IMPL (impl, 0)
- do_one_test (impl, s, acc, pos);
+ do_one_test (json_ctx, impl, s, acc, pos);
- putchar ('\n');
+ json_array_end (json_ctx);
+ json_element_object_end (json_ctx);
}
int
test_main (void)
{
+ json_ctx_t json_ctx;
size_t i;
test_init ();
- printf ("%32s", "");
+ json_init (&json_ctx, 0, stdout);
+
+ json_document_begin (&json_ctx);
+ json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+ json_attr_object_begin (&json_ctx, "functions");
+ json_attr_object_begin (&json_ctx, TEST_NAME);
+ json_attr_string (&json_ctx, "bench-variant", "");
+
+ json_array_begin (&json_ctx, "ifuncs");
FOR_EACH_IMPL (impl, 0)
- printf ("\t%s", impl->name);
- putchar ('\n');
+ json_element_string (&json_ctx, impl->name);
+ json_array_end (&json_ctx);
+
+ json_array_begin (&json_ctx, "results");
for (i = 0; i < 32; ++i)
{
- do_test (0, 512, i);
- do_test (i, 512, i);
+ do_test (&json_ctx, 0, 0, 512, i);
+ do_test (&json_ctx, i, 0, 512, i);
+ do_test (&json_ctx, 0, i, 512, i);
+ do_test (&json_ctx, i, i, 512, i);
}
for (i = 1; i < 8; ++i)
{
- do_test (0, 16 << i, 4);
- do_test (i, 16 << i, 4);
+ do_test (&json_ctx, 0, 0, 16 << i, 4);
+ do_test (&json_ctx, i, 0, 16 << i, 4);
+ do_test (&json_ctx, 0, i, 16 << i, 4);
+ do_test (&json_ctx, i, i, 16 << i, 4);
}
for (i = 1; i < 8; ++i)
- do_test (i, 64, 10);
+ {
+ do_test (&json_ctx, i, 0, 64, 10);
+ do_test (&json_ctx, i, i, 64, 10);
+ }
for (i = 0; i < 64; ++i)
- do_test (0, i, 6);
+ {
+ do_test (&json_ctx, 0, 0, i, 6);
+ do_test (&json_ctx, 0, i, i, 6);
+ }
+
+ json_array_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_document_end (&json_ctx);
return ret;
}
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (4 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 18:55 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c Noah Goldstein
` (16 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
_mm_cmpistri. Also change offset to unsigned to avoid unnecessary
sign extensions.
geometric_mean(N=20) of all benchmarks that don't fall back on
sse2/strlen; New / Original: .928
All string/memory tests pass.
---
Geometric Mean N=20 runs; All functions page aligned
len, align1, align2, pos, New Time / Old Time
0, 0, 0, 512, 1.207
1, 0, 0, 512, 1.039
1, 1, 0, 512, 0.997
1, 0, 1, 512, 0.981
1, 1, 1, 512, 0.977
2, 0, 0, 512, 1.02
2, 2, 0, 512, 0.979
2, 0, 2, 512, 0.902
2, 2, 2, 512, 0.958
3, 0, 0, 512, 0.978
3, 3, 0, 512, 0.988
3, 0, 3, 512, 0.979
3, 3, 3, 512, 0.955
4, 0, 0, 512, 0.969
4, 4, 0, 512, 0.991
4, 0, 4, 512, 0.94
4, 4, 4, 512, 0.958
5, 0, 0, 512, 0.963
5, 5, 0, 512, 1.004
5, 0, 5, 512, 0.948
5, 5, 5, 512, 0.971
6, 0, 0, 512, 0.933
6, 6, 0, 512, 1.007
6, 0, 6, 512, 0.921
6, 6, 6, 512, 0.969
7, 0, 0, 512, 0.928
7, 7, 0, 512, 0.976
7, 0, 7, 512, 0.932
7, 7, 7, 512, 0.995
8, 0, 0, 512, 0.931
8, 0, 8, 512, 0.766
9, 0, 0, 512, 0.965
9, 1, 0, 512, 0.999
9, 0, 9, 512, 0.765
9, 1, 9, 512, 0.97
10, 0, 0, 512, 0.976
10, 2, 0, 512, 0.991
10, 0, 10, 512, 0.768
10, 2, 10, 512, 0.926
11, 0, 0, 512, 0.958
11, 3, 0, 512, 1.006
11, 0, 11, 512, 0.768
11, 3, 11, 512, 0.908
12, 0, 0, 512, 0.945
12, 4, 0, 512, 0.896
12, 0, 12, 512, 0.764
12, 4, 12, 512, 0.785
13, 0, 0, 512, 0.957
13, 5, 0, 512, 1.019
13, 0, 13, 512, 0.76
13, 5, 13, 512, 0.785
14, 0, 0, 512, 0.918
14, 6, 0, 512, 1.004
14, 0, 14, 512, 0.78
14, 6, 14, 512, 0.711
15, 0, 0, 512, 0.855
15, 7, 0, 512, 0.985
15, 0, 15, 512, 0.779
15, 7, 15, 512, 0.772
16, 0, 0, 512, 0.987
16, 0, 16, 512, 0.99
17, 0, 0, 512, 0.996
17, 1, 0, 512, 0.979
17, 0, 17, 512, 1.001
17, 1, 17, 512, 1.03
18, 0, 0, 512, 0.976
18, 2, 0, 512, 0.989
18, 0, 18, 512, 0.976
18, 2, 18, 512, 0.992
19, 0, 0, 512, 0.991
19, 3, 0, 512, 0.988
19, 0, 19, 512, 1.009
19, 3, 19, 512, 1.018
20, 0, 0, 512, 0.999
20, 4, 0, 512, 1.005
20, 0, 20, 512, 0.993
20, 4, 20, 512, 0.983
21, 0, 0, 512, 0.982
21, 5, 0, 512, 0.988
21, 0, 21, 512, 0.978
21, 5, 21, 512, 0.984
22, 0, 0, 512, 0.988
22, 6, 0, 512, 0.979
22, 0, 22, 512, 0.984
22, 6, 22, 512, 0.983
23, 0, 0, 512, 0.996
23, 7, 0, 512, 0.998
23, 0, 23, 512, 0.979
23, 7, 23, 512, 0.987
24, 0, 0, 512, 0.99
24, 0, 24, 512, 0.979
25, 0, 0, 512, 0.985
25, 1, 0, 512, 0.988
25, 0, 25, 512, 0.99
25, 1, 25, 512, 0.986
26, 0, 0, 512, 1.005
26, 2, 0, 512, 0.995
26, 0, 26, 512, 0.992
26, 2, 26, 512, 0.983
27, 0, 0, 512, 0.986
27, 3, 0, 512, 0.978
27, 0, 27, 512, 0.986
27, 3, 27, 512, 0.973
28, 0, 0, 512, 0.995
28, 4, 0, 512, 0.993
28, 0, 28, 512, 0.983
28, 4, 28, 512, 1.005
29, 0, 0, 512, 0.983
29, 5, 0, 512, 0.982
29, 0, 29, 512, 0.984
29, 5, 29, 512, 1.005
30, 0, 0, 512, 0.978
30, 6, 0, 512, 0.985
30, 0, 30, 512, 0.994
30, 6, 30, 512, 0.993
31, 0, 0, 512, 0.984
31, 7, 0, 512, 0.983
31, 0, 31, 512, 1.0
31, 7, 31, 512, 1.031
4, 0, 0, 32, 0.916
4, 1, 0, 32, 0.952
4, 0, 1, 32, 0.927
4, 1, 1, 32, 0.969
4, 0, 0, 64, 0.961
4, 2, 0, 64, 0.955
4, 0, 2, 64, 0.975
4, 2, 2, 64, 0.972
4, 0, 0, 128, 0.971
4, 3, 0, 128, 0.982
4, 0, 3, 128, 0.945
4, 3, 3, 128, 0.971
4, 0, 0, 256, 1.004
4, 4, 0, 256, 0.966
4, 0, 4, 256, 0.961
4, 4, 4, 256, 0.971
4, 5, 0, 512, 0.929
4, 0, 5, 512, 0.969
4, 5, 5, 512, 0.985
4, 0, 0, 1024, 1.003
4, 6, 0, 1024, 1.009
4, 0, 6, 1024, 1.005
4, 6, 6, 1024, 0.999
4, 0, 0, 2048, 0.917
4, 7, 0, 2048, 1.015
4, 0, 7, 2048, 1.011
4, 7, 7, 2048, 0.907
10, 1, 0, 64, 0.964
10, 1, 1, 64, 0.966
10, 2, 0, 64, 0.953
10, 2, 2, 64, 0.972
10, 3, 0, 64, 0.962
10, 3, 3, 64, 0.969
10, 4, 0, 64, 0.957
10, 4, 4, 64, 0.969
10, 5, 0, 64, 0.961
10, 5, 5, 64, 0.965
10, 6, 0, 64, 0.949
10, 6, 6, 64, 0.9
10, 7, 0, 64, 0.957
10, 7, 7, 64, 0.897
6, 0, 0, 0, 0.991
6, 0, 0, 1, 1.011
6, 0, 1, 1, 0.939
6, 0, 0, 2, 1.016
6, 0, 2, 2, 0.94
6, 0, 0, 3, 1.019
6, 0, 3, 3, 0.941
6, 0, 0, 4, 1.056
6, 0, 4, 4, 0.884
6, 0, 0, 5, 0.977
6, 0, 5, 5, 0.934
6, 0, 0, 6, 0.954
6, 0, 6, 6, 0.93
6, 0, 0, 7, 0.963
6, 0, 7, 7, 0.916
6, 0, 0, 8, 0.963
6, 0, 8, 8, 0.945
6, 0, 0, 9, 1.028
6, 0, 9, 9, 0.942
6, 0, 0, 10, 0.955
6, 0, 10, 10, 0.831
6, 0, 0, 11, 0.948
6, 0, 11, 11, 0.82
6, 0, 0, 12, 1.033
6, 0, 12, 12, 0.873
6, 0, 0, 13, 0.983
6, 0, 13, 13, 0.852
6, 0, 0, 14, 0.984
6, 0, 14, 14, 0.853
6, 0, 0, 15, 0.984
6, 0, 15, 15, 0.882
6, 0, 0, 16, 0.971
6, 0, 16, 16, 0.958
6, 0, 0, 17, 0.938
6, 0, 17, 17, 0.947
6, 0, 0, 18, 0.96
6, 0, 18, 18, 0.938
6, 0, 0, 19, 0.903
6, 0, 19, 19, 0.943
6, 0, 0, 20, 0.947
6, 0, 20, 20, 0.951
6, 0, 0, 21, 0.948
6, 0, 21, 21, 0.96
6, 0, 0, 22, 0.926
6, 0, 22, 22, 0.951
6, 0, 0, 23, 0.923
6, 0, 23, 23, 0.959
6, 0, 0, 24, 0.918
6, 0, 24, 24, 0.952
6, 0, 0, 25, 0.97
6, 0, 25, 25, 0.952
6, 0, 0, 26, 0.871
6, 0, 26, 26, 0.869
6, 0, 0, 27, 0.935
6, 0, 27, 27, 0.836
6, 0, 0, 28, 0.936
6, 0, 28, 28, 0.857
6, 0, 0, 29, 0.876
6, 0, 29, 29, 0.859
6, 0, 0, 30, 0.934
6, 0, 30, 30, 0.857
6, 0, 0, 31, 0.962
6, 0, 31, 31, 0.86
6, 0, 0, 32, 0.912
6, 0, 32, 32, 0.94
6, 0, 0, 33, 0.903
6, 0, 33, 33, 0.968
6, 0, 0, 34, 0.913
6, 0, 34, 34, 0.896
6, 0, 0, 35, 0.904
6, 0, 35, 35, 0.913
6, 0, 0, 36, 0.905
6, 0, 36, 36, 0.907
6, 0, 0, 37, 0.899
6, 0, 37, 37, 0.9
6, 0, 0, 38, 0.912
6, 0, 38, 38, 0.919
6, 0, 0, 39, 0.925
6, 0, 39, 39, 0.927
6, 0, 0, 40, 0.923
6, 0, 40, 40, 0.972
6, 0, 0, 41, 0.92
6, 0, 41, 41, 0.966
6, 0, 0, 42, 0.915
6, 0, 42, 42, 0.834
6, 0, 0, 43, 0.92
6, 0, 43, 43, 0.856
6, 0, 0, 44, 0.908
6, 0, 44, 44, 0.858
6, 0, 0, 45, 0.932
6, 0, 45, 45, 0.847
6, 0, 0, 46, 0.927
6, 0, 46, 46, 0.859
6, 0, 0, 47, 0.902
6, 0, 47, 47, 0.855
6, 0, 0, 48, 0.949
6, 0, 48, 48, 0.934
6, 0, 0, 49, 0.907
6, 0, 49, 49, 0.943
6, 0, 0, 50, 0.934
6, 0, 50, 50, 0.943
6, 0, 0, 51, 0.933
6, 0, 51, 51, 0.939
6, 0, 0, 52, 0.944
6, 0, 52, 52, 0.944
6, 0, 0, 53, 0.939
6, 0, 53, 53, 0.938
6, 0, 0, 54, 0.9
6, 0, 54, 54, 0.923
6, 0, 0, 55, 0.9
6, 0, 55, 55, 0.927
6, 0, 0, 56, 0.9
6, 0, 56, 56, 0.917
6, 0, 0, 57, 0.9
6, 0, 57, 57, 0.916
6, 0, 0, 58, 0.914
6, 0, 58, 58, 0.784
6, 0, 0, 59, 0.863
6, 0, 59, 59, 0.846
6, 0, 0, 60, 0.88
6, 0, 60, 60, 0.827
6, 0, 0, 61, 0.896
6, 0, 61, 61, 0.847
6, 0, 0, 62, 0.894
6, 0, 62, 62, 0.865
6, 0, 0, 63, 0.934
6, 0, 63, 63, 0.866
sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++---------------
1 file changed, 37 insertions(+), 46 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
index 013aebf797..c312fab8b1 100644
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
RETURN (NULL, strlen (s));
const char *aligned;
- __m128i mask;
- int offset = (int) ((size_t) a & 15);
+ __m128i mask, maskz, zero;
+ unsigned int maskz_bits;
+ unsigned int offset = (unsigned int) ((size_t) a & 15);
+ zero = _mm_set1_epi8 (0);
if (offset != 0)
{
/* Load masks. */
aligned = (const char *) ((size_t) a & -16L);
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
- mask = __m128i_shift_right (mask0, offset);
+ maskz = _mm_cmpeq_epi8 (mask0, zero);
/* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16 - offset)
- {
- /* There is no NULL terminator. */
- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
- int index = _mm_cmpistri (mask1, mask1, 0x3a);
- length += index;
-
- /* Don't use SSE4.2 if the length of A > 16. */
- if (length > 16)
- return STRCSPN_SSE2 (s, a);
-
- if (index != 0)
- {
- /* Combine mask0 and mask1. We could play games with
- palignr, but frankly this data should be in L1 now
- so do the merge via an unaligned load. */
- mask = _mm_loadu_si128 ((__m128i *) a);
- }
- }
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+ if (maskz_bits != 0)
+ {
+ mask = __m128i_shift_right (mask0, offset);
+ offset = (unsigned int) ((size_t) s & 15);
+ if (offset)
+ goto start_unaligned;
+
+ aligned = s;
+ goto start_loop;
+ }
}
- else
- {
- /* A is aligned. */
- mask = _mm_load_si128 ((__m128i *) a);
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16)
- {
- /* There is no NULL terminator. Don't use SSE4.2 if the length
- of A > 16. */
- if (a[16] != 0)
- return STRCSPN_SSE2 (s, a);
- }
+ /* A is aligned. */
+ mask = _mm_loadu_si128 ((__m128i *) a);
+ /* Find where the NULL terminator is. */
+ maskz = _mm_cmpeq_epi8 (mask, zero);
+ maskz_bits = _mm_movemask_epi8 (maskz);
+ if (maskz_bits == 0)
+ {
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
+ of A > 16. */
+ if (a[16] != 0)
+ return STRCSPN_SSE2 (s, a);
}
- offset = (int) ((size_t) s & 15);
+ aligned = s;
+ offset = (unsigned int) ((size_t) s & 15);
if (offset != 0)
{
+ start_unaligned:
/* Check partial string. */
aligned = (const char *) ((size_t) s & -16L);
__m128i value = _mm_load_si128 ((__m128i *) aligned);
value = __m128i_shift_right (value, offset);
- int length = _mm_cmpistri (mask, value, 0x2);
+ unsigned int length = _mm_cmpistri (mask, value, 0x2);
/* No need to check ZFlag since ZFlag is always 1. */
- int cflag = _mm_cmpistrc (mask, value, 0x2);
+ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
if (cflag)
RETURN ((char *) (s + length), length);
/* Find where the NULL terminator is. */
- int index = _mm_cmpistri (value, value, 0x3a);
+ unsigned int index = _mm_cmpistri (value, value, 0x3a);
if (index < 16 - offset)
RETURN (NULL, index);
aligned += 16;
}
- else
- aligned = s;
+start_loop:
while (1)
{
__m128i value = _mm_load_si128 ((__m128i *) aligned);
- int index = _mm_cmpistri (mask, value, 0x2);
- int cflag = _mm_cmpistrc (mask, value, 0x2);
- int zflag = _mm_cmpistrz (mask, value, 0x2);
+ unsigned int index = _mm_cmpistri (mask, value, 0x2);
+ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
+ unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
if (cflag)
RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
if (zflag)
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (5 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 18:56 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation Noah Goldstein
` (15 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
_mm_cmpistri. Also change offset to unsigned to avoid unnecessary
sign extensions.
geometric_mean(N=20) of all benchmarks that don't fall back on
sse2; New / Original: .901
All string/memory tests pass.
---
Geometric Mean N=20 runs; All functions page aligned
len, align1, align2, pos, New Time / Old Time
1, 0, 0, 512, 0.768
1, 1, 0, 512, 0.666
1, 0, 1, 512, 1.193
1, 1, 1, 512, 0.872
2, 0, 0, 512, 0.698
2, 2, 0, 512, 0.687
2, 0, 2, 512, 1.393
2, 2, 2, 512, 0.944
3, 0, 0, 512, 0.691
3, 3, 0, 512, 0.676
3, 0, 3, 512, 1.388
3, 3, 3, 512, 0.948
4, 0, 0, 512, 0.74
4, 4, 0, 512, 0.678
4, 0, 4, 512, 1.421
4, 4, 4, 512, 0.943
5, 0, 0, 512, 0.691
5, 5, 0, 512, 0.675
5, 0, 5, 512, 1.348
5, 5, 5, 512, 0.952
6, 0, 0, 512, 0.685
6, 6, 0, 512, 0.67
6, 0, 6, 512, 1.333
6, 6, 6, 512, 0.95
7, 0, 0, 512, 0.688
7, 7, 0, 512, 0.675
7, 0, 7, 512, 1.344
7, 7, 7, 512, 0.919
8, 0, 0, 512, 0.716
8, 0, 8, 512, 0.935
9, 0, 0, 512, 0.716
9, 1, 0, 512, 0.712
9, 0, 9, 512, 0.956
9, 1, 9, 512, 0.992
10, 0, 0, 512, 0.699
10, 2, 0, 512, 0.68
10, 0, 10, 512, 0.952
10, 2, 10, 512, 0.932
11, 0, 0, 512, 0.705
11, 3, 0, 512, 0.685
11, 0, 11, 512, 0.956
11, 3, 11, 512, 0.927
12, 0, 0, 512, 0.695
12, 4, 0, 512, 0.675
12, 0, 12, 512, 0.948
12, 4, 12, 512, 0.928
13, 0, 0, 512, 0.7
13, 5, 0, 512, 0.678
13, 0, 13, 512, 0.944
13, 5, 13, 512, 0.931
14, 0, 0, 512, 0.703
14, 6, 0, 512, 0.678
14, 0, 14, 512, 0.949
14, 6, 14, 512, 0.93
15, 0, 0, 512, 0.694
15, 7, 0, 512, 0.678
15, 0, 15, 512, 0.953
15, 7, 15, 512, 0.924
16, 0, 0, 512, 1.021
16, 0, 16, 512, 1.067
17, 0, 0, 512, 0.991
17, 1, 0, 512, 0.984
17, 0, 17, 512, 0.979
17, 1, 17, 512, 0.993
18, 0, 0, 512, 0.992
18, 2, 0, 512, 1.008
18, 0, 18, 512, 1.016
18, 2, 18, 512, 0.993
19, 0, 0, 512, 0.984
19, 3, 0, 512, 0.985
19, 0, 19, 512, 1.007
19, 3, 19, 512, 1.006
20, 0, 0, 512, 0.969
20, 4, 0, 512, 0.968
20, 0, 20, 512, 0.975
20, 4, 20, 512, 0.975
21, 0, 0, 512, 0.992
21, 5, 0, 512, 0.992
21, 0, 21, 512, 0.98
21, 5, 21, 512, 0.97
22, 0, 0, 512, 0.989
22, 6, 0, 512, 0.987
22, 0, 22, 512, 0.99
22, 6, 22, 512, 0.985
23, 0, 0, 512, 0.989
23, 7, 0, 512, 0.98
23, 0, 23, 512, 1.0
23, 7, 23, 512, 0.993
24, 0, 0, 512, 0.99
24, 0, 24, 512, 0.998
25, 0, 0, 512, 1.01
25, 1, 0, 512, 1.0
25, 0, 25, 512, 0.97
25, 1, 25, 512, 0.967
26, 0, 0, 512, 1.009
26, 2, 0, 512, 0.986
26, 0, 26, 512, 0.997
26, 2, 26, 512, 0.993
27, 0, 0, 512, 0.984
27, 3, 0, 512, 0.997
27, 0, 27, 512, 0.989
27, 3, 27, 512, 0.976
28, 0, 0, 512, 0.991
28, 4, 0, 512, 1.003
28, 0, 28, 512, 0.986
28, 4, 28, 512, 0.989
29, 0, 0, 512, 0.986
29, 5, 0, 512, 0.985
29, 0, 29, 512, 0.984
29, 5, 29, 512, 0.977
30, 0, 0, 512, 0.991
30, 6, 0, 512, 0.987
30, 0, 30, 512, 0.979
30, 6, 30, 512, 0.974
31, 0, 0, 512, 0.995
31, 7, 0, 512, 0.995
31, 0, 31, 512, 0.994
31, 7, 31, 512, 0.984
4, 0, 0, 32, 0.861
4, 1, 0, 32, 0.864
4, 0, 1, 32, 0.962
4, 1, 1, 32, 0.967
4, 0, 0, 64, 0.884
4, 2, 0, 64, 0.818
4, 0, 2, 64, 0.889
4, 2, 2, 64, 0.918
4, 0, 0, 128, 0.942
4, 3, 0, 128, 0.884
4, 0, 3, 128, 0.931
4, 3, 3, 128, 0.883
4, 0, 0, 256, 0.964
4, 4, 0, 256, 0.922
4, 0, 4, 256, 0.956
4, 4, 4, 256, 0.93
4, 5, 0, 512, 0.833
4, 0, 5, 512, 1.027
4, 5, 5, 512, 0.929
4, 0, 0, 1024, 0.998
4, 6, 0, 1024, 0.986
4, 0, 6, 1024, 0.984
4, 6, 6, 1024, 0.977
4, 0, 0, 2048, 0.991
4, 7, 0, 2048, 0.987
4, 0, 7, 2048, 0.996
4, 7, 7, 2048, 0.98
10, 1, 0, 64, 0.826
10, 1, 1, 64, 0.907
10, 2, 0, 64, 0.829
10, 2, 2, 64, 0.91
10, 3, 0, 64, 0.83
10, 3, 3, 64, 0.915
10, 4, 0, 64, 0.83
10, 4, 4, 64, 0.911
10, 5, 0, 64, 0.828
10, 5, 5, 64, 0.905
10, 6, 0, 64, 0.828
10, 6, 6, 64, 0.812
10, 7, 0, 64, 0.83
10, 7, 7, 64, 0.819
6, 0, 0, 0, 1.261
6, 0, 0, 1, 1.252
6, 0, 1, 1, 0.845
6, 0, 0, 2, 1.27
6, 0, 2, 2, 0.85
6, 0, 0, 3, 1.269
6, 0, 3, 3, 0.845
6, 0, 0, 4, 1.287
6, 0, 4, 4, 0.852
6, 0, 0, 5, 1.278
6, 0, 5, 5, 0.851
6, 0, 0, 6, 1.269
6, 0, 6, 6, 0.841
6, 0, 0, 7, 1.268
6, 0, 7, 7, 0.851
6, 0, 0, 8, 1.291
6, 0, 8, 8, 0.837
6, 0, 0, 9, 1.283
6, 0, 9, 9, 0.831
6, 0, 0, 10, 1.252
6, 0, 10, 10, 0.997
6, 0, 0, 11, 1.295
6, 0, 11, 11, 1.046
6, 0, 0, 12, 1.296
6, 0, 12, 12, 1.038
6, 0, 0, 13, 1.287
6, 0, 13, 13, 1.082
6, 0, 0, 14, 1.284
6, 0, 14, 14, 1.001
6, 0, 0, 15, 1.286
6, 0, 15, 15, 1.002
6, 0, 0, 16, 0.894
6, 0, 16, 16, 0.874
6, 0, 0, 17, 0.892
6, 0, 17, 17, 0.974
6, 0, 0, 18, 0.907
6, 0, 18, 18, 0.993
6, 0, 0, 19, 0.909
6, 0, 19, 19, 0.99
6, 0, 0, 20, 0.894
6, 0, 20, 20, 0.978
6, 0, 0, 21, 0.89
6, 0, 21, 21, 0.958
6, 0, 0, 22, 0.893
6, 0, 22, 22, 0.99
6, 0, 0, 23, 0.899
6, 0, 23, 23, 0.986
6, 0, 0, 24, 0.893
6, 0, 24, 24, 0.989
6, 0, 0, 25, 0.889
6, 0, 25, 25, 0.982
6, 0, 0, 26, 0.889
6, 0, 26, 26, 0.852
6, 0, 0, 27, 0.89
6, 0, 27, 27, 0.832
6, 0, 0, 28, 0.89
6, 0, 28, 28, 0.831
6, 0, 0, 29, 0.89
6, 0, 29, 29, 0.838
6, 0, 0, 30, 0.907
6, 0, 30, 30, 0.833
6, 0, 0, 31, 0.888
6, 0, 31, 31, 0.837
6, 0, 0, 32, 0.853
6, 0, 32, 32, 0.828
6, 0, 0, 33, 0.857
6, 0, 33, 33, 0.947
6, 0, 0, 34, 0.847
6, 0, 34, 34, 0.954
6, 0, 0, 35, 0.841
6, 0, 35, 35, 0.94
6, 0, 0, 36, 0.854
6, 0, 36, 36, 0.958
6, 0, 0, 37, 0.856
6, 0, 37, 37, 0.957
6, 0, 0, 38, 0.839
6, 0, 38, 38, 0.962
6, 0, 0, 39, 0.866
6, 0, 39, 39, 0.945
6, 0, 0, 40, 0.845
6, 0, 40, 40, 0.961
6, 0, 0, 41, 0.858
6, 0, 41, 41, 0.961
6, 0, 0, 42, 0.862
6, 0, 42, 42, 0.825
6, 0, 0, 43, 0.864
6, 0, 43, 43, 0.82
6, 0, 0, 44, 0.843
6, 0, 44, 44, 0.81
6, 0, 0, 45, 0.859
6, 0, 45, 45, 0.816
6, 0, 0, 46, 0.866
6, 0, 46, 46, 0.81
6, 0, 0, 47, 0.858
6, 0, 47, 47, 0.807
6, 0, 0, 48, 0.87
6, 0, 48, 48, 0.87
6, 0, 0, 49, 0.871
6, 0, 49, 49, 0.874
6, 0, 0, 50, 0.87
6, 0, 50, 50, 0.881
6, 0, 0, 51, 0.868
6, 0, 51, 51, 0.875
6, 0, 0, 52, 0.873
6, 0, 52, 52, 0.871
6, 0, 0, 53, 0.866
6, 0, 53, 53, 0.882
6, 0, 0, 54, 0.863
6, 0, 54, 54, 0.876
6, 0, 0, 55, 0.851
6, 0, 55, 55, 0.871
6, 0, 0, 56, 0.867
6, 0, 56, 56, 0.888
6, 0, 0, 57, 0.862
6, 0, 57, 57, 0.899
6, 0, 0, 58, 0.873
6, 0, 58, 58, 0.798
6, 0, 0, 59, 0.881
6, 0, 59, 59, 0.785
6, 0, 0, 60, 0.867
6, 0, 60, 60, 0.797
6, 0, 0, 61, 0.872
6, 0, 61, 61, 0.791
6, 0, 0, 62, 0.859
6, 0, 62, 62, 0.79
6, 0, 0, 63, 0.87
6, 0, 63, 63, 0.796
sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++----------------
1 file changed, 39 insertions(+), 47 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
index 8fb3aba64d..6124033ceb 100644
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -62,81 +62,73 @@ __strspn_sse42 (const char *s, const char *a)
return 0;
const char *aligned;
- __m128i mask;
- int offset = (int) ((size_t) a & 15);
+ __m128i mask, maskz, zero;
+ unsigned int maskz_bits;
+ unsigned int offset = (int) ((size_t) a & 15);
+ zero = _mm_set1_epi8 (0);
if (offset != 0)
{
/* Load masks. */
aligned = (const char *) ((size_t) a & -16L);
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
- mask = __m128i_shift_right (mask0, offset);
+ maskz = _mm_cmpeq_epi8 (mask0, zero);
/* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16 - offset)
- {
- /* There is no NULL terminator. */
- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
- int index = _mm_cmpistri (mask1, mask1, 0x3a);
- length += index;
-
- /* Don't use SSE4.2 if the length of A > 16. */
- if (length > 16)
- return __strspn_sse2 (s, a);
-
- if (index != 0)
- {
- /* Combine mask0 and mask1. We could play games with
- palignr, but frankly this data should be in L1 now
- so do the merge via an unaligned load. */
- mask = _mm_loadu_si128 ((__m128i *) a);
- }
- }
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+ if (maskz_bits != 0)
+ {
+ mask = __m128i_shift_right (mask0, offset);
+ offset = (unsigned int) ((size_t) s & 15);
+ if (offset)
+ goto start_unaligned;
+
+ aligned = s;
+ goto start_loop;
+ }
}
- else
- {
- /* A is aligned. */
- mask = _mm_load_si128 ((__m128i *) a);
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16)
- {
- /* There is no NULL terminator. Don't use SSE4.2 if the length
- of A > 16. */
- if (a[16] != 0)
- return __strspn_sse2 (s, a);
- }
+ /* A is aligned. */
+ mask = _mm_loadu_si128 ((__m128i *) a);
+
+ /* Find where the NULL terminator is. */
+ maskz = _mm_cmpeq_epi8 (mask, zero);
+ maskz_bits = _mm_movemask_epi8 (maskz);
+ if (maskz_bits == 0)
+ {
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
+ of A > 16. */
+ if (a[16] != 0)
+ return __strspn_sse2 (s, a);
}
+ aligned = s;
+ offset = (unsigned int) ((size_t) s & 15);
- offset = (int) ((size_t) s & 15);
if (offset != 0)
{
+ start_unaligned:
/* Check partial string. */
aligned = (const char *) ((size_t) s & -16L);
__m128i value = _mm_load_si128 ((__m128i *) aligned);
+ __m128i adj_value = __m128i_shift_right (value, offset);
- value = __m128i_shift_right (value, offset);
-
- int length = _mm_cmpistri (mask, value, 0x12);
+ unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
/* No need to check CFlag since it is always 1. */
if (length < 16 - offset)
return length;
/* Find where the NULL terminator is. */
- int index = _mm_cmpistri (value, value, 0x3a);
- if (index < 16 - offset)
+ maskz = _mm_cmpeq_epi8 (value, zero);
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+ if (maskz_bits != 0)
return length;
aligned += 16;
}
- else
- aligned = s;
+start_loop:
while (1)
{
__m128i value = _mm_load_si128 ((__m128i *) aligned);
- int index = _mm_cmpistri (mask, value, 0x12);
- int cflag = _mm_cmpistrc (mask, value, 0x12);
+ unsigned int index = _mm_cmpistri (mask, value, 0x12);
+ unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
if (cflag)
return (size_t) (aligned + index - s);
aligned += 16;
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (6 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 18:57 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 10/23] x86: Remove strpbrk-sse2.S " Noah Goldstein
` (14 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
The generic implementation is faster.
geometric_mean(N=20) of all benchmarks New / Original: .678
All string/memory tests pass.
---
Geometric Mean N=20 runs; All functions page aligned
len, align1, align2, pos, New Time / Old Time
0, 0, 0, 512, 0.054
1, 0, 0, 512, 0.055
1, 1, 0, 512, 0.051
1, 0, 1, 512, 0.054
1, 1, 1, 512, 0.054
2, 0, 0, 512, 0.861
2, 2, 0, 512, 0.861
2, 0, 2, 512, 0.861
2, 2, 2, 512, 0.864
3, 0, 0, 512, 0.854
3, 3, 0, 512, 0.848
3, 0, 3, 512, 0.845
3, 3, 3, 512, 0.85
4, 0, 0, 512, 0.851
4, 4, 0, 512, 0.85
4, 0, 4, 512, 0.852
4, 4, 4, 512, 0.849
5, 0, 0, 512, 0.938
5, 5, 0, 512, 0.94
5, 0, 5, 512, 0.864
5, 5, 5, 512, 0.86
6, 0, 0, 512, 0.858
6, 6, 0, 512, 0.869
6, 0, 6, 512, 0.847
6, 6, 6, 512, 0.868
7, 0, 0, 512, 0.867
7, 7, 0, 512, 0.861
7, 0, 7, 512, 0.864
7, 7, 7, 512, 0.863
8, 0, 0, 512, 0.884
8, 0, 8, 512, 0.884
9, 0, 0, 512, 0.886
9, 1, 0, 512, 0.894
9, 0, 9, 512, 0.889
9, 1, 9, 512, 0.886
10, 0, 0, 512, 0.859
10, 2, 0, 512, 0.859
10, 0, 10, 512, 0.862
10, 2, 10, 512, 0.861
11, 0, 0, 512, 0.846
11, 3, 0, 512, 0.865
11, 0, 11, 512, 0.859
11, 3, 11, 512, 0.862
12, 0, 0, 512, 0.858
12, 4, 0, 512, 0.857
12, 0, 12, 512, 0.964
12, 4, 12, 512, 0.876
13, 0, 0, 512, 0.827
13, 5, 0, 512, 0.805
13, 0, 13, 512, 0.821
13, 5, 13, 512, 0.825
14, 0, 0, 512, 0.786
14, 6, 0, 512, 0.786
14, 0, 14, 512, 0.803
14, 6, 14, 512, 0.783
15, 0, 0, 512, 0.778
15, 7, 0, 512, 0.792
15, 0, 15, 512, 0.796
15, 7, 15, 512, 0.799
16, 0, 0, 512, 0.803
16, 0, 16, 512, 0.815
17, 0, 0, 512, 0.812
17, 1, 0, 512, 0.826
17, 0, 17, 512, 0.803
17, 1, 17, 512, 0.856
18, 0, 0, 512, 0.801
18, 2, 0, 512, 0.886
18, 0, 18, 512, 0.805
18, 2, 18, 512, 0.807
19, 0, 0, 512, 0.814
19, 3, 0, 512, 0.804
19, 0, 19, 512, 0.813
19, 3, 19, 512, 0.814
20, 0, 0, 512, 0.885
20, 4, 0, 512, 0.799
20, 0, 20, 512, 0.826
20, 4, 20, 512, 0.808
21, 0, 0, 512, 0.816
21, 5, 0, 512, 0.824
21, 0, 21, 512, 0.819
21, 5, 21, 512, 0.826
22, 0, 0, 512, 0.814
22, 6, 0, 512, 0.824
22, 0, 22, 512, 0.81
22, 6, 22, 512, 0.806
23, 0, 0, 512, 0.825
23, 7, 0, 512, 0.829
23, 0, 23, 512, 0.809
23, 7, 23, 512, 0.823
24, 0, 0, 512, 0.829
24, 0, 24, 512, 0.823
25, 0, 0, 512, 0.864
25, 1, 0, 512, 0.895
25, 0, 25, 512, 0.88
25, 1, 25, 512, 0.848
26, 0, 0, 512, 0.903
26, 2, 0, 512, 0.888
26, 0, 26, 512, 0.894
26, 2, 26, 512, 0.89
27, 0, 0, 512, 0.914
27, 3, 0, 512, 0.917
27, 0, 27, 512, 0.902
27, 3, 27, 512, 0.887
28, 0, 0, 512, 0.887
28, 4, 0, 512, 0.877
28, 0, 28, 512, 0.893
28, 4, 28, 512, 0.866
29, 0, 0, 512, 0.885
29, 5, 0, 512, 0.907
29, 0, 29, 512, 0.894
29, 5, 29, 512, 0.906
30, 0, 0, 512, 0.88
30, 6, 0, 512, 0.898
30, 0, 30, 512, 0.9
30, 6, 30, 512, 0.895
31, 0, 0, 512, 0.893
31, 7, 0, 512, 0.874
31, 0, 31, 512, 0.894
31, 7, 31, 512, 0.899
4, 0, 0, 32, 0.618
4, 1, 0, 32, 0.627
4, 0, 1, 32, 0.625
4, 1, 1, 32, 0.613
4, 0, 0, 64, 0.913
4, 2, 0, 64, 0.801
4, 0, 2, 64, 0.759
4, 2, 2, 64, 0.761
4, 0, 0, 128, 0.822
4, 3, 0, 128, 0.863
4, 0, 3, 128, 0.867
4, 3, 3, 128, 0.917
4, 0, 0, 256, 0.816
4, 4, 0, 256, 0.812
4, 0, 4, 256, 0.803
4, 4, 4, 256, 0.811
4, 5, 0, 512, 0.848
4, 0, 5, 512, 0.843
4, 5, 5, 512, 0.857
4, 0, 0, 1024, 0.886
4, 6, 0, 1024, 0.887
4, 0, 6, 1024, 0.881
4, 6, 6, 1024, 0.873
4, 0, 0, 2048, 0.892
4, 7, 0, 2048, 0.894
4, 0, 7, 2048, 0.89
4, 7, 7, 2048, 0.874
10, 1, 0, 64, 0.946
10, 1, 1, 64, 0.81
10, 2, 0, 64, 0.804
10, 2, 2, 64, 0.82
10, 3, 0, 64, 0.772
10, 3, 3, 64, 0.772
10, 4, 0, 64, 0.748
10, 4, 4, 64, 0.751
10, 5, 0, 64, 0.76
10, 5, 5, 64, 0.76
10, 6, 0, 64, 0.726
10, 6, 6, 64, 0.718
10, 7, 0, 64, 0.724
10, 7, 7, 64, 0.72
6, 0, 0, 0, 0.415
6, 0, 0, 1, 0.423
6, 0, 1, 1, 0.412
6, 0, 0, 2, 0.433
6, 0, 2, 2, 0.434
6, 0, 0, 3, 0.427
6, 0, 3, 3, 0.428
6, 0, 0, 4, 0.465
6, 0, 4, 4, 0.466
6, 0, 0, 5, 0.463
6, 0, 5, 5, 0.468
6, 0, 0, 6, 0.435
6, 0, 6, 6, 0.444
6, 0, 0, 7, 0.41
6, 0, 7, 7, 0.42
6, 0, 0, 8, 0.474
6, 0, 8, 8, 0.501
6, 0, 0, 9, 0.471
6, 0, 9, 9, 0.489
6, 0, 0, 10, 0.462
6, 0, 10, 10, 0.46
6, 0, 0, 11, 0.459
6, 0, 11, 11, 0.458
6, 0, 0, 12, 0.516
6, 0, 12, 12, 0.51
6, 0, 0, 13, 0.494
6, 0, 13, 13, 0.524
6, 0, 0, 14, 0.486
6, 0, 14, 14, 0.5
6, 0, 0, 15, 0.48
6, 0, 15, 15, 0.501
6, 0, 0, 16, 0.54
6, 0, 16, 16, 0.538
6, 0, 0, 17, 0.503
6, 0, 17, 17, 0.541
6, 0, 0, 18, 0.537
6, 0, 18, 18, 0.549
6, 0, 0, 19, 0.527
6, 0, 19, 19, 0.537
6, 0, 0, 20, 0.539
6, 0, 20, 20, 0.554
6, 0, 0, 21, 0.558
6, 0, 21, 21, 0.541
6, 0, 0, 22, 0.546
6, 0, 22, 22, 0.561
6, 0, 0, 23, 0.54
6, 0, 23, 23, 0.536
6, 0, 0, 24, 0.565
6, 0, 24, 24, 0.584
6, 0, 0, 25, 0.563
6, 0, 25, 25, 0.58
6, 0, 0, 26, 0.555
6, 0, 26, 26, 0.584
6, 0, 0, 27, 0.569
6, 0, 27, 27, 0.587
6, 0, 0, 28, 0.612
6, 0, 28, 28, 0.623
6, 0, 0, 29, 0.604
6, 0, 29, 29, 0.621
6, 0, 0, 30, 0.59
6, 0, 30, 30, 0.609
6, 0, 0, 31, 0.577
6, 0, 31, 31, 0.588
6, 0, 0, 32, 0.621
6, 0, 32, 32, 0.608
6, 0, 0, 33, 0.601
6, 0, 33, 33, 0.623
6, 0, 0, 34, 0.614
6, 0, 34, 34, 0.615
6, 0, 0, 35, 0.598
6, 0, 35, 35, 0.608
6, 0, 0, 36, 0.626
6, 0, 36, 36, 0.634
6, 0, 0, 37, 0.62
6, 0, 37, 37, 0.634
6, 0, 0, 38, 0.612
6, 0, 38, 38, 0.637
6, 0, 0, 39, 0.627
6, 0, 39, 39, 0.612
6, 0, 0, 40, 0.661
6, 0, 40, 40, 0.674
6, 0, 0, 41, 0.633
6, 0, 41, 41, 0.643
6, 0, 0, 42, 0.634
6, 0, 42, 42, 0.636
6, 0, 0, 43, 0.619
6, 0, 43, 43, 0.625
6, 0, 0, 44, 0.654
6, 0, 44, 44, 0.654
6, 0, 0, 45, 0.647
6, 0, 45, 45, 0.649
6, 0, 0, 46, 0.651
6, 0, 46, 46, 0.651
6, 0, 0, 47, 0.646
6, 0, 47, 47, 0.648
6, 0, 0, 48, 0.662
6, 0, 48, 48, 0.664
6, 0, 0, 49, 0.68
6, 0, 49, 49, 0.667
6, 0, 0, 50, 0.654
6, 0, 50, 50, 0.659
6, 0, 0, 51, 0.638
6, 0, 51, 51, 0.639
6, 0, 0, 52, 0.665
6, 0, 52, 52, 0.669
6, 0, 0, 53, 0.658
6, 0, 53, 53, 0.656
6, 0, 0, 54, 0.669
6, 0, 54, 54, 0.67
6, 0, 0, 55, 0.668
6, 0, 55, 55, 0.664
6, 0, 0, 56, 0.701
6, 0, 56, 56, 0.695
6, 0, 0, 57, 0.687
6, 0, 57, 57, 0.696
6, 0, 0, 58, 0.693
6, 0, 58, 58, 0.704
6, 0, 0, 59, 0.695
6, 0, 59, 59, 0.708
6, 0, 0, 60, 0.708
6, 0, 60, 60, 0.728
6, 0, 0, 61, 0.708
6, 0, 61, 61, 0.71
6, 0, 0, 62, 0.715
6, 0, 62, 62, 0.705
6, 0, 0, 63, 0.677
6, 0, 63, 63, 0.702
.../{strcspn-sse2.S => strcspn-sse2.c} | 8 +-
sysdeps/x86_64/strcspn.S | 119 ------------------
2 files changed, 4 insertions(+), 123 deletions(-)
rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strcspn-sse2.c} (85%)
delete mode 100644 sysdeps/x86_64/strcspn.S
diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c
similarity index 85%
rename from sysdeps/x86_64/multiarch/strcspn-sse2.S
rename to sysdeps/x86_64/multiarch/strcspn-sse2.c
index f97e856e1f..3a04bb39fc 100644
--- a/sysdeps/x86_64/multiarch/strcspn-sse2.S
+++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c
@@ -1,4 +1,4 @@
-/* strcspn optimized with SSE2.
+/* strcspn.
Copyright (C) 2017-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -19,10 +19,10 @@
#if IS_IN (libc)
# include <sysdep.h>
-# define strcspn __strcspn_sse2
+# define STRCSPN __strcspn_sse2
# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strcspn)
+# define libc_hidden_builtin_def(STRCSPN)
#endif
-#include <sysdeps/x86_64/strcspn.S>
+#include <string/strcspn.c>
diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S
deleted file mode 100644
index f3cd86c606..0000000000
--- a/sysdeps/x86_64/strcspn.S
+++ /dev/null
@@ -1,119 +0,0 @@
-/* strcspn (str, ss) -- Return the length of the initial segment of STR
- which contains no characters from SS.
- For AMD x86-64.
- Copyright (C) 1994-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
- .text
-ENTRY (strcspn)
-
- movq %rdi, %rdx /* Save SRC. */
-
- /* First we create a table with flags for all possible characters.
- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
- supported by the C string functions we have 256 characters.
- Before inserting marks for the stop characters we clear the whole
- table. */
- movq %rdi, %r8 /* Save value. */
- subq $256, %rsp /* Make space for 256 bytes. */
- cfi_adjust_cfa_offset(256)
- movl $32, %ecx /* 32*8 bytes = 256 bytes. */
- movq %rsp, %rdi
- xorl %eax, %eax /* We store 0s. */
- cld
- rep
- stosq
-
- movq %rsi, %rax /* Setup skipset. */
-
-/* For understanding the following code remember that %rcx == 0 now.
- Although all the following instruction only modify %cl we always
- have a correct zero-extended 64-bit value in %rcx. */
-
- .p2align 4
-L(2): movb (%rax), %cl /* get byte from skipset */
- testb %cl, %cl /* is NUL char? */
- jz L(1) /* yes => start compare loop */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
-
- movb 1(%rax), %cl /* get byte from skipset */
- testb $0xff, %cl /* is NUL char? */
- jz L(1) /* yes => start compare loop */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
-
- movb 2(%rax), %cl /* get byte from skipset */
- testb $0xff, %cl /* is NUL char? */
- jz L(1) /* yes => start compare loop */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
-
- movb 3(%rax), %cl /* get byte from skipset */
- addq $4, %rax /* increment skipset pointer */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
- testb $0xff, %cl /* is NUL char? */
- jnz L(2) /* no => process next dword from skipset */
-
-L(1): leaq -4(%rdx), %rax /* prepare loop */
-
- /* We use a neat trick for the following loop. Normally we would
- have to test for two termination conditions
- 1. a character in the skipset was found
- and
- 2. the end of the string was found
- But as a sign that the character is in the skipset we store its
- value in the table. But the value of NUL is NUL so the loop
- terminates for NUL in every case. */
-
- .p2align 4
-L(3): addq $4, %rax /* adjust pointer for full loop round */
-
- movb (%rax), %cl /* get byte from string */
- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- je L(4) /* yes => return */
-
- movb 1(%rax), %cl /* get byte from string */
- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- je L(5) /* yes => return */
-
- movb 2(%rax), %cl /* get byte from string */
- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- jz L(6) /* yes => return */
-
- movb 3(%rax), %cl /* get byte from string */
- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- jne L(3) /* no => start loop again */
-
- incq %rax /* adjust pointer */
-L(6): incq %rax
-L(5): incq %rax
-
-L(4): addq $256, %rsp /* remove skipset */
- cfi_adjust_cfa_offset(-256)
-#ifdef USE_AS_STRPBRK
- xorl %edx,%edx
- orb %cl, %cl /* was last character NUL? */
- cmovzq %rdx, %rax /* Yes: return NULL */
-#else
- subq %rdx, %rax /* we have to return the number of valid
- characters, so compute distance to first
- non-valid character */
-#endif
- ret
-END (strcspn)
-libc_hidden_builtin_def (strcspn)
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 10/23] x86: Remove strpbrk-sse2.S and use the generic implementation
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (7 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 18:57 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 11/23] x86: Remove strspn-sse2.S " Noah Goldstein
` (13 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
The generic implementation is faster (see strcspn commit).
All string/memory tests pass.
---
.../x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} | 9 ++++-----
sysdeps/x86_64/strpbrk.S | 3 ---
2 files changed, 4 insertions(+), 8 deletions(-)
rename sysdeps/x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} (84%)
delete mode 100644 sysdeps/x86_64/strpbrk.S
diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
similarity index 84%
rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S
rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c
index d537b6c27b..d03214c4fb 100644
--- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S
+++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
@@ -1,4 +1,4 @@
-/* strpbrk optimized with SSE2.
+/* strpbrk.
Copyright (C) 2017-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -19,11 +19,10 @@
#if IS_IN (libc)
# include <sysdep.h>
-# define strcspn __strpbrk_sse2
+# define STRPBRK __strpbrk_sse2
# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strpbrk)
+# define libc_hidden_builtin_def(STRPBRK)
#endif
-#define USE_AS_STRPBRK
-#include <sysdeps/x86_64/strcspn.S>
+#include <string/strpbrk.c>
diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S
deleted file mode 100644
index 21888a5b92..0000000000
--- a/sysdeps/x86_64/strpbrk.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define strcspn strpbrk
-#define USE_AS_STRPBRK
-#include <sysdeps/x86_64/strcspn.S>
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 11/23] x86: Remove strspn-sse2.S and use the generic implementation
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (8 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 10/23] x86: Remove strpbrk-sse2.S " Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 18:57 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] Noah Goldstein
` (12 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
The generic implementation is faster.
geometric_mean(N=20) of all benchmarks New / Original: .710
All string/memory tests pass.
---
Geometric Mean N=20 runs; All functions page aligned
len, align1, align2, pos, New Time / Old Time
1, 0, 0, 512, 0.824
1, 1, 0, 512, 1.018
1, 0, 1, 512, 0.986
1, 1, 1, 512, 1.092
2, 0, 0, 512, 0.86
2, 2, 0, 512, 0.868
2, 0, 2, 512, 0.858
2, 2, 2, 512, 0.857
3, 0, 0, 512, 0.836
3, 3, 0, 512, 0.849
3, 0, 3, 512, 0.84
3, 3, 3, 512, 0.85
4, 0, 0, 512, 0.843
4, 4, 0, 512, 0.837
4, 0, 4, 512, 0.835
4, 4, 4, 512, 0.846
5, 0, 0, 512, 0.852
5, 5, 0, 512, 0.848
5, 0, 5, 512, 0.85
5, 5, 5, 512, 0.85
6, 0, 0, 512, 0.853
6, 6, 0, 512, 0.855
6, 0, 6, 512, 0.853
6, 6, 6, 512, 0.853
7, 0, 0, 512, 0.857
7, 7, 0, 512, 0.861
7, 0, 7, 512, 0.94
7, 7, 7, 512, 0.856
8, 0, 0, 512, 0.927
8, 0, 8, 512, 0.965
9, 0, 0, 512, 0.967
9, 1, 0, 512, 0.976
9, 0, 9, 512, 0.887
9, 1, 9, 512, 0.881
10, 0, 0, 512, 0.853
10, 2, 0, 512, 0.846
10, 0, 10, 512, 0.855
10, 2, 10, 512, 0.849
11, 0, 0, 512, 0.854
11, 3, 0, 512, 0.855
11, 0, 11, 512, 0.85
11, 3, 11, 512, 0.854
12, 0, 0, 512, 0.864
12, 4, 0, 512, 0.864
12, 0, 12, 512, 0.867
12, 4, 12, 512, 0.87
13, 0, 0, 512, 0.853
13, 5, 0, 512, 0.841
13, 0, 13, 512, 0.837
13, 5, 13, 512, 0.85
14, 0, 0, 512, 0.838
14, 6, 0, 512, 0.842
14, 0, 14, 512, 0.818
14, 6, 14, 512, 0.845
15, 0, 0, 512, 0.799
15, 7, 0, 512, 0.847
15, 0, 15, 512, 0.787
15, 7, 15, 512, 0.84
16, 0, 0, 512, 0.824
16, 0, 16, 512, 0.827
17, 0, 0, 512, 0.817
17, 1, 0, 512, 0.823
17, 0, 17, 512, 0.82
17, 1, 17, 512, 0.814
18, 0, 0, 512, 0.81
18, 2, 0, 512, 0.833
18, 0, 18, 512, 0.811
18, 2, 18, 512, 0.842
19, 0, 0, 512, 0.823
19, 3, 0, 512, 0.818
19, 0, 19, 512, 0.821
19, 3, 19, 512, 0.824
20, 0, 0, 512, 0.814
20, 4, 0, 512, 0.818
20, 0, 20, 512, 0.806
20, 4, 20, 512, 0.802
21, 0, 0, 512, 0.835
21, 5, 0, 512, 0.839
21, 0, 21, 512, 0.842
21, 5, 21, 512, 0.82
22, 0, 0, 512, 0.824
22, 6, 0, 512, 0.831
22, 0, 22, 512, 0.819
22, 6, 22, 512, 0.824
23, 0, 0, 512, 0.816
23, 7, 0, 512, 0.856
23, 0, 23, 512, 0.808
23, 7, 23, 512, 0.848
24, 0, 0, 512, 0.88
24, 0, 24, 512, 0.846
25, 0, 0, 512, 0.929
25, 1, 0, 512, 0.917
25, 0, 25, 512, 0.884
25, 1, 25, 512, 0.859
26, 0, 0, 512, 0.919
26, 2, 0, 512, 0.867
26, 0, 26, 512, 0.914
26, 2, 26, 512, 0.845
27, 0, 0, 512, 0.919
27, 3, 0, 512, 0.864
27, 0, 27, 512, 0.917
27, 3, 27, 512, 0.847
28, 0, 0, 512, 0.905
28, 4, 0, 512, 0.896
28, 0, 28, 512, 0.898
28, 4, 28, 512, 0.871
29, 0, 0, 512, 0.911
29, 5, 0, 512, 0.91
29, 0, 29, 512, 0.905
29, 5, 29, 512, 0.884
30, 0, 0, 512, 0.907
30, 6, 0, 512, 0.802
30, 0, 30, 512, 0.906
30, 6, 30, 512, 0.818
31, 0, 0, 512, 0.907
31, 7, 0, 512, 0.821
31, 0, 31, 512, 0.89
31, 7, 31, 512, 0.787
4, 0, 0, 32, 0.623
4, 1, 0, 32, 0.606
4, 0, 1, 32, 0.6
4, 1, 1, 32, 0.603
4, 0, 0, 64, 0.731
4, 2, 0, 64, 0.733
4, 0, 2, 64, 0.734
4, 2, 2, 64, 0.755
4, 0, 0, 128, 0.822
4, 3, 0, 128, 0.873
4, 0, 3, 128, 0.89
4, 3, 3, 128, 0.907
4, 0, 0, 256, 0.827
4, 4, 0, 256, 0.811
4, 0, 4, 256, 0.794
4, 4, 4, 256, 0.814
4, 5, 0, 512, 0.841
4, 0, 5, 512, 0.831
4, 5, 5, 512, 0.845
4, 0, 0, 1024, 0.861
4, 6, 0, 1024, 0.857
4, 0, 6, 1024, 0.9
4, 6, 6, 1024, 0.861
4, 0, 0, 2048, 0.879
4, 7, 0, 2048, 0.875
4, 0, 7, 2048, 0.883
4, 7, 7, 2048, 0.88
10, 1, 0, 64, 0.747
10, 1, 1, 64, 0.743
10, 2, 0, 64, 0.732
10, 2, 2, 64, 0.729
10, 3, 0, 64, 0.747
10, 3, 3, 64, 0.733
10, 4, 0, 64, 0.74
10, 4, 4, 64, 0.751
10, 5, 0, 64, 0.735
10, 5, 5, 64, 0.746
10, 6, 0, 64, 0.735
10, 6, 6, 64, 0.733
10, 7, 0, 64, 0.734
10, 7, 7, 64, 0.74
6, 0, 0, 0, 0.377
6, 0, 0, 1, 0.369
6, 0, 1, 1, 0.383
6, 0, 0, 2, 0.391
6, 0, 2, 2, 0.394
6, 0, 0, 3, 0.416
6, 0, 3, 3, 0.411
6, 0, 0, 4, 0.475
6, 0, 4, 4, 0.483
6, 0, 0, 5, 0.473
6, 0, 5, 5, 0.476
6, 0, 0, 6, 0.459
6, 0, 6, 6, 0.445
6, 0, 0, 7, 0.433
6, 0, 7, 7, 0.432
6, 0, 0, 8, 0.492
6, 0, 8, 8, 0.494
6, 0, 0, 9, 0.476
6, 0, 9, 9, 0.483
6, 0, 0, 10, 0.46
6, 0, 10, 10, 0.476
6, 0, 0, 11, 0.463
6, 0, 11, 11, 0.463
6, 0, 0, 12, 0.511
6, 0, 12, 12, 0.515
6, 0, 0, 13, 0.506
6, 0, 13, 13, 0.536
6, 0, 0, 14, 0.496
6, 0, 14, 14, 0.484
6, 0, 0, 15, 0.473
6, 0, 15, 15, 0.475
6, 0, 0, 16, 0.534
6, 0, 16, 16, 0.534
6, 0, 0, 17, 0.525
6, 0, 17, 17, 0.523
6, 0, 0, 18, 0.522
6, 0, 18, 18, 0.524
6, 0, 0, 19, 0.512
6, 0, 19, 19, 0.514
6, 0, 0, 20, 0.535
6, 0, 20, 20, 0.54
6, 0, 0, 21, 0.543
6, 0, 21, 21, 0.536
6, 0, 0, 22, 0.542
6, 0, 22, 22, 0.542
6, 0, 0, 23, 0.529
6, 0, 23, 23, 0.53
6, 0, 0, 24, 0.596
6, 0, 24, 24, 0.589
6, 0, 0, 25, 0.583
6, 0, 25, 25, 0.58
6, 0, 0, 26, 0.574
6, 0, 26, 26, 0.58
6, 0, 0, 27, 0.575
6, 0, 27, 27, 0.558
6, 0, 0, 28, 0.606
6, 0, 28, 28, 0.606
6, 0, 0, 29, 0.589
6, 0, 29, 29, 0.595
6, 0, 0, 30, 0.592
6, 0, 30, 30, 0.585
6, 0, 0, 31, 0.585
6, 0, 31, 31, 0.579
6, 0, 0, 32, 0.625
6, 0, 32, 32, 0.615
6, 0, 0, 33, 0.615
6, 0, 33, 33, 0.61
6, 0, 0, 34, 0.604
6, 0, 34, 34, 0.6
6, 0, 0, 35, 0.602
6, 0, 35, 35, 0.608
6, 0, 0, 36, 0.644
6, 0, 36, 36, 0.644
6, 0, 0, 37, 0.658
6, 0, 37, 37, 0.651
6, 0, 0, 38, 0.644
6, 0, 38, 38, 0.649
6, 0, 0, 39, 0.626
6, 0, 39, 39, 0.632
6, 0, 0, 40, 0.662
6, 0, 40, 40, 0.661
6, 0, 0, 41, 0.656
6, 0, 41, 41, 0.655
6, 0, 0, 42, 0.643
6, 0, 42, 42, 0.637
6, 0, 0, 43, 0.622
6, 0, 43, 43, 0.628
6, 0, 0, 44, 0.673
6, 0, 44, 44, 0.687
6, 0, 0, 45, 0.661
6, 0, 45, 45, 0.659
6, 0, 0, 46, 0.657
6, 0, 46, 46, 0.653
6, 0, 0, 47, 0.658
6, 0, 47, 47, 0.65
6, 0, 0, 48, 0.678
6, 0, 48, 48, 0.683
6, 0, 0, 49, 0.676
6, 0, 49, 49, 0.661
6, 0, 0, 50, 0.672
6, 0, 50, 50, 0.662
6, 0, 0, 51, 0.656
6, 0, 51, 51, 0.659
6, 0, 0, 52, 0.682
6, 0, 52, 52, 0.686
6, 0, 0, 53, 0.67
6, 0, 53, 53, 0.674
6, 0, 0, 54, 0.663
6, 0, 54, 54, 0.675
6, 0, 0, 55, 0.662
6, 0, 55, 55, 0.665
6, 0, 0, 56, 0.681
6, 0, 56, 56, 0.697
6, 0, 0, 57, 0.686
6, 0, 57, 57, 0.687
6, 0, 0, 58, 0.701
6, 0, 58, 58, 0.693
6, 0, 0, 59, 0.709
6, 0, 59, 59, 0.698
6, 0, 0, 60, 0.708
6, 0, 60, 60, 0.708
6, 0, 0, 61, 0.709
6, 0, 61, 61, 0.716
6, 0, 0, 62, 0.709
6, 0, 62, 62, 0.707
6, 0, 0, 63, 0.703
6, 0, 63, 63, 0.716
.../{strspn-sse2.S => strspn-sse2.c} | 8 +-
sysdeps/x86_64/strspn.S | 112 ------------------
2 files changed, 4 insertions(+), 116 deletions(-)
rename sysdeps/x86_64/multiarch/{strspn-sse2.S => strspn-sse2.c} (86%)
delete mode 100644 sysdeps/x86_64/strspn.S
diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c
similarity index 86%
rename from sysdeps/x86_64/multiarch/strspn-sse2.S
rename to sysdeps/x86_64/multiarch/strspn-sse2.c
index e0a095f25a..61cc6cb0a5 100644
--- a/sysdeps/x86_64/multiarch/strspn-sse2.S
+++ b/sysdeps/x86_64/multiarch/strspn-sse2.c
@@ -1,4 +1,4 @@
-/* strspn optimized with SSE2.
+/* strspn.
Copyright (C) 2017-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -19,10 +19,10 @@
#if IS_IN (libc)
# include <sysdep.h>
-# define strspn __strspn_sse2
+# define STRSPN __strspn_sse2
# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strspn)
+# define libc_hidden_builtin_def(STRSPN)
#endif
-#include <sysdeps/x86_64/strspn.S>
+#include <string/strspn.c>
diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S
deleted file mode 100644
index 61b76ee0a1..0000000000
--- a/sysdeps/x86_64/strspn.S
+++ /dev/null
@@ -1,112 +0,0 @@
-/* strspn (str, ss) -- Return the length of the initial segment of STR
- which contains only characters from SS.
- For AMD x86-64.
- Copyright (C) 1994-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
- .text
-ENTRY (strspn)
-
- movq %rdi, %rdx /* Save SRC. */
-
- /* First we create a table with flags for all possible characters.
- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
- supported by the C string functions we have 256 characters.
- Before inserting marks for the stop characters we clear the whole
- table. */
- movq %rdi, %r8 /* Save value. */
- subq $256, %rsp /* Make space for 256 bytes. */
- cfi_adjust_cfa_offset(256)
- movl $32, %ecx /* 32*8 bytes = 256 bytes. */
- movq %rsp, %rdi
- xorl %eax, %eax /* We store 0s. */
- cld
- rep
- stosq
-
- movq %rsi, %rax /* Setup stopset. */
-
-/* For understanding the following code remember that %rcx == 0 now.
- Although all the following instruction only modify %cl we always
- have a correct zero-extended 64-bit value in %rcx. */
-
- .p2align 4
-L(2): movb (%rax), %cl /* get byte from stopset */
- testb %cl, %cl /* is NUL char? */
- jz L(1) /* yes => start compare loop */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
-
- movb 1(%rax), %cl /* get byte from stopset */
- testb $0xff, %cl /* is NUL char? */
- jz L(1) /* yes => start compare loop */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
-
- movb 2(%rax), %cl /* get byte from stopset */
- testb $0xff, %cl /* is NUL char? */
- jz L(1) /* yes => start compare loop */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
-
- movb 3(%rax), %cl /* get byte from stopset */
- addq $4, %rax /* increment stopset pointer */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
- testb $0xff, %cl /* is NUL char? */
- jnz L(2) /* no => process next dword from stopset */
-
-L(1): leaq -4(%rdx), %rax /* prepare loop */
-
- /* We use a neat trick for the following loop. Normally we would
- have to test for two termination conditions
- 1. a character in the stopset was found
- and
- 2. the end of the string was found
- But as a sign that the character is in the stopset we store its
- value in the table. But the value of NUL is NUL so the loop
- terminates for NUL in every case. */
-
- .p2align 4
-L(3): addq $4, %rax /* adjust pointer for full loop round */
-
- movb (%rax), %cl /* get byte from string */
- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- jz L(4) /* no => return */
-
- movb 1(%rax), %cl /* get byte from string */
- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- jz L(5) /* no => return */
-
- movb 2(%rax), %cl /* get byte from string */
- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- jz L(6) /* no => return */
-
- movb 3(%rax), %cl /* get byte from string */
- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- jnz L(3) /* yes => start loop again */
-
- incq %rax /* adjust pointer */
-L(6): incq %rax
-L(5): incq %rax
-
-L(4): addq $256, %rsp /* remove stopset */
- cfi_adjust_cfa_offset(-256)
- subq %rdx, %rax /* we have to return the number of valid
- characters, so compute distance to first
- non-valid character */
- ret
-END (strspn)
-libc_hidden_builtin_def (strspn)
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (9 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 11/23] x86: Remove strspn-sse2.S " Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 18:59 ` H.J. Lu
2022-03-24 20:50 ` [PATCH v2 12/31] " Noah Goldstein
2022-03-23 21:57 ` [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c Noah Goldstein
` (11 subsequent siblings)
22 siblings, 2 replies; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
__wcscmp_avx2.
All string/memory tests pass.
---
sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 52ff5ad724..86a86b68e3 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -122,7 +122,7 @@ ENTRY(STRCMP)
are cases where length is large enough that it can never be a
bound on valid memory so just use wcscmp. */
shrq $56, %rcx
- jnz __wcscmp_avx2
+ jnz OVERFLOW_STRCMP
leaq (, %rdx, 4), %rdx
# endif
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (10 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 19:00 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c Noah Goldstein
` (10 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Just QOL change to make parsing the output of the benchtests more
consistent.
---
benchtests/bench-strcasecmp.c | 77 +++++++++++++++++++++++------------
1 file changed, 51 insertions(+), 26 deletions(-)
diff --git a/benchtests/bench-strcasecmp.c b/benchtests/bench-strcasecmp.c
index daccf1d245..855f2db2ad 100644
--- a/benchtests/bench-strcasecmp.c
+++ b/benchtests/bench-strcasecmp.c
@@ -20,6 +20,7 @@
#define TEST_MAIN
#define TEST_NAME "strcasecmp"
#include "bench-string.h"
+#include "json-lib.h"
typedef int (*proto_t) (const char *, const char *);
static int simple_strcasecmp (const char *, const char *);
@@ -40,7 +41,8 @@ simple_strcasecmp (const char *s1, const char *s2)
}
static void
-do_one_test (impl_t *impl, const char *s1, const char *s2, int exp_result)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const char *s1,
+ const char *s2, int exp_result)
{
size_t i, iters = INNER_LOOP_ITERS;
timing_t start, stop, cur;
@@ -64,12 +66,12 @@ do_one_test (impl_t *impl, const char *s1, const char *s2, int exp_result)
TIMING_DIFF (cur, start, stop);
- TIMING_PRINT_MEAN ((double) cur, (double) iters);
+ json_element_double (json_ctx, (double) cur / (double) iters);
}
static void
-do_test (size_t align1, size_t align2, size_t len, int max_char,
- int exp_result)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
+ int max_char, int exp_result)
{
size_t i;
char *s1, *s2;
@@ -85,6 +87,13 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
if (align2 + len + 1 >= page_size)
return;
+ json_element_object_begin (json_ctx);
+ json_attr_uint (json_ctx, "length", len);
+ json_attr_uint (json_ctx, "align1", align1);
+ json_attr_uint (json_ctx, "align2", align2);
+ json_attr_uint (json_ctx, "max_char", max_char);
+ json_array_begin (json_ctx, "timings");
+
s1 = (char *) (buf1 + align1);
s2 = (char *) (buf2 + align2);
@@ -103,53 +112,69 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
else
s2[len - 1] -= exp_result;
- printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
-
FOR_EACH_IMPL (impl, 0)
- do_one_test (impl, s1, s2, exp_result);
+ do_one_test (json_ctx, impl, s1, s2, exp_result);
- putchar ('\n');
+ json_array_end (json_ctx);
+ json_element_object_end (json_ctx);
}
int
test_main (void)
{
+ json_ctx_t json_ctx;
size_t i;
test_init ();
- printf ("%23s", "");
+ json_init (&json_ctx, 0, stdout);
+
+ json_document_begin (&json_ctx);
+ json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+ json_attr_object_begin (&json_ctx, "functions");
+ json_attr_object_begin (&json_ctx, TEST_NAME);
+ json_attr_string (&json_ctx, "bench-variant", "");
+
+ json_array_begin (&json_ctx, "ifuncs");
FOR_EACH_IMPL (impl, 0)
- printf ("\t%s", impl->name);
- putchar ('\n');
+ json_element_string (&json_ctx, impl->name);
+ json_array_end (&json_ctx);
+
+ json_array_begin (&json_ctx, "results");
for (i = 1; i < 16; ++i)
{
- do_test (i, i, i, 127, 0);
- do_test (i, i, i, 127, 1);
- do_test (i, i, i, 127, -1);
+ do_test (&json_ctx, i, i, i, 127, 0);
+ do_test (&json_ctx, i, i, i, 127, 1);
+ do_test (&json_ctx, i, i, i, 127, -1);
}
for (i = 1; i < 10; ++i)
{
- do_test (0, 0, 2 << i, 127, 0);
- do_test (0, 0, 2 << i, 254, 0);
- do_test (0, 0, 2 << i, 127, 1);
- do_test (0, 0, 2 << i, 254, 1);
- do_test (0, 0, 2 << i, 127, -1);
- do_test (0, 0, 2 << i, 254, -1);
+ do_test (&json_ctx, 0, 0, 2 << i, 127, 0);
+ do_test (&json_ctx, 0, 0, 2 << i, 254, 0);
+ do_test (&json_ctx, 0, 0, 2 << i, 127, 1);
+ do_test (&json_ctx, 0, 0, 2 << i, 254, 1);
+ do_test (&json_ctx, 0, 0, 2 << i, 127, -1);
+ do_test (&json_ctx, 0, 0, 2 << i, 254, -1);
}
for (i = 1; i < 8; ++i)
{
- do_test (i, 2 * i, 8 << i, 127, 0);
- do_test (2 * i, i, 8 << i, 254, 0);
- do_test (i, 2 * i, 8 << i, 127, 1);
- do_test (2 * i, i, 8 << i, 254, 1);
- do_test (i, 2 * i, 8 << i, 127, -1);
- do_test (2 * i, i, 8 << i, 254, -1);
+ do_test (&json_ctx, i, 2 * i, 8 << i, 127, 0);
+ do_test (&json_ctx, 2 * i, i, 8 << i, 254, 0);
+ do_test (&json_ctx, i, 2 * i, 8 << i, 127, 1);
+ do_test (&json_ctx, 2 * i, i, 8 << i, 254, 1);
+ do_test (&json_ctx, i, 2 * i, 8 << i, 127, -1);
+ do_test (&json_ctx, 2 * i, i, 8 << i, 254, -1);
}
+ json_array_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_document_end (&json_ctx);
+
return ret;
}
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (11 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 19:00 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c Noah Goldstein
` (9 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Just QOL change to make parsing the output of the benchtests more
consistent.
---
benchtests/bench-strncasecmp.c | 113 ++++++++++++++++++++-------------
1 file changed, 69 insertions(+), 44 deletions(-)
diff --git a/benchtests/bench-strncasecmp.c b/benchtests/bench-strncasecmp.c
index a9819efc73..91f49cc8d3 100644
--- a/benchtests/bench-strncasecmp.c
+++ b/benchtests/bench-strncasecmp.c
@@ -20,6 +20,7 @@
#define TEST_MAIN
#define TEST_NAME "strncasecmp"
#include "bench-string.h"
+#include "json-lib.h"
typedef int (*proto_t) (const char *, const char *, size_t);
static int simple_strncasecmp (const char *, const char *, size_t);
@@ -47,8 +48,8 @@ simple_strncasecmp (const char *s1, const char *s2, size_t n)
}
static void
-do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n,
- int exp_result)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const char *s1,
+ const char *s2, size_t n, int exp_result)
{
size_t i, iters = INNER_LOOP_ITERS;
timing_t start, stop, cur;
@@ -62,12 +63,12 @@ do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n,
TIMING_DIFF (cur, start, stop);
- TIMING_PRINT_MEAN ((double) cur, (double) iters);
+ json_element_double (json_ctx, (double) cur / (double) iters);
}
static void
-do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
- int exp_result)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t n,
+ size_t len, int max_char, int exp_result)
{
size_t i;
char *s1, *s2;
@@ -101,83 +102,107 @@ do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
else
s2[len - 1] -= exp_result;
- printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
+ json_element_object_begin (json_ctx);
+ json_attr_uint (json_ctx, "length", len);
+ json_attr_uint (json_ctx, "n", n);
+ json_attr_uint (json_ctx, "align1", align1);
+ json_attr_uint (json_ctx, "align2", align2);
+ json_attr_uint (json_ctx, "max_char", max_char);
+ json_array_begin (json_ctx, "timings");
FOR_EACH_IMPL (impl, 0)
- do_one_test (impl, s1, s2, n, exp_result);
+ do_one_test (json_ctx, impl, s1, s2, n, exp_result);
- putchar ('\n');
+ json_array_end (json_ctx);
+ json_element_object_end (json_ctx);
}
int
test_main (void)
{
+ json_ctx_t json_ctx;
size_t i;
test_init ();
- printf ("%23s", "");
+ json_init (&json_ctx, 0, stdout);
+
+ json_document_begin (&json_ctx);
+ json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+ json_attr_object_begin (&json_ctx, "functions");
+ json_attr_object_begin (&json_ctx, TEST_NAME);
+ json_attr_string (&json_ctx, "bench-variant", "");
+
+ json_array_begin (&json_ctx, "ifuncs");
FOR_EACH_IMPL (impl, 0)
- printf ("\t%s", impl->name);
- putchar ('\n');
+ json_element_string (&json_ctx, impl->name);
+ json_array_end (&json_ctx);
+
+ json_array_begin (&json_ctx, "results");
for (i = 1; i < 16; ++i)
{
- do_test (i, i, i - 1, i, 127, 0);
+ do_test (&json_ctx, i, i, i - 1, i, 127, 0);
- do_test (i, i, i, i, 127, 0);
- do_test (i, i, i, i, 127, 1);
- do_test (i, i, i, i, 127, -1);
+ do_test (&json_ctx, i, i, i, i, 127, 0);
+ do_test (&json_ctx, i, i, i, i, 127, 1);
+ do_test (&json_ctx, i, i, i, i, 127, -1);
- do_test (i, i, i + 1, i, 127, 0);
- do_test (i, i, i + 1, i, 127, 1);
- do_test (i, i, i + 1, i, 127, -1);
+ do_test (&json_ctx, i, i, i + 1, i, 127, 0);
+ do_test (&json_ctx, i, i, i + 1, i, 127, 1);
+ do_test (&json_ctx, i, i, i + 1, i, 127, -1);
}
for (i = 1; i < 10; ++i)
{
- do_test (0, 0, (2 << i) - 1, 2 << i, 127, 0);
- do_test (0, 0, 2 << i, 2 << i, 254, 0);
- do_test (0, 0, (2 << i) + 1, 2 << i, 127, 0);
+ do_test (&json_ctx, 0, 0, (2 << i) - 1, 2 << i, 127, 0);
+ do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 254, 0);
+ do_test (&json_ctx, 0, 0, (2 << i) + 1, 2 << i, 127, 0);
- do_test (0, 0, (2 << i) + 1, 2 << i, 254, 0);
+ do_test (&json_ctx, 0, 0, (2 << i) + 1, 2 << i, 254, 0);
- do_test (0, 0, 2 << i, 2 << i, 127, 1);
- do_test (0, 0, (2 << i) + 10, 2 << i, 127, 1);
+ do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 127, 1);
+ do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 127, 1);
- do_test (0, 0, 2 << i, 2 << i, 254, 1);
- do_test (0, 0, (2 << i) + 10, 2 << i, 254, 1);
+ do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 254, 1);
+ do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 254, 1);
- do_test (0, 0, 2 << i, 2 << i, 127, -1);
- do_test (0, 0, (2 << i) + 10, 2 << i, 127, -1);
+ do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 127, -1);
+ do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 127, -1);
- do_test (0, 0, 2 << i, 2 << i, 254, -1);
- do_test (0, 0, (2 << i) + 10, 2 << i, 254, -1);
+ do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 254, -1);
+ do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 254, -1);
}
for (i = 1; i < 8; ++i)
{
- do_test (i, 2 * i, (8 << i) - 1, 8 << i, 127, 0);
- do_test (i, 2 * i, 8 << i, 8 << i, 127, 0);
- do_test (i, 2 * i, (8 << i) + 100, 8 << i, 127, 0);
+ do_test (&json_ctx, i, 2 * i, (8 << i) - 1, 8 << i, 127, 0);
+ do_test (&json_ctx, i, 2 * i, 8 << i, 8 << i, 127, 0);
+ do_test (&json_ctx, i, 2 * i, (8 << i) + 100, 8 << i, 127, 0);
- do_test (2 * i, i, (8 << i) - 1, 8 << i, 254, 0);
- do_test (2 * i, i, 8 << i, 8 << i, 254, 0);
- do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, 0);
+ do_test (&json_ctx, 2 * i, i, (8 << i) - 1, 8 << i, 254, 0);
+ do_test (&json_ctx, 2 * i, i, 8 << i, 8 << i, 254, 0);
+ do_test (&json_ctx, 2 * i, i, (8 << i) + 100, 8 << i, 254, 0);
- do_test (i, 2 * i, 8 << i, 8 << i, 127, 1);
- do_test (i, 2 * i, (8 << i) + 100, 8 << i, 127, 1);
+ do_test (&json_ctx, i, 2 * i, 8 << i, 8 << i, 127, 1);
+ do_test (&json_ctx, i, 2 * i, (8 << i) + 100, 8 << i, 127, 1);
- do_test (2 * i, i, 8 << i, 8 << i, 254, 1);
- do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, 1);
+ do_test (&json_ctx, 2 * i, i, 8 << i, 8 << i, 254, 1);
+ do_test (&json_ctx, 2 * i, i, (8 << i) + 100, 8 << i, 254, 1);
- do_test (i, 2 * i, 8 << i, 8 << i, 127, -1);
- do_test (i, 2 * i, (8 << i) + 100, 8 << i, 127, -1);
+ do_test (&json_ctx, i, 2 * i, 8 << i, 8 << i, 127, -1);
+ do_test (&json_ctx, i, 2 * i, (8 << i) + 100, 8 << i, 127, -1);
- do_test (2 * i, i, 8 << i, 8 << i, 254, -1);
- do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, -1);
+ do_test (&json_ctx, 2 * i, i, 8 << i, 8 << i, 254, -1);
+ do_test (&json_ctx, 2 * i, i, (8 << i) + 100, 8 << i, 254, -1);
}
+ json_array_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_document_end (&json_ctx);
+
return ret;
}
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (12 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 19:01 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c Noah Goldstein
` (8 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Add more robust tests that cover all the page cross edge cases.
---
string/test-strcasecmp.c | 112 ++++++++++++++++++++++++++++++++++-----
1 file changed, 100 insertions(+), 12 deletions(-)
diff --git a/string/test-strcasecmp.c b/string/test-strcasecmp.c
index 3d994f9d64..438a9713ac 100644
--- a/string/test-strcasecmp.c
+++ b/string/test-strcasecmp.c
@@ -18,6 +18,10 @@
#include <locale.h>
#include <ctype.h>
+#include <assert.h>
+#define TEST_LEN (getpagesize () * 3)
+#define MIN_PAGE_SIZE (TEST_LEN + 2 * getpagesize ())
+
#define TEST_MAIN
#define TEST_NAME "strcasecmp"
#include "test-string.h"
@@ -85,12 +89,13 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
if (len == 0)
return;
- align1 &= 7;
- if (align1 + len + 1 >= page_size)
+
+ align1 &= getpagesize () - 1;
+ if (align1 + (len + 1) >= page_size)
return;
- align2 &= 7;
- if (align2 + len + 1 >= page_size)
+ align2 &= getpagesize () - 1;
+ if (align2 + (len + 1) >= page_size)
return;
s1 = (char *) (buf1 + align1);
@@ -105,12 +110,33 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
s1[len] = s2[len] = 0;
s1[len + 1] = 23;
s2[len + 1] = 24 + exp_result;
+
if ((s2[len - 1] == 'z' && exp_result == -1)
|| (s2[len - 1] == 'a' && exp_result == 1))
s1[len - 1] += exp_result;
+ else if ((s1[len - 1] == 'Z' + 1 && exp_result == 1)
+ || (s1[len - 1] == 'A' - 1 && exp_result == -1))
+ s1[len - 1] = tolower (s2[len - 1]) + exp_result;
else
s2[len - 1] -= exp_result;
+ /* For some locales this is not guaranteed yet. */
+ if (tolower (s1[len - 1]) - tolower (s2[len - 1]) != exp_result)
+ {
+ if (exp_result == -1)
+ {
+ s1[len - 1] = tolower ('a');
+ s2[len - 1] = toupper (tolower ('a') - 1);
+ }
+ else if (exp_result == 0)
+ s1[len - 1] = toupper (s2[len - 1]);
+ else
+ {
+ s1[len - 1] = tolower ('a');
+ s2[len - 1] = toupper (tolower ('a') + 1);
+ }
+ }
+
FOR_EACH_IMPL (impl, 0)
do_one_test (impl, s1, s2, exp_result);
}
@@ -207,10 +233,10 @@ do_random_tests (void)
}
static void
-test_locale (const char *locale)
+test_locale (const char *locale, int extra_tests)
{
- size_t i;
-
+ size_t i, j, k;
+ const size_t test_len = MIN(TEST_LEN, 3 * 4096);
if (setlocale (LC_CTYPE, locale) == NULL)
{
error (0, 0, "cannot set locale \"%s\"", locale);
@@ -249,6 +275,68 @@ test_locale (const char *locale)
do_test (2 * i, i, 8 << i, 254, -1);
}
+ for (j = 0; extra_tests && j < 160; ++j)
+ {
+ for (i = 0; i < test_len;)
+ {
+ do_test (getpagesize () - j - 1, 0, i, 127, 0);
+ do_test (getpagesize () - j - 1, 0, i, 127, 1);
+ do_test (getpagesize () - j - 1, 0, i, 127, -1);
+
+ do_test (getpagesize () - j - 1, j, i, 127, 0);
+ do_test (getpagesize () - j - 1, j, i, 127, 1);
+ do_test (getpagesize () - j - 1, j, i, 127, -1);
+
+ do_test (0, getpagesize () - j - 1, i, 127, 0);
+ do_test (0, getpagesize () - j - 1, i, 127, 1);
+ do_test (0, getpagesize () - j - 1, i, 127, -1);
+
+ do_test (j, getpagesize () - j - 1, i, 127, 0);
+ do_test (j, getpagesize () - j - 1, i, 127, 1);
+ do_test (j, getpagesize () - j - 1, i, 127, -1);
+
+ for (k = 2; k <= 128; k += k)
+ {
+ do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 0);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
+ 0);
+ do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 1);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
+ 1);
+ do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, -1);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
+ -1);
+ }
+
+ if (i < 32)
+ {
+ i += 1;
+ }
+ else if (i < 161)
+ {
+ i += 7;
+ }
+ else if (i + 161 < test_len)
+ {
+ i += 31;
+ i *= 17;
+ i /= 16;
+ if (i + 161 > test_len)
+ {
+ i = test_len - 160;
+ }
+ }
+ else if (i + 32 < test_len)
+ {
+ i += 7;
+ }
+ else
+ {
+ i += 1;
+ }
+ }
+ }
+
do_random_tests ();
}
@@ -257,11 +345,11 @@ test_main (void)
{
test_init ();
- test_locale ("C");
- test_locale ("en_US.ISO-8859-1");
- test_locale ("en_US.UTF-8");
- test_locale ("tr_TR.ISO-8859-9");
- test_locale ("tr_TR.UTF-8");
+ test_locale ("C", 1);
+ test_locale ("en_US.ISO-8859-1", 0);
+ test_locale ("en_US.UTF-8", 0);
+ test_locale ("tr_TR.ISO-8859-9", 0);
+ test_locale ("tr_TR.UTF-8", 0);
return ret;
}
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (13 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 19:01 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S Noah Goldstein
` (7 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Add more robust tests that cover all the page cross edge cases.
---
string/test-strncasecmp.c | 166 +++++++++++++++++++++++++++++++++++---
1 file changed, 154 insertions(+), 12 deletions(-)
diff --git a/string/test-strncasecmp.c b/string/test-strncasecmp.c
index a3c848165a..b86c630bf6 100644
--- a/string/test-strncasecmp.c
+++ b/string/test-strncasecmp.c
@@ -18,6 +18,10 @@
#include <locale.h>
#include <ctype.h>
+
+#define TEST_LEN (getpagesize () * 3)
+#define MIN_PAGE_SIZE (TEST_LEN + 2 * getpagesize ())
+
#define TEST_MAIN
#define TEST_NAME "strncasecmp"
#include "test-string.h"
@@ -106,14 +110,15 @@ do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
if (len == 0)
return;
- align1 &= 7;
- if (align1 + len + 1 >= page_size)
+ align1 &= getpagesize () - 1;
+ if (align1 + (len + 2) >= page_size)
return;
- align2 &= 7;
- if (align2 + len + 1 >= page_size)
+ align2 &= getpagesize () - 1;
+ if (align2 + (len + 2) >= page_size)
return;
+
s1 = (char *) (buf1 + align1);
s2 = (char *) (buf2 + align2);
@@ -126,12 +131,33 @@ do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
s1[len] = s2[len] = 0;
s1[len + 1] = 23;
s2[len + 1] = 24 + exp_result;
+
if ((s2[len - 1] == 'z' && exp_result == -1)
|| (s2[len - 1] == 'a' && exp_result == 1))
s1[len - 1] += exp_result;
+ else if ((s1[len - 1] == 'Z' + 1 && exp_result == 1)
+ || (s1[len - 1] == 'A' - 1 && exp_result == -1))
+ s1[len - 1] = tolower (s2[len - 1]) + exp_result;
else
s2[len - 1] -= exp_result;
+ /* For some locales this is not guaranteed yet. */
+ if (tolower (s1[len - 1]) - tolower (s2[len - 1]) != exp_result)
+ {
+ if (exp_result == -1)
+ {
+ s1[len - 1] = tolower ('a');
+ s2[len - 1] = toupper (tolower ('a') - 1);
+ }
+ else if (exp_result == 0)
+ s1[len - 1] = toupper (s2[len - 1]);
+ else
+ {
+ s1[len - 1] = tolower ('a');
+ s2[len - 1] = toupper (tolower ('a') + 1);
+ }
+ }
+
FOR_EACH_IMPL (impl, 0)
do_one_test (impl, s1, s2, n, exp_result);
}
@@ -299,10 +325,10 @@ bz14195 (void)
}
static void
-test_locale (const char *locale)
+test_locale (const char *locale, int extra_tests)
{
- size_t i;
-
+ size_t i, j, k;
+ const size_t test_len = MIN(TEST_LEN, 3 * 4096);
if (setlocale (LC_CTYPE, locale) == NULL)
{
error (0, 0, "cannot set locale \"%s\"", locale);
@@ -374,6 +400,122 @@ test_locale (const char *locale)
do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, -1);
}
+ for (j = 0; extra_tests && j < 160; ++j)
+ {
+ for (i = 0; i < test_len;)
+ {
+ do_test (getpagesize () - j - 1, 0, i + 1, i, 127, 0);
+ do_test (getpagesize () - j - 1, 0, i + 1, i, 127, 1);
+ do_test (getpagesize () - j - 1, 0, i + 1, i, 127, -1);
+
+ do_test (getpagesize () - j - 1, 0, i, i, 127, 0);
+ do_test (getpagesize () - j - 1, 0, i - 1, i, 127, 0);
+
+ do_test (getpagesize () - j - 1, 0, ULONG_MAX, i, 127, 0);
+ do_test (getpagesize () - j - 1, 0, ULONG_MAX, i, 127, 1);
+ do_test (getpagesize () - j - 1, 0, ULONG_MAX, i, 127, -1);
+
+ do_test (getpagesize () - j - 1, 0, ULONG_MAX - i, i, 127, 0);
+ do_test (getpagesize () - j - 1, 0, ULONG_MAX - i, i, 127, 1);
+ do_test (getpagesize () - j - 1, 0, ULONG_MAX - i, i, 127, -1);
+
+ do_test (getpagesize () - j - 1, j, i + 1, i, 127, 0);
+ do_test (getpagesize () - j - 1, j, i + 1, i, 127, 1);
+ do_test (getpagesize () - j - 1, j, i + 1, i, 127, -1);
+
+ do_test (getpagesize () - j - 1, j, i, i, 127, 0);
+ do_test (getpagesize () - j - 1, j, i - 1, i, 127, 0);
+
+ do_test (getpagesize () - j - 1, j, ULONG_MAX, i, 127, 0);
+ do_test (getpagesize () - j - 1, j, ULONG_MAX, i, 127, 1);
+ do_test (getpagesize () - j - 1, j, ULONG_MAX, i, 127, -1);
+
+ do_test (getpagesize () - j - 1, j, ULONG_MAX - i, i, 127, 0);
+ do_test (getpagesize () - j - 1, j, ULONG_MAX - i, i, 127, 1);
+ do_test (getpagesize () - j - 1, j, ULONG_MAX - i, i, 127, -1);
+
+ do_test (0, getpagesize () - j - 1, i + 1, i, 127, 0);
+ do_test (0, getpagesize () - j - 1, i + 1, i, 127, 1);
+ do_test (0, getpagesize () - j - 1, i + 1, i, 127, -1);
+
+ do_test (0, getpagesize () - j - 1, i, i, 127, 0);
+ do_test (0, getpagesize () - j - 1, i - 1, i, 127, 0);
+
+ do_test (0, getpagesize () - j - 1, ULONG_MAX, i, 127, 0);
+ do_test (0, getpagesize () - j - 1, ULONG_MAX, i, 127, 1);
+ do_test (0, getpagesize () - j - 1, ULONG_MAX, i, 127, -1);
+
+ do_test (0, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 0);
+ do_test (0, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 1);
+ do_test (0, getpagesize () - j - 1, ULONG_MAX - i, i, 127, -1);
+
+ do_test (j, getpagesize () - j - 1, i + 1, i, 127, 0);
+ do_test (j, getpagesize () - j - 1, i + 1, i, 127, 1);
+ do_test (j, getpagesize () - j - 1, i + 1, i, 127, -1);
+
+ do_test (j, getpagesize () - j - 1, i, i, 127, 0);
+ do_test (j, getpagesize () - j - 1, i - 1, i, 127, 0);
+
+ do_test (j, getpagesize () - j - 1, ULONG_MAX, i, 127, 0);
+ do_test (j, getpagesize () - j - 1, ULONG_MAX, i, 127, 1);
+ do_test (j, getpagesize () - j - 1, ULONG_MAX, i, 127, -1);
+
+ do_test (j, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 0);
+ do_test (j, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 1);
+ do_test (j, getpagesize () - j - 1, ULONG_MAX - i, i, 127, -1);
+
+ for (k = 2; k <= 128; k += k)
+ {
+ do_test (getpagesize () - k, getpagesize () - j - 1, i - 1, i,
+ 127, 0);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i - 1,
+ i, 127, 0);
+ do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
+ 127, 0);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
+ i, 127, 0);
+ do_test (getpagesize () - k, getpagesize () - j - 1, i, i, 127,
+ 0);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, i,
+ 127, 0);
+ do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
+ 127, -1);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
+ i, 127, -1);
+ do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
+ 127, 1);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
+ i, 127, 1);
+ }
+ if (i < 32)
+ {
+ i += 1;
+ }
+ else if (i < 161)
+ {
+ i += 7;
+ }
+ else if (i + 161 < test_len)
+ {
+ i += 31;
+ i *= 17;
+ i /= 16;
+ if (i + 161 > test_len)
+ {
+ i = test_len - 160;
+ }
+ }
+ else if (i + 32 < test_len)
+ {
+ i += 7;
+ }
+ else
+ {
+ i += 1;
+ }
+ }
+ }
+
do_random_tests ();
do_page_tests ();
}
@@ -383,11 +525,11 @@ test_main (void)
{
test_init ();
- test_locale ("C");
- test_locale ("en_US.ISO-8859-1");
- test_locale ("en_US.UTF-8");
- test_locale ("tr_TR.ISO-8859-9");
- test_locale ("tr_TR.UTF-8");
+ test_locale ("C", 1);
+ test_locale ("en_US.ISO-8859-1", 0);
+ test_locale ("en_US.UTF-8", 0);
+ test_locale ("tr_TR.ISO-8859-9", 0);
+ test_locale ("tr_TR.UTF-8", 0);
return ret;
}
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (14 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 19:02 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S Noah Goldstein
` (6 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Slightly faster method of doing TOLOWER that saves an
instruction.
Also replace the hard coded 5-byte nop with .p2align 4. On builds with
CET enabled this misaligned the entry to strcasecmp.
geometric_mean(N=40) of all benchmarks New / Original: .894
All string/memory tests pass.
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, New Time / Old Time
1, 1, 1, 127, 0.903
2, 2, 2, 127, 0.905
3, 3, 3, 127, 0.877
4, 4, 4, 127, 0.888
5, 5, 5, 127, 0.901
6, 6, 6, 127, 0.954
7, 7, 7, 127, 0.932
8, 0, 0, 127, 0.918
9, 1, 1, 127, 0.914
10, 2, 2, 127, 0.877
11, 3, 3, 127, 0.909
12, 4, 4, 127, 0.876
13, 5, 5, 127, 0.886
14, 6, 6, 127, 0.914
15, 7, 7, 127, 0.939
4, 0, 0, 127, 0.963
4, 0, 0, 254, 0.943
8, 0, 0, 254, 0.927
16, 0, 0, 127, 0.876
16, 0, 0, 254, 0.865
32, 0, 0, 127, 0.865
32, 0, 0, 254, 0.862
64, 0, 0, 127, 0.863
64, 0, 0, 254, 0.896
128, 0, 0, 127, 0.885
128, 0, 0, 254, 0.882
256, 0, 0, 127, 0.87
256, 0, 0, 254, 0.869
512, 0, 0, 127, 0.832
512, 0, 0, 254, 0.848
1024, 0, 0, 127, 0.835
1024, 0, 0, 254, 0.843
16, 1, 2, 127, 0.914
16, 2, 1, 254, 0.949
32, 2, 4, 127, 0.955
32, 4, 2, 254, 1.004
64, 3, 6, 127, 0.844
64, 6, 3, 254, 0.905
128, 4, 0, 127, 0.889
128, 0, 4, 254, 0.845
256, 5, 2, 127, 0.929
256, 2, 5, 254, 0.907
512, 6, 4, 127, 0.837
512, 4, 6, 254, 0.862
1024, 7, 6, 127, 0.895
1024, 6, 7, 254, 0.89
sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
1 file changed, 29 insertions(+), 35 deletions(-)
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index e2ab59c555..99d8b36f1d 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp)
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RDX_LP
- // XXX 5 byte should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
END2 (__strcasecmp)
# ifndef NO_NOLOCALE_ALIAS
weak_alias (__strcasecmp, strcasecmp)
@@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp)
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RCX_LP
- // XXX 5 byte should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
END2 (__strncasecmp)
# ifndef NO_NOLOCALE_ALIAS
weak_alias (__strncasecmp, strncasecmp)
@@ -146,22 +144,22 @@ ENTRY (STRCMP)
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
.section .rodata.cst16,"aM",@progbits,16
.align 16
-.Lbelowupper:
- .quad 0x4040404040404040
- .quad 0x4040404040404040
-.Ltopupper:
- .quad 0x5b5b5b5b5b5b5b5b
- .quad 0x5b5b5b5b5b5b5b5b
-.Ltouppermask:
+.Llcase_min:
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+.Llcase_max:
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+.Lcase_add:
.quad 0x2020202020202020
.quad 0x2020202020202020
.previous
- movdqa .Lbelowupper(%rip), %xmm5
-# define UCLOW_reg %xmm5
- movdqa .Ltopupper(%rip), %xmm6
-# define UCHIGH_reg %xmm6
- movdqa .Ltouppermask(%rip), %xmm7
-# define LCQWORD_reg %xmm7
+ movdqa .Llcase_min(%rip), %xmm5
+# define LCASE_MIN_reg %xmm5
+ movdqa .Llcase_max(%rip), %xmm6
+# define LCASE_MAX_reg %xmm6
+ movdqa .Lcase_add(%rip), %xmm7
+# define CASE_ADD_reg %xmm7
#endif
cmp $0x30, %ecx
ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
@@ -172,22 +170,18 @@ ENTRY (STRCMP)
movhpd 8(%rdi), %xmm1
movhpd 8(%rsi), %xmm2
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# define TOLOWER(reg1, reg2) \
- movdqa reg1, %xmm8; \
- movdqa UCHIGH_reg, %xmm9; \
- movdqa reg2, %xmm10; \
- movdqa UCHIGH_reg, %xmm11; \
- pcmpgtb UCLOW_reg, %xmm8; \
- pcmpgtb reg1, %xmm9; \
- pcmpgtb UCLOW_reg, %xmm10; \
- pcmpgtb reg2, %xmm11; \
- pand %xmm9, %xmm8; \
- pand %xmm11, %xmm10; \
- pand LCQWORD_reg, %xmm8; \
- pand LCQWORD_reg, %xmm10; \
- por %xmm8, reg1; \
- por %xmm10, reg2
- TOLOWER (%xmm1, %xmm2)
+# define TOLOWER(reg1, reg2) \
+ movdqa LCASE_MIN_reg, %xmm8; \
+ movdqa LCASE_MIN_reg, %xmm9; \
+ paddb reg1, %xmm8; \
+ paddb reg2, %xmm9; \
+ pcmpgtb LCASE_MAX_reg, %xmm8; \
+ pcmpgtb LCASE_MAX_reg, %xmm9; \
+ pandn CASE_ADD_reg, %xmm8; \
+ pandn CASE_ADD_reg, %xmm9; \
+ paddb %xmm8, reg1; \
+ paddb %xmm9, reg2
+ TOLOWER (%xmm1, %xmm2)
#else
# define TOLOWER(reg1, reg2)
#endif
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (15 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 19:02 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c Noah Goldstein
` (5 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Slightly faster method of doing TOLOWER that saves an
instruction.
Also replace the hard coded 5-byte nop with .p2align 4. On builds with
CET enabled this misaligned the entry to strcasecmp.
geometric_mean(N=40) of all benchmarks New / Original: .920
All string/memory tests pass.
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, New Time / Old Time
1, 1, 1, 127, 0.914
2, 2, 2, 127, 0.952
3, 3, 3, 127, 0.924
4, 4, 4, 127, 0.995
5, 5, 5, 127, 0.985
6, 6, 6, 127, 1.017
7, 7, 7, 127, 1.031
8, 0, 0, 127, 0.967
9, 1, 1, 127, 0.969
10, 2, 2, 127, 0.951
11, 3, 3, 127, 0.938
12, 4, 4, 127, 0.937
13, 5, 5, 127, 0.967
14, 6, 6, 127, 0.941
15, 7, 7, 127, 0.951
4, 0, 0, 127, 0.959
4, 0, 0, 254, 0.98
8, 0, 0, 254, 0.959
16, 0, 0, 127, 0.895
16, 0, 0, 254, 0.901
32, 0, 0, 127, 0.85
32, 0, 0, 254, 0.851
64, 0, 0, 127, 0.897
64, 0, 0, 254, 0.895
128, 0, 0, 127, 0.944
128, 0, 0, 254, 0.935
256, 0, 0, 127, 0.922
256, 0, 0, 254, 0.913
512, 0, 0, 127, 0.921
512, 0, 0, 254, 0.914
1024, 0, 0, 127, 0.845
1024, 0, 0, 254, 0.84
16, 1, 2, 127, 0.923
16, 2, 1, 254, 0.955
32, 2, 4, 127, 0.979
32, 4, 2, 254, 0.957
64, 3, 6, 127, 0.866
64, 6, 3, 254, 0.849
128, 4, 0, 127, 0.882
128, 0, 4, 254, 0.876
256, 5, 2, 127, 0.877
256, 2, 5, 254, 0.882
512, 6, 4, 127, 0.822
512, 4, 6, 254, 0.862
1024, 7, 6, 127, 0.903
1024, 6, 7, 254, 0.908
sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++--------------
1 file changed, 35 insertions(+), 48 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index 580feb90e9..7805ae9d41 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp))
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RDX_LP
- // XXX 5 byte should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
END (GLABEL(__strcasecmp))
/* FALLTHROUGH to strcasecmp_l. */
#endif
@@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp))
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RCX_LP
- // XXX 5 byte should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
END (GLABEL(__strncasecmp))
/* FALLTHROUGH to strncasecmp_l. */
#endif
@@ -169,27 +167,22 @@ STRCMP_SSE42:
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
.section .rodata.cst16,"aM",@progbits,16
.align 16
-LABEL(belowupper):
- .quad 0x4040404040404040
- .quad 0x4040404040404040
-LABEL(topupper):
-# ifdef USE_AVX
- .quad 0x5a5a5a5a5a5a5a5a
- .quad 0x5a5a5a5a5a5a5a5a
-# else
- .quad 0x5b5b5b5b5b5b5b5b
- .quad 0x5b5b5b5b5b5b5b5b
-# endif
-LABEL(touppermask):
+LABEL(lcase_min):
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+LABEL(lcase_max):
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+LABEL(case_add):
.quad 0x2020202020202020
.quad 0x2020202020202020
.previous
- movdqa LABEL(belowupper)(%rip), %xmm4
-# define UCLOW_reg %xmm4
- movdqa LABEL(topupper)(%rip), %xmm5
-# define UCHIGH_reg %xmm5
- movdqa LABEL(touppermask)(%rip), %xmm6
-# define LCQWORD_reg %xmm6
+ movdqa LABEL(lcase_min)(%rip), %xmm4
+# define LCASE_MIN_reg %xmm4
+ movdqa LABEL(lcase_max)(%rip), %xmm5
+# define LCASE_MAX_reg %xmm5
+ movdqa LABEL(case_add)(%rip), %xmm6
+# define CASE_ADD_reg %xmm6
#endif
cmp $0x30, %ecx
ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
@@ -200,32 +193,26 @@ LABEL(touppermask):
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
# ifdef USE_AVX
# define TOLOWER(reg1, reg2) \
- vpcmpgtb UCLOW_reg, reg1, %xmm7; \
- vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
- vpcmpgtb UCLOW_reg, reg2, %xmm9; \
- vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
- vpandn %xmm7, %xmm8, %xmm8; \
- vpandn %xmm9, %xmm10, %xmm10; \
- vpand LCQWORD_reg, %xmm8, %xmm8; \
- vpand LCQWORD_reg, %xmm10, %xmm10; \
- vpor reg1, %xmm8, reg1; \
- vpor reg2, %xmm10, reg2
+ vpaddb LCASE_MIN_reg, reg1, %xmm7; \
+ vpaddb LCASE_MIN_reg, reg2, %xmm8; \
+ vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \
+ vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \
+ vpandn CASE_ADD_reg, %xmm7, %xmm7; \
+ vpandn CASE_ADD_reg, %xmm8, %xmm8; \
+ vpaddb %xmm7, reg1, reg1; \
+ vpaddb %xmm8, reg2, reg2
# else
# define TOLOWER(reg1, reg2) \
- movdqa reg1, %xmm7; \
- movdqa UCHIGH_reg, %xmm8; \
- movdqa reg2, %xmm9; \
- movdqa UCHIGH_reg, %xmm10; \
- pcmpgtb UCLOW_reg, %xmm7; \
- pcmpgtb reg1, %xmm8; \
- pcmpgtb UCLOW_reg, %xmm9; \
- pcmpgtb reg2, %xmm10; \
- pand %xmm8, %xmm7; \
- pand %xmm10, %xmm9; \
- pand LCQWORD_reg, %xmm7; \
- pand LCQWORD_reg, %xmm9; \
- por %xmm7, reg1; \
- por %xmm9, reg2
+ movdqa LCASE_MIN_reg, %xmm7; \
+ movdqa LCASE_MIN_reg, %xmm8; \
+ paddb reg1, %xmm7; \
+ paddb reg2, %xmm8; \
+ pcmpgtb LCASE_MAX_reg, %xmm7; \
+ pcmpgtb LCASE_MAX_reg, %xmm8; \
+ pandn CASE_ADD_reg, %xmm7; \
+ pandn CASE_ADD_reg, %xmm8; \
+ paddb %xmm7, reg1; \
+ paddb %xmm8, reg2
# endif
TOLOWER (%xmm1, %xmm2)
#else
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (16 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 19:02 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c Noah Goldstein
` (4 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Test cases for when both `s1` and `s2` are near the end of a page
were previously missing.
---
string/test-strcmp.c | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/string/test-strcmp.c b/string/test-strcmp.c
index 0abce769d0..ece03c6d0b 100644
--- a/string/test-strcmp.c
+++ b/string/test-strcmp.c
@@ -392,7 +392,7 @@ check3 (void)
int
test_main (void)
{
- size_t i, j;
+ size_t i, j, k;
const size_t test_len = MIN(TEST_LEN, 3 * 4096);
test_init ();
check();
@@ -453,6 +453,19 @@ test_main (void)
do_test (j, getpagesize () - j - 1, i, 127, 1);
do_test (j, getpagesize () - j - 1, i, 127, -1);
+ for (k = 2; k <= 128; k += k)
+ {
+ do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 0);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
+ 0);
+ do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 1);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
+ 1);
+ do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, -1);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
+ -1);
+ }
+
if (i < 32)
{
i += 1;
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (17 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 19:02 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
` (3 subsequent siblings)
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
Test cases for when both `s1` and `s2` are near the end of a page
were previously missing.
---
string/test-strncmp.c | 27 ++++++++++++++++++++++++++-
1 file changed, 26 insertions(+), 1 deletion(-)
diff --git a/string/test-strncmp.c b/string/test-strncmp.c
index 1a87f0e73e..bba9e3d2dc 100644
--- a/string/test-strncmp.c
+++ b/string/test-strncmp.c
@@ -573,7 +573,7 @@ check_overflow (void)
int
test_main (void)
{
- size_t i, j;
+ size_t i, j, k;
const size_t test_len = MIN(TEST_LEN, 3 * 4096);
test_init ();
@@ -705,6 +705,31 @@ test_main (void)
do_test_n (j, getpagesize () - j - 1, i, ULONG_MAX - i, 0, 127, 0);
do_test_n (j, getpagesize () - j - 1, i, ULONG_MAX - i, 0, 127, 1);
do_test_n (j, getpagesize () - j - 1, i, ULONG_MAX - i, 0, 127, -1);
+
+ for (k = 2; k <= 128; k += k)
+ {
+ do_test (getpagesize () - k, getpagesize () - j - 1, i - 1, i,
+ 127, 0);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i - 1,
+ i, 127, 0);
+ do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
+ 127, 0);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
+ i, 127, 0);
+ do_test (getpagesize () - k, getpagesize () - j - 1, i, i, 127,
+ 0);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, i,
+ 127, 0);
+ do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
+ 127, -1);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
+ i, 127, -1);
+ do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
+ 127, 1);
+ do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
+ i, 127, 1);
+ }
+
if (i < 32)
{
i += 1;
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (18 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 19:03 ` H.J. Lu
` (3 more replies)
2022-03-23 21:57 ` [PATCH v1 22/23] x86: Add EVEX " Noah Goldstein
` (2 subsequent siblings)
22 siblings, 4 replies; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
All string/memory tests pass.
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, AVX2 Time / SSE42 Time
1, 1, 1, 127, 1.032
2, 2, 2, 127, 1.006
3, 3, 3, 127, 1.009
4, 4, 4, 127, 0.964
5, 5, 5, 127, 0.929
6, 6, 6, 127, 0.94
7, 7, 7, 127, 0.958
8, 0, 0, 127, 0.988
9, 1, 1, 127, 0.99
10, 2, 2, 127, 0.995
11, 3, 3, 127, 0.991
12, 4, 4, 127, 0.975
13, 5, 5, 127, 0.943
14, 6, 6, 127, 0.955
15, 7, 7, 127, 0.988
4, 0, 0, 127, 0.983
4, 0, 0, 254, 0.978
8, 0, 0, 254, 0.989
16, 0, 0, 127, 0.792
16, 0, 0, 254, 0.774
32, 0, 0, 127, 0.568
32, 0, 0, 254, 0.555
64, 0, 0, 127, 0.561
64, 0, 0, 254, 0.561
128, 0, 0, 127, 0.574
128, 0, 0, 254, 0.577
256, 0, 0, 127, 0.561
256, 0, 0, 254, 0.552
512, 0, 0, 127, 0.59
512, 0, 0, 254, 0.594
1024, 0, 0, 127, 0.528
1024, 0, 0, 254, 0.517
16, 1, 2, 127, 0.758
16, 2, 1, 254, 0.748
32, 2, 4, 127, 0.419
32, 4, 2, 254, 0.428
64, 3, 6, 127, 0.472
64, 6, 3, 254, 0.464
128, 4, 0, 127, 0.534
128, 0, 4, 254, 0.53
256, 5, 2, 127, 0.679
256, 2, 5, 254, 0.676
512, 6, 4, 127, 0.525
512, 4, 6, 254, 0.523
1024, 7, 6, 127, 0.518
1024, 6, 7, 254, 0.505
sysdeps/x86_64/multiarch/Makefile | 4 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 +
.../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++
sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++
sysdeps/x86_64/multiarch/strcmp-avx2.S | 230 +++++++++++++++---
.../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++
sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++
8 files changed, 324 insertions(+), 31 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7b413edad..06e1848823 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -55,6 +55,8 @@ sysdep_routines += \
stpncpy-sse2-unaligned \
stpncpy-ssse3 \
strcasecmp_l-avx \
+ strcasecmp_l-avx2 \
+ strcasecmp_l-avx2-rtm \
strcasecmp_l-sse2 \
strcasecmp_l-sse4_2 \
strcasecmp_l-ssse3 \
@@ -93,6 +95,8 @@ sysdep_routines += \
strlen-evex \
strlen-sse2 \
strncase_l-avx \
+ strncase_l-avx2 \
+ strncase_l-avx2-rtm \
strncase_l-sse2 \
strncase_l-sse4_2 \
strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a594f4176e..3c556d07ac 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strcasecmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strcasecmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (AVX),
__strcasecmp_avx)
@@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strcasecmp_l_avx2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strcasecmp_l_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcasecmp_l,
CPU_FEATURE_USABLE (AVX),
__strcasecmp_l_avx)
@@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strncasecmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strncasecmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (AVX),
__strncasecmp_avx)
@@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strncasecmp_l_avx2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strncasecmp_l_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncasecmp_l,
CPU_FEATURE_USABLE (AVX),
__strncasecmp_l_avx)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 9e3cc61ac0..c4de111fd0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
+
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
return OPTIMIZE (avx);
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
new file mode 100644
index 0000000000..09957fc3c5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
@@ -0,0 +1,15 @@
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x) x ## _rtm
+#define GLABEL(x) _GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
new file mode 100644
index 0000000000..e2762f2a22
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with AVX2.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 86a86b68e3..eeb90a0da6 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -20,6 +20,10 @@
# include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# endif
+
# ifndef STRCMP
# define STRCMP __strcmp_avx2
# endif
@@ -74,13 +78,88 @@
# define VEC_OFFSET (-VEC_SIZE)
# endif
+# ifdef USE_AS_STRCASECMP_L
+# define BYTE_LOOP_REG OFFSET_REG
+# else
+# define BYTE_LOOP_REG ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRNCMP
+# define STRCASECMP __strncasecmp_avx2
+# define LOCALE_REG rcx
+# define LOCALE_REG_LP RCX_LP
+# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
+# else
+# define STRCASECMP __strcasecmp_avx2
+# define LOCALE_REG rdx
+# define LOCALE_REG_LP RDX_LP
+# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
+# endif
+# endif
+
# define xmmZERO xmm15
# define ymmZERO ymm15
+# define LCASE_MIN_ymm %ymm10
+# define LCASE_MAX_ymm %ymm11
+# define CASE_ADD_ymm %ymm12
+
+# define LCASE_MIN_xmm %xmm10
+# define LCASE_MAX_xmm %xmm11
+# define CASE_ADD_xmm %xmm12
+
+ /* r11 is never used elsewhere so this is safe to maintain. */
+# define TOLOWER_BASE %r11
+
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
+# ifdef USE_AS_STRCASECMP_L
+# define REG(x, y) x ## y
+# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
+ vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
+ vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
+ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
+ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
+ vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
+ vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
+ vpaddb REG(%ext, 8), reg1_in, reg1_out; \
+ vpaddb REG(%ext, 9), reg2_in, reg2_out
+
+# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
+# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)
+
+# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
+ TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
+ VPCMPEQ scratch_reg, s2_reg, reg_out
+
+# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
+ VMOVU s2_mem, reg_out; \
+ CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+# define TOLOWER_gpr(...)
+# define TOLOWER_ymm(...)
+# define TOLOWER_xmm(...)
+
+# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
+ VPCMPEQ s2_reg, s1_reg, reg_out
+
+# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -102,7 +181,45 @@
returned. */
.section SECTION(.text), "ax", @progbits
-ENTRY(STRCMP)
+ .align 16
+ .type STRCMP, @function
+ .globl STRCMP
+ .hidden STRCMP
+
+# ifndef GLABEL
+# define GLABEL(...) __VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
+ mov %fs:(%rax), %LOCALE_REG_LP
+
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
+END (GLABEL(STRCASECMP))
+ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
+# endif
+
+ .p2align 4
+STRCMP:
+ cfi_startproc
+ _CET_ENDBR
+ CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales with
+ encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+# else
+ mov (%LOCALE_REG), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+ jne STRCASECMP_NONASCII
+ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
# ifdef USE_AS_STRNCMP
# ifdef __ILP32__
/* Clear the upper 32 bits. */
@@ -128,6 +245,30 @@ ENTRY(STRCMP)
# endif
# endif
vpxor %xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+ .section .rodata.cst32, "aM", @progbits, 32
+ .align 32
+L(lcase_min):
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+L(case_add):
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+
+ vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
+ vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
+ vmovdqa L(case_add)(%rip), CASE_ADD_ymm
+# endif
movl %edi, %eax
orl %esi, %eax
sall $20, %eax
@@ -138,8 +279,10 @@ ENTRY(STRCMP)
L(no_page_cross):
/* Safe to compare 4x vectors. */
VMOVU (%rdi), %ymm0
- /* 1s where s1 and s2 equal. */
- VPCMPEQ (%rsi), %ymm0, %ymm1
+ /* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp.
+ Otherwise converts ymm0 and load from rsi to lower. ymm2 is
+ scratch and ymm1 is the return. */
+ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
/* 1s at null CHAR. */
VPCMPEQ %ymm0, %ymmZERO, %ymm2
/* 1s where s1 and s2 equal AND not null CHAR. */
@@ -172,6 +315,8 @@ L(return_vec_0):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret0):
@@ -207,6 +352,8 @@ L(one_or_less):
# else
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret1):
@@ -234,6 +381,8 @@ L(return_vec_1):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret2):
@@ -265,6 +414,8 @@ L(return_vec_2):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret3):
@@ -285,6 +436,8 @@ L(return_vec_3):
# else
movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret4):
@@ -295,7 +448,7 @@ L(ret4):
L(more_3x_vec):
/* Safe to compare 4x vectors. */
VMOVU VEC_SIZE(%rdi), %ymm0
- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -308,7 +461,7 @@ L(more_3x_vec):
# endif
VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -316,7 +469,7 @@ L(more_3x_vec):
jnz L(return_vec_2)
VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -391,12 +544,10 @@ L(loop_skip_page_cross_check):
VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */
- VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
-
- VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
-
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+ CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
zero. */
@@ -465,6 +616,8 @@ L(return_vec_2_3_end):
# else
movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -508,6 +661,8 @@ L(return_vec_0_end):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -530,6 +685,8 @@ L(return_vec_1_end):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -556,6 +713,8 @@ L(return_vec_2_end):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -583,7 +742,7 @@ L(page_cross_during_loop):
jle L(less_1x_vec_till_page_cross)
VMOVA (%rdi), %ymm0
- VPCMPEQ (%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -605,7 +764,7 @@ L(less_1x_vec_till_page_cross):
here, it means the previous page (rdi - VEC_SIZE) has already
been loaded earlier so must be valid. */
VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
- VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -647,6 +806,8 @@ L(return_page_cross_cmp_mem):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -673,7 +834,7 @@ L(more_2x_vec_till_page_cross):
iteration here. */
VMOVU VEC_SIZE(%rdi), %ymm0
- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -689,7 +850,7 @@ L(more_2x_vec_till_page_cross):
/* Safe to include comparisons from lower bytes. */
VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
- VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -697,7 +858,7 @@ L(more_2x_vec_till_page_cross):
jnz L(return_vec_page_cross_0)
VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
- VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -715,8 +876,8 @@ L(more_2x_vec_till_page_cross):
VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
+ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
vpand %ymm4, %ymm5, %ymm5
vpand %ymm6, %ymm7, %ymm7
VPMINU %ymm5, %ymm7, %ymm7
@@ -767,6 +928,8 @@ L(return_vec_page_cross_1):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -822,7 +985,7 @@ L(page_cross):
L(page_cross_loop):
VMOVU (%rdi, %OFFSET_REG64), %ymm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -840,11 +1003,11 @@ L(page_cross_loop):
subl %eax, %OFFSET_REG
/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
to not cross page so is safe to load. Since we have already
- loaded at least 1 VEC from rsi it is also guranteed to be safe.
- */
+ loaded at least 1 VEC from rsi it is also guaranteed to be
+ safe. */
VMOVU (%rdi, %OFFSET_REG64), %ymm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -877,6 +1040,8 @@ L(ret_vec_page_cross_cont):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -930,7 +1095,7 @@ L(less_1x_vec_till_page):
ja L(less_16_till_page)
VMOVU (%rdi), %xmm0
- VPCMPEQ (%rsi), %xmm0, %xmm1
+ CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
VPCMPEQ %xmm0, %xmmZERO, %xmm2
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
@@ -948,7 +1113,7 @@ L(less_1x_vec_till_page):
# endif
VMOVU (%rdi, %OFFSET_REG64), %xmm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1
+ CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
VPCMPEQ %xmm0, %xmmZERO, %xmm2
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
@@ -986,7 +1151,7 @@ L(less_16_till_page):
vmovq (%rdi), %xmm0
vmovq (%rsi), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
incb %cl
@@ -1006,7 +1171,7 @@ L(less_16_till_page):
vmovq (%rdi, %OFFSET_REG64), %xmm0
vmovq (%rsi, %OFFSET_REG64), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
incb %cl
@@ -1062,7 +1227,7 @@ L(ret_less_8_wcs):
vmovd (%rdi), %xmm0
vmovd (%rsi), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
subl $0xf, %ecx
@@ -1081,7 +1246,7 @@ L(ret_less_8_wcs):
vmovd (%rdi, %OFFSET_REG64), %xmm0
vmovd (%rsi, %OFFSET_REG64), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
subl $0xf, %ecx
@@ -1115,7 +1280,9 @@ L(less_4_till_page):
L(less_4_loop):
movzbl (%rdi), %eax
movzbl (%rsi, %rdi), %ecx
- subl %ecx, %eax
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+ subl %BYTE_LOOP_REG, %eax
jnz L(ret_less_4_loop)
testl %ecx, %ecx
jz L(ret_zero_4_loop)
@@ -1142,5 +1309,6 @@ L(ret_less_4_loop):
subl %r8d, %eax
ret
# endif
-END(STRCMP)
+ cfi_endproc
+ .size STRCMP, .-STRCMP
#endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
new file mode 100644
index 0000000000..e194936c36
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
@@ -0,0 +1,16 @@
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x) x ## _rtm
+#define GLABEL(x) _GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+#define OVERFLOW_STRCMP __strcasecmp_avx2_rtm
+
+#include "strncase_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
new file mode 100644
index 0000000000..29afccbcc5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
@@ -0,0 +1,27 @@
+/* strncasecmp_l optimized with AVX2.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __strcasecmp_avx2
+#endif
+#include "strcmp-avx2.S"
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 22/23] x86: Add EVEX optimized str{n}casecmp
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (19 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 19:04 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 23/23] x86: Remove AVX str{n}casecmp Noah Goldstein
2022-03-24 18:43 ` [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c H.J. Lu
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
All string/memory tests pass.
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, EVEX Time / SSE42 Time
1, 1, 1, 127, 0.871
2, 2, 2, 127, 0.833
3, 3, 3, 127, 0.851
4, 4, 4, 127, 0.824
5, 5, 5, 127, 0.791
6, 6, 6, 127, 0.789
7, 7, 7, 127, 0.804
8, 0, 0, 127, 0.838
9, 1, 1, 127, 0.837
10, 2, 2, 127, 0.834
11, 3, 3, 127, 0.839
12, 4, 4, 127, 0.844
13, 5, 5, 127, 0.796
14, 6, 6, 127, 0.811
15, 7, 7, 127, 0.838
4, 0, 0, 127, 0.84
4, 0, 0, 254, 0.823
8, 0, 0, 254, 0.838
16, 0, 0, 127, 0.669
16, 0, 0, 254, 0.656
32, 0, 0, 127, 0.488
32, 0, 0, 254, 0.484
64, 0, 0, 127, 0.492
64, 0, 0, 254, 0.502
128, 0, 0, 127, 0.508
128, 0, 0, 254, 0.497
256, 0, 0, 127, 0.574
256, 0, 0, 254, 0.581
512, 0, 0, 127, 0.573
512, 0, 0, 254, 0.577
1024, 0, 0, 127, 0.489
1024, 0, 0, 254, 0.485
16, 1, 2, 127, 0.655
16, 2, 1, 254, 0.646
32, 2, 4, 127, 0.368
32, 4, 2, 254, 0.376
64, 3, 6, 127, 0.428
64, 6, 3, 254, 0.426
128, 4, 0, 127, 0.478
128, 0, 4, 254, 0.473
256, 5, 2, 127, 0.65
256, 2, 5, 254, 0.654
512, 6, 4, 127, 0.492
512, 4, 6, 254, 0.489
1024, 7, 6, 127, 0.463
1024, 6, 7, 254, 0.457
sysdeps/x86_64/multiarch/Makefile | 2 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 ++
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 +
sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++
sysdeps/x86_64/multiarch/strcmp-evex.S | 280 ++++++++++++++++---
sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++
6 files changed, 314 insertions(+), 37 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 06e1848823..35d80dc2ff 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -57,6 +57,7 @@ sysdep_routines += \
strcasecmp_l-avx \
strcasecmp_l-avx2 \
strcasecmp_l-avx2-rtm \
+ strcasecmp_l-evex \
strcasecmp_l-sse2 \
strcasecmp_l-sse4_2 \
strcasecmp_l-ssse3 \
@@ -97,6 +98,7 @@ sysdep_routines += \
strncase_l-avx \
strncase_l-avx2 \
strncase_l-avx2-rtm \
+ strncase_l-evex \
strncase_l-sse2 \
strncase_l-sse4_2 \
strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 3c556d07ac..f1a4d3dac2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strcasecmp_evex)
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (AVX2),
__strcasecmp_avx2)
@@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strcasecmp_l_evex)
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (AVX2),
__strcasecmp_l_avx2)
@@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strncasecmp_evex)
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (AVX2),
__strncasecmp_avx2)
@@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strncasecmp_l_evex)
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (AVX2),
__strncasecmp_l_avx2)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index c4de111fd0..bf0d146e7f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
@@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ return OPTIMIZE (evex);
+
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_rtm);
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
new file mode 100644
index 0000000000..58642db748
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with EVEX.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_evex
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 56d8c118e4..85afd6535f 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -19,6 +19,9 @@
#if IS_IN (libc)
# include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# endif
# ifndef STRCMP
# define STRCMP __strcmp_evex
@@ -34,19 +37,29 @@
# define VMOVA vmovdqa64
# ifdef USE_AS_WCSCMP
-# define TESTEQ subl $0xff,
+# ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __wcscmp_evex
+# endif
+
+# define TESTEQ subl $0xff,
/* Compare packed dwords. */
# define VPCMP vpcmpd
# define VPMINU vpminud
# define VPTESTM vptestmd
+# define VPTESTNM vptestnmd
/* 1 dword char == 4 bytes. */
# define SIZE_OF_CHAR 4
# else
+# ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __strcmp_evex
+# endif
+
# define TESTEQ incl
/* Compare packed bytes. */
# define VPCMP vpcmpb
# define VPMINU vpminub
# define VPTESTM vptestmb
+# define VPTESTNM vptestnmb
/* 1 byte char == 1 byte. */
# define SIZE_OF_CHAR 1
# endif
@@ -73,11 +86,16 @@
# define VEC_OFFSET (-VEC_SIZE)
# endif
-# define XMMZERO xmm16
# define XMM0 xmm17
# define XMM1 xmm18
-# define YMMZERO ymm16
+# define XMM10 xmm27
+# define XMM11 xmm28
+# define XMM12 xmm29
+# define XMM13 xmm30
+# define XMM14 xmm31
+
+
# define YMM0 ymm17
# define YMM1 ymm18
# define YMM2 ymm19
@@ -89,6 +107,87 @@
# define YMM8 ymm25
# define YMM9 ymm26
# define YMM10 ymm27
+# define YMM11 ymm28
+# define YMM12 ymm29
+# define YMM13 ymm30
+# define YMM14 ymm31
+
+# ifdef USE_AS_STRCASECMP_L
+# define BYTE_LOOP_REG OFFSET_REG
+# else
+# define BYTE_LOOP_REG ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRNCMP
+# define STRCASECMP __strncasecmp_evex
+# define LOCALE_REG rcx
+# define LOCALE_REG_LP RCX_LP
+# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
+# else
+# define STRCASECMP __strcasecmp_evex
+# define LOCALE_REG rdx
+# define LOCALE_REG_LP RDX_LP
+# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
+# endif
+# endif
+
+# define LCASE_MIN_YMM %YMM12
+# define LCASE_MAX_YMM %YMM13
+# define CASE_ADD_YMM %YMM14
+
+# define LCASE_MIN_XMM %XMM12
+# define LCASE_MAX_XMM %XMM13
+# define CASE_ADD_XMM %XMM14
+
+ /* NB: wcsncmp uses r11 but strcasecmp is never used in
+ conjunction with wcscmp. */
+# define TOLOWER_BASE %r11
+
+# ifdef USE_AS_STRCASECMP_L
+# define _REG(x, y) x ## y
+# define REG(x, y) _REG(x, y)
+# define TOLOWER(reg1, reg2, ext) \
+ vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \
+ vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \
+ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \
+ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \
+ vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \
+ vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6}
+
+# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM)
+# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM)
+
+# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \
+ TOLOWER (s1_reg, s2_reg, ext); \
+ VPCMP $0, s1_reg, s2_reg, reg_out
+
+# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \
+ VMOVU s2_mem, s2_reg; \
+ CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
+
+# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
+# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
+
+# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
+# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+
+# else
+# define TOLOWER_gpr(...)
+# define TOLOWER_YMM(...)
+# define TOLOWER_XMM(...)
+
+# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \
+ VPCMP $0, s2_reg, s1_reg, reg_out
+
+# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+
+# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \
+ VPCMP $0, s2_mem, s1_reg, reg_out
+
+# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+# endif
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
@@ -112,7 +211,41 @@
returned. */
.section .text.evex, "ax", @progbits
-ENTRY(STRCMP)
+ .align 16
+ .type STRCMP, @function
+ .globl STRCMP
+ .hidden STRCMP
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (STRCASECMP)
+ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
+ mov %fs:(%rax), %LOCALE_REG_LP
+
+ /* Either 1 or 5 bytes (depending if CET is enabled). */
+ .p2align 4
+END (STRCASECMP)
+ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
+# endif
+
+ .p2align 4
+STRCMP:
+ cfi_startproc
+ _CET_ENDBR
+ CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales with
+ encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+# else
+ mov (%LOCALE_REG), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+ jne STRCASECMP_NONASCII
+ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
# ifdef USE_AS_STRNCMP
# ifdef __ILP32__
/* Clear the upper 32 bits. */
@@ -125,6 +258,32 @@ ENTRY(STRCMP)
actually bound the buffer. */
jle L(one_or_less)
# endif
+
+# if defined USE_AS_STRCASECMP_L
+ .section .rodata.cst32, "aM", @progbits, 32
+ .align 32
+L(lcase_min):
+ .quad 0x4141414141414141
+ .quad 0x4141414141414141
+ .quad 0x4141414141414141
+ .quad 0x4141414141414141
+L(lcase_max):
+ .quad 0x1a1a1a1a1a1a1a1a
+ .quad 0x1a1a1a1a1a1a1a1a
+ .quad 0x1a1a1a1a1a1a1a1a
+ .quad 0x1a1a1a1a1a1a1a1a
+L(case_add):
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+
+ vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
+ vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
+ vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
+# endif
+
movl %edi, %eax
orl %esi, %eax
/* Shift out the bits irrelivant to page boundary ([63:12]). */
@@ -139,7 +298,7 @@ L(no_page_cross):
VPTESTM %YMM0, %YMM0, %k2
/* Each bit cleared in K1 represents a mismatch or a null CHAR
in YMM0 and 32 bytes at (%rsi). */
- VPCMP $0, (%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_STRNCMP
cmpq $CHAR_PER_VEC, %rdx
@@ -169,6 +328,8 @@ L(return_vec_0):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret0):
@@ -192,7 +353,7 @@ L(one_or_less):
# ifdef USE_AS_WCSCMP
/* 'nbe' covers the case where length is negative (large
unsigned). */
- jnbe __wcscmp_evex
+ jnbe OVERFLOW_STRCMP
movl (%rdi), %edx
xorl %eax, %eax
cmpl (%rsi), %edx
@@ -203,9 +364,11 @@ L(one_or_less):
# else
/* 'nbe' covers the case where length is negative (large
unsigned). */
- jnbe __strcmp_evex
+ jnbe OVERFLOW_STRCMP
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret1):
@@ -233,6 +396,8 @@ L(return_vec_1):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret2):
@@ -270,6 +435,8 @@ L(return_vec_2):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret3):
@@ -290,6 +457,8 @@ L(return_vec_3):
# else
movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret4):
@@ -303,7 +472,7 @@ L(more_3x_vec):
/* Safe to compare 4x vectors. */
VMOVU (VEC_SIZE)(%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_1)
@@ -315,14 +484,14 @@ L(more_3x_vec):
VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_2)
VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_3)
@@ -381,7 +550,6 @@ L(prepare_loop_aligned):
subl %esi, %eax
andl $(PAGE_SIZE - 1), %eax
- vpxorq %YMMZERO, %YMMZERO, %YMMZERO
/* Loop 4x comparisons at a time. */
.p2align 4
@@ -413,22 +581,35 @@ L(loop_skip_page_cross_check):
/* A zero CHAR in YMM9 means that there is a null CHAR. */
VPMINU %YMM8, %YMM9, %YMM9
- /* Each bit set in K1 represents a non-null CHAR in YMM8. */
+ /* Each bit set in K1 represents a non-null CHAR in YMM9. */
VPTESTM %YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
oring with YMM1. Result is stored in YMM6. */
vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
-
+# else
+ VMOVU (VEC_SIZE * 0)(%rsi), %YMM1
+ TOLOWER_YMM (%YMM0, %YMM1)
+ VMOVU (VEC_SIZE * 1)(%rsi), %YMM3
+ TOLOWER_YMM (%YMM2, %YMM3)
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
+ TOLOWER_YMM (%YMM4, %YMM5)
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
+ TOLOWER_YMM (%YMM6, %YMM7)
+ vpxorq %YMM0, %YMM1, %YMM1
+ vpxorq %YMM2, %YMM3, %YMM3
+ vpxorq %YMM4, %YMM5, %YMM5
+ vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+# endif
/* Or together YMM3, YMM5, and YMM6. */
vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
/* A non-zero CHAR in YMM6 represents a mismatch. */
- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
+ VPTESTNM %YMM6, %YMM6, %k0{%k1}
kmovd %k0, %LOOP_REG
TESTEQ %LOOP_REG
@@ -437,13 +618,13 @@ L(loop_skip_page_cross_check):
/* Find which VEC has the mismatch of end of string. */
VPTESTM %YMM0, %YMM0, %k1
- VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
+ VPTESTNM %YMM1, %YMM1, %k0{%k1}
kmovd %k0, %ecx
TESTEQ %ecx
jnz L(return_vec_0_end)
VPTESTM %YMM2, %YMM2, %k1
- VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
+ VPTESTNM %YMM3, %YMM3, %k0{%k1}
kmovd %k0, %ecx
TESTEQ %ecx
jnz L(return_vec_1_end)
@@ -457,7 +638,7 @@ L(return_vec_2_3_end):
# endif
VPTESTM %YMM4, %YMM4, %k1
- VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
+ VPTESTNM %YMM5, %YMM5, %k0{%k1}
kmovd %k0, %ecx
TESTEQ %ecx
# if CHAR_PER_VEC <= 16
@@ -493,6 +674,8 @@ L(return_vec_3_end):
# else
movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -545,6 +728,8 @@ L(return_vec_0_end):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
/* Flip `eax` if `rdi` and `rsi` where swapped in page cross
logic. Subtract `r8d` after xor for zero case. */
@@ -569,6 +754,8 @@ L(return_vec_1_end):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -598,7 +785,7 @@ L(page_cross_during_loop):
VMOVA (%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_0_end)
@@ -619,8 +806,7 @@ L(less_1x_vec_till_page_cross):
been loaded earlier so must be valid. */
VMOVU -VEC_SIZE(%rdi, %rax), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
-
+ CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
/* Mask of potentially valid bits. The lower bits can be out of
range comparisons (but safe regarding page crosses). */
@@ -642,6 +828,8 @@ L(less_1x_vec_till_page_cross):
# ifdef USE_AS_STRNCMP
# ifdef USE_AS_WCSCMP
+ /* NB: strcasecmp not used with WCSCMP so this access to r11 is
+ safe. */
movl %eax, %r11d
shrl $2, %r11d
cmpq %r11, %rdx
@@ -679,6 +867,8 @@ L(return_page_cross_cmp_mem):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -709,7 +899,7 @@ L(more_2x_vec_till_page_cross):
VMOVA VEC_SIZE(%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_1_end)
@@ -724,14 +914,14 @@ L(more_2x_vec_till_page_cross):
/* Safe to include comparisons from lower bytes. */
VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_page_cross_0)
VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_page_cross_1)
@@ -740,6 +930,8 @@ L(more_2x_vec_till_page_cross):
/* Must check length here as length might proclude reading next
page. */
# ifdef USE_AS_WCSCMP
+ /* NB: strcasecmp not used with WCSCMP so this access to r11 is
+ safe. */
movl %eax, %r11d
shrl $2, %r11d
cmpq %r11, %rdx
@@ -754,12 +946,19 @@ L(more_2x_vec_till_page_cross):
VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
VPMINU %YMM4, %YMM6, %YMM9
VPTESTM %YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */
vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
-
- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
+# else
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
+ TOLOWER_YMM (%YMM4, %YMM5)
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
+ TOLOWER_YMM (%YMM6, %YMM7)
+ vpxorq %YMM4, %YMM5, %YMM5
+ vpternlogd $0xde, %YMM7, %YMM5, %YMM6
+# endif
+ VPTESTNM %YMM6, %YMM6, %k0{%k1}
kmovd %k0, %LOOP_REG
TESTEQ %LOOP_REG
jnz L(return_vec_2_3_end)
@@ -815,6 +1014,8 @@ L(return_vec_page_cross_1):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -871,7 +1072,7 @@ L(page_cross):
L(page_cross_loop):
VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(check_ret_vec_page_cross)
@@ -895,7 +1096,7 @@ L(page_cross_loop):
*/
VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_STRNCMP
@@ -930,6 +1131,8 @@ L(ret_vec_page_cross_cont):
# else
movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -989,7 +1192,7 @@ L(less_1x_vec_till_page):
/* Use 16 byte comparison. */
vmovdqu (%rdi), %xmm0
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, (%rsi), %xmm0, %k1{%k2}
+ CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0xf, %ecx
@@ -1009,7 +1212,7 @@ L(less_1x_vec_till_page):
# endif
vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
+ CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0xf, %ecx
@@ -1048,7 +1251,7 @@ L(less_16_till_page):
vmovq (%rdi), %xmm0
vmovq (%rsi), %xmm1
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, %xmm1, %xmm0, %k1{%k2}
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0x3, %ecx
@@ -1068,7 +1271,7 @@ L(less_16_till_page):
vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, %xmm1, %xmm0, %k1{%k2}
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0x3, %ecx
@@ -1128,7 +1331,7 @@ L(ret_less_8_wcs):
vmovd (%rdi), %xmm0
vmovd (%rsi), %xmm1
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, %xmm1, %xmm0, %k1{%k2}
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
subl $0xf, %ecx
jnz L(check_ret_vec_page_cross)
@@ -1143,7 +1346,7 @@ L(ret_less_8_wcs):
vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, %xmm1, %xmm0, %k1{%k2}
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
subl $0xf, %ecx
jnz L(check_ret_vec_page_cross)
@@ -1176,7 +1379,9 @@ L(less_4_till_page):
L(less_4_loop):
movzbl (%rdi), %eax
movzbl (%rsi, %rdi), %ecx
- subl %ecx, %eax
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+ subl %BYTE_LOOP_REG, %eax
jnz L(ret_less_4_loop)
testl %ecx, %ecx
jz L(ret_zero_4_loop)
@@ -1203,5 +1408,6 @@ L(ret_less_4_loop):
subl %r8d, %eax
ret
# endif
-END(STRCMP)
+ cfi_endproc
+ .size STRCMP, .-STRCMP
#endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
new file mode 100644
index 0000000000..b0808c1b21
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
@@ -0,0 +1,25 @@
+/* strncasecmp_l optimized with EVEX.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_evex
+#endif
+#define OVERFLOW_STRCMP __strcasecmp_evex
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#include "strcmp-evex.S"
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v1 23/23] x86: Remove AVX str{n}casecmp
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (20 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 22/23] x86: Add EVEX " Noah Goldstein
@ 2022-03-23 21:57 ` Noah Goldstein
2022-03-24 19:04 ` H.J. Lu
2022-03-24 18:43 ` [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c H.J. Lu
22 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-23 21:57 UTC (permalink / raw)
To: libc-alpha
The rationale is:
1. SSE42 has nearly identical logic so any benefit is minimal (3.4%
regression on Tigerlake using SSE42 versus AVX across the
benchtest suite).
2. AVX2 version covers the majority of targets that previously
preferred it.
3. The targets where AVX would still be best (SnB and IVB) are
becoming outdated.
All in all, the code-size saving is worth it.
All string/memory tests pass.
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, AVX Time / SSE42 Time
1, 1, 1, 127, 0.928
2, 2, 2, 127, 0.934
3, 3, 3, 127, 0.975
4, 4, 4, 127, 0.96
5, 5, 5, 127, 0.935
6, 6, 6, 127, 0.929
7, 7, 7, 127, 0.959
8, 0, 0, 127, 0.955
9, 1, 1, 127, 0.944
10, 2, 2, 127, 0.975
11, 3, 3, 127, 0.935
12, 4, 4, 127, 0.931
13, 5, 5, 127, 0.926
14, 6, 6, 127, 0.901
15, 7, 7, 127, 0.951
4, 0, 0, 127, 0.958
4, 0, 0, 254, 0.956
8, 0, 0, 254, 0.977
16, 0, 0, 127, 0.955
16, 0, 0, 254, 0.953
32, 0, 0, 127, 0.943
32, 0, 0, 254, 0.941
64, 0, 0, 127, 0.941
64, 0, 0, 254, 0.955
128, 0, 0, 127, 0.972
128, 0, 0, 254, 0.975
256, 0, 0, 127, 0.996
256, 0, 0, 254, 0.993
512, 0, 0, 127, 0.992
512, 0, 0, 254, 0.986
1024, 0, 0, 127, 0.994
1024, 0, 0, 254, 0.993
16, 1, 2, 127, 0.933
16, 2, 1, 254, 0.953
32, 2, 4, 127, 0.927
32, 4, 2, 254, 0.986
64, 3, 6, 127, 0.991
64, 6, 3, 254, 1.014
128, 4, 0, 127, 1.001
128, 0, 4, 254, 0.991
256, 5, 2, 127, 1.011
256, 2, 5, 254, 1.013
512, 6, 4, 127, 1.056
512, 4, 6, 254, 0.916
1024, 7, 6, 127, 1.059
1024, 6, 7, 254, 1.043
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 -
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 -
sysdeps/x86_64/multiarch/strcasecmp_l-avx.S | 22 --
sysdeps/x86_64/multiarch/strcmp-sse42.S | 240 +++++++++-----------
sysdeps/x86_64/multiarch/strncase_l-avx.S | 22 --
6 files changed, 105 insertions(+), 197 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 35d80dc2ff..6507d1b7fa 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -54,7 +54,6 @@ sysdep_routines += \
stpncpy-evex \
stpncpy-sse2-unaligned \
stpncpy-ssse3 \
- strcasecmp_l-avx \
strcasecmp_l-avx2 \
strcasecmp_l-avx2-rtm \
strcasecmp_l-evex \
@@ -95,7 +94,6 @@ sysdep_routines += \
strlen-avx2-rtm \
strlen-evex \
strlen-sse2 \
- strncase_l-avx \
strncase_l-avx2 \
strncase_l-avx2-rtm \
strncase_l-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f1a4d3dac2..40cc6cc49e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -447,9 +447,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strcasecmp_avx2_rtm)
- IFUNC_IMPL_ADD (array, i, strcasecmp,
- CPU_FEATURE_USABLE (AVX),
- __strcasecmp_avx)
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (SSE4_2),
__strcasecmp_sse42)
@@ -471,9 +468,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strcasecmp_l_avx2_rtm)
- IFUNC_IMPL_ADD (array, i, strcasecmp_l,
- CPU_FEATURE_USABLE (AVX),
- __strcasecmp_l_avx)
IFUNC_IMPL_ADD (array, i, strcasecmp_l,
CPU_FEATURE_USABLE (SSE4_2),
__strcasecmp_l_sse42)
@@ -609,9 +603,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strncasecmp_avx2_rtm)
- IFUNC_IMPL_ADD (array, i, strncasecmp,
- CPU_FEATURE_USABLE (AVX),
- __strncasecmp_avx)
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (SSE4_2),
__strncasecmp_sse42)
@@ -634,9 +625,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strncasecmp_l_avx2_rtm)
- IFUNC_IMPL_ADD (array, i, strncasecmp_l,
- CPU_FEATURE_USABLE (AVX),
- __strncasecmp_l_avx)
IFUNC_IMPL_ADD (array, i, strncasecmp_l,
CPU_FEATURE_USABLE (SSE4_2),
__strncasecmp_l_sse42)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index bf0d146e7f..766539c241 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -22,7 +22,6 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -46,9 +45,6 @@ IFUNC_SELECTOR (void)
return OPTIMIZE (avx2);
}
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
- return OPTIMIZE (avx);
-
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
&& !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
return OPTIMIZE (sse42);
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
deleted file mode 100644
index 7ec7c21b5a..0000000000
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* strcasecmp_l optimized with AVX.
- Copyright (C) 2017-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#define STRCMP_SSE42 __strcasecmp_l_avx
-#define USE_AVX 1
-#define USE_AS_STRCASECMP_L
-#include "strcmp-sse42.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index 7805ae9d41..a9178ad25c 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -41,13 +41,8 @@
# define UPDATE_STRNCMP_COUNTER
#endif
-#ifdef USE_AVX
-# define SECTION avx
-# define GLABEL(l) l##_avx
-#else
-# define SECTION sse4.2
-# define GLABEL(l) l##_sse42
-#endif
+#define SECTION sse4.2
+#define GLABEL(l) l##_sse42
#define LABEL(l) .L##l
@@ -105,21 +100,7 @@ END (GLABEL(__strncasecmp))
#endif
-#ifdef USE_AVX
-# define movdqa vmovdqa
-# define movdqu vmovdqu
-# define pmovmskb vpmovmskb
-# define pcmpistri vpcmpistri
-# define psubb vpsubb
-# define pcmpeqb vpcmpeqb
-# define psrldq vpsrldq
-# define pslldq vpslldq
-# define palignr vpalignr
-# define pxor vpxor
-# define D(arg) arg, arg
-#else
-# define D(arg) arg
-#endif
+#define arg arg
STRCMP_SSE42:
cfi_startproc
@@ -191,18 +172,7 @@ LABEL(case_add):
movdqu (%rdi), %xmm1
movdqu (%rsi), %xmm2
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# ifdef USE_AVX
-# define TOLOWER(reg1, reg2) \
- vpaddb LCASE_MIN_reg, reg1, %xmm7; \
- vpaddb LCASE_MIN_reg, reg2, %xmm8; \
- vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \
- vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \
- vpandn CASE_ADD_reg, %xmm7, %xmm7; \
- vpandn CASE_ADD_reg, %xmm8, %xmm8; \
- vpaddb %xmm7, reg1, reg1; \
- vpaddb %xmm8, reg2, reg2
-# else
-# define TOLOWER(reg1, reg2) \
+# define TOLOWER(reg1, reg2) \
movdqa LCASE_MIN_reg, %xmm7; \
movdqa LCASE_MIN_reg, %xmm8; \
paddb reg1, %xmm7; \
@@ -213,15 +183,15 @@ LABEL(case_add):
pandn CASE_ADD_reg, %xmm8; \
paddb %xmm7, reg1; \
paddb %xmm8, reg2
-# endif
+
TOLOWER (%xmm1, %xmm2)
#else
# define TOLOWER(reg1, reg2)
#endif
- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
- pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
jnz LABEL(less16bytes)/* If not, find different value or null char */
@@ -245,7 +215,7 @@ LABEL(crosscache):
xor %r8d, %r8d
and $0xf, %ecx /* offset of rsi */
and $0xf, %eax /* offset of rdi */
- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
cmp %eax, %ecx
je LABEL(ashr_0) /* rsi and rdi relative offset same */
ja LABEL(bigger)
@@ -259,7 +229,7 @@ LABEL(bigger):
sub %rcx, %r9
lea LABEL(unaligned_table)(%rip), %r10
movslq (%r10, %r9,4), %r9
- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
lea (%r10, %r9), %r10
_CET_NOTRACK jmp *%r10 /* jump to corresponding case */
@@ -272,15 +242,15 @@ LABEL(bigger):
LABEL(ashr_0):
movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
+ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
#else
movdqa (%rdi), %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
+ pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
#endif
- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
pmovmskb %xmm1, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
@@ -360,10 +330,10 @@ LABEL(ashr_0_exit_use):
*/
.p2align 4
LABEL(ashr_1):
- pslldq $15, D(%xmm2) /* shift first string to align with second */
+ pslldq $15, %xmm2 /* shift first string to align with second */
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
- psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
pmovmskb %xmm2, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
@@ -391,7 +361,7 @@ LABEL(loop_ashr_1_use):
LABEL(nibble_ashr_1_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $1, -16(%rdi, %rdx), D(%xmm0)
+ palignr $1, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
@@ -410,7 +380,7 @@ LABEL(nibble_ashr_1_restart_use):
jg LABEL(nibble_ashr_1_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $1, -16(%rdi, %rdx), D(%xmm0)
+ palignr $1, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
@@ -430,7 +400,7 @@ LABEL(nibble_ashr_1_restart_use):
LABEL(nibble_ashr_1_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $1, D(%xmm0)
+ psrldq $1, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
@@ -448,10 +418,10 @@ LABEL(nibble_ashr_1_use):
*/
.p2align 4
LABEL(ashr_2):
- pslldq $14, D(%xmm2)
+ pslldq $14, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
@@ -479,7 +449,7 @@ LABEL(loop_ashr_2_use):
LABEL(nibble_ashr_2_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $2, -16(%rdi, %rdx), D(%xmm0)
+ palignr $2, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
@@ -498,7 +468,7 @@ LABEL(nibble_ashr_2_restart_use):
jg LABEL(nibble_ashr_2_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $2, -16(%rdi, %rdx), D(%xmm0)
+ palignr $2, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
@@ -518,7 +488,7 @@ LABEL(nibble_ashr_2_restart_use):
LABEL(nibble_ashr_2_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $2, D(%xmm0)
+ psrldq $2, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
@@ -536,10 +506,10 @@ LABEL(nibble_ashr_2_use):
*/
.p2align 4
LABEL(ashr_3):
- pslldq $13, D(%xmm2)
+ pslldq $13, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
@@ -567,7 +537,7 @@ LABEL(loop_ashr_3_use):
LABEL(nibble_ashr_3_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $3, -16(%rdi, %rdx), D(%xmm0)
+ palignr $3, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
@@ -586,7 +556,7 @@ LABEL(nibble_ashr_3_restart_use):
jg LABEL(nibble_ashr_3_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $3, -16(%rdi, %rdx), D(%xmm0)
+ palignr $3, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
@@ -606,7 +576,7 @@ LABEL(nibble_ashr_3_restart_use):
LABEL(nibble_ashr_3_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $3, D(%xmm0)
+ psrldq $3, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
@@ -624,10 +594,10 @@ LABEL(nibble_ashr_3_use):
*/
.p2align 4
LABEL(ashr_4):
- pslldq $12, D(%xmm2)
+ pslldq $12, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
@@ -656,7 +626,7 @@ LABEL(loop_ashr_4_use):
LABEL(nibble_ashr_4_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $4, -16(%rdi, %rdx), D(%xmm0)
+ palignr $4, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
@@ -675,7 +645,7 @@ LABEL(nibble_ashr_4_restart_use):
jg LABEL(nibble_ashr_4_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $4, -16(%rdi, %rdx), D(%xmm0)
+ palignr $4, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
@@ -695,7 +665,7 @@ LABEL(nibble_ashr_4_restart_use):
LABEL(nibble_ashr_4_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $4, D(%xmm0)
+ psrldq $4, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
@@ -713,10 +683,10 @@ LABEL(nibble_ashr_4_use):
*/
.p2align 4
LABEL(ashr_5):
- pslldq $11, D(%xmm2)
+ pslldq $11, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
@@ -745,7 +715,7 @@ LABEL(loop_ashr_5_use):
LABEL(nibble_ashr_5_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $5, -16(%rdi, %rdx), D(%xmm0)
+ palignr $5, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
@@ -765,7 +735,7 @@ LABEL(nibble_ashr_5_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $5, -16(%rdi, %rdx), D(%xmm0)
+ palignr $5, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
@@ -785,7 +755,7 @@ LABEL(nibble_ashr_5_restart_use):
LABEL(nibble_ashr_5_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $5, D(%xmm0)
+ psrldq $5, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
@@ -803,10 +773,10 @@ LABEL(nibble_ashr_5_use):
*/
.p2align 4
LABEL(ashr_6):
- pslldq $10, D(%xmm2)
+ pslldq $10, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
@@ -835,7 +805,7 @@ LABEL(loop_ashr_6_use):
LABEL(nibble_ashr_6_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $6, -16(%rdi, %rdx), D(%xmm0)
+ palignr $6, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
@@ -854,7 +824,7 @@ LABEL(nibble_ashr_6_restart_use):
jg LABEL(nibble_ashr_6_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $6, -16(%rdi, %rdx), D(%xmm0)
+ palignr $6, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
@@ -874,7 +844,7 @@ LABEL(nibble_ashr_6_restart_use):
LABEL(nibble_ashr_6_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $6, D(%xmm0)
+ psrldq $6, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
@@ -892,10 +862,10 @@ LABEL(nibble_ashr_6_use):
*/
.p2align 4
LABEL(ashr_7):
- pslldq $9, D(%xmm2)
+ pslldq $9, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
@@ -924,7 +894,7 @@ LABEL(loop_ashr_7_use):
LABEL(nibble_ashr_7_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $7, -16(%rdi, %rdx), D(%xmm0)
+ palignr $7, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
@@ -943,7 +913,7 @@ LABEL(nibble_ashr_7_restart_use):
jg LABEL(nibble_ashr_7_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $7, -16(%rdi, %rdx), D(%xmm0)
+ palignr $7, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
@@ -963,7 +933,7 @@ LABEL(nibble_ashr_7_restart_use):
LABEL(nibble_ashr_7_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $7, D(%xmm0)
+ psrldq $7, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
@@ -981,10 +951,10 @@ LABEL(nibble_ashr_7_use):
*/
.p2align 4
LABEL(ashr_8):
- pslldq $8, D(%xmm2)
+ pslldq $8, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
@@ -1013,7 +983,7 @@ LABEL(loop_ashr_8_use):
LABEL(nibble_ashr_8_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $8, -16(%rdi, %rdx), D(%xmm0)
+ palignr $8, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1032,7 +1002,7 @@ LABEL(nibble_ashr_8_restart_use):
jg LABEL(nibble_ashr_8_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $8, -16(%rdi, %rdx), D(%xmm0)
+ palignr $8, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1052,7 +1022,7 @@ LABEL(nibble_ashr_8_restart_use):
LABEL(nibble_ashr_8_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $8, D(%xmm0)
+ psrldq $8, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
@@ -1070,10 +1040,10 @@ LABEL(nibble_ashr_8_use):
*/
.p2align 4
LABEL(ashr_9):
- pslldq $7, D(%xmm2)
+ pslldq $7, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
@@ -1103,7 +1073,7 @@ LABEL(loop_ashr_9_use):
LABEL(nibble_ashr_9_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $9, -16(%rdi, %rdx), D(%xmm0)
+ palignr $9, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1122,7 +1092,7 @@ LABEL(nibble_ashr_9_restart_use):
jg LABEL(nibble_ashr_9_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $9, -16(%rdi, %rdx), D(%xmm0)
+ palignr $9, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1142,7 +1112,7 @@ LABEL(nibble_ashr_9_restart_use):
LABEL(nibble_ashr_9_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $9, D(%xmm0)
+ psrldq $9, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
@@ -1160,10 +1130,10 @@ LABEL(nibble_ashr_9_use):
*/
.p2align 4
LABEL(ashr_10):
- pslldq $6, D(%xmm2)
+ pslldq $6, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
@@ -1192,7 +1162,7 @@ LABEL(loop_ashr_10_use):
LABEL(nibble_ashr_10_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $10, -16(%rdi, %rdx), D(%xmm0)
+ palignr $10, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1211,7 +1181,7 @@ LABEL(nibble_ashr_10_restart_use):
jg LABEL(nibble_ashr_10_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $10, -16(%rdi, %rdx), D(%xmm0)
+ palignr $10, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1231,7 +1201,7 @@ LABEL(nibble_ashr_10_restart_use):
LABEL(nibble_ashr_10_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $10, D(%xmm0)
+ psrldq $10, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
@@ -1249,10 +1219,10 @@ LABEL(nibble_ashr_10_use):
*/
.p2align 4
LABEL(ashr_11):
- pslldq $5, D(%xmm2)
+ pslldq $5, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
@@ -1281,7 +1251,7 @@ LABEL(loop_ashr_11_use):
LABEL(nibble_ashr_11_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $11, -16(%rdi, %rdx), D(%xmm0)
+ palignr $11, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1300,7 +1270,7 @@ LABEL(nibble_ashr_11_restart_use):
jg LABEL(nibble_ashr_11_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $11, -16(%rdi, %rdx), D(%xmm0)
+ palignr $11, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1320,7 +1290,7 @@ LABEL(nibble_ashr_11_restart_use):
LABEL(nibble_ashr_11_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $11, D(%xmm0)
+ psrldq $11, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
@@ -1338,10 +1308,10 @@ LABEL(nibble_ashr_11_use):
*/
.p2align 4
LABEL(ashr_12):
- pslldq $4, D(%xmm2)
+ pslldq $4, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
@@ -1370,7 +1340,7 @@ LABEL(loop_ashr_12_use):
LABEL(nibble_ashr_12_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $12, -16(%rdi, %rdx), D(%xmm0)
+ palignr $12, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1389,7 +1359,7 @@ LABEL(nibble_ashr_12_restart_use):
jg LABEL(nibble_ashr_12_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $12, -16(%rdi, %rdx), D(%xmm0)
+ palignr $12, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1409,7 +1379,7 @@ LABEL(nibble_ashr_12_restart_use):
LABEL(nibble_ashr_12_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $12, D(%xmm0)
+ psrldq $12, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
@@ -1427,10 +1397,10 @@ LABEL(nibble_ashr_12_use):
*/
.p2align 4
LABEL(ashr_13):
- pslldq $3, D(%xmm2)
+ pslldq $3, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
@@ -1460,7 +1430,7 @@ LABEL(loop_ashr_13_use):
LABEL(nibble_ashr_13_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $13, -16(%rdi, %rdx), D(%xmm0)
+ palignr $13, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1479,7 +1449,7 @@ LABEL(nibble_ashr_13_restart_use):
jg LABEL(nibble_ashr_13_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $13, -16(%rdi, %rdx), D(%xmm0)
+ palignr $13, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1499,7 +1469,7 @@ LABEL(nibble_ashr_13_restart_use):
LABEL(nibble_ashr_13_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $13, D(%xmm0)
+ psrldq $13, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
@@ -1517,10 +1487,10 @@ LABEL(nibble_ashr_13_use):
*/
.p2align 4
LABEL(ashr_14):
- pslldq $2, D(%xmm2)
+ pslldq $2, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
@@ -1550,7 +1520,7 @@ LABEL(loop_ashr_14_use):
LABEL(nibble_ashr_14_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $14, -16(%rdi, %rdx), D(%xmm0)
+ palignr $14, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1569,7 +1539,7 @@ LABEL(nibble_ashr_14_restart_use):
jg LABEL(nibble_ashr_14_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $14, -16(%rdi, %rdx), D(%xmm0)
+ palignr $14, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1589,7 +1559,7 @@ LABEL(nibble_ashr_14_restart_use):
LABEL(nibble_ashr_14_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $14, D(%xmm0)
+ psrldq $14, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
@@ -1607,10 +1577,10 @@ LABEL(nibble_ashr_14_use):
*/
.p2align 4
LABEL(ashr_15):
- pslldq $1, D(%xmm2)
+ pslldq $1, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
@@ -1642,7 +1612,7 @@ LABEL(loop_ashr_15_use):
LABEL(nibble_ashr_15_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $15, -16(%rdi, %rdx), D(%xmm0)
+ palignr $15, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1661,7 +1631,7 @@ LABEL(nibble_ashr_15_restart_use):
jg LABEL(nibble_ashr_15_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $15, -16(%rdi, %rdx), D(%xmm0)
+ palignr $15, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
@@ -1681,7 +1651,7 @@ LABEL(nibble_ashr_15_restart_use):
LABEL(nibble_ashr_15_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $15, D(%xmm0)
+ psrldq $15, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S
deleted file mode 100644
index b51b86d223..0000000000
--- a/sysdeps/x86_64/multiarch/strncase_l-avx.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* strncasecmp_l optimized with AVX.
- Copyright (C) 2017-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#define STRCMP_SSE42 __strncasecmp_l_avx
-#define USE_AVX 1
-#define USE_AS_STRNCASECMP_L
-#include "strcmp-sse42.S"
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
` (21 preceding siblings ...)
2022-03-23 21:57 ` [PATCH v1 23/23] x86: Remove AVX str{n}casecmp Noah Goldstein
@ 2022-03-24 18:43 ` H.J. Lu
22 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:43 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Just QOL change to make parsing the output of the benchtests more
> consistent.
> ---
> benchtests/bench-strchr.c | 94 ++++++++++++++++++++++++++-------------
> 1 file changed, 64 insertions(+), 30 deletions(-)
>
> diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
> index 821bc615b0..203900d4ad 100644
> --- a/benchtests/bench-strchr.c
> +++ b/benchtests/bench-strchr.c
> @@ -32,6 +32,7 @@
> #endif /* WIDE */
> #include "bench-string.h"
>
> +#include "json-lib.h"
> #define BIG_CHAR MAX_CHAR
>
> #ifndef WIDE
> @@ -74,10 +75,19 @@ IMPL (simple_STRCHR, 0)
> IMPL (STRCHR, 1)
>
> static void
> -do_one_test (impl_t *impl, const CHAR *s, int c, const CHAR *exp_res)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> + const CHAR *exp_res)
> {
> size_t i, iters = INNER_LOOP_ITERS_LARGE;
> timing_t start, stop, cur;
> + const CHAR *res = CALL (impl, s, c);
> + if (res != exp_res)
> + {
> + error (0, 0, "Wrong result in function %s %p != %p", impl->name, res,
> + exp_res);
> + ret = 1;
> + return;
> + }
>
> TIMING_NOW (start);
> for (i = 0; i < iters; ++i)
> @@ -88,11 +98,12 @@ do_one_test (impl_t *impl, const CHAR *s, int c, const CHAR *exp_res)
>
> TIMING_DIFF (cur, start, stop);
>
> - TIMING_PRINT_MEAN ((double) cur, (double) iters);
> + json_element_double (json_ctx, (double)cur / (double)iters);
> }
>
> static void
> -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> + int seek_char, int max_char)
> /* For wcschr: align here means align not in bytes,
> but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
> @@ -124,87 +135,110 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> else
> result = NULLRET (buf + align + len);
>
> - printf ("Length %4zd, alignment in bytes %2zd:",
> - pos, align * sizeof (CHAR));
> + json_element_object_begin (json_ctx);
> + json_attr_uint (json_ctx, "length", len);
> + json_attr_uint (json_ctx, "pos", pos);
> + json_attr_uint (json_ctx, "seek_char", seek_char);
> + json_attr_uint (json_ctx, "max_char", max_char);
> + json_attr_uint (json_ctx, "alignment", align);
> + json_array_begin (json_ctx, "timings");
>
> FOR_EACH_IMPL (impl, 0)
> - do_one_test (impl, buf + align, seek_char, result);
> + do_one_test (json_ctx, impl, buf + align, seek_char, result);
>
> - putchar ('\n');
> + json_array_end (json_ctx);
> + json_element_object_end (json_ctx);
> }
>
> int
> test_main (void)
> {
> + json_ctx_t json_ctx;
> size_t i;
>
> test_init ();
>
> - printf ("%20s", "");
> + json_init (&json_ctx, 0, stdout);
> +
> + json_document_begin (&json_ctx);
> + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> + json_attr_object_begin (&json_ctx, "functions");
> + json_attr_object_begin (&json_ctx, TEST_NAME);
> + json_attr_string (&json_ctx, "bench-variant", "");
> +
> + json_array_begin (&json_ctx, "ifuncs");
> FOR_EACH_IMPL (impl, 0)
> - printf ("\t%s", impl->name);
> - putchar ('\n');
> + json_element_string (&json_ctx, impl->name);
> + json_array_end (&json_ctx);
> +
> + json_array_begin (&json_ctx, "results");
>
> for (i = 1; i < 8; ++i)
> {
> - do_test (0, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> - do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (&json_ctx, 0, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (&json_ctx, i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> }
>
> for (i = 1; i < 8; ++i)
> {
> - do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> - do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (&json_ctx, 0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (&json_ctx, i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> }
>
> for (i = 1; i < 8; ++i)
> {
> - do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
> - do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
> + do_test (&json_ctx, i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (&json_ctx, i, 64, 256, SMALL_CHAR, BIG_CHAR);
> }
>
> for (i = 0; i < 8; ++i)
> {
> - do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
> - do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
> + do_test (&json_ctx, 16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (&json_ctx, 16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
> }
>
> for (i = 0; i < 32; ++i)
> {
> - do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
> - do_test (0, i, i + 1, SMALL_CHAR, BIG_CHAR);
> + do_test (&json_ctx, 0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (&json_ctx, 0, i, i + 1, SMALL_CHAR, BIG_CHAR);
> }
>
> for (i = 1; i < 8; ++i)
> {
> - do_test (0, 16 << i, 2048, 0, MIDDLE_CHAR);
> - do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
> + do_test (&json_ctx, 0, 16 << i, 2048, 0, MIDDLE_CHAR);
> + do_test (&json_ctx, i, 16 << i, 2048, 0, MIDDLE_CHAR);
> }
>
> for (i = 1; i < 8; ++i)
> {
> - do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
> - do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
> + do_test (&json_ctx, 0, 16 << i, 4096, 0, MIDDLE_CHAR);
> + do_test (&json_ctx, i, 16 << i, 4096, 0, MIDDLE_CHAR);
> }
>
> for (i = 1; i < 8; ++i)
> {
> - do_test (i, 64, 256, 0, MIDDLE_CHAR);
> - do_test (i, 64, 256, 0, BIG_CHAR);
> + do_test (&json_ctx, i, 64, 256, 0, MIDDLE_CHAR);
> + do_test (&json_ctx, i, 64, 256, 0, BIG_CHAR);
> }
>
> for (i = 0; i < 8; ++i)
> {
> - do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
> - do_test (16 * i, 256, 512, 0, BIG_CHAR);
> + do_test (&json_ctx, 16 * i, 256, 512, 0, MIDDLE_CHAR);
> + do_test (&json_ctx, 16 * i, 256, 512, 0, BIG_CHAR);
> }
>
> for (i = 0; i < 32; ++i)
> {
> - do_test (0, i, i + 1, 0, MIDDLE_CHAR);
> - do_test (0, i, i + 1, 0, BIG_CHAR);
> + do_test (&json_ctx, 0, i, i + 1, 0, MIDDLE_CHAR);
> + do_test (&json_ctx, 0, i, i + 1, 0, BIG_CHAR);
> }
>
> + json_array_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_document_end (&json_ctx);
> +
> return ret;
> }
>
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 02/23] benchtests: Add random benchmark in bench-strchr.c
2022-03-23 21:57 ` [PATCH v1 02/23] benchtests: Add random benchmark " Noah Goldstein
@ 2022-03-24 18:44 ` H.J. Lu
0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:44 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Add benchmark that randomizes whether return should be NULL or pointer
> to CHAR. The rationale is on many architectures there is a choice
> between a predicate execution option (i.e cmovcc on x86) or a branch.
>
> On x86 the results for cmovcc vs branch are something along the lines
> of the following:
>
> perc-zero, Br On Result, Time Br / Time cmov
> 0.10, 1, ,0.983
> 0.10, 0, ,1.246
> 0.25, 1, ,1.035
> 0.25, 0, ,1.49
> 0.33, 1, ,1.016
> 0.33, 0, ,1.579
> 0.50, 1, ,1.228
> 0.50, 0, ,1.739
> 0.66, 1, ,1.039
> 0.66, 0, ,1.764
> 0.75, 1, ,0.996
> 0.75, 0, ,1.642
> 0.90, 1, ,1.071
> 0.90, 0, ,1.409
> 1.00, 1, ,0.937
> 1.00, 0, ,0.999
> ---
> benchtests/bench-strchr.c | 143 ++++++++++++++++++++++++++++++++++++++
> 1 file changed, 143 insertions(+)
>
> diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
> index 203900d4ad..54640bde7e 100644
> --- a/benchtests/bench-strchr.c
> +++ b/benchtests/bench-strchr.c
> @@ -53,6 +53,11 @@
> # define SMALL_CHAR 851
> #endif /* WIDE */
>
> +#ifdef USE_FOR_STRCHRNUL
> +# define DO_RAND_TEST(...)
> +#else
> +# define DO_RAND_TEST(...) do_rand_test(__VA_ARGS__)
> +#endif
> #ifdef USE_FOR_STRCHRNUL
> # define NULLRET(endptr) endptr
> #else
> @@ -74,6 +79,133 @@ simple_STRCHR (const CHAR *s, int c)
> IMPL (simple_STRCHR, 0)
> IMPL (STRCHR, 1)
>
> +#ifndef USE_FOR_STRCHRNUL
> +/* Random benchmarks for strchr (if return is CHAR or NULL). The
> + rationale for the benchmark is returning null/char can be done with
> + predicate execution (i.e. cmovcc on x86) or a branch. */
> +
> +
> +/* Large enough that full history can't be stored in BHT. */
> +#define NUM_SEARCH_CHARS 2048
> +
> +/* Expectation is that use cases of strchr check the return. Otherwise
> + strchrnul would almost always be better. Since there is another
> + branch coming we want to test the case where a potential branch in
> + strchr can be used to skip a later mispredict because of the
> + relationship between the two branches. */
> +static void __attribute__ ((noinline, noclone))
> +do_one_rand_plus_branch_test (json_ctx_t *json_ctx, impl_t *impl,
> + const CHAR *s, const CHAR *c)
> +{
> + size_t i, iters = INNER_LOOP_ITERS_LARGE;
> + int must_execute = 0;
> + timing_t start, stop, cur;
> + TIMING_NOW (start);
> + for (i = 0; i < iters; ++i)
> + {
> + if (CALL (impl, s, c[i % NUM_SEARCH_CHARS]))
> + {
> + /* We just need something that will force compiler to emit
> + a branch instead of conditional execution. */
> + ++must_execute;
> + asm volatile("" : : :);
> + }
> + }
> + TIMING_NOW (stop);
> +
> + TIMING_DIFF (cur, start, stop);
> +
> + json_element_double (json_ctx, (double)cur / (double)iters);
> +}
> +
> +static void __attribute__ ((noinline, noclone))
> +do_one_rand_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s,
> + const CHAR *c)
> +{
> + size_t i, iters = INNER_LOOP_ITERS_LARGE;
> + timing_t start, stop, cur;
> + TIMING_NOW (start);
> + for (i = 0; i < iters; ++i)
> + {
> + CALL (impl, s, c[i % NUM_SEARCH_CHARS]);
> + }
> + TIMING_NOW (stop);
> +
> + TIMING_DIFF (cur, start, stop);
> +
> + json_element_double (json_ctx, (double)cur / (double)iters);
> +}
> +
> +static void
> +do_rand_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> + float perc_zero)
> +{
> + size_t i;
> + int perc_zero_int;
> + CHAR *buf = (CHAR *)buf1;
> + CHAR *c = (CHAR *)buf2;
> + align &= 127;
> + if ((align + len) * sizeof (CHAR) >= page_size)
> + return;
> +
> + /* Test is only interesting if we can hit both cases. */
> + if (pos >= len)
> + return;
> +
> + /* Segfault if we run the test. */
> + if (NUM_SEARCH_CHARS * sizeof (CHAR) > page_size)
> + return;
> +
> + for (i = 0; i < len; ++i)
> + {
> + buf[align + i] = 2;
> + }
> + buf[align + len] = 0;
> + buf[align + pos] = 1;
> +
> + perc_zero_int = perc_zero * RAND_MAX;
> + for (i = 0; i < NUM_SEARCH_CHARS; ++i)
> + {
> + if (rand () > perc_zero_int)
> + c[i] = 0;
> + else
> + c[i] = 1;
> + }
> + {
> + json_element_object_begin (json_ctx);
> + json_attr_uint (json_ctx, "rand", 1);
> + json_attr_uint (json_ctx, "branch", 1);
> + json_attr_double (json_ctx, "perc-zero", perc_zero);
> + json_attr_uint (json_ctx, "length", len);
> + json_attr_uint (json_ctx, "pos", pos);
> + json_attr_uint (json_ctx, "alignment", align);
> + json_array_begin (json_ctx, "timings");
> +
> + FOR_EACH_IMPL (impl, 0)
> + do_one_rand_plus_branch_test (json_ctx, impl, buf + align, c);
> +
> + json_array_end (json_ctx);
> + json_element_object_end (json_ctx);
> + }
> + {
> + json_element_object_begin (json_ctx);
> + json_attr_uint (json_ctx, "rand", 1);
> + json_attr_uint (json_ctx, "branch", 0);
> + json_attr_double (json_ctx, "perc-zero", perc_zero);
> + json_attr_uint (json_ctx, "length", len);
> + json_attr_uint (json_ctx, "pos", pos);
> + json_attr_uint (json_ctx, "alignment", align);
> + json_array_begin (json_ctx, "timings");
> +
> + FOR_EACH_IMPL (impl, 0)
> + do_one_rand_test (json_ctx, impl, buf + align, c);
> +
> + json_array_end (json_ctx);
> + json_element_object_end (json_ctx);
> + }
> +}
> +#endif
> +
> static void
> do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> const CHAR *exp_res)
> @@ -136,6 +268,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> result = NULLRET (buf + align + len);
>
> json_element_object_begin (json_ctx);
> + json_attr_uint (json_ctx, "rand", 0);
> json_attr_uint (json_ctx, "length", len);
> json_attr_uint (json_ctx, "pos", pos);
> json_attr_uint (json_ctx, "seek_char", seek_char);
> @@ -234,6 +367,16 @@ test_main (void)
> do_test (&json_ctx, 0, i, i + 1, 0, BIG_CHAR);
> }
>
> + DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.0);
> + DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.1);
> + DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.25);
> + DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.33);
> + DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.5);
> + DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.66);
> + DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.75);
> + DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.9);
> + DO_RAND_TEST(&json_ctx, 0, 15, 16, 1.0);
> +
> json_array_end (&json_ctx);
> json_attr_object_end (&json_ctx);
> json_attr_object_end (&json_ctx);
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch
2022-03-23 21:57 ` [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch Noah Goldstein
@ 2022-03-24 18:53 ` H.J. Lu
2022-03-24 19:20 ` Noah Goldstein
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:53 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Small code cleanup for size: -53 bytes.
>
> Add comment justifying using a branch to do NULL/non-null return.
Do you have followup patches to improve its performance? We are
backporting all x86-64 improvements to Intel release branches:
https://gitlab.com/x86-glibc/glibc/-/wikis/home
Patches without performance improvements are undesirable.
> All string/memory tests pass and no regressions in benchtests.
>
> geometric_mean(N=20) of all benchmarks Original / New: 1.00
> ---
> Geometric Mean N=20 runs; All functions page aligned
> length, alignment, pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
> 2048, 0, 32, 0, 23, 127, 1.033
> 2048, 1, 32, 0, 23, 127, 1.006
> 2048, 0, 64, 0, 23, 127, 1.02
> 2048, 2, 64, 0, 23, 127, 0.992
> 2048, 0, 128, 0, 23, 127, 0.996
> 2048, 3, 128, 0, 23, 127, 0.966
> 2048, 0, 256, 0, 23, 127, 0.996
> 2048, 4, 256, 0, 23, 127, 0.998
> 2048, 0, 512, 0, 23, 127, 0.991
> 2048, 5, 512, 0, 23, 127, 0.991
> 2048, 0, 1024, 0, 23, 127, 0.993
> 2048, 6, 1024, 0, 23, 127, 0.992
> 2048, 0, 2048, 0, 23, 127, 0.992
> 2048, 7, 2048, 0, 23, 127, 0.976
> 4096, 0, 32, 0, 23, 127, 0.983
> 4096, 1, 32, 0, 23, 127, 0.994
> 4096, 0, 64, 0, 23, 127, 0.968
> 4096, 2, 64, 0, 23, 127, 1.018
> 4096, 0, 128, 0, 23, 127, 0.99
> 4096, 3, 128, 0, 23, 127, 1.001
> 4096, 0, 256, 0, 23, 127, 1.0
> 4096, 4, 256, 0, 23, 127, 1.001
> 4096, 0, 512, 0, 23, 127, 0.989
> 4096, 5, 512, 0, 23, 127, 0.988
> 4096, 0, 1024, 0, 23, 127, 0.994
> 4096, 6, 1024, 0, 23, 127, 0.993
> 4096, 0, 2048, 0, 23, 127, 0.987
> 4096, 7, 2048, 0, 23, 127, 0.996
> 256, 1, 64, 0, 23, 127, 1.004
> 256, 2, 64, 0, 23, 127, 1.004
> 256, 3, 64, 0, 23, 127, 0.992
> 256, 4, 64, 0, 23, 127, 1.001
> 256, 5, 64, 0, 23, 127, 1.001
> 256, 6, 64, 0, 23, 127, 0.998
> 256, 7, 64, 0, 23, 127, 0.994
> 512, 0, 256, 0, 23, 127, 0.999
> 512, 16, 256, 0, 23, 127, 1.002
> 512, 32, 256, 0, 23, 127, 0.994
> 512, 48, 256, 0, 23, 127, 0.991
> 512, 64, 256, 0, 23, 127, 0.994
> 512, 80, 256, 0, 23, 127, 0.994
> 512, 96, 256, 0, 23, 127, 0.996
> 512, 112, 256, 0, 23, 127, 0.999
> 1, 0, 0, 0, 23, 127, 0.978
> 2, 0, 1, 0, 23, 127, 0.981
> 3, 0, 2, 0, 23, 127, 0.993
> 4, 0, 3, 0, 23, 127, 1.004
> 5, 0, 4, 0, 23, 127, 1.002
> 6, 0, 5, 0, 23, 127, 0.991
> 7, 0, 6, 0, 23, 127, 0.99
> 8, 0, 7, 0, 23, 127, 1.012
> 9, 0, 8, 0, 23, 127, 0.994
> 10, 0, 9, 0, 23, 127, 1.003
> 11, 0, 10, 0, 23, 127, 0.999
> 12, 0, 11, 0, 23, 127, 1.007
> 13, 0, 12, 0, 23, 127, 1.0
> 14, 0, 13, 0, 23, 127, 0.997
> 15, 0, 14, 0, 23, 127, 0.996
> 16, 0, 15, 0, 23, 127, 0.993
> 17, 0, 16, 0, 23, 127, 1.002
> 18, 0, 17, 0, 23, 127, 0.997
> 19, 0, 18, 0, 23, 127, 0.998
> 20, 0, 19, 0, 23, 127, 0.994
> 21, 0, 20, 0, 23, 127, 0.99
> 22, 0, 21, 0, 23, 127, 0.992
> 23, 0, 22, 0, 23, 127, 0.996
> 24, 0, 23, 0, 23, 127, 0.991
> 25, 0, 24, 0, 23, 127, 0.997
> 26, 0, 25, 0, 23, 127, 1.011
> 27, 0, 26, 0, 23, 127, 1.013
> 28, 0, 27, 0, 23, 127, 0.996
> 29, 0, 28, 0, 23, 127, 0.993
> 30, 0, 29, 0, 23, 127, 1.009
> 31, 0, 30, 0, 23, 127, 1.009
> 32, 0, 31, 0, 23, 127, 1.008
> 2048, 0, 32, 0, 0, 127, 1.0
> 2048, 1, 32, 0, 0, 127, 1.01
> 2048, 0, 64, 0, 0, 127, 0.997
> 2048, 2, 64, 0, 0, 127, 1.002
> 2048, 0, 128, 0, 0, 127, 0.986
> 2048, 3, 128, 0, 0, 127, 0.997
> 2048, 0, 256, 0, 0, 127, 1.002
> 2048, 4, 256, 0, 0, 127, 0.999
> 2048, 0, 512, 0, 0, 127, 0.991
> 2048, 5, 512, 0, 0, 127, 0.984
> 2048, 0, 1024, 0, 0, 127, 0.994
> 2048, 6, 1024, 0, 0, 127, 0.993
> 2048, 0, 2048, 0, 0, 127, 0.951
> 2048, 7, 2048, 0, 0, 127, 0.989
> 4096, 0, 32, 0, 0, 127, 0.993
> 4096, 1, 32, 0, 0, 127, 0.997
> 4096, 0, 64, 0, 0, 127, 1.004
> 4096, 2, 64, 0, 0, 127, 1.016
> 4096, 0, 128, 0, 0, 127, 0.973
> 4096, 3, 128, 0, 0, 127, 1.001
> 4096, 0, 256, 0, 0, 127, 0.999
> 4096, 4, 256, 0, 0, 127, 0.998
> 4096, 0, 512, 0, 0, 127, 0.99
> 4096, 5, 512, 0, 0, 127, 0.985
> 4096, 0, 1024, 0, 0, 127, 0.993
> 4096, 6, 1024, 0, 0, 127, 0.997
> 4096, 0, 2048, 0, 0, 127, 0.995
> 4096, 7, 2048, 0, 0, 127, 0.996
> 256, 1, 64, 0, 0, 127, 1.01
> 256, 2, 64, 0, 0, 127, 1.024
> 256, 3, 64, 0, 0, 127, 1.03
> 256, 4, 64, 0, 0, 127, 1.004
> 256, 5, 64, 0, 0, 127, 0.998
> 256, 6, 64, 0, 0, 127, 0.998
> 256, 7, 64, 0, 0, 127, 0.997
> 512, 0, 256, 0, 0, 127, 0.996
> 512, 16, 256, 0, 0, 127, 0.995
> 512, 32, 256, 0, 0, 127, 0.996
> 512, 48, 256, 0, 0, 127, 0.992
> 512, 64, 256, 0, 0, 127, 0.999
> 512, 80, 256, 0, 0, 127, 1.002
> 512, 96, 256, 0, 0, 127, 0.999
> 512, 112, 256, 0, 0, 127, 0.998
> 1, 0, 0, 0, 0, 127, 1.016
> 2, 0, 1, 0, 0, 127, 0.998
> 3, 0, 2, 0, 0, 127, 1.02
> 4, 0, 3, 0, 0, 127, 1.004
> 5, 0, 4, 0, 0, 127, 1.021
> 6, 0, 5, 0, 0, 127, 1.014
> 7, 0, 6, 0, 0, 127, 1.007
> 8, 0, 7, 0, 0, 127, 1.016
> 9, 0, 8, 0, 0, 127, 1.003
> 10, 0, 9, 0, 0, 127, 1.004
> 11, 0, 10, 0, 0, 127, 0.995
> 12, 0, 11, 0, 0, 127, 1.009
> 13, 0, 12, 0, 0, 127, 1.005
> 14, 0, 13, 0, 0, 127, 0.987
> 15, 0, 14, 0, 0, 127, 0.998
> 16, 0, 15, 0, 0, 127, 1.004
> 17, 0, 16, 0, 0, 127, 1.01
> 18, 0, 17, 0, 0, 127, 1.01
> 19, 0, 18, 0, 0, 127, 1.006
> 20, 0, 19, 0, 0, 127, 1.012
> 21, 0, 20, 0, 0, 127, 0.999
> 22, 0, 21, 0, 0, 127, 1.004
> 23, 0, 22, 0, 0, 127, 0.988
> 24, 0, 23, 0, 0, 127, 0.993
> 25, 0, 24, 0, 0, 127, 1.004
> 26, 0, 25, 0, 0, 127, 0.99
> 27, 0, 26, 0, 0, 127, 1.016
> 28, 0, 27, 0, 0, 127, 0.987
> 29, 0, 28, 0, 0, 127, 0.989
> 30, 0, 29, 0, 0, 127, 0.998
> 31, 0, 30, 0, 0, 127, 1.005
> 32, 0, 31, 0, 0, 127, 0.993
>
> 16, 0, 15, 1, 1, 0, 1.002
> 16, 0, 15, 1, 0, 0, 1.0
> 16, 0, 15, 1, 1, 0.1, 1.034
> 16, 0, 15, 1, 0, 0.1, 1.03
> 16, 0, 15, 1, 1, 0.25, 0.993
> 16, 0, 15, 1, 0, 0.25, 1.081
> 16, 0, 15, 1, 1, 0.33, 0.959
> 16, 0, 15, 1, 0, 0.33, 1.142
> 16, 0, 15, 1, 1, 0.5, 0.929
> 16, 0, 15, 1, 0, 0.5, 1.072
> 16, 0, 15, 1, 1, 0.66, 0.984
> 16, 0, 15, 1, 0, 0.66, 1.069
> 16, 0, 15, 1, 1, 0.75, 0.969
> 16, 0, 15, 1, 0, 0.75, 1.059
> 16, 0, 15, 1, 1, 0.9, 0.98
> 16, 0, 15, 1, 0, 0.9, 0.994
> 16, 0, 15, 1, 1, 1, 0.993
> 16, 0, 15, 1, 0, 1, 0.996
>
> sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------
> 1 file changed, 107 insertions(+), 97 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> index 086cabf76a..1a916cc951 100644
> --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> @@ -48,13 +48,13 @@
> # define PAGE_SIZE 4096
>
> .section SECTION(.text),"ax",@progbits
> -ENTRY (STRCHR)
> +ENTRY_P2ALIGN (STRCHR, 5)
> /* Broadcast CHAR to YMM0. */
> vmovd %esi, %xmm0
> movl %edi, %eax
> andl $(PAGE_SIZE - 1), %eax
> VPBROADCAST %xmm0, %ymm0
> - vpxor %xmm9, %xmm9, %xmm9
> + vpxor %xmm1, %xmm1, %xmm1
>
> /* Check if we cross page boundary with one vector load. */
> cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> @@ -62,37 +62,29 @@ ENTRY (STRCHR)
>
> /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> null byte. */
> - vmovdqu (%rdi), %ymm8
> - VPCMPEQ %ymm8, %ymm0, %ymm1
> - VPCMPEQ %ymm8, %ymm9, %ymm2
> - vpor %ymm1, %ymm2, %ymm1
> - vpmovmskb %ymm1, %eax
> + vmovdqu (%rdi), %ymm2
> + VPCMPEQ %ymm2, %ymm0, %ymm3
> + VPCMPEQ %ymm2, %ymm1, %ymm2
> + vpor %ymm3, %ymm2, %ymm3
> + vpmovmskb %ymm3, %eax
> testl %eax, %eax
> jz L(aligned_more)
> tzcntl %eax, %eax
> # ifndef USE_AS_STRCHRNUL
> - /* Found CHAR or the null byte. */
> - cmp (%rdi, %rax), %CHAR_REG
> - jne L(zero)
> -# endif
> - addq %rdi, %rax
> - VZEROUPPER_RETURN
> -
> - /* .p2align 5 helps keep performance more consistent if ENTRY()
> - alignment % 32 was either 16 or 0. As well this makes the
> - alignment % 32 of the loop_4x_vec fixed which makes tuning it
> - easier. */
> - .p2align 5
> -L(first_vec_x4):
> - tzcntl %eax, %eax
> - addq $(VEC_SIZE * 3 + 1), %rdi
> -# ifndef USE_AS_STRCHRNUL
> - /* Found CHAR or the null byte. */
> + /* Found CHAR or the null byte. */
> cmp (%rdi, %rax), %CHAR_REG
> + /* NB: Use a branch instead of cmovcc here. The expectation is
> + that with strchr the user will branch based on input being
> + null. Since this branch will be 100% predictive of the user
> + branch a branch miss here should save what otherwise would
> + be branch miss in the user code. Otherwise using a branch 1)
> + saves code size and 2) is faster in highly predictable
> + environments. */
> jne L(zero)
> # endif
> addq %rdi, %rax
> - VZEROUPPER_RETURN
> +L(return_vzeroupper):
> + ZERO_UPPER_VEC_REGISTERS_RETURN
>
> # ifndef USE_AS_STRCHRNUL
> L(zero):
> @@ -103,7 +95,8 @@ L(zero):
>
> .p2align 4
> L(first_vec_x1):
> - tzcntl %eax, %eax
> + /* Use bsf to save code size. */
> + bsfl %eax, %eax
> incq %rdi
> # ifndef USE_AS_STRCHRNUL
> /* Found CHAR or the null byte. */
> @@ -113,9 +106,10 @@ L(first_vec_x1):
> addq %rdi, %rax
> VZEROUPPER_RETURN
>
> - .p2align 4
> + .p2align 4,, 10
> L(first_vec_x2):
> - tzcntl %eax, %eax
> + /* Use bsf to save code size. */
> + bsfl %eax, %eax
> addq $(VEC_SIZE + 1), %rdi
> # ifndef USE_AS_STRCHRNUL
> /* Found CHAR or the null byte. */
> @@ -125,9 +119,10 @@ L(first_vec_x2):
> addq %rdi, %rax
> VZEROUPPER_RETURN
>
> - .p2align 4
> + .p2align 4,, 8
> L(first_vec_x3):
> - tzcntl %eax, %eax
> + /* Use bsf to save code size. */
> + bsfl %eax, %eax
> addq $(VEC_SIZE * 2 + 1), %rdi
> # ifndef USE_AS_STRCHRNUL
> /* Found CHAR or the null byte. */
> @@ -137,6 +132,21 @@ L(first_vec_x3):
> addq %rdi, %rax
> VZEROUPPER_RETURN
>
> + .p2align 4,, 10
> +L(first_vec_x4):
> + /* Use bsf to save code size. */
> + bsfl %eax, %eax
> + addq $(VEC_SIZE * 3 + 1), %rdi
> +# ifndef USE_AS_STRCHRNUL
> + /* Found CHAR or the null byte. */
> + cmp (%rdi, %rax), %CHAR_REG
> + jne L(zero)
> +# endif
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
> +
> +
> +
> .p2align 4
> L(aligned_more):
> /* Align data to VEC_SIZE - 1. This is the same number of
> @@ -146,90 +156,92 @@ L(aligned_more):
> L(cross_page_continue):
> /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> since data is only aligned to VEC_SIZE. */
> - vmovdqa 1(%rdi), %ymm8
> - VPCMPEQ %ymm8, %ymm0, %ymm1
> - VPCMPEQ %ymm8, %ymm9, %ymm2
> - vpor %ymm1, %ymm2, %ymm1
> - vpmovmskb %ymm1, %eax
> + vmovdqa 1(%rdi), %ymm2
> + VPCMPEQ %ymm2, %ymm0, %ymm3
> + VPCMPEQ %ymm2, %ymm1, %ymm2
> + vpor %ymm3, %ymm2, %ymm3
> + vpmovmskb %ymm3, %eax
> testl %eax, %eax
> jnz L(first_vec_x1)
>
> - vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
> - VPCMPEQ %ymm8, %ymm0, %ymm1
> - VPCMPEQ %ymm8, %ymm9, %ymm2
> - vpor %ymm1, %ymm2, %ymm1
> - vpmovmskb %ymm1, %eax
> + vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2
> + VPCMPEQ %ymm2, %ymm0, %ymm3
> + VPCMPEQ %ymm2, %ymm1, %ymm2
> + vpor %ymm3, %ymm2, %ymm3
> + vpmovmskb %ymm3, %eax
> testl %eax, %eax
> jnz L(first_vec_x2)
>
> - vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
> - VPCMPEQ %ymm8, %ymm0, %ymm1
> - VPCMPEQ %ymm8, %ymm9, %ymm2
> - vpor %ymm1, %ymm2, %ymm1
> - vpmovmskb %ymm1, %eax
> + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2
> + VPCMPEQ %ymm2, %ymm0, %ymm3
> + VPCMPEQ %ymm2, %ymm1, %ymm2
> + vpor %ymm3, %ymm2, %ymm3
> + vpmovmskb %ymm3, %eax
> testl %eax, %eax
> jnz L(first_vec_x3)
>
> - vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
> - VPCMPEQ %ymm8, %ymm0, %ymm1
> - VPCMPEQ %ymm8, %ymm9, %ymm2
> - vpor %ymm1, %ymm2, %ymm1
> - vpmovmskb %ymm1, %eax
> + vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2
> + VPCMPEQ %ymm2, %ymm0, %ymm3
> + VPCMPEQ %ymm2, %ymm1, %ymm2
> + vpor %ymm3, %ymm2, %ymm3
> + vpmovmskb %ymm3, %eax
> testl %eax, %eax
> jnz L(first_vec_x4)
> - /* Align data to VEC_SIZE * 4 - 1. */
> - addq $(VEC_SIZE * 4 + 1), %rdi
> - andq $-(VEC_SIZE * 4), %rdi
> + /* Align data to VEC_SIZE * 4 - 1. */
> + incq %rdi
> + orq $(VEC_SIZE * 4 - 1), %rdi
> .p2align 4
> L(loop_4x_vec):
> /* Compare 4 * VEC at a time forward. */
> - vmovdqa (%rdi), %ymm5
> - vmovdqa (VEC_SIZE)(%rdi), %ymm6
> - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> + vmovdqa 1(%rdi), %ymm6
> + vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7
>
> /* Leaves only CHARS matching esi as 0. */
> - vpxor %ymm5, %ymm0, %ymm1
> vpxor %ymm6, %ymm0, %ymm2
> vpxor %ymm7, %ymm0, %ymm3
> - vpxor %ymm8, %ymm0, %ymm4
>
> - VPMINU %ymm1, %ymm5, %ymm1
> VPMINU %ymm2, %ymm6, %ymm2
> VPMINU %ymm3, %ymm7, %ymm3
> - VPMINU %ymm4, %ymm8, %ymm4
>
> - VPMINU %ymm1, %ymm2, %ymm5
> - VPMINU %ymm3, %ymm4, %ymm6
> + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6
> + vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7
> +
> + vpxor %ymm6, %ymm0, %ymm4
> + vpxor %ymm7, %ymm0, %ymm5
> +
> + VPMINU %ymm4, %ymm6, %ymm4
> + VPMINU %ymm5, %ymm7, %ymm5
>
> - VPMINU %ymm5, %ymm6, %ymm6
> + VPMINU %ymm2, %ymm3, %ymm6
> + VPMINU %ymm4, %ymm5, %ymm7
>
> - VPCMPEQ %ymm6, %ymm9, %ymm6
> - vpmovmskb %ymm6, %ecx
> + VPMINU %ymm6, %ymm7, %ymm7
> +
> + VPCMPEQ %ymm7, %ymm1, %ymm7
> + vpmovmskb %ymm7, %ecx
> subq $-(VEC_SIZE * 4), %rdi
> testl %ecx, %ecx
> jz L(loop_4x_vec)
>
> -
> - VPCMPEQ %ymm1, %ymm9, %ymm1
> - vpmovmskb %ymm1, %eax
> + VPCMPEQ %ymm2, %ymm1, %ymm2
> + vpmovmskb %ymm2, %eax
> testl %eax, %eax
> jnz L(last_vec_x0)
>
>
> - VPCMPEQ %ymm5, %ymm9, %ymm2
> - vpmovmskb %ymm2, %eax
> + VPCMPEQ %ymm3, %ymm1, %ymm3
> + vpmovmskb %ymm3, %eax
> testl %eax, %eax
> jnz L(last_vec_x1)
>
> - VPCMPEQ %ymm3, %ymm9, %ymm3
> - vpmovmskb %ymm3, %eax
> + VPCMPEQ %ymm4, %ymm1, %ymm4
> + vpmovmskb %ymm4, %eax
> /* rcx has combined result from all 4 VEC. It will only be used
> if the first 3 other VEC all did not contain a match. */
> salq $32, %rcx
> orq %rcx, %rax
> tzcntq %rax, %rax
> - subq $(VEC_SIZE * 2), %rdi
> + subq $(VEC_SIZE * 2 - 1), %rdi
> # ifndef USE_AS_STRCHRNUL
> /* Found CHAR or the null byte. */
> cmp (%rdi, %rax), %CHAR_REG
> @@ -239,10 +251,11 @@ L(loop_4x_vec):
> VZEROUPPER_RETURN
>
>
> - .p2align 4
> + .p2align 4,, 10
> L(last_vec_x0):
> - tzcntl %eax, %eax
> - addq $-(VEC_SIZE * 4), %rdi
> + /* Use bsf to save code size. */
> + bsfl %eax, %eax
> + addq $-(VEC_SIZE * 4 - 1), %rdi
> # ifndef USE_AS_STRCHRNUL
> /* Found CHAR or the null byte. */
> cmp (%rdi, %rax), %CHAR_REG
> @@ -251,16 +264,11 @@ L(last_vec_x0):
> addq %rdi, %rax
> VZEROUPPER_RETURN
>
> -# ifndef USE_AS_STRCHRNUL
> -L(zero_end):
> - xorl %eax, %eax
> - VZEROUPPER_RETURN
> -# endif
>
> - .p2align 4
> + .p2align 4,, 10
> L(last_vec_x1):
> tzcntl %eax, %eax
> - subq $(VEC_SIZE * 3), %rdi
> + subq $(VEC_SIZE * 3 - 1), %rdi
> # ifndef USE_AS_STRCHRNUL
> /* Found CHAR or the null byte. */
> cmp (%rdi, %rax), %CHAR_REG
> @@ -269,18 +277,23 @@ L(last_vec_x1):
> addq %rdi, %rax
> VZEROUPPER_RETURN
>
> +# ifndef USE_AS_STRCHRNUL
> +L(zero_end):
> + xorl %eax, %eax
> + VZEROUPPER_RETURN
> +# endif
>
> /* Cold case for crossing page with first load. */
> - .p2align 4
> + .p2align 4,, 8
> L(cross_page_boundary):
> movq %rdi, %rdx
> /* Align rdi to VEC_SIZE - 1. */
> orq $(VEC_SIZE - 1), %rdi
> - vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
> - VPCMPEQ %ymm8, %ymm0, %ymm1
> - VPCMPEQ %ymm8, %ymm9, %ymm2
> - vpor %ymm1, %ymm2, %ymm1
> - vpmovmskb %ymm1, %eax
> + vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2
> + VPCMPEQ %ymm2, %ymm0, %ymm3
> + VPCMPEQ %ymm2, %ymm1, %ymm2
> + vpor %ymm3, %ymm2, %ymm3
> + vpmovmskb %ymm3, %eax
> /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> so no need to manually mod edx. */
> sarxl %edx, %eax, %eax
> @@ -291,13 +304,10 @@ L(cross_page_boundary):
> xorl %ecx, %ecx
> /* Found CHAR or the null byte. */
> cmp (%rdx, %rax), %CHAR_REG
> - leaq (%rdx, %rax), %rax
> - cmovne %rcx, %rax
> -# else
> - addq %rdx, %rax
> + jne L(zero_end)
> # endif
> -L(return_vzeroupper):
> - ZERO_UPPER_VEC_REGISTERS_RETURN
> + addq %rdx, %rax
> + VZEROUPPER_RETURN
>
> END (STRCHR)
> -# endif
> +#endif
> --
> 2.25.1
>
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 04/23] x86: Code cleanup in strchr-evex and comment justifying branch
2022-03-23 21:57 ` [PATCH v1 04/23] x86: Code cleanup in strchr-evex " Noah Goldstein
@ 2022-03-24 18:54 ` H.J. Lu
2022-05-12 19:32 ` Sunil Pandey
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:54 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Small code cleanup for size: -81 bytes.
>
> Add comment justifying using a branch to do NULL/non-null return.
>
> All string/memory tests pass and no regressions in benchtests.
>
> geometric_mean(N=20) of all benchmarks New / Original: .985
> ---
> Geometric Mean N=20 runs; All functions page aligned
> length, alignment, pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
> 2048, 0, 32, 0, 23, 127, 0.878
> 2048, 1, 32, 0, 23, 127, 0.88
> 2048, 0, 64, 0, 23, 127, 0.997
> 2048, 2, 64, 0, 23, 127, 1.001
> 2048, 0, 128, 0, 23, 127, 0.973
> 2048, 3, 128, 0, 23, 127, 0.971
> 2048, 0, 256, 0, 23, 127, 0.976
> 2048, 4, 256, 0, 23, 127, 0.973
> 2048, 0, 512, 0, 23, 127, 1.001
> 2048, 5, 512, 0, 23, 127, 1.004
> 2048, 0, 1024, 0, 23, 127, 1.005
> 2048, 6, 1024, 0, 23, 127, 1.007
> 2048, 0, 2048, 0, 23, 127, 1.035
> 2048, 7, 2048, 0, 23, 127, 1.03
> 4096, 0, 32, 0, 23, 127, 0.889
> 4096, 1, 32, 0, 23, 127, 0.891
> 4096, 0, 64, 0, 23, 127, 1.012
> 4096, 2, 64, 0, 23, 127, 1.017
> 4096, 0, 128, 0, 23, 127, 0.975
> 4096, 3, 128, 0, 23, 127, 0.974
> 4096, 0, 256, 0, 23, 127, 0.974
> 4096, 4, 256, 0, 23, 127, 0.972
> 4096, 0, 512, 0, 23, 127, 1.002
> 4096, 5, 512, 0, 23, 127, 1.016
> 4096, 0, 1024, 0, 23, 127, 1.009
> 4096, 6, 1024, 0, 23, 127, 1.008
> 4096, 0, 2048, 0, 23, 127, 1.003
> 4096, 7, 2048, 0, 23, 127, 1.004
> 256, 1, 64, 0, 23, 127, 0.993
> 256, 2, 64, 0, 23, 127, 0.999
> 256, 3, 64, 0, 23, 127, 0.992
> 256, 4, 64, 0, 23, 127, 0.99
> 256, 5, 64, 0, 23, 127, 0.99
> 256, 6, 64, 0, 23, 127, 0.994
> 256, 7, 64, 0, 23, 127, 0.991
> 512, 0, 256, 0, 23, 127, 0.971
> 512, 16, 256, 0, 23, 127, 0.971
> 512, 32, 256, 0, 23, 127, 1.005
> 512, 48, 256, 0, 23, 127, 0.998
> 512, 64, 256, 0, 23, 127, 1.001
> 512, 80, 256, 0, 23, 127, 1.002
> 512, 96, 256, 0, 23, 127, 1.005
> 512, 112, 256, 0, 23, 127, 1.012
> 1, 0, 0, 0, 23, 127, 1.024
> 2, 0, 1, 0, 23, 127, 0.991
> 3, 0, 2, 0, 23, 127, 0.997
> 4, 0, 3, 0, 23, 127, 0.984
> 5, 0, 4, 0, 23, 127, 0.993
> 6, 0, 5, 0, 23, 127, 0.985
> 7, 0, 6, 0, 23, 127, 0.979
> 8, 0, 7, 0, 23, 127, 0.975
> 9, 0, 8, 0, 23, 127, 0.965
> 10, 0, 9, 0, 23, 127, 0.957
> 11, 0, 10, 0, 23, 127, 0.979
> 12, 0, 11, 0, 23, 127, 0.987
> 13, 0, 12, 0, 23, 127, 1.023
> 14, 0, 13, 0, 23, 127, 0.997
> 15, 0, 14, 0, 23, 127, 0.983
> 16, 0, 15, 0, 23, 127, 0.987
> 17, 0, 16, 0, 23, 127, 0.993
> 18, 0, 17, 0, 23, 127, 0.985
> 19, 0, 18, 0, 23, 127, 0.999
> 20, 0, 19, 0, 23, 127, 0.998
> 21, 0, 20, 0, 23, 127, 0.983
> 22, 0, 21, 0, 23, 127, 0.983
> 23, 0, 22, 0, 23, 127, 1.002
> 24, 0, 23, 0, 23, 127, 1.0
> 25, 0, 24, 0, 23, 127, 1.002
> 26, 0, 25, 0, 23, 127, 0.984
> 27, 0, 26, 0, 23, 127, 0.994
> 28, 0, 27, 0, 23, 127, 0.995
> 29, 0, 28, 0, 23, 127, 1.017
> 30, 0, 29, 0, 23, 127, 1.009
> 31, 0, 30, 0, 23, 127, 1.001
> 32, 0, 31, 0, 23, 127, 1.021
> 2048, 0, 32, 0, 0, 127, 0.899
> 2048, 1, 32, 0, 0, 127, 0.93
> 2048, 0, 64, 0, 0, 127, 1.009
> 2048, 2, 64, 0, 0, 127, 1.023
> 2048, 0, 128, 0, 0, 127, 0.973
> 2048, 3, 128, 0, 0, 127, 0.975
> 2048, 0, 256, 0, 0, 127, 0.974
> 2048, 4, 256, 0, 0, 127, 0.97
> 2048, 0, 512, 0, 0, 127, 0.999
> 2048, 5, 512, 0, 0, 127, 1.004
> 2048, 0, 1024, 0, 0, 127, 1.008
> 2048, 6, 1024, 0, 0, 127, 1.008
> 2048, 0, 2048, 0, 0, 127, 0.996
> 2048, 7, 2048, 0, 0, 127, 1.002
> 4096, 0, 32, 0, 0, 127, 0.872
> 4096, 1, 32, 0, 0, 127, 0.881
> 4096, 0, 64, 0, 0, 127, 1.006
> 4096, 2, 64, 0, 0, 127, 1.005
> 4096, 0, 128, 0, 0, 127, 0.973
> 4096, 3, 128, 0, 0, 127, 0.974
> 4096, 0, 256, 0, 0, 127, 0.969
> 4096, 4, 256, 0, 0, 127, 0.971
> 4096, 0, 512, 0, 0, 127, 1.0
> 4096, 5, 512, 0, 0, 127, 1.005
> 4096, 0, 1024, 0, 0, 127, 1.007
> 4096, 6, 1024, 0, 0, 127, 1.009
> 4096, 0, 2048, 0, 0, 127, 1.005
> 4096, 7, 2048, 0, 0, 127, 1.007
> 256, 1, 64, 0, 0, 127, 0.994
> 256, 2, 64, 0, 0, 127, 1.008
> 256, 3, 64, 0, 0, 127, 1.019
> 256, 4, 64, 0, 0, 127, 0.991
> 256, 5, 64, 0, 0, 127, 0.992
> 256, 6, 64, 0, 0, 127, 0.991
> 256, 7, 64, 0, 0, 127, 0.988
> 512, 0, 256, 0, 0, 127, 0.971
> 512, 16, 256, 0, 0, 127, 0.967
> 512, 32, 256, 0, 0, 127, 1.005
> 512, 48, 256, 0, 0, 127, 1.001
> 512, 64, 256, 0, 0, 127, 1.009
> 512, 80, 256, 0, 0, 127, 1.008
> 512, 96, 256, 0, 0, 127, 1.009
> 512, 112, 256, 0, 0, 127, 1.016
> 1, 0, 0, 0, 0, 127, 1.038
> 2, 0, 1, 0, 0, 127, 1.009
> 3, 0, 2, 0, 0, 127, 0.992
> 4, 0, 3, 0, 0, 127, 1.004
> 5, 0, 4, 0, 0, 127, 0.966
> 6, 0, 5, 0, 0, 127, 0.968
> 7, 0, 6, 0, 0, 127, 1.004
> 8, 0, 7, 0, 0, 127, 0.99
> 9, 0, 8, 0, 0, 127, 0.958
> 10, 0, 9, 0, 0, 127, 0.96
> 11, 0, 10, 0, 0, 127, 0.948
> 12, 0, 11, 0, 0, 127, 0.984
> 13, 0, 12, 0, 0, 127, 0.967
> 14, 0, 13, 0, 0, 127, 0.993
> 15, 0, 14, 0, 0, 127, 0.991
> 16, 0, 15, 0, 0, 127, 1.0
> 17, 0, 16, 0, 0, 127, 0.982
> 18, 0, 17, 0, 0, 127, 0.977
> 19, 0, 18, 0, 0, 127, 0.987
> 20, 0, 19, 0, 0, 127, 0.978
> 21, 0, 20, 0, 0, 127, 1.0
> 22, 0, 21, 0, 0, 127, 0.99
> 23, 0, 22, 0, 0, 127, 0.988
> 24, 0, 23, 0, 0, 127, 0.997
> 25, 0, 24, 0, 0, 127, 1.003
> 26, 0, 25, 0, 0, 127, 1.004
> 27, 0, 26, 0, 0, 127, 0.982
> 28, 0, 27, 0, 0, 127, 0.972
> 29, 0, 28, 0, 0, 127, 0.978
> 30, 0, 29, 0, 0, 127, 0.992
> 31, 0, 30, 0, 0, 127, 0.986
> 32, 0, 31, 0, 0, 127, 1.0
>
> 16, 0, 15, 1, 1, 0, 0.997
> 16, 0, 15, 1, 0, 0, 1.001
> 16, 0, 15, 1, 1, 0.1, 0.984
> 16, 0, 15, 1, 0, 0.1, 0.999
> 16, 0, 15, 1, 1, 0.25, 0.929
> 16, 0, 15, 1, 0, 0.25, 1.001
> 16, 0, 15, 1, 1, 0.33, 0.892
> 16, 0, 15, 1, 0, 0.33, 0.996
> 16, 0, 15, 1, 1, 0.5, 0.897
> 16, 0, 15, 1, 0, 0.5, 1.009
> 16, 0, 15, 1, 1, 0.66, 0.882
> 16, 0, 15, 1, 0, 0.66, 0.967
> 16, 0, 15, 1, 1, 0.75, 0.919
> 16, 0, 15, 1, 0, 0.75, 1.027
> 16, 0, 15, 1, 1, 0.9, 0.949
> 16, 0, 15, 1, 0, 0.9, 1.021
> 16, 0, 15, 1, 1, 1, 0.998
> 16, 0, 15, 1, 0, 1, 0.999
>
> sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++-----------
> 1 file changed, 80 insertions(+), 66 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
> index f62cd9d144..ec739fb8f9 100644
> --- a/sysdeps/x86_64/multiarch/strchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/strchr-evex.S
> @@ -30,6 +30,7 @@
> # ifdef USE_AS_WCSCHR
> # define VPBROADCAST vpbroadcastd
> # define VPCMP vpcmpd
> +# define VPTESTN vptestnmd
> # define VPMINU vpminud
> # define CHAR_REG esi
> # define SHIFT_REG ecx
> @@ -37,6 +38,7 @@
> # else
> # define VPBROADCAST vpbroadcastb
> # define VPCMP vpcmpb
> +# define VPTESTN vptestnmb
> # define VPMINU vpminub
> # define CHAR_REG sil
> # define SHIFT_REG edx
> @@ -61,13 +63,11 @@
> # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
>
> .section .text.evex,"ax",@progbits
> -ENTRY (STRCHR)
> +ENTRY_P2ALIGN (STRCHR, 5)
> /* Broadcast CHAR to YMM0. */
> VPBROADCAST %esi, %YMM0
> movl %edi, %eax
> andl $(PAGE_SIZE - 1), %eax
> - vpxorq %XMMZERO, %XMMZERO, %XMMZERO
> -
> /* Check if we cross page boundary with one vector load.
> Otherwise it is safe to use an unaligned load. */
> cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> @@ -81,49 +81,35 @@ ENTRY (STRCHR)
> vpxorq %YMM1, %YMM0, %YMM2
> VPMINU %YMM2, %YMM1, %YMM2
> /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM2, %k0
> + VPTESTN %YMM2, %YMM2, %k0
> kmovd %k0, %eax
> testl %eax, %eax
> jz L(aligned_more)
> tzcntl %eax, %eax
> +# ifndef USE_AS_STRCHRNUL
> + /* Found CHAR or the null byte. */
> + cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
> + /* NB: Use a branch instead of cmovcc here. The expectation is
> + that with strchr the user will branch based on input being
> + null. Since this branch will be 100% predictive of the user
> + branch a branch miss here should save what otherwise would
> + be branch miss in the user code. Otherwise using a branch 1)
> + saves code size and 2) is faster in highly predictable
> + environments. */
> + jne L(zero)
> +# endif
> # ifdef USE_AS_WCSCHR
> /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> */
> leaq (%rdi, %rax, CHAR_SIZE), %rax
> # else
> addq %rdi, %rax
> -# endif
> -# ifndef USE_AS_STRCHRNUL
> - /* Found CHAR or the null byte. */
> - cmp (%rax), %CHAR_REG
> - jne L(zero)
> # endif
> ret
>
> - /* .p2align 5 helps keep performance more consistent if ENTRY()
> - alignment % 32 was either 16 or 0. As well this makes the
> - alignment % 32 of the loop_4x_vec fixed which makes tuning it
> - easier. */
> - .p2align 5
> -L(first_vec_x3):
> - tzcntl %eax, %eax
> -# ifndef USE_AS_STRCHRNUL
> - /* Found CHAR or the null byte. */
> - cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> - jne L(zero)
> -# endif
> - /* NB: Multiply sizeof char type (1 or 4) to get the number of
> - bytes. */
> - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> - ret
>
> -# ifndef USE_AS_STRCHRNUL
> -L(zero):
> - xorl %eax, %eax
> - ret
> -# endif
>
> - .p2align 4
> + .p2align 4,, 10
> L(first_vec_x4):
> # ifndef USE_AS_STRCHRNUL
> /* Check to see if first match was CHAR (k0) or null (k1). */
> @@ -144,9 +130,18 @@ L(first_vec_x4):
> leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> ret
>
> +# ifndef USE_AS_STRCHRNUL
> +L(zero):
> + xorl %eax, %eax
> + ret
> +# endif
> +
> +
> .p2align 4
> L(first_vec_x1):
> - tzcntl %eax, %eax
> + /* Use bsf here to save 1-byte keeping keeping the block in 1x
> + fetch block. eax guranteed non-zero. */
> + bsfl %eax, %eax
> # ifndef USE_AS_STRCHRNUL
> /* Found CHAR or the null byte. */
> cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> @@ -158,7 +153,7 @@ L(first_vec_x1):
> leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> ret
>
> - .p2align 4
> + .p2align 4,, 10
> L(first_vec_x2):
> # ifndef USE_AS_STRCHRNUL
> /* Check to see if first match was CHAR (k0) or null (k1). */
> @@ -179,6 +174,21 @@ L(first_vec_x2):
> leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> ret
>
> + .p2align 4,, 10
> +L(first_vec_x3):
> + /* Use bsf here to save 1-byte keeping keeping the block in 1x
> + fetch block. eax guranteed non-zero. */
> + bsfl %eax, %eax
> +# ifndef USE_AS_STRCHRNUL
> + /* Found CHAR or the null byte. */
> + cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> + jne L(zero)
> +# endif
> + /* NB: Multiply sizeof char type (1 or 4) to get the number of
> + bytes. */
> + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
> +
> .p2align 4
> L(aligned_more):
> /* Align data to VEC_SIZE. */
> @@ -195,7 +205,7 @@ L(cross_page_continue):
> vpxorq %YMM1, %YMM0, %YMM2
> VPMINU %YMM2, %YMM1, %YMM2
> /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM2, %k0
> + VPTESTN %YMM2, %YMM2, %k0
> kmovd %k0, %eax
> testl %eax, %eax
> jnz L(first_vec_x1)
> @@ -206,7 +216,7 @@ L(cross_page_continue):
> /* Each bit in K0 represents a CHAR in YMM1. */
> VPCMP $0, %YMM1, %YMM0, %k0
> /* Each bit in K1 represents a CHAR in YMM1. */
> - VPCMP $0, %YMM1, %YMMZERO, %k1
> + VPTESTN %YMM1, %YMM1, %k1
> kortestd %k0, %k1
> jnz L(first_vec_x2)
>
> @@ -215,7 +225,7 @@ L(cross_page_continue):
> vpxorq %YMM1, %YMM0, %YMM2
> VPMINU %YMM2, %YMM1, %YMM2
> /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM2, %k0
> + VPTESTN %YMM2, %YMM2, %k0
> kmovd %k0, %eax
> testl %eax, %eax
> jnz L(first_vec_x3)
> @@ -224,7 +234,7 @@ L(cross_page_continue):
> /* Each bit in K0 represents a CHAR in YMM1. */
> VPCMP $0, %YMM1, %YMM0, %k0
> /* Each bit in K1 represents a CHAR in YMM1. */
> - VPCMP $0, %YMM1, %YMMZERO, %k1
> + VPTESTN %YMM1, %YMM1, %k1
> kortestd %k0, %k1
> jnz L(first_vec_x4)
>
> @@ -265,33 +275,33 @@ L(loop_4x_vec):
> VPMINU %YMM3, %YMM4, %YMM4
> VPMINU %YMM2, %YMM4, %YMM4{%k4}{z}
>
> - VPCMP $0, %YMMZERO, %YMM4, %k1
> + VPTESTN %YMM4, %YMM4, %k1
> kmovd %k1, %ecx
> subq $-(VEC_SIZE * 4), %rdi
> testl %ecx, %ecx
> jz L(loop_4x_vec)
>
> - VPCMP $0, %YMMZERO, %YMM1, %k0
> + VPTESTN %YMM1, %YMM1, %k0
> kmovd %k0, %eax
> testl %eax, %eax
> jnz L(last_vec_x1)
>
> - VPCMP $0, %YMMZERO, %YMM2, %k0
> + VPTESTN %YMM2, %YMM2, %k0
> kmovd %k0, %eax
> testl %eax, %eax
> jnz L(last_vec_x2)
>
> - VPCMP $0, %YMMZERO, %YMM3, %k0
> + VPTESTN %YMM3, %YMM3, %k0
> kmovd %k0, %eax
> /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */
> # ifdef USE_AS_WCSCHR
> sall $8, %ecx
> orl %ecx, %eax
> - tzcntl %eax, %eax
> + bsfl %eax, %eax
> # else
> salq $32, %rcx
> orq %rcx, %rax
> - tzcntq %rax, %rax
> + bsfq %rax, %rax
> # endif
> # ifndef USE_AS_STRCHRNUL
> /* Check if match was CHAR or null. */
> @@ -303,28 +313,28 @@ L(loop_4x_vec):
> leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> ret
>
> -# ifndef USE_AS_STRCHRNUL
> -L(zero_end):
> - xorl %eax, %eax
> - ret
> + .p2align 4,, 8
> +L(last_vec_x1):
> + bsfl %eax, %eax
> +# ifdef USE_AS_WCSCHR
> + /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> + */
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> + addq %rdi, %rax
> # endif
>
> - .p2align 4
> -L(last_vec_x1):
> - tzcntl %eax, %eax
> # ifndef USE_AS_STRCHRNUL
> /* Check if match was null. */
> - cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
> + cmp (%rax), %CHAR_REG
> jne L(zero_end)
> # endif
> - /* NB: Multiply sizeof char type (1 or 4) to get the number of
> - bytes. */
> - leaq (%rdi, %rax, CHAR_SIZE), %rax
> +
> ret
>
> - .p2align 4
> + .p2align 4,, 8
> L(last_vec_x2):
> - tzcntl %eax, %eax
> + bsfl %eax, %eax
> # ifndef USE_AS_STRCHRNUL
> /* Check if match was null. */
> cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> @@ -336,7 +346,7 @@ L(last_vec_x2):
> ret
>
> /* Cold case for crossing page with first load. */
> - .p2align 4
> + .p2align 4,, 8
> L(cross_page_boundary):
> movq %rdi, %rdx
> /* Align rdi. */
> @@ -346,9 +356,9 @@ L(cross_page_boundary):
> vpxorq %YMM1, %YMM0, %YMM2
> VPMINU %YMM2, %YMM1, %YMM2
> /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM2, %k0
> + VPTESTN %YMM2, %YMM2, %k0
> kmovd %k0, %eax
> - /* Remove the leading bits. */
> + /* Remove the leading bits. */
> # ifdef USE_AS_WCSCHR
> movl %edx, %SHIFT_REG
> /* NB: Divide shift count by 4 since each bit in K1 represent 4
> @@ -360,20 +370,24 @@ L(cross_page_boundary):
> /* If eax is zero continue. */
> testl %eax, %eax
> jz L(cross_page_continue)
> - tzcntl %eax, %eax
> -# ifndef USE_AS_STRCHRNUL
> - /* Check to see if match was CHAR or null. */
> - cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG
> - jne L(zero_end)
> -# endif
> + bsfl %eax, %eax
> +
> # ifdef USE_AS_WCSCHR
> /* NB: Multiply wchar_t count by 4 to get the number of
> bytes. */
> leaq (%rdx, %rax, CHAR_SIZE), %rax
> # else
> addq %rdx, %rax
> +# endif
> +# ifndef USE_AS_STRCHRNUL
> + /* Check to see if match was CHAR or null. */
> + cmp (%rax), %CHAR_REG
> + je L(cross_page_ret)
> +L(zero_end):
> + xorl %eax, %eax
> +L(cross_page_ret):
> # endif
> ret
>
> END (STRCHR)
> -# endif
> +#endif
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c
2022-03-23 21:57 ` [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c Noah Goldstein
@ 2022-03-24 18:54 ` H.J. Lu
0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:54 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Just QOL change to make parsing the output of the benchtests more
> consistent.
> ---
> benchtests/bench-strpbrk.c | 81 ++++++++++++++++++++++++++++----------
> 1 file changed, 61 insertions(+), 20 deletions(-)
>
> diff --git a/benchtests/bench-strpbrk.c b/benchtests/bench-strpbrk.c
> index d46bf9c0e2..a7522a76e6 100644
> --- a/benchtests/bench-strpbrk.c
> +++ b/benchtests/bench-strpbrk.c
> @@ -62,11 +62,14 @@ SIMPLE_STRPBRK (const CHAR *s, const CHAR *rej)
>
> #endif /* !STRPBRK_RESULT */
>
> +#include "json-lib.h"
> +
> static void
> -do_one_test (impl_t *impl, const CHAR *s, const CHAR *rej, RES_TYPE exp_res)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s,
> + const CHAR *rej, RES_TYPE exp_res)
> {
> RES_TYPE res = CALL (impl, s, rej);
> - size_t i, iters = INNER_LOOP_ITERS_MEDIUM;
> + size_t i, iters = INNER_LOOP_ITERS;
> timing_t start, stop, cur;
>
> if (res != exp_res)
> @@ -86,23 +89,26 @@ do_one_test (impl_t *impl, const CHAR *s, const CHAR *rej, RES_TYPE exp_res)
>
> TIMING_DIFF (cur, start, stop);
>
> - TIMING_PRINT_MEAN ((double) cur, (double) iters);
> + json_element_double (json_ctx, (double)cur / (double)iters);
> }
>
> static void
> -do_test (size_t align, size_t pos, size_t len)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t pos,
> + size_t len)
> {
> size_t i;
> int c;
> RES_TYPE result;
> CHAR *rej, *s;
>
> - align &= 7;
> - if ((align + pos + 10) * sizeof (CHAR) >= page_size || len > 240)
> + align1 &= 7;
> + if ((align1 + pos + 10) * sizeof (CHAR) >= page_size || len > 240)
> + return;
> + if ((align2 + len) * sizeof (CHAR) >= page_size)
> return;
>
> - rej = (CHAR *) (buf2) + (random () & 255);
> - s = (CHAR *) (buf1) + align;
> + rej = (CHAR *) (buf2) + align2;
> + s = (CHAR *) (buf1) + align1;
>
> for (i = 0; i < len; ++i)
> {
> @@ -136,43 +142,78 @@ do_test (size_t align, size_t pos, size_t len)
> }
> result = STRPBRK_RESULT (s, pos);
>
> - printf ("Length %4zd, alignment %2zd, rej len %2zd:", pos, align, len);
> + json_element_object_begin (json_ctx);
> + json_attr_uint (json_ctx, "len", len);
> + json_attr_uint (json_ctx, "pos", pos);
> + json_attr_uint (json_ctx, "align1", align1);
> + json_attr_uint (json_ctx, "align2", align2);
> + json_array_begin (json_ctx, "timings");
>
> FOR_EACH_IMPL (impl, 0)
> - do_one_test (impl, s, rej, result);
> + do_one_test (json_ctx, impl, s, rej, result);
>
> - putchar ('\n');
> + json_array_end (json_ctx);
> + json_element_object_end (json_ctx);
> }
>
> int
> test_main (void)
> {
> + json_ctx_t json_ctx;
> size_t i;
>
> test_init ();
>
> - printf ("%32s", "");
> + json_init (&json_ctx, 0, stdout);
> +
> + json_document_begin (&json_ctx);
> + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> + json_attr_object_begin (&json_ctx, "functions");
> + json_attr_object_begin (&json_ctx, TEST_NAME);
> + json_attr_string (&json_ctx, "bench-variant", "");
> +
> + json_array_begin (&json_ctx, "ifuncs");
> FOR_EACH_IMPL (impl, 0)
> - printf ("\t%s", impl->name);
> - putchar ('\n');
> + json_element_string (&json_ctx, impl->name);
> + json_array_end (&json_ctx);
> +
> + json_array_begin (&json_ctx, "results");
> +
>
> for (i = 0; i < 32; ++i)
> {
> - do_test (0, 512, i);
> - do_test (i, 512, i);
> + do_test (&json_ctx, 0, 0, 512, i);
> + do_test (&json_ctx, i, 0, 512, i);
> + do_test (&json_ctx, 0, i, 512, i);
> + do_test (&json_ctx, i, i, 512, i);
> +
> }
>
> for (i = 1; i < 8; ++i)
> {
> - do_test (0, 16 << i, 4);
> - do_test (i, 16 << i, 4);
> + do_test (&json_ctx, 0, 0, 16 << i, 4);
> + do_test (&json_ctx, i, 0, 16 << i, 4);
> + do_test (&json_ctx, 0, i, 16 << i, 4);
> + do_test (&json_ctx, i, i, 16 << i, 4);
> }
>
> for (i = 1; i < 8; ++i)
> - do_test (i, 64, 10);
> + {
> + do_test (&json_ctx, i, 0, 64, 10);
> + do_test (&json_ctx, i, i, 64, 10);
> + }
>
> for (i = 0; i < 64; ++i)
> - do_test (0, i, 6);
> + {
> + do_test (&json_ctx, 0, 0, i, 6);
> + do_test (&json_ctx, 0, i, i, 6);
> + }
> +
> + json_array_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_document_end (&json_ctx);
>
> return ret;
> }
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c
2022-03-23 21:57 ` [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c Noah Goldstein
@ 2022-03-24 18:54 ` H.J. Lu
0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:54 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Just QOL change to make parsing the output of the benchtests more
> consistent.
> ---
> benchtests/bench-strspn.c | 78 +++++++++++++++++++++++++++++----------
> 1 file changed, 58 insertions(+), 20 deletions(-)
>
> diff --git a/benchtests/bench-strspn.c b/benchtests/bench-strspn.c
> index d79c36fae6..061e90c54d 100644
> --- a/benchtests/bench-strspn.c
> +++ b/benchtests/bench-strspn.c
> @@ -23,6 +23,7 @@
> # define TEST_NAME "wcsspn"
> #endif /* WIDE */
> #include "bench-string.h"
> +#include "json-lib.h"
>
> #define BIG_CHAR MAX_CHAR
>
> @@ -58,9 +59,10 @@ SIMPLE_STRSPN (const CHAR *s, const CHAR *acc)
> }
>
> static void
> -do_one_test (impl_t *impl, const CHAR *s, const CHAR *acc, size_t exp_res)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s,
> + const CHAR *acc, size_t exp_res)
> {
> - size_t res = CALL (impl, s, acc), i, iters = INNER_LOOP_ITERS_MEDIUM;
> + size_t res = CALL (impl, s, acc), i, iters = INNER_LOOP_ITERS;
> timing_t start, stop, cur;
>
> if (res != exp_res)
> @@ -80,21 +82,24 @@ do_one_test (impl_t *impl, const CHAR *s, const CHAR *acc, size_t exp_res)
>
> TIMING_DIFF (cur, start, stop);
>
> - TIMING_PRINT_MEAN ((double) cur, (double) iters);
> + json_element_double (json_ctx, (double)cur / (double)iters);
> }
>
> static void
> -do_test (size_t align, size_t pos, size_t len)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t pos,
> + size_t len)
> {
> size_t i;
> CHAR *acc, *s;
>
> - align &= 7;
> - if ((align + pos + 10) * sizeof (CHAR) >= page_size || len > 240 || ! len)
> + align1 &= 7;
> + if ((align1 + pos + 10) * sizeof (CHAR) >= page_size || len > 240 || !len)
> + return;
> + if ((align2 + len) * sizeof (CHAR) >= page_size)
> return;
>
> - acc = (CHAR *) (buf2) + (random () & 255);
> - s = (CHAR *) (buf1) + align;
> + acc = (CHAR *) (buf2) + align2;
> + s = (CHAR *) (buf1) + align1;
>
> for (i = 0; i < len; ++i)
> {
> @@ -118,43 +123,76 @@ do_test (size_t align, size_t pos, size_t len)
> s[i] = '\0';
> }
>
> - printf ("Length %4zd, alignment %2zd, acc len %2zd:", pos, align, len);
> + json_element_object_begin (json_ctx);
> + json_attr_uint (json_ctx, "len", len);
> + json_attr_uint (json_ctx, "pos", pos);
> + json_attr_uint (json_ctx, "align1", align1);
> + json_attr_uint (json_ctx, "align2", align2);
> + json_array_begin (json_ctx, "timings");
>
> FOR_EACH_IMPL (impl, 0)
> - do_one_test (impl, s, acc, pos);
> + do_one_test (json_ctx, impl, s, acc, pos);
>
> - putchar ('\n');
> + json_array_end (json_ctx);
> + json_element_object_end (json_ctx);
> }
>
> int
> test_main (void)
> {
> + json_ctx_t json_ctx;
> size_t i;
>
> test_init ();
>
> - printf ("%32s", "");
> + json_init (&json_ctx, 0, stdout);
> +
> + json_document_begin (&json_ctx);
> + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> + json_attr_object_begin (&json_ctx, "functions");
> + json_attr_object_begin (&json_ctx, TEST_NAME);
> + json_attr_string (&json_ctx, "bench-variant", "");
> +
> + json_array_begin (&json_ctx, "ifuncs");
> FOR_EACH_IMPL (impl, 0)
> - printf ("\t%s", impl->name);
> - putchar ('\n');
> + json_element_string (&json_ctx, impl->name);
> + json_array_end (&json_ctx);
> +
> + json_array_begin (&json_ctx, "results");
>
> for (i = 0; i < 32; ++i)
> {
> - do_test (0, 512, i);
> - do_test (i, 512, i);
> + do_test (&json_ctx, 0, 0, 512, i);
> + do_test (&json_ctx, i, 0, 512, i);
> + do_test (&json_ctx, 0, i, 512, i);
> + do_test (&json_ctx, i, i, 512, i);
> }
>
> for (i = 1; i < 8; ++i)
> {
> - do_test (0, 16 << i, 4);
> - do_test (i, 16 << i, 4);
> + do_test (&json_ctx, 0, 0, 16 << i, 4);
> + do_test (&json_ctx, i, 0, 16 << i, 4);
> + do_test (&json_ctx, 0, i, 16 << i, 4);
> + do_test (&json_ctx, i, i, 16 << i, 4);
> }
>
> for (i = 1; i < 8; ++i)
> - do_test (i, 64, 10);
> + {
> + do_test (&json_ctx, i, 0, 64, 10);
> + do_test (&json_ctx, i, i, 64, 10);
> + }
>
> for (i = 0; i < 64; ++i)
> - do_test (0, i, 6);
> + {
> + do_test (&json_ctx, 0, 0, i, 6);
> + do_test (&json_ctx, 0, i, i, 6);
> + }
> +
> + json_array_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_document_end (&json_ctx);
>
> return ret;
> }
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c
2022-03-23 21:57 ` [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c Noah Goldstein
@ 2022-03-24 18:55 ` H.J. Lu
2022-05-12 19:34 ` Sunil Pandey
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:55 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
> _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
> sign extensions.
>
> geometric_mean(N=20) of all benchmarks that don't fall back on
> sse2/strlen; New / Original: .928
>
> All string/memory tests pass.
> ---
> Geometric Mean N=20 runs; All functions page aligned
> len, align1, align2, pos, New Time / Old Time
> 0, 0, 0, 512, 1.207
> 1, 0, 0, 512, 1.039
> 1, 1, 0, 512, 0.997
> 1, 0, 1, 512, 0.981
> 1, 1, 1, 512, 0.977
> 2, 0, 0, 512, 1.02
> 2, 2, 0, 512, 0.979
> 2, 0, 2, 512, 0.902
> 2, 2, 2, 512, 0.958
> 3, 0, 0, 512, 0.978
> 3, 3, 0, 512, 0.988
> 3, 0, 3, 512, 0.979
> 3, 3, 3, 512, 0.955
> 4, 0, 0, 512, 0.969
> 4, 4, 0, 512, 0.991
> 4, 0, 4, 512, 0.94
> 4, 4, 4, 512, 0.958
> 5, 0, 0, 512, 0.963
> 5, 5, 0, 512, 1.004
> 5, 0, 5, 512, 0.948
> 5, 5, 5, 512, 0.971
> 6, 0, 0, 512, 0.933
> 6, 6, 0, 512, 1.007
> 6, 0, 6, 512, 0.921
> 6, 6, 6, 512, 0.969
> 7, 0, 0, 512, 0.928
> 7, 7, 0, 512, 0.976
> 7, 0, 7, 512, 0.932
> 7, 7, 7, 512, 0.995
> 8, 0, 0, 512, 0.931
> 8, 0, 8, 512, 0.766
> 9, 0, 0, 512, 0.965
> 9, 1, 0, 512, 0.999
> 9, 0, 9, 512, 0.765
> 9, 1, 9, 512, 0.97
> 10, 0, 0, 512, 0.976
> 10, 2, 0, 512, 0.991
> 10, 0, 10, 512, 0.768
> 10, 2, 10, 512, 0.926
> 11, 0, 0, 512, 0.958
> 11, 3, 0, 512, 1.006
> 11, 0, 11, 512, 0.768
> 11, 3, 11, 512, 0.908
> 12, 0, 0, 512, 0.945
> 12, 4, 0, 512, 0.896
> 12, 0, 12, 512, 0.764
> 12, 4, 12, 512, 0.785
> 13, 0, 0, 512, 0.957
> 13, 5, 0, 512, 1.019
> 13, 0, 13, 512, 0.76
> 13, 5, 13, 512, 0.785
> 14, 0, 0, 512, 0.918
> 14, 6, 0, 512, 1.004
> 14, 0, 14, 512, 0.78
> 14, 6, 14, 512, 0.711
> 15, 0, 0, 512, 0.855
> 15, 7, 0, 512, 0.985
> 15, 0, 15, 512, 0.779
> 15, 7, 15, 512, 0.772
> 16, 0, 0, 512, 0.987
> 16, 0, 16, 512, 0.99
> 17, 0, 0, 512, 0.996
> 17, 1, 0, 512, 0.979
> 17, 0, 17, 512, 1.001
> 17, 1, 17, 512, 1.03
> 18, 0, 0, 512, 0.976
> 18, 2, 0, 512, 0.989
> 18, 0, 18, 512, 0.976
> 18, 2, 18, 512, 0.992
> 19, 0, 0, 512, 0.991
> 19, 3, 0, 512, 0.988
> 19, 0, 19, 512, 1.009
> 19, 3, 19, 512, 1.018
> 20, 0, 0, 512, 0.999
> 20, 4, 0, 512, 1.005
> 20, 0, 20, 512, 0.993
> 20, 4, 20, 512, 0.983
> 21, 0, 0, 512, 0.982
> 21, 5, 0, 512, 0.988
> 21, 0, 21, 512, 0.978
> 21, 5, 21, 512, 0.984
> 22, 0, 0, 512, 0.988
> 22, 6, 0, 512, 0.979
> 22, 0, 22, 512, 0.984
> 22, 6, 22, 512, 0.983
> 23, 0, 0, 512, 0.996
> 23, 7, 0, 512, 0.998
> 23, 0, 23, 512, 0.979
> 23, 7, 23, 512, 0.987
> 24, 0, 0, 512, 0.99
> 24, 0, 24, 512, 0.979
> 25, 0, 0, 512, 0.985
> 25, 1, 0, 512, 0.988
> 25, 0, 25, 512, 0.99
> 25, 1, 25, 512, 0.986
> 26, 0, 0, 512, 1.005
> 26, 2, 0, 512, 0.995
> 26, 0, 26, 512, 0.992
> 26, 2, 26, 512, 0.983
> 27, 0, 0, 512, 0.986
> 27, 3, 0, 512, 0.978
> 27, 0, 27, 512, 0.986
> 27, 3, 27, 512, 0.973
> 28, 0, 0, 512, 0.995
> 28, 4, 0, 512, 0.993
> 28, 0, 28, 512, 0.983
> 28, 4, 28, 512, 1.005
> 29, 0, 0, 512, 0.983
> 29, 5, 0, 512, 0.982
> 29, 0, 29, 512, 0.984
> 29, 5, 29, 512, 1.005
> 30, 0, 0, 512, 0.978
> 30, 6, 0, 512, 0.985
> 30, 0, 30, 512, 0.994
> 30, 6, 30, 512, 0.993
> 31, 0, 0, 512, 0.984
> 31, 7, 0, 512, 0.983
> 31, 0, 31, 512, 1.0
> 31, 7, 31, 512, 1.031
> 4, 0, 0, 32, 0.916
> 4, 1, 0, 32, 0.952
> 4, 0, 1, 32, 0.927
> 4, 1, 1, 32, 0.969
> 4, 0, 0, 64, 0.961
> 4, 2, 0, 64, 0.955
> 4, 0, 2, 64, 0.975
> 4, 2, 2, 64, 0.972
> 4, 0, 0, 128, 0.971
> 4, 3, 0, 128, 0.982
> 4, 0, 3, 128, 0.945
> 4, 3, 3, 128, 0.971
> 4, 0, 0, 256, 1.004
> 4, 4, 0, 256, 0.966
> 4, 0, 4, 256, 0.961
> 4, 4, 4, 256, 0.971
> 4, 5, 0, 512, 0.929
> 4, 0, 5, 512, 0.969
> 4, 5, 5, 512, 0.985
> 4, 0, 0, 1024, 1.003
> 4, 6, 0, 1024, 1.009
> 4, 0, 6, 1024, 1.005
> 4, 6, 6, 1024, 0.999
> 4, 0, 0, 2048, 0.917
> 4, 7, 0, 2048, 1.015
> 4, 0, 7, 2048, 1.011
> 4, 7, 7, 2048, 0.907
> 10, 1, 0, 64, 0.964
> 10, 1, 1, 64, 0.966
> 10, 2, 0, 64, 0.953
> 10, 2, 2, 64, 0.972
> 10, 3, 0, 64, 0.962
> 10, 3, 3, 64, 0.969
> 10, 4, 0, 64, 0.957
> 10, 4, 4, 64, 0.969
> 10, 5, 0, 64, 0.961
> 10, 5, 5, 64, 0.965
> 10, 6, 0, 64, 0.949
> 10, 6, 6, 64, 0.9
> 10, 7, 0, 64, 0.957
> 10, 7, 7, 64, 0.897
> 6, 0, 0, 0, 0.991
> 6, 0, 0, 1, 1.011
> 6, 0, 1, 1, 0.939
> 6, 0, 0, 2, 1.016
> 6, 0, 2, 2, 0.94
> 6, 0, 0, 3, 1.019
> 6, 0, 3, 3, 0.941
> 6, 0, 0, 4, 1.056
> 6, 0, 4, 4, 0.884
> 6, 0, 0, 5, 0.977
> 6, 0, 5, 5, 0.934
> 6, 0, 0, 6, 0.954
> 6, 0, 6, 6, 0.93
> 6, 0, 0, 7, 0.963
> 6, 0, 7, 7, 0.916
> 6, 0, 0, 8, 0.963
> 6, 0, 8, 8, 0.945
> 6, 0, 0, 9, 1.028
> 6, 0, 9, 9, 0.942
> 6, 0, 0, 10, 0.955
> 6, 0, 10, 10, 0.831
> 6, 0, 0, 11, 0.948
> 6, 0, 11, 11, 0.82
> 6, 0, 0, 12, 1.033
> 6, 0, 12, 12, 0.873
> 6, 0, 0, 13, 0.983
> 6, 0, 13, 13, 0.852
> 6, 0, 0, 14, 0.984
> 6, 0, 14, 14, 0.853
> 6, 0, 0, 15, 0.984
> 6, 0, 15, 15, 0.882
> 6, 0, 0, 16, 0.971
> 6, 0, 16, 16, 0.958
> 6, 0, 0, 17, 0.938
> 6, 0, 17, 17, 0.947
> 6, 0, 0, 18, 0.96
> 6, 0, 18, 18, 0.938
> 6, 0, 0, 19, 0.903
> 6, 0, 19, 19, 0.943
> 6, 0, 0, 20, 0.947
> 6, 0, 20, 20, 0.951
> 6, 0, 0, 21, 0.948
> 6, 0, 21, 21, 0.96
> 6, 0, 0, 22, 0.926
> 6, 0, 22, 22, 0.951
> 6, 0, 0, 23, 0.923
> 6, 0, 23, 23, 0.959
> 6, 0, 0, 24, 0.918
> 6, 0, 24, 24, 0.952
> 6, 0, 0, 25, 0.97
> 6, 0, 25, 25, 0.952
> 6, 0, 0, 26, 0.871
> 6, 0, 26, 26, 0.869
> 6, 0, 0, 27, 0.935
> 6, 0, 27, 27, 0.836
> 6, 0, 0, 28, 0.936
> 6, 0, 28, 28, 0.857
> 6, 0, 0, 29, 0.876
> 6, 0, 29, 29, 0.859
> 6, 0, 0, 30, 0.934
> 6, 0, 30, 30, 0.857
> 6, 0, 0, 31, 0.962
> 6, 0, 31, 31, 0.86
> 6, 0, 0, 32, 0.912
> 6, 0, 32, 32, 0.94
> 6, 0, 0, 33, 0.903
> 6, 0, 33, 33, 0.968
> 6, 0, 0, 34, 0.913
> 6, 0, 34, 34, 0.896
> 6, 0, 0, 35, 0.904
> 6, 0, 35, 35, 0.913
> 6, 0, 0, 36, 0.905
> 6, 0, 36, 36, 0.907
> 6, 0, 0, 37, 0.899
> 6, 0, 37, 37, 0.9
> 6, 0, 0, 38, 0.912
> 6, 0, 38, 38, 0.919
> 6, 0, 0, 39, 0.925
> 6, 0, 39, 39, 0.927
> 6, 0, 0, 40, 0.923
> 6, 0, 40, 40, 0.972
> 6, 0, 0, 41, 0.92
> 6, 0, 41, 41, 0.966
> 6, 0, 0, 42, 0.915
> 6, 0, 42, 42, 0.834
> 6, 0, 0, 43, 0.92
> 6, 0, 43, 43, 0.856
> 6, 0, 0, 44, 0.908
> 6, 0, 44, 44, 0.858
> 6, 0, 0, 45, 0.932
> 6, 0, 45, 45, 0.847
> 6, 0, 0, 46, 0.927
> 6, 0, 46, 46, 0.859
> 6, 0, 0, 47, 0.902
> 6, 0, 47, 47, 0.855
> 6, 0, 0, 48, 0.949
> 6, 0, 48, 48, 0.934
> 6, 0, 0, 49, 0.907
> 6, 0, 49, 49, 0.943
> 6, 0, 0, 50, 0.934
> 6, 0, 50, 50, 0.943
> 6, 0, 0, 51, 0.933
> 6, 0, 51, 51, 0.939
> 6, 0, 0, 52, 0.944
> 6, 0, 52, 52, 0.944
> 6, 0, 0, 53, 0.939
> 6, 0, 53, 53, 0.938
> 6, 0, 0, 54, 0.9
> 6, 0, 54, 54, 0.923
> 6, 0, 0, 55, 0.9
> 6, 0, 55, 55, 0.927
> 6, 0, 0, 56, 0.9
> 6, 0, 56, 56, 0.917
> 6, 0, 0, 57, 0.9
> 6, 0, 57, 57, 0.916
> 6, 0, 0, 58, 0.914
> 6, 0, 58, 58, 0.784
> 6, 0, 0, 59, 0.863
> 6, 0, 59, 59, 0.846
> 6, 0, 0, 60, 0.88
> 6, 0, 60, 60, 0.827
> 6, 0, 0, 61, 0.896
> 6, 0, 61, 61, 0.847
> 6, 0, 0, 62, 0.894
> 6, 0, 62, 62, 0.865
> 6, 0, 0, 63, 0.934
> 6, 0, 63, 63, 0.866
>
> sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++---------------
> 1 file changed, 37 insertions(+), 46 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
> index 013aebf797..c312fab8b1 100644
> --- a/sysdeps/x86_64/multiarch/strcspn-c.c
> +++ b/sysdeps/x86_64/multiarch/strcspn-c.c
> @@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
> RETURN (NULL, strlen (s));
>
> const char *aligned;
> - __m128i mask;
> - int offset = (int) ((size_t) a & 15);
> + __m128i mask, maskz, zero;
> + unsigned int maskz_bits;
> + unsigned int offset = (unsigned int) ((size_t) a & 15);
> + zero = _mm_set1_epi8 (0);
> if (offset != 0)
> {
> /* Load masks. */
> aligned = (const char *) ((size_t) a & -16L);
> __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> -
> - mask = __m128i_shift_right (mask0, offset);
> + maskz = _mm_cmpeq_epi8 (mask0, zero);
>
> /* Find where the NULL terminator is. */
> - int length = _mm_cmpistri (mask, mask, 0x3a);
> - if (length == 16 - offset)
> - {
> - /* There is no NULL terminator. */
> - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
> - int index = _mm_cmpistri (mask1, mask1, 0x3a);
> - length += index;
> -
> - /* Don't use SSE4.2 if the length of A > 16. */
> - if (length > 16)
> - return STRCSPN_SSE2 (s, a);
> -
> - if (index != 0)
> - {
> - /* Combine mask0 and mask1. We could play games with
> - palignr, but frankly this data should be in L1 now
> - so do the merge via an unaligned load. */
> - mask = _mm_loadu_si128 ((__m128i *) a);
> - }
> - }
> + maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> + if (maskz_bits != 0)
> + {
> + mask = __m128i_shift_right (mask0, offset);
> + offset = (unsigned int) ((size_t) s & 15);
> + if (offset)
> + goto start_unaligned;
> +
> + aligned = s;
> + goto start_loop;
> + }
> }
> - else
> - {
> - /* A is aligned. */
> - mask = _mm_load_si128 ((__m128i *) a);
>
> - /* Find where the NULL terminator is. */
> - int length = _mm_cmpistri (mask, mask, 0x3a);
> - if (length == 16)
> - {
> - /* There is no NULL terminator. Don't use SSE4.2 if the length
> - of A > 16. */
> - if (a[16] != 0)
> - return STRCSPN_SSE2 (s, a);
> - }
> + /* A is aligned. */
> + mask = _mm_loadu_si128 ((__m128i *) a);
> + /* Find where the NULL terminator is. */
> + maskz = _mm_cmpeq_epi8 (mask, zero);
> + maskz_bits = _mm_movemask_epi8 (maskz);
> + if (maskz_bits == 0)
> + {
> + /* There is no NULL terminator. Don't use SSE4.2 if the length
> + of A > 16. */
> + if (a[16] != 0)
> + return STRCSPN_SSE2 (s, a);
> }
>
> - offset = (int) ((size_t) s & 15);
> + aligned = s;
> + offset = (unsigned int) ((size_t) s & 15);
> if (offset != 0)
> {
> + start_unaligned:
> /* Check partial string. */
> aligned = (const char *) ((size_t) s & -16L);
> __m128i value = _mm_load_si128 ((__m128i *) aligned);
>
> value = __m128i_shift_right (value, offset);
>
> - int length = _mm_cmpistri (mask, value, 0x2);
> + unsigned int length = _mm_cmpistri (mask, value, 0x2);
> /* No need to check ZFlag since ZFlag is always 1. */
> - int cflag = _mm_cmpistrc (mask, value, 0x2);
> + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> if (cflag)
> RETURN ((char *) (s + length), length);
> /* Find where the NULL terminator is. */
> - int index = _mm_cmpistri (value, value, 0x3a);
> + unsigned int index = _mm_cmpistri (value, value, 0x3a);
> if (index < 16 - offset)
> RETURN (NULL, index);
> aligned += 16;
> }
> - else
> - aligned = s;
>
> +start_loop:
> while (1)
> {
> __m128i value = _mm_load_si128 ((__m128i *) aligned);
> - int index = _mm_cmpistri (mask, value, 0x2);
> - int cflag = _mm_cmpistrc (mask, value, 0x2);
> - int zflag = _mm_cmpistrz (mask, value, 0x2);
> + unsigned int index = _mm_cmpistri (mask, value, 0x2);
> + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> + unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
> if (cflag)
> RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
> if (zflag)
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c
2022-03-23 21:57 ` [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c Noah Goldstein
@ 2022-03-24 18:56 ` H.J. Lu
2022-05-12 19:39 ` Sunil Pandey
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:56 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
> _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
> sign extensions.
>
> geometric_mean(N=20) of all benchmarks that don't fall back on
> sse2; New / Original: .901
>
> All string/memory tests pass.
> ---
> Geometric Mean N=20 runs; All functions page aligned
> len, align1, align2, pos, New Time / Old Time
> 1, 0, 0, 512, 0.768
> 1, 1, 0, 512, 0.666
> 1, 0, 1, 512, 1.193
> 1, 1, 1, 512, 0.872
> 2, 0, 0, 512, 0.698
> 2, 2, 0, 512, 0.687
> 2, 0, 2, 512, 1.393
> 2, 2, 2, 512, 0.944
> 3, 0, 0, 512, 0.691
> 3, 3, 0, 512, 0.676
> 3, 0, 3, 512, 1.388
> 3, 3, 3, 512, 0.948
> 4, 0, 0, 512, 0.74
> 4, 4, 0, 512, 0.678
> 4, 0, 4, 512, 1.421
> 4, 4, 4, 512, 0.943
> 5, 0, 0, 512, 0.691
> 5, 5, 0, 512, 0.675
> 5, 0, 5, 512, 1.348
> 5, 5, 5, 512, 0.952
> 6, 0, 0, 512, 0.685
> 6, 6, 0, 512, 0.67
> 6, 0, 6, 512, 1.333
> 6, 6, 6, 512, 0.95
> 7, 0, 0, 512, 0.688
> 7, 7, 0, 512, 0.675
> 7, 0, 7, 512, 1.344
> 7, 7, 7, 512, 0.919
> 8, 0, 0, 512, 0.716
> 8, 0, 8, 512, 0.935
> 9, 0, 0, 512, 0.716
> 9, 1, 0, 512, 0.712
> 9, 0, 9, 512, 0.956
> 9, 1, 9, 512, 0.992
> 10, 0, 0, 512, 0.699
> 10, 2, 0, 512, 0.68
> 10, 0, 10, 512, 0.952
> 10, 2, 10, 512, 0.932
> 11, 0, 0, 512, 0.705
> 11, 3, 0, 512, 0.685
> 11, 0, 11, 512, 0.956
> 11, 3, 11, 512, 0.927
> 12, 0, 0, 512, 0.695
> 12, 4, 0, 512, 0.675
> 12, 0, 12, 512, 0.948
> 12, 4, 12, 512, 0.928
> 13, 0, 0, 512, 0.7
> 13, 5, 0, 512, 0.678
> 13, 0, 13, 512, 0.944
> 13, 5, 13, 512, 0.931
> 14, 0, 0, 512, 0.703
> 14, 6, 0, 512, 0.678
> 14, 0, 14, 512, 0.949
> 14, 6, 14, 512, 0.93
> 15, 0, 0, 512, 0.694
> 15, 7, 0, 512, 0.678
> 15, 0, 15, 512, 0.953
> 15, 7, 15, 512, 0.924
> 16, 0, 0, 512, 1.021
> 16, 0, 16, 512, 1.067
> 17, 0, 0, 512, 0.991
> 17, 1, 0, 512, 0.984
> 17, 0, 17, 512, 0.979
> 17, 1, 17, 512, 0.993
> 18, 0, 0, 512, 0.992
> 18, 2, 0, 512, 1.008
> 18, 0, 18, 512, 1.016
> 18, 2, 18, 512, 0.993
> 19, 0, 0, 512, 0.984
> 19, 3, 0, 512, 0.985
> 19, 0, 19, 512, 1.007
> 19, 3, 19, 512, 1.006
> 20, 0, 0, 512, 0.969
> 20, 4, 0, 512, 0.968
> 20, 0, 20, 512, 0.975
> 20, 4, 20, 512, 0.975
> 21, 0, 0, 512, 0.992
> 21, 5, 0, 512, 0.992
> 21, 0, 21, 512, 0.98
> 21, 5, 21, 512, 0.97
> 22, 0, 0, 512, 0.989
> 22, 6, 0, 512, 0.987
> 22, 0, 22, 512, 0.99
> 22, 6, 22, 512, 0.985
> 23, 0, 0, 512, 0.989
> 23, 7, 0, 512, 0.98
> 23, 0, 23, 512, 1.0
> 23, 7, 23, 512, 0.993
> 24, 0, 0, 512, 0.99
> 24, 0, 24, 512, 0.998
> 25, 0, 0, 512, 1.01
> 25, 1, 0, 512, 1.0
> 25, 0, 25, 512, 0.97
> 25, 1, 25, 512, 0.967
> 26, 0, 0, 512, 1.009
> 26, 2, 0, 512, 0.986
> 26, 0, 26, 512, 0.997
> 26, 2, 26, 512, 0.993
> 27, 0, 0, 512, 0.984
> 27, 3, 0, 512, 0.997
> 27, 0, 27, 512, 0.989
> 27, 3, 27, 512, 0.976
> 28, 0, 0, 512, 0.991
> 28, 4, 0, 512, 1.003
> 28, 0, 28, 512, 0.986
> 28, 4, 28, 512, 0.989
> 29, 0, 0, 512, 0.986
> 29, 5, 0, 512, 0.985
> 29, 0, 29, 512, 0.984
> 29, 5, 29, 512, 0.977
> 30, 0, 0, 512, 0.991
> 30, 6, 0, 512, 0.987
> 30, 0, 30, 512, 0.979
> 30, 6, 30, 512, 0.974
> 31, 0, 0, 512, 0.995
> 31, 7, 0, 512, 0.995
> 31, 0, 31, 512, 0.994
> 31, 7, 31, 512, 0.984
> 4, 0, 0, 32, 0.861
> 4, 1, 0, 32, 0.864
> 4, 0, 1, 32, 0.962
> 4, 1, 1, 32, 0.967
> 4, 0, 0, 64, 0.884
> 4, 2, 0, 64, 0.818
> 4, 0, 2, 64, 0.889
> 4, 2, 2, 64, 0.918
> 4, 0, 0, 128, 0.942
> 4, 3, 0, 128, 0.884
> 4, 0, 3, 128, 0.931
> 4, 3, 3, 128, 0.883
> 4, 0, 0, 256, 0.964
> 4, 4, 0, 256, 0.922
> 4, 0, 4, 256, 0.956
> 4, 4, 4, 256, 0.93
> 4, 5, 0, 512, 0.833
> 4, 0, 5, 512, 1.027
> 4, 5, 5, 512, 0.929
> 4, 0, 0, 1024, 0.998
> 4, 6, 0, 1024, 0.986
> 4, 0, 6, 1024, 0.984
> 4, 6, 6, 1024, 0.977
> 4, 0, 0, 2048, 0.991
> 4, 7, 0, 2048, 0.987
> 4, 0, 7, 2048, 0.996
> 4, 7, 7, 2048, 0.98
> 10, 1, 0, 64, 0.826
> 10, 1, 1, 64, 0.907
> 10, 2, 0, 64, 0.829
> 10, 2, 2, 64, 0.91
> 10, 3, 0, 64, 0.83
> 10, 3, 3, 64, 0.915
> 10, 4, 0, 64, 0.83
> 10, 4, 4, 64, 0.911
> 10, 5, 0, 64, 0.828
> 10, 5, 5, 64, 0.905
> 10, 6, 0, 64, 0.828
> 10, 6, 6, 64, 0.812
> 10, 7, 0, 64, 0.83
> 10, 7, 7, 64, 0.819
> 6, 0, 0, 0, 1.261
> 6, 0, 0, 1, 1.252
> 6, 0, 1, 1, 0.845
> 6, 0, 0, 2, 1.27
> 6, 0, 2, 2, 0.85
> 6, 0, 0, 3, 1.269
> 6, 0, 3, 3, 0.845
> 6, 0, 0, 4, 1.287
> 6, 0, 4, 4, 0.852
> 6, 0, 0, 5, 1.278
> 6, 0, 5, 5, 0.851
> 6, 0, 0, 6, 1.269
> 6, 0, 6, 6, 0.841
> 6, 0, 0, 7, 1.268
> 6, 0, 7, 7, 0.851
> 6, 0, 0, 8, 1.291
> 6, 0, 8, 8, 0.837
> 6, 0, 0, 9, 1.283
> 6, 0, 9, 9, 0.831
> 6, 0, 0, 10, 1.252
> 6, 0, 10, 10, 0.997
> 6, 0, 0, 11, 1.295
> 6, 0, 11, 11, 1.046
> 6, 0, 0, 12, 1.296
> 6, 0, 12, 12, 1.038
> 6, 0, 0, 13, 1.287
> 6, 0, 13, 13, 1.082
> 6, 0, 0, 14, 1.284
> 6, 0, 14, 14, 1.001
> 6, 0, 0, 15, 1.286
> 6, 0, 15, 15, 1.002
> 6, 0, 0, 16, 0.894
> 6, 0, 16, 16, 0.874
> 6, 0, 0, 17, 0.892
> 6, 0, 17, 17, 0.974
> 6, 0, 0, 18, 0.907
> 6, 0, 18, 18, 0.993
> 6, 0, 0, 19, 0.909
> 6, 0, 19, 19, 0.99
> 6, 0, 0, 20, 0.894
> 6, 0, 20, 20, 0.978
> 6, 0, 0, 21, 0.89
> 6, 0, 21, 21, 0.958
> 6, 0, 0, 22, 0.893
> 6, 0, 22, 22, 0.99
> 6, 0, 0, 23, 0.899
> 6, 0, 23, 23, 0.986
> 6, 0, 0, 24, 0.893
> 6, 0, 24, 24, 0.989
> 6, 0, 0, 25, 0.889
> 6, 0, 25, 25, 0.982
> 6, 0, 0, 26, 0.889
> 6, 0, 26, 26, 0.852
> 6, 0, 0, 27, 0.89
> 6, 0, 27, 27, 0.832
> 6, 0, 0, 28, 0.89
> 6, 0, 28, 28, 0.831
> 6, 0, 0, 29, 0.89
> 6, 0, 29, 29, 0.838
> 6, 0, 0, 30, 0.907
> 6, 0, 30, 30, 0.833
> 6, 0, 0, 31, 0.888
> 6, 0, 31, 31, 0.837
> 6, 0, 0, 32, 0.853
> 6, 0, 32, 32, 0.828
> 6, 0, 0, 33, 0.857
> 6, 0, 33, 33, 0.947
> 6, 0, 0, 34, 0.847
> 6, 0, 34, 34, 0.954
> 6, 0, 0, 35, 0.841
> 6, 0, 35, 35, 0.94
> 6, 0, 0, 36, 0.854
> 6, 0, 36, 36, 0.958
> 6, 0, 0, 37, 0.856
> 6, 0, 37, 37, 0.957
> 6, 0, 0, 38, 0.839
> 6, 0, 38, 38, 0.962
> 6, 0, 0, 39, 0.866
> 6, 0, 39, 39, 0.945
> 6, 0, 0, 40, 0.845
> 6, 0, 40, 40, 0.961
> 6, 0, 0, 41, 0.858
> 6, 0, 41, 41, 0.961
> 6, 0, 0, 42, 0.862
> 6, 0, 42, 42, 0.825
> 6, 0, 0, 43, 0.864
> 6, 0, 43, 43, 0.82
> 6, 0, 0, 44, 0.843
> 6, 0, 44, 44, 0.81
> 6, 0, 0, 45, 0.859
> 6, 0, 45, 45, 0.816
> 6, 0, 0, 46, 0.866
> 6, 0, 46, 46, 0.81
> 6, 0, 0, 47, 0.858
> 6, 0, 47, 47, 0.807
> 6, 0, 0, 48, 0.87
> 6, 0, 48, 48, 0.87
> 6, 0, 0, 49, 0.871
> 6, 0, 49, 49, 0.874
> 6, 0, 0, 50, 0.87
> 6, 0, 50, 50, 0.881
> 6, 0, 0, 51, 0.868
> 6, 0, 51, 51, 0.875
> 6, 0, 0, 52, 0.873
> 6, 0, 52, 52, 0.871
> 6, 0, 0, 53, 0.866
> 6, 0, 53, 53, 0.882
> 6, 0, 0, 54, 0.863
> 6, 0, 54, 54, 0.876
> 6, 0, 0, 55, 0.851
> 6, 0, 55, 55, 0.871
> 6, 0, 0, 56, 0.867
> 6, 0, 56, 56, 0.888
> 6, 0, 0, 57, 0.862
> 6, 0, 57, 57, 0.899
> 6, 0, 0, 58, 0.873
> 6, 0, 58, 58, 0.798
> 6, 0, 0, 59, 0.881
> 6, 0, 59, 59, 0.785
> 6, 0, 0, 60, 0.867
> 6, 0, 60, 60, 0.797
> 6, 0, 0, 61, 0.872
> 6, 0, 61, 61, 0.791
> 6, 0, 0, 62, 0.859
> 6, 0, 62, 62, 0.79
> 6, 0, 0, 63, 0.87
> 6, 0, 63, 63, 0.796
>
> sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++----------------
> 1 file changed, 39 insertions(+), 47 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
> index 8fb3aba64d..6124033ceb 100644
> --- a/sysdeps/x86_64/multiarch/strspn-c.c
> +++ b/sysdeps/x86_64/multiarch/strspn-c.c
> @@ -62,81 +62,73 @@ __strspn_sse42 (const char *s, const char *a)
> return 0;
>
> const char *aligned;
> - __m128i mask;
> - int offset = (int) ((size_t) a & 15);
> + __m128i mask, maskz, zero;
> + unsigned int maskz_bits;
> + unsigned int offset = (int) ((size_t) a & 15);
> + zero = _mm_set1_epi8 (0);
> if (offset != 0)
> {
> /* Load masks. */
> aligned = (const char *) ((size_t) a & -16L);
> __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> -
> - mask = __m128i_shift_right (mask0, offset);
> + maskz = _mm_cmpeq_epi8 (mask0, zero);
>
> /* Find where the NULL terminator is. */
> - int length = _mm_cmpistri (mask, mask, 0x3a);
> - if (length == 16 - offset)
> - {
> - /* There is no NULL terminator. */
> - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
> - int index = _mm_cmpistri (mask1, mask1, 0x3a);
> - length += index;
> -
> - /* Don't use SSE4.2 if the length of A > 16. */
> - if (length > 16)
> - return __strspn_sse2 (s, a);
> -
> - if (index != 0)
> - {
> - /* Combine mask0 and mask1. We could play games with
> - palignr, but frankly this data should be in L1 now
> - so do the merge via an unaligned load. */
> - mask = _mm_loadu_si128 ((__m128i *) a);
> - }
> - }
> + maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> + if (maskz_bits != 0)
> + {
> + mask = __m128i_shift_right (mask0, offset);
> + offset = (unsigned int) ((size_t) s & 15);
> + if (offset)
> + goto start_unaligned;
> +
> + aligned = s;
> + goto start_loop;
> + }
> }
> - else
> - {
> - /* A is aligned. */
> - mask = _mm_load_si128 ((__m128i *) a);
>
> - /* Find where the NULL terminator is. */
> - int length = _mm_cmpistri (mask, mask, 0x3a);
> - if (length == 16)
> - {
> - /* There is no NULL terminator. Don't use SSE4.2 if the length
> - of A > 16. */
> - if (a[16] != 0)
> - return __strspn_sse2 (s, a);
> - }
> + /* A is aligned. */
> + mask = _mm_loadu_si128 ((__m128i *) a);
> +
> + /* Find where the NULL terminator is. */
> + maskz = _mm_cmpeq_epi8 (mask, zero);
> + maskz_bits = _mm_movemask_epi8 (maskz);
> + if (maskz_bits == 0)
> + {
> + /* There is no NULL terminator. Don't use SSE4.2 if the length
> + of A > 16. */
> + if (a[16] != 0)
> + return __strspn_sse2 (s, a);
> }
> + aligned = s;
> + offset = (unsigned int) ((size_t) s & 15);
>
> - offset = (int) ((size_t) s & 15);
> if (offset != 0)
> {
> + start_unaligned:
> /* Check partial string. */
> aligned = (const char *) ((size_t) s & -16L);
> __m128i value = _mm_load_si128 ((__m128i *) aligned);
> + __m128i adj_value = __m128i_shift_right (value, offset);
>
> - value = __m128i_shift_right (value, offset);
> -
> - int length = _mm_cmpistri (mask, value, 0x12);
> + unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
> /* No need to check CFlag since it is always 1. */
> if (length < 16 - offset)
> return length;
> /* Find where the NULL terminator is. */
> - int index = _mm_cmpistri (value, value, 0x3a);
> - if (index < 16 - offset)
> + maskz = _mm_cmpeq_epi8 (value, zero);
> + maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> + if (maskz_bits != 0)
> return length;
> aligned += 16;
> }
> - else
> - aligned = s;
>
> +start_loop:
> while (1)
> {
> __m128i value = _mm_load_si128 ((__m128i *) aligned);
> - int index = _mm_cmpistri (mask, value, 0x12);
> - int cflag = _mm_cmpistrc (mask, value, 0x12);
> + unsigned int index = _mm_cmpistri (mask, value, 0x12);
> + unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
> if (cflag)
> return (size_t) (aligned + index - s);
> aligned += 16;
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation
2022-03-23 21:57 ` [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation Noah Goldstein
@ 2022-03-24 18:57 ` H.J. Lu
2022-05-12 19:40 ` Sunil Pandey
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:57 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:00 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The generic implementation is faster.
>
> geometric_mean(N=20) of all benchmarks New / Original: .678
>
> All string/memory tests pass.
> ---
> Geometric Mean N=20 runs; All functions page aligned
> len, align1, align2, pos, New Time / Old Time
> 0, 0, 0, 512, 0.054
> 1, 0, 0, 512, 0.055
> 1, 1, 0, 512, 0.051
> 1, 0, 1, 512, 0.054
> 1, 1, 1, 512, 0.054
> 2, 0, 0, 512, 0.861
> 2, 2, 0, 512, 0.861
> 2, 0, 2, 512, 0.861
> 2, 2, 2, 512, 0.864
> 3, 0, 0, 512, 0.854
> 3, 3, 0, 512, 0.848
> 3, 0, 3, 512, 0.845
> 3, 3, 3, 512, 0.85
> 4, 0, 0, 512, 0.851
> 4, 4, 0, 512, 0.85
> 4, 0, 4, 512, 0.852
> 4, 4, 4, 512, 0.849
> 5, 0, 0, 512, 0.938
> 5, 5, 0, 512, 0.94
> 5, 0, 5, 512, 0.864
> 5, 5, 5, 512, 0.86
> 6, 0, 0, 512, 0.858
> 6, 6, 0, 512, 0.869
> 6, 0, 6, 512, 0.847
> 6, 6, 6, 512, 0.868
> 7, 0, 0, 512, 0.867
> 7, 7, 0, 512, 0.861
> 7, 0, 7, 512, 0.864
> 7, 7, 7, 512, 0.863
> 8, 0, 0, 512, 0.884
> 8, 0, 8, 512, 0.884
> 9, 0, 0, 512, 0.886
> 9, 1, 0, 512, 0.894
> 9, 0, 9, 512, 0.889
> 9, 1, 9, 512, 0.886
> 10, 0, 0, 512, 0.859
> 10, 2, 0, 512, 0.859
> 10, 0, 10, 512, 0.862
> 10, 2, 10, 512, 0.861
> 11, 0, 0, 512, 0.846
> 11, 3, 0, 512, 0.865
> 11, 0, 11, 512, 0.859
> 11, 3, 11, 512, 0.862
> 12, 0, 0, 512, 0.858
> 12, 4, 0, 512, 0.857
> 12, 0, 12, 512, 0.964
> 12, 4, 12, 512, 0.876
> 13, 0, 0, 512, 0.827
> 13, 5, 0, 512, 0.805
> 13, 0, 13, 512, 0.821
> 13, 5, 13, 512, 0.825
> 14, 0, 0, 512, 0.786
> 14, 6, 0, 512, 0.786
> 14, 0, 14, 512, 0.803
> 14, 6, 14, 512, 0.783
> 15, 0, 0, 512, 0.778
> 15, 7, 0, 512, 0.792
> 15, 0, 15, 512, 0.796
> 15, 7, 15, 512, 0.799
> 16, 0, 0, 512, 0.803
> 16, 0, 16, 512, 0.815
> 17, 0, 0, 512, 0.812
> 17, 1, 0, 512, 0.826
> 17, 0, 17, 512, 0.803
> 17, 1, 17, 512, 0.856
> 18, 0, 0, 512, 0.801
> 18, 2, 0, 512, 0.886
> 18, 0, 18, 512, 0.805
> 18, 2, 18, 512, 0.807
> 19, 0, 0, 512, 0.814
> 19, 3, 0, 512, 0.804
> 19, 0, 19, 512, 0.813
> 19, 3, 19, 512, 0.814
> 20, 0, 0, 512, 0.885
> 20, 4, 0, 512, 0.799
> 20, 0, 20, 512, 0.826
> 20, 4, 20, 512, 0.808
> 21, 0, 0, 512, 0.816
> 21, 5, 0, 512, 0.824
> 21, 0, 21, 512, 0.819
> 21, 5, 21, 512, 0.826
> 22, 0, 0, 512, 0.814
> 22, 6, 0, 512, 0.824
> 22, 0, 22, 512, 0.81
> 22, 6, 22, 512, 0.806
> 23, 0, 0, 512, 0.825
> 23, 7, 0, 512, 0.829
> 23, 0, 23, 512, 0.809
> 23, 7, 23, 512, 0.823
> 24, 0, 0, 512, 0.829
> 24, 0, 24, 512, 0.823
> 25, 0, 0, 512, 0.864
> 25, 1, 0, 512, 0.895
> 25, 0, 25, 512, 0.88
> 25, 1, 25, 512, 0.848
> 26, 0, 0, 512, 0.903
> 26, 2, 0, 512, 0.888
> 26, 0, 26, 512, 0.894
> 26, 2, 26, 512, 0.89
> 27, 0, 0, 512, 0.914
> 27, 3, 0, 512, 0.917
> 27, 0, 27, 512, 0.902
> 27, 3, 27, 512, 0.887
> 28, 0, 0, 512, 0.887
> 28, 4, 0, 512, 0.877
> 28, 0, 28, 512, 0.893
> 28, 4, 28, 512, 0.866
> 29, 0, 0, 512, 0.885
> 29, 5, 0, 512, 0.907
> 29, 0, 29, 512, 0.894
> 29, 5, 29, 512, 0.906
> 30, 0, 0, 512, 0.88
> 30, 6, 0, 512, 0.898
> 30, 0, 30, 512, 0.9
> 30, 6, 30, 512, 0.895
> 31, 0, 0, 512, 0.893
> 31, 7, 0, 512, 0.874
> 31, 0, 31, 512, 0.894
> 31, 7, 31, 512, 0.899
> 4, 0, 0, 32, 0.618
> 4, 1, 0, 32, 0.627
> 4, 0, 1, 32, 0.625
> 4, 1, 1, 32, 0.613
> 4, 0, 0, 64, 0.913
> 4, 2, 0, 64, 0.801
> 4, 0, 2, 64, 0.759
> 4, 2, 2, 64, 0.761
> 4, 0, 0, 128, 0.822
> 4, 3, 0, 128, 0.863
> 4, 0, 3, 128, 0.867
> 4, 3, 3, 128, 0.917
> 4, 0, 0, 256, 0.816
> 4, 4, 0, 256, 0.812
> 4, 0, 4, 256, 0.803
> 4, 4, 4, 256, 0.811
> 4, 5, 0, 512, 0.848
> 4, 0, 5, 512, 0.843
> 4, 5, 5, 512, 0.857
> 4, 0, 0, 1024, 0.886
> 4, 6, 0, 1024, 0.887
> 4, 0, 6, 1024, 0.881
> 4, 6, 6, 1024, 0.873
> 4, 0, 0, 2048, 0.892
> 4, 7, 0, 2048, 0.894
> 4, 0, 7, 2048, 0.89
> 4, 7, 7, 2048, 0.874
> 10, 1, 0, 64, 0.946
> 10, 1, 1, 64, 0.81
> 10, 2, 0, 64, 0.804
> 10, 2, 2, 64, 0.82
> 10, 3, 0, 64, 0.772
> 10, 3, 3, 64, 0.772
> 10, 4, 0, 64, 0.748
> 10, 4, 4, 64, 0.751
> 10, 5, 0, 64, 0.76
> 10, 5, 5, 64, 0.76
> 10, 6, 0, 64, 0.726
> 10, 6, 6, 64, 0.718
> 10, 7, 0, 64, 0.724
> 10, 7, 7, 64, 0.72
> 6, 0, 0, 0, 0.415
> 6, 0, 0, 1, 0.423
> 6, 0, 1, 1, 0.412
> 6, 0, 0, 2, 0.433
> 6, 0, 2, 2, 0.434
> 6, 0, 0, 3, 0.427
> 6, 0, 3, 3, 0.428
> 6, 0, 0, 4, 0.465
> 6, 0, 4, 4, 0.466
> 6, 0, 0, 5, 0.463
> 6, 0, 5, 5, 0.468
> 6, 0, 0, 6, 0.435
> 6, 0, 6, 6, 0.444
> 6, 0, 0, 7, 0.41
> 6, 0, 7, 7, 0.42
> 6, 0, 0, 8, 0.474
> 6, 0, 8, 8, 0.501
> 6, 0, 0, 9, 0.471
> 6, 0, 9, 9, 0.489
> 6, 0, 0, 10, 0.462
> 6, 0, 10, 10, 0.46
> 6, 0, 0, 11, 0.459
> 6, 0, 11, 11, 0.458
> 6, 0, 0, 12, 0.516
> 6, 0, 12, 12, 0.51
> 6, 0, 0, 13, 0.494
> 6, 0, 13, 13, 0.524
> 6, 0, 0, 14, 0.486
> 6, 0, 14, 14, 0.5
> 6, 0, 0, 15, 0.48
> 6, 0, 15, 15, 0.501
> 6, 0, 0, 16, 0.54
> 6, 0, 16, 16, 0.538
> 6, 0, 0, 17, 0.503
> 6, 0, 17, 17, 0.541
> 6, 0, 0, 18, 0.537
> 6, 0, 18, 18, 0.549
> 6, 0, 0, 19, 0.527
> 6, 0, 19, 19, 0.537
> 6, 0, 0, 20, 0.539
> 6, 0, 20, 20, 0.554
> 6, 0, 0, 21, 0.558
> 6, 0, 21, 21, 0.541
> 6, 0, 0, 22, 0.546
> 6, 0, 22, 22, 0.561
> 6, 0, 0, 23, 0.54
> 6, 0, 23, 23, 0.536
> 6, 0, 0, 24, 0.565
> 6, 0, 24, 24, 0.584
> 6, 0, 0, 25, 0.563
> 6, 0, 25, 25, 0.58
> 6, 0, 0, 26, 0.555
> 6, 0, 26, 26, 0.584
> 6, 0, 0, 27, 0.569
> 6, 0, 27, 27, 0.587
> 6, 0, 0, 28, 0.612
> 6, 0, 28, 28, 0.623
> 6, 0, 0, 29, 0.604
> 6, 0, 29, 29, 0.621
> 6, 0, 0, 30, 0.59
> 6, 0, 30, 30, 0.609
> 6, 0, 0, 31, 0.577
> 6, 0, 31, 31, 0.588
> 6, 0, 0, 32, 0.621
> 6, 0, 32, 32, 0.608
> 6, 0, 0, 33, 0.601
> 6, 0, 33, 33, 0.623
> 6, 0, 0, 34, 0.614
> 6, 0, 34, 34, 0.615
> 6, 0, 0, 35, 0.598
> 6, 0, 35, 35, 0.608
> 6, 0, 0, 36, 0.626
> 6, 0, 36, 36, 0.634
> 6, 0, 0, 37, 0.62
> 6, 0, 37, 37, 0.634
> 6, 0, 0, 38, 0.612
> 6, 0, 38, 38, 0.637
> 6, 0, 0, 39, 0.627
> 6, 0, 39, 39, 0.612
> 6, 0, 0, 40, 0.661
> 6, 0, 40, 40, 0.674
> 6, 0, 0, 41, 0.633
> 6, 0, 41, 41, 0.643
> 6, 0, 0, 42, 0.634
> 6, 0, 42, 42, 0.636
> 6, 0, 0, 43, 0.619
> 6, 0, 43, 43, 0.625
> 6, 0, 0, 44, 0.654
> 6, 0, 44, 44, 0.654
> 6, 0, 0, 45, 0.647
> 6, 0, 45, 45, 0.649
> 6, 0, 0, 46, 0.651
> 6, 0, 46, 46, 0.651
> 6, 0, 0, 47, 0.646
> 6, 0, 47, 47, 0.648
> 6, 0, 0, 48, 0.662
> 6, 0, 48, 48, 0.664
> 6, 0, 0, 49, 0.68
> 6, 0, 49, 49, 0.667
> 6, 0, 0, 50, 0.654
> 6, 0, 50, 50, 0.659
> 6, 0, 0, 51, 0.638
> 6, 0, 51, 51, 0.639
> 6, 0, 0, 52, 0.665
> 6, 0, 52, 52, 0.669
> 6, 0, 0, 53, 0.658
> 6, 0, 53, 53, 0.656
> 6, 0, 0, 54, 0.669
> 6, 0, 54, 54, 0.67
> 6, 0, 0, 55, 0.668
> 6, 0, 55, 55, 0.664
> 6, 0, 0, 56, 0.701
> 6, 0, 56, 56, 0.695
> 6, 0, 0, 57, 0.687
> 6, 0, 57, 57, 0.696
> 6, 0, 0, 58, 0.693
> 6, 0, 58, 58, 0.704
> 6, 0, 0, 59, 0.695
> 6, 0, 59, 59, 0.708
> 6, 0, 0, 60, 0.708
> 6, 0, 60, 60, 0.728
> 6, 0, 0, 61, 0.708
> 6, 0, 61, 61, 0.71
> 6, 0, 0, 62, 0.715
> 6, 0, 62, 62, 0.705
> 6, 0, 0, 63, 0.677
> 6, 0, 63, 63, 0.702
>
> .../{strcspn-sse2.S => strcspn-sse2.c} | 8 +-
> sysdeps/x86_64/strcspn.S | 119 ------------------
> 2 files changed, 4 insertions(+), 123 deletions(-)
> rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strcspn-sse2.c} (85%)
> delete mode 100644 sysdeps/x86_64/strcspn.S
>
> diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c
> similarity index 85%
> rename from sysdeps/x86_64/multiarch/strcspn-sse2.S
> rename to sysdeps/x86_64/multiarch/strcspn-sse2.c
> index f97e856e1f..3a04bb39fc 100644
> --- a/sysdeps/x86_64/multiarch/strcspn-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c
> @@ -1,4 +1,4 @@
> -/* strcspn optimized with SSE2.
> +/* strcspn.
> Copyright (C) 2017-2022 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> @@ -19,10 +19,10 @@
> #if IS_IN (libc)
>
> # include <sysdep.h>
> -# define strcspn __strcspn_sse2
> +# define STRCSPN __strcspn_sse2
>
> # undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strcspn)
> +# define libc_hidden_builtin_def(STRCSPN)
> #endif
>
> -#include <sysdeps/x86_64/strcspn.S>
> +#include <string/strcspn.c>
> diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S
> deleted file mode 100644
> index f3cd86c606..0000000000
> --- a/sysdeps/x86_64/strcspn.S
> +++ /dev/null
> @@ -1,119 +0,0 @@
> -/* strcspn (str, ss) -- Return the length of the initial segment of STR
> - which contains no characters from SS.
> - For AMD x86-64.
> - Copyright (C) 1994-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -#include "asm-syntax.h"
> -
> - .text
> -ENTRY (strcspn)
> -
> - movq %rdi, %rdx /* Save SRC. */
> -
> - /* First we create a table with flags for all possible characters.
> - For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
> - supported by the C string functions we have 256 characters.
> - Before inserting marks for the stop characters we clear the whole
> - table. */
> - movq %rdi, %r8 /* Save value. */
> - subq $256, %rsp /* Make space for 256 bytes. */
> - cfi_adjust_cfa_offset(256)
> - movl $32, %ecx /* 32*8 bytes = 256 bytes. */
> - movq %rsp, %rdi
> - xorl %eax, %eax /* We store 0s. */
> - cld
> - rep
> - stosq
> -
> - movq %rsi, %rax /* Setup skipset. */
> -
> -/* For understanding the following code remember that %rcx == 0 now.
> - Although all the following instruction only modify %cl we always
> - have a correct zero-extended 64-bit value in %rcx. */
> -
> - .p2align 4
> -L(2): movb (%rax), %cl /* get byte from skipset */
> - testb %cl, %cl /* is NUL char? */
> - jz L(1) /* yes => start compare loop */
> - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
> -
> - movb 1(%rax), %cl /* get byte from skipset */
> - testb $0xff, %cl /* is NUL char? */
> - jz L(1) /* yes => start compare loop */
> - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
> -
> - movb 2(%rax), %cl /* get byte from skipset */
> - testb $0xff, %cl /* is NUL char? */
> - jz L(1) /* yes => start compare loop */
> - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
> -
> - movb 3(%rax), %cl /* get byte from skipset */
> - addq $4, %rax /* increment skipset pointer */
> - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
> - testb $0xff, %cl /* is NUL char? */
> - jnz L(2) /* no => process next dword from skipset */
> -
> -L(1): leaq -4(%rdx), %rax /* prepare loop */
> -
> - /* We use a neat trick for the following loop. Normally we would
> - have to test for two termination conditions
> - 1. a character in the skipset was found
> - and
> - 2. the end of the string was found
> - But as a sign that the character is in the skipset we store its
> - value in the table. But the value of NUL is NUL so the loop
> - terminates for NUL in every case. */
> -
> - .p2align 4
> -L(3): addq $4, %rax /* adjust pointer for full loop round */
> -
> - movb (%rax), %cl /* get byte from string */
> - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> - je L(4) /* yes => return */
> -
> - movb 1(%rax), %cl /* get byte from string */
> - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> - je L(5) /* yes => return */
> -
> - movb 2(%rax), %cl /* get byte from string */
> - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> - jz L(6) /* yes => return */
> -
> - movb 3(%rax), %cl /* get byte from string */
> - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> - jne L(3) /* no => start loop again */
> -
> - incq %rax /* adjust pointer */
> -L(6): incq %rax
> -L(5): incq %rax
> -
> -L(4): addq $256, %rsp /* remove skipset */
> - cfi_adjust_cfa_offset(-256)
> -#ifdef USE_AS_STRPBRK
> - xorl %edx,%edx
> - orb %cl, %cl /* was last character NUL? */
> - cmovzq %rdx, %rax /* Yes: return NULL */
> -#else
> - subq %rdx, %rax /* we have to return the number of valid
> - characters, so compute distance to first
> - non-valid character */
> -#endif
> - ret
> -END (strcspn)
> -libc_hidden_builtin_def (strcspn)
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 10/23] x86: Remove strpbrk-sse2.S and use the generic implementation
2022-03-23 21:57 ` [PATCH v1 10/23] x86: Remove strpbrk-sse2.S " Noah Goldstein
@ 2022-03-24 18:57 ` H.J. Lu
2022-05-12 19:41 ` Sunil Pandey
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:57 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:00 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The generic implementation is faster (see strcspn commit).
>
> All string/memory tests pass.
> ---
> .../x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} | 9 ++++-----
> sysdeps/x86_64/strpbrk.S | 3 ---
> 2 files changed, 4 insertions(+), 8 deletions(-)
> rename sysdeps/x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} (84%)
> delete mode 100644 sysdeps/x86_64/strpbrk.S
>
> diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
> similarity index 84%
> rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S
> rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c
> index d537b6c27b..d03214c4fb 100644
> --- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
> @@ -1,4 +1,4 @@
> -/* strpbrk optimized with SSE2.
> +/* strpbrk.
> Copyright (C) 2017-2022 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> @@ -19,11 +19,10 @@
> #if IS_IN (libc)
>
> # include <sysdep.h>
> -# define strcspn __strpbrk_sse2
> +# define STRPBRK __strpbrk_sse2
>
> # undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strpbrk)
> +# define libc_hidden_builtin_def(STRPBRK)
> #endif
>
> -#define USE_AS_STRPBRK
> -#include <sysdeps/x86_64/strcspn.S>
> +#include <string/strpbrk.c>
> diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S
> deleted file mode 100644
> index 21888a5b92..0000000000
> --- a/sysdeps/x86_64/strpbrk.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define strcspn strpbrk
> -#define USE_AS_STRPBRK
> -#include <sysdeps/x86_64/strcspn.S>
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 11/23] x86: Remove strspn-sse2.S and use the generic implementation
2022-03-23 21:57 ` [PATCH v1 11/23] x86: Remove strspn-sse2.S " Noah Goldstein
@ 2022-03-24 18:57 ` H.J. Lu
2022-05-12 19:42 ` Sunil Pandey
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:57 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The generic implementation is faster.
>
> geometric_mean(N=20) of all benchmarks New / Original: .710
>
> All string/memory tests pass.
> ---
> Geometric Mean N=20 runs; All functions page aligned
> len, align1, align2, pos, New Time / Old Time
> 1, 0, 0, 512, 0.824
> 1, 1, 0, 512, 1.018
> 1, 0, 1, 512, 0.986
> 1, 1, 1, 512, 1.092
> 2, 0, 0, 512, 0.86
> 2, 2, 0, 512, 0.868
> 2, 0, 2, 512, 0.858
> 2, 2, 2, 512, 0.857
> 3, 0, 0, 512, 0.836
> 3, 3, 0, 512, 0.849
> 3, 0, 3, 512, 0.84
> 3, 3, 3, 512, 0.85
> 4, 0, 0, 512, 0.843
> 4, 4, 0, 512, 0.837
> 4, 0, 4, 512, 0.835
> 4, 4, 4, 512, 0.846
> 5, 0, 0, 512, 0.852
> 5, 5, 0, 512, 0.848
> 5, 0, 5, 512, 0.85
> 5, 5, 5, 512, 0.85
> 6, 0, 0, 512, 0.853
> 6, 6, 0, 512, 0.855
> 6, 0, 6, 512, 0.853
> 6, 6, 6, 512, 0.853
> 7, 0, 0, 512, 0.857
> 7, 7, 0, 512, 0.861
> 7, 0, 7, 512, 0.94
> 7, 7, 7, 512, 0.856
> 8, 0, 0, 512, 0.927
> 8, 0, 8, 512, 0.965
> 9, 0, 0, 512, 0.967
> 9, 1, 0, 512, 0.976
> 9, 0, 9, 512, 0.887
> 9, 1, 9, 512, 0.881
> 10, 0, 0, 512, 0.853
> 10, 2, 0, 512, 0.846
> 10, 0, 10, 512, 0.855
> 10, 2, 10, 512, 0.849
> 11, 0, 0, 512, 0.854
> 11, 3, 0, 512, 0.855
> 11, 0, 11, 512, 0.85
> 11, 3, 11, 512, 0.854
> 12, 0, 0, 512, 0.864
> 12, 4, 0, 512, 0.864
> 12, 0, 12, 512, 0.867
> 12, 4, 12, 512, 0.87
> 13, 0, 0, 512, 0.853
> 13, 5, 0, 512, 0.841
> 13, 0, 13, 512, 0.837
> 13, 5, 13, 512, 0.85
> 14, 0, 0, 512, 0.838
> 14, 6, 0, 512, 0.842
> 14, 0, 14, 512, 0.818
> 14, 6, 14, 512, 0.845
> 15, 0, 0, 512, 0.799
> 15, 7, 0, 512, 0.847
> 15, 0, 15, 512, 0.787
> 15, 7, 15, 512, 0.84
> 16, 0, 0, 512, 0.824
> 16, 0, 16, 512, 0.827
> 17, 0, 0, 512, 0.817
> 17, 1, 0, 512, 0.823
> 17, 0, 17, 512, 0.82
> 17, 1, 17, 512, 0.814
> 18, 0, 0, 512, 0.81
> 18, 2, 0, 512, 0.833
> 18, 0, 18, 512, 0.811
> 18, 2, 18, 512, 0.842
> 19, 0, 0, 512, 0.823
> 19, 3, 0, 512, 0.818
> 19, 0, 19, 512, 0.821
> 19, 3, 19, 512, 0.824
> 20, 0, 0, 512, 0.814
> 20, 4, 0, 512, 0.818
> 20, 0, 20, 512, 0.806
> 20, 4, 20, 512, 0.802
> 21, 0, 0, 512, 0.835
> 21, 5, 0, 512, 0.839
> 21, 0, 21, 512, 0.842
> 21, 5, 21, 512, 0.82
> 22, 0, 0, 512, 0.824
> 22, 6, 0, 512, 0.831
> 22, 0, 22, 512, 0.819
> 22, 6, 22, 512, 0.824
> 23, 0, 0, 512, 0.816
> 23, 7, 0, 512, 0.856
> 23, 0, 23, 512, 0.808
> 23, 7, 23, 512, 0.848
> 24, 0, 0, 512, 0.88
> 24, 0, 24, 512, 0.846
> 25, 0, 0, 512, 0.929
> 25, 1, 0, 512, 0.917
> 25, 0, 25, 512, 0.884
> 25, 1, 25, 512, 0.859
> 26, 0, 0, 512, 0.919
> 26, 2, 0, 512, 0.867
> 26, 0, 26, 512, 0.914
> 26, 2, 26, 512, 0.845
> 27, 0, 0, 512, 0.919
> 27, 3, 0, 512, 0.864
> 27, 0, 27, 512, 0.917
> 27, 3, 27, 512, 0.847
> 28, 0, 0, 512, 0.905
> 28, 4, 0, 512, 0.896
> 28, 0, 28, 512, 0.898
> 28, 4, 28, 512, 0.871
> 29, 0, 0, 512, 0.911
> 29, 5, 0, 512, 0.91
> 29, 0, 29, 512, 0.905
> 29, 5, 29, 512, 0.884
> 30, 0, 0, 512, 0.907
> 30, 6, 0, 512, 0.802
> 30, 0, 30, 512, 0.906
> 30, 6, 30, 512, 0.818
> 31, 0, 0, 512, 0.907
> 31, 7, 0, 512, 0.821
> 31, 0, 31, 512, 0.89
> 31, 7, 31, 512, 0.787
> 4, 0, 0, 32, 0.623
> 4, 1, 0, 32, 0.606
> 4, 0, 1, 32, 0.6
> 4, 1, 1, 32, 0.603
> 4, 0, 0, 64, 0.731
> 4, 2, 0, 64, 0.733
> 4, 0, 2, 64, 0.734
> 4, 2, 2, 64, 0.755
> 4, 0, 0, 128, 0.822
> 4, 3, 0, 128, 0.873
> 4, 0, 3, 128, 0.89
> 4, 3, 3, 128, 0.907
> 4, 0, 0, 256, 0.827
> 4, 4, 0, 256, 0.811
> 4, 0, 4, 256, 0.794
> 4, 4, 4, 256, 0.814
> 4, 5, 0, 512, 0.841
> 4, 0, 5, 512, 0.831
> 4, 5, 5, 512, 0.845
> 4, 0, 0, 1024, 0.861
> 4, 6, 0, 1024, 0.857
> 4, 0, 6, 1024, 0.9
> 4, 6, 6, 1024, 0.861
> 4, 0, 0, 2048, 0.879
> 4, 7, 0, 2048, 0.875
> 4, 0, 7, 2048, 0.883
> 4, 7, 7, 2048, 0.88
> 10, 1, 0, 64, 0.747
> 10, 1, 1, 64, 0.743
> 10, 2, 0, 64, 0.732
> 10, 2, 2, 64, 0.729
> 10, 3, 0, 64, 0.747
> 10, 3, 3, 64, 0.733
> 10, 4, 0, 64, 0.74
> 10, 4, 4, 64, 0.751
> 10, 5, 0, 64, 0.735
> 10, 5, 5, 64, 0.746
> 10, 6, 0, 64, 0.735
> 10, 6, 6, 64, 0.733
> 10, 7, 0, 64, 0.734
> 10, 7, 7, 64, 0.74
> 6, 0, 0, 0, 0.377
> 6, 0, 0, 1, 0.369
> 6, 0, 1, 1, 0.383
> 6, 0, 0, 2, 0.391
> 6, 0, 2, 2, 0.394
> 6, 0, 0, 3, 0.416
> 6, 0, 3, 3, 0.411
> 6, 0, 0, 4, 0.475
> 6, 0, 4, 4, 0.483
> 6, 0, 0, 5, 0.473
> 6, 0, 5, 5, 0.476
> 6, 0, 0, 6, 0.459
> 6, 0, 6, 6, 0.445
> 6, 0, 0, 7, 0.433
> 6, 0, 7, 7, 0.432
> 6, 0, 0, 8, 0.492
> 6, 0, 8, 8, 0.494
> 6, 0, 0, 9, 0.476
> 6, 0, 9, 9, 0.483
> 6, 0, 0, 10, 0.46
> 6, 0, 10, 10, 0.476
> 6, 0, 0, 11, 0.463
> 6, 0, 11, 11, 0.463
> 6, 0, 0, 12, 0.511
> 6, 0, 12, 12, 0.515
> 6, 0, 0, 13, 0.506
> 6, 0, 13, 13, 0.536
> 6, 0, 0, 14, 0.496
> 6, 0, 14, 14, 0.484
> 6, 0, 0, 15, 0.473
> 6, 0, 15, 15, 0.475
> 6, 0, 0, 16, 0.534
> 6, 0, 16, 16, 0.534
> 6, 0, 0, 17, 0.525
> 6, 0, 17, 17, 0.523
> 6, 0, 0, 18, 0.522
> 6, 0, 18, 18, 0.524
> 6, 0, 0, 19, 0.512
> 6, 0, 19, 19, 0.514
> 6, 0, 0, 20, 0.535
> 6, 0, 20, 20, 0.54
> 6, 0, 0, 21, 0.543
> 6, 0, 21, 21, 0.536
> 6, 0, 0, 22, 0.542
> 6, 0, 22, 22, 0.542
> 6, 0, 0, 23, 0.529
> 6, 0, 23, 23, 0.53
> 6, 0, 0, 24, 0.596
> 6, 0, 24, 24, 0.589
> 6, 0, 0, 25, 0.583
> 6, 0, 25, 25, 0.58
> 6, 0, 0, 26, 0.574
> 6, 0, 26, 26, 0.58
> 6, 0, 0, 27, 0.575
> 6, 0, 27, 27, 0.558
> 6, 0, 0, 28, 0.606
> 6, 0, 28, 28, 0.606
> 6, 0, 0, 29, 0.589
> 6, 0, 29, 29, 0.595
> 6, 0, 0, 30, 0.592
> 6, 0, 30, 30, 0.585
> 6, 0, 0, 31, 0.585
> 6, 0, 31, 31, 0.579
> 6, 0, 0, 32, 0.625
> 6, 0, 32, 32, 0.615
> 6, 0, 0, 33, 0.615
> 6, 0, 33, 33, 0.61
> 6, 0, 0, 34, 0.604
> 6, 0, 34, 34, 0.6
> 6, 0, 0, 35, 0.602
> 6, 0, 35, 35, 0.608
> 6, 0, 0, 36, 0.644
> 6, 0, 36, 36, 0.644
> 6, 0, 0, 37, 0.658
> 6, 0, 37, 37, 0.651
> 6, 0, 0, 38, 0.644
> 6, 0, 38, 38, 0.649
> 6, 0, 0, 39, 0.626
> 6, 0, 39, 39, 0.632
> 6, 0, 0, 40, 0.662
> 6, 0, 40, 40, 0.661
> 6, 0, 0, 41, 0.656
> 6, 0, 41, 41, 0.655
> 6, 0, 0, 42, 0.643
> 6, 0, 42, 42, 0.637
> 6, 0, 0, 43, 0.622
> 6, 0, 43, 43, 0.628
> 6, 0, 0, 44, 0.673
> 6, 0, 44, 44, 0.687
> 6, 0, 0, 45, 0.661
> 6, 0, 45, 45, 0.659
> 6, 0, 0, 46, 0.657
> 6, 0, 46, 46, 0.653
> 6, 0, 0, 47, 0.658
> 6, 0, 47, 47, 0.65
> 6, 0, 0, 48, 0.678
> 6, 0, 48, 48, 0.683
> 6, 0, 0, 49, 0.676
> 6, 0, 49, 49, 0.661
> 6, 0, 0, 50, 0.672
> 6, 0, 50, 50, 0.662
> 6, 0, 0, 51, 0.656
> 6, 0, 51, 51, 0.659
> 6, 0, 0, 52, 0.682
> 6, 0, 52, 52, 0.686
> 6, 0, 0, 53, 0.67
> 6, 0, 53, 53, 0.674
> 6, 0, 0, 54, 0.663
> 6, 0, 54, 54, 0.675
> 6, 0, 0, 55, 0.662
> 6, 0, 55, 55, 0.665
> 6, 0, 0, 56, 0.681
> 6, 0, 56, 56, 0.697
> 6, 0, 0, 57, 0.686
> 6, 0, 57, 57, 0.687
> 6, 0, 0, 58, 0.701
> 6, 0, 58, 58, 0.693
> 6, 0, 0, 59, 0.709
> 6, 0, 59, 59, 0.698
> 6, 0, 0, 60, 0.708
> 6, 0, 60, 60, 0.708
> 6, 0, 0, 61, 0.709
> 6, 0, 61, 61, 0.716
> 6, 0, 0, 62, 0.709
> 6, 0, 62, 62, 0.707
> 6, 0, 0, 63, 0.703
> 6, 0, 63, 63, 0.716
>
> .../{strspn-sse2.S => strspn-sse2.c} | 8 +-
> sysdeps/x86_64/strspn.S | 112 ------------------
> 2 files changed, 4 insertions(+), 116 deletions(-)
> rename sysdeps/x86_64/multiarch/{strspn-sse2.S => strspn-sse2.c} (86%)
> delete mode 100644 sysdeps/x86_64/strspn.S
>
> diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c
> similarity index 86%
> rename from sysdeps/x86_64/multiarch/strspn-sse2.S
> rename to sysdeps/x86_64/multiarch/strspn-sse2.c
> index e0a095f25a..61cc6cb0a5 100644
> --- a/sysdeps/x86_64/multiarch/strspn-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strspn-sse2.c
> @@ -1,4 +1,4 @@
> -/* strspn optimized with SSE2.
> +/* strspn.
> Copyright (C) 2017-2022 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> @@ -19,10 +19,10 @@
> #if IS_IN (libc)
>
> # include <sysdep.h>
> -# define strspn __strspn_sse2
> +# define STRSPN __strspn_sse2
>
> # undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strspn)
> +# define libc_hidden_builtin_def(STRSPN)
> #endif
>
> -#include <sysdeps/x86_64/strspn.S>
> +#include <string/strspn.c>
> diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S
> deleted file mode 100644
> index 61b76ee0a1..0000000000
> --- a/sysdeps/x86_64/strspn.S
> +++ /dev/null
> @@ -1,112 +0,0 @@
> -/* strspn (str, ss) -- Return the length of the initial segment of STR
> - which contains only characters from SS.
> - For AMD x86-64.
> - Copyright (C) 1994-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -
> - .text
> -ENTRY (strspn)
> -
> - movq %rdi, %rdx /* Save SRC. */
> -
> - /* First we create a table with flags for all possible characters.
> - For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
> - supported by the C string functions we have 256 characters.
> - Before inserting marks for the stop characters we clear the whole
> - table. */
> - movq %rdi, %r8 /* Save value. */
> - subq $256, %rsp /* Make space for 256 bytes. */
> - cfi_adjust_cfa_offset(256)
> - movl $32, %ecx /* 32*8 bytes = 256 bytes. */
> - movq %rsp, %rdi
> - xorl %eax, %eax /* We store 0s. */
> - cld
> - rep
> - stosq
> -
> - movq %rsi, %rax /* Setup stopset. */
> -
> -/* For understanding the following code remember that %rcx == 0 now.
> - Although all the following instruction only modify %cl we always
> - have a correct zero-extended 64-bit value in %rcx. */
> -
> - .p2align 4
> -L(2): movb (%rax), %cl /* get byte from stopset */
> - testb %cl, %cl /* is NUL char? */
> - jz L(1) /* yes => start compare loop */
> - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
> -
> - movb 1(%rax), %cl /* get byte from stopset */
> - testb $0xff, %cl /* is NUL char? */
> - jz L(1) /* yes => start compare loop */
> - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
> -
> - movb 2(%rax), %cl /* get byte from stopset */
> - testb $0xff, %cl /* is NUL char? */
> - jz L(1) /* yes => start compare loop */
> - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
> -
> - movb 3(%rax), %cl /* get byte from stopset */
> - addq $4, %rax /* increment stopset pointer */
> - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
> - testb $0xff, %cl /* is NUL char? */
> - jnz L(2) /* no => process next dword from stopset */
> -
> -L(1): leaq -4(%rdx), %rax /* prepare loop */
> -
> - /* We use a neat trick for the following loop. Normally we would
> - have to test for two termination conditions
> - 1. a character in the stopset was found
> - and
> - 2. the end of the string was found
> - But as a sign that the character is in the stopset we store its
> - value in the table. But the value of NUL is NUL so the loop
> - terminates for NUL in every case. */
> -
> - .p2align 4
> -L(3): addq $4, %rax /* adjust pointer for full loop round */
> -
> - movb (%rax), %cl /* get byte from string */
> - testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> - jz L(4) /* no => return */
> -
> - movb 1(%rax), %cl /* get byte from string */
> - testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> - jz L(5) /* no => return */
> -
> - movb 2(%rax), %cl /* get byte from string */
> - testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> - jz L(6) /* no => return */
> -
> - movb 3(%rax), %cl /* get byte from string */
> - testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> - jnz L(3) /* yes => start loop again */
> -
> - incq %rax /* adjust pointer */
> -L(6): incq %rax
> -L(5): incq %rax
> -
> -L(4): addq $256, %rsp /* remove stopset */
> - cfi_adjust_cfa_offset(-256)
> - subq %rdx, %rax /* we have to return the number of valid
> - characters, so compute distance to first
> - non-valid character */
> - ret
> -END (strspn)
> -libc_hidden_builtin_def (strspn)
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
2022-03-23 21:57 ` [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] Noah Goldstein
@ 2022-03-24 18:59 ` H.J. Lu
2022-03-24 19:18 ` Noah Goldstein
2022-03-24 20:50 ` [PATCH v2 12/31] " Noah Goldstein
1 sibling, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 18:59 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
> __wcscmp_avx2.
>
> All string/memory tests pass.
> ---
> sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> index 52ff5ad724..86a86b68e3 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> @@ -122,7 +122,7 @@ ENTRY(STRCMP)
> are cases where length is large enough that it can never be a
> bound on valid memory so just use wcscmp. */
> shrq $56, %rcx
> - jnz __wcscmp_avx2
> + jnz OVERFLOW_STRCMP
>
> leaq (, %rdx, 4), %rdx
> # endif
> --
> 2.25.1
>
Isn't it a bug? Is there a glibc bug? Should this also be fixed on release
branches?
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c
2022-03-23 21:57 ` [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c Noah Goldstein
@ 2022-03-24 19:00 ` H.J. Lu
0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:00 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Just QOL change to make parsing the output of the benchtests more
> consistent.
> ---
> benchtests/bench-strcasecmp.c | 77 +++++++++++++++++++++++------------
> 1 file changed, 51 insertions(+), 26 deletions(-)
>
> diff --git a/benchtests/bench-strcasecmp.c b/benchtests/bench-strcasecmp.c
> index daccf1d245..855f2db2ad 100644
> --- a/benchtests/bench-strcasecmp.c
> +++ b/benchtests/bench-strcasecmp.c
> @@ -20,6 +20,7 @@
> #define TEST_MAIN
> #define TEST_NAME "strcasecmp"
> #include "bench-string.h"
> +#include "json-lib.h"
>
> typedef int (*proto_t) (const char *, const char *);
> static int simple_strcasecmp (const char *, const char *);
> @@ -40,7 +41,8 @@ simple_strcasecmp (const char *s1, const char *s2)
> }
>
> static void
> -do_one_test (impl_t *impl, const char *s1, const char *s2, int exp_result)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const char *s1,
> + const char *s2, int exp_result)
> {
> size_t i, iters = INNER_LOOP_ITERS;
> timing_t start, stop, cur;
> @@ -64,12 +66,12 @@ do_one_test (impl_t *impl, const char *s1, const char *s2, int exp_result)
>
> TIMING_DIFF (cur, start, stop);
>
> - TIMING_PRINT_MEAN ((double) cur, (double) iters);
> + json_element_double (json_ctx, (double) cur / (double) iters);
> }
>
> static void
> -do_test (size_t align1, size_t align2, size_t len, int max_char,
> - int exp_result)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
> + int max_char, int exp_result)
> {
> size_t i;
> char *s1, *s2;
> @@ -85,6 +87,13 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
> if (align2 + len + 1 >= page_size)
> return;
>
> + json_element_object_begin (json_ctx);
> + json_attr_uint (json_ctx, "length", len);
> + json_attr_uint (json_ctx, "align1", align1);
> + json_attr_uint (json_ctx, "align2", align2);
> + json_attr_uint (json_ctx, "max_char", max_char);
> + json_array_begin (json_ctx, "timings");
> +
> s1 = (char *) (buf1 + align1);
> s2 = (char *) (buf2 + align2);
>
> @@ -103,53 +112,69 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
> else
> s2[len - 1] -= exp_result;
>
> - printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
> -
> FOR_EACH_IMPL (impl, 0)
> - do_one_test (impl, s1, s2, exp_result);
> + do_one_test (json_ctx, impl, s1, s2, exp_result);
>
> - putchar ('\n');
> + json_array_end (json_ctx);
> + json_element_object_end (json_ctx);
> }
>
> int
> test_main (void)
> {
> + json_ctx_t json_ctx;
> size_t i;
>
> test_init ();
>
> - printf ("%23s", "");
> + json_init (&json_ctx, 0, stdout);
> +
> + json_document_begin (&json_ctx);
> + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> + json_attr_object_begin (&json_ctx, "functions");
> + json_attr_object_begin (&json_ctx, TEST_NAME);
> + json_attr_string (&json_ctx, "bench-variant", "");
> +
> + json_array_begin (&json_ctx, "ifuncs");
> FOR_EACH_IMPL (impl, 0)
> - printf ("\t%s", impl->name);
> - putchar ('\n');
> + json_element_string (&json_ctx, impl->name);
> + json_array_end (&json_ctx);
> +
> + json_array_begin (&json_ctx, "results");
>
> for (i = 1; i < 16; ++i)
> {
> - do_test (i, i, i, 127, 0);
> - do_test (i, i, i, 127, 1);
> - do_test (i, i, i, 127, -1);
> + do_test (&json_ctx, i, i, i, 127, 0);
> + do_test (&json_ctx, i, i, i, 127, 1);
> + do_test (&json_ctx, i, i, i, 127, -1);
> }
>
> for (i = 1; i < 10; ++i)
> {
> - do_test (0, 0, 2 << i, 127, 0);
> - do_test (0, 0, 2 << i, 254, 0);
> - do_test (0, 0, 2 << i, 127, 1);
> - do_test (0, 0, 2 << i, 254, 1);
> - do_test (0, 0, 2 << i, 127, -1);
> - do_test (0, 0, 2 << i, 254, -1);
> + do_test (&json_ctx, 0, 0, 2 << i, 127, 0);
> + do_test (&json_ctx, 0, 0, 2 << i, 254, 0);
> + do_test (&json_ctx, 0, 0, 2 << i, 127, 1);
> + do_test (&json_ctx, 0, 0, 2 << i, 254, 1);
> + do_test (&json_ctx, 0, 0, 2 << i, 127, -1);
> + do_test (&json_ctx, 0, 0, 2 << i, 254, -1);
> }
>
> for (i = 1; i < 8; ++i)
> {
> - do_test (i, 2 * i, 8 << i, 127, 0);
> - do_test (2 * i, i, 8 << i, 254, 0);
> - do_test (i, 2 * i, 8 << i, 127, 1);
> - do_test (2 * i, i, 8 << i, 254, 1);
> - do_test (i, 2 * i, 8 << i, 127, -1);
> - do_test (2 * i, i, 8 << i, 254, -1);
> + do_test (&json_ctx, i, 2 * i, 8 << i, 127, 0);
> + do_test (&json_ctx, 2 * i, i, 8 << i, 254, 0);
> + do_test (&json_ctx, i, 2 * i, 8 << i, 127, 1);
> + do_test (&json_ctx, 2 * i, i, 8 << i, 254, 1);
> + do_test (&json_ctx, i, 2 * i, 8 << i, 127, -1);
> + do_test (&json_ctx, 2 * i, i, 8 << i, 254, -1);
> }
>
> + json_array_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_document_end (&json_ctx);
> +
> return ret;
> }
>
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c
2022-03-23 21:57 ` [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c Noah Goldstein
@ 2022-03-24 19:00 ` H.J. Lu
0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:00 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Just QOL change to make parsing the output of the benchtests more
> consistent.
> ---
> benchtests/bench-strncasecmp.c | 113 ++++++++++++++++++++-------------
> 1 file changed, 69 insertions(+), 44 deletions(-)
>
> diff --git a/benchtests/bench-strncasecmp.c b/benchtests/bench-strncasecmp.c
> index a9819efc73..91f49cc8d3 100644
> --- a/benchtests/bench-strncasecmp.c
> +++ b/benchtests/bench-strncasecmp.c
> @@ -20,6 +20,7 @@
> #define TEST_MAIN
> #define TEST_NAME "strncasecmp"
> #include "bench-string.h"
> +#include "json-lib.h"
>
> typedef int (*proto_t) (const char *, const char *, size_t);
> static int simple_strncasecmp (const char *, const char *, size_t);
> @@ -47,8 +48,8 @@ simple_strncasecmp (const char *s1, const char *s2, size_t n)
> }
>
> static void
> -do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n,
> - int exp_result)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const char *s1,
> + const char *s2, size_t n, int exp_result)
> {
> size_t i, iters = INNER_LOOP_ITERS;
> timing_t start, stop, cur;
> @@ -62,12 +63,12 @@ do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n,
>
> TIMING_DIFF (cur, start, stop);
>
> - TIMING_PRINT_MEAN ((double) cur, (double) iters);
> + json_element_double (json_ctx, (double) cur / (double) iters);
> }
>
> static void
> -do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
> - int exp_result)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t n,
> + size_t len, int max_char, int exp_result)
> {
> size_t i;
> char *s1, *s2;
> @@ -101,83 +102,107 @@ do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
> else
> s2[len - 1] -= exp_result;
>
> - printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
> + json_element_object_begin (json_ctx);
> + json_attr_uint (json_ctx, "length", len);
> + json_attr_uint (json_ctx, "n", n);
> + json_attr_uint (json_ctx, "align1", align1);
> + json_attr_uint (json_ctx, "align2", align2);
> + json_attr_uint (json_ctx, "max_char", max_char);
> + json_array_begin (json_ctx, "timings");
>
> FOR_EACH_IMPL (impl, 0)
> - do_one_test (impl, s1, s2, n, exp_result);
> + do_one_test (json_ctx, impl, s1, s2, n, exp_result);
>
> - putchar ('\n');
> + json_array_end (json_ctx);
> + json_element_object_end (json_ctx);
> }
>
> int
> test_main (void)
> {
> + json_ctx_t json_ctx;
> size_t i;
>
> test_init ();
>
> - printf ("%23s", "");
> + json_init (&json_ctx, 0, stdout);
> +
> + json_document_begin (&json_ctx);
> + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> + json_attr_object_begin (&json_ctx, "functions");
> + json_attr_object_begin (&json_ctx, TEST_NAME);
> + json_attr_string (&json_ctx, "bench-variant", "");
> +
> + json_array_begin (&json_ctx, "ifuncs");
> FOR_EACH_IMPL (impl, 0)
> - printf ("\t%s", impl->name);
> - putchar ('\n');
> + json_element_string (&json_ctx, impl->name);
> + json_array_end (&json_ctx);
> +
> + json_array_begin (&json_ctx, "results");
>
> for (i = 1; i < 16; ++i)
> {
> - do_test (i, i, i - 1, i, 127, 0);
> + do_test (&json_ctx, i, i, i - 1, i, 127, 0);
>
> - do_test (i, i, i, i, 127, 0);
> - do_test (i, i, i, i, 127, 1);
> - do_test (i, i, i, i, 127, -1);
> + do_test (&json_ctx, i, i, i, i, 127, 0);
> + do_test (&json_ctx, i, i, i, i, 127, 1);
> + do_test (&json_ctx, i, i, i, i, 127, -1);
>
> - do_test (i, i, i + 1, i, 127, 0);
> - do_test (i, i, i + 1, i, 127, 1);
> - do_test (i, i, i + 1, i, 127, -1);
> + do_test (&json_ctx, i, i, i + 1, i, 127, 0);
> + do_test (&json_ctx, i, i, i + 1, i, 127, 1);
> + do_test (&json_ctx, i, i, i + 1, i, 127, -1);
> }
>
> for (i = 1; i < 10; ++i)
> {
> - do_test (0, 0, (2 << i) - 1, 2 << i, 127, 0);
> - do_test (0, 0, 2 << i, 2 << i, 254, 0);
> - do_test (0, 0, (2 << i) + 1, 2 << i, 127, 0);
> + do_test (&json_ctx, 0, 0, (2 << i) - 1, 2 << i, 127, 0);
> + do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 254, 0);
> + do_test (&json_ctx, 0, 0, (2 << i) + 1, 2 << i, 127, 0);
>
> - do_test (0, 0, (2 << i) + 1, 2 << i, 254, 0);
> + do_test (&json_ctx, 0, 0, (2 << i) + 1, 2 << i, 254, 0);
>
> - do_test (0, 0, 2 << i, 2 << i, 127, 1);
> - do_test (0, 0, (2 << i) + 10, 2 << i, 127, 1);
> + do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 127, 1);
> + do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 127, 1);
>
> - do_test (0, 0, 2 << i, 2 << i, 254, 1);
> - do_test (0, 0, (2 << i) + 10, 2 << i, 254, 1);
> + do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 254, 1);
> + do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 254, 1);
>
> - do_test (0, 0, 2 << i, 2 << i, 127, -1);
> - do_test (0, 0, (2 << i) + 10, 2 << i, 127, -1);
> + do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 127, -1);
> + do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 127, -1);
>
> - do_test (0, 0, 2 << i, 2 << i, 254, -1);
> - do_test (0, 0, (2 << i) + 10, 2 << i, 254, -1);
> + do_test (&json_ctx, 0, 0, 2 << i, 2 << i, 254, -1);
> + do_test (&json_ctx, 0, 0, (2 << i) + 10, 2 << i, 254, -1);
> }
>
> for (i = 1; i < 8; ++i)
> {
> - do_test (i, 2 * i, (8 << i) - 1, 8 << i, 127, 0);
> - do_test (i, 2 * i, 8 << i, 8 << i, 127, 0);
> - do_test (i, 2 * i, (8 << i) + 100, 8 << i, 127, 0);
> + do_test (&json_ctx, i, 2 * i, (8 << i) - 1, 8 << i, 127, 0);
> + do_test (&json_ctx, i, 2 * i, 8 << i, 8 << i, 127, 0);
> + do_test (&json_ctx, i, 2 * i, (8 << i) + 100, 8 << i, 127, 0);
>
> - do_test (2 * i, i, (8 << i) - 1, 8 << i, 254, 0);
> - do_test (2 * i, i, 8 << i, 8 << i, 254, 0);
> - do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, 0);
> + do_test (&json_ctx, 2 * i, i, (8 << i) - 1, 8 << i, 254, 0);
> + do_test (&json_ctx, 2 * i, i, 8 << i, 8 << i, 254, 0);
> + do_test (&json_ctx, 2 * i, i, (8 << i) + 100, 8 << i, 254, 0);
>
> - do_test (i, 2 * i, 8 << i, 8 << i, 127, 1);
> - do_test (i, 2 * i, (8 << i) + 100, 8 << i, 127, 1);
> + do_test (&json_ctx, i, 2 * i, 8 << i, 8 << i, 127, 1);
> + do_test (&json_ctx, i, 2 * i, (8 << i) + 100, 8 << i, 127, 1);
>
> - do_test (2 * i, i, 8 << i, 8 << i, 254, 1);
> - do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, 1);
> + do_test (&json_ctx, 2 * i, i, 8 << i, 8 << i, 254, 1);
> + do_test (&json_ctx, 2 * i, i, (8 << i) + 100, 8 << i, 254, 1);
>
> - do_test (i, 2 * i, 8 << i, 8 << i, 127, -1);
> - do_test (i, 2 * i, (8 << i) + 100, 8 << i, 127, -1);
> + do_test (&json_ctx, i, 2 * i, 8 << i, 8 << i, 127, -1);
> + do_test (&json_ctx, i, 2 * i, (8 << i) + 100, 8 << i, 127, -1);
>
> - do_test (2 * i, i, 8 << i, 8 << i, 254, -1);
> - do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, -1);
> + do_test (&json_ctx, 2 * i, i, 8 << i, 8 << i, 254, -1);
> + do_test (&json_ctx, 2 * i, i, (8 << i) + 100, 8 << i, 254, -1);
> }
>
> + json_array_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_document_end (&json_ctx);
> +
> return ret;
> }
>
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c
2022-03-23 21:57 ` [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c Noah Goldstein
@ 2022-03-24 19:01 ` H.J. Lu
0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:01 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Add more robust tests that cover all the page cross edge cases.
> ---
> string/test-strcasecmp.c | 112 ++++++++++++++++++++++++++++++++++-----
> 1 file changed, 100 insertions(+), 12 deletions(-)
>
> diff --git a/string/test-strcasecmp.c b/string/test-strcasecmp.c
> index 3d994f9d64..438a9713ac 100644
> --- a/string/test-strcasecmp.c
> +++ b/string/test-strcasecmp.c
> @@ -18,6 +18,10 @@
>
> #include <locale.h>
> #include <ctype.h>
> +#include <assert.h>
> +#define TEST_LEN (getpagesize () * 3)
> +#define MIN_PAGE_SIZE (TEST_LEN + 2 * getpagesize ())
> +
> #define TEST_MAIN
> #define TEST_NAME "strcasecmp"
> #include "test-string.h"
> @@ -85,12 +89,13 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
> if (len == 0)
> return;
>
> - align1 &= 7;
> - if (align1 + len + 1 >= page_size)
> +
> + align1 &= getpagesize () - 1;
> + if (align1 + (len + 1) >= page_size)
> return;
>
> - align2 &= 7;
> - if (align2 + len + 1 >= page_size)
> + align2 &= getpagesize () - 1;
> + if (align2 + (len + 1) >= page_size)
> return;
>
> s1 = (char *) (buf1 + align1);
> @@ -105,12 +110,33 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
> s1[len] = s2[len] = 0;
> s1[len + 1] = 23;
> s2[len + 1] = 24 + exp_result;
> +
> if ((s2[len - 1] == 'z' && exp_result == -1)
> || (s2[len - 1] == 'a' && exp_result == 1))
> s1[len - 1] += exp_result;
> + else if ((s1[len - 1] == 'Z' + 1 && exp_result == 1)
> + || (s1[len - 1] == 'A' - 1 && exp_result == -1))
> + s1[len - 1] = tolower (s2[len - 1]) + exp_result;
> else
> s2[len - 1] -= exp_result;
>
> + /* For some locales this is not guaranteed yet. */
> + if (tolower (s1[len - 1]) - tolower (s2[len - 1]) != exp_result)
> + {
> + if (exp_result == -1)
> + {
> + s1[len - 1] = tolower ('a');
> + s2[len - 1] = toupper (tolower ('a') - 1);
> + }
> + else if (exp_result == 0)
> + s1[len - 1] = toupper (s2[len - 1]);
> + else
> + {
> + s1[len - 1] = tolower ('a');
> + s2[len - 1] = toupper (tolower ('a') + 1);
> + }
> + }
> +
> FOR_EACH_IMPL (impl, 0)
> do_one_test (impl, s1, s2, exp_result);
> }
> @@ -207,10 +233,10 @@ do_random_tests (void)
> }
>
> static void
> -test_locale (const char *locale)
> +test_locale (const char *locale, int extra_tests)
> {
> - size_t i;
> -
> + size_t i, j, k;
> + const size_t test_len = MIN(TEST_LEN, 3 * 4096);
> if (setlocale (LC_CTYPE, locale) == NULL)
> {
> error (0, 0, "cannot set locale \"%s\"", locale);
> @@ -249,6 +275,68 @@ test_locale (const char *locale)
> do_test (2 * i, i, 8 << i, 254, -1);
> }
>
> + for (j = 0; extra_tests && j < 160; ++j)
> + {
> + for (i = 0; i < test_len;)
> + {
> + do_test (getpagesize () - j - 1, 0, i, 127, 0);
> + do_test (getpagesize () - j - 1, 0, i, 127, 1);
> + do_test (getpagesize () - j - 1, 0, i, 127, -1);
> +
> + do_test (getpagesize () - j - 1, j, i, 127, 0);
> + do_test (getpagesize () - j - 1, j, i, 127, 1);
> + do_test (getpagesize () - j - 1, j, i, 127, -1);
> +
> + do_test (0, getpagesize () - j - 1, i, 127, 0);
> + do_test (0, getpagesize () - j - 1, i, 127, 1);
> + do_test (0, getpagesize () - j - 1, i, 127, -1);
> +
> + do_test (j, getpagesize () - j - 1, i, 127, 0);
> + do_test (j, getpagesize () - j - 1, i, 127, 1);
> + do_test (j, getpagesize () - j - 1, i, 127, -1);
> +
> + for (k = 2; k <= 128; k += k)
> + {
> + do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 0);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
> + 0);
> + do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 1);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
> + 1);
> + do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, -1);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
> + -1);
> + }
> +
> + if (i < 32)
> + {
> + i += 1;
> + }
> + else if (i < 161)
> + {
> + i += 7;
> + }
> + else if (i + 161 < test_len)
> + {
> + i += 31;
> + i *= 17;
> + i /= 16;
> + if (i + 161 > test_len)
> + {
> + i = test_len - 160;
> + }
> + }
> + else if (i + 32 < test_len)
> + {
> + i += 7;
> + }
> + else
> + {
> + i += 1;
> + }
> + }
> + }
> +
> do_random_tests ();
> }
>
> @@ -257,11 +345,11 @@ test_main (void)
> {
> test_init ();
>
> - test_locale ("C");
> - test_locale ("en_US.ISO-8859-1");
> - test_locale ("en_US.UTF-8");
> - test_locale ("tr_TR.ISO-8859-9");
> - test_locale ("tr_TR.UTF-8");
> + test_locale ("C", 1);
> + test_locale ("en_US.ISO-8859-1", 0);
> + test_locale ("en_US.UTF-8", 0);
> + test_locale ("tr_TR.ISO-8859-9", 0);
> + test_locale ("tr_TR.UTF-8", 0);
>
> return ret;
> }
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c
2022-03-23 21:57 ` [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c Noah Goldstein
@ 2022-03-24 19:01 ` H.J. Lu
0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:01 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Add more robust tests that cover all the page cross edge cases.
> ---
> string/test-strncasecmp.c | 166 +++++++++++++++++++++++++++++++++++---
> 1 file changed, 154 insertions(+), 12 deletions(-)
>
> diff --git a/string/test-strncasecmp.c b/string/test-strncasecmp.c
> index a3c848165a..b86c630bf6 100644
> --- a/string/test-strncasecmp.c
> +++ b/string/test-strncasecmp.c
> @@ -18,6 +18,10 @@
>
> #include <locale.h>
> #include <ctype.h>
> +
> +#define TEST_LEN (getpagesize () * 3)
> +#define MIN_PAGE_SIZE (TEST_LEN + 2 * getpagesize ())
> +
> #define TEST_MAIN
> #define TEST_NAME "strncasecmp"
> #include "test-string.h"
> @@ -106,14 +110,15 @@ do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
> if (len == 0)
> return;
>
> - align1 &= 7;
> - if (align1 + len + 1 >= page_size)
> + align1 &= getpagesize () - 1;
> + if (align1 + (len + 2) >= page_size)
> return;
>
> - align2 &= 7;
> - if (align2 + len + 1 >= page_size)
> + align2 &= getpagesize () - 1;
> + if (align2 + (len + 2) >= page_size)
> return;
>
> +
> s1 = (char *) (buf1 + align1);
> s2 = (char *) (buf2 + align2);
>
> @@ -126,12 +131,33 @@ do_test (size_t align1, size_t align2, size_t n, size_t len, int max_char,
> s1[len] = s2[len] = 0;
> s1[len + 1] = 23;
> s2[len + 1] = 24 + exp_result;
> +
> if ((s2[len - 1] == 'z' && exp_result == -1)
> || (s2[len - 1] == 'a' && exp_result == 1))
> s1[len - 1] += exp_result;
> + else if ((s1[len - 1] == 'Z' + 1 && exp_result == 1)
> + || (s1[len - 1] == 'A' - 1 && exp_result == -1))
> + s1[len - 1] = tolower (s2[len - 1]) + exp_result;
> else
> s2[len - 1] -= exp_result;
>
> + /* For some locales this is not guaranteed yet. */
> + if (tolower (s1[len - 1]) - tolower (s2[len - 1]) != exp_result)
> + {
> + if (exp_result == -1)
> + {
> + s1[len - 1] = tolower ('a');
> + s2[len - 1] = toupper (tolower ('a') - 1);
> + }
> + else if (exp_result == 0)
> + s1[len - 1] = toupper (s2[len - 1]);
> + else
> + {
> + s1[len - 1] = tolower ('a');
> + s2[len - 1] = toupper (tolower ('a') + 1);
> + }
> + }
> +
> FOR_EACH_IMPL (impl, 0)
> do_one_test (impl, s1, s2, n, exp_result);
> }
> @@ -299,10 +325,10 @@ bz14195 (void)
> }
>
> static void
> -test_locale (const char *locale)
> +test_locale (const char *locale, int extra_tests)
> {
> - size_t i;
> -
> + size_t i, j, k;
> + const size_t test_len = MIN(TEST_LEN, 3 * 4096);
> if (setlocale (LC_CTYPE, locale) == NULL)
> {
> error (0, 0, "cannot set locale \"%s\"", locale);
> @@ -374,6 +400,122 @@ test_locale (const char *locale)
> do_test (2 * i, i, (8 << i) + 100, 8 << i, 254, -1);
> }
>
> + for (j = 0; extra_tests && j < 160; ++j)
> + {
> + for (i = 0; i < test_len;)
> + {
> + do_test (getpagesize () - j - 1, 0, i + 1, i, 127, 0);
> + do_test (getpagesize () - j - 1, 0, i + 1, i, 127, 1);
> + do_test (getpagesize () - j - 1, 0, i + 1, i, 127, -1);
> +
> + do_test (getpagesize () - j - 1, 0, i, i, 127, 0);
> + do_test (getpagesize () - j - 1, 0, i - 1, i, 127, 0);
> +
> + do_test (getpagesize () - j - 1, 0, ULONG_MAX, i, 127, 0);
> + do_test (getpagesize () - j - 1, 0, ULONG_MAX, i, 127, 1);
> + do_test (getpagesize () - j - 1, 0, ULONG_MAX, i, 127, -1);
> +
> + do_test (getpagesize () - j - 1, 0, ULONG_MAX - i, i, 127, 0);
> + do_test (getpagesize () - j - 1, 0, ULONG_MAX - i, i, 127, 1);
> + do_test (getpagesize () - j - 1, 0, ULONG_MAX - i, i, 127, -1);
> +
> + do_test (getpagesize () - j - 1, j, i + 1, i, 127, 0);
> + do_test (getpagesize () - j - 1, j, i + 1, i, 127, 1);
> + do_test (getpagesize () - j - 1, j, i + 1, i, 127, -1);
> +
> + do_test (getpagesize () - j - 1, j, i, i, 127, 0);
> + do_test (getpagesize () - j - 1, j, i - 1, i, 127, 0);
> +
> + do_test (getpagesize () - j - 1, j, ULONG_MAX, i, 127, 0);
> + do_test (getpagesize () - j - 1, j, ULONG_MAX, i, 127, 1);
> + do_test (getpagesize () - j - 1, j, ULONG_MAX, i, 127, -1);
> +
> + do_test (getpagesize () - j - 1, j, ULONG_MAX - i, i, 127, 0);
> + do_test (getpagesize () - j - 1, j, ULONG_MAX - i, i, 127, 1);
> + do_test (getpagesize () - j - 1, j, ULONG_MAX - i, i, 127, -1);
> +
> + do_test (0, getpagesize () - j - 1, i + 1, i, 127, 0);
> + do_test (0, getpagesize () - j - 1, i + 1, i, 127, 1);
> + do_test (0, getpagesize () - j - 1, i + 1, i, 127, -1);
> +
> + do_test (0, getpagesize () - j - 1, i, i, 127, 0);
> + do_test (0, getpagesize () - j - 1, i - 1, i, 127, 0);
> +
> + do_test (0, getpagesize () - j - 1, ULONG_MAX, i, 127, 0);
> + do_test (0, getpagesize () - j - 1, ULONG_MAX, i, 127, 1);
> + do_test (0, getpagesize () - j - 1, ULONG_MAX, i, 127, -1);
> +
> + do_test (0, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 0);
> + do_test (0, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 1);
> + do_test (0, getpagesize () - j - 1, ULONG_MAX - i, i, 127, -1);
> +
> + do_test (j, getpagesize () - j - 1, i + 1, i, 127, 0);
> + do_test (j, getpagesize () - j - 1, i + 1, i, 127, 1);
> + do_test (j, getpagesize () - j - 1, i + 1, i, 127, -1);
> +
> + do_test (j, getpagesize () - j - 1, i, i, 127, 0);
> + do_test (j, getpagesize () - j - 1, i - 1, i, 127, 0);
> +
> + do_test (j, getpagesize () - j - 1, ULONG_MAX, i, 127, 0);
> + do_test (j, getpagesize () - j - 1, ULONG_MAX, i, 127, 1);
> + do_test (j, getpagesize () - j - 1, ULONG_MAX, i, 127, -1);
> +
> + do_test (j, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 0);
> + do_test (j, getpagesize () - j - 1, ULONG_MAX - i, i, 127, 1);
> + do_test (j, getpagesize () - j - 1, ULONG_MAX - i, i, 127, -1);
> +
> + for (k = 2; k <= 128; k += k)
> + {
> + do_test (getpagesize () - k, getpagesize () - j - 1, i - 1, i,
> + 127, 0);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i - 1,
> + i, 127, 0);
> + do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
> + 127, 0);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
> + i, 127, 0);
> + do_test (getpagesize () - k, getpagesize () - j - 1, i, i, 127,
> + 0);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, i,
> + 127, 0);
> + do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
> + 127, -1);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
> + i, 127, -1);
> + do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
> + 127, 1);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
> + i, 127, 1);
> + }
> + if (i < 32)
> + {
> + i += 1;
> + }
> + else if (i < 161)
> + {
> + i += 7;
> + }
> + else if (i + 161 < test_len)
> + {
> + i += 31;
> + i *= 17;
> + i /= 16;
> + if (i + 161 > test_len)
> + {
> + i = test_len - 160;
> + }
> + }
> + else if (i + 32 < test_len)
> + {
> + i += 7;
> + }
> + else
> + {
> + i += 1;
> + }
> + }
> + }
> +
> do_random_tests ();
> do_page_tests ();
> }
> @@ -383,11 +525,11 @@ test_main (void)
> {
> test_init ();
>
> - test_locale ("C");
> - test_locale ("en_US.ISO-8859-1");
> - test_locale ("en_US.UTF-8");
> - test_locale ("tr_TR.ISO-8859-9");
> - test_locale ("tr_TR.UTF-8");
> + test_locale ("C", 1);
> + test_locale ("en_US.ISO-8859-1", 0);
> + test_locale ("en_US.UTF-8", 0);
> + test_locale ("tr_TR.ISO-8859-9", 0);
> + test_locale ("tr_TR.UTF-8", 0);
>
> return ret;
> }
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S
2022-03-23 21:57 ` [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S Noah Goldstein
@ 2022-03-24 19:02 ` H.J. Lu
2022-05-12 19:44 ` Sunil Pandey
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:02 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Slightly faster method of doing TOLOWER that saves an
> instruction.
>
> Also replace the hard-coded 5-byte nop with .p2align 4. On builds with
> CET enabled this misaligned entry to strcasecmp.
>
> geometric_mean(N=40) of all benchmarks New / Original: .894
>
> All string/memory tests pass.
> ---
> Geometric Mean N=40 runs; All functions page aligned
> length, align1, align2, max_char, New Time / Old Time
> 1, 1, 1, 127, 0.903
> 2, 2, 2, 127, 0.905
> 3, 3, 3, 127, 0.877
> 4, 4, 4, 127, 0.888
> 5, 5, 5, 127, 0.901
> 6, 6, 6, 127, 0.954
> 7, 7, 7, 127, 0.932
> 8, 0, 0, 127, 0.918
> 9, 1, 1, 127, 0.914
> 10, 2, 2, 127, 0.877
> 11, 3, 3, 127, 0.909
> 12, 4, 4, 127, 0.876
> 13, 5, 5, 127, 0.886
> 14, 6, 6, 127, 0.914
> 15, 7, 7, 127, 0.939
> 4, 0, 0, 127, 0.963
> 4, 0, 0, 254, 0.943
> 8, 0, 0, 254, 0.927
> 16, 0, 0, 127, 0.876
> 16, 0, 0, 254, 0.865
> 32, 0, 0, 127, 0.865
> 32, 0, 0, 254, 0.862
> 64, 0, 0, 127, 0.863
> 64, 0, 0, 254, 0.896
> 128, 0, 0, 127, 0.885
> 128, 0, 0, 254, 0.882
> 256, 0, 0, 127, 0.87
> 256, 0, 0, 254, 0.869
> 512, 0, 0, 127, 0.832
> 512, 0, 0, 254, 0.848
> 1024, 0, 0, 127, 0.835
> 1024, 0, 0, 254, 0.843
> 16, 1, 2, 127, 0.914
> 16, 2, 1, 254, 0.949
> 32, 2, 4, 127, 0.955
> 32, 4, 2, 254, 1.004
> 64, 3, 6, 127, 0.844
> 64, 6, 3, 254, 0.905
> 128, 4, 0, 127, 0.889
> 128, 0, 4, 254, 0.845
> 256, 5, 2, 127, 0.929
> 256, 2, 5, 254, 0.907
> 512, 6, 4, 127, 0.837
> 512, 4, 6, 254, 0.862
> 1024, 7, 6, 127, 0.895
> 1024, 6, 7, 254, 0.89
>
> sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
> 1 file changed, 29 insertions(+), 35 deletions(-)
>
> diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
> index e2ab59c555..99d8b36f1d 100644
> --- a/sysdeps/x86_64/strcmp.S
> +++ b/sysdeps/x86_64/strcmp.S
> @@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp)
> movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> mov %fs:(%rax),%RDX_LP
>
> - // XXX 5 byte should be before the function
> - /* 5-byte NOP. */
> - .byte 0x0f,0x1f,0x44,0x00,0x00
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> END2 (__strcasecmp)
> # ifndef NO_NOLOCALE_ALIAS
> weak_alias (__strcasecmp, strcasecmp)
> @@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp)
> movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> mov %fs:(%rax),%RCX_LP
>
> - // XXX 5 byte should be before the function
> - /* 5-byte NOP. */
> - .byte 0x0f,0x1f,0x44,0x00,0x00
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> END2 (__strncasecmp)
> # ifndef NO_NOLOCALE_ALIAS
> weak_alias (__strncasecmp, strncasecmp)
> @@ -146,22 +144,22 @@ ENTRY (STRCMP)
> #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> .section .rodata.cst16,"aM",@progbits,16
> .align 16
> -.Lbelowupper:
> - .quad 0x4040404040404040
> - .quad 0x4040404040404040
> -.Ltopupper:
> - .quad 0x5b5b5b5b5b5b5b5b
> - .quad 0x5b5b5b5b5b5b5b5b
> -.Ltouppermask:
> +.Llcase_min:
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> +.Llcase_max:
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> +.Lcase_add:
> .quad 0x2020202020202020
> .quad 0x2020202020202020
> .previous
> - movdqa .Lbelowupper(%rip), %xmm5
> -# define UCLOW_reg %xmm5
> - movdqa .Ltopupper(%rip), %xmm6
> -# define UCHIGH_reg %xmm6
> - movdqa .Ltouppermask(%rip), %xmm7
> -# define LCQWORD_reg %xmm7
> + movdqa .Llcase_min(%rip), %xmm5
> +# define LCASE_MIN_reg %xmm5
> + movdqa .Llcase_max(%rip), %xmm6
> +# define LCASE_MAX_reg %xmm6
> + movdqa .Lcase_add(%rip), %xmm7
> +# define CASE_ADD_reg %xmm7
> #endif
> cmp $0x30, %ecx
> ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
> @@ -172,22 +170,18 @@ ENTRY (STRCMP)
> movhpd 8(%rdi), %xmm1
> movhpd 8(%rsi), %xmm2
> #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> -# define TOLOWER(reg1, reg2) \
> - movdqa reg1, %xmm8; \
> - movdqa UCHIGH_reg, %xmm9; \
> - movdqa reg2, %xmm10; \
> - movdqa UCHIGH_reg, %xmm11; \
> - pcmpgtb UCLOW_reg, %xmm8; \
> - pcmpgtb reg1, %xmm9; \
> - pcmpgtb UCLOW_reg, %xmm10; \
> - pcmpgtb reg2, %xmm11; \
> - pand %xmm9, %xmm8; \
> - pand %xmm11, %xmm10; \
> - pand LCQWORD_reg, %xmm8; \
> - pand LCQWORD_reg, %xmm10; \
> - por %xmm8, reg1; \
> - por %xmm10, reg2
> - TOLOWER (%xmm1, %xmm2)
> +# define TOLOWER(reg1, reg2) \
> + movdqa LCASE_MIN_reg, %xmm8; \
> + movdqa LCASE_MIN_reg, %xmm9; \
> + paddb reg1, %xmm8; \
> + paddb reg2, %xmm9; \
> + pcmpgtb LCASE_MAX_reg, %xmm8; \
> + pcmpgtb LCASE_MAX_reg, %xmm9; \
> + pandn CASE_ADD_reg, %xmm8; \
> + pandn CASE_ADD_reg, %xmm9; \
> + paddb %xmm8, reg1; \
> + paddb %xmm9, reg2
> + TOLOWER (%xmm1, %xmm2)
> #else
> # define TOLOWER(reg1, reg2)
> #endif
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
2022-03-23 21:57 ` [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S Noah Goldstein
@ 2022-03-24 19:02 ` H.J. Lu
2022-05-12 19:45 ` Sunil Pandey
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:02 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Slightly faster method of doing TOLOWER that saves an
> instruction.
>
> Also replace the hard-coded 5-byte nop with .p2align 4. On builds with
> CET enabled this misaligned entry to strcasecmp.
>
> geometric_mean(N=40) of all benchmarks New / Original: .920
>
> All string/memory tests pass.
> ---
> Geometric Mean N=40 runs; All functions page aligned
> length, align1, align2, max_char, New Time / Old Time
> 1, 1, 1, 127, 0.914
> 2, 2, 2, 127, 0.952
> 3, 3, 3, 127, 0.924
> 4, 4, 4, 127, 0.995
> 5, 5, 5, 127, 0.985
> 6, 6, 6, 127, 1.017
> 7, 7, 7, 127, 1.031
> 8, 0, 0, 127, 0.967
> 9, 1, 1, 127, 0.969
> 10, 2, 2, 127, 0.951
> 11, 3, 3, 127, 0.938
> 12, 4, 4, 127, 0.937
> 13, 5, 5, 127, 0.967
> 14, 6, 6, 127, 0.941
> 15, 7, 7, 127, 0.951
> 4, 0, 0, 127, 0.959
> 4, 0, 0, 254, 0.98
> 8, 0, 0, 254, 0.959
> 16, 0, 0, 127, 0.895
> 16, 0, 0, 254, 0.901
> 32, 0, 0, 127, 0.85
> 32, 0, 0, 254, 0.851
> 64, 0, 0, 127, 0.897
> 64, 0, 0, 254, 0.895
> 128, 0, 0, 127, 0.944
> 128, 0, 0, 254, 0.935
> 256, 0, 0, 127, 0.922
> 256, 0, 0, 254, 0.913
> 512, 0, 0, 127, 0.921
> 512, 0, 0, 254, 0.914
> 1024, 0, 0, 127, 0.845
> 1024, 0, 0, 254, 0.84
> 16, 1, 2, 127, 0.923
> 16, 2, 1, 254, 0.955
> 32, 2, 4, 127, 0.979
> 32, 4, 2, 254, 0.957
> 64, 3, 6, 127, 0.866
> 64, 6, 3, 254, 0.849
> 128, 4, 0, 127, 0.882
> 128, 0, 4, 254, 0.876
> 256, 5, 2, 127, 0.877
> 256, 2, 5, 254, 0.882
> 512, 6, 4, 127, 0.822
> 512, 4, 6, 254, 0.862
> 1024, 7, 6, 127, 0.903
> 1024, 6, 7, 254, 0.908
>
> sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++--------------
> 1 file changed, 35 insertions(+), 48 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> index 580feb90e9..7805ae9d41 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> @@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp))
> movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> mov %fs:(%rax),%RDX_LP
>
> - // XXX 5 byte should be before the function
> - /* 5-byte NOP. */
> - .byte 0x0f,0x1f,0x44,0x00,0x00
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> END (GLABEL(__strcasecmp))
> /* FALLTHROUGH to strcasecmp_l. */
> #endif
> @@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp))
> movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> mov %fs:(%rax),%RCX_LP
>
> - // XXX 5 byte should be before the function
> - /* 5-byte NOP. */
> - .byte 0x0f,0x1f,0x44,0x00,0x00
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> END (GLABEL(__strncasecmp))
> /* FALLTHROUGH to strncasecmp_l. */
> #endif
> @@ -169,27 +167,22 @@ STRCMP_SSE42:
> #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> .section .rodata.cst16,"aM",@progbits,16
> .align 16
> -LABEL(belowupper):
> - .quad 0x4040404040404040
> - .quad 0x4040404040404040
> -LABEL(topupper):
> -# ifdef USE_AVX
> - .quad 0x5a5a5a5a5a5a5a5a
> - .quad 0x5a5a5a5a5a5a5a5a
> -# else
> - .quad 0x5b5b5b5b5b5b5b5b
> - .quad 0x5b5b5b5b5b5b5b5b
> -# endif
> -LABEL(touppermask):
> +LABEL(lcase_min):
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> +LABEL(lcase_max):
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> +LABEL(case_add):
> .quad 0x2020202020202020
> .quad 0x2020202020202020
> .previous
> - movdqa LABEL(belowupper)(%rip), %xmm4
> -# define UCLOW_reg %xmm4
> - movdqa LABEL(topupper)(%rip), %xmm5
> -# define UCHIGH_reg %xmm5
> - movdqa LABEL(touppermask)(%rip), %xmm6
> -# define LCQWORD_reg %xmm6
> + movdqa LABEL(lcase_min)(%rip), %xmm4
> +# define LCASE_MIN_reg %xmm4
> + movdqa LABEL(lcase_max)(%rip), %xmm5
> +# define LCASE_MAX_reg %xmm5
> + movdqa LABEL(case_add)(%rip), %xmm6
> +# define CASE_ADD_reg %xmm6
> #endif
> cmp $0x30, %ecx
> ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
> @@ -200,32 +193,26 @@ LABEL(touppermask):
> #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> # ifdef USE_AVX
> # define TOLOWER(reg1, reg2) \
> - vpcmpgtb UCLOW_reg, reg1, %xmm7; \
> - vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
> - vpcmpgtb UCLOW_reg, reg2, %xmm9; \
> - vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
> - vpandn %xmm7, %xmm8, %xmm8; \
> - vpandn %xmm9, %xmm10, %xmm10; \
> - vpand LCQWORD_reg, %xmm8, %xmm8; \
> - vpand LCQWORD_reg, %xmm10, %xmm10; \
> - vpor reg1, %xmm8, reg1; \
> - vpor reg2, %xmm10, reg2
> + vpaddb LCASE_MIN_reg, reg1, %xmm7; \
> + vpaddb LCASE_MIN_reg, reg2, %xmm8; \
> + vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \
> + vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \
> + vpandn CASE_ADD_reg, %xmm7, %xmm7; \
> + vpandn CASE_ADD_reg, %xmm8, %xmm8; \
> + vpaddb %xmm7, reg1, reg1; \
> + vpaddb %xmm8, reg2, reg2
> # else
> # define TOLOWER(reg1, reg2) \
> - movdqa reg1, %xmm7; \
> - movdqa UCHIGH_reg, %xmm8; \
> - movdqa reg2, %xmm9; \
> - movdqa UCHIGH_reg, %xmm10; \
> - pcmpgtb UCLOW_reg, %xmm7; \
> - pcmpgtb reg1, %xmm8; \
> - pcmpgtb UCLOW_reg, %xmm9; \
> - pcmpgtb reg2, %xmm10; \
> - pand %xmm8, %xmm7; \
> - pand %xmm10, %xmm9; \
> - pand LCQWORD_reg, %xmm7; \
> - pand LCQWORD_reg, %xmm9; \
> - por %xmm7, reg1; \
> - por %xmm9, reg2
> + movdqa LCASE_MIN_reg, %xmm7; \
> + movdqa LCASE_MIN_reg, %xmm8; \
> + paddb reg1, %xmm7; \
> + paddb reg2, %xmm8; \
> + pcmpgtb LCASE_MAX_reg, %xmm7; \
> + pcmpgtb LCASE_MAX_reg, %xmm8; \
> + pandn CASE_ADD_reg, %xmm7; \
> + pandn CASE_ADD_reg, %xmm8; \
> + paddb %xmm7, reg1; \
> + paddb %xmm8, reg2
> # endif
> TOLOWER (%xmm1, %xmm2)
> #else
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c
2022-03-23 21:57 ` [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c Noah Goldstein
@ 2022-03-24 19:02 ` H.J. Lu
0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:02 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Test cases for when both `s1` and `s2` are near the end of a page
> were previously missing.
> ---
> string/test-strcmp.c | 15 ++++++++++++++-
> 1 file changed, 14 insertions(+), 1 deletion(-)
>
> diff --git a/string/test-strcmp.c b/string/test-strcmp.c
> index 0abce769d0..ece03c6d0b 100644
> --- a/string/test-strcmp.c
> +++ b/string/test-strcmp.c
> @@ -392,7 +392,7 @@ check3 (void)
> int
> test_main (void)
> {
> - size_t i, j;
> + size_t i, j, k;
> const size_t test_len = MIN(TEST_LEN, 3 * 4096);
> test_init ();
> check();
> @@ -453,6 +453,19 @@ test_main (void)
> do_test (j, getpagesize () - j - 1, i, 127, 1);
> do_test (j, getpagesize () - j - 1, i, 127, -1);
>
> + for (k = 2; k <= 128; k += k)
> + {
> + do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 0);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
> + 0);
> + do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, 1);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
> + 1);
> + do_test (getpagesize () - k, getpagesize () - j - 1, i, 127, -1);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, 127,
> + -1);
> + }
> +
> if (i < 32)
> {
> i += 1;
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c
2022-03-23 21:57 ` [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c Noah Goldstein
@ 2022-03-24 19:02 ` H.J. Lu
0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:02 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Test cases for when both `s1` and `s2` are near the end of a page
> were previously missing.
> ---
> string/test-strncmp.c | 27 ++++++++++++++++++++++++++-
> 1 file changed, 26 insertions(+), 1 deletion(-)
>
> diff --git a/string/test-strncmp.c b/string/test-strncmp.c
> index 1a87f0e73e..bba9e3d2dc 100644
> --- a/string/test-strncmp.c
> +++ b/string/test-strncmp.c
> @@ -573,7 +573,7 @@ check_overflow (void)
> int
> test_main (void)
> {
> - size_t i, j;
> + size_t i, j, k;
> const size_t test_len = MIN(TEST_LEN, 3 * 4096);
> test_init ();
>
> @@ -705,6 +705,31 @@ test_main (void)
> do_test_n (j, getpagesize () - j - 1, i, ULONG_MAX - i, 0, 127, 0);
> do_test_n (j, getpagesize () - j - 1, i, ULONG_MAX - i, 0, 127, 1);
> do_test_n (j, getpagesize () - j - 1, i, ULONG_MAX - i, 0, 127, -1);
> +
> + for (k = 2; k <= 128; k += k)
> + {
> + do_test (getpagesize () - k, getpagesize () - j - 1, i - 1, i,
> + 127, 0);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i - 1,
> + i, 127, 0);
> + do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
> + 127, 0);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
> + i, 127, 0);
> + do_test (getpagesize () - k, getpagesize () - j - 1, i, i, 127,
> + 0);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i, i,
> + 127, 0);
> + do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
> + 127, -1);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
> + i, 127, -1);
> + do_test (getpagesize () - k, getpagesize () - j - 1, i + 1, i,
> + 127, 1);
> + do_test (getpagesize () - k - 1, getpagesize () - j - 1, i + 1,
> + i, 127, 1);
> + }
> +
> if (i < 32)
> {
> i += 1;
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp
2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
@ 2022-03-24 19:03 ` H.J. Lu
2022-03-24 22:41 ` [PATCH v3 " Noah Goldstein
` (2 subsequent siblings)
3 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:03 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
>
> All string/memory tests pass.
> ---
> Geometric Mean N=40 runs; All functions page aligned
> length, align1, align2, max_char, AVX2 Time / SSE42 Time
> 1, 1, 1, 127, 1.032
> 2, 2, 2, 127, 1.006
> 3, 3, 3, 127, 1.009
> 4, 4, 4, 127, 0.964
> 5, 5, 5, 127, 0.929
> 6, 6, 6, 127, 0.94
> 7, 7, 7, 127, 0.958
> 8, 0, 0, 127, 0.988
> 9, 1, 1, 127, 0.99
> 10, 2, 2, 127, 0.995
> 11, 3, 3, 127, 0.991
> 12, 4, 4, 127, 0.975
> 13, 5, 5, 127, 0.943
> 14, 6, 6, 127, 0.955
> 15, 7, 7, 127, 0.988
> 4, 0, 0, 127, 0.983
> 4, 0, 0, 254, 0.978
> 8, 0, 0, 254, 0.989
> 16, 0, 0, 127, 0.792
> 16, 0, 0, 254, 0.774
> 32, 0, 0, 127, 0.568
> 32, 0, 0, 254, 0.555
> 64, 0, 0, 127, 0.561
> 64, 0, 0, 254, 0.561
> 128, 0, 0, 127, 0.574
> 128, 0, 0, 254, 0.577
> 256, 0, 0, 127, 0.561
> 256, 0, 0, 254, 0.552
> 512, 0, 0, 127, 0.59
> 512, 0, 0, 254, 0.594
> 1024, 0, 0, 127, 0.528
> 1024, 0, 0, 254, 0.517
> 16, 1, 2, 127, 0.758
> 16, 2, 1, 254, 0.748
> 32, 2, 4, 127, 0.419
> 32, 4, 2, 254, 0.428
> 64, 3, 6, 127, 0.472
> 64, 6, 3, 254, 0.464
> 128, 4, 0, 127, 0.534
> 128, 0, 4, 254, 0.53
> 256, 5, 2, 127, 0.679
> 256, 2, 5, 254, 0.676
> 512, 6, 4, 127, 0.525
> 512, 4, 6, 254, 0.523
> 1024, 7, 6, 127, 0.518
> 1024, 6, 7, 254, 0.505
>
> sysdeps/x86_64/multiarch/Makefile | 4 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++
> sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 +
> .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++
> sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++
> sysdeps/x86_64/multiarch/strcmp-avx2.S | 230 +++++++++++++++---
> .../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++
> sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++
> 8 files changed, 324 insertions(+), 31 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index e7b413edad..06e1848823 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -55,6 +55,8 @@ sysdep_routines += \
> stpncpy-sse2-unaligned \
> stpncpy-ssse3 \
> strcasecmp_l-avx \
> + strcasecmp_l-avx2 \
> + strcasecmp_l-avx2-rtm \
> strcasecmp_l-sse2 \
> strcasecmp_l-sse4_2 \
> strcasecmp_l-ssse3 \
> @@ -93,6 +95,8 @@ sysdep_routines += \
> strlen-evex \
> strlen-sse2 \
> strncase_l-avx \
> + strncase_l-avx2 \
> + strncase_l-avx2-rtm \
> strncase_l-sse2 \
> strncase_l-sse4_2 \
> strncase_l-ssse3 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index a594f4176e..3c556d07ac 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> IFUNC_IMPL (i, name, strcasecmp,
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + CPU_FEATURE_USABLE (AVX2),
> + __strcasecmp_avx2)
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (RTM)),
> + __strcasecmp_avx2_rtm)
> IFUNC_IMPL_ADD (array, i, strcasecmp,
> CPU_FEATURE_USABLE (AVX),
> __strcasecmp_avx)
> @@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> IFUNC_IMPL (i, name, strcasecmp_l,
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + CPU_FEATURE_USABLE (AVX2),
> + __strcasecmp_l_avx2)
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (RTM)),
> + __strcasecmp_l_avx2_rtm)
> IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> CPU_FEATURE_USABLE (AVX),
> __strcasecmp_l_avx)
> @@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> IFUNC_IMPL (i, name, strncasecmp,
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + CPU_FEATURE_USABLE (AVX2),
> + __strncasecmp_avx2)
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (RTM)),
> + __strncasecmp_avx2_rtm)
> IFUNC_IMPL_ADD (array, i, strncasecmp,
> CPU_FEATURE_USABLE (AVX),
> __strncasecmp_avx)
> @@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> IFUNC_IMPL (i, name, strncasecmp_l,
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + CPU_FEATURE_USABLE (AVX2),
> + __strncasecmp_l_avx2)
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (RTM)),
> + __strncasecmp_l_avx2_rtm)
> IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> CPU_FEATURE_USABLE (AVX),
> __strncasecmp_l_avx)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index 9e3cc61ac0..c4de111fd0 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
>
> static inline void *
> IFUNC_SELECTOR (void)
> {
> const struct cpu_features* cpu_features = __get_cpu_features ();
>
> + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> + {
> + if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> + return OPTIMIZE (avx2_rtm);
> +
> + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> + return OPTIMIZE (avx2);
> + }
> +
> if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
> return OPTIMIZE (avx);
>
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> new file mode 100644
> index 0000000000..09957fc3c5
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> @@ -0,0 +1,15 @@
> +#ifndef STRCMP
> +# define STRCMP __strcasecmp_l_avx2_rtm
> +#endif
> +
> +#define _GLABEL(x) x ## _rtm
> +#define GLABEL(x) _GLABEL(x)
> +
> +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +
> +#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
> +
> +#define SECTION(p) p##.avx.rtm
> +
> +#include "strcasecmp_l-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> new file mode 100644
> index 0000000000..e2762f2a22
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> @@ -0,0 +1,23 @@
> +/* strcasecmp_l optimized with AVX2.
> + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef STRCMP
> +# define STRCMP __strcasecmp_l_avx2
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#include "strcmp-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> index 86a86b68e3..eeb90a0da6 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> @@ -20,6 +20,10 @@
>
> # include <sysdep.h>
>
> +# if defined USE_AS_STRCASECMP_L
> +# include "locale-defines.h"
> +# endif
> +
> # ifndef STRCMP
> # define STRCMP __strcmp_avx2
> # endif
> @@ -74,13 +78,88 @@
> # define VEC_OFFSET (-VEC_SIZE)
> # endif
>
> +# ifdef USE_AS_STRCASECMP_L
> +# define BYTE_LOOP_REG OFFSET_REG
> +# else
> +# define BYTE_LOOP_REG ecx
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +# ifdef USE_AS_STRNCMP
> +# define STRCASECMP __strncasecmp_avx2
> +# define LOCALE_REG rcx
> +# define LOCALE_REG_LP RCX_LP
> +# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> +# else
> +# define STRCASECMP __strcasecmp_avx2
> +# define LOCALE_REG rdx
> +# define LOCALE_REG_LP RDX_LP
> +# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> +# endif
> +# endif
> +
> # define xmmZERO xmm15
> # define ymmZERO ymm15
>
> +# define LCASE_MIN_ymm %ymm10
> +# define LCASE_MAX_ymm %ymm11
> +# define CASE_ADD_ymm %ymm12
> +
> +# define LCASE_MIN_xmm %xmm10
> +# define LCASE_MAX_xmm %xmm11
> +# define CASE_ADD_xmm %xmm12
> +
> + /* r11 is never use elsewhere so this is safe to maintain. */
> +# define TOLOWER_BASE %r11
> +
> # ifndef SECTION
> # define SECTION(p) p##.avx
> # endif
>
> +# ifdef USE_AS_STRCASECMP_L
> +# define REG(x, y) x ## y
> +# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
> + vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
> + vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
> + vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
> + vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
> + vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
> + vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
> + vpaddb REG(%ext, 8), reg1_in, reg1_out; \
> + vpaddb REG(%ext, 9), reg2_in, reg2_out
> +
> +# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> +# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
> +# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)
> +
> +# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
> + TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
> + VPCMPEQ scratch_reg, s2_reg, reg_out
> +
> +# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
> + VMOVU s2_mem, reg_out; \
> + CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
> +
> +# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
> +# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
> +
> +# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
> +# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
> +
> +# else
> +# define TOLOWER_gpr(...)
> +# define TOLOWER_ymm(...)
> +# define TOLOWER_xmm(...)
> +
> +# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
> + VPCMPEQ s2_reg, s1_reg, reg_out
> +
> +# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
> +
> +# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
> +# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
> +# endif
> +
> /* Warning!
> wcscmp/wcsncmp have to use SIGNED comparison for elements.
> strcmp/strncmp have to use UNSIGNED comparison for elements.
> @@ -102,7 +181,45 @@
> returned. */
>
> .section SECTION(.text), "ax", @progbits
> -ENTRY(STRCMP)
> + .align 16
> + .type STRCMP, @function
> + .globl STRCMP
> + .hidden STRCMP
> +
> +# ifndef GLABEL
> +# define GLABEL(...) __VA_ARGS__
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +ENTRY (GLABEL(STRCASECMP))
> + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
> + mov %fs:(%rax), %LOCALE_REG_LP
> +
> + /* Either 1 or 5 bytes (dependeing if CET is enabled). */
> + .p2align 4
> +END (GLABEL(STRCASECMP))
> + /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
> +# endif
> +
> + .p2align 4
> +STRCMP:
> + cfi_startproc
> + _CET_ENDBR
> + CALL_MCOUNT
> +
> +# if defined USE_AS_STRCASECMP_L
> + /* We have to fall back on the C implementation for locales with
> + encodings not matching ASCII for single bytes. */
> +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> + mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> +# else
> + mov (%LOCALE_REG), %RAX_LP
> +# endif
> + testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> + jne STRCASECMP_NONASCII
> + leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> +# endif
> +
> # ifdef USE_AS_STRNCMP
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> @@ -128,6 +245,30 @@ ENTRY(STRCMP)
> # endif
> # endif
> vpxor %xmmZERO, %xmmZERO, %xmmZERO
> +# if defined USE_AS_STRCASECMP_L
> + .section .rodata.cst32, "aM", @progbits, 32
> + .align 32
> +L(lcase_min):
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> +L(lcase_max):
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> +L(case_add):
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .previous
> +
> + vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
> + vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
> + vmovdqa L(case_add)(%rip), CASE_ADD_ymm
> +# endif
> movl %edi, %eax
> orl %esi, %eax
> sall $20, %eax
> @@ -138,8 +279,10 @@ ENTRY(STRCMP)
> L(no_page_cross):
> /* Safe to compare 4x vectors. */
> VMOVU (%rdi), %ymm0
> - /* 1s where s1 and s2 equal. */
> - VPCMPEQ (%rsi), %ymm0, %ymm1
> + /* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp.
> + Otherwise converts ymm0 and load from rsi to lower. ymm2 is
> + scratch and ymm1 is the return. */
> + CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
> /* 1s at null CHAR. */
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> /* 1s where s1 and s2 equal AND not null CHAR. */
> @@ -172,6 +315,8 @@ L(return_vec_0):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret0):
> @@ -207,6 +352,8 @@ L(one_or_less):
> # else
> movzbl (%rdi), %eax
> movzbl (%rsi), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret1):
> @@ -234,6 +381,8 @@ L(return_vec_1):
> # else
> movzbl VEC_SIZE(%rdi, %rcx), %eax
> movzbl VEC_SIZE(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret2):
> @@ -265,6 +414,8 @@ L(return_vec_2):
> # else
> movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret3):
> @@ -285,6 +436,8 @@ L(return_vec_3):
> # else
> movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret4):
> @@ -295,7 +448,7 @@ L(ret4):
> L(more_3x_vec):
> /* Safe to compare 4x vectors. */
> VMOVU VEC_SIZE(%rdi), %ymm0
> - VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -308,7 +461,7 @@ L(more_3x_vec):
> # endif
>
> VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
> - VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -316,7 +469,7 @@ L(more_3x_vec):
> jnz L(return_vec_2)
>
> VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
> - VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -391,12 +544,10 @@ L(loop_skip_page_cross_check):
> VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
>
> /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */
> - VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
> -
> - VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
> - VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
> - VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
> -
> + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
> + CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
> + CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
> + CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
>
> /* If any mismatches or null CHAR then 0 CHAR, otherwise non-
> zero. */
> @@ -465,6 +616,8 @@ L(return_vec_2_3_end):
> # else
> movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
> movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -508,6 +661,8 @@ L(return_vec_0_end):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -530,6 +685,8 @@ L(return_vec_1_end):
> # else
> movzbl VEC_SIZE(%rdi, %rcx), %eax
> movzbl VEC_SIZE(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -556,6 +713,8 @@ L(return_vec_2_end):
> # else
> movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -583,7 +742,7 @@ L(page_cross_during_loop):
> jle L(less_1x_vec_till_page_cross)
>
> VMOVA (%rdi), %ymm0
> - VPCMPEQ (%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -605,7 +764,7 @@ L(less_1x_vec_till_page_cross):
> here, it means the previous page (rdi - VEC_SIZE) has already
> been loaded earlier so must be valid. */
> VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
> - VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -647,6 +806,8 @@ L(return_page_cross_cmp_mem):
> # else
> movzbl VEC_OFFSET(%rdi, %rcx), %eax
> movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -673,7 +834,7 @@ L(more_2x_vec_till_page_cross):
> iteration here. */
>
> VMOVU VEC_SIZE(%rdi), %ymm0
> - VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -689,7 +850,7 @@ L(more_2x_vec_till_page_cross):
>
> /* Safe to include comparisons from lower bytes. */
> VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
> - VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -697,7 +858,7 @@ L(more_2x_vec_till_page_cross):
> jnz L(return_vec_page_cross_0)
>
> VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
> - VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -715,8 +876,8 @@ L(more_2x_vec_till_page_cross):
> VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
> VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
>
> - VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
> - VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
> + CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
> + CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
> vpand %ymm4, %ymm5, %ymm5
> vpand %ymm6, %ymm7, %ymm7
> VPMINU %ymm5, %ymm7, %ymm7
> @@ -767,6 +928,8 @@ L(return_vec_page_cross_1):
> # else
> movzbl VEC_OFFSET(%rdi, %rcx), %eax
> movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -822,7 +985,7 @@ L(page_cross):
> L(page_cross_loop):
>
> VMOVU (%rdi, %OFFSET_REG64), %ymm0
> - VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -840,11 +1003,11 @@ L(page_cross_loop):
> subl %eax, %OFFSET_REG
> /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
> to not cross page so is safe to load. Since we have already
> - loaded at least 1 VEC from rsi it is also guranteed to be safe.
> - */
> + loaded at least 1 VEC from rsi it is also guranteed to be
> + safe. */
>
> VMOVU (%rdi, %OFFSET_REG64), %ymm0
> - VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -877,6 +1040,8 @@ L(ret_vec_page_cross_cont):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -930,7 +1095,7 @@ L(less_1x_vec_till_page):
> ja L(less_16_till_page)
>
> VMOVU (%rdi), %xmm0
> - VPCMPEQ (%rsi), %xmm0, %xmm1
> + CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> @@ -948,7 +1113,7 @@ L(less_1x_vec_till_page):
> # endif
>
> VMOVU (%rdi, %OFFSET_REG64), %xmm0
> - VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1
> + CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> @@ -986,7 +1151,7 @@ L(less_16_till_page):
> vmovq (%rdi), %xmm0
> vmovq (%rsi), %xmm1
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> - VPCMPEQ %xmm1, %xmm0, %xmm1
> + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> incb %cl
> @@ -1006,7 +1171,7 @@ L(less_16_till_page):
> vmovq (%rdi, %OFFSET_REG64), %xmm0
> vmovq (%rsi, %OFFSET_REG64), %xmm1
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> - VPCMPEQ %xmm1, %xmm0, %xmm1
> + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> incb %cl
> @@ -1062,7 +1227,7 @@ L(ret_less_8_wcs):
> vmovd (%rdi), %xmm0
> vmovd (%rsi), %xmm1
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> - VPCMPEQ %xmm1, %xmm0, %xmm1
> + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> subl $0xf, %ecx
> @@ -1081,7 +1246,7 @@ L(ret_less_8_wcs):
> vmovd (%rdi, %OFFSET_REG64), %xmm0
> vmovd (%rsi, %OFFSET_REG64), %xmm1
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> - VPCMPEQ %xmm1, %xmm0, %xmm1
> + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> subl $0xf, %ecx
> @@ -1115,7 +1280,9 @@ L(less_4_till_page):
> L(less_4_loop):
> movzbl (%rdi), %eax
> movzbl (%rsi, %rdi), %ecx
> - subl %ecx, %eax
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> + subl %BYTE_LOOP_REG, %eax
> jnz L(ret_less_4_loop)
> testl %ecx, %ecx
> jz L(ret_zero_4_loop)
> @@ -1142,5 +1309,6 @@ L(ret_less_4_loop):
> subl %r8d, %eax
> ret
> # endif
> -END(STRCMP)
> + cfi_endproc
> + .size STRCMP, .-STRCMP
> #endif
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> new file mode 100644
> index 0000000000..e194936c36
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> @@ -0,0 +1,16 @@
> +#ifndef STRCMP
> +# define STRCMP __strncasecmp_l_avx2_rtm
> +#endif
> +
> +#define _GLABEL(x) x ## _rtm
> +#define GLABEL(x) _GLABEL(x)
> +
> +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +
> +#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
> +
> +#define SECTION(p) p##.avx.rtm
> +#define OVERFLOW_STRCMP __strcasecmp_avx2_rtm
> +
> +#include "strncase_l-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> new file mode 100644
> index 0000000000..29afccbcc5
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> @@ -0,0 +1,27 @@
> +/* strncasecmp_l optimized with AVX2.
> + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef STRCMP
> +# define STRCMP __strncasecmp_l_avx2
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#define USE_AS_STRNCMP
> +#ifndef OVERFLOW_STRCMP
> +# define OVERFLOW_STRCMP __strcasecmp_avx2
> +#endif
> +#include "strcmp-avx2.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 22/23] x86: Add EVEX optimized str{n}casecmp
2022-03-23 21:57 ` [PATCH v1 22/23] x86: Add EVEX " Noah Goldstein
@ 2022-03-24 19:04 ` H.J. Lu
0 siblings, 0 replies; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:04 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
>
> All string/memory tests pass.
> ---
> Geometric Mean N=40 runs; All functions page aligned
> length, align1, align2, max_char, EVEX Time / SSE42 Time
> 1, 1, 1, 127, 0.871
> 2, 2, 2, 127, 0.833
> 3, 3, 3, 127, 0.851
> 4, 4, 4, 127, 0.824
> 5, 5, 5, 127, 0.791
> 6, 6, 6, 127, 0.789
> 7, 7, 7, 127, 0.804
> 8, 0, 0, 127, 0.838
> 9, 1, 1, 127, 0.837
> 10, 2, 2, 127, 0.834
> 11, 3, 3, 127, 0.839
> 12, 4, 4, 127, 0.844
> 13, 5, 5, 127, 0.796
> 14, 6, 6, 127, 0.811
> 15, 7, 7, 127, 0.838
> 4, 0, 0, 127, 0.84
> 4, 0, 0, 254, 0.823
> 8, 0, 0, 254, 0.838
> 16, 0, 0, 127, 0.669
> 16, 0, 0, 254, 0.656
> 32, 0, 0, 127, 0.488
> 32, 0, 0, 254, 0.484
> 64, 0, 0, 127, 0.492
> 64, 0, 0, 254, 0.502
> 128, 0, 0, 127, 0.508
> 128, 0, 0, 254, 0.497
> 256, 0, 0, 127, 0.574
> 256, 0, 0, 254, 0.581
> 512, 0, 0, 127, 0.573
> 512, 0, 0, 254, 0.577
> 1024, 0, 0, 127, 0.489
> 1024, 0, 0, 254, 0.485
> 16, 1, 2, 127, 0.655
> 16, 2, 1, 254, 0.646
> 32, 2, 4, 127, 0.368
> 32, 4, 2, 254, 0.376
> 64, 3, 6, 127, 0.428
> 64, 6, 3, 254, 0.426
> 128, 4, 0, 127, 0.478
> 128, 0, 4, 254, 0.473
> 256, 5, 2, 127, 0.65
> 256, 2, 5, 254, 0.654
> 512, 6, 4, 127, 0.492
> 512, 4, 6, 254, 0.489
> 1024, 7, 6, 127, 0.463
> 1024, 6, 7, 254, 0.457
>
> sysdeps/x86_64/multiarch/Makefile | 2 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 ++
> sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 +
> sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++
> sysdeps/x86_64/multiarch/strcmp-evex.S | 280 ++++++++++++++++---
> sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++
> 6 files changed, 314 insertions(+), 37 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 06e1848823..35d80dc2ff 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -57,6 +57,7 @@ sysdep_routines += \
> strcasecmp_l-avx \
> strcasecmp_l-avx2 \
> strcasecmp_l-avx2-rtm \
> + strcasecmp_l-evex \
> strcasecmp_l-sse2 \
> strcasecmp_l-sse4_2 \
> strcasecmp_l-ssse3 \
> @@ -97,6 +98,7 @@ sysdep_routines += \
> strncase_l-avx \
> strncase_l-avx2 \
> strncase_l-avx2-rtm \
> + strncase_l-evex \
> strncase_l-sse2 \
> strncase_l-sse4_2 \
> strncase_l-ssse3 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 3c556d07ac..f1a4d3dac2 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> IFUNC_IMPL (i, name, strcasecmp,
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)),
> + __strcasecmp_evex)
> IFUNC_IMPL_ADD (array, i, strcasecmp,
> CPU_FEATURE_USABLE (AVX2),
> __strcasecmp_avx2)
> @@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> IFUNC_IMPL (i, name, strcasecmp_l,
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)),
> + __strcasecmp_l_evex)
> IFUNC_IMPL_ADD (array, i, strcasecmp,
> CPU_FEATURE_USABLE (AVX2),
> __strcasecmp_l_avx2)
> @@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> IFUNC_IMPL (i, name, strncasecmp,
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)),
> + __strncasecmp_evex)
> IFUNC_IMPL_ADD (array, i, strncasecmp,
> CPU_FEATURE_USABLE (AVX2),
> __strncasecmp_avx2)
> @@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> IFUNC_IMPL (i, name, strncasecmp_l,
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)),
> + __strncasecmp_l_evex)
> IFUNC_IMPL_ADD (array, i, strncasecmp,
> CPU_FEATURE_USABLE (AVX2),
> __strncasecmp_l_avx2)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index c4de111fd0..bf0d146e7f 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
>
> static inline void *
> IFUNC_SELECTOR (void)
> @@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
> if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> {
> + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> + return OPTIMIZE (evex);
> +
> if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> return OPTIMIZE (avx2_rtm);
>
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> new file mode 100644
> index 0000000000..58642db748
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> @@ -0,0 +1,23 @@
> +/* strcasecmp_l optimized with EVEX.
> + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef STRCMP
> +# define STRCMP __strcasecmp_l_evex
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#include "strcmp-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
> index 56d8c118e4..85afd6535f 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
> @@ -19,6 +19,9 @@
> #if IS_IN (libc)
>
> # include <sysdep.h>
> +# if defined USE_AS_STRCASECMP_L
> +# include "locale-defines.h"
> +# endif
>
> # ifndef STRCMP
> # define STRCMP __strcmp_evex
> @@ -34,19 +37,29 @@
> # define VMOVA vmovdqa64
>
> # ifdef USE_AS_WCSCMP
> -# define TESTEQ subl $0xff,
> +# ifndef OVERFLOW_STRCMP
> +# define OVERFLOW_STRCMP __wcscmp_evex
> +# endif
> +
> +# define TESTEQ subl $0xff,
> /* Compare packed dwords. */
> # define VPCMP vpcmpd
> # define VPMINU vpminud
> # define VPTESTM vptestmd
> +# define VPTESTNM vptestnmd
> /* 1 dword char == 4 bytes. */
> # define SIZE_OF_CHAR 4
> # else
> +# ifndef OVERFLOW_STRCMP
> +# define OVERFLOW_STRCMP __strcmp_evex
> +# endif
> +
> # define TESTEQ incl
> /* Compare packed bytes. */
> # define VPCMP vpcmpb
> # define VPMINU vpminub
> # define VPTESTM vptestmb
> +# define VPTESTNM vptestnmb
> /* 1 byte char == 1 byte. */
> # define SIZE_OF_CHAR 1
> # endif
> @@ -73,11 +86,16 @@
> # define VEC_OFFSET (-VEC_SIZE)
> # endif
>
> -# define XMMZERO xmm16
> # define XMM0 xmm17
> # define XMM1 xmm18
>
> -# define YMMZERO ymm16
> +# define XMM10 xmm27
> +# define XMM11 xmm28
> +# define XMM12 xmm29
> +# define XMM13 xmm30
> +# define XMM14 xmm31
> +
> +
> # define YMM0 ymm17
> # define YMM1 ymm18
> # define YMM2 ymm19
> @@ -89,6 +107,87 @@
> # define YMM8 ymm25
> # define YMM9 ymm26
> # define YMM10 ymm27
> +# define YMM11 ymm28
> +# define YMM12 ymm29
> +# define YMM13 ymm30
> +# define YMM14 ymm31
> +
> +# ifdef USE_AS_STRCASECMP_L
> +# define BYTE_LOOP_REG OFFSET_REG
> +# else
> +# define BYTE_LOOP_REG ecx
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +# ifdef USE_AS_STRNCMP
> +# define STRCASECMP __strncasecmp_evex
> +# define LOCALE_REG rcx
> +# define LOCALE_REG_LP RCX_LP
> +# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> +# else
> +# define STRCASECMP __strcasecmp_evex
> +# define LOCALE_REG rdx
> +# define LOCALE_REG_LP RDX_LP
> +# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> +# endif
> +# endif
> +
> +# define LCASE_MIN_YMM %YMM12
> +# define LCASE_MAX_YMM %YMM13
> +# define CASE_ADD_YMM %YMM14
> +
> +# define LCASE_MIN_XMM %XMM12
> +# define LCASE_MAX_XMM %XMM13
> +# define CASE_ADD_XMM %XMM14
> +
> + /* NB: wcsncmp uses r11 but strcasecmp is never used in
> + conjunction with wcscmp. */
> +# define TOLOWER_BASE %r11
> +
> +# ifdef USE_AS_STRCASECMP_L
> +# define _REG(x, y) x ## y
> +# define REG(x, y) _REG(x, y)
> +# define TOLOWER(reg1, reg2, ext) \
> + vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \
> + vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \
> + vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \
> + vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \
> + vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \
> + vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6}
> +
> +# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> +# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM)
> +# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM)
> +
> +# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \
> + TOLOWER (s1_reg, s2_reg, ext); \
> + VPCMP $0, s1_reg, s2_reg, reg_out
> +
> +# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \
> + VMOVU s2_mem, s2_reg; \
> + CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
> +
> +# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
> +# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
> +
> +# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
> +# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
> +
> +# else
> +# define TOLOWER_gpr(...)
> +# define TOLOWER_YMM(...)
> +# define TOLOWER_XMM(...)
> +
> +# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \
> + VPCMP $0, s2_reg, s1_reg, reg_out
> +
> +# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
> +
> +# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \
> + VPCMP $0, s2_mem, s1_reg, reg_out
> +
> +# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
> +# endif
>
> /* Warning!
> wcscmp/wcsncmp have to use SIGNED comparison for elements.
> @@ -112,7 +211,41 @@
> returned. */
>
> .section .text.evex, "ax", @progbits
> -ENTRY(STRCMP)
> + .align 16
> + .type STRCMP, @function
> + .globl STRCMP
> + .hidden STRCMP
> +
> +# ifdef USE_AS_STRCASECMP_L
> +ENTRY (STRCASECMP)
> + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
> + mov %fs:(%rax), %LOCALE_REG_LP
> +
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> +END (STRCASECMP)
> + /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
> +# endif
> +
> + .p2align 4
> +STRCMP:
> + cfi_startproc
> + _CET_ENDBR
> + CALL_MCOUNT
> +
> +# if defined USE_AS_STRCASECMP_L
> + /* We have to fall back on the C implementation for locales with
> + encodings not matching ASCII for single bytes. */
> +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> + mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> +# else
> + mov (%LOCALE_REG), %RAX_LP
> +# endif
> + testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> + jne STRCASECMP_NONASCII
> + leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> +# endif
> +
> # ifdef USE_AS_STRNCMP
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> @@ -125,6 +258,32 @@ ENTRY(STRCMP)
> actually bound the buffer. */
> jle L(one_or_less)
> # endif
> +
> +# if defined USE_AS_STRCASECMP_L
> + .section .rodata.cst32, "aM", @progbits, 32
> + .align 32
> +L(lcase_min):
> + .quad 0x4141414141414141
> + .quad 0x4141414141414141
> + .quad 0x4141414141414141
> + .quad 0x4141414141414141
> +L(lcase_max):
> + .quad 0x1a1a1a1a1a1a1a1a
> + .quad 0x1a1a1a1a1a1a1a1a
> + .quad 0x1a1a1a1a1a1a1a1a
> + .quad 0x1a1a1a1a1a1a1a1a
> +L(case_add):
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .previous
> +
> + vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
> + vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
> + vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
> +# endif
> +
> movl %edi, %eax
> orl %esi, %eax
> /* Shift out the bits irrelivant to page boundary ([63:12]). */
> @@ -139,7 +298,7 @@ L(no_page_cross):
> VPTESTM %YMM0, %YMM0, %k2
> /* Each bit cleared in K1 represents a mismatch or a null CHAR
> in YMM0 and 32 bytes at (%rsi). */
> - VPCMP $0, (%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_STRNCMP
> cmpq $CHAR_PER_VEC, %rdx
> @@ -169,6 +328,8 @@ L(return_vec_0):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret0):
> @@ -192,7 +353,7 @@ L(one_or_less):
> # ifdef USE_AS_WCSCMP
> /* 'nbe' covers the case where length is negative (large
> unsigned). */
> - jnbe __wcscmp_evex
> + jnbe OVERFLOW_STRCMP
> movl (%rdi), %edx
> xorl %eax, %eax
> cmpl (%rsi), %edx
> @@ -203,9 +364,11 @@ L(one_or_less):
> # else
> /* 'nbe' covers the case where length is negative (large
> unsigned). */
> - jnbe __strcmp_evex
> + jnbe OVERFLOW_STRCMP
> movzbl (%rdi), %eax
> movzbl (%rsi), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret1):
> @@ -233,6 +396,8 @@ L(return_vec_1):
> # else
> movzbl VEC_SIZE(%rdi, %rcx), %eax
> movzbl VEC_SIZE(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret2):
> @@ -270,6 +435,8 @@ L(return_vec_2):
> # else
> movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret3):
> @@ -290,6 +457,8 @@ L(return_vec_3):
> # else
> movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret4):
> @@ -303,7 +472,7 @@ L(more_3x_vec):
> /* Safe to compare 4x vectors. */
> VMOVU (VEC_SIZE)(%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_1)
> @@ -315,14 +484,14 @@ L(more_3x_vec):
>
> VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_2)
>
> VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_3)
> @@ -381,7 +550,6 @@ L(prepare_loop_aligned):
> subl %esi, %eax
> andl $(PAGE_SIZE - 1), %eax
>
> - vpxorq %YMMZERO, %YMMZERO, %YMMZERO
>
> /* Loop 4x comparisons at a time. */
> .p2align 4
> @@ -413,22 +581,35 @@ L(loop_skip_page_cross_check):
> /* A zero CHAR in YMM9 means that there is a null CHAR. */
> VPMINU %YMM8, %YMM9, %YMM9
>
> - /* Each bit set in K1 represents a non-null CHAR in YMM8. */
> + /* Each bit set in K1 represents a non-null CHAR in YMM9. */
> VPTESTM %YMM9, %YMM9, %k1
> -
> +# ifndef USE_AS_STRCASECMP_L
> vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
> vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
> vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
> oring with YMM1. Result is stored in YMM6. */
> vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
> -
> +# else
> + VMOVU (VEC_SIZE * 0)(%rsi), %YMM1
> + TOLOWER_YMM (%YMM0, %YMM1)
> + VMOVU (VEC_SIZE * 1)(%rsi), %YMM3
> + TOLOWER_YMM (%YMM2, %YMM3)
> + VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
> + TOLOWER_YMM (%YMM4, %YMM5)
> + VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
> + TOLOWER_YMM (%YMM6, %YMM7)
> + vpxorq %YMM0, %YMM1, %YMM1
> + vpxorq %YMM2, %YMM3, %YMM3
> + vpxorq %YMM4, %YMM5, %YMM5
> + vpternlogd $0xde, %YMM7, %YMM1, %YMM6
> +# endif
> /* Or together YMM3, YMM5, and YMM6. */
> vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
>
>
> /* A non-zero CHAR in YMM6 represents a mismatch. */
> - VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
> + VPTESTNM %YMM6, %YMM6, %k0{%k1}
> kmovd %k0, %LOOP_REG
>
> TESTEQ %LOOP_REG
> @@ -437,13 +618,13 @@ L(loop_skip_page_cross_check):
>
> /* Find which VEC has the mismatch of end of string. */
> VPTESTM %YMM0, %YMM0, %k1
> - VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
> + VPTESTNM %YMM1, %YMM1, %k0{%k1}
> kmovd %k0, %ecx
> TESTEQ %ecx
> jnz L(return_vec_0_end)
>
> VPTESTM %YMM2, %YMM2, %k1
> - VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
> + VPTESTNM %YMM3, %YMM3, %k0{%k1}
> kmovd %k0, %ecx
> TESTEQ %ecx
> jnz L(return_vec_1_end)
> @@ -457,7 +638,7 @@ L(return_vec_2_3_end):
> # endif
>
> VPTESTM %YMM4, %YMM4, %k1
> - VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
> + VPTESTNM %YMM5, %YMM5, %k0{%k1}
> kmovd %k0, %ecx
> TESTEQ %ecx
> # if CHAR_PER_VEC <= 16
> @@ -493,6 +674,8 @@ L(return_vec_3_end):
> # else
> movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
> movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -545,6 +728,8 @@ L(return_vec_0_end):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> /* Flip `eax` if `rdi` and `rsi` where swapped in page cross
> logic. Subtract `r8d` after xor for zero case. */
> @@ -569,6 +754,8 @@ L(return_vec_1_end):
> # else
> movzbl VEC_SIZE(%rdi, %rcx), %eax
> movzbl VEC_SIZE(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -598,7 +785,7 @@ L(page_cross_during_loop):
>
> VMOVA (%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_0_end)
> @@ -619,8 +806,7 @@ L(less_1x_vec_till_page_cross):
> been loaded earlier so must be valid. */
> VMOVU -VEC_SIZE(%rdi, %rax), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
> -
> + CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
> /* Mask of potentially valid bits. The lower bits can be out of
> range comparisons (but safe regarding page crosses). */
>
> @@ -642,6 +828,8 @@ L(less_1x_vec_till_page_cross):
>
> # ifdef USE_AS_STRNCMP
> # ifdef USE_AS_WCSCMP
> + /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> + safe. */
> movl %eax, %r11d
> shrl $2, %r11d
> cmpq %r11, %rdx
> @@ -679,6 +867,8 @@ L(return_page_cross_cmp_mem):
> # else
> movzbl VEC_OFFSET(%rdi, %rcx), %eax
> movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -709,7 +899,7 @@ L(more_2x_vec_till_page_cross):
>
> VMOVA VEC_SIZE(%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_1_end)
> @@ -724,14 +914,14 @@ L(more_2x_vec_till_page_cross):
> /* Safe to include comparisons from lower bytes. */
> VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_page_cross_0)
>
> VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_page_cross_1)
> @@ -740,6 +930,8 @@ L(more_2x_vec_till_page_cross):
> /* Must check length here as length might proclude reading next
> page. */
> # ifdef USE_AS_WCSCMP
> + /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> + safe. */
> movl %eax, %r11d
> shrl $2, %r11d
> cmpq %r11, %rdx
> @@ -754,12 +946,19 @@ L(more_2x_vec_till_page_cross):
> VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
> VPMINU %YMM4, %YMM6, %YMM9
> VPTESTM %YMM9, %YMM9, %k1
> -
> +# ifndef USE_AS_STRCASECMP_L
> vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */
> vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
> -
> - VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
> +# else
> + VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
> + TOLOWER_YMM (%YMM4, %YMM5)
> + VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
> + TOLOWER_YMM (%YMM6, %YMM7)
> + vpxorq %YMM4, %YMM5, %YMM5
> + vpternlogd $0xde, %YMM7, %YMM5, %YMM6
> +# endif
> + VPTESTNM %YMM6, %YMM6, %k0{%k1}
> kmovd %k0, %LOOP_REG
> TESTEQ %LOOP_REG
> jnz L(return_vec_2_3_end)
> @@ -815,6 +1014,8 @@ L(return_vec_page_cross_1):
> # else
> movzbl VEC_OFFSET(%rdi, %rcx), %eax
> movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -871,7 +1072,7 @@ L(page_cross):
> L(page_cross_loop):
> VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(check_ret_vec_page_cross)
> @@ -895,7 +1096,7 @@ L(page_cross_loop):
> */
> VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
>
> kmovd %k1, %ecx
> # ifdef USE_AS_STRNCMP
> @@ -930,6 +1131,8 @@ L(ret_vec_page_cross_cont):
> # else
> movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
> movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -989,7 +1192,7 @@ L(less_1x_vec_till_page):
> /* Use 16 byte comparison. */
> vmovdqu (%rdi), %xmm0
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, (%rsi), %xmm0, %k1{%k2}
> + CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_WCSCMP
> subl $0xf, %ecx
> @@ -1009,7 +1212,7 @@ L(less_1x_vec_till_page):
> # endif
> vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
> + CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_WCSCMP
> subl $0xf, %ecx
> @@ -1048,7 +1251,7 @@ L(less_16_till_page):
> vmovq (%rdi), %xmm0
> vmovq (%rsi), %xmm1
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_WCSCMP
> subl $0x3, %ecx
> @@ -1068,7 +1271,7 @@ L(less_16_till_page):
> vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_WCSCMP
> subl $0x3, %ecx
> @@ -1128,7 +1331,7 @@ L(ret_less_8_wcs):
> vmovd (%rdi), %xmm0
> vmovd (%rsi), %xmm1
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> subl $0xf, %ecx
> jnz L(check_ret_vec_page_cross)
> @@ -1143,7 +1346,7 @@ L(ret_less_8_wcs):
> vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> subl $0xf, %ecx
> jnz L(check_ret_vec_page_cross)
> @@ -1176,7 +1379,9 @@ L(less_4_till_page):
> L(less_4_loop):
> movzbl (%rdi), %eax
> movzbl (%rsi, %rdi), %ecx
> - subl %ecx, %eax
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> + subl %BYTE_LOOP_REG, %eax
> jnz L(ret_less_4_loop)
> testl %ecx, %ecx
> jz L(ret_zero_4_loop)
> @@ -1203,5 +1408,6 @@ L(ret_less_4_loop):
> subl %r8d, %eax
> ret
> # endif
> -END(STRCMP)
> + cfi_endproc
> + .size STRCMP, .-STRCMP
> #endif
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> new file mode 100644
> index 0000000000..b0808c1b21
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> @@ -0,0 +1,25 @@
> +/* strncasecmp_l optimized with EVEX.
> + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef STRCMP
> +# define STRCMP __strncasecmp_l_evex
> +#endif
> +#define OVERFLOW_STRCMP __strcasecmp_evex
> +#define USE_AS_STRCASECMP_L
> +#define USE_AS_STRNCMP
> +#include "strcmp-evex.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 23/23] x86: Remove AVX str{n}casecmp
2022-03-23 21:57 ` [PATCH v1 23/23] x86: Remove AVX str{n}casecmp Noah Goldstein
@ 2022-03-24 19:04 ` H.J. Lu
2022-05-12 19:54 ` Sunil Pandey
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:04 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Mar 23, 2022 at 3:03 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The rationale is:
>
> 1. SSE42 has nearly identical logic so any benefit is minimal (3.4%
> regression on Tigerlake using SSE42 versus AVX across the
> benchtest suite).
> 2. AVX2 version covers the majority of targets that previously
> preferred it.
> 3. The targets where AVX would still be best (SnB and IVB) are
> becoming outdated.
>
> All in all, the code-size saving is worth it.
>
> All string/memory tests pass.
> ---
> Geometric Mean N=40 runs; All functions page aligned
> length, align1, align2, max_char, AVX Time / SSE42 Time
> 1, 1, 1, 127, 0.928
> 2, 2, 2, 127, 0.934
> 3, 3, 3, 127, 0.975
> 4, 4, 4, 127, 0.96
> 5, 5, 5, 127, 0.935
> 6, 6, 6, 127, 0.929
> 7, 7, 7, 127, 0.959
> 8, 0, 0, 127, 0.955
> 9, 1, 1, 127, 0.944
> 10, 2, 2, 127, 0.975
> 11, 3, 3, 127, 0.935
> 12, 4, 4, 127, 0.931
> 13, 5, 5, 127, 0.926
> 14, 6, 6, 127, 0.901
> 15, 7, 7, 127, 0.951
> 4, 0, 0, 127, 0.958
> 4, 0, 0, 254, 0.956
> 8, 0, 0, 254, 0.977
> 16, 0, 0, 127, 0.955
> 16, 0, 0, 254, 0.953
> 32, 0, 0, 127, 0.943
> 32, 0, 0, 254, 0.941
> 64, 0, 0, 127, 0.941
> 64, 0, 0, 254, 0.955
> 128, 0, 0, 127, 0.972
> 128, 0, 0, 254, 0.975
> 256, 0, 0, 127, 0.996
> 256, 0, 0, 254, 0.993
> 512, 0, 0, 127, 0.992
> 512, 0, 0, 254, 0.986
> 1024, 0, 0, 127, 0.994
> 1024, 0, 0, 254, 0.993
> 16, 1, 2, 127, 0.933
> 16, 2, 1, 254, 0.953
> 32, 2, 4, 127, 0.927
> 32, 4, 2, 254, 0.986
> 64, 3, 6, 127, 0.991
> 64, 6, 3, 254, 1.014
> 128, 4, 0, 127, 1.001
> 128, 0, 4, 254, 0.991
> 256, 5, 2, 127, 1.011
> 256, 2, 5, 254, 1.013
> 512, 6, 4, 127, 1.056
> 512, 4, 6, 254, 0.916
> 1024, 7, 6, 127, 1.059
> 1024, 6, 7, 254, 1.043
>
> sysdeps/x86_64/multiarch/Makefile | 2 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 -
> sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 -
> sysdeps/x86_64/multiarch/strcasecmp_l-avx.S | 22 --
> sysdeps/x86_64/multiarch/strcmp-sse42.S | 240 +++++++++-----------
> sysdeps/x86_64/multiarch/strncase_l-avx.S | 22 --
> 6 files changed, 105 insertions(+), 197 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
> delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 35d80dc2ff..6507d1b7fa 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -54,7 +54,6 @@ sysdep_routines += \
> stpncpy-evex \
> stpncpy-sse2-unaligned \
> stpncpy-ssse3 \
> - strcasecmp_l-avx \
> strcasecmp_l-avx2 \
> strcasecmp_l-avx2-rtm \
> strcasecmp_l-evex \
> @@ -95,7 +94,6 @@ sysdep_routines += \
> strlen-avx2-rtm \
> strlen-evex \
> strlen-sse2 \
> - strncase_l-avx \
> strncase_l-avx2 \
> strncase_l-avx2-rtm \
> strncase_l-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index f1a4d3dac2..40cc6cc49e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -447,9 +447,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX2)
> && CPU_FEATURE_USABLE (RTM)),
> __strcasecmp_avx2_rtm)
> - IFUNC_IMPL_ADD (array, i, strcasecmp,
> - CPU_FEATURE_USABLE (AVX),
> - __strcasecmp_avx)
> IFUNC_IMPL_ADD (array, i, strcasecmp,
> CPU_FEATURE_USABLE (SSE4_2),
> __strcasecmp_sse42)
> @@ -471,9 +468,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX2)
> && CPU_FEATURE_USABLE (RTM)),
> __strcasecmp_l_avx2_rtm)
> - IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> - CPU_FEATURE_USABLE (AVX),
> - __strcasecmp_l_avx)
> IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> CPU_FEATURE_USABLE (SSE4_2),
> __strcasecmp_l_sse42)
> @@ -609,9 +603,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX2)
> && CPU_FEATURE_USABLE (RTM)),
> __strncasecmp_avx2_rtm)
> - IFUNC_IMPL_ADD (array, i, strncasecmp,
> - CPU_FEATURE_USABLE (AVX),
> - __strncasecmp_avx)
> IFUNC_IMPL_ADD (array, i, strncasecmp,
> CPU_FEATURE_USABLE (SSE4_2),
> __strncasecmp_sse42)
> @@ -634,9 +625,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX2)
> && CPU_FEATURE_USABLE (RTM)),
> __strncasecmp_l_avx2_rtm)
> - IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> - CPU_FEATURE_USABLE (AVX),
> - __strncasecmp_l_avx)
> IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> CPU_FEATURE_USABLE (SSE4_2),
> __strncasecmp_l_sse42)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index bf0d146e7f..766539c241 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -22,7 +22,6 @@
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> @@ -46,9 +45,6 @@ IFUNC_SELECTOR (void)
> return OPTIMIZE (avx2);
> }
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
> - return OPTIMIZE (avx);
> -
> if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
> && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
> return OPTIMIZE (sse42);
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
> deleted file mode 100644
> index 7ec7c21b5a..0000000000
> --- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
> +++ /dev/null
> @@ -1,22 +0,0 @@
> -/* strcasecmp_l optimized with AVX.
> - Copyright (C) 2017-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#define STRCMP_SSE42 __strcasecmp_l_avx
> -#define USE_AVX 1
> -#define USE_AS_STRCASECMP_L
> -#include "strcmp-sse42.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> index 7805ae9d41..a9178ad25c 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> @@ -41,13 +41,8 @@
> # define UPDATE_STRNCMP_COUNTER
> #endif
>
> -#ifdef USE_AVX
> -# define SECTION avx
> -# define GLABEL(l) l##_avx
> -#else
> -# define SECTION sse4.2
> -# define GLABEL(l) l##_sse42
> -#endif
> +#define SECTION sse4.2
> +#define GLABEL(l) l##_sse42
>
> #define LABEL(l) .L##l
>
> @@ -105,21 +100,7 @@ END (GLABEL(__strncasecmp))
> #endif
>
>
> -#ifdef USE_AVX
> -# define movdqa vmovdqa
> -# define movdqu vmovdqu
> -# define pmovmskb vpmovmskb
> -# define pcmpistri vpcmpistri
> -# define psubb vpsubb
> -# define pcmpeqb vpcmpeqb
> -# define psrldq vpsrldq
> -# define pslldq vpslldq
> -# define palignr vpalignr
> -# define pxor vpxor
> -# define D(arg) arg, arg
> -#else
> -# define D(arg) arg
> -#endif
> +#define arg arg
>
> STRCMP_SSE42:
> cfi_startproc
> @@ -191,18 +172,7 @@ LABEL(case_add):
> movdqu (%rdi), %xmm1
> movdqu (%rsi), %xmm2
> #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> -# ifdef USE_AVX
> -# define TOLOWER(reg1, reg2) \
> - vpaddb LCASE_MIN_reg, reg1, %xmm7; \
> - vpaddb LCASE_MIN_reg, reg2, %xmm8; \
> - vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \
> - vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \
> - vpandn CASE_ADD_reg, %xmm7, %xmm7; \
> - vpandn CASE_ADD_reg, %xmm8, %xmm8; \
> - vpaddb %xmm7, reg1, reg1; \
> - vpaddb %xmm8, reg2, reg2
> -# else
> -# define TOLOWER(reg1, reg2) \
> +# define TOLOWER(reg1, reg2) \
> movdqa LCASE_MIN_reg, %xmm7; \
> movdqa LCASE_MIN_reg, %xmm8; \
> paddb reg1, %xmm7; \
> @@ -213,15 +183,15 @@ LABEL(case_add):
> pandn CASE_ADD_reg, %xmm8; \
> paddb %xmm7, reg1; \
> paddb %xmm8, reg2
> -# endif
> +
> TOLOWER (%xmm1, %xmm2)
> #else
> # define TOLOWER(reg1, reg2)
> #endif
> - pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
> - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
> - pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
> - psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
> + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
> + pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> pmovmskb %xmm1, %edx
> sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
> jnz LABEL(less16bytes)/* If not, find different value or null char */
> @@ -245,7 +215,7 @@ LABEL(crosscache):
> xor %r8d, %r8d
> and $0xf, %ecx /* offset of rsi */
> and $0xf, %eax /* offset of rdi */
> - pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
> + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
> cmp %eax, %ecx
> je LABEL(ashr_0) /* rsi and rdi relative offset same */
> ja LABEL(bigger)
> @@ -259,7 +229,7 @@ LABEL(bigger):
> sub %rcx, %r9
> lea LABEL(unaligned_table)(%rip), %r10
> movslq (%r10, %r9,4), %r9
> - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
> + pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> lea (%r10, %r9), %r10
> _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
>
> @@ -272,15 +242,15 @@ LABEL(bigger):
> LABEL(ashr_0):
>
> movdqa (%rsi), %xmm1
> - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
> + pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
> + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
> #else
> movdqa (%rdi), %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
> + pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
> #endif
> - psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> pmovmskb %xmm1, %r9d
> shr %cl, %edx /* adjust 0xffff for offset */
> shr %cl, %r9d /* adjust for 16-byte offset */
> @@ -360,10 +330,10 @@ LABEL(ashr_0_exit_use):
> */
> .p2align 4
> LABEL(ashr_1):
> - pslldq $15, D(%xmm2) /* shift first string to align with second */
> + pslldq $15, %xmm2 /* shift first string to align with second */
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
> - psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
> + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
> + psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> pmovmskb %xmm2, %r9d
> shr %cl, %edx /* adjust 0xffff for offset */
> shr %cl, %r9d /* adjust for 16-byte offset */
> @@ -391,7 +361,7 @@ LABEL(loop_ashr_1_use):
>
> LABEL(nibble_ashr_1_restart_use):
> movdqa (%rdi, %rdx), %xmm0
> - palignr $1, -16(%rdi, %rdx), D(%xmm0)
> + palignr $1, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> #else
> @@ -410,7 +380,7 @@ LABEL(nibble_ashr_1_restart_use):
> jg LABEL(nibble_ashr_1_use)
>
> movdqa (%rdi, %rdx), %xmm0
> - palignr $1, -16(%rdi, %rdx), D(%xmm0)
> + palignr $1, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> #else
> @@ -430,7 +400,7 @@ LABEL(nibble_ashr_1_restart_use):
> LABEL(nibble_ashr_1_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $1, D(%xmm0)
> + psrldq $1, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> @@ -448,10 +418,10 @@ LABEL(nibble_ashr_1_use):
> */
> .p2align 4
> LABEL(ashr_2):
> - pslldq $14, D(%xmm2)
> + pslldq $14, %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> pmovmskb %xmm2, %r9d
> shr %cl, %edx
> shr %cl, %r9d
> @@ -479,7 +449,7 @@ LABEL(loop_ashr_2_use):
>
> LABEL(nibble_ashr_2_restart_use):
> movdqa (%rdi, %rdx), %xmm0
> - palignr $2, -16(%rdi, %rdx), D(%xmm0)
> + palignr $2, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> #else
> @@ -498,7 +468,7 @@ LABEL(nibble_ashr_2_restart_use):
> jg LABEL(nibble_ashr_2_use)
>
> movdqa (%rdi, %rdx), %xmm0
> - palignr $2, -16(%rdi, %rdx), D(%xmm0)
> + palignr $2, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> #else
> @@ -518,7 +488,7 @@ LABEL(nibble_ashr_2_restart_use):
> LABEL(nibble_ashr_2_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $2, D(%xmm0)
> + psrldq $2, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> @@ -536,10 +506,10 @@ LABEL(nibble_ashr_2_use):
> */
> .p2align 4
> LABEL(ashr_3):
> - pslldq $13, D(%xmm2)
> + pslldq $13, %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> pmovmskb %xmm2, %r9d
> shr %cl, %edx
> shr %cl, %r9d
> @@ -567,7 +537,7 @@ LABEL(loop_ashr_3_use):
>
> LABEL(nibble_ashr_3_restart_use):
> movdqa (%rdi, %rdx), %xmm0
> - palignr $3, -16(%rdi, %rdx), D(%xmm0)
> + palignr $3, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> #else
> @@ -586,7 +556,7 @@ LABEL(nibble_ashr_3_restart_use):
> jg LABEL(nibble_ashr_3_use)
>
> movdqa (%rdi, %rdx), %xmm0
> - palignr $3, -16(%rdi, %rdx), D(%xmm0)
> + palignr $3, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> #else
> @@ -606,7 +576,7 @@ LABEL(nibble_ashr_3_restart_use):
> LABEL(nibble_ashr_3_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $3, D(%xmm0)
> + psrldq $3, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> @@ -624,10 +594,10 @@ LABEL(nibble_ashr_3_use):
> */
> .p2align 4
> LABEL(ashr_4):
> - pslldq $12, D(%xmm2)
> + pslldq $12, %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> pmovmskb %xmm2, %r9d
> shr %cl, %edx
> shr %cl, %r9d
> @@ -656,7 +626,7 @@ LABEL(loop_ashr_4_use):
>
> LABEL(nibble_ashr_4_restart_use):
> movdqa (%rdi, %rdx), %xmm0
> - palignr $4, -16(%rdi, %rdx), D(%xmm0)
> + palignr $4, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> #else
> @@ -675,7 +645,7 @@ LABEL(nibble_ashr_4_restart_use):
> jg LABEL(nibble_ashr_4_use)
>
> movdqa (%rdi, %rdx), %xmm0
> - palignr $4, -16(%rdi, %rdx), D(%xmm0)
> + palignr $4, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> #else
> @@ -695,7 +665,7 @@ LABEL(nibble_ashr_4_restart_use):
> LABEL(nibble_ashr_4_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $4, D(%xmm0)
> + psrldq $4, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> @@ -713,10 +683,10 @@ LABEL(nibble_ashr_4_use):
> */
> .p2align 4
> LABEL(ashr_5):
> - pslldq $11, D(%xmm2)
> + pslldq $11, %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> pmovmskb %xmm2, %r9d
> shr %cl, %edx
> shr %cl, %r9d
> @@ -745,7 +715,7 @@ LABEL(loop_ashr_5_use):
>
> LABEL(nibble_ashr_5_restart_use):
> movdqa (%rdi, %rdx), %xmm0
> - palignr $5, -16(%rdi, %rdx), D(%xmm0)
> + palignr $5, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> #else
> @@ -765,7 +735,7 @@ LABEL(nibble_ashr_5_restart_use):
>
> movdqa (%rdi, %rdx), %xmm0
>
> - palignr $5, -16(%rdi, %rdx), D(%xmm0)
> + palignr $5, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> #else
> @@ -785,7 +755,7 @@ LABEL(nibble_ashr_5_restart_use):
> LABEL(nibble_ashr_5_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $5, D(%xmm0)
> + psrldq $5, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> @@ -803,10 +773,10 @@ LABEL(nibble_ashr_5_use):
> */
> .p2align 4
> LABEL(ashr_6):
> - pslldq $10, D(%xmm2)
> + pslldq $10, %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> pmovmskb %xmm2, %r9d
> shr %cl, %edx
> shr %cl, %r9d
> @@ -835,7 +805,7 @@ LABEL(loop_ashr_6_use):
>
> LABEL(nibble_ashr_6_restart_use):
> movdqa (%rdi, %rdx), %xmm0
> - palignr $6, -16(%rdi, %rdx), D(%xmm0)
> + palignr $6, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> #else
> @@ -854,7 +824,7 @@ LABEL(nibble_ashr_6_restart_use):
> jg LABEL(nibble_ashr_6_use)
>
> movdqa (%rdi, %rdx), %xmm0
> - palignr $6, -16(%rdi, %rdx), D(%xmm0)
> + palignr $6, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> #else
> @@ -874,7 +844,7 @@ LABEL(nibble_ashr_6_restart_use):
> LABEL(nibble_ashr_6_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $6, D(%xmm0)
> + psrldq $6, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> @@ -892,10 +862,10 @@ LABEL(nibble_ashr_6_use):
> */
> .p2align 4
> LABEL(ashr_7):
> - pslldq $9, D(%xmm2)
> + pslldq $9, %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> pmovmskb %xmm2, %r9d
> shr %cl, %edx
> shr %cl, %r9d
> @@ -924,7 +894,7 @@ LABEL(loop_ashr_7_use):
>
> LABEL(nibble_ashr_7_restart_use):
> movdqa (%rdi, %rdx), %xmm0
> - palignr $7, -16(%rdi, %rdx), D(%xmm0)
> + palignr $7, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> #else
> @@ -943,7 +913,7 @@ LABEL(nibble_ashr_7_restart_use):
> jg LABEL(nibble_ashr_7_use)
>
> movdqa (%rdi, %rdx), %xmm0
> - palignr $7, -16(%rdi, %rdx), D(%xmm0)
> + palignr $7, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> #else
> @@ -963,7 +933,7 @@ LABEL(nibble_ashr_7_restart_use):
> LABEL(nibble_ashr_7_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $7, D(%xmm0)
> + psrldq $7, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> @@ -981,10 +951,10 @@ LABEL(nibble_ashr_7_use):
> */
> .p2align 4
> LABEL(ashr_8):
> - pslldq $8, D(%xmm2)
> + pslldq $8, %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> pmovmskb %xmm2, %r9d
> shr %cl, %edx
> shr %cl, %r9d
> @@ -1013,7 +983,7 @@ LABEL(loop_ashr_8_use):
>
> LABEL(nibble_ashr_8_restart_use):
> movdqa (%rdi, %rdx), %xmm0
> - palignr $8, -16(%rdi, %rdx), D(%xmm0)
> + palignr $8, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1032,7 +1002,7 @@ LABEL(nibble_ashr_8_restart_use):
> jg LABEL(nibble_ashr_8_use)
>
> movdqa (%rdi, %rdx), %xmm0
> - palignr $8, -16(%rdi, %rdx), D(%xmm0)
> + palignr $8, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1052,7 +1022,7 @@ LABEL(nibble_ashr_8_restart_use):
> LABEL(nibble_ashr_8_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $8, D(%xmm0)
> + psrldq $8, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> @@ -1070,10 +1040,10 @@ LABEL(nibble_ashr_8_use):
> */
> .p2align 4
> LABEL(ashr_9):
> - pslldq $7, D(%xmm2)
> + pslldq $7, %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> pmovmskb %xmm2, %r9d
> shr %cl, %edx
> shr %cl, %r9d
> @@ -1103,7 +1073,7 @@ LABEL(loop_ashr_9_use):
> LABEL(nibble_ashr_9_restart_use):
> movdqa (%rdi, %rdx), %xmm0
>
> - palignr $9, -16(%rdi, %rdx), D(%xmm0)
> + palignr $9, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1122,7 +1092,7 @@ LABEL(nibble_ashr_9_restart_use):
> jg LABEL(nibble_ashr_9_use)
>
> movdqa (%rdi, %rdx), %xmm0
> - palignr $9, -16(%rdi, %rdx), D(%xmm0)
> + palignr $9, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1142,7 +1112,7 @@ LABEL(nibble_ashr_9_restart_use):
> LABEL(nibble_ashr_9_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $9, D(%xmm0)
> + psrldq $9, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> @@ -1160,10 +1130,10 @@ LABEL(nibble_ashr_9_use):
> */
> .p2align 4
> LABEL(ashr_10):
> - pslldq $6, D(%xmm2)
> + pslldq $6, %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> pmovmskb %xmm2, %r9d
> shr %cl, %edx
> shr %cl, %r9d
> @@ -1192,7 +1162,7 @@ LABEL(loop_ashr_10_use):
>
> LABEL(nibble_ashr_10_restart_use):
> movdqa (%rdi, %rdx), %xmm0
> - palignr $10, -16(%rdi, %rdx), D(%xmm0)
> + palignr $10, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1211,7 +1181,7 @@ LABEL(nibble_ashr_10_restart_use):
> jg LABEL(nibble_ashr_10_use)
>
> movdqa (%rdi, %rdx), %xmm0
> - palignr $10, -16(%rdi, %rdx), D(%xmm0)
> + palignr $10, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1231,7 +1201,7 @@ LABEL(nibble_ashr_10_restart_use):
> LABEL(nibble_ashr_10_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $10, D(%xmm0)
> + psrldq $10, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> @@ -1249,10 +1219,10 @@ LABEL(nibble_ashr_10_use):
> */
> .p2align 4
> LABEL(ashr_11):
> - pslldq $5, D(%xmm2)
> + pslldq $5, %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> pmovmskb %xmm2, %r9d
> shr %cl, %edx
> shr %cl, %r9d
> @@ -1281,7 +1251,7 @@ LABEL(loop_ashr_11_use):
>
> LABEL(nibble_ashr_11_restart_use):
> movdqa (%rdi, %rdx), %xmm0
> - palignr $11, -16(%rdi, %rdx), D(%xmm0)
> + palignr $11, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1300,7 +1270,7 @@ LABEL(nibble_ashr_11_restart_use):
> jg LABEL(nibble_ashr_11_use)
>
> movdqa (%rdi, %rdx), %xmm0
> - palignr $11, -16(%rdi, %rdx), D(%xmm0)
> + palignr $11, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1320,7 +1290,7 @@ LABEL(nibble_ashr_11_restart_use):
> LABEL(nibble_ashr_11_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $11, D(%xmm0)
> + psrldq $11, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> @@ -1338,10 +1308,10 @@ LABEL(nibble_ashr_11_use):
> */
> .p2align 4
> LABEL(ashr_12):
> - pslldq $4, D(%xmm2)
> + pslldq $4, %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> pmovmskb %xmm2, %r9d
> shr %cl, %edx
> shr %cl, %r9d
> @@ -1370,7 +1340,7 @@ LABEL(loop_ashr_12_use):
>
> LABEL(nibble_ashr_12_restart_use):
> movdqa (%rdi, %rdx), %xmm0
> - palignr $12, -16(%rdi, %rdx), D(%xmm0)
> + palignr $12, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1389,7 +1359,7 @@ LABEL(nibble_ashr_12_restart_use):
> jg LABEL(nibble_ashr_12_use)
>
> movdqa (%rdi, %rdx), %xmm0
> - palignr $12, -16(%rdi, %rdx), D(%xmm0)
> + palignr $12, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1409,7 +1379,7 @@ LABEL(nibble_ashr_12_restart_use):
> LABEL(nibble_ashr_12_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $12, D(%xmm0)
> + psrldq $12, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> @@ -1427,10 +1397,10 @@ LABEL(nibble_ashr_12_use):
> */
> .p2align 4
> LABEL(ashr_13):
> - pslldq $3, D(%xmm2)
> + pslldq $3, %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> pmovmskb %xmm2, %r9d
> shr %cl, %edx
> shr %cl, %r9d
> @@ -1460,7 +1430,7 @@ LABEL(loop_ashr_13_use):
>
> LABEL(nibble_ashr_13_restart_use):
> movdqa (%rdi, %rdx), %xmm0
> - palignr $13, -16(%rdi, %rdx), D(%xmm0)
> + palignr $13, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1479,7 +1449,7 @@ LABEL(nibble_ashr_13_restart_use):
> jg LABEL(nibble_ashr_13_use)
>
> movdqa (%rdi, %rdx), %xmm0
> - palignr $13, -16(%rdi, %rdx), D(%xmm0)
> + palignr $13, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1499,7 +1469,7 @@ LABEL(nibble_ashr_13_restart_use):
> LABEL(nibble_ashr_13_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $13, D(%xmm0)
> + psrldq $13, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> @@ -1517,10 +1487,10 @@ LABEL(nibble_ashr_13_use):
> */
> .p2align 4
> LABEL(ashr_14):
> - pslldq $2, D(%xmm2)
> + pslldq $2, %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> pmovmskb %xmm2, %r9d
> shr %cl, %edx
> shr %cl, %r9d
> @@ -1550,7 +1520,7 @@ LABEL(loop_ashr_14_use):
>
> LABEL(nibble_ashr_14_restart_use):
> movdqa (%rdi, %rdx), %xmm0
> - palignr $14, -16(%rdi, %rdx), D(%xmm0)
> + palignr $14, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1569,7 +1539,7 @@ LABEL(nibble_ashr_14_restart_use):
> jg LABEL(nibble_ashr_14_use)
>
> movdqa (%rdi, %rdx), %xmm0
> - palignr $14, -16(%rdi, %rdx), D(%xmm0)
> + palignr $14, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1589,7 +1559,7 @@ LABEL(nibble_ashr_14_restart_use):
> LABEL(nibble_ashr_14_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $14, D(%xmm0)
> + psrldq $14, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> @@ -1607,10 +1577,10 @@ LABEL(nibble_ashr_14_use):
> */
> .p2align 4
> LABEL(ashr_15):
> - pslldq $1, D(%xmm2)
> + pslldq $1, %xmm2
> TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> pmovmskb %xmm2, %r9d
> shr %cl, %edx
> shr %cl, %r9d
> @@ -1642,7 +1612,7 @@ LABEL(loop_ashr_15_use):
>
> LABEL(nibble_ashr_15_restart_use):
> movdqa (%rdi, %rdx), %xmm0
> - palignr $15, -16(%rdi, %rdx), D(%xmm0)
> + palignr $15, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1661,7 +1631,7 @@ LABEL(nibble_ashr_15_restart_use):
> jg LABEL(nibble_ashr_15_use)
>
> movdqa (%rdi, %rdx), %xmm0
> - palignr $15, -16(%rdi, %rdx), D(%xmm0)
> + palignr $15, -16(%rdi, %rdx), %xmm0
> #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> #else
> @@ -1681,7 +1651,7 @@ LABEL(nibble_ashr_15_restart_use):
> LABEL(nibble_ashr_15_use):
> sub $0x1000, %r10
> movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $15, D(%xmm0)
> + psrldq $15, %xmm0
> pcmpistri $0x3a,%xmm0, %xmm0
> #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> cmp %r11, %rcx
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S
> deleted file mode 100644
> index b51b86d223..0000000000
> --- a/sysdeps/x86_64/multiarch/strncase_l-avx.S
> +++ /dev/null
> @@ -1,22 +0,0 @@
> -/* strncasecmp_l optimized with AVX.
> - Copyright (C) 2017-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#define STRCMP_SSE42 __strncasecmp_l_avx
> -#define USE_AVX 1
> -#define USE_AS_STRNCASECMP_L
> -#include "strcmp-sse42.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
2022-03-24 18:59 ` H.J. Lu
@ 2022-03-24 19:18 ` Noah Goldstein
2022-03-24 19:34 ` H.J. Lu
0 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 19:18 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Thu, Mar 24, 2022 at 2:00 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
> > __wcscmp_avx2.
> >
> > All string/memory tests pass.
> > ---
> > sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > index 52ff5ad724..86a86b68e3 100644
> > --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > @@ -122,7 +122,7 @@ ENTRY(STRCMP)
> > are cases where length is large enough that it can never be a
> > bound on valid memory so just use wcscmp. */
> > shrq $56, %rcx
> > - jnz __wcscmp_avx2
> > + jnz OVERFLOW_STRCMP
> >
> > leaq (, %rdx, 4), %rdx
> > # endif
> > --
> > 2.25.1
> >
>
> Isn't it a bug? Is there a glibc bug? Should this also be fixed on release
> branches?
It is a bug, but there is no need for a backport.
>
> --
> H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch
2022-03-24 18:53 ` H.J. Lu
@ 2022-03-24 19:20 ` Noah Goldstein
2022-03-24 19:36 ` H.J. Lu
0 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 19:20 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Thu, Mar 24, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Small code cleanup for size: -53 bytes.
> >
> > Add comment justifying using a branch to do NULL/non-null return.
>
>
> Do you have followup patches to improve its performance? We are
> backporting all x86-64 improvements to Intel release branches:
>
> https://gitlab.com/x86-glibc/glibc/-/wikis/home
>
> Patches without performance improvements are undesirable.
No further changes planned at the moment; the code size savings
seem worth it for master though. Also in favor of adding the comment,
as I think it's non-intuitive.
>
> > All string/memory tests pass and no regressions in benchtests.
> >
> > geometric_mean(N=20) of all benchmarks Original / New: 1.00
> > ---
> > Geomtric Mean N=20 runs; All functions page aligned
> > length, alignment, pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
> > 2048, 0, 32, 0, 23, 127, 1.033
> > 2048, 1, 32, 0, 23, 127, 1.006
> > 2048, 0, 64, 0, 23, 127, 1.02
> > 2048, 2, 64, 0, 23, 127, 0.992
> > 2048, 0, 128, 0, 23, 127, 0.996
> > 2048, 3, 128, 0, 23, 127, 0.966
> > 2048, 0, 256, 0, 23, 127, 0.996
> > 2048, 4, 256, 0, 23, 127, 0.998
> > 2048, 0, 512, 0, 23, 127, 0.991
> > 2048, 5, 512, 0, 23, 127, 0.991
> > 2048, 0, 1024, 0, 23, 127, 0.993
> > 2048, 6, 1024, 0, 23, 127, 0.992
> > 2048, 0, 2048, 0, 23, 127, 0.992
> > 2048, 7, 2048, 0, 23, 127, 0.976
> > 4096, 0, 32, 0, 23, 127, 0.983
> > 4096, 1, 32, 0, 23, 127, 0.994
> > 4096, 0, 64, 0, 23, 127, 0.968
> > 4096, 2, 64, 0, 23, 127, 1.018
> > 4096, 0, 128, 0, 23, 127, 0.99
> > 4096, 3, 128, 0, 23, 127, 1.001
> > 4096, 0, 256, 0, 23, 127, 1.0
> > 4096, 4, 256, 0, 23, 127, 1.001
> > 4096, 0, 512, 0, 23, 127, 0.989
> > 4096, 5, 512, 0, 23, 127, 0.988
> > 4096, 0, 1024, 0, 23, 127, 0.994
> > 4096, 6, 1024, 0, 23, 127, 0.993
> > 4096, 0, 2048, 0, 23, 127, 0.987
> > 4096, 7, 2048, 0, 23, 127, 0.996
> > 256, 1, 64, 0, 23, 127, 1.004
> > 256, 2, 64, 0, 23, 127, 1.004
> > 256, 3, 64, 0, 23, 127, 0.992
> > 256, 4, 64, 0, 23, 127, 1.001
> > 256, 5, 64, 0, 23, 127, 1.001
> > 256, 6, 64, 0, 23, 127, 0.998
> > 256, 7, 64, 0, 23, 127, 0.994
> > 512, 0, 256, 0, 23, 127, 0.999
> > 512, 16, 256, 0, 23, 127, 1.002
> > 512, 32, 256, 0, 23, 127, 0.994
> > 512, 48, 256, 0, 23, 127, 0.991
> > 512, 64, 256, 0, 23, 127, 0.994
> > 512, 80, 256, 0, 23, 127, 0.994
> > 512, 96, 256, 0, 23, 127, 0.996
> > 512, 112, 256, 0, 23, 127, 0.999
> > 1, 0, 0, 0, 23, 127, 0.978
> > 2, 0, 1, 0, 23, 127, 0.981
> > 3, 0, 2, 0, 23, 127, 0.993
> > 4, 0, 3, 0, 23, 127, 1.004
> > 5, 0, 4, 0, 23, 127, 1.002
> > 6, 0, 5, 0, 23, 127, 0.991
> > 7, 0, 6, 0, 23, 127, 0.99
> > 8, 0, 7, 0, 23, 127, 1.012
> > 9, 0, 8, 0, 23, 127, 0.994
> > 10, 0, 9, 0, 23, 127, 1.003
> > 11, 0, 10, 0, 23, 127, 0.999
> > 12, 0, 11, 0, 23, 127, 1.007
> > 13, 0, 12, 0, 23, 127, 1.0
> > 14, 0, 13, 0, 23, 127, 0.997
> > 15, 0, 14, 0, 23, 127, 0.996
> > 16, 0, 15, 0, 23, 127, 0.993
> > 17, 0, 16, 0, 23, 127, 1.002
> > 18, 0, 17, 0, 23, 127, 0.997
> > 19, 0, 18, 0, 23, 127, 0.998
> > 20, 0, 19, 0, 23, 127, 0.994
> > 21, 0, 20, 0, 23, 127, 0.99
> > 22, 0, 21, 0, 23, 127, 0.992
> > 23, 0, 22, 0, 23, 127, 0.996
> > 24, 0, 23, 0, 23, 127, 0.991
> > 25, 0, 24, 0, 23, 127, 0.997
> > 26, 0, 25, 0, 23, 127, 1.011
> > 27, 0, 26, 0, 23, 127, 1.013
> > 28, 0, 27, 0, 23, 127, 0.996
> > 29, 0, 28, 0, 23, 127, 0.993
> > 30, 0, 29, 0, 23, 127, 1.009
> > 31, 0, 30, 0, 23, 127, 1.009
> > 32, 0, 31, 0, 23, 127, 1.008
> > 2048, 0, 32, 0, 0, 127, 1.0
> > 2048, 1, 32, 0, 0, 127, 1.01
> > 2048, 0, 64, 0, 0, 127, 0.997
> > 2048, 2, 64, 0, 0, 127, 1.002
> > 2048, 0, 128, 0, 0, 127, 0.986
> > 2048, 3, 128, 0, 0, 127, 0.997
> > 2048, 0, 256, 0, 0, 127, 1.002
> > 2048, 4, 256, 0, 0, 127, 0.999
> > 2048, 0, 512, 0, 0, 127, 0.991
> > 2048, 5, 512, 0, 0, 127, 0.984
> > 2048, 0, 1024, 0, 0, 127, 0.994
> > 2048, 6, 1024, 0, 0, 127, 0.993
> > 2048, 0, 2048, 0, 0, 127, 0.951
> > 2048, 7, 2048, 0, 0, 127, 0.989
> > 4096, 0, 32, 0, 0, 127, 0.993
> > 4096, 1, 32, 0, 0, 127, 0.997
> > 4096, 0, 64, 0, 0, 127, 1.004
> > 4096, 2, 64, 0, 0, 127, 1.016
> > 4096, 0, 128, 0, 0, 127, 0.973
> > 4096, 3, 128, 0, 0, 127, 1.001
> > 4096, 0, 256, 0, 0, 127, 0.999
> > 4096, 4, 256, 0, 0, 127, 0.998
> > 4096, 0, 512, 0, 0, 127, 0.99
> > 4096, 5, 512, 0, 0, 127, 0.985
> > 4096, 0, 1024, 0, 0, 127, 0.993
> > 4096, 6, 1024, 0, 0, 127, 0.997
> > 4096, 0, 2048, 0, 0, 127, 0.995
> > 4096, 7, 2048, 0, 0, 127, 0.996
> > 256, 1, 64, 0, 0, 127, 1.01
> > 256, 2, 64, 0, 0, 127, 1.024
> > 256, 3, 64, 0, 0, 127, 1.03
> > 256, 4, 64, 0, 0, 127, 1.004
> > 256, 5, 64, 0, 0, 127, 0.998
> > 256, 6, 64, 0, 0, 127, 0.998
> > 256, 7, 64, 0, 0, 127, 0.997
> > 512, 0, 256, 0, 0, 127, 0.996
> > 512, 16, 256, 0, 0, 127, 0.995
> > 512, 32, 256, 0, 0, 127, 0.996
> > 512, 48, 256, 0, 0, 127, 0.992
> > 512, 64, 256, 0, 0, 127, 0.999
> > 512, 80, 256, 0, 0, 127, 1.002
> > 512, 96, 256, 0, 0, 127, 0.999
> > 512, 112, 256, 0, 0, 127, 0.998
> > 1, 0, 0, 0, 0, 127, 1.016
> > 2, 0, 1, 0, 0, 127, 0.998
> > 3, 0, 2, 0, 0, 127, 1.02
> > 4, 0, 3, 0, 0, 127, 1.004
> > 5, 0, 4, 0, 0, 127, 1.021
> > 6, 0, 5, 0, 0, 127, 1.014
> > 7, 0, 6, 0, 0, 127, 1.007
> > 8, 0, 7, 0, 0, 127, 1.016
> > 9, 0, 8, 0, 0, 127, 1.003
> > 10, 0, 9, 0, 0, 127, 1.004
> > 11, 0, 10, 0, 0, 127, 0.995
> > 12, 0, 11, 0, 0, 127, 1.009
> > 13, 0, 12, 0, 0, 127, 1.005
> > 14, 0, 13, 0, 0, 127, 0.987
> > 15, 0, 14, 0, 0, 127, 0.998
> > 16, 0, 15, 0, 0, 127, 1.004
> > 17, 0, 16, 0, 0, 127, 1.01
> > 18, 0, 17, 0, 0, 127, 1.01
> > 19, 0, 18, 0, 0, 127, 1.006
> > 20, 0, 19, 0, 0, 127, 1.012
> > 21, 0, 20, 0, 0, 127, 0.999
> > 22, 0, 21, 0, 0, 127, 1.004
> > 23, 0, 22, 0, 0, 127, 0.988
> > 24, 0, 23, 0, 0, 127, 0.993
> > 25, 0, 24, 0, 0, 127, 1.004
> > 26, 0, 25, 0, 0, 127, 0.99
> > 27, 0, 26, 0, 0, 127, 1.016
> > 28, 0, 27, 0, 0, 127, 0.987
> > 29, 0, 28, 0, 0, 127, 0.989
> > 30, 0, 29, 0, 0, 127, 0.998
> > 31, 0, 30, 0, 0, 127, 1.005
> > 32, 0, 31, 0, 0, 127, 0.993
> >
> > 16, 0, 15, 1, 1, 0, 1.002
> > 16, 0, 15, 1, 0, 0, 1.0
> > 16, 0, 15, 1, 1, 0.1, 1.034
> > 16, 0, 15, 1, 0, 0.1, 1.03
> > 16, 0, 15, 1, 1, 0.25, 0.993
> > 16, 0, 15, 1, 0, 0.25, 1.081
> > 16, 0, 15, 1, 1, 0.33, 0.959
> > 16, 0, 15, 1, 0, 0.33, 1.142
> > 16, 0, 15, 1, 1, 0.5, 0.929
> > 16, 0, 15, 1, 0, 0.5, 1.072
> > 16, 0, 15, 1, 1, 0.66, 0.984
> > 16, 0, 15, 1, 0, 0.66, 1.069
> > 16, 0, 15, 1, 1, 0.75, 0.969
> > 16, 0, 15, 1, 0, 0.75, 1.059
> > 16, 0, 15, 1, 1, 0.9, 0.98
> > 16, 0, 15, 1, 0, 0.9, 0.994
> > 16, 0, 15, 1, 1, 1, 0.993
> > 16, 0, 15, 1, 0, 1, 0.996
> >
> > sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------
> > 1 file changed, 107 insertions(+), 97 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > index 086cabf76a..1a916cc951 100644
> > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > @@ -48,13 +48,13 @@
> > # define PAGE_SIZE 4096
> >
> > .section SECTION(.text),"ax",@progbits
> > -ENTRY (STRCHR)
> > +ENTRY_P2ALIGN (STRCHR, 5)
> > /* Broadcast CHAR to YMM0. */
> > vmovd %esi, %xmm0
> > movl %edi, %eax
> > andl $(PAGE_SIZE - 1), %eax
> > VPBROADCAST %xmm0, %ymm0
> > - vpxor %xmm9, %xmm9, %xmm9
> > + vpxor %xmm1, %xmm1, %xmm1
> >
> > /* Check if we cross page boundary with one vector load. */
> > cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > @@ -62,37 +62,29 @@ ENTRY (STRCHR)
> >
> > /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> > null byte. */
> > - vmovdqu (%rdi), %ymm8
> > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > - vpor %ymm1, %ymm2, %ymm1
> > - vpmovmskb %ymm1, %eax
> > + vmovdqu (%rdi), %ymm2
> > + VPCMPEQ %ymm2, %ymm0, %ymm3
> > + VPCMPEQ %ymm2, %ymm1, %ymm2
> > + vpor %ymm3, %ymm2, %ymm3
> > + vpmovmskb %ymm3, %eax
> > testl %eax, %eax
> > jz L(aligned_more)
> > tzcntl %eax, %eax
> > # ifndef USE_AS_STRCHRNUL
> > - /* Found CHAR or the null byte. */
> > - cmp (%rdi, %rax), %CHAR_REG
> > - jne L(zero)
> > -# endif
> > - addq %rdi, %rax
> > - VZEROUPPER_RETURN
> > -
> > - /* .p2align 5 helps keep performance more consistent if ENTRY()
> > - alignment % 32 was either 16 or 0. As well this makes the
> > - alignment % 32 of the loop_4x_vec fixed which makes tuning it
> > - easier. */
> > - .p2align 5
> > -L(first_vec_x4):
> > - tzcntl %eax, %eax
> > - addq $(VEC_SIZE * 3 + 1), %rdi
> > -# ifndef USE_AS_STRCHRNUL
> > - /* Found CHAR or the null byte. */
> > + /* Found CHAR or the null byte. */
> > cmp (%rdi, %rax), %CHAR_REG
> > + /* NB: Use a branch instead of cmovcc here. The expectation is
> > + that with strchr the user will branch based on input being
> > + null. Since this branch will be 100% predictive of the user
> > + branch a branch miss here should save what otherwise would
> > + be branch miss in the user code. Otherwise using a branch 1)
> > + saves code size and 2) is faster in highly predictable
> > + environments. */
> > jne L(zero)
> > # endif
> > addq %rdi, %rax
> > - VZEROUPPER_RETURN
> > +L(return_vzeroupper):
> > + ZERO_UPPER_VEC_REGISTERS_RETURN
> >
> > # ifndef USE_AS_STRCHRNUL
> > L(zero):
> > @@ -103,7 +95,8 @@ L(zero):
> >
> > .p2align 4
> > L(first_vec_x1):
> > - tzcntl %eax, %eax
> > + /* Use bsf to save code size. */
> > + bsfl %eax, %eax
> > incq %rdi
> > # ifndef USE_AS_STRCHRNUL
> > /* Found CHAR or the null byte. */
> > @@ -113,9 +106,10 @@ L(first_vec_x1):
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> >
> > - .p2align 4
> > + .p2align 4,, 10
> > L(first_vec_x2):
> > - tzcntl %eax, %eax
> > + /* Use bsf to save code size. */
> > + bsfl %eax, %eax
> > addq $(VEC_SIZE + 1), %rdi
> > # ifndef USE_AS_STRCHRNUL
> > /* Found CHAR or the null byte. */
> > @@ -125,9 +119,10 @@ L(first_vec_x2):
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> >
> > - .p2align 4
> > + .p2align 4,, 8
> > L(first_vec_x3):
> > - tzcntl %eax, %eax
> > + /* Use bsf to save code size. */
> > + bsfl %eax, %eax
> > addq $(VEC_SIZE * 2 + 1), %rdi
> > # ifndef USE_AS_STRCHRNUL
> > /* Found CHAR or the null byte. */
> > @@ -137,6 +132,21 @@ L(first_vec_x3):
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> >
> > + .p2align 4,, 10
> > +L(first_vec_x4):
> > + /* Use bsf to save code size. */
> > + bsfl %eax, %eax
> > + addq $(VEC_SIZE * 3 + 1), %rdi
> > +# ifndef USE_AS_STRCHRNUL
> > + /* Found CHAR or the null byte. */
> > + cmp (%rdi, %rax), %CHAR_REG
> > + jne L(zero)
> > +# endif
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> > +
> > +
> > +
> > .p2align 4
> > L(aligned_more):
> > /* Align data to VEC_SIZE - 1. This is the same number of
> > @@ -146,90 +156,92 @@ L(aligned_more):
> > L(cross_page_continue):
> > /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > since data is only aligned to VEC_SIZE. */
> > - vmovdqa 1(%rdi), %ymm8
> > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > - vpor %ymm1, %ymm2, %ymm1
> > - vpmovmskb %ymm1, %eax
> > + vmovdqa 1(%rdi), %ymm2
> > + VPCMPEQ %ymm2, %ymm0, %ymm3
> > + VPCMPEQ %ymm2, %ymm1, %ymm2
> > + vpor %ymm3, %ymm2, %ymm3
> > + vpmovmskb %ymm3, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x1)
> >
> > - vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
> > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > - vpor %ymm1, %ymm2, %ymm1
> > - vpmovmskb %ymm1, %eax
> > + vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2
> > + VPCMPEQ %ymm2, %ymm0, %ymm3
> > + VPCMPEQ %ymm2, %ymm1, %ymm2
> > + vpor %ymm3, %ymm2, %ymm3
> > + vpmovmskb %ymm3, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x2)
> >
> > - vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
> > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > - vpor %ymm1, %ymm2, %ymm1
> > - vpmovmskb %ymm1, %eax
> > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2
> > + VPCMPEQ %ymm2, %ymm0, %ymm3
> > + VPCMPEQ %ymm2, %ymm1, %ymm2
> > + vpor %ymm3, %ymm2, %ymm3
> > + vpmovmskb %ymm3, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x3)
> >
> > - vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
> > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > - vpor %ymm1, %ymm2, %ymm1
> > - vpmovmskb %ymm1, %eax
> > + vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2
> > + VPCMPEQ %ymm2, %ymm0, %ymm3
> > + VPCMPEQ %ymm2, %ymm1, %ymm2
> > + vpor %ymm3, %ymm2, %ymm3
> > + vpmovmskb %ymm3, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x4)
> > - /* Align data to VEC_SIZE * 4 - 1. */
> > - addq $(VEC_SIZE * 4 + 1), %rdi
> > - andq $-(VEC_SIZE * 4), %rdi
> > + /* Align data to VEC_SIZE * 4 - 1. */
> > + incq %rdi
> > + orq $(VEC_SIZE * 4 - 1), %rdi
> > .p2align 4
> > L(loop_4x_vec):
> > /* Compare 4 * VEC at a time forward. */
> > - vmovdqa (%rdi), %ymm5
> > - vmovdqa (VEC_SIZE)(%rdi), %ymm6
> > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > + vmovdqa 1(%rdi), %ymm6
> > + vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7
> >
> > /* Leaves only CHARS matching esi as 0. */
> > - vpxor %ymm5, %ymm0, %ymm1
> > vpxor %ymm6, %ymm0, %ymm2
> > vpxor %ymm7, %ymm0, %ymm3
> > - vpxor %ymm8, %ymm0, %ymm4
> >
> > - VPMINU %ymm1, %ymm5, %ymm1
> > VPMINU %ymm2, %ymm6, %ymm2
> > VPMINU %ymm3, %ymm7, %ymm3
> > - VPMINU %ymm4, %ymm8, %ymm4
> >
> > - VPMINU %ymm1, %ymm2, %ymm5
> > - VPMINU %ymm3, %ymm4, %ymm6
> > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6
> > + vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7
> > +
> > + vpxor %ymm6, %ymm0, %ymm4
> > + vpxor %ymm7, %ymm0, %ymm5
> > +
> > + VPMINU %ymm4, %ymm6, %ymm4
> > + VPMINU %ymm5, %ymm7, %ymm5
> >
> > - VPMINU %ymm5, %ymm6, %ymm6
> > + VPMINU %ymm2, %ymm3, %ymm6
> > + VPMINU %ymm4, %ymm5, %ymm7
> >
> > - VPCMPEQ %ymm6, %ymm9, %ymm6
> > - vpmovmskb %ymm6, %ecx
> > + VPMINU %ymm6, %ymm7, %ymm7
> > +
> > + VPCMPEQ %ymm7, %ymm1, %ymm7
> > + vpmovmskb %ymm7, %ecx
> > subq $-(VEC_SIZE * 4), %rdi
> > testl %ecx, %ecx
> > jz L(loop_4x_vec)
> >
> > -
> > - VPCMPEQ %ymm1, %ymm9, %ymm1
> > - vpmovmskb %ymm1, %eax
> > + VPCMPEQ %ymm2, %ymm1, %ymm2
> > + vpmovmskb %ymm2, %eax
> > testl %eax, %eax
> > jnz L(last_vec_x0)
> >
> >
> > - VPCMPEQ %ymm5, %ymm9, %ymm2
> > - vpmovmskb %ymm2, %eax
> > + VPCMPEQ %ymm3, %ymm1, %ymm3
> > + vpmovmskb %ymm3, %eax
> > testl %eax, %eax
> > jnz L(last_vec_x1)
> >
> > - VPCMPEQ %ymm3, %ymm9, %ymm3
> > - vpmovmskb %ymm3, %eax
> > + VPCMPEQ %ymm4, %ymm1, %ymm4
> > + vpmovmskb %ymm4, %eax
> > /* rcx has combined result from all 4 VEC. It will only be used
> > if the first 3 other VEC all did not contain a match. */
> > salq $32, %rcx
> > orq %rcx, %rax
> > tzcntq %rax, %rax
> > - subq $(VEC_SIZE * 2), %rdi
> > + subq $(VEC_SIZE * 2 - 1), %rdi
> > # ifndef USE_AS_STRCHRNUL
> > /* Found CHAR or the null byte. */
> > cmp (%rdi, %rax), %CHAR_REG
> > @@ -239,10 +251,11 @@ L(loop_4x_vec):
> > VZEROUPPER_RETURN
> >
> >
> > - .p2align 4
> > + .p2align 4,, 10
> > L(last_vec_x0):
> > - tzcntl %eax, %eax
> > - addq $-(VEC_SIZE * 4), %rdi
> > + /* Use bsf to save code size. */
> > + bsfl %eax, %eax
> > + addq $-(VEC_SIZE * 4 - 1), %rdi
> > # ifndef USE_AS_STRCHRNUL
> > /* Found CHAR or the null byte. */
> > cmp (%rdi, %rax), %CHAR_REG
> > @@ -251,16 +264,11 @@ L(last_vec_x0):
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> >
> > -# ifndef USE_AS_STRCHRNUL
> > -L(zero_end):
> > - xorl %eax, %eax
> > - VZEROUPPER_RETURN
> > -# endif
> >
> > - .p2align 4
> > + .p2align 4,, 10
> > L(last_vec_x1):
> > tzcntl %eax, %eax
> > - subq $(VEC_SIZE * 3), %rdi
> > + subq $(VEC_SIZE * 3 - 1), %rdi
> > # ifndef USE_AS_STRCHRNUL
> > /* Found CHAR or the null byte. */
> > cmp (%rdi, %rax), %CHAR_REG
> > @@ -269,18 +277,23 @@ L(last_vec_x1):
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> >
> > +# ifndef USE_AS_STRCHRNUL
> > +L(zero_end):
> > + xorl %eax, %eax
> > + VZEROUPPER_RETURN
> > +# endif
> >
> > /* Cold case for crossing page with first load. */
> > - .p2align 4
> > + .p2align 4,, 8
> > L(cross_page_boundary):
> > movq %rdi, %rdx
> > /* Align rdi to VEC_SIZE - 1. */
> > orq $(VEC_SIZE - 1), %rdi
> > - vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
> > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > - vpor %ymm1, %ymm2, %ymm1
> > - vpmovmskb %ymm1, %eax
> > + vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2
> > + VPCMPEQ %ymm2, %ymm0, %ymm3
> > + VPCMPEQ %ymm2, %ymm1, %ymm2
> > + vpor %ymm3, %ymm2, %ymm3
> > + vpmovmskb %ymm3, %eax
> > /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> > so no need to manually mod edx. */
> > sarxl %edx, %eax, %eax
> > @@ -291,13 +304,10 @@ L(cross_page_boundary):
> > xorl %ecx, %ecx
> > /* Found CHAR or the null byte. */
> > cmp (%rdx, %rax), %CHAR_REG
> > - leaq (%rdx, %rax), %rax
> > - cmovne %rcx, %rax
> > -# else
> > - addq %rdx, %rax
> > + jne L(zero_end)
> > # endif
> > -L(return_vzeroupper):
> > - ZERO_UPPER_VEC_REGISTERS_RETURN
> > + addq %rdx, %rax
> > + VZEROUPPER_RETURN
> >
> > END (STRCHR)
> > -# endif
> > +#endif
> > --
> > 2.25.1
> >
>
>
> --
> H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
2022-03-24 19:18 ` Noah Goldstein
@ 2022-03-24 19:34 ` H.J. Lu
2022-03-24 19:39 ` Noah Goldstein
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:34 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Mar 24, 2022 at 12:18 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Mar 24, 2022 at 2:00 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
> > > __wcscmp_avx2.
> > >
> > > All string/memory tests pass.
> > > ---
> > > sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
> > > 1 file changed, 1 insertion(+), 1 deletion(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > index 52ff5ad724..86a86b68e3 100644
> > > --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > @@ -122,7 +122,7 @@ ENTRY(STRCMP)
> > > are cases where length is large enough that it can never be a
> > > bound on valid memory so just use wcscmp. */
> > > shrq $56, %rcx
> > > - jnz __wcscmp_avx2
> > > + jnz OVERFLOW_STRCMP
> > >
> > > leaq (, %rdx, 4), %rdx
> > > # endif
> > > --
> > > 2.25.1
> > >
> >
> > Isn't it a bug? Is there a glibc bug? Should this also be fixed on release
> > branches?
>
> It is bug but no need for backport.
Why no need for backport? Is there a testcase?
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch
2022-03-24 19:20 ` Noah Goldstein
@ 2022-03-24 19:36 ` H.J. Lu
2022-05-12 19:31 ` Sunil Pandey
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 19:36 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Mar 24, 2022 at 12:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Mar 24, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > Small code cleanup for size: -53 bytes.
> > >
> > > Add comment justifying using a branch to do NULL/non-null return.
> >
> >
> > Do you have followup patches to improve its performance? We are
> > backporting all x86-64 improvements to Intel release branches:
> >
> > https://gitlab.com/x86-glibc/glibc/-/wikis/home
> >
> > Patches without performance improvements are undesirable.
>
> No further changes planned at the moment, code size saves
> seem worth it for master though. Also in favor of adding the comment
> as I think its non-intuitive.
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
2022-03-24 19:34 ` H.J. Lu
@ 2022-03-24 19:39 ` Noah Goldstein
0 siblings, 0 replies; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 19:39 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Thu, Mar 24, 2022 at 2:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Mar 24, 2022 at 12:18 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 2:00 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
> > > > __wcscmp_avx2.
> > > >
> > > > All string/memory tests pass.
> > > > ---
> > > > sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
> > > > 1 file changed, 1 insertion(+), 1 deletion(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > > index 52ff5ad724..86a86b68e3 100644
> > > > --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > > +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > > @@ -122,7 +122,7 @@ ENTRY(STRCMP)
> > > > are cases where length is large enough that it can never be a
> > > > bound on valid memory so just use wcscmp. */
> > > > shrq $56, %rcx
> > > > - jnz __wcscmp_avx2
> > > > + jnz OVERFLOW_STRCMP
> > > >
> > > > leaq (, %rdx, 4), %rdx
> > > > # endif
> > > > --
> > > > 2.25.1
> > > >
> > >
> > > Isn't it a bug? Is there a glibc bug? Should this also be fixed on release
> > > branches?
> >
> > It is bug but no need for backport.
>
> Why no need for backport? Is there a testcase?
Oh no, you're right. It needs to be backported. I had thought it was a different
commit that introduced it.
Sorry, I'll update the commit message with more info, ping on the bugzilla,
and add a test case.
Going to push the rest of the patchset, will add v2 for this shortly.
>
> --
> H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v2 12/31] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
2022-03-23 21:57 ` [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] Noah Goldstein
2022-03-24 18:59 ` H.J. Lu
@ 2022-03-24 20:50 ` Noah Goldstein
2022-03-24 21:26 ` H.J. Lu
1 sibling, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 20:50 UTC (permalink / raw)
To: libc-alpha
Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
__wcscmp_avx2.
commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sun Jan 9 16:02:21 2022 -0600
x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set
to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which
can cause spurious aborts.
This change will need to be backported.
All string/memory tests pass.
---
sysdeps/x86/tst-strncmp-rtm.c | 15 +++++++++++++++
sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
2 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
index 300bc8c281..a3b14e72ff 100644
--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -70,6 +70,16 @@ function_overflow (void)
return 1;
}
+__attribute__ ((noinline, noclone))
+static int
+function_overflow2 (void)
+{
+ if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
+ return 0;
+ else
+ return 1;
+}
+
static int
do_test (void)
{
@@ -77,5 +87,10 @@ do_test (void)
if (status != EXIT_SUCCESS)
return status;
status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
+ if (status != EXIT_SUCCESS)
+ return status;
+ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
+ if (status != EXIT_SUCCESS)
+ return status;
return status;
}
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 52ff5ad724..86a86b68e3 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -122,7 +122,7 @@ ENTRY(STRCMP)
are cases where length is large enough that it can never be a
bound on valid memory so just use wcscmp. */
shrq $56, %rcx
- jnz __wcscmp_avx2
+ jnz OVERFLOW_STRCMP
leaq (, %rdx, 4), %rdx
# endif
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v2 12/31] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
2022-03-24 20:50 ` [PATCH v2 12/31] " Noah Goldstein
@ 2022-03-24 21:26 ` H.J. Lu
2022-03-24 21:43 ` Noah Goldstein
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 21:26 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Mar 24, 2022 at 1:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
> __wcscmp_avx2.
>
> commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
> Author: Noah Goldstein <goldstein.w.n@gmail.com>
> Date: Sun Jan 9 16:02:21 2022 -0600
>
> x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
>
> Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set
> to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which
> can cause spurious aborts.
>
> This change will need to be backported.
>
> All string/memory tests pass.
> ---
> sysdeps/x86/tst-strncmp-rtm.c | 15 +++++++++++++++
> sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
> 2 files changed, 16 insertions(+), 1 deletion(-)
>
> diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
> index 300bc8c281..a3b14e72ff 100644
> --- a/sysdeps/x86/tst-strncmp-rtm.c
> +++ b/sysdeps/x86/tst-strncmp-rtm.c
> @@ -70,6 +70,16 @@ function_overflow (void)
> return 1;
> }
>
> +__attribute__ ((noinline, noclone))
> +static int
> +function_overflow2 (void)
> +{
> + if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
> + return 0;
> + else
> + return 1;
> +}
> +
> static int
> do_test (void)
> {
> @@ -77,5 +87,10 @@ do_test (void)
> if (status != EXIT_SUCCESS)
> return status;
> status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
> + if (status != EXIT_SUCCESS)
> + return status;
> + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
> + if (status != EXIT_SUCCESS)
> + return status;
> return status;
> }
> diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> index 52ff5ad724..86a86b68e3 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> @@ -122,7 +122,7 @@ ENTRY(STRCMP)
> are cases where length is large enough that it can never be a
> bound on valid memory so just use wcscmp. */
> shrq $56, %rcx
> - jnz __wcscmp_avx2
> + jnz OVERFLOW_STRCMP
>
> leaq (, %rdx, 4), %rdx
> # endif
> --
> 2.25.1
>
LGTM. Verified on AVX2 machine with RTM. Without the fix,
the new testcase failed.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v2 12/31] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
2022-03-24 21:26 ` H.J. Lu
@ 2022-03-24 21:43 ` Noah Goldstein
2022-03-24 21:58 ` H.J. Lu
0 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 21:43 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Thu, Mar 24, 2022 at 4:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Mar 24, 2022 at 1:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
> > __wcscmp_avx2.
> >
> > commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
> > Author: Noah Goldstein <goldstein.w.n@gmail.com>
> > Date: Sun Jan 9 16:02:21 2022 -0600
> >
> > x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
> >
> > Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set
> > to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which
> > can cause spurious aborts.
> >
> > This change will need to be backported.
> >
> > All string/memory tests pass.
> > ---
> > sysdeps/x86/tst-strncmp-rtm.c | 15 +++++++++++++++
> > sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
> > 2 files changed, 16 insertions(+), 1 deletion(-)
> >
> > diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
> > index 300bc8c281..a3b14e72ff 100644
> > --- a/sysdeps/x86/tst-strncmp-rtm.c
> > +++ b/sysdeps/x86/tst-strncmp-rtm.c
> > @@ -70,6 +70,16 @@ function_overflow (void)
> > return 1;
> > }
> >
> > +__attribute__ ((noinline, noclone))
> > +static int
> > +function_overflow2 (void)
> > +{
> > + if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
> > + return 0;
> > + else
> > + return 1;
> > +}
> > +
> > static int
> > do_test (void)
> > {
> > @@ -77,5 +87,10 @@ do_test (void)
> > if (status != EXIT_SUCCESS)
> > return status;
> > status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
> > + if (status != EXIT_SUCCESS)
> > + return status;
> > + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
> > + if (status != EXIT_SUCCESS)
> > + return status;
> > return status;
> > }
> > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > index 52ff5ad724..86a86b68e3 100644
> > --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > @@ -122,7 +122,7 @@ ENTRY(STRCMP)
> > are cases where length is large enough that it can never be a
> > bound on valid memory so just use wcscmp. */
> > shrq $56, %rcx
> > - jnz __wcscmp_avx2
> > + jnz OVERFLOW_STRCMP
> >
> > leaq (, %rdx, 4), %rdx
> > # endif
> > --
> > 2.25.1
> >
>
> LGTM. Verified on AVX2 machine with RTM. Without the fix,
> the new testcase failed.
And that w/ the fix it passes?
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v2 12/31] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
2022-03-24 21:43 ` Noah Goldstein
@ 2022-03-24 21:58 ` H.J. Lu
2022-05-04 6:05 ` Sunil Pandey
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-24 21:58 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Mar 24, 2022 at 2:43 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Mar 24, 2022 at 4:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 1:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
> > > __wcscmp_avx2.
> > >
> > > commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
> > > Author: Noah Goldstein <goldstein.w.n@gmail.com>
> > > Date: Sun Jan 9 16:02:21 2022 -0600
> > >
> > > x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
> > >
> > > Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set
> > > to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which
> > > can cause spurious aborts.
> > >
> > > This change will need to be backported.
> > >
> > > All string/memory tests pass.
> > > ---
> > > sysdeps/x86/tst-strncmp-rtm.c | 15 +++++++++++++++
> > > sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
> > > 2 files changed, 16 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
> > > index 300bc8c281..a3b14e72ff 100644
> > > --- a/sysdeps/x86/tst-strncmp-rtm.c
> > > +++ b/sysdeps/x86/tst-strncmp-rtm.c
> > > @@ -70,6 +70,16 @@ function_overflow (void)
> > > return 1;
> > > }
> > >
> > > +__attribute__ ((noinline, noclone))
> > > +static int
> > > +function_overflow2 (void)
> > > +{
> > > + if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
> > > + return 0;
> > > + else
> > > + return 1;
> > > +}
> > > +
> > > static int
> > > do_test (void)
> > > {
> > > @@ -77,5 +87,10 @@ do_test (void)
> > > if (status != EXIT_SUCCESS)
> > > return status;
> > > status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
> > > + if (status != EXIT_SUCCESS)
> > > + return status;
> > > + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
> > > + if (status != EXIT_SUCCESS)
> > > + return status;
> > > return status;
> > > }
> > > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > index 52ff5ad724..86a86b68e3 100644
> > > --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > @@ -122,7 +122,7 @@ ENTRY(STRCMP)
> > > are cases where length is large enough that it can never be a
> > > bound on valid memory so just use wcscmp. */
> > > shrq $56, %rcx
> > > - jnz __wcscmp_avx2
> > > + jnz OVERFLOW_STRCMP
> > >
> > > leaq (, %rdx, 4), %rdx
> > > # endif
> > > --
> > > 2.25.1
> > >
> >
> > LGTM. Verified on AVX2 machine with RTM. Without the fix,
> > the new testcase failed.
>
> And that w/ the fix it passes?
Yes.
> >
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> >
> > Thanks.
> >
> > --
> > H.J.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v3 21/23] x86: Add AVX2 optimized str{n}casecmp
2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
2022-03-24 19:03 ` H.J. Lu
@ 2022-03-24 22:41 ` Noah Goldstein
2022-03-24 22:41 ` [PATCH v3 22/23] x86: Add EVEX " Noah Goldstein
2022-03-24 23:56 ` [PATCH v4 21/23] x86: Add AVX2 " Noah Goldstein
3 siblings, 0 replies; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 22:41 UTC (permalink / raw)
To: libc-alpha
geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
Double checked: strcasecmp_l-* is the proper
fallback for strncasecmp*. Also added a comment that
LOCALE_REG needs to be preserved until we finish
the fallback logic.
sysdeps/x86_64/multiarch/Makefile | 4 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 +
.../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++
sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++
sysdeps/x86_64/multiarch/strcmp-avx2.S | 233 +++++++++++++++---
.../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++
sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++
8 files changed, 327 insertions(+), 31 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7b413edad..06e1848823 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -55,6 +55,8 @@ sysdep_routines += \
stpncpy-sse2-unaligned \
stpncpy-ssse3 \
strcasecmp_l-avx \
+ strcasecmp_l-avx2 \
+ strcasecmp_l-avx2-rtm \
strcasecmp_l-sse2 \
strcasecmp_l-sse4_2 \
strcasecmp_l-ssse3 \
@@ -93,6 +95,8 @@ sysdep_routines += \
strlen-evex \
strlen-sse2 \
strncase_l-avx \
+ strncase_l-avx2 \
+ strncase_l-avx2-rtm \
strncase_l-sse2 \
strncase_l-sse4_2 \
strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a594f4176e..3c556d07ac 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strcasecmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strcasecmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (AVX),
__strcasecmp_avx)
@@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strcasecmp_l_avx2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strcasecmp_l_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcasecmp_l,
CPU_FEATURE_USABLE (AVX),
__strcasecmp_l_avx)
@@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strncasecmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strncasecmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (AVX),
__strncasecmp_avx)
@@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strncasecmp_l_avx2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strncasecmp_l_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncasecmp_l,
CPU_FEATURE_USABLE (AVX),
__strncasecmp_l_avx)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 9e3cc61ac0..c4de111fd0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
+
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
return OPTIMIZE (avx);
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
new file mode 100644
index 0000000000..09957fc3c5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
@@ -0,0 +1,15 @@
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x) x ## _rtm
+#define GLABEL(x) _GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
new file mode 100644
index 0000000000..e2762f2a22
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with AVX2.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 86a86b68e3..e16cc2378c 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -20,6 +20,10 @@
# include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# endif
+
# ifndef STRCMP
# define STRCMP __strcmp_avx2
# endif
@@ -74,13 +78,88 @@
# define VEC_OFFSET (-VEC_SIZE)
# endif
+# ifdef USE_AS_STRCASECMP_L
+# define BYTE_LOOP_REG OFFSET_REG
+# else
+# define BYTE_LOOP_REG ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRNCMP
+# define STRCASECMP __strncasecmp_avx2
+# define LOCALE_REG rcx
+# define LOCALE_REG_LP RCX_LP
+# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
+# else
+# define STRCASECMP __strcasecmp_avx2
+# define LOCALE_REG rdx
+# define LOCALE_REG_LP RDX_LP
+# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
+# endif
+# endif
+
# define xmmZERO xmm15
# define ymmZERO ymm15
+# define LCASE_MIN_ymm %ymm10
+# define LCASE_MAX_ymm %ymm11
+# define CASE_ADD_ymm %ymm12
+
+# define LCASE_MIN_xmm %xmm10
+# define LCASE_MAX_xmm %xmm11
+# define CASE_ADD_xmm %xmm12
+
+	/* r11 is never used elsewhere so this is safe to maintain.  */
+# define TOLOWER_BASE %r11
+
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
+# ifdef USE_AS_STRCASECMP_L
+# define REG(x, y) x ## y
+# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
+ vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
+ vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
+ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
+ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
+ vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
+ vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
+ vpaddb REG(%ext, 8), reg1_in, reg1_out; \
+ vpaddb REG(%ext, 9), reg2_in, reg2_out
+
+# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
+# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)
+
+# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
+ TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
+ VPCMPEQ scratch_reg, s2_reg, reg_out
+
+# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
+ VMOVU s2_mem, reg_out; \
+ CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+# define TOLOWER_gpr(...)
+# define TOLOWER_ymm(...)
+# define TOLOWER_xmm(...)
+
+# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
+ VPCMPEQ s2_reg, s1_reg, reg_out
+
+# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -102,8 +181,49 @@
returned. */
.section SECTION(.text), "ax", @progbits
-ENTRY(STRCMP)
+ .align 16
+ .type STRCMP, @function
+ .globl STRCMP
+ .hidden STRCMP
+
+# ifndef GLABEL
+# define GLABEL(...) __VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
+ mov %fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending if CET is enabled).  */
+ .p2align 4
+END (GLABEL(STRCASECMP))
+ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
+# endif
+
+ .p2align 4
+STRCMP:
+ cfi_startproc
+ _CET_ENDBR
+ CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales with
+ encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+# else
+ mov (%LOCALE_REG), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+ jne STRCASECMP_NONASCII
+ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
# ifdef USE_AS_STRNCMP
+	/* Don't overwrite LOCALE_REG (rcx) until we have passed
+ L(one_or_less). Otherwise we might use the wrong locale in
+ the OVERFLOW_STRCMP (strcasecmp_l). */
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -128,6 +248,30 @@ ENTRY(STRCMP)
# endif
# endif
vpxor %xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+ .section .rodata.cst32, "aM", @progbits, 32
+ .align 32
+L(lcase_min):
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+L(case_add):
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+
+ vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
+ vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
+ vmovdqa L(case_add)(%rip), CASE_ADD_ymm
+# endif
movl %edi, %eax
orl %esi, %eax
sall $20, %eax
@@ -138,8 +282,10 @@ ENTRY(STRCMP)
L(no_page_cross):
/* Safe to compare 4x vectors. */
VMOVU (%rdi), %ymm0
- /* 1s where s1 and s2 equal. */
- VPCMPEQ (%rsi), %ymm0, %ymm1
+ /* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp.
+ Otherwise converts ymm0 and load from rsi to lower. ymm2 is
+ scratch and ymm1 is the return. */
+ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
/* 1s at null CHAR. */
VPCMPEQ %ymm0, %ymmZERO, %ymm2
/* 1s where s1 and s2 equal AND not null CHAR. */
@@ -172,6 +318,8 @@ L(return_vec_0):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret0):
@@ -207,6 +355,8 @@ L(one_or_less):
# else
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret1):
@@ -234,6 +384,8 @@ L(return_vec_1):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret2):
@@ -265,6 +417,8 @@ L(return_vec_2):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret3):
@@ -285,6 +439,8 @@ L(return_vec_3):
# else
movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret4):
@@ -295,7 +451,7 @@ L(ret4):
L(more_3x_vec):
/* Safe to compare 4x vectors. */
VMOVU VEC_SIZE(%rdi), %ymm0
- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -308,7 +464,7 @@ L(more_3x_vec):
# endif
VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -316,7 +472,7 @@ L(more_3x_vec):
jnz L(return_vec_2)
VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -391,12 +547,10 @@ L(loop_skip_page_cross_check):
VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */
- VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
-
- VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
-
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+ CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
zero. */
@@ -465,6 +619,8 @@ L(return_vec_2_3_end):
# else
movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -508,6 +664,8 @@ L(return_vec_0_end):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -530,6 +688,8 @@ L(return_vec_1_end):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -556,6 +716,8 @@ L(return_vec_2_end):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -583,7 +745,7 @@ L(page_cross_during_loop):
jle L(less_1x_vec_till_page_cross)
VMOVA (%rdi), %ymm0
- VPCMPEQ (%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -605,7 +767,7 @@ L(less_1x_vec_till_page_cross):
here, it means the previous page (rdi - VEC_SIZE) has already
been loaded earlier so must be valid. */
VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
- VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -647,6 +809,8 @@ L(return_page_cross_cmp_mem):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -673,7 +837,7 @@ L(more_2x_vec_till_page_cross):
iteration here. */
VMOVU VEC_SIZE(%rdi), %ymm0
- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -689,7 +853,7 @@ L(more_2x_vec_till_page_cross):
/* Safe to include comparisons from lower bytes. */
VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
- VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -697,7 +861,7 @@ L(more_2x_vec_till_page_cross):
jnz L(return_vec_page_cross_0)
VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
- VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -715,8 +879,8 @@ L(more_2x_vec_till_page_cross):
VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
+ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
vpand %ymm4, %ymm5, %ymm5
vpand %ymm6, %ymm7, %ymm7
VPMINU %ymm5, %ymm7, %ymm7
@@ -767,6 +931,8 @@ L(return_vec_page_cross_1):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -822,7 +988,7 @@ L(page_cross):
L(page_cross_loop):
VMOVU (%rdi, %OFFSET_REG64), %ymm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -840,11 +1006,11 @@ L(page_cross_loop):
subl %eax, %OFFSET_REG
/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
to not cross page so is safe to load. Since we have already
- loaded at least 1 VEC from rsi it is also guranteed to be safe.
- */
+	   loaded at least 1 VEC from rsi it is also guaranteed to be
+ safe. */
VMOVU (%rdi, %OFFSET_REG64), %ymm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -877,6 +1043,8 @@ L(ret_vec_page_cross_cont):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -930,7 +1098,7 @@ L(less_1x_vec_till_page):
ja L(less_16_till_page)
VMOVU (%rdi), %xmm0
- VPCMPEQ (%rsi), %xmm0, %xmm1
+ CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
VPCMPEQ %xmm0, %xmmZERO, %xmm2
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
@@ -948,7 +1116,7 @@ L(less_1x_vec_till_page):
# endif
VMOVU (%rdi, %OFFSET_REG64), %xmm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1
+ CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
VPCMPEQ %xmm0, %xmmZERO, %xmm2
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
@@ -986,7 +1154,7 @@ L(less_16_till_page):
vmovq (%rdi), %xmm0
vmovq (%rsi), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
incb %cl
@@ -1006,7 +1174,7 @@ L(less_16_till_page):
vmovq (%rdi, %OFFSET_REG64), %xmm0
vmovq (%rsi, %OFFSET_REG64), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
incb %cl
@@ -1062,7 +1230,7 @@ L(ret_less_8_wcs):
vmovd (%rdi), %xmm0
vmovd (%rsi), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
subl $0xf, %ecx
@@ -1081,7 +1249,7 @@ L(ret_less_8_wcs):
vmovd (%rdi, %OFFSET_REG64), %xmm0
vmovd (%rsi, %OFFSET_REG64), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
subl $0xf, %ecx
@@ -1115,7 +1283,9 @@ L(less_4_till_page):
L(less_4_loop):
movzbl (%rdi), %eax
movzbl (%rsi, %rdi), %ecx
- subl %ecx, %eax
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+ subl %BYTE_LOOP_REG, %eax
jnz L(ret_less_4_loop)
testl %ecx, %ecx
jz L(ret_zero_4_loop)
@@ -1142,5 +1312,6 @@ L(ret_less_4_loop):
subl %r8d, %eax
ret
# endif
-END(STRCMP)
+ cfi_endproc
+ .size STRCMP, .-STRCMP
#endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
new file mode 100644
index 0000000000..58c05dcfb8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
@@ -0,0 +1,16 @@
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x) x ## _rtm
+#define GLABEL(x) _GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+#define OVERFLOW_STRCMP __strcasecmp_l_avx2_rtm
+
+#include "strncase_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
new file mode 100644
index 0000000000..48c0aa21f8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
@@ -0,0 +1,27 @@
+/* strncasecmp_l optimized with AVX2.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __strcasecmp_l_avx2
+#endif
+#include "strcmp-avx2.S"
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v3 22/23] x86: Add EVEX optimized str{n}casecmp
2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
2022-03-24 19:03 ` H.J. Lu
2022-03-24 22:41 ` [PATCH v3 " Noah Goldstein
@ 2022-03-24 22:41 ` Noah Goldstein
2022-03-24 23:56 ` [PATCH v4 21/23] x86: Add AVX2 " Noah Goldstein
3 siblings, 0 replies; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 22:41 UTC (permalink / raw)
To: libc-alpha
geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/Makefile | 2 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 ++
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 +
sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++
sysdeps/x86_64/multiarch/strcmp-evex.S | 286 ++++++++++++++++---
sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++
6 files changed, 317 insertions(+), 40 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 06e1848823..35d80dc2ff 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -57,6 +57,7 @@ sysdep_routines += \
strcasecmp_l-avx \
strcasecmp_l-avx2 \
strcasecmp_l-avx2-rtm \
+ strcasecmp_l-evex \
strcasecmp_l-sse2 \
strcasecmp_l-sse4_2 \
strcasecmp_l-ssse3 \
@@ -97,6 +98,7 @@ sysdep_routines += \
strncase_l-avx \
strncase_l-avx2 \
strncase_l-avx2-rtm \
+ strncase_l-evex \
strncase_l-sse2 \
strncase_l-sse4_2 \
strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 3c556d07ac..f1a4d3dac2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strcasecmp_evex)
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (AVX2),
__strcasecmp_avx2)
@@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strcasecmp_l_evex)
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (AVX2),
__strcasecmp_l_avx2)
@@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strncasecmp_evex)
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (AVX2),
__strncasecmp_avx2)
@@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strncasecmp_l_evex)
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (AVX2),
__strncasecmp_l_avx2)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index c4de111fd0..bf0d146e7f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
@@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ return OPTIMIZE (evex);
+
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_rtm);
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
new file mode 100644
index 0000000000..58642db748
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with EVEX.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_evex
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 56d8c118e4..002dd600ed 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -19,6 +19,9 @@
#if IS_IN (libc)
# include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# endif
# ifndef STRCMP
# define STRCMP __strcmp_evex
@@ -34,19 +37,29 @@
# define VMOVA vmovdqa64
# ifdef USE_AS_WCSCMP
-# define TESTEQ subl $0xff,
+# ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __wcscmp_evex
+# endif
+
+# define TESTEQ subl $0xff,
/* Compare packed dwords. */
# define VPCMP vpcmpd
# define VPMINU vpminud
# define VPTESTM vptestmd
+# define VPTESTNM vptestnmd
/* 1 dword char == 4 bytes. */
# define SIZE_OF_CHAR 4
# else
+# ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __strcmp_evex
+# endif
+
# define TESTEQ incl
/* Compare packed bytes. */
# define VPCMP vpcmpb
# define VPMINU vpminub
# define VPTESTM vptestmb
+# define VPTESTNM vptestnmb
/* 1 byte char == 1 byte. */
# define SIZE_OF_CHAR 1
# endif
@@ -73,11 +86,16 @@
# define VEC_OFFSET (-VEC_SIZE)
# endif
-# define XMMZERO xmm16
# define XMM0 xmm17
# define XMM1 xmm18
-# define YMMZERO ymm16
+# define XMM10 xmm27
+# define XMM11 xmm28
+# define XMM12 xmm29
+# define XMM13 xmm30
+# define XMM14 xmm31
+
+
# define YMM0 ymm17
# define YMM1 ymm18
# define YMM2 ymm19
@@ -89,6 +107,87 @@
# define YMM8 ymm25
# define YMM9 ymm26
# define YMM10 ymm27
+# define YMM11 ymm28
+# define YMM12 ymm29
+# define YMM13 ymm30
+# define YMM14 ymm31
+
+# ifdef USE_AS_STRCASECMP_L
+# define BYTE_LOOP_REG OFFSET_REG
+# else
+# define BYTE_LOOP_REG ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRNCMP
+# define STRCASECMP __strncasecmp_evex
+# define LOCALE_REG rcx
+# define LOCALE_REG_LP RCX_LP
+# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
+# else
+# define STRCASECMP __strcasecmp_evex
+# define LOCALE_REG rdx
+# define LOCALE_REG_LP RDX_LP
+# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
+# endif
+# endif
+
+# define LCASE_MIN_YMM %YMM12
+# define LCASE_MAX_YMM %YMM13
+# define CASE_ADD_YMM %YMM14
+
+# define LCASE_MIN_XMM %XMM12
+# define LCASE_MAX_XMM %XMM13
+# define CASE_ADD_XMM %XMM14
+
+ /* NB: wcsncmp uses r11 but strcasecmp is never used in
+ conjunction with wcscmp. */
+# define TOLOWER_BASE %r11
+
+# ifdef USE_AS_STRCASECMP_L
+# define _REG(x, y) x ## y
+# define REG(x, y) _REG(x, y)
+# define TOLOWER(reg1, reg2, ext) \
+ vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \
+ vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \
+ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \
+ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \
+ vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \
+ vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6}
+
+# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM)
+# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM)
+
+# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \
+ TOLOWER (s1_reg, s2_reg, ext); \
+ VPCMP $0, s1_reg, s2_reg, reg_out
+
+# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \
+ VMOVU s2_mem, s2_reg; \
+ CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
+
+# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
+# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
+
+# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
+# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+
+# else
+# define TOLOWER_gpr(...)
+# define TOLOWER_YMM(...)
+# define TOLOWER_XMM(...)
+
+# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \
+ VPCMP $0, s2_reg, s1_reg, reg_out
+
+# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+
+# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \
+ VPCMP $0, s2_mem, s1_reg, reg_out
+
+# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+# endif
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
@@ -112,8 +211,45 @@
returned. */
.section .text.evex, "ax", @progbits
-ENTRY(STRCMP)
+ .align 16
+ .type STRCMP, @function
+ .globl STRCMP
+ .hidden STRCMP
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (STRCASECMP)
+ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
+ mov %fs:(%rax), %LOCALE_REG_LP
+
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
+END (STRCASECMP)
+ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
+# endif
+
+ .p2align 4
+STRCMP:
+ cfi_startproc
+ _CET_ENDBR
+ CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales with
+ encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+# else
+ mov (%LOCALE_REG), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+ jne STRCASECMP_NONASCII
+ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
# ifdef USE_AS_STRNCMP
+ /* Don't overwrite LOCALE_REG (rcx) until we have passed
+ L(one_or_less). Otherwise we might use the wrong locale in
+ the OVERFLOW_STRCMP (strcasecmp_l). */
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -125,6 +261,32 @@ ENTRY(STRCMP)
actually bound the buffer. */
jle L(one_or_less)
# endif
+
+# if defined USE_AS_STRCASECMP_L
+ .section .rodata.cst32, "aM", @progbits, 32
+ .align 32
+L(lcase_min):
+ .quad 0x4141414141414141
+ .quad 0x4141414141414141
+ .quad 0x4141414141414141
+ .quad 0x4141414141414141
+L(lcase_max):
+ .quad 0x1a1a1a1a1a1a1a1a
+ .quad 0x1a1a1a1a1a1a1a1a
+ .quad 0x1a1a1a1a1a1a1a1a
+ .quad 0x1a1a1a1a1a1a1a1a
+L(case_add):
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+
+ vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
+ vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
+ vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
+# endif
+
movl %edi, %eax
orl %esi, %eax
/* Shift out the bits irrelevant to page boundary ([63:12]). */
@@ -139,7 +301,7 @@ L(no_page_cross):
VPTESTM %YMM0, %YMM0, %k2
/* Each bit cleared in K1 represents a mismatch or a null CHAR
in YMM0 and 32 bytes at (%rsi). */
- VPCMP $0, (%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_STRNCMP
cmpq $CHAR_PER_VEC, %rdx
@@ -169,6 +331,8 @@ L(return_vec_0):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret0):
@@ -189,10 +353,10 @@ L(ret_zero):
.p2align 4,, 5
L(one_or_less):
jb L(ret_zero)
-# ifdef USE_AS_WCSCMP
/* 'nbe' covers the case where length is negative (large
unsigned). */
- jnbe __wcscmp_evex
+ jnbe OVERFLOW_STRCMP
+# ifdef USE_AS_WCSCMP
movl (%rdi), %edx
xorl %eax, %eax
cmpl (%rsi), %edx
@@ -201,11 +365,10 @@ L(one_or_less):
negl %eax
orl $1, %eax
# else
- /* 'nbe' covers the case where length is negative (large
- unsigned). */
- jnbe __strcmp_evex
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret1):
@@ -233,6 +396,8 @@ L(return_vec_1):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret2):
@@ -270,6 +435,8 @@ L(return_vec_2):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret3):
@@ -290,6 +457,8 @@ L(return_vec_3):
# else
movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret4):
@@ -303,7 +472,7 @@ L(more_3x_vec):
/* Safe to compare 4x vectors. */
VMOVU (VEC_SIZE)(%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_1)
@@ -315,14 +484,14 @@ L(more_3x_vec):
VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_2)
VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_3)
@@ -381,7 +550,6 @@ L(prepare_loop_aligned):
subl %esi, %eax
andl $(PAGE_SIZE - 1), %eax
- vpxorq %YMMZERO, %YMMZERO, %YMMZERO
/* Loop 4x comparisons at a time. */
.p2align 4
@@ -413,22 +581,35 @@ L(loop_skip_page_cross_check):
/* A zero CHAR in YMM9 means that there is a null CHAR. */
VPMINU %YMM8, %YMM9, %YMM9
- /* Each bit set in K1 represents a non-null CHAR in YMM8. */
+ /* Each bit set in K1 represents a non-null CHAR in YMM9. */
VPTESTM %YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
oring with YMM1. Result is stored in YMM6. */
vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
-
+# else
+ VMOVU (VEC_SIZE * 0)(%rsi), %YMM1
+ TOLOWER_YMM (%YMM0, %YMM1)
+ VMOVU (VEC_SIZE * 1)(%rsi), %YMM3
+ TOLOWER_YMM (%YMM2, %YMM3)
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
+ TOLOWER_YMM (%YMM4, %YMM5)
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
+ TOLOWER_YMM (%YMM6, %YMM7)
+ vpxorq %YMM0, %YMM1, %YMM1
+ vpxorq %YMM2, %YMM3, %YMM3
+ vpxorq %YMM4, %YMM5, %YMM5
+ vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+# endif
/* Or together YMM3, YMM5, and YMM6. */
vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
/* A non-zero CHAR in YMM6 represents a mismatch. */
- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
+ VPTESTNM %YMM6, %YMM6, %k0{%k1}
kmovd %k0, %LOOP_REG
TESTEQ %LOOP_REG
@@ -437,13 +618,13 @@ L(loop_skip_page_cross_check):
/* Find which VEC has the mismatch of end of string. */
VPTESTM %YMM0, %YMM0, %k1
- VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
+ VPTESTNM %YMM1, %YMM1, %k0{%k1}
kmovd %k0, %ecx
TESTEQ %ecx
jnz L(return_vec_0_end)
VPTESTM %YMM2, %YMM2, %k1
- VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
+ VPTESTNM %YMM3, %YMM3, %k0{%k1}
kmovd %k0, %ecx
TESTEQ %ecx
jnz L(return_vec_1_end)
@@ -457,7 +638,7 @@ L(return_vec_2_3_end):
# endif
VPTESTM %YMM4, %YMM4, %k1
- VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
+ VPTESTNM %YMM5, %YMM5, %k0{%k1}
kmovd %k0, %ecx
TESTEQ %ecx
# if CHAR_PER_VEC <= 16
@@ -493,6 +674,8 @@ L(return_vec_3_end):
# else
movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -545,6 +728,8 @@ L(return_vec_0_end):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
/* Flip `eax` if `rdi` and `rsi` where swapped in page cross
logic. Subtract `r8d` after xor for zero case. */
@@ -569,6 +754,8 @@ L(return_vec_1_end):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -598,7 +785,7 @@ L(page_cross_during_loop):
VMOVA (%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_0_end)
@@ -619,8 +806,7 @@ L(less_1x_vec_till_page_cross):
been loaded earlier so must be valid. */
VMOVU -VEC_SIZE(%rdi, %rax), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
-
+ CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
/* Mask of potentially valid bits. The lower bits can be out of
range comparisons (but safe regarding page crosses). */
@@ -642,6 +828,8 @@ L(less_1x_vec_till_page_cross):
# ifdef USE_AS_STRNCMP
# ifdef USE_AS_WCSCMP
+ /* NB: strcasecmp not used with WCSCMP so this access to r11 is
+ safe. */
movl %eax, %r11d
shrl $2, %r11d
cmpq %r11, %rdx
@@ -679,6 +867,8 @@ L(return_page_cross_cmp_mem):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -709,7 +899,7 @@ L(more_2x_vec_till_page_cross):
VMOVA VEC_SIZE(%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_1_end)
@@ -724,14 +914,14 @@ L(more_2x_vec_till_page_cross):
/* Safe to include comparisons from lower bytes. */
VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_page_cross_0)
VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_page_cross_1)
@@ -740,6 +930,8 @@ L(more_2x_vec_till_page_cross):
/* Must check length here as length might preclude reading next
page. */
# ifdef USE_AS_WCSCMP
+ /* NB: strcasecmp not used with WCSCMP so this access to r11 is
+ safe. */
movl %eax, %r11d
shrl $2, %r11d
cmpq %r11, %rdx
@@ -754,12 +946,19 @@ L(more_2x_vec_till_page_cross):
VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
VPMINU %YMM4, %YMM6, %YMM9
VPTESTM %YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */
vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
-
- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
+# else
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
+ TOLOWER_YMM (%YMM4, %YMM5)
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
+ TOLOWER_YMM (%YMM6, %YMM7)
+ vpxorq %YMM4, %YMM5, %YMM5
+ vpternlogd $0xde, %YMM7, %YMM5, %YMM6
+# endif
+ VPTESTNM %YMM6, %YMM6, %k0{%k1}
kmovd %k0, %LOOP_REG
TESTEQ %LOOP_REG
jnz L(return_vec_2_3_end)
@@ -815,6 +1014,8 @@ L(return_vec_page_cross_1):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -871,7 +1072,7 @@ L(page_cross):
L(page_cross_loop):
VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(check_ret_vec_page_cross)
@@ -895,7 +1096,7 @@ L(page_cross_loop):
*/
VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_STRNCMP
@@ -930,6 +1131,8 @@ L(ret_vec_page_cross_cont):
# else
movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -989,7 +1192,7 @@ L(less_1x_vec_till_page):
/* Use 16 byte comparison. */
vmovdqu (%rdi), %xmm0
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, (%rsi), %xmm0, %k1{%k2}
+ CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0xf, %ecx
@@ -1009,7 +1212,7 @@ L(less_1x_vec_till_page):
# endif
vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
+ CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0xf, %ecx
@@ -1048,7 +1251,7 @@ L(less_16_till_page):
vmovq (%rdi), %xmm0
vmovq (%rsi), %xmm1
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, %xmm1, %xmm0, %k1{%k2}
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0x3, %ecx
@@ -1068,7 +1271,7 @@ L(less_16_till_page):
vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, %xmm1, %xmm0, %k1{%k2}
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0x3, %ecx
@@ -1128,7 +1331,7 @@ L(ret_less_8_wcs):
vmovd (%rdi), %xmm0
vmovd (%rsi), %xmm1
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, %xmm1, %xmm0, %k1{%k2}
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
subl $0xf, %ecx
jnz L(check_ret_vec_page_cross)
@@ -1143,7 +1346,7 @@ L(ret_less_8_wcs):
vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, %xmm1, %xmm0, %k1{%k2}
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
subl $0xf, %ecx
jnz L(check_ret_vec_page_cross)
@@ -1176,7 +1379,9 @@ L(less_4_till_page):
L(less_4_loop):
movzbl (%rdi), %eax
movzbl (%rsi, %rdi), %ecx
- subl %ecx, %eax
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+ subl %BYTE_LOOP_REG, %eax
jnz L(ret_less_4_loop)
testl %ecx, %ecx
jz L(ret_zero_4_loop)
@@ -1203,5 +1408,6 @@ L(ret_less_4_loop):
subl %r8d, %eax
ret
# endif
-END(STRCMP)
+ cfi_endproc
+ .size STRCMP, .-STRCMP
#endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
new file mode 100644
index 0000000000..8a5af3695c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
@@ -0,0 +1,25 @@
+/* strncasecmp_l optimized with EVEX.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_evex
+#endif
+#define OVERFLOW_STRCMP __strcasecmp_l_evex
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#include "strcmp-evex.S"
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v4 21/23] x86: Add AVX2 optimized str{n}casecmp
2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
` (2 preceding siblings ...)
2022-03-24 22:41 ` [PATCH v3 22/23] x86: Add EVEX " Noah Goldstein
@ 2022-03-24 23:56 ` Noah Goldstein
2022-03-24 23:56 ` [PATCH v4 22/23] x86: Add EVEX " Noah Goldstein
2022-03-25 18:14 ` [PATCH v4 21/23] x86: Add AVX2 " H.J. Lu
3 siblings, 2 replies; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 23:56 UTC (permalink / raw)
To: libc-alpha
geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/Makefile | 4 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 +
.../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++
sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++
sysdeps/x86_64/multiarch/strcmp-avx2.S | 237 +++++++++++++++---
.../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++
sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++
8 files changed, 331 insertions(+), 31 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7b413edad..06e1848823 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -55,6 +55,8 @@ sysdep_routines += \
stpncpy-sse2-unaligned \
stpncpy-ssse3 \
strcasecmp_l-avx \
+ strcasecmp_l-avx2 \
+ strcasecmp_l-avx2-rtm \
strcasecmp_l-sse2 \
strcasecmp_l-sse4_2 \
strcasecmp_l-ssse3 \
@@ -93,6 +95,8 @@ sysdep_routines += \
strlen-evex \
strlen-sse2 \
strncase_l-avx \
+ strncase_l-avx2 \
+ strncase_l-avx2-rtm \
strncase_l-sse2 \
strncase_l-sse4_2 \
strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a594f4176e..3c556d07ac 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strcasecmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strcasecmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (AVX),
__strcasecmp_avx)
@@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strcasecmp_l_avx2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strcasecmp_l_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcasecmp_l,
CPU_FEATURE_USABLE (AVX),
__strcasecmp_l_avx)
@@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strncasecmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strncasecmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (AVX),
__strncasecmp_avx)
@@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strncasecmp_l_avx2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strncasecmp_l_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncasecmp_l,
CPU_FEATURE_USABLE (AVX),
__strncasecmp_l_avx)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 9e3cc61ac0..c4de111fd0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
+
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
return OPTIMIZE (avx);
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
new file mode 100644
index 0000000000..09957fc3c5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
@@ -0,0 +1,15 @@
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x) x ## _rtm
+#define GLABEL(x) _GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
new file mode 100644
index 0000000000..e2762f2a22
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with AVX2.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 86a86b68e3..8da09bd86d 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -20,6 +20,10 @@
# include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# endif
+
# ifndef STRCMP
# define STRCMP __strcmp_avx2
# endif
@@ -74,13 +78,88 @@
# define VEC_OFFSET (-VEC_SIZE)
# endif
+# ifdef USE_AS_STRCASECMP_L
+# define BYTE_LOOP_REG OFFSET_REG
+# else
+# define BYTE_LOOP_REG ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRNCMP
+# define STRCASECMP __strncasecmp_avx2
+# define LOCALE_REG rcx
+# define LOCALE_REG_LP RCX_LP
+# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
+# else
+# define STRCASECMP __strcasecmp_avx2
+# define LOCALE_REG rdx
+# define LOCALE_REG_LP RDX_LP
+# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
+# endif
+# endif
+
# define xmmZERO xmm15
# define ymmZERO ymm15
+# define LCASE_MIN_ymm %ymm10
+# define LCASE_MAX_ymm %ymm11
+# define CASE_ADD_ymm %ymm12
+
+# define LCASE_MIN_xmm %xmm10
+# define LCASE_MAX_xmm %xmm11
+# define CASE_ADD_xmm %xmm12
+
+ /* r11 is never used elsewhere so this is safe to maintain. */
+# define TOLOWER_BASE %r11
+
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
+# ifdef USE_AS_STRCASECMP_L
+# define REG(x, y) x ## y
+# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
+ vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
+ vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
+ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
+ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
+ vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
+ vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
+ vpaddb REG(%ext, 8), reg1_in, reg1_out; \
+ vpaddb REG(%ext, 9), reg2_in, reg2_out
+
+# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
+# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)
+
+# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
+ TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
+ VPCMPEQ scratch_reg, s2_reg, reg_out
+
+# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
+ VMOVU s2_mem, reg_out; \
+ CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+# define TOLOWER_gpr(...)
+# define TOLOWER_ymm(...)
+# define TOLOWER_xmm(...)
+
+# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
+ VPCMPEQ s2_reg, s1_reg, reg_out
+
+# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -102,8 +181,49 @@
returned. */
.section SECTION(.text), "ax", @progbits
-ENTRY(STRCMP)
+ .align 16
+ .type STRCMP, @function
+ .globl STRCMP
+ .hidden STRCMP
+
+# ifndef GLABEL
+# define GLABEL(...) __VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
+ mov %fs:(%rax), %LOCALE_REG_LP
+
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
+END (GLABEL(STRCASECMP))
+ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
+# endif
+
+ .p2align 4
+STRCMP:
+ cfi_startproc
+ _CET_ENDBR
+ CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales with
+ encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+# else
+ mov (%LOCALE_REG), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+ jne STRCASECMP_NONASCII
+ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
# ifdef USE_AS_STRNCMP
+ /* Don't overwrite LOCALE_REG (rcx) until we have passed
+ L(one_or_less). Otherwise we might use the wrong locale in
+ the OVERFLOW_STRCMP (strcasecmp_l). */
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -128,6 +248,30 @@ ENTRY(STRCMP)
# endif
# endif
vpxor %xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+ .section .rodata.cst32, "aM", @progbits, 32
+ .align 32
+L(lcase_min):
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+L(case_add):
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+
+ vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
+ vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
+ vmovdqa L(case_add)(%rip), CASE_ADD_ymm
+# endif
movl %edi, %eax
orl %esi, %eax
sall $20, %eax
@@ -138,8 +282,10 @@ ENTRY(STRCMP)
L(no_page_cross):
/* Safe to compare 4x vectors. */
VMOVU (%rdi), %ymm0
- /* 1s where s1 and s2 equal. */
- VPCMPEQ (%rsi), %ymm0, %ymm1
+ /* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
+ Otherwise converts ymm0 and load from rsi to lower. ymm2 is
+ scratch and ymm1 is the return. */
+ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
/* 1s at null CHAR. */
VPCMPEQ %ymm0, %ymmZERO, %ymm2
/* 1s where s1 and s2 equal AND not null CHAR. */
@@ -172,6 +318,8 @@ L(return_vec_0):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret0):
@@ -192,6 +340,10 @@ L(ret_zero):
.p2align 4,, 5
L(one_or_less):
+# ifdef USE_AS_STRCASECMP_L
+ /* Set locale argument for strcasecmp. */
+ movq %LOCALE_REG, %rdx
+# endif
jb L(ret_zero)
/* 'nbe' covers the case where length is negative (large
unsigned). */
@@ -207,6 +359,8 @@ L(one_or_less):
# else
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret1):
@@ -234,6 +388,8 @@ L(return_vec_1):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret2):
@@ -265,6 +421,8 @@ L(return_vec_2):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret3):
@@ -285,6 +443,8 @@ L(return_vec_3):
# else
movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret4):
@@ -295,7 +455,7 @@ L(ret4):
L(more_3x_vec):
/* Safe to compare 4x vectors. */
VMOVU VEC_SIZE(%rdi), %ymm0
- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -308,7 +468,7 @@ L(more_3x_vec):
# endif
VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -316,7 +476,7 @@ L(more_3x_vec):
jnz L(return_vec_2)
VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -391,12 +551,10 @@ L(loop_skip_page_cross_check):
VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */
- VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
-
- VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
-
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+ CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
zero. */
@@ -465,6 +623,8 @@ L(return_vec_2_3_end):
# else
movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -508,6 +668,8 @@ L(return_vec_0_end):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -530,6 +692,8 @@ L(return_vec_1_end):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -556,6 +720,8 @@ L(return_vec_2_end):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -583,7 +749,7 @@ L(page_cross_during_loop):
jle L(less_1x_vec_till_page_cross)
VMOVA (%rdi), %ymm0
- VPCMPEQ (%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -605,7 +771,7 @@ L(less_1x_vec_till_page_cross):
here, it means the previous page (rdi - VEC_SIZE) has already
been loaded earlier so must be valid. */
VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
- VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -647,6 +813,8 @@ L(return_page_cross_cmp_mem):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -673,7 +841,7 @@ L(more_2x_vec_till_page_cross):
iteration here. */
VMOVU VEC_SIZE(%rdi), %ymm0
- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -689,7 +857,7 @@ L(more_2x_vec_till_page_cross):
/* Safe to include comparisons from lower bytes. */
VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
- VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -697,7 +865,7 @@ L(more_2x_vec_till_page_cross):
jnz L(return_vec_page_cross_0)
VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
- VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -715,8 +883,8 @@ L(more_2x_vec_till_page_cross):
VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
+ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
vpand %ymm4, %ymm5, %ymm5
vpand %ymm6, %ymm7, %ymm7
VPMINU %ymm5, %ymm7, %ymm7
@@ -767,6 +935,8 @@ L(return_vec_page_cross_1):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -822,7 +992,7 @@ L(page_cross):
L(page_cross_loop):
VMOVU (%rdi, %OFFSET_REG64), %ymm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -840,11 +1010,11 @@ L(page_cross_loop):
subl %eax, %OFFSET_REG
/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
to not cross page so is safe to load. Since we have already
- loaded at least 1 VEC from rsi it is also guranteed to be safe.
- */
+ loaded at least 1 VEC from rsi it is also guranteed to be
+ safe. */
VMOVU (%rdi, %OFFSET_REG64), %ymm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -877,6 +1047,8 @@ L(ret_vec_page_cross_cont):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -930,7 +1102,7 @@ L(less_1x_vec_till_page):
ja L(less_16_till_page)
VMOVU (%rdi), %xmm0
- VPCMPEQ (%rsi), %xmm0, %xmm1
+ CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
VPCMPEQ %xmm0, %xmmZERO, %xmm2
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
@@ -948,7 +1120,7 @@ L(less_1x_vec_till_page):
# endif
VMOVU (%rdi, %OFFSET_REG64), %xmm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1
+ CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
VPCMPEQ %xmm0, %xmmZERO, %xmm2
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
@@ -986,7 +1158,7 @@ L(less_16_till_page):
vmovq (%rdi), %xmm0
vmovq (%rsi), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
incb %cl
@@ -1006,7 +1178,7 @@ L(less_16_till_page):
vmovq (%rdi, %OFFSET_REG64), %xmm0
vmovq (%rsi, %OFFSET_REG64), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
incb %cl
@@ -1062,7 +1234,7 @@ L(ret_less_8_wcs):
vmovd (%rdi), %xmm0
vmovd (%rsi), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
subl $0xf, %ecx
@@ -1081,7 +1253,7 @@ L(ret_less_8_wcs):
vmovd (%rdi, %OFFSET_REG64), %xmm0
vmovd (%rsi, %OFFSET_REG64), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
subl $0xf, %ecx
@@ -1115,7 +1287,9 @@ L(less_4_till_page):
L(less_4_loop):
movzbl (%rdi), %eax
movzbl (%rsi, %rdi), %ecx
- subl %ecx, %eax
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+ subl %BYTE_LOOP_REG, %eax
jnz L(ret_less_4_loop)
testl %ecx, %ecx
jz L(ret_zero_4_loop)
@@ -1142,5 +1316,6 @@ L(ret_less_4_loop):
subl %r8d, %eax
ret
# endif
-END(STRCMP)
+ cfi_endproc
+ .size STRCMP, .-STRCMP
#endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
new file mode 100644
index 0000000000..58c05dcfb8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
@@ -0,0 +1,16 @@
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x) x ## _rtm
+#define GLABEL(x) _GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+#define OVERFLOW_STRCMP __strcasecmp_l_avx2_rtm
+
+#include "strncase_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
new file mode 100644
index 0000000000..48c0aa21f8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
@@ -0,0 +1,27 @@
+/* strncasecmp_l optimized with AVX2.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __strcasecmp_l_avx2
+#endif
+#include "strcmp-avx2.S"
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v4 22/23] x86: Add EVEX optimized str{n}casecmp
2022-03-24 23:56 ` [PATCH v4 21/23] x86: Add AVX2 " Noah Goldstein
@ 2022-03-24 23:56 ` Noah Goldstein
2022-03-25 18:15 ` H.J. Lu
2022-03-25 18:14 ` [PATCH v4 21/23] x86: Add AVX2 " H.J. Lu
1 sibling, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-24 23:56 UTC (permalink / raw)
To: libc-alpha
geometric_mean(N=40) of all benchmarks EVEX / SSE42: 0.621
All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/Makefile | 2 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 +
sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++
sysdeps/x86_64/multiarch/strcmp-evex.S | 290 ++++++++++++++++---
sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++
6 files changed, 321 insertions(+), 40 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 06e1848823..35d80dc2ff 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -57,6 +57,7 @@ sysdep_routines += \
strcasecmp_l-avx \
strcasecmp_l-avx2 \
strcasecmp_l-avx2-rtm \
+ strcasecmp_l-evex \
strcasecmp_l-sse2 \
strcasecmp_l-sse4_2 \
strcasecmp_l-ssse3 \
@@ -97,6 +98,7 @@ sysdep_routines += \
strncase_l-avx \
strncase_l-avx2 \
strncase_l-avx2-rtm \
+ strncase_l-evex \
strncase_l-sse2 \
strncase_l-sse4_2 \
strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 3c556d07ac..f1a4d3dac2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strcasecmp_evex)
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (AVX2),
__strcasecmp_avx2)
@@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strcasecmp_l_evex)
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (AVX2),
__strcasecmp_l_avx2)
@@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strncasecmp_evex)
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (AVX2),
__strncasecmp_avx2)
@@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strncasecmp_l_evex)
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (AVX2),
__strncasecmp_l_avx2)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index c4de111fd0..bf0d146e7f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
@@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ return OPTIMIZE (evex);
+
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_rtm);
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
new file mode 100644
index 0000000000..58642db748
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with EVEX.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_evex
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 56d8c118e4..2a5b3ce037 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -19,6 +19,9 @@
#if IS_IN (libc)
# include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# endif
# ifndef STRCMP
# define STRCMP __strcmp_evex
@@ -34,19 +37,29 @@
# define VMOVA vmovdqa64
# ifdef USE_AS_WCSCMP
-# define TESTEQ subl $0xff,
+# ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __wcscmp_evex
+# endif
+
+# define TESTEQ subl $0xff,
/* Compare packed dwords. */
# define VPCMP vpcmpd
# define VPMINU vpminud
# define VPTESTM vptestmd
+# define VPTESTNM vptestnmd
/* 1 dword char == 4 bytes. */
# define SIZE_OF_CHAR 4
# else
+# ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __strcmp_evex
+# endif
+
# define TESTEQ incl
/* Compare packed bytes. */
# define VPCMP vpcmpb
# define VPMINU vpminub
# define VPTESTM vptestmb
+# define VPTESTNM vptestnmb
/* 1 byte char == 1 byte. */
# define SIZE_OF_CHAR 1
# endif
@@ -73,11 +86,16 @@
# define VEC_OFFSET (-VEC_SIZE)
# endif
-# define XMMZERO xmm16
# define XMM0 xmm17
# define XMM1 xmm18
-# define YMMZERO ymm16
+# define XMM10 xmm27
+# define XMM11 xmm28
+# define XMM12 xmm29
+# define XMM13 xmm30
+# define XMM14 xmm31
+
+
# define YMM0 ymm17
# define YMM1 ymm18
# define YMM2 ymm19
@@ -89,6 +107,87 @@
# define YMM8 ymm25
# define YMM9 ymm26
# define YMM10 ymm27
+# define YMM11 ymm28
+# define YMM12 ymm29
+# define YMM13 ymm30
+# define YMM14 ymm31
+
+# ifdef USE_AS_STRCASECMP_L
+# define BYTE_LOOP_REG OFFSET_REG
+# else
+# define BYTE_LOOP_REG ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRNCMP
+# define STRCASECMP __strncasecmp_evex
+# define LOCALE_REG rcx
+# define LOCALE_REG_LP RCX_LP
+# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
+# else
+# define STRCASECMP __strcasecmp_evex
+# define LOCALE_REG rdx
+# define LOCALE_REG_LP RDX_LP
+# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
+# endif
+# endif
+
+# define LCASE_MIN_YMM %YMM12
+# define LCASE_MAX_YMM %YMM13
+# define CASE_ADD_YMM %YMM14
+
+# define LCASE_MIN_XMM %XMM12
+# define LCASE_MAX_XMM %XMM13
+# define CASE_ADD_XMM %XMM14
+
+ /* NB: wcsncmp uses r11 but strcasecmp is never used in
+ conjunction with wcscmp. */
+# define TOLOWER_BASE %r11
+
+# ifdef USE_AS_STRCASECMP_L
+# define _REG(x, y) x ## y
+# define REG(x, y) _REG(x, y)
+# define TOLOWER(reg1, reg2, ext) \
+ vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \
+ vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \
+ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \
+ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \
+ vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \
+ vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6}
+
+# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM)
+# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM)
+
+# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \
+ TOLOWER (s1_reg, s2_reg, ext); \
+ VPCMP $0, s1_reg, s2_reg, reg_out
+
+# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \
+ VMOVU s2_mem, s2_reg; \
+ CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
+
+# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
+# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
+
+# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
+# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+
+# else
+# define TOLOWER_gpr(...)
+# define TOLOWER_YMM(...)
+# define TOLOWER_XMM(...)
+
+# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \
+ VPCMP $0, s2_reg, s1_reg, reg_out
+
+# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+
+# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \
+ VPCMP $0, s2_mem, s1_reg, reg_out
+
+# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+# endif
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
@@ -112,8 +211,45 @@
returned. */
.section .text.evex, "ax", @progbits
-ENTRY(STRCMP)
+ .align 16
+ .type STRCMP, @function
+ .globl STRCMP
+ .hidden STRCMP
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (STRCASECMP)
+ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
+ mov %fs:(%rax), %LOCALE_REG_LP
+
+ /* Either 1 or 5 bytes (dependeing if CET is enabled). */
+ .p2align 4
+END (STRCASECMP)
+ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
+# endif
+
+ .p2align 4
+STRCMP:
+ cfi_startproc
+ _CET_ENDBR
+ CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales with
+ encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+# else
+ mov (%LOCALE_REG), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+ jne STRCASECMP_NONASCII
+ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
# ifdef USE_AS_STRNCMP
+ /* Don't overwrite LOCALE_REG (rcx) until we have pass
+ L(one_or_less). Otherwise we might use the wrong locale in
+ the OVERFLOW_STRCMP (strcasecmp_l). */
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -125,6 +261,32 @@ ENTRY(STRCMP)
actually bound the buffer. */
jle L(one_or_less)
# endif
+
+# if defined USE_AS_STRCASECMP_L
+ .section .rodata.cst32, "aM", @progbits, 32
+ .align 32
+L(lcase_min):
+ .quad 0x4141414141414141
+ .quad 0x4141414141414141
+ .quad 0x4141414141414141
+ .quad 0x4141414141414141
+L(lcase_max):
+ .quad 0x1a1a1a1a1a1a1a1a
+ .quad 0x1a1a1a1a1a1a1a1a
+ .quad 0x1a1a1a1a1a1a1a1a
+ .quad 0x1a1a1a1a1a1a1a1a
+L(case_add):
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+
+ vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
+ vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
+ vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
+# endif
+
movl %edi, %eax
orl %esi, %eax
/* Shift out the bits irrelivant to page boundary ([63:12]). */
@@ -139,7 +301,7 @@ L(no_page_cross):
VPTESTM %YMM0, %YMM0, %k2
/* Each bit cleared in K1 represents a mismatch or a null CHAR
in YMM0 and 32 bytes at (%rsi). */
- VPCMP $0, (%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_STRNCMP
cmpq $CHAR_PER_VEC, %rdx
@@ -169,6 +331,8 @@ L(return_vec_0):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret0):
@@ -188,11 +352,15 @@ L(ret_zero):
.p2align 4,, 5
L(one_or_less):
+# ifdef USE_AS_STRCASECMP_L
+ /* Set locale argument for strcasecmp. */
+ movq %LOCALE_REG, %rdx
+# endif
jb L(ret_zero)
-# ifdef USE_AS_WCSCMP
/* 'nbe' covers the case where length is negative (large
unsigned). */
- jnbe __wcscmp_evex
+ jnbe OVERFLOW_STRCMP
+# ifdef USE_AS_WCSCMP
movl (%rdi), %edx
xorl %eax, %eax
cmpl (%rsi), %edx
@@ -201,11 +369,10 @@ L(one_or_less):
negl %eax
orl $1, %eax
# else
- /* 'nbe' covers the case where length is negative (large
- unsigned). */
- jnbe __strcmp_evex
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret1):
@@ -233,6 +400,8 @@ L(return_vec_1):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret2):
@@ -270,6 +439,8 @@ L(return_vec_2):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret3):
@@ -290,6 +461,8 @@ L(return_vec_3):
# else
movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret4):
@@ -303,7 +476,7 @@ L(more_3x_vec):
/* Safe to compare 4x vectors. */
VMOVU (VEC_SIZE)(%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_1)
@@ -315,14 +488,14 @@ L(more_3x_vec):
VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_2)
VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_3)
@@ -381,7 +554,6 @@ L(prepare_loop_aligned):
subl %esi, %eax
andl $(PAGE_SIZE - 1), %eax
- vpxorq %YMMZERO, %YMMZERO, %YMMZERO
/* Loop 4x comparisons at a time. */
.p2align 4
@@ -413,22 +585,35 @@ L(loop_skip_page_cross_check):
/* A zero CHAR in YMM9 means that there is a null CHAR. */
VPMINU %YMM8, %YMM9, %YMM9
- /* Each bit set in K1 represents a non-null CHAR in YMM8. */
+ /* Each bit set in K1 represents a non-null CHAR in YMM9. */
VPTESTM %YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
oring with YMM1. Result is stored in YMM6. */
vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
-
+# else
+ VMOVU (VEC_SIZE * 0)(%rsi), %YMM1
+ TOLOWER_YMM (%YMM0, %YMM1)
+ VMOVU (VEC_SIZE * 1)(%rsi), %YMM3
+ TOLOWER_YMM (%YMM2, %YMM3)
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
+ TOLOWER_YMM (%YMM4, %YMM5)
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
+ TOLOWER_YMM (%YMM6, %YMM7)
+ vpxorq %YMM0, %YMM1, %YMM1
+ vpxorq %YMM2, %YMM3, %YMM3
+ vpxorq %YMM4, %YMM5, %YMM5
+ vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+# endif
/* Or together YMM3, YMM5, and YMM6. */
vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
/* A non-zero CHAR in YMM6 represents a mismatch. */
- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
+ VPTESTNM %YMM6, %YMM6, %k0{%k1}
kmovd %k0, %LOOP_REG
TESTEQ %LOOP_REG
@@ -437,13 +622,13 @@ L(loop_skip_page_cross_check):
/* Find which VEC has the mismatch of end of string. */
VPTESTM %YMM0, %YMM0, %k1
- VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
+ VPTESTNM %YMM1, %YMM1, %k0{%k1}
kmovd %k0, %ecx
TESTEQ %ecx
jnz L(return_vec_0_end)
VPTESTM %YMM2, %YMM2, %k1
- VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
+ VPTESTNM %YMM3, %YMM3, %k0{%k1}
kmovd %k0, %ecx
TESTEQ %ecx
jnz L(return_vec_1_end)
@@ -457,7 +642,7 @@ L(return_vec_2_3_end):
# endif
VPTESTM %YMM4, %YMM4, %k1
- VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
+ VPTESTNM %YMM5, %YMM5, %k0{%k1}
kmovd %k0, %ecx
TESTEQ %ecx
# if CHAR_PER_VEC <= 16
@@ -493,6 +678,8 @@ L(return_vec_3_end):
# else
movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -545,6 +732,8 @@ L(return_vec_0_end):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
/* Flip `eax` if `rdi` and `rsi` where swapped in page cross
logic. Subtract `r8d` after xor for zero case. */
@@ -569,6 +758,8 @@ L(return_vec_1_end):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -598,7 +789,7 @@ L(page_cross_during_loop):
VMOVA (%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_0_end)
@@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross):
been loaded earlier so must be valid. */
VMOVU -VEC_SIZE(%rdi, %rax), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
-
+ CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
/* Mask of potentially valid bits. The lower bits can be out of
range comparisons (but safe regarding page crosses). */
@@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross):
# ifdef USE_AS_STRNCMP
# ifdef USE_AS_WCSCMP
+ /* NB: strcasecmp not used with WCSCMP so this access to r11 is
+ safe. */
movl %eax, %r11d
shrl $2, %r11d
cmpq %r11, %rdx
@@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross):
VMOVA VEC_SIZE(%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_1_end)
@@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross):
/* Safe to include comparisons from lower bytes. */
VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_page_cross_0)
VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_page_cross_1)
@@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross):
/* Must check length here as length might proclude reading next
page. */
# ifdef USE_AS_WCSCMP
+ /* NB: strcasecmp not used with WCSCMP so this access to r11 is
+ safe. */
movl %eax, %r11d
shrl $2, %r11d
cmpq %r11, %rdx
@@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross):
VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
VPMINU %YMM4, %YMM6, %YMM9
VPTESTM %YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */
vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
-
- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
+# else
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
+ TOLOWER_YMM (%YMM4, %YMM5)
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
+ TOLOWER_YMM (%YMM6, %YMM7)
+ vpxorq %YMM4, %YMM5, %YMM5
+ vpternlogd $0xde, %YMM7, %YMM5, %YMM6
+# endif
+ VPTESTNM %YMM6, %YMM6, %k0{%k1}
kmovd %k0, %LOOP_REG
TESTEQ %LOOP_REG
jnz L(return_vec_2_3_end)
@@ -815,6 +1018,8 @@ L(return_vec_page_cross_1):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -871,7 +1076,7 @@ L(page_cross):
L(page_cross_loop):
VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(check_ret_vec_page_cross)
@@ -895,7 +1100,7 @@ L(page_cross_loop):
*/
VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_STRNCMP
@@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont):
# else
movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -989,7 +1196,7 @@ L(less_1x_vec_till_page):
/* Use 16 byte comparison. */
vmovdqu (%rdi), %xmm0
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, (%rsi), %xmm0, %k1{%k2}
+ CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0xf, %ecx
@@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page):
# endif
vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
+ CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0xf, %ecx
@@ -1048,7 +1255,7 @@ L(less_16_till_page):
vmovq (%rdi), %xmm0
vmovq (%rsi), %xmm1
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, %xmm1, %xmm0, %k1{%k2}
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0x3, %ecx
@@ -1068,7 +1275,7 @@ L(less_16_till_page):
vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, %xmm1, %xmm0, %k1{%k2}
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0x3, %ecx
@@ -1128,7 +1335,7 @@ L(ret_less_8_wcs):
vmovd (%rdi), %xmm0
vmovd (%rsi), %xmm1
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, %xmm1, %xmm0, %k1{%k2}
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
subl $0xf, %ecx
jnz L(check_ret_vec_page_cross)
@@ -1143,7 +1350,7 @@ L(ret_less_8_wcs):
vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
VPTESTM %xmm0, %xmm0, %k2
- VPCMP $0, %xmm1, %xmm0, %k1{%k2}
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
subl $0xf, %ecx
jnz L(check_ret_vec_page_cross)
@@ -1176,7 +1383,9 @@ L(less_4_till_page):
L(less_4_loop):
movzbl (%rdi), %eax
movzbl (%rsi, %rdi), %ecx
- subl %ecx, %eax
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+ subl %BYTE_LOOP_REG, %eax
jnz L(ret_less_4_loop)
testl %ecx, %ecx
jz L(ret_zero_4_loop)
@@ -1203,5 +1412,6 @@ L(ret_less_4_loop):
subl %r8d, %eax
ret
# endif
-END(STRCMP)
+ cfi_endproc
+ .size STRCMP, .-STRCMP
#endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
new file mode 100644
index 0000000000..8a5af3695c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
@@ -0,0 +1,25 @@
+/* strncasecmp_l optimized with EVEX.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_evex
+#endif
+#define OVERFLOW_STRCMP __strcasecmp_l_evex
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#include "strcmp-evex.S"
--
2.25.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v4 21/23] x86: Add AVX2 optimized str{n}casecmp
2022-03-24 23:56 ` [PATCH v4 21/23] x86: Add AVX2 " Noah Goldstein
2022-03-24 23:56 ` [PATCH v4 22/23] x86: Add EVEX " Noah Goldstein
@ 2022-03-25 18:14 ` H.J. Lu
2022-05-12 19:52 ` Sunil Pandey
1 sibling, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-25 18:14 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Mar 24, 2022 at 4:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
>
> All string/memory tests pass.
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> ---
> sysdeps/x86_64/multiarch/Makefile | 4 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++
> sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 +
> .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++
> sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++
> sysdeps/x86_64/multiarch/strcmp-avx2.S | 237 +++++++++++++++---
> .../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++
> sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++
> 8 files changed, 331 insertions(+), 31 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index e7b413edad..06e1848823 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -55,6 +55,8 @@ sysdep_routines += \
> stpncpy-sse2-unaligned \
> stpncpy-ssse3 \
> strcasecmp_l-avx \
> + strcasecmp_l-avx2 \
> + strcasecmp_l-avx2-rtm \
> strcasecmp_l-sse2 \
> strcasecmp_l-sse4_2 \
> strcasecmp_l-ssse3 \
> @@ -93,6 +95,8 @@ sysdep_routines += \
> strlen-evex \
> strlen-sse2 \
> strncase_l-avx \
> + strncase_l-avx2 \
> + strncase_l-avx2-rtm \
> strncase_l-sse2 \
> strncase_l-sse4_2 \
> strncase_l-ssse3 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index a594f4176e..3c556d07ac 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> IFUNC_IMPL (i, name, strcasecmp,
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + CPU_FEATURE_USABLE (AVX2),
> + __strcasecmp_avx2)
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (RTM)),
> + __strcasecmp_avx2_rtm)
> IFUNC_IMPL_ADD (array, i, strcasecmp,
> CPU_FEATURE_USABLE (AVX),
> __strcasecmp_avx)
> @@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> IFUNC_IMPL (i, name, strcasecmp_l,
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + CPU_FEATURE_USABLE (AVX2),
> + __strcasecmp_l_avx2)
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (RTM)),
> + __strcasecmp_l_avx2_rtm)
> IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> CPU_FEATURE_USABLE (AVX),
> __strcasecmp_l_avx)
> @@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> IFUNC_IMPL (i, name, strncasecmp,
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + CPU_FEATURE_USABLE (AVX2),
> + __strncasecmp_avx2)
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (RTM)),
> + __strncasecmp_avx2_rtm)
> IFUNC_IMPL_ADD (array, i, strncasecmp,
> CPU_FEATURE_USABLE (AVX),
> __strncasecmp_avx)
> @@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> IFUNC_IMPL (i, name, strncasecmp_l,
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + CPU_FEATURE_USABLE (AVX2),
> + __strncasecmp_l_avx2)
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (RTM)),
> + __strncasecmp_l_avx2_rtm)
> IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> CPU_FEATURE_USABLE (AVX),
> __strncasecmp_l_avx)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index 9e3cc61ac0..c4de111fd0 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
>
> static inline void *
> IFUNC_SELECTOR (void)
> {
> const struct cpu_features* cpu_features = __get_cpu_features ();
>
> + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> + {
> + if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> + return OPTIMIZE (avx2_rtm);
> +
> + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> + return OPTIMIZE (avx2);
> + }
> +
> if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
> return OPTIMIZE (avx);
>
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> new file mode 100644
> index 0000000000..09957fc3c5
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> @@ -0,0 +1,15 @@
> +#ifndef STRCMP
> +# define STRCMP __strcasecmp_l_avx2_rtm
> +#endif
> +
> +#define _GLABEL(x) x ## _rtm
> +#define GLABEL(x) _GLABEL(x)
> +
> +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +
> +#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
> +
> +#define SECTION(p) p##.avx.rtm
> +
> +#include "strcasecmp_l-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> new file mode 100644
> index 0000000000..e2762f2a22
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> @@ -0,0 +1,23 @@
> +/* strcasecmp_l optimized with AVX2.
> + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef STRCMP
> +# define STRCMP __strcasecmp_l_avx2
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#include "strcmp-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> index 86a86b68e3..8da09bd86d 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> @@ -20,6 +20,10 @@
>
> # include <sysdep.h>
>
> +# if defined USE_AS_STRCASECMP_L
> +# include "locale-defines.h"
> +# endif
> +
> # ifndef STRCMP
> # define STRCMP __strcmp_avx2
> # endif
> @@ -74,13 +78,88 @@
> # define VEC_OFFSET (-VEC_SIZE)
> # endif
>
> +# ifdef USE_AS_STRCASECMP_L
> +# define BYTE_LOOP_REG OFFSET_REG
> +# else
> +# define BYTE_LOOP_REG ecx
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +# ifdef USE_AS_STRNCMP
> +# define STRCASECMP __strncasecmp_avx2
> +# define LOCALE_REG rcx
> +# define LOCALE_REG_LP RCX_LP
> +# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> +# else
> +# define STRCASECMP __strcasecmp_avx2
> +# define LOCALE_REG rdx
> +# define LOCALE_REG_LP RDX_LP
> +# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> +# endif
> +# endif
> +
> # define xmmZERO xmm15
> # define ymmZERO ymm15
>
> +# define LCASE_MIN_ymm %ymm10
> +# define LCASE_MAX_ymm %ymm11
> +# define CASE_ADD_ymm %ymm12
> +
> +# define LCASE_MIN_xmm %xmm10
> +# define LCASE_MAX_xmm %xmm11
> +# define CASE_ADD_xmm %xmm12
> +
> +	/* r11 is never used elsewhere so this is safe to maintain.  */
> +# define TOLOWER_BASE %r11
> +
> # ifndef SECTION
> # define SECTION(p) p##.avx
> # endif
>
> +# ifdef USE_AS_STRCASECMP_L
> +# define REG(x, y) x ## y
> +# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
> + vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
> + vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
> + vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
> + vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
> + vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
> + vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
> + vpaddb REG(%ext, 8), reg1_in, reg1_out; \
> + vpaddb REG(%ext, 9), reg2_in, reg2_out
> +
> +# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> +# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
> +# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)
> +
> +# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
> + TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
> + VPCMPEQ scratch_reg, s2_reg, reg_out
> +
> +# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
> + VMOVU s2_mem, reg_out; \
> + CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
> +
> +# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
> +# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
> +
> +# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
> +# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
> +
> +# else
> +# define TOLOWER_gpr(...)
> +# define TOLOWER_ymm(...)
> +# define TOLOWER_xmm(...)
> +
> +# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
> + VPCMPEQ s2_reg, s1_reg, reg_out
> +
> +# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
> +
> +# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
> +# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
> +# endif
> +
> /* Warning!
> wcscmp/wcsncmp have to use SIGNED comparison for elements.
> strcmp/strncmp have to use UNSIGNED comparison for elements.
> @@ -102,8 +181,49 @@
> returned. */
>
> .section SECTION(.text), "ax", @progbits
> -ENTRY(STRCMP)
> + .align 16
> + .type STRCMP, @function
> + .globl STRCMP
> + .hidden STRCMP
> +
> +# ifndef GLABEL
> +# define GLABEL(...) __VA_ARGS__
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +ENTRY (GLABEL(STRCASECMP))
> + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
> + mov %fs:(%rax), %LOCALE_REG_LP
> +
> +	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
> + .p2align 4
> +END (GLABEL(STRCASECMP))
> + /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
> +# endif
> +
> + .p2align 4
> +STRCMP:
> + cfi_startproc
> + _CET_ENDBR
> + CALL_MCOUNT
> +
> +# if defined USE_AS_STRCASECMP_L
> + /* We have to fall back on the C implementation for locales with
> + encodings not matching ASCII for single bytes. */
> +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> + mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> +# else
> + mov (%LOCALE_REG), %RAX_LP
> +# endif
> + testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> + jne STRCASECMP_NONASCII
> + leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> +# endif
> +
> # ifdef USE_AS_STRNCMP
> +	/* Don't overwrite LOCALE_REG (rcx) until we have passed
> +	   L(one_or_less). Otherwise we might use the wrong locale in
> +	   the OVERFLOW_STRCMP (strcasecmp_l).  */
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> movl %edx, %edx
> @@ -128,6 +248,30 @@ ENTRY(STRCMP)
> # endif
> # endif
> vpxor %xmmZERO, %xmmZERO, %xmmZERO
> +# if defined USE_AS_STRCASECMP_L
> + .section .rodata.cst32, "aM", @progbits, 32
> + .align 32
> +L(lcase_min):
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> +L(lcase_max):
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> +L(case_add):
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .previous
> +
> + vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
> + vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
> + vmovdqa L(case_add)(%rip), CASE_ADD_ymm
> +# endif
> movl %edi, %eax
> orl %esi, %eax
> sall $20, %eax
> @@ -138,8 +282,10 @@ ENTRY(STRCMP)
> L(no_page_cross):
> /* Safe to compare 4x vectors. */
> VMOVU (%rdi), %ymm0
> - /* 1s where s1 and s2 equal. */
> - VPCMPEQ (%rsi), %ymm0, %ymm1
> + /* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp.
> + Otherwise converts ymm0 and load from rsi to lower. ymm2 is
> + scratch and ymm1 is the return. */
> + CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
> /* 1s at null CHAR. */
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> /* 1s where s1 and s2 equal AND not null CHAR. */
> @@ -172,6 +318,8 @@ L(return_vec_0):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret0):
> @@ -192,6 +340,10 @@ L(ret_zero):
>
> .p2align 4,, 5
> L(one_or_less):
> +# ifdef USE_AS_STRCASECMP_L
> + /* Set locale argument for strcasecmp. */
> + movq %LOCALE_REG, %rdx
> +# endif
> jb L(ret_zero)
> /* 'nbe' covers the case where length is negative (large
> unsigned). */
> @@ -207,6 +359,8 @@ L(one_or_less):
> # else
> movzbl (%rdi), %eax
> movzbl (%rsi), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret1):
> @@ -234,6 +388,8 @@ L(return_vec_1):
> # else
> movzbl VEC_SIZE(%rdi, %rcx), %eax
> movzbl VEC_SIZE(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret2):
> @@ -265,6 +421,8 @@ L(return_vec_2):
> # else
> movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret3):
> @@ -285,6 +443,8 @@ L(return_vec_3):
> # else
> movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret4):
> @@ -295,7 +455,7 @@ L(ret4):
> L(more_3x_vec):
> /* Safe to compare 4x vectors. */
> VMOVU VEC_SIZE(%rdi), %ymm0
> - VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -308,7 +468,7 @@ L(more_3x_vec):
> # endif
>
> VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
> - VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -316,7 +476,7 @@ L(more_3x_vec):
> jnz L(return_vec_2)
>
> VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
> - VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -391,12 +551,10 @@ L(loop_skip_page_cross_check):
> VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
>
> /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */
> - VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
> -
> - VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
> - VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
> - VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
> -
> + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
> + CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
> + CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
> + CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
>
> /* If any mismatches or null CHAR then 0 CHAR, otherwise non-
> zero. */
> @@ -465,6 +623,8 @@ L(return_vec_2_3_end):
> # else
> movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
> movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -508,6 +668,8 @@ L(return_vec_0_end):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -530,6 +692,8 @@ L(return_vec_1_end):
> # else
> movzbl VEC_SIZE(%rdi, %rcx), %eax
> movzbl VEC_SIZE(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -556,6 +720,8 @@ L(return_vec_2_end):
> # else
> movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -583,7 +749,7 @@ L(page_cross_during_loop):
> jle L(less_1x_vec_till_page_cross)
>
> VMOVA (%rdi), %ymm0
> - VPCMPEQ (%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -605,7 +771,7 @@ L(less_1x_vec_till_page_cross):
> here, it means the previous page (rdi - VEC_SIZE) has already
> been loaded earlier so must be valid. */
> VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
> - VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -647,6 +813,8 @@ L(return_page_cross_cmp_mem):
> # else
> movzbl VEC_OFFSET(%rdi, %rcx), %eax
> movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -673,7 +841,7 @@ L(more_2x_vec_till_page_cross):
> iteration here. */
>
> VMOVU VEC_SIZE(%rdi), %ymm0
> - VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -689,7 +857,7 @@ L(more_2x_vec_till_page_cross):
>
> /* Safe to include comparisons from lower bytes. */
> VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
> - VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -697,7 +865,7 @@ L(more_2x_vec_till_page_cross):
> jnz L(return_vec_page_cross_0)
>
> VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
> - VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -715,8 +883,8 @@ L(more_2x_vec_till_page_cross):
> VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
> VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
>
> - VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
> - VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
> + CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
> + CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
> vpand %ymm4, %ymm5, %ymm5
> vpand %ymm6, %ymm7, %ymm7
> VPMINU %ymm5, %ymm7, %ymm7
> @@ -767,6 +935,8 @@ L(return_vec_page_cross_1):
> # else
> movzbl VEC_OFFSET(%rdi, %rcx), %eax
> movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -822,7 +992,7 @@ L(page_cross):
> L(page_cross_loop):
>
> VMOVU (%rdi, %OFFSET_REG64), %ymm0
> - VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -840,11 +1010,11 @@ L(page_cross_loop):
> subl %eax, %OFFSET_REG
> /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
> to not cross page so is safe to load. Since we have already
> - loaded at least 1 VEC from rsi it is also guranteed to be safe.
> - */
> +	   loaded at least 1 VEC from rsi it is also guaranteed to be
> + safe. */
>
> VMOVU (%rdi, %OFFSET_REG64), %ymm0
> - VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -877,6 +1047,8 @@ L(ret_vec_page_cross_cont):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -930,7 +1102,7 @@ L(less_1x_vec_till_page):
> ja L(less_16_till_page)
>
> VMOVU (%rdi), %xmm0
> - VPCMPEQ (%rsi), %xmm0, %xmm1
> + CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> @@ -948,7 +1120,7 @@ L(less_1x_vec_till_page):
> # endif
>
> VMOVU (%rdi, %OFFSET_REG64), %xmm0
> - VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1
> + CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> @@ -986,7 +1158,7 @@ L(less_16_till_page):
> vmovq (%rdi), %xmm0
> vmovq (%rsi), %xmm1
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> - VPCMPEQ %xmm1, %xmm0, %xmm1
> + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> incb %cl
> @@ -1006,7 +1178,7 @@ L(less_16_till_page):
> vmovq (%rdi, %OFFSET_REG64), %xmm0
> vmovq (%rsi, %OFFSET_REG64), %xmm1
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> - VPCMPEQ %xmm1, %xmm0, %xmm1
> + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> incb %cl
> @@ -1062,7 +1234,7 @@ L(ret_less_8_wcs):
> vmovd (%rdi), %xmm0
> vmovd (%rsi), %xmm1
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> - VPCMPEQ %xmm1, %xmm0, %xmm1
> + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> subl $0xf, %ecx
> @@ -1081,7 +1253,7 @@ L(ret_less_8_wcs):
> vmovd (%rdi, %OFFSET_REG64), %xmm0
> vmovd (%rsi, %OFFSET_REG64), %xmm1
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> - VPCMPEQ %xmm1, %xmm0, %xmm1
> + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> subl $0xf, %ecx
> @@ -1115,7 +1287,9 @@ L(less_4_till_page):
> L(less_4_loop):
> movzbl (%rdi), %eax
> movzbl (%rsi, %rdi), %ecx
> - subl %ecx, %eax
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> + subl %BYTE_LOOP_REG, %eax
> jnz L(ret_less_4_loop)
> testl %ecx, %ecx
> jz L(ret_zero_4_loop)
> @@ -1142,5 +1316,6 @@ L(ret_less_4_loop):
> subl %r8d, %eax
> ret
> # endif
> -END(STRCMP)
> + cfi_endproc
> + .size STRCMP, .-STRCMP
> #endif
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> new file mode 100644
> index 0000000000..58c05dcfb8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> @@ -0,0 +1,16 @@
> +#ifndef STRCMP
> +# define STRCMP __strncasecmp_l_avx2_rtm
> +#endif
> +
> +#define _GLABEL(x) x ## _rtm
> +#define GLABEL(x) _GLABEL(x)
> +
> +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +
> +#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
> +
> +#define SECTION(p) p##.avx.rtm
> +#define OVERFLOW_STRCMP __strcasecmp_l_avx2_rtm
> +
> +#include "strncase_l-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> new file mode 100644
> index 0000000000..48c0aa21f8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> @@ -0,0 +1,27 @@
> +/* strncasecmp_l optimized with AVX2.
> + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef STRCMP
> +# define STRCMP __strncasecmp_l_avx2
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#define USE_AS_STRNCMP
> +#ifndef OVERFLOW_STRCMP
> +# define OVERFLOW_STRCMP __strcasecmp_l_avx2
> +#endif
> +#include "strcmp-avx2.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v4 22/23] x86: Add EVEX optimized str{n}casecmp
2022-03-24 23:56 ` [PATCH v4 22/23] x86: Add EVEX " Noah Goldstein
@ 2022-03-25 18:15 ` H.J. Lu
2022-03-25 18:18 ` Noah Goldstein
0 siblings, 1 reply; 76+ messages in thread
From: H.J. Lu @ 2022-03-25 18:15 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Mar 24, 2022 at 4:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
>
> All string/memory tests pass.
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +
> sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 +
> sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++
> sysdeps/x86_64/multiarch/strcmp-evex.S | 290 ++++++++++++++++---
> sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++
> 6 files changed, 321 insertions(+), 40 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 06e1848823..35d80dc2ff 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -57,6 +57,7 @@ sysdep_routines += \
> strcasecmp_l-avx \
> strcasecmp_l-avx2 \
> strcasecmp_l-avx2-rtm \
> + strcasecmp_l-evex \
> strcasecmp_l-sse2 \
> strcasecmp_l-sse4_2 \
> strcasecmp_l-ssse3 \
> @@ -97,6 +98,7 @@ sysdep_routines += \
> strncase_l-avx \
> strncase_l-avx2 \
> strncase_l-avx2-rtm \
> + strncase_l-evex \
> strncase_l-sse2 \
> strncase_l-sse4_2 \
> strncase_l-ssse3 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 3c556d07ac..f1a4d3dac2 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> IFUNC_IMPL (i, name, strcasecmp,
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)),
> + __strcasecmp_evex)
> IFUNC_IMPL_ADD (array, i, strcasecmp,
> CPU_FEATURE_USABLE (AVX2),
> __strcasecmp_avx2)
> @@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> IFUNC_IMPL (i, name, strcasecmp_l,
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)),
> + __strcasecmp_l_evex)
> IFUNC_IMPL_ADD (array, i, strcasecmp,
> CPU_FEATURE_USABLE (AVX2),
> __strcasecmp_l_avx2)
> @@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> IFUNC_IMPL (i, name, strncasecmp,
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)),
> + __strncasecmp_evex)
> IFUNC_IMPL_ADD (array, i, strncasecmp,
> CPU_FEATURE_USABLE (AVX2),
> __strncasecmp_avx2)
> @@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> IFUNC_IMPL (i, name, strncasecmp_l,
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)),
> + __strncasecmp_l_evex)
> IFUNC_IMPL_ADD (array, i, strncasecmp,
> CPU_FEATURE_USABLE (AVX2),
> __strncasecmp_l_avx2)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index c4de111fd0..bf0d146e7f 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
>
> static inline void *
> IFUNC_SELECTOR (void)
> @@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
> if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> {
> + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> + return OPTIMIZE (evex);
> +
> if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> return OPTIMIZE (avx2_rtm);
>
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> new file mode 100644
> index 0000000000..58642db748
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> @@ -0,0 +1,23 @@
> +/* strcasecmp_l optimized with EVEX.
> + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef STRCMP
> +# define STRCMP __strcasecmp_l_evex
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#include "strcmp-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
> index 56d8c118e4..2a5b3ce037 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
> @@ -19,6 +19,9 @@
> #if IS_IN (libc)
>
> # include <sysdep.h>
> +# if defined USE_AS_STRCASECMP_L
> +# include "locale-defines.h"
> +# endif
>
> # ifndef STRCMP
> # define STRCMP __strcmp_evex
> @@ -34,19 +37,29 @@
> # define VMOVA vmovdqa64
>
> # ifdef USE_AS_WCSCMP
> -# define TESTEQ subl $0xff,
> +# ifndef OVERFLOW_STRCMP
> +# define OVERFLOW_STRCMP __wcscmp_evex
> +# endif
> +
> +# define TESTEQ subl $0xff,
> /* Compare packed dwords. */
> # define VPCMP vpcmpd
> # define VPMINU vpminud
> # define VPTESTM vptestmd
> +# define VPTESTNM vptestnmd
> /* 1 dword char == 4 bytes. */
> # define SIZE_OF_CHAR 4
> # else
> +# ifndef OVERFLOW_STRCMP
> +# define OVERFLOW_STRCMP __strcmp_evex
> +# endif
> +
> # define TESTEQ incl
> /* Compare packed bytes. */
> # define VPCMP vpcmpb
> # define VPMINU vpminub
> # define VPTESTM vptestmb
> +# define VPTESTNM vptestnmb
> /* 1 byte char == 1 byte. */
> # define SIZE_OF_CHAR 1
> # endif
> @@ -73,11 +86,16 @@
> # define VEC_OFFSET (-VEC_SIZE)
> # endif
>
> -# define XMMZERO xmm16
> # define XMM0 xmm17
> # define XMM1 xmm18
>
> -# define YMMZERO ymm16
> +# define XMM10 xmm27
> +# define XMM11 xmm28
> +# define XMM12 xmm29
> +# define XMM13 xmm30
> +# define XMM14 xmm31
> +
> +
> # define YMM0 ymm17
> # define YMM1 ymm18
> # define YMM2 ymm19
> @@ -89,6 +107,87 @@
> # define YMM8 ymm25
> # define YMM9 ymm26
> # define YMM10 ymm27
> +# define YMM11 ymm28
> +# define YMM12 ymm29
> +# define YMM13 ymm30
> +# define YMM14 ymm31
> +
> +# ifdef USE_AS_STRCASECMP_L
> +# define BYTE_LOOP_REG OFFSET_REG
> +# else
> +# define BYTE_LOOP_REG ecx
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +# ifdef USE_AS_STRNCMP
> +# define STRCASECMP __strncasecmp_evex
> +# define LOCALE_REG rcx
> +# define LOCALE_REG_LP RCX_LP
> +# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> +# else
> +# define STRCASECMP __strcasecmp_evex
> +# define LOCALE_REG rdx
> +# define LOCALE_REG_LP RDX_LP
> +# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> +# endif
> +# endif
> +
> +# define LCASE_MIN_YMM %YMM12
> +# define LCASE_MAX_YMM %YMM13
> +# define CASE_ADD_YMM %YMM14
> +
> +# define LCASE_MIN_XMM %XMM12
> +# define LCASE_MAX_XMM %XMM13
> +# define CASE_ADD_XMM %XMM14
> +
> + /* NB: wcsncmp uses r11 but strcasecmp is never used in
> + conjunction with wcscmp. */
> +# define TOLOWER_BASE %r11
> +
> +# ifdef USE_AS_STRCASECMP_L
> +# define _REG(x, y) x ## y
> +# define REG(x, y) _REG(x, y)
> +# define TOLOWER(reg1, reg2, ext) \
> + vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \
> + vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \
> + vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \
> + vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \
> + vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \
> + vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6}
> +
> +# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> +# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM)
> +# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM)
> +
> +# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \
> + TOLOWER (s1_reg, s2_reg, ext); \
> + VPCMP $0, s1_reg, s2_reg, reg_out
> +
> +# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \
> + VMOVU s2_mem, s2_reg; \
> + CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
> +
> +# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
> +# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
> +
> +# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
> +# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
> +
> +# else
> +# define TOLOWER_gpr(...)
> +# define TOLOWER_YMM(...)
> +# define TOLOWER_XMM(...)
> +
> +# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \
> + VPCMP $0, s2_reg, s1_reg, reg_out
> +
> +# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
> +
> +# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \
> + VPCMP $0, s2_mem, s1_reg, reg_out
> +
> +# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
> +# endif
>
> /* Warning!
> wcscmp/wcsncmp have to use SIGNED comparison for elements.
> @@ -112,8 +211,45 @@
> returned. */
>
> .section .text.evex, "ax", @progbits
> -ENTRY(STRCMP)
> + .align 16
> + .type STRCMP, @function
> + .globl STRCMP
> + .hidden STRCMP
> +
> +# ifdef USE_AS_STRCASECMP_L
> +ENTRY (STRCASECMP)
> + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
> + mov %fs:(%rax), %LOCALE_REG_LP
> +
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> +END (STRCASECMP)
> + /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
> +# endif
> +
> + .p2align 4
> +STRCMP:
> + cfi_startproc
> + _CET_ENDBR
> + CALL_MCOUNT
> +
> +# if defined USE_AS_STRCASECMP_L
> + /* We have to fall back on the C implementation for locales with
> + encodings not matching ASCII for single bytes. */
> +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> + mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> +# else
> + mov (%LOCALE_REG), %RAX_LP
> +# endif
> + testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> + jne STRCASECMP_NONASCII
> + leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> +# endif
> +
> # ifdef USE_AS_STRNCMP
> + /* Don't overwrite LOCALE_REG (rcx) until we have passed
> + L(one_or_less). Otherwise we might use the wrong locale in
> + the OVERFLOW_STRCMP (strcasecmp_l). */
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> movl %edx, %edx
> @@ -125,6 +261,32 @@ ENTRY(STRCMP)
> actually bound the buffer. */
> jle L(one_or_less)
> # endif
> +
> +# if defined USE_AS_STRCASECMP_L
> + .section .rodata.cst32, "aM", @progbits, 32
> + .align 32
> +L(lcase_min):
> + .quad 0x4141414141414141
> + .quad 0x4141414141414141
> + .quad 0x4141414141414141
> + .quad 0x4141414141414141
> +L(lcase_max):
> + .quad 0x1a1a1a1a1a1a1a1a
> + .quad 0x1a1a1a1a1a1a1a1a
> + .quad 0x1a1a1a1a1a1a1a1a
> + .quad 0x1a1a1a1a1a1a1a1a
> +L(case_add):
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .previous
> +
> + vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
> + vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
> + vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
> +# endif
> +
> movl %edi, %eax
> orl %esi, %eax
> /* Shift out the bits irrelevant to page boundary ([63:12]). */
> @@ -139,7 +301,7 @@ L(no_page_cross):
> VPTESTM %YMM0, %YMM0, %k2
> /* Each bit cleared in K1 represents a mismatch or a null CHAR
> in YMM0 and 32 bytes at (%rsi). */
> - VPCMP $0, (%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_STRNCMP
> cmpq $CHAR_PER_VEC, %rdx
> @@ -169,6 +331,8 @@ L(return_vec_0):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret0):
> @@ -188,11 +352,15 @@ L(ret_zero):
>
> .p2align 4,, 5
> L(one_or_less):
> +# ifdef USE_AS_STRCASECMP_L
> + /* Set locale argument for strcasecmp. */
> + movq %LOCALE_REG, %rdx
> +# endif
> jb L(ret_zero)
> -# ifdef USE_AS_WCSCMP
> /* 'nbe' covers the case where length is negative (large
> unsigned). */
> - jnbe __wcscmp_evex
> + jnbe OVERFLOW_STRCMP
> +# ifdef USE_AS_WCSCMP
> movl (%rdi), %edx
> xorl %eax, %eax
> cmpl (%rsi), %edx
> @@ -201,11 +369,10 @@ L(one_or_less):
> negl %eax
> orl $1, %eax
> # else
> - /* 'nbe' covers the case where length is negative (large
> - unsigned). */
> - jnbe __strcmp_evex
> movzbl (%rdi), %eax
> movzbl (%rsi), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret1):
> @@ -233,6 +400,8 @@ L(return_vec_1):
> # else
> movzbl VEC_SIZE(%rdi, %rcx), %eax
> movzbl VEC_SIZE(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret2):
> @@ -270,6 +439,8 @@ L(return_vec_2):
> # else
> movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret3):
> @@ -290,6 +461,8 @@ L(return_vec_3):
> # else
> movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret4):
> @@ -303,7 +476,7 @@ L(more_3x_vec):
> /* Safe to compare 4x vectors. */
> VMOVU (VEC_SIZE)(%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_1)
> @@ -315,14 +488,14 @@ L(more_3x_vec):
>
> VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_2)
>
> VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_3)
> @@ -381,7 +554,6 @@ L(prepare_loop_aligned):
> subl %esi, %eax
> andl $(PAGE_SIZE - 1), %eax
>
> - vpxorq %YMMZERO, %YMMZERO, %YMMZERO
>
> /* Loop 4x comparisons at a time. */
> .p2align 4
> @@ -413,22 +585,35 @@ L(loop_skip_page_cross_check):
> /* A zero CHAR in YMM9 means that there is a null CHAR. */
> VPMINU %YMM8, %YMM9, %YMM9
>
> - /* Each bit set in K1 represents a non-null CHAR in YMM8. */
> + /* Each bit set in K1 represents a non-null CHAR in YMM9. */
> VPTESTM %YMM9, %YMM9, %k1
> -
> +# ifndef USE_AS_STRCASECMP_L
> vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
> vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
> vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
> oring with YMM1. Result is stored in YMM6. */
> vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
> -
> +# else
> + VMOVU (VEC_SIZE * 0)(%rsi), %YMM1
> + TOLOWER_YMM (%YMM0, %YMM1)
> + VMOVU (VEC_SIZE * 1)(%rsi), %YMM3
> + TOLOWER_YMM (%YMM2, %YMM3)
> + VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
> + TOLOWER_YMM (%YMM4, %YMM5)
> + VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
> + TOLOWER_YMM (%YMM6, %YMM7)
> + vpxorq %YMM0, %YMM1, %YMM1
> + vpxorq %YMM2, %YMM3, %YMM3
> + vpxorq %YMM4, %YMM5, %YMM5
> + vpternlogd $0xde, %YMM7, %YMM1, %YMM6
> +# endif
> /* Or together YMM3, YMM5, and YMM6. */
> vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
>
>
> /* A non-zero CHAR in YMM6 represents a mismatch. */
> - VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
> + VPTESTNM %YMM6, %YMM6, %k0{%k1}
> kmovd %k0, %LOOP_REG
>
> TESTEQ %LOOP_REG
> @@ -437,13 +622,13 @@ L(loop_skip_page_cross_check):
>
> /* Find which VEC has the mismatch of end of string. */
> VPTESTM %YMM0, %YMM0, %k1
> - VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
> + VPTESTNM %YMM1, %YMM1, %k0{%k1}
> kmovd %k0, %ecx
> TESTEQ %ecx
> jnz L(return_vec_0_end)
>
> VPTESTM %YMM2, %YMM2, %k1
> - VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
> + VPTESTNM %YMM3, %YMM3, %k0{%k1}
> kmovd %k0, %ecx
> TESTEQ %ecx
> jnz L(return_vec_1_end)
> @@ -457,7 +642,7 @@ L(return_vec_2_3_end):
> # endif
>
> VPTESTM %YMM4, %YMM4, %k1
> - VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
> + VPTESTNM %YMM5, %YMM5, %k0{%k1}
> kmovd %k0, %ecx
> TESTEQ %ecx
> # if CHAR_PER_VEC <= 16
> @@ -493,6 +678,8 @@ L(return_vec_3_end):
> # else
> movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
> movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -545,6 +732,8 @@ L(return_vec_0_end):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> /* Flip `eax` if `rdi` and `rsi` where swapped in page cross
> logic. Subtract `r8d` after xor for zero case. */
> @@ -569,6 +758,8 @@ L(return_vec_1_end):
> # else
> movzbl VEC_SIZE(%rdi, %rcx), %eax
> movzbl VEC_SIZE(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -598,7 +789,7 @@ L(page_cross_during_loop):
>
> VMOVA (%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_0_end)
> @@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross):
> been loaded earlier so must be valid. */
> VMOVU -VEC_SIZE(%rdi, %rax), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
> -
> + CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
> /* Mask of potentially valid bits. The lower bits can be out of
> range comparisons (but safe regarding page crosses). */
>
> @@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross):
>
> # ifdef USE_AS_STRNCMP
> # ifdef USE_AS_WCSCMP
> + /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> + safe. */
> movl %eax, %r11d
> shrl $2, %r11d
> cmpq %r11, %rdx
> @@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem):
> # else
> movzbl VEC_OFFSET(%rdi, %rcx), %eax
> movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross):
>
> VMOVA VEC_SIZE(%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_1_end)
> @@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross):
> /* Safe to include comparisons from lower bytes. */
> VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_page_cross_0)
>
> VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_page_cross_1)
> @@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross):
> /* Must check length here as length might preclude reading next
> page. */
> # ifdef USE_AS_WCSCMP
> + /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> + safe. */
> movl %eax, %r11d
> shrl $2, %r11d
> cmpq %r11, %rdx
> @@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross):
> VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
> VPMINU %YMM4, %YMM6, %YMM9
> VPTESTM %YMM9, %YMM9, %k1
> -
> +# ifndef USE_AS_STRCASECMP_L
> vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */
> vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
> -
> - VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
> +# else
> + VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
> + TOLOWER_YMM (%YMM4, %YMM5)
> + VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
> + TOLOWER_YMM (%YMM6, %YMM7)
> + vpxorq %YMM4, %YMM5, %YMM5
> + vpternlogd $0xde, %YMM7, %YMM5, %YMM6
> +# endif
> + VPTESTNM %YMM6, %YMM6, %k0{%k1}
> kmovd %k0, %LOOP_REG
> TESTEQ %LOOP_REG
> jnz L(return_vec_2_3_end)
> @@ -815,6 +1018,8 @@ L(return_vec_page_cross_1):
> # else
> movzbl VEC_OFFSET(%rdi, %rcx), %eax
> movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -871,7 +1076,7 @@ L(page_cross):
> L(page_cross_loop):
> VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(check_ret_vec_page_cross)
> @@ -895,7 +1100,7 @@ L(page_cross_loop):
> */
> VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
>
> kmovd %k1, %ecx
> # ifdef USE_AS_STRNCMP
> @@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont):
> # else
> movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
> movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -989,7 +1196,7 @@ L(less_1x_vec_till_page):
> /* Use 16 byte comparison. */
> vmovdqu (%rdi), %xmm0
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, (%rsi), %xmm0, %k1{%k2}
> + CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_WCSCMP
> subl $0xf, %ecx
> @@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page):
> # endif
> vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
> + CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_WCSCMP
> subl $0xf, %ecx
> @@ -1048,7 +1255,7 @@ L(less_16_till_page):
> vmovq (%rdi), %xmm0
> vmovq (%rsi), %xmm1
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_WCSCMP
> subl $0x3, %ecx
> @@ -1068,7 +1275,7 @@ L(less_16_till_page):
> vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_WCSCMP
> subl $0x3, %ecx
> @@ -1128,7 +1335,7 @@ L(ret_less_8_wcs):
> vmovd (%rdi), %xmm0
> vmovd (%rsi), %xmm1
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> subl $0xf, %ecx
> jnz L(check_ret_vec_page_cross)
> @@ -1143,7 +1350,7 @@ L(ret_less_8_wcs):
> vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> subl $0xf, %ecx
> jnz L(check_ret_vec_page_cross)
> @@ -1176,7 +1383,9 @@ L(less_4_till_page):
> L(less_4_loop):
> movzbl (%rdi), %eax
> movzbl (%rsi, %rdi), %ecx
> - subl %ecx, %eax
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> + subl %BYTE_LOOP_REG, %eax
> jnz L(ret_less_4_loop)
> testl %ecx, %ecx
> jz L(ret_zero_4_loop)
> @@ -1203,5 +1412,6 @@ L(ret_less_4_loop):
> subl %r8d, %eax
> ret
> # endif
> -END(STRCMP)
> + cfi_endproc
> + .size STRCMP, .-STRCMP
> #endif
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> new file mode 100644
> index 0000000000..8a5af3695c
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> @@ -0,0 +1,25 @@
> +/* strncasecmp_l optimized with EVEX.
> + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef STRCMP
> +# define STRCMP __strncasecmp_l_evex
> +#endif
> +#define OVERFLOW_STRCMP __strcasecmp_l_evex
> +#define USE_AS_STRCASECMP_L
> +#define USE_AS_STRNCMP
> +#include "strcmp-evex.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v4 22/23] x86: Add EVEX optimized str{n}casecmp
2022-03-25 18:15 ` H.J. Lu
@ 2022-03-25 18:18 ` Noah Goldstein
2022-05-12 19:47 ` Sunil Pandey
0 siblings, 1 reply; 76+ messages in thread
From: Noah Goldstein @ 2022-03-25 18:18 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Fri, Mar 25, 2022 at 1:15 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Mar 24, 2022 at 4:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
> >
> > All string/memory tests pass.
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> > sysdeps/x86_64/multiarch/Makefile | 2 +
> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +
> > sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 +
> > sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++
> > sysdeps/x86_64/multiarch/strcmp-evex.S | 290 ++++++++++++++++---
> > sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++
> > 6 files changed, 321 insertions(+), 40 deletions(-)
> > create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index 06e1848823..35d80dc2ff 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -57,6 +57,7 @@ sysdep_routines += \
> > strcasecmp_l-avx \
> > strcasecmp_l-avx2 \
> > strcasecmp_l-avx2-rtm \
> > + strcasecmp_l-evex \
> > strcasecmp_l-sse2 \
> > strcasecmp_l-sse4_2 \
> > strcasecmp_l-ssse3 \
> > @@ -97,6 +98,7 @@ sysdep_routines += \
> > strncase_l-avx \
> > strncase_l-avx2 \
> > strncase_l-avx2-rtm \
> > + strncase_l-evex \
> > strncase_l-sse2 \
> > strncase_l-sse4_2 \
> > strncase_l-ssse3 \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 3c556d07ac..f1a4d3dac2 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> > /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> > IFUNC_IMPL (i, name, strcasecmp,
> > + IFUNC_IMPL_ADD (array, i, strcasecmp,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)),
> > + __strcasecmp_evex)
> > IFUNC_IMPL_ADD (array, i, strcasecmp,
> > CPU_FEATURE_USABLE (AVX2),
> > __strcasecmp_avx2)
> > @@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> > /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> > IFUNC_IMPL (i, name, strcasecmp_l,
> > + IFUNC_IMPL_ADD (array, i, strcasecmp,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)),
> > + __strcasecmp_l_evex)
> > IFUNC_IMPL_ADD (array, i, strcasecmp,
> > CPU_FEATURE_USABLE (AVX2),
> > __strcasecmp_l_avx2)
> > @@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> > /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> > IFUNC_IMPL (i, name, strncasecmp,
> > + IFUNC_IMPL_ADD (array, i, strncasecmp,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)),
> > + __strncasecmp_evex)
> > IFUNC_IMPL_ADD (array, i, strncasecmp,
> > CPU_FEATURE_USABLE (AVX2),
> > __strncasecmp_avx2)
> > @@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> > /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> > IFUNC_IMPL (i, name, strncasecmp_l,
> > + IFUNC_IMPL_ADD (array, i, strncasecmp,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)),
> > + __strncasecmp_l_evex)
> > IFUNC_IMPL_ADD (array, i, strncasecmp,
> > CPU_FEATURE_USABLE (AVX2),
> > __strncasecmp_l_avx2)
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > index c4de111fd0..bf0d146e7f 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > @@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> >
> > static inline void *
> > IFUNC_SELECTOR (void)
> > @@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
> > if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > {
> > + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > + return OPTIMIZE (evex);
> > +
> > if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > return OPTIMIZE (avx2_rtm);
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > new file mode 100644
> > index 0000000000..58642db748
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > @@ -0,0 +1,23 @@
> > +/* strcasecmp_l optimized with EVEX.
> > + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#ifndef STRCMP
> > +# define STRCMP __strcasecmp_l_evex
> > +#endif
> > +#define USE_AS_STRCASECMP_L
> > +#include "strcmp-evex.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
> > index 56d8c118e4..2a5b3ce037 100644
> > --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
> > @@ -19,6 +19,9 @@
> > #if IS_IN (libc)
> >
> > # include <sysdep.h>
> > +# if defined USE_AS_STRCASECMP_L
> > +# include "locale-defines.h"
> > +# endif
> >
> > # ifndef STRCMP
> > # define STRCMP __strcmp_evex
> > @@ -34,19 +37,29 @@
> > # define VMOVA vmovdqa64
> >
> > # ifdef USE_AS_WCSCMP
> > -# define TESTEQ subl $0xff,
> > +# ifndef OVERFLOW_STRCMP
> > +# define OVERFLOW_STRCMP __wcscmp_evex
> > +# endif
> > +
> > +# define TESTEQ subl $0xff,
> > /* Compare packed dwords. */
> > # define VPCMP vpcmpd
> > # define VPMINU vpminud
> > # define VPTESTM vptestmd
> > +# define VPTESTNM vptestnmd
> > /* 1 dword char == 4 bytes. */
> > # define SIZE_OF_CHAR 4
> > # else
> > +# ifndef OVERFLOW_STRCMP
> > +# define OVERFLOW_STRCMP __strcmp_evex
> > +# endif
> > +
> > # define TESTEQ incl
> > /* Compare packed bytes. */
> > # define VPCMP vpcmpb
> > # define VPMINU vpminub
> > # define VPTESTM vptestmb
> > +# define VPTESTNM vptestnmb
> > /* 1 byte char == 1 byte. */
> > # define SIZE_OF_CHAR 1
> > # endif
> > @@ -73,11 +86,16 @@
> > # define VEC_OFFSET (-VEC_SIZE)
> > # endif
> >
> > -# define XMMZERO xmm16
> > # define XMM0 xmm17
> > # define XMM1 xmm18
> >
> > -# define YMMZERO ymm16
> > +# define XMM10 xmm27
> > +# define XMM11 xmm28
> > +# define XMM12 xmm29
> > +# define XMM13 xmm30
> > +# define XMM14 xmm31
> > +
> > +
> > # define YMM0 ymm17
> > # define YMM1 ymm18
> > # define YMM2 ymm19
> > @@ -89,6 +107,87 @@
> > # define YMM8 ymm25
> > # define YMM9 ymm26
> > # define YMM10 ymm27
> > +# define YMM11 ymm28
> > +# define YMM12 ymm29
> > +# define YMM13 ymm30
> > +# define YMM14 ymm31
> > +
> > +# ifdef USE_AS_STRCASECMP_L
> > +# define BYTE_LOOP_REG OFFSET_REG
> > +# else
> > +# define BYTE_LOOP_REG ecx
> > +# endif
> > +
> > +# ifdef USE_AS_STRCASECMP_L
> > +# ifdef USE_AS_STRNCMP
> > +# define STRCASECMP __strncasecmp_evex
> > +# define LOCALE_REG rcx
> > +# define LOCALE_REG_LP RCX_LP
> > +# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> > +# else
> > +# define STRCASECMP __strcasecmp_evex
> > +# define LOCALE_REG rdx
> > +# define LOCALE_REG_LP RDX_LP
> > +# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> > +# endif
> > +# endif
> > +
> > +# define LCASE_MIN_YMM %YMM12
> > +# define LCASE_MAX_YMM %YMM13
> > +# define CASE_ADD_YMM %YMM14
> > +
> > +# define LCASE_MIN_XMM %XMM12
> > +# define LCASE_MAX_XMM %XMM13
> > +# define CASE_ADD_XMM %XMM14
> > +
> > + /* NB: wcsncmp uses r11 but strcasecmp is never used in
> > + conjunction with wcscmp. */
> > +# define TOLOWER_BASE %r11
> > +
> > +# ifdef USE_AS_STRCASECMP_L
> > +# define _REG(x, y) x ## y
> > +# define REG(x, y) _REG(x, y)
> > +# define TOLOWER(reg1, reg2, ext) \
> > + vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \
> > + vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \
> > + vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \
> > + vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \
> > + vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \
> > + vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6}
> > +
> > +# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> > +# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM)
> > +# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM)
> > +
> > +# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \
> > + TOLOWER (s1_reg, s2_reg, ext); \
> > + VPCMP $0, s1_reg, s2_reg, reg_out
> > +
> > +# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \
> > + VMOVU s2_mem, s2_reg; \
> > + CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
> > +
> > +# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
> > +# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
> > +
> > +# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
> > +# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
> > +
> > +# else
> > +# define TOLOWER_gpr(...)
> > +# define TOLOWER_YMM(...)
> > +# define TOLOWER_XMM(...)
> > +
> > +# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \
> > + VPCMP $0, s2_reg, s1_reg, reg_out
> > +
> > +# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
> > +
> > +# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \
> > + VPCMP $0, s2_mem, s1_reg, reg_out
> > +
> > +# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
> > +# endif
> >
> > /* Warning!
> > wcscmp/wcsncmp have to use SIGNED comparison for elements.
> > @@ -112,8 +211,45 @@
> > returned. */
> >
> > .section .text.evex, "ax", @progbits
> > -ENTRY(STRCMP)
> > + .align 16
> > + .type STRCMP, @function
> > + .globl STRCMP
> > + .hidden STRCMP
> > +
> > +# ifdef USE_AS_STRCASECMP_L
> > +ENTRY (STRCASECMP)
> > + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
> > + mov %fs:(%rax), %LOCALE_REG_LP
> > +
> > + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> > + .p2align 4
> > +END (STRCASECMP)
> > + /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
> > +# endif
> > +
> > + .p2align 4
> > +STRCMP:
> > + cfi_startproc
> > + _CET_ENDBR
> > + CALL_MCOUNT
> > +
> > +# if defined USE_AS_STRCASECMP_L
> > + /* We have to fall back on the C implementation for locales with
> > + encodings not matching ASCII for single bytes. */
> > +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> > + mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> > +# else
> > + mov (%LOCALE_REG), %RAX_LP
> > +# endif
> > + testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> > + jne STRCASECMP_NONASCII
> > + leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> > +# endif
> > +
> > # ifdef USE_AS_STRNCMP
> > + /* Don't overwrite LOCALE_REG (rcx) until we have passed
> > + L(one_or_less). Otherwise we might use the wrong locale in
> > + the OVERFLOW_STRCMP (strcasecmp_l). */
> > # ifdef __ILP32__
> > /* Clear the upper 32 bits. */
> > movl %edx, %edx
> > @@ -125,6 +261,32 @@ ENTRY(STRCMP)
> > actually bound the buffer. */
> > jle L(one_or_less)
> > # endif
> > +
> > +# if defined USE_AS_STRCASECMP_L
> > + .section .rodata.cst32, "aM", @progbits, 32
> > + .align 32
> > +L(lcase_min):
> > + .quad 0x4141414141414141
> > + .quad 0x4141414141414141
> > + .quad 0x4141414141414141
> > + .quad 0x4141414141414141
> > +L(lcase_max):
> > + .quad 0x1a1a1a1a1a1a1a1a
> > + .quad 0x1a1a1a1a1a1a1a1a
> > + .quad 0x1a1a1a1a1a1a1a1a
> > + .quad 0x1a1a1a1a1a1a1a1a
> > +L(case_add):
> > + .quad 0x2020202020202020
> > + .quad 0x2020202020202020
> > + .quad 0x2020202020202020
> > + .quad 0x2020202020202020
> > + .previous
> > +
> > + vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
> > + vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
> > + vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
> > +# endif
> > +
> > movl %edi, %eax
> > orl %esi, %eax
> > /* Shift out the bits irrelevant to page boundary ([63:12]). */
> > @@ -139,7 +301,7 @@ L(no_page_cross):
> > VPTESTM %YMM0, %YMM0, %k2
> > /* Each bit cleared in K1 represents a mismatch or a null CHAR
> > in YMM0 and 32 bytes at (%rsi). */
> > - VPCMP $0, (%rsi), %YMM0, %k1{%k2}
> > + CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> > kmovd %k1, %ecx
> > # ifdef USE_AS_STRNCMP
> > cmpq $CHAR_PER_VEC, %rdx
> > @@ -169,6 +331,8 @@ L(return_vec_0):
> > # else
> > movzbl (%rdi, %rcx), %eax
> > movzbl (%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > # endif
> > L(ret0):
> > @@ -188,11 +352,15 @@ L(ret_zero):
> >
> > .p2align 4,, 5
> > L(one_or_less):
> > +# ifdef USE_AS_STRCASECMP_L
> > + /* Set locale argument for strcasecmp. */
> > + movq %LOCALE_REG, %rdx
> > +# endif
> > jb L(ret_zero)
> > -# ifdef USE_AS_WCSCMP
> > /* 'nbe' covers the case where length is negative (large
> > unsigned). */
> > - jnbe __wcscmp_evex
> > + jnbe OVERFLOW_STRCMP
> > +# ifdef USE_AS_WCSCMP
> > movl (%rdi), %edx
> > xorl %eax, %eax
> > cmpl (%rsi), %edx
> > @@ -201,11 +369,10 @@ L(one_or_less):
> > negl %eax
> > orl $1, %eax
> > # else
> > - /* 'nbe' covers the case where length is negative (large
> > - unsigned). */
> > - jnbe __strcmp_evex
> > movzbl (%rdi), %eax
> > movzbl (%rsi), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > # endif
> > L(ret1):
> > @@ -233,6 +400,8 @@ L(return_vec_1):
> > # else
> > movzbl VEC_SIZE(%rdi, %rcx), %eax
> > movzbl VEC_SIZE(%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > # endif
> > L(ret2):
> > @@ -270,6 +439,8 @@ L(return_vec_2):
> > # else
> > movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
> > movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > # endif
> > L(ret3):
> > @@ -290,6 +461,8 @@ L(return_vec_3):
> > # else
> > movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
> > movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > # endif
> > L(ret4):
> > @@ -303,7 +476,7 @@ L(more_3x_vec):
> > /* Safe to compare 4x vectors. */
> > VMOVU (VEC_SIZE)(%rdi), %YMM0
> > VPTESTM %YMM0, %YMM0, %k2
> > - VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
> > + CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> > kmovd %k1, %ecx
> > TESTEQ %ecx
> > jnz L(return_vec_1)
> > @@ -315,14 +488,14 @@ L(more_3x_vec):
> >
> > VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
> > VPTESTM %YMM0, %YMM0, %k2
> > - VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
> > + CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
> > kmovd %k1, %ecx
> > TESTEQ %ecx
> > jnz L(return_vec_2)
> >
> > VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
> > VPTESTM %YMM0, %YMM0, %k2
> > - VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
> > + CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
> > kmovd %k1, %ecx
> > TESTEQ %ecx
> > jnz L(return_vec_3)
> > @@ -381,7 +554,6 @@ L(prepare_loop_aligned):
> > subl %esi, %eax
> > andl $(PAGE_SIZE - 1), %eax
> >
> > - vpxorq %YMMZERO, %YMMZERO, %YMMZERO
> >
> > /* Loop 4x comparisons at a time. */
> > .p2align 4
> > @@ -413,22 +585,35 @@ L(loop_skip_page_cross_check):
> > /* A zero CHAR in YMM9 means that there is a null CHAR. */
> > VPMINU %YMM8, %YMM9, %YMM9
> >
> > - /* Each bit set in K1 represents a non-null CHAR in YMM8. */
> > + /* Each bit set in K1 represents a non-null CHAR in YMM9. */
> > VPTESTM %YMM9, %YMM9, %k1
> > -
> > +# ifndef USE_AS_STRCASECMP_L
> > vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
> > vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
> > vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> > /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
> > oring with YMM1. Result is stored in YMM6. */
> > vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
> > -
> > +# else
> > + VMOVU (VEC_SIZE * 0)(%rsi), %YMM1
> > + TOLOWER_YMM (%YMM0, %YMM1)
> > + VMOVU (VEC_SIZE * 1)(%rsi), %YMM3
> > + TOLOWER_YMM (%YMM2, %YMM3)
> > + VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
> > + TOLOWER_YMM (%YMM4, %YMM5)
> > + VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
> > + TOLOWER_YMM (%YMM6, %YMM7)
> > + vpxorq %YMM0, %YMM1, %YMM1
> > + vpxorq %YMM2, %YMM3, %YMM3
> > + vpxorq %YMM4, %YMM5, %YMM5
> > + vpternlogd $0xde, %YMM7, %YMM1, %YMM6
> > +# endif
> > /* Or together YMM3, YMM5, and YMM6. */
> > vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
> >
> >
> > /* A non-zero CHAR in YMM6 represents a mismatch. */
> > - VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
> > + VPTESTNM %YMM6, %YMM6, %k0{%k1}
> > kmovd %k0, %LOOP_REG
> >
> > TESTEQ %LOOP_REG
> > @@ -437,13 +622,13 @@ L(loop_skip_page_cross_check):
> >
> > /* Find which VEC has the mismatch of end of string. */
> > VPTESTM %YMM0, %YMM0, %k1
> > - VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
> > + VPTESTNM %YMM1, %YMM1, %k0{%k1}
> > kmovd %k0, %ecx
> > TESTEQ %ecx
> > jnz L(return_vec_0_end)
> >
> > VPTESTM %YMM2, %YMM2, %k1
> > - VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
> > + VPTESTNM %YMM3, %YMM3, %k0{%k1}
> > kmovd %k0, %ecx
> > TESTEQ %ecx
> > jnz L(return_vec_1_end)
> > @@ -457,7 +642,7 @@ L(return_vec_2_3_end):
> > # endif
> >
> > VPTESTM %YMM4, %YMM4, %k1
> > - VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
> > + VPTESTNM %YMM5, %YMM5, %k0{%k1}
> > kmovd %k0, %ecx
> > TESTEQ %ecx
> > # if CHAR_PER_VEC <= 16
> > @@ -493,6 +678,8 @@ L(return_vec_3_end):
> > # else
> > movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
> > movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > xorl %r8d, %eax
> > subl %r8d, %eax
> > @@ -545,6 +732,8 @@ L(return_vec_0_end):
> > # else
> > movzbl (%rdi, %rcx), %eax
> > movzbl (%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > /* Flip `eax` if `rdi` and `rsi` where swapped in page cross
> > logic. Subtract `r8d` after xor for zero case. */
> > @@ -569,6 +758,8 @@ L(return_vec_1_end):
> > # else
> > movzbl VEC_SIZE(%rdi, %rcx), %eax
> > movzbl VEC_SIZE(%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > xorl %r8d, %eax
> > subl %r8d, %eax
> > @@ -598,7 +789,7 @@ L(page_cross_during_loop):
> >
> > VMOVA (%rdi), %YMM0
> > VPTESTM %YMM0, %YMM0, %k2
> > - VPCMP $0, (%rsi), %YMM0, %k1{%k2}
> > + CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> > kmovd %k1, %ecx
> > TESTEQ %ecx
> > jnz L(return_vec_0_end)
> > @@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross):
> > been loaded earlier so must be valid. */
> > VMOVU -VEC_SIZE(%rdi, %rax), %YMM0
> > VPTESTM %YMM0, %YMM0, %k2
> > - VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
> > -
> > + CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
> > /* Mask of potentially valid bits. The lower bits can be out of
> > range comparisons (but safe regarding page crosses). */
> >
> > @@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross):
> >
> > # ifdef USE_AS_STRNCMP
> > # ifdef USE_AS_WCSCMP
> > + /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> > + safe. */
> > movl %eax, %r11d
> > shrl $2, %r11d
> > cmpq %r11, %rdx
> > @@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem):
> > # else
> > movzbl VEC_OFFSET(%rdi, %rcx), %eax
> > movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > xorl %r8d, %eax
> > subl %r8d, %eax
> > @@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross):
> >
> > VMOVA VEC_SIZE(%rdi), %YMM0
> > VPTESTM %YMM0, %YMM0, %k2
> > - VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
> > + CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> > kmovd %k1, %ecx
> > TESTEQ %ecx
> > jnz L(return_vec_1_end)
> > @@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross):
> > /* Safe to include comparisons from lower bytes. */
> > VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
> > VPTESTM %YMM0, %YMM0, %k2
> > - VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
> > + CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
> > kmovd %k1, %ecx
> > TESTEQ %ecx
> > jnz L(return_vec_page_cross_0)
> >
> > VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
> > VPTESTM %YMM0, %YMM0, %k2
> > - VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
> > + CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
> > kmovd %k1, %ecx
> > TESTEQ %ecx
> > jnz L(return_vec_page_cross_1)
> > @@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross):
> > /* Must check length here as length might preclude reading next
> > page. */
> > # ifdef USE_AS_WCSCMP
> > + /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> > + safe. */
> > movl %eax, %r11d
> > shrl $2, %r11d
> > cmpq %r11, %rdx
> > @@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross):
> > VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
> > VPMINU %YMM4, %YMM6, %YMM9
> > VPTESTM %YMM9, %YMM9, %k1
> > -
> > +# ifndef USE_AS_STRCASECMP_L
> > vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> > /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */
> > vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
> > -
> > - VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
> > +# else
> > + VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
> > + TOLOWER_YMM (%YMM4, %YMM5)
> > + VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
> > + TOLOWER_YMM (%YMM6, %YMM7)
> > + vpxorq %YMM4, %YMM5, %YMM5
> > + vpternlogd $0xde, %YMM7, %YMM5, %YMM6
> > +# endif
> > + VPTESTNM %YMM6, %YMM6, %k0{%k1}
> > kmovd %k0, %LOOP_REG
> > TESTEQ %LOOP_REG
> > jnz L(return_vec_2_3_end)
> > @@ -815,6 +1018,8 @@ L(return_vec_page_cross_1):
> > # else
> > movzbl VEC_OFFSET(%rdi, %rcx), %eax
> > movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > xorl %r8d, %eax
> > subl %r8d, %eax
> > @@ -871,7 +1076,7 @@ L(page_cross):
> > L(page_cross_loop):
> > VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> > VPTESTM %YMM0, %YMM0, %k2
> > - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> > + CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> > kmovd %k1, %ecx
> > TESTEQ %ecx
> > jnz L(check_ret_vec_page_cross)
> > @@ -895,7 +1100,7 @@ L(page_cross_loop):
> > */
> > VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> > VPTESTM %YMM0, %YMM0, %k2
> > - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> > + CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> >
> > kmovd %k1, %ecx
> > # ifdef USE_AS_STRNCMP
> > @@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont):
> > # else
> > movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
> > movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > xorl %r8d, %eax
> > subl %r8d, %eax
> > @@ -989,7 +1196,7 @@ L(less_1x_vec_till_page):
> > /* Use 16 byte comparison. */
> > vmovdqu (%rdi), %xmm0
> > VPTESTM %xmm0, %xmm0, %k2
> > - VPCMP $0, (%rsi), %xmm0, %k1{%k2}
> > + CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
> > kmovd %k1, %ecx
> > # ifdef USE_AS_WCSCMP
> > subl $0xf, %ecx
> > @@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page):
> > # endif
> > vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > VPTESTM %xmm0, %xmm0, %k2
> > - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
> > + CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
> > kmovd %k1, %ecx
> > # ifdef USE_AS_WCSCMP
> > subl $0xf, %ecx
> > @@ -1048,7 +1255,7 @@ L(less_16_till_page):
> > vmovq (%rdi), %xmm0
> > vmovq (%rsi), %xmm1
> > VPTESTM %xmm0, %xmm0, %k2
> > - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> > + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > kmovd %k1, %ecx
> > # ifdef USE_AS_WCSCMP
> > subl $0x3, %ecx
> > @@ -1068,7 +1275,7 @@ L(less_16_till_page):
> > vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> > VPTESTM %xmm0, %xmm0, %k2
> > - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> > + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > kmovd %k1, %ecx
> > # ifdef USE_AS_WCSCMP
> > subl $0x3, %ecx
> > @@ -1128,7 +1335,7 @@ L(ret_less_8_wcs):
> > vmovd (%rdi), %xmm0
> > vmovd (%rsi), %xmm1
> > VPTESTM %xmm0, %xmm0, %k2
> > - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> > + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > kmovd %k1, %ecx
> > subl $0xf, %ecx
> > jnz L(check_ret_vec_page_cross)
> > @@ -1143,7 +1350,7 @@ L(ret_less_8_wcs):
> > vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> > VPTESTM %xmm0, %xmm0, %k2
> > - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> > + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > kmovd %k1, %ecx
> > subl $0xf, %ecx
> > jnz L(check_ret_vec_page_cross)
> > @@ -1176,7 +1383,9 @@ L(less_4_till_page):
> > L(less_4_loop):
> > movzbl (%rdi), %eax
> > movzbl (%rsi, %rdi), %ecx
> > - subl %ecx, %eax
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> > + subl %BYTE_LOOP_REG, %eax
> > jnz L(ret_less_4_loop)
> > testl %ecx, %ecx
> > jz L(ret_zero_4_loop)
> > @@ -1203,5 +1412,6 @@ L(ret_less_4_loop):
> > subl %r8d, %eax
> > ret
> > # endif
> > -END(STRCMP)
> > + cfi_endproc
> > + .size STRCMP, .-STRCMP
> > #endif
> > diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> > new file mode 100644
> > index 0000000000..8a5af3695c
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> > @@ -0,0 +1,25 @@
> > +/* strncasecmp_l optimized with EVEX.
> > + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#ifndef STRCMP
> > +# define STRCMP __strncasecmp_l_evex
> > +#endif
> > +#define OVERFLOW_STRCMP __strcasecmp_l_evex
> > +#define USE_AS_STRCASECMP_L
> > +#define USE_AS_STRNCMP
> > +#include "strcmp-evex.S"
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks, pushed the patchset.
>
> Thanks.
>
> --
> H.J.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v2 12/31] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
2022-03-24 21:58 ` H.J. Lu
@ 2022-05-04 6:05 ` Sunil Pandey
0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-04 6:05 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Thu, Mar 24, 2022 at 2:59 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Mar 24, 2022 at 2:43 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 4:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Thu, Mar 24, 2022 at 1:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
> > > > __wcscmp_avx2.
> > > >
> > > > commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
> > > > Author: Noah Goldstein <goldstein.w.n@gmail.com>
> > > > Date: Sun Jan 9 16:02:21 2022 -0600
> > > >
> > > > x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
> > > >
> > > > Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set
> > > > to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which
> > > > can cause spurious aborts.
> > > >
> > > > This change will need to be backported.
> > > >
> > > > All string/memory tests pass.
> > > > ---
> > > > sysdeps/x86/tst-strncmp-rtm.c | 15 +++++++++++++++
> > > > sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
> > > > 2 files changed, 16 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
> > > > index 300bc8c281..a3b14e72ff 100644
> > > > --- a/sysdeps/x86/tst-strncmp-rtm.c
> > > > +++ b/sysdeps/x86/tst-strncmp-rtm.c
> > > > @@ -70,6 +70,16 @@ function_overflow (void)
> > > > return 1;
> > > > }
> > > >
> > > > +__attribute__ ((noinline, noclone))
> > > > +static int
> > > > +function_overflow2 (void)
> > > > +{
> > > > + if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
> > > > + return 0;
> > > > + else
> > > > + return 1;
> > > > +}
> > > > +
> > > > static int
> > > > do_test (void)
> > > > {
> > > > @@ -77,5 +87,10 @@ do_test (void)
> > > > if (status != EXIT_SUCCESS)
> > > > return status;
> > > > status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
> > > > + if (status != EXIT_SUCCESS)
> > > > + return status;
> > > > + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
> > > > + if (status != EXIT_SUCCESS)
> > > > + return status;
> > > > return status;
> > > > }
> > > > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > > index 52ff5ad724..86a86b68e3 100644
> > > > --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > > +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > > > @@ -122,7 +122,7 @@ ENTRY(STRCMP)
> > > > are cases where length is large enough that it can never be a
> > > > bound on valid memory so just use wcscmp. */
> > > > shrq $56, %rcx
> > > > - jnz __wcscmp_avx2
> > > > + jnz OVERFLOW_STRCMP
> > > >
> > > > leaq (, %rdx, 4), %rdx
> > > > # endif
> > > > --
> > > > 2.25.1
> > > >
> > >
> > > LGTM. Verified on AVX2 machine with RTM. Without the fix,
> > > the new testcase failed.
> >
> > And that w/ the fix it passes?
>
> Yes.
>
> > >
> > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > >
> > > Thanks.
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch
2022-03-24 19:36 ` H.J. Lu
@ 2022-05-12 19:31 ` Sunil Pandey
0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:31 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Thu, Mar 24, 2022 at 12:37 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Mar 24, 2022 at 12:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > Small code cleanup for size: -53 bytes.
> > > >
> > > > Add comment justifying using a branch to do NULL/non-null return.
> > >
> > >
> > > Do you have followup patches to improve its performance? We are
> > > backporting all x86-64 improvements to Intel release branches:
> > >
> > > https://gitlab.com/x86-glibc/glibc/-/wikis/home
> > >
> > > Patches without performance improvements are undesirable.
> >
> > No further changes planned at the moment, code size saves
> > seem worth it for master though. Also in favor of adding the comment
> > as I think its non-intuitive.
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 04/23] x86: Code cleanup in strchr-evex and comment justifying branch
2022-03-24 18:54 ` H.J. Lu
@ 2022-05-12 19:32 ` Sunil Pandey
0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:32 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Thu, Mar 24, 2022 at 11:55 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Small code cleanup for size: -81 bytes.
> >
> > Add comment justifying using a branch to do NULL/non-null return.
> >
> > All string/memory tests pass and no regressions in benchtests.
> >
> > geometric_mean(N=20) of all benchmarks New / Original: .985
> > ---
> > Geometric Mean N=20 runs; All functions page aligned
> > length, alignment, pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
> > 2048, 0, 32, 0, 23, 127, 0.878
> > 2048, 1, 32, 0, 23, 127, 0.88
> > 2048, 0, 64, 0, 23, 127, 0.997
> > 2048, 2, 64, 0, 23, 127, 1.001
> > 2048, 0, 128, 0, 23, 127, 0.973
> > 2048, 3, 128, 0, 23, 127, 0.971
> > 2048, 0, 256, 0, 23, 127, 0.976
> > 2048, 4, 256, 0, 23, 127, 0.973
> > 2048, 0, 512, 0, 23, 127, 1.001
> > 2048, 5, 512, 0, 23, 127, 1.004
> > 2048, 0, 1024, 0, 23, 127, 1.005
> > 2048, 6, 1024, 0, 23, 127, 1.007
> > 2048, 0, 2048, 0, 23, 127, 1.035
> > 2048, 7, 2048, 0, 23, 127, 1.03
> > 4096, 0, 32, 0, 23, 127, 0.889
> > 4096, 1, 32, 0, 23, 127, 0.891
> > 4096, 0, 64, 0, 23, 127, 1.012
> > 4096, 2, 64, 0, 23, 127, 1.017
> > 4096, 0, 128, 0, 23, 127, 0.975
> > 4096, 3, 128, 0, 23, 127, 0.974
> > 4096, 0, 256, 0, 23, 127, 0.974
> > 4096, 4, 256, 0, 23, 127, 0.972
> > 4096, 0, 512, 0, 23, 127, 1.002
> > 4096, 5, 512, 0, 23, 127, 1.016
> > 4096, 0, 1024, 0, 23, 127, 1.009
> > 4096, 6, 1024, 0, 23, 127, 1.008
> > 4096, 0, 2048, 0, 23, 127, 1.003
> > 4096, 7, 2048, 0, 23, 127, 1.004
> > 256, 1, 64, 0, 23, 127, 0.993
> > 256, 2, 64, 0, 23, 127, 0.999
> > 256, 3, 64, 0, 23, 127, 0.992
> > 256, 4, 64, 0, 23, 127, 0.99
> > 256, 5, 64, 0, 23, 127, 0.99
> > 256, 6, 64, 0, 23, 127, 0.994
> > 256, 7, 64, 0, 23, 127, 0.991
> > 512, 0, 256, 0, 23, 127, 0.971
> > 512, 16, 256, 0, 23, 127, 0.971
> > 512, 32, 256, 0, 23, 127, 1.005
> > 512, 48, 256, 0, 23, 127, 0.998
> > 512, 64, 256, 0, 23, 127, 1.001
> > 512, 80, 256, 0, 23, 127, 1.002
> > 512, 96, 256, 0, 23, 127, 1.005
> > 512, 112, 256, 0, 23, 127, 1.012
> > 1, 0, 0, 0, 23, 127, 1.024
> > 2, 0, 1, 0, 23, 127, 0.991
> > 3, 0, 2, 0, 23, 127, 0.997
> > 4, 0, 3, 0, 23, 127, 0.984
> > 5, 0, 4, 0, 23, 127, 0.993
> > 6, 0, 5, 0, 23, 127, 0.985
> > 7, 0, 6, 0, 23, 127, 0.979
> > 8, 0, 7, 0, 23, 127, 0.975
> > 9, 0, 8, 0, 23, 127, 0.965
> > 10, 0, 9, 0, 23, 127, 0.957
> > 11, 0, 10, 0, 23, 127, 0.979
> > 12, 0, 11, 0, 23, 127, 0.987
> > 13, 0, 12, 0, 23, 127, 1.023
> > 14, 0, 13, 0, 23, 127, 0.997
> > 15, 0, 14, 0, 23, 127, 0.983
> > 16, 0, 15, 0, 23, 127, 0.987
> > 17, 0, 16, 0, 23, 127, 0.993
> > 18, 0, 17, 0, 23, 127, 0.985
> > 19, 0, 18, 0, 23, 127, 0.999
> > 20, 0, 19, 0, 23, 127, 0.998
> > 21, 0, 20, 0, 23, 127, 0.983
> > 22, 0, 21, 0, 23, 127, 0.983
> > 23, 0, 22, 0, 23, 127, 1.002
> > 24, 0, 23, 0, 23, 127, 1.0
> > 25, 0, 24, 0, 23, 127, 1.002
> > 26, 0, 25, 0, 23, 127, 0.984
> > 27, 0, 26, 0, 23, 127, 0.994
> > 28, 0, 27, 0, 23, 127, 0.995
> > 29, 0, 28, 0, 23, 127, 1.017
> > 30, 0, 29, 0, 23, 127, 1.009
> > 31, 0, 30, 0, 23, 127, 1.001
> > 32, 0, 31, 0, 23, 127, 1.021
> > 2048, 0, 32, 0, 0, 127, 0.899
> > 2048, 1, 32, 0, 0, 127, 0.93
> > 2048, 0, 64, 0, 0, 127, 1.009
> > 2048, 2, 64, 0, 0, 127, 1.023
> > 2048, 0, 128, 0, 0, 127, 0.973
> > 2048, 3, 128, 0, 0, 127, 0.975
> > 2048, 0, 256, 0, 0, 127, 0.974
> > 2048, 4, 256, 0, 0, 127, 0.97
> > 2048, 0, 512, 0, 0, 127, 0.999
> > 2048, 5, 512, 0, 0, 127, 1.004
> > 2048, 0, 1024, 0, 0, 127, 1.008
> > 2048, 6, 1024, 0, 0, 127, 1.008
> > 2048, 0, 2048, 0, 0, 127, 0.996
> > 2048, 7, 2048, 0, 0, 127, 1.002
> > 4096, 0, 32, 0, 0, 127, 0.872
> > 4096, 1, 32, 0, 0, 127, 0.881
> > 4096, 0, 64, 0, 0, 127, 1.006
> > 4096, 2, 64, 0, 0, 127, 1.005
> > 4096, 0, 128, 0, 0, 127, 0.973
> > 4096, 3, 128, 0, 0, 127, 0.974
> > 4096, 0, 256, 0, 0, 127, 0.969
> > 4096, 4, 256, 0, 0, 127, 0.971
> > 4096, 0, 512, 0, 0, 127, 1.0
> > 4096, 5, 512, 0, 0, 127, 1.005
> > 4096, 0, 1024, 0, 0, 127, 1.007
> > 4096, 6, 1024, 0, 0, 127, 1.009
> > 4096, 0, 2048, 0, 0, 127, 1.005
> > 4096, 7, 2048, 0, 0, 127, 1.007
> > 256, 1, 64, 0, 0, 127, 0.994
> > 256, 2, 64, 0, 0, 127, 1.008
> > 256, 3, 64, 0, 0, 127, 1.019
> > 256, 4, 64, 0, 0, 127, 0.991
> > 256, 5, 64, 0, 0, 127, 0.992
> > 256, 6, 64, 0, 0, 127, 0.991
> > 256, 7, 64, 0, 0, 127, 0.988
> > 512, 0, 256, 0, 0, 127, 0.971
> > 512, 16, 256, 0, 0, 127, 0.967
> > 512, 32, 256, 0, 0, 127, 1.005
> > 512, 48, 256, 0, 0, 127, 1.001
> > 512, 64, 256, 0, 0, 127, 1.009
> > 512, 80, 256, 0, 0, 127, 1.008
> > 512, 96, 256, 0, 0, 127, 1.009
> > 512, 112, 256, 0, 0, 127, 1.016
> > 1, 0, 0, 0, 0, 127, 1.038
> > 2, 0, 1, 0, 0, 127, 1.009
> > 3, 0, 2, 0, 0, 127, 0.992
> > 4, 0, 3, 0, 0, 127, 1.004
> > 5, 0, 4, 0, 0, 127, 0.966
> > 6, 0, 5, 0, 0, 127, 0.968
> > 7, 0, 6, 0, 0, 127, 1.004
> > 8, 0, 7, 0, 0, 127, 0.99
> > 9, 0, 8, 0, 0, 127, 0.958
> > 10, 0, 9, 0, 0, 127, 0.96
> > 11, 0, 10, 0, 0, 127, 0.948
> > 12, 0, 11, 0, 0, 127, 0.984
> > 13, 0, 12, 0, 0, 127, 0.967
> > 14, 0, 13, 0, 0, 127, 0.993
> > 15, 0, 14, 0, 0, 127, 0.991
> > 16, 0, 15, 0, 0, 127, 1.0
> > 17, 0, 16, 0, 0, 127, 0.982
> > 18, 0, 17, 0, 0, 127, 0.977
> > 19, 0, 18, 0, 0, 127, 0.987
> > 20, 0, 19, 0, 0, 127, 0.978
> > 21, 0, 20, 0, 0, 127, 1.0
> > 22, 0, 21, 0, 0, 127, 0.99
> > 23, 0, 22, 0, 0, 127, 0.988
> > 24, 0, 23, 0, 0, 127, 0.997
> > 25, 0, 24, 0, 0, 127, 1.003
> > 26, 0, 25, 0, 0, 127, 1.004
> > 27, 0, 26, 0, 0, 127, 0.982
> > 28, 0, 27, 0, 0, 127, 0.972
> > 29, 0, 28, 0, 0, 127, 0.978
> > 30, 0, 29, 0, 0, 127, 0.992
> > 31, 0, 30, 0, 0, 127, 0.986
> > 32, 0, 31, 0, 0, 127, 1.0
> >
> > 16, 0, 15, 1, 1, 0, 0.997
> > 16, 0, 15, 1, 0, 0, 1.001
> > 16, 0, 15, 1, 1, 0.1, 0.984
> > 16, 0, 15, 1, 0, 0.1, 0.999
> > 16, 0, 15, 1, 1, 0.25, 0.929
> > 16, 0, 15, 1, 0, 0.25, 1.001
> > 16, 0, 15, 1, 1, 0.33, 0.892
> > 16, 0, 15, 1, 0, 0.33, 0.996
> > 16, 0, 15, 1, 1, 0.5, 0.897
> > 16, 0, 15, 1, 0, 0.5, 1.009
> > 16, 0, 15, 1, 1, 0.66, 0.882
> > 16, 0, 15, 1, 0, 0.66, 0.967
> > 16, 0, 15, 1, 1, 0.75, 0.919
> > 16, 0, 15, 1, 0, 0.75, 1.027
> > 16, 0, 15, 1, 1, 0.9, 0.949
> > 16, 0, 15, 1, 0, 0.9, 1.021
> > 16, 0, 15, 1, 1, 1, 0.998
> > 16, 0, 15, 1, 0, 1, 0.999
> >
> > sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++-----------
> > 1 file changed, 80 insertions(+), 66 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
> > index f62cd9d144..ec739fb8f9 100644
> > --- a/sysdeps/x86_64/multiarch/strchr-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strchr-evex.S
> > @@ -30,6 +30,7 @@
> > # ifdef USE_AS_WCSCHR
> > # define VPBROADCAST vpbroadcastd
> > # define VPCMP vpcmpd
> > +# define VPTESTN vptestnmd
> > # define VPMINU vpminud
> > # define CHAR_REG esi
> > # define SHIFT_REG ecx
> > @@ -37,6 +38,7 @@
> > # else
> > # define VPBROADCAST vpbroadcastb
> > # define VPCMP vpcmpb
> > +# define VPTESTN vptestnmb
> > # define VPMINU vpminub
> > # define CHAR_REG sil
> > # define SHIFT_REG edx
> > @@ -61,13 +63,11 @@
> > # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> >
> > .section .text.evex,"ax",@progbits
> > -ENTRY (STRCHR)
> > +ENTRY_P2ALIGN (STRCHR, 5)
> > /* Broadcast CHAR to YMM0. */
> > VPBROADCAST %esi, %YMM0
> > movl %edi, %eax
> > andl $(PAGE_SIZE - 1), %eax
> > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO
> > -
> > /* Check if we cross page boundary with one vector load.
> > Otherwise it is safe to use an unaligned load. */
> > cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > @@ -81,49 +81,35 @@ ENTRY (STRCHR)
> > vpxorq %YMM1, %YMM0, %YMM2
> > VPMINU %YMM2, %YMM1, %YMM2
> > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM2, %k0
> > + VPTESTN %YMM2, %YMM2, %k0
> > kmovd %k0, %eax
> > testl %eax, %eax
> > jz L(aligned_more)
> > tzcntl %eax, %eax
> > +# ifndef USE_AS_STRCHRNUL
> > + /* Found CHAR or the null byte. */
> > + cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > + /* NB: Use a branch instead of cmovcc here. The expectation is
> > + that with strchr the user will branch based on input being
> > + null. Since this branch will be 100% predictive of the user
> > + branch a branch miss here should save what otherwise would
> > + be branch miss in the user code. Otherwise using a branch 1)
> > + saves code size and 2) is faster in highly predictable
> > + environments. */
> > + jne L(zero)
> > +# endif
> > # ifdef USE_AS_WCSCHR
> > /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> > */
> > leaq (%rdi, %rax, CHAR_SIZE), %rax
> > # else
> > addq %rdi, %rax
> > -# endif
> > -# ifndef USE_AS_STRCHRNUL
> > - /* Found CHAR or the null byte. */
> > - cmp (%rax), %CHAR_REG
> > - jne L(zero)
> > # endif
> > ret
> >
> > - /* .p2align 5 helps keep performance more consistent if ENTRY()
> > - alignment % 32 was either 16 or 0. As well this makes the
> > - alignment % 32 of the loop_4x_vec fixed which makes tuning it
> > - easier. */
> > - .p2align 5
> > -L(first_vec_x3):
> > - tzcntl %eax, %eax
> > -# ifndef USE_AS_STRCHRNUL
> > - /* Found CHAR or the null byte. */
> > - cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > - jne L(zero)
> > -# endif
> > - /* NB: Multiply sizeof char type (1 or 4) to get the number of
> > - bytes. */
> > - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > - ret
> >
> > -# ifndef USE_AS_STRCHRNUL
> > -L(zero):
> > - xorl %eax, %eax
> > - ret
> > -# endif
> >
> > - .p2align 4
> > + .p2align 4,, 10
> > L(first_vec_x4):
> > # ifndef USE_AS_STRCHRNUL
> > /* Check to see if first match was CHAR (k0) or null (k1). */
> > @@ -144,9 +130,18 @@ L(first_vec_x4):
> > leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> > ret
> >
> > +# ifndef USE_AS_STRCHRNUL
> > +L(zero):
> > + xorl %eax, %eax
> > + ret
> > +# endif
> > +
> > +
> > .p2align 4
> > L(first_vec_x1):
> > - tzcntl %eax, %eax
> > +	/* Use bsf here to save 1-byte keeping the block in 1x
> > +	   fetch block. eax guaranteed non-zero.  */
> > + bsfl %eax, %eax
> > # ifndef USE_AS_STRCHRNUL
> > /* Found CHAR or the null byte. */
> > cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > @@ -158,7 +153,7 @@ L(first_vec_x1):
> > leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > ret
> >
> > - .p2align 4
> > + .p2align 4,, 10
> > L(first_vec_x2):
> > # ifndef USE_AS_STRCHRNUL
> > /* Check to see if first match was CHAR (k0) or null (k1). */
> > @@ -179,6 +174,21 @@ L(first_vec_x2):
> > leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > ret
> >
> > + .p2align 4,, 10
> > +L(first_vec_x3):
> > +	/* Use bsf here to save 1-byte keeping the block in 1x
> > +	   fetch block. eax guaranteed non-zero.  */
> > + bsfl %eax, %eax
> > +# ifndef USE_AS_STRCHRNUL
> > + /* Found CHAR or the null byte. */
> > + cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > + jne L(zero)
> > +# endif
> > + /* NB: Multiply sizeof char type (1 or 4) to get the number of
> > + bytes. */
> > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> > +
> > .p2align 4
> > L(aligned_more):
> > /* Align data to VEC_SIZE. */
> > @@ -195,7 +205,7 @@ L(cross_page_continue):
> > vpxorq %YMM1, %YMM0, %YMM2
> > VPMINU %YMM2, %YMM1, %YMM2
> > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM2, %k0
> > + VPTESTN %YMM2, %YMM2, %k0
> > kmovd %k0, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x1)
> > @@ -206,7 +216,7 @@ L(cross_page_continue):
> > /* Each bit in K0 represents a CHAR in YMM1. */
> > VPCMP $0, %YMM1, %YMM0, %k0
> > /* Each bit in K1 represents a CHAR in YMM1. */
> > - VPCMP $0, %YMM1, %YMMZERO, %k1
> > + VPTESTN %YMM1, %YMM1, %k1
> > kortestd %k0, %k1
> > jnz L(first_vec_x2)
> >
> > @@ -215,7 +225,7 @@ L(cross_page_continue):
> > vpxorq %YMM1, %YMM0, %YMM2
> > VPMINU %YMM2, %YMM1, %YMM2
> > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM2, %k0
> > + VPTESTN %YMM2, %YMM2, %k0
> > kmovd %k0, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x3)
> > @@ -224,7 +234,7 @@ L(cross_page_continue):
> > /* Each bit in K0 represents a CHAR in YMM1. */
> > VPCMP $0, %YMM1, %YMM0, %k0
> > /* Each bit in K1 represents a CHAR in YMM1. */
> > - VPCMP $0, %YMM1, %YMMZERO, %k1
> > + VPTESTN %YMM1, %YMM1, %k1
> > kortestd %k0, %k1
> > jnz L(first_vec_x4)
> >
> > @@ -265,33 +275,33 @@ L(loop_4x_vec):
> > VPMINU %YMM3, %YMM4, %YMM4
> > VPMINU %YMM2, %YMM4, %YMM4{%k4}{z}
> >
> > - VPCMP $0, %YMMZERO, %YMM4, %k1
> > + VPTESTN %YMM4, %YMM4, %k1
> > kmovd %k1, %ecx
> > subq $-(VEC_SIZE * 4), %rdi
> > testl %ecx, %ecx
> > jz L(loop_4x_vec)
> >
> > - VPCMP $0, %YMMZERO, %YMM1, %k0
> > + VPTESTN %YMM1, %YMM1, %k0
> > kmovd %k0, %eax
> > testl %eax, %eax
> > jnz L(last_vec_x1)
> >
> > - VPCMP $0, %YMMZERO, %YMM2, %k0
> > + VPTESTN %YMM2, %YMM2, %k0
> > kmovd %k0, %eax
> > testl %eax, %eax
> > jnz L(last_vec_x2)
> >
> > - VPCMP $0, %YMMZERO, %YMM3, %k0
> > + VPTESTN %YMM3, %YMM3, %k0
> > kmovd %k0, %eax
> > /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */
> > # ifdef USE_AS_WCSCHR
> > sall $8, %ecx
> > orl %ecx, %eax
> > - tzcntl %eax, %eax
> > + bsfl %eax, %eax
> > # else
> > salq $32, %rcx
> > orq %rcx, %rax
> > - tzcntq %rax, %rax
> > + bsfq %rax, %rax
> > # endif
> > # ifndef USE_AS_STRCHRNUL
> > /* Check if match was CHAR or null. */
> > @@ -303,28 +313,28 @@ L(loop_4x_vec):
> > leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > ret
> >
> > -# ifndef USE_AS_STRCHRNUL
> > -L(zero_end):
> > - xorl %eax, %eax
> > - ret
> > + .p2align 4,, 8
> > +L(last_vec_x1):
> > + bsfl %eax, %eax
> > +# ifdef USE_AS_WCSCHR
> > + /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> > + */
> > + leaq (%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > + addq %rdi, %rax
> > # endif
> >
> > - .p2align 4
> > -L(last_vec_x1):
> > - tzcntl %eax, %eax
> > # ifndef USE_AS_STRCHRNUL
> > /* Check if match was null. */
> > - cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > + cmp (%rax), %CHAR_REG
> > jne L(zero_end)
> > # endif
> > - /* NB: Multiply sizeof char type (1 or 4) to get the number of
> > - bytes. */
> > - leaq (%rdi, %rax, CHAR_SIZE), %rax
> > +
> > ret
> >
> > - .p2align 4
> > + .p2align 4,, 8
> > L(last_vec_x2):
> > - tzcntl %eax, %eax
> > + bsfl %eax, %eax
> > # ifndef USE_AS_STRCHRNUL
> > /* Check if match was null. */
> > cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > @@ -336,7 +346,7 @@ L(last_vec_x2):
> > ret
> >
> > /* Cold case for crossing page with first load. */
> > - .p2align 4
> > + .p2align 4,, 8
> > L(cross_page_boundary):
> > movq %rdi, %rdx
> > /* Align rdi. */
> > @@ -346,9 +356,9 @@ L(cross_page_boundary):
> > vpxorq %YMM1, %YMM0, %YMM2
> > VPMINU %YMM2, %YMM1, %YMM2
> > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM2, %k0
> > + VPTESTN %YMM2, %YMM2, %k0
> > kmovd %k0, %eax
> > - /* Remove the leading bits. */
> > + /* Remove the leading bits. */
> > # ifdef USE_AS_WCSCHR
> > movl %edx, %SHIFT_REG
> > /* NB: Divide shift count by 4 since each bit in K1 represent 4
> > @@ -360,20 +370,24 @@ L(cross_page_boundary):
> > /* If eax is zero continue. */
> > testl %eax, %eax
> > jz L(cross_page_continue)
> > - tzcntl %eax, %eax
> > -# ifndef USE_AS_STRCHRNUL
> > - /* Check to see if match was CHAR or null. */
> > - cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG
> > - jne L(zero_end)
> > -# endif
> > + bsfl %eax, %eax
> > +
> > # ifdef USE_AS_WCSCHR
> > /* NB: Multiply wchar_t count by 4 to get the number of
> > bytes. */
> > leaq (%rdx, %rax, CHAR_SIZE), %rax
> > # else
> > addq %rdx, %rax
> > +# endif
> > +# ifndef USE_AS_STRCHRNUL
> > + /* Check to see if match was CHAR or null. */
> > + cmp (%rax), %CHAR_REG
> > + je L(cross_page_ret)
> > +L(zero_end):
> > + xorl %eax, %eax
> > +L(cross_page_ret):
> > # endif
> > ret
> >
> > END (STRCHR)
> > -# endif
> > +#endif
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c
2022-03-24 18:55 ` H.J. Lu
@ 2022-05-12 19:34 ` Sunil Pandey
0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:34 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Thu, Mar 24, 2022 at 11:57 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
> > _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
> > sign extensions.
> >
> > geometric_mean(N=20) of all benchmarks that don't fall back on
> > sse2/strlen; New / Original: .928
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=20 runs; All functions page aligned
> > len, align1, align2, pos, New Time / Old Time
> > 0, 0, 0, 512, 1.207
> > 1, 0, 0, 512, 1.039
> > 1, 1, 0, 512, 0.997
> > 1, 0, 1, 512, 0.981
> > 1, 1, 1, 512, 0.977
> > 2, 0, 0, 512, 1.02
> > 2, 2, 0, 512, 0.979
> > 2, 0, 2, 512, 0.902
> > 2, 2, 2, 512, 0.958
> > 3, 0, 0, 512, 0.978
> > 3, 3, 0, 512, 0.988
> > 3, 0, 3, 512, 0.979
> > 3, 3, 3, 512, 0.955
> > 4, 0, 0, 512, 0.969
> > 4, 4, 0, 512, 0.991
> > 4, 0, 4, 512, 0.94
> > 4, 4, 4, 512, 0.958
> > 5, 0, 0, 512, 0.963
> > 5, 5, 0, 512, 1.004
> > 5, 0, 5, 512, 0.948
> > 5, 5, 5, 512, 0.971
> > 6, 0, 0, 512, 0.933
> > 6, 6, 0, 512, 1.007
> > 6, 0, 6, 512, 0.921
> > 6, 6, 6, 512, 0.969
> > 7, 0, 0, 512, 0.928
> > 7, 7, 0, 512, 0.976
> > 7, 0, 7, 512, 0.932
> > 7, 7, 7, 512, 0.995
> > 8, 0, 0, 512, 0.931
> > 8, 0, 8, 512, 0.766
> > 9, 0, 0, 512, 0.965
> > 9, 1, 0, 512, 0.999
> > 9, 0, 9, 512, 0.765
> > 9, 1, 9, 512, 0.97
> > 10, 0, 0, 512, 0.976
> > 10, 2, 0, 512, 0.991
> > 10, 0, 10, 512, 0.768
> > 10, 2, 10, 512, 0.926
> > 11, 0, 0, 512, 0.958
> > 11, 3, 0, 512, 1.006
> > 11, 0, 11, 512, 0.768
> > 11, 3, 11, 512, 0.908
> > 12, 0, 0, 512, 0.945
> > 12, 4, 0, 512, 0.896
> > 12, 0, 12, 512, 0.764
> > 12, 4, 12, 512, 0.785
> > 13, 0, 0, 512, 0.957
> > 13, 5, 0, 512, 1.019
> > 13, 0, 13, 512, 0.76
> > 13, 5, 13, 512, 0.785
> > 14, 0, 0, 512, 0.918
> > 14, 6, 0, 512, 1.004
> > 14, 0, 14, 512, 0.78
> > 14, 6, 14, 512, 0.711
> > 15, 0, 0, 512, 0.855
> > 15, 7, 0, 512, 0.985
> > 15, 0, 15, 512, 0.779
> > 15, 7, 15, 512, 0.772
> > 16, 0, 0, 512, 0.987
> > 16, 0, 16, 512, 0.99
> > 17, 0, 0, 512, 0.996
> > 17, 1, 0, 512, 0.979
> > 17, 0, 17, 512, 1.001
> > 17, 1, 17, 512, 1.03
> > 18, 0, 0, 512, 0.976
> > 18, 2, 0, 512, 0.989
> > 18, 0, 18, 512, 0.976
> > 18, 2, 18, 512, 0.992
> > 19, 0, 0, 512, 0.991
> > 19, 3, 0, 512, 0.988
> > 19, 0, 19, 512, 1.009
> > 19, 3, 19, 512, 1.018
> > 20, 0, 0, 512, 0.999
> > 20, 4, 0, 512, 1.005
> > 20, 0, 20, 512, 0.993
> > 20, 4, 20, 512, 0.983
> > 21, 0, 0, 512, 0.982
> > 21, 5, 0, 512, 0.988
> > 21, 0, 21, 512, 0.978
> > 21, 5, 21, 512, 0.984
> > 22, 0, 0, 512, 0.988
> > 22, 6, 0, 512, 0.979
> > 22, 0, 22, 512, 0.984
> > 22, 6, 22, 512, 0.983
> > 23, 0, 0, 512, 0.996
> > 23, 7, 0, 512, 0.998
> > 23, 0, 23, 512, 0.979
> > 23, 7, 23, 512, 0.987
> > 24, 0, 0, 512, 0.99
> > 24, 0, 24, 512, 0.979
> > 25, 0, 0, 512, 0.985
> > 25, 1, 0, 512, 0.988
> > 25, 0, 25, 512, 0.99
> > 25, 1, 25, 512, 0.986
> > 26, 0, 0, 512, 1.005
> > 26, 2, 0, 512, 0.995
> > 26, 0, 26, 512, 0.992
> > 26, 2, 26, 512, 0.983
> > 27, 0, 0, 512, 0.986
> > 27, 3, 0, 512, 0.978
> > 27, 0, 27, 512, 0.986
> > 27, 3, 27, 512, 0.973
> > 28, 0, 0, 512, 0.995
> > 28, 4, 0, 512, 0.993
> > 28, 0, 28, 512, 0.983
> > 28, 4, 28, 512, 1.005
> > 29, 0, 0, 512, 0.983
> > 29, 5, 0, 512, 0.982
> > 29, 0, 29, 512, 0.984
> > 29, 5, 29, 512, 1.005
> > 30, 0, 0, 512, 0.978
> > 30, 6, 0, 512, 0.985
> > 30, 0, 30, 512, 0.994
> > 30, 6, 30, 512, 0.993
> > 31, 0, 0, 512, 0.984
> > 31, 7, 0, 512, 0.983
> > 31, 0, 31, 512, 1.0
> > 31, 7, 31, 512, 1.031
> > 4, 0, 0, 32, 0.916
> > 4, 1, 0, 32, 0.952
> > 4, 0, 1, 32, 0.927
> > 4, 1, 1, 32, 0.969
> > 4, 0, 0, 64, 0.961
> > 4, 2, 0, 64, 0.955
> > 4, 0, 2, 64, 0.975
> > 4, 2, 2, 64, 0.972
> > 4, 0, 0, 128, 0.971
> > 4, 3, 0, 128, 0.982
> > 4, 0, 3, 128, 0.945
> > 4, 3, 3, 128, 0.971
> > 4, 0, 0, 256, 1.004
> > 4, 4, 0, 256, 0.966
> > 4, 0, 4, 256, 0.961
> > 4, 4, 4, 256, 0.971
> > 4, 5, 0, 512, 0.929
> > 4, 0, 5, 512, 0.969
> > 4, 5, 5, 512, 0.985
> > 4, 0, 0, 1024, 1.003
> > 4, 6, 0, 1024, 1.009
> > 4, 0, 6, 1024, 1.005
> > 4, 6, 6, 1024, 0.999
> > 4, 0, 0, 2048, 0.917
> > 4, 7, 0, 2048, 1.015
> > 4, 0, 7, 2048, 1.011
> > 4, 7, 7, 2048, 0.907
> > 10, 1, 0, 64, 0.964
> > 10, 1, 1, 64, 0.966
> > 10, 2, 0, 64, 0.953
> > 10, 2, 2, 64, 0.972
> > 10, 3, 0, 64, 0.962
> > 10, 3, 3, 64, 0.969
> > 10, 4, 0, 64, 0.957
> > 10, 4, 4, 64, 0.969
> > 10, 5, 0, 64, 0.961
> > 10, 5, 5, 64, 0.965
> > 10, 6, 0, 64, 0.949
> > 10, 6, 6, 64, 0.9
> > 10, 7, 0, 64, 0.957
> > 10, 7, 7, 64, 0.897
> > 6, 0, 0, 0, 0.991
> > 6, 0, 0, 1, 1.011
> > 6, 0, 1, 1, 0.939
> > 6, 0, 0, 2, 1.016
> > 6, 0, 2, 2, 0.94
> > 6, 0, 0, 3, 1.019
> > 6, 0, 3, 3, 0.941
> > 6, 0, 0, 4, 1.056
> > 6, 0, 4, 4, 0.884
> > 6, 0, 0, 5, 0.977
> > 6, 0, 5, 5, 0.934
> > 6, 0, 0, 6, 0.954
> > 6, 0, 6, 6, 0.93
> > 6, 0, 0, 7, 0.963
> > 6, 0, 7, 7, 0.916
> > 6, 0, 0, 8, 0.963
> > 6, 0, 8, 8, 0.945
> > 6, 0, 0, 9, 1.028
> > 6, 0, 9, 9, 0.942
> > 6, 0, 0, 10, 0.955
> > 6, 0, 10, 10, 0.831
> > 6, 0, 0, 11, 0.948
> > 6, 0, 11, 11, 0.82
> > 6, 0, 0, 12, 1.033
> > 6, 0, 12, 12, 0.873
> > 6, 0, 0, 13, 0.983
> > 6, 0, 13, 13, 0.852
> > 6, 0, 0, 14, 0.984
> > 6, 0, 14, 14, 0.853
> > 6, 0, 0, 15, 0.984
> > 6, 0, 15, 15, 0.882
> > 6, 0, 0, 16, 0.971
> > 6, 0, 16, 16, 0.958
> > 6, 0, 0, 17, 0.938
> > 6, 0, 17, 17, 0.947
> > 6, 0, 0, 18, 0.96
> > 6, 0, 18, 18, 0.938
> > 6, 0, 0, 19, 0.903
> > 6, 0, 19, 19, 0.943
> > 6, 0, 0, 20, 0.947
> > 6, 0, 20, 20, 0.951
> > 6, 0, 0, 21, 0.948
> > 6, 0, 21, 21, 0.96
> > 6, 0, 0, 22, 0.926
> > 6, 0, 22, 22, 0.951
> > 6, 0, 0, 23, 0.923
> > 6, 0, 23, 23, 0.959
> > 6, 0, 0, 24, 0.918
> > 6, 0, 24, 24, 0.952
> > 6, 0, 0, 25, 0.97
> > 6, 0, 25, 25, 0.952
> > 6, 0, 0, 26, 0.871
> > 6, 0, 26, 26, 0.869
> > 6, 0, 0, 27, 0.935
> > 6, 0, 27, 27, 0.836
> > 6, 0, 0, 28, 0.936
> > 6, 0, 28, 28, 0.857
> > 6, 0, 0, 29, 0.876
> > 6, 0, 29, 29, 0.859
> > 6, 0, 0, 30, 0.934
> > 6, 0, 30, 30, 0.857
> > 6, 0, 0, 31, 0.962
> > 6, 0, 31, 31, 0.86
> > 6, 0, 0, 32, 0.912
> > 6, 0, 32, 32, 0.94
> > 6, 0, 0, 33, 0.903
> > 6, 0, 33, 33, 0.968
> > 6, 0, 0, 34, 0.913
> > 6, 0, 34, 34, 0.896
> > 6, 0, 0, 35, 0.904
> > 6, 0, 35, 35, 0.913
> > 6, 0, 0, 36, 0.905
> > 6, 0, 36, 36, 0.907
> > 6, 0, 0, 37, 0.899
> > 6, 0, 37, 37, 0.9
> > 6, 0, 0, 38, 0.912
> > 6, 0, 38, 38, 0.919
> > 6, 0, 0, 39, 0.925
> > 6, 0, 39, 39, 0.927
> > 6, 0, 0, 40, 0.923
> > 6, 0, 40, 40, 0.972
> > 6, 0, 0, 41, 0.92
> > 6, 0, 41, 41, 0.966
> > 6, 0, 0, 42, 0.915
> > 6, 0, 42, 42, 0.834
> > 6, 0, 0, 43, 0.92
> > 6, 0, 43, 43, 0.856
> > 6, 0, 0, 44, 0.908
> > 6, 0, 44, 44, 0.858
> > 6, 0, 0, 45, 0.932
> > 6, 0, 45, 45, 0.847
> > 6, 0, 0, 46, 0.927
> > 6, 0, 46, 46, 0.859
> > 6, 0, 0, 47, 0.902
> > 6, 0, 47, 47, 0.855
> > 6, 0, 0, 48, 0.949
> > 6, 0, 48, 48, 0.934
> > 6, 0, 0, 49, 0.907
> > 6, 0, 49, 49, 0.943
> > 6, 0, 0, 50, 0.934
> > 6, 0, 50, 50, 0.943
> > 6, 0, 0, 51, 0.933
> > 6, 0, 51, 51, 0.939
> > 6, 0, 0, 52, 0.944
> > 6, 0, 52, 52, 0.944
> > 6, 0, 0, 53, 0.939
> > 6, 0, 53, 53, 0.938
> > 6, 0, 0, 54, 0.9
> > 6, 0, 54, 54, 0.923
> > 6, 0, 0, 55, 0.9
> > 6, 0, 55, 55, 0.927
> > 6, 0, 0, 56, 0.9
> > 6, 0, 56, 56, 0.917
> > 6, 0, 0, 57, 0.9
> > 6, 0, 57, 57, 0.916
> > 6, 0, 0, 58, 0.914
> > 6, 0, 58, 58, 0.784
> > 6, 0, 0, 59, 0.863
> > 6, 0, 59, 59, 0.846
> > 6, 0, 0, 60, 0.88
> > 6, 0, 60, 60, 0.827
> > 6, 0, 0, 61, 0.896
> > 6, 0, 61, 61, 0.847
> > 6, 0, 0, 62, 0.894
> > 6, 0, 62, 62, 0.865
> > 6, 0, 0, 63, 0.934
> > 6, 0, 63, 63, 0.866
> >
> > sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++---------------
> > 1 file changed, 37 insertions(+), 46 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
> > index 013aebf797..c312fab8b1 100644
> > --- a/sysdeps/x86_64/multiarch/strcspn-c.c
> > +++ b/sysdeps/x86_64/multiarch/strcspn-c.c
> > @@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
> > RETURN (NULL, strlen (s));
> >
> > const char *aligned;
> > - __m128i mask;
> > - int offset = (int) ((size_t) a & 15);
> > + __m128i mask, maskz, zero;
> > + unsigned int maskz_bits;
> > + unsigned int offset = (unsigned int) ((size_t) a & 15);
> > + zero = _mm_set1_epi8 (0);
> > if (offset != 0)
> > {
> > /* Load masks. */
> > aligned = (const char *) ((size_t) a & -16L);
> > __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > -
> > - mask = __m128i_shift_right (mask0, offset);
> > + maskz = _mm_cmpeq_epi8 (mask0, zero);
> >
> > /* Find where the NULL terminator is. */
> > - int length = _mm_cmpistri (mask, mask, 0x3a);
> > - if (length == 16 - offset)
> > - {
> > - /* There is no NULL terminator. */
> > - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
> > - int index = _mm_cmpistri (mask1, mask1, 0x3a);
> > - length += index;
> > -
> > - /* Don't use SSE4.2 if the length of A > 16. */
> > - if (length > 16)
> > - return STRCSPN_SSE2 (s, a);
> > -
> > - if (index != 0)
> > - {
> > - /* Combine mask0 and mask1. We could play games with
> > - palignr, but frankly this data should be in L1 now
> > - so do the merge via an unaligned load. */
> > - mask = _mm_loadu_si128 ((__m128i *) a);
> > - }
> > - }
> > + maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > + if (maskz_bits != 0)
> > + {
> > + mask = __m128i_shift_right (mask0, offset);
> > + offset = (unsigned int) ((size_t) s & 15);
> > + if (offset)
> > + goto start_unaligned;
> > +
> > + aligned = s;
> > + goto start_loop;
> > + }
> > }
> > - else
> > - {
> > - /* A is aligned. */
> > - mask = _mm_load_si128 ((__m128i *) a);
> >
> > - /* Find where the NULL terminator is. */
> > - int length = _mm_cmpistri (mask, mask, 0x3a);
> > - if (length == 16)
> > - {
> > - /* There is no NULL terminator. Don't use SSE4.2 if the length
> > - of A > 16. */
> > - if (a[16] != 0)
> > - return STRCSPN_SSE2 (s, a);
> > - }
> > + /* A is aligned. */
> > + mask = _mm_loadu_si128 ((__m128i *) a);
> > + /* Find where the NULL terminator is. */
> > + maskz = _mm_cmpeq_epi8 (mask, zero);
> > + maskz_bits = _mm_movemask_epi8 (maskz);
> > + if (maskz_bits == 0)
> > + {
> > + /* There is no NULL terminator. Don't use SSE4.2 if the length
> > + of A > 16. */
> > + if (a[16] != 0)
> > + return STRCSPN_SSE2 (s, a);
> > }
> >
> > - offset = (int) ((size_t) s & 15);
> > + aligned = s;
> > + offset = (unsigned int) ((size_t) s & 15);
> > if (offset != 0)
> > {
> > + start_unaligned:
> > /* Check partial string. */
> > aligned = (const char *) ((size_t) s & -16L);
> > __m128i value = _mm_load_si128 ((__m128i *) aligned);
> >
> > value = __m128i_shift_right (value, offset);
> >
> > - int length = _mm_cmpistri (mask, value, 0x2);
> > + unsigned int length = _mm_cmpistri (mask, value, 0x2);
> > /* No need to check ZFlag since ZFlag is always 1. */
> > - int cflag = _mm_cmpistrc (mask, value, 0x2);
> > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> > if (cflag)
> > RETURN ((char *) (s + length), length);
> > /* Find where the NULL terminator is. */
> > - int index = _mm_cmpistri (value, value, 0x3a);
> > + unsigned int index = _mm_cmpistri (value, value, 0x3a);
> > if (index < 16 - offset)
> > RETURN (NULL, index);
> > aligned += 16;
> > }
> > - else
> > - aligned = s;
> >
> > +start_loop:
> > while (1)
> > {
> > __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > - int index = _mm_cmpistri (mask, value, 0x2);
> > - int cflag = _mm_cmpistrc (mask, value, 0x2);
> > - int zflag = _mm_cmpistrz (mask, value, 0x2);
> > + unsigned int index = _mm_cmpistri (mask, value, 0x2);
> > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> > + unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
> > if (cflag)
> > RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
> > if (zflag)
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c
2022-03-24 18:56 ` H.J. Lu
@ 2022-05-12 19:39 ` Sunil Pandey
0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:39 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Thu, Mar 24, 2022 at 11:58 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
> > _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
> > sign extensions.
> >
> > geometric_mean(N=20) of all benchmarks that don't fall back on
> > sse2; New / Original: .901
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=20 runs; All functions page aligned
> > len, align1, align2, pos, New Time / Old Time
> > 1, 0, 0, 512, 0.768
> > 1, 1, 0, 512, 0.666
> > 1, 0, 1, 512, 1.193
> > 1, 1, 1, 512, 0.872
> > 2, 0, 0, 512, 0.698
> > 2, 2, 0, 512, 0.687
> > 2, 0, 2, 512, 1.393
> > 2, 2, 2, 512, 0.944
> > 3, 0, 0, 512, 0.691
> > 3, 3, 0, 512, 0.676
> > 3, 0, 3, 512, 1.388
> > 3, 3, 3, 512, 0.948
> > 4, 0, 0, 512, 0.74
> > 4, 4, 0, 512, 0.678
> > 4, 0, 4, 512, 1.421
> > 4, 4, 4, 512, 0.943
> > 5, 0, 0, 512, 0.691
> > 5, 5, 0, 512, 0.675
> > 5, 0, 5, 512, 1.348
> > 5, 5, 5, 512, 0.952
> > 6, 0, 0, 512, 0.685
> > 6, 6, 0, 512, 0.67
> > 6, 0, 6, 512, 1.333
> > 6, 6, 6, 512, 0.95
> > 7, 0, 0, 512, 0.688
> > 7, 7, 0, 512, 0.675
> > 7, 0, 7, 512, 1.344
> > 7, 7, 7, 512, 0.919
> > 8, 0, 0, 512, 0.716
> > 8, 0, 8, 512, 0.935
> > 9, 0, 0, 512, 0.716
> > 9, 1, 0, 512, 0.712
> > 9, 0, 9, 512, 0.956
> > 9, 1, 9, 512, 0.992
> > 10, 0, 0, 512, 0.699
> > 10, 2, 0, 512, 0.68
> > 10, 0, 10, 512, 0.952
> > 10, 2, 10, 512, 0.932
> > 11, 0, 0, 512, 0.705
> > 11, 3, 0, 512, 0.685
> > 11, 0, 11, 512, 0.956
> > 11, 3, 11, 512, 0.927
> > 12, 0, 0, 512, 0.695
> > 12, 4, 0, 512, 0.675
> > 12, 0, 12, 512, 0.948
> > 12, 4, 12, 512, 0.928
> > 13, 0, 0, 512, 0.7
> > 13, 5, 0, 512, 0.678
> > 13, 0, 13, 512, 0.944
> > 13, 5, 13, 512, 0.931
> > 14, 0, 0, 512, 0.703
> > 14, 6, 0, 512, 0.678
> > 14, 0, 14, 512, 0.949
> > 14, 6, 14, 512, 0.93
> > 15, 0, 0, 512, 0.694
> > 15, 7, 0, 512, 0.678
> > 15, 0, 15, 512, 0.953
> > 15, 7, 15, 512, 0.924
> > 16, 0, 0, 512, 1.021
> > 16, 0, 16, 512, 1.067
> > 17, 0, 0, 512, 0.991
> > 17, 1, 0, 512, 0.984
> > 17, 0, 17, 512, 0.979
> > 17, 1, 17, 512, 0.993
> > 18, 0, 0, 512, 0.992
> > 18, 2, 0, 512, 1.008
> > 18, 0, 18, 512, 1.016
> > 18, 2, 18, 512, 0.993
> > 19, 0, 0, 512, 0.984
> > 19, 3, 0, 512, 0.985
> > 19, 0, 19, 512, 1.007
> > 19, 3, 19, 512, 1.006
> > 20, 0, 0, 512, 0.969
> > 20, 4, 0, 512, 0.968
> > 20, 0, 20, 512, 0.975
> > 20, 4, 20, 512, 0.975
> > 21, 0, 0, 512, 0.992
> > 21, 5, 0, 512, 0.992
> > 21, 0, 21, 512, 0.98
> > 21, 5, 21, 512, 0.97
> > 22, 0, 0, 512, 0.989
> > 22, 6, 0, 512, 0.987
> > 22, 0, 22, 512, 0.99
> > 22, 6, 22, 512, 0.985
> > 23, 0, 0, 512, 0.989
> > 23, 7, 0, 512, 0.98
> > 23, 0, 23, 512, 1.0
> > 23, 7, 23, 512, 0.993
> > 24, 0, 0, 512, 0.99
> > 24, 0, 24, 512, 0.998
> > 25, 0, 0, 512, 1.01
> > 25, 1, 0, 512, 1.0
> > 25, 0, 25, 512, 0.97
> > 25, 1, 25, 512, 0.967
> > 26, 0, 0, 512, 1.009
> > 26, 2, 0, 512, 0.986
> > 26, 0, 26, 512, 0.997
> > 26, 2, 26, 512, 0.993
> > 27, 0, 0, 512, 0.984
> > 27, 3, 0, 512, 0.997
> > 27, 0, 27, 512, 0.989
> > 27, 3, 27, 512, 0.976
> > 28, 0, 0, 512, 0.991
> > 28, 4, 0, 512, 1.003
> > 28, 0, 28, 512, 0.986
> > 28, 4, 28, 512, 0.989
> > 29, 0, 0, 512, 0.986
> > 29, 5, 0, 512, 0.985
> > 29, 0, 29, 512, 0.984
> > 29, 5, 29, 512, 0.977
> > 30, 0, 0, 512, 0.991
> > 30, 6, 0, 512, 0.987
> > 30, 0, 30, 512, 0.979
> > 30, 6, 30, 512, 0.974
> > 31, 0, 0, 512, 0.995
> > 31, 7, 0, 512, 0.995
> > 31, 0, 31, 512, 0.994
> > 31, 7, 31, 512, 0.984
> > 4, 0, 0, 32, 0.861
> > 4, 1, 0, 32, 0.864
> > 4, 0, 1, 32, 0.962
> > 4, 1, 1, 32, 0.967
> > 4, 0, 0, 64, 0.884
> > 4, 2, 0, 64, 0.818
> > 4, 0, 2, 64, 0.889
> > 4, 2, 2, 64, 0.918
> > 4, 0, 0, 128, 0.942
> > 4, 3, 0, 128, 0.884
> > 4, 0, 3, 128, 0.931
> > 4, 3, 3, 128, 0.883
> > 4, 0, 0, 256, 0.964
> > 4, 4, 0, 256, 0.922
> > 4, 0, 4, 256, 0.956
> > 4, 4, 4, 256, 0.93
> > 4, 5, 0, 512, 0.833
> > 4, 0, 5, 512, 1.027
> > 4, 5, 5, 512, 0.929
> > 4, 0, 0, 1024, 0.998
> > 4, 6, 0, 1024, 0.986
> > 4, 0, 6, 1024, 0.984
> > 4, 6, 6, 1024, 0.977
> > 4, 0, 0, 2048, 0.991
> > 4, 7, 0, 2048, 0.987
> > 4, 0, 7, 2048, 0.996
> > 4, 7, 7, 2048, 0.98
> > 10, 1, 0, 64, 0.826
> > 10, 1, 1, 64, 0.907
> > 10, 2, 0, 64, 0.829
> > 10, 2, 2, 64, 0.91
> > 10, 3, 0, 64, 0.83
> > 10, 3, 3, 64, 0.915
> > 10, 4, 0, 64, 0.83
> > 10, 4, 4, 64, 0.911
> > 10, 5, 0, 64, 0.828
> > 10, 5, 5, 64, 0.905
> > 10, 6, 0, 64, 0.828
> > 10, 6, 6, 64, 0.812
> > 10, 7, 0, 64, 0.83
> > 10, 7, 7, 64, 0.819
> > 6, 0, 0, 0, 1.261
> > 6, 0, 0, 1, 1.252
> > 6, 0, 1, 1, 0.845
> > 6, 0, 0, 2, 1.27
> > 6, 0, 2, 2, 0.85
> > 6, 0, 0, 3, 1.269
> > 6, 0, 3, 3, 0.845
> > 6, 0, 0, 4, 1.287
> > 6, 0, 4, 4, 0.852
> > 6, 0, 0, 5, 1.278
> > 6, 0, 5, 5, 0.851
> > 6, 0, 0, 6, 1.269
> > 6, 0, 6, 6, 0.841
> > 6, 0, 0, 7, 1.268
> > 6, 0, 7, 7, 0.851
> > 6, 0, 0, 8, 1.291
> > 6, 0, 8, 8, 0.837
> > 6, 0, 0, 9, 1.283
> > 6, 0, 9, 9, 0.831
> > 6, 0, 0, 10, 1.252
> > 6, 0, 10, 10, 0.997
> > 6, 0, 0, 11, 1.295
> > 6, 0, 11, 11, 1.046
> > 6, 0, 0, 12, 1.296
> > 6, 0, 12, 12, 1.038
> > 6, 0, 0, 13, 1.287
> > 6, 0, 13, 13, 1.082
> > 6, 0, 0, 14, 1.284
> > 6, 0, 14, 14, 1.001
> > 6, 0, 0, 15, 1.286
> > 6, 0, 15, 15, 1.002
> > 6, 0, 0, 16, 0.894
> > 6, 0, 16, 16, 0.874
> > 6, 0, 0, 17, 0.892
> > 6, 0, 17, 17, 0.974
> > 6, 0, 0, 18, 0.907
> > 6, 0, 18, 18, 0.993
> > 6, 0, 0, 19, 0.909
> > 6, 0, 19, 19, 0.99
> > 6, 0, 0, 20, 0.894
> > 6, 0, 20, 20, 0.978
> > 6, 0, 0, 21, 0.89
> > 6, 0, 21, 21, 0.958
> > 6, 0, 0, 22, 0.893
> > 6, 0, 22, 22, 0.99
> > 6, 0, 0, 23, 0.899
> > 6, 0, 23, 23, 0.986
> > 6, 0, 0, 24, 0.893
> > 6, 0, 24, 24, 0.989
> > 6, 0, 0, 25, 0.889
> > 6, 0, 25, 25, 0.982
> > 6, 0, 0, 26, 0.889
> > 6, 0, 26, 26, 0.852
> > 6, 0, 0, 27, 0.89
> > 6, 0, 27, 27, 0.832
> > 6, 0, 0, 28, 0.89
> > 6, 0, 28, 28, 0.831
> > 6, 0, 0, 29, 0.89
> > 6, 0, 29, 29, 0.838
> > 6, 0, 0, 30, 0.907
> > 6, 0, 30, 30, 0.833
> > 6, 0, 0, 31, 0.888
> > 6, 0, 31, 31, 0.837
> > 6, 0, 0, 32, 0.853
> > 6, 0, 32, 32, 0.828
> > 6, 0, 0, 33, 0.857
> > 6, 0, 33, 33, 0.947
> > 6, 0, 0, 34, 0.847
> > 6, 0, 34, 34, 0.954
> > 6, 0, 0, 35, 0.841
> > 6, 0, 35, 35, 0.94
> > 6, 0, 0, 36, 0.854
> > 6, 0, 36, 36, 0.958
> > 6, 0, 0, 37, 0.856
> > 6, 0, 37, 37, 0.957
> > 6, 0, 0, 38, 0.839
> > 6, 0, 38, 38, 0.962
> > 6, 0, 0, 39, 0.866
> > 6, 0, 39, 39, 0.945
> > 6, 0, 0, 40, 0.845
> > 6, 0, 40, 40, 0.961
> > 6, 0, 0, 41, 0.858
> > 6, 0, 41, 41, 0.961
> > 6, 0, 0, 42, 0.862
> > 6, 0, 42, 42, 0.825
> > 6, 0, 0, 43, 0.864
> > 6, 0, 43, 43, 0.82
> > 6, 0, 0, 44, 0.843
> > 6, 0, 44, 44, 0.81
> > 6, 0, 0, 45, 0.859
> > 6, 0, 45, 45, 0.816
> > 6, 0, 0, 46, 0.866
> > 6, 0, 46, 46, 0.81
> > 6, 0, 0, 47, 0.858
> > 6, 0, 47, 47, 0.807
> > 6, 0, 0, 48, 0.87
> > 6, 0, 48, 48, 0.87
> > 6, 0, 0, 49, 0.871
> > 6, 0, 49, 49, 0.874
> > 6, 0, 0, 50, 0.87
> > 6, 0, 50, 50, 0.881
> > 6, 0, 0, 51, 0.868
> > 6, 0, 51, 51, 0.875
> > 6, 0, 0, 52, 0.873
> > 6, 0, 52, 52, 0.871
> > 6, 0, 0, 53, 0.866
> > 6, 0, 53, 53, 0.882
> > 6, 0, 0, 54, 0.863
> > 6, 0, 54, 54, 0.876
> > 6, 0, 0, 55, 0.851
> > 6, 0, 55, 55, 0.871
> > 6, 0, 0, 56, 0.867
> > 6, 0, 56, 56, 0.888
> > 6, 0, 0, 57, 0.862
> > 6, 0, 57, 57, 0.899
> > 6, 0, 0, 58, 0.873
> > 6, 0, 58, 58, 0.798
> > 6, 0, 0, 59, 0.881
> > 6, 0, 59, 59, 0.785
> > 6, 0, 0, 60, 0.867
> > 6, 0, 60, 60, 0.797
> > 6, 0, 0, 61, 0.872
> > 6, 0, 61, 61, 0.791
> > 6, 0, 0, 62, 0.859
> > 6, 0, 62, 62, 0.79
> > 6, 0, 0, 63, 0.87
> > 6, 0, 63, 63, 0.796
> >
> > sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++----------------
> > 1 file changed, 39 insertions(+), 47 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
> > index 8fb3aba64d..6124033ceb 100644
> > --- a/sysdeps/x86_64/multiarch/strspn-c.c
> > +++ b/sysdeps/x86_64/multiarch/strspn-c.c
> > @@ -62,81 +62,73 @@ __strspn_sse42 (const char *s, const char *a)
> > return 0;
> >
> > const char *aligned;
> > - __m128i mask;
> > - int offset = (int) ((size_t) a & 15);
> > + __m128i mask, maskz, zero;
> > + unsigned int maskz_bits;
> > + unsigned int offset = (int) ((size_t) a & 15);
> > + zero = _mm_set1_epi8 (0);
> > if (offset != 0)
> > {
> > /* Load masks. */
> > aligned = (const char *) ((size_t) a & -16L);
> > __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > -
> > - mask = __m128i_shift_right (mask0, offset);
> > + maskz = _mm_cmpeq_epi8 (mask0, zero);
> >
> > /* Find where the NULL terminator is. */
> > - int length = _mm_cmpistri (mask, mask, 0x3a);
> > - if (length == 16 - offset)
> > - {
> > - /* There is no NULL terminator. */
> > - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
> > - int index = _mm_cmpistri (mask1, mask1, 0x3a);
> > - length += index;
> > -
> > - /* Don't use SSE4.2 if the length of A > 16. */
> > - if (length > 16)
> > - return __strspn_sse2 (s, a);
> > -
> > - if (index != 0)
> > - {
> > - /* Combine mask0 and mask1. We could play games with
> > - palignr, but frankly this data should be in L1 now
> > - so do the merge via an unaligned load. */
> > - mask = _mm_loadu_si128 ((__m128i *) a);
> > - }
> > - }
> > + maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > + if (maskz_bits != 0)
> > + {
> > + mask = __m128i_shift_right (mask0, offset);
> > + offset = (unsigned int) ((size_t) s & 15);
> > + if (offset)
> > + goto start_unaligned;
> > +
> > + aligned = s;
> > + goto start_loop;
> > + }
> > }
> > - else
> > - {
> > - /* A is aligned. */
> > - mask = _mm_load_si128 ((__m128i *) a);
> >
> > - /* Find where the NULL terminator is. */
> > - int length = _mm_cmpistri (mask, mask, 0x3a);
> > - if (length == 16)
> > - {
> > - /* There is no NULL terminator. Don't use SSE4.2 if the length
> > - of A > 16. */
> > - if (a[16] != 0)
> > - return __strspn_sse2 (s, a);
> > - }
> > + /* A is aligned. */
> > + mask = _mm_loadu_si128 ((__m128i *) a);
> > +
> > + /* Find where the NULL terminator is. */
> > + maskz = _mm_cmpeq_epi8 (mask, zero);
> > + maskz_bits = _mm_movemask_epi8 (maskz);
> > + if (maskz_bits == 0)
> > + {
> > + /* There is no NULL terminator. Don't use SSE4.2 if the length
> > + of A > 16. */
> > + if (a[16] != 0)
> > + return __strspn_sse2 (s, a);
> > }
> > + aligned = s;
> > + offset = (unsigned int) ((size_t) s & 15);
> >
> > - offset = (int) ((size_t) s & 15);
> > if (offset != 0)
> > {
> > + start_unaligned:
> > /* Check partial string. */
> > aligned = (const char *) ((size_t) s & -16L);
> > __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > + __m128i adj_value = __m128i_shift_right (value, offset);
> >
> > - value = __m128i_shift_right (value, offset);
> > -
> > - int length = _mm_cmpistri (mask, value, 0x12);
> > + unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
> > /* No need to check CFlag since it is always 1. */
> > if (length < 16 - offset)
> > return length;
> > /* Find where the NULL terminator is. */
> > - int index = _mm_cmpistri (value, value, 0x3a);
> > - if (index < 16 - offset)
> > + maskz = _mm_cmpeq_epi8 (value, zero);
> > + maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > + if (maskz_bits != 0)
> > return length;
> > aligned += 16;
> > }
> > - else
> > - aligned = s;
> >
> > +start_loop:
> > while (1)
> > {
> > __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > - int index = _mm_cmpistri (mask, value, 0x12);
> > - int cflag = _mm_cmpistrc (mask, value, 0x12);
> > + unsigned int index = _mm_cmpistri (mask, value, 0x12);
> > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
> > if (cflag)
> > return (size_t) (aligned + index - s);
> > aligned += 16;
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation
2022-03-24 18:57 ` H.J. Lu
@ 2022-05-12 19:40 ` Sunil Pandey
0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:40 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Thu, Mar 24, 2022 at 11:59 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:00 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The generic implementation is faster.
> >
> > geometric_mean(N=20) of all benchmarks New / Original: .678
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=20 runs; All functions page aligned
> > len, align1, align2, pos, New Time / Old Time
> > 0, 0, 0, 512, 0.054
> > 1, 0, 0, 512, 0.055
> > 1, 1, 0, 512, 0.051
> > 1, 0, 1, 512, 0.054
> > 1, 1, 1, 512, 0.054
> > 2, 0, 0, 512, 0.861
> > 2, 2, 0, 512, 0.861
> > 2, 0, 2, 512, 0.861
> > 2, 2, 2, 512, 0.864
> > 3, 0, 0, 512, 0.854
> > 3, 3, 0, 512, 0.848
> > 3, 0, 3, 512, 0.845
> > 3, 3, 3, 512, 0.85
> > 4, 0, 0, 512, 0.851
> > 4, 4, 0, 512, 0.85
> > 4, 0, 4, 512, 0.852
> > 4, 4, 4, 512, 0.849
> > 5, 0, 0, 512, 0.938
> > 5, 5, 0, 512, 0.94
> > 5, 0, 5, 512, 0.864
> > 5, 5, 5, 512, 0.86
> > 6, 0, 0, 512, 0.858
> > 6, 6, 0, 512, 0.869
> > 6, 0, 6, 512, 0.847
> > 6, 6, 6, 512, 0.868
> > 7, 0, 0, 512, 0.867
> > 7, 7, 0, 512, 0.861
> > 7, 0, 7, 512, 0.864
> > 7, 7, 7, 512, 0.863
> > 8, 0, 0, 512, 0.884
> > 8, 0, 8, 512, 0.884
> > 9, 0, 0, 512, 0.886
> > 9, 1, 0, 512, 0.894
> > 9, 0, 9, 512, 0.889
> > 9, 1, 9, 512, 0.886
> > 10, 0, 0, 512, 0.859
> > 10, 2, 0, 512, 0.859
> > 10, 0, 10, 512, 0.862
> > 10, 2, 10, 512, 0.861
> > 11, 0, 0, 512, 0.846
> > 11, 3, 0, 512, 0.865
> > 11, 0, 11, 512, 0.859
> > 11, 3, 11, 512, 0.862
> > 12, 0, 0, 512, 0.858
> > 12, 4, 0, 512, 0.857
> > 12, 0, 12, 512, 0.964
> > 12, 4, 12, 512, 0.876
> > 13, 0, 0, 512, 0.827
> > 13, 5, 0, 512, 0.805
> > 13, 0, 13, 512, 0.821
> > 13, 5, 13, 512, 0.825
> > 14, 0, 0, 512, 0.786
> > 14, 6, 0, 512, 0.786
> > 14, 0, 14, 512, 0.803
> > 14, 6, 14, 512, 0.783
> > 15, 0, 0, 512, 0.778
> > 15, 7, 0, 512, 0.792
> > 15, 0, 15, 512, 0.796
> > 15, 7, 15, 512, 0.799
> > 16, 0, 0, 512, 0.803
> > 16, 0, 16, 512, 0.815
> > 17, 0, 0, 512, 0.812
> > 17, 1, 0, 512, 0.826
> > 17, 0, 17, 512, 0.803
> > 17, 1, 17, 512, 0.856
> > 18, 0, 0, 512, 0.801
> > 18, 2, 0, 512, 0.886
> > 18, 0, 18, 512, 0.805
> > 18, 2, 18, 512, 0.807
> > 19, 0, 0, 512, 0.814
> > 19, 3, 0, 512, 0.804
> > 19, 0, 19, 512, 0.813
> > 19, 3, 19, 512, 0.814
> > 20, 0, 0, 512, 0.885
> > 20, 4, 0, 512, 0.799
> > 20, 0, 20, 512, 0.826
> > 20, 4, 20, 512, 0.808
> > 21, 0, 0, 512, 0.816
> > 21, 5, 0, 512, 0.824
> > 21, 0, 21, 512, 0.819
> > 21, 5, 21, 512, 0.826
> > 22, 0, 0, 512, 0.814
> > 22, 6, 0, 512, 0.824
> > 22, 0, 22, 512, 0.81
> > 22, 6, 22, 512, 0.806
> > 23, 0, 0, 512, 0.825
> > 23, 7, 0, 512, 0.829
> > 23, 0, 23, 512, 0.809
> > 23, 7, 23, 512, 0.823
> > 24, 0, 0, 512, 0.829
> > 24, 0, 24, 512, 0.823
> > 25, 0, 0, 512, 0.864
> > 25, 1, 0, 512, 0.895
> > 25, 0, 25, 512, 0.88
> > 25, 1, 25, 512, 0.848
> > 26, 0, 0, 512, 0.903
> > 26, 2, 0, 512, 0.888
> > 26, 0, 26, 512, 0.894
> > 26, 2, 26, 512, 0.89
> > 27, 0, 0, 512, 0.914
> > 27, 3, 0, 512, 0.917
> > 27, 0, 27, 512, 0.902
> > 27, 3, 27, 512, 0.887
> > 28, 0, 0, 512, 0.887
> > 28, 4, 0, 512, 0.877
> > 28, 0, 28, 512, 0.893
> > 28, 4, 28, 512, 0.866
> > 29, 0, 0, 512, 0.885
> > 29, 5, 0, 512, 0.907
> > 29, 0, 29, 512, 0.894
> > 29, 5, 29, 512, 0.906
> > 30, 0, 0, 512, 0.88
> > 30, 6, 0, 512, 0.898
> > 30, 0, 30, 512, 0.9
> > 30, 6, 30, 512, 0.895
> > 31, 0, 0, 512, 0.893
> > 31, 7, 0, 512, 0.874
> > 31, 0, 31, 512, 0.894
> > 31, 7, 31, 512, 0.899
> > 4, 0, 0, 32, 0.618
> > 4, 1, 0, 32, 0.627
> > 4, 0, 1, 32, 0.625
> > 4, 1, 1, 32, 0.613
> > 4, 0, 0, 64, 0.913
> > 4, 2, 0, 64, 0.801
> > 4, 0, 2, 64, 0.759
> > 4, 2, 2, 64, 0.761
> > 4, 0, 0, 128, 0.822
> > 4, 3, 0, 128, 0.863
> > 4, 0, 3, 128, 0.867
> > 4, 3, 3, 128, 0.917
> > 4, 0, 0, 256, 0.816
> > 4, 4, 0, 256, 0.812
> > 4, 0, 4, 256, 0.803
> > 4, 4, 4, 256, 0.811
> > 4, 5, 0, 512, 0.848
> > 4, 0, 5, 512, 0.843
> > 4, 5, 5, 512, 0.857
> > 4, 0, 0, 1024, 0.886
> > 4, 6, 0, 1024, 0.887
> > 4, 0, 6, 1024, 0.881
> > 4, 6, 6, 1024, 0.873
> > 4, 0, 0, 2048, 0.892
> > 4, 7, 0, 2048, 0.894
> > 4, 0, 7, 2048, 0.89
> > 4, 7, 7, 2048, 0.874
> > 10, 1, 0, 64, 0.946
> > 10, 1, 1, 64, 0.81
> > 10, 2, 0, 64, 0.804
> > 10, 2, 2, 64, 0.82
> > 10, 3, 0, 64, 0.772
> > 10, 3, 3, 64, 0.772
> > 10, 4, 0, 64, 0.748
> > 10, 4, 4, 64, 0.751
> > 10, 5, 0, 64, 0.76
> > 10, 5, 5, 64, 0.76
> > 10, 6, 0, 64, 0.726
> > 10, 6, 6, 64, 0.718
> > 10, 7, 0, 64, 0.724
> > 10, 7, 7, 64, 0.72
> > 6, 0, 0, 0, 0.415
> > 6, 0, 0, 1, 0.423
> > 6, 0, 1, 1, 0.412
> > 6, 0, 0, 2, 0.433
> > 6, 0, 2, 2, 0.434
> > 6, 0, 0, 3, 0.427
> > 6, 0, 3, 3, 0.428
> > 6, 0, 0, 4, 0.465
> > 6, 0, 4, 4, 0.466
> > 6, 0, 0, 5, 0.463
> > 6, 0, 5, 5, 0.468
> > 6, 0, 0, 6, 0.435
> > 6, 0, 6, 6, 0.444
> > 6, 0, 0, 7, 0.41
> > 6, 0, 7, 7, 0.42
> > 6, 0, 0, 8, 0.474
> > 6, 0, 8, 8, 0.501
> > 6, 0, 0, 9, 0.471
> > 6, 0, 9, 9, 0.489
> > 6, 0, 0, 10, 0.462
> > 6, 0, 10, 10, 0.46
> > 6, 0, 0, 11, 0.459
> > 6, 0, 11, 11, 0.458
> > 6, 0, 0, 12, 0.516
> > 6, 0, 12, 12, 0.51
> > 6, 0, 0, 13, 0.494
> > 6, 0, 13, 13, 0.524
> > 6, 0, 0, 14, 0.486
> > 6, 0, 14, 14, 0.5
> > 6, 0, 0, 15, 0.48
> > 6, 0, 15, 15, 0.501
> > 6, 0, 0, 16, 0.54
> > 6, 0, 16, 16, 0.538
> > 6, 0, 0, 17, 0.503
> > 6, 0, 17, 17, 0.541
> > 6, 0, 0, 18, 0.537
> > 6, 0, 18, 18, 0.549
> > 6, 0, 0, 19, 0.527
> > 6, 0, 19, 19, 0.537
> > 6, 0, 0, 20, 0.539
> > 6, 0, 20, 20, 0.554
> > 6, 0, 0, 21, 0.558
> > 6, 0, 21, 21, 0.541
> > 6, 0, 0, 22, 0.546
> > 6, 0, 22, 22, 0.561
> > 6, 0, 0, 23, 0.54
> > 6, 0, 23, 23, 0.536
> > 6, 0, 0, 24, 0.565
> > 6, 0, 24, 24, 0.584
> > 6, 0, 0, 25, 0.563
> > 6, 0, 25, 25, 0.58
> > 6, 0, 0, 26, 0.555
> > 6, 0, 26, 26, 0.584
> > 6, 0, 0, 27, 0.569
> > 6, 0, 27, 27, 0.587
> > 6, 0, 0, 28, 0.612
> > 6, 0, 28, 28, 0.623
> > 6, 0, 0, 29, 0.604
> > 6, 0, 29, 29, 0.621
> > 6, 0, 0, 30, 0.59
> > 6, 0, 30, 30, 0.609
> > 6, 0, 0, 31, 0.577
> > 6, 0, 31, 31, 0.588
> > 6, 0, 0, 32, 0.621
> > 6, 0, 32, 32, 0.608
> > 6, 0, 0, 33, 0.601
> > 6, 0, 33, 33, 0.623
> > 6, 0, 0, 34, 0.614
> > 6, 0, 34, 34, 0.615
> > 6, 0, 0, 35, 0.598
> > 6, 0, 35, 35, 0.608
> > 6, 0, 0, 36, 0.626
> > 6, 0, 36, 36, 0.634
> > 6, 0, 0, 37, 0.62
> > 6, 0, 37, 37, 0.634
> > 6, 0, 0, 38, 0.612
> > 6, 0, 38, 38, 0.637
> > 6, 0, 0, 39, 0.627
> > 6, 0, 39, 39, 0.612
> > 6, 0, 0, 40, 0.661
> > 6, 0, 40, 40, 0.674
> > 6, 0, 0, 41, 0.633
> > 6, 0, 41, 41, 0.643
> > 6, 0, 0, 42, 0.634
> > 6, 0, 42, 42, 0.636
> > 6, 0, 0, 43, 0.619
> > 6, 0, 43, 43, 0.625
> > 6, 0, 0, 44, 0.654
> > 6, 0, 44, 44, 0.654
> > 6, 0, 0, 45, 0.647
> > 6, 0, 45, 45, 0.649
> > 6, 0, 0, 46, 0.651
> > 6, 0, 46, 46, 0.651
> > 6, 0, 0, 47, 0.646
> > 6, 0, 47, 47, 0.648
> > 6, 0, 0, 48, 0.662
> > 6, 0, 48, 48, 0.664
> > 6, 0, 0, 49, 0.68
> > 6, 0, 49, 49, 0.667
> > 6, 0, 0, 50, 0.654
> > 6, 0, 50, 50, 0.659
> > 6, 0, 0, 51, 0.638
> > 6, 0, 51, 51, 0.639
> > 6, 0, 0, 52, 0.665
> > 6, 0, 52, 52, 0.669
> > 6, 0, 0, 53, 0.658
> > 6, 0, 53, 53, 0.656
> > 6, 0, 0, 54, 0.669
> > 6, 0, 54, 54, 0.67
> > 6, 0, 0, 55, 0.668
> > 6, 0, 55, 55, 0.664
> > 6, 0, 0, 56, 0.701
> > 6, 0, 56, 56, 0.695
> > 6, 0, 0, 57, 0.687
> > 6, 0, 57, 57, 0.696
> > 6, 0, 0, 58, 0.693
> > 6, 0, 58, 58, 0.704
> > 6, 0, 0, 59, 0.695
> > 6, 0, 59, 59, 0.708
> > 6, 0, 0, 60, 0.708
> > 6, 0, 60, 60, 0.728
> > 6, 0, 0, 61, 0.708
> > 6, 0, 61, 61, 0.71
> > 6, 0, 0, 62, 0.715
> > 6, 0, 62, 62, 0.705
> > 6, 0, 0, 63, 0.677
> > 6, 0, 63, 63, 0.702
> >
> > .../{strcspn-sse2.S => strcspn-sse2.c} | 8 +-
> > sysdeps/x86_64/strcspn.S | 119 ------------------
> > 2 files changed, 4 insertions(+), 123 deletions(-)
> > rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strcspn-sse2.c} (85%)
> > delete mode 100644 sysdeps/x86_64/strcspn.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c
> > similarity index 85%
> > rename from sysdeps/x86_64/multiarch/strcspn-sse2.S
> > rename to sysdeps/x86_64/multiarch/strcspn-sse2.c
> > index f97e856e1f..3a04bb39fc 100644
> > --- a/sysdeps/x86_64/multiarch/strcspn-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c
> > @@ -1,4 +1,4 @@
> > -/* strcspn optimized with SSE2.
> > +/* strcspn.
> > Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > This file is part of the GNU C Library.
> >
> > @@ -19,10 +19,10 @@
> > #if IS_IN (libc)
> >
> > # include <sysdep.h>
> > -# define strcspn __strcspn_sse2
> > +# define STRCSPN __strcspn_sse2
> >
> > # undef libc_hidden_builtin_def
> > -# define libc_hidden_builtin_def(strcspn)
> > +# define libc_hidden_builtin_def(STRCSPN)
> > #endif
> >
> > -#include <sysdeps/x86_64/strcspn.S>
> > +#include <string/strcspn.c>
> > diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S
> > deleted file mode 100644
> > index f3cd86c606..0000000000
> > --- a/sysdeps/x86_64/strcspn.S
> > +++ /dev/null
> > @@ -1,119 +0,0 @@
> > -/* strcspn (str, ss) -- Return the length of the initial segment of STR
> > - which contains no characters from SS.
> > - For AMD x86-64.
> > - Copyright (C) 1994-2022 Free Software Foundation, Inc.
> > - This file is part of the GNU C Library.
> > -
> > - The GNU C Library is free software; you can redistribute it and/or
> > - modify it under the terms of the GNU Lesser General Public
> > - License as published by the Free Software Foundation; either
> > - version 2.1 of the License, or (at your option) any later version.
> > -
> > - The GNU C Library is distributed in the hope that it will be useful,
> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > - Lesser General Public License for more details.
> > -
> > - You should have received a copy of the GNU Lesser General Public
> > - License along with the GNU C Library; if not, see
> > - <https://www.gnu.org/licenses/>. */
> > -
> > -#include <sysdep.h>
> > -#include "asm-syntax.h"
> > -
> > - .text
> > -ENTRY (strcspn)
> > -
> > - movq %rdi, %rdx /* Save SRC. */
> > -
> > - /* First we create a table with flags for all possible characters.
> > - For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
> > - supported by the C string functions we have 256 characters.
> > - Before inserting marks for the stop characters we clear the whole
> > - table. */
> > - movq %rdi, %r8 /* Save value. */
> > - subq $256, %rsp /* Make space for 256 bytes. */
> > - cfi_adjust_cfa_offset(256)
> > - movl $32, %ecx /* 32*8 bytes = 256 bytes. */
> > - movq %rsp, %rdi
> > - xorl %eax, %eax /* We store 0s. */
> > - cld
> > - rep
> > - stosq
> > -
> > - movq %rsi, %rax /* Setup skipset. */
> > -
> > -/* For understanding the following code remember that %rcx == 0 now.
> > - Although all the following instruction only modify %cl we always
> > - have a correct zero-extended 64-bit value in %rcx. */
> > -
> > - .p2align 4
> > -L(2): movb (%rax), %cl /* get byte from skipset */
> > - testb %cl, %cl /* is NUL char? */
> > - jz L(1) /* yes => start compare loop */
> > - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
> > -
> > - movb 1(%rax), %cl /* get byte from skipset */
> > - testb $0xff, %cl /* is NUL char? */
> > - jz L(1) /* yes => start compare loop */
> > - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
> > -
> > - movb 2(%rax), %cl /* get byte from skipset */
> > - testb $0xff, %cl /* is NUL char? */
> > - jz L(1) /* yes => start compare loop */
> > - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
> > -
> > - movb 3(%rax), %cl /* get byte from skipset */
> > - addq $4, %rax /* increment skipset pointer */
> > - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
> > - testb $0xff, %cl /* is NUL char? */
> > - jnz L(2) /* no => process next dword from skipset */
> > -
> > -L(1): leaq -4(%rdx), %rax /* prepare loop */
> > -
> > - /* We use a neat trick for the following loop. Normally we would
> > - have to test for two termination conditions
> > - 1. a character in the skipset was found
> > - and
> > - 2. the end of the string was found
> > - But as a sign that the character is in the skipset we store its
> > - value in the table. But the value of NUL is NUL so the loop
> > - terminates for NUL in every case. */
> > -
> > - .p2align 4
> > -L(3): addq $4, %rax /* adjust pointer for full loop round */
> > -
> > - movb (%rax), %cl /* get byte from string */
> > - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> > - je L(4) /* yes => return */
> > -
> > - movb 1(%rax), %cl /* get byte from string */
> > - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> > - je L(5) /* yes => return */
> > -
> > - movb 2(%rax), %cl /* get byte from string */
> > - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> > - jz L(6) /* yes => return */
> > -
> > - movb 3(%rax), %cl /* get byte from string */
> > - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> > - jne L(3) /* no => start loop again */
> > -
> > - incq %rax /* adjust pointer */
> > -L(6): incq %rax
> > -L(5): incq %rax
> > -
> > -L(4): addq $256, %rsp /* remove skipset */
> > - cfi_adjust_cfa_offset(-256)
> > -#ifdef USE_AS_STRPBRK
> > - xorl %edx,%edx
> > - orb %cl, %cl /* was last character NUL? */
> > - cmovzq %rdx, %rax /* Yes: return NULL */
> > -#else
> > - subq %rdx, %rax /* we have to return the number of valid
> > - characters, so compute distance to first
> > - non-valid character */
> > -#endif
> > - ret
> > -END (strcspn)
> > -libc_hidden_builtin_def (strcspn)
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 10/23] x86: Remove strpbrk-sse2.S and use the generic implementation
2022-03-24 18:57 ` H.J. Lu
@ 2022-05-12 19:41 ` Sunil Pandey
0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:41 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Thu, Mar 24, 2022 at 12:00 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:00 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The generic implementation is faster (see strcspn commit).
> >
> > All string/memory tests pass.
> > ---
> > .../x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} | 9 ++++-----
> > sysdeps/x86_64/strpbrk.S | 3 ---
> > 2 files changed, 4 insertions(+), 8 deletions(-)
> > rename sysdeps/x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} (84%)
> > delete mode 100644 sysdeps/x86_64/strpbrk.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
> > similarity index 84%
> > rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S
> > rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c
> > index d537b6c27b..d03214c4fb 100644
> > --- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
> > @@ -1,4 +1,4 @@
> > -/* strpbrk optimized with SSE2.
> > +/* strpbrk.
> > Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > This file is part of the GNU C Library.
> >
> > @@ -19,11 +19,10 @@
> > #if IS_IN (libc)
> >
> > # include <sysdep.h>
> > -# define strcspn __strpbrk_sse2
> > +# define STRPBRK __strpbrk_sse2
> >
> > # undef libc_hidden_builtin_def
> > -# define libc_hidden_builtin_def(strpbrk)
> > +# define libc_hidden_builtin_def(STRPBRK)
> > #endif
> >
> > -#define USE_AS_STRPBRK
> > -#include <sysdeps/x86_64/strcspn.S>
> > +#include <string/strpbrk.c>
> > diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S
> > deleted file mode 100644
> > index 21888a5b92..0000000000
> > --- a/sysdeps/x86_64/strpbrk.S
> > +++ /dev/null
> > @@ -1,3 +0,0 @@
> > -#define strcspn strpbrk
> > -#define USE_AS_STRPBRK
> > -#include <sysdeps/x86_64/strcspn.S>
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 11/23] x86: Remove strspn-sse2.S and use the generic implementation
2022-03-24 18:57 ` H.J. Lu
@ 2022-05-12 19:42 ` Sunil Pandey
0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:42 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Thu, Mar 24, 2022 at 12:00 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The generic implementation is faster.
> >
> > geometric_mean(N=20) of all benchmarks New / Original: .710
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=20 runs; All functions page aligned
> > len, align1, align2, pos, New Time / Old Time
> > 1, 0, 0, 512, 0.824
> > 1, 1, 0, 512, 1.018
> > 1, 0, 1, 512, 0.986
> > 1, 1, 1, 512, 1.092
> > 2, 0, 0, 512, 0.86
> > 2, 2, 0, 512, 0.868
> > 2, 0, 2, 512, 0.858
> > 2, 2, 2, 512, 0.857
> > 3, 0, 0, 512, 0.836
> > 3, 3, 0, 512, 0.849
> > 3, 0, 3, 512, 0.84
> > 3, 3, 3, 512, 0.85
> > 4, 0, 0, 512, 0.843
> > 4, 4, 0, 512, 0.837
> > 4, 0, 4, 512, 0.835
> > 4, 4, 4, 512, 0.846
> > 5, 0, 0, 512, 0.852
> > 5, 5, 0, 512, 0.848
> > 5, 0, 5, 512, 0.85
> > 5, 5, 5, 512, 0.85
> > 6, 0, 0, 512, 0.853
> > 6, 6, 0, 512, 0.855
> > 6, 0, 6, 512, 0.853
> > 6, 6, 6, 512, 0.853
> > 7, 0, 0, 512, 0.857
> > 7, 7, 0, 512, 0.861
> > 7, 0, 7, 512, 0.94
> > 7, 7, 7, 512, 0.856
> > 8, 0, 0, 512, 0.927
> > 8, 0, 8, 512, 0.965
> > 9, 0, 0, 512, 0.967
> > 9, 1, 0, 512, 0.976
> > 9, 0, 9, 512, 0.887
> > 9, 1, 9, 512, 0.881
> > 10, 0, 0, 512, 0.853
> > 10, 2, 0, 512, 0.846
> > 10, 0, 10, 512, 0.855
> > 10, 2, 10, 512, 0.849
> > 11, 0, 0, 512, 0.854
> > 11, 3, 0, 512, 0.855
> > 11, 0, 11, 512, 0.85
> > 11, 3, 11, 512, 0.854
> > 12, 0, 0, 512, 0.864
> > 12, 4, 0, 512, 0.864
> > 12, 0, 12, 512, 0.867
> > 12, 4, 12, 512, 0.87
> > 13, 0, 0, 512, 0.853
> > 13, 5, 0, 512, 0.841
> > 13, 0, 13, 512, 0.837
> > 13, 5, 13, 512, 0.85
> > 14, 0, 0, 512, 0.838
> > 14, 6, 0, 512, 0.842
> > 14, 0, 14, 512, 0.818
> > 14, 6, 14, 512, 0.845
> > 15, 0, 0, 512, 0.799
> > 15, 7, 0, 512, 0.847
> > 15, 0, 15, 512, 0.787
> > 15, 7, 15, 512, 0.84
> > 16, 0, 0, 512, 0.824
> > 16, 0, 16, 512, 0.827
> > 17, 0, 0, 512, 0.817
> > 17, 1, 0, 512, 0.823
> > 17, 0, 17, 512, 0.82
> > 17, 1, 17, 512, 0.814
> > 18, 0, 0, 512, 0.81
> > 18, 2, 0, 512, 0.833
> > 18, 0, 18, 512, 0.811
> > 18, 2, 18, 512, 0.842
> > 19, 0, 0, 512, 0.823
> > 19, 3, 0, 512, 0.818
> > 19, 0, 19, 512, 0.821
> > 19, 3, 19, 512, 0.824
> > 20, 0, 0, 512, 0.814
> > 20, 4, 0, 512, 0.818
> > 20, 0, 20, 512, 0.806
> > 20, 4, 20, 512, 0.802
> > 21, 0, 0, 512, 0.835
> > 21, 5, 0, 512, 0.839
> > 21, 0, 21, 512, 0.842
> > 21, 5, 21, 512, 0.82
> > 22, 0, 0, 512, 0.824
> > 22, 6, 0, 512, 0.831
> > 22, 0, 22, 512, 0.819
> > 22, 6, 22, 512, 0.824
> > 23, 0, 0, 512, 0.816
> > 23, 7, 0, 512, 0.856
> > 23, 0, 23, 512, 0.808
> > 23, 7, 23, 512, 0.848
> > 24, 0, 0, 512, 0.88
> > 24, 0, 24, 512, 0.846
> > 25, 0, 0, 512, 0.929
> > 25, 1, 0, 512, 0.917
> > 25, 0, 25, 512, 0.884
> > 25, 1, 25, 512, 0.859
> > 26, 0, 0, 512, 0.919
> > 26, 2, 0, 512, 0.867
> > 26, 0, 26, 512, 0.914
> > 26, 2, 26, 512, 0.845
> > 27, 0, 0, 512, 0.919
> > 27, 3, 0, 512, 0.864
> > 27, 0, 27, 512, 0.917
> > 27, 3, 27, 512, 0.847
> > 28, 0, 0, 512, 0.905
> > 28, 4, 0, 512, 0.896
> > 28, 0, 28, 512, 0.898
> > 28, 4, 28, 512, 0.871
> > 29, 0, 0, 512, 0.911
> > 29, 5, 0, 512, 0.91
> > 29, 0, 29, 512, 0.905
> > 29, 5, 29, 512, 0.884
> > 30, 0, 0, 512, 0.907
> > 30, 6, 0, 512, 0.802
> > 30, 0, 30, 512, 0.906
> > 30, 6, 30, 512, 0.818
> > 31, 0, 0, 512, 0.907
> > 31, 7, 0, 512, 0.821
> > 31, 0, 31, 512, 0.89
> > 31, 7, 31, 512, 0.787
> > 4, 0, 0, 32, 0.623
> > 4, 1, 0, 32, 0.606
> > 4, 0, 1, 32, 0.6
> > 4, 1, 1, 32, 0.603
> > 4, 0, 0, 64, 0.731
> > 4, 2, 0, 64, 0.733
> > 4, 0, 2, 64, 0.734
> > 4, 2, 2, 64, 0.755
> > 4, 0, 0, 128, 0.822
> > 4, 3, 0, 128, 0.873
> > 4, 0, 3, 128, 0.89
> > 4, 3, 3, 128, 0.907
> > 4, 0, 0, 256, 0.827
> > 4, 4, 0, 256, 0.811
> > 4, 0, 4, 256, 0.794
> > 4, 4, 4, 256, 0.814
> > 4, 5, 0, 512, 0.841
> > 4, 0, 5, 512, 0.831
> > 4, 5, 5, 512, 0.845
> > 4, 0, 0, 1024, 0.861
> > 4, 6, 0, 1024, 0.857
> > 4, 0, 6, 1024, 0.9
> > 4, 6, 6, 1024, 0.861
> > 4, 0, 0, 2048, 0.879
> > 4, 7, 0, 2048, 0.875
> > 4, 0, 7, 2048, 0.883
> > 4, 7, 7, 2048, 0.88
> > 10, 1, 0, 64, 0.747
> > 10, 1, 1, 64, 0.743
> > 10, 2, 0, 64, 0.732
> > 10, 2, 2, 64, 0.729
> > 10, 3, 0, 64, 0.747
> > 10, 3, 3, 64, 0.733
> > 10, 4, 0, 64, 0.74
> > 10, 4, 4, 64, 0.751
> > 10, 5, 0, 64, 0.735
> > 10, 5, 5, 64, 0.746
> > 10, 6, 0, 64, 0.735
> > 10, 6, 6, 64, 0.733
> > 10, 7, 0, 64, 0.734
> > 10, 7, 7, 64, 0.74
> > 6, 0, 0, 0, 0.377
> > 6, 0, 0, 1, 0.369
> > 6, 0, 1, 1, 0.383
> > 6, 0, 0, 2, 0.391
> > 6, 0, 2, 2, 0.394
> > 6, 0, 0, 3, 0.416
> > 6, 0, 3, 3, 0.411
> > 6, 0, 0, 4, 0.475
> > 6, 0, 4, 4, 0.483
> > 6, 0, 0, 5, 0.473
> > 6, 0, 5, 5, 0.476
> > 6, 0, 0, 6, 0.459
> > 6, 0, 6, 6, 0.445
> > 6, 0, 0, 7, 0.433
> > 6, 0, 7, 7, 0.432
> > 6, 0, 0, 8, 0.492
> > 6, 0, 8, 8, 0.494
> > 6, 0, 0, 9, 0.476
> > 6, 0, 9, 9, 0.483
> > 6, 0, 0, 10, 0.46
> > 6, 0, 10, 10, 0.476
> > 6, 0, 0, 11, 0.463
> > 6, 0, 11, 11, 0.463
> > 6, 0, 0, 12, 0.511
> > 6, 0, 12, 12, 0.515
> > 6, 0, 0, 13, 0.506
> > 6, 0, 13, 13, 0.536
> > 6, 0, 0, 14, 0.496
> > 6, 0, 14, 14, 0.484
> > 6, 0, 0, 15, 0.473
> > 6, 0, 15, 15, 0.475
> > 6, 0, 0, 16, 0.534
> > 6, 0, 16, 16, 0.534
> > 6, 0, 0, 17, 0.525
> > 6, 0, 17, 17, 0.523
> > 6, 0, 0, 18, 0.522
> > 6, 0, 18, 18, 0.524
> > 6, 0, 0, 19, 0.512
> > 6, 0, 19, 19, 0.514
> > 6, 0, 0, 20, 0.535
> > 6, 0, 20, 20, 0.54
> > 6, 0, 0, 21, 0.543
> > 6, 0, 21, 21, 0.536
> > 6, 0, 0, 22, 0.542
> > 6, 0, 22, 22, 0.542
> > 6, 0, 0, 23, 0.529
> > 6, 0, 23, 23, 0.53
> > 6, 0, 0, 24, 0.596
> > 6, 0, 24, 24, 0.589
> > 6, 0, 0, 25, 0.583
> > 6, 0, 25, 25, 0.58
> > 6, 0, 0, 26, 0.574
> > 6, 0, 26, 26, 0.58
> > 6, 0, 0, 27, 0.575
> > 6, 0, 27, 27, 0.558
> > 6, 0, 0, 28, 0.606
> > 6, 0, 28, 28, 0.606
> > 6, 0, 0, 29, 0.589
> > 6, 0, 29, 29, 0.595
> > 6, 0, 0, 30, 0.592
> > 6, 0, 30, 30, 0.585
> > 6, 0, 0, 31, 0.585
> > 6, 0, 31, 31, 0.579
> > 6, 0, 0, 32, 0.625
> > 6, 0, 32, 32, 0.615
> > 6, 0, 0, 33, 0.615
> > 6, 0, 33, 33, 0.61
> > 6, 0, 0, 34, 0.604
> > 6, 0, 34, 34, 0.6
> > 6, 0, 0, 35, 0.602
> > 6, 0, 35, 35, 0.608
> > 6, 0, 0, 36, 0.644
> > 6, 0, 36, 36, 0.644
> > 6, 0, 0, 37, 0.658
> > 6, 0, 37, 37, 0.651
> > 6, 0, 0, 38, 0.644
> > 6, 0, 38, 38, 0.649
> > 6, 0, 0, 39, 0.626
> > 6, 0, 39, 39, 0.632
> > 6, 0, 0, 40, 0.662
> > 6, 0, 40, 40, 0.661
> > 6, 0, 0, 41, 0.656
> > 6, 0, 41, 41, 0.655
> > 6, 0, 0, 42, 0.643
> > 6, 0, 42, 42, 0.637
> > 6, 0, 0, 43, 0.622
> > 6, 0, 43, 43, 0.628
> > 6, 0, 0, 44, 0.673
> > 6, 0, 44, 44, 0.687
> > 6, 0, 0, 45, 0.661
> > 6, 0, 45, 45, 0.659
> > 6, 0, 0, 46, 0.657
> > 6, 0, 46, 46, 0.653
> > 6, 0, 0, 47, 0.658
> > 6, 0, 47, 47, 0.65
> > 6, 0, 0, 48, 0.678
> > 6, 0, 48, 48, 0.683
> > 6, 0, 0, 49, 0.676
> > 6, 0, 49, 49, 0.661
> > 6, 0, 0, 50, 0.672
> > 6, 0, 50, 50, 0.662
> > 6, 0, 0, 51, 0.656
> > 6, 0, 51, 51, 0.659
> > 6, 0, 0, 52, 0.682
> > 6, 0, 52, 52, 0.686
> > 6, 0, 0, 53, 0.67
> > 6, 0, 53, 53, 0.674
> > 6, 0, 0, 54, 0.663
> > 6, 0, 54, 54, 0.675
> > 6, 0, 0, 55, 0.662
> > 6, 0, 55, 55, 0.665
> > 6, 0, 0, 56, 0.681
> > 6, 0, 56, 56, 0.697
> > 6, 0, 0, 57, 0.686
> > 6, 0, 57, 57, 0.687
> > 6, 0, 0, 58, 0.701
> > 6, 0, 58, 58, 0.693
> > 6, 0, 0, 59, 0.709
> > 6, 0, 59, 59, 0.698
> > 6, 0, 0, 60, 0.708
> > 6, 0, 60, 60, 0.708
> > 6, 0, 0, 61, 0.709
> > 6, 0, 61, 61, 0.716
> > 6, 0, 0, 62, 0.709
> > 6, 0, 62, 62, 0.707
> > 6, 0, 0, 63, 0.703
> > 6, 0, 63, 63, 0.716
> >
> > .../{strspn-sse2.S => strspn-sse2.c} | 8 +-
> > sysdeps/x86_64/strspn.S | 112 ------------------
> > 2 files changed, 4 insertions(+), 116 deletions(-)
> > rename sysdeps/x86_64/multiarch/{strspn-sse2.S => strspn-sse2.c} (86%)
> > delete mode 100644 sysdeps/x86_64/strspn.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c
> > similarity index 86%
> > rename from sysdeps/x86_64/multiarch/strspn-sse2.S
> > rename to sysdeps/x86_64/multiarch/strspn-sse2.c
> > index e0a095f25a..61cc6cb0a5 100644
> > --- a/sysdeps/x86_64/multiarch/strspn-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strspn-sse2.c
> > @@ -1,4 +1,4 @@
> > -/* strspn optimized with SSE2.
> > +/* strspn.
> > Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > This file is part of the GNU C Library.
> >
> > @@ -19,10 +19,10 @@
> > #if IS_IN (libc)
> >
> > # include <sysdep.h>
> > -# define strspn __strspn_sse2
> > +# define STRSPN __strspn_sse2
> >
> > # undef libc_hidden_builtin_def
> > -# define libc_hidden_builtin_def(strspn)
> > +# define libc_hidden_builtin_def(STRSPN)
> > #endif
> >
> > -#include <sysdeps/x86_64/strspn.S>
> > +#include <string/strspn.c>
> > diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S
> > deleted file mode 100644
> > index 61b76ee0a1..0000000000
> > --- a/sysdeps/x86_64/strspn.S
> > +++ /dev/null
> > @@ -1,112 +0,0 @@
> > -/* strspn (str, ss) -- Return the length of the initial segment of STR
> > - which contains only characters from SS.
> > - For AMD x86-64.
> > - Copyright (C) 1994-2022 Free Software Foundation, Inc.
> > - This file is part of the GNU C Library.
> > -
> > - The GNU C Library is free software; you can redistribute it and/or
> > - modify it under the terms of the GNU Lesser General Public
> > - License as published by the Free Software Foundation; either
> > - version 2.1 of the License, or (at your option) any later version.
> > -
> > - The GNU C Library is distributed in the hope that it will be useful,
> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > - Lesser General Public License for more details.
> > -
> > - You should have received a copy of the GNU Lesser General Public
> > - License along with the GNU C Library; if not, see
> > - <https://www.gnu.org/licenses/>. */
> > -
> > -#include <sysdep.h>
> > -
> > - .text
> > -ENTRY (strspn)
> > -
> > - movq %rdi, %rdx /* Save SRC. */
> > -
> > - /* First we create a table with flags for all possible characters.
> > - For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
> > - supported by the C string functions we have 256 characters.
> > - Before inserting marks for the stop characters we clear the whole
> > - table. */
> > - movq %rdi, %r8 /* Save value. */
> > - subq $256, %rsp /* Make space for 256 bytes. */
> > - cfi_adjust_cfa_offset(256)
> > - movl $32, %ecx /* 32*8 bytes = 256 bytes. */
> > - movq %rsp, %rdi
> > - xorl %eax, %eax /* We store 0s. */
> > - cld
> > - rep
> > - stosq
> > -
> > - movq %rsi, %rax /* Setup stopset. */
> > -
> > -/* For understanding the following code remember that %rcx == 0 now.
> > - Although all the following instruction only modify %cl we always
> > - have a correct zero-extended 64-bit value in %rcx. */
> > -
> > - .p2align 4
> > -L(2): movb (%rax), %cl /* get byte from stopset */
> > - testb %cl, %cl /* is NUL char? */
> > - jz L(1) /* yes => start compare loop */
> > - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
> > -
> > - movb 1(%rax), %cl /* get byte from stopset */
> > - testb $0xff, %cl /* is NUL char? */
> > - jz L(1) /* yes => start compare loop */
> > - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
> > -
> > - movb 2(%rax), %cl /* get byte from stopset */
> > - testb $0xff, %cl /* is NUL char? */
> > - jz L(1) /* yes => start compare loop */
> > - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
> > -
> > - movb 3(%rax), %cl /* get byte from stopset */
> > - addq $4, %rax /* increment stopset pointer */
> > - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
> > - testb $0xff, %cl /* is NUL char? */
> > - jnz L(2) /* no => process next dword from stopset */
> > -
> > -L(1): leaq -4(%rdx), %rax /* prepare loop */
> > -
> > - /* We use a neat trick for the following loop. Normally we would
> > - have to test for two termination conditions
> > - 1. a character in the stopset was found
> > - and
> > - 2. the end of the string was found
> > - But as a sign that the character is in the stopset we store its
> > - value in the table. But the value of NUL is NUL so the loop
> > - terminates for NUL in every case. */
> > -
> > - .p2align 4
> > -L(3): addq $4, %rax /* adjust pointer for full loop round */
> > -
> > - movb (%rax), %cl /* get byte from string */
> > - testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> > - jz L(4) /* no => return */
> > -
> > - movb 1(%rax), %cl /* get byte from string */
> > - testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> > - jz L(5) /* no => return */
> > -
> > - movb 2(%rax), %cl /* get byte from string */
> > - testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> > - jz L(6) /* no => return */
> > -
> > - movb 3(%rax), %cl /* get byte from string */
> > - testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
> > - jnz L(3) /* yes => start loop again */
> > -
> > - incq %rax /* adjust pointer */
> > -L(6): incq %rax
> > -L(5): incq %rax
> > -
> > -L(4): addq $256, %rsp /* remove stopset */
> > - cfi_adjust_cfa_offset(-256)
> > - subq %rdx, %rax /* we have to return the number of valid
> > - characters, so compute distance to first
> > - non-valid character */
> > - ret
> > -END (strspn)
> > -libc_hidden_builtin_def (strspn)
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S
2022-03-24 19:02 ` H.J. Lu
@ 2022-05-12 19:44 ` Sunil Pandey
0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:44 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Thu, Mar 24, 2022 at 12:05 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Slightly faster method of doing TOLOWER that saves an
> > instruction.
> >
> > Also replace the hard-coded 5-byte nop with .p2align 4. On builds with
> > CET enabled this misaligned the entry to strcasecmp.
> >
> > geometric_mean(N=40) of all benchmarks New / Original: .894
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=40 runs; All functions page aligned
> > length, align1, align2, max_char, New Time / Old Time
> > 1, 1, 1, 127, 0.903
> > 2, 2, 2, 127, 0.905
> > 3, 3, 3, 127, 0.877
> > 4, 4, 4, 127, 0.888
> > 5, 5, 5, 127, 0.901
> > 6, 6, 6, 127, 0.954
> > 7, 7, 7, 127, 0.932
> > 8, 0, 0, 127, 0.918
> > 9, 1, 1, 127, 0.914
> > 10, 2, 2, 127, 0.877
> > 11, 3, 3, 127, 0.909
> > 12, 4, 4, 127, 0.876
> > 13, 5, 5, 127, 0.886
> > 14, 6, 6, 127, 0.914
> > 15, 7, 7, 127, 0.939
> > 4, 0, 0, 127, 0.963
> > 4, 0, 0, 254, 0.943
> > 8, 0, 0, 254, 0.927
> > 16, 0, 0, 127, 0.876
> > 16, 0, 0, 254, 0.865
> > 32, 0, 0, 127, 0.865
> > 32, 0, 0, 254, 0.862
> > 64, 0, 0, 127, 0.863
> > 64, 0, 0, 254, 0.896
> > 128, 0, 0, 127, 0.885
> > 128, 0, 0, 254, 0.882
> > 256, 0, 0, 127, 0.87
> > 256, 0, 0, 254, 0.869
> > 512, 0, 0, 127, 0.832
> > 512, 0, 0, 254, 0.848
> > 1024, 0, 0, 127, 0.835
> > 1024, 0, 0, 254, 0.843
> > 16, 1, 2, 127, 0.914
> > 16, 2, 1, 254, 0.949
> > 32, 2, 4, 127, 0.955
> > 32, 4, 2, 254, 1.004
> > 64, 3, 6, 127, 0.844
> > 64, 6, 3, 254, 0.905
> > 128, 4, 0, 127, 0.889
> > 128, 0, 4, 254, 0.845
> > 256, 5, 2, 127, 0.929
> > 256, 2, 5, 254, 0.907
> > 512, 6, 4, 127, 0.837
> > 512, 4, 6, 254, 0.862
> > 1024, 7, 6, 127, 0.895
> > 1024, 6, 7, 254, 0.89
> >
> > sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
> > 1 file changed, 29 insertions(+), 35 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
> > index e2ab59c555..99d8b36f1d 100644
> > --- a/sysdeps/x86_64/strcmp.S
> > +++ b/sysdeps/x86_64/strcmp.S
> > @@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp)
> > movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> > mov %fs:(%rax),%RDX_LP
> >
> > - // XXX 5 byte should be before the function
> > - /* 5-byte NOP. */
> > - .byte 0x0f,0x1f,0x44,0x00,0x00
> > + /* Either 1 or 5 bytes (dependeing if CET is enabled). */
> > + .p2align 4
> > END2 (__strcasecmp)
> > # ifndef NO_NOLOCALE_ALIAS
> > weak_alias (__strcasecmp, strcasecmp)
> > @@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp)
> > movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> > mov %fs:(%rax),%RCX_LP
> >
> > - // XXX 5 byte should be before the function
> > - /* 5-byte NOP. */
> > - .byte 0x0f,0x1f,0x44,0x00,0x00
> > + /* Either 1 or 5 bytes (dependeing if CET is enabled). */
> > + .p2align 4
> > END2 (__strncasecmp)
> > # ifndef NO_NOLOCALE_ALIAS
> > weak_alias (__strncasecmp, strncasecmp)
> > @@ -146,22 +144,22 @@ ENTRY (STRCMP)
> > #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> > .section .rodata.cst16,"aM",@progbits,16
> > .align 16
> > -.Lbelowupper:
> > - .quad 0x4040404040404040
> > - .quad 0x4040404040404040
> > -.Ltopupper:
> > - .quad 0x5b5b5b5b5b5b5b5b
> > - .quad 0x5b5b5b5b5b5b5b5b
> > -.Ltouppermask:
> > +.Llcase_min:
> > + .quad 0x3f3f3f3f3f3f3f3f
> > + .quad 0x3f3f3f3f3f3f3f3f
> > +.Llcase_max:
> > + .quad 0x9999999999999999
> > + .quad 0x9999999999999999
> > +.Lcase_add:
> > .quad 0x2020202020202020
> > .quad 0x2020202020202020
> > .previous
> > - movdqa .Lbelowupper(%rip), %xmm5
> > -# define UCLOW_reg %xmm5
> > - movdqa .Ltopupper(%rip), %xmm6
> > -# define UCHIGH_reg %xmm6
> > - movdqa .Ltouppermask(%rip), %xmm7
> > -# define LCQWORD_reg %xmm7
> > + movdqa .Llcase_min(%rip), %xmm5
> > +# define LCASE_MIN_reg %xmm5
> > + movdqa .Llcase_max(%rip), %xmm6
> > +# define LCASE_MAX_reg %xmm6
> > + movdqa .Lcase_add(%rip), %xmm7
> > +# define CASE_ADD_reg %xmm7
> > #endif
> > cmp $0x30, %ecx
> > ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
> > @@ -172,22 +170,18 @@ ENTRY (STRCMP)
> > movhpd 8(%rdi), %xmm1
> > movhpd 8(%rsi), %xmm2
> > #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> > -# define TOLOWER(reg1, reg2) \
> > - movdqa reg1, %xmm8; \
> > - movdqa UCHIGH_reg, %xmm9; \
> > - movdqa reg2, %xmm10; \
> > - movdqa UCHIGH_reg, %xmm11; \
> > - pcmpgtb UCLOW_reg, %xmm8; \
> > - pcmpgtb reg1, %xmm9; \
> > - pcmpgtb UCLOW_reg, %xmm10; \
> > - pcmpgtb reg2, %xmm11; \
> > - pand %xmm9, %xmm8; \
> > - pand %xmm11, %xmm10; \
> > - pand LCQWORD_reg, %xmm8; \
> > - pand LCQWORD_reg, %xmm10; \
> > - por %xmm8, reg1; \
> > - por %xmm10, reg2
> > - TOLOWER (%xmm1, %xmm2)
> > +# define TOLOWER(reg1, reg2) \
> > + movdqa LCASE_MIN_reg, %xmm8; \
> > + movdqa LCASE_MIN_reg, %xmm9; \
> > + paddb reg1, %xmm8; \
> > + paddb reg2, %xmm9; \
> > + pcmpgtb LCASE_MAX_reg, %xmm8; \
> > + pcmpgtb LCASE_MAX_reg, %xmm9; \
> > + pandn CASE_ADD_reg, %xmm8; \
> > + pandn CASE_ADD_reg, %xmm9; \
> > + paddb %xmm8, reg1; \
> > + paddb %xmm9, reg2
> > + TOLOWER (%xmm1, %xmm2)
> > #else
> > # define TOLOWER(reg1, reg2)
> > #endif
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
2022-03-24 19:02 ` H.J. Lu
@ 2022-05-12 19:45 ` Sunil Pandey
0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:45 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Thu, Mar 24, 2022 at 12:05 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Slightly faster method of doing TOLOWER that saves an
> > instruction.
> >
> > Also replace the hard-coded 5-byte nop with .p2align 4. On builds with
> > CET enabled this misaligned the entry to strcasecmp.
> >
> > geometric_mean(N=40) of all benchmarks New / Original: .920
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=40 runs; All functions page aligned
> > length, align1, align2, max_char, New Time / Old Time
> > 1, 1, 1, 127, 0.914
> > 2, 2, 2, 127, 0.952
> > 3, 3, 3, 127, 0.924
> > 4, 4, 4, 127, 0.995
> > 5, 5, 5, 127, 0.985
> > 6, 6, 6, 127, 1.017
> > 7, 7, 7, 127, 1.031
> > 8, 0, 0, 127, 0.967
> > 9, 1, 1, 127, 0.969
> > 10, 2, 2, 127, 0.951
> > 11, 3, 3, 127, 0.938
> > 12, 4, 4, 127, 0.937
> > 13, 5, 5, 127, 0.967
> > 14, 6, 6, 127, 0.941
> > 15, 7, 7, 127, 0.951
> > 4, 0, 0, 127, 0.959
> > 4, 0, 0, 254, 0.98
> > 8, 0, 0, 254, 0.959
> > 16, 0, 0, 127, 0.895
> > 16, 0, 0, 254, 0.901
> > 32, 0, 0, 127, 0.85
> > 32, 0, 0, 254, 0.851
> > 64, 0, 0, 127, 0.897
> > 64, 0, 0, 254, 0.895
> > 128, 0, 0, 127, 0.944
> > 128, 0, 0, 254, 0.935
> > 256, 0, 0, 127, 0.922
> > 256, 0, 0, 254, 0.913
> > 512, 0, 0, 127, 0.921
> > 512, 0, 0, 254, 0.914
> > 1024, 0, 0, 127, 0.845
> > 1024, 0, 0, 254, 0.84
> > 16, 1, 2, 127, 0.923
> > 16, 2, 1, 254, 0.955
> > 32, 2, 4, 127, 0.979
> > 32, 4, 2, 254, 0.957
> > 64, 3, 6, 127, 0.866
> > 64, 6, 3, 254, 0.849
> > 128, 4, 0, 127, 0.882
> > 128, 0, 4, 254, 0.876
> > 256, 5, 2, 127, 0.877
> > 256, 2, 5, 254, 0.882
> > 512, 6, 4, 127, 0.822
> > 512, 4, 6, 254, 0.862
> > 1024, 7, 6, 127, 0.903
> > 1024, 6, 7, 254, 0.908
> >
> > sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++--------------
> > 1 file changed, 35 insertions(+), 48 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > index 580feb90e9..7805ae9d41 100644
> > --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > @@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp))
> > movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> > mov %fs:(%rax),%RDX_LP
> >
> > - // XXX 5 byte should be before the function
> > - /* 5-byte NOP. */
> > - .byte 0x0f,0x1f,0x44,0x00,0x00
> > + /* Either 1 or 5 bytes (dependeing if CET is enabled). */
> > + .p2align 4
> > END (GLABEL(__strcasecmp))
> > /* FALLTHROUGH to strcasecmp_l. */
> > #endif
> > @@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp))
> > movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> > mov %fs:(%rax),%RCX_LP
> >
> > - // XXX 5 byte should be before the function
> > - /* 5-byte NOP. */
> > - .byte 0x0f,0x1f,0x44,0x00,0x00
> > + /* Either 1 or 5 bytes (dependeing if CET is enabled). */
> > + .p2align 4
> > END (GLABEL(__strncasecmp))
> > /* FALLTHROUGH to strncasecmp_l. */
> > #endif
> > @@ -169,27 +167,22 @@ STRCMP_SSE42:
> > #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> > .section .rodata.cst16,"aM",@progbits,16
> > .align 16
> > -LABEL(belowupper):
> > - .quad 0x4040404040404040
> > - .quad 0x4040404040404040
> > -LABEL(topupper):
> > -# ifdef USE_AVX
> > - .quad 0x5a5a5a5a5a5a5a5a
> > - .quad 0x5a5a5a5a5a5a5a5a
> > -# else
> > - .quad 0x5b5b5b5b5b5b5b5b
> > - .quad 0x5b5b5b5b5b5b5b5b
> > -# endif
> > -LABEL(touppermask):
> > +LABEL(lcase_min):
> > + .quad 0x3f3f3f3f3f3f3f3f
> > + .quad 0x3f3f3f3f3f3f3f3f
> > +LABEL(lcase_max):
> > + .quad 0x9999999999999999
> > + .quad 0x9999999999999999
> > +LABEL(case_add):
> > .quad 0x2020202020202020
> > .quad 0x2020202020202020
> > .previous
> > - movdqa LABEL(belowupper)(%rip), %xmm4
> > -# define UCLOW_reg %xmm4
> > - movdqa LABEL(topupper)(%rip), %xmm5
> > -# define UCHIGH_reg %xmm5
> > - movdqa LABEL(touppermask)(%rip), %xmm6
> > -# define LCQWORD_reg %xmm6
> > + movdqa LABEL(lcase_min)(%rip), %xmm4
> > +# define LCASE_MIN_reg %xmm4
> > + movdqa LABEL(lcase_max)(%rip), %xmm5
> > +# define LCASE_MAX_reg %xmm5
> > + movdqa LABEL(case_add)(%rip), %xmm6
> > +# define CASE_ADD_reg %xmm6
> > #endif
> > cmp $0x30, %ecx
> > ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
> > @@ -200,32 +193,26 @@ LABEL(touppermask):
> > #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> > # ifdef USE_AVX
> > # define TOLOWER(reg1, reg2) \
> > - vpcmpgtb UCLOW_reg, reg1, %xmm7; \
> > - vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
> > - vpcmpgtb UCLOW_reg, reg2, %xmm9; \
> > - vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
> > - vpandn %xmm7, %xmm8, %xmm8; \
> > - vpandn %xmm9, %xmm10, %xmm10; \
> > - vpand LCQWORD_reg, %xmm8, %xmm8; \
> > - vpand LCQWORD_reg, %xmm10, %xmm10; \
> > - vpor reg1, %xmm8, reg1; \
> > - vpor reg2, %xmm10, reg2
> > + vpaddb LCASE_MIN_reg, reg1, %xmm7; \
> > + vpaddb LCASE_MIN_reg, reg2, %xmm8; \
> > + vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \
> > + vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \
> > + vpandn CASE_ADD_reg, %xmm7, %xmm7; \
> > + vpandn CASE_ADD_reg, %xmm8, %xmm8; \
> > + vpaddb %xmm7, reg1, reg1; \
> > + vpaddb %xmm8, reg2, reg2
> > # else
> > # define TOLOWER(reg1, reg2) \
> > - movdqa reg1, %xmm7; \
> > - movdqa UCHIGH_reg, %xmm8; \
> > - movdqa reg2, %xmm9; \
> > - movdqa UCHIGH_reg, %xmm10; \
> > - pcmpgtb UCLOW_reg, %xmm7; \
> > - pcmpgtb reg1, %xmm8; \
> > - pcmpgtb UCLOW_reg, %xmm9; \
> > - pcmpgtb reg2, %xmm10; \
> > - pand %xmm8, %xmm7; \
> > - pand %xmm10, %xmm9; \
> > - pand LCQWORD_reg, %xmm7; \
> > - pand LCQWORD_reg, %xmm9; \
> > - por %xmm7, reg1; \
> > - por %xmm9, reg2
> > + movdqa LCASE_MIN_reg, %xmm7; \
> > + movdqa LCASE_MIN_reg, %xmm8; \
> > + paddb reg1, %xmm7; \
> > + paddb reg2, %xmm8; \
> > + pcmpgtb LCASE_MAX_reg, %xmm7; \
> > + pcmpgtb LCASE_MAX_reg, %xmm8; \
> > + pandn CASE_ADD_reg, %xmm7; \
> > + pandn CASE_ADD_reg, %xmm8; \
> > + paddb %xmm7, reg1; \
> > + paddb %xmm8, reg2
> > # endif
> > TOLOWER (%xmm1, %xmm2)
> > #else
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v4 22/23] x86: Add EVEX optimized str{n}casecmp
2022-03-25 18:18 ` Noah Goldstein
@ 2022-05-12 19:47 ` Sunil Pandey
2022-05-12 19:52 ` Sunil Pandey
0 siblings, 1 reply; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:47 UTC (permalink / raw)
To: Noah Goldstein, Libc-stable Mailing List; +Cc: H.J. Lu, GNU C Library
[-- Attachment #1: Type: text/plain, Size: 34190 bytes --]
On Fri, Mar 25, 2022 at 11:20 AM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Fri, Mar 25, 2022 at 1:15 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 4:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
> > >
> > > All string/memory tests pass.
> > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > > ---
> > > sysdeps/x86_64/multiarch/Makefile | 2 +
> > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +
> > > sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 +
> > > sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++
> > > sysdeps/x86_64/multiarch/strcmp-evex.S | 290 ++++++++++++++++---
> > > sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++
> > > 6 files changed, 321 insertions(+), 40 deletions(-)
> > > create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > > create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > index 06e1848823..35d80dc2ff 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -57,6 +57,7 @@ sysdep_routines += \
> > > strcasecmp_l-avx \
> > > strcasecmp_l-avx2 \
> > > strcasecmp_l-avx2-rtm \
> > > + strcasecmp_l-evex \
> > > strcasecmp_l-sse2 \
> > > strcasecmp_l-sse4_2 \
> > > strcasecmp_l-ssse3 \
> > > @@ -97,6 +98,7 @@ sysdep_routines += \
> > > strncase_l-avx \
> > > strncase_l-avx2 \
> > > strncase_l-avx2-rtm \
> > > + strncase_l-evex \
> > > strncase_l-sse2 \
> > > strncase_l-sse4_2 \
> > > strncase_l-ssse3 \
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index 3c556d07ac..f1a4d3dac2 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >
> > > /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> > > IFUNC_IMPL (i, name, strcasecmp,
> > > + IFUNC_IMPL_ADD (array, i, strcasecmp,
> > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > + && CPU_FEATURE_USABLE (AVX512BW)),
> > > + __strcasecmp_evex)
> > > IFUNC_IMPL_ADD (array, i, strcasecmp,
> > > CPU_FEATURE_USABLE (AVX2),
> > > __strcasecmp_avx2)
> > > @@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >
> > > /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> > > IFUNC_IMPL (i, name, strcasecmp_l,
> > > + IFUNC_IMPL_ADD (array, i, strcasecmp,
> > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > + && CPU_FEATURE_USABLE (AVX512BW)),
> > > + __strcasecmp_l_evex)
> > > IFUNC_IMPL_ADD (array, i, strcasecmp,
> > > CPU_FEATURE_USABLE (AVX2),
> > > __strcasecmp_l_avx2)
> > > @@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >
> > > /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> > > IFUNC_IMPL (i, name, strncasecmp,
> > > + IFUNC_IMPL_ADD (array, i, strncasecmp,
> > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > + && CPU_FEATURE_USABLE (AVX512BW)),
> > > + __strncasecmp_evex)
> > > IFUNC_IMPL_ADD (array, i, strncasecmp,
> > > CPU_FEATURE_USABLE (AVX2),
> > > __strncasecmp_avx2)
> > > @@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >
> > > /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> > > IFUNC_IMPL (i, name, strncasecmp_l,
> > > + IFUNC_IMPL_ADD (array, i, strncasecmp,
> > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > + && CPU_FEATURE_USABLE (AVX512BW)),
> > > + __strncasecmp_l_evex)
> > > IFUNC_IMPL_ADD (array, i, strncasecmp,
> > > CPU_FEATURE_USABLE (AVX2),
> > > __strncasecmp_l_avx2)
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > index c4de111fd0..bf0d146e7f 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > @@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> > > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> > > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > >
> > > static inline void *
> > > IFUNC_SELECTOR (void)
> > > @@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
> > > if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > {
> > > + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > > + return OPTIMIZE (evex);
> > > +
> > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > > return OPTIMIZE (avx2_rtm);
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > > new file mode 100644
> > > index 0000000000..58642db748
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > > @@ -0,0 +1,23 @@
> > > +/* strcasecmp_l optimized with EVEX.
> > > + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > > + This file is part of the GNU C Library.
> > > +
> > > + The GNU C Library is free software; you can redistribute it and/or
> > > + modify it under the terms of the GNU Lesser General Public
> > > + License as published by the Free Software Foundation; either
> > > + version 2.1 of the License, or (at your option) any later version.
> > > +
> > > + The GNU C Library is distributed in the hope that it will be useful,
> > > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > > + Lesser General Public License for more details.
> > > +
> > > + You should have received a copy of the GNU Lesser General Public
> > > + License along with the GNU C Library; if not, see
> > > + <https://www.gnu.org/licenses/>. */
> > > +
> > > +#ifndef STRCMP
> > > +# define STRCMP __strcasecmp_l_evex
> > > +#endif
> > > +#define USE_AS_STRCASECMP_L
> > > +#include "strcmp-evex.S"
> > > diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
> > > index 56d8c118e4..2a5b3ce037 100644
> > > --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
> > > +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
> > > @@ -19,6 +19,9 @@
> > > #if IS_IN (libc)
> > >
> > > # include <sysdep.h>
> > > +# if defined USE_AS_STRCASECMP_L
> > > +# include "locale-defines.h"
> > > +# endif
> > >
> > > # ifndef STRCMP
> > > # define STRCMP __strcmp_evex
> > > @@ -34,19 +37,29 @@
> > > # define VMOVA vmovdqa64
> > >
> > > # ifdef USE_AS_WCSCMP
> > > -# define TESTEQ subl $0xff,
> > > +# ifndef OVERFLOW_STRCMP
> > > +# define OVERFLOW_STRCMP __wcscmp_evex
> > > +# endif
> > > +
> > > +# define TESTEQ subl $0xff,
> > > /* Compare packed dwords. */
> > > # define VPCMP vpcmpd
> > > # define VPMINU vpminud
> > > # define VPTESTM vptestmd
> > > +# define VPTESTNM vptestnmd
> > > /* 1 dword char == 4 bytes. */
> > > # define SIZE_OF_CHAR 4
> > > # else
> > > +# ifndef OVERFLOW_STRCMP
> > > +# define OVERFLOW_STRCMP __strcmp_evex
> > > +# endif
> > > +
> > > # define TESTEQ incl
> > > /* Compare packed bytes. */
> > > # define VPCMP vpcmpb
> > > # define VPMINU vpminub
> > > # define VPTESTM vptestmb
> > > +# define VPTESTNM vptestnmb
> > > /* 1 byte char == 1 byte. */
> > > # define SIZE_OF_CHAR 1
> > > # endif
> > > @@ -73,11 +86,16 @@
> > > # define VEC_OFFSET (-VEC_SIZE)
> > > # endif
> > >
> > > -# define XMMZERO xmm16
> > > # define XMM0 xmm17
> > > # define XMM1 xmm18
> > >
> > > -# define YMMZERO ymm16
> > > +# define XMM10 xmm27
> > > +# define XMM11 xmm28
> > > +# define XMM12 xmm29
> > > +# define XMM13 xmm30
> > > +# define XMM14 xmm31
> > > +
> > > +
> > > # define YMM0 ymm17
> > > # define YMM1 ymm18
> > > # define YMM2 ymm19
> > > @@ -89,6 +107,87 @@
> > > # define YMM8 ymm25
> > > # define YMM9 ymm26
> > > # define YMM10 ymm27
> > > +# define YMM11 ymm28
> > > +# define YMM12 ymm29
> > > +# define YMM13 ymm30
> > > +# define YMM14 ymm31
> > > +
> > > +# ifdef USE_AS_STRCASECMP_L
> > > +# define BYTE_LOOP_REG OFFSET_REG
> > > +# else
> > > +# define BYTE_LOOP_REG ecx
> > > +# endif
> > > +
> > > +# ifdef USE_AS_STRCASECMP_L
> > > +# ifdef USE_AS_STRNCMP
> > > +# define STRCASECMP __strncasecmp_evex
> > > +# define LOCALE_REG rcx
> > > +# define LOCALE_REG_LP RCX_LP
> > > +# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> > > +# else
> > > +# define STRCASECMP __strcasecmp_evex
> > > +# define LOCALE_REG rdx
> > > +# define LOCALE_REG_LP RDX_LP
> > > +# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> > > +# endif
> > > +# endif
> > > +
> > > +# define LCASE_MIN_YMM %YMM12
> > > +# define LCASE_MAX_YMM %YMM13
> > > +# define CASE_ADD_YMM %YMM14
> > > +
> > > +# define LCASE_MIN_XMM %XMM12
> > > +# define LCASE_MAX_XMM %XMM13
> > > +# define CASE_ADD_XMM %XMM14
> > > +
> > > + /* NB: wcsncmp uses r11 but strcasecmp is never used in
> > > + conjunction with wcscmp. */
> > > +# define TOLOWER_BASE %r11
> > > +
> > > +# ifdef USE_AS_STRCASECMP_L
> > > +# define _REG(x, y) x ## y
> > > +# define REG(x, y) _REG(x, y)
> > > +# define TOLOWER(reg1, reg2, ext) \
> > > + vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \
> > > + vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \
> > > + vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \
> > > + vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \
> > > + vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \
> > > + vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6}
> > > +
> > > +# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> > > +# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM)
> > > +# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM)
> > > +
> > > +# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \
> > > + TOLOWER (s1_reg, s2_reg, ext); \
> > > + VPCMP $0, s1_reg, s2_reg, reg_out
> > > +
> > > +# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \
> > > + VMOVU s2_mem, s2_reg; \
> > > + CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
> > > +
> > > +# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
> > > +# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
> > > +
> > > +# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
> > > +# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
> > > +
> > > +# else
> > > +# define TOLOWER_gpr(...)
> > > +# define TOLOWER_YMM(...)
> > > +# define TOLOWER_XMM(...)
> > > +
> > > +# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \
> > > + VPCMP $0, s2_reg, s1_reg, reg_out
> > > +
> > > +# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
> > > +
> > > +# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \
> > > + VPCMP $0, s2_mem, s1_reg, reg_out
> > > +
> > > +# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
> > > +# endif
> > >
> > > /* Warning!
> > > wcscmp/wcsncmp have to use SIGNED comparison for elements.
> > > @@ -112,8 +211,45 @@
> > > returned. */
> > >
> > > .section .text.evex, "ax", @progbits
> > > -ENTRY(STRCMP)
> > > + .align 16
> > > + .type STRCMP, @function
> > > + .globl STRCMP
> > > + .hidden STRCMP
> > > +
> > > +# ifdef USE_AS_STRCASECMP_L
> > > +ENTRY (STRCASECMP)
> > > + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
> > > + mov %fs:(%rax), %LOCALE_REG_LP
> > > +
> > > + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> > > + .p2align 4
> > > +END (STRCASECMP)
> > > + /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
> > > +# endif
> > > +
> > > + .p2align 4
> > > +STRCMP:
> > > + cfi_startproc
> > > + _CET_ENDBR
> > > + CALL_MCOUNT
> > > +
> > > +# if defined USE_AS_STRCASECMP_L
> > > + /* We have to fall back on the C implementation for locales with
> > > + encodings not matching ASCII for single bytes. */
> > > +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> > > + mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> > > +# else
> > > + mov (%LOCALE_REG), %RAX_LP
> > > +# endif
> > > + testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> > > + jne STRCASECMP_NONASCII
> > > + leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> > > +# endif
> > > +
> > > # ifdef USE_AS_STRNCMP
> > > + /* Don't overwrite LOCALE_REG (rcx) until we have passed
> > > + L(one_or_less). Otherwise we might use the wrong locale in
> > > + the OVERFLOW_STRCMP (strcasecmp_l). */
> > > # ifdef __ILP32__
> > > /* Clear the upper 32 bits. */
> > > movl %edx, %edx
> > > @@ -125,6 +261,32 @@ ENTRY(STRCMP)
> > > actually bound the buffer. */
> > > jle L(one_or_less)
> > > # endif
> > > +
> > > +# if defined USE_AS_STRCASECMP_L
> > > + .section .rodata.cst32, "aM", @progbits, 32
> > > + .align 32
> > > +L(lcase_min):
> > > + .quad 0x4141414141414141
> > > + .quad 0x4141414141414141
> > > + .quad 0x4141414141414141
> > > + .quad 0x4141414141414141
> > > +L(lcase_max):
> > > + .quad 0x1a1a1a1a1a1a1a1a
> > > + .quad 0x1a1a1a1a1a1a1a1a
> > > + .quad 0x1a1a1a1a1a1a1a1a
> > > + .quad 0x1a1a1a1a1a1a1a1a
> > > +L(case_add):
> > > + .quad 0x2020202020202020
> > > + .quad 0x2020202020202020
> > > + .quad 0x2020202020202020
> > > + .quad 0x2020202020202020
> > > + .previous
> > > +
> > > + vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
> > > + vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
> > > + vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
> > > +# endif
> > > +
> > > movl %edi, %eax
> > > orl %esi, %eax
> > > /* Shift out the bits irrelivant to page boundary ([63:12]). */
> > > @@ -139,7 +301,7 @@ L(no_page_cross):
> > > VPTESTM %YMM0, %YMM0, %k2
> > > /* Each bit cleared in K1 represents a mismatch or a null CHAR
> > > in YMM0 and 32 bytes at (%rsi). */
> > > - VPCMP $0, (%rsi), %YMM0, %k1{%k2}
> > > + CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > # ifdef USE_AS_STRNCMP
> > > cmpq $CHAR_PER_VEC, %rdx
> > > @@ -169,6 +331,8 @@ L(return_vec_0):
> > > # else
> > > movzbl (%rdi, %rcx), %eax
> > > movzbl (%rsi, %rcx), %ecx
> > > + TOLOWER_gpr (%rax, %eax)
> > > + TOLOWER_gpr (%rcx, %ecx)
> > > subl %ecx, %eax
> > > # endif
> > > L(ret0):
> > > @@ -188,11 +352,15 @@ L(ret_zero):
> > >
> > > .p2align 4,, 5
> > > L(one_or_less):
> > > +# ifdef USE_AS_STRCASECMP_L
> > > + /* Set locale argument for strcasecmp. */
> > > + movq %LOCALE_REG, %rdx
> > > +# endif
> > > jb L(ret_zero)
> > > -# ifdef USE_AS_WCSCMP
> > > /* 'nbe' covers the case where length is negative (large
> > > unsigned). */
> > > - jnbe __wcscmp_evex
> > > + jnbe OVERFLOW_STRCMP
> > > +# ifdef USE_AS_WCSCMP
> > > movl (%rdi), %edx
> > > xorl %eax, %eax
> > > cmpl (%rsi), %edx
> > > @@ -201,11 +369,10 @@ L(one_or_less):
> > > negl %eax
> > > orl $1, %eax
> > > # else
> > > - /* 'nbe' covers the case where length is negative (large
> > > - unsigned). */
> > > - jnbe __strcmp_evex
> > > movzbl (%rdi), %eax
> > > movzbl (%rsi), %ecx
> > > + TOLOWER_gpr (%rax, %eax)
> > > + TOLOWER_gpr (%rcx, %ecx)
> > > subl %ecx, %eax
> > > # endif
> > > L(ret1):
> > > @@ -233,6 +400,8 @@ L(return_vec_1):
> > > # else
> > > movzbl VEC_SIZE(%rdi, %rcx), %eax
> > > movzbl VEC_SIZE(%rsi, %rcx), %ecx
> > > + TOLOWER_gpr (%rax, %eax)
> > > + TOLOWER_gpr (%rcx, %ecx)
> > > subl %ecx, %eax
> > > # endif
> > > L(ret2):
> > > @@ -270,6 +439,8 @@ L(return_vec_2):
> > > # else
> > > movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
> > > movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> > > + TOLOWER_gpr (%rax, %eax)
> > > + TOLOWER_gpr (%rcx, %ecx)
> > > subl %ecx, %eax
> > > # endif
> > > L(ret3):
> > > @@ -290,6 +461,8 @@ L(return_vec_3):
> > > # else
> > > movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
> > > movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> > > + TOLOWER_gpr (%rax, %eax)
> > > + TOLOWER_gpr (%rcx, %ecx)
> > > subl %ecx, %eax
> > > # endif
> > > L(ret4):
> > > @@ -303,7 +476,7 @@ L(more_3x_vec):
> > > /* Safe to compare 4x vectors. */
> > > VMOVU (VEC_SIZE)(%rdi), %YMM0
> > > VPTESTM %YMM0, %YMM0, %k2
> > > - VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
> > > + CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > TESTEQ %ecx
> > > jnz L(return_vec_1)
> > > @@ -315,14 +488,14 @@ L(more_3x_vec):
> > >
> > > VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
> > > VPTESTM %YMM0, %YMM0, %k2
> > > - VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
> > > + CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > TESTEQ %ecx
> > > jnz L(return_vec_2)
> > >
> > > VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
> > > VPTESTM %YMM0, %YMM0, %k2
> > > - VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
> > > + CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > TESTEQ %ecx
> > > jnz L(return_vec_3)
> > > @@ -381,7 +554,6 @@ L(prepare_loop_aligned):
> > > subl %esi, %eax
> > > andl $(PAGE_SIZE - 1), %eax
> > >
> > > - vpxorq %YMMZERO, %YMMZERO, %YMMZERO
> > >
> > > /* Loop 4x comparisons at a time. */
> > > .p2align 4
> > > @@ -413,22 +585,35 @@ L(loop_skip_page_cross_check):
> > > /* A zero CHAR in YMM9 means that there is a null CHAR. */
> > > VPMINU %YMM8, %YMM9, %YMM9
> > >
> > > - /* Each bit set in K1 represents a non-null CHAR in YMM8. */
> > > + /* Each bit set in K1 represents a non-null CHAR in YMM9. */
> > > VPTESTM %YMM9, %YMM9, %k1
> > > -
> > > +# ifndef USE_AS_STRCASECMP_L
> > > vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
> > > vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
> > > vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> > > /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
> > > oring with YMM1. Result is stored in YMM6. */
> > > vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
> > > -
> > > +# else
> > > + VMOVU (VEC_SIZE * 0)(%rsi), %YMM1
> > > + TOLOWER_YMM (%YMM0, %YMM1)
> > > + VMOVU (VEC_SIZE * 1)(%rsi), %YMM3
> > > + TOLOWER_YMM (%YMM2, %YMM3)
> > > + VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
> > > + TOLOWER_YMM (%YMM4, %YMM5)
> > > + VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
> > > + TOLOWER_YMM (%YMM6, %YMM7)
> > > + vpxorq %YMM0, %YMM1, %YMM1
> > > + vpxorq %YMM2, %YMM3, %YMM3
> > > + vpxorq %YMM4, %YMM5, %YMM5
> > > + vpternlogd $0xde, %YMM7, %YMM1, %YMM6
> > > +# endif
> > > /* Or together YMM3, YMM5, and YMM6. */
> > > vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
> > >
> > >
> > > /* A non-zero CHAR in YMM6 represents a mismatch. */
> > > - VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
> > > + VPTESTNM %YMM6, %YMM6, %k0{%k1}
> > > kmovd %k0, %LOOP_REG
> > >
> > > TESTEQ %LOOP_REG
> > > @@ -437,13 +622,13 @@ L(loop_skip_page_cross_check):
> > >
> > > /* Find which VEC has the mismatch of end of string. */
> > > VPTESTM %YMM0, %YMM0, %k1
> > > - VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
> > > + VPTESTNM %YMM1, %YMM1, %k0{%k1}
> > > kmovd %k0, %ecx
> > > TESTEQ %ecx
> > > jnz L(return_vec_0_end)
> > >
> > > VPTESTM %YMM2, %YMM2, %k1
> > > - VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
> > > + VPTESTNM %YMM3, %YMM3, %k0{%k1}
> > > kmovd %k0, %ecx
> > > TESTEQ %ecx
> > > jnz L(return_vec_1_end)
> > > @@ -457,7 +642,7 @@ L(return_vec_2_3_end):
> > > # endif
> > >
> > > VPTESTM %YMM4, %YMM4, %k1
> > > - VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
> > > + VPTESTNM %YMM5, %YMM5, %k0{%k1}
> > > kmovd %k0, %ecx
> > > TESTEQ %ecx
> > > # if CHAR_PER_VEC <= 16
> > > @@ -493,6 +678,8 @@ L(return_vec_3_end):
> > > # else
> > > movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
> > > movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
> > > + TOLOWER_gpr (%rax, %eax)
> > > + TOLOWER_gpr (%rcx, %ecx)
> > > subl %ecx, %eax
> > > xorl %r8d, %eax
> > > subl %r8d, %eax
> > > @@ -545,6 +732,8 @@ L(return_vec_0_end):
> > > # else
> > > movzbl (%rdi, %rcx), %eax
> > > movzbl (%rsi, %rcx), %ecx
> > > + TOLOWER_gpr (%rax, %eax)
> > > + TOLOWER_gpr (%rcx, %ecx)
> > > subl %ecx, %eax
> > > /* Flip `eax` if `rdi` and `rsi` where swapped in page cross
> > > logic. Subtract `r8d` after xor for zero case. */
> > > @@ -569,6 +758,8 @@ L(return_vec_1_end):
> > > # else
> > > movzbl VEC_SIZE(%rdi, %rcx), %eax
> > > movzbl VEC_SIZE(%rsi, %rcx), %ecx
> > > + TOLOWER_gpr (%rax, %eax)
> > > + TOLOWER_gpr (%rcx, %ecx)
> > > subl %ecx, %eax
> > > xorl %r8d, %eax
> > > subl %r8d, %eax
> > > @@ -598,7 +789,7 @@ L(page_cross_during_loop):
> > >
> > > VMOVA (%rdi), %YMM0
> > > VPTESTM %YMM0, %YMM0, %k2
> > > - VPCMP $0, (%rsi), %YMM0, %k1{%k2}
> > > + CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > TESTEQ %ecx
> > > jnz L(return_vec_0_end)
> > > @@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross):
> > > been loaded earlier so must be valid. */
> > > VMOVU -VEC_SIZE(%rdi, %rax), %YMM0
> > > VPTESTM %YMM0, %YMM0, %k2
> > > - VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
> > > -
> > > + CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
> > > /* Mask of potentially valid bits. The lower bits can be out of
> > > range comparisons (but safe regarding page crosses). */
> > >
> > > @@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross):
> > >
> > > # ifdef USE_AS_STRNCMP
> > > # ifdef USE_AS_WCSCMP
> > > + /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> > > + safe. */
> > > movl %eax, %r11d
> > > shrl $2, %r11d
> > > cmpq %r11, %rdx
> > > @@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem):
> > > # else
> > > movzbl VEC_OFFSET(%rdi, %rcx), %eax
> > > movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> > > + TOLOWER_gpr (%rax, %eax)
> > > + TOLOWER_gpr (%rcx, %ecx)
> > > subl %ecx, %eax
> > > xorl %r8d, %eax
> > > subl %r8d, %eax
> > > @@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross):
> > >
> > > VMOVA VEC_SIZE(%rdi), %YMM0
> > > VPTESTM %YMM0, %YMM0, %k2
> > > - VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
> > > + CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > TESTEQ %ecx
> > > jnz L(return_vec_1_end)
> > > @@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross):
> > > /* Safe to include comparisons from lower bytes. */
> > > VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
> > > VPTESTM %YMM0, %YMM0, %k2
> > > - VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
> > > + CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > TESTEQ %ecx
> > > jnz L(return_vec_page_cross_0)
> > >
> > > VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
> > > VPTESTM %YMM0, %YMM0, %k2
> > > - VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
> > > + CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > TESTEQ %ecx
> > > jnz L(return_vec_page_cross_1)
> > > @@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross):
> > > /* Must check length here as length might proclude reading next
> > > page. */
> > > # ifdef USE_AS_WCSCMP
> > > + /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> > > + safe. */
> > > movl %eax, %r11d
> > > shrl $2, %r11d
> > > cmpq %r11, %rdx
> > > @@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross):
> > > VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
> > > VPMINU %YMM4, %YMM6, %YMM9
> > > VPTESTM %YMM9, %YMM9, %k1
> > > -
> > > +# ifndef USE_AS_STRCASECMP_L
> > > vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> > > /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */
> > > vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
> > > -
> > > - VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
> > > +# else
> > > + VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
> > > + TOLOWER_YMM (%YMM4, %YMM5)
> > > + VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
> > > + TOLOWER_YMM (%YMM6, %YMM7)
> > > + vpxorq %YMM4, %YMM5, %YMM5
> > > + vpternlogd $0xde, %YMM7, %YMM5, %YMM6
> > > +# endif
> > > + VPTESTNM %YMM6, %YMM6, %k0{%k1}
> > > kmovd %k0, %LOOP_REG
> > > TESTEQ %LOOP_REG
> > > jnz L(return_vec_2_3_end)
> > > @@ -815,6 +1018,8 @@ L(return_vec_page_cross_1):
> > > # else
> > > movzbl VEC_OFFSET(%rdi, %rcx), %eax
> > > movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> > > + TOLOWER_gpr (%rax, %eax)
> > > + TOLOWER_gpr (%rcx, %ecx)
> > > subl %ecx, %eax
> > > xorl %r8d, %eax
> > > subl %r8d, %eax
> > > @@ -871,7 +1076,7 @@ L(page_cross):
> > > L(page_cross_loop):
> > > VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> > > VPTESTM %YMM0, %YMM0, %k2
> > > - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> > > + CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > TESTEQ %ecx
> > > jnz L(check_ret_vec_page_cross)
> > > @@ -895,7 +1100,7 @@ L(page_cross_loop):
> > > */
> > > VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> > > VPTESTM %YMM0, %YMM0, %k2
> > > - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> > > + CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> > >
> > > kmovd %k1, %ecx
> > > # ifdef USE_AS_STRNCMP
> > > @@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont):
> > > # else
> > > movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
> > > movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
> > > + TOLOWER_gpr (%rax, %eax)
> > > + TOLOWER_gpr (%rcx, %ecx)
> > > subl %ecx, %eax
> > > xorl %r8d, %eax
> > > subl %r8d, %eax
> > > @@ -989,7 +1196,7 @@ L(less_1x_vec_till_page):
> > > /* Use 16 byte comparison. */
> > > vmovdqu (%rdi), %xmm0
> > > VPTESTM %xmm0, %xmm0, %k2
> > > - VPCMP $0, (%rsi), %xmm0, %k1{%k2}
> > > + CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > # ifdef USE_AS_WCSCMP
> > > subl $0xf, %ecx
> > > @@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page):
> > > # endif
> > > vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > > VPTESTM %xmm0, %xmm0, %k2
> > > - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
> > > + CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > # ifdef USE_AS_WCSCMP
> > > subl $0xf, %ecx
> > > @@ -1048,7 +1255,7 @@ L(less_16_till_page):
> > > vmovq (%rdi), %xmm0
> > > vmovq (%rsi), %xmm1
> > > VPTESTM %xmm0, %xmm0, %k2
> > > - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> > > + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > # ifdef USE_AS_WCSCMP
> > > subl $0x3, %ecx
> > > @@ -1068,7 +1275,7 @@ L(less_16_till_page):
> > > vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > > vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> > > VPTESTM %xmm0, %xmm0, %k2
> > > - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> > > + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > # ifdef USE_AS_WCSCMP
> > > subl $0x3, %ecx
> > > @@ -1128,7 +1335,7 @@ L(ret_less_8_wcs):
> > > vmovd (%rdi), %xmm0
> > > vmovd (%rsi), %xmm1
> > > VPTESTM %xmm0, %xmm0, %k2
> > > - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> > > + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > subl $0xf, %ecx
> > > jnz L(check_ret_vec_page_cross)
> > > @@ -1143,7 +1350,7 @@ L(ret_less_8_wcs):
> > > vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > > vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> > > VPTESTM %xmm0, %xmm0, %k2
> > > - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> > > + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > > kmovd %k1, %ecx
> > > subl $0xf, %ecx
> > > jnz L(check_ret_vec_page_cross)
> > > @@ -1176,7 +1383,9 @@ L(less_4_till_page):
> > > L(less_4_loop):
> > > movzbl (%rdi), %eax
> > > movzbl (%rsi, %rdi), %ecx
> > > - subl %ecx, %eax
> > > + TOLOWER_gpr (%rax, %eax)
> > > + TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> > > + subl %BYTE_LOOP_REG, %eax
> > > jnz L(ret_less_4_loop)
> > > testl %ecx, %ecx
> > > jz L(ret_zero_4_loop)
> > > @@ -1203,5 +1412,6 @@ L(ret_less_4_loop):
> > > subl %r8d, %eax
> > > ret
> > > # endif
> > > -END(STRCMP)
> > > + cfi_endproc
> > > + .size STRCMP, .-STRCMP
> > > #endif
> > > diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> > > new file mode 100644
> > > index 0000000000..8a5af3695c
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> > > @@ -0,0 +1,25 @@
> > > +/* strncasecmp_l optimized with EVEX.
> > > + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > > + This file is part of the GNU C Library.
> > > +
> > > + The GNU C Library is free software; you can redistribute it and/or
> > > + modify it under the terms of the GNU Lesser General Public
> > > + License as published by the Free Software Foundation; either
> > > + version 2.1 of the License, or (at your option) any later version.
> > > +
> > > + The GNU C Library is distributed in the hope that it will be useful,
> > > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > > + Lesser General Public License for more details.
> > > +
> > > + You should have received a copy of the GNU Lesser General Public
> > > + License along with the GNU C Library; if not, see
> > > + <https://www.gnu.org/licenses/>. */
> > > +
> > > +#ifndef STRCMP
> > > +# define STRCMP __strncasecmp_l_evex
> > > +#endif
> > > +#define OVERFLOW_STRCMP __strcasecmp_l_evex
> > > +#define USE_AS_STRCASECMP_L
> > > +#define USE_AS_STRNCMP
> > > +#include "strcmp-evex.S"
> > > --
> > > 2.25.1
> > >
> >
> > LGTM.
> >
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks, pushed the patchset.
> >
> > Thanks.
> >
> > --
> > H.J.
I would like to backport this patch to release branches.
Any comments or objections?
Conflict resolution patch attached.
--Sunil
[-- Attachment #2: 0015-x86-Add-AVX2-optimized-str-n-casecmp.patch --]
[-- Type: application/octet-stream, Size: 24673 bytes --]
From b382e4caf50dfee62e170f9b6617b470b1289dcb Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 24 Mar 2022 18:56:12 -0500
Subject: [PATCH 15/26] x86: Add AVX2 optimized str{n}casecmp
geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit bbf81222343fed5cd704001a2ae0d86c71544151)
---
sysdeps/x86_64/multiarch/Makefile | 4 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 +
.../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++
sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++
sysdeps/x86_64/multiarch/strcmp-avx2.S | 237 +++++++++++++++---
.../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++
sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++
8 files changed, 331 insertions(+), 31 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7b413edad..06e1848823 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -55,6 +55,8 @@ sysdep_routines += \
stpncpy-sse2-unaligned \
stpncpy-ssse3 \
strcasecmp_l-avx \
+ strcasecmp_l-avx2 \
+ strcasecmp_l-avx2-rtm \
strcasecmp_l-sse2 \
strcasecmp_l-sse4_2 \
strcasecmp_l-ssse3 \
@@ -93,6 +95,8 @@ sysdep_routines += \
strlen-evex \
strlen-sse2 \
strncase_l-avx \
+ strncase_l-avx2 \
+ strncase_l-avx2-rtm \
strncase_l-sse2 \
strncase_l-sse4_2 \
strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a594f4176e..3c556d07ac 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strcasecmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strcasecmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (AVX),
__strcasecmp_avx)
@@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strcasecmp_l_avx2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strcasecmp_l_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcasecmp_l,
CPU_FEATURE_USABLE (AVX),
__strcasecmp_l_avx)
@@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strncasecmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strncasecmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (AVX),
__strncasecmp_avx)
@@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strncasecmp_l_avx2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strncasecmp_l_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncasecmp_l,
CPU_FEATURE_USABLE (AVX),
__strncasecmp_l_avx)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 9e3cc61ac0..c4de111fd0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
+
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
return OPTIMIZE (avx);
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
new file mode 100644
index 0000000000..09957fc3c5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
@@ -0,0 +1,15 @@
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x) x ## _rtm
+#define GLABEL(x) _GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
new file mode 100644
index 0000000000..e2762f2a22
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with AVX2.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index b94fc5c39a..3366d0b083 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -20,6 +20,10 @@
# include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# endif
+
# ifndef STRCMP
# define STRCMP __strcmp_avx2
# endif
@@ -74,13 +78,88 @@
# define VEC_OFFSET (-VEC_SIZE)
# endif
+# ifdef USE_AS_STRCASECMP_L
+# define BYTE_LOOP_REG OFFSET_REG
+# else
+# define BYTE_LOOP_REG ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRNCMP
+# define STRCASECMP __strncasecmp_avx2
+# define LOCALE_REG rcx
+# define LOCALE_REG_LP RCX_LP
+# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
+# else
+# define STRCASECMP __strcasecmp_avx2
+# define LOCALE_REG rdx
+# define LOCALE_REG_LP RDX_LP
+# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
+# endif
+# endif
+
# define xmmZERO xmm15
# define ymmZERO ymm15
+# define LCASE_MIN_ymm %ymm10
+# define LCASE_MAX_ymm %ymm11
+# define CASE_ADD_ymm %ymm12
+
+# define LCASE_MIN_xmm %xmm10
+# define LCASE_MAX_xmm %xmm11
+# define CASE_ADD_xmm %xmm12
+
+	/* r11 is never used elsewhere so this is safe to maintain.  */
+# define TOLOWER_BASE %r11
+
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
+# ifdef USE_AS_STRCASECMP_L
+# define REG(x, y) x ## y
+# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
+ vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
+ vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
+ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
+ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
+ vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
+ vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
+ vpaddb REG(%ext, 8), reg1_in, reg1_out; \
+ vpaddb REG(%ext, 9), reg2_in, reg2_out
+
+# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
+# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)
+
+# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
+ TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
+ VPCMPEQ scratch_reg, s2_reg, reg_out
+
+# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
+ VMOVU s2_mem, reg_out; \
+ CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+# define TOLOWER_gpr(...)
+# define TOLOWER_ymm(...)
+# define TOLOWER_xmm(...)
+
+# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
+ VPCMPEQ s2_reg, s1_reg, reg_out
+
+# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -102,8 +181,49 @@
returned. */
.section SECTION(.text), "ax", @progbits
-ENTRY(STRCMP)
+ .align 16
+ .type STRCMP, @function
+ .globl STRCMP
+ .hidden STRCMP
+
+# ifndef GLABEL
+# define GLABEL(...) __VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
+ mov %fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending if CET is enabled).  */
+ .p2align 4
+END (GLABEL(STRCASECMP))
+ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
+# endif
+
+ .p2align 4
+STRCMP:
+ cfi_startproc
+ _CET_ENDBR
+ CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales with
+ encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+# else
+ mov (%LOCALE_REG), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+ jne STRCASECMP_NONASCII
+ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
# ifdef USE_AS_STRNCMP
+ /* Don't overwrite LOCALE_REG (rcx) until we have pass
+ L(one_or_less). Otherwise we might use the wrong locale in
+ the OVERFLOW_STRCMP (strcasecmp_l). */
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -128,6 +248,30 @@ ENTRY(STRCMP)
# endif
# endif
vpxor %xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+ .section .rodata.cst32, "aM", @progbits, 32
+ .align 32
+L(lcase_min):
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+L(case_add):
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+
+ vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
+ vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
+ vmovdqa L(case_add)(%rip), CASE_ADD_ymm
+# endif
movl %edi, %eax
orl %esi, %eax
sall $20, %eax
@@ -138,8 +282,10 @@ ENTRY(STRCMP)
L(no_page_cross):
/* Safe to compare 4x vectors. */
VMOVU (%rdi), %ymm0
- /* 1s where s1 and s2 equal. */
- VPCMPEQ (%rsi), %ymm0, %ymm1
+ /* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp.
+ Otherwise converts ymm0 and load from rsi to lower. ymm2 is
+ scratch and ymm1 is the return. */
+ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
/* 1s at null CHAR. */
VPCMPEQ %ymm0, %ymmZERO, %ymm2
/* 1s where s1 and s2 equal AND not null CHAR. */
@@ -172,6 +318,8 @@ L(return_vec_0):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret0):
@@ -192,6 +340,10 @@ L(ret_zero):
.p2align 4,, 5
L(one_or_less):
+# ifdef USE_AS_STRCASECMP_L
+ /* Set locale argument for strcasecmp. */
+ movq %LOCALE_REG, %rdx
+# endif
jb L(ret_zero)
# ifdef USE_AS_WCSCMP
/* 'nbe' covers the case where length is negative (large
@@ -211,6 +363,8 @@ L(one_or_less):
jnbe __strcmp_avx2
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret1):
@@ -238,6 +392,8 @@ L(return_vec_1):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret2):
@@ -269,6 +425,8 @@ L(return_vec_2):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret3):
@@ -289,6 +447,8 @@ L(return_vec_3):
# else
movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret4):
@@ -299,7 +459,7 @@ L(ret4):
L(more_3x_vec):
/* Safe to compare 4x vectors. */
VMOVU VEC_SIZE(%rdi), %ymm0
- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -312,7 +472,7 @@ L(more_3x_vec):
# endif
VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -320,7 +480,7 @@ L(more_3x_vec):
jnz L(return_vec_2)
VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -395,12 +555,10 @@ L(loop_skip_page_cross_check):
VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */
- VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
-
- VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
-
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+ CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
zero. */
@@ -469,6 +627,8 @@ L(return_vec_2_3_end):
# else
movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -512,6 +672,8 @@ L(return_vec_0_end):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -534,6 +696,8 @@ L(return_vec_1_end):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -560,6 +724,8 @@ L(return_vec_2_end):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -587,7 +753,7 @@ L(page_cross_during_loop):
jle L(less_1x_vec_till_page_cross)
VMOVA (%rdi), %ymm0
- VPCMPEQ (%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -609,7 +775,7 @@ L(less_1x_vec_till_page_cross):
here, it means the previous page (rdi - VEC_SIZE) has already
been loaded earlier so must be valid. */
VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
- VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -651,6 +817,8 @@ L(return_page_cross_cmp_mem):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -677,7 +845,7 @@ L(more_2x_vec_till_page_cross):
iteration here. */
VMOVU VEC_SIZE(%rdi), %ymm0
- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -693,7 +861,7 @@ L(more_2x_vec_till_page_cross):
/* Safe to include comparisons from lower bytes. */
VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
- VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -701,7 +869,7 @@ L(more_2x_vec_till_page_cross):
jnz L(return_vec_page_cross_0)
VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
- VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -719,8 +887,8 @@ L(more_2x_vec_till_page_cross):
VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
+ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
vpand %ymm4, %ymm5, %ymm5
vpand %ymm6, %ymm7, %ymm7
VPMINU %ymm5, %ymm7, %ymm7
@@ -771,6 +939,8 @@ L(return_vec_page_cross_1):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -826,7 +996,7 @@ L(page_cross):
L(page_cross_loop):
VMOVU (%rdi, %OFFSET_REG64), %ymm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -844,11 +1014,11 @@ L(page_cross_loop):
subl %eax, %OFFSET_REG
/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
to not cross page so is safe to load. Since we have already
- loaded at least 1 VEC from rsi it is also guranteed to be safe.
- */
+	   loaded at least 1 VEC from rsi it is also guaranteed to be
+ safe. */
VMOVU (%rdi, %OFFSET_REG64), %ymm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -881,6 +1051,8 @@ L(ret_vec_page_cross_cont):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -934,7 +1106,7 @@ L(less_1x_vec_till_page):
ja L(less_16_till_page)
VMOVU (%rdi), %xmm0
- VPCMPEQ (%rsi), %xmm0, %xmm1
+ CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
VPCMPEQ %xmm0, %xmmZERO, %xmm2
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
@@ -952,7 +1124,7 @@ L(less_1x_vec_till_page):
# endif
VMOVU (%rdi, %OFFSET_REG64), %xmm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1
+ CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
VPCMPEQ %xmm0, %xmmZERO, %xmm2
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
@@ -990,7 +1162,7 @@ L(less_16_till_page):
vmovq (%rdi), %xmm0
vmovq (%rsi), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
incb %cl
@@ -1010,7 +1182,7 @@ L(less_16_till_page):
vmovq (%rdi, %OFFSET_REG64), %xmm0
vmovq (%rsi, %OFFSET_REG64), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
incb %cl
@@ -1066,7 +1238,7 @@ L(ret_less_8_wcs):
vmovd (%rdi), %xmm0
vmovd (%rsi), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
subl $0xf, %ecx
@@ -1085,7 +1257,7 @@ L(ret_less_8_wcs):
vmovd (%rdi, %OFFSET_REG64), %xmm0
vmovd (%rsi, %OFFSET_REG64), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
subl $0xf, %ecx
@@ -1119,7 +1291,9 @@ L(less_4_till_page):
L(less_4_loop):
movzbl (%rdi), %eax
movzbl (%rsi, %rdi), %ecx
- subl %ecx, %eax
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+ subl %BYTE_LOOP_REG, %eax
jnz L(ret_less_4_loop)
testl %ecx, %ecx
jz L(ret_zero_4_loop)
@@ -1146,5 +1320,6 @@ L(ret_less_4_loop):
subl %r8d, %eax
ret
# endif
-END(STRCMP)
+ cfi_endproc
+ .size STRCMP, .-STRCMP
#endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
new file mode 100644
index 0000000000..58c05dcfb8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
@@ -0,0 +1,16 @@
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x) x ## _rtm
+#define GLABEL(x) _GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+#define OVERFLOW_STRCMP __strcasecmp_l_avx2_rtm
+
+#include "strncase_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
new file mode 100644
index 0000000000..48c0aa21f8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
@@ -0,0 +1,27 @@
+/* strncasecmp_l optimized with AVX2.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __strcasecmp_l_avx2
+#endif
+#include "strcmp-avx2.S"
--
2.35.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v4 21/23] x86: Add AVX2 optimized str{n}casecmp
2022-03-25 18:14 ` [PATCH v4 21/23] x86: Add AVX2 " H.J. Lu
@ 2022-05-12 19:52 ` Sunil Pandey
0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:52 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
[-- Attachment #1: Type: text/plain, Size: 31021 bytes --]
On Fri, Mar 25, 2022 at 11:15 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Mar 24, 2022 at 4:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
> >
> > All string/memory tests pass.
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> > sysdeps/x86_64/multiarch/Makefile | 4 +
> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++
> > sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 +
> > .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++
> > sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++
> > sysdeps/x86_64/multiarch/strcmp-avx2.S | 237 +++++++++++++++---
> > .../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++
> > sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++
> > 8 files changed, 331 insertions(+), 31 deletions(-)
> > create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> > create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> > create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> > create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index e7b413edad..06e1848823 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -55,6 +55,8 @@ sysdep_routines += \
> > stpncpy-sse2-unaligned \
> > stpncpy-ssse3 \
> > strcasecmp_l-avx \
> > + strcasecmp_l-avx2 \
> > + strcasecmp_l-avx2-rtm \
> > strcasecmp_l-sse2 \
> > strcasecmp_l-sse4_2 \
> > strcasecmp_l-ssse3 \
> > @@ -93,6 +95,8 @@ sysdep_routines += \
> > strlen-evex \
> > strlen-sse2 \
> > strncase_l-avx \
> > + strncase_l-avx2 \
> > + strncase_l-avx2-rtm \
> > strncase_l-sse2 \
> > strncase_l-sse4_2 \
> > strncase_l-ssse3 \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index a594f4176e..3c556d07ac 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> > /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> > IFUNC_IMPL (i, name, strcasecmp,
> > + IFUNC_IMPL_ADD (array, i, strcasecmp,
> > + CPU_FEATURE_USABLE (AVX2),
> > + __strcasecmp_avx2)
> > + IFUNC_IMPL_ADD (array, i, strcasecmp,
> > + (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (RTM)),
> > + __strcasecmp_avx2_rtm)
> > IFUNC_IMPL_ADD (array, i, strcasecmp,
> > CPU_FEATURE_USABLE (AVX),
> > __strcasecmp_avx)
> > @@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> > /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> > IFUNC_IMPL (i, name, strcasecmp_l,
> > + IFUNC_IMPL_ADD (array, i, strcasecmp,
> > + CPU_FEATURE_USABLE (AVX2),
> > + __strcasecmp_l_avx2)
> > + IFUNC_IMPL_ADD (array, i, strcasecmp,
> > + (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (RTM)),
> > + __strcasecmp_l_avx2_rtm)
> > IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> > CPU_FEATURE_USABLE (AVX),
> > __strcasecmp_l_avx)
> > @@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> > /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> > IFUNC_IMPL (i, name, strncasecmp,
> > + IFUNC_IMPL_ADD (array, i, strncasecmp,
> > + CPU_FEATURE_USABLE (AVX2),
> > + __strncasecmp_avx2)
> > + IFUNC_IMPL_ADD (array, i, strncasecmp,
> > + (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (RTM)),
> > + __strncasecmp_avx2_rtm)
> > IFUNC_IMPL_ADD (array, i, strncasecmp,
> > CPU_FEATURE_USABLE (AVX),
> > __strncasecmp_avx)
> > @@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> > /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> > IFUNC_IMPL (i, name, strncasecmp_l,
> > + IFUNC_IMPL_ADD (array, i, strncasecmp,
> > + CPU_FEATURE_USABLE (AVX2),
> > + __strncasecmp_l_avx2)
> > + IFUNC_IMPL_ADD (array, i, strncasecmp,
> > + (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (RTM)),
> > + __strncasecmp_l_avx2_rtm)
> > IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> > CPU_FEATURE_USABLE (AVX),
> > __strncasecmp_l_avx)
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > index 9e3cc61ac0..c4de111fd0 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > @@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> >
> > static inline void *
> > IFUNC_SELECTOR (void)
> > {
> > const struct cpu_features* cpu_features = __get_cpu_features ();
> >
> > + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > + {
> > + if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > + return OPTIMIZE (avx2_rtm);
> > +
> > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > + return OPTIMIZE (avx2);
> > + }
> > +
> > if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > return OPTIMIZE (avx);
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> > new file mode 100644
> > index 0000000000..09957fc3c5
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> > @@ -0,0 +1,15 @@
> > +#ifndef STRCMP
> > +# define STRCMP __strcasecmp_l_avx2_rtm
> > +#endif
> > +
> > +#define _GLABEL(x) x ## _rtm
> > +#define GLABEL(x) _GLABEL(x)
> > +
> > +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> > + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> > +
> > +#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
> > +
> > +#define SECTION(p) p##.avx.rtm
> > +
> > +#include "strcasecmp_l-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> > new file mode 100644
> > index 0000000000..e2762f2a22
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> > @@ -0,0 +1,23 @@
> > +/* strcasecmp_l optimized with AVX2.
> > + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#ifndef STRCMP
> > +# define STRCMP __strcasecmp_l_avx2
> > +#endif
> > +#define USE_AS_STRCASECMP_L
> > +#include "strcmp-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > index 86a86b68e3..8da09bd86d 100644
> > --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> > @@ -20,6 +20,10 @@
> >
> > # include <sysdep.h>
> >
> > +# if defined USE_AS_STRCASECMP_L
> > +# include "locale-defines.h"
> > +# endif
> > +
> > # ifndef STRCMP
> > # define STRCMP __strcmp_avx2
> > # endif
> > @@ -74,13 +78,88 @@
> > # define VEC_OFFSET (-VEC_SIZE)
> > # endif
> >
> > +# ifdef USE_AS_STRCASECMP_L
> > +# define BYTE_LOOP_REG OFFSET_REG
> > +# else
> > +# define BYTE_LOOP_REG ecx
> > +# endif
> > +
> > +# ifdef USE_AS_STRCASECMP_L
> > +# ifdef USE_AS_STRNCMP
> > +# define STRCASECMP __strncasecmp_avx2
> > +# define LOCALE_REG rcx
> > +# define LOCALE_REG_LP RCX_LP
> > +# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> > +# else
> > +# define STRCASECMP __strcasecmp_avx2
> > +# define LOCALE_REG rdx
> > +# define LOCALE_REG_LP RDX_LP
> > +# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> > +# endif
> > +# endif
> > +
> > # define xmmZERO xmm15
> > # define ymmZERO ymm15
> >
> > +# define LCASE_MIN_ymm %ymm10
> > +# define LCASE_MAX_ymm %ymm11
> > +# define CASE_ADD_ymm %ymm12
> > +
> > +# define LCASE_MIN_xmm %xmm10
> > +# define LCASE_MAX_xmm %xmm11
> > +# define CASE_ADD_xmm %xmm12
> > +
> > + /* r11 is never use elsewhere so this is safe to maintain. */
> > +# define TOLOWER_BASE %r11
> > +
> > # ifndef SECTION
> > # define SECTION(p) p##.avx
> > # endif
> >
> > +# ifdef USE_AS_STRCASECMP_L
> > +# define REG(x, y) x ## y
> > +# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
> > + vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
> > + vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
> > + vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
> > + vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
> > + vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
> > + vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
> > + vpaddb REG(%ext, 8), reg1_in, reg1_out; \
> > + vpaddb REG(%ext, 9), reg2_in, reg2_out
> > +
> > +# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> > +# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
> > +# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)
> > +
> > +# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
> > + TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
> > + VPCMPEQ scratch_reg, s2_reg, reg_out
> > +
> > +# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
> > + VMOVU s2_mem, reg_out; \
> > + CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
> > +
> > +# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
> > +# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
> > +
> > +# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
> > +# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
> > +
> > +# else
> > +# define TOLOWER_gpr(...)
> > +# define TOLOWER_ymm(...)
> > +# define TOLOWER_xmm(...)
> > +
> > +# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
> > + VPCMPEQ s2_reg, s1_reg, reg_out
> > +
> > +# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
> > +
> > +# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
> > +# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
> > +# endif
> > +
> > /* Warning!
> > wcscmp/wcsncmp have to use SIGNED comparison for elements.
> > strcmp/strncmp have to use UNSIGNED comparison for elements.
> > @@ -102,8 +181,49 @@
> > returned. */
> >
> > .section SECTION(.text), "ax", @progbits
> > -ENTRY(STRCMP)
> > + .align 16
> > + .type STRCMP, @function
> > + .globl STRCMP
> > + .hidden STRCMP
> > +
> > +# ifndef GLABEL
> > +# define GLABEL(...) __VA_ARGS__
> > +# endif
> > +
> > +# ifdef USE_AS_STRCASECMP_L
> > +ENTRY (GLABEL(STRCASECMP))
> > + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
> > + mov %fs:(%rax), %LOCALE_REG_LP
> > +
> > +	/* Either 1 or 5 bytes (depending if CET is enabled).  */
> > + .p2align 4
> > +END (GLABEL(STRCASECMP))
> > + /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
> > +# endif
> > +
> > + .p2align 4
> > +STRCMP:
> > + cfi_startproc
> > + _CET_ENDBR
> > + CALL_MCOUNT
> > +
> > +# if defined USE_AS_STRCASECMP_L
> > + /* We have to fall back on the C implementation for locales with
> > + encodings not matching ASCII for single bytes. */
> > +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> > + mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> > +# else
> > + mov (%LOCALE_REG), %RAX_LP
> > +# endif
> > + testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> > + jne STRCASECMP_NONASCII
> > + leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> > +# endif
> > +
> > # ifdef USE_AS_STRNCMP
> > +	/* Don't overwrite LOCALE_REG (rcx) until we have passed
> > + L(one_or_less). Otherwise we might use the wrong locale in
> > + the OVERFLOW_STRCMP (strcasecmp_l). */
> > # ifdef __ILP32__
> > /* Clear the upper 32 bits. */
> > movl %edx, %edx
> > @@ -128,6 +248,30 @@ ENTRY(STRCMP)
> > # endif
> > # endif
> > vpxor %xmmZERO, %xmmZERO, %xmmZERO
> > +# if defined USE_AS_STRCASECMP_L
> > + .section .rodata.cst32, "aM", @progbits, 32
> > + .align 32
> > +L(lcase_min):
> > + .quad 0x3f3f3f3f3f3f3f3f
> > + .quad 0x3f3f3f3f3f3f3f3f
> > + .quad 0x3f3f3f3f3f3f3f3f
> > + .quad 0x3f3f3f3f3f3f3f3f
> > +L(lcase_max):
> > + .quad 0x9999999999999999
> > + .quad 0x9999999999999999
> > + .quad 0x9999999999999999
> > + .quad 0x9999999999999999
> > +L(case_add):
> > + .quad 0x2020202020202020
> > + .quad 0x2020202020202020
> > + .quad 0x2020202020202020
> > + .quad 0x2020202020202020
> > + .previous
> > +
> > + vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
> > + vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
> > + vmovdqa L(case_add)(%rip), CASE_ADD_ymm
> > +# endif
> > movl %edi, %eax
> > orl %esi, %eax
> > sall $20, %eax
> > @@ -138,8 +282,10 @@ ENTRY(STRCMP)
> > L(no_page_cross):
> > /* Safe to compare 4x vectors. */
> > VMOVU (%rdi), %ymm0
> > - /* 1s where s1 and s2 equal. */
> > - VPCMPEQ (%rsi), %ymm0, %ymm1
> > +	/* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
> > + Otherwise converts ymm0 and load from rsi to lower. ymm2 is
> > + scratch and ymm1 is the return. */
> > + CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
> > /* 1s at null CHAR. */
> > VPCMPEQ %ymm0, %ymmZERO, %ymm2
> > /* 1s where s1 and s2 equal AND not null CHAR. */
> > @@ -172,6 +318,8 @@ L(return_vec_0):
> > # else
> > movzbl (%rdi, %rcx), %eax
> > movzbl (%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > # endif
> > L(ret0):
> > @@ -192,6 +340,10 @@ L(ret_zero):
> >
> > .p2align 4,, 5
> > L(one_or_less):
> > +# ifdef USE_AS_STRCASECMP_L
> > + /* Set locale argument for strcasecmp. */
> > + movq %LOCALE_REG, %rdx
> > +# endif
> > jb L(ret_zero)
> > /* 'nbe' covers the case where length is negative (large
> > unsigned). */
> > @@ -207,6 +359,8 @@ L(one_or_less):
> > # else
> > movzbl (%rdi), %eax
> > movzbl (%rsi), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > # endif
> > L(ret1):
> > @@ -234,6 +388,8 @@ L(return_vec_1):
> > # else
> > movzbl VEC_SIZE(%rdi, %rcx), %eax
> > movzbl VEC_SIZE(%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > # endif
> > L(ret2):
> > @@ -265,6 +421,8 @@ L(return_vec_2):
> > # else
> > movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
> > movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > # endif
> > L(ret3):
> > @@ -285,6 +443,8 @@ L(return_vec_3):
> > # else
> > movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
> > movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > # endif
> > L(ret4):
> > @@ -295,7 +455,7 @@ L(ret4):
> > L(more_3x_vec):
> > /* Safe to compare 4x vectors. */
> > VMOVU VEC_SIZE(%rdi), %ymm0
> > - VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
> > + CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
> > VPCMPEQ %ymm0, %ymmZERO, %ymm2
> > vpandn %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %ecx
> > @@ -308,7 +468,7 @@ L(more_3x_vec):
> > # endif
> >
> > VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
> > - VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
> > + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
> > VPCMPEQ %ymm0, %ymmZERO, %ymm2
> > vpandn %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %ecx
> > @@ -316,7 +476,7 @@ L(more_3x_vec):
> > jnz L(return_vec_2)
> >
> > VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
> > - VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
> > + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
> > VPCMPEQ %ymm0, %ymmZERO, %ymm2
> > vpandn %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %ecx
> > @@ -391,12 +551,10 @@ L(loop_skip_page_cross_check):
> > VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
> >
> > /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */
> > - VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
> > -
> > - VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
> > - VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
> > - VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
> > -
> > + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
> > + CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
> > + CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
> > + CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
> >
> > /* If any mismatches or null CHAR then 0 CHAR, otherwise non-
> > zero. */
> > @@ -465,6 +623,8 @@ L(return_vec_2_3_end):
> > # else
> > movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
> > movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > xorl %r8d, %eax
> > subl %r8d, %eax
> > @@ -508,6 +668,8 @@ L(return_vec_0_end):
> > # else
> > movzbl (%rdi, %rcx), %eax
> > movzbl (%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > xorl %r8d, %eax
> > subl %r8d, %eax
> > @@ -530,6 +692,8 @@ L(return_vec_1_end):
> > # else
> > movzbl VEC_SIZE(%rdi, %rcx), %eax
> > movzbl VEC_SIZE(%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > xorl %r8d, %eax
> > subl %r8d, %eax
> > @@ -556,6 +720,8 @@ L(return_vec_2_end):
> > # else
> > movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
> > movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > xorl %r8d, %eax
> > subl %r8d, %eax
> > @@ -583,7 +749,7 @@ L(page_cross_during_loop):
> > jle L(less_1x_vec_till_page_cross)
> >
> > VMOVA (%rdi), %ymm0
> > - VPCMPEQ (%rsi), %ymm0, %ymm1
> > + CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
> > VPCMPEQ %ymm0, %ymmZERO, %ymm2
> > vpandn %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %ecx
> > @@ -605,7 +771,7 @@ L(less_1x_vec_till_page_cross):
> > here, it means the previous page (rdi - VEC_SIZE) has already
> > been loaded earlier so must be valid. */
> > VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
> > - VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
> > + CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
> > VPCMPEQ %ymm0, %ymmZERO, %ymm2
> > vpandn %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %ecx
> > @@ -647,6 +813,8 @@ L(return_page_cross_cmp_mem):
> > # else
> > movzbl VEC_OFFSET(%rdi, %rcx), %eax
> > movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > xorl %r8d, %eax
> > subl %r8d, %eax
> > @@ -673,7 +841,7 @@ L(more_2x_vec_till_page_cross):
> > iteration here. */
> >
> > VMOVU VEC_SIZE(%rdi), %ymm0
> > - VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
> > + CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
> > VPCMPEQ %ymm0, %ymmZERO, %ymm2
> > vpandn %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %ecx
> > @@ -689,7 +857,7 @@ L(more_2x_vec_till_page_cross):
> >
> > /* Safe to include comparisons from lower bytes. */
> > VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
> > - VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
> > + CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
> > VPCMPEQ %ymm0, %ymmZERO, %ymm2
> > vpandn %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %ecx
> > @@ -697,7 +865,7 @@ L(more_2x_vec_till_page_cross):
> > jnz L(return_vec_page_cross_0)
> >
> > VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
> > - VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
> > + CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
> > VPCMPEQ %ymm0, %ymmZERO, %ymm2
> > vpandn %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %ecx
> > @@ -715,8 +883,8 @@ L(more_2x_vec_till_page_cross):
> > VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
> > VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
> >
> > - VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
> > - VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
> > + CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
> > + CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
> > vpand %ymm4, %ymm5, %ymm5
> > vpand %ymm6, %ymm7, %ymm7
> > VPMINU %ymm5, %ymm7, %ymm7
> > @@ -767,6 +935,8 @@ L(return_vec_page_cross_1):
> > # else
> > movzbl VEC_OFFSET(%rdi, %rcx), %eax
> > movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > xorl %r8d, %eax
> > subl %r8d, %eax
> > @@ -822,7 +992,7 @@ L(page_cross):
> > L(page_cross_loop):
> >
> > VMOVU (%rdi, %OFFSET_REG64), %ymm0
> > - VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
> > + CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
> > VPCMPEQ %ymm0, %ymmZERO, %ymm2
> > vpandn %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %ecx
> > @@ -840,11 +1010,11 @@ L(page_cross_loop):
> > subl %eax, %OFFSET_REG
> > /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
> > to not cross page so is safe to load. Since we have already
> > - loaded at least 1 VEC from rsi it is also guranteed to be safe.
> > - */
> > +       loaded at least 1 VEC from rsi it is also guaranteed to be
> > + safe. */
> >
> > VMOVU (%rdi, %OFFSET_REG64), %ymm0
> > - VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
> > + CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
> > VPCMPEQ %ymm0, %ymmZERO, %ymm2
> > vpandn %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %ecx
> > @@ -877,6 +1047,8 @@ L(ret_vec_page_cross_cont):
> > # else
> > movzbl (%rdi, %rcx), %eax
> > movzbl (%rsi, %rcx), %ecx
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %ecx)
> > subl %ecx, %eax
> > xorl %r8d, %eax
> > subl %r8d, %eax
> > @@ -930,7 +1102,7 @@ L(less_1x_vec_till_page):
> > ja L(less_16_till_page)
> >
> > VMOVU (%rdi), %xmm0
> > - VPCMPEQ (%rsi), %xmm0, %xmm1
> > + CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
> > VPCMPEQ %xmm0, %xmmZERO, %xmm2
> > vpandn %xmm1, %xmm2, %xmm1
> > vpmovmskb %ymm1, %ecx
> > @@ -948,7 +1120,7 @@ L(less_1x_vec_till_page):
> > # endif
> >
> > VMOVU (%rdi, %OFFSET_REG64), %xmm0
> > - VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1
> > + CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
> > VPCMPEQ %xmm0, %xmmZERO, %xmm2
> > vpandn %xmm1, %xmm2, %xmm1
> > vpmovmskb %ymm1, %ecx
> > @@ -986,7 +1158,7 @@ L(less_16_till_page):
> > vmovq (%rdi), %xmm0
> > vmovq (%rsi), %xmm1
> > VPCMPEQ %xmm0, %xmmZERO, %xmm2
> > - VPCMPEQ %xmm1, %xmm0, %xmm1
> > + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> > vpandn %xmm1, %xmm2, %xmm1
> > vpmovmskb %ymm1, %ecx
> > incb %cl
> > @@ -1006,7 +1178,7 @@ L(less_16_till_page):
> > vmovq (%rdi, %OFFSET_REG64), %xmm0
> > vmovq (%rsi, %OFFSET_REG64), %xmm1
> > VPCMPEQ %xmm0, %xmmZERO, %xmm2
> > - VPCMPEQ %xmm1, %xmm0, %xmm1
> > + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> > vpandn %xmm1, %xmm2, %xmm1
> > vpmovmskb %ymm1, %ecx
> > incb %cl
> > @@ -1062,7 +1234,7 @@ L(ret_less_8_wcs):
> > vmovd (%rdi), %xmm0
> > vmovd (%rsi), %xmm1
> > VPCMPEQ %xmm0, %xmmZERO, %xmm2
> > - VPCMPEQ %xmm1, %xmm0, %xmm1
> > + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> > vpandn %xmm1, %xmm2, %xmm1
> > vpmovmskb %ymm1, %ecx
> > subl $0xf, %ecx
> > @@ -1081,7 +1253,7 @@ L(ret_less_8_wcs):
> > vmovd (%rdi, %OFFSET_REG64), %xmm0
> > vmovd (%rsi, %OFFSET_REG64), %xmm1
> > VPCMPEQ %xmm0, %xmmZERO, %xmm2
> > - VPCMPEQ %xmm1, %xmm0, %xmm1
> > + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> > vpandn %xmm1, %xmm2, %xmm1
> > vpmovmskb %ymm1, %ecx
> > subl $0xf, %ecx
> > @@ -1115,7 +1287,9 @@ L(less_4_till_page):
> > L(less_4_loop):
> > movzbl (%rdi), %eax
> > movzbl (%rsi, %rdi), %ecx
> > - subl %ecx, %eax
> > + TOLOWER_gpr (%rax, %eax)
> > + TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> > + subl %BYTE_LOOP_REG, %eax
> > jnz L(ret_less_4_loop)
> > testl %ecx, %ecx
> > jz L(ret_zero_4_loop)
> > @@ -1142,5 +1316,6 @@ L(ret_less_4_loop):
> > subl %r8d, %eax
> > ret
> > # endif
> > -END(STRCMP)
> > + cfi_endproc
> > + .size STRCMP, .-STRCMP
> > #endif
> > diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> > new file mode 100644
> > index 0000000000..58c05dcfb8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> > @@ -0,0 +1,16 @@
> > +#ifndef STRCMP
> > +# define STRCMP __strncasecmp_l_avx2_rtm
> > +#endif
> > +
> > +#define _GLABEL(x) x ## _rtm
> > +#define GLABEL(x) _GLABEL(x)
> > +
> > +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> > + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> > +
> > +#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
> > +
> > +#define SECTION(p) p##.avx.rtm
> > +#define OVERFLOW_STRCMP __strcasecmp_l_avx2_rtm
> > +
> > +#include "strncase_l-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> > new file mode 100644
> > index 0000000000..48c0aa21f8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> > @@ -0,0 +1,27 @@
> > +/* strncasecmp_l optimized with AVX2.
> > + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#ifndef STRCMP
> > +# define STRCMP __strncasecmp_l_avx2
> > +#endif
> > +#define USE_AS_STRCASECMP_L
> > +#define USE_AS_STRNCMP
> > +#ifndef OVERFLOW_STRCMP
> > +# define OVERFLOW_STRCMP __strcasecmp_l_avx2
> > +#endif
> > +#include "strcmp-avx2.S"
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
Conflict resolution patch attached.
--Sunil
[-- Attachment #2: 0015-x86-Add-AVX2-optimized-str-n-casecmp.patch --]
[-- Type: application/octet-stream, Size: 24673 bytes --]
From b382e4caf50dfee62e170f9b6617b470b1289dcb Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 24 Mar 2022 18:56:12 -0500
Subject: [PATCH 15/26] x86: Add AVX2 optimized str{n}casecmp
geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit bbf81222343fed5cd704001a2ae0d86c71544151)
---
sysdeps/x86_64/multiarch/Makefile | 4 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 +
.../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++
sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++
sysdeps/x86_64/multiarch/strcmp-avx2.S | 237 +++++++++++++++---
.../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++
sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++
8 files changed, 331 insertions(+), 31 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7b413edad..06e1848823 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -55,6 +55,8 @@ sysdep_routines += \
stpncpy-sse2-unaligned \
stpncpy-ssse3 \
strcasecmp_l-avx \
+ strcasecmp_l-avx2 \
+ strcasecmp_l-avx2-rtm \
strcasecmp_l-sse2 \
strcasecmp_l-sse4_2 \
strcasecmp_l-ssse3 \
@@ -93,6 +95,8 @@ sysdep_routines += \
strlen-evex \
strlen-sse2 \
strncase_l-avx \
+ strncase_l-avx2 \
+ strncase_l-avx2-rtm \
strncase_l-sse2 \
strncase_l-sse4_2 \
strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a594f4176e..3c556d07ac 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strcasecmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strcasecmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (AVX),
__strcasecmp_avx)
@@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strcasecmp_l_avx2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strcasecmp_l_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcasecmp_l,
CPU_FEATURE_USABLE (AVX),
__strcasecmp_l_avx)
@@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strncasecmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strncasecmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (AVX),
__strncasecmp_avx)
@@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strncasecmp_l_avx2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strncasecmp_l_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncasecmp_l,
CPU_FEATURE_USABLE (AVX),
__strncasecmp_l_avx)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 9e3cc61ac0..c4de111fd0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
+
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
return OPTIMIZE (avx);
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
new file mode 100644
index 0000000000..09957fc3c5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
@@ -0,0 +1,15 @@
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x) x ## _rtm
+#define GLABEL(x) _GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
new file mode 100644
index 0000000000..e2762f2a22
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with AVX2.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index b94fc5c39a..3366d0b083 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -20,6 +20,10 @@
# include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# endif
+
# ifndef STRCMP
# define STRCMP __strcmp_avx2
# endif
@@ -74,13 +78,88 @@
# define VEC_OFFSET (-VEC_SIZE)
# endif
+# ifdef USE_AS_STRCASECMP_L
+# define BYTE_LOOP_REG OFFSET_REG
+# else
+# define BYTE_LOOP_REG ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRNCMP
+# define STRCASECMP __strncasecmp_avx2
+# define LOCALE_REG rcx
+# define LOCALE_REG_LP RCX_LP
+# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
+# else
+# define STRCASECMP __strcasecmp_avx2
+# define LOCALE_REG rdx
+# define LOCALE_REG_LP RDX_LP
+# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
+# endif
+# endif
+
# define xmmZERO xmm15
# define ymmZERO ymm15
+# define LCASE_MIN_ymm %ymm10
+# define LCASE_MAX_ymm %ymm11
+# define CASE_ADD_ymm %ymm12
+
+# define LCASE_MIN_xmm %xmm10
+# define LCASE_MAX_xmm %xmm11
+# define CASE_ADD_xmm %xmm12
+
+	/* r11 is never used elsewhere so this is safe to maintain.  */
+# define TOLOWER_BASE %r11
+
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
+# ifdef USE_AS_STRCASECMP_L
+# define REG(x, y) x ## y
+# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
+ vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
+ vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
+ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
+ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
+ vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
+ vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
+ vpaddb REG(%ext, 8), reg1_in, reg1_out; \
+ vpaddb REG(%ext, 9), reg2_in, reg2_out
+
+# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
+# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)
+
+# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
+ TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
+ VPCMPEQ scratch_reg, s2_reg, reg_out
+
+# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
+ VMOVU s2_mem, reg_out; \
+ CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+# define TOLOWER_gpr(...)
+# define TOLOWER_ymm(...)
+# define TOLOWER_xmm(...)
+
+# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
+ VPCMPEQ s2_reg, s1_reg, reg_out
+
+# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -102,8 +181,49 @@
returned. */
.section SECTION(.text), "ax", @progbits
-ENTRY(STRCMP)
+ .align 16
+ .type STRCMP, @function
+ .globl STRCMP
+ .hidden STRCMP
+
+# ifndef GLABEL
+# define GLABEL(...) __VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
+ mov %fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending if CET is enabled).  */
+ .p2align 4
+END (GLABEL(STRCASECMP))
+ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
+# endif
+
+ .p2align 4
+STRCMP:
+ cfi_startproc
+ _CET_ENDBR
+ CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales with
+ encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+# else
+ mov (%LOCALE_REG), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+ jne STRCASECMP_NONASCII
+ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
# ifdef USE_AS_STRNCMP
+	/* Don't overwrite LOCALE_REG (rcx) until we have passed
+ L(one_or_less). Otherwise we might use the wrong locale in
+ the OVERFLOW_STRCMP (strcasecmp_l). */
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -128,6 +248,30 @@ ENTRY(STRCMP)
# endif
# endif
vpxor %xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+ .section .rodata.cst32, "aM", @progbits, 32
+ .align 32
+L(lcase_min):
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+L(case_add):
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+
+ vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
+ vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
+ vmovdqa L(case_add)(%rip), CASE_ADD_ymm
+# endif
movl %edi, %eax
orl %esi, %eax
sall $20, %eax
@@ -138,8 +282,10 @@ ENTRY(STRCMP)
L(no_page_cross):
/* Safe to compare 4x vectors. */
VMOVU (%rdi), %ymm0
- /* 1s where s1 and s2 equal. */
- VPCMPEQ (%rsi), %ymm0, %ymm1
+	/* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
+ Otherwise converts ymm0 and load from rsi to lower. ymm2 is
+ scratch and ymm1 is the return. */
+ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
/* 1s at null CHAR. */
VPCMPEQ %ymm0, %ymmZERO, %ymm2
/* 1s where s1 and s2 equal AND not null CHAR. */
@@ -172,6 +318,8 @@ L(return_vec_0):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret0):
@@ -192,6 +340,10 @@ L(ret_zero):
.p2align 4,, 5
L(one_or_less):
+# ifdef USE_AS_STRCASECMP_L
+ /* Set locale argument for strcasecmp. */
+ movq %LOCALE_REG, %rdx
+# endif
jb L(ret_zero)
# ifdef USE_AS_WCSCMP
/* 'nbe' covers the case where length is negative (large
@@ -211,6 +363,8 @@ L(one_or_less):
jnbe __strcmp_avx2
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret1):
@@ -238,6 +392,8 @@ L(return_vec_1):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret2):
@@ -269,6 +425,8 @@ L(return_vec_2):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret3):
@@ -289,6 +447,8 @@ L(return_vec_3):
# else
movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret4):
@@ -299,7 +459,7 @@ L(ret4):
L(more_3x_vec):
/* Safe to compare 4x vectors. */
VMOVU VEC_SIZE(%rdi), %ymm0
- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -312,7 +472,7 @@ L(more_3x_vec):
# endif
VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -320,7 +480,7 @@ L(more_3x_vec):
jnz L(return_vec_2)
VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -395,12 +555,10 @@ L(loop_skip_page_cross_check):
VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */
- VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
-
- VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
-
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+ CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
zero. */
@@ -469,6 +627,8 @@ L(return_vec_2_3_end):
# else
movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -512,6 +672,8 @@ L(return_vec_0_end):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -534,6 +696,8 @@ L(return_vec_1_end):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -560,6 +724,8 @@ L(return_vec_2_end):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -587,7 +753,7 @@ L(page_cross_during_loop):
jle L(less_1x_vec_till_page_cross)
VMOVA (%rdi), %ymm0
- VPCMPEQ (%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -609,7 +775,7 @@ L(less_1x_vec_till_page_cross):
here, it means the previous page (rdi - VEC_SIZE) has already
been loaded earlier so must be valid. */
VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
- VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -651,6 +817,8 @@ L(return_page_cross_cmp_mem):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -677,7 +845,7 @@ L(more_2x_vec_till_page_cross):
iteration here. */
VMOVU VEC_SIZE(%rdi), %ymm0
- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -693,7 +861,7 @@ L(more_2x_vec_till_page_cross):
/* Safe to include comparisons from lower bytes. */
VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
- VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -701,7 +869,7 @@ L(more_2x_vec_till_page_cross):
jnz L(return_vec_page_cross_0)
VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
- VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -719,8 +887,8 @@ L(more_2x_vec_till_page_cross):
VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
+ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
vpand %ymm4, %ymm5, %ymm5
vpand %ymm6, %ymm7, %ymm7
VPMINU %ymm5, %ymm7, %ymm7
@@ -771,6 +939,8 @@ L(return_vec_page_cross_1):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -826,7 +996,7 @@ L(page_cross):
L(page_cross_loop):
VMOVU (%rdi, %OFFSET_REG64), %ymm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -844,11 +1014,11 @@ L(page_cross_loop):
subl %eax, %OFFSET_REG
/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
to not cross page so is safe to load. Since we have already
- loaded at least 1 VEC from rsi it is also guaranteed to be safe.
- */
+ loaded at least 1 VEC from rsi it is also guaranteed to be
+ safe. */
VMOVU (%rdi, %OFFSET_REG64), %ymm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -881,6 +1051,8 @@ L(ret_vec_page_cross_cont):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -934,7 +1106,7 @@ L(less_1x_vec_till_page):
ja L(less_16_till_page)
VMOVU (%rdi), %xmm0
- VPCMPEQ (%rsi), %xmm0, %xmm1
+ CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
VPCMPEQ %xmm0, %xmmZERO, %xmm2
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
@@ -952,7 +1124,7 @@ L(less_1x_vec_till_page):
# endif
VMOVU (%rdi, %OFFSET_REG64), %xmm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1
+ CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
VPCMPEQ %xmm0, %xmmZERO, %xmm2
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
@@ -990,7 +1162,7 @@ L(less_16_till_page):
vmovq (%rdi), %xmm0
vmovq (%rsi), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
incb %cl
@@ -1010,7 +1182,7 @@ L(less_16_till_page):
vmovq (%rdi, %OFFSET_REG64), %xmm0
vmovq (%rsi, %OFFSET_REG64), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
incb %cl
@@ -1066,7 +1238,7 @@ L(ret_less_8_wcs):
vmovd (%rdi), %xmm0
vmovd (%rsi), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
subl $0xf, %ecx
@@ -1085,7 +1257,7 @@ L(ret_less_8_wcs):
vmovd (%rdi, %OFFSET_REG64), %xmm0
vmovd (%rsi, %OFFSET_REG64), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
subl $0xf, %ecx
@@ -1119,7 +1291,9 @@ L(less_4_till_page):
L(less_4_loop):
movzbl (%rdi), %eax
movzbl (%rsi, %rdi), %ecx
- subl %ecx, %eax
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+ subl %BYTE_LOOP_REG, %eax
jnz L(ret_less_4_loop)
testl %ecx, %ecx
jz L(ret_zero_4_loop)
@@ -1146,5 +1320,6 @@ L(ret_less_4_loop):
subl %r8d, %eax
ret
# endif
-END(STRCMP)
+ cfi_endproc
+ .size STRCMP, .-STRCMP
#endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
new file mode 100644
index 0000000000..58c05dcfb8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
@@ -0,0 +1,16 @@
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x) x ## _rtm
+#define GLABEL(x) _GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+#define OVERFLOW_STRCMP __strcasecmp_l_avx2_rtm
+
+#include "strncase_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
new file mode 100644
index 0000000000..48c0aa21f8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
@@ -0,0 +1,27 @@
+/* strncasecmp_l optimized with AVX2.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __strcasecmp_l_avx2
+#endif
+#include "strcmp-avx2.S"
--
2.35.1
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v4 22/23] x86: Add EVEX optimized str{n}casecmp
2022-05-12 19:47 ` Sunil Pandey
@ 2022-05-12 19:52 ` Sunil Pandey
0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:52 UTC (permalink / raw)
To: Noah Goldstein, Libc-stable Mailing List; +Cc: H.J. Lu, GNU C Library
On Thu, May 12, 2022 at 12:47 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Fri, Mar 25, 2022 at 11:20 AM Noah Goldstein via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > On Fri, Mar 25, 2022 at 1:15 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Thu, Mar 24, 2022 at 4:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
> > > >
> > > > All string/memory tests pass.
> > > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > > > ---
> > > > sysdeps/x86_64/multiarch/Makefile | 2 +
> > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +
> > > > sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 +
> > > > sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++
> > > > sysdeps/x86_64/multiarch/strcmp-evex.S | 290 ++++++++++++++++---
> > > > sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++
> > > > 6 files changed, 321 insertions(+), 40 deletions(-)
> > > > create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > > > create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > > index 06e1848823..35d80dc2ff 100644
> > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > @@ -57,6 +57,7 @@ sysdep_routines += \
> > > > strcasecmp_l-avx \
> > > > strcasecmp_l-avx2 \
> > > > strcasecmp_l-avx2-rtm \
> > > > + strcasecmp_l-evex \
> > > > strcasecmp_l-sse2 \
> > > > strcasecmp_l-sse4_2 \
> > > > strcasecmp_l-ssse3 \
> > > > @@ -97,6 +98,7 @@ sysdep_routines += \
> > > > strncase_l-avx \
> > > > strncase_l-avx2 \
> > > > strncase_l-avx2-rtm \
> > > > + strncase_l-evex \
> > > > strncase_l-sse2 \
> > > > strncase_l-sse4_2 \
> > > > strncase_l-ssse3 \
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > index 3c556d07ac..f1a4d3dac2 100644
> > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > @@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >
> > > > /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> > > > IFUNC_IMPL (i, name, strcasecmp,
> > > > + IFUNC_IMPL_ADD (array, i, strcasecmp,
> > > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > > + && CPU_FEATURE_USABLE (AVX512BW)),
> > > > + __strcasecmp_evex)
> > > > IFUNC_IMPL_ADD (array, i, strcasecmp,
> > > > CPU_FEATURE_USABLE (AVX2),
> > > > __strcasecmp_avx2)
> > > > @@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >
> > > > /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> > > > IFUNC_IMPL (i, name, strcasecmp_l,
> > > > + IFUNC_IMPL_ADD (array, i, strcasecmp,
> > > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > > + && CPU_FEATURE_USABLE (AVX512BW)),
> > > > + __strcasecmp_l_evex)
> > > > IFUNC_IMPL_ADD (array, i, strcasecmp,
> > > > CPU_FEATURE_USABLE (AVX2),
> > > > __strcasecmp_l_avx2)
> > > > @@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >
> > > > /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> > > > IFUNC_IMPL (i, name, strncasecmp,
> > > > + IFUNC_IMPL_ADD (array, i, strncasecmp,
> > > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > > + && CPU_FEATURE_USABLE (AVX512BW)),
> > > > + __strncasecmp_evex)
> > > > IFUNC_IMPL_ADD (array, i, strncasecmp,
> > > > CPU_FEATURE_USABLE (AVX2),
> > > > __strncasecmp_avx2)
> > > > @@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >
> > > > /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> > > > IFUNC_IMPL (i, name, strncasecmp_l,
> > > > + IFUNC_IMPL_ADD (array, i, strncasecmp,
> > > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > > + && CPU_FEATURE_USABLE (AVX512BW)),
> > > > + __strncasecmp_l_evex)
> > > > IFUNC_IMPL_ADD (array, i, strncasecmp,
> > > > CPU_FEATURE_USABLE (AVX2),
> > > > __strncasecmp_l_avx2)
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > > index c4de111fd0..bf0d146e7f 100644
> > > > --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > > @@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> > > > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> > > > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > > >
> > > > static inline void *
> > > > IFUNC_SELECTOR (void)
> > > > @@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
> > > > if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > > && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > > {
> > > > + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > > > + return OPTIMIZE (evex);
> > > > +
> > > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > > > return OPTIMIZE (avx2_rtm);
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > > > new file mode 100644
> > > > index 0000000000..58642db748
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> > > > @@ -0,0 +1,23 @@
> > > > +/* strcasecmp_l optimized with EVEX.
> > > > + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > > > + This file is part of the GNU C Library.
> > > > +
> > > > + The GNU C Library is free software; you can redistribute it and/or
> > > > + modify it under the terms of the GNU Lesser General Public
> > > > + License as published by the Free Software Foundation; either
> > > > + version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > + The GNU C Library is distributed in the hope that it will be useful,
> > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > > > + Lesser General Public License for more details.
> > > > +
> > > > + You should have received a copy of the GNU Lesser General Public
> > > > + License along with the GNU C Library; if not, see
> > > > + <https://www.gnu.org/licenses/>. */
> > > > +
> > > > +#ifndef STRCMP
> > > > +# define STRCMP __strcasecmp_l_evex
> > > > +#endif
> > > > +#define USE_AS_STRCASECMP_L
> > > > +#include "strcmp-evex.S"
> > > > diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
> > > > index 56d8c118e4..2a5b3ce037 100644
> > > > --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
> > > > +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
> > > > @@ -19,6 +19,9 @@
> > > > #if IS_IN (libc)
> > > >
> > > > # include <sysdep.h>
> > > > +# if defined USE_AS_STRCASECMP_L
> > > > +# include "locale-defines.h"
> > > > +# endif
> > > >
> > > > # ifndef STRCMP
> > > > # define STRCMP __strcmp_evex
> > > > @@ -34,19 +37,29 @@
> > > > # define VMOVA vmovdqa64
> > > >
> > > > # ifdef USE_AS_WCSCMP
> > > > -# define TESTEQ subl $0xff,
> > > > +# ifndef OVERFLOW_STRCMP
> > > > +# define OVERFLOW_STRCMP __wcscmp_evex
> > > > +# endif
> > > > +
> > > > +# define TESTEQ subl $0xff,
> > > > /* Compare packed dwords. */
> > > > # define VPCMP vpcmpd
> > > > # define VPMINU vpminud
> > > > # define VPTESTM vptestmd
> > > > +# define VPTESTNM vptestnmd
> > > > /* 1 dword char == 4 bytes. */
> > > > # define SIZE_OF_CHAR 4
> > > > # else
> > > > +# ifndef OVERFLOW_STRCMP
> > > > +# define OVERFLOW_STRCMP __strcmp_evex
> > > > +# endif
> > > > +
> > > > # define TESTEQ incl
> > > > /* Compare packed bytes. */
> > > > # define VPCMP vpcmpb
> > > > # define VPMINU vpminub
> > > > # define VPTESTM vptestmb
> > > > +# define VPTESTNM vptestnmb
> > > > /* 1 byte char == 1 byte. */
> > > > # define SIZE_OF_CHAR 1
> > > > # endif
> > > > @@ -73,11 +86,16 @@
> > > > # define VEC_OFFSET (-VEC_SIZE)
> > > > # endif
> > > >
> > > > -# define XMMZERO xmm16
> > > > # define XMM0 xmm17
> > > > # define XMM1 xmm18
> > > >
> > > > -# define YMMZERO ymm16
> > > > +# define XMM10 xmm27
> > > > +# define XMM11 xmm28
> > > > +# define XMM12 xmm29
> > > > +# define XMM13 xmm30
> > > > +# define XMM14 xmm31
> > > > +
> > > > +
> > > > # define YMM0 ymm17
> > > > # define YMM1 ymm18
> > > > # define YMM2 ymm19
> > > > @@ -89,6 +107,87 @@
> > > > # define YMM8 ymm25
> > > > # define YMM9 ymm26
> > > > # define YMM10 ymm27
> > > > +# define YMM11 ymm28
> > > > +# define YMM12 ymm29
> > > > +# define YMM13 ymm30
> > > > +# define YMM14 ymm31
> > > > +
> > > > +# ifdef USE_AS_STRCASECMP_L
> > > > +# define BYTE_LOOP_REG OFFSET_REG
> > > > +# else
> > > > +# define BYTE_LOOP_REG ecx
> > > > +# endif
> > > > +
> > > > +# ifdef USE_AS_STRCASECMP_L
> > > > +# ifdef USE_AS_STRNCMP
> > > > +# define STRCASECMP __strncasecmp_evex
> > > > +# define LOCALE_REG rcx
> > > > +# define LOCALE_REG_LP RCX_LP
> > > > +# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> > > > +# else
> > > > +# define STRCASECMP __strcasecmp_evex
> > > > +# define LOCALE_REG rdx
> > > > +# define LOCALE_REG_LP RDX_LP
> > > > +# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> > > > +# endif
> > > > +# endif
> > > > +
> > > > +# define LCASE_MIN_YMM %YMM12
> > > > +# define LCASE_MAX_YMM %YMM13
> > > > +# define CASE_ADD_YMM %YMM14
> > > > +
> > > > +# define LCASE_MIN_XMM %XMM12
> > > > +# define LCASE_MAX_XMM %XMM13
> > > > +# define CASE_ADD_XMM %XMM14
> > > > +
> > > > + /* NB: wcsncmp uses r11 but strcasecmp is never used in
> > > > + conjunction with wcscmp. */
> > > > +# define TOLOWER_BASE %r11
> > > > +
> > > > +# ifdef USE_AS_STRCASECMP_L
> > > > +# define _REG(x, y) x ## y
> > > > +# define REG(x, y) _REG(x, y)
> > > > +# define TOLOWER(reg1, reg2, ext) \
> > > > + vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \
> > > > + vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \
> > > > + vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \
> > > > + vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \
> > > > + vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \
> > > > + vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6}
> > > > +
> > > > +# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> > > > +# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM)
> > > > +# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM)
> > > > +
> > > > +# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \
> > > > + TOLOWER (s1_reg, s2_reg, ext); \
> > > > + VPCMP $0, s1_reg, s2_reg, reg_out
> > > > +
> > > > +# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \
> > > > + VMOVU s2_mem, s2_reg; \
> > > > + CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
> > > > +
> > > > +# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
> > > > +# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
> > > > +
> > > > +# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
> > > > +# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
> > > > +
> > > > +# else
> > > > +# define TOLOWER_gpr(...)
> > > > +# define TOLOWER_YMM(...)
> > > > +# define TOLOWER_XMM(...)
> > > > +
> > > > +# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \
> > > > + VPCMP $0, s2_reg, s1_reg, reg_out
> > > > +
> > > > +# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
> > > > +
> > > > +# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \
> > > > + VPCMP $0, s2_mem, s1_reg, reg_out
> > > > +
> > > > +# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
> > > > +# endif
> > > >
> > > > /* Warning!
> > > > wcscmp/wcsncmp have to use SIGNED comparison for elements.
> > > > @@ -112,8 +211,45 @@
> > > > returned. */
> > > >
> > > > .section .text.evex, "ax", @progbits
> > > > -ENTRY(STRCMP)
> > > > + .align 16
> > > > + .type STRCMP, @function
> > > > + .globl STRCMP
> > > > + .hidden STRCMP
> > > > +
> > > > +# ifdef USE_AS_STRCASECMP_L
> > > > +ENTRY (STRCASECMP)
> > > > + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
> > > > + mov %fs:(%rax), %LOCALE_REG_LP
> > > > +
> > > > + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> > > > + .p2align 4
> > > > +END (STRCASECMP)
> > > > + /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
> > > > +# endif
> > > > +
> > > > + .p2align 4
> > > > +STRCMP:
> > > > + cfi_startproc
> > > > + _CET_ENDBR
> > > > + CALL_MCOUNT
> > > > +
> > > > +# if defined USE_AS_STRCASECMP_L
> > > > + /* We have to fall back on the C implementation for locales with
> > > > + encodings not matching ASCII for single bytes. */
> > > > +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> > > > + mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> > > > +# else
> > > > + mov (%LOCALE_REG), %RAX_LP
> > > > +# endif
> > > > + testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> > > > + jne STRCASECMP_NONASCII
> > > > + leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> > > > +# endif
> > > > +
> > > > # ifdef USE_AS_STRNCMP
> > > > + /* Don't overwrite LOCALE_REG (rcx) until we have passed
> > > > + L(one_or_less). Otherwise we might use the wrong locale in
> > > > + the OVERFLOW_STRCMP (strcasecmp_l). */
> > > > # ifdef __ILP32__
> > > > /* Clear the upper 32 bits. */
> > > > movl %edx, %edx
> > > > @@ -125,6 +261,32 @@ ENTRY(STRCMP)
> > > > actually bound the buffer. */
> > > > jle L(one_or_less)
> > > > # endif
> > > > +
> > > > +# if defined USE_AS_STRCASECMP_L
> > > > + .section .rodata.cst32, "aM", @progbits, 32
> > > > + .align 32
> > > > +L(lcase_min):
> > > > + .quad 0x4141414141414141
> > > > + .quad 0x4141414141414141
> > > > + .quad 0x4141414141414141
> > > > + .quad 0x4141414141414141
> > > > +L(lcase_max):
> > > > + .quad 0x1a1a1a1a1a1a1a1a
> > > > + .quad 0x1a1a1a1a1a1a1a1a
> > > > + .quad 0x1a1a1a1a1a1a1a1a
> > > > + .quad 0x1a1a1a1a1a1a1a1a
> > > > +L(case_add):
> > > > + .quad 0x2020202020202020
> > > > + .quad 0x2020202020202020
> > > > + .quad 0x2020202020202020
> > > > + .quad 0x2020202020202020
> > > > + .previous
> > > > +
> > > > + vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
> > > > + vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
> > > > + vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
> > > > +# endif
> > > > +
> > > > movl %edi, %eax
> > > > orl %esi, %eax
> > > > /* Shift out the bits irrelevant to page boundary ([63:12]). */
> > > > @@ -139,7 +301,7 @@ L(no_page_cross):
> > > > VPTESTM %YMM0, %YMM0, %k2
> > > > /* Each bit cleared in K1 represents a mismatch or a null CHAR
> > > > in YMM0 and 32 bytes at (%rsi). */
> > > > - VPCMP $0, (%rsi), %YMM0, %k1{%k2}
> > > > + CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > # ifdef USE_AS_STRNCMP
> > > > cmpq $CHAR_PER_VEC, %rdx
> > > > @@ -169,6 +331,8 @@ L(return_vec_0):
> > > > # else
> > > > movzbl (%rdi, %rcx), %eax
> > > > movzbl (%rsi, %rcx), %ecx
> > > > + TOLOWER_gpr (%rax, %eax)
> > > > + TOLOWER_gpr (%rcx, %ecx)
> > > > subl %ecx, %eax
> > > > # endif
> > > > L(ret0):
> > > > @@ -188,11 +352,15 @@ L(ret_zero):
> > > >
> > > > .p2align 4,, 5
> > > > L(one_or_less):
> > > > +# ifdef USE_AS_STRCASECMP_L
> > > > + /* Set locale argument for strcasecmp. */
> > > > + movq %LOCALE_REG, %rdx
> > > > +# endif
> > > > jb L(ret_zero)
> > > > -# ifdef USE_AS_WCSCMP
> > > > /* 'nbe' covers the case where length is negative (large
> > > > unsigned). */
> > > > - jnbe __wcscmp_evex
> > > > + jnbe OVERFLOW_STRCMP
> > > > +# ifdef USE_AS_WCSCMP
> > > > movl (%rdi), %edx
> > > > xorl %eax, %eax
> > > > cmpl (%rsi), %edx
> > > > @@ -201,11 +369,10 @@ L(one_or_less):
> > > > negl %eax
> > > > orl $1, %eax
> > > > # else
> > > > - /* 'nbe' covers the case where length is negative (large
> > > > - unsigned). */
> > > > - jnbe __strcmp_evex
> > > > movzbl (%rdi), %eax
> > > > movzbl (%rsi), %ecx
> > > > + TOLOWER_gpr (%rax, %eax)
> > > > + TOLOWER_gpr (%rcx, %ecx)
> > > > subl %ecx, %eax
> > > > # endif
> > > > L(ret1):
> > > > @@ -233,6 +400,8 @@ L(return_vec_1):
> > > > # else
> > > > movzbl VEC_SIZE(%rdi, %rcx), %eax
> > > > movzbl VEC_SIZE(%rsi, %rcx), %ecx
> > > > + TOLOWER_gpr (%rax, %eax)
> > > > + TOLOWER_gpr (%rcx, %ecx)
> > > > subl %ecx, %eax
> > > > # endif
> > > > L(ret2):
> > > > @@ -270,6 +439,8 @@ L(return_vec_2):
> > > > # else
> > > > movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
> > > > movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> > > > + TOLOWER_gpr (%rax, %eax)
> > > > + TOLOWER_gpr (%rcx, %ecx)
> > > > subl %ecx, %eax
> > > > # endif
> > > > L(ret3):
> > > > @@ -290,6 +461,8 @@ L(return_vec_3):
> > > > # else
> > > > movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
> > > > movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> > > > + TOLOWER_gpr (%rax, %eax)
> > > > + TOLOWER_gpr (%rcx, %ecx)
> > > > subl %ecx, %eax
> > > > # endif
> > > > L(ret4):
> > > > @@ -303,7 +476,7 @@ L(more_3x_vec):
> > > > /* Safe to compare 4x vectors. */
> > > > VMOVU (VEC_SIZE)(%rdi), %YMM0
> > > > VPTESTM %YMM0, %YMM0, %k2
> > > > - VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
> > > > + CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > TESTEQ %ecx
> > > > jnz L(return_vec_1)
> > > > @@ -315,14 +488,14 @@ L(more_3x_vec):
> > > >
> > > > VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
> > > > VPTESTM %YMM0, %YMM0, %k2
> > > > - VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
> > > > + CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > TESTEQ %ecx
> > > > jnz L(return_vec_2)
> > > >
> > > > VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
> > > > VPTESTM %YMM0, %YMM0, %k2
> > > > - VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
> > > > + CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > TESTEQ %ecx
> > > > jnz L(return_vec_3)
> > > > @@ -381,7 +554,6 @@ L(prepare_loop_aligned):
> > > > subl %esi, %eax
> > > > andl $(PAGE_SIZE - 1), %eax
> > > >
> > > > - vpxorq %YMMZERO, %YMMZERO, %YMMZERO
> > > >
> > > > /* Loop 4x comparisons at a time. */
> > > > .p2align 4
> > > > @@ -413,22 +585,35 @@ L(loop_skip_page_cross_check):
> > > > /* A zero CHAR in YMM9 means that there is a null CHAR. */
> > > > VPMINU %YMM8, %YMM9, %YMM9
> > > >
> > > > - /* Each bit set in K1 represents a non-null CHAR in YMM8. */
> > > > + /* Each bit set in K1 represents a non-null CHAR in YMM9. */
> > > > VPTESTM %YMM9, %YMM9, %k1
> > > > -
> > > > +# ifndef USE_AS_STRCASECMP_L
> > > > vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
> > > > vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
> > > > vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> > > > /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
> > > > oring with YMM1. Result is stored in YMM6. */
> > > > vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
> > > > -
> > > > +# else
> > > > + VMOVU (VEC_SIZE * 0)(%rsi), %YMM1
> > > > + TOLOWER_YMM (%YMM0, %YMM1)
> > > > + VMOVU (VEC_SIZE * 1)(%rsi), %YMM3
> > > > + TOLOWER_YMM (%YMM2, %YMM3)
> > > > + VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
> > > > + TOLOWER_YMM (%YMM4, %YMM5)
> > > > + VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
> > > > + TOLOWER_YMM (%YMM6, %YMM7)
> > > > + vpxorq %YMM0, %YMM1, %YMM1
> > > > + vpxorq %YMM2, %YMM3, %YMM3
> > > > + vpxorq %YMM4, %YMM5, %YMM5
> > > > + vpternlogd $0xde, %YMM7, %YMM1, %YMM6
> > > > +# endif
> > > > /* Or together YMM3, YMM5, and YMM6. */
> > > > vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
> > > >
> > > >
> > > > /* A non-zero CHAR in YMM6 represents a mismatch. */
> > > > - VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
> > > > + VPTESTNM %YMM6, %YMM6, %k0{%k1}
> > > > kmovd %k0, %LOOP_REG
> > > >
> > > > TESTEQ %LOOP_REG
> > > > @@ -437,13 +622,13 @@ L(loop_skip_page_cross_check):
> > > >
> > > > /* Find which VEC has the mismatch of end of string. */
> > > > VPTESTM %YMM0, %YMM0, %k1
> > > > - VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
> > > > + VPTESTNM %YMM1, %YMM1, %k0{%k1}
> > > > kmovd %k0, %ecx
> > > > TESTEQ %ecx
> > > > jnz L(return_vec_0_end)
> > > >
> > > > VPTESTM %YMM2, %YMM2, %k1
> > > > - VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
> > > > + VPTESTNM %YMM3, %YMM3, %k0{%k1}
> > > > kmovd %k0, %ecx
> > > > TESTEQ %ecx
> > > > jnz L(return_vec_1_end)
> > > > @@ -457,7 +642,7 @@ L(return_vec_2_3_end):
> > > > # endif
> > > >
> > > > VPTESTM %YMM4, %YMM4, %k1
> > > > - VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
> > > > + VPTESTNM %YMM5, %YMM5, %k0{%k1}
> > > > kmovd %k0, %ecx
> > > > TESTEQ %ecx
> > > > # if CHAR_PER_VEC <= 16
> > > > @@ -493,6 +678,8 @@ L(return_vec_3_end):
> > > > # else
> > > > movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
> > > > movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
> > > > + TOLOWER_gpr (%rax, %eax)
> > > > + TOLOWER_gpr (%rcx, %ecx)
> > > > subl %ecx, %eax
> > > > xorl %r8d, %eax
> > > > subl %r8d, %eax
> > > > @@ -545,6 +732,8 @@ L(return_vec_0_end):
> > > > # else
> > > > movzbl (%rdi, %rcx), %eax
> > > > movzbl (%rsi, %rcx), %ecx
> > > > + TOLOWER_gpr (%rax, %eax)
> > > > + TOLOWER_gpr (%rcx, %ecx)
> > > > subl %ecx, %eax
> > > > /* Flip `eax` if `rdi` and `rsi` were swapped in page cross
> > > > logic. Subtract `r8d` after xor for zero case. */
> > > > @@ -569,6 +758,8 @@ L(return_vec_1_end):
> > > > # else
> > > > movzbl VEC_SIZE(%rdi, %rcx), %eax
> > > > movzbl VEC_SIZE(%rsi, %rcx), %ecx
> > > > + TOLOWER_gpr (%rax, %eax)
> > > > + TOLOWER_gpr (%rcx, %ecx)
> > > > subl %ecx, %eax
> > > > xorl %r8d, %eax
> > > > subl %r8d, %eax
> > > > @@ -598,7 +789,7 @@ L(page_cross_during_loop):
> > > >
> > > > VMOVA (%rdi), %YMM0
> > > > VPTESTM %YMM0, %YMM0, %k2
> > > > - VPCMP $0, (%rsi), %YMM0, %k1{%k2}
> > > > + CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > TESTEQ %ecx
> > > > jnz L(return_vec_0_end)
> > > > @@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross):
> > > > been loaded earlier so must be valid. */
> > > > VMOVU -VEC_SIZE(%rdi, %rax), %YMM0
> > > > VPTESTM %YMM0, %YMM0, %k2
> > > > - VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
> > > > -
> > > > + CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
> > > > /* Mask of potentially valid bits. The lower bits can be out of
> > > > range comparisons (but safe regarding page crosses). */
> > > >
> > > > @@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross):
> > > >
> > > > # ifdef USE_AS_STRNCMP
> > > > # ifdef USE_AS_WCSCMP
> > > > + /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> > > > + safe. */
> > > > movl %eax, %r11d
> > > > shrl $2, %r11d
> > > > cmpq %r11, %rdx
> > > > @@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem):
> > > > # else
> > > > movzbl VEC_OFFSET(%rdi, %rcx), %eax
> > > > movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> > > > + TOLOWER_gpr (%rax, %eax)
> > > > + TOLOWER_gpr (%rcx, %ecx)
> > > > subl %ecx, %eax
> > > > xorl %r8d, %eax
> > > > subl %r8d, %eax
> > > > @@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross):
> > > >
> > > > VMOVA VEC_SIZE(%rdi), %YMM0
> > > > VPTESTM %YMM0, %YMM0, %k2
> > > > - VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
> > > > + CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > TESTEQ %ecx
> > > > jnz L(return_vec_1_end)
> > > > @@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross):
> > > > /* Safe to include comparisons from lower bytes. */
> > > > VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
> > > > VPTESTM %YMM0, %YMM0, %k2
> > > > - VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
> > > > + CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > TESTEQ %ecx
> > > > jnz L(return_vec_page_cross_0)
> > > >
> > > > VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
> > > > VPTESTM %YMM0, %YMM0, %k2
> > > > - VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
> > > > + CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > TESTEQ %ecx
> > > > jnz L(return_vec_page_cross_1)
> > > > @@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross):
> > > > /* Must check length here as length might preclude reading next
> > > > page. */
> > > > # ifdef USE_AS_WCSCMP
> > > > + /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> > > > + safe. */
> > > > movl %eax, %r11d
> > > > shrl $2, %r11d
> > > > cmpq %r11, %rdx
> > > > @@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross):
> > > > VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
> > > > VPMINU %YMM4, %YMM6, %YMM9
> > > > VPTESTM %YMM9, %YMM9, %k1
> > > > -
> > > > +# ifndef USE_AS_STRCASECMP_L
> > > > vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> > > > /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */
> > > > vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
> > > > -
> > > > - VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
> > > > +# else
> > > > + VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
> > > > + TOLOWER_YMM (%YMM4, %YMM5)
> > > > + VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
> > > > + TOLOWER_YMM (%YMM6, %YMM7)
> > > > + vpxorq %YMM4, %YMM5, %YMM5
> > > > + vpternlogd $0xde, %YMM7, %YMM5, %YMM6
> > > > +# endif
> > > > + VPTESTNM %YMM6, %YMM6, %k0{%k1}
> > > > kmovd %k0, %LOOP_REG
> > > > TESTEQ %LOOP_REG
> > > > jnz L(return_vec_2_3_end)
> > > > @@ -815,6 +1018,8 @@ L(return_vec_page_cross_1):
> > > > # else
> > > > movzbl VEC_OFFSET(%rdi, %rcx), %eax
> > > > movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> > > > + TOLOWER_gpr (%rax, %eax)
> > > > + TOLOWER_gpr (%rcx, %ecx)
> > > > subl %ecx, %eax
> > > > xorl %r8d, %eax
> > > > subl %r8d, %eax
> > > > @@ -871,7 +1076,7 @@ L(page_cross):
> > > > L(page_cross_loop):
> > > > VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> > > > VPTESTM %YMM0, %YMM0, %k2
> > > > - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> > > > + CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > TESTEQ %ecx
> > > > jnz L(check_ret_vec_page_cross)
> > > > @@ -895,7 +1100,7 @@ L(page_cross_loop):
> > > > */
> > > > VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> > > > VPTESTM %YMM0, %YMM0, %k2
> > > > - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> > > > + CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> > > >
> > > > kmovd %k1, %ecx
> > > > # ifdef USE_AS_STRNCMP
> > > > @@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont):
> > > > # else
> > > > movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
> > > > movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
> > > > + TOLOWER_gpr (%rax, %eax)
> > > > + TOLOWER_gpr (%rcx, %ecx)
> > > > subl %ecx, %eax
> > > > xorl %r8d, %eax
> > > > subl %r8d, %eax
> > > > @@ -989,7 +1196,7 @@ L(less_1x_vec_till_page):
> > > > /* Use 16 byte comparison. */
> > > > vmovdqu (%rdi), %xmm0
> > > > VPTESTM %xmm0, %xmm0, %k2
> > > > - VPCMP $0, (%rsi), %xmm0, %k1{%k2}
> > > > + CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > # ifdef USE_AS_WCSCMP
> > > > subl $0xf, %ecx
> > > > @@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page):
> > > > # endif
> > > > vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > > > VPTESTM %xmm0, %xmm0, %k2
> > > > - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
> > > > + CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > # ifdef USE_AS_WCSCMP
> > > > subl $0xf, %ecx
> > > > @@ -1048,7 +1255,7 @@ L(less_16_till_page):
> > > > vmovq (%rdi), %xmm0
> > > > vmovq (%rsi), %xmm1
> > > > VPTESTM %xmm0, %xmm0, %k2
> > > > - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> > > > + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > # ifdef USE_AS_WCSCMP
> > > > subl $0x3, %ecx
> > > > @@ -1068,7 +1275,7 @@ L(less_16_till_page):
> > > > vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > > > vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> > > > VPTESTM %xmm0, %xmm0, %k2
> > > > - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> > > > + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > # ifdef USE_AS_WCSCMP
> > > > subl $0x3, %ecx
> > > > @@ -1128,7 +1335,7 @@ L(ret_less_8_wcs):
> > > > vmovd (%rdi), %xmm0
> > > > vmovd (%rsi), %xmm1
> > > > VPTESTM %xmm0, %xmm0, %k2
> > > > - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> > > > + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > subl $0xf, %ecx
> > > > jnz L(check_ret_vec_page_cross)
> > > > @@ -1143,7 +1350,7 @@ L(ret_less_8_wcs):
> > > > vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> > > > vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> > > > VPTESTM %xmm0, %xmm0, %k2
> > > > - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> > > > + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> > > > kmovd %k1, %ecx
> > > > subl $0xf, %ecx
> > > > jnz L(check_ret_vec_page_cross)
> > > > @@ -1176,7 +1383,9 @@ L(less_4_till_page):
> > > > L(less_4_loop):
> > > > movzbl (%rdi), %eax
> > > > movzbl (%rsi, %rdi), %ecx
> > > > - subl %ecx, %eax
> > > > + TOLOWER_gpr (%rax, %eax)
> > > > + TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> > > > + subl %BYTE_LOOP_REG, %eax
> > > > jnz L(ret_less_4_loop)
> > > > testl %ecx, %ecx
> > > > jz L(ret_zero_4_loop)
> > > > @@ -1203,5 +1412,6 @@ L(ret_less_4_loop):
> > > > subl %r8d, %eax
> > > > ret
> > > > # endif
> > > > -END(STRCMP)
> > > > + cfi_endproc
> > > > + .size STRCMP, .-STRCMP
> > > > #endif
> > > > diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> > > > new file mode 100644
> > > > index 0000000000..8a5af3695c
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> > > > @@ -0,0 +1,25 @@
> > > > +/* strncasecmp_l optimized with EVEX.
> > > > + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > > > + This file is part of the GNU C Library.
> > > > +
> > > > + The GNU C Library is free software; you can redistribute it and/or
> > > > + modify it under the terms of the GNU Lesser General Public
> > > > + License as published by the Free Software Foundation; either
> > > > + version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > + The GNU C Library is distributed in the hope that it will be useful,
> > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > > > + Lesser General Public License for more details.
> > > > +
> > > > + You should have received a copy of the GNU Lesser General Public
> > > > + License along with the GNU C Library; if not, see
> > > > + <https://www.gnu.org/licenses/>. */
> > > > +
> > > > +#ifndef STRCMP
> > > > +# define STRCMP __strncasecmp_l_evex
> > > > +#endif
> > > > +#define OVERFLOW_STRCMP __strcasecmp_l_evex
> > > > +#define USE_AS_STRCASECMP_L
> > > > +#define USE_AS_STRNCMP
> > > > +#include "strcmp-evex.S"
> > > > --
> > > > 2.25.1
> > > >
> > >
> > > LGTM.
> > >
> > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> >
> > Thanks, pushed the patchset.
> > >
> > > Thanks.
> > >
> > > --
> > > H.J.
>
> I would like to backport this patch to release branches.
> Any comments or objections?
>
> Conflict resolution patch attached.
>
> --Sunil
Please ignore, this patch doesn't have any conflict.
--Sunil
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v1 23/23] x86: Remove AVX str{n}casecmp
2022-03-24 19:04 ` H.J. Lu
@ 2022-05-12 19:54 ` Sunil Pandey
0 siblings, 0 replies; 76+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:54 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Thu, Mar 24, 2022 at 12:09 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:03 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The rationale is:
> >
> > 1. SSE42 has nearly identical logic so any benefit is minimal (3.4%
> > regression on Tigerlake using SSE42 versus AVX across the
> > benchtest suite).
> > 2. AVX2 version covers the majority of targets that previously
> > preferred it.
> > 3. The targets where AVX would still be best (SnB and IVB) are
> > becoming outdated.
> >
> > All in all, the code size saving is worth it.
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=40 runs; All functions page aligned
> > length, align1, align2, max_char, AVX Time / SSE42 Time
> > 1, 1, 1, 127, 0.928
> > 2, 2, 2, 127, 0.934
> > 3, 3, 3, 127, 0.975
> > 4, 4, 4, 127, 0.96
> > 5, 5, 5, 127, 0.935
> > 6, 6, 6, 127, 0.929
> > 7, 7, 7, 127, 0.959
> > 8, 0, 0, 127, 0.955
> > 9, 1, 1, 127, 0.944
> > 10, 2, 2, 127, 0.975
> > 11, 3, 3, 127, 0.935
> > 12, 4, 4, 127, 0.931
> > 13, 5, 5, 127, 0.926
> > 14, 6, 6, 127, 0.901
> > 15, 7, 7, 127, 0.951
> > 4, 0, 0, 127, 0.958
> > 4, 0, 0, 254, 0.956
> > 8, 0, 0, 254, 0.977
> > 16, 0, 0, 127, 0.955
> > 16, 0, 0, 254, 0.953
> > 32, 0, 0, 127, 0.943
> > 32, 0, 0, 254, 0.941
> > 64, 0, 0, 127, 0.941
> > 64, 0, 0, 254, 0.955
> > 128, 0, 0, 127, 0.972
> > 128, 0, 0, 254, 0.975
> > 256, 0, 0, 127, 0.996
> > 256, 0, 0, 254, 0.993
> > 512, 0, 0, 127, 0.992
> > 512, 0, 0, 254, 0.986
> > 1024, 0, 0, 127, 0.994
> > 1024, 0, 0, 254, 0.993
> > 16, 1, 2, 127, 0.933
> > 16, 2, 1, 254, 0.953
> > 32, 2, 4, 127, 0.927
> > 32, 4, 2, 254, 0.986
> > 64, 3, 6, 127, 0.991
> > 64, 6, 3, 254, 1.014
> > 128, 4, 0, 127, 1.001
> > 128, 0, 4, 254, 0.991
> > 256, 5, 2, 127, 1.011
> > 256, 2, 5, 254, 1.013
> > 512, 6, 4, 127, 1.056
> > 512, 4, 6, 254, 0.916
> > 1024, 7, 6, 127, 1.059
> > 1024, 6, 7, 254, 1.043
> >
> > sysdeps/x86_64/multiarch/Makefile | 2 -
> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 -
> > sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 -
> > sysdeps/x86_64/multiarch/strcasecmp_l-avx.S | 22 --
> > sysdeps/x86_64/multiarch/strcmp-sse42.S | 240 +++++++++-----------
> > sysdeps/x86_64/multiarch/strncase_l-avx.S | 22 --
> > 6 files changed, 105 insertions(+), 197 deletions(-)
> > delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
> > delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index 35d80dc2ff..6507d1b7fa 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -54,7 +54,6 @@ sysdep_routines += \
> > stpncpy-evex \
> > stpncpy-sse2-unaligned \
> > stpncpy-ssse3 \
> > - strcasecmp_l-avx \
> > strcasecmp_l-avx2 \
> > strcasecmp_l-avx2-rtm \
> > strcasecmp_l-evex \
> > @@ -95,7 +94,6 @@ sysdep_routines += \
> > strlen-avx2-rtm \
> > strlen-evex \
> > strlen-sse2 \
> > - strncase_l-avx \
> > strncase_l-avx2 \
> > strncase_l-avx2-rtm \
> > strncase_l-evex \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index f1a4d3dac2..40cc6cc49e 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -447,9 +447,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > (CPU_FEATURE_USABLE (AVX2)
> > && CPU_FEATURE_USABLE (RTM)),
> > __strcasecmp_avx2_rtm)
> > - IFUNC_IMPL_ADD (array, i, strcasecmp,
> > - CPU_FEATURE_USABLE (AVX),
> > - __strcasecmp_avx)
> > IFUNC_IMPL_ADD (array, i, strcasecmp,
> > CPU_FEATURE_USABLE (SSE4_2),
> > __strcasecmp_sse42)
> > @@ -471,9 +468,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > (CPU_FEATURE_USABLE (AVX2)
> > && CPU_FEATURE_USABLE (RTM)),
> > __strcasecmp_l_avx2_rtm)
> > - IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> > - CPU_FEATURE_USABLE (AVX),
> > - __strcasecmp_l_avx)
> > IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> > CPU_FEATURE_USABLE (SSE4_2),
> > __strcasecmp_l_sse42)
> > @@ -609,9 +603,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > (CPU_FEATURE_USABLE (AVX2)
> > && CPU_FEATURE_USABLE (RTM)),
> > __strncasecmp_avx2_rtm)
> > - IFUNC_IMPL_ADD (array, i, strncasecmp,
> > - CPU_FEATURE_USABLE (AVX),
> > - __strncasecmp_avx)
> > IFUNC_IMPL_ADD (array, i, strncasecmp,
> > CPU_FEATURE_USABLE (SSE4_2),
> > __strncasecmp_sse42)
> > @@ -634,9 +625,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > (CPU_FEATURE_USABLE (AVX2)
> > && CPU_FEATURE_USABLE (RTM)),
> > __strncasecmp_l_avx2_rtm)
> > - IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> > - CPU_FEATURE_USABLE (AVX),
> > - __strncasecmp_l_avx)
> > IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> > CPU_FEATURE_USABLE (SSE4_2),
> > __strncasecmp_l_sse42)
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > index bf0d146e7f..766539c241 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > @@ -22,7 +22,6 @@
> > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > @@ -46,9 +45,6 @@ IFUNC_SELECTOR (void)
> > return OPTIMIZE (avx2);
> > }
> >
> > - if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > - return OPTIMIZE (avx);
> > -
> > if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
> > && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
> > return OPTIMIZE (sse42);
> > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
> > deleted file mode 100644
> > index 7ec7c21b5a..0000000000
> > --- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
> > +++ /dev/null
> > @@ -1,22 +0,0 @@
> > -/* strcasecmp_l optimized with AVX.
> > - Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > - This file is part of the GNU C Library.
> > -
> > - The GNU C Library is free software; you can redistribute it and/or
> > - modify it under the terms of the GNU Lesser General Public
> > - License as published by the Free Software Foundation; either
> > - version 2.1 of the License, or (at your option) any later version.
> > -
> > - The GNU C Library is distributed in the hope that it will be useful,
> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > - Lesser General Public License for more details.
> > -
> > - You should have received a copy of the GNU Lesser General Public
> > - License along with the GNU C Library; if not, see
> > - <https://www.gnu.org/licenses/>. */
> > -
> > -#define STRCMP_SSE42 __strcasecmp_l_avx
> > -#define USE_AVX 1
> > -#define USE_AS_STRCASECMP_L
> > -#include "strcmp-sse42.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > index 7805ae9d41..a9178ad25c 100644
> > --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > @@ -41,13 +41,8 @@
> > # define UPDATE_STRNCMP_COUNTER
> > #endif
> >
> > -#ifdef USE_AVX
> > -# define SECTION avx
> > -# define GLABEL(l) l##_avx
> > -#else
> > -# define SECTION sse4.2
> > -# define GLABEL(l) l##_sse42
> > -#endif
> > +#define SECTION sse4.2
> > +#define GLABEL(l) l##_sse42
> >
> > #define LABEL(l) .L##l
> >
> > @@ -105,21 +100,7 @@ END (GLABEL(__strncasecmp))
> > #endif
> >
> >
> > -#ifdef USE_AVX
> > -# define movdqa vmovdqa
> > -# define movdqu vmovdqu
> > -# define pmovmskb vpmovmskb
> > -# define pcmpistri vpcmpistri
> > -# define psubb vpsubb
> > -# define pcmpeqb vpcmpeqb
> > -# define psrldq vpsrldq
> > -# define pslldq vpslldq
> > -# define palignr vpalignr
> > -# define pxor vpxor
> > -# define D(arg) arg, arg
> > -#else
> > -# define D(arg) arg
> > -#endif
> > +#define arg arg
> >
> > STRCMP_SSE42:
> > cfi_startproc
> > @@ -191,18 +172,7 @@ LABEL(case_add):
> > movdqu (%rdi), %xmm1
> > movdqu (%rsi), %xmm2
> > #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> > -# ifdef USE_AVX
> > -# define TOLOWER(reg1, reg2) \
> > - vpaddb LCASE_MIN_reg, reg1, %xmm7; \
> > - vpaddb LCASE_MIN_reg, reg2, %xmm8; \
> > - vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \
> > - vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \
> > - vpandn CASE_ADD_reg, %xmm7, %xmm7; \
> > - vpandn CASE_ADD_reg, %xmm8, %xmm8; \
> > - vpaddb %xmm7, reg1, reg1; \
> > - vpaddb %xmm8, reg2, reg2
> > -# else
> > -# define TOLOWER(reg1, reg2) \
> > +# define TOLOWER(reg1, reg2) \
> > movdqa LCASE_MIN_reg, %xmm7; \
> > movdqa LCASE_MIN_reg, %xmm8; \
> > paddb reg1, %xmm7; \
> > @@ -213,15 +183,15 @@ LABEL(case_add):
> > pandn CASE_ADD_reg, %xmm8; \
> > paddb %xmm7, reg1; \
> > paddb %xmm8, reg2
> > -# endif
> > +
> > TOLOWER (%xmm1, %xmm2)
> > #else
> > # define TOLOWER(reg1, reg2)
> > #endif
> > - pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
> > - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
> > - pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
> > - psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
> > + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
> > + pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> > + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
> > + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> > pmovmskb %xmm1, %edx
> > sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
> > jnz LABEL(less16bytes)/* If not, find different value or null char */
> > @@ -245,7 +215,7 @@ LABEL(crosscache):
> > xor %r8d, %r8d
> > and $0xf, %ecx /* offset of rsi */
> > and $0xf, %eax /* offset of rdi */
> > - pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
> > + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
> > cmp %eax, %ecx
> > je LABEL(ashr_0) /* rsi and rdi relative offset same */
> > ja LABEL(bigger)
> > @@ -259,7 +229,7 @@ LABEL(bigger):
> > sub %rcx, %r9
> > lea LABEL(unaligned_table)(%rip), %r10
> > movslq (%r10, %r9,4), %r9
> > - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
> > + pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> > lea (%r10, %r9), %r10
> > _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
> >
> > @@ -272,15 +242,15 @@ LABEL(bigger):
> > LABEL(ashr_0):
> >
> > movdqa (%rsi), %xmm1
> > - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
> > + pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > - pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
> > + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
> > #else
> > movdqa (%rdi), %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
> > + pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
> > #endif
> > - psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
> > + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> > pmovmskb %xmm1, %r9d
> > shr %cl, %edx /* adjust 0xffff for offset */
> > shr %cl, %r9d /* adjust for 16-byte offset */
> > @@ -360,10 +330,10 @@ LABEL(ashr_0_exit_use):
> > */
> > .p2align 4
> > LABEL(ashr_1):
> > - pslldq $15, D(%xmm2) /* shift first string to align with second */
> > + pslldq $15, %xmm2 /* shift first string to align with second */
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
> > - psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
> > + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
> > + psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx /* adjust 0xffff for offset */
> > shr %cl, %r9d /* adjust for 16-byte offset */
> > @@ -391,7 +361,7 @@ LABEL(loop_ashr_1_use):
> >
> > LABEL(nibble_ashr_1_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $1, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $1, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> > #else
> > @@ -410,7 +380,7 @@ LABEL(nibble_ashr_1_restart_use):
> > jg LABEL(nibble_ashr_1_use)
> >
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $1, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $1, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> > #else
> > @@ -430,7 +400,7 @@ LABEL(nibble_ashr_1_restart_use):
> > LABEL(nibble_ashr_1_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $1, D(%xmm0)
> > + psrldq $1, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > @@ -448,10 +418,10 @@ LABEL(nibble_ashr_1_use):
> > */
> > .p2align 4
> > LABEL(ashr_2):
> > - pslldq $14, D(%xmm2)
> > + pslldq $14, %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2)
> > - psubb %xmm0, D(%xmm2)
> > + pcmpeqb %xmm1, %xmm2
> > + psubb %xmm0, %xmm2
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx
> > shr %cl, %r9d
> > @@ -479,7 +449,7 @@ LABEL(loop_ashr_2_use):
> >
> > LABEL(nibble_ashr_2_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $2, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $2, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> > #else
> > @@ -498,7 +468,7 @@ LABEL(nibble_ashr_2_restart_use):
> > jg LABEL(nibble_ashr_2_use)
> >
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $2, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $2, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> > #else
> > @@ -518,7 +488,7 @@ LABEL(nibble_ashr_2_restart_use):
> > LABEL(nibble_ashr_2_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $2, D(%xmm0)
> > + psrldq $2, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > @@ -536,10 +506,10 @@ LABEL(nibble_ashr_2_use):
> > */
> > .p2align 4
> > LABEL(ashr_3):
> > - pslldq $13, D(%xmm2)
> > + pslldq $13, %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2)
> > - psubb %xmm0, D(%xmm2)
> > + pcmpeqb %xmm1, %xmm2
> > + psubb %xmm0, %xmm2
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx
> > shr %cl, %r9d
> > @@ -567,7 +537,7 @@ LABEL(loop_ashr_3_use):
> >
> > LABEL(nibble_ashr_3_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $3, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $3, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> > #else
> > @@ -586,7 +556,7 @@ LABEL(nibble_ashr_3_restart_use):
> > jg LABEL(nibble_ashr_3_use)
> >
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $3, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $3, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> > #else
> > @@ -606,7 +576,7 @@ LABEL(nibble_ashr_3_restart_use):
> > LABEL(nibble_ashr_3_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $3, D(%xmm0)
> > + psrldq $3, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > @@ -624,10 +594,10 @@ LABEL(nibble_ashr_3_use):
> > */
> > .p2align 4
> > LABEL(ashr_4):
> > - pslldq $12, D(%xmm2)
> > + pslldq $12, %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2)
> > - psubb %xmm0, D(%xmm2)
> > + pcmpeqb %xmm1, %xmm2
> > + psubb %xmm0, %xmm2
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx
> > shr %cl, %r9d
> > @@ -656,7 +626,7 @@ LABEL(loop_ashr_4_use):
> >
> > LABEL(nibble_ashr_4_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $4, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $4, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> > #else
> > @@ -675,7 +645,7 @@ LABEL(nibble_ashr_4_restart_use):
> > jg LABEL(nibble_ashr_4_use)
> >
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $4, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $4, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> > #else
> > @@ -695,7 +665,7 @@ LABEL(nibble_ashr_4_restart_use):
> > LABEL(nibble_ashr_4_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $4, D(%xmm0)
> > + psrldq $4, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > @@ -713,10 +683,10 @@ LABEL(nibble_ashr_4_use):
> > */
> > .p2align 4
> > LABEL(ashr_5):
> > - pslldq $11, D(%xmm2)
> > + pslldq $11, %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2)
> > - psubb %xmm0, D(%xmm2)
> > + pcmpeqb %xmm1, %xmm2
> > + psubb %xmm0, %xmm2
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx
> > shr %cl, %r9d
> > @@ -745,7 +715,7 @@ LABEL(loop_ashr_5_use):
> >
> > LABEL(nibble_ashr_5_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $5, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $5, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> > #else
> > @@ -765,7 +735,7 @@ LABEL(nibble_ashr_5_restart_use):
> >
> > movdqa (%rdi, %rdx), %xmm0
> >
> > - palignr $5, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $5, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> > #else
> > @@ -785,7 +755,7 @@ LABEL(nibble_ashr_5_restart_use):
> > LABEL(nibble_ashr_5_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $5, D(%xmm0)
> > + psrldq $5, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > @@ -803,10 +773,10 @@ LABEL(nibble_ashr_5_use):
> > */
> > .p2align 4
> > LABEL(ashr_6):
> > - pslldq $10, D(%xmm2)
> > + pslldq $10, %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2)
> > - psubb %xmm0, D(%xmm2)
> > + pcmpeqb %xmm1, %xmm2
> > + psubb %xmm0, %xmm2
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx
> > shr %cl, %r9d
> > @@ -835,7 +805,7 @@ LABEL(loop_ashr_6_use):
> >
> > LABEL(nibble_ashr_6_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $6, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $6, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> > #else
> > @@ -854,7 +824,7 @@ LABEL(nibble_ashr_6_restart_use):
> > jg LABEL(nibble_ashr_6_use)
> >
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $6, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $6, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> > #else
> > @@ -874,7 +844,7 @@ LABEL(nibble_ashr_6_restart_use):
> > LABEL(nibble_ashr_6_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $6, D(%xmm0)
> > + psrldq $6, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > @@ -892,10 +862,10 @@ LABEL(nibble_ashr_6_use):
> > */
> > .p2align 4
> > LABEL(ashr_7):
> > - pslldq $9, D(%xmm2)
> > + pslldq $9, %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2)
> > - psubb %xmm0, D(%xmm2)
> > + pcmpeqb %xmm1, %xmm2
> > + psubb %xmm0, %xmm2
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx
> > shr %cl, %r9d
> > @@ -924,7 +894,7 @@ LABEL(loop_ashr_7_use):
> >
> > LABEL(nibble_ashr_7_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $7, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $7, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> > #else
> > @@ -943,7 +913,7 @@ LABEL(nibble_ashr_7_restart_use):
> > jg LABEL(nibble_ashr_7_use)
> >
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $7, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $7, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> > #else
> > @@ -963,7 +933,7 @@ LABEL(nibble_ashr_7_restart_use):
> > LABEL(nibble_ashr_7_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $7, D(%xmm0)
> > + psrldq $7, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > @@ -981,10 +951,10 @@ LABEL(nibble_ashr_7_use):
> > */
> > .p2align 4
> > LABEL(ashr_8):
> > - pslldq $8, D(%xmm2)
> > + pslldq $8, %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2)
> > - psubb %xmm0, D(%xmm2)
> > + pcmpeqb %xmm1, %xmm2
> > + psubb %xmm0, %xmm2
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx
> > shr %cl, %r9d
> > @@ -1013,7 +983,7 @@ LABEL(loop_ashr_8_use):
> >
> > LABEL(nibble_ashr_8_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $8, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $8, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1032,7 +1002,7 @@ LABEL(nibble_ashr_8_restart_use):
> > jg LABEL(nibble_ashr_8_use)
> >
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $8, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $8, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1052,7 +1022,7 @@ LABEL(nibble_ashr_8_restart_use):
> > LABEL(nibble_ashr_8_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $8, D(%xmm0)
> > + psrldq $8, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > @@ -1070,10 +1040,10 @@ LABEL(nibble_ashr_8_use):
> > */
> > .p2align 4
> > LABEL(ashr_9):
> > - pslldq $7, D(%xmm2)
> > + pslldq $7, %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2)
> > - psubb %xmm0, D(%xmm2)
> > + pcmpeqb %xmm1, %xmm2
> > + psubb %xmm0, %xmm2
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx
> > shr %cl, %r9d
> > @@ -1103,7 +1073,7 @@ LABEL(loop_ashr_9_use):
> > LABEL(nibble_ashr_9_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> >
> > - palignr $9, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $9, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1122,7 +1092,7 @@ LABEL(nibble_ashr_9_restart_use):
> > jg LABEL(nibble_ashr_9_use)
> >
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $9, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $9, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1142,7 +1112,7 @@ LABEL(nibble_ashr_9_restart_use):
> > LABEL(nibble_ashr_9_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $9, D(%xmm0)
> > + psrldq $9, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > @@ -1160,10 +1130,10 @@ LABEL(nibble_ashr_9_use):
> > */
> > .p2align 4
> > LABEL(ashr_10):
> > - pslldq $6, D(%xmm2)
> > + pslldq $6, %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2)
> > - psubb %xmm0, D(%xmm2)
> > + pcmpeqb %xmm1, %xmm2
> > + psubb %xmm0, %xmm2
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx
> > shr %cl, %r9d
> > @@ -1192,7 +1162,7 @@ LABEL(loop_ashr_10_use):
> >
> > LABEL(nibble_ashr_10_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $10, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $10, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1211,7 +1181,7 @@ LABEL(nibble_ashr_10_restart_use):
> > jg LABEL(nibble_ashr_10_use)
> >
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $10, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $10, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1231,7 +1201,7 @@ LABEL(nibble_ashr_10_restart_use):
> > LABEL(nibble_ashr_10_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $10, D(%xmm0)
> > + psrldq $10, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > @@ -1249,10 +1219,10 @@ LABEL(nibble_ashr_10_use):
> > */
> > .p2align 4
> > LABEL(ashr_11):
> > - pslldq $5, D(%xmm2)
> > + pslldq $5, %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2)
> > - psubb %xmm0, D(%xmm2)
> > + pcmpeqb %xmm1, %xmm2
> > + psubb %xmm0, %xmm2
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx
> > shr %cl, %r9d
> > @@ -1281,7 +1251,7 @@ LABEL(loop_ashr_11_use):
> >
> > LABEL(nibble_ashr_11_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $11, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $11, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1300,7 +1270,7 @@ LABEL(nibble_ashr_11_restart_use):
> > jg LABEL(nibble_ashr_11_use)
> >
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $11, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $11, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1320,7 +1290,7 @@ LABEL(nibble_ashr_11_restart_use):
> > LABEL(nibble_ashr_11_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $11, D(%xmm0)
> > + psrldq $11, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > @@ -1338,10 +1308,10 @@ LABEL(nibble_ashr_11_use):
> > */
> > .p2align 4
> > LABEL(ashr_12):
> > - pslldq $4, D(%xmm2)
> > + pslldq $4, %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2)
> > - psubb %xmm0, D(%xmm2)
> > + pcmpeqb %xmm1, %xmm2
> > + psubb %xmm0, %xmm2
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx
> > shr %cl, %r9d
> > @@ -1370,7 +1340,7 @@ LABEL(loop_ashr_12_use):
> >
> > LABEL(nibble_ashr_12_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $12, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $12, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1389,7 +1359,7 @@ LABEL(nibble_ashr_12_restart_use):
> > jg LABEL(nibble_ashr_12_use)
> >
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $12, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $12, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1409,7 +1379,7 @@ LABEL(nibble_ashr_12_restart_use):
> > LABEL(nibble_ashr_12_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $12, D(%xmm0)
> > + psrldq $12, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > @@ -1427,10 +1397,10 @@ LABEL(nibble_ashr_12_use):
> > */
> > .p2align 4
> > LABEL(ashr_13):
> > - pslldq $3, D(%xmm2)
> > + pslldq $3, %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2)
> > - psubb %xmm0, D(%xmm2)
> > + pcmpeqb %xmm1, %xmm2
> > + psubb %xmm0, %xmm2
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx
> > shr %cl, %r9d
> > @@ -1460,7 +1430,7 @@ LABEL(loop_ashr_13_use):
> >
> > LABEL(nibble_ashr_13_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $13, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $13, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1479,7 +1449,7 @@ LABEL(nibble_ashr_13_restart_use):
> > jg LABEL(nibble_ashr_13_use)
> >
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $13, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $13, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1499,7 +1469,7 @@ LABEL(nibble_ashr_13_restart_use):
> > LABEL(nibble_ashr_13_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $13, D(%xmm0)
> > + psrldq $13, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > @@ -1517,10 +1487,10 @@ LABEL(nibble_ashr_13_use):
> > */
> > .p2align 4
> > LABEL(ashr_14):
> > - pslldq $2, D(%xmm2)
> > + pslldq $2, %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2)
> > - psubb %xmm0, D(%xmm2)
> > + pcmpeqb %xmm1, %xmm2
> > + psubb %xmm0, %xmm2
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx
> > shr %cl, %r9d
> > @@ -1550,7 +1520,7 @@ LABEL(loop_ashr_14_use):
> >
> > LABEL(nibble_ashr_14_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $14, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $14, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1569,7 +1539,7 @@ LABEL(nibble_ashr_14_restart_use):
> > jg LABEL(nibble_ashr_14_use)
> >
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $14, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $14, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1589,7 +1559,7 @@ LABEL(nibble_ashr_14_restart_use):
> > LABEL(nibble_ashr_14_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $14, D(%xmm0)
> > + psrldq $14, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > @@ -1607,10 +1577,10 @@ LABEL(nibble_ashr_14_use):
> > */
> > .p2align 4
> > LABEL(ashr_15):
> > - pslldq $1, D(%xmm2)
> > + pslldq $1, %xmm2
> > TOLOWER (%xmm1, %xmm2)
> > - pcmpeqb %xmm1, D(%xmm2)
> > - psubb %xmm0, D(%xmm2)
> > + pcmpeqb %xmm1, %xmm2
> > + psubb %xmm0, %xmm2
> > pmovmskb %xmm2, %r9d
> > shr %cl, %edx
> > shr %cl, %r9d
> > @@ -1642,7 +1612,7 @@ LABEL(loop_ashr_15_use):
> >
> > LABEL(nibble_ashr_15_restart_use):
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $15, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $15, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1661,7 +1631,7 @@ LABEL(nibble_ashr_15_restart_use):
> > jg LABEL(nibble_ashr_15_use)
> >
> > movdqa (%rdi, %rdx), %xmm0
> > - palignr $15, -16(%rdi, %rdx), D(%xmm0)
> > + palignr $15, -16(%rdi, %rdx), %xmm0
> > #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> > pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> > #else
> > @@ -1681,7 +1651,7 @@ LABEL(nibble_ashr_15_restart_use):
> > LABEL(nibble_ashr_15_use):
> > sub $0x1000, %r10
> > movdqa -16(%rdi, %rdx), %xmm0
> > - psrldq $15, D(%xmm0)
> > + psrldq $15, %xmm0
> > pcmpistri $0x3a,%xmm0, %xmm0
> > #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> > cmp %r11, %rcx
> > diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S
> > deleted file mode 100644
> > index b51b86d223..0000000000
> > --- a/sysdeps/x86_64/multiarch/strncase_l-avx.S
> > +++ /dev/null
> > @@ -1,22 +0,0 @@
> > -/* strncasecmp_l optimized with AVX.
> > - Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > - This file is part of the GNU C Library.
> > -
> > - The GNU C Library is free software; you can redistribute it and/or
> > - modify it under the terms of the GNU Lesser General Public
> > - License as published by the Free Software Foundation; either
> > - version 2.1 of the License, or (at your option) any later version.
> > -
> > - The GNU C Library is distributed in the hope that it will be useful,
> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > - Lesser General Public License for more details.
> > -
> > - You should have received a copy of the GNU Lesser General Public
> > - License along with the GNU C Library; if not, see
> > - <https://www.gnu.org/licenses/>. */
> > -
> > -#define STRCMP_SSE42 __strncasecmp_l_avx
> > -#define USE_AVX 1
> > -#define USE_AS_STRNCASECMP_L
> > -#include "strcmp-sse42.S"
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 76+ messages in thread
end of thread, other threads:[~2022-05-12 19:54 UTC | newest]
Thread overview: 76+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
2022-03-23 21:57 ` [PATCH v1 02/23] benchtests: Add random benchmark " Noah Goldstein
2022-03-24 18:44 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch Noah Goldstein
2022-03-24 18:53 ` H.J. Lu
2022-03-24 19:20 ` Noah Goldstein
2022-03-24 19:36 ` H.J. Lu
2022-05-12 19:31 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 04/23] x86: Code cleanup in strchr-evex " Noah Goldstein
2022-03-24 18:54 ` H.J. Lu
2022-05-12 19:32 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c Noah Goldstein
2022-03-24 18:54 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c Noah Goldstein
2022-03-24 18:54 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c Noah Goldstein
2022-03-24 18:55 ` H.J. Lu
2022-05-12 19:34 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c Noah Goldstein
2022-03-24 18:56 ` H.J. Lu
2022-05-12 19:39 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation Noah Goldstein
2022-03-24 18:57 ` H.J. Lu
2022-05-12 19:40 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 10/23] x86: Remove strpbrk-sse2.S " Noah Goldstein
2022-03-24 18:57 ` H.J. Lu
2022-05-12 19:41 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 11/23] x86: Remove strspn-sse2.S " Noah Goldstein
2022-03-24 18:57 ` H.J. Lu
2022-05-12 19:42 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] Noah Goldstein
2022-03-24 18:59 ` H.J. Lu
2022-03-24 19:18 ` Noah Goldstein
2022-03-24 19:34 ` H.J. Lu
2022-03-24 19:39 ` Noah Goldstein
2022-03-24 20:50 ` [PATCH v2 12/31] " Noah Goldstein
2022-03-24 21:26 ` H.J. Lu
2022-03-24 21:43 ` Noah Goldstein
2022-03-24 21:58 ` H.J. Lu
2022-05-04 6:05 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c Noah Goldstein
2022-03-24 19:00 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c Noah Goldstein
2022-03-24 19:00 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c Noah Goldstein
2022-03-24 19:01 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c Noah Goldstein
2022-03-24 19:01 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S Noah Goldstein
2022-03-24 19:02 ` H.J. Lu
2022-05-12 19:44 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S Noah Goldstein
2022-03-24 19:02 ` H.J. Lu
2022-05-12 19:45 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c Noah Goldstein
2022-03-24 19:02 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c Noah Goldstein
2022-03-24 19:02 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
2022-03-24 19:03 ` H.J. Lu
2022-03-24 22:41 ` [PATCH v3 " Noah Goldstein
2022-03-24 22:41 ` [PATCH v3 22/23] x86: Add EVEX " Noah Goldstein
2022-03-24 23:56 ` [PATCH v4 21/23] x86: Add AVX2 " Noah Goldstein
2022-03-24 23:56 ` [PATCH v4 22/23] x86: Add EVEX " Noah Goldstein
2022-03-25 18:15 ` H.J. Lu
2022-03-25 18:18 ` Noah Goldstein
2022-05-12 19:47 ` Sunil Pandey
2022-05-12 19:52 ` Sunil Pandey
2022-03-25 18:14 ` [PATCH v4 21/23] x86: Add AVX2 " H.J. Lu
2022-05-12 19:52 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 22/23] x86: Add EVEX " Noah Goldstein
2022-03-24 19:04 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 23/23] x86: Remove AVX str{n}casecmp Noah Goldstein
2022-03-24 19:04 ` H.J. Lu
2022-05-12 19:54 ` Sunil Pandey
2022-03-24 18:43 ` [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c H.J. Lu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).