* [PATCH v1 1/5] benchtests: Improve bench-strrchr
@ 2022-04-21 3:14 Noah Goldstein
2022-04-21 3:14 ` [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
` (7 more replies)
0 siblings, 8 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 3:14 UTC (permalink / raw)
To: libc-alpha
1. Use json-lib for printing results.
2. Expose all parameters (before this patch pos, seek_char, and max_char were
not printed).
3. Add benchmarks that test multiple occurrences of seek_char in the
string.
---
benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
1 file changed, 82 insertions(+), 44 deletions(-)
diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
index abdae60c51..cceea77e1b 100644
--- a/benchtests/bench-strrchr.c
+++ b/benchtests/bench-strrchr.c
@@ -23,6 +23,7 @@
# define TEST_NAME "strrchr"
#endif
#include "bench-string.h"
+#include "json-lib.h"
#define BIG_CHAR MAX_CHAR
@@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
}
static void
-do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
+ CHAR *exp_res)
{
CHAR *res = CALL (impl, s, c);
size_t i, iters = INNER_LOOP_ITERS8;
@@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
if (res != exp_res)
{
- error (0, 0, "Wrong result in function %s %p %p", impl->name,
- res, exp_res);
+ error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
+ exp_res);
ret = 1;
return;
}
@@ -72,24 +74,28 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
{
CALL (impl, s, c);
}
- TIMING_NOW (stop);
+ TIMING_NOW (stop);
TIMING_DIFF (cur, start, stop);
- TIMING_PRINT_MEAN ((double) cur, (double) iters);
+ json_element_double (json_ctx, (double) cur / (double) iters);
+ return;
}
static void
-do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
+ int seek_char, int max_char, size_t freq)
/* For wcsrchr: align here means align not in bytes,
but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
{
size_t i;
+ size_t pos_chunk_sz = freq ? (pos / freq) : pos;
+ size_t last_pos = len;
CHAR *result;
CHAR *buf = (CHAR *) buf1;
- align &= 7;
+ align &= (getpagesize () - 1);
if ((align + len) * sizeof (CHAR) >= page_size)
return;
@@ -103,6 +109,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
if ((i > pos || pos >= len) && buf[align + i] == seek_char)
buf[align + i] = seek_char + 10 + (random () & 15);
}
+
+ if (pos_chunk_sz == 0 && pos)
+ pos_chunk_sz = 1;
+
+ for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
+ {
+ buf[align + i] = seek_char;
+ last_pos = i;
+ }
+
buf[align + len] = 0;
if (pos < len)
@@ -110,66 +126,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
buf[align + pos] = seek_char;
result = (CHAR *) (buf + align + pos);
}
+ else if (last_pos < len)
+ result = (CHAR *) (buf + align + last_pos);
else if (seek_char == 0)
result = (CHAR *) (buf + align + len);
else
result = NULL;
- printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
+ json_element_object_begin (json_ctx);
+ json_attr_uint (json_ctx, "len", len);
+ json_attr_uint (json_ctx, "pos", pos);
+ json_attr_uint (json_ctx, "align", align);
+ json_attr_uint (json_ctx, "freq", freq);
+ json_attr_uint (json_ctx, "seek", seek_char);
+ json_attr_uint (json_ctx, "max_char", max_char);
+ json_array_begin (json_ctx, "timings");
FOR_EACH_IMPL (impl, 0)
- do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
+ do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
- putchar ('\n');
+ json_array_end (json_ctx);
+ json_element_object_end (json_ctx);
}
int
test_main (void)
{
- size_t i;
+ json_ctx_t json_ctx;
+ size_t i, j;
+ int seek;
test_init ();
+ json_init (&json_ctx, 0, stdout);
- printf ("%20s", "");
- FOR_EACH_IMPL (impl, 0)
- printf ("\t%s", impl->name);
- putchar ('\n');
+ json_document_begin (&json_ctx);
+ json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
- for (i = 1; i < 8; ++i)
- {
- do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
- do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
- }
+ json_attr_object_begin (&json_ctx, "functions");
+ json_attr_object_begin (&json_ctx, TEST_NAME);
+ json_attr_string (&json_ctx, "bench-variant", "");
- for (i = 1; i < 8; ++i)
- {
- do_test (i, 64, 256, 23, SMALL_CHAR);
- do_test (i, 64, 256, 23, BIG_CHAR);
- }
-
- for (i = 0; i < 32; ++i)
- {
- do_test (0, i, i + 1, 23, SMALL_CHAR);
- do_test (0, i, i + 1, 23, BIG_CHAR);
- }
+ json_array_begin (&json_ctx, "ifuncs");
+ FOR_EACH_IMPL (impl, 0)
+ json_element_string (&json_ctx, impl->name);
+ json_array_end (&json_ctx);
- for (i = 1; i < 8; ++i)
- {
- do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
- do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
- }
+ json_array_begin (&json_ctx, "results");
- for (i = 1; i < 8; ++i)
+ for (seek = 0; seek <= 23; seek += 23)
{
- do_test (i, 64, 256, 0, SMALL_CHAR);
- do_test (i, 64, 256, 0, BIG_CHAR);
+ for (j = 1; j < 32; j += j)
+ {
+ for (i = 1; i < 9; ++i)
+ {
+ do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
+ }
+
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
+
+ do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
+ }
+
+ for (i = 0; i < 32; ++i)
+ {
+ do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
+ }
+ if (seek == 0)
+ {
+ break;
+ }
+ }
}
- for (i = 0; i < 32; ++i)
- {
- do_test (0, i, i + 1, 0, SMALL_CHAR);
- do_test (0, i, i + 1, 0, BIG_CHAR);
- }
+ json_array_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_document_end (&json_ctx);
return ret;
}
--
2.25.1
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2
2022-04-21 3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
@ 2022-04-21 3:14 ` Noah Goldstein
2022-04-21 20:26 ` H.J. Lu
2022-04-21 3:14 ` [PATCH v1 3/5] x86: Add wcsrchr optimized with SSE4_1 in wcsrchr-sse4_1.S Noah Goldstein
` (6 subsequent siblings)
7 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 3:14 UTC (permalink / raw)
To: libc-alpha
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.741
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
---
Results For: strrchr
Geometric Mean of N=30 runs.
Geometric Mean of all benchmarks New / Old: 0.741
Benchmarks performance on Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
len, align, pos, seek, max_char, freq, New Time / Old Time
2048, 0, 32, 0, 127, 1, 0.647
2048, 1, 32, 0, 127, 1, 0.621
2048, 0, 64, 0, 127, 1, 0.661
2048, 2, 64, 0, 127, 1, 0.655
2048, 0, 128, 0, 127, 1, 0.69
2048, 3, 128, 0, 127, 1, 0.689
2048, 0, 256, 0, 127, 1, 0.718
2048, 4, 256, 0, 127, 1, 0.718
2048, 0, 512, 0, 127, 1, 0.758
2048, 5, 512, 0, 127, 1, 0.754
2048, 0, 1024, 0, 127, 1, 1.029
2048, 6, 1024, 0, 127, 1, 1.032
2048, 0, 2048, 0, 127, 1, 0.826
2048, 7, 2048, 0, 127, 1, 0.834
2048, 0, 4096, 0, 127, 1, 0.825
2048, 8, 4096, 0, 127, 1, 0.83
256, 1, 64, 0, 127, 1, 0.657
256, 15, 64, 0, 127, 1, 0.657
256, 2, 64, 0, 127, 1, 0.657
256, 30, 64, 0, 127, 1, 0.523
256, 3, 64, 0, 127, 1, 0.657
256, 45, 64, 0, 127, 1, 0.654
256, 4, 64, 0, 127, 1, 0.657
256, 60, 64, 0, 127, 1, 0.526
256, 5, 64, 0, 127, 1, 0.658
256, 75, 64, 0, 127, 1, 0.658
256, 6, 64, 0, 127, 1, 0.655
256, 90, 64, 0, 127, 1, 0.523
256, 7, 64, 0, 127, 1, 0.655
256, 105, 64, 0, 127, 1, 0.654
1, 0, 0, 0, 127, 1, 0.98
2, 0, 1, 0, 127, 1, 0.978
3, 0, 2, 0, 127, 1, 0.975
4, 0, 3, 0, 127, 1, 0.976
5, 0, 4, 0, 127, 1, 0.977
6, 0, 5, 0, 127, 1, 0.981
7, 0, 6, 0, 127, 1, 0.982
8, 0, 7, 0, 127, 1, 0.98
9, 0, 8, 0, 127, 1, 0.978
10, 0, 9, 0, 127, 1, 0.981
11, 0, 10, 0, 127, 1, 0.984
12, 0, 11, 0, 127, 1, 0.982
13, 0, 12, 0, 127, 1, 0.98
14, 0, 13, 0, 127, 1, 0.978
15, 0, 14, 0, 127, 1, 0.979
16, 0, 15, 0, 127, 1, 0.986
17, 0, 16, 0, 127, 1, 0.529
18, 0, 17, 0, 127, 1, 0.566
19, 0, 18, 0, 127, 1, 0.575
20, 0, 19, 0, 127, 1, 0.573
21, 0, 20, 0, 127, 1, 0.579
22, 0, 21, 0, 127, 1, 0.595
23, 0, 22, 0, 127, 1, 0.585
24, 0, 23, 0, 127, 1, 0.586
25, 0, 24, 0, 127, 1, 0.587
26, 0, 25, 0, 127, 1, 0.592
27, 0, 26, 0, 127, 1, 0.595
28, 0, 27, 0, 127, 1, 0.592
29, 0, 28, 0, 127, 1, 0.6
30, 0, 29, 0, 127, 1, 0.598
31, 0, 30, 0, 127, 1, 0.595
32, 0, 31, 0, 127, 1, 0.592
2048, 0, 32, 23, 127, 1, 0.827
2048, 1, 32, 23, 127, 1, 0.826
2048, 0, 64, 23, 127, 1, 0.824
2048, 2, 64, 23, 127, 1, 0.825
2048, 0, 128, 23, 127, 1, 0.829
2048, 3, 128, 23, 127, 1, 0.824
2048, 0, 256, 23, 127, 1, 0.832
2048, 4, 256, 23, 127, 1, 0.825
2048, 0, 512, 23, 127, 1, 0.831
2048, 5, 512, 23, 127, 1, 0.837
2048, 0, 1024, 23, 127, 1, 0.721
2048, 6, 1024, 23, 127, 1, 0.757
2048, 0, 2048, 23, 127, 1, 0.825
2048, 7, 2048, 23, 127, 1, 0.824
2048, 0, 4096, 23, 127, 1, 0.828
2048, 8, 4096, 23, 127, 1, 0.823
256, 1, 64, 23, 127, 1, 0.665
256, 15, 64, 23, 127, 1, 0.661
256, 2, 64, 23, 127, 1, 0.674
256, 30, 64, 23, 127, 1, 0.605
256, 3, 64, 23, 127, 1, 0.668
256, 45, 64, 23, 127, 1, 0.661
256, 4, 64, 23, 127, 1, 0.657
256, 60, 64, 23, 127, 1, 0.594
256, 5, 64, 23, 127, 1, 0.654
256, 75, 64, 23, 127, 1, 0.673
256, 6, 64, 23, 127, 1, 0.688
256, 90, 64, 23, 127, 1, 0.6
256, 7, 64, 23, 127, 1, 0.66
256, 105, 64, 23, 127, 1, 0.654
1, 0, 0, 23, 127, 1, 0.981
2, 0, 1, 23, 127, 1, 0.976
3, 0, 2, 23, 127, 1, 0.983
4, 0, 3, 23, 127, 1, 0.984
5, 0, 4, 23, 127, 1, 0.973
6, 0, 5, 23, 127, 1, 0.987
7, 0, 6, 23, 127, 1, 0.977
8, 0, 7, 23, 127, 1, 0.979
9, 0, 8, 23, 127, 1, 0.981
10, 0, 9, 23, 127, 1, 0.98
11, 0, 10, 23, 127, 1, 0.983
12, 0, 11, 23, 127, 1, 0.98
13, 0, 12, 23, 127, 1, 0.98
14, 0, 13, 23, 127, 1, 0.977
15, 0, 14, 23, 127, 1, 0.982
16, 0, 15, 23, 127, 1, 0.581
17, 0, 16, 23, 127, 1, 0.551
18, 0, 17, 23, 127, 1, 0.555
19, 0, 18, 23, 127, 1, 0.586
20, 0, 19, 23, 127, 1, 0.585
21, 0, 20, 23, 127, 1, 0.582
22, 0, 21, 23, 127, 1, 0.571
23, 0, 22, 23, 127, 1, 0.576
24, 0, 23, 23, 127, 1, 0.581
25, 0, 24, 23, 127, 1, 0.589
26, 0, 25, 23, 127, 1, 0.593
27, 0, 26, 23, 127, 1, 0.595
28, 0, 27, 23, 127, 1, 0.583
29, 0, 28, 23, 127, 1, 0.595
30, 0, 29, 23, 127, 1, 0.58
31, 0, 30, 23, 127, 1, 0.594
32, 0, 31, 23, 127, 1, 0.665
2048, 0, 32, 23, 127, 2, 0.825
2048, 1, 32, 23, 127, 2, 0.818
2048, 0, 64, 23, 127, 2, 0.829
2048, 2, 64, 23, 127, 2, 0.828
2048, 0, 128, 23, 127, 2, 0.823
2048, 3, 128, 23, 127, 2, 0.825
2048, 0, 256, 23, 127, 2, 0.819
2048, 4, 256, 23, 127, 2, 0.828
2048, 0, 512, 23, 127, 2, 0.824
2048, 5, 512, 23, 127, 2, 0.827
2048, 0, 1024, 23, 127, 2, 0.813
2048, 6, 1024, 23, 127, 2, 0.834
2048, 0, 2048, 23, 127, 2, 0.927
2048, 7, 2048, 23, 127, 2, 0.923
2048, 0, 4096, 23, 127, 2, 0.818
2048, 8, 4096, 23, 127, 2, 0.82
256, 1, 64, 23, 127, 2, 0.693
256, 15, 64, 23, 127, 2, 0.686
256, 2, 64, 23, 127, 2, 0.69
256, 30, 64, 23, 127, 2, 0.611
256, 3, 64, 23, 127, 2, 0.692
256, 45, 64, 23, 127, 2, 0.685
256, 4, 64, 23, 127, 2, 0.688
256, 60, 64, 23, 127, 2, 0.6
256, 5, 64, 23, 127, 2, 0.69
256, 75, 64, 23, 127, 2, 0.689
256, 6, 64, 23, 127, 2, 0.688
256, 90, 64, 23, 127, 2, 0.611
256, 7, 64, 23, 127, 2, 0.69
256, 105, 64, 23, 127, 2, 0.686
1, 0, 0, 23, 127, 2, 0.982
2, 0, 1, 23, 127, 2, 0.987
3, 0, 2, 23, 127, 2, 0.978
4, 0, 3, 23, 127, 2, 0.977
5, 0, 4, 23, 127, 2, 0.979
6, 0, 5, 23, 127, 2, 0.985
7, 0, 6, 23, 127, 2, 0.975
8, 0, 7, 23, 127, 2, 0.981
9, 0, 8, 23, 127, 2, 0.984
10, 0, 9, 23, 127, 2, 0.983
11, 0, 10, 23, 127, 2, 0.982
12, 0, 11, 23, 127, 2, 0.976
13, 0, 12, 23, 127, 2, 0.985
14, 0, 13, 23, 127, 2, 0.984
15, 0, 14, 23, 127, 2, 0.98
16, 0, 15, 23, 127, 2, 0.583
17, 0, 16, 23, 127, 2, 0.552
18, 0, 17, 23, 127, 2, 0.564
19, 0, 18, 23, 127, 2, 0.585
20, 0, 19, 23, 127, 2, 0.578
21, 0, 20, 23, 127, 2, 0.578
22, 0, 21, 23, 127, 2, 0.571
23, 0, 22, 23, 127, 2, 0.587
24, 0, 23, 23, 127, 2, 0.589
25, 0, 24, 23, 127, 2, 0.593
26, 0, 25, 23, 127, 2, 0.589
27, 0, 26, 23, 127, 2, 0.588
28, 0, 27, 23, 127, 2, 0.593
29, 0, 28, 23, 127, 2, 0.579
30, 0, 29, 23, 127, 2, 0.572
31, 0, 30, 23, 127, 2, 0.582
32, 0, 31, 23, 127, 2, 0.659
2048, 0, 32, 23, 127, 4, 0.822
2048, 1, 32, 23, 127, 4, 0.818
2048, 0, 64, 23, 127, 4, 0.826
2048, 2, 64, 23, 127, 4, 0.824
2048, 0, 128, 23, 127, 4, 0.833
2048, 3, 128, 23, 127, 4, 0.831
2048, 0, 256, 23, 127, 4, 0.826
2048, 4, 256, 23, 127, 4, 0.831
2048, 0, 512, 23, 127, 4, 0.834
2048, 5, 512, 23, 127, 4, 0.83
2048, 0, 1024, 23, 127, 4, 0.836
2048, 6, 1024, 23, 127, 4, 0.844
2048, 0, 2048, 23, 127, 4, 0.696
2048, 7, 2048, 23, 127, 4, 0.704
2048, 0, 4096, 23, 127, 4, 0.936
2048, 8, 4096, 23, 127, 4, 0.925
256, 1, 64, 23, 127, 4, 0.694
256, 15, 64, 23, 127, 4, 0.69
256, 2, 64, 23, 127, 4, 0.687
256, 30, 64, 23, 127, 4, 0.612
256, 3, 64, 23, 127, 4, 0.685
256, 45, 64, 23, 127, 4, 0.685
256, 4, 64, 23, 127, 4, 0.684
256, 60, 64, 23, 127, 4, 0.606
256, 5, 64, 23, 127, 4, 0.69
256, 75, 64, 23, 127, 4, 0.688
256, 6, 64, 23, 127, 4, 0.69
256, 90, 64, 23, 127, 4, 0.615
256, 7, 64, 23, 127, 4, 0.691
256, 105, 64, 23, 127, 4, 0.688
1, 0, 0, 23, 127, 4, 0.982
2, 0, 1, 23, 127, 4, 0.983
3, 0, 2, 23, 127, 4, 0.981
4, 0, 3, 23, 127, 4, 0.984
5, 0, 4, 23, 127, 4, 0.963
6, 0, 5, 23, 127, 4, 0.978
7, 0, 6, 23, 127, 4, 0.985
8, 0, 7, 23, 127, 4, 0.986
9, 0, 8, 23, 127, 4, 0.978
10, 0, 9, 23, 127, 4, 0.985
11, 0, 10, 23, 127, 4, 0.986
12, 0, 11, 23, 127, 4, 0.983
13, 0, 12, 23, 127, 4, 0.986
14, 0, 13, 23, 127, 4, 0.98
15, 0, 14, 23, 127, 4, 0.979
16, 0, 15, 23, 127, 4, 0.582
17, 0, 16, 23, 127, 4, 0.542
18, 0, 17, 23, 127, 4, 0.564
19, 0, 18, 23, 127, 4, 0.571
20, 0, 19, 23, 127, 4, 0.582
21, 0, 20, 23, 127, 4, 0.573
22, 0, 21, 23, 127, 4, 0.575
23, 0, 22, 23, 127, 4, 0.578
24, 0, 23, 23, 127, 4, 0.58
25, 0, 24, 23, 127, 4, 0.592
26, 0, 25, 23, 127, 4, 0.588
27, 0, 26, 23, 127, 4, 0.574
28, 0, 27, 23, 127, 4, 0.589
29, 0, 28, 23, 127, 4, 0.56
30, 0, 29, 23, 127, 4, 0.587
31, 0, 30, 23, 127, 4, 0.584
32, 0, 31, 23, 127, 4, 0.664
2048, 0, 32, 23, 127, 8, 0.826
2048, 1, 32, 23, 127, 8, 0.821
2048, 0, 64, 23, 127, 8, 0.828
2048, 2, 64, 23, 127, 8, 0.827
2048, 0, 128, 23, 127, 8, 0.833
2048, 3, 128, 23, 127, 8, 0.83
2048, 0, 256, 23, 127, 8, 0.855
2048, 4, 256, 23, 127, 8, 0.849
2048, 0, 512, 23, 127, 8, 0.849
2048, 5, 512, 23, 127, 8, 0.851
2048, 0, 1024, 23, 127, 8, 0.856
2048, 6, 1024, 23, 127, 8, 0.862
2048, 0, 2048, 23, 127, 8, 0.709
2048, 7, 2048, 23, 127, 8, 0.712
2048, 0, 4096, 23, 127, 8, 0.702
2048, 8, 4096, 23, 127, 8, 0.701
256, 1, 64, 23, 127, 8, 0.689
256, 15, 64, 23, 127, 8, 0.688
256, 2, 64, 23, 127, 8, 0.691
256, 30, 64, 23, 127, 8, 0.612
256, 3, 64, 23, 127, 8, 0.688
256, 45, 64, 23, 127, 8, 0.686
256, 4, 64, 23, 127, 8, 0.694
256, 60, 64, 23, 127, 8, 0.609
256, 5, 64, 23, 127, 8, 0.69
256, 75, 64, 23, 127, 8, 0.69
256, 6, 64, 23, 127, 8, 0.691
256, 90, 64, 23, 127, 8, 0.612
256, 7, 64, 23, 127, 8, 0.689
256, 105, 64, 23, 127, 8, 0.688
1, 0, 0, 23, 127, 8, 0.98
2, 0, 1, 23, 127, 8, 0.978
3, 0, 2, 23, 127, 8, 0.98
4, 0, 3, 23, 127, 8, 0.978
5, 0, 4, 23, 127, 8, 0.977
6, 0, 5, 23, 127, 8, 0.984
7, 0, 6, 23, 127, 8, 0.982
8, 0, 7, 23, 127, 8, 0.983
9, 0, 8, 23, 127, 8, 0.987
10, 0, 9, 23, 127, 8, 0.979
11, 0, 10, 23, 127, 8, 0.985
12, 0, 11, 23, 127, 8, 0.981
13, 0, 12, 23, 127, 8, 0.98
14, 0, 13, 23, 127, 8, 0.982
15, 0, 14, 23, 127, 8, 0.981
16, 0, 15, 23, 127, 8, 0.579
17, 0, 16, 23, 127, 8, 0.531
18, 0, 17, 23, 127, 8, 0.577
19, 0, 18, 23, 127, 8, 0.588
20, 0, 19, 23, 127, 8, 0.571
21, 0, 20, 23, 127, 8, 0.576
22, 0, 21, 23, 127, 8, 0.59
23, 0, 22, 23, 127, 8, 0.574
24, 0, 23, 23, 127, 8, 0.583
25, 0, 24, 23, 127, 8, 0.581
26, 0, 25, 23, 127, 8, 0.592
27, 0, 26, 23, 127, 8, 0.586
28, 0, 27, 23, 127, 8, 0.588
29, 0, 28, 23, 127, 8, 0.578
30, 0, 29, 23, 127, 8, 0.573
31, 0, 30, 23, 127, 8, 0.588
32, 0, 31, 23, 127, 8, 0.664
2048, 0, 32, 23, 127, 16, 0.825
2048, 1, 32, 23, 127, 16, 0.823
2048, 0, 64, 23, 127, 16, 0.831
2048, 2, 64, 23, 127, 16, 0.822
2048, 0, 128, 23, 127, 16, 0.831
2048, 3, 128, 23, 127, 16, 0.831
2048, 0, 256, 23, 127, 16, 0.849
2048, 4, 256, 23, 127, 16, 0.85
2048, 0, 512, 23, 127, 16, 0.751
2048, 5, 512, 23, 127, 16, 0.75
2048, 0, 1024, 23, 127, 16, 0.913
2048, 6, 1024, 23, 127, 16, 0.895
2048, 0, 2048, 23, 127, 16, 0.736
2048, 7, 2048, 23, 127, 16, 0.741
2048, 0, 4096, 23, 127, 16, 0.712
2048, 8, 4096, 23, 127, 16, 0.711
256, 1, 64, 23, 127, 16, 0.758
256, 15, 64, 23, 127, 16, 0.692
256, 2, 64, 23, 127, 16, 0.692
256, 30, 64, 23, 127, 16, 0.613
256, 3, 64, 23, 127, 16, 0.69
256, 45, 64, 23, 127, 16, 0.687
256, 4, 64, 23, 127, 16, 0.69
256, 60, 64, 23, 127, 16, 0.604
256, 5, 64, 23, 127, 16, 0.687
256, 75, 64, 23, 127, 16, 0.687
256, 6, 64, 23, 127, 16, 0.69
256, 90, 64, 23, 127, 16, 0.61
256, 7, 64, 23, 127, 16, 0.69
256, 105, 64, 23, 127, 16, 0.685
1, 0, 0, 23, 127, 16, 0.981
2, 0, 1, 23, 127, 16, 0.985
3, 0, 2, 23, 127, 16, 0.985
4, 0, 3, 23, 127, 16, 0.981
5, 0, 4, 23, 127, 16, 0.979
6, 0, 5, 23, 127, 16, 0.986
7, 0, 6, 23, 127, 16, 0.986
8, 0, 7, 23, 127, 16, 0.982
9, 0, 8, 23, 127, 16, 0.982
10, 0, 9, 23, 127, 16, 0.98
11, 0, 10, 23, 127, 16, 0.983
12, 0, 11, 23, 127, 16, 0.982
13, 0, 12, 23, 127, 16, 0.982
14, 0, 13, 23, 127, 16, 0.982
15, 0, 14, 23, 127, 16, 0.982
16, 0, 15, 23, 127, 16, 0.582
17, 0, 16, 23, 127, 16, 0.542
18, 0, 17, 23, 127, 16, 0.554
19, 0, 18, 23, 127, 16, 0.562
20, 0, 19, 23, 127, 16, 0.587
21, 0, 20, 23, 127, 16, 0.584
22, 0, 21, 23, 127, 16, 0.587
23, 0, 22, 23, 127, 16, 0.594
24, 0, 23, 23, 127, 16, 0.581
25, 0, 24, 23, 127, 16, 0.577
26, 0, 25, 23, 127, 16, 0.588
27, 0, 26, 23, 127, 16, 0.589
28, 0, 27, 23, 127, 16, 0.596
29, 0, 28, 23, 127, 16, 0.591
30, 0, 29, 23, 127, 16, 0.585
31, 0, 30, 23, 127, 16, 0.59
32, 0, 31, 23, 127, 16, 0.669
sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
sysdeps/x86_64/strrchr.S | 505 +++++++++++++++---------
sysdeps/x86_64/wcsrchr.S | 268 +------------
4 files changed, 334 insertions(+), 444 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index db1b44c23c..866396e947 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
# undef weak_alias
# define weak_alias(strrchr, rindex)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index 78d1ca6553..69d2f3cdb1 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR __wcsrchr_sse2
#endif
-
#include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index 50d886713e..94449ad806 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,355 @@
#include <sysdep.h>
+#ifndef STRRCHR
+# define STRRCHR strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ pcmpeqd
+# define CHAR_SIZE 4
+# define PMINU pminud
+#else
+# define PCMPEQ pcmpeqb
+# define CHAR_SIZE 1
+# define PMINU pminub
+#endif
+
+#define PAGE_SIZE 4096
+#define VEC_SIZE 16
+
.text
-ENTRY (strrchr)
- movd %esi, %xmm1
+ENTRY(STRRCHR)
+ movd %esi, %xmm0
movq %rdi, %rax
- andl $4095, %eax
- punpcklbw %xmm1, %xmm1
- cmpq $4032, %rax
- punpcklwd %xmm1, %xmm1
- pshufd $0, %xmm1, %xmm1
+ andl $(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+#endif
+ pshufd $0, %xmm0, %xmm0
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(cross_page)
- movdqu (%rdi), %xmm0
+
+L(cross_page_continue):
+ movups (%rdi), %xmm1
pxor %xmm2, %xmm2
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %ecx
- pmovmskb %xmm3, %edx
- testq %rdx, %rdx
- je L(next_48_bytes)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rcx, %rax
- je L(exit)
- bsrq %rax, %rax
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret0):
ret
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
.p2align 4
-L(next_48_bytes):
- movdqu 16(%rdi), %xmm4
- movdqa %xmm4, %xmm5
- movdqu 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm2, %xmm5
- movdqu 48(%rdi), %xmm0
- pmovmskb %xmm5, %edx
- movdqa %xmm3, %xmm5
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm5
- pcmpeqb %xmm0, %xmm2
- salq $16, %rdx
- pmovmskb %xmm3, %r8d
- pmovmskb %xmm5, %eax
- pmovmskb %xmm2, %esi
- salq $32, %r8
- salq $32, %rax
- pcmpeqb %xmm1, %xmm0
- orq %rdx, %rax
- movq %rsi, %rdx
- pmovmskb %xmm4, %esi
- salq $48, %rdx
- salq $16, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
- orq %rcx, %rsi
- orq %rdx, %rax
- je L(loop_header2)
- leaq -1(%rax), %rcx
- xorq %rax, %rcx
- andq %rcx, %rsi
- je L(exit)
- bsrq %rsi, %rsi
- leaq (%rdi,%rsi), %rax
+L(first_vec_x0_test):
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ testl %eax, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %r8, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
-L(loop_header2):
- testq %rsi, %rsi
- movq %rdi, %rcx
- je L(no_c_found)
-L(loop_header):
- addq $64, %rdi
- pxor %xmm7, %xmm7
- andq $-64, %rdi
- jmp L(loop_entry)
+L(first_vec_x1):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
.p2align 4
-L(loop64):
- testq %rdx, %rdx
- cmovne %rdx, %rsi
- cmovne %rdi, %rcx
- addq $64, %rdi
-L(loop_entry):
- movdqa 32(%rdi), %xmm3
- pxor %xmm6, %xmm6
- movdqa 48(%rdi), %xmm2
- movdqa %xmm3, %xmm0
- movdqa 16(%rdi), %xmm4
- pminub %xmm2, %xmm0
- movdqa (%rdi), %xmm5
- pminub %xmm4, %xmm0
- pminub %xmm5, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %eax
- movdqa %xmm5, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %r9d
- movdqa %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- movdqa %xmm3, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $16, %rdx
- pmovmskb %xmm0, %r10d
- movdqa %xmm2, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $32, %r10
- orq %r10, %rdx
- pmovmskb %xmm0, %r8d
- orq %r9, %rdx
- salq $48, %r8
- orq %r8, %rdx
+L(first_vec_x1_test):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
testl %eax, %eax
- je L(loop64)
- pcmpeqb %xmm6, %xmm4
- pcmpeqb %xmm6, %xmm3
- pcmpeqb %xmm6, %xmm5
- pmovmskb %xmm4, %eax
- pmovmskb %xmm3, %r10d
- pcmpeqb %xmm6, %xmm2
- pmovmskb %xmm5, %r9d
- salq $32, %r10
- salq $16, %rax
- pmovmskb %xmm2, %r8d
- orq %r10, %rax
- orq %r9, %rax
- salq $48, %r8
- orq %r8, %rax
- leaq -1(%rax), %r8
- xorq %rax, %r8
- andq %r8, %rdx
- cmovne %rdi, %rcx
- cmovne %rdx, %rsi
- bsrq %rsi, %rsi
- leaq (%rcx,%rsi), %rax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm3, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+ andq $-VEC_SIZE, %rdi
+
+ movaps VEC_SIZE(%rdi), %xmm2
+ pxor %xmm3, %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pmovmskb %xmm3, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
+
+ movaps (VEC_SIZE * 2)(%rdi), %xmm3
+ pxor %xmm4, %xmm4
+ PCMPEQ %xmm3, %xmm4
+ pmovmskb %xmm4, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+
+ addq $VEC_SIZE, %rdi
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ andq $-(VEC_SIZE * 2), %rdi
+ .p2align 4
+L(first_loop):
+ /* Do 2x VEC at a time. */
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+ /* If SSE2 no pminud. */
+#ifdef NO_PMINU
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef NO_PMINU
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Use `addl` 1) so we can undo it with `subl` and 2) it can
+ macro-fuse with `jz`. */
+ addl %ecx, %eax
+ jz L(first_loop)
+
+ /* Check if there is zero match. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+ /* Check if there was a match in last iteration. */
+ subl %ecx, %eax
+ jnz L(new_match)
+
+L(first_loop_old_match):
+ PCMPEQ %xmm0, %xmm2
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ addl %eax, %ecx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ sall $16, %eax
+ orl %ecx, %eax
+
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+ /* We can't reuse either of the old comparisons since we mask
+ off zeros after the first zero (instead of using the full
+ comparison) so we can't guarantee no interference between a
+ match after the end of the string and a valid match. */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
+ /* Save minimum state for getting most recent match. We can
+ throw out all previous work. */
.p2align 4
-L(no_c_found):
- movl $1, %esi
- xorl %ecx, %ecx
- jmp L(loop_header)
+L(second_loop_match):
+ movq %rdi, %rsi
+ movaps %xmm4, %xmm2
+ movaps %xmm7, %xmm3
.p2align 4
-L(exit):
- xorl %eax, %eax
+L(second_loop):
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+#ifdef NO_PMINU
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef NO_PMINU
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Either null term or new occurrence of CHAR. */
+ addl %ecx, %eax
+ jz L(second_loop)
+
+ /* No null term so must be new occurrence of CHAR. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+
+ subl %ecx, %eax
+ jnz L(second_loop_new_match)
+
+L(second_loop_old_match):
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ sall $16, %eax
+ orl %ecx, %eax
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
+L(second_loop_new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+ /* We can't reuse either of the old comparisons since we mask
+ off zeros after the first zero (instead of using the full
+ comparison) so we can't guarantee no interference between a
+ match after the end of the string and a valid match. */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(second_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4,, 4
L(cross_page):
- movq %rdi, %rax
- pxor %xmm0, %xmm0
- andq $-64, %rax
- movdqu (%rax), %xmm5
- movdqa %xmm5, %xmm6
- movdqu 16(%rax), %xmm4
- pcmpeqb %xmm1, %xmm5
- pcmpeqb %xmm0, %xmm6
- movdqu 32(%rax), %xmm3
- pmovmskb %xmm6, %esi
- movdqa %xmm4, %xmm6
- movdqu 48(%rax), %xmm2
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm0, %xmm6
- pmovmskb %xmm6, %edx
- movdqa %xmm3, %xmm6
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm0, %xmm6
- pcmpeqb %xmm2, %xmm0
- salq $16, %rdx
- pmovmskb %xmm3, %r9d
- pmovmskb %xmm6, %r8d
- pmovmskb %xmm0, %ecx
- salq $32, %r9
- salq $32, %r8
- pcmpeqb %xmm1, %xmm2
- orq %r8, %rdx
- salq $48, %rcx
- pmovmskb %xmm5, %r8d
- orq %rsi, %rdx
- pmovmskb %xmm4, %esi
- orq %rcx, %rdx
- pmovmskb %xmm2, %ecx
- salq $16, %rsi
- salq $48, %rcx
- orq %r9, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ movaps (%rsi), %xmm1
+ pxor %xmm2, %xmm2
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
movl %edi, %ecx
- subl %eax, %ecx
- shrq %cl, %rdx
- shrq %cl, %rsi
- testq %rdx, %rdx
- je L(loop_header2)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rax, %rsi
- je L(exit)
- bsrq %rsi, %rax
+ andl $(VEC_SIZE - 1), %ecx
+ sarl %cl, %edx
+ jz L(cross_page_continue)
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ sarl %cl, %eax
+ leal -1(%rdx), %ecx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret1):
ret
-END (strrchr)
+END(STRRCHR)
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+ weak_alias (STRRCHR, rindex)
+ libc_hidden_builtin_def (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 61552954de..2b80efc5ef 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -1,4 +1,4 @@
-/* wcsrchr with SSSE3
+/* wcsrchr optimized with SSE2.
Copyright (C) 2011-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,266 +16,12 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
- .text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR 1
+#define NO_PMINU 1
- movd %rsi, %xmm1
- mov %rdi, %rcx
- punpckldq %xmm1, %xmm1
- pxor %xmm2, %xmm2
- punpckldq %xmm1, %xmm1
- and $63, %rcx
- cmp $48, %rcx
- ja L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR wcsrchr
+#endif
- movdqu (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match1)
-
- test %rcx, %rcx
- jnz L(return_null)
-
- and $-16, %rdi
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match1):
- test %rcx, %rcx
- jnz L(prolog_find_zero_1)
-
- mov %rax, %r8
- mov %rdi, %rsi
- and $-16, %rdi
- jmp L(loop)
-
- .p2align 4
-L(crosscache):
- and $15, %rcx
- and $-16, %rdi
- pxor %xmm3, %xmm3
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm3
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm3, %rdx
- pmovmskb %xmm0, %rax
- shr %cl, %rdx
- shr %cl, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match)
-
- test %rdx, %rdx
- jnz L(return_null)
-
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match):
- test %rdx, %rdx
- jnz L(prolog_find_zero)
-
- mov %rax, %r8
- lea (%rdi, %rcx), %rsi
-
-/* Loop start on aligned string. */
- .p2align 4
-L(loop):
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm3
- pcmpeqd %xmm3, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm3
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm3, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm4
- pcmpeqd %xmm4, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm4
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm4, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm5
- pcmpeqd %xmm5, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm5
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm5, %rax
- or %rax, %rcx
- jz L(loop)
-
- .p2align 4
-L(matches):
- test %rax, %rax
- jnz L(match)
-L(return_value):
- test %r8, %r8
- jz L(return_null)
- mov %r8, %rax
- mov %rsi, %rdi
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match):
- pmovmskb %xmm2, %rcx
- test %rcx, %rcx
- jnz L(find_zero)
- mov %rax, %r8
- mov %rdi, %rsi
- jmp L(loop)
-
- .p2align 4
-L(find_zero):
- test $15, %cl
- jnz L(find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(find_zero_in_second_wchar)
- test $15, %ch
- jnz L(find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_value)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_value)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero):
- add %rcx, %rdi
- mov %rdx, %rcx
-L(prolog_find_zero_1):
- test $15, %cl
- jnz L(prolog_find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(prolog_find_zero_in_second_wchar)
- test $15, %ch
- jnz L(prolog_find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_null)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_null)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match_second_wchar):
- lea -12(%rdi), %rax
- ret
-
- .p2align 4
-L(match_third_wchar):
- lea -8(%rdi), %rax
- ret
-
- .p2align 4
-L(match_fourth_wchar):
- lea -4(%rdi), %rax
- ret
-
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-
-END (wcsrchr)
+#include "../strrchr.S"
--
2.25.1
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v1 3/5] x86: Add wcsrchr optimized with SSE4_1 in wcsrchr-sse4_1.S
2022-04-21 3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
2022-04-21 3:14 ` [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
@ 2022-04-21 3:14 ` Noah Goldstein
2022-04-21 3:14 ` [PATCH v1 4/5] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
` (5 subsequent siblings)
7 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 3:14 UTC (permalink / raw)
To: libc-alpha
wcsrchr-sse2 can't use `pminud`, which can speed up the main loop:
len, align, pos, seek, max_char, freq, New Time / Old Time
256, 1, 64, 23, 1273, 1, 1.082
256, 1, 64, 23, 2147483647, 1, 1.076
256, 15, 64, 23, 1273, 1, 1.061
256, 15, 64, 23, 2147483647, 1, 1.075
256, 2, 64, 23, 1273, 1, 1.108
256, 2, 64, 23, 2147483647, 1, 1.109
256, 30, 64, 23, 1273, 1, 1.072
256, 30, 64, 23, 2147483647, 1, 1.077
256, 3, 64, 23, 1273, 1, 1.108
256, 3, 64, 23, 2147483647, 1, 1.103
256, 45, 64, 23, 1273, 1, 1.076
256, 45, 64, 23, 2147483647, 1, 1.079
256, 4, 64, 23, 1273, 1, 1.119
256, 4, 64, 23, 2147483647, 1, 1.112
256, 60, 64, 23, 1273, 1, 1.117
256, 60, 64, 23, 2147483647, 1, 1.112
256, 5, 64, 23, 1273, 1, 1.21
256, 5, 64, 23, 2147483647, 1, 1.194
256, 75, 64, 23, 1273, 1, 1.055
256, 75, 64, 23, 2147483647, 1, 1.045
256, 6, 64, 23, 1273, 1, 1.264
256, 6, 64, 23, 2147483647, 1, 1.3
256, 90, 64, 23, 1273, 1, 1.022
256, 90, 64, 23, 2147483647, 1, 1.026
256, 7, 64, 23, 1273, 1, 1.316
256, 7, 64, 23, 2147483647, 1, 1.325
Overall this leads to a 5% performance improvement in the benchmark
suite.
Full xcheck passes on x86_64 with and without multiarch enabled.
---
sysdeps/x86_64/multiarch/Makefile | 1 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 3 +++
sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S | 21 +++++++++++++++++++++
sysdeps/x86_64/multiarch/wcsrchr.c | 3 ++-
4 files changed, 27 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 0400ea332b..5ad7bc8c25 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -154,6 +154,7 @@ sysdep_routines += \
wcsrchr-avx2-rtm \
wcsrchr-evex \
wcsrchr-sse2 \
+ wcsrchr-sse4_1 \
wmemchr-avx2 \
wmemchr-avx2-rtm \
wmemchr-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a8afcf81bb..1cbb6938c8 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -685,6 +685,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsrchr_evex)
+ IFUNC_IMPL_ADD (array, i, wcsrchr,
+ CPU_FEATURE_USABLE (SSE4_1),
+ __wcsrchr_sse4_1)
IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
/* Support sysdeps/x86_64/multiarch/wcscmp.c. */
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S b/sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S
new file mode 100644
index 0000000000..34b92d28eb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S
@@ -0,0 +1,21 @@
+/* wcsrchr optimized with SSE4.1.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define USE_AS_WCSRCHR 1
+#define STRRCHR __wcsrchr_sse4_1
+#include "../strrchr.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr.c b/sysdeps/x86_64/multiarch/wcsrchr.c
index 8b30c06f2e..eb18038eec 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr.c
+++ b/sysdeps/x86_64/multiarch/wcsrchr.c
@@ -23,7 +23,8 @@
# undef wcsrchr
# define SYMBOL_NAME wcsrchr
-# include "ifunc-avx2.h"
+
+# include "ifunc-wcslen.h"
libc_ifunc_redirected (__redirect_wcsrchr, wcsrchr, IFUNC_SELECTOR ());
#endif
--
2.25.1
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v1 4/5] x86: Optimize {str|wcs}rchr-avx2
2022-04-21 3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
2022-04-21 3:14 ` [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
2022-04-21 3:14 ` [PATCH v1 3/5] x86: Add wcsrchr optimized with SSE4_1 in wcsrchr-sse4_1.S Noah Goldstein
@ 2022-04-21 3:14 ` Noah Goldstein
2022-04-21 3:14 ` [PATCH v1 5/5] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
` (4 subsequent siblings)
7 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 3:14 UTC (permalink / raw)
To: libc-alpha
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.832
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
---
Results For: strrchr
Geometric Mean of N=30 runs.
Geometric Mean of all benchmarks New / Old: 0.832
Benchmarks performance on Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
len, align, pos, seek, max_char, freq, New Time / Old Time
2048, 0, 32, 0, 127, 1, 0.673
2048, 1, 32, 0, 127, 1, 0.68
2048, 0, 64, 0, 127, 1, 0.566
2048, 2, 64, 0, 127, 1, 0.574
2048, 0, 128, 0, 127, 1, 0.976
2048, 3, 128, 0, 127, 1, 0.967
2048, 0, 256, 0, 127, 1, 0.931
2048, 4, 256, 0, 127, 1, 0.921
2048, 0, 512, 0, 127, 1, 0.792
2048, 5, 512, 0, 127, 1, 0.78
2048, 0, 1024, 0, 127, 1, 0.733
2048, 6, 1024, 0, 127, 1, 0.729
2048, 0, 2048, 0, 127, 1, 0.795
2048, 7, 2048, 0, 127, 1, 0.805
2048, 0, 4096, 0, 127, 1, 0.803
2048, 8, 4096, 0, 127, 1, 0.794
256, 1, 64, 0, 127, 1, 0.584
256, 15, 64, 0, 127, 1, 0.587
256, 2, 64, 0, 127, 1, 0.586
256, 30, 64, 0, 127, 1, 0.592
256, 3, 64, 0, 127, 1, 0.586
256, 45, 64, 0, 127, 1, 0.505
256, 4, 64, 0, 127, 1, 0.59
256, 60, 64, 0, 127, 1, 0.501
256, 5, 64, 0, 127, 1, 0.595
256, 75, 64, 0, 127, 1, 0.588
256, 6, 64, 0, 127, 1, 0.593
256, 90, 64, 0, 127, 1, 0.594
256, 7, 64, 0, 127, 1, 0.596
256, 105, 64, 0, 127, 1, 0.506
1, 0, 0, 0, 127, 1, 0.872
2, 0, 1, 0, 127, 1, 0.861
3, 0, 2, 0, 127, 1, 0.862
4, 0, 3, 0, 127, 1, 0.884
5, 0, 4, 0, 127, 1, 0.869
6, 0, 5, 0, 127, 1, 0.861
7, 0, 6, 0, 127, 1, 0.865
8, 0, 7, 0, 127, 1, 0.884
9, 0, 8, 0, 127, 1, 0.862
10, 0, 9, 0, 127, 1, 0.889
11, 0, 10, 0, 127, 1, 0.9
12, 0, 11, 0, 127, 1, 0.897
13, 0, 12, 0, 127, 1, 0.909
14, 0, 13, 0, 127, 1, 0.885
15, 0, 14, 0, 127, 1, 0.929
16, 0, 15, 0, 127, 1, 0.871
17, 0, 16, 0, 127, 1, 0.875
18, 0, 17, 0, 127, 1, 0.878
19, 0, 18, 0, 127, 1, 0.889
20, 0, 19, 0, 127, 1, 0.89
21, 0, 20, 0, 127, 1, 0.901
22, 0, 21, 0, 127, 1, 0.91
23, 0, 22, 0, 127, 1, 0.912
24, 0, 23, 0, 127, 1, 0.907
25, 0, 24, 0, 127, 1, 0.947
26, 0, 25, 0, 127, 1, 0.904
27, 0, 26, 0, 127, 1, 0.921
28, 0, 27, 0, 127, 1, 0.899
29, 0, 28, 0, 127, 1, 0.923
30, 0, 29, 0, 127, 1, 0.918
31, 0, 30, 0, 127, 1, 0.943
32, 0, 31, 0, 127, 1, 0.914
2048, 0, 32, 23, 127, 1, 0.815
2048, 1, 32, 23, 127, 1, 0.829
2048, 0, 64, 23, 127, 1, 0.884
2048, 2, 64, 23, 127, 1, 0.882
2048, 0, 128, 23, 127, 1, 0.884
2048, 3, 128, 23, 127, 1, 0.851
2048, 0, 256, 23, 127, 1, 0.843
2048, 4, 256, 23, 127, 1, 0.867
2048, 0, 512, 23, 127, 1, 0.746
2048, 5, 512, 23, 127, 1, 0.863
2048, 0, 1024, 23, 127, 1, 0.662
2048, 6, 1024, 23, 127, 1, 0.683
2048, 0, 2048, 23, 127, 1, 0.852
2048, 7, 2048, 23, 127, 1, 0.837
2048, 0, 4096, 23, 127, 1, 0.837
2048, 8, 4096, 23, 127, 1, 0.829
256, 1, 64, 23, 127, 1, 0.934
256, 15, 64, 23, 127, 1, 0.936
256, 2, 64, 23, 127, 1, 0.931
256, 30, 64, 23, 127, 1, 0.938
256, 3, 64, 23, 127, 1, 0.927
256, 45, 64, 23, 127, 1, 0.863
256, 4, 64, 23, 127, 1, 0.939
256, 60, 64, 23, 127, 1, 0.871
256, 5, 64, 23, 127, 1, 0.94
256, 75, 64, 23, 127, 1, 0.933
256, 6, 64, 23, 127, 1, 0.915
256, 90, 64, 23, 127, 1, 0.934
256, 7, 64, 23, 127, 1, 0.938
256, 105, 64, 23, 127, 1, 0.871
1, 0, 0, 23, 127, 1, 0.865
2, 0, 1, 23, 127, 1, 0.87
3, 0, 2, 23, 127, 1, 0.882
4, 0, 3, 23, 127, 1, 0.901
5, 0, 4, 23, 127, 1, 0.879
6, 0, 5, 23, 127, 1, 0.934
7, 0, 6, 23, 127, 1, 0.874
8, 0, 7, 23, 127, 1, 0.895
9, 0, 8, 23, 127, 1, 0.873
10, 0, 9, 23, 127, 1, 0.861
11, 0, 10, 23, 127, 1, 0.865
12, 0, 11, 23, 127, 1, 0.875
13, 0, 12, 23, 127, 1, 0.878
14, 0, 13, 23, 127, 1, 0.86
15, 0, 14, 23, 127, 1, 0.889
16, 0, 15, 23, 127, 1, 0.875
17, 0, 16, 23, 127, 1, 0.911
18, 0, 17, 23, 127, 1, 0.891
19, 0, 18, 23, 127, 1, 0.921
20, 0, 19, 23, 127, 1, 0.898
21, 0, 20, 23, 127, 1, 0.895
22, 0, 21, 23, 127, 1, 0.906
23, 0, 22, 23, 127, 1, 0.911
24, 0, 23, 23, 127, 1, 0.877
25, 0, 24, 23, 127, 1, 0.9
26, 0, 25, 23, 127, 1, 0.911
27, 0, 26, 23, 127, 1, 0.926
28, 0, 27, 23, 127, 1, 0.918
29, 0, 28, 23, 127, 1, 0.952
30, 0, 29, 23, 127, 1, 0.943
31, 0, 30, 23, 127, 1, 0.934
32, 0, 31, 23, 127, 1, 0.8
2048, 0, 32, 23, 127, 2, 0.872
2048, 1, 32, 23, 127, 2, 0.819
2048, 0, 64, 23, 127, 2, 0.815
2048, 2, 64, 23, 127, 2, 0.805
2048, 0, 128, 23, 127, 2, 0.884
2048, 3, 128, 23, 127, 2, 0.852
2048, 0, 256, 23, 127, 2, 0.873
2048, 4, 256, 23, 127, 2, 0.871
2048, 0, 512, 23, 127, 2, 0.654
2048, 5, 512, 23, 127, 2, 0.762
2048, 0, 1024, 23, 127, 2, 0.646
2048, 6, 1024, 23, 127, 2, 0.665
2048, 0, 2048, 23, 127, 2, 0.678
2048, 7, 2048, 23, 127, 2, 0.675
2048, 0, 4096, 23, 127, 2, 0.849
2048, 8, 4096, 23, 127, 2, 0.835
256, 1, 64, 23, 127, 2, 0.917
256, 15, 64, 23, 127, 2, 0.915
256, 2, 64, 23, 127, 2, 0.911
256, 30, 64, 23, 127, 2, 0.907
256, 3, 64, 23, 127, 2, 0.9
256, 45, 64, 23, 127, 2, 0.816
256, 4, 64, 23, 127, 2, 0.912
256, 60, 64, 23, 127, 2, 0.81
256, 5, 64, 23, 127, 2, 0.904
256, 75, 64, 23, 127, 2, 0.911
256, 6, 64, 23, 127, 2, 0.898
256, 90, 64, 23, 127, 2, 0.912
256, 7, 64, 23, 127, 2, 0.909
256, 105, 64, 23, 127, 2, 0.81
1, 0, 0, 23, 127, 2, 0.858
2, 0, 1, 23, 127, 2, 0.89
3, 0, 2, 23, 127, 2, 0.877
4, 0, 3, 23, 127, 2, 0.863
5, 0, 4, 23, 127, 2, 0.863
6, 0, 5, 23, 127, 2, 0.889
7, 0, 6, 23, 127, 2, 0.898
8, 0, 7, 23, 127, 2, 0.885
9, 0, 8, 23, 127, 2, 0.863
10, 0, 9, 23, 127, 2, 0.902
11, 0, 10, 23, 127, 2, 0.865
12, 0, 11, 23, 127, 2, 0.864
13, 0, 12, 23, 127, 2, 0.87
14, 0, 13, 23, 127, 2, 0.862
15, 0, 14, 23, 127, 2, 0.861
16, 0, 15, 23, 127, 2, 0.859
17, 0, 16, 23, 127, 2, 0.87
18, 0, 17, 23, 127, 2, 0.892
19, 0, 18, 23, 127, 2, 0.874
20, 0, 19, 23, 127, 2, 0.866
21, 0, 20, 23, 127, 2, 0.877
22, 0, 21, 23, 127, 2, 0.868
23, 0, 22, 23, 127, 2, 0.884
24, 0, 23, 23, 127, 2, 0.881
25, 0, 24, 23, 127, 2, 0.872
26, 0, 25, 23, 127, 2, 0.866
27, 0, 26, 23, 127, 2, 0.881
28, 0, 27, 23, 127, 2, 0.93
29, 0, 28, 23, 127, 2, 0.886
30, 0, 29, 23, 127, 2, 0.869
31, 0, 30, 23, 127, 2, 0.869
32, 0, 31, 23, 127, 2, 0.667
2048, 0, 32, 23, 127, 4, 0.858
2048, 1, 32, 23, 127, 4, 0.858
2048, 0, 64, 23, 127, 4, 0.838
2048, 2, 64, 23, 127, 4, 0.834
2048, 0, 128, 23, 127, 4, 0.85
2048, 3, 128, 23, 127, 4, 0.762
2048, 0, 256, 23, 127, 4, 0.874
2048, 4, 256, 23, 127, 4, 0.796
2048, 0, 512, 23, 127, 4, 0.691
2048, 5, 512, 23, 127, 4, 0.755
2048, 0, 1024, 23, 127, 4, 0.676
2048, 6, 1024, 23, 127, 4, 0.661
2048, 0, 2048, 23, 127, 4, 0.678
2048, 7, 2048, 23, 127, 4, 0.678
2048, 0, 4096, 23, 127, 4, 0.676
2048, 8, 4096, 23, 127, 4, 0.677
256, 1, 64, 23, 127, 4, 0.875
256, 15, 64, 23, 127, 4, 0.877
256, 2, 64, 23, 127, 4, 0.875
256, 30, 64, 23, 127, 4, 0.875
256, 3, 64, 23, 127, 4, 0.878
256, 45, 64, 23, 127, 4, 0.829
256, 4, 64, 23, 127, 4, 0.876
256, 60, 64, 23, 127, 4, 0.807
256, 5, 64, 23, 127, 4, 0.874
256, 75, 64, 23, 127, 4, 0.872
256, 6, 64, 23, 127, 4, 0.874
256, 90, 64, 23, 127, 4, 0.874
256, 7, 64, 23, 127, 4, 0.873
256, 105, 64, 23, 127, 4, 0.826
1, 0, 0, 23, 127, 4, 0.863
2, 0, 1, 23, 127, 4, 0.861
3, 0, 2, 23, 127, 4, 0.863
4, 0, 3, 23, 127, 4, 0.867
5, 0, 4, 23, 127, 4, 0.866
6, 0, 5, 23, 127, 4, 0.873
7, 0, 6, 23, 127, 4, 0.873
8, 0, 7, 23, 127, 4, 0.866
9, 0, 8, 23, 127, 4, 0.861
10, 0, 9, 23, 127, 4, 0.861
11, 0, 10, 23, 127, 4, 0.857
12, 0, 11, 23, 127, 4, 0.864
13, 0, 12, 23, 127, 4, 0.86
14, 0, 13, 23, 127, 4, 0.859
15, 0, 14, 23, 127, 4, 0.854
16, 0, 15, 23, 127, 4, 0.857
17, 0, 16, 23, 127, 4, 0.881
18, 0, 17, 23, 127, 4, 0.863
19, 0, 18, 23, 127, 4, 0.86
20, 0, 19, 23, 127, 4, 0.906
21, 0, 20, 23, 127, 4, 0.924
22, 0, 21, 23, 127, 4, 0.885
23, 0, 22, 23, 127, 4, 0.861
24, 0, 23, 23, 127, 4, 0.907
25, 0, 24, 23, 127, 4, 0.909
26, 0, 25, 23, 127, 4, 0.863
27, 0, 26, 23, 127, 4, 0.862
28, 0, 27, 23, 127, 4, 0.887
29, 0, 28, 23, 127, 4, 0.879
30, 0, 29, 23, 127, 4, 0.932
31, 0, 30, 23, 127, 4, 0.895
32, 0, 31, 23, 127, 4, 0.666
2048, 0, 32, 23, 127, 8, 0.865
2048, 1, 32, 23, 127, 8, 0.892
2048, 0, 64, 23, 127, 8, 0.85
2048, 2, 64, 23, 127, 8, 0.834
2048, 0, 128, 23, 127, 8, 0.823
2048, 3, 128, 23, 127, 8, 0.809
2048, 0, 256, 23, 127, 8, 0.84
2048, 4, 256, 23, 127, 8, 0.738
2048, 0, 512, 23, 127, 8, 0.656
2048, 5, 512, 23, 127, 8, 0.644
2048, 0, 1024, 23, 127, 8, 0.705
2048, 6, 1024, 23, 127, 8, 0.708
2048, 0, 2048, 23, 127, 8, 0.701
2048, 7, 2048, 23, 127, 8, 0.7
2048, 0, 4096, 23, 127, 8, 0.68
2048, 8, 4096, 23, 127, 8, 0.678
256, 1, 64, 23, 127, 8, 0.881
256, 15, 64, 23, 127, 8, 0.879
256, 2, 64, 23, 127, 8, 0.878
256, 30, 64, 23, 127, 8, 0.877
256, 3, 64, 23, 127, 8, 0.88
256, 45, 64, 23, 127, 8, 0.829
256, 4, 64, 23, 127, 8, 0.883
256, 60, 64, 23, 127, 8, 0.808
256, 5, 64, 23, 127, 8, 0.875
256, 75, 64, 23, 127, 8, 0.877
256, 6, 64, 23, 127, 8, 0.874
256, 90, 64, 23, 127, 8, 0.874
256, 7, 64, 23, 127, 8, 0.874
256, 105, 64, 23, 127, 8, 0.83
1, 0, 0, 23, 127, 8, 0.862
2, 0, 1, 23, 127, 8, 0.865
3, 0, 2, 23, 127, 8, 0.866
4, 0, 3, 23, 127, 8, 0.863
5, 0, 4, 23, 127, 8, 0.874
6, 0, 5, 23, 127, 8, 0.87
7, 0, 6, 23, 127, 8, 0.87
8, 0, 7, 23, 127, 8, 0.864
9, 0, 8, 23, 127, 8, 0.87
10, 0, 9, 23, 127, 8, 0.861
11, 0, 10, 23, 127, 8, 0.862
12, 0, 11, 23, 127, 8, 0.87
13, 0, 12, 23, 127, 8, 0.858
14, 0, 13, 23, 127, 8, 0.86
15, 0, 14, 23, 127, 8, 0.863
16, 0, 15, 23, 127, 8, 0.866
17, 0, 16, 23, 127, 8, 0.86
18, 0, 17, 23, 127, 8, 0.887
19, 0, 18, 23, 127, 8, 0.858
20, 0, 19, 23, 127, 8, 0.891
21, 0, 20, 23, 127, 8, 0.874
22, 0, 21, 23, 127, 8, 0.891
23, 0, 22, 23, 127, 8, 0.873
24, 0, 23, 23, 127, 8, 0.895
25, 0, 24, 23, 127, 8, 0.884
26, 0, 25, 23, 127, 8, 0.878
27, 0, 26, 23, 127, 8, 0.878
28, 0, 27, 23, 127, 8, 0.891
29, 0, 28, 23, 127, 8, 0.91
30, 0, 29, 23, 127, 8, 0.881
31, 0, 30, 23, 127, 8, 0.917
32, 0, 31, 23, 127, 8, 0.667
2048, 0, 32, 23, 127, 16, 0.86
2048, 1, 32, 23, 127, 16, 0.847
2048, 0, 64, 23, 127, 16, 0.846
2048, 2, 64, 23, 127, 16, 0.852
2048, 0, 128, 23, 127, 16, 0.82
2048, 3, 128, 23, 127, 16, 0.751
2048, 0, 256, 23, 127, 16, 0.788
2048, 4, 256, 23, 127, 16, 0.712
2048, 0, 512, 23, 127, 16, 0.524
2048, 5, 512, 23, 127, 16, 0.517
2048, 0, 1024, 23, 127, 16, 0.583
2048, 6, 1024, 23, 127, 16, 0.682
2048, 0, 2048, 23, 127, 16, 0.77
2048, 7, 2048, 23, 127, 16, 0.659
2048, 0, 4096, 23, 127, 16, 0.7
2048, 8, 4096, 23, 127, 16, 0.7
256, 1, 64, 23, 127, 16, 0.798
256, 15, 64, 23, 127, 16, 0.873
256, 2, 64, 23, 127, 16, 0.875
256, 30, 64, 23, 127, 16, 0.877
256, 3, 64, 23, 127, 16, 0.875
256, 45, 64, 23, 127, 16, 0.834
256, 4, 64, 23, 127, 16, 0.873
256, 60, 64, 23, 127, 16, 0.809
256, 5, 64, 23, 127, 16, 0.879
256, 75, 64, 23, 127, 16, 0.884
256, 6, 64, 23, 127, 16, 0.874
256, 90, 64, 23, 127, 16, 0.876
256, 7, 64, 23, 127, 16, 0.876
256, 105, 64, 23, 127, 16, 0.827
1, 0, 0, 23, 127, 16, 0.859
2, 0, 1, 23, 127, 16, 0.864
3, 0, 2, 23, 127, 16, 0.871
4, 0, 3, 23, 127, 16, 0.869
5, 0, 4, 23, 127, 16, 0.881
6, 0, 5, 23, 127, 16, 0.869
7, 0, 6, 23, 127, 16, 0.867
8, 0, 7, 23, 127, 16, 0.877
9, 0, 8, 23, 127, 16, 0.862
10, 0, 9, 23, 127, 16, 0.861
11, 0, 10, 23, 127, 16, 0.859
12, 0, 11, 23, 127, 16, 0.858
13, 0, 12, 23, 127, 16, 0.867
14, 0, 13, 23, 127, 16, 0.857
15, 0, 14, 23, 127, 16, 0.858
16, 0, 15, 23, 127, 16, 0.857
17, 0, 16, 23, 127, 16, 0.858
18, 0, 17, 23, 127, 16, 0.867
19, 0, 18, 23, 127, 16, 0.875
20, 0, 19, 23, 127, 16, 0.868
21, 0, 20, 23, 127, 16, 0.861
22, 0, 21, 23, 127, 16, 0.868
23, 0, 22, 23, 127, 16, 0.866
24, 0, 23, 23, 127, 16, 0.858
25, 0, 24, 23, 127, 16, 0.859
26, 0, 25, 23, 127, 16, 0.857
27, 0, 26, 23, 127, 16, 0.866
28, 0, 27, 23, 127, 16, 0.875
29, 0, 28, 23, 127, 16, 0.896
30, 0, 29, 23, 127, 16, 0.889
31, 0, 30, 23, 127, 16, 0.903
32, 0, 31, 23, 127, 16, 0.667
sysdeps/x86_64/multiarch/strrchr-avx2.S | 415 +++++++++++++++---------
1 file changed, 258 insertions(+), 157 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 1df2adfad0..9d1e45defc 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
# ifdef USE_AS_WCSRCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMIN vpminud
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMIN vpminub
+# define CHAR_SIZE 1
# endif
# ifndef VZEROUPPER
@@ -41,196 +45,293 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
- .section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
- movd %esi, %xmm4
- movl %edi, %ecx
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+ movd %esi, %xmm7
+ movl %edi, %eax
/* Broadcast CHAR to YMM4. */
- VPBROADCAST %xmm4, %ymm4
+ VPBROADCAST %xmm7, %ymm7
vpxor %xmm0, %xmm0, %xmm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ /* Shift here instead of `andl` to save code size (saves a fetch
+ block). */
+ sall $20, %eax
+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
+ ja L(cross_page)
+L(page_cross_continue):
vmovdqu (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- addq $VEC_SIZE, %rdi
+ /* Check end of string match. */
+ VPCMPEQ %ymm1, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ /* Only check match with search CHAR if needed. */
+ VPCMPEQ %ymm1, %ymm7, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Check if match before first zero. */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
+ .p2align 4,, 10
+L(first_vec_x1):
+ VPCMPEQ %ymm2, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jnz L(first_vec_x1_return)
+
+ .p2align 4,, 4
+L(first_vec_x0_test):
+ VPCMPEQ %ymm1, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ testl %eax, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
+ addq %r8, %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret1):
+ VZEROUPPER_RETURN
+ .p2align 4,, 10
+L(first_vec_x0_x1_test):
+ VPCMPEQ %ymm2, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
testl %eax, %eax
- jnz L(first_vec)
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsrl %eax, %eax
+ leaq 1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
- testl %ecx, %ecx
- jnz L(return_null)
- andq $-VEC_SIZE, %rdi
- xorl %edx, %edx
- jmp L(aligned_loop)
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMPEQ %ymm3, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x0_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
+
.p2align 4
-L(first_vec):
- /* Check if there is a nul CHAR. */
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+
+ /* Align src. */
+ orq $(VEC_SIZE - 1), %rdi
+ vmovdqu 1(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
testl %ecx, %ecx
- jnz L(char_and_nul_in_first_vec)
+ jnz L(first_vec_x1)
- /* Remember the match and keep searching. */
- movl %eax, %edx
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rdi
- jmp L(aligned_loop)
+ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
+ VPCMPEQ %ymm3, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ addq $(VEC_SIZE + 1), %rdi
+ andq $-(VEC_SIZE * 2), %rdi
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %edx
- vpmovmskb %ymm3, %eax
- shrl %cl, %edx
- shrl %cl, %eax
- addq $VEC_SIZE, %rdi
-
- /* Check if there is a CHAR. */
+L(first_aligned_loop):
+ /* Do 2x VEC at a time. Any more and the cost of finding the
+ match outweighs the loop benefit. */
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
+
+ VPCMPEQ %ymm4, %ymm7, %ymm6
+ VPMIN %ymm4, %ymm5, %ymm8
+ VPCMPEQ %ymm5, %ymm7, %ymm10
+ vpor %ymm6, %ymm10, %ymm5
+ VPCMPEQ %ymm8, %ymm0, %ymm8
+ vpor %ymm5, %ymm8, %ymm9
+
+ vpmovmskb %ymm9, %eax
+ addq $(VEC_SIZE * 2), %rdi
+ /* No zero or search CHAR. */
testl %eax, %eax
- jnz L(found_char)
-
- testl %edx, %edx
- jnz L(return_null)
+ jz L(first_aligned_loop)
- jmp L(aligned_loop)
-
- .p2align 4
-L(found_char):
- testl %edx, %edx
- jnz L(char_and_nul)
+ /* If no zero CHAR then go to second loop (this allows us to
+ throw away all prior work). */
+ vpmovmskb %ymm8, %ecx
+ testl %ecx, %ecx
+ jz L(second_aligned_loop_prep)
- /* Remember the match and keep searching. */
- movl %eax, %edx
- leaq (%rdi, %rcx), %rsi
+ /* Search char could be zero so we need to get the true match.
+ */
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(first_aligned_loop_return)
- .p2align 4
-L(aligned_loop):
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- add $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
+ .p2align 4,, 4
+L(first_vec_x1_or_x2):
+ VPCMPEQ %ymm3, %ymm7, %ymm3
+ VPCMPEQ %ymm2, %ymm7, %ymm2
vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jz L(aligned_loop)
-
- .p2align 4
-L(char_nor_null):
- /* Find a CHAR or a nul CHAR in a loop. */
- testl %eax, %eax
- jnz L(match)
-L(return_value):
- testl %edx, %edx
- jz L(return_null)
- movl %edx, %eax
- movq %rsi, %rdi
+ vpmovmskb %ymm2, %edx
+ /* Use add for macro-fusion. */
+ addq %rax, %rdx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ salq $32, %rax
+ addq %rdx, %rax
+ bsrq %rax, %rax
+ leaq 1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
+ .p2align 4,, 8
+L(first_aligned_loop_return):
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %edx
+ salq $32, %rcx
+ orq %rdx, %rcx
+
+ vpmovmskb %ymm10, %eax
+ vpmovmskb %ymm6, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ blsmskq %rcx, %rcx
+ andq %rcx, %rax
+ jz L(first_vec_x1_or_x2)
+
+ bsrq %rax, %rax
+ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %eax
+ andq $-CHAR_SIZE, %rax
# endif
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
+ VZEROUPPER_RETURN
+ /* Search char cannot be zero. */
.p2align 4
-L(match):
- /* Find a CHAR. Check if there is a nul CHAR. */
- vpmovmskb %ymm2, %ecx
- testl %ecx, %ecx
- jnz L(find_nul)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
+L(second_aligned_loop_set_furthest_match):
+ /* Save VEC and pointer from most recent match. */
+L(second_aligned_loop_prep):
movq %rdi, %rsi
- jmp L(aligned_loop)
+ vmovdqu %ymm6, %ymm2
+ vmovdqu %ymm10, %ymm3
.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %ecx
- andl $0x11111111, %eax
-# endif
- /* Mask out any matching bits after the nul CHAR. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
+L(second_aligned_loop):
+ /* Search 2x at a time. */
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
+
+ VPCMPEQ %ymm4, %ymm7, %ymm6
+ VPMIN %ymm4, %ymm5, %ymm1
+ VPCMPEQ %ymm5, %ymm7, %ymm10
+ vpor %ymm6, %ymm10, %ymm5
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpor %ymm5, %ymm1, %ymm9
+
+ vpmovmskb %ymm9, %eax
+ addq $(VEC_SIZE * 2), %rdi
testl %eax, %eax
- /* If there is no CHAR here, return the remembered one. */
- jz L(return_value)
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
- VZEROUPPER_RETURN
-
- .p2align 4
-L(char_and_nul):
- /* Find both a CHAR and a nul CHAR. */
- addq %rcx, %rdi
- movl %edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %ecx
- andl $0x11111111, %eax
-# endif
- /* Mask out any matching bits after the nul CHAR. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
+ jz L(second_aligned_loop)
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jz L(second_aligned_loop_set_furthest_match)
+ vpmovmskb %ymm5, %eax
testl %eax, %eax
- /* Return null pointer if the nul CHAR comes first. */
- jz L(return_null)
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ jnz L(return_new_match)
+
+ /* This is the hot path. We know CHAR is inbounds and that
+ ymm3/ymm2 have latest match. */
+ .p2align 4,, 4
+L(return_old_match):
+ vpmovmskb %ymm3, %eax
+ vpmovmskb %ymm2, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ bsrq %rax, %rax
+ /* Search char cannot be zero so safe to just use lea for
+ wcsrchr. */
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
VZEROUPPER_RETURN
- .p2align 4
-L(return_null):
- xorl %eax, %eax
+ /* Last iteration also potentially has a match. */
+ .p2align 4,, 8
+L(return_new_match):
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %edx
+ salq $32, %rcx
+ orq %rdx, %rcx
+
+ vpmovmskb %ymm10, %eax
+ vpmovmskb %ymm6, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ blsmskq %rcx, %rcx
+ andq %rcx, %rax
+ jz L(return_old_match)
+ bsrq %rax, %rax
+ /* Search char cannot be zero so safe to just use lea for
+ wcsrchr. */
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
VZEROUPPER_RETURN
-END (STRRCHR)
+ .p2align 4,, 4
+L(cross_page):
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ shrxl %edi, %ecx, %ecx
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+ VPCMPEQ %ymm1, %ymm7, %ymm1
+ vpmovmskb %ymm1, %eax
+ shrxl %edi, %eax, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret2)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret2):
+ VZEROUPPER_RETURN
+END(STRRCHR)
#endif
--
2.25.1
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v1 5/5] x86: Optimize {str|wcs}rchr-evex
2022-04-21 3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
` (2 preceding siblings ...)
2022-04-21 3:14 ` [PATCH v1 4/5] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
@ 2022-04-21 3:14 ` Noah Goldstein
2022-04-21 20:12 ` [PATCH v1 1/5] benchtests: Improve bench-strrchr H.J. Lu
` (3 subsequent siblings)
7 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 3:14 UTC (permalink / raw)
To: libc-alpha
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.755
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
---
Results For: strrchr
Geometric Mean of N=30 runs.
Geometric Mean of all benchmarks New / Old: 0.755
Benchmarks performance on Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
len, align, pos, seek, max_char, freq, New Time / Old Time
2048, 0, 32, 0, 127, 1, 0.669
2048, 1, 32, 0, 127, 1, 0.672
2048, 0, 64, 0, 127, 1, 0.579
2048, 2, 64, 0, 127, 1, 0.579
2048, 0, 128, 0, 127, 1, 0.828
2048, 3, 128, 0, 127, 1, 0.827
2048, 0, 256, 0, 127, 1, 0.693
2048, 4, 256, 0, 127, 1, 0.692
2048, 0, 512, 0, 127, 1, 0.619
2048, 5, 512, 0, 127, 1, 0.622
2048, 0, 1024, 0, 127, 1, 0.626
2048, 6, 1024, 0, 127, 1, 0.627
2048, 0, 2048, 0, 127, 1, 0.85
2048, 7, 2048, 0, 127, 1, 0.855
2048, 0, 4096, 0, 127, 1, 0.849
2048, 8, 4096, 0, 127, 1, 0.848
256, 1, 64, 0, 127, 1, 0.579
256, 15, 64, 0, 127, 1, 0.579
256, 2, 64, 0, 127, 1, 0.579
256, 30, 64, 0, 127, 1, 0.579
256, 3, 64, 0, 127, 1, 0.579
256, 45, 64, 0, 127, 1, 0.551
256, 4, 64, 0, 127, 1, 0.579
256, 60, 64, 0, 127, 1, 0.553
256, 5, 64, 0, 127, 1, 0.579
256, 75, 64, 0, 127, 1, 0.578
256, 6, 64, 0, 127, 1, 0.578
256, 90, 64, 0, 127, 1, 0.579
256, 7, 64, 0, 127, 1, 0.579
256, 105, 64, 0, 127, 1, 0.55
1, 0, 0, 0, 127, 1, 0.795
2, 0, 1, 0, 127, 1, 0.797
3, 0, 2, 0, 127, 1, 0.796
4, 0, 3, 0, 127, 1, 0.792
5, 0, 4, 0, 127, 1, 0.789
6, 0, 5, 0, 127, 1, 0.791
7, 0, 6, 0, 127, 1, 0.793
8, 0, 7, 0, 127, 1, 0.789
9, 0, 8, 0, 127, 1, 0.797
10, 0, 9, 0, 127, 1, 0.788
11, 0, 10, 0, 127, 1, 0.796
12, 0, 11, 0, 127, 1, 0.793
13, 0, 12, 0, 127, 1, 0.797
14, 0, 13, 0, 127, 1, 0.795
15, 0, 14, 0, 127, 1, 0.795
16, 0, 15, 0, 127, 1, 0.791
17, 0, 16, 0, 127, 1, 0.798
18, 0, 17, 0, 127, 1, 0.8
19, 0, 18, 0, 127, 1, 0.797
20, 0, 19, 0, 127, 1, 0.798
21, 0, 20, 0, 127, 1, 0.797
22, 0, 21, 0, 127, 1, 0.796
23, 0, 22, 0, 127, 1, 0.792
24, 0, 23, 0, 127, 1, 0.791
25, 0, 24, 0, 127, 1, 0.794
26, 0, 25, 0, 127, 1, 0.797
27, 0, 26, 0, 127, 1, 0.793
28, 0, 27, 0, 127, 1, 0.79
29, 0, 28, 0, 127, 1, 0.79
30, 0, 29, 0, 127, 1, 0.791
31, 0, 30, 0, 127, 1, 0.791
32, 0, 31, 0, 127, 1, 0.79
2048, 0, 32, 23, 127, 1, 0.734
2048, 1, 32, 23, 127, 1, 0.748
2048, 0, 64, 23, 127, 1, 0.759
2048, 2, 64, 23, 127, 1, 0.753
2048, 0, 128, 23, 127, 1, 0.834
2048, 3, 128, 23, 127, 1, 0.835
2048, 0, 256, 23, 127, 1, 0.789
2048, 4, 256, 23, 127, 1, 0.791
2048, 0, 512, 23, 127, 1, 0.882
2048, 5, 512, 23, 127, 1, 0.861
2048, 0, 1024, 23, 127, 1, 0.643
2048, 6, 1024, 23, 127, 1, 0.643
2048, 0, 2048, 23, 127, 1, 0.931
2048, 7, 2048, 23, 127, 1, 0.929
2048, 0, 4096, 23, 127, 1, 0.922
2048, 8, 4096, 23, 127, 1, 0.934
256, 1, 64, 23, 127, 1, 0.73
256, 15, 64, 23, 127, 1, 0.729
256, 2, 64, 23, 127, 1, 0.725
256, 30, 64, 23, 127, 1, 0.728
256, 3, 64, 23, 127, 1, 0.727
256, 45, 64, 23, 127, 1, 0.749
256, 4, 64, 23, 127, 1, 0.73
256, 60, 64, 23, 127, 1, 0.752
256, 5, 64, 23, 127, 1, 0.729
256, 75, 64, 23, 127, 1, 0.727
256, 6, 64, 23, 127, 1, 0.693
256, 90, 64, 23, 127, 1, 0.73
256, 7, 64, 23, 127, 1, 0.73
256, 105, 64, 23, 127, 1, 0.751
1, 0, 0, 23, 127, 1, 0.797
2, 0, 1, 23, 127, 1, 0.794
3, 0, 2, 23, 127, 1, 0.797
4, 0, 3, 23, 127, 1, 0.792
5, 0, 4, 23, 127, 1, 0.781
6, 0, 5, 23, 127, 1, 0.783
7, 0, 6, 23, 127, 1, 0.79
8, 0, 7, 23, 127, 1, 0.791
9, 0, 8, 23, 127, 1, 0.794
10, 0, 9, 23, 127, 1, 0.795
11, 0, 10, 23, 127, 1, 0.795
12, 0, 11, 23, 127, 1, 0.795
13, 0, 12, 23, 127, 1, 0.794
14, 0, 13, 23, 127, 1, 0.792
15, 0, 14, 23, 127, 1, 0.79
16, 0, 15, 23, 127, 1, 0.793
17, 0, 16, 23, 127, 1, 0.795
18, 0, 17, 23, 127, 1, 0.797
19, 0, 18, 23, 127, 1, 0.796
20, 0, 19, 23, 127, 1, 0.796
21, 0, 20, 23, 127, 1, 0.794
22, 0, 21, 23, 127, 1, 0.794
23, 0, 22, 23, 127, 1, 0.793
24, 0, 23, 23, 127, 1, 0.792
25, 0, 24, 23, 127, 1, 0.795
26, 0, 25, 23, 127, 1, 0.792
27, 0, 26, 23, 127, 1, 0.789
28, 0, 27, 23, 127, 1, 0.794
29, 0, 28, 23, 127, 1, 0.793
30, 0, 29, 23, 127, 1, 0.795
31, 0, 30, 23, 127, 1, 0.797
32, 0, 31, 23, 127, 1, 0.775
2048, 0, 32, 23, 127, 2, 0.736
2048, 1, 32, 23, 127, 2, 0.738
2048, 0, 64, 23, 127, 2, 0.895
2048, 2, 64, 23, 127, 2, 0.897
2048, 0, 128, 23, 127, 2, 0.852
2048, 3, 128, 23, 127, 2, 0.845
2048, 0, 256, 23, 127, 2, 0.755
2048, 4, 256, 23, 127, 2, 0.712
2048, 0, 512, 23, 127, 2, 0.857
2048, 5, 512, 23, 127, 2, 0.849
2048, 0, 1024, 23, 127, 2, 0.626
2048, 6, 1024, 23, 127, 2, 0.661
2048, 0, 2048, 23, 127, 2, 0.67
2048, 7, 2048, 23, 127, 2, 0.67
2048, 0, 4096, 23, 127, 2, 0.928
2048, 8, 4096, 23, 127, 2, 0.935
256, 1, 64, 23, 127, 2, 0.693
256, 15, 64, 23, 127, 2, 0.692
256, 2, 64, 23, 127, 2, 0.693
256, 30, 64, 23, 127, 2, 0.692
256, 3, 64, 23, 127, 2, 0.692
256, 45, 64, 23, 127, 2, 0.701
256, 4, 64, 23, 127, 2, 0.692
256, 60, 64, 23, 127, 2, 0.701
256, 5, 64, 23, 127, 2, 0.69
256, 75, 64, 23, 127, 2, 0.693
256, 6, 64, 23, 127, 2, 0.691
256, 90, 64, 23, 127, 2, 0.692
256, 7, 64, 23, 127, 2, 0.693
256, 105, 64, 23, 127, 2, 0.701
1, 0, 0, 23, 127, 2, 0.797
2, 0, 1, 23, 127, 2, 0.787
3, 0, 2, 23, 127, 2, 0.797
4, 0, 3, 23, 127, 2, 0.793
5, 0, 4, 23, 127, 2, 0.792
6, 0, 5, 23, 127, 2, 0.795
7, 0, 6, 23, 127, 2, 0.791
8, 0, 7, 23, 127, 2, 0.792
9, 0, 8, 23, 127, 2, 0.796
10, 0, 9, 23, 127, 2, 0.797
11, 0, 10, 23, 127, 2, 0.797
12, 0, 11, 23, 127, 2, 0.798
13, 0, 12, 23, 127, 2, 0.799
14, 0, 13, 23, 127, 2, 0.796
15, 0, 14, 23, 127, 2, 0.796
16, 0, 15, 23, 127, 2, 0.794
17, 0, 16, 23, 127, 2, 0.795
18, 0, 17, 23, 127, 2, 0.797
19, 0, 18, 23, 127, 2, 0.793
20, 0, 19, 23, 127, 2, 0.795
21, 0, 20, 23, 127, 2, 0.794
22, 0, 21, 23, 127, 2, 0.794
23, 0, 22, 23, 127, 2, 0.796
24, 0, 23, 23, 127, 2, 0.794
25, 0, 24, 23, 127, 2, 0.794
26, 0, 25, 23, 127, 2, 0.794
27, 0, 26, 23, 127, 2, 0.788
28, 0, 27, 23, 127, 2, 0.791
29, 0, 28, 23, 127, 2, 0.791
30, 0, 29, 23, 127, 2, 0.793
31, 0, 30, 23, 127, 2, 0.796
32, 0, 31, 23, 127, 2, 0.628
2048, 0, 32, 23, 127, 4, 0.742
2048, 1, 32, 23, 127, 4, 0.742
2048, 0, 64, 23, 127, 4, 0.899
2048, 2, 64, 23, 127, 4, 0.912
2048, 0, 128, 23, 127, 4, 0.783
2048, 3, 128, 23, 127, 4, 0.815
2048, 0, 256, 23, 127, 4, 0.854
2048, 4, 256, 23, 127, 4, 0.858
2048, 0, 512, 23, 127, 4, 0.907
2048, 5, 512, 23, 127, 4, 0.873
2048, 0, 1024, 23, 127, 4, 0.657
2048, 6, 1024, 23, 127, 4, 0.653
2048, 0, 2048, 23, 127, 4, 0.666
2048, 7, 2048, 23, 127, 4, 0.667
2048, 0, 4096, 23, 127, 4, 0.67
2048, 8, 4096, 23, 127, 4, 0.67
256, 1, 64, 23, 127, 4, 0.686
256, 15, 64, 23, 127, 4, 0.687
256, 2, 64, 23, 127, 4, 0.687
256, 30, 64, 23, 127, 4, 0.687
256, 3, 64, 23, 127, 4, 0.687
256, 45, 64, 23, 127, 4, 0.672
256, 4, 64, 23, 127, 4, 0.687
256, 60, 64, 23, 127, 4, 0.701
256, 5, 64, 23, 127, 4, 0.687
256, 75, 64, 23, 127, 4, 0.686
256, 6, 64, 23, 127, 4, 0.687
256, 90, 64, 23, 127, 4, 0.686
256, 7, 64, 23, 127, 4, 0.69
256, 105, 64, 23, 127, 4, 0.672
1, 0, 0, 23, 127, 4, 0.798
2, 0, 1, 23, 127, 4, 0.791
3, 0, 2, 23, 127, 4, 0.792
4, 0, 3, 23, 127, 4, 0.795
5, 0, 4, 23, 127, 4, 0.791
6, 0, 5, 23, 127, 4, 0.793
7, 0, 6, 23, 127, 4, 0.78
8, 0, 7, 23, 127, 4, 0.791
9, 0, 8, 23, 127, 4, 0.788
10, 0, 9, 23, 127, 4, 0.798
11, 0, 10, 23, 127, 4, 0.796
12, 0, 11, 23, 127, 4, 0.794
13, 0, 12, 23, 127, 4, 0.795
14, 0, 13, 23, 127, 4, 0.793
15, 0, 14, 23, 127, 4, 0.8
16, 0, 15, 23, 127, 4, 0.796
17, 0, 16, 23, 127, 4, 0.796
18, 0, 17, 23, 127, 4, 0.796
19, 0, 18, 23, 127, 4, 0.798
20, 0, 19, 23, 127, 4, 0.796
21, 0, 20, 23, 127, 4, 0.796
22, 0, 21, 23, 127, 4, 0.796
23, 0, 22, 23, 127, 4, 0.801
24, 0, 23, 23, 127, 4, 0.799
25, 0, 24, 23, 127, 4, 0.795
26, 0, 25, 23, 127, 4, 0.793
27, 0, 26, 23, 127, 4, 0.796
28, 0, 27, 23, 127, 4, 0.794
29, 0, 28, 23, 127, 4, 0.798
30, 0, 29, 23, 127, 4, 0.795
31, 0, 30, 23, 127, 4, 0.797
32, 0, 31, 23, 127, 4, 0.628
2048, 0, 32, 23, 127, 8, 0.738
2048, 1, 32, 23, 127, 8, 0.747
2048, 0, 64, 23, 127, 8, 0.905
2048, 2, 64, 23, 127, 8, 0.906
2048, 0, 128, 23, 127, 8, 0.822
2048, 3, 128, 23, 127, 8, 0.827
2048, 0, 256, 23, 127, 8, 0.825
2048, 4, 256, 23, 127, 8, 0.825
2048, 0, 512, 23, 127, 8, 0.851
2048, 5, 512, 23, 127, 8, 0.855
2048, 0, 1024, 23, 127, 8, 0.653
2048, 6, 1024, 23, 127, 8, 0.651
2048, 0, 2048, 23, 127, 8, 0.644
2048, 7, 2048, 23, 127, 8, 0.643
2048, 0, 4096, 23, 127, 8, 0.67
2048, 8, 4096, 23, 127, 8, 0.67
256, 1, 64, 23, 127, 8, 0.686
256, 15, 64, 23, 127, 8, 0.686
256, 2, 64, 23, 127, 8, 0.686
256, 30, 64, 23, 127, 8, 0.687
256, 3, 64, 23, 127, 8, 0.686
256, 45, 64, 23, 127, 8, 0.671
256, 4, 64, 23, 127, 8, 0.69
256, 60, 64, 23, 127, 8, 0.705
256, 5, 64, 23, 127, 8, 0.688
256, 75, 64, 23, 127, 8, 0.687
256, 6, 64, 23, 127, 8, 0.692
256, 90, 64, 23, 127, 8, 0.689
256, 7, 64, 23, 127, 8, 0.69
256, 105, 64, 23, 127, 8, 0.674
1, 0, 0, 23, 127, 8, 0.798
2, 0, 1, 23, 127, 8, 0.798
3, 0, 2, 23, 127, 8, 0.797
4, 0, 3, 23, 127, 8, 0.792
5, 0, 4, 23, 127, 8, 0.795
6, 0, 5, 23, 127, 8, 0.792
7, 0, 6, 23, 127, 8, 0.792
8, 0, 7, 23, 127, 8, 0.795
9, 0, 8, 23, 127, 8, 0.799
10, 0, 9, 23, 127, 8, 0.798
11, 0, 10, 23, 127, 8, 0.795
12, 0, 11, 23, 127, 8, 0.795
13, 0, 12, 23, 127, 8, 0.797
14, 0, 13, 23, 127, 8, 0.796
15, 0, 14, 23, 127, 8, 0.795
16, 0, 15, 23, 127, 8, 0.796
17, 0, 16, 23, 127, 8, 0.798
18, 0, 17, 23, 127, 8, 0.798
19, 0, 18, 23, 127, 8, 0.795
20, 0, 19, 23, 127, 8, 0.797
21, 0, 20, 23, 127, 8, 0.797
22, 0, 21, 23, 127, 8, 0.793
23, 0, 22, 23, 127, 8, 0.797
24, 0, 23, 23, 127, 8, 0.8
25, 0, 24, 23, 127, 8, 0.796
26, 0, 25, 23, 127, 8, 0.796
27, 0, 26, 23, 127, 8, 0.791
28, 0, 27, 23, 127, 8, 0.795
29, 0, 28, 23, 127, 8, 0.786
30, 0, 29, 23, 127, 8, 0.797
31, 0, 30, 23, 127, 8, 0.791
32, 0, 31, 23, 127, 8, 0.628
2048, 0, 32, 23, 127, 16, 0.736
2048, 1, 32, 23, 127, 16, 0.737
2048, 0, 64, 23, 127, 16, 0.905
2048, 2, 64, 23, 127, 16, 0.908
2048, 0, 128, 23, 127, 16, 0.829
2048, 3, 128, 23, 127, 16, 0.824
2048, 0, 256, 23, 127, 16, 0.827
2048, 4, 256, 23, 127, 16, 0.825
2048, 0, 512, 23, 127, 16, 0.694
2048, 5, 512, 23, 127, 16, 0.687
2048, 0, 1024, 23, 127, 16, 0.568
2048, 6, 1024, 23, 127, 16, 0.667
2048, 0, 2048, 23, 127, 16, 0.766
2048, 7, 2048, 23, 127, 16, 0.781
2048, 0, 4096, 23, 127, 16, 0.646
2048, 8, 4096, 23, 127, 16, 0.646
256, 1, 64, 23, 127, 16, 0.697
256, 15, 64, 23, 127, 16, 0.686
256, 2, 64, 23, 127, 16, 0.687
256, 30, 64, 23, 127, 16, 0.687
256, 3, 64, 23, 127, 16, 0.686
256, 45, 64, 23, 127, 16, 0.672
256, 4, 64, 23, 127, 16, 0.686
256, 60, 64, 23, 127, 16, 0.701
256, 5, 64, 23, 127, 16, 0.686
256, 75, 64, 23, 127, 16, 0.686
256, 6, 64, 23, 127, 16, 0.691
256, 90, 64, 23, 127, 16, 0.687
256, 7, 64, 23, 127, 16, 0.688
256, 105, 64, 23, 127, 16, 0.674
1, 0, 0, 23, 127, 16, 0.797
2, 0, 1, 23, 127, 16, 0.798
3, 0, 2, 23, 127, 16, 0.786
4, 0, 3, 23, 127, 16, 0.792
5, 0, 4, 23, 127, 16, 0.792
6, 0, 5, 23, 127, 16, 0.795
7, 0, 6, 23, 127, 16, 0.796
8, 0, 7, 23, 127, 16, 0.798
9, 0, 8, 23, 127, 16, 0.795
10, 0, 9, 23, 127, 16, 0.797
11, 0, 10, 23, 127, 16, 0.797
12, 0, 11, 23, 127, 16, 0.797
13, 0, 12, 23, 127, 16, 0.799
14, 0, 13, 23, 127, 16, 0.798
15, 0, 14, 23, 127, 16, 0.798
16, 0, 15, 23, 127, 16, 0.796
17, 0, 16, 23, 127, 16, 0.798
18, 0, 17, 23, 127, 16, 0.796
19, 0, 18, 23, 127, 16, 0.797
20, 0, 19, 23, 127, 16, 0.797
21, 0, 20, 23, 127, 16, 0.798
22, 0, 21, 23, 127, 16, 0.797
23, 0, 22, 23, 127, 16, 0.797
24, 0, 23, 23, 127, 16, 0.797
25, 0, 24, 23, 127, 16, 0.798
26, 0, 25, 23, 127, 16, 0.794
27, 0, 26, 23, 127, 16, 0.796
28, 0, 27, 23, 127, 16, 0.796
29, 0, 28, 23, 127, 16, 0.792
30, 0, 29, 23, 127, 16, 0.788
31, 0, 30, 23, 127, 16, 0.79
32, 0, 31, 23, 127, 16, 0.628
sysdeps/x86_64/multiarch/strrchr-evex.S | 441 ++++++++++++++----------
1 file changed, 259 insertions(+), 182 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index adeddaed32..5cf9a8315b 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,319 @@
# define STRRCHR __strrchr_evex
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
# ifdef USE_AS_WCSRCHR
+# define SHIFT_REG esi
+
+# define kunpck kunpckbw
+# define kmov_2x kmovd
+# define maskz_2x ecx
+# define maskm_2x eax
+# define CHAR_SIZE 4
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
-# define SHIFT_REG r8d
+# define VPCMP vpcmpd
# else
+# define SHIFT_REG edi
+
+# define kunpck kunpckdq
+# define kmov_2x kmovq
+# define maskz_2x rcx
+# define maskm_2x rax
+
+# define CHAR_SIZE 1
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
-# define SHIFT_REG ecx
+# define VPCMP vpcmpb
# endif
# define XMMZERO xmm16
# define YMMZERO ymm16
# define YMMMATCH ymm17
-# define YMM1 ymm18
+# define YMMSAVE ymm18
+
+# define YMM1 ymm19
+# define YMM2 ymm20
+# define YMM3 ymm21
+# define YMM4 ymm22
+# define YMM5 ymm23
+# define YMM6 ymm24
+# define YMM7 ymm25
+# define YMM8 ymm26
-# define VEC_SIZE 32
- .section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
- movl %edi, %ecx
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ .section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+ movl %edi, %eax
/* Broadcast CHAR to YMMMATCH. */
VPBROADCAST %esi, %YMMMATCH
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
-
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jg L(cross_page_boundary)
+L(page_cross_continue):
VMOVU (%rdi), %YMM1
-
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ VPTESTN %YMM1, %YMM1, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
-
- addq $VEC_SIZE, %rdi
-
- testl %eax, %eax
- jnz L(first_vec)
-
testl %ecx, %ecx
- jnz L(return_null)
-
- andq $-VEC_SIZE, %rdi
- xorl %edx, %edx
- jmp L(aligned_loop)
-
- .p2align 4
-L(first_vec):
- /* Check if there is a null byte. */
- testl %ecx, %ecx
- jnz L(char_and_nul_in_first_vec)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rdi
- jmp L(aligned_loop)
-
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
-
+ jz L(aligned_more)
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k1, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
- bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
# endif
+L(ret0):
+ ret
- VMOVA (%rdi), %YMM1
-
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
+ /* Returns for first vec x1/x2/x3 have hard coded backward
+ search path for earlier matches. */
+ .p2align 4,, 6
+L(first_vec_x1):
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
+ kmovd %k1, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jnz L(first_vec_x1_return)
+ .p2align 4,, 4
+L(first_vec_x0_test):
VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %edx
kmovd %k1, %eax
-
- shrxl %SHIFT_REG, %edx, %edx
- shrxl %SHIFT_REG, %eax, %eax
- addq $VEC_SIZE, %rdi
-
- /* Check if there is a CHAR. */
testl %eax, %eax
- jnz L(found_char)
-
- testl %edx, %edx
- jnz L(return_null)
-
- jmp L(aligned_loop)
-
- .p2align 4
-L(found_char):
- testl %edx, %edx
- jnz L(char_and_nul)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
- leaq (%rdi, %rcx), %rsi
+ jz L(ret1)
+ bsrl %eax, %eax
+# ifdef USE_AS_WCSRCHR
+ leaq (%rsi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rsi, %rax
+# endif
+L(ret1):
+ ret
- .p2align 4
-L(aligned_loop):
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
+ .p2align 4,, 10
+L(first_vec_x1_or_x2):
+ VPCMP $0, %YMM3, %YMMMATCH, %k3
+ VPCMP $0, %YMM2, %YMMMATCH, %k2
+ kortestd %k2, %k3
+ jz L(first_vec_x0_test)
+
+ kunpck %k2, %k3, %k3
+ kmovq %k3, %rax
+ bsrq %rax, %rax
+ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %ecx
+ .p2align 4,, 6
+L(first_vec_x3):
+ VPCMP $0, %YMMMATCH, %YMM4, %k1
kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x1_or_x2)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VMOVA (%rdi), %YMM1
- add $VEC_SIZE, %rdi
+ .p2align 4,, 6
+L(first_vec_x0_x1_test):
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %ecx
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMP $0, %YMMMATCH, %YMM3, %k1
kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x0_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ .p2align 4
+L(aligned_more):
+ /* Need to keep original pointer in case YMM1 has last match. */
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rdi
+ VMOVU VEC_SIZE(%rdi), %YMM2
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
+ VPTESTN %YMM3, %YMM3, %k0
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
+ VPTESTN %YMM4, %YMM4, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
- orl %eax, %ecx
- jz L(aligned_loop)
+ movq %rdi, %r8
+ testl %ecx, %ecx
+ jnz L(first_vec_x3)
+ andq $-(VEC_SIZE * 2), %rdi
.p2align 4
-L(char_nor_null):
- /* Find a CHAR or a null byte in a loop. */
+L(first_aligned_loop):
+ /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
+ they don't store a match. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
+ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6
+
+ VPCMP $0, %YMM5, %YMMMATCH, %k2
+ vpxord %YMM6, %YMMMATCH, %YMM7
+
+ VPMIN %YMM5, %YMM6, %YMM8
+ VPMIN %YMM8, %YMM7, %YMM7
+
+ VPTESTN %YMM7, %YMM7, %k1
+ subq $(VEC_SIZE * -2), %rdi
+ kortestd %k1, %k2
+ jz L(first_aligned_loop)
+
+ VPCMP $0, %YMM6, %YMMMATCH, %k3
+ VPTESTN %YMM8, %YMM8, %k1
+ ktestd %k1, %k1
+ jz L(second_aligned_loop_prep)
+
+ kortestd %k2, %k3
+ jnz L(return_first_aligned_loop)
+
+ .p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+ VPCMP $0, %YMM4, %YMMMATCH, %k4
+ kmovd %k4, %eax
testl %eax, %eax
- jnz L(match)
-L(return_value):
- testl %edx, %edx
- jz L(return_null)
- movl %edx, %eax
- movq %rsi, %rdi
+ jz L(first_vec_x1_or_x2)
bsrl %eax, %eax
-# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
-# endif
+ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
ret
- .p2align 4
-L(match):
- /* Find a CHAR. Check if there is a null byte. */
- kmovd %k0, %ecx
- testl %ecx, %ecx
- jnz L(find_nul)
+ .p2align 4,, 8
+L(return_first_aligned_loop):
+ VPTESTN %YMM5, %YMM5, %k0
+ kunpck %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(first_vec_x1_or_x2_or_x3)
+
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Remember the match and keep searching. */
- movl %eax, %edx
+ .p2align 4
+ /* We can throw away the work done for the first 4x checks here
+ as we have a later match. This is the 'fast' path, per se.
+ */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
movq %rdi, %rsi
- jmp L(aligned_loop)
+ kunpck %k2, %k3, %k4
.p2align 4
-L(find_nul):
- /* Mask out any matching bits after the null byte. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
- testl %eax, %eax
- /* If there is no CHAR here, return the remembered one. */
- jz L(return_value)
- bsrl %eax, %eax
+L(second_aligned_loop):
+ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
+ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2
+
+ VPCMP $0, %YMM1, %YMMMATCH, %k2
+ vpxord %YMM2, %YMMMATCH, %YMM3
+
+ VPMIN %YMM1, %YMM2, %YMM4
+ VPMIN %YMM3, %YMM4, %YMM3
+
+ VPTESTN %YMM3, %YMM3, %k1
+ subq $(VEC_SIZE * -2), %rdi
+ kortestd %k1, %k2
+ jz L(second_aligned_loop)
+
+ VPCMP $0, %YMM2, %YMMMATCH, %k3
+ VPTESTN %YMM4, %YMM4, %k1
+ ktestd %k1, %k1
+ jz L(second_aligned_loop_set_furthest_match)
+
+ kortestd %k2, %k3
+ /* branch here because there is a significant advantage in terms
+ of output dependency chain in using edx. */
+ jnz L(return_new_match)
+L(return_old_match):
+ kmovq %k4, %rax
+ bsrq %rax, %rax
+ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+ ret
+
+L(return_new_match):
+ VPTESTN %YMM1, %YMM1, %k0
+ kunpck %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(return_old_match)
+
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ /* This block is horribly aligned (% 16 == 15). This is
+ intentional. The L(cross_page_boundary) block is exactly
+ 32-bytes of code size. Ultimately this is a cold case so
+ save the code size by leaving misaligned. */
+L(cross_page_boundary):
+ xorq %rdi, %rax
+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+ VPTESTN %YMM1, %YMM1, %k0
+ kmovd %k0, %ecx
# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ movl %edi, %esi
+ andl $(VEC_SIZE - 1), %esi
+ shrl $2, %esi
# endif
- ret
+ shrxl %SHIFT_REG, %ecx, %ecx
- .p2align 4
-L(char_and_nul):
- /* Find both a CHAR and a null byte. */
- addq %rcx, %rdi
- movl %edx, %ecx
-L(char_and_nul_in_first_vec):
- /* Mask out any matching bits after the null byte. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
- testl %eax, %eax
- /* Return null pointer if the null byte comes first. */
- jz L(return_null)
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k1, %eax
+ shrxl %SHIFT_REG, %eax, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret3)
bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ addq %rdi, %rax
# endif
+L(ret3):
ret
- .p2align 4
-L(return_null):
- xorl %eax, %eax
- ret
-
-END (STRRCHR)
+END(STRRCHR)
#endif
--
2.25.1
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v1 1/5] benchtests: Improve bench-strrchr
2022-04-21 3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
` (3 preceding siblings ...)
2022-04-21 3:14 ` [PATCH v1 5/5] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
@ 2022-04-21 20:12 ` H.J. Lu
2022-04-21 22:07 ` Noah Goldstein
2022-04-21 22:22 ` [PATCH v2 1/4] " Noah Goldstein
` (2 subsequent siblings)
7 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-21 20:12 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1. Use json-lib for printing results.
> 2. Expose all parameters (before pos, seek_char, and max_char were
> not printed).
> 3. Add benchmarks that test multiple occurrences of seek_char in the
> string.
> ---
> benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
> 1 file changed, 82 insertions(+), 44 deletions(-)
>
> diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> index abdae60c51..cceea77e1b 100644
> --- a/benchtests/bench-strrchr.c
> +++ b/benchtests/bench-strrchr.c
> @@ -23,6 +23,7 @@
> # define TEST_NAME "strrchr"
> #endif
> #include "bench-string.h"
> +#include "json-lib.h"
>
> #define BIG_CHAR MAX_CHAR
>
> @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
> }
>
> static void
> -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> + CHAR *exp_res)
> {
> CHAR *res = CALL (impl, s, c);
> size_t i, iters = INNER_LOOP_ITERS8;
> @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
>
> if (res != exp_res)
> {
> - error (0, 0, "Wrong result in function %s %p %p", impl->name,
> - res, exp_res);
> + error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
> + exp_res);
These changes aren't needed.
> ret = 1;
> return;
> }
> @@ -72,24 +74,28 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> {
> CALL (impl, s, c);
> }
> - TIMING_NOW (stop);
>
> + TIMING_NOW (stop);
Not needed.
> TIMING_DIFF (cur, start, stop);
>
> - TIMING_PRINT_MEAN ((double) cur, (double) iters);
> + json_element_double (json_ctx, (double) cur / (double) iters);
> + return;
Return isn't needed.
> }
>
> static void
> -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> + int seek_char, int max_char, size_t freq)
> /* For wcsrchr: align here means align not in bytes,
> but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
> {
> size_t i;
> + size_t pos_chunk_sz = freq ? (pos / freq) : pos;
> + size_t last_pos = len;
> CHAR *result;
> CHAR *buf = (CHAR *) buf1;
>
> - align &= 7;
> + align &= (getpagesize () - 1);
If we have such large alignments, the tests may be skipped.
Should we change it to 127 instead?
> if ((align + len) * sizeof (CHAR) >= page_size)
> return;
>
> @@ -103,6 +109,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> if ((i > pos || pos >= len) && buf[align + i] == seek_char)
> buf[align + i] = seek_char + 10 + (random () & 15);
> }
> +
> + if (pos_chunk_sz == 0 && pos)
> + pos_chunk_sz = 1;
> +
> + for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
> + {
> + buf[align + i] = seek_char;
> + last_pos = i;
> + }
> +
> buf[align + len] = 0;
>
> if (pos < len)
> @@ -110,66 +126,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> buf[align + pos] = seek_char;
> result = (CHAR *) (buf + align + pos);
> }
> + else if (last_pos < len)
> + result = (CHAR *) (buf + align + last_pos);
> else if (seek_char == 0)
> result = (CHAR *) (buf + align + len);
> else
> result = NULL;
>
> - printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
> + json_element_object_begin (json_ctx);
> + json_attr_uint (json_ctx, "len", len);
> + json_attr_uint (json_ctx, "pos", pos);
> + json_attr_uint (json_ctx, "align", align);
> + json_attr_uint (json_ctx, "freq", freq);
> + json_attr_uint (json_ctx, "seek", seek_char);
> + json_attr_uint (json_ctx, "max_char", max_char);
> + json_array_begin (json_ctx, "timings");
>
> FOR_EACH_IMPL (impl, 0)
> - do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
> + do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
>
> - putchar ('\n');
> + json_array_end (json_ctx);
> + json_element_object_end (json_ctx);
> }
>
> int
> test_main (void)
> {
> - size_t i;
> + json_ctx_t json_ctx;
> + size_t i, j;
> + int seek;
>
> test_init ();
> + json_init (&json_ctx, 0, stdout);
>
> - printf ("%20s", "");
> - FOR_EACH_IMPL (impl, 0)
> - printf ("\t%s", impl->name);
> - putchar ('\n');
> + json_document_begin (&json_ctx);
> + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
>
> - for (i = 1; i < 8; ++i)
> - {
> - do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
> - do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
> - }
> + json_attr_object_begin (&json_ctx, "functions");
> + json_attr_object_begin (&json_ctx, TEST_NAME);
> + json_attr_string (&json_ctx, "bench-variant", "");
>
> - for (i = 1; i < 8; ++i)
> - {
> - do_test (i, 64, 256, 23, SMALL_CHAR);
> - do_test (i, 64, 256, 23, BIG_CHAR);
> - }
> -
> - for (i = 0; i < 32; ++i)
> - {
> - do_test (0, i, i + 1, 23, SMALL_CHAR);
> - do_test (0, i, i + 1, 23, BIG_CHAR);
> - }
> + json_array_begin (&json_ctx, "ifuncs");
> + FOR_EACH_IMPL (impl, 0)
> + json_element_string (&json_ctx, impl->name);
> + json_array_end (&json_ctx);
>
> - for (i = 1; i < 8; ++i)
> - {
> - do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
> - do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
> - }
> + json_array_begin (&json_ctx, "results");
>
> - for (i = 1; i < 8; ++i)
> + for (seek = 0; seek <= 23; seek += 23)
> {
> - do_test (i, 64, 256, 0, SMALL_CHAR);
> - do_test (i, 64, 256, 0, BIG_CHAR);
> + for (j = 1; j < 32; j += j)
> + {
> + for (i = 1; i < 9; ++i)
> + {
> + do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
> + do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
> + }
> +
> + for (i = 1; i < 8; ++i)
> + {
> + do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
> + do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
> +
> + do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
> + do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
> + }
> +
> + for (i = 0; i < 32; ++i)
> + {
> + do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
> + do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
> + }
> + if (seek == 0)
> + {
> + break;
> + }
> + }
> }
>
> - for (i = 0; i < 32; ++i)
> - {
> - do_test (0, i, i + 1, 0, SMALL_CHAR);
> - do_test (0, i, i + 1, 0, BIG_CHAR);
> - }
> + json_array_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_document_end (&json_ctx);
>
> return ret;
> }
> --
> 2.25.1
>
--
H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2
2022-04-21 3:14 ` [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
@ 2022-04-21 20:26 ` H.J. Lu
2022-04-21 20:57 ` Noah Goldstein
0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-21 20:26 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code unrolls the main loop slightly without adding too much
> overhead and minimizes the comparisons for the search CHAR.
>
> Geometric Mean of all benchmarks New / Old: 0.741
> See email for all results.
>
> Full xcheck passes on x86_64 with and without multiarch enabled.
> ---
> Results For: strrchr
>
> Geometric Mean of N=30 runs.
>
> Geometric Mean of all benchmarks New / Old: 0.741
> Benchmarks performance on Tigerlake:
> https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
>
> len, align, pos, seek, max_char, freq, New Time / Old Time
> 2048, 0, 32, 0, 127, 1, 0.647
> 2048, 1, 32, 0, 127, 1, 0.621
> 2048, 0, 64, 0, 127, 1, 0.661
> 2048, 2, 64, 0, 127, 1, 0.655
> 2048, 0, 128, 0, 127, 1, 0.69
> 2048, 3, 128, 0, 127, 1, 0.689
> 2048, 0, 256, 0, 127, 1, 0.718
> 2048, 4, 256, 0, 127, 1, 0.718
> 2048, 0, 512, 0, 127, 1, 0.758
> 2048, 5, 512, 0, 127, 1, 0.754
> 2048, 0, 1024, 0, 127, 1, 1.029
> 2048, 6, 1024, 0, 127, 1, 1.032
> 2048, 0, 2048, 0, 127, 1, 0.826
> 2048, 7, 2048, 0, 127, 1, 0.834
> 2048, 0, 4096, 0, 127, 1, 0.825
> 2048, 8, 4096, 0, 127, 1, 0.83
> 256, 1, 64, 0, 127, 1, 0.657
> 256, 15, 64, 0, 127, 1, 0.657
> 256, 2, 64, 0, 127, 1, 0.657
> 256, 30, 64, 0, 127, 1, 0.523
> 256, 3, 64, 0, 127, 1, 0.657
> 256, 45, 64, 0, 127, 1, 0.654
> 256, 4, 64, 0, 127, 1, 0.657
> 256, 60, 64, 0, 127, 1, 0.526
> 256, 5, 64, 0, 127, 1, 0.658
> 256, 75, 64, 0, 127, 1, 0.658
> 256, 6, 64, 0, 127, 1, 0.655
> 256, 90, 64, 0, 127, 1, 0.523
> 256, 7, 64, 0, 127, 1, 0.655
> 256, 105, 64, 0, 127, 1, 0.654
> 1, 0, 0, 0, 127, 1, 0.98
> 2, 0, 1, 0, 127, 1, 0.978
> 3, 0, 2, 0, 127, 1, 0.975
> 4, 0, 3, 0, 127, 1, 0.976
> 5, 0, 4, 0, 127, 1, 0.977
> 6, 0, 5, 0, 127, 1, 0.981
> 7, 0, 6, 0, 127, 1, 0.982
> 8, 0, 7, 0, 127, 1, 0.98
> 9, 0, 8, 0, 127, 1, 0.978
> 10, 0, 9, 0, 127, 1, 0.981
> 11, 0, 10, 0, 127, 1, 0.984
> 12, 0, 11, 0, 127, 1, 0.982
> 13, 0, 12, 0, 127, 1, 0.98
> 14, 0, 13, 0, 127, 1, 0.978
> 15, 0, 14, 0, 127, 1, 0.979
> 16, 0, 15, 0, 127, 1, 0.986
> 17, 0, 16, 0, 127, 1, 0.529
> 18, 0, 17, 0, 127, 1, 0.566
> 19, 0, 18, 0, 127, 1, 0.575
> 20, 0, 19, 0, 127, 1, 0.573
> 21, 0, 20, 0, 127, 1, 0.579
> 22, 0, 21, 0, 127, 1, 0.595
> 23, 0, 22, 0, 127, 1, 0.585
> 24, 0, 23, 0, 127, 1, 0.586
> 25, 0, 24, 0, 127, 1, 0.587
> 26, 0, 25, 0, 127, 1, 0.592
> 27, 0, 26, 0, 127, 1, 0.595
> 28, 0, 27, 0, 127, 1, 0.592
> 29, 0, 28, 0, 127, 1, 0.6
> 30, 0, 29, 0, 127, 1, 0.598
> 31, 0, 30, 0, 127, 1, 0.595
> 32, 0, 31, 0, 127, 1, 0.592
> 2048, 0, 32, 23, 127, 1, 0.827
> 2048, 1, 32, 23, 127, 1, 0.826
> 2048, 0, 64, 23, 127, 1, 0.824
> 2048, 2, 64, 23, 127, 1, 0.825
> 2048, 0, 128, 23, 127, 1, 0.829
> 2048, 3, 128, 23, 127, 1, 0.824
> 2048, 0, 256, 23, 127, 1, 0.832
> 2048, 4, 256, 23, 127, 1, 0.825
> 2048, 0, 512, 23, 127, 1, 0.831
> 2048, 5, 512, 23, 127, 1, 0.837
> 2048, 0, 1024, 23, 127, 1, 0.721
> 2048, 6, 1024, 23, 127, 1, 0.757
> 2048, 0, 2048, 23, 127, 1, 0.825
> 2048, 7, 2048, 23, 127, 1, 0.824
> 2048, 0, 4096, 23, 127, 1, 0.828
> 2048, 8, 4096, 23, 127, 1, 0.823
> 256, 1, 64, 23, 127, 1, 0.665
> 256, 15, 64, 23, 127, 1, 0.661
> 256, 2, 64, 23, 127, 1, 0.674
> 256, 30, 64, 23, 127, 1, 0.605
> 256, 3, 64, 23, 127, 1, 0.668
> 256, 45, 64, 23, 127, 1, 0.661
> 256, 4, 64, 23, 127, 1, 0.657
> 256, 60, 64, 23, 127, 1, 0.594
> 256, 5, 64, 23, 127, 1, 0.654
> 256, 75, 64, 23, 127, 1, 0.673
> 256, 6, 64, 23, 127, 1, 0.688
> 256, 90, 64, 23, 127, 1, 0.6
> 256, 7, 64, 23, 127, 1, 0.66
> 256, 105, 64, 23, 127, 1, 0.654
> 1, 0, 0, 23, 127, 1, 0.981
> 2, 0, 1, 23, 127, 1, 0.976
> 3, 0, 2, 23, 127, 1, 0.983
> 4, 0, 3, 23, 127, 1, 0.984
> 5, 0, 4, 23, 127, 1, 0.973
> 6, 0, 5, 23, 127, 1, 0.987
> 7, 0, 6, 23, 127, 1, 0.977
> 8, 0, 7, 23, 127, 1, 0.979
> 9, 0, 8, 23, 127, 1, 0.981
> 10, 0, 9, 23, 127, 1, 0.98
> 11, 0, 10, 23, 127, 1, 0.983
> 12, 0, 11, 23, 127, 1, 0.98
> 13, 0, 12, 23, 127, 1, 0.98
> 14, 0, 13, 23, 127, 1, 0.977
> 15, 0, 14, 23, 127, 1, 0.982
> 16, 0, 15, 23, 127, 1, 0.581
> 17, 0, 16, 23, 127, 1, 0.551
> 18, 0, 17, 23, 127, 1, 0.555
> 19, 0, 18, 23, 127, 1, 0.586
> 20, 0, 19, 23, 127, 1, 0.585
> 21, 0, 20, 23, 127, 1, 0.582
> 22, 0, 21, 23, 127, 1, 0.571
> 23, 0, 22, 23, 127, 1, 0.576
> 24, 0, 23, 23, 127, 1, 0.581
> 25, 0, 24, 23, 127, 1, 0.589
> 26, 0, 25, 23, 127, 1, 0.593
> 27, 0, 26, 23, 127, 1, 0.595
> 28, 0, 27, 23, 127, 1, 0.583
> 29, 0, 28, 23, 127, 1, 0.595
> 30, 0, 29, 23, 127, 1, 0.58
> 31, 0, 30, 23, 127, 1, 0.594
> 32, 0, 31, 23, 127, 1, 0.665
> 2048, 0, 32, 23, 127, 2, 0.825
> 2048, 1, 32, 23, 127, 2, 0.818
> 2048, 0, 64, 23, 127, 2, 0.829
> 2048, 2, 64, 23, 127, 2, 0.828
> 2048, 0, 128, 23, 127, 2, 0.823
> 2048, 3, 128, 23, 127, 2, 0.825
> 2048, 0, 256, 23, 127, 2, 0.819
> 2048, 4, 256, 23, 127, 2, 0.828
> 2048, 0, 512, 23, 127, 2, 0.824
> 2048, 5, 512, 23, 127, 2, 0.827
> 2048, 0, 1024, 23, 127, 2, 0.813
> 2048, 6, 1024, 23, 127, 2, 0.834
> 2048, 0, 2048, 23, 127, 2, 0.927
> 2048, 7, 2048, 23, 127, 2, 0.923
> 2048, 0, 4096, 23, 127, 2, 0.818
> 2048, 8, 4096, 23, 127, 2, 0.82
> 256, 1, 64, 23, 127, 2, 0.693
> 256, 15, 64, 23, 127, 2, 0.686
> 256, 2, 64, 23, 127, 2, 0.69
> 256, 30, 64, 23, 127, 2, 0.611
> 256, 3, 64, 23, 127, 2, 0.692
> 256, 45, 64, 23, 127, 2, 0.685
> 256, 4, 64, 23, 127, 2, 0.688
> 256, 60, 64, 23, 127, 2, 0.6
> 256, 5, 64, 23, 127, 2, 0.69
> 256, 75, 64, 23, 127, 2, 0.689
> 256, 6, 64, 23, 127, 2, 0.688
> 256, 90, 64, 23, 127, 2, 0.611
> 256, 7, 64, 23, 127, 2, 0.69
> 256, 105, 64, 23, 127, 2, 0.686
> 1, 0, 0, 23, 127, 2, 0.982
> 2, 0, 1, 23, 127, 2, 0.987
> 3, 0, 2, 23, 127, 2, 0.978
> 4, 0, 3, 23, 127, 2, 0.977
> 5, 0, 4, 23, 127, 2, 0.979
> 6, 0, 5, 23, 127, 2, 0.985
> 7, 0, 6, 23, 127, 2, 0.975
> 8, 0, 7, 23, 127, 2, 0.981
> 9, 0, 8, 23, 127, 2, 0.984
> 10, 0, 9, 23, 127, 2, 0.983
> 11, 0, 10, 23, 127, 2, 0.982
> 12, 0, 11, 23, 127, 2, 0.976
> 13, 0, 12, 23, 127, 2, 0.985
> 14, 0, 13, 23, 127, 2, 0.984
> 15, 0, 14, 23, 127, 2, 0.98
> 16, 0, 15, 23, 127, 2, 0.583
> 17, 0, 16, 23, 127, 2, 0.552
> 18, 0, 17, 23, 127, 2, 0.564
> 19, 0, 18, 23, 127, 2, 0.585
> 20, 0, 19, 23, 127, 2, 0.578
> 21, 0, 20, 23, 127, 2, 0.578
> 22, 0, 21, 23, 127, 2, 0.571
> 23, 0, 22, 23, 127, 2, 0.587
> 24, 0, 23, 23, 127, 2, 0.589
> 25, 0, 24, 23, 127, 2, 0.593
> 26, 0, 25, 23, 127, 2, 0.589
> 27, 0, 26, 23, 127, 2, 0.588
> 28, 0, 27, 23, 127, 2, 0.593
> 29, 0, 28, 23, 127, 2, 0.579
> 30, 0, 29, 23, 127, 2, 0.572
> 31, 0, 30, 23, 127, 2, 0.582
> 32, 0, 31, 23, 127, 2, 0.659
> 2048, 0, 32, 23, 127, 4, 0.822
> 2048, 1, 32, 23, 127, 4, 0.818
> 2048, 0, 64, 23, 127, 4, 0.826
> 2048, 2, 64, 23, 127, 4, 0.824
> 2048, 0, 128, 23, 127, 4, 0.833
> 2048, 3, 128, 23, 127, 4, 0.831
> 2048, 0, 256, 23, 127, 4, 0.826
> 2048, 4, 256, 23, 127, 4, 0.831
> 2048, 0, 512, 23, 127, 4, 0.834
> 2048, 5, 512, 23, 127, 4, 0.83
> 2048, 0, 1024, 23, 127, 4, 0.836
> 2048, 6, 1024, 23, 127, 4, 0.844
> 2048, 0, 2048, 23, 127, 4, 0.696
> 2048, 7, 2048, 23, 127, 4, 0.704
> 2048, 0, 4096, 23, 127, 4, 0.936
> 2048, 8, 4096, 23, 127, 4, 0.925
> 256, 1, 64, 23, 127, 4, 0.694
> 256, 15, 64, 23, 127, 4, 0.69
> 256, 2, 64, 23, 127, 4, 0.687
> 256, 30, 64, 23, 127, 4, 0.612
> 256, 3, 64, 23, 127, 4, 0.685
> 256, 45, 64, 23, 127, 4, 0.685
> 256, 4, 64, 23, 127, 4, 0.684
> 256, 60, 64, 23, 127, 4, 0.606
> 256, 5, 64, 23, 127, 4, 0.69
> 256, 75, 64, 23, 127, 4, 0.688
> 256, 6, 64, 23, 127, 4, 0.69
> 256, 90, 64, 23, 127, 4, 0.615
> 256, 7, 64, 23, 127, 4, 0.691
> 256, 105, 64, 23, 127, 4, 0.688
> 1, 0, 0, 23, 127, 4, 0.982
> 2, 0, 1, 23, 127, 4, 0.983
> 3, 0, 2, 23, 127, 4, 0.981
> 4, 0, 3, 23, 127, 4, 0.984
> 5, 0, 4, 23, 127, 4, 0.963
> 6, 0, 5, 23, 127, 4, 0.978
> 7, 0, 6, 23, 127, 4, 0.985
> 8, 0, 7, 23, 127, 4, 0.986
> 9, 0, 8, 23, 127, 4, 0.978
> 10, 0, 9, 23, 127, 4, 0.985
> 11, 0, 10, 23, 127, 4, 0.986
> 12, 0, 11, 23, 127, 4, 0.983
> 13, 0, 12, 23, 127, 4, 0.986
> 14, 0, 13, 23, 127, 4, 0.98
> 15, 0, 14, 23, 127, 4, 0.979
> 16, 0, 15, 23, 127, 4, 0.582
> 17, 0, 16, 23, 127, 4, 0.542
> 18, 0, 17, 23, 127, 4, 0.564
> 19, 0, 18, 23, 127, 4, 0.571
> 20, 0, 19, 23, 127, 4, 0.582
> 21, 0, 20, 23, 127, 4, 0.573
> 22, 0, 21, 23, 127, 4, 0.575
> 23, 0, 22, 23, 127, 4, 0.578
> 24, 0, 23, 23, 127, 4, 0.58
> 25, 0, 24, 23, 127, 4, 0.592
> 26, 0, 25, 23, 127, 4, 0.588
> 27, 0, 26, 23, 127, 4, 0.574
> 28, 0, 27, 23, 127, 4, 0.589
> 29, 0, 28, 23, 127, 4, 0.56
> 30, 0, 29, 23, 127, 4, 0.587
> 31, 0, 30, 23, 127, 4, 0.584
> 32, 0, 31, 23, 127, 4, 0.664
> 2048, 0, 32, 23, 127, 8, 0.826
> 2048, 1, 32, 23, 127, 8, 0.821
> 2048, 0, 64, 23, 127, 8, 0.828
> 2048, 2, 64, 23, 127, 8, 0.827
> 2048, 0, 128, 23, 127, 8, 0.833
> 2048, 3, 128, 23, 127, 8, 0.83
> 2048, 0, 256, 23, 127, 8, 0.855
> 2048, 4, 256, 23, 127, 8, 0.849
> 2048, 0, 512, 23, 127, 8, 0.849
> 2048, 5, 512, 23, 127, 8, 0.851
> 2048, 0, 1024, 23, 127, 8, 0.856
> 2048, 6, 1024, 23, 127, 8, 0.862
> 2048, 0, 2048, 23, 127, 8, 0.709
> 2048, 7, 2048, 23, 127, 8, 0.712
> 2048, 0, 4096, 23, 127, 8, 0.702
> 2048, 8, 4096, 23, 127, 8, 0.701
> 256, 1, 64, 23, 127, 8, 0.689
> 256, 15, 64, 23, 127, 8, 0.688
> 256, 2, 64, 23, 127, 8, 0.691
> 256, 30, 64, 23, 127, 8, 0.612
> 256, 3, 64, 23, 127, 8, 0.688
> 256, 45, 64, 23, 127, 8, 0.686
> 256, 4, 64, 23, 127, 8, 0.694
> 256, 60, 64, 23, 127, 8, 0.609
> 256, 5, 64, 23, 127, 8, 0.69
> 256, 75, 64, 23, 127, 8, 0.69
> 256, 6, 64, 23, 127, 8, 0.691
> 256, 90, 64, 23, 127, 8, 0.612
> 256, 7, 64, 23, 127, 8, 0.689
> 256, 105, 64, 23, 127, 8, 0.688
> 1, 0, 0, 23, 127, 8, 0.98
> 2, 0, 1, 23, 127, 8, 0.978
> 3, 0, 2, 23, 127, 8, 0.98
> 4, 0, 3, 23, 127, 8, 0.978
> 5, 0, 4, 23, 127, 8, 0.977
> 6, 0, 5, 23, 127, 8, 0.984
> 7, 0, 6, 23, 127, 8, 0.982
> 8, 0, 7, 23, 127, 8, 0.983
> 9, 0, 8, 23, 127, 8, 0.987
> 10, 0, 9, 23, 127, 8, 0.979
> 11, 0, 10, 23, 127, 8, 0.985
> 12, 0, 11, 23, 127, 8, 0.981
> 13, 0, 12, 23, 127, 8, 0.98
> 14, 0, 13, 23, 127, 8, 0.982
> 15, 0, 14, 23, 127, 8, 0.981
> 16, 0, 15, 23, 127, 8, 0.579
> 17, 0, 16, 23, 127, 8, 0.531
> 18, 0, 17, 23, 127, 8, 0.577
> 19, 0, 18, 23, 127, 8, 0.588
> 20, 0, 19, 23, 127, 8, 0.571
> 21, 0, 20, 23, 127, 8, 0.576
> 22, 0, 21, 23, 127, 8, 0.59
> 23, 0, 22, 23, 127, 8, 0.574
> 24, 0, 23, 23, 127, 8, 0.583
> 25, 0, 24, 23, 127, 8, 0.581
> 26, 0, 25, 23, 127, 8, 0.592
> 27, 0, 26, 23, 127, 8, 0.586
> 28, 0, 27, 23, 127, 8, 0.588
> 29, 0, 28, 23, 127, 8, 0.578
> 30, 0, 29, 23, 127, 8, 0.573
> 31, 0, 30, 23, 127, 8, 0.588
> 32, 0, 31, 23, 127, 8, 0.664
> 2048, 0, 32, 23, 127, 16, 0.825
> 2048, 1, 32, 23, 127, 16, 0.823
> 2048, 0, 64, 23, 127, 16, 0.831
> 2048, 2, 64, 23, 127, 16, 0.822
> 2048, 0, 128, 23, 127, 16, 0.831
> 2048, 3, 128, 23, 127, 16, 0.831
> 2048, 0, 256, 23, 127, 16, 0.849
> 2048, 4, 256, 23, 127, 16, 0.85
> 2048, 0, 512, 23, 127, 16, 0.751
> 2048, 5, 512, 23, 127, 16, 0.75
> 2048, 0, 1024, 23, 127, 16, 0.913
> 2048, 6, 1024, 23, 127, 16, 0.895
> 2048, 0, 2048, 23, 127, 16, 0.736
> 2048, 7, 2048, 23, 127, 16, 0.741
> 2048, 0, 4096, 23, 127, 16, 0.712
> 2048, 8, 4096, 23, 127, 16, 0.711
> 256, 1, 64, 23, 127, 16, 0.758
> 256, 15, 64, 23, 127, 16, 0.692
> 256, 2, 64, 23, 127, 16, 0.692
> 256, 30, 64, 23, 127, 16, 0.613
> 256, 3, 64, 23, 127, 16, 0.69
> 256, 45, 64, 23, 127, 16, 0.687
> 256, 4, 64, 23, 127, 16, 0.69
> 256, 60, 64, 23, 127, 16, 0.604
> 256, 5, 64, 23, 127, 16, 0.687
> 256, 75, 64, 23, 127, 16, 0.687
> 256, 6, 64, 23, 127, 16, 0.69
> 256, 90, 64, 23, 127, 16, 0.61
> 256, 7, 64, 23, 127, 16, 0.69
> 256, 105, 64, 23, 127, 16, 0.685
> 1, 0, 0, 23, 127, 16, 0.981
> 2, 0, 1, 23, 127, 16, 0.985
> 3, 0, 2, 23, 127, 16, 0.985
> 4, 0, 3, 23, 127, 16, 0.981
> 5, 0, 4, 23, 127, 16, 0.979
> 6, 0, 5, 23, 127, 16, 0.986
> 7, 0, 6, 23, 127, 16, 0.986
> 8, 0, 7, 23, 127, 16, 0.982
> 9, 0, 8, 23, 127, 16, 0.982
> 10, 0, 9, 23, 127, 16, 0.98
> 11, 0, 10, 23, 127, 16, 0.983
> 12, 0, 11, 23, 127, 16, 0.982
> 13, 0, 12, 23, 127, 16, 0.982
> 14, 0, 13, 23, 127, 16, 0.982
> 15, 0, 14, 23, 127, 16, 0.982
> 16, 0, 15, 23, 127, 16, 0.582
> 17, 0, 16, 23, 127, 16, 0.542
> 18, 0, 17, 23, 127, 16, 0.554
> 19, 0, 18, 23, 127, 16, 0.562
> 20, 0, 19, 23, 127, 16, 0.587
> 21, 0, 20, 23, 127, 16, 0.584
> 22, 0, 21, 23, 127, 16, 0.587
> 23, 0, 22, 23, 127, 16, 0.594
> 24, 0, 23, 23, 127, 16, 0.581
> 25, 0, 24, 23, 127, 16, 0.577
> 26, 0, 25, 23, 127, 16, 0.588
> 27, 0, 26, 23, 127, 16, 0.589
> 28, 0, 27, 23, 127, 16, 0.596
> 29, 0, 28, 23, 127, 16, 0.591
> 30, 0, 29, 23, 127, 16, 0.585
> 31, 0, 30, 23, 127, 16, 0.59
> 32, 0, 31, 23, 127, 16, 0.669
>
> sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
> sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
> sysdeps/x86_64/strrchr.S | 505 +++++++++++++++---------
> sysdeps/x86_64/wcsrchr.S | 268 +------------
> 4 files changed, 334 insertions(+), 444 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> index db1b44c23c..866396e947 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> @@ -17,7 +17,7 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define strrchr __strrchr_sse2
> +# define STRRCHR __strrchr_sse2
>
> # undef weak_alias
> # define weak_alias(strrchr, rindex)
> diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> index 78d1ca6553..69d2f3cdb1 100644
> --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> @@ -17,7 +17,6 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define wcsrchr __wcsrchr_sse2
> +# define STRRCHR __wcsrchr_sse2
> #endif
> -
> #include "../wcsrchr.S"
> diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> index 50d886713e..94449ad806 100644
> --- a/sysdeps/x86_64/strrchr.S
> +++ b/sysdeps/x86_64/strrchr.S
> @@ -19,210 +19,355 @@
>
> #include <sysdep.h>
>
> +#ifndef STRRCHR
> +# define STRRCHR strrchr
> +#endif
> +
> +#ifdef USE_AS_WCSRCHR
> +# define PCMPEQ pcmpeqd
> +# define CHAR_SIZE 4
> +# define PMINU pminud
> +#else
> +# define PCMPEQ pcmpeqb
> +# define CHAR_SIZE 1
> +# define PMINU pminub
> +#endif
> +
> +#define PAGE_SIZE 4096
> +#define VEC_SIZE 16
> +
> .text
> -ENTRY (strrchr)
> - movd %esi, %xmm1
> +ENTRY(STRRCHR)
> + movd %esi, %xmm0
> movq %rdi, %rax
> - andl $4095, %eax
> - punpcklbw %xmm1, %xmm1
> - cmpq $4032, %rax
> - punpcklwd %xmm1, %xmm1
> - pshufd $0, %xmm1, %xmm1
> + andl $(PAGE_SIZE - 1), %eax
> +#ifndef USE_AS_WCSRCHR
> + punpcklbw %xmm0, %xmm0
> + punpcklwd %xmm0, %xmm0
> +#endif
> + pshufd $0, %xmm0, %xmm0
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> ja L(cross_page)
> - movdqu (%rdi), %xmm0
> +
> +L(cross_page_continue):
> + movups (%rdi), %xmm1
> pxor %xmm2, %xmm2
> - movdqa %xmm0, %xmm3
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm3
> - pmovmskb %xmm0, %ecx
> - pmovmskb %xmm3, %edx
> - testq %rdx, %rdx
> - je L(next_48_bytes)
> - leaq -1(%rdx), %rax
> - xorq %rdx, %rax
> - andq %rcx, %rax
> - je L(exit)
> - bsrq %rax, %rax
> + PCMPEQ %xmm1, %xmm2
> + pmovmskb %xmm2, %ecx
> + testl %ecx, %ecx
> + jz L(aligned_more)
> +
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(ret0)
> + bsrl %eax, %eax
> addq %rdi, %rax
> + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> + search CHAR is zero we are correct. Either way `andq
> + -CHAR_SIZE, %rax` gets the correct result. */
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> +L(ret0):
> ret
>
> + /* Returns for first vec x1/x2 have hard coded backward search
> + path for earlier matches. */
> .p2align 4
> -L(next_48_bytes):
> - movdqu 16(%rdi), %xmm4
> - movdqa %xmm4, %xmm5
> - movdqu 32(%rdi), %xmm3
> - pcmpeqb %xmm1, %xmm4
> - pcmpeqb %xmm2, %xmm5
> - movdqu 48(%rdi), %xmm0
> - pmovmskb %xmm5, %edx
> - movdqa %xmm3, %xmm5
> - pcmpeqb %xmm1, %xmm3
> - pcmpeqb %xmm2, %xmm5
> - pcmpeqb %xmm0, %xmm2
> - salq $16, %rdx
> - pmovmskb %xmm3, %r8d
> - pmovmskb %xmm5, %eax
> - pmovmskb %xmm2, %esi
> - salq $32, %r8
> - salq $32, %rax
> - pcmpeqb %xmm1, %xmm0
> - orq %rdx, %rax
> - movq %rsi, %rdx
> - pmovmskb %xmm4, %esi
> - salq $48, %rdx
> - salq $16, %rsi
> - orq %r8, %rsi
> - orq %rcx, %rsi
> - pmovmskb %xmm0, %ecx
> - salq $48, %rcx
> - orq %rcx, %rsi
> - orq %rdx, %rax
> - je L(loop_header2)
> - leaq -1(%rax), %rcx
> - xorq %rax, %rcx
> - andq %rcx, %rsi
> - je L(exit)
> - bsrq %rsi, %rsi
> - leaq (%rdi,%rsi), %rax
> +L(first_vec_x0_test):
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + testl %eax, %eax
> + jz L(ret0)
> + bsrl %eax, %eax
> + addq %r8, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> ret
>
> .p2align 4
> -L(loop_header2):
> - testq %rsi, %rsi
> - movq %rdi, %rcx
> - je L(no_c_found)
> -L(loop_header):
> - addq $64, %rdi
> - pxor %xmm7, %xmm7
> - andq $-64, %rdi
> - jmp L(loop_entry)
> +L(first_vec_x1):
> + PCMPEQ %xmm0, %xmm2
> + pmovmskb %xmm2, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_vec_x0_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
>
> .p2align 4
> -L(loop64):
> - testq %rdx, %rdx
> - cmovne %rdx, %rsi
> - cmovne %rdi, %rcx
> - addq $64, %rdi
> -L(loop_entry):
> - movdqa 32(%rdi), %xmm3
> - pxor %xmm6, %xmm6
> - movdqa 48(%rdi), %xmm2
> - movdqa %xmm3, %xmm0
> - movdqa 16(%rdi), %xmm4
> - pminub %xmm2, %xmm0
> - movdqa (%rdi), %xmm5
> - pminub %xmm4, %xmm0
> - pminub %xmm5, %xmm0
> - pcmpeqb %xmm7, %xmm0
> - pmovmskb %xmm0, %eax
> - movdqa %xmm5, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %r9d
> - movdqa %xmm4, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %edx
> - movdqa %xmm3, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - salq $16, %rdx
> - pmovmskb %xmm0, %r10d
> - movdqa %xmm2, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - salq $32, %r10
> - orq %r10, %rdx
> - pmovmskb %xmm0, %r8d
> - orq %r9, %rdx
> - salq $48, %r8
> - orq %r8, %rdx
> +L(first_vec_x1_test):
> + PCMPEQ %xmm0, %xmm2
> + pmovmskb %xmm2, %eax
> testl %eax, %eax
> - je L(loop64)
> - pcmpeqb %xmm6, %xmm4
> - pcmpeqb %xmm6, %xmm3
> - pcmpeqb %xmm6, %xmm5
> - pmovmskb %xmm4, %eax
> - pmovmskb %xmm3, %r10d
> - pcmpeqb %xmm6, %xmm2
> - pmovmskb %xmm5, %r9d
> - salq $32, %r10
> - salq $16, %rax
> - pmovmskb %xmm2, %r8d
> - orq %r10, %rax
> - orq %r9, %rax
> - salq $48, %r8
> - orq %r8, %rax
> - leaq -1(%rax), %r8
> - xorq %rax, %r8
> - andq %r8, %rdx
> - cmovne %rdi, %rcx
> - cmovne %rdx, %rsi
> - bsrq %rsi, %rsi
> - leaq (%rcx,%rsi), %rax
> + jz L(first_vec_x0_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(first_vec_x2):
> + PCMPEQ %xmm0, %xmm3
> + pmovmskb %xmm3, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_vec_x1_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(aligned_more):
> + /* Save original pointer if match was in VEC 0. */
> + movq %rdi, %r8
> + andq $-VEC_SIZE, %rdi
> +
> + movaps VEC_SIZE(%rdi), %xmm2
> + pxor %xmm3, %xmm3
> + PCMPEQ %xmm2, %xmm3
> + pmovmskb %xmm3, %ecx
> + testl %ecx, %ecx
> + jnz L(first_vec_x1)
> +
> + movaps (VEC_SIZE * 2)(%rdi), %xmm3
> + pxor %xmm4, %xmm4
> + PCMPEQ %xmm3, %xmm4
> + pmovmskb %xmm4, %ecx
> + testl %ecx, %ecx
> + jnz L(first_vec_x2)
> +
> + addq $VEC_SIZE, %rdi
> + /* Save pointer again before realigning. */
> + movq %rdi, %rsi
> + andq $-(VEC_SIZE * 2), %rdi
> + .p2align 4
> +L(first_loop):
> + /* Do 2x VEC at a time. */
> + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> + /* SSE2 has no pminud. */
> +#ifdef NO_PMINU
Do we really need SSE4.1 wcsrchr? I think we should focus on AVX2 and
above.
> + movaps %xmm5, %xmm6
> + pxor %xmm8, %xmm8
> +
> + PCMPEQ %xmm8, %xmm5
> + PCMPEQ %xmm4, %xmm8
> + por %xmm5, %xmm8
> +#else
> + movaps %xmm5, %xmm6
> + PMINU %xmm4, %xmm5
> +#endif
> +
> + movaps %xmm4, %xmm9
> + PCMPEQ %xmm0, %xmm4
> + PCMPEQ %xmm0, %xmm6
> + movaps %xmm6, %xmm7
> + por %xmm4, %xmm6
> +#ifndef NO_PMINU
> + pxor %xmm8, %xmm8
> + PCMPEQ %xmm5, %xmm8
> +#endif
> + pmovmskb %xmm8, %ecx
> + pmovmskb %xmm6, %eax
> +
> + addq $(VEC_SIZE * 2), %rdi
> + /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> + macro-fuse with `jz`. */
> + addl %ecx, %eax
> + jz L(first_loop)
> +
> + /* Check if there is zero match. */
> + testl %ecx, %ecx
> + jz L(second_loop_match)
> +
> + /* Check if there was a match in last iteration. */
> + subl %ecx, %eax
> + jnz L(new_match)
> +
> +L(first_loop_old_match):
> + PCMPEQ %xmm0, %xmm2
> + PCMPEQ %xmm0, %xmm3
> + pmovmskb %xmm2, %ecx
> + pmovmskb %xmm3, %eax
> + addl %eax, %ecx
> + jz L(first_vec_x0_test)
> + /* NB: We could move this shift to before the branch and save a
> + bit of code size / performance on the fall through. The
> + branch leads to the null case which generally seems hotter
> + than char in first 3x VEC. */
> + sall $16, %eax
> + orl %ecx, %eax
> +
> + bsrl %eax, %eax
> + addq %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(new_match):
> + pxor %xmm6, %xmm6
> + PCMPEQ %xmm9, %xmm6
> + pmovmskb %xmm6, %eax
> + sall $16, %ecx
> + orl %eax, %ecx
> +
> + /* We can't reuse either of the old comparisons since we mask
> + off zeros after the first zero (instead of using the full
> + comparison), so we can't guarantee no interference between a
> + match after the end of the string and a valid match. */
> + pmovmskb %xmm4, %eax
> + pmovmskb %xmm7, %edx
> + sall $16, %edx
> + orl %edx, %eax
> +
> + leal -1(%ecx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_loop_old_match)
> + bsrl %eax, %eax
> + addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> ret
>
> + /* Save minimum state for getting most recent match. We can
> + throw out all previous work. */
> .p2align 4
> -L(no_c_found):
> - movl $1, %esi
> - xorl %ecx, %ecx
> - jmp L(loop_header)
> +L(second_loop_match):
> + movq %rdi, %rsi
> + movaps %xmm4, %xmm2
> + movaps %xmm7, %xmm3
>
> .p2align 4
> -L(exit):
> - xorl %eax, %eax
> +L(second_loop):
> + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> +#ifdef NO_PMINU
> + movaps %xmm5, %xmm6
> + pxor %xmm8, %xmm8
> +
> + PCMPEQ %xmm8, %xmm5
> + PCMPEQ %xmm4, %xmm8
> + por %xmm5, %xmm8
> +#else
> + movaps %xmm5, %xmm6
> + PMINU %xmm4, %xmm5
> +#endif
> +
> + movaps %xmm4, %xmm9
> + PCMPEQ %xmm0, %xmm4
> + PCMPEQ %xmm0, %xmm6
> + movaps %xmm6, %xmm7
> + por %xmm4, %xmm6
> +#ifndef NO_PMINU
> + pxor %xmm8, %xmm8
> + PCMPEQ %xmm5, %xmm8
> +#endif
> +
> + pmovmskb %xmm8, %ecx
> + pmovmskb %xmm6, %eax
> +
> + addq $(VEC_SIZE * 2), %rdi
> + /* Either null term or new occurrence of CHAR. */
> + addl %ecx, %eax
> + jz L(second_loop)
> +
> + /* No null term so it must be a new occurrence of CHAR. */
> + testl %ecx, %ecx
> + jz L(second_loop_match)
> +
> +
> + subl %ecx, %eax
> + jnz L(second_loop_new_match)
> +
> +L(second_loop_old_match):
> + pmovmskb %xmm2, %ecx
> + pmovmskb %xmm3, %eax
> + sall $16, %eax
> + orl %ecx, %eax
> + bsrl %eax, %eax
> + addq %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> ret
>
> .p2align 4
> +L(second_loop_new_match):
> + pxor %xmm6, %xmm6
> + PCMPEQ %xmm9, %xmm6
> + pmovmskb %xmm6, %eax
> + sall $16, %ecx
> + orl %eax, %ecx
> +
> + /* We can't reuse either of the old comparisons since we mask
> + off zeros after the first zero (instead of using the full
> + comparison), so we can't guarantee no interference between a
> + match after the end of the string and a valid match. */
> + pmovmskb %xmm4, %eax
> + pmovmskb %xmm7, %edx
> + sall $16, %edx
> + orl %edx, %eax
> +
> + leal -1(%ecx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(second_loop_old_match)
> + bsrl %eax, %eax
> + addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4,, 4
> L(cross_page):
> - movq %rdi, %rax
> - pxor %xmm0, %xmm0
> - andq $-64, %rax
> - movdqu (%rax), %xmm5
> - movdqa %xmm5, %xmm6
> - movdqu 16(%rax), %xmm4
> - pcmpeqb %xmm1, %xmm5
> - pcmpeqb %xmm0, %xmm6
> - movdqu 32(%rax), %xmm3
> - pmovmskb %xmm6, %esi
> - movdqa %xmm4, %xmm6
> - movdqu 48(%rax), %xmm2
> - pcmpeqb %xmm1, %xmm4
> - pcmpeqb %xmm0, %xmm6
> - pmovmskb %xmm6, %edx
> - movdqa %xmm3, %xmm6
> - pcmpeqb %xmm1, %xmm3
> - pcmpeqb %xmm0, %xmm6
> - pcmpeqb %xmm2, %xmm0
> - salq $16, %rdx
> - pmovmskb %xmm3, %r9d
> - pmovmskb %xmm6, %r8d
> - pmovmskb %xmm0, %ecx
> - salq $32, %r9
> - salq $32, %r8
> - pcmpeqb %xmm1, %xmm2
> - orq %r8, %rdx
> - salq $48, %rcx
> - pmovmskb %xmm5, %r8d
> - orq %rsi, %rdx
> - pmovmskb %xmm4, %esi
> - orq %rcx, %rdx
> - pmovmskb %xmm2, %ecx
> - salq $16, %rsi
> - salq $48, %rcx
> - orq %r9, %rsi
> - orq %r8, %rsi
> - orq %rcx, %rsi
> + movq %rdi, %rsi
> + andq $-VEC_SIZE, %rsi
> + movaps (%rsi), %xmm1
> + pxor %xmm2, %xmm2
> + PCMPEQ %xmm1, %xmm2
> + pmovmskb %xmm2, %edx
> movl %edi, %ecx
> - subl %eax, %ecx
> - shrq %cl, %rdx
> - shrq %cl, %rsi
> - testq %rdx, %rdx
> - je L(loop_header2)
> - leaq -1(%rdx), %rax
> - xorq %rdx, %rax
> - andq %rax, %rsi
> - je L(exit)
> - bsrq %rsi, %rax
> + andl $(VEC_SIZE - 1), %ecx
> + sarl %cl, %edx
> + jz L(cross_page_continue)
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + sarl %cl, %eax
> + leal -1(%rdx), %ecx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(ret1)
> + bsrl %eax, %eax
> addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> +L(ret1):
> ret
> -END (strrchr)
> +END(STRRCHR)
>
> -weak_alias (strrchr, rindex)
> -libc_hidden_builtin_def (strrchr)
> +#ifndef USE_AS_WCSRCHR
> + weak_alias (STRRCHR, rindex)
> + libc_hidden_builtin_def (STRRCHR)
> +#endif
> diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> index 61552954de..2b80efc5ef 100644
> --- a/sysdeps/x86_64/wcsrchr.S
> +++ b/sysdeps/x86_64/wcsrchr.S
> @@ -1,4 +1,4 @@
> -/* wcsrchr with SSSE3
> +/* wcsrchr optimized with SSE2.
> Copyright (C) 2011-2022 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> @@ -16,266 +16,12 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
>
> - .text
> -ENTRY (wcsrchr)
> +#define USE_AS_WCSRCHR 1
> +#define NO_PMINU 1
>
> - movd %rsi, %xmm1
> - mov %rdi, %rcx
> - punpckldq %xmm1, %xmm1
> - pxor %xmm2, %xmm2
> - punpckldq %xmm1, %xmm1
> - and $63, %rcx
> - cmp $48, %rcx
> - ja L(crosscache)
> +#ifndef STRRCHR
> +# define STRRCHR wcsrchr
> +#endif
>
> - movdqu (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm0, %rax
> - add $16, %rdi
> -
> - test %rax, %rax
> - jnz L(unaligned_match1)
> -
> - test %rcx, %rcx
> - jnz L(return_null)
> -
> - and $-16, %rdi
> - xor %r8, %r8
> - jmp L(loop)
> -
> - .p2align 4
> -L(unaligned_match1):
> - test %rcx, %rcx
> - jnz L(prolog_find_zero_1)
> -
> - mov %rax, %r8
> - mov %rdi, %rsi
> - and $-16, %rdi
> - jmp L(loop)
> -
> - .p2align 4
> -L(crosscache):
> - and $15, %rcx
> - and $-16, %rdi
> - pxor %xmm3, %xmm3
> - movdqa (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm3
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm3, %rdx
> - pmovmskb %xmm0, %rax
> - shr %cl, %rdx
> - shr %cl, %rax
> - add $16, %rdi
> -
> - test %rax, %rax
> - jnz L(unaligned_match)
> -
> - test %rdx, %rdx
> - jnz L(return_null)
> -
> - xor %r8, %r8
> - jmp L(loop)
> -
> - .p2align 4
> -L(unaligned_match):
> - test %rdx, %rdx
> - jnz L(prolog_find_zero)
> -
> - mov %rax, %r8
> - lea (%rdi, %rcx), %rsi
> -
> -/* Loop start on aligned string. */
> - .p2align 4
> -L(loop):
> - movdqa (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm0, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm3
> - pcmpeqd %xmm3, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm3
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm3, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm4
> - pcmpeqd %xmm4, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm4
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm4, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm5
> - pcmpeqd %xmm5, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm5
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm5, %rax
> - or %rax, %rcx
> - jz L(loop)
> -
> - .p2align 4
> -L(matches):
> - test %rax, %rax
> - jnz L(match)
> -L(return_value):
> - test %r8, %r8
> - jz L(return_null)
> - mov %r8, %rax
> - mov %rsi, %rdi
> -
> - test $15 << 4, %ah
> - jnz L(match_fourth_wchar)
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match):
> - pmovmskb %xmm2, %rcx
> - test %rcx, %rcx
> - jnz L(find_zero)
> - mov %rax, %r8
> - mov %rdi, %rsi
> - jmp L(loop)
> -
> - .p2align 4
> -L(find_zero):
> - test $15, %cl
> - jnz L(find_zero_in_first_wchar)
> - test %cl, %cl
> - jnz L(find_zero_in_second_wchar)
> - test $15, %ch
> - jnz L(find_zero_in_third_wchar)
> -
> - and $1 << 13 - 1, %rax
> - jz L(return_value)
> -
> - test $15 << 4, %ah
> - jnz L(match_fourth_wchar)
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(find_zero_in_first_wchar):
> - test $1, %rax
> - jz L(return_value)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(find_zero_in_second_wchar):
> - and $1 << 5 - 1, %rax
> - jz L(return_value)
> -
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(find_zero_in_third_wchar):
> - and $1 << 9 - 1, %rax
> - jz L(return_value)
> -
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero):
> - add %rcx, %rdi
> - mov %rdx, %rcx
> -L(prolog_find_zero_1):
> - test $15, %cl
> - jnz L(prolog_find_zero_in_first_wchar)
> - test %cl, %cl
> - jnz L(prolog_find_zero_in_second_wchar)
> - test $15, %ch
> - jnz L(prolog_find_zero_in_third_wchar)
> -
> - and $1 << 13 - 1, %rax
> - jz L(return_null)
> -
> - test $15 << 4, %ah
> - jnz L(match_fourth_wchar)
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_in_first_wchar):
> - test $1, %rax
> - jz L(return_null)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_in_second_wchar):
> - and $1 << 5 - 1, %rax
> - jz L(return_null)
> -
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_in_third_wchar):
> - and $1 << 9 - 1, %rax
> - jz L(return_null)
> -
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_second_wchar):
> - lea -12(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_third_wchar):
> - lea -8(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_fourth_wchar):
> - lea -4(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(return_null):
> - xor %rax, %rax
> - ret
> -
> -END (wcsrchr)
> +#include "../strrchr.S"
> --
> 2.25.1
>
--
H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2
2022-04-21 20:26 ` H.J. Lu
@ 2022-04-21 20:57 ` Noah Goldstein
2022-04-21 21:48 ` H.J. Lu
0 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 20:57 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 21, 2022 at 3:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The new code unrolls the main loop slightly without adding too much
> > overhead and minimizes the comparisons for the search CHAR.
> >
> > Geometric Mean of all benchmarks New / Old: 0.741
> > See email for all results.
> >
> > Full xcheck passes on x86_64 with and without multiarch enabled.
> > ---
> > Results For: strrchr
> >
> > Geometric Mean of N=30 runs.
> >
> > Geometric Mean of all benchmarks New / Old: 0.741
> > Benchmarks performance on Tigerlake:
> > https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> >
> > len, align, pos, seek, max_char, freq, New Time / Old Time
> > 2048, 0, 32, 0, 127, 1, 0.647
> > 2048, 1, 32, 0, 127, 1, 0.621
> > 2048, 0, 64, 0, 127, 1, 0.661
> > 2048, 2, 64, 0, 127, 1, 0.655
> > 2048, 0, 128, 0, 127, 1, 0.69
> > 2048, 3, 128, 0, 127, 1, 0.689
> > 2048, 0, 256, 0, 127, 1, 0.718
> > 2048, 4, 256, 0, 127, 1, 0.718
> > 2048, 0, 512, 0, 127, 1, 0.758
> > 2048, 5, 512, 0, 127, 1, 0.754
> > 2048, 0, 1024, 0, 127, 1, 1.029
> > 2048, 6, 1024, 0, 127, 1, 1.032
> > 2048, 0, 2048, 0, 127, 1, 0.826
> > 2048, 7, 2048, 0, 127, 1, 0.834
> > 2048, 0, 4096, 0, 127, 1, 0.825
> > 2048, 8, 4096, 0, 127, 1, 0.83
> > 256, 1, 64, 0, 127, 1, 0.657
> > 256, 15, 64, 0, 127, 1, 0.657
> > 256, 2, 64, 0, 127, 1, 0.657
> > 256, 30, 64, 0, 127, 1, 0.523
> > 256, 3, 64, 0, 127, 1, 0.657
> > 256, 45, 64, 0, 127, 1, 0.654
> > 256, 4, 64, 0, 127, 1, 0.657
> > 256, 60, 64, 0, 127, 1, 0.526
> > 256, 5, 64, 0, 127, 1, 0.658
> > 256, 75, 64, 0, 127, 1, 0.658
> > 256, 6, 64, 0, 127, 1, 0.655
> > 256, 90, 64, 0, 127, 1, 0.523
> > 256, 7, 64, 0, 127, 1, 0.655
> > 256, 105, 64, 0, 127, 1, 0.654
> > 1, 0, 0, 0, 127, 1, 0.98
> > 2, 0, 1, 0, 127, 1, 0.978
> > 3, 0, 2, 0, 127, 1, 0.975
> > 4, 0, 3, 0, 127, 1, 0.976
> > 5, 0, 4, 0, 127, 1, 0.977
> > 6, 0, 5, 0, 127, 1, 0.981
> > 7, 0, 6, 0, 127, 1, 0.982
> > 8, 0, 7, 0, 127, 1, 0.98
> > 9, 0, 8, 0, 127, 1, 0.978
> > 10, 0, 9, 0, 127, 1, 0.981
> > 11, 0, 10, 0, 127, 1, 0.984
> > 12, 0, 11, 0, 127, 1, 0.982
> > 13, 0, 12, 0, 127, 1, 0.98
> > 14, 0, 13, 0, 127, 1, 0.978
> > 15, 0, 14, 0, 127, 1, 0.979
> > 16, 0, 15, 0, 127, 1, 0.986
> > 17, 0, 16, 0, 127, 1, 0.529
> > 18, 0, 17, 0, 127, 1, 0.566
> > 19, 0, 18, 0, 127, 1, 0.575
> > 20, 0, 19, 0, 127, 1, 0.573
> > 21, 0, 20, 0, 127, 1, 0.579
> > 22, 0, 21, 0, 127, 1, 0.595
> > 23, 0, 22, 0, 127, 1, 0.585
> > 24, 0, 23, 0, 127, 1, 0.586
> > 25, 0, 24, 0, 127, 1, 0.587
> > 26, 0, 25, 0, 127, 1, 0.592
> > 27, 0, 26, 0, 127, 1, 0.595
> > 28, 0, 27, 0, 127, 1, 0.592
> > 29, 0, 28, 0, 127, 1, 0.6
> > 30, 0, 29, 0, 127, 1, 0.598
> > 31, 0, 30, 0, 127, 1, 0.595
> > 32, 0, 31, 0, 127, 1, 0.592
> > 2048, 0, 32, 23, 127, 1, 0.827
> > 2048, 1, 32, 23, 127, 1, 0.826
> > 2048, 0, 64, 23, 127, 1, 0.824
> > 2048, 2, 64, 23, 127, 1, 0.825
> > 2048, 0, 128, 23, 127, 1, 0.829
> > 2048, 3, 128, 23, 127, 1, 0.824
> > 2048, 0, 256, 23, 127, 1, 0.832
> > 2048, 4, 256, 23, 127, 1, 0.825
> > 2048, 0, 512, 23, 127, 1, 0.831
> > 2048, 5, 512, 23, 127, 1, 0.837
> > 2048, 0, 1024, 23, 127, 1, 0.721
> > 2048, 6, 1024, 23, 127, 1, 0.757
> > 2048, 0, 2048, 23, 127, 1, 0.825
> > 2048, 7, 2048, 23, 127, 1, 0.824
> > 2048, 0, 4096, 23, 127, 1, 0.828
> > 2048, 8, 4096, 23, 127, 1, 0.823
> > 256, 1, 64, 23, 127, 1, 0.665
> > 256, 15, 64, 23, 127, 1, 0.661
> > 256, 2, 64, 23, 127, 1, 0.674
> > 256, 30, 64, 23, 127, 1, 0.605
> > 256, 3, 64, 23, 127, 1, 0.668
> > 256, 45, 64, 23, 127, 1, 0.661
> > 256, 4, 64, 23, 127, 1, 0.657
> > 256, 60, 64, 23, 127, 1, 0.594
> > 256, 5, 64, 23, 127, 1, 0.654
> > 256, 75, 64, 23, 127, 1, 0.673
> > 256, 6, 64, 23, 127, 1, 0.688
> > 256, 90, 64, 23, 127, 1, 0.6
> > 256, 7, 64, 23, 127, 1, 0.66
> > 256, 105, 64, 23, 127, 1, 0.654
> > 1, 0, 0, 23, 127, 1, 0.981
> > 2, 0, 1, 23, 127, 1, 0.976
> > 3, 0, 2, 23, 127, 1, 0.983
> > 4, 0, 3, 23, 127, 1, 0.984
> > 5, 0, 4, 23, 127, 1, 0.973
> > 6, 0, 5, 23, 127, 1, 0.987
> > 7, 0, 6, 23, 127, 1, 0.977
> > 8, 0, 7, 23, 127, 1, 0.979
> > 9, 0, 8, 23, 127, 1, 0.981
> > 10, 0, 9, 23, 127, 1, 0.98
> > 11, 0, 10, 23, 127, 1, 0.983
> > 12, 0, 11, 23, 127, 1, 0.98
> > 13, 0, 12, 23, 127, 1, 0.98
> > 14, 0, 13, 23, 127, 1, 0.977
> > 15, 0, 14, 23, 127, 1, 0.982
> > 16, 0, 15, 23, 127, 1, 0.581
> > 17, 0, 16, 23, 127, 1, 0.551
> > 18, 0, 17, 23, 127, 1, 0.555
> > 19, 0, 18, 23, 127, 1, 0.586
> > 20, 0, 19, 23, 127, 1, 0.585
> > 21, 0, 20, 23, 127, 1, 0.582
> > 22, 0, 21, 23, 127, 1, 0.571
> > 23, 0, 22, 23, 127, 1, 0.576
> > 24, 0, 23, 23, 127, 1, 0.581
> > 25, 0, 24, 23, 127, 1, 0.589
> > 26, 0, 25, 23, 127, 1, 0.593
> > 27, 0, 26, 23, 127, 1, 0.595
> > 28, 0, 27, 23, 127, 1, 0.583
> > 29, 0, 28, 23, 127, 1, 0.595
> > 30, 0, 29, 23, 127, 1, 0.58
> > 31, 0, 30, 23, 127, 1, 0.594
> > 32, 0, 31, 23, 127, 1, 0.665
> > 2048, 0, 32, 23, 127, 2, 0.825
> > 2048, 1, 32, 23, 127, 2, 0.818
> > 2048, 0, 64, 23, 127, 2, 0.829
> > 2048, 2, 64, 23, 127, 2, 0.828
> > 2048, 0, 128, 23, 127, 2, 0.823
> > 2048, 3, 128, 23, 127, 2, 0.825
> > 2048, 0, 256, 23, 127, 2, 0.819
> > 2048, 4, 256, 23, 127, 2, 0.828
> > 2048, 0, 512, 23, 127, 2, 0.824
> > 2048, 5, 512, 23, 127, 2, 0.827
> > 2048, 0, 1024, 23, 127, 2, 0.813
> > 2048, 6, 1024, 23, 127, 2, 0.834
> > 2048, 0, 2048, 23, 127, 2, 0.927
> > 2048, 7, 2048, 23, 127, 2, 0.923
> > 2048, 0, 4096, 23, 127, 2, 0.818
> > 2048, 8, 4096, 23, 127, 2, 0.82
> > 256, 1, 64, 23, 127, 2, 0.693
> > 256, 15, 64, 23, 127, 2, 0.686
> > 256, 2, 64, 23, 127, 2, 0.69
> > 256, 30, 64, 23, 127, 2, 0.611
> > 256, 3, 64, 23, 127, 2, 0.692
> > 256, 45, 64, 23, 127, 2, 0.685
> > 256, 4, 64, 23, 127, 2, 0.688
> > 256, 60, 64, 23, 127, 2, 0.6
> > 256, 5, 64, 23, 127, 2, 0.69
> > 256, 75, 64, 23, 127, 2, 0.689
> > 256, 6, 64, 23, 127, 2, 0.688
> > 256, 90, 64, 23, 127, 2, 0.611
> > 256, 7, 64, 23, 127, 2, 0.69
> > 256, 105, 64, 23, 127, 2, 0.686
> > 1, 0, 0, 23, 127, 2, 0.982
> > 2, 0, 1, 23, 127, 2, 0.987
> > 3, 0, 2, 23, 127, 2, 0.978
> > 4, 0, 3, 23, 127, 2, 0.977
> > 5, 0, 4, 23, 127, 2, 0.979
> > 6, 0, 5, 23, 127, 2, 0.985
> > 7, 0, 6, 23, 127, 2, 0.975
> > 8, 0, 7, 23, 127, 2, 0.981
> > 9, 0, 8, 23, 127, 2, 0.984
> > 10, 0, 9, 23, 127, 2, 0.983
> > 11, 0, 10, 23, 127, 2, 0.982
> > 12, 0, 11, 23, 127, 2, 0.976
> > 13, 0, 12, 23, 127, 2, 0.985
> > 14, 0, 13, 23, 127, 2, 0.984
> > 15, 0, 14, 23, 127, 2, 0.98
> > 16, 0, 15, 23, 127, 2, 0.583
> > 17, 0, 16, 23, 127, 2, 0.552
> > 18, 0, 17, 23, 127, 2, 0.564
> > 19, 0, 18, 23, 127, 2, 0.585
> > 20, 0, 19, 23, 127, 2, 0.578
> > 21, 0, 20, 23, 127, 2, 0.578
> > 22, 0, 21, 23, 127, 2, 0.571
> > 23, 0, 22, 23, 127, 2, 0.587
> > 24, 0, 23, 23, 127, 2, 0.589
> > 25, 0, 24, 23, 127, 2, 0.593
> > 26, 0, 25, 23, 127, 2, 0.589
> > 27, 0, 26, 23, 127, 2, 0.588
> > 28, 0, 27, 23, 127, 2, 0.593
> > 29, 0, 28, 23, 127, 2, 0.579
> > 30, 0, 29, 23, 127, 2, 0.572
> > 31, 0, 30, 23, 127, 2, 0.582
> > 32, 0, 31, 23, 127, 2, 0.659
> > 2048, 0, 32, 23, 127, 4, 0.822
> > 2048, 1, 32, 23, 127, 4, 0.818
> > 2048, 0, 64, 23, 127, 4, 0.826
> > 2048, 2, 64, 23, 127, 4, 0.824
> > 2048, 0, 128, 23, 127, 4, 0.833
> > 2048, 3, 128, 23, 127, 4, 0.831
> > 2048, 0, 256, 23, 127, 4, 0.826
> > 2048, 4, 256, 23, 127, 4, 0.831
> > 2048, 0, 512, 23, 127, 4, 0.834
> > 2048, 5, 512, 23, 127, 4, 0.83
> > 2048, 0, 1024, 23, 127, 4, 0.836
> > 2048, 6, 1024, 23, 127, 4, 0.844
> > 2048, 0, 2048, 23, 127, 4, 0.696
> > 2048, 7, 2048, 23, 127, 4, 0.704
> > 2048, 0, 4096, 23, 127, 4, 0.936
> > 2048, 8, 4096, 23, 127, 4, 0.925
> > 256, 1, 64, 23, 127, 4, 0.694
> > 256, 15, 64, 23, 127, 4, 0.69
> > 256, 2, 64, 23, 127, 4, 0.687
> > 256, 30, 64, 23, 127, 4, 0.612
> > 256, 3, 64, 23, 127, 4, 0.685
> > 256, 45, 64, 23, 127, 4, 0.685
> > 256, 4, 64, 23, 127, 4, 0.684
> > 256, 60, 64, 23, 127, 4, 0.606
> > 256, 5, 64, 23, 127, 4, 0.69
> > 256, 75, 64, 23, 127, 4, 0.688
> > 256, 6, 64, 23, 127, 4, 0.69
> > 256, 90, 64, 23, 127, 4, 0.615
> > 256, 7, 64, 23, 127, 4, 0.691
> > 256, 105, 64, 23, 127, 4, 0.688
> > 1, 0, 0, 23, 127, 4, 0.982
> > 2, 0, 1, 23, 127, 4, 0.983
> > 3, 0, 2, 23, 127, 4, 0.981
> > 4, 0, 3, 23, 127, 4, 0.984
> > 5, 0, 4, 23, 127, 4, 0.963
> > 6, 0, 5, 23, 127, 4, 0.978
> > 7, 0, 6, 23, 127, 4, 0.985
> > 8, 0, 7, 23, 127, 4, 0.986
> > 9, 0, 8, 23, 127, 4, 0.978
> > 10, 0, 9, 23, 127, 4, 0.985
> > 11, 0, 10, 23, 127, 4, 0.986
> > 12, 0, 11, 23, 127, 4, 0.983
> > 13, 0, 12, 23, 127, 4, 0.986
> > 14, 0, 13, 23, 127, 4, 0.98
> > 15, 0, 14, 23, 127, 4, 0.979
> > 16, 0, 15, 23, 127, 4, 0.582
> > 17, 0, 16, 23, 127, 4, 0.542
> > 18, 0, 17, 23, 127, 4, 0.564
> > 19, 0, 18, 23, 127, 4, 0.571
> > 20, 0, 19, 23, 127, 4, 0.582
> > 21, 0, 20, 23, 127, 4, 0.573
> > 22, 0, 21, 23, 127, 4, 0.575
> > 23, 0, 22, 23, 127, 4, 0.578
> > 24, 0, 23, 23, 127, 4, 0.58
> > 25, 0, 24, 23, 127, 4, 0.592
> > 26, 0, 25, 23, 127, 4, 0.588
> > 27, 0, 26, 23, 127, 4, 0.574
> > 28, 0, 27, 23, 127, 4, 0.589
> > 29, 0, 28, 23, 127, 4, 0.56
> > 30, 0, 29, 23, 127, 4, 0.587
> > 31, 0, 30, 23, 127, 4, 0.584
> > 32, 0, 31, 23, 127, 4, 0.664
> > 2048, 0, 32, 23, 127, 8, 0.826
> > 2048, 1, 32, 23, 127, 8, 0.821
> > 2048, 0, 64, 23, 127, 8, 0.828
> > 2048, 2, 64, 23, 127, 8, 0.827
> > 2048, 0, 128, 23, 127, 8, 0.833
> > 2048, 3, 128, 23, 127, 8, 0.83
> > 2048, 0, 256, 23, 127, 8, 0.855
> > 2048, 4, 256, 23, 127, 8, 0.849
> > 2048, 0, 512, 23, 127, 8, 0.849
> > 2048, 5, 512, 23, 127, 8, 0.851
> > 2048, 0, 1024, 23, 127, 8, 0.856
> > 2048, 6, 1024, 23, 127, 8, 0.862
> > 2048, 0, 2048, 23, 127, 8, 0.709
> > 2048, 7, 2048, 23, 127, 8, 0.712
> > 2048, 0, 4096, 23, 127, 8, 0.702
> > 2048, 8, 4096, 23, 127, 8, 0.701
> > 256, 1, 64, 23, 127, 8, 0.689
> > 256, 15, 64, 23, 127, 8, 0.688
> > 256, 2, 64, 23, 127, 8, 0.691
> > 256, 30, 64, 23, 127, 8, 0.612
> > 256, 3, 64, 23, 127, 8, 0.688
> > 256, 45, 64, 23, 127, 8, 0.686
> > 256, 4, 64, 23, 127, 8, 0.694
> > 256, 60, 64, 23, 127, 8, 0.609
> > 256, 5, 64, 23, 127, 8, 0.69
> > 256, 75, 64, 23, 127, 8, 0.69
> > 256, 6, 64, 23, 127, 8, 0.691
> > 256, 90, 64, 23, 127, 8, 0.612
> > 256, 7, 64, 23, 127, 8, 0.689
> > 256, 105, 64, 23, 127, 8, 0.688
> > 1, 0, 0, 23, 127, 8, 0.98
> > 2, 0, 1, 23, 127, 8, 0.978
> > 3, 0, 2, 23, 127, 8, 0.98
> > 4, 0, 3, 23, 127, 8, 0.978
> > 5, 0, 4, 23, 127, 8, 0.977
> > 6, 0, 5, 23, 127, 8, 0.984
> > 7, 0, 6, 23, 127, 8, 0.982
> > 8, 0, 7, 23, 127, 8, 0.983
> > 9, 0, 8, 23, 127, 8, 0.987
> > 10, 0, 9, 23, 127, 8, 0.979
> > 11, 0, 10, 23, 127, 8, 0.985
> > 12, 0, 11, 23, 127, 8, 0.981
> > 13, 0, 12, 23, 127, 8, 0.98
> > 14, 0, 13, 23, 127, 8, 0.982
> > 15, 0, 14, 23, 127, 8, 0.981
> > 16, 0, 15, 23, 127, 8, 0.579
> > 17, 0, 16, 23, 127, 8, 0.531
> > 18, 0, 17, 23, 127, 8, 0.577
> > 19, 0, 18, 23, 127, 8, 0.588
> > 20, 0, 19, 23, 127, 8, 0.571
> > 21, 0, 20, 23, 127, 8, 0.576
> > 22, 0, 21, 23, 127, 8, 0.59
> > 23, 0, 22, 23, 127, 8, 0.574
> > 24, 0, 23, 23, 127, 8, 0.583
> > 25, 0, 24, 23, 127, 8, 0.581
> > 26, 0, 25, 23, 127, 8, 0.592
> > 27, 0, 26, 23, 127, 8, 0.586
> > 28, 0, 27, 23, 127, 8, 0.588
> > 29, 0, 28, 23, 127, 8, 0.578
> > 30, 0, 29, 23, 127, 8, 0.573
> > 31, 0, 30, 23, 127, 8, 0.588
> > 32, 0, 31, 23, 127, 8, 0.664
> > 2048, 0, 32, 23, 127, 16, 0.825
> > 2048, 1, 32, 23, 127, 16, 0.823
> > 2048, 0, 64, 23, 127, 16, 0.831
> > 2048, 2, 64, 23, 127, 16, 0.822
> > 2048, 0, 128, 23, 127, 16, 0.831
> > 2048, 3, 128, 23, 127, 16, 0.831
> > 2048, 0, 256, 23, 127, 16, 0.849
> > 2048, 4, 256, 23, 127, 16, 0.85
> > 2048, 0, 512, 23, 127, 16, 0.751
> > 2048, 5, 512, 23, 127, 16, 0.75
> > 2048, 0, 1024, 23, 127, 16, 0.913
> > 2048, 6, 1024, 23, 127, 16, 0.895
> > 2048, 0, 2048, 23, 127, 16, 0.736
> > 2048, 7, 2048, 23, 127, 16, 0.741
> > 2048, 0, 4096, 23, 127, 16, 0.712
> > 2048, 8, 4096, 23, 127, 16, 0.711
> > 256, 1, 64, 23, 127, 16, 0.758
> > 256, 15, 64, 23, 127, 16, 0.692
> > 256, 2, 64, 23, 127, 16, 0.692
> > 256, 30, 64, 23, 127, 16, 0.613
> > 256, 3, 64, 23, 127, 16, 0.69
> > 256, 45, 64, 23, 127, 16, 0.687
> > 256, 4, 64, 23, 127, 16, 0.69
> > 256, 60, 64, 23, 127, 16, 0.604
> > 256, 5, 64, 23, 127, 16, 0.687
> > 256, 75, 64, 23, 127, 16, 0.687
> > 256, 6, 64, 23, 127, 16, 0.69
> > 256, 90, 64, 23, 127, 16, 0.61
> > 256, 7, 64, 23, 127, 16, 0.69
> > 256, 105, 64, 23, 127, 16, 0.685
> > 1, 0, 0, 23, 127, 16, 0.981
> > 2, 0, 1, 23, 127, 16, 0.985
> > 3, 0, 2, 23, 127, 16, 0.985
> > 4, 0, 3, 23, 127, 16, 0.981
> > 5, 0, 4, 23, 127, 16, 0.979
> > 6, 0, 5, 23, 127, 16, 0.986
> > 7, 0, 6, 23, 127, 16, 0.986
> > 8, 0, 7, 23, 127, 16, 0.982
> > 9, 0, 8, 23, 127, 16, 0.982
> > 10, 0, 9, 23, 127, 16, 0.98
> > 11, 0, 10, 23, 127, 16, 0.983
> > 12, 0, 11, 23, 127, 16, 0.982
> > 13, 0, 12, 23, 127, 16, 0.982
> > 14, 0, 13, 23, 127, 16, 0.982
> > 15, 0, 14, 23, 127, 16, 0.982
> > 16, 0, 15, 23, 127, 16, 0.582
> > 17, 0, 16, 23, 127, 16, 0.542
> > 18, 0, 17, 23, 127, 16, 0.554
> > 19, 0, 18, 23, 127, 16, 0.562
> > 20, 0, 19, 23, 127, 16, 0.587
> > 21, 0, 20, 23, 127, 16, 0.584
> > 22, 0, 21, 23, 127, 16, 0.587
> > 23, 0, 22, 23, 127, 16, 0.594
> > 24, 0, 23, 23, 127, 16, 0.581
> > 25, 0, 24, 23, 127, 16, 0.577
> > 26, 0, 25, 23, 127, 16, 0.588
> > 27, 0, 26, 23, 127, 16, 0.589
> > 28, 0, 27, 23, 127, 16, 0.596
> > 29, 0, 28, 23, 127, 16, 0.591
> > 30, 0, 29, 23, 127, 16, 0.585
> > 31, 0, 30, 23, 127, 16, 0.59
> > 32, 0, 31, 23, 127, 16, 0.669
> >
> > sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
> > sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
> > sysdeps/x86_64/strrchr.S | 505 +++++++++++++++---------
> > sysdeps/x86_64/wcsrchr.S | 268 +------------
> > 4 files changed, 334 insertions(+), 444 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > index db1b44c23c..866396e947 100644
> > --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > @@ -17,7 +17,7 @@
> > <https://www.gnu.org/licenses/>. */
> >
> > #if IS_IN (libc)
> > -# define strrchr __strrchr_sse2
> > +# define STRRCHR __strrchr_sse2
> >
> > # undef weak_alias
> > # define weak_alias(strrchr, rindex)
> > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > index 78d1ca6553..69d2f3cdb1 100644
> > --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > @@ -17,7 +17,6 @@
> > <https://www.gnu.org/licenses/>. */
> >
> > #if IS_IN (libc)
> > -# define wcsrchr __wcsrchr_sse2
> > +# define STRRCHR __wcsrchr_sse2
> > #endif
> > -
> > #include "../wcsrchr.S"
> > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> > index 50d886713e..94449ad806 100644
> > --- a/sysdeps/x86_64/strrchr.S
> > +++ b/sysdeps/x86_64/strrchr.S
> > @@ -19,210 +19,355 @@
> >
> > #include <sysdep.h>
> >
> > +#ifndef STRRCHR
> > +# define STRRCHR strrchr
> > +#endif
> > +
> > +#ifdef USE_AS_WCSRCHR
> > +# define PCMPEQ pcmpeqd
> > +# define CHAR_SIZE 4
> > +# define PMINU pminud
> > +#else
> > +# define PCMPEQ pcmpeqb
> > +# define CHAR_SIZE 1
> > +# define PMINU pminub
> > +#endif
> > +
> > +#define PAGE_SIZE 4096
> > +#define VEC_SIZE 16
> > +
> > .text
> > -ENTRY (strrchr)
> > - movd %esi, %xmm1
> > +ENTRY(STRRCHR)
> > + movd %esi, %xmm0
> > movq %rdi, %rax
> > - andl $4095, %eax
> > - punpcklbw %xmm1, %xmm1
> > - cmpq $4032, %rax
> > - punpcklwd %xmm1, %xmm1
> > - pshufd $0, %xmm1, %xmm1
> > + andl $(PAGE_SIZE - 1), %eax
> > +#ifndef USE_AS_WCSRCHR
> > + punpcklbw %xmm0, %xmm0
> > + punpcklwd %xmm0, %xmm0
> > +#endif
> > + pshufd $0, %xmm0, %xmm0
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > ja L(cross_page)
> > - movdqu (%rdi), %xmm0
> > +
> > +L(cross_page_continue):
> > + movups (%rdi), %xmm1
> > pxor %xmm2, %xmm2
> > - movdqa %xmm0, %xmm3
> > - pcmpeqb %xmm1, %xmm0
> > - pcmpeqb %xmm2, %xmm3
> > - pmovmskb %xmm0, %ecx
> > - pmovmskb %xmm3, %edx
> > - testq %rdx, %rdx
> > - je L(next_48_bytes)
> > - leaq -1(%rdx), %rax
> > - xorq %rdx, %rax
> > - andq %rcx, %rax
> > - je L(exit)
> > - bsrq %rax, %rax
> > + PCMPEQ %xmm1, %xmm2
> > + pmovmskb %xmm2, %ecx
> > + testl %ecx, %ecx
> > + jz L(aligned_more)
> > +
> > + PCMPEQ %xmm0, %xmm1
> > + pmovmskb %xmm1, %eax
> > + leal -1(%rcx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(ret0)
> > + bsrl %eax, %eax
> > addq %rdi, %rax
> > + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > + search CHAR is zero we are correct. Either way `andq
> > + -CHAR_SIZE, %rax` gets the correct result. */
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > +L(ret0):
> > ret
> >
> > + /* Returns for first vec x1/x2 have hard coded backward search
> > + path for earlier matches. */
> > .p2align 4
> > -L(next_48_bytes):
> > - movdqu 16(%rdi), %xmm4
> > - movdqa %xmm4, %xmm5
> > - movdqu 32(%rdi), %xmm3
> > - pcmpeqb %xmm1, %xmm4
> > - pcmpeqb %xmm2, %xmm5
> > - movdqu 48(%rdi), %xmm0
> > - pmovmskb %xmm5, %edx
> > - movdqa %xmm3, %xmm5
> > - pcmpeqb %xmm1, %xmm3
> > - pcmpeqb %xmm2, %xmm5
> > - pcmpeqb %xmm0, %xmm2
> > - salq $16, %rdx
> > - pmovmskb %xmm3, %r8d
> > - pmovmskb %xmm5, %eax
> > - pmovmskb %xmm2, %esi
> > - salq $32, %r8
> > - salq $32, %rax
> > - pcmpeqb %xmm1, %xmm0
> > - orq %rdx, %rax
> > - movq %rsi, %rdx
> > - pmovmskb %xmm4, %esi
> > - salq $48, %rdx
> > - salq $16, %rsi
> > - orq %r8, %rsi
> > - orq %rcx, %rsi
> > - pmovmskb %xmm0, %ecx
> > - salq $48, %rcx
> > - orq %rcx, %rsi
> > - orq %rdx, %rax
> > - je L(loop_header2)
> > - leaq -1(%rax), %rcx
> > - xorq %rax, %rcx
> > - andq %rcx, %rsi
> > - je L(exit)
> > - bsrq %rsi, %rsi
> > - leaq (%rdi,%rsi), %rax
> > +L(first_vec_x0_test):
> > + PCMPEQ %xmm0, %xmm1
> > + pmovmskb %xmm1, %eax
> > + testl %eax, %eax
> > + jz L(ret0)
> > + bsrl %eax, %eax
> > + addq %r8, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > ret
> >
> > .p2align 4
> > -L(loop_header2):
> > - testq %rsi, %rsi
> > - movq %rdi, %rcx
> > - je L(no_c_found)
> > -L(loop_header):
> > - addq $64, %rdi
> > - pxor %xmm7, %xmm7
> > - andq $-64, %rdi
> > - jmp L(loop_entry)
> > +L(first_vec_x1):
> > + PCMPEQ %xmm0, %xmm2
> > + pmovmskb %xmm2, %eax
> > + leal -1(%rcx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(first_vec_x0_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> >
> > .p2align 4
> > -L(loop64):
> > - testq %rdx, %rdx
> > - cmovne %rdx, %rsi
> > - cmovne %rdi, %rcx
> > - addq $64, %rdi
> > -L(loop_entry):
> > - movdqa 32(%rdi), %xmm3
> > - pxor %xmm6, %xmm6
> > - movdqa 48(%rdi), %xmm2
> > - movdqa %xmm3, %xmm0
> > - movdqa 16(%rdi), %xmm4
> > - pminub %xmm2, %xmm0
> > - movdqa (%rdi), %xmm5
> > - pminub %xmm4, %xmm0
> > - pminub %xmm5, %xmm0
> > - pcmpeqb %xmm7, %xmm0
> > - pmovmskb %xmm0, %eax
> > - movdqa %xmm5, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - pmovmskb %xmm0, %r9d
> > - movdqa %xmm4, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - pmovmskb %xmm0, %edx
> > - movdqa %xmm3, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - salq $16, %rdx
> > - pmovmskb %xmm0, %r10d
> > - movdqa %xmm2, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - salq $32, %r10
> > - orq %r10, %rdx
> > - pmovmskb %xmm0, %r8d
> > - orq %r9, %rdx
> > - salq $48, %r8
> > - orq %r8, %rdx
> > +L(first_vec_x1_test):
> > + PCMPEQ %xmm0, %xmm2
> > + pmovmskb %xmm2, %eax
> > testl %eax, %eax
> > - je L(loop64)
> > - pcmpeqb %xmm6, %xmm4
> > - pcmpeqb %xmm6, %xmm3
> > - pcmpeqb %xmm6, %xmm5
> > - pmovmskb %xmm4, %eax
> > - pmovmskb %xmm3, %r10d
> > - pcmpeqb %xmm6, %xmm2
> > - pmovmskb %xmm5, %r9d
> > - salq $32, %r10
> > - salq $16, %rax
> > - pmovmskb %xmm2, %r8d
> > - orq %r10, %rax
> > - orq %r9, %rax
> > - salq $48, %r8
> > - orq %r8, %rax
> > - leaq -1(%rax), %r8
> > - xorq %rax, %r8
> > - andq %r8, %rdx
> > - cmovne %rdi, %rcx
> > - cmovne %rdx, %rsi
> > - bsrq %rsi, %rsi
> > - leaq (%rcx,%rsi), %rax
> > + jz L(first_vec_x0_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4
> > +L(first_vec_x2):
> > + PCMPEQ %xmm0, %xmm3
> > + pmovmskb %xmm3, %eax
> > + leal -1(%rcx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(first_vec_x1_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4
> > +L(aligned_more):
> > + /* Save original pointer if match was in VEC 0. */
> > + movq %rdi, %r8
> > + andq $-VEC_SIZE, %rdi
> > +
> > + movaps VEC_SIZE(%rdi), %xmm2
> > + pxor %xmm3, %xmm3
> > + PCMPEQ %xmm2, %xmm3
> > + pmovmskb %xmm3, %ecx
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x1)
> > +
> > + movaps (VEC_SIZE * 2)(%rdi), %xmm3
> > + pxor %xmm4, %xmm4
> > + PCMPEQ %xmm3, %xmm4
> > + pmovmskb %xmm4, %ecx
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x2)
> > +
> > + addq $VEC_SIZE, %rdi
> > + /* Save pointer again before realigning. */
> > + movq %rdi, %rsi
> > + andq $-(VEC_SIZE * 2), %rdi
> > + .p2align 4
> > +L(first_loop):
> > + /* Do 2x VEC at a time. */
> > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > + /* If SSE2 no pminud. */
> > +#ifdef NO_PMINU
>
> Do we really need SSE4.1 wcsrchr? I think we should focus on AVX2 and
> above.
It seems like essentially free performance that can make a difference in the
loop cases (see the SSE4.1 commit for numbers).
IMO there is little harm, but if you feel strongly I'll drop it. (In V2 I will
change the .text section for SSE4_1.)
What do you think?
>
> > + movaps %xmm5, %xmm6
> > + pxor %xmm8, %xmm8
> > +
> > + PCMPEQ %xmm8, %xmm5
> > + PCMPEQ %xmm4, %xmm8
> > + por %xmm5, %xmm8
> > +#else
> > + movaps %xmm5, %xmm6
> > + PMINU %xmm4, %xmm5
> > +#endif
> > +
> > + movaps %xmm4, %xmm9
> > + PCMPEQ %xmm0, %xmm4
> > + PCMPEQ %xmm0, %xmm6
> > + movaps %xmm6, %xmm7
> > + por %xmm4, %xmm6
> > +#ifndef NO_PMINU
> > + pxor %xmm8, %xmm8
> > + PCMPEQ %xmm5, %xmm8
> > +#endif
> > + pmovmskb %xmm8, %ecx
> > + pmovmskb %xmm6, %eax
> > +
> > + addq $(VEC_SIZE * 2), %rdi
> > + /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> > + macro-fuse with `jz`. */
> > + addl %ecx, %eax
> > + jz L(first_loop)
> > +
> > + /* Check if there is zero match. */
> > + testl %ecx, %ecx
> > + jz L(second_loop_match)
> > +
> > + /* Check if there was a match in last iteration. */
> > + subl %ecx, %eax
> > + jnz L(new_match)
> > +
> > +L(first_loop_old_match):
> > + PCMPEQ %xmm0, %xmm2
> > + PCMPEQ %xmm0, %xmm3
> > + pmovmskb %xmm2, %ecx
> > + pmovmskb %xmm3, %eax
> > + addl %eax, %ecx
> > + jz L(first_vec_x0_test)
> > + /* NB: We could move this shift to before the branch and save a
> > + bit of code size / performance on the fall through. The
> > + branch leads to the null case which generally seems hotter
> > + than char in first 3x VEC. */
> > + sall $16, %eax
> > + orl %ecx, %eax
> > +
> > + bsrl %eax, %eax
> > + addq %rsi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4
> > +L(new_match):
> > + pxor %xmm6, %xmm6
> > + PCMPEQ %xmm9, %xmm6
> > + pmovmskb %xmm6, %eax
> > + sall $16, %ecx
> > + orl %eax, %ecx
> > +
> > + /* We can't reuse either of the old comparisons since we mask
> > + off zeros after the first zero (instead of using the full
> > + comparison) and thus can't guarantee no interference between a
> > + match after the end of the string and a valid match. */
> > + pmovmskb %xmm4, %eax
> > + pmovmskb %xmm7, %edx
> > + sall $16, %edx
> > + orl %edx, %eax
> > +
> > + leal -1(%ecx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(first_loop_old_match)
> > + bsrl %eax, %eax
> > + addq %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > ret
> >
> > + /* Save minimum state for getting most recent match. We can
> > + throw out all previous work. */
> > .p2align 4
> > -L(no_c_found):
> > - movl $1, %esi
> > - xorl %ecx, %ecx
> > - jmp L(loop_header)
> > +L(second_loop_match):
> > + movq %rdi, %rsi
> > + movaps %xmm4, %xmm2
> > + movaps %xmm7, %xmm3
> >
> > .p2align 4
> > -L(exit):
> > - xorl %eax, %eax
> > +L(second_loop):
> > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > +#ifdef NO_PMINU
> > + movaps %xmm5, %xmm6
> > + pxor %xmm8, %xmm8
> > +
> > + PCMPEQ %xmm8, %xmm5
> > + PCMPEQ %xmm4, %xmm8
> > + por %xmm5, %xmm8
> > +#else
> > + movaps %xmm5, %xmm6
> > + PMINU %xmm4, %xmm5
> > +#endif
> > +
> > + movaps %xmm4, %xmm9
> > + PCMPEQ %xmm0, %xmm4
> > + PCMPEQ %xmm0, %xmm6
> > + movaps %xmm6, %xmm7
> > + por %xmm4, %xmm6
> > +#ifndef NO_PMINU
> > + pxor %xmm8, %xmm8
> > + PCMPEQ %xmm5, %xmm8
> > +#endif
> > +
> > + pmovmskb %xmm8, %ecx
> > + pmovmskb %xmm6, %eax
> > +
> > + addq $(VEC_SIZE * 2), %rdi
> > + /* Either null term or new occurrence of CHAR. */
> > + addl %ecx, %eax
> > + jz L(second_loop)
> > +
> > + /* No null term, so it must be a new occurrence of CHAR. */
> > + testl %ecx, %ecx
> > + jz L(second_loop_match)
> > +
> > +
> > + subl %ecx, %eax
> > + jnz L(second_loop_new_match)
> > +
> > +L(second_loop_old_match):
> > + pmovmskb %xmm2, %ecx
> > + pmovmskb %xmm3, %eax
> > + sall $16, %eax
> > + orl %ecx, %eax
> > + bsrl %eax, %eax
> > + addq %rsi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > ret
> >
> > .p2align 4
> > +L(second_loop_new_match):
> > + pxor %xmm6, %xmm6
> > + PCMPEQ %xmm9, %xmm6
> > + pmovmskb %xmm6, %eax
> > + sall $16, %ecx
> > + orl %eax, %ecx
> > +
> > + /* We can't reuse either of the old comparisons since we mask
> > + off zeros after the first zero (instead of using the full
> > + comparison) and thus can't guarantee no interference between a
> > + match after the end of the string and a valid match. */
> > + pmovmskb %xmm4, %eax
> > + pmovmskb %xmm7, %edx
> > + sall $16, %edx
> > + orl %edx, %eax
> > +
> > + leal -1(%ecx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(second_loop_old_match)
> > + bsrl %eax, %eax
> > + addq %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4,, 4
> > L(cross_page):
> > - movq %rdi, %rax
> > - pxor %xmm0, %xmm0
> > - andq $-64, %rax
> > - movdqu (%rax), %xmm5
> > - movdqa %xmm5, %xmm6
> > - movdqu 16(%rax), %xmm4
> > - pcmpeqb %xmm1, %xmm5
> > - pcmpeqb %xmm0, %xmm6
> > - movdqu 32(%rax), %xmm3
> > - pmovmskb %xmm6, %esi
> > - movdqa %xmm4, %xmm6
> > - movdqu 48(%rax), %xmm2
> > - pcmpeqb %xmm1, %xmm4
> > - pcmpeqb %xmm0, %xmm6
> > - pmovmskb %xmm6, %edx
> > - movdqa %xmm3, %xmm6
> > - pcmpeqb %xmm1, %xmm3
> > - pcmpeqb %xmm0, %xmm6
> > - pcmpeqb %xmm2, %xmm0
> > - salq $16, %rdx
> > - pmovmskb %xmm3, %r9d
> > - pmovmskb %xmm6, %r8d
> > - pmovmskb %xmm0, %ecx
> > - salq $32, %r9
> > - salq $32, %r8
> > - pcmpeqb %xmm1, %xmm2
> > - orq %r8, %rdx
> > - salq $48, %rcx
> > - pmovmskb %xmm5, %r8d
> > - orq %rsi, %rdx
> > - pmovmskb %xmm4, %esi
> > - orq %rcx, %rdx
> > - pmovmskb %xmm2, %ecx
> > - salq $16, %rsi
> > - salq $48, %rcx
> > - orq %r9, %rsi
> > - orq %r8, %rsi
> > - orq %rcx, %rsi
> > + movq %rdi, %rsi
> > + andq $-VEC_SIZE, %rsi
> > + movaps (%rsi), %xmm1
> > + pxor %xmm2, %xmm2
> > + PCMPEQ %xmm1, %xmm2
> > + pmovmskb %xmm2, %edx
> > movl %edi, %ecx
> > - subl %eax, %ecx
> > - shrq %cl, %rdx
> > - shrq %cl, %rsi
> > - testq %rdx, %rdx
> > - je L(loop_header2)
> > - leaq -1(%rdx), %rax
> > - xorq %rdx, %rax
> > - andq %rax, %rsi
> > - je L(exit)
> > - bsrq %rsi, %rax
> > + andl $(VEC_SIZE - 1), %ecx
> > + sarl %cl, %edx
> > + jz L(cross_page_continue)
> > + PCMPEQ %xmm0, %xmm1
> > + pmovmskb %xmm1, %eax
> > + sarl %cl, %eax
> > + leal -1(%rdx), %ecx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(ret1)
> > + bsrl %eax, %eax
> > addq %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > +L(ret1):
> > ret
> > -END (strrchr)
> > +END(STRRCHR)
> >
> > -weak_alias (strrchr, rindex)
> > -libc_hidden_builtin_def (strrchr)
> > +#ifndef USE_AS_WCSRCHR
> > + weak_alias (STRRCHR, rindex)
> > + libc_hidden_builtin_def (STRRCHR)
> > +#endif
> > diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> > index 61552954de..2b80efc5ef 100644
> > --- a/sysdeps/x86_64/wcsrchr.S
> > +++ b/sysdeps/x86_64/wcsrchr.S
> > @@ -1,4 +1,4 @@
> > -/* wcsrchr with SSSE3
> > +/* wcsrchr optimized with SSE2.
> > Copyright (C) 2011-2022 Free Software Foundation, Inc.
> > This file is part of the GNU C Library.
> >
> > @@ -16,266 +16,12 @@
> > License along with the GNU C Library; if not, see
> > <https://www.gnu.org/licenses/>. */
> >
> > -#include <sysdep.h>
> >
> > - .text
> > -ENTRY (wcsrchr)
> > +#define USE_AS_WCSRCHR 1
> > +#define NO_PMINU 1
> >
> > - movd %rsi, %xmm1
> > - mov %rdi, %rcx
> > - punpckldq %xmm1, %xmm1
> > - pxor %xmm2, %xmm2
> > - punpckldq %xmm1, %xmm1
> > - and $63, %rcx
> > - cmp $48, %rcx
> > - ja L(crosscache)
> > +#ifndef STRRCHR
> > +# define STRRCHR wcsrchr
> > +#endif
> >
> > - movdqu (%rdi), %xmm0
> > - pcmpeqd %xmm0, %xmm2
> > - pcmpeqd %xmm1, %xmm0
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm0, %rax
> > - add $16, %rdi
> > -
> > - test %rax, %rax
> > - jnz L(unaligned_match1)
> > -
> > - test %rcx, %rcx
> > - jnz L(return_null)
> > -
> > - and $-16, %rdi
> > - xor %r8, %r8
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(unaligned_match1):
> > - test %rcx, %rcx
> > - jnz L(prolog_find_zero_1)
> > -
> > - mov %rax, %r8
> > - mov %rdi, %rsi
> > - and $-16, %rdi
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(crosscache):
> > - and $15, %rcx
> > - and $-16, %rdi
> > - pxor %xmm3, %xmm3
> > - movdqa (%rdi), %xmm0
> > - pcmpeqd %xmm0, %xmm3
> > - pcmpeqd %xmm1, %xmm0
> > - pmovmskb %xmm3, %rdx
> > - pmovmskb %xmm0, %rax
> > - shr %cl, %rdx
> > - shr %cl, %rax
> > - add $16, %rdi
> > -
> > - test %rax, %rax
> > - jnz L(unaligned_match)
> > -
> > - test %rdx, %rdx
> > - jnz L(return_null)
> > -
> > - xor %r8, %r8
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(unaligned_match):
> > - test %rdx, %rdx
> > - jnz L(prolog_find_zero)
> > -
> > - mov %rax, %r8
> > - lea (%rdi, %rcx), %rsi
> > -
> > -/* Loop start on aligned string. */
> > - .p2align 4
> > -L(loop):
> > - movdqa (%rdi), %xmm0
> > - pcmpeqd %xmm0, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm0
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm0, %rax
> > - or %rax, %rcx
> > - jnz L(matches)
> > -
> > - movdqa (%rdi), %xmm3
> > - pcmpeqd %xmm3, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm3
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm3, %rax
> > - or %rax, %rcx
> > - jnz L(matches)
> > -
> > - movdqa (%rdi), %xmm4
> > - pcmpeqd %xmm4, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm4
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm4, %rax
> > - or %rax, %rcx
> > - jnz L(matches)
> > -
> > - movdqa (%rdi), %xmm5
> > - pcmpeqd %xmm5, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm5
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm5, %rax
> > - or %rax, %rcx
> > - jz L(loop)
> > -
> > - .p2align 4
> > -L(matches):
> > - test %rax, %rax
> > - jnz L(match)
> > -L(return_value):
> > - test %r8, %r8
> > - jz L(return_null)
> > - mov %r8, %rax
> > - mov %rsi, %rdi
> > -
> > - test $15 << 4, %ah
> > - jnz L(match_fourth_wchar)
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match):
> > - pmovmskb %xmm2, %rcx
> > - test %rcx, %rcx
> > - jnz L(find_zero)
> > - mov %rax, %r8
> > - mov %rdi, %rsi
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(find_zero):
> > - test $15, %cl
> > - jnz L(find_zero_in_first_wchar)
> > - test %cl, %cl
> > - jnz L(find_zero_in_second_wchar)
> > - test $15, %ch
> > - jnz L(find_zero_in_third_wchar)
> > -
> > - and $1 << 13 - 1, %rax
> > - jz L(return_value)
> > -
> > - test $15 << 4, %ah
> > - jnz L(match_fourth_wchar)
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(find_zero_in_first_wchar):
> > - test $1, %rax
> > - jz L(return_value)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(find_zero_in_second_wchar):
> > - and $1 << 5 - 1, %rax
> > - jz L(return_value)
> > -
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(find_zero_in_third_wchar):
> > - and $1 << 9 - 1, %rax
> > - jz L(return_value)
> > -
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero):
> > - add %rcx, %rdi
> > - mov %rdx, %rcx
> > -L(prolog_find_zero_1):
> > - test $15, %cl
> > - jnz L(prolog_find_zero_in_first_wchar)
> > - test %cl, %cl
> > - jnz L(prolog_find_zero_in_second_wchar)
> > - test $15, %ch
> > - jnz L(prolog_find_zero_in_third_wchar)
> > -
> > - and $1 << 13 - 1, %rax
> > - jz L(return_null)
> > -
> > - test $15 << 4, %ah
> > - jnz L(match_fourth_wchar)
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero_in_first_wchar):
> > - test $1, %rax
> > - jz L(return_null)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero_in_second_wchar):
> > - and $1 << 5 - 1, %rax
> > - jz L(return_null)
> > -
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero_in_third_wchar):
> > - and $1 << 9 - 1, %rax
> > - jz L(return_null)
> > -
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match_second_wchar):
> > - lea -12(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match_third_wchar):
> > - lea -8(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match_fourth_wchar):
> > - lea -4(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(return_null):
> > - xor %rax, %rax
> > - ret
> > -
> > -END (wcsrchr)
> > +#include "../strrchr.S"
> > --
> > 2.25.1
> >
>
>
> --
> H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2
2022-04-21 20:57 ` Noah Goldstein
@ 2022-04-21 21:48 ` H.J. Lu
2022-04-21 22:23 ` Noah Goldstein
0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-21 21:48 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 21, 2022 at 1:57 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 3:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > The new code unrolls the main loop slightly without adding too much
> > > overhead and minimizes the comparisons for the search CHAR.
> > >
> > > Geometric Mean of all benchmarks New / Old: 0.741
> > > See email for all results.
> > >
> > > Full xcheck passes on x86_64 with and without multiarch enabled.
> > > ---
> > > Results For: strrchr
> > >
> > > Geometric Mean of N=30 runs.
> > >
> > > Geometric Mean of all benchmarks New / Old: 0.741
> > > Benchmarks performance on Tigerlake:
> > > https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> > >
> > > len, align, pos, seek, max_char, freq, New Time / Old Time
> > > 2048, 0, 32, 0, 127, 1, 0.647
> > > 2048, 1, 32, 0, 127, 1, 0.621
> > > 2048, 0, 64, 0, 127, 1, 0.661
> > > 2048, 2, 64, 0, 127, 1, 0.655
> > > 2048, 0, 128, 0, 127, 1, 0.69
> > > 2048, 3, 128, 0, 127, 1, 0.689
> > > 2048, 0, 256, 0, 127, 1, 0.718
> > > 2048, 4, 256, 0, 127, 1, 0.718
> > > 2048, 0, 512, 0, 127, 1, 0.758
> > > 2048, 5, 512, 0, 127, 1, 0.754
> > > 2048, 0, 1024, 0, 127, 1, 1.029
> > > 2048, 6, 1024, 0, 127, 1, 1.032
> > > 2048, 0, 2048, 0, 127, 1, 0.826
> > > 2048, 7, 2048, 0, 127, 1, 0.834
> > > 2048, 0, 4096, 0, 127, 1, 0.825
> > > 2048, 8, 4096, 0, 127, 1, 0.83
> > > 256, 1, 64, 0, 127, 1, 0.657
> > > 256, 15, 64, 0, 127, 1, 0.657
> > > 256, 2, 64, 0, 127, 1, 0.657
> > > 256, 30, 64, 0, 127, 1, 0.523
> > > 256, 3, 64, 0, 127, 1, 0.657
> > > 256, 45, 64, 0, 127, 1, 0.654
> > > 256, 4, 64, 0, 127, 1, 0.657
> > > 256, 60, 64, 0, 127, 1, 0.526
> > > 256, 5, 64, 0, 127, 1, 0.658
> > > 256, 75, 64, 0, 127, 1, 0.658
> > > 256, 6, 64, 0, 127, 1, 0.655
> > > 256, 90, 64, 0, 127, 1, 0.523
> > > 256, 7, 64, 0, 127, 1, 0.655
> > > 256, 105, 64, 0, 127, 1, 0.654
> > > 1, 0, 0, 0, 127, 1, 0.98
> > > 2, 0, 1, 0, 127, 1, 0.978
> > > 3, 0, 2, 0, 127, 1, 0.975
> > > 4, 0, 3, 0, 127, 1, 0.976
> > > 5, 0, 4, 0, 127, 1, 0.977
> > > 6, 0, 5, 0, 127, 1, 0.981
> > > 7, 0, 6, 0, 127, 1, 0.982
> > > 8, 0, 7, 0, 127, 1, 0.98
> > > 9, 0, 8, 0, 127, 1, 0.978
> > > 10, 0, 9, 0, 127, 1, 0.981
> > > 11, 0, 10, 0, 127, 1, 0.984
> > > 12, 0, 11, 0, 127, 1, 0.982
> > > 13, 0, 12, 0, 127, 1, 0.98
> > > 14, 0, 13, 0, 127, 1, 0.978
> > > 15, 0, 14, 0, 127, 1, 0.979
> > > 16, 0, 15, 0, 127, 1, 0.986
> > > 17, 0, 16, 0, 127, 1, 0.529
> > > 18, 0, 17, 0, 127, 1, 0.566
> > > 19, 0, 18, 0, 127, 1, 0.575
> > > 20, 0, 19, 0, 127, 1, 0.573
> > > 21, 0, 20, 0, 127, 1, 0.579
> > > 22, 0, 21, 0, 127, 1, 0.595
> > > 23, 0, 22, 0, 127, 1, 0.585
> > > 24, 0, 23, 0, 127, 1, 0.586
> > > 25, 0, 24, 0, 127, 1, 0.587
> > > 26, 0, 25, 0, 127, 1, 0.592
> > > 27, 0, 26, 0, 127, 1, 0.595
> > > 28, 0, 27, 0, 127, 1, 0.592
> > > 29, 0, 28, 0, 127, 1, 0.6
> > > 30, 0, 29, 0, 127, 1, 0.598
> > > 31, 0, 30, 0, 127, 1, 0.595
> > > 32, 0, 31, 0, 127, 1, 0.592
> > > 2048, 0, 32, 23, 127, 1, 0.827
> > > 2048, 1, 32, 23, 127, 1, 0.826
> > > 2048, 0, 64, 23, 127, 1, 0.824
> > > 2048, 2, 64, 23, 127, 1, 0.825
> > > 2048, 0, 128, 23, 127, 1, 0.829
> > > 2048, 3, 128, 23, 127, 1, 0.824
> > > 2048, 0, 256, 23, 127, 1, 0.832
> > > 2048, 4, 256, 23, 127, 1, 0.825
> > > 2048, 0, 512, 23, 127, 1, 0.831
> > > 2048, 5, 512, 23, 127, 1, 0.837
> > > 2048, 0, 1024, 23, 127, 1, 0.721
> > > 2048, 6, 1024, 23, 127, 1, 0.757
> > > 2048, 0, 2048, 23, 127, 1, 0.825
> > > 2048, 7, 2048, 23, 127, 1, 0.824
> > > 2048, 0, 4096, 23, 127, 1, 0.828
> > > 2048, 8, 4096, 23, 127, 1, 0.823
> > > 256, 1, 64, 23, 127, 1, 0.665
> > > 256, 15, 64, 23, 127, 1, 0.661
> > > 256, 2, 64, 23, 127, 1, 0.674
> > > 256, 30, 64, 23, 127, 1, 0.605
> > > 256, 3, 64, 23, 127, 1, 0.668
> > > 256, 45, 64, 23, 127, 1, 0.661
> > > 256, 4, 64, 23, 127, 1, 0.657
> > > 256, 60, 64, 23, 127, 1, 0.594
> > > 256, 5, 64, 23, 127, 1, 0.654
> > > 256, 75, 64, 23, 127, 1, 0.673
> > > 256, 6, 64, 23, 127, 1, 0.688
> > > 256, 90, 64, 23, 127, 1, 0.6
> > > 256, 7, 64, 23, 127, 1, 0.66
> > > 256, 105, 64, 23, 127, 1, 0.654
> > > 1, 0, 0, 23, 127, 1, 0.981
> > > 2, 0, 1, 23, 127, 1, 0.976
> > > 3, 0, 2, 23, 127, 1, 0.983
> > > 4, 0, 3, 23, 127, 1, 0.984
> > > 5, 0, 4, 23, 127, 1, 0.973
> > > 6, 0, 5, 23, 127, 1, 0.987
> > > 7, 0, 6, 23, 127, 1, 0.977
> > > 8, 0, 7, 23, 127, 1, 0.979
> > > 9, 0, 8, 23, 127, 1, 0.981
> > > 10, 0, 9, 23, 127, 1, 0.98
> > > 11, 0, 10, 23, 127, 1, 0.983
> > > 12, 0, 11, 23, 127, 1, 0.98
> > > 13, 0, 12, 23, 127, 1, 0.98
> > > 14, 0, 13, 23, 127, 1, 0.977
> > > 15, 0, 14, 23, 127, 1, 0.982
> > > 16, 0, 15, 23, 127, 1, 0.581
> > > 17, 0, 16, 23, 127, 1, 0.551
> > > 18, 0, 17, 23, 127, 1, 0.555
> > > 19, 0, 18, 23, 127, 1, 0.586
> > > 20, 0, 19, 23, 127, 1, 0.585
> > > 21, 0, 20, 23, 127, 1, 0.582
> > > 22, 0, 21, 23, 127, 1, 0.571
> > > 23, 0, 22, 23, 127, 1, 0.576
> > > 24, 0, 23, 23, 127, 1, 0.581
> > > 25, 0, 24, 23, 127, 1, 0.589
> > > 26, 0, 25, 23, 127, 1, 0.593
> > > 27, 0, 26, 23, 127, 1, 0.595
> > > 28, 0, 27, 23, 127, 1, 0.583
> > > 29, 0, 28, 23, 127, 1, 0.595
> > > 30, 0, 29, 23, 127, 1, 0.58
> > > 31, 0, 30, 23, 127, 1, 0.594
> > > 32, 0, 31, 23, 127, 1, 0.665
> > > 2048, 0, 32, 23, 127, 2, 0.825
> > > 2048, 1, 32, 23, 127, 2, 0.818
> > > 2048, 0, 64, 23, 127, 2, 0.829
> > > 2048, 2, 64, 23, 127, 2, 0.828
> > > 2048, 0, 128, 23, 127, 2, 0.823
> > > 2048, 3, 128, 23, 127, 2, 0.825
> > > 2048, 0, 256, 23, 127, 2, 0.819
> > > 2048, 4, 256, 23, 127, 2, 0.828
> > > 2048, 0, 512, 23, 127, 2, 0.824
> > > 2048, 5, 512, 23, 127, 2, 0.827
> > > 2048, 0, 1024, 23, 127, 2, 0.813
> > > 2048, 6, 1024, 23, 127, 2, 0.834
> > > 2048, 0, 2048, 23, 127, 2, 0.927
> > > 2048, 7, 2048, 23, 127, 2, 0.923
> > > 2048, 0, 4096, 23, 127, 2, 0.818
> > > 2048, 8, 4096, 23, 127, 2, 0.82
> > > 256, 1, 64, 23, 127, 2, 0.693
> > > 256, 15, 64, 23, 127, 2, 0.686
> > > 256, 2, 64, 23, 127, 2, 0.69
> > > 256, 30, 64, 23, 127, 2, 0.611
> > > 256, 3, 64, 23, 127, 2, 0.692
> > > 256, 45, 64, 23, 127, 2, 0.685
> > > 256, 4, 64, 23, 127, 2, 0.688
> > > 256, 60, 64, 23, 127, 2, 0.6
> > > 256, 5, 64, 23, 127, 2, 0.69
> > > 256, 75, 64, 23, 127, 2, 0.689
> > > 256, 6, 64, 23, 127, 2, 0.688
> > > 256, 90, 64, 23, 127, 2, 0.611
> > > 256, 7, 64, 23, 127, 2, 0.69
> > > 256, 105, 64, 23, 127, 2, 0.686
> > > 1, 0, 0, 23, 127, 2, 0.982
> > > 2, 0, 1, 23, 127, 2, 0.987
> > > 3, 0, 2, 23, 127, 2, 0.978
> > > 4, 0, 3, 23, 127, 2, 0.977
> > > 5, 0, 4, 23, 127, 2, 0.979
> > > 6, 0, 5, 23, 127, 2, 0.985
> > > 7, 0, 6, 23, 127, 2, 0.975
> > > 8, 0, 7, 23, 127, 2, 0.981
> > > 9, 0, 8, 23, 127, 2, 0.984
> > > 10, 0, 9, 23, 127, 2, 0.983
> > > 11, 0, 10, 23, 127, 2, 0.982
> > > 12, 0, 11, 23, 127, 2, 0.976
> > > 13, 0, 12, 23, 127, 2, 0.985
> > > 14, 0, 13, 23, 127, 2, 0.984
> > > 15, 0, 14, 23, 127, 2, 0.98
> > > 16, 0, 15, 23, 127, 2, 0.583
> > > 17, 0, 16, 23, 127, 2, 0.552
> > > 18, 0, 17, 23, 127, 2, 0.564
> > > 19, 0, 18, 23, 127, 2, 0.585
> > > 20, 0, 19, 23, 127, 2, 0.578
> > > 21, 0, 20, 23, 127, 2, 0.578
> > > 22, 0, 21, 23, 127, 2, 0.571
> > > 23, 0, 22, 23, 127, 2, 0.587
> > > 24, 0, 23, 23, 127, 2, 0.589
> > > 25, 0, 24, 23, 127, 2, 0.593
> > > 26, 0, 25, 23, 127, 2, 0.589
> > > 27, 0, 26, 23, 127, 2, 0.588
> > > 28, 0, 27, 23, 127, 2, 0.593
> > > 29, 0, 28, 23, 127, 2, 0.579
> > > 30, 0, 29, 23, 127, 2, 0.572
> > > 31, 0, 30, 23, 127, 2, 0.582
> > > 32, 0, 31, 23, 127, 2, 0.659
> > > 2048, 0, 32, 23, 127, 4, 0.822
> > > 2048, 1, 32, 23, 127, 4, 0.818
> > > 2048, 0, 64, 23, 127, 4, 0.826
> > > 2048, 2, 64, 23, 127, 4, 0.824
> > > 2048, 0, 128, 23, 127, 4, 0.833
> > > 2048, 3, 128, 23, 127, 4, 0.831
> > > 2048, 0, 256, 23, 127, 4, 0.826
> > > 2048, 4, 256, 23, 127, 4, 0.831
> > > 2048, 0, 512, 23, 127, 4, 0.834
> > > 2048, 5, 512, 23, 127, 4, 0.83
> > > 2048, 0, 1024, 23, 127, 4, 0.836
> > > 2048, 6, 1024, 23, 127, 4, 0.844
> > > 2048, 0, 2048, 23, 127, 4, 0.696
> > > 2048, 7, 2048, 23, 127, 4, 0.704
> > > 2048, 0, 4096, 23, 127, 4, 0.936
> > > 2048, 8, 4096, 23, 127, 4, 0.925
> > > 256, 1, 64, 23, 127, 4, 0.694
> > > 256, 15, 64, 23, 127, 4, 0.69
> > > 256, 2, 64, 23, 127, 4, 0.687
> > > 256, 30, 64, 23, 127, 4, 0.612
> > > 256, 3, 64, 23, 127, 4, 0.685
> > > 256, 45, 64, 23, 127, 4, 0.685
> > > 256, 4, 64, 23, 127, 4, 0.684
> > > 256, 60, 64, 23, 127, 4, 0.606
> > > 256, 5, 64, 23, 127, 4, 0.69
> > > 256, 75, 64, 23, 127, 4, 0.688
> > > 256, 6, 64, 23, 127, 4, 0.69
> > > 256, 90, 64, 23, 127, 4, 0.615
> > > 256, 7, 64, 23, 127, 4, 0.691
> > > 256, 105, 64, 23, 127, 4, 0.688
> > > 1, 0, 0, 23, 127, 4, 0.982
> > > 2, 0, 1, 23, 127, 4, 0.983
> > > 3, 0, 2, 23, 127, 4, 0.981
> > > 4, 0, 3, 23, 127, 4, 0.984
> > > 5, 0, 4, 23, 127, 4, 0.963
> > > 6, 0, 5, 23, 127, 4, 0.978
> > > 7, 0, 6, 23, 127, 4, 0.985
> > > 8, 0, 7, 23, 127, 4, 0.986
> > > 9, 0, 8, 23, 127, 4, 0.978
> > > 10, 0, 9, 23, 127, 4, 0.985
> > > 11, 0, 10, 23, 127, 4, 0.986
> > > 12, 0, 11, 23, 127, 4, 0.983
> > > 13, 0, 12, 23, 127, 4, 0.986
> > > 14, 0, 13, 23, 127, 4, 0.98
> > > 15, 0, 14, 23, 127, 4, 0.979
> > > 16, 0, 15, 23, 127, 4, 0.582
> > > 17, 0, 16, 23, 127, 4, 0.542
> > > 18, 0, 17, 23, 127, 4, 0.564
> > > 19, 0, 18, 23, 127, 4, 0.571
> > > 20, 0, 19, 23, 127, 4, 0.582
> > > 21, 0, 20, 23, 127, 4, 0.573
> > > 22, 0, 21, 23, 127, 4, 0.575
> > > 23, 0, 22, 23, 127, 4, 0.578
> > > 24, 0, 23, 23, 127, 4, 0.58
> > > 25, 0, 24, 23, 127, 4, 0.592
> > > 26, 0, 25, 23, 127, 4, 0.588
> > > 27, 0, 26, 23, 127, 4, 0.574
> > > 28, 0, 27, 23, 127, 4, 0.589
> > > 29, 0, 28, 23, 127, 4, 0.56
> > > 30, 0, 29, 23, 127, 4, 0.587
> > > 31, 0, 30, 23, 127, 4, 0.584
> > > 32, 0, 31, 23, 127, 4, 0.664
> > > 2048, 0, 32, 23, 127, 8, 0.826
> > > 2048, 1, 32, 23, 127, 8, 0.821
> > > 2048, 0, 64, 23, 127, 8, 0.828
> > > 2048, 2, 64, 23, 127, 8, 0.827
> > > 2048, 0, 128, 23, 127, 8, 0.833
> > > 2048, 3, 128, 23, 127, 8, 0.83
> > > 2048, 0, 256, 23, 127, 8, 0.855
> > > 2048, 4, 256, 23, 127, 8, 0.849
> > > 2048, 0, 512, 23, 127, 8, 0.849
> > > 2048, 5, 512, 23, 127, 8, 0.851
> > > 2048, 0, 1024, 23, 127, 8, 0.856
> > > 2048, 6, 1024, 23, 127, 8, 0.862
> > > 2048, 0, 2048, 23, 127, 8, 0.709
> > > 2048, 7, 2048, 23, 127, 8, 0.712
> > > 2048, 0, 4096, 23, 127, 8, 0.702
> > > 2048, 8, 4096, 23, 127, 8, 0.701
> > > 256, 1, 64, 23, 127, 8, 0.689
> > > 256, 15, 64, 23, 127, 8, 0.688
> > > 256, 2, 64, 23, 127, 8, 0.691
> > > 256, 30, 64, 23, 127, 8, 0.612
> > > 256, 3, 64, 23, 127, 8, 0.688
> > > 256, 45, 64, 23, 127, 8, 0.686
> > > 256, 4, 64, 23, 127, 8, 0.694
> > > 256, 60, 64, 23, 127, 8, 0.609
> > > 256, 5, 64, 23, 127, 8, 0.69
> > > 256, 75, 64, 23, 127, 8, 0.69
> > > 256, 6, 64, 23, 127, 8, 0.691
> > > 256, 90, 64, 23, 127, 8, 0.612
> > > 256, 7, 64, 23, 127, 8, 0.689
> > > 256, 105, 64, 23, 127, 8, 0.688
> > > 1, 0, 0, 23, 127, 8, 0.98
> > > 2, 0, 1, 23, 127, 8, 0.978
> > > 3, 0, 2, 23, 127, 8, 0.98
> > > 4, 0, 3, 23, 127, 8, 0.978
> > > 5, 0, 4, 23, 127, 8, 0.977
> > > 6, 0, 5, 23, 127, 8, 0.984
> > > 7, 0, 6, 23, 127, 8, 0.982
> > > 8, 0, 7, 23, 127, 8, 0.983
> > > 9, 0, 8, 23, 127, 8, 0.987
> > > 10, 0, 9, 23, 127, 8, 0.979
> > > 11, 0, 10, 23, 127, 8, 0.985
> > > 12, 0, 11, 23, 127, 8, 0.981
> > > 13, 0, 12, 23, 127, 8, 0.98
> > > 14, 0, 13, 23, 127, 8, 0.982
> > > 15, 0, 14, 23, 127, 8, 0.981
> > > 16, 0, 15, 23, 127, 8, 0.579
> > > 17, 0, 16, 23, 127, 8, 0.531
> > > 18, 0, 17, 23, 127, 8, 0.577
> > > 19, 0, 18, 23, 127, 8, 0.588
> > > 20, 0, 19, 23, 127, 8, 0.571
> > > 21, 0, 20, 23, 127, 8, 0.576
> > > 22, 0, 21, 23, 127, 8, 0.59
> > > 23, 0, 22, 23, 127, 8, 0.574
> > > 24, 0, 23, 23, 127, 8, 0.583
> > > 25, 0, 24, 23, 127, 8, 0.581
> > > 26, 0, 25, 23, 127, 8, 0.592
> > > 27, 0, 26, 23, 127, 8, 0.586
> > > 28, 0, 27, 23, 127, 8, 0.588
> > > 29, 0, 28, 23, 127, 8, 0.578
> > > 30, 0, 29, 23, 127, 8, 0.573
> > > 31, 0, 30, 23, 127, 8, 0.588
> > > 32, 0, 31, 23, 127, 8, 0.664
> > > 2048, 0, 32, 23, 127, 16, 0.825
> > > 2048, 1, 32, 23, 127, 16, 0.823
> > > 2048, 0, 64, 23, 127, 16, 0.831
> > > 2048, 2, 64, 23, 127, 16, 0.822
> > > 2048, 0, 128, 23, 127, 16, 0.831
> > > 2048, 3, 128, 23, 127, 16, 0.831
> > > 2048, 0, 256, 23, 127, 16, 0.849
> > > 2048, 4, 256, 23, 127, 16, 0.85
> > > 2048, 0, 512, 23, 127, 16, 0.751
> > > 2048, 5, 512, 23, 127, 16, 0.75
> > > 2048, 0, 1024, 23, 127, 16, 0.913
> > > 2048, 6, 1024, 23, 127, 16, 0.895
> > > 2048, 0, 2048, 23, 127, 16, 0.736
> > > 2048, 7, 2048, 23, 127, 16, 0.741
> > > 2048, 0, 4096, 23, 127, 16, 0.712
> > > 2048, 8, 4096, 23, 127, 16, 0.711
> > > 256, 1, 64, 23, 127, 16, 0.758
> > > 256, 15, 64, 23, 127, 16, 0.692
> > > 256, 2, 64, 23, 127, 16, 0.692
> > > 256, 30, 64, 23, 127, 16, 0.613
> > > 256, 3, 64, 23, 127, 16, 0.69
> > > 256, 45, 64, 23, 127, 16, 0.687
> > > 256, 4, 64, 23, 127, 16, 0.69
> > > 256, 60, 64, 23, 127, 16, 0.604
> > > 256, 5, 64, 23, 127, 16, 0.687
> > > 256, 75, 64, 23, 127, 16, 0.687
> > > 256, 6, 64, 23, 127, 16, 0.69
> > > 256, 90, 64, 23, 127, 16, 0.61
> > > 256, 7, 64, 23, 127, 16, 0.69
> > > 256, 105, 64, 23, 127, 16, 0.685
> > > 1, 0, 0, 23, 127, 16, 0.981
> > > 2, 0, 1, 23, 127, 16, 0.985
> > > 3, 0, 2, 23, 127, 16, 0.985
> > > 4, 0, 3, 23, 127, 16, 0.981
> > > 5, 0, 4, 23, 127, 16, 0.979
> > > 6, 0, 5, 23, 127, 16, 0.986
> > > 7, 0, 6, 23, 127, 16, 0.986
> > > 8, 0, 7, 23, 127, 16, 0.982
> > > 9, 0, 8, 23, 127, 16, 0.982
> > > 10, 0, 9, 23, 127, 16, 0.98
> > > 11, 0, 10, 23, 127, 16, 0.983
> > > 12, 0, 11, 23, 127, 16, 0.982
> > > 13, 0, 12, 23, 127, 16, 0.982
> > > 14, 0, 13, 23, 127, 16, 0.982
> > > 15, 0, 14, 23, 127, 16, 0.982
> > > 16, 0, 15, 23, 127, 16, 0.582
> > > 17, 0, 16, 23, 127, 16, 0.542
> > > 18, 0, 17, 23, 127, 16, 0.554
> > > 19, 0, 18, 23, 127, 16, 0.562
> > > 20, 0, 19, 23, 127, 16, 0.587
> > > 21, 0, 20, 23, 127, 16, 0.584
> > > 22, 0, 21, 23, 127, 16, 0.587
> > > 23, 0, 22, 23, 127, 16, 0.594
> > > 24, 0, 23, 23, 127, 16, 0.581
> > > 25, 0, 24, 23, 127, 16, 0.577
> > > 26, 0, 25, 23, 127, 16, 0.588
> > > 27, 0, 26, 23, 127, 16, 0.589
> > > 28, 0, 27, 23, 127, 16, 0.596
> > > 29, 0, 28, 23, 127, 16, 0.591
> > > 30, 0, 29, 23, 127, 16, 0.585
> > > 31, 0, 30, 23, 127, 16, 0.59
> > > 32, 0, 31, 23, 127, 16, 0.669
> > >
> > > sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
> > > sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
> > > sysdeps/x86_64/strrchr.S | 505 +++++++++++++++---------
> > > sysdeps/x86_64/wcsrchr.S | 268 +------------
> > > 4 files changed, 334 insertions(+), 444 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > index db1b44c23c..866396e947 100644
> > > --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > @@ -17,7 +17,7 @@
> > > <https://www.gnu.org/licenses/>. */
> > >
> > > #if IS_IN (libc)
> > > -# define strrchr __strrchr_sse2
> > > +# define STRRCHR __strrchr_sse2
> > >
> > > # undef weak_alias
> > > # define weak_alias(strrchr, rindex)
> > > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > index 78d1ca6553..69d2f3cdb1 100644
> > > --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > @@ -17,7 +17,6 @@
> > > <https://www.gnu.org/licenses/>. */
> > >
> > > #if IS_IN (libc)
> > > -# define wcsrchr __wcsrchr_sse2
> > > +# define STRRCHR __wcsrchr_sse2
> > > #endif
> > > -
> > > #include "../wcsrchr.S"
> > > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> > > index 50d886713e..94449ad806 100644
> > > --- a/sysdeps/x86_64/strrchr.S
> > > +++ b/sysdeps/x86_64/strrchr.S
> > > @@ -19,210 +19,355 @@
> > >
> > > #include <sysdep.h>
> > >
> > > +#ifndef STRRCHR
> > > +# define STRRCHR strrchr
> > > +#endif
> > > +
> > > +#ifdef USE_AS_WCSRCHR
> > > +# define PCMPEQ pcmpeqd
> > > +# define CHAR_SIZE 4
> > > +# define PMINU pminud
> > > +#else
> > > +# define PCMPEQ pcmpeqb
> > > +# define CHAR_SIZE 1
> > > +# define PMINU pminub
> > > +#endif
> > > +
> > > +#define PAGE_SIZE 4096
> > > +#define VEC_SIZE 16
> > > +
> > > .text
> > > -ENTRY (strrchr)
> > > - movd %esi, %xmm1
> > > +ENTRY(STRRCHR)
> > > + movd %esi, %xmm0
> > > movq %rdi, %rax
> > > - andl $4095, %eax
> > > - punpcklbw %xmm1, %xmm1
> > > - cmpq $4032, %rax
> > > - punpcklwd %xmm1, %xmm1
> > > - pshufd $0, %xmm1, %xmm1
> > > + andl $(PAGE_SIZE - 1), %eax
> > > +#ifndef USE_AS_WCSRCHR
> > > + punpcklbw %xmm0, %xmm0
> > > + punpcklwd %xmm0, %xmm0
> > > +#endif
> > > + pshufd $0, %xmm0, %xmm0
> > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > > ja L(cross_page)
> > > - movdqu (%rdi), %xmm0
> > > +
> > > +L(cross_page_continue):
> > > + movups (%rdi), %xmm1
> > > pxor %xmm2, %xmm2
> > > - movdqa %xmm0, %xmm3
> > > - pcmpeqb %xmm1, %xmm0
> > > - pcmpeqb %xmm2, %xmm3
> > > - pmovmskb %xmm0, %ecx
> > > - pmovmskb %xmm3, %edx
> > > - testq %rdx, %rdx
> > > - je L(next_48_bytes)
> > > - leaq -1(%rdx), %rax
> > > - xorq %rdx, %rax
> > > - andq %rcx, %rax
> > > - je L(exit)
> > > - bsrq %rax, %rax
> > > + PCMPEQ %xmm1, %xmm2
> > > + pmovmskb %xmm2, %ecx
> > > + testl %ecx, %ecx
> > > + jz L(aligned_more)
> > > +
> > > + PCMPEQ %xmm0, %xmm1
> > > + pmovmskb %xmm1, %eax
> > > + leal -1(%rcx), %edx
> > > + xorl %edx, %ecx
> > > + andl %ecx, %eax
> > > + jz L(ret0)
> > > + bsrl %eax, %eax
> > > addq %rdi, %rax
> > > + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > > + search CHAR is zero we are correct. Either way `andq
> > > + -CHAR_SIZE, %rax` gets the correct result. */
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > +L(ret0):
> > > ret
> > >
> > > + /* Returns for first vec x1/x2 have hard coded backward search
> > > + path for earlier matches. */
> > > .p2align 4
> > > -L(next_48_bytes):
> > > - movdqu 16(%rdi), %xmm4
> > > - movdqa %xmm4, %xmm5
> > > - movdqu 32(%rdi), %xmm3
> > > - pcmpeqb %xmm1, %xmm4
> > > - pcmpeqb %xmm2, %xmm5
> > > - movdqu 48(%rdi), %xmm0
> > > - pmovmskb %xmm5, %edx
> > > - movdqa %xmm3, %xmm5
> > > - pcmpeqb %xmm1, %xmm3
> > > - pcmpeqb %xmm2, %xmm5
> > > - pcmpeqb %xmm0, %xmm2
> > > - salq $16, %rdx
> > > - pmovmskb %xmm3, %r8d
> > > - pmovmskb %xmm5, %eax
> > > - pmovmskb %xmm2, %esi
> > > - salq $32, %r8
> > > - salq $32, %rax
> > > - pcmpeqb %xmm1, %xmm0
> > > - orq %rdx, %rax
> > > - movq %rsi, %rdx
> > > - pmovmskb %xmm4, %esi
> > > - salq $48, %rdx
> > > - salq $16, %rsi
> > > - orq %r8, %rsi
> > > - orq %rcx, %rsi
> > > - pmovmskb %xmm0, %ecx
> > > - salq $48, %rcx
> > > - orq %rcx, %rsi
> > > - orq %rdx, %rax
> > > - je L(loop_header2)
> > > - leaq -1(%rax), %rcx
> > > - xorq %rax, %rcx
> > > - andq %rcx, %rsi
> > > - je L(exit)
> > > - bsrq %rsi, %rsi
> > > - leaq (%rdi,%rsi), %rax
> > > +L(first_vec_x0_test):
> > > + PCMPEQ %xmm0, %xmm1
> > > + pmovmskb %xmm1, %eax
> > > + testl %eax, %eax
> > > + jz L(ret0)
> > > + bsrl %eax, %eax
> > > + addq %r8, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > ret
> > >
> > > .p2align 4
> > > -L(loop_header2):
> > > - testq %rsi, %rsi
> > > - movq %rdi, %rcx
> > > - je L(no_c_found)
> > > -L(loop_header):
> > > - addq $64, %rdi
> > > - pxor %xmm7, %xmm7
> > > - andq $-64, %rdi
> > > - jmp L(loop_entry)
> > > +L(first_vec_x1):
> > > + PCMPEQ %xmm0, %xmm2
> > > + pmovmskb %xmm2, %eax
> > > + leal -1(%rcx), %edx
> > > + xorl %edx, %ecx
> > > + andl %ecx, %eax
> > > + jz L(first_vec_x0_test)
> > > + bsrl %eax, %eax
> > > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > + ret
> > >
> > > .p2align 4
> > > -L(loop64):
> > > - testq %rdx, %rdx
> > > - cmovne %rdx, %rsi
> > > - cmovne %rdi, %rcx
> > > - addq $64, %rdi
> > > -L(loop_entry):
> > > - movdqa 32(%rdi), %xmm3
> > > - pxor %xmm6, %xmm6
> > > - movdqa 48(%rdi), %xmm2
> > > - movdqa %xmm3, %xmm0
> > > - movdqa 16(%rdi), %xmm4
> > > - pminub %xmm2, %xmm0
> > > - movdqa (%rdi), %xmm5
> > > - pminub %xmm4, %xmm0
> > > - pminub %xmm5, %xmm0
> > > - pcmpeqb %xmm7, %xmm0
> > > - pmovmskb %xmm0, %eax
> > > - movdqa %xmm5, %xmm0
> > > - pcmpeqb %xmm1, %xmm0
> > > - pmovmskb %xmm0, %r9d
> > > - movdqa %xmm4, %xmm0
> > > - pcmpeqb %xmm1, %xmm0
> > > - pmovmskb %xmm0, %edx
> > > - movdqa %xmm3, %xmm0
> > > - pcmpeqb %xmm1, %xmm0
> > > - salq $16, %rdx
> > > - pmovmskb %xmm0, %r10d
> > > - movdqa %xmm2, %xmm0
> > > - pcmpeqb %xmm1, %xmm0
> > > - salq $32, %r10
> > > - orq %r10, %rdx
> > > - pmovmskb %xmm0, %r8d
> > > - orq %r9, %rdx
> > > - salq $48, %r8
> > > - orq %r8, %rdx
> > > +L(first_vec_x1_test):
> > > + PCMPEQ %xmm0, %xmm2
> > > + pmovmskb %xmm2, %eax
> > > testl %eax, %eax
> > > - je L(loop64)
> > > - pcmpeqb %xmm6, %xmm4
> > > - pcmpeqb %xmm6, %xmm3
> > > - pcmpeqb %xmm6, %xmm5
> > > - pmovmskb %xmm4, %eax
> > > - pmovmskb %xmm3, %r10d
> > > - pcmpeqb %xmm6, %xmm2
> > > - pmovmskb %xmm5, %r9d
> > > - salq $32, %r10
> > > - salq $16, %rax
> > > - pmovmskb %xmm2, %r8d
> > > - orq %r10, %rax
> > > - orq %r9, %rax
> > > - salq $48, %r8
> > > - orq %r8, %rax
> > > - leaq -1(%rax), %r8
> > > - xorq %rax, %r8
> > > - andq %r8, %rdx
> > > - cmovne %rdi, %rcx
> > > - cmovne %rdx, %rsi
> > > - bsrq %rsi, %rsi
> > > - leaq (%rcx,%rsi), %rax
> > > + jz L(first_vec_x0_test)
> > > + bsrl %eax, %eax
> > > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > + ret
> > > +
> > > + .p2align 4
> > > +L(first_vec_x2):
> > > + PCMPEQ %xmm0, %xmm3
> > > + pmovmskb %xmm3, %eax
> > > + leal -1(%rcx), %edx
> > > + xorl %edx, %ecx
> > > + andl %ecx, %eax
> > > + jz L(first_vec_x1_test)
> > > + bsrl %eax, %eax
> > > + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > + ret
> > > +
> > > + .p2align 4
> > > +L(aligned_more):
> > > + /* Save original pointer if match was in VEC 0. */
> > > + movq %rdi, %r8
> > > + andq $-VEC_SIZE, %rdi
> > > +
> > > + movaps VEC_SIZE(%rdi), %xmm2
> > > + pxor %xmm3, %xmm3
> > > + PCMPEQ %xmm2, %xmm3
> > > + pmovmskb %xmm3, %ecx
> > > + testl %ecx, %ecx
> > > + jnz L(first_vec_x1)
> > > +
> > > + movaps (VEC_SIZE * 2)(%rdi), %xmm3
> > > + pxor %xmm4, %xmm4
> > > + PCMPEQ %xmm3, %xmm4
> > > + pmovmskb %xmm4, %ecx
> > > + testl %ecx, %ecx
> > > + jnz L(first_vec_x2)
> > > +
> > > + addq $VEC_SIZE, %rdi
> > > + /* Save pointer again before realigning. */
> > > + movq %rdi, %rsi
> > > + andq $-(VEC_SIZE * 2), %rdi
> > > + .p2align 4
> > > +L(first_loop):
> > > + /* Do 2x VEC at a time. */
> > > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > > + /* If SSE2 no pminud. */
> > > +#ifdef NO_PMINU
> >
> > Do we really need SSE4.1 wcsrchr? I think we should focus on AVX2 and
> > above.
>
> It seems like freebie performance that can make a difference in the loop
> cases. (see the SSE4.1 commit for numbers).
But these numbers are on Tiger Lake. I think we should continue to
improve SSE2
version and optimize AVX2/AVX512. I don't think we should increase code sizes
for SSE4.
> Imo there is little harm but if you feel strongly I'll drop. (In V2 will
> change the .text section for SSE4_1).
>
> What do you think?
> >
> > > + movaps %xmm5, %xmm6
> > > + pxor %xmm8, %xmm8
> > > +
> > > + PCMPEQ %xmm8, %xmm5
> > > + PCMPEQ %xmm4, %xmm8
> > > + por %xmm5, %xmm8
> > > +#else
> > > + movaps %xmm5, %xmm6
> > > + PMINU %xmm4, %xmm5
> > > +#endif
> > > +
> > > + movaps %xmm4, %xmm9
> > > + PCMPEQ %xmm0, %xmm4
> > > + PCMPEQ %xmm0, %xmm6
> > > + movaps %xmm6, %xmm7
> > > + por %xmm4, %xmm6
> > > +#ifndef NO_PMINU
> > > + pxor %xmm8, %xmm8
> > > + PCMPEQ %xmm5, %xmm8
> > > +#endif
> > > + pmovmskb %xmm8, %ecx
> > > + pmovmskb %xmm6, %eax
> > > +
> > > + addq $(VEC_SIZE * 2), %rdi
> > > + /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> > > + macro-fuse with `jz`. */
> > > + addl %ecx, %eax
> > > + jz L(first_loop)
> > > +
> > > + /* Check if there is zero match. */
> > > + testl %ecx, %ecx
> > > + jz L(second_loop_match)
> > > +
> > > + /* Check if there was a match in last iteration. */
> > > + subl %ecx, %eax
> > > + jnz L(new_match)
> > > +
> > > +L(first_loop_old_match):
> > > + PCMPEQ %xmm0, %xmm2
> > > + PCMPEQ %xmm0, %xmm3
> > > + pmovmskb %xmm2, %ecx
> > > + pmovmskb %xmm3, %eax
> > > + addl %eax, %ecx
> > > + jz L(first_vec_x0_test)
> > > + /* NB: We could move this shift to before the branch and save a
> > > + bit of code size / performance on the fall through. The
> > > + branch leads to the null case which generally seems hotter
> > > + than char in first 3x VEC. */
> > > + sall $16, %eax
> > > + orl %ecx, %eax
> > > +
> > > + bsrl %eax, %eax
> > > + addq %rsi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > + ret
> > > +
> > > + .p2align 4
> > > +L(new_match):
> > > + pxor %xmm6, %xmm6
> > > + PCMPEQ %xmm9, %xmm6
> > > + pmovmskb %xmm6, %eax
> > > + sall $16, %ecx
> > > + orl %eax, %ecx
> > > +
> > > + /* We can't reuse either of the old comparisons since we mask
> > > + off zeros after the first zero (instead of using the full
> > > + comparison) so we can't guarantee no interference between a
> > > + match after the end of the string and a valid match. */
> > > + pmovmskb %xmm4, %eax
> > > + pmovmskb %xmm7, %edx
> > > + sall $16, %edx
> > > + orl %edx, %eax
> > > +
> > > + leal -1(%ecx), %edx
> > > + xorl %edx, %ecx
> > > + andl %ecx, %eax
> > > + jz L(first_loop_old_match)
> > > + bsrl %eax, %eax
> > > + addq %rdi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > ret
> > >
> > > + /* Save minimum state for getting most recent match. We can
> > > + throw out all previous work. */
> > > .p2align 4
> > > -L(no_c_found):
> > > - movl $1, %esi
> > > - xorl %ecx, %ecx
> > > - jmp L(loop_header)
> > > +L(second_loop_match):
> > > + movq %rdi, %rsi
> > > + movaps %xmm4, %xmm2
> > > + movaps %xmm7, %xmm3
> > >
> > > .p2align 4
> > > -L(exit):
> > > - xorl %eax, %eax
> > > +L(second_loop):
> > > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > > +#ifdef NO_PMINU
> > > + movaps %xmm5, %xmm6
> > > + pxor %xmm8, %xmm8
> > > +
> > > + PCMPEQ %xmm8, %xmm5
> > > + PCMPEQ %xmm4, %xmm8
> > > + por %xmm5, %xmm8
> > > +#else
> > > + movaps %xmm5, %xmm6
> > > + PMINU %xmm4, %xmm5
> > > +#endif
> > > +
> > > + movaps %xmm4, %xmm9
> > > + PCMPEQ %xmm0, %xmm4
> > > + PCMPEQ %xmm0, %xmm6
> > > + movaps %xmm6, %xmm7
> > > + por %xmm4, %xmm6
> > > +#ifndef NO_PMINU
> > > + pxor %xmm8, %xmm8
> > > + PCMPEQ %xmm5, %xmm8
> > > +#endif
> > > +
> > > + pmovmskb %xmm8, %ecx
> > > + pmovmskb %xmm6, %eax
> > > +
> > > + addq $(VEC_SIZE * 2), %rdi
> > > + /* Either null term or new occurrence of CHAR. */
> > > + addl %ecx, %eax
> > > + jz L(second_loop)
> > > +
> > > + /* No null term so it must be a new occurrence of CHAR. */
> > > + testl %ecx, %ecx
> > > + jz L(second_loop_match)
> > > +
> > > +
> > > + subl %ecx, %eax
> > > + jnz L(second_loop_new_match)
> > > +
> > > +L(second_loop_old_match):
> > > + pmovmskb %xmm2, %ecx
> > > + pmovmskb %xmm3, %eax
> > > + sall $16, %eax
> > > + orl %ecx, %eax
> > > + bsrl %eax, %eax
> > > + addq %rsi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > ret
> > >
> > > .p2align 4
> > > +L(second_loop_new_match):
> > > + pxor %xmm6, %xmm6
> > > + PCMPEQ %xmm9, %xmm6
> > > + pmovmskb %xmm6, %eax
> > > + sall $16, %ecx
> > > + orl %eax, %ecx
> > > +
> > > + /* We can't reuse either of the old comparisons since we mask
> > > + off zeros after the first zero (instead of using the full
> > > + comparison) so we can't guarantee no interference between a
> > > + match after the end of the string and a valid match. */
> > > + pmovmskb %xmm4, %eax
> > > + pmovmskb %xmm7, %edx
> > > + sall $16, %edx
> > > + orl %edx, %eax
> > > +
> > > + leal -1(%ecx), %edx
> > > + xorl %edx, %ecx
> > > + andl %ecx, %eax
> > > + jz L(second_loop_old_match)
> > > + bsrl %eax, %eax
> > > + addq %rdi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > + ret
> > > +
> > > + .p2align 4,, 4
> > > L(cross_page):
> > > - movq %rdi, %rax
> > > - pxor %xmm0, %xmm0
> > > - andq $-64, %rax
> > > - movdqu (%rax), %xmm5
> > > - movdqa %xmm5, %xmm6
> > > - movdqu 16(%rax), %xmm4
> > > - pcmpeqb %xmm1, %xmm5
> > > - pcmpeqb %xmm0, %xmm6
> > > - movdqu 32(%rax), %xmm3
> > > - pmovmskb %xmm6, %esi
> > > - movdqa %xmm4, %xmm6
> > > - movdqu 48(%rax), %xmm2
> > > - pcmpeqb %xmm1, %xmm4
> > > - pcmpeqb %xmm0, %xmm6
> > > - pmovmskb %xmm6, %edx
> > > - movdqa %xmm3, %xmm6
> > > - pcmpeqb %xmm1, %xmm3
> > > - pcmpeqb %xmm0, %xmm6
> > > - pcmpeqb %xmm2, %xmm0
> > > - salq $16, %rdx
> > > - pmovmskb %xmm3, %r9d
> > > - pmovmskb %xmm6, %r8d
> > > - pmovmskb %xmm0, %ecx
> > > - salq $32, %r9
> > > - salq $32, %r8
> > > - pcmpeqb %xmm1, %xmm2
> > > - orq %r8, %rdx
> > > - salq $48, %rcx
> > > - pmovmskb %xmm5, %r8d
> > > - orq %rsi, %rdx
> > > - pmovmskb %xmm4, %esi
> > > - orq %rcx, %rdx
> > > - pmovmskb %xmm2, %ecx
> > > - salq $16, %rsi
> > > - salq $48, %rcx
> > > - orq %r9, %rsi
> > > - orq %r8, %rsi
> > > - orq %rcx, %rsi
> > > + movq %rdi, %rsi
> > > + andq $-VEC_SIZE, %rsi
> > > + movaps (%rsi), %xmm1
> > > + pxor %xmm2, %xmm2
> > > + PCMPEQ %xmm1, %xmm2
> > > + pmovmskb %xmm2, %edx
> > > movl %edi, %ecx
> > > - subl %eax, %ecx
> > > - shrq %cl, %rdx
> > > - shrq %cl, %rsi
> > > - testq %rdx, %rdx
> > > - je L(loop_header2)
> > > - leaq -1(%rdx), %rax
> > > - xorq %rdx, %rax
> > > - andq %rax, %rsi
> > > - je L(exit)
> > > - bsrq %rsi, %rax
> > > + andl $(VEC_SIZE - 1), %ecx
> > > + sarl %cl, %edx
> > > + jz L(cross_page_continue)
> > > + PCMPEQ %xmm0, %xmm1
> > > + pmovmskb %xmm1, %eax
> > > + sarl %cl, %eax
> > > + leal -1(%rdx), %ecx
> > > + xorl %edx, %ecx
> > > + andl %ecx, %eax
> > > + jz L(ret1)
> > > + bsrl %eax, %eax
> > > addq %rdi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > +L(ret1):
> > > ret
> > > -END (strrchr)
> > > +END(STRRCHR)
> > >
> > > -weak_alias (strrchr, rindex)
> > > -libc_hidden_builtin_def (strrchr)
> > > +#ifndef USE_AS_WCSRCHR
> > > + weak_alias (STRRCHR, rindex)
> > > + libc_hidden_builtin_def (STRRCHR)
> > > +#endif
> > > diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> > > index 61552954de..2b80efc5ef 100644
> > > --- a/sysdeps/x86_64/wcsrchr.S
> > > +++ b/sysdeps/x86_64/wcsrchr.S
> > > @@ -1,4 +1,4 @@
> > > -/* wcsrchr with SSSE3
> > > +/* wcsrchr optimized with SSE2.
> > > Copyright (C) 2011-2022 Free Software Foundation, Inc.
> > > This file is part of the GNU C Library.
> > >
> > > @@ -16,266 +16,12 @@
> > > License along with the GNU C Library; if not, see
> > > <https://www.gnu.org/licenses/>. */
> > >
> > > -#include <sysdep.h>
> > >
> > > - .text
> > > -ENTRY (wcsrchr)
> > > +#define USE_AS_WCSRCHR 1
> > > +#define NO_PMINU 1
> > >
> > > - movd %rsi, %xmm1
> > > - mov %rdi, %rcx
> > > - punpckldq %xmm1, %xmm1
> > > - pxor %xmm2, %xmm2
> > > - punpckldq %xmm1, %xmm1
> > > - and $63, %rcx
> > > - cmp $48, %rcx
> > > - ja L(crosscache)
> > > +#ifndef STRRCHR
> > > +# define STRRCHR wcsrchr
> > > +#endif
> > >
> > > - movdqu (%rdi), %xmm0
> > > - pcmpeqd %xmm0, %xmm2
> > > - pcmpeqd %xmm1, %xmm0
> > > - pmovmskb %xmm2, %rcx
> > > - pmovmskb %xmm0, %rax
> > > - add $16, %rdi
> > > -
> > > - test %rax, %rax
> > > - jnz L(unaligned_match1)
> > > -
> > > - test %rcx, %rcx
> > > - jnz L(return_null)
> > > -
> > > - and $-16, %rdi
> > > - xor %r8, %r8
> > > - jmp L(loop)
> > > -
> > > - .p2align 4
> > > -L(unaligned_match1):
> > > - test %rcx, %rcx
> > > - jnz L(prolog_find_zero_1)
> > > -
> > > - mov %rax, %r8
> > > - mov %rdi, %rsi
> > > - and $-16, %rdi
> > > - jmp L(loop)
> > > -
> > > - .p2align 4
> > > -L(crosscache):
> > > - and $15, %rcx
> > > - and $-16, %rdi
> > > - pxor %xmm3, %xmm3
> > > - movdqa (%rdi), %xmm0
> > > - pcmpeqd %xmm0, %xmm3
> > > - pcmpeqd %xmm1, %xmm0
> > > - pmovmskb %xmm3, %rdx
> > > - pmovmskb %xmm0, %rax
> > > - shr %cl, %rdx
> > > - shr %cl, %rax
> > > - add $16, %rdi
> > > -
> > > - test %rax, %rax
> > > - jnz L(unaligned_match)
> > > -
> > > - test %rdx, %rdx
> > > - jnz L(return_null)
> > > -
> > > - xor %r8, %r8
> > > - jmp L(loop)
> > > -
> > > - .p2align 4
> > > -L(unaligned_match):
> > > - test %rdx, %rdx
> > > - jnz L(prolog_find_zero)
> > > -
> > > - mov %rax, %r8
> > > - lea (%rdi, %rcx), %rsi
> > > -
> > > -/* Loop start on aligned string. */
> > > - .p2align 4
> > > -L(loop):
> > > - movdqa (%rdi), %xmm0
> > > - pcmpeqd %xmm0, %xmm2
> > > - add $16, %rdi
> > > - pcmpeqd %xmm1, %xmm0
> > > - pmovmskb %xmm2, %rcx
> > > - pmovmskb %xmm0, %rax
> > > - or %rax, %rcx
> > > - jnz L(matches)
> > > -
> > > - movdqa (%rdi), %xmm3
> > > - pcmpeqd %xmm3, %xmm2
> > > - add $16, %rdi
> > > - pcmpeqd %xmm1, %xmm3
> > > - pmovmskb %xmm2, %rcx
> > > - pmovmskb %xmm3, %rax
> > > - or %rax, %rcx
> > > - jnz L(matches)
> > > -
> > > - movdqa (%rdi), %xmm4
> > > - pcmpeqd %xmm4, %xmm2
> > > - add $16, %rdi
> > > - pcmpeqd %xmm1, %xmm4
> > > - pmovmskb %xmm2, %rcx
> > > - pmovmskb %xmm4, %rax
> > > - or %rax, %rcx
> > > - jnz L(matches)
> > > -
> > > - movdqa (%rdi), %xmm5
> > > - pcmpeqd %xmm5, %xmm2
> > > - add $16, %rdi
> > > - pcmpeqd %xmm1, %xmm5
> > > - pmovmskb %xmm2, %rcx
> > > - pmovmskb %xmm5, %rax
> > > - or %rax, %rcx
> > > - jz L(loop)
> > > -
> > > - .p2align 4
> > > -L(matches):
> > > - test %rax, %rax
> > > - jnz L(match)
> > > -L(return_value):
> > > - test %r8, %r8
> > > - jz L(return_null)
> > > - mov %r8, %rax
> > > - mov %rsi, %rdi
> > > -
> > > - test $15 << 4, %ah
> > > - jnz L(match_fourth_wchar)
> > > - test %ah, %ah
> > > - jnz L(match_third_wchar)
> > > - test $15 << 4, %al
> > > - jnz L(match_second_wchar)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(match):
> > > - pmovmskb %xmm2, %rcx
> > > - test %rcx, %rcx
> > > - jnz L(find_zero)
> > > - mov %rax, %r8
> > > - mov %rdi, %rsi
> > > - jmp L(loop)
> > > -
> > > - .p2align 4
> > > -L(find_zero):
> > > - test $15, %cl
> > > - jnz L(find_zero_in_first_wchar)
> > > - test %cl, %cl
> > > - jnz L(find_zero_in_second_wchar)
> > > - test $15, %ch
> > > - jnz L(find_zero_in_third_wchar)
> > > -
> > > - and $1 << 13 - 1, %rax
> > > - jz L(return_value)
> > > -
> > > - test $15 << 4, %ah
> > > - jnz L(match_fourth_wchar)
> > > - test %ah, %ah
> > > - jnz L(match_third_wchar)
> > > - test $15 << 4, %al
> > > - jnz L(match_second_wchar)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(find_zero_in_first_wchar):
> > > - test $1, %rax
> > > - jz L(return_value)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(find_zero_in_second_wchar):
> > > - and $1 << 5 - 1, %rax
> > > - jz L(return_value)
> > > -
> > > - test $15 << 4, %al
> > > - jnz L(match_second_wchar)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(find_zero_in_third_wchar):
> > > - and $1 << 9 - 1, %rax
> > > - jz L(return_value)
> > > -
> > > - test %ah, %ah
> > > - jnz L(match_third_wchar)
> > > - test $15 << 4, %al
> > > - jnz L(match_second_wchar)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(prolog_find_zero):
> > > - add %rcx, %rdi
> > > - mov %rdx, %rcx
> > > -L(prolog_find_zero_1):
> > > - test $15, %cl
> > > - jnz L(prolog_find_zero_in_first_wchar)
> > > - test %cl, %cl
> > > - jnz L(prolog_find_zero_in_second_wchar)
> > > - test $15, %ch
> > > - jnz L(prolog_find_zero_in_third_wchar)
> > > -
> > > - and $1 << 13 - 1, %rax
> > > - jz L(return_null)
> > > -
> > > - test $15 << 4, %ah
> > > - jnz L(match_fourth_wchar)
> > > - test %ah, %ah
> > > - jnz L(match_third_wchar)
> > > - test $15 << 4, %al
> > > - jnz L(match_second_wchar)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(prolog_find_zero_in_first_wchar):
> > > - test $1, %rax
> > > - jz L(return_null)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(prolog_find_zero_in_second_wchar):
> > > - and $1 << 5 - 1, %rax
> > > - jz L(return_null)
> > > -
> > > - test $15 << 4, %al
> > > - jnz L(match_second_wchar)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(prolog_find_zero_in_third_wchar):
> > > - and $1 << 9 - 1, %rax
> > > - jz L(return_null)
> > > -
> > > - test %ah, %ah
> > > - jnz L(match_third_wchar)
> > > - test $15 << 4, %al
> > > - jnz L(match_second_wchar)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(match_second_wchar):
> > > - lea -12(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(match_third_wchar):
> > > - lea -8(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(match_fourth_wchar):
> > > - lea -4(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(return_null):
> > > - xor %rax, %rax
> > > - ret
> > > -
> > > -END (wcsrchr)
> > > +#include "../strrchr.S"
> > > --
> > > 2.25.1
> > >
> >
> >
> > --
> > H.J.
--
H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v1 1/5] benchtests: Improve bench-strrchr
2022-04-21 20:12 ` [PATCH v1 1/5] benchtests: Improve bench-strrchr H.J. Lu
@ 2022-04-21 22:07 ` Noah Goldstein
2022-04-21 23:49 ` H.J. Lu
0 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 22:07 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 21, 2022 at 3:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > 1. Use json-lib for printing results.
> > 2. Expose all parameters (before pos, seek_char, and max_char where
> > not printed).
> > 3. Add benchmarks that test multiple occurence of seek_char in the
> > string.
> > ---
> > benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
> > 1 file changed, 82 insertions(+), 44 deletions(-)
> >
> > diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> > index abdae60c51..cceea77e1b 100644
> > --- a/benchtests/bench-strrchr.c
> > +++ b/benchtests/bench-strrchr.c
> > @@ -23,6 +23,7 @@
> > # define TEST_NAME "strrchr"
> > #endif
> > #include "bench-string.h"
> > +#include "json-lib.h"
> >
> > #define BIG_CHAR MAX_CHAR
> >
> > @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
> > }
> >
> > static void
> > -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> > + CHAR *exp_res)
> > {
> > CHAR *res = CALL (impl, s, c);
> > size_t i, iters = INNER_LOOP_ITERS8;
> > @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> >
> > if (res != exp_res)
> > {
> > - error (0, 0, "Wrong result in function %s %p %p", impl->name,
> > - res, exp_res);
> > + error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
> > + exp_res);
>
> These changes aren't needed.
>
> > ret = 1;
> > return;
> > }
> > @@ -72,24 +74,28 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > {
> > CALL (impl, s, c);
> > }
> > - TIMING_NOW (stop);
> >
> > + TIMING_NOW (stop);
>
> Not needed.
Will fix in V2
>
> > TIMING_DIFF (cur, start, stop);
> >
> > - TIMING_PRINT_MEAN ((double) cur, (double) iters);
> > + json_element_double (json_ctx, (double) cur / (double) iters);
> > + return;
>
> Return isn't needed.
Will fix in V2.
>
> > }
> >
> > static void
> > -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> > + int seek_char, int max_char, size_t freq)
> > /* For wcsrchr: align here means align not in bytes,
> > but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> > len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
> > {
> > size_t i;
> > + size_t pos_chunk_sz = freq ? (pos / freq) : pos;
> > + size_t last_pos = len;
> > CHAR *result;
> > CHAR *buf = (CHAR *) buf1;
> >
> > - align &= 7;
> > + align &= (getpagesize () - 1);
>
> If we have such large alignments, the tests may be skipped.
> Should we change it to 127 instead?
There is logic around page cross cases in the x86_64 versions, so I think
it makes sense to support benchmarking it.
Also, I think that would tend to give the previous version a bit of
an unfair disadvantage, as the slow aligning case would never be
tested in the new version.
>
> > if ((align + len) * sizeof (CHAR) >= page_size)
> > return;
> >
> > @@ -103,6 +109,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > if ((i > pos || pos >= len) && buf[align + i] == seek_char)
> > buf[align + i] = seek_char + 10 + (random () & 15);
> > }
> > +
> > + if (pos_chunk_sz == 0 && pos)
> > + pos_chunk_sz = 1;
> > +
> > + for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
> > + {
> > + buf[align + i] = seek_char;
> > + last_pos = i;
> > + }
> > +
> > buf[align + len] = 0;
> >
> > if (pos < len)
> > @@ -110,66 +126,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > buf[align + pos] = seek_char;
> > result = (CHAR *) (buf + align + pos);
> > }
> > + else if (last_pos < len)
> > + result = (CHAR *) (buf + align + last_pos);
> > else if (seek_char == 0)
> > result = (CHAR *) (buf + align + len);
> > else
> > result = NULL;
> >
> > - printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
> > + json_element_object_begin (json_ctx);
> > + json_attr_uint (json_ctx, "len", len);
> > + json_attr_uint (json_ctx, "pos", pos);
> > + json_attr_uint (json_ctx, "align", align);
> > + json_attr_uint (json_ctx, "freq", freq);
> > + json_attr_uint (json_ctx, "seek", seek_char);
> > + json_attr_uint (json_ctx, "max_char", max_char);
> > + json_array_begin (json_ctx, "timings");
> >
> > FOR_EACH_IMPL (impl, 0)
> > - do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
> > + do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
> >
> > - putchar ('\n');
> > + json_array_end (json_ctx);
> > + json_element_object_end (json_ctx);
> > }
> >
> > int
> > test_main (void)
> > {
> > - size_t i;
> > + json_ctx_t json_ctx;
> > + size_t i, j;
> > + int seek;
> >
> > test_init ();
> > + json_init (&json_ctx, 0, stdout);
> >
> > - printf ("%20s", "");
> > - FOR_EACH_IMPL (impl, 0)
> > - printf ("\t%s", impl->name);
> > - putchar ('\n');
> > + json_document_begin (&json_ctx);
> > + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> >
> > - for (i = 1; i < 8; ++i)
> > - {
> > - do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
> > - do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
> > - }
> > + json_attr_object_begin (&json_ctx, "functions");
> > + json_attr_object_begin (&json_ctx, TEST_NAME);
> > + json_attr_string (&json_ctx, "bench-variant", "");
> >
> > - for (i = 1; i < 8; ++i)
> > - {
> > - do_test (i, 64, 256, 23, SMALL_CHAR);
> > - do_test (i, 64, 256, 23, BIG_CHAR);
> > - }
> > -
> > - for (i = 0; i < 32; ++i)
> > - {
> > - do_test (0, i, i + 1, 23, SMALL_CHAR);
> > - do_test (0, i, i + 1, 23, BIG_CHAR);
> > - }
> > + json_array_begin (&json_ctx, "ifuncs");
> > + FOR_EACH_IMPL (impl, 0)
> > + json_element_string (&json_ctx, impl->name);
> > + json_array_end (&json_ctx);
> >
> > - for (i = 1; i < 8; ++i)
> > - {
> > - do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
> > - do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
> > - }
> > + json_array_begin (&json_ctx, "results");
> >
> > - for (i = 1; i < 8; ++i)
> > + for (seek = 0; seek <= 23; seek += 23)
> > {
> > - do_test (i, 64, 256, 0, SMALL_CHAR);
> > - do_test (i, 64, 256, 0, BIG_CHAR);
> > + for (j = 1; j < 32; j += j)
> > + {
> > + for (i = 1; i < 9; ++i)
> > + {
> > + do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
> > + do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
> > + }
> > +
> > + for (i = 1; i < 8; ++i)
> > + {
> > + do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
> > + do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
> > +
> > + do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
> > + do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
> > + }
> > +
> > + for (i = 0; i < 32; ++i)
> > + {
> > + do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
> > + do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
> > + }
> > + if (seek == 0)
> > + {
> > + break;
> > + }
> > + }
> > }
> >
> > - for (i = 0; i < 32; ++i)
> > - {
> > - do_test (0, i, i + 1, 0, SMALL_CHAR);
> > - do_test (0, i, i + 1, 0, BIG_CHAR);
> > - }
> > + json_array_end (&json_ctx);
> > + json_attr_object_end (&json_ctx);
> > + json_attr_object_end (&json_ctx);
> > + json_document_end (&json_ctx);
> >
> > return ret;
> > }
> > --
> > 2.25.1
> >
>
>
> --
> H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v2 1/4] benchtests: Improve bench-strrchr
2022-04-21 3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
` (4 preceding siblings ...)
2022-04-21 20:12 ` [PATCH v1 1/5] benchtests: Improve bench-strrchr H.J. Lu
@ 2022-04-21 22:22 ` Noah Goldstein
2022-04-21 22:22 ` [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
` (2 more replies)
2022-04-22 1:52 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr Noah Goldstein
2022-04-22 19:11 ` [PATCH v4 " Noah Goldstein
7 siblings, 3 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 22:22 UTC (permalink / raw)
To: libc-alpha
1. Use json-lib for printing results.
2. Expose all parameters (previously pos, seek_char, and max_char were
not printed).
3. Add benchmarks that test multiple occurrences of seek_char in the
string.
---
benchtests/bench-strrchr.c | 124 ++++++++++++++++++++++++-------------
1 file changed, 80 insertions(+), 44 deletions(-)
diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
index abdae60c51..ce4307a098 100644
--- a/benchtests/bench-strrchr.c
+++ b/benchtests/bench-strrchr.c
@@ -23,6 +23,7 @@
# define TEST_NAME "strrchr"
#endif
#include "bench-string.h"
+#include "json-lib.h"
#define BIG_CHAR MAX_CHAR
@@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
}
static void
-do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
+ CHAR *exp_res)
{
CHAR *res = CALL (impl, s, c);
size_t i, iters = INNER_LOOP_ITERS8;
@@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
if (res != exp_res)
{
- error (0, 0, "Wrong result in function %s %p %p", impl->name,
- res, exp_res);
+ error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
+ exp_res);
ret = 1;
return;
}
@@ -73,23 +75,25 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
CALL (impl, s, c);
}
TIMING_NOW (stop);
-
TIMING_DIFF (cur, start, stop);
- TIMING_PRINT_MEAN ((double) cur, (double) iters);
+ json_element_double (json_ctx, (double) cur / (double) iters);
}
static void
-do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
+ int seek_char, int max_char, size_t freq)
/* For wcsrchr: align here means align not in bytes,
but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
{
size_t i;
+ size_t pos_chunk_sz = freq ? (pos / freq) : pos;
+ size_t last_pos = len;
CHAR *result;
CHAR *buf = (CHAR *) buf1;
- align &= 7;
+ align &= (getpagesize () - 1);
if ((align + len) * sizeof (CHAR) >= page_size)
return;
@@ -103,6 +107,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
if ((i > pos || pos >= len) && buf[align + i] == seek_char)
buf[align + i] = seek_char + 10 + (random () & 15);
}
+
+ if (pos_chunk_sz == 0 && pos)
+ pos_chunk_sz = 1;
+
+ for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
+ {
+ buf[align + i] = seek_char;
+ last_pos = i;
+ }
+
buf[align + len] = 0;
if (pos < len)
@@ -110,66 +124,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
buf[align + pos] = seek_char;
result = (CHAR *) (buf + align + pos);
}
+ else if (last_pos < len)
+ result = (CHAR *) (buf + align + last_pos);
else if (seek_char == 0)
result = (CHAR *) (buf + align + len);
else
result = NULL;
- printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
+ json_element_object_begin (json_ctx);
+ json_attr_uint (json_ctx, "len", len);
+ json_attr_uint (json_ctx, "pos", pos);
+ json_attr_uint (json_ctx, "align", align);
+ json_attr_uint (json_ctx, "freq", freq);
+ json_attr_uint (json_ctx, "seek", seek_char);
+ json_attr_uint (json_ctx, "max_char", max_char);
+ json_array_begin (json_ctx, "timings");
FOR_EACH_IMPL (impl, 0)
- do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
+ do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
- putchar ('\n');
+ json_array_end (json_ctx);
+ json_element_object_end (json_ctx);
}
int
test_main (void)
{
- size_t i;
+ json_ctx_t json_ctx;
+ size_t i, j;
+ int seek;
test_init ();
+ json_init (&json_ctx, 0, stdout);
- printf ("%20s", "");
- FOR_EACH_IMPL (impl, 0)
- printf ("\t%s", impl->name);
- putchar ('\n');
-
- for (i = 1; i < 8; ++i)
- {
- do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
- do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
- }
+ json_document_begin (&json_ctx);
+ json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
- for (i = 1; i < 8; ++i)
- {
- do_test (i, 64, 256, 23, SMALL_CHAR);
- do_test (i, 64, 256, 23, BIG_CHAR);
- }
+ json_attr_object_begin (&json_ctx, "functions");
+ json_attr_object_begin (&json_ctx, TEST_NAME);
+ json_attr_string (&json_ctx, "bench-variant", "");
- for (i = 0; i < 32; ++i)
- {
- do_test (0, i, i + 1, 23, SMALL_CHAR);
- do_test (0, i, i + 1, 23, BIG_CHAR);
- }
+ json_array_begin (&json_ctx, "ifuncs");
+ FOR_EACH_IMPL (impl, 0)
+ json_element_string (&json_ctx, impl->name);
+ json_array_end (&json_ctx);
- for (i = 1; i < 8; ++i)
- {
- do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
- do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
- }
+ json_array_begin (&json_ctx, "results");
- for (i = 1; i < 8; ++i)
+ for (seek = 0; seek <= 23; seek += 23)
{
- do_test (i, 64, 256, 0, SMALL_CHAR);
- do_test (i, 64, 256, 0, BIG_CHAR);
+ for (j = 1; j < 32; j += j)
+ {
+ for (i = 1; i < 9; ++i)
+ {
+ do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
+ }
+
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
+
+ do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
+ }
+
+ for (i = 0; i < 32; ++i)
+ {
+ do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
+ }
+ if (seek == 0)
+ {
+ break;
+ }
+ }
}
- for (i = 0; i < 32; ++i)
- {
- do_test (0, i, i + 1, 0, SMALL_CHAR);
- do_test (0, i, i + 1, 0, BIG_CHAR);
- }
+ json_array_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_document_end (&json_ctx);
return ret;
}
--
2.25.1
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2
2022-04-21 22:22 ` [PATCH v2 1/4] " Noah Goldstein
@ 2022-04-21 22:22 ` Noah Goldstein
2022-04-21 23:46 ` H.J. Lu
2022-04-21 22:22 ` [PATCH v2 3/4] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
2022-04-21 22:22 ` [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
2 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 22:22 UTC (permalink / raw)
To: libc-alpha
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.741
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
---
sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
sysdeps/x86_64/strrchr.S | 510 +++++++++++++++---------
sysdeps/x86_64/wcsrchr.S | 268 +------------
4 files changed, 339 insertions(+), 444 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index db1b44c23c..866396e947 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
# undef weak_alias
# define weak_alias(strrchr, rindex)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index 78d1ca6553..69d2f3cdb1 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR __wcsrchr_sse2
#endif
-
#include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index 50d886713e..6efb25c880 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,360 @@
#include <sysdep.h>
+#ifndef STRRCHR
+# define STRRCHR strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ pcmpeqd
+# define CHAR_SIZE 4
+# define PMINU pminud
+#else
+# define PCMPEQ pcmpeqb
+# define CHAR_SIZE 1
+# define PMINU pminub
+#endif
+
+#define PAGE_SIZE 4096
+#define VEC_SIZE 16
+
.text
-ENTRY (strrchr)
- movd %esi, %xmm1
+ENTRY(STRRCHR)
+ movd %esi, %xmm0
movq %rdi, %rax
- andl $4095, %eax
- punpcklbw %xmm1, %xmm1
- cmpq $4032, %rax
- punpcklwd %xmm1, %xmm1
- pshufd $0, %xmm1, %xmm1
+ andl $(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+#endif
+ pshufd $0, %xmm0, %xmm0
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(cross_page)
- movdqu (%rdi), %xmm0
+
+L(cross_page_continue):
+ movups (%rdi), %xmm1
pxor %xmm2, %xmm2
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %ecx
- pmovmskb %xmm3, %edx
- testq %rdx, %rdx
- je L(next_48_bytes)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rcx, %rax
- je L(exit)
- bsrq %rax, %rax
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret0):
ret
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
.p2align 4
-L(next_48_bytes):
- movdqu 16(%rdi), %xmm4
- movdqa %xmm4, %xmm5
- movdqu 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm2, %xmm5
- movdqu 48(%rdi), %xmm0
- pmovmskb %xmm5, %edx
- movdqa %xmm3, %xmm5
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm5
- pcmpeqb %xmm0, %xmm2
- salq $16, %rdx
- pmovmskb %xmm3, %r8d
- pmovmskb %xmm5, %eax
- pmovmskb %xmm2, %esi
- salq $32, %r8
- salq $32, %rax
- pcmpeqb %xmm1, %xmm0
- orq %rdx, %rax
- movq %rsi, %rdx
- pmovmskb %xmm4, %esi
- salq $48, %rdx
- salq $16, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
- orq %rcx, %rsi
- orq %rdx, %rax
- je L(loop_header2)
- leaq -1(%rax), %rcx
- xorq %rax, %rcx
- andq %rcx, %rsi
- je L(exit)
- bsrq %rsi, %rsi
- leaq (%rdi,%rsi), %rax
+L(first_vec_x0_test):
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ testl %eax, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %r8, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
-L(loop_header2):
- testq %rsi, %rsi
- movq %rdi, %rcx
- je L(no_c_found)
-L(loop_header):
- addq $64, %rdi
- pxor %xmm7, %xmm7
- andq $-64, %rdi
- jmp L(loop_entry)
+L(first_vec_x1):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
.p2align 4
-L(loop64):
- testq %rdx, %rdx
- cmovne %rdx, %rsi
- cmovne %rdi, %rcx
- addq $64, %rdi
-L(loop_entry):
- movdqa 32(%rdi), %xmm3
- pxor %xmm6, %xmm6
- movdqa 48(%rdi), %xmm2
- movdqa %xmm3, %xmm0
- movdqa 16(%rdi), %xmm4
- pminub %xmm2, %xmm0
- movdqa (%rdi), %xmm5
- pminub %xmm4, %xmm0
- pminub %xmm5, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %eax
- movdqa %xmm5, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %r9d
- movdqa %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- movdqa %xmm3, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $16, %rdx
- pmovmskb %xmm0, %r10d
- movdqa %xmm2, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $32, %r10
- orq %r10, %rdx
- pmovmskb %xmm0, %r8d
- orq %r9, %rdx
- salq $48, %r8
- orq %r8, %rdx
+L(first_vec_x1_test):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
testl %eax, %eax
- je L(loop64)
- pcmpeqb %xmm6, %xmm4
- pcmpeqb %xmm6, %xmm3
- pcmpeqb %xmm6, %xmm5
- pmovmskb %xmm4, %eax
- pmovmskb %xmm3, %r10d
- pcmpeqb %xmm6, %xmm2
- pmovmskb %xmm5, %r9d
- salq $32, %r10
- salq $16, %rax
- pmovmskb %xmm2, %r8d
- orq %r10, %rax
- orq %r9, %rax
- salq $48, %r8
- orq %r8, %rax
- leaq -1(%rax), %r8
- xorq %rax, %r8
- andq %r8, %rdx
- cmovne %rdi, %rcx
- cmovne %rdx, %rsi
- bsrq %rsi, %rsi
- leaq (%rcx,%rsi), %rax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm3, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+ andq $-VEC_SIZE, %rdi
+
+ movaps VEC_SIZE(%rdi), %xmm2
+ pxor %xmm3, %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pmovmskb %xmm3, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
+
+ movaps (VEC_SIZE * 2)(%rdi), %xmm3
+ pxor %xmm4, %xmm4
+ PCMPEQ %xmm3, %xmm4
+ pmovmskb %xmm4, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+
+ addq $VEC_SIZE, %rdi
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ andq $-(VEC_SIZE * 2), %rdi
+ .p2align 4
+L(first_loop):
+ /* Do 2x VEC at a time. */
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+ /* If SSE2 no pminud so wcsrchr needs seperate logic for
+ detecting zero. Note if this is found to be a bottleneck it
+ may be worth adding an SSE4.1 wcsrchr implementation. */
+#ifdef USE_AS_WCSRCHR
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Use `addl` 1) so we can undo it with `subl` and 2) it can
+ macro-fuse with `jz`. */
+ addl %ecx, %eax
+ jz L(first_loop)
+
+ /* Check if there is zero match. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+ /* Check if there was a match in last iteration. */
+ subl %ecx, %eax
+ jnz L(new_match)
+
+L(first_loop_old_match):
+ PCMPEQ %xmm0, %xmm2
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ addl %eax, %ecx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ sall $16, %eax
+ orl %ecx, %eax
+
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+ /* We can't reuse either of the old comparisons as since we mask
+ of zeros after first zero (instead of using the full
+ comparison) we can't gurantee no interference between match
+ after end of string and valid match. */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
+ /* Save minimum state for getting most recent match. We can
+ throw out all previous work. */
.p2align 4
-L(no_c_found):
- movl $1, %esi
- xorl %ecx, %ecx
- jmp L(loop_header)
+L(second_loop_match):
+ movq %rdi, %rsi
+ movaps %xmm4, %xmm2
+ movaps %xmm7, %xmm3
.p2align 4
-L(exit):
- xorl %eax, %eax
+L(second_loop):
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+ /* If SSE2 no pminud so wcsrchr needs seperate logic for
+ detecting zero. Note if this is found to be a bottleneck it
+ may be worth adding an SSE4.1 wcsrchr implementation. */
+#ifdef USE_AS_WCSRCHR
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Either null term or new occurence of CHAR. */
+ addl %ecx, %eax
+ jz L(second_loop)
+
+ /* No null term so much be new occurence of CHAR. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+
+ subl %ecx, %eax
+ jnz L(second_loop_new_match)
+
+L(second_loop_old_match):
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ sall $16, %eax
+ orl %ecx, %eax
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
+L(second_loop_new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+ /* We can't reuse either of the old comparisons as since we mask
+ of zeros after first zero (instead of using the full
+ comparison) we can't gurantee no interference between match
+ after end of string and valid match. */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(second_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4,, 4
L(cross_page):
- movq %rdi, %rax
- pxor %xmm0, %xmm0
- andq $-64, %rax
- movdqu (%rax), %xmm5
- movdqa %xmm5, %xmm6
- movdqu 16(%rax), %xmm4
- pcmpeqb %xmm1, %xmm5
- pcmpeqb %xmm0, %xmm6
- movdqu 32(%rax), %xmm3
- pmovmskb %xmm6, %esi
- movdqa %xmm4, %xmm6
- movdqu 48(%rax), %xmm2
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm0, %xmm6
- pmovmskb %xmm6, %edx
- movdqa %xmm3, %xmm6
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm0, %xmm6
- pcmpeqb %xmm2, %xmm0
- salq $16, %rdx
- pmovmskb %xmm3, %r9d
- pmovmskb %xmm6, %r8d
- pmovmskb %xmm0, %ecx
- salq $32, %r9
- salq $32, %r8
- pcmpeqb %xmm1, %xmm2
- orq %r8, %rdx
- salq $48, %rcx
- pmovmskb %xmm5, %r8d
- orq %rsi, %rdx
- pmovmskb %xmm4, %esi
- orq %rcx, %rdx
- pmovmskb %xmm2, %ecx
- salq $16, %rsi
- salq $48, %rcx
- orq %r9, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ movaps (%rsi), %xmm1
+ pxor %xmm2, %xmm2
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
movl %edi, %ecx
- subl %eax, %ecx
- shrq %cl, %rdx
- shrq %cl, %rsi
- testq %rdx, %rdx
- je L(loop_header2)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rax, %rsi
- je L(exit)
- bsrq %rsi, %rax
+ andl $(VEC_SIZE - 1), %ecx
+ sarl %cl, %edx
+ jz L(cross_page_continue)
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ sarl %cl, %eax
+ leal -1(%rdx), %ecx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret1):
ret
-END (strrchr)
+END(STRRCHR)
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+ weak_alias (STRRCHR, rindex)
+ libc_hidden_builtin_def (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 61552954de..2b80efc5ef 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -1,4 +1,4 @@
-/* wcsrchr with SSSE3
+/* wcsrchr optimized with SSE2.
Copyright (C) 2011-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,266 +16,12 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
- .text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR 1
+#define NO_PMINU 1
- movd %rsi, %xmm1
- mov %rdi, %rcx
- punpckldq %xmm1, %xmm1
- pxor %xmm2, %xmm2
- punpckldq %xmm1, %xmm1
- and $63, %rcx
- cmp $48, %rcx
- ja L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR wcsrchr
+#endif
- movdqu (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match1)
-
- test %rcx, %rcx
- jnz L(return_null)
-
- and $-16, %rdi
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match1):
- test %rcx, %rcx
- jnz L(prolog_find_zero_1)
-
- mov %rax, %r8
- mov %rdi, %rsi
- and $-16, %rdi
- jmp L(loop)
-
- .p2align 4
-L(crosscache):
- and $15, %rcx
- and $-16, %rdi
- pxor %xmm3, %xmm3
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm3
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm3, %rdx
- pmovmskb %xmm0, %rax
- shr %cl, %rdx
- shr %cl, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match)
-
- test %rdx, %rdx
- jnz L(return_null)
-
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match):
- test %rdx, %rdx
- jnz L(prolog_find_zero)
-
- mov %rax, %r8
- lea (%rdi, %rcx), %rsi
-
-/* Loop start on aligned string. */
- .p2align 4
-L(loop):
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm3
- pcmpeqd %xmm3, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm3
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm3, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm4
- pcmpeqd %xmm4, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm4
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm4, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm5
- pcmpeqd %xmm5, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm5
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm5, %rax
- or %rax, %rcx
- jz L(loop)
-
- .p2align 4
-L(matches):
- test %rax, %rax
- jnz L(match)
-L(return_value):
- test %r8, %r8
- jz L(return_null)
- mov %r8, %rax
- mov %rsi, %rdi
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match):
- pmovmskb %xmm2, %rcx
- test %rcx, %rcx
- jnz L(find_zero)
- mov %rax, %r8
- mov %rdi, %rsi
- jmp L(loop)
-
- .p2align 4
-L(find_zero):
- test $15, %cl
- jnz L(find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(find_zero_in_second_wchar)
- test $15, %ch
- jnz L(find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_value)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_value)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero):
- add %rcx, %rdi
- mov %rdx, %rcx
-L(prolog_find_zero_1):
- test $15, %cl
- jnz L(prolog_find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(prolog_find_zero_in_second_wchar)
- test $15, %ch
- jnz L(prolog_find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_null)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_null)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match_second_wchar):
- lea -12(%rdi), %rax
- ret
-
- .p2align 4
-L(match_third_wchar):
- lea -8(%rdi), %rax
- ret
-
- .p2align 4
-L(match_fourth_wchar):
- lea -4(%rdi), %rax
- ret
-
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-
-END (wcsrchr)
+#include "../strrchr.S"
--
2.25.1
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v2 3/4] x86: Optimize {str|wcs}rchr-avx2
2022-04-21 22:22 ` [PATCH v2 1/4] " Noah Goldstein
2022-04-21 22:22 ` [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
@ 2022-04-21 22:22 ` Noah Goldstein
2022-04-21 22:22 ` [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
2 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 22:22 UTC (permalink / raw)
To: libc-alpha
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.832
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
---
sysdeps/x86_64/multiarch/strrchr-avx2.S | 415 +++++++++++++++---------
1 file changed, 258 insertions(+), 157 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 1df2adfad0..9d1e45defc 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
# ifdef USE_AS_WCSRCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMIN vpminud
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMIN vpminub
+# define CHAR_SIZE 1
# endif
# ifndef VZEROUPPER
@@ -41,196 +45,293 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
- .section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
- movd %esi, %xmm4
- movl %edi, %ecx
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+ movd %esi, %xmm7
+ movl %edi, %eax
/* Broadcast CHAR to YMM4. */
- VPBROADCAST %xmm4, %ymm4
+ VPBROADCAST %xmm7, %ymm7
vpxor %xmm0, %xmm0, %xmm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ /* Shift here instead of `andl` to save code size (saves a fetch
+ block). */
+ sall $20, %eax
+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
+ ja L(cross_page)
+L(page_cross_continue):
vmovdqu (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- addq $VEC_SIZE, %rdi
+ /* Check end of string match. */
+ VPCMPEQ %ymm1, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ /* Only check match with search CHAR if needed. */
+ VPCMPEQ %ymm1, %ymm7, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Check if match before first zero. */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
+ .p2align 4,, 10
+L(first_vec_x1):
+ VPCMPEQ %ymm2, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jnz L(first_vec_x1_return)
+
+ .p2align 4,, 4
+L(first_vec_x0_test):
+ VPCMPEQ %ymm1, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ testl %eax, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
+ addq %r8, %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret1):
+ VZEROUPPER_RETURN
+ .p2align 4,, 10
+L(first_vec_x0_x1_test):
+ VPCMPEQ %ymm2, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
testl %eax, %eax
- jnz L(first_vec)
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsrl %eax, %eax
+ leaq 1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
- testl %ecx, %ecx
- jnz L(return_null)
- andq $-VEC_SIZE, %rdi
- xorl %edx, %edx
- jmp L(aligned_loop)
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMPEQ %ymm3, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x0_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
+
.p2align 4
-L(first_vec):
- /* Check if there is a nul CHAR. */
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+
+ /* Align src. */
+ orq $(VEC_SIZE - 1), %rdi
+ vmovdqu 1(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
testl %ecx, %ecx
- jnz L(char_and_nul_in_first_vec)
+ jnz L(first_vec_x1)
- /* Remember the match and keep searching. */
- movl %eax, %edx
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rdi
- jmp L(aligned_loop)
+ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
+ VPCMPEQ %ymm3, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ addq $(VEC_SIZE + 1), %rdi
+ andq $-(VEC_SIZE * 2), %rdi
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %edx
- vpmovmskb %ymm3, %eax
- shrl %cl, %edx
- shrl %cl, %eax
- addq $VEC_SIZE, %rdi
-
- /* Check if there is a CHAR. */
+L(first_aligned_loop):
+ /* Do 2x VEC at a time. Any more and the cost of finding the
+ match outweighs loop benefit. */
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
+
+ VPCMPEQ %ymm4, %ymm7, %ymm6
+ VPMIN %ymm4, %ymm5, %ymm8
+ VPCMPEQ %ymm5, %ymm7, %ymm10
+ vpor %ymm6, %ymm10, %ymm5
+ VPCMPEQ %ymm8, %ymm0, %ymm8
+ vpor %ymm5, %ymm8, %ymm9
+
+ vpmovmskb %ymm9, %eax
+ addq $(VEC_SIZE * 2), %rdi
+ /* No zero or search CHAR. */
testl %eax, %eax
- jnz L(found_char)
-
- testl %edx, %edx
- jnz L(return_null)
+ jz L(first_aligned_loop)
- jmp L(aligned_loop)
-
- .p2align 4
-L(found_char):
- testl %edx, %edx
- jnz L(char_and_nul)
+ /* If no zero CHAR then go to second loop (this allows us to
+ throw away all prior work). */
+ vpmovmskb %ymm8, %ecx
+ testl %ecx, %ecx
+ jz L(second_aligned_loop_prep)
- /* Remember the match and keep searching. */
- movl %eax, %edx
- leaq (%rdi, %rcx), %rsi
+ /* Search char could be zero so we need to get the true match.
+ */
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(first_aligned_loop_return)
- .p2align 4
-L(aligned_loop):
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- add $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
+ .p2align 4,, 4
+L(first_vec_x1_or_x2):
+ VPCMPEQ %ymm3, %ymm7, %ymm3
+ VPCMPEQ %ymm2, %ymm7, %ymm2
vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jz L(aligned_loop)
-
- .p2align 4
-L(char_nor_null):
- /* Find a CHAR or a nul CHAR in a loop. */
- testl %eax, %eax
- jnz L(match)
-L(return_value):
- testl %edx, %edx
- jz L(return_null)
- movl %edx, %eax
- movq %rsi, %rdi
+ vpmovmskb %ymm2, %edx
+ /* Use add for macro-fusion. */
+ addq %rax, %rdx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ salq $32, %rax
+ addq %rdx, %rax
+ bsrq %rax, %rax
+ leaq 1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
+ .p2align 4,, 8
+L(first_aligned_loop_return):
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %edx
+ salq $32, %rcx
+ orq %rdx, %rcx
+
+ vpmovmskb %ymm10, %eax
+ vpmovmskb %ymm6, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ blsmskq %rcx, %rcx
+ andq %rcx, %rax
+ jz L(first_vec_x1_or_x2)
+
+ bsrq %rax, %rax
+ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %eax
+ andq $-CHAR_SIZE, %rax
# endif
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
+ VZEROUPPER_RETURN
+ /* Search char cannot be zero. */
.p2align 4
-L(match):
- /* Find a CHAR. Check if there is a nul CHAR. */
- vpmovmskb %ymm2, %ecx
- testl %ecx, %ecx
- jnz L(find_nul)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
+L(second_aligned_loop_set_furthest_match):
+ /* Save VEC and pointer from most recent match. */
+L(second_aligned_loop_prep):
movq %rdi, %rsi
- jmp L(aligned_loop)
+ vmovdqu %ymm6, %ymm2
+ vmovdqu %ymm10, %ymm3
.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %ecx
- andl $0x11111111, %eax
-# endif
- /* Mask out any matching bits after the nul CHAR. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
+L(second_aligned_loop):
+ /* Search 2x at a time. */
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
+
+ VPCMPEQ %ymm4, %ymm7, %ymm6
+ VPMIN %ymm4, %ymm5, %ymm1
+ VPCMPEQ %ymm5, %ymm7, %ymm10
+ vpor %ymm6, %ymm10, %ymm5
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpor %ymm5, %ymm1, %ymm9
+
+ vpmovmskb %ymm9, %eax
+ addq $(VEC_SIZE * 2), %rdi
testl %eax, %eax
- /* If there is no CHAR here, return the remembered one. */
- jz L(return_value)
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
- VZEROUPPER_RETURN
-
- .p2align 4
-L(char_and_nul):
- /* Find both a CHAR and a nul CHAR. */
- addq %rcx, %rdi
- movl %edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %ecx
- andl $0x11111111, %eax
-# endif
- /* Mask out any matching bits after the nul CHAR. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
+ jz L(second_aligned_loop)
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jz L(second_aligned_loop_set_furthest_match)
+ vpmovmskb %ymm5, %eax
testl %eax, %eax
- /* Return null pointer if the nul CHAR comes first. */
- jz L(return_null)
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ jnz L(return_new_match)
+
+ /* This is the hot path. We know CHAR is in bounds and that
+ ymm3/ymm2 have latest match. */
+ .p2align 4,, 4
+L(return_old_match):
+ vpmovmskb %ymm3, %eax
+ vpmovmskb %ymm2, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ bsrq %rax, %rax
+ /* Search char cannot be zero so safe to just use lea for
+ wcsrchr. */
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
VZEROUPPER_RETURN
- .p2align 4
-L(return_null):
- xorl %eax, %eax
+ /* Last iteration also potentially has a match. */
+ .p2align 4,, 8
+L(return_new_match):
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %edx
+ salq $32, %rcx
+ orq %rdx, %rcx
+
+ vpmovmskb %ymm10, %eax
+ vpmovmskb %ymm6, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ blsmskq %rcx, %rcx
+ andq %rcx, %rax
+ jz L(return_old_match)
+ bsrq %rax, %rax
+ /* Search char cannot be zero so safe to just use lea for
+ wcsrchr. */
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
VZEROUPPER_RETURN
-END (STRRCHR)
+ .p2align 4,, 4
+L(cross_page):
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ shrxl %edi, %ecx, %ecx
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+ VPCMPEQ %ymm1, %ymm7, %ymm1
+ vpmovmskb %ymm1, %eax
+ shrxl %edi, %eax, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret2)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret2):
+ VZEROUPPER_RETURN
+END(STRRCHR)
#endif
--
2.25.1
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex
2022-04-21 22:22 ` [PATCH v2 1/4] " Noah Goldstein
2022-04-21 22:22 ` [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
2022-04-21 22:22 ` [PATCH v2 3/4] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
@ 2022-04-21 22:22 ` Noah Goldstein
2022-04-21 23:59 ` H.J. Lu
2 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 22:22 UTC (permalink / raw)
To: libc-alpha
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.755
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
---
sysdeps/x86_64/multiarch/strrchr-evex.S | 441 ++++++++++++++----------
1 file changed, 259 insertions(+), 182 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index adeddaed32..5cf9a8315b 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,319 @@
# define STRRCHR __strrchr_evex
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
# ifdef USE_AS_WCSRCHR
+# define SHIFT_REG esi
+
+# define kunpck kunpckbw
+# define kmov_2x kmovd
+# define maskz_2x ecx
+# define maskm_2x eax
+# define CHAR_SIZE 4
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
-# define SHIFT_REG r8d
+# define VPCMP vpcmpd
# else
+# define SHIFT_REG edi
+
+# define kunpck kunpckdq
+# define kmov_2x kmovq
+# define maskz_2x rcx
+# define maskm_2x rax
+
+# define CHAR_SIZE 1
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
-# define SHIFT_REG ecx
+# define VPCMP vpcmpb
# endif
# define XMMZERO xmm16
# define YMMZERO ymm16
# define YMMMATCH ymm17
-# define YMM1 ymm18
+# define YMMSAVE ymm18
+
+# define YMM1 ymm19
+# define YMM2 ymm20
+# define YMM3 ymm21
+# define YMM4 ymm22
+# define YMM5 ymm23
+# define YMM6 ymm24
+# define YMM7 ymm25
+# define YMM8 ymm26
-# define VEC_SIZE 32
- .section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
- movl %edi, %ecx
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ .section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+ movl %edi, %eax
/* Broadcast CHAR to YMMMATCH. */
VPBROADCAST %esi, %YMMMATCH
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
-
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jg L(cross_page_boundary)
+L(page_cross_continue):
VMOVU (%rdi), %YMM1
-
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ VPTESTN %YMM1, %YMM1, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
-
- addq $VEC_SIZE, %rdi
-
- testl %eax, %eax
- jnz L(first_vec)
-
testl %ecx, %ecx
- jnz L(return_null)
-
- andq $-VEC_SIZE, %rdi
- xorl %edx, %edx
- jmp L(aligned_loop)
-
- .p2align 4
-L(first_vec):
- /* Check if there is a null byte. */
- testl %ecx, %ecx
- jnz L(char_and_nul_in_first_vec)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rdi
- jmp L(aligned_loop)
-
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
-
+ jz L(aligned_more)
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k1, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
- bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
# endif
+L(ret0):
+ ret
- VMOVA (%rdi), %YMM1
-
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
+ /* Returns for first vec x1/x2/x3 have hard coded backward
+ search path for earlier matches. */
+ .p2align 4,, 6
+L(first_vec_x1):
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
+ kmovd %k1, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jnz L(first_vec_x1_return)
+ .p2align 4,, 4
+L(first_vec_x0_test):
VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %edx
kmovd %k1, %eax
-
- shrxl %SHIFT_REG, %edx, %edx
- shrxl %SHIFT_REG, %eax, %eax
- addq $VEC_SIZE, %rdi
-
- /* Check if there is a CHAR. */
testl %eax, %eax
- jnz L(found_char)
-
- testl %edx, %edx
- jnz L(return_null)
-
- jmp L(aligned_loop)
-
- .p2align 4
-L(found_char):
- testl %edx, %edx
- jnz L(char_and_nul)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
- leaq (%rdi, %rcx), %rsi
+ jz L(ret1)
+ bsrl %eax, %eax
+# ifdef USE_AS_WCSRCHR
+ leaq (%rsi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rsi, %rax
+# endif
+L(ret1):
+ ret
- .p2align 4
-L(aligned_loop):
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
+ .p2align 4,, 10
+L(first_vec_x1_or_x2):
+ VPCMP $0, %YMM3, %YMMMATCH, %k3
+ VPCMP $0, %YMM2, %YMMMATCH, %k2
+ kortestd %k2, %k3
+ jz L(first_vec_x0_test)
+
+ kunpck %k2, %k3, %k3
+ kmovq %k3, %rax
+ bsrq %rax, %rax
+ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %ecx
+ .p2align 4,, 6
+L(first_vec_x3):
+ VPCMP $0, %YMMMATCH, %YMM4, %k1
kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x1_or_x2)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VMOVA (%rdi), %YMM1
- add $VEC_SIZE, %rdi
+ .p2align 4,, 6
+L(first_vec_x0_x1_test):
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %ecx
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMP $0, %YMMMATCH, %YMM3, %k1
kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x0_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ .p2align 4
+L(aligned_more):
+ /* Need to keep original pointer in case YMM1 has last match. */
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rdi
+ VMOVU VEC_SIZE(%rdi), %YMM2
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
+ VPTESTN %YMM3, %YMM3, %k0
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
+ VPTESTN %YMM4, %YMM4, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
- orl %eax, %ecx
- jz L(aligned_loop)
+ movq %rdi, %r8
+ testl %ecx, %ecx
+ jnz L(first_vec_x3)
+ andq $-(VEC_SIZE * 2), %rdi
.p2align 4
-L(char_nor_null):
- /* Find a CHAR or a null byte in a loop. */
+L(first_aligned_loop):
+ /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
+ they don't store a match. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
+ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6
+
+ VPCMP $0, %YMM5, %YMMMATCH, %k2
+ vpxord %YMM6, %YMMMATCH, %YMM7
+
+ VPMIN %YMM5, %YMM6, %YMM8
+ VPMIN %YMM8, %YMM7, %YMM7
+
+ VPTESTN %YMM7, %YMM7, %k1
+ subq $(VEC_SIZE * -2), %rdi
+ kortestd %k1, %k2
+ jz L(first_aligned_loop)
+
+ VPCMP $0, %YMM6, %YMMMATCH, %k3
+ VPTESTN %YMM8, %YMM8, %k1
+ ktestd %k1, %k1
+ jz L(second_aligned_loop_prep)
+
+ kortestd %k2, %k3
+ jnz L(return_first_aligned_loop)
+
+ .p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+ VPCMP $0, %YMM4, %YMMMATCH, %k4
+ kmovd %k4, %eax
testl %eax, %eax
- jnz L(match)
-L(return_value):
- testl %edx, %edx
- jz L(return_null)
- movl %edx, %eax
- movq %rsi, %rdi
+ jz L(first_vec_x1_or_x2)
bsrl %eax, %eax
-# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
-# endif
+ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
ret
- .p2align 4
-L(match):
- /* Find a CHAR. Check if there is a null byte. */
- kmovd %k0, %ecx
- testl %ecx, %ecx
- jnz L(find_nul)
+ .p2align 4,, 8
+L(return_first_aligned_loop):
+ VPTESTN %YMM5, %YMM5, %k0
+ kunpck %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(first_vec_x1_or_x2_or_x3)
+
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Remember the match and keep searching. */
- movl %eax, %edx
+ .p2align 4
+ /* We can throw away the work done for the first 4x checks here
+ as we have a later match. This is the 'fast' path per se.
+ */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
movq %rdi, %rsi
- jmp L(aligned_loop)
+ kunpck %k2, %k3, %k4
.p2align 4
-L(find_nul):
- /* Mask out any matching bits after the null byte. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
- testl %eax, %eax
- /* If there is no CHAR here, return the remembered one. */
- jz L(return_value)
- bsrl %eax, %eax
+L(second_aligned_loop):
+ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
+ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2
+
+ VPCMP $0, %YMM1, %YMMMATCH, %k2
+ vpxord %YMM2, %YMMMATCH, %YMM3
+
+ VPMIN %YMM1, %YMM2, %YMM4
+ VPMIN %YMM3, %YMM4, %YMM3
+
+ VPTESTN %YMM3, %YMM3, %k1
+ subq $(VEC_SIZE * -2), %rdi
+ kortestd %k1, %k2
+ jz L(second_aligned_loop)
+
+ VPCMP $0, %YMM2, %YMMMATCH, %k3
+ VPTESTN %YMM4, %YMM4, %k1
+ ktestd %k1, %k1
+ jz L(second_aligned_loop_set_furthest_match)
+
+ kortestd %k2, %k3
+ /* branch here because there is a significant advantage in terms
+ of output dependency chain in using edx. */
+ jnz L(return_new_match)
+L(return_old_match):
+ kmovq %k4, %rax
+ bsrq %rax, %rax
+ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+ ret
+
+L(return_new_match):
+ VPTESTN %YMM1, %YMM1, %k0
+ kunpck %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(return_old_match)
+
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ /* This block is horribly aligned (% 16 == 15). This is
+ intentional. The L(cross_page_boundary) block is exactly
+ 32-bytes of code size. Ultimately this is a cold case so
+ save the code size by leaving misaligned. */
+L(cross_page_boundary):
+ xorq %rdi, %rax
+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+ VPTESTN %YMM1, %YMM1, %k0
+ kmovd %k0, %ecx
# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ movl %edi, %esi
+ andl $(VEC_SIZE - 1), %esi
+ shrl $2, %esi
# endif
- ret
+ shrxl %SHIFT_REG, %ecx, %ecx
- .p2align 4
-L(char_and_nul):
- /* Find both a CHAR and a null byte. */
- addq %rcx, %rdi
- movl %edx, %ecx
-L(char_and_nul_in_first_vec):
- /* Mask out any matching bits after the null byte. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
- testl %eax, %eax
- /* Return null pointer if the null byte comes first. */
- jz L(return_null)
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k1, %eax
+ shrxl %SHIFT_REG, %eax, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret3)
bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ addq %rdi, %rax
# endif
+L(ret3):
ret
- .p2align 4
-L(return_null):
- xorl %eax, %eax
- ret
-
-END (STRRCHR)
+END(STRRCHR)
#endif
--
2.25.1
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2
2022-04-21 21:48 ` H.J. Lu
@ 2022-04-21 22:23 ` Noah Goldstein
0 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-21 22:23 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 21, 2022 at 4:49 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 1:57 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Apr 21, 2022 at 3:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > The new code unrolls the main loop slightly without adding too much
> > > > overhead and minimizes the comparisons for the search CHAR.
> > > >
> > > > Geometric Mean of all benchmarks New / Old: 0.741
> > > > See email for all results.
> > > >
> > > > Full xcheck passes on x86_64 with and without multiarch enabled.
> > > > ---
> > > > Results For: strrchr
> > > >
> > > > Geometric Mean of N=30 runs.
> > > >
> > > > Geometric Mean of all benchmarks New / Old: 0.741
> > > > Benchmarks performance on Tigerlake:
> > > > https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> > > >
> > > > len, align, pos, seek, max_char, freq, New Time / Old Time
> > > > 2048, 0, 32, 0, 127, 1, 0.647
> > > > 2048, 1, 32, 0, 127, 1, 0.621
> > > > 2048, 0, 64, 0, 127, 1, 0.661
> > > > 2048, 2, 64, 0, 127, 1, 0.655
> > > > 2048, 0, 128, 0, 127, 1, 0.69
> > > > 2048, 3, 128, 0, 127, 1, 0.689
> > > > 2048, 0, 256, 0, 127, 1, 0.718
> > > > 2048, 4, 256, 0, 127, 1, 0.718
> > > > 2048, 0, 512, 0, 127, 1, 0.758
> > > > 2048, 5, 512, 0, 127, 1, 0.754
> > > > 2048, 0, 1024, 0, 127, 1, 1.029
> > > > 2048, 6, 1024, 0, 127, 1, 1.032
> > > > 2048, 0, 2048, 0, 127, 1, 0.826
> > > > 2048, 7, 2048, 0, 127, 1, 0.834
> > > > 2048, 0, 4096, 0, 127, 1, 0.825
> > > > 2048, 8, 4096, 0, 127, 1, 0.83
> > > > 256, 1, 64, 0, 127, 1, 0.657
> > > > 256, 15, 64, 0, 127, 1, 0.657
> > > > 256, 2, 64, 0, 127, 1, 0.657
> > > > 256, 30, 64, 0, 127, 1, 0.523
> > > > 256, 3, 64, 0, 127, 1, 0.657
> > > > 256, 45, 64, 0, 127, 1, 0.654
> > > > 256, 4, 64, 0, 127, 1, 0.657
> > > > 256, 60, 64, 0, 127, 1, 0.526
> > > > 256, 5, 64, 0, 127, 1, 0.658
> > > > 256, 75, 64, 0, 127, 1, 0.658
> > > > 256, 6, 64, 0, 127, 1, 0.655
> > > > 256, 90, 64, 0, 127, 1, 0.523
> > > > 256, 7, 64, 0, 127, 1, 0.655
> > > > 256, 105, 64, 0, 127, 1, 0.654
> > > > 1, 0, 0, 0, 127, 1, 0.98
> > > > 2, 0, 1, 0, 127, 1, 0.978
> > > > 3, 0, 2, 0, 127, 1, 0.975
> > > > 4, 0, 3, 0, 127, 1, 0.976
> > > > 5, 0, 4, 0, 127, 1, 0.977
> > > > 6, 0, 5, 0, 127, 1, 0.981
> > > > 7, 0, 6, 0, 127, 1, 0.982
> > > > 8, 0, 7, 0, 127, 1, 0.98
> > > > 9, 0, 8, 0, 127, 1, 0.978
> > > > 10, 0, 9, 0, 127, 1, 0.981
> > > > 11, 0, 10, 0, 127, 1, 0.984
> > > > 12, 0, 11, 0, 127, 1, 0.982
> > > > 13, 0, 12, 0, 127, 1, 0.98
> > > > 14, 0, 13, 0, 127, 1, 0.978
> > > > 15, 0, 14, 0, 127, 1, 0.979
> > > > 16, 0, 15, 0, 127, 1, 0.986
> > > > 17, 0, 16, 0, 127, 1, 0.529
> > > > 18, 0, 17, 0, 127, 1, 0.566
> > > > 19, 0, 18, 0, 127, 1, 0.575
> > > > 20, 0, 19, 0, 127, 1, 0.573
> > > > 21, 0, 20, 0, 127, 1, 0.579
> > > > 22, 0, 21, 0, 127, 1, 0.595
> > > > 23, 0, 22, 0, 127, 1, 0.585
> > > > 24, 0, 23, 0, 127, 1, 0.586
> > > > 25, 0, 24, 0, 127, 1, 0.587
> > > > 26, 0, 25, 0, 127, 1, 0.592
> > > > 27, 0, 26, 0, 127, 1, 0.595
> > > > 28, 0, 27, 0, 127, 1, 0.592
> > > > 29, 0, 28, 0, 127, 1, 0.6
> > > > 30, 0, 29, 0, 127, 1, 0.598
> > > > 31, 0, 30, 0, 127, 1, 0.595
> > > > 32, 0, 31, 0, 127, 1, 0.592
> > > > 2048, 0, 32, 23, 127, 1, 0.827
> > > > 2048, 1, 32, 23, 127, 1, 0.826
> > > > 2048, 0, 64, 23, 127, 1, 0.824
> > > > 2048, 2, 64, 23, 127, 1, 0.825
> > > > 2048, 0, 128, 23, 127, 1, 0.829
> > > > 2048, 3, 128, 23, 127, 1, 0.824
> > > > 2048, 0, 256, 23, 127, 1, 0.832
> > > > 2048, 4, 256, 23, 127, 1, 0.825
> > > > 2048, 0, 512, 23, 127, 1, 0.831
> > > > 2048, 5, 512, 23, 127, 1, 0.837
> > > > 2048, 0, 1024, 23, 127, 1, 0.721
> > > > 2048, 6, 1024, 23, 127, 1, 0.757
> > > > 2048, 0, 2048, 23, 127, 1, 0.825
> > > > 2048, 7, 2048, 23, 127, 1, 0.824
> > > > 2048, 0, 4096, 23, 127, 1, 0.828
> > > > 2048, 8, 4096, 23, 127, 1, 0.823
> > > > 256, 1, 64, 23, 127, 1, 0.665
> > > > 256, 15, 64, 23, 127, 1, 0.661
> > > > 256, 2, 64, 23, 127, 1, 0.674
> > > > 256, 30, 64, 23, 127, 1, 0.605
> > > > 256, 3, 64, 23, 127, 1, 0.668
> > > > 256, 45, 64, 23, 127, 1, 0.661
> > > > 256, 4, 64, 23, 127, 1, 0.657
> > > > 256, 60, 64, 23, 127, 1, 0.594
> > > > 256, 5, 64, 23, 127, 1, 0.654
> > > > 256, 75, 64, 23, 127, 1, 0.673
> > > > 256, 6, 64, 23, 127, 1, 0.688
> > > > 256, 90, 64, 23, 127, 1, 0.6
> > > > 256, 7, 64, 23, 127, 1, 0.66
> > > > 256, 105, 64, 23, 127, 1, 0.654
> > > > 1, 0, 0, 23, 127, 1, 0.981
> > > > 2, 0, 1, 23, 127, 1, 0.976
> > > > 3, 0, 2, 23, 127, 1, 0.983
> > > > 4, 0, 3, 23, 127, 1, 0.984
> > > > 5, 0, 4, 23, 127, 1, 0.973
> > > > 6, 0, 5, 23, 127, 1, 0.987
> > > > 7, 0, 6, 23, 127, 1, 0.977
> > > > 8, 0, 7, 23, 127, 1, 0.979
> > > > 9, 0, 8, 23, 127, 1, 0.981
> > > > 10, 0, 9, 23, 127, 1, 0.98
> > > > 11, 0, 10, 23, 127, 1, 0.983
> > > > 12, 0, 11, 23, 127, 1, 0.98
> > > > 13, 0, 12, 23, 127, 1, 0.98
> > > > 14, 0, 13, 23, 127, 1, 0.977
> > > > 15, 0, 14, 23, 127, 1, 0.982
> > > > 16, 0, 15, 23, 127, 1, 0.581
> > > > 17, 0, 16, 23, 127, 1, 0.551
> > > > 18, 0, 17, 23, 127, 1, 0.555
> > > > 19, 0, 18, 23, 127, 1, 0.586
> > > > 20, 0, 19, 23, 127, 1, 0.585
> > > > 21, 0, 20, 23, 127, 1, 0.582
> > > > 22, 0, 21, 23, 127, 1, 0.571
> > > > 23, 0, 22, 23, 127, 1, 0.576
> > > > 24, 0, 23, 23, 127, 1, 0.581
> > > > 25, 0, 24, 23, 127, 1, 0.589
> > > > 26, 0, 25, 23, 127, 1, 0.593
> > > > 27, 0, 26, 23, 127, 1, 0.595
> > > > 28, 0, 27, 23, 127, 1, 0.583
> > > > 29, 0, 28, 23, 127, 1, 0.595
> > > > 30, 0, 29, 23, 127, 1, 0.58
> > > > 31, 0, 30, 23, 127, 1, 0.594
> > > > 32, 0, 31, 23, 127, 1, 0.665
> > > > 2048, 0, 32, 23, 127, 2, 0.825
> > > > 2048, 1, 32, 23, 127, 2, 0.818
> > > > 2048, 0, 64, 23, 127, 2, 0.829
> > > > 2048, 2, 64, 23, 127, 2, 0.828
> > > > 2048, 0, 128, 23, 127, 2, 0.823
> > > > 2048, 3, 128, 23, 127, 2, 0.825
> > > > 2048, 0, 256, 23, 127, 2, 0.819
> > > > 2048, 4, 256, 23, 127, 2, 0.828
> > > > 2048, 0, 512, 23, 127, 2, 0.824
> > > > 2048, 5, 512, 23, 127, 2, 0.827
> > > > 2048, 0, 1024, 23, 127, 2, 0.813
> > > > 2048, 6, 1024, 23, 127, 2, 0.834
> > > > 2048, 0, 2048, 23, 127, 2, 0.927
> > > > 2048, 7, 2048, 23, 127, 2, 0.923
> > > > 2048, 0, 4096, 23, 127, 2, 0.818
> > > > 2048, 8, 4096, 23, 127, 2, 0.82
> > > > 256, 1, 64, 23, 127, 2, 0.693
> > > > 256, 15, 64, 23, 127, 2, 0.686
> > > > 256, 2, 64, 23, 127, 2, 0.69
> > > > 256, 30, 64, 23, 127, 2, 0.611
> > > > 256, 3, 64, 23, 127, 2, 0.692
> > > > 256, 45, 64, 23, 127, 2, 0.685
> > > > 256, 4, 64, 23, 127, 2, 0.688
> > > > 256, 60, 64, 23, 127, 2, 0.6
> > > > 256, 5, 64, 23, 127, 2, 0.69
> > > > 256, 75, 64, 23, 127, 2, 0.689
> > > > 256, 6, 64, 23, 127, 2, 0.688
> > > > 256, 90, 64, 23, 127, 2, 0.611
> > > > 256, 7, 64, 23, 127, 2, 0.69
> > > > 256, 105, 64, 23, 127, 2, 0.686
> > > > 1, 0, 0, 23, 127, 2, 0.982
> > > > 2, 0, 1, 23, 127, 2, 0.987
> > > > 3, 0, 2, 23, 127, 2, 0.978
> > > > 4, 0, 3, 23, 127, 2, 0.977
> > > > 5, 0, 4, 23, 127, 2, 0.979
> > > > 6, 0, 5, 23, 127, 2, 0.985
> > > > 7, 0, 6, 23, 127, 2, 0.975
> > > > 8, 0, 7, 23, 127, 2, 0.981
> > > > 9, 0, 8, 23, 127, 2, 0.984
> > > > 10, 0, 9, 23, 127, 2, 0.983
> > > > 11, 0, 10, 23, 127, 2, 0.982
> > > > 12, 0, 11, 23, 127, 2, 0.976
> > > > 13, 0, 12, 23, 127, 2, 0.985
> > > > 14, 0, 13, 23, 127, 2, 0.984
> > > > 15, 0, 14, 23, 127, 2, 0.98
> > > > 16, 0, 15, 23, 127, 2, 0.583
> > > > 17, 0, 16, 23, 127, 2, 0.552
> > > > 18, 0, 17, 23, 127, 2, 0.564
> > > > 19, 0, 18, 23, 127, 2, 0.585
> > > > 20, 0, 19, 23, 127, 2, 0.578
> > > > 21, 0, 20, 23, 127, 2, 0.578
> > > > 22, 0, 21, 23, 127, 2, 0.571
> > > > 23, 0, 22, 23, 127, 2, 0.587
> > > > 24, 0, 23, 23, 127, 2, 0.589
> > > > 25, 0, 24, 23, 127, 2, 0.593
> > > > 26, 0, 25, 23, 127, 2, 0.589
> > > > 27, 0, 26, 23, 127, 2, 0.588
> > > > 28, 0, 27, 23, 127, 2, 0.593
> > > > 29, 0, 28, 23, 127, 2, 0.579
> > > > 30, 0, 29, 23, 127, 2, 0.572
> > > > 31, 0, 30, 23, 127, 2, 0.582
> > > > 32, 0, 31, 23, 127, 2, 0.659
> > > > 2048, 0, 32, 23, 127, 4, 0.822
> > > > 2048, 1, 32, 23, 127, 4, 0.818
> > > > 2048, 0, 64, 23, 127, 4, 0.826
> > > > 2048, 2, 64, 23, 127, 4, 0.824
> > > > 2048, 0, 128, 23, 127, 4, 0.833
> > > > 2048, 3, 128, 23, 127, 4, 0.831
> > > > 2048, 0, 256, 23, 127, 4, 0.826
> > > > 2048, 4, 256, 23, 127, 4, 0.831
> > > > 2048, 0, 512, 23, 127, 4, 0.834
> > > > 2048, 5, 512, 23, 127, 4, 0.83
> > > > 2048, 0, 1024, 23, 127, 4, 0.836
> > > > 2048, 6, 1024, 23, 127, 4, 0.844
> > > > 2048, 0, 2048, 23, 127, 4, 0.696
> > > > 2048, 7, 2048, 23, 127, 4, 0.704
> > > > 2048, 0, 4096, 23, 127, 4, 0.936
> > > > 2048, 8, 4096, 23, 127, 4, 0.925
> > > > 256, 1, 64, 23, 127, 4, 0.694
> > > > 256, 15, 64, 23, 127, 4, 0.69
> > > > 256, 2, 64, 23, 127, 4, 0.687
> > > > 256, 30, 64, 23, 127, 4, 0.612
> > > > 256, 3, 64, 23, 127, 4, 0.685
> > > > 256, 45, 64, 23, 127, 4, 0.685
> > > > 256, 4, 64, 23, 127, 4, 0.684
> > > > 256, 60, 64, 23, 127, 4, 0.606
> > > > 256, 5, 64, 23, 127, 4, 0.69
> > > > 256, 75, 64, 23, 127, 4, 0.688
> > > > 256, 6, 64, 23, 127, 4, 0.69
> > > > 256, 90, 64, 23, 127, 4, 0.615
> > > > 256, 7, 64, 23, 127, 4, 0.691
> > > > 256, 105, 64, 23, 127, 4, 0.688
> > > > 1, 0, 0, 23, 127, 4, 0.982
> > > > 2, 0, 1, 23, 127, 4, 0.983
> > > > 3, 0, 2, 23, 127, 4, 0.981
> > > > 4, 0, 3, 23, 127, 4, 0.984
> > > > 5, 0, 4, 23, 127, 4, 0.963
> > > > 6, 0, 5, 23, 127, 4, 0.978
> > > > 7, 0, 6, 23, 127, 4, 0.985
> > > > 8, 0, 7, 23, 127, 4, 0.986
> > > > 9, 0, 8, 23, 127, 4, 0.978
> > > > 10, 0, 9, 23, 127, 4, 0.985
> > > > 11, 0, 10, 23, 127, 4, 0.986
> > > > 12, 0, 11, 23, 127, 4, 0.983
> > > > 13, 0, 12, 23, 127, 4, 0.986
> > > > 14, 0, 13, 23, 127, 4, 0.98
> > > > 15, 0, 14, 23, 127, 4, 0.979
> > > > 16, 0, 15, 23, 127, 4, 0.582
> > > > 17, 0, 16, 23, 127, 4, 0.542
> > > > 18, 0, 17, 23, 127, 4, 0.564
> > > > 19, 0, 18, 23, 127, 4, 0.571
> > > > 20, 0, 19, 23, 127, 4, 0.582
> > > > 21, 0, 20, 23, 127, 4, 0.573
> > > > 22, 0, 21, 23, 127, 4, 0.575
> > > > 23, 0, 22, 23, 127, 4, 0.578
> > > > 24, 0, 23, 23, 127, 4, 0.58
> > > > 25, 0, 24, 23, 127, 4, 0.592
> > > > 26, 0, 25, 23, 127, 4, 0.588
> > > > 27, 0, 26, 23, 127, 4, 0.574
> > > > 28, 0, 27, 23, 127, 4, 0.589
> > > > 29, 0, 28, 23, 127, 4, 0.56
> > > > 30, 0, 29, 23, 127, 4, 0.587
> > > > 31, 0, 30, 23, 127, 4, 0.584
> > > > 32, 0, 31, 23, 127, 4, 0.664
> > > > 2048, 0, 32, 23, 127, 8, 0.826
> > > > 2048, 1, 32, 23, 127, 8, 0.821
> > > > 2048, 0, 64, 23, 127, 8, 0.828
> > > > 2048, 2, 64, 23, 127, 8, 0.827
> > > > 2048, 0, 128, 23, 127, 8, 0.833
> > > > 2048, 3, 128, 23, 127, 8, 0.83
> > > > 2048, 0, 256, 23, 127, 8, 0.855
> > > > 2048, 4, 256, 23, 127, 8, 0.849
> > > > 2048, 0, 512, 23, 127, 8, 0.849
> > > > 2048, 5, 512, 23, 127, 8, 0.851
> > > > 2048, 0, 1024, 23, 127, 8, 0.856
> > > > 2048, 6, 1024, 23, 127, 8, 0.862
> > > > 2048, 0, 2048, 23, 127, 8, 0.709
> > > > 2048, 7, 2048, 23, 127, 8, 0.712
> > > > 2048, 0, 4096, 23, 127, 8, 0.702
> > > > 2048, 8, 4096, 23, 127, 8, 0.701
> > > > 256, 1, 64, 23, 127, 8, 0.689
> > > > 256, 15, 64, 23, 127, 8, 0.688
> > > > 256, 2, 64, 23, 127, 8, 0.691
> > > > 256, 30, 64, 23, 127, 8, 0.612
> > > > 256, 3, 64, 23, 127, 8, 0.688
> > > > 256, 45, 64, 23, 127, 8, 0.686
> > > > 256, 4, 64, 23, 127, 8, 0.694
> > > > 256, 60, 64, 23, 127, 8, 0.609
> > > > 256, 5, 64, 23, 127, 8, 0.69
> > > > 256, 75, 64, 23, 127, 8, 0.69
> > > > 256, 6, 64, 23, 127, 8, 0.691
> > > > 256, 90, 64, 23, 127, 8, 0.612
> > > > 256, 7, 64, 23, 127, 8, 0.689
> > > > 256, 105, 64, 23, 127, 8, 0.688
> > > > 1, 0, 0, 23, 127, 8, 0.98
> > > > 2, 0, 1, 23, 127, 8, 0.978
> > > > 3, 0, 2, 23, 127, 8, 0.98
> > > > 4, 0, 3, 23, 127, 8, 0.978
> > > > 5, 0, 4, 23, 127, 8, 0.977
> > > > 6, 0, 5, 23, 127, 8, 0.984
> > > > 7, 0, 6, 23, 127, 8, 0.982
> > > > 8, 0, 7, 23, 127, 8, 0.983
> > > > 9, 0, 8, 23, 127, 8, 0.987
> > > > 10, 0, 9, 23, 127, 8, 0.979
> > > > 11, 0, 10, 23, 127, 8, 0.985
> > > > 12, 0, 11, 23, 127, 8, 0.981
> > > > 13, 0, 12, 23, 127, 8, 0.98
> > > > 14, 0, 13, 23, 127, 8, 0.982
> > > > 15, 0, 14, 23, 127, 8, 0.981
> > > > 16, 0, 15, 23, 127, 8, 0.579
> > > > 17, 0, 16, 23, 127, 8, 0.531
> > > > 18, 0, 17, 23, 127, 8, 0.577
> > > > 19, 0, 18, 23, 127, 8, 0.588
> > > > 20, 0, 19, 23, 127, 8, 0.571
> > > > 21, 0, 20, 23, 127, 8, 0.576
> > > > 22, 0, 21, 23, 127, 8, 0.59
> > > > 23, 0, 22, 23, 127, 8, 0.574
> > > > 24, 0, 23, 23, 127, 8, 0.583
> > > > 25, 0, 24, 23, 127, 8, 0.581
> > > > 26, 0, 25, 23, 127, 8, 0.592
> > > > 27, 0, 26, 23, 127, 8, 0.586
> > > > 28, 0, 27, 23, 127, 8, 0.588
> > > > 29, 0, 28, 23, 127, 8, 0.578
> > > > 30, 0, 29, 23, 127, 8, 0.573
> > > > 31, 0, 30, 23, 127, 8, 0.588
> > > > 32, 0, 31, 23, 127, 8, 0.664
> > > > 2048, 0, 32, 23, 127, 16, 0.825
> > > > 2048, 1, 32, 23, 127, 16, 0.823
> > > > 2048, 0, 64, 23, 127, 16, 0.831
> > > > 2048, 2, 64, 23, 127, 16, 0.822
> > > > 2048, 0, 128, 23, 127, 16, 0.831
> > > > 2048, 3, 128, 23, 127, 16, 0.831
> > > > 2048, 0, 256, 23, 127, 16, 0.849
> > > > 2048, 4, 256, 23, 127, 16, 0.85
> > > > 2048, 0, 512, 23, 127, 16, 0.751
> > > > 2048, 5, 512, 23, 127, 16, 0.75
> > > > 2048, 0, 1024, 23, 127, 16, 0.913
> > > > 2048, 6, 1024, 23, 127, 16, 0.895
> > > > 2048, 0, 2048, 23, 127, 16, 0.736
> > > > 2048, 7, 2048, 23, 127, 16, 0.741
> > > > 2048, 0, 4096, 23, 127, 16, 0.712
> > > > 2048, 8, 4096, 23, 127, 16, 0.711
> > > > 256, 1, 64, 23, 127, 16, 0.758
> > > > 256, 15, 64, 23, 127, 16, 0.692
> > > > 256, 2, 64, 23, 127, 16, 0.692
> > > > 256, 30, 64, 23, 127, 16, 0.613
> > > > 256, 3, 64, 23, 127, 16, 0.69
> > > > 256, 45, 64, 23, 127, 16, 0.687
> > > > 256, 4, 64, 23, 127, 16, 0.69
> > > > 256, 60, 64, 23, 127, 16, 0.604
> > > > 256, 5, 64, 23, 127, 16, 0.687
> > > > 256, 75, 64, 23, 127, 16, 0.687
> > > > 256, 6, 64, 23, 127, 16, 0.69
> > > > 256, 90, 64, 23, 127, 16, 0.61
> > > > 256, 7, 64, 23, 127, 16, 0.69
> > > > 256, 105, 64, 23, 127, 16, 0.685
> > > > 1, 0, 0, 23, 127, 16, 0.981
> > > > 2, 0, 1, 23, 127, 16, 0.985
> > > > 3, 0, 2, 23, 127, 16, 0.985
> > > > 4, 0, 3, 23, 127, 16, 0.981
> > > > 5, 0, 4, 23, 127, 16, 0.979
> > > > 6, 0, 5, 23, 127, 16, 0.986
> > > > 7, 0, 6, 23, 127, 16, 0.986
> > > > 8, 0, 7, 23, 127, 16, 0.982
> > > > 9, 0, 8, 23, 127, 16, 0.982
> > > > 10, 0, 9, 23, 127, 16, 0.98
> > > > 11, 0, 10, 23, 127, 16, 0.983
> > > > 12, 0, 11, 23, 127, 16, 0.982
> > > > 13, 0, 12, 23, 127, 16, 0.982
> > > > 14, 0, 13, 23, 127, 16, 0.982
> > > > 15, 0, 14, 23, 127, 16, 0.982
> > > > 16, 0, 15, 23, 127, 16, 0.582
> > > > 17, 0, 16, 23, 127, 16, 0.542
> > > > 18, 0, 17, 23, 127, 16, 0.554
> > > > 19, 0, 18, 23, 127, 16, 0.562
> > > > 20, 0, 19, 23, 127, 16, 0.587
> > > > 21, 0, 20, 23, 127, 16, 0.584
> > > > 22, 0, 21, 23, 127, 16, 0.587
> > > > 23, 0, 22, 23, 127, 16, 0.594
> > > > 24, 0, 23, 23, 127, 16, 0.581
> > > > 25, 0, 24, 23, 127, 16, 0.577
> > > > 26, 0, 25, 23, 127, 16, 0.588
> > > > 27, 0, 26, 23, 127, 16, 0.589
> > > > 28, 0, 27, 23, 127, 16, 0.596
> > > > 29, 0, 28, 23, 127, 16, 0.591
> > > > 30, 0, 29, 23, 127, 16, 0.585
> > > > 31, 0, 30, 23, 127, 16, 0.59
> > > > 32, 0, 31, 23, 127, 16, 0.669
> > > >
> > > > sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
> > > > sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
> > > > sysdeps/x86_64/strrchr.S | 505 +++++++++++++++---------
> > > > sysdeps/x86_64/wcsrchr.S | 268 +------------
> > > > 4 files changed, 334 insertions(+), 444 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > > index db1b44c23c..866396e947 100644
> > > > --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > > +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > > @@ -17,7 +17,7 @@
> > > > <https://www.gnu.org/licenses/>. */
> > > >
> > > > #if IS_IN (libc)
> > > > -# define strrchr __strrchr_sse2
> > > > +# define STRRCHR __strrchr_sse2
> > > >
> > > > # undef weak_alias
> > > > # define weak_alias(strrchr, rindex)
> > > > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > > index 78d1ca6553..69d2f3cdb1 100644
> > > > --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > > @@ -17,7 +17,6 @@
> > > > <https://www.gnu.org/licenses/>. */
> > > >
> > > > #if IS_IN (libc)
> > > > -# define wcsrchr __wcsrchr_sse2
> > > > +# define STRRCHR __wcsrchr_sse2
> > > > #endif
> > > > -
> > > > #include "../wcsrchr.S"
> > > > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> > > > index 50d886713e..94449ad806 100644
> > > > --- a/sysdeps/x86_64/strrchr.S
> > > > +++ b/sysdeps/x86_64/strrchr.S
> > > > @@ -19,210 +19,355 @@
> > > >
> > > > #include <sysdep.h>
> > > >
> > > > +#ifndef STRRCHR
> > > > +# define STRRCHR strrchr
> > > > +#endif
> > > > +
> > > > +#ifdef USE_AS_WCSRCHR
> > > > +# define PCMPEQ pcmpeqd
> > > > +# define CHAR_SIZE 4
> > > > +# define PMINU pminud
> > > > +#else
> > > > +# define PCMPEQ pcmpeqb
> > > > +# define CHAR_SIZE 1
> > > > +# define PMINU pminub
> > > > +#endif
> > > > +
> > > > +#define PAGE_SIZE 4096
> > > > +#define VEC_SIZE 16
> > > > +
> > > > .text
> > > > -ENTRY (strrchr)
> > > > - movd %esi, %xmm1
> > > > +ENTRY(STRRCHR)
> > > > + movd %esi, %xmm0
> > > > movq %rdi, %rax
> > > > - andl $4095, %eax
> > > > - punpcklbw %xmm1, %xmm1
> > > > - cmpq $4032, %rax
> > > > - punpcklwd %xmm1, %xmm1
> > > > - pshufd $0, %xmm1, %xmm1
> > > > + andl $(PAGE_SIZE - 1), %eax
> > > > +#ifndef USE_AS_WCSRCHR
> > > > + punpcklbw %xmm0, %xmm0
> > > > + punpcklwd %xmm0, %xmm0
> > > > +#endif
> > > > + pshufd $0, %xmm0, %xmm0
> > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > > > ja L(cross_page)
> > > > - movdqu (%rdi), %xmm0
> > > > +
> > > > +L(cross_page_continue):
> > > > + movups (%rdi), %xmm1
> > > > pxor %xmm2, %xmm2
> > > > - movdqa %xmm0, %xmm3
> > > > - pcmpeqb %xmm1, %xmm0
> > > > - pcmpeqb %xmm2, %xmm3
> > > > - pmovmskb %xmm0, %ecx
> > > > - pmovmskb %xmm3, %edx
> > > > - testq %rdx, %rdx
> > > > - je L(next_48_bytes)
> > > > - leaq -1(%rdx), %rax
> > > > - xorq %rdx, %rax
> > > > - andq %rcx, %rax
> > > > - je L(exit)
> > > > - bsrq %rax, %rax
> > > > + PCMPEQ %xmm1, %xmm2
> > > > + pmovmskb %xmm2, %ecx
> > > > + testl %ecx, %ecx
> > > > + jz L(aligned_more)
> > > > +
> > > > + PCMPEQ %xmm0, %xmm1
> > > > + pmovmskb %xmm1, %eax
> > > > + leal -1(%rcx), %edx
> > > > + xorl %edx, %ecx
> > > > + andl %ecx, %eax
> > > > + jz L(ret0)
> > > > + bsrl %eax, %eax
> > > > addq %rdi, %rax
> > > > + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > > > + search CHAR is zero we are correct. Either way `andq
> > > > + -CHAR_SIZE, %rax` gets the correct result. */
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > +L(ret0):
> > > > ret
> > > >
> > > > + /* Returns for first vec x1/x2 have hard coded backward search
> > > > + path for earlier matches. */
> > > > .p2align 4
> > > > -L(next_48_bytes):
> > > > - movdqu 16(%rdi), %xmm4
> > > > - movdqa %xmm4, %xmm5
> > > > - movdqu 32(%rdi), %xmm3
> > > > - pcmpeqb %xmm1, %xmm4
> > > > - pcmpeqb %xmm2, %xmm5
> > > > - movdqu 48(%rdi), %xmm0
> > > > - pmovmskb %xmm5, %edx
> > > > - movdqa %xmm3, %xmm5
> > > > - pcmpeqb %xmm1, %xmm3
> > > > - pcmpeqb %xmm2, %xmm5
> > > > - pcmpeqb %xmm0, %xmm2
> > > > - salq $16, %rdx
> > > > - pmovmskb %xmm3, %r8d
> > > > - pmovmskb %xmm5, %eax
> > > > - pmovmskb %xmm2, %esi
> > > > - salq $32, %r8
> > > > - salq $32, %rax
> > > > - pcmpeqb %xmm1, %xmm0
> > > > - orq %rdx, %rax
> > > > - movq %rsi, %rdx
> > > > - pmovmskb %xmm4, %esi
> > > > - salq $48, %rdx
> > > > - salq $16, %rsi
> > > > - orq %r8, %rsi
> > > > - orq %rcx, %rsi
> > > > - pmovmskb %xmm0, %ecx
> > > > - salq $48, %rcx
> > > > - orq %rcx, %rsi
> > > > - orq %rdx, %rax
> > > > - je L(loop_header2)
> > > > - leaq -1(%rax), %rcx
> > > > - xorq %rax, %rcx
> > > > - andq %rcx, %rsi
> > > > - je L(exit)
> > > > - bsrq %rsi, %rsi
> > > > - leaq (%rdi,%rsi), %rax
> > > > +L(first_vec_x0_test):
> > > > + PCMPEQ %xmm0, %xmm1
> > > > + pmovmskb %xmm1, %eax
> > > > + testl %eax, %eax
> > > > + jz L(ret0)
> > > > + bsrl %eax, %eax
> > > > + addq %r8, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > ret
> > > >
> > > > .p2align 4
> > > > -L(loop_header2):
> > > > - testq %rsi, %rsi
> > > > - movq %rdi, %rcx
> > > > - je L(no_c_found)
> > > > -L(loop_header):
> > > > - addq $64, %rdi
> > > > - pxor %xmm7, %xmm7
> > > > - andq $-64, %rdi
> > > > - jmp L(loop_entry)
> > > > +L(first_vec_x1):
> > > > + PCMPEQ %xmm0, %xmm2
> > > > + pmovmskb %xmm2, %eax
> > > > + leal -1(%rcx), %edx
> > > > + xorl %edx, %ecx
> > > > + andl %ecx, %eax
> > > > + jz L(first_vec_x0_test)
> > > > + bsrl %eax, %eax
> > > > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > + ret
> > > >
> > > > .p2align 4
> > > > -L(loop64):
> > > > - testq %rdx, %rdx
> > > > - cmovne %rdx, %rsi
> > > > - cmovne %rdi, %rcx
> > > > - addq $64, %rdi
> > > > -L(loop_entry):
> > > > - movdqa 32(%rdi), %xmm3
> > > > - pxor %xmm6, %xmm6
> > > > - movdqa 48(%rdi), %xmm2
> > > > - movdqa %xmm3, %xmm0
> > > > - movdqa 16(%rdi), %xmm4
> > > > - pminub %xmm2, %xmm0
> > > > - movdqa (%rdi), %xmm5
> > > > - pminub %xmm4, %xmm0
> > > > - pminub %xmm5, %xmm0
> > > > - pcmpeqb %xmm7, %xmm0
> > > > - pmovmskb %xmm0, %eax
> > > > - movdqa %xmm5, %xmm0
> > > > - pcmpeqb %xmm1, %xmm0
> > > > - pmovmskb %xmm0, %r9d
> > > > - movdqa %xmm4, %xmm0
> > > > - pcmpeqb %xmm1, %xmm0
> > > > - pmovmskb %xmm0, %edx
> > > > - movdqa %xmm3, %xmm0
> > > > - pcmpeqb %xmm1, %xmm0
> > > > - salq $16, %rdx
> > > > - pmovmskb %xmm0, %r10d
> > > > - movdqa %xmm2, %xmm0
> > > > - pcmpeqb %xmm1, %xmm0
> > > > - salq $32, %r10
> > > > - orq %r10, %rdx
> > > > - pmovmskb %xmm0, %r8d
> > > > - orq %r9, %rdx
> > > > - salq $48, %r8
> > > > - orq %r8, %rdx
> > > > +L(first_vec_x1_test):
> > > > + PCMPEQ %xmm0, %xmm2
> > > > + pmovmskb %xmm2, %eax
> > > > testl %eax, %eax
> > > > - je L(loop64)
> > > > - pcmpeqb %xmm6, %xmm4
> > > > - pcmpeqb %xmm6, %xmm3
> > > > - pcmpeqb %xmm6, %xmm5
> > > > - pmovmskb %xmm4, %eax
> > > > - pmovmskb %xmm3, %r10d
> > > > - pcmpeqb %xmm6, %xmm2
> > > > - pmovmskb %xmm5, %r9d
> > > > - salq $32, %r10
> > > > - salq $16, %rax
> > > > - pmovmskb %xmm2, %r8d
> > > > - orq %r10, %rax
> > > > - orq %r9, %rax
> > > > - salq $48, %r8
> > > > - orq %r8, %rax
> > > > - leaq -1(%rax), %r8
> > > > - xorq %rax, %r8
> > > > - andq %r8, %rdx
> > > > - cmovne %rdi, %rcx
> > > > - cmovne %rdx, %rsi
> > > > - bsrq %rsi, %rsi
> > > > - leaq (%rcx,%rsi), %rax
> > > > + jz L(first_vec_x0_test)
> > > > + bsrl %eax, %eax
> > > > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > + ret
> > > > +
> > > > + .p2align 4
> > > > +L(first_vec_x2):
> > > > + PCMPEQ %xmm0, %xmm3
> > > > + pmovmskb %xmm3, %eax
> > > > + leal -1(%rcx), %edx
> > > > + xorl %edx, %ecx
> > > > + andl %ecx, %eax
> > > > + jz L(first_vec_x1_test)
> > > > + bsrl %eax, %eax
> > > > + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > + ret
> > > > +
> > > > + .p2align 4
> > > > +L(aligned_more):
> > > > + /* Save original pointer if match was in VEC 0. */
> > > > + movq %rdi, %r8
> > > > + andq $-VEC_SIZE, %rdi
> > > > +
> > > > + movaps VEC_SIZE(%rdi), %xmm2
> > > > + pxor %xmm3, %xmm3
> > > > + PCMPEQ %xmm2, %xmm3
> > > > + pmovmskb %xmm3, %ecx
> > > > + testl %ecx, %ecx
> > > > + jnz L(first_vec_x1)
> > > > +
> > > > + movaps (VEC_SIZE * 2)(%rdi), %xmm3
> > > > + pxor %xmm4, %xmm4
> > > > + PCMPEQ %xmm3, %xmm4
> > > > + pmovmskb %xmm4, %ecx
> > > > + testl %ecx, %ecx
> > > > + jnz L(first_vec_x2)
> > > > +
> > > > + addq $VEC_SIZE, %rdi
> > > > + /* Save pointer again before realigning. */
> > > > + movq %rdi, %rsi
> > > > + andq $-(VEC_SIZE * 2), %rdi
> > > > + .p2align 4
> > > > +L(first_loop):
> > > > + /* Do 2x VEC at a time. */
> > > > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > > > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > > > +       /* SSE2 has no pminud, so fall back to the two-compare path.  */
> > > > +#ifdef NO_PMINU
> > >
> > > Do we really need SSE4.1 wcsrchr? I think we should focus on AVX2 and
> > > above.
> >
> > It seems like freebie performance that can make a difference in the loop
> > cases. (see the SSE4.1 commit for numbers).
>
> But these numbers are on Tiger Lake. I think we should continue to
> improve SSE2
> version and optimize AVX2/AVX512. I don't think we should increase code sizes
> for SSE4.
Fair enough. Removed SSE4 version but added comment suggesting it as an
optimization if the need arises.
>
> > Imo there is little harm but if you feel strongly I'll drop. (In V2 will
> > change the .text section for SSE4_1).
> >
> > What do you think?
> > >
> > > > + movaps %xmm5, %xmm6
> > > > + pxor %xmm8, %xmm8
> > > > +
> > > > + PCMPEQ %xmm8, %xmm5
> > > > + PCMPEQ %xmm4, %xmm8
> > > > + por %xmm5, %xmm8
> > > > +#else
> > > > + movaps %xmm5, %xmm6
> > > > + PMINU %xmm4, %xmm5
> > > > +#endif
> > > > +
> > > > + movaps %xmm4, %xmm9
> > > > + PCMPEQ %xmm0, %xmm4
> > > > + PCMPEQ %xmm0, %xmm6
> > > > + movaps %xmm6, %xmm7
> > > > + por %xmm4, %xmm6
> > > > +#ifndef NO_PMINU
> > > > + pxor %xmm8, %xmm8
> > > > + PCMPEQ %xmm5, %xmm8
> > > > +#endif
> > > > + pmovmskb %xmm8, %ecx
> > > > + pmovmskb %xmm6, %eax
> > > > +
> > > > + addq $(VEC_SIZE * 2), %rdi
> > > > + /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> > > > + macro-fuse with `jz`. */
> > > > + addl %ecx, %eax
> > > > + jz L(first_loop)
> > > > +
> > > > + /* Check if there is zero match. */
> > > > + testl %ecx, %ecx
> > > > + jz L(second_loop_match)
> > > > +
> > > > + /* Check if there was a match in last iteration. */
> > > > + subl %ecx, %eax
> > > > + jnz L(new_match)
> > > > +
> > > > +L(first_loop_old_match):
> > > > + PCMPEQ %xmm0, %xmm2
> > > > + PCMPEQ %xmm0, %xmm3
> > > > + pmovmskb %xmm2, %ecx
> > > > + pmovmskb %xmm3, %eax
> > > > + addl %eax, %ecx
> > > > + jz L(first_vec_x0_test)
> > > > + /* NB: We could move this shift to before the branch and save a
> > > > + bit of code size / performance on the fall through. The
> > > > + branch leads to the null case which generally seems hotter
> > > > + than char in first 3x VEC. */
> > > > + sall $16, %eax
> > > > + orl %ecx, %eax
> > > > +
> > > > + bsrl %eax, %eax
> > > > + addq %rsi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > + ret
> > > > +
> > > > + .p2align 4
> > > > +L(new_match):
> > > > + pxor %xmm6, %xmm6
> > > > + PCMPEQ %xmm9, %xmm6
> > > > + pmovmskb %xmm6, %eax
> > > > + sall $16, %ecx
> > > > + orl %eax, %ecx
> > > > +
> > > > +       /* We can't reuse either of the old comparisons because we mask
> > > > +          off zeros after the first zero (instead of using the full
> > > > +          comparison), so we can't guarantee no interference between a
> > > > +          match after the end of the string and a valid match.  */
> > > > + pmovmskb %xmm4, %eax
> > > > + pmovmskb %xmm7, %edx
> > > > + sall $16, %edx
> > > > + orl %edx, %eax
> > > > +
> > > > + leal -1(%ecx), %edx
> > > > + xorl %edx, %ecx
> > > > + andl %ecx, %eax
> > > > + jz L(first_loop_old_match)
> > > > + bsrl %eax, %eax
> > > > + addq %rdi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > ret
> > > >
> > > > + /* Save minimum state for getting most recent match. We can
> > > > + throw out all previous work. */
> > > > .p2align 4
> > > > -L(no_c_found):
> > > > - movl $1, %esi
> > > > - xorl %ecx, %ecx
> > > > - jmp L(loop_header)
> > > > +L(second_loop_match):
> > > > + movq %rdi, %rsi
> > > > + movaps %xmm4, %xmm2
> > > > + movaps %xmm7, %xmm3
> > > >
> > > > .p2align 4
> > > > -L(exit):
> > > > - xorl %eax, %eax
> > > > +L(second_loop):
> > > > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > > > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > > > +#ifdef NO_PMINU
> > > > + movaps %xmm5, %xmm6
> > > > + pxor %xmm8, %xmm8
> > > > +
> > > > + PCMPEQ %xmm8, %xmm5
> > > > + PCMPEQ %xmm4, %xmm8
> > > > + por %xmm5, %xmm8
> > > > +#else
> > > > + movaps %xmm5, %xmm6
> > > > + PMINU %xmm4, %xmm5
> > > > +#endif
> > > > +
> > > > + movaps %xmm4, %xmm9
> > > > + PCMPEQ %xmm0, %xmm4
> > > > + PCMPEQ %xmm0, %xmm6
> > > > + movaps %xmm6, %xmm7
> > > > + por %xmm4, %xmm6
> > > > +#ifndef NO_PMINU
> > > > + pxor %xmm8, %xmm8
> > > > + PCMPEQ %xmm5, %xmm8
> > > > +#endif
> > > > +
> > > > + pmovmskb %xmm8, %ecx
> > > > + pmovmskb %xmm6, %eax
> > > > +
> > > > + addq $(VEC_SIZE * 2), %rdi
> > > > +       /* Either null terminator or new occurrence of CHAR.  */
> > > > + addl %ecx, %eax
> > > > + jz L(second_loop)
> > > > +
> > > > +       /* No null terminator, so it must be a new occurrence of CHAR.  */
> > > > + testl %ecx, %ecx
> > > > + jz L(second_loop_match)
> > > > +
> > > > +
> > > > + subl %ecx, %eax
> > > > + jnz L(second_loop_new_match)
> > > > +
> > > > +L(second_loop_old_match):
> > > > + pmovmskb %xmm2, %ecx
> > > > + pmovmskb %xmm3, %eax
> > > > + sall $16, %eax
> > > > + orl %ecx, %eax
> > > > + bsrl %eax, %eax
> > > > + addq %rsi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > ret
> > > >
> > > > .p2align 4
> > > > +L(second_loop_new_match):
> > > > + pxor %xmm6, %xmm6
> > > > + PCMPEQ %xmm9, %xmm6
> > > > + pmovmskb %xmm6, %eax
> > > > + sall $16, %ecx
> > > > + orl %eax, %ecx
> > > > +
> > > > +       /* We can't reuse either of the old comparisons because we mask
> > > > +          off zeros after the first zero (instead of using the full
> > > > +          comparison), so we can't guarantee no interference between a
> > > > +          match after the end of the string and a valid match.  */
> > > > + pmovmskb %xmm4, %eax
> > > > + pmovmskb %xmm7, %edx
> > > > + sall $16, %edx
> > > > + orl %edx, %eax
> > > > +
> > > > + leal -1(%ecx), %edx
> > > > + xorl %edx, %ecx
> > > > + andl %ecx, %eax
> > > > + jz L(second_loop_old_match)
> > > > + bsrl %eax, %eax
> > > > + addq %rdi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > + ret
> > > > +
> > > > + .p2align 4,, 4
> > > > L(cross_page):
> > > > - movq %rdi, %rax
> > > > - pxor %xmm0, %xmm0
> > > > - andq $-64, %rax
> > > > - movdqu (%rax), %xmm5
> > > > - movdqa %xmm5, %xmm6
> > > > - movdqu 16(%rax), %xmm4
> > > > - pcmpeqb %xmm1, %xmm5
> > > > - pcmpeqb %xmm0, %xmm6
> > > > - movdqu 32(%rax), %xmm3
> > > > - pmovmskb %xmm6, %esi
> > > > - movdqa %xmm4, %xmm6
> > > > - movdqu 48(%rax), %xmm2
> > > > - pcmpeqb %xmm1, %xmm4
> > > > - pcmpeqb %xmm0, %xmm6
> > > > - pmovmskb %xmm6, %edx
> > > > - movdqa %xmm3, %xmm6
> > > > - pcmpeqb %xmm1, %xmm3
> > > > - pcmpeqb %xmm0, %xmm6
> > > > - pcmpeqb %xmm2, %xmm0
> > > > - salq $16, %rdx
> > > > - pmovmskb %xmm3, %r9d
> > > > - pmovmskb %xmm6, %r8d
> > > > - pmovmskb %xmm0, %ecx
> > > > - salq $32, %r9
> > > > - salq $32, %r8
> > > > - pcmpeqb %xmm1, %xmm2
> > > > - orq %r8, %rdx
> > > > - salq $48, %rcx
> > > > - pmovmskb %xmm5, %r8d
> > > > - orq %rsi, %rdx
> > > > - pmovmskb %xmm4, %esi
> > > > - orq %rcx, %rdx
> > > > - pmovmskb %xmm2, %ecx
> > > > - salq $16, %rsi
> > > > - salq $48, %rcx
> > > > - orq %r9, %rsi
> > > > - orq %r8, %rsi
> > > > - orq %rcx, %rsi
> > > > + movq %rdi, %rsi
> > > > + andq $-VEC_SIZE, %rsi
> > > > + movaps (%rsi), %xmm1
> > > > + pxor %xmm2, %xmm2
> > > > + PCMPEQ %xmm1, %xmm2
> > > > + pmovmskb %xmm2, %edx
> > > > movl %edi, %ecx
> > > > - subl %eax, %ecx
> > > > - shrq %cl, %rdx
> > > > - shrq %cl, %rsi
> > > > - testq %rdx, %rdx
> > > > - je L(loop_header2)
> > > > - leaq -1(%rdx), %rax
> > > > - xorq %rdx, %rax
> > > > - andq %rax, %rsi
> > > > - je L(exit)
> > > > - bsrq %rsi, %rax
> > > > + andl $(VEC_SIZE - 1), %ecx
> > > > + sarl %cl, %edx
> > > > + jz L(cross_page_continue)
> > > > + PCMPEQ %xmm0, %xmm1
> > > > + pmovmskb %xmm1, %eax
> > > > + sarl %cl, %eax
> > > > + leal -1(%rdx), %ecx
> > > > + xorl %edx, %ecx
> > > > + andl %ecx, %eax
> > > > + jz L(ret1)
> > > > + bsrl %eax, %eax
> > > > addq %rdi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > +L(ret1):
> > > > ret
> > > > -END (strrchr)
> > > > +END(STRRCHR)
> > > >
> > > > -weak_alias (strrchr, rindex)
> > > > -libc_hidden_builtin_def (strrchr)
> > > > +#ifndef USE_AS_WCSRCHR
> > > > + weak_alias (STRRCHR, rindex)
> > > > + libc_hidden_builtin_def (STRRCHR)
> > > > +#endif
> > > > diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> > > > index 61552954de..2b80efc5ef 100644
> > > > --- a/sysdeps/x86_64/wcsrchr.S
> > > > +++ b/sysdeps/x86_64/wcsrchr.S
> > > > @@ -1,4 +1,4 @@
> > > > -/* wcsrchr with SSSE3
> > > > +/* wcsrchr optimized with SSE2.
> > > > Copyright (C) 2011-2022 Free Software Foundation, Inc.
> > > > This file is part of the GNU C Library.
> > > >
> > > > @@ -16,266 +16,12 @@
> > > > License along with the GNU C Library; if not, see
> > > > <https://www.gnu.org/licenses/>. */
> > > >
> > > > -#include <sysdep.h>
> > > >
> > > > - .text
> > > > -ENTRY (wcsrchr)
> > > > +#define USE_AS_WCSRCHR 1
> > > > +#define NO_PMINU 1
> > > >
> > > > - movd %rsi, %xmm1
> > > > - mov %rdi, %rcx
> > > > - punpckldq %xmm1, %xmm1
> > > > - pxor %xmm2, %xmm2
> > > > - punpckldq %xmm1, %xmm1
> > > > - and $63, %rcx
> > > > - cmp $48, %rcx
> > > > - ja L(crosscache)
> > > > +#ifndef STRRCHR
> > > > +# define STRRCHR wcsrchr
> > > > +#endif
> > > >
> > > > - movdqu (%rdi), %xmm0
> > > > - pcmpeqd %xmm0, %xmm2
> > > > - pcmpeqd %xmm1, %xmm0
> > > > - pmovmskb %xmm2, %rcx
> > > > - pmovmskb %xmm0, %rax
> > > > - add $16, %rdi
> > > > -
> > > > - test %rax, %rax
> > > > - jnz L(unaligned_match1)
> > > > -
> > > > - test %rcx, %rcx
> > > > - jnz L(return_null)
> > > > -
> > > > - and $-16, %rdi
> > > > - xor %r8, %r8
> > > > - jmp L(loop)
> > > > -
> > > > - .p2align 4
> > > > -L(unaligned_match1):
> > > > - test %rcx, %rcx
> > > > - jnz L(prolog_find_zero_1)
> > > > -
> > > > - mov %rax, %r8
> > > > - mov %rdi, %rsi
> > > > - and $-16, %rdi
> > > > - jmp L(loop)
> > > > -
> > > > - .p2align 4
> > > > -L(crosscache):
> > > > - and $15, %rcx
> > > > - and $-16, %rdi
> > > > - pxor %xmm3, %xmm3
> > > > - movdqa (%rdi), %xmm0
> > > > - pcmpeqd %xmm0, %xmm3
> > > > - pcmpeqd %xmm1, %xmm0
> > > > - pmovmskb %xmm3, %rdx
> > > > - pmovmskb %xmm0, %rax
> > > > - shr %cl, %rdx
> > > > - shr %cl, %rax
> > > > - add $16, %rdi
> > > > -
> > > > - test %rax, %rax
> > > > - jnz L(unaligned_match)
> > > > -
> > > > - test %rdx, %rdx
> > > > - jnz L(return_null)
> > > > -
> > > > - xor %r8, %r8
> > > > - jmp L(loop)
> > > > -
> > > > - .p2align 4
> > > > -L(unaligned_match):
> > > > - test %rdx, %rdx
> > > > - jnz L(prolog_find_zero)
> > > > -
> > > > - mov %rax, %r8
> > > > - lea (%rdi, %rcx), %rsi
> > > > -
> > > > -/* Loop start on aligned string. */
> > > > - .p2align 4
> > > > -L(loop):
> > > > - movdqa (%rdi), %xmm0
> > > > - pcmpeqd %xmm0, %xmm2
> > > > - add $16, %rdi
> > > > - pcmpeqd %xmm1, %xmm0
> > > > - pmovmskb %xmm2, %rcx
> > > > - pmovmskb %xmm0, %rax
> > > > - or %rax, %rcx
> > > > - jnz L(matches)
> > > > -
> > > > - movdqa (%rdi), %xmm3
> > > > - pcmpeqd %xmm3, %xmm2
> > > > - add $16, %rdi
> > > > - pcmpeqd %xmm1, %xmm3
> > > > - pmovmskb %xmm2, %rcx
> > > > - pmovmskb %xmm3, %rax
> > > > - or %rax, %rcx
> > > > - jnz L(matches)
> > > > -
> > > > - movdqa (%rdi), %xmm4
> > > > - pcmpeqd %xmm4, %xmm2
> > > > - add $16, %rdi
> > > > - pcmpeqd %xmm1, %xmm4
> > > > - pmovmskb %xmm2, %rcx
> > > > - pmovmskb %xmm4, %rax
> > > > - or %rax, %rcx
> > > > - jnz L(matches)
> > > > -
> > > > - movdqa (%rdi), %xmm5
> > > > - pcmpeqd %xmm5, %xmm2
> > > > - add $16, %rdi
> > > > - pcmpeqd %xmm1, %xmm5
> > > > - pmovmskb %xmm2, %rcx
> > > > - pmovmskb %xmm5, %rax
> > > > - or %rax, %rcx
> > > > - jz L(loop)
> > > > -
> > > > - .p2align 4
> > > > -L(matches):
> > > > - test %rax, %rax
> > > > - jnz L(match)
> > > > -L(return_value):
> > > > - test %r8, %r8
> > > > - jz L(return_null)
> > > > - mov %r8, %rax
> > > > - mov %rsi, %rdi
> > > > -
> > > > - test $15 << 4, %ah
> > > > - jnz L(match_fourth_wchar)
> > > > - test %ah, %ah
> > > > - jnz L(match_third_wchar)
> > > > - test $15 << 4, %al
> > > > - jnz L(match_second_wchar)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(match):
> > > > - pmovmskb %xmm2, %rcx
> > > > - test %rcx, %rcx
> > > > - jnz L(find_zero)
> > > > - mov %rax, %r8
> > > > - mov %rdi, %rsi
> > > > - jmp L(loop)
> > > > -
> > > > - .p2align 4
> > > > -L(find_zero):
> > > > - test $15, %cl
> > > > - jnz L(find_zero_in_first_wchar)
> > > > - test %cl, %cl
> > > > - jnz L(find_zero_in_second_wchar)
> > > > - test $15, %ch
> > > > - jnz L(find_zero_in_third_wchar)
> > > > -
> > > > - and $1 << 13 - 1, %rax
> > > > - jz L(return_value)
> > > > -
> > > > - test $15 << 4, %ah
> > > > - jnz L(match_fourth_wchar)
> > > > - test %ah, %ah
> > > > - jnz L(match_third_wchar)
> > > > - test $15 << 4, %al
> > > > - jnz L(match_second_wchar)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(find_zero_in_first_wchar):
> > > > - test $1, %rax
> > > > - jz L(return_value)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(find_zero_in_second_wchar):
> > > > - and $1 << 5 - 1, %rax
> > > > - jz L(return_value)
> > > > -
> > > > - test $15 << 4, %al
> > > > - jnz L(match_second_wchar)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(find_zero_in_third_wchar):
> > > > - and $1 << 9 - 1, %rax
> > > > - jz L(return_value)
> > > > -
> > > > - test %ah, %ah
> > > > - jnz L(match_third_wchar)
> > > > - test $15 << 4, %al
> > > > - jnz L(match_second_wchar)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(prolog_find_zero):
> > > > - add %rcx, %rdi
> > > > - mov %rdx, %rcx
> > > > -L(prolog_find_zero_1):
> > > > - test $15, %cl
> > > > - jnz L(prolog_find_zero_in_first_wchar)
> > > > - test %cl, %cl
> > > > - jnz L(prolog_find_zero_in_second_wchar)
> > > > - test $15, %ch
> > > > - jnz L(prolog_find_zero_in_third_wchar)
> > > > -
> > > > - and $1 << 13 - 1, %rax
> > > > - jz L(return_null)
> > > > -
> > > > - test $15 << 4, %ah
> > > > - jnz L(match_fourth_wchar)
> > > > - test %ah, %ah
> > > > - jnz L(match_third_wchar)
> > > > - test $15 << 4, %al
> > > > - jnz L(match_second_wchar)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(prolog_find_zero_in_first_wchar):
> > > > - test $1, %rax
> > > > - jz L(return_null)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(prolog_find_zero_in_second_wchar):
> > > > - and $1 << 5 - 1, %rax
> > > > - jz L(return_null)
> > > > -
> > > > - test $15 << 4, %al
> > > > - jnz L(match_second_wchar)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(prolog_find_zero_in_third_wchar):
> > > > - and $1 << 9 - 1, %rax
> > > > - jz L(return_null)
> > > > -
> > > > - test %ah, %ah
> > > > - jnz L(match_third_wchar)
> > > > - test $15 << 4, %al
> > > > - jnz L(match_second_wchar)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(match_second_wchar):
> > > > - lea -12(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(match_third_wchar):
> > > > - lea -8(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(match_fourth_wchar):
> > > > - lea -4(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(return_null):
> > > > - xor %rax, %rax
> > > > - ret
> > > > -
> > > > -END (wcsrchr)
> > > > +#include "../strrchr.S"
> > > > --
> > > > 2.25.1
> > > >
> > >
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2
2022-04-21 22:22 ` [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
@ 2022-04-21 23:46 ` H.J. Lu
2022-04-22 1:54 ` Noah Goldstein
0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-21 23:46 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 21, 2022 at 3:22 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code unrolls the main loop slightly without adding too much
> overhead and minimizes the comparisons for the search CHAR.
>
> Geometric Mean of all benchmarks New / Old: 0.741
> See email for all results.
>
> Full xcheck passes on x86_64 with and without multiarch enabled.
> ---
> sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
> sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
> sysdeps/x86_64/strrchr.S | 510 +++++++++++++++---------
> sysdeps/x86_64/wcsrchr.S | 268 +------------
> 4 files changed, 339 insertions(+), 444 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> index db1b44c23c..866396e947 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> @@ -17,7 +17,7 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define strrchr __strrchr_sse2
> +# define STRRCHR __strrchr_sse2
>
> # undef weak_alias
> # define weak_alias(strrchr, rindex)
> diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> index 78d1ca6553..69d2f3cdb1 100644
> --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> @@ -17,7 +17,6 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define wcsrchr __wcsrchr_sse2
> +# define STRRCHR __wcsrchr_sse2
> #endif
> -
> #include "../wcsrchr.S"
> diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> index 50d886713e..6efb25c880 100644
> --- a/sysdeps/x86_64/strrchr.S
> +++ b/sysdeps/x86_64/strrchr.S
> @@ -19,210 +19,360 @@
>
> #include <sysdep.h>
>
> +#ifndef STRRCHR
> +# define STRRCHR strrchr
> +#endif
> +
> +#ifdef USE_AS_WCSRCHR
> +# define PCMPEQ pcmpeqd
> +# define CHAR_SIZE 4
> +# define PMINU pminud
> +#else
> +# define PCMPEQ pcmpeqb
> +# define CHAR_SIZE 1
> +# define PMINU pminub
> +#endif
> +
> +#define PAGE_SIZE 4096
> +#define VEC_SIZE 16
> +
> .text
> -ENTRY (strrchr)
> - movd %esi, %xmm1
> +ENTRY(STRRCHR)
> + movd %esi, %xmm0
> movq %rdi, %rax
> - andl $4095, %eax
> - punpcklbw %xmm1, %xmm1
> - cmpq $4032, %rax
> - punpcklwd %xmm1, %xmm1
> - pshufd $0, %xmm1, %xmm1
> + andl $(PAGE_SIZE - 1), %eax
> +#ifndef USE_AS_WCSRCHR
> + punpcklbw %xmm0, %xmm0
> + punpcklwd %xmm0, %xmm0
> +#endif
> + pshufd $0, %xmm0, %xmm0
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> ja L(cross_page)
> - movdqu (%rdi), %xmm0
> +
> +L(cross_page_continue):
> + movups (%rdi), %xmm1
> pxor %xmm2, %xmm2
> - movdqa %xmm0, %xmm3
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm3
> - pmovmskb %xmm0, %ecx
> - pmovmskb %xmm3, %edx
> - testq %rdx, %rdx
> - je L(next_48_bytes)
> - leaq -1(%rdx), %rax
> - xorq %rdx, %rax
> - andq %rcx, %rax
> - je L(exit)
> - bsrq %rax, %rax
> + PCMPEQ %xmm1, %xmm2
> + pmovmskb %xmm2, %ecx
> + testl %ecx, %ecx
> + jz L(aligned_more)
> +
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(ret0)
> + bsrl %eax, %eax
> addq %rdi, %rax
> + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> + search CHAR is zero we are correct. Either way `andq
> + -CHAR_SIZE, %rax` gets the correct result. */
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> +L(ret0):
> ret
>
> + /* Returns for first vec x1/x2 have hard coded backward search
> + path for earlier matches. */
> .p2align 4
> -L(next_48_bytes):
> - movdqu 16(%rdi), %xmm4
> - movdqa %xmm4, %xmm5
> - movdqu 32(%rdi), %xmm3
> - pcmpeqb %xmm1, %xmm4
> - pcmpeqb %xmm2, %xmm5
> - movdqu 48(%rdi), %xmm0
> - pmovmskb %xmm5, %edx
> - movdqa %xmm3, %xmm5
> - pcmpeqb %xmm1, %xmm3
> - pcmpeqb %xmm2, %xmm5
> - pcmpeqb %xmm0, %xmm2
> - salq $16, %rdx
> - pmovmskb %xmm3, %r8d
> - pmovmskb %xmm5, %eax
> - pmovmskb %xmm2, %esi
> - salq $32, %r8
> - salq $32, %rax
> - pcmpeqb %xmm1, %xmm0
> - orq %rdx, %rax
> - movq %rsi, %rdx
> - pmovmskb %xmm4, %esi
> - salq $48, %rdx
> - salq $16, %rsi
> - orq %r8, %rsi
> - orq %rcx, %rsi
> - pmovmskb %xmm0, %ecx
> - salq $48, %rcx
> - orq %rcx, %rsi
> - orq %rdx, %rax
> - je L(loop_header2)
> - leaq -1(%rax), %rcx
> - xorq %rax, %rcx
> - andq %rcx, %rsi
> - je L(exit)
> - bsrq %rsi, %rsi
> - leaq (%rdi,%rsi), %rax
> +L(first_vec_x0_test):
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + testl %eax, %eax
> + jz L(ret0)
> + bsrl %eax, %eax
> + addq %r8, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> ret
>
> .p2align 4
> -L(loop_header2):
> - testq %rsi, %rsi
> - movq %rdi, %rcx
> - je L(no_c_found)
> -L(loop_header):
> - addq $64, %rdi
> - pxor %xmm7, %xmm7
> - andq $-64, %rdi
> - jmp L(loop_entry)
> +L(first_vec_x1):
> + PCMPEQ %xmm0, %xmm2
> + pmovmskb %xmm2, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_vec_x0_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
>
> .p2align 4
> -L(loop64):
> - testq %rdx, %rdx
> - cmovne %rdx, %rsi
> - cmovne %rdi, %rcx
> - addq $64, %rdi
> -L(loop_entry):
> - movdqa 32(%rdi), %xmm3
> - pxor %xmm6, %xmm6
> - movdqa 48(%rdi), %xmm2
> - movdqa %xmm3, %xmm0
> - movdqa 16(%rdi), %xmm4
> - pminub %xmm2, %xmm0
> - movdqa (%rdi), %xmm5
> - pminub %xmm4, %xmm0
> - pminub %xmm5, %xmm0
> - pcmpeqb %xmm7, %xmm0
> - pmovmskb %xmm0, %eax
> - movdqa %xmm5, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %r9d
> - movdqa %xmm4, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %edx
> - movdqa %xmm3, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - salq $16, %rdx
> - pmovmskb %xmm0, %r10d
> - movdqa %xmm2, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - salq $32, %r10
> - orq %r10, %rdx
> - pmovmskb %xmm0, %r8d
> - orq %r9, %rdx
> - salq $48, %r8
> - orq %r8, %rdx
> +L(first_vec_x1_test):
> + PCMPEQ %xmm0, %xmm2
> + pmovmskb %xmm2, %eax
> testl %eax, %eax
> - je L(loop64)
> - pcmpeqb %xmm6, %xmm4
> - pcmpeqb %xmm6, %xmm3
> - pcmpeqb %xmm6, %xmm5
> - pmovmskb %xmm4, %eax
> - pmovmskb %xmm3, %r10d
> - pcmpeqb %xmm6, %xmm2
> - pmovmskb %xmm5, %r9d
> - salq $32, %r10
> - salq $16, %rax
> - pmovmskb %xmm2, %r8d
> - orq %r10, %rax
> - orq %r9, %rax
> - salq $48, %r8
> - orq %r8, %rax
> - leaq -1(%rax), %r8
> - xorq %rax, %r8
> - andq %r8, %rdx
> - cmovne %rdi, %rcx
> - cmovne %rdx, %rsi
> - bsrq %rsi, %rsi
> - leaq (%rcx,%rsi), %rax
> + jz L(first_vec_x0_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(first_vec_x2):
> + PCMPEQ %xmm0, %xmm3
> + pmovmskb %xmm3, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_vec_x1_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(aligned_more):
> + /* Save original pointer if match was in VEC 0. */
> + movq %rdi, %r8
> + andq $-VEC_SIZE, %rdi
> +
> + movaps VEC_SIZE(%rdi), %xmm2
> + pxor %xmm3, %xmm3
> + PCMPEQ %xmm2, %xmm3
> + pmovmskb %xmm3, %ecx
> + testl %ecx, %ecx
> + jnz L(first_vec_x1)
> +
> + movaps (VEC_SIZE * 2)(%rdi), %xmm3
> + pxor %xmm4, %xmm4
> + PCMPEQ %xmm3, %xmm4
> + pmovmskb %xmm4, %ecx
> + testl %ecx, %ecx
> + jnz L(first_vec_x2)
> +
> + addq $VEC_SIZE, %rdi
> + /* Save pointer again before realigning. */
> + movq %rdi, %rsi
> + andq $-(VEC_SIZE * 2), %rdi
> + .p2align 4
> +L(first_loop):
> + /* Do 2x VEC at a time. */
> + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> + /* If SSE2 no pminud so wcsrchr needs separate logic for
Did you mean "Since", instead of "If"?
> + detecting zero. Note if this is found to be a bottleneck it
> + may be worth adding an SSE4.1 wcsrchr implementation. */
> +#ifdef USE_AS_WCSRCHR
> + movaps %xmm5, %xmm6
> + pxor %xmm8, %xmm8
> +
> + PCMPEQ %xmm8, %xmm5
> + PCMPEQ %xmm4, %xmm8
> + por %xmm5, %xmm8
> +#else
> + movaps %xmm5, %xmm6
> + PMINU %xmm4, %xmm5
> +#endif
> +
> + movaps %xmm4, %xmm9
> + PCMPEQ %xmm0, %xmm4
> + PCMPEQ %xmm0, %xmm6
> + movaps %xmm6, %xmm7
> + por %xmm4, %xmm6
> +#ifndef USE_AS_WCSRCHR
> + pxor %xmm8, %xmm8
> + PCMPEQ %xmm5, %xmm8
> +#endif
> + pmovmskb %xmm8, %ecx
> + pmovmskb %xmm6, %eax
> +
> + addq $(VEC_SIZE * 2), %rdi
> + /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> + macro-fuse with `jz`. */
> + addl %ecx, %eax
> + jz L(first_loop)
> +
> + /* Check if there is zero match. */
> + testl %ecx, %ecx
> + jz L(second_loop_match)
> +
> + /* Check if there was a match in last iteration. */
> + subl %ecx, %eax
> + jnz L(new_match)
> +
> +L(first_loop_old_match):
> + PCMPEQ %xmm0, %xmm2
> + PCMPEQ %xmm0, %xmm3
> + pmovmskb %xmm2, %ecx
> + pmovmskb %xmm3, %eax
> + addl %eax, %ecx
> + jz L(first_vec_x0_test)
> + /* NB: We could move this shift to before the branch and save a
> + bit of code size / performance on the fall through. The
> + branch leads to the null case which generally seems hotter
> + than char in first 3x VEC. */
> + sall $16, %eax
> + orl %ecx, %eax
> +
> + bsrl %eax, %eax
> + addq %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(new_match):
> + pxor %xmm6, %xmm6
> + PCMPEQ %xmm9, %xmm6
> + pmovmskb %xmm6, %eax
> + sall $16, %ecx
> + orl %eax, %ecx
> +
> + /* We can't reuse either of the old comparisons since we mask
> + off zeros after the first zero (instead of using the full
> + comparison) so we can't guarantee no interference between a
> + match after the end of the string and a valid match. */
> + pmovmskb %xmm4, %eax
> + pmovmskb %xmm7, %edx
> + sall $16, %edx
> + orl %edx, %eax
> +
> + leal -1(%ecx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_loop_old_match)
> + bsrl %eax, %eax
> + addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> ret
>
> + /* Save minimum state for getting most recent match. We can
> + throw out all previous work. */
> .p2align 4
> -L(no_c_found):
> - movl $1, %esi
> - xorl %ecx, %ecx
> - jmp L(loop_header)
> +L(second_loop_match):
> + movq %rdi, %rsi
> + movaps %xmm4, %xmm2
> + movaps %xmm7, %xmm3
>
> .p2align 4
> -L(exit):
> - xorl %eax, %eax
> +L(second_loop):
> + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> + /* If SSE2 no pminud so wcsrchr needs separate logic for
Did you mean "Since", instead of "If"?
> + detecting zero. Note if this is found to be a bottleneck it
> + may be worth adding an SSE4.1 wcsrchr implementation. */
> +#ifdef USE_AS_WCSRCHR
> + movaps %xmm5, %xmm6
> + pxor %xmm8, %xmm8
> +
> + PCMPEQ %xmm8, %xmm5
> + PCMPEQ %xmm4, %xmm8
> + por %xmm5, %xmm8
> +#else
> + movaps %xmm5, %xmm6
> + PMINU %xmm4, %xmm5
> +#endif
> +
> + movaps %xmm4, %xmm9
> + PCMPEQ %xmm0, %xmm4
> + PCMPEQ %xmm0, %xmm6
> + movaps %xmm6, %xmm7
> + por %xmm4, %xmm6
> +#ifndef USE_AS_WCSRCHR
> + pxor %xmm8, %xmm8
> + PCMPEQ %xmm5, %xmm8
> +#endif
> +
> + pmovmskb %xmm8, %ecx
> + pmovmskb %xmm6, %eax
> +
> + addq $(VEC_SIZE * 2), %rdi
> + /* Either null term or new occurrence of CHAR. */
> + addl %ecx, %eax
> + jz L(second_loop)
> +
> + /* No null term so must be a new occurrence of CHAR. */
> + testl %ecx, %ecx
> + jz L(second_loop_match)
> +
> +
> + subl %ecx, %eax
> + jnz L(second_loop_new_match)
> +
> +L(second_loop_old_match):
> + pmovmskb %xmm2, %ecx
> + pmovmskb %xmm3, %eax
> + sall $16, %eax
> + orl %ecx, %eax
> + bsrl %eax, %eax
> + addq %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> ret
>
> .p2align 4
> +L(second_loop_new_match):
> + pxor %xmm6, %xmm6
> + PCMPEQ %xmm9, %xmm6
> + pmovmskb %xmm6, %eax
> + sall $16, %ecx
> + orl %eax, %ecx
> +
> + /* We can't reuse either of the old comparisons since we mask
> + off zeros after the first zero (instead of using the full
> + comparison) so we can't guarantee no interference between a
> + match after the end of the string and a valid match. */
> + pmovmskb %xmm4, %eax
> + pmovmskb %xmm7, %edx
> + sall $16, %edx
> + orl %edx, %eax
> +
> + leal -1(%ecx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(second_loop_old_match)
> + bsrl %eax, %eax
> + addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4,, 4
> L(cross_page):
> - movq %rdi, %rax
> - pxor %xmm0, %xmm0
> - andq $-64, %rax
> - movdqu (%rax), %xmm5
> - movdqa %xmm5, %xmm6
> - movdqu 16(%rax), %xmm4
> - pcmpeqb %xmm1, %xmm5
> - pcmpeqb %xmm0, %xmm6
> - movdqu 32(%rax), %xmm3
> - pmovmskb %xmm6, %esi
> - movdqa %xmm4, %xmm6
> - movdqu 48(%rax), %xmm2
> - pcmpeqb %xmm1, %xmm4
> - pcmpeqb %xmm0, %xmm6
> - pmovmskb %xmm6, %edx
> - movdqa %xmm3, %xmm6
> - pcmpeqb %xmm1, %xmm3
> - pcmpeqb %xmm0, %xmm6
> - pcmpeqb %xmm2, %xmm0
> - salq $16, %rdx
> - pmovmskb %xmm3, %r9d
> - pmovmskb %xmm6, %r8d
> - pmovmskb %xmm0, %ecx
> - salq $32, %r9
> - salq $32, %r8
> - pcmpeqb %xmm1, %xmm2
> - orq %r8, %rdx
> - salq $48, %rcx
> - pmovmskb %xmm5, %r8d
> - orq %rsi, %rdx
> - pmovmskb %xmm4, %esi
> - orq %rcx, %rdx
> - pmovmskb %xmm2, %ecx
> - salq $16, %rsi
> - salq $48, %rcx
> - orq %r9, %rsi
> - orq %r8, %rsi
> - orq %rcx, %rsi
> + movq %rdi, %rsi
> + andq $-VEC_SIZE, %rsi
> + movaps (%rsi), %xmm1
> + pxor %xmm2, %xmm2
> + PCMPEQ %xmm1, %xmm2
> + pmovmskb %xmm2, %edx
> movl %edi, %ecx
> - subl %eax, %ecx
> - shrq %cl, %rdx
> - shrq %cl, %rsi
> - testq %rdx, %rdx
> - je L(loop_header2)
> - leaq -1(%rdx), %rax
> - xorq %rdx, %rax
> - andq %rax, %rsi
> - je L(exit)
> - bsrq %rsi, %rax
> + andl $(VEC_SIZE - 1), %ecx
> + sarl %cl, %edx
> + jz L(cross_page_continue)
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + sarl %cl, %eax
> + leal -1(%rdx), %ecx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(ret1)
> + bsrl %eax, %eax
> addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> +L(ret1):
> ret
> -END (strrchr)
> +END(STRRCHR)
>
> -weak_alias (strrchr, rindex)
> -libc_hidden_builtin_def (strrchr)
> +#ifndef USE_AS_WCSRCHR
> + weak_alias (STRRCHR, rindex)
> + libc_hidden_builtin_def (STRRCHR)
> +#endif
> diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> index 61552954de..2b80efc5ef 100644
> --- a/sysdeps/x86_64/wcsrchr.S
> +++ b/sysdeps/x86_64/wcsrchr.S
> @@ -1,4 +1,4 @@
> -/* wcsrchr with SSSE3
> +/* wcsrchr optimized with SSE2.
> Copyright (C) 2011-2022 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> @@ -16,266 +16,12 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
>
> - .text
> -ENTRY (wcsrchr)
> +#define USE_AS_WCSRCHR 1
> +#define NO_PMINU 1
>
> - movd %rsi, %xmm1
> - mov %rdi, %rcx
> - punpckldq %xmm1, %xmm1
> - pxor %xmm2, %xmm2
> - punpckldq %xmm1, %xmm1
> - and $63, %rcx
> - cmp $48, %rcx
> - ja L(crosscache)
> +#ifndef STRRCHR
> +# define STRRCHR wcsrchr
> +#endif
>
> - movdqu (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm0, %rax
> - add $16, %rdi
> -
> - test %rax, %rax
> - jnz L(unaligned_match1)
> -
> - test %rcx, %rcx
> - jnz L(return_null)
> -
> - and $-16, %rdi
> - xor %r8, %r8
> - jmp L(loop)
> -
> - .p2align 4
> -L(unaligned_match1):
> - test %rcx, %rcx
> - jnz L(prolog_find_zero_1)
> -
> - mov %rax, %r8
> - mov %rdi, %rsi
> - and $-16, %rdi
> - jmp L(loop)
> -
> - .p2align 4
> -L(crosscache):
> - and $15, %rcx
> - and $-16, %rdi
> - pxor %xmm3, %xmm3
> - movdqa (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm3
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm3, %rdx
> - pmovmskb %xmm0, %rax
> - shr %cl, %rdx
> - shr %cl, %rax
> - add $16, %rdi
> -
> - test %rax, %rax
> - jnz L(unaligned_match)
> -
> - test %rdx, %rdx
> - jnz L(return_null)
> -
> - xor %r8, %r8
> - jmp L(loop)
> -
> - .p2align 4
> -L(unaligned_match):
> - test %rdx, %rdx
> - jnz L(prolog_find_zero)
> -
> - mov %rax, %r8
> - lea (%rdi, %rcx), %rsi
> -
> -/* Loop start on aligned string. */
> - .p2align 4
> -L(loop):
> - movdqa (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm0, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm3
> - pcmpeqd %xmm3, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm3
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm3, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm4
> - pcmpeqd %xmm4, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm4
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm4, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm5
> - pcmpeqd %xmm5, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm5
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm5, %rax
> - or %rax, %rcx
> - jz L(loop)
> -
> - .p2align 4
> -L(matches):
> - test %rax, %rax
> - jnz L(match)
> -L(return_value):
> - test %r8, %r8
> - jz L(return_null)
> - mov %r8, %rax
> - mov %rsi, %rdi
> -
> - test $15 << 4, %ah
> - jnz L(match_fourth_wchar)
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match):
> - pmovmskb %xmm2, %rcx
> - test %rcx, %rcx
> - jnz L(find_zero)
> - mov %rax, %r8
> - mov %rdi, %rsi
> - jmp L(loop)
> -
> - .p2align 4
> -L(find_zero):
> - test $15, %cl
> - jnz L(find_zero_in_first_wchar)
> - test %cl, %cl
> - jnz L(find_zero_in_second_wchar)
> - test $15, %ch
> - jnz L(find_zero_in_third_wchar)
> -
> - and $1 << 13 - 1, %rax
> - jz L(return_value)
> -
> - test $15 << 4, %ah
> - jnz L(match_fourth_wchar)
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(find_zero_in_first_wchar):
> - test $1, %rax
> - jz L(return_value)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(find_zero_in_second_wchar):
> - and $1 << 5 - 1, %rax
> - jz L(return_value)
> -
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(find_zero_in_third_wchar):
> - and $1 << 9 - 1, %rax
> - jz L(return_value)
> -
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero):
> - add %rcx, %rdi
> - mov %rdx, %rcx
> -L(prolog_find_zero_1):
> - test $15, %cl
> - jnz L(prolog_find_zero_in_first_wchar)
> - test %cl, %cl
> - jnz L(prolog_find_zero_in_second_wchar)
> - test $15, %ch
> - jnz L(prolog_find_zero_in_third_wchar)
> -
> - and $1 << 13 - 1, %rax
> - jz L(return_null)
> -
> - test $15 << 4, %ah
> - jnz L(match_fourth_wchar)
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_in_first_wchar):
> - test $1, %rax
> - jz L(return_null)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_in_second_wchar):
> - and $1 << 5 - 1, %rax
> - jz L(return_null)
> -
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_in_third_wchar):
> - and $1 << 9 - 1, %rax
> - jz L(return_null)
> -
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_second_wchar):
> - lea -12(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_third_wchar):
> - lea -8(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_fourth_wchar):
> - lea -4(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(return_null):
> - xor %rax, %rax
> - ret
> -
> -END (wcsrchr)
> +#include "../strrchr.S"
> --
> 2.25.1
>
--
H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v1 1/5] benchtests: Improve bench-strrchr
2022-04-21 22:07 ` Noah Goldstein
@ 2022-04-21 23:49 ` H.J. Lu
2022-04-22 1:11 ` Noah Goldstein
0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-21 23:49 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 21, 2022 at 3:08 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 3:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > 1. Use json-lib for printing results.
> > > 2. Expose all parameters (before pos, seek_char, and max_char were
> > > not printed).
> > > 3. Add benchmarks that test multiple occurrences of seek_char in the
> > > string.
> > > ---
> > > benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
> > > 1 file changed, 82 insertions(+), 44 deletions(-)
> > >
> > > diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> > > index abdae60c51..cceea77e1b 100644
> > > --- a/benchtests/bench-strrchr.c
> > > +++ b/benchtests/bench-strrchr.c
> > > @@ -23,6 +23,7 @@
> > > # define TEST_NAME "strrchr"
> > > #endif
> > > #include "bench-string.h"
> > > +#include "json-lib.h"
> > >
> > > #define BIG_CHAR MAX_CHAR
> > >
> > > @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
> > > }
> > >
> > > static void
> > > -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> > > + CHAR *exp_res)
> > > {
> > > CHAR *res = CALL (impl, s, c);
> > > size_t i, iters = INNER_LOOP_ITERS8;
> > > @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > >
> > > if (res != exp_res)
> > > {
> > > - error (0, 0, "Wrong result in function %s %p %p", impl->name,
> > > - res, exp_res);
> > > + error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
> > > + exp_res);
> >
> > These changes aren't needed.
> >
> > > ret = 1;
> > > return;
> > > }
> > > @@ -72,24 +74,28 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > > {
> > > CALL (impl, s, c);
> > > }
> > > - TIMING_NOW (stop);
> > >
> > > + TIMING_NOW (stop);
> >
> > Not needed.
>
> Will fix in V2
> >
> > > TIMING_DIFF (cur, start, stop);
> > >
> > > - TIMING_PRINT_MEAN ((double) cur, (double) iters);
> > > + json_element_double (json_ctx, (double) cur / (double) iters);
> > > + return;
> >
> > Return isn't needed.
>
> Will fix in V2.
> >
> > > }
> > >
> > > static void
> > > -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > > +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> > > + int seek_char, int max_char, size_t freq)
> > > /* For wcsrchr: align here means align not in bytes,
> > > but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> > > len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
> > > {
> > > size_t i;
> > > + size_t pos_chunk_sz = freq ? (pos / freq) : pos;
> > > + size_t last_pos = len;
> > > CHAR *result;
> > > CHAR *buf = (CHAR *) buf1;
> > >
> > > - align &= 7;
> > > + align &= (getpagesize () - 1);
> >
> > If we have such large alignments, the tests may be skipped.
> > Should we change it to 127 instead?
>
> There is logic around page cross cases in x86_64 versions, so I think it
> makes sense to support benchmarking it.
>
> Also I think that would tend to give the previous version a bit of
> an unfair disadvantage as the slow aligning case will never be
> tested in the new version.
If "align" is close to the page size, will it trigger
if ((align + len) * sizeof (CHAR) >= page_size)
return;
and skip page cross cases?
> >
> > > if ((align + len) * sizeof (CHAR) >= page_size)
> > > return;
> > >
> > > @@ -103,6 +109,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > > if ((i > pos || pos >= len) && buf[align + i] == seek_char)
> > > buf[align + i] = seek_char + 10 + (random () & 15);
> > > }
> > > +
> > > + if (pos_chunk_sz == 0 && pos)
> > > + pos_chunk_sz = 1;
> > > +
> > > + for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
> > > + {
> > > + buf[align + i] = seek_char;
> > > + last_pos = i;
> > > + }
> > > +
> > > buf[align + len] = 0;
> > >
> > > if (pos < len)
> > > @@ -110,66 +126,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > > buf[align + pos] = seek_char;
> > > result = (CHAR *) (buf + align + pos);
> > > }
> > > + else if (last_pos < len)
> > > + result = (CHAR *) (buf + align + last_pos);
> > > else if (seek_char == 0)
> > > result = (CHAR *) (buf + align + len);
> > > else
> > > result = NULL;
> > >
> > > - printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
> > > + json_element_object_begin (json_ctx);
> > > + json_attr_uint (json_ctx, "len", len);
> > > + json_attr_uint (json_ctx, "pos", pos);
> > > + json_attr_uint (json_ctx, "align", align);
> > > + json_attr_uint (json_ctx, "freq", freq);
> > > + json_attr_uint (json_ctx, "seek", seek_char);
> > > + json_attr_uint (json_ctx, "max_char", max_char);
> > > + json_array_begin (json_ctx, "timings");
> > >
> > > FOR_EACH_IMPL (impl, 0)
> > > - do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
> > > + do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
> > >
> > > - putchar ('\n');
> > > + json_array_end (json_ctx);
> > > + json_element_object_end (json_ctx);
> > > }
> > >
> > > int
> > > test_main (void)
> > > {
> > > - size_t i;
> > > + json_ctx_t json_ctx;
> > > + size_t i, j;
> > > + int seek;
> > >
> > > test_init ();
> > > + json_init (&json_ctx, 0, stdout);
> > >
> > > - printf ("%20s", "");
> > > - FOR_EACH_IMPL (impl, 0)
> > > - printf ("\t%s", impl->name);
> > > - putchar ('\n');
> > > + json_document_begin (&json_ctx);
> > > + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> > >
> > > - for (i = 1; i < 8; ++i)
> > > - {
> > > - do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
> > > - do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
> > > - }
> > > + json_attr_object_begin (&json_ctx, "functions");
> > > + json_attr_object_begin (&json_ctx, TEST_NAME);
> > > + json_attr_string (&json_ctx, "bench-variant", "");
> > >
> > > - for (i = 1; i < 8; ++i)
> > > - {
> > > - do_test (i, 64, 256, 23, SMALL_CHAR);
> > > - do_test (i, 64, 256, 23, BIG_CHAR);
> > > - }
> > > -
> > > - for (i = 0; i < 32; ++i)
> > > - {
> > > - do_test (0, i, i + 1, 23, SMALL_CHAR);
> > > - do_test (0, i, i + 1, 23, BIG_CHAR);
> > > - }
> > > + json_array_begin (&json_ctx, "ifuncs");
> > > + FOR_EACH_IMPL (impl, 0)
> > > + json_element_string (&json_ctx, impl->name);
> > > + json_array_end (&json_ctx);
> > >
> > > - for (i = 1; i < 8; ++i)
> > > - {
> > > - do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
> > > - do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
> > > - }
> > > + json_array_begin (&json_ctx, "results");
> > >
> > > - for (i = 1; i < 8; ++i)
> > > + for (seek = 0; seek <= 23; seek += 23)
> > > {
> > > - do_test (i, 64, 256, 0, SMALL_CHAR);
> > > - do_test (i, 64, 256, 0, BIG_CHAR);
> > > + for (j = 1; j < 32; j += j)
> > > + {
> > > + for (i = 1; i < 9; ++i)
> > > + {
> > > + do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
> > > + do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
> > > + }
> > > +
> > > + for (i = 1; i < 8; ++i)
> > > + {
> > > + do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
> > > + do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
> > > +
> > > + do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
> > > + do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
> > > + }
> > > +
> > > + for (i = 0; i < 32; ++i)
> > > + {
> > > + do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
> > > + do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
> > > + }
> > > + if (seek == 0)
> > > + {
> > > + break;
> > > + }
> > > + }
> > > }
> > >
> > > - for (i = 0; i < 32; ++i)
> > > - {
> > > - do_test (0, i, i + 1, 0, SMALL_CHAR);
> > > - do_test (0, i, i + 1, 0, BIG_CHAR);
> > > - }
> > > + json_array_end (&json_ctx);
> > > + json_attr_object_end (&json_ctx);
> > > + json_attr_object_end (&json_ctx);
> > > + json_document_end (&json_ctx);
> > >
> > > return ret;
> > > }
> > > --
> > > 2.25.1
> > >
> >
> >
> > --
> > H.J.
--
H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex
2022-04-21 22:22 ` [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
@ 2022-04-21 23:59 ` H.J. Lu
2022-04-22 1:53 ` Noah Goldstein
0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-21 23:59 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 21, 2022 at 3:22 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code unrolls the main loop slightly without adding too much
> overhead and minimizes the comparisons for the search CHAR.
>
> Geometric Mean of all benchmarks New / Old: 0.755
> See email for all results.
>
> Full xcheck passes on x86_64 with and without multiarch enabled.
> ---
> sysdeps/x86_64/multiarch/strrchr-evex.S | 441 ++++++++++++++----------
> 1 file changed, 259 insertions(+), 182 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
> index adeddaed32..5cf9a8315b 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
> @@ -24,242 +24,319 @@
> # define STRRCHR __strrchr_evex
> # endif
>
> -# define VMOVU vmovdqu64
> -# define VMOVA vmovdqa64
> +# define VMOVU vmovdqu64
> +# define VMOVA vmovdqa64
>
> # ifdef USE_AS_WCSRCHR
> +# define SHIFT_REG esi
> +
> +# define kunpck kunpckbw
> +# define kmov_2x kmovd
> +# define maskz_2x ecx
> +# define maskm_2x eax
> +# define CHAR_SIZE 4
> +# define VPMIN vpminud
> +# define VPTESTN vptestnmd
> # define VPBROADCAST vpbroadcastd
> -# define VPCMP vpcmpd
> -# define SHIFT_REG r8d
> +# define VPCMP vpcmpd
> # else
> +# define SHIFT_REG edi
> +
> +# define kunpck kunpckdq
> +# define kmov_2x kmovq
> +# define maskz_2x rcx
> +# define maskm_2x rax
> +
> +# define CHAR_SIZE 1
> +# define VPMIN vpminub
> +# define VPTESTN vptestnmb
> # define VPBROADCAST vpbroadcastb
> -# define VPCMP vpcmpb
> -# define SHIFT_REG ecx
> +# define VPCMP vpcmpb
> # endif
>
> # define XMMZERO xmm16
> # define YMMZERO ymm16
> # define YMMMATCH ymm17
> -# define YMM1 ymm18
> +# define YMMSAVE ymm18
> +
> +# define YMM1 ymm19
> +# define YMM2 ymm20
> +# define YMM3 ymm21
> +# define YMM4 ymm22
> +# define YMM5 ymm23
> +# define YMM6 ymm24
> +# define YMM7 ymm25
> +# define YMM8 ymm26
>
> -# define VEC_SIZE 32
>
> - .section .text.evex,"ax",@progbits
> -ENTRY (STRRCHR)
> - movl %edi, %ecx
> +# define VEC_SIZE 32
> +# define PAGE_SIZE 4096
> + .section .text.evex, "ax", @progbits
> +ENTRY(STRRCHR)
> + movl %edi, %eax
> /* Broadcast CHAR to YMMMATCH. */
> VPBROADCAST %esi, %YMMMATCH
>
> - vpxorq %XMMZERO, %XMMZERO, %XMMZERO
> -
> - /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> + andl $(PAGE_SIZE - 1), %eax
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + jg L(cross_page_boundary)
>
> +L(page_cross_continue):
> VMOVU (%rdi), %YMM1
> -
> - /* Each bit in K0 represents a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM1, %k0
> - /* Each bit in K1 represents a CHAR in YMM1. */
> - VPCMP $0, %YMMMATCH, %YMM1, %k1
> + VPTESTN %YMM1, %YMM1, %k0
Please add some comments for mask register on VPTESTN tests.
> kmovd %k0, %ecx
> - kmovd %k1, %eax
> -
> - addq $VEC_SIZE, %rdi
> -
> - testl %eax, %eax
> - jnz L(first_vec)
> -
> testl %ecx, %ecx
> - jnz L(return_null)
> -
> - andq $-VEC_SIZE, %rdi
> - xorl %edx, %edx
> - jmp L(aligned_loop)
> -
> - .p2align 4
> -L(first_vec):
> - /* Check if there is a null byte. */
> - testl %ecx, %ecx
> - jnz L(char_and_nul_in_first_vec)
> -
> - /* Remember the match and keep searching. */
> - movl %eax, %edx
> - movq %rdi, %rsi
> - andq $-VEC_SIZE, %rdi
> - jmp L(aligned_loop)
> -
> - .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> -
> + jz L(aligned_more)
> + VPCMP $0, %YMMMATCH, %YMM1, %k1
Please add some comments.
> + kmovd %k1, %eax
> + blsmskl %ecx, %ecx
> + andl %ecx, %eax
> + jz L(ret0)
> + bsrl %eax, %eax
> # ifdef USE_AS_WCSRCHR
> - /* NB: Divide shift count by 4 since each bit in K1 represent 4
> - bytes. */
> - movl %ecx, %SHIFT_REG
> - sarl $2, %SHIFT_REG
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> + addq %rdi, %rax
> # endif
> +L(ret0):
> + ret
>
> - VMOVA (%rdi), %YMM1
> -
> - /* Each bit in K0 represents a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM1, %k0
> - /* Each bit in K1 represents a CHAR in YMM1. */
> + /* Returns for first vec x1/x2/x3 have hard coded backward
> + search path for earlier matches. */
> + .p2align 4,, 6
> +L(first_vec_x1):
> + VPCMP $0, %YMMMATCH, %YMM2, %k1
> + kmovd %k1, %eax
> + blsmskl %ecx, %ecx
> + andl %ecx, %eax
> + jnz L(first_vec_x1_return)
> + .p2align 4,, 4
> +L(first_vec_x0_test):
> VPCMP $0, %YMMMATCH, %YMM1, %k1
> - kmovd %k0, %edx
> kmovd %k1, %eax
> -
> - shrxl %SHIFT_REG, %edx, %edx
> - shrxl %SHIFT_REG, %eax, %eax
> - addq $VEC_SIZE, %rdi
> -
> - /* Check if there is a CHAR. */
> testl %eax, %eax
> - jnz L(found_char)
> -
> - testl %edx, %edx
> - jnz L(return_null)
> -
> - jmp L(aligned_loop)
> -
> - .p2align 4
> -L(found_char):
> - testl %edx, %edx
> - jnz L(char_and_nul)
> -
> - /* Remember the match and keep searching. */
> - movl %eax, %edx
> - leaq (%rdi, %rcx), %rsi
> + jz L(ret1)
> + bsrl %eax, %eax
> +# ifdef USE_AS_WCSRCHR
> + leaq (%rsi, %rax, CHAR_SIZE), %rax
> +# else
> + addq %rsi, %rax
> +# endif
> +L(ret1):
> + ret
>
> - .p2align 4
> -L(aligned_loop):
> - VMOVA (%rdi), %YMM1
> - addq $VEC_SIZE, %rdi
> + .p2align 4,, 10
> +L(first_vec_x1_or_x2):
> + VPCMP $0, %YMM3, %YMMMATCH, %k3
> + VPCMP $0, %YMM2, %YMMMATCH, %k2
> + kortestd %k2, %k3
> + jz L(first_vec_x0_test)
> +
> + kunpck %k2, %k3, %k3
> + kmovq %k3, %rax
> + bsrq %rax, %rax
> + leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
> + ret
>
> - /* Each bit in K0 represents a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM1, %k0
> - /* Each bit in K1 represents a CHAR in YMM1. */
> - VPCMP $0, %YMMMATCH, %YMM1, %k1
> - kmovd %k0, %ecx
> + .p2align 4,, 6
> +L(first_vec_x3):
> + VPCMP $0, %YMMMATCH, %YMM4, %k1
> kmovd %k1, %eax
> - orl %eax, %ecx
> - jnz L(char_nor_null)
> + blsmskl %ecx, %ecx
> + andl %ecx, %eax
> + jz L(first_vec_x1_or_x2)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> - VMOVA (%rdi), %YMM1
> - add $VEC_SIZE, %rdi
> + .p2align 4,, 6
> +L(first_vec_x0_x1_test):
> + VPCMP $0, %YMMMATCH, %YMM2, %k1
> + kmovd %k1, %eax
> + testl %eax, %eax
> + jz L(first_vec_x0_test)
> + .p2align 4,, 4
> +L(first_vec_x1_return):
> + bsrl %eax, %eax
> + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> - /* Each bit in K0 represents a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM1, %k0
> - /* Each bit in K1 represents a CHAR in YMM1. */
> - VPCMP $0, %YMMMATCH, %YMM1, %k1
> - kmovd %k0, %ecx
> + .p2align 4,, 10
> +L(first_vec_x2):
> + VPCMP $0, %YMMMATCH, %YMM3, %k1
> kmovd %k1, %eax
> - orl %eax, %ecx
> - jnz L(char_nor_null)
> + blsmskl %ecx, %ecx
> + andl %ecx, %eax
> + jz L(first_vec_x0_x1_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> - VMOVA (%rdi), %YMM1
> - addq $VEC_SIZE, %rdi
>
> - /* Each bit in K0 represents a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM1, %k0
> - /* Each bit in K1 represents a CHAR in YMM1. */
> - VPCMP $0, %YMMMATCH, %YMM1, %k1
> + .p2align 4
> +L(aligned_more):
> + /* Need to keep original pointer incase YMM1 has last match. */
> + movq %rdi, %rsi
> + andq $-VEC_SIZE, %rdi
> + VMOVU VEC_SIZE(%rdi), %YMM2
> + VPTESTN %YMM2, %YMM2, %k0
> kmovd %k0, %ecx
> - kmovd %k1, %eax
> - orl %eax, %ecx
> - jnz L(char_nor_null)
> + testl %ecx, %ecx
> + jnz L(first_vec_x1)
>
> - VMOVA (%rdi), %YMM1
> - addq $VEC_SIZE, %rdi
> + VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
> + VPTESTN %YMM3, %YMM3, %k0
> + kmovd %k0, %ecx
> + testl %ecx, %ecx
> + jnz L(first_vec_x2)
>
> - /* Each bit in K0 represents a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM1, %k0
> - /* Each bit in K1 represents a CHAR in YMM1. */
> - VPCMP $0, %YMMMATCH, %YMM1, %k1
> + VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
> + VPTESTN %YMM4, %YMM4, %k0
> kmovd %k0, %ecx
> - kmovd %k1, %eax
> - orl %eax, %ecx
> - jz L(aligned_loop)
> + movq %rdi, %r8
> + testl %ecx, %ecx
> + jnz L(first_vec_x3)
>
> + andq $-(VEC_SIZE * 2), %rdi
> .p2align 4
> -L(char_nor_null):
> - /* Find a CHAR or a null byte in a loop. */
> +L(first_aligned_loop):
> + /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
> + they don't store a match. */
> + VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
> + VMOVA (VEC_SIZE * 5)(%rdi), %YMM6
> +
> + VPCMP $0, %YMM5, %YMMMATCH, %k2
> + vpxord %YMM6, %YMMMATCH, %YMM7
> +
> + VPMIN %YMM5, %YMM6, %YMM8
> + VPMIN %YMM8, %YMM7, %YMM7
> +
> + VPTESTN %YMM7, %YMM7, %k1
> + subq $(VEC_SIZE * -2), %rdi
> + kortestd %k1, %k2
> + jz L(first_aligned_loop)
> +
> + VPCMP $0, %YMM6, %YMMMATCH, %k3
> + VPTESTN %YMM8, %YMM8, %k1
> + ktestd %k1, %k1
> + jz L(second_aligned_loop_prep)
> +
> + kortestd %k2, %k3
> + jnz L(return_first_aligned_loop)
> +
> + .p2align 4,, 6
> +L(first_vec_x1_or_x2_or_x3):
> + VPCMP $0, %YMM4, %YMMMATCH, %k4
> + kmovd %k4, %eax
> testl %eax, %eax
> - jnz L(match)
> -L(return_value):
> - testl %edx, %edx
> - jz L(return_null)
> - movl %edx, %eax
> - movq %rsi, %rdi
> + jz L(first_vec_x1_or_x2)
> bsrl %eax, %eax
> -# ifdef USE_AS_WCSRCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq -VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> - leaq -VEC_SIZE(%rdi, %rax), %rax
> -# endif
> + leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
> ret
>
> - .p2align 4
> -L(match):
> - /* Find a CHAR. Check if there is a null byte. */
> - kmovd %k0, %ecx
> - testl %ecx, %ecx
> - jnz L(find_nul)
> + .p2align 4,, 8
> +L(return_first_aligned_loop):
> + VPTESTN %YMM5, %YMM5, %k0
> + kunpck %k0, %k1, %k0
> + kmov_2x %k0, %maskz_2x
> +
> + blsmsk %maskz_2x, %maskz_2x
> + kunpck %k2, %k3, %k3
> + kmov_2x %k3, %maskm_2x
> + and %maskz_2x, %maskm_2x
> + jz L(first_vec_x1_or_x2_or_x3)
> +
> + bsr %maskm_2x, %maskm_2x
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> - /* Remember the match and keep searching. */
> - movl %eax, %edx
> + .p2align 4
> + /* We can throw away the work done for the first 4x checks here
> + as we have a later match. This is the 'fast' path persay.
> + */
> +L(second_aligned_loop_prep):
> +L(second_aligned_loop_set_furthest_match):
> movq %rdi, %rsi
> - jmp L(aligned_loop)
> + kunpck %k2, %k3, %k4
>
> .p2align 4
> -L(find_nul):
> - /* Mask out any matching bits after the null byte. */
> - movl %ecx, %r8d
> - subl $1, %r8d
> - xorl %ecx, %r8d
> - andl %r8d, %eax
> - testl %eax, %eax
> - /* If there is no CHAR here, return the remembered one. */
> - jz L(return_value)
> - bsrl %eax, %eax
> +L(second_aligned_loop):
> + VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
> + VMOVU (VEC_SIZE * 5)(%rdi), %YMM2
> +
> + VPCMP $0, %YMM1, %YMMMATCH, %k2
> + vpxord %YMM2, %YMMMATCH, %YMM3
> +
> + VPMIN %YMM1, %YMM2, %YMM4
> + VPMIN %YMM3, %YMM4, %YMM3
> +
> + VPTESTN %YMM3, %YMM3, %k1
> + subq $(VEC_SIZE * -2), %rdi
> + kortestd %k1, %k2
> + jz L(second_aligned_loop)
> +
> + VPCMP $0, %YMM2, %YMMMATCH, %k3
> + VPTESTN %YMM4, %YMM4, %k1
> + ktestd %k1, %k1
> + jz L(second_aligned_loop_set_furthest_match)
> +
> + kortestd %k2, %k3
> + /* branch here because there is a significant advantage interms
> + of output dependency chance in using edx. */
> + jnz L(return_new_match)
> +L(return_old_match):
> + kmovq %k4, %rax
> + bsrq %rax, %rax
> + leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
> + ret
> +
> +L(return_new_match):
> + VPTESTN %YMM1, %YMM1, %k0
> + kunpck %k0, %k1, %k0
> + kmov_2x %k0, %maskz_2x
> +
> + blsmsk %maskz_2x, %maskz_2x
> + kunpck %k2, %k3, %k3
> + kmov_2x %k3, %maskm_2x
> + and %maskz_2x, %maskm_2x
> + jz L(return_old_match)
> +
> + bsr %maskm_2x, %maskm_2x
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
> +
> + /* This block is horribly aligned (% 16 == 15). This is
> + intentional. The L(cross_page_boundary) block is exactly
> + 32-bytes of code size. Ultimately this is a cold case so
> + save the code size by leaving misaligned. */
> +L(cross_page_boundary):
> + xorq %rdi, %rax
> + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
> + VPTESTN %YMM1, %YMM1, %k0
> + kmovd %k0, %ecx
> # ifdef USE_AS_WCSRCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq -VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> - leaq -VEC_SIZE(%rdi, %rax), %rax
> + movl %edi, %esi
> + andl $(VEC_SIZE - 1), %esi
> + shrl $2, %esi
> # endif
> - ret
> + shrxl %SHIFT_REG, %ecx, %ecx
>
> - .p2align 4
> -L(char_and_nul):
> - /* Find both a CHAR and a null byte. */
> - addq %rcx, %rdi
> - movl %edx, %ecx
> -L(char_and_nul_in_first_vec):
> - /* Mask out any matching bits after the null byte. */
> - movl %ecx, %r8d
> - subl $1, %r8d
> - xorl %ecx, %r8d
> - andl %r8d, %eax
> - testl %eax, %eax
> - /* Return null pointer if the null byte comes first. */
> - jz L(return_null)
> + testl %ecx, %ecx
> + jz L(page_cross_continue)
> + VPCMP $0, %YMMMATCH, %YMM1, %k1
> + kmovd %k1, %eax
> + shrxl %SHIFT_REG, %eax, %eax
> + blsmskl %ecx, %ecx
> + andl %ecx, %eax
> + jz L(ret3)
> bsrl %eax, %eax
> # ifdef USE_AS_WCSRCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq -VEC_SIZE(%rdi, %rax, 4), %rax
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> # else
> - leaq -VEC_SIZE(%rdi, %rax), %rax
> + addq %rdi, %rax
> # endif
> +L(ret3):
> ret
>
> - .p2align 4
> -L(return_null):
> - xorl %eax, %eax
> - ret
> -
> -END (STRRCHR)
> +END(STRRCHR)
> #endif
> --
> 2.25.1
>
--
H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v1 1/5] benchtests: Improve bench-strrchr
2022-04-21 23:49 ` H.J. Lu
@ 2022-04-22 1:11 ` Noah Goldstein
0 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22 1:11 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 21, 2022 at 6:50 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 3:08 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Apr 21, 2022 at 3:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > 1. Use json-lib for printing results.
> > > > 2. Expose all parameters (before pos, seek_char, and max_char were
> > > > not printed).
> > > > 3. Add benchmarks that test multiple occurrences of seek_char in the
> > > > string.
> > > > ---
> > > > benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
> > > > 1 file changed, 82 insertions(+), 44 deletions(-)
> > > >
> > > > diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> > > > index abdae60c51..cceea77e1b 100644
> > > > --- a/benchtests/bench-strrchr.c
> > > > +++ b/benchtests/bench-strrchr.c
> > > > @@ -23,6 +23,7 @@
> > > > # define TEST_NAME "strrchr"
> > > > #endif
> > > > #include "bench-string.h"
> > > > +#include "json-lib.h"
> > > >
> > > > #define BIG_CHAR MAX_CHAR
> > > >
> > > > @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
> > > > }
> > > >
> > > > static void
> > > > -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > > > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> > > > + CHAR *exp_res)
> > > > {
> > > > CHAR *res = CALL (impl, s, c);
> > > > size_t i, iters = INNER_LOOP_ITERS8;
> > > > @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > > >
> > > > if (res != exp_res)
> > > > {
> > > > - error (0, 0, "Wrong result in function %s %p %p", impl->name,
> > > > - res, exp_res);
> > > > + error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
> > > > + exp_res);
> > >
> > > These changes aren't needed.
> > >
> > > > ret = 1;
> > > > return;
> > > > }
> > > > @@ -72,24 +74,28 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > > > {
> > > > CALL (impl, s, c);
> > > > }
> > > > - TIMING_NOW (stop);
> > > >
> > > > + TIMING_NOW (stop);
> > >
> > > Not needed.
> >
> > Will fix in V2
> > >
> > > > TIMING_DIFF (cur, start, stop);
> > > >
> > > > - TIMING_PRINT_MEAN ((double) cur, (double) iters);
> > > > + json_element_double (json_ctx, (double) cur / (double) iters);
> > > > + return;
> > >
> > > Return isn't needed.
> >
> > Will fix in V2.
> > >
> > > > }
> > > >
> > > > static void
> > > > -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > > > +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> > > > + int seek_char, int max_char, size_t freq)
> > > > /* For wcsrchr: align here means align not in bytes,
> > > > but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> > > > len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
> > > > {
> > > > size_t i;
> > > > + size_t pos_chunk_sz = freq ? (pos / freq) : pos;
> > > > + size_t last_pos = len;
> > > > CHAR *result;
> > > > CHAR *buf = (CHAR *) buf1;
> > > >
> > > > - align &= 7;
> > > > + align &= (getpagesize () - 1);
> > >
> > > If we have such large alignments, the tests may be skipped.
> > > Should we change it to 127 instead?
> >
> > There is logic around page cross cases in x86_64 versions, so I think it
> > makes sense to support benchmarking it.
> >
> > Also I think that would tend to give the previous version a bit of
> > an unfair disadvantage as the slow aligning case will never be
> > tested in the new version.
>
> If "align" is close to the page size, will it trigger
>
> if ((align + len) * sizeof (CHAR) >= page_size)
> return;
>
> and skip page cross cases?
https://sourceware.org/git/?p=glibc.git;a=blob;f=benchtests/bench-string.h;h=5339ff47ffd9c9082c7bce038da00f9c48472c7f;hb=HEAD#l244
So for med/small sizes we will be fine?
>
> > >
> > > > if ((align + len) * sizeof (CHAR) >= page_size)
> > > > return;
> > > >
> > > > @@ -103,6 +109,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > > > if ((i > pos || pos >= len) && buf[align + i] == seek_char)
> > > > buf[align + i] = seek_char + 10 + (random () & 15);
> > > > }
> > > > +
> > > > + if (pos_chunk_sz == 0 && pos)
> > > > + pos_chunk_sz = 1;
> > > > +
> > > > + for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
> > > > + {
> > > > + buf[align + i] = seek_char;
> > > > + last_pos = i;
> > > > + }
> > > > +
> > > > buf[align + len] = 0;
> > > >
> > > > if (pos < len)
> > > > @@ -110,66 +126,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > > > buf[align + pos] = seek_char;
> > > > result = (CHAR *) (buf + align + pos);
> > > > }
> > > > + else if (last_pos < len)
> > > > + result = (CHAR *) (buf + align + last_pos);
> > > > else if (seek_char == 0)
> > > > result = (CHAR *) (buf + align + len);
> > > > else
> > > > result = NULL;
> > > >
> > > > - printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
> > > > + json_element_object_begin (json_ctx);
> > > > + json_attr_uint (json_ctx, "len", len);
> > > > + json_attr_uint (json_ctx, "pos", pos);
> > > > + json_attr_uint (json_ctx, "align", align);
> > > > + json_attr_uint (json_ctx, "freq", freq);
> > > > + json_attr_uint (json_ctx, "seek", seek_char);
> > > > + json_attr_uint (json_ctx, "max_char", max_char);
> > > > + json_array_begin (json_ctx, "timings");
> > > >
> > > > FOR_EACH_IMPL (impl, 0)
> > > > - do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
> > > > + do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
> > > >
> > > > - putchar ('\n');
> > > > + json_array_end (json_ctx);
> > > > + json_element_object_end (json_ctx);
> > > > }
> > > >
> > > > int
> > > > test_main (void)
> > > > {
> > > > - size_t i;
> > > > + json_ctx_t json_ctx;
> > > > + size_t i, j;
> > > > + int seek;
> > > >
> > > > test_init ();
> > > > + json_init (&json_ctx, 0, stdout);
> > > >
> > > > - printf ("%20s", "");
> > > > - FOR_EACH_IMPL (impl, 0)
> > > > - printf ("\t%s", impl->name);
> > > > - putchar ('\n');
> > > > + json_document_begin (&json_ctx);
> > > > + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> > > >
> > > > - for (i = 1; i < 8; ++i)
> > > > - {
> > > > - do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
> > > > - do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
> > > > - }
> > > > + json_attr_object_begin (&json_ctx, "functions");
> > > > + json_attr_object_begin (&json_ctx, TEST_NAME);
> > > > + json_attr_string (&json_ctx, "bench-variant", "");
> > > >
> > > > - for (i = 1; i < 8; ++i)
> > > > - {
> > > > - do_test (i, 64, 256, 23, SMALL_CHAR);
> > > > - do_test (i, 64, 256, 23, BIG_CHAR);
> > > > - }
> > > > -
> > > > - for (i = 0; i < 32; ++i)
> > > > - {
> > > > - do_test (0, i, i + 1, 23, SMALL_CHAR);
> > > > - do_test (0, i, i + 1, 23, BIG_CHAR);
> > > > - }
> > > > + json_array_begin (&json_ctx, "ifuncs");
> > > > + FOR_EACH_IMPL (impl, 0)
> > > > + json_element_string (&json_ctx, impl->name);
> > > > + json_array_end (&json_ctx);
> > > >
> > > > - for (i = 1; i < 8; ++i)
> > > > - {
> > > > - do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
> > > > - do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
> > > > - }
> > > > + json_array_begin (&json_ctx, "results");
> > > >
> > > > - for (i = 1; i < 8; ++i)
> > > > + for (seek = 0; seek <= 23; seek += 23)
> > > > {
> > > > - do_test (i, 64, 256, 0, SMALL_CHAR);
> > > > - do_test (i, 64, 256, 0, BIG_CHAR);
> > > > + for (j = 1; j < 32; j += j)
> > > > + {
> > > > + for (i = 1; i < 9; ++i)
> > > > + {
> > > > + do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
> > > > + do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
> > > > + }
> > > > +
> > > > + for (i = 1; i < 8; ++i)
> > > > + {
> > > > + do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
> > > > + do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
> > > > +
> > > > + do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
> > > > + do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
> > > > + }
> > > > +
> > > > + for (i = 0; i < 32; ++i)
> > > > + {
> > > > + do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
> > > > + do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
> > > > + }
> > > > + if (seek == 0)
> > > > + {
> > > > + break;
> > > > + }
> > > > + }
> > > > }
> > > >
> > > > - for (i = 0; i < 32; ++i)
> > > > - {
> > > > - do_test (0, i, i + 1, 0, SMALL_CHAR);
> > > > - do_test (0, i, i + 1, 0, BIG_CHAR);
> > > > - }
> > > > + json_array_end (&json_ctx);
> > > > + json_attr_object_end (&json_ctx);
> > > > + json_attr_object_end (&json_ctx);
> > > > + json_document_end (&json_ctx);
> > > >
> > > > return ret;
> > > > }
> > > > --
> > > > 2.25.1
> > > >
> > >
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v3 1/4] benchtests: Improve bench-strrchr
2022-04-21 3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
` (5 preceding siblings ...)
2022-04-21 22:22 ` [PATCH v2 1/4] " Noah Goldstein
@ 2022-04-22 1:52 ` Noah Goldstein
2022-04-22 1:52 ` [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
` (3 more replies)
2022-04-22 19:11 ` [PATCH v4 " Noah Goldstein
7 siblings, 4 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22 1:52 UTC (permalink / raw)
To: libc-alpha
1. Use json-lib for printing results.
2. Expose all parameters (before pos, seek_char, and max_char were
not printed).
3. Add benchmarks that test multiple occurrences of seek_char in the
string.
---
benchtests/bench-strrchr.c | 124 ++++++++++++++++++++++++-------------
1 file changed, 80 insertions(+), 44 deletions(-)
diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
index abdae60c51..ce4307a098 100644
--- a/benchtests/bench-strrchr.c
+++ b/benchtests/bench-strrchr.c
@@ -23,6 +23,7 @@
# define TEST_NAME "strrchr"
#endif
#include "bench-string.h"
+#include "json-lib.h"
#define BIG_CHAR MAX_CHAR
@@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
}
static void
-do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
+ CHAR *exp_res)
{
CHAR *res = CALL (impl, s, c);
size_t i, iters = INNER_LOOP_ITERS8;
@@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
if (res != exp_res)
{
- error (0, 0, "Wrong result in function %s %p %p", impl->name,
- res, exp_res);
+ error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
+ exp_res);
ret = 1;
return;
}
@@ -73,23 +75,25 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
CALL (impl, s, c);
}
TIMING_NOW (stop);
-
TIMING_DIFF (cur, start, stop);
- TIMING_PRINT_MEAN ((double) cur, (double) iters);
+ json_element_double (json_ctx, (double) cur / (double) iters);
}
static void
-do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
+ int seek_char, int max_char, size_t freq)
/* For wcsrchr: align here means align not in bytes,
but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
{
size_t i;
+ size_t pos_chunk_sz = freq ? (pos / freq) : pos;
+ size_t last_pos = len;
CHAR *result;
CHAR *buf = (CHAR *) buf1;
- align &= 7;
+ align &= (getpagesize () - 1);
if ((align + len) * sizeof (CHAR) >= page_size)
return;
@@ -103,6 +107,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
if ((i > pos || pos >= len) && buf[align + i] == seek_char)
buf[align + i] = seek_char + 10 + (random () & 15);
}
+
+ if (pos_chunk_sz == 0 && pos)
+ pos_chunk_sz = 1;
+
+ for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
+ {
+ buf[align + i] = seek_char;
+ last_pos = i;
+ }
+
buf[align + len] = 0;
if (pos < len)
@@ -110,66 +124,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
buf[align + pos] = seek_char;
result = (CHAR *) (buf + align + pos);
}
+ else if (last_pos < len)
+ result = (CHAR *) (buf + align + last_pos);
else if (seek_char == 0)
result = (CHAR *) (buf + align + len);
else
result = NULL;
- printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
+ json_element_object_begin (json_ctx);
+ json_attr_uint (json_ctx, "len", len);
+ json_attr_uint (json_ctx, "pos", pos);
+ json_attr_uint (json_ctx, "align", align);
+ json_attr_uint (json_ctx, "freq", freq);
+ json_attr_uint (json_ctx, "seek", seek_char);
+ json_attr_uint (json_ctx, "max_char", max_char);
+ json_array_begin (json_ctx, "timings");
FOR_EACH_IMPL (impl, 0)
- do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
+ do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
- putchar ('\n');
+ json_array_end (json_ctx);
+ json_element_object_end (json_ctx);
}
int
test_main (void)
{
- size_t i;
+ json_ctx_t json_ctx;
+ size_t i, j;
+ int seek;
test_init ();
+ json_init (&json_ctx, 0, stdout);
- printf ("%20s", "");
- FOR_EACH_IMPL (impl, 0)
- printf ("\t%s", impl->name);
- putchar ('\n');
-
- for (i = 1; i < 8; ++i)
- {
- do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
- do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
- }
+ json_document_begin (&json_ctx);
+ json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
- for (i = 1; i < 8; ++i)
- {
- do_test (i, 64, 256, 23, SMALL_CHAR);
- do_test (i, 64, 256, 23, BIG_CHAR);
- }
+ json_attr_object_begin (&json_ctx, "functions");
+ json_attr_object_begin (&json_ctx, TEST_NAME);
+ json_attr_string (&json_ctx, "bench-variant", "");
- for (i = 0; i < 32; ++i)
- {
- do_test (0, i, i + 1, 23, SMALL_CHAR);
- do_test (0, i, i + 1, 23, BIG_CHAR);
- }
+ json_array_begin (&json_ctx, "ifuncs");
+ FOR_EACH_IMPL (impl, 0)
+ json_element_string (&json_ctx, impl->name);
+ json_array_end (&json_ctx);
- for (i = 1; i < 8; ++i)
- {
- do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
- do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
- }
+ json_array_begin (&json_ctx, "results");
- for (i = 1; i < 8; ++i)
+ for (seek = 0; seek <= 23; seek += 23)
{
- do_test (i, 64, 256, 0, SMALL_CHAR);
- do_test (i, 64, 256, 0, BIG_CHAR);
+ for (j = 1; j < 32; j += j)
+ {
+ for (i = 1; i < 9; ++i)
+ {
+ do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
+ }
+
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
+
+ do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
+ }
+
+ for (i = 0; i < 32; ++i)
+ {
+ do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
+ }
+ if (seek == 0)
+ {
+ break;
+ }
+ }
}
- for (i = 0; i < 32; ++i)
- {
- do_test (0, i, i + 1, 0, SMALL_CHAR);
- do_test (0, i, i + 1, 0, BIG_CHAR);
- }
+ json_array_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_document_end (&json_ctx);
return ret;
}
--
2.25.1
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2
2022-04-22 1:52 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr Noah Goldstein
@ 2022-04-22 1:52 ` Noah Goldstein
2022-04-22 19:06 ` H.J. Lu
2022-04-22 1:52 ` [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
` (2 subsequent siblings)
3 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22 1:52 UTC (permalink / raw)
To: libc-alpha
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.741
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
---
sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
sysdeps/x86_64/strrchr.S | 510 +++++++++++++++---------
sysdeps/x86_64/wcsrchr.S | 268 +------------
4 files changed, 339 insertions(+), 444 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index db1b44c23c..866396e947 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
# undef weak_alias
# define weak_alias(strrchr, rindex)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index 78d1ca6553..69d2f3cdb1 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR __wcsrchr_sse2
#endif
-
#include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index 50d886713e..4d7ba4ceb2 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,360 @@
#include <sysdep.h>
+#ifndef STRRCHR
+# define STRRCHR strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ pcmpeqd
+# define CHAR_SIZE 4
+# define PMINU pminud
+#else
+# define PCMPEQ pcmpeqb
+# define CHAR_SIZE 1
+# define PMINU pminub
+#endif
+
+#define PAGE_SIZE 4096
+#define VEC_SIZE 16
+
.text
-ENTRY (strrchr)
- movd %esi, %xmm1
+ENTRY(STRRCHR)
+ movd %esi, %xmm0
movq %rdi, %rax
- andl $4095, %eax
- punpcklbw %xmm1, %xmm1
- cmpq $4032, %rax
- punpcklwd %xmm1, %xmm1
- pshufd $0, %xmm1, %xmm1
+ andl $(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+#endif
+ pshufd $0, %xmm0, %xmm0
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(cross_page)
- movdqu (%rdi), %xmm0
+
+L(cross_page_continue):
+ movups (%rdi), %xmm1
pxor %xmm2, %xmm2
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %ecx
- pmovmskb %xmm3, %edx
- testq %rdx, %rdx
- je L(next_48_bytes)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rcx, %rax
- je L(exit)
- bsrq %rax, %rax
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret0):
ret
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
.p2align 4
-L(next_48_bytes):
- movdqu 16(%rdi), %xmm4
- movdqa %xmm4, %xmm5
- movdqu 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm2, %xmm5
- movdqu 48(%rdi), %xmm0
- pmovmskb %xmm5, %edx
- movdqa %xmm3, %xmm5
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm5
- pcmpeqb %xmm0, %xmm2
- salq $16, %rdx
- pmovmskb %xmm3, %r8d
- pmovmskb %xmm5, %eax
- pmovmskb %xmm2, %esi
- salq $32, %r8
- salq $32, %rax
- pcmpeqb %xmm1, %xmm0
- orq %rdx, %rax
- movq %rsi, %rdx
- pmovmskb %xmm4, %esi
- salq $48, %rdx
- salq $16, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
- orq %rcx, %rsi
- orq %rdx, %rax
- je L(loop_header2)
- leaq -1(%rax), %rcx
- xorq %rax, %rcx
- andq %rcx, %rsi
- je L(exit)
- bsrq %rsi, %rsi
- leaq (%rdi,%rsi), %rax
+L(first_vec_x0_test):
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ testl %eax, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %r8, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
-L(loop_header2):
- testq %rsi, %rsi
- movq %rdi, %rcx
- je L(no_c_found)
-L(loop_header):
- addq $64, %rdi
- pxor %xmm7, %xmm7
- andq $-64, %rdi
- jmp L(loop_entry)
+L(first_vec_x1):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
.p2align 4
-L(loop64):
- testq %rdx, %rdx
- cmovne %rdx, %rsi
- cmovne %rdi, %rcx
- addq $64, %rdi
-L(loop_entry):
- movdqa 32(%rdi), %xmm3
- pxor %xmm6, %xmm6
- movdqa 48(%rdi), %xmm2
- movdqa %xmm3, %xmm0
- movdqa 16(%rdi), %xmm4
- pminub %xmm2, %xmm0
- movdqa (%rdi), %xmm5
- pminub %xmm4, %xmm0
- pminub %xmm5, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %eax
- movdqa %xmm5, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %r9d
- movdqa %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- movdqa %xmm3, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $16, %rdx
- pmovmskb %xmm0, %r10d
- movdqa %xmm2, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $32, %r10
- orq %r10, %rdx
- pmovmskb %xmm0, %r8d
- orq %r9, %rdx
- salq $48, %r8
- orq %r8, %rdx
+L(first_vec_x1_test):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
testl %eax, %eax
- je L(loop64)
- pcmpeqb %xmm6, %xmm4
- pcmpeqb %xmm6, %xmm3
- pcmpeqb %xmm6, %xmm5
- pmovmskb %xmm4, %eax
- pmovmskb %xmm3, %r10d
- pcmpeqb %xmm6, %xmm2
- pmovmskb %xmm5, %r9d
- salq $32, %r10
- salq $16, %rax
- pmovmskb %xmm2, %r8d
- orq %r10, %rax
- orq %r9, %rax
- salq $48, %r8
- orq %r8, %rax
- leaq -1(%rax), %r8
- xorq %rax, %r8
- andq %r8, %rdx
- cmovne %rdi, %rcx
- cmovne %rdx, %rsi
- bsrq %rsi, %rsi
- leaq (%rcx,%rsi), %rax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm3, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+ andq $-VEC_SIZE, %rdi
+
+ movaps VEC_SIZE(%rdi), %xmm2
+ pxor %xmm3, %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pmovmskb %xmm3, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
+
+ movaps (VEC_SIZE * 2)(%rdi), %xmm3
+ pxor %xmm4, %xmm4
+ PCMPEQ %xmm3, %xmm4
+ pmovmskb %xmm4, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+
+ addq $VEC_SIZE, %rdi
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ andq $-(VEC_SIZE * 2), %rdi
+ .p2align 4
+L(first_loop):
+ /* Do 2x VEC at a time. */
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+ /* Since SSE2 no pminud so wcsrchr needs seperate logic for
+ detecting zero. Note if this is found to be a bottleneck it
+ may be worth adding an SSE4.1 wcsrchr implementation. */
+#ifdef USE_AS_WCSRCHR
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Use `addl` 1) so we can undo it with `subl` and 2) it can
+ macro-fuse with `jz`. */
+ addl %ecx, %eax
+ jz L(first_loop)
+
+ /* Check if there is zero match. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+ /* Check if there was a match in last iteration. */
+ subl %ecx, %eax
+ jnz L(new_match)
+
+L(first_loop_old_match):
+ PCMPEQ %xmm0, %xmm2
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ addl %eax, %ecx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ sall $16, %eax
+ orl %ecx, %eax
+
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+ /* We can't reuse either of the old comparisons as since we mask
+ of zeros after first zero (instead of using the full
+ comparison) we can't gurantee no interference between match
+ after end of string and valid match. */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
+ /* Save minimum state for getting most recent match. We can
+ throw out all previous work. */
.p2align 4
-L(no_c_found):
- movl $1, %esi
- xorl %ecx, %ecx
- jmp L(loop_header)
+L(second_loop_match):
+ movq %rdi, %rsi
+ movaps %xmm4, %xmm2
+ movaps %xmm7, %xmm3
.p2align 4
-L(exit):
- xorl %eax, %eax
+L(second_loop):
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+ /* Since SSE2 no pminud so wcsrchr needs seperate logic for
+ detecting zero. Note if this is found to be a bottleneck it
+ may be worth adding an SSE4.1 wcsrchr implementation. */
+#ifdef USE_AS_WCSRCHR
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Either null term or new occurence of CHAR. */
+ addl %ecx, %eax
+ jz L(second_loop)
+
+ /* No null term so much be new occurence of CHAR. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+
+ subl %ecx, %eax
+ jnz L(second_loop_new_match)
+
+L(second_loop_old_match):
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ sall $16, %eax
+ orl %ecx, %eax
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
+L(second_loop_new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+ /* We can't reuse either of the old comparisons as since we mask
+ of zeros after first zero (instead of using the full
+ comparison) we can't gurantee no interference between match
+ after end of string and valid match. */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(second_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4,, 4
L(cross_page):
- movq %rdi, %rax
- pxor %xmm0, %xmm0
- andq $-64, %rax
- movdqu (%rax), %xmm5
- movdqa %xmm5, %xmm6
- movdqu 16(%rax), %xmm4
- pcmpeqb %xmm1, %xmm5
- pcmpeqb %xmm0, %xmm6
- movdqu 32(%rax), %xmm3
- pmovmskb %xmm6, %esi
- movdqa %xmm4, %xmm6
- movdqu 48(%rax), %xmm2
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm0, %xmm6
- pmovmskb %xmm6, %edx
- movdqa %xmm3, %xmm6
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm0, %xmm6
- pcmpeqb %xmm2, %xmm0
- salq $16, %rdx
- pmovmskb %xmm3, %r9d
- pmovmskb %xmm6, %r8d
- pmovmskb %xmm0, %ecx
- salq $32, %r9
- salq $32, %r8
- pcmpeqb %xmm1, %xmm2
- orq %r8, %rdx
- salq $48, %rcx
- pmovmskb %xmm5, %r8d
- orq %rsi, %rdx
- pmovmskb %xmm4, %esi
- orq %rcx, %rdx
- pmovmskb %xmm2, %ecx
- salq $16, %rsi
- salq $48, %rcx
- orq %r9, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ movaps (%rsi), %xmm1
+ pxor %xmm2, %xmm2
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
movl %edi, %ecx
- subl %eax, %ecx
- shrq %cl, %rdx
- shrq %cl, %rsi
- testq %rdx, %rdx
- je L(loop_header2)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rax, %rsi
- je L(exit)
- bsrq %rsi, %rax
+ andl $(VEC_SIZE - 1), %ecx
+ sarl %cl, %edx
+ jz L(cross_page_continue)
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ sarl %cl, %eax
+ leal -1(%rdx), %ecx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret1):
ret
-END (strrchr)
+END(STRRCHR)
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+ weak_alias (STRRCHR, rindex)
+ libc_hidden_builtin_def (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 61552954de..2b80efc5ef 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -1,4 +1,4 @@
-/* wcsrchr with SSSE3
+/* wcsrchr optimized with SSE2.
Copyright (C) 2011-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,266 +16,12 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
- .text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR 1
+#define NO_PMINU 1
- movd %rsi, %xmm1
- mov %rdi, %rcx
- punpckldq %xmm1, %xmm1
- pxor %xmm2, %xmm2
- punpckldq %xmm1, %xmm1
- and $63, %rcx
- cmp $48, %rcx
- ja L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR wcsrchr
+#endif
- movdqu (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match1)
-
- test %rcx, %rcx
- jnz L(return_null)
-
- and $-16, %rdi
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match1):
- test %rcx, %rcx
- jnz L(prolog_find_zero_1)
-
- mov %rax, %r8
- mov %rdi, %rsi
- and $-16, %rdi
- jmp L(loop)
-
- .p2align 4
-L(crosscache):
- and $15, %rcx
- and $-16, %rdi
- pxor %xmm3, %xmm3
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm3
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm3, %rdx
- pmovmskb %xmm0, %rax
- shr %cl, %rdx
- shr %cl, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match)
-
- test %rdx, %rdx
- jnz L(return_null)
-
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match):
- test %rdx, %rdx
- jnz L(prolog_find_zero)
-
- mov %rax, %r8
- lea (%rdi, %rcx), %rsi
-
-/* Loop start on aligned string. */
- .p2align 4
-L(loop):
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm3
- pcmpeqd %xmm3, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm3
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm3, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm4
- pcmpeqd %xmm4, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm4
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm4, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm5
- pcmpeqd %xmm5, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm5
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm5, %rax
- or %rax, %rcx
- jz L(loop)
-
- .p2align 4
-L(matches):
- test %rax, %rax
- jnz L(match)
-L(return_value):
- test %r8, %r8
- jz L(return_null)
- mov %r8, %rax
- mov %rsi, %rdi
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match):
- pmovmskb %xmm2, %rcx
- test %rcx, %rcx
- jnz L(find_zero)
- mov %rax, %r8
- mov %rdi, %rsi
- jmp L(loop)
-
- .p2align 4
-L(find_zero):
- test $15, %cl
- jnz L(find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(find_zero_in_second_wchar)
- test $15, %ch
- jnz L(find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_value)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_value)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero):
- add %rcx, %rdi
- mov %rdx, %rcx
-L(prolog_find_zero_1):
- test $15, %cl
- jnz L(prolog_find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(prolog_find_zero_in_second_wchar)
- test $15, %ch
- jnz L(prolog_find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_null)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_null)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match_second_wchar):
- lea -12(%rdi), %rax
- ret
-
- .p2align 4
-L(match_third_wchar):
- lea -8(%rdi), %rax
- ret
-
- .p2align 4
-L(match_fourth_wchar):
- lea -4(%rdi), %rax
- ret
-
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-
-END (wcsrchr)
+#include "../strrchr.S"
--
2.25.1
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2
2022-04-22 1:52 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr Noah Goldstein
2022-04-22 1:52 ` [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
@ 2022-04-22 1:52 ` Noah Goldstein
2022-04-22 19:03 ` H.J. Lu
2022-04-22 1:52 ` [PATCH v3 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
2022-04-22 18:29 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr H.J. Lu
3 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22 1:52 UTC (permalink / raw)
To: libc-alpha
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.832
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
---
sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
1 file changed, 269 insertions(+), 157 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 1df2adfad0..bd26ba80d5 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
# ifdef USE_AS_WCSRCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMIN vpminud
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMIN vpminub
+# define CHAR_SIZE 1
# endif
# ifndef VZEROUPPER
@@ -41,196 +45,304 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
- .section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
- movd %esi, %xmm4
- movl %edi, %ecx
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+ movd %esi, %xmm7
+ movl %edi, %eax
/* Broadcast CHAR to YMM4. */
- VPBROADCAST %xmm4, %ymm4
+ VPBROADCAST %xmm7, %ymm7
vpxor %xmm0, %xmm0, %xmm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ /* Shift here instead of `andl` to save code size (saves a fetch
+ block). */
+ sall $20, %eax
+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
+ ja L(cross_page)
+L(page_cross_continue):
vmovdqu (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- addq $VEC_SIZE, %rdi
+ /* Check end of string match. */
+ VPCMPEQ %ymm1, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ /* Only check match with search CHAR if needed. */
+ VPCMPEQ %ymm1, %ymm7, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Check if match before first zero. */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
+ .p2align 4,, 10
+L(first_vec_x1):
+ VPCMPEQ %ymm2, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jnz L(first_vec_x1_return)
+
+ .p2align 4,, 4
+L(first_vec_x0_test):
+ VPCMPEQ %ymm1, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ testl %eax, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
+ addq %r8, %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret1):
+ VZEROUPPER_RETURN
+ .p2align 4,, 10
+L(first_vec_x0_x1_test):
+ VPCMPEQ %ymm2, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ /* Check ymm2 for search CHAR match. If no match then check ymm1
+ before returning. */
testl %eax, %eax
- jnz L(first_vec)
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsrl %eax, %eax
+ leaq 1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
- testl %ecx, %ecx
- jnz L(return_null)
- andq $-VEC_SIZE, %rdi
- xorl %edx, %edx
- jmp L(aligned_loop)
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMPEQ %ymm3, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ blsmskl %ecx, %ecx
+ /* If no in-range search CHAR match in ymm3 then need to check
+ ymm1/ymm2 for an earlier match (we delay checking search
+ CHAR matches until needed). */
+ andl %ecx, %eax
+ jz L(first_vec_x0_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
+
.p2align 4
-L(first_vec):
- /* Check if there is a nul CHAR. */
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+
+ /* Align src. */
+ orq $(VEC_SIZE - 1), %rdi
+ vmovdqu 1(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
testl %ecx, %ecx
- jnz L(char_and_nul_in_first_vec)
+ jnz L(first_vec_x1)
- /* Remember the match and keep searching. */
- movl %eax, %edx
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rdi
- jmp L(aligned_loop)
+ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
+ VPCMPEQ %ymm3, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ addq $(VEC_SIZE + 1), %rdi
+ andq $-(VEC_SIZE * 2), %rdi
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %edx
- vpmovmskb %ymm3, %eax
- shrl %cl, %edx
- shrl %cl, %eax
- addq $VEC_SIZE, %rdi
-
- /* Check if there is a CHAR. */
+L(first_aligned_loop):
+ /* Do 2x VEC at a time. Any more and the cost of finding the
+ match outweights loop benefit. */
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
+
+ VPCMPEQ %ymm4, %ymm7, %ymm6
+ VPMIN %ymm4, %ymm5, %ymm8
+ VPCMPEQ %ymm5, %ymm7, %ymm10
+ vpor %ymm6, %ymm10, %ymm5
+ VPCMPEQ %ymm8, %ymm0, %ymm8
+ vpor %ymm5, %ymm8, %ymm9
+
+ vpmovmskb %ymm9, %eax
+ addq $(VEC_SIZE * 2), %rdi
+ /* No zero or search CHAR. */
testl %eax, %eax
- jnz L(found_char)
-
- testl %edx, %edx
- jnz L(return_null)
+ jz L(first_aligned_loop)
- jmp L(aligned_loop)
-
- .p2align 4
-L(found_char):
- testl %edx, %edx
- jnz L(char_and_nul)
+ /* If no zero CHAR then go to second loop (this allows us to
+ throw away all prior work). */
+ vpmovmskb %ymm8, %ecx
+ testl %ecx, %ecx
+ jz L(second_aligned_loop_prep)
- /* Remember the match and keep searching. */
- movl %eax, %edx
- leaq (%rdi, %rcx), %rsi
+ /* Search char could be zero so we need to get the true match.
+ */
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(first_aligned_loop_return)
- .p2align 4
-L(aligned_loop):
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- add $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
+ .p2align 4,, 4
+L(first_vec_x1_or_x2):
+ VPCMPEQ %ymm3, %ymm7, %ymm3
+ VPCMPEQ %ymm2, %ymm7, %ymm2
vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jz L(aligned_loop)
-
- .p2align 4
-L(char_nor_null):
- /* Find a CHAR or a nul CHAR in a loop. */
- testl %eax, %eax
- jnz L(match)
-L(return_value):
- testl %edx, %edx
- jz L(return_null)
- movl %edx, %eax
- movq %rsi, %rdi
+ vpmovmskb %ymm2, %edx
+ /* Use add for macro-fusion. */
+ addq %rax, %rdx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ salq $32, %rax
+ addq %rdx, %rax
+ bsrq %rax, %rax
+ leaq 1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
+ .p2align 4,, 8
+L(first_aligned_loop_return):
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %edx
+ salq $32, %rcx
+ orq %rdx, %rcx
+
+ vpmovmskb %ymm10, %eax
+ vpmovmskb %ymm6, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ blsmskq %rcx, %rcx
+ andq %rcx, %rax
+ jz L(first_vec_x1_or_x2)
+
+ bsrq %rax, %rax
+ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %eax
+ andq $-CHAR_SIZE, %rax
# endif
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
+ VZEROUPPER_RETURN
+ /* Search char cannot be zero. */
.p2align 4
-L(match):
- /* Find a CHAR. Check if there is a nul CHAR. */
- vpmovmskb %ymm2, %ecx
- testl %ecx, %ecx
- jnz L(find_nul)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
+L(second_aligned_loop_set_furthest_match):
+ /* Save VEC and pointer from most recent match. */
+L(second_aligned_loop_prep):
movq %rdi, %rsi
- jmp L(aligned_loop)
+ vmovdqu %ymm6, %ymm2
+ vmovdqu %ymm10, %ymm3
.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %ecx
- andl $0x11111111, %eax
-# endif
- /* Mask out any matching bits after the nul CHAR. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
+L(second_aligned_loop):
+ /* Search 2x at at time. */
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
+
+ VPCMPEQ %ymm4, %ymm7, %ymm6
+ VPMIN %ymm4, %ymm5, %ymm1
+ VPCMPEQ %ymm5, %ymm7, %ymm10
+ vpor %ymm6, %ymm10, %ymm5
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpor %ymm5, %ymm1, %ymm9
+
+ vpmovmskb %ymm9, %eax
+ addq $(VEC_SIZE * 2), %rdi
testl %eax, %eax
- /* If there is no CHAR here, return the remembered one. */
- jz L(return_value)
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
- VZEROUPPER_RETURN
-
- .p2align 4
-L(char_and_nul):
- /* Find both a CHAR and a nul CHAR. */
- addq %rcx, %rdi
- movl %edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %ecx
- andl $0x11111111, %eax
-# endif
- /* Mask out any matching bits after the nul CHAR. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
+ jz L(second_aligned_loop)
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jz L(second_aligned_loop_set_furthest_match)
+ vpmovmskb %ymm5, %eax
testl %eax, %eax
- /* Return null pointer if the nul CHAR comes first. */
- jz L(return_null)
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ jnz L(return_new_match)
+
+ /* This is the hot patch. We know CHAR is inbounds and that
+ ymm3/ymm2 have latest match. */
+ .p2align 4,, 4
+L(return_old_match):
+ vpmovmskb %ymm3, %eax
+ vpmovmskb %ymm2, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ bsrq %rax, %rax
+ /* Search char cannot be zero so safe to just use lea for
+ wcsrchr. */
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
VZEROUPPER_RETURN
- .p2align 4
-L(return_null):
- xorl %eax, %eax
+ /* Last iteration also potentially has a match. */
+ .p2align 4,, 8
+L(return_new_match):
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %edx
+ salq $32, %rcx
+ orq %rdx, %rcx
+
+ vpmovmskb %ymm10, %eax
+ vpmovmskb %ymm6, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ blsmskq %rcx, %rcx
+ andq %rcx, %rax
+ jz L(return_old_match)
+ bsrq %rax, %rax
+ /* Search char cannot be zero so safe to just use lea for
+ wcsrchr. */
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
VZEROUPPER_RETURN
-END (STRRCHR)
+ .p2align 4,, 4
+L(cross_page):
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ /* Shift out zero CHAR matches that are before the begining of
+ src (rdi). */
+ shrxl %edi, %ecx, %ecx
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+ VPCMPEQ %ymm1, %ymm7, %ymm1
+ vpmovmskb %ymm1, %eax
+
+ /* Shift out search CHAR matches that are before the begining of
+ src (rdi). */
+ shrxl %edi, %eax, %eax
+ blsmskl %ecx, %ecx
+ /* Check if any search CHAR match in range. */
+ andl %ecx, %eax
+ jz L(ret2)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret2):
+ VZEROUPPER_RETURN
+END(STRRCHR)
#endif
--
2.25.1
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v3 4/4] x86: Optimize {str|wcs}rchr-evex
2022-04-22 1:52 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr Noah Goldstein
2022-04-22 1:52 ` [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
2022-04-22 1:52 ` [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
@ 2022-04-22 1:52 ` Noah Goldstein
2022-04-22 19:04 ` H.J. Lu
2022-04-22 18:29 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr H.J. Lu
3 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22 1:52 UTC (permalink / raw)
To: libc-alpha
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.755
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
---
sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
1 file changed, 290 insertions(+), 181 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index adeddaed32..8014c285b3 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,351 @@
# define STRRCHR __strrchr_evex
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
# ifdef USE_AS_WCSRCHR
+# define SHIFT_REG esi
+
+# define kunpck kunpckbw
+# define kmov_2x kmovd
+# define maskz_2x ecx
+# define maskm_2x eax
+# define CHAR_SIZE 4
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
-# define SHIFT_REG r8d
+# define VPCMP vpcmpd
# else
+# define SHIFT_REG edi
+
+# define kunpck kunpckdq
+# define kmov_2x kmovq
+# define maskz_2x rcx
+# define maskm_2x rax
+
+# define CHAR_SIZE 1
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
-# define SHIFT_REG ecx
+# define VPCMP vpcmpb
# endif
# define XMMZERO xmm16
# define YMMZERO ymm16
# define YMMMATCH ymm17
-# define YMM1 ymm18
+# define YMMSAVE ymm18
+
+# define YMM1 ymm19
+# define YMM2 ymm20
+# define YMM3 ymm21
+# define YMM4 ymm22
+# define YMM5 ymm23
+# define YMM6 ymm24
+# define YMM7 ymm25
+# define YMM8 ymm26
-# define VEC_SIZE 32
- .section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
- movl %edi, %ecx
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ .section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+ movl %edi, %eax
/* Broadcast CHAR to YMMMATCH. */
VPBROADCAST %esi, %YMMMATCH
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
-
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jg L(cross_page_boundary)
+L(page_cross_continue):
VMOVU (%rdi), %YMM1
-
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ /* k0 has a 1 for each zero CHAR in YMM1. */
+ VPTESTN %YMM1, %YMM1, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
-
- addq $VEC_SIZE, %rdi
-
- testl %eax, %eax
- jnz L(first_vec)
-
testl %ecx, %ecx
- jnz L(return_null)
-
- andq $-VEC_SIZE, %rdi
- xorl %edx, %edx
- jmp L(aligned_loop)
-
- .p2align 4
-L(first_vec):
- /* Check if there is a null byte. */
- testl %ecx, %ecx
- jnz L(char_and_nul_in_first_vec)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rdi
- jmp L(aligned_loop)
-
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ jz L(aligned_more)
+ /* fallthrough: zero CHAR in first VEC. */
+ /* K1 has a 1 for each search CHAR match in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k1, %eax
+ /* Build mask up until first zero CHAR (used to mask of
+ potential search CHAR matches past the end of the string).
+ */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ /* Get last match (the `andl` removed any out of bounds
+ matches). */
+ bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
- bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
# endif
+L(ret0):
+ ret
- VMOVA (%rdi), %YMM1
-
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
+ /* Returns for first vec x1/x2/x3 have hard coded backward
+ search path for earlier matches. */
+ .p2align 4,, 6
+L(first_vec_x1):
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
+ kmovd %k1, %eax
+ blsmskl %ecx, %ecx
+ /* eax non-zero if search CHAR in range. */
+ andl %ecx, %eax
+ jnz L(first_vec_x1_return)
+
+ /* fallthrough: no match in YMM2 then need to check for earlier
+ matches (in YMM1). */
+ .p2align 4,, 4
+L(first_vec_x0_test):
VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %edx
kmovd %k1, %eax
-
- shrxl %SHIFT_REG, %edx, %edx
- shrxl %SHIFT_REG, %eax, %eax
- addq $VEC_SIZE, %rdi
-
- /* Check if there is a CHAR. */
testl %eax, %eax
- jnz L(found_char)
-
- testl %edx, %edx
- jnz L(return_null)
-
- jmp L(aligned_loop)
-
- .p2align 4
-L(found_char):
- testl %edx, %edx
- jnz L(char_and_nul)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
- leaq (%rdi, %rcx), %rsi
+ jz L(ret1)
+ bsrl %eax, %eax
+# ifdef USE_AS_WCSRCHR
+ leaq (%rsi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rsi, %rax
+# endif
+L(ret1):
+ ret
- .p2align 4
-L(aligned_loop):
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
+ .p2align 4,, 10
+L(first_vec_x1_or_x2):
+ VPCMP $0, %YMM3, %YMMMATCH, %k3
+ VPCMP $0, %YMM2, %YMMMATCH, %k2
+ /* K2 and K3 have 1 for any search CHAR match. Test if any
+ matches between either of them. Otherwise check YMM1. */
+ kortestd %k2, %k3
+ jz L(first_vec_x0_test)
+
+ /* Guaranteed that YMM2 and YMM3 are within range so merge the
+ two bitmasks then get last result. */
+ kunpck %k2, %k3, %k3
+ kmovq %k3, %rax
+ bsrq %rax, %rax
+ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %ecx
+ .p2align 4,, 6
+L(first_vec_x3):
+ VPCMP $0, %YMMMATCH, %YMM4, %k1
kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ blsmskl %ecx, %ecx
+ /* If no search CHAR match in range check YMM1/YMM2/YMM3. */
+ andl %ecx, %eax
+ jz L(first_vec_x1_or_x2)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VMOVA (%rdi), %YMM1
- add $VEC_SIZE, %rdi
+ .p2align 4,, 6
+L(first_vec_x0_x1_test):
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
+ kmovd %k1, %eax
+ /* Check YMM2 for last match first. If no match try YMM1. */
+ testl %eax, %eax
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %ecx
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMP $0, %YMMMATCH, %YMM3, %k1
kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ blsmskl %ecx, %ecx
+ /* Check YMM3 for last match first. If no match try YMM2/YMM1.
+ */
+ andl %ecx, %eax
+ jz L(first_vec_x0_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ .p2align 4
+L(aligned_more):
+ /* Need to keep original pointer in case YMM1 has last match. */
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rdi
+ VMOVU VEC_SIZE(%rdi), %YMM2
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
+ VPTESTN %YMM3, %YMM3, %k0
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
+ VPTESTN %YMM4, %YMM4, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
- orl %eax, %ecx
- jz L(aligned_loop)
+ movq %rdi, %r8
+ testl %ecx, %ecx
+ jnz L(first_vec_x3)
+ andq $-(VEC_SIZE * 2), %rdi
.p2align 4
-L(char_nor_null):
- /* Find a CHAR or a null byte in a loop. */
+L(first_aligned_loop):
+ /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
+ they don't store a match. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
+ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6
+
+ VPCMP $0, %YMM5, %YMMMATCH, %k2
+ vpxord %YMM6, %YMMMATCH, %YMM7
+
+ VPMIN %YMM5, %YMM6, %YMM8
+ VPMIN %YMM8, %YMM7, %YMM7
+
+ VPTESTN %YMM7, %YMM7, %k1
+ subq $(VEC_SIZE * -2), %rdi
+ kortestd %k1, %k2
+ jz L(first_aligned_loop)
+
+ VPCMP $0, %YMM6, %YMMMATCH, %k3
+ VPTESTN %YMM8, %YMM8, %k1
+ ktestd %k1, %k1
+ jz L(second_aligned_loop_prep)
+
+ kortestd %k2, %k3
+ jnz L(return_first_aligned_loop)
+
+ .p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+ VPCMP $0, %YMM4, %YMMMATCH, %k4
+ kmovd %k4, %eax
testl %eax, %eax
- jnz L(match)
-L(return_value):
- testl %edx, %edx
- jz L(return_null)
- movl %edx, %eax
- movq %rsi, %rdi
+ jz L(first_vec_x1_or_x2)
bsrl %eax, %eax
-# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
-# endif
+ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
ret
- .p2align 4
-L(match):
- /* Find a CHAR. Check if there is a null byte. */
- kmovd %k0, %ecx
- testl %ecx, %ecx
- jnz L(find_nul)
+ .p2align 4,, 8
+L(return_first_aligned_loop):
+ VPTESTN %YMM5, %YMM5, %k0
+ kunpck %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(first_vec_x1_or_x2_or_x3)
- /* Remember the match and keep searching. */
- movl %eax, %edx
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4
+ /* We can throw away the work done for the first 4x checks here
+ as we have a later match. This is the 'fast' path per se.
+ */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
movq %rdi, %rsi
- jmp L(aligned_loop)
+ kunpck %k2, %k3, %k4
.p2align 4
-L(find_nul):
- /* Mask out any matching bits after the null byte. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
- testl %eax, %eax
- /* If there is no CHAR here, return the remembered one. */
- jz L(return_value)
- bsrl %eax, %eax
+L(second_aligned_loop):
+ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
+ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2
+
+ VPCMP $0, %YMM1, %YMMMATCH, %k2
+ vpxord %YMM2, %YMMMATCH, %YMM3
+
+ VPMIN %YMM1, %YMM2, %YMM4
+ VPMIN %YMM3, %YMM4, %YMM3
+
+ VPTESTN %YMM3, %YMM3, %k1
+ subq $(VEC_SIZE * -2), %rdi
+ kortestd %k1, %k2
+ jz L(second_aligned_loop)
+
+ VPCMP $0, %YMM2, %YMMMATCH, %k3
+ VPTESTN %YMM4, %YMM4, %k1
+ ktestd %k1, %k1
+ jz L(second_aligned_loop_set_furthest_match)
+
+ kortestd %k2, %k3
+ /* branch here because there is a significant advantage in terms
+ of output dependency chains in using edx. */
+ jnz L(return_new_match)
+L(return_old_match):
+ kmovq %k4, %rax
+ bsrq %rax, %rax
+ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+ ret
+
+L(return_new_match):
+ VPTESTN %YMM1, %YMM1, %k0
+ kunpck %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(return_old_match)
+
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+L(cross_page_boundary):
+ /* eax contains all the page offset bits of src (rdi). `xor rdi,
+ rax` sets pointer with all page offset bits cleared so
+ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+ before page cross (guaranteed to be safe to read). Doing this
+ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+ a bit of code size. */
+ xorq %rdi, %rax
+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+ VPTESTN %YMM1, %YMM1, %k0
+ kmovd %k0, %ecx
+
+ /* Shift out zero CHAR matches that are before the beginning of
+ src (rdi). */
# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ movl %edi, %esi
+ andl $(VEC_SIZE - 1), %esi
+ shrl $2, %esi
# endif
- ret
+ shrxl %SHIFT_REG, %ecx, %ecx
- .p2align 4
-L(char_and_nul):
- /* Find both a CHAR and a null byte. */
- addq %rcx, %rdi
- movl %edx, %ecx
-L(char_and_nul_in_first_vec):
- /* Mask out any matching bits after the null byte. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
- testl %eax, %eax
- /* Return null pointer if the null byte comes first. */
- jz L(return_null)
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+
+ /* Found zero CHAR so need to test for search CHAR. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k1, %eax
+ /* Shift out search CHAR matches that are before the beginning of
+ src (rdi). */
+ shrxl %SHIFT_REG, %eax, %eax
+
+ /* Check if any search CHAR match in range. */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret3)
bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ addq %rdi, %rax
# endif
+L(ret3):
ret
- .p2align 4
-L(return_null):
- xorl %eax, %eax
- ret
-
-END (STRRCHR)
+END(STRRCHR)
#endif
--
2.25.1
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex
2022-04-21 23:59 ` H.J. Lu
@ 2022-04-22 1:53 ` Noah Goldstein
0 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22 1:53 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 21, 2022 at 7:00 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 3:22 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The new code unrolls the main loop slightly without adding too much
> > overhead and minimizes the comparisons for the search CHAR.
> >
> > Geometric Mean of all benchmarks New / Old: 0.755
> > See email for all results.
> >
> > Full xcheck passes on x86_64 with and without multiarch enabled.
> > ---
> > sysdeps/x86_64/multiarch/strrchr-evex.S | 441 ++++++++++++++----------
> > 1 file changed, 259 insertions(+), 182 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
> > index adeddaed32..5cf9a8315b 100644
> > --- a/sysdeps/x86_64/multiarch/strrchr-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
> > @@ -24,242 +24,319 @@
> > # define STRRCHR __strrchr_evex
> > # endif
> >
> > -# define VMOVU vmovdqu64
> > -# define VMOVA vmovdqa64
> > +# define VMOVU vmovdqu64
> > +# define VMOVA vmovdqa64
> >
> > # ifdef USE_AS_WCSRCHR
> > +# define SHIFT_REG esi
> > +
> > +# define kunpck kunpckbw
> > +# define kmov_2x kmovd
> > +# define maskz_2x ecx
> > +# define maskm_2x eax
> > +# define CHAR_SIZE 4
> > +# define VPMIN vpminud
> > +# define VPTESTN vptestnmd
> > # define VPBROADCAST vpbroadcastd
> > -# define VPCMP vpcmpd
> > -# define SHIFT_REG r8d
> > +# define VPCMP vpcmpd
> > # else
> > +# define SHIFT_REG edi
> > +
> > +# define kunpck kunpckdq
> > +# define kmov_2x kmovq
> > +# define maskz_2x rcx
> > +# define maskm_2x rax
> > +
> > +# define CHAR_SIZE 1
> > +# define VPMIN vpminub
> > +# define VPTESTN vptestnmb
> > # define VPBROADCAST vpbroadcastb
> > -# define VPCMP vpcmpb
> > -# define SHIFT_REG ecx
> > +# define VPCMP vpcmpb
> > # endif
> >
> > # define XMMZERO xmm16
> > # define YMMZERO ymm16
> > # define YMMMATCH ymm17
> > -# define YMM1 ymm18
> > +# define YMMSAVE ymm18
> > +
> > +# define YMM1 ymm19
> > +# define YMM2 ymm20
> > +# define YMM3 ymm21
> > +# define YMM4 ymm22
> > +# define YMM5 ymm23
> > +# define YMM6 ymm24
> > +# define YMM7 ymm25
> > +# define YMM8 ymm26
> >
> > -# define VEC_SIZE 32
> >
> > - .section .text.evex,"ax",@progbits
> > -ENTRY (STRRCHR)
> > - movl %edi, %ecx
> > +# define VEC_SIZE 32
> > +# define PAGE_SIZE 4096
> > + .section .text.evex, "ax", @progbits
> > +ENTRY(STRRCHR)
> > + movl %edi, %eax
> > /* Broadcast CHAR to YMMMATCH. */
> > VPBROADCAST %esi, %YMMMATCH
> >
> > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO
> > -
> > - /* Check if we may cross page boundary with one vector load. */
> > - andl $(2 * VEC_SIZE - 1), %ecx
> > - cmpl $VEC_SIZE, %ecx
> > - ja L(cros_page_boundary)
> > + andl $(PAGE_SIZE - 1), %eax
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > + jg L(cross_page_boundary)
> >
> > +L(page_cross_continue):
> > VMOVU (%rdi), %YMM1
> > -
> > - /* Each bit in K0 represents a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM1, %k0
> > - /* Each bit in K1 represents a CHAR in YMM1. */
> > - VPCMP $0, %YMMMATCH, %YMM1, %k1
> > + VPTESTN %YMM1, %YMM1, %k0
>
> Please add some comments for mask register on VPTESTN tests.
Added in V3.
>
> > kmovd %k0, %ecx
> > - kmovd %k1, %eax
> > -
> > - addq $VEC_SIZE, %rdi
> > -
> > - testl %eax, %eax
> > - jnz L(first_vec)
> > -
> > testl %ecx, %ecx
> > - jnz L(return_null)
> > -
> > - andq $-VEC_SIZE, %rdi
> > - xorl %edx, %edx
> > - jmp L(aligned_loop)
> > -
> > - .p2align 4
> > -L(first_vec):
> > - /* Check if there is a null byte. */
> > - testl %ecx, %ecx
> > - jnz L(char_and_nul_in_first_vec)
> > -
> > - /* Remember the match and keep searching. */
> > - movl %eax, %edx
> > - movq %rdi, %rsi
> > - andq $-VEC_SIZE, %rdi
> > - jmp L(aligned_loop)
> > -
> > - .p2align 4
> > -L(cros_page_boundary):
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > -
> > + jz L(aligned_more)
> > + VPCMP $0, %YMMMATCH, %YMM1, %k1
>
> Please add some comments.
Added in V3.
>
> > + kmovd %k1, %eax
> > + blsmskl %ecx, %ecx
> > + andl %ecx, %eax
> > + jz L(ret0)
> > + bsrl %eax, %eax
> > # ifdef USE_AS_WCSRCHR
> > - /* NB: Divide shift count by 4 since each bit in K1 represent 4
> > - bytes. */
> > - movl %ecx, %SHIFT_REG
> > - sarl $2, %SHIFT_REG
> > + leaq (%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > + addq %rdi, %rax
> > # endif
> > +L(ret0):
> > + ret
> >
> > - VMOVA (%rdi), %YMM1
> > -
> > - /* Each bit in K0 represents a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM1, %k0
> > - /* Each bit in K1 represents a CHAR in YMM1. */
> > + /* Returns for first vec x1/x2/x3 have hard coded backward
> > + search path for earlier matches. */
> > + .p2align 4,, 6
> > +L(first_vec_x1):
> > + VPCMP $0, %YMMMATCH, %YMM2, %k1
> > + kmovd %k1, %eax
> > + blsmskl %ecx, %ecx
> > + andl %ecx, %eax
> > + jnz L(first_vec_x1_return)
> > + .p2align 4,, 4
> > +L(first_vec_x0_test):
> > VPCMP $0, %YMMMATCH, %YMM1, %k1
> > - kmovd %k0, %edx
> > kmovd %k1, %eax
> > -
> > - shrxl %SHIFT_REG, %edx, %edx
> > - shrxl %SHIFT_REG, %eax, %eax
> > - addq $VEC_SIZE, %rdi
> > -
> > - /* Check if there is a CHAR. */
> > testl %eax, %eax
> > - jnz L(found_char)
> > -
> > - testl %edx, %edx
> > - jnz L(return_null)
> > -
> > - jmp L(aligned_loop)
> > -
> > - .p2align 4
> > -L(found_char):
> > - testl %edx, %edx
> > - jnz L(char_and_nul)
> > -
> > - /* Remember the match and keep searching. */
> > - movl %eax, %edx
> > - leaq (%rdi, %rcx), %rsi
> > + jz L(ret1)
> > + bsrl %eax, %eax
> > +# ifdef USE_AS_WCSRCHR
> > + leaq (%rsi, %rax, CHAR_SIZE), %rax
> > +# else
> > + addq %rsi, %rax
> > +# endif
> > +L(ret1):
> > + ret
> >
> > - .p2align 4
> > -L(aligned_loop):
> > - VMOVA (%rdi), %YMM1
> > - addq $VEC_SIZE, %rdi
> > + .p2align 4,, 10
> > +L(first_vec_x1_or_x2):
> > + VPCMP $0, %YMM3, %YMMMATCH, %k3
> > + VPCMP $0, %YMM2, %YMMMATCH, %k2
> > + kortestd %k2, %k3
> > + jz L(first_vec_x0_test)
> > +
> > + kunpck %k2, %k3, %k3
> > + kmovq %k3, %rax
> > + bsrq %rax, %rax
> > + leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > - /* Each bit in K0 represents a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM1, %k0
> > - /* Each bit in K1 represents a CHAR in YMM1. */
> > - VPCMP $0, %YMMMATCH, %YMM1, %k1
> > - kmovd %k0, %ecx
> > + .p2align 4,, 6
> > +L(first_vec_x3):
> > + VPCMP $0, %YMMMATCH, %YMM4, %k1
> > kmovd %k1, %eax
> > - orl %eax, %ecx
> > - jnz L(char_nor_null)
> > + blsmskl %ecx, %ecx
> > + andl %ecx, %eax
> > + jz L(first_vec_x1_or_x2)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > - VMOVA (%rdi), %YMM1
> > - add $VEC_SIZE, %rdi
> > + .p2align 4,, 6
> > +L(first_vec_x0_x1_test):
> > + VPCMP $0, %YMMMATCH, %YMM2, %k1
> > + kmovd %k1, %eax
> > + testl %eax, %eax
> > + jz L(first_vec_x0_test)
> > + .p2align 4,, 4
> > +L(first_vec_x1_return):
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > - /* Each bit in K0 represents a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM1, %k0
> > - /* Each bit in K1 represents a CHAR in YMM1. */
> > - VPCMP $0, %YMMMATCH, %YMM1, %k1
> > - kmovd %k0, %ecx
> > + .p2align 4,, 10
> > +L(first_vec_x2):
> > + VPCMP $0, %YMMMATCH, %YMM3, %k1
> > kmovd %k1, %eax
> > - orl %eax, %ecx
> > - jnz L(char_nor_null)
> > + blsmskl %ecx, %ecx
> > + andl %ecx, %eax
> > + jz L(first_vec_x0_x1_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > - VMOVA (%rdi), %YMM1
> > - addq $VEC_SIZE, %rdi
> >
> > - /* Each bit in K0 represents a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM1, %k0
> > - /* Each bit in K1 represents a CHAR in YMM1. */
> > - VPCMP $0, %YMMMATCH, %YMM1, %k1
> > + .p2align 4
> > +L(aligned_more):
> > + /* Need to keep original pointer incase YMM1 has last match. */
> > + movq %rdi, %rsi
> > + andq $-VEC_SIZE, %rdi
> > + VMOVU VEC_SIZE(%rdi), %YMM2
> > + VPTESTN %YMM2, %YMM2, %k0
> > kmovd %k0, %ecx
> > - kmovd %k1, %eax
> > - orl %eax, %ecx
> > - jnz L(char_nor_null)
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x1)
> >
> > - VMOVA (%rdi), %YMM1
> > - addq $VEC_SIZE, %rdi
> > + VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
> > + VPTESTN %YMM3, %YMM3, %k0
> > + kmovd %k0, %ecx
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x2)
> >
> > - /* Each bit in K0 represents a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM1, %k0
> > - /* Each bit in K1 represents a CHAR in YMM1. */
> > - VPCMP $0, %YMMMATCH, %YMM1, %k1
> > + VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
> > + VPTESTN %YMM4, %YMM4, %k0
> > kmovd %k0, %ecx
> > - kmovd %k1, %eax
> > - orl %eax, %ecx
> > - jz L(aligned_loop)
> > + movq %rdi, %r8
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x3)
> >
> > + andq $-(VEC_SIZE * 2), %rdi
> > .p2align 4
> > -L(char_nor_null):
> > - /* Find a CHAR or a null byte in a loop. */
> > +L(first_aligned_loop):
> > + /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
> > + they don't store a match. */
> > + VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
> > + VMOVA (VEC_SIZE * 5)(%rdi), %YMM6
> > +
> > + VPCMP $0, %YMM5, %YMMMATCH, %k2
> > + vpxord %YMM6, %YMMMATCH, %YMM7
> > +
> > + VPMIN %YMM5, %YMM6, %YMM8
> > + VPMIN %YMM8, %YMM7, %YMM7
> > +
> > + VPTESTN %YMM7, %YMM7, %k1
> > + subq $(VEC_SIZE * -2), %rdi
> > + kortestd %k1, %k2
> > + jz L(first_aligned_loop)
> > +
> > + VPCMP $0, %YMM6, %YMMMATCH, %k3
> > + VPTESTN %YMM8, %YMM8, %k1
> > + ktestd %k1, %k1
> > + jz L(second_aligned_loop_prep)
> > +
> > + kortestd %k2, %k3
> > + jnz L(return_first_aligned_loop)
> > +
> > + .p2align 4,, 6
> > +L(first_vec_x1_or_x2_or_x3):
> > + VPCMP $0, %YMM4, %YMMMATCH, %k4
> > + kmovd %k4, %eax
> > testl %eax, %eax
> > - jnz L(match)
> > -L(return_value):
> > - testl %edx, %edx
> > - jz L(return_null)
> > - movl %edx, %eax
> > - movq %rsi, %rdi
> > + jz L(first_vec_x1_or_x2)
> > bsrl %eax, %eax
> > -# ifdef USE_AS_WCSRCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - leaq -VEC_SIZE(%rdi, %rax, 4), %rax
> > -# else
> > - leaq -VEC_SIZE(%rdi, %rax), %rax
> > -# endif
> > + leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
> > ret
> >
> > - .p2align 4
> > -L(match):
> > - /* Find a CHAR. Check if there is a null byte. */
> > - kmovd %k0, %ecx
> > - testl %ecx, %ecx
> > - jnz L(find_nul)
> > + .p2align 4,, 8
> > +L(return_first_aligned_loop):
> > + VPTESTN %YMM5, %YMM5, %k0
> > + kunpck %k0, %k1, %k0
> > + kmov_2x %k0, %maskz_2x
> > +
> > + blsmsk %maskz_2x, %maskz_2x
> > + kunpck %k2, %k3, %k3
> > + kmov_2x %k3, %maskm_2x
> > + and %maskz_2x, %maskm_2x
> > + jz L(first_vec_x1_or_x2_or_x3)
> > +
> > + bsr %maskm_2x, %maskm_2x
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > - /* Remember the match and keep searching. */
> > - movl %eax, %edx
> > + .p2align 4
> > + /* We can throw away the work done for the first 4x checks here
> > + as we have a later match. This is the 'fast' path persay.
> > + */
> > +L(second_aligned_loop_prep):
> > +L(second_aligned_loop_set_furthest_match):
> > movq %rdi, %rsi
> > - jmp L(aligned_loop)
> > + kunpck %k2, %k3, %k4
> >
> > .p2align 4
> > -L(find_nul):
> > - /* Mask out any matching bits after the null byte. */
> > - movl %ecx, %r8d
> > - subl $1, %r8d
> > - xorl %ecx, %r8d
> > - andl %r8d, %eax
> > - testl %eax, %eax
> > - /* If there is no CHAR here, return the remembered one. */
> > - jz L(return_value)
> > - bsrl %eax, %eax
> > +L(second_aligned_loop):
> > + VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
> > + VMOVU (VEC_SIZE * 5)(%rdi), %YMM2
> > +
> > + VPCMP $0, %YMM1, %YMMMATCH, %k2
> > + vpxord %YMM2, %YMMMATCH, %YMM3
> > +
> > + VPMIN %YMM1, %YMM2, %YMM4
> > + VPMIN %YMM3, %YMM4, %YMM3
> > +
> > + VPTESTN %YMM3, %YMM3, %k1
> > + subq $(VEC_SIZE * -2), %rdi
> > + kortestd %k1, %k2
> > + jz L(second_aligned_loop)
> > +
> > + VPCMP $0, %YMM2, %YMMMATCH, %k3
> > + VPTESTN %YMM4, %YMM4, %k1
> > + ktestd %k1, %k1
> > + jz L(second_aligned_loop_set_furthest_match)
> > +
> > + kortestd %k2, %k3
> > + /* branch here because there is a significant advantage interms
> > + of output dependency chance in using edx. */
> > + jnz L(return_new_match)
> > +L(return_old_match):
> > + kmovq %k4, %rax
> > + bsrq %rax, %rax
> > + leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
> > + ret
> > +
> > +L(return_new_match):
> > + VPTESTN %YMM1, %YMM1, %k0
> > + kunpck %k0, %k1, %k0
> > + kmov_2x %k0, %maskz_2x
> > +
> > + blsmsk %maskz_2x, %maskz_2x
> > + kunpck %k2, %k3, %k3
> > + kmov_2x %k3, %maskm_2x
> > + and %maskz_2x, %maskm_2x
> > + jz L(return_old_match)
> > +
> > + bsr %maskm_2x, %maskm_2x
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> > +
> > + /* This block is horribly aligned (% 16 == 15). This is
> > + intentional. The L(cross_page_boundary) block is exactly
> > + 32-bytes of code size. Ultimately this is a cold case so
> > + save the code size by leaving misaligned. */
> > +L(cross_page_boundary):
> > + xorq %rdi, %rax
> > + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
> > + VPTESTN %YMM1, %YMM1, %k0
> > + kmovd %k0, %ecx
> > # ifdef USE_AS_WCSRCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - leaq -VEC_SIZE(%rdi, %rax, 4), %rax
> > -# else
> > - leaq -VEC_SIZE(%rdi, %rax), %rax
> > + movl %edi, %esi
> > + andl $(VEC_SIZE - 1), %esi
> > + shrl $2, %esi
> > # endif
> > - ret
> > + shrxl %SHIFT_REG, %ecx, %ecx
> >
> > - .p2align 4
> > -L(char_and_nul):
> > - /* Find both a CHAR and a null byte. */
> > - addq %rcx, %rdi
> > - movl %edx, %ecx
> > -L(char_and_nul_in_first_vec):
> > - /* Mask out any matching bits after the null byte. */
> > - movl %ecx, %r8d
> > - subl $1, %r8d
> > - xorl %ecx, %r8d
> > - andl %r8d, %eax
> > - testl %eax, %eax
> > - /* Return null pointer if the null byte comes first. */
> > - jz L(return_null)
> > + testl %ecx, %ecx
> > + jz L(page_cross_continue)
> > + VPCMP $0, %YMMMATCH, %YMM1, %k1
> > + kmovd %k1, %eax
> > + shrxl %SHIFT_REG, %eax, %eax
> > + blsmskl %ecx, %ecx
> > + andl %ecx, %eax
> > + jz L(ret3)
> > bsrl %eax, %eax
> > # ifdef USE_AS_WCSRCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - leaq -VEC_SIZE(%rdi, %rax, 4), %rax
> > + leaq (%rdi, %rax, CHAR_SIZE), %rax
> > # else
> > - leaq -VEC_SIZE(%rdi, %rax), %rax
> > + addq %rdi, %rax
> > # endif
> > +L(ret3):
> > ret
> >
> > - .p2align 4
> > -L(return_null):
> > - xorl %eax, %eax
> > - ret
> > -
> > -END (STRRCHR)
> > +END(STRRCHR)
> > #endif
> > --
> > 2.25.1
> >
>
>
> --
> H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2
2022-04-21 23:46 ` H.J. Lu
@ 2022-04-22 1:54 ` Noah Goldstein
0 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22 1:54 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 21, 2022 at 6:46 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 3:22 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The new code unrolls the main loop slightly without adding too much
> > overhead and minimizes the comparisons for the search CHAR.
> >
> > Geometric Mean of all benchmarks New / Old: 0.741
> > See email for all results.
> >
> > Full xcheck passes on x86_64 with and without multiarch enabled.
> > ---
> > sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
> > sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
> > sysdeps/x86_64/strrchr.S | 510 +++++++++++++++---------
> > sysdeps/x86_64/wcsrchr.S | 268 +------------
> > 4 files changed, 339 insertions(+), 444 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > index db1b44c23c..866396e947 100644
> > --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > @@ -17,7 +17,7 @@
> > <https://www.gnu.org/licenses/>. */
> >
> > #if IS_IN (libc)
> > -# define strrchr __strrchr_sse2
> > +# define STRRCHR __strrchr_sse2
> >
> > # undef weak_alias
> > # define weak_alias(strrchr, rindex)
> > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > index 78d1ca6553..69d2f3cdb1 100644
> > --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > @@ -17,7 +17,6 @@
> > <https://www.gnu.org/licenses/>. */
> >
> > #if IS_IN (libc)
> > -# define wcsrchr __wcsrchr_sse2
> > +# define STRRCHR __wcsrchr_sse2
> > #endif
> > -
> > #include "../wcsrchr.S"
> > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> > index 50d886713e..6efb25c880 100644
> > --- a/sysdeps/x86_64/strrchr.S
> > +++ b/sysdeps/x86_64/strrchr.S
> > @@ -19,210 +19,360 @@
> >
> > #include <sysdep.h>
> >
> > +#ifndef STRRCHR
> > +# define STRRCHR strrchr
> > +#endif
> > +
> > +#ifdef USE_AS_WCSRCHR
> > +# define PCMPEQ pcmpeqd
> > +# define CHAR_SIZE 4
> > +# define PMINU pminud
> > +#else
> > +# define PCMPEQ pcmpeqb
> > +# define CHAR_SIZE 1
> > +# define PMINU pminub
> > +#endif
> > +
> > +#define PAGE_SIZE 4096
> > +#define VEC_SIZE 16
> > +
> > .text
> > -ENTRY (strrchr)
> > - movd %esi, %xmm1
> > +ENTRY(STRRCHR)
> > + movd %esi, %xmm0
> > movq %rdi, %rax
> > - andl $4095, %eax
> > - punpcklbw %xmm1, %xmm1
> > - cmpq $4032, %rax
> > - punpcklwd %xmm1, %xmm1
> > - pshufd $0, %xmm1, %xmm1
> > + andl $(PAGE_SIZE - 1), %eax
> > +#ifndef USE_AS_WCSRCHR
> > + punpcklbw %xmm0, %xmm0
> > + punpcklwd %xmm0, %xmm0
> > +#endif
> > + pshufd $0, %xmm0, %xmm0
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > ja L(cross_page)
> > - movdqu (%rdi), %xmm0
> > +
> > +L(cross_page_continue):
> > + movups (%rdi), %xmm1
> > pxor %xmm2, %xmm2
> > - movdqa %xmm0, %xmm3
> > - pcmpeqb %xmm1, %xmm0
> > - pcmpeqb %xmm2, %xmm3
> > - pmovmskb %xmm0, %ecx
> > - pmovmskb %xmm3, %edx
> > - testq %rdx, %rdx
> > - je L(next_48_bytes)
> > - leaq -1(%rdx), %rax
> > - xorq %rdx, %rax
> > - andq %rcx, %rax
> > - je L(exit)
> > - bsrq %rax, %rax
> > + PCMPEQ %xmm1, %xmm2
> > + pmovmskb %xmm2, %ecx
> > + testl %ecx, %ecx
> > + jz L(aligned_more)
> > +
> > + PCMPEQ %xmm0, %xmm1
> > + pmovmskb %xmm1, %eax
> > + leal -1(%rcx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(ret0)
> > + bsrl %eax, %eax
> > addq %rdi, %rax
> > + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > + search CHAR is zero we are correct. Either way `andq
> > + -CHAR_SIZE, %rax` gets the correct result. */
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > +L(ret0):
> > ret
> >
> > + /* Returns for first vec x1/x2 have hard coded backward search
> > + path for earlier matches. */
> > .p2align 4
> > -L(next_48_bytes):
> > - movdqu 16(%rdi), %xmm4
> > - movdqa %xmm4, %xmm5
> > - movdqu 32(%rdi), %xmm3
> > - pcmpeqb %xmm1, %xmm4
> > - pcmpeqb %xmm2, %xmm5
> > - movdqu 48(%rdi), %xmm0
> > - pmovmskb %xmm5, %edx
> > - movdqa %xmm3, %xmm5
> > - pcmpeqb %xmm1, %xmm3
> > - pcmpeqb %xmm2, %xmm5
> > - pcmpeqb %xmm0, %xmm2
> > - salq $16, %rdx
> > - pmovmskb %xmm3, %r8d
> > - pmovmskb %xmm5, %eax
> > - pmovmskb %xmm2, %esi
> > - salq $32, %r8
> > - salq $32, %rax
> > - pcmpeqb %xmm1, %xmm0
> > - orq %rdx, %rax
> > - movq %rsi, %rdx
> > - pmovmskb %xmm4, %esi
> > - salq $48, %rdx
> > - salq $16, %rsi
> > - orq %r8, %rsi
> > - orq %rcx, %rsi
> > - pmovmskb %xmm0, %ecx
> > - salq $48, %rcx
> > - orq %rcx, %rsi
> > - orq %rdx, %rax
> > - je L(loop_header2)
> > - leaq -1(%rax), %rcx
> > - xorq %rax, %rcx
> > - andq %rcx, %rsi
> > - je L(exit)
> > - bsrq %rsi, %rsi
> > - leaq (%rdi,%rsi), %rax
> > +L(first_vec_x0_test):
> > + PCMPEQ %xmm0, %xmm1
> > + pmovmskb %xmm1, %eax
> > + testl %eax, %eax
> > + jz L(ret0)
> > + bsrl %eax, %eax
> > + addq %r8, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > ret
> >
> > .p2align 4
> > -L(loop_header2):
> > - testq %rsi, %rsi
> > - movq %rdi, %rcx
> > - je L(no_c_found)
> > -L(loop_header):
> > - addq $64, %rdi
> > - pxor %xmm7, %xmm7
> > - andq $-64, %rdi
> > - jmp L(loop_entry)
> > +L(first_vec_x1):
> > + PCMPEQ %xmm0, %xmm2
> > + pmovmskb %xmm2, %eax
> > + leal -1(%rcx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(first_vec_x0_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> >
> > .p2align 4
> > -L(loop64):
> > - testq %rdx, %rdx
> > - cmovne %rdx, %rsi
> > - cmovne %rdi, %rcx
> > - addq $64, %rdi
> > -L(loop_entry):
> > - movdqa 32(%rdi), %xmm3
> > - pxor %xmm6, %xmm6
> > - movdqa 48(%rdi), %xmm2
> > - movdqa %xmm3, %xmm0
> > - movdqa 16(%rdi), %xmm4
> > - pminub %xmm2, %xmm0
> > - movdqa (%rdi), %xmm5
> > - pminub %xmm4, %xmm0
> > - pminub %xmm5, %xmm0
> > - pcmpeqb %xmm7, %xmm0
> > - pmovmskb %xmm0, %eax
> > - movdqa %xmm5, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - pmovmskb %xmm0, %r9d
> > - movdqa %xmm4, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - pmovmskb %xmm0, %edx
> > - movdqa %xmm3, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - salq $16, %rdx
> > - pmovmskb %xmm0, %r10d
> > - movdqa %xmm2, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - salq $32, %r10
> > - orq %r10, %rdx
> > - pmovmskb %xmm0, %r8d
> > - orq %r9, %rdx
> > - salq $48, %r8
> > - orq %r8, %rdx
> > +L(first_vec_x1_test):
> > + PCMPEQ %xmm0, %xmm2
> > + pmovmskb %xmm2, %eax
> > testl %eax, %eax
> > - je L(loop64)
> > - pcmpeqb %xmm6, %xmm4
> > - pcmpeqb %xmm6, %xmm3
> > - pcmpeqb %xmm6, %xmm5
> > - pmovmskb %xmm4, %eax
> > - pmovmskb %xmm3, %r10d
> > - pcmpeqb %xmm6, %xmm2
> > - pmovmskb %xmm5, %r9d
> > - salq $32, %r10
> > - salq $16, %rax
> > - pmovmskb %xmm2, %r8d
> > - orq %r10, %rax
> > - orq %r9, %rax
> > - salq $48, %r8
> > - orq %r8, %rax
> > - leaq -1(%rax), %r8
> > - xorq %rax, %r8
> > - andq %r8, %rdx
> > - cmovne %rdi, %rcx
> > - cmovne %rdx, %rsi
> > - bsrq %rsi, %rsi
> > - leaq (%rcx,%rsi), %rax
> > + jz L(first_vec_x0_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4
> > +L(first_vec_x2):
> > + PCMPEQ %xmm0, %xmm3
> > + pmovmskb %xmm3, %eax
> > + leal -1(%rcx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(first_vec_x1_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4
> > +L(aligned_more):
> > + /* Save original pointer if match was in VEC 0. */
> > + movq %rdi, %r8
> > + andq $-VEC_SIZE, %rdi
> > +
> > + movaps VEC_SIZE(%rdi), %xmm2
> > + pxor %xmm3, %xmm3
> > + PCMPEQ %xmm2, %xmm3
> > + pmovmskb %xmm3, %ecx
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x1)
> > +
> > + movaps (VEC_SIZE * 2)(%rdi), %xmm3
> > + pxor %xmm4, %xmm4
> > + PCMPEQ %xmm3, %xmm4
> > + pmovmskb %xmm4, %ecx
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x2)
> > +
> > + addq $VEC_SIZE, %rdi
> > + /* Save pointer again before realigning. */
> > + movq %rdi, %rsi
> > + andq $-(VEC_SIZE * 2), %rdi
> > + .p2align 4
> > +L(first_loop):
> > + /* Do 2x VEC at a time. */
> > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > + /* If SSE2 no pminud so wcsrchr needs separate logic for
> Did you mean "Since", instead of "If"?
Fixed in V3.
>
> > + detecting zero. Note if this is found to be a bottleneck it
> > + may be worth adding an SSE4.1 wcsrchr implementation. */
> > +#ifdef USE_AS_WCSRCHR
> > + movaps %xmm5, %xmm6
> > + pxor %xmm8, %xmm8
> > +
> > + PCMPEQ %xmm8, %xmm5
> > + PCMPEQ %xmm4, %xmm8
> > + por %xmm5, %xmm8
> > +#else
> > + movaps %xmm5, %xmm6
> > + PMINU %xmm4, %xmm5
> > +#endif
> > +
> > + movaps %xmm4, %xmm9
> > + PCMPEQ %xmm0, %xmm4
> > + PCMPEQ %xmm0, %xmm6
> > + movaps %xmm6, %xmm7
> > + por %xmm4, %xmm6
> > +#ifndef USE_AS_WCSRCHR
> > + pxor %xmm8, %xmm8
> > + PCMPEQ %xmm5, %xmm8
> > +#endif
> > + pmovmskb %xmm8, %ecx
> > + pmovmskb %xmm6, %eax
> > +
> > + addq $(VEC_SIZE * 2), %rdi
> > + /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> > + macro-fuse with `jz`. */
> > + addl %ecx, %eax
> > + jz L(first_loop)
> > +
> > + /* Check if there is zero match. */
> > + testl %ecx, %ecx
> > + jz L(second_loop_match)
> > +
> > + /* Check if there was a match in last iteration. */
> > + subl %ecx, %eax
> > + jnz L(new_match)
> > +
> > +L(first_loop_old_match):
> > + PCMPEQ %xmm0, %xmm2
> > + PCMPEQ %xmm0, %xmm3
> > + pmovmskb %xmm2, %ecx
> > + pmovmskb %xmm3, %eax
> > + addl %eax, %ecx
> > + jz L(first_vec_x0_test)
> > + /* NB: We could move this shift to before the branch and save a
> > + bit of code size / performance on the fall through. The
> > + branch leads to the null case which generally seems hotter
> > + than char in first 3x VEC. */
> > + sall $16, %eax
> > + orl %ecx, %eax
> > +
> > + bsrl %eax, %eax
> > + addq %rsi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4
> > +L(new_match):
> > + pxor %xmm6, %xmm6
> > + PCMPEQ %xmm9, %xmm6
> > + pmovmskb %xmm6, %eax
> > + sall $16, %ecx
> > + orl %eax, %ecx
> > +
> > + /* We can't reuse either of the old comparisons since we mask
> > + off zeros after the first zero (instead of using the full
> > + comparison) we can't guarantee no interference between match
> > + after end of string and valid match. */
> > + pmovmskb %xmm4, %eax
> > + pmovmskb %xmm7, %edx
> > + sall $16, %edx
> > + orl %edx, %eax
> > +
> > + leal -1(%ecx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(first_loop_old_match)
> > + bsrl %eax, %eax
> > + addq %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > ret
> >
> > + /* Save minimum state for getting most recent match. We can
> > + throw out all previous work. */
> > .p2align 4
> > -L(no_c_found):
> > - movl $1, %esi
> > - xorl %ecx, %ecx
> > - jmp L(loop_header)
> > +L(second_loop_match):
> > + movq %rdi, %rsi
> > + movaps %xmm4, %xmm2
> > + movaps %xmm7, %xmm3
> >
> > .p2align 4
> > -L(exit):
> > - xorl %eax, %eax
> > +L(second_loop):
> > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > + /* If SSE2 no pminud so wcsrchr needs separate logic for
> Did you mean "Since", instead of "If"?
>
> > + detecting zero. Note if this is found to be a bottleneck it
> > + may be worth adding an SSE4.1 wcsrchr implementation. */
> > +#ifdef USE_AS_WCSRCHR
> > + movaps %xmm5, %xmm6
> > + pxor %xmm8, %xmm8
> > +
> > + PCMPEQ %xmm8, %xmm5
> > + PCMPEQ %xmm4, %xmm8
> > + por %xmm5, %xmm8
> > +#else
> > + movaps %xmm5, %xmm6
> > + PMINU %xmm4, %xmm5
> > +#endif
> > +
> > + movaps %xmm4, %xmm9
> > + PCMPEQ %xmm0, %xmm4
> > + PCMPEQ %xmm0, %xmm6
> > + movaps %xmm6, %xmm7
> > + por %xmm4, %xmm6
> > +#ifndef USE_AS_WCSRCHR
> > + pxor %xmm8, %xmm8
> > + PCMPEQ %xmm5, %xmm8
> > +#endif
> > +
> > + pmovmskb %xmm8, %ecx
> > + pmovmskb %xmm6, %eax
> > +
> > + addq $(VEC_SIZE * 2), %rdi
> > + /* Either null term or new occurrence of CHAR. */
> > + addl %ecx, %eax
> > + jz L(second_loop)
> > +
> > + /* No null term so must be a new occurrence of CHAR. */
> > + testl %ecx, %ecx
> > + jz L(second_loop_match)
> > +
> > +
> > + subl %ecx, %eax
> > + jnz L(second_loop_new_match)
> > +
> > +L(second_loop_old_match):
> > + pmovmskb %xmm2, %ecx
> > + pmovmskb %xmm3, %eax
> > + sall $16, %eax
> > + orl %ecx, %eax
> > + bsrl %eax, %eax
> > + addq %rsi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > ret
> >
> > .p2align 4
> > +L(second_loop_new_match):
> > + pxor %xmm6, %xmm6
> > + PCMPEQ %xmm9, %xmm6
> > + pmovmskb %xmm6, %eax
> > + sall $16, %ecx
> > + orl %eax, %ecx
> > +
> > + /* We can't reuse either of the old comparisons since we mask
> > + off zeros after the first zero (instead of using the full
> > + comparison) we can't guarantee no interference between match
> > + after end of string and valid match. */
> > + pmovmskb %xmm4, %eax
> > + pmovmskb %xmm7, %edx
> > + sall $16, %edx
> > + orl %edx, %eax
> > +
> > + leal -1(%ecx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(second_loop_old_match)
> > + bsrl %eax, %eax
> > + addq %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4,, 4
> > L(cross_page):
> > - movq %rdi, %rax
> > - pxor %xmm0, %xmm0
> > - andq $-64, %rax
> > - movdqu (%rax), %xmm5
> > - movdqa %xmm5, %xmm6
> > - movdqu 16(%rax), %xmm4
> > - pcmpeqb %xmm1, %xmm5
> > - pcmpeqb %xmm0, %xmm6
> > - movdqu 32(%rax), %xmm3
> > - pmovmskb %xmm6, %esi
> > - movdqa %xmm4, %xmm6
> > - movdqu 48(%rax), %xmm2
> > - pcmpeqb %xmm1, %xmm4
> > - pcmpeqb %xmm0, %xmm6
> > - pmovmskb %xmm6, %edx
> > - movdqa %xmm3, %xmm6
> > - pcmpeqb %xmm1, %xmm3
> > - pcmpeqb %xmm0, %xmm6
> > - pcmpeqb %xmm2, %xmm0
> > - salq $16, %rdx
> > - pmovmskb %xmm3, %r9d
> > - pmovmskb %xmm6, %r8d
> > - pmovmskb %xmm0, %ecx
> > - salq $32, %r9
> > - salq $32, %r8
> > - pcmpeqb %xmm1, %xmm2
> > - orq %r8, %rdx
> > - salq $48, %rcx
> > - pmovmskb %xmm5, %r8d
> > - orq %rsi, %rdx
> > - pmovmskb %xmm4, %esi
> > - orq %rcx, %rdx
> > - pmovmskb %xmm2, %ecx
> > - salq $16, %rsi
> > - salq $48, %rcx
> > - orq %r9, %rsi
> > - orq %r8, %rsi
> > - orq %rcx, %rsi
> > + movq %rdi, %rsi
> > + andq $-VEC_SIZE, %rsi
> > + movaps (%rsi), %xmm1
> > + pxor %xmm2, %xmm2
> > + PCMPEQ %xmm1, %xmm2
> > + pmovmskb %xmm2, %edx
> > movl %edi, %ecx
> > - subl %eax, %ecx
> > - shrq %cl, %rdx
> > - shrq %cl, %rsi
> > - testq %rdx, %rdx
> > - je L(loop_header2)
> > - leaq -1(%rdx), %rax
> > - xorq %rdx, %rax
> > - andq %rax, %rsi
> > - je L(exit)
> > - bsrq %rsi, %rax
> > + andl $(VEC_SIZE - 1), %ecx
> > + sarl %cl, %edx
> > + jz L(cross_page_continue)
> > + PCMPEQ %xmm0, %xmm1
> > + pmovmskb %xmm1, %eax
> > + sarl %cl, %eax
> > + leal -1(%rdx), %ecx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(ret1)
> > + bsrl %eax, %eax
> > addq %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > +L(ret1):
> > ret
> > -END (strrchr)
> > +END(STRRCHR)
> >
> > -weak_alias (strrchr, rindex)
> > -libc_hidden_builtin_def (strrchr)
> > +#ifndef USE_AS_WCSRCHR
> > + weak_alias (STRRCHR, rindex)
> > + libc_hidden_builtin_def (STRRCHR)
> > +#endif
> > diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> > index 61552954de..2b80efc5ef 100644
> > --- a/sysdeps/x86_64/wcsrchr.S
> > +++ b/sysdeps/x86_64/wcsrchr.S
> > @@ -1,4 +1,4 @@
> > -/* wcsrchr with SSSE3
> > +/* wcsrchr optimized with SSE2.
> > Copyright (C) 2011-2022 Free Software Foundation, Inc.
> > This file is part of the GNU C Library.
> >
> > @@ -16,266 +16,12 @@
> > License along with the GNU C Library; if not, see
> > <https://www.gnu.org/licenses/>. */
> >
> > -#include <sysdep.h>
> >
> > - .text
> > -ENTRY (wcsrchr)
> > +#define USE_AS_WCSRCHR 1
> > +#define NO_PMINU 1
> >
> > - movd %rsi, %xmm1
> > - mov %rdi, %rcx
> > - punpckldq %xmm1, %xmm1
> > - pxor %xmm2, %xmm2
> > - punpckldq %xmm1, %xmm1
> > - and $63, %rcx
> > - cmp $48, %rcx
> > - ja L(crosscache)
> > +#ifndef STRRCHR
> > +# define STRRCHR wcsrchr
> > +#endif
> >
> > - movdqu (%rdi), %xmm0
> > - pcmpeqd %xmm0, %xmm2
> > - pcmpeqd %xmm1, %xmm0
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm0, %rax
> > - add $16, %rdi
> > -
> > - test %rax, %rax
> > - jnz L(unaligned_match1)
> > -
> > - test %rcx, %rcx
> > - jnz L(return_null)
> > -
> > - and $-16, %rdi
> > - xor %r8, %r8
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(unaligned_match1):
> > - test %rcx, %rcx
> > - jnz L(prolog_find_zero_1)
> > -
> > - mov %rax, %r8
> > - mov %rdi, %rsi
> > - and $-16, %rdi
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(crosscache):
> > - and $15, %rcx
> > - and $-16, %rdi
> > - pxor %xmm3, %xmm3
> > - movdqa (%rdi), %xmm0
> > - pcmpeqd %xmm0, %xmm3
> > - pcmpeqd %xmm1, %xmm0
> > - pmovmskb %xmm3, %rdx
> > - pmovmskb %xmm0, %rax
> > - shr %cl, %rdx
> > - shr %cl, %rax
> > - add $16, %rdi
> > -
> > - test %rax, %rax
> > - jnz L(unaligned_match)
> > -
> > - test %rdx, %rdx
> > - jnz L(return_null)
> > -
> > - xor %r8, %r8
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(unaligned_match):
> > - test %rdx, %rdx
> > - jnz L(prolog_find_zero)
> > -
> > - mov %rax, %r8
> > - lea (%rdi, %rcx), %rsi
> > -
> > -/* Loop start on aligned string. */
> > - .p2align 4
> > -L(loop):
> > - movdqa (%rdi), %xmm0
> > - pcmpeqd %xmm0, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm0
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm0, %rax
> > - or %rax, %rcx
> > - jnz L(matches)
> > -
> > - movdqa (%rdi), %xmm3
> > - pcmpeqd %xmm3, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm3
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm3, %rax
> > - or %rax, %rcx
> > - jnz L(matches)
> > -
> > - movdqa (%rdi), %xmm4
> > - pcmpeqd %xmm4, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm4
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm4, %rax
> > - or %rax, %rcx
> > - jnz L(matches)
> > -
> > - movdqa (%rdi), %xmm5
> > - pcmpeqd %xmm5, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm5
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm5, %rax
> > - or %rax, %rcx
> > - jz L(loop)
> > -
> > - .p2align 4
> > -L(matches):
> > - test %rax, %rax
> > - jnz L(match)
> > -L(return_value):
> > - test %r8, %r8
> > - jz L(return_null)
> > - mov %r8, %rax
> > - mov %rsi, %rdi
> > -
> > - test $15 << 4, %ah
> > - jnz L(match_fourth_wchar)
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match):
> > - pmovmskb %xmm2, %rcx
> > - test %rcx, %rcx
> > - jnz L(find_zero)
> > - mov %rax, %r8
> > - mov %rdi, %rsi
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(find_zero):
> > - test $15, %cl
> > - jnz L(find_zero_in_first_wchar)
> > - test %cl, %cl
> > - jnz L(find_zero_in_second_wchar)
> > - test $15, %ch
> > - jnz L(find_zero_in_third_wchar)
> > -
> > - and $1 << 13 - 1, %rax
> > - jz L(return_value)
> > -
> > - test $15 << 4, %ah
> > - jnz L(match_fourth_wchar)
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(find_zero_in_first_wchar):
> > - test $1, %rax
> > - jz L(return_value)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(find_zero_in_second_wchar):
> > - and $1 << 5 - 1, %rax
> > - jz L(return_value)
> > -
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(find_zero_in_third_wchar):
> > - and $1 << 9 - 1, %rax
> > - jz L(return_value)
> > -
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero):
> > - add %rcx, %rdi
> > - mov %rdx, %rcx
> > -L(prolog_find_zero_1):
> > - test $15, %cl
> > - jnz L(prolog_find_zero_in_first_wchar)
> > - test %cl, %cl
> > - jnz L(prolog_find_zero_in_second_wchar)
> > - test $15, %ch
> > - jnz L(prolog_find_zero_in_third_wchar)
> > -
> > - and $1 << 13 - 1, %rax
> > - jz L(return_null)
> > -
> > - test $15 << 4, %ah
> > - jnz L(match_fourth_wchar)
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero_in_first_wchar):
> > - test $1, %rax
> > - jz L(return_null)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero_in_second_wchar):
> > - and $1 << 5 - 1, %rax
> > - jz L(return_null)
> > -
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero_in_third_wchar):
> > - and $1 << 9 - 1, %rax
> > - jz L(return_null)
> > -
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match_second_wchar):
> > - lea -12(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match_third_wchar):
> > - lea -8(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match_fourth_wchar):
> > - lea -4(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(return_null):
> > - xor %rax, %rax
> > - ret
> > -
> > -END (wcsrchr)
> > +#include "../strrchr.S"
> > --
> > 2.25.1
> >
>
>
> --
> H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v3 1/4] benchtests: Improve bench-strrchr
2022-04-22 1:52 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr Noah Goldstein
` (2 preceding siblings ...)
2022-04-22 1:52 ` [PATCH v3 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
@ 2022-04-22 18:29 ` H.J. Lu
2022-04-22 19:12 ` Noah Goldstein
3 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-22 18:29 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1. Use json-lib for printing results.
> 2. Expose all parameters (before pos, seek_char, and max_char were
> not printed).
> 3. Add benchmarks that test multiple occurrences of seek_char in the
> string.
> ---
> benchtests/bench-strrchr.c | 124 ++++++++++++++++++++++++-------------
> 1 file changed, 80 insertions(+), 44 deletions(-)
>
> diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> index abdae60c51..ce4307a098 100644
> --- a/benchtests/bench-strrchr.c
> +++ b/benchtests/bench-strrchr.c
> @@ -23,6 +23,7 @@
> # define TEST_NAME "strrchr"
> #endif
> #include "bench-string.h"
> +#include "json-lib.h"
>
> #define BIG_CHAR MAX_CHAR
>
> @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
> }
>
> static void
> -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> + CHAR *exp_res)
> {
> CHAR *res = CALL (impl, s, c);
> size_t i, iters = INNER_LOOP_ITERS8;
> @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
>
> if (res != exp_res)
> {
> - error (0, 0, "Wrong result in function %s %p %p", impl->name,
> - res, exp_res);
> + error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
> + exp_res);
> ret = 1;
> return;
> }
> @@ -73,23 +75,25 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> CALL (impl, s, c);
> }
> TIMING_NOW (stop);
> -
> TIMING_DIFF (cur, start, stop);
>
> - TIMING_PRINT_MEAN ((double) cur, (double) iters);
> + json_element_double (json_ctx, (double) cur / (double) iters);
> }
>
> static void
> -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> + int seek_char, int max_char, size_t freq)
> /* For wcsrchr: align here means align not in bytes,
> but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
> {
> size_t i;
> + size_t pos_chunk_sz = freq ? (pos / freq) : pos;
> + size_t last_pos = len;
> CHAR *result;
> CHAR *buf = (CHAR *) buf1;
>
> - align &= 7;
> + align &= (getpagesize () - 1);
Should we add some tests for page boundary cross?
> if ((align + len) * sizeof (CHAR) >= page_size)
> return;
>
> @@ -103,6 +107,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> if ((i > pos || pos >= len) && buf[align + i] == seek_char)
> buf[align + i] = seek_char + 10 + (random () & 15);
> }
> +
> + if (pos_chunk_sz == 0 && pos)
> + pos_chunk_sz = 1;
> +
> + for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
> + {
> + buf[align + i] = seek_char;
> + last_pos = i;
> + }
> +
> buf[align + len] = 0;
>
> if (pos < len)
> @@ -110,66 +124,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> buf[align + pos] = seek_char;
> result = (CHAR *) (buf + align + pos);
> }
> + else if (last_pos < len)
> + result = (CHAR *) (buf + align + last_pos);
> else if (seek_char == 0)
> result = (CHAR *) (buf + align + len);
> else
> result = NULL;
>
> - printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
> + json_element_object_begin (json_ctx);
> + json_attr_uint (json_ctx, "len", len);
> + json_attr_uint (json_ctx, "pos", pos);
> + json_attr_uint (json_ctx, "align", align);
> + json_attr_uint (json_ctx, "freq", freq);
> + json_attr_uint (json_ctx, "seek", seek_char);
> + json_attr_uint (json_ctx, "max_char", max_char);
> + json_array_begin (json_ctx, "timings");
>
> FOR_EACH_IMPL (impl, 0)
> - do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
> + do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
>
> - putchar ('\n');
> + json_array_end (json_ctx);
> + json_element_object_end (json_ctx);
> }
>
> int
> test_main (void)
> {
> - size_t i;
> + json_ctx_t json_ctx;
> + size_t i, j;
> + int seek;
>
> test_init ();
> + json_init (&json_ctx, 0, stdout);
>
> - printf ("%20s", "");
> - FOR_EACH_IMPL (impl, 0)
> - printf ("\t%s", impl->name);
> - putchar ('\n');
> -
> - for (i = 1; i < 8; ++i)
> - {
> - do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
> - do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
> - }
> + json_document_begin (&json_ctx);
> + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
>
> - for (i = 1; i < 8; ++i)
> - {
> - do_test (i, 64, 256, 23, SMALL_CHAR);
> - do_test (i, 64, 256, 23, BIG_CHAR);
> - }
> + json_attr_object_begin (&json_ctx, "functions");
> + json_attr_object_begin (&json_ctx, TEST_NAME);
> + json_attr_string (&json_ctx, "bench-variant", "");
>
> - for (i = 0; i < 32; ++i)
> - {
> - do_test (0, i, i + 1, 23, SMALL_CHAR);
> - do_test (0, i, i + 1, 23, BIG_CHAR);
> - }
> + json_array_begin (&json_ctx, "ifuncs");
> + FOR_EACH_IMPL (impl, 0)
> + json_element_string (&json_ctx, impl->name);
> + json_array_end (&json_ctx);
>
> - for (i = 1; i < 8; ++i)
> - {
> - do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
> - do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
> - }
> + json_array_begin (&json_ctx, "results");
>
> - for (i = 1; i < 8; ++i)
> + for (seek = 0; seek <= 23; seek += 23)
> {
> - do_test (i, 64, 256, 0, SMALL_CHAR);
> - do_test (i, 64, 256, 0, BIG_CHAR);
> + for (j = 1; j < 32; j += j)
> + {
> + for (i = 1; i < 9; ++i)
> + {
> + do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
> + do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
> + }
> +
> + for (i = 1; i < 8; ++i)
> + {
> + do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
> + do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
> +
> + do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
> + do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
> + }
> +
> + for (i = 0; i < 32; ++i)
> + {
> + do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
> + do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
> + }
> + if (seek == 0)
> + {
> + break;
> + }
> + }
> }
>
> - for (i = 0; i < 32; ++i)
> - {
> - do_test (0, i, i + 1, 0, SMALL_CHAR);
> - do_test (0, i, i + 1, 0, BIG_CHAR);
> - }
> + json_array_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_document_end (&json_ctx);
>
> return ret;
> }
> --
> 2.25.1
>
--
H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2
2022-04-22 1:52 ` [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
@ 2022-04-22 19:03 ` H.J. Lu
2022-05-12 20:14 ` Sunil Pandey
0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-22 19:03 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code unrolls the main loop slightly without adding too much
> overhead and minimizes the comparisons for the search CHAR.
>
> Geometric Mean of all benchmarks New / Old: 0.832
> See email for all results.
>
> Full xcheck passes on x86_64 with and without multiarch enabled.
> ---
> sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
> 1 file changed, 269 insertions(+), 157 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
> index 1df2adfad0..bd26ba80d5 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
> @@ -27,9 +27,13 @@
> # ifdef USE_AS_WCSRCHR
> # define VPBROADCAST vpbroadcastd
> # define VPCMPEQ vpcmpeqd
> +# define VPMIN vpminud
> +# define CHAR_SIZE 4
> # else
> # define VPBROADCAST vpbroadcastb
> # define VPCMPEQ vpcmpeqb
> +# define VPMIN vpminub
> +# define CHAR_SIZE 1
> # endif
>
> # ifndef VZEROUPPER
> @@ -41,196 +45,304 @@
> # endif
>
> # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>
> - .section SECTION(.text),"ax",@progbits
> -ENTRY (STRRCHR)
> - movd %esi, %xmm4
> - movl %edi, %ecx
> + .section SECTION(.text), "ax", @progbits
> +ENTRY(STRRCHR)
> + movd %esi, %xmm7
> + movl %edi, %eax
> /* Broadcast CHAR to YMM4. */
> - VPBROADCAST %xmm4, %ymm4
> + VPBROADCAST %xmm7, %ymm7
> vpxor %xmm0, %xmm0, %xmm0
>
> - /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> + /* Shift here instead of `andl` to save code size (saves a fetch
> + block). */
> + sall $20, %eax
> + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> + ja L(cross_page)
>
> +L(page_cross_continue):
> vmovdqu (%rdi), %ymm1
> - VPCMPEQ %ymm1, %ymm0, %ymm2
> - VPCMPEQ %ymm1, %ymm4, %ymm3
> - vpmovmskb %ymm2, %ecx
> - vpmovmskb %ymm3, %eax
> - addq $VEC_SIZE, %rdi
> + /* Check end of string match. */
> + VPCMPEQ %ymm1, %ymm0, %ymm6
> + vpmovmskb %ymm6, %ecx
> + testl %ecx, %ecx
> + jz L(aligned_more)
> +
> + /* Only check match with search CHAR if needed. */
> + VPCMPEQ %ymm1, %ymm7, %ymm1
> + vpmovmskb %ymm1, %eax
> + /* Check if match before first zero. */
> + blsmskl %ecx, %ecx
> + andl %ecx, %eax
> + jz L(ret0)
> + bsrl %eax, %eax
> + addq %rdi, %rax
> + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> + search CHAR is zero we are correct. Either way `andq
> + -CHAR_SIZE, %rax` gets the correct result. */
> +# ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +# endif
> +L(ret0):
> +L(return_vzeroupper):
> + ZERO_UPPER_VEC_REGISTERS_RETURN
> +
> + /* Returns for first vec x1/x2 have hard coded backward search
> + path for earlier matches. */
> + .p2align 4,, 10
> +L(first_vec_x1):
> + VPCMPEQ %ymm2, %ymm7, %ymm6
> + vpmovmskb %ymm6, %eax
> + blsmskl %ecx, %ecx
> + andl %ecx, %eax
> + jnz L(first_vec_x1_return)
> +
> + .p2align 4,, 4
> +L(first_vec_x0_test):
> + VPCMPEQ %ymm1, %ymm7, %ymm6
> + vpmovmskb %ymm6, %eax
> + testl %eax, %eax
> + jz L(ret1)
> + bsrl %eax, %eax
> + addq %r8, %rax
> +# ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +# endif
> +L(ret1):
> + VZEROUPPER_RETURN
>
> + .p2align 4,, 10
> +L(first_vec_x0_x1_test):
> + VPCMPEQ %ymm2, %ymm7, %ymm6
> + vpmovmskb %ymm6, %eax
> + /* Check ymm2 for search CHAR match. If no match then check ymm1
> + before returning. */
> testl %eax, %eax
> - jnz L(first_vec)
> + jz L(first_vec_x0_test)
> + .p2align 4,, 4
> +L(first_vec_x1_return):
> + bsrl %eax, %eax
> + leaq 1(%rdi, %rax), %rax
> +# ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +# endif
> + VZEROUPPER_RETURN
>
> - testl %ecx, %ecx
> - jnz L(return_null)
>
> - andq $-VEC_SIZE, %rdi
> - xorl %edx, %edx
> - jmp L(aligned_loop)
> + .p2align 4,, 10
> +L(first_vec_x2):
> + VPCMPEQ %ymm3, %ymm7, %ymm6
> + vpmovmskb %ymm6, %eax
> + blsmskl %ecx, %ecx
> + /* If no in-range search CHAR match in ymm3 then need to check
> + ymm1/ymm2 for an earlier match (we delay checking search
> + CHAR matches until needed). */
> + andl %ecx, %eax
> + jz L(first_vec_x0_x1_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
> +# ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +# endif
> + VZEROUPPER_RETURN
> +
>
> .p2align 4
> -L(first_vec):
> - /* Check if there is a nul CHAR. */
> +L(aligned_more):
> + /* Save original pointer if match was in VEC 0. */
> + movq %rdi, %r8
> +
> + /* Align src. */
> + orq $(VEC_SIZE - 1), %rdi
> + vmovdqu 1(%rdi), %ymm2
> + VPCMPEQ %ymm2, %ymm0, %ymm6
> + vpmovmskb %ymm6, %ecx
> testl %ecx, %ecx
> - jnz L(char_and_nul_in_first_vec)
> + jnz L(first_vec_x1)
>
> - /* Remember the match and keep searching. */
> - movl %eax, %edx
> - movq %rdi, %rsi
> - andq $-VEC_SIZE, %rdi
> - jmp L(aligned_loop)
> + vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
> + VPCMPEQ %ymm3, %ymm0, %ymm6
> + vpmovmskb %ymm6, %ecx
> + testl %ecx, %ecx
> + jnz L(first_vec_x2)
>
> + /* Save pointer again before realigning. */
> + movq %rdi, %rsi
> + addq $(VEC_SIZE + 1), %rdi
> + andq $-(VEC_SIZE * 2), %rdi
> .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> - vmovdqa (%rdi), %ymm1
> - VPCMPEQ %ymm1, %ymm0, %ymm2
> - VPCMPEQ %ymm1, %ymm4, %ymm3
> - vpmovmskb %ymm2, %edx
> - vpmovmskb %ymm3, %eax
> - shrl %cl, %edx
> - shrl %cl, %eax
> - addq $VEC_SIZE, %rdi
> -
> - /* Check if there is a CHAR. */
> +L(first_aligned_loop):
> + /* Do 2x VEC at a time. Any more and the cost of finding the
> + match outweighs loop benefit. */
> + vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
> + vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
> +
> + VPCMPEQ %ymm4, %ymm7, %ymm6
> + VPMIN %ymm4, %ymm5, %ymm8
> + VPCMPEQ %ymm5, %ymm7, %ymm10
> + vpor %ymm6, %ymm10, %ymm5
> + VPCMPEQ %ymm8, %ymm0, %ymm8
> + vpor %ymm5, %ymm8, %ymm9
> +
> + vpmovmskb %ymm9, %eax
> + addq $(VEC_SIZE * 2), %rdi
> + /* No zero or search CHAR. */
> testl %eax, %eax
> - jnz L(found_char)
> -
> - testl %edx, %edx
> - jnz L(return_null)
> + jz L(first_aligned_loop)
>
> - jmp L(aligned_loop)
> -
> - .p2align 4
> -L(found_char):
> - testl %edx, %edx
> - jnz L(char_and_nul)
> + /* If no zero CHAR then go to second loop (this allows us to
> + throw away all prior work). */
> + vpmovmskb %ymm8, %ecx
> + testl %ecx, %ecx
> + jz L(second_aligned_loop_prep)
>
> - /* Remember the match and keep searching. */
> - movl %eax, %edx
> - leaq (%rdi, %rcx), %rsi
> + /* Search char could be zero so we need to get the true match.
> + */
> + vpmovmskb %ymm5, %eax
> + testl %eax, %eax
> + jnz L(first_aligned_loop_return)
>
> - .p2align 4
> -L(aligned_loop):
> - vmovdqa (%rdi), %ymm1
> - VPCMPEQ %ymm1, %ymm0, %ymm2
> - addq $VEC_SIZE, %rdi
> - VPCMPEQ %ymm1, %ymm4, %ymm3
> - vpmovmskb %ymm2, %ecx
> - vpmovmskb %ymm3, %eax
> - orl %eax, %ecx
> - jnz L(char_nor_null)
> -
> - vmovdqa (%rdi), %ymm1
> - VPCMPEQ %ymm1, %ymm0, %ymm2
> - add $VEC_SIZE, %rdi
> - VPCMPEQ %ymm1, %ymm4, %ymm3
> - vpmovmskb %ymm2, %ecx
> + .p2align 4,, 4
> +L(first_vec_x1_or_x2):
> + VPCMPEQ %ymm3, %ymm7, %ymm3
> + VPCMPEQ %ymm2, %ymm7, %ymm2
> vpmovmskb %ymm3, %eax
> - orl %eax, %ecx
> - jnz L(char_nor_null)
> -
> - vmovdqa (%rdi), %ymm1
> - VPCMPEQ %ymm1, %ymm0, %ymm2
> - addq $VEC_SIZE, %rdi
> - VPCMPEQ %ymm1, %ymm4, %ymm3
> - vpmovmskb %ymm2, %ecx
> - vpmovmskb %ymm3, %eax
> - orl %eax, %ecx
> - jnz L(char_nor_null)
> -
> - vmovdqa (%rdi), %ymm1
> - VPCMPEQ %ymm1, %ymm0, %ymm2
> - addq $VEC_SIZE, %rdi
> - VPCMPEQ %ymm1, %ymm4, %ymm3
> - vpmovmskb %ymm2, %ecx
> - vpmovmskb %ymm3, %eax
> - orl %eax, %ecx
> - jz L(aligned_loop)
> -
> - .p2align 4
> -L(char_nor_null):
> - /* Find a CHAR or a nul CHAR in a loop. */
> - testl %eax, %eax
> - jnz L(match)
> -L(return_value):
> - testl %edx, %edx
> - jz L(return_null)
> - movl %edx, %eax
> - movq %rsi, %rdi
> + vpmovmskb %ymm2, %edx
> + /* Use add for macro-fusion. */
> + addq %rax, %rdx
> + jz L(first_vec_x0_test)
> + /* NB: We could move this shift to before the branch and save a
> + bit of code size / performance on the fall through. The
> + branch leads to the null case which generally seems hotter
> + than char in first 3x VEC. */
> + salq $32, %rax
> + addq %rdx, %rax
> + bsrq %rax, %rax
> + leaq 1(%rsi, %rax), %rax
> +# ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +# endif
> + VZEROUPPER_RETURN
>
> + .p2align 4,, 8
> +L(first_aligned_loop_return):
> + VPCMPEQ %ymm4, %ymm0, %ymm4
> + vpmovmskb %ymm4, %edx
> + salq $32, %rcx
> + orq %rdx, %rcx
> +
> + vpmovmskb %ymm10, %eax
> + vpmovmskb %ymm6, %edx
> + salq $32, %rax
> + orq %rdx, %rax
> + blsmskq %rcx, %rcx
> + andq %rcx, %rax
> + jz L(first_vec_x1_or_x2)
> +
> + bsrq %rax, %rax
> + leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
> # ifdef USE_AS_WCSRCHR
> - /* Keep the first bit for each matching CHAR for bsr. */
> - andl $0x11111111, %eax
> + andq $-CHAR_SIZE, %rax
> # endif
> - bsrl %eax, %eax
> - leaq -VEC_SIZE(%rdi, %rax), %rax
> -L(return_vzeroupper):
> - ZERO_UPPER_VEC_REGISTERS_RETURN
> + VZEROUPPER_RETURN
>
> + /* Search char cannot be zero. */
> .p2align 4
> -L(match):
> - /* Find a CHAR. Check if there is a nul CHAR. */
> - vpmovmskb %ymm2, %ecx
> - testl %ecx, %ecx
> - jnz L(find_nul)
> -
> - /* Remember the match and keep searching. */
> - movl %eax, %edx
> +L(second_aligned_loop_set_furthest_match):
> + /* Save VEC and pointer from most recent match. */
> +L(second_aligned_loop_prep):
> movq %rdi, %rsi
> - jmp L(aligned_loop)
> + vmovdqu %ymm6, %ymm2
> + vmovdqu %ymm10, %ymm3
>
> .p2align 4
> -L(find_nul):
> -# ifdef USE_AS_WCSRCHR
> - /* Keep the first bit for each matching CHAR for bsr. */
> - andl $0x11111111, %ecx
> - andl $0x11111111, %eax
> -# endif
> - /* Mask out any matching bits after the nul CHAR. */
> - movl %ecx, %r8d
> - subl $1, %r8d
> - xorl %ecx, %r8d
> - andl %r8d, %eax
> +L(second_aligned_loop):
> + /* Search 2x at a time. */
> + vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
> + vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
> +
> + VPCMPEQ %ymm4, %ymm7, %ymm6
> + VPMIN %ymm4, %ymm5, %ymm1
> + VPCMPEQ %ymm5, %ymm7, %ymm10
> + vpor %ymm6, %ymm10, %ymm5
> + VPCMPEQ %ymm1, %ymm0, %ymm1
> + vpor %ymm5, %ymm1, %ymm9
> +
> + vpmovmskb %ymm9, %eax
> + addq $(VEC_SIZE * 2), %rdi
> testl %eax, %eax
> - /* If there is no CHAR here, return the remembered one. */
> - jz L(return_value)
> - bsrl %eax, %eax
> - leaq -VEC_SIZE(%rdi, %rax), %rax
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(char_and_nul):
> - /* Find both a CHAR and a nul CHAR. */
> - addq %rcx, %rdi
> - movl %edx, %ecx
> -L(char_and_nul_in_first_vec):
> -# ifdef USE_AS_WCSRCHR
> - /* Keep the first bit for each matching CHAR for bsr. */
> - andl $0x11111111, %ecx
> - andl $0x11111111, %eax
> -# endif
> - /* Mask out any matching bits after the nul CHAR. */
> - movl %ecx, %r8d
> - subl $1, %r8d
> - xorl %ecx, %r8d
> - andl %r8d, %eax
> + jz L(second_aligned_loop)
> + vpmovmskb %ymm1, %ecx
> + testl %ecx, %ecx
> + jz L(second_aligned_loop_set_furthest_match)
> + vpmovmskb %ymm5, %eax
> testl %eax, %eax
> - /* Return null pointer if the nul CHAR comes first. */
> - jz L(return_null)
> - bsrl %eax, %eax
> - leaq -VEC_SIZE(%rdi, %rax), %rax
> + jnz L(return_new_match)
> +
> + /* This is the hot path. We know CHAR is inbounds and that
> + ymm3/ymm2 have latest match. */
> + .p2align 4,, 4
> +L(return_old_match):
> + vpmovmskb %ymm3, %eax
> + vpmovmskb %ymm2, %edx
> + salq $32, %rax
> + orq %rdx, %rax
> + bsrq %rax, %rax
> + /* Search char cannot be zero so safe to just use lea for
> + wcsrchr. */
> + leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
> VZEROUPPER_RETURN
>
> - .p2align 4
> -L(return_null):
> - xorl %eax, %eax
> + /* Last iteration also potentially has a match. */
> + .p2align 4,, 8
> +L(return_new_match):
> + VPCMPEQ %ymm4, %ymm0, %ymm4
> + vpmovmskb %ymm4, %edx
> + salq $32, %rcx
> + orq %rdx, %rcx
> +
> + vpmovmskb %ymm10, %eax
> + vpmovmskb %ymm6, %edx
> + salq $32, %rax
> + orq %rdx, %rax
> + blsmskq %rcx, %rcx
> + andq %rcx, %rax
> + jz L(return_old_match)
> + bsrq %rax, %rax
> + /* Search char cannot be zero so safe to just use lea for
> + wcsrchr. */
> + leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
> VZEROUPPER_RETURN
>
> -END (STRRCHR)
> + .p2align 4,, 4
> +L(cross_page):
> + movq %rdi, %rsi
> + andq $-VEC_SIZE, %rsi
> + vmovdqu (%rsi), %ymm1
> + VPCMPEQ %ymm1, %ymm0, %ymm6
> + vpmovmskb %ymm6, %ecx
> + /* Shift out zero CHAR matches that are before the beginning of
> + src (rdi). */
> + shrxl %edi, %ecx, %ecx
> + testl %ecx, %ecx
> + jz L(page_cross_continue)
> + VPCMPEQ %ymm1, %ymm7, %ymm1
> + vpmovmskb %ymm1, %eax
> +
> + /* Shift out search CHAR matches that are before the beginning of
> + src (rdi). */
> + shrxl %edi, %eax, %eax
> + blsmskl %ecx, %ecx
> + /* Check if any search CHAR match in range. */
> + andl %ecx, %eax
> + jz L(ret2)
> + bsrl %eax, %eax
> + addq %rdi, %rax
> +# ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +# endif
> +L(ret2):
> + VZEROUPPER_RETURN
> +END(STRRCHR)
> #endif
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v3 4/4] x86: Optimize {str|wcs}rchr-evex
2022-04-22 1:52 ` [PATCH v3 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
@ 2022-04-22 19:04 ` H.J. Lu
2022-05-12 20:16 ` Sunil Pandey
0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-22 19:04 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code unrolls the main loop slightly without adding too much
> overhead and minimizes the comparisons for the search CHAR.
>
> Geometric Mean of all benchmarks New / Old: 0.755
> See email for all results.
>
> Full xcheck passes on x86_64 with and without multiarch enabled.
> ---
> sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
> 1 file changed, 290 insertions(+), 181 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
> index adeddaed32..8014c285b3 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
> @@ -24,242 +24,351 @@
> # define STRRCHR __strrchr_evex
> # endif
>
> -# define VMOVU vmovdqu64
> -# define VMOVA vmovdqa64
> +# define VMOVU vmovdqu64
> +# define VMOVA vmovdqa64
>
> # ifdef USE_AS_WCSRCHR
> +# define SHIFT_REG esi
> +
> +# define kunpck kunpckbw
> +# define kmov_2x kmovd
> +# define maskz_2x ecx
> +# define maskm_2x eax
> +# define CHAR_SIZE 4
> +# define VPMIN vpminud
> +# define VPTESTN vptestnmd
> # define VPBROADCAST vpbroadcastd
> -# define VPCMP vpcmpd
> -# define SHIFT_REG r8d
> +# define VPCMP vpcmpd
> # else
> +# define SHIFT_REG edi
> +
> +# define kunpck kunpckdq
> +# define kmov_2x kmovq
> +# define maskz_2x rcx
> +# define maskm_2x rax
> +
> +# define CHAR_SIZE 1
> +# define VPMIN vpminub
> +# define VPTESTN vptestnmb
> # define VPBROADCAST vpbroadcastb
> -# define VPCMP vpcmpb
> -# define SHIFT_REG ecx
> +# define VPCMP vpcmpb
> # endif
>
> # define XMMZERO xmm16
> # define YMMZERO ymm16
> # define YMMMATCH ymm17
> -# define YMM1 ymm18
> +# define YMMSAVE ymm18
> +
> +# define YMM1 ymm19
> +# define YMM2 ymm20
> +# define YMM3 ymm21
> +# define YMM4 ymm22
> +# define YMM5 ymm23
> +# define YMM6 ymm24
> +# define YMM7 ymm25
> +# define YMM8 ymm26
>
> -# define VEC_SIZE 32
>
> - .section .text.evex,"ax",@progbits
> -ENTRY (STRRCHR)
> - movl %edi, %ecx
> +# define VEC_SIZE 32
> +# define PAGE_SIZE 4096
> + .section .text.evex, "ax", @progbits
> +ENTRY(STRRCHR)
> + movl %edi, %eax
> /* Broadcast CHAR to YMMMATCH. */
> VPBROADCAST %esi, %YMMMATCH
>
> - vpxorq %XMMZERO, %XMMZERO, %XMMZERO
> -
> - /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> + andl $(PAGE_SIZE - 1), %eax
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + jg L(cross_page_boundary)
>
> +L(page_cross_continue):
> VMOVU (%rdi), %YMM1
> -
> - /* Each bit in K0 represents a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM1, %k0
> - /* Each bit in K1 represents a CHAR in YMM1. */
> - VPCMP $0, %YMMMATCH, %YMM1, %k1
> + /* k0 has a 1 for each zero CHAR in YMM1. */
> + VPTESTN %YMM1, %YMM1, %k0
> kmovd %k0, %ecx
> - kmovd %k1, %eax
> -
> - addq $VEC_SIZE, %rdi
> -
> - testl %eax, %eax
> - jnz L(first_vec)
> -
> testl %ecx, %ecx
> - jnz L(return_null)
> -
> - andq $-VEC_SIZE, %rdi
> - xorl %edx, %edx
> - jmp L(aligned_loop)
> -
> - .p2align 4
> -L(first_vec):
> - /* Check if there is a null byte. */
> - testl %ecx, %ecx
> - jnz L(char_and_nul_in_first_vec)
> -
> - /* Remember the match and keep searching. */
> - movl %eax, %edx
> - movq %rdi, %rsi
> - andq $-VEC_SIZE, %rdi
> - jmp L(aligned_loop)
> -
> - .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> + jz L(aligned_more)
> + /* fallthrough: zero CHAR in first VEC. */
>
> + /* K1 has a 1 for each search CHAR match in YMM1. */
> + VPCMP $0, %YMMMATCH, %YMM1, %k1
> + kmovd %k1, %eax
> + /* Build mask up until first zero CHAR (used to mask of
> + potential search CHAR matches past the end of the string).
> + */
> + blsmskl %ecx, %ecx
> + andl %ecx, %eax
> + jz L(ret0)
> + /* Get last match (the `andl` removed any out of bounds
> + matches). */
> + bsrl %eax, %eax
> # ifdef USE_AS_WCSRCHR
> - /* NB: Divide shift count by 4 since each bit in K1 represent 4
> - bytes. */
> - movl %ecx, %SHIFT_REG
> - sarl $2, %SHIFT_REG
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> + addq %rdi, %rax
> # endif
> +L(ret0):
> + ret
>
> - VMOVA (%rdi), %YMM1
> -
> - /* Each bit in K0 represents a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM1, %k0
> - /* Each bit in K1 represents a CHAR in YMM1. */
> + /* Returns for first vec x1/x2/x3 have hard coded backward
> + search path for earlier matches. */
> + .p2align 4,, 6
> +L(first_vec_x1):
> + VPCMP $0, %YMMMATCH, %YMM2, %k1
> + kmovd %k1, %eax
> + blsmskl %ecx, %ecx
> + /* eax non-zero if search CHAR in range. */
> + andl %ecx, %eax
> + jnz L(first_vec_x1_return)
> +
> + /* fallthrough: no match in YMM2 then need to check for earlier
> + matches (in YMM1). */
> + .p2align 4,, 4
> +L(first_vec_x0_test):
> VPCMP $0, %YMMMATCH, %YMM1, %k1
> - kmovd %k0, %edx
> kmovd %k1, %eax
> -
> - shrxl %SHIFT_REG, %edx, %edx
> - shrxl %SHIFT_REG, %eax, %eax
> - addq $VEC_SIZE, %rdi
> -
> - /* Check if there is a CHAR. */
> testl %eax, %eax
> - jnz L(found_char)
> -
> - testl %edx, %edx
> - jnz L(return_null)
> -
> - jmp L(aligned_loop)
> -
> - .p2align 4
> -L(found_char):
> - testl %edx, %edx
> - jnz L(char_and_nul)
> -
> - /* Remember the match and keep searching. */
> - movl %eax, %edx
> - leaq (%rdi, %rcx), %rsi
> + jz L(ret1)
> + bsrl %eax, %eax
> +# ifdef USE_AS_WCSRCHR
> + leaq (%rsi, %rax, CHAR_SIZE), %rax
> +# else
> + addq %rsi, %rax
> +# endif
> +L(ret1):
> + ret
>
> - .p2align 4
> -L(aligned_loop):
> - VMOVA (%rdi), %YMM1
> - addq $VEC_SIZE, %rdi
> + .p2align 4,, 10
> +L(first_vec_x1_or_x2):
> + VPCMP $0, %YMM3, %YMMMATCH, %k3
> + VPCMP $0, %YMM2, %YMMMATCH, %k2
> + /* K2 and K3 have 1 for any search CHAR match. Test if any
> + matches between either of them. Otherwise check YMM1. */
> + kortestd %k2, %k3
> + jz L(first_vec_x0_test)
> +
> + /* Guaranteed that YMM2 and YMM3 are within range so merge the
> + two bitmasks then get last result. */
> + kunpck %k2, %k3, %k3
> + kmovq %k3, %rax
> + bsrq %rax, %rax
> + leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
> + ret
>
> - /* Each bit in K0 represents a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM1, %k0
> - /* Each bit in K1 represents a CHAR in YMM1. */
> - VPCMP $0, %YMMMATCH, %YMM1, %k1
> - kmovd %k0, %ecx
> + .p2align 4,, 6
> +L(first_vec_x3):
> + VPCMP $0, %YMMMATCH, %YMM4, %k1
> kmovd %k1, %eax
> - orl %eax, %ecx
> - jnz L(char_nor_null)
> + blsmskl %ecx, %ecx
> + /* If no search CHAR match in range check YMM1/YMM2/YMM3. */
> + andl %ecx, %eax
> + jz L(first_vec_x1_or_x2)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> - VMOVA (%rdi), %YMM1
> - add $VEC_SIZE, %rdi
> + .p2align 4,, 6
> +L(first_vec_x0_x1_test):
> + VPCMP $0, %YMMMATCH, %YMM2, %k1
> + kmovd %k1, %eax
> + /* Check YMM2 for last match first. If no match try YMM1. */
> + testl %eax, %eax
> + jz L(first_vec_x0_test)
> + .p2align 4,, 4
> +L(first_vec_x1_return):
> + bsrl %eax, %eax
> + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> - /* Each bit in K0 represents a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM1, %k0
> - /* Each bit in K1 represents a CHAR in YMM1. */
> - VPCMP $0, %YMMMATCH, %YMM1, %k1
> - kmovd %k0, %ecx
> + .p2align 4,, 10
> +L(first_vec_x2):
> + VPCMP $0, %YMMMATCH, %YMM3, %k1
> kmovd %k1, %eax
> - orl %eax, %ecx
> - jnz L(char_nor_null)
> + blsmskl %ecx, %ecx
> + /* Check YMM3 for last match first. If no match try YMM2/YMM1.
> + */
> + andl %ecx, %eax
> + jz L(first_vec_x0_x1_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> - VMOVA (%rdi), %YMM1
> - addq $VEC_SIZE, %rdi
>
> - /* Each bit in K0 represents a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM1, %k0
> - /* Each bit in K1 represents a CHAR in YMM1. */
> - VPCMP $0, %YMMMATCH, %YMM1, %k1
> + .p2align 4
> +L(aligned_more):
> + /* Need to keep original pointer in case YMM1 has last match. */
> + movq %rdi, %rsi
> + andq $-VEC_SIZE, %rdi
> + VMOVU VEC_SIZE(%rdi), %YMM2
> + VPTESTN %YMM2, %YMM2, %k0
> kmovd %k0, %ecx
> - kmovd %k1, %eax
> - orl %eax, %ecx
> - jnz L(char_nor_null)
> + testl %ecx, %ecx
> + jnz L(first_vec_x1)
>
> - VMOVA (%rdi), %YMM1
> - addq $VEC_SIZE, %rdi
> + VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
> + VPTESTN %YMM3, %YMM3, %k0
> + kmovd %k0, %ecx
> + testl %ecx, %ecx
> + jnz L(first_vec_x2)
>
> - /* Each bit in K0 represents a null byte in YMM1. */
> - VPCMP $0, %YMMZERO, %YMM1, %k0
> - /* Each bit in K1 represents a CHAR in YMM1. */
> - VPCMP $0, %YMMMATCH, %YMM1, %k1
> + VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
> + VPTESTN %YMM4, %YMM4, %k0
> kmovd %k0, %ecx
> - kmovd %k1, %eax
> - orl %eax, %ecx
> - jz L(aligned_loop)
> + movq %rdi, %r8
> + testl %ecx, %ecx
> + jnz L(first_vec_x3)
>
> + andq $-(VEC_SIZE * 2), %rdi
> .p2align 4
> -L(char_nor_null):
> - /* Find a CHAR or a null byte in a loop. */
> +L(first_aligned_loop):
> + /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
> + they don't store a match. */
> + VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
> + VMOVA (VEC_SIZE * 5)(%rdi), %YMM6
> +
> + VPCMP $0, %YMM5, %YMMMATCH, %k2
> + vpxord %YMM6, %YMMMATCH, %YMM7
> +
> + VPMIN %YMM5, %YMM6, %YMM8
> + VPMIN %YMM8, %YMM7, %YMM7
> +
> + VPTESTN %YMM7, %YMM7, %k1
> + subq $(VEC_SIZE * -2), %rdi
> + kortestd %k1, %k2
> + jz L(first_aligned_loop)
> +
> + VPCMP $0, %YMM6, %YMMMATCH, %k3
> + VPTESTN %YMM8, %YMM8, %k1
> + ktestd %k1, %k1
> + jz L(second_aligned_loop_prep)
> +
> + kortestd %k2, %k3
> + jnz L(return_first_aligned_loop)
> +
> + .p2align 4,, 6
> +L(first_vec_x1_or_x2_or_x3):
> + VPCMP $0, %YMM4, %YMMMATCH, %k4
> + kmovd %k4, %eax
> testl %eax, %eax
> - jnz L(match)
> -L(return_value):
> - testl %edx, %edx
> - jz L(return_null)
> - movl %edx, %eax
> - movq %rsi, %rdi
> + jz L(first_vec_x1_or_x2)
> bsrl %eax, %eax
> -# ifdef USE_AS_WCSRCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq -VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> - leaq -VEC_SIZE(%rdi, %rax), %rax
> -# endif
> + leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
> ret
>
> - .p2align 4
> -L(match):
> - /* Find a CHAR. Check if there is a null byte. */
> - kmovd %k0, %ecx
> - testl %ecx, %ecx
> - jnz L(find_nul)
> + .p2align 4,, 8
> +L(return_first_aligned_loop):
> + VPTESTN %YMM5, %YMM5, %k0
> + kunpck %k0, %k1, %k0
> + kmov_2x %k0, %maskz_2x
> +
> + blsmsk %maskz_2x, %maskz_2x
> + kunpck %k2, %k3, %k3
> + kmov_2x %k3, %maskm_2x
> + and %maskz_2x, %maskm_2x
> + jz L(first_vec_x1_or_x2_or_x3)
>
> - /* Remember the match and keep searching. */
> - movl %eax, %edx
> + bsr %maskm_2x, %maskm_2x
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
> +
> + .p2align 4
> + /* We can throw away the work done for the first 4x checks here
> + as we have a later match. This is the 'fast' path per se.
> + */
> +L(second_aligned_loop_prep):
> +L(second_aligned_loop_set_furthest_match):
> movq %rdi, %rsi
> - jmp L(aligned_loop)
> + kunpck %k2, %k3, %k4
>
> .p2align 4
> -L(find_nul):
> - /* Mask out any matching bits after the null byte. */
> - movl %ecx, %r8d
> - subl $1, %r8d
> - xorl %ecx, %r8d
> - andl %r8d, %eax
> - testl %eax, %eax
> - /* If there is no CHAR here, return the remembered one. */
> - jz L(return_value)
> - bsrl %eax, %eax
> +L(second_aligned_loop):
> + VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
> + VMOVU (VEC_SIZE * 5)(%rdi), %YMM2
> +
> + VPCMP $0, %YMM1, %YMMMATCH, %k2
> + vpxord %YMM2, %YMMMATCH, %YMM3
> +
> + VPMIN %YMM1, %YMM2, %YMM4
> + VPMIN %YMM3, %YMM4, %YMM3
> +
> + VPTESTN %YMM3, %YMM3, %k1
> + subq $(VEC_SIZE * -2), %rdi
> + kortestd %k1, %k2
> + jz L(second_aligned_loop)
> +
> + VPCMP $0, %YMM2, %YMMMATCH, %k3
> + VPTESTN %YMM4, %YMM4, %k1
> + ktestd %k1, %k1
> + jz L(second_aligned_loop_set_furthest_match)
> +
> + kortestd %k2, %k3
> + /* branch here because there is a significant advantage in terms
> + of output dependency chains in using edx. */
> + jnz L(return_new_match)
> +L(return_old_match):
> + kmovq %k4, %rax
> + bsrq %rax, %rax
> + leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
> + ret
> +
> +L(return_new_match):
> + VPTESTN %YMM1, %YMM1, %k0
> + kunpck %k0, %k1, %k0
> + kmov_2x %k0, %maskz_2x
> +
> + blsmsk %maskz_2x, %maskz_2x
> + kunpck %k2, %k3, %k3
> + kmov_2x %k3, %maskm_2x
> + and %maskz_2x, %maskm_2x
> + jz L(return_old_match)
> +
> + bsr %maskm_2x, %maskm_2x
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
> +
> +L(cross_page_boundary):
> + /* eax contains all the page offset bits of src (rdi). `xor rdi,
> + rax` sets pointer with all page offset bits cleared so
> + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> + before page cross (guaranteed to be safe to read). Doing this
> + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> + a bit of code size. */
> + xorq %rdi, %rax
> + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
> + VPTESTN %YMM1, %YMM1, %k0
> + kmovd %k0, %ecx
> +
> + /* Shift out zero CHAR matches that are before the beginning of
> + src (rdi). */
> # ifdef USE_AS_WCSRCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq -VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> - leaq -VEC_SIZE(%rdi, %rax), %rax
> + movl %edi, %esi
> + andl $(VEC_SIZE - 1), %esi
> + shrl $2, %esi
> # endif
> - ret
> + shrxl %SHIFT_REG, %ecx, %ecx
>
> - .p2align 4
> -L(char_and_nul):
> - /* Find both a CHAR and a null byte. */
> - addq %rcx, %rdi
> - movl %edx, %ecx
> -L(char_and_nul_in_first_vec):
> - /* Mask out any matching bits after the null byte. */
> - movl %ecx, %r8d
> - subl $1, %r8d
> - xorl %ecx, %r8d
> - andl %r8d, %eax
> - testl %eax, %eax
> - /* Return null pointer if the null byte comes first. */
> - jz L(return_null)
> + testl %ecx, %ecx
> + jz L(page_cross_continue)
> +
> + /* Found zero CHAR so need to test for search CHAR. */
> + VPCMP $0, %YMMMATCH, %YMM1, %k1
> + kmovd %k1, %eax
> + /* Shift out search CHAR matches that are before the beginning of
> + src (rdi). */
> + shrxl %SHIFT_REG, %eax, %eax
> +
> + /* Check if any search CHAR match in range. */
> + blsmskl %ecx, %ecx
> + andl %ecx, %eax
> + jz L(ret3)
> bsrl %eax, %eax
> # ifdef USE_AS_WCSRCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq -VEC_SIZE(%rdi, %rax, 4), %rax
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> # else
> - leaq -VEC_SIZE(%rdi, %rax), %rax
> + addq %rdi, %rax
> # endif
> +L(ret3):
> ret
>
> - .p2align 4
> -L(return_null):
> - xorl %eax, %eax
> - ret
> -
> -END (STRRCHR)
> +END(STRRCHR)
> #endif
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2
2022-04-22 1:52 ` [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
@ 2022-04-22 19:06 ` H.J. Lu
2022-05-12 20:13 ` Sunil Pandey
0 siblings, 1 reply; 36+ messages in thread
From: H.J. Lu @ 2022-04-22 19:06 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code unrolls the main loop slightly without adding too much
> overhead and minimizes the comparisons for the search CHAR.
>
> Geometric Mean of all benchmarks New / Old: 0.741
> See email for all results.
>
> Full xcheck passes on x86_64 with and without multiarch enabled.
> ---
> sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
> sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
> sysdeps/x86_64/strrchr.S | 510 +++++++++++++++---------
> sysdeps/x86_64/wcsrchr.S | 268 +------------
> 4 files changed, 339 insertions(+), 444 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> index db1b44c23c..866396e947 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> @@ -17,7 +17,7 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define strrchr __strrchr_sse2
> +# define STRRCHR __strrchr_sse2
>
> # undef weak_alias
> # define weak_alias(strrchr, rindex)
> diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> index 78d1ca6553..69d2f3cdb1 100644
> --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> @@ -17,7 +17,6 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define wcsrchr __wcsrchr_sse2
> +# define STRRCHR __wcsrchr_sse2
> #endif
> -
> #include "../wcsrchr.S"
> diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> index 50d886713e..4d7ba4ceb2 100644
> --- a/sysdeps/x86_64/strrchr.S
> +++ b/sysdeps/x86_64/strrchr.S
> @@ -19,210 +19,360 @@
>
> #include <sysdep.h>
>
> +#ifndef STRRCHR
> +# define STRRCHR strrchr
> +#endif
> +
> +#ifdef USE_AS_WCSRCHR
> +# define PCMPEQ pcmpeqd
> +# define CHAR_SIZE 4
> +# define PMINU pminud
> +#else
> +# define PCMPEQ pcmpeqb
> +# define CHAR_SIZE 1
> +# define PMINU pminub
> +#endif
> +
> +#define PAGE_SIZE 4096
> +#define VEC_SIZE 16
> +
> .text
> -ENTRY (strrchr)
> - movd %esi, %xmm1
> +ENTRY(STRRCHR)
> + movd %esi, %xmm0
> movq %rdi, %rax
> - andl $4095, %eax
> - punpcklbw %xmm1, %xmm1
> - cmpq $4032, %rax
> - punpcklwd %xmm1, %xmm1
> - pshufd $0, %xmm1, %xmm1
> + andl $(PAGE_SIZE - 1), %eax
> +#ifndef USE_AS_WCSRCHR
> + punpcklbw %xmm0, %xmm0
> + punpcklwd %xmm0, %xmm0
> +#endif
> + pshufd $0, %xmm0, %xmm0
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> ja L(cross_page)
> - movdqu (%rdi), %xmm0
> +
> +L(cross_page_continue):
> + movups (%rdi), %xmm1
> pxor %xmm2, %xmm2
> - movdqa %xmm0, %xmm3
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm3
> - pmovmskb %xmm0, %ecx
> - pmovmskb %xmm3, %edx
> - testq %rdx, %rdx
> - je L(next_48_bytes)
> - leaq -1(%rdx), %rax
> - xorq %rdx, %rax
> - andq %rcx, %rax
> - je L(exit)
> - bsrq %rax, %rax
> + PCMPEQ %xmm1, %xmm2
> + pmovmskb %xmm2, %ecx
> + testl %ecx, %ecx
> + jz L(aligned_more)
> +
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(ret0)
> + bsrl %eax, %eax
> addq %rdi, %rax
> + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> + search CHAR is zero we are correct. Either way `andq
> + -CHAR_SIZE, %rax` gets the correct result. */
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> +L(ret0):
> ret
>
> + /* Returns for first vec x1/x2 have hard coded backward search
> + path for earlier matches. */
> .p2align 4
> -L(next_48_bytes):
> - movdqu 16(%rdi), %xmm4
> - movdqa %xmm4, %xmm5
> - movdqu 32(%rdi), %xmm3
> - pcmpeqb %xmm1, %xmm4
> - pcmpeqb %xmm2, %xmm5
> - movdqu 48(%rdi), %xmm0
> - pmovmskb %xmm5, %edx
> - movdqa %xmm3, %xmm5
> - pcmpeqb %xmm1, %xmm3
> - pcmpeqb %xmm2, %xmm5
> - pcmpeqb %xmm0, %xmm2
> - salq $16, %rdx
> - pmovmskb %xmm3, %r8d
> - pmovmskb %xmm5, %eax
> - pmovmskb %xmm2, %esi
> - salq $32, %r8
> - salq $32, %rax
> - pcmpeqb %xmm1, %xmm0
> - orq %rdx, %rax
> - movq %rsi, %rdx
> - pmovmskb %xmm4, %esi
> - salq $48, %rdx
> - salq $16, %rsi
> - orq %r8, %rsi
> - orq %rcx, %rsi
> - pmovmskb %xmm0, %ecx
> - salq $48, %rcx
> - orq %rcx, %rsi
> - orq %rdx, %rax
> - je L(loop_header2)
> - leaq -1(%rax), %rcx
> - xorq %rax, %rcx
> - andq %rcx, %rsi
> - je L(exit)
> - bsrq %rsi, %rsi
> - leaq (%rdi,%rsi), %rax
> +L(first_vec_x0_test):
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + testl %eax, %eax
> + jz L(ret0)
> + bsrl %eax, %eax
> + addq %r8, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> ret
>
> .p2align 4
> -L(loop_header2):
> - testq %rsi, %rsi
> - movq %rdi, %rcx
> - je L(no_c_found)
> -L(loop_header):
> - addq $64, %rdi
> - pxor %xmm7, %xmm7
> - andq $-64, %rdi
> - jmp L(loop_entry)
> +L(first_vec_x1):
> + PCMPEQ %xmm0, %xmm2
> + pmovmskb %xmm2, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_vec_x0_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
>
> .p2align 4
> -L(loop64):
> - testq %rdx, %rdx
> - cmovne %rdx, %rsi
> - cmovne %rdi, %rcx
> - addq $64, %rdi
> -L(loop_entry):
> - movdqa 32(%rdi), %xmm3
> - pxor %xmm6, %xmm6
> - movdqa 48(%rdi), %xmm2
> - movdqa %xmm3, %xmm0
> - movdqa 16(%rdi), %xmm4
> - pminub %xmm2, %xmm0
> - movdqa (%rdi), %xmm5
> - pminub %xmm4, %xmm0
> - pminub %xmm5, %xmm0
> - pcmpeqb %xmm7, %xmm0
> - pmovmskb %xmm0, %eax
> - movdqa %xmm5, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %r9d
> - movdqa %xmm4, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %edx
> - movdqa %xmm3, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - salq $16, %rdx
> - pmovmskb %xmm0, %r10d
> - movdqa %xmm2, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - salq $32, %r10
> - orq %r10, %rdx
> - pmovmskb %xmm0, %r8d
> - orq %r9, %rdx
> - salq $48, %r8
> - orq %r8, %rdx
> +L(first_vec_x1_test):
> + PCMPEQ %xmm0, %xmm2
> + pmovmskb %xmm2, %eax
> testl %eax, %eax
> - je L(loop64)
> - pcmpeqb %xmm6, %xmm4
> - pcmpeqb %xmm6, %xmm3
> - pcmpeqb %xmm6, %xmm5
> - pmovmskb %xmm4, %eax
> - pmovmskb %xmm3, %r10d
> - pcmpeqb %xmm6, %xmm2
> - pmovmskb %xmm5, %r9d
> - salq $32, %r10
> - salq $16, %rax
> - pmovmskb %xmm2, %r8d
> - orq %r10, %rax
> - orq %r9, %rax
> - salq $48, %r8
> - orq %r8, %rax
> - leaq -1(%rax), %r8
> - xorq %rax, %r8
> - andq %r8, %rdx
> - cmovne %rdi, %rcx
> - cmovne %rdx, %rsi
> - bsrq %rsi, %rsi
> - leaq (%rcx,%rsi), %rax
> + jz L(first_vec_x0_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(first_vec_x2):
> + PCMPEQ %xmm0, %xmm3
> + pmovmskb %xmm3, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_vec_x1_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(aligned_more):
> + /* Save original pointer if match was in VEC 0. */
> + movq %rdi, %r8
> + andq $-VEC_SIZE, %rdi
> +
> + movaps VEC_SIZE(%rdi), %xmm2
> + pxor %xmm3, %xmm3
> + PCMPEQ %xmm2, %xmm3
> + pmovmskb %xmm3, %ecx
> + testl %ecx, %ecx
> + jnz L(first_vec_x1)
> +
> + movaps (VEC_SIZE * 2)(%rdi), %xmm3
> + pxor %xmm4, %xmm4
> + PCMPEQ %xmm3, %xmm4
> + pmovmskb %xmm4, %ecx
> + testl %ecx, %ecx
> + jnz L(first_vec_x2)
> +
> + addq $VEC_SIZE, %rdi
> + /* Save pointer again before realigning. */
> + movq %rdi, %rsi
> + andq $-(VEC_SIZE * 2), %rdi
> + .p2align 4
> +L(first_loop):
> + /* Do 2x VEC at a time. */
> + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> + /* Since SSE2 no pminud so wcsrchr needs seperate logic for
> + detecting zero. Note if this is found to be a bottleneck it
> + may be worth adding an SSE4.1 wcsrchr implementation. */
> +#ifdef USE_AS_WCSRCHR
> + movaps %xmm5, %xmm6
> + pxor %xmm8, %xmm8
> +
> + PCMPEQ %xmm8, %xmm5
> + PCMPEQ %xmm4, %xmm8
> + por %xmm5, %xmm8
> +#else
> + movaps %xmm5, %xmm6
> + PMINU %xmm4, %xmm5
> +#endif
> +
> + movaps %xmm4, %xmm9
> + PCMPEQ %xmm0, %xmm4
> + PCMPEQ %xmm0, %xmm6
> + movaps %xmm6, %xmm7
> + por %xmm4, %xmm6
> +#ifndef USE_AS_WCSRCHR
> + pxor %xmm8, %xmm8
> + PCMPEQ %xmm5, %xmm8
> +#endif
> + pmovmskb %xmm8, %ecx
> + pmovmskb %xmm6, %eax
> +
> + addq $(VEC_SIZE * 2), %rdi
> + /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> + macro-fuse with `jz`. */
> + addl %ecx, %eax
> + jz L(first_loop)
> +
> + /* Check if there is zero match. */
> + testl %ecx, %ecx
> + jz L(second_loop_match)
> +
> + /* Check if there was a match in last iteration. */
> + subl %ecx, %eax
> + jnz L(new_match)
> +
> +L(first_loop_old_match):
> + PCMPEQ %xmm0, %xmm2
> + PCMPEQ %xmm0, %xmm3
> + pmovmskb %xmm2, %ecx
> + pmovmskb %xmm3, %eax
> + addl %eax, %ecx
> + jz L(first_vec_x0_test)
> + /* NB: We could move this shift to before the branch and save a
> + bit of code size / performance on the fall through. The
> + branch leads to the null case which generally seems hotter
> + than char in first 3x VEC. */
> + sall $16, %eax
> + orl %ecx, %eax
> +
> + bsrl %eax, %eax
> + addq %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(new_match):
> + pxor %xmm6, %xmm6
> + PCMPEQ %xmm9, %xmm6
> + pmovmskb %xmm6, %eax
> + sall $16, %ecx
> + orl %eax, %ecx
> +
> + /* We can't reuse either of the old comparisons as since we mask
> + of zeros after first zero (instead of using the full
> + comparison) we can't gurantee no interference between match
> + after end of string and valid match. */
> + pmovmskb %xmm4, %eax
> + pmovmskb %xmm7, %edx
> + sall $16, %edx
> + orl %edx, %eax
> +
> + leal -1(%ecx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_loop_old_match)
> + bsrl %eax, %eax
> + addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> ret
>
> + /* Save minimum state for getting most recent match. We can
> + throw out all previous work. */
> .p2align 4
> -L(no_c_found):
> - movl $1, %esi
> - xorl %ecx, %ecx
> - jmp L(loop_header)
> +L(second_loop_match):
> + movq %rdi, %rsi
> + movaps %xmm4, %xmm2
> + movaps %xmm7, %xmm3
>
> .p2align 4
> -L(exit):
> - xorl %eax, %eax
> +L(second_loop):
> + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> + /* Since SSE2 no pminud so wcsrchr needs seperate logic for
> + detecting zero. Note if this is found to be a bottleneck it
> + may be worth adding an SSE4.1 wcsrchr implementation. */
> +#ifdef USE_AS_WCSRCHR
> + movaps %xmm5, %xmm6
> + pxor %xmm8, %xmm8
> +
> + PCMPEQ %xmm8, %xmm5
> + PCMPEQ %xmm4, %xmm8
> + por %xmm5, %xmm8
> +#else
> + movaps %xmm5, %xmm6
> + PMINU %xmm4, %xmm5
> +#endif
> +
> + movaps %xmm4, %xmm9
> + PCMPEQ %xmm0, %xmm4
> + PCMPEQ %xmm0, %xmm6
> + movaps %xmm6, %xmm7
> + por %xmm4, %xmm6
> +#ifndef USE_AS_WCSRCHR
> + pxor %xmm8, %xmm8
> + PCMPEQ %xmm5, %xmm8
> +#endif
> +
> + pmovmskb %xmm8, %ecx
> + pmovmskb %xmm6, %eax
> +
> + addq $(VEC_SIZE * 2), %rdi
> + /* Either null term or new occurence of CHAR. */
> + addl %ecx, %eax
> + jz L(second_loop)
> +
> + /* No null term so much be new occurence of CHAR. */
> + testl %ecx, %ecx
> + jz L(second_loop_match)
> +
> +
> + subl %ecx, %eax
> + jnz L(second_loop_new_match)
> +
> +L(second_loop_old_match):
> + pmovmskb %xmm2, %ecx
> + pmovmskb %xmm3, %eax
> + sall $16, %eax
> + orl %ecx, %eax
> + bsrl %eax, %eax
> + addq %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> ret
>
> .p2align 4
> +L(second_loop_new_match):
> + pxor %xmm6, %xmm6
> + PCMPEQ %xmm9, %xmm6
> + pmovmskb %xmm6, %eax
> + sall $16, %ecx
> + orl %eax, %ecx
> +
> + /* We can't reuse either of the old comparisons as since we mask
> + of zeros after first zero (instead of using the full
> + comparison) we can't gurantee no interference between match
> + after end of string and valid match. */
> + pmovmskb %xmm4, %eax
> + pmovmskb %xmm7, %edx
> + sall $16, %edx
> + orl %edx, %eax
> +
> + leal -1(%ecx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(second_loop_old_match)
> + bsrl %eax, %eax
> + addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4,, 4
> L(cross_page):
> - movq %rdi, %rax
> - pxor %xmm0, %xmm0
> - andq $-64, %rax
> - movdqu (%rax), %xmm5
> - movdqa %xmm5, %xmm6
> - movdqu 16(%rax), %xmm4
> - pcmpeqb %xmm1, %xmm5
> - pcmpeqb %xmm0, %xmm6
> - movdqu 32(%rax), %xmm3
> - pmovmskb %xmm6, %esi
> - movdqa %xmm4, %xmm6
> - movdqu 48(%rax), %xmm2
> - pcmpeqb %xmm1, %xmm4
> - pcmpeqb %xmm0, %xmm6
> - pmovmskb %xmm6, %edx
> - movdqa %xmm3, %xmm6
> - pcmpeqb %xmm1, %xmm3
> - pcmpeqb %xmm0, %xmm6
> - pcmpeqb %xmm2, %xmm0
> - salq $16, %rdx
> - pmovmskb %xmm3, %r9d
> - pmovmskb %xmm6, %r8d
> - pmovmskb %xmm0, %ecx
> - salq $32, %r9
> - salq $32, %r8
> - pcmpeqb %xmm1, %xmm2
> - orq %r8, %rdx
> - salq $48, %rcx
> - pmovmskb %xmm5, %r8d
> - orq %rsi, %rdx
> - pmovmskb %xmm4, %esi
> - orq %rcx, %rdx
> - pmovmskb %xmm2, %ecx
> - salq $16, %rsi
> - salq $48, %rcx
> - orq %r9, %rsi
> - orq %r8, %rsi
> - orq %rcx, %rsi
> + movq %rdi, %rsi
> + andq $-VEC_SIZE, %rsi
> + movaps (%rsi), %xmm1
> + pxor %xmm2, %xmm2
> + PCMPEQ %xmm1, %xmm2
> + pmovmskb %xmm2, %edx
> movl %edi, %ecx
> - subl %eax, %ecx
> - shrq %cl, %rdx
> - shrq %cl, %rsi
> - testq %rdx, %rdx
> - je L(loop_header2)
> - leaq -1(%rdx), %rax
> - xorq %rdx, %rax
> - andq %rax, %rsi
> - je L(exit)
> - bsrq %rsi, %rax
> + andl $(VEC_SIZE - 1), %ecx
> + sarl %cl, %edx
> + jz L(cross_page_continue)
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + sarl %cl, %eax
> + leal -1(%rdx), %ecx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(ret1)
> + bsrl %eax, %eax
> addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> +L(ret1):
> ret
> -END (strrchr)
> +END(STRRCHR)
>
> -weak_alias (strrchr, rindex)
> -libc_hidden_builtin_def (strrchr)
> +#ifndef USE_AS_WCSRCHR
> + weak_alias (STRRCHR, rindex)
> + libc_hidden_builtin_def (STRRCHR)
> +#endif
> diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> index 61552954de..2b80efc5ef 100644
> --- a/sysdeps/x86_64/wcsrchr.S
> +++ b/sysdeps/x86_64/wcsrchr.S
> @@ -1,4 +1,4 @@
> -/* wcsrchr with SSSE3
> +/* wcsrchr optimized with SSE2.
> Copyright (C) 2011-2022 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> @@ -16,266 +16,12 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
>
> - .text
> -ENTRY (wcsrchr)
> +#define USE_AS_WCSRCHR 1
> +#define NO_PMINU 1
>
> - movd %rsi, %xmm1
> - mov %rdi, %rcx
> - punpckldq %xmm1, %xmm1
> - pxor %xmm2, %xmm2
> - punpckldq %xmm1, %xmm1
> - and $63, %rcx
> - cmp $48, %rcx
> - ja L(crosscache)
> +#ifndef STRRCHR
> +# define STRRCHR wcsrchr
> +#endif
>
> - movdqu (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm0, %rax
> - add $16, %rdi
> -
> - test %rax, %rax
> - jnz L(unaligned_match1)
> -
> - test %rcx, %rcx
> - jnz L(return_null)
> -
> - and $-16, %rdi
> - xor %r8, %r8
> - jmp L(loop)
> -
> - .p2align 4
> -L(unaligned_match1):
> - test %rcx, %rcx
> - jnz L(prolog_find_zero_1)
> -
> - mov %rax, %r8
> - mov %rdi, %rsi
> - and $-16, %rdi
> - jmp L(loop)
> -
> - .p2align 4
> -L(crosscache):
> - and $15, %rcx
> - and $-16, %rdi
> - pxor %xmm3, %xmm3
> - movdqa (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm3
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm3, %rdx
> - pmovmskb %xmm0, %rax
> - shr %cl, %rdx
> - shr %cl, %rax
> - add $16, %rdi
> -
> - test %rax, %rax
> - jnz L(unaligned_match)
> -
> - test %rdx, %rdx
> - jnz L(return_null)
> -
> - xor %r8, %r8
> - jmp L(loop)
> -
> - .p2align 4
> -L(unaligned_match):
> - test %rdx, %rdx
> - jnz L(prolog_find_zero)
> -
> - mov %rax, %r8
> - lea (%rdi, %rcx), %rsi
> -
> -/* Loop start on aligned string. */
> - .p2align 4
> -L(loop):
> - movdqa (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm0, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm3
> - pcmpeqd %xmm3, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm3
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm3, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm4
> - pcmpeqd %xmm4, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm4
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm4, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm5
> - pcmpeqd %xmm5, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm5
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm5, %rax
> - or %rax, %rcx
> - jz L(loop)
> -
> - .p2align 4
> -L(matches):
> - test %rax, %rax
> - jnz L(match)
> -L(return_value):
> - test %r8, %r8
> - jz L(return_null)
> - mov %r8, %rax
> - mov %rsi, %rdi
> -
> - test $15 << 4, %ah
> - jnz L(match_fourth_wchar)
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match):
> - pmovmskb %xmm2, %rcx
> - test %rcx, %rcx
> - jnz L(find_zero)
> - mov %rax, %r8
> - mov %rdi, %rsi
> - jmp L(loop)
> -
> - .p2align 4
> -L(find_zero):
> - test $15, %cl
> - jnz L(find_zero_in_first_wchar)
> - test %cl, %cl
> - jnz L(find_zero_in_second_wchar)
> - test $15, %ch
> - jnz L(find_zero_in_third_wchar)
> -
> - and $1 << 13 - 1, %rax
> - jz L(return_value)
> -
> - test $15 << 4, %ah
> - jnz L(match_fourth_wchar)
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(find_zero_in_first_wchar):
> - test $1, %rax
> - jz L(return_value)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(find_zero_in_second_wchar):
> - and $1 << 5 - 1, %rax
> - jz L(return_value)
> -
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(find_zero_in_third_wchar):
> - and $1 << 9 - 1, %rax
> - jz L(return_value)
> -
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero):
> - add %rcx, %rdi
> - mov %rdx, %rcx
> -L(prolog_find_zero_1):
> - test $15, %cl
> - jnz L(prolog_find_zero_in_first_wchar)
> - test %cl, %cl
> - jnz L(prolog_find_zero_in_second_wchar)
> - test $15, %ch
> - jnz L(prolog_find_zero_in_third_wchar)
> -
> - and $1 << 13 - 1, %rax
> - jz L(return_null)
> -
> - test $15 << 4, %ah
> - jnz L(match_fourth_wchar)
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_in_first_wchar):
> - test $1, %rax
> - jz L(return_null)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_in_second_wchar):
> - and $1 << 5 - 1, %rax
> - jz L(return_null)
> -
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_in_third_wchar):
> - and $1 << 9 - 1, %rax
> - jz L(return_null)
> -
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_second_wchar):
> - lea -12(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_third_wchar):
> - lea -8(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_fourth_wchar):
> - lea -4(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(return_null):
> - xor %rax, %rax
> - ret
> -
> -END (wcsrchr)
> +#include "../strrchr.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v4 1/4] benchtests: Improve bench-strrchr
2022-04-21 3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
` (6 preceding siblings ...)
2022-04-22 1:52 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr Noah Goldstein
@ 2022-04-22 19:11 ` Noah Goldstein
2022-04-23 1:53 ` H.J. Lu
7 siblings, 1 reply; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22 19:11 UTC (permalink / raw)
To: libc-alpha
1. Use json-lib for printing results.
2. Expose all parameters (before, pos, seek_char, and max_char were
not printed).
3. Add benchmarks that test multiple occurrences of seek_char in the
string.
---
benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
1 file changed, 82 insertions(+), 44 deletions(-)
diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
index abdae60c51..7cd2a15484 100644
--- a/benchtests/bench-strrchr.c
+++ b/benchtests/bench-strrchr.c
@@ -23,6 +23,7 @@
# define TEST_NAME "strrchr"
#endif
#include "bench-string.h"
+#include "json-lib.h"
#define BIG_CHAR MAX_CHAR
@@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
}
static void
-do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
+ CHAR *exp_res)
{
CHAR *res = CALL (impl, s, c);
size_t i, iters = INNER_LOOP_ITERS8;
@@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
if (res != exp_res)
{
- error (0, 0, "Wrong result in function %s %p %p", impl->name,
- res, exp_res);
+ error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
+ exp_res);
ret = 1;
return;
}
@@ -73,23 +75,25 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
CALL (impl, s, c);
}
TIMING_NOW (stop);
-
TIMING_DIFF (cur, start, stop);
- TIMING_PRINT_MEAN ((double) cur, (double) iters);
+ json_element_double (json_ctx, (double) cur / (double) iters);
}
static void
-do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
+ int seek_char, int max_char, size_t freq)
/* For wcsrchr: align here means align not in bytes,
but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
{
size_t i;
+ size_t pos_chunk_sz = freq ? (pos / freq) : pos;
+ size_t last_pos = len;
CHAR *result;
CHAR *buf = (CHAR *) buf1;
- align &= 7;
+ align &= (getpagesize () - 1);
if ((align + len) * sizeof (CHAR) >= page_size)
return;
@@ -103,6 +107,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
if ((i > pos || pos >= len) && buf[align + i] == seek_char)
buf[align + i] = seek_char + 10 + (random () & 15);
}
+
+ if (pos_chunk_sz == 0 && pos)
+ pos_chunk_sz = 1;
+
+ for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
+ {
+ buf[align + i] = seek_char;
+ last_pos = i;
+ }
+
buf[align + len] = 0;
if (pos < len)
@@ -110,66 +124,90 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
buf[align + pos] = seek_char;
result = (CHAR *) (buf + align + pos);
}
+ else if (last_pos < len)
+ result = (CHAR *) (buf + align + last_pos);
else if (seek_char == 0)
result = (CHAR *) (buf + align + len);
else
result = NULL;
- printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
+ json_element_object_begin (json_ctx);
+ json_attr_uint (json_ctx, "len", len);
+ json_attr_uint (json_ctx, "pos", pos);
+ json_attr_uint (json_ctx, "align", align);
+ json_attr_uint (json_ctx, "freq", freq);
+ json_attr_uint (json_ctx, "seek", seek_char);
+ json_attr_uint (json_ctx, "max_char", max_char);
+ json_array_begin (json_ctx, "timings");
FOR_EACH_IMPL (impl, 0)
- do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
+ do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
- putchar ('\n');
+ json_array_end (json_ctx);
+ json_element_object_end (json_ctx);
}
int
test_main (void)
{
- size_t i;
+ json_ctx_t json_ctx;
+ size_t i, j;
+ int seek;
test_init ();
+ json_init (&json_ctx, 0, stdout);
- printf ("%20s", "");
- FOR_EACH_IMPL (impl, 0)
- printf ("\t%s", impl->name);
- putchar ('\n');
-
- for (i = 1; i < 8; ++i)
- {
- do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
- do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
- }
+ json_document_begin (&json_ctx);
+ json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
- for (i = 1; i < 8; ++i)
- {
- do_test (i, 64, 256, 23, SMALL_CHAR);
- do_test (i, 64, 256, 23, BIG_CHAR);
- }
+ json_attr_object_begin (&json_ctx, "functions");
+ json_attr_object_begin (&json_ctx, TEST_NAME);
+ json_attr_string (&json_ctx, "bench-variant", "");
- for (i = 0; i < 32; ++i)
- {
- do_test (0, i, i + 1, 23, SMALL_CHAR);
- do_test (0, i, i + 1, 23, BIG_CHAR);
- }
+ json_array_begin (&json_ctx, "ifuncs");
+ FOR_EACH_IMPL (impl, 0)
+ json_element_string (&json_ctx, impl->name);
+ json_array_end (&json_ctx);
- for (i = 1; i < 8; ++i)
- {
- do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
- do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
- }
+ json_array_begin (&json_ctx, "results");
- for (i = 1; i < 8; ++i)
+ for (seek = 0; seek <= 23; seek += 23)
{
- do_test (i, 64, 256, 0, SMALL_CHAR);
- do_test (i, 64, 256, 0, BIG_CHAR);
+ for (j = 1; j < 32; j += j)
+ {
+ for (i = 1; i < 9; ++i)
+ {
+ do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
+ }
+
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
+
+ do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
+ }
+
+ for (i = 0; i < 32; ++i)
+ {
+ do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
+ do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
+ do_test (&json_ctx, getpagesize () - i / 2 - 1, i, i + 1, seek,
+ SMALL_CHAR, j);
+ }
+ if (seek == 0)
+ {
+ break;
+ }
+ }
}
- for (i = 0; i < 32; ++i)
- {
- do_test (0, i, i + 1, 0, SMALL_CHAR);
- do_test (0, i, i + 1, 0, BIG_CHAR);
- }
+ json_array_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_document_end (&json_ctx);
return ret;
}
--
2.25.1
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v3 1/4] benchtests: Improve bench-strrchr
2022-04-22 18:29 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr H.J. Lu
@ 2022-04-22 19:12 ` Noah Goldstein
0 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-04-22 19:12 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Fri, Apr 22, 2022 at 1:30 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > 1. Use json-lib for printing results.
> > 2. Expose all parameters (before pos, seek_char, and max_char where
> > not printed).
> > 3. Add benchmarks that test multiple occurence of seek_char in the
> > string.
> > ---
> > benchtests/bench-strrchr.c | 124 ++++++++++++++++++++++++-------------
> > 1 file changed, 80 insertions(+), 44 deletions(-)
> >
> > diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> > index abdae60c51..ce4307a098 100644
> > --- a/benchtests/bench-strrchr.c
> > +++ b/benchtests/bench-strrchr.c
> > @@ -23,6 +23,7 @@
> > # define TEST_NAME "strrchr"
> > #endif
> > #include "bench-string.h"
> > +#include "json-lib.h"
> >
> > #define BIG_CHAR MAX_CHAR
> >
> > @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
> > }
> >
> > static void
> > -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> > + CHAR *exp_res)
> > {
> > CHAR *res = CALL (impl, s, c);
> > size_t i, iters = INNER_LOOP_ITERS8;
> > @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> >
> > if (res != exp_res)
> > {
> > - error (0, 0, "Wrong result in function %s %p %p", impl->name,
> > - res, exp_res);
> > + error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
> > + exp_res);
> > ret = 1;
> > return;
> > }
> > @@ -73,23 +75,25 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> > CALL (impl, s, c);
> > }
> > TIMING_NOW (stop);
> > -
> > TIMING_DIFF (cur, start, stop);
> >
> > - TIMING_PRINT_MEAN ((double) cur, (double) iters);
> > + json_element_double (json_ctx, (double) cur / (double) iters);
> > }
> >
> > static void
> > -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> > + int seek_char, int max_char, size_t freq)
> > /* For wcsrchr: align here means align not in bytes,
> > but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> > len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
> > {
> > size_t i;
> > + size_t pos_chunk_sz = freq ? (pos / freq) : pos;
> > + size_t last_pos = len;
> > CHAR *result;
> > CHAR *buf = (CHAR *) buf1;
> >
> > - align &= 7;
> > + align &= (getpagesize () - 1);
>
> Should we add some tests for page boundary cross?
Added a few in V4.
>
> > if ((align + len) * sizeof (CHAR) >= page_size)
> > return;
> >
> > @@ -103,6 +107,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > if ((i > pos || pos >= len) && buf[align + i] == seek_char)
> > buf[align + i] = seek_char + 10 + (random () & 15);
> > }
> > +
> > + if (pos_chunk_sz == 0 && pos)
> > + pos_chunk_sz = 1;
> > +
> > + for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
> > + {
> > + buf[align + i] = seek_char;
> > + last_pos = i;
> > + }
> > +
> > buf[align + len] = 0;
> >
> > if (pos < len)
> > @@ -110,66 +124,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > buf[align + pos] = seek_char;
> > result = (CHAR *) (buf + align + pos);
> > }
> > + else if (last_pos < len)
> > + result = (CHAR *) (buf + align + last_pos);
> > else if (seek_char == 0)
> > result = (CHAR *) (buf + align + len);
> > else
> > result = NULL;
> >
> > - printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
> > + json_element_object_begin (json_ctx);
> > + json_attr_uint (json_ctx, "len", len);
> > + json_attr_uint (json_ctx, "pos", pos);
> > + json_attr_uint (json_ctx, "align", align);
> > + json_attr_uint (json_ctx, "freq", freq);
> > + json_attr_uint (json_ctx, "seek", seek_char);
> > + json_attr_uint (json_ctx, "max_char", max_char);
> > + json_array_begin (json_ctx, "timings");
> >
> > FOR_EACH_IMPL (impl, 0)
> > - do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
> > + do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
> >
> > - putchar ('\n');
> > + json_array_end (json_ctx);
> > + json_element_object_end (json_ctx);
> > }
> >
> > int
> > test_main (void)
> > {
> > - size_t i;
> > + json_ctx_t json_ctx;
> > + size_t i, j;
> > + int seek;
> >
> > test_init ();
> > + json_init (&json_ctx, 0, stdout);
> >
> > - printf ("%20s", "");
> > - FOR_EACH_IMPL (impl, 0)
> > - printf ("\t%s", impl->name);
> > - putchar ('\n');
> > -
> > - for (i = 1; i < 8; ++i)
> > - {
> > - do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
> > - do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
> > - }
> > + json_document_begin (&json_ctx);
> > + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> >
> > - for (i = 1; i < 8; ++i)
> > - {
> > - do_test (i, 64, 256, 23, SMALL_CHAR);
> > - do_test (i, 64, 256, 23, BIG_CHAR);
> > - }
> > + json_attr_object_begin (&json_ctx, "functions");
> > + json_attr_object_begin (&json_ctx, TEST_NAME);
> > + json_attr_string (&json_ctx, "bench-variant", "");
> >
> > - for (i = 0; i < 32; ++i)
> > - {
> > - do_test (0, i, i + 1, 23, SMALL_CHAR);
> > - do_test (0, i, i + 1, 23, BIG_CHAR);
> > - }
> > + json_array_begin (&json_ctx, "ifuncs");
> > + FOR_EACH_IMPL (impl, 0)
> > + json_element_string (&json_ctx, impl->name);
> > + json_array_end (&json_ctx);
> >
> > - for (i = 1; i < 8; ++i)
> > - {
> > - do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
> > - do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
> > - }
> > + json_array_begin (&json_ctx, "results");
> >
> > - for (i = 1; i < 8; ++i)
> > + for (seek = 0; seek <= 23; seek += 23)
> > {
> > - do_test (i, 64, 256, 0, SMALL_CHAR);
> > - do_test (i, 64, 256, 0, BIG_CHAR);
> > + for (j = 1; j < 32; j += j)
> > + {
> > + for (i = 1; i < 9; ++i)
> > + {
> > + do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
> > + do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
> > + }
> > +
> > + for (i = 1; i < 8; ++i)
> > + {
> > + do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
> > + do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
> > +
> > + do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
> > + do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
> > + }
> > +
> > + for (i = 0; i < 32; ++i)
> > + {
> > + do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
> > + do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
> > + }
> > + if (seek == 0)
> > + {
> > + break;
> > + }
> > + }
> > }
> >
> > - for (i = 0; i < 32; ++i)
> > - {
> > - do_test (0, i, i + 1, 0, SMALL_CHAR);
> > - do_test (0, i, i + 1, 0, BIG_CHAR);
> > - }
> > + json_array_end (&json_ctx);
> > + json_attr_object_end (&json_ctx);
> > + json_attr_object_end (&json_ctx);
> > + json_document_end (&json_ctx);
> >
> > return ret;
> > }
> > --
> > 2.25.1
> >
>
>
> --
> H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v4 1/4] benchtests: Improve bench-strrchr
2022-04-22 19:11 ` [PATCH v4 " Noah Goldstein
@ 2022-04-23 1:53 ` H.J. Lu
0 siblings, 0 replies; 36+ messages in thread
From: H.J. Lu @ 2022-04-23 1:53 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Fri, Apr 22, 2022 at 12:12 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1. Use json-lib for printing results.
> 2. Expose all parameters (previously pos, seek_char, and max_char were
> not printed).
> 3. Add benchmarks that test multiple occurrences of seek_char in the
> string.
> ---
> benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
> 1 file changed, 82 insertions(+), 44 deletions(-)
>
> diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> index abdae60c51..7cd2a15484 100644
> --- a/benchtests/bench-strrchr.c
> +++ b/benchtests/bench-strrchr.c
> @@ -23,6 +23,7 @@
> # define TEST_NAME "strrchr"
> #endif
> #include "bench-string.h"
> +#include "json-lib.h"
>
> #define BIG_CHAR MAX_CHAR
>
> @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
> }
>
> static void
> -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
> + CHAR *exp_res)
> {
> CHAR *res = CALL (impl, s, c);
> size_t i, iters = INNER_LOOP_ITERS8;
> @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
>
> if (res != exp_res)
> {
> - error (0, 0, "Wrong result in function %s %p %p", impl->name,
> - res, exp_res);
> + error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
> + exp_res);
> ret = 1;
> return;
> }
> @@ -73,23 +75,25 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
> CALL (impl, s, c);
> }
> TIMING_NOW (stop);
> -
> TIMING_DIFF (cur, start, stop);
>
> - TIMING_PRINT_MEAN ((double) cur, (double) iters);
> + json_element_double (json_ctx, (double) cur / (double) iters);
> }
>
> static void
> -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
> + int seek_char, int max_char, size_t freq)
> /* For wcsrchr: align here means align not in bytes,
> but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
> {
> size_t i;
> + size_t pos_chunk_sz = freq ? (pos / freq) : pos;
> + size_t last_pos = len;
> CHAR *result;
> CHAR *buf = (CHAR *) buf1;
>
> - align &= 7;
> + align &= (getpagesize () - 1);
> if ((align + len) * sizeof (CHAR) >= page_size)
page_size == 2 * getpagesize ()
> return;
>
> @@ -103,6 +107,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> if ((i > pos || pos >= len) && buf[align + i] == seek_char)
> buf[align + i] = seek_char + 10 + (random () & 15);
> }
> +
> + if (pos_chunk_sz == 0 && pos)
> + pos_chunk_sz = 1;
> +
> + for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
> + {
> + buf[align + i] = seek_char;
> + last_pos = i;
> + }
> +
> buf[align + len] = 0;
>
> if (pos < len)
> @@ -110,66 +124,90 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> buf[align + pos] = seek_char;
> result = (CHAR *) (buf + align + pos);
> }
> + else if (last_pos < len)
> + result = (CHAR *) (buf + align + last_pos);
> else if (seek_char == 0)
> result = (CHAR *) (buf + align + len);
> else
> result = NULL;
>
> - printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
> + json_element_object_begin (json_ctx);
> + json_attr_uint (json_ctx, "len", len);
> + json_attr_uint (json_ctx, "pos", pos);
> + json_attr_uint (json_ctx, "align", align);
> + json_attr_uint (json_ctx, "freq", freq);
> + json_attr_uint (json_ctx, "seek", seek_char);
> + json_attr_uint (json_ctx, "max_char", max_char);
> + json_array_begin (json_ctx, "timings");
>
> FOR_EACH_IMPL (impl, 0)
> - do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
> + do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);
>
> - putchar ('\n');
> + json_array_end (json_ctx);
> + json_element_object_end (json_ctx);
> }
>
> int
> test_main (void)
> {
> - size_t i;
> + json_ctx_t json_ctx;
> + size_t i, j;
> + int seek;
>
> test_init ();
> + json_init (&json_ctx, 0, stdout);
>
> - printf ("%20s", "");
> - FOR_EACH_IMPL (impl, 0)
> - printf ("\t%s", impl->name);
> - putchar ('\n');
> -
> - for (i = 1; i < 8; ++i)
> - {
> - do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
> - do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
> - }
> + json_document_begin (&json_ctx);
> + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
>
> - for (i = 1; i < 8; ++i)
> - {
> - do_test (i, 64, 256, 23, SMALL_CHAR);
> - do_test (i, 64, 256, 23, BIG_CHAR);
> - }
> + json_attr_object_begin (&json_ctx, "functions");
> + json_attr_object_begin (&json_ctx, TEST_NAME);
> + json_attr_string (&json_ctx, "bench-variant", "");
>
> - for (i = 0; i < 32; ++i)
> - {
> - do_test (0, i, i + 1, 23, SMALL_CHAR);
> - do_test (0, i, i + 1, 23, BIG_CHAR);
> - }
> + json_array_begin (&json_ctx, "ifuncs");
> + FOR_EACH_IMPL (impl, 0)
> + json_element_string (&json_ctx, impl->name);
> + json_array_end (&json_ctx);
>
> - for (i = 1; i < 8; ++i)
> - {
> - do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
> - do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
> - }
> + json_array_begin (&json_ctx, "results");
>
> - for (i = 1; i < 8; ++i)
> + for (seek = 0; seek <= 23; seek += 23)
> {
> - do_test (i, 64, 256, 0, SMALL_CHAR);
> - do_test (i, 64, 256, 0, BIG_CHAR);
> + for (j = 1; j < 32; j += j)
> + {
> + for (i = 1; i < 9; ++i)
> + {
> + do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
> + do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
> + }
> +
> + for (i = 1; i < 8; ++i)
> + {
> + do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
> + do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
> +
> + do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
> + do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
> + }
> +
> + for (i = 0; i < 32; ++i)
> + {
> + do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
> + do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
> + do_test (&json_ctx, getpagesize () - i / 2 - 1, i, i + 1, seek,
> + SMALL_CHAR, j);
> + }
> + if (seek == 0)
> + {
> + break;
> + }
> + }
> }
>
> - for (i = 0; i < 32; ++i)
> - {
> - do_test (0, i, i + 1, 0, SMALL_CHAR);
> - do_test (0, i, i + 1, 0, BIG_CHAR);
> - }
> + json_array_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_document_end (&json_ctx);
>
> return ret;
> }
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks
--
H.J.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2
2022-04-22 19:06 ` H.J. Lu
@ 2022-05-12 20:13 ` Sunil Pandey
0 siblings, 0 replies; 36+ messages in thread
From: Sunil Pandey @ 2022-05-12 20:13 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Fri, Apr 22, 2022 at 12:09 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The new code unrolls the main loop slightly without adding too much
> > overhead and minimizes the comparisons for the search CHAR.
> >
> > Geometric Mean of all benchmarks New / Old: 0.741
> > See email for all results.
> >
> > Full xcheck passes on x86_64 with and without multiarch enabled.
> > ---
> > sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
> > sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
> > sysdeps/x86_64/strrchr.S | 510 +++++++++++++++---------
> > sysdeps/x86_64/wcsrchr.S | 268 +------------
> > 4 files changed, 339 insertions(+), 444 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > index db1b44c23c..866396e947 100644
> > --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > @@ -17,7 +17,7 @@
> > <https://www.gnu.org/licenses/>. */
> >
> > #if IS_IN (libc)
> > -# define strrchr __strrchr_sse2
> > +# define STRRCHR __strrchr_sse2
> >
> > # undef weak_alias
> > # define weak_alias(strrchr, rindex)
> > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > index 78d1ca6553..69d2f3cdb1 100644
> > --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > @@ -17,7 +17,6 @@
> > <https://www.gnu.org/licenses/>. */
> >
> > #if IS_IN (libc)
> > -# define wcsrchr __wcsrchr_sse2
> > +# define STRRCHR __wcsrchr_sse2
> > #endif
> > -
> > #include "../wcsrchr.S"
> > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> > index 50d886713e..4d7ba4ceb2 100644
> > --- a/sysdeps/x86_64/strrchr.S
> > +++ b/sysdeps/x86_64/strrchr.S
> > @@ -19,210 +19,360 @@
> >
> > #include <sysdep.h>
> >
> > +#ifndef STRRCHR
> > +# define STRRCHR strrchr
> > +#endif
> > +
> > +#ifdef USE_AS_WCSRCHR
> > +# define PCMPEQ pcmpeqd
> > +# define CHAR_SIZE 4
> > +# define PMINU pminud
> > +#else
> > +# define PCMPEQ pcmpeqb
> > +# define CHAR_SIZE 1
> > +# define PMINU pminub
> > +#endif
> > +
> > +#define PAGE_SIZE 4096
> > +#define VEC_SIZE 16
> > +
> > .text
> > -ENTRY (strrchr)
> > - movd %esi, %xmm1
> > +ENTRY(STRRCHR)
> > + movd %esi, %xmm0
> > movq %rdi, %rax
> > - andl $4095, %eax
> > - punpcklbw %xmm1, %xmm1
> > - cmpq $4032, %rax
> > - punpcklwd %xmm1, %xmm1
> > - pshufd $0, %xmm1, %xmm1
> > + andl $(PAGE_SIZE - 1), %eax
> > +#ifndef USE_AS_WCSRCHR
> > + punpcklbw %xmm0, %xmm0
> > + punpcklwd %xmm0, %xmm0
> > +#endif
> > + pshufd $0, %xmm0, %xmm0
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > ja L(cross_page)
> > - movdqu (%rdi), %xmm0
> > +
> > +L(cross_page_continue):
> > + movups (%rdi), %xmm1
> > pxor %xmm2, %xmm2
> > - movdqa %xmm0, %xmm3
> > - pcmpeqb %xmm1, %xmm0
> > - pcmpeqb %xmm2, %xmm3
> > - pmovmskb %xmm0, %ecx
> > - pmovmskb %xmm3, %edx
> > - testq %rdx, %rdx
> > - je L(next_48_bytes)
> > - leaq -1(%rdx), %rax
> > - xorq %rdx, %rax
> > - andq %rcx, %rax
> > - je L(exit)
> > - bsrq %rax, %rax
> > + PCMPEQ %xmm1, %xmm2
> > + pmovmskb %xmm2, %ecx
> > + testl %ecx, %ecx
> > + jz L(aligned_more)
> > +
> > + PCMPEQ %xmm0, %xmm1
> > + pmovmskb %xmm1, %eax
> > + leal -1(%rcx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(ret0)
> > + bsrl %eax, %eax
> > addq %rdi, %rax
> > + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > + search CHAR is zero we are correct. Either way `andq
> > + -CHAR_SIZE, %rax` gets the correct result. */
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > +L(ret0):
> > ret
> >
> > + /* Returns for first vec x1/x2 have hard coded backward search
> > + path for earlier matches. */
> > .p2align 4
> > -L(next_48_bytes):
> > - movdqu 16(%rdi), %xmm4
> > - movdqa %xmm4, %xmm5
> > - movdqu 32(%rdi), %xmm3
> > - pcmpeqb %xmm1, %xmm4
> > - pcmpeqb %xmm2, %xmm5
> > - movdqu 48(%rdi), %xmm0
> > - pmovmskb %xmm5, %edx
> > - movdqa %xmm3, %xmm5
> > - pcmpeqb %xmm1, %xmm3
> > - pcmpeqb %xmm2, %xmm5
> > - pcmpeqb %xmm0, %xmm2
> > - salq $16, %rdx
> > - pmovmskb %xmm3, %r8d
> > - pmovmskb %xmm5, %eax
> > - pmovmskb %xmm2, %esi
> > - salq $32, %r8
> > - salq $32, %rax
> > - pcmpeqb %xmm1, %xmm0
> > - orq %rdx, %rax
> > - movq %rsi, %rdx
> > - pmovmskb %xmm4, %esi
> > - salq $48, %rdx
> > - salq $16, %rsi
> > - orq %r8, %rsi
> > - orq %rcx, %rsi
> > - pmovmskb %xmm0, %ecx
> > - salq $48, %rcx
> > - orq %rcx, %rsi
> > - orq %rdx, %rax
> > - je L(loop_header2)
> > - leaq -1(%rax), %rcx
> > - xorq %rax, %rcx
> > - andq %rcx, %rsi
> > - je L(exit)
> > - bsrq %rsi, %rsi
> > - leaq (%rdi,%rsi), %rax
> > +L(first_vec_x0_test):
> > + PCMPEQ %xmm0, %xmm1
> > + pmovmskb %xmm1, %eax
> > + testl %eax, %eax
> > + jz L(ret0)
> > + bsrl %eax, %eax
> > + addq %r8, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > ret
> >
> > .p2align 4
> > -L(loop_header2):
> > - testq %rsi, %rsi
> > - movq %rdi, %rcx
> > - je L(no_c_found)
> > -L(loop_header):
> > - addq $64, %rdi
> > - pxor %xmm7, %xmm7
> > - andq $-64, %rdi
> > - jmp L(loop_entry)
> > +L(first_vec_x1):
> > + PCMPEQ %xmm0, %xmm2
> > + pmovmskb %xmm2, %eax
> > + leal -1(%rcx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(first_vec_x0_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> >
> > .p2align 4
> > -L(loop64):
> > - testq %rdx, %rdx
> > - cmovne %rdx, %rsi
> > - cmovne %rdi, %rcx
> > - addq $64, %rdi
> > -L(loop_entry):
> > - movdqa 32(%rdi), %xmm3
> > - pxor %xmm6, %xmm6
> > - movdqa 48(%rdi), %xmm2
> > - movdqa %xmm3, %xmm0
> > - movdqa 16(%rdi), %xmm4
> > - pminub %xmm2, %xmm0
> > - movdqa (%rdi), %xmm5
> > - pminub %xmm4, %xmm0
> > - pminub %xmm5, %xmm0
> > - pcmpeqb %xmm7, %xmm0
> > - pmovmskb %xmm0, %eax
> > - movdqa %xmm5, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - pmovmskb %xmm0, %r9d
> > - movdqa %xmm4, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - pmovmskb %xmm0, %edx
> > - movdqa %xmm3, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - salq $16, %rdx
> > - pmovmskb %xmm0, %r10d
> > - movdqa %xmm2, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - salq $32, %r10
> > - orq %r10, %rdx
> > - pmovmskb %xmm0, %r8d
> > - orq %r9, %rdx
> > - salq $48, %r8
> > - orq %r8, %rdx
> > +L(first_vec_x1_test):
> > + PCMPEQ %xmm0, %xmm2
> > + pmovmskb %xmm2, %eax
> > testl %eax, %eax
> > - je L(loop64)
> > - pcmpeqb %xmm6, %xmm4
> > - pcmpeqb %xmm6, %xmm3
> > - pcmpeqb %xmm6, %xmm5
> > - pmovmskb %xmm4, %eax
> > - pmovmskb %xmm3, %r10d
> > - pcmpeqb %xmm6, %xmm2
> > - pmovmskb %xmm5, %r9d
> > - salq $32, %r10
> > - salq $16, %rax
> > - pmovmskb %xmm2, %r8d
> > - orq %r10, %rax
> > - orq %r9, %rax
> > - salq $48, %r8
> > - orq %r8, %rax
> > - leaq -1(%rax), %r8
> > - xorq %rax, %r8
> > - andq %r8, %rdx
> > - cmovne %rdi, %rcx
> > - cmovne %rdx, %rsi
> > - bsrq %rsi, %rsi
> > - leaq (%rcx,%rsi), %rax
> > + jz L(first_vec_x0_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4
> > +L(first_vec_x2):
> > + PCMPEQ %xmm0, %xmm3
> > + pmovmskb %xmm3, %eax
> > + leal -1(%rcx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(first_vec_x1_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4
> > +L(aligned_more):
> > + /* Save original pointer if match was in VEC 0. */
> > + movq %rdi, %r8
> > + andq $-VEC_SIZE, %rdi
> > +
> > + movaps VEC_SIZE(%rdi), %xmm2
> > + pxor %xmm3, %xmm3
> > + PCMPEQ %xmm2, %xmm3
> > + pmovmskb %xmm3, %ecx
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x1)
> > +
> > + movaps (VEC_SIZE * 2)(%rdi), %xmm3
> > + pxor %xmm4, %xmm4
> > + PCMPEQ %xmm3, %xmm4
> > + pmovmskb %xmm4, %ecx
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x2)
> > +
> > + addq $VEC_SIZE, %rdi
> > + /* Save pointer again before realigning. */
> > + movq %rdi, %rsi
> > + andq $-(VEC_SIZE * 2), %rdi
> > + .p2align 4
> > +L(first_loop):
> > + /* Do 2x VEC at a time. */
> > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > + /* Since SSE2 no pminud so wcsrchr needs seperate logic for
> > + detecting zero. Note if this is found to be a bottleneck it
> > + may be worth adding an SSE4.1 wcsrchr implementation. */
> > +#ifdef USE_AS_WCSRCHR
> > + movaps %xmm5, %xmm6
> > + pxor %xmm8, %xmm8
> > +
> > + PCMPEQ %xmm8, %xmm5
> > + PCMPEQ %xmm4, %xmm8
> > + por %xmm5, %xmm8
> > +#else
> > + movaps %xmm5, %xmm6
> > + PMINU %xmm4, %xmm5
> > +#endif
> > +
> > + movaps %xmm4, %xmm9
> > + PCMPEQ %xmm0, %xmm4
> > + PCMPEQ %xmm0, %xmm6
> > + movaps %xmm6, %xmm7
> > + por %xmm4, %xmm6
> > +#ifndef USE_AS_WCSRCHR
> > + pxor %xmm8, %xmm8
> > + PCMPEQ %xmm5, %xmm8
> > +#endif
> > + pmovmskb %xmm8, %ecx
> > + pmovmskb %xmm6, %eax
> > +
> > + addq $(VEC_SIZE * 2), %rdi
> > + /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> > + macro-fuse with `jz`. */
> > + addl %ecx, %eax
> > + jz L(first_loop)
> > +
> > + /* Check if there is zero match. */
> > + testl %ecx, %ecx
> > + jz L(second_loop_match)
> > +
> > + /* Check if there was a match in last iteration. */
> > + subl %ecx, %eax
> > + jnz L(new_match)
> > +
> > +L(first_loop_old_match):
> > + PCMPEQ %xmm0, %xmm2
> > + PCMPEQ %xmm0, %xmm3
> > + pmovmskb %xmm2, %ecx
> > + pmovmskb %xmm3, %eax
> > + addl %eax, %ecx
> > + jz L(first_vec_x0_test)
> > + /* NB: We could move this shift to before the branch and save a
> > + bit of code size / performance on the fall through. The
> > + branch leads to the null case which generally seems hotter
> > + than char in first 3x VEC. */
> > + sall $16, %eax
> > + orl %ecx, %eax
> > +
> > + bsrl %eax, %eax
> > + addq %rsi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4
> > +L(new_match):
> > + pxor %xmm6, %xmm6
> > + PCMPEQ %xmm9, %xmm6
> > + pmovmskb %xmm6, %eax
> > + sall $16, %ecx
> > + orl %eax, %ecx
> > +
> > + /* We can't reuse either of the old comparisons as since we mask
> > + of zeros after first zero (instead of using the full
> > + comparison) we can't gurantee no interference between match
> > + after end of string and valid match. */
> > + pmovmskb %xmm4, %eax
> > + pmovmskb %xmm7, %edx
> > + sall $16, %edx
> > + orl %edx, %eax
> > +
> > + leal -1(%ecx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(first_loop_old_match)
> > + bsrl %eax, %eax
> > + addq %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > ret
> >
> > + /* Save minimum state for getting most recent match. We can
> > + throw out all previous work. */
> > .p2align 4
> > -L(no_c_found):
> > - movl $1, %esi
> > - xorl %ecx, %ecx
> > - jmp L(loop_header)
> > +L(second_loop_match):
> > + movq %rdi, %rsi
> > + movaps %xmm4, %xmm2
> > + movaps %xmm7, %xmm3
> >
> > .p2align 4
> > -L(exit):
> > - xorl %eax, %eax
> > +L(second_loop):
> > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > + /* Since SSE2 no pminud so wcsrchr needs seperate logic for
> > + detecting zero. Note if this is found to be a bottleneck it
> > + may be worth adding an SSE4.1 wcsrchr implementation. */
> > +#ifdef USE_AS_WCSRCHR
> > + movaps %xmm5, %xmm6
> > + pxor %xmm8, %xmm8
> > +
> > + PCMPEQ %xmm8, %xmm5
> > + PCMPEQ %xmm4, %xmm8
> > + por %xmm5, %xmm8
> > +#else
> > + movaps %xmm5, %xmm6
> > + PMINU %xmm4, %xmm5
> > +#endif
> > +
> > + movaps %xmm4, %xmm9
> > + PCMPEQ %xmm0, %xmm4
> > + PCMPEQ %xmm0, %xmm6
> > + movaps %xmm6, %xmm7
> > + por %xmm4, %xmm6
> > +#ifndef USE_AS_WCSRCHR
> > + pxor %xmm8, %xmm8
> > + PCMPEQ %xmm5, %xmm8
> > +#endif
> > +
> > + pmovmskb %xmm8, %ecx
> > + pmovmskb %xmm6, %eax
> > +
> > + addq $(VEC_SIZE * 2), %rdi
> > + /* Either null term or new occurence of CHAR. */
> > + addl %ecx, %eax
> > + jz L(second_loop)
> > +
> > + /* No null term so much be new occurence of CHAR. */
> > + testl %ecx, %ecx
> > + jz L(second_loop_match)
> > +
> > +
> > + subl %ecx, %eax
> > + jnz L(second_loop_new_match)
> > +
> > +L(second_loop_old_match):
> > + pmovmskb %xmm2, %ecx
> > + pmovmskb %xmm3, %eax
> > + sall $16, %eax
> > + orl %ecx, %eax
> > + bsrl %eax, %eax
> > + addq %rsi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > ret
> >
> > .p2align 4
> > +L(second_loop_new_match):
> > + pxor %xmm6, %xmm6
> > + PCMPEQ %xmm9, %xmm6
> > + pmovmskb %xmm6, %eax
> > + sall $16, %ecx
> > + orl %eax, %ecx
> > +
> > + /* We can't reuse either of the old comparisons as since we mask
> > + of zeros after first zero (instead of using the full
> > + comparison) we can't gurantee no interference between match
> > + after end of string and valid match. */
> > + pmovmskb %xmm4, %eax
> > + pmovmskb %xmm7, %edx
> > + sall $16, %edx
> > + orl %edx, %eax
> > +
> > + leal -1(%ecx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(second_loop_old_match)
> > + bsrl %eax, %eax
> > + addq %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4,, 4
> > L(cross_page):
> > - movq %rdi, %rax
> > - pxor %xmm0, %xmm0
> > - andq $-64, %rax
> > - movdqu (%rax), %xmm5
> > - movdqa %xmm5, %xmm6
> > - movdqu 16(%rax), %xmm4
> > - pcmpeqb %xmm1, %xmm5
> > - pcmpeqb %xmm0, %xmm6
> > - movdqu 32(%rax), %xmm3
> > - pmovmskb %xmm6, %esi
> > - movdqa %xmm4, %xmm6
> > - movdqu 48(%rax), %xmm2
> > - pcmpeqb %xmm1, %xmm4
> > - pcmpeqb %xmm0, %xmm6
> > - pmovmskb %xmm6, %edx
> > - movdqa %xmm3, %xmm6
> > - pcmpeqb %xmm1, %xmm3
> > - pcmpeqb %xmm0, %xmm6
> > - pcmpeqb %xmm2, %xmm0
> > - salq $16, %rdx
> > - pmovmskb %xmm3, %r9d
> > - pmovmskb %xmm6, %r8d
> > - pmovmskb %xmm0, %ecx
> > - salq $32, %r9
> > - salq $32, %r8
> > - pcmpeqb %xmm1, %xmm2
> > - orq %r8, %rdx
> > - salq $48, %rcx
> > - pmovmskb %xmm5, %r8d
> > - orq %rsi, %rdx
> > - pmovmskb %xmm4, %esi
> > - orq %rcx, %rdx
> > - pmovmskb %xmm2, %ecx
> > - salq $16, %rsi
> > - salq $48, %rcx
> > - orq %r9, %rsi
> > - orq %r8, %rsi
> > - orq %rcx, %rsi
> > + movq %rdi, %rsi
> > + andq $-VEC_SIZE, %rsi
> > + movaps (%rsi), %xmm1
> > + pxor %xmm2, %xmm2
> > + PCMPEQ %xmm1, %xmm2
> > + pmovmskb %xmm2, %edx
> > movl %edi, %ecx
> > - subl %eax, %ecx
> > - shrq %cl, %rdx
> > - shrq %cl, %rsi
> > - testq %rdx, %rdx
> > - je L(loop_header2)
> > - leaq -1(%rdx), %rax
> > - xorq %rdx, %rax
> > - andq %rax, %rsi
> > - je L(exit)
> > - bsrq %rsi, %rax
> > + andl $(VEC_SIZE - 1), %ecx
> > + sarl %cl, %edx
> > + jz L(cross_page_continue)
> > + PCMPEQ %xmm0, %xmm1
> > + pmovmskb %xmm1, %eax
> > + sarl %cl, %eax
> > + leal -1(%rdx), %ecx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(ret1)
> > + bsrl %eax, %eax
> > addq %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > +L(ret1):
> > ret
> > -END (strrchr)
> > +END(STRRCHR)
> >
> > -weak_alias (strrchr, rindex)
> > -libc_hidden_builtin_def (strrchr)
> > +#ifndef USE_AS_WCSRCHR
> > + weak_alias (STRRCHR, rindex)
> > + libc_hidden_builtin_def (STRRCHR)
> > +#endif
> > diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> > index 61552954de..2b80efc5ef 100644
> > --- a/sysdeps/x86_64/wcsrchr.S
> > +++ b/sysdeps/x86_64/wcsrchr.S
> > @@ -1,4 +1,4 @@
> > -/* wcsrchr with SSSE3
> > +/* wcsrchr optimized with SSE2.
> > Copyright (C) 2011-2022 Free Software Foundation, Inc.
> > This file is part of the GNU C Library.
> >
> > @@ -16,266 +16,12 @@
> > License along with the GNU C Library; if not, see
> > <https://www.gnu.org/licenses/>. */
> >
> > -#include <sysdep.h>
> >
> > - .text
> > -ENTRY (wcsrchr)
> > +#define USE_AS_WCSRCHR 1
> > +#define NO_PMINU 1
> >
> > - movd %rsi, %xmm1
> > - mov %rdi, %rcx
> > - punpckldq %xmm1, %xmm1
> > - pxor %xmm2, %xmm2
> > - punpckldq %xmm1, %xmm1
> > - and $63, %rcx
> > - cmp $48, %rcx
> > - ja L(crosscache)
> > +#ifndef STRRCHR
> > +# define STRRCHR wcsrchr
> > +#endif
> >
> > - movdqu (%rdi), %xmm0
> > - pcmpeqd %xmm0, %xmm2
> > - pcmpeqd %xmm1, %xmm0
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm0, %rax
> > - add $16, %rdi
> > -
> > - test %rax, %rax
> > - jnz L(unaligned_match1)
> > -
> > - test %rcx, %rcx
> > - jnz L(return_null)
> > -
> > - and $-16, %rdi
> > - xor %r8, %r8
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(unaligned_match1):
> > - test %rcx, %rcx
> > - jnz L(prolog_find_zero_1)
> > -
> > - mov %rax, %r8
> > - mov %rdi, %rsi
> > - and $-16, %rdi
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(crosscache):
> > - and $15, %rcx
> > - and $-16, %rdi
> > - pxor %xmm3, %xmm3
> > - movdqa (%rdi), %xmm0
> > - pcmpeqd %xmm0, %xmm3
> > - pcmpeqd %xmm1, %xmm0
> > - pmovmskb %xmm3, %rdx
> > - pmovmskb %xmm0, %rax
> > - shr %cl, %rdx
> > - shr %cl, %rax
> > - add $16, %rdi
> > -
> > - test %rax, %rax
> > - jnz L(unaligned_match)
> > -
> > - test %rdx, %rdx
> > - jnz L(return_null)
> > -
> > - xor %r8, %r8
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(unaligned_match):
> > - test %rdx, %rdx
> > - jnz L(prolog_find_zero)
> > -
> > - mov %rax, %r8
> > - lea (%rdi, %rcx), %rsi
> > -
> > -/* Loop start on aligned string. */
> > - .p2align 4
> > -L(loop):
> > - movdqa (%rdi), %xmm0
> > - pcmpeqd %xmm0, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm0
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm0, %rax
> > - or %rax, %rcx
> > - jnz L(matches)
> > -
> > - movdqa (%rdi), %xmm3
> > - pcmpeqd %xmm3, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm3
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm3, %rax
> > - or %rax, %rcx
> > - jnz L(matches)
> > -
> > - movdqa (%rdi), %xmm4
> > - pcmpeqd %xmm4, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm4
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm4, %rax
> > - or %rax, %rcx
> > - jnz L(matches)
> > -
> > - movdqa (%rdi), %xmm5
> > - pcmpeqd %xmm5, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm5
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm5, %rax
> > - or %rax, %rcx
> > - jz L(loop)
> > -
> > - .p2align 4
> > -L(matches):
> > - test %rax, %rax
> > - jnz L(match)
> > -L(return_value):
> > - test %r8, %r8
> > - jz L(return_null)
> > - mov %r8, %rax
> > - mov %rsi, %rdi
> > -
> > - test $15 << 4, %ah
> > - jnz L(match_fourth_wchar)
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match):
> > - pmovmskb %xmm2, %rcx
> > - test %rcx, %rcx
> > - jnz L(find_zero)
> > - mov %rax, %r8
> > - mov %rdi, %rsi
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(find_zero):
> > - test $15, %cl
> > - jnz L(find_zero_in_first_wchar)
> > - test %cl, %cl
> > - jnz L(find_zero_in_second_wchar)
> > - test $15, %ch
> > - jnz L(find_zero_in_third_wchar)
> > -
> > - and $1 << 13 - 1, %rax
> > - jz L(return_value)
> > -
> > - test $15 << 4, %ah
> > - jnz L(match_fourth_wchar)
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(find_zero_in_first_wchar):
> > - test $1, %rax
> > - jz L(return_value)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(find_zero_in_second_wchar):
> > - and $1 << 5 - 1, %rax
> > - jz L(return_value)
> > -
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(find_zero_in_third_wchar):
> > - and $1 << 9 - 1, %rax
> > - jz L(return_value)
> > -
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero):
> > - add %rcx, %rdi
> > - mov %rdx, %rcx
> > -L(prolog_find_zero_1):
> > - test $15, %cl
> > - jnz L(prolog_find_zero_in_first_wchar)
> > - test %cl, %cl
> > - jnz L(prolog_find_zero_in_second_wchar)
> > - test $15, %ch
> > - jnz L(prolog_find_zero_in_third_wchar)
> > -
> > - and $1 << 13 - 1, %rax
> > - jz L(return_null)
> > -
> > - test $15 << 4, %ah
> > - jnz L(match_fourth_wchar)
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero_in_first_wchar):
> > - test $1, %rax
> > - jz L(return_null)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero_in_second_wchar):
> > - and $1 << 5 - 1, %rax
> > - jz L(return_null)
> > -
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero_in_third_wchar):
> > - and $1 << 9 - 1, %rax
> > - jz L(return_null)
> > -
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match_second_wchar):
> > - lea -12(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match_third_wchar):
> > - lea -8(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match_fourth_wchar):
> > - lea -4(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(return_null):
> > - xor %rax, %rax
> > - ret
> > -
> > -END (wcsrchr)
> > +#include "../strrchr.S"
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2
2022-04-22 19:03 ` H.J. Lu
@ 2022-05-12 20:14 ` Sunil Pandey
2022-07-20 15:33 ` Noah Goldstein
0 siblings, 1 reply; 36+ messages in thread
From: Sunil Pandey @ 2022-05-12 20:14 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Fri, Apr 22, 2022 at 12:08 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The new code unrolls the main loop slightly without adding too much
> > overhead and minimizes the comparisons for the search CHAR.
> >
> > Geometric Mean of all benchmarks New / Old: 0.832
> > See email for all results.
> >
> > Full xcheck passes on x86_64 with and without multiarch enabled.
> > ---
> > sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
> > 1 file changed, 269 insertions(+), 157 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
> > index 1df2adfad0..bd26ba80d5 100644
> > --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
> > @@ -27,9 +27,13 @@
> > # ifdef USE_AS_WCSRCHR
> > # define VPBROADCAST vpbroadcastd
> > # define VPCMPEQ vpcmpeqd
> > +# define VPMIN vpminud
> > +# define CHAR_SIZE 4
> > # else
> > # define VPBROADCAST vpbroadcastb
> > # define VPCMPEQ vpcmpeqb
> > +# define VPMIN vpminub
> > +# define CHAR_SIZE 1
> > # endif
> >
> > # ifndef VZEROUPPER
> > @@ -41,196 +45,304 @@
> > # endif
> >
> > # define VEC_SIZE 32
> > +# define PAGE_SIZE 4096
> >
> > - .section SECTION(.text),"ax",@progbits
> > -ENTRY (STRRCHR)
> > - movd %esi, %xmm4
> > - movl %edi, %ecx
> > + .section SECTION(.text), "ax", @progbits
> > +ENTRY(STRRCHR)
> > + movd %esi, %xmm7
> > + movl %edi, %eax
> > /* Broadcast CHAR to YMM4. */
> > - VPBROADCAST %xmm4, %ymm4
> > + VPBROADCAST %xmm7, %ymm7
> > vpxor %xmm0, %xmm0, %xmm0
> >
> > - /* Check if we may cross page boundary with one vector load. */
> > - andl $(2 * VEC_SIZE - 1), %ecx
> > - cmpl $VEC_SIZE, %ecx
> > - ja L(cros_page_boundary)
> > + /* Shift here instead of `andl` to save code size (saves a fetch
> > + block). */
> > + sall $20, %eax
> > + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> > + ja L(cross_page)
> >
> > +L(page_cross_continue):
> > vmovdqu (%rdi), %ymm1
> > - VPCMPEQ %ymm1, %ymm0, %ymm2
> > - VPCMPEQ %ymm1, %ymm4, %ymm3
> > - vpmovmskb %ymm2, %ecx
> > - vpmovmskb %ymm3, %eax
> > - addq $VEC_SIZE, %rdi
> > + /* Check end of string match. */
> > + VPCMPEQ %ymm1, %ymm0, %ymm6
> > + vpmovmskb %ymm6, %ecx
> > + testl %ecx, %ecx
> > + jz L(aligned_more)
> > +
> > + /* Only check match with search CHAR if needed. */
> > + VPCMPEQ %ymm1, %ymm7, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + /* Check if match before first zero. */
> > + blsmskl %ecx, %ecx
> > + andl %ecx, %eax
> > + jz L(ret0)
> > + bsrl %eax, %eax
> > + addq %rdi, %rax
> > + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > + search CHAR is zero we are correct. Either way `andq
> > + -CHAR_SIZE, %rax` gets the correct result. */
> > +# ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +# endif
> > +L(ret0):
> > +L(return_vzeroupper):
> > + ZERO_UPPER_VEC_REGISTERS_RETURN
> > +
> > + /* Returns for first vec x1/x2 have hard coded backward search
> > + path for earlier matches. */
> > + .p2align 4,, 10
> > +L(first_vec_x1):
> > + VPCMPEQ %ymm2, %ymm7, %ymm6
> > + vpmovmskb %ymm6, %eax
> > + blsmskl %ecx, %ecx
> > + andl %ecx, %eax
> > + jnz L(first_vec_x1_return)
> > +
> > + .p2align 4,, 4
> > +L(first_vec_x0_test):
> > + VPCMPEQ %ymm1, %ymm7, %ymm6
> > + vpmovmskb %ymm6, %eax
> > + testl %eax, %eax
> > + jz L(ret1)
> > + bsrl %eax, %eax
> > + addq %r8, %rax
> > +# ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +# endif
> > +L(ret1):
> > + VZEROUPPER_RETURN
> >
> > + .p2align 4,, 10
> > +L(first_vec_x0_x1_test):
> > + VPCMPEQ %ymm2, %ymm7, %ymm6
> > + vpmovmskb %ymm6, %eax
> > + /* Check ymm2 for search CHAR match. If no match then check ymm1
> > + before returning. */
> > testl %eax, %eax
> > - jnz L(first_vec)
> > + jz L(first_vec_x0_test)
> > + .p2align 4,, 4
> > +L(first_vec_x1_return):
> > + bsrl %eax, %eax
> > + leaq 1(%rdi, %rax), %rax
> > +# ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +# endif
> > + VZEROUPPER_RETURN
> >
> > - testl %ecx, %ecx
> > - jnz L(return_null)
> >
> > - andq $-VEC_SIZE, %rdi
> > - xorl %edx, %edx
> > - jmp L(aligned_loop)
> > + .p2align 4,, 10
> > +L(first_vec_x2):
> > + VPCMPEQ %ymm3, %ymm7, %ymm6
> > + vpmovmskb %ymm6, %eax
> > + blsmskl %ecx, %ecx
> > + /* If no in-range search CHAR match in ymm3 then need to check
> > + ymm1/ymm2 for an earlier match (we delay checking search
> > + CHAR matches until needed). */
> > + andl %ecx, %eax
> > + jz L(first_vec_x0_x1_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
> > +# ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +# endif
> > + VZEROUPPER_RETURN
> > +
> >
> > .p2align 4
> > -L(first_vec):
> > - /* Check if there is a nul CHAR. */
> > +L(aligned_more):
> > + /* Save original pointer if match was in VEC 0. */
> > + movq %rdi, %r8
> > +
> > + /* Align src. */
> > + orq $(VEC_SIZE - 1), %rdi
> > + vmovdqu 1(%rdi), %ymm2
> > + VPCMPEQ %ymm2, %ymm0, %ymm6
> > + vpmovmskb %ymm6, %ecx
> > testl %ecx, %ecx
> > - jnz L(char_and_nul_in_first_vec)
> > + jnz L(first_vec_x1)
> >
> > - /* Remember the match and keep searching. */
> > - movl %eax, %edx
> > - movq %rdi, %rsi
> > - andq $-VEC_SIZE, %rdi
> > - jmp L(aligned_loop)
> > + vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
> > + VPCMPEQ %ymm3, %ymm0, %ymm6
> > + vpmovmskb %ymm6, %ecx
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x2)
> >
> > + /* Save pointer again before realigning. */
> > + movq %rdi, %rsi
> > + addq $(VEC_SIZE + 1), %rdi
> > + andq $-(VEC_SIZE * 2), %rdi
> > .p2align 4
> > -L(cros_page_boundary):
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > - vmovdqa (%rdi), %ymm1
> > - VPCMPEQ %ymm1, %ymm0, %ymm2
> > - VPCMPEQ %ymm1, %ymm4, %ymm3
> > - vpmovmskb %ymm2, %edx
> > - vpmovmskb %ymm3, %eax
> > - shrl %cl, %edx
> > - shrl %cl, %eax
> > - addq $VEC_SIZE, %rdi
> > -
> > - /* Check if there is a CHAR. */
> > +L(first_aligned_loop):
> > + /* Do 2x VEC at a time. Any more and the cost of finding the
> > + match outweighs loop benefit. */
> > + vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
> > + vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
> > +
> > + VPCMPEQ %ymm4, %ymm7, %ymm6
> > + VPMIN %ymm4, %ymm5, %ymm8
> > + VPCMPEQ %ymm5, %ymm7, %ymm10
> > + vpor %ymm6, %ymm10, %ymm5
> > + VPCMPEQ %ymm8, %ymm0, %ymm8
> > + vpor %ymm5, %ymm8, %ymm9
> > +
> > + vpmovmskb %ymm9, %eax
> > + addq $(VEC_SIZE * 2), %rdi
> > + /* No zero or search CHAR. */
> > testl %eax, %eax
> > - jnz L(found_char)
> > -
> > - testl %edx, %edx
> > - jnz L(return_null)
> > + jz L(first_aligned_loop)
> >
> > - jmp L(aligned_loop)
> > -
> > - .p2align 4
> > -L(found_char):
> > - testl %edx, %edx
> > - jnz L(char_and_nul)
> > + /* If no zero CHAR then go to second loop (this allows us to
> > + throw away all prior work). */
> > + vpmovmskb %ymm8, %ecx
> > + testl %ecx, %ecx
> > + jz L(second_aligned_loop_prep)
> >
> > - /* Remember the match and keep searching. */
> > - movl %eax, %edx
> > - leaq (%rdi, %rcx), %rsi
> > + /* Search char could be zero so we need to get the true match.
> > + */
> > + vpmovmskb %ymm5, %eax
> > + testl %eax, %eax
> > + jnz L(first_aligned_loop_return)
> >
> > - .p2align 4
> > -L(aligned_loop):
> > - vmovdqa (%rdi), %ymm1
> > - VPCMPEQ %ymm1, %ymm0, %ymm2
> > - addq $VEC_SIZE, %rdi
> > - VPCMPEQ %ymm1, %ymm4, %ymm3
> > - vpmovmskb %ymm2, %ecx
> > - vpmovmskb %ymm3, %eax
> > - orl %eax, %ecx
> > - jnz L(char_nor_null)
> > -
> > - vmovdqa (%rdi), %ymm1
> > - VPCMPEQ %ymm1, %ymm0, %ymm2
> > - add $VEC_SIZE, %rdi
> > - VPCMPEQ %ymm1, %ymm4, %ymm3
> > - vpmovmskb %ymm2, %ecx
> > + .p2align 4,, 4
> > +L(first_vec_x1_or_x2):
> > + VPCMPEQ %ymm3, %ymm7, %ymm3
> > + VPCMPEQ %ymm2, %ymm7, %ymm2
> > vpmovmskb %ymm3, %eax
> > - orl %eax, %ecx
> > - jnz L(char_nor_null)
> > -
> > - vmovdqa (%rdi), %ymm1
> > - VPCMPEQ %ymm1, %ymm0, %ymm2
> > - addq $VEC_SIZE, %rdi
> > - VPCMPEQ %ymm1, %ymm4, %ymm3
> > - vpmovmskb %ymm2, %ecx
> > - vpmovmskb %ymm3, %eax
> > - orl %eax, %ecx
> > - jnz L(char_nor_null)
> > -
> > - vmovdqa (%rdi), %ymm1
> > - VPCMPEQ %ymm1, %ymm0, %ymm2
> > - addq $VEC_SIZE, %rdi
> > - VPCMPEQ %ymm1, %ymm4, %ymm3
> > - vpmovmskb %ymm2, %ecx
> > - vpmovmskb %ymm3, %eax
> > - orl %eax, %ecx
> > - jz L(aligned_loop)
> > -
> > - .p2align 4
> > -L(char_nor_null):
> > - /* Find a CHAR or a nul CHAR in a loop. */
> > - testl %eax, %eax
> > - jnz L(match)
> > -L(return_value):
> > - testl %edx, %edx
> > - jz L(return_null)
> > - movl %edx, %eax
> > - movq %rsi, %rdi
> > + vpmovmskb %ymm2, %edx
> > + /* Use add for macro-fusion. */
> > + addq %rax, %rdx
> > + jz L(first_vec_x0_test)
> > + /* NB: We could move this shift to before the branch and save a
> > + bit of code size / performance on the fall through. The
> > + branch leads to the null case which generally seems hotter
> > + than char in first 3x VEC. */
> > + salq $32, %rax
> > + addq %rdx, %rax
> > + bsrq %rax, %rax
> > + leaq 1(%rsi, %rax), %rax
> > +# ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +# endif
> > + VZEROUPPER_RETURN
> >
> > + .p2align 4,, 8
> > +L(first_aligned_loop_return):
> > + VPCMPEQ %ymm4, %ymm0, %ymm4
> > + vpmovmskb %ymm4, %edx
> > + salq $32, %rcx
> > + orq %rdx, %rcx
> > +
> > + vpmovmskb %ymm10, %eax
> > + vpmovmskb %ymm6, %edx
> > + salq $32, %rax
> > + orq %rdx, %rax
> > + blsmskq %rcx, %rcx
> > + andq %rcx, %rax
> > + jz L(first_vec_x1_or_x2)
> > +
> > + bsrq %rax, %rax
> > + leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
> > # ifdef USE_AS_WCSRCHR
> > - /* Keep the first bit for each matching CHAR for bsr. */
> > - andl $0x11111111, %eax
> > + andq $-CHAR_SIZE, %rax
> > # endif
> > - bsrl %eax, %eax
> > - leaq -VEC_SIZE(%rdi, %rax), %rax
> > -L(return_vzeroupper):
> > - ZERO_UPPER_VEC_REGISTERS_RETURN
> > + VZEROUPPER_RETURN
> >
> > + /* Search char cannot be zero. */
> > .p2align 4
> > -L(match):
> > - /* Find a CHAR. Check if there is a nul CHAR. */
> > - vpmovmskb %ymm2, %ecx
> > - testl %ecx, %ecx
> > - jnz L(find_nul)
> > -
> > - /* Remember the match and keep searching. */
> > - movl %eax, %edx
> > +L(second_aligned_loop_set_furthest_match):
> > + /* Save VEC and pointer from most recent match. */
> > +L(second_aligned_loop_prep):
> > movq %rdi, %rsi
> > - jmp L(aligned_loop)
> > + vmovdqu %ymm6, %ymm2
> > + vmovdqu %ymm10, %ymm3
> >
> > .p2align 4
> > -L(find_nul):
> > -# ifdef USE_AS_WCSRCHR
> > - /* Keep the first bit for each matching CHAR for bsr. */
> > - andl $0x11111111, %ecx
> > - andl $0x11111111, %eax
> > -# endif
> > - /* Mask out any matching bits after the nul CHAR. */
> > - movl %ecx, %r8d
> > - subl $1, %r8d
> > - xorl %ecx, %r8d
> > - andl %r8d, %eax
> > +L(second_aligned_loop):
> > + /* Search 2x at at time. */
> > + vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
> > + vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
> > +
> > + VPCMPEQ %ymm4, %ymm7, %ymm6
> > + VPMIN %ymm4, %ymm5, %ymm1
> > + VPCMPEQ %ymm5, %ymm7, %ymm10
> > + vpor %ymm6, %ymm10, %ymm5
> > + VPCMPEQ %ymm1, %ymm0, %ymm1
> > + vpor %ymm5, %ymm1, %ymm9
> > +
> > + vpmovmskb %ymm9, %eax
> > + addq $(VEC_SIZE * 2), %rdi
> > testl %eax, %eax
> > - /* If there is no CHAR here, return the remembered one. */
> > - jz L(return_value)
> > - bsrl %eax, %eax
> > - leaq -VEC_SIZE(%rdi, %rax), %rax
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(char_and_nul):
> > - /* Find both a CHAR and a nul CHAR. */
> > - addq %rcx, %rdi
> > - movl %edx, %ecx
> > -L(char_and_nul_in_first_vec):
> > -# ifdef USE_AS_WCSRCHR
> > - /* Keep the first bit for each matching CHAR for bsr. */
> > - andl $0x11111111, %ecx
> > - andl $0x11111111, %eax
> > -# endif
> > - /* Mask out any matching bits after the nul CHAR. */
> > - movl %ecx, %r8d
> > - subl $1, %r8d
> > - xorl %ecx, %r8d
> > - andl %r8d, %eax
> > + jz L(second_aligned_loop)
> > + vpmovmskb %ymm1, %ecx
> > + testl %ecx, %ecx
> > + jz L(second_aligned_loop_set_furthest_match)
> > + vpmovmskb %ymm5, %eax
> > testl %eax, %eax
> > - /* Return null pointer if the nul CHAR comes first. */
> > - jz L(return_null)
> > - bsrl %eax, %eax
> > - leaq -VEC_SIZE(%rdi, %rax), %rax
> > + jnz L(return_new_match)
> > +
> > + /* This is the hot path. We know CHAR is inbounds and that
> > + ymm3/ymm2 have latest match. */
> > + .p2align 4,, 4
> > +L(return_old_match):
> > + vpmovmskb %ymm3, %eax
> > + vpmovmskb %ymm2, %edx
> > + salq $32, %rax
> > + orq %rdx, %rax
> > + bsrq %rax, %rax
> > + /* Search char cannot be zero so safe to just use lea for
> > + wcsrchr. */
> > + leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
> > VZEROUPPER_RETURN
> >
> > - .p2align 4
> > -L(return_null):
> > - xorl %eax, %eax
> > + /* Last iteration also potentially has a match. */
> > + .p2align 4,, 8
> > +L(return_new_match):
> > + VPCMPEQ %ymm4, %ymm0, %ymm4
> > + vpmovmskb %ymm4, %edx
> > + salq $32, %rcx
> > + orq %rdx, %rcx
> > +
> > + vpmovmskb %ymm10, %eax
> > + vpmovmskb %ymm6, %edx
> > + salq $32, %rax
> > + orq %rdx, %rax
> > + blsmskq %rcx, %rcx
> > + andq %rcx, %rax
> > + jz L(return_old_match)
> > + bsrq %rax, %rax
> > + /* Search char cannot be zero so safe to just use lea for
> > + wcsrchr. */
> > + leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
> > VZEROUPPER_RETURN
> >
> > -END (STRRCHR)
> > + .p2align 4,, 4
> > +L(cross_page):
> > + movq %rdi, %rsi
> > + andq $-VEC_SIZE, %rsi
> > + vmovdqu (%rsi), %ymm1
> > + VPCMPEQ %ymm1, %ymm0, %ymm6
> > + vpmovmskb %ymm6, %ecx
> > + /* Shift out zero CHAR matches that are before the beginning of
> > + src (rdi). */
> > + shrxl %edi, %ecx, %ecx
> > + testl %ecx, %ecx
> > + jz L(page_cross_continue)
> > + VPCMPEQ %ymm1, %ymm7, %ymm1
> > + vpmovmskb %ymm1, %eax
> > +
> > + /* Shift out search CHAR matches that are before the beginning of
> > + src (rdi). */
> > + shrxl %edi, %eax, %eax
> > + blsmskl %ecx, %ecx
> > + /* Check if any search CHAR match in range. */
> > + andl %ecx, %eax
> > + jz L(ret2)
> > + bsrl %eax, %eax
> > + addq %rdi, %rax
> > +# ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +# endif
> > +L(ret2):
> > + VZEROUPPER_RETURN
> > +END(STRRCHR)
> > #endif
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v3 4/4] x86: Optimize {str|wcs}rchr-evex
2022-04-22 19:04 ` H.J. Lu
@ 2022-05-12 20:16 ` Sunil Pandey
0 siblings, 0 replies; 36+ messages in thread
From: Sunil Pandey @ 2022-05-12 20:16 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Fri, Apr 22, 2022 at 12:08 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The new code unrolls the main loop slightly without adding too much
> > overhead and minimizes the comparisons for the search CHAR.
> >
> > Geometric Mean of all benchmarks New / Old: 0.755
> > See email for all results.
> >
> > Full xcheck passes on x86_64 with and without multiarch enabled.
> > ---
> > sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
> > 1 file changed, 290 insertions(+), 181 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
> > index adeddaed32..8014c285b3 100644
> > --- a/sysdeps/x86_64/multiarch/strrchr-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
> > @@ -24,242 +24,351 @@
> > # define STRRCHR __strrchr_evex
> > # endif
> >
> > -# define VMOVU vmovdqu64
> > -# define VMOVA vmovdqa64
> > +# define VMOVU vmovdqu64
> > +# define VMOVA vmovdqa64
> >
> > # ifdef USE_AS_WCSRCHR
> > +# define SHIFT_REG esi
> > +
> > +# define kunpck kunpckbw
> > +# define kmov_2x kmovd
> > +# define maskz_2x ecx
> > +# define maskm_2x eax
> > +# define CHAR_SIZE 4
> > +# define VPMIN vpminud
> > +# define VPTESTN vptestnmd
> > # define VPBROADCAST vpbroadcastd
> > -# define VPCMP vpcmpd
> > -# define SHIFT_REG r8d
> > +# define VPCMP vpcmpd
> > # else
> > +# define SHIFT_REG edi
> > +
> > +# define kunpck kunpckdq
> > +# define kmov_2x kmovq
> > +# define maskz_2x rcx
> > +# define maskm_2x rax
> > +
> > +# define CHAR_SIZE 1
> > +# define VPMIN vpminub
> > +# define VPTESTN vptestnmb
> > # define VPBROADCAST vpbroadcastb
> > -# define VPCMP vpcmpb
> > -# define SHIFT_REG ecx
> > +# define VPCMP vpcmpb
> > # endif
> >
> > # define XMMZERO xmm16
> > # define YMMZERO ymm16
> > # define YMMMATCH ymm17
> > -# define YMM1 ymm18
> > +# define YMMSAVE ymm18
> > +
> > +# define YMM1 ymm19
> > +# define YMM2 ymm20
> > +# define YMM3 ymm21
> > +# define YMM4 ymm22
> > +# define YMM5 ymm23
> > +# define YMM6 ymm24
> > +# define YMM7 ymm25
> > +# define YMM8 ymm26
> >
> > -# define VEC_SIZE 32
> >
> > - .section .text.evex,"ax",@progbits
> > -ENTRY (STRRCHR)
> > - movl %edi, %ecx
> > +# define VEC_SIZE 32
> > +# define PAGE_SIZE 4096
> > + .section .text.evex, "ax", @progbits
> > +ENTRY(STRRCHR)
> > + movl %edi, %eax
> > /* Broadcast CHAR to YMMMATCH. */
> > VPBROADCAST %esi, %YMMMATCH
> >
> > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO
> > -
> > - /* Check if we may cross page boundary with one vector load. */
> > - andl $(2 * VEC_SIZE - 1), %ecx
> > - cmpl $VEC_SIZE, %ecx
> > - ja L(cros_page_boundary)
> > + andl $(PAGE_SIZE - 1), %eax
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > + jg L(cross_page_boundary)
> >
> > +L(page_cross_continue):
> > VMOVU (%rdi), %YMM1
> > -
> > - /* Each bit in K0 represents a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM1, %k0
> > - /* Each bit in K1 represents a CHAR in YMM1. */
> > - VPCMP $0, %YMMMATCH, %YMM1, %k1
> > + /* k0 has a 1 for each zero CHAR in YMM1. */
> > + VPTESTN %YMM1, %YMM1, %k0
> > kmovd %k0, %ecx
> > - kmovd %k1, %eax
> > -
> > - addq $VEC_SIZE, %rdi
> > -
> > - testl %eax, %eax
> > - jnz L(first_vec)
> > -
> > testl %ecx, %ecx
> > - jnz L(return_null)
> > -
> > - andq $-VEC_SIZE, %rdi
> > - xorl %edx, %edx
> > - jmp L(aligned_loop)
> > -
> > - .p2align 4
> > -L(first_vec):
> > - /* Check if there is a null byte. */
> > - testl %ecx, %ecx
> > - jnz L(char_and_nul_in_first_vec)
> > -
> > - /* Remember the match and keep searching. */
> > - movl %eax, %edx
> > - movq %rdi, %rsi
> > - andq $-VEC_SIZE, %rdi
> > - jmp L(aligned_loop)
> > -
> > - .p2align 4
> > -L(cros_page_boundary):
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > + jz L(aligned_more)
> > + /* fallthrough: zero CHAR in first VEC. */
> >
> > + /* K1 has a 1 for each search CHAR match in YMM1. */
> > + VPCMP $0, %YMMMATCH, %YMM1, %k1
> > + kmovd %k1, %eax
> > + /* Build mask up until first zero CHAR (used to mask off
> > + potential search CHAR matches past the end of the string).
> > + */
> > + blsmskl %ecx, %ecx
> > + andl %ecx, %eax
> > + jz L(ret0)
> > + /* Get last match (the `andl` removed any out of bounds
> > + matches). */
> > + bsrl %eax, %eax
> > # ifdef USE_AS_WCSRCHR
> > - /* NB: Divide shift count by 4 since each bit in K1 represent 4
> > - bytes. */
> > - movl %ecx, %SHIFT_REG
> > - sarl $2, %SHIFT_REG
> > + leaq (%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > + addq %rdi, %rax
> > # endif
> > +L(ret0):
> > + ret
> >
> > - VMOVA (%rdi), %YMM1
> > -
> > - /* Each bit in K0 represents a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM1, %k0
> > - /* Each bit in K1 represents a CHAR in YMM1. */
> > + /* Returns for first vec x1/x2/x3 have hard coded backward
> > + search path for earlier matches. */
> > + .p2align 4,, 6
> > +L(first_vec_x1):
> > + VPCMP $0, %YMMMATCH, %YMM2, %k1
> > + kmovd %k1, %eax
> > + blsmskl %ecx, %ecx
> > + /* eax non-zero if search CHAR in range. */
> > + andl %ecx, %eax
> > + jnz L(first_vec_x1_return)
> > +
> > + /* fallthrough: no match in YMM2 then need to check for earlier
> > + matches (in YMM1). */
> > + .p2align 4,, 4
> > +L(first_vec_x0_test):
> > VPCMP $0, %YMMMATCH, %YMM1, %k1
> > - kmovd %k0, %edx
> > kmovd %k1, %eax
> > -
> > - shrxl %SHIFT_REG, %edx, %edx
> > - shrxl %SHIFT_REG, %eax, %eax
> > - addq $VEC_SIZE, %rdi
> > -
> > - /* Check if there is a CHAR. */
> > testl %eax, %eax
> > - jnz L(found_char)
> > -
> > - testl %edx, %edx
> > - jnz L(return_null)
> > -
> > - jmp L(aligned_loop)
> > -
> > - .p2align 4
> > -L(found_char):
> > - testl %edx, %edx
> > - jnz L(char_and_nul)
> > -
> > - /* Remember the match and keep searching. */
> > - movl %eax, %edx
> > - leaq (%rdi, %rcx), %rsi
> > + jz L(ret1)
> > + bsrl %eax, %eax
> > +# ifdef USE_AS_WCSRCHR
> > + leaq (%rsi, %rax, CHAR_SIZE), %rax
> > +# else
> > + addq %rsi, %rax
> > +# endif
> > +L(ret1):
> > + ret
> >
> > - .p2align 4
> > -L(aligned_loop):
> > - VMOVA (%rdi), %YMM1
> > - addq $VEC_SIZE, %rdi
> > + .p2align 4,, 10
> > +L(first_vec_x1_or_x2):
> > + VPCMP $0, %YMM3, %YMMMATCH, %k3
> > + VPCMP $0, %YMM2, %YMMMATCH, %k2
> > + /* K2 and K3 have 1 for any search CHAR match. Test if any
> > + matches between either of them. Otherwise check YMM1. */
> > + kortestd %k2, %k3
> > + jz L(first_vec_x0_test)
> > +
> > + /* Guaranteed that YMM2 and YMM3 are within range so merge the
> > + two bitmasks then get last result. */
> > + kunpck %k2, %k3, %k3
> > + kmovq %k3, %rax
> > + bsrq %rax, %rax
> > + leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > - /* Each bit in K0 represents a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM1, %k0
> > - /* Each bit in K1 represents a CHAR in YMM1. */
> > - VPCMP $0, %YMMMATCH, %YMM1, %k1
> > - kmovd %k0, %ecx
> > + .p2align 4,, 6
> > +L(first_vec_x3):
> > + VPCMP $0, %YMMMATCH, %YMM4, %k1
> > kmovd %k1, %eax
> > - orl %eax, %ecx
> > - jnz L(char_nor_null)
> > + blsmskl %ecx, %ecx
> > + /* If no search CHAR match in range check YMM1/YMM2/YMM3. */
> > + andl %ecx, %eax
> > + jz L(first_vec_x1_or_x2)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > - VMOVA (%rdi), %YMM1
> > - add $VEC_SIZE, %rdi
> > + .p2align 4,, 6
> > +L(first_vec_x0_x1_test):
> > + VPCMP $0, %YMMMATCH, %YMM2, %k1
> > + kmovd %k1, %eax
> > + /* Check YMM2 for last match first. If no match try YMM1. */
> > + testl %eax, %eax
> > + jz L(first_vec_x0_test)
> > + .p2align 4,, 4
> > +L(first_vec_x1_return):
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > - /* Each bit in K0 represents a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM1, %k0
> > - /* Each bit in K1 represents a CHAR in YMM1. */
> > - VPCMP $0, %YMMMATCH, %YMM1, %k1
> > - kmovd %k0, %ecx
> > + .p2align 4,, 10
> > +L(first_vec_x2):
> > + VPCMP $0, %YMMMATCH, %YMM3, %k1
> > kmovd %k1, %eax
> > - orl %eax, %ecx
> > - jnz L(char_nor_null)
> > + blsmskl %ecx, %ecx
> > + /* Check YMM3 for last match first. If no match try YMM2/YMM1.
> > + */
> > + andl %ecx, %eax
> > + jz L(first_vec_x0_x1_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > - VMOVA (%rdi), %YMM1
> > - addq $VEC_SIZE, %rdi
> >
> > - /* Each bit in K0 represents a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM1, %k0
> > - /* Each bit in K1 represents a CHAR in YMM1. */
> > - VPCMP $0, %YMMMATCH, %YMM1, %k1
> > + .p2align 4
> > +L(aligned_more):
> > + /* Need to keep original pointer in case YMM1 has last match. */
> > + movq %rdi, %rsi
> > + andq $-VEC_SIZE, %rdi
> > + VMOVU VEC_SIZE(%rdi), %YMM2
> > + VPTESTN %YMM2, %YMM2, %k0
> > kmovd %k0, %ecx
> > - kmovd %k1, %eax
> > - orl %eax, %ecx
> > - jnz L(char_nor_null)
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x1)
> >
> > - VMOVA (%rdi), %YMM1
> > - addq $VEC_SIZE, %rdi
> > + VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
> > + VPTESTN %YMM3, %YMM3, %k0
> > + kmovd %k0, %ecx
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x2)
> >
> > - /* Each bit in K0 represents a null byte in YMM1. */
> > - VPCMP $0, %YMMZERO, %YMM1, %k0
> > - /* Each bit in K1 represents a CHAR in YMM1. */
> > - VPCMP $0, %YMMMATCH, %YMM1, %k1
> > + VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
> > + VPTESTN %YMM4, %YMM4, %k0
> > kmovd %k0, %ecx
> > - kmovd %k1, %eax
> > - orl %eax, %ecx
> > - jz L(aligned_loop)
> > + movq %rdi, %r8
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x3)
> >
> > + andq $-(VEC_SIZE * 2), %rdi
> > .p2align 4
> > -L(char_nor_null):
> > - /* Find a CHAR or a null byte in a loop. */
> > +L(first_aligned_loop):
> > + /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
> > + they don't store a match. */
> > + VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
> > + VMOVA (VEC_SIZE * 5)(%rdi), %YMM6
> > +
> > + VPCMP $0, %YMM5, %YMMMATCH, %k2
> > + vpxord %YMM6, %YMMMATCH, %YMM7
> > +
> > + VPMIN %YMM5, %YMM6, %YMM8
> > + VPMIN %YMM8, %YMM7, %YMM7
> > +
> > + VPTESTN %YMM7, %YMM7, %k1
> > + subq $(VEC_SIZE * -2), %rdi
> > + kortestd %k1, %k2
> > + jz L(first_aligned_loop)
> > +
> > + VPCMP $0, %YMM6, %YMMMATCH, %k3
> > + VPTESTN %YMM8, %YMM8, %k1
> > + ktestd %k1, %k1
> > + jz L(second_aligned_loop_prep)
> > +
> > + kortestd %k2, %k3
> > + jnz L(return_first_aligned_loop)
> > +
> > + .p2align 4,, 6
> > +L(first_vec_x1_or_x2_or_x3):
> > + VPCMP $0, %YMM4, %YMMMATCH, %k4
> > + kmovd %k4, %eax
> > testl %eax, %eax
> > - jnz L(match)
> > -L(return_value):
> > - testl %edx, %edx
> > - jz L(return_null)
> > - movl %edx, %eax
> > - movq %rsi, %rdi
> > + jz L(first_vec_x1_or_x2)
> > bsrl %eax, %eax
> > -# ifdef USE_AS_WCSRCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - leaq -VEC_SIZE(%rdi, %rax, 4), %rax
> > -# else
> > - leaq -VEC_SIZE(%rdi, %rax), %rax
> > -# endif
> > + leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
> > ret
> >
> > - .p2align 4
> > -L(match):
> > - /* Find a CHAR. Check if there is a null byte. */
> > - kmovd %k0, %ecx
> > - testl %ecx, %ecx
> > - jnz L(find_nul)
> > + .p2align 4,, 8
> > +L(return_first_aligned_loop):
> > + VPTESTN %YMM5, %YMM5, %k0
> > + kunpck %k0, %k1, %k0
> > + kmov_2x %k0, %maskz_2x
> > +
> > + blsmsk %maskz_2x, %maskz_2x
> > + kunpck %k2, %k3, %k3
> > + kmov_2x %k3, %maskm_2x
> > + and %maskz_2x, %maskm_2x
> > + jz L(first_vec_x1_or_x2_or_x3)
> >
> > - /* Remember the match and keep searching. */
> > - movl %eax, %edx
> > + bsr %maskm_2x, %maskm_2x
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> > +
> > + .p2align 4
> > + /* We can throw away the work done for the first 4x checks here
> > + as we have a later match. This is the 'fast' path per se.
> > + */
> > +L(second_aligned_loop_prep):
> > +L(second_aligned_loop_set_furthest_match):
> > movq %rdi, %rsi
> > - jmp L(aligned_loop)
> > + kunpck %k2, %k3, %k4
> >
> > .p2align 4
> > -L(find_nul):
> > - /* Mask out any matching bits after the null byte. */
> > - movl %ecx, %r8d
> > - subl $1, %r8d
> > - xorl %ecx, %r8d
> > - andl %r8d, %eax
> > - testl %eax, %eax
> > - /* If there is no CHAR here, return the remembered one. */
> > - jz L(return_value)
> > - bsrl %eax, %eax
> > +L(second_aligned_loop):
> > + VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
> > + VMOVU (VEC_SIZE * 5)(%rdi), %YMM2
> > +
> > + VPCMP $0, %YMM1, %YMMMATCH, %k2
> > + vpxord %YMM2, %YMMMATCH, %YMM3
> > +
> > + VPMIN %YMM1, %YMM2, %YMM4
> > + VPMIN %YMM3, %YMM4, %YMM3
> > +
> > + VPTESTN %YMM3, %YMM3, %k1
> > + subq $(VEC_SIZE * -2), %rdi
> > + kortestd %k1, %k2
> > + jz L(second_aligned_loop)
> > +
> > + VPCMP $0, %YMM2, %YMMMATCH, %k3
> > + VPTESTN %YMM4, %YMM4, %k1
> > + ktestd %k1, %k1
> > + jz L(second_aligned_loop_set_furthest_match)
> > +
> > + kortestd %k2, %k3
> > + /* branch here because there is a significant advantage in terms
> > + of output dependency chain in using edx. */
> > + jnz L(return_new_match)
> > +L(return_old_match):
> > + kmovq %k4, %rax
> > + bsrq %rax, %rax
> > + leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
> > + ret
> > +
> > +L(return_new_match):
> > + VPTESTN %YMM1, %YMM1, %k0
> > + kunpck %k0, %k1, %k0
> > + kmov_2x %k0, %maskz_2x
> > +
> > + blsmsk %maskz_2x, %maskz_2x
> > + kunpck %k2, %k3, %k3
> > + kmov_2x %k3, %maskm_2x
> > + and %maskz_2x, %maskm_2x
> > + jz L(return_old_match)
> > +
> > + bsr %maskm_2x, %maskm_2x
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> > +
> > +L(cross_page_boundary):
> > + /* eax contains all the page offset bits of src (rdi). `xor rdi,
> > + rax` sets pointer with all page offset bits cleared so
> > + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> > + before page cross (guaranteed to be safe to read). Doing this
> > + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> > + a bit of code size. */
> > + xorq %rdi, %rax
> > + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
> > + VPTESTN %YMM1, %YMM1, %k0
> > + kmovd %k0, %ecx
> > +
> > + /* Shift out zero CHAR matches that are before the beginning of
> > + src (rdi). */
> > # ifdef USE_AS_WCSRCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - leaq -VEC_SIZE(%rdi, %rax, 4), %rax
> > -# else
> > - leaq -VEC_SIZE(%rdi, %rax), %rax
> > + movl %edi, %esi
> > + andl $(VEC_SIZE - 1), %esi
> > + shrl $2, %esi
> > # endif
> > - ret
> > + shrxl %SHIFT_REG, %ecx, %ecx
> >
> > - .p2align 4
> > -L(char_and_nul):
> > - /* Find both a CHAR and a null byte. */
> > - addq %rcx, %rdi
> > - movl %edx, %ecx
> > -L(char_and_nul_in_first_vec):
> > - /* Mask out any matching bits after the null byte. */
> > - movl %ecx, %r8d
> > - subl $1, %r8d
> > - xorl %ecx, %r8d
> > - andl %r8d, %eax
> > - testl %eax, %eax
> > - /* Return null pointer if the null byte comes first. */
> > - jz L(return_null)
> > + testl %ecx, %ecx
> > + jz L(page_cross_continue)
> > +
> > + /* Found zero CHAR so need to test for search CHAR. */
> > + VPCMP $0, %YMMMATCH, %YMM1, %k1
> > + kmovd %k1, %eax
> > + /* Shift out search CHAR matches that are before the beginning of
> > + src (rdi). */
> > + shrxl %SHIFT_REG, %eax, %eax
> > +
> > + /* Check if any search CHAR match in range. */
> > + blsmskl %ecx, %ecx
> > + andl %ecx, %eax
> > + jz L(ret3)
> > bsrl %eax, %eax
> > # ifdef USE_AS_WCSRCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - leaq -VEC_SIZE(%rdi, %rax, 4), %rax
> > + leaq (%rdi, %rax, CHAR_SIZE), %rax
> > # else
> > - leaq -VEC_SIZE(%rdi, %rax), %rax
> > + addq %rdi, %rax
> > # endif
> > +L(ret3):
> > ret
> >
> > - .p2align 4
> > -L(return_null):
> > - xorl %eax, %eax
> > - ret
> > -
> > -END (STRRCHR)
> > +END(STRRCHR)
> > #endif
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2
2022-05-12 20:14 ` Sunil Pandey
@ 2022-07-20 15:33 ` Noah Goldstein
0 siblings, 0 replies; 36+ messages in thread
From: Noah Goldstein @ 2022-07-20 15:33 UTC (permalink / raw)
To: Sunil Pandey; +Cc: H.J. Lu, Libc-stable Mailing List, GNU C Library
On Fri, May 13, 2022 at 4:15 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Fri, Apr 22, 2022 at 12:08 PM H.J. Lu via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > On Thu, Apr 21, 2022 at 6:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > The new code unrolls the main loop slightly without adding too much
> > > overhead and minimizes the comparisons for the search CHAR.
> > >
> > > Geometric Mean of all benchmarks New / Old: 0.832
> > > See email for all results.
> > >
> > > Full xcheck passes on x86_64 with and without multiarch enabled.
> > > ---
> > > sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
> > > 1 file changed, 269 insertions(+), 157 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
> > > index 1df2adfad0..bd26ba80d5 100644
> > > --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
> > > +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
> > > @@ -27,9 +27,13 @@
> > > # ifdef USE_AS_WCSRCHR
> > > # define VPBROADCAST vpbroadcastd
> > > # define VPCMPEQ vpcmpeqd
> > > +# define VPMIN vpminud
> > > +# define CHAR_SIZE 4
> > > # else
> > > # define VPBROADCAST vpbroadcastb
> > > # define VPCMPEQ vpcmpeqb
> > > +# define VPMIN vpminub
> > > +# define CHAR_SIZE 1
> > > # endif
> > >
> > > # ifndef VZEROUPPER
> > > @@ -41,196 +45,304 @@
> > > # endif
> > >
> > > # define VEC_SIZE 32
> > > +# define PAGE_SIZE 4096
> > >
> > > - .section SECTION(.text),"ax",@progbits
> > > -ENTRY (STRRCHR)
> > > - movd %esi, %xmm4
> > > - movl %edi, %ecx
> > > + .section SECTION(.text), "ax", @progbits
> > > +ENTRY(STRRCHR)
> > > + movd %esi, %xmm7
> > > + movl %edi, %eax
> > > /* Broadcast CHAR to YMM4. */
> > > - VPBROADCAST %xmm4, %ymm4
> > > + VPBROADCAST %xmm7, %ymm7
> > > vpxor %xmm0, %xmm0, %xmm0
> > >
> > > - /* Check if we may cross page boundary with one vector load. */
> > > - andl $(2 * VEC_SIZE - 1), %ecx
> > > - cmpl $VEC_SIZE, %ecx
> > > - ja L(cros_page_boundary)
> > > + /* Shift here instead of `andl` to save code size (saves a fetch
> > > + block). */
> > > + sall $20, %eax
> > > + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> > > + ja L(cross_page)
> > >
> > > +L(page_cross_continue):
> > > vmovdqu (%rdi), %ymm1
> > > - VPCMPEQ %ymm1, %ymm0, %ymm2
> > > - VPCMPEQ %ymm1, %ymm4, %ymm3
> > > - vpmovmskb %ymm2, %ecx
> > > - vpmovmskb %ymm3, %eax
> > > - addq $VEC_SIZE, %rdi
> > > + /* Check end of string match. */
> > > + VPCMPEQ %ymm1, %ymm0, %ymm6
> > > + vpmovmskb %ymm6, %ecx
> > > + testl %ecx, %ecx
> > > + jz L(aligned_more)
> > > +
> > > + /* Only check match with search CHAR if needed. */
> > > + VPCMPEQ %ymm1, %ymm7, %ymm1
> > > + vpmovmskb %ymm1, %eax
> > > + /* Check if match before first zero. */
> > > + blsmskl %ecx, %ecx
> > > + andl %ecx, %eax
> > > + jz L(ret0)
> > > + bsrl %eax, %eax
> > > + addq %rdi, %rax
> > > + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > > + search CHAR is zero we are correct. Either way `andq
> > > + -CHAR_SIZE, %rax` gets the correct result. */
> > > +# ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +# endif
> > > +L(ret0):
> > > +L(return_vzeroupper):
> > > + ZERO_UPPER_VEC_REGISTERS_RETURN
> > > +
> > > + /* Returns for first vec x1/x2 have hard coded backward search
> > > + path for earlier matches. */
> > > + .p2align 4,, 10
> > > +L(first_vec_x1):
> > > + VPCMPEQ %ymm2, %ymm7, %ymm6
> > > + vpmovmskb %ymm6, %eax
> > > + blsmskl %ecx, %ecx
> > > + andl %ecx, %eax
> > > + jnz L(first_vec_x1_return)
> > > +
> > > + .p2align 4,, 4
> > > +L(first_vec_x0_test):
> > > + VPCMPEQ %ymm1, %ymm7, %ymm6
> > > + vpmovmskb %ymm6, %eax
> > > + testl %eax, %eax
> > > + jz L(ret1)
> > > + bsrl %eax, %eax
> > > + addq %r8, %rax
> > > +# ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +# endif
> > > +L(ret1):
> > > + VZEROUPPER_RETURN
> > >
> > > + .p2align 4,, 10
> > > +L(first_vec_x0_x1_test):
> > > + VPCMPEQ %ymm2, %ymm7, %ymm6
> > > + vpmovmskb %ymm6, %eax
> > > + /* Check ymm2 for search CHAR match. If no match then check ymm1
> > > + before returning. */
> > > testl %eax, %eax
> > > - jnz L(first_vec)
> > > + jz L(first_vec_x0_test)
> > > + .p2align 4,, 4
> > > +L(first_vec_x1_return):
> > > + bsrl %eax, %eax
> > > + leaq 1(%rdi, %rax), %rax
> > > +# ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +# endif
> > > + VZEROUPPER_RETURN
> > >
> > > - testl %ecx, %ecx
> > > - jnz L(return_null)
> > >
> > > - andq $-VEC_SIZE, %rdi
> > > - xorl %edx, %edx
> > > - jmp L(aligned_loop)
> > > + .p2align 4,, 10
> > > +L(first_vec_x2):
> > > + VPCMPEQ %ymm3, %ymm7, %ymm6
> > > + vpmovmskb %ymm6, %eax
> > > + blsmskl %ecx, %ecx
> > > + /* If no in-range search CHAR match in ymm3 then need to check
> > > + ymm1/ymm2 for an earlier match (we delay checking search
> > > + CHAR matches until needed). */
> > > + andl %ecx, %eax
> > > + jz L(first_vec_x0_x1_test)
> > > + bsrl %eax, %eax
> > > + leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
> > > +# ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +# endif
> > > + VZEROUPPER_RETURN
> > > +
> > >
> > > .p2align 4
> > > -L(first_vec):
> > > - /* Check if there is a nul CHAR. */
> > > +L(aligned_more):
> > > + /* Save original pointer if match was in VEC 0. */
> > > + movq %rdi, %r8
> > > +
> > > + /* Align src. */
> > > + orq $(VEC_SIZE - 1), %rdi
> > > + vmovdqu 1(%rdi), %ymm2
> > > + VPCMPEQ %ymm2, %ymm0, %ymm6
> > > + vpmovmskb %ymm6, %ecx
> > > testl %ecx, %ecx
> > > - jnz L(char_and_nul_in_first_vec)
> > > + jnz L(first_vec_x1)
> > >
> > > - /* Remember the match and keep searching. */
> > > - movl %eax, %edx
> > > - movq %rdi, %rsi
> > > - andq $-VEC_SIZE, %rdi
> > > - jmp L(aligned_loop)
> > > + vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
> > > + VPCMPEQ %ymm3, %ymm0, %ymm6
> > > + vpmovmskb %ymm6, %ecx
> > > + testl %ecx, %ecx
> > > + jnz L(first_vec_x2)
> > >
> > > + /* Save pointer again before realigning. */
> > > + movq %rdi, %rsi
> > > + addq $(VEC_SIZE + 1), %rdi
> > > + andq $-(VEC_SIZE * 2), %rdi
> > > .p2align 4
> > > -L(cros_page_boundary):
> > > - andl $(VEC_SIZE - 1), %ecx
> > > - andq $-VEC_SIZE, %rdi
> > > - vmovdqa (%rdi), %ymm1
> > > - VPCMPEQ %ymm1, %ymm0, %ymm2
> > > - VPCMPEQ %ymm1, %ymm4, %ymm3
> > > - vpmovmskb %ymm2, %edx
> > > - vpmovmskb %ymm3, %eax
> > > - shrl %cl, %edx
> > > - shrl %cl, %eax
> > > - addq $VEC_SIZE, %rdi
> > > -
> > > - /* Check if there is a CHAR. */
> > > +L(first_aligned_loop):
> > > + /* Do 2x VEC at a time. Any more and the cost of finding the
> > > + match outweighs loop benefit. */
> > > + vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
> > > + vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
> > > +
> > > + VPCMPEQ %ymm4, %ymm7, %ymm6
> > > + VPMIN %ymm4, %ymm5, %ymm8
> > > + VPCMPEQ %ymm5, %ymm7, %ymm10
> > > + vpor %ymm6, %ymm10, %ymm5
> > > + VPCMPEQ %ymm8, %ymm0, %ymm8
> > > + vpor %ymm5, %ymm8, %ymm9
> > > +
> > > + vpmovmskb %ymm9, %eax
> > > + addq $(VEC_SIZE * 2), %rdi
> > > + /* No zero or search CHAR. */
> > > testl %eax, %eax
> > > - jnz L(found_char)
> > > -
> > > - testl %edx, %edx
> > > - jnz L(return_null)
> > > + jz L(first_aligned_loop)
> > >
> > > - jmp L(aligned_loop)
> > > -
> > > - .p2align 4
> > > -L(found_char):
> > > - testl %edx, %edx
> > > - jnz L(char_and_nul)
> > > + /* If no zero CHAR then go to second loop (this allows us to
> > > + throw away all prior work). */
> > > + vpmovmskb %ymm8, %ecx
> > > + testl %ecx, %ecx
> > > + jz L(second_aligned_loop_prep)
> > >
> > > - /* Remember the match and keep searching. */
> > > - movl %eax, %edx
> > > - leaq (%rdi, %rcx), %rsi
> > > + /* Search char could be zero so we need to get the true match.
> > > + */
> > > + vpmovmskb %ymm5, %eax
> > > + testl %eax, %eax
> > > + jnz L(first_aligned_loop_return)
> > >
> > > - .p2align 4
> > > -L(aligned_loop):
> > > - vmovdqa (%rdi), %ymm1
> > > - VPCMPEQ %ymm1, %ymm0, %ymm2
> > > - addq $VEC_SIZE, %rdi
> > > - VPCMPEQ %ymm1, %ymm4, %ymm3
> > > - vpmovmskb %ymm2, %ecx
> > > - vpmovmskb %ymm3, %eax
> > > - orl %eax, %ecx
> > > - jnz L(char_nor_null)
> > > -
> > > - vmovdqa (%rdi), %ymm1
> > > - VPCMPEQ %ymm1, %ymm0, %ymm2
> > > - add $VEC_SIZE, %rdi
> > > - VPCMPEQ %ymm1, %ymm4, %ymm3
> > > - vpmovmskb %ymm2, %ecx
> > > + .p2align 4,, 4
> > > +L(first_vec_x1_or_x2):
> > > + VPCMPEQ %ymm3, %ymm7, %ymm3
> > > + VPCMPEQ %ymm2, %ymm7, %ymm2
> > > vpmovmskb %ymm3, %eax
> > > - orl %eax, %ecx
> > > - jnz L(char_nor_null)
> > > -
> > > - vmovdqa (%rdi), %ymm1
> > > - VPCMPEQ %ymm1, %ymm0, %ymm2
> > > - addq $VEC_SIZE, %rdi
> > > - VPCMPEQ %ymm1, %ymm4, %ymm3
> > > - vpmovmskb %ymm2, %ecx
> > > - vpmovmskb %ymm3, %eax
> > > - orl %eax, %ecx
> > > - jnz L(char_nor_null)
> > > -
> > > - vmovdqa (%rdi), %ymm1
> > > - VPCMPEQ %ymm1, %ymm0, %ymm2
> > > - addq $VEC_SIZE, %rdi
> > > - VPCMPEQ %ymm1, %ymm4, %ymm3
> > > - vpmovmskb %ymm2, %ecx
> > > - vpmovmskb %ymm3, %eax
> > > - orl %eax, %ecx
> > > - jz L(aligned_loop)
> > > -
> > > - .p2align 4
> > > -L(char_nor_null):
> > > - /* Find a CHAR or a nul CHAR in a loop. */
> > > - testl %eax, %eax
> > > - jnz L(match)
> > > -L(return_value):
> > > - testl %edx, %edx
> > > - jz L(return_null)
> > > - movl %edx, %eax
> > > - movq %rsi, %rdi
> > > + vpmovmskb %ymm2, %edx
> > > + /* Use add for macro-fusion. */
> > > + addq %rax, %rdx
> > > + jz L(first_vec_x0_test)
> > > + /* NB: We could move this shift to before the branch and save a
> > > + bit of code size / performance on the fall through. The
> > > + branch leads to the null case which generally seems hotter
> > > + than char in first 3x VEC. */
> > > + salq $32, %rax
> > > + addq %rdx, %rax
> > > + bsrq %rax, %rax
> > > + leaq 1(%rsi, %rax), %rax
> > > +# ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +# endif
> > > + VZEROUPPER_RETURN
> > >
> > > + .p2align 4,, 8
> > > +L(first_aligned_loop_return):
> > > + VPCMPEQ %ymm4, %ymm0, %ymm4
> > > + vpmovmskb %ymm4, %edx
> > > + salq $32, %rcx
> > > + orq %rdx, %rcx
> > > +
> > > + vpmovmskb %ymm10, %eax
> > > + vpmovmskb %ymm6, %edx
> > > + salq $32, %rax
> > > + orq %rdx, %rax
> > > + blsmskq %rcx, %rcx
> > > + andq %rcx, %rax
> > > + jz L(first_vec_x1_or_x2)
> > > +
> > > + bsrq %rax, %rax
> > > + leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
> > > # ifdef USE_AS_WCSRCHR
> > > - /* Keep the first bit for each matching CHAR for bsr. */
> > > - andl $0x11111111, %eax
> > > + andq $-CHAR_SIZE, %rax
> > > # endif
> > > - bsrl %eax, %eax
> > > - leaq -VEC_SIZE(%rdi, %rax), %rax
> > > -L(return_vzeroupper):
> > > - ZERO_UPPER_VEC_REGISTERS_RETURN
> > > + VZEROUPPER_RETURN
> > >
> > > + /* Search char cannot be zero. */
> > > .p2align 4
> > > -L(match):
> > > - /* Find a CHAR. Check if there is a nul CHAR. */
> > > - vpmovmskb %ymm2, %ecx
> > > - testl %ecx, %ecx
> > > - jnz L(find_nul)
> > > -
> > > - /* Remember the match and keep searching. */
> > > - movl %eax, %edx
> > > +L(second_aligned_loop_set_furthest_match):
> > > + /* Save VEC and pointer from most recent match. */
> > > +L(second_aligned_loop_prep):
> > > movq %rdi, %rsi
> > > - jmp L(aligned_loop)
> > > + vmovdqu %ymm6, %ymm2
> > > + vmovdqu %ymm10, %ymm3
> > >
> > > .p2align 4
> > > -L(find_nul):
> > > -# ifdef USE_AS_WCSRCHR
> > > - /* Keep the first bit for each matching CHAR for bsr. */
> > > - andl $0x11111111, %ecx
> > > - andl $0x11111111, %eax
> > > -# endif
> > > - /* Mask out any matching bits after the nul CHAR. */
> > > - movl %ecx, %r8d
> > > - subl $1, %r8d
> > > - xorl %ecx, %r8d
> > > - andl %r8d, %eax
> > > +L(second_aligned_loop):
> > > + /* Search 2x at a time. */
> > > + vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
> > > + vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
> > > +
> > > + VPCMPEQ %ymm4, %ymm7, %ymm6
> > > + VPMIN %ymm4, %ymm5, %ymm1
> > > + VPCMPEQ %ymm5, %ymm7, %ymm10
> > > + vpor %ymm6, %ymm10, %ymm5
> > > + VPCMPEQ %ymm1, %ymm0, %ymm1
> > > + vpor %ymm5, %ymm1, %ymm9
> > > +
> > > + vpmovmskb %ymm9, %eax
> > > + addq $(VEC_SIZE * 2), %rdi
> > > testl %eax, %eax
> > > - /* If there is no CHAR here, return the remembered one. */
> > > - jz L(return_value)
> > > - bsrl %eax, %eax
> > > - leaq -VEC_SIZE(%rdi, %rax), %rax
> > > - VZEROUPPER_RETURN
> > > -
> > > - .p2align 4
> > > -L(char_and_nul):
> > > - /* Find both a CHAR and a nul CHAR. */
> > > - addq %rcx, %rdi
> > > - movl %edx, %ecx
> > > -L(char_and_nul_in_first_vec):
> > > -# ifdef USE_AS_WCSRCHR
> > > - /* Keep the first bit for each matching CHAR for bsr. */
> > > - andl $0x11111111, %ecx
> > > - andl $0x11111111, %eax
> > > -# endif
> > > - /* Mask out any matching bits after the nul CHAR. */
> > > - movl %ecx, %r8d
> > > - subl $1, %r8d
> > > - xorl %ecx, %r8d
> > > - andl %r8d, %eax
> > > + jz L(second_aligned_loop)
> > > + vpmovmskb %ymm1, %ecx
> > > + testl %ecx, %ecx
> > > + jz L(second_aligned_loop_set_furthest_match)
> > > + vpmovmskb %ymm5, %eax
> > > testl %eax, %eax
> > > - /* Return null pointer if the nul CHAR comes first. */
> > > - jz L(return_null)
> > > - bsrl %eax, %eax
> > > - leaq -VEC_SIZE(%rdi, %rax), %rax
> > > + jnz L(return_new_match)
> > > +
> > > + /* This is the hot path. We know CHAR is inbounds and that
> > > + ymm3/ymm2 have latest match. */
> > > + .p2align 4,, 4
> > > +L(return_old_match):
> > > + vpmovmskb %ymm3, %eax
> > > + vpmovmskb %ymm2, %edx
> > > + salq $32, %rax
> > > + orq %rdx, %rax
> > > + bsrq %rax, %rax
> > > + /* Search char cannot be zero so safe to just use lea for
> > > + wcsrchr. */
> > > + leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
> > > VZEROUPPER_RETURN
> > >
> > > - .p2align 4
> > > -L(return_null):
> > > - xorl %eax, %eax
> > > + /* Last iteration also potentially has a match. */
> > > + .p2align 4,, 8
> > > +L(return_new_match):
> > > + VPCMPEQ %ymm4, %ymm0, %ymm4
> > > + vpmovmskb %ymm4, %edx
> > > + salq $32, %rcx
> > > + orq %rdx, %rcx
> > > +
> > > + vpmovmskb %ymm10, %eax
> > > + vpmovmskb %ymm6, %edx
> > > + salq $32, %rax
> > > + orq %rdx, %rax
> > > + blsmskq %rcx, %rcx
> > > + andq %rcx, %rax
> > > + jz L(return_old_match)
> > > + bsrq %rax, %rax
> > > + /* Search char cannot be zero so safe to just use lea for
> > > + wcsrchr. */
> > > + leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
> > > VZEROUPPER_RETURN
> > >
> > > -END (STRRCHR)
> > > + .p2align 4,, 4
> > > +L(cross_page):
> > > + movq %rdi, %rsi
> > > + andq $-VEC_SIZE, %rsi
> > > + vmovdqu (%rsi), %ymm1
> > > + VPCMPEQ %ymm1, %ymm0, %ymm6
> > > + vpmovmskb %ymm6, %ecx
> > > + /* Shift out zero CHAR matches that are before the beginning of
> > > + src (rdi). */
> > > + shrxl %edi, %ecx, %ecx
> > > + testl %ecx, %ecx
> > > + jz L(page_cross_continue)
> > > + VPCMPEQ %ymm1, %ymm7, %ymm1
> > > + vpmovmskb %ymm1, %eax
> > > +
> > > + /* Shift out search CHAR matches that are before the beginning of
> > > + src (rdi). */
> > > + shrxl %edi, %eax, %eax
> > > + blsmskl %ecx, %ecx
> > > + /* Check if any search CHAR match in range. */
> > > + andl %ecx, %eax
> > > + jz L(ret2)
> > > + bsrl %eax, %eax
> > > + addq %rdi, %rax
> > > +# ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +# endif
> > > +L(ret2):
> > > + VZEROUPPER_RETURN
> > > +END(STRRCHR)
> > > #endif
> > > --
> > > 2.25.1
> > >
> >
> > LGTM.
> >
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> >
> > Thanks.
> >
> > --
> > H.J.
>
> I would like to backport this patch to release branches.
> Any comments or objections?
Sorry, should have mentioned earlier but we should probably
get the strrchr-avx2.S changes from:
https://sourceware.org/git/?p=glibc.git;a=commit;h=3079f652d7cc34456aefb412677c01e758922527
>
> --Sunil
^ permalink raw reply [flat|nested] 36+ messages in thread
end of thread, other threads:[~2022-07-20 15:33 UTC | newest]
Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-21 3:14 [PATCH v1 1/5] benchtests: Improve bench-strrchr Noah Goldstein
2022-04-21 3:14 ` [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
2022-04-21 20:26 ` H.J. Lu
2022-04-21 20:57 ` Noah Goldstein
2022-04-21 21:48 ` H.J. Lu
2022-04-21 22:23 ` Noah Goldstein
2022-04-21 3:14 ` [PATCH v1 3/5] x86: Add wcsrchr optimized with SSE4_1 in wcsrchr-sse4_1.S Noah Goldstein
2022-04-21 3:14 ` [PATCH v1 4/5] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
2022-04-21 3:14 ` [PATCH v1 5/5] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
2022-04-21 20:12 ` [PATCH v1 1/5] benchtests: Improve bench-strrchr H.J. Lu
2022-04-21 22:07 ` Noah Goldstein
2022-04-21 23:49 ` H.J. Lu
2022-04-22 1:11 ` Noah Goldstein
2022-04-21 22:22 ` [PATCH v2 1/4] " Noah Goldstein
2022-04-21 22:22 ` [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
2022-04-21 23:46 ` H.J. Lu
2022-04-22 1:54 ` Noah Goldstein
2022-04-21 22:22 ` [PATCH v2 3/4] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
2022-04-21 22:22 ` [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
2022-04-21 23:59 ` H.J. Lu
2022-04-22 1:53 ` Noah Goldstein
2022-04-22 1:52 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr Noah Goldstein
2022-04-22 1:52 ` [PATCH v3 2/4] x86: Optimize {str|wcs}rchr-sse2 Noah Goldstein
2022-04-22 19:06 ` H.J. Lu
2022-05-12 20:13 ` Sunil Pandey
2022-04-22 1:52 ` [PATCH v3 3/4] x86: Optimize {str|wcs}rchr-avx2 Noah Goldstein
2022-04-22 19:03 ` H.J. Lu
2022-05-12 20:14 ` Sunil Pandey
2022-07-20 15:33 ` Noah Goldstein
2022-04-22 1:52 ` [PATCH v3 4/4] x86: Optimize {str|wcs}rchr-evex Noah Goldstein
2022-04-22 19:04 ` H.J. Lu
2022-05-12 20:16 ` Sunil Pandey
2022-04-22 18:29 ` [PATCH v3 1/4] benchtests: Improve bench-strrchr H.J. Lu
2022-04-22 19:12 ` Noah Goldstein
2022-04-22 19:11 ` [PATCH v4 " Noah Goldstein
2022-04-23 1:53 ` H.J. Lu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).