* [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S
@ 2021-02-03 5:38 goldstein.w.n
2021-02-03 5:39 ` [PATCH v4 2/2] x86: Add additional benchmarks and tests for strchr goldstein.w.n
2021-02-08 14:08 ` [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S H.J. Lu
0 siblings, 2 replies; 10+ messages in thread
From: goldstein.w.n @ 2021-02-03 5:38 UTC (permalink / raw)
To: libc-alpha; +Cc: carlos, goldstein.w.n, hjl.tools
From: noah <goldstein.w.n@gmail.com>
No bug. Just seemed the performance could be improved a bit. Observed
and expected behavior are unchanged. Optimized body of main
loop. Updated page cross logic and optimized accordingly. Made a few
minor instruction selection modifications. No regressions in test
suite. Both test-strchrnul and test-strchr passed.
Signed-off-by: noah <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
sysdeps/x86_64/multiarch/strchr.c | 1 +
2 files changed, 118 insertions(+), 118 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index d416558d04..8b9d78b55a 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -27,10 +27,12 @@
# ifdef USE_AS_WCSCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMINU vpminud
# define CHAR_REG esi
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMINU vpminub
# define CHAR_REG sil
# endif
@@ -39,20 +41,26 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section .text.avx,"ax",@progbits
ENTRY (STRCHR)
movl %edi, %ecx
- /* Broadcast CHAR to YMM0. */
+# ifndef USE_AS_STRCHRNUL
+ xorl %edx, %edx
+# endif
+
+ /* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
vpxor %xmm9, %xmm9, %xmm9
VPBROADCAST %xmm0, %ymm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
-
- /* Check the first VEC_SIZE bytes. Search for both CHAR and the
+
+ /* Check if we cross page boundary with one vector load. */
+ andl $(PAGE_SIZE - 1), %ecx
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+ ja L(cross_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
null byte. */
vmovdqu (%rdi), %ymm8
VPCMPEQ %ymm8, %ymm0, %ymm1
@@ -60,50 +68,27 @@ ENTRY (STRCHR)
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x0)
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
-
- jmp L(more_4x_vec)
-
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- vmovdqu (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- /* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
- /* Found CHAR or the null byte. */
+ jz L(more_vecs)
tzcntl %eax, %eax
- addq %rcx, %rax
-# ifdef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
.p2align 4
+L(more_vecs):
+ /* Align data for aligned loads in the loop. */
+ andq $-VEC_SIZE, %rdi
L(aligned_more):
- addq $VEC_SIZE, %rdi
-L(more_4x_vec):
- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vmovdqa (%rdi), %ymm8
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vmovdqa VEC_SIZE(%rdi), %ymm8
+ addq $VEC_SIZE, %rdi
VPCMPEQ %ymm8, %ymm0, %ymm1
VPCMPEQ %ymm8, %ymm9, %ymm2
vpor %ymm1, %ymm2, %ymm1
@@ -125,7 +110,7 @@ L(more_4x_vec):
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x2)
+ jnz L(first_vec_x2)
vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
VPCMPEQ %ymm8, %ymm0, %ymm1
@@ -133,122 +118,136 @@ L(more_4x_vec):
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x3)
-
- addq $(VEC_SIZE * 4), %rdi
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
-
- .p2align 4
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm5
- vmovdqa VEC_SIZE(%rdi), %ymm6
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
-
- VPCMPEQ %ymm5, %ymm0, %ymm1
- VPCMPEQ %ymm6, %ymm0, %ymm2
- VPCMPEQ %ymm7, %ymm0, %ymm3
- VPCMPEQ %ymm8, %ymm0, %ymm4
-
- VPCMPEQ %ymm5, %ymm9, %ymm5
- VPCMPEQ %ymm6, %ymm9, %ymm6
- VPCMPEQ %ymm7, %ymm9, %ymm7
- VPCMPEQ %ymm8, %ymm9, %ymm8
-
- vpor %ymm1, %ymm5, %ymm1
- vpor %ymm2, %ymm6, %ymm2
- vpor %ymm3, %ymm7, %ymm3
- vpor %ymm4, %ymm8, %ymm4
-
- vpor %ymm1, %ymm2, %ymm5
- vpor %ymm3, %ymm4, %ymm6
-
- vpor %ymm5, %ymm6, %ymm5
-
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
+ jz L(prep_loop_4x)
- jmp L(loop_4x_vec)
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
.p2align 4
L(first_vec_x0):
- /* Found CHAR or the null byte. */
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
-
+
.p2align 4
L(first_vec_x1):
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
-# else
- xorl %edx, %edx
leaq VEC_SIZE(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
- ret
-
+ ret
+
.p2align 4
L(first_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
-# else
- xorl %edx, %edx
+ /* Found CHAR or the null byte. */
leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
+
+L(prep_loop_4x):
+ /* Align data to 4 * VEC_SIZE. */
+ andq $-(VEC_SIZE * 4), %rdi
.p2align 4
-L(4x_vec_end):
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
+ vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
+ vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
+
+ /* Leaves only CHARS matching esi as 0. */
+ vpxor %ymm5, %ymm0, %ymm1
+ vpxor %ymm6, %ymm0, %ymm2
+ vpxor %ymm7, %ymm0, %ymm3
+ vpxor %ymm8, %ymm0, %ymm4
+
+ VPMINU %ymm1, %ymm5, %ymm1
+ VPMINU %ymm2, %ymm6, %ymm2
+ VPMINU %ymm3, %ymm7, %ymm3
+ VPMINU %ymm4, %ymm8, %ymm4
+
+ VPMINU %ymm1, %ymm2, %ymm5
+ VPMINU %ymm3, %ymm4, %ymm6
+
+ VPMINU %ymm5, %ymm6, %ymm5
+
+ VPCMPEQ %ymm5, %ymm9, %ymm5
+ vpmovmskb %ymm5, %eax
+
+ addq $(VEC_SIZE * 4), %rdi
+ testl %eax, %eax
+ jz L(loop_4x_vec)
+
+ VPCMPEQ %ymm1, %ymm9, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x0)
+
+ VPCMPEQ %ymm2, %ymm9, %ymm2
vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
+
+ VPCMPEQ %ymm3, %ymm9, %ymm3
+ VPCMPEQ %ymm4, %ymm9, %ymm4
+ vpmovmskb %ymm3, %ecx
vpmovmskb %ymm4, %eax
+ salq $32, %rax
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ /* Cold case for crossing page with first load. */
+ .p2align 4
+L(cross_page_boundary):
+ andq $-VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+
+ vmovdqa (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bits. */
+ sarxl %ecx, %eax, %eax
testl %eax, %eax
-L(first_vec_x3):
+ jz L(aligned_more)
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 3), %rax
+ addq %rcx, %rdi
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
END (STRCHR)
-#endif
+# endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index 583a152794..4dfbe3b58b 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
return OPTIMIZE (avx2);
--
2.29.2
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH v4 2/2] x86: Add additional benchmarks and tests for strchr
2021-02-03 5:38 [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S goldstein.w.n
@ 2021-02-03 5:39 ` goldstein.w.n
2021-02-08 14:08 ` H.J. Lu
2021-02-08 14:08 ` [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S H.J. Lu
1 sibling, 1 reply; 10+ messages in thread
From: goldstein.w.n @ 2021-02-03 5:39 UTC (permalink / raw)
To: libc-alpha; +Cc: carlos, goldstein.w.n, hjl.tools
From: noah <goldstein.w.n@gmail.com>
This patch adds additional benchmarks and tests for string size of
4096 and several benchmarks for string size 256 with different
alignments.
Signed-off-by: noah <goldstein.w.n@gmail.com>
---
Added 2 additional benchmark and test sizes:
4096: Just feels like a natural "large" size to test
256 with multiple alignments: This essentially is to test how
expensive the initial work prior to the 4x loop is depending on
different alignments.
results from bench-strchr: All times are in seconds and the median of
100 runs. Old is current strchr-avx2.S implementation. New is this
patch.
Summary: New is definitely faster for medium -> large sizes. Once the
4x loop is hit there is a 10%+ speedup and New always wins out. For
smaller sizes there is more variance as to which is faster and the
differences are small. Generally it seems the New version wins
out. This is likely because 0 - 31 sized strings are the fast path for
new (no jmp). Also something that is neat is the significant
performance improvement for alignment 96 and 112. This is because the 5x
vectors before 4x loop really favor that alignment.
Benchmarking CPU:
Icelake: Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz
size, algn, Old T , New T -------- Win Dif
0 , 0 , 2.54 , 2.52 -------- New -0.02
1 , 0 , 2.57 , 2.52 -------- New -0.05
2 , 0 , 2.56 , 2.52 -------- New -0.04
3 , 0 , 2.58 , 2.54 -------- New -0.04
4 , 0 , 2.61 , 2.55 -------- New -0.06
5 , 0 , 2.65 , 2.62 -------- New -0.03
6 , 0 , 2.73 , 2.74 -------- Old -0.01
7 , 0 , 2.75 , 2.74 -------- New -0.01
8 , 0 , 2.62 , 2.6 -------- New -0.02
9 , 0 , 2.73 , 2.75 -------- Old -0.02
10 , 0 , 2.74 , 2.74 -------- Eq N/A
11 , 0 , 2.76 , 2.72 -------- New -0.04
12 , 0 , 2.74 , 2.72 -------- New -0.02
13 , 0 , 2.75 , 2.72 -------- New -0.03
14 , 0 , 2.74 , 2.73 -------- New -0.01
15 , 0 , 2.74 , 2.73 -------- New -0.01
16 , 0 , 2.74 , 2.73 -------- New -0.01
17 , 0 , 2.74 , 2.74 -------- Eq N/A
18 , 0 , 2.73 , 2.73 -------- Eq N/A
19 , 0 , 2.73 , 2.73 -------- Eq N/A
20 , 0 , 2.73 , 2.73 -------- Eq N/A
21 , 0 , 2.73 , 2.72 -------- New -0.01
22 , 0 , 2.71 , 2.74 -------- Old -0.03
23 , 0 , 2.71 , 2.69 -------- New -0.02
24 , 0 , 2.68 , 2.67 -------- New -0.01
25 , 0 , 2.66 , 2.62 -------- New -0.04
26 , 0 , 2.64 , 2.62 -------- New -0.02
27 , 0 , 2.71 , 2.64 -------- New -0.07
28 , 0 , 2.67 , 2.69 -------- Old -0.02
29 , 0 , 2.72 , 2.72 -------- Eq N/A
30 , 0 , 2.68 , 2.69 -------- Old -0.01
31 , 0 , 2.68 , 2.68 -------- Eq N/A
32 , 0 , 3.51 , 3.52 -------- Old -0.01
32 , 1 , 3.52 , 3.51 -------- New -0.01
64 , 0 , 3.97 , 3.93 -------- New -0.04
64 , 2 , 3.95 , 3.9 -------- New -0.05
64 , 1 , 4.0 , 3.93 -------- New -0.07
64 , 3 , 3.97 , 3.88 -------- New -0.09
64 , 4 , 3.95 , 3.89 -------- New -0.06
64 , 5 , 3.94 , 3.9 -------- New -0.04
64 , 6 , 3.97 , 3.9 -------- New -0.07
64 , 7 , 3.97 , 3.91 -------- New -0.06
96 , 0 , 4.74 , 4.52 -------- New -0.22
128 , 0 , 5.29 , 5.19 -------- New -0.1
128 , 2 , 5.29 , 5.15 -------- New -0.14
128 , 3 , 5.31 , 5.22 -------- New -0.09
256 , 0 , 11.19 , 9.81 -------- New -1.38
256 , 3 , 11.19 , 9.84 -------- New -1.35
256 , 4 , 11.2 , 9.88 -------- New -1.32
256 , 16 , 11.21 , 9.79 -------- New -1.42
256 , 32 , 11.39 , 10.34 -------- New -1.05
256 , 48 , 11.88 , 10.56 -------- New -1.32
256 , 64 , 11.82 , 10.83 -------- New -0.99
256 , 80 , 11.85 , 10.86 -------- New -0.99
256 , 96 , 9.56 , 8.76 -------- New -0.8
256 , 112 , 9.55 , 8.9 -------- New -0.65
512 , 0 , 15.76 , 13.72 -------- New -2.04
512 , 4 , 15.72 , 13.74 -------- New -1.98
512 , 5 , 15.73 , 13.74 -------- New -1.99
1024, 0 , 24.85 , 21.33 -------- New -3.52
1024, 5 , 24.86 , 21.27 -------- New -3.59
1024, 6 , 24.87 , 21.32 -------- New -3.55
2048, 0 , 45.75 , 36.7 -------- New -9.05
2048, 6 , 43.91 , 35.42 -------- New -8.49
2048, 7 , 44.43 , 36.37 -------- New -8.06
4096, 0 , 96.94 , 81.34 -------- New -15.6
4096, 7 , 97.01 , 81.32 -------- New -15.69
benchtests/bench-strchr.c | 26 +++++++++++++++++++++++++-
string/test-strchr.c | 26 +++++++++++++++++++++++++-
2 files changed, 50 insertions(+), 2 deletions(-)
diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
index bf493fe458..4ce2369d9b 100644
--- a/benchtests/bench-strchr.c
+++ b/benchtests/bench-strchr.c
@@ -100,7 +100,7 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
size_t i;
CHAR *result;
CHAR *buf = (CHAR *) buf1;
- align &= 15;
+ align &= 127;
if ((align + len) * sizeof (CHAR) >= page_size)
return;
@@ -151,12 +151,24 @@ test_main (void)
do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
}
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+ }
+
for (i = 1; i < 8; ++i)
{
do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
}
+ for (i = 0; i < 8; ++i)
+ {
+ do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
+ }
+
for (i = 0; i < 32; ++i)
{
do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
@@ -169,12 +181,24 @@ test_main (void)
do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
}
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
+ do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
+ }
+
for (i = 1; i < 8; ++i)
{
do_test (i, 64, 256, 0, MIDDLE_CHAR);
do_test (i, 64, 256, 0, BIG_CHAR);
}
+ for (i = 0; i < 8; ++i)
+ {
+ do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
+ do_test (16 * i, 256, 512, 0, BIG_CHAR);
+ }
+
for (i = 0; i < 32; ++i)
{
do_test (0, i, i + 1, 0, MIDDLE_CHAR);
diff --git a/string/test-strchr.c b/string/test-strchr.c
index 5b6022746c..2cf4ea2add 100644
--- a/string/test-strchr.c
+++ b/string/test-strchr.c
@@ -130,7 +130,7 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
size_t i;
CHAR *result;
CHAR *buf = (CHAR *) buf1;
- align &= 15;
+ align &= 127;
if ((align + len) * sizeof (CHAR) >= page_size)
return;
@@ -259,12 +259,24 @@ test_main (void)
do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
}
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+ }
+
for (i = 1; i < 8; ++i)
{
do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
}
+ for (i = 0; i < 8; ++i)
+ {
+ do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
+ }
+
for (i = 0; i < 32; ++i)
{
do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
@@ -277,12 +289,24 @@ test_main (void)
do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
}
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
+ do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
+ }
+
for (i = 1; i < 8; ++i)
{
do_test (i, 64, 256, 0, MIDDLE_CHAR);
do_test (i, 64, 256, 0, BIG_CHAR);
}
+ for (i = 0; i < 8; ++i)
+ {
+ do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
+ do_test (16 * i, 256, 512, 0, BIG_CHAR);
+ }
+
for (i = 0; i < 32; ++i)
{
do_test (0, i, i + 1, 0, MIDDLE_CHAR);
--
2.29.2
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S
2021-02-03 5:38 [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S goldstein.w.n
2021-02-03 5:39 ` [PATCH v4 2/2] x86: Add additional benchmarks and tests for strchr goldstein.w.n
@ 2021-02-08 14:08 ` H.J. Lu
2021-02-08 19:33 ` H.J. Lu
1 sibling, 1 reply; 10+ messages in thread
From: H.J. Lu @ 2021-02-08 14:08 UTC (permalink / raw)
To: noah; +Cc: GNU C Library, Carlos O'Donell
On Tue, Feb 2, 2021 at 9:39 PM <goldstein.w.n@gmail.com> wrote:
>
> From: noah <goldstein.w.n@gmail.com>
>
> No bug. Just seemed the performance could be improved a bit. Observed
> and expected behavior are unchanged. Optimized body of main
> loop. Updated page cross logic and optimized accordingly. Made a few
> minor instruction selection modifications. No regressions in test
> suite. Both test-strchrnul and test-strchr passed.
>
> Signed-off-by: noah <goldstein.w.n@gmail.com>
> ---
> sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
> sysdeps/x86_64/multiarch/strchr.c | 1 +
> 2 files changed, 118 insertions(+), 118 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> index d416558d04..8b9d78b55a 100644
> --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> @@ -27,10 +27,12 @@
> # ifdef USE_AS_WCSCHR
> # define VPBROADCAST vpbroadcastd
> # define VPCMPEQ vpcmpeqd
> +# define VPMINU vpminud
> # define CHAR_REG esi
> # else
> # define VPBROADCAST vpbroadcastb
> # define VPCMPEQ vpcmpeqb
> +# define VPMINU vpminub
> # define CHAR_REG sil
> # endif
>
> @@ -39,20 +41,26 @@
> # endif
>
> # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>
> .section .text.avx,"ax",@progbits
> ENTRY (STRCHR)
> movl %edi, %ecx
> - /* Broadcast CHAR to YMM0. */
> +# ifndef USE_AS_STRCHRNUL
> + xorl %edx, %edx
> +# endif
> +
> + /* Broadcast CHAR to YMM0. */
> vmovd %esi, %xmm0
> vpxor %xmm9, %xmm9, %xmm9
> VPBROADCAST %xmm0, %ymm0
> - /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> -
> - /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> +
> + /* Check if we cross page boundary with one vector load. */
> + andl $(PAGE_SIZE - 1), %ecx
> + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> + ja L(cross_page_boundary)
> +
> + /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> null byte. */
> vmovdqu (%rdi), %ymm8
> VPCMPEQ %ymm8, %ymm0, %ymm1
> @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x0)
> -
> - /* Align data for aligned loads in the loop. */
> - addq $VEC_SIZE, %rdi
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> -
> - jmp L(more_4x_vec)
> -
> - .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> - vmovdqu (%rdi), %ymm8
> - VPCMPEQ %ymm8, %ymm0, %ymm1
> - VPCMPEQ %ymm8, %ymm9, %ymm2
> - vpor %ymm1, %ymm2, %ymm1
> - vpmovmskb %ymm1, %eax
> - /* Remove the leading bytes. */
> - sarl %cl, %eax
> - testl %eax, %eax
> - jz L(aligned_more)
> - /* Found CHAR or the null byte. */
> + jz L(more_vecs)
> tzcntl %eax, %eax
> - addq %rcx, %rax
> -# ifdef USE_AS_STRCHRNUL
> + /* Found CHAR or the null byte. */
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
>
> .p2align 4
> +L(more_vecs):
> + /* Align data for aligned loads in the loop. */
> + andq $-VEC_SIZE, %rdi
> L(aligned_more):
> - addq $VEC_SIZE, %rdi
>
> -L(more_4x_vec):
> - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> - since data is only aligned to VEC_SIZE. */
> - vmovdqa (%rdi), %ymm8
> + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> + since data is only aligned to VEC_SIZE. */
> + vmovdqa VEC_SIZE(%rdi), %ymm8
> + addq $VEC_SIZE, %rdi
> VPCMPEQ %ymm8, %ymm0, %ymm1
> VPCMPEQ %ymm8, %ymm9, %ymm2
> vpor %ymm1, %ymm2, %ymm1
> @@ -125,7 +110,7 @@ L(more_4x_vec):
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x2)
> + jnz L(first_vec_x2)
>
> vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> VPCMPEQ %ymm8, %ymm0, %ymm1
> @@ -133,122 +118,136 @@ L(more_4x_vec):
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x3)
> -
> - addq $(VEC_SIZE * 4), %rdi
> -
> - /* Align data to 4 * VEC_SIZE. */
> - movq %rdi, %rcx
> - andl $(4 * VEC_SIZE - 1), %ecx
> - andq $-(4 * VEC_SIZE), %rdi
> -
> - .p2align 4
> -L(loop_4x_vec):
> - /* Compare 4 * VEC at a time forward. */
> - vmovdqa (%rdi), %ymm5
> - vmovdqa VEC_SIZE(%rdi), %ymm6
> - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> -
> - VPCMPEQ %ymm5, %ymm0, %ymm1
> - VPCMPEQ %ymm6, %ymm0, %ymm2
> - VPCMPEQ %ymm7, %ymm0, %ymm3
> - VPCMPEQ %ymm8, %ymm0, %ymm4
> -
> - VPCMPEQ %ymm5, %ymm9, %ymm5
> - VPCMPEQ %ymm6, %ymm9, %ymm6
> - VPCMPEQ %ymm7, %ymm9, %ymm7
> - VPCMPEQ %ymm8, %ymm9, %ymm8
> -
> - vpor %ymm1, %ymm5, %ymm1
> - vpor %ymm2, %ymm6, %ymm2
> - vpor %ymm3, %ymm7, %ymm3
> - vpor %ymm4, %ymm8, %ymm4
> -
> - vpor %ymm1, %ymm2, %ymm5
> - vpor %ymm3, %ymm4, %ymm6
> -
> - vpor %ymm5, %ymm6, %ymm5
> -
> - vpmovmskb %ymm5, %eax
> - testl %eax, %eax
> - jnz L(4x_vec_end)
> -
> - addq $(VEC_SIZE * 4), %rdi
> + jz L(prep_loop_4x)
>
> - jmp L(loop_4x_vec)
> + tzcntl %eax, %eax
> + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
> + cmovne %rdx, %rax
> +# endif
> + VZEROUPPER
> + ret
>
> .p2align 4
> L(first_vec_x0):
> - /* Found CHAR or the null byte. */
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> + /* Found CHAR or the null byte. */
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
> -
> +
> .p2align 4
> L(first_vec_x1):
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $VEC_SIZE, %rax
> - addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> leaq VEC_SIZE(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> - ret
> -
> + ret
> +
> .p2align 4
> L(first_vec_x2):
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $(VEC_SIZE * 2), %rax
> - addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> + /* Found CHAR or the null byte. */
> leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
> +
> +L(prep_loop_4x):
> + /* Align data to 4 * VEC_SIZE. */
> + andq $-(VEC_SIZE * 4), %rdi
>
> .p2align 4
> -L(4x_vec_end):
> +L(loop_4x_vec):
> + /* Compare 4 * VEC at a time forward. */
> + vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> + vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> + vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> + vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> +
> + /* Leaves only CHARS matching esi as 0. */
> + vpxor %ymm5, %ymm0, %ymm1
> + vpxor %ymm6, %ymm0, %ymm2
> + vpxor %ymm7, %ymm0, %ymm3
> + vpxor %ymm8, %ymm0, %ymm4
> +
> + VPMINU %ymm1, %ymm5, %ymm1
> + VPMINU %ymm2, %ymm6, %ymm2
> + VPMINU %ymm3, %ymm7, %ymm3
> + VPMINU %ymm4, %ymm8, %ymm4
> +
> + VPMINU %ymm1, %ymm2, %ymm5
> + VPMINU %ymm3, %ymm4, %ymm6
> +
> + VPMINU %ymm5, %ymm6, %ymm5
> +
> + VPCMPEQ %ymm5, %ymm9, %ymm5
> + vpmovmskb %ymm5, %eax
> +
> + addq $(VEC_SIZE * 4), %rdi
> + testl %eax, %eax
> + jz L(loop_4x_vec)
> +
> + VPCMPEQ %ymm1, %ymm9, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x0)
> +
> + VPCMPEQ %ymm2, %ymm9, %ymm2
> vpmovmskb %ymm2, %eax
> testl %eax, %eax
> jnz L(first_vec_x1)
> - vpmovmskb %ymm3, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x2)
> +
> + VPCMPEQ %ymm3, %ymm9, %ymm3
> + VPCMPEQ %ymm4, %ymm9, %ymm4
> + vpmovmskb %ymm3, %ecx
> vpmovmskb %ymm4, %eax
> + salq $32, %rax
> + orq %rcx, %rax
> + tzcntq %rax, %rax
> + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
> + cmovne %rdx, %rax
> +# endif
> + VZEROUPPER
> + ret
> +
> + /* Cold case for crossing page with first load. */
> + .p2align 4
> +L(cross_page_boundary):
> + andq $-VEC_SIZE, %rdi
> + andl $(VEC_SIZE - 1), %ecx
> +
> + vmovdqa (%rdi), %ymm8
> + VPCMPEQ %ymm8, %ymm0, %ymm1
> + VPCMPEQ %ymm8, %ymm9, %ymm2
> + vpor %ymm1, %ymm2, %ymm1
> + vpmovmskb %ymm1, %eax
> + /* Remove the leading bits. */
> + sarxl %ecx, %eax, %eax
> testl %eax, %eax
> -L(first_vec_x3):
> + jz L(aligned_more)
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $(VEC_SIZE * 3), %rax
> + addq %rcx, %rdi
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
>
> END (STRCHR)
> -#endif
> +# endif
> diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> index 583a152794..4dfbe3b58b 100644
> --- a/sysdeps/x86_64/multiarch/strchr.c
> +++ b/sysdeps/x86_64/multiarch/strchr.c
> @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
>
> if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> return OPTIMIZE (avx2);
>
> --
> 2.29.2
>
LGTM.
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v4 2/2] x86: Add additional benchmarks and tests for strchr
2021-02-03 5:39 ` [PATCH v4 2/2] x86: Add additional benchmarks and tests for strchr goldstein.w.n
@ 2021-02-08 14:08 ` H.J. Lu
2021-02-08 19:34 ` H.J. Lu
0 siblings, 1 reply; 10+ messages in thread
From: H.J. Lu @ 2021-02-08 14:08 UTC (permalink / raw)
To: noah; +Cc: GNU C Library, Carlos O'Donell
On Tue, Feb 2, 2021 at 9:39 PM <goldstein.w.n@gmail.com> wrote:
>
> From: noah <goldstein.w.n@gmail.com>
>
> This patch adds additional benchmarks and tests for string size of
> 4096 and several benchmarks for string size 256 with different
> alignments.
>
> Signed-off-by: noah <goldstein.w.n@gmail.com>
> ---
> Added 2 additional benchmark and test sizes:
>
> 4096: Just feels like a natural "large" size to test
>
> 256 with multiple alignments: This essentially is to test how
> expensive the initial work prior to the 4x loop is depending on
> different alignments.
>
> results from bench-strchr: All times are in seconds and the median of
> 100 runs. Old is current strchr-avx2.S implementation. New is this
> patch.
>
> Summary: New is definitely faster for medium -> large sizes. Once the
> 4x loop is hit there is a 10%+ speedup and New always wins out. For
> smaller sizes there is more variance as to which is faster and the
> differences are small. Generally it seems the New version wins
> out. This is likely because 0 - 31 sized strings are the fast path for
> new (no jmp). Also something that is neat is the significant
> performance improvement for alignment 96 and 112. This is because the 5x
> vectors before 4x loop really favor that alignment.
>
> Benchmarking CPU:
> Icelake: Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz
>
> size, algn, Old T , New T -------- Win Dif
> 0 , 0 , 2.54 , 2.52 -------- New -0.02
> 1 , 0 , 2.57 , 2.52 -------- New -0.05
> 2 , 0 , 2.56 , 2.52 -------- New -0.04
> 3 , 0 , 2.58 , 2.54 -------- New -0.04
> 4 , 0 , 2.61 , 2.55 -------- New -0.06
> 5 , 0 , 2.65 , 2.62 -------- New -0.03
> 6 , 0 , 2.73 , 2.74 -------- Old -0.01
> 7 , 0 , 2.75 , 2.74 -------- New -0.01
> 8 , 0 , 2.62 , 2.6 -------- New -0.02
> 9 , 0 , 2.73 , 2.75 -------- Old -0.02
> 10 , 0 , 2.74 , 2.74 -------- Eq N/A
> 11 , 0 , 2.76 , 2.72 -------- New -0.04
> 12 , 0 , 2.74 , 2.72 -------- New -0.02
> 13 , 0 , 2.75 , 2.72 -------- New -0.03
> 14 , 0 , 2.74 , 2.73 -------- New -0.01
> 15 , 0 , 2.74 , 2.73 -------- New -0.01
> 16 , 0 , 2.74 , 2.73 -------- New -0.01
> 17 , 0 , 2.74 , 2.74 -------- Eq N/A
> 18 , 0 , 2.73 , 2.73 -------- Eq N/A
> 19 , 0 , 2.73 , 2.73 -------- Eq N/A
> 20 , 0 , 2.73 , 2.73 -------- Eq N/A
> 21 , 0 , 2.73 , 2.72 -------- New -0.01
> 22 , 0 , 2.71 , 2.74 -------- Old -0.03
> 23 , 0 , 2.71 , 2.69 -------- New -0.02
> 24 , 0 , 2.68 , 2.67 -------- New -0.01
> 25 , 0 , 2.66 , 2.62 -------- New -0.04
> 26 , 0 , 2.64 , 2.62 -------- New -0.02
> 27 , 0 , 2.71 , 2.64 -------- New -0.07
> 28 , 0 , 2.67 , 2.69 -------- Old -0.02
> 29 , 0 , 2.72 , 2.72 -------- Eq N/A
> 30 , 0 , 2.68 , 2.69 -------- Old -0.01
> 31 , 0 , 2.68 , 2.68 -------- Eq N/A
> 32 , 0 , 3.51 , 3.52 -------- Old -0.01
> 32 , 1 , 3.52 , 3.51 -------- New -0.01
> 64 , 0 , 3.97 , 3.93 -------- New -0.04
> 64 , 2 , 3.95 , 3.9 -------- New -0.05
> 64 , 1 , 4.0 , 3.93 -------- New -0.07
> 64 , 3 , 3.97 , 3.88 -------- New -0.09
> 64 , 4 , 3.95 , 3.89 -------- New -0.06
> 64 , 5 , 3.94 , 3.9 -------- New -0.04
> 64 , 6 , 3.97 , 3.9 -------- New -0.07
> 64 , 7 , 3.97 , 3.91 -------- New -0.06
> 96 , 0 , 4.74 , 4.52 -------- New -0.22
> 128 , 0 , 5.29 , 5.19 -------- New -0.1
> 128 , 2 , 5.29 , 5.15 -------- New -0.14
> 128 , 3 , 5.31 , 5.22 -------- New -0.09
> 256 , 0 , 11.19 , 9.81 -------- New -1.38
> 256 , 3 , 11.19 , 9.84 -------- New -1.35
> 256 , 4 , 11.2 , 9.88 -------- New -1.32
> 256 , 16 , 11.21 , 9.79 -------- New -1.42
> 256 , 32 , 11.39 , 10.34 -------- New -1.05
> 256 , 48 , 11.88 , 10.56 -------- New -1.32
> 256 , 64 , 11.82 , 10.83 -------- New -0.99
> 256 , 80 , 11.85 , 10.86 -------- New -0.99
> 256 , 96 , 9.56 , 8.76 -------- New -0.8
> 256 , 112 , 9.55 , 8.9 -------- New -0.65
> 512 , 0 , 15.76 , 13.72 -------- New -2.04
> 512 , 4 , 15.72 , 13.74 -------- New -1.98
> 512 , 5 , 15.73 , 13.74 -------- New -1.99
> 1024, 0 , 24.85 , 21.33 -------- New -3.52
> 1024, 5 , 24.86 , 21.27 -------- New -3.59
> 1024, 6 , 24.87 , 21.32 -------- New -3.55
> 2048, 0 , 45.75 , 36.7 -------- New -9.05
> 2048, 6 , 43.91 , 35.42 -------- New -8.49
> 2048, 7 , 44.43 , 36.37 -------- New -8.06
> 4096, 0 , 96.94 , 81.34 -------- New -15.6
> 4096, 7 , 97.01 , 81.32 -------- New -15.69
>
> benchtests/bench-strchr.c | 26 +++++++++++++++++++++++++-
> string/test-strchr.c | 26 +++++++++++++++++++++++++-
> 2 files changed, 50 insertions(+), 2 deletions(-)
>
> diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
> index bf493fe458..4ce2369d9b 100644
> --- a/benchtests/bench-strchr.c
> +++ b/benchtests/bench-strchr.c
> @@ -100,7 +100,7 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> size_t i;
> CHAR *result;
> CHAR *buf = (CHAR *) buf1;
> - align &= 15;
> + align &= 127;
> if ((align + len) * sizeof (CHAR) >= page_size)
> return;
>
> @@ -151,12 +151,24 @@ test_main (void)
> do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> }
>
> + for (i = 1; i < 8; ++i)
> + {
> + do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> + }
> +
> for (i = 1; i < 8; ++i)
> {
> do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
> do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
> }
>
> + for (i = 0; i < 8; ++i)
> + {
> + do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
> + }
> +
> for (i = 0; i < 32; ++i)
> {
> do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
> @@ -169,12 +181,24 @@ test_main (void)
> do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
> }
>
> + for (i = 1; i < 8; ++i)
> + {
> + do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
> + do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
> + }
> +
> for (i = 1; i < 8; ++i)
> {
> do_test (i, 64, 256, 0, MIDDLE_CHAR);
> do_test (i, 64, 256, 0, BIG_CHAR);
> }
>
> + for (i = 0; i < 8; ++i)
> + {
> + do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
> + do_test (16 * i, 256, 512, 0, BIG_CHAR);
> + }
> +
> for (i = 0; i < 32; ++i)
> {
> do_test (0, i, i + 1, 0, MIDDLE_CHAR);
> diff --git a/string/test-strchr.c b/string/test-strchr.c
> index 5b6022746c..2cf4ea2add 100644
> --- a/string/test-strchr.c
> +++ b/string/test-strchr.c
> @@ -130,7 +130,7 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> size_t i;
> CHAR *result;
> CHAR *buf = (CHAR *) buf1;
> - align &= 15;
> + align &= 127;
> if ((align + len) * sizeof (CHAR) >= page_size)
> return;
>
> @@ -259,12 +259,24 @@ test_main (void)
> do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> }
>
> + for (i = 1; i < 8; ++i)
> + {
> + do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> + }
> +
> for (i = 1; i < 8; ++i)
> {
> do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
> do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
> }
>
> + for (i = 0; i < 8; ++i)
> + {
> + do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
> + }
> +
> for (i = 0; i < 32; ++i)
> {
> do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
> @@ -277,12 +289,24 @@ test_main (void)
> do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
> }
>
> + for (i = 1; i < 8; ++i)
> + {
> + do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
> + do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
> + }
> +
> for (i = 1; i < 8; ++i)
> {
> do_test (i, 64, 256, 0, MIDDLE_CHAR);
> do_test (i, 64, 256, 0, BIG_CHAR);
> }
>
> + for (i = 0; i < 8; ++i)
> + {
> + do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
> + do_test (16 * i, 256, 512, 0, BIG_CHAR);
> + }
> +
> for (i = 0; i < 32; ++i)
> {
> do_test (0, i, i + 1, 0, MIDDLE_CHAR);
> --
> 2.29.2
>
LGTM.
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S
2021-02-08 14:08 ` [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S H.J. Lu
@ 2021-02-08 19:33 ` H.J. Lu
2021-02-08 19:48 ` Noah Goldstein
0 siblings, 1 reply; 10+ messages in thread
From: H.J. Lu @ 2021-02-08 19:33 UTC (permalink / raw)
To: noah; +Cc: GNU C Library, Carlos O'Donell
[-- Attachment #1: Type: text/plain, Size: 11974 bytes --]
On Mon, Feb 8, 2021 at 6:08 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Feb 2, 2021 at 9:39 PM <goldstein.w.n@gmail.com> wrote:
> >
> > From: noah <goldstein.w.n@gmail.com>
> >
> > No bug. Just seemed the performance could be improved a bit. Observed
> > and expected behavior are unchanged. Optimized body of main
> > loop. Updated page cross logic and optimized accordingly. Made a few
> > minor instruction selection modifications. No regressions in test
> > suite. Both test-strchrnul and test-strchr passed.
> >
> > Signed-off-by: noah <goldstein.w.n@gmail.com>
> > ---
> > sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
> > sysdeps/x86_64/multiarch/strchr.c | 1 +
> > 2 files changed, 118 insertions(+), 118 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > index d416558d04..8b9d78b55a 100644
> > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > @@ -27,10 +27,12 @@
> > # ifdef USE_AS_WCSCHR
> > # define VPBROADCAST vpbroadcastd
> > # define VPCMPEQ vpcmpeqd
> > +# define VPMINU vpminud
> > # define CHAR_REG esi
> > # else
> > # define VPBROADCAST vpbroadcastb
> > # define VPCMPEQ vpcmpeqb
> > +# define VPMINU vpminub
> > # define CHAR_REG sil
> > # endif
> >
> > @@ -39,20 +41,26 @@
> > # endif
> >
> > # define VEC_SIZE 32
> > +# define PAGE_SIZE 4096
> >
> > .section .text.avx,"ax",@progbits
> > ENTRY (STRCHR)
> > movl %edi, %ecx
> > - /* Broadcast CHAR to YMM0. */
> > +# ifndef USE_AS_STRCHRNUL
> > + xorl %edx, %edx
> > +# endif
> > +
> > + /* Broadcast CHAR to YMM0. */
> > vmovd %esi, %xmm0
> > vpxor %xmm9, %xmm9, %xmm9
> > VPBROADCAST %xmm0, %ymm0
> > - /* Check if we may cross page boundary with one vector load. */
> > - andl $(2 * VEC_SIZE - 1), %ecx
> > - cmpl $VEC_SIZE, %ecx
> > - ja L(cros_page_boundary)
> > -
> > - /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> > +
> > + /* Check if we cross page boundary with one vector load. */
> > + andl $(PAGE_SIZE - 1), %ecx
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> > + ja L(cross_page_boundary)
> > +
> > + /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> > null byte. */
> > vmovdqu (%rdi), %ymm8
> > VPCMPEQ %ymm8, %ymm0, %ymm1
> > @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> > vpor %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > - jnz L(first_vec_x0)
> > -
> > - /* Align data for aligned loads in the loop. */
> > - addq $VEC_SIZE, %rdi
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > -
> > - jmp L(more_4x_vec)
> > -
> > - .p2align 4
> > -L(cros_page_boundary):
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > - vmovdqu (%rdi), %ymm8
> > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > - vpor %ymm1, %ymm2, %ymm1
> > - vpmovmskb %ymm1, %eax
> > - /* Remove the leading bytes. */
> > - sarl %cl, %eax
> > - testl %eax, %eax
> > - jz L(aligned_more)
> > - /* Found CHAR or the null byte. */
> > + jz L(more_vecs)
> > tzcntl %eax, %eax
> > - addq %rcx, %rax
> > -# ifdef USE_AS_STRCHRNUL
> > + /* Found CHAR or the null byte. */
> > addq %rdi, %rax
> > -# else
> > - xorl %edx, %edx
> > - leaq (%rdi, %rax), %rax
> > - cmp (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > + cmp (%rax), %CHAR_REG
> > cmovne %rdx, %rax
> > # endif
> > VZEROUPPER
> > ret
> >
> > .p2align 4
> > +L(more_vecs):
> > + /* Align data for aligned loads in the loop. */
> > + andq $-VEC_SIZE, %rdi
> > L(aligned_more):
> > - addq $VEC_SIZE, %rdi
> >
> > -L(more_4x_vec):
> > - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > - since data is only aligned to VEC_SIZE. */
> > - vmovdqa (%rdi), %ymm8
> > + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > + since data is only aligned to VEC_SIZE. */
> > + vmovdqa VEC_SIZE(%rdi), %ymm8
> > + addq $VEC_SIZE, %rdi
> > VPCMPEQ %ymm8, %ymm0, %ymm1
> > VPCMPEQ %ymm8, %ymm9, %ymm2
> > vpor %ymm1, %ymm2, %ymm1
> > @@ -125,7 +110,7 @@ L(more_4x_vec):
> > vpor %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > - jnz L(first_vec_x2)
> > + jnz L(first_vec_x2)
> >
> > vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > VPCMPEQ %ymm8, %ymm0, %ymm1
> > @@ -133,122 +118,136 @@ L(more_4x_vec):
> > vpor %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > - jnz L(first_vec_x3)
> > -
> > - addq $(VEC_SIZE * 4), %rdi
> > -
> > - /* Align data to 4 * VEC_SIZE. */
> > - movq %rdi, %rcx
> > - andl $(4 * VEC_SIZE - 1), %ecx
> > - andq $-(4 * VEC_SIZE), %rdi
> > -
> > - .p2align 4
> > -L(loop_4x_vec):
> > - /* Compare 4 * VEC at a time forward. */
> > - vmovdqa (%rdi), %ymm5
> > - vmovdqa VEC_SIZE(%rdi), %ymm6
> > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > -
> > - VPCMPEQ %ymm5, %ymm0, %ymm1
> > - VPCMPEQ %ymm6, %ymm0, %ymm2
> > - VPCMPEQ %ymm7, %ymm0, %ymm3
> > - VPCMPEQ %ymm8, %ymm0, %ymm4
> > -
> > - VPCMPEQ %ymm5, %ymm9, %ymm5
> > - VPCMPEQ %ymm6, %ymm9, %ymm6
> > - VPCMPEQ %ymm7, %ymm9, %ymm7
> > - VPCMPEQ %ymm8, %ymm9, %ymm8
> > -
> > - vpor %ymm1, %ymm5, %ymm1
> > - vpor %ymm2, %ymm6, %ymm2
> > - vpor %ymm3, %ymm7, %ymm3
> > - vpor %ymm4, %ymm8, %ymm4
> > -
> > - vpor %ymm1, %ymm2, %ymm5
> > - vpor %ymm3, %ymm4, %ymm6
> > -
> > - vpor %ymm5, %ymm6, %ymm5
> > -
> > - vpmovmskb %ymm5, %eax
> > - testl %eax, %eax
> > - jnz L(4x_vec_end)
> > -
> > - addq $(VEC_SIZE * 4), %rdi
> > + jz L(prep_loop_4x)
> >
> > - jmp L(loop_4x_vec)
> > + tzcntl %eax, %eax
> > + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> > +# ifndef USE_AS_STRCHRNUL
> > + cmp (%rax), %CHAR_REG
> > + cmovne %rdx, %rax
> > +# endif
> > + VZEROUPPER
> > + ret
> >
> > .p2align 4
> > L(first_vec_x0):
> > - /* Found CHAR or the null byte. */
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > + /* Found CHAR or the null byte. */
> > addq %rdi, %rax
> > -# else
> > - xorl %edx, %edx
> > - leaq (%rdi, %rax), %rax
> > - cmp (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > + cmp (%rax), %CHAR_REG
> > cmovne %rdx, %rax
> > # endif
> > VZEROUPPER
> > ret
> > -
> > +
> > .p2align 4
> > L(first_vec_x1):
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > - addq $VEC_SIZE, %rax
> > - addq %rdi, %rax
> > -# else
> > - xorl %edx, %edx
> > leaq VEC_SIZE(%rdi, %rax), %rax
> > - cmp (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > + cmp (%rax), %CHAR_REG
> > cmovne %rdx, %rax
> > # endif
> > VZEROUPPER
> > - ret
> > -
> > + ret
> > +
> > .p2align 4
> > L(first_vec_x2):
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > - addq $(VEC_SIZE * 2), %rax
> > - addq %rdi, %rax
> > -# else
> > - xorl %edx, %edx
> > + /* Found CHAR or the null byte. */
> > leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > - cmp (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > + cmp (%rax), %CHAR_REG
> > cmovne %rdx, %rax
> > # endif
> > VZEROUPPER
> > ret
> > +
> > +L(prep_loop_4x):
> > + /* Align data to 4 * VEC_SIZE. */
> > + andq $-(VEC_SIZE * 4), %rdi
> >
> > .p2align 4
> > -L(4x_vec_end):
> > +L(loop_4x_vec):
> > + /* Compare 4 * VEC at a time forward. */
> > + vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> > + vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> > + vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> > + vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> > +
> > + /* Leaves only CHARS matching esi as 0. */
> > + vpxor %ymm5, %ymm0, %ymm1
> > + vpxor %ymm6, %ymm0, %ymm2
> > + vpxor %ymm7, %ymm0, %ymm3
> > + vpxor %ymm8, %ymm0, %ymm4
> > +
> > + VPMINU %ymm1, %ymm5, %ymm1
> > + VPMINU %ymm2, %ymm6, %ymm2
> > + VPMINU %ymm3, %ymm7, %ymm3
> > + VPMINU %ymm4, %ymm8, %ymm4
> > +
> > + VPMINU %ymm1, %ymm2, %ymm5
> > + VPMINU %ymm3, %ymm4, %ymm6
> > +
> > + VPMINU %ymm5, %ymm6, %ymm5
> > +
> > + VPCMPEQ %ymm5, %ymm9, %ymm5
> > + vpmovmskb %ymm5, %eax
> > +
> > + addq $(VEC_SIZE * 4), %rdi
> > + testl %eax, %eax
> > + jz L(loop_4x_vec)
> > +
> > + VPCMPEQ %ymm1, %ymm9, %ymm1
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x0)
> > +
> > + VPCMPEQ %ymm2, %ymm9, %ymm2
> > vpmovmskb %ymm2, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x1)
> > - vpmovmskb %ymm3, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x2)
> > +
> > + VPCMPEQ %ymm3, %ymm9, %ymm3
> > + VPCMPEQ %ymm4, %ymm9, %ymm4
> > + vpmovmskb %ymm3, %ecx
> > vpmovmskb %ymm4, %eax
> > + salq $32, %rax
> > + orq %rcx, %rax
> > + tzcntq %rax, %rax
> > + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > +# ifndef USE_AS_STRCHRNUL
> > + cmp (%rax), %CHAR_REG
> > + cmovne %rdx, %rax
> > +# endif
> > + VZEROUPPER
> > + ret
> > +
> > + /* Cold case for crossing page with first load. */
> > + .p2align 4
> > +L(cross_page_boundary):
> > + andq $-VEC_SIZE, %rdi
> > + andl $(VEC_SIZE - 1), %ecx
> > +
> > + vmovdqa (%rdi), %ymm8
> > + VPCMPEQ %ymm8, %ymm0, %ymm1
> > + VPCMPEQ %ymm8, %ymm9, %ymm2
> > + vpor %ymm1, %ymm2, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + /* Remove the leading bits. */
> > + sarxl %ecx, %eax, %eax
> > testl %eax, %eax
> > -L(first_vec_x3):
> > + jz L(aligned_more)
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > - addq $(VEC_SIZE * 3), %rax
> > + addq %rcx, %rdi
> > addq %rdi, %rax
> > -# else
> > - xorl %edx, %edx
> > - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> > - cmp (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > + cmp (%rax), %CHAR_REG
> > cmovne %rdx, %rax
> > # endif
> > VZEROUPPER
> > ret
> >
> > END (STRCHR)
> > -#endif
> > +# endif
> > diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> > index 583a152794..4dfbe3b58b 100644
> > --- a/sysdeps/x86_64/multiarch/strchr.c
> > +++ b/sysdeps/x86_64/multiarch/strchr.c
> > @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
> >
> > if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> > && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> > && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > return OPTIMIZE (avx2);
> >
> > --
> > 2.29.2
> >
>
> LGTM.
>
> Thanks.
>
This is the updated patch with extra white spaces fixed I am checking in.
--
H.J.
[-- Attachment #2: 0001-x86-64-Refactor-and-improve-performance-of-strchr-av.patch --]
[-- Type: text/x-patch, Size: 8551 bytes --]
From 1f745ecc2109890886b161d4791e1406fdfc29b8 Mon Sep 17 00:00:00 2001
From: noah <goldstein.w.n@gmail.com>
Date: Wed, 3 Feb 2021 00:38:59 -0500
Subject: [PATCH 1/2] x86-64: Refactor and improve performance of strchr-avx2.S
No bug. Just seemed the performance could be improved a bit. Observed
and expected behavior are unchanged. Optimized body of main
loop. Updated page cross logic and optimized accordingly. Made a few
minor instruction selection modifications. No regressions in test
suite. Both test-strchrnul and test-strchr passed.
---
sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++-------------
sysdeps/x86_64/multiarch/strchr.c | 1 +
2 files changed, 113 insertions(+), 113 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index d416558d04..476c20c289 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -27,10 +27,12 @@
# ifdef USE_AS_WCSCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMINU vpminud
# define CHAR_REG esi
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMINU vpminub
# define CHAR_REG sil
# endif
@@ -39,71 +41,54 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section .text.avx,"ax",@progbits
ENTRY (STRCHR)
movl %edi, %ecx
- /* Broadcast CHAR to YMM0. */
+# ifndef USE_AS_STRCHRNUL
+ xorl %edx, %edx
+# endif
+
+ /* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
vpxor %xmm9, %xmm9, %xmm9
VPBROADCAST %xmm0, %ymm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
-
- /* Check the first VEC_SIZE bytes. Search for both CHAR and the
- null byte. */
- vmovdqu (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- jmp L(more_4x_vec)
+ /* Check if we cross page boundary with one vector load. */
+ andl $(PAGE_SIZE - 1), %ecx
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+ ja L(cross_page_boundary)
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
+ null byte. */
vmovdqu (%rdi), %ymm8
VPCMPEQ %ymm8, %ymm0, %ymm1
VPCMPEQ %ymm8, %ymm9, %ymm2
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
- /* Remove the leading bytes. */
- sarl %cl, %eax
testl %eax, %eax
- jz L(aligned_more)
- /* Found CHAR or the null byte. */
+ jz L(more_vecs)
tzcntl %eax, %eax
- addq %rcx, %rax
-# ifdef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
.p2align 4
+L(more_vecs):
+ /* Align data for aligned loads in the loop. */
+ andq $-VEC_SIZE, %rdi
L(aligned_more):
- addq $VEC_SIZE, %rdi
-L(more_4x_vec):
- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vmovdqa (%rdi), %ymm8
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vmovdqa VEC_SIZE(%rdi), %ymm8
+ addq $VEC_SIZE, %rdi
VPCMPEQ %ymm8, %ymm0, %ymm1
VPCMPEQ %ymm8, %ymm9, %ymm2
vpor %ymm1, %ymm2, %ymm1
@@ -133,61 +118,24 @@ L(more_4x_vec):
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x3)
-
- addq $(VEC_SIZE * 4), %rdi
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
-
- .p2align 4
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm5
- vmovdqa VEC_SIZE(%rdi), %ymm6
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
-
- VPCMPEQ %ymm5, %ymm0, %ymm1
- VPCMPEQ %ymm6, %ymm0, %ymm2
- VPCMPEQ %ymm7, %ymm0, %ymm3
- VPCMPEQ %ymm8, %ymm0, %ymm4
+ jz L(prep_loop_4x)
- VPCMPEQ %ymm5, %ymm9, %ymm5
- VPCMPEQ %ymm6, %ymm9, %ymm6
- VPCMPEQ %ymm7, %ymm9, %ymm7
- VPCMPEQ %ymm8, %ymm9, %ymm8
-
- vpor %ymm1, %ymm5, %ymm1
- vpor %ymm2, %ymm6, %ymm2
- vpor %ymm3, %ymm7, %ymm3
- vpor %ymm4, %ymm8, %ymm4
-
- vpor %ymm1, %ymm2, %ymm5
- vpor %ymm3, %ymm4, %ymm6
-
- vpor %ymm5, %ymm6, %ymm5
-
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
- jmp L(loop_4x_vec)
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
.p2align 4
L(first_vec_x0):
- /* Found CHAR or the null byte. */
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
@@ -196,13 +144,9 @@ L(first_vec_x0):
.p2align 4
L(first_vec_x1):
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
-# else
- xorl %edx, %edx
leaq VEC_SIZE(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
@@ -211,44 +155,99 @@ L(first_vec_x1):
.p2align 4
L(first_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
-# else
- xorl %edx, %edx
+ /* Found CHAR or the null byte. */
leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
+L(prep_loop_4x):
+ /* Align data to 4 * VEC_SIZE. */
+ andq $-(VEC_SIZE * 4), %rdi
+
.p2align 4
-L(4x_vec_end):
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
+ vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
+ vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
+
+ /* Leaves only CHARS matching esi as 0. */
+ vpxor %ymm5, %ymm0, %ymm1
+ vpxor %ymm6, %ymm0, %ymm2
+ vpxor %ymm7, %ymm0, %ymm3
+ vpxor %ymm8, %ymm0, %ymm4
+
+ VPMINU %ymm1, %ymm5, %ymm1
+ VPMINU %ymm2, %ymm6, %ymm2
+ VPMINU %ymm3, %ymm7, %ymm3
+ VPMINU %ymm4, %ymm8, %ymm4
+
+ VPMINU %ymm1, %ymm2, %ymm5
+ VPMINU %ymm3, %ymm4, %ymm6
+
+ VPMINU %ymm5, %ymm6, %ymm5
+
+ VPCMPEQ %ymm5, %ymm9, %ymm5
+ vpmovmskb %ymm5, %eax
+
+ addq $(VEC_SIZE * 4), %rdi
+ testl %eax, %eax
+ jz L(loop_4x_vec)
+
+ VPCMPEQ %ymm1, %ymm9, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x0)
+
+ VPCMPEQ %ymm2, %ymm9, %ymm2
vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
+
+ VPCMPEQ %ymm3, %ymm9, %ymm3
+ VPCMPEQ %ymm4, %ymm9, %ymm4
+ vpmovmskb %ymm3, %ecx
vpmovmskb %ymm4, %eax
+ salq $32, %rax
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ /* Cold case for crossing page with first load. */
+ .p2align 4
+L(cross_page_boundary):
+ andq $-VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+
+ vmovdqa (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bits. */
+ sarxl %ecx, %eax, %eax
testl %eax, %eax
-L(first_vec_x3):
+ jz L(aligned_more)
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 3), %rax
+ addq %rcx, %rdi
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
END (STRCHR)
-#endif
+# endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index 583a152794..4dfbe3b58b 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
return OPTIMIZE (avx2);
--
2.29.2
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v4 2/2] x86: Add additional benchmarks and tests for strchr
2021-02-08 14:08 ` H.J. Lu
@ 2021-02-08 19:34 ` H.J. Lu
2021-02-08 19:49 ` Noah Goldstein
0 siblings, 1 reply; 10+ messages in thread
From: H.J. Lu @ 2021-02-08 19:34 UTC (permalink / raw)
To: noah; +Cc: GNU C Library, Carlos O'Donell
[-- Attachment #1: Type: text/plain, Size: 9109 bytes --]
On Mon, Feb 8, 2021 at 6:08 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Feb 2, 2021 at 9:39 PM <goldstein.w.n@gmail.com> wrote:
> >
> > From: noah <goldstein.w.n@gmail.com>
> >
> > This patch adds additional benchmarks and tests for string size of
> > 4096 and several benchmarks for string size 256 with different
> > alignments.
> >
> > Signed-off-by: noah <goldstein.w.n@gmail.com>
> > ---
> > Added 2 additional benchmark and test sizes:
> >
> > 4096: Just feels like a natural "large" size to test
> >
> > 256 with multiple alignments: This essentially is to test how
> > expensive the initial work prior to the 4x loop is depending on
> > different alignments.
> >
> > results from bench-strchr: All times are in seconds and the median of
> > 100 runs. Old is current strchr-avx2.S implementation. New is this
> > patch.
> >
> > Summary: New is definitely faster for medium -> large sizes. Once the
> > 4x loop is hit there is a 10%+ speedup and New always wins out. For
> > smaller sizes there is more variance as to which is faster and the
> > differences are small. Generally it seems the New version wins
> > out. This is likely because 0 - 31 sized strings are the fast path for
> > new (no jmp). Also something that is neat is the significant
> > performance improvement for alignment 96 and 112. This is because the 5x
> > vectors before 4x loop really favor that alignment.
> >
> > Benchmarking CPU:
> > Icelake: Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz
> >
> > size, algn, Old T , New T -------- Win Dif
> > 0 , 0 , 2.54 , 2.52 -------- New -0.02
> > 1 , 0 , 2.57 , 2.52 -------- New -0.05
> > 2 , 0 , 2.56 , 2.52 -------- New -0.04
> > 3 , 0 , 2.58 , 2.54 -------- New -0.04
> > 4 , 0 , 2.61 , 2.55 -------- New -0.06
> > 5 , 0 , 2.65 , 2.62 -------- New -0.03
> > 6 , 0 , 2.73 , 2.74 -------- Old -0.01
> > 7 , 0 , 2.75 , 2.74 -------- New -0.01
> > 8 , 0 , 2.62 , 2.6 -------- New -0.02
> > 9 , 0 , 2.73 , 2.75 -------- Old -0.02
> > 10 , 0 , 2.74 , 2.74 -------- Eq N/A
> > 11 , 0 , 2.76 , 2.72 -------- New -0.04
> > 12 , 0 , 2.74 , 2.72 -------- New -0.02
> > 13 , 0 , 2.75 , 2.72 -------- New -0.03
> > 14 , 0 , 2.74 , 2.73 -------- New -0.01
> > 15 , 0 , 2.74 , 2.73 -------- New -0.01
> > 16 , 0 , 2.74 , 2.73 -------- New -0.01
> > 17 , 0 , 2.74 , 2.74 -------- Eq N/A
> > 18 , 0 , 2.73 , 2.73 -------- Eq N/A
> > 19 , 0 , 2.73 , 2.73 -------- Eq N/A
> > 20 , 0 , 2.73 , 2.73 -------- Eq N/A
> > 21 , 0 , 2.73 , 2.72 -------- New -0.01
> > 22 , 0 , 2.71 , 2.74 -------- Old -0.03
> > 23 , 0 , 2.71 , 2.69 -------- New -0.02
> > 24 , 0 , 2.68 , 2.67 -------- New -0.01
> > 25 , 0 , 2.66 , 2.62 -------- New -0.04
> > 26 , 0 , 2.64 , 2.62 -------- New -0.02
> > 27 , 0 , 2.71 , 2.64 -------- New -0.07
> > 28 , 0 , 2.67 , 2.69 -------- Old -0.02
> > 29 , 0 , 2.72 , 2.72 -------- Eq N/A
> > 30 , 0 , 2.68 , 2.69 -------- Old -0.01
> > 31 , 0 , 2.68 , 2.68 -------- Eq N/A
> > 32 , 0 , 3.51 , 3.52 -------- Old -0.01
> > 32 , 1 , 3.52 , 3.51 -------- New -0.01
> > 64 , 0 , 3.97 , 3.93 -------- New -0.04
> > 64 , 2 , 3.95 , 3.9 -------- New -0.05
> > 64 , 1 , 4.0 , 3.93 -------- New -0.07
> > 64 , 3 , 3.97 , 3.88 -------- New -0.09
> > 64 , 4 , 3.95 , 3.89 -------- New -0.06
> > 64 , 5 , 3.94 , 3.9 -------- New -0.04
> > 64 , 6 , 3.97 , 3.9 -------- New -0.07
> > 64 , 7 , 3.97 , 3.91 -------- New -0.06
> > 96 , 0 , 4.74 , 4.52 -------- New -0.22
> > 128 , 0 , 5.29 , 5.19 -------- New -0.1
> > 128 , 2 , 5.29 , 5.15 -------- New -0.14
> > 128 , 3 , 5.31 , 5.22 -------- New -0.09
> > 256 , 0 , 11.19 , 9.81 -------- New -1.38
> > 256 , 3 , 11.19 , 9.84 -------- New -1.35
> > 256 , 4 , 11.2 , 9.88 -------- New -1.32
> > 256 , 16 , 11.21 , 9.79 -------- New -1.42
> > 256 , 32 , 11.39 , 10.34 -------- New -1.05
> > 256 , 48 , 11.88 , 10.56 -------- New -1.32
> > 256 , 64 , 11.82 , 10.83 -------- New -0.99
> > 256 , 80 , 11.85 , 10.86 -------- New -0.99
> > 256 , 96 , 9.56 , 8.76 -------- New -0.8
> > 256 , 112 , 9.55 , 8.9 -------- New -0.65
> > 512 , 0 , 15.76 , 13.72 -------- New -2.04
> > 512 , 4 , 15.72 , 13.74 -------- New -1.98
> > 512 , 5 , 15.73 , 13.74 -------- New -1.99
> > 1024, 0 , 24.85 , 21.33 -------- New -3.52
> > 1024, 5 , 24.86 , 21.27 -------- New -3.59
> > 1024, 6 , 24.87 , 21.32 -------- New -3.55
> > 2048, 0 , 45.75 , 36.7 -------- New -9.05
> > 2048, 6 , 43.91 , 35.42 -------- New -8.49
> > 2048, 7 , 44.43 , 36.37 -------- New -8.06
> > 4096, 0 , 96.94 , 81.34 -------- New -15.6
> > 4096, 7 , 97.01 , 81.32 -------- New -15.69
> >
> > benchtests/bench-strchr.c | 26 +++++++++++++++++++++++++-
> > string/test-strchr.c | 26 +++++++++++++++++++++++++-
> > 2 files changed, 50 insertions(+), 2 deletions(-)
> >
> > diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
> > index bf493fe458..4ce2369d9b 100644
> > --- a/benchtests/bench-strchr.c
> > +++ b/benchtests/bench-strchr.c
> > @@ -100,7 +100,7 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > size_t i;
> > CHAR *result;
> > CHAR *buf = (CHAR *) buf1;
> > - align &= 15;
> > + align &= 127;
> > if ((align + len) * sizeof (CHAR) >= page_size)
> > return;
> >
> > @@ -151,12 +151,24 @@ test_main (void)
> > do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> > }
> >
> > + for (i = 1; i < 8; ++i)
> > + {
> > + do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> > + do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> > + }
> > +
> > for (i = 1; i < 8; ++i)
> > {
> > do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
> > do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
> > }
> >
> > + for (i = 0; i < 8; ++i)
> > + {
> > + do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
> > + do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
> > + }
> > +
> > for (i = 0; i < 32; ++i)
> > {
> > do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
> > @@ -169,12 +181,24 @@ test_main (void)
> > do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
> > }
> >
> > + for (i = 1; i < 8; ++i)
> > + {
> > + do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
> > + do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
> > + }
> > +
> > for (i = 1; i < 8; ++i)
> > {
> > do_test (i, 64, 256, 0, MIDDLE_CHAR);
> > do_test (i, 64, 256, 0, BIG_CHAR);
> > }
> >
> > + for (i = 0; i < 8; ++i)
> > + {
> > + do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
> > + do_test (16 * i, 256, 512, 0, BIG_CHAR);
> > + }
> > +
> > for (i = 0; i < 32; ++i)
> > {
> > do_test (0, i, i + 1, 0, MIDDLE_CHAR);
> > diff --git a/string/test-strchr.c b/string/test-strchr.c
> > index 5b6022746c..2cf4ea2add 100644
> > --- a/string/test-strchr.c
> > +++ b/string/test-strchr.c
> > @@ -130,7 +130,7 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > size_t i;
> > CHAR *result;
> > CHAR *buf = (CHAR *) buf1;
> > - align &= 15;
> > + align &= 127;
> > if ((align + len) * sizeof (CHAR) >= page_size)
> > return;
> >
> > @@ -259,12 +259,24 @@ test_main (void)
> > do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> > }
> >
> > + for (i = 1; i < 8; ++i)
> > + {
> > + do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> > + do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> > + }
> > +
> > for (i = 1; i < 8; ++i)
> > {
> > do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
> > do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
> > }
> >
> > + for (i = 0; i < 8; ++i)
> > + {
> > + do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
> > + do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
> > + }
> > +
> > for (i = 0; i < 32; ++i)
> > {
> > do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
> > @@ -277,12 +289,24 @@ test_main (void)
> > do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
> > }
> >
> > + for (i = 1; i < 8; ++i)
> > + {
> > + do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
> > + do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
> > + }
> > +
> > for (i = 1; i < 8; ++i)
> > {
> > do_test (i, 64, 256, 0, MIDDLE_CHAR);
> > do_test (i, 64, 256, 0, BIG_CHAR);
> > }
> >
> > + for (i = 0; i < 8; ++i)
> > + {
> > + do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
> > + do_test (16 * i, 256, 512, 0, BIG_CHAR);
> > + }
> > +
> > for (i = 0; i < 32; ++i)
> > {
> > do_test (0, i, i + 1, 0, MIDDLE_CHAR);
> > --
> > 2.29.2
> >
>
> LGTM.
>
> Thanks.
>
> --
> H.J.
This is the updated patch with extra white spaces fixed I am checking in.
--
H.J.
[-- Attachment #2: 0002-strchr-Add-additional-benchmarks-and-tests.patch --]
[-- Type: text/x-patch, Size: 3895 bytes --]
From a00e2fe3dfd3a4e218ba6c1c3445ee68322ddda9 Mon Sep 17 00:00:00 2001
From: noah <goldstein.w.n@gmail.com>
Date: Wed, 3 Feb 2021 00:39:00 -0500
Subject: [PATCH 2/2] strchr: Add additional benchmarks and tests
This patch adds additional benchmarks and tests for string size of
4096 and several benchmarks for string size 256 with different
alignments.
---
benchtests/bench-strchr.c | 26 +++++++++++++++++++++++++-
string/test-strchr.c | 26 +++++++++++++++++++++++++-
2 files changed, 50 insertions(+), 2 deletions(-)
diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
index bf493fe458..4ce2369d9b 100644
--- a/benchtests/bench-strchr.c
+++ b/benchtests/bench-strchr.c
@@ -100,7 +100,7 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
size_t i;
CHAR *result;
CHAR *buf = (CHAR *) buf1;
- align &= 15;
+ align &= 127;
if ((align + len) * sizeof (CHAR) >= page_size)
return;
@@ -151,12 +151,24 @@ test_main (void)
do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
}
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+ }
+
for (i = 1; i < 8; ++i)
{
do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
}
+ for (i = 0; i < 8; ++i)
+ {
+ do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
+ }
+
for (i = 0; i < 32; ++i)
{
do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
@@ -169,12 +181,24 @@ test_main (void)
do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
}
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
+ do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
+ }
+
for (i = 1; i < 8; ++i)
{
do_test (i, 64, 256, 0, MIDDLE_CHAR);
do_test (i, 64, 256, 0, BIG_CHAR);
}
+ for (i = 0; i < 8; ++i)
+ {
+ do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
+ do_test (16 * i, 256, 512, 0, BIG_CHAR);
+ }
+
for (i = 0; i < 32; ++i)
{
do_test (0, i, i + 1, 0, MIDDLE_CHAR);
diff --git a/string/test-strchr.c b/string/test-strchr.c
index 5b6022746c..6c8ca54a7d 100644
--- a/string/test-strchr.c
+++ b/string/test-strchr.c
@@ -130,7 +130,7 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
size_t i;
CHAR *result;
CHAR *buf = (CHAR *) buf1;
- align &= 15;
+ align &= 127;
if ((align + len) * sizeof (CHAR) >= page_size)
return;
@@ -259,12 +259,24 @@ test_main (void)
do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
}
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+ }
+
for (i = 1; i < 8; ++i)
{
do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
}
+ for (i = 0; i < 8; ++i)
+ {
+ do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
+ }
+
for (i = 0; i < 32; ++i)
{
do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
@@ -277,12 +289,24 @@ test_main (void)
do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
}
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
+ do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
+ }
+
for (i = 1; i < 8; ++i)
{
do_test (i, 64, 256, 0, MIDDLE_CHAR);
do_test (i, 64, 256, 0, BIG_CHAR);
}
+ for (i = 0; i < 8; ++i)
+ {
+ do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
+ do_test (16 * i, 256, 512, 0, BIG_CHAR);
+ }
+
for (i = 0; i < 32; ++i)
{
do_test (0, i, i + 1, 0, MIDDLE_CHAR);
--
2.29.2
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S
2021-02-08 19:33 ` H.J. Lu
@ 2021-02-08 19:48 ` Noah Goldstein
2021-02-08 20:57 ` Noah Goldstein
0 siblings, 1 reply; 10+ messages in thread
From: Noah Goldstein @ 2021-02-08 19:48 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Mon, Feb 8, 2021 at 2:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Feb 8, 2021 at 6:08 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Feb 2, 2021 at 9:39 PM <goldstein.w.n@gmail.com> wrote:
> > >
> > > From: noah <goldstein.w.n@gmail.com>
> > >
> > > No bug. Just seemed the performance could be improved a bit. Observed
> > > and expected behavior are unchanged. Optimized body of main
> > > loop. Updated page cross logic and optimized accordingly. Made a few
> > > minor instruction selection modifications. No regressions in test
> > > suite. Both test-strchrnul and test-strchr passed.
> > >
> > > Signed-off-by: noah <goldstein.w.n@gmail.com>
> > > ---
> > > sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
> > > sysdeps/x86_64/multiarch/strchr.c | 1 +
> > > 2 files changed, 118 insertions(+), 118 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > index d416558d04..8b9d78b55a 100644
> > > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > @@ -27,10 +27,12 @@
> > > # ifdef USE_AS_WCSCHR
> > > # define VPBROADCAST vpbroadcastd
> > > # define VPCMPEQ vpcmpeqd
> > > +# define VPMINU vpminud
> > > # define CHAR_REG esi
> > > # else
> > > # define VPBROADCAST vpbroadcastb
> > > # define VPCMPEQ vpcmpeqb
> > > +# define VPMINU vpminub
> > > # define CHAR_REG sil
> > > # endif
> > >
> > > @@ -39,20 +41,26 @@
> > > # endif
> > >
> > > # define VEC_SIZE 32
> > > +# define PAGE_SIZE 4096
> > >
> > > .section .text.avx,"ax",@progbits
> > > ENTRY (STRCHR)
> > > movl %edi, %ecx
> > > - /* Broadcast CHAR to YMM0. */
> > > +# ifndef USE_AS_STRCHRNUL
> > > + xorl %edx, %edx
> > > +# endif
> > > +
> > > + /* Broadcast CHAR to YMM0. */
> > > vmovd %esi, %xmm0
> > > vpxor %xmm9, %xmm9, %xmm9
> > > VPBROADCAST %xmm0, %ymm0
> > > - /* Check if we may cross page boundary with one vector load. */
> > > - andl $(2 * VEC_SIZE - 1), %ecx
> > > - cmpl $VEC_SIZE, %ecx
> > > - ja L(cros_page_boundary)
> > > -
> > > - /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> > > +
> > > + /* Check if we cross page boundary with one vector load. */
> > > + andl $(PAGE_SIZE - 1), %ecx
> > > + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> > > + ja L(cross_page_boundary)
> > > +
> > > + /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> > > null byte. */
> > > vmovdqu (%rdi), %ymm8
> > > VPCMPEQ %ymm8, %ymm0, %ymm1
> > > @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> > > vpor %ymm1, %ymm2, %ymm1
> > > vpmovmskb %ymm1, %eax
> > > testl %eax, %eax
> > > - jnz L(first_vec_x0)
> > > -
> > > - /* Align data for aligned loads in the loop. */
> > > - addq $VEC_SIZE, %rdi
> > > - andl $(VEC_SIZE - 1), %ecx
> > > - andq $-VEC_SIZE, %rdi
> > > -
> > > - jmp L(more_4x_vec)
> > > -
> > > - .p2align 4
> > > -L(cros_page_boundary):
> > > - andl $(VEC_SIZE - 1), %ecx
> > > - andq $-VEC_SIZE, %rdi
> > > - vmovdqu (%rdi), %ymm8
> > > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > > - vpor %ymm1, %ymm2, %ymm1
> > > - vpmovmskb %ymm1, %eax
> > > - /* Remove the leading bytes. */
> > > - sarl %cl, %eax
> > > - testl %eax, %eax
> > > - jz L(aligned_more)
> > > - /* Found CHAR or the null byte. */
> > > + jz L(more_vecs)
> > > tzcntl %eax, %eax
> > > - addq %rcx, %rax
> > > -# ifdef USE_AS_STRCHRNUL
> > > + /* Found CHAR or the null byte. */
> > > addq %rdi, %rax
> > > -# else
> > > - xorl %edx, %edx
> > > - leaq (%rdi, %rax), %rax
> > > - cmp (%rax), %CHAR_REG
> > > +# ifndef USE_AS_STRCHRNUL
> > > + cmp (%rax), %CHAR_REG
> > > cmovne %rdx, %rax
> > > # endif
> > > VZEROUPPER
> > > ret
> > >
> > > .p2align 4
> > > +L(more_vecs):
> > > + /* Align data for aligned loads in the loop. */
> > > + andq $-VEC_SIZE, %rdi
> > > L(aligned_more):
> > > - addq $VEC_SIZE, %rdi
> > >
> > > -L(more_4x_vec):
> > > - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > > - since data is only aligned to VEC_SIZE. */
> > > - vmovdqa (%rdi), %ymm8
> > > + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > > + since data is only aligned to VEC_SIZE. */
> > > + vmovdqa VEC_SIZE(%rdi), %ymm8
> > > + addq $VEC_SIZE, %rdi
> > > VPCMPEQ %ymm8, %ymm0, %ymm1
> > > VPCMPEQ %ymm8, %ymm9, %ymm2
> > > vpor %ymm1, %ymm2, %ymm1
> > > @@ -125,7 +110,7 @@ L(more_4x_vec):
> > > vpor %ymm1, %ymm2, %ymm1
> > > vpmovmskb %ymm1, %eax
> > > testl %eax, %eax
> > > - jnz L(first_vec_x2)
> > > + jnz L(first_vec_x2)
> > >
> > > vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > VPCMPEQ %ymm8, %ymm0, %ymm1
> > > @@ -133,122 +118,136 @@ L(more_4x_vec):
> > > vpor %ymm1, %ymm2, %ymm1
> > > vpmovmskb %ymm1, %eax
> > > testl %eax, %eax
> > > - jnz L(first_vec_x3)
> > > -
> > > - addq $(VEC_SIZE * 4), %rdi
> > > -
> > > - /* Align data to 4 * VEC_SIZE. */
> > > - movq %rdi, %rcx
> > > - andl $(4 * VEC_SIZE - 1), %ecx
> > > - andq $-(4 * VEC_SIZE), %rdi
> > > -
> > > - .p2align 4
> > > -L(loop_4x_vec):
> > > - /* Compare 4 * VEC at a time forward. */
> > > - vmovdqa (%rdi), %ymm5
> > > - vmovdqa VEC_SIZE(%rdi), %ymm6
> > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > -
> > > - VPCMPEQ %ymm5, %ymm0, %ymm1
> > > - VPCMPEQ %ymm6, %ymm0, %ymm2
> > > - VPCMPEQ %ymm7, %ymm0, %ymm3
> > > - VPCMPEQ %ymm8, %ymm0, %ymm4
> > > -
> > > - VPCMPEQ %ymm5, %ymm9, %ymm5
> > > - VPCMPEQ %ymm6, %ymm9, %ymm6
> > > - VPCMPEQ %ymm7, %ymm9, %ymm7
> > > - VPCMPEQ %ymm8, %ymm9, %ymm8
> > > -
> > > - vpor %ymm1, %ymm5, %ymm1
> > > - vpor %ymm2, %ymm6, %ymm2
> > > - vpor %ymm3, %ymm7, %ymm3
> > > - vpor %ymm4, %ymm8, %ymm4
> > > -
> > > - vpor %ymm1, %ymm2, %ymm5
> > > - vpor %ymm3, %ymm4, %ymm6
> > > -
> > > - vpor %ymm5, %ymm6, %ymm5
> > > -
> > > - vpmovmskb %ymm5, %eax
> > > - testl %eax, %eax
> > > - jnz L(4x_vec_end)
> > > -
> > > - addq $(VEC_SIZE * 4), %rdi
> > > + jz L(prep_loop_4x)
> > >
> > > - jmp L(loop_4x_vec)
> > > + tzcntl %eax, %eax
> > > + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > +# ifndef USE_AS_STRCHRNUL
> > > + cmp (%rax), %CHAR_REG
> > > + cmovne %rdx, %rax
> > > +# endif
> > > + VZEROUPPER
> > > + ret
> > >
> > > .p2align 4
> > > L(first_vec_x0):
> > > - /* Found CHAR or the null byte. */
> > > tzcntl %eax, %eax
> > > -# ifdef USE_AS_STRCHRNUL
> > > + /* Found CHAR or the null byte. */
> > > addq %rdi, %rax
> > > -# else
> > > - xorl %edx, %edx
> > > - leaq (%rdi, %rax), %rax
> > > - cmp (%rax), %CHAR_REG
> > > +# ifndef USE_AS_STRCHRNUL
> > > + cmp (%rax), %CHAR_REG
> > > cmovne %rdx, %rax
> > > # endif
> > > VZEROUPPER
> > > ret
> > > -
> > > +
> > > .p2align 4
> > > L(first_vec_x1):
> > > tzcntl %eax, %eax
> > > -# ifdef USE_AS_STRCHRNUL
> > > - addq $VEC_SIZE, %rax
> > > - addq %rdi, %rax
> > > -# else
> > > - xorl %edx, %edx
> > > leaq VEC_SIZE(%rdi, %rax), %rax
> > > - cmp (%rax), %CHAR_REG
> > > +# ifndef USE_AS_STRCHRNUL
> > > + cmp (%rax), %CHAR_REG
> > > cmovne %rdx, %rax
> > > # endif
> > > VZEROUPPER
> > > - ret
> > > -
> > > + ret
> > > +
> > > .p2align 4
> > > L(first_vec_x2):
> > > tzcntl %eax, %eax
> > > -# ifdef USE_AS_STRCHRNUL
> > > - addq $(VEC_SIZE * 2), %rax
> > > - addq %rdi, %rax
> > > -# else
> > > - xorl %edx, %edx
> > > + /* Found CHAR or the null byte. */
> > > leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > - cmp (%rax), %CHAR_REG
> > > +# ifndef USE_AS_STRCHRNUL
> > > + cmp (%rax), %CHAR_REG
> > > cmovne %rdx, %rax
> > > # endif
> > > VZEROUPPER
> > > ret
> > > +
> > > +L(prep_loop_4x):
> > > + /* Align data to 4 * VEC_SIZE. */
> > > + andq $-(VEC_SIZE * 4), %rdi
> > >
> > > .p2align 4
> > > -L(4x_vec_end):
> > > +L(loop_4x_vec):
> > > + /* Compare 4 * VEC at a time forward. */
> > > + vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> > > + vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> > > + vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> > > + vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> > > +
> > > + /* Leaves only CHARS matching esi as 0. */
> > > + vpxor %ymm5, %ymm0, %ymm1
> > > + vpxor %ymm6, %ymm0, %ymm2
> > > + vpxor %ymm7, %ymm0, %ymm3
> > > + vpxor %ymm8, %ymm0, %ymm4
> > > +
> > > + VPMINU %ymm1, %ymm5, %ymm1
> > > + VPMINU %ymm2, %ymm6, %ymm2
> > > + VPMINU %ymm3, %ymm7, %ymm3
> > > + VPMINU %ymm4, %ymm8, %ymm4
> > > +
> > > + VPMINU %ymm1, %ymm2, %ymm5
> > > + VPMINU %ymm3, %ymm4, %ymm6
> > > +
> > > + VPMINU %ymm5, %ymm6, %ymm5
> > > +
> > > + VPCMPEQ %ymm5, %ymm9, %ymm5
> > > + vpmovmskb %ymm5, %eax
> > > +
> > > + addq $(VEC_SIZE * 4), %rdi
> > > + testl %eax, %eax
> > > + jz L(loop_4x_vec)
> > > +
> > > + VPCMPEQ %ymm1, %ymm9, %ymm1
> > > vpmovmskb %ymm1, %eax
> > > testl %eax, %eax
> > > jnz L(first_vec_x0)
> > > +
> > > + VPCMPEQ %ymm2, %ymm9, %ymm2
> > > vpmovmskb %ymm2, %eax
> > > testl %eax, %eax
> > > jnz L(first_vec_x1)
> > > - vpmovmskb %ymm3, %eax
> > > - testl %eax, %eax
> > > - jnz L(first_vec_x2)
> > > +
> > > + VPCMPEQ %ymm3, %ymm9, %ymm3
> > > + VPCMPEQ %ymm4, %ymm9, %ymm4
> > > + vpmovmskb %ymm3, %ecx
> > > vpmovmskb %ymm4, %eax
> > > + salq $32, %rax
> > > + orq %rcx, %rax
> > > + tzcntq %rax, %rax
> > > + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > +# ifndef USE_AS_STRCHRNUL
> > > + cmp (%rax), %CHAR_REG
> > > + cmovne %rdx, %rax
> > > +# endif
> > > + VZEROUPPER
> > > + ret
> > > +
> > > + /* Cold case for crossing page with first load. */
> > > + .p2align 4
> > > +L(cross_page_boundary):
> > > + andq $-VEC_SIZE, %rdi
> > > + andl $(VEC_SIZE - 1), %ecx
> > > +
> > > + vmovdqa (%rdi), %ymm8
> > > + VPCMPEQ %ymm8, %ymm0, %ymm1
> > > + VPCMPEQ %ymm8, %ymm9, %ymm2
> > > + vpor %ymm1, %ymm2, %ymm1
> > > + vpmovmskb %ymm1, %eax
> > > + /* Remove the leading bits. */
> > > + sarxl %ecx, %eax, %eax
> > > testl %eax, %eax
> > > -L(first_vec_x3):
> > > + jz L(aligned_more)
> > > tzcntl %eax, %eax
> > > -# ifdef USE_AS_STRCHRNUL
> > > - addq $(VEC_SIZE * 3), %rax
> > > + addq %rcx, %rdi
> > > addq %rdi, %rax
> > > -# else
> > > - xorl %edx, %edx
> > > - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > - cmp (%rax), %CHAR_REG
> > > +# ifndef USE_AS_STRCHRNUL
> > > + cmp (%rax), %CHAR_REG
> > > cmovne %rdx, %rax
> > > # endif
> > > VZEROUPPER
> > > ret
> > >
> > > END (STRCHR)
> > > -#endif
> > > +# endif
> > > diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> > > index 583a152794..4dfbe3b58b 100644
> > > --- a/sysdeps/x86_64/multiarch/strchr.c
> > > +++ b/sysdeps/x86_64/multiarch/strchr.c
> > > @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
> > >
> > > if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> > > && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> > > && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > return OPTIMIZE (avx2);
> > >
> > > --
> > > 2.29.2
> > >
> >
> > LGTM.
> >
> > Thanks.
> >
>
> This is the updated patch with extra white spaces fixed I am checking in.
>
> --
> H.J.
Awesome! Thanks!
N.G.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v4 2/2] x86: Add additional benchmarks and tests for strchr
2021-02-08 19:34 ` H.J. Lu
@ 2021-02-08 19:49 ` Noah Goldstein
0 siblings, 0 replies; 10+ messages in thread
From: Noah Goldstein @ 2021-02-08 19:49 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Mon, Feb 8, 2021 at 2:35 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Feb 8, 2021 at 6:08 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Feb 2, 2021 at 9:39 PM <goldstein.w.n@gmail.com> wrote:
> > >
> > > From: noah <goldstein.w.n@gmail.com>
> > >
> > > This patch adds additional benchmarks and tests for string size of
> > > 4096 and several benchmarks for string size 256 with different
> > > alignments.
> > >
> > > Signed-off-by: noah <goldstein.w.n@gmail.com>
> > > ---
> > > Added 2 additional benchmark and test sizes:
> > >
> > > 4096: Just feels like a natural "large" size to test
> > >
> > > 256 with multiple alignments: This essentially is to test how
> > > expensive the initial work prior to the 4x loop is depending on
> > > different alignments.
> > >
> > > results from bench-strchr: All times are in seconds and the median of
> > > 100 runs. Old is current strchr-avx2.S implementation. New is this
> > > patch.
> > >
> > > Summary: New is definitely faster for medium -> large sizes. Once the
> > > 4x loop is hit there is a 10%+ speedup and New always wins out. For
> > > smaller sizes there is more variance as to which is faster and the
> > > differences are small. Generally it seems the New version wins
> > > out. This is likely because 0 - 31 sized strings are the fast path for
> > > new (no jmp). Also something that is neat is the significant
> > > performance improved for alignment 96 and 112. This is because the 5x
> > > vectors before 4x loop really favor that alignment.
> > >
> > > Benchmarking CPU:
> > > Icelake: Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz
> > >
> > > size, algn, Old T , New T -------- Win Dif
> > > 0 , 0 , 2.54 , 2.52 -------- New -0.02
> > > 1 , 0 , 2.57 , 2.52 -------- New -0.05
> > > 2 , 0 , 2.56 , 2.52 -------- New -0.04
> > > 3 , 0 , 2.58 , 2.54 -------- New -0.04
> > > 4 , 0 , 2.61 , 2.55 -------- New -0.06
> > > 5 , 0 , 2.65 , 2.62 -------- New -0.03
> > > 6 , 0 , 2.73 , 2.74 -------- Old -0.01
> > > 7 , 0 , 2.75 , 2.74 -------- New -0.01
> > > 8 , 0 , 2.62 , 2.6 -------- New -0.02
> > > 9 , 0 , 2.73 , 2.75 -------- Old -0.02
> > > 10 , 0 , 2.74 , 2.74 -------- Eq N/A
> > > 11 , 0 , 2.76 , 2.72 -------- New -0.04
> > > 12 , 0 , 2.74 , 2.72 -------- New -0.02
> > > 13 , 0 , 2.75 , 2.72 -------- New -0.03
> > > 14 , 0 , 2.74 , 2.73 -------- New -0.01
> > > 15 , 0 , 2.74 , 2.73 -------- New -0.01
> > > 16 , 0 , 2.74 , 2.73 -------- New -0.01
> > > 17 , 0 , 2.74 , 2.74 -------- Eq N/A
> > > 18 , 0 , 2.73 , 2.73 -------- Eq N/A
> > > 19 , 0 , 2.73 , 2.73 -------- Eq N/A
> > > 20 , 0 , 2.73 , 2.73 -------- Eq N/A
> > > 21 , 0 , 2.73 , 2.72 -------- New -0.01
> > > 22 , 0 , 2.71 , 2.74 -------- Old -0.03
> > > 23 , 0 , 2.71 , 2.69 -------- New -0.02
> > > 24 , 0 , 2.68 , 2.67 -------- New -0.01
> > > 25 , 0 , 2.66 , 2.62 -------- New -0.04
> > > 26 , 0 , 2.64 , 2.62 -------- New -0.02
> > > 27 , 0 , 2.71 , 2.64 -------- New -0.07
> > > 28 , 0 , 2.67 , 2.69 -------- Old -0.02
> > > 29 , 0 , 2.72 , 2.72 -------- Eq N/A
> > > 30 , 0 , 2.68 , 2.69 -------- Old -0.01
> > > 31 , 0 , 2.68 , 2.68 -------- Eq N/A
> > > 32 , 0 , 3.51 , 3.52 -------- Old -0.01
> > > 32 , 1 , 3.52 , 3.51 -------- New -0.01
> > > 64 , 0 , 3.97 , 3.93 -------- New -0.04
> > > 64 , 2 , 3.95 , 3.9 -------- New -0.05
> > > 64 , 1 , 4.0 , 3.93 -------- New -0.07
> > > 64 , 3 , 3.97 , 3.88 -------- New -0.09
> > > 64 , 4 , 3.95 , 3.89 -------- New -0.06
> > > 64 , 5 , 3.94 , 3.9 -------- New -0.04
> > > 64 , 6 , 3.97 , 3.9 -------- New -0.07
> > > 64 , 7 , 3.97 , 3.91 -------- New -0.06
> > > 96 , 0 , 4.74 , 4.52 -------- New -0.22
> > > 128 , 0 , 5.29 , 5.19 -------- New -0.1
> > > 128 , 2 , 5.29 , 5.15 -------- New -0.14
> > > 128 , 3 , 5.31 , 5.22 -------- New -0.09
> > > 256 , 0 , 11.19 , 9.81 -------- New -1.38
> > > 256 , 3 , 11.19 , 9.84 -------- New -1.35
> > > 256 , 4 , 11.2 , 9.88 -------- New -1.32
> > > 256 , 16 , 11.21 , 9.79 -------- New -1.42
> > > 256 , 32 , 11.39 , 10.34 -------- New -1.05
> > > 256 , 48 , 11.88 , 10.56 -------- New -1.32
> > > 256 , 64 , 11.82 , 10.83 -------- New -0.99
> > > 256 , 80 , 11.85 , 10.86 -------- New -0.99
> > > 256 , 96 , 9.56 , 8.76 -------- New -0.8
> > > 256 , 112 , 9.55 , 8.9 -------- New -0.65
> > > 512 , 0 , 15.76 , 13.72 -------- New -2.04
> > > 512 , 4 , 15.72 , 13.74 -------- New -1.98
> > > 512 , 5 , 15.73 , 13.74 -------- New -1.99
> > > 1024, 0 , 24.85 , 21.33 -------- New -3.52
> > > 1024, 5 , 24.86 , 21.27 -------- New -3.59
> > > 1024, 6 , 24.87 , 21.32 -------- New -3.55
> > > 2048, 0 , 45.75 , 36.7 -------- New -9.05
> > > 2048, 6 , 43.91 , 35.42 -------- New -8.49
> > > 2048, 7 , 44.43 , 36.37 -------- New -8.06
> > > 4096, 0 , 96.94 , 81.34 -------- New -15.6
> > > 4096, 7 , 97.01 , 81.32 -------- New -15.69
> > >
> > > benchtests/bench-strchr.c | 26 +++++++++++++++++++++++++-
> > > string/test-strchr.c | 26 +++++++++++++++++++++++++-
> > > 2 files changed, 50 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
> > > index bf493fe458..4ce2369d9b 100644
> > > --- a/benchtests/bench-strchr.c
> > > +++ b/benchtests/bench-strchr.c
> > > @@ -100,7 +100,7 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > > size_t i;
> > > CHAR *result;
> > > CHAR *buf = (CHAR *) buf1;
> > > - align &= 15;
> > > + align &= 127;
> > > if ((align + len) * sizeof (CHAR) >= page_size)
> > > return;
> > >
> > > @@ -151,12 +151,24 @@ test_main (void)
> > > do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> > > }
> > >
> > > + for (i = 1; i < 8; ++i)
> > > + {
> > > + do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> > > + do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> > > + }
> > > +
> > > for (i = 1; i < 8; ++i)
> > > {
> > > do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
> > > do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
> > > }
> > >
> > > + for (i = 0; i < 8; ++i)
> > > + {
> > > + do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
> > > + do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
> > > + }
> > > +
> > > for (i = 0; i < 32; ++i)
> > > {
> > > do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
> > > @@ -169,12 +181,24 @@ test_main (void)
> > > do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
> > > }
> > >
> > > + for (i = 1; i < 8; ++i)
> > > + {
> > > + do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
> > > + do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
> > > + }
> > > +
> > > for (i = 1; i < 8; ++i)
> > > {
> > > do_test (i, 64, 256, 0, MIDDLE_CHAR);
> > > do_test (i, 64, 256, 0, BIG_CHAR);
> > > }
> > >
> > > + for (i = 0; i < 8; ++i)
> > > + {
> > > + do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
> > > + do_test (16 * i, 256, 512, 0, BIG_CHAR);
> > > + }
> > > +
> > > for (i = 0; i < 32; ++i)
> > > {
> > > do_test (0, i, i + 1, 0, MIDDLE_CHAR);
> > > diff --git a/string/test-strchr.c b/string/test-strchr.c
> > > index 5b6022746c..2cf4ea2add 100644
> > > --- a/string/test-strchr.c
> > > +++ b/string/test-strchr.c
> > > @@ -130,7 +130,7 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> > > size_t i;
> > > CHAR *result;
> > > CHAR *buf = (CHAR *) buf1;
> > > - align &= 15;
> > > + align &= 127;
> > > if ((align + len) * sizeof (CHAR) >= page_size)
> > > return;
> > >
> > > @@ -259,12 +259,24 @@ test_main (void)
> > > do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> > > }
> > >
> > > + for (i = 1; i < 8; ++i)
> > > + {
> > > + do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> > > + do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> > > + }
> > > +
> > > for (i = 1; i < 8; ++i)
> > > {
> > > do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
> > > do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
> > > }
> > >
> > > + for (i = 0; i < 8; ++i)
> > > + {
> > > + do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
> > > + do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
> > > + }
> > > +
> > > for (i = 0; i < 32; ++i)
> > > {
> > > do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
> > > @@ -277,12 +289,24 @@ test_main (void)
> > > do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
> > > }
> > >
> > > + for (i = 1; i < 8; ++i)
> > > + {
> > > + do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
> > > + do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
> > > + }
> > > +
> > > for (i = 1; i < 8; ++i)
> > > {
> > > do_test (i, 64, 256, 0, MIDDLE_CHAR);
> > > do_test (i, 64, 256, 0, BIG_CHAR);
> > > }
> > >
> > > + for (i = 0; i < 8; ++i)
> > > + {
> > > + do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
> > > + do_test (16 * i, 256, 512, 0, BIG_CHAR);
> > > + }
> > > +
> > > for (i = 0; i < 32; ++i)
> > > {
> > > do_test (0, i, i + 1, 0, MIDDLE_CHAR);
> > > --
> > > 2.29.2
> > >
> >
> > LGTM.
> >
> > Thanks.
> >
> > --
> > H.J.
>
> This is the updated patch with extra white spaces fixed I am checking in.
>
>
> --
> H.J.
Awesome! Thanks!
N.G.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S
2021-02-08 19:48 ` Noah Goldstein
@ 2021-02-08 20:57 ` Noah Goldstein
2022-04-27 23:43 ` Sunil Pandey
0 siblings, 1 reply; 10+ messages in thread
From: Noah Goldstein @ 2021-02-08 20:57 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Mon, Feb 8, 2021 at 2:48 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Feb 8, 2021 at 2:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Feb 8, 2021 at 6:08 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Tue, Feb 2, 2021 at 9:39 PM <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > From: noah <goldstein.w.n@gmail.com>
> > > >
> > > > No bug. Just seemed the performance could be improved a bit. Observed
> > > > and expected behavior are unchanged. Optimized body of main
> > > > loop. Updated page cross logic and optimized accordingly. Made a few
> > > > minor instruction selection modifications. No regressions in test
> > > > suite. Both test-strchrnul and test-strchr passed.
> > > >
> > > > Signed-off-by: noah <goldstein.w.n@gmail.com>
> > > > ---
> > > > sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
> > > > sysdeps/x86_64/multiarch/strchr.c | 1 +
> > > > 2 files changed, 118 insertions(+), 118 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > index d416558d04..8b9d78b55a 100644
> > > > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > @@ -27,10 +27,12 @@
> > > > # ifdef USE_AS_WCSCHR
> > > > # define VPBROADCAST vpbroadcastd
> > > > # define VPCMPEQ vpcmpeqd
> > > > +# define VPMINU vpminud
> > > > # define CHAR_REG esi
> > > > # else
> > > > # define VPBROADCAST vpbroadcastb
> > > > # define VPCMPEQ vpcmpeqb
> > > > +# define VPMINU vpminub
> > > > # define CHAR_REG sil
> > > > # endif
> > > >
> > > > @@ -39,20 +41,26 @@
> > > > # endif
> > > >
> > > > # define VEC_SIZE 32
> > > > +# define PAGE_SIZE 4096
> > > >
> > > > .section .text.avx,"ax",@progbits
> > > > ENTRY (STRCHR)
> > > > movl %edi, %ecx
> > > > - /* Broadcast CHAR to YMM0. */
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > + xorl %edx, %edx
> > > > +# endif
> > > > +
> > > > + /* Broadcast CHAR to YMM0. */
> > > > vmovd %esi, %xmm0
> > > > vpxor %xmm9, %xmm9, %xmm9
> > > > VPBROADCAST %xmm0, %ymm0
> > > > - /* Check if we may cross page boundary with one vector load. */
> > > > - andl $(2 * VEC_SIZE - 1), %ecx
> > > > - cmpl $VEC_SIZE, %ecx
> > > > - ja L(cros_page_boundary)
> > > > -
> > > > - /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> > > > +
> > > > + /* Check if we cross page boundary with one vector load. */
> > > > + andl $(PAGE_SIZE - 1), %ecx
> > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> > > > + ja L(cross_page_boundary)
> > > > +
> > > > + /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> > > > null byte. */
> > > > vmovdqu (%rdi), %ymm8
> > > > VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> > > > vpor %ymm1, %ymm2, %ymm1
> > > > vpmovmskb %ymm1, %eax
> > > > testl %eax, %eax
> > > > - jnz L(first_vec_x0)
> > > > -
> > > > - /* Align data for aligned loads in the loop. */
> > > > - addq $VEC_SIZE, %rdi
> > > > - andl $(VEC_SIZE - 1), %ecx
> > > > - andq $-VEC_SIZE, %rdi
> > > > -
> > > > - jmp L(more_4x_vec)
> > > > -
> > > > - .p2align 4
> > > > -L(cros_page_boundary):
> > > > - andl $(VEC_SIZE - 1), %ecx
> > > > - andq $-VEC_SIZE, %rdi
> > > > - vmovdqu (%rdi), %ymm8
> > > > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > - vpor %ymm1, %ymm2, %ymm1
> > > > - vpmovmskb %ymm1, %eax
> > > > - /* Remove the leading bytes. */
> > > > - sarl %cl, %eax
> > > > - testl %eax, %eax
> > > > - jz L(aligned_more)
> > > > - /* Found CHAR or the null byte. */
> > > > + jz L(more_vecs)
> > > > tzcntl %eax, %eax
> > > > - addq %rcx, %rax
> > > > -# ifdef USE_AS_STRCHRNUL
> > > > + /* Found CHAR or the null byte. */
> > > > addq %rdi, %rax
> > > > -# else
> > > > - xorl %edx, %edx
> > > > - leaq (%rdi, %rax), %rax
> > > > - cmp (%rax), %CHAR_REG
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > + cmp (%rax), %CHAR_REG
> > > > cmovne %rdx, %rax
> > > > # endif
> > > > VZEROUPPER
> > > > ret
> > > >
> > > > .p2align 4
> > > > +L(more_vecs):
> > > > + /* Align data for aligned loads in the loop. */
> > > > + andq $-VEC_SIZE, %rdi
> > > > L(aligned_more):
> > > > - addq $VEC_SIZE, %rdi
> > > >
> > > > -L(more_4x_vec):
> > > > - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > > > - since data is only aligned to VEC_SIZE. */
> > > > - vmovdqa (%rdi), %ymm8
> > > > + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > > > + since data is only aligned to VEC_SIZE. */
> > > > + vmovdqa VEC_SIZE(%rdi), %ymm8
> > > > + addq $VEC_SIZE, %rdi
> > > > VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > vpor %ymm1, %ymm2, %ymm1
> > > > @@ -125,7 +110,7 @@ L(more_4x_vec):
> > > > vpor %ymm1, %ymm2, %ymm1
> > > > vpmovmskb %ymm1, %eax
> > > > testl %eax, %eax
> > > > - jnz L(first_vec_x2)
> > > > + jnz L(first_vec_x2)
> > > >
> > > > vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > > VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > @@ -133,122 +118,136 @@ L(more_4x_vec):
> > > > vpor %ymm1, %ymm2, %ymm1
> > > > vpmovmskb %ymm1, %eax
> > > > testl %eax, %eax
> > > > - jnz L(first_vec_x3)
> > > > -
> > > > - addq $(VEC_SIZE * 4), %rdi
> > > > -
> > > > - /* Align data to 4 * VEC_SIZE. */
> > > > - movq %rdi, %rcx
> > > > - andl $(4 * VEC_SIZE - 1), %ecx
> > > > - andq $-(4 * VEC_SIZE), %rdi
> > > > -
> > > > - .p2align 4
> > > > -L(loop_4x_vec):
> > > > - /* Compare 4 * VEC at a time forward. */
> > > > - vmovdqa (%rdi), %ymm5
> > > > - vmovdqa VEC_SIZE(%rdi), %ymm6
> > > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > > -
> > > > - VPCMPEQ %ymm5, %ymm0, %ymm1
> > > > - VPCMPEQ %ymm6, %ymm0, %ymm2
> > > > - VPCMPEQ %ymm7, %ymm0, %ymm3
> > > > - VPCMPEQ %ymm8, %ymm0, %ymm4
> > > > -
> > > > - VPCMPEQ %ymm5, %ymm9, %ymm5
> > > > - VPCMPEQ %ymm6, %ymm9, %ymm6
> > > > - VPCMPEQ %ymm7, %ymm9, %ymm7
> > > > - VPCMPEQ %ymm8, %ymm9, %ymm8
> > > > -
> > > > - vpor %ymm1, %ymm5, %ymm1
> > > > - vpor %ymm2, %ymm6, %ymm2
> > > > - vpor %ymm3, %ymm7, %ymm3
> > > > - vpor %ymm4, %ymm8, %ymm4
> > > > -
> > > > - vpor %ymm1, %ymm2, %ymm5
> > > > - vpor %ymm3, %ymm4, %ymm6
> > > > -
> > > > - vpor %ymm5, %ymm6, %ymm5
> > > > -
> > > > - vpmovmskb %ymm5, %eax
> > > > - testl %eax, %eax
> > > > - jnz L(4x_vec_end)
> > > > -
> > > > - addq $(VEC_SIZE * 4), %rdi
> > > > + jz L(prep_loop_4x)
> > > >
> > > > - jmp L(loop_4x_vec)
> > > > + tzcntl %eax, %eax
> > > > + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > + cmp (%rax), %CHAR_REG
> > > > + cmovne %rdx, %rax
> > > > +# endif
> > > > + VZEROUPPER
> > > > + ret
> > > >
> > > > .p2align 4
> > > > L(first_vec_x0):
> > > > - /* Found CHAR or the null byte. */
> > > > tzcntl %eax, %eax
> > > > -# ifdef USE_AS_STRCHRNUL
> > > > + /* Found CHAR or the null byte. */
> > > > addq %rdi, %rax
> > > > -# else
> > > > - xorl %edx, %edx
> > > > - leaq (%rdi, %rax), %rax
> > > > - cmp (%rax), %CHAR_REG
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > + cmp (%rax), %CHAR_REG
> > > > cmovne %rdx, %rax
> > > > # endif
> > > > VZEROUPPER
> > > > ret
> > > > -
> > > > +
> > > > .p2align 4
> > > > L(first_vec_x1):
> > > > tzcntl %eax, %eax
> > > > -# ifdef USE_AS_STRCHRNUL
> > > > - addq $VEC_SIZE, %rax
> > > > - addq %rdi, %rax
> > > > -# else
> > > > - xorl %edx, %edx
> > > > leaq VEC_SIZE(%rdi, %rax), %rax
> > > > - cmp (%rax), %CHAR_REG
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > + cmp (%rax), %CHAR_REG
> > > > cmovne %rdx, %rax
> > > > # endif
> > > > VZEROUPPER
> > > > - ret
> > > > -
> > > > + ret
> > > > +
> > > > .p2align 4
> > > > L(first_vec_x2):
> > > > tzcntl %eax, %eax
> > > > -# ifdef USE_AS_STRCHRNUL
> > > > - addq $(VEC_SIZE * 2), %rax
> > > > - addq %rdi, %rax
> > > > -# else
> > > > - xorl %edx, %edx
> > > > + /* Found CHAR or the null byte. */
> > > > leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > - cmp (%rax), %CHAR_REG
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > + cmp (%rax), %CHAR_REG
> > > > cmovne %rdx, %rax
> > > > # endif
> > > > VZEROUPPER
> > > > ret
> > > > +
> > > > +L(prep_loop_4x):
> > > > + /* Align data to 4 * VEC_SIZE. */
> > > > + andq $-(VEC_SIZE * 4), %rdi
> > > >
> > > > .p2align 4
> > > > -L(4x_vec_end):
> > > > +L(loop_4x_vec):
> > > > + /* Compare 4 * VEC at a time forward. */
> > > > + vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> > > > + vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> > > > + vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> > > > + vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> > > > +
> > > > + /* Leaves only CHARS matching esi as 0. */
> > > > + vpxor %ymm5, %ymm0, %ymm1
> > > > + vpxor %ymm6, %ymm0, %ymm2
> > > > + vpxor %ymm7, %ymm0, %ymm3
> > > > + vpxor %ymm8, %ymm0, %ymm4
> > > > +
> > > > + VPMINU %ymm1, %ymm5, %ymm1
> > > > + VPMINU %ymm2, %ymm6, %ymm2
> > > > + VPMINU %ymm3, %ymm7, %ymm3
> > > > + VPMINU %ymm4, %ymm8, %ymm4
> > > > +
> > > > + VPMINU %ymm1, %ymm2, %ymm5
> > > > + VPMINU %ymm3, %ymm4, %ymm6
> > > > +
> > > > + VPMINU %ymm5, %ymm6, %ymm5
> > > > +
> > > > + VPCMPEQ %ymm5, %ymm9, %ymm5
> > > > + vpmovmskb %ymm5, %eax
> > > > +
> > > > + addq $(VEC_SIZE * 4), %rdi
> > > > + testl %eax, %eax
> > > > + jz L(loop_4x_vec)
> > > > +
> > > > + VPCMPEQ %ymm1, %ymm9, %ymm1
> > > > vpmovmskb %ymm1, %eax
> > > > testl %eax, %eax
> > > > jnz L(first_vec_x0)
> > > > +
> > > > + VPCMPEQ %ymm2, %ymm9, %ymm2
> > > > vpmovmskb %ymm2, %eax
> > > > testl %eax, %eax
> > > > jnz L(first_vec_x1)
> > > > - vpmovmskb %ymm3, %eax
> > > > - testl %eax, %eax
> > > > - jnz L(first_vec_x2)
> > > > +
> > > > + VPCMPEQ %ymm3, %ymm9, %ymm3
> > > > + VPCMPEQ %ymm4, %ymm9, %ymm4
> > > > + vpmovmskb %ymm3, %ecx
> > > > vpmovmskb %ymm4, %eax
> > > > + salq $32, %rax
> > > > + orq %rcx, %rax
> > > > + tzcntq %rax, %rax
> > > > + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > + cmp (%rax), %CHAR_REG
> > > > + cmovne %rdx, %rax
> > > > +# endif
> > > > + VZEROUPPER
> > > > + ret
> > > > +
> > > > + /* Cold case for crossing page with first load. */
> > > > + .p2align 4
> > > > +L(cross_page_boundary):
> > > > + andq $-VEC_SIZE, %rdi
> > > > + andl $(VEC_SIZE - 1), %ecx
> > > > +
> > > > + vmovdqa (%rdi), %ymm8
> > > > + VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > + VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > + vpor %ymm1, %ymm2, %ymm1
> > > > + vpmovmskb %ymm1, %eax
> > > > + /* Remove the leading bits. */
> > > > + sarxl %ecx, %eax, %eax
> > > > testl %eax, %eax
> > > > -L(first_vec_x3):
> > > > + jz L(aligned_more)
> > > > tzcntl %eax, %eax
> > > > -# ifdef USE_AS_STRCHRNUL
> > > > - addq $(VEC_SIZE * 3), %rax
> > > > + addq %rcx, %rdi
> > > > addq %rdi, %rax
> > > > -# else
> > > > - xorl %edx, %edx
> > > > - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > > - cmp (%rax), %CHAR_REG
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > + cmp (%rax), %CHAR_REG
> > > > cmovne %rdx, %rax
> > > > # endif
> > > > VZEROUPPER
> > > > ret
> > > >
> > > > END (STRCHR)
> > > > -#endif
> > > > +# endif
> > > > diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> > > > index 583a152794..4dfbe3b58b 100644
> > > > --- a/sysdeps/x86_64/multiarch/strchr.c
> > > > +++ b/sysdeps/x86_64/multiarch/strchr.c
> > > > @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
> > > >
> > > > if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> > > > && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> > > > && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > > return OPTIMIZE (avx2);
> > > >
> > > > --
> > > > 2.29.2
> > > >
> > >
> > > LGTM.
> > >
> > > Thanks.
> > >
> >
> > This is the updated patch with extra white spaces fixed I am checking in.
> >
> > --
> > H.J.
>
> Awesome! Thanks!
>
> N.G.
Shoot, just realized this one has the old commit message that only
references test-strchr and test-strchrnul as passing (missing
reference to test-wcschr and test-wcschrnul).
Do you want me to send another patch with the proper commit message, or can
you fix it on your end, or does it not really matter?
N.G.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S
2021-02-08 20:57 ` Noah Goldstein
@ 2022-04-27 23:43 ` Sunil Pandey
0 siblings, 0 replies; 10+ messages in thread
From: Sunil Pandey @ 2022-04-27 23:43 UTC (permalink / raw)
To: Noah Goldstein, libc-stable; +Cc: H.J. Lu, GNU C Library
On Mon, Feb 8, 2021 at 1:46 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Mon, Feb 8, 2021 at 2:48 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Mon, Feb 8, 2021 at 2:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Feb 8, 2021 at 6:08 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Tue, Feb 2, 2021 at 9:39 PM <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > From: noah <goldstein.w.n@gmail.com>
> > > > >
> > > > > No bug. Just seemed the performance could be improved a bit. Observed
> > > > > and expected behavior are unchanged. Optimized body of main
> > > > > loop. Updated page cross logic and optimized accordingly. Made a few
> > > > > minor instruction selection modifications. No regressions in test
> > > > > suite. Both test-strchrnul and test-strchr passed.
> > > > >
> > > > > Signed-off-by: noah <goldstein.w.n@gmail.com>
> > > > > ---
> > > > > sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
> > > > > sysdeps/x86_64/multiarch/strchr.c | 1 +
> > > > > 2 files changed, 118 insertions(+), 118 deletions(-)
> > > > >
> > > > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > > index d416558d04..8b9d78b55a 100644
> > > > > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > > @@ -27,10 +27,12 @@
> > > > > # ifdef USE_AS_WCSCHR
> > > > > # define VPBROADCAST vpbroadcastd
> > > > > # define VPCMPEQ vpcmpeqd
> > > > > +# define VPMINU vpminud
> > > > > # define CHAR_REG esi
> > > > > # else
> > > > > # define VPBROADCAST vpbroadcastb
> > > > > # define VPCMPEQ vpcmpeqb
> > > > > +# define VPMINU vpminub
> > > > > # define CHAR_REG sil
> > > > > # endif
> > > > >
> > > > > @@ -39,20 +41,26 @@
> > > > > # endif
> > > > >
> > > > > # define VEC_SIZE 32
> > > > > +# define PAGE_SIZE 4096
> > > > >
> > > > > .section .text.avx,"ax",@progbits
> > > > > ENTRY (STRCHR)
> > > > > movl %edi, %ecx
> > > > > - /* Broadcast CHAR to YMM0. */
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > + xorl %edx, %edx
> > > > > +# endif
> > > > > +
> > > > > + /* Broadcast CHAR to YMM0. */
> > > > > vmovd %esi, %xmm0
> > > > > vpxor %xmm9, %xmm9, %xmm9
> > > > > VPBROADCAST %xmm0, %ymm0
> > > > > - /* Check if we may cross page boundary with one vector load. */
> > > > > - andl $(2 * VEC_SIZE - 1), %ecx
> > > > > - cmpl $VEC_SIZE, %ecx
> > > > > - ja L(cros_page_boundary)
> > > > > -
> > > > > - /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> > > > > +
> > > > > + /* Check if we cross page boundary with one vector load. */
> > > > > + andl $(PAGE_SIZE - 1), %ecx
> > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> > > > > + ja L(cross_page_boundary)
> > > > > +
> > > > > + /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> > > > > null byte. */
> > > > > vmovdqu (%rdi), %ymm8
> > > > > VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > > @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> > > > > vpor %ymm1, %ymm2, %ymm1
> > > > > vpmovmskb %ymm1, %eax
> > > > > testl %eax, %eax
> > > > > - jnz L(first_vec_x0)
> > > > > -
> > > > > - /* Align data for aligned loads in the loop. */
> > > > > - addq $VEC_SIZE, %rdi
> > > > > - andl $(VEC_SIZE - 1), %ecx
> > > > > - andq $-VEC_SIZE, %rdi
> > > > > -
> > > > > - jmp L(more_4x_vec)
> > > > > -
> > > > > - .p2align 4
> > > > > -L(cros_page_boundary):
> > > > > - andl $(VEC_SIZE - 1), %ecx
> > > > > - andq $-VEC_SIZE, %rdi
> > > > > - vmovdqu (%rdi), %ymm8
> > > > > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > > - vpor %ymm1, %ymm2, %ymm1
> > > > > - vpmovmskb %ymm1, %eax
> > > > > - /* Remove the leading bytes. */
> > > > > - sarl %cl, %eax
> > > > > - testl %eax, %eax
> > > > > - jz L(aligned_more)
> > > > > - /* Found CHAR or the null byte. */
> > > > > + jz L(more_vecs)
> > > > > tzcntl %eax, %eax
> > > > > - addq %rcx, %rax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > + /* Found CHAR or the null byte. */
> > > > > addq %rdi, %rax
> > > > > -# else
> > > > > - xorl %edx, %edx
> > > > > - leaq (%rdi, %rax), %rax
> > > > > - cmp (%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > + cmp (%rax), %CHAR_REG
> > > > > cmovne %rdx, %rax
> > > > > # endif
> > > > > VZEROUPPER
> > > > > ret
> > > > >
> > > > > .p2align 4
> > > > > +L(more_vecs):
> > > > > + /* Align data for aligned loads in the loop. */
> > > > > + andq $-VEC_SIZE, %rdi
> > > > > L(aligned_more):
> > > > > - addq $VEC_SIZE, %rdi
> > > > >
> > > > > -L(more_4x_vec):
> > > > > - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > > > > - since data is only aligned to VEC_SIZE. */
> > > > > - vmovdqa (%rdi), %ymm8
> > > > > + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > > > > + since data is only aligned to VEC_SIZE. */
> > > > > + vmovdqa VEC_SIZE(%rdi), %ymm8
> > > > > + addq $VEC_SIZE, %rdi
> > > > > VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > > VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > > vpor %ymm1, %ymm2, %ymm1
> > > > > @@ -125,7 +110,7 @@ L(more_4x_vec):
> > > > > vpor %ymm1, %ymm2, %ymm1
> > > > > vpmovmskb %ymm1, %eax
> > > > > testl %eax, %eax
> > > > > - jnz L(first_vec_x2)
> > > > > + jnz L(first_vec_x2)
> > > > >
> > > > > vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > > > VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > > @@ -133,122 +118,136 @@ L(more_4x_vec):
> > > > > vpor %ymm1, %ymm2, %ymm1
> > > > > vpmovmskb %ymm1, %eax
> > > > > testl %eax, %eax
> > > > > - jnz L(first_vec_x3)
> > > > > -
> > > > > - addq $(VEC_SIZE * 4), %rdi
> > > > > -
> > > > > - /* Align data to 4 * VEC_SIZE. */
> > > > > - movq %rdi, %rcx
> > > > > - andl $(4 * VEC_SIZE - 1), %ecx
> > > > > - andq $-(4 * VEC_SIZE), %rdi
> > > > > -
> > > > > - .p2align 4
> > > > > -L(loop_4x_vec):
> > > > > - /* Compare 4 * VEC at a time forward. */
> > > > > - vmovdqa (%rdi), %ymm5
> > > > > - vmovdqa VEC_SIZE(%rdi), %ymm6
> > > > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > > > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > > > -
> > > > > - VPCMPEQ %ymm5, %ymm0, %ymm1
> > > > > - VPCMPEQ %ymm6, %ymm0, %ymm2
> > > > > - VPCMPEQ %ymm7, %ymm0, %ymm3
> > > > > - VPCMPEQ %ymm8, %ymm0, %ymm4
> > > > > -
> > > > > - VPCMPEQ %ymm5, %ymm9, %ymm5
> > > > > - VPCMPEQ %ymm6, %ymm9, %ymm6
> > > > > - VPCMPEQ %ymm7, %ymm9, %ymm7
> > > > > - VPCMPEQ %ymm8, %ymm9, %ymm8
> > > > > -
> > > > > - vpor %ymm1, %ymm5, %ymm1
> > > > > - vpor %ymm2, %ymm6, %ymm2
> > > > > - vpor %ymm3, %ymm7, %ymm3
> > > > > - vpor %ymm4, %ymm8, %ymm4
> > > > > -
> > > > > - vpor %ymm1, %ymm2, %ymm5
> > > > > - vpor %ymm3, %ymm4, %ymm6
> > > > > -
> > > > > - vpor %ymm5, %ymm6, %ymm5
> > > > > -
> > > > > - vpmovmskb %ymm5, %eax
> > > > > - testl %eax, %eax
> > > > > - jnz L(4x_vec_end)
> > > > > -
> > > > > - addq $(VEC_SIZE * 4), %rdi
> > > > > + jz L(prep_loop_4x)
> > > > >
> > > > > - jmp L(loop_4x_vec)
> > > > > + tzcntl %eax, %eax
> > > > > + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > + cmp (%rax), %CHAR_REG
> > > > > + cmovne %rdx, %rax
> > > > > +# endif
> > > > > + VZEROUPPER
> > > > > + ret
> > > > >
> > > > > .p2align 4
> > > > > L(first_vec_x0):
> > > > > - /* Found CHAR or the null byte. */
> > > > > tzcntl %eax, %eax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > + /* Found CHAR or the null byte. */
> > > > > addq %rdi, %rax
> > > > > -# else
> > > > > - xorl %edx, %edx
> > > > > - leaq (%rdi, %rax), %rax
> > > > > - cmp (%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > + cmp (%rax), %CHAR_REG
> > > > > cmovne %rdx, %rax
> > > > > # endif
> > > > > VZEROUPPER
> > > > > ret
> > > > > -
> > > > > +
> > > > > .p2align 4
> > > > > L(first_vec_x1):
> > > > > tzcntl %eax, %eax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > - addq $VEC_SIZE, %rax
> > > > > - addq %rdi, %rax
> > > > > -# else
> > > > > - xorl %edx, %edx
> > > > > leaq VEC_SIZE(%rdi, %rax), %rax
> > > > > - cmp (%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > + cmp (%rax), %CHAR_REG
> > > > > cmovne %rdx, %rax
> > > > > # endif
> > > > > VZEROUPPER
> > > > > - ret
> > > > > -
> > > > > + ret
> > > > > +
> > > > > .p2align 4
> > > > > L(first_vec_x2):
> > > > > tzcntl %eax, %eax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > - addq $(VEC_SIZE * 2), %rax
> > > > > - addq %rdi, %rax
> > > > > -# else
> > > > > - xorl %edx, %edx
> > > > > + /* Found CHAR or the null byte. */
> > > > > leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > > - cmp (%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > + cmp (%rax), %CHAR_REG
> > > > > cmovne %rdx, %rax
> > > > > # endif
> > > > > VZEROUPPER
> > > > > ret
> > > > > +
> > > > > +L(prep_loop_4x):
> > > > > + /* Align data to 4 * VEC_SIZE. */
> > > > > + andq $-(VEC_SIZE * 4), %rdi
> > > > >
> > > > > .p2align 4
> > > > > -L(4x_vec_end):
> > > > > +L(loop_4x_vec):
> > > > > + /* Compare 4 * VEC at a time forward. */
> > > > > + vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> > > > > + vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> > > > > + vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> > > > > + vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> > > > > +
> > > > > + /* Leaves only CHARS matching esi as 0. */
> > > > > + vpxor %ymm5, %ymm0, %ymm1
> > > > > + vpxor %ymm6, %ymm0, %ymm2
> > > > > + vpxor %ymm7, %ymm0, %ymm3
> > > > > + vpxor %ymm8, %ymm0, %ymm4
> > > > > +
> > > > > + VPMINU %ymm1, %ymm5, %ymm1
> > > > > + VPMINU %ymm2, %ymm6, %ymm2
> > > > > + VPMINU %ymm3, %ymm7, %ymm3
> > > > > + VPMINU %ymm4, %ymm8, %ymm4
> > > > > +
> > > > > + VPMINU %ymm1, %ymm2, %ymm5
> > > > > + VPMINU %ymm3, %ymm4, %ymm6
> > > > > +
> > > > > + VPMINU %ymm5, %ymm6, %ymm5
> > > > > +
> > > > > + VPCMPEQ %ymm5, %ymm9, %ymm5
> > > > > + vpmovmskb %ymm5, %eax
> > > > > +
> > > > > + addq $(VEC_SIZE * 4), %rdi
> > > > > + testl %eax, %eax
> > > > > + jz L(loop_4x_vec)
> > > > > +
> > > > > + VPCMPEQ %ymm1, %ymm9, %ymm1
> > > > > vpmovmskb %ymm1, %eax
> > > > > testl %eax, %eax
> > > > > jnz L(first_vec_x0)
> > > > > +
> > > > > + VPCMPEQ %ymm2, %ymm9, %ymm2
> > > > > vpmovmskb %ymm2, %eax
> > > > > testl %eax, %eax
> > > > > jnz L(first_vec_x1)
> > > > > - vpmovmskb %ymm3, %eax
> > > > > - testl %eax, %eax
> > > > > - jnz L(first_vec_x2)
> > > > > +
> > > > > + VPCMPEQ %ymm3, %ymm9, %ymm3
> > > > > + VPCMPEQ %ymm4, %ymm9, %ymm4
> > > > > + vpmovmskb %ymm3, %ecx
> > > > > vpmovmskb %ymm4, %eax
> > > > > + salq $32, %rax
> > > > > + orq %rcx, %rax
> > > > > + tzcntq %rax, %rax
> > > > > + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > + cmp (%rax), %CHAR_REG
> > > > > + cmovne %rdx, %rax
> > > > > +# endif
> > > > > + VZEROUPPER
> > > > > + ret
> > > > > +
> > > > > + /* Cold case for crossing page with first load. */
> > > > > + .p2align 4
> > > > > +L(cross_page_boundary):
> > > > > + andq $-VEC_SIZE, %rdi
> > > > > + andl $(VEC_SIZE - 1), %ecx
> > > > > +
> > > > > + vmovdqa (%rdi), %ymm8
> > > > > + VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > > + VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > > + vpor %ymm1, %ymm2, %ymm1
> > > > > + vpmovmskb %ymm1, %eax
> > > > > + /* Remove the leading bits. */
> > > > > + sarxl %ecx, %eax, %eax
> > > > > testl %eax, %eax
> > > > > -L(first_vec_x3):
> > > > > + jz L(aligned_more)
> > > > > tzcntl %eax, %eax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > - addq $(VEC_SIZE * 3), %rax
> > > > > + addq %rcx, %rdi
> > > > > addq %rdi, %rax
> > > > > -# else
> > > > > - xorl %edx, %edx
> > > > > - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > > > - cmp (%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > + cmp (%rax), %CHAR_REG
> > > > > cmovne %rdx, %rax
> > > > > # endif
> > > > > VZEROUPPER
> > > > > ret
> > > > >
> > > > > END (STRCHR)
> > > > > -#endif
> > > > > +# endif
> > > > > diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> > > > > index 583a152794..4dfbe3b58b 100644
> > > > > --- a/sysdeps/x86_64/multiarch/strchr.c
> > > > > +++ b/sysdeps/x86_64/multiarch/strchr.c
> > > > > @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
> > > > >
> > > > > if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> > > > > && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > > > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> > > > > && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > > > return OPTIMIZE (avx2);
> > > > >
> > > > > --
> > > > > 2.29.2
> > > > >
> > > >
> > > > LGTM.
> > > >
> > > > Thanks.
> > > >
> > >
> > > This is the updated patch with extra white spaces fixed I am checking in.
> > >
> > > --
> > > H.J.
> >
> > Awesome! Thanks!
> >
> > N.G.
>
> Shoot, just realized this one has the old commit message that only
> references test-strchr and test-strchrnul as passing (missing
> reference to test-wcschr and test-wcschrnul).
>
> Do you want me to send another patch with proper commit message or can
> you fix it on your end or do is not really matter?
>
> N.G.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2022-04-27 23:44 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-03 5:38 [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S goldstein.w.n
2021-02-03 5:39 ` [PATCH v4 2/2] x86: Add additional benchmarks and tests for strchr goldstein.w.n
2021-02-08 14:08 ` H.J. Lu
2021-02-08 19:34 ` H.J. Lu
2021-02-08 19:49 ` Noah Goldstein
2021-02-08 14:08 ` [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S H.J. Lu
2021-02-08 19:33 ` H.J. Lu
2021-02-08 19:48 ` Noah Goldstein
2021-02-08 20:57 ` Noah Goldstein
2022-04-27 23:43 ` Sunil Pandey
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).