* [PATCH v3 1/2] x86: Refactor and improve performance of strchr-avx2.S
@ 2021-02-02 9:39 goldstein.w.n
2021-02-02 9:39 ` [PATCH v3 2/2] x86: Add additional benchmarks for strchr goldstein.w.n
2021-02-02 14:02 ` [PATCH v3 1/2] x86: Refactor and improve performance of strchr-avx2.S H.J. Lu
0 siblings, 2 replies; 5+ messages in thread
From: goldstein.w.n @ 2021-02-02 9:39 UTC (permalink / raw)
To: libc-alpha; +Cc: carlos, goldstein.w.n, hjl.tools
From: noah <goldstein.w.n@gmail.com>
No bug. This change only aims to improve performance; observed and
expected behavior are unchanged. Optimized the body of the main
loop. Updated the page-cross logic and optimized accordingly. Made a
few minor instruction-selection modifications. No regressions in the
test suite: both test-strchrnul and test-strchr passed.
Signed-off-by: noah <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
sysdeps/x86_64/multiarch/strchr.c | 1 +
2 files changed, 118 insertions(+), 118 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index d416558d04..806ca66a9b 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -27,10 +27,12 @@
# ifdef USE_AS_WCSCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMINU vpminud
# define CHAR_REG esi
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMINU vpminub
# define CHAR_REG sil
# endif
@@ -39,20 +41,26 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section .text.avx,"ax",@progbits
ENTRY (STRCHR)
movl %edi, %ecx
- /* Broadcast CHAR to YMM0. */
+# ifndef USE_AS_STRCHRNUL
+ xorl %edx, %edx
+# endif
+
+ /* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
vpxor %xmm9, %xmm9, %xmm9
VPBROADCAST %xmm0, %ymm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
-
- /* Check the first VEC_SIZE bytes. Search for both CHAR and the
+
+ /* Check if we cross page boundary with one vector load. */
+ andl $(PAGE_SIZE - 1), %ecx
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+ ja L(cross_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
null byte. */
vmovdqu (%rdi), %ymm8
VPCMPEQ %ymm8, %ymm0, %ymm1
@@ -60,50 +68,27 @@ ENTRY (STRCHR)
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x0)
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
-
- jmp L(more_4x_vec)
-
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- vmovdqu (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- /* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
- /* Found CHAR or the null byte. */
+ jz L(more_vecs)
tzcntl %eax, %eax
- addq %rcx, %rax
-# ifdef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
.p2align 4
+L(more_vecs):
+ /* Align data for aligned loads in the loop. */
+ andq $-VEC_SIZE, %rdi
L(aligned_more):
- addq $VEC_SIZE, %rdi
-L(more_4x_vec):
- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vmovdqa (%rdi), %ymm8
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vmovdqa VEC_SIZE(%rdi), %ymm8
+ addq $VEC_SIZE, %rdi
VPCMPEQ %ymm8, %ymm0, %ymm1
VPCMPEQ %ymm8, %ymm9, %ymm2
vpor %ymm1, %ymm2, %ymm1
@@ -125,7 +110,7 @@ L(more_4x_vec):
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x2)
+ jnz L(first_vec_x2)
vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
VPCMPEQ %ymm8, %ymm0, %ymm1
@@ -133,122 +118,136 @@ L(more_4x_vec):
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x3)
-
- addq $(VEC_SIZE * 4), %rdi
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
-
- .p2align 4
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm5
- vmovdqa VEC_SIZE(%rdi), %ymm6
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
-
- VPCMPEQ %ymm5, %ymm0, %ymm1
- VPCMPEQ %ymm6, %ymm0, %ymm2
- VPCMPEQ %ymm7, %ymm0, %ymm3
- VPCMPEQ %ymm8, %ymm0, %ymm4
-
- VPCMPEQ %ymm5, %ymm9, %ymm5
- VPCMPEQ %ymm6, %ymm9, %ymm6
- VPCMPEQ %ymm7, %ymm9, %ymm7
- VPCMPEQ %ymm8, %ymm9, %ymm8
-
- vpor %ymm1, %ymm5, %ymm1
- vpor %ymm2, %ymm6, %ymm2
- vpor %ymm3, %ymm7, %ymm3
- vpor %ymm4, %ymm8, %ymm4
-
- vpor %ymm1, %ymm2, %ymm5
- vpor %ymm3, %ymm4, %ymm6
-
- vpor %ymm5, %ymm6, %ymm5
-
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
+ jz L(prep_loop_4x)
- jmp L(loop_4x_vec)
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
.p2align 4
L(first_vec_x0):
- /* Found CHAR or the null byte. */
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
-
+
.p2align 4
L(first_vec_x1):
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
-# else
- xorl %edx, %edx
leaq VEC_SIZE(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
- ret
-
+ ret
+
.p2align 4
L(first_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
-# else
- xorl %edx, %edx
+ /* Found CHAR or the null byte. */
leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
+
+L(prep_loop_4x):
+ /* Align data to 4 * VEC_SIZE. */
+ andq $-(VEC_SIZE * 4), %rdi
.p2align 4
-L(4x_vec_end):
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
+ vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
+ vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
+
+ /* Leaves only CHARS matching esi as 0. */
+ vpxor %ymm5, %ymm0, %ymm1
+ vpxor %ymm6, %ymm0, %ymm2
+ vpxor %ymm7, %ymm0, %ymm3
+ vpxor %ymm8, %ymm0, %ymm4
+
+ VPMINU %ymm1, %ymm5, %ymm1
+ VPMINU %ymm2, %ymm6, %ymm2
+ VPMINU %ymm3, %ymm7, %ymm3
+ VPMINU %ymm4, %ymm8, %ymm4
+
+ VPMINU %ymm1, %ymm2, %ymm5
+ VPMINU %ymm3, %ymm4, %ymm6
+
+ VPMINU %ymm5, %ymm6, %ymm5
+
+ VPCMPEQ %ymm5, %ymm9, %ymm5
+ vpmovmskb %ymm5, %eax
+
+ addq $(VEC_SIZE * 4), %rdi
+ testl %eax, %eax
+ jz L(loop_4x_vec)
+
+ VPCMPEQ %ymm1, %ymm9, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x0)
+
+ VPCMPEQ %ymm2, %ymm9, %ymm2
vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
+
+ VPCMPEQ %ymm3, %ymm9, %ymm3
+ VPCMPEQ %ymm4, %ymm9, %ymm4
+ vpmovmskb %ymm3, %ecx
vpmovmskb %ymm4, %eax
+ salq $32, %rax
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ /* Cold case for crossing page with first load. */
+ .p2align 4
+L(cross_page_boundary):
+ andq $-VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+
+ vmovdqa (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bits. */
+ sarxl %ecx, %eax, %eax
testl %eax, %eax
-L(first_vec_x3):
+ jz L(aligned_more)
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 3), %rax
+ addq %rcx, %rdi
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
END (STRCHR)
-#endif
+# endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index 583a152794..4dfbe3b58b 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
return OPTIMIZE (avx2);
--
2.29.2
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH v3 2/2] x86: Add additional benchmarks for strchr
2021-02-02 9:39 [PATCH v3 1/2] x86: Refactor and improve performance of strchr-avx2.S goldstein.w.n
@ 2021-02-02 9:39 ` goldstein.w.n
2021-02-02 14:06 ` H.J. Lu
2021-02-02 14:02 ` [PATCH v3 1/2] x86: Refactor and improve performance of strchr-avx2.S H.J. Lu
1 sibling, 1 reply; 5+ messages in thread
From: goldstein.w.n @ 2021-02-02 9:39 UTC (permalink / raw)
To: libc-alpha; +Cc: carlos, goldstein.w.n, hjl.tools
From: noah <goldstein.w.n@gmail.com>
This patch adds additional benchmarks for string size of 4096 and
several benchmarks for string size 512 (pos 256) with different
alignments.
Signed-off-by: noah <goldstein.w.n@gmail.com>
---
benchtests/bench-strchr.c | 79 ++++++++++++++++++++++++++-------------
1 file changed, 53 insertions(+), 26 deletions(-)
diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
index bf493fe458..ce7ffd354d 100644
--- a/benchtests/bench-strchr.c
+++ b/benchtests/bench-strchr.c
@@ -100,9 +100,12 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
size_t i;
CHAR *result;
CHAR *buf = (CHAR *) buf1;
- align &= 15;
+ align &= 127;
if ((align + len) * sizeof (CHAR) >= page_size)
- return;
+ {
+ return;
+ }
+
for (i = 0; i < len; ++i)
{
@@ -146,40 +149,64 @@ test_main (void)
putchar ('\n');
for (i = 1; i < 8; ++i)
- {
- do_test (0, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
- do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
- }
+ {
+ do_test (0, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
+ }
for (i = 1; i < 8; ++i)
- {
- do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
- do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
- }
+ {
+ do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+ }
+
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
+ }
+
+ for (i = 0; i < 8; ++i)
+ {
+ do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
+ }
for (i = 0; i < 32; ++i)
- {
- do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
- do_test (0, i, i + 1, SMALL_CHAR, BIG_CHAR);
- }
+ {
+ do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
+ do_test (0, i, i + 1, SMALL_CHAR, BIG_CHAR);
+ }
for (i = 1; i < 8; ++i)
- {
- do_test (0, 16 << i, 2048, 0, MIDDLE_CHAR);
- do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
- }
+ {
+ do_test (0, 16 << i, 2048, 0, MIDDLE_CHAR);
+ do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
+ }
for (i = 1; i < 8; ++i)
- {
- do_test (i, 64, 256, 0, MIDDLE_CHAR);
- do_test (i, 64, 256, 0, BIG_CHAR);
- }
+ {
+ do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
+ do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
+ }
+
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (i, 64, 256, 0, MIDDLE_CHAR);
+ do_test (i, 64, 256, 0, BIG_CHAR);
+ }
+
+ for (i = 0; i < 8; ++i)
+ {
+ do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
+ do_test (16 * i, 256, 512, 0, BIG_CHAR);
+ }
for (i = 0; i < 32; ++i)
- {
- do_test (0, i, i + 1, 0, MIDDLE_CHAR);
- do_test (0, i, i + 1, 0, BIG_CHAR);
- }
+ {
+ do_test (0, i, i + 1, 0, MIDDLE_CHAR);
+ do_test (0, i, i + 1, 0, BIG_CHAR);
+ }
return ret;
}
--
2.29.2
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v3 1/2] x86: Refactor and improve performance of strchr-avx2.S
2021-02-02 9:39 [PATCH v3 1/2] x86: Refactor and improve performance of strchr-avx2.S goldstein.w.n
2021-02-02 9:39 ` [PATCH v3 2/2] x86: Add additional benchmarks for strchr goldstein.w.n
@ 2021-02-02 14:02 ` H.J. Lu
1 sibling, 0 replies; 5+ messages in thread
From: H.J. Lu @ 2021-02-02 14:02 UTC (permalink / raw)
To: noah; +Cc: GNU C Library, Carlos O'Donell
On Tue, Feb 2, 2021 at 1:40 AM <goldstein.w.n@gmail.com> wrote:
>
> From: noah <goldstein.w.n@gmail.com>
>
> No bug. Just seemed the performance could be improved a bit. Observed
> and expected behavior are unchanged. Optimized body of main
> loop. Updated page cross logic and optimized accordingly. Made a few
> minor instruction selection modifications. No regressions in test
> suite. Both test-strchrnul and test-strchr passed.
>
> Signed-off-by: noah <goldstein.w.n@gmail.com>
> ---
> sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
> sysdeps/x86_64/multiarch/strchr.c | 1 +
> 2 files changed, 118 insertions(+), 118 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> index d416558d04..806ca66a9b 100644
> --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> @@ -27,10 +27,12 @@
> # ifdef USE_AS_WCSCHR
> # define VPBROADCAST vpbroadcastd
> # define VPCMPEQ vpcmpeqd
> +# define VPMINU vpminud
> # define CHAR_REG esi
> # else
> # define VPBROADCAST vpbroadcastb
> # define VPCMPEQ vpcmpeqb
> +# define VPMINU vpminub
> # define CHAR_REG sil
> # endif
>
> @@ -39,20 +41,26 @@
> # endif
>
> # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>
> .section .text.avx,"ax",@progbits
> ENTRY (STRCHR)
> movl %edi, %ecx
> - /* Broadcast CHAR to YMM0. */
> +# ifndef USE_AS_STRCHRNUL
> + xorl %edx, %edx
> +# endif
> +
> + /* Broadcast CHAR to YMM0. */
> vmovd %esi, %xmm0
> vpxor %xmm9, %xmm9, %xmm9
> VPBROADCAST %xmm0, %ymm0
> - /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> -
> - /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> +
> + /* Check if we cross page boundary with one vector load. */
> + andl $(PAGE_SIZE - 1), %ecx
> + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> + ja L(cross_page_boundary)
> +
> + /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> null byte. */
> vmovdqu (%rdi), %ymm8
> VPCMPEQ %ymm8, %ymm0, %ymm1
> @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x0)
> -
> - /* Align data for aligned loads in the loop. */
> - addq $VEC_SIZE, %rdi
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> -
> - jmp L(more_4x_vec)
> -
> - .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> - vmovdqu (%rdi), %ymm8
> - VPCMPEQ %ymm8, %ymm0, %ymm1
> - VPCMPEQ %ymm8, %ymm9, %ymm2
> - vpor %ymm1, %ymm2, %ymm1
> - vpmovmskb %ymm1, %eax
> - /* Remove the leading bytes. */
> - sarl %cl, %eax
> - testl %eax, %eax
> - jz L(aligned_more)
> - /* Found CHAR or the null byte. */
> + jz L(more_vecs)
> tzcntl %eax, %eax
> - addq %rcx, %rax
> -# ifdef USE_AS_STRCHRNUL
> + /* Found CHAR or the null byte. */
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
Remove the extra tab.
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
>
> .p2align 4
> +L(more_vecs):
> + /* Align data for aligned loads in the loop. */
> + andq $-VEC_SIZE, %rdi
> L(aligned_more):
> - addq $VEC_SIZE, %rdi
>
> -L(more_4x_vec):
> - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> - since data is only aligned to VEC_SIZE. */
> - vmovdqa (%rdi), %ymm8
> + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> + since data is only aligned to VEC_SIZE. */
> + vmovdqa VEC_SIZE(%rdi), %ymm8
> + addq $VEC_SIZE, %rdi
> VPCMPEQ %ymm8, %ymm0, %ymm1
> VPCMPEQ %ymm8, %ymm9, %ymm2
> vpor %ymm1, %ymm2, %ymm1
> @@ -125,7 +110,7 @@ L(more_4x_vec):
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x2)
> + jnz L(first_vec_x2)
>
> vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> VPCMPEQ %ymm8, %ymm0, %ymm1
> @@ -133,122 +118,136 @@ L(more_4x_vec):
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x3)
> -
> - addq $(VEC_SIZE * 4), %rdi
> -
> - /* Align data to 4 * VEC_SIZE. */
> - movq %rdi, %rcx
> - andl $(4 * VEC_SIZE - 1), %ecx
> - andq $-(4 * VEC_SIZE), %rdi
> -
> - .p2align 4
> -L(loop_4x_vec):
> - /* Compare 4 * VEC at a time forward. */
> - vmovdqa (%rdi), %ymm5
> - vmovdqa VEC_SIZE(%rdi), %ymm6
> - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> -
> - VPCMPEQ %ymm5, %ymm0, %ymm1
> - VPCMPEQ %ymm6, %ymm0, %ymm2
> - VPCMPEQ %ymm7, %ymm0, %ymm3
> - VPCMPEQ %ymm8, %ymm0, %ymm4
> -
> - VPCMPEQ %ymm5, %ymm9, %ymm5
> - VPCMPEQ %ymm6, %ymm9, %ymm6
> - VPCMPEQ %ymm7, %ymm9, %ymm7
> - VPCMPEQ %ymm8, %ymm9, %ymm8
> -
> - vpor %ymm1, %ymm5, %ymm1
> - vpor %ymm2, %ymm6, %ymm2
> - vpor %ymm3, %ymm7, %ymm3
> - vpor %ymm4, %ymm8, %ymm4
> -
> - vpor %ymm1, %ymm2, %ymm5
> - vpor %ymm3, %ymm4, %ymm6
> -
> - vpor %ymm5, %ymm6, %ymm5
> -
> - vpmovmskb %ymm5, %eax
> - testl %eax, %eax
> - jnz L(4x_vec_end)
> -
> - addq $(VEC_SIZE * 4), %rdi
> + jz L(prep_loop_4x)
>
> - jmp L(loop_4x_vec)
> + tzcntl %eax, %eax
> + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
Remove the extra tab.
> + cmovne %rdx, %rax
> +# endif
> + VZEROUPPER
> + ret
>
> .p2align 4
> L(first_vec_x0):
> - /* Found CHAR or the null byte. */
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> + /* Found CHAR or the null byte. */
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
Remove the extra tab.
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
> -
> +
> .p2align 4
> L(first_vec_x1):
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $VEC_SIZE, %rax
> - addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> leaq VEC_SIZE(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
Remove the extra tab.
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> - ret
> -
> + ret
> +
> .p2align 4
> L(first_vec_x2):
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $(VEC_SIZE * 2), %rax
> - addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> + /* Found CHAR or the null byte. */
> leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
Remove the extra tab.
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
> +
> +L(prep_loop_4x):
> + /* Align data to 4 * VEC_SIZE. */
> + andq $-(VEC_SIZE * 4), %rdi
>
> .p2align 4
> -L(4x_vec_end):
> +L(loop_4x_vec):
> + /* Compare 4 * VEC at a time forward. */
> + vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> + vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> + vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> + vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> +
> + /* Leaves only CHARS matching esi as 0. */
> + vpxor %ymm5, %ymm0, %ymm1
> + vpxor %ymm6, %ymm0, %ymm2
> + vpxor %ymm7, %ymm0, %ymm3
> + vpxor %ymm8, %ymm0, %ymm4
> +
> + VPMINU %ymm1, %ymm5, %ymm1
> + VPMINU %ymm2, %ymm6, %ymm2
> + VPMINU %ymm3, %ymm7, %ymm3
> + VPMINU %ymm4, %ymm8, %ymm4
> +
> + VPMINU %ymm1, %ymm2, %ymm5
> + VPMINU %ymm3, %ymm4, %ymm6
> +
> + VPMINU %ymm5, %ymm6, %ymm5
> +
> + VPCMPEQ %ymm5, %ymm9, %ymm5
> + vpmovmskb %ymm5, %eax
> +
> + addq $(VEC_SIZE * 4), %rdi
> + testl %eax, %eax
> + jz L(loop_4x_vec)
> +
> + VPCMPEQ %ymm1, %ymm9, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x0)
> +
> + VPCMPEQ %ymm2, %ymm9, %ymm2
> vpmovmskb %ymm2, %eax
> testl %eax, %eax
> jnz L(first_vec_x1)
> - vpmovmskb %ymm3, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x2)
> +
> + VPCMPEQ %ymm3, %ymm9, %ymm3
> + VPCMPEQ %ymm4, %ymm9, %ymm4
> + vpmovmskb %ymm3, %ecx
> vpmovmskb %ymm4, %eax
> + salq $32, %rax
> + orq %rcx, %rax
Remove the extra tab.
> + tzcntq %rax, %rax
> + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
Remove the extra tab.
> + cmovne %rdx, %rax
> +# endif
> + VZEROUPPER
> + ret
> +
> + /* Cold case for crossing page with first load. */
> + .p2align 4
> +L(cross_page_boundary):
> + andq $-VEC_SIZE, %rdi
> + andl $(VEC_SIZE - 1), %ecx
> +
> + vmovdqa (%rdi), %ymm8
> + VPCMPEQ %ymm8, %ymm0, %ymm1
> + VPCMPEQ %ymm8, %ymm9, %ymm2
> + vpor %ymm1, %ymm2, %ymm1
> + vpmovmskb %ymm1, %eax
> + /* Remove the leading bits. */
> + sarxl %ecx, %eax, %eax
> testl %eax, %eax
> -L(first_vec_x3):
> + jz L(aligned_more)
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $(VEC_SIZE * 3), %rax
> + addq %rcx, %rdi
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
Remove the extra tab.
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
>
> END (STRCHR)
> -#endif
> +# endif
> diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> index 583a152794..4dfbe3b58b 100644
> --- a/sysdeps/x86_64/multiarch/strchr.c
> +++ b/sysdeps/x86_64/multiarch/strchr.c
> @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
>
> if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> return OPTIMIZE (avx2);
>
> --
> 2.29.2
>
--
H.J.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v3 2/2] x86: Add additional benchmarks for strchr
2021-02-02 9:39 ` [PATCH v3 2/2] x86: Add additional benchmarks for strchr goldstein.w.n
@ 2021-02-02 14:06 ` H.J. Lu
0 siblings, 0 replies; 5+ messages in thread
From: H.J. Lu @ 2021-02-02 14:06 UTC (permalink / raw)
To: noah; +Cc: GNU C Library, Carlos O'Donell
On Tue, Feb 2, 2021 at 1:40 AM <goldstein.w.n@gmail.com> wrote:
>
> From: noah <goldstein.w.n@gmail.com>
>
> This patch adds additional benchmarks for string size of 4096 and
> several benchmarks for string size 256 with different alignments.
>
> Signed-off-by: noah <goldstein.w.n@gmail.com>
> ---
> benchtests/bench-strchr.c | 79 ++++++++++++++++++++++++++-------------
> 1 file changed, 53 insertions(+), 26 deletions(-)
>
> diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
> index bf493fe458..ce7ffd354d 100644
> --- a/benchtests/bench-strchr.c
> +++ b/benchtests/bench-strchr.c
> @@ -100,9 +100,12 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
> size_t i;
> CHAR *result;
> CHAR *buf = (CHAR *) buf1;
> - align &= 15;
> + align &= 127;
> if ((align + len) * sizeof (CHAR) >= page_size)
> - return;
> + {
> + return;
> + }
> +
>
> for (i = 0; i < len; ++i)
> {
> @@ -146,40 +149,64 @@ test_main (void)
> putchar ('\n');
>
> for (i = 1; i < 8; ++i)
> - {
> - do_test (0, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> - do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> - }
> + {
> + do_test (0, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> + }
>
> for (i = 1; i < 8; ++i)
> - {
> - do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
> - do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
> - }
> + {
> + do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> + }
> +
> + for (i = 1; i < 8; ++i)
> + {
> + do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
> + }
> +
> + for (i = 0; i < 8; ++i)
> + {
> + do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
> + }
>
> for (i = 0; i < 32; ++i)
> - {
> - do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
> - do_test (0, i, i + 1, SMALL_CHAR, BIG_CHAR);
> - }
> + {
> + do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
> + do_test (0, i, i + 1, SMALL_CHAR, BIG_CHAR);
> + }
>
> for (i = 1; i < 8; ++i)
> - {
> - do_test (0, 16 << i, 2048, 0, MIDDLE_CHAR);
> - do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
> - }
> + {
> + do_test (0, 16 << i, 2048, 0, MIDDLE_CHAR);
> + do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
> + }
>
> for (i = 1; i < 8; ++i)
> - {
> - do_test (i, 64, 256, 0, MIDDLE_CHAR);
> - do_test (i, 64, 256, 0, BIG_CHAR);
> - }
> + {
> + do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
> + do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
> + }
> +
> + for (i = 1; i < 8; ++i)
> + {
> + do_test (i, 64, 256, 0, MIDDLE_CHAR);
> + do_test (i, 64, 256, 0, BIG_CHAR);
> + }
> +
> + for (i = 0; i < 8; ++i)
> + {
> + do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
> + do_test (16 * i, 256, 512, 0, BIG_CHAR);
> + }
>
> for (i = 0; i < 32; ++i)
> - {
> - do_test (0, i, i + 1, 0, MIDDLE_CHAR);
> - do_test (0, i, i + 1, 0, BIG_CHAR);
> - }
> + {
> + do_test (0, i, i + 1, 0, MIDDLE_CHAR);
> + do_test (0, i, i + 1, 0, BIG_CHAR);
> + }
>
> return ret;
> }
> --
> 2.29.2
>
3 issues:
1. Please restore the nice commit message in
https://sourceware.org/pipermail/libc-alpha/2021-February/122200.html
2. Please fix the indentation (2 spaces).
3. Please add the same tests to string/test-strchr.c.
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH v3 1/2] x86: Refactor and improve performance of strchr-avx2.S
2021-02-01 0:30 [PATCH v2 " noah
@ 2021-02-02 7:23 ` goldstein.w.n
0 siblings, 0 replies; 5+ messages in thread
From: goldstein.w.n @ 2021-02-02 7:23 UTC (permalink / raw)
To: libc-alpha; +Cc: carlos, goldstein.w.n, hjl.tools
From: noah <goldstein.w.n@gmail.com>
No bug. This change only aims to improve performance; observed and
expected behavior are unchanged. Optimized the body of the main
loop. Updated the page-cross logic and optimized accordingly. Made a
few minor instruction-selection modifications. No regressions in the
test suite: both test-strchrnul and test-strchr passed.
Signed-off-by: noah <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
sysdeps/x86_64/multiarch/strchr.c | 1 +
2 files changed, 118 insertions(+), 118 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index d416558d04..806ca66a9b 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -27,10 +27,12 @@
# ifdef USE_AS_WCSCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMINU vpminud
# define CHAR_REG esi
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMINU vpminub
# define CHAR_REG sil
# endif
@@ -39,20 +41,26 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section .text.avx,"ax",@progbits
ENTRY (STRCHR)
movl %edi, %ecx
- /* Broadcast CHAR to YMM0. */
+# ifndef USE_AS_STRCHRNUL
+ xorl %edx, %edx
+# endif
+
+ /* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
vpxor %xmm9, %xmm9, %xmm9
VPBROADCAST %xmm0, %ymm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
-
- /* Check the first VEC_SIZE bytes. Search for both CHAR and the
+
+ /* Check if we cross page boundary with one vector load. */
+ andl $(PAGE_SIZE - 1), %ecx
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+ ja L(cross_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
null byte. */
vmovdqu (%rdi), %ymm8
VPCMPEQ %ymm8, %ymm0, %ymm1
@@ -60,50 +68,27 @@ ENTRY (STRCHR)
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x0)
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
-
- jmp L(more_4x_vec)
-
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- vmovdqu (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- /* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
- /* Found CHAR or the null byte. */
+ jz L(more_vecs)
tzcntl %eax, %eax
- addq %rcx, %rax
-# ifdef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
.p2align 4
+L(more_vecs):
+ /* Align data for aligned loads in the loop. */
+ andq $-VEC_SIZE, %rdi
L(aligned_more):
- addq $VEC_SIZE, %rdi
-L(more_4x_vec):
- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vmovdqa (%rdi), %ymm8
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vmovdqa VEC_SIZE(%rdi), %ymm8
+ addq $VEC_SIZE, %rdi
VPCMPEQ %ymm8, %ymm0, %ymm1
VPCMPEQ %ymm8, %ymm9, %ymm2
vpor %ymm1, %ymm2, %ymm1
@@ -125,7 +110,7 @@ L(more_4x_vec):
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x2)
+ jnz L(first_vec_x2)
vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
VPCMPEQ %ymm8, %ymm0, %ymm1
@@ -133,122 +118,136 @@ L(more_4x_vec):
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x3)
-
- addq $(VEC_SIZE * 4), %rdi
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
-
- .p2align 4
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm5
- vmovdqa VEC_SIZE(%rdi), %ymm6
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
-
- VPCMPEQ %ymm5, %ymm0, %ymm1
- VPCMPEQ %ymm6, %ymm0, %ymm2
- VPCMPEQ %ymm7, %ymm0, %ymm3
- VPCMPEQ %ymm8, %ymm0, %ymm4
-
- VPCMPEQ %ymm5, %ymm9, %ymm5
- VPCMPEQ %ymm6, %ymm9, %ymm6
- VPCMPEQ %ymm7, %ymm9, %ymm7
- VPCMPEQ %ymm8, %ymm9, %ymm8
-
- vpor %ymm1, %ymm5, %ymm1
- vpor %ymm2, %ymm6, %ymm2
- vpor %ymm3, %ymm7, %ymm3
- vpor %ymm4, %ymm8, %ymm4
-
- vpor %ymm1, %ymm2, %ymm5
- vpor %ymm3, %ymm4, %ymm6
-
- vpor %ymm5, %ymm6, %ymm5
-
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
+ jz L(prep_loop_4x)
- jmp L(loop_4x_vec)
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
.p2align 4
L(first_vec_x0):
- /* Found CHAR or the null byte. */
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
-
+
.p2align 4
L(first_vec_x1):
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
-# else
- xorl %edx, %edx
leaq VEC_SIZE(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
- ret
-
+ ret
+
.p2align 4
L(first_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
-# else
- xorl %edx, %edx
+ /* Found CHAR or the null byte. */
leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
+
+L(prep_loop_4x):
+ /* Align data to 4 * VEC_SIZE. */
+ andq $-(VEC_SIZE * 4), %rdi
.p2align 4
-L(4x_vec_end):
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
+ vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
+ vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
+
+ /* Leaves only CHARS matching esi as 0. */
+ vpxor %ymm5, %ymm0, %ymm1
+ vpxor %ymm6, %ymm0, %ymm2
+ vpxor %ymm7, %ymm0, %ymm3
+ vpxor %ymm8, %ymm0, %ymm4
+
+ VPMINU %ymm1, %ymm5, %ymm1
+ VPMINU %ymm2, %ymm6, %ymm2
+ VPMINU %ymm3, %ymm7, %ymm3
+ VPMINU %ymm4, %ymm8, %ymm4
+
+ VPMINU %ymm1, %ymm2, %ymm5
+ VPMINU %ymm3, %ymm4, %ymm6
+
+ VPMINU %ymm5, %ymm6, %ymm5
+
+ VPCMPEQ %ymm5, %ymm9, %ymm5
+ vpmovmskb %ymm5, %eax
+
+ addq $(VEC_SIZE * 4), %rdi
+ testl %eax, %eax
+ jz L(loop_4x_vec)
+
+ VPCMPEQ %ymm1, %ymm9, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x0)
+
+ VPCMPEQ %ymm2, %ymm9, %ymm2
vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
+
+ VPCMPEQ %ymm3, %ymm9, %ymm3
+ VPCMPEQ %ymm4, %ymm9, %ymm4
+ vpmovmskb %ymm3, %ecx
vpmovmskb %ymm4, %eax
+ salq $32, %rax
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ /* Cold case for crossing page with first load. */
+ .p2align 4
+L(cross_page_boundary):
+ andq $-VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+
+ vmovdqa (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bits. */
+ sarxl %ecx, %eax, %eax
testl %eax, %eax
-L(first_vec_x3):
+ jz L(aligned_more)
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 3), %rax
+ addq %rcx, %rdi
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
END (STRCHR)
-#endif
+# endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index 583a152794..4dfbe3b58b 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
return OPTIMIZE (avx2);
--
2.29.2
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2021-02-02 14:06 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-02 9:39 [PATCH v3 1/2] x86: Refactor and improve performance of strchr-avx2.S goldstein.w.n
2021-02-02 9:39 ` [PATCH v3 2/2] x86: Add additional benchmarks for strchr goldstein.w.n
2021-02-02 14:06 ` H.J. Lu
2021-02-02 14:02 ` [PATCH v3 1/2] x86: Refactor and improve performance of strchr-avx2.S H.J. Lu
-- strict thread matches above, loose matches on Subject: below --
2021-02-01 0:30 [PATCH v2 " noah
2021-02-02 7:23 ` [PATCH v3 " goldstein.w.n
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).