From: "H.J. Lu" <hjl.tools@gmail.com>
To: noah <goldstein.w.n@gmail.com>
Cc: GNU C Library <libc-alpha@sourceware.org>,
"Carlos O'Donell" <carlos@systemhalted.org>
Subject: Re: [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S
Date: Mon, 8 Feb 2021 11:33:02 -0800
Message-ID: <CAMe9rOpMO=4JJjgmX=ka5Bf91MrrXKLxtCzsHZYFKOYiW7YYUQ@mail.gmail.com>
In-Reply-To: <CAMe9rOr8euhg1uaRtNRvrTPqAh2Aois0C7-yY7a2RyEbuEJr5w@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 11974 bytes --]
On Mon, Feb 8, 2021 at 6:08 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Feb 2, 2021 at 9:39 PM <goldstein.w.n@gmail.com> wrote:
> >
> > From: noah <goldstein.w.n@gmail.com>
> >
> > No bug. This change improves performance; observed and expected
> > behavior are unchanged. Optimized the body of the main loop, updated
> > the page-cross logic and optimized accordingly, and made a few minor
> > instruction-selection changes. No regressions in the test suite; both
> > test-strchrnul and test-strchr passed.
> >
> > Signed-off-by: noah <goldstein.w.n@gmail.com>
> > ---
> > sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
> > sysdeps/x86_64/multiarch/strchr.c | 1 +
> > 2 files changed, 118 insertions(+), 118 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > index d416558d04..8b9d78b55a 100644
> > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > @@ -27,10 +27,12 @@
> > # ifdef USE_AS_WCSCHR
> > # define VPBROADCAST vpbroadcastd
> > # define VPCMPEQ vpcmpeqd
> > +# define VPMINU vpminud
> > # define CHAR_REG esi
> > # else
> > # define VPBROADCAST vpbroadcastb
> > # define VPCMPEQ vpcmpeqb
> > +# define VPMINU vpminub
> > # define CHAR_REG sil
> > # endif
> >
> > @@ -39,20 +41,26 @@
> > # endif
> >
> > # define VEC_SIZE 32
> > +# define PAGE_SIZE 4096
> >
> > .section .text.avx,"ax",@progbits
> > ENTRY (STRCHR)
> > movl %edi, %ecx
> > - /* Broadcast CHAR to YMM0. */
> > +# ifndef USE_AS_STRCHRNUL
> > + xorl %edx, %edx
> > +# endif
> > +
> > + /* Broadcast CHAR to YMM0. */
> > vmovd %esi, %xmm0
> > vpxor %xmm9, %xmm9, %xmm9
> > VPBROADCAST %xmm0, %ymm0
> > - /* Check if we may cross page boundary with one vector load. */
> > - andl $(2 * VEC_SIZE - 1), %ecx
> > - cmpl $VEC_SIZE, %ecx
> > - ja L(cros_page_boundary)
> > -
> > - /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> > +
> > + /* Check if we cross page boundary with one vector load. */
> > + andl $(PAGE_SIZE - 1), %ecx
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> > + ja L(cross_page_boundary)
> > +
> > + /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> > null byte. */
> > vmovdqu (%rdi), %ymm8
> > VPCMPEQ %ymm8, %ymm0, %ymm1
> > @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> > vpor %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > - jnz L(first_vec_x0)
> > -
> > - /* Align data for aligned loads in the loop. */
> > - addq $VEC_SIZE, %rdi
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > -
> > - jmp L(more_4x_vec)
> > -
> > - .p2align 4
> > -L(cros_page_boundary):
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > - vmovdqu (%rdi), %ymm8
> > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > - vpor %ymm1, %ymm2, %ymm1
> > - vpmovmskb %ymm1, %eax
> > - /* Remove the leading bytes. */
> > - sarl %cl, %eax
> > - testl %eax, %eax
> > - jz L(aligned_more)
> > - /* Found CHAR or the null byte. */
> > + jz L(more_vecs)
> > tzcntl %eax, %eax
> > - addq %rcx, %rax
> > -# ifdef USE_AS_STRCHRNUL
> > + /* Found CHAR or the null byte. */
> > addq %rdi, %rax
> > -# else
> > - xorl %edx, %edx
> > - leaq (%rdi, %rax), %rax
> > - cmp (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > + cmp (%rax), %CHAR_REG
> > cmovne %rdx, %rax
> > # endif
> > VZEROUPPER
> > ret
> >
> > .p2align 4
> > +L(more_vecs):
> > + /* Align data for aligned loads in the loop. */
> > + andq $-VEC_SIZE, %rdi
> > L(aligned_more):
> > - addq $VEC_SIZE, %rdi
> >
> > -L(more_4x_vec):
> > - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > - since data is only aligned to VEC_SIZE. */
> > - vmovdqa (%rdi), %ymm8
> > + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > + since data is only aligned to VEC_SIZE. */
> > + vmovdqa VEC_SIZE(%rdi), %ymm8
> > + addq $VEC_SIZE, %rdi
> > VPCMPEQ %ymm8, %ymm0, %ymm1
> > VPCMPEQ %ymm8, %ymm9, %ymm2
> > vpor %ymm1, %ymm2, %ymm1
> > @@ -125,7 +110,7 @@ L(more_4x_vec):
> > vpor %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > - jnz L(first_vec_x2)
> > + jnz L(first_vec_x2)
> >
> > vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > VPCMPEQ %ymm8, %ymm0, %ymm1
> > @@ -133,122 +118,136 @@ L(more_4x_vec):
> > vpor %ymm1, %ymm2, %ymm1
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > - jnz L(first_vec_x3)
> > -
> > - addq $(VEC_SIZE * 4), %rdi
> > -
> > - /* Align data to 4 * VEC_SIZE. */
> > - movq %rdi, %rcx
> > - andl $(4 * VEC_SIZE - 1), %ecx
> > - andq $-(4 * VEC_SIZE), %rdi
> > -
> > - .p2align 4
> > -L(loop_4x_vec):
> > - /* Compare 4 * VEC at a time forward. */
> > - vmovdqa (%rdi), %ymm5
> > - vmovdqa VEC_SIZE(%rdi), %ymm6
> > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > -
> > - VPCMPEQ %ymm5, %ymm0, %ymm1
> > - VPCMPEQ %ymm6, %ymm0, %ymm2
> > - VPCMPEQ %ymm7, %ymm0, %ymm3
> > - VPCMPEQ %ymm8, %ymm0, %ymm4
> > -
> > - VPCMPEQ %ymm5, %ymm9, %ymm5
> > - VPCMPEQ %ymm6, %ymm9, %ymm6
> > - VPCMPEQ %ymm7, %ymm9, %ymm7
> > - VPCMPEQ %ymm8, %ymm9, %ymm8
> > -
> > - vpor %ymm1, %ymm5, %ymm1
> > - vpor %ymm2, %ymm6, %ymm2
> > - vpor %ymm3, %ymm7, %ymm3
> > - vpor %ymm4, %ymm8, %ymm4
> > -
> > - vpor %ymm1, %ymm2, %ymm5
> > - vpor %ymm3, %ymm4, %ymm6
> > -
> > - vpor %ymm5, %ymm6, %ymm5
> > -
> > - vpmovmskb %ymm5, %eax
> > - testl %eax, %eax
> > - jnz L(4x_vec_end)
> > -
> > - addq $(VEC_SIZE * 4), %rdi
> > + jz L(prep_loop_4x)
> >
> > - jmp L(loop_4x_vec)
> > + tzcntl %eax, %eax
> > + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> > +# ifndef USE_AS_STRCHRNUL
> > + cmp (%rax), %CHAR_REG
> > + cmovne %rdx, %rax
> > +# endif
> > + VZEROUPPER
> > + ret
> >
> > .p2align 4
> > L(first_vec_x0):
> > - /* Found CHAR or the null byte. */
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > + /* Found CHAR or the null byte. */
> > addq %rdi, %rax
> > -# else
> > - xorl %edx, %edx
> > - leaq (%rdi, %rax), %rax
> > - cmp (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > + cmp (%rax), %CHAR_REG
> > cmovne %rdx, %rax
> > # endif
> > VZEROUPPER
> > ret
> > -
> > +
> > .p2align 4
> > L(first_vec_x1):
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > - addq $VEC_SIZE, %rax
> > - addq %rdi, %rax
> > -# else
> > - xorl %edx, %edx
> > leaq VEC_SIZE(%rdi, %rax), %rax
> > - cmp (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > + cmp (%rax), %CHAR_REG
> > cmovne %rdx, %rax
> > # endif
> > VZEROUPPER
> > - ret
> > -
> > + ret
> > +
> > .p2align 4
> > L(first_vec_x2):
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > - addq $(VEC_SIZE * 2), %rax
> > - addq %rdi, %rax
> > -# else
> > - xorl %edx, %edx
> > + /* Found CHAR or the null byte. */
> > leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > - cmp (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > + cmp (%rax), %CHAR_REG
> > cmovne %rdx, %rax
> > # endif
> > VZEROUPPER
> > ret
> > +
> > +L(prep_loop_4x):
> > + /* Align data to 4 * VEC_SIZE. */
> > + andq $-(VEC_SIZE * 4), %rdi
> >
> > .p2align 4
> > -L(4x_vec_end):
> > +L(loop_4x_vec):
> > + /* Compare 4 * VEC at a time forward. */
> > + vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> > + vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> > + vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> > + vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> > +
> > + /* Leaves only CHARS matching esi as 0. */
> > + vpxor %ymm5, %ymm0, %ymm1
> > + vpxor %ymm6, %ymm0, %ymm2
> > + vpxor %ymm7, %ymm0, %ymm3
> > + vpxor %ymm8, %ymm0, %ymm4
> > +
> > + VPMINU %ymm1, %ymm5, %ymm1
> > + VPMINU %ymm2, %ymm6, %ymm2
> > + VPMINU %ymm3, %ymm7, %ymm3
> > + VPMINU %ymm4, %ymm8, %ymm4
> > +
> > + VPMINU %ymm1, %ymm2, %ymm5
> > + VPMINU %ymm3, %ymm4, %ymm6
> > +
> > + VPMINU %ymm5, %ymm6, %ymm5
> > +
> > + VPCMPEQ %ymm5, %ymm9, %ymm5
> > + vpmovmskb %ymm5, %eax
> > +
> > + addq $(VEC_SIZE * 4), %rdi
> > + testl %eax, %eax
> > + jz L(loop_4x_vec)
> > +
> > + VPCMPEQ %ymm1, %ymm9, %ymm1
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x0)
> > +
> > + VPCMPEQ %ymm2, %ymm9, %ymm2
> > vpmovmskb %ymm2, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x1)
> > - vpmovmskb %ymm3, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x2)
> > +
> > + VPCMPEQ %ymm3, %ymm9, %ymm3
> > + VPCMPEQ %ymm4, %ymm9, %ymm4
> > + vpmovmskb %ymm3, %ecx
> > vpmovmskb %ymm4, %eax
> > + salq $32, %rax
> > + orq %rcx, %rax
> > + tzcntq %rax, %rax
> > + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > +# ifndef USE_AS_STRCHRNUL
> > + cmp (%rax), %CHAR_REG
> > + cmovne %rdx, %rax
> > +# endif
> > + VZEROUPPER
> > + ret
> > +
> > + /* Cold case for crossing page with first load. */
> > + .p2align 4
> > +L(cross_page_boundary):
> > + andq $-VEC_SIZE, %rdi
> > + andl $(VEC_SIZE - 1), %ecx
> > +
> > + vmovdqa (%rdi), %ymm8
> > + VPCMPEQ %ymm8, %ymm0, %ymm1
> > + VPCMPEQ %ymm8, %ymm9, %ymm2
> > + vpor %ymm1, %ymm2, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + /* Remove the leading bits. */
> > + sarxl %ecx, %eax, %eax
> > testl %eax, %eax
> > -L(first_vec_x3):
> > + jz L(aligned_more)
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > - addq $(VEC_SIZE * 3), %rax
> > + addq %rcx, %rdi
> > addq %rdi, %rax
> > -# else
> > - xorl %edx, %edx
> > - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> > - cmp (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > + cmp (%rax), %CHAR_REG
> > cmovne %rdx, %rax
> > # endif
> > VZEROUPPER
> > ret
> >
> > END (STRCHR)
> > -#endif
> > +# endif
> > diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> > index 583a152794..4dfbe3b58b 100644
> > --- a/sysdeps/x86_64/multiarch/strchr.c
> > +++ b/sysdeps/x86_64/multiarch/strchr.c
> > @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
> >
> > if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> > && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> > && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > return OPTIMIZE (avx2);
> >
> > --
> > 2.29.2
> >
>
> LGTM.
>
> Thanks.
>
This is the updated patch, with the extra whitespace fixed, that I am
checking in.
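
The main-loop win here comes from swapping the two VPCMPEQ + VPOR per
vector for one VPXOR + VPMINU, deferring the compare so that a single
VPCMPEQ against zero covers all four vectors once they are folded with
VPMINU. A minimal C sketch of the per-vector test, using standard AVX2
intrinsics (the helper name is made up for illustration):

#include <immintrin.h>
#include <stdint.h>

/* min (x ^ c, x) is zero exactly in the lanes where x == c (the XOR
   term is zero) or x == 0 (the unsigned min picks the zero lane).  */
static inline uint32_t
match_or_nul_mask (__m256i x, __m256i c)
{
  __m256i t = _mm256_xor_si256 (x, c);
  __m256i m = _mm256_min_epu8 (t, x);
  __m256i z = _mm256_cmpeq_epi8 (m, _mm256_setzero_si256 ());
  return (uint32_t) _mm256_movemask_epi8 (z);
}

In the loop itself only the folded minimum is compared and tested per
iteration, so the per-vector compares move to the cold exit path.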
--
H.J.
[-- Attachment #2: 0001-x86-64-Refactor-and-improve-performance-of-strchr-av.patch --]
[-- Type: text/x-patch, Size: 8551 bytes --]
From 1f745ecc2109890886b161d4791e1406fdfc29b8 Mon Sep 17 00:00:00 2001
From: noah <goldstein.w.n@gmail.com>
Date: Wed, 3 Feb 2021 00:38:59 -0500
Subject: [PATCH 1/2] x86-64: Refactor and improve performance of strchr-avx2.S
No bug. This change improves performance; observed and expected
behavior are unchanged. Optimized the body of the main loop, updated
the page-cross logic and optimized accordingly, and made a few minor
instruction-selection changes. No regressions in the test suite; both
test-strchrnul and test-strchr passed.
---
sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++-------------
sysdeps/x86_64/multiarch/strchr.c | 1 +
2 files changed, 113 insertions(+), 113 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index d416558d04..476c20c289 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -27,10 +27,12 @@
# ifdef USE_AS_WCSCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMINU vpminud
# define CHAR_REG esi
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMINU vpminub
# define CHAR_REG sil
# endif
@@ -39,71 +41,54 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section .text.avx,"ax",@progbits
ENTRY (STRCHR)
movl %edi, %ecx
- /* Broadcast CHAR to YMM0. */
+# ifndef USE_AS_STRCHRNUL
+ xorl %edx, %edx
+# endif
+
+ /* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
vpxor %xmm9, %xmm9, %xmm9
VPBROADCAST %xmm0, %ymm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
-
- /* Check the first VEC_SIZE bytes. Search for both CHAR and the
- null byte. */
- vmovdqu (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- jmp L(more_4x_vec)
+ /* Check if we cross page boundary with one vector load. */
+ andl $(PAGE_SIZE - 1), %ecx
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+ ja L(cross_page_boundary)
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
+ null byte. */
vmovdqu (%rdi), %ymm8
VPCMPEQ %ymm8, %ymm0, %ymm1
VPCMPEQ %ymm8, %ymm9, %ymm2
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
- /* Remove the leading bytes. */
- sarl %cl, %eax
testl %eax, %eax
- jz L(aligned_more)
- /* Found CHAR or the null byte. */
+ jz L(more_vecs)
tzcntl %eax, %eax
- addq %rcx, %rax
-# ifdef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
.p2align 4
+L(more_vecs):
+ /* Align data for aligned loads in the loop. */
+ andq $-VEC_SIZE, %rdi
L(aligned_more):
- addq $VEC_SIZE, %rdi
-L(more_4x_vec):
- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vmovdqa (%rdi), %ymm8
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vmovdqa VEC_SIZE(%rdi), %ymm8
+ addq $VEC_SIZE, %rdi
VPCMPEQ %ymm8, %ymm0, %ymm1
VPCMPEQ %ymm8, %ymm9, %ymm2
vpor %ymm1, %ymm2, %ymm1
@@ -133,61 +118,24 @@ L(more_4x_vec):
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x3)
-
- addq $(VEC_SIZE * 4), %rdi
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
-
- .p2align 4
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm5
- vmovdqa VEC_SIZE(%rdi), %ymm6
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
-
- VPCMPEQ %ymm5, %ymm0, %ymm1
- VPCMPEQ %ymm6, %ymm0, %ymm2
- VPCMPEQ %ymm7, %ymm0, %ymm3
- VPCMPEQ %ymm8, %ymm0, %ymm4
+ jz L(prep_loop_4x)
- VPCMPEQ %ymm5, %ymm9, %ymm5
- VPCMPEQ %ymm6, %ymm9, %ymm6
- VPCMPEQ %ymm7, %ymm9, %ymm7
- VPCMPEQ %ymm8, %ymm9, %ymm8
-
- vpor %ymm1, %ymm5, %ymm1
- vpor %ymm2, %ymm6, %ymm2
- vpor %ymm3, %ymm7, %ymm3
- vpor %ymm4, %ymm8, %ymm4
-
- vpor %ymm1, %ymm2, %ymm5
- vpor %ymm3, %ymm4, %ymm6
-
- vpor %ymm5, %ymm6, %ymm5
-
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
- jmp L(loop_4x_vec)
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
.p2align 4
L(first_vec_x0):
- /* Found CHAR or the null byte. */
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
@@ -196,13 +144,9 @@ L(first_vec_x0):
.p2align 4
L(first_vec_x1):
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
-# else
- xorl %edx, %edx
leaq VEC_SIZE(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
@@ -211,44 +155,99 @@ L(first_vec_x1):
.p2align 4
L(first_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
-# else
- xorl %edx, %edx
+ /* Found CHAR or the null byte. */
leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
+L(prep_loop_4x):
+ /* Align data to 4 * VEC_SIZE. */
+ andq $-(VEC_SIZE * 4), %rdi
+
.p2align 4
-L(4x_vec_end):
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
+ vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
+ vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
+
+ /* Leaves only CHARS matching esi as 0. */
+ vpxor %ymm5, %ymm0, %ymm1
+ vpxor %ymm6, %ymm0, %ymm2
+ vpxor %ymm7, %ymm0, %ymm3
+ vpxor %ymm8, %ymm0, %ymm4
+
+ VPMINU %ymm1, %ymm5, %ymm1
+ VPMINU %ymm2, %ymm6, %ymm2
+ VPMINU %ymm3, %ymm7, %ymm3
+ VPMINU %ymm4, %ymm8, %ymm4
+
+ VPMINU %ymm1, %ymm2, %ymm5
+ VPMINU %ymm3, %ymm4, %ymm6
+
+ VPMINU %ymm5, %ymm6, %ymm5
+
+ VPCMPEQ %ymm5, %ymm9, %ymm5
+ vpmovmskb %ymm5, %eax
+
+ addq $(VEC_SIZE * 4), %rdi
+ testl %eax, %eax
+ jz L(loop_4x_vec)
+
+ VPCMPEQ %ymm1, %ymm9, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x0)
+
+ VPCMPEQ %ymm2, %ymm9, %ymm2
vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
+
+ VPCMPEQ %ymm3, %ymm9, %ymm3
+ VPCMPEQ %ymm4, %ymm9, %ymm4
+ vpmovmskb %ymm3, %ecx
vpmovmskb %ymm4, %eax
+ salq $32, %rax
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ /* Cold case for crossing page with first load. */
+ .p2align 4
+L(cross_page_boundary):
+ andq $-VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+
+ vmovdqa (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bits. */
+ sarxl %ecx, %eax, %eax
testl %eax, %eax
-L(first_vec_x3):
+ jz L(aligned_more)
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 3), %rax
+ addq %rcx, %rdi
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
END (STRCHR)
-#endif
+# endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index 583a152794..4dfbe3b58b 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
return OPTIMIZE (avx2);
--
2.29.2
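
The other structural change is the page-cross test at entry. The old
code took the slow path whenever the low bits of the pointer fell in
the upper half of a 2 * VEC_SIZE window; the new code checks the offset
within the actual 4096-byte page, so the cold path only runs when an
unaligned 32-byte load could really cross into an unmapped page. A
minimal C sketch of the new check (helper name invented for
illustration; PAGE_SIZE and VEC_SIZE as defined in the patch):

#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* A VEC_SIZE-byte load at s stays within one page iff the offset of s
   inside its page is at most PAGE_SIZE - VEC_SIZE.  */
static inline int
load_crosses_page (const char *s)
{
  return ((uintptr_t) s & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

The cold path now also strips the leading bits with sarxl rather than
sarl %cl, %eax, which is why the IFUNC selector gains the
CPU_FEATURE_USABLE_P (cpu_features, BMI2) requirement.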