On Mon, Feb 8, 2021 at 6:08 AM H.J. Lu wrote:
>
> On Tue, Feb 2, 2021 at 9:39 PM wrote:
> >
> > From: noah
> >
> > No bug. Just seemed the performance could be improved a bit. Observed
> > and expected behavior are unchanged. Optimized body of main
> > loop. Updated page cross logic and optimized accordingly. Made a few
> > minor instruction selection modifications. No regressions in test
> > suite. Both test-strchrnul and test-strchr passed.
> >
> > Signed-off-by: noah
> > ---
> >  sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
> >  sysdeps/x86_64/multiarch/strchr.c      |   1 +
> >  2 files changed, 118 insertions(+), 118 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > index d416558d04..8b9d78b55a 100644
> > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > @@ -27,10 +27,12 @@
> >  # ifdef USE_AS_WCSCHR
> >  #  define VPBROADCAST vpbroadcastd
> >  #  define VPCMPEQ vpcmpeqd
> > +#  define VPMINU vpminud
> >  #  define CHAR_REG esi
> >  # else
> >  #  define VPBROADCAST vpbroadcastb
> >  #  define VPCMPEQ vpcmpeqb
> > +#  define VPMINU vpminub
> >  #  define CHAR_REG sil
> >  # endif
> >
> > @@ -39,20 +41,26 @@
> >  # endif
> >
> >  # define VEC_SIZE 32
> > +# define PAGE_SIZE 4096
> >
> >  	.section .text.avx,"ax",@progbits
> >  ENTRY (STRCHR)
> >  	movl	%edi, %ecx
> > -	/* Broadcast CHAR to YMM0.  */
> > +# ifndef USE_AS_STRCHRNUL
> > +	xorl	%edx, %edx
> > +# endif
> > +
> > +	/* Broadcast CHAR to YMM0.  */
> >  	vmovd	%esi, %xmm0
> >  	vpxor	%xmm9, %xmm9, %xmm9
> >  	VPBROADCAST %xmm0, %ymm0
> > -	/* Check if we may cross page boundary with one vector load.  */
> > -	andl	$(2 * VEC_SIZE - 1), %ecx
> > -	cmpl	$VEC_SIZE, %ecx
> > -	ja	L(cros_page_boundary)
> > -
> > -	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
> > +
> > +	/* Check if we cross page boundary with one vector load.  */
> > +	andl	$(PAGE_SIZE - 1), %ecx
> > +	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
> > +	ja	L(cross_page_boundary)
> > +
> > +	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
> >  	   null byte.  */
> >  	vmovdqu	(%rdi), %ymm8
> >  	VPCMPEQ	%ymm8, %ymm0, %ymm1
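(Annotation, not part of the patch: the rewritten entry check above takes
the cold path only when the first 32-byte load would actually straddle a
page, whereas the old (2 * VEC_SIZE - 1) test also sent many perfectly
safe unaligned loads to the slow path.  A minimal C sketch of the new
test, with hypothetical names:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE 32

    /* Nonzero when a VEC_SIZE-byte load at ADDR would touch two pages;
       mirrors the andl/cmpl/ja sequence above.  */
    static int
    load_crosses_page (uintptr_t addr)
    {
      return (addr & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
    }

    int
    main (void)
    {
      printf ("%d\n", load_crosses_page (4064)); /* 0: bytes 4064..4095 fit.  */
      printf ("%d\n", load_crosses_page (4065)); /* 1: reaches the next page.  */
      return 0;
    }

End of annotation.)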
> > @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> >  	vpor	%ymm1, %ymm2, %ymm1
> >  	vpmovmskb %ymm1, %eax
> >  	testl	%eax, %eax
> > -	jnz	L(first_vec_x0)
> > -
> > -	/* Align data for aligned loads in the loop.  */
> > -	addq	$VEC_SIZE, %rdi
> > -	andl	$(VEC_SIZE - 1), %ecx
> > -	andq	$-VEC_SIZE, %rdi
> > -
> > -	jmp	L(more_4x_vec)
> > -
> > -	.p2align 4
> > -L(cros_page_boundary):
> > -	andl	$(VEC_SIZE - 1), %ecx
> > -	andq	$-VEC_SIZE, %rdi
> > -	vmovdqu	(%rdi), %ymm8
> > -	VPCMPEQ	%ymm8, %ymm0, %ymm1
> > -	VPCMPEQ	%ymm8, %ymm9, %ymm2
> > -	vpor	%ymm1, %ymm2, %ymm1
> > -	vpmovmskb %ymm1, %eax
> > -	/* Remove the leading bytes.  */
> > -	sarl	%cl, %eax
> > -	testl	%eax, %eax
> > -	jz	L(aligned_more)
> > -	/* Found CHAR or the null byte.  */
> > +	jz	L(more_vecs)
> >  	tzcntl	%eax, %eax
> > -	addq	%rcx, %rax
> > -# ifdef USE_AS_STRCHRNUL
> > +	/* Found CHAR or the null byte.  */
> >  	addq	%rdi, %rax
> > -# else
> > -	xorl	%edx, %edx
> > -	leaq	(%rdi, %rax), %rax
> > -	cmp	(%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > +	cmp	(%rax), %CHAR_REG
> >  	cmovne	%rdx, %rax
> >  # endif
> >  	VZEROUPPER
> >  	ret
> >
> >  	.p2align 4
> > +L(more_vecs):
> > +	/* Align data for aligned loads in the loop.  */
> > +	andq	$-VEC_SIZE, %rdi
> >  L(aligned_more):
> > -	addq	$VEC_SIZE, %rdi
> >
> > -L(more_4x_vec):
> > -	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > -	   since data is only aligned to VEC_SIZE.  */
> > -	vmovdqa	(%rdi), %ymm8
> > +	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > +	   since data is only aligned to VEC_SIZE.  */
> > +	vmovdqa	VEC_SIZE(%rdi), %ymm8
> > +	addq	$VEC_SIZE, %rdi
> >  	VPCMPEQ	%ymm8, %ymm0, %ymm1
> >  	VPCMPEQ	%ymm8, %ymm9, %ymm2
> >  	vpor	%ymm1, %ymm2, %ymm1
> > @@ -125,7 +110,7 @@ L(more_4x_vec):
> >  	vpor	%ymm1, %ymm2, %ymm1
> >  	vpmovmskb %ymm1, %eax
> >  	testl	%eax, %eax
> > -	jnz	L(first_vec_x2)
> > +	jnz	L(first_vec_x2)
> >
> >  	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
> >  	VPCMPEQ	%ymm8, %ymm0, %ymm1
> > @@ -133,122 +118,136 @@ L(more_4x_vec):
> >  	vpor	%ymm1, %ymm2, %ymm1
> >  	vpmovmskb %ymm1, %eax
> >  	testl	%eax, %eax
> > -	jnz	L(first_vec_x3)
> > -
> > -	addq	$(VEC_SIZE * 4), %rdi
> > -
> > -	/* Align data to 4 * VEC_SIZE.  */
> > -	movq	%rdi, %rcx
> > -	andl	$(4 * VEC_SIZE - 1), %ecx
> > -	andq	$-(4 * VEC_SIZE), %rdi
> > -
> > -	.p2align 4
> > -L(loop_4x_vec):
> > -	/* Compare 4 * VEC at a time forward.  */
> > -	vmovdqa	(%rdi), %ymm5
> > -	vmovdqa	VEC_SIZE(%rdi), %ymm6
> > -	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
> > -	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
> > -
> > -	VPCMPEQ	%ymm5, %ymm0, %ymm1
> > -	VPCMPEQ	%ymm6, %ymm0, %ymm2
> > -	VPCMPEQ	%ymm7, %ymm0, %ymm3
> > -	VPCMPEQ	%ymm8, %ymm0, %ymm4
> > -
> > -	VPCMPEQ	%ymm5, %ymm9, %ymm5
> > -	VPCMPEQ	%ymm6, %ymm9, %ymm6
> > -	VPCMPEQ	%ymm7, %ymm9, %ymm7
> > -	VPCMPEQ	%ymm8, %ymm9, %ymm8
> > -
> > -	vpor	%ymm1, %ymm5, %ymm1
> > -	vpor	%ymm2, %ymm6, %ymm2
> > -	vpor	%ymm3, %ymm7, %ymm3
> > -	vpor	%ymm4, %ymm8, %ymm4
> > -
> > -	vpor	%ymm1, %ymm2, %ymm5
> > -	vpor	%ymm3, %ymm4, %ymm6
> > -
> > -	vpor	%ymm5, %ymm6, %ymm5
> > -
> > -	vpmovmskb %ymm5, %eax
> > -	testl	%eax, %eax
> > -	jnz	L(4x_vec_end)
> > -
> > -	addq	$(VEC_SIZE * 4), %rdi
> > +	jz	L(prep_loop_4x)
> >
> > -	jmp	L(loop_4x_vec)
> > +	tzcntl	%eax, %eax
> > +	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
> > +# ifndef USE_AS_STRCHRNUL
> > +	cmp	(%rax), %CHAR_REG
> > +	cmovne	%rdx, %rax
> > +# endif
> > +	VZEROUPPER
> > +	ret
> >
> >  	.p2align 4
> >  L(first_vec_x0):
> > -	/* Found CHAR or the null byte.  */
> >  	tzcntl	%eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > +	/* Found CHAR or the null byte.  */
> >  	addq	%rdi, %rax
> > -# else
> > -	xorl	%edx, %edx
> > -	leaq	(%rdi, %rax), %rax
> > -	cmp	(%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > +	cmp	(%rax), %CHAR_REG
> >  	cmovne	%rdx, %rax
> >  # endif
> >  	VZEROUPPER
> >  	ret
> > -
> > +
> >  	.p2align 4
> >  L(first_vec_x1):
> >  	tzcntl	%eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > -	addq	$VEC_SIZE, %rax
> > -	addq	%rdi, %rax
> > -# else
> > -	xorl	%edx, %edx
> >  	leaq	VEC_SIZE(%rdi, %rax), %rax
> > -	cmp	(%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > +	cmp	(%rax), %CHAR_REG
> >  	cmovne	%rdx, %rax
> >  # endif
> >  	VZEROUPPER
> > -	ret
> > -
> > +	ret
> > +
> >  	.p2align 4
> >  L(first_vec_x2):
> >  	tzcntl	%eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > -	addq	$(VEC_SIZE * 2), %rax
> > -	addq	%rdi, %rax
> > -# else
> > -	xorl	%edx, %edx
> > +	/* Found CHAR or the null byte.  */
> >  	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
> > -	cmp	(%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > +	cmp	(%rax), %CHAR_REG
> >  	cmovne	%rdx, %rax
> >  # endif
> >  	VZEROUPPER
> >  	ret
> > +
> > +L(prep_loop_4x):
> > +	/* Align data to 4 * VEC_SIZE.  */
> > +	andq	$-(VEC_SIZE * 4), %rdi
> >
> >  	.p2align 4
> > -L(4x_vec_end):
> > +L(loop_4x_vec):
> > +	/* Compare 4 * VEC at a time forward.  */
> > +	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
> > +	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
> > +	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
> > +	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
> > +
> > +	/* Leaves only CHARS matching esi as 0.  */
> > +	vpxor	%ymm5, %ymm0, %ymm1
> > +	vpxor	%ymm6, %ymm0, %ymm2
> > +	vpxor	%ymm7, %ymm0, %ymm3
> > +	vpxor	%ymm8, %ymm0, %ymm4
> > +
> > +	VPMINU	%ymm1, %ymm5, %ymm1
> > +	VPMINU	%ymm2, %ymm6, %ymm2
> > +	VPMINU	%ymm3, %ymm7, %ymm3
> > +	VPMINU	%ymm4, %ymm8, %ymm4
> > +
> > +	VPMINU	%ymm1, %ymm2, %ymm5
> > +	VPMINU	%ymm3, %ymm4, %ymm6
> > +
> > +	VPMINU	%ymm5, %ymm6, %ymm5
> > +
> > +	VPCMPEQ	%ymm5, %ymm9, %ymm5
> > +	vpmovmskb %ymm5, %eax
> > +
> > +	addq	$(VEC_SIZE * 4), %rdi
> > +	testl	%eax, %eax
> > +	jz	L(loop_4x_vec)
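(Annotation, not part of the patch: the loop's zero-detection trick is
worth spelling out.  For each input vector x, the byte-wise unsigned
min (x ^ CHAR, x) is zero exactly where x holds CHAR (the XOR gives
zero) or a null byte (the minimum picks the zero), so one VPCMPEQ
against zero at the end replaces the old loop's two compares plus vpor
per vector.  A one-lane C model, names hypothetical:

    #include <stdint.h>
    #include <stdio.h>

    /* One byte lane of the vpxor + VPMINU step in L(loop_4x_vec).  */
    static uint8_t
    match_lane (uint8_t b, uint8_t c)
    {
      uint8_t x = b ^ c;     /* vpxor: 0 iff b == c.  */
      return x < b ? x : b;  /* vpminub: 0 iff b == c or b == 0.  */
    }

    int
    main (void)
    {
      printf ("%d\n", match_lane ('a', 'a') == 0);  /* 1: CHAR found.  */
      printf ("%d\n", match_lane ('\0', 'a') == 0); /* 1: null found.  */
      printf ("%d\n", match_lane ('b', 'a') == 0);  /* 0: no match.  */
      return 0;
    }

End of annotation.)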
> > +
> > +	VPCMPEQ	%ymm1, %ymm9, %ymm1
> >  	vpmovmskb %ymm1, %eax
> >  	testl	%eax, %eax
> >  	jnz	L(first_vec_x0)
> > +
> > +	VPCMPEQ	%ymm2, %ymm9, %ymm2
> >  	vpmovmskb %ymm2, %eax
> >  	testl	%eax, %eax
> >  	jnz	L(first_vec_x1)
> > -	vpmovmskb %ymm3, %eax
> > -	testl	%eax, %eax
> > -	jnz	L(first_vec_x2)
> > +
> > +	VPCMPEQ	%ymm3, %ymm9, %ymm3
> > +	VPCMPEQ	%ymm4, %ymm9, %ymm4
> > +	vpmovmskb %ymm3, %ecx
> >  	vpmovmskb %ymm4, %eax
> > +	salq	$32, %rax
> > +	orq	%rcx, %rax
> > +	tzcntq	%rax, %rax
> > +	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
> > +# ifndef USE_AS_STRCHRNUL
> > +	cmp	(%rax), %CHAR_REG
> > +	cmovne	%rdx, %rax
> > +# endif
> > +	VZEROUPPER
> > +	ret
> > +
> > +	/* Cold case for crossing page with first load.  */
> > +	.p2align 4
> > +L(cross_page_boundary):
> > +	andq	$-VEC_SIZE, %rdi
> > +	andl	$(VEC_SIZE - 1), %ecx
> > +
> > +	vmovdqa	(%rdi), %ymm8
> > +	VPCMPEQ	%ymm8, %ymm0, %ymm1
> > +	VPCMPEQ	%ymm8, %ymm9, %ymm2
> > +	vpor	%ymm1, %ymm2, %ymm1
> > +	vpmovmskb %ymm1, %eax
> > +	/* Remove the leading bits.  */
> > +	sarxl	%ecx, %eax, %eax
> >  	testl	%eax, %eax
> > -L(first_vec_x3):
> > +	jz	L(aligned_more)
> >  	tzcntl	%eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > -	addq	$(VEC_SIZE * 3), %rax
> > +	addq	%rcx, %rdi
> >  	addq	%rdi, %rax
> > -# else
> > -	xorl	%edx, %edx
> > -	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
> > -	cmp	(%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > +	cmp	(%rax), %CHAR_REG
> >  	cmovne	%rdx, %rax
> >  # endif
> >  	VZEROUPPER
> >  	ret
> >
> >  END (STRCHR)
> > -#endif
> > +# endif
> > diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> > index 583a152794..4dfbe3b58b 100644
> > --- a/sysdeps/x86_64/multiarch/strchr.c
> > +++ b/sysdeps/x86_64/multiarch/strchr.c
> > @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
> >
> >    if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> >        && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> >        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> >      return OPTIMIZE (avx2);
> >
> > --
> > 2.29.2
> >
>
> LGTM.
>
> Thanks.
>

This is the updated patch, with the extra whitespace fixed, that I am
checking in.

--
H.J.
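(Annotation, not part of the patch: two details of the new tail are worth
noting.  When the match lies in VEC 3 or VEC 4, the two 32-bit vpmovmskb
masks are fused into one 64-bit word so a single tzcntq locates the first
hit across both vectors.  A C model, names hypothetical:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors the salq $32/orq/tzcntq sequence; MASK3 occupies the low
       32 bits.  As in the assembly, this is only reached when at least
       one bit is set, so the count is well defined.  */
    static unsigned
    first_match_bit (uint32_t mask3, uint32_t mask4)
    {
      uint64_t both = ((uint64_t) mask4 << 32) | mask3;
      return (unsigned) __builtin_ctzll (both);  /* tzcntq  */
    }

    int
    main (void)
    {
      printf ("%u\n", first_match_bit (0x8, 0x1)); /* 3: hit in VEC 3.  */
      printf ("%u\n", first_match_bit (0x0, 0x1)); /* 32: first hit in VEC 4.  */
      return 0;
    }

Second, the page-cross path now uses sarxl, a BMI2 instruction, which is
why the ifunc selector in strchr.c gains the BMI2 check.  End of
annotation.)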