* [PATCH] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr @ 2022-09-22 0:27 Sunil K Pandey 2022-09-22 0:50 ` Noah Goldstein 2022-10-03 18:33 ` Noah Goldstein 0 siblings, 2 replies; 26+ messages in thread From: Sunil K Pandey @ 2022-09-22 0:27 UTC (permalink / raw) To: libc-alpha; +Cc: hjl.tools This patch implements following evex512 version of string functions. evex512 version takes up to 30% less cycle as compared to evex, depending on length and alignment. - memchr function using 512 bit vectors. - rawmemchr function using 512 bit vectors. - wmemchr function using 512 bit vectors. Code size data: memchr-evex.o 762 byte memchr-evex512.o 570 byte (-25%) rawmemchr-evex.o 461 byte rawmemchr-evex512.o 413 byte (-10%) wmemchr-evex.o 794 byte wmemchr-evex512.o 568 byte (-28%) Placeholder function, not used by any processor at the moment. --- sysdeps/x86_64/multiarch/Makefile | 3 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + sysdeps/x86_64/multiarch/memchr-evex-base.S | 306 +++++++++++++++++++ sysdeps/x86_64/multiarch/memchr-evex512.S | 7 + sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + sysdeps/x86_64/multiarch/wmemchr-evex512.S | 8 + 6 files changed, 346 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index df4601c294..e974b1ad97 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,6 +4,7 @@ sysdep_routines += \ memchr-avx2 \ memchr-avx2-rtm \ memchr-evex \ + memchr-evex512 \ memchr-evex-rtm \ memchr-sse2 \ memcmp-avx2-movbe \ @@ -36,6 +37,7 @@ sysdep_routines += \ rawmemchr-avx2 \ rawmemchr-avx2-rtm \ rawmemchr-evex \ + rawmemchr-evex512 \ rawmemchr-evex-rtm \ rawmemchr-sse2 \ stpcpy-avx2 \ @@ -156,6 +158,7 
@@ sysdep_routines += \ wmemchr-avx2 \ wmemchr-avx2-rtm \ wmemchr-evex \ + wmemchr-evex512 \ wmemchr-evex-rtm \ wmemchr-sse2 \ wmemcmp-avx2-movbe \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index a71444eccb..17f770318d 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __memchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __memchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -329,6 +334,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __rawmemchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __rawmemchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -903,6 +913,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wmemchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wmemchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S new file mode 100644 index 0000000000..524f0809b5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S @@ -0,0 +1,306 @@ +/* Placeholder function, not used by any 
processor at the moment. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* UNUSED. Exists purely as reference implementation. */ + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (4) + +# include <sysdep.h> + +# ifdef USE_AS_WMEMCHR +# define CHAR_SIZE 4 +# define VPBROADCAST vpbroadcastd +# define VPCMP vpcmpd +# else +# define CHAR_SIZE 1 +# define VPBROADCAST vpbroadcastb +# define VPCMP vpcmpb +# endif + +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) +# define XMM1 xmm17 + +# if VEC_SIZE == 64 +# define KMOV kmovq +# define KOR korq +# define KORTEST kortestq +# define RAX rax +# define RCX rcx +# define SHR shrq +# define SARX sarxq +# define TEXTSUFFIX evex512 +# define VMM0 zmm16 +# elif VEC_SIZE == 32 +/* Currently Unused. */ +# define KMOV kmovd +# define KOR kord +# define KORTEST kortestd +# define RAX eax +# define RCX ecx +# define SHR shrl +# define SARX sarxl +# define TEXTSUFFIX evex256 +# define VMM0 ymm16 +# endif + + .section .text.TEXTSUFFIX, "ax", @progbits +/* Aligning entry point to 64 byte, provides better performance for + one vector length string. */ +ENTRY_P2ALIGN (MEMCHR, 6) +# ifndef USE_AS_RAWMEMCHR + /* Check for zero length. 
*/ + test %RDX_LP, %RDX_LP + jz L(zero) + +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif +# endif + + /* Broadcast CHAR to VMM0. */ + VPBROADCAST %esi, %VMM0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(page_cross) + + /* Compare [w]char for null, mask bit will be set for match. */ + VPCMP $0, (%rdi), %VMM0, %k0 + + KMOV %k0, %RAX +# ifndef USE_AS_RAWMEMCHR + bsf %RAX, %RCX + jz L(align_more) + xor %eax, %eax +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rcx, CHAR_SIZE), %rdi +# else + addq %rcx, %rdi +# endif + cmp %rcx, %rdx + cmova %rdi, %rax +# else + bsf %RAX, %RAX + jz L(align_more) + add %rdi, %rax +# endif + ret + +# ifndef USE_AS_RAWMEMCHR +L(zero): + xorl %eax, %eax + ret +# endif + + .p2align 5,,5 +L(page_cross): + movq %rdi, %rcx + andq $-VEC_SIZE, %rcx + + VPCMP $0, (%rcx), %VMM0, %k0 + KMOV %k0, %RCX + SARX %RAX, %RCX, %RAX +# ifndef USE_AS_RAWMEMCHR + bsf %RAX, %RCX + jz L(align_more) + xor %eax, %eax +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rcx, CHAR_SIZE), %rdi +# else + addq %rcx, %rdi +# endif + cmp %rcx, %rdx + cmovae %rdi, %rax + +# else + bsf %rax, %rax + jz L(align_more) + add %rdi, %rax +# endif + ret + +L(ret_vec_x2): + subq $-VEC_SIZE, %rdi +L(ret_vec_x1): + bsf %RAX, %RAX +# ifndef USE_AS_RAWMEMCHR + jz L(zero) + cmp %rax, %rdx + jbe L(zero) +# endif +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + add %rdi, %rax +# endif + ret + + .p2align 5,,10 +L(align_more): +# ifndef USE_AS_RAWMEMCHR + xor %eax, %eax + subq %rdi, %rax +# endif + + subq $-VEC_SIZE, %rdi + /* Align rdi to VEC_SIZE. */ + andq $-VEC_SIZE, %rdi + +# ifndef USE_AS_RAWMEMCHR + addq %rdi, %rax +# ifdef USE_AS_WMEMCHR + sarl $2, %eax +# endif + subq %rax, %rdx + jbe L(zero) +# endif + + /* Loop unroll 4 times for 4 vector loop. 
*/ + VPCMP $0, (%rdi), %VMM0, %k0 + + KMOV %k0, %RAX + test %RAX, %RAX + jnz L(ret_vec_x1) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMP $0, VEC_SIZE(%rdi), %VMM0, %k0 + + KMOV %k0, %RAX + test %RAX, %RAX + jnz L(ret_vec_x2) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %VMM0, %k0 + + KMOV %k0, %RAX + test %RAX, %RAX + jnz L(ret_vec_x3) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %VMM0, %k0 + + KMOV %k0, %RAX + test %RAX, %RAX + jnz L(ret_vec_x4) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) + /* Save pointer to find alignment adjustment. */ + movq %rdi, %rax +# endif + /* Align address to VEC_SIZE * 4 for loop. */ + andq $-(VEC_SIZE * 4), %rdi + + /* Add alignment difference to rdx. */ +# ifndef USE_AS_RAWMEMCHR + subq %rdi, %rax +# ifdef USE_AS_WMEMCHR + SHR $2, %RAX +# endif + addq %rax, %rdx + jmp L(loop_entry) +# endif + + /* 4 vector loop. */ + .p2align 5,,11 +L(loop): +# ifndef USE_AS_RAWMEMCHR + subq $(CHAR_PER_VEC * 4), %rdx + jbe L(zero) +L(loop_entry): +# endif + VPCMP $0, (VEC_SIZE * 4)(%rdi), %VMM0, %k1 + VPCMP $0, (VEC_SIZE * 5)(%rdi), %VMM0, %k2 + VPCMP $0, (VEC_SIZE * 6)(%rdi), %VMM0, %k3 + VPCMP $0, (VEC_SIZE * 7)(%rdi), %VMM0, %k4 + KOR %k1, %k2, %k5 + KOR %k3, %k4, %k6 + + subq $-(VEC_SIZE * 4), %rdi + KORTEST %k5, %k6 + jz L(loop) + + KMOV %k1, %RAX + test %RAX, %RAX + jnz L(ret_vec_x1) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + KMOV %k2, %RAX + test %RAX, %RAX + jnz L(ret_vec_x2) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + KMOV %k3, %RAX + test %RAX, %RAX + jnz L(ret_vec_x3) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + /* At this point null [w]char must be in the fourth vector so no + need to check. 
*/ + KMOV %k4, %RAX + +L(ret_vec_x4): + bsf %RAX, %RAX +# ifndef USE_AS_RAWMEMCHR + cmp %rax, %rdx + jbe L(zero) +# endif + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 5,,5 +L(ret_vec_x3): + bsf %RAX, %RAX +# ifndef USE_AS_RAWMEMCHR + cmp %rax, %rdx + jbe L(zero) +# endif + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +END (MEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S new file mode 100644 index 0000000000..47349d817a --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S @@ -0,0 +1,7 @@ +# ifndef MEMCHR +# define MEMCHR __memchr_evex512 +# endif + +#define VEC_SIZE 64 + +#include "memchr-evex-base.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S new file mode 100644 index 0000000000..302d3cb055 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S @@ -0,0 +1,7 @@ +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_evex512 +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR + +#include "memchr-evex512.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S new file mode 100644 index 0000000000..f45ed1db75 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S @@ -0,0 +1,8 @@ +#ifndef WMEMCHR +# define WMEMCHR __wmemchr_evex512 +#endif + +#define MEMCHR WMEMCHR +#define USE_AS_WMEMCHR 1 + +#include "memchr-evex512.S" -- 2.36.1 ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-09-22 0:27 [PATCH] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr Sunil K Pandey @ 2022-09-22 0:50 ` Noah Goldstein 2022-09-23 3:57 ` Sunil Pandey 2022-10-03 18:33 ` Noah Goldstein 1 sibling, 1 reply; 26+ messages in thread From: Noah Goldstein @ 2022-09-22 0:50 UTC (permalink / raw) To: Sunil K Pandey; +Cc: GNU C Library On Wed, Sep 21, 2022 at 5:27 PM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > This patch implements following evex512 version of string functions. > evex512 version takes up to 30% less cycle as compared to evex, > depending on length and alignment. Please attach benchmark numbers. > > - memchr function using 512 bit vectors. > - rawmemchr function using 512 bit vectors. > - wmemchr function using 512 bit vectors. > > Code size data: > > memchr-evex.o 762 byte > memchr-evex512.o 570 byte (-25%) > > rawmemchr-evex.o 461 byte > rawmemchr-evex512.o 413 byte (-10%) > > wmemchr-evex.o 794 byte > wmemchr-evex512.o 568 byte (-28%) > > Placeholder function, not used by any processor at the moment. 
> --- > sysdeps/x86_64/multiarch/Makefile | 3 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > sysdeps/x86_64/multiarch/memchr-evex-base.S | 306 +++++++++++++++++++ > sysdeps/x86_64/multiarch/memchr-evex512.S | 7 + > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 8 + > 6 files changed, 346 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index df4601c294..e974b1ad97 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -4,6 +4,7 @@ sysdep_routines += \ > memchr-avx2 \ > memchr-avx2-rtm \ > memchr-evex \ > + memchr-evex512 \ > memchr-evex-rtm \ > memchr-sse2 \ > memcmp-avx2-movbe \ > @@ -36,6 +37,7 @@ sysdep_routines += \ > rawmemchr-avx2 \ > rawmemchr-avx2-rtm \ > rawmemchr-evex \ > + rawmemchr-evex512 \ > rawmemchr-evex-rtm \ > rawmemchr-sse2 \ > stpcpy-avx2 \ > @@ -156,6 +158,7 @@ sysdep_routines += \ > wmemchr-avx2 \ > wmemchr-avx2-rtm \ > wmemchr-evex \ > + wmemchr-evex512 \ > wmemchr-evex-rtm \ > wmemchr-sse2 \ > wmemcmp-avx2-movbe \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index a71444eccb..17f770318d 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __memchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __memchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, 
memchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > @@ -329,6 +334,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __rawmemchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __rawmemchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > @@ -903,6 +913,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __wmemchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __wmemchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > new file mode 100644 > index 0000000000..524f0809b5 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > @@ -0,0 +1,306 @@ > +/* Placeholder function, not used by any processor at the moment. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. 
> + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* UNUSED. Exists purely as reference implementation. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (4) > + > +# include <sysdep.h> > + > +# ifdef USE_AS_WMEMCHR > +# define CHAR_SIZE 4 > +# define VPBROADCAST vpbroadcastd > +# define VPCMP vpcmpd > +# else > +# define CHAR_SIZE 1 > +# define VPBROADCAST vpbroadcastb > +# define VPCMP vpcmpb > +# endif > + > +# define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > +# define XMM1 xmm17 > + > +# if VEC_SIZE == 64 > +# define KMOV kmovq > +# define KOR korq > +# define KORTEST kortestq > +# define RAX rax > +# define RCX rcx > +# define SHR shrq > +# define SARX sarxq > +# define TEXTSUFFIX evex512 > +# define VMM0 zmm16 > +# elif VEC_SIZE == 32 > +/* Currently Unused. */ > +# define KMOV kmovd > +# define KOR kord > +# define KORTEST kortestd > +# define RAX eax > +# define RCX ecx > +# define SHR shrl > +# define SARX sarxl > +# define TEXTSUFFIX evex256 > +# define VMM0 ymm16 > +# endif > + > + .section .text.TEXTSUFFIX, "ax", @progbits > +/* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > +ENTRY_P2ALIGN (MEMCHR, 6) > +# ifndef USE_AS_RAWMEMCHR > + /* Check for zero length. */ > + test %RDX_LP, %RDX_LP > + jz L(zero) > + > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %edx, %edx > +# endif > +# endif > + > + /* Broadcast CHAR to VMM0. */ > + VPBROADCAST %esi, %VMM0 > + movl %edi, %eax > + andl $(PAGE_SIZE - 1), %eax > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(page_cross) > + > + /* Compare [w]char for null, mask bit will be set for match. 
*/ > + VPCMP $0, (%rdi), %VMM0, %k0 > + > + KMOV %k0, %RAX > +# ifndef USE_AS_RAWMEMCHR > + bsf %RAX, %RCX > + jz L(align_more) > + xor %eax, %eax > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > +# else > + addq %rcx, %rdi > +# endif > + cmp %rcx, %rdx > + cmova %rdi, %rax > +# else > + bsf %RAX, %RAX > + jz L(align_more) > + add %rdi, %rax > +# endif > + ret > + > +# ifndef USE_AS_RAWMEMCHR > +L(zero): > + xorl %eax, %eax > + ret > +# endif > + > + .p2align 5,,5 > +L(page_cross): > + movq %rdi, %rcx > + andq $-VEC_SIZE, %rcx > + > + VPCMP $0, (%rcx), %VMM0, %k0 > + KMOV %k0, %RCX > + SARX %RAX, %RCX, %RAX > +# ifndef USE_AS_RAWMEMCHR > + bsf %RAX, %RCX > + jz L(align_more) > + xor %eax, %eax > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > +# else > + addq %rcx, %rdi > +# endif > + cmp %rcx, %rdx > + cmovae %rdi, %rax > + > +# else > + bsf %rax, %rax > + jz L(align_more) > + add %rdi, %rax > +# endif > + ret > + > +L(ret_vec_x2): > + subq $-VEC_SIZE, %rdi > +L(ret_vec_x1): > + bsf %RAX, %RAX > +# ifndef USE_AS_RAWMEMCHR > + jz L(zero) > + cmp %rax, %rdx > + jbe L(zero) > +# endif > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + add %rdi, %rax > +# endif > + ret > + > + .p2align 5,,10 > +L(align_more): > +# ifndef USE_AS_RAWMEMCHR > + xor %eax, %eax > + subq %rdi, %rax > +# endif > + > + subq $-VEC_SIZE, %rdi > + /* Align rdi to VEC_SIZE. */ > + andq $-VEC_SIZE, %rdi > + > +# ifndef USE_AS_RAWMEMCHR > + addq %rdi, %rax > +# ifdef USE_AS_WMEMCHR > + sarl $2, %eax > +# endif > + subq %rax, %rdx > + jbe L(zero) > +# endif > + > + /* Loop unroll 4 times for 4 vector loop. 
*/ > + VPCMP $0, (%rdi), %VMM0, %k0 > + > + KMOV %k0, %RAX > + test %RAX, %RAX > + jnz L(ret_vec_x1) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMP $0, VEC_SIZE(%rdi), %VMM0, %k0 > + > + KMOV %k0, %RAX > + test %RAX, %RAX > + jnz L(ret_vec_x2) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %VMM0, %k0 > + > + KMOV %k0, %RAX > + test %RAX, %RAX > + jnz L(ret_vec_x3) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMP $0, (VEC_SIZE * 3)(%rdi), %VMM0, %k0 > + > + KMOV %k0, %RAX > + test %RAX, %RAX > + jnz L(ret_vec_x4) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > + /* Save pointer to find alignment adjustment. */ > + movq %rdi, %rax > +# endif > + /* Align address to VEC_SIZE * 4 for loop. */ > + andq $-(VEC_SIZE * 4), %rdi > + > + /* Add alignment difference to rdx. */ > +# ifndef USE_AS_RAWMEMCHR > + subq %rdi, %rax > +# ifdef USE_AS_WMEMCHR > + SHR $2, %RAX > +# endif > + addq %rax, %rdx > + jmp L(loop_entry) > +# endif > + > + /* 4 vector loop. 
*/ > + .p2align 5,,11 > +L(loop): > +# ifndef USE_AS_RAWMEMCHR > + subq $(CHAR_PER_VEC * 4), %rdx > + jbe L(zero) > +L(loop_entry): > +# endif > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %VMM0, %k1 > + VPCMP $0, (VEC_SIZE * 5)(%rdi), %VMM0, %k2 > + VPCMP $0, (VEC_SIZE * 6)(%rdi), %VMM0, %k3 > + VPCMP $0, (VEC_SIZE * 7)(%rdi), %VMM0, %k4 > + KOR %k1, %k2, %k5 > + KOR %k3, %k4, %k6 > + > + subq $-(VEC_SIZE * 4), %rdi > + KORTEST %k5, %k6 > + jz L(loop) > + > + KMOV %k1, %RAX > + test %RAX, %RAX > + jnz L(ret_vec_x1) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + KMOV %k2, %RAX > + test %RAX, %RAX > + jnz L(ret_vec_x2) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + KMOV %k3, %RAX > + test %RAX, %RAX > + jnz L(ret_vec_x3) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + /* At this point null [w]char must be in the fourth vector so no > + need to check. */ > + KMOV %k4, %RAX > + > +L(ret_vec_x4): > + bsf %RAX, %RAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > + .p2align 5,,5 > +L(ret_vec_x3): > + bsf %RAX, %RAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > +END (MEMCHR) > +#endif > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > new file mode 100644 > index 0000000000..47349d817a > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > @@ -0,0 +1,7 @@ > +# ifndef MEMCHR > +# define MEMCHR __memchr_evex512 > +# endif > + > +#define VEC_SIZE 64 > + > +#include "memchr-evex-base.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > new file mode 100644 > index 0000000000..302d3cb055 > --- /dev/null > +++ 
b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > @@ -0,0 +1,7 @@ > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_evex512 > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > + > +#include "memchr-evex512.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > new file mode 100644 > index 0000000000..f45ed1db75 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > @@ -0,0 +1,8 @@ > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_evex512 > +#endif > + > +#define MEMCHR WMEMCHR > +#define USE_AS_WMEMCHR 1 > + > +#include "memchr-evex512.S" > -- > 2.36.1 > ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-09-22 0:50 ` Noah Goldstein @ 2022-09-23 3:57 ` Sunil Pandey 2022-09-29 3:42 ` Sunil Pandey 0 siblings, 1 reply; 26+ messages in thread From: Sunil Pandey @ 2022-09-23 3:57 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library [-- Attachment #1: Type: text/plain, Size: 15770 bytes --] Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Wed, Sep 21, 2022 at 5:27 PM Sunil K Pandey via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > This patch implements following evex512 version of string functions. > > evex512 version takes up to 30% less cycle as compared to evex, > > depending on length and alignment. > > Please attach benchmark numbers. > > > > - memchr function using 512 bit vectors. > > - rawmemchr function using 512 bit vectors. > > - wmemchr function using 512 bit vectors. > > > > Code size data: > > > > memchr-evex.o 762 byte > > memchr-evex512.o 570 byte (-25%) > > > > rawmemchr-evex.o 461 byte > > rawmemchr-evex512.o 413 byte (-10%) > > > > wmemchr-evex.o 794 byte > > wmemchr-evex512.o 568 byte (-28%) > > > > Placeholder function, not used by any processor at the moment. 
> > --- > > sysdeps/x86_64/multiarch/Makefile | 3 + > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > > sysdeps/x86_64/multiarch/memchr-evex-base.S | 306 +++++++++++++++++++ > > sysdeps/x86_64/multiarch/memchr-evex512.S | 7 + > > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 8 + > > 6 files changed, 346 insertions(+) > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > index df4601c294..e974b1ad97 100644 > > --- a/sysdeps/x86_64/multiarch/Makefile > > +++ b/sysdeps/x86_64/multiarch/Makefile > > @@ -4,6 +4,7 @@ sysdep_routines += \ > > memchr-avx2 \ > > memchr-avx2-rtm \ > > memchr-evex \ > > + memchr-evex512 \ > > memchr-evex-rtm \ > > memchr-sse2 \ > > memcmp-avx2-movbe \ > > @@ -36,6 +37,7 @@ sysdep_routines += \ > > rawmemchr-avx2 \ > > rawmemchr-avx2-rtm \ > > rawmemchr-evex \ > > + rawmemchr-evex512 \ > > rawmemchr-evex-rtm \ > > rawmemchr-sse2 \ > > stpcpy-avx2 \ > > @@ -156,6 +158,7 @@ sysdep_routines += \ > > wmemchr-avx2 \ > > wmemchr-avx2-rtm \ > > wmemchr-evex \ > > + wmemchr-evex512 \ > > wmemchr-evex-rtm \ > > wmemchr-sse2 \ > > wmemcmp-avx2-movbe \ > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index a71444eccb..17f770318d 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __memchr_evex) > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE 
(AVX512BW) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __memchr_evex512) > > X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > @@ -329,6 +334,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __rawmemchr_evex) > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE (AVX512BW) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __rawmemchr_evex512) > > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > @@ -903,6 +913,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __wmemchr_evex) > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE (AVX512BW) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __wmemchr_evex512) > > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > new file mode 100644 > > index 0000000000..524f0809b5 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > @@ -0,0 +1,306 @@ > > +/* Placeholder function, not used by any processor at the moment. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. 
> > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +/* UNUSED. Exists purely as reference implementation. */ > > + > > +#include <isa-level.h> > > + > > +#if ISA_SHOULD_BUILD (4) > > + > > +# include <sysdep.h> > > + > > +# ifdef USE_AS_WMEMCHR > > +# define CHAR_SIZE 4 > > +# define VPBROADCAST vpbroadcastd > > +# define VPCMP vpcmpd > > +# else > > +# define CHAR_SIZE 1 > > +# define VPBROADCAST vpbroadcastb > > +# define VPCMP vpcmpb > > +# endif > > + > > +# define PAGE_SIZE 4096 > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > +# define XMM1 xmm17 > > + > > +# if VEC_SIZE == 64 > > +# define KMOV kmovq > > +# define KOR korq > > +# define KORTEST kortestq > > +# define RAX rax > > +# define RCX rcx > > +# define SHR shrq > > +# define SARX sarxq > > +# define TEXTSUFFIX evex512 > > +# define VMM0 zmm16 > > +# elif VEC_SIZE == 32 > > +/* Currently Unused. */ > > +# define KMOV kmovd > > +# define KOR kord > > +# define KORTEST kortestd > > +# define RAX eax > > +# define RCX ecx > > +# define SHR shrl > > +# define SARX sarxl > > +# define TEXTSUFFIX evex256 > > +# define VMM0 ymm16 > > +# endif > > + > > + .section .text.TEXTSUFFIX, "ax", @progbits > > +/* Aligning entry point to 64 byte, provides better performance for > > + one vector length string. */ > > +ENTRY_P2ALIGN (MEMCHR, 6) > > +# ifndef USE_AS_RAWMEMCHR > > + /* Check for zero length. */ > > + test %RDX_LP, %RDX_LP > > + jz L(zero) > > + > > +# ifdef __ILP32__ > > + /* Clear the upper 32 bits. */ > > + movl %edx, %edx > > +# endif > > +# endif > > + > > + /* Broadcast CHAR to VMM0. 
*/ > > + VPBROADCAST %esi, %VMM0 > > + movl %edi, %eax > > + andl $(PAGE_SIZE - 1), %eax > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > + ja L(page_cross) > > + > > + /* Compare [w]char for null, mask bit will be set for match. */ > > + VPCMP $0, (%rdi), %VMM0, %k0 > > + > > + KMOV %k0, %RAX > > +# ifndef USE_AS_RAWMEMCHR > > + bsf %RAX, %RCX > > + jz L(align_more) > > + xor %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > +# else > > + addq %rcx, %rdi > > +# endif > > + cmp %rcx, %rdx > > + cmova %rdi, %rax > > +# else > > + bsf %RAX, %RAX > > + jz L(align_more) > > + add %rdi, %rax > > +# endif > > + ret > > + > > +# ifndef USE_AS_RAWMEMCHR > > +L(zero): > > + xorl %eax, %eax > > + ret > > +# endif > > + > > + .p2align 5,,5 > > +L(page_cross): > > + movq %rdi, %rcx > > + andq $-VEC_SIZE, %rcx > > + > > + VPCMP $0, (%rcx), %VMM0, %k0 > > + KMOV %k0, %RCX > > + SARX %RAX, %RCX, %RAX > > +# ifndef USE_AS_RAWMEMCHR > > + bsf %RAX, %RCX > > + jz L(align_more) > > + xor %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > +# else > > + addq %rcx, %rdi > > +# endif > > + cmp %rcx, %rdx > > + cmovae %rdi, %rax > > + > > +# else > > + bsf %rax, %rax > > + jz L(align_more) > > + add %rdi, %rax > > +# endif > > + ret > > + > > +L(ret_vec_x2): > > + subq $-VEC_SIZE, %rdi > > +L(ret_vec_x1): > > + bsf %RAX, %RAX > > +# ifndef USE_AS_RAWMEMCHR > > + jz L(zero) > > + cmp %rax, %rdx > > + jbe L(zero) > > +# endif > > +# ifdef USE_AS_WMEMCHR > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > +# else > > + add %rdi, %rax > > +# endif > > + ret > > + > > + .p2align 5,,10 > > +L(align_more): > > +# ifndef USE_AS_RAWMEMCHR > > + xor %eax, %eax > > + subq %rdi, %rax > > +# endif > > + > > + subq $-VEC_SIZE, %rdi > > + /* Align rdi to VEC_SIZE. 
*/ > > + andq $-VEC_SIZE, %rdi > > + > > +# ifndef USE_AS_RAWMEMCHR > > + addq %rdi, %rax > > +# ifdef USE_AS_WMEMCHR > > + sarl $2, %eax > > +# endif > > + subq %rax, %rdx > > + jbe L(zero) > > +# endif > > + > > + /* Loop unroll 4 times for 4 vector loop. */ > > + VPCMP $0, (%rdi), %VMM0, %k0 > > + > > + KMOV %k0, %RAX > > + test %RAX, %RAX > > + jnz L(ret_vec_x1) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + VPCMP $0, VEC_SIZE(%rdi), %VMM0, %k0 > > + > > + KMOV %k0, %RAX > > + test %RAX, %RAX > > + jnz L(ret_vec_x2) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %VMM0, %k0 > > + > > + KMOV %k0, %RAX > > + test %RAX, %RAX > > + jnz L(ret_vec_x3) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + VPCMP $0, (VEC_SIZE * 3)(%rdi), %VMM0, %k0 > > + > > + KMOV %k0, %RAX > > + test %RAX, %RAX > > + jnz L(ret_vec_x4) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > + /* Save pointer to find alignment adjustment. */ > > + movq %rdi, %rax > > +# endif > > + /* Align address to VEC_SIZE * 4 for loop. */ > > + andq $-(VEC_SIZE * 4), %rdi > > + > > + /* Add alignment difference to rdx. */ > > +# ifndef USE_AS_RAWMEMCHR > > + subq %rdi, %rax > > +# ifdef USE_AS_WMEMCHR > > + SHR $2, %RAX > > +# endif > > + addq %rax, %rdx > > + jmp L(loop_entry) > > +# endif > > + > > + /* 4 vector loop. 
*/ > > + .p2align 5,,11 > > +L(loop): > > +# ifndef USE_AS_RAWMEMCHR > > + subq $(CHAR_PER_VEC * 4), %rdx > > + jbe L(zero) > > +L(loop_entry): > > +# endif > > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %VMM0, %k1 > > + VPCMP $0, (VEC_SIZE * 5)(%rdi), %VMM0, %k2 > > + VPCMP $0, (VEC_SIZE * 6)(%rdi), %VMM0, %k3 > > + VPCMP $0, (VEC_SIZE * 7)(%rdi), %VMM0, %k4 > > + KOR %k1, %k2, %k5 > > + KOR %k3, %k4, %k6 > > + > > + subq $-(VEC_SIZE * 4), %rdi > > + KORTEST %k5, %k6 > > + jz L(loop) > > + > > + KMOV %k1, %RAX > > + test %RAX, %RAX > > + jnz L(ret_vec_x1) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + KMOV %k2, %RAX > > + test %RAX, %RAX > > + jnz L(ret_vec_x2) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + KMOV %k3, %RAX > > + test %RAX, %RAX > > + jnz L(ret_vec_x3) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + /* At this point null [w]char must be in the fourth vector so no > > + need to check. 
*/ > > + KMOV %k4, %RAX > > + > > +L(ret_vec_x4): > > + bsf %RAX, %RAX > > +# ifndef USE_AS_RAWMEMCHR > > + cmp %rax, %rdx > > + jbe L(zero) > > +# endif > > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > + .p2align 5,,5 > > +L(ret_vec_x3): > > + bsf %RAX, %RAX > > +# ifndef USE_AS_RAWMEMCHR > > + cmp %rax, %rdx > > + jbe L(zero) > > +# endif > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > +END (MEMCHR) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > > new file mode 100644 > > index 0000000000..47349d817a > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > > @@ -0,0 +1,7 @@ > > +# ifndef MEMCHR > > +# define MEMCHR __memchr_evex512 > > +# endif > > + > > +#define VEC_SIZE 64 > > + > > +#include "memchr-evex-base.S" > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > new file mode 100644 > > index 0000000000..302d3cb055 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > @@ -0,0 +1,7 @@ > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_evex512 > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > + > > +#include "memchr-evex512.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > new file mode 100644 > > index 0000000000..f45ed1db75 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > @@ -0,0 +1,8 @@ > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_evex512 > > +#endif > > + > > +#define MEMCHR WMEMCHR > > +#define USE_AS_WMEMCHR 1 > > + > > +#include "memchr-evex512.S" > > -- > > 2.36.1 > > [-- Attachment #2: wmemchr.txt --] [-- Type: text/plain, Size: 23018 bytes --] Function: wmemchr Variant: __wmemchr_evex __wmemchr_evex512 
======================================================================================================================== len=256, align=1, pos=64: 22.16 15.79 ( 28.77%) len=256, align=1, pos=64: 20.04 13.88 ( 30.77%) len=256, align=2, pos=64: 18.07 12.92 ( 28.51%) len=256, align=2, pos=64: 17.02 12.13 ( 28.72%) len=256, align=3, pos=64: 16.10 11.33 ( 29.66%) len=256, align=3, pos=64: 15.18 11.36 ( 25.16%) len=256, align=4, pos=64: 15.20 11.11 ( 26.86%) len=256, align=4, pos=64: 15.16 11.15 ( 26.46%) len=256, align=5, pos=64: 15.19 11.10 ( 26.89%) len=256, align=5, pos=64: 15.20 11.19 ( 26.37%) len=256, align=6, pos=64: 15.19 11.10 ( 26.95%) len=256, align=6, pos=64: 15.20 11.17 ( 26.51%) len=256, align=7, pos=64: 15.16 11.19 ( 26.19%) len=256, align=7, pos=64: 15.07 11.11 ( 26.27%) len=192, align=1, pos=32: 9.40 9.11 ( 3.10%) len=192, align=1, pos=32: 9.33 9.18 ( 1.67%) len=256, align=1, pos=32: 9.38 9.12 ( 2.70%) len=256, align=1, pos=32: 9.32 9.14 ( 1.87%) len=512, align=1, pos=32: 9.36 9.12 ( 2.51%) len=512, align=1, pos=32: 9.39 9.12 ( 2.91%) len=192, align=2, pos=64: 15.20 11.15 ( 26.66%) len=192, align=2, pos=64: 15.22 11.14 ( 26.82%) len=256, align=2, pos=64: 15.20 11.11 ( 26.87%) len=256, align=2, pos=64: 15.20 11.14 ( 26.71%) len=512, align=2, pos=64: 15.17 11.15 ( 26.47%) len=512, align=2, pos=64: 15.21 11.16 ( 26.59%) len=192, align=3, pos=96: 16.78 15.82 ( 5.73%) len=192, align=3, pos=96: 16.48 15.82 ( 3.98%) len=256, align=3, pos=96: 16.36 15.82 ( 3.27%) len=256, align=3, pos=96: 16.51 15.79 ( 4.33%) len=512, align=3, pos=96: 16.49 15.82 ( 4.07%) len=512, align=3, pos=96: 16.50 15.82 ( 4.13%) len=192, align=4, pos=128: 17.88 18.49 ( -3.41%) len=192, align=4, pos=128: 17.79 18.47 ( -3.84%) len=256, align=4, pos=128: 17.93 18.60 ( -3.76%) len=256, align=4, pos=128: 17.79 18.51 ( -4.05%) len=512, align=4, pos=128: 17.79 18.47 ( -3.78%) len=512, align=4, pos=128: 17.69 18.48 ( -4.50%) len=192, align=5, pos=160: 19.44 17.83 ( 8.28%) len=192, align=5, 
pos=160: 19.07 17.83 ( 6.51%) len=256, align=5, pos=160: 19.10 17.84 ( 6.61%) len=256, align=5, pos=160: 19.09 17.83 ( 6.58%) len=512, align=5, pos=160: 19.07 17.79 ( 6.70%) len=512, align=5, pos=160: 19.07 17.84 ( 6.47%) len=192, align=6, pos=192: 20.28 19.26 ( 5.05%) len=192, align=6, pos=192: 20.39 19.83 ( 2.76%) len=256, align=6, pos=192: 20.40 20.49 ( -0.44%) len=256, align=6, pos=192: 20.40 20.46 ( -0.29%) len=512, align=6, pos=192: 20.40 20.45 ( -0.28%) len=512, align=6, pos=192: 20.37 20.47 ( -0.49%) len=192, align=7, pos=224: 20.27 19.80 ( 2.32%) len=192, align=7, pos=224: 20.37 19.15 ( 6.02%) len=256, align=7, pos=224: 22.02 19.83 ( 9.95%) len=256, align=7, pos=224: 25.78 19.80 ( 23.18%) len=512, align=7, pos=224: 22.02 19.80 ( 10.09%) len=512, align=7, pos=224: 23.96 19.80 ( 17.35%) len=2, align=0, pos=1: 4.58 5.71 (-24.58%) len=2, align=0, pos=1: 4.67 5.34 (-14.49%) len=2, align=1, pos=1: 4.67 5.66 (-21.20%) len=2, align=1, pos=1: 4.67 5.61 (-20.28%) len=0, align=0, pos=1: 4.00 4.00 ( -0.01%) len=0, align=0, pos=1: 4.00 4.00 ( -0.00%) len=0, align=1, pos=1: 4.22 4.00 ( 5.12%) len=0, align=1, pos=1: 4.00 4.04 ( -1.03%) len=3, align=0, pos=2: 4.68 5.34 (-14.08%) len=3, align=0, pos=2: 4.67 5.34 (-14.44%) len=3, align=2, pos=2: 4.67 5.66 (-21.18%) len=3, align=2, pos=2: 4.94 6.08 (-22.93%) len=1, align=0, pos=2: 4.67 5.34 (-14.50%) len=1, align=0, pos=2: 4.67 5.30 (-13.55%) len=1, align=2, pos=2: 4.67 5.61 (-20.25%) len=1, align=2, pos=2: 4.67 5.62 (-20.34%) len=4, align=0, pos=3: 4.67 5.30 (-13.60%) len=4, align=0, pos=3: 4.67 5.30 (-13.59%) len=4, align=3, pos=3: 4.67 5.62 (-20.32%) len=4, align=3, pos=3: 4.67 5.62 (-20.36%) len=2, align=0, pos=3: 4.71 5.30 (-12.54%) len=2, align=0, pos=3: 4.71 5.30 (-12.55%) len=2, align=3, pos=3: 4.71 5.62 (-19.33%) len=2, align=3, pos=3: 4.67 5.62 (-20.31%) len=5, align=0, pos=4: 4.67 5.30 (-13.59%) len=5, align=0, pos=4: 4.67 5.30 (-13.57%) len=5, align=4, pos=4: 4.67 5.62 (-20.31%) len=5, align=4, pos=4: 4.67 5.66 
(-21.20%) len=3, align=0, pos=4: 4.67 5.34 (-14.42%) len=3, align=0, pos=4: 4.96 5.72 (-15.42%) len=3, align=4, pos=4: 4.71 5.62 (-19.32%) len=3, align=4, pos=4: 4.93 6.03 (-22.22%) len=6, align=0, pos=5: 4.67 5.30 (-13.54%) len=6, align=0, pos=5: 4.67 5.34 (-14.40%) len=6, align=5, pos=5: 4.67 6.15 (-31.82%) len=6, align=5, pos=5: 4.67 5.62 (-20.38%) len=4, align=0, pos=5: 4.67 5.30 (-13.56%) len=4, align=0, pos=5: 4.94 5.67 (-14.80%) len=4, align=5, pos=5: 4.67 5.65 (-21.16%) len=4, align=5, pos=5: 4.82 6.03 (-25.22%) len=7, align=0, pos=6: 4.71 5.30 (-12.54%) len=7, align=0, pos=6: 4.67 5.30 (-13.60%) len=7, align=6, pos=6: 4.67 5.61 (-20.30%) len=7, align=6, pos=6: 4.67 5.66 (-21.21%) len=5, align=0, pos=6: 4.72 5.30 (-12.09%) len=5, align=0, pos=6: 4.67 5.30 (-13.56%) len=5, align=6, pos=6: 4.67 5.65 (-21.02%) len=5, align=6, pos=6: 4.87 6.04 (-23.99%) len=8, align=0, pos=7: 4.67 5.30 (-13.55%) len=8, align=0, pos=7: 4.67 5.34 (-14.44%) len=8, align=7, pos=7: 5.23 6.02 (-15.06%) len=8, align=7, pos=7: 4.90 5.62 (-14.58%) len=6, align=0, pos=7: 4.67 5.34 (-14.42%) len=6, align=0, pos=7: 4.71 5.30 (-12.53%) len=6, align=7, pos=7: 4.90 5.61 (-14.41%) len=6, align=7, pos=7: 4.91 5.62 (-14.45%) len=9, align=0, pos=8: 8.70 5.72 ( 34.24%) len=9, align=0, pos=8: 8.82 5.70 ( 35.31%) len=9, align=8, pos=8: 8.71 5.62 ( 35.49%) len=9, align=8, pos=8: 8.67 5.66 ( 34.72%) len=7, align=0, pos=8: 4.67 5.30 (-13.50%) len=7, align=0, pos=8: 4.67 5.34 (-14.44%) len=7, align=8, pos=8: 4.91 5.61 (-14.44%) len=7, align=8, pos=8: 4.90 5.66 (-15.40%) len=10, align=0, pos=9: 8.87 5.73 ( 35.43%) len=10, align=0, pos=9: 8.67 5.72 ( 34.08%) len=10, align=9, pos=9: 8.66 6.04 ( 30.17%) len=10, align=9, pos=9: 8.70 6.04 ( 30.60%) len=8, align=0, pos=9: 4.96 5.70 (-15.04%) len=8, align=0, pos=9: 4.67 5.30 (-13.61%) len=8, align=9, pos=9: 5.26 5.66 ( -7.57%) len=8, align=9, pos=9: 5.55 6.04 ( -8.77%) len=11, align=0, pos=10: 8.67 5.34 ( 38.39%) len=11, align=0, pos=10: 8.67 5.34 ( 38.37%) 
len=11, align=10, pos=10: 8.67 5.66 ( 34.74%) len=11, align=10, pos=10: 8.67 5.62 ( 35.17%) len=9, align=0, pos=10: 7.75 5.72 ( 26.17%) len=9, align=0, pos=10: 7.78 5.71 ( 26.68%) len=9, align=10, pos=10: 7.71 5.62 ( 27.13%) len=9, align=10, pos=10: 7.67 5.66 ( 26.23%) len=12, align=0, pos=11: 8.67 5.34 ( 38.42%) len=12, align=0, pos=11: 8.67 5.34 ( 38.38%) len=12, align=11, pos=11: 8.67 5.65 ( 34.76%) len=12, align=11, pos=11: 8.67 6.04 ( 30.29%) len=10, align=0, pos=11: 7.79 5.73 ( 26.47%) len=10, align=0, pos=11: 7.76 5.72 ( 26.23%) len=10, align=11, pos=11: 7.76 6.03 ( 22.28%) len=10, align=11, pos=11: 7.71 5.61 ( 27.17%) len=13, align=0, pos=12: 8.71 5.30 ( 39.14%) len=13, align=0, pos=12: 8.67 5.34 ( 38.37%) len=13, align=12, pos=12: 8.00 5.66 ( 29.32%) len=13, align=12, pos=12: 8.00 5.66 ( 29.26%) len=11, align=0, pos=12: 7.69 5.30 ( 31.08%) len=11, align=0, pos=12: 7.69 5.30 ( 31.06%) len=11, align=12, pos=12: 7.67 5.62 ( 26.76%) len=11, align=12, pos=12: 7.71 6.04 ( 21.66%) len=14, align=0, pos=13: 8.67 5.73 ( 33.86%) len=14, align=0, pos=13: 8.69 5.73 ( 34.06%) len=14, align=13, pos=13: 8.10 6.01 ( 25.73%) len=14, align=13, pos=13: 8.04 6.03 ( 25.03%) len=12, align=0, pos=13: 7.72 5.74 ( 25.70%) len=12, align=0, pos=13: 7.78 5.74 ( 26.26%) len=12, align=13, pos=13: 8.29 6.04 ( 27.18%) len=12, align=13, pos=13: 8.26 6.03 ( 26.91%) len=15, align=0, pos=14: 8.68 5.83 ( 32.82%) len=15, align=0, pos=14: 8.65 5.72 ( 33.87%) len=15, align=14, pos=14: 8.03 6.04 ( 24.84%) len=15, align=14, pos=14: 8.00 5.66 ( 29.29%) len=13, align=0, pos=14: 8.14 5.83 ( 28.35%) len=13, align=0, pos=14: 7.69 5.34 ( 30.54%) len=13, align=14, pos=14: 8.22 5.66 ( 31.17%) len=13, align=14, pos=14: 9.33 5.65 ( 39.48%) len=16, align=0, pos=15: 8.67 5.30 ( 38.86%) len=16, align=0, pos=15: 8.67 5.34 ( 38.36%) len=16, align=15, pos=15: 8.67 5.65 ( 34.78%) len=16, align=15, pos=15: 8.09 6.02 ( 25.65%) len=14, align=0, pos=15: 7.76 5.71 ( 26.48%) len=14, align=0, pos=15: 7.76 5.71 ( 26.44%) 
len=14, align=15, pos=15: 8.29 6.04 ( 27.15%) len=14, align=15, pos=15: 8.25 6.02 ( 27.07%) len=17, align=0, pos=16: 8.10 7.16 ( 11.55%) len=17, align=0, pos=16: 8.03 6.71 ( 16.45%) len=17, align=16, pos=16: 8.11 7.12 ( 12.19%) len=17, align=16, pos=16: 8.52 7.51 ( 11.88%) len=15, align=0, pos=16: 8.14 5.68 ( 30.26%) len=15, align=0, pos=16: 8.14 5.67 ( 30.40%) len=15, align=16, pos=16: 8.19 4.70 ( 42.58%) len=15, align=16, pos=16: 7.76 4.44 ( 42.86%) len=18, align=0, pos=17: 8.11 7.10 ( 12.41%) len=18, align=0, pos=17: 8.49 7.67 ( 9.69%) len=18, align=17, pos=17: 8.00 6.71 ( 16.18%) len=18, align=17, pos=17: 8.10 7.12 ( 12.04%) len=16, align=0, pos=17: 8.02 5.77 ( 28.09%) len=16, align=0, pos=17: 7.40 5.77 ( 21.93%) len=16, align=17, pos=17: 8.25 8.42 ( -2.06%) len=16, align=17, pos=17: 8.22 8.03 ( 2.29%) len=19, align=0, pos=18: 8.10 7.11 ( 12.13%) len=19, align=0, pos=18: 8.03 7.19 ( 10.47%) len=19, align=18, pos=18: 8.00 6.71 ( 16.15%) len=19, align=18, pos=18: 8.03 7.13 ( 11.16%) len=17, align=0, pos=18: 8.22 7.41 ( 9.91%) len=17, align=0, pos=18: 8.31 8.42 ( -1.31%) len=17, align=18, pos=18: 8.27 7.96 ( 3.69%) len=17, align=18, pos=18: 8.22 7.48 ( 9.01%) len=20, align=0, pos=19: 8.10 7.13 ( 11.97%) len=20, align=0, pos=19: 8.02 7.16 ( 10.73%) len=20, align=19, pos=19: 8.00 6.67 ( 16.66%) len=20, align=19, pos=19: 8.06 7.11 ( 11.74%) len=18, align=0, pos=19: 8.28 8.43 ( -1.88%) len=18, align=0, pos=19: 8.27 8.48 ( -2.51%) len=18, align=19, pos=19: 9.35 8.00 ( 14.47%) len=18, align=19, pos=19: 8.48 8.28 ( 2.36%) len=21, align=0, pos=20: 8.09 7.11 ( 12.06%) len=21, align=0, pos=20: 8.03 6.67 ( 17.02%) len=21, align=20, pos=20: 9.96 7.15 ( 28.24%) len=21, align=20, pos=20: 10.02 7.11 ( 29.04%) len=19, align=0, pos=20: 8.45 8.43 ( 0.27%) len=19, align=0, pos=20: 8.27 7.82 ( 5.42%) len=19, align=20, pos=20: 9.33 8.04 ( 13.89%) len=19, align=20, pos=20: 8.27 8.43 ( -1.95%) len=22, align=0, pos=21: 8.09 7.14 ( 11.66%) len=22, align=0, pos=21: 8.09 7.10 ( 12.20%) 
len=22, align=21, pos=21: 10.16 7.11 ( 30.07%) len=22, align=21, pos=21: 10.55 7.12 ( 32.53%) len=20, align=0, pos=21: 8.27 8.46 ( -2.29%) len=20, align=0, pos=21: 8.46 8.44 ( 0.18%) len=20, align=21, pos=21: 10.00 8.45 ( 15.45%) len=20, align=21, pos=21: 10.00 8.66 ( 13.41%) len=23, align=0, pos=22: 8.11 7.12 ( 12.20%) len=23, align=0, pos=22: 8.10 7.12 ( 12.02%) len=23, align=22, pos=22: 10.02 7.24 ( 27.76%) len=23, align=22, pos=22: 9.98 7.14 ( 28.46%) len=21, align=0, pos=22: 8.26 8.46 ( -2.49%) len=21, align=0, pos=22: 8.30 8.45 ( -1.83%) len=21, align=22, pos=22: 10.04 8.00 ( 20.32%) len=21, align=22, pos=22: 10.00 7.79 ( 22.13%) len=24, align=0, pos=23: 8.03 7.17 ( 10.71%) len=24, align=0, pos=23: 8.05 7.12 ( 11.56%) len=24, align=23, pos=23: 10.01 7.12 ( 28.80%) len=24, align=23, pos=23: 10.00 7.12 ( 28.81%) len=22, align=0, pos=23: 8.28 8.45 ( -2.05%) len=22, align=0, pos=23: 8.24 8.45 ( -2.60%) len=22, align=23, pos=23: 9.98 8.94 ( 10.46%) len=22, align=23, pos=23: 9.98 8.04 ( 19.47%) len=25, align=0, pos=24: 9.98 7.14 ( 28.43%) len=25, align=0, pos=24: 9.98 7.17 ( 28.14%) len=25, align=24, pos=24: 9.98 9.28 ( 7.01%) len=25, align=24, pos=24: 9.98 9.14 ( 8.36%) len=23, align=0, pos=24: 8.25 8.46 ( -2.62%) len=23, align=0, pos=24: 8.27 8.42 ( -1.90%) len=23, align=24, pos=24: 9.17 7.79 ( 15.08%) len=23, align=24, pos=24: 8.30 8.44 ( -1.66%) len=26, align=0, pos=25: 10.00 7.13 ( 28.68%) len=26, align=0, pos=25: 9.99 7.17 ( 28.22%) len=26, align=25, pos=25: 10.01 9.13 ( 8.81%) len=26, align=25, pos=25: 10.00 9.14 ( 8.58%) len=24, align=0, pos=25: 8.26 8.10 ( 1.98%) len=24, align=0, pos=25: 8.24 8.46 ( -2.63%) len=24, align=25, pos=25: 9.99 9.13 ( 8.59%) len=24, align=25, pos=25: 10.04 9.10 ( 9.34%) len=27, align=0, pos=26: 10.65 7.13 ( 33.08%) len=27, align=0, pos=26: 10.04 7.13 ( 29.01%) len=27, align=26, pos=26: 10.00 9.12 ( 8.76%) len=27, align=26, pos=26: 9.97 9.17 ( 8.04%) len=25, align=0, pos=26: 10.03 8.45 ( 15.70%) len=25, align=0, pos=26: 9.97 8.45 
( 15.29%) len=25, align=26, pos=26: 9.98 9.18 ( 8.02%) len=25, align=26, pos=26: 10.63 8.70 ( 18.13%) len=28, align=0, pos=27: 9.83 7.18 ( 27.00%) len=28, align=0, pos=27: 9.99 7.14 ( 28.49%) len=28, align=27, pos=27: 10.11 9.10 ( 9.97%) len=28, align=27, pos=27: 10.02 9.12 ( 9.01%) len=26, align=0, pos=27: 10.04 8.43 ( 15.99%) len=26, align=0, pos=27: 10.04 8.44 ( 15.98%) len=26, align=27, pos=27: 9.99 9.19 ( 8.05%) len=26, align=27, pos=27: 9.97 9.17 ( 8.01%) len=29, align=0, pos=28: 9.98 7.15 ( 28.32%) len=29, align=0, pos=28: 9.97 7.18 ( 27.93%) len=29, align=28, pos=28: 10.61 9.16 ( 13.66%) len=29, align=28, pos=28: 10.67 9.12 ( 14.52%) len=27, align=0, pos=28: 10.03 7.80 ( 22.23%) len=27, align=0, pos=28: 9.98 8.50 ( 14.81%) len=27, align=28, pos=28: 9.97 9.17 ( 8.03%) len=27, align=28, pos=28: 9.96 9.15 ( 8.10%) len=30, align=0, pos=29: 9.95 7.16 ( 28.11%) len=30, align=0, pos=29: 10.01 7.13 ( 28.79%) len=30, align=29, pos=29: 10.66 9.11 ( 14.58%) len=30, align=29, pos=29: 10.64 9.17 ( 13.74%) len=28, align=0, pos=29: 10.03 8.42 ( 16.02%) len=28, align=0, pos=29: 10.02 7.79 ( 22.31%) len=28, align=29, pos=29: 10.65 9.14 ( 14.20%) len=28, align=29, pos=29: 10.65 9.17 ( 13.88%) len=31, align=0, pos=30: 10.47 7.11 ( 32.07%) len=31, align=0, pos=30: 10.02 7.13 ( 28.80%) len=31, align=30, pos=30: 10.63 9.15 ( 13.90%) len=31, align=30, pos=30: 10.68 9.10 ( 14.76%) len=29, align=0, pos=30: 10.03 8.43 ( 15.96%) len=29, align=0, pos=30: 10.24 8.48 ( 17.19%) len=29, align=30, pos=30: 10.64 9.16 ( 13.84%) len=29, align=30, pos=30: 10.65 9.10 ( 14.57%) len=32, align=0, pos=31: 9.97 7.16 ( 28.24%) len=32, align=0, pos=31: 9.96 7.16 ( 28.07%) len=32, align=31, pos=31: 10.65 9.18 ( 13.76%) len=32, align=31, pos=31: 10.61 9.16 ( 13.63%) len=30, align=0, pos=31: 10.02 7.91 ( 21.08%) len=30, align=0, pos=31: 10.03 7.79 ( 22.29%) len=30, align=31, pos=31: 10.67 9.12 ( 14.50%) len=30, align=31, pos=31: 10.65 9.19 ( 13.70%) [-- Attachment #3: memchr.txt --] [-- Type: text/plain, 
Size: 53015 bytes --] Function: memchr Variant: __memchr_evex __memchr_evex512 ======================================================================================================================== len=2048, align=0, pos=32: 7.03 4.78 ( 32.03%) len=256, align=1, pos=64: 7.79 6.73 ( 13.61%) len=2048, align=0, pos=32: 7.09 4.75 ( 32.98%) len=256, align=1, pos=64: 7.77 6.48 ( 16.61%) len=256, align=4081, pos=64: 7.79 7.58 ( 2.74%) len=2048, align=0, pos=64: 7.80 6.46 ( 17.13%) len=256, align=2, pos=64: 7.45 6.12 ( 17.82%) len=2048, align=0, pos=64: 7.40 6.18 ( 16.48%) len=256, align=2, pos=64: 7.37 6.18 ( 16.10%) len=256, align=4081, pos=64: 7.39 7.17 ( 2.96%) len=2048, align=0, pos=128: 8.52 7.18 ( 15.73%) len=256, align=3, pos=64: 7.45 6.13 ( 17.68%) len=2048, align=0, pos=128: 8.71 7.14 ( 18.00%) len=256, align=3, pos=64: 7.37 6.18 ( 16.10%) len=256, align=4081, pos=64: 7.37 7.14 ( 3.19%) len=2048, align=0, pos=256: 15.14 13.16 ( 13.13%) len=256, align=4, pos=64: 7.37 6.15 ( 16.54%) len=2048, align=0, pos=256: 15.21 13.14 ( 13.61%) len=256, align=4, pos=64: 7.42 6.14 ( 17.31%) len=256, align=4081, pos=64: 7.42 7.11 ( 4.19%) len=2048, align=0, pos=512: 18.34 18.22 ( 0.66%) len=256, align=5, pos=64: 7.39 6.12 ( 17.24%) len=2048, align=0, pos=512: 17.79 18.90 ( -6.21%) len=256, align=5, pos=64: 7.37 6.17 ( 16.24%) len=256, align=4081, pos=64: 7.40 7.18 ( 2.94%) len=2048, align=0, pos=1024: 25.57 21.68 ( 15.22%) len=256, align=6, pos=64: 7.37 6.19 ( 15.97%) len=2048, align=0, pos=1024: 25.54 21.72 ( 14.94%) len=256, align=6, pos=64: 7.43 6.13 ( 17.50%) len=256, align=4081, pos=64: 7.44 7.11 ( 4.42%) len=2048, align=0, pos=2048: 38.03 29.22 ( 23.17%) len=256, align=7, pos=64: 7.46 6.14 ( 17.68%) len=2048, align=0, pos=2048: 38.17 29.19 ( 23.52%) len=256, align=7, pos=64: 7.37 6.17 ( 16.26%) len=256, align=4081, pos=64: 7.41 7.13 ( 3.75%) len=192, align=1, pos=32: 6.79 4.45 ( 34.41%) len=192, align=1, pos=32: 6.74 4.46 ( 33.84%) len=256, align=1, pos=32: 6.79 4.59 ( 
32.39%) len=256, align=1, pos=32: 6.72 4.44 ( 33.89%) len=512, align=1, pos=32: 6.79 4.46 ( 34.27%) len=512, align=1, pos=32: 6.72 4.45 ( 33.80%) len=256, align=4081, pos=32: 6.79 7.13 ( -4.96%) len=192, align=2, pos=64: 7.37 6.18 ( 16.17%) len=192, align=2, pos=64: 7.38 6.18 ( 16.30%) len=256, align=2, pos=64: 7.43 6.17 ( 16.89%) len=256, align=2, pos=64: 7.40 6.15 ( 16.89%) len=512, align=2, pos=64: 7.40 6.14 ( 17.03%) len=512, align=2, pos=64: 7.44 6.13 ( 17.68%) len=256, align=4081, pos=64: 7.43 7.13 ( 4.01%) len=192, align=3, pos=96: 8.05 6.18 ( 23.22%) len=192, align=3, pos=96: 8.05 6.17 ( 23.31%) len=256, align=3, pos=96: 7.91 6.13 ( 22.55%) len=256, align=3, pos=96: 8.10 6.12 ( 24.35%) len=512, align=3, pos=96: 8.06 6.14 ( 23.84%) len=512, align=3, pos=96: 8.03 6.18 ( 23.04%) len=256, align=4081, pos=96: 8.05 7.82 ( 2.85%) len=192, align=4, pos=128: 8.75 7.13 ( 18.50%) len=192, align=4, pos=128: 8.69 7.15 ( 17.66%) len=256, align=4, pos=128: 8.69 7.15 ( 17.81%) len=256, align=4, pos=128: 8.71 7.15 ( 17.88%) len=512, align=4, pos=128: 8.69 7.12 ( 18.06%) len=512, align=4, pos=128: 8.71 7.13 ( 18.11%) len=256, align=4081, pos=128: 8.68 7.83 ( 9.75%) len=192, align=5, pos=160: 9.90 7.20 ( 27.27%) len=192, align=5, pos=160: 10.00 7.20 ( 27.99%) len=256, align=5, pos=160: 10.03 7.13 ( 28.92%) len=256, align=5, pos=160: 10.00 7.16 ( 28.32%) len=512, align=5, pos=160: 12.62 7.17 ( 43.22%) len=512, align=5, pos=160: 12.58 7.20 ( 42.80%) len=256, align=4081, pos=160: 10.03 9.81 ( 2.23%) len=192, align=6, pos=192: 11.02 9.79 ( 11.18%) len=192, align=6, pos=192: 10.62 9.83 ( 7.53%) len=256, align=6, pos=192: 11.17 10.25 ( 8.21%) len=256, align=6, pos=192: 11.32 9.78 ( 13.64%) len=512, align=6, pos=192: 12.61 9.76 ( 22.57%) len=512, align=6, pos=192: 12.61 9.84 ( 21.96%) len=256, align=4081, pos=192: 11.29 9.82 ( 13.04%) len=192, align=7, pos=224: 10.81 9.79 ( 9.42%) len=192, align=7, pos=224: 10.65 9.84 ( 7.58%) len=256, align=7, pos=224: 12.59 9.84 ( 21.81%) len=256, 
align=7, pos=224: 12.49 9.77 ( 21.79%) len=512, align=7, pos=224: 12.62 9.79 ( 22.43%) len=512, align=7, pos=224: 12.60 9.82 ( 22.11%) len=256, align=4081, pos=224: 12.62 13.14 ( -4.06%) len=2, align=0, pos=1: 4.59 4.47 ( 2.55%) len=2, align=0, pos=1: 4.67 4.05 ( 13.18%) len=2, align=1, pos=1: 4.94 4.24 ( 14.28%) len=2, align=1, pos=1: 4.67 4.00 ( 14.28%) len=0, align=0, pos=1: 5.60 5.56 ( 0.80%) len=0, align=0, pos=1: 5.33 5.60 ( -4.99%) len=0, align=1, pos=1: 5.33 5.59 ( -4.89%) len=0, align=1, pos=1: 5.33 5.56 ( -4.17%) len=2, align=2048, pos=1: 4.67 4.00 ( 14.27%) len=2, align=2048, pos=1: 4.71 4.00 ( 15.05%) len=2, align=2049, pos=1: 4.67 4.04 ( 13.36%) len=2, align=2049, pos=1: 4.67 4.00 ( 14.28%) len=0, align=2048, pos=1: 5.56 5.56 ( -0.00%) len=0, align=2048, pos=1: 5.60 5.56 ( 0.73%) len=0, align=2049, pos=1: 5.33 5.60 ( -4.95%) len=0, align=2049, pos=1: 5.33 5.56 ( -4.17%) len=0, align=4081, pos=1: 5.56 5.56 ( -0.01%) len=0, align=4081, pos=1: 5.50 5.56 ( -1.03%) len=2, align=4081, pos=1: 6.04 5.56 ( 7.97%) len=2, align=4081, pos=1: 6.04 5.56 ( 8.04%) len=3, align=0, pos=2: 4.67 4.00 ( 14.29%) len=3, align=0, pos=2: 4.67 4.00 ( 14.26%) len=3, align=2, pos=2: 4.67 4.00 ( 14.28%) len=3, align=2, pos=2: 4.67 4.00 ( 14.26%) len=1, align=0, pos=2: 4.71 4.00 ( 15.10%) len=1, align=0, pos=2: 4.71 4.00 ( 15.05%) len=1, align=2, pos=2: 4.67 4.00 ( 14.28%) len=1, align=2, pos=2: 4.67 4.00 ( 14.28%) len=3, align=2048, pos=2: 4.67 4.00 ( 14.28%) len=3, align=2048, pos=2: 4.67 4.00 ( 14.28%) len=3, align=2050, pos=2: 4.67 4.00 ( 14.27%) len=3, align=2050, pos=2: 4.67 4.00 ( 14.28%) len=1, align=2048, pos=2: 4.67 4.00 ( 14.28%) len=1, align=2048, pos=2: 4.77 4.00 ( 16.13%) len=1, align=2050, pos=2: 4.67 4.04 ( 13.38%) len=1, align=2050, pos=2: 4.67 4.00 ( 14.28%) len=1, align=4081, pos=2: 6.00 5.56 ( 7.40%) len=1, align=4081, pos=2: 6.00 5.56 ( 7.40%) len=3, align=4081, pos=2: 6.00 5.60 ( 6.73%) len=3, align=4081, pos=2: 6.30 5.99 ( 4.98%) len=4, align=0, pos=3: 4.67 
4.00 ( 14.28%) len=4, align=0, pos=3: 4.67 4.00 ( 14.28%) len=4, align=3, pos=3: 4.67 4.00 ( 14.27%) len=4, align=3, pos=3: 4.67 4.00 ( 14.28%) len=2, align=0, pos=3: 4.71 4.00 ( 15.02%) len=2, align=0, pos=3: 4.67 4.04 ( 13.38%) len=2, align=3, pos=3: 4.67 4.00 ( 14.28%) len=2, align=3, pos=3: 4.67 4.00 ( 14.26%) len=4, align=2048, pos=3: 4.67 4.00 ( 14.28%) len=4, align=2048, pos=3: 4.67 4.00 ( 14.28%) len=4, align=2051, pos=3: 4.67 4.00 ( 14.28%) len=4, align=2051, pos=3: 4.67 4.00 ( 14.26%) len=2, align=2048, pos=3: 4.71 4.00 ( 15.04%) len=2, align=2048, pos=3: 4.71 4.00 ( 15.03%) len=2, align=2051, pos=3: 4.67 4.04 ( 13.35%) len=2, align=2051, pos=3: 4.67 4.00 ( 14.28%) len=2, align=4081, pos=3: 6.00 5.56 ( 7.40%) len=2, align=4081, pos=3: 6.00 5.56 ( 7.41%) len=4, align=4081, pos=3: 6.09 5.97 ( 2.06%) len=4, align=4081, pos=3: 6.00 5.56 ( 7.41%) len=5, align=0, pos=4: 4.67 4.04 ( 13.52%) len=5, align=0, pos=4: 4.91 4.50 ( 8.32%) len=5, align=4, pos=4: 4.83 4.47 ( 7.47%) len=5, align=4, pos=4: 4.82 4.44 ( 7.79%) len=3, align=0, pos=4: 4.71 4.00 ( 15.02%) len=3, align=0, pos=4: 4.71 4.00 ( 15.03%) len=3, align=4, pos=4: 4.67 4.04 ( 13.39%) len=3, align=4, pos=4: 4.67 4.05 ( 13.31%) len=5, align=2048, pos=4: 4.82 4.46 ( 7.51%) len=5, align=2048, pos=4: 4.82 4.46 ( 7.54%) len=5, align=2052, pos=4: 4.80 4.44 ( 7.49%) len=5, align=2052, pos=4: 4.67 4.00 ( 14.28%) len=3, align=2048, pos=4: 4.71 4.00 ( 15.04%) len=3, align=2048, pos=4: 4.71 4.00 ( 15.02%) len=3, align=2052, pos=4: 4.71 4.00 ( 15.03%) len=3, align=2052, pos=4: 4.71 4.00 ( 15.04%) len=3, align=4081, pos=4: 6.04 5.33 ( 11.71%) len=3, align=4081, pos=4: 6.04 5.56 ( 8.06%) len=5, align=4081, pos=4: 6.00 5.60 ( 6.72%) len=5, align=4081, pos=4: 6.12 5.96 ( 2.62%) len=6, align=0, pos=5: 4.67 4.00 ( 14.28%) len=6, align=0, pos=5: 4.67 4.00 ( 14.27%) len=6, align=5, pos=5: 4.67 4.00 ( 14.28%) len=6, align=5, pos=5: 4.67 4.00 ( 14.28%) len=4, align=0, pos=5: 4.67 4.00 ( 14.26%) len=4, align=0, pos=5: 4.67 4.00 
( 14.28%) len=4, align=5, pos=5: 4.71 4.00 ( 15.06%) len=4, align=5, pos=5: 4.71 4.00 ( 15.02%) len=6, align=2048, pos=5: 4.82 4.48 ( 7.07%) len=6, align=2048, pos=5: 4.67 4.00 ( 14.28%) len=6, align=2053, pos=5: 4.67 4.00 ( 14.26%) len=6, align=2053, pos=5: 4.67 4.00 ( 14.26%) len=4, align=2048, pos=5: 4.67 4.00 ( 14.26%) len=4, align=2048, pos=5: 4.67 4.00 ( 14.28%) len=4, align=2053, pos=5: 4.67 4.00 ( 14.28%) len=4, align=2053, pos=5: 4.67 4.00 ( 14.28%) len=4, align=4081, pos=5: 6.04 5.56 ( 8.01%) len=4, align=4081, pos=5: 6.00 5.33 ( 11.11%) len=6, align=4081, pos=5: 6.35 5.99 ( 5.76%) len=6, align=4081, pos=5: 6.00 5.56 ( 7.40%) len=7, align=0, pos=6: 4.67 4.00 ( 14.28%) len=7, align=0, pos=6: 4.67 4.00 ( 14.26%) len=7, align=6, pos=6: 4.67 4.00 ( 14.26%) len=7, align=6, pos=6: 4.67 4.00 ( 14.26%) len=5, align=0, pos=6: 4.67 4.00 ( 14.28%) len=5, align=0, pos=6: 4.67 4.00 ( 14.26%) len=5, align=6, pos=6: 4.67 4.00 ( 14.28%) len=5, align=6, pos=6: 4.71 4.00 ( 15.04%) len=7, align=2048, pos=6: 4.71 4.00 ( 15.03%) len=7, align=2048, pos=6: 4.71 4.00 ( 15.03%) len=7, align=2054, pos=6: 4.71 4.00 ( 15.02%) len=7, align=2054, pos=6: 4.70 4.00 ( 14.92%) len=5, align=2048, pos=6: 4.67 4.04 ( 13.36%) len=5, align=2048, pos=6: 4.67 4.04 ( 13.38%) len=5, align=2054, pos=6: 4.67 4.00 ( 14.28%) len=5, align=2054, pos=6: 4.67 4.00 ( 14.28%) len=5, align=4081, pos=6: 6.00 5.60 ( 6.73%) len=5, align=4081, pos=6: 6.00 5.60 ( 6.69%) len=7, align=4081, pos=6: 6.04 5.56 ( 8.07%) len=7, align=4081, pos=6: 6.00 5.56 ( 7.41%) len=8, align=0, pos=7: 4.67 4.04 ( 13.39%) len=8, align=0, pos=7: 4.95 4.49 ( 9.22%) len=8, align=7, pos=7: 4.94 4.47 ( 9.54%) len=8, align=7, pos=7: 4.98 4.47 ( 10.41%) len=6, align=0, pos=7: 4.67 4.04 ( 13.37%) len=6, align=0, pos=7: 4.67 4.04 ( 13.39%) len=6, align=7, pos=7: 4.67 4.04 ( 13.37%) len=6, align=7, pos=7: 4.67 4.00 ( 14.26%) len=8, align=2048, pos=7: 4.67 4.04 ( 13.39%) len=8, align=2048, pos=7: 4.67 4.68 ( -0.21%) len=8, align=2055, pos=7: 
4.82 4.47 ( 7.18%) len=8, align=2055, pos=7: 4.98 4.44 ( 10.76%) len=6, align=2048, pos=7: 4.67 4.04 ( 13.39%) len=6, align=2048, pos=7: 4.67 4.04 ( 13.37%) len=6, align=2055, pos=7: 4.67 4.04 ( 13.39%) len=6, align=2055, pos=7: 4.67 4.04 ( 13.33%) len=6, align=4081, pos=7: 6.00 5.60 ( 6.73%) len=6, align=4081, pos=7: 6.04 5.56 ( 8.04%) len=8, align=4081, pos=7: 6.00 5.33 ( 11.11%) len=8, align=4081, pos=7: 6.00 5.60 ( 6.73%) len=9, align=0, pos=8: 4.80 4.47 ( 6.84%) len=9, align=0, pos=8: 4.67 4.00 ( 14.28%) len=9, align=8, pos=8: 4.67 4.00 ( 14.27%) len=9, align=8, pos=8: 4.67 4.00 ( 14.28%) len=7, align=0, pos=8: 4.67 4.00 ( 14.28%) len=7, align=0, pos=8: 4.67 4.00 ( 14.26%) len=7, align=8, pos=8: 4.67 4.00 ( 14.28%) len=7, align=8, pos=8: 4.94 4.24 ( 14.28%) len=9, align=2048, pos=8: 5.03 4.70 ( 6.48%) len=9, align=2048, pos=8: 4.94 4.24 ( 14.28%) len=9, align=2056, pos=8: 4.94 4.24 ( 14.26%) len=9, align=2056, pos=8: 4.94 4.29 ( 13.20%) len=7, align=2048, pos=8: 4.94 4.28 ( 13.29%) len=7, align=2048, pos=8: 5.26 4.70 ( 10.54%) len=7, align=2056, pos=8: 4.94 4.24 ( 14.28%) len=7, align=2056, pos=8: 4.94 4.24 ( 14.28%) len=7, align=4081, pos=8: 6.00 5.33 ( 11.10%) len=7, align=4081, pos=8: 6.00 5.60 ( 6.67%) len=9, align=4081, pos=8: 5.84 5.96 ( -2.12%) len=9, align=4081, pos=8: 6.47 6.50 ( -0.52%) len=10, align=0, pos=9: 4.67 4.00 ( 14.28%) len=10, align=0, pos=9: 4.67 4.00 ( 14.28%) len=10, align=9, pos=9: 4.67 4.00 ( 14.26%) len=10, align=9, pos=9: 4.67 4.00 ( 14.26%) len=8, align=0, pos=9: 4.67 4.00 ( 14.27%) len=8, align=0, pos=9: 4.67 4.04 ( 13.38%) len=8, align=9, pos=9: 4.67 4.04 ( 13.38%) len=8, align=9, pos=9: 4.67 4.04 ( 13.37%) len=10, align=2048, pos=9: 4.82 4.45 ( 7.57%) len=10, align=2048, pos=9: 4.83 4.45 ( 7.74%) len=10, align=2057, pos=9: 4.77 4.44 ( 6.90%) len=10, align=2057, pos=9: 4.67 4.00 ( 14.28%) len=8, align=2048, pos=9: 4.67 4.00 ( 14.28%) len=8, align=2048, pos=9: 4.67 4.00 ( 14.28%) len=8, align=2057, pos=9: 4.67 4.00 ( 14.28%) 
len=8, align=2057, pos=9: 4.67 4.00 ( 14.28%) len=8, align=4081, pos=9: 6.00 5.56 ( 7.40%) len=8, align=4081, pos=9: 6.00 5.58 ( 7.00%) len=10, align=4081, pos=9: 6.00 5.56 ( 7.41%) len=10, align=4081, pos=9: 6.11 5.97 ( 2.36%) len=11, align=0, pos=10: 4.67 4.00 ( 14.28%) len=11, align=0, pos=10: 4.67 4.00 ( 14.26%) len=11, align=10, pos=10: 4.67 4.00 ( 14.28%) len=11, align=10, pos=10: 4.67 4.04 ( 13.38%) len=9, align=0, pos=10: 4.67 4.05 ( 13.29%) len=9, align=0, pos=10: 4.67 4.04 ( 13.39%) len=9, align=10, pos=10: 4.81 4.45 ( 7.50%) len=9, align=10, pos=10: 4.79 4.49 ( 6.28%) len=11, align=2048, pos=10: 4.67 4.00 ( 14.28%) len=11, align=2048, pos=10: 4.67 4.00 ( 14.28%) len=11, align=2058, pos=10: 4.67 4.00 ( 14.28%) len=11, align=2058, pos=10: 4.67 4.04 ( 13.39%) len=9, align=2048, pos=10: 4.67 4.04 ( 13.39%) len=9, align=2048, pos=10: 4.81 4.45 ( 7.52%) len=9, align=2058, pos=10: 4.71 4.00 ( 15.03%) len=9, align=2058, pos=10: 4.71 4.00 ( 15.03%) len=9, align=4081, pos=10: 6.04 5.56 ( 8.04%) len=9, align=4081, pos=10: 6.00 5.33 ( 11.11%) len=11, align=4081, pos=10: 6.11 5.97 ( 2.29%) len=11, align=4081, pos=10: 6.00 5.56 ( 7.41%) len=12, align=0, pos=11: 4.85 4.45 ( 8.27%) len=12, align=0, pos=11: 4.96 4.45 ( 10.25%) len=12, align=11, pos=11: 4.67 4.00 ( 14.28%) len=12, align=11, pos=11: 4.67 4.00 ( 14.28%) len=10, align=0, pos=11: 4.67 4.04 ( 13.40%) len=10, align=0, pos=11: 4.67 4.04 ( 13.40%) len=10, align=11, pos=11: 4.95 4.50 ( 9.16%) len=10, align=11, pos=11: 4.79 4.45 ( 6.99%) len=12, align=2048, pos=11: 4.67 4.00 ( 14.27%) len=12, align=2048, pos=11: 4.67 4.00 ( 14.28%) len=12, align=2059, pos=11: 4.67 4.04 ( 13.39%) len=12, align=2059, pos=11: 4.84 4.44 ( 8.31%) len=10, align=2048, pos=11: 4.94 4.45 ( 10.05%) len=10, align=2048, pos=11: 5.00 4.45 ( 11.04%) len=10, align=2059, pos=11: 4.67 4.00 ( 14.28%) len=10, align=2059, pos=11: 4.67 4.00 ( 14.26%) len=10, align=4081, pos=11: 6.00 5.60 ( 6.72%) len=10, align=4081, pos=11: 6.27 5.98 ( 4.56%) len=12, 
align=4081, pos=11: 6.00 5.41 ( 9.85%) len=12, align=4081, pos=11: 6.10 5.99 ( 1.77%) len=13, align=0, pos=12: 4.67 4.00 ( 14.28%) len=13, align=0, pos=12: 4.94 4.50 ( 8.91%) len=13, align=12, pos=12: 4.84 4.44 ( 8.23%) len=13, align=12, pos=12: 4.81 4.46 ( 7.19%) len=11, align=0, pos=12: 4.67 4.00 ( 14.26%) len=11, align=0, pos=12: 4.67 4.00 ( 14.28%) len=11, align=12, pos=12: 4.67 4.04 ( 13.38%) len=11, align=12, pos=12: 4.83 4.47 ( 7.53%) len=13, align=2048, pos=12: 4.94 4.46 ( 9.71%) len=13, align=2048, pos=12: 4.67 4.00 ( 14.28%) len=13, align=2060, pos=12: 4.67 4.00 ( 14.28%) len=13, align=2060, pos=12: 4.77 4.51 ( 5.41%) len=11, align=2048, pos=12: 4.82 4.47 ( 7.29%) len=11, align=2048, pos=12: 4.86 4.47 ( 7.97%) len=11, align=2060, pos=12: 4.67 4.00 ( 14.28%) len=11, align=2060, pos=12: 4.67 4.00 ( 14.28%) len=11, align=4081, pos=12: 6.00 5.41 ( 9.81%) len=11, align=4081, pos=12: 6.11 5.98 ( 2.13%) len=13, align=4081, pos=12: 6.00 5.60 ( 6.72%) len=13, align=4081, pos=12: 6.06 6.14 ( -1.28%) len=14, align=0, pos=13: 4.90 4.47 ( 8.85%) len=14, align=0, pos=13: 4.80 4.47 ( 6.92%) len=14, align=13, pos=13: 4.67 4.00 ( 14.28%) len=14, align=13, pos=13: 4.67 4.00 ( 14.26%) len=12, align=0, pos=13: 4.67 4.04 ( 13.38%) len=12, align=0, pos=13: 4.82 4.45 ( 7.65%) len=12, align=13, pos=13: 5.04 4.24 ( 15.98%) len=12, align=13, pos=13: 4.67 4.00 ( 14.26%) len=14, align=2048, pos=13: 4.77 4.48 ( 6.20%) len=14, align=2048, pos=13: 4.82 4.45 ( 7.63%) len=14, align=2061, pos=13: 4.67 4.00 ( 14.28%) len=14, align=2061, pos=13: 4.67 4.04 ( 13.39%) len=12, align=2048, pos=13: 4.67 4.04 ( 13.38%) len=12, align=2048, pos=13: 4.82 4.63 ( 3.81%) len=12, align=2061, pos=13: 4.80 4.45 ( 7.21%) len=12, align=2061, pos=13: 4.67 4.00 ( 14.28%) len=12, align=4081, pos=13: 6.00 5.60 ( 6.73%) len=12, align=4081, pos=13: 6.11 6.00 ( 1.81%) len=14, align=4081, pos=13: 6.06 5.97 ( 1.59%) len=14, align=4081, pos=13: 6.00 5.60 ( 6.63%) len=15, align=0, pos=14: 4.79 4.44 ( 7.25%) len=15, 
align=0, pos=14: 4.67 4.04 ( 13.37%) len=15, align=14, pos=14: 4.81 4.45 ( 7.50%) len=15, align=14, pos=14: 4.79 4.44 ( 7.28%) len=13, align=0, pos=14: 4.67 4.00 ( 14.28%) len=13, align=0, pos=14: 4.67 4.04 ( 13.40%) len=13, align=14, pos=14: 4.81 4.45 ( 7.55%) len=13, align=14, pos=14: 4.79 4.48 ( 6.44%) len=15, align=2048, pos=14: 4.67 4.00 ( 14.28%) len=15, align=2048, pos=14: 4.80 4.49 ( 6.39%) len=15, align=2062, pos=14: 4.67 4.46 ( 4.50%) len=15, align=2062, pos=14: 4.67 4.00 ( 14.28%) len=13, align=2048, pos=14: 4.67 4.00 ( 14.28%) len=13, align=2048, pos=14: 4.78 4.47 ( 6.52%) len=13, align=2062, pos=14: 4.81 4.46 ( 7.21%) len=13, align=2062, pos=14: 4.67 4.00 ( 14.28%) len=13, align=4081, pos=14: 6.00 5.60 ( 6.73%) len=13, align=4081, pos=14: 6.06 5.99 ( 1.26%) len=15, align=4081, pos=14: 6.11 5.99 ( 2.06%) len=15, align=4081, pos=14: 6.00 5.60 ( 6.73%) len=16, align=0, pos=15: 4.81 4.45 ( 7.36%) len=16, align=0, pos=15: 4.67 4.00 ( 14.27%) len=16, align=15, pos=15: 4.83 4.47 ( 7.50%) len=16, align=15, pos=15: 4.80 4.47 ( 6.94%) len=14, align=0, pos=15: 4.67 4.00 ( 14.28%) len=14, align=0, pos=15: 4.67 4.06 ( 12.99%) len=14, align=15, pos=15: 4.82 4.45 ( 7.54%) len=14, align=15, pos=15: 4.67 4.00 ( 14.28%) len=16, align=2048, pos=15: 4.67 4.04 ( 13.40%) len=16, align=2048, pos=15: 4.82 4.45 ( 7.69%) len=16, align=2063, pos=15: 4.80 4.44 ( 7.48%) len=16, align=2063, pos=15: 4.67 4.04 ( 13.38%) len=14, align=2048, pos=15: 4.83 4.46 ( 7.80%) len=14, align=2048, pos=15: 4.79 4.46 ( 6.99%) len=14, align=2063, pos=15: 4.67 4.00 ( 14.26%) len=14, align=2063, pos=15: 4.84 4.44 ( 8.15%) len=14, align=4081, pos=15: 6.09 5.97 ( 1.87%) len=14, align=4081, pos=15: 6.08 5.99 ( 1.58%) len=16, align=4081, pos=15: 7.67 6.71 ( 12.50%) len=16, align=4081, pos=15: 8.37 7.13 ( 14.83%) len=17, align=0, pos=16: 4.79 4.46 ( 6.91%) len=17, align=0, pos=16: 4.77 4.49 ( 5.77%) len=17, align=16, pos=16: 4.80 4.46 ( 7.21%) len=17, align=16, pos=16: 4.82 4.46 ( 7.43%) len=15, align=0, 
pos=16: 4.67 4.00 ( 14.28%) len=15, align=0, pos=16: 4.67 4.04 ( 13.40%) len=15, align=16, pos=16: 4.86 4.47 ( 8.15%) len=15, align=16, pos=16: 4.67 4.00 ( 14.26%) len=17, align=2048, pos=16: 4.82 4.43 ( 7.97%) len=17, align=2048, pos=16: 4.67 4.00 ( 14.28%) len=17, align=2064, pos=16: 4.84 4.45 ( 8.11%) len=17, align=2064, pos=16: 4.67 4.00 ( 14.28%) len=15, align=2048, pos=16: 5.41 4.46 ( 17.64%) len=15, align=2048, pos=16: 4.67 4.00 ( 14.28%) len=15, align=2064, pos=16: 4.67 4.04 ( 13.36%) len=15, align=2064, pos=16: 4.82 4.45 ( 7.59%) len=15, align=4081, pos=16: 6.00 6.71 (-11.79%) len=15, align=4081, pos=16: 6.06 7.09 (-17.01%) len=17, align=4081, pos=16: 7.73 7.12 ( 7.85%) len=17, align=4081, pos=16: 7.72 7.13 ( 7.61%) len=18, align=0, pos=17: 4.67 4.00 ( 14.28%) len=18, align=0, pos=17: 4.79 4.45 ( 7.09%) len=18, align=17, pos=17: 4.67 4.04 ( 13.39%) len=18, align=17, pos=17: 4.78 4.44 ( 6.97%) len=16, align=0, pos=17: 4.67 4.04 ( 13.36%) len=16, align=0, pos=17: 4.82 4.46 ( 7.43%) len=16, align=17, pos=17: 4.80 4.46 ( 7.18%) len=16, align=17, pos=17: 4.95 4.49 ( 9.36%) len=18, align=2048, pos=17: 4.78 4.45 ( 6.89%) len=18, align=2048, pos=17: 4.79 4.49 ( 6.33%) len=18, align=2065, pos=17: 4.82 4.46 ( 7.47%) len=18, align=2065, pos=17: 4.79 4.45 ( 7.09%) len=16, align=2048, pos=17: 4.67 4.00 ( 14.28%) len=16, align=2048, pos=17: 4.67 4.04 ( 13.51%) len=16, align=2065, pos=17: 4.82 4.46 ( 7.57%) len=16, align=2065, pos=17: 4.78 4.45 ( 6.97%) len=16, align=4081, pos=17: 8.37 7.33 ( 12.42%) len=16, align=4081, pos=17: 8.41 7.79 ( 7.38%) len=18, align=4081, pos=17: 7.67 6.71 ( 12.50%) len=18, align=4081, pos=17: 7.73 7.11 ( 7.99%) len=19, align=0, pos=18: 4.77 4.45 ( 6.78%) len=19, align=0, pos=18: 4.77 4.44 ( 6.87%) len=19, align=18, pos=18: 4.79 4.46 ( 6.98%) len=19, align=18, pos=18: 4.81 4.45 ( 7.45%) len=17, align=0, pos=18: 4.79 4.44 ( 7.38%) len=17, align=0, pos=18: 4.67 4.04 ( 13.40%) len=17, align=18, pos=18: 4.80 4.44 ( 7.50%) len=17, align=18, pos=18: 
4.67 4.04 ( 13.39%) len=19, align=2048, pos=18: 4.77 4.46 ( 6.59%) len=19, align=2048, pos=18: 4.84 4.44 ( 8.21%) len=19, align=2066, pos=18: 4.67 4.04 ( 13.40%) len=19, align=2066, pos=18: 4.67 4.47 ( 4.32%) len=17, align=2048, pos=18: 4.77 4.49 ( 5.80%) len=17, align=2048, pos=18: 4.80 4.46 ( 7.20%) len=17, align=2066, pos=18: 4.82 4.48 ( 6.99%) len=17, align=2066, pos=18: 4.67 4.00 ( 14.28%) len=17, align=4081, pos=18: 8.54 7.76 ( 9.13%) len=17, align=4081, pos=18: 8.33 7.39 ( 11.29%) len=19, align=4081, pos=18: 7.77 7.11 ( 8.52%) len=19, align=4081, pos=18: 7.39 7.17 ( 2.95%) len=20, align=0, pos=19: 4.67 4.05 ( 13.21%) len=20, align=0, pos=19: 4.80 4.45 ( 7.42%) len=20, align=19, pos=19: 4.84 4.44 ( 8.39%) len=20, align=19, pos=19: 4.67 4.04 ( 13.40%) len=18, align=0, pos=19: 4.81 4.48 ( 6.73%) len=18, align=0, pos=19: 4.77 4.47 ( 6.27%) len=18, align=19, pos=19: 4.77 4.46 ( 6.59%) len=18, align=19, pos=19: 4.81 4.44 ( 7.66%) len=20, align=2048, pos=19: 4.80 4.48 ( 6.72%) len=20, align=2048, pos=19: 4.79 4.47 ( 6.79%) len=20, align=2067, pos=19: 4.78 4.43 ( 7.27%) len=20, align=2067, pos=19: 4.79 4.49 ( 6.28%) len=18, align=2048, pos=19: 4.77 4.45 ( 6.72%) len=18, align=2048, pos=19: 4.81 4.44 ( 7.76%) len=18, align=2067, pos=19: 4.79 4.51 ( 6.02%) len=18, align=2067, pos=19: 4.67 4.00 ( 14.28%) len=18, align=4081, pos=19: 8.37 7.81 ( 6.63%) len=18, align=4081, pos=19: 8.69 7.33 ( 15.62%) len=20, align=4081, pos=19: 7.76 7.12 ( 8.20%) len=20, align=4081, pos=19: 7.33 6.72 ( 8.33%) len=21, align=0, pos=20: 4.84 4.45 ( 8.07%) len=21, align=0, pos=20: 4.95 4.51 ( 8.90%) len=21, align=20, pos=20: 4.79 4.44 ( 7.41%) len=21, align=20, pos=20: 4.78 4.45 ( 6.96%) len=19, align=0, pos=20: 4.85 4.45 ( 8.14%) len=19, align=0, pos=20: 4.67 4.00 ( 14.27%) len=19, align=20, pos=20: 4.81 4.44 ( 7.72%) len=19, align=20, pos=20: 4.84 4.78 ( 1.23%) len=21, align=2048, pos=20: 4.81 4.45 ( 7.58%) len=21, align=2048, pos=20: 4.78 4.47 ( 6.61%) len=21, align=2068, pos=20: 4.79 4.45 
( 7.09%) len=21, align=2068, pos=20: 4.85 4.43 ( 8.52%) len=19, align=2048, pos=20: 4.67 4.00 ( 14.28%) len=19, align=2048, pos=20: 4.79 4.45 ( 7.24%) len=19, align=2068, pos=20: 4.84 4.46 ( 7.90%) len=19, align=2068, pos=20: 4.67 4.00 ( 14.27%) len=19, align=4081, pos=20: 8.55 7.84 ( 8.37%) len=19, align=4081, pos=20: 8.63 7.33 ( 15.04%) len=21, align=4081, pos=20: 7.73 7.17 ( 7.24%) len=21, align=4081, pos=20: 8.17 7.15 ( 12.45%) len=22, align=0, pos=21: 4.80 4.44 ( 7.52%) len=22, align=0, pos=21: 4.85 4.64 ( 4.31%) len=22, align=21, pos=21: 4.82 4.46 ( 7.34%) len=22, align=21, pos=21: 4.67 4.04 ( 13.39%) len=20, align=0, pos=21: 4.79 4.46 ( 6.91%) len=20, align=0, pos=21: 4.85 4.47 ( 7.81%) len=20, align=21, pos=21: 4.67 4.04 ( 13.47%) len=20, align=21, pos=21: 4.77 4.48 ( 6.15%) len=22, align=2048, pos=21: 4.79 4.46 ( 6.80%) len=22, align=2048, pos=21: 4.86 4.45 ( 8.56%) len=22, align=2069, pos=21: 4.82 4.51 ( 6.36%) len=22, align=2069, pos=21: 4.79 4.46 ( 6.97%) len=20, align=2048, pos=21: 4.78 4.46 ( 6.78%) len=20, align=2048, pos=21: 4.89 4.47 ( 8.57%) len=20, align=2069, pos=21: 4.67 4.04 ( 13.34%) len=20, align=2069, pos=21: 4.80 4.45 ( 7.14%) len=20, align=4081, pos=21: 9.03 7.33 ( 18.80%) len=20, align=4081, pos=21: 8.65 7.33 ( 15.21%) len=22, align=4081, pos=21: 7.69 7.50 ( 2.51%) len=22, align=4081, pos=21: 7.74 7.14 ( 7.81%) len=23, align=0, pos=22: 4.78 4.43 ( 7.30%) len=23, align=0, pos=22: 4.82 4.45 ( 7.65%) len=23, align=22, pos=22: 4.82 4.45 ( 7.59%) len=23, align=22, pos=22: 4.82 5.17 ( -7.26%) len=21, align=0, pos=22: 4.68 4.04 ( 13.65%) len=21, align=0, pos=22: 4.80 4.44 ( 7.37%) len=21, align=22, pos=22: 4.81 4.45 ( 7.46%) len=21, align=22, pos=22: 4.82 4.46 ( 7.53%) len=23, align=2048, pos=22: 4.67 4.04 ( 13.39%) len=23, align=2048, pos=22: 4.80 4.67 ( 2.67%) len=23, align=2070, pos=22: 4.79 4.46 ( 6.89%) len=23, align=2070, pos=22: 4.81 4.46 ( 7.29%) len=21, align=2048, pos=22: 4.67 4.04 ( 13.39%) len=21, align=2048, pos=22: 4.80 4.44 ( 
7.41%) len=21, align=2070, pos=22: 4.80 4.46 ( 7.19%) len=21, align=2070, pos=22: 4.82 4.45 ( 7.69%) len=21, align=4081, pos=22: 8.50 7.79 ( 8.34%) len=21, align=4081, pos=22: 8.53 8.50 ( 0.36%) len=23, align=4081, pos=22: 7.73 7.10 ( 8.09%) len=23, align=4081, pos=22: 7.71 7.16 ( 7.09%) len=24, align=0, pos=23: 4.80 4.45 ( 7.22%) len=24, align=0, pos=23: 4.80 4.46 ( 6.96%) len=24, align=23, pos=23: 4.83 4.47 ( 7.38%) len=24, align=23, pos=23: 4.77 4.49 ( 5.85%) len=22, align=0, pos=23: 4.81 4.47 ( 7.05%) len=22, align=0, pos=23: 4.79 4.44 ( 7.36%) len=22, align=23, pos=23: 4.83 4.45 ( 7.79%) len=22, align=23, pos=23: 4.78 4.50 ( 5.96%) len=24, align=2048, pos=23: 4.77 4.48 ( 6.09%) len=24, align=2048, pos=23: 4.79 4.45 ( 7.06%) len=24, align=2071, pos=23: 4.79 4.45 ( 7.05%) len=24, align=2071, pos=23: 4.77 4.46 ( 6.48%) len=22, align=2048, pos=23: 4.90 4.48 ( 8.46%) len=22, align=2048, pos=23: 4.67 4.00 ( 14.26%) len=22, align=2071, pos=23: 4.81 4.46 ( 7.30%) len=22, align=2071, pos=23: 4.83 4.47 ( 7.39%) len=22, align=4081, pos=23: 8.37 7.33 ( 12.35%) len=22, align=4081, pos=23: 8.36 7.82 ( 6.39%) len=24, align=4081, pos=23: 7.75 7.13 ( 8.01%) len=24, align=4081, pos=23: 7.73 7.15 ( 7.40%) len=25, align=0, pos=24: 4.78 4.45 ( 6.94%) len=25, align=0, pos=24: 4.81 4.48 ( 6.80%) len=25, align=24, pos=24: 4.82 4.44 ( 7.89%) len=25, align=24, pos=24: 4.91 4.48 ( 8.74%) len=23, align=0, pos=24: 4.80 4.45 ( 7.39%) len=23, align=0, pos=24: 4.80 4.46 ( 7.05%) len=23, align=24, pos=24: 4.86 4.44 ( 8.57%) len=23, align=24, pos=24: 4.82 4.46 ( 7.49%) len=25, align=2048, pos=24: 4.85 4.47 ( 7.81%) len=25, align=2048, pos=24: 4.83 4.46 ( 7.55%) len=25, align=2072, pos=24: 4.84 4.46 ( 7.83%) len=25, align=2072, pos=24: 4.87 4.45 ( 8.60%) len=23, align=2048, pos=24: 4.83 4.45 ( 7.72%) len=23, align=2048, pos=24: 4.67 4.04 ( 13.40%) len=23, align=2072, pos=24: 4.79 4.44 ( 7.28%) len=23, align=2072, pos=24: 4.79 4.45 ( 7.10%) len=23, align=4081, pos=24: 8.40 7.78 ( 7.41%) len=23, 
align=4081, pos=24: 8.37 7.81 ( 6.61%) len=25, align=4081, pos=24: 7.72 7.17 ( 7.21%) len=25, align=4081, pos=24: 7.76 7.12 ( 8.35%) len=26, align=0, pos=25: 4.79 4.46 ( 6.95%) len=26, align=0, pos=25: 4.80 4.47 ( 6.94%) len=26, align=25, pos=25: 4.79 4.46 ( 6.84%) len=26, align=25, pos=25: 4.80 4.46 ( 7.02%) len=24, align=0, pos=25: 4.84 4.45 ( 8.02%) len=24, align=0, pos=25: 4.77 4.46 ( 6.61%) len=24, align=25, pos=25: 4.82 4.44 ( 7.83%) len=24, align=25, pos=25: 4.82 4.44 ( 8.01%) len=26, align=2048, pos=25: 4.83 4.66 ( 3.62%) len=26, align=2048, pos=25: 4.81 4.46 ( 7.37%) len=26, align=2073, pos=25: 4.85 4.45 ( 8.28%) len=26, align=2073, pos=25: 4.83 4.45 ( 7.79%) len=24, align=2048, pos=25: 4.67 4.04 ( 13.39%) len=24, align=2048, pos=25: 4.82 4.47 ( 7.27%) len=24, align=2073, pos=25: 4.77 4.46 ( 6.51%) len=24, align=2073, pos=25: 4.78 4.45 ( 6.91%) len=24, align=4081, pos=25: 8.42 7.78 ( 7.58%) len=24, align=4081, pos=25: 8.46 7.77 ( 8.17%) len=26, align=4081, pos=25: 7.76 7.11 ( 8.34%) len=26, align=4081, pos=25: 7.71 7.17 ( 7.01%) len=27, align=0, pos=26: 4.78 4.44 ( 7.15%) len=27, align=0, pos=26: 4.81 4.45 ( 7.44%) len=27, align=26, pos=26: 4.81 4.45 ( 7.52%) len=27, align=26, pos=26: 4.80 4.45 ( 7.28%) len=25, align=0, pos=26: 5.22 4.44 ( 14.86%) len=25, align=0, pos=26: 4.81 4.44 ( 7.64%) len=25, align=26, pos=26: 4.80 4.45 ( 7.16%) len=25, align=26, pos=26: 4.78 4.46 ( 6.84%) len=27, align=2048, pos=26: 4.80 4.46 ( 7.06%) len=27, align=2048, pos=26: 4.82 4.48 ( 7.00%) len=27, align=2074, pos=26: 4.80 4.45 ( 7.31%) len=27, align=2074, pos=26: 4.81 4.46 ( 7.29%) len=25, align=2048, pos=26: 4.78 4.44 ( 7.25%) len=25, align=2048, pos=26: 4.81 4.47 ( 7.16%) len=25, align=2074, pos=26: 4.81 4.46 ( 7.20%) len=25, align=2074, pos=26: 4.82 4.46 ( 7.51%) len=25, align=4081, pos=26: 8.53 7.78 ( 8.73%) len=25, align=4081, pos=26: 8.56 7.78 ( 9.15%) len=27, align=4081, pos=26: 7.90 7.11 ( 9.99%) len=27, align=4081, pos=26: 7.69 7.18 ( 6.68%) len=28, align=0, pos=27: 
4.79 4.46 ( 7.05%) len=28, align=0, pos=27: 4.78 4.44 ( 7.20%) len=28, align=27, pos=27: 4.81 4.46 ( 7.26%) len=28, align=27, pos=27: 4.78 4.50 ( 6.00%) len=26, align=0, pos=27: 4.82 4.45 ( 7.54%) len=26, align=0, pos=27: 4.79 4.45 ( 7.00%) len=26, align=27, pos=27: 4.90 4.69 ( 4.32%) len=26, align=27, pos=27: 4.81 4.51 ( 6.21%) len=28, align=2048, pos=27: 4.79 4.51 ( 5.89%) len=28, align=2048, pos=27: 4.85 4.47 ( 7.90%) len=28, align=2075, pos=27: 4.83 4.54 ( 6.04%) len=28, align=2075, pos=27: 4.80 4.63 ( 3.52%) len=26, align=2048, pos=27: 4.80 4.45 ( 7.39%) len=26, align=2048, pos=27: 4.80 4.46 ( 7.20%) len=26, align=2075, pos=27: 4.79 4.49 ( 6.27%) len=26, align=2075, pos=27: 4.78 4.45 ( 6.76%) len=26, align=4081, pos=27: 8.36 7.84 ( 6.19%) len=26, align=4081, pos=27: 8.54 7.77 ( 9.07%) len=28, align=4081, pos=27: 7.75 7.13 ( 8.07%) len=28, align=4081, pos=27: 7.72 7.17 ( 7.06%) len=29, align=0, pos=28: 4.78 4.45 ( 7.02%) len=29, align=0, pos=28: 4.79 4.45 ( 7.10%) len=29, align=28, pos=28: 4.80 4.45 ( 7.37%) len=29, align=28, pos=28: 4.86 4.44 ( 8.77%) len=27, align=0, pos=28: 4.78 4.44 ( 7.16%) len=27, align=0, pos=28: 4.78 4.45 ( 7.00%) len=27, align=28, pos=28: 4.81 4.45 ( 7.44%) len=27, align=28, pos=28: 4.81 4.45 ( 7.38%) len=29, align=2048, pos=28: 4.81 4.49 ( 6.72%) len=29, align=2048, pos=28: 4.81 4.45 ( 7.41%) len=29, align=2076, pos=28: 4.84 4.45 ( 8.02%) len=29, align=2076, pos=28: 4.81 4.48 ( 6.71%) len=27, align=2048, pos=28: 4.78 4.50 ( 6.00%) len=27, align=2048, pos=28: 4.78 4.48 ( 6.35%) len=27, align=2076, pos=28: 4.80 4.50 ( 6.40%) len=27, align=2076, pos=28: 4.80 4.44 ( 7.50%) len=27, align=4081, pos=28: 8.34 7.82 ( 6.23%) len=27, align=4081, pos=28: 8.38 7.81 ( 6.78%) len=29, align=4081, pos=28: 7.71 7.14 ( 7.39%) len=29, align=4081, pos=28: 7.72 7.15 ( 7.32%) len=30, align=0, pos=29: 4.81 4.45 ( 7.60%) len=30, align=0, pos=29: 4.82 4.45 ( 7.54%) len=30, align=29, pos=29: 4.84 4.46 ( 7.82%) len=30, align=29, pos=29: 4.80 4.49 ( 6.47%) 
len=28, align=0, pos=29: 4.80 4.47 ( 6.90%) len=28, align=0, pos=29: 4.78 4.46 ( 6.67%) len=28, align=29, pos=29: 4.80 4.52 ( 5.71%) len=28, align=29, pos=29: 4.83 4.62 ( 4.31%) len=30, align=2048, pos=29: 4.80 4.48 ( 6.81%) len=30, align=2048, pos=29: 4.82 4.45 ( 7.57%) len=30, align=2077, pos=29: 4.80 4.44 ( 7.50%) len=30, align=2077, pos=29: 4.78 4.46 ( 6.87%) len=28, align=2048, pos=29: 4.80 4.48 ( 6.64%) len=28, align=2048, pos=29: 4.80 4.48 ( 6.67%) len=28, align=2077, pos=29: 4.79 4.49 ( 6.33%) len=28, align=2077, pos=29: 4.84 4.47 ( 7.75%) len=28, align=4081, pos=29: 8.39 7.79 ( 7.20%) len=28, align=4081, pos=29: 8.40 7.80 ( 7.17%) len=30, align=4081, pos=29: 7.89 7.12 ( 9.76%) len=30, align=4081, pos=29: 7.74 7.11 ( 8.14%) len=31, align=0, pos=30: 4.81 4.46 ( 7.31%) len=31, align=0, pos=30: 5.07 4.73 ( 6.69%) len=31, align=30, pos=30: 5.05 4.72 ( 6.46%) len=31, align=30, pos=30: 4.82 4.45 ( 7.69%) len=29, align=0, pos=30: 4.80 4.51 ( 5.94%) len=29, align=0, pos=30: 4.77 4.49 ( 5.94%) len=29, align=30, pos=30: 4.83 4.44 ( 7.99%) len=29, align=30, pos=30: 4.77 4.46 ( 6.56%) len=31, align=2048, pos=30: 4.80 4.46 ( 7.05%) len=31, align=2048, pos=30: 4.79 4.44 ( 7.44%) len=31, align=2078, pos=30: 4.80 4.48 ( 6.58%) len=31, align=2078, pos=30: 4.80 4.49 ( 6.49%) len=29, align=2048, pos=30: 4.83 4.45 ( 7.93%) len=29, align=2048, pos=30: 4.84 4.46 ( 7.85%) len=29, align=2078, pos=30: 4.81 4.46 ( 7.29%) len=29, align=2078, pos=30: 4.77 4.44 ( 6.99%) len=29, align=4081, pos=30: 8.35 7.80 ( 6.59%) len=29, align=4081, pos=30: 8.36 7.92 ( 5.35%) len=31, align=4081, pos=30: 7.71 7.13 ( 7.56%) len=31, align=4081, pos=30: 7.70 7.14 ( 7.27%) len=32, align=0, pos=31: 4.77 4.47 ( 6.45%) len=32, align=0, pos=31: 4.79 4.46 ( 6.87%) len=32, align=31, pos=31: 5.06 4.74 ( 6.37%) len=32, align=31, pos=31: 5.06 4.69 ( 7.32%) len=30, align=0, pos=31: 5.07 4.69 ( 7.39%) len=30, align=0, pos=31: 5.07 4.73 ( 6.76%) len=30, align=31, pos=31: 5.10 4.68 ( 8.17%) len=30, align=31, pos=31: 
4.78 4.44 ( 7.19%) len=32, align=2048, pos=31: 4.81 4.47 ( 7.05%) len=32, align=2048, pos=31: 4.86 4.48 ( 7.94%) len=32, align=2079, pos=31: 4.82 4.46 ( 7.42%) len=32, align=2079, pos=31: 4.86 4.46 ( 8.17%) len=30, align=2048, pos=31: 4.77 4.47 ( 6.38%) len=30, align=2048, pos=31: 4.80 4.45 ( 7.30%) len=30, align=2079, pos=31: 4.80 4.49 ( 6.51%) len=30, align=2079, pos=31: 4.80 4.47 ( 6.90%) len=30, align=4081, pos=31: 8.46 7.77 ( 8.22%) len=30, align=4081, pos=31: 8.39 7.79 ( 7.21%) len=32, align=4081, pos=31: 7.71 7.14 ( 7.45%) len=32, align=4081, pos=31: 7.74 7.12 ( 8.01%) [-- Attachment #4: rawmemchr.txt --] [-- Type: text/plain, Size: 6674 bytes --] Function: rawmemchr Variant: __rawmemchr_evex __rawmemchr_evex512 ======================================================================================================================== length=32, alignment=0: 8.06 5.11 ( 36.55%) length=64, alignment=1: 8.61 6.84 ( 20.48%) length=32, alignment=0: 6.23 4.02 ( 35.56%) length=64, alignment=1: 7.19 5.78 ( 19.67%) length=64, alignment=0: 6.68 5.43 ( 18.75%) length=64, alignment=2: 6.51 5.45 ( 16.20%) length=64, alignment=0: 6.51 5.47 ( 15.97%) length=64, alignment=2: 6.51 5.46 ( 16.01%) length=128, alignment=0: 6.53 6.12 ( 6.29%) length=64, alignment=3: 6.50 5.45 ( 16.07%) length=128, alignment=0: 7.07 6.45 ( 8.75%) length=64, alignment=3: 6.60 5.46 ( 17.21%) length=256, alignment=0: 11.06 7.46 ( 32.56%) length=64, alignment=4: 6.50 5.48 ( 15.72%) length=256, alignment=0: 11.05 7.46 ( 32.46%) length=64, alignment=4: 6.50 5.46 ( 16.03%) length=512, alignment=0: 14.23 14.13 ( 0.67%) length=64, alignment=5: 6.50 5.46 ( 15.93%) length=512, alignment=0: 14.37 14.14 ( 1.64%) length=64, alignment=5: 6.50 5.47 ( 15.79%) length=1024, alignment=0: 21.36 18.36 ( 14.04%) length=64, alignment=6: 6.51 5.47 ( 15.99%) length=1024, alignment=0: 21.30 18.48 ( 13.23%) length=64, alignment=6: 6.62 5.46 ( 17.55%) length=1, alignment=0: 3.90 3.46 ( 11.43%) length=1, alignment=0: 3.90 3.46 ( 
11.38%) length=2, alignment=0: 3.90 3.45 ( 11.53%) length=2, alignment=0: 3.90 3.45 ( 11.51%) length=3, alignment=0: 3.90 3.46 ( 11.32%) length=3, alignment=0: 3.90 3.45 ( 11.46%) length=4, alignment=0: 4.01 3.34 ( 16.66%) length=4, alignment=0: 3.90 3.45 ( 11.48%) length=5, alignment=0: 3.90 3.46 ( 11.44%) length=5, alignment=0: 3.90 3.45 ( 11.51%) length=6, alignment=0: 3.90 3.44 ( 11.72%) length=6, alignment=0: 3.90 3.46 ( 11.34%) length=7, alignment=0: 3.90 3.46 ( 11.37%) length=7, alignment=0: 3.90 3.46 ( 11.45%) length=8, alignment=0: 3.90 3.45 ( 11.61%) length=8, alignment=0: 3.90 3.45 ( 11.70%) length=9, alignment=0: 3.89 3.46 ( 11.22%) length=9, alignment=0: 4.04 3.45 ( 14.83%) length=10, alignment=0: 3.90 3.45 ( 11.66%) length=10, alignment=0: 3.90 3.45 ( 11.60%) length=11, alignment=0: 3.89 3.46 ( 11.17%) length=11, alignment=0: 3.90 3.44 ( 11.75%) length=12, alignment=0: 3.89 3.46 ( 11.18%) length=12, alignment=0: 3.90 3.45 ( 11.55%) length=13, alignment=0: 3.89 3.45 ( 11.25%) length=13, alignment=0: 3.90 3.45 ( 11.59%) length=14, alignment=0: 3.89 3.46 ( 11.17%) length=14, alignment=0: 3.90 3.45 ( 11.69%) length=15, alignment=0: 3.90 3.46 ( 11.43%) length=15, alignment=0: 3.90 3.45 ( 11.59%) length=16, alignment=0: 3.89 3.45 ( 11.27%) length=16, alignment=0: 3.90 3.44 ( 11.74%) length=17, alignment=0: 3.90 3.45 ( 11.66%) length=17, alignment=0: 3.90 3.61 ( 7.42%) length=18, alignment=0: 3.90 3.45 ( 11.50%) length=18, alignment=0: 3.90 3.45 ( 11.51%) length=19, alignment=0: 3.90 3.46 ( 11.36%) length=19, alignment=0: 3.89 3.46 ( 11.11%) length=20, alignment=0: 3.89 3.45 ( 11.34%) length=20, alignment=0: 3.90 3.46 ( 11.30%) length=21, alignment=0: 3.90 3.46 ( 11.41%) length=21, alignment=0: 3.90 3.45 ( 11.52%) length=22, alignment=0: 3.90 3.46 ( 11.36%) length=22, alignment=0: 3.90 3.46 ( 11.43%) length=23, alignment=0: 3.90 3.46 ( 11.38%) length=23, alignment=0: 3.89 3.45 ( 11.27%) length=24, alignment=0: 3.90 3.44 ( 11.74%) length=24, alignment=0: 3.90 
3.46 ( 11.50%) length=25, alignment=0: 3.90 3.50 ( 10.25%) length=25, alignment=0: 3.90 3.45 ( 11.48%) length=26, alignment=0: 3.90 3.46 ( 11.39%) length=26, alignment=0: 3.90 3.46 ( 11.20%) length=27, alignment=0: 3.89 3.45 ( 11.31%) length=27, alignment=0: 3.90 3.45 ( 11.64%) length=28, alignment=0: 3.90 3.45 ( 11.64%) length=28, alignment=0: 3.90 3.46 ( 11.27%) length=29, alignment=0: 3.90 3.46 ( 11.36%) length=29, alignment=0: 3.90 3.47 ( 11.04%) length=30, alignment=0: 3.90 3.45 ( 11.47%) length=30, alignment=0: 3.89 3.70 ( 5.03%) length=31, alignment=0: 4.25 3.55 ( 16.53%) length=31, alignment=0: 4.13 3.66 ( 11.37%) ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-09-23 3:57 ` Sunil Pandey @ 2022-09-29 3:42 ` Sunil Pandey 2022-09-29 4:07 ` Noah Goldstein 0 siblings, 1 reply; 26+ messages in thread From: Sunil Pandey @ 2022-09-29 3:42 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz > > > On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Wed, Sep 21, 2022 at 5:27 PM Sunil K Pandey via Libc-alpha > > <libc-alpha@sourceware.org> wrote: > > > > > > This patch implements following evex512 version of string functions. > > > evex512 version takes up to 30% less cycle as compared to evex, > > > depending on length and alignment. > > > > Please attach benchmark numbers. > > > > > > - memchr function using 512 bit vectors. > > > - rawmemchr function using 512 bit vectors. > > > - wmemchr function using 512 bit vectors. > > > > > > Code size data: > > > > > > memchr-evex.o 762 byte > > > memchr-evex512.o 570 byte (-25%) > > > > > > rawmemchr-evex.o 461 byte > > > rawmemchr-evex512.o 413 byte (-10%) > > > > > > wmemchr-evex.o 794 byte > > > wmemchr-evex512.o 568 byte (-28%) > > > > > > Placeholder function, not used by any processor at the moment. 
> > > --- > > > sysdeps/x86_64/multiarch/Makefile | 3 + > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > > > sysdeps/x86_64/multiarch/memchr-evex-base.S | 306 +++++++++++++++++++ > > > sysdeps/x86_64/multiarch/memchr-evex512.S | 7 + > > > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > > > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 8 + > > > 6 files changed, 346 insertions(+) > > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > > > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > > index df4601c294..e974b1ad97 100644 > > > --- a/sysdeps/x86_64/multiarch/Makefile > > > +++ b/sysdeps/x86_64/multiarch/Makefile > > > @@ -4,6 +4,7 @@ sysdep_routines += \ > > > memchr-avx2 \ > > > memchr-avx2-rtm \ > > > memchr-evex \ > > > + memchr-evex512 \ > > > memchr-evex-rtm \ > > > memchr-sse2 \ > > > memcmp-avx2-movbe \ > > > @@ -36,6 +37,7 @@ sysdep_routines += \ > > > rawmemchr-avx2 \ > > > rawmemchr-avx2-rtm \ > > > rawmemchr-evex \ > > > + rawmemchr-evex512 \ > > > rawmemchr-evex-rtm \ > > > rawmemchr-sse2 \ > > > stpcpy-avx2 \ > > > @@ -156,6 +158,7 @@ sysdep_routines += \ > > > wmemchr-avx2 \ > > > wmemchr-avx2-rtm \ > > > wmemchr-evex \ > > > + wmemchr-evex512 \ > > > wmemchr-evex-rtm \ > > > wmemchr-sse2 \ > > > wmemcmp-avx2-movbe \ > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > index a71444eccb..17f770318d 100644 > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __memchr_evex) > > > + 
X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > + __memchr_evex512) > > > X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > @@ -329,6 +334,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __rawmemchr_evex) > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > + __rawmemchr_evex512) > > > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > @@ -903,6 +913,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __wmemchr_evex) > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > + __wmemchr_evex512) > > > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > > new file mode 100644 > > > index 0000000000..524f0809b5 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > > @@ -0,0 +1,306 @@ > > > +/* Placeholder function, not used by any processor at the moment. > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > + This file is part of the GNU C Library. 
> > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > + Lesser General Public License for more details. > > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library; if not, see > > > + <https://www.gnu.org/licenses/>. */ > > > + > > > +/* UNUSED. Exists purely as reference implementation. */ > > > + > > > +#include <isa-level.h> > > > + > > > +#if ISA_SHOULD_BUILD (4) > > > + > > > +# include <sysdep.h> > > > + > > > +# ifdef USE_AS_WMEMCHR > > > +# define CHAR_SIZE 4 > > > +# define VPBROADCAST vpbroadcastd > > > +# define VPCMP vpcmpd > > > +# else > > > +# define CHAR_SIZE 1 > > > +# define VPBROADCAST vpbroadcastb > > > +# define VPCMP vpcmpb > > > +# endif > > > + > > > +# define PAGE_SIZE 4096 > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > +# define XMM1 xmm17 > > > + > > > +# if VEC_SIZE == 64 > > > +# define KMOV kmovq > > > +# define KOR korq > > > +# define KORTEST kortestq > > > +# define RAX rax > > > +# define RCX rcx > > > +# define SHR shrq > > > +# define SARX sarxq > > > +# define TEXTSUFFIX evex512 > > > +# define VMM0 zmm16 > > > +# elif VEC_SIZE == 32 > > > +/* Currently Unused. 
*/ > > > +# define KMOV kmovd > > > +# define KOR kord > > > +# define KORTEST kortestd > > > +# define RAX eax > > > +# define RCX ecx > > > +# define SHR shrl > > > +# define SARX sarxl > > > +# define TEXTSUFFIX evex256 > > > +# define VMM0 ymm16 > > > +# endif > > > + > > > + .section .text.TEXTSUFFIX, "ax", @progbits > > > +/* Aligning entry point to 64 byte, provides better performance for > > > + one vector length string. */ > > > +ENTRY_P2ALIGN (MEMCHR, 6) > > > +# ifndef USE_AS_RAWMEMCHR > > > + /* Check for zero length. */ > > > + test %RDX_LP, %RDX_LP > > > + jz L(zero) > > > + > > > +# ifdef __ILP32__ > > > + /* Clear the upper 32 bits. */ > > > + movl %edx, %edx > > > +# endif > > > +# endif > > > + > > > + /* Broadcast CHAR to VMM0. */ > > > + VPBROADCAST %esi, %VMM0 > > > + movl %edi, %eax > > > + andl $(PAGE_SIZE - 1), %eax > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > + ja L(page_cross) > > > + > > > + /* Compare [w]char for null, mask bit will be set for match. */ > > > + VPCMP $0, (%rdi), %VMM0, %k0 > > > + > > > + KMOV %k0, %RAX > > > +# ifndef USE_AS_RAWMEMCHR > > > + bsf %RAX, %RCX > > > + jz L(align_more) > > > + xor %eax, %eax > > > +# ifdef USE_AS_WMEMCHR > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > +# else > > > + addq %rcx, %rdi > > > +# endif > > > + cmp %rcx, %rdx > > > + cmova %rdi, %rax > > > +# else > > > + bsf %RAX, %RAX > > > + jz L(align_more) > > > + add %rdi, %rax > > > +# endif > > > + ret > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > +L(zero): > > > + xorl %eax, %eax > > > + ret > > > +# endif > > > + > > > + .p2align 5,,5 > > > +L(page_cross): > > > + movq %rdi, %rcx > > > + andq $-VEC_SIZE, %rcx > > > + > > > + VPCMP $0, (%rcx), %VMM0, %k0 > > > + KMOV %k0, %RCX > > > + SARX %RAX, %RCX, %RAX > > > +# ifndef USE_AS_RAWMEMCHR > > > + bsf %RAX, %RCX > > > + jz L(align_more) > > > + xor %eax, %eax > > > +# ifdef USE_AS_WMEMCHR > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > +# else > > > + addq %rcx, %rdi > > > +# endif 
> > > + cmp %rcx, %rdx > > > + cmovae %rdi, %rax > > > + > > > +# else > > > + bsf %rax, %rax > > > + jz L(align_more) > > > + add %rdi, %rax > > > +# endif > > > + ret > > > + > > > +L(ret_vec_x2): > > > + subq $-VEC_SIZE, %rdi > > > +L(ret_vec_x1): > > > + bsf %RAX, %RAX > > > +# ifndef USE_AS_RAWMEMCHR > > > + jz L(zero) > > > + cmp %rax, %rdx > > > + jbe L(zero) > > > +# endif > > > +# ifdef USE_AS_WMEMCHR > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > +# else > > > + add %rdi, %rax > > > +# endif > > > + ret > > > + > > > + .p2align 5,,10 > > > +L(align_more): > > > +# ifndef USE_AS_RAWMEMCHR > > > + xor %eax, %eax > > > + subq %rdi, %rax > > > +# endif > > > + > > > + subq $-VEC_SIZE, %rdi > > > + /* Align rdi to VEC_SIZE. */ > > > + andq $-VEC_SIZE, %rdi > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + addq %rdi, %rax > > > +# ifdef USE_AS_WMEMCHR > > > + sarl $2, %eax > > > +# endif > > > + subq %rax, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + /* Loop unroll 4 times for 4 vector loop. 
*/ > > > + VPCMP $0, (%rdi), %VMM0, %k0 > > > + > > > + KMOV %k0, %RAX > > > + test %RAX, %RAX > > > + jnz L(ret_vec_x1) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + VPCMP $0, VEC_SIZE(%rdi), %VMM0, %k0 > > > + > > > + KMOV %k0, %RAX > > > + test %RAX, %RAX > > > + jnz L(ret_vec_x2) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %VMM0, %k0 > > > + > > > + KMOV %k0, %RAX > > > + test %RAX, %RAX > > > + jnz L(ret_vec_x3) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + VPCMP $0, (VEC_SIZE * 3)(%rdi), %VMM0, %k0 > > > + > > > + KMOV %k0, %RAX > > > + test %RAX, %RAX > > > + jnz L(ret_vec_x4) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > + /* Save pointer to find alignment adjustment. */ > > > + movq %rdi, %rax > > > +# endif > > > + /* Align address to VEC_SIZE * 4 for loop. */ > > > + andq $-(VEC_SIZE * 4), %rdi > > > + > > > + /* Add alignment difference to rdx. */ > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq %rdi, %rax > > > +# ifdef USE_AS_WMEMCHR > > > + SHR $2, %RAX > > > +# endif > > > + addq %rax, %rdx > > > + jmp L(loop_entry) > > > +# endif > > > + > > > + /* 4 vector loop. 
*/ > > > + .p2align 5,,11 > > > +L(loop): > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $(CHAR_PER_VEC * 4), %rdx > > > + jbe L(zero) > > > +L(loop_entry): > > > +# endif > > > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %VMM0, %k1 > > > + VPCMP $0, (VEC_SIZE * 5)(%rdi), %VMM0, %k2 > > > + VPCMP $0, (VEC_SIZE * 6)(%rdi), %VMM0, %k3 > > > + VPCMP $0, (VEC_SIZE * 7)(%rdi), %VMM0, %k4 > > > + KOR %k1, %k2, %k5 > > > + KOR %k3, %k4, %k6 > > > + > > > + subq $-(VEC_SIZE * 4), %rdi > > > + KORTEST %k5, %k6 > > > + jz L(loop) > > > + > > > + KMOV %k1, %RAX > > > + test %RAX, %RAX > > > + jnz L(ret_vec_x1) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + KMOV %k2, %RAX > > > + test %RAX, %RAX > > > + jnz L(ret_vec_x2) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + KMOV %k3, %RAX > > > + test %RAX, %RAX > > > + jnz L(ret_vec_x3) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + /* At this point null [w]char must be in the fourth vector so no > > > + need to check. 
*/ > > > + KMOV %k4, %RAX > > > + > > > +L(ret_vec_x4): > > > + bsf %RAX, %RAX > > > +# ifndef USE_AS_RAWMEMCHR > > > + cmp %rax, %rdx > > > + jbe L(zero) > > > +# endif > > > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > > + ret > > > + > > > + .p2align 5,,5 > > > +L(ret_vec_x3): > > > + bsf %RAX, %RAX > > > +# ifndef USE_AS_RAWMEMCHR > > > + cmp %rax, %rdx > > > + jbe L(zero) > > > +# endif > > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > > + ret > > > + > > > +END (MEMCHR) > > > +#endif > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > > > new file mode 100644 > > > index 0000000000..47349d817a > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > > > @@ -0,0 +1,7 @@ > > > +# ifndef MEMCHR > > > +# define MEMCHR __memchr_evex512 > > > +# endif > > > + > > > +#define VEC_SIZE 64 > > > + > > > +#include "memchr-evex-base.S" > > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > new file mode 100644 > > > index 0000000000..302d3cb055 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > @@ -0,0 +1,7 @@ > > > +#ifndef RAWMEMCHR > > > +# define RAWMEMCHR __rawmemchr_evex512 > > > +#endif > > > +#define USE_AS_RAWMEMCHR 1 > > > +#define MEMCHR RAWMEMCHR > > > + > > > +#include "memchr-evex512.S" > > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > new file mode 100644 > > > index 0000000000..f45ed1db75 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > @@ -0,0 +1,8 @@ > > > +#ifndef WMEMCHR > > > +# define WMEMCHR __wmemchr_evex512 > > > +#endif > > > + > > > +#define MEMCHR WMEMCHR > > > +#define USE_AS_WMEMCHR 1 > > > + > > > +#include "memchr-evex512.S" > > > -- > > > 2.36.1 > > > ping ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-09-29 3:42 ` Sunil Pandey @ 2022-09-29 4:07 ` Noah Goldstein 0 siblings, 0 replies; 26+ messages in thread From: Noah Goldstein @ 2022-09-29 4:07 UTC (permalink / raw) To: Sunil Pandey; +Cc: GNU C Library On Wed, Sep 28, 2022 at 8:43 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz > > > > > > On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > On Wed, Sep 21, 2022 at 5:27 PM Sunil K Pandey via Libc-alpha > > > <libc-alpha@sourceware.org> wrote: > > > > > > > > This patch implements following evex512 version of string functions. > > > > evex512 version takes up to 30% less cycle as compared to evex, > > > > depending on length and alignment. > > > > > > Please attach benchmark numbers. > > > > > > > > - memchr function using 512 bit vectors. > > > > - rawmemchr function using 512 bit vectors. > > > > - wmemchr function using 512 bit vectors. > > > > > > > > Code size data: > > > > > > > > memchr-evex.o 762 byte > > > > memchr-evex512.o 570 byte (-25%) > > > > > > > > rawmemchr-evex.o 461 byte > > > > rawmemchr-evex512.o 413 byte (-10%) > > > > > > > > wmemchr-evex.o 794 byte > > > > wmemchr-evex512.o 568 byte (-28%) > > > > > > > > Placeholder function, not used by any processor at the moment. 
> > > > --- > > > > sysdeps/x86_64/multiarch/Makefile | 3 + > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > > > > sysdeps/x86_64/multiarch/memchr-evex-base.S | 306 +++++++++++++++++++ > > > > sysdeps/x86_64/multiarch/memchr-evex512.S | 7 + > > > > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > > > > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 8 + > > > > 6 files changed, 346 insertions(+) > > > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > > > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > > > > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > > > index df4601c294..e974b1ad97 100644 > > > > --- a/sysdeps/x86_64/multiarch/Makefile > > > > +++ b/sysdeps/x86_64/multiarch/Makefile > > > > @@ -4,6 +4,7 @@ sysdep_routines += \ > > > > memchr-avx2 \ > > > > memchr-avx2-rtm \ > > > > memchr-evex \ > > > > + memchr-evex512 \ > > > > memchr-evex-rtm \ > > > > memchr-sse2 \ > > > > memcmp-avx2-movbe \ > > > > @@ -36,6 +37,7 @@ sysdep_routines += \ > > > > rawmemchr-avx2 \ > > > > rawmemchr-avx2-rtm \ > > > > rawmemchr-evex \ > > > > + rawmemchr-evex512 \ > > > > rawmemchr-evex-rtm \ > > > > rawmemchr-sse2 \ > > > > stpcpy-avx2 \ > > > > @@ -156,6 +158,7 @@ sysdep_routines += \ > > > > wmemchr-avx2 \ > > > > wmemchr-avx2-rtm \ > > > > wmemchr-evex \ > > > > + wmemchr-evex512 \ > > > > wmemchr-evex-rtm \ > > > > wmemchr-sse2 \ > > > > wmemcmp-avx2-movbe \ > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > index a71444eccb..17f770318d 100644 > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > && 
CPU_FEATURE_USABLE (AVX512BW) > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > __memchr_evex) > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > + __memchr_evex512) > > > > X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > @@ -329,6 +334,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > __rawmemchr_evex) > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > + __rawmemchr_evex512) > > > > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > @@ -903,6 +913,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > __wmemchr_evex) > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > + __wmemchr_evex512) > > > > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > > > new file mode 100644 > > > > index 0000000000..524f0809b5 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > > > @@ -0,0 +1,306 @@ > > > > +/* Placeholder function, not used by any processor at the moment. > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > + This file is part of the GNU C Library. 
> > > > + > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > + modify it under the terms of the GNU Lesser General Public > > > > + License as published by the Free Software Foundation; either > > > > + version 2.1 of the License, or (at your option) any later version. > > > > + > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > + Lesser General Public License for more details. > > > > + > > > > + You should have received a copy of the GNU Lesser General Public > > > > + License along with the GNU C Library; if not, see > > > > + <https://www.gnu.org/licenses/>. */ > > > > + > > > > +/* UNUSED. Exists purely as reference implementation. */ > > > > + > > > > +#include <isa-level.h> > > > > + > > > > +#if ISA_SHOULD_BUILD (4) > > > > + > > > > +# include <sysdep.h> > > > > + > > > > +# ifdef USE_AS_WMEMCHR > > > > +# define CHAR_SIZE 4 > > > > +# define VPBROADCAST vpbroadcastd > > > > +# define VPCMP vpcmpd > > > > +# else > > > > +# define CHAR_SIZE 1 > > > > +# define VPBROADCAST vpbroadcastb > > > > +# define VPCMP vpcmpb > > > > +# endif > > > > + > > > > +# define PAGE_SIZE 4096 > > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > +# define XMM1 xmm17 > > > > + > > > > +# if VEC_SIZE == 64 > > > > +# define KMOV kmovq > > > > +# define KOR korq > > > > +# define KORTEST kortestq > > > > +# define RAX rax > > > > +# define RCX rcx > > > > +# define SHR shrq > > > > +# define SARX sarxq > > > > +# define TEXTSUFFIX evex512 > > > > +# define VMM0 zmm16 > > > > +# elif VEC_SIZE == 32 > > > > +/* Currently Unused. 
*/ > > > > +# define KMOV kmovd > > > > +# define KOR kord > > > > +# define KORTEST kortestd > > > > +# define RAX eax > > > > +# define RCX ecx > > > > +# define SHR shrl > > > > +# define SARX sarxl > > > > +# define TEXTSUFFIX evex256 > > > > +# define VMM0 ymm16 > > > > +# endif > > > > + > > > > + .section .text.TEXTSUFFIX, "ax", @progbits > > > > +/* Aligning entry point to 64 byte, provides better performance for > > > > + one vector length string. */ > > > > +ENTRY_P2ALIGN (MEMCHR, 6) > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + /* Check for zero length. */ > > > > + test %RDX_LP, %RDX_LP > > > > + jz L(zero) > > > > + > > > > +# ifdef __ILP32__ > > > > + /* Clear the upper 32 bits. */ > > > > + movl %edx, %edx > > > > +# endif > > > > +# endif > > > > + > > > > + /* Broadcast CHAR to VMM0. */ > > > > + VPBROADCAST %esi, %VMM0 > > > > + movl %edi, %eax > > > > + andl $(PAGE_SIZE - 1), %eax > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > > + ja L(page_cross) > > > > + > > > > + /* Compare [w]char for null, mask bit will be set for match. 
*/ > > > > + VPCMP $0, (%rdi), %VMM0, %k0 > > > > + > > > > + KMOV %k0, %RAX > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + bsf %RAX, %RCX > > > > + jz L(align_more) > > > > + xor %eax, %eax > > > > +# ifdef USE_AS_WMEMCHR > > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > > +# else > > > > + addq %rcx, %rdi > > > > +# endif > > > > + cmp %rcx, %rdx > > > > + cmova %rdi, %rax > > > > +# else > > > > + bsf %RAX, %RAX > > > > + jz L(align_more) > > > > + add %rdi, %rax > > > > +# endif > > > > + ret > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > +L(zero): > > > > + xorl %eax, %eax > > > > + ret > > > > +# endif > > > > + > > > > + .p2align 5,,5 > > > > +L(page_cross): > > > > + movq %rdi, %rcx > > > > + andq $-VEC_SIZE, %rcx > > > > + > > > > + VPCMP $0, (%rcx), %VMM0, %k0 > > > > + KMOV %k0, %RCX > > > > + SARX %RAX, %RCX, %RAX > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + bsf %RAX, %RCX > > > > + jz L(align_more) > > > > + xor %eax, %eax > > > > +# ifdef USE_AS_WMEMCHR > > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > > +# else > > > > + addq %rcx, %rdi > > > > +# endif > > > > + cmp %rcx, %rdx > > > > + cmovae %rdi, %rax > > > > + > > > > +# else > > > > + bsf %rax, %rax > > > > + jz L(align_more) > > > > + add %rdi, %rax > > > > +# endif > > > > + ret > > > > + > > > > +L(ret_vec_x2): > > > > + subq $-VEC_SIZE, %rdi > > > > +L(ret_vec_x1): > > > > + bsf %RAX, %RAX > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + jz L(zero) > > > > + cmp %rax, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > +# ifdef USE_AS_WMEMCHR > > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > > +# else > > > > + add %rdi, %rax > > > > +# endif > > > > + ret > > > > + > > > > + .p2align 5,,10 > > > > +L(align_more): > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + xor %eax, %eax > > > > + subq %rdi, %rax > > > > +# endif > > > > + > > > > + subq $-VEC_SIZE, %rdi > > > > + /* Align rdi to VEC_SIZE. 
*/ > > > > + andq $-VEC_SIZE, %rdi > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + addq %rdi, %rax > > > > +# ifdef USE_AS_WMEMCHR > > > > + sarl $2, %eax > > > > +# endif > > > > + subq %rax, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + > > > > + /* Loop unroll 4 times for 4 vector loop. */ > > > > + VPCMP $0, (%rdi), %VMM0, %k0 > > > > + > > > > + KMOV %k0, %RAX > > > > + test %RAX, %RAX > > > > + jnz L(ret_vec_x1) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + > > > > + VPCMP $0, VEC_SIZE(%rdi), %VMM0, %k0 > > > > + > > > > + KMOV %k0, %RAX > > > > + test %RAX, %RAX > > > > + jnz L(ret_vec_x2) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + > > > > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %VMM0, %k0 > > > > + > > > > + KMOV %k0, %RAX > > > > + test %RAX, %RAX > > > > + jnz L(ret_vec_x3) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + > > > > + VPCMP $0, (VEC_SIZE * 3)(%rdi), %VMM0, %k0 > > > > + > > > > + KMOV %k0, %RAX > > > > + test %RAX, %RAX > > > > + jnz L(ret_vec_x4) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > + /* Save pointer to find alignment adjustment. */ > > > > + movq %rdi, %rax > > > > +# endif > > > > + /* Align address to VEC_SIZE * 4 for loop. */ > > > > + andq $-(VEC_SIZE * 4), %rdi > > > > + > > > > + /* Add alignment difference to rdx. */ > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq %rdi, %rax > > > > +# ifdef USE_AS_WMEMCHR > > > > + SHR $2, %RAX > > > > +# endif > > > > + addq %rax, %rdx > > > > + jmp L(loop_entry) > > > > +# endif > > > > + > > > > + /* 4 vector loop. 
*/ > > > > + .p2align 5,,11 > > > > +L(loop): > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $(CHAR_PER_VEC * 4), %rdx > > > > + jbe L(zero) > > > > +L(loop_entry): > > > > +# endif > > > > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %VMM0, %k1 > > > > + VPCMP $0, (VEC_SIZE * 5)(%rdi), %VMM0, %k2 > > > > + VPCMP $0, (VEC_SIZE * 6)(%rdi), %VMM0, %k3 > > > > + VPCMP $0, (VEC_SIZE * 7)(%rdi), %VMM0, %k4 > > > > + KOR %k1, %k2, %k5 > > > > + KOR %k3, %k4, %k6 > > > > + > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > + KORTEST %k5, %k6 > > > > + jz L(loop) > > > > + > > > > + KMOV %k1, %RAX > > > > + test %RAX, %RAX > > > > + jnz L(ret_vec_x1) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + > > > > + KMOV %k2, %RAX > > > > + test %RAX, %RAX > > > > + jnz L(ret_vec_x2) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + > > > > + KMOV %k3, %RAX > > > > + test %RAX, %RAX > > > > + jnz L(ret_vec_x3) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + > > > > + /* At this point null [w]char must be in the fourth vector so no > > > > + need to check. 
*/ > > > > + KMOV %k4, %RAX > > > > + > > > > +L(ret_vec_x4): > > > > + bsf %RAX, %RAX > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + cmp %rax, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > > > + ret > > > > + > > > > + .p2align 5,,5 > > > > +L(ret_vec_x3): > > > > + bsf %RAX, %RAX > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + cmp %rax, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > > > + ret > > > > + > > > > +END (MEMCHR) > > > > +#endif > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > > > > new file mode 100644 > > > > index 0000000000..47349d817a > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > > > > @@ -0,0 +1,7 @@ > > > > +# ifndef MEMCHR > > > > +# define MEMCHR __memchr_evex512 > > > > +# endif > > > > + > > > > +#define VEC_SIZE 64 > > > > + > > > > +#include "memchr-evex-base.S" > > > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > > new file mode 100644 > > > > index 0000000000..302d3cb055 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > > @@ -0,0 +1,7 @@ > > > > +#ifndef RAWMEMCHR > > > > +# define RAWMEMCHR __rawmemchr_evex512 > > > > +#endif > > > > +#define USE_AS_RAWMEMCHR 1 > > > > +#define MEMCHR RAWMEMCHR > > > > + > > > > +#include "memchr-evex512.S" > > > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > new file mode 100644 > > > > index 0000000000..f45ed1db75 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > @@ -0,0 +1,8 @@ > > > > +#ifndef WMEMCHR > > > > +# define WMEMCHR __wmemchr_evex512 > > > > +#endif > > > > + > > > > +#define MEMCHR WMEMCHR > > > > +#define USE_AS_WMEMCHR 1 > > > > + > > > > +#include "memchr-evex512.S" > > > > -- > 
> > > 2.36.1 > > > > > > ping see my reply to strrchr. ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-09-22 0:27 [PATCH] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr Sunil K Pandey 2022-09-22 0:50 ` Noah Goldstein @ 2022-10-03 18:33 ` Noah Goldstein 2022-10-03 19:00 ` H.J. Lu 1 sibling, 1 reply; 26+ messages in thread From: Noah Goldstein @ 2022-10-03 18:33 UTC (permalink / raw) To: Sunil K Pandey; +Cc: libc-alpha On Wed, Sep 21, 2022 at 5:27 PM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > This patch implements following evex512 version of string functions. > evex512 version takes up to 30% less cycle as compared to evex, > depending on length and alignment. > > - memchr function using 512 bit vectors. > - rawmemchr function using 512 bit vectors. > - wmemchr function using 512 bit vectors. > > Code size data: > > memchr-evex.o 762 byte > memchr-evex512.o 570 byte (-25%) > > rawmemchr-evex.o 461 byte > rawmemchr-evex512.o 413 byte (-10%) > > wmemchr-evex.o 794 byte > wmemchr-evex512.o 568 byte (-28%) > > Placeholder function, not used by any processor at the moment. 
> --- > sysdeps/x86_64/multiarch/Makefile | 3 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > sysdeps/x86_64/multiarch/memchr-evex-base.S | 306 +++++++++++++++++++ > sysdeps/x86_64/multiarch/memchr-evex512.S | 7 + > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 8 + > 6 files changed, 346 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index df4601c294..e974b1ad97 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -4,6 +4,7 @@ sysdep_routines += \ > memchr-avx2 \ > memchr-avx2-rtm \ > memchr-evex \ > + memchr-evex512 \ > memchr-evex-rtm \ > memchr-sse2 \ > memcmp-avx2-movbe \ > @@ -36,6 +37,7 @@ sysdep_routines += \ > rawmemchr-avx2 \ > rawmemchr-avx2-rtm \ > rawmemchr-evex \ > + rawmemchr-evex512 \ > rawmemchr-evex-rtm \ > rawmemchr-sse2 \ > stpcpy-avx2 \ > @@ -156,6 +158,7 @@ sysdep_routines += \ > wmemchr-avx2 \ > wmemchr-avx2-rtm \ > wmemchr-evex \ > + wmemchr-evex512 \ > wmemchr-evex-rtm \ > wmemchr-sse2 \ > wmemcmp-avx2-movbe \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index a71444eccb..17f770318d 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __memchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __memchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, 
memchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > @@ -329,6 +334,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __rawmemchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __rawmemchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > @@ -903,6 +913,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __wmemchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __wmemchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > new file mode 100644 > index 0000000000..524f0809b5 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > @@ -0,0 +1,306 @@ > +/* Placeholder function, not used by any processor at the moment. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. 
> + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* UNUSED. Exists purely as reference implementation. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (4) > + > +# include <sysdep.h> > + > +# ifdef USE_AS_WMEMCHR > +# define CHAR_SIZE 4 > +# define VPBROADCAST vpbroadcastd > +# define VPCMP vpcmpd > +# else > +# define CHAR_SIZE 1 > +# define VPBROADCAST vpbroadcastb > +# define VPCMP vpcmpb > +# endif > + > +# define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > +# define XMM1 xmm17 > + > +# if VEC_SIZE == 64 > +# define KMOV kmovq > +# define KOR korq > +# define KORTEST kortestq > +# define RAX rax > +# define RCX rcx > +# define SHR shrq > +# define SARX sarxq > +# define TEXTSUFFIX evex512 > +# define VMM0 zmm16 > +# elif VEC_SIZE == 32 > +/* Currently Unused. */ > +# define KMOV kmovd > +# define KOR kord > +# define KORTEST kortestd > +# define RAX eax > +# define RCX ecx > +# define SHR shrl > +# define SARX sarxl > +# define TEXTSUFFIX evex256 > +# define VMM0 ymm16 > +# endif > + > + .section .text.TEXTSUFFIX, "ax", @progbits > +/* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > +ENTRY_P2ALIGN (MEMCHR, 6) > +# ifndef USE_AS_RAWMEMCHR > + /* Check for zero length. */ > + test %RDX_LP, %RDX_LP > + jz L(zero) > + > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %edx, %edx > +# endif > +# endif > + > + /* Broadcast CHAR to VMM0. */ > + VPBROADCAST %esi, %VMM0 > + movl %edi, %eax > + andl $(PAGE_SIZE - 1), %eax > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(page_cross) > + > + /* Compare [w]char for null, mask bit will be set for match. 
*/ > > + VPCMP $0, (%rdi), %VMM0, %k0 > > + > > + KMOV %k0, %RAX > > +# ifndef USE_AS_RAWMEMCHR > > + bsf %RAX, %RCX > > + jz L(align_more) > > + xor %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > +# else > > + addq %rcx, %rdi > > +# endif > > + cmp %rcx, %rdx > > + cmova %rdi, %rax > > +# else > > + bsf %RAX, %RAX > > + jz L(align_more) > > + add %rdi, %rax > > +# endif > > + ret > > + > > +# ifndef USE_AS_RAWMEMCHR > > +L(zero): > > + xorl %eax, %eax > > + ret > > +# endif > > + > > + .p2align 5,,5 > > +L(page_cross): > > + movq %rdi, %rcx > > + andq $-VEC_SIZE, %rcx > > + > > + VPCMP $0, (%rcx), %VMM0, %k0 > > + KMOV %k0, %RCX > > + SARX %RAX, %RCX, %RAX > > +# ifndef USE_AS_RAWMEMCHR > > + bsf %RAX, %RCX > > + jz L(align_more) > > + xor %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > +# else > > + addq %rcx, %rdi > > +# endif > > + cmp %rcx, %rdx > > + cmovae %rdi, %rax Irrespective of other concerns this is buggy. It needs to be cmova. > > + > > +# else > > + bsf %rax, %rax > > + jz L(align_more) > > + add %rdi, %rax > > +# endif > > + ret > > + > > +L(ret_vec_x2): > > + subq $-VEC_SIZE, %rdi > > +L(ret_vec_x1): > > + bsf %RAX, %RAX > > +# ifndef USE_AS_RAWMEMCHR > > + jz L(zero) > > + cmp %rax, %rdx > > + jbe L(zero) > > +# endif > > +# ifdef USE_AS_WMEMCHR > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > +# else > > + add %rdi, %rax > > +# endif > > + ret > > + > > + .p2align 5,,10 > > +L(align_more): > > +# ifndef USE_AS_RAWMEMCHR > > + xor %eax, %eax > > + subq %rdi, %rax > > +# endif > > + > > + subq $-VEC_SIZE, %rdi > > + /* Align rdi to VEC_SIZE. */ > > + andq $-VEC_SIZE, %rdi > > + > > +# ifndef USE_AS_RAWMEMCHR > > + addq %rdi, %rax > > +# ifdef USE_AS_WMEMCHR > > + sarl $2, %eax > > +# endif > > + subq %rax, %rdx > > + jbe L(zero) > > +# endif > > + > > + /* Loop unroll 4 times for 4 vector loop. 
*/ > + VPCMP $0, (%rdi), %VMM0, %k0 > + > + KMOV %k0, %RAX > + test %RAX, %RAX > + jnz L(ret_vec_x1) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMP $0, VEC_SIZE(%rdi), %VMM0, %k0 > + > + KMOV %k0, %RAX > + test %RAX, %RAX > + jnz L(ret_vec_x2) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %VMM0, %k0 > + > + KMOV %k0, %RAX > + test %RAX, %RAX > + jnz L(ret_vec_x3) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMP $0, (VEC_SIZE * 3)(%rdi), %VMM0, %k0 > + > + KMOV %k0, %RAX > + test %RAX, %RAX > + jnz L(ret_vec_x4) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > + /* Save pointer to find alignment adjustment. */ > + movq %rdi, %rax > +# endif > + /* Align address to VEC_SIZE * 4 for loop. */ > + andq $-(VEC_SIZE * 4), %rdi > + > + /* Add alignment difference to rdx. */ > +# ifndef USE_AS_RAWMEMCHR > + subq %rdi, %rax > +# ifdef USE_AS_WMEMCHR > + SHR $2, %RAX > +# endif > + addq %rax, %rdx > + jmp L(loop_entry) > +# endif > + > + /* 4 vector loop. 
*/ > + .p2align 5,,11 > +L(loop): > +# ifndef USE_AS_RAWMEMCHR > + subq $(CHAR_PER_VEC * 4), %rdx > + jbe L(zero) > +L(loop_entry): > +# endif > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %VMM0, %k1 > + VPCMP $0, (VEC_SIZE * 5)(%rdi), %VMM0, %k2 > + VPCMP $0, (VEC_SIZE * 6)(%rdi), %VMM0, %k3 > + VPCMP $0, (VEC_SIZE * 7)(%rdi), %VMM0, %k4 > + KOR %k1, %k2, %k5 > + KOR %k3, %k4, %k6 > + > + subq $-(VEC_SIZE * 4), %rdi > + KORTEST %k5, %k6 > + jz L(loop) > + > + KMOV %k1, %RAX > + test %RAX, %RAX > + jnz L(ret_vec_x1) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + KMOV %k2, %RAX > + test %RAX, %RAX > + jnz L(ret_vec_x2) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + KMOV %k3, %RAX > + test %RAX, %RAX > + jnz L(ret_vec_x3) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + /* At this point null [w]char must be in the fourth vector so no > + need to check. */ > + KMOV %k4, %RAX > + > +L(ret_vec_x4): > + bsf %RAX, %RAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > + .p2align 5,,5 > +L(ret_vec_x3): > + bsf %RAX, %RAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > +END (MEMCHR) > +#endif > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > new file mode 100644 > index 0000000000..47349d817a > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > @@ -0,0 +1,7 @@ > +# ifndef MEMCHR > +# define MEMCHR __memchr_evex512 > +# endif > + > +#define VEC_SIZE 64 > + > +#include "memchr-evex-base.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > new file mode 100644 > index 0000000000..302d3cb055 > --- /dev/null > +++ 
b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > @@ -0,0 +1,7 @@ > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_evex512 > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > + > +#include "memchr-evex512.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > new file mode 100644 > index 0000000000..f45ed1db75 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > @@ -0,0 +1,8 @@ > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_evex512 > +#endif > + > +#define MEMCHR WMEMCHR > +#define USE_AS_WMEMCHR 1 > + > +#include "memchr-evex512.S" > -- > 2.36.1 > ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-03 18:33 ` Noah Goldstein @ 2022-10-03 19:00 ` H.J. Lu 2022-10-03 19:12 ` Noah Goldstein 0 siblings, 1 reply; 26+ messages in thread From: H.J. Lu @ 2022-10-03 19:00 UTC (permalink / raw) To: Noah Goldstein; +Cc: Sunil K Pandey, libc-alpha On Mon, Oct 3, 2022 at 11:33 AM Noah Goldstein via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Wed, Sep 21, 2022 at 5:27 PM Sunil K Pandey via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > This patch implements following evex512 version of string functions. > > evex512 version takes up to 30% less cycle as compared to evex, > > depending on length and alignment. > > > > - memchr function using 512 bit vectors. > > - rawmemchr function using 512 bit vectors. > > - wmemchr function using 512 bit vectors. > > > > Code size data: > > > > memchr-evex.o 762 byte > > memchr-evex512.o 570 byte (-25%) > > > > rawmemchr-evex.o 461 byte > > rawmemchr-evex512.o 413 byte (-10%) > > > > wmemchr-evex.o 794 byte > > wmemchr-evex512.o 568 byte (-28%) > > > > Placeholder function, not used by any processor at the moment. 
> > --- > > sysdeps/x86_64/multiarch/Makefile | 3 + > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > > sysdeps/x86_64/multiarch/memchr-evex-base.S | 306 +++++++++++++++++++ > > sysdeps/x86_64/multiarch/memchr-evex512.S | 7 + > > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 8 + > > 6 files changed, 346 insertions(+) > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > index df4601c294..e974b1ad97 100644 > > --- a/sysdeps/x86_64/multiarch/Makefile > > +++ b/sysdeps/x86_64/multiarch/Makefile > > @@ -4,6 +4,7 @@ sysdep_routines += \ > > memchr-avx2 \ > > memchr-avx2-rtm \ > > memchr-evex \ > > + memchr-evex512 \ > > memchr-evex-rtm \ > > memchr-sse2 \ > > memcmp-avx2-movbe \ > > @@ -36,6 +37,7 @@ sysdep_routines += \ > > rawmemchr-avx2 \ > > rawmemchr-avx2-rtm \ > > rawmemchr-evex \ > > + rawmemchr-evex512 \ > > rawmemchr-evex-rtm \ > > rawmemchr-sse2 \ > > stpcpy-avx2 \ > > @@ -156,6 +158,7 @@ sysdep_routines += \ > > wmemchr-avx2 \ > > wmemchr-avx2-rtm \ > > wmemchr-evex \ > > + wmemchr-evex512 \ > > wmemchr-evex-rtm \ > > wmemchr-sse2 \ > > wmemcmp-avx2-movbe \ > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index a71444eccb..17f770318d 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __memchr_evex) > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE 
(AVX512BW) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __memchr_evex512) > > X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > @@ -329,6 +334,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __rawmemchr_evex) > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE (AVX512BW) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __rawmemchr_evex512) > > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > @@ -903,6 +913,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __wmemchr_evex) > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE (AVX512BW) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __wmemchr_evex512) > > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > new file mode 100644 > > index 0000000000..524f0809b5 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > @@ -0,0 +1,306 @@ > > +/* Placeholder function, not used by any processor at the moment. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. 
> > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +/* UNUSED. Exists purely as reference implementation. */ > > + > > +#include <isa-level.h> > > + > > +#if ISA_SHOULD_BUILD (4) > > + > > +# include <sysdep.h> > > + > > +# ifdef USE_AS_WMEMCHR > > +# define CHAR_SIZE 4 > > +# define VPBROADCAST vpbroadcastd > > +# define VPCMP vpcmpd > > +# else > > +# define CHAR_SIZE 1 > > +# define VPBROADCAST vpbroadcastb > > +# define VPCMP vpcmpb > > +# endif > > + > > +# define PAGE_SIZE 4096 > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > +# define XMM1 xmm17 > > + > > +# if VEC_SIZE == 64 > > +# define KMOV kmovq > > +# define KOR korq > > +# define KORTEST kortestq > > +# define RAX rax > > +# define RCX rcx > > +# define SHR shrq > > +# define SARX sarxq > > +# define TEXTSUFFIX evex512 > > +# define VMM0 zmm16 > > +# elif VEC_SIZE == 32 > > +/* Currently Unused. */ > > +# define KMOV kmovd > > +# define KOR kord > > +# define KORTEST kortestd > > +# define RAX eax > > +# define RCX ecx > > +# define SHR shrl > > +# define SARX sarxl > > +# define TEXTSUFFIX evex256 > > +# define VMM0 ymm16 > > +# endif > > + > > + .section .text.TEXTSUFFIX, "ax", @progbits > > +/* Aligning entry point to 64 byte, provides better performance for > > + one vector length string. */ > > +ENTRY_P2ALIGN (MEMCHR, 6) > > +# ifndef USE_AS_RAWMEMCHR > > + /* Check for zero length. */ > > + test %RDX_LP, %RDX_LP > > + jz L(zero) > > + > > +# ifdef __ILP32__ > > + /* Clear the upper 32 bits. */ > > + movl %edx, %edx > > +# endif > > +# endif > > + > > + /* Broadcast CHAR to VMM0. 
*/ > > + VPBROADCAST %esi, %VMM0 > > + movl %edi, %eax > > + andl $(PAGE_SIZE - 1), %eax > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > + ja L(page_cross) > > + > > + /* Compare [w]char for null, mask bit will be set for match. */ > > + VPCMP $0, (%rdi), %VMM0, %k0 > > + > > + KMOV %k0, %RAX > > +# ifndef USE_AS_RAWMEMCHR > > + bsf %RAX, %RCX > > + jz L(align_more) > > + xor %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > +# else > > + addq %rcx, %rdi > > +# endif > > + cmp %rcx, %rdx > > + cmova %rdi, %rax > > +# else > > + bsf %RAX, %RAX > > + jz L(align_more) > > + add %rdi, %rax > > +# endif > > + ret > > + > > +# ifndef USE_AS_RAWMEMCHR > > +L(zero): > > + xorl %eax, %eax > > + ret > > +# endif > > + > > + .p2align 5,,5 > > +L(page_cross): > > + movq %rdi, %rcx > > + andq $-VEC_SIZE, %rcx > > + > > + VPCMP $0, (%rcx), %VMM0, %k0 > > + KMOV %k0, %RCX > > + SARX %RAX, %RCX, %RAX > > +# ifndef USE_AS_RAWMEMCHR > > + bsf %RAX, %RCX > > + jz L(align_more) > > + xor %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > +# else > > + addq %rcx, %rdi > > +# endif > > + cmp %rcx, %rdx > > + cmovae %rdi, %rax > > Irrelivant of other concerns this is buggy. It needs to be > cmova. A testcase? > > + > > +# else > > + bsf %rax, %rax > > + jz L(align_more) > > + add %rdi, %rax > > +# endif > > + ret > > + > > +L(ret_vec_x2): > > + subq $-VEC_SIZE, %rdi > > +L(ret_vec_x1): > > + bsf %RAX, %RAX > > +# ifndef USE_AS_RAWMEMCHR > > + jz L(zero) > > + cmp %rax, %rdx > > + jbe L(zero) > > +# endif > > +# ifdef USE_AS_WMEMCHR > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > +# else > > + add %rdi, %rax > > +# endif > > + ret > > + > > + .p2align 5,,10 > > +L(align_more): > > +# ifndef USE_AS_RAWMEMCHR > > + xor %eax, %eax > > + subq %rdi, %rax > > +# endif > > + > > + subq $-VEC_SIZE, %rdi > > + /* Align rdi to VEC_SIZE. 
*/ > > + andq $-VEC_SIZE, %rdi > > + > > +# ifndef USE_AS_RAWMEMCHR > > + addq %rdi, %rax > > +# ifdef USE_AS_WMEMCHR > > + sarl $2, %eax > > +# endif > > + subq %rax, %rdx > > + jbe L(zero) > > +# endif > > + > > + /* Loop unroll 4 times for 4 vector loop. */ > > + VPCMP $0, (%rdi), %VMM0, %k0 > > + > > + KMOV %k0, %RAX > > + test %RAX, %RAX > > + jnz L(ret_vec_x1) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + VPCMP $0, VEC_SIZE(%rdi), %VMM0, %k0 > > + > > + KMOV %k0, %RAX > > + test %RAX, %RAX > > + jnz L(ret_vec_x2) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %VMM0, %k0 > > + > > + KMOV %k0, %RAX > > + test %RAX, %RAX > > + jnz L(ret_vec_x3) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + VPCMP $0, (VEC_SIZE * 3)(%rdi), %VMM0, %k0 > > + > > + KMOV %k0, %RAX > > + test %RAX, %RAX > > + jnz L(ret_vec_x4) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > + /* Save pointer to find alignment adjustment. */ > > + movq %rdi, %rax > > +# endif > > + /* Align address to VEC_SIZE * 4 for loop. */ > > + andq $-(VEC_SIZE * 4), %rdi > > + > > + /* Add alignment difference to rdx. */ > > +# ifndef USE_AS_RAWMEMCHR > > + subq %rdi, %rax > > +# ifdef USE_AS_WMEMCHR > > + SHR $2, %RAX > > +# endif > > + addq %rax, %rdx > > + jmp L(loop_entry) > > +# endif > > + > > + /* 4 vector loop. 
*/ > > + .p2align 5,,11 > > +L(loop): > > +# ifndef USE_AS_RAWMEMCHR > > + subq $(CHAR_PER_VEC * 4), %rdx > > + jbe L(zero) > > +L(loop_entry): > > +# endif > > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %VMM0, %k1 > > + VPCMP $0, (VEC_SIZE * 5)(%rdi), %VMM0, %k2 > > + VPCMP $0, (VEC_SIZE * 6)(%rdi), %VMM0, %k3 > > + VPCMP $0, (VEC_SIZE * 7)(%rdi), %VMM0, %k4 > > + KOR %k1, %k2, %k5 > > + KOR %k3, %k4, %k6 > > + > > + subq $-(VEC_SIZE * 4), %rdi > > + KORTEST %k5, %k6 > > + jz L(loop) > > + > > + KMOV %k1, %RAX > > + test %RAX, %RAX > > + jnz L(ret_vec_x1) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + KMOV %k2, %RAX > > + test %RAX, %RAX > > + jnz L(ret_vec_x2) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + KMOV %k3, %RAX > > + test %RAX, %RAX > > + jnz L(ret_vec_x3) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + /* At this point null [w]char must be in the fourth vector so no > > + need to check. 
*/ > > + KMOV %k4, %RAX > > + > > +L(ret_vec_x4): > > + bsf %RAX, %RAX > > +# ifndef USE_AS_RAWMEMCHR > > + cmp %rax, %rdx > > + jbe L(zero) > > +# endif > > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > + .p2align 5,,5 > > +L(ret_vec_x3): > > + bsf %RAX, %RAX > > +# ifndef USE_AS_RAWMEMCHR > > + cmp %rax, %rdx > > + jbe L(zero) > > +# endif > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > +END (MEMCHR) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > > new file mode 100644 > > index 0000000000..47349d817a > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > > @@ -0,0 +1,7 @@ > > +# ifndef MEMCHR > > +# define MEMCHR __memchr_evex512 > > +# endif > > + > > +#define VEC_SIZE 64 > > + > > +#include "memchr-evex-base.S" > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > new file mode 100644 > > index 0000000000..302d3cb055 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > @@ -0,0 +1,7 @@ > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_evex512 > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > + > > +#include "memchr-evex512.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > new file mode 100644 > > index 0000000000..f45ed1db75 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > @@ -0,0 +1,8 @@ > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_evex512 > > +#endif > > + > > +#define MEMCHR WMEMCHR > > +#define USE_AS_WMEMCHR 1 > > + > > +#include "memchr-evex512.S" > > -- > > 2.36.1 > > -- H.J. ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-03 19:00 ` H.J. Lu @ 2022-10-03 19:12 ` Noah Goldstein 2022-10-13 21:41 ` [PATCH v2] " Sunil K Pandey 0 siblings, 1 reply; 26+ messages in thread From: Noah Goldstein @ 2022-10-03 19:12 UTC (permalink / raw) To: H.J. Lu; +Cc: Sunil K Pandey, libc-alpha On Mon, Oct 3, 2022 at 12:01 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Mon, Oct 3, 2022 at 11:33 AM Noah Goldstein via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > On Wed, Sep 21, 2022 at 5:27 PM Sunil K Pandey via Libc-alpha > > <libc-alpha@sourceware.org> wrote: > > > > > > This patch implements following evex512 version of string functions. > > > evex512 version takes up to 30% less cycle as compared to evex, > > > depending on length and alignment. > > > > > > - memchr function using 512 bit vectors. > > > - rawmemchr function using 512 bit vectors. > > > - wmemchr function using 512 bit vectors. > > > > > > Code size data: > > > > > > memchr-evex.o 762 byte > > > memchr-evex512.o 570 byte (-25%) > > > > > > rawmemchr-evex.o 461 byte > > > rawmemchr-evex512.o 413 byte (-10%) > > > > > > wmemchr-evex.o 794 byte > > > wmemchr-evex512.o 568 byte (-28%) > > > > > > Placeholder function, not used by any processor at the moment. 
> > > --- > > > sysdeps/x86_64/multiarch/Makefile | 3 + > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > > > sysdeps/x86_64/multiarch/memchr-evex-base.S | 306 +++++++++++++++++++ > > > sysdeps/x86_64/multiarch/memchr-evex512.S | 7 + > > > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > > > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 8 + > > > 6 files changed, 346 insertions(+) > > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > > > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > > index df4601c294..e974b1ad97 100644 > > > --- a/sysdeps/x86_64/multiarch/Makefile > > > +++ b/sysdeps/x86_64/multiarch/Makefile > > > @@ -4,6 +4,7 @@ sysdep_routines += \ > > > memchr-avx2 \ > > > memchr-avx2-rtm \ > > > memchr-evex \ > > > + memchr-evex512 \ > > > memchr-evex-rtm \ > > > memchr-sse2 \ > > > memcmp-avx2-movbe \ > > > @@ -36,6 +37,7 @@ sysdep_routines += \ > > > rawmemchr-avx2 \ > > > rawmemchr-avx2-rtm \ > > > rawmemchr-evex \ > > > + rawmemchr-evex512 \ > > > rawmemchr-evex-rtm \ > > > rawmemchr-sse2 \ > > > stpcpy-avx2 \ > > > @@ -156,6 +158,7 @@ sysdep_routines += \ > > > wmemchr-avx2 \ > > > wmemchr-avx2-rtm \ > > > wmemchr-evex \ > > > + wmemchr-evex512 \ > > > wmemchr-evex-rtm \ > > > wmemchr-sse2 \ > > > wmemcmp-avx2-movbe \ > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > index a71444eccb..17f770318d 100644 > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __memchr_evex) > > > + 
X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > + __memchr_evex512) > > > X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > @@ -329,6 +334,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __rawmemchr_evex) > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > + __rawmemchr_evex512) > > > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > @@ -903,6 +913,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __wmemchr_evex) > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > + __wmemchr_evex512) > > > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > > new file mode 100644 > > > index 0000000000..524f0809b5 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > > @@ -0,0 +1,306 @@ > > > +/* Placeholder function, not used by any processor at the moment. > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > + This file is part of the GNU C Library. 
> > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > + Lesser General Public License for more details. > > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library; if not, see > > > + <https://www.gnu.org/licenses/>. */ > > > + > > > +/* UNUSED. Exists purely as reference implementation. */ > > > + > > > +#include <isa-level.h> > > > + > > > +#if ISA_SHOULD_BUILD (4) > > > + > > > +# include <sysdep.h> > > > + > > > +# ifdef USE_AS_WMEMCHR > > > +# define CHAR_SIZE 4 > > > +# define VPBROADCAST vpbroadcastd > > > +# define VPCMP vpcmpd > > > +# else > > > +# define CHAR_SIZE 1 > > > +# define VPBROADCAST vpbroadcastb > > > +# define VPCMP vpcmpb > > > +# endif > > > + > > > +# define PAGE_SIZE 4096 > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > +# define XMM1 xmm17 > > > + > > > +# if VEC_SIZE == 64 > > > +# define KMOV kmovq > > > +# define KOR korq > > > +# define KORTEST kortestq > > > +# define RAX rax > > > +# define RCX rcx > > > +# define SHR shrq > > > +# define SARX sarxq > > > +# define TEXTSUFFIX evex512 > > > +# define VMM0 zmm16 > > > +# elif VEC_SIZE == 32 > > > +/* Currently Unused. 
*/ > > > +# define KMOV kmovd > > > +# define KOR kord > > > +# define KORTEST kortestd > > > +# define RAX eax > > > +# define RCX ecx > > > +# define SHR shrl > > > +# define SARX sarxl > > > +# define TEXTSUFFIX evex256 > > > +# define VMM0 ymm16 > > > +# endif > > > + > > > + .section .text.TEXTSUFFIX, "ax", @progbits > > > +/* Aligning entry point to 64 byte, provides better performance for > > > + one vector length string. */ > > > +ENTRY_P2ALIGN (MEMCHR, 6) > > > +# ifndef USE_AS_RAWMEMCHR > > > + /* Check for zero length. */ > > > + test %RDX_LP, %RDX_LP > > > + jz L(zero) > > > + > > > +# ifdef __ILP32__ > > > + /* Clear the upper 32 bits. */ > > > + movl %edx, %edx > > > +# endif > > > +# endif > > > + > > > + /* Broadcast CHAR to VMM0. */ > > > + VPBROADCAST %esi, %VMM0 > > > + movl %edi, %eax > > > + andl $(PAGE_SIZE - 1), %eax > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > + ja L(page_cross) > > > + > > > + /* Compare [w]char for null, mask bit will be set for match. */ > > > + VPCMP $0, (%rdi), %VMM0, %k0 > > > + > > > + KMOV %k0, %RAX > > > +# ifndef USE_AS_RAWMEMCHR > > > + bsf %RAX, %RCX > > > + jz L(align_more) > > > + xor %eax, %eax > > > +# ifdef USE_AS_WMEMCHR > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > +# else > > > + addq %rcx, %rdi > > > +# endif > > > + cmp %rcx, %rdx > > > + cmova %rdi, %rax > > > +# else > > > + bsf %RAX, %RAX > > > + jz L(align_more) > > > + add %rdi, %rax > > > +# endif > > > + ret > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > +L(zero): > > > + xorl %eax, %eax > > > + ret > > > +# endif > > > + > > > + .p2align 5,,5 > > > +L(page_cross): > > > + movq %rdi, %rcx > > > + andq $-VEC_SIZE, %rcx > > > + > > > + VPCMP $0, (%rcx), %VMM0, %k0 > > > + KMOV %k0, %RCX > > > + SARX %RAX, %RCX, %RAX > > > +# ifndef USE_AS_RAWMEMCHR > > > + bsf %RAX, %RCX > > > + jz L(align_more) > > > + xor %eax, %eax > > > +# ifdef USE_AS_WMEMCHR > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > +# else > > > + addq %rcx, %rdi > > > +# endif 
> > > + cmp %rcx, %rdx > > > + cmovae %rdi, %rax > > > > Irrelivant of other concerns this is buggy. It needs to be > > cmova. > > A testcase? Align % PAGE_SIZE == 4094 position = 1 len = 1. > > > > + > > > +# else > > > + bsf %rax, %rax > > > + jz L(align_more) > > > + add %rdi, %rax > > > +# endif > > > + ret > > > + > > > +L(ret_vec_x2): > > > + subq $-VEC_SIZE, %rdi > > > +L(ret_vec_x1): > > > + bsf %RAX, %RAX > > > +# ifndef USE_AS_RAWMEMCHR > > > + jz L(zero) > > > + cmp %rax, %rdx > > > + jbe L(zero) > > > +# endif > > > +# ifdef USE_AS_WMEMCHR > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > +# else > > > + add %rdi, %rax > > > +# endif > > > + ret > > > + > > > + .p2align 5,,10 > > > +L(align_more): > > > +# ifndef USE_AS_RAWMEMCHR > > > + xor %eax, %eax > > > + subq %rdi, %rax > > > +# endif > > > + > > > + subq $-VEC_SIZE, %rdi > > > + /* Align rdi to VEC_SIZE. */ > > > + andq $-VEC_SIZE, %rdi > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + addq %rdi, %rax > > > +# ifdef USE_AS_WMEMCHR > > > + sarl $2, %eax > > > +# endif > > > + subq %rax, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + /* Loop unroll 4 times for 4 vector loop. 
*/ > > > + VPCMP $0, (%rdi), %VMM0, %k0 > > > + > > > + KMOV %k0, %RAX > > > + test %RAX, %RAX > > > + jnz L(ret_vec_x1) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + VPCMP $0, VEC_SIZE(%rdi), %VMM0, %k0 > > > + > > > + KMOV %k0, %RAX > > > + test %RAX, %RAX > > > + jnz L(ret_vec_x2) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %VMM0, %k0 > > > + > > > + KMOV %k0, %RAX > > > + test %RAX, %RAX > > > + jnz L(ret_vec_x3) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + VPCMP $0, (VEC_SIZE * 3)(%rdi), %VMM0, %k0 > > > + > > > + KMOV %k0, %RAX > > > + test %RAX, %RAX > > > + jnz L(ret_vec_x4) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > + /* Save pointer to find alignment adjustment. */ > > > + movq %rdi, %rax > > > +# endif > > > + /* Align address to VEC_SIZE * 4 for loop. */ > > > + andq $-(VEC_SIZE * 4), %rdi > > > + > > > + /* Add alignment difference to rdx. */ > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq %rdi, %rax > > > +# ifdef USE_AS_WMEMCHR > > > + SHR $2, %RAX > > > +# endif > > > + addq %rax, %rdx > > > + jmp L(loop_entry) > > > +# endif > > > + > > > + /* 4 vector loop. 
*/ > > > + .p2align 5,,11 > > > +L(loop): > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $(CHAR_PER_VEC * 4), %rdx > > > + jbe L(zero) > > > +L(loop_entry): > > > +# endif > > > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %VMM0, %k1 > > > + VPCMP $0, (VEC_SIZE * 5)(%rdi), %VMM0, %k2 > > > + VPCMP $0, (VEC_SIZE * 6)(%rdi), %VMM0, %k3 > > > + VPCMP $0, (VEC_SIZE * 7)(%rdi), %VMM0, %k4 > > > + KOR %k1, %k2, %k5 > > > + KOR %k3, %k4, %k6 > > > + > > > + subq $-(VEC_SIZE * 4), %rdi > > > + KORTEST %k5, %k6 > > > + jz L(loop) > > > + > > > + KMOV %k1, %RAX > > > + test %RAX, %RAX > > > + jnz L(ret_vec_x1) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + KMOV %k2, %RAX > > > + test %RAX, %RAX > > > + jnz L(ret_vec_x2) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + KMOV %k3, %RAX > > > + test %RAX, %RAX > > > + jnz L(ret_vec_x3) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + /* At this point null [w]char must be in the fourth vector so no > > > + need to check. 
*/ > > > + KMOV %k4, %RAX > > > + > > > +L(ret_vec_x4): > > > + bsf %RAX, %RAX > > > +# ifndef USE_AS_RAWMEMCHR > > > + cmp %rax, %rdx > > > + jbe L(zero) > > > +# endif > > > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > > + ret > > > + > > > + .p2align 5,,5 > > > +L(ret_vec_x3): > > > + bsf %RAX, %RAX > > > +# ifndef USE_AS_RAWMEMCHR > > > + cmp %rax, %rdx > > > + jbe L(zero) > > > +# endif > > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > > + ret > > > + > > > +END (MEMCHR) > > > +#endif > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > > > new file mode 100644 > > > index 0000000000..47349d817a > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > > > @@ -0,0 +1,7 @@ > > > +# ifndef MEMCHR > > > +# define MEMCHR __memchr_evex512 > > > +# endif > > > + > > > +#define VEC_SIZE 64 > > > + > > > +#include "memchr-evex-base.S" > > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > new file mode 100644 > > > index 0000000000..302d3cb055 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > @@ -0,0 +1,7 @@ > > > +#ifndef RAWMEMCHR > > > +# define RAWMEMCHR __rawmemchr_evex512 > > > +#endif > > > +#define USE_AS_RAWMEMCHR 1 > > > +#define MEMCHR RAWMEMCHR > > > + > > > +#include "memchr-evex512.S" > > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > new file mode 100644 > > > index 0000000000..f45ed1db75 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > @@ -0,0 +1,8 @@ > > > +#ifndef WMEMCHR > > > +# define WMEMCHR __wmemchr_evex512 > > > +#endif > > > + > > > +#define MEMCHR WMEMCHR > > > +#define USE_AS_WMEMCHR 1 > > > + > > > +#include "memchr-evex512.S" > > > -- > > > 2.36.1 > > > > > > > -- > H.J. ^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v2] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-03 19:12 ` Noah Goldstein @ 2022-10-13 21:41 ` Sunil K Pandey 2022-10-15 14:01 ` [PATCH v3] " Sunil K Pandey 0 siblings, 1 reply; 26+ messages in thread From: Sunil K Pandey @ 2022-10-13 21:41 UTC (permalink / raw) To: libc-alpha Changes from v1: - Change vcmp to vcmpeq and vcmpneq. - Restrucure unconditional loop jump logic. - Improve 4 vector loop logic. - Fix bug near page boundary. This patch implements following evex512 version of string functions. evex512 version takes up to 30% less cycle as compared to evex, depending on length and alignment. - memchr function using 512 bit vectors. - rawmemchr function using 512 bit vectors. - wmemchr function using 512 bit vectors. Code size data: memchr-evex.o 762 byte memchr-evex512.o 588 byte (-23%) rawmemchr-evex.o 461 byte rawmemchr-evex512.o 432 byte (-6%) wmemchr-evex.o 794 byte wmemchr-evex512.o 586 byte (-26%) Placeholder function, not used by any processor at the moment. 
--- sysdeps/x86_64/multiarch/Makefile | 3 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + sysdeps/x86_64/multiarch/memchr-evex-base.S | 328 +++++++++++++++++++ sysdeps/x86_64/multiarch/memchr-evex512.S | 7 + sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + sysdeps/x86_64/multiarch/wmemchr-evex512.S | 8 + 6 files changed, 368 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index df4601c294..e974b1ad97 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,6 +4,7 @@ sysdep_routines += \ memchr-avx2 \ memchr-avx2-rtm \ memchr-evex \ + memchr-evex512 \ memchr-evex-rtm \ memchr-sse2 \ memcmp-avx2-movbe \ @@ -36,6 +37,7 @@ sysdep_routines += \ rawmemchr-avx2 \ rawmemchr-avx2-rtm \ rawmemchr-evex \ + rawmemchr-evex512 \ rawmemchr-evex-rtm \ rawmemchr-sse2 \ stpcpy-avx2 \ @@ -156,6 +158,7 @@ sysdep_routines += \ wmemchr-avx2 \ wmemchr-avx2-rtm \ wmemchr-evex \ + wmemchr-evex512 \ wmemchr-evex-rtm \ wmemchr-sse2 \ wmemcmp-avx2-movbe \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 00a91123d3..529c0b0ef0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __memchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __memchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -337,6 +342,11 @@ 
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __rawmemchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __rawmemchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wmemchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wmemchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S new file mode 100644 index 0000000000..4be4a6877a --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S @@ -0,0 +1,328 @@ +/* Placeholder function, not used by any processor at the moment. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* UNUSED. 
Exists purely as reference implementation. */ + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (4) + +# include <sysdep.h> + +# ifdef USE_AS_WMEMCHR +# define CHAR_SIZE 4 +# define VPBROADCAST vpbroadcastd +# define VPCMPEQ vpcmpeqd +# define VPCMPNE vpcmpneqd +# define VPMINU vpminud +# else +# define CHAR_SIZE 1 +# define VPBROADCAST vpbroadcastb +# define VPCMPEQ vpcmpeqb +# define VPCMPNE vpcmpneqb +# define VPMINU vpminub +# endif + +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + +# define XMMZERO xmm16 +# if VEC_SIZE == 64 +# define KMOV kmovq +# define KOR korq +# define NOT notq +# define KORTEST kortestq +# define VRAX rax +# define VRCX rcx +# define SHR shrq +# define SARX sarxq +# define TEXTSUFFIX evex512 +# define VMMZERO zmm16 +# define VMMMATCH zmm17 +# define VMM1 zmm18 +# define VMM2 zmm19 +# define VMM3 zmm20 +# elif VEC_SIZE == 32 +/* Currently Unused. */ +# define KMOV kmovd +# define KOR kord +# define NOT notl +# define KORTEST kortestd +# define VRAX eax +# define VRCX ecx +# define SHR shrl +# define SARX sarxl +# define TEXTSUFFIX evex256 +# define VMMZERO ymm16 +# define VMMMATCH ymm17 +# define VMM1 ymm18 +# define VMM2 ymm19 +# define VMM3 ymm20 +# endif + + .section .text.TEXTSUFFIX, "ax", @progbits +/* Aligning entry point to 64 byte, provides better performance for + one vector length string. */ +ENTRY_P2ALIGN (MEMCHR, 6) +# ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ + test %RDX_LP, %RDX_LP + jz L(zero) + +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif +# endif + + /* Broadcast CHAR to VMMMATCH. */ + VPBROADCAST %esi, %VMMMATCH + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(page_cross) + + /* Compare [w]char for null, mask bit will be set for match. 
*/ + VPCMPEQ (%rdi), %VMMMATCH, %k0 + + KMOV %k0, %VRAX +# ifndef USE_AS_RAWMEMCHR + bsf %VRAX, %VRCX + jz L(align_more) + xor %eax, %eax +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rcx, CHAR_SIZE), %rdi +# else + addq %rcx, %rdi +# endif + cmp %rcx, %rdx + cmova %rdi, %rax +# else + bsf %VRAX, %VRAX + jz L(align_more) + add %rdi, %rax +# endif + ret + +# ifndef USE_AS_RAWMEMCHR +L(zero): + xorl %eax, %eax + ret +# endif + + .p2align 5,,5 +L(page_cross): + movq %rdi, %rcx + andq $-VEC_SIZE, %rcx + + VPCMPEQ (%rcx), %VMMMATCH, %k0 + KMOV %k0, %VRCX + SARX %VRAX, %VRCX, %VRAX +# ifndef USE_AS_RAWMEMCHR + bsf %VRAX, %VRCX + jz L(align_more) + xor %eax, %eax +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rcx, CHAR_SIZE), %rdi +# else + addq %rcx, %rdi +# endif + cmp %rcx, %rdx + cmova %rdi, %rax + +# else + bsf %rax, %rax + jz L(align_more) + add %rdi, %rax +# endif + ret + +L(ret_vec_x2): + subq $-VEC_SIZE, %rdi +L(ret_vec_x1): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + jz L(zero) + cmp %rax, %rdx + jbe L(zero) +# endif +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + add %rdi, %rax +# endif + ret + + .p2align 5,,10 +L(align_more): +# ifndef USE_AS_RAWMEMCHR + xor %eax, %eax + subq %rdi, %rax +# endif + + subq $-VEC_SIZE, %rdi + /* Align rdi to VEC_SIZE. */ + andq $-VEC_SIZE, %rdi + +# ifndef USE_AS_RAWMEMCHR + addq %rdi, %rax +# ifdef USE_AS_WMEMCHR + sarl $2, %eax +# endif + subq %rax, %rdx + jbe L(zero) +# endif + + /* Loop unroll 4 times for 4 vector loop. 
*/ + VPCMPEQ (%rdi), %VMMMATCH, %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x1) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ VEC_SIZE(%rdi), %VMMMATCH, %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x2) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMMMATCH, %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x3) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMMMATCH, %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x4) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) + /* Save pointer to find alignment adjustment. */ + movq %rdi, %rax +# endif + /* Align address to VEC_SIZE * 4 for loop. */ + andq $-(VEC_SIZE * 4), %rdi + + /* Add alignment difference to rdx. */ +# ifndef USE_AS_RAWMEMCHR + subq %rdi, %rax +# ifdef USE_AS_WMEMCHR + SHR $2, %VRAX +# endif + addq %rax, %rdx +# endif + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + + /* 4 vector loop. 
*/ + .p2align 5,,11 +L(loop): + + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMMMATCH, %k1 + vpxorq (VEC_SIZE * 5)(%rdi), %VMMMATCH, %VMM1 + vpxorq (VEC_SIZE * 6)(%rdi), %VMMMATCH, %VMM2 + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMMMATCH, %k3 + VPMINU %VMM1, %VMM2, %VMM3{%k1}{z} + VPCMPEQ %VMM3, %VMMZERO, %k2 + + subq $-(VEC_SIZE * 4), %rdi + KORTEST %k2, %k3 +# ifdef USE_AS_RAWMEMCHR + jz L(loop) +# else + jnz L(loopend) + subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop) + xor %eax, %eax + ret +# endif + +L(loopend): + VPCMPEQ (%rdi), %VMMMATCH, %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x1) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ VEC_SIZE(%rdi), %VMMMATCH, %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x2) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMMMATCH, %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x3) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + /* At this point null [w]char must be in the fourth vector so no + need to check. 
*/ + KMOV %k3, %VRAX + +L(ret_vec_x4): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + cmp %rax, %rdx + jbe L(zero) +# endif + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 5,,5 +L(ret_vec_x3): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + cmp %rax, %rdx + jbe L(zero) +# endif + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +END (MEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S new file mode 100644 index 0000000000..47349d817a --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S @@ -0,0 +1,7 @@ +# ifndef MEMCHR +# define MEMCHR __memchr_evex512 +# endif + +#define VEC_SIZE 64 + +#include "memchr-evex-base.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S new file mode 100644 index 0000000000..302d3cb055 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S @@ -0,0 +1,7 @@ +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_evex512 +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR + +#include "memchr-evex512.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S new file mode 100644 index 0000000000..f45ed1db75 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S @@ -0,0 +1,8 @@ +#ifndef WMEMCHR +# define WMEMCHR __wmemchr_evex512 +#endif + +#define MEMCHR WMEMCHR +#define USE_AS_WMEMCHR 1 + +#include "memchr-evex512.S" -- 2.36.1 ^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v3] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-13 21:41 ` [PATCH v2] " Sunil K Pandey @ 2022-10-15 14:01 ` Sunil K Pandey 2022-10-15 16:26 ` Noah Goldstein 2022-10-15 16:26 ` Noah Goldstein 0 siblings, 2 replies; 26+ messages in thread From: Sunil K Pandey @ 2022-10-15 14:01 UTC (permalink / raw) To: libc-alpha Changes from v2: - Use VEC API - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2) Changes from v1: - Change vcmp to vcmpeq and vcmpneq. - Restructure unconditional loop jump logic. - Improve 4 vector loop logic. - Fix bug near page boundary. This patch implements following evex512 version of string functions. evex512 version takes up to 30% fewer cycles as compared to evex, depending on length and alignment. - memchr function using 512 bit vectors. - rawmemchr function using 512 bit vectors. - wmemchr function using 512 bit vectors. Code size data: memchr-evex.o 762 byte memchr-evex512.o 576 byte (-24%) rawmemchr-evex.o 461 byte rawmemchr-evex512.o 432 byte (-6%) wmemchr-evex.o 794 byte wmemchr-evex512.o 574 byte (-28%) Placeholder function, not used by any processor at the moment. 
--- sysdeps/x86_64/multiarch/Makefile | 3 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + sysdeps/x86_64/multiarch/memchr-evex-base.S | 295 +++++++++++++++++++ sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + sysdeps/x86_64/multiarch/wmemchr-evex512.S | 8 + 6 files changed, 336 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index df4601c294..e974b1ad97 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,6 +4,7 @@ sysdep_routines += \ memchr-avx2 \ memchr-avx2-rtm \ memchr-evex \ + memchr-evex512 \ memchr-evex-rtm \ memchr-sse2 \ memcmp-avx2-movbe \ @@ -36,6 +37,7 @@ sysdep_routines += \ rawmemchr-avx2 \ rawmemchr-avx2-rtm \ rawmemchr-evex \ + rawmemchr-evex512 \ rawmemchr-evex-rtm \ rawmemchr-sse2 \ stpcpy-avx2 \ @@ -156,6 +158,7 @@ sysdep_routines += \ wmemchr-avx2 \ wmemchr-avx2-rtm \ wmemchr-evex \ + wmemchr-evex512 \ wmemchr-evex-rtm \ wmemchr-sse2 \ wmemcmp-avx2-movbe \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 00a91123d3..529c0b0ef0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __memchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __memchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -337,6 +342,11 @@ 
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __rawmemchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __rawmemchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wmemchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wmemchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S new file mode 100644 index 0000000000..e3848dfed6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S @@ -0,0 +1,295 @@ +/* Placeholder function, not used by any processor at the moment. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* UNUSED. 
Exists purely as reference implementation. */ + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (4) + +# include <sysdep.h> + +# ifdef USE_AS_WMEMCHR +# define CHAR_SIZE 4 +# define VPBROADCAST vpbroadcastd +# define VPCMPEQ vpcmpeqd +# define VPCMPNE vpcmpneqd +# define VPMINU vpminud +# else +# define CHAR_SIZE 1 +# define VPBROADCAST vpbroadcastb +# define VPCMPEQ vpcmpeqb +# define VPCMPNE vpcmpneqb +# define VPMINU vpminub +# endif + +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text), "ax", @progbits +/* Aligning entry point to 64 byte, provides better performance for + one vector length string. */ +ENTRY_P2ALIGN (MEMCHR, 6) +# ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ + test %RDX_LP, %RDX_LP + jz L(zero) + +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif +# endif + + /* Broadcast CHAR to VMM(1). */ + VPBROADCAST %esi, %VMM(1) + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(page_cross) + + /* Compare [w]char for null, mask bit will be set for match. 
*/ + VPCMPEQ (%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX +# ifndef USE_AS_RAWMEMCHR + bsf %VRAX, %VRCX + jz L(align_more) + xor %eax, %eax +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rcx, CHAR_SIZE), %rdi +# else + addq %rcx, %rdi +# endif + cmp %rcx, %rdx + cmova %rdi, %rax +# else + bsf %VRAX, %VRAX + jz L(align_more) + add %rdi, %rax +# endif + ret + +# ifndef USE_AS_RAWMEMCHR +L(zero): + xorl %eax, %eax + ret +# endif + + .p2align 5,,5 +L(page_cross): + movq %rdi, %rcx + andq $-VEC_SIZE, %rcx + + VPCMPEQ (%rcx), %VMM(1), %k0 + KMOV %k0, %VRCX + SARX %VRAX, %VRCX, %VRAX +# ifndef USE_AS_RAWMEMCHR + bsf %VRAX, %VRCX + jz L(align_more) + xor %eax, %eax +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rcx, CHAR_SIZE), %rdi +# else + addq %rcx, %rdi +# endif + cmp %rcx, %rdx + cmova %rdi, %rax + +# else + bsf %rax, %rax + jz L(align_more) + add %rdi, %rax +# endif + ret + +L(ret_vec_x2): + subq $-VEC_SIZE, %rdi +L(ret_vec_x1): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + jz L(zero) + cmp %rax, %rdx + jbe L(zero) +# endif +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + add %rdi, %rax +# endif + ret + + .p2align 5,,10 +L(align_more): +# ifndef USE_AS_RAWMEMCHR + xor %eax, %eax + subq %rdi, %rax +# endif + + subq $-VEC_SIZE, %rdi + /* Align rdi to VEC_SIZE. */ + andq $-VEC_SIZE, %rdi + +# ifndef USE_AS_RAWMEMCHR + addq %rdi, %rax +# ifdef USE_AS_WMEMCHR + sarl $2, %eax +# endif + subq %rax, %rdx + jbe L(zero) +# endif + + /* Loop unroll 4 times for 4 vector loop. 
*/ + VPCMPEQ (%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x1) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x2) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x3) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x4) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) + /* Save pointer to find alignment adjustment. */ + movq %rdi, %rax +# endif + /* Align address to VEC_SIZE * 4 for loop. */ + andq $-(VEC_SIZE * 4), %rdi + + /* Add alignment difference to rdx. */ +# ifndef USE_AS_RAWMEMCHR + subq %rdi, %rax +# ifdef USE_AS_WMEMCHR + shr $2, %VRAX +# endif + addq %rax, %rdx +# endif + vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0) + + /* 4 vector loop. 
*/ + .p2align 5,,11 +L(loop): + + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} + VPCMPEQ %VMM(3), %VMM(0), %k2 + + subq $-(VEC_SIZE * 4), %rdi + KORTEST %k2, %k3 +# ifdef USE_AS_RAWMEMCHR + jz L(loop) +# else + jnz L(loopend) + subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop) +L(zero_2): + xor %eax, %eax + ret +# endif + +L(loopend): + VPCMPEQ (%rdi), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x1) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero_2) +# endif + + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x2) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero_2) +# endif + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x3) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero_2) +# endif + + /* At this point null [w]char must be in the fourth vector so no + need to check. 
*/ + KMOV %k3, %VRAX + +L(ret_vec_x4): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + cmp %rax, %rdx + jbe L(zero) +# endif + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 5,,5 +L(ret_vec_x3): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + cmp %rax, %rdx + jbe L(zero) +# endif + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +END (MEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S new file mode 100644 index 0000000000..002f8c8489 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S @@ -0,0 +1,8 @@ +# ifndef MEMCHR +# define MEMCHR __memchr_evex512 +# endif + +#include "x86-evex512-vecs.h" +#include "reg-macros.h" + +#include "memchr-evex-base.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S new file mode 100644 index 0000000000..302d3cb055 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S @@ -0,0 +1,7 @@ +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_evex512 +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR + +#include "memchr-evex512.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S new file mode 100644 index 0000000000..f45ed1db75 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S @@ -0,0 +1,8 @@ +#ifndef WMEMCHR +# define WMEMCHR __wmemchr_evex512 +#endif + +#define MEMCHR WMEMCHR +#define USE_AS_WMEMCHR 1 + +#include "memchr-evex512.S" -- 2.36.1 ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v3] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-15 14:01 ` [PATCH v3] " Sunil K Pandey @ 2022-10-15 16:26 ` Noah Goldstein 2022-10-15 16:26 ` Noah Goldstein 1 sibling, 0 replies; 26+ messages in thread From: Noah Goldstein @ 2022-10-15 16:26 UTC (permalink / raw) To: Sunil K Pandey; +Cc: libc-alpha On Sat, Oct 15, 2022 at 9:02 AM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > Changes from v2: > - Use VEC API > - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2) > > Changes from v1: > - Change vcmp to vcmpeq and vcmpneq. > - Restrucure unconditional loop jump logic. > - Improve 4 vector loop logic. > - Fix bug near page boundary. > > This patch implements following evex512 version of string functions. > evex512 version takes up to 30% less cycle as compared to evex, > depending on length and alignment. > > - memchr function using 512 bit vectors. > - rawmemchr function using 512 bit vectors. > - wmemchr function using 512 bit vectors. > > Code size data: > > memchr-evex.o 762 byte > memchr-evex512.o 576 byte (-24%) > > rawmemchr-evex.o 461 byte > rawmemchr-evex512.o 432 byte (-6%) > > wmemchr-evex.o 794 byte > wmemchr-evex512.o 574 byte (-28%) > > Placeholder function, not used by any processor at the moment. 
> --- > sysdeps/x86_64/multiarch/Makefile | 3 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > sysdeps/x86_64/multiarch/memchr-evex-base.S | 295 +++++++++++++++++++ > sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 8 + > 6 files changed, 336 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index df4601c294..e974b1ad97 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -4,6 +4,7 @@ sysdep_routines += \ > memchr-avx2 \ > memchr-avx2-rtm \ > memchr-evex \ > + memchr-evex512 \ > memchr-evex-rtm \ > memchr-sse2 \ > memcmp-avx2-movbe \ > @@ -36,6 +37,7 @@ sysdep_routines += \ > rawmemchr-avx2 \ > rawmemchr-avx2-rtm \ > rawmemchr-evex \ > + rawmemchr-evex512 \ > rawmemchr-evex-rtm \ > rawmemchr-sse2 \ > stpcpy-avx2 \ > @@ -156,6 +158,7 @@ sysdep_routines += \ > wmemchr-avx2 \ > wmemchr-avx2-rtm \ > wmemchr-evex \ > + wmemchr-evex512 \ > wmemchr-evex-rtm \ > wmemchr-sse2 \ > wmemcmp-avx2-movbe \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 00a91123d3..529c0b0ef0 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __memchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __memchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, 
memchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > @@ -337,6 +342,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __rawmemchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __rawmemchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __wmemchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __wmemchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > new file mode 100644 > index 0000000000..e3848dfed6 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > @@ -0,0 +1,295 @@ > +/* Placeholder function, not used by any processor at the moment. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. 
> + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* UNUSED. Exists purely as reference implementation. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (4) > + > +# include <sysdep.h> > + > +# ifdef USE_AS_WMEMCHR > +# define CHAR_SIZE 4 > +# define VPBROADCAST vpbroadcastd > +# define VPCMPEQ vpcmpeqd > +# define VPCMPNE vpcmpneqd > +# define VPMINU vpminud > +# else > +# define CHAR_SIZE 1 > +# define VPBROADCAST vpbroadcastb > +# define VPCMPEQ vpcmpeqb > +# define VPCMPNE vpcmpneqb > +# define VPMINU vpminub > +# endif > + > +# define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > + .section SECTION(.text), "ax", @progbits > +/* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > +ENTRY_P2ALIGN (MEMCHR, 6) > +# ifndef USE_AS_RAWMEMCHR > + /* Check for zero length. */ > + test %RDX_LP, %RDX_LP > + jz L(zero) > + > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %edx, %edx > +# endif > +# endif > + > + /* Broadcast CHAR to VMM(1). */ > + VPBROADCAST %esi, %VMM(1) > + movl %edi, %eax > + andl $(PAGE_SIZE - 1), %eax > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(page_cross) > + > + /* Compare [w]char for null, mask bit will be set for match. 
*/ > + VPCMPEQ (%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + bsf %VRAX, %VRCX > + jz L(align_more) > + xor %eax, %eax > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > +# else > + addq %rcx, %rdi > +# endif > + cmp %rcx, %rdx > + cmova %rdi, %rax > +# else > + bsf %VRAX, %VRAX > + jz L(align_more) > + add %rdi, %rax > +# endif > + ret > + > +# ifndef USE_AS_RAWMEMCHR > +L(zero): > + xorl %eax, %eax > + ret > +# endif > + > + .p2align 5,,5 > +L(page_cross): > + movq %rdi, %rcx > + andq $-VEC_SIZE, %rcx > + > + VPCMPEQ (%rcx), %VMM(1), %k0 > + KMOV %k0, %VRCX > + SARX %VRAX, %VRCX, %VRAX Downcase the SARX as is not a macro. > +# ifndef USE_AS_RAWMEMCHR > + bsf %VRAX, %VRCX > + jz L(align_more) > + xor %eax, %eax > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > +# else > + addq %rcx, %rdi > +# endif > + cmp %rcx, %rdx > + cmova %rdi, %rax > + > +# else > + bsf %rax, %rax > + jz L(align_more) > + add %rdi, %rax > +# endif > + ret > + > +L(ret_vec_x2): > + subq $-VEC_SIZE, %rdi > +L(ret_vec_x1): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + jz L(zero) > + cmp %rax, %rdx > + jbe L(zero) > +# endif > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + add %rdi, %rax > +# endif > + ret > + > + .p2align 5,,10 > +L(align_more): > +# ifndef USE_AS_RAWMEMCHR > + xor %eax, %eax > + subq %rdi, %rax > +# endif > + > + subq $-VEC_SIZE, %rdi > + /* Align rdi to VEC_SIZE. */ > + andq $-VEC_SIZE, %rdi > + > +# ifndef USE_AS_RAWMEMCHR > + addq %rdi, %rax > +# ifdef USE_AS_WMEMCHR > + sarl $2, %eax > +# endif > + subq %rax, %rdx At this point we have tested VEC_SIZE bytes but you are only testing (VEC_SIZE - (%rdi % VEC_SIZE)) I think a seperate branch that tests full VEC_SIZE makes sense. We don't have a case for it in micro-benchmark but something like: pos = 128, len = 64, al = 0/32 There will be a major drop in perf when al == 32 bc you will end up testing an additional VEC. 
> + jbe L(zero) > +# endif > + > + /* Loop unroll 4 times for 4 vector loop. */ > + VPCMPEQ (%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x1) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x2) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x3) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x4) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > + /* Save pointer to find alignment adjustment. */ > + movq %rdi, %rax > +# endif > + /* Align address to VEC_SIZE * 4 for loop. */ > + andq $-(VEC_SIZE * 4), %rdi > + > + /* Add alignment difference to rdx. */ > +# ifndef USE_AS_RAWMEMCHR > + subq %rdi, %rax > +# ifdef USE_AS_WMEMCHR > + shr $2, %VRAX > +# endif > + addq %rax, %rdx > +# endif > + vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0) > + This is unneeded. > + /* 4 vector loop. 
*/ > + .p2align 5,,11 > +L(loop): > + > + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 > + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) > + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > + VPCMPEQ %VMM(3), %VMM(0), %k2 > + Use vptestnm{d|b} > + subq $-(VEC_SIZE * 4), %rdi > + KORTEST %k2, %k3 > +# ifdef USE_AS_RAWMEMCHR > + jz L(loop) > +# else > + jnz L(loopend) > + subq $(CHAR_PER_VEC * 4), %rdx > + ja L(loop) > +L(zero_2): > + xor %eax, %eax > + ret > +# endif > + > +L(loopend): > + VPCMPEQ (%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x1) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x2) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x3) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + /* At this point null [w]char must be in the fourth vector so no > + need to check. 
*/ > + KMOV %k3, %VRAX > + > +L(ret_vec_x4): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > + .p2align 5,,5 > +L(ret_vec_x3): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > +END (MEMCHR) > +#endif > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > new file mode 100644 > index 0000000000..002f8c8489 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > @@ -0,0 +1,8 @@ > +# ifndef MEMCHR > +# define MEMCHR __memchr_evex512 > +# endif > + > +#include "x86-evex512-vecs.h" > +#include "reg-macros.h" > + > +#include "memchr-evex-base.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > new file mode 100644 > index 0000000000..302d3cb055 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > @@ -0,0 +1,7 @@ > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_evex512 > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > + > +#include "memchr-evex512.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > new file mode 100644 > index 0000000000..f45ed1db75 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > @@ -0,0 +1,8 @@ > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_evex512 > +#endif > + > +#define MEMCHR WMEMCHR > +#define USE_AS_WMEMCHR 1 > + > +#include "memchr-evex512.S" > -- > 2.36.1 > ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v3] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-15 14:01 ` [PATCH v3] " Sunil K Pandey 2022-10-15 16:26 ` Noah Goldstein @ 2022-10-15 16:26 ` Noah Goldstein 2022-10-17 3:53 ` [PATCH v4] " Sunil K Pandey 1 sibling, 1 reply; 26+ messages in thread From: Noah Goldstein @ 2022-10-15 16:26 UTC (permalink / raw) To: Sunil K Pandey; +Cc: libc-alpha On Sat, Oct 15, 2022 at 9:02 AM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > Changes from v2: > - Use VEC API > - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2) > > Changes from v1: > - Change vcmp to vcmpeq and vcmpneq. > - Restrucure unconditional loop jump logic. > - Improve 4 vector loop logic. > - Fix bug near page boundary. > > This patch implements following evex512 version of string functions. > evex512 version takes up to 30% less cycle as compared to evex, > depending on length and alignment. > > - memchr function using 512 bit vectors. > - rawmemchr function using 512 bit vectors. > - wmemchr function using 512 bit vectors. > > Code size data: > > memchr-evex.o 762 byte > memchr-evex512.o 576 byte (-24%) > > rawmemchr-evex.o 461 byte > rawmemchr-evex512.o 432 byte (-6%) > > wmemchr-evex.o 794 byte > wmemchr-evex512.o 574 byte (-28%) > > Placeholder function, not used by any processor at the moment. 
> --- > sysdeps/x86_64/multiarch/Makefile | 3 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > sysdeps/x86_64/multiarch/memchr-evex-base.S | 295 +++++++++++++++++++ > sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 8 + > 6 files changed, 336 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index df4601c294..e974b1ad97 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -4,6 +4,7 @@ sysdep_routines += \ > memchr-avx2 \ > memchr-avx2-rtm \ > memchr-evex \ > + memchr-evex512 \ > memchr-evex-rtm \ > memchr-sse2 \ > memcmp-avx2-movbe \ > @@ -36,6 +37,7 @@ sysdep_routines += \ > rawmemchr-avx2 \ > rawmemchr-avx2-rtm \ > rawmemchr-evex \ > + rawmemchr-evex512 \ > rawmemchr-evex-rtm \ > rawmemchr-sse2 \ > stpcpy-avx2 \ > @@ -156,6 +158,7 @@ sysdep_routines += \ > wmemchr-avx2 \ > wmemchr-avx2-rtm \ > wmemchr-evex \ > + wmemchr-evex512 \ > wmemchr-evex-rtm \ > wmemchr-sse2 \ > wmemcmp-avx2-movbe \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 00a91123d3..529c0b0ef0 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __memchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __memchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, 
memchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > @@ -337,6 +342,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __rawmemchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __rawmemchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __wmemchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __wmemchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > new file mode 100644 > index 0000000000..e3848dfed6 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > @@ -0,0 +1,295 @@ > +/* Placeholder function, not used by any processor at the moment. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. 
> + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* UNUSED. Exists purely as reference implementation. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (4) > + > +# include <sysdep.h> > + > +# ifdef USE_AS_WMEMCHR > +# define CHAR_SIZE 4 > +# define VPBROADCAST vpbroadcastd > +# define VPCMPEQ vpcmpeqd > +# define VPCMPNE vpcmpneqd > +# define VPMINU vpminud > +# else > +# define CHAR_SIZE 1 > +# define VPBROADCAST vpbroadcastb > +# define VPCMPEQ vpcmpeqb > +# define VPCMPNE vpcmpneqb > +# define VPMINU vpminub > +# endif > + > +# define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > + .section SECTION(.text), "ax", @progbits > +/* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > +ENTRY_P2ALIGN (MEMCHR, 6) > +# ifndef USE_AS_RAWMEMCHR > + /* Check for zero length. */ > + test %RDX_LP, %RDX_LP > + jz L(zero) > + > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %edx, %edx > +# endif > +# endif > + > + /* Broadcast CHAR to VMM(1). */ > + VPBROADCAST %esi, %VMM(1) > + movl %edi, %eax > + andl $(PAGE_SIZE - 1), %eax > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(page_cross) > + > + /* Compare [w]char for null, mask bit will be set for match. 
*/ > + VPCMPEQ (%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + bsf %VRAX, %VRCX > + jz L(align_more) > + xor %eax, %eax > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > +# else > + addq %rcx, %rdi > +# endif > + cmp %rcx, %rdx > + cmova %rdi, %rax > +# else > + bsf %VRAX, %VRAX > + jz L(align_more) > + add %rdi, %rax > +# endif > + ret > + > +# ifndef USE_AS_RAWMEMCHR > +L(zero): > + xorl %eax, %eax > + ret > +# endif > + > + .p2align 5,,5 > +L(page_cross): > + movq %rdi, %rcx > + andq $-VEC_SIZE, %rcx > + > + VPCMPEQ (%rcx), %VMM(1), %k0 > + KMOV %k0, %VRCX > + SARX %VRAX, %VRCX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + bsf %VRAX, %VRCX > + jz L(align_more) > + xor %eax, %eax > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > +# else > + addq %rcx, %rdi > +# endif > + cmp %rcx, %rdx > + cmova %rdi, %rax > + > +# else > + bsf %rax, %rax > + jz L(align_more) > + add %rdi, %rax > +# endif > + ret > + > +L(ret_vec_x2): > + subq $-VEC_SIZE, %rdi > +L(ret_vec_x1): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + jz L(zero) > + cmp %rax, %rdx > + jbe L(zero) > +# endif > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + add %rdi, %rax > +# endif > + ret > + > + .p2align 5,,10 > +L(align_more): > +# ifndef USE_AS_RAWMEMCHR > + xor %eax, %eax > + subq %rdi, %rax > +# endif > + > + subq $-VEC_SIZE, %rdi > + /* Align rdi to VEC_SIZE. */ > + andq $-VEC_SIZE, %rdi > + > +# ifndef USE_AS_RAWMEMCHR > + addq %rdi, %rax > +# ifdef USE_AS_WMEMCHR > + sarl $2, %eax > +# endif > + subq %rax, %rdx > + jbe L(zero) > +# endif > + > + /* Loop unroll 4 times for 4 vector loop. 
*/ > + VPCMPEQ (%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x1) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x2) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x3) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x4) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > + /* Save pointer to find alignment adjustment. */ > + movq %rdi, %rax > +# endif > + /* Align address to VEC_SIZE * 4 for loop. */ > + andq $-(VEC_SIZE * 4), %rdi > + > + /* Add alignment difference to rdx. */ > +# ifndef USE_AS_RAWMEMCHR > + subq %rdi, %rax > +# ifdef USE_AS_WMEMCHR > + shr $2, %VRAX > +# endif > + addq %rax, %rdx > +# endif > + vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0) > + > + /* 4 vector loop. 
*/ > + .p2align 5,,11 > +L(loop): > + > + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 > + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) > + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > + VPCMPEQ %VMM(3), %VMM(0), %k2 > + > + subq $-(VEC_SIZE * 4), %rdi > + KORTEST %k2, %k3 > +# ifdef USE_AS_RAWMEMCHR > + jz L(loop) > +# else > + jnz L(loopend) > + subq $(CHAR_PER_VEC * 4), %rdx > + ja L(loop) > +L(zero_2): > + xor %eax, %eax > + ret > +# endif > + > +L(loopend): > + VPCMPEQ (%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x1) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x2) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x3) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + /* At this point null [w]char must be in the fourth vector so no > + need to check. 
*/ > + KMOV %k3, %VRAX > + > +L(ret_vec_x4): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > + .p2align 5,,5 > +L(ret_vec_x3): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > +END (MEMCHR) > +#endif > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > new file mode 100644 > index 0000000000..002f8c8489 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > @@ -0,0 +1,8 @@ > +# ifndef MEMCHR > +# define MEMCHR __memchr_evex512 > +# endif > + > +#include "x86-evex512-vecs.h" > +#include "reg-macros.h" > + > +#include "memchr-evex-base.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > new file mode 100644 > index 0000000000..302d3cb055 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > @@ -0,0 +1,7 @@ > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_evex512 > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > + > +#include "memchr-evex512.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > new file mode 100644 > index 0000000000..f45ed1db75 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > @@ -0,0 +1,8 @@ > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_evex512 > +#endif > + > +#define MEMCHR WMEMCHR > +#define USE_AS_WMEMCHR 1 defined USE_WIDE_CHAR here as 32-bit gprs should be enough. > + > +#include "memchr-evex512.S" > -- > 2.36.1 > ^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v4] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-15 16:26 ` Noah Goldstein @ 2022-10-17 3:53 ` Sunil K Pandey 2022-10-17 15:46 ` Noah Goldstein 0 siblings, 1 reply; 26+ messages in thread From: Sunil K Pandey @ 2022-10-17 3:53 UTC (permalink / raw) To: libc-alpha Changes from v3: - Replace VPCMPEQ in loop with VPTESTNM for 4th vector. - Change first vector max check logic for terminating condition. - Change page cross logic for terminating condition. - Remove unnecessary check in align_more block. - Remove unnecessary VEC(0) initialization. - Define USE_WIDE_CHAR in wmemchr. Changes from v2: - Use VEC API - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2) Changes from v1: - Change vcmp to vcmpeq and vcmpneq. - Restructure unconditional loop jump logic. - Improve 4 vector loop logic. - Fix bug near page boundary. This patch implements following evex512 version of string functions. evex512 version takes up to 30% fewer cycles as compared to evex, depending on length and alignment. - memchr function using 512 bit vectors. - rawmemchr function using 512 bit vectors. - wmemchr function using 512 bit vectors. Code size data: memchr-evex.o 762 byte memchr-evex512.o 576 byte (-24%) rawmemchr-evex.o 461 byte rawmemchr-evex512.o 432 byte (-6%) wmemchr-evex.o 794 byte wmemchr-evex512.o 576 byte (-27%) Placeholder function, not used by any processor at the moment. 
--- sysdeps/x86_64/multiarch/Makefile | 3 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + sysdeps/x86_64/multiarch/memchr-evex-base.S | 301 +++++++++++++++++++ sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + sysdeps/x86_64/multiarch/wmemchr-evex512.S | 9 + 6 files changed, 343 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index df4601c294..e974b1ad97 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,6 +4,7 @@ sysdep_routines += \ memchr-avx2 \ memchr-avx2-rtm \ memchr-evex \ + memchr-evex512 \ memchr-evex-rtm \ memchr-sse2 \ memcmp-avx2-movbe \ @@ -36,6 +37,7 @@ sysdep_routines += \ rawmemchr-avx2 \ rawmemchr-avx2-rtm \ rawmemchr-evex \ + rawmemchr-evex512 \ rawmemchr-evex-rtm \ rawmemchr-sse2 \ stpcpy-avx2 \ @@ -156,6 +158,7 @@ sysdep_routines += \ wmemchr-avx2 \ wmemchr-avx2-rtm \ wmemchr-evex \ + wmemchr-evex512 \ wmemchr-evex-rtm \ wmemchr-sse2 \ wmemcmp-avx2-movbe \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 00a91123d3..529c0b0ef0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __memchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __memchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -337,6 +342,11 @@ 
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __rawmemchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __rawmemchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wmemchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wmemchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S new file mode 100644 index 0000000000..4de60655b4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S @@ -0,0 +1,301 @@ +/* Placeholder function, not used by any processor at the moment. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* UNUSED. 
Exists purely as reference implementation. */ + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (4) + +# include <sysdep.h> + +# ifdef USE_AS_WMEMCHR +# define CHAR_SIZE 4 +# define VPBROADCAST vpbroadcastd +# define VPCMPEQ vpcmpeqd +# define VPCMPNE vpcmpneqd +# define VPMINU vpminud +# define VPTESTNM vptestnmd +# else +# define CHAR_SIZE 1 +# define VPBROADCAST vpbroadcastb +# define VPCMPEQ vpcmpeqb +# define VPCMPNE vpcmpneqb +# define VPMINU vpminub +# define VPTESTNM vptestnmb +# endif + +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text), "ax", @progbits +/* Aligning entry point to 64 byte, provides better performance for + one vector length string. */ +ENTRY_P2ALIGN (MEMCHR, 6) +# ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ + test %RDX_LP, %RDX_LP + jz L(zero) + +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif +# endif + + /* Broadcast CHAR to VMM(1). */ + VPBROADCAST %esi, %VMM(1) + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(page_cross) + + /* Compare [w]char for null, mask bit will be set for match. 
*/ + VPCMPEQ (%rdi), %VMM(1), %k0 + + KMOV %k0, %VRCX +# ifndef USE_AS_RAWMEMCHR + mov %rdx, %rax + bsf %VRCX, %VRAX + cmp $CHAR_PER_VEC, %rax + ja L(align_more) +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rax, CHAR_SIZE), %rdi +# else + addq %rax, %rdi +# endif + cmp %rax, %rdx + jbe L(zero) + mov %rdi, %rax +# else + bsf %VRCX, %VRAX + jz L(align_more) + add %rdi, %rax +# endif + ret + +# ifndef USE_AS_RAWMEMCHR +L(zero): + xorl %eax, %eax + ret +# endif + + .p2align 5,,5 +L(page_cross): + movl %eax, %ecx + andl $(VEC_SIZE - 1), %ecx +# ifdef USE_AS_WMEMCHR + shrl $2, %ecx +# endif + xorq %rdi, %rax + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0 + KMOV %k0, %VRSI + shr %cl, %VRSI +# ifndef USE_AS_RAWMEMCHR + jnz L(page_cross_end) + movl $CHAR_PER_VEC, %eax + sub %ecx, %eax + cmp %rax, %rdx + ja L(align_more) +# else + jz L(align_more) +# endif + +L(page_cross_end): +# ifndef USE_AS_RAWMEMCHR + bsf %VRSI, %VRCX + leaq (%rdi, %rcx, CHAR_SIZE), %rax + cmp %rcx, %rdx + jbe L(zero) +# else + bsf %VRSI, %VRAX + add %rdi, %rax +# endif + ret + +L(ret_vec_x2): + subq $-VEC_SIZE, %rdi +L(ret_vec_x1): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + jz L(zero) + cmp %rax, %rdx + jbe L(zero) +# endif +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + add %rdi, %rax +# endif + ret + + .p2align 5,,10 +L(align_more): +# ifndef USE_AS_RAWMEMCHR + mov %rdi, %rax +# endif + subq $-VEC_SIZE, %rdi + /* Align rdi to VEC_SIZE. */ + andq $-VEC_SIZE, %rdi + +# ifndef USE_AS_RAWMEMCHR + subq %rdi, %rax +# ifdef USE_AS_WMEMCHR + sar $2, %rax +# endif + addq %rax, %rdx +# endif + + /* Loop unroll 4 times for 4 vector loop. 
*/ + VPCMPEQ (%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x1) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x2) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x3) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x4) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) + /* Save pointer to find alignment adjustment. */ + movq %rdi, %rax +# endif + /* Align address to VEC_SIZE * 4 for loop. */ + andq $-(VEC_SIZE * 4), %rdi + + /* Add alignment difference to rdx. */ +# ifndef USE_AS_RAWMEMCHR + subq %rdi, %rax +# ifdef USE_AS_WMEMCHR + shr $2, %VRAX +# endif + addq %rax, %rdx +# endif + + /* 4 vector loop. 
*/ + .p2align 5,,11 +L(loop): + + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} + VPTESTNM %VMM(3), %VMM(3), %k2 + + subq $-(VEC_SIZE * 4), %rdi + KORTEST %k2, %k3 +# ifdef USE_AS_RAWMEMCHR + jz L(loop) +# else + jnz L(loopend) + subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop) +L(zero_2): + xor %eax, %eax + ret +# endif + +L(loopend): + VPCMPEQ (%rdi), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x1) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero_2) +# endif + + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x2) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero_2) +# endif + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x3) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero_2) +# endif + + /* At this point null [w]char must be in the fourth vector so no + need to check. 
*/ + KMOV %k3, %VRAX + +L(ret_vec_x4): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + cmp %rax, %rdx + jbe L(zero) +# endif + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 5,,5 +L(ret_vec_x3): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + cmp %rax, %rdx + jbe L(zero) +# endif + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +END (MEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S new file mode 100644 index 0000000000..002f8c8489 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S @@ -0,0 +1,8 @@ +# ifndef MEMCHR +# define MEMCHR __memchr_evex512 +# endif + +#include "x86-evex512-vecs.h" +#include "reg-macros.h" + +#include "memchr-evex-base.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S new file mode 100644 index 0000000000..302d3cb055 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S @@ -0,0 +1,7 @@ +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_evex512 +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR + +#include "memchr-evex512.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S new file mode 100644 index 0000000000..78ec4ee5ad --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S @@ -0,0 +1,9 @@ +#ifndef WMEMCHR +# define WMEMCHR __wmemchr_evex512 +#endif + +#define MEMCHR WMEMCHR +#define USE_AS_WMEMCHR 1 + +#define USE_WIDE_CHAR 1 +#include "memchr-evex512.S" -- 2.36.1 ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v4] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-17 3:53 ` [PATCH v4] " Sunil K Pandey @ 2022-10-17 15:46 ` Noah Goldstein 2022-10-17 23:55 ` Sunil Pandey 2022-10-18 0:00 ` [PATCH v5] " Sunil K Pandey 0 siblings, 2 replies; 26+ messages in thread From: Noah Goldstein @ 2022-10-17 15:46 UTC (permalink / raw) To: Sunil K Pandey; +Cc: libc-alpha On Sun, Oct 16, 2022 at 10:53 PM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > Changes from v3: > - Replace VPCMPEQ in loop with VPTESTNM for 4th vector. > - Change first vector max check logic for terminating condition. > - Change page cross logic for terminating condition. > - Remove unnessary check in align_more block. > - Remove unnessary VEC(0) initialization. > - Define USE_WIDE_CHAR in wmemchr. > > Changes from v2: > - Use VEC API > - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2) > > Changes from v1: > - Change vcmp to vcmpeq and vcmpneq. > - Restructure unconditional loop jump logic. > - Improve 4 vector loop logic. > - Fix bug near page boundary. > > This patch implements following evex512 version of string functions. > evex512 version takes up to 30% less cycle as compared to evex, > depending on length and alignment. > > - memchr function using 512 bit vectors. > - rawmemchr function using 512 bit vectors. > - wmemchr function using 512 bit vectors. > > Code size data: > > memchr-evex.o 762 byte > memchr-evex512.o 576 byte (-24%) > > rawmemchr-evex.o 461 byte > rawmemchr-evex512.o 432 byte (-6%) > > wmemchr-evex.o 794 byte > wmemchr-evex512.o 576 byte (-27%) > > Placeholder function, not used by any processor at the moment. 
> --- > sysdeps/x86_64/multiarch/Makefile | 3 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > sysdeps/x86_64/multiarch/memchr-evex-base.S | 301 +++++++++++++++++++ > sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 9 + > 6 files changed, 343 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index df4601c294..e974b1ad97 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -4,6 +4,7 @@ sysdep_routines += \ > memchr-avx2 \ > memchr-avx2-rtm \ > memchr-evex \ > + memchr-evex512 \ > memchr-evex-rtm \ > memchr-sse2 \ > memcmp-avx2-movbe \ > @@ -36,6 +37,7 @@ sysdep_routines += \ > rawmemchr-avx2 \ > rawmemchr-avx2-rtm \ > rawmemchr-evex \ > + rawmemchr-evex512 \ > rawmemchr-evex-rtm \ > rawmemchr-sse2 \ > stpcpy-avx2 \ > @@ -156,6 +158,7 @@ sysdep_routines += \ > wmemchr-avx2 \ > wmemchr-avx2-rtm \ > wmemchr-evex \ > + wmemchr-evex512 \ > wmemchr-evex-rtm \ > wmemchr-sse2 \ > wmemcmp-avx2-movbe \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 00a91123d3..529c0b0ef0 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __memchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __memchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, 
memchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > @@ -337,6 +342,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __rawmemchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __rawmemchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __wmemchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __wmemchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > new file mode 100644 > index 0000000000..4de60655b4 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > @@ -0,0 +1,301 @@ > +/* Placeholder function, not used by any processor at the moment. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. 
> + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* UNUSED. Exists purely as reference implementation. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (4) > + > +# include <sysdep.h> > + > +# ifdef USE_AS_WMEMCHR > +# define CHAR_SIZE 4 > +# define VPBROADCAST vpbroadcastd > +# define VPCMPEQ vpcmpeqd > +# define VPCMPNE vpcmpneqd > +# define VPMINU vpminud > +# define VPTESTNM vptestnmd > +# else > +# define CHAR_SIZE 1 > +# define VPBROADCAST vpbroadcastb > +# define VPCMPEQ vpcmpeqb > +# define VPCMPNE vpcmpneqb > +# define VPMINU vpminub > +# define VPTESTNM vptestnmb > +# endif > + > +# define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > + .section SECTION(.text), "ax", @progbits > +/* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > +ENTRY_P2ALIGN (MEMCHR, 6) > +# ifndef USE_AS_RAWMEMCHR > + /* Check for zero length. */ > + test %RDX_LP, %RDX_LP > + jz L(zero) > + > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %edx, %edx > +# endif > +# endif > + > + /* Broadcast CHAR to VMM(1). */ > + VPBROADCAST %esi, %VMM(1) > + movl %edi, %eax > + andl $(PAGE_SIZE - 1), %eax > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(page_cross) > + > + /* Compare [w]char for null, mask bit will be set for match. */ > + VPCMPEQ (%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRCX > +# ifndef USE_AS_RAWMEMCHR > + mov %rdx, %rax > + bsf %VRCX, %VRAX > + cmp $CHAR_PER_VEC, %rax > + ja L(align_more) Can you rerun benchmarks given that you dramatically changed how small sizes are handled. 
> +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rdi > +# else > + addq %rax, %rdi > +# endif > + cmp %rax, %rdx > + jbe L(zero) > + mov %rdi, %rax > +# else > + bsf %VRCX, %VRAX > + jz L(align_more) > + add %rdi, %rax > +# endif > + ret > + > +# ifndef USE_AS_RAWMEMCHR > +L(zero): > + xorl %eax, %eax > + ret > +# endif > + > + .p2align 5,,5 > +L(page_cross): > + movl %eax, %ecx > + andl $(VEC_SIZE - 1), %ecx > +# ifdef USE_AS_WMEMCHR > + shrl $2, %ecx > +# endif > + xorq %rdi, %rax > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0 > + KMOV %k0, %VRSI > + shr %cl, %VRSI > +# ifndef USE_AS_RAWMEMCHR > + jnz L(page_cross_end) > + movl $CHAR_PER_VEC, %eax > + sub %ecx, %eax > + cmp %rax, %rdx > + ja L(align_more) > +# else > + jz L(align_more) > +# endif > + > +L(page_cross_end): > +# ifndef USE_AS_RAWMEMCHR > + bsf %VRSI, %VRCX > + leaq (%rdi, %rcx, CHAR_SIZE), %rax > + cmp %rcx, %rdx > + jbe L(zero) > +# else > + bsf %VRSI, %VRAX > + add %rdi, %rax > +# endif > + ret > + > +L(ret_vec_x2): > + subq $-VEC_SIZE, %rdi > +L(ret_vec_x1): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + jz L(zero) > + cmp %rax, %rdx > + jbe L(zero) > +# endif > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + add %rdi, %rax > +# endif > + ret > + > + .p2align 5,,10 > +L(align_more): > +# ifndef USE_AS_RAWMEMCHR > + mov %rdi, %rax > +# endif > + subq $-VEC_SIZE, %rdi > + /* Align rdi to VEC_SIZE. */ > + andq $-VEC_SIZE, %rdi > + > +# ifndef USE_AS_RAWMEMCHR > + subq %rdi, %rax > +# ifdef USE_AS_WMEMCHR > + sar $2, %rax > +# endif > + addq %rax, %rdx > +# endif > + > + /* Loop unroll 4 times for 4 vector loop. 
*/ > + VPCMPEQ (%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x1) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x2) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x3) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x4) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > + /* Save pointer to find alignment adjustment. */ > + movq %rdi, %rax > +# endif > + /* Align address to VEC_SIZE * 4 for loop. */ > + andq $-(VEC_SIZE * 4), %rdi > + > + /* Add alignment difference to rdx. */ > +# ifndef USE_AS_RAWMEMCHR > + subq %rdi, %rax > +# ifdef USE_AS_WMEMCHR > + shr $2, %VRAX > +# endif > + addq %rax, %rdx > +# endif > + > + /* 4 vector loop. 
*/ > + .p2align 5,,11 > +L(loop): > + > + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 > + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) > + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > + VPTESTNM %VMM(3), %VMM(3), %k2 > + > + subq $-(VEC_SIZE * 4), %rdi > + KORTEST %k2, %k3 > +# ifdef USE_AS_RAWMEMCHR > + jz L(loop) > +# else > + jnz L(loopend) > + subq $(CHAR_PER_VEC * 4), %rdx > + ja L(loop) > +L(zero_2): > + xor %eax, %eax > + ret > +# endif > + > +L(loopend): > + VPCMPEQ (%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x1) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x2) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x3) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + /* At this point null [w]char must be in the fourth vector so no > + need to check. 
*/ > + KMOV %k3, %VRAX > + > +L(ret_vec_x4): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > + .p2align 5,,5 > +L(ret_vec_x3): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > +END (MEMCHR) > +#endif > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > new file mode 100644 > index 0000000000..002f8c8489 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > @@ -0,0 +1,8 @@ > +# ifndef MEMCHR > +# define MEMCHR __memchr_evex512 > +# endif > + > +#include "x86-evex512-vecs.h" > +#include "reg-macros.h" > + > +#include "memchr-evex-base.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > new file mode 100644 > index 0000000000..302d3cb055 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > @@ -0,0 +1,7 @@ > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_evex512 > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > + > +#include "memchr-evex512.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > new file mode 100644 > index 0000000000..78ec4ee5ad > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > @@ -0,0 +1,9 @@ > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_evex512 > +#endif > + > +#define MEMCHR WMEMCHR > +#define USE_AS_WMEMCHR 1 > + > +#define USE_WIDE_CHAR 1 > +#include "memchr-evex512.S" > -- > 2.36.1 > ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v4] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-17 15:46 ` Noah Goldstein @ 2022-10-17 23:55 ` Sunil Pandey 2022-10-18 0:00 ` [PATCH v5] " Sunil K Pandey 1 sibling, 0 replies; 26+ messages in thread From: Sunil Pandey @ 2022-10-17 23:55 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha [-- Attachment #1: Type: text/plain, Size: 16458 bytes --] SKX data attached for v5. On Mon, Oct 17, 2022 at 8:46 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Sun, Oct 16, 2022 at 10:53 PM Sunil K Pandey via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > Changes from v3: > > - Replace VPCMPEQ in loop with VPTESTNM for 4th vector. > > - Change first vector max check logic for terminating condition. > > - Change page cross logic for terminating condition. > > - Remove unnecessary check in align_more block. > > - Remove unnecessary VEC(0) initialization. > > - Define USE_WIDE_CHAR in wmemchr. > > > > Changes from v2: > > - Use VEC API > > - Replaced long jump L(zero) in L(loopend) with short jump L(zero_2) > > > > Changes from v1: > > - Change vcmp to vcmpeq and vcmpneq. > > - Restructure unconditional loop jump logic. > > - Improve 4 vector loop logic. > > - Fix bug near page boundary. > > > > This patch implements the following evex512 version of string functions. > > evex512 version takes up to 30% fewer cycles as compared to evex, > > depending on length and alignment. > > > > - memchr function using 512 bit vectors. > > - rawmemchr function using 512 bit vectors. > > - wmemchr function using 512 bit vectors. > > > > Code size data: > > > > memchr-evex.o 762 byte > > memchr-evex512.o 576 byte (-24%) > > > > rawmemchr-evex.o 461 byte > > rawmemchr-evex512.o 432 byte (-6%) > > > > wmemchr-evex.o 794 byte > > wmemchr-evex512.o 576 byte (-27%) > > > > Placeholder function, not used by any processor at the moment. 
> > --- > > sysdeps/x86_64/multiarch/Makefile | 3 + > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > > sysdeps/x86_64/multiarch/memchr-evex-base.S | 301 +++++++++++++++++++ > > sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + > > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 9 + > > 6 files changed, 343 insertions(+) > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > index df4601c294..e974b1ad97 100644 > > --- a/sysdeps/x86_64/multiarch/Makefile > > +++ b/sysdeps/x86_64/multiarch/Makefile > > @@ -4,6 +4,7 @@ sysdep_routines += \ > > memchr-avx2 \ > > memchr-avx2-rtm \ > > memchr-evex \ > > + memchr-evex512 \ > > memchr-evex-rtm \ > > memchr-sse2 \ > > memcmp-avx2-movbe \ > > @@ -36,6 +37,7 @@ sysdep_routines += \ > > rawmemchr-avx2 \ > > rawmemchr-avx2-rtm \ > > rawmemchr-evex \ > > + rawmemchr-evex512 \ > > rawmemchr-evex-rtm \ > > rawmemchr-sse2 \ > > stpcpy-avx2 \ > > @@ -156,6 +158,7 @@ sysdep_routines += \ > > wmemchr-avx2 \ > > wmemchr-avx2-rtm \ > > wmemchr-evex \ > > + wmemchr-evex512 \ > > wmemchr-evex-rtm \ > > wmemchr-sse2 \ > > wmemcmp-avx2-movbe \ > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index 00a91123d3..529c0b0ef0 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __memchr_evex) > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE 
(AVX512BW) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __memchr_evex512) > > X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > @@ -337,6 +342,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __rawmemchr_evex) > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE (AVX512BW) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __rawmemchr_evex512) > > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __wmemchr_evex) > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE (AVX512BW) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __wmemchr_evex512) > > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > new file mode 100644 > > index 0000000000..4de60655b4 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > @@ -0,0 +1,301 @@ > > +/* Placeholder function, not used by any processor at the moment. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. 
> > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +/* UNUSED. Exists purely as reference implementation. */ > > + > > +#include <isa-level.h> > > + > > +#if ISA_SHOULD_BUILD (4) > > + > > +# include <sysdep.h> > > + > > +# ifdef USE_AS_WMEMCHR > > +# define CHAR_SIZE 4 > > +# define VPBROADCAST vpbroadcastd > > +# define VPCMPEQ vpcmpeqd > > +# define VPCMPNE vpcmpneqd > > +# define VPMINU vpminud > > +# define VPTESTNM vptestnmd > > +# else > > +# define CHAR_SIZE 1 > > +# define VPBROADCAST vpbroadcastb > > +# define VPCMPEQ vpcmpeqb > > +# define VPCMPNE vpcmpneqb > > +# define VPMINU vpminub > > +# define VPTESTNM vptestnmb > > +# endif > > + > > +# define PAGE_SIZE 4096 > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > + > > + .section SECTION(.text), "ax", @progbits > > +/* Aligning entry point to 64 byte, provides better performance for > > + one vector length string. */ > > +ENTRY_P2ALIGN (MEMCHR, 6) > > +# ifndef USE_AS_RAWMEMCHR > > + /* Check for zero length. */ > > + test %RDX_LP, %RDX_LP > > + jz L(zero) > > + > > +# ifdef __ILP32__ > > + /* Clear the upper 32 bits. */ > > + movl %edx, %edx > > +# endif > > +# endif > > + > > + /* Broadcast CHAR to VMM(1). */ > > + VPBROADCAST %esi, %VMM(1) > > + movl %edi, %eax > > + andl $(PAGE_SIZE - 1), %eax > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > + ja L(page_cross) > > + > > + /* Compare [w]char for null, mask bit will be set for match. 
*/ > > + VPCMPEQ (%rdi), %VMM(1), %k0 > > + > > + KMOV %k0, %VRCX > > +# ifndef USE_AS_RAWMEMCHR > > + mov %rdx, %rax > > + bsf %VRCX, %VRAX > > + cmp $CHAR_PER_VEC, %rax > > + ja L(align_more) > > Can you rerun benchmarks given that you dramatically changed how small > sizes are handled. > > > > +# ifdef USE_AS_WMEMCHR > > + leaq (%rdi, %rax, CHAR_SIZE), %rdi > > +# else > > + addq %rax, %rdi > > +# endif > > + cmp %rax, %rdx > > + jbe L(zero) > > + mov %rdi, %rax > > +# else > > + bsf %VRCX, %VRAX > > + jz L(align_more) > > + add %rdi, %rax > > +# endif > > + ret > > + > > +# ifndef USE_AS_RAWMEMCHR > > +L(zero): > > + xorl %eax, %eax > > + ret > > +# endif > > + > > + .p2align 5,,5 > > +L(page_cross): > > + movl %eax, %ecx > > + andl $(VEC_SIZE - 1), %ecx > > +# ifdef USE_AS_WMEMCHR > > + shrl $2, %ecx > > +# endif > > + xorq %rdi, %rax > > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0 > > + KMOV %k0, %VRSI > > + shr %cl, %VRSI > > +# ifndef USE_AS_RAWMEMCHR > > + jnz L(page_cross_end) > > + movl $CHAR_PER_VEC, %eax > > + sub %ecx, %eax > > + cmp %rax, %rdx > > + ja L(align_more) > > +# else > > + jz L(align_more) > > +# endif > > + > > +L(page_cross_end): > > +# ifndef USE_AS_RAWMEMCHR > > + bsf %VRSI, %VRCX > > + leaq (%rdi, %rcx, CHAR_SIZE), %rax > > + cmp %rcx, %rdx > > + jbe L(zero) > > +# else > > + bsf %VRSI, %VRAX > > + add %rdi, %rax > > +# endif > > + ret > > + > > +L(ret_vec_x2): > > + subq $-VEC_SIZE, %rdi > > +L(ret_vec_x1): > > + bsf %VRAX, %VRAX > > +# ifndef USE_AS_RAWMEMCHR > > + jz L(zero) > > + cmp %rax, %rdx > > + jbe L(zero) > > +# endif > > +# ifdef USE_AS_WMEMCHR > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > +# else > > + add %rdi, %rax > > +# endif > > + ret > > + > > + .p2align 5,,10 > > +L(align_more): > > +# ifndef USE_AS_RAWMEMCHR > > + mov %rdi, %rax > > +# endif > > + subq $-VEC_SIZE, %rdi > > + /* Align rdi to VEC_SIZE. 
*/ > > + andq $-VEC_SIZE, %rdi > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq %rdi, %rax > > +# ifdef USE_AS_WMEMCHR > > + sar $2, %rax > > +# endif > > + addq %rax, %rdx > > +# endif > > + > > + /* Loop unroll 4 times for 4 vector loop. */ > > + VPCMPEQ (%rdi), %VMM(1), %k0 > > + > > + KMOV %k0, %VRAX > > + test %VRAX, %VRAX > > + jnz L(ret_vec_x1) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 > > + > > + KMOV %k0, %VRAX > > + test %VRAX, %VRAX > > + jnz L(ret_vec_x2) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 > > + > > + KMOV %k0, %VRAX > > + test %VRAX, %VRAX > > + jnz L(ret_vec_x3) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 > > + > > + KMOV %k0, %VRAX > > + test %VRAX, %VRAX > > + jnz L(ret_vec_x4) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > + /* Save pointer to find alignment adjustment. */ > > + movq %rdi, %rax > > +# endif > > + /* Align address to VEC_SIZE * 4 for loop. */ > > + andq $-(VEC_SIZE * 4), %rdi > > + > > + /* Add alignment difference to rdx. */ > > +# ifndef USE_AS_RAWMEMCHR > > + subq %rdi, %rax > > +# ifdef USE_AS_WMEMCHR > > + shr $2, %VRAX > > +# endif > > + addq %rax, %rdx > > +# endif > > + > > + /* 4 vector loop. 
*/ > > + .p2align 5,,11 > > +L(loop): > > + > > + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 > > + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > > + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) > > + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 > > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > > + VPTESTNM %VMM(3), %VMM(3), %k2 > > + > > + subq $-(VEC_SIZE * 4), %rdi > > + KORTEST %k2, %k3 > > +# ifdef USE_AS_RAWMEMCHR > > + jz L(loop) > > +# else > > + jnz L(loopend) > > + subq $(CHAR_PER_VEC * 4), %rdx > > + ja L(loop) > > +L(zero_2): > > + xor %eax, %eax > > + ret > > +# endif > > + > > +L(loopend): > > + VPCMPEQ (%rdi), %VMM(1), %k1 > > + KMOV %k1, %VRAX > > + test %VRAX, %VRAX > > + jnz L(ret_vec_x1) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero_2) > > +# endif > > + > > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 > > + KMOV %k1, %VRAX > > + test %VRAX, %VRAX > > + jnz L(ret_vec_x2) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero_2) > > +# endif > > + > > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 > > + KMOV %k1, %VRAX > > + test %VRAX, %VRAX > > + jnz L(ret_vec_x3) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero_2) > > +# endif > > + > > + /* At this point null [w]char must be in the fourth vector so no > > + need to check. 
*/ > > + KMOV %k3, %VRAX > > + > > +L(ret_vec_x4): > > + bsf %VRAX, %VRAX > > +# ifndef USE_AS_RAWMEMCHR > > + cmp %rax, %rdx > > + jbe L(zero) > > +# endif > > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > + .p2align 5,,5 > > +L(ret_vec_x3): > > + bsf %VRAX, %VRAX > > +# ifndef USE_AS_RAWMEMCHR > > + cmp %rax, %rdx > > + jbe L(zero) > > +# endif > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > +END (MEMCHR) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > > new file mode 100644 > > index 0000000000..002f8c8489 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > > @@ -0,0 +1,8 @@ > > +# ifndef MEMCHR > > +# define MEMCHR __memchr_evex512 > > +# endif > > + > > +#include "x86-evex512-vecs.h" > > +#include "reg-macros.h" > > + > > +#include "memchr-evex-base.S" > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > new file mode 100644 > > index 0000000000..302d3cb055 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > @@ -0,0 +1,7 @@ > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_evex512 > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > + > > +#include "memchr-evex512.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > new file mode 100644 > > index 0000000000..78ec4ee5ad > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > @@ -0,0 +1,9 @@ > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_evex512 > > +#endif > > + > > +#define MEMCHR WMEMCHR > > +#define USE_AS_WMEMCHR 1 > > + > > +#define USE_WIDE_CHAR 1 > > +#include "memchr-evex512.S" > > -- > > 2.36.1 > > [-- Attachment #2: rawmemchr_skx.txt --] [-- Type: text/plain, Size: 6674 bytes --] Function: rawmemchr Variant: __rawmemchr_evex __rawmemchr_evex512 
======================================================================================================================== length=32, alignment=0: 7.06 5.09 ( 27.95%) length=64, alignment=1: 8.04 7.11 ( 11.65%) length=32, alignment=0: 7.04 5.11 ( 27.32%) length=64, alignment=1: 8.04 7.10 ( 11.62%) length=64, alignment=0: 8.13 7.10 ( 12.65%) length=64, alignment=2: 5.22 4.79 ( 8.35%) length=64, alignment=0: 5.21 4.85 ( 6.92%) length=64, alignment=2: 5.42 4.80 ( 11.54%) length=128, alignment=0: 5.85 7.56 (-29.13%) length=64, alignment=3: 5.22 4.78 ( 8.29%) length=128, alignment=0: 5.85 6.81 (-16.34%) length=64, alignment=3: 5.22 4.79 ( 8.20%) length=256, alignment=0: 10.72 8.17 ( 23.73%) length=64, alignment=4: 5.23 4.80 ( 8.31%) length=256, alignment=0: 11.30 8.22 ( 27.25%) length=64, alignment=4: 5.22 4.79 ( 8.16%) length=512, alignment=0: 13.88 12.80 ( 7.78%) length=64, alignment=5: 5.22 4.78 ( 8.27%) length=512, alignment=0: 13.95 13.31 ( 4.52%) length=64, alignment=5: 5.22 4.78 ( 8.29%) length=1024, alignment=0: 21.21 16.16 ( 23.83%) length=64, alignment=6: 5.21 4.79 ( 8.08%) length=1024, alignment=0: 21.14 16.21 ( 23.32%) length=64, alignment=6: 5.22 4.79 ( 8.16%) length=1, alignment=0: 3.26 3.44 ( -5.55%) length=1, alignment=0: 3.26 3.44 ( -5.75%) length=2, alignment=0: 3.26 3.46 ( -6.19%) length=2, alignment=0: 3.26 3.46 ( -6.36%) length=3, alignment=0: 3.24 3.45 ( -6.46%) length=3, alignment=0: 3.24 3.46 ( -6.78%) length=4, alignment=0: 3.25 3.57 ( -9.58%) length=4, alignment=0: 3.25 3.54 ( -8.79%) length=5, alignment=0: 3.24 3.47 ( -6.91%) length=5, alignment=0: 3.24 3.46 ( -6.64%) length=6, alignment=0: 3.26 3.45 ( -6.03%) length=6, alignment=0: 3.25 3.46 ( -6.41%) length=7, alignment=0: 3.24 3.46 ( -6.59%) length=7, alignment=0: 3.24 3.46 ( -6.53%) length=8, alignment=0: 3.26 3.45 ( -6.00%) length=8, alignment=0: 3.25 3.45 ( -6.12%) length=9, alignment=0: 3.24 3.46 ( -6.61%) length=9, alignment=0: 3.24 3.45 ( -6.48%) length=10, alignment=0: 3.26 3.46 ( 
-6.21%) length=10, alignment=0: 3.26 3.46 ( -5.96%) length=11, alignment=0: 3.24 3.46 ( -6.72%) length=11, alignment=0: 3.24 3.46 ( -6.77%) length=12, alignment=0: 3.26 3.46 ( -6.28%) length=12, alignment=0: 3.26 3.63 (-11.48%) length=13, alignment=0: 3.25 3.46 ( -6.36%) length=13, alignment=0: 3.25 3.46 ( -6.19%) length=14, alignment=0: 3.26 3.45 ( -5.78%) length=14, alignment=0: 3.45 3.55 ( -2.87%) length=15, alignment=0: 3.24 3.46 ( -6.61%) length=15, alignment=0: 3.24 3.46 ( -6.82%) length=16, alignment=0: 3.25 3.46 ( -6.20%) length=16, alignment=0: 3.25 3.47 ( -6.64%) length=17, alignment=0: 3.26 3.47 ( -6.22%) length=17, alignment=0: 3.26 3.46 ( -6.17%) length=18, alignment=0: 3.26 3.47 ( -6.40%) length=18, alignment=0: 3.27 3.46 ( -5.89%) length=19, alignment=0: 3.26 3.59 (-10.26%) length=19, alignment=0: 3.24 3.58 (-10.27%) length=20, alignment=0: 3.26 3.46 ( -6.27%) length=20, alignment=0: 3.24 3.46 ( -6.60%) length=21, alignment=0: 3.26 3.46 ( -6.33%) length=21, alignment=0: 3.24 3.45 ( -6.45%) length=22, alignment=0: 3.43 3.87 (-12.78%) length=22, alignment=0: 3.44 3.65 ( -6.08%) length=23, alignment=0: 3.44 3.66 ( -6.19%) length=23, alignment=0: 3.41 3.47 ( -1.64%) length=24, alignment=0: 3.25 3.44 ( -5.84%) length=24, alignment=0: 3.25 3.46 ( -6.22%) length=25, alignment=0: 3.25 3.45 ( -5.90%) length=25, alignment=0: 3.25 3.45 ( -6.17%) length=26, alignment=0: 3.25 3.45 ( -5.91%) length=26, alignment=0: 3.26 3.46 ( -6.15%) length=27, alignment=0: 3.25 3.44 ( -5.88%) length=27, alignment=0: 3.26 3.46 ( -6.18%) length=28, alignment=0: 3.25 3.44 ( -5.77%) length=28, alignment=0: 3.25 3.45 ( -6.17%) length=29, alignment=0: 3.25 3.45 ( -6.09%) length=29, alignment=0: 3.25 3.45 ( -6.16%) length=30, alignment=0: 3.25 3.44 ( -5.79%) length=30, alignment=0: 3.25 3.45 ( -6.14%) length=31, alignment=0: 3.26 3.46 ( -6.17%) length=31, alignment=0: 3.42 3.66 ( -7.02%) [-- Attachment #3: wmemchr_skx.txt --] [-- Type: text/plain, Size: 23018 bytes --] Function: 
wmemchr Variant: __wmemchr_evex __wmemchr_evex512 ======================================================================================================================== len=256, align=1, pos=64: 18.03 11.10 ( 38.45%) len=256, align=1, pos=64: 18.34 11.09 ( 39.54%) len=256, align=2, pos=64: 16.60 10.44 ( 37.14%) len=256, align=2, pos=64: 16.50 10.77 ( 34.73%) len=256, align=3, pos=64: 16.49 10.45 ( 36.64%) len=256, align=3, pos=64: 16.49 11.10 ( 32.69%) len=256, align=4, pos=64: 17.00 12.06 ( 29.10%) len=256, align=4, pos=64: 16.49 10.44 ( 36.67%) len=256, align=5, pos=64: 16.47 12.01 ( 27.09%) len=256, align=5, pos=64: 16.49 11.11 ( 32.63%) len=256, align=6, pos=64: 16.95 10.47 ( 38.23%) len=256, align=6, pos=64: 16.51 10.46 ( 36.65%) len=256, align=7, pos=64: 16.86 11.00 ( 34.75%) len=256, align=7, pos=64: 16.33 12.98 ( 20.53%) len=192, align=1, pos=32: 9.97 7.82 ( 21.52%) len=192, align=1, pos=32: 10.00 7.76 ( 22.34%) len=256, align=1, pos=32: 10.01 7.79 ( 22.18%) len=256, align=1, pos=32: 9.95 9.25 ( 7.01%) len=512, align=1, pos=32: 10.26 7.76 ( 24.32%) len=512, align=1, pos=32: 9.98 9.06 ( 9.19%) len=192, align=2, pos=64: 16.48 10.46 ( 36.56%) len=192, align=2, pos=64: 16.46 10.77 ( 34.56%) len=256, align=2, pos=64: 17.06 10.67 ( 37.45%) len=256, align=2, pos=64: 16.49 11.11 ( 32.60%) len=512, align=2, pos=64: 16.47 11.15 ( 32.31%) len=512, align=2, pos=64: 16.48 10.49 ( 36.35%) len=192, align=3, pos=96: 18.03 17.15 ( 4.88%) len=192, align=3, pos=96: 17.79 19.57 (-10.06%) len=256, align=3, pos=96: 18.07 17.16 ( 5.05%) len=256, align=3, pos=96: 17.77 17.13 ( 3.58%) len=512, align=3, pos=96: 17.98 17.13 ( 4.70%) len=512, align=3, pos=96: 18.10 18.35 ( -1.43%) len=192, align=4, pos=128: 19.92 17.58 ( 11.72%) len=192, align=4, pos=128: 19.49 17.33 ( 11.07%) len=256, align=4, pos=128: 19.14 17.78 ( 7.06%) len=256, align=4, pos=128: 19.60 17.98 ( 8.25%) len=512, align=4, pos=128: 20.13 17.15 ( 14.79%) len=512, align=4, pos=128: 19.27 18.13 ( 5.96%) len=192, 
align=5, pos=160: 20.39 19.81 ( 2.84%) len=192, align=5, pos=160: 22.05 19.81 ( 10.14%) len=256, align=5, pos=160: 20.36 20.10 ( 1.31%) len=256, align=5, pos=160: 20.36 20.48 ( -0.57%) len=512, align=5, pos=160: 20.39 19.94 ( 2.21%) len=512, align=5, pos=160: 20.46 21.50 ( -5.07%) len=192, align=6, pos=192: 22.51 19.65 ( 12.69%) len=192, align=6, pos=192: 21.34 19.51 ( 8.61%) len=256, align=6, pos=192: 21.69 19.70 ( 9.15%) len=256, align=6, pos=192: 22.04 19.59 ( 11.10%) len=512, align=6, pos=192: 22.15 19.21 ( 13.27%) len=512, align=6, pos=192: 22.13 19.13 ( 13.52%) len=192, align=7, pos=224: 21.43 21.11 ( 1.49%) len=192, align=7, pos=224: 21.05 19.66 ( 6.58%) len=256, align=7, pos=224: 24.41 21.88 ( 10.38%) len=256, align=7, pos=224: 23.60 21.82 ( 7.58%) len=512, align=7, pos=224: 23.53 21.83 ( 7.23%) len=512, align=7, pos=224: 23.21 21.80 ( 6.06%) len=2, align=0, pos=1: 5.19 5.82 (-12.09%) len=2, align=0, pos=1: 4.67 4.97 ( -6.45%) len=2, align=1, pos=1: 4.67 5.14 (-10.03%) len=2, align=1, pos=1: 5.00 5.10 ( -2.05%) len=0, align=0, pos=1: 4.67 4.00 ( 14.28%) len=0, align=0, pos=1: 4.71 4.00 ( 15.05%) len=0, align=1, pos=1: 4.67 4.00 ( 14.28%) len=0, align=1, pos=1: 4.71 4.00 ( 15.04%) len=3, align=0, pos=2: 5.33 5.38 ( -0.82%) len=3, align=0, pos=2: 5.33 4.70 ( 11.82%) len=3, align=2, pos=2: 5.07 4.67 ( 7.92%) len=3, align=2, pos=2: 5.08 5.33 ( -5.01%) len=1, align=0, pos=2: 5.00 5.24 ( -4.81%) len=1, align=0, pos=2: 4.67 4.97 ( -6.47%) len=1, align=2, pos=2: 5.00 4.98 ( 0.42%) len=1, align=2, pos=2: 4.67 5.03 ( -7.84%) len=4, align=0, pos=3: 5.00 5.38 ( -7.51%) len=4, align=0, pos=3: 5.05 5.09 ( -0.79%) len=4, align=3, pos=3: 5.00 5.00 ( -0.00%) len=4, align=3, pos=3: 5.00 4.67 ( 6.66%) len=2, align=0, pos=3: 5.33 4.67 ( 12.49%) len=2, align=0, pos=3: 5.33 5.12 ( 4.07%) len=2, align=3, pos=3: 4.67 4.81 ( -2.96%) len=2, align=3, pos=3: 4.84 4.67 ( 3.51%) len=5, align=0, pos=4: 5.33 4.67 ( 12.49%) len=5, align=0, pos=4: 5.33 4.71 ( 11.71%) len=5, align=4, pos=4: 
5.46 5.09 ( 6.80%) len=5, align=4, pos=4: 5.00 4.67 ( 6.66%) len=3, align=0, pos=4: 5.04 5.33 ( -5.91%) len=3, align=0, pos=4: 5.10 5.33 ( -4.67%) len=3, align=4, pos=4: 5.00 4.67 ( 6.66%) len=3, align=4, pos=4: 5.00 4.71 ( 5.82%) len=6, align=0, pos=5: 5.48 5.12 ( 6.72%) len=6, align=0, pos=5: 5.00 4.67 ( 6.66%) len=6, align=5, pos=5: 5.20 5.10 ( 1.78%) len=6, align=5, pos=5: 4.67 4.67 ( -0.01%) len=4, align=0, pos=5: 4.96 5.15 ( -3.87%) len=4, align=0, pos=5: 5.00 5.09 ( -1.79%) len=4, align=5, pos=5: 5.00 4.67 ( 6.66%) len=4, align=5, pos=5: 4.67 5.17 (-10.69%) len=7, align=0, pos=6: 5.48 5.10 ( 7.00%) len=7, align=0, pos=6: 5.00 5.00 ( -0.00%) len=7, align=6, pos=6: 5.31 5.75 ( -8.21%) len=7, align=6, pos=6: 5.00 5.16 ( -3.11%) len=5, align=0, pos=6: 4.85 4.67 ( 3.70%) len=5, align=0, pos=6: 5.33 4.67 ( 12.49%) len=5, align=6, pos=6: 4.85 5.10 ( -5.22%) len=5, align=6, pos=6: 5.00 5.33 ( -6.67%) len=8, align=0, pos=7: 4.85 5.10 ( -5.20%) len=8, align=0, pos=7: 5.00 5.38 ( -7.50%) len=8, align=7, pos=7: 5.10 5.76 (-12.99%) len=8, align=7, pos=7: 5.41 5.12 ( 5.26%) len=6, align=0, pos=7: 5.33 4.67 ( 12.49%) len=6, align=0, pos=7: 5.56 5.81 ( -4.45%) len=6, align=7, pos=7: 5.00 5.33 ( -6.67%) len=6, align=7, pos=7: 4.84 5.12 ( -5.93%) len=9, align=0, pos=8: 8.71 4.67 ( 46.41%) len=9, align=0, pos=8: 8.89 4.67 ( 47.53%) len=9, align=8, pos=8: 9.06 5.09 ( 43.79%) len=9, align=8, pos=8: 8.69 5.09 ( 41.46%) len=7, align=0, pos=8: 5.48 5.11 ( 6.88%) len=7, align=0, pos=8: 5.33 5.30 ( 0.69%) len=7, align=8, pos=8: 5.12 5.11 ( 0.26%) len=7, align=8, pos=8: 5.33 5.15 ( 3.53%) len=10, align=0, pos=9: 9.33 5.22 ( 44.11%) len=10, align=0, pos=9: 9.37 4.67 ( 50.22%) len=10, align=9, pos=9: 9.11 5.10 ( 44.00%) len=10, align=9, pos=9: 8.72 5.13 ( 41.16%) len=8, align=0, pos=9: 5.39 5.97 (-10.70%) len=8, align=0, pos=9: 5.09 5.24 ( -3.03%) len=8, align=9, pos=9: 4.67 4.67 ( -0.00%) len=8, align=9, pos=9: 5.48 5.11 ( 6.69%) len=11, align=0, pos=10: 8.69 5.77 ( 33.69%) len=11, 
align=0, pos=10: 8.68 5.11 ( 41.19%) len=11, align=10, pos=10: 8.67 5.71 ( 34.07%) len=11, align=10, pos=10: 8.86 5.77 ( 34.85%) len=9, align=0, pos=10: 8.16 5.75 ( 29.57%) len=9, align=0, pos=10: 8.54 5.11 ( 40.13%) len=9, align=10, pos=10: 8.18 5.10 ( 37.70%) len=9, align=10, pos=10: 8.00 5.00 ( 37.50%) len=12, align=0, pos=11: 8.73 5.33 ( 38.90%) len=12, align=0, pos=11: 8.68 5.10 ( 41.23%) len=12, align=11, pos=11: 8.68 5.38 ( 37.98%) len=12, align=11, pos=11: 9.37 4.67 ( 50.21%) len=10, align=0, pos=11: 8.18 5.77 ( 29.42%) len=10, align=0, pos=11: 8.64 5.11 ( 40.78%) len=10, align=11, pos=11: 8.20 5.09 ( 37.92%) len=10, align=11, pos=11: 8.67 4.99 ( 42.38%) len=13, align=0, pos=12: 8.70 5.12 ( 41.23%) len=13, align=0, pos=12: 8.69 5.24 ( 39.78%) len=13, align=12, pos=12: 8.69 5.68 ( 34.68%) len=13, align=12, pos=12: 8.74 5.11 ( 41.47%) len=11, align=0, pos=12: 8.19 5.25 ( 35.95%) len=11, align=0, pos=12: 8.17 5.33 ( 34.72%) len=11, align=12, pos=12: 8.07 5.11 ( 36.71%) len=11, align=12, pos=12: 8.20 5.11 ( 37.71%) len=14, align=0, pos=13: 8.71 5.12 ( 41.29%) len=14, align=0, pos=13: 8.68 5.10 ( 41.30%) len=14, align=13, pos=13: 8.88 5.10 ( 42.56%) len=14, align=13, pos=13: 8.70 5.32 ( 38.81%) len=12, align=0, pos=13: 8.50 5.11 ( 39.83%) len=12, align=0, pos=13: 7.92 5.74 ( 27.51%) len=12, align=13, pos=13: 8.67 5.15 ( 40.62%) len=12, align=13, pos=13: 9.23 5.77 ( 37.53%) len=15, align=0, pos=14: 8.90 5.77 ( 35.11%) len=15, align=0, pos=14: 8.68 5.33 ( 38.57%) len=15, align=14, pos=14: 9.36 5.11 ( 45.40%) len=15, align=14, pos=14: 8.71 5.10 ( 41.44%) len=13, align=0, pos=14: 8.19 5.21 ( 36.41%) len=13, align=0, pos=14: 8.19 4.67 ( 43.03%) len=13, align=14, pos=14: 8.71 5.75 ( 33.95%) len=13, align=14, pos=14: 8.79 5.33 ( 39.33%) len=16, align=0, pos=15: 8.68 5.15 ( 40.63%) len=16, align=0, pos=15: 8.68 5.09 ( 41.28%) len=16, align=15, pos=15: 8.74 5.11 ( 41.61%) len=16, align=15, pos=15: 8.69 5.57 ( 35.88%) len=14, align=0, pos=15: 8.20 5.12 ( 37.62%) len=14, 
align=0, pos=15: 8.20 5.13 ( 37.48%) len=14, align=15, pos=15: 8.67 5.11 ( 41.03%) len=14, align=15, pos=15: 8.94 5.11 ( 42.86%) len=17, align=0, pos=16: 8.83 7.08 ( 19.73%) len=17, align=0, pos=16: 8.70 7.13 ( 18.06%) len=17, align=16, pos=16: 8.70 7.17 ( 17.59%) len=17, align=16, pos=16: 8.67 7.16 ( 17.38%) len=15, align=0, pos=16: 8.19 5.13 ( 37.41%) len=15, align=0, pos=16: 8.32 5.11 ( 38.60%) len=15, align=16, pos=16: 8.20 5.11 ( 37.60%) len=15, align=16, pos=16: 8.34 5.09 ( 38.96%) len=18, align=0, pos=17: 8.69 7.31 ( 15.83%) len=18, align=0, pos=17: 8.68 9.53 ( -9.70%) len=18, align=17, pos=17: 9.23 7.49 ( 18.80%) len=18, align=17, pos=17: 9.16 8.91 ( 2.69%) len=16, align=0, pos=17: 8.68 5.12 ( 40.96%) len=16, align=0, pos=17: 9.18 5.11 ( 44.38%) len=16, align=17, pos=17: 9.32 5.09 ( 45.44%) len=16, align=17, pos=17: 8.72 5.25 ( 39.80%) len=19, align=0, pos=18: 8.67 7.94 ( 8.37%) len=19, align=0, pos=18: 8.67 7.58 ( 12.56%) len=19, align=18, pos=18: 8.69 7.27 ( 16.36%) len=19, align=18, pos=18: 8.71 7.52 ( 13.73%) len=17, align=0, pos=18: 8.67 9.17 ( -5.79%) len=17, align=0, pos=18: 8.67 7.75 ( 10.57%) len=17, align=18, pos=18: 8.71 8.40 ( 3.51%) len=17, align=18, pos=18: 8.71 8.42 ( 3.34%) len=20, align=0, pos=19: 8.74 7.08 ( 18.98%) len=20, align=0, pos=19: 9.21 7.49 ( 18.68%) len=20, align=19, pos=19: 8.67 7.17 ( 17.29%) len=20, align=19, pos=19: 8.74 7.11 ( 18.73%) len=18, align=0, pos=19: 8.71 7.44 ( 14.58%) len=18, align=0, pos=19: 8.70 8.42 ( 3.20%) len=18, align=19, pos=19: 8.69 7.42 ( 14.63%) len=18, align=19, pos=19: 8.71 7.96 ( 8.55%) len=21, align=0, pos=20: 8.73 7.09 ( 18.75%) len=21, align=0, pos=20: 8.68 7.14 ( 17.73%) len=21, align=20, pos=20: 11.22 8.41 ( 25.03%) len=21, align=20, pos=20: 10.89 8.41 ( 22.79%) len=19, align=0, pos=20: 8.70 7.62 ( 12.47%) len=19, align=0, pos=20: 8.67 7.49 ( 13.69%) len=19, align=20, pos=20: 8.69 8.11 ( 6.74%) len=19, align=20, pos=20: 8.67 7.72 ( 10.93%) len=22, align=0, pos=21: 8.71 8.40 ( 3.56%) len=22, 
align=0, pos=21: 8.72 7.15 ( 17.97%) len=22, align=21, pos=21: 11.19 7.08 ( 36.73%) len=22, align=21, pos=21: 10.84 7.27 ( 32.94%) len=20, align=0, pos=21: 8.71 8.31 ( 4.49%) len=20, align=0, pos=21: 8.71 8.45 ( 3.06%) len=20, align=21, pos=21: 10.67 7.44 ( 30.34%) len=20, align=21, pos=21: 11.31 8.37 ( 26.00%) len=23, align=0, pos=22: 8.72 8.41 ( 3.60%) len=23, align=0, pos=22: 8.71 8.41 ( 3.44%) len=23, align=22, pos=22: 10.62 7.16 ( 32.59%) len=23, align=22, pos=22: 10.63 8.04 ( 24.32%) len=21, align=0, pos=22: 8.71 7.74 ( 11.07%) len=21, align=0, pos=22: 8.67 7.97 ( 8.07%) len=21, align=22, pos=22: 11.12 9.05 ( 18.57%) len=21, align=22, pos=22: 10.67 8.42 ( 21.11%) len=24, align=0, pos=23: 8.71 7.27 ( 16.50%) len=24, align=0, pos=23: 9.04 8.59 ( 4.98%) len=24, align=23, pos=23: 10.65 8.40 ( 21.08%) len=24, align=23, pos=23: 10.64 7.10 ( 33.26%) len=22, align=0, pos=23: 8.85 8.41 ( 5.01%) len=22, align=0, pos=23: 8.70 8.16 ( 6.24%) len=22, align=23, pos=23: 10.68 8.42 ( 21.17%) len=22, align=23, pos=23: 10.65 8.41 ( 21.00%) len=25, align=0, pos=24: 10.87 8.41 ( 22.67%) len=25, align=0, pos=24: 10.65 7.08 ( 33.51%) len=25, align=24, pos=24: 10.85 7.79 ( 28.21%) len=25, align=24, pos=24: 10.68 7.77 ( 27.24%) len=23, align=0, pos=24: 8.72 7.73 ( 11.34%) len=23, align=0, pos=24: 8.71 8.46 ( 2.93%) len=23, align=24, pos=24: 9.05 8.42 ( 6.99%) len=23, align=24, pos=24: 8.69 8.42 ( 3.03%) len=26, align=0, pos=25: 10.73 8.42 ( 21.48%) len=26, align=0, pos=25: 10.89 7.10 ( 34.77%) len=26, align=25, pos=25: 10.91 7.79 ( 28.60%) len=26, align=25, pos=25: 10.64 10.42 ( 2.05%) len=24, align=0, pos=25: 9.00 7.76 ( 13.70%) len=24, align=0, pos=25: 8.68 8.96 ( -3.18%) len=24, align=25, pos=25: 10.63 9.33 ( 12.27%) len=24, align=25, pos=25: 10.62 8.63 ( 18.77%) len=27, align=0, pos=26: 10.72 8.41 ( 21.54%) len=27, align=0, pos=26: 10.68 7.11 ( 33.37%) len=27, align=26, pos=26: 10.67 7.77 ( 27.14%) len=27, align=26, pos=26: 11.33 7.78 ( 31.29%) len=25, align=0, pos=26: 10.67 7.75 
( 27.32%) len=25, align=0, pos=26: 11.05 8.45 ( 23.54%) len=25, align=26, pos=26: 10.67 9.09 ( 14.79%) len=25, align=26, pos=26: 10.67 9.10 ( 14.69%) len=28, align=0, pos=27: 11.26 7.12 ( 36.77%) len=28, align=0, pos=27: 10.61 7.15 ( 32.62%) len=28, align=27, pos=27: 11.27 7.82 ( 30.58%) len=28, align=27, pos=27: 10.63 7.83 ( 26.36%) len=26, align=0, pos=27: 10.63 7.89 ( 25.73%) len=26, align=0, pos=27: 10.62 7.48 ( 29.61%) len=26, align=27, pos=27: 10.62 9.12 ( 14.11%) len=26, align=27, pos=27: 10.66 8.42 ( 21.05%) len=29, align=0, pos=28: 10.85 7.09 ( 34.62%) len=29, align=0, pos=28: 11.10 8.42 ( 24.16%) len=29, align=28, pos=28: 10.61 8.17 ( 23.01%) len=29, align=28, pos=28: 10.54 7.76 ( 26.40%) len=27, align=0, pos=28: 10.95 8.40 ( 23.26%) len=27, align=0, pos=28: 10.81 7.43 ( 31.24%) len=27, align=28, pos=28: 10.67 9.15 ( 14.28%) len=27, align=28, pos=28: 10.60 8.84 ( 16.67%) len=30, align=0, pos=29: 10.83 7.08 ( 34.57%) len=30, align=0, pos=29: 11.10 7.08 ( 36.19%) len=30, align=29, pos=29: 10.62 8.16 ( 23.18%) len=30, align=29, pos=29: 10.66 7.77 ( 27.06%) len=28, align=0, pos=29: 10.67 7.44 ( 30.27%) len=28, align=0, pos=29: 11.30 7.67 ( 32.14%) len=28, align=29, pos=29: 11.82 8.49 ( 28.20%) len=28, align=29, pos=29: 11.28 9.14 ( 19.00%) len=31, align=0, pos=30: 11.19 7.08 ( 36.67%) len=31, align=0, pos=30: 10.60 7.15 ( 32.50%) len=31, align=30, pos=30: 10.66 7.77 ( 27.15%) len=31, align=30, pos=30: 10.68 7.79 ( 27.10%) len=29, align=0, pos=30: 10.60 8.26 ( 22.09%) len=29, align=0, pos=30: 10.62 7.44 ( 29.96%) len=29, align=30, pos=30: 11.14 9.09 ( 18.38%) len=29, align=30, pos=30: 11.04 8.98 ( 18.68%) len=32, align=0, pos=31: 10.64 7.12 ( 33.10%) len=32, align=0, pos=31: 10.60 8.19 ( 22.76%) len=32, align=31, pos=31: 10.66 7.79 ( 26.97%) len=32, align=31, pos=31: 10.64 7.81 ( 26.62%) len=30, align=0, pos=31: 10.77 7.43 ( 31.00%) len=30, align=0, pos=31: 11.07 7.42 ( 32.96%) len=30, align=31, pos=31: 11.04 8.98 ( 18.64%) len=30, align=31, pos=31: 12.28 9.61 
( 21.74%) [-- Attachment #4: memchr_skx.txt --] [-- Type: text/plain, Size: 53015 bytes --] Function: memchr Variant: __memchr_evex __memchr_evex512 ======================================================================================================================== len=2048, align=0, pos=32: 7.76 6.67 ( 14.10%) len=256, align=1, pos=64: 9.07 7.05 ( 22.28%) len=2048, align=0, pos=32: 7.35 5.10 ( 30.67%) len=256, align=1, pos=64: 8.09 6.65 ( 17.83%) len=256, align=4081, pos=64: 8.68 7.77 ( 10.46%) len=2048, align=0, pos=64: 8.04 6.52 ( 18.94%) len=256, align=2, pos=64: 8.05 6.67 ( 17.17%) len=2048, align=0, pos=64: 8.01 6.68 ( 16.62%) len=256, align=2, pos=64: 9.35 6.82 ( 27.03%) len=256, align=4081, pos=64: 9.21 7.66 ( 16.83%) len=2048, align=0, pos=128: 9.49 7.76 ( 18.22%) len=256, align=3, pos=64: 8.64 6.66 ( 22.92%) len=2048, align=0, pos=128: 9.32 7.85 ( 15.81%) len=256, align=3, pos=64: 8.22 6.67 ( 18.94%) len=256, align=4081, pos=64: 8.65 7.69 ( 11.13%) len=2048, align=0, pos=256: 15.02 9.84 ( 34.51%) len=256, align=4, pos=64: 8.01 6.66 ( 16.90%) len=2048, align=0, pos=256: 15.21 9.11 ( 40.15%) len=256, align=4, pos=64: 8.03 6.66 ( 17.12%) len=256, align=4081, pos=64: 8.96 7.34 ( 18.04%) len=2048, align=0, pos=512: 18.05 15.16 ( 16.01%) len=256, align=5, pos=64: 8.01 6.47 ( 19.26%) len=2048, align=0, pos=512: 17.89 15.11 ( 15.56%) len=256, align=5, pos=64: 8.04 6.72 ( 16.37%) len=256, align=4081, pos=64: 8.66 7.82 ( 9.64%) len=2048, align=0, pos=1024: 27.13 22.19 ( 18.21%) len=256, align=6, pos=64: 8.07 6.42 ( 20.44%) len=2048, align=0, pos=1024: 23.71 22.57 ( 4.79%) len=256, align=6, pos=64: 8.22 6.64 ( 19.16%) len=256, align=4081, pos=64: 8.70 7.79 ( 10.40%) len=2048, align=0, pos=2048: 39.58 34.34 ( 13.24%) len=256, align=7, pos=64: 8.26 6.65 ( 19.56%) len=2048, align=0, pos=2048: 39.13 32.54 ( 16.84%) len=256, align=7, pos=64: 8.70 7.42 ( 14.72%) len=256, align=4081, pos=64: 8.69 7.36 ( 15.27%) len=192, align=1, pos=32: 7.39 5.11 ( 30.83%) len=192, 
align=1, pos=32: 7.39 5.13 ( 30.55%) len=256, align=1, pos=32: 7.77 5.10 ( 34.38%) len=256, align=1, pos=32: 7.39 5.10 ( 30.96%) len=512, align=1, pos=32: 7.68 5.83 ( 24.07%) len=512, align=1, pos=32: 7.39 5.76 ( 22.10%) len=256, align=4081, pos=32: 8.00 7.39 ( 7.69%) len=192, align=2, pos=64: 8.04 6.74 ( 16.19%) len=192, align=2, pos=64: 8.03 6.66 ( 17.00%) len=256, align=2, pos=64: 8.54 6.45 ( 24.48%) len=256, align=2, pos=64: 8.24 6.64 ( 19.43%) len=512, align=2, pos=64: 8.04 6.71 ( 16.62%) len=512, align=2, pos=64: 8.01 6.66 ( 16.78%) len=256, align=4081, pos=64: 8.71 7.33 ( 15.82%) len=192, align=3, pos=96: 8.55 6.66 ( 22.11%) len=192, align=3, pos=96: 8.66 6.59 ( 23.93%) len=256, align=3, pos=96: 8.73 6.66 ( 23.70%) len=256, align=3, pos=96: 8.74 6.66 ( 23.87%) len=512, align=3, pos=96: 8.66 6.71 ( 22.46%) len=512, align=3, pos=96: 8.88 6.64 ( 25.21%) len=256, align=4081, pos=96: 9.32 9.13 ( 2.07%) len=192, align=4, pos=128: 9.21 7.88 ( 14.44%) len=192, align=4, pos=128: 9.34 7.82 ( 16.34%) len=256, align=4, pos=128: 9.33 7.81 ( 16.28%) len=256, align=4, pos=128: 9.31 7.84 ( 15.81%) len=512, align=4, pos=128: 9.34 7.76 ( 16.88%) len=512, align=4, pos=128: 9.46 7.76 ( 18.03%) len=256, align=4081, pos=128: 10.44 8.45 ( 19.10%) len=192, align=5, pos=160: 11.48 7.88 ( 31.36%) len=192, align=5, pos=160: 11.30 7.77 ( 31.28%) len=256, align=5, pos=160: 11.01 7.76 ( 29.51%) len=256, align=5, pos=160: 10.65 7.78 ( 26.95%) len=512, align=5, pos=160: 13.27 7.77 ( 41.47%) len=512, align=5, pos=160: 13.29 7.80 ( 41.32%) len=256, align=4081, pos=160: 11.29 9.16 ( 18.83%) len=192, align=6, pos=192: 11.99 9.78 ( 18.43%) len=192, align=6, pos=192: 12.62 9.11 ( 27.84%) len=256, align=6, pos=192: 13.26 7.80 ( 41.16%) len=256, align=6, pos=192: 13.20 7.85 ( 40.57%) len=512, align=6, pos=192: 13.27 7.81 ( 41.16%) len=512, align=6, pos=192: 13.15 7.78 ( 40.81%) len=256, align=4081, pos=192: 13.89 9.12 ( 34.35%) len=192, align=7, pos=224: 11.94 9.46 ( 20.81%) len=192, align=7, 
pos=224: 13.12 9.67 ( 26.31%) len=256, align=7, pos=224: 14.76 7.79 ( 47.19%) len=256, align=7, pos=224: 14.51 7.88 ( 45.70%) len=512, align=7, pos=224: 14.16 8.28 ( 41.50%) len=512, align=7, pos=224: 13.27 7.82 ( 41.05%) len=256, align=4081, pos=224: 15.73 9.78 ( 37.83%) len=2, align=0, pos=1: 5.19 5.80 (-11.72%) len=2, align=0, pos=1: 5.33 4.74 ( 11.17%) len=2, align=1, pos=1: 5.33 5.38 ( -0.81%) len=2, align=1, pos=1: 5.33 4.71 ( 11.64%) len=0, align=0, pos=1: 4.67 4.00 ( 14.28%) len=0, align=0, pos=1: 4.71 4.00 ( 15.05%) len=0, align=1, pos=1: 4.67 4.00 ( 14.28%) len=0, align=1, pos=1: 4.99 4.00 ( 19.79%) len=2, align=2048, pos=1: 5.33 4.90 ( 8.19%) len=2, align=2048, pos=1: 5.33 5.08 ( 4.74%) len=2, align=2049, pos=1: 5.33 5.39 ( -0.97%) len=2, align=2049, pos=1: 5.33 4.81 ( 9.73%) len=0, align=2048, pos=1: 4.67 4.00 ( 14.27%) len=0, align=2048, pos=1: 4.71 4.00 ( 15.03%) len=0, align=2049, pos=1: 4.67 4.00 ( 14.28%) len=0, align=2049, pos=1: 4.81 4.00 ( 16.85%) len=0, align=4081, pos=1: 4.67 4.00 ( 14.28%) len=0, align=4081, pos=1: 4.82 4.00 ( 16.96%) len=2, align=4081, pos=1: 7.16 6.00 ( 16.22%) len=2, align=4081, pos=1: 6.00 5.56 ( 7.40%) len=3, align=0, pos=2: 5.33 5.33 ( -0.01%) len=3, align=0, pos=2: 5.33 4.68 ( 12.25%) len=3, align=2, pos=2: 5.33 5.38 ( -0.93%) len=3, align=2, pos=2: 5.33 4.83 ( 9.49%) len=1, align=0, pos=2: 5.33 5.12 ( 3.91%) len=1, align=0, pos=2: 6.00 5.17 ( 13.81%) len=1, align=2, pos=2: 5.33 4.93 ( 7.54%) len=1, align=2, pos=2: 5.33 4.71 ( 11.72%) len=3, align=2048, pos=2: 5.33 4.71 ( 11.72%) len=3, align=2048, pos=2: 5.33 5.37 ( -0.77%) len=3, align=2050, pos=2: 5.38 5.33 ( 0.78%) len=3, align=2050, pos=2: 5.37 4.67 ( 13.16%) len=1, align=2048, pos=2: 5.37 5.33 ( 0.66%) len=1, align=2048, pos=2: 5.84 4.67 ( 20.03%) len=1, align=2050, pos=2: 5.37 4.67 ( 13.16%) len=1, align=2050, pos=2: 6.00 5.38 ( 10.42%) len=1, align=4081, pos=2: 6.00 5.89 ( 1.88%) len=1, align=4081, pos=2: 6.04 6.00 ( 0.70%) len=3, align=4081, pos=2: 6.00 5.56 ( 
7.40%) len=3, align=4081, pos=2: 6.00 6.23 ( -3.77%) len=4, align=0, pos=3: 5.38 4.67 ( 13.27%) len=4, align=0, pos=3: 5.65 4.94 ( 12.49%) len=4, align=3, pos=3: 6.00 4.67 ( 22.21%) len=4, align=3, pos=3: 5.65 5.10 ( 9.86%) len=2, align=0, pos=3: 6.08 4.94 ( 18.76%) len=2, align=0, pos=3: 5.33 5.00 ( 6.24%) len=2, align=3, pos=3: 5.65 4.94 ( 12.49%) len=2, align=3, pos=3: 6.35 5.37 ( 15.40%) len=4, align=2048, pos=3: 6.28 5.10 ( 18.74%) len=4, align=2048, pos=3: 5.33 4.67 ( 12.49%) len=4, align=2051, pos=3: 5.33 5.33 ( -0.01%) len=4, align=2051, pos=3: 5.65 5.04 ( 10.80%) len=2, align=2048, pos=3: 5.40 5.33 ( 1.15%) len=2, align=2048, pos=3: 5.43 4.67 ( 14.09%) len=2, align=2051, pos=3: 5.33 5.33 ( -0.01%) len=2, align=2051, pos=3: 5.33 4.67 ( 12.49%) len=2, align=4081, pos=3: 6.00 5.60 ( 6.71%) len=2, align=4081, pos=3: 6.06 6.00 ( 1.00%) len=4, align=4081, pos=3: 6.28 5.33 ( 15.04%) len=4, align=4081, pos=3: 6.28 5.74 ( 8.58%) len=5, align=0, pos=4: 5.33 4.67 ( 12.50%) len=5, align=0, pos=4: 5.33 4.99 ( 6.44%) len=5, align=4, pos=4: 5.89 5.76 ( 2.24%) len=5, align=4, pos=4: 5.33 5.00 ( 6.24%) len=3, align=0, pos=4: 5.86 5.53 ( 5.54%) len=3, align=0, pos=4: 5.67 5.35 ( 5.70%) len=3, align=4, pos=4: 5.38 5.34 ( 0.75%) len=3, align=4, pos=4: 6.35 5.66 ( 10.93%) len=5, align=2048, pos=4: 5.50 5.76 ( -4.85%) len=5, align=2048, pos=4: 5.33 5.33 ( -0.01%) len=5, align=2052, pos=4: 5.33 5.38 ( -0.94%) len=5, align=2052, pos=4: 5.48 5.75 ( -4.85%) len=3, align=2048, pos=4: 5.65 5.29 ( 6.24%) len=3, align=2048, pos=4: 5.33 5.00 ( 6.24%) len=3, align=2052, pos=4: 5.33 5.21 ( 2.35%) len=3, align=2052, pos=4: 5.40 4.67 ( 13.55%) len=3, align=4081, pos=4: 6.00 5.56 ( 7.40%) len=3, align=4081, pos=4: 6.00 5.76 ( 4.07%) len=5, align=4081, pos=4: 6.13 5.97 ( 2.61%) len=5, align=4081, pos=4: 6.35 6.12 ( 3.61%) len=6, align=0, pos=5: 6.06 5.11 ( 15.70%) len=6, align=0, pos=5: 5.33 5.04 ( 5.57%) len=6, align=5, pos=5: 6.01 5.74 ( 4.39%) len=6, align=5, pos=5: 6.00 5.00 ( 16.66%) 
len=4, align=0, pos=5: 6.18 5.11 ( 17.43%) len=4, align=0, pos=5: 6.00 5.00 ( 16.66%) len=4, align=5, pos=5: 5.33 4.67 ( 12.49%) len=4, align=5, pos=5: 5.33 4.71 ( 11.72%) len=6, align=2048, pos=5: 6.81 5.37 ( 21.13%) len=6, align=2048, pos=5: 5.33 5.11 ( 4.15%) len=6, align=2053, pos=5: 5.77 5.54 ( 3.97%) len=6, align=2053, pos=5: 5.33 5.00 ( 6.24%) len=4, align=2048, pos=5: 5.33 4.71 ( 11.73%) len=4, align=2048, pos=5: 5.98 5.09 ( 14.93%) len=4, align=2053, pos=5: 6.00 4.67 ( 22.21%) len=4, align=2053, pos=5: 6.00 5.04 ( 15.93%) len=4, align=4081, pos=5: 6.23 5.75 ( 7.69%) len=4, align=4081, pos=5: 6.00 5.56 ( 7.40%) len=6, align=4081, pos=5: 6.10 5.98 ( 2.09%) len=6, align=4081, pos=5: 6.00 5.56 ( 7.40%) len=7, align=0, pos=6: 5.47 5.10 ( 6.63%) len=7, align=0, pos=6: 6.00 5.33 ( 11.11%) len=7, align=6, pos=6: 6.14 5.09 ( 17.14%) len=7, align=6, pos=6: 6.00 4.71 ( 21.54%) len=5, align=0, pos=6: 5.48 5.76 ( -5.06%) len=5, align=0, pos=6: 6.00 5.33 ( 11.10%) len=5, align=6, pos=6: 5.34 5.12 ( 4.15%) len=5, align=6, pos=6: 6.00 5.33 ( 11.11%) len=7, align=2048, pos=6: 5.47 5.21 ( 4.77%) len=7, align=2048, pos=6: 5.33 4.67 ( 12.49%) len=7, align=2054, pos=6: 5.48 5.11 ( 6.66%) len=7, align=2054, pos=6: 6.05 5.10 ( 15.64%) len=5, align=2048, pos=6: 6.00 5.29 ( 11.82%) len=5, align=2048, pos=6: 6.04 5.09 ( 15.77%) len=5, align=2054, pos=6: 5.33 5.02 ( 5.81%) len=5, align=2054, pos=6: 5.77 5.10 ( 11.66%) len=5, align=4081, pos=6: 6.00 5.56 ( 7.40%) len=5, align=4081, pos=6: 6.11 5.98 ( 2.12%) len=7, align=4081, pos=6: 6.00 5.60 ( 6.74%) len=7, align=4081, pos=6: 6.07 5.96 ( 1.81%) len=8, align=0, pos=7: 5.47 5.11 ( 6.58%) len=8, align=0, pos=7: 5.33 4.95 ( 7.20%) len=8, align=7, pos=7: 5.44 5.16 ( 5.20%) len=8, align=7, pos=7: 5.47 5.10 ( 6.71%) len=6, align=0, pos=7: 5.33 4.67 ( 12.49%) len=6, align=0, pos=7: 5.46 5.10 ( 6.44%) len=6, align=7, pos=7: 6.00 4.67 ( 22.21%) len=6, align=7, pos=7: 5.48 5.11 ( 6.75%) len=8, align=2048, pos=7: 5.33 4.90 ( 8.09%) len=8, 
align=2048, pos=7: 5.42 5.09 ( 6.11%) len=8, align=2055, pos=7: 5.45 5.09 ( 6.62%) len=8, align=2055, pos=7: 5.33 4.67 ( 12.49%) len=6, align=2048, pos=7: 5.49 5.10 ( 7.14%) len=6, align=2048, pos=7: 5.33 5.33 ( -0.01%) len=6, align=2055, pos=7: 5.55 5.10 ( 8.22%) len=6, align=2055, pos=7: 5.33 4.67 ( 12.49%) len=6, align=4081, pos=7: 6.11 5.97 ( 2.39%) len=6, align=4081, pos=7: 6.35 6.15 ( 3.16%) len=8, align=4081, pos=7: 6.00 5.55 ( 7.52%) len=8, align=4081, pos=7: 6.07 5.96 ( 1.90%) len=9, align=0, pos=8: 5.43 5.78 ( -6.30%) len=9, align=0, pos=8: 5.47 5.76 ( -5.38%) len=9, align=8, pos=8: 5.33 5.38 ( -0.94%) len=9, align=8, pos=8: 5.42 5.81 ( -7.22%) len=7, align=0, pos=8: 6.12 5.78 ( 5.52%) len=7, align=0, pos=8: 5.33 5.26 ( 1.30%) len=7, align=8, pos=8: 5.44 5.11 ( 6.01%) len=7, align=8, pos=8: 5.48 5.12 ( 6.66%) len=9, align=2048, pos=8: 5.33 4.95 ( 7.16%) len=9, align=2048, pos=8: 5.42 5.10 ( 5.89%) len=9, align=2056, pos=8: 6.06 5.11 ( 15.68%) len=9, align=2056, pos=8: 5.46 5.22 ( 4.43%) len=7, align=2048, pos=8: 6.00 4.67 ( 22.21%) len=7, align=2048, pos=8: 6.08 5.88 ( 3.18%) len=7, align=2056, pos=8: 5.47 5.10 ( 6.71%) len=7, align=2056, pos=8: 6.00 5.33 ( 11.10%) len=7, align=4081, pos=8: 6.07 5.99 ( 1.36%) len=7, align=4081, pos=8: 6.10 5.77 ( 5.41%) len=9, align=4081, pos=8: 6.09 6.00 ( 1.57%) len=9, align=4081, pos=8: 6.00 5.59 ( 6.81%) len=10, align=0, pos=9: 5.33 4.79 ( 10.14%) len=10, align=0, pos=9: 5.41 5.83 ( -7.81%) len=10, align=9, pos=9: 5.44 5.76 ( -5.85%) len=10, align=9, pos=9: 5.43 5.14 ( 5.26%) len=8, align=0, pos=9: 5.56 5.76 ( -3.54%) len=8, align=0, pos=9: 6.00 5.35 ( 10.90%) len=8, align=9, pos=9: 5.33 5.11 ( 4.00%) len=8, align=9, pos=9: 5.42 5.09 ( 5.99%) len=10, align=2048, pos=9: 5.46 5.77 ( -5.59%) len=10, align=2048, pos=9: 5.57 5.09 ( 8.58%) len=10, align=2057, pos=9: 6.00 4.90 ( 18.33%) len=10, align=2057, pos=9: 5.44 5.09 ( 6.34%) len=8, align=2048, pos=9: 5.44 5.11 ( 6.03%) len=8, align=2048, pos=9: 5.46 5.09 ( 6.81%) 
len=8, align=2057, pos=9: 5.33 4.83 ( 9.43%) len=8, align=2057, pos=9: 5.44 5.11 ( 6.00%) len=8, align=4081, pos=9: 6.10 5.97 ( 2.12%) len=8, align=4081, pos=9: 6.13 5.77 ( 5.76%) len=10, align=4081, pos=9: 6.11 5.97 ( 2.29%) len=10, align=4081, pos=9: 6.13 5.75 ( 6.12%) len=11, align=0, pos=10: 5.46 5.12 ( 6.23%) len=11, align=0, pos=10: 5.45 5.11 ( 6.18%) len=11, align=10, pos=10: 5.44 5.75 ( -5.60%) len=11, align=10, pos=10: 5.46 5.11 ( 6.31%) len=9, align=0, pos=10: 5.33 5.24 ( 1.66%) len=9, align=0, pos=10: 5.41 5.16 ( 4.62%) len=9, align=10, pos=10: 5.41 5.75 ( -6.31%) len=9, align=10, pos=10: 5.42 5.78 ( -6.64%) len=11, align=2048, pos=10: 5.42 5.11 ( 5.77%) len=11, align=2048, pos=10: 5.44 5.09 ( 6.35%) len=11, align=2058, pos=10: 6.07 5.11 ( 15.71%) len=11, align=2058, pos=10: 5.44 5.90 ( -8.57%) len=9, align=2048, pos=10: 5.97 5.10 ( 14.48%) len=9, align=2048, pos=10: 5.57 5.12 ( 8.08%) len=9, align=2058, pos=10: 5.33 5.37 ( -0.75%) len=9, align=2058, pos=10: 5.43 5.21 ( 4.00%) len=9, align=4081, pos=10: 6.06 5.77 ( 4.87%) len=9, align=4081, pos=10: 6.05 5.97 ( 1.20%) len=11, align=4081, pos=10: 6.06 5.84 ( 3.59%) len=11, align=4081, pos=10: 6.05 5.86 ( 3.14%) len=12, align=0, pos=11: 5.82 5.75 ( 1.18%) len=12, align=0, pos=11: 5.50 5.75 ( -4.46%) len=12, align=11, pos=11: 5.30 5.12 ( 3.46%) len=12, align=11, pos=11: 5.73 6.06 ( -5.85%) len=10, align=0, pos=11: 5.42 5.10 ( 5.89%) len=10, align=0, pos=11: 5.41 5.76 ( -6.41%) len=10, align=11, pos=11: 6.07 5.77 ( 4.98%) len=10, align=11, pos=11: 6.05 5.12 ( 15.26%) len=12, align=2048, pos=11: 5.43 5.11 ( 5.84%) len=12, align=2048, pos=11: 5.43 5.80 ( -6.94%) len=12, align=2059, pos=11: 6.27 5.78 ( 7.79%) len=12, align=2059, pos=11: 5.43 5.09 ( 6.35%) len=10, align=2048, pos=11: 5.44 5.46 ( -0.40%) len=10, align=2048, pos=11: 5.43 5.15 ( 5.15%) len=10, align=2059, pos=11: 5.41 5.77 ( -6.62%) len=10, align=2059, pos=11: 5.42 5.11 ( 5.72%) len=10, align=4081, pos=11: 6.07 5.77 ( 4.96%) len=10, align=4081, 
pos=11: 6.08 5.81 ( 4.44%) len=12, align=4081, pos=11: 6.07 5.76 ( 5.03%) len=12, align=4081, pos=11: 6.10 5.96 ( 2.22%) len=13, align=0, pos=12: 5.43 5.12 ( 5.66%) len=13, align=0, pos=12: 5.43 5.27 ( 3.00%) len=13, align=12, pos=12: 5.41 5.77 ( -6.64%) len=13, align=12, pos=12: 5.43 5.75 ( -5.87%) len=11, align=0, pos=12: 5.43 5.75 ( -5.87%) len=11, align=0, pos=12: 5.42 5.11 ( 5.63%) len=11, align=12, pos=12: 5.43 5.09 ( 6.28%) len=11, align=12, pos=12: 5.41 5.10 ( 5.75%) len=13, align=2048, pos=12: 5.49 5.34 ( 2.76%) len=13, align=2048, pos=12: 5.44 5.80 ( -6.74%) len=13, align=2060, pos=12: 5.48 5.91 ( -7.83%) len=13, align=2060, pos=12: 5.41 5.11 ( 5.62%) len=11, align=2048, pos=12: 5.40 5.09 ( 5.77%) len=11, align=2048, pos=12: 5.42 5.11 ( 5.75%) len=11, align=2060, pos=12: 5.41 5.12 ( 5.27%) len=11, align=2060, pos=12: 6.04 5.76 ( 4.72%) len=11, align=4081, pos=12: 5.94 5.74 ( 3.36%) len=11, align=4081, pos=12: 6.07 5.94 ( 2.22%) len=13, align=4081, pos=12: 6.12 5.98 ( 2.18%) len=13, align=4081, pos=12: 6.07 5.76 ( 5.03%) len=14, align=0, pos=13: 5.47 5.84 ( -6.73%) len=14, align=0, pos=13: 5.44 5.79 ( -6.45%) len=14, align=13, pos=13: 5.41 5.77 ( -6.69%) len=14, align=13, pos=13: 6.05 5.10 ( 15.72%) len=12, align=0, pos=13: 5.47 5.09 ( 7.03%) len=12, align=0, pos=13: 5.49 5.77 ( -5.20%) len=12, align=13, pos=13: 5.58 5.10 ( 8.63%) len=12, align=13, pos=13: 5.42 5.75 ( -6.16%) len=14, align=2048, pos=13: 5.42 5.69 ( -4.97%) len=14, align=2048, pos=13: 5.45 5.11 ( 6.09%) len=14, align=2061, pos=13: 5.41 5.09 ( 5.97%) len=14, align=2061, pos=13: 5.43 5.53 ( -1.70%) len=12, align=2048, pos=13: 6.07 5.76 ( 5.12%) len=12, align=2048, pos=13: 5.47 5.75 ( -5.19%) len=12, align=2061, pos=13: 5.41 5.11 ( 5.53%) len=12, align=2061, pos=13: 5.43 5.75 ( -5.87%) len=12, align=4081, pos=13: 6.07 5.87 ( 3.24%) len=12, align=4081, pos=13: 6.10 5.98 ( 2.10%) len=14, align=4081, pos=13: 6.07 5.97 ( 1.65%) len=14, align=4081, pos=13: 6.12 5.97 ( 2.49%) len=15, align=0, 
pos=14: 5.43 5.11 ( 5.91%) len=15, align=0, pos=14: 5.45 5.76 ( -5.80%) len=15, align=14, pos=14: 5.44 5.10 ( 6.14%) len=15, align=14, pos=14: 5.75 4.71 ( 18.10%) len=13, align=0, pos=14: 5.47 5.76 ( -5.29%) len=13, align=0, pos=14: 5.41 5.77 ( -6.55%) len=13, align=14, pos=14: 6.06 5.75 ( 5.07%) len=13, align=14, pos=14: 5.32 5.81 ( -9.19%) len=15, align=2048, pos=14: 5.44 5.91 ( -8.69%) len=15, align=2048, pos=14: 5.94 5.12 ( 13.78%) len=15, align=2062, pos=14: 5.77 5.75 ( 0.24%) len=15, align=2062, pos=14: 5.98 4.67 ( 22.02%) len=13, align=2048, pos=14: 5.44 5.45 ( -0.14%) len=13, align=2048, pos=14: 5.79 5.79 ( -0.01%) len=13, align=2062, pos=14: 6.06 5.10 ( 15.84%) len=13, align=2062, pos=14: 5.44 5.78 ( -6.33%) len=13, align=4081, pos=14: 6.10 5.75 ( 5.80%) len=13, align=4081, pos=14: 6.06 5.96 ( 1.61%) len=15, align=4081, pos=14: 6.05 5.99 ( 0.98%) len=15, align=4081, pos=14: 6.57 5.33 ( 18.78%) len=16, align=0, pos=15: 5.46 5.82 ( -6.49%) len=16, align=0, pos=15: 5.43 5.77 ( -6.15%) len=16, align=15, pos=15: 5.74 5.77 ( -0.54%) len=16, align=15, pos=15: 5.43 5.76 ( -5.96%) len=14, align=0, pos=15: 5.44 5.77 ( -6.06%) len=14, align=0, pos=15: 5.43 5.74 ( -5.68%) len=14, align=15, pos=15: 5.41 5.15 ( 4.86%) len=14, align=15, pos=15: 5.45 5.10 ( 6.36%) len=16, align=2048, pos=15: 5.43 5.09 ( 6.31%) len=16, align=2048, pos=15: 5.46 5.76 ( -5.42%) len=16, align=2063, pos=15: 5.43 5.12 ( 5.75%) len=16, align=2063, pos=15: 5.70 5.78 ( -1.33%) len=14, align=2048, pos=15: 6.04 5.09 ( 15.81%) len=14, align=2048, pos=15: 5.41 5.80 ( -7.29%) len=14, align=2063, pos=15: 5.42 5.14 ( 5.29%) len=14, align=2063, pos=15: 5.43 5.09 ( 6.24%) len=14, align=4081, pos=15: 6.12 5.78 ( 5.57%) len=14, align=4081, pos=15: 6.06 5.77 ( 4.82%) len=16, align=4081, pos=15: 8.05 7.36 ( 8.56%) len=16, align=4081, pos=15: 8.03 7.45 ( 7.22%) len=17, align=0, pos=16: 5.43 5.79 ( -6.68%) len=17, align=0, pos=16: 5.49 5.76 ( -4.94%) len=17, align=16, pos=16: 6.09 5.76 ( 5.43%) len=17, align=16, 
pos=16: 5.47 5.75 ( -5.12%) len=15, align=0, pos=16: 5.43 5.29 ( 2.46%) len=15, align=0, pos=16: 5.48 5.76 ( -5.05%) len=15, align=16, pos=16: 6.39 5.44 ( 14.82%) len=15, align=16, pos=16: 6.09 5.11 ( 16.16%) len=17, align=2048, pos=16: 5.77 5.75 ( 0.33%) len=17, align=2048, pos=16: 5.46 5.11 ( 6.46%) len=17, align=2064, pos=16: 5.77 5.11 ( 11.48%) len=17, align=2064, pos=16: 5.77 5.79 ( -0.42%) len=15, align=2048, pos=16: 6.06 5.32 ( 12.23%) len=15, align=2048, pos=16: 5.46 5.77 ( -5.62%) len=15, align=2064, pos=16: 5.43 5.68 ( -4.54%) len=15, align=2064, pos=16: 5.46 5.12 ( 6.23%) len=15, align=4081, pos=16: 6.07 6.76 (-11.37%) len=15, align=4081, pos=16: 6.06 6.44 ( -6.24%) len=17, align=4081, pos=16: 8.02 7.38 ( 7.92%) len=17, align=4081, pos=16: 8.28 7.32 ( 11.57%) len=18, align=0, pos=17: 5.44 5.77 ( -6.00%) len=18, align=0, pos=17: 5.42 5.11 ( 5.67%) len=18, align=17, pos=17: 5.79 5.74 ( 0.73%) len=18, align=17, pos=17: 5.47 5.12 ( 6.51%) len=16, align=0, pos=17: 6.08 5.78 ( 4.84%) len=16, align=0, pos=17: 5.44 5.12 ( 5.89%) len=16, align=17, pos=17: 5.44 5.14 ( 5.51%) len=16, align=17, pos=17: 6.05 5.12 ( 15.36%) len=18, align=2048, pos=17: 5.46 5.11 ( 6.54%) len=18, align=2048, pos=17: 5.85 5.76 ( 1.58%) len=18, align=2065, pos=17: 5.43 5.11 ( 5.85%) len=18, align=2065, pos=17: 5.42 5.80 ( -6.86%) len=16, align=2048, pos=17: 5.44 5.75 ( -5.69%) len=16, align=2048, pos=17: 5.47 5.09 ( 6.97%) len=16, align=2065, pos=17: 5.44 5.10 ( 6.30%) len=16, align=2065, pos=17: 5.47 5.09 ( 7.02%) len=16, align=4081, pos=17: 8.70 9.12 ( -4.79%) len=16, align=4081, pos=17: 9.23 9.08 ( 1.62%) len=18, align=4081, pos=17: 8.05 7.76 ( 3.53%) len=18, align=4081, pos=17: 8.61 7.31 ( 15.12%) len=19, align=0, pos=18: 5.45 5.25 ( 3.69%) len=19, align=0, pos=18: 6.07 5.60 ( 7.77%) len=19, align=18, pos=18: 5.43 5.39 ( 0.68%) len=19, align=18, pos=18: 5.41 5.64 ( -4.34%) len=17, align=0, pos=18: 6.06 5.76 ( 4.87%) len=17, align=0, pos=18: 6.07 5.77 ( 4.94%) len=17, align=18, pos=18: 
6.06 5.11 ( 15.61%) len=17, align=18, pos=18: 5.47 5.76 ( -5.32%) len=19, align=2048, pos=18: 5.68 5.77 ( -1.54%) len=19, align=2048, pos=18: 5.48 5.10 ( 6.96%) len=19, align=2066, pos=18: 5.72 5.12 ( 10.52%) len=19, align=2066, pos=18: 5.75 5.11 ( 11.12%) len=17, align=2048, pos=18: 6.06 5.12 ( 15.51%) len=17, align=2048, pos=18: 5.41 5.79 ( -7.05%) len=17, align=2066, pos=18: 6.05 5.10 ( 15.64%) len=17, align=2066, pos=18: 6.06 5.74 ( 5.29%) len=17, align=4081, pos=18: 8.99 9.08 ( -0.92%) len=17, align=4081, pos=18: 9.15 8.74 ( 4.46%) len=19, align=4081, pos=18: 8.02 7.39 ( 7.85%) len=19, align=4081, pos=18: 8.07 7.32 ( 9.34%) len=20, align=0, pos=19: 5.43 5.81 ( -7.13%) len=20, align=0, pos=19: 5.43 5.56 ( -2.43%) len=20, align=19, pos=19: 6.02 5.12 ( 15.03%) len=20, align=19, pos=19: 5.49 5.10 ( 7.04%) len=18, align=0, pos=19: 6.29 5.19 ( 17.61%) len=18, align=0, pos=19: 5.90 5.76 ( 2.32%) len=18, align=19, pos=19: 5.82 5.90 ( -1.36%) len=18, align=19, pos=19: 6.04 5.10 ( 15.48%) len=20, align=2048, pos=19: 6.15 5.10 ( 17.08%) len=20, align=2048, pos=19: 5.73 6.07 ( -5.84%) len=20, align=2067, pos=19: 5.45 5.10 ( 6.52%) len=20, align=2067, pos=19: 5.46 5.09 ( 6.71%) len=18, align=2048, pos=19: 5.41 5.09 ( 5.86%) len=18, align=2048, pos=19: 6.05 5.74 ( 5.02%) len=18, align=2067, pos=19: 5.28 5.74 ( -8.67%) len=18, align=2067, pos=19: 6.57 5.38 ( 18.10%) len=18, align=4081, pos=19: 9.04 9.13 ( -1.04%) len=18, align=4081, pos=19: 9.04 9.08 ( -0.45%) len=20, align=4081, pos=19: 8.71 7.32 ( 15.90%) len=20, align=4081, pos=19: 8.01 7.40 ( 7.71%) len=21, align=0, pos=20: 5.44 5.09 ( 6.39%) len=21, align=0, pos=20: 5.45 5.76 ( -5.76%) len=21, align=20, pos=20: 5.45 5.16 ( 5.33%) len=21, align=20, pos=20: 5.49 5.75 ( -4.82%) len=19, align=0, pos=20: 5.82 5.76 ( 1.01%) len=19, align=0, pos=20: 5.47 5.74 ( -4.93%) len=19, align=20, pos=20: 5.48 5.10 ( 6.88%) len=19, align=20, pos=20: 5.50 5.76 ( -4.67%) len=21, align=2048, pos=20: 6.06 5.10 ( 15.85%) len=21, align=2048, 
pos=20: 5.43 5.79 ( -6.53%) len=21, align=2068, pos=20: 6.07 5.13 ( 15.49%) len=21, align=2068, pos=20: 5.93 5.77 ( 2.70%) len=19, align=2048, pos=20: 5.47 5.76 ( -5.35%) len=19, align=2048, pos=20: 5.46 5.11 ( 6.28%) len=19, align=2068, pos=20: 5.46 5.10 ( 6.57%) len=19, align=2068, pos=20: 5.47 5.16 ( 5.75%) len=19, align=4081, pos=20: 8.95 9.08 ( -1.50%) len=19, align=4081, pos=20: 9.08 9.08 ( 0.05%) len=21, align=4081, pos=20: 8.08 7.34 ( 9.17%) len=21, align=4081, pos=20: 8.04 7.77 ( 3.38%) len=22, align=0, pos=21: 5.45 5.75 ( -5.45%) len=22, align=0, pos=21: 6.08 5.19 ( 14.73%) len=22, align=21, pos=21: 5.42 5.15 ( 5.02%) len=22, align=21, pos=21: 5.42 5.78 ( -6.73%) len=20, align=0, pos=21: 5.76 5.10 ( 11.45%) len=20, align=0, pos=21: 5.50 5.12 ( 7.04%) len=20, align=21, pos=21: 5.44 5.81 ( -6.82%) len=20, align=21, pos=21: 5.44 5.75 ( -5.82%) len=22, align=2048, pos=21: 6.07 6.22 ( -2.56%) len=22, align=2048, pos=21: 5.49 5.79 ( -5.47%) len=22, align=2069, pos=21: 5.66 5.11 ( 9.74%) len=22, align=2069, pos=21: 5.42 5.11 ( 5.80%) len=20, align=2048, pos=21: 6.05 5.11 ( 15.47%) len=20, align=2048, pos=21: 6.08 5.79 ( 4.69%) len=20, align=2069, pos=21: 5.43 5.30 ( 2.26%) len=20, align=2069, pos=21: 5.42 5.79 ( -6.77%) len=20, align=4081, pos=21: 9.04 8.76 ( 3.17%) len=20, align=4081, pos=21: 9.22 9.08 ( 1.50%) len=22, align=4081, pos=21: 8.37 7.34 ( 12.36%) len=22, align=4081, pos=21: 8.02 7.38 ( 8.03%) len=23, align=0, pos=22: 5.31 5.13 ( 3.32%) len=23, align=0, pos=22: 5.48 5.09 ( 7.03%) len=23, align=22, pos=22: 5.43 5.09 ( 6.22%) len=23, align=22, pos=22: 6.05 5.76 ( 4.75%) len=21, align=0, pos=22: 6.05 5.43 ( 10.30%) len=21, align=0, pos=22: 5.43 5.11 ( 5.99%) len=21, align=22, pos=22: 6.08 5.79 ( 4.67%) len=21, align=22, pos=22: 5.83 5.74 ( 1.51%) len=23, align=2048, pos=22: 5.49 5.75 ( -4.78%) len=23, align=2048, pos=22: 5.90 5.10 ( 13.61%) len=23, align=2070, pos=22: 5.41 5.11 ( 5.59%) len=23, align=2070, pos=22: 6.06 5.14 ( 15.14%) len=21, align=2048, 
pos=22: 5.66 5.77 ( -1.86%) len=21, align=2048, pos=22: 5.42 5.10 ( 6.02%) len=21, align=2070, pos=22: 5.44 5.77 ( -6.04%) len=21, align=2070, pos=22: 5.43 5.25 ( 3.23%) len=21, align=4081, pos=22: 8.74 9.08 ( -3.84%) len=21, align=4081, pos=22: 8.85 9.07 ( -2.51%) len=23, align=4081, pos=22: 8.68 7.38 ( 15.06%) len=23, align=4081, pos=22: 8.63 7.33 ( 15.06%) len=24, align=0, pos=23: 5.48 5.25 ( 4.25%) len=24, align=0, pos=23: 5.48 5.31 ( 3.18%) len=24, align=23, pos=23: 5.50 5.11 ( 7.11%) len=24, align=23, pos=23: 5.30 5.84 (-10.19%) len=22, align=0, pos=23: 5.47 5.13 ( 6.21%) len=22, align=0, pos=23: 6.07 5.76 ( 5.11%) len=22, align=23, pos=23: 5.44 5.33 ( 1.93%) len=22, align=23, pos=23: 5.45 5.10 ( 6.46%) len=24, align=2048, pos=23: 6.08 5.14 ( 15.43%) len=24, align=2048, pos=23: 5.58 5.76 ( -3.25%) len=24, align=2071, pos=23: 6.07 5.15 ( 15.14%) len=24, align=2071, pos=23: 5.48 5.10 ( 7.06%) len=22, align=2048, pos=23: 5.44 5.76 ( -5.98%) len=22, align=2048, pos=23: 6.02 5.11 ( 15.11%) len=22, align=2071, pos=23: 6.09 5.75 ( 5.56%) len=22, align=2071, pos=23: 6.05 5.15 ( 14.90%) len=22, align=4081, pos=23: 9.27 8.74 ( 5.72%) len=22, align=4081, pos=23: 9.33 8.75 ( 6.16%) len=24, align=4081, pos=23: 8.67 7.50 ( 13.54%) len=24, align=4081, pos=23: 8.02 7.54 ( 5.99%) len=25, align=0, pos=24: 6.08 5.78 ( 4.90%) len=25, align=0, pos=24: 5.45 5.62 ( -3.21%) len=25, align=24, pos=24: 5.49 5.76 ( -4.81%) len=25, align=24, pos=24: 6.05 5.82 ( 3.84%) len=23, align=0, pos=24: 5.41 5.12 ( 5.26%) len=23, align=0, pos=24: 5.63 5.89 ( -4.58%) len=23, align=24, pos=24: 6.04 5.77 ( 4.57%) len=23, align=24, pos=24: 5.47 5.11 ( 6.61%) len=25, align=2048, pos=24: 6.07 5.74 ( 5.37%) len=25, align=2048, pos=24: 6.09 5.09 ( 16.36%) len=25, align=2072, pos=24: 5.42 5.65 ( -4.13%) len=25, align=2072, pos=24: 5.43 5.09 ( 6.37%) len=23, align=2048, pos=24: 5.45 5.28 ( 3.09%) len=23, align=2048, pos=24: 6.07 5.76 ( 5.01%) len=23, align=2072, pos=24: 5.47 5.75 ( -5.29%) len=23, 
align=2072, pos=24: 6.07 5.74 ( 5.49%) len=23, align=4081, pos=24: 9.20 8.74 ( 4.97%) len=23, align=4081, pos=24: 9.04 9.08 ( -0.36%) len=25, align=4081, pos=24: 8.06 7.55 ( 6.40%) len=25, align=4081, pos=24: 8.01 7.37 ( 8.03%) len=26, align=0, pos=25: 5.50 5.13 ( 6.79%) len=26, align=0, pos=25: 5.45 5.13 ( 5.90%) len=26, align=25, pos=25: 5.45 5.76 ( -5.68%) len=26, align=25, pos=25: 5.46 5.79 ( -6.05%) len=24, align=0, pos=25: 6.06 5.35 ( 11.76%) len=24, align=0, pos=25: 5.62 5.76 ( -2.48%) len=24, align=25, pos=25: 6.03 5.10 ( 15.37%) len=24, align=25, pos=25: 5.34 5.74 ( -7.62%) len=26, align=2048, pos=25: 6.08 5.10 ( 16.01%) len=26, align=2048, pos=25: 5.57 5.09 ( 8.53%) len=26, align=2073, pos=25: 5.43 5.80 ( -6.77%) len=26, align=2073, pos=25: 5.45 5.72 ( -4.98%) len=24, align=2048, pos=25: 5.65 5.77 ( -2.22%) len=24, align=2048, pos=25: 6.07 5.15 ( 15.29%) len=24, align=2073, pos=25: 5.44 5.10 ( 6.16%) len=24, align=2073, pos=25: 5.46 5.76 ( -5.49%) len=24, align=4081, pos=25: 8.99 9.05 ( -0.63%) len=24, align=4081, pos=25: 8.99 9.08 ( -1.04%) len=26, align=4081, pos=25: 8.03 7.32 ( 8.91%) len=26, align=4081, pos=25: 8.01 7.38 ( 7.92%) len=27, align=0, pos=26: 5.47 5.13 ( 6.16%) len=27, align=0, pos=26: 5.48 5.12 ( 6.59%) len=27, align=26, pos=26: 5.42 5.16 ( 4.77%) len=27, align=26, pos=26: 5.45 5.11 ( 6.25%) len=25, align=0, pos=26: 5.44 5.76 ( -5.92%) len=25, align=0, pos=26: 6.08 5.37 ( 11.63%) len=25, align=26, pos=26: 5.43 5.37 ( 1.07%) len=25, align=26, pos=26: 5.43 5.09 ( 6.23%) len=27, align=2048, pos=26: 5.42 5.09 ( 6.05%) len=27, align=2048, pos=26: 5.94 5.11 ( 14.02%) len=27, align=2074, pos=26: 5.61 5.09 ( 9.23%) len=27, align=2074, pos=26: 5.90 5.75 ( 2.55%) len=25, align=2048, pos=26: 6.07 5.45 ( 10.27%) len=25, align=2048, pos=26: 6.06 5.09 ( 16.00%) len=25, align=2074, pos=26: 5.61 5.76 ( -2.60%) len=25, align=2074, pos=26: 5.44 5.69 ( -4.58%) len=25, align=4081, pos=26: 8.69 9.05 ( -4.18%) len=25, align=4081, pos=26: 9.27 9.09 ( 1.97%) 
len=27, align=4081, pos=26: 8.03 7.39 ( 8.04%) len=27, align=4081, pos=26: 8.07 7.32 ( 9.33%) len=28, align=0, pos=27: 5.41 5.26 ( 2.69%) len=28, align=0, pos=27: 6.06 5.11 ( 15.64%) len=28, align=27, pos=27: 5.42 5.12 ( 5.67%) len=28, align=27, pos=27: 5.41 5.10 ( 5.77%) len=26, align=0, pos=27: 6.05 5.09 ( 15.92%) len=26, align=0, pos=27: 5.88 5.11 ( 12.97%) len=26, align=27, pos=27: 5.43 5.14 ( 5.40%) len=26, align=27, pos=27: 5.41 5.12 ( 5.39%) len=28, align=2048, pos=27: 6.04 5.77 ( 4.56%) len=28, align=2048, pos=27: 5.43 5.77 ( -6.33%) len=28, align=2075, pos=27: 6.04 5.62 ( 6.94%) len=28, align=2075, pos=27: 5.42 5.17 ( 4.48%) len=26, align=2048, pos=27: 6.05 5.78 ( 4.54%) len=26, align=2048, pos=27: 6.05 5.11 ( 15.55%) len=26, align=2075, pos=27: 5.47 5.76 ( -5.16%) len=26, align=2075, pos=27: 6.06 5.10 ( 15.87%) len=26, align=4081, pos=27: 9.08 9.08 ( 0.03%) len=26, align=4081, pos=27: 8.98 9.13 ( -1.67%) len=28, align=4081, pos=27: 8.56 7.34 ( 14.22%) len=28, align=4081, pos=27: 8.01 7.39 ( 7.79%) len=29, align=0, pos=28: 5.48 5.10 ( 6.91%) len=29, align=0, pos=28: 5.98 5.35 ( 10.63%) len=29, align=28, pos=28: 6.05 5.11 ( 15.52%) len=29, align=28, pos=28: 6.07 5.12 ( 15.64%) len=27, align=0, pos=28: 5.41 5.75 ( -6.41%) len=27, align=0, pos=28: 5.48 5.12 ( 6.61%) len=27, align=28, pos=28: 5.78 5.75 ( 0.47%) len=27, align=28, pos=28: 5.67 5.09 ( 10.31%) len=29, align=2048, pos=28: 5.76 5.11 ( 11.35%) len=29, align=2048, pos=28: 5.64 5.75 ( -2.04%) len=29, align=2076, pos=28: 5.41 5.10 ( 5.62%) len=29, align=2076, pos=28: 5.43 5.09 ( 6.15%) len=27, align=2048, pos=28: 6.10 5.76 ( 5.64%) len=27, align=2048, pos=28: 5.78 5.10 ( 11.72%) len=27, align=2076, pos=28: 5.45 5.12 ( 6.05%) len=27, align=2076, pos=28: 6.06 5.14 ( 15.12%) len=27, align=4081, pos=28: 9.01 9.08 ( -0.75%) len=27, align=4081, pos=28: 8.99 9.12 ( -1.38%) len=29, align=4081, pos=28: 8.03 7.33 ( 8.77%) len=29, align=4081, pos=28: 8.39 7.34 ( 12.48%) len=30, align=0, pos=29: 6.07 5.58 ( 8.08%) 
len=30, align=0, pos=29: 5.47 5.80 ( -5.97%) len=30, align=29, pos=29: 5.46 5.10 ( 6.55%) len=30, align=29, pos=29: 5.44 5.11 ( 6.15%) len=28, align=0, pos=29: 5.44 5.76 ( -6.01%) len=28, align=0, pos=29: 5.47 5.09 ( 7.06%) len=28, align=29, pos=29: 5.45 5.27 ( 3.21%) len=28, align=29, pos=29: 5.47 5.77 ( -5.59%) len=30, align=2048, pos=29: 6.06 5.10 ( 15.92%) len=30, align=2048, pos=29: 5.43 5.78 ( -6.45%) len=30, align=2077, pos=29: 5.30 5.56 ( -4.97%) len=30, align=2077, pos=29: 5.46 5.76 ( -5.60%) len=28, align=2048, pos=29: 5.76 5.77 ( -0.25%) len=28, align=2048, pos=29: 5.72 5.12 ( 10.53%) len=28, align=2077, pos=29: 5.74 5.11 ( 10.98%) len=28, align=2077, pos=29: 5.45 5.76 ( -5.86%) len=28, align=4081, pos=29: 9.28 9.09 ( 2.09%) len=28, align=4081, pos=29: 9.15 9.08 ( 0.73%) len=30, align=4081, pos=29: 8.76 8.37 ( 4.46%) len=30, align=4081, pos=29: 8.44 7.33 ( 13.13%) len=31, align=0, pos=30: 5.42 5.15 ( 5.04%) len=31, align=0, pos=30: 5.49 5.11 ( 6.91%) len=31, align=30, pos=30: 6.07 5.12 ( 15.63%) len=31, align=30, pos=30: 6.08 5.15 ( 15.30%) len=29, align=0, pos=30: 6.08 5.40 ( 11.12%) len=29, align=0, pos=30: 5.44 5.14 ( 5.48%) len=29, align=30, pos=30: 5.46 5.11 ( 6.39%) len=29, align=30, pos=30: 5.68 5.09 ( 10.41%) len=31, align=2048, pos=30: 5.44 5.10 ( 6.29%) len=31, align=2048, pos=30: 5.31 5.78 ( -9.03%) len=31, align=2078, pos=30: 5.81 5.11 ( 12.06%) len=31, align=2078, pos=30: 5.44 5.77 ( -6.02%) len=29, align=2048, pos=30: 6.06 5.10 ( 15.83%) len=29, align=2048, pos=30: 5.45 5.14 ( 5.63%) len=29, align=2078, pos=30: 5.60 5.48 ( 2.17%) len=29, align=2078, pos=30: 6.09 5.74 ( 5.66%) len=29, align=4081, pos=30: 9.37 9.08 ( 2.99%) len=29, align=4081, pos=30: 9.02 8.75 ( 2.98%) len=31, align=4081, pos=30: 8.58 7.32 ( 14.63%) len=31, align=4081, pos=30: 8.08 7.32 ( 9.47%) len=32, align=0, pos=31: 5.45 5.58 ( -2.32%) len=32, align=0, pos=31: 5.48 5.12 ( 6.45%) len=32, align=31, pos=31: 5.43 5.11 ( 5.91%) len=32, align=31, pos=31: 5.44 5.82 ( -6.98%) 
len=30, align=0, pos=31: 5.81 5.10 ( 12.08%) len=30, align=0, pos=31: 5.44 5.14 ( 5.52%) len=30, align=31, pos=31: 5.44 5.12 ( 5.83%) len=30, align=31, pos=31: 6.08 5.58 ( 8.31%) len=32, align=2048, pos=31: 5.48 5.10 ( 6.88%) len=32, align=2048, pos=31: 5.42 5.09 ( 6.16%) len=32, align=2079, pos=31: 6.06 5.44 ( 10.26%) len=32, align=2079, pos=31: 5.76 5.10 ( 11.58%) len=30, align=2048, pos=31: 5.44 5.76 ( -5.86%) len=30, align=2048, pos=31: 5.66 5.16 ( 8.92%) len=30, align=2079, pos=31: 5.45 5.15 ( 5.39%) len=30, align=2079, pos=31: 5.50 5.75 ( -4.55%) len=30, align=4081, pos=31: 9.02 9.14 ( -1.31%) len=30, align=4081, pos=31: 8.99 9.11 ( -1.36%) len=32, align=4081, pos=31: 8.06 7.40 ( 8.25%) len=32, align=4081, pos=31: 8.01 7.32 ( 8.62%) ^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v5] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-17 15:46 ` Noah Goldstein 2022-10-17 23:55 ` Sunil Pandey @ 2022-10-18 0:00 ` Sunil K Pandey 2022-10-18 3:01 ` Noah Goldstein 1 sibling, 1 reply; 26+ messages in thread From: Sunil K Pandey @ 2022-10-18 0:00 UTC (permalink / raw) To: libc-alpha Changes from v4: - Replace jmp max in first vector with cmov. - Replace jmp max in page cross with cmov. Changes from v3: - Replace VPCMPEQ in loop with VPTESTNM for 4th vector. - Change first vector max check logic for terminating condition. - Change page cross logic for terminating condition. - Remove unnecessary check in align_more block. - Remove unnecessary VEC(0) initialization. - Define USE_WIDE_CHAR in wmemchr. Changes from v2: - Use VEC API - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2) Changes from v1: - Change vcmp to vcmpeq and vcmpneq. - Restructure unconditional loop jump logic. - Improve 4 vector loop logic. - Fix bug near page boundary. This patch implements the following evex512 versions of string functions. The evex512 version takes up to 30% fewer cycles as compared to evex, depending on length and alignment. - memchr function using 512 bit vectors. - rawmemchr function using 512 bit vectors. - wmemchr function using 512 bit vectors. Code size data: memchr-evex.o 762 byte memchr-evex512.o 576 byte (-24%) rawmemchr-evex.o 461 byte rawmemchr-evex512.o 432 byte (-6%) wmemchr-evex.o 794 byte wmemchr-evex512.o 576 byte (-27%) Placeholder function, not used by any processor at the moment. 
Fix page cross logic Fix 2 --- sysdeps/x86_64/multiarch/Makefile | 3 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + sysdeps/x86_64/multiarch/memchr-evex-base.S | 301 +++++++++++++++++++ sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + sysdeps/x86_64/multiarch/wmemchr-evex512.S | 9 + 6 files changed, 343 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index df4601c294..e974b1ad97 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,6 +4,7 @@ sysdep_routines += \ memchr-avx2 \ memchr-avx2-rtm \ memchr-evex \ + memchr-evex512 \ memchr-evex-rtm \ memchr-sse2 \ memcmp-avx2-movbe \ @@ -36,6 +37,7 @@ sysdep_routines += \ rawmemchr-avx2 \ rawmemchr-avx2-rtm \ rawmemchr-evex \ + rawmemchr-evex512 \ rawmemchr-evex-rtm \ rawmemchr-sse2 \ stpcpy-avx2 \ @@ -156,6 +158,7 @@ sysdep_routines += \ wmemchr-avx2 \ wmemchr-avx2-rtm \ wmemchr-evex \ + wmemchr-evex512 \ wmemchr-evex-rtm \ wmemchr-sse2 \ wmemcmp-avx2-movbe \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 00a91123d3..529c0b0ef0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __memchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __memchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -337,6 
+342,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __rawmemchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __rawmemchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wmemchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wmemchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S new file mode 100644 index 0000000000..ea92983db8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S @@ -0,0 +1,301 @@ +/* Placeholder function, not used by any processor at the moment. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* UNUSED. 
Exists purely as reference implementation. */ + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (4) + +# include <sysdep.h> + +# ifdef USE_AS_WMEMCHR +# define CHAR_SIZE 4 +# define VPBROADCAST vpbroadcastd +# define VPCMPEQ vpcmpeqd +# define VPCMPNE vpcmpneqd +# define VPMINU vpminud +# define VPTESTNM vptestnmd +# else +# define CHAR_SIZE 1 +# define VPBROADCAST vpbroadcastb +# define VPCMPEQ vpcmpeqb +# define VPCMPNE vpcmpneqb +# define VPMINU vpminub +# define VPTESTNM vptestnmb +# endif + +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text), "ax", @progbits +/* Aligning entry point to 64 byte, provides better performance for + one vector length string. */ +ENTRY_P2ALIGN (MEMCHR, 6) +# ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ + test %RDX_LP, %RDX_LP + jz L(zero) + +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif +# endif + + /* Broadcast CHAR to VMM(1). */ + VPBROADCAST %esi, %VMM(1) + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(page_cross) + + /* Compare [w]char for null, mask bit will be set for match. 
*/ + VPCMPEQ (%rdi), %VMM(1), %k0 + + KMOV %k0, %VRCX +# ifndef USE_AS_RAWMEMCHR + mov %rdx, %rsi + bsf %VRCX, %VRSI + cmp $CHAR_PER_VEC, %rsi + ja L(align_more) +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rsi, CHAR_SIZE), %rdi +# else + addq %rsi, %rdi +# endif + xor %eax, %eax + cmp %rsi, %rdx + cmova %rdi, %rax +# else + bsf %VRCX, %VRAX + jz L(align_more) + add %rdi, %rax +# endif + ret + + .p2align 5,,5 +L(page_cross): + movl %eax, %ecx + andl $(VEC_SIZE - 1), %ecx +# ifdef USE_AS_WMEMCHR + shrl $2, %ecx +# endif + xorq %rdi, %rax + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0 + KMOV %k0, %VRSI + shr %cl, %VRSI +# ifndef USE_AS_RAWMEMCHR + jnz L(page_cross_end) + movl $CHAR_PER_VEC, %eax + sub %ecx, %eax + cmp %rax, %rdx + ja L(align_more) +# else + jz L(align_more) +# endif + +L(page_cross_end): +# ifndef USE_AS_RAWMEMCHR + bsf %VRSI, %VRCX + leaq (%rdi, %rcx, CHAR_SIZE), %rdi + xor %eax, %eax + cmp %rcx, %rdx + cmova %rdi, %rax +# else + bsf %VRSI, %VRAX + add %rdi, %rax +# endif + ret + +# ifndef USE_AS_RAWMEMCHR +L(zero): + xorl %eax, %eax + ret +# endif + +L(ret_vec_x2): + subq $-VEC_SIZE, %rdi +L(ret_vec_x1): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + cmp %rax, %rdx + jbe L(zero) +# endif +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + add %rdi, %rax +# endif + ret + + .p2align 5,,5 +L(align_more): +# ifndef USE_AS_RAWMEMCHR + mov %rdi, %rax +# endif + subq $-VEC_SIZE, %rdi + /* Align rdi to VEC_SIZE. */ + andq $-VEC_SIZE, %rdi + +# ifndef USE_AS_RAWMEMCHR + subq %rdi, %rax +# ifdef USE_AS_WMEMCHR + sar $2, %rax +# endif + addq %rax, %rdx +# endif + + /* Loop unroll 4 times for 4 vector loop. 
*/ + VPCMPEQ (%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x1) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x2) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x3) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x4) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) + /* Save pointer to find alignment adjustment. */ + movq %rdi, %rax +# endif + /* Align address to VEC_SIZE * 4 for loop. */ + andq $-(VEC_SIZE * 4), %rdi + + /* Add alignment difference to rdx. */ +# ifndef USE_AS_RAWMEMCHR + subq %rdi, %rax +# ifdef USE_AS_WMEMCHR + shr $2, %VRAX +# endif + addq %rax, %rdx +# endif + + /* 4 vector loop. 
*/ + .p2align 5,,11 +L(loop): + + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} + VPTESTNM %VMM(3), %VMM(3), %k2 + + subq $-(VEC_SIZE * 4), %rdi + KORTEST %k2, %k3 +# ifdef USE_AS_RAWMEMCHR + jz L(loop) +# else + jnz L(loopend) + subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop) +L(zero_2): + xor %eax, %eax + ret +# endif + +L(loopend): + VPCMPEQ (%rdi), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x1) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero_2) +# endif + + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x2) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero_2) +# endif + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x3) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero_2) +# endif + + /* At this point null [w]char must be in the fourth vector so no + need to check. 
*/ + KMOV %k3, %VRAX + +L(ret_vec_x4): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + cmp %rax, %rdx + jbe L(zero) +# endif + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 5,,5 +L(ret_vec_x3): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + cmp %rax, %rdx + jbe L(zero) +# endif + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +END (MEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S new file mode 100644 index 0000000000..002f8c8489 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S @@ -0,0 +1,8 @@ +# ifndef MEMCHR +# define MEMCHR __memchr_evex512 +# endif + +#include "x86-evex512-vecs.h" +#include "reg-macros.h" + +#include "memchr-evex-base.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S new file mode 100644 index 0000000000..302d3cb055 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S @@ -0,0 +1,7 @@ +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_evex512 +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR + +#include "memchr-evex512.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S new file mode 100644 index 0000000000..78ec4ee5ad --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S @@ -0,0 +1,9 @@ +#ifndef WMEMCHR +# define WMEMCHR __wmemchr_evex512 +#endif + +#define MEMCHR WMEMCHR +#define USE_AS_WMEMCHR 1 + +#define USE_WIDE_CHAR 1 +#include "memchr-evex512.S" -- 2.36.1 ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v5] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-18 0:00 ` [PATCH v5] " Sunil K Pandey @ 2022-10-18 3:01 ` Noah Goldstein 2022-10-18 4:15 ` Sunil Pandey 0 siblings, 1 reply; 26+ messages in thread From: Noah Goldstein @ 2022-10-18 3:01 UTC (permalink / raw) To: Sunil K Pandey; +Cc: libc-alpha On Mon, Oct 17, 2022 at 5:00 PM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > Changes from v4: > - Replace jmp max in first vector with cmov. > - Replace jmp max in page cross with cmov. > Changes from v3: > - Replace VPCMPEQ in loop with VPTESTNM for 4th vector. > - Change first vector max check logic for terminating condition. > - Change page cross logic for terminating condition. > - Remove unnessary check in align_more block. > - Remove unnessary VEC(0) initialization. > - Define USE_WIDE_CHAR in wmemchr. > > Changes from v2: > - Use VEC API > - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2) > > Changes from v1: > - Change vcmp to vcmpeq and vcmpneq. > - Restructure unconditional loop jump logic. > - Improve 4 vector loop logic. > - Fix bug near page boundary. > > This patch implements following evex512 version of string functions. > evex512 version takes up to 30% less cycle as compared to evex, > depending on length and alignment. > > - memchr function using 512 bit vectors. > - rawmemchr function using 512 bit vectors. > - wmemchr function using 512 bit vectors. > > Code size data: > > memchr-evex.o 762 byte > memchr-evex512.o 576 byte (-24%) > > rawmemchr-evex.o 461 byte > rawmemchr-evex512.o 432 byte (-6%) > > wmemchr-evex.o 794 byte > wmemchr-evex512.o 576 byte (-27%) > > Placeholder function, not used by any processor at the moment. 
> > Fix page cross logic > > Fix 2 > --- > sysdeps/x86_64/multiarch/Makefile | 3 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > sysdeps/x86_64/multiarch/memchr-evex-base.S | 301 +++++++++++++++++++ > sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 9 + > 6 files changed, 343 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index df4601c294..e974b1ad97 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -4,6 +4,7 @@ sysdep_routines += \ > memchr-avx2 \ > memchr-avx2-rtm \ > memchr-evex \ > + memchr-evex512 \ > memchr-evex-rtm \ > memchr-sse2 \ > memcmp-avx2-movbe \ > @@ -36,6 +37,7 @@ sysdep_routines += \ > rawmemchr-avx2 \ > rawmemchr-avx2-rtm \ > rawmemchr-evex \ > + rawmemchr-evex512 \ > rawmemchr-evex-rtm \ > rawmemchr-sse2 \ > stpcpy-avx2 \ > @@ -156,6 +158,7 @@ sysdep_routines += \ > wmemchr-avx2 \ > wmemchr-avx2-rtm \ > wmemchr-evex \ > + wmemchr-evex512 \ > wmemchr-evex-rtm \ > wmemchr-sse2 \ > wmemcmp-avx2-movbe \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 00a91123d3..529c0b0ef0 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __memchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __memchr_evex512) > 
X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > @@ -337,6 +342,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __rawmemchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __rawmemchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __wmemchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __wmemchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > new file mode 100644 > index 0000000000..ea92983db8 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > @@ -0,0 +1,301 @@ > +/* Placeholder function, not used by any processor at the moment. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* UNUSED. Exists purely as reference implementation. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (4) > + > +# include <sysdep.h> > + > +# ifdef USE_AS_WMEMCHR > +# define CHAR_SIZE 4 > +# define VPBROADCAST vpbroadcastd > +# define VPCMPEQ vpcmpeqd > +# define VPCMPNE vpcmpneqd > +# define VPMINU vpminud > +# define VPTESTNM vptestnmd > +# else > +# define CHAR_SIZE 1 > +# define VPBROADCAST vpbroadcastb > +# define VPCMPEQ vpcmpeqb > +# define VPCMPNE vpcmpneqb > +# define VPMINU vpminub > +# define VPTESTNM vptestnmb > +# endif > + > +# define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > + .section SECTION(.text), "ax", @progbits > +/* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > +ENTRY_P2ALIGN (MEMCHR, 6) > +# ifndef USE_AS_RAWMEMCHR > + /* Check for zero length. */ > + test %RDX_LP, %RDX_LP > + jz L(zero) > + > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %edx, %edx > +# endif > +# endif > + > + /* Broadcast CHAR to VMM(1). */ > + VPBROADCAST %esi, %VMM(1) > + movl %edi, %eax > + andl $(PAGE_SIZE - 1), %eax > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(page_cross) > + > + /* Compare [w]char for null, mask bit will be set for match. 
*/ > + VPCMPEQ (%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRCX > +# ifndef USE_AS_RAWMEMCHR > + mov %rdx, %rsi > + bsf %VRCX, %VRSI > + cmp $CHAR_PER_VEC, %rsi > + ja L(align_more) > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rsi, CHAR_SIZE), %rdi > +# else > + addq %rsi, %rdi > +# endif > + xor %eax, %eax > + cmp %rsi, %rdx > + cmova %rdi, %rax > +# else > + bsf %VRCX, %VRAX > + jz L(align_more) > + add %rdi, %rax > +# endif > + ret > + > + .p2align 5,,5 > +L(page_cross): > + movl %eax, %ecx > + andl $(VEC_SIZE - 1), %ecx > +# ifdef USE_AS_WMEMCHR > + shrl $2, %ecx > +# endif > + xorq %rdi, %rax > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0 > + KMOV %k0, %VRSI > + shr %cl, %VRSI > +# ifndef USE_AS_RAWMEMCHR > + jnz L(page_cross_end) > + movl $CHAR_PER_VEC, %eax > + sub %ecx, %eax > + cmp %rax, %rdx > + ja L(align_more) > +# else > + jz L(align_more) > +# endif > + > +L(page_cross_end): > +# ifndef USE_AS_RAWMEMCHR > + bsf %VRSI, %VRCX > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > + xor %eax, %eax > + cmp %rcx, %rdx > + cmova %rdi, %rax You have a bug here test case: align % 4096 = 4036 len = 8 pos = N/A (no char in bounds). I think the `bsf %rsi, %rcx` shouldn't have rcx as a destination or you need to move `rdx` to `rcx` first. > +# else > + bsf %VRSI, %VRAX > + add %rdi, %rax > +# endif > + ret > + > +# ifndef USE_AS_RAWMEMCHR > +L(zero): > + xorl %eax, %eax > + ret > +# endif > + > +L(ret_vec_x2): > + subq $-VEC_SIZE, %rdi > +L(ret_vec_x1): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + add %rdi, %rax > +# endif > + ret > + > + .p2align 5,,5 > +L(align_more): > +# ifndef USE_AS_RAWMEMCHR > + mov %rdi, %rax > +# endif > + subq $-VEC_SIZE, %rdi > + /* Align rdi to VEC_SIZE. 
*/ > + andq $-VEC_SIZE, %rdi > + > +# ifndef USE_AS_RAWMEMCHR > + subq %rdi, %rax > +# ifdef USE_AS_WMEMCHR > + sar $2, %rax > +# endif > + addq %rax, %rdx > +# endif > + > + /* Loop unroll 4 times for 4 vector loop. */ > + VPCMPEQ (%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x1) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x2) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x3) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x4) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > + /* Save pointer to find alignment adjustment. */ > + movq %rdi, %rax > +# endif > + /* Align address to VEC_SIZE * 4 for loop. */ > + andq $-(VEC_SIZE * 4), %rdi > + > + /* Add alignment difference to rdx. */ > +# ifndef USE_AS_RAWMEMCHR > + subq %rdi, %rax > +# ifdef USE_AS_WMEMCHR > + shr $2, %VRAX > +# endif > + addq %rax, %rdx > +# endif > + > + /* 4 vector loop. 
*/ > + .p2align 5,,11 > +L(loop): > + > + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 > + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) > + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > + VPTESTNM %VMM(3), %VMM(3), %k2 > + > + subq $-(VEC_SIZE * 4), %rdi > + KORTEST %k2, %k3 > +# ifdef USE_AS_RAWMEMCHR > + jz L(loop) > +# else > + jnz L(loopend) > + subq $(CHAR_PER_VEC * 4), %rdx > + ja L(loop) > +L(zero_2): > + xor %eax, %eax > + ret > +# endif > + > +L(loopend): > + VPCMPEQ (%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x1) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x2) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x3) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + /* At this point null [w]char must be in the fourth vector so no > + need to check. 
*/ > + KMOV %k3, %VRAX > + > +L(ret_vec_x4): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > + .p2align 5,,5 > +L(ret_vec_x3): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > +END (MEMCHR) > +#endif > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > new file mode 100644 > index 0000000000..002f8c8489 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > @@ -0,0 +1,8 @@ > +# ifndef MEMCHR > +# define MEMCHR __memchr_evex512 > +# endif > + > +#include "x86-evex512-vecs.h" > +#include "reg-macros.h" > + > +#include "memchr-evex-base.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > new file mode 100644 > index 0000000000..302d3cb055 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > @@ -0,0 +1,7 @@ > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_evex512 > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > + > +#include "memchr-evex512.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > new file mode 100644 > index 0000000000..78ec4ee5ad > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > @@ -0,0 +1,9 @@ > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_evex512 > +#endif > + > +#define MEMCHR WMEMCHR > +#define USE_AS_WMEMCHR 1 > + > +#define USE_WIDE_CHAR 1 > +#include "memchr-evex512.S" > -- > 2.36.1 > ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v5] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-18 3:01 ` Noah Goldstein @ 2022-10-18 4:15 ` Sunil Pandey 2022-10-18 4:18 ` Noah Goldstein 0 siblings, 1 reply; 26+ messages in thread From: Sunil Pandey @ 2022-10-18 4:15 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha On Mon, Oct 17, 2022 at 8:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Mon, Oct 17, 2022 at 5:00 PM Sunil K Pandey via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > Changes from v4: > > - Replace jmp max in first vector with cmov. > > - Replace jmp max in page cross with cmov. > > Changes from v3: > > - Replace VPCMPEQ in loop with VPTESTNM for 4th vector. > > - Change first vector max check logic for terminating condition. > > - Change page cross logic for terminating condition. > > - Remove unnessary check in align_more block. > > - Remove unnessary VEC(0) initialization. > > - Define USE_WIDE_CHAR in wmemchr. > > > > Changes from v2: > > - Use VEC API > > - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2) > > > > Changes from v1: > > - Change vcmp to vcmpeq and vcmpneq. > > - Restructure unconditional loop jump logic. > > - Improve 4 vector loop logic. > > - Fix bug near page boundary. > > > > This patch implements following evex512 version of string functions. > > evex512 version takes up to 30% less cycle as compared to evex, > > depending on length and alignment. > > > > - memchr function using 512 bit vectors. > > - rawmemchr function using 512 bit vectors. > > - wmemchr function using 512 bit vectors. > > > > Code size data: > > > > memchr-evex.o 762 byte > > memchr-evex512.o 576 byte (-24%) > > > > rawmemchr-evex.o 461 byte > > rawmemchr-evex512.o 432 byte (-6%) > > > > wmemchr-evex.o 794 byte > > wmemchr-evex512.o 576 byte (-27%) > > > > Placeholder function, not used by any processor at the moment. 
> > > > Fix page cross logic > > > > Fix 2 > > --- > > sysdeps/x86_64/multiarch/Makefile | 3 + > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > > sysdeps/x86_64/multiarch/memchr-evex-base.S | 301 +++++++++++++++++++ > > sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + > > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 9 + > > 6 files changed, 343 insertions(+) > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > index df4601c294..e974b1ad97 100644 > > --- a/sysdeps/x86_64/multiarch/Makefile > > +++ b/sysdeps/x86_64/multiarch/Makefile > > @@ -4,6 +4,7 @@ sysdep_routines += \ > > memchr-avx2 \ > > memchr-avx2-rtm \ > > memchr-evex \ > > + memchr-evex512 \ > > memchr-evex-rtm \ > > memchr-sse2 \ > > memcmp-avx2-movbe \ > > @@ -36,6 +37,7 @@ sysdep_routines += \ > > rawmemchr-avx2 \ > > rawmemchr-avx2-rtm \ > > rawmemchr-evex \ > > + rawmemchr-evex512 \ > > rawmemchr-evex-rtm \ > > rawmemchr-sse2 \ > > stpcpy-avx2 \ > > @@ -156,6 +158,7 @@ sysdep_routines += \ > > wmemchr-avx2 \ > > wmemchr-avx2-rtm \ > > wmemchr-evex \ > > + wmemchr-evex512 \ > > wmemchr-evex-rtm \ > > wmemchr-sse2 \ > > wmemcmp-avx2-movbe \ > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index 00a91123d3..529c0b0ef0 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __memchr_evex) > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > + 
(CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE (AVX512BW) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __memchr_evex512) > > X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > @@ -337,6 +342,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __rawmemchr_evex) > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE (AVX512BW) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __rawmemchr_evex512) > > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __wmemchr_evex) > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE (AVX512BW) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __wmemchr_evex512) > > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > new file mode 100644 > > index 0000000000..ea92983db8 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > @@ -0,0 +1,301 @@ > > +/* Placeholder function, not used by any processor at the moment. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. 
> > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +/* UNUSED. Exists purely as reference implementation. */ > > + > > +#include <isa-level.h> > > + > > +#if ISA_SHOULD_BUILD (4) > > + > > +# include <sysdep.h> > > + > > +# ifdef USE_AS_WMEMCHR > > +# define CHAR_SIZE 4 > > +# define VPBROADCAST vpbroadcastd > > +# define VPCMPEQ vpcmpeqd > > +# define VPCMPNE vpcmpneqd > > +# define VPMINU vpminud > > +# define VPTESTNM vptestnmd > > +# else > > +# define CHAR_SIZE 1 > > +# define VPBROADCAST vpbroadcastb > > +# define VPCMPEQ vpcmpeqb > > +# define VPCMPNE vpcmpneqb > > +# define VPMINU vpminub > > +# define VPTESTNM vptestnmb > > +# endif > > + > > +# define PAGE_SIZE 4096 > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > + > > + .section SECTION(.text), "ax", @progbits > > +/* Aligning entry point to 64 byte, provides better performance for > > + one vector length string. */ > > +ENTRY_P2ALIGN (MEMCHR, 6) > > +# ifndef USE_AS_RAWMEMCHR > > + /* Check for zero length. */ > > + test %RDX_LP, %RDX_LP > > + jz L(zero) > > + > > +# ifdef __ILP32__ > > + /* Clear the upper 32 bits. */ > > + movl %edx, %edx > > +# endif > > +# endif > > + > > + /* Broadcast CHAR to VMM(1). */ > > + VPBROADCAST %esi, %VMM(1) > > + movl %edi, %eax > > + andl $(PAGE_SIZE - 1), %eax > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > + ja L(page_cross) > > + > > + /* Compare [w]char for null, mask bit will be set for match. 
*/ > > + VPCMPEQ (%rdi), %VMM(1), %k0 > > + > > + KMOV %k0, %VRCX > > +# ifndef USE_AS_RAWMEMCHR > > + mov %rdx, %rsi > > + bsf %VRCX, %VRSI > > + cmp $CHAR_PER_VEC, %rsi > > + ja L(align_more) > > +# ifdef USE_AS_WMEMCHR > > + leaq (%rdi, %rsi, CHAR_SIZE), %rdi > > +# else > > + addq %rsi, %rdi > > +# endif > > + xor %eax, %eax > > + cmp %rsi, %rdx > > + cmova %rdi, %rax > > +# else > > + bsf %VRCX, %VRAX > > + jz L(align_more) > > + add %rdi, %rax > > +# endif > > + ret > > + > > + .p2align 5,,5 > > +L(page_cross): > > + movl %eax, %ecx > > + andl $(VEC_SIZE - 1), %ecx > > +# ifdef USE_AS_WMEMCHR > > + shrl $2, %ecx > > +# endif > > + xorq %rdi, %rax > > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0 > > + KMOV %k0, %VRSI > > + shr %cl, %VRSI > > +# ifndef USE_AS_RAWMEMCHR > > + jnz L(page_cross_end) > > + movl $CHAR_PER_VEC, %eax > > + sub %ecx, %eax > > + cmp %rax, %rdx > > + ja L(align_more) > > +# else > > + jz L(align_more) > > +# endif > > + > > +L(page_cross_end): > > +# ifndef USE_AS_RAWMEMCHR > > + bsf %VRSI, %VRCX > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > + xor %eax, %eax > > + cmp %rcx, %rdx > > + cmova %rdi, %rax > > You have a bug here test case: > > align % 4096 = 4036 > len = 8 > pos = N/A (no char in bounds). > Can you please help reproduce this issue. I tried adding this test case but it didn't fail. do_test (4036, 20, 8, 8, 0x9B); > I think the `bsf %rsi, %rcx` shouldn't have rcx as a destination > or you need to move `rdx` to `rcx` first. 
> > > +# else > > + bsf %VRSI, %VRAX > > + add %rdi, %rax > > +# endif > > + ret > > + > > +# ifndef USE_AS_RAWMEMCHR > > +L(zero): > > + xorl %eax, %eax > > + ret > > +# endif > > + > > +L(ret_vec_x2): > > + subq $-VEC_SIZE, %rdi > > +L(ret_vec_x1): > > + bsf %VRAX, %VRAX > > +# ifndef USE_AS_RAWMEMCHR > > + cmp %rax, %rdx > > + jbe L(zero) > > +# endif > > +# ifdef USE_AS_WMEMCHR > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > +# else > > + add %rdi, %rax > > +# endif > > + ret > > + > > + .p2align 5,,5 > > +L(align_more): > > +# ifndef USE_AS_RAWMEMCHR > > + mov %rdi, %rax > > +# endif > > + subq $-VEC_SIZE, %rdi > > + /* Align rdi to VEC_SIZE. */ > > + andq $-VEC_SIZE, %rdi > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq %rdi, %rax > > +# ifdef USE_AS_WMEMCHR > > + sar $2, %rax > > +# endif > > + addq %rax, %rdx > > +# endif > > + > > + /* Loop unroll 4 times for 4 vector loop. */ > > + VPCMPEQ (%rdi), %VMM(1), %k0 > > + > > + KMOV %k0, %VRAX > > + test %VRAX, %VRAX > > + jnz L(ret_vec_x1) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 > > + > > + KMOV %k0, %VRAX > > + test %VRAX, %VRAX > > + jnz L(ret_vec_x2) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 > > + > > + KMOV %k0, %VRAX > > + test %VRAX, %VRAX > > + jnz L(ret_vec_x3) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > +# endif > > + > > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 > > + > > + KMOV %k0, %VRAX > > + test %VRAX, %VRAX > > + jnz L(ret_vec_x4) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero) > > + /* Save pointer to find alignment adjustment. */ > > + movq %rdi, %rax > > +# endif > > + /* Align address to VEC_SIZE * 4 for loop. */ > > + andq $-(VEC_SIZE * 4), %rdi > > + > > + /* Add alignment difference to rdx. 
*/ > > +# ifndef USE_AS_RAWMEMCHR > > + subq %rdi, %rax > > +# ifdef USE_AS_WMEMCHR > > + shr $2, %VRAX > > +# endif > > + addq %rax, %rdx > > +# endif > > + > > + /* 4 vector loop. */ > > + .p2align 5,,11 > > +L(loop): > > + > > + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 > > + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > > + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) > > + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 > > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > > + VPTESTNM %VMM(3), %VMM(3), %k2 > > + > > + subq $-(VEC_SIZE * 4), %rdi > > + KORTEST %k2, %k3 > > +# ifdef USE_AS_RAWMEMCHR > > + jz L(loop) > > +# else > > + jnz L(loopend) > > + subq $(CHAR_PER_VEC * 4), %rdx > > + ja L(loop) > > +L(zero_2): > > + xor %eax, %eax > > + ret > > +# endif > > + > > +L(loopend): > > + VPCMPEQ (%rdi), %VMM(1), %k1 > > + KMOV %k1, %VRAX > > + test %VRAX, %VRAX > > + jnz L(ret_vec_x1) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero_2) > > +# endif > > + > > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 > > + KMOV %k1, %VRAX > > + test %VRAX, %VRAX > > + jnz L(ret_vec_x2) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero_2) > > +# endif > > + > > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 > > + KMOV %k1, %VRAX > > + test %VRAX, %VRAX > > + jnz L(ret_vec_x3) > > + > > +# ifndef USE_AS_RAWMEMCHR > > + subq $CHAR_PER_VEC, %rdx > > + jbe L(zero_2) > > +# endif > > + > > + /* At this point null [w]char must be in the fourth vector so no > > + need to check. 
*/ > > + KMOV %k3, %VRAX > > + > > +L(ret_vec_x4): > > + bsf %VRAX, %VRAX > > +# ifndef USE_AS_RAWMEMCHR > > + cmp %rax, %rdx > > + jbe L(zero) > > +# endif > > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > + .p2align 5,,5 > > +L(ret_vec_x3): > > + bsf %VRAX, %VRAX > > +# ifndef USE_AS_RAWMEMCHR > > + cmp %rax, %rdx > > + jbe L(zero) > > +# endif > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > +END (MEMCHR) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > > new file mode 100644 > > index 0000000000..002f8c8489 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > > @@ -0,0 +1,8 @@ > > +# ifndef MEMCHR > > +# define MEMCHR __memchr_evex512 > > +# endif > > + > > +#include "x86-evex512-vecs.h" > > +#include "reg-macros.h" > > + > > +#include "memchr-evex-base.S" > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > new file mode 100644 > > index 0000000000..302d3cb055 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > @@ -0,0 +1,7 @@ > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_evex512 > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > + > > +#include "memchr-evex512.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > new file mode 100644 > > index 0000000000..78ec4ee5ad > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > @@ -0,0 +1,9 @@ > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_evex512 > > +#endif > > + > > +#define MEMCHR WMEMCHR > > +#define USE_AS_WMEMCHR 1 > > + > > +#define USE_WIDE_CHAR 1 > > +#include "memchr-evex512.S" > > -- > > 2.36.1 > > ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v5] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-18 4:15 ` Sunil Pandey @ 2022-10-18 4:18 ` Noah Goldstein 2022-10-18 4:19 ` Noah Goldstein 2022-10-18 5:36 ` [PATCH v5] " Sunil Pandey 0 siblings, 2 replies; 26+ messages in thread From: Noah Goldstein @ 2022-10-18 4:18 UTC (permalink / raw) To: Sunil Pandey; +Cc: libc-alpha On Mon, Oct 17, 2022 at 11:15 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > On Mon, Oct 17, 2022 at 8:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Mon, Oct 17, 2022 at 5:00 PM Sunil K Pandey via Libc-alpha > > <libc-alpha@sourceware.org> wrote: > > > > > > Changes from v4: > > > - Replace jmp max in first vector with cmov. > > > - Replace jmp max in page cross with cmov. > > > Changes from v3: > > > - Replace VPCMPEQ in loop with VPTESTNM for 4th vector. > > > - Change first vector max check logic for terminating condition. > > > - Change page cross logic for terminating condition. > > > - Remove unnessary check in align_more block. > > > - Remove unnessary VEC(0) initialization. > > > - Define USE_WIDE_CHAR in wmemchr. > > > > > > Changes from v2: > > > - Use VEC API > > > - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2) > > > > > > Changes from v1: > > > - Change vcmp to vcmpeq and vcmpneq. > > > - Restructure unconditional loop jump logic. > > > - Improve 4 vector loop logic. > > > - Fix bug near page boundary. > > > > > > This patch implements following evex512 version of string functions. > > > evex512 version takes up to 30% less cycle as compared to evex, > > > depending on length and alignment. > > > > > > - memchr function using 512 bit vectors. > > > - rawmemchr function using 512 bit vectors. > > > - wmemchr function using 512 bit vectors. 
> > > > > > Code size data: > > > > > > memchr-evex.o 762 byte > > > memchr-evex512.o 576 byte (-24%) > > > > > > rawmemchr-evex.o 461 byte > > > rawmemchr-evex512.o 432 byte (-6%) > > > > > > wmemchr-evex.o 794 byte > > > wmemchr-evex512.o 576 byte (-27%) > > > > > > Placeholder function, not used by any processor at the moment. > > > > > > Fix page cross logic > > > > > > Fix 2 > > > --- > > > sysdeps/x86_64/multiarch/Makefile | 3 + > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > > > sysdeps/x86_64/multiarch/memchr-evex-base.S | 301 +++++++++++++++++++ > > > sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + > > > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > > > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 9 + > > > 6 files changed, 343 insertions(+) > > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > > > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > > index df4601c294..e974b1ad97 100644 > > > --- a/sysdeps/x86_64/multiarch/Makefile > > > +++ b/sysdeps/x86_64/multiarch/Makefile > > > @@ -4,6 +4,7 @@ sysdep_routines += \ > > > memchr-avx2 \ > > > memchr-avx2-rtm \ > > > memchr-evex \ > > > + memchr-evex512 \ > > > memchr-evex-rtm \ > > > memchr-sse2 \ > > > memcmp-avx2-movbe \ > > > @@ -36,6 +37,7 @@ sysdep_routines += \ > > > rawmemchr-avx2 \ > > > rawmemchr-avx2-rtm \ > > > rawmemchr-evex \ > > > + rawmemchr-evex512 \ > > > rawmemchr-evex-rtm \ > > > rawmemchr-sse2 \ > > > stpcpy-avx2 \ > > > @@ -156,6 +158,7 @@ sysdep_routines += \ > > > wmemchr-avx2 \ > > > wmemchr-avx2-rtm \ > > > wmemchr-evex \ > > > + wmemchr-evex512 \ > > > wmemchr-evex-rtm \ > > > wmemchr-sse2 \ > > > wmemcmp-avx2-movbe \ > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c 
b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > index 00a91123d3..529c0b0ef0 100644 > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __memchr_evex) > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > + __memchr_evex512) > > > X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > @@ -337,6 +342,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __rawmemchr_evex) > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > + __rawmemchr_evex512) > > > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __wmemchr_evex) > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > + __wmemchr_evex512) > > > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > > new file mode 100644 > > > index 0000000000..ea92983db8 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S 
> > > @@ -0,0 +1,301 @@ > > > +/* Placeholder function, not used by any processor at the moment. > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > + This file is part of the GNU C Library. > > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > + Lesser General Public License for more details. > > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library; if not, see > > > + <https://www.gnu.org/licenses/>. */ > > > + > > > +/* UNUSED. Exists purely as reference implementation. */ > > > + > > > +#include <isa-level.h> > > > + > > > +#if ISA_SHOULD_BUILD (4) > > > + > > > +# include <sysdep.h> > > > + > > > +# ifdef USE_AS_WMEMCHR > > > +# define CHAR_SIZE 4 > > > +# define VPBROADCAST vpbroadcastd > > > +# define VPCMPEQ vpcmpeqd > > > +# define VPCMPNE vpcmpneqd > > > +# define VPMINU vpminud > > > +# define VPTESTNM vptestnmd > > > +# else > > > +# define CHAR_SIZE 1 > > > +# define VPBROADCAST vpbroadcastb > > > +# define VPCMPEQ vpcmpeqb > > > +# define VPCMPNE vpcmpneqb > > > +# define VPMINU vpminub > > > +# define VPTESTNM vptestnmb > > > +# endif > > > + > > > +# define PAGE_SIZE 4096 > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > + > > > + .section SECTION(.text), "ax", @progbits > > > +/* Aligning entry point to 64 byte, provides better performance for > > > + one vector length string. */ > > > +ENTRY_P2ALIGN (MEMCHR, 6) > > > +# ifndef USE_AS_RAWMEMCHR > > > + /* Check for zero length. 
*/ > > > + test %RDX_LP, %RDX_LP > > > + jz L(zero) > > > + > > > +# ifdef __ILP32__ > > > + /* Clear the upper 32 bits. */ > > > + movl %edx, %edx > > > +# endif > > > +# endif > > > + > > > + /* Broadcast CHAR to VMM(1). */ > > > + VPBROADCAST %esi, %VMM(1) > > > + movl %edi, %eax > > > + andl $(PAGE_SIZE - 1), %eax > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > + ja L(page_cross) > > > + > > > + /* Compare [w]char for null, mask bit will be set for match. */ > > > + VPCMPEQ (%rdi), %VMM(1), %k0 > > > + > > > + KMOV %k0, %VRCX > > > +# ifndef USE_AS_RAWMEMCHR > > > + mov %rdx, %rsi > > > + bsf %VRCX, %VRSI > > > + cmp $CHAR_PER_VEC, %rsi > > > + ja L(align_more) > > > +# ifdef USE_AS_WMEMCHR > > > + leaq (%rdi, %rsi, CHAR_SIZE), %rdi > > > +# else > > > + addq %rsi, %rdi > > > +# endif > > > + xor %eax, %eax > > > + cmp %rsi, %rdx > > > + cmova %rdi, %rax > > > +# else > > > + bsf %VRCX, %VRAX > > > + jz L(align_more) > > > + add %rdi, %rax > > > +# endif > > > + ret > > > + > > > + .p2align 5,,5 > > > +L(page_cross): > > > + movl %eax, %ecx > > > + andl $(VEC_SIZE - 1), %ecx > > > +# ifdef USE_AS_WMEMCHR > > > + shrl $2, %ecx > > > +# endif > > > + xorq %rdi, %rax > > > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0 > > > + KMOV %k0, %VRSI > > > + shr %cl, %VRSI > > > +# ifndef USE_AS_RAWMEMCHR > > > + jnz L(page_cross_end) > > > + movl $CHAR_PER_VEC, %eax > > > + sub %ecx, %eax > > > + cmp %rax, %rdx > > > + ja L(align_more) > > > +# else > > > + jz L(align_more) > > > +# endif > > > + > > > +L(page_cross_end): > > > +# ifndef USE_AS_RAWMEMCHR > > > + bsf %VRSI, %VRCX > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > + xor %eax, %eax > > > + cmp %rcx, %rdx > > > + cmova %rdi, %rax > > > > You have a bug here test case: > > > > align % 4096 = 4036 > > len = 8 > > pos = N/A (no char in bounds). > > > > Can you please help reproduce this issue. > I tried adding this test case but it didn't fail. 
> > do_test (4036, 20, 8, 8, 0x9B); position need to not be in the first VEC (even out of bounds). > > > > I think the `bsf %rsi, %rcx` shouldn't have rcx as a destination > > or you need to move `rdx` to `rcx` first. > > > > > +# else > > > + bsf %VRSI, %VRAX > > > + add %rdi, %rax > > > +# endif > > > + ret > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > +L(zero): > > > + xorl %eax, %eax > > > + ret > > > +# endif > > > + > > > +L(ret_vec_x2): > > > + subq $-VEC_SIZE, %rdi > > > +L(ret_vec_x1): > > > + bsf %VRAX, %VRAX > > > +# ifndef USE_AS_RAWMEMCHR > > > + cmp %rax, %rdx > > > + jbe L(zero) > > > +# endif > > > +# ifdef USE_AS_WMEMCHR > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > +# else > > > + add %rdi, %rax > > > +# endif > > > + ret > > > + > > > + .p2align 5,,5 > > > +L(align_more): > > > +# ifndef USE_AS_RAWMEMCHR > > > + mov %rdi, %rax > > > +# endif > > > + subq $-VEC_SIZE, %rdi > > > + /* Align rdi to VEC_SIZE. */ > > > + andq $-VEC_SIZE, %rdi > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq %rdi, %rax > > > +# ifdef USE_AS_WMEMCHR > > > + sar $2, %rax > > > +# endif > > > + addq %rax, %rdx > > > +# endif > > > + > > > + /* Loop unroll 4 times for 4 vector loop. 
*/ > > > + VPCMPEQ (%rdi), %VMM(1), %k0 > > > + > > > + KMOV %k0, %VRAX > > > + test %VRAX, %VRAX > > > + jnz L(ret_vec_x1) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 > > > + > > > + KMOV %k0, %VRAX > > > + test %VRAX, %VRAX > > > + jnz L(ret_vec_x2) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 > > > + > > > + KMOV %k0, %VRAX > > > + test %VRAX, %VRAX > > > + jnz L(ret_vec_x3) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > +# endif > > > + > > > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 > > > + > > > + KMOV %k0, %VRAX > > > + test %VRAX, %VRAX > > > + jnz L(ret_vec_x4) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero) > > > + /* Save pointer to find alignment adjustment. */ > > > + movq %rdi, %rax > > > +# endif > > > + /* Align address to VEC_SIZE * 4 for loop. */ > > > + andq $-(VEC_SIZE * 4), %rdi > > > + > > > + /* Add alignment difference to rdx. */ > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq %rdi, %rax > > > +# ifdef USE_AS_WMEMCHR > > > + shr $2, %VRAX > > > +# endif > > > + addq %rax, %rdx > > > +# endif > > > + > > > + /* 4 vector loop. 
*/ > > > + .p2align 5,,11 > > > +L(loop): > > > + > > > + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 > > > + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > > > + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) > > > + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 > > > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > > > + VPTESTNM %VMM(3), %VMM(3), %k2 > > > + > > > + subq $-(VEC_SIZE * 4), %rdi > > > + KORTEST %k2, %k3 > > > +# ifdef USE_AS_RAWMEMCHR > > > + jz L(loop) > > > +# else > > > + jnz L(loopend) > > > + subq $(CHAR_PER_VEC * 4), %rdx > > > + ja L(loop) > > > +L(zero_2): > > > + xor %eax, %eax > > > + ret > > > +# endif > > > + > > > +L(loopend): > > > + VPCMPEQ (%rdi), %VMM(1), %k1 > > > + KMOV %k1, %VRAX > > > + test %VRAX, %VRAX > > > + jnz L(ret_vec_x1) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero_2) > > > +# endif > > > + > > > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 > > > + KMOV %k1, %VRAX > > > + test %VRAX, %VRAX > > > + jnz L(ret_vec_x2) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero_2) > > > +# endif > > > + > > > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 > > > + KMOV %k1, %VRAX > > > + test %VRAX, %VRAX > > > + jnz L(ret_vec_x3) > > > + > > > +# ifndef USE_AS_RAWMEMCHR > > > + subq $CHAR_PER_VEC, %rdx > > > + jbe L(zero_2) > > > +# endif > > > + > > > + /* At this point null [w]char must be in the fourth vector so no > > > + need to check. 
*/ > > > + KMOV %k3, %VRAX > > > + > > > +L(ret_vec_x4): > > > + bsf %VRAX, %VRAX > > > +# ifndef USE_AS_RAWMEMCHR > > > + cmp %rax, %rdx > > > + jbe L(zero) > > > +# endif > > > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > > + ret > > > + > > > + .p2align 5,,5 > > > +L(ret_vec_x3): > > > + bsf %VRAX, %VRAX > > > +# ifndef USE_AS_RAWMEMCHR > > > + cmp %rax, %rdx > > > + jbe L(zero) > > > +# endif > > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > > + ret > > > + > > > +END (MEMCHR) > > > +#endif > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > > > new file mode 100644 > > > index 0000000000..002f8c8489 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > > > @@ -0,0 +1,8 @@ > > > +# ifndef MEMCHR > > > +# define MEMCHR __memchr_evex512 > > > +# endif > > > + > > > +#include "x86-evex512-vecs.h" > > > +#include "reg-macros.h" > > > + > > > +#include "memchr-evex-base.S" > > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > new file mode 100644 > > > index 0000000000..302d3cb055 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > @@ -0,0 +1,7 @@ > > > +#ifndef RAWMEMCHR > > > +# define RAWMEMCHR __rawmemchr_evex512 > > > +#endif > > > +#define USE_AS_RAWMEMCHR 1 > > > +#define MEMCHR RAWMEMCHR > > > + > > > +#include "memchr-evex512.S" > > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > new file mode 100644 > > > index 0000000000..78ec4ee5ad > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > @@ -0,0 +1,9 @@ > > > +#ifndef WMEMCHR > > > +# define WMEMCHR __wmemchr_evex512 > > > +#endif > > > + > > > +#define MEMCHR WMEMCHR > > > +#define USE_AS_WMEMCHR 1 > > > + > > > +#define USE_WIDE_CHAR 1 > > > +#include "memchr-evex512.S" > > > -- > > > 2.36.1 > > > ^ permalink raw reply 
[flat|nested] 26+ messages in thread
* Re: [PATCH v5] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-18 4:18 ` Noah Goldstein @ 2022-10-18 4:19 ` Noah Goldstein 2022-10-18 8:02 ` [PATCH v6] " Sunil K Pandey 2022-10-18 5:36 ` [PATCH v5] " Sunil Pandey 1 sibling, 1 reply; 26+ messages in thread From: Noah Goldstein @ 2022-10-18 4:19 UTC (permalink / raw) To: Sunil Pandey; +Cc: libc-alpha On Mon, Oct 17, 2022 at 11:18 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Mon, Oct 17, 2022 at 11:15 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Mon, Oct 17, 2022 at 8:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > On Mon, Oct 17, 2022 at 5:00 PM Sunil K Pandey via Libc-alpha > > > <libc-alpha@sourceware.org> wrote: > > > > > > > > Changes from v4: > > > > - Replace jmp max in first vector with cmov. > > > > - Replace jmp max in page cross with cmov. > > > > Changes from v3: > > > > - Replace VPCMPEQ in loop with VPTESTNM for 4th vector. > > > > - Change first vector max check logic for terminating condition. > > > > - Change page cross logic for terminating condition. > > > > - Remove unnessary check in align_more block. > > > > - Remove unnessary VEC(0) initialization. > > > > - Define USE_WIDE_CHAR in wmemchr. > > > > > > > > Changes from v2: > > > > - Use VEC API > > > > - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2) > > > > > > > > Changes from v1: > > > > - Change vcmp to vcmpeq and vcmpneq. > > > > - Restructure unconditional loop jump logic. > > > > - Improve 4 vector loop logic. > > > > - Fix bug near page boundary. > > > > > > > > This patch implements following evex512 version of string functions. > > > > evex512 version takes up to 30% less cycle as compared to evex, > > > > depending on length and alignment. > > > > > > > > - memchr function using 512 bit vectors. > > > > - rawmemchr function using 512 bit vectors. > > > > - wmemchr function using 512 bit vectors. 
> > > > > > > > Code size data: > > > > > > > > memchr-evex.o 762 byte > > > > memchr-evex512.o 576 byte (-24%) > > > > > > > > rawmemchr-evex.o 461 byte > > > > rawmemchr-evex512.o 432 byte (-6%) > > > > > > > > wmemchr-evex.o 794 byte > > > > wmemchr-evex512.o 576 byte (-27%) > > > > > > > > Placeholder function, not used by any processor at the moment. > > > > > > > > Fix page cross logic > > > > > > > > Fix 2 > > > > --- > > > > sysdeps/x86_64/multiarch/Makefile | 3 + > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > > > > sysdeps/x86_64/multiarch/memchr-evex-base.S | 301 +++++++++++++++++++ > > > > sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + > > > > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > > > > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 9 + > > > > 6 files changed, 343 insertions(+) > > > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > > > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > > > > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > > > index df4601c294..e974b1ad97 100644 > > > > --- a/sysdeps/x86_64/multiarch/Makefile > > > > +++ b/sysdeps/x86_64/multiarch/Makefile > > > > @@ -4,6 +4,7 @@ sysdep_routines += \ > > > > memchr-avx2 \ > > > > memchr-avx2-rtm \ > > > > memchr-evex \ > > > > + memchr-evex512 \ > > > > memchr-evex-rtm \ > > > > memchr-sse2 \ > > > > memcmp-avx2-movbe \ > > > > @@ -36,6 +37,7 @@ sysdep_routines += \ > > > > rawmemchr-avx2 \ > > > > rawmemchr-avx2-rtm \ > > > > rawmemchr-evex \ > > > > + rawmemchr-evex512 \ > > > > rawmemchr-evex-rtm \ > > > > rawmemchr-sse2 \ > > > > stpcpy-avx2 \ > > > > @@ -156,6 +158,7 @@ sysdep_routines += \ > > > > wmemchr-avx2 \ > > > > wmemchr-avx2-rtm \ > > > > wmemchr-evex \ > > > > + wmemchr-evex512 \ > > > > wmemchr-evex-rtm \ > > > > wmemchr-sse2 \ 
> > > > wmemcmp-avx2-movbe \ > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > index 00a91123d3..529c0b0ef0 100644 > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > __memchr_evex) > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > + __memchr_evex512) > > > > X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > @@ -337,6 +342,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > __rawmemchr_evex) > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > + __rawmemchr_evex512) > > > > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > __wmemchr_evex) > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > + __wmemchr_evex512) > > > > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S 
b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > > > new file mode 100644 > > > > index 0000000000..ea92983db8 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > > > @@ -0,0 +1,301 @@ > > > > +/* Placeholder function, not used by any processor at the moment. > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > + This file is part of the GNU C Library. > > > > + > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > + modify it under the terms of the GNU Lesser General Public > > > > + License as published by the Free Software Foundation; either > > > > + version 2.1 of the License, or (at your option) any later version. > > > > + > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > + Lesser General Public License for more details. > > > > + > > > > + You should have received a copy of the GNU Lesser General Public > > > > + License along with the GNU C Library; if not, see > > > > + <https://www.gnu.org/licenses/>. */ > > > > + > > > > +/* UNUSED. Exists purely as reference implementation. 
*/ > > > > + > > > > +#include <isa-level.h> > > > > + > > > > +#if ISA_SHOULD_BUILD (4) > > > > + > > > > +# include <sysdep.h> > > > > + > > > > +# ifdef USE_AS_WMEMCHR > > > > +# define CHAR_SIZE 4 > > > > +# define VPBROADCAST vpbroadcastd > > > > +# define VPCMPEQ vpcmpeqd > > > > +# define VPCMPNE vpcmpneqd > > > > +# define VPMINU vpminud > > > > +# define VPTESTNM vptestnmd > > > > +# else > > > > +# define CHAR_SIZE 1 > > > > +# define VPBROADCAST vpbroadcastb > > > > +# define VPCMPEQ vpcmpeqb > > > > +# define VPCMPNE vpcmpneqb > > > > +# define VPMINU vpminub > > > > +# define VPTESTNM vptestnmb > > > > +# endif > > > > + > > > > +# define PAGE_SIZE 4096 > > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > + > > > > + .section SECTION(.text), "ax", @progbits > > > > +/* Aligning entry point to 64 byte, provides better performance for > > > > + one vector length string. */ > > > > +ENTRY_P2ALIGN (MEMCHR, 6) > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + /* Check for zero length. */ > > > > + test %RDX_LP, %RDX_LP > > > > + jz L(zero) > > > > + > > > > +# ifdef __ILP32__ > > > > + /* Clear the upper 32 bits. */ > > > > + movl %edx, %edx > > > > +# endif > > > > +# endif > > > > + > > > > + /* Broadcast CHAR to VMM(1). */ > > > > + VPBROADCAST %esi, %VMM(1) > > > > + movl %edi, %eax > > > > + andl $(PAGE_SIZE - 1), %eax > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > > + ja L(page_cross) > > > > + > > > > + /* Compare [w]char for null, mask bit will be set for match. */ > > > > + VPCMPEQ (%rdi), %VMM(1), %k0 > > > > + > > > > + KMOV %k0, %VRCX > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + mov %rdx, %rsi > > > > + bsf %VRCX, %VRSI This needs to be `bsfq` as `bsfl` (for VEC_SIZE == 32) has undefined result in upper bits of dst. 
> > > > + cmp $CHAR_PER_VEC, %rsi > > > > + ja L(align_more) > > > > +# ifdef USE_AS_WMEMCHR > > > > + leaq (%rdi, %rsi, CHAR_SIZE), %rdi > > > > +# else > > > > + addq %rsi, %rdi > > > > +# endif > > > > + xor %eax, %eax > > > > + cmp %rsi, %rdx > > > > + cmova %rdi, %rax > > > > +# else > > > > + bsf %VRCX, %VRAX > > > > + jz L(align_more) > > > > + add %rdi, %rax > > > > +# endif > > > > + ret > > > > + > > > > + .p2align 5,,5 > > > > +L(page_cross): > > > > + movl %eax, %ecx > > > > + andl $(VEC_SIZE - 1), %ecx > > > > +# ifdef USE_AS_WMEMCHR > > > > + shrl $2, %ecx > > > > +# endif > > > > + xorq %rdi, %rax > > > > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0 > > > > + KMOV %k0, %VRSI > > > > + shr %cl, %VRSI > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + jnz L(page_cross_end) > > > > + movl $CHAR_PER_VEC, %eax > > > > + sub %ecx, %eax > > > > + cmp %rax, %rdx > > > > + ja L(align_more) > > > > +# else > > > > + jz L(align_more) > > > > +# endif > > > > + > > > > +L(page_cross_end): > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + bsf %VRSI, %VRCX This needs to be `bsfq` as `bsfl` (for VEC_SIZE == 32) has undefined result in upper bits of dst. > > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > > + xor %eax, %eax > > > > + cmp %rcx, %rdx > > > > + cmova %rdi, %rax > > > > > > You have a bug here test case: > > > > > > align % 4096 = 4036 > > > len = 8 > > > pos = N/A (no char in bounds). > > > > > > > Can you please help reproduce this issue. > > I tried adding this test case but it didn't fail. > > > > do_test (4036, 20, 8, 8, 0x9B); > position need to not be in the first VEC (even out of bounds). > > > > > > > I think the `bsf %rsi, %rcx` shouldn't have rcx as a destination > > > or you need to move `rdx` to `rcx` first. 
> > > > > > > +# else > > > > + bsf %VRSI, %VRAX > > > > + add %rdi, %rax > > > > +# endif > > > > + ret > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > +L(zero): > > > > + xorl %eax, %eax > > > > + ret > > > > +# endif > > > > + > > > > +L(ret_vec_x2): > > > > + subq $-VEC_SIZE, %rdi > > > > +L(ret_vec_x1): > > > > + bsf %VRAX, %VRAX > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + cmp %rax, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > +# ifdef USE_AS_WMEMCHR > > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > > +# else > > > > + add %rdi, %rax > > > > +# endif > > > > + ret > > > > + > > > > + .p2align 5,,5 > > > > +L(align_more): > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + mov %rdi, %rax > > > > +# endif > > > > + subq $-VEC_SIZE, %rdi > > > > + /* Align rdi to VEC_SIZE. */ > > > > + andq $-VEC_SIZE, %rdi > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq %rdi, %rax > > > > +# ifdef USE_AS_WMEMCHR > > > > + sar $2, %rax > > > > +# endif > > > > + addq %rax, %rdx > > > > +# endif > > > > + > > > > + /* Loop unroll 4 times for 4 vector loop. 
*/ > > > > + VPCMPEQ (%rdi), %VMM(1), %k0 > > > > + > > > > + KMOV %k0, %VRAX > > > > + test %VRAX, %VRAX > > > > + jnz L(ret_vec_x1) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + > > > > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 > > > > + > > > > + KMOV %k0, %VRAX > > > > + test %VRAX, %VRAX > > > > + jnz L(ret_vec_x2) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + > > > > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 > > > > + > > > > + KMOV %k0, %VRAX > > > > + test %VRAX, %VRAX > > > > + jnz L(ret_vec_x3) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + > > > > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 > > > > + > > > > + KMOV %k0, %VRAX > > > > + test %VRAX, %VRAX > > > > + jnz L(ret_vec_x4) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > + /* Save pointer to find alignment adjustment. */ > > > > + movq %rdi, %rax > > > > +# endif > > > > + /* Align address to VEC_SIZE * 4 for loop. */ > > > > + andq $-(VEC_SIZE * 4), %rdi > > > > + > > > > + /* Add alignment difference to rdx. */ > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq %rdi, %rax > > > > +# ifdef USE_AS_WMEMCHR > > > > + shr $2, %VRAX > > > > +# endif > > > > + addq %rax, %rdx > > > > +# endif > > > > + > > > > + /* 4 vector loop. 
*/ > > > > + .p2align 5,,11 > > > > +L(loop): > > > > + > > > > + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 > > > > + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > > > > + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) > > > > + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 > > > > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > > > > + VPTESTNM %VMM(3), %VMM(3), %k2 > > > > + > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > + KORTEST %k2, %k3 > > > > +# ifdef USE_AS_RAWMEMCHR > > > > + jz L(loop) > > > > +# else > > > > + jnz L(loopend) > > > > + subq $(CHAR_PER_VEC * 4), %rdx > > > > + ja L(loop) > > > > +L(zero_2): > > > > + xor %eax, %eax > > > > + ret > > > > +# endif > > > > + > > > > +L(loopend): > > > > + VPCMPEQ (%rdi), %VMM(1), %k1 > > > > + KMOV %k1, %VRAX > > > > + test %VRAX, %VRAX > > > > + jnz L(ret_vec_x1) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero_2) > > > > +# endif > > > > + > > > > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 > > > > + KMOV %k1, %VRAX > > > > + test %VRAX, %VRAX > > > > + jnz L(ret_vec_x2) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero_2) > > > > +# endif > > > > + > > > > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 > > > > + KMOV %k1, %VRAX > > > > + test %VRAX, %VRAX > > > > + jnz L(ret_vec_x3) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero_2) > > > > +# endif > > > > + > > > > + /* At this point null [w]char must be in the fourth vector so no > > > > + need to check. 
*/ > > > > + KMOV %k3, %VRAX > > > > + > > > > +L(ret_vec_x4): > > > > + bsf %VRAX, %VRAX > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + cmp %rax, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > > > + ret > > > > + > > > > + .p2align 5,,5 > > > > +L(ret_vec_x3): > > > > + bsf %VRAX, %VRAX > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + cmp %rax, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > > > + ret > > > > + > > > > +END (MEMCHR) > > > > +#endif > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > > > > new file mode 100644 > > > > index 0000000000..002f8c8489 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > > > > @@ -0,0 +1,8 @@ > > > > +# ifndef MEMCHR > > > > +# define MEMCHR __memchr_evex512 > > > > +# endif > > > > + > > > > +#include "x86-evex512-vecs.h" > > > > +#include "reg-macros.h" > > > > + > > > > +#include "memchr-evex-base.S" > > > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > > new file mode 100644 > > > > index 0000000000..302d3cb055 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > > @@ -0,0 +1,7 @@ > > > > +#ifndef RAWMEMCHR > > > > +# define RAWMEMCHR __rawmemchr_evex512 > > > > +#endif > > > > +#define USE_AS_RAWMEMCHR 1 > > > > +#define MEMCHR RAWMEMCHR > > > > + > > > > +#include "memchr-evex512.S" > > > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > new file mode 100644 > > > > index 0000000000..78ec4ee5ad > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > @@ -0,0 +1,9 @@ > > > > +#ifndef WMEMCHR > > > > +# define WMEMCHR __wmemchr_evex512 > > > > +#endif > > > > + > > > > +#define MEMCHR WMEMCHR > > > > +#define USE_AS_WMEMCHR 1 > > > > + > 
> > > +#define USE_WIDE_CHAR 1 > > > > +#include "memchr-evex512.S" > > > > -- > > > > 2.36.1 > > > > ^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v6] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-18 4:19 ` Noah Goldstein @ 2022-10-18 8:02 ` Sunil K Pandey 2022-10-18 17:12 ` Noah Goldstein 0 siblings, 1 reply; 26+ messages in thread From: Sunil K Pandey @ 2022-10-18 8:02 UTC (permalink / raw) To: libc-alpha Changes from v5: - Change bsf to bsfq in 1st vector check. - Change bsf to bsfq in page cross logic. - Fix bug in page cross logic. Changes from v4: - Replace jmp max in first vector with cmov. - Replace jmp max in page cross with cmov. Changes from v3: - Replace VPCMPEQ in loop with VPTESTNM for 4th vector. - Change first vector max check logic for terminating condition. - Change page cross logic for terminating condition. - Remove unnecessary check in align_more block. - Remove unnecessary VEC(0) initialization. - Define USE_WIDE_CHAR in wmemchr. Changes from v2: - Use VEC API - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2) Changes from v1: - Change vcmp to vcmpeq and vcmpneq. - Restructure unconditional loop jump logic. - Improve 4 vector loop logic. - Fix bug near page boundary. This patch implements the following evex512 versions of string functions. The evex512 version takes up to 30% fewer cycles as compared to evex, depending on length and alignment. - memchr function using 512 bit vectors. - rawmemchr function using 512 bit vectors. - wmemchr function using 512 bit vectors. Code size data: memchr-evex.o 762 byte memchr-evex512.o 576 byte (-24%) rawmemchr-evex.o 461 byte rawmemchr-evex512.o 432 byte (-6%) wmemchr-evex.o 794 byte wmemchr-evex512.o 576 byte (-27%) Placeholder function, not used by any processor at the moment. 
--- sysdeps/x86_64/multiarch/Makefile | 3 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + sysdeps/x86_64/multiarch/memchr-evex-base.S | 302 +++++++++++++++++++ sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + sysdeps/x86_64/multiarch/wmemchr-evex512.S | 9 + 6 files changed, 344 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index df4601c294..e974b1ad97 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,6 +4,7 @@ sysdep_routines += \ memchr-avx2 \ memchr-avx2-rtm \ memchr-evex \ + memchr-evex512 \ memchr-evex-rtm \ memchr-sse2 \ memcmp-avx2-movbe \ @@ -36,6 +37,7 @@ sysdep_routines += \ rawmemchr-avx2 \ rawmemchr-avx2-rtm \ rawmemchr-evex \ + rawmemchr-evex512 \ rawmemchr-evex-rtm \ rawmemchr-sse2 \ stpcpy-avx2 \ @@ -156,6 +158,7 @@ sysdep_routines += \ wmemchr-avx2 \ wmemchr-avx2-rtm \ wmemchr-evex \ + wmemchr-evex512 \ wmemchr-evex-rtm \ wmemchr-sse2 \ wmemcmp-avx2-movbe \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 00a91123d3..529c0b0ef0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __memchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __memchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -337,6 +342,11 @@ 
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __rawmemchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __rawmemchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wmemchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wmemchr_evex512) X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S new file mode 100644 index 0000000000..2a692799b1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S @@ -0,0 +1,302 @@ +/* Placeholder function, not used by any processor at the moment. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* UNUSED. 
Exists purely as reference implementation. */ + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (4) + +# include <sysdep.h> + +# ifdef USE_AS_WMEMCHR +# define CHAR_SIZE 4 +# define VPBROADCAST vpbroadcastd +# define VPCMPEQ vpcmpeqd +# define VPCMPNE vpcmpneqd +# define VPMINU vpminud +# define VPTESTNM vptestnmd +# else +# define CHAR_SIZE 1 +# define VPBROADCAST vpbroadcastb +# define VPCMPEQ vpcmpeqb +# define VPCMPNE vpcmpneqb +# define VPMINU vpminub +# define VPTESTNM vptestnmb +# endif + +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text), "ax", @progbits +/* Aligning entry point to 64 byte, provides better performance for + one vector length string. */ +ENTRY_P2ALIGN (MEMCHR, 6) +# ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ + test %RDX_LP, %RDX_LP + jz L(zero) + +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif +# endif + + /* Broadcast CHAR to VMM(1). */ + VPBROADCAST %esi, %VMM(1) + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(page_cross) + + /* Compare [w]char for null, mask bit will be set for match. 
*/ + VPCMPEQ (%rdi), %VMM(1), %k0 + + KMOV %k0, %VRCX +# ifndef USE_AS_RAWMEMCHR + mov %rdx, %rsi + bsfq %rcx, %rsi + cmp $CHAR_PER_VEC, %rsi + ja L(align_more) +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rsi, CHAR_SIZE), %rdi +# else + addq %rsi, %rdi +# endif + xor %eax, %eax + cmp %rsi, %rdx + cmova %rdi, %rax +# else + bsf %VRCX, %VRAX + jz L(align_more) + add %rdi, %rax +# endif + ret + + .p2align 5,,5 +L(page_cross): + movl %eax, %ecx + andl $(VEC_SIZE - 1), %ecx +# ifdef USE_AS_WMEMCHR + shrl $2, %ecx +# endif + xorq %rdi, %rax + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0 + KMOV %k0, %VRSI + shr %cl, %VRSI +# ifndef USE_AS_RAWMEMCHR + jnz L(page_cross_end) + movl $CHAR_PER_VEC, %eax + sub %ecx, %eax + cmp %rax, %rdx + ja L(align_more) +# else + jz L(align_more) +# endif + +L(page_cross_end): +# ifndef USE_AS_RAWMEMCHR + bsfq %rsi, %rcx + jz L(zero) + leaq (%rdi, %rcx, CHAR_SIZE), %rdi + xor %eax, %eax + cmp %rcx, %rdx + cmova %rdi, %rax +# else + bsf %VRSI, %VRAX + add %rdi, %rax +# endif + ret + +# ifndef USE_AS_RAWMEMCHR +L(zero): + xorl %eax, %eax + ret +# endif + +L(ret_vec_x2): + subq $-VEC_SIZE, %rdi +L(ret_vec_x1): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + cmp %rax, %rdx + jbe L(zero) +# endif +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + add %rdi, %rax +# endif + ret + + .p2align 5,,5 +L(align_more): +# ifndef USE_AS_RAWMEMCHR + mov %rdi, %rax +# endif + subq $-VEC_SIZE, %rdi + /* Align rdi to VEC_SIZE. */ + andq $-VEC_SIZE, %rdi + +# ifndef USE_AS_RAWMEMCHR + subq %rdi, %rax +# ifdef USE_AS_WMEMCHR + sar $2, %rax +# endif + addq %rax, %rdx +# endif + + /* Loop unroll 4 times for 4 vector loop. 
*/ + VPCMPEQ (%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x1) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x2) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x3) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) +# endif + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x4) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero) + /* Save pointer to find alignment adjustment. */ + movq %rdi, %rax +# endif + /* Align address to VEC_SIZE * 4 for loop. */ + andq $-(VEC_SIZE * 4), %rdi + + /* Add alignment difference to rdx. */ +# ifndef USE_AS_RAWMEMCHR + subq %rdi, %rax +# ifdef USE_AS_WMEMCHR + shr $2, %VRAX +# endif + addq %rax, %rdx +# endif + + /* 4 vector loop. 
*/ + .p2align 5,,11 +L(loop): + + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} + VPTESTNM %VMM(3), %VMM(3), %k2 + + subq $-(VEC_SIZE * 4), %rdi + KORTEST %k2, %k3 +# ifdef USE_AS_RAWMEMCHR + jz L(loop) +# else + jnz L(loopend) + subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop) +L(zero_2): + xor %eax, %eax + ret +# endif + +L(loopend): + VPCMPEQ (%rdi), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x1) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero_2) +# endif + + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x2) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero_2) +# endif + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(ret_vec_x3) + +# ifndef USE_AS_RAWMEMCHR + subq $CHAR_PER_VEC, %rdx + jbe L(zero_2) +# endif + + /* At this point null [w]char must be in the fourth vector so no + need to check. 
*/ + KMOV %k3, %VRAX + +L(ret_vec_x4): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + cmp %rax, %rdx + jbe L(zero) +# endif + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 5,,5 +L(ret_vec_x3): + bsf %VRAX, %VRAX +# ifndef USE_AS_RAWMEMCHR + cmp %rax, %rdx + jbe L(zero) +# endif + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +END (MEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S new file mode 100644 index 0000000000..002f8c8489 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S @@ -0,0 +1,8 @@ +# ifndef MEMCHR +# define MEMCHR __memchr_evex512 +# endif + +#include "x86-evex512-vecs.h" +#include "reg-macros.h" + +#include "memchr-evex-base.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S new file mode 100644 index 0000000000..302d3cb055 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S @@ -0,0 +1,7 @@ +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_evex512 +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR + +#include "memchr-evex512.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S new file mode 100644 index 0000000000..78ec4ee5ad --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S @@ -0,0 +1,9 @@ +#ifndef WMEMCHR +# define WMEMCHR __wmemchr_evex512 +#endif + +#define MEMCHR WMEMCHR +#define USE_AS_WMEMCHR 1 + +#define USE_WIDE_CHAR 1 +#include "memchr-evex512.S" -- 2.36.1 ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v6] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-18 8:02 ` [PATCH v6] " Sunil K Pandey @ 2022-10-18 17:12 ` Noah Goldstein 0 siblings, 0 replies; 26+ messages in thread From: Noah Goldstein @ 2022-10-18 17:12 UTC (permalink / raw) To: Sunil K Pandey; +Cc: libc-alpha On Tue, Oct 18, 2022 at 3:04 AM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > Changes from v5: > - Change bsf to bsfq in 1st vector check. > - Change bsf to bsfq in page cross logic. > - Fix bug in page cross logic. > Changes from v4: > - Replace jmp max in first vector with cmov. > - Replace jmp max in page cross with cmov. > Changes from v3: > - Replace VPCMPEQ in loop with VPTESTNM for 4th vector. > - Change first vector max check logic for terminating condition. > - Change page cross logic for terminating condition. > - Remove unnessary check in align_more block. > - Remove unnessary VEC(0) initialization. > - Define USE_WIDE_CHAR in wmemchr. > > Changes from v2: > - Use VEC API > - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2) > > Changes from v1: > - Change vcmp to vcmpeq and vcmpneq. > - Restructure unconditional loop jump logic. > - Improve 4 vector loop logic. > - Fix bug near page boundary. > > This patch implements following evex512 version of string functions. > evex512 version takes up to 30% less cycle as compared to evex, > depending on length and alignment. > > - memchr function using 512 bit vectors. > - rawmemchr function using 512 bit vectors. > - wmemchr function using 512 bit vectors. > > Code size data: > > memchr-evex.o 762 byte > memchr-evex512.o 576 byte (-24%) > > rawmemchr-evex.o 461 byte > rawmemchr-evex512.o 432 byte (-6%) > > wmemchr-evex.o 794 byte > wmemchr-evex512.o 576 byte (-27%) > > Placeholder function, not used by any processor at the moment. 
> --- > sysdeps/x86_64/multiarch/Makefile | 3 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > sysdeps/x86_64/multiarch/memchr-evex-base.S | 302 +++++++++++++++++++ > sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 9 + > 6 files changed, 344 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index df4601c294..e974b1ad97 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -4,6 +4,7 @@ sysdep_routines += \ > memchr-avx2 \ > memchr-avx2-rtm \ > memchr-evex \ > + memchr-evex512 \ > memchr-evex-rtm \ > memchr-sse2 \ > memcmp-avx2-movbe \ > @@ -36,6 +37,7 @@ sysdep_routines += \ > rawmemchr-avx2 \ > rawmemchr-avx2-rtm \ > rawmemchr-evex \ > + rawmemchr-evex512 \ > rawmemchr-evex-rtm \ > rawmemchr-sse2 \ > stpcpy-avx2 \ > @@ -156,6 +158,7 @@ sysdep_routines += \ > wmemchr-avx2 \ > wmemchr-avx2-rtm \ > wmemchr-evex \ > + wmemchr-evex512 \ > wmemchr-evex-rtm \ > wmemchr-sse2 \ > wmemcmp-avx2-movbe \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 00a91123d3..529c0b0ef0 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __memchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __memchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, 
memchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > @@ -337,6 +342,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __rawmemchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __rawmemchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __wmemchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __wmemchr_evex512) > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > new file mode 100644 > index 0000000000..2a692799b1 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > @@ -0,0 +1,302 @@ > +/* Placeholder function, not used by any processor at the moment. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. 
> + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* UNUSED. Exists purely as reference implementation. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (4) > + > +# include <sysdep.h> > + > +# ifdef USE_AS_WMEMCHR > +# define CHAR_SIZE 4 > +# define VPBROADCAST vpbroadcastd > +# define VPCMPEQ vpcmpeqd > +# define VPCMPNE vpcmpneqd > +# define VPMINU vpminud > +# define VPTESTNM vptestnmd > +# else > +# define CHAR_SIZE 1 > +# define VPBROADCAST vpbroadcastb > +# define VPCMPEQ vpcmpeqb > +# define VPCMPNE vpcmpneqb > +# define VPMINU vpminub > +# define VPTESTNM vptestnmb > +# endif > + > +# define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > + .section SECTION(.text), "ax", @progbits > +/* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > +ENTRY_P2ALIGN (MEMCHR, 6) > +# ifndef USE_AS_RAWMEMCHR > + /* Check for zero length. */ > + test %RDX_LP, %RDX_LP > + jz L(zero) > + > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %edx, %edx > +# endif > +# endif > + > + /* Broadcast CHAR to VMM(1). */ > + VPBROADCAST %esi, %VMM(1) > + movl %edi, %eax > + andl $(PAGE_SIZE - 1), %eax > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(page_cross) > + > + /* Compare [w]char for null, mask bit will be set for match. 
*/ > + VPCMPEQ (%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRCX > +# ifndef USE_AS_RAWMEMCHR > + mov %rdx, %rsi > + bsfq %rcx, %rsi > + cmp $CHAR_PER_VEC, %rsi > + ja L(align_more) > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rsi, CHAR_SIZE), %rdi > +# else > + addq %rsi, %rdi > +# endif > + xor %eax, %eax > + cmp %rsi, %rdx > + cmova %rdi, %rax > +# else > + bsf %VRCX, %VRAX > + jz L(align_more) > + add %rdi, %rax > +# endif > + ret > + > + .p2align 5,,5 > +L(page_cross): > + movl %eax, %ecx > + andl $(VEC_SIZE - 1), %ecx > +# ifdef USE_AS_WMEMCHR > + shrl $2, %ecx > +# endif > + xorq %rdi, %rax > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0 > + KMOV %k0, %VRSI > + shr %cl, %VRSI > +# ifndef USE_AS_RAWMEMCHR > + jnz L(page_cross_end) > + movl $CHAR_PER_VEC, %eax > + sub %ecx, %eax > + cmp %rax, %rdx > + ja L(align_more) > +# else > + jz L(align_more) > +# endif > + > +L(page_cross_end): > +# ifndef USE_AS_RAWMEMCHR > + bsfq %rsi, %rcx > + jz L(zero) > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > + xor %eax, %eax > + cmp %rcx, %rdx > + cmova %rdi, %rax > +# else > + bsf %VRSI, %VRAX > + add %rdi, %rax > +# endif > + ret > + > +# ifndef USE_AS_RAWMEMCHR > +L(zero): > + xorl %eax, %eax > + ret > +# endif > + > +L(ret_vec_x2): > + subq $-VEC_SIZE, %rdi > +L(ret_vec_x1): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + add %rdi, %rax > +# endif > + ret > + > + .p2align 5,,5 > +L(align_more): > +# ifndef USE_AS_RAWMEMCHR > + mov %rdi, %rax > +# endif > + subq $-VEC_SIZE, %rdi > + /* Align rdi to VEC_SIZE. */ > + andq $-VEC_SIZE, %rdi > + > +# ifndef USE_AS_RAWMEMCHR > + subq %rdi, %rax > +# ifdef USE_AS_WMEMCHR > + sar $2, %rax > +# endif > + addq %rax, %rdx > +# endif > + > + /* Loop unroll 4 times for 4 vector loop. 
*/ > + VPCMPEQ (%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x1) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x2) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x3) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > +# endif > + > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 > + > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x4) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero) > + /* Save pointer to find alignment adjustment. */ > + movq %rdi, %rax > +# endif > + /* Align address to VEC_SIZE * 4 for loop. */ > + andq $-(VEC_SIZE * 4), %rdi > + > + /* Add alignment difference to rdx. */ > +# ifndef USE_AS_RAWMEMCHR > + subq %rdi, %rax > +# ifdef USE_AS_WMEMCHR > + shr $2, %VRAX > +# endif > + addq %rax, %rdx > +# endif > + > + /* 4 vector loop. 
*/ > + .p2align 5,,11 > +L(loop): > + > + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 > + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) > + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > + VPTESTNM %VMM(3), %VMM(3), %k2 > + > + subq $-(VEC_SIZE * 4), %rdi > + KORTEST %k2, %k3 > +# ifdef USE_AS_RAWMEMCHR > + jz L(loop) > +# else > + jnz L(loopend) > + subq $(CHAR_PER_VEC * 4), %rdx > + ja L(loop) > +L(zero_2): > + xor %eax, %eax > + ret > +# endif > + > +L(loopend): > + VPCMPEQ (%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x1) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x2) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(ret_vec_x3) > + > +# ifndef USE_AS_RAWMEMCHR > + subq $CHAR_PER_VEC, %rdx > + jbe L(zero_2) > +# endif > + > + /* At this point null [w]char must be in the fourth vector so no > + need to check. 
*/ > + KMOV %k3, %VRAX > + > +L(ret_vec_x4): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > + .p2align 5,,5 > +L(ret_vec_x3): > + bsf %VRAX, %VRAX > +# ifndef USE_AS_RAWMEMCHR > + cmp %rax, %rdx > + jbe L(zero) > +# endif > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > +END (MEMCHR) > +#endif > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > new file mode 100644 > index 0000000000..002f8c8489 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > @@ -0,0 +1,8 @@ > +# ifndef MEMCHR > +# define MEMCHR __memchr_evex512 > +# endif > + > +#include "x86-evex512-vecs.h" > +#include "reg-macros.h" > + > +#include "memchr-evex-base.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > new file mode 100644 > index 0000000000..302d3cb055 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > @@ -0,0 +1,7 @@ > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_evex512 > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > + > +#include "memchr-evex512.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > new file mode 100644 > index 0000000000..78ec4ee5ad > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > @@ -0,0 +1,9 @@ > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_evex512 > +#endif > + > +#define MEMCHR WMEMCHR > +#define USE_AS_WMEMCHR 1 > + > +#define USE_WIDE_CHAR 1 > +#include "memchr-evex512.S" > -- > 2.36.1 > LGTM. Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v5] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-18 4:18 ` Noah Goldstein 2022-10-18 4:19 ` Noah Goldstein @ 2022-10-18 5:36 ` Sunil Pandey 2022-10-18 6:30 ` Noah Goldstein 1 sibling, 1 reply; 26+ messages in thread From: Sunil Pandey @ 2022-10-18 5:36 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha On Mon, Oct 17, 2022 at 9:18 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Mon, Oct 17, 2022 at 11:15 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Mon, Oct 17, 2022 at 8:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > On Mon, Oct 17, 2022 at 5:00 PM Sunil K Pandey via Libc-alpha > > > <libc-alpha@sourceware.org> wrote: > > > > > > > > Changes from v4: > > > > - Replace jmp max in first vector with cmov. > > > > - Replace jmp max in page cross with cmov. > > > > Changes from v3: > > > > - Replace VPCMPEQ in loop with VPTESTNM for 4th vector. > > > > - Change first vector max check logic for terminating condition. > > > > - Change page cross logic for terminating condition. > > > > - Remove unnecessary check in align_more block. > > > > - Remove unnecessary VEC(0) initialization. > > > > - Define USE_WIDE_CHAR in wmemchr. > > > > > > > > Changes from v2: > > > > - Use VEC API > > > > - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2) > > > > > > > > Changes from v1: > > > > - Change vcmp to vcmpeq and vcmpneq. > > > > - Restructure unconditional loop jump logic. > > > > - Improve 4 vector loop logic. > > > > - Fix bug near page boundary. > > > > > > > > This patch implements the following evex512 versions of string functions. > > > > evex512 version takes up to 30% fewer cycles as compared to evex, > > > > depending on length and alignment. > > > > > > > > - memchr function using 512 bit vectors. > > > > - rawmemchr function using 512 bit vectors. > > > > - wmemchr function using 512 bit vectors. 
> > > > > > > > Code size data: > > > > > > > > memchr-evex.o 762 byte > > > > memchr-evex512.o 576 byte (-24%) > > > > > > > > rawmemchr-evex.o 461 byte > > > > rawmemchr-evex512.o 432 byte (-6%) > > > > > > > > wmemchr-evex.o 794 byte > > > > wmemchr-evex512.o 576 byte (-27%) > > > > > > > > Placeholder function, not used by any processor at the moment. > > > > > > > > Fix page cross logic > > > > > > > > Fix 2 > > > > --- > > > > sysdeps/x86_64/multiarch/Makefile | 3 + > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > > > > sysdeps/x86_64/multiarch/memchr-evex-base.S | 301 +++++++++++++++++++ > > > > sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + > > > > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > > > > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 9 + > > > > 6 files changed, 343 insertions(+) > > > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > > > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > > > > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > > > index df4601c294..e974b1ad97 100644 > > > > --- a/sysdeps/x86_64/multiarch/Makefile > > > > +++ b/sysdeps/x86_64/multiarch/Makefile > > > > @@ -4,6 +4,7 @@ sysdep_routines += \ > > > > memchr-avx2 \ > > > > memchr-avx2-rtm \ > > > > memchr-evex \ > > > > + memchr-evex512 \ > > > > memchr-evex-rtm \ > > > > memchr-sse2 \ > > > > memcmp-avx2-movbe \ > > > > @@ -36,6 +37,7 @@ sysdep_routines += \ > > > > rawmemchr-avx2 \ > > > > rawmemchr-avx2-rtm \ > > > > rawmemchr-evex \ > > > > + rawmemchr-evex512 \ > > > > rawmemchr-evex-rtm \ > > > > rawmemchr-sse2 \ > > > > stpcpy-avx2 \ > > > > @@ -156,6 +158,7 @@ sysdep_routines += \ > > > > wmemchr-avx2 \ > > > > wmemchr-avx2-rtm \ > > > > wmemchr-evex \ > > > > + wmemchr-evex512 \ > > > > wmemchr-evex-rtm \ > > > > wmemchr-sse2 \ 
> > > > wmemcmp-avx2-movbe \ > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > index 00a91123d3..529c0b0ef0 100644 > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > __memchr_evex) > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > + __memchr_evex512) > > > > X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > @@ -337,6 +342,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > __rawmemchr_evex) > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > + __rawmemchr_evex512) > > > > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > __wmemchr_evex) > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > + __wmemchr_evex512) > > > > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S 
b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > > > new file mode 100644 > > > > index 0000000000..ea92983db8 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > > > @@ -0,0 +1,301 @@ > > > > +/* Placeholder function, not used by any processor at the moment. > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > + This file is part of the GNU C Library. > > > > + > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > + modify it under the terms of the GNU Lesser General Public > > > > + License as published by the Free Software Foundation; either > > > > + version 2.1 of the License, or (at your option) any later version. > > > > + > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > + Lesser General Public License for more details. > > > > + > > > > + You should have received a copy of the GNU Lesser General Public > > > > + License along with the GNU C Library; if not, see > > > > + <https://www.gnu.org/licenses/>. */ > > > > + > > > > +/* UNUSED. Exists purely as reference implementation. 
*/ > > > > + > > > > +#include <isa-level.h> > > > > + > > > > +#if ISA_SHOULD_BUILD (4) > > > > + > > > > +# include <sysdep.h> > > > > + > > > > +# ifdef USE_AS_WMEMCHR > > > > +# define CHAR_SIZE 4 > > > > +# define VPBROADCAST vpbroadcastd > > > > +# define VPCMPEQ vpcmpeqd > > > > +# define VPCMPNE vpcmpneqd > > > > +# define VPMINU vpminud > > > > +# define VPTESTNM vptestnmd > > > > +# else > > > > +# define CHAR_SIZE 1 > > > > +# define VPBROADCAST vpbroadcastb > > > > +# define VPCMPEQ vpcmpeqb > > > > +# define VPCMPNE vpcmpneqb > > > > +# define VPMINU vpminub > > > > +# define VPTESTNM vptestnmb > > > > +# endif > > > > + > > > > +# define PAGE_SIZE 4096 > > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > + > > > > + .section SECTION(.text), "ax", @progbits > > > > +/* Aligning entry point to 64 byte, provides better performance for > > > > + one vector length string. */ > > > > +ENTRY_P2ALIGN (MEMCHR, 6) > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + /* Check for zero length. */ > > > > + test %RDX_LP, %RDX_LP > > > > + jz L(zero) > > > > + > > > > +# ifdef __ILP32__ > > > > + /* Clear the upper 32 bits. */ > > > > + movl %edx, %edx > > > > +# endif > > > > +# endif > > > > + > > > > + /* Broadcast CHAR to VMM(1). */ > > > > + VPBROADCAST %esi, %VMM(1) > > > > + movl %edi, %eax > > > > + andl $(PAGE_SIZE - 1), %eax > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > > + ja L(page_cross) > > > > + > > > > + /* Compare [w]char for null, mask bit will be set for match. 
*/ > > > > + VPCMPEQ (%rdi), %VMM(1), %k0 > > > > + > > > > + KMOV %k0, %VRCX > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + mov %rdx, %rsi > > > > + bsf %VRCX, %VRSI > > > > + cmp $CHAR_PER_VEC, %rsi > > > > + ja L(align_more) > > > > +# ifdef USE_AS_WMEMCHR > > > > + leaq (%rdi, %rsi, CHAR_SIZE), %rdi > > > > +# else > > > > + addq %rsi, %rdi > > > > +# endif > > > > + xor %eax, %eax > > > > + cmp %rsi, %rdx > > > > + cmova %rdi, %rax > > > > +# else > > > > + bsf %VRCX, %VRAX > > > > + jz L(align_more) > > > > + add %rdi, %rax > > > > +# endif > > > > + ret > > > > + > > > > + .p2align 5,,5 > > > > +L(page_cross): > > > > + movl %eax, %ecx > > > > + andl $(VEC_SIZE - 1), %ecx > > > > +# ifdef USE_AS_WMEMCHR > > > > + shrl $2, %ecx > > > > +# endif > > > > + xorq %rdi, %rax > > > > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0 > > > > + KMOV %k0, %VRSI > > > > + shr %cl, %VRSI > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + jnz L(page_cross_end) > > > > + movl $CHAR_PER_VEC, %eax > > > > + sub %ecx, %eax > > > > + cmp %rax, %rdx > > > > + ja L(align_more) > > > > +# else > > > > + jz L(align_more) > > > > +# endif > > > > + > > > > +L(page_cross_end): > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + bsf %VRSI, %VRCX > > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > > + xor %eax, %eax > > > > + cmp %rcx, %rdx > > > > + cmova %rdi, %rax > > > > > > You have a bug here test case: > > > > > > align % 4096 = 4036 > > > len = 8 > > > pos = N/A (no char in bounds). > > > > > > > Can you please help reproduce this issue. > > I tried adding this test case but it didn't fail. > > > > do_test (4036, 20, 8, 8, 0x9B); > position need to not be in the first VEC (even out of bounds). > > How about do_test (4036, 1200, 8, 8, 0x9B); It still doesn't reproduce. > > > > > I think the `bsf %rsi, %rcx` shouldn't have rcx as a destination > > > or you need to move `rdx` to `rcx` first. 
> > > > > > > +# else > > > > + bsf %VRSI, %VRAX > > > > + add %rdi, %rax > > > > +# endif > > > > + ret > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > +L(zero): > > > > + xorl %eax, %eax > > > > + ret > > > > +# endif > > > > + > > > > +L(ret_vec_x2): > > > > + subq $-VEC_SIZE, %rdi > > > > +L(ret_vec_x1): > > > > + bsf %VRAX, %VRAX > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + cmp %rax, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > +# ifdef USE_AS_WMEMCHR > > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > > +# else > > > > + add %rdi, %rax > > > > +# endif > > > > + ret > > > > + > > > > + .p2align 5,,5 > > > > +L(align_more): > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + mov %rdi, %rax > > > > +# endif > > > > + subq $-VEC_SIZE, %rdi > > > > + /* Align rdi to VEC_SIZE. */ > > > > + andq $-VEC_SIZE, %rdi > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq %rdi, %rax > > > > +# ifdef USE_AS_WMEMCHR > > > > + sar $2, %rax > > > > +# endif > > > > + addq %rax, %rdx > > > > +# endif > > > > + > > > > + /* Loop unroll 4 times for 4 vector loop. 
*/ > > > > + VPCMPEQ (%rdi), %VMM(1), %k0 > > > > + > > > > + KMOV %k0, %VRAX > > > > + test %VRAX, %VRAX > > > > + jnz L(ret_vec_x1) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + > > > > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 > > > > + > > > > + KMOV %k0, %VRAX > > > > + test %VRAX, %VRAX > > > > + jnz L(ret_vec_x2) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + > > > > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 > > > > + > > > > + KMOV %k0, %VRAX > > > > + test %VRAX, %VRAX > > > > + jnz L(ret_vec_x3) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + > > > > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 > > > > + > > > > + KMOV %k0, %VRAX > > > > + test %VRAX, %VRAX > > > > + jnz L(ret_vec_x4) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero) > > > > + /* Save pointer to find alignment adjustment. */ > > > > + movq %rdi, %rax > > > > +# endif > > > > + /* Align address to VEC_SIZE * 4 for loop. */ > > > > + andq $-(VEC_SIZE * 4), %rdi > > > > + > > > > + /* Add alignment difference to rdx. */ > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq %rdi, %rax > > > > +# ifdef USE_AS_WMEMCHR > > > > + shr $2, %VRAX > > > > +# endif > > > > + addq %rax, %rdx > > > > +# endif > > > > + > > > > + /* 4 vector loop. 
*/ > > > > + .p2align 5,,11 > > > > +L(loop): > > > > + > > > > + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 > > > > + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > > > > + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) > > > > + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 > > > > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > > > > + VPTESTNM %VMM(3), %VMM(3), %k2 > > > > + > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > + KORTEST %k2, %k3 > > > > +# ifdef USE_AS_RAWMEMCHR > > > > + jz L(loop) > > > > +# else > > > > + jnz L(loopend) > > > > + subq $(CHAR_PER_VEC * 4), %rdx > > > > + ja L(loop) > > > > +L(zero_2): > > > > + xor %eax, %eax > > > > + ret > > > > +# endif > > > > + > > > > +L(loopend): > > > > + VPCMPEQ (%rdi), %VMM(1), %k1 > > > > + KMOV %k1, %VRAX > > > > + test %VRAX, %VRAX > > > > + jnz L(ret_vec_x1) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero_2) > > > > +# endif > > > > + > > > > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 > > > > + KMOV %k1, %VRAX > > > > + test %VRAX, %VRAX > > > > + jnz L(ret_vec_x2) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero_2) > > > > +# endif > > > > + > > > > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 > > > > + KMOV %k1, %VRAX > > > > + test %VRAX, %VRAX > > > > + jnz L(ret_vec_x3) > > > > + > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + subq $CHAR_PER_VEC, %rdx > > > > + jbe L(zero_2) > > > > +# endif > > > > + > > > > + /* At this point null [w]char must be in the fourth vector so no > > > > + need to check. 
*/ > > > > + KMOV %k3, %VRAX > > > > + > > > > +L(ret_vec_x4): > > > > + bsf %VRAX, %VRAX > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + cmp %rax, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > > > + ret > > > > + > > > > + .p2align 5,,5 > > > > +L(ret_vec_x3): > > > > + bsf %VRAX, %VRAX > > > > +# ifndef USE_AS_RAWMEMCHR > > > > + cmp %rax, %rdx > > > > + jbe L(zero) > > > > +# endif > > > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > > > + ret > > > > + > > > > +END (MEMCHR) > > > > +#endif > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > > > > new file mode 100644 > > > > index 0000000000..002f8c8489 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > > > > @@ -0,0 +1,8 @@ > > > > +# ifndef MEMCHR > > > > +# define MEMCHR __memchr_evex512 > > > > +# endif > > > > + > > > > +#include "x86-evex512-vecs.h" > > > > +#include "reg-macros.h" > > > > + > > > > +#include "memchr-evex-base.S" > > > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > > new file mode 100644 > > > > index 0000000000..302d3cb055 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > > @@ -0,0 +1,7 @@ > > > > +#ifndef RAWMEMCHR > > > > +# define RAWMEMCHR __rawmemchr_evex512 > > > > +#endif > > > > +#define USE_AS_RAWMEMCHR 1 > > > > +#define MEMCHR RAWMEMCHR > > > > + > > > > +#include "memchr-evex512.S" > > > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > new file mode 100644 > > > > index 0000000000..78ec4ee5ad > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > @@ -0,0 +1,9 @@ > > > > +#ifndef WMEMCHR > > > > +# define WMEMCHR __wmemchr_evex512 > > > > +#endif > > > > + > > > > +#define MEMCHR WMEMCHR > > > > +#define USE_AS_WMEMCHR 1 > > > > + > 
> > > +#define USE_WIDE_CHAR 1 > > > > +#include "memchr-evex512.S" > > > > -- > > > > 2.36.1 > > > > ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v5] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr 2022-10-18 5:36 ` [PATCH v5] " Sunil Pandey @ 2022-10-18 6:30 ` Noah Goldstein 2022-10-18 7:44 ` [PATCH] String: Improve test coverage for memchr Sunil K Pandey 0 siblings, 1 reply; 26+ messages in thread From: Noah Goldstein @ 2022-10-18 6:30 UTC (permalink / raw) To: Sunil Pandey; +Cc: libc-alpha On Mon, Oct 17, 2022 at 10:37 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > On Mon, Oct 17, 2022 at 9:18 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Mon, Oct 17, 2022 at 11:15 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > On Mon, Oct 17, 2022 at 8:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > On Mon, Oct 17, 2022 at 5:00 PM Sunil K Pandey via Libc-alpha > > > > <libc-alpha@sourceware.org> wrote: > > > > > > > > > > Changes from v4: > > > > > - Replace jmp max in first vector with cmov. > > > > > - Replace jmp max in page cross with cmov. > > > > > Changes from v3: > > > > > - Replace VPCMPEQ in loop with VPTESTNM for 4th vector. > > > > > - Change first vector max check logic for terminating condition. > > > > > - Change page cross logic for terminating condition. > > > > > - Remove unnessary check in align_more block. > > > > > - Remove unnessary VEC(0) initialization. > > > > > - Define USE_WIDE_CHAR in wmemchr. > > > > > > > > > > Changes from v2: > > > > > - Use VEC API > > > > > - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2) > > > > > > > > > > Changes from v1: > > > > > - Change vcmp to vcmpeq and vcmpneq. > > > > > - Restructure unconditional loop jump logic. > > > > > - Improve 4 vector loop logic. > > > > > - Fix bug near page boundary. > > > > > > > > > > This patch implements following evex512 version of string functions. > > > > > evex512 version takes up to 30% less cycle as compared to evex, > > > > > depending on length and alignment. > > > > > > > > > > - memchr function using 512 bit vectors. 
> > > > > - rawmemchr function using 512 bit vectors. > > > > > - wmemchr function using 512 bit vectors. > > > > > > > > > > Code size data: > > > > > > > > > > memchr-evex.o 762 byte > > > > > memchr-evex512.o 576 byte (-24%) > > > > > > > > > > rawmemchr-evex.o 461 byte > > > > > rawmemchr-evex512.o 432 byte (-6%) > > > > > > > > > > wmemchr-evex.o 794 byte > > > > > wmemchr-evex512.o 576 byte (-27%) > > > > > > > > > > Placeholder function, not used by any processor at the moment. > > > > > > > > > > Fix page cross logic > > > > > > > > > > Fix 2 > > > > > --- > > > > > sysdeps/x86_64/multiarch/Makefile | 3 + > > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 + > > > > > sysdeps/x86_64/multiarch/memchr-evex-base.S | 301 +++++++++++++++++++ > > > > > sysdeps/x86_64/multiarch/memchr-evex512.S | 8 + > > > > > sysdeps/x86_64/multiarch/rawmemchr-evex512.S | 7 + > > > > > sysdeps/x86_64/multiarch/wmemchr-evex512.S | 9 + > > > > > 6 files changed, 343 insertions(+) > > > > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S > > > > > create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S > > > > > create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > > > create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > > > > index df4601c294..e974b1ad97 100644 > > > > > --- a/sysdeps/x86_64/multiarch/Makefile > > > > > +++ b/sysdeps/x86_64/multiarch/Makefile > > > > > @@ -4,6 +4,7 @@ sysdep_routines += \ > > > > > memchr-avx2 \ > > > > > memchr-avx2-rtm \ > > > > > memchr-evex \ > > > > > + memchr-evex512 \ > > > > > memchr-evex-rtm \ > > > > > memchr-sse2 \ > > > > > memcmp-avx2-movbe \ > > > > > @@ -36,6 +37,7 @@ sysdep_routines += \ > > > > > rawmemchr-avx2 \ > > > > > rawmemchr-avx2-rtm \ > > > > > rawmemchr-evex \ > > > > > + rawmemchr-evex512 \ > > > > > rawmemchr-evex-rtm \ > > > > > rawmemchr-sse2 \ > > > > > 
stpcpy-avx2 \ > > > > > @@ -156,6 +158,7 @@ sysdep_routines += \ > > > > > wmemchr-avx2 \ > > > > > wmemchr-avx2-rtm \ > > > > > wmemchr-evex \ > > > > > + wmemchr-evex512 \ > > > > > wmemchr-evex-rtm \ > > > > > wmemchr-sse2 \ > > > > > wmemcmp-avx2-movbe \ > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > index 00a91123d3..529c0b0ef0 100644 > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > > __memchr_evex) > > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > + __memchr_evex512) > > > > > X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > > @@ -337,6 +342,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > > __rawmemchr_evex) > > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > + __rawmemchr_evex512) > > > > > X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > > @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > > __wmemchr_evex) > > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > > + && 
CPU_FEATURE_USABLE (AVX512BW) > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > + __wmemchr_evex512) > > > > > X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > > > > new file mode 100644 > > > > > index 0000000000..ea92983db8 > > > > > --- /dev/null > > > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S > > > > > @@ -0,0 +1,301 @@ > > > > > +/* Placeholder function, not used by any processor at the moment. > > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > > + This file is part of the GNU C Library. > > > > > + > > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > > + modify it under the terms of the GNU Lesser General Public > > > > > + License as published by the Free Software Foundation; either > > > > > + version 2.1 of the License, or (at your option) any later version. > > > > > + > > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > > + Lesser General Public License for more details. > > > > > + > > > > > + You should have received a copy of the GNU Lesser General Public > > > > > + License along with the GNU C Library; if not, see > > > > > + <https://www.gnu.org/licenses/>. */ > > > > > + > > > > > +/* UNUSED. Exists purely as reference implementation. 
*/ > > > > > + > > > > > +#include <isa-level.h> > > > > > + > > > > > +#if ISA_SHOULD_BUILD (4) > > > > > + > > > > > +# include <sysdep.h> > > > > > + > > > > > +# ifdef USE_AS_WMEMCHR > > > > > +# define CHAR_SIZE 4 > > > > > +# define VPBROADCAST vpbroadcastd > > > > > +# define VPCMPEQ vpcmpeqd > > > > > +# define VPCMPNE vpcmpneqd > > > > > +# define VPMINU vpminud > > > > > +# define VPTESTNM vptestnmd > > > > > +# else > > > > > +# define CHAR_SIZE 1 > > > > > +# define VPBROADCAST vpbroadcastb > > > > > +# define VPCMPEQ vpcmpeqb > > > > > +# define VPCMPNE vpcmpneqb > > > > > +# define VPMINU vpminub > > > > > +# define VPTESTNM vptestnmb > > > > > +# endif > > > > > + > > > > > +# define PAGE_SIZE 4096 > > > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > > + > > > > > + .section SECTION(.text), "ax", @progbits > > > > > +/* Aligning entry point to 64 byte, provides better performance for > > > > > + one vector length string. */ > > > > > +ENTRY_P2ALIGN (MEMCHR, 6) > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + /* Check for zero length. */ > > > > > + test %RDX_LP, %RDX_LP > > > > > + jz L(zero) > > > > > + > > > > > +# ifdef __ILP32__ > > > > > + /* Clear the upper 32 bits. */ > > > > > + movl %edx, %edx > > > > > +# endif > > > > > +# endif > > > > > + > > > > > + /* Broadcast CHAR to VMM(1). */ > > > > > + VPBROADCAST %esi, %VMM(1) > > > > > + movl %edi, %eax > > > > > + andl $(PAGE_SIZE - 1), %eax > > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > > > + ja L(page_cross) > > > > > + > > > > > + /* Compare [w]char for null, mask bit will be set for match. 
*/ > > > > > + VPCMPEQ (%rdi), %VMM(1), %k0 > > > > > + > > > > > + KMOV %k0, %VRCX > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + mov %rdx, %rsi > > > > > + bsf %VRCX, %VRSI > > > > > + cmp $CHAR_PER_VEC, %rsi > > > > > + ja L(align_more) > > > > > +# ifdef USE_AS_WMEMCHR > > > > > + leaq (%rdi, %rsi, CHAR_SIZE), %rdi > > > > > +# else > > > > > + addq %rsi, %rdi > > > > > +# endif > > > > > + xor %eax, %eax > > > > > + cmp %rsi, %rdx > > > > > + cmova %rdi, %rax > > > > > +# else > > > > > + bsf %VRCX, %VRAX > > > > > + jz L(align_more) > > > > > + add %rdi, %rax > > > > > +# endif > > > > > + ret > > > > > + > > > > > + .p2align 5,,5 > > > > > +L(page_cross): > > > > > + movl %eax, %ecx > > > > > + andl $(VEC_SIZE - 1), %ecx > > > > > +# ifdef USE_AS_WMEMCHR > > > > > + shrl $2, %ecx > > > > > +# endif > > > > > + xorq %rdi, %rax > > > > > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0 > > > > > + KMOV %k0, %VRSI > > > > > + shr %cl, %VRSI > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + jnz L(page_cross_end) > > > > > + movl $CHAR_PER_VEC, %eax > > > > > + sub %ecx, %eax > > > > > + cmp %rax, %rdx > > > > > + ja L(align_more) > > > > > +# else > > > > > + jz L(align_more) > > > > > +# endif > > > > > + > > > > > +L(page_cross_end): > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + bsf %VRSI, %VRCX > > > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > > > + xor %eax, %eax > > > > > + cmp %rcx, %rdx > > > > > + cmova %rdi, %rax > > > > > > > > You have a bug here test case: > > > > > > > > align % 4096 = 4036 > > > > len = 8 > > > > pos = N/A (no char in bounds). > > > > > > > > > > Can you please help reproduce this issue. > > > I tried adding this test case but it didn't fail. > > > > > > do_test (4036, 20, 8, 8, 0x9B); > > position need to not be in the first VEC (even out of bounds). > > > > > How about > > do_test (4036, 1200, 8, 8, 0x9B); > > It still doesn't reproduce. do_test (4036, 1200, 1200, 8, 23); sorry the params I had were confusing. 
For some reason glibc testsuite has two different variables for char position and decided to name one of them 'len'. Meant 3rd argument so 'n' in the glibc test suite. > > > > > > > > I think the `bsf %rsi, %rcx` shouldn't have rcx as a destination > > > > or you need to move `rdx` to `rcx` first. > > > > > > > > > +# else > > > > > + bsf %VRSI, %VRAX > > > > > + add %rdi, %rax > > > > > +# endif > > > > > + ret > > > > > + > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > +L(zero): > > > > > + xorl %eax, %eax > > > > > + ret > > > > > +# endif > > > > > + > > > > > +L(ret_vec_x2): > > > > > + subq $-VEC_SIZE, %rdi > > > > > +L(ret_vec_x1): > > > > > + bsf %VRAX, %VRAX > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + cmp %rax, %rdx > > > > > + jbe L(zero) > > > > > +# endif > > > > > +# ifdef USE_AS_WMEMCHR > > > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > > > +# else > > > > > + add %rdi, %rax > > > > > +# endif > > > > > + ret > > > > > + > > > > > + .p2align 5,,5 > > > > > +L(align_more): > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + mov %rdi, %rax > > > > > +# endif > > > > > + subq $-VEC_SIZE, %rdi > > > > > + /* Align rdi to VEC_SIZE. */ > > > > > + andq $-VEC_SIZE, %rdi > > > > > + > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + subq %rdi, %rax > > > > > +# ifdef USE_AS_WMEMCHR > > > > > + sar $2, %rax > > > > > +# endif > > > > > + addq %rax, %rdx > > > > > +# endif > > > > > + > > > > > + /* Loop unroll 4 times for 4 vector loop. 
*/ > > > > > + VPCMPEQ (%rdi), %VMM(1), %k0 > > > > > + > > > > > + KMOV %k0, %VRAX > > > > > + test %VRAX, %VRAX > > > > > + jnz L(ret_vec_x1) > > > > > + > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + subq $CHAR_PER_VEC, %rdx > > > > > + jbe L(zero) > > > > > +# endif > > > > > + > > > > > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0 > > > > > + > > > > > + KMOV %k0, %VRAX > > > > > + test %VRAX, %VRAX > > > > > + jnz L(ret_vec_x2) > > > > > + > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + subq $CHAR_PER_VEC, %rdx > > > > > + jbe L(zero) > > > > > +# endif > > > > > + > > > > > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0 > > > > > + > > > > > + KMOV %k0, %VRAX > > > > > + test %VRAX, %VRAX > > > > > + jnz L(ret_vec_x3) > > > > > + > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + subq $CHAR_PER_VEC, %rdx > > > > > + jbe L(zero) > > > > > +# endif > > > > > + > > > > > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0 > > > > > + > > > > > + KMOV %k0, %VRAX > > > > > + test %VRAX, %VRAX > > > > > + jnz L(ret_vec_x4) > > > > > + > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + subq $CHAR_PER_VEC, %rdx > > > > > + jbe L(zero) > > > > > + /* Save pointer to find alignment adjustment. */ > > > > > + movq %rdi, %rax > > > > > +# endif > > > > > + /* Align address to VEC_SIZE * 4 for loop. */ > > > > > + andq $-(VEC_SIZE * 4), %rdi > > > > > + > > > > > + /* Add alignment difference to rdx. */ > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + subq %rdi, %rax > > > > > +# ifdef USE_AS_WMEMCHR > > > > > + shr $2, %VRAX > > > > > +# endif > > > > > + addq %rax, %rdx > > > > > +# endif > > > > > + > > > > > + /* 4 vector loop. 
*/ > > > > > + .p2align 5,,11 > > > > > +L(loop): > > > > > + > > > > > + VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1 > > > > > + vpxorq (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > > > > > + vpxorq (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3) > > > > > + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3 > > > > > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > > > > > + VPTESTNM %VMM(3), %VMM(3), %k2 > > > > > + > > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > > + KORTEST %k2, %k3 > > > > > +# ifdef USE_AS_RAWMEMCHR > > > > > + jz L(loop) > > > > > +# else > > > > > + jnz L(loopend) > > > > > + subq $(CHAR_PER_VEC * 4), %rdx > > > > > + ja L(loop) > > > > > +L(zero_2): > > > > > + xor %eax, %eax > > > > > + ret > > > > > +# endif > > > > > + > > > > > +L(loopend): > > > > > + VPCMPEQ (%rdi), %VMM(1), %k1 > > > > > + KMOV %k1, %VRAX > > > > > + test %VRAX, %VRAX > > > > > + jnz L(ret_vec_x1) > > > > > + > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + subq $CHAR_PER_VEC, %rdx > > > > > + jbe L(zero_2) > > > > > +# endif > > > > > + > > > > > + VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1 > > > > > + KMOV %k1, %VRAX > > > > > + test %VRAX, %VRAX > > > > > + jnz L(ret_vec_x2) > > > > > + > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + subq $CHAR_PER_VEC, %rdx > > > > > + jbe L(zero_2) > > > > > +# endif > > > > > + > > > > > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1 > > > > > + KMOV %k1, %VRAX > > > > > + test %VRAX, %VRAX > > > > > + jnz L(ret_vec_x3) > > > > > + > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + subq $CHAR_PER_VEC, %rdx > > > > > + jbe L(zero_2) > > > > > +# endif > > > > > + > > > > > + /* At this point null [w]char must be in the fourth vector so no > > > > > + need to check. 
*/ > > > > > + KMOV %k3, %VRAX > > > > > + > > > > > +L(ret_vec_x4): > > > > > + bsf %VRAX, %VRAX > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + cmp %rax, %rdx > > > > > + jbe L(zero) > > > > > +# endif > > > > > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > > > > + ret > > > > > + > > > > > + .p2align 5,,5 > > > > > +L(ret_vec_x3): > > > > > + bsf %VRAX, %VRAX > > > > > +# ifndef USE_AS_RAWMEMCHR > > > > > + cmp %rax, %rdx > > > > > + jbe L(zero) > > > > > +# endif > > > > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > > > > + ret > > > > > + > > > > > +END (MEMCHR) > > > > > +#endif > > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S > > > > > new file mode 100644 > > > > > index 0000000000..002f8c8489 > > > > > --- /dev/null > > > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S > > > > > @@ -0,0 +1,8 @@ > > > > > +# ifndef MEMCHR > > > > > +# define MEMCHR __memchr_evex512 > > > > > +# endif > > > > > + > > > > > +#include "x86-evex512-vecs.h" > > > > > +#include "reg-macros.h" > > > > > + > > > > > +#include "memchr-evex-base.S" > > > > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > > > new file mode 100644 > > > > > index 0000000000..302d3cb055 > > > > > --- /dev/null > > > > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S > > > > > @@ -0,0 +1,7 @@ > > > > > +#ifndef RAWMEMCHR > > > > > +# define RAWMEMCHR __rawmemchr_evex512 > > > > > +#endif > > > > > +#define USE_AS_RAWMEMCHR 1 > > > > > +#define MEMCHR RAWMEMCHR > > > > > + > > > > > +#include "memchr-evex512.S" > > > > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > > new file mode 100644 > > > > > index 0000000000..78ec4ee5ad > > > > > --- /dev/null > > > > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S > > > > > @@ -0,0 +1,9 @@ > > > > > +#ifndef WMEMCHR > > > > > +# define WMEMCHR 
__wmemchr_evex512 > > > > > +#endif > > > > > + > > > > > +#define MEMCHR WMEMCHR > > > > > +#define USE_AS_WMEMCHR 1 > > > > > + > > > > > +#define USE_WIDE_CHAR 1 > > > > > +#include "memchr-evex512.S" > > > > > -- > > > > > 2.36.1 > > > > > ^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH] String: Improve test coverage for memchr 2022-10-18 6:30 ` Noah Goldstein @ 2022-10-18 7:44 ` Sunil K Pandey 2022-10-18 15:44 ` H.J. Lu 0 siblings, 1 reply; 26+ messages in thread From: Sunil K Pandey @ 2022-10-18 7:44 UTC (permalink / raw) To: libc-alpha This test improves memchr coverage near page boundary. --- string/test-memchr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/string/test-memchr.c b/string/test-memchr.c index 1cbcd57fd4..ab206e05db 100644 --- a/string/test-memchr.c +++ b/string/test-memchr.c @@ -251,6 +251,7 @@ test_main (void) /* page_size is in fact getpagesize() * 2. */ do_test (page_size / 2 - i, i, i, 1, 0x9B); do_test (page_size / 2 - i, i - 1, i - 1, 1, 0x9B); + do_test (page_size / 2 - (i * 4), i + 128, i + 128, i, 0x9B); } do_random_tests (); -- 2.36.1 ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] String: Improve test coverage for memchr 2022-10-18 7:44 ` [PATCH] String: Improve test coverage for memchr Sunil K Pandey @ 2022-10-18 15:44 ` H.J. Lu 0 siblings, 0 replies; 26+ messages in thread From: H.J. Lu @ 2022-10-18 15:44 UTC (permalink / raw) To: Sunil K Pandey; +Cc: libc-alpha On Tue, Oct 18, 2022 at 12:44 AM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > This test improves memchr coverage near page boundary. > --- > string/test-memchr.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/string/test-memchr.c b/string/test-memchr.c > index 1cbcd57fd4..ab206e05db 100644 > --- a/string/test-memchr.c > +++ b/string/test-memchr.c > @@ -251,6 +251,7 @@ test_main (void) > /* page_size is in fact getpagesize() * 2. */ > do_test (page_size / 2 - i, i, i, 1, 0x9B); > do_test (page_size / 2 - i, i - 1, i - 1, 1, 0x9B); > + do_test (page_size / 2 - (i * 4), i + 128, i + 128, i, 0x9B); > } > > do_random_tests (); > -- > 2.36.1 > LGTM. Thanks. -- H.J. ^ permalink raw reply [flat|nested] 26+ messages in thread
end of thread, other threads:[~2022-10-18 17:12 UTC | newest] Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2022-09-22 0:27 [PATCH] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr Sunil K Pandey 2022-09-22 0:50 ` Noah Goldstein 2022-09-23 3:57 ` Sunil Pandey 2022-09-29 3:42 ` Sunil Pandey 2022-09-29 4:07 ` Noah Goldstein 2022-10-03 18:33 ` Noah Goldstein 2022-10-03 19:00 ` H.J. Lu 2022-10-03 19:12 ` Noah Goldstein 2022-10-13 21:41 ` [PATCH v2] " Sunil K Pandey 2022-10-15 14:01 ` [PATCH v3] " Sunil K Pandey 2022-10-15 16:26 ` Noah Goldstein 2022-10-15 16:26 ` Noah Goldstein 2022-10-17 3:53 ` [PATCH v4] " Sunil K Pandey 2022-10-17 15:46 ` Noah Goldstein 2022-10-17 23:55 ` Sunil Pandey 2022-10-18 0:00 ` [PATCH v5] " Sunil K Pandey 2022-10-18 3:01 ` Noah Goldstein 2022-10-18 4:15 ` Sunil Pandey 2022-10-18 4:18 ` Noah Goldstein 2022-10-18 4:19 ` Noah Goldstein 2022-10-18 8:02 ` [PATCH v6] " Sunil K Pandey 2022-10-18 17:12 ` Noah Goldstein 2022-10-18 5:36 ` [PATCH v5] " Sunil Pandey 2022-10-18 6:30 ` Noah Goldstein 2022-10-18 7:44 ` [PATCH] String: Improve test coverage for memchr Sunil K Pandey 2022-10-18 15:44 ` H.J. Lu
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).