* [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function @ 2023-06-30 20:48 Sunil K Pandey 2023-06-30 21:04 ` Noah Goldstein 2023-07-02 17:03 ` Noah Goldstein 0 siblings, 2 replies; 24+ messages in thread From: Sunil K Pandey @ 2023-06-30 20:48 UTC (permalink / raw) To: libc-alpha; +Cc: hjl.tools This patch optimizes strlcpy/wcslcpy string functions for AVX2. --- sysdeps/x86_64/multiarch/Makefile | 4 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ 9 files changed, 627 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index e1e894c963..7e3fc081df 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -82,6 +82,8 @@ sysdep_routines += \ strcpy-sse2 \ strcpy-sse2-unaligned \ strcspn-sse4 \ + strlcpy-avx2 \ + strlcpy-generic \ strlen-avx2 \ strlen-avx2-rtm \ strlen-evex \ @@ -153,6 +155,8 @@ sysdep_routines += \ wcscpy-evex \ wcscpy-generic \ wcscpy-ssse3 \ + wcslcpy-avx2 \ + wcslcpy-generic \ wcslen-avx2 \ wcslen-avx2-rtm \ wcslen-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 5427ff1907..9928dee187 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ 
b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, 1, __strncat_sse2_unaligned)) + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ + IFUNC_IMPL (i, name, strlcpy, + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, + CPU_FEATURE_USABLE (AVX2), + __strlcpy_avx2) + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, + 1, + __strlcpy_generic)) + /* Support sysdeps/x86_64/multiarch/strncpy.c. */ IFUNC_IMPL (i, name, strncpy, X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, 1, __wcscpy_generic)) + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ + IFUNC_IMPL (i, name, wcslcpy, + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, + CPU_FEATURE_USABLE (AVX2), + __wcslcpy_avx2) + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, + 1, + __wcslcpy_generic)) + /* Support sysdeps/x86_64/multiarch/wcsncpy.c. */ IFUNC_IMPL (i, name, wcsncpy, X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h new file mode 100644 index 0000000000..982a30d15b --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h @@ -0,0 +1,34 @@ +/* Common definition for ifunc selections. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features *cpu_features = __get_cpu_features (); + + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) + return OPTIMIZE (avx2); + + return OPTIMIZE (generic); +} diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S new file mode 100644 index 0000000000..cf54b1e990 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S @@ -0,0 +1,446 @@ +/* Strlcpy/wcslcpy optimized with AVX2. + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (3) + +# include <sysdep.h> + +# ifndef VEC_SIZE +# include "x86-avx-vecs.h" +# endif + +# ifndef STRLCPY +# define STRLCPY __strlcpy_avx2 +# endif + + +# ifdef USE_AS_WCSLCPY +# define CHAR_SIZE 4 +# define MOVU movl +# define VPCMPEQ vpcmpeqd +# define VPMINU vpminud +# else +# define CHAR_SIZE 1 +# define MOVU movb +# define VPCMPEQ vpcmpeqb +# define VPMINU vpminub +# endif + +# define PMOVMSK vpmovmskb +# define PAGE_SIZE 4096 +# define VEC_SIZE 32 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text),"ax",@progbits +/* Aligning entry point to 64 byte, provides better performance for + one vector length string. */ + +ENTRY_P2ALIGN (STRLCPY, 6) +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + + /* Zero out vector register for end of string comparison. */ + vpxor %VMM(0), %VMM(0), %VMM(0) + /* Save source pointer for return calculation. */ + mov %rsi, %r8 + mov %esi, %eax + sall $20, %eax + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax + ja L(page_cross) + +L(page_cross_continue): + /* Load first vector. */ + VMOVU (%rsi), %VMM(1) + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) + PMOVMSK %VMM(2), %eax + test %eax, %eax + jnz L(ret_vec_x1) + + test %rdx, %rdx + jz L(continue_second_vector) + + /* Check whether we can copy full vector. */ + cmp $CHAR_PER_VEC, %rdx + jbe L(page_cross_small_vec_copy) + /* Copy first vector. */ + VMOVU %VMM(1), (%rdi) + sub $CHAR_PER_VEC, %rdx + +L(continue_second_vector): + /* Align RSI pointer and adjust RDI based on offset. */ + mov %rsi, %rax + and $-VEC_SIZE, %rsi + sub %rsi, %rax + sub %rax, %rdi + + /* Check if string already copied N char, and RDX is 0. */ + test %rdx, %rdx + jz L(skip_copy_alignment_fix) + + /* Adjust RDX for copy alignment fix. */ +# ifdef USE_AS_WCSLCPY + shr $2, %rax +# endif + add %rax, %rdx + +L(skip_copy_alignment_fix): + /* Load second vector. 
*/ + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) + vptest %VMM(2), %VMM(2) + jnz L(ret_vec_x2) + + /* Skip copy if RDX is 0. */ + test %rdx, %rdx + jz L(continue_third_vector) + + /* Jump below/equal (instead of below) used here, because last + copy character must be NULL. */ + cmp $CHAR_PER_VEC, %rdx + jbe L(partial_copy_second_vector) + + sub $CHAR_PER_VEC, %rdx + /* Copy second vector. */ + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) + +L(continue_third_vector): + /* Load third vector. */ + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) + vptest %VMM(2), %VMM(2) + jnz L(ret_vec_x3) + + /* Skip copy if RDX is 0. */ + test %rdx, %rdx + jz L(continue_fourth_vector) + + cmp $CHAR_PER_VEC, %rdx + jbe L(partial_copy_third_vector) + + sub $CHAR_PER_VEC, %rdx + /* Copy third vector. */ + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) + +L(continue_fourth_vector): + /* Load fourth vector. */ + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) + vptest %VMM(2), %VMM(2) + jnz L(ret_vec_x4) + + /* Skip copy if RDX is 0. */ + test %rdx, %rdx + jz L(loop_4x_align) + + cmp $CHAR_PER_VEC, %rdx + jbe L(partial_copy_fourth_vector) + + sub $CHAR_PER_VEC, %rdx + /* Copy fourth vector. */ + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) + + +L(loop_4x_align): + /* Jump to loop if RSI is already 4 vector align. */ + test $(VEC_SIZE * 4 - 1), %esi + jz L(loop_4x_read) + + mov %rsi, %rcx + + /* Align RSI to 4x vector. */ + and $(VEC_SIZE * -4), %rsi + sub %rsi, %rcx + + /* Adjust RDI for RSI alignment fix. */ + sub %rcx, %rdi + + /* Jump to loop if RDX is 0. */ + test %rdx, %rdx + jz L(loop_4x_read) + +# ifdef USE_AS_WCSLCPY + shr $2, %rcx +# endif + + /* Adjust RDX for RSI alignment fix. */ + add %rcx, %rdx + jmp L(loop_4x_read) + + .p2align 4,,6 +L(loop_4x_vec): + /* Skip copy if RDX is 0. 
*/ + test %rdx, %rdx + jz L(loop_partial_copy_return) + cmp $(CHAR_PER_VEC * 4), %rdx + jbe L(loop_partial_copy) + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) + sub $(CHAR_PER_VEC * 4), %rdx + +L(loop_partial_copy_return): + sub $(VEC_SIZE * -4), %rsi + sub $(VEC_SIZE * -4), %rdi + +L(loop_4x_read): + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) + VPMINU %VMM(1), %VMM(2), %VMM(5) + VPMINU %VMM(3), %VMM(4), %VMM(6) + VPMINU %VMM(5), %VMM(6), %VMM(7) + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) + vptest %VMM(7), %VMM(7) + + jz L(loop_4x_vec) + + /* Check if string ends in first vector or second vector. */ + lea (VEC_SIZE * 4)(%rsi), %rax + sub %r8, %rax +# ifdef USE_AS_WCSLCPY + shr $2, %rax +# endif + xor %r10, %r10 + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) + vptest %VMM(6), %VMM(6) + jnz L(endloop) + sub $(CHAR_PER_VEC * -2), %rax + mov $(CHAR_PER_VEC * 2), %r10 + VMOVA %VMM(3), %VMM(1) + VMOVA %VMM(4), %VMM(2) + +L(endloop): + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) + PMOVMSK %VMM(1), %rcx + PMOVMSK %VMM(2), %r9 + shlq $32, %r9 + orq %r9, %rcx + bsf %rcx, %rcx + /* Shift RCX by 2, VPMOVMSK has only byte version. */ +# ifdef USE_AS_WCSLCPY + shr $2, %rcx +# endif + /* At this point RAX has length to return. */ + add %rcx, %rax + test %rdx, %rdx + jz L(ret) + + /* Add 1 to account for NULL character in RDX comparison. */ + lea 1(%r10, %rcx), %rcx + cmp %rdx, %rcx + cmovb %rcx, %rdx + +L(loop_partial_copy): + cmp $(CHAR_PER_VEC * 2), %rdx + jbe L(loop_partial_first_half) + /* Reload first 2 vector. */ + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) + +L(loop_partial_first_half): + /* Go back 2 vector from last and use overlapping copy. 
+ (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) + */ + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) + xor %rdx, %rdx + vptest %VMM(7), %VMM(7) + jz L(loop_partial_copy_return) + ret + + .p2align 4 +L(page_cross): + mov %rsi, %rcx + mov %rsi, %r11 + and $-VEC_SIZE, %r11 + and $(VEC_SIZE - 1), %rcx + VMOVA (%r11), %VMM(1) + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) + PMOVMSK %VMM(2), %eax + shr %cl, %eax + jz L(page_cross_continue) + +L(ret_vec_x1): + bsf %eax, %eax +# ifdef USE_AS_WCSLCPY + shr $2, %eax +# endif + /* Increment by 1 to account for NULL char. */ + lea 1(%eax), %ecx + cmp %rdx, %rcx + cmovb %rcx, %rdx + test %rdx, %rdx + jz L(ret) + +L(page_cross_small_vec_copy): + cmp $(16 / CHAR_SIZE), %rdx + jbe L(copy_8_byte_scalar) + VMOVU (%rsi), %VMM_128(1) + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) + VMOVU %VMM_128(1), (%rdi) + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %rdx, %rdx + vptest %VMM(2), %VMM(2) + jz L(continue_second_vector) + ret + +L(copy_8_byte_scalar): + cmp $(8 / CHAR_SIZE), %rdx + jbe L(copy_4_byte_scalar) + movq (%rsi), %r10 + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 + movq %r10, (%rdi) + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %edx, %edx + vptest %VMM(2), %VMM(2) + jz L(continue_second_vector) + ret + +L(copy_4_byte_scalar): +# ifndef USE_AS_WCSLCPY + cmp $4, %rdx + jbe L(copy_2_byte_scalar) +# endif + movl (%rsi), %r10d + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d + movl %r10d, (%rdi) + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %edx, %edx + vptest %VMM(2), %VMM(2) + jz L(continue_second_vector) + 
ret + +# ifndef USE_AS_WCSLCPY +L(copy_2_byte_scalar): + cmp $2, %rdx + jbe L(copy_1_byte_scalar) + movw (%rsi), %r10w + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w + movw %r10w, (%rdi) + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %edx, %edx + vptest %VMM(2), %VMM(2) + jz L(continue_second_vector) + ret + +L(copy_1_byte_scalar): + MOVU (%rsi), %r10b + MOVU %r10b, (%rdi) + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %edx, %edx + vptest %VMM(2), %VMM(2) + jz L(continue_second_vector) + ret +# endif + +L(ret_vec_x2): + PMOVMSK %VMM(2), %rax + bsf %rax, %rcx + /* Calculate return value. */ + lea VEC_SIZE(%rsi, %rcx), %rax + sub %r8, %rax +# ifdef USE_AS_WCSLCPY + shr $2, %rax + shr $2, %rcx +# endif + inc %rcx + test %rdx, %rdx + jz L(ret) + cmp %rdx, %rcx + cmovb %rcx, %rdx + +L(partial_copy_second_vector): + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %edx, %edx + vptest %VMM(2), %VMM(2) + jz L(continue_third_vector) + +L(ret): + ret + +L(ret_vec_x3): + PMOVMSK %VMM(2), %rax + bsf %rax, %rcx + /* Calculate return value. */ + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax + sub %r8, %rax +# ifdef USE_AS_WCSLCPY + shr $2, %rax + shr $2, %rcx +# endif + inc %rcx + test %rdx, %rdx + jz L(ret) + cmp %rdx, %rcx + cmovb %rcx, %rdx + +L(partial_copy_third_vector): + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %edx, %edx + vptest %VMM(2), %VMM(2) + jz L(continue_fourth_vector) + ret + +L(ret_vec_x4): + PMOVMSK %VMM(2), %rax + bsf %rax, %rcx + /* Calculate return value. 
*/ + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax + sub %r8, %rax +# ifdef USE_AS_WCSLCPY + shr $2, %rax + shr $2, %rcx +# endif + inc %rcx + test %rdx, %rdx + jz L(ret) + cmp %rdx, %rcx + cmovb %rcx, %rdx + +L(partial_copy_fourth_vector): + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %edx, %edx + vptest %VMM(2), %VMM(2) + jz L(continue_fourth_vector) + ret + +END (STRLCPY) +#endif diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c new file mode 100644 index 0000000000..eee3b7b086 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c @@ -0,0 +1,25 @@ +/* strlcpy generic. + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +#include <isa-level.h> +#if ISA_SHOULD_BUILD (1) +# define __strlcpy __strlcpy_generic +# include <string/strlcpy.c> + +#endif diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c new file mode 100644 index 0000000000..ded41fbcfb --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlcpy.c @@ -0,0 +1,36 @@ +/* Multiple versions of strlcpy. + All versions must be listed in ifunc-impl-list.c. 
+ Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define __strlcpy __redirect_strlcpy +# include <string.h> +# undef __strlcpy + +# define SYMBOL_NAME strlcpy +# include "ifunc-strlcpy.h" + +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ()); +weak_alias (__strlcpy, strlcpy) + +# ifdef SHARED +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S new file mode 100644 index 0000000000..dafc20ded0 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S @@ -0,0 +1,4 @@ +#define STRLCPY __wcslcpy_avx2 +#define USE_AS_WCSLCPY 1 + +#include "strlcpy-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c new file mode 100644 index 0000000000..ffd3c0e846 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c @@ -0,0 +1,25 @@ +/* wcslcpy generic. + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +#include <isa-level.h> +#if ISA_SHOULD_BUILD (1) +# define __wcslcpy __wcslcpy_generic +# include <wcsmbs/wcslcpy.c> + +#endif diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c new file mode 100644 index 0000000000..371ef9626c --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslcpy.c @@ -0,0 +1,35 @@ +/* Multiple versions of wcslcpy. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. 
*/ +#if IS_IN (libc) +# define __wcslcpy __redirect_wcslcpy +# include <wchar.h> +# undef __wcslcpy + +# define SYMBOL_NAME wcslcpy +# include "ifunc-strlcpy.h" + +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ()); +weak_alias (__wcslcpy, wcslcpy) +# ifdef SHARED +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); +# endif +#endif -- 2.38.1 ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 20:48 [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function Sunil K Pandey @ 2023-06-30 21:04 ` Noah Goldstein 2023-06-30 21:27 ` Paul Eggert 2023-07-01 9:41 ` Florian Weimer 2023-07-02 17:03 ` Noah Goldstein 1 sibling, 2 replies; 24+ messages in thread From: Noah Goldstein @ 2023-06-30 21:04 UTC (permalink / raw) To: Sunil K Pandey; +Cc: libc-alpha, hjl.tools On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > This patch optimizes strlcpy/wsclcpy string functions for AVX2. > --- > sysdeps/x86_64/multiarch/Makefile | 4 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ > 9 files changed, 627 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index e1e894c963..7e3fc081df 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -82,6 +82,8 @@ sysdep_routines += \ > strcpy-sse2 \ > strcpy-sse2-unaligned \ > strcspn-sse4 \ > + strlcpy-avx2 \ > + strlcpy-generic \ > strlen-avx2 \ > strlen-avx2-rtm \ > strlen-evex \ > @@ -153,6 +155,8 @@ sysdep_routines += \ > 
wcscpy-evex \ > wcscpy-generic \ > wcscpy-ssse3 \ > + wcslcpy-avx2 \ > + wcslcpy-generic \ > wcslen-avx2 \ > wcslen-avx2-rtm \ > wcslen-evex \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 5427ff1907..9928dee187 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > 1, > __strncat_sse2_unaligned)) > > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ > + IFUNC_IMPL (i, name, strlcpy, > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, > + CPU_FEATURE_USABLE (AVX2), > + __strlcpy_avx2) > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, > + 1, > + __strlcpy_generic)) > + > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ > IFUNC_IMPL (i, name, strncpy, > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > 1, > __wcscpy_generic)) > > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ > + IFUNC_IMPL (i, name, wcslcpy, > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, > + CPU_FEATURE_USABLE (AVX2), > + __wcslcpy_avx2) > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, > + 1, > + __wcslcpy_generic)) > + > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. */ > IFUNC_IMPL (i, name, wcsncpy, > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > new file mode 100644 > index 0000000000..982a30d15b > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > @@ -0,0 +1,34 @@ > +/* Common definition for ifunc selections. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <init-arch.h> > + > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; > + > +static inline void * > +IFUNC_SELECTOR (void) > +{ > + const struct cpu_features *cpu_features = __get_cpu_features (); > + > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) > + return OPTIMIZE (avx2); > + > + return OPTIMIZE (generic); > +} > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > new file mode 100644 > index 0000000000..cf54b1e990 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > @@ -0,0 +1,446 @@ > +/* Strlcpy/wcslcpy optimized with AVX2. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (3) > + > +# include <sysdep.h> > + > +# ifndef VEC_SIZE > +# include "x86-avx-vecs.h" > +# endif > + > +# ifndef STRLCPY > +# define STRLCPY __strlcpy_avx2 > +# endif > + > + > +# ifdef USE_AS_WCSLCPY > +# define CHAR_SIZE 4 > +# define MOVU movl > +# define VPCMPEQ vpcmpeqd > +# define VPMINU vpminud > +# else > +# define CHAR_SIZE 1 > +# define MOVU movb > +# define VPCMPEQ vpcmpeqb > +# define VPMINU vpminub > +# endif > + > +# define PMOVMSK vpmovmskb > +# define PAGE_SIZE 4096 > +# define VEC_SIZE 32 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > + .section SECTION(.text),"ax",@progbits > +/* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > + > +ENTRY_P2ALIGN (STRLCPY, 6) > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %edx, %edx > +# endif > + > + /* Zero out vector register for end of string comparison. */ > + vpxor %VMM(0), %VMM(0), %VMM(0) > + /* Save source pointer for return calculation. */ > + mov %rsi, %r8 > + mov %esi, %eax > + sall $20, %eax > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax > + ja L(page_cross) > + > +L(page_cross_continue): > + /* Load first vector. */ > + VMOVU (%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + PMOVMSK %VMM(2), %eax > + test %eax, %eax > + jnz L(ret_vec_x1) > + > + test %rdx, %rdx > + jz L(continue_second_vector) > + > + /* Check whether we can copy full vector. */ > + cmp $CHAR_PER_VEC, %rdx > + jbe L(page_cross_small_vec_copy) > + /* Copy first vector. */ > + VMOVU %VMM(1), (%rdi) > + sub $CHAR_PER_VEC, %rdx > + > +L(continue_second_vector): > + /* Align RSI pointer and adjust RDI based on offset. 
*/ > + mov %rsi, %rax > + and $-VEC_SIZE, %rsi > + sub %rsi, %rax > + sub %rax, %rdi > + > + /* Check if string already copied N char, and RDX is 0. */ > + test %rdx, %rdx > + jz L(skip_copy_alignment_fix) > + > + /* Adjust RDX for copy alignment fix. */ > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > +# endif > + add %rax, %rdx > + > +L(skip_copy_alignment_fix): > + /* Load second vector. */ > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x2) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(continue_third_vector) > + > + /* Jump below/equal(instead of below) used here, because last > + copy chracter must be NULL. */ > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_second_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy second vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > + > +L(continue_third_vector): > + /* Load third vector. */ > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x3) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(continue_fourth_vector) > + > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_third_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy third vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) > + > +L(continue_fourth_vector): > + /* Load fourth vector. */ > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x4) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_4x_align) > + > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_fourth_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy fourth vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) > + > + > +L(loop_4x_align): > + /* Jump to loop if RSI is already 4 vector align. */ > + test $(VEC_SIZE * 4 - 1), %esi > + jz L(loop_4x_read) > + > + mov %rsi, %rcx > + > + /* Align RSI to 4x vector. 
*/ > + and $(VEC_SIZE * -4), %rsi > + sub %rsi, %rcx > + > + /* Adjust RDI for RSI alignment fix. */ > + sub %rcx, %rdi > + > + /* Jump to loop if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_4x_read) > + > +# ifdef USE_AS_WCSLCPY > + shr $2, %rcx > +# endif > + > + /* Adjust RDX for RSI alignment fix. */ > + add %rcx, %rdx > + jmp L(loop_4x_read) > + > + .p2align 4,,6 > +L(loop_4x_vec): > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_partial_copy_return) > + cmp $(CHAR_PER_VEC * 4), %rdx > + jbe L(loop_partial_copy) > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) > + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) > + sub $(CHAR_PER_VEC * 4), %rdx > + > +L(loop_partial_copy_return): > + sub $(VEC_SIZE * -4), %rsi > + sub $(VEC_SIZE * -4), %rdi > + > +L(loop_4x_read): > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) > + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) > + VPMINU %VMM(1), %VMM(2), %VMM(5) > + VPMINU %VMM(3), %VMM(4), %VMM(6) > + VPMINU %VMM(5), %VMM(6), %VMM(7) > + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) > + vptest %VMM(7), %VMM(7) > + > + jz L(loop_4x_vec) > + > + /* Check if string ends in first vector or second vector. */ > + lea (VEC_SIZE * 4)(%rsi), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > +# endif > + xor %r10, %r10 > + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) > + vptest %VMM(6), %VMM(6) > + jnz L(endloop) > + sub $(CHAR_PER_VEC * -2), %rax > + mov $(CHAR_PER_VEC * 2), %r10 > + VMOVA %VMM(3), %VMM(1) > + VMOVA %VMM(4), %VMM(2) > + > +L(endloop): > + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) > + PMOVMSK %VMM(1), %rcx > + PMOVMSK %VMM(2), %r9 > + shlq $32, %r9 > + orq %r9, %rcx > + bsf %rcx, %rcx > + /* Shift RCX by 2, VPMOVMSK has only byte version. */ > +# ifdef USE_AS_WCSLCPY > + shr $2, %rcx > +# endif > + /* At this point RAX has length to return. 
*/ > + add %rcx, %rax > + test %rdx, %rdx > + jz L(ret) > + > + /* Add 1 to account for NULL character in RDX comparison. */ > + lea 1(%r10, %rcx), %rcx > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(loop_partial_copy): > + cmp $(CHAR_PER_VEC * 2), %rdx > + jbe L(loop_partial_first_half) > + /* Reload first 2 vector. */ > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > + > +L(loop_partial_first_half): > + /* Go back 2 vector from last and use overlapping copy. > + (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) > + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) > + */ > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > + xor %rdx, %rdx > + vptest %VMM(7), %VMM(7) > + jz L(loop_partial_copy_return) > + ret > + > + .p2align 4 > +L(page_cross): > + mov %rsi, %rcx > + mov %rsi, %r11 > + and $-VEC_SIZE, %r11 > + and $(VEC_SIZE - 1), %rcx > + VMOVA (%r11), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + PMOVMSK %VMM(2), %eax > + shr %cl, %eax > + jz L(page_cross_continue) > + > +L(ret_vec_x1): > + bsf %eax, %eax > +# ifdef USE_AS_WCSLCPY > + shr $2, %eax > +# endif > + /* Increment by 1 to account for NULL char. 
*/ > + lea 1(%eax), %ecx > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + test %rdx, %rdx > + jz L(ret) > + > +L(page_cross_small_vec_copy): > + cmp $(16 / CHAR_SIZE), %rdx > + jbe L(copy_8_byte_scalar) > + VMOVU (%rsi), %VMM_128(1) > + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) > + VMOVU %VMM_128(1), (%rdi) > + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %rdx, %rdx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_8_byte_scalar): > + cmp $(8 / CHAR_SIZE), %rdx > + jbe L(copy_4_byte_scalar) > + movq (%rsi), %r10 > + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 > + movq %r10, (%rdi) > + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_4_byte_scalar): > +# ifndef USE_AS_WCSLCPY > + cmp $4, %rdx > + jbe L(copy_2_byte_scalar) > +# endif > + movl (%rsi), %r10d > + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d > + movl %r10d, (%rdi) > + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +# ifndef USE_AS_WCSLCPY > +L(copy_2_byte_scalar): > + cmp $2, %rdx > + jbe L(copy_1_byte_scalar) > + movw (%rsi), %r10w > + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w > + movw %r10w, (%rdi) > + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_1_byte_scalar): > + MOVU (%rsi), %r10b > + MOVU %r10b, (%rdi) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > +# endif > + > +L(ret_vec_x2): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. 
*/ > + lea VEC_SIZE(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_second_vector): > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_third_vector) > + > +L(ret): > + ret > + > +L(ret_vec_x3): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. */ > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_third_vector): > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_fourth_vector) > + ret > + > +L(ret_vec_x4): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. 
*/ > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_fourth_vector): > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_fourth_vector) > + ret > + > +END (STRLCPY) > +#endif > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c > new file mode 100644 > index 0000000000..eee3b7b086 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c > @@ -0,0 +1,25 @@ > +/* strlcpy generic. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > + > +#include <isa-level.h> > +#if ISA_SHOULD_BUILD (1) > +# define __strlcpy __strlcpy_generic > +# include <string/strlcpy.c> > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c > new file mode 100644 > index 0000000000..ded41fbcfb > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy.c > @@ -0,0 +1,36 @@ > +/* Multiple versions of strlcpy. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. 
*/ > +#if IS_IN (libc) > +# define __strlcpy __redirect_strlcpy > +# include <string.h> > +# undef __strlcpy > + > +# define SYMBOL_NAME strlcpy > +# include "ifunc-strlcpy.h" > + > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ()); > +weak_alias (__strlcpy, strlcpy) > + > +# ifdef SHARED > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); > +# endif > +#endif > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > new file mode 100644 > index 0000000000..dafc20ded0 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > @@ -0,0 +1,4 @@ > +#define STRLCPY __wcslcpy_avx2 > +#define USE_AS_WCSLCPY 1 > + > +#include "strlcpy-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > new file mode 100644 > index 0000000000..ffd3c0e846 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > @@ -0,0 +1,25 @@ > +/* wcslcpy generic. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > + > +#include <isa-level.h> > +#if ISA_SHOULD_BUILD (1) > +# define __wcslcpy __wcslcpy_generic > +# include <wcsmbs/wcslcpy.c> > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c > new file mode 100644 > index 0000000000..371ef9626c > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c > @@ -0,0 +1,35 @@ > +/* Multiple versions of wcslcpy. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. */ > +#if IS_IN (libc) > +# define __wcslcpy __redirect_wcslcpy > +# include <wchar.h> > +# undef __wcslcpy > + > +# define SYMBOL_NAME wcslcpy > +# include "ifunc-strlcpy.h" > + > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ()); > +weak_alias (__wcslcpy, wcslcpy) > +# ifdef SHARED > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) > + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); > +# endif > +#endif > -- > 2.38.1 > Think we should at the very least wait for the generic strlcpy codes to land first. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 21:04 ` Noah Goldstein @ 2023-06-30 21:27 ` Paul Eggert 2023-06-30 22:21 ` Sunil Pandey 2023-07-03 12:55 ` Adhemerval Zanella Netto 2023-07-01 9:41 ` Florian Weimer 1 sibling, 2 replies; 24+ messages in thread From: Paul Eggert @ 2023-06-30 21:27 UTC (permalink / raw) To: Noah Goldstein, Sunil K Pandey; +Cc: libc-alpha, hjl.tools On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote: > Think we should at the very least wait for the generic strlcpy codes > to land first. Let's not optimize these functions at all, unless there's good and measured reason to do so. In practice I expected they're called with small sizes for which optimization is a net minus as it consumes valuable maintenance time with no real benefit. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 21:27 ` Paul Eggert @ 2023-06-30 22:21 ` Sunil Pandey 2023-06-30 23:22 ` Noah Goldstein 2023-07-03 16:30 ` Paul Eggert 1 sibling, 2 replies; 24+ messages in thread From: Sunil Pandey @ 2023-06-30 22:21 UTC (permalink / raw) To: Paul Eggert; +Cc: Noah Goldstein, libc-alpha, hjl.tools [-- Attachment #1.1: Type: text/plain, Size: 702 bytes --] On Fri, Jun 30, 2023 at 2:27 PM Paul Eggert <eggert@cs.ucla.edu> wrote: > On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote: > > Think we should at the very least wait for the generic strlcpy codes > > to land first. > > Let's not optimize these functions at all, unless there's good and > measured reason to do so. In practice I expected they're called with > small sizes for which optimization is a net minus as it consumes > valuable maintenance time with no real benefit. > Hi Paul, Attached is strlcpy/wcslcpy microbenchmark data based on Noah strlcpy/wcslcpy microbenchmark patch. 
https://sourceware.org/pipermail/libc-alpha/2023-April/147557.html Thanks, Sunil [-- Attachment #2: bench-wcslcpy.txt --] [-- Type: text/plain, Size: 18368 bytes --] Function: wcslcpy Variant: __wcslcpy_avx2 __wcslcpy_generic ======================================================================================================================== len=16, align1=1, align2=1, n=16: 14.99 ( 24.63%) 19.89 len=16, align1=1, align2=1, n=16: 14.58 ( 19.61%) 18.13 len=16, align1=1, align2=2, n=16: 16.99 ( 4.02%) 17.70 len=16, align1=2, align2=1, n=16: 15.14 ( 17.08%) 18.25 len=2, align1=7, align2=2, n=4: 8.40 ( 44.41%) 15.11 len=4, align1=2, align2=7, n=2: 10.91 ( 42.41%) 18.95 len=2, align1=7, align2=2, n=4: 8.92 ( 34.99%) 13.72 len=4, align1=2, align2=7, n=2: 10.92 ( 42.05%) 18.84 len=16, align1=2, align2=2, n=16: 15.70 ( 11.97%) 17.84 len=16, align1=2, align2=2, n=16: 14.83 ( 16.82%) 17.83 len=16, align1=2, align2=4, n=16: 17.30 ( -0.46%) 17.22 len=16, align1=4, align2=2, n=16: 15.44 ( 15.20%) 18.21 len=4, align1=6, align2=4, n=8: 12.87 ( 14.74%) 15.09 len=8, align1=4, align2=6, n=4: 13.72 ( 25.95%) 18.53 len=4, align1=6, align2=4, n=8: 12.85 ( 9.03%) 14.13 len=8, align1=4, align2=6, n=4: 12.67 ( 31.60%) 18.52 len=16, align1=3, align2=3, n=16: 14.57 ( 15.76%) 17.30 len=16, align1=3, align2=3, n=16: 14.82 ( 14.03%) 17.23 len=16, align1=3, align2=6, n=16: 17.02 ( 3.24%) 17.59 len=16, align1=6, align2=3, n=16: 15.04 ( 19.50%) 18.68 len=8, align1=5, align2=6, n=16: 14.96 ( 8.00%) 16.26 len=16, align1=6, align2=5, n=8: 13.70 ( 25.56%) 18.41 len=8, align1=5, align2=6, n=16: 14.54 ( 7.87%) 15.78 len=16, align1=6, align2=5, n=8: 12.35 ( 24.15%) 16.28 len=16, align1=4, align2=4, n=16: 13.93 ( 14.41%) 16.28 len=16, align1=4, align2=4, n=16: 13.63 ( 16.32%) 16.29 len=16, align1=4, align2=0, n=16: 12.97 ( 21.40%) 16.51 len=16, align1=0, align2=4, n=16: 14.09 ( 15.59%) 16.70 len=16, align1=4, align2=0, n=32: 13.75 ( 31.95%) 20.20 len=32, align1=0, align2=4, n=16: 19.20 ( -0.01%) 19.20 
len=16, align1=4, align2=0, n=32: 14.45 ( 31.46%) 21.08 len=32, align1=0, align2=4, n=16: 19.07 ( -1.55%) 18.78 len=16, align1=5, align2=5, n=16: 14.89 ( 15.97%) 17.72 len=16, align1=5, align2=5, n=16: 13.78 ( 15.12%) 16.23 len=16, align1=5, align2=2, n=16: 14.89 ( 13.72%) 17.26 len=16, align1=2, align2=5, n=16: 17.32 ( -0.72%) 17.20 len=32, align1=3, align2=2, n=64: 23.78 ( 20.84%) 30.05 len=64, align1=2, align2=3, n=32: 24.54 ( 0.82%) 24.74 len=32, align1=3, align2=2, n=64: 22.48 ( 17.99%) 27.41 len=64, align1=2, align2=3, n=32: 22.63 ( 8.72%) 24.79 len=16, align1=6, align2=6, n=16: 14.76 ( 14.10%) 17.19 len=16, align1=6, align2=6, n=16: 14.57 ( 16.81%) 17.52 len=16, align1=6, align2=4, n=16: 14.88 ( 13.70%) 17.25 len=16, align1=4, align2=6, n=16: 16.29 ( -0.14%) 16.27 len=64, align1=2, align2=4, n=128: 28.40 ( 9.37%) 31.34 len=128, align1=4, align2=2, n=64: 28.48 ( 10.08%) 31.67 len=64, align1=2, align2=4, n=128: 29.65 ( 11.33%) 33.44 len=128, align1=4, align2=2, n=64: 30.18 ( 6.40%) 32.25 len=16, align1=7, align2=7, n=16: 14.86 ( 8.40%) 16.22 len=16, align1=7, align2=7, n=16: 13.78 ( 16.30%) 16.47 len=16, align1=7, align2=6, n=16: 14.23 ( 12.27%) 16.22 len=16, align1=6, align2=7, n=16: 16.30 ( -0.53%) 16.22 len=128, align1=1, align2=6, n=256: 35.07 ( 25.88%) 47.32 len=256, align1=6, align2=1, n=128: 45.32 ( 11.90%) 51.44 len=128, align1=1, align2=6, n=256: 35.14 ( 24.65%) 46.64 len=256, align1=6, align2=1, n=128: 43.26 ( 15.54%) 51.22 len=8, align1=0, align2=0, n=16: 13.17 ( 29.35%) 18.65 len=32, align1=0, align2=0, n=16: 18.81 ( -3.57%) 18.17 len=8, align1=7, align2=2, n=16: 13.92 ( -7.07%) 13.00 len=32, align1=7, align2=2, n=16: 17.52 ( 14.77%) 20.55 len=16, align1=0, align2=0, n=32: 13.85 ( 33.77%) 20.91 len=64, align1=0, align2=0, n=32: 23.32 ( 7.24%) 25.14 len=16, align1=6, align2=4, n=32: 14.87 ( 17.40%) 18.00 len=64, align1=6, align2=4, n=32: 23.32 ( 14.99%) 27.43 len=32, align1=0, align2=0, n=64: 21.05 ( 16.72%) 25.28 len=128, align1=0, align2=0, n=64: 
28.81 ( 11.25%) 32.46 len=32, align1=5, align2=6, n=64: 24.68 ( 10.16%) 27.47 len=128, align1=5, align2=6, n=64: 28.66 ( 7.24%) 30.89 len=64, align1=0, align2=0, n=128: 24.98 ( 21.37%) 31.77 len=256, align1=0, align2=0, n=128: 43.90 ( 18.92%) 54.14 len=64, align1=4, align2=0, n=128: 26.13 ( 24.65%) 34.68 len=256, align1=4, align2=0, n=128: 44.27 ( 15.06%) 52.12 len=128, align1=0, align2=0, n=256: 34.29 ( 33.53%) 51.58 len=512, align1=0, align2=0, n=256: 68.94 ( 8.14%) 75.05 len=128, align1=3, align2=2, n=256: 36.06 ( 15.45%) 42.65 len=512, align1=3, align2=2, n=256: 65.15 ( 12.33%) 74.32 len=256, align1=0, align2=0, n=512: 46.37 ( 30.42%) 66.64 len=1024, align1=0, align2=0, n=512: 114.89 ( 8.32%) 125.31 len=256, align1=2, align2=4, n=512: 56.05 ( 16.50%) 67.12 len=1024, align1=2, align2=4, n=512: 179.87 (-52.13%) 118.24 len=512, align1=0, align2=0, n=1024: 68.16 ( 29.70%) 96.96 len=512, align1=1, align2=6, n=1024: 119.39 (-26.04%) 94.72 len=128, align1=1, align2=0, n=64: 27.46 ( 17.94%) 33.46 len=128, align1=0, align2=0, n=64: 29.69 ( -2.62%) 28.93 len=128, align1=0, align2=0, n=64: 27.25 ( 6.15%) 29.03 len=128, align1=0, align2=0, n=64: 27.24 ( 6.61%) 29.17 len=64, align1=1, align2=0, n=128: 25.50 ( 21.40%) 32.44 len=64, align1=0, align2=0, n=128: 23.50 ( 27.08%) 32.22 len=64, align1=0, align2=0, n=128: 24.88 ( 16.98%) 29.97 len=64, align1=0, align2=0, n=128: 24.59 ( 22.98%) 31.92 len=128, align1=1, align2=0, n=96: 27.46 ( 29.72%) 39.07 len=128, align1=0, align2=0, n=96: 28.55 ( 20.33%) 35.83 len=128, align1=0, align2=0, n=96: 27.25 ( 24.21%) 35.95 len=128, align1=0, align2=0, n=96: 28.53 ( 19.86%) 35.59 len=96, align1=1, align2=0, n=128: 30.65 ( 18.65%) 37.68 len=96, align1=0, align2=0, n=128: 28.06 ( 19.41%) 34.82 len=96, align1=0, align2=0, n=128: 27.92 ( 20.27%) 35.02 len=96, align1=0, align2=0, n=128: 28.06 ( 19.43%) 34.83 len=128, align1=1, align2=0, n=128: 31.31 ( 28.02%) 43.51 len=128, align1=0, align2=0, n=128: 28.52 ( 29.34%) 40.37 len=128, align1=0, 
align2=0, n=128: 27.25 ( 32.18%) 40.17 len=128, align1=0, align2=0, n=128: 27.46 ( 31.33%) 39.99 len=128, align1=1, align2=0, n=128: 31.32 ( 28.00%) 43.50 len=128, align1=0, align2=0, n=128: 27.46 ( 31.03%) 39.82 len=128, align1=0, align2=0, n=128: 27.25 ( 32.30%) 40.25 len=128, align1=0, align2=0, n=128: 27.25 ( 31.97%) 40.05 len=128, align1=1, align2=0, n=160: 34.00 ( 20.12%) 42.56 len=128, align1=0, align2=0, n=160: 32.19 ( 30.63%) 46.40 len=128, align1=0, align2=0, n=160: 32.17 ( 28.12%) 44.76 len=128, align1=0, align2=0, n=160: 32.39 ( 27.63%) 44.76 len=160, align1=1, align2=0, n=128: 29.84 ( 35.97%) 46.61 len=160, align1=0, align2=0, n=128: 31.79 ( 25.56%) 42.71 len=160, align1=0, align2=0, n=128: 32.00 ( 24.86%) 42.59 len=160, align1=0, align2=0, n=128: 31.79 ( 25.85%) 42.86 len=128, align1=1, align2=0, n=192: 33.81 ( 21.08%) 42.84 len=128, align1=0, align2=0, n=192: 32.38 ( 29.98%) 46.24 len=128, align1=0, align2=0, n=192: 32.38 ( 27.38%) 44.58 len=128, align1=0, align2=0, n=192: 32.18 ( 28.29%) 44.87 len=192, align1=1, align2=0, n=128: 34.71 ( 27.54%) 47.90 len=192, align1=0, align2=0, n=128: 35.25 ( 22.44%) 45.44 len=192, align1=0, align2=0, n=128: 35.30 ( 21.97%) 45.24 len=192, align1=0, align2=0, n=128: 35.03 ( 22.17%) 45.01 len=256, align1=1, align2=0, n=192: 39.58 ( 30.82%) 57.21 len=256, align1=0, align2=0, n=192: 42.27 ( 24.21%) 55.77 len=256, align1=0, align2=0, n=192: 41.10 ( 26.00%) 55.54 len=256, align1=0, align2=0, n=192: 43.11 ( 21.51%) 54.92 len=192, align1=1, align2=0, n=256: 38.15 ( 29.78%) 54.33 len=192, align1=0, align2=0, n=256: 37.43 ( 32.27%) 55.26 len=192, align1=0, align2=0, n=256: 37.43 ( 32.46%) 55.42 len=192, align1=0, align2=0, n=256: 37.43 ( 32.46%) 55.42 len=256, align1=1, align2=0, n=224: 40.87 ( 31.48%) 59.65 len=256, align1=0, align2=0, n=224: 41.66 ( 26.95%) 57.02 len=256, align1=0, align2=0, n=224: 41.08 ( 28.22%) 57.24 len=256, align1=0, align2=0, n=224: 41.17 ( 27.86%) 57.07 len=224, align1=1, align2=0, n=256: 38.96 ( 
32.41%) 57.65 len=224, align1=0, align2=0, n=256: 42.27 ( 28.61%) 59.21 len=224, align1=0, align2=0, n=256: 40.15 ( 32.33%) 59.34 len=224, align1=0, align2=0, n=256: 40.10 ( 32.78%) 59.65 len=256, align1=1, align2=0, n=256: 41.22 ( 33.31%) 61.80 len=256, align1=0, align2=0, n=256: 41.52 ( 29.99%) 59.30 len=256, align1=0, align2=0, n=256: 41.17 ( 29.82%) 58.66 len=256, align1=0, align2=0, n=256: 41.18 ( 30.68%) 59.40 len=256, align1=1, align2=0, n=256: 47.52 ( 29.49%) 67.39 len=256, align1=0, align2=0, n=256: 44.83 ( 30.61%) 64.60 len=256, align1=0, align2=0, n=256: 45.50 ( 29.57%) 64.60 len=256, align1=0, align2=0, n=256: 44.83 ( 29.93%) 63.97 len=256, align1=1, align2=0, n=288: 44.21 ( 33.34%) 66.32 len=256, align1=0, align2=0, n=288: 41.58 ( 33.60%) 62.62 len=256, align1=0, align2=0, n=288: 44.57 ( 30.02%) 63.69 len=256, align1=0, align2=0, n=288: 42.80 ( 35.55%) 66.41 len=288, align1=1, align2=0, n=256: 46.39 ( 29.55%) 65.85 len=288, align1=0, align2=0, n=256: 45.95 ( 28.95%) 64.68 len=288, align1=0, align2=0, n=256: 46.26 ( 29.92%) 66.02 len=288, align1=0, align2=0, n=256: 48.47 ( 20.26%) 60.79 len=256, align1=1, align2=0, n=320: 41.81 ( 31.09%) 60.67 len=256, align1=0, align2=0, n=320: 41.87 ( 34.40%) 63.82 len=256, align1=0, align2=0, n=320: 41.52 ( 34.47%) 63.35 len=256, align1=0, align2=0, n=320: 44.29 ( 33.29%) 66.39 len=320, align1=1, align2=0, n=256: 48.70 ( 29.59%) 69.16 len=320, align1=0, align2=0, n=256: 46.47 ( 24.55%) 61.60 len=320, align1=0, align2=0, n=256: 45.68 ( 27.30%) 62.83 len=320, align1=0, align2=0, n=256: 47.34 ( 23.15%) 61.60 len=512, align1=1, align2=0, n=448: 72.59 ( 23.10%) 94.39 len=512, align1=0, align2=0, n=448: 68.84 ( 38.34%) 111.65 len=512, align1=0, align2=0, n=448: 69.80 ( 36.56%) 110.03 len=512, align1=0, align2=0, n=448: 67.31 ( 40.49%) 113.10 len=448, align1=1, align2=0, n=512: 65.75 ( 28.23%) 91.61 len=448, align1=0, align2=0, n=512: 61.41 ( 30.51%) 88.36 len=448, align1=0, align2=0, n=512: 65.19 ( 29.15%) 92.02 len=448, 
align1=0, align2=0, n=512: 61.07 ( 31.08%) 88.61 len=512, align1=1, align2=0, n=480: 75.89 ( 16.65%) 91.05 len=512, align1=0, align2=0, n=480: 66.17 ( 26.56%) 90.10 len=512, align1=0, align2=0, n=480: 65.74 ( 26.92%) 89.96 len=512, align1=0, align2=0, n=480: 66.30 ( 26.50%) 90.21 len=480, align1=1, align2=0, n=512: 65.24 ( 28.33%) 91.03 len=480, align1=0, align2=0, n=512: 64.50 ( 30.43%) 92.70 len=480, align1=0, align2=0, n=512: 64.49 ( 29.90%) 91.99 len=480, align1=0, align2=0, n=512: 64.50 ( 30.11%) 92.29 len=512, align1=1, align2=0, n=512: 68.43 ( 28.04%) 95.09 len=512, align1=0, align2=0, n=512: 67.02 ( 27.18%) 92.05 len=512, align1=0, align2=0, n=512: 67.02 ( 27.01%) 91.82 len=512, align1=0, align2=0, n=512: 67.02 ( 27.05%) 91.87 len=512, align1=1, align2=0, n=512: 67.68 ( 28.93%) 95.23 len=512, align1=0, align2=0, n=512: 67.03 ( 27.48%) 92.42 len=512, align1=0, align2=0, n=512: 67.02 ( 27.15%) 92.00 len=512, align1=0, align2=0, n=512: 67.02 ( 27.33%) 92.23 len=512, align1=1, align2=0, n=544: 70.63 ( 26.35%) 95.89 len=512, align1=0, align2=0, n=544: 67.72 ( 29.97%) 96.70 len=512, align1=0, align2=0, n=544: 67.71 ( 30.17%) 96.95 len=512, align1=0, align2=0, n=544: 67.71 ( 29.99%) 96.72 len=544, align1=1, align2=0, n=512: 83.22 ( 13.39%) 96.08 len=544, align1=0, align2=0, n=512: 68.97 ( 27.78%) 95.50 len=544, align1=0, align2=0, n=512: 71.83 ( 24.53%) 95.18 len=544, align1=0, align2=0, n=512: 68.99 ( 27.28%) 94.87 len=512, align1=1, align2=0, n=576: 72.60 ( 28.17%) 101.08 len=512, align1=0, align2=0, n=576: 72.27 ( 25.52%) 97.03 len=512, align1=0, align2=0, n=576: 67.75 ( 30.53%) 97.52 len=512, align1=0, align2=0, n=576: 72.53 ( 29.10%) 102.30 len=576, align1=1, align2=0, n=512: 82.05 ( 16.23%) 97.94 len=576, align1=0, align2=0, n=512: 71.35 ( 26.64%) 97.26 len=576, align1=0, align2=0, n=512: 74.36 ( 23.52%) 97.23 len=576, align1=0, align2=0, n=512: 71.58 ( 26.50%) 97.38 len=1024, align1=1, align2=0, n=960: 147.26 ( 11.02%) 165.50 len=1024, align1=0, align2=0, 
n=960: 134.00 ( 13.30%) 154.55 len=1024, align1=0, align2=0, n=960: 134.31 ( 13.26%) 154.84 len=1024, align1=0, align2=0, n=960: 134.53 ( 12.97%) 154.58 len=960, align1=1, align2=0, n=1024: 129.09 ( 20.84%) 163.08 len=960, align1=0, align2=0, n=1024: 113.32 ( 26.35%) 153.86 len=960, align1=0, align2=0, n=1024: 113.08 ( 26.77%) 154.42 len=960, align1=0, align2=0, n=1024: 113.10 ( 26.50%) 153.88 len=1024, align1=1, align2=0, n=992: 138.81 ( 18.75%) 170.85 len=1024, align1=0, align2=0, n=992: 134.08 ( 14.74%) 157.25 len=1024, align1=0, align2=0, n=992: 133.96 ( 14.83%) 157.28 len=1024, align1=0, align2=0, n=992: 133.76 ( 15.03%) 157.42 len=992, align1=1, align2=0, n=1024: 136.17 ( 18.21%) 166.50 len=992, align1=0, align2=0, n=1024: 116.81 ( 29.71%) 166.18 len=992, align1=0, align2=0, n=1024: 116.46 ( 26.72%) 158.92 len=992, align1=0, align2=0, n=1024: 116.63 ( 26.64%) 158.99 len=1024, align1=1, align2=0, n=1024: 150.63 ( 14.32%) 175.81 len=1024, align1=0, align2=0, n=1024: 119.07 ( 26.07%) 161.07 len=1024, align1=0, align2=0, n=1024: 119.10 ( 26.06%) 161.08 len=1024, align1=0, align2=0, n=1024: 118.91 ( 26.16%) 161.04 len=1024, align1=1, align2=0, n=1024: 158.94 ( 13.17%) 183.06 len=1024, align1=0, align2=0, n=1024: 120.68 ( 27.45%) 166.35 len=1024, align1=0, align2=0, n=1024: 119.16 ( 26.03%) 161.09 len=1024, align1=0, align2=0, n=1024: 119.16 ( 26.02%) 161.07 len=1024, align1=1, align2=0, n=1056: 162.90 ( 15.29%) 192.30 len=1024, align1=0, align2=0, n=1056: 140.90 ( 26.76%) 192.38 len=1024, align1=0, align2=0, n=1056: 140.05 ( 30.28%) 200.89 len=1024, align1=0, align2=0, n=1056: 146.22 ( 25.04%) 195.08 len=1056, align1=1, align2=0, n=1024: 166.62 ( 8.97%) 183.03 len=1056, align1=0, align2=0, n=1024: 121.48 ( 25.46%) 162.98 len=1056, align1=0, align2=0, n=1024: 123.93 ( 24.01%) 163.09 len=1056, align1=0, align2=0, n=1024: 127.86 ( 25.98%) 172.73 len=1024, align1=1, align2=0, n=1088: 167.49 ( 12.93%) 192.36 len=1024, align1=0, align2=0, n=1088: 147.48 ( 23.34%) 192.38 
len=1024, align1=0, align2=0, n=1088: 140.01 ( 27.22%) 192.39 len=1024, align1=0, align2=0, n=1088: 140.09 ( 27.23%) 192.51 len=1088, align1=1, align2=0, n=1024: 159.00 ( 13.46%) 183.73 len=1088, align1=0, align2=0, n=1024: 143.31 ( 14.25%) 167.13 len=1088, align1=0, align2=0, n=1024: 140.46 ( 14.32%) 163.93 len=1088, align1=0, align2=0, n=1024: 139.85 ( 14.69%) 163.92 [-- Attachment #3: bench-strlcpy.txt --] [-- Type: text/plain, Size: 18518 bytes --] Function: strlcpy Variant: __strlcpy_avx2 __strlcpy_generic ======================================================================================================================== len=16, align1=1, align2=1, n=16: 11.11 ( 32.32%) 16.41 len=16, align1=1, align2=1, n=16: 10.73 ( 32.83%) 15.98 len=16, align1=1, align2=2, n=16: 10.53 ( 33.23%) 15.77 len=16, align1=2, align2=1, n=16: 10.89 ( 32.50%) 16.13 len=2, align1=7, align2=2, n=4: 8.06 ( 35.05%) 12.41 len=4, align1=2, align2=7, n=2: 8.66 ( 37.31%) 13.82 len=2, align1=7, align2=2, n=4: 7.78 ( 33.85%) 11.77 len=4, align1=2, align2=7, n=2: 8.70 ( 37.88%) 14.01 len=16, align1=2, align2=2, n=16: 10.43 ( 31.86%) 15.31 len=16, align1=2, align2=2, n=16: 10.87 ( 30.40%) 15.62 len=16, align1=2, align2=4, n=16: 10.47 ( 30.24%) 15.01 len=16, align1=4, align2=2, n=16: 10.56 ( 31.99%) 15.53 len=4, align1=6, align2=4, n=8: 11.33 ( 18.99%) 13.99 len=8, align1=4, align2=6, n=4: 10.44 ( 27.20%) 14.34 len=4, align1=6, align2=4, n=8: 11.43 ( 13.14%) 13.15 len=8, align1=4, align2=6, n=4: 10.83 ( 28.59%) 15.16 len=16, align1=3, align2=3, n=16: 10.39 ( 33.18%) 15.54 len=16, align1=3, align2=3, n=16: 10.13 ( 38.74%) 16.53 len=16, align1=3, align2=6, n=16: 10.29 ( 37.51%) 16.46 len=16, align1=6, align2=3, n=16: 10.56 ( 31.97%) 15.53 len=8, align1=5, align2=6, n=16: 10.48 ( 22.21%) 13.47 len=16, align1=6, align2=5, n=8: 10.95 ( 27.84%) 15.17 len=8, align1=5, align2=6, n=16: 10.55 ( 23.09%) 13.71 len=16, align1=6, align2=5, n=8: 10.98 ( 27.79%) 15.20 len=16, align1=4, align2=4, n=16: 10.39 ( 
32.51%) 15.40 len=16, align1=4, align2=4, n=16: 10.38 ( 33.76%) 15.68 len=16, align1=4, align2=0, n=16: 10.57 ( 28.87%) 14.86 len=16, align1=0, align2=4, n=16: 10.28 ( 34.27%) 15.64 len=16, align1=4, align2=0, n=32: 10.59 ( 23.24%) 13.79 len=32, align1=0, align2=4, n=16: 11.66 ( 30.50%) 16.77 len=16, align1=4, align2=0, n=32: 10.67 ( 23.98%) 14.04 len=32, align1=0, align2=4, n=16: 11.06 ( 33.61%) 16.66 len=16, align1=5, align2=5, n=16: 10.43 ( 33.52%) 15.68 len=16, align1=5, align2=5, n=16: 10.49 ( 33.47%) 15.77 len=16, align1=5, align2=2, n=16: 10.54 ( 29.46%) 14.94 len=16, align1=2, align2=5, n=16: 10.20 ( 31.63%) 14.92 len=32, align1=3, align2=2, n=64: 13.88 ( 0.59%) 13.97 len=64, align1=2, align2=3, n=32: 11.72 ( 22.36%) 15.09 len=32, align1=3, align2=2, n=64: 13.49 ( 2.26%) 13.81 len=64, align1=2, align2=3, n=32: 11.54 ( 26.22%) 15.64 len=16, align1=6, align2=6, n=16: 10.39 ( 27.70%) 14.37 len=16, align1=6, align2=6, n=16: 9.94 ( 32.04%) 14.63 len=16, align1=6, align2=4, n=16: 9.91 ( 33.92%) 14.99 len=16, align1=4, align2=6, n=16: 10.19 ( 32.66%) 15.14 len=64, align1=2, align2=4, n=128: 14.66 ( 4.10%) 15.29 len=128, align1=4, align2=2, n=64: 18.22 (-17.01%) 15.57 len=64, align1=2, align2=4, n=128: 14.64 ( 3.89%) 15.24 len=128, align1=4, align2=2, n=64: 18.22 (-14.83%) 15.86 len=16, align1=7, align2=7, n=16: 9.86 ( 30.07%) 14.11 len=16, align1=7, align2=7, n=16: 9.86 ( 30.09%) 14.11 len=16, align1=7, align2=6, n=16: 9.93 ( 32.92%) 14.81 len=16, align1=6, align2=7, n=16: 9.83 ( 30.41%) 14.13 len=128, align1=1, align2=6, n=256: 22.24 ( 9.63%) 24.61 len=256, align1=6, align2=1, n=128: 20.91 ( 12.22%) 23.82 len=128, align1=1, align2=6, n=256: 22.21 ( 9.86%) 24.64 len=256, align1=6, align2=1, n=128: 20.81 ( 12.85%) 23.88 len=8, align1=0, align2=0, n=16: 10.33 ( 20.37%) 12.97 len=32, align1=0, align2=0, n=16: 10.75 ( 32.13%) 15.84 len=8, align1=7, align2=2, n=16: 10.38 ( 20.33%) 13.02 len=32, align1=7, align2=2, n=16: 11.03 ( 30.36%) 15.84 len=16, align1=0, align2=0, 
n=32: 9.98 ( 26.96%) 13.67 len=64, align1=0, align2=0, n=32: 10.94 ( 26.69%) 14.92 len=16, align1=6, align2=4, n=32: 10.07 ( 22.77%) 13.04 len=64, align1=6, align2=4, n=32: 11.68 ( 22.22%) 15.01 len=32, align1=0, align2=0, n=64: 11.15 ( 11.26%) 12.57 len=128, align1=0, align2=0, n=64: 17.59 ( -6.54%) 16.51 len=32, align1=5, align2=6, n=64: 12.56 ( 12.27%) 14.32 len=128, align1=5, align2=6, n=64: 19.12 (-20.33%) 15.89 len=64, align1=0, align2=0, n=128: 12.70 ( 17.81%) 15.45 len=256, align1=0, align2=0, n=128: 22.12 ( 7.72%) 23.97 len=64, align1=4, align2=0, n=128: 12.84 ( 18.75%) 15.81 len=256, align1=4, align2=0, n=128: 21.48 ( 12.33%) 24.50 len=128, align1=0, align2=0, n=256: 19.17 ( 3.24%) 19.81 len=512, align1=0, align2=0, n=256: 26.55 ( 3.43%) 27.49 len=128, align1=3, align2=2, n=256: 20.07 ( 17.46%) 24.32 len=512, align1=3, align2=2, n=256: 26.65 ( 17.61%) 32.35 len=256, align1=0, align2=0, n=512: 22.48 ( 14.46%) 26.28 len=1024, align1=0, align2=0, n=512: 39.85 ( 12.47%) 45.53 len=256, align1=2, align2=4, n=512: 27.00 ( 8.13%) 29.39 len=1024, align1=2, align2=4, n=512: 43.97 ( 15.73%) 52.18 len=512, align1=0, align2=0, n=1024: 32.09 ( 29.08%) 45.25 len=2048, align1=0, align2=0, n=1024: 65.11 ( 7.02%) 70.02 len=512, align1=1, align2=6, n=1024: 35.13 ( 26.54%) 47.83 len=2048, align1=1, align2=6, n=1024: 80.38 (-15.59%) 69.53 len=128, align1=1, align2=0, n=64: 18.89 (-12.93%) 16.72 len=128, align1=0, align2=0, n=64: 16.93 ( -9.06%) 15.52 len=128, align1=0, align2=0, n=64: 16.92 ( -8.70%) 15.57 len=128, align1=0, align2=0, n=64: 17.58 (-12.44%) 15.63 len=64, align1=1, align2=0, n=128: 12.84 ( 18.40%) 15.74 len=64, align1=0, align2=0, n=128: 12.64 ( 19.60%) 15.72 len=64, align1=0, align2=0, n=128: 12.78 ( 17.35%) 15.47 len=64, align1=0, align2=0, n=128: 12.65 ( 18.44%) 15.51 len=128, align1=1, align2=0, n=96: 20.15 ( -9.88%) 18.34 len=128, align1=0, align2=0, n=96: 18.21 ( -3.68%) 17.57 len=128, align1=0, align2=0, n=96: 18.46 ( -5.09%) 17.57 len=128, align1=0, 
align2=0, n=96: 18.86 ( 1.57%) 19.16 len=96, align1=1, align2=0, n=128: 13.99 ( 15.86%) 16.62 len=96, align1=0, align2=0, n=128: 14.60 ( 11.99%) 16.59 len=96, align1=0, align2=0, n=128: 14.38 ( 20.13%) 18.00 len=96, align1=0, align2=0, n=128: 14.34 ( 11.75%) 16.25 len=128, align1=1, align2=0, n=128: 19.53 ( -0.01%) 19.53 len=128, align1=0, align2=0, n=128: 20.17 ( -3.30%) 19.53 len=128, align1=0, align2=0, n=128: 20.18 (-14.72%) 17.59 len=128, align1=0, align2=0, n=128: 20.82 ( -0.68%) 20.68 len=128, align1=1, align2=0, n=128: 20.01 ( -5.92%) 18.89 len=128, align1=0, align2=0, n=128: 21.37 ( -8.22%) 19.74 len=128, align1=0, align2=0, n=128: 20.17 (-14.75%) 17.57 len=128, align1=0, align2=0, n=128: 20.80 (-18.42%) 17.57 len=128, align1=1, align2=0, n=160: 19.65 ( 15.99%) 23.39 len=128, align1=0, align2=0, n=160: 19.14 ( 3.36%) 19.80 len=128, align1=0, align2=0, n=160: 19.18 ( 3.40%) 19.85 len=128, align1=0, align2=0, n=160: 19.15 ( 3.36%) 19.81 len=160, align1=1, align2=0, n=128: 18.88 ( 12.02%) 21.46 len=160, align1=0, align2=0, n=128: 20.16 ( 9.62%) 22.31 len=160, align1=0, align2=0, n=128: 20.80 ( 0.05%) 20.81 len=160, align1=0, align2=0, n=128: 20.16 ( 8.81%) 22.11 len=128, align1=1, align2=0, n=192: 19.65 ( 16.12%) 23.42 len=128, align1=0, align2=0, n=192: 19.14 ( 3.37%) 19.80 len=128, align1=0, align2=0, n=192: 19.18 ( 3.16%) 19.80 len=128, align1=0, align2=0, n=192: 19.19 ( 3.06%) 19.80 len=192, align1=1, align2=0, n=128: 18.86 ( 19.40%) 23.40 len=192, align1=0, align2=0, n=128: 20.81 ( 6.46%) 22.24 len=192, align1=0, align2=0, n=128: 20.81 ( 8.70%) 22.79 len=192, align1=0, align2=0, n=128: 21.46 ( 4.55%) 22.48 len=256, align1=1, align2=0, n=192: 20.83 ( 13.49%) 24.08 len=256, align1=0, align2=0, n=192: 21.35 ( 15.83%) 25.37 len=256, align1=0, align2=0, n=192: 20.83 ( 15.85%) 24.75 len=256, align1=0, align2=0, n=192: 21.87 ( 13.82%) 25.37 len=192, align1=1, align2=0, n=256: 22.27 ( 5.03%) 23.45 len=192, align1=0, align2=0, n=256: 19.58 ( 14.91%) 23.02 
len=192, align1=0, align2=0, n=256: 19.58 ( 14.91%) 23.01 len=192, align1=0, align2=0, n=256: 19.57 ( 16.70%) 23.50 len=256, align1=1, align2=0, n=224: 20.84 ( 19.02%) 25.74 len=256, align1=0, align2=0, n=224: 20.91 ( 15.73%) 24.81 len=256, align1=0, align2=0, n=224: 21.47 ( 10.79%) 24.07 len=256, align1=0, align2=0, n=224: 21.47 ( 10.79%) 24.06 len=224, align1=1, align2=0, n=256: 20.43 ( 16.38%) 24.43 len=224, align1=0, align2=0, n=256: 19.23 ( 16.62%) 23.06 len=224, align1=0, align2=0, n=256: 19.21 ( 16.84%) 23.10 len=224, align1=0, align2=0, n=256: 19.24 ( 16.77%) 23.12 len=256, align1=1, align2=0, n=256: 24.05 ( 5.44%) 25.44 len=256, align1=0, align2=0, n=256: 21.63 ( 14.98%) 25.45 len=256, align1=0, align2=0, n=256: 20.81 ( 13.64%) 24.10 len=256, align1=0, align2=0, n=256: 20.81 ( 13.67%) 24.10 len=256, align1=1, align2=0, n=256: 24.10 ( -0.20%) 24.05 len=256, align1=0, align2=0, n=256: 21.46 ( 16.56%) 25.71 len=256, align1=0, align2=0, n=256: 21.46 ( 10.79%) 24.05 len=256, align1=0, align2=0, n=256: 20.81 ( 14.64%) 24.38 len=256, align1=1, align2=0, n=288: 24.21 ( 15.45%) 28.63 len=256, align1=0, align2=0, n=288: 23.11 ( 12.68%) 26.46 len=256, align1=0, align2=0, n=288: 22.55 ( 14.25%) 26.29 len=256, align1=0, align2=0, n=288: 22.49 ( 14.49%) 26.30 len=288, align1=1, align2=0, n=256: 24.06 ( 5.36%) 25.42 len=288, align1=0, align2=0, n=256: 22.82 ( 7.35%) 24.63 len=288, align1=0, align2=0, n=256: 22.80 ( 10.98%) 25.62 len=288, align1=0, align2=0, n=256: 21.46 ( 17.56%) 26.03 len=256, align1=1, align2=0, n=320: 24.17 ( 15.82%) 28.71 len=256, align1=0, align2=0, n=320: 22.44 ( 14.79%) 26.34 len=256, align1=0, align2=0, n=320: 22.56 ( 14.14%) 26.27 len=256, align1=0, align2=0, n=320: 22.50 ( 14.35%) 26.27 len=320, align1=1, align2=0, n=256: 24.10 ( 8.33%) 26.29 len=320, align1=0, align2=0, n=256: 22.11 ( 16.28%) 26.41 len=320, align1=0, align2=0, n=256: 21.57 ( 16.27%) 25.76 len=320, align1=0, align2=0, n=256: 21.46 ( 15.42%) 25.37 len=512, align1=1, align2=0, 
n=448: 27.62 ( 31.43%) 40.28 len=512, align1=0, align2=0, n=448: 27.63 ( 32.11%) 40.70 len=512, align1=0, align2=0, n=448: 26.53 ( 35.05%) 40.85 len=512, align1=0, align2=0, n=448: 26.51 ( 34.99%) 40.78 len=448, align1=1, align2=0, n=512: 31.01 ( 28.08%) 43.11 len=448, align1=0, align2=0, n=512: 29.35 ( 36.94%) 46.54 len=448, align1=0, align2=0, n=512: 29.38 ( 37.01%) 46.63 len=448, align1=0, align2=0, n=512: 29.38 ( 37.01%) 46.64 len=512, align1=1, align2=0, n=480: 28.24 ( 35.42%) 43.73 len=512, align1=0, align2=0, n=480: 28.76 ( 28.65%) 40.31 len=512, align1=0, align2=0, n=480: 28.47 ( 30.82%) 41.16 len=512, align1=0, align2=0, n=480: 26.70 ( 31.68%) 39.08 len=480, align1=1, align2=0, n=512: 30.73 ( 26.75%) 41.95 len=480, align1=0, align2=0, n=512: 28.79 ( 34.92%) 44.23 len=480, align1=0, align2=0, n=512: 28.76 ( 35.89%) 44.87 len=480, align1=0, align2=0, n=512: 29.39 ( 35.67%) 45.68 len=512, align1=1, align2=0, n=512: 30.58 ( 25.28%) 40.92 len=512, align1=0, align2=0, n=512: 26.67 ( 31.41%) 38.87 len=512, align1=0, align2=0, n=512: 26.67 ( 34.15%) 40.50 len=512, align1=0, align2=0, n=512: 27.17 ( 30.43%) 39.06 len=512, align1=1, align2=0, n=512: 30.63 ( 25.12%) 40.91 len=512, align1=0, align2=0, n=512: 26.74 ( 31.56%) 39.06 len=512, align1=0, align2=0, n=512: 26.72 ( 31.55%) 39.04 len=512, align1=0, align2=0, n=512: 26.74 ( 31.11%) 38.81 len=512, align1=1, align2=0, n=544: 33.43 ( 21.70%) 42.69 len=512, align1=0, align2=0, n=544: 31.96 ( 27.77%) 44.25 len=512, align1=0, align2=0, n=544: 31.36 ( 27.40%) 43.20 len=512, align1=0, align2=0, n=544: 31.41 ( 27.14%) 43.11 len=544, align1=1, align2=0, n=512: 30.55 ( 25.76%) 41.15 len=544, align1=0, align2=0, n=512: 27.26 ( 31.01%) 39.51 len=544, align1=0, align2=0, n=512: 27.30 ( 30.74%) 39.41 len=544, align1=0, align2=0, n=512: 26.65 ( 32.38%) 39.40 len=512, align1=1, align2=0, n=576: 33.39 ( 21.56%) 42.58 len=512, align1=0, align2=0, n=576: 31.41 ( 28.37%) 43.85 len=512, align1=0, align2=0, n=576: 31.41 ( 27.57%) 
43.37 len=512, align1=0, align2=0, n=576: 31.42 ( 27.41%) 43.28 len=576, align1=1, align2=0, n=512: 30.61 ( 27.75%) 42.36 len=576, align1=0, align2=0, n=512: 27.66 ( 31.54%) 40.40 len=576, align1=0, align2=0, n=512: 28.04 ( 30.84%) 40.55 len=576, align1=0, align2=0, n=512: 27.94 ( 31.15%) 40.58 len=1024, align1=1, align2=0, n=960: 39.78 ( 28.72%) 55.80 len=1024, align1=0, align2=0, n=960: 40.87 ( 26.15%) 55.34 len=1024, align1=0, align2=0, n=960: 40.06 ( 26.81%) 54.73 len=1024, align1=0, align2=0, n=960: 40.25 ( 26.40%) 54.69 len=960, align1=1, align2=0, n=1024: 38.74 ( 31.46%) 56.52 len=960, align1=0, align2=0, n=1024: 38.37 ( 36.30%) 60.24 len=960, align1=0, align2=0, n=1024: 38.37 ( 36.36%) 60.30 len=960, align1=0, align2=0, n=1024: 39.88 ( 35.25%) 61.60 len=1024, align1=1, align2=0, n=992: 39.71 ( 28.13%) 55.26 len=1024, align1=0, align2=0, n=992: 39.85 ( 29.39%) 56.44 len=1024, align1=0, align2=0, n=992: 40.34 ( 25.81%) 54.37 len=1024, align1=0, align2=0, n=992: 40.31 ( 25.91%) 54.40 len=992, align1=1, align2=0, n=1024: 37.72 ( 32.49%) 55.88 len=992, align1=0, align2=0, n=1024: 38.37 ( 36.02%) 59.97 len=992, align1=0, align2=0, n=1024: 38.42 ( 35.53%) 59.60 len=992, align1=0, align2=0, n=1024: 38.40 ( 35.67%) 59.69 len=1024, align1=1, align2=0, n=1024: 40.88 ( 26.02%) 55.26 len=1024, align1=0, align2=0, n=1024: 40.36 ( 25.56%) 54.22 len=1024, align1=0, align2=0, n=1024: 40.31 ( 25.60%) 54.19 len=1024, align1=0, align2=0, n=1024: 40.35 ( 29.70%) 57.40 len=1024, align1=1, align2=0, n=1024: 41.03 ( 25.71%) 55.22 len=1024, align1=0, align2=0, n=1024: 40.37 ( 25.42%) 54.13 len=1024, align1=0, align2=0, n=1024: 40.31 ( 25.64%) 54.21 len=1024, align1=0, align2=0, n=1024: 40.32 ( 25.60%) 54.19 len=1024, align1=1, align2=0, n=1056: 41.06 ( 25.94%) 55.45 len=1024, align1=0, align2=0, n=1056: 41.06 ( 29.54%) 58.27 len=1024, align1=0, align2=0, n=1056: 41.05 ( 28.94%) 57.77 len=1024, align1=0, align2=0, n=1056: 41.02 ( 28.82%) 57.62 len=1056, align1=1, align2=0, n=1024: 
41.00 ( 26.23%) 55.59 len=1056, align1=0, align2=0, n=1024: 39.67 ( 27.07%) 54.39 len=1056, align1=0, align2=0, n=1024: 40.34 ( 29.19%) 56.97 len=1056, align1=0, align2=0, n=1024: 40.37 ( 27.52%) 55.71 len=1024, align1=1, align2=0, n=1088: 41.02 ( 26.33%) 55.68 len=1024, align1=0, align2=0, n=1088: 41.06 ( 30.82%) 59.35 len=1024, align1=0, align2=0, n=1088: 41.05 ( 29.58%) 58.29 len=1024, align1=0, align2=0, n=1088: 41.14 ( 28.69%) 57.69 len=1088, align1=1, align2=0, n=1024: 41.31 ( 27.50%) 56.98 len=1088, align1=0, align2=0, n=1024: 40.32 ( 29.25%) 56.99 len=1088, align1=0, align2=0, n=1024: 40.74 ( 27.82%) 56.44 len=1088, align1=0, align2=0, n=1024: 40.70 ( 26.62%) 55.47 ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 22:21 ` Sunil Pandey @ 2023-06-30 23:22 ` Noah Goldstein 2023-06-30 23:27 ` Noah Goldstein 2023-07-03 16:30 ` Paul Eggert 1 sibling, 1 reply; 24+ messages in thread From: Noah Goldstein @ 2023-06-30 23:22 UTC (permalink / raw) To: Sunil Pandey; +Cc: Paul Eggert, libc-alpha, hjl.tools On Fri, Jun 30, 2023 at 5:21 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Fri, Jun 30, 2023 at 2:27 PM Paul Eggert <eggert@cs.ucla.edu> wrote: >> >> On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote: >> > Think we should at the very least wait for the generic strlcpy codes >> > to land first. >> >> Let's not optimize these functions at all, unless there's good and >> measured reason to do so. In practice I expected they're called with >> small sizes for which optimization is a net minus as it consumes >> valuable maintenance time with no real benefit. > > > Hi Paul, > > Attached is strcpy/wcslcpy microbenchmark data based on Noah strlcpy/wcslcpy microbenchmark patch. > I don't think the concern is that we can beat the generic impl (which hasn't even landed yet AFAICT), it whether doing so makes sense given the usage/goal of the functions. > https://sourceware.org/pipermail/libc-alpha/2023-April/147557.html > > Thanks, > Sunil ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 23:22 ` Noah Goldstein @ 2023-06-30 23:27 ` Noah Goldstein 0 siblings, 0 replies; 24+ messages in thread From: Noah Goldstein @ 2023-06-30 23:27 UTC (permalink / raw) To: Sunil Pandey; +Cc: Paul Eggert, libc-alpha, hjl.tools On Fri, Jun 30, 2023 at 6:22 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Fri, Jun 30, 2023 at 5:21 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > > > On Fri, Jun 30, 2023 at 2:27 PM Paul Eggert <eggert@cs.ucla.edu> wrote: > >> > >> On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote: > >> > Think we should at the very least wait for the generic strlcpy codes > >> > to land first. > >> > >> Let's not optimize these functions at all, unless there's good and > >> measured reason to do so. In practice I expected they're called with > >> small sizes for which optimization is a net minus as it consumes > >> valuable maintenance time with no real benefit. > > > > > > Hi Paul, > > > > Attached is strcpy/wcslcpy microbenchmark data based on Noah strlcpy/wcslcpy microbenchmark patch. > > > I don't think the concern is that we can beat the generic impl (which hasn't > even landed yet AFAICT), it whether doing so makes sense given the > usage/goal of the functions. > That being said, I'm generally in favor of adding optimized versions since we happen to be a position where at least several developers find it worth their time to maintain, but not before the generic versions have landed. > > https://sourceware.org/pipermail/libc-alpha/2023-April/147557.html > > > > Thanks, > > Sunil ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 22:21 ` Sunil Pandey 2023-06-30 23:22 ` Noah Goldstein @ 2023-07-03 16:30 ` Paul Eggert 2023-07-03 18:40 ` Noah Goldstein 1 sibling, 1 reply; 24+ messages in thread From: Paul Eggert @ 2023-07-03 16:30 UTC (permalink / raw) To: Sunil Pandey; +Cc: Noah Goldstein, libc-alpha, hjl.tools On 2023-06-30 15:21, Sunil Pandey wrote: > Attached is strcpy/wcslcpy microbenchmark data based on Noah > strlcpy/wcslcpy microbenchmark patch. Although it's helpful to know that the proposed patch improves microbenchmark scores, that's not enough to justify it. Let's see benchmarks of real programs. If they don't show significant wins, let's not bother. Programs that use strlcpy, by and large, don't use it in performance-sensitive areas, and their developers and users are far more worried about security than about performance. Making the implementation harder to audit will likely be a net negative for these applications. This doesn't sound like a win at all. Plus, who uses wcslcpy? Why bother to tune it if nobody uses it? ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-03 16:30 ` Paul Eggert @ 2023-07-03 18:40 ` Noah Goldstein 2023-07-03 18:54 ` Adhemerval Zanella Netto 2023-07-03 21:14 ` Paul Eggert 0 siblings, 2 replies; 24+ messages in thread From: Noah Goldstein @ 2023-07-03 18:40 UTC (permalink / raw) To: Paul Eggert; +Cc: Sunil Pandey, libc-alpha, hjl.tools On Mon, Jul 3, 2023 at 11:30 AM Paul Eggert <eggert@cs.ucla.edu> wrote: > > On 2023-06-30 15:21, Sunil Pandey wrote: > > Attached is strcpy/wcslcpy microbenchmark data based on Noah > > strlcpy/wcslcpy microbenchmark patch. > > Although it's helpful to know that the proposed patch improves > microbenchmark scores, that's not enough to justify it. Let's see > benchmarks of real programs. If they don't show significant wins, let's > not bother. > > Programs that use strlcpy, by and large, don't use it in > performance-sensitive areas, and their developers and users are far more > worried about security than about performance. Making the implementation > harder to audit will likely be a net negative for these applications. > This doesn't sound a like a win at all. > > Plus, who uses wcslcpy? Why bother to tune it if nobody uses it? Think we should look into dropping optimized strcpy/wcscpy family in general? For the most part don't see them in perf sensitive areas anyways (generally people that care about perf maintain the length and use mem* functions). ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-03 18:40 ` Noah Goldstein @ 2023-07-03 18:54 ` Adhemerval Zanella Netto 0 siblings, 0 replies; 24+ messages in thread From: Adhemerval Zanella Netto @ 2023-07-03 18:54 UTC (permalink / raw) To: libc-alpha On 03/07/23 15:40, Noah Goldstein via Libc-alpha wrote: > On Mon, Jul 3, 2023 at 11:30 AM Paul Eggert <eggert@cs.ucla.edu> wrote: >> >> On 2023-06-30 15:21, Sunil Pandey wrote: >>> Attached is strcpy/wcslcpy microbenchmark data based on Noah >>> strlcpy/wcslcpy microbenchmark patch. >> >> Although it's helpful to know that the proposed patch improves >> microbenchmark scores, that's not enough to justify it. Let's see >> benchmarks of real programs. If they don't show significant wins, let's >> not bother. >> >> Programs that use strlcpy, by and large, don't use it in >> performance-sensitive areas, and their developers and users are far more >> worried about security than about performance. Making the implementation >> harder to audit will likely be a net negative for these applications. >> This doesn't sound a like a win at all. >> >> Plus, who uses wcslcpy? Why bother to tune it if nobody uses it? > > Think we should look into dropping optimized strcpy/wcscpy family > in general? For the most part don't see them in perf sensitive areas > anyways (generally people that care about perf maintain the length > and use mem* functions). I will go for it, these interfaces are provided mainly to comply with standards and for x86 it adds only more maintenance. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-03 18:40 ` Noah Goldstein 2023-07-03 18:54 ` Adhemerval Zanella Netto @ 2023-07-03 21:14 ` Paul Eggert 2023-07-03 22:04 ` Gabriel Ravier 1 sibling, 1 reply; 24+ messages in thread From: Paul Eggert @ 2023-07-03 21:14 UTC (permalink / raw) To: Noah Goldstein; +Cc: Sunil Pandey, libc-alpha, hjl.tools On 2023-07-03 11:40, Noah Goldstein wrote: > Think we should look into dropping optimized strcpy/wcscpy family > in general? For wcscpy yes. Who uses wcscpy? Optimizing it is a worthless time sink. strcpy optimization might be worth keeping, as it's used so much more. Measurements of real programs would help decide. In the meantime inertia suggests that when in doubt, leave it alone. For strlcpy it's an easy call: don't optimize unless realistic benchmarks show it's a win. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-03 21:14 ` Paul Eggert @ 2023-07-03 22:04 ` Gabriel Ravier 2023-07-03 23:12 ` Paul Eggert 0 siblings, 1 reply; 24+ messages in thread From: Gabriel Ravier @ 2023-07-03 22:04 UTC (permalink / raw) To: Paul Eggert, Noah Goldstein; +Cc: Sunil Pandey, libc-alpha, hjl.tools On 7/3/23 23:14, Paul Eggert wrote: > On 2023-07-03 11:40, Noah Goldstein wrote: >> Think we should look into dropping optimized strcpy/wcscpy family >> in general? > > For wcscpy yes. Who uses wcscpy? Optimizing it is a worthless time sink. > > strcpy optimization might be worth keeping, as it's used so much more. > Measurements of real programs would help decide. In the meantime > inertia suggests that when in doubt, leave it alone. > > For strlcpy it's an easy call: don't optimize unless realistic > benchmarks show it's a win. I guess it depends on just how much people use BSD software on Linux, because if you're looking at the BSDs the amount of usage of strlcpy is just absurdly massive - OpenBSD's tree has 4997 occurrences of it, when memcpy is present 13470 times. That still means memcpy is used 3 times as often, but the idea that strlcpy is so popular as to be used to a remotely comparable degree is itself kind of astonishing. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-03 22:04 ` Gabriel Ravier @ 2023-07-03 23:12 ` Paul Eggert 2023-07-04 7:45 ` Andreas Schwab 0 siblings, 1 reply; 24+ messages in thread From: Paul Eggert @ 2023-07-03 23:12 UTC (permalink / raw) To: Gabriel Ravier, Noah Goldstein; +Cc: Sunil Pandey, libc-alpha, hjl.tools On 2023-07-03 15:04, Gabriel Ravier wrote: > OpenBSD's tree has 4997 occurrences of it Many years ago the OpenBSD team went through its source code and replaced uses of strcpy with strlcpy, without much thought involved and even introducing problems in the process. I expect that not much of this code is used elsewhere and it's not that relevant to glibc. Of the little OpenBSDish code that is relevant (notably OpenSSH) I expect the performance difference to be so small as to not be worth optimizing glibc. Real-worldish benchmarks could help check this. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-03 23:12 ` Paul Eggert @ 2023-07-04 7:45 ` Andreas Schwab 0 siblings, 0 replies; 24+ messages in thread From: Andreas Schwab @ 2023-07-04 7:45 UTC (permalink / raw) To: Paul Eggert Cc: Gabriel Ravier, Noah Goldstein, Sunil Pandey, libc-alpha, hjl.tools On Jul 03 2023, Paul Eggert wrote: > On 2023-07-03 15:04, Gabriel Ravier wrote: >> OpenBSD's tree has 4997 occurrences of it > > Many years ago the OpenBSD team went through its source code and replaced > uses of strcpy with strlcpy, without much thought involved and even > introducing problems in the process. In the Linux kernel sources all uses of strlcpy are being erased, because the developers have realized how crappy that interface is. -- Andreas Schwab, SUSE Labs, schwab@suse.de GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE 1748 E4D4 88E3 0EEA B9D7 "And now for something completely different." ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 21:27 ` Paul Eggert 2023-06-30 22:21 ` Sunil Pandey @ 2023-07-03 12:55 ` Adhemerval Zanella Netto 1 sibling, 0 replies; 24+ messages in thread From: Adhemerval Zanella Netto @ 2023-07-03 12:55 UTC (permalink / raw) To: Paul Eggert, Noah Goldstein, Sunil K Pandey; +Cc: libc-alpha, hjl.tools On 30/06/23 18:27, Paul Eggert wrote: > On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote: >> Think we should at the very least wait for the generic strlcpy codes >> to land first. > > Let's not optimize these functions at all, unless there's good and measured reason to do so. In practice I expected they're called with small sizes for which optimization is a net minus as it consumes valuable maintenance time with no real benefit. I tend to agree, although these are now added in next POSIX my understanding is they are still not encouraged to be used due to multiple shortcomings in previous discussion. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 21:04 ` Noah Goldstein 2023-06-30 21:27 ` Paul Eggert @ 2023-07-01 9:41 ` Florian Weimer 2023-07-02 1:22 ` Noah Goldstein 1 sibling, 1 reply; 24+ messages in thread From: Florian Weimer @ 2023-07-01 9:41 UTC (permalink / raw) To: Noah Goldstein via Libc-alpha; +Cc: Sunil K Pandey, Noah Goldstein, hjl.tools * Noah Goldstein via Libc-alpha: > Think we should at the very least wait for the generic strlcpy codes > to land first. Do you mean a version of string/strlcpy.c that is based on a modified string/stplcpy.c, rather than the one we have now that calls just strlen and memcpy? Thanks, Florian ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-01 9:41 ` Florian Weimer @ 2023-07-02 1:22 ` Noah Goldstein 2023-07-02 6:51 ` Florian Weimer 0 siblings, 1 reply; 24+ messages in thread From: Noah Goldstein @ 2023-07-02 1:22 UTC (permalink / raw) To: Florian Weimer; +Cc: Noah Goldstein via Libc-alpha, Sunil K Pandey, hjl.tools On Sat, Jul 1, 2023 at 4:41 AM Florian Weimer <fweimer@redhat.com> wrote: > > * Noah Goldstein via Libc-alpha: > > > Think we should at the very least wait for the generic strlcpy codes > > to land first. > > Do you mean a version of string/strlcpy.c that is based on a modified > string/stplcpy.c, rather than the one we have now that calls just strlen > and memcpy? Hmm? I mean your strlcpy/strlcat patch to land. > > Thanks, > Florian > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-02 1:22 ` Noah Goldstein @ 2023-07-02 6:51 ` Florian Weimer 2023-07-02 16:55 ` Noah Goldstein 0 siblings, 1 reply; 24+ messages in thread From: Florian Weimer @ 2023-07-02 6:51 UTC (permalink / raw) To: Noah Goldstein; +Cc: Noah Goldstein via Libc-alpha, Sunil K Pandey, hjl.tools * Noah Goldstein: > On Sat, Jul 1, 2023 at 4:41 AM Florian Weimer <fweimer@redhat.com> wrote: >> >> * Noah Goldstein via Libc-alpha: >> >> > Think we should at the very least wait for the generic strlcpy codes >> > to land first. >> >> Do you mean a version of string/strlcpy.c that is based on a modified >> string/stplcpy.c, rather than the one we have now that calls just strlen >> and memcpy? > > Hmm? I mean your strlcpy/strlcat patch to land. That has already happened? Thanks, Florian ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-02 6:51 ` Florian Weimer @ 2023-07-02 16:55 ` Noah Goldstein 2023-07-02 17:02 ` Florian Weimer 0 siblings, 1 reply; 24+ messages in thread From: Noah Goldstein @ 2023-07-02 16:55 UTC (permalink / raw) To: Florian Weimer; +Cc: Noah Goldstein via Libc-alpha, Sunil K Pandey, hjl.tools On Sun, Jul 2, 2023 at 1:51 AM Florian Weimer <fweimer@redhat.com> wrote: > > * Noah Goldstein: > > > On Sat, Jul 1, 2023 at 4:41 AM Florian Weimer <fweimer@redhat.com> wrote: > >> > >> * Noah Goldstein via Libc-alpha: > >> > >> > Think we should at the very least wait for the generic strlcpy codes > >> > to land first. > >> > >> Do you mean a version of string/strlcpy.c that is based on a modified > >> string/stplcpy.c, rather than the one we have now that calls just strlen > >> and memcpy? > > > > Hmm? I mean your strlcpy/strlcat patch to land. > > That has already happened? :/ yup had been a minute since I pulled. Are we getting stplcpy? > > Thanks, > Florian > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-02 16:55 ` Noah Goldstein @ 2023-07-02 17:02 ` Florian Weimer 0 siblings, 0 replies; 24+ messages in thread From: Florian Weimer @ 2023-07-02 17:02 UTC (permalink / raw) To: Noah Goldstein; +Cc: Noah Goldstein via Libc-alpha, Sunil K Pandey, hjl.tools * Noah Goldstein: >> >> Do you mean a version of string/strlcpy.c that is based on a modified >> >> string/stplcpy.c, rather than the one we have now that calls just strlen >> >> and memcpy? >> > >> > Hmm? I mean your strlcpy/strlcat patch to land. >> >> That has already happened? > :/ yup had been a minute since I pulled. > > Are we getting stplcpy? No. I mentioned string/stplcpy.c because it's what the generic strcpy is based upon. Sorry for the confusion. Thanks, Florian ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 20:48 [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function Sunil K Pandey 2023-06-30 21:04 ` Noah Goldstein @ 2023-07-02 17:03 ` Noah Goldstein 2023-07-02 18:37 ` Sunil Pandey 1 sibling, 1 reply; 24+ messages in thread From: Noah Goldstein @ 2023-07-02 17:03 UTC (permalink / raw) To: Sunil K Pandey; +Cc: libc-alpha, hjl.tools On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > This patch optimizes strlcpy/wsclcpy string functions for AVX2. > --- > sysdeps/x86_64/multiarch/Makefile | 4 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ > 9 files changed, 627 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index e1e894c963..7e3fc081df 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -82,6 +82,8 @@ sysdep_routines += \ > strcpy-sse2 \ > strcpy-sse2-unaligned \ > strcspn-sse4 \ > + strlcpy-avx2 \ > + strlcpy-generic \ > strlen-avx2 \ > strlen-avx2-rtm \ > strlen-evex \ > @@ -153,6 +155,8 @@ sysdep_routines += \ > wcscpy-evex \ > wcscpy-generic \ > 
wcscpy-ssse3 \ > + wcslcpy-avx2 \ > + wcslcpy-generic \ > wcslen-avx2 \ > wcslen-avx2-rtm \ > wcslen-evex \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 5427ff1907..9928dee187 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > 1, > __strncat_sse2_unaligned)) > > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ > + IFUNC_IMPL (i, name, strlcpy, > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, > + CPU_FEATURE_USABLE (AVX2), > + __strlcpy_avx2) > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, > + 1, > + __strlcpy_generic)) > + > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ > IFUNC_IMPL (i, name, strncpy, > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > 1, > __wcscpy_generic)) > > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ > + IFUNC_IMPL (i, name, wcslcpy, > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, > + CPU_FEATURE_USABLE (AVX2), > + __wcslcpy_avx2) > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, > + 1, > + __wcslcpy_generic)) > + > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. */ > IFUNC_IMPL (i, name, wcsncpy, > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > new file mode 100644 > index 0000000000..982a30d15b > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > @@ -0,0 +1,34 @@ > +/* Common definition for ifunc selections. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <init-arch.h> > + > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; > + > +static inline void * > +IFUNC_SELECTOR (void) > +{ > + const struct cpu_features *cpu_features = __get_cpu_features (); > + > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) > + return OPTIMIZE (avx2); > + > + return OPTIMIZE (generic); > +} > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > new file mode 100644 > index 0000000000..cf54b1e990 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > @@ -0,0 +1,446 @@ > +/* Strlcpy/wcslcpy optimized with AVX2. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (3) > + > +# include <sysdep.h> > + > +# ifndef VEC_SIZE > +# include "x86-avx-vecs.h" > +# endif > + > +# ifndef STRLCPY > +# define STRLCPY __strlcpy_avx2 > +# endif > + > + > +# ifdef USE_AS_WCSLCPY > +# define CHAR_SIZE 4 > +# define MOVU movl > +# define VPCMPEQ vpcmpeqd > +# define VPMINU vpminud > +# else > +# define CHAR_SIZE 1 > +# define MOVU movb > +# define VPCMPEQ vpcmpeqb > +# define VPMINU vpminub > +# endif > + > +# define PMOVMSK vpmovmskb > +# define PAGE_SIZE 4096 > +# define VEC_SIZE 32 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > + .section SECTION(.text),"ax",@progbits > +/* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > + > +ENTRY_P2ALIGN (STRLCPY, 6) > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %edx, %edx > +# endif > + > + /* Zero out vector register for end of string comparison. */ > + vpxor %VMM(0), %VMM(0), %VMM(0) > + /* Save source pointer for return calculation. */ > + mov %rsi, %r8 > + mov %esi, %eax > + sall $20, %eax > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax > + ja L(page_cross) > + > +L(page_cross_continue): > + /* Load first vector. */ > + VMOVU (%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + PMOVMSK %VMM(2), %eax > + test %eax, %eax > + jnz L(ret_vec_x1) > + > + test %rdx, %rdx > + jz L(continue_second_vector) > + > + /* Check whether we can copy full vector. */ > + cmp $CHAR_PER_VEC, %rdx > + jbe L(page_cross_small_vec_copy) > + /* Copy first vector. */ > + VMOVU %VMM(1), (%rdi) > + sub $CHAR_PER_VEC, %rdx > + > +L(continue_second_vector): > + /* Align RSI pointer and adjust RDI based on offset. 
*/ > + mov %rsi, %rax > + and $-VEC_SIZE, %rsi > + sub %rsi, %rax > + sub %rax, %rdi > + > + /* Check if string already copied N char, and RDX is 0. */ > + test %rdx, %rdx > + jz L(skip_copy_alignment_fix) > + > + /* Adjust RDX for copy alignment fix. */ > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > +# endif > + add %rax, %rdx > + > +L(skip_copy_alignment_fix): > + /* Load second vector. */ > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x2) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(continue_third_vector) > + > + /* Jump below/equal(instead of below) used here, because last > + copy character must be NULL. */ > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_second_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy second vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > + > +L(continue_third_vector): > + /* Load third vector. */ > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x3) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(continue_fourth_vector) > + > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_third_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy third vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) > + > +L(continue_fourth_vector): > + /* Load fourth vector. */ > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x4) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_4x_align) > + > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_fourth_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy fourth vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) > + > + > +L(loop_4x_align): > + /* Jump to loop if RSI is already 4 vector align. */ > + test $(VEC_SIZE * 4 - 1), %esi > + jz L(loop_4x_read) > + > + mov %rsi, %rcx > + > + /* Align RSI to 4x vector. 
*/ > + and $(VEC_SIZE * -4), %rsi > + sub %rsi, %rcx > + > + /* Adjust RDI for RSI alignment fix. */ > + sub %rcx, %rdi > + > + /* Jump to loop if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_4x_read) > + > +# ifdef USE_AS_WCSLCPY > + shr $2, %rcx > +# endif > + > + /* Adjust RDX for RSI alignment fix. */ > + add %rcx, %rdx > + jmp L(loop_4x_read) > + > + .p2align 4,,6 > +L(loop_4x_vec): > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_partial_copy_return) > + cmp $(CHAR_PER_VEC * 4), %rdx > + jbe L(loop_partial_copy) > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) > + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) > + sub $(CHAR_PER_VEC * 4), %rdx > + > +L(loop_partial_copy_return): > + sub $(VEC_SIZE * -4), %rsi > + sub $(VEC_SIZE * -4), %rdi > + > +L(loop_4x_read): > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) > + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) > + VPMINU %VMM(1), %VMM(2), %VMM(5) > + VPMINU %VMM(3), %VMM(4), %VMM(6) > + VPMINU %VMM(5), %VMM(6), %VMM(7) > + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) > + vptest %VMM(7), %VMM(7) > + > + jz L(loop_4x_vec) > + > + /* Check if string ends in first vector or second vector. */ > + lea (VEC_SIZE * 4)(%rsi), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > +# endif > + xor %r10, %r10 > + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) > + vptest %VMM(6), %VMM(6) > + jnz L(endloop) > + sub $(CHAR_PER_VEC * -2), %rax > + mov $(CHAR_PER_VEC * 2), %r10 > + VMOVA %VMM(3), %VMM(1) > + VMOVA %VMM(4), %VMM(2) > + > +L(endloop): > + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) > + PMOVMSK %VMM(1), %rcx > + PMOVMSK %VMM(2), %r9 > + shlq $32, %r9 > + orq %r9, %rcx > + bsf %rcx, %rcx > + /* Shift RCX by 2, VPMOVMSK has only byte version. */ > +# ifdef USE_AS_WCSLCPY > + shr $2, %rcx > +# endif > + /* At this point RAX has length to return. 
*/ > + add %rcx, %rax > + test %rdx, %rdx > + jz L(ret) > + > + /* Add 1 to account for NULL character in RDX comparison. */ > + lea 1(%r10, %rcx), %rcx > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(loop_partial_copy): > + cmp $(CHAR_PER_VEC * 2), %rdx > + jbe L(loop_partial_first_half) > + /* Reload first 2 vector. */ > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > + > +L(loop_partial_first_half): > + /* Go back 2 vector from last and use overlapping copy. > + (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) > + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) > + */ > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > + xor %rdx, %rdx > + vptest %VMM(7), %VMM(7) > + jz L(loop_partial_copy_return) > + ret > + > + .p2align 4 > +L(page_cross): > + mov %rsi, %rcx > + mov %rsi, %r11 > + and $-VEC_SIZE, %r11 > + and $(VEC_SIZE - 1), %rcx > + VMOVA (%r11), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + PMOVMSK %VMM(2), %eax > + shr %cl, %eax > + jz L(page_cross_continue) > + > +L(ret_vec_x1): > + bsf %eax, %eax > +# ifdef USE_AS_WCSLCPY > + shr $2, %eax > +# endif > + /* Increment by 1 to account for NULL char. 
*/ > + lea 1(%eax), %ecx > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + test %rdx, %rdx > + jz L(ret) > + > +L(page_cross_small_vec_copy): > + cmp $(16 / CHAR_SIZE), %rdx > + jbe L(copy_8_byte_scalar) > + VMOVU (%rsi), %VMM_128(1) > + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) > + VMOVU %VMM_128(1), (%rdi) > + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %rdx, %rdx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_8_byte_scalar): > + cmp $(8 / CHAR_SIZE), %rdx > + jbe L(copy_4_byte_scalar) > + movq (%rsi), %r10 > + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 > + movq %r10, (%rdi) > + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_4_byte_scalar): > +# ifndef USE_AS_WCSLCPY > + cmp $4, %rdx > + jbe L(copy_2_byte_scalar) > +# endif > + movl (%rsi), %r10d > + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d > + movl %r10d, (%rdi) > + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +# ifndef USE_AS_WCSLCPY > +L(copy_2_byte_scalar): > + cmp $2, %rdx > + jbe L(copy_1_byte_scalar) > + movw (%rsi), %r10w > + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w > + movw %r10w, (%rdi) > + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_1_byte_scalar): > + MOVU (%rsi), %r10b > + MOVU %r10b, (%rdi) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > +# endif > + > +L(ret_vec_x2): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. 
*/ > + lea VEC_SIZE(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_second_vector): > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_third_vector) > + > +L(ret): > + ret > + > +L(ret_vec_x3): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. */ > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_third_vector): > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_fourth_vector) > + ret > + > +L(ret_vec_x4): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. */ > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_fourth_vector): > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_fourth_vector) > + ret > + > +END (STRLCPY) Is strlcpy/strlcat integratable with existing strncat impl? Had figured they would fit in the same file. 
> +#endif > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c > new file mode 100644 > index 0000000000..eee3b7b086 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c > @@ -0,0 +1,25 @@ > +/* strlcpy generic. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > + > +#include <isa-level.h> > +#if ISA_SHOULD_BUILD (1) > +# define __strlcpy __strlcpy_generic > +# include <string/strlcpy.c> > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c > new file mode 100644 > index 0000000000..ded41fbcfb > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy.c > @@ -0,0 +1,36 @@ > +/* Multiple versions of strlcpy. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. 
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. */ > +#if IS_IN (libc) > +# define __strlcpy __redirect_strlcpy > +# include <string.h> > +# undef __strlcpy > + > +# define SYMBOL_NAME strlcpy > +# include "ifunc-strlcpy.h" > + > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ()); > +weak_alias (__strlcpy, strlcpy) > + > +# ifdef SHARED > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); > +# endif > +#endif > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > new file mode 100644 > index 0000000000..dafc20ded0 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > @@ -0,0 +1,4 @@ > +#define STRLCPY __wcslcpy_avx2 > +#define USE_AS_WCSLCPY 1 > + > +#include "strlcpy-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > new file mode 100644 > index 0000000000..ffd3c0e846 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > @@ -0,0 +1,25 @@ > +/* wcslcpy generic. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. 
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > + > +#include <isa-level.h> > +#if ISA_SHOULD_BUILD (1) > +# define __wcslcpy __wcslcpy_generic > +# include <wcsmbs/wcslcpy.c> > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c > new file mode 100644 > index 0000000000..371ef9626c > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c > @@ -0,0 +1,35 @@ > +/* Multiple versions of wcslcpy. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. 
*/ > +#if IS_IN (libc) > +# define __wcslcpy __redirect_wcslcpy > +# include <wchar.h> > +# undef __wcslcpy > + > +# define SYMBOL_NAME wcslcpy > +# include "ifunc-strlcpy.h" > + > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ()); > +weak_alias (__wcslcpy, wcslcpy) > +# ifdef SHARED > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) > + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); > +# endif > +#endif > -- > 2.38.1 > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-02 17:03 ` Noah Goldstein @ 2023-07-02 18:37 ` Sunil Pandey 2023-07-02 18:54 ` Noah Goldstein 0 siblings, 1 reply; 24+ messages in thread From: Sunil Pandey @ 2023-07-02 18:37 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha, hjl.tools [-- Attachment #1: Type: text/plain, Size: 28697 bytes --] On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > This patch optimizes strlcpy/wsclcpy string functions for AVX2. > > --- > > sysdeps/x86_64/multiarch/Makefile | 4 + > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + > > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ > > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ > > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ > > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ > > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + > > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ > > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ > > 9 files changed, 627 insertions(+) > > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h > > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S > > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c > > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c > > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S > > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c > > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile > b/sysdeps/x86_64/multiarch/Makefile > > index e1e894c963..7e3fc081df 100644 > > --- a/sysdeps/x86_64/multiarch/Makefile > > +++ b/sysdeps/x86_64/multiarch/Makefile > > @@ -82,6 +82,8 @@ sysdep_routines += \ > > strcpy-sse2 \ > > strcpy-sse2-unaligned \ > > strcspn-sse4 \ > > + strlcpy-avx2 \ > > + strlcpy-generic \ > > strlen-avx2 \ > > 
strlen-avx2-rtm \ > > strlen-evex \ > > @@ -153,6 +155,8 @@ sysdep_routines += \ > > wcscpy-evex \ > > wcscpy-generic \ > > wcscpy-ssse3 \ > > + wcslcpy-avx2 \ > > + wcslcpy-generic \ > > wcslen-avx2 \ > > wcslen-avx2-rtm \ > > wcslen-evex \ > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index 5427ff1907..9928dee187 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct > libc_ifunc_impl *array, > > 1, > > __strncat_sse2_unaligned)) > > > > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ > > + IFUNC_IMPL (i, name, strlcpy, > > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, > > + CPU_FEATURE_USABLE (AVX2), > > + __strlcpy_avx2) > > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, > > + 1, > > + __strlcpy_generic)) > > + > > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ > > IFUNC_IMPL (i, name, strncpy, > > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, > > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct > libc_ifunc_impl *array, > > 1, > > __wcscpy_generic)) > > > > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ > > + IFUNC_IMPL (i, name, wcslcpy, > > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, > > + CPU_FEATURE_USABLE (AVX2), > > + __wcslcpy_avx2) > > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, > > + 1, > > + __wcslcpy_generic)) > > + > > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. */ > > IFUNC_IMPL (i, name, wcsncpy, > > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > > new file mode 100644 > > index 0000000000..982a30d15b > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > > @@ -0,0 +1,34 @@ > > +/* Common definition for ifunc selections. > > + All versions must be listed in ifunc-impl-list.c. 
> > + Copyright (C) 2023 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include <init-arch.h> > > + > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; > > + > > +static inline void * > > +IFUNC_SELECTOR (void) > > +{ > > + const struct cpu_features *cpu_features = __get_cpu_features (); > > + > > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) > > + return OPTIMIZE (avx2); > > + > > + return OPTIMIZE (generic); > > +} > > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S > b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > > new file mode 100644 > > index 0000000000..cf54b1e990 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > > @@ -0,0 +1,446 @@ > > +/* Strlcpy/wcslcpy optimized with AVX2. > > + Copyright (C) 2023 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. 
> > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include <isa-level.h> > > + > > +#if ISA_SHOULD_BUILD (3) > > + > > +# include <sysdep.h> > > + > > +# ifndef VEC_SIZE > > +# include "x86-avx-vecs.h" > > +# endif > > + > > +# ifndef STRLCPY > > +# define STRLCPY __strlcpy_avx2 > > +# endif > > + > > + > > +# ifdef USE_AS_WCSLCPY > > +# define CHAR_SIZE 4 > > +# define MOVU movl > > +# define VPCMPEQ vpcmpeqd > > +# define VPMINU vpminud > > +# else > > +# define CHAR_SIZE 1 > > +# define MOVU movb > > +# define VPCMPEQ vpcmpeqb > > +# define VPMINU vpminub > > +# endif > > + > > +# define PMOVMSK vpmovmskb > > +# define PAGE_SIZE 4096 > > +# define VEC_SIZE 32 > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > + > > + .section SECTION(.text),"ax",@progbits > > +/* Aligning entry point to 64 byte, provides better performance for > > + one vector length string. */ > > + > > +ENTRY_P2ALIGN (STRLCPY, 6) > > +# ifdef __ILP32__ > > + /* Clear the upper 32 bits. */ > > + movl %edx, %edx > > +# endif > > + > > + /* Zero out vector register for end of string comparison. */ > > + vpxor %VMM(0), %VMM(0), %VMM(0) > > + /* Save source pointer for return calculation. */ > > + mov %rsi, %r8 > > + mov %esi, %eax > > + sall $20, %eax > > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax > > + ja L(page_cross) > > + > > +L(page_cross_continue): > > + /* Load first vector. 
*/ > > + VMOVU (%rsi), %VMM(1) > > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > > + PMOVMSK %VMM(2), %eax > > + test %eax, %eax > > + jnz L(ret_vec_x1) > > + > > + test %rdx, %rdx > > + jz L(continue_second_vector) > > + > > + /* Check whether we can copy full vector. */ > > + cmp $CHAR_PER_VEC, %rdx > > + jbe L(page_cross_small_vec_copy) > > + /* Copy first vector. */ > > + VMOVU %VMM(1), (%rdi) > > + sub $CHAR_PER_VEC, %rdx > > + > > +L(continue_second_vector): > > + /* Align RSI pointer and adjust RDI based on offset. */ > > + mov %rsi, %rax > > + and $-VEC_SIZE, %rsi > > + sub %rsi, %rax > > + sub %rax, %rdi > > + > > + /* Check if string already copied N char, and RDX is 0. */ > > + test %rdx, %rdx > > + jz L(skip_copy_alignment_fix) > > + > > + /* Adjust RDX for copy alignment fix. */ > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %rax > > +# endif > > + add %rax, %rdx > > + > > +L(skip_copy_alignment_fix): > > + /* Load second vector. */ > > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > > + vptest %VMM(2), %VMM(2) > > + jnz L(ret_vec_x2) > > + > > + /* Skip copy if RDX is 0. */ > > + test %rdx, %rdx > > + jz L(continue_third_vector) > > + > > + /* Jump below/equal(instead of below) used here, because last > > + copy chracter must be NULL. */ > > + cmp $CHAR_PER_VEC, %rdx > > + jbe L(partial_copy_second_vector) > > + > > + sub $CHAR_PER_VEC, %rdx > > + /* Copy second vector. */ > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > + > > +L(continue_third_vector): > > + /* Load third vector. */ > > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) > > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > > + vptest %VMM(2), %VMM(2) > > + jnz L(ret_vec_x3) > > + > > + /* Skip copy if RDX is 0. */ > > + test %rdx, %rdx > > + jz L(continue_fourth_vector) > > + > > + cmp $CHAR_PER_VEC, %rdx > > + jbe L(partial_copy_third_vector) > > + > > + sub $CHAR_PER_VEC, %rdx > > + /* Copy third vector. 
*/ > > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) > > + > > +L(continue_fourth_vector): > > + /* Load fourth vector. */ > > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) > > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > > + vptest %VMM(2), %VMM(2) > > + jnz L(ret_vec_x4) > > + > > + /* Skip copy if RDX is 0. */ > > + test %rdx, %rdx > > + jz L(loop_4x_align) > > + > > + cmp $CHAR_PER_VEC, %rdx > > + jbe L(partial_copy_fourth_vector) > > + > > + sub $CHAR_PER_VEC, %rdx > > + /* Copy fourth vector. */ > > + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) > > + > > + > > +L(loop_4x_align): > > + /* Jump to loop if RSI is already 4 vector align. */ > > + test $(VEC_SIZE * 4 - 1), %esi > > + jz L(loop_4x_read) > > + > > + mov %rsi, %rcx > > + > > + /* Align RSI to 4x vector. */ > > + and $(VEC_SIZE * -4), %rsi > > + sub %rsi, %rcx > > + > > + /* Adjust RDI for RSI alignment fix. */ > > + sub %rcx, %rdi > > + > > + /* Jump to loop if RDX is 0. */ > > + test %rdx, %rdx > > + jz L(loop_4x_read) > > + > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %rcx > > +# endif > > + > > + /* Adjust RDX for RSI alignment fix. */ > > + add %rcx, %rdx > > + jmp L(loop_4x_read) > > + > > + .p2align 4,,6 > > +L(loop_4x_vec): > > + /* Skip copy if RDX is 0. 
*/ > > + test %rdx, %rdx > > + jz L(loop_partial_copy_return) > > + cmp $(CHAR_PER_VEC * 4), %rdx > > + jbe L(loop_partial_copy) > > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > > + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) > > + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) > > + sub $(CHAR_PER_VEC * 4), %rdx > > + > > +L(loop_partial_copy_return): > > + sub $(VEC_SIZE * -4), %rsi > > + sub $(VEC_SIZE * -4), %rdi > > + > > +L(loop_4x_read): > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > > + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) > > + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) > > + VPMINU %VMM(1), %VMM(2), %VMM(5) > > + VPMINU %VMM(3), %VMM(4), %VMM(6) > > + VPMINU %VMM(5), %VMM(6), %VMM(7) > > + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) > > + vptest %VMM(7), %VMM(7) > > + > > + jz L(loop_4x_vec) > > + > > + /* Check if string ends in first vector or second vector. */ > > + lea (VEC_SIZE * 4)(%rsi), %rax > > + sub %r8, %rax > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %rax > > +# endif > > + xor %r10, %r10 > > + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) > > + vptest %VMM(6), %VMM(6) > > + jnz L(endloop) > > + sub $(CHAR_PER_VEC * -2), %rax > > + mov $(CHAR_PER_VEC * 2), %r10 > > + VMOVA %VMM(3), %VMM(1) > > + VMOVA %VMM(4), %VMM(2) > > + > > +L(endloop): > > + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) > > + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) > > + PMOVMSK %VMM(1), %rcx > > + PMOVMSK %VMM(2), %r9 > > + shlq $32, %r9 > > + orq %r9, %rcx > > + bsf %rcx, %rcx > > + /* Shift RCX by 2, VPMOVMSK has only byte version. */ > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %rcx > > +# endif > > + /* At this point RAX has length to return. */ > > + add %rcx, %rax > > + test %rdx, %rdx > > + jz L(ret) > > + > > + /* Add 1 to account for NULL character in RDX comparison. 
*/ > > + lea 1(%r10, %rcx), %rcx > > + cmp %rdx, %rcx > > + cmovb %rcx, %rdx > > + > > +L(loop_partial_copy): > > + cmp $(CHAR_PER_VEC * 2), %rdx > > + jbe L(loop_partial_first_half) > > + /* Reload first 2 vector. */ > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > > + > > +L(loop_partial_first_half): > > + /* Go back 2 vector from last and use overlapping copy. > > + (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) > > + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) > > + */ > > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) > > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) > > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > + xor %rdx, %rdx > > + vptest %VMM(7), %VMM(7) > > + jz L(loop_partial_copy_return) > > + ret > > + > > + .p2align 4 > > +L(page_cross): > > + mov %rsi, %rcx > > + mov %rsi, %r11 > > + and $-VEC_SIZE, %r11 > > + and $(VEC_SIZE - 1), %rcx > > + VMOVA (%r11), %VMM(1) > > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > > + PMOVMSK %VMM(2), %eax > > + shr %cl, %eax > > + jz L(page_cross_continue) > > + > > +L(ret_vec_x1): > > + bsf %eax, %eax > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %eax > > +# endif > > + /* Increment by 1 to account for NULL char. 
*/ > > + lea 1(%eax), %ecx > > + cmp %rdx, %rcx > > + cmovb %rcx, %rdx > > + test %rdx, %rdx > > + jz L(ret) > > + > > +L(page_cross_small_vec_copy): > > + cmp $(16 / CHAR_SIZE), %rdx > > + jbe L(copy_8_byte_scalar) > > + VMOVU (%rsi), %VMM_128(1) > > + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) > > + VMOVU %VMM_128(1), (%rdi) > > + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > > + xor %rdx, %rdx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_second_vector) > > + ret > > + > > +L(copy_8_byte_scalar): > > + cmp $(8 / CHAR_SIZE), %rdx > > + jbe L(copy_4_byte_scalar) > > + movq (%rsi), %r10 > > + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 > > + movq %r10, (%rdi) > > + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > > + xor %edx, %edx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_second_vector) > > + ret > > + > > +L(copy_4_byte_scalar): > > +# ifndef USE_AS_WCSLCPY > > + cmp $4, %rdx > > + jbe L(copy_2_byte_scalar) > > +# endif > > + movl (%rsi), %r10d > > + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d > > + movl %r10d, (%rdi) > > + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > > + xor %edx, %edx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_second_vector) > > + ret > > + > > +# ifndef USE_AS_WCSLCPY > > +L(copy_2_byte_scalar): > > + cmp $2, %rdx > > + jbe L(copy_1_byte_scalar) > > + movw (%rsi), %r10w > > + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w > > + movw %r10w, (%rdi) > > + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > > + xor %edx, %edx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_second_vector) > > + ret > > + > > +L(copy_1_byte_scalar): > > + MOVU (%rsi), %r10b > > + MOVU %r10b, (%rdi) > > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > > + xor %edx, %edx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_second_vector) > > + 
ret > > +# endif > > + > > +L(ret_vec_x2): > > + PMOVMSK %VMM(2), %rax > > + bsf %rax, %rcx > > + /* Calculate return value. */ > > + lea VEC_SIZE(%rsi, %rcx), %rax > > + sub %r8, %rax > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %rax > > + shr $2, %rcx > > +# endif > > + inc %rcx > > + test %rdx, %rdx > > + jz L(ret) > > + cmp %rdx, %rcx > > + cmovb %rcx, %rdx > > + > > +L(partial_copy_second_vector): > > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) > > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > > + xor %edx, %edx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_third_vector) > > + > > +L(ret): > > + ret > > + > > +L(ret_vec_x3): > > + PMOVMSK %VMM(2), %rax > > + bsf %rax, %rcx > > + /* Calculate return value. */ > > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax > > + sub %r8, %rax > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %rax > > + shr $2, %rcx > > +# endif > > + inc %rcx > > + test %rdx, %rdx > > + jz L(ret) > > + cmp %rdx, %rcx > > + cmovb %rcx, %rdx > > + > > +L(partial_copy_third_vector): > > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, > CHAR_SIZE) > > + xor %edx, %edx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_fourth_vector) > > + ret > > + > > +L(ret_vec_x4): > > + PMOVMSK %VMM(2), %rax > > + bsf %rax, %rcx > > + /* Calculate return value. 
*/ > > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax > > + sub %r8, %rax > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %rax > > + shr $2, %rcx > > +# endif > > + inc %rcx > > + test %rdx, %rdx > > + jz L(ret) > > + cmp %rdx, %rcx > > + cmovb %rcx, %rdx > > + > > +L(partial_copy_fourth_vector): > > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, > CHAR_SIZE) > > + xor %edx, %edx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_fourth_vector) > > + ret > > + > > +END (STRLCPY) > > Is strlcpy/strlcat integratable with existing strncat impl? Had > figured they would > fit in the same file. > Hi Noah, It may not be a good idea to put strlcpy/strlcat in the existing strncpy/strncat impl file, as strlcpy/strlcat functions are associated with GLIBC_2.38 ABI. --Sunil > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c > b/sysdeps/x86_64/multiarch/strlcpy-generic.c > > new file mode 100644 > > index 0000000000..eee3b7b086 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c > > @@ -0,0 +1,25 @@ > > +/* strlcpy generic. > > + Copyright (C) 2023 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. 
> > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > + > > +#include <isa-level.h> > > +#if ISA_SHOULD_BUILD (1) > > +# define __strlcpy __strlcpy_generic > > +# include <string/strlcpy.c> > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c > b/sysdeps/x86_64/multiarch/strlcpy.c > > new file mode 100644 > > index 0000000000..ded41fbcfb > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strlcpy.c > > @@ -0,0 +1,36 @@ > > +/* Multiple versions of strlcpy. > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2023 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +/* Define multiple versions only for the definition in libc. 
*/ > > +#if IS_IN (libc) > > +# define __strlcpy __redirect_strlcpy > > +# include <string.h> > > +# undef __strlcpy > > + > > +# define SYMBOL_NAME strlcpy > > +# include "ifunc-strlcpy.h" > > + > > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR > ()); > > +weak_alias (__strlcpy, strlcpy) > > + > > +# ifdef SHARED > > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) > > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); > > +# endif > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > > new file mode 100644 > > index 0000000000..dafc20ded0 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > > @@ -0,0 +1,4 @@ > > +#define STRLCPY __wcslcpy_avx2 > > +#define USE_AS_WCSLCPY 1 > > + > > +#include "strlcpy-avx2.S" > > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c > b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > > new file mode 100644 > > index 0000000000..ffd3c0e846 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > > @@ -0,0 +1,25 @@ > > +/* wcslcpy generic. > > + Copyright (C) 2023 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. 
*/ > > + > > + > > +#include <isa-level.h> > > +#if ISA_SHOULD_BUILD (1) > > +# define __wcslcpy __wcslcpy_generic > > +# include <wcsmbs/wcslcpy.c> > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c > b/sysdeps/x86_64/multiarch/wcslcpy.c > > new file mode 100644 > > index 0000000000..371ef9626c > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c > > @@ -0,0 +1,35 @@ > > +/* Multiple versions of wcslcpy. > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2023 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +/* Define multiple versions only for the definition in libc. */ > > +#if IS_IN (libc) > > +# define __wcslcpy __redirect_wcslcpy > > +# include <wchar.h> > > +# undef __wcslcpy > > + > > +# define SYMBOL_NAME wcslcpy > > +# include "ifunc-strlcpy.h" > > + > > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR > ()); > > +weak_alias (__wcslcpy, wcslcpy) > > +# ifdef SHARED > > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) > > + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); > > +# endif > > +#endif > > -- > > 2.38.1 > > > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-02 18:37 ` Sunil Pandey @ 2023-07-02 18:54 ` Noah Goldstein 2023-07-03 1:03 ` Sunil Pandey 0 siblings, 1 reply; 24+ messages in thread From: Noah Goldstein @ 2023-07-02 18:54 UTC (permalink / raw) To: Sunil Pandey; +Cc: libc-alpha, hjl.tools On Sun, Jul 2, 2023 at 1:38 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha >> <libc-alpha@sourceware.org> wrote: >> > >> > This patch optimizes strlcpy/wsclcpy string functions for AVX2. >> > --- >> > sysdeps/x86_64/multiarch/Makefile | 4 + >> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + >> > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ >> > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ >> > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ >> > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ >> > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + >> > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ >> > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ >> > 9 files changed, 627 insertions(+) >> > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c >> > >> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile >> > index e1e894c963..7e3fc081df 100644 >> > --- a/sysdeps/x86_64/multiarch/Makefile >> > +++ b/sysdeps/x86_64/multiarch/Makefile >> > @@ -82,6 +82,8 @@ sysdep_routines += \ >> > strcpy-sse2 \ >> > strcpy-sse2-unaligned \ >> > strcspn-sse4 \ >> > + strlcpy-avx2 \ 
>> > + strlcpy-generic \ >> > strlen-avx2 \ >> > strlen-avx2-rtm \ >> > strlen-evex \ >> > @@ -153,6 +155,8 @@ sysdep_routines += \ >> > wcscpy-evex \ >> > wcscpy-generic \ >> > wcscpy-ssse3 \ >> > + wcslcpy-avx2 \ >> > + wcslcpy-generic \ >> > wcslen-avx2 \ >> > wcslen-avx2-rtm \ >> > wcslen-evex \ >> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> > index 5427ff1907..9928dee187 100644 >> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> > 1, >> > __strncat_sse2_unaligned)) >> > >> > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ >> > + IFUNC_IMPL (i, name, strlcpy, >> > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, >> > + CPU_FEATURE_USABLE (AVX2), >> > + __strlcpy_avx2) >> > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, >> > + 1, >> > + __strlcpy_generic)) >> > + >> > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ >> > IFUNC_IMPL (i, name, strncpy, >> > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, >> > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> > 1, >> > __wcscpy_generic)) >> > >> > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ >> > + IFUNC_IMPL (i, name, wcslcpy, >> > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, >> > + CPU_FEATURE_USABLE (AVX2), >> > + __wcslcpy_avx2) >> > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, >> > + 1, >> > + __wcslcpy_generic)) >> > + >> > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. */ >> > IFUNC_IMPL (i, name, wcsncpy, >> > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, >> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> > new file mode 100644 >> > index 0000000000..982a30d15b >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> > @@ -0,0 +1,34 @@ >> > +/* Common definition for ifunc selections. 
>> > + All versions must be listed in ifunc-impl-list.c. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. >> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > +#include <init-arch.h> >> > + >> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; >> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; >> > + >> > +static inline void * >> > +IFUNC_SELECTOR (void) >> > +{ >> > + const struct cpu_features *cpu_features = __get_cpu_features (); >> > + >> > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) >> > + return OPTIMIZE (avx2); >> > + >> > + return OPTIMIZE (generic); >> > +} >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S >> > new file mode 100644 >> > index 0000000000..cf54b1e990 >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S >> > @@ -0,0 +1,446 @@ >> > +/* Strlcpy/wcslcpy optimized with AVX2. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. 
>> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. >> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > +#include <isa-level.h> >> > + >> > +#if ISA_SHOULD_BUILD (3) >> > + >> > +# include <sysdep.h> >> > + >> > +# ifndef VEC_SIZE >> > +# include "x86-avx-vecs.h" >> > +# endif >> > + >> > +# ifndef STRLCPY >> > +# define STRLCPY __strlcpy_avx2 >> > +# endif >> > + >> > + >> > +# ifdef USE_AS_WCSLCPY >> > +# define CHAR_SIZE 4 >> > +# define MOVU movl >> > +# define VPCMPEQ vpcmpeqd >> > +# define VPMINU vpminud >> > +# else >> > +# define CHAR_SIZE 1 >> > +# define MOVU movb >> > +# define VPCMPEQ vpcmpeqb >> > +# define VPMINU vpminub >> > +# endif >> > + >> > +# define PMOVMSK vpmovmskb >> > +# define PAGE_SIZE 4096 >> > +# define VEC_SIZE 32 >> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) >> > + >> > + .section SECTION(.text),"ax",@progbits >> > +/* Aligning entry point to 64 byte, provides better performance for >> > + one vector length string. */ >> > + >> > +ENTRY_P2ALIGN (STRLCPY, 6) >> > +# ifdef __ILP32__ >> > + /* Clear the upper 32 bits. */ >> > + movl %edx, %edx >> > +# endif >> > + >> > + /* Zero out vector register for end of string comparison. */ >> > + vpxor %VMM(0), %VMM(0), %VMM(0) >> > + /* Save source pointer for return calculation. 
*/ >> > + mov %rsi, %r8 >> > + mov %esi, %eax >> > + sall $20, %eax >> > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax >> > + ja L(page_cross) >> > + >> > +L(page_cross_continue): >> > + /* Load first vector. */ >> > + VMOVU (%rsi), %VMM(1) >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> > + PMOVMSK %VMM(2), %eax >> > + test %eax, %eax >> > + jnz L(ret_vec_x1) >> > + >> > + test %rdx, %rdx >> > + jz L(continue_second_vector) >> > + >> > + /* Check whether we can copy full vector. */ >> > + cmp $CHAR_PER_VEC, %rdx >> > + jbe L(page_cross_small_vec_copy) >> > + /* Copy first vector. */ >> > + VMOVU %VMM(1), (%rdi) >> > + sub $CHAR_PER_VEC, %rdx >> > + >> > +L(continue_second_vector): >> > + /* Align RSI pointer and adjust RDI based on offset. */ >> > + mov %rsi, %rax >> > + and $-VEC_SIZE, %rsi >> > + sub %rsi, %rax >> > + sub %rax, %rdi >> > + >> > + /* Check if string already copied N char, and RDX is 0. */ >> > + test %rdx, %rdx >> > + jz L(skip_copy_alignment_fix) >> > + >> > + /* Adjust RDX for copy alignment fix. */ >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rax >> > +# endif >> > + add %rax, %rdx >> > + >> > +L(skip_copy_alignment_fix): >> > + /* Load second vector. */ >> > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> > + vptest %VMM(2), %VMM(2) >> > + jnz L(ret_vec_x2) >> > + >> > + /* Skip copy if RDX is 0. */ >> > + test %rdx, %rdx >> > + jz L(continue_third_vector) >> > + >> > + /* Jump below/equal(instead of below) used here, because last >> > + copy character must be NULL. */ >> > + cmp $CHAR_PER_VEC, %rdx >> > + jbe L(partial_copy_second_vector) >> > + >> > + sub $CHAR_PER_VEC, %rdx >> > + /* Copy second vector. */ >> > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) >> > + >> > +L(continue_third_vector): >> > + /* Load third vector. */ >> > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> > + vptest %VMM(2), %VMM(2) >> > + jnz L(ret_vec_x3) >> > + >> > + /* Skip copy if RDX is 0. 
*/ >> > + test %rdx, %rdx >> > + jz L(continue_fourth_vector) >> > + >> > + cmp $CHAR_PER_VEC, %rdx >> > + jbe L(partial_copy_third_vector) >> > + >> > + sub $CHAR_PER_VEC, %rdx >> > + /* Copy third vector. */ >> > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) >> > + >> > +L(continue_fourth_vector): >> > + /* Load fourth vector. */ >> > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> > + vptest %VMM(2), %VMM(2) >> > + jnz L(ret_vec_x4) >> > + >> > + /* Skip copy if RDX is 0. */ >> > + test %rdx, %rdx >> > + jz L(loop_4x_align) >> > + >> > + cmp $CHAR_PER_VEC, %rdx >> > + jbe L(partial_copy_fourth_vector) >> > + >> > + sub $CHAR_PER_VEC, %rdx >> > + /* Copy fourth vector. */ >> > + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) >> > + >> > + >> > +L(loop_4x_align): >> > + /* Jump to loop if RSI is already 4 vector align. */ >> > + test $(VEC_SIZE * 4 - 1), %esi >> > + jz L(loop_4x_read) >> > + >> > + mov %rsi, %rcx >> > + >> > + /* Align RSI to 4x vector. */ >> > + and $(VEC_SIZE * -4), %rsi >> > + sub %rsi, %rcx >> > + >> > + /* Adjust RDI for RSI alignment fix. */ >> > + sub %rcx, %rdi >> > + >> > + /* Jump to loop if RDX is 0. */ >> > + test %rdx, %rdx >> > + jz L(loop_4x_read) >> > + >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rcx >> > +# endif >> > + >> > + /* Adjust RDX for RSI alignment fix. */ >> > + add %rcx, %rdx >> > + jmp L(loop_4x_read) >> > + >> > + .p2align 4,,6 >> > +L(loop_4x_vec): >> > + /* Skip copy if RDX is 0. 
*/ >> > + test %rdx, %rdx >> > + jz L(loop_partial_copy_return) >> > + cmp $(CHAR_PER_VEC * 4), %rdx >> > + jbe L(loop_partial_copy) >> > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) >> > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) >> > + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) >> > + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) >> > + sub $(CHAR_PER_VEC * 4), %rdx >> > + >> > +L(loop_partial_copy_return): >> > + sub $(VEC_SIZE * -4), %rsi >> > + sub $(VEC_SIZE * -4), %rdi >> > + >> > +L(loop_4x_read): >> > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) >> > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) >> > + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) >> > + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) >> > + VPMINU %VMM(1), %VMM(2), %VMM(5) >> > + VPMINU %VMM(3), %VMM(4), %VMM(6) >> > + VPMINU %VMM(5), %VMM(6), %VMM(7) >> > + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) >> > + vptest %VMM(7), %VMM(7) >> > + >> > + jz L(loop_4x_vec) >> > + >> > + /* Check if string ends in first vector or second vector. */ >> > + lea (VEC_SIZE * 4)(%rsi), %rax >> > + sub %r8, %rax >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rax >> > +# endif >> > + xor %r10, %r10 >> > + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) >> > + vptest %VMM(6), %VMM(6) >> > + jnz L(endloop) >> > + sub $(CHAR_PER_VEC * -2), %rax >> > + mov $(CHAR_PER_VEC * 2), %r10 >> > + VMOVA %VMM(3), %VMM(1) >> > + VMOVA %VMM(4), %VMM(2) >> > + >> > +L(endloop): >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) >> > + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) >> > + PMOVMSK %VMM(1), %rcx >> > + PMOVMSK %VMM(2), %r9 >> > + shlq $32, %r9 >> > + orq %r9, %rcx >> > + bsf %rcx, %rcx >> > + /* Shift RCX by 2, VPMOVMSK has only byte version. */ >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rcx >> > +# endif >> > + /* At this point RAX has length to return. */ >> > + add %rcx, %rax >> > + test %rdx, %rdx >> > + jz L(ret) >> > + >> > + /* Add 1 to account for NULL character in RDX comparison. 
*/ >> > + lea 1(%r10, %rcx), %rcx >> > + cmp %rdx, %rcx >> > + cmovb %rcx, %rdx >> > + >> > +L(loop_partial_copy): >> > + cmp $(CHAR_PER_VEC * 2), %rdx >> > + jbe L(loop_partial_first_half) >> > + /* Reload first 2 vector. */ >> > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) >> > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) >> > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) >> > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) >> > + >> > +L(loop_partial_first_half): >> > + /* Go back 2 vector from last and use overlapping copy. >> > + (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) >> > + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) >> > + */ >> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) >> > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) >> > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) >> > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) >> > + xor %rdx, %rdx >> > + vptest %VMM(7), %VMM(7) >> > + jz L(loop_partial_copy_return) >> > + ret >> > + >> > + .p2align 4 >> > +L(page_cross): >> > + mov %rsi, %rcx >> > + mov %rsi, %r11 >> > + and $-VEC_SIZE, %r11 >> > + and $(VEC_SIZE - 1), %rcx >> > + VMOVA (%r11), %VMM(1) >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> > + PMOVMSK %VMM(2), %eax >> > + shr %cl, %eax >> > + jz L(page_cross_continue) >> > + >> > +L(ret_vec_x1): >> > + bsf %eax, %eax >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %eax >> > +# endif >> > + /* Increment by 1 to account for NULL char. 
*/ >> > + lea 1(%eax), %ecx >> > + cmp %rdx, %rcx >> > + cmovb %rcx, %rdx >> > + test %rdx, %rdx >> > + jz L(ret) >> > + >> > +L(page_cross_small_vec_copy): >> > + cmp $(16 / CHAR_SIZE), %rdx >> > + jbe L(copy_8_byte_scalar) >> > + VMOVU (%rsi), %VMM_128(1) >> > + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) >> > + VMOVU %VMM_128(1), (%rdi) >> > + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %rdx, %rdx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_second_vector) >> > + ret >> > + >> > +L(copy_8_byte_scalar): >> > + cmp $(8 / CHAR_SIZE), %rdx >> > + jbe L(copy_4_byte_scalar) >> > + movq (%rsi), %r10 >> > + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 >> > + movq %r10, (%rdi) >> > + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_second_vector) >> > + ret >> > + >> > +L(copy_4_byte_scalar): >> > +# ifndef USE_AS_WCSLCPY >> > + cmp $4, %rdx >> > + jbe L(copy_2_byte_scalar) >> > +# endif >> > + movl (%rsi), %r10d >> > + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d >> > + movl %r10d, (%rdi) >> > + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_second_vector) >> > + ret >> > + >> > +# ifndef USE_AS_WCSLCPY >> > +L(copy_2_byte_scalar): >> > + cmp $2, %rdx >> > + jbe L(copy_1_byte_scalar) >> > + movw (%rsi), %r10w >> > + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w >> > + movw %r10w, (%rdi) >> > + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_second_vector) >> > + ret >> > + >> > +L(copy_1_byte_scalar): >> > + MOVU (%rsi), %r10b >> > + MOVU %r10b, (%rdi) >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + 
vptest %VMM(2), %VMM(2) >> > + jz L(continue_second_vector) >> > + ret >> > +# endif >> > + >> > +L(ret_vec_x2): >> > + PMOVMSK %VMM(2), %rax >> > + bsf %rax, %rcx >> > + /* Calculate return value. */ >> > + lea VEC_SIZE(%rsi, %rcx), %rax >> > + sub %r8, %rax >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rax >> > + shr $2, %rcx >> > +# endif >> > + inc %rcx >> > + test %rdx, %rdx >> > + jz L(ret) >> > + cmp %rdx, %rcx >> > + cmovb %rcx, %rdx >> > + >> > +L(partial_copy_second_vector): >> > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) >> > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_third_vector) >> > + >> > +L(ret): >> > + ret >> > + >> > +L(ret_vec_x3): >> > + PMOVMSK %VMM(2), %rax >> > + bsf %rax, %rcx >> > + /* Calculate return value. */ >> > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax >> > + sub %r8, %rax >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rax >> > + shr $2, %rcx >> > +# endif >> > + inc %rcx >> > + test %rdx, %rdx >> > + jz L(ret) >> > + cmp %rdx, %rcx >> > + cmovb %rcx, %rdx >> > + >> > +L(partial_copy_third_vector): >> > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) >> > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_fourth_vector) >> > + ret >> > + >> > +L(ret_vec_x4): >> > + PMOVMSK %VMM(2), %rax >> > + bsf %rax, %rcx >> > + /* Calculate return value. 
*/ >> > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax >> > + sub %r8, %rax >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rax >> > + shr $2, %rcx >> > +# endif >> > + inc %rcx >> > + test %rdx, %rdx >> > + jz L(ret) >> > + cmp %rdx, %rcx >> > + cmovb %rcx, %rdx >> > + >> > +L(partial_copy_fourth_vector): >> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) >> > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_fourth_vector) >> > + ret >> > + >> > +END (STRLCPY) >> >> Is strlcpy/strlcat integratable with existing strncat impl? Had >> figured they would >> fit in the same file. > > > Hi Noah, > > It may not be a good idea to put strlcpy/strlcat in the existing strncpy/strncat impl file, > as strlcpy/strlcat functions are associated with GLIBC_2.38 ABI. > Well, we can put the impl there and include it from another to manage any special link cases. > --Sunil > >> >> > +#endif >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c >> > new file mode 100644 >> > index 0000000000..eee3b7b086 >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c >> > @@ -0,0 +1,25 @@ >> > +/* strlcpy generic. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. 
>> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > + >> > +#include <isa-level.h> >> > +#if ISA_SHOULD_BUILD (1) >> > +# define __strlcpy __strlcpy_generic >> > +# include <string/strlcpy.c> >> > + >> > +#endif >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c >> > new file mode 100644 >> > index 0000000000..ded41fbcfb >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/strlcpy.c >> > @@ -0,0 +1,36 @@ >> > +/* Multiple versions of strlcpy. >> > + All versions must be listed in ifunc-impl-list.c. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. >> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > +/* Define multiple versions only for the definition in libc. 
*/ >> > +#if IS_IN (libc) >> > +# define __strlcpy __redirect_strlcpy >> > +# include <string.h> >> > +# undef __strlcpy >> > + >> > +# define SYMBOL_NAME strlcpy >> > +# include "ifunc-strlcpy.h" >> > + >> > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ()); >> > +weak_alias (__strlcpy, strlcpy) >> > + >> > +# ifdef SHARED >> > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) >> > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); >> > +# endif >> > +#endif >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> > new file mode 100644 >> > index 0000000000..dafc20ded0 >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> > @@ -0,0 +1,4 @@ >> > +#define STRLCPY __wcslcpy_avx2 >> > +#define USE_AS_WCSLCPY 1 >> > + >> > +#include "strlcpy-avx2.S" >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c >> > new file mode 100644 >> > index 0000000000..ffd3c0e846 >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c >> > @@ -0,0 +1,25 @@ >> > +/* wcslcpy generic. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. 
>> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > + >> > +#include <isa-level.h> >> > +#if ISA_SHOULD_BUILD (1) >> > +# define __wcslcpy __wcslcpy_generic >> > +# include <wcsmbs/wcslcpy.c> >> > + >> > +#endif >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c >> > new file mode 100644 >> > index 0000000000..371ef9626c >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c >> > @@ -0,0 +1,35 @@ >> > +/* Multiple versions of wcslcpy. >> > + All versions must be listed in ifunc-impl-list.c. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. >> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > +/* Define multiple versions only for the definition in libc. 
*/ >> > +#if IS_IN (libc) >> > +# define __wcslcpy __redirect_wcslcpy >> > +# include <wchar.h> >> > +# undef __wcslcpy >> > + >> > +# define SYMBOL_NAME wcslcpy >> > +# include "ifunc-strlcpy.h" >> > + >> > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ()); >> > +weak_alias (__wcslcpy, wcslcpy) >> > +# ifdef SHARED >> > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) >> > + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); >> > +# endif >> > +#endif >> > -- >> > 2.38.1 >> > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-02 18:54 ` Noah Goldstein @ 2023-07-03 1:03 ` Sunil Pandey 2023-07-03 1:47 ` Noah Goldstein 0 siblings, 1 reply; 24+ messages in thread From: Sunil Pandey @ 2023-07-03 1:03 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha, hjl.tools [-- Attachment #1: Type: text/plain, Size: 31468 bytes --] On Sun, Jul 2, 2023 at 11:54 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > On Sun, Jul 2, 2023 at 1:38 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > > > On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> > wrote: > >> > >> On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha > >> <libc-alpha@sourceware.org> wrote: > >> > > >> > This patch optimizes strlcpy/wsclcpy string functions for AVX2. > >> > --- > >> > sysdeps/x86_64/multiarch/Makefile | 4 + > >> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + > >> > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ > >> > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 > +++++++++++++++++++++ > >> > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ > >> > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ > >> > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + > >> > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ > >> > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ > >> > 9 files changed, 627 insertions(+) > >> > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h > >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S > >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c > >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c > >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S > >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c > >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c > >> > > >> > diff --git a/sysdeps/x86_64/multiarch/Makefile > b/sysdeps/x86_64/multiarch/Makefile > >> > index e1e894c963..7e3fc081df 100644 > >> > --- 
a/sysdeps/x86_64/multiarch/Makefile > >> > +++ b/sysdeps/x86_64/multiarch/Makefile > >> > @@ -82,6 +82,8 @@ sysdep_routines += \ > >> > strcpy-sse2 \ > >> > strcpy-sse2-unaligned \ > >> > strcspn-sse4 \ > >> > + strlcpy-avx2 \ > >> > + strlcpy-generic \ > >> > strlen-avx2 \ > >> > strlen-avx2-rtm \ > >> > strlen-evex \ > >> > @@ -153,6 +155,8 @@ sysdep_routines += \ > >> > wcscpy-evex \ > >> > wcscpy-generic \ > >> > wcscpy-ssse3 \ > >> > + wcslcpy-avx2 \ > >> > + wcslcpy-generic \ > >> > wcslen-avx2 \ > >> > wcslen-avx2-rtm \ > >> > wcslen-evex \ > >> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > >> > index 5427ff1907..9928dee187 100644 > >> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > >> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > >> > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct > libc_ifunc_impl *array, > >> > 1, > >> > __strncat_sse2_unaligned)) > >> > > >> > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ > >> > + IFUNC_IMPL (i, name, strlcpy, > >> > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, > >> > + CPU_FEATURE_USABLE (AVX2), > >> > + __strlcpy_avx2) > >> > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, > >> > + 1, > >> > + __strlcpy_generic)) > >> > + > >> > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ > >> > IFUNC_IMPL (i, name, strncpy, > >> > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, > >> > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct > libc_ifunc_impl *array, > >> > 1, > >> > __wcscpy_generic)) > >> > > >> > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ > >> > + IFUNC_IMPL (i, name, wcslcpy, > >> > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, > >> > + CPU_FEATURE_USABLE (AVX2), > >> > + __wcslcpy_avx2) > >> > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, > >> > + 1, > >> > + __wcslcpy_generic)) > >> > + > >> > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. 
*/ > >> > IFUNC_IMPL (i, name, wcsncpy, > >> > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, > >> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > >> > new file mode 100644 > >> > index 0000000000..982a30d15b > >> > --- /dev/null > >> > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > >> > @@ -0,0 +1,34 @@ > >> > +/* Common definition for ifunc selections. > >> > + All versions must be listed in ifunc-impl-list.c. > >> > + Copyright (C) 2023 Free Software Foundation, Inc. > >> > + This file is part of the GNU C Library. > >> > + > >> > + The GNU C Library is free software; you can redistribute it and/or > >> > + modify it under the terms of the GNU Lesser General Public > >> > + License as published by the Free Software Foundation; either > >> > + version 2.1 of the License, or (at your option) any later version. > >> > + > >> > + The GNU C Library is distributed in the hope that it will be > useful, > >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of > >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >> > + Lesser General Public License for more details. > >> > + > >> > + You should have received a copy of the GNU Lesser General Public > >> > + License along with the GNU C Library; if not, see > >> > + <https://www.gnu.org/licenses/>. 
*/ > >> > + > >> > +#include <init-arch.h> > >> > + > >> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > >> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; > >> > + > >> > +static inline void * > >> > +IFUNC_SELECTOR (void) > >> > +{ > >> > + const struct cpu_features *cpu_features = __get_cpu_features (); > >> > + > >> > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) > >> > + return OPTIMIZE (avx2); > >> > + > >> > + return OPTIMIZE (generic); > >> > +} > >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S > b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > >> > new file mode 100644 > >> > index 0000000000..cf54b1e990 > >> > --- /dev/null > >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > >> > @@ -0,0 +1,446 @@ > >> > +/* Strlcpy/wcslcpy optimized with AVX2. > >> > + Copyright (C) 2023 Free Software Foundation, Inc. > >> > + This file is part of the GNU C Library. > >> > + > >> > + The GNU C Library is free software; you can redistribute it and/or > >> > + modify it under the terms of the GNU Lesser General Public > >> > + License as published by the Free Software Foundation; either > >> > + version 2.1 of the License, or (at your option) any later version. > >> > + > >> > + The GNU C Library is distributed in the hope that it will be > useful, > >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of > >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >> > + Lesser General Public License for more details. > >> > + > >> > + You should have received a copy of the GNU Lesser General Public > >> > + License along with the GNU C Library; if not, see > >> > + <https://www.gnu.org/licenses/>. 
*/ > >> > + > >> > +#include <isa-level.h> > >> > + > >> > +#if ISA_SHOULD_BUILD (3) > >> > + > >> > +# include <sysdep.h> > >> > + > >> > +# ifndef VEC_SIZE > >> > +# include "x86-avx-vecs.h" > >> > +# endif > >> > + > >> > +# ifndef STRLCPY > >> > +# define STRLCPY __strlcpy_avx2 > >> > +# endif > >> > + > >> > + > >> > +# ifdef USE_AS_WCSLCPY > >> > +# define CHAR_SIZE 4 > >> > +# define MOVU movl > >> > +# define VPCMPEQ vpcmpeqd > >> > +# define VPMINU vpminud > >> > +# else > >> > +# define CHAR_SIZE 1 > >> > +# define MOVU movb > >> > +# define VPCMPEQ vpcmpeqb > >> > +# define VPMINU vpminub > >> > +# endif > >> > + > >> > +# define PMOVMSK vpmovmskb > >> > +# define PAGE_SIZE 4096 > >> > +# define VEC_SIZE 32 > >> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > >> > + > >> > + .section SECTION(.text),"ax",@progbits > >> > +/* Aligning entry point to 64 byte, provides better performance for > >> > + one vector length string. */ > >> > + > >> > +ENTRY_P2ALIGN (STRLCPY, 6) > >> > +# ifdef __ILP32__ > >> > + /* Clear the upper 32 bits. */ > >> > + movl %edx, %edx > >> > +# endif > >> > + > >> > + /* Zero out vector register for end of string comparison. */ > >> > + vpxor %VMM(0), %VMM(0), %VMM(0) > >> > + /* Save source pointer for return calculation. */ > >> > + mov %rsi, %r8 > >> > + mov %esi, %eax > >> > + sall $20, %eax > >> > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax > >> > + ja L(page_cross) > >> > + > >> > +L(page_cross_continue): > >> > + /* Load first vector. */ > >> > + VMOVU (%rsi), %VMM(1) > >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > >> > + PMOVMSK %VMM(2), %eax > >> > + test %eax, %eax > >> > + jnz L(ret_vec_x1) > >> > + > >> > + test %rdx, %rdx > >> > + jz L(continue_second_vector) > >> > + > >> > + /* Check whether we can copy full vector. */ > >> > + cmp $CHAR_PER_VEC, %rdx > >> > + jbe L(page_cross_small_vec_copy) > >> > + /* Copy first vector. 
*/ > >> > + VMOVU %VMM(1), (%rdi) > >> > + sub $CHAR_PER_VEC, %rdx > >> > + > >> > +L(continue_second_vector): > >> > + /* Align RSI pointer and adjust RDI based on offset. */ > >> > + mov %rsi, %rax > >> > + and $-VEC_SIZE, %rsi > >> > + sub %rsi, %rax > >> > + sub %rax, %rdi > >> > + > >> > + /* Check if string already copied N char, and RDX is 0. */ > >> > + test %rdx, %rdx > >> > + jz L(skip_copy_alignment_fix) > >> > + > >> > + /* Adjust RDX for copy alignment fix. */ > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %rax > >> > +# endif > >> > + add %rax, %rdx > >> > + > >> > +L(skip_copy_alignment_fix): > >> > + /* Load second vector. */ > >> > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > >> > + vptest %VMM(2), %VMM(2) > >> > + jnz L(ret_vec_x2) > >> > + > >> > + /* Skip copy if RDX is 0. */ > >> > + test %rdx, %rdx > >> > + jz L(continue_third_vector) > >> > + > >> > + /* Jump below/equal(instead of below) used here, because last > >> > + copy chracter must be NULL. */ > >> > + cmp $CHAR_PER_VEC, %rdx > >> > + jbe L(partial_copy_second_vector) > >> > + > >> > + sub $CHAR_PER_VEC, %rdx > >> > + /* Copy second vector. */ > >> > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > >> > + > >> > +L(continue_third_vector): > >> > + /* Load third vector. */ > >> > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) > >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > >> > + vptest %VMM(2), %VMM(2) > >> > + jnz L(ret_vec_x3) > >> > + > >> > + /* Skip copy if RDX is 0. */ > >> > + test %rdx, %rdx > >> > + jz L(continue_fourth_vector) > >> > + > >> > + cmp $CHAR_PER_VEC, %rdx > >> > + jbe L(partial_copy_third_vector) > >> > + > >> > + sub $CHAR_PER_VEC, %rdx > >> > + /* Copy third vector. */ > >> > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) > >> > + > >> > +L(continue_fourth_vector): > >> > + /* Load fourth vector. 
*/ > >> > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) > >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > >> > + vptest %VMM(2), %VMM(2) > >> > + jnz L(ret_vec_x4) > >> > + > >> > + /* Skip copy if RDX is 0. */ > >> > + test %rdx, %rdx > >> > + jz L(loop_4x_align) > >> > + > >> > + cmp $CHAR_PER_VEC, %rdx > >> > + jbe L(partial_copy_fourth_vector) > >> > + > >> > + sub $CHAR_PER_VEC, %rdx > >> > + /* Copy fourth vector. */ > >> > + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) > >> > + > >> > + > >> > +L(loop_4x_align): > >> > + /* Jump to loop if RSI is already 4 vector align. */ > >> > + test $(VEC_SIZE * 4 - 1), %esi > >> > + jz L(loop_4x_read) > >> > + > >> > + mov %rsi, %rcx > >> > + > >> > + /* Align RSI to 4x vector. */ > >> > + and $(VEC_SIZE * -4), %rsi > >> > + sub %rsi, %rcx > >> > + > >> > + /* Adjust RDI for RSI alignment fix. */ > >> > + sub %rcx, %rdi > >> > + > >> > + /* Jump to loop if RDX is 0. */ > >> > + test %rdx, %rdx > >> > + jz L(loop_4x_read) > >> > + > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %rcx > >> > +# endif > >> > + > >> > + /* Adjust RDX for RSI alignment fix. */ > >> > + add %rcx, %rdx > >> > + jmp L(loop_4x_read) > >> > + > >> > + .p2align 4,,6 > >> > +L(loop_4x_vec): > >> > + /* Skip copy if RDX is 0. 
*/ > >> > + test %rdx, %rdx > >> > + jz L(loop_partial_copy_return) > >> > + cmp $(CHAR_PER_VEC * 4), %rdx > >> > + jbe L(loop_partial_copy) > >> > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > >> > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > >> > + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) > >> > + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) > >> > + sub $(CHAR_PER_VEC * 4), %rdx > >> > + > >> > +L(loop_partial_copy_return): > >> > + sub $(VEC_SIZE * -4), %rsi > >> > + sub $(VEC_SIZE * -4), %rdi > >> > + > >> > +L(loop_4x_read): > >> > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > >> > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > >> > + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) > >> > + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) > >> > + VPMINU %VMM(1), %VMM(2), %VMM(5) > >> > + VPMINU %VMM(3), %VMM(4), %VMM(6) > >> > + VPMINU %VMM(5), %VMM(6), %VMM(7) > >> > + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) > >> > + vptest %VMM(7), %VMM(7) > >> > + > >> > + jz L(loop_4x_vec) > >> > + > >> > + /* Check if string ends in first vector or second vector. */ > >> > + lea (VEC_SIZE * 4)(%rsi), %rax > >> > + sub %r8, %rax > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %rax > >> > +# endif > >> > + xor %r10, %r10 > >> > + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) > >> > + vptest %VMM(6), %VMM(6) > >> > + jnz L(endloop) > >> > + sub $(CHAR_PER_VEC * -2), %rax > >> > + mov $(CHAR_PER_VEC * 2), %r10 > >> > + VMOVA %VMM(3), %VMM(1) > >> > + VMOVA %VMM(4), %VMM(2) > >> > + > >> > +L(endloop): > >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) > >> > + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) > >> > + PMOVMSK %VMM(1), %rcx > >> > + PMOVMSK %VMM(2), %r9 > >> > + shlq $32, %r9 > >> > + orq %r9, %rcx > >> > + bsf %rcx, %rcx > >> > + /* Shift RCX by 2, VPMOVMSK has only byte version. */ > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %rcx > >> > +# endif > >> > + /* At this point RAX has length to return. */ > >> > + add %rcx, %rax > >> > + test %rdx, %rdx > >> > + jz L(ret) > >> > + > >> > + /* Add 1 to account for NULL character in RDX comparison. 
*/ > >> > + lea 1(%r10, %rcx), %rcx > >> > + cmp %rdx, %rcx > >> > + cmovb %rcx, %rdx > >> > + > >> > +L(loop_partial_copy): > >> > + cmp $(CHAR_PER_VEC * 2), %rdx > >> > + jbe L(loop_partial_first_half) > >> > + /* Reload first 2 vector. */ > >> > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > >> > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > >> > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > >> > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > >> > + > >> > +L(loop_partial_first_half): > >> > + /* Go back 2 vector from last and use overlapping copy. > >> > + (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) > >> > + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) > >> > + */ > >> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) > >> > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) > >> > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > >> > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > >> > + xor %rdx, %rdx > >> > + vptest %VMM(7), %VMM(7) > >> > + jz L(loop_partial_copy_return) > >> > + ret > >> > + > >> > + .p2align 4 > >> > +L(page_cross): > >> > + mov %rsi, %rcx > >> > + mov %rsi, %r11 > >> > + and $-VEC_SIZE, %r11 > >> > + and $(VEC_SIZE - 1), %rcx > >> > + VMOVA (%r11), %VMM(1) > >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > >> > + PMOVMSK %VMM(2), %eax > >> > + shr %cl, %eax > >> > + jz L(page_cross_continue) > >> > + > >> > +L(ret_vec_x1): > >> > + bsf %eax, %eax > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %eax > >> > +# endif > >> > + /* Increment by 1 to account for NULL char. 
*/ > >> > + lea 1(%eax), %ecx > >> > + cmp %rdx, %rcx > >> > + cmovb %rcx, %rdx > >> > + test %rdx, %rdx > >> > + jz L(ret) > >> > + > >> > +L(page_cross_small_vec_copy): > >> > + cmp $(16 / CHAR_SIZE), %rdx > >> > + jbe L(copy_8_byte_scalar) > >> > + VMOVU (%rsi), %VMM_128(1) > >> > + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) > >> > + VMOVU %VMM_128(1), (%rdi) > >> > + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > >> > + xor %rdx, %rdx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_second_vector) > >> > + ret > >> > + > >> > +L(copy_8_byte_scalar): > >> > + cmp $(8 / CHAR_SIZE), %rdx > >> > + jbe L(copy_4_byte_scalar) > >> > + movq (%rsi), %r10 > >> > + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 > >> > + movq %r10, (%rdi) > >> > + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > >> > + xor %edx, %edx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_second_vector) > >> > + ret > >> > + > >> > +L(copy_4_byte_scalar): > >> > +# ifndef USE_AS_WCSLCPY > >> > + cmp $4, %rdx > >> > + jbe L(copy_2_byte_scalar) > >> > +# endif > >> > + movl (%rsi), %r10d > >> > + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d > >> > + movl %r10d, (%rdi) > >> > + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > >> > + xor %edx, %edx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_second_vector) > >> > + ret > >> > + > >> > +# ifndef USE_AS_WCSLCPY > >> > +L(copy_2_byte_scalar): > >> > + cmp $2, %rdx > >> > + jbe L(copy_1_byte_scalar) > >> > + movw (%rsi), %r10w > >> > + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w > >> > + movw %r10w, (%rdi) > >> > + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > >> > + xor %edx, %edx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_second_vector) > >> > + ret > >> > + > >> > +L(copy_1_byte_scalar): > >> > + MOVU 
(%rsi), %r10b > >> > + MOVU %r10b, (%rdi) > >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > >> > + xor %edx, %edx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_second_vector) > >> > + ret > >> > +# endif > >> > + > >> > +L(ret_vec_x2): > >> > + PMOVMSK %VMM(2), %rax > >> > + bsf %rax, %rcx > >> > + /* Calculate return value. */ > >> > + lea VEC_SIZE(%rsi, %rcx), %rax > >> > + sub %r8, %rax > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %rax > >> > + shr $2, %rcx > >> > +# endif > >> > + inc %rcx > >> > + test %rdx, %rdx > >> > + jz L(ret) > >> > + cmp %rdx, %rcx > >> > + cmovb %rcx, %rdx > >> > + > >> > +L(partial_copy_second_vector): > >> > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) > >> > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > >> > + xor %edx, %edx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_third_vector) > >> > + > >> > +L(ret): > >> > + ret > >> > + > >> > +L(ret_vec_x3): > >> > + PMOVMSK %VMM(2), %rax > >> > + bsf %rax, %rcx > >> > + /* Calculate return value. */ > >> > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax > >> > + sub %r8, %rax > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %rax > >> > + shr $2, %rcx > >> > +# endif > >> > + inc %rcx > >> > + test %rdx, %rdx > >> > + jz L(ret) > >> > + cmp %rdx, %rcx > >> > + cmovb %rcx, %rdx > >> > + > >> > +L(partial_copy_third_vector): > >> > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > >> > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, > CHAR_SIZE) > >> > + xor %edx, %edx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_fourth_vector) > >> > + ret > >> > + > >> > +L(ret_vec_x4): > >> > + PMOVMSK %VMM(2), %rax > >> > + bsf %rax, %rcx > >> > + /* Calculate return value. 
*/ > >> > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax > >> > + sub %r8, %rax > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %rax > >> > + shr $2, %rcx > >> > +# endif > >> > + inc %rcx > >> > + test %rdx, %rdx > >> > + jz L(ret) > >> > + cmp %rdx, %rcx > >> > + cmovb %rcx, %rdx > >> > + > >> > +L(partial_copy_fourth_vector): > >> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > >> > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, > CHAR_SIZE) > >> > + xor %edx, %edx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_fourth_vector) > >> > + ret > >> > + > >> > +END (STRLCPY) > >> > >> Is strlcpy/strlcat integratable with existing strncat impl? Had > >> figured they would > >> fit in the same file. > > > > > > Hi Noah, > > > > It may not be a good idea to put strlcpy/strlcat in the existing > strncpy/strnat impl file, > > as strlcpy/strlcat functions are associated with GLIBC_2.38 ABI. > > > Well, we can put the impl there and include it from another to manage > any special > link cases. > Due to ABI, none of strlcpy/strlcat changes can go in the glibc version earlier than 2.38, to avoid any future strncpy backporting complications, it is better to keep them in separate files for now. > > --Sunil > > > >> > >> > +#endif > >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c > b/sysdeps/x86_64/multiarch/strlcpy-generic.c > >> > new file mode 100644 > >> > index 0000000000..eee3b7b086 > >> > --- /dev/null > >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c > >> > @@ -0,0 +1,25 @@ > >> > +/* strlcpy generic. > >> > + Copyright (C) 2023 Free Software Foundation, Inc. > >> > + This file is part of the GNU C Library. 
> >> > + > >> > + The GNU C Library is free software; you can redistribute it and/or > >> > + modify it under the terms of the GNU Lesser General Public > >> > + License as published by the Free Software Foundation; either > >> > + version 2.1 of the License, or (at your option) any later version. > >> > + > >> > + The GNU C Library is distributed in the hope that it will be > useful, > >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of > >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >> > + Lesser General Public License for more details. > >> > + > >> > + You should have received a copy of the GNU Lesser General Public > >> > + License along with the GNU C Library; if not, see > >> > + <https://www.gnu.org/licenses/>. */ > >> > + > >> > + > >> > +#include <isa-level.h> > >> > +#if ISA_SHOULD_BUILD (1) > >> > +# define __strlcpy __strlcpy_generic > >> > +# include <string/strlcpy.c> > >> > + > >> > +#endif > >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c > b/sysdeps/x86_64/multiarch/strlcpy.c > >> > new file mode 100644 > >> > index 0000000000..ded41fbcfb > >> > --- /dev/null > >> > +++ b/sysdeps/x86_64/multiarch/strlcpy.c > >> > @@ -0,0 +1,36 @@ > >> > +/* Multiple versions of strlcpy. > >> > + All versions must be listed in ifunc-impl-list.c. > >> > + Copyright (C) 2023 Free Software Foundation, Inc. > >> > + This file is part of the GNU C Library. > >> > + > >> > + The GNU C Library is free software; you can redistribute it and/or > >> > + modify it under the terms of the GNU Lesser General Public > >> > + License as published by the Free Software Foundation; either > >> > + version 2.1 of the License, or (at your option) any later version. > >> > + > >> > + The GNU C Library is distributed in the hope that it will be > useful, > >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of > >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU > >> > + Lesser General Public License for more details. > >> > + > >> > + You should have received a copy of the GNU Lesser General Public > >> > + License along with the GNU C Library; if not, see > >> > + <https://www.gnu.org/licenses/>. */ > >> > + > >> > +/* Define multiple versions only for the definition in libc. */ > >> > +#if IS_IN (libc) > >> > +# define __strlcpy __redirect_strlcpy > >> > +# include <string.h> > >> > +# undef __strlcpy > >> > + > >> > +# define SYMBOL_NAME strlcpy > >> > +# include "ifunc-strlcpy.h" > >> > + > >> > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR > ()); > >> > +weak_alias (__strlcpy, strlcpy) > >> > + > >> > +# ifdef SHARED > >> > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) > >> > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ > (strlcpy); > >> > +# endif > >> > +#endif > >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > >> > new file mode 100644 > >> > index 0000000000..dafc20ded0 > >> > --- /dev/null > >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > >> > @@ -0,0 +1,4 @@ > >> > +#define STRLCPY __wcslcpy_avx2 > >> > +#define USE_AS_WCSLCPY 1 > >> > + > >> > +#include "strlcpy-avx2.S" > >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c > b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > >> > new file mode 100644 > >> > index 0000000000..ffd3c0e846 > >> > --- /dev/null > >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > >> > @@ -0,0 +1,25 @@ > >> > +/* wcslcpy generic. > >> > + Copyright (C) 2023 Free Software Foundation, Inc. > >> > + This file is part of the GNU C Library. > >> > + > >> > + The GNU C Library is free software; you can redistribute it and/or > >> > + modify it under the terms of the GNU Lesser General Public > >> > + License as published by the Free Software Foundation; either > >> > + version 2.1 of the License, or (at your option) any later version. 
> >> > + > >> > + The GNU C Library is distributed in the hope that it will be > useful, > >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of > >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >> > + Lesser General Public License for more details. > >> > + > >> > + You should have received a copy of the GNU Lesser General Public > >> > + License along with the GNU C Library; if not, see > >> > + <https://www.gnu.org/licenses/>. */ > >> > + > >> > + > >> > +#include <isa-level.h> > >> > +#if ISA_SHOULD_BUILD (1) > >> > +# define __wcslcpy __wcslcpy_generic > >> > +# include <wcsmbs/wcslcpy.c> > >> > + > >> > +#endif > >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c > b/sysdeps/x86_64/multiarch/wcslcpy.c > >> > new file mode 100644 > >> > index 0000000000..371ef9626c > >> > --- /dev/null > >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c > >> > @@ -0,0 +1,35 @@ > >> > +/* Multiple versions of wcslcpy. > >> > + All versions must be listed in ifunc-impl-list.c. > >> > + Copyright (C) 2023 Free Software Foundation, Inc. > >> > + This file is part of the GNU C Library. > >> > + > >> > + The GNU C Library is free software; you can redistribute it and/or > >> > + modify it under the terms of the GNU Lesser General Public > >> > + License as published by the Free Software Foundation; either > >> > + version 2.1 of the License, or (at your option) any later version. > >> > + > >> > + The GNU C Library is distributed in the hope that it will be > useful, > >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of > >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >> > + Lesser General Public License for more details. > >> > + > >> > + You should have received a copy of the GNU Lesser General Public > >> > + License along with the GNU C Library; if not, see > >> > + <https://www.gnu.org/licenses/>. */ > >> > + > >> > +/* Define multiple versions only for the definition in libc. 
*/ > >> > +#if IS_IN (libc) > >> > +# define __wcslcpy __redirect_wcslcpy > >> > +# include <wchar.h> > >> > +# undef __wcslcpy > >> > + > >> > +# define SYMBOL_NAME wcslcpy > >> > +# include "ifunc-strlcpy.h" > >> > + > >> > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR > ()); > >> > +weak_alias (__wcslcpy, wcslcpy) > >> > +# ifdef SHARED > >> > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) > >> > + __attribute__((visibility ("hidden"))) __attribute_copy__ > (wcslcpy); > >> > +# endif > >> > +#endif > >> > -- > >> > 2.38.1 > >> > > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-03 1:03 ` Sunil Pandey @ 2023-07-03 1:47 ` Noah Goldstein 0 siblings, 0 replies; 24+ messages in thread From: Noah Goldstein @ 2023-07-03 1:47 UTC (permalink / raw) To: Sunil Pandey; +Cc: libc-alpha, hjl.tools On Sun, Jul 2, 2023 at 8:04 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Sun, Jul 2, 2023 at 11:54 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> On Sun, Jul 2, 2023 at 1:38 PM Sunil Pandey <skpgkp2@gmail.com> wrote: >> > >> > >> > >> > On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> >> >> On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha >> >> <libc-alpha@sourceware.org> wrote: >> >> > >> >> > This patch optimizes strlcpy/wsclcpy string functions for AVX2. >> >> > --- >> >> > sysdeps/x86_64/multiarch/Makefile | 4 + >> >> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + >> >> > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ >> >> > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ >> >> > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ >> >> > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ >> >> > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + >> >> > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ >> >> > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ >> >> > 9 files changed, 627 insertions(+) >> >> > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S >> >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c >> >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c >> >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c >> >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c >> >> > >> >> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile >> >> > index e1e894c963..7e3fc081df 100644 
>> >> > --- a/sysdeps/x86_64/multiarch/Makefile >> >> > +++ b/sysdeps/x86_64/multiarch/Makefile >> >> > @@ -82,6 +82,8 @@ sysdep_routines += \ >> >> > strcpy-sse2 \ >> >> > strcpy-sse2-unaligned \ >> >> > strcspn-sse4 \ >> >> > + strlcpy-avx2 \ >> >> > + strlcpy-generic \ >> >> > strlen-avx2 \ >> >> > strlen-avx2-rtm \ >> >> > strlen-evex \ >> >> > @@ -153,6 +155,8 @@ sysdep_routines += \ >> >> > wcscpy-evex \ >> >> > wcscpy-generic \ >> >> > wcscpy-ssse3 \ >> >> > + wcslcpy-avx2 \ >> >> > + wcslcpy-generic \ >> >> > wcslen-avx2 \ >> >> > wcslen-avx2-rtm \ >> >> > wcslen-evex \ >> >> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> >> > index 5427ff1907..9928dee187 100644 >> >> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> >> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> >> > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> >> > 1, >> >> > __strncat_sse2_unaligned)) >> >> > >> >> > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ >> >> > + IFUNC_IMPL (i, name, strlcpy, >> >> > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, >> >> > + CPU_FEATURE_USABLE (AVX2), >> >> > + __strlcpy_avx2) >> >> > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, >> >> > + 1, >> >> > + __strlcpy_generic)) >> >> > + >> >> > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ >> >> > IFUNC_IMPL (i, name, strncpy, >> >> > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, >> >> > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> >> > 1, >> >> > __wcscpy_generic)) >> >> > >> >> > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ >> >> > + IFUNC_IMPL (i, name, wcslcpy, >> >> > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, >> >> > + CPU_FEATURE_USABLE (AVX2), >> >> > + __wcslcpy_avx2) >> >> > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, >> >> > + 1, >> >> > + __wcslcpy_generic)) >> >> > + >> >> > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. 
*/ >> >> > IFUNC_IMPL (i, name, wcsncpy, >> >> > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, >> >> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> >> > new file mode 100644 >> >> > index 0000000000..982a30d15b >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> >> > @@ -0,0 +1,34 @@ >> >> > +/* Common definition for ifunc selections. >> >> > + All versions must be listed in ifunc-impl-list.c. >> >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> >> > + This file is part of the GNU C Library. >> >> > + >> >> > + The GNU C Library is free software; you can redistribute it and/or >> >> > + modify it under the terms of the GNU Lesser General Public >> >> > + License as published by the Free Software Foundation; either >> >> > + version 2.1 of the License, or (at your option) any later version. >> >> > + >> >> > + The GNU C Library is distributed in the hope that it will be useful, >> >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >> > + Lesser General Public License for more details. >> >> > + >> >> > + You should have received a copy of the GNU Lesser General Public >> >> > + License along with the GNU C Library; if not, see >> >> > + <https://www.gnu.org/licenses/>. 
*/ >> >> > + >> >> > +#include <init-arch.h> >> >> > + >> >> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; >> >> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; >> >> > + >> >> > +static inline void * >> >> > +IFUNC_SELECTOR (void) >> >> > +{ >> >> > + const struct cpu_features *cpu_features = __get_cpu_features (); >> >> > + >> >> > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) >> >> > + return OPTIMIZE (avx2); >> >> > + >> >> > + return OPTIMIZE (generic); >> >> > +} >> >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S >> >> > new file mode 100644 >> >> > index 0000000000..cf54b1e990 >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S >> >> > @@ -0,0 +1,446 @@ >> >> > +/* Strlcpy/wcslcpy optimized with AVX2. >> >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> >> > + This file is part of the GNU C Library. >> >> > + >> >> > + The GNU C Library is free software; you can redistribute it and/or >> >> > + modify it under the terms of the GNU Lesser General Public >> >> > + License as published by the Free Software Foundation; either >> >> > + version 2.1 of the License, or (at your option) any later version. >> >> > + >> >> > + The GNU C Library is distributed in the hope that it will be useful, >> >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >> > + Lesser General Public License for more details. >> >> > + >> >> > + You should have received a copy of the GNU Lesser General Public >> >> > + License along with the GNU C Library; if not, see >> >> > + <https://www.gnu.org/licenses/>. 
*/ >> >> > + >> >> > +#include <isa-level.h> >> >> > + >> >> > +#if ISA_SHOULD_BUILD (3) >> >> > + >> >> > +# include <sysdep.h> >> >> > + >> >> > +# ifndef VEC_SIZE >> >> > +# include "x86-avx-vecs.h" >> >> > +# endif >> >> > + >> >> > +# ifndef STRLCPY >> >> > +# define STRLCPY __strlcpy_avx2 >> >> > +# endif >> >> > + >> >> > + >> >> > +# ifdef USE_AS_WCSLCPY >> >> > +# define CHAR_SIZE 4 >> >> > +# define MOVU movl >> >> > +# define VPCMPEQ vpcmpeqd >> >> > +# define VPMINU vpminud >> >> > +# else >> >> > +# define CHAR_SIZE 1 >> >> > +# define MOVU movb >> >> > +# define VPCMPEQ vpcmpeqb >> >> > +# define VPMINU vpminub >> >> > +# endif >> >> > + >> >> > +# define PMOVMSK vpmovmskb >> >> > +# define PAGE_SIZE 4096 >> >> > +# define VEC_SIZE 32 >> >> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) >> >> > + >> >> > + .section SECTION(.text),"ax",@progbits >> >> > +/* Aligning entry point to 64 byte, provides better performance for >> >> > + one vector length string. */ >> >> > + >> >> > +ENTRY_P2ALIGN (STRLCPY, 6) >> >> > +# ifdef __ILP32__ >> >> > + /* Clear the upper 32 bits. */ >> >> > + movl %edx, %edx >> >> > +# endif >> >> > + >> >> > + /* Zero out vector register for end of string comparison. */ >> >> > + vpxor %VMM(0), %VMM(0), %VMM(0) >> >> > + /* Save source pointer for return calculation. */ >> >> > + mov %rsi, %r8 >> >> > + mov %esi, %eax >> >> > + sall $20, %eax >> >> > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax >> >> > + ja L(page_cross) >> >> > + >> >> > +L(page_cross_continue): >> >> > + /* Load first vector. */ >> >> > + VMOVU (%rsi), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> >> > + PMOVMSK %VMM(2), %eax >> >> > + test %eax, %eax >> >> > + jnz L(ret_vec_x1) >> >> > + >> >> > + test %rdx, %rdx >> >> > + jz L(continue_second_vector) >> >> > + >> >> > + /* Check whether we can copy full vector. */ >> >> > + cmp $CHAR_PER_VEC, %rdx >> >> > + jbe L(page_cross_small_vec_copy) >> >> > + /* Copy first vector. 
*/ >> >> > + VMOVU %VMM(1), (%rdi) >> >> > + sub $CHAR_PER_VEC, %rdx >> >> > + >> >> > +L(continue_second_vector): >> >> > + /* Align RSI pointer and adjust RDI based on offset. */ >> >> > + mov %rsi, %rax >> >> > + and $-VEC_SIZE, %rsi >> >> > + sub %rsi, %rax >> >> > + sub %rax, %rdi >> >> > + >> >> > + /* Check if string already copied N char, and RDX is 0. */ >> >> > + test %rdx, %rdx >> >> > + jz L(skip_copy_alignment_fix) >> >> > + >> >> > + /* Adjust RDX for copy alignment fix. */ >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rax >> >> > +# endif >> >> > + add %rax, %rdx >> >> > + >> >> > +L(skip_copy_alignment_fix): >> >> > + /* Load second vector. */ >> >> > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jnz L(ret_vec_x2) >> >> > + >> >> > + /* Skip copy if RDX is 0. */ >> >> > + test %rdx, %rdx >> >> > + jz L(continue_third_vector) >> >> > + >> >> > + /* Jump below/equal(instead of below) used here, because last >> >> > + copy chracter must be NULL. */ >> >> > + cmp $CHAR_PER_VEC, %rdx >> >> > + jbe L(partial_copy_second_vector) >> >> > + >> >> > + sub $CHAR_PER_VEC, %rdx >> >> > + /* Copy second vector. */ >> >> > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) >> >> > + >> >> > +L(continue_third_vector): >> >> > + /* Load third vector. */ >> >> > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jnz L(ret_vec_x3) >> >> > + >> >> > + /* Skip copy if RDX is 0. */ >> >> > + test %rdx, %rdx >> >> > + jz L(continue_fourth_vector) >> >> > + >> >> > + cmp $CHAR_PER_VEC, %rdx >> >> > + jbe L(partial_copy_third_vector) >> >> > + >> >> > + sub $CHAR_PER_VEC, %rdx >> >> > + /* Copy third vector. */ >> >> > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) >> >> > + >> >> > +L(continue_fourth_vector): >> >> > + /* Load fourth vector. 
*/ >> >> > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jnz L(ret_vec_x4) >> >> > + >> >> > + /* Skip copy if RDX is 0. */ >> >> > + test %rdx, %rdx >> >> > + jz L(loop_4x_align) >> >> > + >> >> > + cmp $CHAR_PER_VEC, %rdx >> >> > + jbe L(partial_copy_fourth_vector) >> >> > + >> >> > + sub $CHAR_PER_VEC, %rdx >> >> > + /* Copy fourth vector. */ >> >> > + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) >> >> > + >> >> > + >> >> > +L(loop_4x_align): >> >> > + /* Jump to loop if RSI is already 4 vector align. */ >> >> > + test $(VEC_SIZE * 4 - 1), %esi >> >> > + jz L(loop_4x_read) >> >> > + >> >> > + mov %rsi, %rcx >> >> > + >> >> > + /* Align RSI to 4x vector. */ >> >> > + and $(VEC_SIZE * -4), %rsi >> >> > + sub %rsi, %rcx >> >> > + >> >> > + /* Adjust RDI for RSI alignment fix. */ >> >> > + sub %rcx, %rdi >> >> > + >> >> > + /* Jump to loop if RDX is 0. */ >> >> > + test %rdx, %rdx >> >> > + jz L(loop_4x_read) >> >> > + >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rcx >> >> > +# endif >> >> > + >> >> > + /* Adjust RDX for RSI alignment fix. */ >> >> > + add %rcx, %rdx >> >> > + jmp L(loop_4x_read) >> >> > + >> >> > + .p2align 4,,6 >> >> > +L(loop_4x_vec): >> >> > + /* Skip copy if RDX is 0. 
*/ >> >> > + test %rdx, %rdx >> >> > + jz L(loop_partial_copy_return) >> >> > + cmp $(CHAR_PER_VEC * 4), %rdx >> >> > + jbe L(loop_partial_copy) >> >> > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) >> >> > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) >> >> > + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) >> >> > + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) >> >> > + sub $(CHAR_PER_VEC * 4), %rdx >> >> > + >> >> > +L(loop_partial_copy_return): >> >> > + sub $(VEC_SIZE * -4), %rsi >> >> > + sub $(VEC_SIZE * -4), %rdi >> >> > + >> >> > +L(loop_4x_read): >> >> > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) >> >> > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) >> >> > + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) >> >> > + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) >> >> > + VPMINU %VMM(1), %VMM(2), %VMM(5) >> >> > + VPMINU %VMM(3), %VMM(4), %VMM(6) >> >> > + VPMINU %VMM(5), %VMM(6), %VMM(7) >> >> > + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) >> >> > + vptest %VMM(7), %VMM(7) >> >> > + >> >> > + jz L(loop_4x_vec) >> >> > + >> >> > + /* Check if string ends in first vector or second vector. */ >> >> > + lea (VEC_SIZE * 4)(%rsi), %rax >> >> > + sub %r8, %rax >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rax >> >> > +# endif >> >> > + xor %r10, %r10 >> >> > + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) >> >> > + vptest %VMM(6), %VMM(6) >> >> > + jnz L(endloop) >> >> > + sub $(CHAR_PER_VEC * -2), %rax >> >> > + mov $(CHAR_PER_VEC * 2), %r10 >> >> > + VMOVA %VMM(3), %VMM(1) >> >> > + VMOVA %VMM(4), %VMM(2) >> >> > + >> >> > +L(endloop): >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) >> >> > + PMOVMSK %VMM(1), %rcx >> >> > + PMOVMSK %VMM(2), %r9 >> >> > + shlq $32, %r9 >> >> > + orq %r9, %rcx >> >> > + bsf %rcx, %rcx >> >> > + /* Shift RCX by 2, VPMOVMSK has only byte version. */ >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rcx >> >> > +# endif >> >> > + /* At this point RAX has length to return. 
*/ >> >> > + add %rcx, %rax >> >> > + test %rdx, %rdx >> >> > + jz L(ret) >> >> > + >> >> > + /* Add 1 to account for NULL character in RDX comparison. */ >> >> > + lea 1(%r10, %rcx), %rcx >> >> > + cmp %rdx, %rcx >> >> > + cmovb %rcx, %rdx >> >> > + >> >> > +L(loop_partial_copy): >> >> > + cmp $(CHAR_PER_VEC * 2), %rdx >> >> > + jbe L(loop_partial_first_half) >> >> > + /* Reload first 2 vector. */ >> >> > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) >> >> > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) >> >> > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) >> >> > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) >> >> > + >> >> > +L(loop_partial_first_half): >> >> > + /* Go back 2 vector from last and use overlapping copy. >> >> > + (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) >> >> > + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) >> >> > + */ >> >> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) >> >> > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) >> >> > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) >> >> > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %rdx, %rdx >> >> > + vptest %VMM(7), %VMM(7) >> >> > + jz L(loop_partial_copy_return) >> >> > + ret >> >> > + >> >> > + .p2align 4 >> >> > +L(page_cross): >> >> > + mov %rsi, %rcx >> >> > + mov %rsi, %r11 >> >> > + and $-VEC_SIZE, %r11 >> >> > + and $(VEC_SIZE - 1), %rcx >> >> > + VMOVA (%r11), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> >> > + PMOVMSK %VMM(2), %eax >> >> > + shr %cl, %eax >> >> > + jz L(page_cross_continue) >> >> > + >> >> > +L(ret_vec_x1): >> >> > + bsf %eax, %eax >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %eax >> >> > +# endif >> >> > + /* Increment by 1 to account for NULL char. 
*/ >> >> > + lea 1(%eax), %ecx >> >> > + cmp %rdx, %rcx >> >> > + cmovb %rcx, %rdx >> >> > + test %rdx, %rdx >> >> > + jz L(ret) >> >> > + >> >> > +L(page_cross_small_vec_copy): >> >> > + cmp $(16 / CHAR_SIZE), %rdx >> >> > + jbe L(copy_8_byte_scalar) >> >> > + VMOVU (%rsi), %VMM_128(1) >> >> > + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) >> >> > + VMOVU %VMM_128(1), (%rdi) >> >> > + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %rdx, %rdx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_second_vector) >> >> > + ret >> >> > + >> >> > +L(copy_8_byte_scalar): >> >> > + cmp $(8 / CHAR_SIZE), %rdx >> >> > + jbe L(copy_4_byte_scalar) >> >> > + movq (%rsi), %r10 >> >> > + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 >> >> > + movq %r10, (%rdi) >> >> > + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %edx, %edx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_second_vector) >> >> > + ret >> >> > + >> >> > +L(copy_4_byte_scalar): >> >> > +# ifndef USE_AS_WCSLCPY >> >> > + cmp $4, %rdx >> >> > + jbe L(copy_2_byte_scalar) >> >> > +# endif >> >> > + movl (%rsi), %r10d >> >> > + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d >> >> > + movl %r10d, (%rdi) >> >> > + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %edx, %edx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_second_vector) >> >> > + ret >> >> > + >> >> > +# ifndef USE_AS_WCSLCPY >> >> > +L(copy_2_byte_scalar): >> >> > + cmp $2, %rdx >> >> > + jbe L(copy_1_byte_scalar) >> >> > + movw (%rsi), %r10w >> >> > + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w >> >> > + movw %r10w, (%rdi) >> >> > + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %edx, %edx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_second_vector) >> >> > 
+ ret >> >> > + >> >> > +L(copy_1_byte_scalar): >> >> > + MOVU (%rsi), %r10b >> >> > + MOVU %r10b, (%rdi) >> >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %edx, %edx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_second_vector) >> >> > + ret >> >> > +# endif >> >> > + >> >> > +L(ret_vec_x2): >> >> > + PMOVMSK %VMM(2), %rax >> >> > + bsf %rax, %rcx >> >> > + /* Calculate return value. */ >> >> > + lea VEC_SIZE(%rsi, %rcx), %rax >> >> > + sub %r8, %rax >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rax >> >> > + shr $2, %rcx >> >> > +# endif >> >> > + inc %rcx >> >> > + test %rdx, %rdx >> >> > + jz L(ret) >> >> > + cmp %rdx, %rcx >> >> > + cmovb %rcx, %rdx >> >> > + >> >> > +L(partial_copy_second_vector): >> >> > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) >> >> > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %edx, %edx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_third_vector) >> >> > + >> >> > +L(ret): >> >> > + ret >> >> > + >> >> > +L(ret_vec_x3): >> >> > + PMOVMSK %VMM(2), %rax >> >> > + bsf %rax, %rcx >> >> > + /* Calculate return value. */ >> >> > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax >> >> > + sub %r8, %rax >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rax >> >> > + shr $2, %rcx >> >> > +# endif >> >> > + inc %rcx >> >> > + test %rdx, %rdx >> >> > + jz L(ret) >> >> > + cmp %rdx, %rcx >> >> > + cmovb %rcx, %rdx >> >> > + >> >> > +L(partial_copy_third_vector): >> >> > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) >> >> > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %edx, %edx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_fourth_vector) >> >> > + ret >> >> > + >> >> > +L(ret_vec_x4): >> >> > + PMOVMSK %VMM(2), %rax >> >> > + bsf %rax, %rcx >> >> > + /* Calculate return value. 
*/ >> >> > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax >> >> > + sub %r8, %rax >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rax >> >> > + shr $2, %rcx >> >> > +# endif >> >> > + inc %rcx >> >> > + test %rdx, %rdx >> >> > + jz L(ret) >> >> > + cmp %rdx, %rcx >> >> > + cmovb %rcx, %rdx >> >> > + >> >> > +L(partial_copy_fourth_vector): >> >> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) >> >> > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %edx, %edx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_fourth_vector) >> >> > + ret >> >> > + >> >> > +END (STRLCPY) >> >> >> >> Is strlcpy/strlcat integratable with existing strncat impl? Had >> >> figured they would >> >> fit in the same file. >> > >> > >> > Hi Noah, >> > >> > It may not be a good idea to put strlcpy/strlcat in the existing strncpy/strnat impl file, >> > as strlcpy/strlcat functions are associated with GLIBC_2.38 ABI. >> > >> Well, we can put the impl there and include it from another to manage >> any special >> link cases. > > > Due to ABI, none of strlcpy/strlcat changes can go in the glibc version earlier than 2.38, > to avoid any future strncpy backporting complications, it is better to keep them in separate > files for now. > I get that, but can't we just have an impl file that implements all the functions logic. It would only build strl* if its included to (similar to how strlen avx512 impl is currently setup). >> >> > --Sunil >> > >> >> >> >> > +#endif >> >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c >> >> > new file mode 100644 >> >> > index 0000000000..eee3b7b086 >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c >> >> > @@ -0,0 +1,25 @@ >> >> > +/* strlcpy generic. >> >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> >> > + This file is part of the GNU C Library. 
>> >> > + >> >> > + The GNU C Library is free software; you can redistribute it and/or >> >> > + modify it under the terms of the GNU Lesser General Public >> >> > + License as published by the Free Software Foundation; either >> >> > + version 2.1 of the License, or (at your option) any later version. >> >> > + >> >> > + The GNU C Library is distributed in the hope that it will be useful, >> >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >> > + Lesser General Public License for more details. >> >> > + >> >> > + You should have received a copy of the GNU Lesser General Public >> >> > + License along with the GNU C Library; if not, see >> >> > + <https://www.gnu.org/licenses/>. */ >> >> > + >> >> > + >> >> > +#include <isa-level.h> >> >> > +#if ISA_SHOULD_BUILD (1) >> >> > +# define __strlcpy __strlcpy_generic >> >> > +# include <string/strlcpy.c> >> >> > + >> >> > +#endif >> >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c >> >> > new file mode 100644 >> >> > index 0000000000..ded41fbcfb >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/strlcpy.c >> >> > @@ -0,0 +1,36 @@ >> >> > +/* Multiple versions of strlcpy. >> >> > + All versions must be listed in ifunc-impl-list.c. >> >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> >> > + This file is part of the GNU C Library. >> >> > + >> >> > + The GNU C Library is free software; you can redistribute it and/or >> >> > + modify it under the terms of the GNU Lesser General Public >> >> > + License as published by the Free Software Foundation; either >> >> > + version 2.1 of the License, or (at your option) any later version. >> >> > + >> >> > + The GNU C Library is distributed in the hope that it will be useful, >> >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU >> >> > + Lesser General Public License for more details. >> >> > + >> >> > + You should have received a copy of the GNU Lesser General Public >> >> > + License along with the GNU C Library; if not, see >> >> > + <https://www.gnu.org/licenses/>. */ >> >> > + >> >> > +/* Define multiple versions only for the definition in libc. */ >> >> > +#if IS_IN (libc) >> >> > +# define __strlcpy __redirect_strlcpy >> >> > +# include <string.h> >> >> > +# undef __strlcpy >> >> > + >> >> > +# define SYMBOL_NAME strlcpy >> >> > +# include "ifunc-strlcpy.h" >> >> > + >> >> > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ()); >> >> > +weak_alias (__strlcpy, strlcpy) >> >> > + >> >> > +# ifdef SHARED >> >> > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) >> >> > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); >> >> > +# endif >> >> > +#endif >> >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> >> > new file mode 100644 >> >> > index 0000000000..dafc20ded0 >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> >> > @@ -0,0 +1,4 @@ >> >> > +#define STRLCPY __wcslcpy_avx2 >> >> > +#define USE_AS_WCSLCPY 1 >> >> > + >> >> > +#include "strlcpy-avx2.S" >> >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c >> >> > new file mode 100644 >> >> > index 0000000000..ffd3c0e846 >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c >> >> > @@ -0,0 +1,25 @@ >> >> > +/* wcslcpy generic. >> >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> >> > + This file is part of the GNU C Library. 
>> >> > + >> >> > + The GNU C Library is free software; you can redistribute it and/or >> >> > + modify it under the terms of the GNU Lesser General Public >> >> > + License as published by the Free Software Foundation; either >> >> > + version 2.1 of the License, or (at your option) any later version. >> >> > + >> >> > + The GNU C Library is distributed in the hope that it will be useful, >> >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >> > + Lesser General Public License for more details. >> >> > + >> >> > + You should have received a copy of the GNU Lesser General Public >> >> > + License along with the GNU C Library; if not, see >> >> > + <https://www.gnu.org/licenses/>. */ >> >> > + >> >> > + >> >> > +#include <isa-level.h> >> >> > +#if ISA_SHOULD_BUILD (1) >> >> > +# define __wcslcpy __wcslcpy_generic >> >> > +# include <wcsmbs/wcslcpy.c> >> >> > + >> >> > +#endif >> >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c >> >> > new file mode 100644 >> >> > index 0000000000..371ef9626c >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c >> >> > @@ -0,0 +1,35 @@ >> >> > +/* Multiple versions of wcslcpy. >> >> > + All versions must be listed in ifunc-impl-list.c. >> >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> >> > + This file is part of the GNU C Library. >> >> > + >> >> > + The GNU C Library is free software; you can redistribute it and/or >> >> > + modify it under the terms of the GNU Lesser General Public >> >> > + License as published by the Free Software Foundation; either >> >> > + version 2.1 of the License, or (at your option) any later version. >> >> > + >> >> > + The GNU C Library is distributed in the hope that it will be useful, >> >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU >> >> > + Lesser General Public License for more details. >> >> > + >> >> > + You should have received a copy of the GNU Lesser General Public >> >> > + License along with the GNU C Library; if not, see >> >> > + <https://www.gnu.org/licenses/>. */ >> >> > + >> >> > +/* Define multiple versions only for the definition in libc. */ >> >> > +#if IS_IN (libc) >> >> > +# define __wcslcpy __redirect_wcslcpy >> >> > +# include <wchar.h> >> >> > +# undef __wcslcpy >> >> > + >> >> > +# define SYMBOL_NAME wcslcpy >> >> > +# include "ifunc-strlcpy.h" >> >> > + >> >> > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ()); >> >> > +weak_alias (__wcslcpy, wcslcpy) >> >> > +# ifdef SHARED >> >> > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) >> >> > + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); >> >> > +# endif >> >> > +#endif >> >> > -- >> >> > 2.38.1 >> >> > ^ permalink raw reply [flat|nested] 24+ messages in thread
end of thread, other threads:[~2023-07-04 7:45 UTC | newest] Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2023-06-30 20:48 [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function Sunil K Pandey 2023-06-30 21:04 ` Noah Goldstein 2023-06-30 21:27 ` Paul Eggert 2023-06-30 22:21 ` Sunil Pandey 2023-06-30 23:22 ` Noah Goldstein 2023-06-30 23:27 ` Noah Goldstein 2023-07-03 16:30 ` Paul Eggert 2023-07-03 18:40 ` Noah Goldstein 2023-07-03 18:54 ` Adhemerval Zanella Netto 2023-07-03 21:14 ` Paul Eggert 2023-07-03 22:04 ` Gabriel Ravier 2023-07-03 23:12 ` Paul Eggert 2023-07-04 7:45 ` Andreas Schwab 2023-07-03 12:55 ` Adhemerval Zanella Netto 2023-07-01 9:41 ` Florian Weimer 2023-07-02 1:22 ` Noah Goldstein 2023-07-02 6:51 ` Florian Weimer 2023-07-02 16:55 ` Noah Goldstein 2023-07-02 17:02 ` Florian Weimer 2023-07-02 17:03 ` Noah Goldstein 2023-07-02 18:37 ` Sunil Pandey 2023-07-02 18:54 ` Noah Goldstein 2023-07-03 1:03 ` Sunil Pandey 2023-07-03 1:47 ` Noah Goldstein
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).