On Wed, Jul 6, 2022 at 10:14 AM Noah Goldstein wrote: > > On Tue, Jul 5, 2022 at 7:05 PM Noah Goldstein wrote: > > > > On Tue, Jul 5, 2022 at 6:33 PM H.J. Lu wrote: > > > > > > On Tue, Jul 5, 2022 at 5:06 PM Noah Goldstein wrote: > > > > > > > > These functions all have optimized versions: > > > > __strncat_sse2_unaligned, __strncpy_sse2_unaligned, and > > > > stpncpy_sse2_unaligned which are faster than their respective generic > > > > implementations. Since the sse2 versions can run on baseline x86_64, > > > > > > Is this true on all x86-64 processors? > > > > That they are faster or that sse2 can run on baseline x86_64? Both cases are true on Zhaoxin processors. > > Assuming you mean faster: > > maybe not for large strings on a processor that heavily prefers aligned > access. > > The generics are implemented as: > > strncat = strnlen + strlen + memcpy > strncpy = strnlen + memset + memcpy > stpncpy = strnlen + memset + memcpy > > For very large strings on processors with SSSE3 > all of those functions can be implemented with only aligned > access in the loops. > > Not sure if it would work in favor of the generic implementations though as > they are essentially doing 2x + the number of memory accesses. > > mayshao do you have benchmark numbers on these functions? The results are attached. If there is any problem with the results, please feel free to let me know. Best Regards, May Shao > > > > > > > > > > > > we should use these as the baseline implementation and can remove the > > > > generic implementations. > > > > > > > > Geometric mean of N=20 runs of the entire benchmark suite on: > > > > 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz (Tigerlake) > > > > > > > > __strncat_sse2_unaligned / __strncat_generic: .944 > > > > __strncpy_sse2_unaligned / __strncpy_generic: .726 > > > > __stpncpy_sse2_unaligned / __stpncpy_generic: .650 > > > > > > > > Tested build with and without multiarch and full check with multiarch. > > > > --- > > > > sysdeps/x86_64/multiarch/Makefile | 3 -- > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 9 ++-- > > > > sysdeps/x86_64/multiarch/ifunc-strcpy.h | 8 +--- > > > > sysdeps/x86_64/multiarch/ifunc-strncpy.h | 48 ++++++++++++++++++++++ > > > > sysdeps/x86_64/multiarch/stpncpy-generic.c | 26 ------------ > > > > sysdeps/x86_64/multiarch/stpncpy.c | 3 +- > > > > sysdeps/x86_64/multiarch/strncat-generic.c | 21 ---------- > > > > sysdeps/x86_64/multiarch/strncat.c | 3 +- > > > > sysdeps/x86_64/multiarch/strncpy-generic.c | 24 ----------- > > > > sysdeps/x86_64/multiarch/strncpy.c | 3 +- > > > > 10 files changed, 56 insertions(+), 92 deletions(-) > > > > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strncpy.h > > > > delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-generic.c > > > > delete mode 100644 sysdeps/x86_64/multiarch/strncat-generic.c > > > > delete mode 100644 sysdeps/x86_64/multiarch/strncpy-generic.c > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > > > index 18cea04423..04e238efa0 100644 > > > > --- a/sysdeps/x86_64/multiarch/Makefile > > > > +++ b/sysdeps/x86_64/multiarch/Makefile > > > > @@ -45,7 +45,6 @@ sysdep_routines += \ > > > > stpcpy-sse2-unaligned \ > > > > stpncpy-avx2 \ > > > > stpncpy-avx2-rtm \ > > > > - stpncpy-generic \ > > > > stpncpy-evex \ > > > > stpncpy-sse2-unaligned \ > > > > strcasecmp_l-avx2 \ > > > > @@ -92,7 +91,6 @@ sysdep_routines += \ > > > > strncase_l-sse4_2 \ > > > > strncat-avx2 \ > > > > strncat-avx2-rtm \ > > > > - strncat-generic \ > > > > strncat-evex \ > > > > strncat-sse2-unaligned \ > > > > strncmp-avx2 \ > > > > @@ -102,7 +100,6 @@ sysdep_routines += \ > > > > strncmp-sse4_2 \ > > > > strncpy-avx2 \ > > > > strncpy-avx2-rtm \ > > > > - strncpy-generic \ > > > > strncpy-evex \ > > > > strncpy-sse2-unaligned \ > > > > strnlen-avx2 \ > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > index adf7d4bafd..2c96cb62d2 100644 > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > @@ -403,8 +403,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > && CPU_FEATURE_USABLE (AVX512BW)), > > > > __stpncpy_evex) > > > > IFUNC_IMPL_ADD (array, i, stpncpy, 1, > > > > - __stpncpy_sse2_unaligned) > > > > - IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic)) > > > > + __stpncpy_sse2_unaligned)) > > > > > > > > /* Support sysdeps/x86_64/multiarch/stpcpy.c. */ > > > > IFUNC_IMPL (i, name, stpcpy, > > > > @@ -618,8 +617,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > && CPU_FEATURE_USABLE (AVX512BW)), > > > > __strncat_evex) > > > > IFUNC_IMPL_ADD (array, i, strncat, 1, > > > > - __strncat_sse2_unaligned) > > > > - IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic)) > > > > + __strncat_sse2_unaligned)) > > > > > > > > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ > > > > IFUNC_IMPL (i, name, strncpy, > > > > @@ -634,8 +632,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > && CPU_FEATURE_USABLE (AVX512BW)), > > > > __strncpy_evex) > > > > IFUNC_IMPL_ADD (array, i, strncpy, 1, > > > > - __strncpy_sse2_unaligned) > > > > - IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic)) > > > > + __strncpy_sse2_unaligned)) > > > > > > > > /* Support sysdeps/x86_64/multiarch/strpbrk.c. */ > > > > IFUNC_IMPL (i, name, strpbrk, > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h > > > > index 80529458d1..a15afa44e9 100644 > > > > --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h > > > > @@ -20,11 +20,7 @@ > > > > > > > > #include > > > > > > > > -#ifndef GENERIC > > > > -# define GENERIC sse2 > > > > -#endif > > > > - > > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden; > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > > > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) > > > > attribute_hidden; > > > > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > > > > @@ -53,5 +49,5 @@ IFUNC_SELECTOR (void) > > > > if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) > > > > return OPTIMIZE (sse2_unaligned); > > > > > > > > - return OPTIMIZE (GENERIC); > > > > + return OPTIMIZE (sse2); > > > > } > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strncpy.h b/sysdeps/x86_64/multiarch/ifunc-strncpy.h > > > > new file mode 100644 > > > > index 0000000000..323225af4d > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strncpy.h > > > > @@ -0,0 +1,48 @@ > > > > +/* Common definition for ifunc st{r|p}n{cpy|cat} > > > > + All versions must be listed in ifunc-impl-list.c. > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > + This file is part of the GNU C Library. > > > > + > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > + modify it under the terms of the GNU Lesser General Public > > > > + License as published by the Free Software Foundation; either > > > > + version 2.1 of the License, or (at your option) any later version. > > > > + > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > + Lesser General Public License for more details. > > > > + > > > > + You should have received a copy of the GNU Lesser General Public > > > > + License along with the GNU C Library; if not, see > > > > + . */ > > > > + > > > > +#include > > > > + > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) > > > > + attribute_hidden; > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; > > > > + > > > > +static inline void * > > > > +IFUNC_SELECTOR (void) > > > > +{ > > > > + const struct cpu_features* cpu_features = __get_cpu_features (); > > > > + > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) > > > > + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) > > > > + { > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > > > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) > > > > + return OPTIMIZE (evex); > > > > + > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > > > + return OPTIMIZE (avx2_rtm); > > > > + > > > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) > > > > + return OPTIMIZE (avx2); > > > > + } > > > > + > > > > + return OPTIMIZE (sse2_unaligned); > > > > +} > > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-generic.c b/sysdeps/x86_64/multiarch/stpncpy-generic.c > > > > deleted file mode 100644 > > > > index 87826845b0..0000000000 > > > > --- a/sysdeps/x86_64/multiarch/stpncpy-generic.c > > > > +++ /dev/null > > > > @@ -1,26 +0,0 @@ > > > > -/* stpncpy. > > > > - Copyright (C) 2022 Free Software Foundation, Inc. > > > > - This file is part of the GNU C Library. > > > > - > > > > - The GNU C Library is free software; you can redistribute it and/or > > > > - modify it under the terms of the GNU Lesser General Public > > > > - License as published by the Free Software Foundation; either > > > > - version 2.1 of the License, or (at your option) any later version. > > > > - > > > > - The GNU C Library is distributed in the hope that it will be useful, > > > > - but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > - Lesser General Public License for more details. > > > > - > > > > - You should have received a copy of the GNU Lesser General Public > > > > - License along with the GNU C Library; if not, see > > > > - . */ > > > > - > > > > - > > > > -#define STPNCPY __stpncpy_generic > > > > -#undef weak_alias > > > > -#define weak_alias(ignored1, ignored2) > > > > -#undef libc_hidden_def > > > > -#define libc_hidden_def(stpncpy) > > > > - > > > > -#include > > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c > > > > index 879bc83f0b..a8d083ff0d 100644 > > > > --- a/sysdeps/x86_64/multiarch/stpncpy.c > > > > +++ b/sysdeps/x86_64/multiarch/stpncpy.c > > > > @@ -25,9 +25,8 @@ > > > > # undef stpncpy > > > > # undef __stpncpy > > > > > > > > -# define GENERIC generic > > > > # define SYMBOL_NAME stpncpy > > > > -# include "ifunc-strcpy.h" > > > > +# include "ifunc-strncpy.h" > > > > > > > > libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ()); > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strncat-generic.c b/sysdeps/x86_64/multiarch/strncat-generic.c > > > > deleted file mode 100644 > > > > index 0090669cd1..0000000000 > > > > --- a/sysdeps/x86_64/multiarch/strncat-generic.c > > > > +++ /dev/null > > > > @@ -1,21 +0,0 @@ > > > > -/* strncat. > > > > - Copyright (C) 2022 Free Software Foundation, Inc. > > > > - This file is part of the GNU C Library. > > > > - > > > > - The GNU C Library is free software; you can redistribute it and/or > > > > - modify it under the terms of the GNU Lesser General Public > > > > - License as published by the Free Software Foundation; either > > > > - version 2.1 of the License, or (at your option) any later version. > > > > - > > > > - The GNU C Library is distributed in the hope that it will be useful, > > > > - but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > - Lesser General Public License for more details. > > > > - > > > > - You should have received a copy of the GNU Lesser General Public > > > > - License along with the GNU C Library; if not, see > > > > - . */ > > > > - > > > > - > > > > -#define STRNCAT __strncat_generic > > > > -#include > > > > diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c > > > > index 50fba8a41f..a590c25d51 100644 > > > > --- a/sysdeps/x86_64/multiarch/strncat.c > > > > +++ b/sysdeps/x86_64/multiarch/strncat.c > > > > @@ -24,8 +24,7 @@ > > > > # undef strncat > > > > > > > > # define SYMBOL_NAME strncat > > > > -# define GENERIC generic > > > > -# include "ifunc-strcpy.h" > > > > +# include "ifunc-strncpy.h" > > > > > > > > libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ()); > > > > strong_alias (strncat, __strncat); > > > > diff --git a/sysdeps/x86_64/multiarch/strncpy-generic.c b/sysdeps/x86_64/multiarch/strncpy-generic.c > > > > deleted file mode 100644 > > > > index 9916153dd5..0000000000 > > > > --- a/sysdeps/x86_64/multiarch/strncpy-generic.c > > > > +++ /dev/null > > > > @@ -1,24 +0,0 @@ > > > > -/* strncpy. > > > > - Copyright (C) 2022 Free Software Foundation, Inc. > > > > - This file is part of the GNU C Library. > > > > - > > > > - The GNU C Library is free software; you can redistribute it and/or > > > > - modify it under the terms of the GNU Lesser General Public > > > > - License as published by the Free Software Foundation; either > > > > - version 2.1 of the License, or (at your option) any later version. > > > > - > > > > - The GNU C Library is distributed in the hope that it will be useful, > > > > - but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > - Lesser General Public License for more details. > > > > - > > > > - You should have received a copy of the GNU Lesser General Public > > > > - License along with the GNU C Library; if not, see > > > > - . */ > > > > - > > > > - > > > > -#define STRNCPY __strncpy_generic > > > > -#undef libc_hidden_builtin_def > > > > -#define libc_hidden_builtin_def(strncpy) > > > > - > > > > -#include > > > > diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c > > > > index 7fc7d72ec5..c83440f0e3 100644 > > > > --- a/sysdeps/x86_64/multiarch/strncpy.c > > > > +++ b/sysdeps/x86_64/multiarch/strncpy.c > > > > @@ -24,8 +24,7 @@ > > > > # undef strncpy > > > > > > > > # define SYMBOL_NAME strncpy > > > > -# define GENERIC generic > > > > -# include "ifunc-strcpy.h" > > > > +# include "ifunc-strncpy.h" > > > > > > > > libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ()); > > > > > > > > -- > > > > 2.34.1 > > > > > > > > > > > > > -- > > > H.J. >