public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Noah Goldstein <goldstein.w.n@gmail.com>
To: "H.J. Lu" <hjl.tools@gmail.com>
Cc: GNU C Library <libc-alpha@sourceware.org>,
	"Carlos O'Donell" <carlos@systemhalted.org>
Subject: Re: [PATCH v2] x86: Rename generic functions with unique postfix for clarity
Date: Thu, 9 Jun 2022 18:26:57 -0700	[thread overview]
Message-ID: <CAFUsyf+Vc-ED2UhRcGE7WxxtS-OYL6+U9sBV-DKHo9+e9QgEgw@mail.gmail.com> (raw)
In-Reply-To: <CAMe9rOpcws-9d=getSo7itF-puA0sVxSQpWg3LuLQb2ZDu8sRQ@mail.gmail.com>

On Thu, Jun 9, 2022 at 6:20 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Jun 9, 2022 at 5:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > No functions are changed. It just renames generic implementations from
> > '{func}_sse2' to '{func}_generic'. This is just because the postfix
> > "_sse2" was overloaded and was used for files that had hand-optimized
> > sse2 assembly implementations and files that just redirected back
> > to the generic implementation.
>
> This change isn't small and its benefit is very small.  Can it be part of
> a larger change to support building glibc with
>
> -march=x86-64-vN

kk
>
> > Full xcheck passed on x86_64.
> > ---
> >  sysdeps/x86_64/multiarch/Makefile             |  15 +-
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  16 +-
> >  sysdeps/x86_64/multiarch/ifunc-sse4_2.h       |   4 +-
> >  sysdeps/x86_64/multiarch/ifunc-strcpy.h       |   8 +-
> >  sysdeps/x86_64/multiarch/ifunc-wcslen.h       |   8 +-
> >  sysdeps/x86_64/multiarch/stpncpy-c.c          |   2 +-
> >  sysdeps/x86_64/multiarch/stpncpy.c            |   1 +
> >  sysdeps/x86_64/multiarch/strcspn-c-sse4.c     | 163 ++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strcspn-c.c          | 151 +---------------
> >  sysdeps/x86_64/multiarch/strcspn-sse2.c       |  28 ---
> >  sysdeps/x86_64/multiarch/strncat-c.c          |   2 +-
> >  sysdeps/x86_64/multiarch/strncat.c            |   1 +
> >  sysdeps/x86_64/multiarch/strncpy-c.c          |   2 +-
> >  sysdeps/x86_64/multiarch/strncpy.c            |   1 +
> >  .../{strspn-sse2.c => strpbrk-c-sse4.c}       |  18 +-
> >  sysdeps/x86_64/multiarch/strpbrk-c.c          |  18 +-
> >  sysdeps/x86_64/multiarch/strpbrk-sse2.c       |  28 ---
> >  sysdeps/x86_64/multiarch/strspn-c-sse4.c      | 136 +++++++++++++++
> >  sysdeps/x86_64/multiarch/strspn-c.c           | 126 +-------------
> >  sysdeps/x86_64/multiarch/wcscpy-c.c           |   2 +-
> >  sysdeps/x86_64/multiarch/wcscpy.c             |   4 +-
> >  sysdeps/x86_64/multiarch/wcsnlen-c.c          |   4 +-
> >  sysdeps/x86_64/multiarch/wcsnlen.c            |   1 +
> >  23 files changed, 376 insertions(+), 363 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/strcspn-c-sse4.c
> >  delete mode 100644 sysdeps/x86_64/multiarch/strcspn-sse2.c
> >  rename sysdeps/x86_64/multiarch/{strspn-sse2.c => strpbrk-c-sse4.c} (74%)
> >  delete mode 100644 sysdeps/x86_64/multiarch/strpbrk-sse2.c
> >  create mode 100644 sysdeps/x86_64/multiarch/strspn-c-sse4.c
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index 3d153cac35..86c6ecdfc1 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -77,7 +77,7 @@ sysdep_routines += \
> >    strcpy-sse2 \
> >    strcpy-sse2-unaligned \
> >    strcspn-c \
> > -  strcspn-sse2 \
> > +  strcspn-c-sse4 \
> >    strlen-avx2 \
> >    strlen-avx2-rtm \
> >    strlen-evex \
> > @@ -109,21 +109,22 @@ sysdep_routines += \
> >    strnlen-evex512 \
> >    strnlen-sse2 \
> >    strpbrk-c \
> > -  strpbrk-sse2 \
> > +  strpbrk-c-sse4 \
> >    strrchr-avx2 \
> >    strrchr-avx2-rtm \
> >    strrchr-evex \
> >    strrchr-sse2 \
> >    strspn-c \
> > -  strspn-sse2 \
> > +  strspn-c-sse4 \
> >    strstr-avx512 \
> >    strstr-sse2-unaligned \
> >    varshift \
> >  # sysdep_routines
> > -CFLAGS-varshift.c += -msse4
> > -CFLAGS-strcspn-c.c += -msse4
> > -CFLAGS-strpbrk-c.c += -msse4
> > -CFLAGS-strspn-c.c += -msse4
> > +
> > +CFLAGS-strcspn-c-sse4.c += -msse4
> > +CFLAGS-strpbrk-c-sse4.c += -msse4
> > +CFLAGS-strspn-c-sse4.c += -msse4
> > +
> >  CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
> >  endif
> >
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 58f3ec8306..4cbd200d39 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -372,7 +372,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                               __stpncpy_evex)
> >               IFUNC_IMPL_ADD (array, i, stpncpy, 1,
> >                               __stpncpy_sse2_unaligned)
> > -             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
> > +             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
> >
> >    /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
> >    IFUNC_IMPL (i, name, stpcpy,
> > @@ -531,7 +531,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >    IFUNC_IMPL (i, name, strcspn,
> >               IFUNC_IMPL_ADD (array, i, strcspn, CPU_FEATURE_USABLE (SSE4_2),
> >                               __strcspn_sse42)
> > -             IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2))
> > +             IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_generic))
> >
> >    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
> >    IFUNC_IMPL (i, name, strncasecmp,
> > @@ -585,7 +585,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                               __strncat_evex)
> >               IFUNC_IMPL_ADD (array, i, strncat, 1,
> >                               __strncat_sse2_unaligned)
> > -             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
> > +             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
> >
> >    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
> >    IFUNC_IMPL (i, name, strncpy,
> > @@ -601,20 +601,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                               __strncpy_evex)
> >               IFUNC_IMPL_ADD (array, i, strncpy, 1,
> >                               __strncpy_sse2_unaligned)
> > -             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
> > +             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
> >
> >    /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
> >    IFUNC_IMPL (i, name, strpbrk,
> >               IFUNC_IMPL_ADD (array, i, strpbrk, CPU_FEATURE_USABLE (SSE4_2),
> >                               __strpbrk_sse42)
> > -             IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
> > +             IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_generic))
> >
> >
> >    /* Support sysdeps/x86_64/multiarch/strspn.c.  */
> >    IFUNC_IMPL (i, name, strspn,
> >               IFUNC_IMPL_ADD (array, i, strspn, CPU_FEATURE_USABLE (SSE4_2),
> >                               __strspn_sse42)
> > -             IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2))
> > +             IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_generic))
> >
> >    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
> >    IFUNC_IMPL (i, name, strstr,
> > @@ -697,7 +697,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >    IFUNC_IMPL (i, name, wcscpy,
> >               IFUNC_IMPL_ADD (array, i, wcscpy, CPU_FEATURE_USABLE (SSSE3),
> >                               __wcscpy_ssse3)
> > -             IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
> > +             IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_generic))
> >
> >    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
> >    IFUNC_IMPL (i, name, wcslen,
> > @@ -749,7 +749,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >               IFUNC_IMPL_ADD (array, i, wcsnlen,
> >                               CPU_FEATURE_USABLE (SSE4_1),
> >                               __wcsnlen_sse4_1)
> > -             IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2))
> > +             IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_generic))
> >
> >    /* Support sysdeps/x86_64/multiarch/wmemchr.c.  */
> >    IFUNC_IMPL (i, name, wmemchr,
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h
> > index b555ff2fac..ee36525bcf 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h
> > @@ -19,7 +19,7 @@
> >
> >  #include <init-arch.h>
> >
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> >
> >  static inline void *
> > @@ -30,5 +30,5 @@ IFUNC_SELECTOR (void)
> >    if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2))
> >      return OPTIMIZE (sse42);
> >
> > -  return OPTIMIZE (sse2);
> > +  return OPTIMIZE (generic);
> >  }
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > index a15afa44e9..80529458d1 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > @@ -20,7 +20,11 @@
> >
> >  #include <init-arch.h>
> >
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > +#ifndef GENERIC
> > +# define GENERIC sse2
> > +#endif
> > +
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> >    attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > @@ -49,5 +53,5 @@ IFUNC_SELECTOR (void)
> >    if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> >      return OPTIMIZE (sse2_unaligned);
> >
> > -  return OPTIMIZE (sse2);
> > +  return OPTIMIZE (GENERIC);
> >  }
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
> > index 2b29e7608a..88c1c502af 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-wcslen.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
> > @@ -19,7 +19,11 @@
> >
> >  #include <init-arch.h>
> >
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > +#ifndef GENERIC
> > +# define GENERIC sse2
> > +#endif
> > +
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > @@ -48,5 +52,5 @@ IFUNC_SELECTOR (void)
> >    if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
> >      return OPTIMIZE (sse4_1);
> >
> > -  return OPTIMIZE (sse2);
> > +  return OPTIMIZE (GENERIC);
> >  }
> > diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c
> > index b016e487e1..eb62fcf388 100644
> > --- a/sysdeps/x86_64/multiarch/stpncpy-c.c
> > +++ b/sysdeps/x86_64/multiarch/stpncpy-c.c
> > @@ -1,4 +1,4 @@
> > -#define STPNCPY __stpncpy_sse2
> > +#define STPNCPY __stpncpy_generic
> >  #undef weak_alias
> >  #define weak_alias(ignored1, ignored2)
> >  #undef libc_hidden_def
> > diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
> > index 82fa53957d..879bc83f0b 100644
> > --- a/sysdeps/x86_64/multiarch/stpncpy.c
> > +++ b/sysdeps/x86_64/multiarch/stpncpy.c
> > @@ -25,6 +25,7 @@
> >  # undef stpncpy
> >  # undef __stpncpy
> >
> > +# define GENERIC generic
> >  # define SYMBOL_NAME stpncpy
> >  # include "ifunc-strcpy.h"
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcspn-c-sse4.c b/sysdeps/x86_64/multiarch/strcspn-c-sse4.c
> > new file mode 100644
> > index 0000000000..59f64f9fe8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strcspn-c-sse4.c
> > @@ -0,0 +1,163 @@
> > +/* strcspn with SSE4.2 intrinsics
> > +   Copyright (C) 2009-2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <nmmintrin.h>
> > +#include <string.h>
> > +#include "varshift.h"
> > +
> > +/* We use 0x2:
> > +       _SIDD_SBYTE_OPS
> > +       | _SIDD_CMP_EQUAL_ANY
> > +       | _SIDD_POSITIVE_POLARITY
> > +       | _SIDD_LEAST_SIGNIFICANT
> > +   on pcmpistri to compare xmm/mem128
> > +
> > +   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > +   X X X X X X X X X X X X X X X X
> > +
> > +   against xmm
> > +
> > +   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > +   A A A A A A A A A A A A A A A A
> > +
> > +   to find out if the first 16byte data element has any byte A and
> > +   the offset of the first byte.  There are 3 cases:
> > +
> > +   1. The first 16byte data element has the byte A at the offset X.
> > +   2. The first 16byte data element has EOS and doesn't have the byte A.
> > +   3. The first 16byte data element is valid and doesn't have the byte A.
> > +
> > +   Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:
> > +   case                ECX     CFlag   ZFlag   SFlag
> > +    1           X        1      0/1      0
> > +    2          16        0       1       0
> > +    3          16        0       0       0
> > +
> > +   We exit from the loop for cases 1 and 2 with jbe which branches
> > +   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
> > +   X for case 1.  */
> > +
> > +#ifndef STRCSPN_GENERIC
> > +# define STRCSPN_GENERIC __strcspn_generic
> > +# define STRCSPN_SSE42 __strcspn_sse42
> > +#endif
> > +
> > +#ifdef USE_AS_STRPBRK
> > +# define RETURN(val1, val2) return val1
> > +#else
> > +# define RETURN(val1, val2) return val2
> > +#endif
> > +
> > +extern
> > +#ifdef USE_AS_STRPBRK
> > +char *
> > +#else
> > +size_t
> > +#endif
> > +STRCSPN_GENERIC (const char *, const char *) attribute_hidden;
> > +
> > +
> > +#ifdef USE_AS_STRPBRK
> > +char *
> > +#else
> > +size_t
> > +#endif
> > +__attribute__ ((section (".text.sse4.2")))
> > +STRCSPN_SSE42 (const char *s, const char *a)
> > +{
> > +  if (*a == 0)
> > +    RETURN (NULL, strlen (s));
> > +
> > +  const char *aligned;
> > +  __m128i mask, maskz, zero;
> > +  unsigned int maskz_bits;
> > +  unsigned int offset = (unsigned int) ((size_t) a & 15);
> > +  zero = _mm_set1_epi8 (0);
> > +  if (offset != 0)
> > +    {
> > +      /* Load masks.  */
> > +      aligned = (const char *) ((size_t) a & -16L);
> > +      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > +      maskz = _mm_cmpeq_epi8 (mask0, zero);
> > +
> > +      /* Find where the NULL terminator is.  */
> > +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > +      if (maskz_bits != 0)
> > +        {
> > +          mask = __m128i_shift_right (mask0, offset);
> > +          offset = (unsigned int) ((size_t) s & 15);
> > +          if (offset)
> > +            goto start_unaligned;
> > +
> > +          aligned = s;
> > +          goto start_loop;
> > +        }
> > +    }
> > +
> > +  /* A is aligned, or the rest of its first chunk has no terminator.  */
> > +  mask = _mm_loadu_si128 ((__m128i *) a);
> > +  /* Find where the NULL terminator is.  */
> > +  maskz = _mm_cmpeq_epi8 (mask, zero);
> > +  maskz_bits = _mm_movemask_epi8 (maskz);
> > +  if (maskz_bits == 0)
> > +    {
> > +      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> > +         of A > 16.  */
> > +      if (a[16] != 0)
> > +        return STRCSPN_GENERIC (s, a);
> > +    }
> > +
> > +  aligned = s;
> > +  offset = (unsigned int) ((size_t) s & 15);
> > +  if (offset != 0)
> > +    {
> > +    start_unaligned:
> > +      /* Check partial string.  */
> > +      aligned = (const char *) ((size_t) s & -16L);
> > +      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > +
> > +      value = __m128i_shift_right (value, offset);
> > +
> > +      unsigned int length = _mm_cmpistri (mask, value, 0x2);
> > +      /* No need to check ZFlag since ZFlag is always 1.  */
> > +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> > +      if (cflag)
> > +       RETURN ((char *) (s + length), length);
> > +      /* Find where the NULL terminator is.  */
> > +      unsigned int index = _mm_cmpistri (value, value, 0x3a);
> > +      if (index < 16 - offset)
> > +       RETURN (NULL, index);
> > +      aligned += 16;
> > +    }
> > +
> > +start_loop:
> > +  while (1)
> > +    {
> > +      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > +      unsigned int index = _mm_cmpistri (mask, value, 0x2);
> > +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> > +      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
> > +      if (cflag)
> > +       RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
> > +      if (zflag)
> > +       RETURN (NULL,
> > +               /* Find where the NULL terminator is.  */
> > +               (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
> > +      aligned += 16;
> > +    }
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
> > index c312fab8b1..423de2e2b2 100644
> > --- a/sysdeps/x86_64/multiarch/strcspn-c.c
> > +++ b/sysdeps/x86_64/multiarch/strcspn-c.c
> > @@ -1,5 +1,5 @@
> > -/* strcspn with SSE4.2 intrinsics
> > -   Copyright (C) 2009-2022 Free Software Foundation, Inc.
> > +/* strcspn.
> > +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> >     The GNU C Library is free software; you can redistribute it and/or
> > @@ -16,148 +16,13 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#include <nmmintrin.h>
> > -#include <string.h>
> > -#include "varshift.h"
> > +#if IS_IN (libc)
> >
> > -/* We use 0x2:
> > -       _SIDD_SBYTE_OPS
> > -       | _SIDD_CMP_EQUAL_ANY
> > -       | _SIDD_POSITIVE_POLARITY
> > -       | _SIDD_LEAST_SIGNIFICANT
> > -   on pcmpistri to compare xmm/mem128
> > +# include <sysdep.h>
> > +# define STRCSPN __strcspn_generic
> >
> > -   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > -   X X X X X X X X X X X X X X X X
> > -
> > -   against xmm
> > -
> > -   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > -   A A A A A A A A A A A A A A A A
> > -
> > -   to find out if the first 16byte data element has any byte A and
> > -   the offset of the first byte.  There are 3 cases:
> > -
> > -   1. The first 16byte data element has the byte A at the offset X.
> > -   2. The first 16byte data element has EOS and doesn't have the byte A.
> > -   3. The first 16byte data element is valid and doesn't have the byte A.
> > -
> > -   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
> > -
> > -    1           X        1      0/1      0
> > -    2          16        0       1       0
> > -    3          16        0       0       0
> > -
> > -   We exit from the loop for cases 1 and 2 with jbe which branches
> > -   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
> > -   X for case 1.  */
> > -
> > -#ifndef STRCSPN_SSE2
> > -# define STRCSPN_SSE2 __strcspn_sse2
> > -# define STRCSPN_SSE42 __strcspn_sse42
> > -#endif
> > -
> > -#ifdef USE_AS_STRPBRK
> > -# define RETURN(val1, val2) return val1
> > -#else
> > -# define RETURN(val1, val2) return val2
> > -#endif
> > -
> > -extern
> > -#ifdef USE_AS_STRPBRK
> > -char *
> > -#else
> > -size_t
> > -#endif
> > -STRCSPN_SSE2 (const char *, const char *) attribute_hidden;
> > -
> > -
> > -#ifdef USE_AS_STRPBRK
> > -char *
> > -#else
> > -size_t
> > +# undef libc_hidden_builtin_def
> > +# define libc_hidden_builtin_def(STRCSPN)
> >  #endif
> > -__attribute__ ((section (".text.sse4.2")))
> > -STRCSPN_SSE42 (const char *s, const char *a)
> > -{
> > -  if (*a == 0)
> > -    RETURN (NULL, strlen (s));
> > -
> > -  const char *aligned;
> > -  __m128i mask, maskz, zero;
> > -  unsigned int maskz_bits;
> > -  unsigned int offset = (unsigned int) ((size_t) a & 15);
> > -  zero = _mm_set1_epi8 (0);
> > -  if (offset != 0)
> > -    {
> > -      /* Load masks.  */
> > -      aligned = (const char *) ((size_t) a & -16L);
> > -      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > -      maskz = _mm_cmpeq_epi8 (mask0, zero);
> > -
> > -      /* Find where the NULL terminator is.  */
> > -      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > -      if (maskz_bits != 0)
> > -        {
> > -          mask = __m128i_shift_right (mask0, offset);
> > -          offset = (unsigned int) ((size_t) s & 15);
> > -          if (offset)
> > -            goto start_unaligned;
> > -
> > -          aligned = s;
> > -          goto start_loop;
> > -        }
> > -    }
> > -
> > -  /* A is aligned.  */
> > -  mask = _mm_loadu_si128 ((__m128i *) a);
> > -  /* Find where the NULL terminator is.  */
> > -  maskz = _mm_cmpeq_epi8 (mask, zero);
> > -  maskz_bits = _mm_movemask_epi8 (maskz);
> > -  if (maskz_bits == 0)
> > -    {
> > -      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> > -         of A > 16.  */
> > -      if (a[16] != 0)
> > -        return STRCSPN_SSE2 (s, a);
> > -    }
> > -
> > -  aligned = s;
> > -  offset = (unsigned int) ((size_t) s & 15);
> > -  if (offset != 0)
> > -    {
> > -    start_unaligned:
> > -      /* Check partial string.  */
> > -      aligned = (const char *) ((size_t) s & -16L);
> > -      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > -
> > -      value = __m128i_shift_right (value, offset);
> > -
> > -      unsigned int length = _mm_cmpistri (mask, value, 0x2);
> > -      /* No need to check ZFlag since ZFlag is always 1.  */
> > -      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> > -      if (cflag)
> > -       RETURN ((char *) (s + length), length);
> > -      /* Find where the NULL terminator is.  */
> > -      unsigned int index = _mm_cmpistri (value, value, 0x3a);
> > -      if (index < 16 - offset)
> > -       RETURN (NULL, index);
> > -      aligned += 16;
> > -    }
> >
> > -start_loop:
> > -  while (1)
> > -    {
> > -      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > -      unsigned int index = _mm_cmpistri (mask, value, 0x2);
> > -      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> > -      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
> > -      if (cflag)
> > -       RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
> > -      if (zflag)
> > -       RETURN (NULL,
> > -               /* Find where the NULL terminator is.  */
> > -               (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
> > -      aligned += 16;
> > -    }
> > -}
> > +#include <string/strcspn.c>
> > diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.c b/sysdeps/x86_64/multiarch/strcspn-sse2.c
> > deleted file mode 100644
> > index 3a04bb39fc..0000000000
> > --- a/sysdeps/x86_64/multiarch/strcspn-sse2.c
> > +++ /dev/null
> > @@ -1,28 +0,0 @@
> > -/* strcspn.
> > -   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -#if IS_IN (libc)
> > -
> > -# include <sysdep.h>
> > -# define STRCSPN __strcspn_sse2
> > -
> > -# undef libc_hidden_builtin_def
> > -# define libc_hidden_builtin_def(STRCSPN)
> > -#endif
> > -
> > -#include <string/strcspn.c>
> > diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c
> > index 93a7fab7ea..b729c033d9 100644
> > --- a/sysdeps/x86_64/multiarch/strncat-c.c
> > +++ b/sysdeps/x86_64/multiarch/strncat-c.c
> > @@ -1,2 +1,2 @@
> > -#define STRNCAT __strncat_sse2
> > +#define STRNCAT __strncat_generic
> >  #include <string/strncat.c>
> > diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
> > index b649343a97..50fba8a41f 100644
> > --- a/sysdeps/x86_64/multiarch/strncat.c
> > +++ b/sysdeps/x86_64/multiarch/strncat.c
> > @@ -24,6 +24,7 @@
> >  # undef strncat
> >
> >  # define SYMBOL_NAME strncat
> > +# define GENERIC generic
> >  # include "ifunc-strcpy.h"
> >
> >  libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
> > diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c
> > index 57c45ac7ab..183b0b8e0f 100644
> > --- a/sysdeps/x86_64/multiarch/strncpy-c.c
> > +++ b/sysdeps/x86_64/multiarch/strncpy-c.c
> > @@ -1,4 +1,4 @@
> > -#define STRNCPY __strncpy_sse2
> > +#define STRNCPY __strncpy_generic
> >  #undef libc_hidden_builtin_def
> >  #define libc_hidden_builtin_def(strncpy)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
> > index 2a780a7e16..7fc7d72ec5 100644
> > --- a/sysdeps/x86_64/multiarch/strncpy.c
> > +++ b/sysdeps/x86_64/multiarch/strncpy.c
> > @@ -24,6 +24,7 @@
> >  # undef strncpy
> >
> >  # define SYMBOL_NAME strncpy
> > +# define GENERIC generic
> >  # include "ifunc-strcpy.h"
> >
> >  libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
> > diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-c-sse4.c
> > similarity index 74%
> > rename from sysdeps/x86_64/multiarch/strspn-sse2.c
> > rename to sysdeps/x86_64/multiarch/strpbrk-c-sse4.c
> > index 61cc6cb0a5..8700276773 100644
> > --- a/sysdeps/x86_64/multiarch/strspn-sse2.c
> > +++ b/sysdeps/x86_64/multiarch/strpbrk-c-sse4.c
> > @@ -1,5 +1,5 @@
> > -/* strspn.
> > -   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > +/* strpbrk with SSE4.2 intrinsics
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> >     The GNU C Library is free software; you can redistribute it and/or
> > @@ -16,13 +16,7 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#if IS_IN (libc)
> > -
> > -# include <sysdep.h>
> > -# define STRSPN __strspn_sse2
> > -
> > -# undef libc_hidden_builtin_def
> > -# define libc_hidden_builtin_def(STRSPN)
> > -#endif
> > -
> > -#include <string/strspn.c>
> > +#define USE_AS_STRPBRK
> > +#define STRCSPN_GENERIC __strpbrk_generic
> > +#define STRCSPN_SSE42 __strpbrk_sse42
> > +#include "strcspn-c-sse4.c"
> > diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c
> > index abf4ff7f1a..d31acfe495 100644
> > --- a/sysdeps/x86_64/multiarch/strpbrk-c.c
> > +++ b/sysdeps/x86_64/multiarch/strpbrk-c.c
> > @@ -1,5 +1,5 @@
> > -/* strpbrk with SSE4.2 intrinsics
> > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > +/* strpbrk.
> > +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> >     The GNU C Library is free software; you can redistribute it and/or
> > @@ -16,7 +16,13 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#define USE_AS_STRPBRK
> > -#define STRCSPN_SSE2 __strpbrk_sse2
> > -#define STRCSPN_SSE42 __strpbrk_sse42
> > -#include "strcspn-c.c"
> > +#if IS_IN (libc)
> > +
> > +# include <sysdep.h>
> > +# define STRPBRK __strpbrk_generic
> > +
> > +# undef libc_hidden_builtin_def
> > +# define libc_hidden_builtin_def(STRPBRK)
> > +#endif
> > +
> > +#include <string/strpbrk.c>
> > diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
> > deleted file mode 100644
> > index d03214c4fb..0000000000
> > --- a/sysdeps/x86_64/multiarch/strpbrk-sse2.c
> > +++ /dev/null
> > @@ -1,28 +0,0 @@
> > -/* strpbrk.
> > -   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -#if IS_IN (libc)
> > -
> > -# include <sysdep.h>
> > -# define STRPBRK __strpbrk_sse2
> > -
> > -# undef libc_hidden_builtin_def
> > -# define libc_hidden_builtin_def(STRPBRK)
> > -#endif
> > -
> > -#include <string/strpbrk.c>
> > diff --git a/sysdeps/x86_64/multiarch/strspn-c-sse4.c b/sysdeps/x86_64/multiarch/strspn-c-sse4.c
> > new file mode 100644
> > index 0000000000..d044916688
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strspn-c-sse4.c
> > @@ -0,0 +1,136 @@
> > +/* strspn with SSE4.2 intrinsics
> > +   Copyright (C) 2009-2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <nmmintrin.h>
> > +#include <string.h>
> > +#include "varshift.h"
> > +
> > +/* We use 0x12:
> > +       _SIDD_SBYTE_OPS
> > +       | _SIDD_CMP_EQUAL_ANY
> > +       | _SIDD_NEGATIVE_POLARITY
> > +       | _SIDD_LEAST_SIGNIFICANT
> > +   on pcmpistri to compare xmm/mem128
> > +
> > +   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > +   X X X X X X X X X X X X X X X X
> > +
> > +   against xmm
> > +
> > +   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > +   A A A A A A A A A A A A A A A A
> > +
> > +   to find out if the first 16byte data element has any non-A byte and
> > +   the offset of the first byte.  There are 2 cases:
> > +
> > +   1. The first 16byte data element has the non-A byte, including
> > +      EOS, at the offset X.
> > +   2. The first 16byte data element is valid and doesn't have the non-A
> > +      byte.
> > +
> > +   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
> > +
> > +   case                ECX     CFlag   ZFlag   SFlag
> > +    1           X        1      0/1      0
> > +    2          16        0       0       0
> > +
> > +   We exit from the loop for case 1.  */
> > +
> > +extern size_t __strspn_generic (const char *, const char *) attribute_hidden;
> > +
> > +
> > +size_t
> > +__attribute__ ((section (".text.sse4.2")))
> > +__strspn_sse42 (const char *s, const char *a)
> > +{
> > +  if (*a == 0)
> > +    return 0;
> > +
> > +  const char *aligned;
> > +  __m128i mask, maskz, zero;
> > +  unsigned int maskz_bits;
> > +  unsigned int offset = (int) ((size_t) a & 15);
> > +  zero = _mm_set1_epi8 (0);
> > +  if (offset != 0)
> > +    {
> > +      /* Load masks.  */
> > +      aligned = (const char *) ((size_t) a & -16L);
> > +      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > +      maskz = _mm_cmpeq_epi8 (mask0, zero);
> > +
> > +      /* Find where the NULL terminator is.  */
> > +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > +      if (maskz_bits != 0)
> > +        {
> > +          mask = __m128i_shift_right (mask0, offset);
> > +          offset = (unsigned int) ((size_t) s & 15);
> > +          if (offset)
> > +            goto start_unaligned;
> > +
> > +          aligned = s;
> > +          goto start_loop;
> > +        }
> > +    }
> > +
> > +  /* A is aligned.  */
> > +  mask = _mm_loadu_si128 ((__m128i *) a);
> > +
> > +  /* Find where the NULL terminator is.  */
> > +  maskz = _mm_cmpeq_epi8 (mask, zero);
> > +  maskz_bits = _mm_movemask_epi8 (maskz);
> > +  if (maskz_bits == 0)
> > +    {
> > +      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> > +         of A > 16.  */
> > +      if (a[16] != 0)
> > +        return __strspn_generic (s, a);
> > +    }
> > +  aligned = s;
> > +  offset = (unsigned int) ((size_t) s & 15);
> > +
> > +  if (offset != 0)
> > +    {
> > +    start_unaligned:
> > +      /* Check partial string.  */
> > +      aligned = (const char *) ((size_t) s & -16L);
> > +      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > +      __m128i adj_value = __m128i_shift_right (value, offset);
> > +
> > +      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
> > +      /* No need to check CFlag since it is always 1.  */
> > +      if (length < 16 - offset)
> > +       return length;
> > +      /* Find where the NULL terminator is.  */
> > +      maskz = _mm_cmpeq_epi8 (value, zero);
> > +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > +      if (maskz_bits != 0)
> > +       return length;
> > +      aligned += 16;
> > +    }
> > +
> > +start_loop:
> > +  while (1)
> > +    {
> > +      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > +      unsigned int index = _mm_cmpistri (mask, value, 0x12);
> > +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
> > +      if (cflag)
> > +       return (size_t) (aligned + index - s);
> > +      aligned += 16;
> > +    }
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
> > index 6124033ceb..6b50c36432 100644
> > --- a/sysdeps/x86_64/multiarch/strspn-c.c
> > +++ b/sysdeps/x86_64/multiarch/strspn-c.c
> > @@ -1,5 +1,5 @@
> > -/* strspn with SSE4.2 intrinsics
> > -   Copyright (C) 2009-2022 Free Software Foundation, Inc.
> > +/* strspn.
> > +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> >     The GNU C Library is free software; you can redistribute it and/or
> > @@ -16,121 +16,13 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#include <nmmintrin.h>
> > -#include <string.h>
> > -#include "varshift.h"
> > +#if IS_IN (libc)
> >
> > -/* We use 0x12:
> > -       _SIDD_SBYTE_OPS
> > -       | _SIDD_CMP_EQUAL_ANY
> > -       | _SIDD_NEGATIVE_POLARITY
> > -       | _SIDD_LEAST_SIGNIFICANT
> > -   on pcmpistri to compare xmm/mem128
> > +# include <sysdep.h>
> > +# define STRSPN __strspn_generic
> >
> > -   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > -   X X X X X X X X X X X X X X X X
> > +# undef libc_hidden_builtin_def
> > +# define libc_hidden_builtin_def(STRSPN)
> > +#endif
> >
> > -   against xmm
> > -
> > -   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > -   A A A A A A A A A A A A A A A A
> > -
> > -   to find out if the first 16byte data element has any non-A byte and
> > -   the offset of the first byte.  There are 2 cases:
> > -
> > -   1. The first 16byte data element has the non-A byte, including
> > -      EOS, at the offset X.
> > -   2. The first 16byte data element is valid and doesn't have the non-A
> > -      byte.
> > -
> > -   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
> > -
> > -   case                ECX     CFlag   ZFlag   SFlag
> > -    1           X        1      0/1      0
> > -    2          16        0       0       0
> > -
> > -   We exit from the loop for case 1.  */
> > -
> > -extern size_t __strspn_sse2 (const char *, const char *) attribute_hidden;
> > -
> > -
> > -size_t
> > -__attribute__ ((section (".text.sse4.2")))
> > -__strspn_sse42 (const char *s, const char *a)
> > -{
> > -  if (*a == 0)
> > -    return 0;
> > -
> > -  const char *aligned;
> > -  __m128i mask, maskz, zero;
> > -  unsigned int maskz_bits;
> > -  unsigned int offset = (int) ((size_t) a & 15);
> > -  zero = _mm_set1_epi8 (0);
> > -  if (offset != 0)
> > -    {
> > -      /* Load masks.  */
> > -      aligned = (const char *) ((size_t) a & -16L);
> > -      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > -      maskz = _mm_cmpeq_epi8 (mask0, zero);
> > -
> > -      /* Find where the NULL terminator is.  */
> > -      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > -      if (maskz_bits != 0)
> > -        {
> > -          mask = __m128i_shift_right (mask0, offset);
> > -          offset = (unsigned int) ((size_t) s & 15);
> > -          if (offset)
> > -            goto start_unaligned;
> > -
> > -          aligned = s;
> > -          goto start_loop;
> > -        }
> > -    }
> > -
> > -  /* A is aligned.  */
> > -  mask = _mm_loadu_si128 ((__m128i *) a);
> > -
> > -  /* Find where the NULL terminator is.  */
> > -  maskz = _mm_cmpeq_epi8 (mask, zero);
> > -  maskz_bits = _mm_movemask_epi8 (maskz);
> > -  if (maskz_bits == 0)
> > -    {
> > -      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> > -         of A > 16.  */
> > -      if (a[16] != 0)
> > -        return __strspn_sse2 (s, a);
> > -    }
> > -  aligned = s;
> > -  offset = (unsigned int) ((size_t) s & 15);
> > -
> > -  if (offset != 0)
> > -    {
> > -    start_unaligned:
> > -      /* Check partial string.  */
> > -      aligned = (const char *) ((size_t) s & -16L);
> > -      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > -      __m128i adj_value = __m128i_shift_right (value, offset);
> > -
> > -      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
> > -      /* No need to check CFlag since it is always 1.  */
> > -      if (length < 16 - offset)
> > -       return length;
> > -      /* Find where the NULL terminator is.  */
> > -      maskz = _mm_cmpeq_epi8 (value, zero);
> > -      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > -      if (maskz_bits != 0)
> > -       return length;
> > -      aligned += 16;
> > -    }
> > -
> > -start_loop:
> > -  while (1)
> > -    {
> > -      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > -      unsigned int index = _mm_cmpistri (mask, value, 0x12);
> > -      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
> > -      if (cflag)
> > -       return (size_t) (aligned + index - s);
> > -      aligned += 16;
> > -    }
> > -}
> > +#include <string/strspn.c>
> > diff --git a/sysdeps/x86_64/multiarch/wcscpy-c.c b/sysdeps/x86_64/multiarch/wcscpy-c.c
> > index 26d6984e9b..fa38dd898d 100644
> > --- a/sysdeps/x86_64/multiarch/wcscpy-c.c
> > +++ b/sysdeps/x86_64/multiarch/wcscpy-c.c
> > @@ -1,5 +1,5 @@
> >  #if IS_IN (libc)
> > -# define WCSCPY  __wcscpy_sse2
> > +# define WCSCPY  __wcscpy_generic
> >  #endif
> >
> >  #include <wcsmbs/wcscpy.c>
> > diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
> > index 6a2d1421d9..53c3228dc2 100644
> > --- a/sysdeps/x86_64/multiarch/wcscpy.c
> > +++ b/sysdeps/x86_64/multiarch/wcscpy.c
> > @@ -26,7 +26,7 @@
> >  # define SYMBOL_NAME wcscpy
> >  # include <init-arch.h>
> >
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> >
> >  static inline void *
> > @@ -37,7 +37,7 @@ IFUNC_SELECTOR (void)
> >    if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> >      return OPTIMIZE (ssse3);
> >
> > -  return OPTIMIZE (sse2);
> > +  return OPTIMIZE (generic);
> >  }
> >
> >  libc_ifunc_redirected (__redirect_wcscpy, __wcscpy, IFUNC_SELECTOR ());
> > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-c.c b/sysdeps/x86_64/multiarch/wcsnlen-c.c
> > index e1ec7cfbb5..1c9c04241a 100644
> > --- a/sysdeps/x86_64/multiarch/wcsnlen-c.c
> > +++ b/sysdeps/x86_64/multiarch/wcsnlen-c.c
> > @@ -1,9 +1,9 @@
> >  #if IS_IN (libc)
> >  # include <wchar.h>
> >
> > -# define WCSNLEN __wcsnlen_sse2
> > +# define WCSNLEN __wcsnlen_generic
> >
> > -extern __typeof (wcsnlen) __wcsnlen_sse2;
> > +extern __typeof (wcsnlen) __wcsnlen_generic;
> >  #endif
> >
> >  #include "wcsmbs/wcsnlen.c"
> > diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
> > index baa26666a8..05b7a211de 100644
> > --- a/sysdeps/x86_64/multiarch/wcsnlen.c
> > +++ b/sysdeps/x86_64/multiarch/wcsnlen.c
> > @@ -24,6 +24,7 @@
> >  # undef __wcsnlen
> >
> >  # define SYMBOL_NAME wcsnlen
> > +# define GENERIC generic
> >  # include "ifunc-wcslen.h"
> >
> >  libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

  reply	other threads:[~2022-06-10  1:27 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-09  4:16 [PATCH v1 1/3] x86: Align varshift table to 32-bytes Noah Goldstein
2022-06-09  4:16 ` [PATCH v1 2/3] x86: Add avx compiled version for strspn, strcspn, and strpbrk Noah Goldstein
2022-06-09 15:28   ` H.J. Lu
2022-06-09  4:16 ` [PATCH v1 3/3] x86: Rename generic functions with unique postfix for clarity Noah Goldstein
2022-06-10  0:58   ` [PATCH v2] " Noah Goldstein
2022-06-10  1:19     ` H.J. Lu
2022-06-10  1:26       ` Noah Goldstein [this message]
2022-06-16 22:11   ` [PATCH v3] " Noah Goldstein
2022-06-16 22:43     ` H.J. Lu
2022-06-09 15:14 ` [PATCH v1 1/3] x86: Align varshift table to 32-bytes H.J. Lu
2022-07-14  2:51   ` Sunil Pandey

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CAFUsyf+Vc-ED2UhRcGE7WxxtS-OYL6+U9sBV-DKHo9+e9QgEgw@mail.gmail.com \
    --to=goldstein.w.n@gmail.com \
    --cc=carlos@systemhalted.org \
    --cc=hjl.tools@gmail.com \
    --cc=libc-alpha@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).