public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] x86_64: Remove avx512 strstr implementation
@ 2024-03-21 17:12 Adhemerval Zanella
  2024-03-21 23:17 ` Noah Goldstein
  0 siblings, 1 reply; 2+ messages in thread
From: Adhemerval Zanella @ 2024-03-21 17:12 UTC (permalink / raw)
  To: libc-alpha; +Cc: Wilco Dijkstra, Noah Goldstein, H . J . Lu

As indicated in a recent thread, this it is a simple brute-force
algorithm that checks the whole needle at a matching character pair
(and does so 1 byte at a time after the first 64 bytes of a needle).
Also it never skips ahead and thus can match at every haystack
position after trying to match all of the needle, which generic
implementation avoids.

As indicated by Wilco, a 4x larger needle and 16x larger haystack gives
a clear 65x slowdown both basic_strstr and __strstr_avx512:

  "ifuncs": ["basic_strstr", "twoway_strstr", "__strstr_avx512",
"__strstr_sse2_unaligned", "__strstr_generic"],

    {
     "len_haystack": 65536,
     "len_needle": 1024,
     "align_haystack": 0,
     "align_needle": 0,
     "fail": 1,
     "desc": "Difficult bruteforce needle",
     "timings": [4.0948e+07, 15094.5, 3.20818e+07, 108558, 10839.2]
    },
    {
     "len_haystack": 1048576,
     "len_needle": 4096,
     "align_haystack": 0,
     "align_needle": 0,
     "fail": 1,
     "desc": "Difficult bruteforce needle",
     "timings": [2.69767e+09, 100797, 2.08535e+09, 495706, 82666.9]
    }

PS: I don't have an AVX512 capable machine to verify this issues, but
    skimming through the code it does seems to follow what Wilco has
    described.

---
 sysdeps/x86_64/multiarch/Makefile          |   3 -
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 -
 sysdeps/x86_64/multiarch/strstr-avx512.c   | 218 ---------------------
 sysdeps/x86_64/multiarch/strstr.c          |  25 +--
 4 files changed, 4 insertions(+), 248 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d3d2270394..696cb66991 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -117,7 +117,6 @@ sysdep_routines += \
   strrchr-evex512 \
   strrchr-sse2 \
   strspn-sse4 \
-  strstr-avx512 \
   strstr-sse2-unaligned \
   varshift \
 # sysdep_routines
@@ -125,8 +124,6 @@ sysdep_routines += \
 CFLAGS-strcspn-sse4.c += -msse4
 CFLAGS-strpbrk-sse4.c += -msse4
 CFLAGS-strspn-sse4.c += -msse4
-
-CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
 endif
 
 ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c4a21d4b7c..0bbb71bbbf 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -790,12 +790,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strstr.c.  */
   IFUNC_IMPL (i, name, strstr,
-              IFUNC_IMPL_ADD (array, i, strstr,
-                              (CPU_FEATURE_USABLE (AVX512VL)
-                               && CPU_FEATURE_USABLE (AVX512BW)
-                               && CPU_FEATURE_USABLE (AVX512DQ)
-                               && CPU_FEATURE_USABLE (BMI2)),
-                              __strstr_avx512)
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_generic))
 
diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
deleted file mode 100644
index 3ac53accbd..0000000000
--- a/sysdeps/x86_64/multiarch/strstr-avx512.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/* strstr optimized with 512-bit AVX-512 instructions
-   Copyright (C) 2022-2024 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <immintrin.h>
-#include <inttypes.h>
-#include <stdbool.h>
-#include <string.h>
-
-#define FULL_MMASK64 0xffffffffffffffff
-#define ONE_64BIT 0x1ull
-#define ZMM_SIZE_IN_BYTES 64
-#define PAGESIZE 4096
-
-#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
-#define kshiftri_mask64(x, y) ((x) >> (y))
-#define kand_mask64(x, y) ((x) & (y))
-
-/*
- Returns the index of the first edge within the needle, returns 0 if no edge
- is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
- */
-static inline size_t
-find_edge_in_needle (const char *ned)
-{
-  size_t ind = 0;
-  while (ned[ind + 1] != '\0')
-    {
-      if (ned[ind] != ned[ind + 1])
-        return ind;
-      else
-        ind = ind + 1;
-    }
-  return 0;
-}
-
-/*
- Compare needle with haystack byte by byte at specified location
- */
-static inline bool
-verify_string_match (const char *hay, const size_t hay_index, const char *ned,
-                     size_t ind)
-{
-  while (ned[ind] != '\0')
-    {
-      if (ned[ind] != hay[hay_index + ind])
-        return false;
-      ind = ind + 1;
-    }
-  return true;
-}
-
-/*
- Compare needle with haystack at specified location. The first 64 bytes are
- compared using a ZMM register.
- */
-static inline bool
-verify_string_match_avx512 (const char *hay, const size_t hay_index,
-                            const char *ned, const __mmask64 ned_mask,
-                            const __m512i ned_zmm)
-{
-  /* check first 64 bytes using zmm and then scalar */
-  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
-  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
-  if (match != 0x0) // failed the first few chars
-    return false;
-  else if (ned_mask == FULL_MMASK64)
-    return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
-  return true;
-}
-
-char *
-__strstr_avx512 (const char *haystack, const char *ned)
-{
-  char first = ned[0];
-  if (first == '\0')
-    return (char *)haystack;
-  if (ned[1] == '\0')
-    return (char *)strchr (haystack, ned[0]);
-
-  size_t edge = find_edge_in_needle (ned);
-
-  /* ensure haystack is as long as the pos of edge in needle */
-  for (int ii = 0; ii < edge; ++ii)
-    {
-      if (haystack[ii] == '\0')
-        return NULL;
-    }
-
-  /*
-   Load 64 bytes of the needle and save it to a zmm register
-   Read one cache line at a time to avoid loading across a page boundary
-   */
-  __mmask64 ned_load_mask = _bzhi_u64 (
-      FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63));
-  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
-  __mmask64 ned_nullmask
-      = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm);
-
-  if (__glibc_unlikely (ned_nullmask == 0x0))
-    {
-      ned_zmm = _mm512_loadu_si512 (ned);
-      ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm);
-      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
-      if (ned_nullmask != 0x0)
-        ned_load_mask = ned_load_mask >> 1;
-    }
-  else
-    {
-      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
-      ned_load_mask = ned_load_mask >> 1;
-    }
-  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
-  const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
-
-  /*
-   Read the bytes of haystack in the current cache line
-   */
-  size_t hay_index = edge;
-  __mmask64 loadmask = _bzhi_u64 (
-      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
-  /* First load is a partial cache line */
-  __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
-  /* Search for NULL and compare only till null char */
-  uint64_t nullmask
-      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
-  uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
-  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
-  /* Search for the 2 characters of needle */
-  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
-  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
-  k1 = kshiftri_mask64 (k1, 1);
-  /* k2 masks tell us if both chars from needle match */
-  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
-  /* For every match, search for the entire needle for a full match */
-  while (k2)
-    {
-      uint64_t bitcount = _tzcnt_u64 (k2);
-      k2 = _blsr_u64 (k2);
-      size_t match_pos = hay_index + bitcount - edge;
-      if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
-          < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
-        {
-          /*
-           * Use vector compare as long as you are not crossing a page
-           */
-          if (verify_string_match_avx512 (haystack, match_pos, ned,
-                                          ned_load_mask, ned_zmm))
-            return (char *)haystack + match_pos;
-        }
-      else
-        {
-          if (verify_string_match (haystack, match_pos, ned, 0))
-            return (char *)haystack + match_pos;
-        }
-    }
-  /* We haven't checked for potential match at the last char yet */
-  haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63));
-  hay_index = 0;
-
-  /*
-   Loop over one cache line at a time to prevent reading over page
-   boundary
-   */
-  __m512i hay1;
-  while (nullmask == 0)
-    {
-      hay0 = _mm512_loadu_si512 (haystack + hay_index);
-      hay1 = _mm512_load_si512 (haystack + hay_index
-                                + 1); // Always 64 byte aligned
-      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
-      /* Compare only till null char */
-      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
-      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
-      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
-      /* k2 masks tell us if both chars from needle match */
-      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
-      /* For every match, compare full strings for potential match */
-      while (k2)
-        {
-          uint64_t bitcount = _tzcnt_u64 (k2);
-          k2 = _blsr_u64 (k2);
-          size_t match_pos = hay_index + bitcount - edge;
-          if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
-              < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
-            {
-              /*
-               * Use vector compare as long as you are not crossing a page
-               */
-              if (verify_string_match_avx512 (haystack, match_pos, ned,
-                                              ned_load_mask, ned_zmm))
-                return (char *)haystack + match_pos;
-            }
-          else
-            {
-              /* Compare byte by byte */
-              if (verify_string_match (haystack, match_pos, ned, 0))
-                return (char *)haystack + match_pos;
-            }
-        }
-      hay_index += ZMM_SIZE_IN_BYTES;
-    }
-  return NULL;
-}
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index a513bac5c3..828308668b 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -35,32 +35,15 @@
 
 extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
 extern __typeof (__redirect_strstr) __strstr_generic attribute_hidden;
-extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
 
 #include "init-arch.h"
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 extern __typeof (__redirect_strstr) __libc_strstr;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
-  const struct cpu_features *cpu_features = __get_cpu_features ();
-
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
-      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
-    return __strstr_avx512;
-
-  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
-    return __strstr_sse2_unaligned;
-
-  return __strstr_generic;
-}
-
-libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
+libc_ifunc (__libc_strstr,
+	    HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	    ? __strstr_sse2_unaligned
+	    : __strstr_generic)
 #undef strstr
 strong_alias (__libc_strstr, strstr)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] x86_64: Remove avx512 strstr implementation
  2024-03-21 17:12 [PATCH] x86_64: Remove avx512 strstr implementation Adhemerval Zanella
@ 2024-03-21 23:17 ` Noah Goldstein
  0 siblings, 0 replies; 2+ messages in thread
From: Noah Goldstein @ 2024-03-21 23:17 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha, Wilco Dijkstra, H . J . Lu

On Thu, Mar 21, 2024 at 12:12 PM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
> As indicated in a recent thread, this it is a simple brute-force
> algorithm that checks the whole needle at a matching character pair
> (and does so 1 byte at a time after the first 64 bytes of a needle).
> Also it never skips ahead and thus can match at every haystack
> position after trying to match all of the needle, which generic
> implementation avoids.
>
> As indicated by Wilco, a 4x larger needle and 16x larger haystack gives
> a clear 65x slowdown both basic_strstr and __strstr_avx512:
>
>   "ifuncs": ["basic_strstr", "twoway_strstr", "__strstr_avx512",
> "__strstr_sse2_unaligned", "__strstr_generic"],
>
>     {
>      "len_haystack": 65536,
>      "len_needle": 1024,
>      "align_haystack": 0,
>      "align_needle": 0,
>      "fail": 1,
>      "desc": "Difficult bruteforce needle",
>      "timings": [4.0948e+07, 15094.5, 3.20818e+07, 108558, 10839.2]
>     },
>     {
>      "len_haystack": 1048576,
>      "len_needle": 4096,
>      "align_haystack": 0,
>      "align_needle": 0,
>      "fail": 1,
>      "desc": "Difficult bruteforce needle",
>      "timings": [2.69767e+09, 100797, 2.08535e+09, 495706, 82666.9]
>     }
>
> PS: I don't have an AVX512 capable machine to verify this issues, but
>     skimming through the code it does seems to follow what Wilco has
>     described.
>
> ---
>  sysdeps/x86_64/multiarch/Makefile          |   3 -
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 -
>  sysdeps/x86_64/multiarch/strstr-avx512.c   | 218 ---------------------
>  sysdeps/x86_64/multiarch/strstr.c          |  25 +--
>  4 files changed, 4 insertions(+), 248 deletions(-)
>  delete mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index d3d2270394..696cb66991 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -117,7 +117,6 @@ sysdep_routines += \
>    strrchr-evex512 \
>    strrchr-sse2 \
>    strspn-sse4 \
> -  strstr-avx512 \
>    strstr-sse2-unaligned \
>    varshift \
>  # sysdep_routines
> @@ -125,8 +124,6 @@ sysdep_routines += \
>  CFLAGS-strcspn-sse4.c += -msse4
>  CFLAGS-strpbrk-sse4.c += -msse4
>  CFLAGS-strspn-sse4.c += -msse4
> -
> -CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
>  endif
>
>  ifeq ($(subdir),wcsmbs)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index c4a21d4b7c..0bbb71bbbf 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -790,12 +790,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
>    IFUNC_IMPL (i, name, strstr,
> -              IFUNC_IMPL_ADD (array, i, strstr,
> -                              (CPU_FEATURE_USABLE (AVX512VL)
> -                               && CPU_FEATURE_USABLE (AVX512BW)
> -                               && CPU_FEATURE_USABLE (AVX512DQ)
> -                               && CPU_FEATURE_USABLE (BMI2)),
> -                              __strstr_avx512)
>               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
>               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_generic))
>
> diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> deleted file mode 100644
> index 3ac53accbd..0000000000
> --- a/sysdeps/x86_64/multiarch/strstr-avx512.c
> +++ /dev/null
> @@ -1,218 +0,0 @@
> -/* strstr optimized with 512-bit AVX-512 instructions
> -   Copyright (C) 2022-2024 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include <immintrin.h>
> -#include <inttypes.h>
> -#include <stdbool.h>
> -#include <string.h>
> -
> -#define FULL_MMASK64 0xffffffffffffffff
> -#define ONE_64BIT 0x1ull
> -#define ZMM_SIZE_IN_BYTES 64
> -#define PAGESIZE 4096
> -
> -#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
> -#define kshiftri_mask64(x, y) ((x) >> (y))
> -#define kand_mask64(x, y) ((x) & (y))
> -
> -/*
> - Returns the index of the first edge within the needle, returns 0 if no edge
> - is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> - */
> -static inline size_t
> -find_edge_in_needle (const char *ned)
> -{
> -  size_t ind = 0;
> -  while (ned[ind + 1] != '\0')
> -    {
> -      if (ned[ind] != ned[ind + 1])
> -        return ind;
> -      else
> -        ind = ind + 1;
> -    }
> -  return 0;
> -}
> -
> -/*
> - Compare needle with haystack byte by byte at specified location
> - */
> -static inline bool
> -verify_string_match (const char *hay, const size_t hay_index, const char *ned,
> -                     size_t ind)
> -{
> -  while (ned[ind] != '\0')
> -    {
> -      if (ned[ind] != hay[hay_index + ind])
> -        return false;
> -      ind = ind + 1;
> -    }
> -  return true;
> -}
> -
> -/*
> - Compare needle with haystack at specified location. The first 64 bytes are
> - compared using a ZMM register.
> - */
> -static inline bool
> -verify_string_match_avx512 (const char *hay, const size_t hay_index,
> -                            const char *ned, const __mmask64 ned_mask,
> -                            const __m512i ned_zmm)
> -{
> -  /* check first 64 bytes using zmm and then scalar */
> -  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
> -  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
> -  if (match != 0x0) // failed the first few chars
> -    return false;
> -  else if (ned_mask == FULL_MMASK64)
> -    return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
> -  return true;
> -}
> -
> -char *
> -__strstr_avx512 (const char *haystack, const char *ned)
> -{
> -  char first = ned[0];
> -  if (first == '\0')
> -    return (char *)haystack;
> -  if (ned[1] == '\0')
> -    return (char *)strchr (haystack, ned[0]);
> -
> -  size_t edge = find_edge_in_needle (ned);
> -
> -  /* ensure haystack is as long as the pos of edge in needle */
> -  for (int ii = 0; ii < edge; ++ii)
> -    {
> -      if (haystack[ii] == '\0')
> -        return NULL;
> -    }
> -
> -  /*
> -   Load 64 bytes of the needle and save it to a zmm register
> -   Read one cache line at a time to avoid loading across a page boundary
> -   */
> -  __mmask64 ned_load_mask = _bzhi_u64 (
> -      FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63));
> -  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
> -  __mmask64 ned_nullmask
> -      = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm);
> -
> -  if (__glibc_unlikely (ned_nullmask == 0x0))
> -    {
> -      ned_zmm = _mm512_loadu_si512 (ned);
> -      ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm);
> -      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> -      if (ned_nullmask != 0x0)
> -        ned_load_mask = ned_load_mask >> 1;
> -    }
> -  else
> -    {
> -      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> -      ned_load_mask = ned_load_mask >> 1;
> -    }
> -  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
> -  const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> -
> -  /*
> -   Read the bytes of haystack in the current cache line
> -   */
> -  size_t hay_index = edge;
> -  __mmask64 loadmask = _bzhi_u64 (
> -      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> -  /* First load is a partial cache line */
> -  __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> -  /* Search for NULL and compare only till null char */
> -  uint64_t nullmask
> -      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> -  uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> -  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
> -  /* Search for the 2 characters of needle */
> -  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> -  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> -  k1 = kshiftri_mask64 (k1, 1);
> -  /* k2 masks tell us if both chars from needle match */
> -  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> -  /* For every match, search for the entire needle for a full match */
> -  while (k2)
> -    {
> -      uint64_t bitcount = _tzcnt_u64 (k2);
> -      k2 = _blsr_u64 (k2);
> -      size_t match_pos = hay_index + bitcount - edge;
> -      if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
> -          < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
> -        {
> -          /*
> -           * Use vector compare as long as you are not crossing a page
> -           */
> -          if (verify_string_match_avx512 (haystack, match_pos, ned,
> -                                          ned_load_mask, ned_zmm))
> -            return (char *)haystack + match_pos;
> -        }
> -      else
> -        {
> -          if (verify_string_match (haystack, match_pos, ned, 0))
> -            return (char *)haystack + match_pos;
> -        }
> -    }
> -  /* We haven't checked for potential match at the last char yet */
> -  haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63));
> -  hay_index = 0;
> -
> -  /*
> -   Loop over one cache line at a time to prevent reading over page
> -   boundary
> -   */
> -  __m512i hay1;
> -  while (nullmask == 0)
> -    {
> -      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> -      hay1 = _mm512_load_si512 (haystack + hay_index
> -                                + 1); // Always 64 byte aligned
> -      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> -      /* Compare only till null char */
> -      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> -      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> -      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> -      /* k2 masks tell us if both chars from needle match */
> -      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> -      /* For every match, compare full strings for potential match */
> -      while (k2)
> -        {
> -          uint64_t bitcount = _tzcnt_u64 (k2);
> -          k2 = _blsr_u64 (k2);
> -          size_t match_pos = hay_index + bitcount - edge;
> -          if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
> -              < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
> -            {
> -              /*
> -               * Use vector compare as long as you are not crossing a page
> -               */
> -              if (verify_string_match_avx512 (haystack, match_pos, ned,
> -                                              ned_load_mask, ned_zmm))
> -                return (char *)haystack + match_pos;
> -            }
> -          else
> -            {
> -              /* Compare byte by byte */
> -              if (verify_string_match (haystack, match_pos, ned, 0))
> -                return (char *)haystack + match_pos;
> -            }
> -        }
> -      hay_index += ZMM_SIZE_IN_BYTES;
> -    }
> -  return NULL;
> -}
> diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
> index a513bac5c3..828308668b 100644
> --- a/sysdeps/x86_64/multiarch/strstr.c
> +++ b/sysdeps/x86_64/multiarch/strstr.c
> @@ -35,32 +35,15 @@
>
>  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
>  extern __typeof (__redirect_strstr) __strstr_generic attribute_hidden;
> -extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
>
>  #include "init-arch.h"
>
>  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>     ifunc symbol properly.  */
>  extern __typeof (__redirect_strstr) __libc_strstr;
> -
> -static inline void *
> -IFUNC_SELECTOR (void)
> -{
> -  const struct cpu_features *cpu_features = __get_cpu_features ();
> -
> -  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> -      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> -      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> -      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> -      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> -    return __strstr_avx512;
> -
> -  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> -    return __strstr_sse2_unaligned;
> -
> -  return __strstr_generic;
> -}
> -
> -libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
> +libc_ifunc (__libc_strstr,
> +           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> +           ? __strstr_sse2_unaligned
> +           : __strstr_generic)
>  #undef strstr
>  strong_alias (__libc_strstr, strstr)
> --
> 2.34.1
>

LGTM.
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2024-03-21 23:17 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-21 17:12 [PATCH] x86_64: Remove avx512 strstr implementation Adhemerval Zanella
2024-03-21 23:17 ` Noah Goldstein

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).