public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX
@ 2022-05-26 20:22 Raghuveer Devulapalli
  2022-05-26 21:25 ` Noah Goldstein
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Raghuveer Devulapalli @ 2022-05-26 20:22 UTC (permalink / raw)
  To: libc-alpha

Adding a 512-bit EVEX version of strstr. The algorithm works as follows:

(1) We spend a few cycles at the beginning to peek into the needle. We
locate an edge in the needle (first occurrence of 2 consecutive distinct
characters) and also store the first 64-bytes into a zmm register.

(2) We search for the edge in the haystack by looking into one cache
line of the haystack at a time. This avoids having to read past a page
boundary which can cause a seg fault.

(3) If an edge is found in the haystack we first compare the first
64-bytes of the needle (already stored in a zmm register) before we
proceed with a full string compare performed byte by byte.

Benchmarking data on ICX shows up to a 2x speedup when compared to
__strstr_sse2_unaligned (including partial benchtests data from
bench-strstr.out):

|---------------------------------+---------------+-----------------------|
|                                 | strstr_avx512 | strstr_sse2_unaligned |
|---------------------------------+---------------+-----------------------|
| Length 16384/ 16,  1/11, found: | 1939.75       | 3458.44               |
| Length 16384/ 16, 14/ 5, fail : | 1967.75       | 3541.12               |
| Length 16384/ 32,  1/11, found: | 1540.38       | 2908.25               |
| Length 16384/ 32, 14/ 5, fail : | 1345.94       | 2866.31               |
| Length 16384/ 64,  1/11, found: | 1968.81       | 4327.56               |
| Length 16384/ 64, 14/ 5, fail : | 1993.75       | 4215.69               |
| Length 16384/128,  1/11, found: | 1535.44       | 3780.56               |
| Length 16384/128, 14/ 5, fail : | 1414.75       | 3595.25               |
| Length 16384/256,  1/11, found: | 2957.75       | 5501.44               |
| Length 16384/256, 14/ 5, fail : | 2682.62       | 5099.88               |
| Length 32768/ 16,  1/11, found: | 7820.19       | 11262.9               |
| Length 32768/ 16, 14/ 5, fail : | 8196.88       | 10871.2               |
| Length 32768/ 32,  1/11, found: | 5709.19       | 6611.56               |
| Length 32768/ 32, 14/ 5, fail : | 5716.12       | 6647.06               |
| Length 32768/ 64,  1/11, found: | 7160.44       | 10143.7               |
| Length 32768/ 64, 14/ 5, fail : | 7021.38       | 10150.6               |
| Length 32768/128,  1/11, found: | 4935.31       | 6756.56               |
| Length 32768/128, 14/ 5, fail : | 4774.38       | 6746.19               |
| Length 32768/256,  1/11, found: | 7933.19       | 12563.8               |
| Length 32768/256, 14/ 5, fail : | 7975          | 12558.6               |
| Length 65536/ 16,  1/11, found: | 9066.69       | 9419.62               |
| Length 65536/ 16, 14/ 5, fail : | 8496          | 9384.75               |
| Length 65536/ 32,  1/11, found: | 10258.8       | 11192.4               |
| Length 65536/ 32, 14/ 5, fail : | 8712.12       | 11172.3               |
| Length 65536/ 64,  1/11, found: | 11085.2       | 18162.1               |
| Length 65536/ 64, 14/ 5, fail : | 11219.6       | 17921.5               |
| Length 65536/128,  1/11, found: | 9753.56       | 18704.6               |
| Length 65536/128, 14/ 5, fail : | 9588.81       | 18465.6               |
| Length 65536/256,  1/11, found: | 18333.3       | 28505.2               |
| Length 65536/256, 14/ 5, fail : | 18018.8       | 27990.8               |
|---------------------------------+---------------+-----------------------|
---
 sysdeps/x86_64/multiarch/Makefile          |   2 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
 sysdeps/x86_64/multiarch/strstr-avx512.c   | 208 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
 4 files changed, 236 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7b413edad..6dc54a7265 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -126,6 +126,7 @@ sysdep_routines += \
   strrchr-sse2 \
   strspn-c \
   strspn-sse2 \
+  strstr-avx512 \
   strstr-sse2-unaligned \
   varshift \
 # sysdep_routines
@@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
 CFLAGS-strspn-c.c += -msse4
+CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
 endif
 
 ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a594f4176e..cc9a7eaaa1 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strstr.c.  */
   IFUNC_IMPL (i, name, strstr,
+              IFUNC_IMPL_ADD (array, i, strstr,
+                              (CPU_FEATURE_USABLE (AVX512VL)
+                               && CPU_FEATURE_USABLE (AVX512BW)
+                               && CPU_FEATURE_USABLE (AVX512DQ)
+                               && CPU_FEATURE_USABLE (BMI2)),
+                              __strstr_avx512)
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
 
diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
new file mode 100644
index 0000000000..4082a75a1b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
@@ -0,0 +1,208 @@
+/* strstr optimized with 512-bit AVX-512 instructions
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <immintrin.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define FULL_MMASK64 0xffffffffffffffff
+#define ONE_64BIT 0x1ull
+#define ZMM_SIZE_IN_BYTES 64
+
+/* Returns the index of the first edge (two adjacent bytes that differ) in
+   the needle.  Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'.
+   Returns 0 if no edge is found; 0 is also returned for an edge at index 0.
+   Reads ned[1] unconditionally, so the needle must not be empty.  */
+static inline size_t
+find_edge_in_needle (const char *ned)
+{
+  size_t ind = 0;
+  while (ned[ind + 1] != '\0')
+    {
+      if (ned[ind] != ned[ind + 1])
+        return ind;
+      else
+        ind = ind + 1;
+    }
+  return 0;
+}
+
+/* Scalar compare of the needle against hay + hay_index, starting at needle
+   offset IND (needle bytes before IND are not looked at).  Returns true if
+   every byte up to the needle's null terminator matches the haystack.  */
+static inline bool
+verify_string_match (const char *hay, const size_t hay_index, const char *ned,
+                     size_t ind)
+{
+  while (ned[ind] != '\0')
+    {
+      if (ned[ind] != hay[hay_index + ind])
+        return false;
+      ind = ind + 1;
+    }
+  return true;
+}
+
+/* Compare the needle against hay + hay_index.  The needle bytes selected by
+   NED_MASK (held in NED_ZMM) are checked with one unaligned 64-byte load of
+   the haystack; only when NED_MASK is full does a scalar compare continue
+   from byte 64.  */
+static inline bool
+verify_string_match_avx512 (const char *hay, const size_t hay_index,
+                            const char *ned, const __mmask64 ned_mask,
+                            const __m512i ned_zmm)
+{
+  /* check first 64 bytes using zmm and then scalar */
+  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // NOTE(review): unmasked load; assumes all 64 bytes are readable -- TODO confirm
+  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
+  if (match != 0x0) // some needle byte under ned_mask differs
+    return false;
+  else if (ned_mask == FULL_MMASK64)
+    return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
+  return true;
+}
+
+char *
+__strstr_avx512 (const char *haystack, const char *ned)
+{
+  char first = ned[0];
+  if (first == '\0')
+    return (char *)haystack;
+  if (ned[1] == '\0')
+    return (char *)strchr (haystack, ned[0]);
+
+  size_t edge = find_edge_in_needle (ned);
+
+  /* Ensure the haystack has at least EDGE bytes (NOTE: int vs size_t compare) */
+  for (int ii = 0; ii < edge; ++ii)
+    {
+      if (haystack[ii] == '\0')
+        return NULL;
+    }
+
+  const __m512i null = _mm512_setzero_si512 (); // '\0'
+
+  /* Load up to 64 bytes of the needle into a zmm register.  The first load
+     is masked to the needle's current cache line to avoid loading across a
+     page boundary.  Afterwards ned_load_mask selects the needle bytes before
+     the null terminator, or all 64 bytes if no null was found.  */
+  __mmask64 ned_load_mask
+      = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
+  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
+  __mmask64 ned_nullmask
+      = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm, null);
+  if (__glibc_unlikely (ned_nullmask == 0x0))
+    {
+      ned_zmm = _mm512_loadu_si512 (ned);
+      ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
+      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+      if (ned_nullmask != 0x0)
+        ned_load_mask = ned_load_mask >> 1;
+    }
+  else
+    {
+      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+      ned_load_mask = ned_load_mask >> 1;
+    }
+  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
+  const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
+
+  /* The search starts at haystack + edge.  The first load is masked so that
+     it covers only the bytes left in that address's cache line; loadmask has
+     one bit set per byte loaded.  */
+  size_t hay_index = edge;
+  __mmask64 loadmask = _bzhi_u64 (
+      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
+  /* First load may be a partial cache line */
+  __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
+  /* Find the null char; cmpmask keeps loaded positions up to the first null */
+  __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask, hay0, null);
+  __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+  cmpmask = _kand_mask64 (cmpmask, loadmask);
+  /* Search for the 2 characters of the needle's edge */
+  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
+  k1 = _kshiftri_mask64 (k1, 1);
+  /* k2 has a bit set where ned[edge] is followed by ned[edge + 1] */
+  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
+  /* For every edge hit, check the entire needle for a full match */
+  while (k2)
+    {
+      uint64_t bitcount = _tzcnt_u64(k2);
+      k2 = _blsr_u64(k2);
+      size_t match_pos = hay_index + bitcount - edge;
+      if (nullmask == 0)
+        {
+          if (verify_string_match_avx512 (haystack, match_pos, ned,
+                                          ned_load_mask, ned_zmm))
+            return (char *)haystack + match_pos;
+        }
+      else
+        {
+          if (verify_string_match (haystack, match_pos, ned, 0))
+            return (char *)haystack + match_pos;
+        }
+    }
+  /* The last loaded byte was not tested as an edge start; resume from it */
+  hay_index += _mm_popcnt_u64 (loadmask) - 1;
+
+  /* Loop over one cache line at a time to prevent reading over a page
+     boundary.  hay1 is the cache line at hay_index + 1 and is the source of
+     nullmask; hay0 is the same window shifted back by one byte, so bit i of
+     k0 & k1 marks an edge starting at hay_index + i.  */
+  __m512i hay1;
+  while (nullmask == 0)
+    {
+      hay0 = _mm512_loadu_si512 (haystack + hay_index);
+      hay1 = _mm512_load_si512 (haystack + hay_index
+                                + 1); // Always 64 byte aligned
+      nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
+      /* Compare only till null char */
+      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
+      /* k2 has a bit set where both chars of the needle's edge match */
+      k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
+      /* For every match, compare full strings for potential match */
+      while (k2)
+        {
+          uint64_t bitcount = _tzcnt_u64(k2);
+          k2 = _blsr_u64(k2);
+          size_t match_pos = hay_index + bitcount - edge;
+          if (nullmask == 0)
+            {
+              /*
+               Since the haystack doesn't terminate at the current cache
+               line, we can use zmm register to compare the first 64 bytes
+               */
+              if (verify_string_match_avx512 (haystack, match_pos, ned,
+                                              ned_load_mask, ned_zmm))
+                return (char *)haystack + match_pos;
+            }
+          else
+            {
+              /* Compare byte by byte */
+              if (verify_string_match (haystack, match_pos, ned, 0))
+                return (char *)haystack + match_pos;
+            }
+        }
+      hay_index += ZMM_SIZE_IN_BYTES;
+    }
+  return NULL;
+}
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index 95600a9de5..2fb8b169b6 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -35,16 +35,32 @@
 
 extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
 extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
+extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
 
 #include "init-arch.h"
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr,
-	    HAS_ARCH_FEATURE (Fast_Unaligned_Load)
-	    ? __strstr_sse2_unaligned
-	    : __strstr_sse2)
 
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+  /* Try __strstr_avx512 first, then __strstr_sse2_unaligned, else __strstr_sse2.  */
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+    return __strstr_avx512;
+
+  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+    return __strstr_sse2_unaligned;
+
+  return __strstr_sse2;
+}
+
+libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
 #undef strstr
 strong_alias (__libc_strstr, strstr)
-- 
2.36.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2022-07-14  2:05 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-26 20:22 [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX Raghuveer Devulapalli
2022-05-26 21:25 ` Noah Goldstein
2022-05-31 19:16   ` Devulapalli, Raghuveer
2022-05-31 19:36     ` Devulapalli, Raghuveer
2022-05-31 21:33     ` Noah Goldstein
2022-06-01  4:13       ` Devulapalli, Raghuveer
2022-05-26 21:41 ` Noah Goldstein
2022-05-26 22:26   ` Noah Goldstein
2022-05-27 17:49     ` Devulapalli, Raghuveer
2022-06-03 21:01 ` [PATCH] " Raghuveer Devulapalli
2022-06-06 19:17   ` [PATCH v2] " Raghuveer Devulapalli
2022-06-06 20:25     ` Noah Goldstein
2022-06-06 20:35       ` Noah Goldstein
2022-06-06 21:32     ` H.J. Lu
2022-06-06 21:39       ` Devulapalli, Raghuveer
2022-07-14  2:04         ` Sunil Pandey

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).