From: Noah Goldstein <goldstein.w.n@gmail.com>
To: James Tirta Halim <tirtajames45@gmail.com>
Cc: libc-alpha@sourceware.org
Subject: Re: [PATCH v4] sysdeps/x86_64/multiarch/memmem-avx2.c: add memmem-avx2.c
Date: Fri, 2 Feb 2024 22:47:55 +0000 [thread overview]
Message-ID: <CAFUsyfJ-hJTY6dBAqn=LAFbPEpy8+OQO4W97_LDXpPp_-1pWCg@mail.gmail.com> (raw)
In-Reply-To: <20240201005721.782679-1-tirtajames45@gmail.com>
On Thu, Feb 1, 2024 at 1:00 AM James Tirta Halim <tirtajames45@gmail.com> wrote:
>
> Find the rarest byte in NE. Find the parts of HS that matches the rare byte
> and the byte after it. If found, shift back to the start of NE in HS and
> vector compare the first VEC_SIZE with NE. If matches, compare the rest
> with MEMCMPEQ.
>
> Timings (Core i3-1115G4):
> basic_memmem twoway_memmem __memmem_avx512 __memmem_avx2
> __memmem_generic
> Total:
> 6.80124e+06 1.06087e+06 219483 345385 768041
> Average:
> 25958.9 4049.11 837.721 1318.26 2931.45
>
> Passes make check.
>
> Changes in v1:
> 1. Add memmem-avx2.c
>
> Changes in v2:
> 1. Add avx512 support with a generic header file
> 2. Use __memcmpeq instead of memcmp
> 3. Remove scalar loop
> 4. Fix unsafe unaligned load
>
> Changes in v3:
> 1. Avoid checking for alignment to the start of the page since that will be rare
> 2. Use __memcmpeq instead of __memcmpeq_avx2 (it generates undefined
> reference errors)
> 3. Add memmem.c (needs review)
> 4. Add __memcmpeq_avx2 and __memcmpeq_avx512 to ifunc-impl-list.c (needs
> review)
> 5. Add libc_hidden_builtin_def and MEMMEM to memmem.c (needs review)
>
> Changes in v4:
> 1. Correct the cpu feature checks in ifunc-impl-list.c and memmem.c to
> use AVX512BW and BMI1 for AVX512 and AVX2 and BMI1 for AVX2
> 2. Correct the Makefile to use the appropriate flags
> 3. Rename memmem-vectorized-avx.h to memmem-avx-base.h
> 4. Remove unused vector macros (POPCNT and LZCNT)
>
> ---
> string/memmem.c | 7 +-
> sysdeps/x86_64/multiarch/Makefile | 5 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 ++
> sysdeps/x86_64/multiarch/memmem-avx-base.h | 217 +++++++++++++++++++++
> sysdeps/x86_64/multiarch/memmem-avx2.c | 3 +
> sysdeps/x86_64/multiarch/memmem-avx512.c | 16 ++
> sysdeps/x86_64/multiarch/memmem.c | 67 +++++++
> 7 files changed, 326 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/x86_64/multiarch/memmem-avx-base.h
> create mode 100644 sysdeps/x86_64/multiarch/memmem-avx2.c
> create mode 100644 sysdeps/x86_64/multiarch/memmem-avx512.c
> create mode 100644 sysdeps/x86_64/multiarch/memmem.c
>
> diff --git a/string/memmem.c b/string/memmem.c
> index 6badc1c3bd..62654b4bd0 100644
> --- a/string/memmem.c
> +++ b/string/memmem.c
> @@ -32,6 +32,10 @@
>
> #undef memmem
>
> +#ifndef MEMMEM
> +# define MEMMEM __memmem
> +#endif
> +
> /* Hash character pairs so a small shift table can be used. All bits of
> p[0] are included, but not all bits from p[-1]. So if two equal hashes
> match on p[-1], p[0] matches too. Hash collisions are harmless and result
> @@ -50,7 +54,7 @@
> The limit also implies worst-case performance is linear.
> Needles larger than 256 characters use the linear-time Two-Way algorithm. */
> void *
> -__memmem(const void *haystack, size_t hs_len,
> +MEMMEM(const void *haystack, size_t hs_len,
> const void *needle, size_t ne_len)
> {
> const unsigned char *hs = (const unsigned char *)haystack;
> @@ -122,3 +126,4 @@ const void *needle, size_t ne_len)
> libc_hidden_def(__memmem)
> weak_alias(__memmem, memmem)
> libc_hidden_weak(memmem)
> +libc_hidden_builtin_def(MEMMEM)
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index e1e894c963..95c95eee4b 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -15,6 +15,8 @@ sysdep_routines += \
> memcmpeq-avx2-rtm \
> memcmpeq-evex \
> memcmpeq-sse2 \
> + memmem-avx2 \
> + memmem-avx512 \
> memmove-avx-unaligned-erms \
> memmove-avx-unaligned-erms-rtm \
> memmove-avx512-no-vzeroupper \
> @@ -122,6 +124,9 @@ sysdep_routines += \
> varshift \
> # sysdep_routines
>
> +CFLAGS-memmem-avx2.c += -mavx2 -mbmi -O3
> +CFLAGS-memmem-avx512.c += -mavx512f -mavx512bw -mbmi -O3
> +
> CFLAGS-strcspn-sse4.c += -msse4
> CFLAGS-strpbrk-sse4.c += -msse4
> CFLAGS-strspn-sse4.c += -msse4
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 5427ff1907..300d4064ae 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -799,6 +799,18 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_generic))
>
> + /* Support sysdeps/x86_64/multiarch/memmem.c. */
> + IFUNC_IMPL (i, name, memmem,
> + IFUNC_IMPL_ADD (array, i, memmem,
> + (CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI1)),
> + __memmem_avx512)
> + IFUNC_IMPL_ADD (array, i, memmem,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI1)),
> + __memmem_avx2)
> + IFUNC_IMPL_ADD (array, i, memmem, 1, __memmem_generic))
> +
> /* Support sysdeps/x86_64/multiarch/wcschr.c. */
> IFUNC_IMPL (i, name, wcschr,
> X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr,
> diff --git a/sysdeps/x86_64/multiarch/memmem-avx-base.h b/sysdeps/x86_64/multiarch/memmem-avx-base.h
> new file mode 100644
> index 0000000000..46883bb121
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmem-avx-base.h
> @@ -0,0 +1,217 @@
> +#include <immintrin.h>
> +#include <inttypes.h>
> +#include <string.h>
> +#include <libc-pointer-arith.h>
> +
> +#ifndef FUNC_NAME
> +# define __memmem_avx2
> +#endif
> +#ifndef VEC
> +# define VEC __m256i
> +#endif
> +#ifndef VEC_SIZE
> +# define VEC_SIZE sizeof (VEC)
> +#endif
> +#ifndef MASK
> +# define MASK uint32_t
> +#endif
> +#ifndef MASK_SIZE
> +# define MASK_SIZE sizeof (MASK)
> +#endif
> +#ifndef LOAD
> +# define LOAD(x) _mm256_load_si256 (x)
> +#endif
> +#ifndef LOADU
> +# define LOADU(x) _mm256_loadu_si256 (x)
> +#endif
> +#ifndef STORE
> +# define STORE(dst, src) _mm256_store_si256 (dst, src)
> +#endif
> +#ifndef STOREU
> +# define STOREU(dst, src) _mm256_storeu_si256 (dst, src)
> +#endif
> +#ifndef CMPEQ8_MASK
> +# define CMPEQ8_MASK(x, y) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (x, y))
> +#endif
> +#ifndef SETZERO
> +# define SETZERO(x) _mm256_setzero_si256 (x)
> +#endif
> +#ifndef SETONE8
> +# define SETONE8(x) _mm256_set1_epi8 (x)
> +#endif
> +#ifndef TZCNT
> +# define TZCNT(x) _tzcnt_u32 (x)
> +#endif
> +#ifndef BLSR
> +# define BLSR(x) _blsr_u32 (x)
> +#endif
> +#ifndef ONES
> +# define ONES ((MASK) -1)
> +#endif
> +
Things like `ONE`, `VEC_SIZE`, `MASK_SIZE`, etc...
can just be unconditionally defined in memmem-avx-base
Also, instead of having a default in memmem-avx-base,
think the rest should be just be defined in the memem-avx2/memem-avx512.
Otherwise theres not really preventing the `TZCNT`/`BLSR` from becoming
desynced with `MASK` (likewise for the VEC defines).
> +#ifndef MEMCMPEQ
> +# define MEMCMPEQ __memcmpeq
> +#endif
> +#ifndef MEMCPY
> +# define MEMCPY memcpy
> +#endif
> +#ifndef MEMCHR
> +# define MEMCHR memchr
> +#endif
> +#ifndef PAGE_SIZE
> +# define PAGE_SIZE 4096
> +#endif
> +#define MIN(x, y) (((x) < (y)) ? (x) : (y))
> +
> +static inline void *
> +find_rarest_byte (const void *ne, size_t n)
> +{
> + /* Lower is rarer. The table is based on the
> + *.c and *.h files in glibc. */
> + static const unsigned char rarebyte_table[256]
> + = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4,
> + 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53,
> + 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219,
> + 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223,
> + 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216,
> + 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204,
> + 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246,
> + 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243,
> + 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190,
> + 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87,
> + 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115,
> + 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137,
> + 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73,
> + 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149,
> + 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123,
> + 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67,
> + 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34,
> + 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48,
> + 85, 49, 50, 51 };
> + const unsigned char *rare = (const unsigned char *) ne;
> + const unsigned char *p = (const unsigned char *) ne;
> + int c_rare = rarebyte_table[*rare];
> + int c;
> + for (; n--; ++p)
> + {
> + c = rarebyte_table[*p];
> + if (c < c_rare)
> + {
> + rare = p;
> + c_rare = c;
> + }
> + }
> + return (void *) rare;
> +}
> +
> +void *
> +FUNC_NAME (const void *hs, size_t hs_len, const void *ne, size_t ne_len)
> +{
> + if (ne_len == 1)
> + return (void *) MEMCHR (hs, *(unsigned char *) ne, hs_len);
> + if (__glibc_unlikely (ne_len == 0))
> + return (void *) hs;
> + if (__glibc_unlikely (hs_len < ne_len))
> + return NULL;
> + VEC hv0, hv1, hv, nv;
> + MASK i, hm0, hm1, m, cmpm;
> + const unsigned int matchsh = ne_len < VEC_SIZE ? VEC_SIZE - ne_len : 0;
> + const MASK matchm = ONES << matchsh;
> + const unsigned char *h = (const unsigned char *) hs;
> + const unsigned char *const end = h + hs_len - ne_len;
> + const unsigned char *hp;
> + size_t shift = PTR_DIFF (find_rarest_byte (ne, ne_len), ne);
think ne_len here should be probably limitted to something like
MIN(ne_len, VEC_SIZE).
> + if (shift == ne_len - 1)
> + --shift;
> + const VEC nv0 = SETONE8 (*((char *) ne + shift));
> + const VEC nv1 = SETONE8 (*((char *) ne + shift + 1));
> + h += shift;
> + if (PTR_DIFF (PTR_ALIGN_UP (ne, PAGE_SIZE), ne) >= VEC_SIZE
> + || PTR_IS_ALIGNED (ne, PAGE_SIZE) || ne_len >= VEC_SIZE)
> + nv = LOADU ((VEC *) ne);
think simpler logic is `(ne & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE)`
> + else
> + MEMCPY (&nv, ne, MIN (VEC_SIZE, ne_len));
> + const unsigned int off = PTR_DIFF (h, PTR_ALIGN_DOWN (h, VEC_SIZE));
> + unsigned int off2 = (PTR_DIFF (end, (h - shift)) < VEC_SIZE)
> + ? VEC_SIZE - (unsigned int) (end - (h - shift)) - 1
> + : 0;
> + h -= off;
> + hv0 = LOAD ((const VEC *) h);
> + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0);
> + hm1 = (MASK) CMPEQ8_MASK (hv0, nv1) >> 1;
> + /* Clear matched bits that are out of bounds. */
> + m = (((hm0 & hm1) >> off) << off2) >> off2;
> + while (m)
> + {
> + i = TZCNT (m);
> + m = BLSR (m);
> + hp = h + off + i - shift;
> + if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE)
> + {
> + hv = LOADU ((VEC *) hp);
> + cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh;
> + if (cmpm == matchm)
> + if (ne_len <= VEC_SIZE
> + || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE,
> + ne_len - VEC_SIZE))
> + return (void *) hp;
> + }
> + else
> + {
> + if (!MEMCMPEQ (hp, ne, ne_len))
> + return (void *) hp;
> + }
> + }
> + h += VEC_SIZE - 1;
> + for (; h - shift + VEC_SIZE <= end; h += VEC_SIZE)
> + {
> + hv0 = LOADU ((const VEC *) h);
> + hv1 = LOAD ((const VEC *) (h + 1));
> + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1);
> + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0);
> + m = hm0 & hm1;
> + while (m)
> + {
> + match:
> + i = TZCNT (m);
> + m = BLSR (m);
> + hp = h + i - shift;
> + if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE)
> + {
> + hv = LOADU ((VEC *) hp);
> + cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh;
> + if (cmpm == matchm)
> + if (ne_len <= VEC_SIZE
> + || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE,
> + ne_len - VEC_SIZE))
> + return (void *) hp;
> + }
> + else
> + {
> + if (!MEMCMPEQ (hp, ne, ne_len))
> + return (void *) hp;
> + }
> + }
> + }
> + if (h - shift <= end)
> + {
> + off2 = VEC_SIZE - (unsigned int) (end - (h - shift)) - 1;
> + hv1 = LOAD ((const VEC *) (h + 1));
> + if (PTR_DIFF (PTR_ALIGN_UP (h, PAGE_SIZE), h) >= VEC_SIZE)
> + {
> + hv0 = LOADU ((const VEC *) h);
> + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1);
> + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0);
> + }
> + else
> + {
> + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1);
> + hm0 = 1 | (MASK) CMPEQ8_MASK (hv1, nv0) << 1;
> + }
> + /* Clear matched bits that are out of bounds. */
> + m = ((hm0 & hm1) << off2) >> off2;
> + if (m)
> + goto match;
> + }
> + return NULL;
> +}
The implementation is ingeneral a bit hard to follow.
Can you
1) comment the implementation. Particularly a bit lost
following the setup code around `off`/`h`/`off2`/`shift`.
> diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c
> new file mode 100644
> index 0000000000..91f5d5d331
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmem-avx2.c
> @@ -0,0 +1,3 @@
> +#define FUNC_NAME __memmem_avx2
> +
> +#include "memmem-avx-base.h"
> diff --git a/sysdeps/x86_64/multiarch/memmem-avx512.c b/sysdeps/x86_64/multiarch/memmem-avx512.c
> new file mode 100644
> index 0000000000..163efa2133
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmem-avx512.c
> @@ -0,0 +1,16 @@
> +#define VEC __m512i
> +#define MASK uint64_t
> +#define LOAD(x) _mm512_load_si512 (x)
> +#define LOADU(x) _mm512_loadu_si512 (x)
> +#define STORE(dst, src) _mm512_store_si512 (dst, src)
> +#define STOREU(dst, src) _mm512_storeu_si512 (dst, src)
> +#define CMPEQ8_MASK(x, y) _mm512_cmpeq_epi8_mask (x, y)
> +#define SETZERO(x) _mm512_setzero_si512 (x)
> +#define SETONE8(x) _mm512_set1_epi8 (x)
> +#define TZCNT(x) _tzcnt_u64 (x)
> +#define BLSR(x) _blsr_u64 (x)
> +#define ONES ((MASK) -1)
> +
> +#define FUNC_NAME __memmem_avx512
> +
> +#include "memmem-avx-base.h"
> diff --git a/sysdeps/x86_64/multiarch/memmem.c b/sysdeps/x86_64/multiarch/memmem.c
> new file mode 100644
> index 0000000000..8fe7b77d33
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmem.c
> @@ -0,0 +1,67 @@
> +/* Multiple versions of memmem.
> + All versions must be listed in ifunc-impl-list.c.
> + Copyright (C) 2012-2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +/* Redefine memmem so that the compiler won't complain about the type
> + mismatch with the IFUNC selector in strong_alias, below. */
> +#undef memmem
> +#define memmem __redirect_memmem
> +#include <string.h>
> +#undef memmem
> +
> +#define MEMMEM __memmem_generic
> +#ifdef SHARED
> +# undef libc_hidden_builtin_def
> +# define libc_hidden_builtin_def(name) \
> + __hidden_ver1 (__memmem_generic, __GI_memmem, __memmem_generic);
> +#endif
> +
> +#include "string/memmem.c"
> +
> +extern __typeof (__redirect_memmem) __memmem_avx2 attribute_hidden;
> +extern __typeof (__redirect_memmem) __memmem_generic attribute_hidden;
> +extern __typeof (__redirect_memmem) __memmem_avx512 attribute_hidden;
> +
> +#define SYMBOL_NAME memmem
> +
> +#include "init-arch.h"
> +
> +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> + ifunc symbol properly. */
> +extern __typeof (__redirect_memmem) __libc_memmem;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> + const struct cpu_features *cpu_features = __get_cpu_features ();
> +
> + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> + && CPU_FEATURE_USABLE_P (cpu_features, BMI1))
> + return __memmem_avx512;
> +
> + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> + && CPU_FEATURE_USABLE_P (cpu_features, BMI1))
> + return __memmem_avx2;
> +
> + return __memmem_generic;
> +}
> +
> +libc_ifunc_redirected (__redirect_memmem, __libc_memmem, IFUNC_SELECTOR ());
> +#undef memmem
> +strong_alias (__libc_memmem, __memmem)
> --
> 2.43.0
>
next prev parent reply other threads:[~2024-02-02 22:48 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-02-01 0:57 James Tirta Halim
2024-02-02 22:47 ` Noah Goldstein [this message]
2024-02-19 12:12 ` James
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to='CAFUsyfJ-hJTY6dBAqn=LAFbPEpy8+OQO4W97_LDXpPp_-1pWCg@mail.gmail.com' \
--to=goldstein.w.n@gmail.com \
--cc=libc-alpha@sourceware.org \
--cc=tirtajames45@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).