diff --git a/ChangeLog b/ChangeLog index cb52249..416c61a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,25 @@ +2019-09-13 Wilco Dijkstra + + * string/memmem.c (__memmem): Rewrite to improve performance. + +2019-06-12 Wilco Dijkstra + + * string/str-two-way.h (two_way_short_needle): Add inline to avoid + warning. + (two_way_long_needle): Block inlining. + * string/strstr.c (strstr2): Add new function. + (strstr3): Likewise. + (STRSTR): Completely rewrite strstr to improve performance. + +2019-09-13 Rajalakshmi Srinivasaraghavan + + * string/memmem.c: Use memcmp for first match. + +2019-09-13 Wilco Dijkstra + + * string/strcasestr.c (STRCASESTR): Simplify and speedup first match. + * string/strstr.c (AVAILABLE): Likewise. + 2019-09-06 Wilco Dijkstra * manual/tunables.texi (glibc.cpu.name): Add ares tunable. diff --git a/string/memmem.c b/string/memmem.c index 43efaa3..7fbe1cb 100644 --- a/string/memmem.c +++ b/string/memmem.c @@ -15,17 +15,13 @@ License along with the GNU C Library; if not, see . */ -/* This particular implementation was written by Eric Blake, 2008. */ - #ifndef _LIBC # include #endif -/* Specification of memmem. */ #include #ifndef _LIBC -# define __builtin_expect(expr, val) (expr) # define __memmem memmem #endif @@ -36,47 +32,98 @@ #undef memmem -/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK - if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in - HAYSTACK. */ +/* Hash character pairs so a small shift table can be used. All bits of + p[0] are included, but not all bits from p[-1]. So if two equal hashes + match on p[-1], p[0] matches too. Hash collisions are harmless and result + in smaller shifts. */ +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) + +/* Fast memmem algorithm with guaranteed linear-time performance. + Small needles up to size 2 use a dedicated linear search. Longer needles + up to size 256 use a novel modified Horspool algorithm. It hashes pairs + of characters to quickly skip past mismatches. The main search loop only + exits if the last 2 characters match, avoiding unnecessary calls to memcmp + and allowing for a larger skip if there is no match. A self-adapting + filtering check is used to quickly detect mismatches in long needles. + By limiting the needle length to 256, the shift table can be reduced to 8 + bits per entry, lowering preprocessing overhead and minimizing cache effects. + The limit also implies worst-case performance is linear. + Needles larger than 256 characters use the linear-time Two-Way algorithm. */ void * -__memmem (const void *haystack_start, size_t haystack_len, - const void *needle_start, size_t needle_len) +__memmem (const void *haystack, size_t hs_len, + const void *needle, size_t ne_len) { - /* Abstract memory is considered to be an array of 'unsigned char' values, - not an array of 'char' values. See ISO C 99 section 6.2.6.1. */ - const unsigned char *haystack = (const unsigned char *) haystack_start; - const unsigned char *needle = (const unsigned char *) needle_start; - - if (needle_len == 0) - /* The first occurrence of the empty string is deemed to occur at - the beginning of the string. */ - return (void *) haystack; - - /* Sanity check, otherwise the loop might search through the whole - memory. */ - if (__glibc_unlikely (haystack_len < needle_len)) + const unsigned char *hs = (const unsigned char *) haystack; + const unsigned char *ne = (const unsigned char *) needle; + + if (ne_len == 0) + return (void *) hs; + if (ne_len == 1) + return (void *) memchr (hs, ne[0], hs_len); + + /* Ensure haystack length is >= needle length. */ + if (hs_len < ne_len) return NULL; - /* Use optimizations in memchr when possible, to reduce the search - size of haystack using a linear algorithm with a smaller - coefficient. However, avoid memchr for long needles, since we - can often achieve sublinear performance. */ - if (needle_len < LONG_NEEDLE_THRESHOLD) + const unsigned char *end = hs + hs_len - ne_len; + + if (ne_len == 2) + { + uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1]; + for (hs++; hs <= end && hw != nw; ) + hw = hw << 16 | *++hs; + return hw == nw ? (void *)hs - 1 : NULL; + } + + /* Use Two-Way algorithm for very long needles. */ + if (__builtin_expect (ne_len > 256, 0)) + return two_way_long_needle (hs, hs_len, ne, ne_len); + + uint8_t shift[256]; + size_t tmp, shift1; + size_t m1 = ne_len - 1; + size_t offset = 0; + + memset (shift, 0, sizeof (shift)); + for (int i = 1; i < m1; i++) + shift[hash2 (ne + i)] = i; + /* Shift1 is the amount we can skip after matching the hash of the + needle end but not the full needle. */ + shift1 = m1 - shift[hash2 (ne + m1)]; + shift[hash2 (ne + m1)] = m1; + + for ( ; hs <= end; ) { - haystack = memchr (haystack, *needle, haystack_len); - if (!haystack || __builtin_expect (needle_len == 1, 0)) - return (void *) haystack; - haystack_len -= haystack - (const unsigned char *) haystack_start; - if (haystack_len < needle_len) - return NULL; - return two_way_short_needle (haystack, haystack_len, needle, needle_len); + /* Skip past character pairs not in the needle. */ + do + { + hs += m1; + tmp = shift[hash2 (hs)]; + } + while (tmp == 0 && hs <= end); + + /* If the match is not at the end of the needle, shift to the end + and continue until we match the hash of the needle end. */ + hs -= tmp; + if (tmp < m1) + continue; + + /* Hash of the last 2 characters matches. If the needle is long, + try to quickly filter out mismatches. */ + if (m1 < 15 || memcmp (hs + offset, ne + offset, 8) == 0) + { + if (memcmp (hs, ne, m1) == 0) + return (void *) hs; + + /* Adjust filter offset when it doesn't find the mismatch. */ + offset = (offset >= 8 ? offset : m1) - 8; + } + + /* Skip based on matching the hash of the needle end. */ + hs += shift1; } - else - return two_way_long_needle (haystack, haystack_len, needle, needle_len); + return NULL; } libc_hidden_def (__memmem) weak_alias (__memmem, memmem) libc_hidden_weak (memmem) - -#undef LONG_NEEDLE_THRESHOLD diff --git a/string/str-two-way.h b/string/str-two-way.h index 523d946..358959b 100644 --- a/string/str-two-way.h +++ b/string/str-two-way.h @@ -221,7 +221,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len, most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ -static RETURN_TYPE +static inline RETURN_TYPE two_way_short_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { @@ -382,8 +382,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and - sublinear performance is not possible. */ -static RETURN_TYPE + sublinear performance is not possible. + + Since this function is large and complex, block inlining to avoid + slowing down the common case of small needles. */ +__attribute__((noinline)) static RETURN_TYPE two_way_long_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { diff --git a/string/strcasestr.c b/string/strcasestr.c index 421764b..8aa7603 100644 --- a/string/strcasestr.c +++ b/string/strcasestr.c @@ -59,31 +59,22 @@ case-insensitive comparison. This function gives unspecified results in multibyte locales. */ char * -STRCASESTR (const char *haystack_start, const char *needle_start) +STRCASESTR (const char *haystack, const char *needle) { - const char *haystack = haystack_start; - const char *needle = needle_start; size_t needle_len; /* Length of NEEDLE. */ size_t haystack_len; /* Known minimum length of HAYSTACK. */ - bool ok = true; /* True if NEEDLE is prefix of HAYSTACK. */ - - /* Determine length of NEEDLE, and in the process, make sure - HAYSTACK is at least as long (no point processing all of a long - NEEDLE if HAYSTACK is too short). */ - while (*haystack && *needle) - { - ok &= (TOLOWER ((unsigned char) *haystack) - == TOLOWER ((unsigned char) *needle)); - haystack++; - needle++; - } - if (*needle) + + /* Handle empty NEEDLE special case. */ + if (needle[0] == '\0') + return (char *) haystack; + + /* Ensure HAYSTACK length is at least as long as NEEDLE length. + Since a match may occur early on in a huge HAYSTACK, use strnlen + and read ahead a few cachelines for improved performance. */ + needle_len = strlen (needle); + haystack_len = __strnlen (haystack, needle_len + 256); + if (haystack_len < needle_len) return NULL; - if (ok) - return (char *) haystack_start; - needle_len = needle - needle_start; - haystack = haystack_start + 1; - haystack_len = needle_len - 1; /* Perform the search. Abstract memory is considered to be an array of 'unsigned char' values, not an array of 'char' values. See @@ -91,10 +82,10 @@ STRCASESTR (const char *haystack_start, const char *needle_start) if (needle_len < LONG_NEEDLE_THRESHOLD) return two_way_short_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle_start, + (const unsigned char *) needle, needle_len); return two_way_long_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle_start, + (const unsigned char *) needle, needle_len); } diff --git a/string/strstr.c b/string/strstr.c index 79ebcc7..7ffb18a 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -16,29 +16,17 @@ License along with the GNU C Library; if not, see . */ -/* This particular implementation was written by Eric Blake, 2008. */ - #ifndef _LIBC # include #endif -/* Specification of strstr. */ #include -#include - -#ifndef _LIBC -# define __builtin_expect(expr, val) (expr) -#endif - #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ (((j) + (n_l) <= (h_l)) \ || ((h_l) += __strnlen ((void*)((h) + (h_l)), (n_l) + 512), \ (j) + (n_l) <= (h_l))) -#define CHECK_EOL (1) -#define RET0_IF_0(a) if (!a) goto ret0 -#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C)) #include "str-two-way.h" #undef strstr @@ -47,48 +35,128 @@ #define STRSTR strstr #endif -/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK - if NEEDLE is empty, otherwise NULL if NEEDLE is not found in - HAYSTACK. */ +static inline char * +strstr2 (const unsigned char *hs, const unsigned char *ne) +{ + uint32_t h1 = (ne[0] << 16) | ne[1]; + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 << 16) | c; + return h1 == h2 ? (char *)hs - 2 : NULL; +} + +static inline char * +strstr3 (const unsigned char *hs, const unsigned char *ne) +{ + uint32_t h1 = ((uint32_t)ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8); + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 | c) << 8; + return h1 == h2 ? (char *)hs - 3 : NULL; +} + +/* Hash character pairs so a small shift table can be used. All bits of + p[0] are included, but not all bits from p[-1]. So if two equal hashes + match on p[-1], p[0] matches too. Hash collisions are harmless and result + in smaller shifts. */ +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) + +/* Fast strstr algorithm with guaranteed linear-time performance. + Small needles up to size 3 use a dedicated linear search. Longer needles + up to size 256 use a novel modified Horspool algorithm. It hashes pairs + of characters to quickly skip past mismatches. The main search loop only + exits if the last 2 characters match, avoiding unnecessary calls to memcmp + and allowing for a larger skip if there is no match. A self-adapting + filtering check is used to quickly detect mismatches in long needles. + By limiting the needle length to 256, the shift table can be reduced to 8 + bits per entry, lowering preprocessing overhead and minimizing cache effects. + The limit also implies worst-case performance is linear. + Needles larger than 256 characters use the linear-time Two-Way algorithm. */ char * -STRSTR (const char *haystack_start, const char *needle_start) +STRSTR (const char *haystack, const char *needle) { - const char *haystack = haystack_start; - const char *needle = needle_start; - size_t needle_len; /* Length of NEEDLE. */ - size_t haystack_len; /* Known minimum length of HAYSTACK. */ - bool ok = true; /* True if NEEDLE is prefix of HAYSTACK. */ - - /* Determine length of NEEDLE, and in the process, make sure - HAYSTACK is at least as long (no point processing all of a long - NEEDLE if HAYSTACK is too short). */ - while (*haystack && *needle) - ok &= *haystack++ == *needle++; - if (*needle) + const unsigned char *hs = (const unsigned char *) haystack; + const unsigned char *ne = (const unsigned char *) needle; + + /* Handle short needle special cases first. */ + if (ne[0] == '\0') + return (char *)hs; + hs = (const unsigned char *)strchr ((const char*)hs, ne[0]); + if (hs == NULL || ne[1] == '\0') + return (char*)hs; + if (ne[2] == '\0') + return strstr2 (hs, ne); + if (ne[3] == '\0') + return strstr3 (hs, ne); + + /* Ensure haystack length is at least as long as needle length. + Since a match may occur early on in a huge haystack, use strnlen + and read ahead a few cachelines for improved performance. */ + size_t ne_len = strlen ((const char*)ne); + size_t hs_len = __strnlen ((const char*)hs, ne_len | 512); + if (hs_len < ne_len) return NULL; - if (ok) - return (char *) haystack_start; - - /* Reduce the size of haystack using strchr, since it has a smaller - linear coefficient than the Two-Way algorithm. */ - needle_len = needle - needle_start; - haystack = strchr (haystack_start + 1, *needle_start); - if (!haystack || __builtin_expect (needle_len == 1, 0)) - return (char *) haystack; - needle -= needle_len; - haystack_len = (haystack > haystack_start + needle_len ? 1 - : needle_len + haystack_start - haystack); - - /* Perform the search. Abstract memory is considered to be an array - of 'unsigned char' values, not an array of 'char' values. See - ISO C 99 section 6.2.6.1. */ - if (needle_len < LONG_NEEDLE_THRESHOLD) - return two_way_short_needle ((const unsigned char *) haystack, - haystack_len, - (const unsigned char *) needle, needle_len); - return two_way_long_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle, needle_len); + + /* Check whether we have a match. This improves performance since we + avoid initialization overheads. */ + if (memcmp (hs, ne, ne_len) == 0) + return (char *) hs; + + /* Use Two-Way algorithm for very long needles. */ + if (__glibc_unlikely (ne_len > 256)) + return two_way_long_needle (hs, hs_len, ne, ne_len); + + const unsigned char *end = hs + hs_len - ne_len; + uint8_t shift[256]; + size_t tmp, shift1; + size_t m1 = ne_len - 1; + size_t offset = 0; + + /* Initialize bad character shift hash table. */ + memset (shift, 0, sizeof (shift)); + for (int i = 1; i < m1; i++) + shift[hash2 (ne + i)] = i; + /* Shift1 is the amount we can skip after matching the hash of the + needle end but not the full needle. */ + shift1 = m1 - shift[hash2 (ne + m1)]; + shift[hash2 (ne + m1)] = m1; + + while (1) + { + if (__glibc_unlikely (hs > end)) + { + end += __strnlen ((const char*)end + m1 + 1, 2048); + if (hs > end) + return NULL; + } + + /* Skip past character pairs not in the needle. */ + do + { + hs += m1; + tmp = shift[hash2 (hs)]; + } + while (tmp == 0 && hs <= end); + + /* If the match is not at the end of the needle, shift to the end + and continue until we match the hash of the needle end. */ + hs -= tmp; + if (tmp < m1) + continue; + + /* Hash of the last 2 characters matches. If the needle is long, + try to quickly filter out mismatches. */ + if (m1 < 15 || memcmp (hs + offset, ne + offset, 8) == 0) + { + if (memcmp (hs, ne, m1) == 0) + return (void *) hs; + + /* Adjust filter offset when it doesn't find the mismatch. */ + offset = (offset >= 8 ? offset : m1) - 8; + } + + /* Skip based on matching the hash of the needle end. */ + hs += shift1; + } } libc_hidden_builtin_def (strstr) - -#undef LONG_NEEDLE_THRESHOLD