diff --git a/ChangeLog b/ChangeLog index 3265e61..3682637 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,46 @@ +2019-09-13 Wilco Dijkstra + + * string/memmem.c (__memmem): Rewrite to improve performance. + +2019-06-12 Wilco Dijkstra + + * string/str-two-way.h (two_way_short_needle): Add inline to avoid + warning. + (two_way_long_needle): Block inlining. + * string/strstr.c (strstr2): Add new function. + (strstr3): Likewise. + (STRSTR): Completely rewrite strstr to improve performance. + +2019-09-13 Wilco Dijkstra + + [BZ #23637] + * string/test-strstr.c (pr23637): New function. + (test_main): Add tests with longer needles. + * string/strcasestr.c (AVAILABLE): Fix readahead distance. + * string/strstr.c (AVAILABLE): Likewise. + +2019-09-13 Rajalakshmi Srinivasaraghavan + + * string/memmem.c: Use memcmp for first match. + +2019-09-13 Wilco Dijkstra + + * string/strcasestr.c (STRCASESTR): Simplify and speedup first match. + * string/strstr.c (AVAILABLE): Likewise. + +2019-09-13 Wilco Dijkstra + + * benchtests/bench-strcasestr.c: Rename __strnlen to strnlen. + * benchtests/bench-strstr.c: Likewise. + * string/memmem.c (FASTSEARCH): Define. + * string/str-two-way.h (two_way_short_needle): Minor cleanups. + Add support for FASTSEARCH. + * string/strcasestr.c (AVAILABLE): Use read-ahead __strnlen. + * string/strstr.c (AVAILABLE): Use read-ahead __strnlen. + (FASTSEARCH): Define. + * string/test-strcasestr.c: Rename __strnlen to strnlen. + * string/test-strstr.c: Likewise. + 2019-09-06 Wilco Dijkstra * manual/tunables.texi (glibc.cpu.name): Add ares tunable. 
diff --git a/benchtests/bench-strcasestr.c b/benchtests/bench-strcasestr.c index e6659ea..4337e0c 100644 --- a/benchtests/bench-strcasestr.c +++ b/benchtests/bench-strcasestr.c @@ -24,6 +24,7 @@ #define STRCASESTR simple_strcasestr #define NO_ALIAS #define __strncasecmp strncasecmp +#define __strnlen strnlen #include "../string/strcasestr.c" diff --git a/benchtests/bench-strstr.c b/benchtests/bench-strstr.c index 86d5e82..b7431de 100644 --- a/benchtests/bench-strstr.c +++ b/benchtests/bench-strstr.c @@ -22,6 +22,9 @@ #define STRSTR simple_strstr +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(X) +#define __strnlen strnlen #include "../string/strstr.c" diff --git a/string/memmem.c b/string/memmem.c index c17e1cf..7fbe1cb 100644 --- a/string/memmem.c +++ b/string/memmem.c @@ -15,67 +15,115 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -/* This particular implementation was written by Eric Blake, 2008. */ - #ifndef _LIBC # include <config.h> #endif -/* Specification of memmem. */ #include <string.h> #ifndef _LIBC -# define __builtin_expect(expr, val) (expr) # define __memmem memmem #endif #define RETURN_TYPE void * #define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l)) +#define FASTSEARCH(S,C,N) (void*) memchr ((void *)(S), (C), (N)) #include "str-two-way.h" #undef memmem -/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK - if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in - HAYSTACK. */ +/* Hash character pairs so a small shift table can be used. All bits of + p[0] are included, but not all bits from p[-1]. So if two equal hashes + match on p[-1], p[0] matches too. Hash collisions are harmless and result + in smaller shifts. */ +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) + +/* Fast memmem algorithm with guaranteed linear-time performance. + Small needles up to size 2 use a dedicated linear search. Longer needles + up to size 256 use a novel modified Horspool algorithm.
It hashes pairs + of characters to quickly skip past mismatches. The main search loop only + exits if the last 2 characters match, avoiding unnecessary calls to memcmp + and allowing for a larger skip if there is no match. A self-adapting + filtering check is used to quickly detect mismatches in long needles. + By limiting the needle length to 256, the shift table can be reduced to 8 + bits per entry, lowering preprocessing overhead and minimizing cache effects. + The limit also implies worst-case performance is linear. + Needles larger than 256 characters use the linear-time Two-Way algorithm. */ void * -__memmem (const void *haystack_start, size_t haystack_len, - const void *needle_start, size_t needle_len) +__memmem (const void *haystack, size_t hs_len, + const void *needle, size_t ne_len) { - /* Abstract memory is considered to be an array of 'unsigned char' values, - not an array of 'char' values. See ISO C 99 section 6.2.6.1. */ - const unsigned char *haystack = (const unsigned char *) haystack_start; - const unsigned char *needle = (const unsigned char *) needle_start; - - if (needle_len == 0) - /* The first occurrence of the empty string is deemed to occur at - the beginning of the string. */ - return (void *) haystack; - - /* Sanity check, otherwise the loop might search through the whole - memory. */ - if (__glibc_unlikely (haystack_len < needle_len)) + const unsigned char *hs = (const unsigned char *) haystack; + const unsigned char *ne = (const unsigned char *) needle; + + if (ne_len == 0) + return (void *) hs; + if (ne_len == 1) + return (void *) memchr (hs, ne[0], hs_len); + + /* Ensure haystack length is >= needle length. */ + if (hs_len < ne_len) return NULL; - /* Use optimizations in memchr when possible, to reduce the search - size of haystack using a linear algorithm with a smaller - coefficient. However, avoid memchr for long needles, since we - can often achieve sublinear performance. 
*/ - if (needle_len < LONG_NEEDLE_THRESHOLD) + const unsigned char *end = hs + hs_len - ne_len; + + if (ne_len == 2) + { + uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1]; + for (hs++; hs <= end && hw != nw; ) + hw = hw << 16 | *++hs; + return hw == nw ? (void *)hs - 1 : NULL; + } + + /* Use Two-Way algorithm for very long needles. */ + if (__builtin_expect (ne_len > 256, 0)) + return two_way_long_needle (hs, hs_len, ne, ne_len); + + uint8_t shift[256]; + size_t tmp, shift1; + size_t m1 = ne_len - 1; + size_t offset = 0; + + memset (shift, 0, sizeof (shift)); + for (int i = 1; i < m1; i++) + shift[hash2 (ne + i)] = i; + /* Shift1 is the amount we can skip after matching the hash of the + needle end but not the full needle. */ + shift1 = m1 - shift[hash2 (ne + m1)]; + shift[hash2 (ne + m1)] = m1; + + for ( ; hs <= end; ) { - haystack = memchr (haystack, *needle, haystack_len); - if (!haystack || __builtin_expect (needle_len == 1, 0)) - return (void *) haystack; - haystack_len -= haystack - (const unsigned char *) haystack_start; - if (haystack_len < needle_len) - return NULL; - return two_way_short_needle (haystack, haystack_len, needle, needle_len); + /* Skip past character pairs not in the needle. */ + do + { + hs += m1; + tmp = shift[hash2 (hs)]; + } + while (tmp == 0 && hs <= end); + + /* If the match is not at the end of the needle, shift to the end + and continue until we match the hash of the needle end. */ + hs -= tmp; + if (tmp < m1) + continue; + + /* Hash of the last 2 characters matches. If the needle is long, + try to quickly filter out mismatches. */ + if (m1 < 15 || memcmp (hs + offset, ne + offset, 8) == 0) + { + if (memcmp (hs, ne, m1) == 0) + return (void *) hs; + + /* Adjust filter offset when it doesn't find the mismatch. */ + offset = (offset >= 8 ? offset : m1) - 8; + } + + /* Skip based on matching the hash of the needle end. 
*/ + hs += shift1; } - else - return two_way_long_needle (haystack, haystack_len, needle, needle_len); + return NULL; } libc_hidden_def (__memmem) weak_alias (__memmem, memmem) libc_hidden_weak (memmem) - -#undef LONG_NEEDLE_THRESHOLD diff --git a/string/str-two-way.h b/string/str-two-way.h index cd260585..358959b 100644 --- a/string/str-two-way.h +++ b/string/str-two-way.h @@ -221,7 +221,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len, most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ -static RETURN_TYPE +static inline RETURN_TYPE two_way_short_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { @@ -281,50 +281,50 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, } else { - const unsigned char *phaystack = &haystack[suffix]; + const unsigned char *phaystack; /* The comparison always starts from needle[suffix], so cache it and use an optimized first-character loop. */ unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]); -#if CHECK_EOL - /* We start matching from the SUFFIX'th element, so make sure we - don't hit '\0' before that. */ - if (haystack_len < suffix + 1 - && !AVAILABLE (haystack, haystack_len, 0, suffix + 1)) - return NULL; -#endif - /* The two halves of needle are distinct; no extra memory is required, and any mismatch results in a maximal shift. */ period = MAX (suffix, needle_len - suffix) + 1; j = 0; - while (1 -#if !CHECK_EOL - && AVAILABLE (haystack, haystack_len, j, needle_len) -#endif - ) + while (AVAILABLE (haystack, haystack_len, j, needle_len)) { unsigned char haystack_char; const unsigned char *pneedle; - /* TODO: The first-character loop can be sped up by adapting - longword-at-a-time implementation of memchr/strchr. 
*/ - if (needle_suffix + phaystack = &haystack[suffix + j]; + +#ifdef FASTSEARCH + if (*phaystack++ != needle_suffix) + { + phaystack = FASTSEARCH (phaystack, needle_suffix, + haystack_len - needle_len - j); + if (phaystack == NULL) + goto ret0; + j = phaystack - &haystack[suffix]; + phaystack++; + } +#else + while (needle_suffix != (haystack_char = CANON_ELEMENT (*phaystack++))) { RET0_IF_0 (haystack_char); -#if !CHECK_EOL +# if !CHECK_EOL ++j; -#endif - continue; + if (!AVAILABLE (haystack, haystack_len, j, needle_len)) + goto ret0; +# endif } -#if CHECK_EOL +# if CHECK_EOL /* Calculate J if it wasn't kept up-to-date in the first-character loop. */ j = phaystack - &haystack[suffix] - 1; +# endif #endif - /* Scan for matches in right half. */ i = suffix + 1; pneedle = &needle[i]; @@ -338,6 +338,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, } ++i; } +#if CHECK_EOL + /* Update minimal length of haystack. */ + if (phaystack > haystack + haystack_len) + haystack_len = phaystack - haystack; +#endif if (needle_len <= i) { /* Scan for matches in left half. */ @@ -360,13 +365,6 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, } else j += i - suffix + 1; - -#if CHECK_EOL - if (!AVAILABLE (haystack, haystack_len, j, needle_len)) - break; -#endif - - phaystack = &haystack[suffix + j]; } } ret0: __attribute__ ((unused)) @@ -384,8 +382,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and - sublinear performance is not possible. */ -static RETURN_TYPE + sublinear performance is not possible. + + Since this function is large and complex, block inlining to avoid + slowing down the common case of small needles. 
*/ +__attribute__((noinline)) static RETURN_TYPE two_way_long_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { diff --git a/string/strcasestr.c b/string/strcasestr.c index 90ba189..8aa7603 100644 --- a/string/strcasestr.c +++ b/string/strcasestr.c @@ -37,8 +37,9 @@ /* Two-Way algorithm. */ #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ - (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \ - && ((h_l) = (j) + (n_l))) + (((j) + (n_l) <= (h_l)) \ + || ((h_l) += __strnlen ((void*)((h) + (h_l)), (n_l) + 512), \ + (j) + (n_l) <= (h_l))) #define CHECK_EOL (1) #define RET0_IF_0(a) if (!a) goto ret0 #define CANON_ELEMENT(c) TOLOWER (c) @@ -58,31 +59,22 @@ case-insensitive comparison. This function gives unspecified results in multibyte locales. */ char * -STRCASESTR (const char *haystack_start, const char *needle_start) +STRCASESTR (const char *haystack, const char *needle) { - const char *haystack = haystack_start; - const char *needle = needle_start; size_t needle_len; /* Length of NEEDLE. */ size_t haystack_len; /* Known minimum length of HAYSTACK. */ - bool ok = true; /* True if NEEDLE is prefix of HAYSTACK. */ - - /* Determine length of NEEDLE, and in the process, make sure - HAYSTACK is at least as long (no point processing all of a long - NEEDLE if HAYSTACK is too short). */ - while (*haystack && *needle) - { - ok &= (TOLOWER ((unsigned char) *haystack) - == TOLOWER ((unsigned char) *needle)); - haystack++; - needle++; - } - if (*needle) + + /* Handle empty NEEDLE special case. */ + if (needle[0] == '\0') + return (char *) haystack; + + /* Ensure HAYSTACK length is at least as long as NEEDLE length. + Since a match may occur early on in a huge HAYSTACK, use strnlen + and read ahead a few cachelines for improved performance. 
+ needle_len = strlen (needle); + haystack_len = __strnlen (haystack, needle_len + 256); + if (haystack_len < needle_len) return NULL; - if (ok) - return (char *) haystack_start; - needle_len = needle - needle_start; - haystack = haystack_start + 1; - haystack_len = needle_len - 1; /* Perform the search. Abstract memory is considered to be an array of 'unsigned char' values, not an array of 'char' values. See @@ -90,10 +82,10 @@ STRCASESTR (const char *haystack_start, const char *needle_start) if (needle_len < LONG_NEEDLE_THRESHOLD) return two_way_short_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle_start, + (const unsigned char *) needle, needle_len); return two_way_long_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle_start, + (const unsigned char *) needle, needle_len); } diff --git a/string/strstr.c b/string/strstr.c index b3b5deb..7ffb18a 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -16,27 +16,17 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -/* This particular implementation was written by Eric Blake, 2008. */ - #ifndef _LIBC # include <config.h> #endif -/* Specification of strstr. */ #include <string.h> -#include <stdbool.h> - -#ifndef _LIBC -# define __builtin_expect(expr, val) (expr) -#endif - #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ - (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \ - && ((h_l) = (j) + (n_l))) -#define CHECK_EOL (1) -#define RET0_IF_0(a) if (!a) goto ret0 + (((j) + (n_l) <= (h_l)) \ + || ((h_l) += __strnlen ((void*)((h) + (h_l)), (n_l) + 512), \ + (j) + (n_l) <= (h_l))) #include "str-two-way.h" #undef strstr @@ -45,48 +35,128 @@ #define STRSTR strstr #endif -/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK - if NEEDLE is empty, otherwise NULL if NEEDLE is not found in - HAYSTACK.
*/ +static inline char * +strstr2 (const unsigned char *hs, const unsigned char *ne) +{ + uint32_t h1 = (ne[0] << 16) | ne[1]; + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 << 16) | c; + return h1 == h2 ? (char *)hs - 2 : NULL; +} + +static inline char * +strstr3 (const unsigned char *hs, const unsigned char *ne) +{ + uint32_t h1 = ((uint32_t)ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8); + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 | c) << 8; + return h1 == h2 ? (char *)hs - 3 : NULL; +} + +/* Hash character pairs so a small shift table can be used. All bits of + p[0] are included, but not all bits from p[-1]. So if two equal hashes + match on p[-1], p[0] matches too. Hash collisions are harmless and result + in smaller shifts. */ +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) + +/* Fast strstr algorithm with guaranteed linear-time performance. + Small needles up to size 3 use a dedicated linear search. Longer needles + up to size 256 use a novel modified Horspool algorithm. It hashes pairs + of characters to quickly skip past mismatches. The main search loop only + exits if the last 2 characters match, avoiding unnecessary calls to memcmp + and allowing for a larger skip if there is no match. A self-adapting + filtering check is used to quickly detect mismatches in long needles. + By limiting the needle length to 256, the shift table can be reduced to 8 + bits per entry, lowering preprocessing overhead and minimizing cache effects. + The limit also implies worst-case performance is linear. + Needles larger than 256 characters use the linear-time Two-Way algorithm. */ char * -STRSTR (const char *haystack_start, const char *needle_start) +STRSTR (const char *haystack, const char *needle) { - const char *haystack = haystack_start; - const char *needle = needle_start; - size_t needle_len; /* Length of NEEDLE. 
*/ - size_t haystack_len; /* Known minimum length of HAYSTACK. */ - bool ok = true; /* True if NEEDLE is prefix of HAYSTACK. */ - - /* Determine length of NEEDLE, and in the process, make sure - HAYSTACK is at least as long (no point processing all of a long - NEEDLE if HAYSTACK is too short). */ - while (*haystack && *needle) - ok &= *haystack++ == *needle++; - if (*needle) + const unsigned char *hs = (const unsigned char *) haystack; + const unsigned char *ne = (const unsigned char *) needle; + + /* Handle short needle special cases first. */ + if (ne[0] == '\0') + return (char *)hs; + hs = (const unsigned char *)strchr ((const char*)hs, ne[0]); + if (hs == NULL || ne[1] == '\0') + return (char*)hs; + if (ne[2] == '\0') + return strstr2 (hs, ne); + if (ne[3] == '\0') + return strstr3 (hs, ne); + + /* Ensure haystack length is at least as long as needle length. + Since a match may occur early on in a huge haystack, use strnlen + and read ahead a few cachelines for improved performance. */ + size_t ne_len = strlen ((const char*)ne); + size_t hs_len = __strnlen ((const char*)hs, ne_len | 512); + if (hs_len < ne_len) return NULL; - if (ok) - return (char *) haystack_start; - - /* Reduce the size of haystack using strchr, since it has a smaller - linear coefficient than the Two-Way algorithm. */ - needle_len = needle - needle_start; - haystack = strchr (haystack_start + 1, *needle_start); - if (!haystack || __builtin_expect (needle_len == 1, 0)) - return (char *) haystack; - needle -= needle_len; - haystack_len = (haystack > haystack_start + needle_len ? 1 - : needle_len + haystack_start - haystack); - - /* Perform the search. Abstract memory is considered to be an array - of 'unsigned char' values, not an array of 'char' values. See - ISO C 99 section 6.2.6.1. 
*/ - if (needle_len < LONG_NEEDLE_THRESHOLD) - return two_way_short_needle ((const unsigned char *) haystack, - haystack_len, - (const unsigned char *) needle, needle_len); - return two_way_long_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle, needle_len); + + /* Check whether we have a match. This improves performance since we + avoid initialization overheads. */ + if (memcmp (hs, ne, ne_len) == 0) + return (char *) hs; + + /* Use Two-Way algorithm for very long needles. */ + if (__glibc_unlikely (ne_len > 256)) + return two_way_long_needle (hs, hs_len, ne, ne_len); + + const unsigned char *end = hs + hs_len - ne_len; + uint8_t shift[256]; + size_t tmp, shift1; + size_t m1 = ne_len - 1; + size_t offset = 0; + + /* Initialize bad character shift hash table. */ + memset (shift, 0, sizeof (shift)); + for (int i = 1; i < m1; i++) + shift[hash2 (ne + i)] = i; + /* Shift1 is the amount we can skip after matching the hash of the + needle end but not the full needle. */ + shift1 = m1 - shift[hash2 (ne + m1)]; + shift[hash2 (ne + m1)] = m1; + + while (1) + { + if (__glibc_unlikely (hs > end)) + { + end += __strnlen ((const char*)end + m1 + 1, 2048); + if (hs > end) + return NULL; + } + + /* Skip past character pairs not in the needle. */ + do + { + hs += m1; + tmp = shift[hash2 (hs)]; + } + while (tmp == 0 && hs <= end); + + /* If the match is not at the end of the needle, shift to the end + and continue until we match the hash of the needle end. */ + hs -= tmp; + if (tmp < m1) + continue; + + /* Hash of the last 2 characters matches. If the needle is long, + try to quickly filter out mismatches. */ + if (m1 < 15 || memcmp (hs + offset, ne + offset, 8) == 0) + { + if (memcmp (hs, ne, m1) == 0) + return (void *) hs; + + /* Adjust filter offset when it doesn't find the mismatch. */ + offset = (offset >= 8 ? offset : m1) - 8; + } + + /* Skip based on matching the hash of the needle end. 
*/ + hs += shift1; + } } libc_hidden_builtin_def (strstr) - -#undef LONG_NEEDLE_THRESHOLD diff --git a/string/test-strcasestr.c b/string/test-strcasestr.c index 2b0a38e..9b1088d 100644 --- a/string/test-strcasestr.c +++ b/string/test-strcasestr.c @@ -25,6 +25,7 @@ #define STRCASESTR simple_strcasestr #define NO_ALIAS #define __strncasecmp strncasecmp +#define __strnlen strnlen #include "strcasestr.c" diff --git a/string/test-strstr.c b/string/test-strstr.c index acf6ff8..5861b01 100644 --- a/string/test-strstr.c +++ b/string/test-strstr.c @@ -24,6 +24,7 @@ #define STRSTR simple_strstr #define libc_hidden_builtin_def(arg) /* nothing */ +#define __strnlen strnlen #include "strstr.c" @@ -150,6 +151,32 @@ check2 (void) } } +#define N 1024 + +static void +pr23637 (void) +{ + char *h = (char*) buf1; + char *n = (char*) buf2; + + for (int i = 0; i < N; i++) + { + n[i] = 'x'; + h[i] = ' '; + h[i + N] = 'x'; + } + + n[N] = '\0'; + h[N * 2] = '\0'; + + /* Ensure we don't match at the first 'x'. */ + h[0] = 'x'; + + char *exp_result = stupid_strstr (h, n); + FOR_EACH_IMPL (impl, 0) + check_result (impl, h, n, exp_result); +} + static int test_main (void) { @@ -157,6 +184,7 @@ test_main (void) check1 (); check2 (); + pr23637 (); printf ("%23s", ""); FOR_EACH_IMPL (impl, 0) @@ -201,6 +229,9 @@ test_main (void) do_test (15, 9, hlen, klen, 1); do_test (15, 15, hlen, klen, 0); do_test (15, 15, hlen, klen, 1); + + do_test (15, 15, hlen + klen * 4, klen * 4, 0); + do_test (15, 15, hlen + klen * 4, klen * 4, 1); } do_test (0, 0, page_size - 1, 16, 0);