public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] Improve strstr performance
@ 2018-06-11 10:31 Wilco Dijkstra
  2018-06-26 11:13 ` Wilco Dijkstra
  2018-06-29 19:51 ` Adhemerval Zanella
  0 siblings, 2 replies; 6+ messages in thread
From: Wilco Dijkstra @ 2018-06-11 10:31 UTC (permalink / raw)
  To: libc-alpha; +Cc: nd

Improve strstr performance.  Strstr tends to be slow because it uses
many calls to memchr and a slow byte loop to scan for the next match.
Performance is significantly improved by using strnlen on larger blocks
and using strchr to search for the next matching character.  strcasestr
can also use strnlen to scan ahead, and memmem can use memchr to check
for the next match.

On the GLIBC bench tests the overall performance gains on Cortex-A72 are:
strstr: +25%
strcasestr: +4.3%
memmem: +18%

On a 256KB dataset strstr performance improves by 67%, strcasestr by 47%.

Tested on AArch64, passes GLIBC tests.

ChangeLog:
2018-06-08  Wilco Dijkstra  <wdijkstr@arm.com>

	* string/memmem.c (FASTSEARCH): Define.	
	* string/str-two-way.h (two_way_short_needle): Minor cleanups.
	Add support for FASTSEARCH.
	* string/strcasestr.c (AVAILABLE): Use read-ahead strnlen.
	* string/strstr.c (AVAILABLE): Use read-ahead strnlen.
	(FASTSEARCH): Define.
--

diff --git a/string/memmem.c b/string/memmem.c
index c17e1cf6a63c0709df0c1c7a45410c5178b2876e..43efaa3fb718e7a22c3b4122c278720768644d0f 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -31,6 +31,7 @@
 
 #define RETURN_TYPE void *
 #define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
+#define FASTSEARCH(S,C,N) (void*) memchr ((void *)(S), (C), (N))
 #include "str-two-way.h"
 
 #undef memmem
diff --git a/string/str-two-way.h b/string/str-two-way.h
index cd2605857d9b5c432c3c2b1c5e35f55d8e9285b6..6ce6798ea4d595bb4c95335233eb92595f447a28 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -281,50 +281,50 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
     }
   else
     {
-      const unsigned char *phaystack = &haystack[suffix];
+      const unsigned char *phaystack;
       /* The comparison always starts from needle[suffix], so cache it
 	 and use an optimized first-character loop.  */
       unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]);
 
-#if CHECK_EOL
-      /* We start matching from the SUFFIX'th element, so make sure we
-	 don't hit '\0' before that.  */
-      if (haystack_len < suffix + 1
-	  && !AVAILABLE (haystack, haystack_len, 0, suffix + 1))
-	return NULL;
-#endif
-
       /* The two halves of needle are distinct; no extra memory is
 	 required, and any mismatch results in a maximal shift.  */
       period = MAX (suffix, needle_len - suffix) + 1;
       j = 0;
-      while (1
-#if !CHECK_EOL
-	     && AVAILABLE (haystack, haystack_len, j, needle_len)
-#endif
-	     )
+      while (AVAILABLE (haystack, haystack_len, j, needle_len))
 	{
 	  unsigned char haystack_char;
 	  const unsigned char *pneedle;
 
-	  /* TODO: The first-character loop can be sped up by adapting
-	     longword-at-a-time implementation of memchr/strchr.  */
-	  if (needle_suffix
+	  phaystack = &haystack[suffix + j];
+
+#ifdef FASTSEARCH
+	  if (*phaystack++ != needle_suffix)
+	    {
+	      phaystack = FASTSEARCH (phaystack, needle_suffix,
+				      haystack_len - needle_len - j);
+	      if (phaystack == NULL)
+		goto ret0;
+	      j = phaystack - &haystack[suffix];
+	      phaystack++;
+	    }
+#else
+	  while (needle_suffix
 	      != (haystack_char = CANON_ELEMENT (*phaystack++)))
 	    {
 	      RET0_IF_0 (haystack_char);
-#if !CHECK_EOL
+# if !CHECK_EOL
 	      ++j;
-#endif
-	      continue;
+	      if (!AVAILABLE (haystack, haystack_len, j, needle_len))
+		goto ret0;
+# endif
 	    }
 
-#if CHECK_EOL
+# if CHECK_EOL
 	  /* Calculate J if it wasn't kept up-to-date in the first-character
 	     loop.  */
 	  j = phaystack - &haystack[suffix] - 1;
+# endif
 #endif
-
 	  /* Scan for matches in right half.  */
 	  i = suffix + 1;
 	  pneedle = &needle[i];
@@ -338,6 +338,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
 		}
 	      ++i;
 	    }
+#if CHECK_EOL
+	  /* Update minimal length of haystack.  */
+	  if (phaystack > haystack + haystack_len)
+	    haystack_len = phaystack - haystack;
+#endif
 	  if (needle_len <= i)
 	    {
 	      /* Scan for matches in left half.  */
@@ -360,13 +365,6 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
 	    }
 	  else
 	    j += i - suffix + 1;
-
-#if CHECK_EOL
-	  if (!AVAILABLE (haystack, haystack_len, j, needle_len))
-	    break;
-#endif
-
-	  phaystack = &haystack[suffix + j];
 	}
     }
  ret0: __attribute__ ((unused))
diff --git a/string/strcasestr.c b/string/strcasestr.c
index 90ba189790481a58eb7627eb8c396530f988f985..d54c4b56853735fb8f5ebd59419b222ca8827651 100644
--- a/string/strcasestr.c
+++ b/string/strcasestr.c
@@ -37,8 +37,8 @@
 /* Two-Way algorithm.  */
 #define RETURN_TYPE char *
 #define AVAILABLE(h, h_l, j, n_l)			\
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
-   && ((h_l) = (j) + (n_l)))
+  (((j) + (n_l) <= (h_l)) || ((h_l) += strnlen ((void*)((h) + (h_l)), 512), \
+			      (j) + (n_l) <= (h_l)))
 #define CHECK_EOL (1)
 #define RET0_IF_0(a) if (!a) goto ret0
 #define CANON_ELEMENT(c) TOLOWER (c)
diff --git a/string/strstr.c b/string/strstr.c
index b3b5deb673758510616d32849bcf2fc15ce941cd..346f303c7423c95de0fbcdeb1a9f31293b303372 100644
--- a/string/strstr.c
+++ b/string/strstr.c
@@ -33,10 +33,11 @@
 
 #define RETURN_TYPE char *
 #define AVAILABLE(h, h_l, j, n_l)			\
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
-   && ((h_l) = (j) + (n_l)))
+  (((j) + (n_l) <= (h_l)) || ((h_l) += strnlen ((void*)((h) + (h_l)), 512), \
+			      (j) + (n_l) <= (h_l)))
 #define CHECK_EOL (1)
 #define RET0_IF_0(a) if (!a) goto ret0
+#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C))
 #include "str-two-way.h"
 
 #undef strstr

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Improve strstr performance
  2018-06-11 10:31 [PATCH] Improve strstr performance Wilco Dijkstra
@ 2018-06-26 11:13 ` Wilco Dijkstra
  2018-06-29 19:51 ` Adhemerval Zanella
  1 sibling, 0 replies; 6+ messages in thread
From: Wilco Dijkstra @ 2018-06-26 11:13 UTC (permalink / raw)
  To: libc-alpha; +Cc: nd


ping


From: Wilco Dijkstra
Sent: 11 June 2018 11:31
To: libc-alpha@sourceware.org
Cc: nd
Subject: [PATCH] Improve strstr performance
  

Improve strstr performance.  Strstr tends to be slow because it uses
many calls to memchr and a slow byte loop to scan for the next match.
Performance is significantly improved by using strnlen on larger blocks
and using strchr to search for the next matching character.  strcasestr
can also use strnlen to scan ahead, and memmem can use memchr to check
for the next match.

On the GLIBC bench tests the overall performance gains on Cortex-A72 are:
strstr: +25%
strcasestr: +4.3%
memmem: +18%

On a 256KB dataset strstr performance improves by 67%, strcasestr by 47%.

Tested on AArch64, passes GLIBC tests.

ChangeLog:
2018-06-08  Wilco Dijkstra  <wdijkstr@arm.com>

        * string/memmem.c (FASTSEARCH): Define. 
        * string/str-two-way.h (two_way_short_needle): Minor cleanups.
        Add support for FASTSEARCH.
        * string/strcasestr.c (AVAILABLE): Use read-ahead strnlen.
        * string/strstr.c (AVAILABLE): Use read-ahead strnlen.
        (FASTSEARCH): Define.
--

diff --git a/string/memmem.c b/string/memmem.c
index c17e1cf6a63c0709df0c1c7a45410c5178b2876e..43efaa3fb718e7a22c3b4122c278720768644d0f 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -31,6 +31,7 @@
 
 #define RETURN_TYPE void *
 #define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
+#define FASTSEARCH(S,C,N) (void*) memchr ((void *)(S), (C), (N))
 #include "str-two-way.h"
 
 #undef memmem
diff --git a/string/str-two-way.h b/string/str-two-way.h
index cd2605857d9b5c432c3c2b1c5e35f55d8e9285b6..6ce6798ea4d595bb4c95335233eb92595f447a28 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -281,50 +281,50 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
     }
   else
     {
-      const unsigned char *phaystack = &haystack[suffix];
+      const unsigned char *phaystack;
       /* The comparison always starts from needle[suffix], so cache it
          and use an optimized first-character loop.  */
       unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]);
 
-#if CHECK_EOL
-      /* We start matching from the SUFFIX'th element, so make sure we
-        don't hit '\0' before that.  */
-      if (haystack_len < suffix + 1
-         && !AVAILABLE (haystack, haystack_len, 0, suffix + 1))
-       return NULL;
-#endif
-
       /* The two halves of needle are distinct; no extra memory is
          required, and any mismatch results in a maximal shift.  */
       period = MAX (suffix, needle_len - suffix) + 1;
       j = 0;
-      while (1
-#if !CHECK_EOL
-            && AVAILABLE (haystack, haystack_len, j, needle_len)
-#endif
-            )
+      while (AVAILABLE (haystack, haystack_len, j, needle_len))
         {
           unsigned char haystack_char;
           const unsigned char *pneedle;
 
-         /* TODO: The first-character loop can be sped up by adapting
-            longword-at-a-time implementation of memchr/strchr.  */
-         if (needle_suffix
+         phaystack = &haystack[suffix + j];
+
+#ifdef FASTSEARCH
+         if (*phaystack++ != needle_suffix)
+           {
+             phaystack = FASTSEARCH (phaystack, needle_suffix,
+                                     haystack_len - needle_len - j);
+             if (phaystack == NULL)
+               goto ret0;
+             j = phaystack - &haystack[suffix];
+             phaystack++;
+           }
+#else
+         while (needle_suffix
               != (haystack_char = CANON_ELEMENT (*phaystack++)))
             {
               RET0_IF_0 (haystack_char);
-#if !CHECK_EOL
+# if !CHECK_EOL
               ++j;
-#endif
-             continue;
+             if (!AVAILABLE (haystack, haystack_len, j, needle_len))
+               goto ret0;
+# endif
             }
 
-#if CHECK_EOL
+# if CHECK_EOL
           /* Calculate J if it wasn't kept up-to-date in the first-character
              loop.  */
           j = phaystack - &haystack[suffix] - 1;
+# endif
 #endif
-
           /* Scan for matches in right half.  */
           i = suffix + 1;
           pneedle = &needle[i];
@@ -338,6 +338,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
                 }
               ++i;
             }
+#if CHECK_EOL
+         /* Update minimal length of haystack.  */
+         if (phaystack > haystack + haystack_len)
+           haystack_len = phaystack - haystack;
+#endif
           if (needle_len <= i)
             {
               /* Scan for matches in left half.  */
@@ -360,13 +365,6 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
             }
           else
             j += i - suffix + 1;
-
-#if CHECK_EOL
-         if (!AVAILABLE (haystack, haystack_len, j, needle_len))
-           break;
-#endif
-
-         phaystack = &haystack[suffix + j];
         }
     }
  ret0: __attribute__ ((unused))
diff --git a/string/strcasestr.c b/string/strcasestr.c
index 90ba189790481a58eb7627eb8c396530f988f985..d54c4b56853735fb8f5ebd59419b222ca8827651 100644
--- a/string/strcasestr.c
+++ b/string/strcasestr.c
@@ -37,8 +37,8 @@
 /* Two-Way algorithm.  */
 #define RETURN_TYPE char *
 #define AVAILABLE(h, h_l, j, n_l)                       \
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))    \
-   && ((h_l) = (j) + (n_l)))
+  (((j) + (n_l) <= (h_l)) || ((h_l) += strnlen ((void*)((h) + (h_l)), 512), \
+                             (j) + (n_l) <= (h_l)))
 #define CHECK_EOL (1)
 #define RET0_IF_0(a) if (!a) goto ret0
 #define CANON_ELEMENT(c) TOLOWER (c)
diff --git a/string/strstr.c b/string/strstr.c
index b3b5deb673758510616d32849bcf2fc15ce941cd..346f303c7423c95de0fbcdeb1a9f31293b303372 100644
--- a/string/strstr.c
+++ b/string/strstr.c
@@ -33,10 +33,11 @@
 
 #define RETURN_TYPE char *
 #define AVAILABLE(h, h_l, j, n_l)                       \
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))    \
-   && ((h_l) = (j) + (n_l)))
+  (((j) + (n_l) <= (h_l)) || ((h_l) += strnlen ((void*)((h) + (h_l)), 512), \
+                             (j) + (n_l) <= (h_l)))
 #define CHECK_EOL (1)
 #define RET0_IF_0(a) if (!a) goto ret0
+#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C))
 #include "str-two-way.h"
 
 #undef strstr
    

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Improve strstr performance
  2018-06-11 10:31 [PATCH] Improve strstr performance Wilco Dijkstra
  2018-06-26 11:13 ` Wilco Dijkstra
@ 2018-06-29 19:51 ` Adhemerval Zanella
  1 sibling, 0 replies; 6+ messages in thread
From: Adhemerval Zanella @ 2018-06-29 19:51 UTC (permalink / raw)
  To: libc-alpha



On 11/06/2018 07:31, Wilco Dijkstra wrote:
> Improve strstr performance.  Strstr tends to be slow because it uses
> many calls to memchr and a slow byte loop to scan for the next match.
> Performance is significantly improved by using strnlen on larger blocks
> and using strchr to search for the next matching character.  strcasestr
> can also use strnlen to scan ahead, and memmem can use memchr to check
> for the next match.
> 
> On the GLIBC bench tests the overall performance gains on Cortex-A72 are:
> strstr: +25%
> strcasestr: +4.3%
> memmem: +18%
> 
> On a 256KB dataset strstr performance improves by 67%, strcasestr by 47%.
> 
> Tested on AArch64, passes GLIBC tests.

I am checking if it is worth to keep the implementation in sync with gnulib,
since it contains some changes over the years which did not make into glibc.

I create a branch with these changes [1] along with your changes, but
results are mixed:

   - For strstr/strcasestr performance is the same for neddles larger than
     4, but faster for shorter needles.

   - For memmem performance is faster for needle of size 2, but slower for
     large haystack.

My plan is to check if we can simplify the code a bit, as gnulib one has
done; but in overall the change should be ok (modulo one fix below).

[1] https://sourceware.org/git/?p=glibc.git;a=shortlog;h=refs/heads/azanella/str-two-way-optimization

> diff --git a/string/strcasestr.c b/string/strcasestr.c
> index 90ba189790481a58eb7627eb8c396530f988f985..d54c4b56853735fb8f5ebd59419b222ca8827651 100644
> --- a/string/strcasestr.c
> +++ b/string/strcasestr.c
> @@ -37,8 +37,8 @@
>  /* Two-Way algorithm.  */
>  #define RETURN_TYPE char *
>  #define AVAILABLE(h, h_l, j, n_l)			\
> -  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
> -   && ((h_l) = (j) + (n_l)))
> +  (((j) + (n_l) <= (h_l)) || ((h_l) += strnlen ((void*)((h) + (h_l)), 512), \
> +			      (j) + (n_l) <= (h_l)))

Why did it not trigger linknamespace issues with strnlen usage? For instance:

$ cat conform/ISO/assert.h/linknamespace.out 
[initial] __assert_fail -> [libc.a(assert.o)] __dcgettext -> [libc.a(dcgettext.o)] __dcigettext -> [libc.a(dcigettext.o)] strstr -> [libc.a(strstr.o)] strnlen

On the branch I created it contains a fix for this [2].

[2] https://sourceware.org/git/?p=glibc.git;a=commit;h=6cc58a9708ff256323c34d200fd9362f25bf66f7

>  #define CHECK_EOL (1)
>  #define RET0_IF_0(a) if (!a) goto ret0
>  #define CANON_ELEMENT(c) TOLOWER (c)
> diff --git a/string/strstr.c b/string/strstr.c
> index b3b5deb673758510616d32849bcf2fc15ce941cd..346f303c7423c95de0fbcdeb1a9f31293b303372 100644
> --- a/string/strstr.c
> +++ b/string/strstr.c
> @@ -33,10 +33,11 @@
>  
>  #define RETURN_TYPE char *
>  #define AVAILABLE(h, h_l, j, n_l)			\
> -  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
> -   && ((h_l) = (j) + (n_l)))
> +  (((j) + (n_l) <= (h_l)) || ((h_l) += strnlen ((void*)((h) + (h_l)), 512), \
> +			      (j) + (n_l) <= (h_l)))
>  #define CHECK_EOL (1)
>  #define RET0_IF_0(a) if (!a) goto ret0
> +#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C))
>  #include "str-two-way.h"
>  
>  #undef strstr
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Improve strstr performance
  2018-07-03 12:49 ` Adhemerval Zanella
@ 2018-07-17 12:54   ` Wilco Dijkstra
  0 siblings, 0 replies; 6+ messages in thread
From: Wilco Dijkstra @ 2018-07-17 12:54 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha, nd

Adhemerval Zanella wrote:
  
>> Overall the synced version makes the code far more readable, so it is a good idea
>> as a basis for further improvements. However it will only be faster with my patch
>> fully merged.
>
> My intention was to check if it would be worth to sync gnulib and then
> apply your change on top or the other way around.  It seems it would be
> better to add your patch and I will propose another with some gnulib
> changes (mainly documentation and small optimizations).  Some glibc
> differences seems more to get around some compiler missed optimization
> than actually code improvements (99677e575504 for instance).

I've committed my strstr patch now. If I had known about the gnulib version,
I would have started from that! It looks it can be simplified further - due to
strchr/memchr/strnlen using a single pointer everywhere would be best. 

I don't think there is any point working around missed compiler optimizations -
it seems better to fix GCC if the issue still exists.

Wilco

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Improve strstr performance
  2018-06-29 23:23 Wilco Dijkstra
@ 2018-07-03 12:49 ` Adhemerval Zanella
  2018-07-17 12:54   ` Wilco Dijkstra
  0 siblings, 1 reply; 6+ messages in thread
From: Adhemerval Zanella @ 2018-07-03 12:49 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: libc-alpha, nd



On 29/06/2018 20:23, Wilco Dijkstra wrote:
> Adhemerval wrote:
>> I am checking if it is worth to keep the implementation in sync with gnulib,
>> since it contains some changes over the years which did not make into glibc.
>>
>> I create a branch with these changes [1] along with your changes, but
>> results are mixed:
> 
> Well that's no surprise - the synced code looks much cleaner but will be slower
> without my patch. As is it removes the search for the next matching character,
> so it uses AVAILBLE every character which means even more calls to memchr...
> Your version of my patch removes the FASTSEARCH optimization (maybe missing
> str-two-way.h?), so it loses the main performance gain.
> 
> Overall the synced version makes the code far more readable, so it is a good idea
> as a basis for further improvements. However it will only be faster with my patch
> fully merged.

My intention was to check if it would be worth to sync gnulib and then
apply your change on top or the other way around.  It seems it would be
better to add your patch and I will propose another with some gnulib
changes (mainly documentation and small optimizations).  Some glibc
differences seems more to get around some compiler missed optimization
than actually code improvements (99677e575504 for instance).

> 
> This looks dubious and will slow down strcasestr:
> 
> -#define TOLOWER(Ch) tolower (Ch)
> +#define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch))
> 
> Wilco
> 

This was corrected on glibc side and did not get back into gnulib.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Improve strstr performance
@ 2018-06-29 23:23 Wilco Dijkstra
  2018-07-03 12:49 ` Adhemerval Zanella
  0 siblings, 1 reply; 6+ messages in thread
From: Wilco Dijkstra @ 2018-06-29 23:23 UTC (permalink / raw)
  To: adhemerval.zanella; +Cc: libc-alpha, nd

Adhemerval wrote:
> I am checking if it is worth to keep the implementation in sync with gnulib,
> since it contains some changes over the years which did not make into glibc.
>
> I create a branch with these changes [1] along with your changes, but
> results are mixed:

Well that's no surprise - the synced code looks much cleaner but will be slower
without my patch. As is it removes the search for the next matching character,
so it uses AVAILBLE every character which means even more calls to memchr...
Your version of my patch removes the FASTSEARCH optimization (maybe missing
str-two-way.h?), so it loses the main performance gain.

Overall the synced version makes the code far more readable, so it is a good idea
as a basis for further improvements. However it will only be faster with my patch
fully merged.

This looks dubious and will slow down strcasestr:

-#define TOLOWER(Ch) tolower (Ch)
+#define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch))

Wilco

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2018-07-17 12:54 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-06-11 10:31 [PATCH] Improve strstr performance Wilco Dijkstra
2018-06-26 11:13 ` Wilco Dijkstra
2018-06-29 19:51 ` Adhemerval Zanella
2018-06-29 23:23 Wilco Dijkstra
2018-07-03 12:49 ` Adhemerval Zanella
2018-07-17 12:54   ` Wilco Dijkstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).