public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Optimize strchr (s, 0) to strlen
@ 2016-04-18 17:01 Wilco Dijkstra
  2016-04-18 17:09 ` Jakub Jelinek
                   ` (2 more replies)
  0 siblings, 3 replies; 37+ messages in thread
From: Wilco Dijkstra @ 2016-04-18 17:01 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd

Optimize strchr (s, 0) to s + strlen (s).  strchr (s, 0) appears a common
idiom for finding the end of a string, however it is not a very efficient
way of doing so.  Strlen is a much simpler operation which is significantly
faster (eg. on x86 strlen is 50% faster for strings of 8 bytes and about
twice as fast as strchr on strings of 1KB).

OK for trunk?

ChangeLog:
2016-04-18  Wilco Dijkstra  <wdijkstr@arm.com>

gcc/
	* gcc/builtins.c (fold_builtin_strchr): Optimize strchr (s, 0) into
	strlen.

testsuite/
	* gcc/testsuite/gcc.dg/strlenopt-20.c: Update test.
	* gcc/testsuite/gcc.dg/strlenopt-21.c: Likewise.
	* gcc/testsuite/gcc.dg/strlenopt-22.c: Likewise.
	* gcc/testsuite/gcc.dg/strlenopt-26.c: Likewise.
	* gcc/testsuite/gcc.dg/strlenopt-5.c: Likewise.
	* gcc/testsuite/gcc.dg/strlenopt-7.c: Likewise.
	* gcc/testsuite/gcc.dg/strlenopt-9.c: Likewise.

--

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 058ecc39aab205099713e503861103ce6ba5ee6d..150e707178a3e119d42ef630b384da3eaf7b2182 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -8567,20 +8567,20 @@ fold_builtin_strchr (location_t loc, tree s1, tree s2, tree type)
   else
     {
       const char *p1;
+      char c;
 
       if (TREE_CODE (s2) != INTEGER_CST)
 	return NULL_TREE;
 
+      if (target_char_cast (s2, &c))
+	return NULL_TREE;
+
       p1 = c_getstr (s1);
       if (p1 != NULL)
 	{
-	  char c;
 	  const char *r;
 	  tree tem;
 
-	  if (target_char_cast (s2, &c))
-	    return NULL_TREE;
-
 	  r = strchr (p1, c);
 
 	  if (r == NULL)
@@ -8590,6 +8590,20 @@ fold_builtin_strchr (location_t loc, tree s1, tree s2, tree type)
 	  tem = fold_build_pointer_plus_hwi_loc (loc, s1, r - p1);
 	  return fold_convert_loc (loc, type, tem);
 	}
+      else if (c == 0)
+	{
+	  tree fn = builtin_decl_implicit (BUILT_IN_STRLEN);
+	  if (!fn)
+	    return NULL_TREE;
+
+	  s1 = builtin_save_expr (s1);
+
+	  /* Transform strchr (s1, '\0') to s1 + strlen (s1).  */
+	  fn = build_call_expr_loc (loc, fn, 1, s1);
+	  tree tem = fold_build_pointer_plus (s1, fn);
+	  return fold_convert_loc (loc, type, tem);
+	}
+
       return NULL_TREE;
     }
 }
diff --git a/gcc/testsuite/gcc.dg/strlenopt-20.c b/gcc/testsuite/gcc.dg/strlenopt-20.c
index a83e845c26d88e5acdcabf142f7b319136663488..7b483eaeac1aa47278111a92148a16f00b2aaa2d 100644
--- a/gcc/testsuite/gcc.dg/strlenopt-20.c
+++ b/gcc/testsuite/gcc.dg/strlenopt-20.c
@@ -86,9 +86,9 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 4 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
-/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
diff --git a/gcc/testsuite/gcc.dg/strlenopt-21.c b/gcc/testsuite/gcc.dg/strlenopt-21.c
index e22fa9fca9ba14354db2cd5f602283b64bd8dcac..05b85a49dde0a7f5d269174fd4269e40be910dbd 100644
--- a/gcc/testsuite/gcc.dg/strlenopt-21.c
+++ b/gcc/testsuite/gcc.dg/strlenopt-21.c
@@ -57,9 +57,9 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 3 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
-/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
diff --git a/gcc/testsuite/gcc.dg/strlenopt-22.c b/gcc/testsuite/gcc.dg/strlenopt-22.c
index aa55f5ebd6a2d4803ee9a7fd60fc538d86f47124..b4ef772f0e59252f10a5419ede6837b3c8ca8265 100644
--- a/gcc/testsuite/gcc.dg/strlenopt-22.c
+++ b/gcc/testsuite/gcc.dg/strlenopt-22.c
@@ -31,9 +31,9 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
-/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
diff --git a/gcc/testsuite/gcc.dg/strlenopt-26.c b/gcc/testsuite/gcc.dg/strlenopt-26.c
index 4bd54bef540806ca90d2b9bcfc33eb531991c967..da2f465a5b5003fa5dca05f3a6ee00e97b98b5dd 100644
--- a/gcc/testsuite/gcc.dg/strlenopt-26.c
+++ b/gcc/testsuite/gcc.dg/strlenopt-26.c
@@ -21,4 +21,5 @@ main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
diff --git a/gcc/testsuite/gcc.dg/strlenopt-5.c b/gcc/testsuite/gcc.dg/strlenopt-5.c
index 1b006a93045599a75bdb10a39e86ffa59b475c83..a24aea44e8b00ff7b35a907aaa941b4c509642c4 100644
--- a/gcc/testsuite/gcc.dg/strlenopt-5.c
+++ b/gcc/testsuite/gcc.dg/strlenopt-5.c
@@ -48,9 +48,9 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
-/* { dg-final { scan-tree-dump-times "strchr \\(" 2 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
diff --git a/gcc/testsuite/gcc.dg/strlenopt-7.c b/gcc/testsuite/gcc.dg/strlenopt-7.c
index 3ae1e2cb3f0d08c8b05107c4e65b67bdb39cf7ab..aa53d7e75254dfe56c93172afc49f95e5b7901e6 100644
--- a/gcc/testsuite/gcc.dg/strlenopt-7.c
+++ b/gcc/testsuite/gcc.dg/strlenopt-7.c
@@ -40,11 +40,11 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
-/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "\\*r_\[0-9\]* = 0;" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "return 3;" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/strlenopt-9.c b/gcc/testsuite/gcc.dg/strlenopt-9.c
index b0406b162d48fca375883035043b0c50b9db61a1..e5e276210ba4b7d75867605f1ecf5c06eb970ef5 100644
--- a/gcc/testsuite/gcc.dg/strlenopt-9.c
+++ b/gcc/testsuite/gcc.dg/strlenopt-9.c
@@ -98,10 +98,10 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 5 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
-/* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-18 17:01 [PATCH] Optimize strchr (s, 0) to strlen Wilco Dijkstra
@ 2016-04-18 17:09 ` Jakub Jelinek
  2016-04-18 17:31   ` Wilco Dijkstra
  2016-04-18 22:55 ` Oleg Endo
  2016-04-19  8:14 ` Richard Biener
  2 siblings, 1 reply; 37+ messages in thread
From: Jakub Jelinek @ 2016-04-18 17:09 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: gcc-patches, nd

On Mon, Apr 18, 2016 at 05:00:45PM +0000, Wilco Dijkstra wrote:
> Optimize strchr (s, 0) to s + strlen (s).  strchr (s, 0) appears a common
> idiom for finding the end of a string, however it is not a very efficient
> way of doing so.  Strlen is a much simpler operation which is significantly
> faster (eg. on x86 strlen is 50% faster for strings of 8 bytes and about
> twice as fast as strchr on strings of 1KB).

That is generally a bad idea, it really depends on the target.
So, perhaps such decision should be done on a case by case basis,
and certainly not this early.
Often strchr (s, 0) should be just optimized as rawmemchr (s, 0) instead.

	Jakub

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-18 17:09 ` Jakub Jelinek
@ 2016-04-18 17:31   ` Wilco Dijkstra
  0 siblings, 0 replies; 37+ messages in thread
From: Wilco Dijkstra @ 2016-04-18 17:31 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches, nd

Jakub Jelinek <jakub@redhat.com> wrote:
> On Mon, Apr 18, 2016 at 05:00:45PM +0000, Wilco Dijkstra wrote:
>> Optimize strchr (s, 0) to s + strlen (s).  strchr (s, 0) appears a common
>> idiom for finding the end of a string, however it is not a very efficient
>> way of doing so.  Strlen is a much simpler operation which is significantly
>> faster (eg. on x86 strlen is 50% faster for strings of 8 bytes and about
>> twice as fast as strchr on strings of 1KB).
>
> That is generally a bad idea, it really depends on the target.
> So, perhaps such decision should be done on a case by case basis,
> and certainly not this early.
> Often strchr (s, 0) should be just optimized as rawmemchr (s, 0) instead.

I don't see how it would depend on the target - under what circumstances could
strchr possibly be faster than strlen? The former must do 2 comparisons per 
character - the latter only 1.

Note rawmemchr is a non-standard function, ie. GCC would need to be told
whether the target library supports it, so it would require new builtins and
infrastructure.

Also rawmemchr is significantly slower than strlen on most targets, even when
an assembly implementation is available. That's why I posted this patch for GLIBC:
https://sourceware.org/ml/libc-alpha/2016-04/msg00382.html

Wilco

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-18 17:01 [PATCH] Optimize strchr (s, 0) to strlen Wilco Dijkstra
  2016-04-18 17:09 ` Jakub Jelinek
@ 2016-04-18 22:55 ` Oleg Endo
  2016-04-19  8:14 ` Richard Biener
  2 siblings, 0 replies; 37+ messages in thread
From: Oleg Endo @ 2016-04-18 22:55 UTC (permalink / raw)
  To: Wilco Dijkstra, gcc-patches; +Cc: nd

On Mon, 2016-04-18 at 17:00 +0000, Wilco Dijkstra wrote:
> Optimize strchr (s, 0) to s + strlen (s).  strchr (s, 0) appears a
> common
> idiom for finding the end of a string, however it is not a very
> efficient
> way of doing so.  Strlen is a much simpler operation which is
> significantly
> faster (eg. on x86 strlen is 50% faster for strings of 8 bytes and
> about
> twice as fast as strchr on strings of 1KB).
> 
> OK for trunk?

Can you please file this as PR 61056?

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61056


Cheers,
Oleg


> 
> ChangeLog:
> 2016-04-18  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> gcc/
> 	* gcc/builtins.c (fold_builtin_strchr): Optimize strchr (s, 0)
> into
> 	strlen.
> 
> testsuite/
> 	* gcc/testsuite/gcc.dg/strlenopt-20.c: Update test.
> 	* gcc/testsuite/gcc.dg/strlenopt-21.c: Likewise.
> 	* gcc/testsuite/gcc.dg/strlenopt-22.c: Likewise.
> 	* gcc/testsuite/gcc.dg/strlenopt-26.c: Likewise.
> 	* gcc/testsuite/gcc.dg/strlenopt-5.c: Likewise.
> 	* gcc/testsuite/gcc.dg/strlenopt-7.c: Likewise.
> 	* gcc/testsuite/gcc.dg/strlenopt-9.c: Likewise.
> 
> --
> 
> diff --git a/gcc/builtins.c b/gcc/builtins.c
> index
> 058ecc39aab205099713e503861103ce6ba5ee6d..150e707178a3e119d42ef630b38
> 4da3eaf7b2182 100644
> --- a/gcc/builtins.c
> +++ b/gcc/builtins.c
> @@ -8567,20 +8567,20 @@ fold_builtin_strchr (location_t loc, tree s1,
> tree s2, tree type)
>    else
>      {
>        const char *p1;
> +      char c;
>  
>        if (TREE_CODE (s2) != INTEGER_CST)
>  	return NULL_TREE;
>  
> +      if (target_char_cast (s2, &c))
> +	return NULL_TREE;
> +
>        p1 = c_getstr (s1);
>        if (p1 != NULL)
>  	{
> -	  char c;
>  	  const char *r;
>  	  tree tem;
>  
> -	  if (target_char_cast (s2, &c))
> -	    return NULL_TREE;
> -
>  	  r = strchr (p1, c);
>  
>  	  if (r == NULL)
> @@ -8590,6 +8590,20 @@ fold_builtin_strchr (location_t loc, tree s1,
> tree s2, tree type)
>  	  tem = fold_build_pointer_plus_hwi_loc (loc, s1, r - p1);
>  	  return fold_convert_loc (loc, type, tem);
>  	}
> +      else if (c == 0)
> +	{
> +	  tree fn = builtin_decl_implicit (BUILT_IN_STRLEN);
> +	  if (!fn)
> +	    return NULL_TREE;
> +
> +	  s1 = builtin_save_expr (s1);
> +
> +	  /* Transform strchr (s1, '\0') to s1 + strlen (s1).  */
> +	  fn = build_call_expr_loc (loc, fn, 1, s1);
> +	  tree tem = fold_build_pointer_plus (s1, fn);
> +	  return fold_convert_loc (loc, type, tem);
> +	}
> +
>        return NULL_TREE;
>      }
>  }
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-20.c
> b/gcc/testsuite/gcc.dg/strlenopt-20.c
> index
> a83e845c26d88e5acdcabf142f7b319136663488..7b483eaeac1aa47278111a92148
> a16f00b2aaa2d 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-20.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-20.c
> @@ -86,9 +86,9 @@ main ()
>    return 0;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 4 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-21.c
> b/gcc/testsuite/gcc.dg/strlenopt-21.c
> index
> e22fa9fca9ba14354db2cd5f602283b64bd8dcac..05b85a49dde0a7f5d269174fd42
> 69e40be910dbd 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-21.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-21.c
> @@ -57,9 +57,9 @@ main ()
>    return 0;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 3 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-22.c
> b/gcc/testsuite/gcc.dg/strlenopt-22.c
> index
> aa55f5ebd6a2d4803ee9a7fd60fc538d86f47124..b4ef772f0e59252f10a5419ede6
> 837b3c8ca8265 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-22.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-22.c
> @@ -31,9 +31,9 @@ main ()
>    return 0;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-26.c
> b/gcc/testsuite/gcc.dg/strlenopt-26.c
> index
> 4bd54bef540806ca90d2b9bcfc33eb531991c967..da2f465a5b5003fa5dca05f3a6e
> e00e97b98b5dd 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-26.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-26.c
> @@ -21,4 +21,5 @@ main (void)
>    return 0;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-5.c
> b/gcc/testsuite/gcc.dg/strlenopt-5.c
> index
> 1b006a93045599a75bdb10a39e86ffa59b475c83..a24aea44e8b00ff7b35a907aaa9
> 41b4c509642c4 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-5.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-5.c
> @@ -48,9 +48,9 @@ main ()
>    return 0;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 2 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-7.c
> b/gcc/testsuite/gcc.dg/strlenopt-7.c
> index
> 3ae1e2cb3f0d08c8b05107c4e65b67bdb39cf7ab..aa53d7e75254dfe56c93172afc4
> 9f95e5b7901e6 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-7.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-7.c
> @@ -40,11 +40,11 @@ main ()
>    return 0;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "\\*r_\[0-9\]* = 0;" 1 "strlen"
> } } */
>  /* { dg-final { scan-tree-dump-times "return 3;" 1 "optimized" } }
> */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-9.c
> b/gcc/testsuite/gcc.dg/strlenopt-9.c
> index
> b0406b162d48fca375883035043b0c50b9db61a1..e5e276210ba4b7d75867605f1ec
> f5c06eb970ef5 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-9.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-9.c
> @@ -98,10 +98,10 @@ main ()
>    return 0;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 5 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } }
> */
> 

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-18 17:01 [PATCH] Optimize strchr (s, 0) to strlen Wilco Dijkstra
  2016-04-18 17:09 ` Jakub Jelinek
  2016-04-18 22:55 ` Oleg Endo
@ 2016-04-19  8:14 ` Richard Biener
  2016-04-19 16:33   ` Wilco Dijkstra
  2 siblings, 1 reply; 37+ messages in thread
From: Richard Biener @ 2016-04-19  8:14 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: gcc-patches, nd

On Mon, Apr 18, 2016 at 7:00 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> Optimize strchr (s, 0) to s + strlen (s).  strchr (s, 0) appears a common
> idiom for finding the end of a string, however it is not a very efficient
> way of doing so.  Strlen is a much simpler operation which is significantly
> faster (eg. on x86 strlen is 50% faster for strings of 8 bytes and about
> twice as fast as strchr on strings of 1KB).
>
> OK for trunk?

This folding should be added to gimple-fold.c:gimple_fold_builtin instead,
the builtins.c foldings are purerly for folding to constants nowadays.

Richard.

> ChangeLog:
> 2016-04-18  Wilco Dijkstra  <wdijkstr@arm.com>
>
> gcc/
>         * gcc/builtins.c (fold_builtin_strchr): Optimize strchr (s, 0) into
>         strlen.
>
> testsuite/
>         * gcc/testsuite/gcc.dg/strlenopt-20.c: Update test.
>         * gcc/testsuite/gcc.dg/strlenopt-21.c: Likewise.
>         * gcc/testsuite/gcc.dg/strlenopt-22.c: Likewise.
>         * gcc/testsuite/gcc.dg/strlenopt-26.c: Likewise.
>         * gcc/testsuite/gcc.dg/strlenopt-5.c: Likewise.
>         * gcc/testsuite/gcc.dg/strlenopt-7.c: Likewise.
>         * gcc/testsuite/gcc.dg/strlenopt-9.c: Likewise.
>
> --
>
> diff --git a/gcc/builtins.c b/gcc/builtins.c
> index 058ecc39aab205099713e503861103ce6ba5ee6d..150e707178a3e119d42ef630b384da3eaf7b2182 100644
> --- a/gcc/builtins.c
> +++ b/gcc/builtins.c
> @@ -8567,20 +8567,20 @@ fold_builtin_strchr (location_t loc, tree s1, tree s2, tree type)
>    else
>      {
>        const char *p1;
> +      char c;
>
>        if (TREE_CODE (s2) != INTEGER_CST)
>         return NULL_TREE;
>
> +      if (target_char_cast (s2, &c))
> +       return NULL_TREE;
> +
>        p1 = c_getstr (s1);
>        if (p1 != NULL)
>         {
> -         char c;
>           const char *r;
>           tree tem;
>
> -         if (target_char_cast (s2, &c))
> -           return NULL_TREE;
> -
>           r = strchr (p1, c);
>
>           if (r == NULL)
> @@ -8590,6 +8590,20 @@ fold_builtin_strchr (location_t loc, tree s1, tree s2, tree type)
>           tem = fold_build_pointer_plus_hwi_loc (loc, s1, r - p1);
>           return fold_convert_loc (loc, type, tem);
>         }
> +      else if (c == 0)
> +       {
> +         tree fn = builtin_decl_implicit (BUILT_IN_STRLEN);
> +         if (!fn)
> +           return NULL_TREE;
> +
> +         s1 = builtin_save_expr (s1);
> +
> +         /* Transform strchr (s1, '\0') to s1 + strlen (s1).  */
> +         fn = build_call_expr_loc (loc, fn, 1, s1);
> +         tree tem = fold_build_pointer_plus (s1, fn);
> +         return fold_convert_loc (loc, type, tem);
> +       }
> +
>        return NULL_TREE;
>      }
>  }
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-20.c b/gcc/testsuite/gcc.dg/strlenopt-20.c
> index a83e845c26d88e5acdcabf142f7b319136663488..7b483eaeac1aa47278111a92148a16f00b2aaa2d 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-20.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-20.c
> @@ -86,9 +86,9 @@ main ()
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 4 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-21.c b/gcc/testsuite/gcc.dg/strlenopt-21.c
> index e22fa9fca9ba14354db2cd5f602283b64bd8dcac..05b85a49dde0a7f5d269174fd4269e40be910dbd 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-21.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-21.c
> @@ -57,9 +57,9 @@ main ()
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 3 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-22.c b/gcc/testsuite/gcc.dg/strlenopt-22.c
> index aa55f5ebd6a2d4803ee9a7fd60fc538d86f47124..b4ef772f0e59252f10a5419ede6837b3c8ca8265 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-22.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-22.c
> @@ -31,9 +31,9 @@ main ()
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-26.c b/gcc/testsuite/gcc.dg/strlenopt-26.c
> index 4bd54bef540806ca90d2b9bcfc33eb531991c967..da2f465a5b5003fa5dca05f3a6ee00e97b98b5dd 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-26.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-26.c
> @@ -21,4 +21,5 @@ main (void)
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-5.c b/gcc/testsuite/gcc.dg/strlenopt-5.c
> index 1b006a93045599a75bdb10a39e86ffa59b475c83..a24aea44e8b00ff7b35a907aaa941b4c509642c4 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-5.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-5.c
> @@ -48,9 +48,9 @@ main ()
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 2 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-7.c b/gcc/testsuite/gcc.dg/strlenopt-7.c
> index 3ae1e2cb3f0d08c8b05107c4e65b67bdb39cf7ab..aa53d7e75254dfe56c93172afc49f95e5b7901e6 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-7.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-7.c
> @@ -40,11 +40,11 @@ main ()
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "\\*r_\[0-9\]* = 0;" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "return 3;" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-9.c b/gcc/testsuite/gcc.dg/strlenopt-9.c
> index b0406b162d48fca375883035043b0c50b9db61a1..e5e276210ba4b7d75867605f1ecf5c06eb970ef5 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-9.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-9.c
> @@ -98,10 +98,10 @@ main ()
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 5 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */
>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-19  8:14 ` Richard Biener
@ 2016-04-19 16:33   ` Wilco Dijkstra
  2016-04-20  8:45     ` Richard Biener
  0 siblings, 1 reply; 37+ messages in thread
From: Wilco Dijkstra @ 2016-04-19 16:33 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd

Richard Biener wrote:
>
> This folding should be added to gimple-fold.c:gimple_fold_builtin instead,
> the builtins.c foldings are purerly for folding to constants nowadays.

So is this better? It's a lot more verbose for something so simple...
Unfortunately match.pd doesn't support this kind of thing either.

Wilco


ChangeLog:
2016-04-19  Wilco Dijkstra  <wdijkstr@arm.com>

gcc/
	* gcc/gimple-fold.c (gimple_fold_builtin_strchr):
	New function to optimize strchr (s, 0) to strlen.
	(gimple_fold_builtin): Add BUILT_IN_STRCHR case.

testsuite/
	* gcc/testsuite/gcc.dg/strlenopt-20.c: Update test.
	* gcc/testsuite/gcc.dg/strlenopt-21.c: Likewise.
	* gcc/testsuite/gcc.dg/strlenopt-22.c: Likewise.
	* gcc/testsuite/gcc.dg/strlenopt-26.c: Likewise.
	* gcc/testsuite/gcc.dg/strlenopt-5.c: Likewise.
	* gcc/testsuite/gcc.dg/strlenopt-7.c: Likewise.
	* gcc/testsuite/gcc.dg/strlenopt-9.c: Likewise.

--

diff --git a/gcc/gimple-fold.c b/gcc/gimple-fold.c
index eb130d048469f0b8196e565fed9a40de74b098bd..11dcf69fc919f066362f4f713db392d14b39764e 100644
--- a/gcc/gimple-fold.c
+++ b/gcc/gimple-fold.c
@@ -1380,6 +1380,59 @@ gimple_fold_builtin_strncpy (gimple_stmt_iterator *gsi,
   return true;
 }
 
+/* Simplify strchr (str, 0) into str + strlen (str).
+   In general strlen is significantly faster than strchr
+   due to being a simpler operation.  */
+static bool
+gimple_fold_builtin_strchr (gimple_stmt_iterator *gsi)
+{
+  gimple *stmt = gsi_stmt (*gsi);
+  tree str = gimple_call_arg (stmt, 0);
+  tree c = gimple_call_arg (stmt, 1);
+  location_t loc = gimple_location (stmt);
+
+  if (optimize_function_for_size_p (cfun))
+    return false;
+
+  if (!integer_zerop (c) || !gimple_call_lhs (stmt))
+    return false;
+
+  tree newstr;
+  tree strlen_fn = builtin_decl_implicit (BUILT_IN_STRLEN);
+
+  if (!strlen_fn)
+    return false;
+
+  /* Create newstr = strlen (str).  */
+  gimple_seq stmts = NULL, stmts2;
+  gimple *repl = gimple_build_call (strlen_fn, 1, str);
+  gimple_set_location (repl, loc);
+  if (gimple_in_ssa_p (cfun))
+    newstr = make_ssa_name (size_type_node);
+  else
+    newstr = create_tmp_reg (size_type_node);
+  gimple_call_set_lhs (repl, newstr);
+  gimple_seq_add_stmt_without_update (&stmts, repl);
+
+  /* Create (str p+ strlen (str)).  */
+  newstr = fold_build_pointer_plus_loc (loc, str, newstr);
+  newstr = force_gimple_operand (newstr, &stmts2, true, NULL_TREE);
+  gimple_seq_add_seq_without_update (&stmts, stmts2);
+
+  repl = gimple_build_assign (gimple_call_lhs (stmt), newstr);
+  gimple_seq_add_stmt_without_update (&stmts, repl);
+  gsi_replace_with_seq_vops (gsi, stmts);
+  /* gsi now points at the assignment to the lhs, get a
+     stmt iterator to the strlen.
+     ???  We can't use gsi_for_stmt as that doesn't work when the
+     CFG isn't built yet.  */
+  gimple_stmt_iterator gsi2 = *gsi;
+  gsi_prev (&gsi2);
+  gsi_prev (&gsi2);
+  fold_stmt (&gsi2);
+  return true;
+}
+
 /* Simplify a call to the strcat builtin.  DST and SRC are the arguments
    to the call.
 
@@ -2821,6 +2874,8 @@ gimple_fold_builtin (gimple_stmt_iterator *gsi)
 					 gimple_call_arg (stmt, 1));
     case BUILT_IN_STRNCAT:
       return gimple_fold_builtin_strncat (gsi);
+    case BUILT_IN_STRCHR:
+      return gimple_fold_builtin_strchr (gsi);
     case BUILT_IN_FPUTS:
       return gimple_fold_builtin_fputs (gsi, gimple_call_arg (stmt, 0),
 					gimple_call_arg (stmt, 1), false);
diff --git a/gcc/testsuite/gcc.dg/strlenopt-20.c b/gcc/testsuite/gcc.dg/strlenopt-20.c
index a83e845c26d88e5acdcabf142f7b319136663488..7b483eaeac1aa47278111a92148a16f00b2aaa2d 100644
--- a/gcc/testsuite/gcc.dg/strlenopt-20.c
+++ b/gcc/testsuite/gcc.dg/strlenopt-20.c
@@ -86,9 +86,9 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 4 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
-/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
diff --git a/gcc/testsuite/gcc.dg/strlenopt-21.c b/gcc/testsuite/gcc.dg/strlenopt-21.c
index e22fa9fca9ba14354db2cd5f602283b64bd8dcac..05b85a49dde0a7f5d269174fd4269e40be910dbd 100644
--- a/gcc/testsuite/gcc.dg/strlenopt-21.c
+++ b/gcc/testsuite/gcc.dg/strlenopt-21.c
@@ -57,9 +57,9 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 3 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
-/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
diff --git a/gcc/testsuite/gcc.dg/strlenopt-22.c b/gcc/testsuite/gcc.dg/strlenopt-22.c
index aa55f5ebd6a2d4803ee9a7fd60fc538d86f47124..b4ef772f0e59252f10a5419ede6837b3c8ca8265 100644
--- a/gcc/testsuite/gcc.dg/strlenopt-22.c
+++ b/gcc/testsuite/gcc.dg/strlenopt-22.c
@@ -31,9 +31,9 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
-/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
diff --git a/gcc/testsuite/gcc.dg/strlenopt-26.c b/gcc/testsuite/gcc.dg/strlenopt-26.c
index 4bd54bef540806ca90d2b9bcfc33eb531991c967..da2f465a5b5003fa5dca05f3a6ee00e97b98b5dd 100644
--- a/gcc/testsuite/gcc.dg/strlenopt-26.c
+++ b/gcc/testsuite/gcc.dg/strlenopt-26.c
@@ -21,4 +21,5 @@ main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
diff --git a/gcc/testsuite/gcc.dg/strlenopt-5.c b/gcc/testsuite/gcc.dg/strlenopt-5.c
index 1b006a93045599a75bdb10a39e86ffa59b475c83..a24aea44e8b00ff7b35a907aaa941b4c509642c4 100644
--- a/gcc/testsuite/gcc.dg/strlenopt-5.c
+++ b/gcc/testsuite/gcc.dg/strlenopt-5.c
@@ -48,9 +48,9 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
-/* { dg-final { scan-tree-dump-times "strchr \\(" 2 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
diff --git a/gcc/testsuite/gcc.dg/strlenopt-7.c b/gcc/testsuite/gcc.dg/strlenopt-7.c
index 3ae1e2cb3f0d08c8b05107c4e65b67bdb39cf7ab..aa53d7e75254dfe56c93172afc49f95e5b7901e6 100644
--- a/gcc/testsuite/gcc.dg/strlenopt-7.c
+++ b/gcc/testsuite/gcc.dg/strlenopt-7.c
@@ -40,11 +40,11 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
-/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "\\*r_\[0-9\]* = 0;" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "return 3;" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/strlenopt-9.c b/gcc/testsuite/gcc.dg/strlenopt-9.c
index b0406b162d48fca375883035043b0c50b9db61a1..e5e276210ba4b7d75867605f1ecf5c06eb970ef5 100644
--- a/gcc/testsuite/gcc.dg/strlenopt-9.c
+++ b/gcc/testsuite/gcc.dg/strlenopt-9.c
@@ -98,10 +98,10 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 5 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
-/* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-19 16:33   ` Wilco Dijkstra
@ 2016-04-20  8:45     ` Richard Biener
  2016-04-20  9:44       ` Richard Biener
  2016-04-20 11:57       ` Wilco Dijkstra
  0 siblings, 2 replies; 37+ messages in thread
From: Richard Biener @ 2016-04-20  8:45 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: gcc-patches, nd

On Tue, Apr 19, 2016 at 6:32 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> Richard Biener wrote:
>>
>> This folding should be added to gimple-fold.c:gimple_fold_builtin instead,
>> the builtins.c foldings are purerly for folding to constants nowadays.
>
> So is this better? It's a lot more verbose for something so simple...
> Unfortunately match.pd doesn't support this kind of thing either.

Better - comments below.  Jakub objections to the usefulness of the transform
remain - we do have the strlen pass that uses some global knowledge to decide
on profitability.  One could argue that for -Os doing the reverse transform is
profitable?

Yes, match.pd doesn't support this (yet).  It may be possible to teach it simple
cases like this - I plan to revisit this at some point.  The issue is one of
side-effects which are tricky to handle if you consider patterns that do not
only match a single call (as this one would).  So a baby-step towards getting
this supported in match.pd is to handle matching toplevel calls specially.

Richard.

> Wilco
>
>
> ChangeLog:
> 2016-04-19  Wilco Dijkstra  <wdijkstr@arm.com>
>
> gcc/
>         * gcc/gimple-fold.c (gimple_fold_builtin_strchr):
>         New function to optimize strchr (s, 0) to strlen.
>         (gimple_fold_builtin): Add BUILT_IN_STRCHR case.
>
> testsuite/
>         * gcc/testsuite/gcc.dg/strlenopt-20.c: Update test.
>         * gcc/testsuite/gcc.dg/strlenopt-21.c: Likewise.
>         * gcc/testsuite/gcc.dg/strlenopt-22.c: Likewise.
>         * gcc/testsuite/gcc.dg/strlenopt-26.c: Likewise.
>         * gcc/testsuite/gcc.dg/strlenopt-5.c: Likewise.
>         * gcc/testsuite/gcc.dg/strlenopt-7.c: Likewise.
>         * gcc/testsuite/gcc.dg/strlenopt-9.c: Likewise.
>
> --
>
> diff --git a/gcc/gimple-fold.c b/gcc/gimple-fold.c
> index eb130d048469f0b8196e565fed9a40de74b098bd..11dcf69fc919f066362f4f713db392d14b39764e 100644
> --- a/gcc/gimple-fold.c
> +++ b/gcc/gimple-fold.c
> @@ -1380,6 +1380,59 @@ gimple_fold_builtin_strncpy (gimple_stmt_iterator *gsi,
>    return true;
>  }
>
> +/* Simplify strchr (str, 0) into str + strlen (str).
> +   In general strlen is significantly faster than strchr
> +   due to being a simpler operation.  */
> +static bool
> +gimple_fold_builtin_strchr (gimple_stmt_iterator *gsi)
> +{
> +  gimple *stmt = gsi_stmt (*gsi);
> +  tree str = gimple_call_arg (stmt, 0);
> +  tree c = gimple_call_arg (stmt, 1);
> +  location_t loc = gimple_location (stmt);
> +
> +  if (optimize_function_for_size_p (cfun))
> +    return false;

Hmm, I think we'd want a optimize_stmt_for_size_p (stmt) which
does the right thing for the case we have a CFG (look at the BB)
or when not (look at the function).

> +  if (!integer_zerop (c) || !gimple_call_lhs (stmt))
> +    return false;
> +
> +  tree newstr;
> +  tree strlen_fn = builtin_decl_implicit (BUILT_IN_STRLEN);
> +
> +  if (!strlen_fn)
> +    return false;
> +
> +  /* Create newstr = strlen (str).  */
> +  gimple_seq stmts = NULL, stmts2;
> +  gimple *repl = gimple_build_call (strlen_fn, 1, str);
> +  gimple_set_location (repl, loc);
> +  if (gimple_in_ssa_p (cfun))
> +    newstr = make_ssa_name (size_type_node);
> +  else
> +    newstr = create_tmp_reg (size_type_node);
> +  gimple_call_set_lhs (repl, newstr);
> +  gimple_seq_add_stmt_without_update (&stmts, repl);
> +
> +  /* Create (str p+ strlen (str)).  */
> +  newstr = fold_build_pointer_plus_loc (loc, str, newstr);
> +  newstr = force_gimple_operand (newstr, &stmts2, true, NULL_TREE);

I think you want to build a gimple_assign directly here, otherwise ...

> +  gimple_seq_add_seq_without_update (&stmts, stmts2);
> +
> +  repl = gimple_build_assign (gimple_call_lhs (stmt), newstr);
> +  gimple_seq_add_stmt_without_update (&stmts, repl);
> +  gsi_replace_with_seq_vops (gsi, stmts);
> +  /* gsi now points at the assignment to the lhs, get a
> +     stmt iterator to the strlen.
> +     ???  We can't use gsi_for_stmt as that doesn't work when the
> +     CFG isn't built yet.  */
> +  gimple_stmt_iterator gsi2 = *gsi;
> +  gsi_prev (&gsi2);
> +  gsi_prev (&gsi2);

... this may not reliably end up at the call stmt.

> +  fold_stmt (&gsi2);
> +  return true;
> +}
> +
>  /* Simplify a call to the strcat builtin.  DST and SRC are the arguments
>     to the call.
>
> @@ -2821,6 +2874,8 @@ gimple_fold_builtin (gimple_stmt_iterator *gsi)
>                                          gimple_call_arg (stmt, 1));
>      case BUILT_IN_STRNCAT:
>        return gimple_fold_builtin_strncat (gsi);
> +    case BUILT_IN_STRCHR:
> +      return gimple_fold_builtin_strchr (gsi);
>      case BUILT_IN_FPUTS:
>        return gimple_fold_builtin_fputs (gsi, gimple_call_arg (stmt, 0),
>                                         gimple_call_arg (stmt, 1), false);
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-20.c b/gcc/testsuite/gcc.dg/strlenopt-20.c
> index a83e845c26d88e5acdcabf142f7b319136663488..7b483eaeac1aa47278111a92148a16f00b2aaa2d 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-20.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-20.c
> @@ -86,9 +86,9 @@ main ()
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 4 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-21.c b/gcc/testsuite/gcc.dg/strlenopt-21.c
> index e22fa9fca9ba14354db2cd5f602283b64bd8dcac..05b85a49dde0a7f5d269174fd4269e40be910dbd 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-21.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-21.c
> @@ -57,9 +57,9 @@ main ()
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 3 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-22.c b/gcc/testsuite/gcc.dg/strlenopt-22.c
> index aa55f5ebd6a2d4803ee9a7fd60fc538d86f47124..b4ef772f0e59252f10a5419ede6837b3c8ca8265 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-22.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-22.c
> @@ -31,9 +31,9 @@ main ()
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-26.c b/gcc/testsuite/gcc.dg/strlenopt-26.c
> index 4bd54bef540806ca90d2b9bcfc33eb531991c967..da2f465a5b5003fa5dca05f3a6ee00e97b98b5dd 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-26.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-26.c
> @@ -21,4 +21,5 @@ main (void)
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-5.c b/gcc/testsuite/gcc.dg/strlenopt-5.c
> index 1b006a93045599a75bdb10a39e86ffa59b475c83..a24aea44e8b00ff7b35a907aaa941b4c509642c4 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-5.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-5.c
> @@ -48,9 +48,9 @@ main ()
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 2 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-7.c b/gcc/testsuite/gcc.dg/strlenopt-7.c
> index 3ae1e2cb3f0d08c8b05107c4e65b67bdb39cf7ab..aa53d7e75254dfe56c93172afc49f95e5b7901e6 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-7.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-7.c
> @@ -40,11 +40,11 @@ main ()
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "\\*r_\[0-9\]* = 0;" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "return 3;" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/strlenopt-9.c b/gcc/testsuite/gcc.dg/strlenopt-9.c
> index b0406b162d48fca375883035043b0c50b9db61a1..e5e276210ba4b7d75867605f1ecf5c06eb970ef5 100644
> --- a/gcc/testsuite/gcc.dg/strlenopt-9.c
> +++ b/gcc/testsuite/gcc.dg/strlenopt-9.c
> @@ -98,10 +98,10 @@ main ()
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 5 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
> -/* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */
>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-20  8:45     ` Richard Biener
@ 2016-04-20  9:44       ` Richard Biener
  2016-04-20 10:33         ` Jakub Jelinek
  2016-04-20 11:57       ` Wilco Dijkstra
  1 sibling, 1 reply; 37+ messages in thread
From: Richard Biener @ 2016-04-20  9:44 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: gcc-patches, nd

[-- Attachment #1: Type: text/plain, Size: 11575 bytes --]

On Wed, Apr 20, 2016 at 10:45 AM, Richard Biener
<richard.guenther@gmail.com> wrote:
> On Tue, Apr 19, 2016 at 6:32 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>> Richard Biener wrote:
>>>
>>> This folding should be added to gimple-fold.c:gimple_fold_builtin instead,
>>> the builtins.c foldings are purerly for folding to constants nowadays.
>>
>> So is this better? It's a lot more verbose for something so simple...
>> Unfortunately match.pd doesn't support this kind of thing either.
>
> Better - comments below.  Jakub objections to the usefulness of the transform
> remain - we do have the strlen pass that uses some global knowledge to decide
> on profitability.  One could argue that for -Os doing the reverse transform is
> profitable?
>
> Yes, match.pd doesn't support this (yet).  It may be possible to teach it simple
> cases like this - I plan to revisit this at some point.  The issue is one of
> side-effects which are tricky to handle if you consider patterns that do not
> only match a single call (as this one would).  So a baby-step towards getting
> this supported in match.pd is to handle matching toplevel calls specially.

Ok, just gave it a stab in a different way - see attached.  This makes

(simplify
 (BUILT_IN_STRCHR @0 integer_zerop)
 (pointer_plus @0 (BUILT_IN_STRLEN:size_type_node @0)))

work.  Note how the SSA following predicate needs to be aware of
side-effects to guard against bogus applies of complicated patterns
(none of those exist yet).

Richard.

> Richard.
>
>> Wilco
>>
>>
>> ChangeLog:
>> 2016-04-19  Wilco Dijkstra  <wdijkstr@arm.com>
>>
>> gcc/
>>         * gcc/gimple-fold.c (gimple_fold_builtin_strchr):
>>         New function to optimize strchr (s, 0) to strlen.
>>         (gimple_fold_builtin): Add BUILT_IN_STRCHR case.
>>
>> testsuite/
>>         * gcc/testsuite/gcc.dg/strlenopt-20.c: Update test.
>>         * gcc/testsuite/gcc.dg/strlenopt-21.c: Likewise.
>>         * gcc/testsuite/gcc.dg/strlenopt-22.c: Likewise.
>>         * gcc/testsuite/gcc.dg/strlenopt-26.c: Likewise.
>>         * gcc/testsuite/gcc.dg/strlenopt-5.c: Likewise.
>>         * gcc/testsuite/gcc.dg/strlenopt-7.c: Likewise.
>>         * gcc/testsuite/gcc.dg/strlenopt-9.c: Likewise.
>>
>> --
>>
>> diff --git a/gcc/gimple-fold.c b/gcc/gimple-fold.c
>> index eb130d048469f0b8196e565fed9a40de74b098bd..11dcf69fc919f066362f4f713db392d14b39764e 100644
>> --- a/gcc/gimple-fold.c
>> +++ b/gcc/gimple-fold.c
>> @@ -1380,6 +1380,59 @@ gimple_fold_builtin_strncpy (gimple_stmt_iterator *gsi,
>>    return true;
>>  }
>>
>> +/* Simplify strchr (str, 0) into str + strlen (str).
>> +   In general strlen is significantly faster than strchr
>> +   due to being a simpler operation.  */
>> +static bool
>> +gimple_fold_builtin_strchr (gimple_stmt_iterator *gsi)
>> +{
>> +  gimple *stmt = gsi_stmt (*gsi);
>> +  tree str = gimple_call_arg (stmt, 0);
>> +  tree c = gimple_call_arg (stmt, 1);
>> +  location_t loc = gimple_location (stmt);
>> +
>> +  if (optimize_function_for_size_p (cfun))
>> +    return false;
>
> Hmm, I think we'd want a optimize_stmt_for_size_p (stmt) which
> does the right thing for the case we have a CFG (look at the BB)
> or when not (look at the function).
>
>> +  if (!integer_zerop (c) || !gimple_call_lhs (stmt))
>> +    return false;
>> +
>> +  tree newstr;
>> +  tree strlen_fn = builtin_decl_implicit (BUILT_IN_STRLEN);
>> +
>> +  if (!strlen_fn)
>> +    return false;
>> +
>> +  /* Create newstr = strlen (str).  */
>> +  gimple_seq stmts = NULL, stmts2;
>> +  gimple *repl = gimple_build_call (strlen_fn, 1, str);
>> +  gimple_set_location (repl, loc);
>> +  if (gimple_in_ssa_p (cfun))
>> +    newstr = make_ssa_name (size_type_node);
>> +  else
>> +    newstr = create_tmp_reg (size_type_node);
>> +  gimple_call_set_lhs (repl, newstr);
>> +  gimple_seq_add_stmt_without_update (&stmts, repl);
>> +
>> +  /* Create (str p+ strlen (str)).  */
>> +  newstr = fold_build_pointer_plus_loc (loc, str, newstr);
>> +  newstr = force_gimple_operand (newstr, &stmts2, true, NULL_TREE);
>
> I think you want to build a gimple_assign directly here, otherwise ...
>
>> +  gimple_seq_add_seq_without_update (&stmts, stmts2);
>> +
>> +  repl = gimple_build_assign (gimple_call_lhs (stmt), newstr);
>> +  gimple_seq_add_stmt_without_update (&stmts, repl);
>> +  gsi_replace_with_seq_vops (gsi, stmts);
>> +  /* gsi now points at the assignment to the lhs, get a
>> +     stmt iterator to the strlen.
>> +     ???  We can't use gsi_for_stmt as that doesn't work when the
>> +     CFG isn't built yet.  */
>> +  gimple_stmt_iterator gsi2 = *gsi;
>> +  gsi_prev (&gsi2);
>> +  gsi_prev (&gsi2);
>
> ... this may not reliably end up at the call stmt.
>
>> +  fold_stmt (&gsi2);
>> +  return true;
>> +}
>> +
>>  /* Simplify a call to the strcat builtin.  DST and SRC are the arguments
>>     to the call.
>>
>> @@ -2821,6 +2874,8 @@ gimple_fold_builtin (gimple_stmt_iterator *gsi)
>>                                          gimple_call_arg (stmt, 1));
>>      case BUILT_IN_STRNCAT:
>>        return gimple_fold_builtin_strncat (gsi);
>> +    case BUILT_IN_STRCHR:
>> +      return gimple_fold_builtin_strchr (gsi);
>>      case BUILT_IN_FPUTS:
>>        return gimple_fold_builtin_fputs (gsi, gimple_call_arg (stmt, 0),
>>                                         gimple_call_arg (stmt, 1), false);
>> diff --git a/gcc/testsuite/gcc.dg/strlenopt-20.c b/gcc/testsuite/gcc.dg/strlenopt-20.c
>> index a83e845c26d88e5acdcabf142f7b319136663488..7b483eaeac1aa47278111a92148a16f00b2aaa2d 100644
>> --- a/gcc/testsuite/gcc.dg/strlenopt-20.c
>> +++ b/gcc/testsuite/gcc.dg/strlenopt-20.c
>> @@ -86,9 +86,9 @@ main ()
>>    return 0;
>>  }
>>
>> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 4 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
>> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
>> diff --git a/gcc/testsuite/gcc.dg/strlenopt-21.c b/gcc/testsuite/gcc.dg/strlenopt-21.c
>> index e22fa9fca9ba14354db2cd5f602283b64bd8dcac..05b85a49dde0a7f5d269174fd4269e40be910dbd 100644
>> --- a/gcc/testsuite/gcc.dg/strlenopt-21.c
>> +++ b/gcc/testsuite/gcc.dg/strlenopt-21.c
>> @@ -57,9 +57,9 @@ main ()
>>    return 0;
>>  }
>>
>> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 3 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
>> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
>> diff --git a/gcc/testsuite/gcc.dg/strlenopt-22.c b/gcc/testsuite/gcc.dg/strlenopt-22.c
>> index aa55f5ebd6a2d4803ee9a7fd60fc538d86f47124..b4ef772f0e59252f10a5419ede6837b3c8ca8265 100644
>> --- a/gcc/testsuite/gcc.dg/strlenopt-22.c
>> +++ b/gcc/testsuite/gcc.dg/strlenopt-22.c
>> @@ -31,9 +31,9 @@ main ()
>>    return 0;
>>  }
>>
>> -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
>> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
>> diff --git a/gcc/testsuite/gcc.dg/strlenopt-26.c b/gcc/testsuite/gcc.dg/strlenopt-26.c
>> index 4bd54bef540806ca90d2b9bcfc33eb531991c967..da2f465a5b5003fa5dca05f3a6ee00e97b98b5dd 100644
>> --- a/gcc/testsuite/gcc.dg/strlenopt-26.c
>> +++ b/gcc/testsuite/gcc.dg/strlenopt-26.c
>> @@ -21,4 +21,5 @@ main (void)
>>    return 0;
>>  }
>>
>> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>> diff --git a/gcc/testsuite/gcc.dg/strlenopt-5.c b/gcc/testsuite/gcc.dg/strlenopt-5.c
>> index 1b006a93045599a75bdb10a39e86ffa59b475c83..a24aea44e8b00ff7b35a907aaa941b4c509642c4 100644
>> --- a/gcc/testsuite/gcc.dg/strlenopt-5.c
>> +++ b/gcc/testsuite/gcc.dg/strlenopt-5.c
>> @@ -48,9 +48,9 @@ main ()
>>    return 0;
>>  }
>>
>> -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
>> -/* { dg-final { scan-tree-dump-times "strchr \\(" 2 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
>> diff --git a/gcc/testsuite/gcc.dg/strlenopt-7.c b/gcc/testsuite/gcc.dg/strlenopt-7.c
>> index 3ae1e2cb3f0d08c8b05107c4e65b67bdb39cf7ab..aa53d7e75254dfe56c93172afc49f95e5b7901e6 100644
>> --- a/gcc/testsuite/gcc.dg/strlenopt-7.c
>> +++ b/gcc/testsuite/gcc.dg/strlenopt-7.c
>> @@ -40,11 +40,11 @@ main ()
>>    return 0;
>>  }
>>
>> -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
>> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "\\*r_\[0-9\]* = 0;" 1 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "return 3;" 1 "optimized" } } */
>> diff --git a/gcc/testsuite/gcc.dg/strlenopt-9.c b/gcc/testsuite/gcc.dg/strlenopt-9.c
>> index b0406b162d48fca375883035043b0c50b9db61a1..e5e276210ba4b7d75867605f1ecf5c06eb970ef5 100644
>> --- a/gcc/testsuite/gcc.dg/strlenopt-9.c
>> +++ b/gcc/testsuite/gcc.dg/strlenopt-9.c
>> @@ -98,10 +98,10 @@ main ()
>>    return 0;
>>  }
>>
>> -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strlen \\(" 5 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
>> -/* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */
>>

[-- Attachment #2: mas-allow-string-builtins --]
[-- Type: application/octet-stream, Size: 2596 bytes --]

Index: gcc/gimple-fold.c
===================================================================
*** gcc/gimple-fold.c	(revision 235237)
--- gcc/gimple-fold.c	(working copy)
*************** fold_stmt_1 (gimple_stmt_iterator *gsi,
*** 3641,3647 ****
    /* Dispatch to pattern-based folding.  */
    if (!inplace
        || is_gimple_assign (stmt)
!       || gimple_code (stmt) == GIMPLE_COND)
      {
        gimple_seq seq = NULL;
        code_helper rcode;
--- 3641,3648 ----
    /* Dispatch to pattern-based folding.  */
    if (!inplace
        || is_gimple_assign (stmt)
!       || gimple_code (stmt) == GIMPLE_COND
!       || is_gimple_call (stmt))
      {
        gimple_seq seq = NULL;
        code_helper rcode;
*************** no_follow_ssa_edges (tree)
*** 3835,3843 ****
  tree
  follow_single_use_edges (tree val)
  {
!   if (TREE_CODE (val) == SSA_NAME
!       && !has_single_use (val))
!     return NULL_TREE;
    return val;
  }
  
--- 3836,3858 ----
  tree
  follow_single_use_edges (tree val)
  {
!   if (TREE_CODE (val) == SSA_NAME)
!     {
!       if (! has_single_use (val))
! 	return NULL_TREE;
!       gimple *def = SSA_NAME_DEF_STMT (val);
!       /* If the stmt references memory it's not safe to follow the edge.  */
!       if (gimple_in_ssa_p (cfun))
! 	{
! 	  if (gimple_vuse (def))
! 	    return NULL_TREE;
! 	}
!       else if (! (is_gimple_assign (def)
! 		  && ! gimple_assign_load_p (def))
! 	       && ! (is_gimple_call (def)
! 		     && gimple_call_flags (def) & ECF_CONST))
! 	return NULL_TREE;
!     }
    return val;
  }
  
Index: gcc/gimple-match-head.c
===================================================================
*** gcc/gimple-match-head.c	(revision 235237)
--- gcc/gimple-match-head.c	(working copy)
*************** maybe_push_res_to_seq (code_helper rcode
*** 355,364 ****
  	  if (!decl)
  	    return NULL;
  
- 	  /* We can't and should not emit calls to non-const functions.  */
- 	  if (!(flags_from_decl_or_type (decl) & ECF_CONST))
- 	    return NULL;
- 
  	  new_stmt = gimple_build_call (decl, nargs, ops[0], ops[1], ops[2]);
  	}
        if (!res)
--- 355,360 ----
Index: gcc/match.pd
===================================================================
*** gcc/match.pd	(revision 235237)
--- gcc/match.pd	(working copy)
*************** DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
*** 3052,3054 ****
--- 3052,3058 ----
   (SIGNBIT @0)
   (if (!HONOR_SIGNED_ZEROS (@0))
    (convert (lt @0 { build_real (TREE_TYPE (@0), dconst0); }))))
+ 
+ (simplify
+  (BUILT_IN_STRCHR @0 integer_zerop)
+  (pointer_plus @0 (BUILT_IN_STRLEN:size_type_node @0)))

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-20  9:44       ` Richard Biener
@ 2016-04-20 10:33         ` Jakub Jelinek
  2016-04-20 10:50           ` Richard Biener
  2016-04-20 11:17           ` Wilco Dijkstra
  0 siblings, 2 replies; 37+ messages in thread
From: Jakub Jelinek @ 2016-04-20 10:33 UTC (permalink / raw)
  To: Richard Biener; +Cc: Wilco Dijkstra, gcc-patches, nd

On Wed, Apr 20, 2016 at 11:44:08AM +0200, Richard Biener wrote:
> (simplify
>  (BUILT_IN_STRCHR @0 integer_zerop)
>  (pointer_plus @0 (BUILT_IN_STRLEN:size_type_node @0)))

I still don't like this transformation and would very much prefer to see
using rawmemchr instead on targets that provide it, and also this is
something that IMHO should be done in the tree-ssa-strlen.c pass together
with the other optimizations in there.  Similarly to stpcpy, which is also
non-standard (in POSIX, but not in C), we should just look at headers if
rawmemchr is defined with compatible prototype.
Also, strrchr (s, 0) should be folded to strchr (s, 0) or handled the same
like that one.
And, while x = strchr (s, 0) to x = rawmemchr (s, 0) is a reasonable -Os
transformation, x = s + strlen (s) is not, it makes code usually larger
(especially because it increases register pressure across the call).

	Jakub

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-20 10:33         ` Jakub Jelinek
@ 2016-04-20 10:50           ` Richard Biener
  2016-04-20 11:17           ` Wilco Dijkstra
  1 sibling, 0 replies; 37+ messages in thread
From: Richard Biener @ 2016-04-20 10:50 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Wilco Dijkstra, gcc-patches, nd

On Wed, Apr 20, 2016 at 12:33 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Wed, Apr 20, 2016 at 11:44:08AM +0200, Richard Biener wrote:
>> (simplify
>>  (BUILT_IN_STRCHR @0 integer_zerop)
>>  (pointer_plus @0 (BUILT_IN_STRLEN:size_type_node @0)))
>
> I still don't like this transformation and would very much prefer to see
> using rawmemchr instead on targets that provide it, and also this is
> something that IMHO should be done in the tree-ssa-strlen.c pass together
> with the other optimizations in there.  Similarly to stpcpy, which is also
> non-standard (in POSIX, but not in C), we should just look at headers if
> rawmemchr is defined with compatible prototype.
> Also, strrchr (s, 0) should be folded to strchr (s, 0) or handled the same
> like that one.
> And, while x = strchr (s, 0) to x = rawmemchr (s, 0) is a reasonable -Os
> transformation, x = s + strlen (s) is not, it makes code usually larger
> (especially because it increases register pressure across the call).

Sure - agreed.  So with the patch

(simplify
 (BUILT_IN_STRRCHR @0 integer_zerop@1)
 (BUILT_IN_STRCHR @0 @1))

is possible at least ;)

Richard.

>         Jakub

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-20 10:33         ` Jakub Jelinek
  2016-04-20 10:50           ` Richard Biener
@ 2016-04-20 11:17           ` Wilco Dijkstra
  2016-04-20 13:01             ` Jakub Jelinek
  1 sibling, 1 reply; 37+ messages in thread
From: Wilco Dijkstra @ 2016-04-20 11:17 UTC (permalink / raw)
  To: Richard Biener, Jakub Jelinek; +Cc: gcc-patches, nd

Jakub Jelinek wrote:
> I still don't like this transformation and would very much prefer to see
> using rawmemchr instead on targets that provide it, and also this is
> something that IMHO should be done in the tree-ssa-strlen.c pass together
> with the other optimizations in there.  Similarly to stpcpy, which is also
> non-standard (in POSIX, but not in C), we should just look at headers if
> rawmemchr is defined with compatible prototype.

Can you quantify "don't like"? I benchmarked rawmemchr on a few targets
and it's slower than strlen, so it's hard to guess what you don't like about it.

Several targets don't even have an assembly implementation of rawmemchr, so
looking at the header would not be sufficient to determine rawmemchr is fast, let
alone as fast as strlen.

The tree-ssa-strlen pass seems to optimize repeated calls to strlen, or strcpy
after a strlen, so I'm not sure how this is related - this is a local transformation
like the foldings in builtin.c/gimple-fold.c.

> Also, strrchr (s, 0) should be folded to strchr (s, 0) or handled the same
> like that one.

GCC converts strrchr (s, 0) to strchr (s, 0) which then gets optimized. I checked
this happens as expected with both versions of my patch.

> And, while x = strchr (s, 0) to x = rawmemchr (s, 0) is a reasonable -Os
> transformation, x = s + strlen (s) is not, it makes code usually larger
> (especially because it increases register pressure across the call).

Indeed, that's why my transformation is disabled with -Os.

Wilco

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-20  8:45     ` Richard Biener
  2016-04-20  9:44       ` Richard Biener
@ 2016-04-20 11:57       ` Wilco Dijkstra
  2016-04-20 11:59         ` Richard Biener
  1 sibling, 1 reply; 37+ messages in thread
From: Wilco Dijkstra @ 2016-04-20 11:57 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd

Richard Biener wrote:
> Better - comments below.  Jakub objections to the usefulness of the transform
> remain - we do have the strlen pass that uses some global knowledge to decide
> on profitability.  One could argue that for -Os doing the reverse transform is
> profitable?

In what way would it get more info to decide on profitability? The transform is 
profitable unless you messed up your strlen implementation badly.

For -Os one could do the reverse, but I don't think it is going to give a substantial
codesize gain compared to other simple improvements, so unlikely worth it.

>> +  if (optimize_function_for_size_p (cfun))
>> +    return false;
>
> Hmm, I think we'd want a optimize_stmt_for_size_p (stmt) which
> does the right thing for the case we have a CFG (look at the BB)
> or when not (look at the function).

Does that use the often incorrect BB probabilities? I used the function variant on
purpose to avoid it making the wrong decision. A typical example I see is that GCC
inlines a return sequence into an if marked with __builtin_expect (c, 0) but not in the
hot code that follows...

> I think you want to build a gimple_assign directly here, otherwise ...

>... this may not reliably end up at the call stmt.

OK, I revisit that once we've agreed how to proceed with this patch - we now have
3 variants...

Wilco

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-20 11:57       ` Wilco Dijkstra
@ 2016-04-20 11:59         ` Richard Biener
  2016-05-04 12:54           ` Wilco Dijkstra
  0 siblings, 1 reply; 37+ messages in thread
From: Richard Biener @ 2016-04-20 11:59 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: gcc-patches, nd

On Wed, Apr 20, 2016 at 1:56 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> Richard Biener wrote:
>> Better - comments below.  Jakub objections to the usefulness of the transform
>> remain - we do have the strlen pass that uses some global knowledge to decide
>> on profitability.  One could argue that for -Os doing the reverse transform is
>> profitable?
>
> In what way would it get more info to decide on profitability? The transform is
> profitable unless you messed up your strlen implementation badly.
>
> For -Os one could do the reverse, but I don't think it is going to give a substantial
> codesize gain compared to other simple improvements, so unlikely worth it.
>
>>> +  if (optimize_function_for_size_p (cfun))
>>> +    return false;
>>
>> Hmm, I think we'd want a optimize_stmt_for_size_p (stmt) which
>> does the right thing for the case we have a CFG (look at the BB)
>> or when not (look at the function).
>
> Does that use the often incorrect BB probabilities? I used the function variant on
> purpose to avoid it making the wrong decision. A typical example I see is that GCC
> inlines a return sequence into an if marked with __builtin_expect (c, 0) but not in the
> hot code that follows...
>
>> I think you want to build a gimple_assign directly here, otherwise ...
>
>>... this may not reliably end up at the call stmt.
>
> OK, I revisit that once we've agreed how to proceed with this patch - we now have
> 3 variants...

Yeah ;)  I'm currently bootstrapping/testing the patch that makes it possible to
write all this in match.pd.

Richard.

> Wilco
>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-20 11:17           ` Wilco Dijkstra
@ 2016-04-20 13:01             ` Jakub Jelinek
  2016-04-20 14:18               ` Wilco Dijkstra
  0 siblings, 1 reply; 37+ messages in thread
From: Jakub Jelinek @ 2016-04-20 13:01 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: Richard Biener, gcc-patches, nd

On Wed, Apr 20, 2016 at 11:17:06AM +0000, Wilco Dijkstra wrote:
> Can you quantify "don't like"? I benchmarked rawmemchr on a few targets
> and it's slower than strlen, so it's hard to guess what you don't like about it.

This is the same stuff as has been discussed for mempcpy, rawmemchr is the
API meant to use for getting pointer to the terminating '\0', if there are
deficiencies on the library side, they should be fixed.  If you hardcode in
GCC emitting worse sequence at the caller (which s + strlen (s) is), then
even once the library deficiency is fixed, you still don't get benefit from
it.  I wonder how you work around the
 define strchr(s, c) \
  (__extension__ (__builtin_constant_p (c) && !__builtin_constant_p (s) \
                  && (c) == '\0' \
                  ? (char *) __rawmemchr (s, c) \
                  : __builtin_strchr (s, c)))
in glibc headers anyway.

Another thing is for the cases where strlen is desirable to be expanded
inline, in that case rawmemchr (x, 0) or strchr (x, 0) is likely useful to be
expanded inline as well and then this decision should be done at expansion
time.

	Jakub

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-20 13:01             ` Jakub Jelinek
@ 2016-04-20 14:18               ` Wilco Dijkstra
  0 siblings, 0 replies; 37+ messages in thread
From: Wilco Dijkstra @ 2016-04-20 14:18 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Biener, gcc-patches, nd

Jakub Jelinek wrote:
> On Wed, Apr 20, 2016 at 11:17:06AM +0000, Wilco Dijkstra wrote:
>> Can you quantify "don't like"? I benchmarked rawmemchr on a few targets
>> and it's slower than strlen, so it's hard to guess what you don't like about it.
>
> This is the same stuff as has been discussed for mempcpy, rawmemchr is the
> API meant to use for getting pointer to the terminating '\0', if there are
> deficiencies on the library side, they should be fixed.  

About mempcpy, GLIBC nowadays expands it into memcpy (p, q, n) + n by default
in string.h.

Generally after a lot of discussion on this last year, the consensus is that these
functions don't provide a useful gain and are often detrimental to performance even
if optimized assembly implementations happen to be available due to I-cache
pressure.

Emitting rawmemchr/mempcpy/stpcpy automatically as a result of optimization is
a bad idea for most targets given libraries often have inefficient default implementations.
I fixed the GLIBC mempcpy and stpcpy C implementations to use memcpy and strlen
so at least for these performance is no longer absolutely terrible.

Saying that all C libraries should be forced to provide highly optimized assembler
versions for these functions is onerous since they are not frequently used in code 
(a quick grep of SPEC resulted in one use of mempcpy, 0 uses of rawmemchr,
strchrnul and stpcpy).

> If you hardcode in
> GCC emitting worse sequence at the caller (which s + strlen (s) is), then
> even once the library deficiency is fixed, you still don't get benefit from
> it.  

What benefit exactly? Rawmemchr cannot ever beat strlen. There is a trick that can
make a good strlen faster than rawmemchr, but even ignoring that, an integer based
rawmemchr needs to do extra operations in its inner loop. A SIMD version could use
similar inner loops although rawmemchr still has a higher cost. You could special case
searching for '\0' and jump to strlen (I have patches for that), but that also adds cost...

>I wonder how you work around the
> define strchr(s, c) \
> ..
> in glibc headers anyway.

That should either be removed or changed to use strlen (I have patches for both
options out for review).

> Another thing is for the cases where strlen is desirable to be expanded
> inline, in that case rawmemchr (x, 0) or strchr (x, 0) is likely useful to be
> expanded inline as well and then this decision should be done at expansion
> time.

I'm not sure I'm following you here - that's an argument to expand into strlen early
as strlen is better optimized in GCC...

Wilco

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-04-20 11:59         ` Richard Biener
@ 2016-05-04 12:54           ` Wilco Dijkstra
  2016-05-18 12:29             ` Wilco Dijkstra
  0 siblings, 1 reply; 37+ messages in thread
From: Wilco Dijkstra @ 2016-05-04 12:54 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd

Richard Biener wrote:
>
> Yeah ;)  I'm currently bootstrapping/testing the patch that makes it possible to
> write all this in match.pd.

So did that pass bootstrap? It would be good to decide how to proceed with this.

Wilco

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-05-04 12:54           ` Wilco Dijkstra
@ 2016-05-18 12:29             ` Wilco Dijkstra
  2016-05-18 13:22               ` Richard Biener
  0 siblings, 1 reply; 37+ messages in thread
From: Wilco Dijkstra @ 2016-05-18 12:29 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd

Richard Biener wrote:
>
> Yeah ;)  I'm currently bootstrapping/testing the patch that makes it possible to
> write all this in match.pd.

So what was the conclusion? Improving match.pd to be able to handle more cases
like this seems like a nice thing.

Wilco

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-05-18 12:29             ` Wilco Dijkstra
@ 2016-05-18 13:22               ` Richard Biener
  2016-09-13 12:39                 ` Wilco Dijkstra
  0 siblings, 1 reply; 37+ messages in thread
From: Richard Biener @ 2016-05-18 13:22 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: gcc-patches, nd

On Wed, May 18, 2016 at 2:29 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> Richard Biener wrote:
>>
>> Yeah ;)  I'm currently bootstrapping/testing the patch that makes it possible to
>> write all this in match.pd.
>
> So what was the conclusion? Improving match.pd to be able to handle more cases
> like this seems like a nice thing.

I'm stuck with fallout and making this work requires some serious
thought.  Don't
hold your breath here :/

The restricted case of strchr (a, 0) -> strlen () can be made working
more easily
but I didn't yet try to implement a restriction only allowing the
cases that would work.

Meanwhile the strlenopt pass would be an appropriate place to handle
this transform
(well, if we now agree on its usefulness).

Richard.

>
> Wilco
>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-05-18 13:22               ` Richard Biener
@ 2016-09-13 12:39                 ` Wilco Dijkstra
  2016-09-13 12:46                   ` Oleg Endo
  2016-09-14 13:46                   ` Richard Biener
  0 siblings, 2 replies; 37+ messages in thread
From: Wilco Dijkstra @ 2016-09-13 12:39 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, Jakub Jelinek

Richard Biener <richard.guenther@gmail.com> wrote:
> On Wed, May 18, 2016 at 2:29 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>> Richard Biener wrote:
>>
>>> Yeah ;)  I'm currently bootstrapping/testing the patch that makes it possible to
>>> write all this in match.pd.
>>
>> So what was the conclusion? Improving match.pd to be able to handle more cases
>> like this seems like a nice thing.
>
> I'm stuck with fallout and making this work requires some serious
> thought.  Don't
> hold your breath here :/
>
> The restricted case of strchr (a, 0) -> strlen () can be made working
> more easily
> but I didn't yet try to implement a restriction only allowing the
> cases that would work.
> 
> Meanwhile the strlenopt pass would be an appropriate place to handle
> this transform
> (well, if we now agree on its usefulness).

I'd like to pick this up again so we can make GCC7 a bit less inefficient for this case.
(original thread: https://gcc.gnu.org/ml/gcc-patches/2016-04/msg00870.html)

We've seen several different proposals for where/how to do this simplification, why did you 
say strlenopt is best? It would be an unconditional strchr (a, 0) -> a + strlen (a) rewrite,
ie. completely unrelated to what strlenopt does. We do all the other simplifications based
on constant arguments in builtins.c and gimple-fold.c, why is strchr (s, 0) different?

Wilco


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-13 12:39                 ` Wilco Dijkstra
@ 2016-09-13 12:46                   ` Oleg Endo
  2016-09-14 13:46                   ` Richard Biener
  1 sibling, 0 replies; 37+ messages in thread
From: Oleg Endo @ 2016-09-13 12:46 UTC (permalink / raw)
  To: Wilco Dijkstra, Richard Biener; +Cc: gcc-patches, nd, Jakub Jelinek

On Tue, 2016-09-13 at 12:36 +0000, Wilco Dijkstra wrote:
> Richard Biener <richard.guenther@gmail.com> wrote:
> > 
> > On Wed, May 18, 2016 at 2:29 PM, Wilco Dijkstra <Wilco.Dijkstra@arm
> > .com> wrote:
> > > 
> > > Richard Biener wrote:
> > > 
> > > > 
> > > > Yeah ;)  I'm currently bootstrapping/testing the patch that
> > > > makes it possible to
> > > > write all this in match.pd.
> > > So what was the conclusion? Improving match.pd to be able to
> > > handle more cases
> > > like this seems like a nice thing.
> > I'm stuck with fallout and making this work requires some serious
> > thought.  Don't
> > hold your breath here :/
> > 
> > The restricted case of strchr (a, 0) -> strlen () can be made
> > working
> > more easily
> > but I didn't yet try to implement a restriction only allowing the
> > cases that would work.
> > 
> > Meanwhile the strlenopt pass would be an appropriate place to
> > handle
> > this transform
> > (well, if we now agree on its usefulness).
> I'd like to pick this up again so we can make GCC7 a bit less
> inefficient for this case.
> (original thread: https://gcc.gnu.org/ml/gcc-patches/2016-04/msg00870
> .html)
> 
> We've seen several different proposals for where/how to do this
> simplification, why did you 
> say strlenopt is best? It would be an unconditional strchr (a, 0) ->
> a + strlen (a) rewrite,
> ie. completely unrelated to what strlenopt does. We do all the other
> simplifications based
> on constant arguments in builtins.c and gimple-fold.c, why is strchr
> (s, 0) different?
> 

BTW, there are two PRs for this: 61056 and 32650.  Please take them
into account when committing something for this issue.

Cheers,
Oleg

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-13 12:39                 ` Wilco Dijkstra
  2016-09-13 12:46                   ` Oleg Endo
@ 2016-09-14 13:46                   ` Richard Biener
  2016-09-14 13:55                     ` Jakub Jelinek
  1 sibling, 1 reply; 37+ messages in thread
From: Richard Biener @ 2016-09-14 13:46 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: gcc-patches, nd, Jakub Jelinek

On Tue, Sep 13, 2016 at 2:36 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> Richard Biener <richard.guenther@gmail.com> wrote:
>> On Wed, May 18, 2016 at 2:29 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>>> Richard Biener wrote:
>>>
>>>> Yeah ;)  I'm currently bootstrapping/testing the patch that makes it possible to
>>>> write all this in match.pd.
>>>
>>> So what was the conclusion? Improving match.pd to be able to handle more cases
>>> like this seems like a nice thing.
>>
>> I'm stuck with fallout and making this work requires some serious
>> thought.  Don't
>> hold your breath here :/
>>
>> The restricted case of strchr (a, 0) -> strlen () can be made working
>> more easily
>> but I didn't yet try to implement a restriction only allowing the
>> cases that would work.
>>
>> Meanwhile the strlenopt pass would be an appropriate place to handle
>> this transform
>> (well, if we now agree on its usefulness).
>
> I'd like to pick this up again so we can make GCC7 a bit less inefficient for this case.
> (original thread: https://gcc.gnu.org/ml/gcc-patches/2016-04/msg00870.html)
>
> We've seen several different proposals for where/how to do this simplification, why did you
> say strlenopt is best? It would be an unconditional strchr (a, 0) -> a + strlen (a) rewrite,
> ie. completely unrelated to what strlenopt does. We do all the other simplifications based
> on constant arguments in builtins.c and gimple-fold.c, why is strchr (s, 0) different?

I was thinking about the case where strlen opt already knows strlen
(a).  But sure, gimple-fold.c
works as well.

Richard.

> Wilco
>
>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-14 13:46                   ` Richard Biener
@ 2016-09-14 13:55                     ` Jakub Jelinek
  2016-09-14 17:07                       ` Szabolcs Nagy
  2016-09-15  8:15                       ` Richard Biener
  0 siblings, 2 replies; 37+ messages in thread
From: Jakub Jelinek @ 2016-09-14 13:55 UTC (permalink / raw)
  To: Richard Biener; +Cc: Wilco Dijkstra, gcc-patches, nd

On Wed, Sep 14, 2016 at 03:41:33PM +0200, Richard Biener wrote:
> > We've seen several different proposals for where/how to do this simplification, why did you
> > say strlenopt is best? It would be an unconditional strchr (a, 0) -> a + strlen (a) rewrite,
> > ie. completely unrelated to what strlenopt does. We do all the other simplifications based
> > on constant arguments in builtins.c and gimple-fold.c, why is strchr (s, 0) different?
> 
> I was thinking about the case where strlen opt already knows strlen
> (a).  But sure, gimple-fold.c
> works as well.

I think for the middle-end, using strchr (a, 0) as canonical instead of a + strlen (a)
is better, and at expansion time we can decide what to use (a + strlen (a)
if you'd expand strlen inline, rather than as a function call, or
__rawmemchr (which if libc is sane should be fastest), or strchr, or a + strlen (a)).

	Jakub

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-14 13:55                     ` Jakub Jelinek
@ 2016-09-14 17:07                       ` Szabolcs Nagy
  2016-09-15  8:15                       ` Richard Biener
  1 sibling, 0 replies; 37+ messages in thread
From: Szabolcs Nagy @ 2016-09-14 17:07 UTC (permalink / raw)
  To: Jakub Jelinek, Richard Biener; +Cc: nd, Wilco Dijkstra, gcc-patches

On 14/09/16 14:45, Jakub Jelinek wrote:
> I think for the middle-end, using strchr (a, 0) as canonical instead of a + strlen (a)
> is better, and at expansion time we can decide what to use (a + strlen (a)
> if you'd expand strlen inline, rather than as a function call, or
> __rawmemchr (which if libc is sane should be fastest), or strchr, or a + strlen (a)).
> 

i think the compiler should move away from generating
calls to nonstandard functions, __rawmemchr is much
less used than strlen so it's less likely to be optimized
in certain target/libc combination.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-14 13:55                     ` Jakub Jelinek
  2016-09-14 17:07                       ` Szabolcs Nagy
@ 2016-09-15  8:15                       ` Richard Biener
  2016-09-15  8:28                         ` Jakub Jelinek
  2016-09-15 13:51                         ` Wilco Dijkstra
  1 sibling, 2 replies; 37+ messages in thread
From: Richard Biener @ 2016-09-15  8:15 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Wilco Dijkstra, gcc-patches, nd

On Wed, Sep 14, 2016 at 3:45 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Wed, Sep 14, 2016 at 03:41:33PM +0200, Richard Biener wrote:
>> > We've seen several different proposals for where/how to do this simplification, why did you
>> > say strlenopt is best? It would be an unconditional strchr (a, 0) -> a + strlen (a) rewrite,
>> > ie. completely unrelated to what strlenopt does. We do all the other simplifications based
>> > on constant arguments in builtins.c and gimple-fold.c, why is strchr (s, 0) different?
>>
>> I was thinking about the case where strlen opt already knows strlen
>> (a).  But sure, gimple-fold.c
>> works as well.
>
> I think for the middle-end, using strchr (a, 0) as canonical instead of a + strlen (a)
> is better, and at expansion time we can decide what to use (a + strlen (a)
> if you'd expand strlen inline, rather than as a function call, or
> __rawmemchr (which if libc is sane should be fastest), or strchr, or a + strlen (a)).

OTOH that then argues for doing it in strlenopt because that knows
whether we maybe
already computed strlen (a) (which might have other uses than adding to a).

Richard.

>         Jakub

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-15  8:15                       ` Richard Biener
@ 2016-09-15  8:28                         ` Jakub Jelinek
  2016-09-15 13:51                         ` Wilco Dijkstra
  1 sibling, 0 replies; 37+ messages in thread
From: Jakub Jelinek @ 2016-09-15  8:28 UTC (permalink / raw)
  To: Richard Biener; +Cc: Wilco Dijkstra, gcc-patches, nd

On Thu, Sep 15, 2016 at 09:52:34AM +0200, Richard Biener wrote:
> On Wed, Sep 14, 2016 at 3:45 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> > On Wed, Sep 14, 2016 at 03:41:33PM +0200, Richard Biener wrote:
> >> > We've seen several different proposals for where/how to do this simplification, why did you
> >> > say strlenopt is best? It would be an unconditional strchr (a, 0) -> a + strlen (a) rewrite,
> >> > ie. completely unrelated to what strlenopt does. We do all the other simplifications based
> >> > on constant arguments in builtins.c and gimple-fold.c, why is strchr (s, 0) different?
> >>
> >> I was thinking about the case where strlen opt already knows strlen
> >> (a).  But sure, gimple-fold.c
> >> works as well.
> >
> > I think for the middle-end, using strchr (a, 0) as canonical instead of a + strlen (a)
> > is better, and at expansion time we can decide what to use (a + strlen (a)
> > if you'd expand strlen inline, rather than as a function call, or
> > __rawmemchr (which if libc is sane should be fastest), or strchr, or a + strlen (a)).
> 
> OTOH that then argues for doing it in strlenopt because that knows
> whether we maybe
> already computed strlen (a) (which might have other uses than adding to a).

Sure.

	Jakub

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-15  8:15                       ` Richard Biener
  2016-09-15  8:28                         ` Jakub Jelinek
@ 2016-09-15 13:51                         ` Wilco Dijkstra
  2016-09-15 14:17                           ` Jakub Jelinek
  2016-09-15 14:24                           ` Bernd Schmidt
  1 sibling, 2 replies; 37+ messages in thread
From: Wilco Dijkstra @ 2016-09-15 13:51 UTC (permalink / raw)
  To: Richard Biener, Jakub Jelinek; +Cc: gcc-patches, nd

Richard Biener wrote:
> On Wed, Sep 14, 2016 at 3:45 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> > On Wed, Sep 14, 2016 at 03:41:33PM +0200, Richard Biener wrote:
> >> > We've seen several different proposals for where/how to do this simplification, why did you
> >> > say strlenopt is best? It would be an unconditional strchr (a, 0) -> a + strlen (a) rewrite,
> >> > ie. completely unrelated to what strlenopt does. We do all the other simplifications based
> >> > on constant arguments in builtins.c and gimple-fold.c, why is strchr (s, 0) different?
>>>
> >> I was thinking about the case where strlen opt already knows strlen
> >> (a).  But sure, gimple-fold.c
> >> works as well.
> >
> > I think for the middle-end, using strchr (a, 0) as canonical instead of a + strlen (a)
> > is better, and at expansion time we can decide what to use (a + strlen (a)
> > if you'd expand strlen inline, rather than as a function call, or
> > __rawmemchr (which if libc is sane should be fastest), or strchr, or a + strlen (a)).

__rawmemchr is not the fastest on any target I tried, including x86, so GLIBC's
current default behaviour of always using __rawmemchr is inefficient. GCC doesn't
support __rawmemchr at all, so the strlen optimization can't optimize it.

strchr is significantly slower than strlen, so generating that is a bad idea too.

So the only reasonable optimization is to always emit a + strlen (a).

> OTOH that then argues for doing it in strlenopt because that knows
> whether we maybe
> already computed strlen (a) (which might have other uses than adding to a).

strlenopt can already change strchr (a, 0) into strlen (a) + a when strlen has been
called before it, so that part is already done. However it doesn't optimize a strlen after 
a strchr, so if the expansion is done late you'd end up with a redundant strlen.

That means the expansion would need to happen either before or very early in strlenopt
(likely an extra pass at init time to avoid upsetting the strlen optimization - we want to
treat any strchr as a real strlen).

Wilco

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-15 13:51                         ` Wilco Dijkstra
@ 2016-09-15 14:17                           ` Jakub Jelinek
  2016-09-15 14:29                             ` Szabolcs Nagy
  2016-09-15 14:24                           ` Bernd Schmidt
  1 sibling, 1 reply; 37+ messages in thread
From: Jakub Jelinek @ 2016-09-15 14:17 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: Richard Biener, gcc-patches, nd

On Thu, Sep 15, 2016 at 01:38:45PM +0000, Wilco Dijkstra wrote:
> __rawmemchr is not the fastest on any target I tried, including x86, so GLIBC's
> current default behaviour of always using __rawmemchr is inefficient. GCC doesn't
> support __rawmemchr at all, so the strlen optimization can't optimize it.

Why?  It knows you are targeting glibc, and if it sees rawmemchr (or
__rawmemchr) in the headers as well, it can emit that.
As for speed, there is no inherent reason why rawmemchr should be slower
than strlen, on the contrary.  So, if on some target rawmemchr is slower
than strlen, most likely it has never been implemented there properly, or
somebody improved strlen without bothering to improve rawmemchr at the same
time.

> strchr is significantly slower than strlen, so generating that is a bad idea too.
> 
> So the only reasonable optimization is to always emit a + strlen (a).

I disagree.  Another, on glibc more reasonable, optimization is to make sure
rawmemchr is fast enough.

	Jakub

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-15 13:51                         ` Wilco Dijkstra
  2016-09-15 14:17                           ` Jakub Jelinek
@ 2016-09-15 14:24                           ` Bernd Schmidt
  2016-09-16 13:41                             ` Wilco Dijkstra
  1 sibling, 1 reply; 37+ messages in thread
From: Bernd Schmidt @ 2016-09-15 14:24 UTC (permalink / raw)
  To: Wilco Dijkstra, Richard Biener, Jakub Jelinek; +Cc: gcc-patches, nd

On 09/15/2016 03:38 PM, Wilco Dijkstra wrote:
> __rawmemchr is not the fastest on any target I tried, including x86,

Interesting. Care to share your test program? I just looked at the libc 
sources and strlen/rawmemchr are practically identical code so I'd 
expect any difference to be lost in the noise. Of course there might be 
inlines interfering with the comparison.

> So the only reasonable optimization is to always emit a + strlen (a).

Not sure about "only reasonable" but on the whole I'd agree that it's 
reasonable and we shouldn't let the perfect be the enemy of the good 
here. I'm sure we can come up with lots of different ways to do this but 
let's just pick one and if the one Wilco submitted looks decent let's 
just put it in.

Out of curiousity, is there real-world code that this is intended to 
optimize?


Bernd

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-15 14:17                           ` Jakub Jelinek
@ 2016-09-15 14:29                             ` Szabolcs Nagy
  2016-09-15 14:45                               ` Jakub Jelinek
  0 siblings, 1 reply; 37+ messages in thread
From: Szabolcs Nagy @ 2016-09-15 14:29 UTC (permalink / raw)
  To: Jakub Jelinek, Wilco Dijkstra; +Cc: nd, Richard Biener, gcc-patches

On 15/09/16 14:49, Jakub Jelinek wrote:
> On Thu, Sep 15, 2016 at 01:38:45PM +0000, Wilco Dijkstra wrote:
>> __rawmemchr is not the fastest on any target I tried, including x86, so GLIBC's
>> current default behaviour of always using __rawmemchr is inefficient. GCC doesn't
>> support __rawmemchr at all, so the strlen optimization can't optimize it.
> 
> Why?  It knows you are targeting glibc, and if it sees rawmemchr (or
> __rawmemchr) in the headers as well, it can emit that.
> As for speed, there is no inherent reason why rawmemchr should be slower
> than strlen, on the contrary.  So, if on some target rawmemchr is slower
> than strlen, most likely it has never been implemented there properly, or
> somebody improved strlen without bothering to improve rawmemchr at the same
> time.
> 

from libc point of view, rawmemchr is a rarely used
nonstandard function that should be optimized for size.
(glibc does not do this now, but it should in my opinion.)

in case of static linking strlen is most likely linked
into your binary already, but rawmemchr is not, so
emitting calls to it increases code size.

>> strchr is significantly slower than strlen, so generating that is a bad idea too.
>>
>> So the only reasonable optimization is to always emit a + strlen (a).
> 
> I disagree.  Another, on glibc more reasonable, optimization is to make sure
> rawmemchr is fast enough.
> 

libc should not imlpement n variants of similar functions,
it is a maintainance problem and it makes the code bloated.

(glibc even has asm implementations, which history tells
is a bad idea: string functions had bugs, performance
regressions and other problems because the asm is not
future proof and it is hard to guarantee consistent
behaviour across targets e.g. memchr on x86_64 is broken
right now BZ 19387)

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-15 14:29                             ` Szabolcs Nagy
@ 2016-09-15 14:45                               ` Jakub Jelinek
  2016-09-15 15:04                                 ` Wilco Dijkstra
  0 siblings, 1 reply; 37+ messages in thread
From: Jakub Jelinek @ 2016-09-15 14:45 UTC (permalink / raw)
  To: Szabolcs Nagy; +Cc: Wilco Dijkstra, nd, Richard Biener, gcc-patches

On Thu, Sep 15, 2016 at 03:16:52PM +0100, Szabolcs Nagy wrote:
> On 15/09/16 14:49, Jakub Jelinek wrote:
> > On Thu, Sep 15, 2016 at 01:38:45PM +0000, Wilco Dijkstra wrote:
> >> __rawmemchr is not the fastest on any target I tried, including x86, so GLIBC's
> >> current default behaviour of always using __rawmemchr is inefficient. GCC doesn't
> >> support __rawmemchr at all, so the strlen optimization can't optimize it.
> > 
> > Why?  It knows you are targeting glibc, and if it sees rawmemchr (or
> > __rawmemchr) in the headers as well, it can emit that.
> > As for speed, there is no inherent reason why rawmemchr should be slower
> > than strlen, on the contrary.  So, if on some target rawmemchr is slower
> > than strlen, most likely it has never been implemented there properly, or
> > somebody improved strlen without bothering to improve rawmemchr at the same
> > time.
> > 
> 
> from libc point of view, rawmemchr is a rarely used
> nonstandard function that should be optimized for size.
> (glibc does not do this now, but it should in my opinion.)

rawmemchr with 0 is to strlen conceptually like stpcpy is to strcpy.
Are you arguing that glibc should implement strcpy using stpcpy, or vice
versa?
rawmemchr is certainly not rarely used, strchr (p, 0) is optimized to
__rawmemchr by the glibc header macros, so it is very frequently used.
tree-ssa-strlen.c should be tought to handle these.
I'm not asking musl to implement rawmemchr, but glibc already has it and
should provide an efficient implementation of it (if it doesn't, I really
haven't seen any benchmark results that would say it isn't).

> libc should not imlpement n variants of similar functions,
> it is a maintainance problem and it makes the code bloated.
> 
> (glibc even has asm implementations, which history tells
> is a bad idea: string functions had bugs, performance
> regressions and other problems because the asm is not
> future proof and it is hard to guarantee consistent
> behaviour across targets e.g. memchr on x86_64 is broken
> right now BZ 19387)

I disagree with that.  For some routines like string ops, writing them in
assembly is highly desirable.

	Jakub

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-15 14:45                               ` Jakub Jelinek
@ 2016-09-15 15:04                                 ` Wilco Dijkstra
  2016-09-15 15:28                                   ` Jakub Jelinek
  0 siblings, 1 reply; 37+ messages in thread
From: Wilco Dijkstra @ 2016-09-15 15:04 UTC (permalink / raw)
  To: Szabolcs Nagy, Jakub Jelinek; +Cc: nd, Richard Biener, gcc-patches

Jakub Jelinek wrote:
On Thu, Sep 15, 2016 at 03:16:52PM +0100, Szabolcs Nagy wrote:
> > 
> > from libc point of view, rawmemchr is a rarely used
> > nonstandard function that should be optimized for size.
> > (glibc does not do this now, but it should in my opinion.)
>
> rawmemchr with 0 is to strlen conceptually like stpcpy is to strcpy.
> Are you arguing that glibc should implement strcpy using stpcpy, or vice
> versa?

stpcpy is not conceptually the same, but for mempcpy, yes. By default
it's converted into memcpy in the GLIBC headers and the generic implementation.

stpcpy uses strlen and memcpy which is generally the most efficient version
(it even beat several assembler implementations).

> rawmemchr is certainly not rarely used, strchr (p, 0) is optimized to
>__rawmemchr by the glibc header macros, so it is very frequently used.

That's pretty much its only use, so not an argument for rawmemchr really 
but for optimizing strchr (p, 0) better.

Wilco



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-15 15:04                                 ` Wilco Dijkstra
@ 2016-09-15 15:28                                   ` Jakub Jelinek
  2016-09-15 15:31                                     ` Wilco Dijkstra
  0 siblings, 1 reply; 37+ messages in thread
From: Jakub Jelinek @ 2016-09-15 15:28 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: Szabolcs Nagy, nd, Richard Biener, gcc-patches

On Thu, Sep 15, 2016 at 02:55:48PM +0000, Wilco Dijkstra wrote:
> Jakub Jelinek wrote:
> On Thu, Sep 15, 2016 at 03:16:52PM +0100, Szabolcs Nagy wrote:
> > > 
> > > from libc point of view, rawmemchr is a rarely used
> > > nonstandard function that should be optimized for size.
> > > (glibc does not do this now, but it should in my opinion.)
> >
> > rawmemchr with 0 is to strlen conceptually like stpcpy is to strcpy.
> > Are you arguing that glibc should implement strcpy using stpcpy, or vice
> > versa?
> 
> stpcpy is not conceptually the same, but for mempcpy, yes. By default
> it's converted into memcpy in the GLIBC headers and the generic implementation.
> 
> stpcpy uses strlen and memcpy which is generally the most efficient version
> (it even beat several assembler implementations).

??  I certainly see something completely different, at least on the arches
I've looked at.

	Jakub

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-15 15:28                                   ` Jakub Jelinek
@ 2016-09-15 15:31                                     ` Wilco Dijkstra
  2016-09-15 15:48                                       ` Jakub Jelinek
  0 siblings, 1 reply; 37+ messages in thread
From: Wilco Dijkstra @ 2016-09-15 15:31 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Szabolcs Nagy, nd, Richard Biener, gcc-patches

From: Jakub Jelinek <jakub@redhat.com>
> On Thu, Sep 15, 2016 at 02:55:48PM +0000, Wilco Dijkstra wrote:
>> stpcpy is not conceptually the same, but for mempcpy, yes. By default
>> it's converted into memcpy in the GLIBC headers and the generic implementation.
>> 
>> stpcpy uses strlen and memcpy which is generally the most efficient version
>> (it even beat several assembler implementations).
>
> ??  I certainly see something completely different, at least on the arches
> I've looked at.

glibc/string/string.h contains:

#if defined __USE_GNU && defined __OPTIMIZE__ \
    && defined __extern_always_inline && __GNUC_PREREQ (3,2)
# if !defined _FORCE_INLINES && !defined _HAVE_STRING_ARCH_mempcpy

#define mempcpy(dest, src, n) __mempcpy_inline (dest, src, n)
#define __mempcpy(dest, src, n) __mempcpy_inline (dest, src, n)

__extern_always_inline void *
__mempcpy_inline (void *__restrict __dest,
                  const void *__restrict __src, size_t __n)
{
  return (char *) memcpy (__dest, __src, __n) + __n;
}

That's best done GCC as a general optimization as currently mempcpy is not 
handled efficiently (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70140),
and it avoids having to repeat this for every C library out there...

glibc/string/mempcpy.c:

void *
MEMPCPY (void *dest, const void *src, size_t len)
{
  return memcpy (dest, src, len) + len;
}

And glibc/string/stpcpy.c:

char *
STPCPY (char *dest, const char *src)
{
  size_t len = strlen (src);
  return memcpy (dest, src, len + 1) + len;
}

This means that without having to write any assembly code, by default 
mempcpy, stpcpy etc are as efficient as possible (memcpy and strlen are 
optimized well on all targets, that's not true for mempcpy, stpcpy and similar
functions, and to make matters worse, the generic code used to be very inefficient).

Wilco

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-15 15:31                                     ` Wilco Dijkstra
@ 2016-09-15 15:48                                       ` Jakub Jelinek
  2016-09-15 16:46                                         ` Wilco Dijkstra
  0 siblings, 1 reply; 37+ messages in thread
From: Jakub Jelinek @ 2016-09-15 15:48 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: Szabolcs Nagy, nd, Richard Biener, gcc-patches

On Thu, Sep 15, 2016 at 03:27:52PM +0000, Wilco Dijkstra wrote:
> That's best done GCC as a general optimization as currently mempcpy is not 
> handled efficiently (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70140),
> and it avoids having to repeat this for every C library out there...
> 
> glibc/string/mempcpy.c:
> 
> void *
> MEMPCPY (void *dest, const void *src, size_t len)
> {
>   return memcpy (dest, src, len) + len;
> }
> 
> And glibc/string/stpcpy.c:
> 
> char *
> STPCPY (char *dest, const char *src)
> {
>   size_t len = strlen (src);
>   return memcpy (dest, src, len + 1) + len;
> }

Those are the generic definitions, all targets that care about performance
obviously should replace them with assembly code.

	Jakub

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-15 15:48                                       ` Jakub Jelinek
@ 2016-09-15 16:46                                         ` Wilco Dijkstra
  0 siblings, 0 replies; 37+ messages in thread
From: Wilco Dijkstra @ 2016-09-15 16:46 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Szabolcs Nagy, nd, Richard Biener, gcc-patches

Jakub Jelinek <jakub@redhat.com>
>
> Those are the generic definitions, all targets that care about performance
> obviously should replace them with assembly code.

No, that's exactly my point, it is not true that it is always best to write assembly
code. For example there is absolutely no benefit in writing an optimized mempcpy. 
At best it is as fast as memcpy, and therefore expanding mempcpy into
memcpy (p, q, n) + n would have the same performance. In actual use memcpy
will then be slightly faster due to lower I-cache pressure.

Wilco

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-15 14:24                           ` Bernd Schmidt
@ 2016-09-16 13:41                             ` Wilco Dijkstra
  2016-09-16 15:37                               ` Jeff Law
  0 siblings, 1 reply; 37+ messages in thread
From: Wilco Dijkstra @ 2016-09-16 13:41 UTC (permalink / raw)
  To: Bernd Schmidt, Richard Biener, Jakub Jelinek; +Cc: gcc-patches, nd

Bernd Schmidt wrote:
> On 09/15/2016 03:38 PM, Wilco Dijkstra wrote:
> > __rawmemchr is not the fastest on any target I tried, including x86,
>
> Interesting. Care to share your test program? I just looked at the libc 
> sources and strlen/rawmemchr are practically identical code so I'd 
> expect any difference to be lost in the noise. Of course there might be 
> inlines interfering with the comparison.

It's glibc/benchtests/bench-strlen.c slightly modified to compare strlen,
rawmemchr and strchr. Even if they appear identical the inner loop of strlen
is much faster than strchr and rawmemchr at larger sizes:

                                                          strchr          rawmemchr          strlen
Length 4096, alignment 12:      3.35132e+06     2.39842e+06     1.88962e+06

> > So the only reasonable optimization is to always emit a + strlen (a).
>
> Not sure about "only reasonable" but on the whole I'd agree that it's 
> reasonable and we shouldn't let the perfect be the enemy of the good 
> here. I'm sure we can come up with lots of different ways to do this but 
> let's just pick one and if the one Wilco submitted looks decent let's 
> just put it in.
> 
> Out of curiousity, is there real-world code that this is intended to 
> optimize?

I noticed rawmemchr taking non-trivial amounts of time in various profiles
despite no use of rawmemchr in any of the source code. It's apparently a
common idiom to use strchr (s, 0) to find the end of a string. Given strchr is
slower than strlen, it is changed to rawmemchr by GLIBC headers. However
this makes things even slower since few targets have an optimized rawmemchr,
and for targets that do, strlen is faster.

So this is one of many improvements to ensure GCC/GLIBC by default do 
optimizations in a way that is best for most targets. If a particular target
wants to do something different that is always possible of course.

Wilco

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] Optimize strchr (s, 0) to strlen
  2016-09-16 13:41                             ` Wilco Dijkstra
@ 2016-09-16 15:37                               ` Jeff Law
  0 siblings, 0 replies; 37+ messages in thread
From: Jeff Law @ 2016-09-16 15:37 UTC (permalink / raw)
  To: Wilco Dijkstra, Bernd Schmidt, Richard Biener, Jakub Jelinek
  Cc: gcc-patches, nd

On 09/16/2016 07:28 AM, Wilco Dijkstra wrote:
>
> I noticed rawmemchr taking non-trivial amounts of time in various profiles
> despite no use of rawmemchr in any of the source code. It's apparently a
> common idiom to use strchr (s, 0) to find the end of a string.
A bit of a surprise, but it is what it is I guess.

jeff


^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2016-09-16 15:27 UTC | newest]

Thread overview: 37+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-18 17:01 [PATCH] Optimize strchr (s, 0) to strlen Wilco Dijkstra
2016-04-18 17:09 ` Jakub Jelinek
2016-04-18 17:31   ` Wilco Dijkstra
2016-04-18 22:55 ` Oleg Endo
2016-04-19  8:14 ` Richard Biener
2016-04-19 16:33   ` Wilco Dijkstra
2016-04-20  8:45     ` Richard Biener
2016-04-20  9:44       ` Richard Biener
2016-04-20 10:33         ` Jakub Jelinek
2016-04-20 10:50           ` Richard Biener
2016-04-20 11:17           ` Wilco Dijkstra
2016-04-20 13:01             ` Jakub Jelinek
2016-04-20 14:18               ` Wilco Dijkstra
2016-04-20 11:57       ` Wilco Dijkstra
2016-04-20 11:59         ` Richard Biener
2016-05-04 12:54           ` Wilco Dijkstra
2016-05-18 12:29             ` Wilco Dijkstra
2016-05-18 13:22               ` Richard Biener
2016-09-13 12:39                 ` Wilco Dijkstra
2016-09-13 12:46                   ` Oleg Endo
2016-09-14 13:46                   ` Richard Biener
2016-09-14 13:55                     ` Jakub Jelinek
2016-09-14 17:07                       ` Szabolcs Nagy
2016-09-15  8:15                       ` Richard Biener
2016-09-15  8:28                         ` Jakub Jelinek
2016-09-15 13:51                         ` Wilco Dijkstra
2016-09-15 14:17                           ` Jakub Jelinek
2016-09-15 14:29                             ` Szabolcs Nagy
2016-09-15 14:45                               ` Jakub Jelinek
2016-09-15 15:04                                 ` Wilco Dijkstra
2016-09-15 15:28                                   ` Jakub Jelinek
2016-09-15 15:31                                     ` Wilco Dijkstra
2016-09-15 15:48                                       ` Jakub Jelinek
2016-09-15 16:46                                         ` Wilco Dijkstra
2016-09-15 14:24                           ` Bernd Schmidt
2016-09-16 13:41                             ` Wilco Dijkstra
2016-09-16 15:37                               ` Jeff Law

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).