public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* calloc = malloc + memset
@ 2014-02-28 22:48 Marc Glisse
  2014-03-01 12:26 ` Paolo Carlini
                   ` (3 more replies)
  0 siblings, 4 replies; 26+ messages in thread
From: Marc Glisse @ 2014-02-28 22:48 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: TEXT/PLAIN, Size: 437 bytes --]

Hello,

this is a stage 1 patch, and I'll ping it then, but if you have comments 
now...

Passes bootstrap+testsuite on x86_64-linux-gnu.

2014-02-28  Marc Glisse  <marc.glisse@inria.fr>

 	PR tree-optimization/57742
gcc/
 	* tree-ssa-forwprop.c (simplify_malloc_memset): New function.
 	(simplify_builtin_call): Call it.
gcc/testsuite/
 	* g++.dg/tree-ssa/calloc.C: New testcase.
 	* gcc.dg/tree-ssa/calloc.c: Likewise.

-- 
Marc Glisse

[-- Attachment #2: Type: TEXT/PLAIN, Size: 8810 bytes --]

Index: gcc/testsuite/g++.dg/tree-ssa/calloc.C
===================================================================
--- gcc/testsuite/g++.dg/tree-ssa/calloc.C	(revision 0)
+++ gcc/testsuite/g++.dg/tree-ssa/calloc.C	(working copy)
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-std=gnu++11 -O3 -fdump-tree-optimized" } */
+
+#include <new>
+#include <vector>
+#include <cstdlib>
+
+void g(void*);
+inline void* operator new(std::size_t sz) _GLIBCXX_THROW (std::bad_alloc)
+{
+  void *p;
+
+  if (sz == 0)
+    sz = 1;
+
+  // Slightly modified from the libsupc++ version, that one has 2 calls
+  // to malloc which makes it too hard to optimize.
+  while ((p = std::malloc (sz)) == 0)
+    {
+      std::new_handler handler = std::get_new_handler ();
+      if (! handler)
+        _GLIBCXX_THROW_OR_ABORT(std::bad_alloc());
+      handler ();
+    }
+  return p;
+}
+
+void f(void*p,int n){
+  new(p)std::vector<int>(n);
+}
+
+/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */

Property changes on: gcc/testsuite/g++.dg/tree-ssa/calloc.C
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+Author Date Id Revision URL
\ No newline at end of property
Index: gcc/testsuite/gcc.dg/tree-ssa/calloc.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/calloc.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/calloc.c	(working copy)
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#include <stdlib.h>
+#include <string.h>
+
+extern int a;
+extern int* b;
+int n;
+void* f(long*q){
+  int*p=malloc(n);
+  ++*q;
+  if(p){
+    ++*q;
+    a=2;
+    memset(p,0,n);
+    *b=3;
+  }
+  return p;
+}
+void* g(void){
+  float*p=calloc(8,4);
+  return memset(p,0,32);
+}
+
+/* { dg-final { scan-tree-dump-times "calloc" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */

Property changes on: gcc/testsuite/gcc.dg/tree-ssa/calloc.c
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+Author Date Id Revision URL
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: gcc/tree-ssa-forwprop.c
===================================================================
--- gcc/tree-ssa-forwprop.c	(revision 208224)
+++ gcc/tree-ssa-forwprop.c	(working copy)
@@ -1487,20 +1487,149 @@ constant_pointer_difference (tree p1, tr
     }
 
   for (i = 0; i < cnt[0]; i++)
     for (j = 0; j < cnt[1]; j++)
       if (exps[0][i] == exps[1][j])
 	return size_binop (MINUS_EXPR, offs[0][i], offs[1][j]);
 
   return NULL_TREE;
 }
 
+/* Optimize
+   ptr = malloc (n);
+   memset (ptr, 0, n);
+   into
+   ptr = calloc (n);
+   gsi_p is known to point to a call to __builtin_memset.  */
+static bool
+simplify_malloc_memset (gimple_stmt_iterator *gsi_p)
+{
+  /* First make sure we have:
+     ptr = malloc (n);
+     memset (ptr, 0, n);  */
+  gimple stmt2 = gsi_stmt (*gsi_p);
+  if (!integer_zerop (gimple_call_arg (stmt2, 1)))
+    return false;
+  tree ptr1, ptr2 = gimple_call_arg (stmt2, 0);
+  tree size = gimple_call_arg (stmt2, 2);
+  if (TREE_CODE (ptr2) != SSA_NAME) 
+    return false;
+  gimple stmt1 = SSA_NAME_DEF_STMT (ptr2);
+  tree callee1;
+  /* Handle the case where STMT1 is a unary PHI, which happends
+     for instance with:
+     while (!(p = malloc (n))) { ... }
+     memset (p, 0, n);  */
+  if (!stmt1)
+    return false;
+  if (gimple_code (stmt1) == GIMPLE_PHI
+      && gimple_phi_num_args (stmt1) == 1)
+    {
+      ptr1 = gimple_phi_arg_def (stmt1, 0);
+      if (TREE_CODE (ptr1) != SSA_NAME)
+	return false;
+      stmt1 = SSA_NAME_DEF_STMT (ptr1);
+    }
+  else
+    ptr1 = ptr2;
+  if (!stmt1
+      || !is_gimple_call (stmt1)
+      || !(callee1 = gimple_call_fndecl (stmt1)))
+    return false;
+
+  bool is_calloc;
+  if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MALLOC)
+    {
+      is_calloc = false;
+      if (!operand_equal_p (gimple_call_arg (stmt1, 0), size, 0))
+	return false;
+    }
+  else if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_CALLOC)
+    {
+      is_calloc = true;
+      tree arg1 = gimple_call_arg (stmt1, 0);
+      tree arg2 = gimple_call_arg (stmt1, 1);
+      tree size1 = fold_build2 (MULT_EXPR, TREE_TYPE (size), arg1, arg2);
+      if (!operand_equal_p (size1, size, 0))
+	return false;
+      /* We could look at SSA_NAME_DEF_STMT (size), but there can be many casts
+	 mixed with the MULT_EXPR which makes it hard to match with size1.  */
+    }
+  else
+    return false;
+
+  /* We only allow two simple CFG forms for now. Either stmt1 and stmt2 are
+     in the same BB (in this order), or BB1 (malloc) ends with:
+     if (ptr) goto BB2 (memset)  */
+  basic_block bb1 = gimple_bb (stmt1);
+  basic_block bb2 = gimple_bb (stmt2);
+  if (bb1 != bb2)
+    {
+      if (!single_pred_p (bb2) || single_pred (bb2) != bb1)
+	return false;
+      gimple cond = last_stmt (bb1);
+      if (gimple_code (cond) != GIMPLE_COND
+	  || !integer_zerop (gimple_cond_rhs (cond))
+	  || gimple_cond_lhs (cond) != ptr1)
+	return false;
+      int branch;
+      tree_code comp = gimple_cond_code (cond);
+      if (comp == NE_EXPR)
+	branch = EDGE_TRUE_VALUE;
+      else if (comp == EQ_EXPR)
+	branch = EDGE_FALSE_VALUE;
+      else
+	return false;
+      edge e;
+      edge_iterator ei;
+      FOR_EACH_EDGE (e, ei, bb1->succs)
+	if ((e->flags & branch) && e->dest != bb2)
+	  return false;
+    }
+  
+  /* Finally, make sure the memory is not used before stmt2.  */
+  ao_ref ref;
+  ao_ref_init_from_ptr_and_size (&ref, ptr1, size);
+  tree vdef = gimple_vuse (stmt2);
+  if (vdef == NULL)
+    return false;
+  while (true)
+    {
+      gimple cur = SSA_NAME_DEF_STMT (vdef);
+      if (cur == stmt1) break;
+      if (stmt_may_clobber_ref_p_1 (cur, &ref))
+	return false;
+      vdef = gimple_vuse (cur);
+    }
+
+  /* Replace malloc with calloc and remove memset.  */
+  if (!is_calloc)
+    {
+      gimple_stmt_iterator gsi = gsi_for_stmt (stmt1);
+      update_gimple_call (&gsi, builtin_decl_implicit (BUILT_IN_CALLOC), 2,
+			  size, build_one_cst (size_type_node));
+    }
+  tree lhs = gimple_call_lhs (stmt2);
+  unlink_stmt_vdef (stmt2);
+  if (lhs)
+    {
+      gimple assign = gimple_build_assign (lhs, ptr2);
+      gsi_replace (gsi_p, assign, false);
+    }
+  else
+    {
+      gsi_remove (gsi_p, true);
+      release_defs (stmt2);
+    }
+  return true;
+}
+
 /* *GSI_P is a GIMPLE_CALL to a builtin function.
    Optimize
    memcpy (p, "abcd", 4);
    memset (p + 4, ' ', 3);
    into
    memcpy (p, "abcd   ", 7);
    call if the latter can be stored by pieces during expansion.  */
 
 static bool
 simplify_builtin_call (gimple_stmt_iterator *gsi_p, tree callee2)
@@ -1508,38 +1637,44 @@ simplify_builtin_call (gimple_stmt_itera
   gimple stmt1, stmt2 = gsi_stmt (*gsi_p);
   tree vuse = gimple_vuse (stmt2);
   if (vuse == NULL)
     return false;
   stmt1 = SSA_NAME_DEF_STMT (vuse);
 
   switch (DECL_FUNCTION_CODE (callee2))
     {
     case BUILT_IN_MEMSET:
       if (gimple_call_num_args (stmt2) != 3
-	  || gimple_call_lhs (stmt2)
 	  || CHAR_BIT != 8
 	  || BITS_PER_UNIT != 8)
 	break;
       else
 	{
+	  if (simplify_malloc_memset (gsi_p))
+	    return true;
+
 	  tree callee1;
 	  tree ptr1, src1, str1, off1, len1, lhs1;
 	  tree ptr2 = gimple_call_arg (stmt2, 0);
 	  tree val2 = gimple_call_arg (stmt2, 1);
 	  tree len2 = gimple_call_arg (stmt2, 2);
 	  tree diff, vdef, new_str_cst;
 	  gimple use_stmt;
 	  unsigned int ptr1_align;
 	  unsigned HOST_WIDE_INT src_len;
 	  char *src_buf;
 	  use_operand_p use_p;
 
+	  /* Not implemented yet.  */
+	  if (gimple_call_lhs (stmt2))
+	    break;
+
 	  if (!tree_fits_shwi_p (val2)
 	      || !tree_fits_uhwi_p (len2))
 	    break;
 	  if (is_gimple_call (stmt1))
 	    {
 	      /* If first stmt is a call, it needs to be memcpy
 		 or mempcpy, with string literal as second argument and
 		 constant length.  */
 	      callee1 = gimple_call_fndecl (stmt1);
 	      if (callee1 == NULL_TREE

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-02-28 22:48 calloc = malloc + memset Marc Glisse
@ 2014-03-01 12:26 ` Paolo Carlini
  2014-03-01 14:51   ` Marc Glisse
  2014-03-03  9:50 ` Richard Biener
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 26+ messages in thread
From: Paolo Carlini @ 2014-03-01 12:26 UTC (permalink / raw)
  To: Marc Glisse; +Cc: gcc-patches

Hi

> On 28/feb/2014, at 23:48, Marc Glisse <marc.glisse@inria.fr> wrote:
> 
> Hello,
> 
> this is a stage 1 patch, and I'll ping it then, but if you have comments now...
> 
> Passes bootstrap+testsuite on x86_64-linux-gnu.
> 
> 2014-02-28  Marc Glisse  <marc.glisse@inria.fr>
> 
>    PR tree-optimization/57742
> gcc/
>    * tree-ssa-forwprop.c (simplify_malloc_memset): New function.
>    (simplify_builtin_call): Call it.
> gcc/testsuite/
>    * g++.dg/tree-ssa/calloc.C: New testcase.
>    * gcc.dg/tree-ssa/calloc.c: Likewise.
> 
> -- 
> Marc Glisse
> Index: gcc/testsuite/g++.dg/tree-ssa/calloc.C
> ===================================================================
> --- gcc/testsuite/g++.dg/tree-ssa/calloc.C    (revision 0)
> +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C    (working copy)
> @@ -0,0 +1,35 @@
> +/* { dg-do compile } */
> +/* { dg-options "-std=gnu++11 -O3 -fdump-tree-optimized" } */
> +
> +#include <new>
> +#include <vector>
> +#include <cstdlib>
> +
> +void g(void*);
> +inline void* operator new(std::size_t sz) _GLIBCXX_THROW (std::bad_alloc)

Unless *really* necessary I would recommend not including the large <vector> (that also couples quite seriously the front-end testsuite to the library testsuite, we already discussed those topics in the past). Using the internal macros seems also unnecessary.

Paolo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-03-01 12:26 ` Paolo Carlini
@ 2014-03-01 14:51   ` Marc Glisse
  0 siblings, 0 replies; 26+ messages in thread
From: Marc Glisse @ 2014-03-01 14:51 UTC (permalink / raw)
  To: Paolo Carlini; +Cc: gcc-patches

On Sat, 1 Mar 2014, Paolo Carlini wrote:

> Hi
>
>> On 28/feb/2014, at 23:48, Marc Glisse <marc.glisse@inria.fr> wrote:
>>
>> Hello,
>>
>> this is a stage 1 patch, and I'll ping it then, but if you have comments now...
>>
>> Passes bootstrap+testsuite on x86_64-linux-gnu.
>>
>> 2014-02-28  Marc Glisse  <marc.glisse@inria.fr>
>>
>>    PR tree-optimization/57742
>> gcc/
>>    * tree-ssa-forwprop.c (simplify_malloc_memset): New function.
>>    (simplify_builtin_call): Call it.
>> gcc/testsuite/
>>    * g++.dg/tree-ssa/calloc.C: New testcase.
>>    * gcc.dg/tree-ssa/calloc.c: Likewise.
>>
>> --
>> Marc Glisse
>> Index: gcc/testsuite/g++.dg/tree-ssa/calloc.C
>> ===================================================================
>> --- gcc/testsuite/g++.dg/tree-ssa/calloc.C    (revision 0)
>> +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C    (working copy)
>> @@ -0,0 +1,35 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-std=gnu++11 -O3 -fdump-tree-optimized" } */
>> +
>> +#include <new>
>> +#include <vector>
>> +#include <cstdlib>
>> +
>> +void g(void*);
>> +inline void* operator new(std::size_t sz) _GLIBCXX_THROW (std::bad_alloc)
>
> Unless *really* necessary I would recommend not including the large 
> <vector> (that also couples quite seriously the front-end testsuite to 
> the library testsuite, we already discussed those topics in the past). 
> Using the internal macros seems also unnecessary.

I think it might be the first time I include large headers in a compiler 
testcase (note that there are already 16 other testcases including 
<vector> in g++.dg). In this case, it seems to be what I want to test 
though. I already have some elementary tests in gcc.dg. This testcase is 
the original motivation for working on this. It requires a combination of 
quite a few optimizations (inlining, recognizing that a loop is a memset, 
aliasing, this optimization (the complicated version with a PHI node)), 
and I want to test that we won't for instance shuffle the passes in a way 
that breaks it. Also, if the library changes vector enough that this 
doesn't optimize anymore, I want to know about it, either the library 
change was wrong or the middle-end needs to improve some optimization 
before the next release.


I wanted to keep the implementation of new as close to the one in 
libsupc++ as possible (mimic LTO), so I copy-pasted (and slightly edited, 
I may propose a patch to libsupc++ later). I agree that I should remove 
the exception specification (since I am compiling in C++11 to have access 
to get_new_handler) and replace _GLIBCXX_THROW_OR_ABORT with just throw, 
and I just did it locally, thanks.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-02-28 22:48 calloc = malloc + memset Marc Glisse
  2014-03-01 12:26 ` Paolo Carlini
@ 2014-03-03  9:50 ` Richard Biener
  2014-03-03 13:45   ` Marc Glisse
  2014-03-23 19:47   ` Marc Glisse
  2014-06-23 18:20 ` Andi Kleen
  2014-07-15 12:38 ` Bernd Schmidt
  3 siblings, 2 replies; 26+ messages in thread
From: Richard Biener @ 2014-03-03  9:50 UTC (permalink / raw)
  To: Marc Glisse; +Cc: GCC Patches

On Fri, Feb 28, 2014 at 11:48 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> Hello,
>
> this is a stage 1 patch, and I'll ping it then, but if you have comments
> now...
>
> Passes bootstrap+testsuite on x86_64-linux-gnu.
>
> 2014-02-28  Marc Glisse  <marc.glisse@inria.fr>
>
>         PR tree-optimization/57742
> gcc/
>         * tree-ssa-forwprop.c (simplify_malloc_memset): New function.
>         (simplify_builtin_call): Call it.
> gcc/testsuite/
>         * g++.dg/tree-ssa/calloc.C: New testcase.
>         * gcc.dg/tree-ssa/calloc.c: Likewise.
>
> --
> Marc Glisse
> Index: gcc/testsuite/g++.dg/tree-ssa/calloc.C
> ===================================================================
> --- gcc/testsuite/g++.dg/tree-ssa/calloc.C      (revision 0)
> +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C      (working copy)
> @@ -0,0 +1,35 @@
> +/* { dg-do compile } */
> +/* { dg-options "-std=gnu++11 -O3 -fdump-tree-optimized" } */
> +
> +#include <new>
> +#include <vector>
> +#include <cstdlib>
> +
> +void g(void*);
> +inline void* operator new(std::size_t sz) _GLIBCXX_THROW (std::bad_alloc)
> +{
> +  void *p;
> +
> +  if (sz == 0)
> +    sz = 1;
> +
> +  // Slightly modified from the libsupc++ version, that one has 2 calls
> +  // to malloc which makes it too hard to optimize.
> +  while ((p = std::malloc (sz)) == 0)
> +    {
> +      std::new_handler handler = std::get_new_handler ();
> +      if (! handler)
> +        _GLIBCXX_THROW_OR_ABORT(std::bad_alloc());
> +      handler ();
> +    }
> +  return p;
> +}
> +
> +void f(void*p,int n){
> +  new(p)std::vector<int>(n);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
>
> Property changes on: gcc/testsuite/g++.dg/tree-ssa/calloc.C
> ___________________________________________________________________
> Added: svn:eol-style
> ## -0,0 +1 ##
> +native
> \ No newline at end of property
> Added: svn:keywords
> ## -0,0 +1 ##
> +Author Date Id Revision URL
> \ No newline at end of property
> Index: gcc/testsuite/gcc.dg/tree-ssa/calloc.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/calloc.c      (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/calloc.c      (working copy)
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#include <stdlib.h>
> +#include <string.h>
> +
> +extern int a;
> +extern int* b;
> +int n;
> +void* f(long*q){
> +  int*p=malloc(n);
> +  ++*q;
> +  if(p){
> +    ++*q;
> +    a=2;
> +    memset(p,0,n);
> +    *b=3;
> +  }
> +  return p;
> +}
> +void* g(void){
> +  float*p=calloc(8,4);
> +  return memset(p,0,32);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "calloc" 2 "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
>
> Property changes on: gcc/testsuite/gcc.dg/tree-ssa/calloc.c
> ___________________________________________________________________
> Added: svn:keywords
> ## -0,0 +1 ##
> +Author Date Id Revision URL
> \ No newline at end of property
> Added: svn:eol-style
> ## -0,0 +1 ##
> +native
> \ No newline at end of property
> Index: gcc/tree-ssa-forwprop.c
> ===================================================================
> --- gcc/tree-ssa-forwprop.c     (revision 208224)
> +++ gcc/tree-ssa-forwprop.c     (working copy)
> @@ -1487,20 +1487,149 @@ constant_pointer_difference (tree p1, tr
>      }
>
>    for (i = 0; i < cnt[0]; i++)
>      for (j = 0; j < cnt[1]; j++)
>        if (exps[0][i] == exps[1][j])
>         return size_binop (MINUS_EXPR, offs[0][i], offs[1][j]);
>
>    return NULL_TREE;
>  }
>
> +/* Optimize
> +   ptr = malloc (n);
> +   memset (ptr, 0, n);
> +   into
> +   ptr = calloc (n);
> +   gsi_p is known to point to a call to __builtin_memset.  */
> +static bool
> +simplify_malloc_memset (gimple_stmt_iterator *gsi_p)
> +{
> +  /* First make sure we have:
> +     ptr = malloc (n);
> +     memset (ptr, 0, n);  */
> +  gimple stmt2 = gsi_stmt (*gsi_p);
> +  if (!integer_zerop (gimple_call_arg (stmt2, 1)))
> +    return false;
> +  tree ptr1, ptr2 = gimple_call_arg (stmt2, 0);
> +  tree size = gimple_call_arg (stmt2, 2);
> +  if (TREE_CODE (ptr2) != SSA_NAME)
> +    return false;
> +  gimple stmt1 = SSA_NAME_DEF_STMT (ptr2);
> +  tree callee1;
> +  /* Handle the case where STMT1 is a unary PHI, which happends
> +     for instance with:
> +     while (!(p = malloc (n))) { ... }
> +     memset (p, 0, n);  */
> +  if (!stmt1)
> +    return false;
> +  if (gimple_code (stmt1) == GIMPLE_PHI
> +      && gimple_phi_num_args (stmt1) == 1)
> +    {
> +      ptr1 = gimple_phi_arg_def (stmt1, 0);
> +      if (TREE_CODE (ptr1) != SSA_NAME)
> +       return false;
> +      stmt1 = SSA_NAME_DEF_STMT (ptr1);
> +    }
> +  else
> +    ptr1 = ptr2;
> +  if (!stmt1
> +      || !is_gimple_call (stmt1)
> +      || !(callee1 = gimple_call_fndecl (stmt1)))
> +    return false;

That's a bit much of ad-hoc pattern-matching ... wouldn't be
p = malloc (n);
memset (p, 0, n);

transform better suited to the strlen opt pass?  After all that tracks
what 'string' is associated with a SSA name pointer through
arbitrary satements using a lattice.

> +  bool is_calloc;
> +  if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MALLOC)
> +    {
> +      is_calloc = false;
> +      if (!operand_equal_p (gimple_call_arg (stmt1, 0), size, 0))
> +       return false;
> +    }
> +  else if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_CALLOC)
> +    {
> +      is_calloc = true;
> +      tree arg1 = gimple_call_arg (stmt1, 0);
> +      tree arg2 = gimple_call_arg (stmt1, 1);
> +      tree size1 = fold_build2 (MULT_EXPR, TREE_TYPE (size), arg1, arg2);
> +      if (!operand_equal_p (size1, size, 0))
> +       return false;
> +      /* We could look at SSA_NAME_DEF_STMT (size), but there can be many
> casts
> +        mixed with the MULT_EXPR which makes it hard to match with size1.
> */

The same probably applies to calloc(); memset (, 0,); though here you
could even match points-to info (after all even only clearing a portion of
the calloc()ed memory is dead code).  If points-to conservatively computes
that the memset pointer only points to null or the memory tag the
calloc return value points to then you can discard it without further
checking ...

> +    }
> +  else
> +    return false;
> +
> +  /* We only allow two simple CFG forms for now. Either stmt1 and stmt2 are
> +     in the same BB (in this order), or BB1 (malloc) ends with:
> +     if (ptr) goto BB2 (memset)  */
> +  basic_block bb1 = gimple_bb (stmt1);
> +  basic_block bb2 = gimple_bb (stmt2);
> +  if (bb1 != bb2)
> +    {
> +      if (!single_pred_p (bb2) || single_pred (bb2) != bb1)
> +       return false;
> +      gimple cond = last_stmt (bb1);
> +      if (gimple_code (cond) != GIMPLE_COND
> +         || !integer_zerop (gimple_cond_rhs (cond))
> +         || gimple_cond_lhs (cond) != ptr1)
> +       return false;
> +      int branch;
> +      tree_code comp = gimple_cond_code (cond);
> +      if (comp == NE_EXPR)
> +       branch = EDGE_TRUE_VALUE;
> +      else if (comp == EQ_EXPR)
> +       branch = EDGE_FALSE_VALUE;
> +      else
> +       return false;
> +      edge e;
> +      edge_iterator ei;
> +      FOR_EACH_EDGE (e, ei, bb1->succs)
> +       if ((e->flags & branch) && e->dest != bb2)
> +         return false;
> +    }

CFG matching in forwprop ... ehhh.  This really asks for integrating
this with a propagation engine.

> +  /* Finally, make sure the memory is not used before stmt2.  */
> +  ao_ref ref;
> +  ao_ref_init_from_ptr_and_size (&ref, ptr1, size);
> +  tree vdef = gimple_vuse (stmt2);
> +  if (vdef == NULL)
> +    return false;
> +  while (true)
> +    {
> +      gimple cur = SSA_NAME_DEF_STMT (vdef);
> +      if (cur == stmt1) break;
> +      if (stmt_may_clobber_ref_p_1 (cur, &ref))
> +       return false;
> +      vdef = gimple_vuse (cur);
> +    }

We have walk_aliased_vdefs() for this.

That said, please try to integrate this kind of transforms with
the strlen opt pass (even if it requires making its lattice more generic).
The memset removal even looks like a candidate for DSE.

Thanks,
Richard.

> +  /* Replace malloc with calloc and remove memset.  */
> +  if (!is_calloc)
> +    {
> +      gimple_stmt_iterator gsi = gsi_for_stmt (stmt1);
> +      update_gimple_call (&gsi, builtin_decl_implicit (BUILT_IN_CALLOC), 2,
> +                         size, build_one_cst (size_type_node));
> +    }
> +  tree lhs = gimple_call_lhs (stmt2);
> +  unlink_stmt_vdef (stmt2);
> +  if (lhs)
> +    {
> +      gimple assign = gimple_build_assign (lhs, ptr2);
> +      gsi_replace (gsi_p, assign, false);
> +    }
> +  else
> +    {
> +      gsi_remove (gsi_p, true);
> +      release_defs (stmt2);
> +    }
> +  return true;
> +}
> +
>  /* *GSI_P is a GIMPLE_CALL to a builtin function.
>     Optimize
>     memcpy (p, "abcd", 4);
>     memset (p + 4, ' ', 3);
>     into
>     memcpy (p, "abcd   ", 7);
>     call if the latter can be stored by pieces during expansion.  */
>
>  static bool
>  simplify_builtin_call (gimple_stmt_iterator *gsi_p, tree callee2)
> @@ -1508,38 +1637,44 @@ simplify_builtin_call (gimple_stmt_itera
>    gimple stmt1, stmt2 = gsi_stmt (*gsi_p);
>    tree vuse = gimple_vuse (stmt2);
>    if (vuse == NULL)
>      return false;
>    stmt1 = SSA_NAME_DEF_STMT (vuse);
>
>    switch (DECL_FUNCTION_CODE (callee2))
>      {
>      case BUILT_IN_MEMSET:
>        if (gimple_call_num_args (stmt2) != 3
> -         || gimple_call_lhs (stmt2)
>           || CHAR_BIT != 8
>           || BITS_PER_UNIT != 8)
>         break;
>        else
>         {
> +         if (simplify_malloc_memset (gsi_p))
> +           return true;
> +
>           tree callee1;
>           tree ptr1, src1, str1, off1, len1, lhs1;
>           tree ptr2 = gimple_call_arg (stmt2, 0);
>           tree val2 = gimple_call_arg (stmt2, 1);
>           tree len2 = gimple_call_arg (stmt2, 2);
>           tree diff, vdef, new_str_cst;
>           gimple use_stmt;
>           unsigned int ptr1_align;
>           unsigned HOST_WIDE_INT src_len;
>           char *src_buf;
>           use_operand_p use_p;
>
> +         /* Not implemented yet.  */
> +         if (gimple_call_lhs (stmt2))
> +           break;
> +
>           if (!tree_fits_shwi_p (val2)
>               || !tree_fits_uhwi_p (len2))
>             break;
>           if (is_gimple_call (stmt1))
>             {
>               /* If first stmt is a call, it needs to be memcpy
>                  or mempcpy, with string literal as second argument and
>                  constant length.  */
>               callee1 = gimple_call_fndecl (stmt1);
>               if (callee1 == NULL_TREE
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-03-03  9:50 ` Richard Biener
@ 2014-03-03 13:45   ` Marc Glisse
  2014-03-23 19:47   ` Marc Glisse
  1 sibling, 0 replies; 26+ messages in thread
From: Marc Glisse @ 2014-03-03 13:45 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches

On Mon, 3 Mar 2014, Richard Biener wrote:

> That's a bit much of ad-hoc pattern-matching ... wouldn't be
> p = malloc (n);
> memset (p, 0, n);
>
> transform better suited to the strlen opt pass?  After all that tracks
> what 'string' is associated with a SSA name pointer through
> arbitrary satements using a lattice.

Too early, it needs to run later than ldist, or there won't be any
memset to match in the std::vector case. Would you consider moving or
duplicating either strlen or ldist so they are run in the order I need?

> The same probably applies to calloc(); memset (, 0,);

Oh, you mean the length doesn't have to match for calloc? That's true, I
completely missed that.

> though here you
> could even match points-to info (after all even only clearing a portion of
> the calloc()ed memory is dead code).  If points-to conservatively computes
> that the memset pointer only points to null or the memory tag the
> calloc return value points to then you can discard it without further
> checking ...

I'll look into it (and DSE). Note that the calloc case is just an
afterthought, what I really care about is replacing malloc.

>> +  /* Finally, make sure the memory is not used before stmt2.  */
>> +  ao_ref ref;
>> +  ao_ref_init_from_ptr_and_size (&ref, ptr1, size);
>> +  tree vdef = gimple_vuse (stmt2);
>> +  if (vdef == NULL)
>> +    return false;
>> +  while (true)
>> +    {
>> +      gimple cur = SSA_NAME_DEF_STMT (vdef);
>> +      if (cur == stmt1) break;
>> +      if (stmt_may_clobber_ref_p_1 (cur, &ref))
>> +       return false;
>> +      vdef = gimple_vuse (cur);
>> +    }
>
> We have walk_aliased_vdefs() for this.

As explained in the PR, walk_aliased_vdefs misses the call to malloc (it 
doesn't clobber the memory pointed to by p). You then suggested:
"Exact pattern matching of the CFG involved might be the easiest, plus 
manually implementing walk_aliased_vdefs by simply walking the use-def 
chain of the virtual operands from the memset operation to the malloc and 
checking stmt_may_clobber_ref_p_1 on the ao_ref_init_from_ptr_and_size 
ref."

> That said, please try to integrate this kind of transforms with
> the strlen opt pass (even if it requires making its lattice more generic).

Assuming the passes have a chance of being reordered, I'll try to 
understand how strlen works.

Thanks for the comments,

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-03-03  9:50 ` Richard Biener
  2014-03-03 13:45   ` Marc Glisse
@ 2014-03-23 19:47   ` Marc Glisse
  2014-04-15 19:06     ` Marc Glisse
  2014-04-18 11:31     ` Jakub Jelinek
  1 sibling, 2 replies; 26+ messages in thread
From: Marc Glisse @ 2014-03-23 19:47 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1302 bytes --]

On Mon, 3 Mar 2014, Richard Biener wrote:

> That's a bit much of ad-hoc pattern-matching ... wouldn't be
> p = malloc (n);
> memset (p, 0, n);
>
> transform better suited to the strlen opt pass?  After all that tracks
> what 'string' is associated with a SSA name pointer through
> arbitrary satements using a lattice.

Like this? I had to move the strlen pass after the loop passes (and
after dom or everything was too dirty) but long enough before the end
(some optimizations are necessary after strlen). As a bonus, one more
strlen is optimized in the current testcases :-)

Running the pass twice would be another option I guess (it would require
implementing the clone method), but without a testcase showing it is
needed...

Passes bootstrap+testsuite on x86_64-linux-gnu.

2014-03-23  Marc Glisse  <marc.glisse@inria.fr>

         PR tree-optimization/57742
gcc/
         * tree-ssa-strlen.c (get_string_length): Ignore malloc.
         (handle_builtin_malloc, handle_builtin_memset): New functions.
 	(strlen_optimize_stmt): Call them.
 	* passes.def: Move strlen after loop+dom.
gcc/testsuite/
         * g++.dg/tree-ssa/calloc.C: New testcase.
         * gcc.dg/tree-ssa/calloc-1.c: Likewise.
         * gcc.dg/tree-ssa/calloc-2.c: Likewise.
 	* gcc.dg/strlenopt-9.c: Adapt.

-- 
Marc Glisse

[-- Attachment #2: Type: TEXT/PLAIN, Size: 11605 bytes --]

Index: gcc/passes.def
===================================================================
--- gcc/passes.def	(revision 208772)
+++ gcc/passes.def	(working copy)
@@ -176,21 +176,20 @@ along with GCC; see the file COPYING3.
 	 DOM and erroneous path isolation should be due to degenerate PHI nodes.
 	 So rather than run the full propagators, run a specialized pass which
 	 only examines PHIs to discover const/copy propagation
 	 opportunities.  */
       NEXT_PASS (pass_phi_only_cprop);
       NEXT_PASS (pass_dse);
       NEXT_PASS (pass_reassoc);
       NEXT_PASS (pass_dce);
       NEXT_PASS (pass_forwprop);
       NEXT_PASS (pass_phiopt);
-      NEXT_PASS (pass_strlen);
       NEXT_PASS (pass_ccp);
       /* After CCP we rewrite no longer addressed locals into SSA
 	 form if possible.  */
       NEXT_PASS (pass_copy_prop);
       NEXT_PASS (pass_cse_sincos);
       NEXT_PASS (pass_optimize_bswap);
       NEXT_PASS (pass_split_crit_edges);
       NEXT_PASS (pass_pre);
       NEXT_PASS (pass_sink_code);
       NEXT_PASS (pass_asan);
@@ -235,20 +234,21 @@ along with GCC; see the file COPYING3.
       NEXT_PASS (pass_cse_reciprocals);
       NEXT_PASS (pass_reassoc);
       NEXT_PASS (pass_strength_reduction);
       NEXT_PASS (pass_dominator);
       /* The only const/copy propagation opportunities left after
 	 DOM should be due to degenerate PHI nodes.  So rather than
 	 run the full propagators, run a specialized pass which
 	 only examines PHIs to discover const/copy propagation
 	 opportunities.  */
       NEXT_PASS (pass_phi_only_cprop);
+      NEXT_PASS (pass_strlen);
       NEXT_PASS (pass_vrp);
       NEXT_PASS (pass_cd_dce);
       NEXT_PASS (pass_tracer);
       NEXT_PASS (pass_dse);
       NEXT_PASS (pass_forwprop);
       NEXT_PASS (pass_phiopt);
       NEXT_PASS (pass_fold_builtins);
       NEXT_PASS (pass_optimize_widening_mul);
       NEXT_PASS (pass_tail_calls);
       NEXT_PASS (pass_rename_ssa_copies);
Index: gcc/testsuite/g++.dg/tree-ssa/calloc.C
===================================================================
--- gcc/testsuite/g++.dg/tree-ssa/calloc.C	(revision 0)
+++ gcc/testsuite/g++.dg/tree-ssa/calloc.C	(working copy)
@@ -0,0 +1,35 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+
+#include <new>
+#include <vector>
+#include <cstdlib>
+
+void g(void*);
+inline void* operator new(std::size_t sz)
+{
+  void *p;
+
+  if (sz == 0)
+    sz = 1;
+
+  // Slightly modified from the libsupc++ version, that one has 2 calls
+  // to malloc which makes it too hard to optimize.
+  while ((p = std::malloc (sz)) == 0)
+    {
+      std::new_handler handler = std::get_new_handler ();
+      if (! handler)
+        throw std::bad_alloc();
+      handler ();
+    }
+  return p;
+}
+
+void f(void*p,int n){
+  new(p)std::vector<int>(n);
+}
+
+/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */

Property changes on: gcc/testsuite/g++.dg/tree-ssa/calloc.C
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+Author Date Id Revision URL
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: gcc/testsuite/gcc.dg/strlenopt-9.c
===================================================================
--- gcc/testsuite/gcc.dg/strlenopt-9.c	(revision 208772)
+++ gcc/testsuite/gcc.dg/strlenopt-9.c	(working copy)
@@ -11,21 +11,21 @@ fn1 (int r)
      optimized away.  */
   return strchr (p, '\0');
 }
 
 __attribute__((noinline, noclone)) size_t
 fn2 (int r)
 {
   char *p, q[10];
   strcpy (q, "abc");
   p = r ? "a" : q;
-  /* String length for p varies, therefore strlen below isn't
+  /* String length is constant for both alternatives, and strlen is
      optimized away.  */
   return strlen (p);
 }
 
 __attribute__((noinline, noclone)) size_t
 fn3 (char *p, int n)
 {
   int i;
   p = strchr (p, '\0');
   /* strcat here can be optimized into memcpy.  */
@@ -91,19 +91,19 @@ main ()
       || memcmp (b, "aaaabcdabcdefgabcdefgefgabcdabcd", 33) != 0
       || memcmp (buf, "aaaabcdabcdefgabcdefgefgabcdefg", 32) != 0)
     abort ();
   if (fn4 (b, 256) != 4
       || memcmp (b, "aaaabcdabcdefgabcdefgefgabcdabcdabcd", 37) != 0
       || memcmp (buf, "aaaabcdabcdefgabcdefgefgabcdabcdefgefg", 39) != 0)
     abort ();
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
 /* { dg-final { cleanup-tree-dump "strlen" } } */
 /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c	(working copy)
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#include <stdlib.h>
+#include <string.h>
+
+extern int a;
+extern int* b;
+int n;
+void* f(long*q){
+  int*p=malloc(n);
+  ++*q;
+  if(p){
+    ++*q;
+    a=2;
+    memset(p,0,n);
+    *b=3;
+  }
+  return p;
+}
+void* g(void){
+  float*p=calloc(8,4);
+  return memset(p,0,24); // not 32
+}
+
+/* { dg-final { scan-tree-dump-times "calloc" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */

Property changes on: gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+Author Date Id Revision URL
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: gcc/tree-ssa-strlen.c
===================================================================
--- gcc/tree-ssa-strlen.c	(revision 208772)
+++ gcc/tree-ssa-strlen.c	(working copy)
@@ -496,20 +496,23 @@ get_string_length (strinfo si)
 	case BUILT_IN_STPCPY_CHK:
 	  gcc_assert (lhs != NULL_TREE);
 	  loc = gimple_location (stmt);
 	  si->endptr = lhs;
 	  si->stmt = NULL;
 	  lhs = fold_convert_loc (loc, size_type_node, lhs);
 	  si->length = fold_convert_loc (loc, size_type_node, si->ptr);
 	  si->length = fold_build2_loc (loc, MINUS_EXPR, size_type_node,
 					lhs, si->length);
 	  break;
+	case BUILT_IN_MALLOC:
+	  break;
+	  /* calloc always has si->length set.  */
 	default:
 	  gcc_unreachable ();
 	  break;
 	}
     }
 
   return si->length;
 }
 
 /* Invalidate string length information for strings whose length
@@ -521,20 +524,21 @@ maybe_invalidate (gimple stmt)
   strinfo si;
   unsigned int i;
   bool nonempty = false;
 
   for (i = 1; vec_safe_iterate (stridx_to_strinfo, i, &si); ++i)
     if (si != NULL)
       {
 	if (!si->dont_invalidate)
 	  {
 	    ao_ref r;
+	    // Do not use si->length.
 	    ao_ref_init_from_ptr_and_size (&r, si->ptr, NULL_TREE);
 	    if (stmt_may_clobber_ref_p_1 (stmt, &r))
 	      {
 		set_strinfo (i, NULL);
 		free_strinfo (si);
 		continue;
 	      }
 	  }
 	si->dont_invalidate = false;
 	nonempty = true;
@@ -1608,20 +1612,90 @@ handle_builtin_strcat (enum built_in_fun
 	{
 	  laststmt.stmt = stmt;
 	  laststmt.len = srclen;
 	  laststmt.stridx = dsi->idx;
 	}
     }
   else if (dump_file && (dump_flags & TDF_DETAILS) != 0)
     fprintf (dump_file, "not possible.\n");
 }
 
+/* Handle a call to malloc or calloc.  */
+
+static void
+handle_builtin_malloc (enum built_in_function bcode, gimple_stmt_iterator *gsi)
+{
+  gimple stmt = gsi_stmt (*gsi);
+  tree lhs = gimple_call_lhs (stmt);
+  gcc_assert (get_stridx (lhs) == 0);
+  int idx = new_stridx (lhs);
+  tree length = NULL_TREE;
+  if (bcode == BUILT_IN_CALLOC)
+    length = build_int_cst (size_type_node, 0);
+  strinfo si = new_strinfo (lhs, idx, length);
+  if (bcode == BUILT_IN_CALLOC)
+    si->endptr = lhs;
+  set_strinfo (idx, si);
+  si->writable = true;
+  si->stmt = stmt;
+  si->dont_invalidate = true;
+}
+
+/* Handle a call to memset.
+   After a call to calloc, memset(,0,) is unnecessary.
+   memset(malloc(n),0,n) is calloc(n,1).  */
+
+static bool
+handle_builtin_memset (gimple_stmt_iterator *gsi)
+{
+  gimple stmt1, stmt2 = gsi_stmt (*gsi);
+  if (!integer_zerop (gimple_call_arg (stmt2, 1)))
+    return true;
+  tree ptr = gimple_call_arg (stmt2, 0);
+  int idx1 = get_stridx (ptr);
+  if (idx1 <= 0)
+    return true;
+  strinfo si1 = get_strinfo (idx1);
+  if (!si1 || !(stmt1 = si1->stmt) || !is_gimple_call (stmt1))
+    return true;
+  tree callee1 = gimple_call_fndecl (stmt1);
+  if (!gimple_call_builtin_p (stmt1, BUILT_IN_NORMAL))
+    return true;
+  enum built_in_function code1 = DECL_FUNCTION_CODE (callee1);
+  tree size = gimple_call_arg (stmt2, 2);
+  if (code1 == BUILT_IN_CALLOC)
+    /* Not touching stmt1 */ ;
+  else if (code1 == BUILT_IN_MALLOC
+	   && operand_equal_p (gimple_call_arg (stmt1, 0), size, 0))
+    {
+      gimple_stmt_iterator gsi1 = gsi_for_stmt (stmt1);
+      update_gimple_call (&gsi1, builtin_decl_implicit (BUILT_IN_CALLOC), 2,
+			  size, build_one_cst (size_type_node));
+    }
+  else
+    return true;
+  tree lhs = gimple_call_lhs (stmt2);
+  unlink_stmt_vdef (stmt2);
+  if (lhs)
+    {
+      gimple assign = gimple_build_assign (lhs, ptr);
+      gsi_replace (gsi, assign, false);
+    }
+  else
+    {
+      gsi_remove (gsi, true);
+      release_defs (stmt2);
+    }
+
+  return false;
+}
+
 /* Handle a POINTER_PLUS_EXPR statement.
    For p = "abcd" + 2; compute associated length, or if
    p = q + off is pointing to a '\0' character of a string, call
    zero_length_string on it.  */
 
 static void
 handle_pointer_plus (gimple_stmt_iterator *gsi)
 {
   gimple stmt = gsi_stmt (*gsi);
   tree lhs = gimple_assign_lhs (stmt), off;
@@ -1845,20 +1919,28 @@ strlen_optimize_stmt (gimple_stmt_iterat
 	  case BUILT_IN_MEMCPY:
 	  case BUILT_IN_MEMCPY_CHK:
 	  case BUILT_IN_MEMPCPY:
 	  case BUILT_IN_MEMPCPY_CHK:
 	    handle_builtin_memcpy (DECL_FUNCTION_CODE (callee), gsi);
 	    break;
 	  case BUILT_IN_STRCAT:
 	  case BUILT_IN_STRCAT_CHK:
 	    handle_builtin_strcat (DECL_FUNCTION_CODE (callee), gsi);
 	    break;
+	  case BUILT_IN_MALLOC:
+	  case BUILT_IN_CALLOC:
+	    handle_builtin_malloc (DECL_FUNCTION_CODE (callee), gsi);
+	    break;
+	  case BUILT_IN_MEMSET:
+	    if (!handle_builtin_memset (gsi))
+	      return false;
+	    break;
 	  default:
 	    break;
 	  }
     }
   else if (is_gimple_assign (stmt))
     {
       tree lhs = gimple_assign_lhs (stmt);
 
       if (TREE_CODE (lhs) == SSA_NAME && POINTER_TYPE_P (TREE_TYPE (lhs)))
 	{

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-03-23 19:47   ` Marc Glisse
@ 2014-04-15 19:06     ` Marc Glisse
  2014-04-18 11:31     ` Jakub Jelinek
  1 sibling, 0 replies; 26+ messages in thread
From: Marc Glisse @ 2014-04-15 19:06 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches

Let me ping this. There's no hurry, but it may have got lost with 4.9 
approaching.
http://gcc.gnu.org/ml/gcc-patches/2014-03/msg01205.html


On Sun, 23 Mar 2014, Marc Glisse wrote:

> On Mon, 3 Mar 2014, Richard Biener wrote:
>
>> That's a bit much of ad-hoc pattern-matching ... wouldn't be
>> p = malloc (n);
>> memset (p, 0, n);
>> 
>> transform better suited to the strlen opt pass?  After all that tracks
>> what 'string' is associated with a SSA name pointer through
>> arbitrary satements using a lattice.
>
> Like this? I had to move the strlen pass after the loop passes (and
> after dom or everything was too dirty) but long enough before the end
> (some optimizations are necessary after strlen). As a bonus, one more
> strlen is optimized in the current testcases :-)
>
> Running the pass twice would be another option I guess (it would require
> implementing the clone method), but without a testcase showing it is
> needed...
>
> Passes bootstrap+testsuite on x86_64-linux-gnu.
>
> 2014-03-23  Marc Glisse  <marc.glisse@inria.fr>
>
>        PR tree-optimization/57742
> gcc/
>        * tree-ssa-strlen.c (get_string_length): Ignore malloc.
>        (handle_builtin_malloc, handle_builtin_memset): New functions.
> 	(strlen_optimize_stmt): Call them.
> 	* passes.def: Move strlen after loop+dom.
> gcc/testsuite/
>        * g++.dg/tree-ssa/calloc.C: New testcase.
>        * gcc.dg/tree-ssa/calloc-1.c: Likewise.
>        * gcc.dg/tree-ssa/calloc-2.c: Likewise.
> 	* gcc.dg/strlenopt-9.c: Adapt.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-03-23 19:47   ` Marc Glisse
  2014-04-15 19:06     ` Marc Glisse
@ 2014-04-18 11:31     ` Jakub Jelinek
  2014-04-18 18:34       ` Marc Glisse
  1 sibling, 1 reply; 26+ messages in thread
From: Jakub Jelinek @ 2014-04-18 11:31 UTC (permalink / raw)
  To: Marc Glisse; +Cc: Richard Biener, GCC Patches

On Sun, Mar 23, 2014 at 05:13:46PM +0100, Marc Glisse wrote:
> 2014-03-23  Marc Glisse  <marc.glisse@inria.fr>
> 
>         PR tree-optimization/57742
> gcc/
>         * tree-ssa-strlen.c (get_string_length): Ignore malloc.
>         (handle_builtin_malloc, handle_builtin_memset): New functions.
> 	(strlen_optimize_stmt): Call them.
> 	* passes.def: Move strlen after loop+dom.
> gcc/testsuite/
>         * g++.dg/tree-ssa/calloc.C: New testcase.
>         * gcc.dg/tree-ssa/calloc-1.c: Likewise.
>         * gcc.dg/tree-ssa/calloc-2.c: Likewise.
> 	* gcc.dg/strlenopt-9.c: Adapt.

Some CL lines are indented by 8 spaces instead of tabs.
The passes.def change makes me a little bit nervous, but if it works,
perhaps.

> --- gcc/testsuite/g++.dg/tree-ssa/calloc.C	(revision 0)
> +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C	(working copy)
> @@ -0,0 +1,35 @@
> +/* { dg-do compile { target c++11 } } */
> +/* { dg-options "-O3 -fdump-tree-optimized" } */
> +
> +#include <new>
> +#include <vector>
> +#include <cstdlib>
> +
> +void g(void*);
> +inline void* operator new(std::size_t sz)
> +{
> +  void *p;
> +
> +  if (sz == 0)
> +    sz = 1;
> +
> +  // Slightly modified from the libsupc++ version, that one has 2 calls
> +  // to malloc which makes it too hard to optimize.
> +  while ((p = std::malloc (sz)) == 0)
> +    {
> +      std::new_handler handler = std::get_new_handler ();
> +      if (! handler)
> +        throw std::bad_alloc();
> +      handler ();
> +    }
> +  return p;
> +}
> +
> +void f(void*p,int n){
> +  new(p)std::vector<int>(n);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */

This looks to me way too much fragile, any time the libstdc++
or glibc headers change a little bit, you might need to adjust the
dg-final directives.  Much better would be if you just provided
the prototypes yourself and subset of the std::vector you really need for
the testcase.  You can throw some class or int, it doesn't have to be
std::bad_alloc, etc.

> --- gcc/testsuite/gcc.dg/strlenopt-9.c	(revision 208772)
> +++ gcc/testsuite/gcc.dg/strlenopt-9.c	(working copy)
> @@ -11,21 +11,21 @@ fn1 (int r)
>       optimized away.  */
>    return strchr (p, '\0');
>  }
>  
>  __attribute__((noinline, noclone)) size_t
>  fn2 (int r)
>  {
>    char *p, q[10];
>    strcpy (q, "abc");
>    p = r ? "a" : q;
> -  /* String length for p varies, therefore strlen below isn't
> +  /* String length is constant for both alternatives, and strlen is
>       optimized away.  */
>    return strlen (p);

Is this because of jump threading?

> --- gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c	(revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c	(working copy)
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#include <stdlib.h>
> +#include <string.h>

Even this I find unsafe.  The strlenopt*.c tests use it's custom
strlenopt.h header for a reason, you might just add a calloc
prototype in there and use that header.

> --- gcc/tree-ssa-strlen.c	(revision 208772)
> +++ gcc/tree-ssa-strlen.c	(working copy)
> @@ -496,20 +496,23 @@ get_string_length (strinfo si)
>  	case BUILT_IN_STPCPY_CHK:
>  	  gcc_assert (lhs != NULL_TREE);
>  	  loc = gimple_location (stmt);
>  	  si->endptr = lhs;
>  	  si->stmt = NULL;
>  	  lhs = fold_convert_loc (loc, size_type_node, lhs);
>  	  si->length = fold_convert_loc (loc, size_type_node, si->ptr);
>  	  si->length = fold_build2_loc (loc, MINUS_EXPR, size_type_node,
>  					lhs, si->length);
>  	  break;
> +	case BUILT_IN_MALLOC:
> +	  break;
> +	  /* calloc always has si->length set.  */

I guess it would be better to talk about BUILT_IN_CALLOC here, and align
the comment with case and default column, to make it clear the comment
is not about BUILT_IN_MALLOC.  Or is it?
But, see below.

>  	default:

>  	if (!si->dont_invalidate)
>  	  {
>  	    ao_ref r;
> +	    // Do not use si->length.

As the whole file uses /* ... */ comments, can you use them here too?

>  	    ao_ref_init_from_ptr_and_size (&r, si->ptr, NULL_TREE);
>  	    if (stmt_may_clobber_ref_p_1 (stmt, &r))
>  	      {
>  		set_strinfo (i, NULL);
>  		free_strinfo (si);
>  		continue;
>  	      }
>  	  }
>  	si->dont_invalidate = false;
>  	nonempty = true;
> @@ -1608,20 +1612,90 @@ handle_builtin_strcat (enum built_in_fun
>  	{
>  	  laststmt.stmt = stmt;
>  	  laststmt.len = srclen;
>  	  laststmt.stridx = dsi->idx;
>  	}
>      }
>    else if (dump_file && (dump_flags & TDF_DETAILS) != 0)
>      fprintf (dump_file, "not possible.\n");
>  }
>  
> +/* Handle a call to malloc or calloc.  */
> +
> +static void
> +handle_builtin_malloc (enum built_in_function bcode, gimple_stmt_iterator *gsi)
> +{
> +  gimple stmt = gsi_stmt (*gsi);
> +  tree lhs = gimple_call_lhs (stmt);
> +  gcc_assert (get_stridx (lhs) == 0);
> +  int idx = new_stridx (lhs);
> +  tree length = NULL_TREE;
> +  if (bcode == BUILT_IN_CALLOC)
> +    length = build_int_cst (size_type_node, 0);

Is this safe?  I mean, if you call int a = 0; ptr = calloc (a, n);
or ptr = calloc (n, a); or ptr = calloc (0, 0); etc., then there is no
zero termination.  Sure, calling strlen or strchr etc. on the result
is undefined behavior, just wondering if it isn't going to break anything.
Though, the result is zero length and thus any dereferencing would be a bug,
so perhaps we are fine.

> +static bool
> +handle_builtin_memset (gimple_stmt_iterator *gsi)
> +{
> +  gimple stmt1, stmt2 = gsi_stmt (*gsi);
> +  if (!integer_zerop (gimple_call_arg (stmt2, 1)))
> +    return true;
> +  tree ptr = gimple_call_arg (stmt2, 0);
> +  int idx1 = get_stridx (ptr);
> +  if (idx1 <= 0)
> +    return true;
> +  strinfo si1 = get_strinfo (idx1);
> +  if (!si1 || !(stmt1 = si1->stmt) || !is_gimple_call (stmt1))
> +    return true;

Please avoid assignments in if conditions if easily possible, in this
case just using separate if (si1 == NULL) return true; and
stmt1 = si1->stmt; and another if is IMHO better.

	Jakub

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-04-18 11:31     ` Jakub Jelinek
@ 2014-04-18 18:34       ` Marc Glisse
  2014-04-23 14:05         ` Richard Biener
  0 siblings, 1 reply; 26+ messages in thread
From: Marc Glisse @ 2014-04-18 18:34 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Biener, GCC Patches

[-- Attachment #1: Type: TEXT/PLAIN, Size: 6494 bytes --]

Thanks for the comments!

On Fri, 18 Apr 2014, Jakub Jelinek wrote:

> The passes.def change makes me a little bit nervous, but if it works,
> perhaps.

Would you prefer running the pass twice? I thought there would be less 
resistance to moving the pass than duplicating it. By the way, I think 
even passes we run only once should have the required functions 
implemented so they can be run several times (at least most of them), in 
case users want to do that in plugins. I was surprised when I tried adding 
a second strlen pass and the compiler refused.

>> --- gcc/testsuite/g++.dg/tree-ssa/calloc.C	(revision 0)
>> +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C	(working copy)
>> @@ -0,0 +1,35 @@
>> +/* { dg-do compile { target c++11 } } */
>> +/* { dg-options "-O3 -fdump-tree-optimized" } */
>> +
>> +#include <new>
>> +#include <vector>
>> +#include <cstdlib>
>> +
>> +void g(void*);
>> +inline void* operator new(std::size_t sz)
>> +{
>> +  void *p;
>> +
>> +  if (sz == 0)
>> +    sz = 1;
>> +
>> +  // Slightly modified from the libsupc++ version, that one has 2 calls
>> +  // to malloc which makes it too hard to optimize.
>> +  while ((p = std::malloc (sz)) == 0)
>> +    {
>> +      std::new_handler handler = std::get_new_handler ();
>> +      if (! handler)
>> +        throw std::bad_alloc();
>> +      handler ();
>> +    }
>> +  return p;
>> +}
>> +
>> +void f(void*p,int n){
>> +  new(p)std::vector<int>(n);
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
>> +/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
>> +/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
>> +/* { dg-final { cleanup-tree-dump "optimized" } } */
>
> This looks to me way too much fragile, any time the libstdc++
> or glibc headers change a little bit, you might need to adjust the
> dg-final directives.  Much better would be if you just provided
> the prototypes yourself and subset of the std::vector you really need for
> the testcase.  You can throw some class or int, it doesn't have to be
> std::bad_alloc, etc.

I don't understand what seems so fragile to you. There is a single 
function in the .optimized dump, which just calls calloc in a loop. It 
doesn't seem that likely that a change in glibc/libstdc++ would make an 
extra memset pop up. A change in libstdc++ could easily prevent the 
optimization completely (I'd like to hope we can avoid that, half of the 
purpose of the testcase was making sure libstdc++ didn't change in a bad 
way), but I don't really see how it could keep it in a way that requires 
tweaking dg-final.

While trying to write a standalone version, I hit again many missed 
optimizations, getting such nice things in the .optimized dump as:

   _12 = p_13 + sz_7;
   if (_12 != p_13)

or:

   _12 = p_13 + sz_7;
   _30 = (unsigned long) _12;
   _9 = p_13 + 4;
   _10 = (unsigned long) _9;
   _11 = _30 - _10;
   _22 = _11 /[ex] 4;
   _21 = _22;
   _40 = _21 + 1;
   _34 = _40 * 4;

It is embarrassing... I hope the combiner GSoC will work well and we can 
just add a dozen patterns to handle this before 4.10.

>> --- gcc/testsuite/gcc.dg/strlenopt-9.c	(revision 208772)
>> +++ gcc/testsuite/gcc.dg/strlenopt-9.c	(working copy)
>> @@ -11,21 +11,21 @@ fn1 (int r)
>>       optimized away.  */
>>    return strchr (p, '\0');
>>  }
>>
>>  __attribute__((noinline, noclone)) size_t
>>  fn2 (int r)
>>  {
>>    char *p, q[10];
>>    strcpy (q, "abc");
>>    p = r ? "a" : q;
>> -  /* String length for p varies, therefore strlen below isn't
>> +  /* String length is constant for both alternatives, and strlen is
>>       optimized away.  */
>>    return strlen (p);
>
> Is this because of jump threading?

It is PRE that turns:

   if (r_4(D) == 0)
     goto <bb 5>;
   else
     goto <bb 3>;

   <bb 5>:
   goto <bb 4>;

   <bb 3>:

   <bb 4>:
   # p_1 = PHI <&q(5), "a"(3)>
   _5 = __builtin_strlen (p_1);

into:

   if (r_4(D) == 0)
     goto <bb 5>;
   else
     goto <bb 3>;

   <bb 5>:
   _7 = __builtin_strlen (&q);
   pretmp_8 = _7;
   goto <bb 4>;

   <bb 3>:

   <bb 4>:
   # p_1 = PHI <&q(5), "a"(3)>
   # prephitmp_9 = PHI <pretmp_8(5), 1(3)>
   _5 = prephitmp_9;

It says:

Found partial redundancy for expression 
{call_expr<__builtin_strlen>,p_1}@.MEM_3 (0005)

>> --- gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c	(revision 0)
>> +++ gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c	(working copy)
>> @@ -0,0 +1,29 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -fdump-tree-optimized" } */
>> +
>> +#include <stdlib.h>
>> +#include <string.h>
>
> Even this I find unsafe.  The strlenopt*.c tests use it's custom
> strlenopt.h header for a reason, you might just add a calloc
> prototype in there and use that header.

Might as well use __builtin_* then.

>> +/* Handle a call to malloc or calloc.  */
>> +
>> +static void
>> +handle_builtin_malloc (enum built_in_function bcode, gimple_stmt_iterator *gsi)
>> +{
>> +  gimple stmt = gsi_stmt (*gsi);
>> +  tree lhs = gimple_call_lhs (stmt);
>> +  gcc_assert (get_stridx (lhs) == 0);
>> +  int idx = new_stridx (lhs);
>> +  tree length = NULL_TREE;
>> +  if (bcode == BUILT_IN_CALLOC)
>> +    length = build_int_cst (size_type_node, 0);
>
> Is this safe?  I mean, if you call int a = 0; ptr = calloc (a, n);
> or ptr = calloc (n, a); or ptr = calloc (0, 0); etc., then there is no
> zero termination.  Sure, calling strlen or strchr etc. on the result
> is undefined behavior, just wondering if it isn't going to break anything.
> Though, the result is zero length and thus any dereferencing would be a bug,
> so perhaps we are fine.

You know the pass better than I do... I don't think the pass does anything 
like: "all 0-length strings are equivalent, let's replace one by another 
one", so I think we should be ok, but I don't mind treating calloc like 
malloc here if you think it is too risky, I don't think people call 
strlen(calloc(a,b)) very often, and it would let me remove the "don't use 
si->length" comment.


Updated and tested on x86_64-linux-gnu.

2014-04-18  Marc Glisse  <marc.glisse@inria.fr>

 	PR tree-optimization/57742
gcc/
 	* tree-ssa-strlen.c (get_string_length): Ignore malloc.
 	(handle_builtin_malloc, handle_builtin_memset): New functions.
 	(strlen_optimize_stmt): Call them.
 	* passes.def: Move strlen after loop+dom.
gcc/testsuite/
 	* g++.dg/tree-ssa/calloc.C: New testcase.
 	* gcc.dg/tree-ssa/calloc-1.c: Likewise.
 	* gcc.dg/tree-ssa/calloc-2.c: Likewise.
 	* gcc.dg/strlenopt-9.c: Adapt.

-- 
Marc Glisse

[-- Attachment #2: Type: TEXT/PLAIN, Size: 12219 bytes --]

Index: gcc/passes.def
===================================================================
--- gcc/passes.def	(revision 209516)
+++ gcc/passes.def	(working copy)
@@ -176,21 +176,20 @@ along with GCC; see the file COPYING3.
 	 DOM and erroneous path isolation should be due to degenerate PHI nodes.
 	 So rather than run the full propagators, run a specialized pass which
 	 only examines PHIs to discover const/copy propagation
 	 opportunities.  */
       NEXT_PASS (pass_phi_only_cprop);
       NEXT_PASS (pass_dse);
       NEXT_PASS (pass_reassoc);
       NEXT_PASS (pass_dce);
       NEXT_PASS (pass_forwprop);
       NEXT_PASS (pass_phiopt);
-      NEXT_PASS (pass_strlen);
       NEXT_PASS (pass_ccp);
       /* After CCP we rewrite no longer addressed locals into SSA
 	 form if possible.  */
       NEXT_PASS (pass_copy_prop);
       NEXT_PASS (pass_cse_sincos);
       NEXT_PASS (pass_optimize_bswap);
       NEXT_PASS (pass_split_crit_edges);
       NEXT_PASS (pass_pre);
       NEXT_PASS (pass_sink_code);
       NEXT_PASS (pass_asan);
@@ -235,20 +234,21 @@ along with GCC; see the file COPYING3.
       NEXT_PASS (pass_cse_reciprocals);
       NEXT_PASS (pass_reassoc);
       NEXT_PASS (pass_strength_reduction);
       NEXT_PASS (pass_dominator);
       /* The only const/copy propagation opportunities left after
 	 DOM should be due to degenerate PHI nodes.  So rather than
 	 run the full propagators, run a specialized pass which
 	 only examines PHIs to discover const/copy propagation
 	 opportunities.  */
       NEXT_PASS (pass_phi_only_cprop);
+      NEXT_PASS (pass_strlen);
       NEXT_PASS (pass_vrp);
       NEXT_PASS (pass_cd_dce);
       NEXT_PASS (pass_tracer);
       NEXT_PASS (pass_dse);
       NEXT_PASS (pass_forwprop);
       NEXT_PASS (pass_phiopt);
       NEXT_PASS (pass_fold_builtins);
       NEXT_PASS (pass_optimize_widening_mul);
       NEXT_PASS (pass_tail_calls);
       NEXT_PASS (pass_rename_ssa_copies);
Index: gcc/testsuite/g++.dg/tree-ssa/calloc.C
===================================================================
--- gcc/testsuite/g++.dg/tree-ssa/calloc.C	(revision 0)
+++ gcc/testsuite/g++.dg/tree-ssa/calloc.C	(working copy)
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+
+typedef __SIZE_TYPE__ size_t;
+inline void* operator new(size_t, void* p) throw() { return p; }
+
+typedef void (*handler_t)(void);
+extern handler_t get_handle();
+
+inline void* operator new(size_t sz)
+{
+  void *p;
+
+  if (sz == 0)
+    sz = 1;
+
+  while ((p = __builtin_malloc (sz)) == 0)
+    {
+      handler_t handler = get_handle ();
+      if (! handler)
+        throw 42;
+      handler ();
+    }
+  return p;
+}
+
+struct vect {
+  int *start, *end;
+  vect(size_t n) {
+    start = end = 0;
+    if (n > (size_t)-1 / sizeof(int))
+      throw 33;
+    if (n != 0)
+      start = static_cast<int*> (operator new (n * sizeof(int)));
+    end = start + n;
+    int *p = start;
+    for (size_t l = n; l > 0; --l, ++p)
+      *p = 0;
+  }
+};
+
+void f (void *p, int n)
+{
+  new (p) vect(n);
+}
+
+/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/strlenopt-9.c
===================================================================
--- gcc/testsuite/gcc.dg/strlenopt-9.c	(revision 209516)
+++ gcc/testsuite/gcc.dg/strlenopt-9.c	(working copy)
@@ -11,21 +11,21 @@ fn1 (int r)
      optimized away.  */
   return strchr (p, '\0');
 }
 
 __attribute__((noinline, noclone)) size_t
 fn2 (int r)
 {
   char *p, q[10];
   strcpy (q, "abc");
   p = r ? "a" : q;
-  /* String length for p varies, therefore strlen below isn't
+  /* String length is constant for both alternatives, and strlen is
      optimized away.  */
   return strlen (p);
 }
 
 __attribute__((noinline, noclone)) size_t
 fn3 (char *p, int n)
 {
   int i;
   p = strchr (p, '\0');
   /* strcat here can be optimized into memcpy.  */
@@ -91,19 +91,19 @@ main ()
       || memcmp (b, "aaaabcdabcdefgabcdefgefgabcdabcd", 33) != 0
       || memcmp (buf, "aaaabcdabcdefgabcdefgefgabcdefg", 32) != 0)
     abort ();
   if (fn4 (b, 256) != 4
       || memcmp (b, "aaaabcdabcdefgabcdefgefgabcdabcdabcd", 37) != 0
       || memcmp (buf, "aaaabcdabcdefgabcdefgefgabcdabcdefgefg", 39) != 0)
     abort ();
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
 /* { dg-final { cleanup-tree-dump "strlen" } } */
 /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c	(working copy)
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+extern int a;
+extern int *b;
+int n;
+void* f(long *q)
+{
+  int *p = __builtin_malloc (n);
+  ++*q;
+  if (p)
+  {
+    ++*q;
+    a = 2;
+    __builtin_memset (p, 0, n);
+    *b = 3;
+  }
+  return p;
+}
+void* g(void)
+{
+  float *p = __builtin_calloc (8, 4);
+  return __builtin_memset (p, 0, 24); // not 32
+}
+
+/* { dg-final { scan-tree-dump-times "calloc" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/calloc-2.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/calloc-2.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/calloc-2.c	(working copy)
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+int n, nn;
+void* f()
+{
+  char *p = __builtin_calloc (n, 1);
+  p[42] = '\n';
+  __builtin_memset (p, 0, nn);
+  return p;
+}
+
+void* g(int m1, int m2)
+{
+  char *p = __builtin_malloc (m2);
+  while (--m1)
+  {
+    __builtin_memset (p, 0, m2);
+    p[n] = 'b';
+  }
+  return p;
+}
+
+/* { dg-final { scan-tree-dump-times "malloc" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "memset" 2 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/tree-ssa-strlen.c
===================================================================
--- gcc/tree-ssa-strlen.c	(revision 209516)
+++ gcc/tree-ssa-strlen.c	(working copy)
@@ -496,20 +496,23 @@ get_string_length (strinfo si)
 	case BUILT_IN_STPCPY_CHK:
 	  gcc_assert (lhs != NULL_TREE);
 	  loc = gimple_location (stmt);
 	  si->endptr = lhs;
 	  si->stmt = NULL;
 	  lhs = fold_convert_loc (loc, size_type_node, lhs);
 	  si->length = fold_convert_loc (loc, size_type_node, si->ptr);
 	  si->length = fold_build2_loc (loc, MINUS_EXPR, size_type_node,
 					lhs, si->length);
 	  break;
+	case BUILT_IN_MALLOC:
+	  break;
+	/* BUILT_IN_CALLOC always has si->length set.  */
 	default:
 	  gcc_unreachable ();
 	  break;
 	}
     }
 
   return si->length;
 }
 
 /* Invalidate string length information for strings whose length
@@ -521,20 +524,21 @@ maybe_invalidate (gimple stmt)
   strinfo si;
   unsigned int i;
   bool nonempty = false;
 
   for (i = 1; vec_safe_iterate (stridx_to_strinfo, i, &si); ++i)
     if (si != NULL)
       {
 	if (!si->dont_invalidate)
 	  {
 	    ao_ref r;
+	    /* Do not use si->length.  */
 	    ao_ref_init_from_ptr_and_size (&r, si->ptr, NULL_TREE);
 	    if (stmt_may_clobber_ref_p_1 (stmt, &r))
 	      {
 		set_strinfo (i, NULL);
 		free_strinfo (si);
 		continue;
 	      }
 	  }
 	si->dont_invalidate = false;
 	nonempty = true;
@@ -1608,20 +1612,93 @@ handle_builtin_strcat (enum built_in_fun
 	{
 	  laststmt.stmt = stmt;
 	  laststmt.len = srclen;
 	  laststmt.stridx = dsi->idx;
 	}
     }
   else if (dump_file && (dump_flags & TDF_DETAILS) != 0)
     fprintf (dump_file, "not possible.\n");
 }
 
+/* Handle a call to malloc or calloc.  */
+
+static void
+handle_builtin_malloc (enum built_in_function bcode, gimple_stmt_iterator *gsi)
+{
+  gimple stmt = gsi_stmt (*gsi);
+  tree lhs = gimple_call_lhs (stmt);
+  gcc_assert (get_stridx (lhs) == 0);
+  int idx = new_stridx (lhs);
+  tree length = NULL_TREE;
+  if (bcode == BUILT_IN_CALLOC)
+    length = build_int_cst (size_type_node, 0);
+  strinfo si = new_strinfo (lhs, idx, length);
+  if (bcode == BUILT_IN_CALLOC)
+    si->endptr = lhs;
+  set_strinfo (idx, si);
+  si->writable = true;
+  si->stmt = stmt;
+  si->dont_invalidate = true;
+}
+
+/* Handle a call to memset.
+   After a call to calloc, memset(,0,) is unnecessary.
+   memset(malloc(n),0,n) is calloc(n,1).  */
+
+static bool
+handle_builtin_memset (gimple_stmt_iterator *gsi)
+{
+  gimple stmt2 = gsi_stmt (*gsi);
+  if (!integer_zerop (gimple_call_arg (stmt2, 1)))
+    return true;
+  tree ptr = gimple_call_arg (stmt2, 0);
+  int idx1 = get_stridx (ptr);
+  if (idx1 <= 0)
+    return true;
+  strinfo si1 = get_strinfo (idx1);
+  if (!si1)
+    return true;
+  gimple stmt1 = si1->stmt;
+  if (!stmt1 || !is_gimple_call (stmt1))
+    return true;
+  tree callee1 = gimple_call_fndecl (stmt1);
+  if (!gimple_call_builtin_p (stmt1, BUILT_IN_NORMAL))
+    return true;
+  enum built_in_function code1 = DECL_FUNCTION_CODE (callee1);
+  tree size = gimple_call_arg (stmt2, 2);
+  if (code1 == BUILT_IN_CALLOC)
+    /* Not touching stmt1 */ ;
+  else if (code1 == BUILT_IN_MALLOC
+	   && operand_equal_p (gimple_call_arg (stmt1, 0), size, 0))
+    {
+      gimple_stmt_iterator gsi1 = gsi_for_stmt (stmt1);
+      update_gimple_call (&gsi1, builtin_decl_implicit (BUILT_IN_CALLOC), 2,
+			  size, build_one_cst (size_type_node));
+    }
+  else
+    return true;
+  tree lhs = gimple_call_lhs (stmt2);
+  unlink_stmt_vdef (stmt2);
+  if (lhs)
+    {
+      gimple assign = gimple_build_assign (lhs, ptr);
+      gsi_replace (gsi, assign, false);
+    }
+  else
+    {
+      gsi_remove (gsi, true);
+      release_defs (stmt2);
+    }
+
+  return false;
+}
+
 /* Handle a POINTER_PLUS_EXPR statement.
    For p = "abcd" + 2; compute associated length, or if
    p = q + off is pointing to a '\0' character of a string, call
    zero_length_string on it.  */
 
 static void
 handle_pointer_plus (gimple_stmt_iterator *gsi)
 {
   gimple stmt = gsi_stmt (*gsi);
   tree lhs = gimple_assign_lhs (stmt), off;
@@ -1845,20 +1922,28 @@ strlen_optimize_stmt (gimple_stmt_iterat
 	  case BUILT_IN_MEMCPY:
 	  case BUILT_IN_MEMCPY_CHK:
 	  case BUILT_IN_MEMPCPY:
 	  case BUILT_IN_MEMPCPY_CHK:
 	    handle_builtin_memcpy (DECL_FUNCTION_CODE (callee), gsi);
 	    break;
 	  case BUILT_IN_STRCAT:
 	  case BUILT_IN_STRCAT_CHK:
 	    handle_builtin_strcat (DECL_FUNCTION_CODE (callee), gsi);
 	    break;
+	  case BUILT_IN_MALLOC:
+	  case BUILT_IN_CALLOC:
+	    handle_builtin_malloc (DECL_FUNCTION_CODE (callee), gsi);
+	    break;
+	  case BUILT_IN_MEMSET:
+	    if (!handle_builtin_memset (gsi))
+	      return false;
+	    break;
 	  default:
 	    break;
 	  }
     }
   else if (is_gimple_assign (stmt))
     {
       tree lhs = gimple_assign_lhs (stmt);
 
       if (TREE_CODE (lhs) == SSA_NAME && POINTER_TYPE_P (TREE_TYPE (lhs)))
 	{

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-04-18 18:34       ` Marc Glisse
@ 2014-04-23 14:05         ` Richard Biener
  2014-05-17 14:19           ` Marc Glisse
  0 siblings, 1 reply; 26+ messages in thread
From: Richard Biener @ 2014-04-23 14:05 UTC (permalink / raw)
  To: Marc Glisse; +Cc: Jakub Jelinek, GCC Patches

On Fri, Apr 18, 2014 at 8:27 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> Thanks for the comments!
>
>
> On Fri, 18 Apr 2014, Jakub Jelinek wrote:
>
>> The passes.def change makes me a little bit nervous, but if it works,
>> perhaps.
>
>
> Would you prefer running the pass twice? I thought there would be less
> resistance to moving the pass than duplicating it.

Indeed.  I think placing it after loops and CSE (thus what you have done)
makes sense.  strlenopt itself shouldn't enable much additional
optimizations.  But well, pass ordering is always tricky.

Didn't look at the rest of the changes, but Jakub is certainly able to
approve the patch so I leave it to him.

Thanks,
Richard.

> By the way, I think even
> passes we run only once should have the required functions implemented so
> they can be run several times (at least most of them), in case users want to
> do that in plugins. I was surprised when I tried adding a second strlen pass
> and the compiler refused.
>
>
>>> --- gcc/testsuite/g++.dg/tree-ssa/calloc.C      (revision 0)
>>> +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C      (working copy)
>>> @@ -0,0 +1,35 @@
>>> +/* { dg-do compile { target c++11 } } */
>>> +/* { dg-options "-O3 -fdump-tree-optimized" } */
>>> +
>>> +#include <new>
>>> +#include <vector>
>>> +#include <cstdlib>
>>> +
>>> +void g(void*);
>>> +inline void* operator new(std::size_t sz)
>>> +{
>>> +  void *p;
>>> +
>>> +  if (sz == 0)
>>> +    sz = 1;
>>> +
>>> +  // Slightly modified from the libsupc++ version, that one has 2 calls
>>> +  // to malloc which makes it too hard to optimize.
>>> +  while ((p = std::malloc (sz)) == 0)
>>> +    {
>>> +      std::new_handler handler = std::get_new_handler ();
>>> +      if (! handler)
>>> +        throw std::bad_alloc();
>>> +      handler ();
>>> +    }
>>> +  return p;
>>> +}
>>> +
>>> +void f(void*p,int n){
>>> +  new(p)std::vector<int>(n);
>>> +}
>>> +
>>> +/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
>>> +/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
>>> +/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
>>> +/* { dg-final { cleanup-tree-dump "optimized" } } */
>>
>>
>> This looks to me way too much fragile, any time the libstdc++
>> or glibc headers change a little bit, you might need to adjust the
>> dg-final directives.  Much better would be if you just provided
>> the prototypes yourself and subset of the std::vector you really need for
>> the testcase.  You can throw some class or int, it doesn't have to be
>> std::bad_alloc, etc.
>
>
> I don't understand what seems so fragile to you. There is a single function
> in the .optimized dump, which just calls calloc in a loop. It doesn't seem
> that likely that a change in glibc/libstdc++ would make an extra memset pop
> up. A change in libstdc++ could easily prevent the optimization completely
> (I'd like to hope we can avoid that, half of the purpose of the testcase was
> making sure libstdc++ didn't change in a bad way), but I don't really see
> how it could keep it in a way that requires tweaking dg-final.
>
> While trying to write a standalone version, I hit again many missed
> optimizations, getting such nice things in the .optimized dump as:
>
>   _12 = p_13 + sz_7;
>   if (_12 != p_13)
>
> or:
>
>   _12 = p_13 + sz_7;
>   _30 = (unsigned long) _12;
>   _9 = p_13 + 4;
>   _10 = (unsigned long) _9;
>   _11 = _30 - _10;
>   _22 = _11 /[ex] 4;
>   _21 = _22;
>   _40 = _21 + 1;
>   _34 = _40 * 4;
>
> It is embarrassing... I hope the combiner GSoC will work well and we can
> just add a dozen patterns to handle this before 4.10.
>
>
>>> --- gcc/testsuite/gcc.dg/strlenopt-9.c  (revision 208772)
>>> +++ gcc/testsuite/gcc.dg/strlenopt-9.c  (working copy)
>>> @@ -11,21 +11,21 @@ fn1 (int r)
>>>       optimized away.  */
>>>    return strchr (p, '\0');
>>>  }
>>>
>>>  __attribute__((noinline, noclone)) size_t
>>>  fn2 (int r)
>>>  {
>>>    char *p, q[10];
>>>    strcpy (q, "abc");
>>>    p = r ? "a" : q;
>>> -  /* String length for p varies, therefore strlen below isn't
>>> +  /* String length is constant for both alternatives, and strlen is
>>>       optimized away.  */
>>>    return strlen (p);
>>
>>
>> Is this because of jump threading?
>
>
> It is PRE that turns:
>
>   if (r_4(D) == 0)
>     goto <bb 5>;
>   else
>     goto <bb 3>;
>
>   <bb 5>:
>   goto <bb 4>;
>
>   <bb 3>:
>
>   <bb 4>:
>   # p_1 = PHI <&q(5), "a"(3)>
>   _5 = __builtin_strlen (p_1);
>
> into:
>
>   if (r_4(D) == 0)
>     goto <bb 5>;
>   else
>     goto <bb 3>;
>
>   <bb 5>:
>   _7 = __builtin_strlen (&q);
>   pretmp_8 = _7;
>   goto <bb 4>;
>
>   <bb 3>:
>
>   <bb 4>:
>   # p_1 = PHI <&q(5), "a"(3)>
>   # prephitmp_9 = PHI <pretmp_8(5), 1(3)>
>   _5 = prephitmp_9;
>
> It says:
>
> Found partial redundancy for expression
> {call_expr<__builtin_strlen>,p_1}@.MEM_3 (0005)
>
>
>>> --- gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c    (revision 0)
>>> +++ gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c    (working copy)
>>> @@ -0,0 +1,29 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -fdump-tree-optimized" } */
>>> +
>>> +#include <stdlib.h>
>>> +#include <string.h>
>>
>>
>> Even this I find unsafe.  The strlenopt*.c tests use it's custom
>> strlenopt.h header for a reason, you might just add a calloc
>> prototype in there and use that header.
>
>
> Might as well use __builtin_* then.
>
>
>>> +/* Handle a call to malloc or calloc.  */
>>> +
>>> +static void
>>> +handle_builtin_malloc (enum built_in_function bcode,
>>> gimple_stmt_iterator *gsi)
>>> +{
>>> +  gimple stmt = gsi_stmt (*gsi);
>>> +  tree lhs = gimple_call_lhs (stmt);
>>> +  gcc_assert (get_stridx (lhs) == 0);
>>> +  int idx = new_stridx (lhs);
>>> +  tree length = NULL_TREE;
>>> +  if (bcode == BUILT_IN_CALLOC)
>>> +    length = build_int_cst (size_type_node, 0);
>>
>>
>> Is this safe?  I mean, if you call int a = 0; ptr = calloc (a, n);
>> or ptr = calloc (n, a); or ptr = calloc (0, 0); etc., then there is no
>> zero termination.  Sure, calling strlen or strchr etc. on the result
>> is undefined behavior, just wondering if it isn't going to break anything.
>> Though, the result is zero length and thus any dereferencing would be a
>> bug,
>> so perhaps we are fine.
>
>
> You know the pass better than I do... I don't think the pass does anything
> like: "all 0-length strings are equivalent, let's replace one by another
> one", so I think we should be ok, but I don't mind treating calloc like
> malloc here if you think it is too risky, I don't think people call
> strlen(calloc(a,b)) very often, and it would let me remove the "don't use
> si->length" comment.
>
>
> Updated and tested on x86_64-linux-gnu.
>
> 2014-04-18  Marc Glisse  <marc.glisse@inria.fr>
>
>
>         PR tree-optimization/57742
> gcc/
>         * tree-ssa-strlen.c (get_string_length): Ignore malloc.
>         (handle_builtin_malloc, handle_builtin_memset): New functions.
>         (strlen_optimize_stmt): Call them.
>         * passes.def: Move strlen after loop+dom.
> gcc/testsuite/
>         * g++.dg/tree-ssa/calloc.C: New testcase.
>         * gcc.dg/tree-ssa/calloc-1.c: Likewise.
>         * gcc.dg/tree-ssa/calloc-2.c: Likewise.
>         * gcc.dg/strlenopt-9.c: Adapt.
>
> --
> Marc Glisse
> Index: gcc/passes.def
> ===================================================================
> --- gcc/passes.def      (revision 209516)
> +++ gcc/passes.def      (working copy)
> @@ -176,21 +176,20 @@ along with GCC; see the file COPYING3.
>          DOM and erroneous path isolation should be due to degenerate PHI
> nodes.
>          So rather than run the full propagators, run a specialized pass
> which
>          only examines PHIs to discover const/copy propagation
>          opportunities.  */
>        NEXT_PASS (pass_phi_only_cprop);
>        NEXT_PASS (pass_dse);
>        NEXT_PASS (pass_reassoc);
>        NEXT_PASS (pass_dce);
>        NEXT_PASS (pass_forwprop);
>        NEXT_PASS (pass_phiopt);
> -      NEXT_PASS (pass_strlen);
>        NEXT_PASS (pass_ccp);
>        /* After CCP we rewrite no longer addressed locals into SSA
>          form if possible.  */
>        NEXT_PASS (pass_copy_prop);
>        NEXT_PASS (pass_cse_sincos);
>        NEXT_PASS (pass_optimize_bswap);
>        NEXT_PASS (pass_split_crit_edges);
>        NEXT_PASS (pass_pre);
>        NEXT_PASS (pass_sink_code);
>        NEXT_PASS (pass_asan);
> @@ -235,20 +234,21 @@ along with GCC; see the file COPYING3.
>        NEXT_PASS (pass_cse_reciprocals);
>        NEXT_PASS (pass_reassoc);
>        NEXT_PASS (pass_strength_reduction);
>        NEXT_PASS (pass_dominator);
>        /* The only const/copy propagation opportunities left after
>          DOM should be due to degenerate PHI nodes.  So rather than
>          run the full propagators, run a specialized pass which
>          only examines PHIs to discover const/copy propagation
>          opportunities.  */
>        NEXT_PASS (pass_phi_only_cprop);
> +      NEXT_PASS (pass_strlen);
>        NEXT_PASS (pass_vrp);
>        NEXT_PASS (pass_cd_dce);
>        NEXT_PASS (pass_tracer);
>        NEXT_PASS (pass_dse);
>        NEXT_PASS (pass_forwprop);
>        NEXT_PASS (pass_phiopt);
>        NEXT_PASS (pass_fold_builtins);
>        NEXT_PASS (pass_optimize_widening_mul);
>        NEXT_PASS (pass_tail_calls);
>        NEXT_PASS (pass_rename_ssa_copies);
> Index: gcc/testsuite/g++.dg/tree-ssa/calloc.C
> ===================================================================
> --- gcc/testsuite/g++.dg/tree-ssa/calloc.C      (revision 0)
> +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C      (working copy)
> @@ -0,0 +1,50 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-optimized" } */
> +
> +typedef __SIZE_TYPE__ size_t;
> +inline void* operator new(size_t, void* p) throw() { return p; }
> +
> +typedef void (*handler_t)(void);
> +extern handler_t get_handle();
> +
> +inline void* operator new(size_t sz)
> +{
> +  void *p;
> +
> +  if (sz == 0)
> +    sz = 1;
> +
> +  while ((p = __builtin_malloc (sz)) == 0)
> +    {
> +      handler_t handler = get_handle ();
> +      if (! handler)
> +        throw 42;
> +      handler ();
> +    }
> +  return p;
> +}
> +
> +struct vect {
> +  int *start, *end;
> +  vect(size_t n) {
> +    start = end = 0;
> +    if (n > (size_t)-1 / sizeof(int))
> +      throw 33;
> +    if (n != 0)
> +      start = static_cast<int*> (operator new (n * sizeof(int)));
> +    end = start + n;
> +    int *p = start;
> +    for (size_t l = n; l > 0; --l, ++p)
> +      *p = 0;
> +  }
> +};
> +
> +void f (void *p, int n)
> +{
> +  new (p) vect(n);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/strlenopt-9.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/strlenopt-9.c  (revision 209516)
> +++ gcc/testsuite/gcc.dg/strlenopt-9.c  (working copy)
> @@ -11,21 +11,21 @@ fn1 (int r)
>       optimized away.  */
>    return strchr (p, '\0');
>  }
>
>  __attribute__((noinline, noclone)) size_t
>  fn2 (int r)
>  {
>    char *p, q[10];
>    strcpy (q, "abc");
>    p = r ? "a" : q;
> -  /* String length for p varies, therefore strlen below isn't
> +  /* String length is constant for both alternatives, and strlen is
>       optimized away.  */
>    return strlen (p);
>  }
>
>  __attribute__((noinline, noclone)) size_t
>  fn3 (char *p, int n)
>  {
>    int i;
>    p = strchr (p, '\0');
>    /* strcat here can be optimized into memcpy.  */
> @@ -91,19 +91,19 @@ main ()
>        || memcmp (b, "aaaabcdabcdefgabcdefgefgabcdabcd", 33) != 0
>        || memcmp (buf, "aaaabcdabcdefgabcdefgefgabcdefg", 32) != 0)
>      abort ();
>    if (fn4 (b, 256) != 4
>        || memcmp (b, "aaaabcdabcdefgabcdefgefgabcdabcdabcd", 37) != 0
>        || memcmp (buf, "aaaabcdabcdefgabcdefgefgabcdabcdefgefg", 39) != 0)
>      abort ();
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */
> +/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
>  /* { dg-final { cleanup-tree-dump "strlen" } } */
>  /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */
>  /* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c    (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c    (working copy)
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +extern int a;
> +extern int *b;
> +int n;
> +void* f(long *q)
> +{
> +  int *p = __builtin_malloc (n);
> +  ++*q;
> +  if (p)
> +  {
> +    ++*q;
> +    a = 2;
> +    __builtin_memset (p, 0, n);
> +    *b = 3;
> +  }
> +  return p;
> +}
> +void* g(void)
> +{
> +  float *p = __builtin_calloc (8, 4);
> +  return __builtin_memset (p, 0, 24); // not 32
> +}
> +
> +/* { dg-final { scan-tree-dump-times "calloc" 2 "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/calloc-2.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/calloc-2.c    (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/calloc-2.c    (working copy)
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +int n, nn;
> +void* f()
> +{
> +  char *p = __builtin_calloc (n, 1);
> +  p[42] = '\n';
> +  __builtin_memset (p, 0, nn);
> +  return p;
> +}
> +
> +void* g(int m1, int m2)
> +{
> +  char *p = __builtin_malloc (m2);
> +  while (--m1)
> +  {
> +    __builtin_memset (p, 0, m2);
> +    p[n] = 'b';
> +  }
> +  return p;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "malloc" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "memset" 2 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/tree-ssa-strlen.c
> ===================================================================
> --- gcc/tree-ssa-strlen.c       (revision 209516)
> +++ gcc/tree-ssa-strlen.c       (working copy)
> @@ -496,20 +496,23 @@ get_string_length (strinfo si)
>         case BUILT_IN_STPCPY_CHK:
>           gcc_assert (lhs != NULL_TREE);
>           loc = gimple_location (stmt);
>           si->endptr = lhs;
>           si->stmt = NULL;
>           lhs = fold_convert_loc (loc, size_type_node, lhs);
>           si->length = fold_convert_loc (loc, size_type_node, si->ptr);
>           si->length = fold_build2_loc (loc, MINUS_EXPR, size_type_node,
>                                         lhs, si->length);
>           break;
> +       case BUILT_IN_MALLOC:
> +         break;
> +       /* BUILT_IN_CALLOC always has si->length set.  */
>         default:
>           gcc_unreachable ();
>           break;
>         }
>      }
>
>    return si->length;
>  }
>
>  /* Invalidate string length information for strings whose length
> @@ -521,20 +524,21 @@ maybe_invalidate (gimple stmt)
>    strinfo si;
>    unsigned int i;
>    bool nonempty = false;
>
>    for (i = 1; vec_safe_iterate (stridx_to_strinfo, i, &si); ++i)
>      if (si != NULL)
>        {
>         if (!si->dont_invalidate)
>           {
>             ao_ref r;
> +           /* Do not use si->length.  */
>             ao_ref_init_from_ptr_and_size (&r, si->ptr, NULL_TREE);
>             if (stmt_may_clobber_ref_p_1 (stmt, &r))
>               {
>                 set_strinfo (i, NULL);
>                 free_strinfo (si);
>                 continue;
>               }
>           }
>         si->dont_invalidate = false;
>         nonempty = true;
> @@ -1608,20 +1612,93 @@ handle_builtin_strcat (enum built_in_fun
>         {
>           laststmt.stmt = stmt;
>           laststmt.len = srclen;
>           laststmt.stridx = dsi->idx;
>         }
>      }
>    else if (dump_file && (dump_flags & TDF_DETAILS) != 0)
>      fprintf (dump_file, "not possible.\n");
>  }
>
> +/* Handle a call to malloc or calloc.  */
> +
> +static void
> +handle_builtin_malloc (enum built_in_function bcode, gimple_stmt_iterator
> *gsi)
> +{
> +  gimple stmt = gsi_stmt (*gsi);
> +  tree lhs = gimple_call_lhs (stmt);
> +  gcc_assert (get_stridx (lhs) == 0);
> +  int idx = new_stridx (lhs);
> +  tree length = NULL_TREE;
> +  if (bcode == BUILT_IN_CALLOC)
> +    length = build_int_cst (size_type_node, 0);
> +  strinfo si = new_strinfo (lhs, idx, length);
> +  if (bcode == BUILT_IN_CALLOC)
> +    si->endptr = lhs;
> +  set_strinfo (idx, si);
> +  si->writable = true;
> +  si->stmt = stmt;
> +  si->dont_invalidate = true;
> +}
> +
> +/* Handle a call to memset.
> +   After a call to calloc, memset(,0,) is unnecessary.
> +   memset(malloc(n),0,n) is calloc(n,1).  */
> +
> +static bool
> +handle_builtin_memset (gimple_stmt_iterator *gsi)
> +{
> +  gimple stmt2 = gsi_stmt (*gsi);
> +  if (!integer_zerop (gimple_call_arg (stmt2, 1)))
> +    return true;
> +  tree ptr = gimple_call_arg (stmt2, 0);
> +  int idx1 = get_stridx (ptr);
> +  if (idx1 <= 0)
> +    return true;
> +  strinfo si1 = get_strinfo (idx1);
> +  if (!si1)
> +    return true;
> +  gimple stmt1 = si1->stmt;
> +  if (!stmt1 || !is_gimple_call (stmt1))
> +    return true;
> +  tree callee1 = gimple_call_fndecl (stmt1);
> +  if (!gimple_call_builtin_p (stmt1, BUILT_IN_NORMAL))
> +    return true;
> +  enum built_in_function code1 = DECL_FUNCTION_CODE (callee1);
> +  tree size = gimple_call_arg (stmt2, 2);
> +  if (code1 == BUILT_IN_CALLOC)
> +    /* Not touching stmt1 */ ;
> +  else if (code1 == BUILT_IN_MALLOC
> +          && operand_equal_p (gimple_call_arg (stmt1, 0), size, 0))
> +    {
> +      gimple_stmt_iterator gsi1 = gsi_for_stmt (stmt1);
> +      update_gimple_call (&gsi1, builtin_decl_implicit (BUILT_IN_CALLOC),
> 2,
> +                         size, build_one_cst (size_type_node));
> +    }
> +  else
> +    return true;
> +  tree lhs = gimple_call_lhs (stmt2);
> +  unlink_stmt_vdef (stmt2);
> +  if (lhs)
> +    {
> +      gimple assign = gimple_build_assign (lhs, ptr);
> +      gsi_replace (gsi, assign, false);
> +    }
> +  else
> +    {
> +      gsi_remove (gsi, true);
> +      release_defs (stmt2);
> +    }
> +
> +  return false;
> +}
> +
>  /* Handle a POINTER_PLUS_EXPR statement.
>     For p = "abcd" + 2; compute associated length, or if
>     p = q + off is pointing to a '\0' character of a string, call
>     zero_length_string on it.  */
>
>  static void
>  handle_pointer_plus (gimple_stmt_iterator *gsi)
>  {
>    gimple stmt = gsi_stmt (*gsi);
>    tree lhs = gimple_assign_lhs (stmt), off;
> @@ -1845,20 +1922,28 @@ strlen_optimize_stmt (gimple_stmt_iterat
>           case BUILT_IN_MEMCPY:
>           case BUILT_IN_MEMCPY_CHK:
>           case BUILT_IN_MEMPCPY:
>           case BUILT_IN_MEMPCPY_CHK:
>             handle_builtin_memcpy (DECL_FUNCTION_CODE (callee), gsi);
>             break;
>           case BUILT_IN_STRCAT:
>           case BUILT_IN_STRCAT_CHK:
>             handle_builtin_strcat (DECL_FUNCTION_CODE (callee), gsi);
>             break;
> +         case BUILT_IN_MALLOC:
> +         case BUILT_IN_CALLOC:
> +           handle_builtin_malloc (DECL_FUNCTION_CODE (callee), gsi);
> +           break;
> +         case BUILT_IN_MEMSET:
> +           if (!handle_builtin_memset (gsi))
> +             return false;
> +           break;
>           default:
>             break;
>           }
>      }
>    else if (is_gimple_assign (stmt))
>      {
>        tree lhs = gimple_assign_lhs (stmt);
>
>        if (TREE_CODE (lhs) == SSA_NAME && POINTER_TYPE_P (TREE_TYPE (lhs)))
>         {
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-04-23 14:05         ` Richard Biener
@ 2014-05-17 14:19           ` Marc Glisse
  2014-06-03 14:00             ` Marc Glisse
  0 siblings, 1 reply; 26+ messages in thread
From: Marc Glisse @ 2014-05-17 14:19 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: GCC Patches

Ping Jakub?
https://gcc.gnu.org/ml/gcc-patches/2014-04/msg01104.html

On Wed, 23 Apr 2014, Richard Biener wrote:

> On Fri, Apr 18, 2014 at 8:27 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
>> Thanks for the comments!
>>
>>
>> On Fri, 18 Apr 2014, Jakub Jelinek wrote:
>>
>>> The passes.def change makes me a little bit nervous, but if it works,
>>> perhaps.
>>
>>
>> Would you prefer running the pass twice? I thought there would be less
>> resistance to moving the pass than duplicating it.
>
> Indeed.  I think placing it after loops and CSE (thus what you have done)
> makes sense.  strlenopt itself shouldn't enable much additional
> optimizations.  But well, pass ordering is always tricky.
>
> Didn't look at the rest of the changes, but Jakub is certainly able to
> approve the patch so I leave it to him.
>
> Thanks,
> Richard.
>
>> By the way, I think even
>> passes we run only once should have the required functions implemented so
>> they can be run several times (at least most of them), in case users want to
>> do that in plugins. I was surprised when I tried adding a second strlen pass
>> and the compiler refused.
>>
>>
>>>> --- gcc/testsuite/g++.dg/tree-ssa/calloc.C      (revision 0)
>>>> +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C      (working copy)
>>>> @@ -0,0 +1,35 @@
>>>> +/* { dg-do compile { target c++11 } } */
>>>> +/* { dg-options "-O3 -fdump-tree-optimized" } */
>>>> +
>>>> +#include <new>
>>>> +#include <vector>
>>>> +#include <cstdlib>
>>>> +
>>>> +void g(void*);
>>>> +inline void* operator new(std::size_t sz)
>>>> +{
>>>> +  void *p;
>>>> +
>>>> +  if (sz == 0)
>>>> +    sz = 1;
>>>> +
>>>> +  // Slightly modified from the libsupc++ version, that one has 2 calls
>>>> +  // to malloc which makes it too hard to optimize.
>>>> +  while ((p = std::malloc (sz)) == 0)
>>>> +    {
>>>> +      std::new_handler handler = std::get_new_handler ();
>>>> +      if (! handler)
>>>> +        throw std::bad_alloc();
>>>> +      handler ();
>>>> +    }
>>>> +  return p;
>>>> +}
>>>> +
>>>> +void f(void*p,int n){
>>>> +  new(p)std::vector<int>(n);
>>>> +}
>>>> +
>>>> +/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
>>>> +/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
>>>> +/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
>>>> +/* { dg-final { cleanup-tree-dump "optimized" } } */
>>>
>>>
>>> This looks to me way too much fragile, any time the libstdc++
>>> or glibc headers change a little bit, you might need to adjust the
>>> dg-final directives.  Much better would be if you just provided
>>> the prototypes yourself and subset of the std::vector you really need for
>>> the testcase.  You can throw some class or int, it doesn't have to be
>>> std::bad_alloc, etc.
>>
>>
>> I don't understand what seems so fragile to you. There is a single function
>> in the .optimized dump, which just calls calloc in a loop. It doesn't seem
>> that likely that a change in glibc/libstdc++ would make an extra memset pop
>> up. A change in libstdc++ could easily prevent the optimization completely
>> (I'd like to hope we can avoid that, half of the purpose of the testcase was
>> making sure libstdc++ didn't change in a bad way), but I don't really see
>> how it could keep it in a way that requires tweaking dg-final.
>>
>> While trying to write a standalone version, I hit again many missed
>> optimizations, getting such nice things in the .optimized dump as:
>>
>>   _12 = p_13 + sz_7;
>>   if (_12 != p_13)
>>
>> or:
>>
>>   _12 = p_13 + sz_7;
>>   _30 = (unsigned long) _12;
>>   _9 = p_13 + 4;
>>   _10 = (unsigned long) _9;
>>   _11 = _30 - _10;
>>   _22 = _11 /[ex] 4;
>>   _21 = _22;
>>   _40 = _21 + 1;
>>   _34 = _40 * 4;
>>
>> It is embarrassing... I hope the combiner GSoC will work well and we can
>> just add a dozen patterns to handle this before 4.10.
>>
>>
>>>> --- gcc/testsuite/gcc.dg/strlenopt-9.c  (revision 208772)
>>>> +++ gcc/testsuite/gcc.dg/strlenopt-9.c  (working copy)
>>>> @@ -11,21 +11,21 @@ fn1 (int r)
>>>>       optimized away.  */
>>>>    return strchr (p, '\0');
>>>>  }
>>>>
>>>>  __attribute__((noinline, noclone)) size_t
>>>>  fn2 (int r)
>>>>  {
>>>>    char *p, q[10];
>>>>    strcpy (q, "abc");
>>>>    p = r ? "a" : q;
>>>> -  /* String length for p varies, therefore strlen below isn't
>>>> +  /* String length is constant for both alternatives, and strlen is
>>>>       optimized away.  */
>>>>    return strlen (p);
>>>
>>>
>>> Is this because of jump threading?
>>
>>
>> It is PRE that turns:
>>
>>   if (r_4(D) == 0)
>>     goto <bb 5>;
>>   else
>>     goto <bb 3>;
>>
>>   <bb 5>:
>>   goto <bb 4>;
>>
>>   <bb 3>:
>>
>>   <bb 4>:
>>   # p_1 = PHI <&q(5), "a"(3)>
>>   _5 = __builtin_strlen (p_1);
>>
>> into:
>>
>>   if (r_4(D) == 0)
>>     goto <bb 5>;
>>   else
>>     goto <bb 3>;
>>
>>   <bb 5>:
>>   _7 = __builtin_strlen (&q);
>>   pretmp_8 = _7;
>>   goto <bb 4>;
>>
>>   <bb 3>:
>>
>>   <bb 4>:
>>   # p_1 = PHI <&q(5), "a"(3)>
>>   # prephitmp_9 = PHI <pretmp_8(5), 1(3)>
>>   _5 = prephitmp_9;
>>
>> It says:
>>
>> Found partial redundancy for expression
>> {call_expr<__builtin_strlen>,p_1}@.MEM_3 (0005)
>>
>>
>>>> --- gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c    (revision 0)
>>>> +++ gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c    (working copy)
>>>> @@ -0,0 +1,29 @@
>>>> +/* { dg-do compile } */
>>>> +/* { dg-options "-O2 -fdump-tree-optimized" } */
>>>> +
>>>> +#include <stdlib.h>
>>>> +#include <string.h>
>>>
>>>
>>> Even this I find unsafe.  The strlenopt*.c tests use it's custom
>>> strlenopt.h header for a reason, you might just add a calloc
>>> prototype in there and use that header.
>>
>>
>> Might as well use __builtin_* then.
>>
>>
>>>> +/* Handle a call to malloc or calloc.  */
>>>> +
>>>> +static void
>>>> +handle_builtin_malloc (enum built_in_function bcode,
>>>> gimple_stmt_iterator *gsi)
>>>> +{
>>>> +  gimple stmt = gsi_stmt (*gsi);
>>>> +  tree lhs = gimple_call_lhs (stmt);
>>>> +  gcc_assert (get_stridx (lhs) == 0);
>>>> +  int idx = new_stridx (lhs);
>>>> +  tree length = NULL_TREE;
>>>> +  if (bcode == BUILT_IN_CALLOC)
>>>> +    length = build_int_cst (size_type_node, 0);
>>>
>>>
>>> Is this safe?  I mean, if you call int a = 0; ptr = calloc (a, n);
>>> or ptr = calloc (n, a); or ptr = calloc (0, 0); etc., then there is no
>>> zero termination.  Sure, calling strlen or strchr etc. on the result
>>> is undefined behavior, just wondering if it isn't going to break anything.
>>> Though, the result is zero length and thus any dereferencing would be a
>>> bug,
>>> so perhaps we are fine.
>>
>>
>> You know the pass better than I do... I don't think the pass does anything
>> like: "all 0-length strings are equivalent, let's replace one by another
>> one", so I think we should be ok, but I don't mind treating calloc like
>> malloc here if you think it is too risky, I don't think people call
>> strlen(calloc(a,b)) very often, and it would let me remove the "don't use
>> si->length" comment.
>>
>>
>> Updated and tested on x86_64-linux-gnu.
>>
>> 2014-04-18  Marc Glisse  <marc.glisse@inria.fr>
>>
>>
>>         PR tree-optimization/57742
>> gcc/
>>         * tree-ssa-strlen.c (get_string_length): Ignore malloc.
>>         (handle_builtin_malloc, handle_builtin_memset): New functions.
>>         (strlen_optimize_stmt): Call them.
>>         * passes.def: Move strlen after loop+dom.
>> gcc/testsuite/
>>         * g++.dg/tree-ssa/calloc.C: New testcase.
>>         * gcc.dg/tree-ssa/calloc-1.c: Likewise.
>>         * gcc.dg/tree-ssa/calloc-2.c: Likewise.
>>         * gcc.dg/strlenopt-9.c: Adapt.
>>
>> --
>> Marc Glisse
>> Index: gcc/passes.def
>> ===================================================================
>> --- gcc/passes.def      (revision 209516)
>> +++ gcc/passes.def      (working copy)
>> @@ -176,21 +176,20 @@ along with GCC; see the file COPYING3.
>>          DOM and erroneous path isolation should be due to degenerate PHI
>> nodes.
>>          So rather than run the full propagators, run a specialized pass
>> which
>>          only examines PHIs to discover const/copy propagation
>>          opportunities.  */
>>        NEXT_PASS (pass_phi_only_cprop);
>>        NEXT_PASS (pass_dse);
>>        NEXT_PASS (pass_reassoc);
>>        NEXT_PASS (pass_dce);
>>        NEXT_PASS (pass_forwprop);
>>        NEXT_PASS (pass_phiopt);
>> -      NEXT_PASS (pass_strlen);
>>        NEXT_PASS (pass_ccp);
>>        /* After CCP we rewrite no longer addressed locals into SSA
>>          form if possible.  */
>>        NEXT_PASS (pass_copy_prop);
>>        NEXT_PASS (pass_cse_sincos);
>>        NEXT_PASS (pass_optimize_bswap);
>>        NEXT_PASS (pass_split_crit_edges);
>>        NEXT_PASS (pass_pre);
>>        NEXT_PASS (pass_sink_code);
>>        NEXT_PASS (pass_asan);
>> @@ -235,20 +234,21 @@ along with GCC; see the file COPYING3.
>>        NEXT_PASS (pass_cse_reciprocals);
>>        NEXT_PASS (pass_reassoc);
>>        NEXT_PASS (pass_strength_reduction);
>>        NEXT_PASS (pass_dominator);
>>        /* The only const/copy propagation opportunities left after
>>          DOM should be due to degenerate PHI nodes.  So rather than
>>          run the full propagators, run a specialized pass which
>>          only examines PHIs to discover const/copy propagation
>>          opportunities.  */
>>        NEXT_PASS (pass_phi_only_cprop);
>> +      NEXT_PASS (pass_strlen);
>>        NEXT_PASS (pass_vrp);
>>        NEXT_PASS (pass_cd_dce);
>>        NEXT_PASS (pass_tracer);
>>        NEXT_PASS (pass_dse);
>>        NEXT_PASS (pass_forwprop);
>>        NEXT_PASS (pass_phiopt);
>>        NEXT_PASS (pass_fold_builtins);
>>        NEXT_PASS (pass_optimize_widening_mul);
>>        NEXT_PASS (pass_tail_calls);
>>        NEXT_PASS (pass_rename_ssa_copies);
>> Index: gcc/testsuite/g++.dg/tree-ssa/calloc.C
>> ===================================================================
>> --- gcc/testsuite/g++.dg/tree-ssa/calloc.C      (revision 0)
>> +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C      (working copy)
>> @@ -0,0 +1,50 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3 -fdump-tree-optimized" } */
>> +
>> +typedef __SIZE_TYPE__ size_t;
>> +inline void* operator new(size_t, void* p) throw() { return p; }
>> +
>> +typedef void (*handler_t)(void);
>> +extern handler_t get_handle();
>> +
>> +inline void* operator new(size_t sz)
>> +{
>> +  void *p;
>> +
>> +  if (sz == 0)
>> +    sz = 1;
>> +
>> +  while ((p = __builtin_malloc (sz)) == 0)
>> +    {
>> +      handler_t handler = get_handle ();
>> +      if (! handler)
>> +        throw 42;
>> +      handler ();
>> +    }
>> +  return p;
>> +}
>> +
>> +struct vect {
>> +  int *start, *end;
>> +  vect(size_t n) {
>> +    start = end = 0;
>> +    if (n > (size_t)-1 / sizeof(int))
>> +      throw 33;
>> +    if (n != 0)
>> +      start = static_cast<int*> (operator new (n * sizeof(int)));
>> +    end = start + n;
>> +    int *p = start;
>> +    for (size_t l = n; l > 0; --l, ++p)
>> +      *p = 0;
>> +  }
>> +};
>> +
>> +void f (void *p, int n)
>> +{
>> +  new (p) vect(n);
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
>> +/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
>> +/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
>> +/* { dg-final { cleanup-tree-dump "optimized" } } */
>> Index: gcc/testsuite/gcc.dg/strlenopt-9.c
>> ===================================================================
>> --- gcc/testsuite/gcc.dg/strlenopt-9.c  (revision 209516)
>> +++ gcc/testsuite/gcc.dg/strlenopt-9.c  (working copy)
>> @@ -11,21 +11,21 @@ fn1 (int r)
>>       optimized away.  */
>>    return strchr (p, '\0');
>>  }
>>
>>  __attribute__((noinline, noclone)) size_t
>>  fn2 (int r)
>>  {
>>    char *p, q[10];
>>    strcpy (q, "abc");
>>    p = r ? "a" : q;
>> -  /* String length for p varies, therefore strlen below isn't
>> +  /* String length is constant for both alternatives, and strlen is
>>       optimized away.  */
>>    return strlen (p);
>>  }
>>
>>  __attribute__((noinline, noclone)) size_t
>>  fn3 (char *p, int n)
>>  {
>>    int i;
>>    p = strchr (p, '\0');
>>    /* strcat here can be optimized into memcpy.  */
>> @@ -91,19 +91,19 @@ main ()
>>        || memcmp (b, "aaaabcdabcdefgabcdefgefgabcdabcd", 33) != 0
>>        || memcmp (buf, "aaaabcdabcdefgabcdefgefgabcdefg", 32) != 0)
>>      abort ();
>>    if (fn4 (b, 256) != 4
>>        || memcmp (b, "aaaabcdabcdefgabcdefgefgabcdabcdabcd", 37) != 0
>>        || memcmp (buf, "aaaabcdabcdefgabcdefgefgabcdabcdefgefg", 39) != 0)
>>      abort ();
>>    return 0;
>>  }
>>
>> -/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */
>> +/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
>>  /* { dg-final { cleanup-tree-dump "strlen" } } */
>>  /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */
>>  /* { dg-final { cleanup-tree-dump "optimized" } } */
>> Index: gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c
>> ===================================================================
>> --- gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c    (revision 0)
>> +++ gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c    (working copy)
>> @@ -0,0 +1,29 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -fdump-tree-optimized" } */
>> +
>> +extern int a;
>> +extern int *b;
>> +int n;
>> +void* f(long *q)
>> +{
>> +  int *p = __builtin_malloc (n);
>> +  ++*q;
>> +  if (p)
>> +  {
>> +    ++*q;
>> +    a = 2;
>> +    __builtin_memset (p, 0, n);
>> +    *b = 3;
>> +  }
>> +  return p;
>> +}
>> +void* g(void)
>> +{
>> +  float *p = __builtin_calloc (8, 4);
>> +  return __builtin_memset (p, 0, 24); // not 32
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "calloc" 2 "optimized" } } */
>> +/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
>> +/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
>> +/* { dg-final { cleanup-tree-dump "optimized" } } */
>> Index: gcc/testsuite/gcc.dg/tree-ssa/calloc-2.c
>> ===================================================================
>> --- gcc/testsuite/gcc.dg/tree-ssa/calloc-2.c    (revision 0)
>> +++ gcc/testsuite/gcc.dg/tree-ssa/calloc-2.c    (working copy)
>> @@ -0,0 +1,27 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -fdump-tree-optimized" } */
>> +
>> +int n, nn;
>> +void* f()
>> +{
>> +  char *p = __builtin_calloc (n, 1);
>> +  p[42] = '\n';
>> +  __builtin_memset (p, 0, nn);
>> +  return p;
>> +}
>> +
>> +void* g(int m1, int m2)
>> +{
>> +  char *p = __builtin_malloc (m2);
>> +  while (--m1)
>> +  {
>> +    __builtin_memset (p, 0, m2);
>> +    p[n] = 'b';
>> +  }
>> +  return p;
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "malloc" 1 "optimized" } } */
>> +/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
>> +/* { dg-final { scan-tree-dump-times "memset" 2 "optimized" } } */
>> +/* { dg-final { cleanup-tree-dump "optimized" } } */
>> Index: gcc/tree-ssa-strlen.c
>> ===================================================================
>> --- gcc/tree-ssa-strlen.c       (revision 209516)
>> +++ gcc/tree-ssa-strlen.c       (working copy)
>> @@ -496,20 +496,23 @@ get_string_length (strinfo si)
>>         case BUILT_IN_STPCPY_CHK:
>>           gcc_assert (lhs != NULL_TREE);
>>           loc = gimple_location (stmt);
>>           si->endptr = lhs;
>>           si->stmt = NULL;
>>           lhs = fold_convert_loc (loc, size_type_node, lhs);
>>           si->length = fold_convert_loc (loc, size_type_node, si->ptr);
>>           si->length = fold_build2_loc (loc, MINUS_EXPR, size_type_node,
>>                                         lhs, si->length);
>>           break;
>> +       case BUILT_IN_MALLOC:
>> +         break;
>> +       /* BUILT_IN_CALLOC always has si->length set.  */
>>         default:
>>           gcc_unreachable ();
>>           break;
>>         }
>>      }
>>
>>    return si->length;
>>  }
>>
>>  /* Invalidate string length information for strings whose length
>> @@ -521,20 +524,21 @@ maybe_invalidate (gimple stmt)
>>    strinfo si;
>>    unsigned int i;
>>    bool nonempty = false;
>>
>>    for (i = 1; vec_safe_iterate (stridx_to_strinfo, i, &si); ++i)
>>      if (si != NULL)
>>        {
>>         if (!si->dont_invalidate)
>>           {
>>             ao_ref r;
>> +           /* Do not use si->length.  */
>>             ao_ref_init_from_ptr_and_size (&r, si->ptr, NULL_TREE);
>>             if (stmt_may_clobber_ref_p_1 (stmt, &r))
>>               {
>>                 set_strinfo (i, NULL);
>>                 free_strinfo (si);
>>                 continue;
>>               }
>>           }
>>         si->dont_invalidate = false;
>>         nonempty = true;
>> @@ -1608,20 +1612,93 @@ handle_builtin_strcat (enum built_in_fun
>>         {
>>           laststmt.stmt = stmt;
>>           laststmt.len = srclen;
>>           laststmt.stridx = dsi->idx;
>>         }
>>      }
>>    else if (dump_file && (dump_flags & TDF_DETAILS) != 0)
>>      fprintf (dump_file, "not possible.\n");
>>  }
>>
>> +/* Handle a call to malloc or calloc.  */
>> +
>> +static void
>> +handle_builtin_malloc (enum built_in_function bcode, gimple_stmt_iterator
>> *gsi)
>> +{
>> +  gimple stmt = gsi_stmt (*gsi);
>> +  tree lhs = gimple_call_lhs (stmt);
>> +  gcc_assert (get_stridx (lhs) == 0);
>> +  int idx = new_stridx (lhs);
>> +  tree length = NULL_TREE;
>> +  if (bcode == BUILT_IN_CALLOC)
>> +    length = build_int_cst (size_type_node, 0);
>> +  strinfo si = new_strinfo (lhs, idx, length);
>> +  if (bcode == BUILT_IN_CALLOC)
>> +    si->endptr = lhs;
>> +  set_strinfo (idx, si);
>> +  si->writable = true;
>> +  si->stmt = stmt;
>> +  si->dont_invalidate = true;
>> +}
>> +
>> +/* Handle a call to memset.
>> +   After a call to calloc, memset(,0,) is unnecessary.
>> +   memset(malloc(n),0,n) is calloc(n,1).  */
>> +
>> +static bool
>> +handle_builtin_memset (gimple_stmt_iterator *gsi)
>> +{
>> +  gimple stmt2 = gsi_stmt (*gsi);
>> +  if (!integer_zerop (gimple_call_arg (stmt2, 1)))
>> +    return true;
>> +  tree ptr = gimple_call_arg (stmt2, 0);
>> +  int idx1 = get_stridx (ptr);
>> +  if (idx1 <= 0)
>> +    return true;
>> +  strinfo si1 = get_strinfo (idx1);
>> +  if (!si1)
>> +    return true;
>> +  gimple stmt1 = si1->stmt;
>> +  if (!stmt1 || !is_gimple_call (stmt1))
>> +    return true;
>> +  tree callee1 = gimple_call_fndecl (stmt1);
>> +  if (!gimple_call_builtin_p (stmt1, BUILT_IN_NORMAL))
>> +    return true;
>> +  enum built_in_function code1 = DECL_FUNCTION_CODE (callee1);
>> +  tree size = gimple_call_arg (stmt2, 2);
>> +  if (code1 == BUILT_IN_CALLOC)
>> +    /* Not touching stmt1 */ ;
>> +  else if (code1 == BUILT_IN_MALLOC
>> +          && operand_equal_p (gimple_call_arg (stmt1, 0), size, 0))
>> +    {
>> +      gimple_stmt_iterator gsi1 = gsi_for_stmt (stmt1);
>> +      update_gimple_call (&gsi1, builtin_decl_implicit (BUILT_IN_CALLOC),
>> 2,
>> +                         size, build_one_cst (size_type_node));
>> +    }
>> +  else
>> +    return true;
>> +  tree lhs = gimple_call_lhs (stmt2);
>> +  unlink_stmt_vdef (stmt2);
>> +  if (lhs)
>> +    {
>> +      gimple assign = gimple_build_assign (lhs, ptr);
>> +      gsi_replace (gsi, assign, false);
>> +    }
>> +  else
>> +    {
>> +      gsi_remove (gsi, true);
>> +      release_defs (stmt2);
>> +    }
>> +
>> +  return false;
>> +}
>> +
>>  /* Handle a POINTER_PLUS_EXPR statement.
>>     For p = "abcd" + 2; compute associated length, or if
>>     p = q + off is pointing to a '\0' character of a string, call
>>     zero_length_string on it.  */
>>
>>  static void
>>  handle_pointer_plus (gimple_stmt_iterator *gsi)
>>  {
>>    gimple stmt = gsi_stmt (*gsi);
>>    tree lhs = gimple_assign_lhs (stmt), off;
>> @@ -1845,20 +1922,28 @@ strlen_optimize_stmt (gimple_stmt_iterat
>>           case BUILT_IN_MEMCPY:
>>           case BUILT_IN_MEMCPY_CHK:
>>           case BUILT_IN_MEMPCPY:
>>           case BUILT_IN_MEMPCPY_CHK:
>>             handle_builtin_memcpy (DECL_FUNCTION_CODE (callee), gsi);
>>             break;
>>           case BUILT_IN_STRCAT:
>>           case BUILT_IN_STRCAT_CHK:
>>             handle_builtin_strcat (DECL_FUNCTION_CODE (callee), gsi);
>>             break;
>> +         case BUILT_IN_MALLOC:
>> +         case BUILT_IN_CALLOC:
>> +           handle_builtin_malloc (DECL_FUNCTION_CODE (callee), gsi);
>> +           break;
>> +         case BUILT_IN_MEMSET:
>> +           if (!handle_builtin_memset (gsi))
>> +             return false;
>> +           break;
>>           default:
>>             break;
>>           }
>>      }
>>    else if (is_gimple_assign (stmt))
>>      {
>>        tree lhs = gimple_assign_lhs (stmt);
>>
>>        if (TREE_CODE (lhs) == SSA_NAME && POINTER_TYPE_P (TREE_TYPE (lhs)))
>>         {

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-05-17 14:19           ` Marc Glisse
@ 2014-06-03 14:00             ` Marc Glisse
  2014-06-23  7:32               ` Jakub Jelinek
  0 siblings, 1 reply; 26+ messages in thread
From: Marc Glisse @ 2014-06-03 14:00 UTC (permalink / raw)
  To: GCC Patches

Ping?

On Sat, 17 May 2014, Marc Glisse wrote:

> Ping Jakub?
> https://gcc.gnu.org/ml/gcc-patches/2014-04/msg01104.html
>
> On Wed, 23 Apr 2014, Richard Biener wrote:
>
>> On Fri, Apr 18, 2014 at 8:27 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
>>> Thanks for the comments!
>>> 
>>> 
>>> On Fri, 18 Apr 2014, Jakub Jelinek wrote:
>>> 
>>>> The passes.def change makes me a little bit nervous, but if it works,
>>>> perhaps.
>>> 
>>> 
>>> Would you prefer running the pass twice? I thought there would be less
>>> resistance to moving the pass than duplicating it.
>> 
>> Indeed.  I think placing it after loops and CSE (thus what you have done)
>> makes sense.  strlenopt itself shouldn't enable much additional
>> optimizations.  But well, pass ordering is always tricky.
>> 
>> Didn't look at the rest of the changes, but Jakub is certainly able to
>> approve the patch so I leave it to him.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-06-03 14:00             ` Marc Glisse
@ 2014-06-23  7:32               ` Jakub Jelinek
  2014-06-23 15:52                 ` Marc Glisse
  0 siblings, 1 reply; 26+ messages in thread
From: Jakub Jelinek @ 2014-06-23  7:32 UTC (permalink / raw)
  To: Marc Glisse; +Cc: GCC Patches

On Tue, Jun 03, 2014 at 04:00:17PM +0200, Marc Glisse wrote:
> Ping?

Ok for trunk, sorry for the delay.

	Jakub

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-06-23  7:32               ` Jakub Jelinek
@ 2014-06-23 15:52                 ` Marc Glisse
  2014-06-23 16:11                   ` Richard Biener
  0 siblings, 1 reply; 26+ messages in thread
From: Marc Glisse @ 2014-06-23 15:52 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: GCC Patches, richard.guenther

[-- Attachment #1: Type: TEXT/PLAIN, Size: 863 bytes --]

On Mon, 23 Jun 2014, Jakub Jelinek wrote:

> Ok for trunk, sorry for the delay.

Thanks. Richard has moved the passes a bit since then, but I still have 
exactly one spot where the testsuite is ok :-) I need strlen to be after 
dom (for calloc.C) and before vrp (for several strlenopt-*.c). I'll commit 
it tomorrow if there aren't any comments on the pass placement.

2014-06-24  Marc Glisse  <marc.glisse@inria.fr>

 	PR tree-optimization/57742
gcc/
 	* tree-ssa-strlen.c (get_string_length): Ignore malloc.
 	(handle_builtin_malloc, handle_builtin_memset): New functions.
 	(strlen_optimize_stmt): Call them.
 	* passes.def: Move strlen after loop+dom but before vrp.
gcc/testsuite/
 	* g++.dg/tree-ssa/calloc.C: New testcase.
 	* gcc.dg/tree-ssa/calloc-1.c: Likewise.
 	* gcc.dg/tree-ssa/calloc-2.c: Likewise.
 	* gcc.dg/strlenopt-9.c: Adapt.

-- 
Marc Glisse

[-- Attachment #2: Type: TEXT/PLAIN, Size: 12199 bytes --]

Index: gcc/passes.def
===================================================================
--- gcc/passes.def	(revision 211886)
+++ gcc/passes.def	(working copy)
@@ -179,21 +179,20 @@ along with GCC; see the file COPYING3.
 	 DOM and erroneous path isolation should be due to degenerate PHI nodes.
 	 So rather than run the full propagators, run a specialized pass which
 	 only examines PHIs to discover const/copy propagation
 	 opportunities.  */
       NEXT_PASS (pass_phi_only_cprop);
       NEXT_PASS (pass_dse);
       NEXT_PASS (pass_reassoc);
       NEXT_PASS (pass_dce);
       NEXT_PASS (pass_forwprop);
       NEXT_PASS (pass_phiopt);
-      NEXT_PASS (pass_strlen);
       NEXT_PASS (pass_ccp);
       /* After CCP we rewrite no longer addressed locals into SSA
 	 form if possible.  */
       NEXT_PASS (pass_copy_prop);
       NEXT_PASS (pass_cse_sincos);
       NEXT_PASS (pass_optimize_bswap);
       NEXT_PASS (pass_split_crit_edges);
       NEXT_PASS (pass_pre);
       NEXT_PASS (pass_sink_code);
       NEXT_PASS (pass_asan);
@@ -232,20 +231,21 @@ along with GCC; see the file COPYING3.
 	  NEXT_PASS (pass_loop_prefetch);
 	  NEXT_PASS (pass_iv_optimize);
 	  NEXT_PASS (pass_lim);
 	  NEXT_PASS (pass_tree_loop_done);
       POP_INSERT_PASSES ()
       NEXT_PASS (pass_lower_vector_ssa);
       NEXT_PASS (pass_cse_reciprocals);
       NEXT_PASS (pass_reassoc);
       NEXT_PASS (pass_strength_reduction);
       NEXT_PASS (pass_dominator);
+      NEXT_PASS (pass_strlen);
       NEXT_PASS (pass_vrp);
       /* The only const/copy propagation opportunities left after
 	 DOM and VRP should be due to degenerate PHI nodes.  So rather than
 	 run the full propagators, run a specialized pass which
 	 only examines PHIs to discover const/copy propagation
 	 opportunities.  */
       NEXT_PASS (pass_phi_only_cprop);
       NEXT_PASS (pass_cd_dce);
       NEXT_PASS (pass_tracer);
       NEXT_PASS (pass_dse);
Index: gcc/testsuite/g++.dg/tree-ssa/calloc.C
===================================================================
--- gcc/testsuite/g++.dg/tree-ssa/calloc.C	(revision 0)
+++ gcc/testsuite/g++.dg/tree-ssa/calloc.C	(working copy)
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+
+typedef __SIZE_TYPE__ size_t;
+inline void* operator new(size_t, void* p) throw() { return p; }
+
+typedef void (*handler_t)(void);
+extern handler_t get_handle();
+
+inline void* operator new(size_t sz)
+{
+  void *p;
+
+  if (sz == 0)
+    sz = 1;
+
+  while ((p = __builtin_malloc (sz)) == 0)
+    {
+      handler_t handler = get_handle ();
+      if (! handler)
+        throw 42;
+      handler ();
+    }
+  return p;
+}
+
+struct vect {
+  int *start, *end;
+  vect(size_t n) {
+    start = end = 0;
+    if (n > (size_t)-1 / sizeof(int))
+      throw 33;
+    if (n != 0)
+      start = static_cast<int*> (operator new (n * sizeof(int)));
+    end = start + n;
+    int *p = start;
+    for (size_t l = n; l > 0; --l, ++p)
+      *p = 0;
+  }
+};
+
+void f (void *p, int n)
+{
+  new (p) vect(n);
+}
+
+/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/strlenopt-9.c
===================================================================
--- gcc/testsuite/gcc.dg/strlenopt-9.c	(revision 211886)
+++ gcc/testsuite/gcc.dg/strlenopt-9.c	(working copy)
@@ -11,21 +11,21 @@ fn1 (int r)
      optimized away.  */
   return strchr (p, '\0');
 }
 
 __attribute__((noinline, noclone)) size_t
 fn2 (int r)
 {
   char *p, q[10];
   strcpy (q, "abc");
   p = r ? "a" : q;
-  /* String length for p varies, therefore strlen below isn't
+  /* String length is constant for both alternatives, and strlen is
      optimized away.  */
   return strlen (p);
 }
 
 __attribute__((noinline, noclone)) size_t
 fn3 (char *p, int n)
 {
   int i;
   p = strchr (p, '\0');
   /* strcat here can be optimized into memcpy.  */
@@ -91,19 +91,19 @@ main ()
       || memcmp (b, "aaaabcdabcdefgabcdefgefgabcdabcd", 33) != 0
       || memcmp (buf, "aaaabcdabcdefgabcdefgefgabcdefg", 32) != 0)
     abort ();
   if (fn4 (b, 256) != 4
       || memcmp (b, "aaaabcdabcdefgabcdefgefgabcdabcdabcd", 37) != 0
       || memcmp (buf, "aaaabcdabcdefgabcdefgefgabcdabcdefgefg", 39) != 0)
     abort ();
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */
+/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */
 /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
 /* { dg-final { cleanup-tree-dump "strlen" } } */
 /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c	(working copy)
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+extern int a;
+extern int *b;
+int n;
+void* f(long *q)
+{
+  int *p = __builtin_malloc (n);
+  ++*q;
+  if (p)
+  {
+    ++*q;
+    a = 2;
+    __builtin_memset (p, 0, n);
+    *b = 3;
+  }
+  return p;
+}
+void* g(void)
+{
+  float *p = __builtin_calloc (8, 4);
+  return __builtin_memset (p, 0, 24); // not 32
+}
+
+/* { dg-final { scan-tree-dump-times "calloc" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-not "malloc" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "memset" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/calloc-2.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/calloc-2.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/calloc-2.c	(working copy)
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+int n, nn;
+void* f()
+{
+  char *p = __builtin_calloc (n, 1);
+  p[42] = '\n';
+  __builtin_memset (p, 0, nn);
+  return p;
+}
+
+void* g(int m1, int m2)
+{
+  char *p = __builtin_malloc (m2);
+  while (--m1)
+  {
+    __builtin_memset (p, 0, m2);
+    p[n] = 'b';
+  }
+  return p;
+}
+
+/* { dg-final { scan-tree-dump-times "malloc" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "calloc" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "memset" 2 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/tree-ssa-strlen.c
===================================================================
--- gcc/tree-ssa-strlen.c	(revision 211886)
+++ gcc/tree-ssa-strlen.c	(working copy)
@@ -496,20 +496,23 @@ get_string_length (strinfo si)
 	case BUILT_IN_STPCPY_CHK:
 	  gcc_assert (lhs != NULL_TREE);
 	  loc = gimple_location (stmt);
 	  si->endptr = lhs;
 	  si->stmt = NULL;
 	  lhs = fold_convert_loc (loc, size_type_node, lhs);
 	  si->length = fold_convert_loc (loc, size_type_node, si->ptr);
 	  si->length = fold_build2_loc (loc, MINUS_EXPR, size_type_node,
 					lhs, si->length);
 	  break;
+	case BUILT_IN_MALLOC:
+	  break;
+	/* BUILT_IN_CALLOC always has si->length set.  */
 	default:
 	  gcc_unreachable ();
 	  break;
 	}
     }
 
   return si->length;
 }
 
 /* Invalidate string length information for strings whose length
@@ -521,20 +524,21 @@ maybe_invalidate (gimple stmt)
   strinfo si;
   unsigned int i;
   bool nonempty = false;
 
   for (i = 1; vec_safe_iterate (stridx_to_strinfo, i, &si); ++i)
     if (si != NULL)
       {
 	if (!si->dont_invalidate)
 	  {
 	    ao_ref r;
+	    /* Do not use si->length.  */
 	    ao_ref_init_from_ptr_and_size (&r, si->ptr, NULL_TREE);
 	    if (stmt_may_clobber_ref_p_1 (stmt, &r))
 	      {
 		set_strinfo (i, NULL);
 		free_strinfo (si);
 		continue;
 	      }
 	  }
 	si->dont_invalidate = false;
 	nonempty = true;
@@ -1608,20 +1612,93 @@ handle_builtin_strcat (enum built_in_fun
 	{
 	  laststmt.stmt = stmt;
 	  laststmt.len = srclen;
 	  laststmt.stridx = dsi->idx;
 	}
     }
   else if (dump_file && (dump_flags & TDF_DETAILS) != 0)
     fprintf (dump_file, "not possible.\n");
 }
 
+/* Handle a call to malloc or calloc.  */
+
+static void
+handle_builtin_malloc (enum built_in_function bcode, gimple_stmt_iterator *gsi)
+{
+  gimple stmt = gsi_stmt (*gsi);
+  tree lhs = gimple_call_lhs (stmt);
+  gcc_assert (get_stridx (lhs) == 0);
+  int idx = new_stridx (lhs);
+  tree length = NULL_TREE;
+  if (bcode == BUILT_IN_CALLOC)
+    length = build_int_cst (size_type_node, 0);
+  strinfo si = new_strinfo (lhs, idx, length);
+  if (bcode == BUILT_IN_CALLOC)
+    si->endptr = lhs;
+  set_strinfo (idx, si);
+  si->writable = true;
+  si->stmt = stmt;
+  si->dont_invalidate = true;
+}
+
+/* Handle a call to memset.
+   After a call to calloc, memset(,0,) is unnecessary.
+   memset(malloc(n),0,n) is calloc(n,1).  */
+
+static bool
+handle_builtin_memset (gimple_stmt_iterator *gsi)
+{
+  gimple stmt2 = gsi_stmt (*gsi);
+  if (!integer_zerop (gimple_call_arg (stmt2, 1)))
+    return true;
+  tree ptr = gimple_call_arg (stmt2, 0);
+  int idx1 = get_stridx (ptr);
+  if (idx1 <= 0)
+    return true;
+  strinfo si1 = get_strinfo (idx1);
+  if (!si1)
+    return true;
+  gimple stmt1 = si1->stmt;
+  if (!stmt1 || !is_gimple_call (stmt1))
+    return true;
+  tree callee1 = gimple_call_fndecl (stmt1);
+  if (!gimple_call_builtin_p (stmt1, BUILT_IN_NORMAL))
+    return true;
+  enum built_in_function code1 = DECL_FUNCTION_CODE (callee1);
+  tree size = gimple_call_arg (stmt2, 2);
+  if (code1 == BUILT_IN_CALLOC)
+    /* Not touching stmt1 */ ;
+  else if (code1 == BUILT_IN_MALLOC
+	   && operand_equal_p (gimple_call_arg (stmt1, 0), size, 0))
+    {
+      gimple_stmt_iterator gsi1 = gsi_for_stmt (stmt1);
+      update_gimple_call (&gsi1, builtin_decl_implicit (BUILT_IN_CALLOC), 2,
+			  size, build_one_cst (size_type_node));
+    }
+  else
+    return true;
+  tree lhs = gimple_call_lhs (stmt2);
+  unlink_stmt_vdef (stmt2);
+  if (lhs)
+    {
+      gimple assign = gimple_build_assign (lhs, ptr);
+      gsi_replace (gsi, assign, false);
+    }
+  else
+    {
+      gsi_remove (gsi, true);
+      release_defs (stmt2);
+    }
+
+  return false;
+}
+
 /* Handle a POINTER_PLUS_EXPR statement.
    For p = "abcd" + 2; compute associated length, or if
    p = q + off is pointing to a '\0' character of a string, call
    zero_length_string on it.  */
 
 static void
 handle_pointer_plus (gimple_stmt_iterator *gsi)
 {
   gimple stmt = gsi_stmt (*gsi);
   tree lhs = gimple_assign_lhs (stmt), off;
@@ -1845,20 +1922,28 @@ strlen_optimize_stmt (gimple_stmt_iterat
 	  case BUILT_IN_MEMCPY:
 	  case BUILT_IN_MEMCPY_CHK:
 	  case BUILT_IN_MEMPCPY:
 	  case BUILT_IN_MEMPCPY_CHK:
 	    handle_builtin_memcpy (DECL_FUNCTION_CODE (callee), gsi);
 	    break;
 	  case BUILT_IN_STRCAT:
 	  case BUILT_IN_STRCAT_CHK:
 	    handle_builtin_strcat (DECL_FUNCTION_CODE (callee), gsi);
 	    break;
+	  case BUILT_IN_MALLOC:
+	  case BUILT_IN_CALLOC:
+	    handle_builtin_malloc (DECL_FUNCTION_CODE (callee), gsi);
+	    break;
+	  case BUILT_IN_MEMSET:
+	    if (!handle_builtin_memset (gsi))
+	      return false;
+	    break;
 	  default:
 	    break;
 	  }
     }
   else if (is_gimple_assign (stmt))
     {
       tree lhs = gimple_assign_lhs (stmt);
 
       if (TREE_CODE (lhs) == SSA_NAME && POINTER_TYPE_P (TREE_TYPE (lhs)))
 	{

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-06-23 15:52                 ` Marc Glisse
@ 2014-06-23 16:11                   ` Richard Biener
  2014-06-23 16:19                     ` Marc Glisse
  0 siblings, 1 reply; 26+ messages in thread
From: Richard Biener @ 2014-06-23 16:11 UTC (permalink / raw)
  To: Marc Glisse, Jakub Jelinek; +Cc: GCC Patches

On June 23, 2014 5:51:30 PM CEST, Marc Glisse <marc.glisse@inria.fr> wrote:
>On Mon, 23 Jun 2014, Jakub Jelinek wrote:
>
>> Ok for trunk, sorry for the delay.
>
>Thanks. Richard has moved the passes a bit since then, but I still have
>
>exactly one spot where the testsuite is ok :-) I need strlen to be
>after 
>dom (for calloc.C) and before vrp (for several strlenopt-*.c). I'll
>commit 
>it tomorrow if there aren't any comments on the pass placement.

But vrp does not run at -O1 - does strlenopt?

>2014-06-24  Marc Glisse  <marc.glisse@inria.fr>
>
> 	PR tree-optimization/57742
>gcc/
> 	* tree-ssa-strlen.c (get_string_length): Ignore malloc.
> 	(handle_builtin_malloc, handle_builtin_memset): New functions.
> 	(strlen_optimize_stmt): Call them.
> 	* passes.def: Move strlen after loop+dom but before vrp.
>gcc/testsuite/
> 	* g++.dg/tree-ssa/calloc.C: New testcase.
> 	* gcc.dg/tree-ssa/calloc-1.c: Likewise.
> 	* gcc.dg/tree-ssa/calloc-2.c: Likewise.
> 	* gcc.dg/strlenopt-9.c: Adapt.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-06-23 16:11                   ` Richard Biener
@ 2014-06-23 16:19                     ` Marc Glisse
  2014-06-23 16:45                       ` Richard Biener
  0 siblings, 1 reply; 26+ messages in thread
From: Marc Glisse @ 2014-06-23 16:19 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jakub Jelinek, GCC Patches

On Mon, 23 Jun 2014, Richard Biener wrote:

> On June 23, 2014 5:51:30 PM CEST, Marc Glisse <marc.glisse@inria.fr> wrote:
>> On Mon, 23 Jun 2014, Jakub Jelinek wrote:
>>
>>> Ok for trunk, sorry for the delay.
>>
>> Thanks. Richard has moved the passes a bit since then, but I still have
>>
>> exactly one spot where the testsuite is ok :-) I need strlen to be
>> after
>> dom (for calloc.C) and before vrp (for several strlenopt-*.c). I'll
>> commit
>> it tomorrow if there aren't any comments on the pass placement.
>
> But vrp does not run at -O1 - does strlenopt?

     { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_ftree_vrp, NULL, 1 },

So that's just a missed optimization at -Os, I guess.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-06-23 16:19                     ` Marc Glisse
@ 2014-06-23 16:45                       ` Richard Biener
  0 siblings, 0 replies; 26+ messages in thread
From: Richard Biener @ 2014-06-23 16:45 UTC (permalink / raw)
  To: Marc Glisse; +Cc: Jakub Jelinek, GCC Patches

On Mon, Jun 23, 2014 at 6:19 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> On Mon, 23 Jun 2014, Richard Biener wrote:
>
>> On June 23, 2014 5:51:30 PM CEST, Marc Glisse <marc.glisse@inria.fr>
>> wrote:
>>>
>>> On Mon, 23 Jun 2014, Jakub Jelinek wrote:
>>>
>>>> Ok for trunk, sorry for the delay.
>>>
>>>
>>> Thanks. Richard has moved the passes a bit since then, but I still have
>>>
>>> exactly one spot where the testsuite is ok :-) I need strlen to be
>>> after
>>> dom (for calloc.C) and before vrp (for several strlenopt-*.c). I'll
>>> commit
>>> it tomorrow if there aren't any comments on the pass placement.
>>
>>
>> But vrp does not run at -O1 - does strlenopt?
>
>
>     { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
>     { OPT_LEVELS_2_PLUS, OPT_ftree_vrp, NULL, 1 },
>
> So that's just a missed optimization at -Os, I guess.

Ok, that's fine (not sure why we restrict all of strilenopt instead of
just those
transforms that are harmful for -Os).

Richard.

> --
> Marc Glisse

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-02-28 22:48 calloc = malloc + memset Marc Glisse
  2014-03-01 12:26 ` Paolo Carlini
  2014-03-03  9:50 ` Richard Biener
@ 2014-06-23 18:20 ` Andi Kleen
  2014-06-23 18:51   ` Andrew Pinski
  2014-06-23 19:00   ` Marc Glisse
  2014-07-15 12:38 ` Bernd Schmidt
  3 siblings, 2 replies; 26+ messages in thread
From: Andi Kleen @ 2014-06-23 18:20 UTC (permalink / raw)
  To: Marc Glisse; +Cc: gcc-patches

Marc Glisse <marc.glisse@inria.fr> writes:

> Hello,
>
> this is a stage 1 patch, and I'll ping it then, but if you have
> comments now...

FWIW i believe the transformation will break a large variety of micro benchmarks.

calloc internally knows that memory fresh from the OS is zeroed.
But the memory may not be faulted in yet.

memset always faults in the memory.

So if you have some test like

   buf = malloc(...)
   memset(buf, ...) 
   start = get_time();
   ... do something with buf
   end = get_time()

Now the times will be completely off because the measured times includes
the page faults.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-06-23 18:20 ` Andi Kleen
@ 2014-06-23 18:51   ` Andrew Pinski
  2014-06-23 19:00   ` Marc Glisse
  1 sibling, 0 replies; 26+ messages in thread
From: Andrew Pinski @ 2014-06-23 18:51 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Marc Glisse, GCC Patches

On Mon, Jun 23, 2014 at 11:17 AM, Andi Kleen <andi@firstfloor.org> wrote:
> Marc Glisse <marc.glisse@inria.fr> writes:
>
>> Hello,
>>
>> this is a stage 1 patch, and I'll ping it then, but if you have
>> comments now...
>
> FWIW i believe the transformation will break a large variety of micro benchmarks.
>
> calloc internally knows that memory fresh from the OS is zeroed.
> But the memory may not be faulted in yet.
>
> memset always faults in the memory.
>
> So if you have some test like
>
>    buf = malloc(...)
>    memset(buf, ...)
>    start = get_time();
>    ... do something with buf
>    end = get_time()
>
> Now the times will be completely off because the measured times includes
> the page faults.


Easy way for these benchmarks to get around this.
volatile char *vbuf = (char*)buf;
for(i=0;i<bufsize;i++)
  *vbuf = 0;
before get_time ();

Now there is no way for the compiler to optimize away the inlined
memset and will always be 100% correct in the future.

Also micro-benchmarking is going to have issues like this too with
future optimizations.

Thanks,
Andrew

>
> -Andi
>
> --
> ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-06-23 18:20 ` Andi Kleen
  2014-06-23 18:51   ` Andrew Pinski
@ 2014-06-23 19:00   ` Marc Glisse
  2014-06-23 19:37     ` Andi Kleen
  1 sibling, 1 reply; 26+ messages in thread
From: Marc Glisse @ 2014-06-23 19:00 UTC (permalink / raw)
  To: Andi Kleen; +Cc: gcc-patches

On Mon, 23 Jun 2014, Andi Kleen wrote:

> FWIW i believe the transformation will break a large variety of micro benchmarks.
>
> calloc internally knows that memory fresh from the OS is zeroed.
> But the memory may not be faulted in yet.
>
> memset always faults in the memory.
>
> So if you have some test like
>
>   buf = malloc(...)
>   memset(buf, ...)
>   start = get_time();
>   ... do something with buf
>   end = get_time()
>
> Now the times will be completely off because the measured times includes
> the page faults.

Good point. I guess working around compiler optimizations is part of the 
game for micro benchmarks, and their authors would be disappointed if the 
compiler didn't mess it up regularly in new and entertaining ways ;-)

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-06-23 19:00   ` Marc Glisse
@ 2014-06-23 19:37     ` Andi Kleen
  2014-06-23 20:14       ` Marc Glisse
  0 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2014-06-23 19:37 UTC (permalink / raw)
  To: Marc Glisse; +Cc: Andi Kleen, gcc-patches

On Mon, Jun 23, 2014 at 09:00:02PM +0200, Marc Glisse wrote:
> On Mon, 23 Jun 2014, Andi Kleen wrote:
> 
> >FWIW i believe the transformation will break a large variety of micro benchmarks.
> >
> >calloc internally knows that memory fresh from the OS is zeroed.
> >But the memory may not be faulted in yet.
> >
> >memset always faults in the memory.
> >
> >So if you have some test like
> >
> >  buf = malloc(...)
> >  memset(buf, ...)
> >  start = get_time();
> >  ... do something with buf
> >  end = get_time()
> >
> >Now the times will be completely off because the measured times includes
> >the page faults.
> 
> Good point. I guess working around compiler optimizations is part of
> the game for micro benchmarks, and their authors would be
> disappointed if the compiler didn't mess it up regularly in new and
> entertaining ways ;-)

I would prefer to not do it. I'm not sure it has a lot of benefit.
If you want to keep it please make sure there is an easy way to turn
it off.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-06-23 19:37     ` Andi Kleen
@ 2014-06-23 20:14       ` Marc Glisse
  2014-06-23 20:21         ` Andi Kleen
  0 siblings, 1 reply; 26+ messages in thread
From: Marc Glisse @ 2014-06-23 20:14 UTC (permalink / raw)
  To: Andi Kleen; +Cc: gcc-patches

On Mon, 23 Jun 2014, Andi Kleen wrote:

> I would prefer to not do it.

For the sake of micro benchmarks?

> I'm not sure it has a lot of benefit.

It has a non-zero benefit.

> If you want to keep it please make sure there is an easy way to turn
> it off.

Any of these flags works:
-fdisable-tree-strlen
-fno-builtin-malloc
-fno-builtin-memset (assuming you wrote 'memset' explicitly in your code)
-fno-builtin
-ffreestanding
-O1
-Os

In the code, you can hide that the pointer passed to memset is the one 
returned by malloc by storing it in a volatile variable, or any other 
trick to hide from the compiler that we are doing memset(malloc(n),0,n).

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-06-23 20:14       ` Marc Glisse
@ 2014-06-23 20:21         ` Andi Kleen
  2014-06-23 20:25           ` Andrew Pinski
  0 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2014-06-23 20:21 UTC (permalink / raw)
  To: Marc Glisse; +Cc: Andi Kleen, gcc-patches

On Mon, Jun 23, 2014 at 10:14:25PM +0200, Marc Glisse wrote:
> On Mon, 23 Jun 2014, Andi Kleen wrote:
> 
> >I would prefer to not do it.
> 
> For the sake of micro benchmarks?

Yes benchmarks are important.

-Andi

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-06-23 20:21         ` Andi Kleen
@ 2014-06-23 20:25           ` Andrew Pinski
  0 siblings, 0 replies; 26+ messages in thread
From: Andrew Pinski @ 2014-06-23 20:25 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Marc Glisse, GCC Patches

On Mon, Jun 23, 2014 at 1:21 PM, Andi Kleen <andi@firstfloor.org> wrote:
> On Mon, Jun 23, 2014 at 10:14:25PM +0200, Marc Glisse wrote:
>> On Mon, 23 Jun 2014, Andi Kleen wrote:
>>
>> >I would prefer to not do it.
>>
>> For the sake of micro benchmarks?
>
> Yes benchmarks are important.


But micro-benchmarks are not important.  In fact this patch could
improve some benchmarks as you no longer thrash your cache.

So benchmarks are important but micro-benchmarks are not.

Thanks,
Andrew

>
> -Andi

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-02-28 22:48 calloc = malloc + memset Marc Glisse
                   ` (2 preceding siblings ...)
  2014-06-23 18:20 ` Andi Kleen
@ 2014-07-15 12:38 ` Bernd Schmidt
  2014-07-15 13:15   ` Richard Biener
  3 siblings, 1 reply; 26+ messages in thread
From: Bernd Schmidt @ 2014-07-15 12:38 UTC (permalink / raw)
  To: Marc Glisse, gcc-patches

On 02/28/2014 11:48 PM, Marc Glisse wrote:
> /* Optimize
> +   ptr = malloc (n);
> +   memset (ptr, 0, n);
> +   into
> +   ptr = calloc (n);
> +   gsi_p is known to point to a call to __builtin_memset.  */

Is there anything in here to prevent us making an infinite loop if the 
above pattern occurs in a function called "calloc"?


Bernd

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: calloc = malloc + memset
  2014-07-15 12:38 ` Bernd Schmidt
@ 2014-07-15 13:15   ` Richard Biener
  0 siblings, 0 replies; 26+ messages in thread
From: Richard Biener @ 2014-07-15 13:15 UTC (permalink / raw)
  To: Bernd Schmidt; +Cc: Marc Glisse, GCC Patches

On Tue, Jul 15, 2014 at 2:33 PM, Bernd Schmidt <bernds_cb1@t-online.de> wrote:
> On 02/28/2014 11:48 PM, Marc Glisse wrote:
>>
>> /* Optimize
>> +   ptr = malloc (n);
>> +   memset (ptr, 0, n);
>> +   into
>> +   ptr = calloc (n);
>> +   gsi_p is known to point to a call to __builtin_memset.  */
>
>
> Is there anything in here to prevent us making an infinite loop if the above
> pattern occurs in a function called "calloc"?

Nothing.  See how I ended up doing

2014-05-06  Richard Biener  <rguenther@suse.de>

        * c-opts.c (c_common_post_options): For -freestanding,
        -fno-hosted and -fno-builtin disable pattern recognition
        if not enabled explicitely.

to avoid sth like this for memset/memcpy/memmove recognition.

Richard.

>
> Bernd
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2014-07-15 12:38 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-02-28 22:48 calloc = malloc + memset Marc Glisse
2014-03-01 12:26 ` Paolo Carlini
2014-03-01 14:51   ` Marc Glisse
2014-03-03  9:50 ` Richard Biener
2014-03-03 13:45   ` Marc Glisse
2014-03-23 19:47   ` Marc Glisse
2014-04-15 19:06     ` Marc Glisse
2014-04-18 11:31     ` Jakub Jelinek
2014-04-18 18:34       ` Marc Glisse
2014-04-23 14:05         ` Richard Biener
2014-05-17 14:19           ` Marc Glisse
2014-06-03 14:00             ` Marc Glisse
2014-06-23  7:32               ` Jakub Jelinek
2014-06-23 15:52                 ` Marc Glisse
2014-06-23 16:11                   ` Richard Biener
2014-06-23 16:19                     ` Marc Glisse
2014-06-23 16:45                       ` Richard Biener
2014-06-23 18:20 ` Andi Kleen
2014-06-23 18:51   ` Andrew Pinski
2014-06-23 19:00   ` Marc Glisse
2014-06-23 19:37     ` Andi Kleen
2014-06-23 20:14       ` Marc Glisse
2014-06-23 20:21         ` Andi Kleen
2014-06-23 20:25           ` Andrew Pinski
2014-07-15 12:38 ` Bernd Schmidt
2014-07-15 13:15   ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).