* [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408) @ 2016-11-25 19:32 Jakub Jelinek 2016-11-28 9:50 ` Richard Biener 0 siblings, 1 reply; 11+ messages in thread From: Jakub Jelinek @ 2016-11-25 19:32 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches Hi! This patch optimizes a = {}; b = a; into a = {}; b = {}; and similarly for memset instead of the first stmt and/or memcpy instead of the second one. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2016-11-25 Jakub Jelinek <jakub@redhat.com> PR c/78408 * tree-ssa-ccp.c: Include tree-dfa.h. (optimize_memcpy): New function. (pass_fold_builtins::execute): Use it. Remove useless conditional break after BUILT_IN_VA_*. * gcc.dg/pr78408.c: New test. --- gcc/tree-ssa-ccp.c.jj 2016-11-18 20:04:27.000000000 +0100 +++ gcc/tree-ssa-ccp.c 2016-11-25 17:54:26.862166658 +0100 @@ -143,6 +143,7 @@ along with GCC; see the file COPYING3. #include "stor-layout.h" #include "optabs-query.h" #include "tree-ssa-ccp.h" +#include "tree-dfa.h" /* Possible lattice values. 
*/ typedef enum @@ -2928,6 +2929,113 @@ optimize_atomic_bit_test_and (gimple_stm release_ssa_name (lhs); } +/* Optimize + a = {}; + b = a; + into + a = {}; + b = {}; + Similarly for memset (&a, ..., sizeof (a)); instead of a = {}; + and/or memcpy (&b, &a, sizeof (a)); instead of b = a; */ + +static void +optimize_memcpy (gimple_stmt_iterator *gsip, tree dest, tree src) +{ + gimple *stmt = gsi_stmt (*gsip); + if (gimple_has_volatile_ops (stmt) + || TREE_THIS_VOLATILE (dest) + || TREE_THIS_VOLATILE (src)) + return; + + tree vuse = gimple_vuse (stmt); + if (vuse == NULL) + return; + + gimple *defstmt = SSA_NAME_DEF_STMT (vuse); + tree src2 = NULL_TREE; + tree val = integer_zero_node; + if (gimple_store_p (defstmt) + && gimple_assign_single_p (defstmt) + && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR + && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (defstmt)) == 0 + && !gimple_clobber_p (defstmt)) + src2 = gimple_assign_lhs (defstmt); + else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET) + && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR + && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST + && TREE_CODE (gimple_call_arg (defstmt, 2)) == INTEGER_CST) + { + HOST_WIDE_INT ssize, max_size, off; + bool reverse; + src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0); + get_ref_base_and_extent (src2, &off, &ssize, &max_size, &reverse); + if (ssize != max_size + || (ssize % BITS_PER_UNIT) != 0 + || !wi::eq_p (gimple_call_arg (defstmt, 2), ssize / BITS_PER_UNIT)) + src2 = NULL_TREE; + else + { + val = gimple_call_arg (defstmt, 1); + if (!integer_zerop (val) && is_gimple_assign (stmt)) + src2 = NULL_TREE; + } + } + + if (src2 == NULL_TREE) + return; + + if (!operand_equal_p (src, src2, 0)) + { + /* Handle also + a = {}; + MEM[(char * {ref-all})&b] = MEM[(char * {ref-all})&a]; */ + if (is_gimple_assign (stmt) + && TREE_CODE (src) == MEM_REF + && integer_zerop (TREE_OPERAND (src, 1)) + && TREE_CODE (TREE_OPERAND (src, 0)) == ADDR_EXPR + && DECL_P 
(src2) + && operand_equal_p (TREE_OPERAND (TREE_OPERAND (src, 0), 0), + src2, 0) + && tree_int_cst_equal (TYPE_SIZE (TREE_TYPE (src)), + DECL_SIZE (src2))) + src = TREE_OPERAND (TREE_OPERAND (src, 0), 0); + else + return; + } + if (refs_may_alias_p (dest, src)) + return; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Simplified\n "); + print_gimple_stmt (dump_file, stmt, 0, dump_flags); + fprintf (dump_file, "after previous\n "); + print_gimple_stmt (dump_file, defstmt, 0, dump_flags); + } + + if (is_gimple_assign (stmt)) + { + tree ctor = build_constructor (TREE_TYPE (dest), NULL); + gimple_assign_set_rhs_from_tree (gsip, ctor); + update_stmt (stmt); + } + else + { + gcall *call = as_a <gcall *> (stmt); + tree fndecl = builtin_decl_implicit (BUILT_IN_MEMSET); + gimple_call_set_fndecl (call, fndecl); + gimple_call_set_fntype (call, TREE_TYPE (fndecl)); + gimple_call_set_arg (call, 1, val); + update_stmt (stmt); + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "into\n "); + print_gimple_stmt (dump_file, stmt, 0, dump_flags); + } +} + /* A simple pass that attempts to fold all builtin functions. This pass is run after we've propagated as many constants as we can. 
*/ @@ -2994,6 +3102,9 @@ pass_fold_builtins::execute (function *f continue; } } + else if (gimple_assign_load_p (stmt) && gimple_store_p (stmt)) + optimize_memcpy (&i, gimple_assign_lhs (stmt), + gimple_assign_rhs1 (stmt)); gsi_next (&i); continue; } @@ -3109,14 +3220,39 @@ pass_fold_builtins::execute (function *f false, false); break; + case BUILT_IN_MEMCPY: + if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL) + && TREE_CODE (gimple_call_arg (stmt, 0)) == ADDR_EXPR + && TREE_CODE (gimple_call_arg (stmt, 1)) == ADDR_EXPR + && TREE_CODE (gimple_call_arg (stmt, 2)) == INTEGER_CST) + { + tree dest = TREE_OPERAND (gimple_call_arg (stmt, 0), 0); + tree src = TREE_OPERAND (gimple_call_arg (stmt, 1), 0); + HOST_WIDE_INT dsize, ssize, max_size, off; + bool reverse; + get_ref_base_and_extent (dest, &off, &dsize, &max_size, + &reverse); + if (dsize != max_size) + break; + get_ref_base_and_extent (src, &off, &ssize, &max_size, + &reverse); + if (ssize != max_size + || ssize != dsize + || (ssize % BITS_PER_UNIT) != 0) + break; + if (!wi::eq_p (gimple_call_arg (stmt, 2), + ssize / BITS_PER_UNIT)) + break; + optimize_memcpy (&i, dest, src); + } + break; + case BUILT_IN_VA_START: case BUILT_IN_VA_END: case BUILT_IN_VA_COPY: /* These shouldn't be folded before pass_stdarg. 
*/ result = optimize_stdarg_builtin (stmt); - if (result) - break; - /* FALLTHRU */ + break; default:; } --- gcc/testsuite/gcc.dg/pr78408.c.jj 2016-11-25 18:02:47.344752199 +0100 +++ gcc/testsuite/gcc.dg/pr78408.c 2016-11-25 18:02:26.000000000 +0100 @@ -0,0 +1,78 @@ +/* PR c/78408 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-fab1-details" } */ +/* { dg-final { scan-tree-dump-times "after previous" 16 "fab1" } } */ + +struct S { char a[32]; }; +struct T { char a[65536]; }; +void bar (int, struct S *, struct S *, struct T *, struct T *); + +void +f1 (void) +{ + struct S a, b; + struct T c, d; + a = b = (struct S) {}; + c = d = (struct T) {}; + bar (1, &a, &b, &c, &d); +} + +void +f2 (void) +{ + struct S a, b; + struct T c, d; + b = (struct S) {}; + a = b; + d = (struct T) {}; + c = d; + bar (2, &a, &b, &c, &d); +} + +void +f3 (void) +{ + struct S a, b; + struct T c, d; + __builtin_memset (&b, 0, sizeof (b)); + a = b; + __builtin_memset (&d, 0, sizeof (d)); + c = d; + bar (3, &a, &b, &c, &d); +} + + +void +f4 (void) +{ + struct S a, b; + struct T c, d; + b = (struct S) {}; + __builtin_memcpy (&a, &b, sizeof (b)); + d = (struct T) {}; + __builtin_memcpy (&c, &d, sizeof (d)); + bar (4, &a, &b, &c, &d); +} + +void +f5 (void) +{ + struct S a, b; + struct T c, d; + __builtin_memset (&b, 0, sizeof (b)); + __builtin_memcpy (&a, &b, sizeof (b)); + __builtin_memset (&d, 0, sizeof (d)); + __builtin_memcpy (&c, &d, sizeof (d)); + bar (5, &a, &b, &c, &d); +} + +void +f6 (void) +{ + struct S a, b, e, g; + struct T c, d, f, h; + g = e = a = b = (struct S) {}; + h = f = c = d = (struct T) {}; + bar (6, &a, &b, &c, &d); + bar (6, &e, &g, &f, &h); +} Jakub ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408) 2016-11-25 19:32 [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408) Jakub Jelinek @ 2016-11-28 9:50 ` Richard Biener 2016-12-13 11:36 ` Jakub Jelinek 0 siblings, 1 reply; 11+ messages in thread From: Richard Biener @ 2016-11-28 9:50 UTC (permalink / raw) To: Jakub Jelinek; +Cc: gcc-patches On Fri, 25 Nov 2016, Jakub Jelinek wrote: > Hi! > > This patch optimizes a = {}; b = a; into a = {}; b = {}; > and similarly for memset instead of the first stmt and/or memcpy > instead of the second one. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2016-11-25 Jakub Jelinek <jakub@redhat.com> > > PR c/78408 > * tree-ssa-ccp.c: Include tree-dfa.h. > (optimize_memcpy): New function. > (pass_fold_builtins::execute): Use it. Remove useless conditional > break after BUILT_IN_VA_*. > > * gcc.dg/pr78408.c: New test. > > --- gcc/tree-ssa-ccp.c.jj 2016-11-18 20:04:27.000000000 +0100 > +++ gcc/tree-ssa-ccp.c 2016-11-25 17:54:26.862166658 +0100 > @@ -143,6 +143,7 @@ along with GCC; see the file COPYING3. > #include "stor-layout.h" > #include "optabs-query.h" > #include "tree-ssa-ccp.h" > +#include "tree-dfa.h" > > /* Possible lattice values. 
*/ > typedef enum > @@ -2928,6 +2929,113 @@ optimize_atomic_bit_test_and (gimple_stm > release_ssa_name (lhs); > } > > +/* Optimize > + a = {}; > + b = a; > + into > + a = {}; > + b = {}; > + Similarly for memset (&a, ..., sizeof (a)); instead of a = {}; > + and/or memcpy (&b, &a, sizeof (a)); instead of b = a; */ > + > +static void > +optimize_memcpy (gimple_stmt_iterator *gsip, tree dest, tree src) > +{ > + gimple *stmt = gsi_stmt (*gsip); > + if (gimple_has_volatile_ops (stmt) > + || TREE_THIS_VOLATILE (dest) > + || TREE_THIS_VOLATILE (src)) > + return; > + > + tree vuse = gimple_vuse (stmt); > + if (vuse == NULL) > + return; > + > + gimple *defstmt = SSA_NAME_DEF_STMT (vuse); > + tree src2 = NULL_TREE; > + tree val = integer_zero_node; > + if (gimple_store_p (defstmt) > + && gimple_assign_single_p (defstmt) > + && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR > + && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (defstmt)) == 0 > + && !gimple_clobber_p (defstmt)) > + src2 = gimple_assign_lhs (defstmt); > + else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET) > + && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR > + && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST > + && TREE_CODE (gimple_call_arg (defstmt, 2)) == INTEGER_CST) > + { > + HOST_WIDE_INT ssize, max_size, off; > + bool reverse; > + src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0); > + get_ref_base_and_extent (src2, &off, &ssize, &max_size, &reverse); > + if (ssize != max_size > + || (ssize % BITS_PER_UNIT) != 0 > + || !wi::eq_p (gimple_call_arg (defstmt, 2), ssize / BITS_PER_UNIT)) > + src2 = NULL_TREE; I wonder why you jump through the hoops of get_ref_base_and_extent given the call args will be invariant addresses and thus get_addr_base_and_unit_offset would be more appropriate here. Also not sure why you want to restrict the size with the wi::eq_p (probably for the a = b case where the size isn't given explicitely but then you don't check whether off is 0 ...). 
I'd say passing in base, offset and size for src and dest into this function will simplify things and should allow to handle memset (p+10, 0, 24); memcpy (q, p+10, 24); if you compare bases with operand_equal_p. > + else > + { > + val = gimple_call_arg (defstmt, 1); > + if (!integer_zerop (val) && is_gimple_assign (stmt)) > + src2 = NULL_TREE; > + } > + } > + > + if (src2 == NULL_TREE) > + return; > + > + if (!operand_equal_p (src, src2, 0)) > + { > + /* Handle also > + a = {}; > + MEM[(char * {ref-all})&b] = MEM[(char * {ref-all})&a]; */ > + if (is_gimple_assign (stmt) > + && TREE_CODE (src) == MEM_REF > + && integer_zerop (TREE_OPERAND (src, 1)) > + && TREE_CODE (TREE_OPERAND (src, 0)) == ADDR_EXPR > + && DECL_P (src2) > + && operand_equal_p (TREE_OPERAND (TREE_OPERAND (src, 0), 0), > + src2, 0) > + && tree_int_cst_equal (TYPE_SIZE (TREE_TYPE (src)), > + DECL_SIZE (src2))) > + src = TREE_OPERAND (TREE_OPERAND (src, 0), 0); > + else > + return; > + } > + if (refs_may_alias_p (dest, src)) > + return; Why's that? Richard. > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "Simplified\n "); > + print_gimple_stmt (dump_file, stmt, 0, dump_flags); > + fprintf (dump_file, "after previous\n "); > + print_gimple_stmt (dump_file, defstmt, 0, dump_flags); > + } > + > + if (is_gimple_assign (stmt)) > + { > + tree ctor = build_constructor (TREE_TYPE (dest), NULL); > + gimple_assign_set_rhs_from_tree (gsip, ctor); > + update_stmt (stmt); > + } > + else > + { > + gcall *call = as_a <gcall *> (stmt); > + tree fndecl = builtin_decl_implicit (BUILT_IN_MEMSET); > + gimple_call_set_fndecl (call, fndecl); > + gimple_call_set_fntype (call, TREE_TYPE (fndecl)); > + gimple_call_set_arg (call, 1, val); > + update_stmt (stmt); > + } > + > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "into\n "); > + print_gimple_stmt (dump_file, stmt, 0, dump_flags); > + } > +} > + > /* A simple pass that attempts to fold all builtin functions. 
This pass > is run after we've propagated as many constants as we can. */ > > @@ -2994,6 +3102,9 @@ pass_fold_builtins::execute (function *f > continue; > } > } > + else if (gimple_assign_load_p (stmt) && gimple_store_p (stmt)) > + optimize_memcpy (&i, gimple_assign_lhs (stmt), > + gimple_assign_rhs1 (stmt)); > gsi_next (&i); > continue; > } > @@ -3109,14 +3220,39 @@ pass_fold_builtins::execute (function *f > false, false); > break; > > + case BUILT_IN_MEMCPY: > + if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL) > + && TREE_CODE (gimple_call_arg (stmt, 0)) == ADDR_EXPR > + && TREE_CODE (gimple_call_arg (stmt, 1)) == ADDR_EXPR > + && TREE_CODE (gimple_call_arg (stmt, 2)) == INTEGER_CST) > + { > + tree dest = TREE_OPERAND (gimple_call_arg (stmt, 0), 0); > + tree src = TREE_OPERAND (gimple_call_arg (stmt, 1), 0); > + HOST_WIDE_INT dsize, ssize, max_size, off; > + bool reverse; > + get_ref_base_and_extent (dest, &off, &dsize, &max_size, > + &reverse); > + if (dsize != max_size) > + break; > + get_ref_base_and_extent (src, &off, &ssize, &max_size, > + &reverse); > + if (ssize != max_size > + || ssize != dsize > + || (ssize % BITS_PER_UNIT) != 0) > + break; > + if (!wi::eq_p (gimple_call_arg (stmt, 2), > + ssize / BITS_PER_UNIT)) > + break; > + optimize_memcpy (&i, dest, src); > + } > + break; > + > case BUILT_IN_VA_START: > case BUILT_IN_VA_END: > case BUILT_IN_VA_COPY: > /* These shouldn't be folded before pass_stdarg. 
*/ > result = optimize_stdarg_builtin (stmt); > - if (result) > - break; > - /* FALLTHRU */ > + break; > > default:; > } > --- gcc/testsuite/gcc.dg/pr78408.c.jj 2016-11-25 18:02:47.344752199 +0100 > +++ gcc/testsuite/gcc.dg/pr78408.c 2016-11-25 18:02:26.000000000 +0100 > @@ -0,0 +1,78 @@ > +/* PR c/78408 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-fab1-details" } */ > +/* { dg-final { scan-tree-dump-times "after previous" 16 "fab1" } } */ > + > +struct S { char a[32]; }; > +struct T { char a[65536]; }; > +void bar (int, struct S *, struct S *, struct T *, struct T *); > + > +void > +f1 (void) > +{ > + struct S a, b; > + struct T c, d; > + a = b = (struct S) {}; > + c = d = (struct T) {}; > + bar (1, &a, &b, &c, &d); > +} > + > +void > +f2 (void) > +{ > + struct S a, b; > + struct T c, d; > + b = (struct S) {}; > + a = b; > + d = (struct T) {}; > + c = d; > + bar (2, &a, &b, &c, &d); > +} > + > +void > +f3 (void) > +{ > + struct S a, b; > + struct T c, d; > + __builtin_memset (&b, 0, sizeof (b)); > + a = b; > + __builtin_memset (&d, 0, sizeof (d)); > + c = d; > + bar (3, &a, &b, &c, &d); > +} > + > + > +void > +f4 (void) > +{ > + struct S a, b; > + struct T c, d; > + b = (struct S) {}; > + __builtin_memcpy (&a, &b, sizeof (b)); > + d = (struct T) {}; > + __builtin_memcpy (&c, &d, sizeof (d)); > + bar (4, &a, &b, &c, &d); > +} > + > +void > +f5 (void) > +{ > + struct S a, b; > + struct T c, d; > + __builtin_memset (&b, 0, sizeof (b)); > + __builtin_memcpy (&a, &b, sizeof (b)); > + __builtin_memset (&d, 0, sizeof (d)); > + __builtin_memcpy (&c, &d, sizeof (d)); > + bar (5, &a, &b, &c, &d); > +} > + > +void > +f6 (void) > +{ > + struct S a, b, e, g; > + struct T c, d, f, h; > + g = e = a = b = (struct S) {}; > + h = f = c = d = (struct T) {}; > + bar (6, &a, &b, &c, &d); > + bar (6, &e, &g, &f, &h); > +} > > Jakub > > -- Richard Biener <rguenther@suse.de> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG 
Nuernberg) ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408) 2016-11-28 9:50 ` Richard Biener @ 2016-12-13 11:36 ` Jakub Jelinek 2016-12-14 12:29 ` Richard Biener 0 siblings, 1 reply; 11+ messages in thread From: Jakub Jelinek @ 2016-12-13 11:36 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches Hi! Sorry for not getting to this earlier. On Mon, Nov 28, 2016 at 10:50:26AM +0100, Richard Biener wrote: > > + else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET) > > + && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR > > + && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST > > + && TREE_CODE (gimple_call_arg (defstmt, 2)) == INTEGER_CST) > > + { > > + HOST_WIDE_INT ssize, max_size, off; > > + bool reverse; > > + src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0); > > + get_ref_base_and_extent (src2, &off, &ssize, &max_size, &reverse); > > + if (ssize != max_size > > + || (ssize % BITS_PER_UNIT) != 0 > > + || !wi::eq_p (gimple_call_arg (defstmt, 2), ssize / BITS_PER_UNIT)) > > + src2 = NULL_TREE; > > I wonder why you jump through the hoops of get_ref_base_and_extent > given the call args will be invariant addresses and thus > get_addr_base_and_unit_offset would be more appropriate here. get_addr_base_and_unit_offset does not give me the size though, which is what I wanted to compute. Even if as you suggest I'd accept other INTEGER_CST sizes, it would still be better to punt if the memset is clearly invalid. And for the case where the memset call is followed by assignment, not memcpy (very common, as memcpy is often folded into the assignment), the verification needs to be done. > Also not sure why you want to restrict the size with the wi::eq_p > (probably for the a = b case where the size isn't given explicitely > but then you don't check whether off is 0 ...). I'd say passing But I'm not comparing the result of get_ref_base_and_extent, but the argument as is. 
Perhaps where it does above the src2 = NULL_TREE; I could save the size into one var, off into another one and set src2 to the result of get_ref_base_and_extent in that case, then if those vars are set require the second stmt to be a memset and do the same stuff there? > in base, offset and size for src and dest into this function will > simplify things and should allow to handle > > memset (p+10, 0, 24); > memcpy (q, p+10, 24); > > if you compare bases with operand_equal_p. > > > + if (refs_may_alias_p (dest, src)) > > + return; > > Why's that? I admit I'm not sure if GIMPLE_ASSIGN may be between overlapping objects or not, memset can't. Jakub ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408) 2016-12-13 11:36 ` Jakub Jelinek @ 2016-12-14 12:29 ` Richard Biener 2016-12-15 16:44 ` [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408, take 2) Jakub Jelinek 0 siblings, 1 reply; 11+ messages in thread From: Richard Biener @ 2016-12-14 12:29 UTC (permalink / raw) To: Jakub Jelinek; +Cc: gcc-patches On Tue, 13 Dec 2016, Jakub Jelinek wrote: > Hi! > > Sorry for not getting to this earlier. > > On Mon, Nov 28, 2016 at 10:50:26AM +0100, Richard Biener wrote: > > > + else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET) > > > + && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR > > > + && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST > > > + && TREE_CODE (gimple_call_arg (defstmt, 2)) == INTEGER_CST) > > > + { > > > + HOST_WIDE_INT ssize, max_size, off; > > > + bool reverse; > > > + src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0); > > > + get_ref_base_and_extent (src2, &off, &ssize, &max_size, &reverse); > > > + if (ssize != max_size > > > + || (ssize % BITS_PER_UNIT) != 0 > > > + || !wi::eq_p (gimple_call_arg (defstmt, 2), ssize / BITS_PER_UNIT)) > > > + src2 = NULL_TREE; > > > > I wonder why you jump through the hoops of get_ref_base_and_extent > > given the call args will be invariant addresses and thus > > get_addr_base_and_unit_offset would be more appropriate here. > > get_addr_base_and_unit_offset does not give me the size though, > which is what I wanted to compute. Even if as you suggest I'd > accept other INTEGER_CST sizes, it would still be better to punt > if the memset is clearly invalid. And for the case where the > memset call is followed by assignment, not memcpy (very common, as > memcpy is often folded into the assignment), the verification needs > to be done. > > > Also not sure why you want to restrict the size with the wi::eq_p > > (probably for the a = b case where the size isn't given explicitely > > but then you don't check whether off is 0 ...). 
I'd say passing > > But I'm not comparing the result of get_ref_base_and_extent, but > the argument as is. Ah, ok. But then the size of the memset shouldn't be compared against the get_ref_base_and_extend size from src2 but to the size of the access of SRC/DEST (clearly looking at the "size" of the ADDR_EXPR argument is somewhat bogus). And as you compare src and dest with operand_equal_p there is no need to reject ssize != max_size either (you'd of course not see memset (&a[i].x, 0, 400) because &a[i].x is not invariant, you'd need to lookup the SSA def for a pointer here). You can get at the size of an actual access simpler than by the full-blown get_ref_base_and_extent (just outline a get_ref_size () from the head part of it. I still think that using get_addr_base_and_unit_offset on the address is better and passing decomposed (base, offset, size) triplets to optimize_memcpy would also save you the MEM[(char * {ref-all})&b] = MEM[(char * {ref-all})&a]; special-case. > Perhaps where it does above the src2 = NULL_TREE; > I could save the size into one var, off into another one and set > src2 to the result of get_ref_base_and_extent in that case, then > if those vars are set require the second stmt to be a memset and do the same > stuff there? > > > in base, offset and size for src and dest into this function will > > simplify things and should allow to handle > > > > memset (p+10, 0, 24); > > memcpy (q, p+10, 24); > > > > if you compare bases with operand_equal_p. > > > > > + if (refs_may_alias_p (dest, src)) > > > + return; > > > > Why's that? > > I admit I'm not sure if GIMPLE_ASSIGN may be between overlapping objects or > not, memset can't. No, a memory-memory GIMPLE_ASSIGN has to be validly translatable to memcpy. Richard. ^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408, take 2) 2016-12-14 12:29 ` Richard Biener @ 2016-12-15 16:44 ` Jakub Jelinek 2016-12-16 12:53 ` Richard Biener 0 siblings, 1 reply; 11+ messages in thread From: Jakub Jelinek @ 2016-12-15 16:44 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches On Wed, Dec 14, 2016 at 01:27:56PM +0100, Richard Biener wrote: > Ah, ok. But then the size of the memset shouldn't be compared > against the get_ref_base_and_extend size from src2 but to the > size of the access of SRC/DEST (clearly looking at the "size" of > the ADDR_EXPR argument is somewhat bogus). > And as you compare src and dest > with operand_equal_p there is no need to reject ssize != max_size > either (you'd of course not see memset (&a[i].x, 0, 400) because > &a[i].x is not invariant, you'd need to lookup the SSA def for a pointer > here). > > You can get at the size of an actual access simpler than by > the full-blown get_ref_base_and_extent (just outline a > get_ref_size () from the head part of it. > > I still think that using get_addr_base_and_unit_offset on the > address is better and passing decomposed (base, offset, size) > triplets to optimize_memcpy would also save you the > MEM[(char * {ref-all})&b] = MEM[(char * {ref-all})&a]; special-case. Here is an updated patch that does that (i.e. always work with base, offset, length triplets) and drops the alias check for dest vs. src overlap. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2016-12-15 Jakub Jelinek <jakub@redhat.com> PR c/78408 * tree-ssa-ccp.c: Include tree-dfa.h. (optimize_memcpy): New function. (pass_fold_builtins::execute): Use it. Remove useless conditional break after BUILT_IN_VA_*. * gcc.dg/pr78408-1.c: New test. * gcc.dg/pr78408-2.c: New test. --- gcc/tree-ssa-ccp.c.jj 2016-11-28 16:19:11.000000000 +0100 +++ gcc/tree-ssa-ccp.c 2016-12-15 15:09:03.180993745 +0100 @@ -143,6 +143,7 @@ along with GCC; see the file COPYING3. 
#include "stor-layout.h" #include "optabs-query.h" #include "tree-ssa-ccp.h" +#include "tree-dfa.h" /* Possible lattice values. */ typedef enum @@ -2933,6 +2934,113 @@ optimize_atomic_bit_test_and (gimple_stm release_ssa_name (lhs); } +/* Optimize + a = {}; + b = a; + into + a = {}; + b = {}; + Similarly for memset (&a, ..., sizeof (a)); instead of a = {}; + and/or memcpy (&b, &a, sizeof (a)); instead of b = a; */ + +static void +optimize_memcpy (gimple_stmt_iterator *gsip, tree dest, tree src, tree len) +{ + gimple *stmt = gsi_stmt (*gsip); + if (gimple_has_volatile_ops (stmt) + || TREE_THIS_VOLATILE (dest) + || TREE_THIS_VOLATILE (src)) + return; + + tree vuse = gimple_vuse (stmt); + if (vuse == NULL) + return; + + gimple *defstmt = SSA_NAME_DEF_STMT (vuse); + tree src2 = NULL_TREE, len2 = NULL_TREE; + HOST_WIDE_INT offset, offset2; + tree val = integer_zero_node; + if (gimple_store_p (defstmt) + && gimple_assign_single_p (defstmt) + && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR + && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (defstmt)) == 0 + && !gimple_clobber_p (defstmt)) + src2 = gimple_assign_lhs (defstmt); + else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET) + && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR + && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST) + { + src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0); + len2 = gimple_call_arg (defstmt, 2); + val = gimple_call_arg (defstmt, 1); + if (!integer_zerop (val) && is_gimple_assign (stmt)) + src2 = NULL_TREE; + } + + if (src2 == NULL_TREE) + return; + + if (len == NULL_TREE) + len = (TREE_CODE (src) == COMPONENT_REF + ? DECL_SIZE_UNIT (TREE_OPERAND (src, 1)) + : TYPE_SIZE_UNIT (TREE_TYPE (src))); + if (len2 == NULL_TREE) + len2 = (TREE_CODE (src2) == COMPONENT_REF + ? 
DECL_SIZE_UNIT (TREE_OPERAND (src2, 1)) + : TYPE_SIZE_UNIT (TREE_TYPE (src2))); + if (len == NULL_TREE + || TREE_CODE (len) != INTEGER_CST + || len2 == NULL_TREE + || TREE_CODE (len2) != INTEGER_CST) + return; + + src = get_addr_base_and_unit_offset (src, &offset); + src2 = get_addr_base_and_unit_offset (src2, &offset2); + if (src == NULL_TREE + || src2 == NULL_TREE + || offset < offset2) + return; + + if (!operand_equal_p (src, src2, 0)) + return; + + /* [ src + offset2, src + offset2 + len2 - 1 ] is set to val. + Make sure that + [ src + offset, src + offset + len - 1 ] is a subset of that. */ + if (wi::to_widest (len) + (offset - offset2) > wi::to_widest (len2)) + return; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Simplified\n "); + print_gimple_stmt (dump_file, stmt, 0, dump_flags); + fprintf (dump_file, "after previous\n "); + print_gimple_stmt (dump_file, defstmt, 0, dump_flags); + } + + if (is_gimple_assign (stmt)) + { + tree ctor = build_constructor (TREE_TYPE (dest), NULL); + gimple_assign_set_rhs_from_tree (gsip, ctor); + update_stmt (stmt); + } + else + { + gcall *call = as_a <gcall *> (stmt); + tree fndecl = builtin_decl_implicit (BUILT_IN_MEMSET); + gimple_call_set_fndecl (call, fndecl); + gimple_call_set_fntype (call, TREE_TYPE (fndecl)); + gimple_call_set_arg (call, 1, val); + update_stmt (stmt); + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "into\n "); + print_gimple_stmt (dump_file, stmt, 0, dump_flags); + } +} + /* A simple pass that attempts to fold all builtin functions. This pass is run after we've propagated as many constants as we can. 
*/ @@ -2999,6 +3107,9 @@ pass_fold_builtins::execute (function *f continue; } } + else if (gimple_assign_load_p (stmt) && gimple_store_p (stmt)) + optimize_memcpy (&i, gimple_assign_lhs (stmt), + gimple_assign_rhs1 (stmt), NULL_TREE); gsi_next (&i); continue; } @@ -3114,14 +3225,25 @@ pass_fold_builtins::execute (function *f false, false); break; + case BUILT_IN_MEMCPY: + if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL) + && TREE_CODE (gimple_call_arg (stmt, 0)) == ADDR_EXPR + && TREE_CODE (gimple_call_arg (stmt, 1)) == ADDR_EXPR + && TREE_CODE (gimple_call_arg (stmt, 2)) == INTEGER_CST) + { + tree dest = TREE_OPERAND (gimple_call_arg (stmt, 0), 0); + tree src = TREE_OPERAND (gimple_call_arg (stmt, 1), 0); + tree len = gimple_call_arg (stmt, 2); + optimize_memcpy (&i, dest, src, len); + } + break; + case BUILT_IN_VA_START: case BUILT_IN_VA_END: case BUILT_IN_VA_COPY: /* These shouldn't be folded before pass_stdarg. */ result = optimize_stdarg_builtin (stmt); - if (result) - break; - /* FALLTHRU */ + break; default:; } --- gcc/testsuite/gcc.dg/pr78408-1.c.jj 2016-12-15 13:54:08.942121305 +0100 +++ gcc/testsuite/gcc.dg/pr78408-1.c 2016-12-15 15:13:04.900774924 +0100 @@ -0,0 +1,88 @@ +/* PR c/78408 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-fab1-details" } */ +/* { dg-final { scan-tree-dump-times "after previous" 17 "fab1" } } */ + +struct S { char a[32]; }; +struct T { char a[65536]; }; +void bar (int, struct S *, struct S *, struct T *, struct T *); +void baz (char *, char *); + +void +f1 (void) +{ + struct S a, b; + struct T c, d; + a = b = (struct S) {}; + c = d = (struct T) {}; + bar (1, &a, &b, &c, &d); +} + +void +f2 (void) +{ + struct S a, b; + struct T c, d; + b = (struct S) {}; + a = b; + d = (struct T) {}; + c = d; + bar (2, &a, &b, &c, &d); +} + +void +f3 (void) +{ + struct S a, b; + struct T c, d; + __builtin_memset (&b, 0, sizeof (b)); + a = b; + __builtin_memset (&d, 0, sizeof (d)); + c = d; + bar (3, &a, &b, &c, &d); +} + + +void 
+f4 (void) +{ + struct S a, b; + struct T c, d; + b = (struct S) {}; + __builtin_memcpy (&a, &b, sizeof (b)); + d = (struct T) {}; + __builtin_memcpy (&c, &d, sizeof (d)); + bar (4, &a, &b, &c, &d); +} + +void +f5 (void) +{ + struct S a, b; + struct T c, d; + __builtin_memset (&b, 0, sizeof (b)); + __builtin_memcpy (&a, &b, sizeof (b)); + __builtin_memset (&d, 0, sizeof (d)); + __builtin_memcpy (&c, &d, sizeof (d)); + bar (5, &a, &b, &c, &d); +} + +void +f6 (void) +{ + struct S a, b, e, g; + struct T c, d, f, h; + g = e = a = b = (struct S) {}; + h = f = c = d = (struct T) {}; + bar (6, &a, &b, &c, &d); + bar (6, &e, &g, &f, &h); +} + +void +f7 (void) +{ + char a[64], b[64]; + __builtin_memset (a + 13, 2, 27); + __builtin_memcpy (b + 4, a + 17, 23); + baz (a, b); +} --- gcc/testsuite/gcc.dg/pr78408-2.c.jj 2016-12-15 15:13:48.060200199 +0100 +++ gcc/testsuite/gcc.dg/pr78408-2.c 2016-12-15 15:15:50.880564683 +0100 @@ -0,0 +1,39 @@ +/* PR c/78408 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-fab1-details" } */ +/* { dg-final { scan-tree-dump-not "after previous" "fab1" } } */ + +struct S { char a[32]; }; +struct T { char a[65536]; }; +void bar (int, struct S *, struct S *, struct T *, struct T *); +void baz (char *, char *); + +void +f1 (void) +{ + struct S a, b; + struct T c, d; + __builtin_memset (&b, 2, sizeof (b)); + a = b; + __builtin_memset (&d, 3, sizeof (d)); + c = d; + bar (3, &a, &b, &c, &d); +} + +void +f2 (void) +{ + char a[64], b[64]; + __builtin_memset (a + 13, 2, 27); + __builtin_memcpy (b + 4, a + 17, 24); + baz (a, b); +} + +void +f3 (void) +{ + char a[64], b[64]; + __builtin_memset (a + 13, 2, 27); + __builtin_memcpy (b + 4, a + 12, 5); + baz (a, b); +} Jakub ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408, take 2) 2016-12-15 16:44 ` [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408, take 2) Jakub Jelinek @ 2016-12-16 12:53 ` Richard Biener 2016-12-16 13:57 ` Jakub Jelinek 2016-12-19 20:24 ` Jeff Law 0 siblings, 2 replies; 11+ messages in thread From: Richard Biener @ 2016-12-16 12:53 UTC (permalink / raw) To: Jakub Jelinek; +Cc: gcc-patches On Thu, 15 Dec 2016, Jakub Jelinek wrote: > On Wed, Dec 14, 2016 at 01:27:56PM +0100, Richard Biener wrote: > > Ah, ok. But then the size of the memset shouldn't be compared > > against the get_ref_base_and_extend size from src2 but to the > > size of the access of SRC/DEST (clearly looking at the "size" of > > the ADDR_EXPR argument is somewhat bogus). > > And as you compare src and dest > > with operand_equal_p there is no need to reject ssize != max_size > > either (you'd of course not see memset (&a[i].x, 0, 400) because > > &a[i].x is not invariant, you'd need to lookup the SSA def for a pointer > > here). > > > > You can get at the size of an actual access simpler than by > > the full-blown get_ref_base_and_extent (just outline a > > get_ref_size () from the head part of it. > > > > I still think that using get_addr_base_and_unit_offset on the > > address is better and passing decomposed (base, offset, size) > > triplets to optimize_memcpy would also save you the > > MEM[(char * {ref-all})&b] = MEM[(char * {ref-all})&a]; special-case. > > Here is an updated patch that does that (i.e. always work with base, offset, > length triplets) and drops the alias check for dest vs. src overlap. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2016-12-15 Jakub Jelinek <jakub@redhat.com> > > PR c/78408 > * tree-ssa-ccp.c: Include tree-dfa.h. > (optimize_memcpy): New function. > (pass_fold_builtins::execute): Use it. Remove useless conditional > break after BUILT_IN_VA_*. > > * gcc.dg/pr78408-1.c: New test. > * gcc.dg/pr78408-2.c: New test. 
> > --- gcc/tree-ssa-ccp.c.jj 2016-11-28 16:19:11.000000000 +0100 > +++ gcc/tree-ssa-ccp.c 2016-12-15 15:09:03.180993745 +0100 > @@ -143,6 +143,7 @@ along with GCC; see the file COPYING3. > #include "stor-layout.h" > #include "optabs-query.h" > #include "tree-ssa-ccp.h" > +#include "tree-dfa.h" > > /* Possible lattice values. */ > typedef enum > @@ -2933,6 +2934,113 @@ optimize_atomic_bit_test_and (gimple_stm > release_ssa_name (lhs); > } > > +/* Optimize > + a = {}; > + b = a; > + into > + a = {}; > + b = {}; > + Similarly for memset (&a, ..., sizeof (a)); instead of a = {}; > + and/or memcpy (&b, &a, sizeof (a)); instead of b = a; */ > + > +static void > +optimize_memcpy (gimple_stmt_iterator *gsip, tree dest, tree src, tree len) > +{ > + gimple *stmt = gsi_stmt (*gsip); > + if (gimple_has_volatile_ops (stmt) > + || TREE_THIS_VOLATILE (dest) > + || TREE_THIS_VOLATILE (src)) The last two should be redundant and/or not necessary. > + return; > + > + tree vuse = gimple_vuse (stmt); > + if (vuse == NULL) > + return; > + > + gimple *defstmt = SSA_NAME_DEF_STMT (vuse); > + tree src2 = NULL_TREE, len2 = NULL_TREE; > + HOST_WIDE_INT offset, offset2; > + tree val = integer_zero_node; > + if (gimple_store_p (defstmt) > + && gimple_assign_single_p (defstmt) > + && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR > + && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (defstmt)) == 0 Should be always true for stores from constructors. > + && !gimple_clobber_p (defstmt)) > + src2 = gimple_assign_lhs (defstmt); > + else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET) > + && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR > + && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST) > + { > + src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0); > + len2 = gimple_call_arg (defstmt, 2); > + val = gimple_call_arg (defstmt, 1); > + if (!integer_zerop (val) && is_gimple_assign (stmt)) Please add a comment here. I think below ... 
(*) > + src2 = NULL_TREE; > + } > + > + if (src2 == NULL_TREE) > + return; > + > + if (len == NULL_TREE) > + len = (TREE_CODE (src) == COMPONENT_REF > + ? DECL_SIZE_UNIT (TREE_OPERAND (src, 1)) > + : TYPE_SIZE_UNIT (TREE_TYPE (src))); > + if (len2 == NULL_TREE) > + len2 = (TREE_CODE (src2) == COMPONENT_REF > + ? DECL_SIZE_UNIT (TREE_OPERAND (src2, 1)) > + : TYPE_SIZE_UNIT (TREE_TYPE (src2))); > + if (len == NULL_TREE > + || TREE_CODE (len) != INTEGER_CST > + || len2 == NULL_TREE > + || TREE_CODE (len2) != INTEGER_CST) > + return; > + > + src = get_addr_base_and_unit_offset (src, &offset); > + src2 = get_addr_base_and_unit_offset (src2, &offset2); > + if (src == NULL_TREE > + || src2 == NULL_TREE > + || offset < offset2) > + return; > + > + if (!operand_equal_p (src, src2, 0)) > + return; > + > + /* [ src + offset2, src + offset2 + len2 - 1 ] is set to val. > + Make sure that > + [ src + offset, src + offset + len - 1 ] is a subset of that. */ > + if (wi::to_widest (len) + (offset - offset2) > wi::to_widest (len2)) I think you can use ::to_offset which is more efficient. > + return; > + > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "Simplified\n "); > + print_gimple_stmt (dump_file, stmt, 0, dump_flags); > + fprintf (dump_file, "after previous\n "); > + print_gimple_stmt (dump_file, defstmt, 0, dump_flags); > + } > + > + if (is_gimple_assign (stmt)) (*) a better check would be if val is zero because even a memcpy could be optimized to an assignment from {}. I suppose you're doing it this way because of simplicity and because we're late in the compilation and thus it doesn't matter in practice for optimization? (the 2nd destination could become non-TREE_ADDRESSABLE, enabling better alias disambiguation) Ok with the above changes, some more comments are ok to resolve the last one. 
> + { > + tree ctor = build_constructor (TREE_TYPE (dest), NULL); > + gimple_assign_set_rhs_from_tree (gsip, ctor); > + update_stmt (stmt); > + } > + else /* stmt is memcpy */ Thanks, Richard. > + { > + gcall *call = as_a <gcall *> (stmt); > + tree fndecl = builtin_decl_implicit (BUILT_IN_MEMSET); > + gimple_call_set_fndecl (call, fndecl); > + gimple_call_set_fntype (call, TREE_TYPE (fndecl)); > + gimple_call_set_arg (call, 1, val); > + update_stmt (stmt); > + } > + > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "into\n "); > + print_gimple_stmt (dump_file, stmt, 0, dump_flags); > + } > +} > + > /* A simple pass that attempts to fold all builtin functions. This pass > is run after we've propagated as many constants as we can. */ > > @@ -2999,6 +3107,9 @@ pass_fold_builtins::execute (function *f > continue; > } > } > + else if (gimple_assign_load_p (stmt) && gimple_store_p (stmt)) > + optimize_memcpy (&i, gimple_assign_lhs (stmt), > + gimple_assign_rhs1 (stmt), NULL_TREE); > gsi_next (&i); > continue; > } > @@ -3114,14 +3225,25 @@ pass_fold_builtins::execute (function *f > false, false); > break; > > + case BUILT_IN_MEMCPY: > + if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL) > + && TREE_CODE (gimple_call_arg (stmt, 0)) == ADDR_EXPR > + && TREE_CODE (gimple_call_arg (stmt, 1)) == ADDR_EXPR > + && TREE_CODE (gimple_call_arg (stmt, 2)) == INTEGER_CST) > + { > + tree dest = TREE_OPERAND (gimple_call_arg (stmt, 0), 0); > + tree src = TREE_OPERAND (gimple_call_arg (stmt, 1), 0); > + tree len = gimple_call_arg (stmt, 2); > + optimize_memcpy (&i, dest, src, len); > + } > + break; > + > case BUILT_IN_VA_START: > case BUILT_IN_VA_END: > case BUILT_IN_VA_COPY: > /* These shouldn't be folded before pass_stdarg. 
*/ > result = optimize_stdarg_builtin (stmt); > - if (result) > - break; > - /* FALLTHRU */ > + break; > > default:; > } > --- gcc/testsuite/gcc.dg/pr78408-1.c.jj 2016-12-15 13:54:08.942121305 +0100 > +++ gcc/testsuite/gcc.dg/pr78408-1.c 2016-12-15 15:13:04.900774924 +0100 > @@ -0,0 +1,88 @@ > +/* PR c/78408 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-fab1-details" } */ > +/* { dg-final { scan-tree-dump-times "after previous" 17 "fab1" } } */ > + > +struct S { char a[32]; }; > +struct T { char a[65536]; }; > +void bar (int, struct S *, struct S *, struct T *, struct T *); > +void baz (char *, char *); > + > +void > +f1 (void) > +{ > + struct S a, b; > + struct T c, d; > + a = b = (struct S) {}; > + c = d = (struct T) {}; > + bar (1, &a, &b, &c, &d); > +} > + > +void > +f2 (void) > +{ > + struct S a, b; > + struct T c, d; > + b = (struct S) {}; > + a = b; > + d = (struct T) {}; > + c = d; > + bar (2, &a, &b, &c, &d); > +} > + > +void > +f3 (void) > +{ > + struct S a, b; > + struct T c, d; > + __builtin_memset (&b, 0, sizeof (b)); > + a = b; > + __builtin_memset (&d, 0, sizeof (d)); > + c = d; > + bar (3, &a, &b, &c, &d); > +} > + > + > +void > +f4 (void) > +{ > + struct S a, b; > + struct T c, d; > + b = (struct S) {}; > + __builtin_memcpy (&a, &b, sizeof (b)); > + d = (struct T) {}; > + __builtin_memcpy (&c, &d, sizeof (d)); > + bar (4, &a, &b, &c, &d); > +} > + > +void > +f5 (void) > +{ > + struct S a, b; > + struct T c, d; > + __builtin_memset (&b, 0, sizeof (b)); > + __builtin_memcpy (&a, &b, sizeof (b)); > + __builtin_memset (&d, 0, sizeof (d)); > + __builtin_memcpy (&c, &d, sizeof (d)); > + bar (5, &a, &b, &c, &d); > +} > + > +void > +f6 (void) > +{ > + struct S a, b, e, g; > + struct T c, d, f, h; > + g = e = a = b = (struct S) {}; > + h = f = c = d = (struct T) {}; > + bar (6, &a, &b, &c, &d); > + bar (6, &e, &g, &f, &h); > +} > + > +void > +f7 (void) > +{ > + char a[64], b[64]; > + __builtin_memset (a + 13, 2, 27); > + __builtin_memcpy 
(b + 4, a + 17, 23); > + baz (a, b); > +} > --- gcc/testsuite/gcc.dg/pr78408-2.c.jj 2016-12-15 15:13:48.060200199 +0100 > +++ gcc/testsuite/gcc.dg/pr78408-2.c 2016-12-15 15:15:50.880564683 +0100 > @@ -0,0 +1,39 @@ > +/* PR c/78408 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-fab1-details" } */ > +/* { dg-final { scan-tree-dump-not "after previous" "fab1" } } */ > + > +struct S { char a[32]; }; > +struct T { char a[65536]; }; > +void bar (int, struct S *, struct S *, struct T *, struct T *); > +void baz (char *, char *); > + > +void > +f1 (void) > +{ > + struct S a, b; > + struct T c, d; > + __builtin_memset (&b, 2, sizeof (b)); > + a = b; > + __builtin_memset (&d, 3, sizeof (d)); > + c = d; > + bar (3, &a, &b, &c, &d); > +} > + > +void > +f2 (void) > +{ > + char a[64], b[64]; > + __builtin_memset (a + 13, 2, 27); > + __builtin_memcpy (b + 4, a + 17, 24); > + baz (a, b); > +} > + > +void > +f3 (void) > +{ > + char a[64], b[64]; > + __builtin_memset (a + 13, 2, 27); > + __builtin_memcpy (b + 4, a + 12, 5); > + baz (a, b); > +} > > > Jakub > > -- Richard Biener <rguenther@suse.de> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] Optimize aggregate a = b = c = {} (PR c/78408, take 2) 2016-12-16 12:53 ` Richard Biener @ 2016-12-16 13:57 ` Jakub Jelinek 2016-12-16 13:17 ` Richard Biener 2016-12-19 20:24 ` Jeff Law 1 sibling, 1 reply; 11+ messages in thread From: Jakub Jelinek @ 2016-12-16 13:57 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches On Fri, Dec 16, 2016 at 01:50:54PM +0100, Richard Biener wrote: > > +static void > > +optimize_memcpy (gimple_stmt_iterator *gsip, tree dest, tree src, tree len) > > +{ > > + gimple *stmt = gsi_stmt (*gsip); > > + if (gimple_has_volatile_ops (stmt) > > + || TREE_THIS_VOLATILE (dest) > > + || TREE_THIS_VOLATILE (src)) > > The last two should be redundant and/or not necessary. I've been doing this because: if stmt is __builtin_memcpy and the actual objects are TREE_THIS_VOLATILE, then I wanted to punt. But I guess if we never transform __builtin_memcpy into anything other than __builtin_memset, then it isn't a problem. > > + return; > > + > > + tree vuse = gimple_vuse (stmt); > > + if (vuse == NULL) > > + return; > > + > > + gimple *defstmt = SSA_NAME_DEF_STMT (vuse); > > + tree src2 = NULL_TREE, len2 = NULL_TREE; > > + HOST_WIDE_INT offset, offset2; > > + tree val = integer_zero_node; > > + if (gimple_store_p (defstmt) > > + && gimple_assign_single_p (defstmt) > > + && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR > > + && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (defstmt)) == 0 > > Should be always true for stores from constructors. Ok, can I just gcc_checking_assert verify it or is that not worth it?
> > + && !gimple_clobber_p (defstmt)) > > + src2 = gimple_assign_lhs (defstmt); > > + else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET) > > + && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR > > + && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST) > > + { > > + src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0); > > + len2 = gimple_call_arg (defstmt, 2); > > + val = gimple_call_arg (defstmt, 1); > > + if (!integer_zerop (val) && is_gimple_assign (stmt)) > > Please add a comment here. I think below ... (*) > > > + src2 = NULL_TREE; > > + } > > + > > + if (src2 == NULL_TREE) > > + return; > > + > > + if (len == NULL_TREE) > > + len = (TREE_CODE (src) == COMPONENT_REF > > + ? DECL_SIZE_UNIT (TREE_OPERAND (src, 1)) > > + : TYPE_SIZE_UNIT (TREE_TYPE (src))); > > + if (len2 == NULL_TREE) > > + len2 = (TREE_CODE (src2) == COMPONENT_REF > > + ? DECL_SIZE_UNIT (TREE_OPERAND (src2, 1)) > > + : TYPE_SIZE_UNIT (TREE_TYPE (src2))); > > + if (len == NULL_TREE > > + || TREE_CODE (len) != INTEGER_CST > > + || len2 == NULL_TREE > > + || TREE_CODE (len2) != INTEGER_CST) > > + return; > > + > > + src = get_addr_base_and_unit_offset (src, &offset); > > + src2 = get_addr_base_and_unit_offset (src2, &offset2); > > + if (src == NULL_TREE > > + || src2 == NULL_TREE > > + || offset < offset2) > > + return; > > + > > + if (!operand_equal_p (src, src2, 0)) > > + return; > > + > > + /* [ src + offset2, src + offset2 + len2 - 1 ] is set to val. > > + Make sure that > > + [ src + offset, src + offset + len - 1 ] is a subset of that. */ > > + if (wi::to_widest (len) + (offset - offset2) > wi::to_widest (len2)) > > I think you can use ::to_offset which is more efficient. That is reasonable. 
> > + return; > > + > > + if (dump_file && (dump_flags & TDF_DETAILS)) > > + { > > + fprintf (dump_file, "Simplified\n "); > > + print_gimple_stmt (dump_file, stmt, 0, dump_flags); > > + fprintf (dump_file, "after previous\n "); > > + print_gimple_stmt (dump_file, defstmt, 0, dump_flags); > > + } > > + > > + if (is_gimple_assign (stmt)) > > (*) a better check would be if val is zero because even a memcpy > could be optimized to an assignment from {}. I suppose you're > doing it this way because of simplicity and because we're > late in the compilation and thus it doesn't matter in practice > for optimization? (the 2nd destination could become > non-TREE_ADDRESSABLE, enabling better alias disambiguation) Is memset and = {} equivalent even for aggregates with paddings? If val is not zero, then I can only handle it if stmt is memcpy, (or in theory if stmt is assignment, and dest is addressable it could be transformed into memset). If val is zero, and dest isn't volatile and the size matches the size of dest, then perhaps it could be transformed into = {} (if there are no paddings/bitfields or if it doesn't matter), I haven't done that for simplicity indeed. Jakub ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408, take 2) 2016-12-16 13:57 ` Jakub Jelinek @ 2016-12-16 13:17 ` Richard Biener 2016-12-16 14:02 ` Jakub Jelinek 0 siblings, 1 reply; 11+ messages in thread From: Richard Biener @ 2016-12-16 13:17 UTC (permalink / raw) To: Jakub Jelinek; +Cc: gcc-patches On Fri, 16 Dec 2016, Jakub Jelinek wrote: > On Fri, Dec 16, 2016 at 01:50:54PM +0100, Richard Biener wrote: > > > +static void > > > +optimize_memcpy (gimple_stmt_iterator *gsip, tree dest, tree src, tree len) > > > +{ > > > + gimple *stmt = gsi_stmt (*gsip); > > > + if (gimple_has_volatile_ops (stmt) > > > + || TREE_THIS_VOLATILE (dest) > > > + || TREE_THIS_VOLATILE (src)) > > > > The last two should be redundant and/or not necessary. > > I've been doing this if stmt is __builtin_memcpy and the actual > objects are TREE_THIS_VOLATILE, then I wanted to punt. > But I guess if we never transform __builtin_memcpy into anything > other than __builtin_memset, then it isn't a problem. Yeah, and a memcpy of a volatile object isn't a volatile access anyway. > > > + return; > > > + > > > + tree vuse = gimple_vuse (stmt); > > > + if (vuse == NULL) > > > + return; > > > + > > > + gimple *defstmt = SSA_NAME_DEF_STMT (vuse); > > > + tree src2 = NULL_TREE, len2 = NULL_TREE; > > > + HOST_WIDE_INT offset, offset2; > > > + tree val = integer_zero_node; > > > + if (gimple_store_p (defstmt) > > > + && gimple_assign_single_p (defstmt) > > > + && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR > > > + && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (defstmt)) == 0 > > > > Should be always true for stores from constructors. > > Ok, can I just gcc_checking_assert verify it or is that not worth it? Not worth it -- tree-cfg.c gimple verification contains that already (ok, not exactly - it allows vector CONSTRUCTORs as RHS of !DECL_GIMPLE_REG_P vectors -- see a bit above (MEM_REF:) for a missed check. 
> > > > + && !gimple_clobber_p (defstmt)) > > > + src2 = gimple_assign_lhs (defstmt); > > > + else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET) > > > + && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR > > > + && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST) > > > + { > > > + src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0); > > > + len2 = gimple_call_arg (defstmt, 2); > > > + val = gimple_call_arg (defstmt, 1); > > > + if (!integer_zerop (val) && is_gimple_assign (stmt)) > > > > Please add a comment here. I think below ... (*) > > > > > + src2 = NULL_TREE; > > > + } > > > + > > > + if (src2 == NULL_TREE) > > > + return; > > > + > > > + if (len == NULL_TREE) > > > + len = (TREE_CODE (src) == COMPONENT_REF > > > + ? DECL_SIZE_UNIT (TREE_OPERAND (src, 1)) > > > + : TYPE_SIZE_UNIT (TREE_TYPE (src))); > > > + if (len2 == NULL_TREE) > > > + len2 = (TREE_CODE (src2) == COMPONENT_REF > > > + ? DECL_SIZE_UNIT (TREE_OPERAND (src2, 1)) > > > + : TYPE_SIZE_UNIT (TREE_TYPE (src2))); > > > + if (len == NULL_TREE > > > + || TREE_CODE (len) != INTEGER_CST > > > + || len2 == NULL_TREE > > > + || TREE_CODE (len2) != INTEGER_CST) > > > + return; > > > + > > > + src = get_addr_base_and_unit_offset (src, &offset); > > > + src2 = get_addr_base_and_unit_offset (src2, &offset2); > > > + if (src == NULL_TREE > > > + || src2 == NULL_TREE > > > + || offset < offset2) > > > + return; > > > + > > > + if (!operand_equal_p (src, src2, 0)) > > > + return; > > > + > > > + /* [ src + offset2, src + offset2 + len2 - 1 ] is set to val. > > > + Make sure that > > > + [ src + offset, src + offset + len - 1 ] is a subset of that. */ > > > + if (wi::to_widest (len) + (offset - offset2) > wi::to_widest (len2)) > > > > I think you can use ::to_offset which is more efficient. > > That is reasonable. 
> > > > + return; > > > + > > > + if (dump_file && (dump_flags & TDF_DETAILS)) > > > + { > > > + fprintf (dump_file, "Simplified\n "); > > > + print_gimple_stmt (dump_file, stmt, 0, dump_flags); > > > + fprintf (dump_file, "after previous\n "); > > > + print_gimple_stmt (dump_file, defstmt, 0, dump_flags); > > > + } > > > + > > > + if (is_gimple_assign (stmt)) > > > > (*) a better check would be if val is zero because even a memcpy > > could be optimized to an assignment from {}. I suppose you're > > doing it this way because of simplicity and because we're > > late in the compilation and thus it doesn't matter in practice > > for optimization? (the 2nd destination could become > > non-TREE_ADDRESSABLE, enabling better alias disambiguation) > > Is memset and = {} equivalent even for aggregates with paddings? When lowering memset to = {} you should use MEM<char[]> (&decl, off-of-original-alias-type) = {}-of-type-char[]; similar for lowering of memcpy (I think what gimple-fold.c does for example is slightly unsafe). But in practice I think nothing in GCC assumes you can lower accesses and ignore padding if that pass (like SRA) does not see all accesses to prove that is safe. > If val is not zero, then I can only handle it if stmt is memcpy, > (or in theory if stmt is assignment, and dest is addressable it > could be transformed into memset). If val is zero, and dest isn't > volatile and the size matches the size of dest, then perhaps it > could be transformed into = {} (if there are no paddings/bitfields > or if it doesn't matter), I haven't done that for simplicity indeed. Yes, please add a comment so a curious bystander doesn't have to figure that out himself. Richard. ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408, take 2) 2016-12-16 13:17 ` Richard Biener @ 2016-12-16 14:02 ` Jakub Jelinek 2016-12-16 14:15 ` Richard Biener 0 siblings, 1 reply; 11+ messages in thread From: Jakub Jelinek @ 2016-12-16 14:02 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches On Fri, Dec 16, 2016 at 02:16:40PM +0100, Richard Biener wrote: > > Is memset and = {} equivalent even for aggregates with paddings? > > When lowering memset to = {} you should use > > MEM<char[]> (&decl, off-of-original-alias-type) = {}-of-type-char[]; > > similar for lowering of memcpy (I think what gimple-fold.c does > for example is slightly unsafe). But in practice I think nothing > in GCC assumes you can lower accesses and ignore padding if that > pass (like SRA) does not see all accesses to prove that is safe. > > > If val is not zero, then I can only handle it if stmt is memcpy, > > (or in theory if stmt is assignment, and dest is addressable it > > could be transformed into memset). If val is zero, and dest isn't > > volatile and the size matches the size of dest, then perhaps it > > could be transformed into = {} (if there are no paddings/bitfields > > or if it doesn't matter), I haven't done that for simplicity indeed. > > Yes, please add a comment so a curious bystander doesn't have to figure > that out himself. So like this (if it passes bootstrap/regtest)? 2016-12-16 Jakub Jelinek <jakub@redhat.com> PR c/78408 * tree-ssa-ccp.c: Include tree-dfa.h. (optimize_memcpy): New function. (pass_fold_builtins::execute): Use it. Remove useless conditional break after BUILT_IN_VA_*. * gcc.dg/pr78408-1.c: New test. * gcc.dg/pr78408-2.c: New test. --- gcc/tree-ssa-ccp.c.jj 2016-12-16 11:24:31.495987909 +0100 +++ gcc/tree-ssa-ccp.c 2016-12-16 14:41:22.572861225 +0100 @@ -143,6 +143,7 @@ along with GCC; see the file COPYING3. #include "stor-layout.h" #include "optabs-query.h" #include "tree-ssa-ccp.h" +#include "tree-dfa.h" /* Possible lattice values. 
*/ typedef enum @@ -2933,6 +2934,119 @@ optimize_atomic_bit_test_and (gimple_stm release_ssa_name (lhs); } +/* Optimize + a = {}; + b = a; + into + a = {}; + b = {}; + Similarly for memset (&a, ..., sizeof (a)); instead of a = {}; + and/or memcpy (&b, &a, sizeof (a)); instead of b = a; */ + +static void +optimize_memcpy (gimple_stmt_iterator *gsip, tree dest, tree src, tree len) +{ + gimple *stmt = gsi_stmt (*gsip); + if (gimple_has_volatile_ops (stmt)) + return; + + tree vuse = gimple_vuse (stmt); + if (vuse == NULL) + return; + + gimple *defstmt = SSA_NAME_DEF_STMT (vuse); + tree src2 = NULL_TREE, len2 = NULL_TREE; + HOST_WIDE_INT offset, offset2; + tree val = integer_zero_node; + if (gimple_store_p (defstmt) + && gimple_assign_single_p (defstmt) + && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR + && !gimple_clobber_p (defstmt)) + src2 = gimple_assign_lhs (defstmt); + else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET) + && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR + && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST) + { + src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0); + len2 = gimple_call_arg (defstmt, 2); + val = gimple_call_arg (defstmt, 1); + /* For non-0 val, we'd have to transform stmt from assignment + into memset (only if dest is addressable). */ + if (!integer_zerop (val) && is_gimple_assign (stmt)) + src2 = NULL_TREE; + } + + if (src2 == NULL_TREE) + return; + + if (len == NULL_TREE) + len = (TREE_CODE (src) == COMPONENT_REF + ? DECL_SIZE_UNIT (TREE_OPERAND (src, 1)) + : TYPE_SIZE_UNIT (TREE_TYPE (src))); + if (len2 == NULL_TREE) + len2 = (TREE_CODE (src2) == COMPONENT_REF + ? 
DECL_SIZE_UNIT (TREE_OPERAND (src2, 1)) + : TYPE_SIZE_UNIT (TREE_TYPE (src2))); + if (len == NULL_TREE + || TREE_CODE (len) != INTEGER_CST + || len2 == NULL_TREE + || TREE_CODE (len2) != INTEGER_CST) + return; + + src = get_addr_base_and_unit_offset (src, &offset); + src2 = get_addr_base_and_unit_offset (src2, &offset2); + if (src == NULL_TREE + || src2 == NULL_TREE + || offset < offset2) + return; + + if (!operand_equal_p (src, src2, 0)) + return; + + /* [ src + offset2, src + offset2 + len2 - 1 ] is set to val. + Make sure that + [ src + offset, src + offset + len - 1 ] is a subset of that. */ + if (wi::to_offset (len) + (offset - offset2) > wi::to_offset (len2)) + return; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Simplified\n "); + print_gimple_stmt (dump_file, stmt, 0, dump_flags); + fprintf (dump_file, "after previous\n "); + print_gimple_stmt (dump_file, defstmt, 0, dump_flags); + } + + /* For simplicity, don't change the kind of the stmt, + turn dest = src; into dest = {}; and memcpy (&dest, &src, len); + into memset (&dest, val, len); + In theory we could change dest = src into memset if dest + is addressable (maybe beneficial if val is not 0), or + memcpy (&dest, &src, len) into dest = {} if len is the size + of dest, dest isn't volatile. */ + if (is_gimple_assign (stmt)) + { + tree ctor = build_constructor (TREE_TYPE (dest), NULL); + gimple_assign_set_rhs_from_tree (gsip, ctor); + update_stmt (stmt); + } + else /* If stmt is memcpy, transform it into memset. */ + { + gcall *call = as_a <gcall *> (stmt); + tree fndecl = builtin_decl_implicit (BUILT_IN_MEMSET); + gimple_call_set_fndecl (call, fndecl); + gimple_call_set_fntype (call, TREE_TYPE (fndecl)); + gimple_call_set_arg (call, 1, val); + update_stmt (stmt); + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "into\n "); + print_gimple_stmt (dump_file, stmt, 0, dump_flags); + } +} + /* A simple pass that attempts to fold all builtin functions. 
This pass is run after we've propagated as many constants as we can. */ @@ -2999,6 +3113,9 @@ pass_fold_builtins::execute (function *f continue; } } + else if (gimple_assign_load_p (stmt) && gimple_store_p (stmt)) + optimize_memcpy (&i, gimple_assign_lhs (stmt), + gimple_assign_rhs1 (stmt), NULL_TREE); gsi_next (&i); continue; } @@ -3114,14 +3231,25 @@ pass_fold_builtins::execute (function *f false, false); break; + case BUILT_IN_MEMCPY: + if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL) + && TREE_CODE (gimple_call_arg (stmt, 0)) == ADDR_EXPR + && TREE_CODE (gimple_call_arg (stmt, 1)) == ADDR_EXPR + && TREE_CODE (gimple_call_arg (stmt, 2)) == INTEGER_CST) + { + tree dest = TREE_OPERAND (gimple_call_arg (stmt, 0), 0); + tree src = TREE_OPERAND (gimple_call_arg (stmt, 1), 0); + tree len = gimple_call_arg (stmt, 2); + optimize_memcpy (&i, dest, src, len); + } + break; + case BUILT_IN_VA_START: case BUILT_IN_VA_END: case BUILT_IN_VA_COPY: /* These shouldn't be folded before pass_stdarg. */ result = optimize_stdarg_builtin (stmt); - if (result) - break; - /* FALLTHRU */ + break; default:; } --- gcc/testsuite/gcc.dg/pr78408-1.c.jj 2016-12-16 14:33:38.890063632 +0100 +++ gcc/testsuite/gcc.dg/pr78408-1.c 2016-12-16 14:33:38.890063632 +0100 @@ -0,0 +1,88 @@ +/* PR c/78408 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-fab1-details" } */ +/* { dg-final { scan-tree-dump-times "after previous" 17 "fab1" } } */ + +struct S { char a[32]; }; +struct T { char a[65536]; }; +void bar (int, struct S *, struct S *, struct T *, struct T *); +void baz (char *, char *); + +void +f1 (void) +{ + struct S a, b; + struct T c, d; + a = b = (struct S) {}; + c = d = (struct T) {}; + bar (1, &a, &b, &c, &d); +} + +void +f2 (void) +{ + struct S a, b; + struct T c, d; + b = (struct S) {}; + a = b; + d = (struct T) {}; + c = d; + bar (2, &a, &b, &c, &d); +} + +void +f3 (void) +{ + struct S a, b; + struct T c, d; + __builtin_memset (&b, 0, sizeof (b)); + a = b; + __builtin_memset 
(&d, 0, sizeof (d)); + c = d; + bar (3, &a, &b, &c, &d); +} + + +void +f4 (void) +{ + struct S a, b; + struct T c, d; + b = (struct S) {}; + __builtin_memcpy (&a, &b, sizeof (b)); + d = (struct T) {}; + __builtin_memcpy (&c, &d, sizeof (d)); + bar (4, &a, &b, &c, &d); +} + +void +f5 (void) +{ + struct S a, b; + struct T c, d; + __builtin_memset (&b, 0, sizeof (b)); + __builtin_memcpy (&a, &b, sizeof (b)); + __builtin_memset (&d, 0, sizeof (d)); + __builtin_memcpy (&c, &d, sizeof (d)); + bar (5, &a, &b, &c, &d); +} + +void +f6 (void) +{ + struct S a, b, e, g; + struct T c, d, f, h; + g = e = a = b = (struct S) {}; + h = f = c = d = (struct T) {}; + bar (6, &a, &b, &c, &d); + bar (6, &e, &g, &f, &h); +} + +void +f7 (void) +{ + char a[64], b[64]; + __builtin_memset (a + 13, 2, 27); + __builtin_memcpy (b + 4, a + 17, 23); + baz (a, b); +} --- gcc/testsuite/gcc.dg/pr78408-2.c.jj 2016-12-16 14:33:38.890063632 +0100 +++ gcc/testsuite/gcc.dg/pr78408-2.c 2016-12-16 14:33:38.890063632 +0100 @@ -0,0 +1,39 @@ +/* PR c/78408 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-fab1-details" } */ +/* { dg-final { scan-tree-dump-not "after previous" "fab1" } } */ + +struct S { char a[32]; }; +struct T { char a[65536]; }; +void bar (int, struct S *, struct S *, struct T *, struct T *); +void baz (char *, char *); + +void +f1 (void) +{ + struct S a, b; + struct T c, d; + __builtin_memset (&b, 2, sizeof (b)); + a = b; + __builtin_memset (&d, 3, sizeof (d)); + c = d; + bar (3, &a, &b, &c, &d); +} + +void +f2 (void) +{ + char a[64], b[64]; + __builtin_memset (a + 13, 2, 27); + __builtin_memcpy (b + 4, a + 17, 24); + baz (a, b); +} + +void +f3 (void) +{ + char a[64], b[64]; + __builtin_memset (a + 13, 2, 27); + __builtin_memcpy (b + 4, a + 12, 5); + baz (a, b); +} Jakub ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408, take 2) 2016-12-16 14:02 ` Jakub Jelinek @ 2016-12-16 14:15 ` Richard Biener 0 siblings, 0 replies; 11+ messages in thread From: Richard Biener @ 2016-12-16 14:15 UTC (permalink / raw) To: Jakub Jelinek; +Cc: gcc-patches On Fri, 16 Dec 2016, Jakub Jelinek wrote: > On Fri, Dec 16, 2016 at 02:16:40PM +0100, Richard Biener wrote: > > > Is memset and = {} equivalent even for aggregates with paddings? > > > > When lowering memset to = {} you should use > > > > MEM<char[]> (&decl, off-of-original-alias-type) = {}-of-type-char[]; > > > > similar for lowering of memcpy (I think what gimple-fold.c does > > for example is slightly unsafe). But in practice I think nothing > > in GCC assumes you can lower accesses and ignore padding if that > > pass (like SRA) does not see all accesses to prove that is safe. > > > > > If val is not zero, then I can only handle it if stmt is memcpy, > > > (or in theory if stmt is assignment, and dest is addressable it > > > could be transformed into memset). If val is zero, and dest isn't > > > volatile and the size matches the size of dest, then perhaps it > > > could be transformed into = {} (if there are no paddings/bitfields > > > or if it doesn't matter), I haven't done that for simplicity indeed. > > > > Yes, please add a comment so a curious bystander doesn't have to figure > > that out himself. > > So like this (if it passes bootstrap/regtest)? Yes. Thanks, Richard. > 2016-12-16 Jakub Jelinek <jakub@redhat.com> > > PR c/78408 > * tree-ssa-ccp.c: Include tree-dfa.h. > (optimize_memcpy): New function. > (pass_fold_builtins::execute): Use it. Remove useless conditional > break after BUILT_IN_VA_*. > > * gcc.dg/pr78408-1.c: New test. > * gcc.dg/pr78408-2.c: New test. > > --- gcc/tree-ssa-ccp.c.jj 2016-12-16 11:24:31.495987909 +0100 > +++ gcc/tree-ssa-ccp.c 2016-12-16 14:41:22.572861225 +0100 > @@ -143,6 +143,7 @@ along with GCC; see the file COPYING3. 
> #include "stor-layout.h" > #include "optabs-query.h" > #include "tree-ssa-ccp.h" > +#include "tree-dfa.h" > > /* Possible lattice values. */ > typedef enum > @@ -2933,6 +2934,119 @@ optimize_atomic_bit_test_and (gimple_stm > release_ssa_name (lhs); > } > > +/* Optimize > + a = {}; > + b = a; > + into > + a = {}; > + b = {}; > + Similarly for memset (&a, ..., sizeof (a)); instead of a = {}; > + and/or memcpy (&b, &a, sizeof (a)); instead of b = a; */ > + > +static void > +optimize_memcpy (gimple_stmt_iterator *gsip, tree dest, tree src, tree len) > +{ > + gimple *stmt = gsi_stmt (*gsip); > + if (gimple_has_volatile_ops (stmt)) > + return; > + > + tree vuse = gimple_vuse (stmt); > + if (vuse == NULL) > + return; > + > + gimple *defstmt = SSA_NAME_DEF_STMT (vuse); > + tree src2 = NULL_TREE, len2 = NULL_TREE; > + HOST_WIDE_INT offset, offset2; > + tree val = integer_zero_node; > + if (gimple_store_p (defstmt) > + && gimple_assign_single_p (defstmt) > + && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR > + && !gimple_clobber_p (defstmt)) > + src2 = gimple_assign_lhs (defstmt); > + else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET) > + && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR > + && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST) > + { > + src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0); > + len2 = gimple_call_arg (defstmt, 2); > + val = gimple_call_arg (defstmt, 1); > + /* For non-0 val, we'd have to transform stmt from assignment > + into memset (only if dest is addressable). */ > + if (!integer_zerop (val) && is_gimple_assign (stmt)) > + src2 = NULL_TREE; > + } > + > + if (src2 == NULL_TREE) > + return; > + > + if (len == NULL_TREE) > + len = (TREE_CODE (src) == COMPONENT_REF > + ? DECL_SIZE_UNIT (TREE_OPERAND (src, 1)) > + : TYPE_SIZE_UNIT (TREE_TYPE (src))); > + if (len2 == NULL_TREE) > + len2 = (TREE_CODE (src2) == COMPONENT_REF > + ? 
DECL_SIZE_UNIT (TREE_OPERAND (src2, 1)) > + : TYPE_SIZE_UNIT (TREE_TYPE (src2))); > + if (len == NULL_TREE > + || TREE_CODE (len) != INTEGER_CST > + || len2 == NULL_TREE > + || TREE_CODE (len2) != INTEGER_CST) > + return; > + > + src = get_addr_base_and_unit_offset (src, &offset); > + src2 = get_addr_base_and_unit_offset (src2, &offset2); > + if (src == NULL_TREE > + || src2 == NULL_TREE > + || offset < offset2) > + return; > + > + if (!operand_equal_p (src, src2, 0)) > + return; > + > + /* [ src + offset2, src + offset2 + len2 - 1 ] is set to val. > + Make sure that > + [ src + offset, src + offset + len - 1 ] is a subset of that. */ > + if (wi::to_offset (len) + (offset - offset2) > wi::to_offset (len2)) > + return; > + > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "Simplified\n "); > + print_gimple_stmt (dump_file, stmt, 0, dump_flags); > + fprintf (dump_file, "after previous\n "); > + print_gimple_stmt (dump_file, defstmt, 0, dump_flags); > + } > + > + /* For simplicity, don't change the kind of the stmt, > + turn dest = src; into dest = {}; and memcpy (&dest, &src, len); > + into memset (&dest, val, len); > + In theory we could change dest = src into memset if dest > + is addressable (maybe beneficial if val is not 0), or > + memcpy (&dest, &src, len) into dest = {} if len is the size > + of dest, dest isn't volatile. */ > + if (is_gimple_assign (stmt)) > + { > + tree ctor = build_constructor (TREE_TYPE (dest), NULL); > + gimple_assign_set_rhs_from_tree (gsip, ctor); > + update_stmt (stmt); > + } > + else /* If stmt is memcpy, transform it into memset. 
*/ > + { > + gcall *call = as_a <gcall *> (stmt); > + tree fndecl = builtin_decl_implicit (BUILT_IN_MEMSET); > + gimple_call_set_fndecl (call, fndecl); > + gimple_call_set_fntype (call, TREE_TYPE (fndecl)); > + gimple_call_set_arg (call, 1, val); > + update_stmt (stmt); > + } > + > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "into\n "); > + print_gimple_stmt (dump_file, stmt, 0, dump_flags); > + } > +} > + > /* A simple pass that attempts to fold all builtin functions. This pass > is run after we've propagated as many constants as we can. */ > > @@ -2999,6 +3113,9 @@ pass_fold_builtins::execute (function *f > continue; > } > } > + else if (gimple_assign_load_p (stmt) && gimple_store_p (stmt)) > + optimize_memcpy (&i, gimple_assign_lhs (stmt), > + gimple_assign_rhs1 (stmt), NULL_TREE); > gsi_next (&i); > continue; > } > @@ -3114,14 +3231,25 @@ pass_fold_builtins::execute (function *f > false, false); > break; > > + case BUILT_IN_MEMCPY: > + if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL) > + && TREE_CODE (gimple_call_arg (stmt, 0)) == ADDR_EXPR > + && TREE_CODE (gimple_call_arg (stmt, 1)) == ADDR_EXPR > + && TREE_CODE (gimple_call_arg (stmt, 2)) == INTEGER_CST) > + { > + tree dest = TREE_OPERAND (gimple_call_arg (stmt, 0), 0); > + tree src = TREE_OPERAND (gimple_call_arg (stmt, 1), 0); > + tree len = gimple_call_arg (stmt, 2); > + optimize_memcpy (&i, dest, src, len); > + } > + break; > + > case BUILT_IN_VA_START: > case BUILT_IN_VA_END: > case BUILT_IN_VA_COPY: > /* These shouldn't be folded before pass_stdarg. 
*/ > result = optimize_stdarg_builtin (stmt); > - if (result) > - break; > - /* FALLTHRU */ > + break; > > default:; > } > --- gcc/testsuite/gcc.dg/pr78408-1.c.jj 2016-12-16 14:33:38.890063632 +0100 > +++ gcc/testsuite/gcc.dg/pr78408-1.c 2016-12-16 14:33:38.890063632 +0100 > @@ -0,0 +1,88 @@ > +/* PR c/78408 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-fab1-details" } */ > +/* { dg-final { scan-tree-dump-times "after previous" 17 "fab1" } } */ > + > +struct S { char a[32]; }; > +struct T { char a[65536]; }; > +void bar (int, struct S *, struct S *, struct T *, struct T *); > +void baz (char *, char *); > + > +void > +f1 (void) > +{ > + struct S a, b; > + struct T c, d; > + a = b = (struct S) {}; > + c = d = (struct T) {}; > + bar (1, &a, &b, &c, &d); > +} > + > +void > +f2 (void) > +{ > + struct S a, b; > + struct T c, d; > + b = (struct S) {}; > + a = b; > + d = (struct T) {}; > + c = d; > + bar (2, &a, &b, &c, &d); > +} > + > +void > +f3 (void) > +{ > + struct S a, b; > + struct T c, d; > + __builtin_memset (&b, 0, sizeof (b)); > + a = b; > + __builtin_memset (&d, 0, sizeof (d)); > + c = d; > + bar (3, &a, &b, &c, &d); > +} > + > + > +void > +f4 (void) > +{ > + struct S a, b; > + struct T c, d; > + b = (struct S) {}; > + __builtin_memcpy (&a, &b, sizeof (b)); > + d = (struct T) {}; > + __builtin_memcpy (&c, &d, sizeof (d)); > + bar (4, &a, &b, &c, &d); > +} > + > +void > +f5 (void) > +{ > + struct S a, b; > + struct T c, d; > + __builtin_memset (&b, 0, sizeof (b)); > + __builtin_memcpy (&a, &b, sizeof (b)); > + __builtin_memset (&d, 0, sizeof (d)); > + __builtin_memcpy (&c, &d, sizeof (d)); > + bar (5, &a, &b, &c, &d); > +} > + > +void > +f6 (void) > +{ > + struct S a, b, e, g; > + struct T c, d, f, h; > + g = e = a = b = (struct S) {}; > + h = f = c = d = (struct T) {}; > + bar (6, &a, &b, &c, &d); > + bar (6, &e, &g, &f, &h); > +} > + > +void > +f7 (void) > +{ > + char a[64], b[64]; > + __builtin_memset (a + 13, 2, 27); > + __builtin_memcpy 
(b + 4, a + 17, 23); > + baz (a, b); > +} > --- gcc/testsuite/gcc.dg/pr78408-2.c.jj 2016-12-16 14:33:38.890063632 +0100 > +++ gcc/testsuite/gcc.dg/pr78408-2.c 2016-12-16 14:33:38.890063632 +0100 > @@ -0,0 +1,39 @@ > +/* PR c/78408 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-fab1-details" } */ > +/* { dg-final { scan-tree-dump-not "after previous" "fab1" } } */ > + > +struct S { char a[32]; }; > +struct T { char a[65536]; }; > +void bar (int, struct S *, struct S *, struct T *, struct T *); > +void baz (char *, char *); > + > +void > +f1 (void) > +{ > + struct S a, b; > + struct T c, d; > + __builtin_memset (&b, 2, sizeof (b)); > + a = b; > + __builtin_memset (&d, 3, sizeof (d)); > + c = d; > + bar (3, &a, &b, &c, &d); > +} > + > +void > +f2 (void) > +{ > + char a[64], b[64]; > + __builtin_memset (a + 13, 2, 27); > + __builtin_memcpy (b + 4, a + 17, 24); > + baz (a, b); > +} > + > +void > +f3 (void) > +{ > + char a[64], b[64]; > + __builtin_memset (a + 13, 2, 27); > + __builtin_memcpy (b + 4, a + 12, 5); > + baz (a, b); > +} > > > Jakub > > -- Richard Biener <rguenther@suse.de> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408, take 2) 2016-12-16 12:53 ` Richard Biener 2016-12-16 13:57 ` Jakub Jelinek @ 2016-12-19 20:24 ` Jeff Law 1 sibling, 0 replies; 11+ messages in thread From: Jeff Law @ 2016-12-19 20:24 UTC (permalink / raw) To: Richard Biener, Jakub Jelinek; +Cc: gcc-patches On 12/16/2016 05:50 AM, Richard Biener wrote: >> + gimple *defstmt = SSA_NAME_DEF_STMT (vuse); >> + tree src2 = NULL_TREE, len2 = NULL_TREE; >> + HOST_WIDE_INT offset, offset2; >> + tree val = integer_zero_node; >> + if (gimple_store_p (defstmt) >> + && gimple_assign_single_p (defstmt) >> + && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR >> + && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (defstmt)) == 0 > > Should be always true for stores from constructors. Seems like I should drop similar checks from the in-progress DSE changes. I thought I saw something in SRA to cover when this wasn't true as well that could probably be simplified. jeff ^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2016-12-19 20:17 UTC | newest] Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2016-11-25 19:32 [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408) Jakub Jelinek 2016-11-28 9:50 ` Richard Biener 2016-12-13 11:36 ` Jakub Jelinek 2016-12-14 12:29 ` Richard Biener 2016-12-15 16:44 ` [PATCH] Optimiza aggregate a = b = c = {} (PR c/78408, take 2) Jakub Jelinek 2016-12-16 12:53 ` Richard Biener 2016-12-16 13:57 ` Jakub Jelinek 2016-12-16 13:17 ` Richard Biener 2016-12-16 14:02 ` Jakub Jelinek 2016-12-16 14:15 ` Richard Biener 2016-12-19 20:24 ` Jeff Law
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).