Hi Richard, On Tue, 23 May 2023 at 11:55, Richard Biener via Gcc-patches < gcc-patches@gcc.gnu.org> wrote: > The following fixes code hoisting to properly consider ANTIC_OUT instead > of ANTIC_IN. That's a bit expensive to re-compute but since we no > longer iterate we're doing this only once per BB which should be > acceptable. This avoids missing hoistings to the end of blocks where > something in the block clobbers the hoisted value. > > Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed. > > PR tree-optimization/109849 > * tree-ssa-pre.cc (do_hoist_insertion): Compute ANTIC_OUT > and use that to determine what to hoist. > > * gcc.dg/tree-ssa/ssa-hoist-8.c: New testcase. > This patch causes a regression on aarch64: gcc.target/aarch64/sve/fmla_2.c: \\tst1d found 3 times FAIL: gcc.target/aarch64/sve/fmla_2.c scan-assembler-times \\tst1d 2 We used to generate: mov x6, 0 mov w7, 55 whilelo p7.d, wzr, w7 .p2align 3,,7 .L2: ld1d z30.d, p7/z, [x5, x6, lsl 3] ld1d z31.d, p7/z, [x4, x6, lsl 3] cmpne p6.d, p7/z, z30.d, #0 ld1d z30.d, p7/z, [x3, x6, lsl 3] ld1d z29.d, p6/z, [x2, x6, lsl 3] movprfx z28, z30 fmla z28.d, p6/m, z31.d, z29.d fmla z31.d, p6/m, z30.d, z29.d st1d z28.d, p7, [x1, x6, lsl 3] st1d z31.d, p7, [x0, x6, lsl 3] incd x6 whilelo p7.d, w6, w7 b.any .L2 But now: mov x6, 0 mov w7, 55 ptrue p4.b, all whilelo p7.d, wzr, w7 .p2align 3,,7 .L2: ld1d z30.d, p7/z, [x5, x6, lsl 3] ld1d z31.d, p7/z, [x4, x6, lsl 3] cmpne p6.d, p7/z, z30.d, #0 cmpeq p5.d, p7/z, z30.d, #0 ld1d z29.d, p6/z, [x2, x6, lsl 3] ld1d z28.d, p6/z, [x3, x6, lsl 3] ld1d z30.d, p5/z, [x3, x6, lsl 3] movprfx z27, z31 fmla z27.d, p4/m, z29.d, z28.d movprfx z30.d, p6/m, z28.d fmla z30.d, p6/m, z31.d, z29.d st1d z27.d, p6, [x0, x6, lsl 3] st1d z30.d, p7, [x1, x6, lsl 3] st1d z31.d, p5, [x0, x6, lsl 3] incd x6 whilelo p7.d, w6, w7 b.any .L2 Christophe --- > gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c | 22 ++++++++++ > gcc/tree-ssa-pre.cc | 48 ++++++++++++++++++--- > 2 files changed, 64 insertions(+), 6 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c > b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c > new file mode 100644 > index 00000000000..66bb48e0dc1 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c > @@ -0,0 +1,22 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-pre-stats" } */ > + > +int mem; > +void foo (); > +int bar (int flag) > +{ > + int res; > + foo (); > + /* Hoist the load of mem here even though foo () clobbers it. */ > + if (flag) > + res = mem; > + else > + { > + res = mem; > + mem = 2; > + } > + return res; > +} > + > +/* { dg-final { scan-tree-dump "HOIST inserted: 1" "pre" } } */ > +/* { dg-final { scan-tree-dump-times " = mem;" 1 "pre" } } */ > diff --git a/gcc/tree-ssa-pre.cc b/gcc/tree-ssa-pre.cc > index 1f7eea93c16..d56431b4145 100644 > --- a/gcc/tree-ssa-pre.cc > +++ b/gcc/tree-ssa-pre.cc > @@ -3622,19 +3622,51 @@ do_hoist_insertion (basic_block block) > && stmt_ends_bb_p (gsi_stmt (last))) > return false; > > - /* Compute the set of hoistable expressions from ANTIC_IN. First > compute > + /* We have multiple successors, compute ANTIC_OUT by taking the > intersection > + of all of ANTIC_IN translating through PHI nodes. Note we do not > have to > + worry about iteration stability here so just intersect the > expression sets > + as well. This is a simplification of what we do in > compute_antic_aux. */ > + bitmap_set_t ANTIC_OUT = bitmap_set_new (); > + bool first = true; > + FOR_EACH_EDGE (e, ei, block->succs) > + { > + if (first) > + { > + phi_translate_set (ANTIC_OUT, ANTIC_IN (e->dest), e); > + first = false; > + } > + else if (!gimple_seq_empty_p (phi_nodes (e->dest))) > + { > + bitmap_set_t tmp = bitmap_set_new (); > + phi_translate_set (tmp, ANTIC_IN (e->dest), e); > + bitmap_and_into (&ANTIC_OUT->values, &tmp->values); > + bitmap_and_into (&ANTIC_OUT->expressions, &tmp->expressions); > + bitmap_set_free (tmp); > + } > + else > + { > + bitmap_and_into (&ANTIC_OUT->values, &ANTIC_IN > (e->dest)->values); > + bitmap_and_into (&ANTIC_OUT->expressions, > + &ANTIC_IN (e->dest)->expressions); > + } > + } > + > + /* Compute the set of hoistable expressions from ANTIC_OUT. First > compute > hoistable values. */ > bitmap_set hoistable_set; > > - /* A hoistable value must be in ANTIC_IN(block) > + /* A hoistable value must be in ANTIC_OUT(block) > but not in AVAIL_OUT(BLOCK). */ > bitmap_initialize (&hoistable_set.values, &grand_bitmap_obstack); > bitmap_and_compl (&hoistable_set.values, > - &ANTIC_IN (block)->values, &AVAIL_OUT (block)->values); > + &ANTIC_OUT->values, &AVAIL_OUT (block)->values); > > /* Short-cut for a common case: hoistable_set is empty. */ > if (bitmap_empty_p (&hoistable_set.values)) > - return false; > + { > + bitmap_set_free (ANTIC_OUT); > + return false; > + } > > /* Compute which of the hoistable values is in AVAIL_OUT of > at least one of the successors of BLOCK. */ > @@ -3652,11 +3684,14 @@ do_hoist_insertion (basic_block block) > > /* Short-cut for a common case: availout_in_some is empty. */ > if (bitmap_empty_p (&availout_in_some)) > - return false; > + { > + bitmap_set_free (ANTIC_OUT); > + return false; > + } > > /* Hack hoitable_set in-place so we can use > sorted_array_from_bitmap_set. */ > bitmap_move (&hoistable_set.values, &availout_in_some); > - hoistable_set.expressions = ANTIC_IN (block)->expressions; > + hoistable_set.expressions = ANTIC_OUT->expressions; > > /* Now finally construct the topological-ordered expression set. */ > vec exprs = sorted_array_from_bitmap_set (&hoistable_set); > @@ -3731,6 +3766,7 @@ do_hoist_insertion (basic_block block) > } > > exprs.release (); > + bitmap_set_free (ANTIC_OUT); > > return new_stuff; > } > -- > 2.35.3 >