Date: Tue, 16 Aug 2022 13:44:11 +0000 (UTC)
From: Richard Biener
To: Aldy Hernandez
Cc: Andrew MacLeod, Jeff Law, gcc-patches
Subject: Re: [PATCH] Support threading of just the exit edge

On Tue, 16 Aug 2022, Richard Biener wrote:

> On Tue, 16 Aug 2022, Aldy Hernandez wrote:
> 
> > On Tue, Aug 16, 2022 at 11:18 AM Richard Biener wrote:
> > >
> > > On Mon, 15 Aug 2022, Aldy Hernandez wrote:
> > >
> > > > On Mon, Aug 15, 2022 at 9:24 PM Andrew MacLeod wrote:
> > > > >
> > > > > heh. or just
> > > > >
> > > > > +  int_range<2> r;
> > > > > +  if (!fold_range (r, const_cast <gcond *> (cond_stmt))
> > > > > +      || !r.singleton_p (&val))
> > > > >
> > > > > if you do not provide a range_query to any of the fold_using_range
> > > > > code, it defaults to:
> > > > >
> > > > > fur_source::fur_source (range_query *q)
> > > > > {
> > > > >   if (q)
> > > > >     m_query = q;
> > > > >   else if (cfun)
> > > > >     m_query = get_range_query (cfun);
> > > > >   else
> > > > >     m_query = get_global_range_query ();
> > > > >   m_gori = NULL;
> > > > > }
> > > > >
> > > >
> > > > Sweet.  Even better!
> > >
> > > So when I do the following incremental change on top of the posted
> > > patch, I see that the path query is able to simplify more
> > > "single BB paths" than the global range folding.
> > >
> > > diff --git a/gcc/tree-ssa-threadbackward.cc b/gcc/tree-ssa-threadbackward.cc
> > > index 669098e4ec3..777e778037f 100644
> > > --- a/gcc/tree-ssa-threadbackward.cc
> > > +++ b/gcc/tree-ssa-threadbackward.cc
> > > @@ -314,6 +314,12 @@ back_threader::find_taken_edge_cond (const vec<basic_block> &path,
> > >  {
> > >    int_range_max r;
> > >
> > > +  int_range<2> rf;
> > > +  if (path.length () == 1)
> > > +    {
> > > +      fold_range (rf, cond);
> > > +    }
> > > +
> > >    m_solver->compute_ranges (path, m_imports);
> > >    m_solver->range_of_stmt (r, cond);
> > >
> > > @@ -325,6 +331,8 @@ back_threader::find_taken_edge_cond (const vec<basic_block> &path,
> > >
> > >    if (r == true_range || r == false_range)
> > >      {
> > > +      if (path.length () == 1)
> > > +        gcc_assert (r == rf);
> > >        edge e_true, e_false;
> > >        basic_block bb = gimple_bb (cond);
> > >        extract_true_false_edges_from_block (bb, &e_true, &e_false);
> > >
> > > Even doing the following (not sure what's the difference and in
> > > particular expense over the path range query) results in missed
> > > simplifications (checking my set of cc1files).
> > >
> > > diff --git a/gcc/tree-ssa-threadbackward.cc b/gcc/tree-ssa-threadbackward.cc
> > > index 669098e4ec3..1d43a179d08 100644
> > > --- a/gcc/tree-ssa-threadbackward.cc
> > > +++ b/gcc/tree-ssa-threadbackward.cc
> > > @@ -99,6 +99,7 @@ private:
> > >
> > >    back_threader_registry m_registry;
> > >    back_threader_profitability m_profit;
> > > +  gimple_ranger *m_ranger;
> > >    path_range_query *m_solver;
> > >
> > >    // Current path being analyzed.
> > > @@ -146,12 +147,14 @@ back_threader::back_threader (function *fun, unsigned flags, bool first)
> > >    // The path solver needs EDGE_DFS_BACK in resolving mode.
> > >    if (flags & BT_RESOLVE)
> > >      mark_dfs_back_edges ();
> > > -  m_solver = new path_range_query (flags & BT_RESOLVE);
> > > +  m_ranger = new gimple_ranger;
> > > +  m_solver = new path_range_query (flags & BT_RESOLVE, m_ranger);
> > >  }
> >
> > Passing an allocated ranger here results in fewer simplifications than
> > letting path_range_query allocate its own?  That's not right.  Or do
> > you mean that using fold_range() with the m_ranger causes ICEs with
> > your patch (due to the non-null processing described below)?
>
> Yes, I needed a ranger to use fold_range (..., m_ranger), which
> I thought might do more than not passing one.
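
For reference, the difference between the two calls is only in which
range_query the fur_source ends up using; a minimal sketch of the two
fold_range variants discussed above (stmt and val are placeholder
names, not taken from the patch):

  // Given some gimple *stmt, e.g. the gcond being folded:
  int_range<2> r;
  tree val = NULL_TREE;

  // Variant 1: no query passed.  fur_source falls back to
  // get_range_query (cfun), i.e. an enabled ranger if there is one,
  // otherwise the global range query.
  if (fold_range (r, stmt) && r.singleton_p (&val))
    ;  // stmt folds to the constant val

  // Variant 2: explicit query.  Folding consults this ranger's
  // context-sensitive ranges instead.
  gimple_ranger ranger;
  if (fold_range (r, stmt, &ranger) && r.singleton_p (&val))
    ;  // likewise, potentially folding more exit tests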
> > >  back_threader::~back_threader ()
> > >  {
> > >    delete m_solver;
> > > +  delete m_ranger;
> > >
> > >    loop_optimizer_finalize ();
> > >  }
> > > @@ -314,6 +317,12 @@ back_threader::find_taken_edge_cond (const vec<basic_block> &path,
> > >  {
> > >    int_range_max r;
> > >
> > > +  int_range<2> rf;
> > > +  if (path.length () == 1)
> > > +    {
> > > +      fold_range (rf, cond, m_ranger);
> > > +    }
> > > +
> > >    m_solver->compute_ranges (path, m_imports);
> > >    m_solver->range_of_stmt (r, cond);
> > >
> > > @@ -325,6 +334,8 @@ back_threader::find_taken_edge_cond (const vec<basic_block> &path,
> > >
> > >    if (r == true_range || r == false_range)
> > >      {
> > > +      if (path.length () == 1)
> > > +        gcc_assert (r == rf);
> > >        edge e_true, e_false;
> > >        basic_block bb = gimple_bb (cond);
> > >        extract_true_false_edges_from_block (bb, &e_true, &e_false);
> > >
> > > one example is
> > >
> > >  <bb 2> [local count: 14414059]:
> > >  _128 = node_177(D)->typed.type;
> > >  pretmp_413 = MEM[(const union tree_node *)_128].base.code;
> > >  _431 = pretmp_413 + 65519;
> > >  if (_128 == 0B)
> > >    goto <bb 3>; [18.09%]
> > >  else
> > >    goto <bb 4>; [81.91%]
> > >
> > > where m_imports for the path is just _128 and the range computed is
> > > false, while the ranger query returns VARYING.  But
> > > path_range_query::range_defined_in_block does
> > >
> > >   if (bb && POINTER_TYPE_P (TREE_TYPE (name)))
> > >     m_ranger->m_cache.m_exit.maybe_adjust_range (r, name, bb);
> > >
> > > which adjusts the range to ~[0, 0], probably because of the
> > > dereference in the following stmt.
> > >
> > > Why does fold_range not do this when folding the exit test?  Is there
> > > a way to make it do so?  It looks like the only routine using this
> > > in gimple-range.cc is range_on_edge, and there it's used for e->src
> > > after calling range_on_exit for e->src (why's it not done in
> > > range_on_exit?).  A testcase for this is
> >
> > Andrew's gonna have to answer this one, because I'm just a user of the
> > infer_range infrastructure.  But yes, you're right... fold_range
> > doesn't seem to take into account side-effects such as non-null.
>
> OK, let's see.  I can probably use the path query as well - I'll
> try to produce a prototype of doing those simplifications early,
> avoiding threadings there and through unreachable parts of the CFG.

Something like the following - it processes the CFG first, simplifying
BB conditionals and marking edges so we can later avoid threading along
unreachable paths.  It will no longer count those simplifications as
threaded jumps.

It's basically like an EVRP pass on steroids (because the path query
does more than ranger folding of the conditional), but only simplifying
conditionals.  Ideally just using ranger, or even better
fold_stmt (..., m_ranger) plus find_taken_edge (), would work here.

Especially for ethread, doing such simplification seems worthwhile
since EVRP only runs later.  Likewise the first threading pass after
inlining runs before the first VRP pass (and VRP isn't enabled at -O1,
but threading is).

That said, what triggered this is seeing that the backwards threader
needs the path 0 -> 2 (aka ENTRY_BLOCK -> 2) to simplify the
conditional in BB2, and that it will also copy BB2 for this even though
there's only a single entry into BB2.  I suppose we could say we don't
want to thread those, thus reject any threading attempt with an entry
edge into a block with a single predecessor?  Or optimize this case in
the copier, not copying such blocks?  I'm somewhat undecided what's the
correct thing to do here.

Richard.
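
For completeness, a reduced C input exhibiting the pattern from the
GIMPLE dump above might look like the following (a reconstructed
sketch, with made-up struct names; this is not the testcase referenced
in the mail).  The load through t lets ranger infer t != 0 on exit from
the block, so the subsequent null test should fold to false:

  struct base { int code; };
  struct node { struct base base; struct node *type; };

  int
  foo (struct node *n)
  {
    struct node *t = n->type;
    int code = t->base.code;	/* dereference infers t != 0 ...  */
    if (t == 0)			/* ... so this should fold to false  */
      return -1;
    return code;
  }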
diff --git a/gcc/gimple-range-path.cc b/gcc/gimple-range-path.cc
index c99d77dd340..782794d3825 100644
--- a/gcc/gimple-range-path.cc
+++ b/gcc/gimple-range-path.cc
@@ -220,7 +220,6 @@ path_range_query::unreachable_path_p ()
 void
 path_range_query::set_path (const vec<basic_block> &path)
 {
-  gcc_checking_assert (path.length () > 1);
   m_path = path.copy ();
   m_pos = m_path.length () - 1;
   bitmap_clear (m_has_cache_entry);
diff --git a/gcc/tree-ssa-threadbackward.cc b/gcc/tree-ssa-threadbackward.cc
index 1c362839fab..3ca8c6eb9da 100644
--- a/gcc/tree-ssa-threadbackward.cc
+++ b/gcc/tree-ssa-threadbackward.cc
@@ -91,9 +91,9 @@ private:
   edge maybe_register_path ();
   void maybe_register_path_dump (edge taken_edge);
   void find_paths_to_names (basic_block bb, bitmap imports, unsigned);
-  edge find_taken_edge (const vec<basic_block> &path);
-  edge find_taken_edge_cond (const vec<basic_block> &path, gcond *);
-  edge find_taken_edge_switch (const vec<basic_block> &path, gswitch *);
+  edge find_taken_edge (const vec<basic_block> &path, bool = false);
+  edge find_taken_edge_cond (const vec<basic_block> &path, gcond *, bool);
+  edge find_taken_edge_switch (const vec<basic_block> &path, gswitch *, bool);
   virtual void debug ();
   virtual void dump (FILE *out);
@@ -124,6 +124,7 @@ private:
   // each threadfull[12] pass.  This is used to differentiate between
   // the different threading passes so we can set up debug counters.
   bool m_first;
+  auto_edge_flag m_reachable;
 };
 
 // Used to differentiate unreachable edges, so we may stop the search
@@ -132,7 +133,8 @@ const edge back_threader::UNREACHABLE_EDGE = (edge) -1;
 
 back_threader::back_threader (function *fun, unsigned flags, bool first)
   : m_profit (flags & BT_SPEED),
-    m_first (first)
+    m_first (first),
+    m_reachable (fun)
 {
   if (flags & BT_SPEED)
     loop_optimizer_init (LOOPS_HAVE_PREHEADERS | LOOPS_HAVE_SIMPLE_LATCHES);
@@ -265,16 +267,16 @@ back_threader::maybe_register_path ()
 // outgoing edge can be calculated, return NULL.
 
 edge
-back_threader::find_taken_edge (const vec<basic_block> &path)
+back_threader::find_taken_edge (const vec<basic_block> &path, bool modify)
 {
-  gcc_checking_assert (path.length () > 1);
   switch (gimple_code (m_last_stmt))
     {
     case GIMPLE_COND:
-      return find_taken_edge_cond (path, as_a <gcond *> (m_last_stmt));
+      return find_taken_edge_cond (path, as_a <gcond *> (m_last_stmt), modify);
 
     case GIMPLE_SWITCH:
-      return find_taken_edge_switch (path, as_a <gswitch *> (m_last_stmt));
+      return find_taken_edge_switch (path, as_a <gswitch *> (m_last_stmt),
+				     modify);
 
     default:
       return NULL;
@@ -285,7 +287,7 @@ back_threader::find_taken_edge (const vec<basic_block> &path)
 
 edge
 back_threader::find_taken_edge_switch (const vec<basic_block> &path,
-				       gswitch *sw)
+				       gswitch *sw, bool modify)
 {
   tree name = gimple_switch_index (sw);
   int_range_max r;
@@ -303,6 +305,9 @@ back_threader::find_taken_edge_switch (const vec<basic_block> &path,
   if (!label)
     return NULL;
 
+  if (modify)
+    gimple_switch_set_index (sw, CASE_LOW (label));
+
   return find_edge (gimple_bb (sw), label_to_block (cfun, CASE_LABEL (label)));
 }
 
@@ -310,7 +315,7 @@ back_threader::find_taken_edge_switch (const vec<basic_block> &path,
 
 edge
 back_threader::find_taken_edge_cond (const vec<basic_block> &path,
-				     gcond *cond)
+				     gcond *cond, bool modify)
 {
   int_range_max r;
 
@@ -328,6 +333,13 @@ back_threader::find_taken_edge_cond (const vec<basic_block> &path,
       edge e_true, e_false;
       basic_block bb = gimple_bb (cond);
       extract_true_false_edges_from_block (bb, &e_true, &e_false);
+      if (modify)
+	{
+	  if (r == true_range)
+	    gimple_cond_make_true (cond);
+	  else
+	    gimple_cond_make_false (cond);
+	}
       return r == true_range ? e_true : e_false;
     }
   return NULL;
@@ -452,6 +464,7 @@ back_threader::find_paths_to_names (basic_block bb, bitmap interesting,
   FOR_EACH_EDGE (e, iter, bb->preds)
     {
       if (e->flags & EDGE_ABNORMAL
+	  || !(e->flags & m_reachable)
 	  // This is like path_crosses_loops in profitable_path_p but
 	  // more restrictive to avoid peeling off loop iterations (see
 	  // tree-ssa/pr14341.c for an example).
@@ -948,16 +961,101 @@ unsigned int
 back_threader::thread_blocks ()
 {
   basic_block bb;
+  bool changed = false;
+
+  auto_vec<basic_block> worklist (n_basic_blocks_for_fn (cfun));
+  auto_bb_flag visited (cfun);
+  worklist.quick_push (single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
+  worklist[0]->flags |= visited;
+  while (!worklist.is_empty ())
+    {
+      bb = worklist.pop ();
+
+      // Try to resolve the jump at the end of the block
+      // ???  We'd like to use ranger without a fake path of
+      // length one here, but that turns out to be less powerful
+      gimple *stmt = last_stmt (bb);
+      if (stmt
+	  && (gimple_code (stmt) == GIMPLE_COND
+	      || gimple_code (stmt) == GIMPLE_SWITCH))
+	{
+	  bitmap_clear (m_imports);
+	  ssa_op_iter iter;
+	  tree name;
+	  // could use gori exports here
+	  FOR_EACH_SSA_TREE_OPERAND (name, stmt, iter, SSA_OP_USE)
+	    {
+	      if (gimple_range_ssa_p (name))
+		bitmap_set_bit (m_imports, SSA_NAME_VERSION (name));
+	    }
+	  if (!bitmap_empty_p (m_imports))
+	    {
+	      auto_vec<basic_block> path;
+	      path.quick_push (bb);
+	      m_last_stmt = stmt;
+	      if (edge e = find_taken_edge (path, true))
+		{
+		  if (e == UNREACHABLE_EDGE)
+		    continue;
+		  else
+		    {
+		      // The jump was modified to be trivially
+		      // redirectable by CFG cleanup, so make sure
+		      // that runs
+		      changed = true;
+		      if (!(e->dest->flags & visited))
+			{
+			  worklist.quick_push (e->dest);
+			  e->dest->flags |= visited;
+			  e->flags |= m_reachable;
+			  continue;
+			}
+		    }
+		}
+	    }
+	}
+
+      edge_iterator ei;
+      edge e;
+      FOR_EACH_EDGE (e, ei, bb->succs)
+	{
+	  e->flags |= m_reachable;
+	  if (!(e->dest->flags & visited))
+	    {
+	      worklist.quick_push (e->dest);
+	      e->dest->flags |= visited;
+	      e->flags |= m_reachable;
+	    }
+	}
+    }
 
   FOR_EACH_BB_FN (bb, m_fun)
-    if (EDGE_COUNT (bb->succs) > 1)
-      maybe_thread_block (bb);
+    // visited doubles for reachable
+    if (bb->flags & visited)
+      {
+	bb->flags &= ~visited;
+	unsigned cnt = 0;
+	edge_iterator ei;
+	edge e;
+	FOR_EACH_EDGE (e, ei, bb->succs)
+	  if (e->flags & m_reachable)
+	    cnt++;
+	// check we have a yet unresolved exit jump
+	if (cnt > 1)
+	  maybe_thread_block (bb);
+      }
 
-  bool changed = m_registry.thread_through_all_blocks (true);
+  FOR_EACH_BB_FN (bb, m_fun)
+    {
+      edge_iterator ei;
+      edge e;
+      FOR_EACH_EDGE (e, ei, bb->succs)
+	e->flags &= ~m_reachable;
+    }
 
-  if (m_flags & BT_SPEED)
-    return changed ? TODO_cleanup_cfg : 0;
+  changed |= m_registry.thread_through_all_blocks (true);
 
-  return false;
+  return changed ? TODO_cleanup_cfg : 0;
 }
 
 namespace {
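
For readers tracing the new thread_blocks () above: stripped of the GCC
specifics, it is a standard worklist reachability walk that tries to
fold each block's exit jump the first time the block is reached.  A
self-contained sketch of that shape (hypothetical Block/Edge types, not
GCC code; the UNREACHABLE_EDGE handling is omitted):

  #include <vector>

  struct Edge;
  struct Block
  {
    std::vector<Edge *> succs;
    bool visited = false;	// doubles as "reachable", as in the patch
  };
  struct Edge
  {
    Block *dest;
    bool reachable = false;
  };

  // RESOLVE_JUMP stands in for find_taken_edge (path, true): it returns
  // the single taken edge if the block's exit test folds, else NULL.
  static bool
  walk_and_simplify (Block *entry, Edge *(*resolve_jump) (Block *))
  {
    bool changed = false;
    std::vector<Block *> worklist;
    worklist.push_back (entry);
    entry->visited = true;
    while (!worklist.empty ())
      {
	Block *bb = worklist.back ();
	worklist.pop_back ();
	if (Edge *taken = resolve_jump (bb))
	  {
	    // The jump folded; only the taken edge needs a visit.
	    changed = true;
	    if (!taken->dest->visited)
	      {
		taken->dest->visited = true;
		taken->reachable = true;
		worklist.push_back (taken->dest);
		continue;
	      }
	  }
	// Unresolved jump (or destination already seen): all successor
	// edges are considered reachable.
	for (Edge *e : bb->succs)
	  {
	    e->reachable = true;
	    if (!e->dest->visited)
	      {
		e->dest->visited = true;
		worklist.push_back (e->dest);
	      }
	  }
      }
    return changed;
  }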