diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc index 97879498db46dd3c34181ae9aa6e5476004dd5b5..d790ce5fffab3aa3dfc40d833a968314a4442b9e 100644 --- a/gcc/tree-loop-distribution.cc +++ b/gcc/tree-loop-distribution.cc @@ -948,7 +948,7 @@ copy_loop_before (class loop *loop, bool redirect_lc_phi_defs) edge preheader = loop_preheader_edge (loop); initialize_original_copy_tables (); - res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader); + res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader, false); gcc_assert (res != NULL); /* When a not last partition is supposed to keep the LC PHIs computed diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc index 5d398b67e68c7076760854119590f18b19c622b6..79686f6c4945b7139ba377300430c04b7aeefe6c 100644 --- a/gcc/tree-ssa-loop-niter.cc +++ b/gcc/tree-ssa-loop-niter.cc @@ -3072,7 +3072,12 @@ loop_only_exit_p (const class loop *loop, basic_block *body, const_edge exit) gimple_stmt_iterator bsi; unsigned i; - if (exit != single_exit (loop)) + /* We need to check for alternative exits since exit can be NULL. */ + auto exits = get_loop_exit_edges (loop); + if (exits.length () != 1) + return false; + + if (exit != exits[0]) return false; for (i = 0; i < loop->num_nodes; i++) diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index 6b93fb3f9af8f2bbdf5dec28f0009177aa5171ab..550d7f40002cf0b58f8a927cb150edd7c2aa9999 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -252,6 +252,9 @@ adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def) { tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e); + gcc_assert (TREE_CODE (orig_def) != SSA_NAME + || orig_def != new_def); + SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def); if (MAY_HAVE_DEBUG_BIND_STMTS) @@ -1292,7 +1295,8 @@ vect_set_loop_condition_normal (loop_vec_info loop_vinfo, gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT); /* Record the number of latch iterations. */ - if (limit == niters) + if (limit == niters + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) /* Case A: the loop iterates NITERS times. Subtract one to get the latch count. */ loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters, @@ -1303,7 +1307,13 @@ vect_set_loop_condition_normal (loop_vec_info loop_vinfo, loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type, limit, step); - if (final_iv) + /* For multiple exits we've already maintained LCSSA form and handled + the scalar iteration update in the code that deals with the merge + block and its updated guard. I could move that code here instead + of in vect_update_ivs_after_early_break but I have to still deal + with the updates to the counter `i`. So for now I'll keep them + together. */ + if (final_iv && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) { gassign *assign; edge exit = LOOP_VINFO_IV_EXIT (loop_vinfo); @@ -1509,11 +1519,19 @@ vec_init_exit_info (class loop *loop) on E which is either the entry or exit of LOOP. If SCALAR_LOOP is non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the basic blocks from SCALAR_LOOP instead of LOOP, but to either the - entry or exit of LOOP. */ + entry or exit of LOOP. If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a + continuation. This is correct for cases where one loop continues from the + other like in the vectorizer, but not true for uses in e.g. loop distribution + where the loop is duplicated and then modified. + + If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms + dominators were updated during the peeling. */ class loop * slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, - class loop *scalar_loop, edge e) + class loop *scalar_loop, edge e, + bool flow_loops, + vec *updated_doms) { class loop *new_loop; basic_block *new_bbs, *bbs, *pbbs; @@ -1602,6 +1620,19 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++) rename_variables_in_bb (new_bbs[i], duplicate_outer_loop); + /* Rename the exit uses. */ + for (edge exit : get_loop_exit_edges (new_loop)) + for (auto gsi = gsi_start_phis (exit->dest); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit); + rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit)); + if (MAY_HAVE_DEBUG_BIND_STMTS) + adjust_debug_stmts (orig_def, PHI_RESULT (gsi.phi ()), exit->dest); + } + + /* This condition happens when the loop has been versioned. e.g. due to ifcvt + versioning the loop. */ if (scalar_loop != loop) { /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from @@ -1616,28 +1647,106 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, EDGE_SUCC (loop->latch, 0)); } + vec alt_exits = loop->vec_loop_alt_exits; + bool multiple_exits_p = !alt_exits.is_empty (); + auto_vec doms; + class loop *update_loop = NULL; + if (at_exit) /* Add the loop copy at exit. */ { - if (scalar_loop != loop) + if (scalar_loop != loop && new_exit->dest != exit_dest) { - gphi_iterator gsi; new_exit = redirect_edge_and_branch (new_exit, exit_dest); + flush_pending_stmts (new_exit); + } - for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi); - gsi_next (&gsi)) + auto loop_exits = get_loop_exit_edges (loop); + for (edge exit : loop_exits) + redirect_edge_and_branch (exit, new_preheader); + + + /* Copy the current loop LC PHI nodes between the original loop exit + block and the new loop header. This allows us to later split the + preheader block and still find the right LC nodes. */ + edge latch_new = single_succ_edge (new_preheader); + edge latch_old = loop_latch_edge (loop); + hash_set lcssa_vars; + for (auto gsi_from = gsi_start_phis (latch_old->dest), + gsi_to = gsi_start_phis (latch_new->dest); + flow_loops && !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to); + gsi_next (&gsi_from), gsi_next (&gsi_to)) + { + gimple *from_phi = gsi_stmt (gsi_from); + gimple *to_phi = gsi_stmt (gsi_to); + tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, latch_old); + /* In all cases, even in early break situations we're only + interested in the number of fully executed loop iters. As such + we discard any partially done iteration. So we simply propagate + the phi nodes from the latch to the merge block. */ + tree new_res = copy_ssa_name (gimple_phi_result (from_phi)); + gphi *lcssa_phi = create_phi_node (new_res, e->dest); + + lcssa_vars.add (new_arg); + + /* Main loop exit should use the final iter value. */ + add_phi_arg (lcssa_phi, new_arg, loop->vec_loop_iv, UNKNOWN_LOCATION); + + /* All other exits use the previous iters. */ + for (edge e : alt_exits) + add_phi_arg (lcssa_phi, gimple_phi_result (from_phi), e, + UNKNOWN_LOCATION); + + adjust_phi_and_debug_stmts (to_phi, latch_new, new_res); + } + + /* Copy over any live SSA vars that may not have been materialized in the + loops themselves but would be in the exit block. However when the live + value is not used inside the loop then we don't need to do this, if we do + then when we split the guard block the branch edge can end up containing the + wrong reference, particularly if it shares an edge with something that has + bypassed the loop. This is not something peeling can check so we need to + anticipate the usage of the live variable here. */ + auto exit_map = redirect_edge_var_map_vector (exit); + if (exit_map) + for (auto vm : exit_map) + { + if (lcssa_vars.contains (vm.def) + || TREE_CODE (vm.def) != SSA_NAME) + continue; + + imm_use_iterator imm_iter; + use_operand_p use_p; + bool use_in_loop = false; + + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, vm.def) { - gphi *phi = gsi.phi (); - tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e); - location_t orig_locus - = gimple_phi_arg_location_from_edge (phi, e); + basic_block bb = gimple_bb (USE_STMT (use_p)); + if (flow_bb_inside_loop_p (loop, bb) + && !gimple_vuse (USE_STMT (use_p))) + { + use_in_loop = true; + break; + } + } - add_phi_arg (phi, orig_arg, new_exit, orig_locus); + if (!use_in_loop) + { + /* Do a final check to see if it's perhaps defined in the loop. This + mirrors the relevancy analysis's used_outside_scope. */ + gimple *stmt = SSA_NAME_DEF_STMT (vm.def); + if (!stmt || !flow_bb_inside_loop_p (loop, gimple_bb (stmt))) + continue; } + + tree new_res = copy_ssa_name (vm.result); + gphi *lcssa_phi = create_phi_node (new_res, e->dest); + for (edge exit : loop_exits) + add_phi_arg (lcssa_phi, vm.def, exit, vm.locus); } - redirect_edge_and_branch_force (e, new_preheader); - flush_pending_stmts (e); + set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src); - if (was_imm_dom || duplicate_outer_loop) + + if ((was_imm_dom || duplicate_outer_loop) && !multiple_exits_p) set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src); /* And remove the non-necessary forwarder again. Keep the other @@ -1647,9 +1756,42 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, delete_basic_block (preheader); set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, loop_preheader_edge (scalar_loop)->src); + + /* Finally after wiring the new epilogue we need to update its main exit + to the original function exit we recorded. Other exits are already + correct. */ + if (multiple_exits_p) + { + for (edge e : get_loop_exit_edges (loop)) + doms.safe_push (e->dest); + update_loop = new_loop; + doms.safe_push (exit_dest); + + /* Likely a fall-through edge, so update if needed. */ + if (single_succ_p (exit_dest)) + doms.safe_push (single_succ (exit_dest)); + } } else /* Add the copy at entry. */ { + /* Copy the current loop LC PHI nodes between the original loop exit + block and the new loop header. This allows us to later split the + preheader block and still find the right LC nodes. */ + edge old_latch_loop = loop_latch_edge (loop); + edge old_latch_init = loop_preheader_edge (loop); + edge new_latch_loop = loop_latch_edge (new_loop); + edge new_latch_init = loop_preheader_edge (new_loop); + for (auto gsi_from = gsi_start_phis (new_latch_init->dest), + gsi_to = gsi_start_phis (old_latch_loop->dest); + flow_loops && !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to); + gsi_next (&gsi_from), gsi_next (&gsi_to)) + { + gimple *from_phi = gsi_stmt (gsi_from); + gimple *to_phi = gsi_stmt (gsi_to); + tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, new_latch_loop); + adjust_phi_and_debug_stmts (to_phi, old_latch_init, new_arg); + } + if (scalar_loop != loop) { /* Remove the non-necessary forwarder of scalar_loop again. */ @@ -1677,31 +1819,36 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, delete_basic_block (new_preheader); set_immediate_dominator (CDI_DOMINATORS, new_loop->header, loop_preheader_edge (new_loop)->src); + + if (multiple_exits_p) + update_loop = loop; } - if (scalar_loop != loop) + if (multiple_exits_p) { - /* Update new_loop->header PHIs, so that on the preheader - edge they are the ones from loop rather than scalar_loop. */ - gphi_iterator gsi_orig, gsi_new; - edge orig_e = loop_preheader_edge (loop); - edge new_e = loop_preheader_edge (new_loop); - - for (gsi_orig = gsi_start_phis (loop->header), - gsi_new = gsi_start_phis (new_loop->header); - !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new); - gsi_next (&gsi_orig), gsi_next (&gsi_new)) + for (edge e : get_loop_exit_edges (update_loop)) { - gphi *orig_phi = gsi_orig.phi (); - gphi *new_phi = gsi_new.phi (); - tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e); - location_t orig_locus - = gimple_phi_arg_location_from_edge (orig_phi, orig_e); - - add_phi_arg (new_phi, orig_arg, new_e, orig_locus); + edge ex; + edge_iterator ei; + FOR_EACH_EDGE (ex, ei, e->dest->succs) + { + /* Find the first non-fallthrough block as fall-throughs can't + dominate other blocks. */ + while ((ex->flags & EDGE_FALLTHRU) + && single_succ_p (ex->dest)) + { + doms.safe_push (ex->dest); + ex = single_succ_edge (ex->dest); + } + doms.safe_push (ex->dest); + } + doms.safe_push (e->dest); } - } + iterate_fix_dominators (CDI_DOMINATORS, doms, false); + if (updated_doms) + updated_doms->safe_splice (doms); + } free (new_bbs); free (bbs); @@ -1777,6 +1924,9 @@ slpeel_can_duplicate_loop_p (const loop_vec_info loop_vinfo, const_edge e) gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src); unsigned int num_bb = loop->inner? 5 : 2; + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + num_bb += LOOP_VINFO_ALT_EXITS (loop_vinfo).length (); + /* All loops have an outer scope; the only case loop->outer is NULL is for the function itself. */ if (!loop_outer (loop) @@ -2044,6 +2194,11 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); basic_block update_bb = update_e->dest; + /* For early exits we'll update the IVs in + vect_update_ivs_after_early_break. */ + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + return; + basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest; /* Make sure there exists a single-predecessor exit bb: */ @@ -2131,6 +2286,208 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, /* Fix phi expressions in the successor bb. */ adjust_phi_and_debug_stmts (phi1, update_e, ni_name); } + return; +} + +/* Function vect_update_ivs_after_early_break. + + "Advance" the induction variables of LOOP to the value they should take + after the execution of LOOP. This is currently necessary because the + vectorizer does not handle induction variables that are used after the + loop. Such a situation occurs when the last iterations of LOOP are + peeled, because of the early exit. With an early exit we always peel the + loop. + + Input: + - LOOP_VINFO - a loop info structure for the loop that is going to be + vectorized. The last few iterations of LOOP were peeled. + - LOOP - a loop that is going to be vectorized. The last few iterations + of LOOP were peeled. + - VF - The loop vectorization factor. + - NITERS_ORIG - the number of iterations that LOOP executes (before it is + vectorized). i.e, the number of times the ivs should be + bumped. + - NITERS_VECTOR - The number of iterations that the vector LOOP executes. + - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path + coming out from LOOP on which there are uses of the LOOP ivs + (this is the path from LOOP->exit to epilog_loop->preheader). + + The new definitions of the ivs are placed in LOOP->exit. + The phi args associated with the edge UPDATE_E in the bb + UPDATE_E->dest are updated accordingly. + + Output: + - If available, the LCSSA phi node for the loop IV temp. + + Assumption 1: Like the rest of the vectorizer, this function assumes + a single loop exit that has a single predecessor. + + Assumption 2: The phi nodes in the LOOP header and in update_bb are + organized in the same order. + + Assumption 3: The access function of the ivs is simple enough (see + vect_can_advance_ivs_p). This assumption will be relaxed in the future. + + Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path + coming out of LOOP on which the ivs of LOOP are used (this is the path + that leads to the epilog loop; other paths skip the epilog loop). This + path starts with the edge UPDATE_E, and its destination (denoted update_bb) + needs to have its phis updated. + */ + +static tree +vect_update_ivs_after_early_break (loop_vec_info loop_vinfo, class loop * epilog, + poly_int64 vf, tree niters_orig, + tree niters_vector, edge update_e) +{ + if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + return NULL; + + gphi_iterator gsi, gsi1; + tree ni_name, ivtmp = NULL; + basic_block update_bb = update_e->dest; + vec alt_exits = LOOP_VINFO_ALT_EXITS (loop_vinfo); + edge loop_iv = LOOP_VINFO_IV_EXIT (loop_vinfo); + basic_block exit_bb = loop_iv->dest; + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + gcond *cond = LOOP_VINFO_LOOP_IV_COND (loop_vinfo); + + gcc_assert (cond); + + for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb); + !gsi_end_p (gsi) && !gsi_end_p (gsi1); + gsi_next (&gsi), gsi_next (&gsi1)) + { + tree init_expr, final_expr, step_expr; + tree type; + tree var, ni, off; + gimple_stmt_iterator last_gsi; + + gphi *phi = gsi1.phi (); + tree phi_ssa = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (epilog)); + gphi *phi1 = dyn_cast (SSA_NAME_DEF_STMT (phi_ssa)); + if (!phi1) + continue; + stmt_vec_info phi_info = loop_vinfo->lookup_stmt (gsi.phi ()); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "vect_update_ivs_after_early_break: phi: %G", + (gimple *)phi); + + /* Skip reduction and virtual phis. */ + if (!iv_phi_p (phi_info)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "reduc or virtual phi. skip.\n"); + continue; + } + + /* For multiple exits where we handle early exits we need to carry on + with the previous IV as loop iteration was not done because we exited + early. As such just grab the original IV. */ + phi_ssa = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), loop_latch_edge (loop)); + if (gimple_cond_lhs (cond) != phi_ssa + && gimple_cond_rhs (cond) != phi_ssa) + { + type = TREE_TYPE (gimple_phi_result (phi)); + step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); + step_expr = unshare_expr (step_expr); + + /* We previously generated the new merged phi in the same BB as the + guard. So use that to perform the scaling on rather than the + normal loop phi which don't take the early breaks into account. */ + final_expr = gimple_phi_result (phi1); + init_expr = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), loop_preheader_edge (loop)); + + tree stype = TREE_TYPE (step_expr); + /* For early break the final loop IV is: + init + (final - init) * vf which takes into account peeling + values and non-single steps. */ + off = fold_build2 (MINUS_EXPR, stype, + fold_convert (stype, final_expr), + fold_convert (stype, init_expr)); + /* Now adjust for VF to get the final iteration value. */ + off = fold_build2 (MULT_EXPR, stype, off, build_int_cst (stype, vf)); + + /* Adjust the value with the offset. */ + if (POINTER_TYPE_P (type)) + ni = fold_build_pointer_plus (init_expr, off); + else + ni = fold_convert (type, + fold_build2 (PLUS_EXPR, stype, + fold_convert (stype, init_expr), + off)); + var = create_tmp_var (type, "tmp"); + + last_gsi = gsi_last_bb (exit_bb); + gimple_seq new_stmts = NULL; + ni_name = force_gimple_operand (ni, &new_stmts, false, var); + /* Exit_bb shouldn't be empty. */ + if (!gsi_end_p (last_gsi)) + gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT); + else + gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT); + + /* Fix phi expressions in the successor bb. */ + adjust_phi_and_debug_stmts (phi, update_e, ni_name); + } + else + { + type = TREE_TYPE (gimple_phi_result (phi)); + step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); + step_expr = unshare_expr (step_expr); + + /* We previously generated the new merged phi in the same BB as the + guard. So use that to perform the scaling on rather than the + normal loop phi which don't take the early breaks into account. */ + init_expr = PHI_ARG_DEF_FROM_EDGE (phi1, loop_preheader_edge (loop)); + tree stype = TREE_TYPE (step_expr); + + if (vf.is_constant ()) + { + ni = fold_build2 (MULT_EXPR, stype, + fold_convert (stype, + niters_vector), + build_int_cst (stype, vf)); + + ni = fold_build2 (MINUS_EXPR, stype, + fold_convert (stype, + niters_orig), + fold_convert (stype, ni)); + } + else + /* If the loop's VF isn't constant then the loop must have been + masked, so at the end of the loop we know we have finished + the entire loop and found nothing. */ + ni = build_zero_cst (stype); + + ni = fold_convert (type, ni); + /* We don't support variable n in this version yet. */ + gcc_assert (TREE_CODE (ni) == INTEGER_CST); + + var = create_tmp_var (type, "tmp"); + + last_gsi = gsi_last_bb (exit_bb); + gimple_seq new_stmts = NULL; + ni_name = force_gimple_operand (ni, &new_stmts, false, var); + /* Exit_bb shouldn't be empty. */ + if (!gsi_end_p (last_gsi)) + gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT); + else + gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT); + + adjust_phi_and_debug_stmts (phi1, loop_iv, ni_name); + + for (edge exit : alt_exits) + adjust_phi_and_debug_stmts (phi1, exit, + build_int_cst (TREE_TYPE (step_expr), + vf)); + ivtmp = gimple_phi_result (phi1); + } + } + + return ivtmp; } /* Return a gimple value containing the misalignment (measured in vector @@ -2632,137 +2989,34 @@ vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo, /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP, this function searches for the corresponding lcssa phi node in exit - bb of LOOP. If it is found, return the phi result; otherwise return - NULL. */ + bb of LOOP following the LCSSA_EDGE to the exit node. If it is found, + return the phi result; otherwise return NULL. */ static tree find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED, - gphi *lcssa_phi) + gphi *lcssa_phi, int lcssa_edge = 0) { gphi_iterator gsi; edge e = loop->vec_loop_iv; - gcc_assert (single_pred_p (e->dest)); for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi)) { gphi *phi = gsi.phi (); - if (operand_equal_p (PHI_ARG_DEF (phi, 0), - PHI_ARG_DEF (lcssa_phi, 0), 0)) - return PHI_RESULT (phi); - } - return NULL_TREE; -} - -/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND - from SECOND/FIRST and puts it at the original loop's preheader/exit - edge, the two loops are arranged as below: - - preheader_a: - first_loop: - header_a: - i_1 = PHI; - ... - i_2 = i_1 + 1; - if (cond_a) - goto latch_a; - else - goto between_bb; - latch_a: - goto header_a; - - between_bb: - ;; i_x = PHI; ;; LCSSA phi node to be created for FIRST, - - second_loop: - header_b: - i_3 = PHI; ;; Use of i_0 to be replaced with i_x, - or with i_2 if no LCSSA phi is created - under condition of CREATE_LCSSA_FOR_IV_PHIS. - ... - i_4 = i_3 + 1; - if (cond_b) - goto latch_b; - else - goto exit_bb; - latch_b: - goto header_b; - - exit_bb: - - This function creates loop closed SSA for the first loop; update the - second loop's PHI nodes by replacing argument on incoming edge with the - result of newly created lcssa PHI nodes. IF CREATE_LCSSA_FOR_IV_PHIS - is false, Loop closed ssa phis will only be created for non-iv phis for - the first loop. - - This function assumes exit bb of the first loop is preheader bb of the - second loop, i.e, between_bb in the example code. With PHIs updated, - the second loop will execute rest iterations of the first. */ - -static void -slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo, - class loop *first, class loop *second, - bool create_lcssa_for_iv_phis) -{ - gphi_iterator gsi_update, gsi_orig; - class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - - edge first_latch_e = EDGE_SUCC (first->latch, 0); - edge second_preheader_e = loop_preheader_edge (second); - basic_block between_bb = single_exit (first)->dest; - - gcc_assert (between_bb == second_preheader_e->src); - gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb)); - /* Either the first loop or the second is the loop to be vectorized. */ - gcc_assert (loop == first || loop == second); - - for (gsi_orig = gsi_start_phis (first->header), - gsi_update = gsi_start_phis (second->header); - !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update); - gsi_next (&gsi_orig), gsi_next (&gsi_update)) - { - gphi *orig_phi = gsi_orig.phi (); - gphi *update_phi = gsi_update.phi (); - - tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e); - /* Generate lcssa PHI node for the first loop. */ - gphi *vect_phi = (loop == first) ? orig_phi : update_phi; - stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi); - if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info)) + /* Nested loops with multiple exits can have different no# phi node + arguments between the main loop and epilog as epilog falls to the + second loop. */ + if (gimple_phi_num_args (phi) > e->dest_idx) { - tree new_res = copy_ssa_name (PHI_RESULT (orig_phi)); - gphi *lcssa_phi = create_phi_node (new_res, between_bb); - add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION); - arg = new_res; - } - - /* Update PHI node in the second loop by replacing arg on the loop's - incoming edge. */ - adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg); - } - - /* For epilogue peeling we have to make sure to copy all LC PHIs - for correct vectorization of live stmts. */ - if (loop == first) - { - basic_block orig_exit = single_exit (second)->dest; - for (gsi_orig = gsi_start_phis (orig_exit); - !gsi_end_p (gsi_orig); gsi_next (&gsi_orig)) - { - gphi *orig_phi = gsi_orig.phi (); - tree orig_arg = PHI_ARG_DEF (orig_phi, 0); - if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg)) - continue; - - /* Already created in the above loop. */ - if (find_guard_arg (first, second, orig_phi)) + tree var = PHI_ARG_DEF (phi, e->dest_idx); + if (TREE_CODE (var) != SSA_NAME) continue; - tree new_res = copy_ssa_name (orig_arg); - gphi *lcphi = create_phi_node (new_res, between_bb); - add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION); + if (operand_equal_p (get_current_def (var), + PHI_ARG_DEF (lcssa_phi, lcssa_edge), 0)) + return PHI_RESULT (phi); } } + return NULL_TREE; } /* Function slpeel_add_loop_guard adds guard skipping from the beginning @@ -2910,13 +3164,11 @@ slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog, gcc_assert (single_succ_p (merge_bb)); edge e = single_succ_edge (merge_bb); basic_block exit_bb = e->dest; - gcc_assert (single_pred_p (exit_bb)); - gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest); for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi)) { gphi *update_phi = gsi.phi (); - tree old_arg = PHI_ARG_DEF (update_phi, 0); + tree old_arg = PHI_ARG_DEF (update_phi, e->dest_idx); tree merge_arg = NULL_TREE; @@ -2928,7 +3180,7 @@ slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog, if (!merge_arg) merge_arg = old_arg; - tree guard_arg = find_guard_arg (loop, epilog, update_phi); + tree guard_arg = find_guard_arg (loop, epilog, update_phi, e->dest_idx); /* If the var is live after loop but not a reduction, we simply use the old arg. */ if (!guard_arg) @@ -2948,21 +3200,6 @@ slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog, } } -/* EPILOG loop is duplicated from the original loop for vectorizing, - the arg of its loop closed ssa PHI needs to be updated. */ - -static void -slpeel_update_phi_nodes_for_lcssa (class loop *epilog) -{ - gphi_iterator gsi; - basic_block exit_bb = single_exit (epilog)->dest; - - gcc_assert (single_pred_p (exit_bb)); - edge e = EDGE_PRED (exit_bb, 0); - for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi)) - rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e)); -} - /* EPILOGUE_VINFO is an epilogue loop that we now know would need to iterate exactly CONST_NITERS times. Make a final decision about whether the epilogue loop should be used, returning true if so. */ @@ -3138,6 +3375,14 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, bound_epilog += vf - 1; if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) bound_epilog += 1; + /* For early breaks the scalar loop needs to execute at most VF times + to find the element that caused the break. */ + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + { + bound_epilog = vf; + /* Force a scalar epilogue as we can't vectorize the index finding. */ + vect_epilogues = false; + } bool epilog_peeling = maybe_ne (bound_epilog, 0U); poly_uint64 bound_scalar = bound_epilog; @@ -3297,16 +3542,24 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, bound_prolog + bound_epilog) : (!LOOP_REQUIRES_VERSIONING (loop_vinfo) || vect_epilogues)); + + /* We only support early break vectorization on known bounds at this time. + This means that if the vector loop can't be entered then we won't generate + it at all. So for now force skip_vector off because the additional control + flow messes with the BB exits and we've already analyzed them. */ + skip_vector = skip_vector && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo); + /* Epilog loop must be executed if the number of iterations for epilog loop is known at compile time, otherwise we need to add a check at the end of vector loop and skip to the end of epilog loop. */ bool skip_epilog = (prolog_peeling < 0 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || !vf.is_constant ()); - /* PEELING_FOR_GAPS is special because epilog loop must be executed. */ - if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) + /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog + loop must be executed. */ + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) skip_epilog = false; - class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); auto_vec original_counts; basic_block *original_bbs = NULL; @@ -3344,13 +3597,13 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, if (prolog_peeling) { e = loop_preheader_edge (loop); - gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e)); - + gcc_checking_assert (slpeel_can_duplicate_loop_p (loop_vinfo, e)); /* Peel prolog and put it on preheader edge of loop. */ - prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e); + prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e, + true); gcc_assert (prolog); prolog->force_vectorize = false; - slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true); + first_loop = prolog; reset_original_copy_tables (); @@ -3420,11 +3673,12 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, as the transformations mentioned above make less or no sense when not vectorizing. */ epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop; - epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e); + auto_vec doms; + epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e, true, + &doms); gcc_assert (epilog); epilog->force_vectorize = false; - slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false); /* Scalar version loop may be preferred. In this case, add guard and skip to epilog. Note this only happens when the number of @@ -3496,6 +3750,54 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf, update_e); + /* For early breaks we must create a guard to check how many iterations + of the scalar loop are yet to be performed. */ + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + { + tree ivtmp = + vect_update_ivs_after_early_break (loop_vinfo, epilog, vf, niters, + *niters_vector, update_e); + + gcc_assert (ivtmp); + tree guard_cond = fold_build2 (EQ_EXPR, boolean_type_node, + fold_convert (TREE_TYPE (niters), + ivtmp), + build_zero_cst (TREE_TYPE (niters))); + basic_block guard_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest; + + /* If we had a fallthrough edge, the guard will the threaded through + and so we may need to find the actual final edge. */ + edge final_edge = epilog->vec_loop_iv; + /* slpeel_update_phi_nodes_for_guard2 expects an empty block in + between the guard and the exit edge. It only adds new nodes and + doesn't update existing one in the current scheme. */ + basic_block guard_to = split_edge (final_edge); + edge guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to, + guard_bb, prob_epilog.invert (), + irred_flag); + doms.safe_push (guard_bb); + + iterate_fix_dominators (CDI_DOMINATORS, doms, false); + + /* We must update all the edges from the new guard_bb. */ + slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e, + final_edge); + + /* If the loop was versioned we'll have an intermediate BB between + the guard and the exit. This intermediate block is required + because in the current scheme of things the guard block phi + updating can only maintain LCSSA by creating new blocks. In this + case we just need to update the uses in this block as well. */ + if (loop != scalar_loop) + { + for (gphi_iterator gsi = gsi_start_phis (guard_to); + !gsi_end_p (gsi); gsi_next (&gsi)) + rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), guard_e)); + } + + flush_pending_stmts (guard_e); + } + if (skip_epilog) { guard_cond = fold_build2 (EQ_EXPR, boolean_type_node, @@ -3520,8 +3822,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, } scale_loop_profile (epilog, prob_epilog, 0); } - else - slpeel_update_phi_nodes_for_lcssa (epilog); unsigned HOST_WIDE_INT bound; if (bound_scalar.is_constant (&bound)) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index b4a98de80aa39057fc9b17977dd0e347b4f0fb5d..ab9a2048186f461f5ec49f21421958e7ee25eada 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -1007,6 +1007,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) partial_load_store_bias (0), peeling_for_gaps (false), peeling_for_niter (false), + early_breaks (false), + non_break_control_flow (false), no_data_dependencies (false), has_mask_store (false), scalar_loop_scaling (profile_probability::uninitialized ()), @@ -1199,6 +1201,14 @@ vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo) th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)); + /* When we have multiple exits and VF is unknown, we must require partial + vectors because the loop bounds is not a minimum but a maximum. That is to + say we cannot unpredicate the main loop unless we peel or use partial + vectors in the epilogue. */ + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo) + && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()) + return true; + if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) { @@ -1652,12 +1662,12 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) loop_vinfo->scalar_costs->finish_cost (nullptr); } - /* Function vect_analyze_loop_form. Verify that certain CFG restrictions hold, including: - the loop has a pre-header - - the loop has a single entry and exit + - the loop has a single entry + - nested loops can have only a single exit. - the loop exit condition is simple enough - the number of iterations can be analyzed, i.e, a countable loop. The niter could be analyzed under some assumptions. */ @@ -1693,11 +1703,6 @@ vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info) | (exit-bb) */ - if (loop->num_nodes != 2) - return opt_result::failure_at (vect_location, - "not vectorized:" - " control flow in loop.\n"); - if (empty_block_p (loop->header)) return opt_result::failure_at (vect_location, "not vectorized: empty loop.\n"); @@ -1768,11 +1773,13 @@ vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info) dump_printf_loc (MSG_NOTE, vect_location, "Considering outer-loop vectorization.\n"); info->inner_loop_cond = inner.loop_cond; + + if (!single_exit (loop)) + return opt_result::failure_at (vect_location, + "not vectorized: multiple exits.\n"); + } - if (!single_exit (loop)) - return opt_result::failure_at (vect_location, - "not vectorized: multiple exits.\n"); if (EDGE_COUNT (loop->header->preds) != 2) return opt_result::failure_at (vect_location, "not vectorized:" @@ -1788,11 +1795,36 @@ vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info) "not vectorized: latch block not empty.\n"); /* Make sure the exit is not abnormal. */ - edge e = single_exit (loop); - if (e->flags & EDGE_ABNORMAL) - return opt_result::failure_at (vect_location, - "not vectorized:" - " abnormal loop exit edge.\n"); + auto_vec exits = get_loop_exit_edges (loop); + edge nexit = loop->vec_loop_iv; + for (edge e : exits) + { + if (e->flags & EDGE_ABNORMAL) + return opt_result::failure_at (vect_location, + "not vectorized:" + " abnormal loop exit edge.\n"); + /* Early break BB must be after the main exit BB. In theory we should + be able to vectorize the inverse order, but the current flow in the + the vectorizer always assumes you update successor PHI nodes, not + preds. */ + if (e != nexit && !dominated_by_p (CDI_DOMINATORS, nexit->src, e->src)) + return opt_result::failure_at (vect_location, + "not vectorized:" + " abnormal loop exit edge order.\n"); + } + + /* We currently only support early exit loops with known bounds. */ + if (exits.length () > 1) + { + class tree_niter_desc niter; + if (!number_of_iterations_exit_assumptions (loop, nexit, &niter, NULL) + || chrec_contains_undetermined (niter.niter) + || !evolution_function_is_constant_p (niter.niter)) + return opt_result::failure_at (vect_location, + "not vectorized:" + " early breaks only supported on loops" + " with known iteration bounds.\n"); + } info->conds = vect_get_loop_niters (loop, &info->assumptions, @@ -1866,6 +1898,10 @@ vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared, LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_splice (info->alt_loop_conds); LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->loop_cond; + /* Check to see if we're vectorizing multiple exits. */ + LOOP_VINFO_EARLY_BREAKS (loop_vinfo) + = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty (); + if (info->inner_loop_cond) { stmt_vec_info inner_loop_cond_info @@ -3070,7 +3106,8 @@ start_over: /* If an epilogue loop is required make sure we can create one. */ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) - || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) + || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) { if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n"); @@ -5797,7 +5834,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, basic_block exit_bb; tree scalar_dest; tree scalar_type; - gimple *new_phi = NULL, *phi; + gimple *new_phi = NULL, *phi = NULL; gimple_stmt_iterator exit_gsi; tree new_temp = NULL_TREE, new_name, new_scalar_dest; gimple *epilog_stmt = NULL; @@ -6039,6 +6076,33 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, new_def = gimple_convert (&stmts, vectype, new_def); reduc_inputs.quick_push (new_def); } + + /* Update the other exits. */ + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + { + vec alt_exits = LOOP_VINFO_ALT_EXITS (loop_vinfo); + gphi_iterator gsi, gsi1; + for (edge exit : alt_exits) + { + /* Find the phi node to propaget into the exit block for each + exit edge. */ + for (gsi = gsi_start_phis (exit_bb), + gsi1 = gsi_start_phis (exit->src); + !gsi_end_p (gsi) && !gsi_end_p (gsi1); + gsi_next (&gsi), gsi_next (&gsi1)) + { + /* There really should be a function to just get the number + of phis inside a bb. */ + if (phi && phi == gsi.phi ()) + { + gphi *phi1 = gsi1.phi (); + SET_PHI_ARG_DEF (phi, exit->dest_idx, + PHI_RESULT (phi1)); + break; + } + } + } + } gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); } @@ -10355,6 +10419,13 @@ vectorizable_live_operation (vec_info *vinfo, new_tree = lane_extract ; lhs' = new_tree; */ + /* When vectorizing an early break, any live statement that is used + outside of the loop are dead. The loop will never get to them. + We could change the liveness value during analysis instead but since + the below code is invalid anyway just ignore it during codegen. */ + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + return true; + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest; gcc_assert (single_pred_p (exit_bb)); @@ -11277,7 +11348,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) /* Make sure there exists a single-predecessor exit bb. Do this before versioning. */ edge e = LOOP_VINFO_IV_EXIT (loop_vinfo); - if (! single_pred_p (e->dest)) + if (e && ! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) { split_loop_exit_edge (e, true); if (dump_enabled_p ()) @@ -11303,7 +11374,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) { e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)); - if (! single_pred_p (e->dest)) + if (e && ! single_pred_p (e->dest)) { split_loop_exit_edge (e, true); if (dump_enabled_p ()) @@ -11641,7 +11712,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) /* Loops vectorized with a variable factor won't benefit from unrolling/peeling. */ - if (!vf.is_constant ()) + if (!vf.is_constant () + && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) { loop->unroll = 1; if (dump_enabled_p ()) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 87c4353fa5180fcb7f60b192897456cf24f3fdbe..03524e8500ee06df42f82afe78ee2a7c627be45b 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -344,9 +344,34 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, *live_p = false; /* cond stmt other than loop exit cond. */ - if (is_ctrl_stmt (stmt_info->stmt) - && STMT_VINFO_TYPE (stmt_info) != loop_exit_ctrl_vec_info_type) - *relevant = vect_used_in_scope; + if (is_ctrl_stmt (stmt_info->stmt)) + { + /* Ideally EDGE_LOOP_EXIT would have been set on the exit edge, but + it looks like loop_manip doesn't do that.. So we have to do it + the hard way. */ + basic_block bb = gimple_bb (stmt_info->stmt); + bool exit_bb = false, early_exit = false; + edge_iterator ei; + edge e; + FOR_EACH_EDGE (e, ei, bb->succs) + if (!flow_bb_inside_loop_p (loop, e->dest)) + { + exit_bb = true; + early_exit = loop->vec_loop_iv->src != bb; + break; + } + + /* We should have processed any exit edge, so an edge not an early + break must be a loop IV edge. We need to distinguish between the + two as we don't want to generate code for the main loop IV. */ + if (exit_bb) + { + if (early_exit) + *relevant = vect_used_in_scope; + } + else if (bb->loop_father == loop) + LOOP_VINFO_GENERAL_CTR_FLOW (loop_vinfo) = true; + } /* changing memory. */ if (gimple_code (stmt_info->stmt) != GIMPLE_PHI) @@ -359,6 +384,11 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, *relevant = vect_used_in_scope; } + auto_vec exits = get_loop_exit_edges (loop); + auto_bitmap exit_bbs; + for (edge exit : exits) + bitmap_set_bit (exit_bbs, exit->dest->index); + /* uses outside the loop. */ FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF) { @@ -377,7 +407,7 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, /* We expect all such uses to be in the loop exit phis (because of loop closed form) */ gcc_assert (gimple_code (USE_STMT (use_p)) == GIMPLE_PHI); - gcc_assert (bb == single_exit (loop)->dest); + gcc_assert (bitmap_bit_p (exit_bbs, bb->index)); *live_p = true; } @@ -683,6 +713,13 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo, bool *fatal) } } + /* Ideally this should be in vect_analyze_loop_form but we haven't seen all + the conds yet at that point and there's no quick way to retrieve them. */ + if (LOOP_VINFO_GENERAL_CTR_FLOW (loop_vinfo)) + return opt_result::failure_at (vect_location, + "not vectorized:" + " unsupported control flow in loop.\n"); + /* 2. Process_worklist */ while (worklist.length () > 0) { @@ -778,6 +815,20 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo, bool *fatal) return res; } } + } + else if (gcond *cond = dyn_cast (stmt_vinfo->stmt)) + { + enum tree_code rhs_code = gimple_cond_code (cond); + gcc_assert (TREE_CODE_CLASS (rhs_code) == tcc_comparison); + opt_result res + = process_use (stmt_vinfo, gimple_cond_lhs (cond), + loop_vinfo, relevant, &worklist, false); + if (!res) + return res; + res = process_use (stmt_vinfo, gimple_cond_rhs (cond), + loop_vinfo, relevant, &worklist, false); + if (!res) + return res; } else if (gcall *call = dyn_cast (stmt_vinfo->stmt)) { @@ -11919,11 +11970,15 @@ vect_analyze_stmt (vec_info *vinfo, node_instance, cost_vec); if (!res) return res; - } + } + + if (is_ctrl_stmt (stmt_info->stmt)) + STMT_VINFO_DEF_TYPE (stmt_info) = vect_early_exit_def; switch (STMT_VINFO_DEF_TYPE (stmt_info)) { case vect_internal_def: + case vect_early_exit_def: break; case vect_reduction_def: @@ -11956,6 +12011,7 @@ vect_analyze_stmt (vec_info *vinfo, { gcall *call = dyn_cast (stmt_info->stmt); gcc_assert (STMT_VINFO_VECTYPE (stmt_info) + || gimple_code (stmt_info->stmt) == GIMPLE_COND || (call && gimple_call_lhs (call) == NULL_TREE)); *need_to_vectorize = true; } diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index ec65b65b5910e9cbad0a8c7e83c950b6168b98bf..24a0567a2f23f1b3d8b340baff61d18da8e242dd 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -63,6 +63,7 @@ enum vect_def_type { vect_internal_def, vect_induction_def, vect_reduction_def, + vect_early_exit_def, vect_double_reduction_def, vect_nested_cycle, vect_first_order_recurrence, @@ -876,6 +877,13 @@ public: we need to peel off iterations at the end to form an epilogue loop. */ bool peeling_for_niter; + /* When the loop has early breaks that we can vectorize we need to peel + the loop for the break finding loop. */ + bool early_breaks; + + /* When the loop has a non-early break control flow inside. */ + bool non_break_control_flow; + /* List of loop additional IV conditionals found in the loop. */ auto_vec conds; @@ -985,9 +993,11 @@ public: #define LOOP_VINFO_REDUCTION_CHAINS(L) (L)->reduction_chains #define LOOP_VINFO_PEELING_FOR_GAPS(L) (L)->peeling_for_gaps #define LOOP_VINFO_PEELING_FOR_NITER(L) (L)->peeling_for_niter +#define LOOP_VINFO_EARLY_BREAKS(L) (L)->early_breaks #define LOOP_VINFO_EARLY_BRK_CONFLICT_STMTS(L) (L)->early_break_conflict #define LOOP_VINFO_EARLY_BRK_DEST_BB(L) (L)->early_break_dest_bb #define LOOP_VINFO_EARLY_BRK_VUSES(L) (L)->early_break_vuses +#define LOOP_VINFO_GENERAL_CTR_FLOW(L) (L)->non_break_control_flow #define LOOP_VINFO_LOOP_CONDS(L) (L)->conds #define LOOP_VINFO_LOOP_IV_COND(L) (L)->loop_iv_cond #define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies @@ -1038,8 +1048,8 @@ public: stack. */ typedef opt_pointer_wrapper opt_loop_vec_info; -inline loop_vec_info -loop_vec_info_for_loop (class loop *loop) +static inline loop_vec_info +loop_vec_info_for_loop (const class loop *loop) { return (loop_vec_info) loop->aux; } @@ -1789,7 +1799,7 @@ is_loop_header_bb_p (basic_block bb) { if (bb == (bb->loop_father)->header) return true; - gcc_checking_assert (EDGE_COUNT (bb->preds) == 1); + return false; } @@ -2176,9 +2186,10 @@ class auto_purge_vect_location in tree-vect-loop-manip.cc. */ extern void vect_set_loop_condition (class loop *, loop_vec_info, tree, tree, tree, bool); -extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge); +extern bool slpeel_can_duplicate_loop_p (const loop_vec_info, const_edge); class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, - class loop *, edge); + class loop *, edge, bool, + vec * = NULL); class loop *vect_loop_versioning (loop_vec_info, gimple *); extern class loop *vect_do_peeling (loop_vec_info, tree, tree, tree *, tree *, tree *, int, bool, bool, diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc index a048e9d89178a37455bd7b83ab0f2a238a4ce69e..0dc5479dc92058b6c70c67f29f5dc9a8d72235f4 100644 --- a/gcc/tree-vectorizer.cc +++ b/gcc/tree-vectorizer.cc @@ -1379,7 +1379,9 @@ pass_vectorize::execute (function *fun) predicates that need to be shared for optimal predicate usage. However reassoc will re-order them and prevent CSE from working as it should. CSE only the loop body, not the entry. */ - bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index); + auto_vec exits = get_loop_exit_edges (loop); + for (edge exit : exits) + bitmap_set_bit (exit_bbs, exit->dest->index); edge entry = EDGE_PRED (loop_preheader_edge (loop)->src, 0); do_rpo_vn (fun, entry, exit_bbs);