Vectorizer: let the target suggest an unroll factor for the vector loop.

finish_cost can now hand back an unroll factor suggested by the target's
vector_costs.  When vect_analyze_loop records a first_loop_vinfo and the
suggestion is greater than 1, it resets mode_i to first_loop_i and retries
the analysis; the retry stores the factor in the new loop_vec_info, where
vect_analyze_loop_2 multiplies it into the vectorization factor.  The
suggestion is reset to 1 when the unrolled vectorization factor is not
known to fit within the loop's maximum vectorization factor.

With a suggested unroll factor greater than 1, handling of partial vectors
is pushed to the epilogue under the same conditions as for
param_vect_partial_vector_usage == 1, and vectorizable_reduction does not
select a single def-use cycle.  Epilogue candidates whose vectorization
factor is known to be at least that of the main loop are discarded, unless
they use partial vectors and the factors may be equal.

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index a28bb6321d76b8222bc8cfdade151ca9b4dca406..c84f1df9cd9a1325135defcbe1d101642a867373 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -153,7 +153,8 @@ along with GCC; see the file COPYING3.  If not see
    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 */
 
-static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
+static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
+						unsigned *);
 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 					       bool *, bool *);
 
@@ -828,6 +829,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     skip_main_loop_edge (nullptr),
     skip_this_loop_edge (nullptr),
     reusable_accumulators (),
+    suggested_unroll_factor (1),
     max_vectorization_factor (0),
     mask_skip_niters (NULL_TREE),
     rgroup_compare_type (NULL_TREE),
@@ -1811,7 +1813,8 @@ vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
    definitely no, or -1 if it's worth retrying.  */
 
 static int
-vect_analyze_loop_costing (loop_vec_info loop_vinfo)
+vect_analyze_loop_costing (loop_vec_info loop_vinfo,
+			   unsigned *suggested_unroll_factor)
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
@@ -1845,7 +1848,8 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
 
   int min_profitable_iters, min_profitable_estimate;
   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
-				      &min_profitable_estimate);
+				      &min_profitable_estimate,
+				      suggested_unroll_factor);
 
   if (min_profitable_iters < 0)
     {
@@ -2129,10 +2133,16 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
 	 vectors to the epilogue, with the main loop continuing to operate
 	 on full vectors.
 
+	 If we are unrolling we also do not want to use partial vectors.  This
+	 is to avoid the overhead of generating multiple masks and also to
+	 avoid having to execute entire iterations of FALSE masked instructions
+	 when dealing with one or fewer full iterations.
+
 	 ??? We could then end up failing to use partial vectors if we
 	 decide to peel iterations into a prologue, and if the main loop
 	 then ends up processing fewer than VF iterations.  */
-      if (param_vect_partial_vector_usage == 1
+      if ((param_vect_partial_vector_usage == 1
+	   || loop_vinfo->suggested_unroll_factor > 1)
 	  && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
 	  && !vect_known_niters_smaller_than_vf (loop_vinfo))
 	LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
@@ -2199,7 +2209,8 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
    for it.  The different analyses will record information in the
    loop_vec_info struct.  */
 static opt_result
-vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
+vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
+		     unsigned *suggested_unroll_factor)
 {
   opt_result ok = opt_result::success ();
   int res;
@@ -2359,6 +2370,12 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
      set of rgroups.  */
   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
 
+  /* Apply the suggested unrolling factor, this was determined by the backend
+     during finish_cost the first time we ran the analysis for this
+     vector mode.  */
+  if (loop_vinfo->suggested_unroll_factor > 1)
+    LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
+
   /* This is the point where we can re-start analysis with SLP forced off.  */
 start_over:
 
@@ -2550,7 +2567,7 @@ start_over:
     return ok;
 
   /* Check the costings of the loop make vectorizing worthwhile.  */
-  res = vect_analyze_loop_costing (loop_vinfo);
+  res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
   if (res < 0)
     {
       ok = opt_result::failure_at (vect_location,
@@ -2951,7 +2968,7 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
 		     loop_vec_info main_loop_vinfo,
 		     const vector_modes &vector_modes, unsigned &mode_i,
 		     machine_mode &autodetected_vector_mode,
-		     bool &fatal)
+		     bool &fatal, unsigned int *suggested_unroll_factor = NULL)
 {
   loop_vec_info loop_vinfo
     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
@@ -2959,8 +2976,18 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
   machine_mode vector_mode = vector_modes[mode_i];
   loop_vinfo->vector_mode = vector_mode;
 
+  /* Don't ask for a suggested unroll factor for an already unrolled loop
+     vinfo and reset the value for the next analysis.  */
+  if (suggested_unroll_factor && *suggested_unroll_factor > 1)
+    {
+      loop_vinfo->suggested_unroll_factor = *suggested_unroll_factor;
+      *suggested_unroll_factor = 1;
+      suggested_unroll_factor = NULL;
+    }
+
   /* Run the main analysis.  */
-  opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal);
+  opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
+					suggested_unroll_factor);
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "***** Analysis %s with vector mode %s\n",
@@ -3072,6 +3099,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
   unsigned int first_loop_i = 0;
   unsigned int first_loop_next_i = 0;
   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
+  unsigned int suggested_unroll_factor = 1;
 
   /* First determine the main loop vectorization mode, either the first
      one that works, starting with auto-detecting the vector mode and then
@@ -3084,7 +3112,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
       opt_loop_vec_info loop_vinfo
 	= vect_analyze_loop_1 (loop, shared, &loop_form_info, NULL,
 			       vector_modes, mode_i,
-			       autodetected_vector_mode, fatal);
+			       autodetected_vector_mode, fatal,
+			       &suggested_unroll_factor);
       if (fatal)
 	break;
 
@@ -3112,6 +3141,16 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 	      first_loop_vinfo = loop_vinfo;
 	      first_loop_i = loop_vinfo_i;
 	      first_loop_next_i = mode_i;
+	      if (suggested_unroll_factor > 1)
+		{
+		  mode_i = first_loop_i;
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_NOTE, vect_location,
+				     "***** Re-trying analysis for unrolling"
+				     " with unroll factor %u.\n",
+				     suggested_unroll_factor);
+		  continue;
+		}
 	    }
 	  else
 	    {
@@ -3158,10 +3197,33 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
 
+  if (first_loop_vinfo->suggested_unroll_factor > 1)
+    {
+      if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "***** Re-trying analysis with first vector mode"
+			     " %s for epilogue with partial vectors of"
+			     " unrolled first loop.\n",
+			     GET_MODE_NAME (vector_modes[0]));
+	  mode_i = 0;
+	}
+      else
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "***** Re-trying analysis with same vector mode"
+			     " %s for epilogue of unrolled first loop.\n",
+			     GET_MODE_NAME (first_loop_vinfo->vector_mode));
+	  mode_i = first_loop_i;
+	}
+    }
+
   /* Handle the case that the original loop can use partial
      vectorization, but want to only adopt it for the epilogue.
      The retry should be in the same mode as original.  */
-  if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
+  else if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
     {
       gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo)
 		  && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (first_loop_vinfo));
@@ -3182,6 +3244,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
   /* ???  If first_loop_vinfo was using VOIDmode then we probably
      want to instead search for the corresponding mode in vector_modes[].  */
 
+  poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
   while (1)
     {
       bool fatal;
@@ -3193,6 +3256,22 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
       if (fatal)
 	break;
 
+      if (loop_vinfo
+	  && known_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), first_vinfo_vf)
+	  && !(LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+	       && maybe_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+			    first_vinfo_vf)))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "***** Will not use %s mode as an"
+			     " epilogue, since it leads to a higher"
+			     " vectorization factor than main loop\n",
+			     GET_MODE_NAME (loop_vinfo->vector_mode));
+	  delete loop_vinfo;
+	  loop_vinfo = opt_loop_vec_info::success (NULL);
+	}
+
       if (loop_vinfo)
 	{
 	  if (pick_lowest_cost_p)
@@ -3905,7 +3984,8 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
 static void
 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 				    int *ret_min_profitable_niters,
-				    int *ret_min_profitable_estimate)
+				    int *ret_min_profitable_estimate,
+				    unsigned *suggested_unroll_factor)
 {
   int min_profitable_iters;
   int min_profitable_estimate;
@@ -4265,8 +4345,23 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     }
 
   /* Complete the target-specific cost calculations.  */
-  finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
-	       &vec_inside_cost, &vec_epilogue_cost);
+  finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo),
+	       &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
+	       suggested_unroll_factor);
+
+  if (suggested_unroll_factor && *suggested_unroll_factor > 1
+      && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
+      && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+		    * *suggested_unroll_factor,
+		    LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't unroll as unrolled vectorization factor larger"
+			 " than maximum vectorization factor: %wu\n",
+			 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
+      *suggested_unroll_factor = 1;
+    }
 
   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
 
@@ -7255,7 +7350,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
      participating.  */
   if (ncopies > 1
       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
-      && reduc_chain_length == 1)
+      && reduc_chain_length == 1
+      && loop_vinfo->suggested_unroll_factor == 1)
     single_defuse_cycle = true;
 
   if (single_defuse_cycle || lane_reduc_code_p)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b552e9dccce5bce6a3bbcf5d531e7ccefa719b9a..238e0b4bf21871c518da73a79320f34e55c9201c 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -624,6 +624,13 @@ public:
      about the reductions that generated them.  */
   hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
 
+  /* The number of times that the target suggested we unroll the vector loop
+     in order to promote more ILP.  This value will be used to re-analyze the
+     loop for vectorization and if successful the value will be folded into
+     vectorization_factor (and therefore exactly divides
+     vectorization_factor).  */
+  unsigned int suggested_unroll_factor;
+
   /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR
      if there is no particular limit.  */
   unsigned HOST_WIDE_INT max_vectorization_factor;
@@ -1430,6 +1437,7 @@ public:
   unsigned int prologue_cost () const;
   unsigned int body_cost () const;
   unsigned int epilogue_cost () const;
+  unsigned int suggested_unroll_factor () const;
 
 protected:
   unsigned int record_stmt_cost (stmt_vec_info, vect_cost_model_location,
@@ -1447,6 +1455,9 @@ protected:
   /* The costs of the three regions, indexed by vect_cost_model_location.  */
   unsigned int m_costs[3];
 
+  /* The suggested unrolling factor determined at finish_cost.  */
+  unsigned int m_suggested_unroll_factor;
+
   /* True if finish_cost has been called.  */
   bool m_finished;
 };
@@ -1459,6 +1470,7 @@ vector_costs::vector_costs (vec_info *vinfo, bool costing_for_scalar)
   : m_vinfo (vinfo),
     m_costing_for_scalar (costing_for_scalar),
     m_costs (),
+    m_suggested_unroll_factor (1),
     m_finished (false)
 {
 }
@@ -1490,6 +1502,15 @@ vector_costs::epilogue_cost () const
   return m_costs[vect_epilogue];
 }
 
+/* Return the suggested unroll factor.  */
+
+inline unsigned int
+vector_costs::suggested_unroll_factor () const
+{
+  gcc_checking_assert (m_finished);
+  return m_suggested_unroll_factor;
+}
+
 #define VECT_MAX_COST 1000
 
 /* The maximum number of intermediate steps required in multi-step type
@@ -1665,12 +1686,15 @@ add_stmt_cost (vector_costs *costs, stmt_info_for_cost *i)
 
 static inline void
 finish_cost (vector_costs *costs, unsigned *prologue_cost,
-	     unsigned *body_cost, unsigned *epilogue_cost)
+	     unsigned *body_cost, unsigned *epilogue_cost,
+	     unsigned *suggested_unroll_factor = NULL)
 {
   costs->finish_cost ();
   *prologue_cost = costs->prologue_cost ();
   *body_cost = costs->body_cost ();
   *epilogue_cost = costs->epilogue_cost ();
+  if (suggested_unroll_factor)
+    *suggested_unroll_factor = costs->suggested_unroll_factor ();
 }
 
 inline void