diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 36519ccc5a58abab483c38d0a6c5f039592bfc7f..9b1e01e9b62050d7e34bc55454771e40bdbdb4cb 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -15972,8 +15972,8 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) /* Implement TARGET_VECTORIZE_FINISH_COST. */ static void -aarch64_finish_cost (void *data, unsigned *prologue_cost, - unsigned *body_cost, unsigned *epilogue_cost) +aarch64_finish_cost (void *data, unsigned *prologue_cost, unsigned *body_cost, + unsigned *epilogue_cost, unsigned *suggested_unroll_factor) { auto *costs = static_cast (data); *prologue_cost = costs->region[vect_prologue]; @@ -15984,6 +15984,9 @@ aarch64_finish_cost (void *data, unsigned *prologue_cost, && costs->vec_flags && aarch64_use_new_vector_costs_p ()) *body_cost = aarch64_adjust_body_cost (costs, *body_cost); + + if(suggested_unroll_factor) + *suggested_unroll_factor = 1; } /* Implement TARGET_VECTORIZE_DESTROY_COST_DATA. */ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index afc2674d49da370ae0f5ef277df7e9954f303b8e..a48e43879512793907fef946c1575c3ed7f68092 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -23048,13 +23048,15 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, /* Implement targetm.vectorize.finish_cost. */ static void -ix86_finish_cost (void *data, unsigned *prologue_cost, - unsigned *body_cost, unsigned *epilogue_cost) +ix86_finish_cost (void *data, unsigned *prologue_cost, unsigned *body_cost, + unsigned *epilogue_cost, unsigned *suggested_unroll_factor) { unsigned *cost = (unsigned *) data; *prologue_cost = cost[vect_prologue]; *body_cost = cost[vect_body]; *epilogue_cost = cost[vect_epilogue]; + if (suggested_unroll_factor) + *suggested_unroll_factor = 1; } /* Implement targetm.vectorize.destroy_cost_data. 
*/ diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index ad81dfb316dff00cde810d6b1edd31fa49d5c1e8..59d30ad6fcd1758383c52e34a0f90a126c501ec3 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -5551,8 +5551,8 @@ rs6000_adjust_vect_cost_per_loop (rs6000_cost_data *data) /* Implement targetm.vectorize.finish_cost. */ static void -rs6000_finish_cost (void *data, unsigned *prologue_cost, - unsigned *body_cost, unsigned *epilogue_cost) +rs6000_finish_cost (void *data, unsigned *prologue_cost, unsigned *body_cost, + unsigned *epilogue_cost, unsigned *suggested_unroll_factor) { rs6000_cost_data *cost_data = (rs6000_cost_data*) data; @@ -5578,6 +5578,8 @@ rs6000_finish_cost (void *data, unsigned *prologue_cost, *prologue_cost = cost_data->cost[vect_prologue]; *body_cost = cost_data->cost[vect_body]; *epilogue_cost = cost_data->cost[vect_epilogue]; + if (suggested_unroll_factor) + *suggested_unroll_factor = 1; } /* Implement targetm.vectorize.destroy_cost_data. */ diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index be8148583d8571b0d035b1938db9d056bfd213a8..c584260b02c3e8d4fcd7b31c38321d5f81a71428 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6276,11 +6276,12 @@ return value should be viewed as a tentative cost that may later be revised. @end deftypefn -@deftypefn {Target Hook} void TARGET_VECTORIZE_FINISH_COST (void *@var{data}, unsigned *@var{prologue_cost}, unsigned *@var{body_cost}, unsigned *@var{epilogue_cost}) +@deftypefn {Target Hook} void TARGET_VECTORIZE_FINISH_COST (void *@var{data}, unsigned *@var{prologue_cost}, unsigned *@var{body_cost}, unsigned *@var{epilogue_cost}, unsigned *@var{suggested_unroll_factor}) This hook should complete calculations of the cost of vectorizing a loop or basic block based on @var{data}, and return the prologue, body, and -epilogue costs as unsigned integers. The default returns the value of -the three accumulators. +epilogue costs as unsigned integers. 
If @var{suggested_unroll_factor} is +nonnull, the hook also stores the backend's suggested unroll factor there. +The default returns the three cost accumulators and an unroll factor of 1. @end deftypefn @deftypefn {Target Hook} void TARGET_VECTORIZE_DESTROY_COST_DATA (void *@var{data}) diff --git a/gcc/target.def b/gcc/target.def index bfa819609c21bd71c0cc585c01dba42534453f47..df0f170ff3378671e802d82a8bce8e153d8cf8fe 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -2078,11 +2078,12 @@ DEFHOOK (finish_cost, "This hook should complete calculations of the cost of vectorizing a loop\n\ or basic block based on @var{data}, and return the prologue, body, and\n\ -epilogue costs as unsigned integers. The default returns the value of\n\ -the three accumulators.", +epilogue costs as unsigned integers. If @var{suggested_unroll_factor} is\n\ +nonnull, the hook also stores the backend's suggested unroll factor there.\n\ +The default returns the three cost accumulators and an unroll factor of 1.", void, (void *data, unsigned *prologue_cost, unsigned *body_cost, - unsigned *epilogue_cost), + unsigned *epilogue_cost, unsigned *suggested_unroll_factor), default_finish_cost) /* Function to delete target-specific cost modeling data. */ diff --git a/gcc/targhooks.h b/gcc/targhooks.h index 92d51992e625c2497aa8496b1e2e3d916e5706fd..b9697c366876fe5a8c444ffcf58bdc6b5c33b0ad 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -123,7 +123,8 @@ extern unsigned default_add_stmt_cost (class vec_info *, void *, int, enum vect_cost_for_stmt, class _stmt_vec_info *, tree, int, enum vect_cost_model_location); -extern void default_finish_cost (void *, unsigned *, unsigned *, unsigned *); +extern void default_finish_cost (void *, unsigned *, unsigned *, unsigned *, + unsigned *); extern void default_destroy_cost_data (void *); /* OpenACC hooks. 
*/ diff --git a/gcc/targhooks.c b/gcc/targhooks.c index c9b5208853dbc15706a65d1eb335e28e0564325e..8552d9a0f144e7bcee3f2653f2ea84ea677f80a2 100644 --- a/gcc/targhooks.c +++ b/gcc/targhooks.c @@ -1518,13 +1518,18 @@ default_add_stmt_cost (class vec_info *vinfo, void *data, int count, /* By default, the cost model just returns the accumulated costs. */ void -default_finish_cost (void *data, unsigned *prologue_cost, - unsigned *body_cost, unsigned *epilogue_cost) +default_finish_cost (void *data, + unsigned *prologue_cost, unsigned *body_cost, + unsigned *epilogue_cost, + unsigned *suggested_unroll_factor) { unsigned *cost = (unsigned *) data; *prologue_cost = cost[vect_prologue]; *body_cost = cost[vect_body]; *epilogue_cost = cost[vect_epilogue]; + /* Do not unroll. */ + if (suggested_unroll_factor) + *suggested_unroll_factor = 1; } /* Free the cost data. */ diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 5a5b8da2e771a1dd204f22a6447eba96bb3b352c..1bfe2e4f989143f4415c6c5b4a0b902ef1e00d66 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -153,7 +153,8 @@ along with GCC; see the file COPYING3. 
If not see http://gcc.gnu.org/projects/tree-ssa/vectorization.html */ -static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *); +static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *, + unsigned *); static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info, bool *, bool *); @@ -277,7 +278,8 @@ vect_determine_vf_for_stmt (vec_info *vinfo, */ static opt_result -vect_determine_vectorization_factor (loop_vec_info loop_vinfo) +vect_determine_vectorization_factor (loop_vec_info loop_vinfo, + poly_uint64 min_vf) { class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); @@ -354,6 +356,28 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) } } + /* Apply the suggested unrolling factor, this was determined by the backend + during finish_cost the first time we ran the analyzis for this + vector mode. */ + if (loop_vinfo->suggested_unroll_factor > 1) + { + poly_uint64 unrolled_vf + = vectorization_factor * loop_vinfo->suggested_unroll_factor; + unsigned HOST_WIDE_INT max_vf = estimated_poly_value (unrolled_vf, + POLY_VALUE_MAX); + /* Make sure the unrolled vectorization factor fits the min and max + vectorization factor. */ + if (max_vf <= LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) + && known_le (unrolled_vf, min_vf)) + vectorization_factor = unrolled_vf; + else if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Unrolling failed due to unroll factor not fitting in" + " range of min and max vectorization factor:" + " [%d, %d]\n", + min_vf, max_vf); + } + /* TODO: Analyze cost. Decide if worth while to vectorize. 
*/ if (dump_enabled_p ()) { @@ -828,6 +852,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) skip_main_loop_edge (nullptr), skip_this_loop_edge (nullptr), reusable_accumulators (), + suggested_unroll_factor (1), max_vectorization_factor (0), mask_skip_niters (NULL_TREE), rgroup_compare_type (NULL_TREE), @@ -1829,7 +1854,8 @@ vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo) definitely no, or -1 if it's worth retrying. */ static int -vect_analyze_loop_costing (loop_vec_info loop_vinfo) +vect_analyze_loop_costing (loop_vec_info loop_vinfo, + unsigned *suggested_unroll_factor) { class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); @@ -1863,7 +1889,8 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo) int min_profitable_iters, min_profitable_estimate; vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters, - &min_profitable_estimate); + &min_profitable_estimate, + suggested_unroll_factor); if (min_profitable_iters < 0) { @@ -2128,10 +2155,16 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo, vectors to the epilogue, with the main loop continuing to operate on full vectors. + If we are unrolling we also do not want to use partial vectors. This + is to avoid the overhead of generating multiple masks and also to + avoid having to execute entire iterations of FALSE masked instructions + when dealing with one or less full iterations. + ??? We could then end up failing to use partial vectors if we decide to peel iterations into a prologue, and if the main loop then ends up processing fewer than VF iterations. 
*/ - if (param_vect_partial_vector_usage == 1 + if ((param_vect_partial_vector_usage == 1 + || loop_vinfo->suggested_unroll_factor > 1) && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) && !vect_known_niters_smaller_than_vf (loop_vinfo)) LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true; @@ -2198,13 +2231,16 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo, for it. The different analyses will record information in the loop_vec_info struct. */ static opt_result -vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) +vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts, + unsigned *suggested_unroll_factor, + poly_uint64 min_vf = 2) { opt_result ok = opt_result::success (); int res; unsigned int max_vf = MAX_VECTORIZATION_FACTOR; - poly_uint64 min_vf = 2; loop_vec_info orig_loop_vinfo = NULL; + if (*suggested_unroll_factor > 1) + max_vf = estimated_poly_value (min_vf, POLY_VALUE_MAX); /* If we are dealing with an epilogue then orig_loop_vinfo points to the loop_vec_info of the first vectorized loop. 
*/ @@ -2308,11 +2344,12 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) return ok; } if (max_vf != MAX_VECTORIZATION_FACTOR - && maybe_lt (max_vf, min_vf)) + && loop_vinfo->suggested_unroll_factor == 1 + && max_vf < estimated_poly_value (min_vf, POLY_VALUE_MAX)) return opt_result::failure_at (vect_location, "bad data dependence.\n"); LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf; - ok = vect_determine_vectorization_factor (loop_vinfo); + ok = vect_determine_vectorization_factor (loop_vinfo, min_vf); if (!ok) { if (dump_enabled_p ()) @@ -2321,7 +2358,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) return ok; } if (max_vf != MAX_VECTORIZATION_FACTOR - && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo))) + && loop_vinfo->suggested_unroll_factor == 1 + && max_vf < estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo), + POLY_VALUE_MAX)) return opt_result::failure_at (vect_location, "bad data dependence.\n"); /* Compute the scalar iteration cost. */ @@ -2547,7 +2586,7 @@ start_over: return ok; /* Check the costings of the loop make vectorizing worthwhile. */ - res = vect_analyze_loop_costing (loop_vinfo); + res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor); if (res < 0) { ok = opt_result::failure_at (vect_location, @@ -2879,6 +2918,122 @@ vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo, return true; } +/* Determine whether we can unroll this loop. 
*/
+/* LOOP_VINFO is rejected when its vectorization factor is known to be at
+   most 1, when a relevant data reference that is not a gather/scatter has
+   a negative constant step, or when a relevant reduction is not a
+   TREE_CODE_REDUCTION.  Each rejection is reported in the dump file.  */
+
+static bool
+vect_can_unroll (loop_vec_info loop_vinfo)
+{
+  stmt_vec_info stmt_info;
+  unsigned i;
+  poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+  if (known_le (vectorization_factor, 1U))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "will not unroll loop with a VF of 1 or less\n");
+      return false;
+    }
+
+  FOR_EACH_VEC_ELT (loop_vinfo->stmt_vec_infos, i, stmt_info)
+    {
+      /* Only statements that are relevant, have a vector type and are not
+	 replaced by a pattern can veto unrolling.  */
+      if (STMT_VINFO_IN_PATTERN_P (stmt_info)
+	  || !STMT_VINFO_RELEVANT_P (stmt_info)
+	  || stmt_info->vectype == NULL_TREE)
+	continue;
+      /* Do not unroll loops with negative steps as it is unlikely that
+	 vectorization will succeed due to the way we deal with negative steps
+	 in loads and stores in 'get_load_store_type'.  */
+      if (stmt_info->dr_aux.dr
+	  && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+	{
+	  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
+	  tree step = vect_dr_behavior (loop_vinfo, dr_info)->step;
+	  if (TREE_CODE (step) == INTEGER_CST
+	      && tree_int_cst_compare (step, size_zero_node) < 0)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "could not unroll due to negative step\n");
+	      return false;
+	    }
+	}
+
+      if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
+	{
+	  auto red_info = info_for_reduction (loop_vinfo, stmt_info);
+	  if (STMT_VINFO_REDUC_TYPE (red_info) != TREE_CODE_REDUCTION)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "could not unroll loop with reduction due to "
+				 "non TREE_CODE_REDUCTION\n");
+	      return false;
+	    }
+	}
+    }
+
+  return true;
+}
+
+
+/* Try to unroll the loop described by LOOP_VINFO by SUGGESTED_UNROLL_FACTOR,
+   the factor the target returned from TARGET_VECTORIZE_FINISH_COST when
+   LOOP_VINFO was costed.  The loop is re-analyzed with LOOP_VINFO's vector
+   mode and a minimum vectorization factor of LOOP_VINFO's VF multiplied by
+   the factor.  N_STMTS is passed through to vect_analyze_loop_2.
+
+   Return the re-analyzed loop_vec_info on success.  Return a failure when
+   the factor is 1, when vect_can_unroll rejects the loop, or when the
+   re-analysis fails or leaves the vectorization factor unchanged; in the
+   last case the rejected loop_vec_info is freed here.  LOOP_VINFO itself is
+   never freed by this function.
+
+   NOTE(review): on success the caller holds both LOOP_VINFO and the
+   returned loop_vec_info; confirm that it frees the one it drops.  */
+
+static opt_loop_vec_info
+vect_try_unrolling (loop_vec_info loop_vinfo, unsigned *n_stmts,
+		    unsigned suggested_unroll_factor)
+{
+  DUMP_VECT_SCOPE ("vect_try_unrolling");
+
+  if (suggested_unroll_factor == 1)
+    return opt_loop_vec_info::failure_at (vect_location,
+					  "*** Target determined unrolling is"
+					  " not profitable.\n");
+
+  if (!vect_can_unroll (loop_vinfo))
+    return opt_loop_vec_info::failure_at (vect_location,
+					  "*** Can not unroll this loop.\n");
+
+  loop_vec_info unrolled_vinfo
+    = opt_loop_vec_info::success (vect_analyze_loop_form (loop_vinfo->loop,
+							  loop_vinfo->shared));
+  /* Do not dereference the result of a failed loop form analysis.  */
+  if (!unrolled_vinfo)
+    return opt_loop_vec_info::failure_at (vect_location,
+					  "*** Could not analyze loop form"
+					  " for unrolling.\n");
+  unrolled_vinfo->vector_mode = loop_vinfo->vector_mode;
+
+  /* The re-analysis below is handed &SUGGESTED_UNROLL_FACTOR and its costing
+     step stores a new suggestion through that pointer, so remember the
+     factor that is actually being tried for the diagnostics.  */
+  const unsigned requested_factor = suggested_unroll_factor;
+
+  /* Use the suggested_unrolling_factor that was returned at the target's
+     TARGET_VECTORIZE_FINISH_COST hook.  */
+  unrolled_vinfo->suggested_unroll_factor = requested_factor;
+  poly_uint64 unrolled_vf
+    = LOOP_VINFO_VECT_FACTOR (loop_vinfo) * requested_factor;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "***** unrolling factor %u chosen for vector mode %s,"
+		     " re-trying analysis...\n",
+		     requested_factor,
+		     GET_MODE_NAME (unrolled_vinfo->vector_mode));
+  bool unrolling_fatal = false;
+  if (vect_analyze_loop_2 (unrolled_vinfo, unrolling_fatal, n_stmts,
+			   &suggested_unroll_factor,
+			   unrolled_vf)
+      && known_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+		   LOOP_VINFO_VECT_FACTOR (unrolled_vinfo)))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "unrolling succeeded with factor = %u\n",
+			 requested_factor);
+      unrolled_vinfo->loop->aux = NULL;
+      return opt_loop_vec_info::success (unrolled_vinfo);
+    }
+
+  /* The rejected re-analysis is not handed to anyone, so free it here
+     instead of leaking it.  */
+  delete unrolled_vinfo;
+  loop_vinfo->loop->aux = NULL;
+  return opt_loop_vec_info::failure_at (vect_location,
+					"unrolling failed with factor = %u\n",
+					requested_factor);
+}
+
 /* If LOOP_VINFO is already a main loop, return it unmodified.  Otherwise
    try to reanalyze it as a main loop.  Return the loop_vinfo on success
    and null on failure.
*/ @@ -2902,8 +3057,16 @@ vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts) main_loop_vinfo->vector_mode = loop_vinfo->vector_mode; bool fatal = false; - bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts); + unsigned suggested_unroll_factor = 1; + bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts, + &suggested_unroll_factor); loop->aux = NULL; + opt_loop_vec_info unrolled_vinfo + = opt_loop_vec_info::success (vect_try_unrolling (main_loop_vinfo, n_stmts, + suggested_unroll_factor)); + if (unrolled_vinfo) + main_loop_vinfo = unrolled_vinfo; + if (!res) { if (dump_enabled_p ()) @@ -2960,6 +3123,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) bool vect_epilogues = false; opt_result res = opt_result::success (); unsigned HOST_WIDE_INT simdlen = loop->simdlen; + unsigned suggested_unroll_factor = 1; while (1) { /* Check the CFG characteristics of the loop (nesting, entry/exit). */ @@ -3007,7 +3171,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) if (vect_epilogues) LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo; - res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts); + res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts, + &suggested_unroll_factor); if (mode_i == 0) autodetected_vector_mode = loop_vinfo->vector_mode; if (dump_enabled_p ()) @@ -3038,6 +3203,18 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) if (res) { + /* Only try unrolling main loops. */ + if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)) + { + opt_loop_vec_info unrolled_vinfo = + vect_try_unrolling (loop_vinfo, &n_stmts, + suggested_unroll_factor); + if (unrolled_vinfo) + loop_vinfo = unrolled_vinfo; + /* Reset suggested_unroll_factor for next loop_vinfo. 
*/ + suggested_unroll_factor = 1; + } + LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; vectorized_loops++; @@ -3056,13 +3233,26 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) /* Keep trying to roll back vectorization attempts while the loop_vec_infos they produced were worse than this one. */ vec &vinfos = first_loop_vinfo->epilogue_vinfos; + poly_uint64 vinfo_vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + poly_uint64 first_vinfo_vf + = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo); while (!vinfos.is_empty () + && (known_lt (vinfo_vf, first_vinfo_vf) + || (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) + && maybe_eq (vinfo_vf, first_vinfo_vf))) && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ())) { gcc_assert (vect_epilogues); delete vinfos.pop (); } + /* Check if we may want to replace the current first_loop_vinfo + with the new loop, but only if they have different vector + modes. If they have the same vector mode this means the main + loop is an unrolled loop and we are trying to vectorize the + epilogue using the same vector mode but with a lower + vectorization factor. */ if (vinfos.is_empty () + && loop_vinfo->vector_mode != first_loop_vinfo->vector_mode && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo)) { loop_vec_info main_loop_vinfo @@ -3105,14 +3295,34 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) /* For now only allow one epilogue loop. */ && first_loop_vinfo->epilogue_vinfos.is_empty ()) { - first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo); - poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); - gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo) - || maybe_ne (lowest_th, 0U)); - /* Keep track of the known smallest versioning - threshold. */ - if (ordered_p (lowest_th, th)) - lowest_th = ordered_min (lowest_th, th); + /* Ensure the epilogue has a smaller VF than the main loop or + uses predication and has the same VF. 
*/ + if (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo), + LOOP_VINFO_VECT_FACTOR (first_loop_vinfo)) + || (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) + && maybe_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), + LOOP_VINFO_VECT_FACTOR (first_loop_vinfo)))) + { + first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo); + poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); + gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo) + || maybe_ne (lowest_th, 0U)); + /* Keep track of the known smallest versioning + threshold. */ + if (ordered_p (lowest_th, th)) + lowest_th = ordered_min (lowest_th, th); + } + else + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** Will not use %s mode as an" + " epilogue, since it leads to an higher" + " vectorization factor than main loop\n", + GET_MODE_NAME (loop_vinfo->vector_mode)); + delete loop_vinfo; + loop_vinfo = opt_loop_vec_info::success (NULL); + } } else { @@ -3153,13 +3363,32 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) /* Handle the case that the original loop can use partial vectorization, but want to only adopt it for the epilogue. - The retry should be in the same mode as original. */ + The retry should be in the same mode as original. + Also handle the case where we have unrolled the main loop and want to + retry all vector modes again for the epilogues, since the VF is now + at least twice as high as the current vector mode. */ if (vect_epilogues && loop_vinfo - && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo)) + && (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) + || loop_vinfo->suggested_unroll_factor > 1)) { - gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) + gcc_assert ((LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) + || loop_vinfo->suggested_unroll_factor > 1) && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)); + /* If we are unrolling, try all VECTOR_MODES for the epilogue. 
*/ + if (loop_vinfo->suggested_unroll_factor > 1) + { + next_vector_mode = vector_modes[0]; + mode_i = 1; + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** Re-trying analysis with vector mode" + " %s for epilogues after unrolling.\n", + GET_MODE_NAME (next_vector_mode)); + continue; + } + if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "***** Re-trying analysis with same vector mode" @@ -3862,7 +4091,8 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, static void vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, int *ret_min_profitable_niters, - int *ret_min_profitable_estimate) + int *ret_min_profitable_estimate, + unsigned *suggested_unroll_factor) { int min_profitable_iters; int min_profitable_estimate; @@ -4222,8 +4452,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, } /* Complete the target-specific cost calculations. */ - finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost, - &vec_inside_cost, &vec_epilogue_cost); + finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), + &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost, + suggested_unroll_factor); vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); @@ -7212,7 +7443,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo, participating. 
*/ if (ncopies > 1 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) - && reduc_chain_length == 1) + && reduc_chain_length == 1 + && loop_vinfo->suggested_unroll_factor == 1) single_defuse_cycle = true; if (single_defuse_cycle || lane_reduc_code_p) diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 024a1c38a2342246d7891db1de5f1d6e6458d5dd..a8a6c6a19ed4c98144f9097467c59386fdbe8233 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -5418,8 +5418,8 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, } while (vi < li_vector_costs.length () && li_vector_costs[vi].first == vl); - finish_cost (vect_target_cost_data, &vec_prologue_cost, - &vec_inside_cost, &vec_epilogue_cost); + finish_cost (vect_target_cost_data, &vec_prologue_cost, &vec_inside_cost, + &vec_epilogue_cost); destroy_cost_data (vect_target_cost_data); vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index c4c5678e7f1abafc25c465319dbacf3ef50f0ae9..8b182cd34e7d6a8d9e55a9c1003900b8216a952f 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -621,6 +621,13 @@ public: about the reductions that generated them. */ hash_map reusable_accumulators; + /* The number of times that the target suggested we unroll the vector loop + in order to promote more ILP. This value will be used to re-analyze the + loop for vectorization and if successful the value will be folded into + vectorization_factor (and therefore exactly divides + vectorization_factor). */ + unsigned int suggested_unroll_factor; + /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR if there is no particular limit. 
*/ unsigned HOST_WIDE_INT max_vectorization_factor; @@ -1571,9 +1578,11 @@ add_stmt_cost (vec_info *vinfo, void *data, stmt_info_for_cost *i) static inline void finish_cost (void *data, unsigned *prologue_cost, - unsigned *body_cost, unsigned *epilogue_cost) + unsigned *body_cost, unsigned *epilogue_cost, + unsigned *suggested_unroll_factor = NULL) { - targetm.vectorize.finish_cost (data, prologue_cost, body_cost, epilogue_cost); + targetm.vectorize.finish_cost (data, prologue_cost, body_cost, epilogue_cost, + suggested_unroll_factor); } /* Alias targetm.vectorize.destroy_cost_data. */