diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 009afc5f894..86ef584e09b 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -5177,6 +5177,34 @@ rs6000_add_stmt_cost (class vec_info *vinfo, void *data, int count, return retval; } +/* For some target specific vectorization cost which can't be handled per stmt, + we check the requisite conditions and adjust the vectorization cost + accordingly if satisfied. One typical example is to model shift cost for + vector with length by counting number of required lengths under condition + LOOP_VINFO_FULLY_WITH_LENGTH_P. */ + +static void +rs6000_adjust_vect_cost_per_loop (rs6000_cost_data *data) +{ + struct loop *loop = data->loop_info; + gcc_assert (loop); + loop_vec_info loop_vinfo = loop_vec_info_for_loop (loop); + + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) + { + rgroup_controls *rgc; + unsigned int num_vectors_m1; + unsigned int shift_cnt = 0; + FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) + if (rgc->type) + /* Each length needs one shift to fill into bits 0-7. */ + shift_cnt += num_vectors_m1 + 1; + + rs6000_add_stmt_cost (loop_vinfo, (void *) data, shift_cnt, scalar_stmt, + NULL, NULL_TREE, 0, vect_body); + } +} + /* Implement targetm.vectorize.finish_cost. */ static void @@ -5186,7 +5214,10 @@ rs6000_finish_cost (void *data, unsigned *prologue_cost, rs6000_cost_data *cost_data = (rs6000_cost_data*) data; if (cost_data->loop_info) - rs6000_density_test (cost_data); + { + rs6000_adjust_vect_cost_per_loop (cost_data); + rs6000_density_test (cost_data); + } /* Don't vectorize minimum-vectorization-factor, simple copy loops that require versioning for any reason. The vectorization is at diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index 490e7befea3..9d0e3fc525e 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -412,7 +412,10 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, This means that we cannot guarantee that such an induction variable would ever hit a value that produces a set of all-false masks or zero - lengths for RGC. */ + lengths for RGC. + + Note that please check cost modeling whether needed to be updated in + function vect_estimate_min_profitable_iters if any changes. */ static tree vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 06cde4b1da3..a00160a7f2d 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -3798,6 +3798,70 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1, vector_stmt, NULL, NULL_TREE, 0, vect_body); } + else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) + { + /* Referring to the functions vect_set_loop_condition_partial_vectors + and vect_set_loop_controls_directly, we need to generate each + length in the prologue and in the loop body if required. Although + there are some possible optimizations, we consider the worst case + here. */ + + /* For wrap around checking. */ + tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo); + unsigned int compare_precision = TYPE_PRECISION (compare_type); + widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo); + + bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo); + bool need_iterate_p + = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && !vect_known_niters_smaller_than_vf (loop_vinfo)); + + /* Init min/max, shift and minus cost relative to single + scalar_stmt. For now we only use length-based partial vectors on + Power, target specific cost tweaking may be needed for other + ports in future. */ + unsigned int min_max_cost = 2; + unsigned int shift_cost = 1, minus_cost = 1; + + /* Init cost relative to single scalar_stmt. */ + unsigned int prologue_cnt = 0; + unsigned int body_cnt = 0; + + rgroup_controls *rgc; + unsigned int num_vectors_m1; + FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) + if (rgc->type) + { + unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor; + + /* May need one shift for nitems_total computation. */ + if (nitems != 1 && !niters_known_p) + prologue_cnt += shift_cost; + + /* Need to handle wrap around. */ + if (iv_limit == -1 + || (wi::min_precision (iv_limit * nitems, UNSIGNED) + > compare_precision)) + prologue_cnt += (min_max_cost + minus_cost); + + /* Need to handle batch limit excepting for the 1st one. */ + prologue_cnt += (min_max_cost + minus_cost) * num_vectors_m1; + + unsigned int num_vectors = num_vectors_m1 + 1; + /* Need to set up lengths in prologue, only one MIN required + since start index is zero. */ + prologue_cnt += min_max_cost * num_vectors; + + /* Need to update lengths in body for next iteration. */ + if (need_iterate_p) + body_cnt += (2 * min_max_cost + minus_cost) * num_vectors; + } + + (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_cnt, + scalar_stmt, NULL, NULL_TREE, 0, vect_prologue); + (void) add_stmt_cost (loop_vinfo, target_cost_data, body_cnt, scalar_stmt, + NULL, NULL_TREE, 0, vect_body); + } /* FORNOW: The scalar outside cost is incremented in one of the following ways: @@ -3932,8 +3996,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, } /* ??? The "if" arm is written to handle all cases; see below for what - we would do for !LOOP_VINFO_FULLY_MASKED_P. */ - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */ + if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) { /* Rewriting the condition above in terms of the number of vector iterations (vniters) rather than the number of @@ -3960,7 +4024,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n", min_vec_niters); - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) { /* Now that we know the minimum number of vector iterations, find the minimum niters for which the scalar cost is larger: @@ -4015,6 +4079,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, && min_profitable_iters < (assumed_vf + peel_iters_prologue)) /* We want the vectorized loop to execute at least once. */ min_profitable_iters = assumed_vf + peel_iters_prologue; + else if (min_profitable_iters < peel_iters_prologue) + /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the + vectorized loop to execute at least once. */ + min_profitable_iters = peel_iters_prologue; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -4032,7 +4100,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, if (vec_outside_cost <= 0) min_profitable_estimate = 0; - else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) { /* This is a repeat of the code above, but with + SOC rather than - SOC. */ @@ -4044,7 +4112,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, if (outside_overhead > 0) min_vec_niters = outside_overhead / saving_per_viter + 1; - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) { int threshold = (vec_inside_cost * min_vec_niters + vec_outside_cost diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 31af46ae19c..8550a252f44 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -12090,7 +12090,10 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info, min_of_start_and_end = min (START_INDEX, END_INDEX); left_len = END_INDEX - min_of_start_and_end; rhs = min (left_len, LEN_LIMIT); - LEN = rhs; */ + LEN = rhs; + + Note that please check cost modeling whether needed to be updated in + function vect_estimate_min_profitable_iters if any changes. */ gimple_seq vect_gen_len (tree len, tree start_index, tree end_index, tree len_limit)