--- gcc/doc/invoke.texi | 7 ++ gcc/params.opt | 4 + gcc/tree-vect-loop-manip.c | 97 ++++++++++----- gcc/tree-vect-loop.c | 243 ++++++++++++++++++++++++++++++++++++- gcc/tree-vect-stmts.c | 155 ++++++++++++++++++++--- gcc/tree-vectorizer.h | 43 ++++++- 6 files changed, 494 insertions(+), 55 deletions(-) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 8b9935dfe65..ac765feab13 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -13079,6 +13079,13 @@ by the copy loop headers pass. @item vect-epilogues-nomask Enable loop epilogue vectorization using smaller vector size. +@item vect-with-length-scope +Control the scope of vector memory access with length exploitation. 0 means we +don't exploit any vector memory access with length, 1 means we only exploit +vector memory access with length for those loops whose iteration count is +less than VF, such as a very small loop or an epilogue, 2 means we want to +exploit vector memory access with length for any loop if possible. + @item slp-max-insns-in-bb Maximum number of instructions in basic block to be considered for SLP vectorization. diff --git a/gcc/params.opt b/gcc/params.opt index 4aec480798b..d4309101067 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -964,4 +964,8 @@ Bound on number of runtime checks inserted by the vectorizer's loop versioning f Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check. +-param=vect-with-length-scope= +Common Joined UInteger Var(param_vect_with_length_scope) Init(0) IntegerRange(0, 2) Param Optimization +Control the scope of vector memory access with length exploitation. + ; This comment is to ensure we retain the blank line above. diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index 1fac5898525..1eaf6e1c3ea 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -399,19 +399,20 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, It is known that: - NITERS * RGC->max_nscalars_per_iter + NITERS * RGC->max_nscalars_per_iter * RGC->factor does not overflow. However, MIGHT_WRAP_P says whether an induction variable that starts at 0 and has step: - VF * RGC->max_nscalars_per_iter + VF * RGC->max_nscalars_per_iter * RGC->factor might overflow before hitting a value above: - (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter + (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor This means that we cannot guarantee that such an induction variable - would ever hit a value that produces a set of all-false masks for RGC. */ + would ever hit a value that produces a set of all-false masks or zero + lengths for RGC. */ static tree vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, @@ -422,10 +423,20 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, { tree compare_type = LOOP_VINFO_COMPARE_TYPE (loop_vinfo); tree iv_type = LOOP_VINFO_IV_TYPE (loop_vinfo); + bool vect_for_masking = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); + tree ctrl_type = rgc->type; - unsigned int nscalars_per_iter = rgc->max_nscalars_per_iter; + /* Scale up nscalars per iteration with factor. 
*/ + unsigned int nscalars_per_iter_ft = rgc->max_nscalars_per_iter * rgc->factor; poly_uint64 nscalars_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type); poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + tree length_limit = NULL_TREE; + /* For length, we probably need length_limit to check length in range. */ + if (!vect_for_masking) + { + poly_uint64 len_limit = nscalars_per_ctrl * rgc->factor; + length_limit = build_int_cst (compare_type, len_limit); + } /* Calculate the maximum number of scalar values that the rgroup handles in total, the number that it handles for each iteration @@ -434,12 +445,12 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, tree nscalars_total = niters; tree nscalars_step = build_int_cst (iv_type, vf); tree nscalars_skip = niters_skip; - if (nscalars_per_iter != 1) + if (nscalars_per_iter_ft != 1) { /* We checked before choosing to use a partial vectorization loop that these multiplications don't overflow. */ - tree compare_factor = build_int_cst (compare_type, nscalars_per_iter); - tree iv_factor = build_int_cst (iv_type, nscalars_per_iter); + tree compare_factor = build_int_cst (compare_type, nscalars_per_iter_ft); + tree iv_factor = build_int_cst (iv_type, nscalars_per_iter_ft); nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type, nscalars_total, compare_factor); nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type, @@ -509,7 +520,7 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, NSCALARS_SKIP to that cannot overflow. */ tree const_limit = build_int_cst (compare_type, LOOP_VINFO_VECT_FACTOR (loop_vinfo) - * nscalars_per_iter); + * nscalars_per_iter_ft); first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type, nscalars_total, const_limit); first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type, @@ -549,16 +560,16 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, { /* Previous controls will cover BIAS scalars. This control covers the next batch. */ - poly_uint64 bias = nscalars_per_ctrl * i; + poly_uint64 batch_nscalars_ft = nscalars_per_ctrl * rgc->factor; + poly_uint64 bias = batch_nscalars_ft * i; tree bias_tree = build_int_cst (compare_type, bias); - gimple *tmp_stmt; /* See whether the first iteration of the vector loop is known to have a full control. 
*/ poly_uint64 const_limit; bool first_iteration_full = (poly_int_tree_p (first_limit, &const_limit) - && known_ge (const_limit, (i + 1) * nscalars_per_ctrl)); + && known_ge (const_limit, (i + 1) * batch_nscalars_ft)); /* Rather than have a new IV that starts at BIAS and goes up to TEST_LIMIT, prefer to use the same 0-based IV for each control @@ -598,9 +609,19 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, end = first_limit; } - init_ctrl = make_temp_ssa_name (ctrl_type, NULL, "max_mask"); - tmp_stmt = vect_gen_while (init_ctrl, start, end); - gimple_seq_add_stmt (preheader_seq, tmp_stmt); + if (vect_for_masking) + { + init_ctrl = make_temp_ssa_name (ctrl_type, NULL, "max_mask"); + gimple *tmp_stmt = vect_gen_while (init_ctrl, start, end); + gimple_seq_add_stmt (preheader_seq, tmp_stmt); + } + else + { + init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len"); + gimple_seq seq = vect_gen_len (init_ctrl, start, + end, length_limit); + gimple_seq_add_seq (preheader_seq, seq); + } } /* Now AND out the bits that are within the number of skipped @@ -617,16 +638,32 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, init_ctrl, unskipped_mask); else init_ctrl = unskipped_mask; + gcc_assert (vect_for_masking); } + /* First iteration is full. */ if (!init_ctrl) - /* First iteration is full. */ - init_ctrl = build_minus_one_cst (ctrl_type); + { + if (vect_for_masking) + init_ctrl = build_minus_one_cst (ctrl_type); + else + init_ctrl = length_limit; + } /* Get the control value for the next iteration of the loop. */ - next_ctrl = make_temp_ssa_name (ctrl_type, NULL, "next_mask"); - gcall *call = vect_gen_while (next_ctrl, test_index, this_test_limit); - gsi_insert_before (test_gsi, call, GSI_SAME_STMT); + if (vect_for_masking) + { + next_ctrl = make_temp_ssa_name (ctrl_type, NULL, "next_mask"); + gcall *call = vect_gen_while (next_ctrl, test_index, this_test_limit); + gsi_insert_before (test_gsi, call, GSI_SAME_STMT); + } + else + { + next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len"); + gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit, + length_limit); + gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT); + } vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl); } @@ -651,6 +688,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop, gimple_seq preheader_seq = NULL; gimple_seq header_seq = NULL; + bool vect_for_masking = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); tree compare_type = LOOP_VINFO_COMPARE_TYPE (loop_vinfo); unsigned int compare_precision = TYPE_PRECISION (compare_type); tree orig_niters = niters; @@ -685,28 +723,30 @@ vect_set_loop_condition_partial_vectors (class loop *loop, tree test_ctrl = NULL_TREE; rgroup_controls *rgc; unsigned int i; - vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); - FOR_EACH_VEC_ELT (*masks, i, rgc) + auto_vec<rgroup_controls> *controls = vect_for_masking + ? &LOOP_VINFO_MASKS (loop_vinfo) + : &LOOP_VINFO_LENS (loop_vinfo); + FOR_EACH_VEC_ELT (*controls, i, rgc) if (!rgc->controls.is_empty ()) { /* First try using permutes. This adds a single vector instruction to the loop for each mask, but needs no extra loop invariants or IVs. 
*/ unsigned int nmasks = i + 1; - if ((nmasks & 1) == 0) + if (vect_for_masking && (nmasks & 1) == 0) { - rgroup_controls *half_rgc = &(*masks)[nmasks / 2 - 1]; + rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1]; if (!half_rgc->controls.is_empty () && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc)) continue; } /* See whether zero-based IV would ever generate all-false masks - before wrapping around. */ + or zero lengths before wrapping around. */ + unsigned nscalars_ft = rgc->max_nscalars_per_iter * rgc->factor; bool might_wrap_p = (iv_limit == -1 - || (wi::min_precision (iv_limit * rgc->max_nscalars_per_iter, - UNSIGNED) + || (wi::min_precision (iv_limit * nscalars_ft, UNSIGNED) > compare_precision)); /* Set up all controls for this group. */ @@ -2567,7 +2607,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, if (vect_epilogues && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && prolog_peeling >= 0 - && known_eq (vf, lowest_vf)) + && known_eq (vf, lowest_vf) + && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (epilogue_vinfo)) { unsigned HOST_WIDE_INT eiters = (LOOP_VINFO_INT_NITERS (loop_vinfo) diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index b6e96f77f69..19a37af2f56 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -815,6 +815,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) vectorizable (false), can_use_partial_vectors_p (true), using_partial_vectors_p (false), + epil_using_partial_vectors_p (false), peeling_for_gaps (false), peeling_for_niter (false), no_data_dependencies (false), @@ -895,6 +896,7 @@ _loop_vec_info::~_loop_vec_info () free (bbs); release_vec_loop_controls (&masks); + release_vec_loop_controls (&lens); delete ivexpr_map; delete scan_map; epilogue_vinfos.release (); @@ -1070,6 +1072,88 @@ vect_verify_full_masking (loop_vec_info loop_vinfo) return true; } +/* Check whether we can use vector access with length based on precision + comparison. So far, to keep it simple, we only allow the case that the + precision of the target-supported length is larger than the precision + required by the loop niters. */ + +static bool +vect_verify_loop_lens (loop_vec_info loop_vinfo) +{ + vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); + + if (LOOP_VINFO_LENS (loop_vinfo).is_empty ()) + return false; + + /* The rgroup with the largest NV should have the maximum bytes per + iteration. */ + rgroup_controls *rgl = &(*lens)[lens->length () - 1]; + + /* Work out how many bits we need to represent the length limit. */ + unsigned int nscalars_per_iter_ft = rgl->max_nscalars_per_iter * rgl->factor; + unsigned int min_ni_prec + = min_prec_for_max_niters (loop_vinfo, nscalars_per_iter_ft); + + /* Now use the maximum of the precisions below for a suitable IV type: + - the IV's natural precision + - the precision needed to hold: the maximum number of scalar + iterations multiplied by the scale factor (min_ni_prec above) + - the Pmode precision + */ + + /* If min_ni_prec is less than the precision of the current niters, + we prefer to still use the niters type. */ + unsigned int ni_prec + = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo))); + /* Prefer to use Pmode and a wider IV to avoid narrow conversions. 
*/ + unsigned int pmode_prec = GET_MODE_BITSIZE (Pmode); + + unsigned int required_prec = ni_prec; + if (required_prec < pmode_prec) + required_prec = pmode_prec; + + tree iv_type = NULL_TREE; + if (min_ni_prec > required_prec) + { + opt_scalar_int_mode tmode_iter; + unsigned standard_bits = 0; + FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT) + { + scalar_mode tmode = tmode_iter.require (); + unsigned int tbits = GET_MODE_BITSIZE (tmode); + + /* ??? Do we really want to construct one IV whose precision exceeds + BITS_PER_WORD? */ + if (tbits > BITS_PER_WORD) + break; + + /* Find the first available standard integral type. */ + if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode)) + { + standard_bits = tbits; + break; + } + } + if (standard_bits != 0) + iv_type = build_nonstandard_integer_type (standard_bits, true); + } + else + iv_type = build_nonstandard_integer_type (required_prec, true); + + if (!iv_type) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't use length-based partial vectorization" + " due to no suitable iv type.\n"); + return false; + } + + LOOP_VINFO_COMPARE_TYPE (loop_vinfo) = iv_type; + LOOP_VINFO_IV_TYPE (loop_vinfo) = iv_type; + + return true; +} + /* Calculate the cost of one scalar iteration of the loop. */ static void vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) @@ -2144,11 +2228,63 @@ start_over: return ok; } - /* Decide whether to use a fully-masked loop for this vectorization - factor. */ - LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) - = (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) - && vect_verify_full_masking (loop_vinfo)); + /* For now, we don't expect to mix both the masking and length approaches + for one loop; disable partial vectorization if both are recorded. */ + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) + && !LOOP_VINFO_MASKS (loop_vinfo).is_empty () + && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't use a partially vectorized loop because we" + " don't expect to mix partial vectorization" + " approaches for the same loop.\n"); + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + } + + /* Decide whether to use a partially vectorized loop for this + vectorization factor. */ + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) + { + /* Decide whether to use the fully-masked approach. */ + if (vect_verify_full_masking (loop_vinfo)) + LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true; + /* Decide whether to use the length-based approach. */ + else if (vect_verify_loop_lens (loop_vinfo)) + { + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) + || LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't use length-based partial vectorization" + " approach because peeling for alignment or" + " gaps is required.\n"); + LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; + } + else if (param_vect_with_length_scope == 0) + LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; + /* The epilogue and other cases where the iteration count is known + to be less than VF can still use vector access with length fully. 
*/ + else if (param_vect_with_length_scope == 1 + && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && !known_niters_smaller_than_vf (loop_vinfo)) + { + LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; + LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true; + } + else + { + LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true; + LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; + } + } + else + LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; + } + else + LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; + if (dump_enabled_p ()) { if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) @@ -2157,6 +2293,15 @@ start_over: else dump_printf_loc (MSG_NOTE, vect_location, "not using a fully-masked loop.\n"); + + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) + dump_printf_loc (MSG_NOTE, vect_location, + "using length-based partial" + " vectorization for loop fully.\n"); + else + dump_printf_loc (MSG_NOTE, vect_location, + "not using length-based partial" + " vectorization for loop fully.\n"); } /* If epilog loop is required because of data accesses with gaps, @@ -2377,6 +2522,7 @@ again: = init_cost (LOOP_VINFO_LOOP (loop_vinfo)); /* Reset accumulated rgroup information. */ release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo)); + release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo)); /* Reset assorted flags. */ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false; @@ -2663,7 +2809,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) lowest_th = ordered_min (lowest_th, th); } else - delete loop_vinfo; + { + delete loop_vinfo; + loop_vinfo = opt_loop_vec_info::success (NULL); + } /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is enabled, SIMDUID is not set, it is the innermost loop and we have @@ -2688,6 +2837,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) else { delete loop_vinfo; + loop_vinfo = opt_loop_vec_info::success (NULL); if (fatal) { gcc_checking_assert (first_loop_vinfo == NULL); @@ -2695,6 +2845,23 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) } } + /* Handle the case that the original loop can use partial + vectorization, but want to only adopt it for the epilogue. + The retry should be in the same mode as original. 
*/ + if (vect_epilogues + && loop_vinfo + && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo)) + { + gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) + && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** Re-trying analysis with same vector mode" + " %s for epilogue with partial vectorization.\n", + GET_MODE_NAME (loop_vinfo->vector_mode)); + continue; + } + if (mode_i < vector_modes.length () && VECTOR_MODE_P (autodetected_vector_mode) && (related_vector_mode (vector_modes[mode_i], @@ -3535,6 +3702,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, target_cost_data, num_masks - 1, vector_stmt, NULL, NULL_TREE, 0, vect_body); } + else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) + { + peel_iters_prologue = 0; + peel_iters_epilogue = 0; + } else if (npeel < 0) { peel_iters_prologue = assumed_vf / 2; @@ -8319,6 +8491,7 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, { rgm->max_nscalars_per_iter = nscalars_per_iter; rgm->type = truth_type_for (vectype); + rgm->factor = 1; } } @@ -8371,6 +8544,64 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks, return mask; } +/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS + lengths for vector access with length, each of which controls a vector of + type VECTYPE. */ + +void +vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens, + unsigned int nvectors, tree vectype) +{ + gcc_assert (nvectors != 0); + if (lens->length () < nvectors) + lens->safe_grow_cleared (nvectors); + rgroup_controls *rgl = &(*lens)[nvectors - 1]; + + /* The number of scalars per iteration, the number of bytes occupied by + each scalar and the number of vectors are all compile-time constants. */ + unsigned int nscalars_per_iter + = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), + LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); + + if (rgl->max_nscalars_per_iter < nscalars_per_iter) + { + rgl->max_nscalars_per_iter = nscalars_per_iter; + rgl->type = vectype; + /* For now, the length-based approach measures the length in bytes. + FIXME: adjust this if it ever supports other units, e.g. a count of + scalars. */ + rgl->factor = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); + } +} + +/* Given a complete set of lengths LENS, extract length number INDEX for an + rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */ + +tree +vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens, + unsigned int nvectors, unsigned int index) +{ + rgroup_controls *rgl = &(*lens)[nvectors - 1]; + + /* Populate the rgroup's len array, if this is the first time we've + used it. */ + if (rgl->controls.is_empty ()) + { + rgl->controls.safe_grow_cleared (nvectors); + for (unsigned int i = 0; i < nvectors; ++i) + { + tree len_type = LOOP_VINFO_COMPARE_TYPE (loop_vinfo); + gcc_assert (len_type != NULL_TREE); + tree len = make_temp_ssa_name (len_type, NULL, "loop_len"); + + /* Provide a dummy definition until the real one is available. */ + SSA_NAME_DEF_STMT (len) = gimple_build_nop (); + rgl->controls[i] = len; + } + } + + return rgl->controls[index]; +} + /* Scale profiling counters by estimation for LOOP which is vectorized by factor VF. 
*/ diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 484470091a8..98f166d742f 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -1855,29 +1855,56 @@ check_load_store_for_partial_vectors ( return; } - machine_mode mask_mode; - if (!VECTOR_MODE_P (vecmode) - || !targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode) - || !can_vec_mask_load_store_p (vecmode, mask_mode, is_load)) + if (!VECTOR_MODE_P (vecmode)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't use a fully-masked loop because the target" - " doesn't have the appropriate masked load or" - " store.\n"); + "can't use a partially vectorized loop because of" + " the unexpected mode.\n"); LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; return; } - /* We might load more scalars than we need for permuting SLP loads. - We checked in get_group_load_store_type that the extra elements - don't leak into a new vector. */ + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); unsigned int nvectors; - if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors)) - vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask); - else - gcc_unreachable (); + + machine_mode mask_mode; + bool partial_vectorization_p = false; + if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode) + && can_vec_mask_load_store_p (vecmode, mask_mode, is_load)) + { + /* We might load more scalars than we need for permuting SLP loads. + We checked in get_group_load_store_type that the extra elements + don't leak into a new vector. */ + if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors)) + vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, + scalar_mask); + else + gcc_unreachable (); + partial_vectorization_p = true; + } + + optab op = is_load ? lenload_optab : lenstore_optab; + if (convert_optab_handler (op, vecmode, targetm.vectorize.length_mode)) + { + vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); + if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors)) + vect_record_loop_len (loop_vinfo, lens, nvectors, vectype); + else + gcc_unreachable (); + partial_vectorization_p = true; + } + + if (!partial_vectorization_p) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't use a partially vectorized loop because the" + " target doesn't have the appropriate partial" + " vectorization load or store.\n"); + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + } } /* Return the mask input to a masked load or store. VEC_MASK is the vectorized @@ -8070,6 +8097,14 @@ vectorizable_store (vec_info *vinfo, = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL); + vec_loop_lens *loop_lens + = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo) + ? &LOOP_VINFO_LENS (loop_vinfo) + : NULL); + + /* Shouldn't go with length-based approach if fully masked. */ + gcc_assert (!loop_lens || (loop_lens && !loop_masks)); + /* Targets with store-lane instructions must not require explicit realignment. vect_supportable_dr_alignment always returns either dr_aligned or dr_unaligned_supported for masked operations. 
*/ @@ -8322,10 +8357,16 @@ vectorizable_store (vec_info *vinfo, unsigned HOST_WIDE_INT align; tree final_mask = NULL_TREE; + tree final_len = NULL_TREE; if (loop_masks) final_mask = vect_get_loop_mask (gsi, loop_masks, vec_num * ncopies, vectype, vec_num * j + i); + else if (loop_lens) + final_len = vect_get_loop_len (loop_vinfo, loop_lens, + vec_num * ncopies, + vec_num * j + i); + if (vec_mask) final_mask = prepare_load_store_mask (mask_vectype, final_mask, vec_mask, gsi); @@ -8405,6 +8446,17 @@ vectorizable_store (vec_info *vinfo, new_stmt_info = vect_finish_stmt_generation (vinfo, stmt_info, call, gsi); } + else if (final_len) + { + align = least_bit_hwi (misalign | align); + tree ptr = build_int_cst (ref_type, align); + gcall *call + = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr, + ptr, final_len, vec_oprnd); + gimple_call_set_nothrow (call, true); + new_stmt_info + = vect_finish_stmt_generation (vinfo, stmt_info, call, gsi); + } else { data_ref = fold_build2 (MEM_REF, vectype, @@ -8939,6 +8991,7 @@ vectorizable_load (vec_info *vinfo, tree dr_offset; gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); + gcc_assert (!LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)); gcc_assert (!nested_in_vect_loop); if (grouped_load) @@ -9237,6 +9290,14 @@ vectorizable_load (vec_info *vinfo, = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL); + vec_loop_lens *loop_lens + = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo) + ? &LOOP_VINFO_LENS (loop_vinfo) + : NULL); + + /* Shouldn't go with length-based approach if fully masked. */ + gcc_assert (!loop_lens || (loop_lens && !loop_masks)); + /* Targets with store-lane instructions must not require explicit realignment. vect_supportable_dr_alignment always returns either dr_aligned or dr_unaligned_supported for masked operations. */ @@ -9558,11 +9619,18 @@ vectorizable_load (vec_info *vinfo, for (i = 0; i < vec_num; i++) { tree final_mask = NULL_TREE; + tree final_len = NULL_TREE; if (loop_masks && memory_access_type != VMAT_INVARIANT) final_mask = vect_get_loop_mask (gsi, loop_masks, vec_num * ncopies, vectype, vec_num * j + i); + else if (loop_lens + && memory_access_type != VMAT_INVARIANT) + final_len = vect_get_loop_len (loop_vinfo, loop_lens, + vec_num * ncopies, + vec_num * j + i); + if (vec_mask) final_mask = prepare_load_store_mask (mask_vectype, final_mask, vec_mask, gsi); @@ -9632,6 +9700,18 @@ vectorizable_load (vec_info *vinfo, new_stmt = call; data_ref = NULL_TREE; } + else if (final_len) + { + align = least_bit_hwi (misalign | align); + tree ptr = build_int_cst (ref_type, align); + gcall *call + = gimple_build_call_internal (IFN_LEN_LOAD, 3, + dataref_ptr, ptr, + final_len); + gimple_call_set_nothrow (call, true); + new_stmt = call; + data_ref = NULL_TREE; + } else { tree ltype = vectype; @@ -10282,11 +10362,17 @@ vectorizable_condition (vec_info *vinfo, return false; } + /* For reduction, we expect EXTRACT_LAST_REDUCTION so far. 
*/ if (loop_vinfo - && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) - && reduction_type == EXTRACT_LAST_REDUCTION) - vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo), - ncopies * vec_num, vectype, NULL); + && for_reduction + && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) + { + if (reduction_type == EXTRACT_LAST_REDUCTION) + vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo), + ncopies * vec_num, vectype, NULL); + else + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + } STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type; vect_model_simple_cost (vinfo, stmt_info, ncopies, dts, ndts, slp_node, @@ -12483,3 +12569,36 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info, *nunits_vectype_out = nunits_vectype; return opt_result::success (); } + +/* Generate and return a statement sequence that sets the vector length LEN, + which is: + + min_of_start_and_end = min (START_INDEX, END_INDEX); + left_len = END_INDEX - min_of_start_and_end; + rhs = min (left_len, LEN_LIMIT); + LEN = rhs; + + TODO: For now, the vector with length support on rs6000 only cares about + the low 8 bits of the length, which means that if left_len in bytes is + larger than 255, it can't be saturated to the vector limit (vector size). + A target hook can be provided if other ports don't suffer from this. +*/ + +gimple_seq +vect_gen_len (tree len, tree start_index, tree end_index, tree len_limit) +{ + gimple_seq stmts = NULL; + tree len_type = TREE_TYPE (len); + gcc_assert (TREE_TYPE (start_index) == len_type); + + tree min = fold_build2 (MIN_EXPR, len_type, start_index, end_index); + tree left_len = fold_build2 (MINUS_EXPR, len_type, end_index, min); + left_len = fold_build2 (MIN_EXPR, len_type, left_len, len_limit); + + tree rhs = force_gimple_operand (left_len, &stmts, true, NULL_TREE); + gimple *new_stmt = gimple_build_assign (len, rhs); + gimple_stmt_iterator i = gsi_last (stmts); + gsi_insert_after_without_update (&i, new_stmt, GSI_CONTINUE_LINKING); + + return stmts; +} + diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 857b4a9db15..57da8db43a2 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -407,6 +407,16 @@ is_a_helper <_bb_vec_info *>::test (vec_info *i) are compile-time constants but VF and nL can be variable (if the target supports variable-length vectors). + Moreover, for a partial vectorization approach like the length-based one + in bytes, we care about the number of bytes occupied by each scalar. + Provided that each scalar occupies factor bytes, the total number of + scalar values becomes factor * N and the above equation becomes: + + factor * N = factor * NS * VF = factor * NV * NL + + where factor * NS is the number of bytes per scalar iteration and + factor * NL is the vector size in bytes. + In classical vectorization, each iteration of the vector loop would handle exactly VF iterations of the original scalar loop. However, in a partial vectorization loop, a particular iteration of the vector @@ -462,14 +472,19 @@ is_a_helper <_bb_vec_info *>::test (vec_info *i) first level being indexed by nV - 1 (since nV == 0 doesn't exist) and the second being indexed by the mask index 0 <= i < nV. */ -/* The controls (like masks) needed by rgroups with nV vectors, +/* The controls (like masks or lengths) needed by rgroups with nV vectors, according to the description above. */ struct rgroup_controls { /* The largest nS for all rgroups that use these controls. */ unsigned int max_nscalars_per_iter; - /* The type of control to use, based on the highest nS recorded above. 
- For mask-based approach, it's used for mask_type. */ + /* For now, this is mainly used for the length-based in bytes approach; it + records the number of bytes occupied by each scalar. */ + unsigned int factor; + + /* This type is based on the highest nS recorded above. + For the mask-based approach, it records the mask type to use. + For the length-based approach, it records the appropriate vector type. */ tree type; /* A vector of nV controls, in iteration order. */ @@ -478,6 +493,8 @@ struct rgroup_controls { typedef auto_vec<rgroup_controls> vec_loop_masks; +typedef auto_vec<rgroup_controls> vec_loop_lens; + typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec; /*-----------------------------------------------------------------*/ @@ -525,6 +542,10 @@ public: on inactive scalars. */ vec_loop_masks masks; + /* The lengths that a loop using length-based partial vectorization should + use to avoid operating on inactive scalars. */ + vec_loop_lens lens; + /* Set of scalar conditions that have loop mask applied. */ scalar_cond_masked_set_type scalar_cond_masked_set; @@ -630,6 +651,10 @@ public: /* True if have decided to use partial vectorization for this loop. */ bool using_partial_vectors_p; + /* Records whether we can use partial vector approaches for the epilogue of + this loop; for now, only the length-based approach is supported. */ + bool epil_using_partial_vectors_p; + /* When we have grouped data accesses with gaps, we may introduce invalid memory accesses. We peel the last iteration of the loop to prevent this. */ @@ -693,9 +718,12 @@ public: #define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p +#define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) \ + (L)->epil_using_partial_vectors_p #define LOOP_VINFO_VECT_FACTOR(L) (L)->vectorization_factor #define LOOP_VINFO_MAX_VECT_FACTOR(L) (L)->max_vectorization_factor #define LOOP_VINFO_MASKS(L) (L)->masks +#define LOOP_VINFO_LENS(L) (L)->lens #define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters #define LOOP_VINFO_COMPARE_TYPE(L) (L)->compare_type #define LOOP_VINFO_IV_TYPE(L) (L)->iv_type @@ -733,6 +761,10 @@ public: (LOOP_VINFO_USING_PARTIAL_VECTORS_P (L) \ && !LOOP_VINFO_MASKS (L).is_empty ()) +#define LOOP_VINFO_FULLY_WITH_LENGTH_P(L) \ + (LOOP_VINFO_USING_PARTIAL_VECTORS_P (L) \ + && !LOOP_VINFO_LENS (L).is_empty ()) + #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \ ((L)->may_misalign_stmts.length () > 0) #define LOOP_REQUIRES_VERSIONING_FOR_ALIAS(L) \ @@ -1850,6 +1882,11 @@ extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *, unsigned int, tree, tree); extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *, unsigned int, tree, unsigned int); +extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int, + tree); +extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int, + unsigned int); +extern gimple_seq vect_gen_len (tree, tree, tree, tree); extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info); /* Drive for loop transformation stage. */ --
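
For reference, here is a small standalone C sketch (not part of the patch; the function and variable names below are illustrative only) of the per-iteration length computation that vect_gen_len describes, including the byte scaling recorded in rgroup_controls::factor:

/* Mirror of the sequence documented for vect_gen_len:
     LEN = min (END_INDEX - min (START_INDEX, END_INDEX), LEN_LIMIT)
   START_INDEX and END_INDEX count scaled scalar units (bytes when factor is
   the scalar size in bytes); LEN_LIMIT is the vector size in the same
   units.  */
static unsigned int
sketch_loop_len (unsigned int start_index, unsigned int end_index,
                 unsigned int len_limit)
{
  unsigned int lo = start_index < end_index ? start_index : end_index;
  unsigned int left_len = end_index - lo;  /* Remaining units, never wraps.  */
  return left_len < len_limit ? left_len : len_limit;
}

For example, with 10 ints (factor = 4 bytes) and 16-byte vectors, the three vector iterations get sketch_loop_len (0, 40, 16) == 16, sketch_loop_len (16, 40, 16) == 16 and sketch_loop_len (32, 40, 16) == 8, so the last iteration accesses only the remaining 8 bytes (2 ints).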