Bootstrap on X86 passed. Ok for trunk? Thanks. juzhe.zhong@rivai.ai From: juzhe.zhong Date: 2023-05-22 16:38 To: gcc-patches CC: richard.sandiford; rguenther; Ju-Zhe Zhong Subject: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support From: Ju-Zhe Zhong gcc/ChangeLog: * tree-vect-loop-manip.cc (vect_adjust_loop_lens_control): New function. (vect_set_loop_controls_directly): Add decrement IV support. (vect_set_loop_condition_partial_vectors): Ditto. * tree-vect-loop.cc: Ditto. * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro. --- gcc/tree-vect-loop-manip.cc | 184 +++++++++++++++++++++++++++++++++++- gcc/tree-vect-loop.cc | 10 ++ gcc/tree-vectorizer.h | 8 ++ 3 files changed, 199 insertions(+), 3 deletions(-) diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index ff6159e08d5..94b38d1e0fb 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -385,6 +385,66 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, return false; } +/* Try to use adjust loop lens for non-SLP multiple-rgroups. + + _36 = MIN_EXPR ; + + First length (MIN (X, VF/N)): + loop_len_15 = MIN_EXPR <_36, VF/N>; + + Second length: + tmp = _36 - loop_len_15; + loop_len_16 = MIN (tmp, VF/N); + + Third length: + tmp2 = tmp - loop_len_16; + loop_len_17 = MIN (tmp2, VF/N); + + Last length: + loop_len_18 = tmp2 - loop_len_17; +*/ + +static void +vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq, + rgroup_controls *dest_rgm, + rgroup_controls *src_rgm, tree step) +{ + tree ctrl_type = dest_rgm->type; + poly_uint64 nitems_per_ctrl + = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor; + tree length_limit = build_int_cst (iv_type, nitems_per_ctrl); + + for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) + { + if (!step) + step = src_rgm->controls[i / dest_rgm->controls.length ()]; + tree ctrl = dest_rgm->controls[i]; + if (i == 0) + { + /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */ + gassign *assign + = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit); + gimple_seq_add_stmt (seq, assign); + } + else if (i == dest_rgm->controls.length () - 1) + { + /* Last iteration: Remain capped to the range [0, VF/N]. */ + gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step, + dest_rgm->controls[i - 1]); + gimple_seq_add_stmt (seq, assign); + } + else + { + /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */ + step = gimple_build (seq, MINUS_EXPR, iv_type, step, + dest_rgm->controls[i - 1]); + gassign *assign + = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit); + gimple_seq_add_stmt (seq, assign); + } + } +} + /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions for all the rgroup controls in RGC and return a control that is nonzero when the loop needs to iterate. Add any new preheader statements to @@ -468,9 +528,78 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, gimple_stmt_iterator incr_gsi; bool insert_after; standard_iv_increment_position (loop, &incr_gsi, &insert_after); - create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE, - loop, &incr_gsi, insert_after, &index_before_incr, - &index_after_incr); + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) + { + nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total); + tree step = make_ssa_name (iv_type); + /* Create decrement IV. */ + create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi, + insert_after, &index_before_incr, &index_after_incr); + tree temp = gimple_build (header_seq, MIN_EXPR, iv_type, + index_before_incr, nitems_step); + gimple_seq_add_stmt (header_seq, gimple_build_assign (step, temp)); + + if (rgc->max_nscalars_per_iter == 1) + { + /* single rgroup: + ... + _10 = (unsigned long) count_12(D); + ... + # ivtmp_9 = PHI + _36 = MIN_EXPR ; + ... + vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0); + ... + ivtmp_35 = ivtmp_9 - _36; + ... + if (ivtmp_35 != 0) + goto ; [83.33%] + else + goto ; [16.67%] + */ + gassign *assign = gimple_build_assign (rgc->controls[0], step); + gimple_seq_add_stmt (header_seq, assign); + } + else + { + /* Multiple rgroup (SLP): + ... + _38 = (unsigned long) bnd.7_29; + _39 = _38 * 2; + ... + # ivtmp_41 = PHI + ... + _43 = MIN_EXPR ; + loop_len_26 = MIN_EXPR <_43, 16>; + loop_len_25 = _43 - loop_len_26; + ... + .LEN_STORE (_6, 8B, loop_len_26, ...); + ... + .LEN_STORE (_25, 8B, loop_len_25, ...); + _33 = loop_len_26 / 2; + ... + .LEN_STORE (_8, 16B, _33, ...); + _36 = loop_len_25 / 2; + ... + .LEN_STORE (_15, 16B, _36, ...); + ivtmp_42 = ivtmp_41 - _43; + ... + if (ivtmp_42 != 0) + goto ; [83.33%] + else + goto ; [16.67%] + */ + vect_adjust_loop_lens_control (iv_type, header_seq, rgc, NULL, step); + } + return index_after_incr; + } + else + { + /* Create increment IV. */ + create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE, + loop, &incr_gsi, insert_after, &index_before_incr, + &index_after_incr); + } tree zero_index = build_int_cst (compare_type, 0); tree test_index, test_limit, first_limit; @@ -753,6 +882,55 @@ vect_set_loop_condition_partial_vectors (class loop *loop, continue; } + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) + && rgc->max_nscalars_per_iter == 1 + && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0]) + { + /* Multiple rgroup (non-SLP): + ... + _38 = (unsigned long) n_12(D); + ... + # ivtmp_38 = PHI + ... + _40 = MIN_EXPR ; + loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>; + _41 = _40 - loop_len_21; + loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>; + _42 = _40 - loop_len_20; + loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>; + _43 = _40 - loop_len_19; + loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>; + ... + vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0); + ... + vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0); + ... + vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0); + ... + vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0); + vect__7.13_31 = VEC_PACK_TRUNC_EXPR ; + vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>; + vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>; + ... + .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0); + ivtmp_39 = ivtmp_38 - _40; + ... + if (ivtmp_39 != 0) + goto ; [92.31%] + else + goto ; [7.69%] + */ + rgroup_controls *sub_rgc + = &(*controls)[nmasks / rgc->controls.length () - 1]; + if (!sub_rgc->controls.is_empty ()) + { + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); + vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, + sub_rgc, NULL_TREE); + continue; + } + } + /* See whether zero-based IV would ever generate all-false masks or zero length before wrapping around. */ bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index cf10132b0bf..53ac4d2a6c2 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -2725,6 +2725,16 @@ start_over: && !vect_verify_loop_lens (loop_vinfo)) LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + /* If we're vectorizing an loop that uses length "controls" and + can iterate more than once, we apply decrementing IV approach + in loop control. */ + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) + && !LOOP_VINFO_LENS (loop_vinfo).is_empty () + && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo), + LOOP_VINFO_VECT_FACTOR (loop_vinfo)))) + LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true; + /* If we're vectorizing an epilogue loop, the vectorized loop either needs to be able to handle fewer than VF scalars, or needs to have a lower VF than the main loop. */ diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 02d2ad6fba1..fba09b9ffd3 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -818,6 +818,13 @@ public: the vector loop can handle fewer than VF scalars. */ bool using_partial_vectors_p; + /* True if we've decided to use a decrementing loop control IV that counts + scalars. This can be done for any loop that: + + (a) uses length "controls"; and + (b) can iterate more than once. */ + bool using_decrementing_iv_p; + /* True if we've decided to use partially-populated vectors for the epilogue of loop. */ bool epil_using_partial_vectors_p; @@ -890,6 +897,7 @@ public: #define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p +#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) \ (L)->epil_using_partial_vectors_p #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias -- 2.36.3