* [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support @ 2023-05-22 8:38 juzhe.zhong 2023-05-23 12:32 ` juzhe.zhong 2023-05-24 11:23 ` Richard Sandiford 0 siblings, 2 replies; 16+ messages in thread From: juzhe.zhong @ 2023-05-22 8:38 UTC (permalink / raw) To: gcc-patches; +Cc: richard.sandiford, rguenther, Ju-Zhe Zhong From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai> gcc/ChangeLog: * tree-vect-loop-manip.cc (vect_adjust_loop_lens_control): New function. (vect_set_loop_controls_directly): Add decrement IV support. (vect_set_loop_condition_partial_vectors): Ditto. * tree-vect-loop.cc: Ditto. * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro. --- gcc/tree-vect-loop-manip.cc | 184 +++++++++++++++++++++++++++++++++++- gcc/tree-vect-loop.cc | 10 ++ gcc/tree-vectorizer.h | 8 ++ 3 files changed, 199 insertions(+), 3 deletions(-) diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index ff6159e08d5..94b38d1e0fb 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -385,6 +385,66 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, return false; } +/* Try to use adjust loop lens for non-SLP multiple-rgroups. + + _36 = MIN_EXPR <ivtmp_34, VF>; + + First length (MIN (X, VF/N)): + loop_len_15 = MIN_EXPR <_36, VF/N>; + + Second length: + tmp = _36 - loop_len_15; + loop_len_16 = MIN (tmp, VF/N); + + Third length: + tmp2 = tmp - loop_len_16; + loop_len_17 = MIN (tmp2, VF/N); + + Last length: + loop_len_18 = tmp2 - loop_len_17; +*/ + +static void +vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq, + rgroup_controls *dest_rgm, + rgroup_controls *src_rgm, tree step) +{ + tree ctrl_type = dest_rgm->type; + poly_uint64 nitems_per_ctrl + = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor; + tree length_limit = build_int_cst (iv_type, nitems_per_ctrl); + + for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) + { + if (!step) + step = src_rgm->controls[i / dest_rgm->controls.length ()]; + tree ctrl = dest_rgm->controls[i]; + if (i == 0) + { + /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */ + gassign *assign + = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit); + gimple_seq_add_stmt (seq, assign); + } + else if (i == dest_rgm->controls.length () - 1) + { + /* Last iteration: Remain capped to the range [0, VF/N]. */ + gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step, + dest_rgm->controls[i - 1]); + gimple_seq_add_stmt (seq, assign); + } + else + { + /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */ + step = gimple_build (seq, MINUS_EXPR, iv_type, step, + dest_rgm->controls[i - 1]); + gassign *assign + = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit); + gimple_seq_add_stmt (seq, assign); + } + } +} + /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions for all the rgroup controls in RGC and return a control that is nonzero when the loop needs to iterate. Add any new preheader statements to @@ -468,9 +528,78 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, gimple_stmt_iterator incr_gsi; bool insert_after; standard_iv_increment_position (loop, &incr_gsi, &insert_after); - create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE, - loop, &incr_gsi, insert_after, &index_before_incr, - &index_after_incr); + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) + { + nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total); + tree step = make_ssa_name (iv_type); + /* Create decrement IV. */ + create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi, + insert_after, &index_before_incr, &index_after_incr); + tree temp = gimple_build (header_seq, MIN_EXPR, iv_type, + index_before_incr, nitems_step); + gimple_seq_add_stmt (header_seq, gimple_build_assign (step, temp)); + + if (rgc->max_nscalars_per_iter == 1) + { + /* single rgroup: + ... + _10 = (unsigned long) count_12(D); + ... + # ivtmp_9 = PHI <ivtmp_35(6), _10(5)> + _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>; + ... + vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0); + ... + ivtmp_35 = ivtmp_9 - _36; + ... + if (ivtmp_35 != 0) + goto <bb 4>; [83.33%] + else + goto <bb 5>; [16.67%] + */ + gassign *assign = gimple_build_assign (rgc->controls[0], step); + gimple_seq_add_stmt (header_seq, assign); + } + else + { + /* Multiple rgroup (SLP): + ... + _38 = (unsigned long) bnd.7_29; + _39 = _38 * 2; + ... + # ivtmp_41 = PHI <ivtmp_42(6), _39(5)> + ... + _43 = MIN_EXPR <ivtmp_41, 32>; + loop_len_26 = MIN_EXPR <_43, 16>; + loop_len_25 = _43 - loop_len_26; + ... + .LEN_STORE (_6, 8B, loop_len_26, ...); + ... + .LEN_STORE (_25, 8B, loop_len_25, ...); + _33 = loop_len_26 / 2; + ... + .LEN_STORE (_8, 16B, _33, ...); + _36 = loop_len_25 / 2; + ... + .LEN_STORE (_15, 16B, _36, ...); + ivtmp_42 = ivtmp_41 - _43; + ... + if (ivtmp_42 != 0) + goto <bb 4>; [83.33%] + else + goto <bb 5>; [16.67%] + */ + vect_adjust_loop_lens_control (iv_type, header_seq, rgc, NULL, step); + } + return index_after_incr; + } + else + { + /* Create increment IV. */ + create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE, + loop, &incr_gsi, insert_after, &index_before_incr, + &index_after_incr); + } tree zero_index = build_int_cst (compare_type, 0); tree test_index, test_limit, first_limit; @@ -753,6 +882,55 @@ vect_set_loop_condition_partial_vectors (class loop *loop, continue; } + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) + && rgc->max_nscalars_per_iter == 1 + && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0]) + { + /* Multiple rgroup (non-SLP): + ... + _38 = (unsigned long) n_12(D); + ... + # ivtmp_38 = PHI <ivtmp_39(3), 100(2)> + ... + _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>; + loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>; + _41 = _40 - loop_len_21; + loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>; + _42 = _40 - loop_len_20; + loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>; + _43 = _40 - loop_len_19; + loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>; + ... + vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0); + ... + vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0); + ... + vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0); + ... + vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0); + vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>; + vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>; + vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>; + ... + .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0); + ivtmp_39 = ivtmp_38 - _40; + ... + if (ivtmp_39 != 0) + goto <bb 3>; [92.31%] + else + goto <bb 4>; [7.69%] + */ + rgroup_controls *sub_rgc + = &(*controls)[nmasks / rgc->controls.length () - 1]; + if (!sub_rgc->controls.is_empty ()) + { + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); + vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, + sub_rgc, NULL_TREE); + continue; + } + } + /* See whether zero-based IV would ever generate all-false masks or zero length before wrapping around. */ bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index cf10132b0bf..53ac4d2a6c2 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -2725,6 +2725,16 @@ start_over: && !vect_verify_loop_lens (loop_vinfo)) LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + /* If we're vectorizing an loop that uses length "controls" and + can iterate more than once, we apply decrementing IV approach + in loop control. */ + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) + && !LOOP_VINFO_LENS (loop_vinfo).is_empty () + && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo), + LOOP_VINFO_VECT_FACTOR (loop_vinfo)))) + LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true; + /* If we're vectorizing an epilogue loop, the vectorized loop either needs to be able to handle fewer than VF scalars, or needs to have a lower VF than the main loop. */ diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 02d2ad6fba1..fba09b9ffd3 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -818,6 +818,13 @@ public: the vector loop can handle fewer than VF scalars. */ bool using_partial_vectors_p; + /* True if we've decided to use a decrementing loop control IV that counts + scalars. This can be done for any loop that: + + (a) uses length "controls"; and + (b) can iterate more than once. */ + bool using_decrementing_iv_p; + /* True if we've decided to use partially-populated vectors for the epilogue of loop. */ bool epil_using_partial_vectors_p; @@ -890,6 +897,7 @@ public: #define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p +#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) \ (L)->epil_using_partial_vectors_p #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias -- 2.36.3 ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 2023-05-22 8:38 [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support juzhe.zhong @ 2023-05-23 12:32 ` juzhe.zhong 2023-05-24 11:23 ` Richard Sandiford 1 sibling, 0 replies; 16+ messages in thread From: juzhe.zhong @ 2023-05-23 12:32 UTC (permalink / raw) To: 钟居哲, gcc-patches; +Cc: richard.sandiford, rguenther [-- Attachment #1: Type: text/plain, Size: 10223 bytes --] Bootstrap on X86 passed. Ok for trunk? Thanks. juzhe.zhong@rivai.ai From: juzhe.zhong Date: 2023-05-22 16:38 To: gcc-patches CC: richard.sandiford; rguenther; Ju-Zhe Zhong Subject: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai> gcc/ChangeLog: * tree-vect-loop-manip.cc (vect_adjust_loop_lens_control): New function. (vect_set_loop_controls_directly): Add decrement IV support. (vect_set_loop_condition_partial_vectors): Ditto. * tree-vect-loop.cc: Ditto. * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro. --- gcc/tree-vect-loop-manip.cc | 184 +++++++++++++++++++++++++++++++++++- gcc/tree-vect-loop.cc | 10 ++ gcc/tree-vectorizer.h | 8 ++ 3 files changed, 199 insertions(+), 3 deletions(-) diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index ff6159e08d5..94b38d1e0fb 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -385,6 +385,66 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, return false; } +/* Try to use adjust loop lens for non-SLP multiple-rgroups. + + _36 = MIN_EXPR <ivtmp_34, VF>; + + First length (MIN (X, VF/N)): + loop_len_15 = MIN_EXPR <_36, VF/N>; + + Second length: + tmp = _36 - loop_len_15; + loop_len_16 = MIN (tmp, VF/N); + + Third length: + tmp2 = tmp - loop_len_16; + loop_len_17 = MIN (tmp2, VF/N); + + Last length: + loop_len_18 = tmp2 - loop_len_17; +*/ + +static void +vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq, + rgroup_controls *dest_rgm, + rgroup_controls *src_rgm, tree step) +{ + tree ctrl_type = dest_rgm->type; + poly_uint64 nitems_per_ctrl + = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor; + tree length_limit = build_int_cst (iv_type, nitems_per_ctrl); + + for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) + { + if (!step) + step = src_rgm->controls[i / dest_rgm->controls.length ()]; + tree ctrl = dest_rgm->controls[i]; + if (i == 0) + { + /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */ + gassign *assign + = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit); + gimple_seq_add_stmt (seq, assign); + } + else if (i == dest_rgm->controls.length () - 1) + { + /* Last iteration: Remain capped to the range [0, VF/N]. */ + gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step, + dest_rgm->controls[i - 1]); + gimple_seq_add_stmt (seq, assign); + } + else + { + /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */ + step = gimple_build (seq, MINUS_EXPR, iv_type, step, + dest_rgm->controls[i - 1]); + gassign *assign + = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit); + gimple_seq_add_stmt (seq, assign); + } + } +} + /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions for all the rgroup controls in RGC and return a control that is nonzero when the loop needs to iterate. Add any new preheader statements to @@ -468,9 +528,78 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, gimple_stmt_iterator incr_gsi; bool insert_after; standard_iv_increment_position (loop, &incr_gsi, &insert_after); - create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE, - loop, &incr_gsi, insert_after, &index_before_incr, - &index_after_incr); + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) + { + nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total); + tree step = make_ssa_name (iv_type); + /* Create decrement IV. */ + create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi, + insert_after, &index_before_incr, &index_after_incr); + tree temp = gimple_build (header_seq, MIN_EXPR, iv_type, + index_before_incr, nitems_step); + gimple_seq_add_stmt (header_seq, gimple_build_assign (step, temp)); + + if (rgc->max_nscalars_per_iter == 1) + { + /* single rgroup: + ... + _10 = (unsigned long) count_12(D); + ... + # ivtmp_9 = PHI <ivtmp_35(6), _10(5)> + _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>; + ... + vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0); + ... + ivtmp_35 = ivtmp_9 - _36; + ... + if (ivtmp_35 != 0) + goto <bb 4>; [83.33%] + else + goto <bb 5>; [16.67%] + */ + gassign *assign = gimple_build_assign (rgc->controls[0], step); + gimple_seq_add_stmt (header_seq, assign); + } + else + { + /* Multiple rgroup (SLP): + ... + _38 = (unsigned long) bnd.7_29; + _39 = _38 * 2; + ... + # ivtmp_41 = PHI <ivtmp_42(6), _39(5)> + ... + _43 = MIN_EXPR <ivtmp_41, 32>; + loop_len_26 = MIN_EXPR <_43, 16>; + loop_len_25 = _43 - loop_len_26; + ... + .LEN_STORE (_6, 8B, loop_len_26, ...); + ... + .LEN_STORE (_25, 8B, loop_len_25, ...); + _33 = loop_len_26 / 2; + ... + .LEN_STORE (_8, 16B, _33, ...); + _36 = loop_len_25 / 2; + ... + .LEN_STORE (_15, 16B, _36, ...); + ivtmp_42 = ivtmp_41 - _43; + ... + if (ivtmp_42 != 0) + goto <bb 4>; [83.33%] + else + goto <bb 5>; [16.67%] + */ + vect_adjust_loop_lens_control (iv_type, header_seq, rgc, NULL, step); + } + return index_after_incr; + } + else + { + /* Create increment IV. */ + create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE, + loop, &incr_gsi, insert_after, &index_before_incr, + &index_after_incr); + } tree zero_index = build_int_cst (compare_type, 0); tree test_index, test_limit, first_limit; @@ -753,6 +882,55 @@ vect_set_loop_condition_partial_vectors (class loop *loop, continue; } + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) + && rgc->max_nscalars_per_iter == 1 + && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0]) + { + /* Multiple rgroup (non-SLP): + ... + _38 = (unsigned long) n_12(D); + ... + # ivtmp_38 = PHI <ivtmp_39(3), 100(2)> + ... + _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>; + loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>; + _41 = _40 - loop_len_21; + loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>; + _42 = _40 - loop_len_20; + loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>; + _43 = _40 - loop_len_19; + loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>; + ... + vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0); + ... + vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0); + ... + vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0); + ... + vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0); + vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>; + vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>; + vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>; + ... + .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0); + ivtmp_39 = ivtmp_38 - _40; + ... + if (ivtmp_39 != 0) + goto <bb 3>; [92.31%] + else + goto <bb 4>; [7.69%] + */ + rgroup_controls *sub_rgc + = &(*controls)[nmasks / rgc->controls.length () - 1]; + if (!sub_rgc->controls.is_empty ()) + { + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); + vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, + sub_rgc, NULL_TREE); + continue; + } + } + /* See whether zero-based IV would ever generate all-false masks or zero length before wrapping around. */ bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index cf10132b0bf..53ac4d2a6c2 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -2725,6 +2725,16 @@ start_over: && !vect_verify_loop_lens (loop_vinfo)) LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + /* If we're vectorizing an loop that uses length "controls" and + can iterate more than once, we apply decrementing IV approach + in loop control. */ + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) + && !LOOP_VINFO_LENS (loop_vinfo).is_empty () + && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo), + LOOP_VINFO_VECT_FACTOR (loop_vinfo)))) + LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true; + /* If we're vectorizing an epilogue loop, the vectorized loop either needs to be able to handle fewer than VF scalars, or needs to have a lower VF than the main loop. */ diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 02d2ad6fba1..fba09b9ffd3 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -818,6 +818,13 @@ public: the vector loop can handle fewer than VF scalars. */ bool using_partial_vectors_p; + /* True if we've decided to use a decrementing loop control IV that counts + scalars. This can be done for any loop that: + + (a) uses length "controls"; and + (b) can iterate more than once. */ + bool using_decrementing_iv_p; + /* True if we've decided to use partially-populated vectors for the epilogue of loop. */ bool epil_using_partial_vectors_p; @@ -890,6 +897,7 @@ public: #define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p +#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) \ (L)->epil_using_partial_vectors_p #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias -- 2.36.3 ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 2023-05-22 8:38 [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support juzhe.zhong 2023-05-23 12:32 ` juzhe.zhong @ 2023-05-24 11:23 ` Richard Sandiford 2023-05-24 11:52 ` 钟居哲 2023-05-24 12:19 ` 钟居哲 1 sibling, 2 replies; 16+ messages in thread From: Richard Sandiford @ 2023-05-24 11:23 UTC (permalink / raw) To: juzhe.zhong; +Cc: gcc-patches, rguenther Sorry for the slow review. I needed some time to go through this patch and surrounding code to understand it, and to understand why it wasn't structured the way I was expecting. I've got some specific comments below, and then a general comment about how I think we should structure this. juzhe.zhong@rivai.ai writes: > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai> > > gcc/ChangeLog: > > * tree-vect-loop-manip.cc (vect_adjust_loop_lens_control): New function. > (vect_set_loop_controls_directly): Add decrement IV support. > (vect_set_loop_condition_partial_vectors): Ditto. > * tree-vect-loop.cc: Ditto. > * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro. > > --- > gcc/tree-vect-loop-manip.cc | 184 +++++++++++++++++++++++++++++++++++- > gcc/tree-vect-loop.cc | 10 ++ > gcc/tree-vectorizer.h | 8 ++ > 3 files changed, 199 insertions(+), 3 deletions(-) > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc > index ff6159e08d5..94b38d1e0fb 100644 > --- a/gcc/tree-vect-loop-manip.cc > +++ b/gcc/tree-vect-loop-manip.cc > @@ -385,6 +385,66 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, > return false; > } > > +/* Try to use adjust loop lens for non-SLP multiple-rgroups. > + > + _36 = MIN_EXPR <ivtmp_34, VF>; > + > + First length (MIN (X, VF/N)): > + loop_len_15 = MIN_EXPR <_36, VF/N>; > + > + Second length: > + tmp = _36 - loop_len_15; > + loop_len_16 = MIN (tmp, VF/N); > + > + Third length: > + tmp2 = tmp - loop_len_16; > + loop_len_17 = MIN (tmp2, VF/N); > + > + Last length: > + loop_len_18 = tmp2 - loop_len_17; > +*/ > + > +static void > +vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq, > + rgroup_controls *dest_rgm, > + rgroup_controls *src_rgm, tree step) > +{ > + tree ctrl_type = dest_rgm->type; > + poly_uint64 nitems_per_ctrl > + = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor; > + tree length_limit = build_int_cst (iv_type, nitems_per_ctrl); > + > + for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) > + { > + if (!step) > + step = src_rgm->controls[i / dest_rgm->controls.length ()]; Could you explain this index? It looks like it will always be 0 due to the range of i. Since this is the only use of src_rgm, it might be cleaner to drop src_rgm and only pass the step. > + tree ctrl = dest_rgm->controls[i]; > + if (i == 0) > + { > + /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */ > + gassign *assign > + = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit); > + gimple_seq_add_stmt (seq, assign); > + } > + else if (i == dest_rgm->controls.length () - 1) > + { > + /* Last iteration: Remain capped to the range [0, VF/N]. */ > + gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step, > + dest_rgm->controls[i - 1]); > + gimple_seq_add_stmt (seq, assign); > + } > + else > + { > + /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */ > + step = gimple_build (seq, MINUS_EXPR, iv_type, step, > + dest_rgm->controls[i - 1]); > + gassign *assign > + = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit); > + gimple_seq_add_stmt (seq, assign); > + } > + } > +} Not your fault, but the structure seems kind-of awkward, since it's really a MINUS_EXPR for i != 0 followed by a MIN_EXPR for i != last. But I agree that it probably has to be written as above given that the final destination is fixed in advance. > + > /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions > for all the rgroup controls in RGC and return a control that is nonzero > when the loop needs to iterate. Add any new preheader statements to > @@ -468,9 +528,78 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, > gimple_stmt_iterator incr_gsi; > bool insert_after; > standard_iv_increment_position (loop, &incr_gsi, &insert_after); > - create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE, > - loop, &incr_gsi, insert_after, &index_before_incr, > - &index_after_incr); > + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) > + { > + nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total); > + tree step = make_ssa_name (iv_type); > + /* Create decrement IV. */ > + create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi, > + insert_after, &index_before_incr, &index_after_incr); > + tree temp = gimple_build (header_seq, MIN_EXPR, iv_type, > + index_before_incr, nitems_step); > + gimple_seq_add_stmt (header_seq, gimple_build_assign (step, temp)); Probably simpler to build the MIN_EXPR as a gimple_build_assign, with "step" as its destination. No simplification is likely given that the input is a freshly-minted IV. > + > + if (rgc->max_nscalars_per_iter == 1) I'm not sure this is the correct condition. Isn't the split really between whether rgc->controls.length () is 1 or greater than 1? It's possible (even common) for rgc->controls.length () == 1 && rgc->max_nscalars_per_iter != 1, and in that case, vect_adjust_loop_lens_control will create an unnecessary MIN_EXPR. When rgc->controls.length () is 1, it would be good to use rgc->controls[0] as "step" above, rather than make a new SSA name. Thanks for putting the code here though. It looks like the right place to me. More on this below. > + { > + /* single rgroup: > + ... > + _10 = (unsigned long) count_12(D); > + ... > + # ivtmp_9 = PHI <ivtmp_35(6), _10(5)> > + _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>; > + ... > + vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0); > + ... > + ivtmp_35 = ivtmp_9 - _36; > + ... > + if (ivtmp_35 != 0) > + goto <bb 4>; [83.33%] > + else > + goto <bb 5>; [16.67%] > + */ > + gassign *assign = gimple_build_assign (rgc->controls[0], step); > + gimple_seq_add_stmt (header_seq, assign); > + } > + else > + { > + /* Multiple rgroup (SLP): > + ... > + _38 = (unsigned long) bnd.7_29; > + _39 = _38 * 2; > + ... > + # ivtmp_41 = PHI <ivtmp_42(6), _39(5)> > + ... > + _43 = MIN_EXPR <ivtmp_41, 32>; > + loop_len_26 = MIN_EXPR <_43, 16>; > + loop_len_25 = _43 - loop_len_26; > + ... > + .LEN_STORE (_6, 8B, loop_len_26, ...); > + ... > + .LEN_STORE (_25, 8B, loop_len_25, ...); > + _33 = loop_len_26 / 2; > + ... > + .LEN_STORE (_8, 16B, _33, ...); > + _36 = loop_len_25 / 2; > + ... > + .LEN_STORE (_15, 16B, _36, ...); > + ivtmp_42 = ivtmp_41 - _43; > + ... > + if (ivtmp_42 != 0) > + goto <bb 4>; [83.33%] > + else > + goto <bb 5>; [16.67%] > + */ > + vect_adjust_loop_lens_control (iv_type, header_seq, rgc, NULL, step); > + } > + return index_after_incr; > + } > + else > + { > + /* Create increment IV. */ > + create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE, > + loop, &incr_gsi, insert_after, &index_before_incr, > + &index_after_incr); > + } Probably better to have no "else" and instead fall through to this create_iv. It makes it clearer that the decrementing IV case doesn't continue beyond the "if" above. > > tree zero_index = build_int_cst (compare_type, 0); > tree test_index, test_limit, first_limit; > @@ -753,6 +882,55 @@ vect_set_loop_condition_partial_vectors (class loop *loop, > continue; > } > > + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) > + && rgc->max_nscalars_per_iter == 1 > + && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0]) > + { > + /* Multiple rgroup (non-SLP): > + ... > + _38 = (unsigned long) n_12(D); > + ... > + # ivtmp_38 = PHI <ivtmp_39(3), 100(2)> > + ... > + _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>; > + loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>; > + _41 = _40 - loop_len_21; > + loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>; > + _42 = _40 - loop_len_20; > + loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>; > + _43 = _40 - loop_len_19; > + loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>; > + ... > + vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0); > + ... > + vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0); > + ... > + vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0); > + ... > + vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0); > + vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>; > + vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>; > + vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>; > + ... > + .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0); > + ivtmp_39 = ivtmp_38 - _40; > + ... > + if (ivtmp_39 != 0) > + goto <bb 3>; [92.31%] > + else > + goto <bb 4>; [7.69%] > + */ > + rgroup_controls *sub_rgc > + = &(*controls)[nmasks / rgc->controls.length () - 1]; > + if (!sub_rgc->controls.is_empty ()) > + { > + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); > + vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, > + sub_rgc, NULL_TREE); > + continue; > + } > + } I don't think this is right, since if we have 1-control, 2-control and 4-control groups, the fan-out for the 4-control group must be based on the 1-control group rather than any general sub_rgc. Using any other rgroup would mean that the input was already clamped by a MIN_EXPR (because that input would itself have been created by vect_adjust_loop_lens_control). > + > /* See whether zero-based IV would ever generate all-false masks > or zero length before wrapping around. */ > bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index cf10132b0bf..53ac4d2a6c2 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -2725,6 +2725,16 @@ start_over: > && !vect_verify_loop_lens (loop_vinfo)) > LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > > + /* If we're vectorizing an loop that uses length "controls" and > + can iterate more than once, we apply decrementing IV approach > + in loop control. */ > + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) > + && !LOOP_VINFO_LENS (loop_vinfo).is_empty () > + && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) > + && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo), > + LOOP_VINFO_VECT_FACTOR (loop_vinfo)))) I think this also needs to check: LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0 ----- I think another way to think about the patch is this: Things would be easy if there was always a 1-control group. Then we could create a decrementing IV for that 1-control group, in the way that you do above. All other groups would use the 1-control group as input. They would fan that input out using vect_adjust_loop_lens_control. However, there isn't guaranteed to be 1-control group, and creating a fake one might not be easy. We'd need to calculate valid values for the factor, type, nscalars_per_iter, etc. fields. So when there is no 1-control group, it's probably easier to create a decrementing IV based on the first non-empty group. This is what your patch does. And I think it's the right approach. However, the IV created in this way is "special". It doesn't exist as part of the normal rgroup framework. It's something created outside the rgroup framework to make the controls easier to populate. In your case, this is the case where "step" is created and is not assigned directly to rgc->controls. Given that, I think we should structure the code as follows, changing the behaviour only for LOOP_VINFO_USING_DECREMENTING_IV_P: (1) In vect_set_loop_condition_partial_vectors, for the first iteration of: FOR_EACH_VEC_ELT (*controls, i, rgc) if (!rgc->controls.is_empty ()) call vect_set_loop_controls_directly. That is: /* See whether zero-based IV would ever generate all-false masks or zero length before wrapping around. */ bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); /* Set up all controls for this group. */ test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq, &header_seq, loop_cond_gsi, rgc, niters, niters_skip, might_wrap_p); needs to be an "if" that (for LOOP_VINFO_USING_DECREMENTING_IV_P) is only executed on the first iteration. (2) vect_set_loop_controls_directly calculates "step" as in your patch. If rgc has 1 control, this step is the SSA name created for that control. Otherwise the step is a fresh SSA name, as in your patch. (3) vect_set_loop_controls_directly stores this step somewhere for later use, probably in LOOP_VINFO. Let's use "S" to refer to this stored step. (4) After the vect_set_loop_controls_directly call above, and outside the "if" statement that now contains vect_set_loop_controls_directly, check whether rgc->controls.length () > 1. If so, use vect_adjust_loop_lens_control to set the controls based on S. Then the only caller of vect_adjust_loop_lens_control is vect_set_loop_condition_partial_vectors. And the starting step for vect_adjust_loop_lens_control is always S. It sounds complicated when I write it like that, but I think it will be clearer in code. Hope that makes sense. Please let me know if not. Thanks, Richard ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 2023-05-24 11:23 ` Richard Sandiford @ 2023-05-24 11:52 ` 钟居哲 2023-05-24 12:41 ` Richard Sandiford 2023-05-24 12:19 ` 钟居哲 1 sibling, 1 reply; 16+ messages in thread From: 钟居哲 @ 2023-05-24 11:52 UTC (permalink / raw) To: richard.sandiford; +Cc: gcc-patches, rguenther [-- Attachment #1: Type: text/plain, Size: 16768 bytes --] Hi, Richard. It's quite complicated for me and I am not sure whether I can catch up with you. So I will rather split the work step by step to implement the decrement IV For the first step you mentioned: >> (1) In vect_set_loop_condition_partial_vectors, for the first iteration of: >> FOR_EACH_VEC_ELT (*controls, i, rgc) >> if (!rgc->controls.is_empty ()) >> call vect_set_loop_controls_directly. That is: >> >> /* See whether zero-based IV would ever generate all-false masks >> or zero length before wrapping around. */ >> bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); >> /* Set up all controls for this group. */ >> test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo, >> &preheader_seq, >> &header_seq, >> loop_cond_gsi, rgc, >> niters, niters_skip, >> might_wrap_p); >> needs to be an "if" that (for LOOP_VINFO_USING_DECREMENTING_IV_P) >> is only executed on the first iteration. Is it correct like this? FOR_EACH_VEC_ELT (*controls, i, rgc) if (!rgc->controls.is_empty ()) { /* First try using permutes. This adds a single vector instruction to the loop for each mask, but needs no extra loop invariants or IVs. */ unsigned int nmasks = i + 1; if (use_masks_p && (nmasks & 1) == 0) { rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1]; if (!half_rgc->controls.is_empty () && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc)) continue; } /* See whether zero-based IV would ever generate all-false masks or zero length before wrapping around. */ bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); /* Set up all controls for this group. */ test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq, &header_seq, loop_cond_gsi, rgc, niters, niters_skip, might_wrap_p); /* Decrement IV only run vect_set_loop_controls_directly once. */ if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) break; } Thanks. juzhe.zhong@rivai.ai From: Richard Sandiford Date: 2023-05-24 19:23 To: juzhe.zhong CC: gcc-patches; rguenther Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support Sorry for the slow review. I needed some time to go through this patch and surrounding code to understand it, and to understand why it wasn't structured the way I was expecting. I've got some specific comments below, and then a general comment about how I think we should structure this. juzhe.zhong@rivai.ai writes: > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai> > > gcc/ChangeLog: > > * tree-vect-loop-manip.cc (vect_adjust_loop_lens_control): New function. > (vect_set_loop_controls_directly): Add decrement IV support. > (vect_set_loop_condition_partial_vectors): Ditto. > * tree-vect-loop.cc: Ditto. > * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro. > > --- > gcc/tree-vect-loop-manip.cc | 184 +++++++++++++++++++++++++++++++++++- > gcc/tree-vect-loop.cc | 10 ++ > gcc/tree-vectorizer.h | 8 ++ > 3 files changed, 199 insertions(+), 3 deletions(-) > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc > index ff6159e08d5..94b38d1e0fb 100644 > --- a/gcc/tree-vect-loop-manip.cc > +++ b/gcc/tree-vect-loop-manip.cc > @@ -385,6 +385,66 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, > return false; > } > > +/* Try to use adjust loop lens for non-SLP multiple-rgroups. > + > + _36 = MIN_EXPR <ivtmp_34, VF>; > + > + First length (MIN (X, VF/N)): > + loop_len_15 = MIN_EXPR <_36, VF/N>; > + > + Second length: > + tmp = _36 - loop_len_15; > + loop_len_16 = MIN (tmp, VF/N); > + > + Third length: > + tmp2 = tmp - loop_len_16; > + loop_len_17 = MIN (tmp2, VF/N); > + > + Last length: > + loop_len_18 = tmp2 - loop_len_17; > +*/ > + > +static void > +vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq, > + rgroup_controls *dest_rgm, > + rgroup_controls *src_rgm, tree step) > +{ > + tree ctrl_type = dest_rgm->type; > + poly_uint64 nitems_per_ctrl > + = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor; > + tree length_limit = build_int_cst (iv_type, nitems_per_ctrl); > + > + for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) > + { > + if (!step) > + step = src_rgm->controls[i / dest_rgm->controls.length ()]; Could you explain this index? It looks like it will always be 0 due to the range of i. Since this is the only use of src_rgm, it might be cleaner to drop src_rgm and only pass the step. > + tree ctrl = dest_rgm->controls[i]; > + if (i == 0) > + { > + /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */ > + gassign *assign > + = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit); > + gimple_seq_add_stmt (seq, assign); > + } > + else if (i == dest_rgm->controls.length () - 1) > + { > + /* Last iteration: Remain capped to the range [0, VF/N]. */ > + gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step, > + dest_rgm->controls[i - 1]); > + gimple_seq_add_stmt (seq, assign); > + } > + else > + { > + /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */ > + step = gimple_build (seq, MINUS_EXPR, iv_type, step, > + dest_rgm->controls[i - 1]); > + gassign *assign > + = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit); > + gimple_seq_add_stmt (seq, assign); > + } > + } > +} Not your fault, but the structure seems kind-of awkward, since it's really a MINUS_EXPR for i != 0 followed by a MIN_EXPR for i != last. But I agree that it probably has to be written as above given that the final destination is fixed in advance. > + > /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions > for all the rgroup controls in RGC and return a control that is nonzero > when the loop needs to iterate. Add any new preheader statements to > @@ -468,9 +528,78 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, > gimple_stmt_iterator incr_gsi; > bool insert_after; > standard_iv_increment_position (loop, &incr_gsi, &insert_after); > - create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE, > - loop, &incr_gsi, insert_after, &index_before_incr, > - &index_after_incr); > + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) > + { > + nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total); > + tree step = make_ssa_name (iv_type); > + /* Create decrement IV. */ > + create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi, > + insert_after, &index_before_incr, &index_after_incr); > + tree temp = gimple_build (header_seq, MIN_EXPR, iv_type, > + index_before_incr, nitems_step); > + gimple_seq_add_stmt (header_seq, gimple_build_assign (step, temp)); Probably simpler to build the MIN_EXPR as a gimple_build_assign, with "step" as its destination. No simplification is likely given that the input is a freshly-minted IV. > + > + if (rgc->max_nscalars_per_iter == 1) I'm not sure this is the correct condition. Isn't the split really between whether rgc->controls.length () is 1 or greater than 1? It's possible (even common) for rgc->controls.length () == 1 && rgc->max_nscalars_per_iter != 1, and in that case, vect_adjust_loop_lens_control will create an unnecessary MIN_EXPR. When rgc->controls.length () is 1, it would be good to use rgc->controls[0] as "step" above, rather than make a new SSA name. Thanks for putting the code here though. It looks like the right place to me. More on this below. > + { > + /* single rgroup: > + ... > + _10 = (unsigned long) count_12(D); > + ... > + # ivtmp_9 = PHI <ivtmp_35(6), _10(5)> > + _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>; > + ... > + vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0); > + ... > + ivtmp_35 = ivtmp_9 - _36; > + ... > + if (ivtmp_35 != 0) > + goto <bb 4>; [83.33%] > + else > + goto <bb 5>; [16.67%] > + */ > + gassign *assign = gimple_build_assign (rgc->controls[0], step); > + gimple_seq_add_stmt (header_seq, assign); > + } > + else > + { > + /* Multiple rgroup (SLP): > + ... > + _38 = (unsigned long) bnd.7_29; > + _39 = _38 * 2; > + ... > + # ivtmp_41 = PHI <ivtmp_42(6), _39(5)> > + ... > + _43 = MIN_EXPR <ivtmp_41, 32>; > + loop_len_26 = MIN_EXPR <_43, 16>; > + loop_len_25 = _43 - loop_len_26; > + ... > + .LEN_STORE (_6, 8B, loop_len_26, ...); > + ... > + .LEN_STORE (_25, 8B, loop_len_25, ...); > + _33 = loop_len_26 / 2; > + ... > + .LEN_STORE (_8, 16B, _33, ...); > + _36 = loop_len_25 / 2; > + ... > + .LEN_STORE (_15, 16B, _36, ...); > + ivtmp_42 = ivtmp_41 - _43; > + ... > + if (ivtmp_42 != 0) > + goto <bb 4>; [83.33%] > + else > + goto <bb 5>; [16.67%] > + */ > + vect_adjust_loop_lens_control (iv_type, header_seq, rgc, NULL, step); > + } > + return index_after_incr; > + } > + else > + { > + /* Create increment IV. */ > + create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE, > + loop, &incr_gsi, insert_after, &index_before_incr, > + &index_after_incr); > + } Probably better to have no "else" and instead fall through to this create_iv. It makes it clearer that the decrementing IV case doesn't continue beyond the "if" above. > > tree zero_index = build_int_cst (compare_type, 0); > tree test_index, test_limit, first_limit; > @@ -753,6 +882,55 @@ vect_set_loop_condition_partial_vectors (class loop *loop, > continue; > } > > + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) > + && rgc->max_nscalars_per_iter == 1 > + && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0]) > + { > + /* Multiple rgroup (non-SLP): > + ... > + _38 = (unsigned long) n_12(D); > + ... > + # ivtmp_38 = PHI <ivtmp_39(3), 100(2)> > + ... > + _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>; > + loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>; > + _41 = _40 - loop_len_21; > + loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>; > + _42 = _40 - loop_len_20; > + loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>; > + _43 = _40 - loop_len_19; > + loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>; > + ... > + vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0); > + ... > + vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0); > + ... > + vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0); > + ... > + vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0); > + vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>; > + vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>; > + vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>; > + ... > + .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0); > + ivtmp_39 = ivtmp_38 - _40; > + ... > + if (ivtmp_39 != 0) > + goto <bb 3>; [92.31%] > + else > + goto <bb 4>; [7.69%] > + */ > + rgroup_controls *sub_rgc > + = &(*controls)[nmasks / rgc->controls.length () - 1]; > + if (!sub_rgc->controls.is_empty ()) > + { > + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); > + vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, > + sub_rgc, NULL_TREE); > + continue; > + } > + } I don't think this is right, since if we have 1-control, 2-control and 4-control groups, the fan-out for the 4-control group must be based on the 1-control group rather than any general sub_rgc. Using any other rgroup would mean that the input was already clamped by a MIN_EXPR (because that input would itself have been created by vect_adjust_loop_lens_control). > + > /* See whether zero-based IV would ever generate all-false masks > or zero length before wrapping around. */ > bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index cf10132b0bf..53ac4d2a6c2 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -2725,6 +2725,16 @@ start_over: > && !vect_verify_loop_lens (loop_vinfo)) > LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > > + /* If we're vectorizing an loop that uses length "controls" and > + can iterate more than once, we apply decrementing IV approach > + in loop control. */ > + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) > + && !LOOP_VINFO_LENS (loop_vinfo).is_empty () > + && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) > + && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo), > + LOOP_VINFO_VECT_FACTOR (loop_vinfo)))) I think this also needs to check: LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0 ----- I think another way to think about the patch is this: Things would be easy if there was always a 1-control group. Then we could create a decrementing IV for that 1-control group, in the way that you do above. All other groups would use the 1-control group as input. They would fan that input out using vect_adjust_loop_lens_control. However, there isn't guaranteed to be 1-control group, and creating a fake one might not be easy. We'd need to calculate valid values for the factor, type, nscalars_per_iter, etc. fields. So when there is no 1-control group, it's probably easier to create a decrementing IV based on the first non-empty group. This is what your patch does. And I think it's the right approach. However, the IV created in this way is "special". It doesn't exist as part of the normal rgroup framework. It's something created outside the rgroup framework to make the controls easier to populate. In your case, this is the case where "step" is created and is not assigned directly to rgc->controls. Given that, I think we should structure the code as follows, changing the behaviour only for LOOP_VINFO_USING_DECREMENTING_IV_P: (1) In vect_set_loop_condition_partial_vectors, for the first iteration of: FOR_EACH_VEC_ELT (*controls, i, rgc) if (!rgc->controls.is_empty ()) call vect_set_loop_controls_directly. That is: /* See whether zero-based IV would ever generate all-false masks or zero length before wrapping around. */ bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); /* Set up all controls for this group. */ test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq, &header_seq, loop_cond_gsi, rgc, niters, niters_skip, might_wrap_p); needs to be an "if" that (for LOOP_VINFO_USING_DECREMENTING_IV_P) is only executed on the first iteration. (2) vect_set_loop_controls_directly calculates "step" as in your patch. If rgc has 1 control, this step is the SSA name created for that control. Otherwise the step is a fresh SSA name, as in your patch. (3) vect_set_loop_controls_directly stores this step somewhere for later use, probably in LOOP_VINFO. Let's use "S" to refer to this stored step. (4) After the vect_set_loop_controls_directly call above, and outside the "if" statement that now contains vect_set_loop_controls_directly, check whether rgc->controls.length () > 1. If so, use vect_adjust_loop_lens_control to set the controls based on S. Then the only caller of vect_adjust_loop_lens_control is vect_set_loop_condition_partial_vectors. And the starting step for vect_adjust_loop_lens_control is always S. It sounds complicated when I write it like that, but I think it will be clearer in code. Hope that makes sense. Please let me know if not. Thanks, Richard ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 2023-05-24 11:52 ` 钟居哲 @ 2023-05-24 12:41 ` Richard Sandiford 2023-05-24 12:51 ` Richard Biener 2023-05-24 13:26 ` 钟居哲 0 siblings, 2 replies; 16+ messages in thread From: Richard Sandiford @ 2023-05-24 12:41 UTC (permalink / raw) To: 钟居哲; +Cc: gcc-patches, rguenther Sorry, I realised later that I had an implicit assumption here: if there are multiple rgroups, it's better to have a single IV for the smallest rgroup and scale that up to bigger rgroups. E.g. if the loop control IV is taken from an N-control rgroup and has a step S, an N*M-control rgroup would be based on M*S. Of course, it's also OK to create multiple IVs if you prefer. It's just a question of which approach gives the best output in practice. Another way of going from an N-control rgroup ("G1") to an N*M-control rgroup ("G2") would be to reuse all N controls from G1. E.g. the first M controls in G2 would come from G1[0], the next M from G1[1], etc. That might lower the longest dependency chain. But whatever we do, it doesn't feel like max_nscalars_per_iter should be part of the decision. (I realise it will be part of the decision for the follow-on SELECT_IV patch. But that's because we require the number of elements processed in each iteration to be a multiple of max_nscalars_per_iter, and AIUI SELECT_IV wouldn't guarantee that. max_nscalars_per_iter shouldn't matter for the current patch though.) 钟居哲 <juzhe.zhong@rivai.ai> writes: > Hi, Richard. It's quite complicated for me and I am not sure whether I can catch up with you. > So I will rather split the work step by step to implement the decrement IV > > For the first step you mentioned: > >>> (1) In vect_set_loop_condition_partial_vectors, for the first iteration of: > > >> FOR_EACH_VEC_ELT (*controls, i, rgc) > >> if (!rgc->controls.is_empty ()) > >>> call vect_set_loop_controls_directly. That is: > >>> >> /* See whether zero-based IV would ever generate all-false masks >>> or zero length before wrapping around. */ >>> bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); >>> > /* Set up all controls for this group. */ >>> test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo, > >> &preheader_seq, > >> &header_seq, > >> loop_cond_gsi, rgc, > >> niters, niters_skip, > >> might_wrap_p); > >>> needs to be an "if" that (for LOOP_VINFO_USING_DECREMENTING_IV_P) >>> is only executed on the first iteration. > > Is it correct like this? > > FOR_EACH_VEC_ELT (*controls, i, rgc) > if (!rgc->controls.is_empty ()) > { > /* First try using permutes. This adds a single vector > instruction to the loop for each mask, but needs no extra > loop invariants or IVs. */ > unsigned int nmasks = i + 1; > if (use_masks_p && (nmasks & 1) == 0) > { > rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1]; > if (!half_rgc->controls.is_empty () > && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc)) > continue; > } > > /* See whether zero-based IV would ever generate all-false masks > or zero length before wrapping around. */ > bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); > > /* Set up all controls for this group. */ > test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo, > &preheader_seq, > &header_seq, > loop_cond_gsi, rgc, > niters, niters_skip, > might_wrap_p); > > /* Decrement IV only run vect_set_loop_controls_directly once. */ > if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) > break; > } I meant something like: FOR_EACH_VEC_ELT (*controls, i, rgc) if (!rgc->controls.is_empty ()) { /* First try using permutes. This adds a single vector instruction to the loop for each mask, but needs no extra loop invariants or IVs. */ unsigned int nmasks = i + 1; if (use_masks_p && (nmasks & 1) == 0) { rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1]; if (!half_rgc->controls.is_empty () && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc)) continue; } if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) || !LOOP_VINFO_DECREMENTING_IV_STEP (loop_info)) { /* See whether zero-based IV would ever generate all-false masks or zero length before wrapping around. */ bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); /* Set up all controls for this group. */ test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq, &header_seq, loop_cond_gsi, rgc, niters, niters_skip, might_wrap_p); } if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) && rgc->controls.length () > 1) ...use vect_adjust_loop_lens_control... } where LOOP_VINFO_DECREMENTING_IV_STEP (loop_info) is "S" from my previous review. vect_set_loop_controls_directly would then set LOOP_VINFO_DECREMENTING_IV_STEP but would not call vect_adjust_loop_lens_control. But like I say, this is all based on the assumption that we should have a single IV and scale it up for later rgroups. If you'd prefer separate IVs then that's fine. But then I think it's less clear why we have: > + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) > + && rgc->max_nscalars_per_iter == 1 > + && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0]) > + { > + /* Multiple rgroup (non-SLP): > + ... > + _38 = (unsigned long) n_12(D); > + ... > + # ivtmp_38 = PHI <ivtmp_39(3), 100(2)> > + ... > + _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>; > + loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>; > + _41 = _40 - loop_len_21; > + loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>; > + _42 = _40 - loop_len_20; > + loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>; > + _43 = _40 - loop_len_19; > + loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>; > + ... > + vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0); > + ... > + vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0); > + ... > + vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0); > + ... > + vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0); > + vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>; > + vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>; > + vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>; > + ... > + .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0); > + ivtmp_39 = ivtmp_38 - _40; > + ... > + if (ivtmp_39 != 0) > + goto <bb 3>; [92.31%] > + else > + goto <bb 4>; [7.69%] > + */ > + rgroup_controls *sub_rgc > + = &(*controls)[nmasks / rgc->controls.length () - 1]; > + if (!sub_rgc->controls.is_empty ()) > + { > + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); > + vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, > + sub_rgc, NULL_TREE); > + continue; > + } > + } In other words, why is this different from what vect_set_loop_controls_directly would do? Thanks, Richard ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 2023-05-24 12:41 ` Richard Sandiford @ 2023-05-24 12:51 ` Richard Biener 2023-05-24 13:27 ` 钟居哲 2023-05-24 13:26 ` 钟居哲 1 sibling, 1 reply; 16+ messages in thread From: Richard Biener @ 2023-05-24 12:51 UTC (permalink / raw) To: Richard Sandiford; +Cc: 钟居哲, gcc-patches On Wed, 24 May 2023, Richard Sandiford wrote: > Sorry, I realised later that I had an implicit assumption here: > if there are multiple rgroups, it's better to have a single IV > for the smallest rgroup and scale that up to bigger rgroups. > > E.g. if the loop control IV is taken from an N-control rgroup > and has a step S, an N*M-control rgroup would be based on M*S. > > Of course, it's also OK to create multiple IVs if you prefer. > It's just a question of which approach gives the best output > in practice. One thing to check is whether IVOPTs is ever able to eliminate one such IV using another. You can then also check whether when presented with a single IV it already considers the others you can create as candidates so you get the optimal selection in the end. Richard. ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 2023-05-24 12:51 ` Richard Biener @ 2023-05-24 13:27 ` 钟居哲 0 siblings, 0 replies; 16+ messages in thread From: 钟居哲 @ 2023-05-24 13:27 UTC (permalink / raw) To: rguenther, richard.sandiford; +Cc: gcc-patches [-- Attachment #1: Type: text/plain, Size: 1154 bytes --] OK. Thanks. I am gonna refine the patch following Richard's idea and test it. Thanks both Richard and Richi. juzhe.zhong@rivai.ai From: Richard Biener Date: 2023-05-24 20:51 To: Richard Sandiford CC: 钟居哲; gcc-patches Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support On Wed, 24 May 2023, Richard Sandiford wrote: > Sorry, I realised later that I had an implicit assumption here: > if there are multiple rgroups, it's better to have a single IV > for the smallest rgroup and scale that up to bigger rgroups. > > E.g. if the loop control IV is taken from an N-control rgroup > and has a step S, an N*M-control rgroup would be based on M*S. > > Of course, it's also OK to create multiple IVs if you prefer. > It's just a question of which approach gives the best output > in practice. One thing to check is whether IVOPTs is ever able to eliminate one such IV using another. You can then also check whether when presented with a single IV it already considers the others you can create as candidates so you get the optimal selection in the end. Richard. ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 2023-05-24 12:41 ` Richard Sandiford 2023-05-24 12:51 ` Richard Biener @ 2023-05-24 13:26 ` 钟居哲 2023-05-24 14:01 ` Richard Sandiford 1 sibling, 1 reply; 16+ messages in thread From: 钟居哲 @ 2023-05-24 13:26 UTC (permalink / raw) To: richard.sandiford; +Cc: gcc-patches, rguenther [-- Attachment #1: Type: text/plain, Size: 12097 bytes --] >> In other words, why is this different from what >>vect_set_loop_controls_directly would do? Oh, I see. You are confused that why I do not make multiple-rgroup vec_trunk handling inside "vect_set_loop_controls_directly". Well. Frankly, I just replicate the handling of ARM SVE: unsigned int nmasks = i + 1; if (use_masks_p && (nmasks & 1) == 0) { rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1]; if (!half_rgc->controls.is_empty () && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc)) continue; } /* Try to use permutes to define the masks in DEST_RGM using the masks in SRC_RGM, given that the former has twice as many masks as the latter. Return true on success, adding any new statements to SEQ. */ static bool vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, rgroup_controls *src_rgm) { tree src_masktype = src_rgm->type; tree dest_masktype = dest_rgm->type; machine_mode src_mode = TYPE_MODE (src_masktype); insn_code icode1, icode2; if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter && (icode1 = optab_handler (vec_unpacku_hi_optab, src_mode)) != CODE_FOR_nothing && (icode2 = optab_handler (vec_unpacku_lo_optab, src_mode)) != CODE_FOR_nothing) { /* Unpacking the source masks gives at least as many mask bits as we need. We can then VIEW_CONVERT any excess bits away. */ machine_mode dest_mode = insn_data[icode1].operand[0].mode; gcc_assert (dest_mode == insn_data[icode2].operand[0].mode); tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode); for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) { tree src = src_rgm->controls[i / 2]; tree dest = dest_rgm->controls[i]; tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1) ? VEC_UNPACK_HI_EXPR : VEC_UNPACK_LO_EXPR); gassign *stmt; if (dest_masktype == unpack_masktype) stmt = gimple_build_assign (dest, code, src); else { tree temp = make_ssa_name (unpack_masktype); stmt = gimple_build_assign (temp, code, src); gimple_seq_add_stmt (seq, stmt); stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR, build1 (VIEW_CONVERT_EXPR, dest_masktype, temp)); } gimple_seq_add_stmt (seq, stmt); } return true; } vec_perm_indices indices[2]; if (dest_masktype == src_masktype && interleave_supported_p (&indices[0], src_masktype, 0) && interleave_supported_p (&indices[1], src_masktype, 1)) { /* The destination requires twice as many mask bits as the source, so we can use interleaving permutes to double up the number of bits. */ tree masks[2]; for (unsigned int i = 0; i < 2; ++i) masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]); for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) { tree src = src_rgm->controls[i / 2]; tree dest = dest_rgm->controls[i]; gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR, src, src, masks[i & 1]); gimple_seq_add_stmt (seq, stmt); } return true; } return false; } I know this is just optimization for ARM SVE with sub_rgc (int16) is half size of rgc (int8). But when I just copy the codes from ARM SVE and make it general for all cases (int8 <-> int64). They all work well and codegen is good. If you don't like this way, would you mind give me some suggestions? Thanks. juzhe.zhong@rivai.ai From: Richard Sandiford Date: 2023-05-24 20:41 To: 钟居哲 CC: gcc-patches; rguenther Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support Sorry, I realised later that I had an implicit assumption here: if there are multiple rgroups, it's better to have a single IV for the smallest rgroup and scale that up to bigger rgroups. E.g. if the loop control IV is taken from an N-control rgroup and has a step S, an N*M-control rgroup would be based on M*S. Of course, it's also OK to create multiple IVs if you prefer. It's just a question of which approach gives the best output in practice. Another way of going from an N-control rgroup ("G1") to an N*M-control rgroup ("G2") would be to reuse all N controls from G1. E.g. the first M controls in G2 would come from G1[0], the next M from G1[1], etc. That might lower the longest dependency chain. But whatever we do, it doesn't feel like max_nscalars_per_iter should be part of the decision. (I realise it will be part of the decision for the follow-on SELECT_IV patch. But that's because we require the number of elements processed in each iteration to be a multiple of max_nscalars_per_iter, and AIUI SELECT_IV wouldn't guarantee that. max_nscalars_per_iter shouldn't matter for the current patch though.) 钟居哲 <juzhe.zhong@rivai.ai> writes: > Hi, Richard. It's quite complicated for me and I am not sure whether I can catch up with you. > So I will rather split the work step by step to implement the decrement IV > > For the first step you mentioned: > >>> (1) In vect_set_loop_condition_partial_vectors, for the first iteration of: > > >> FOR_EACH_VEC_ELT (*controls, i, rgc) > >> if (!rgc->controls.is_empty ()) > >>> call vect_set_loop_controls_directly. That is: > >>> >> /* See whether zero-based IV would ever generate all-false masks >>> or zero length before wrapping around. */ >>> bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); >>> > /* Set up all controls for this group. */ >>> test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo, > >> &preheader_seq, > >> &header_seq, > >> loop_cond_gsi, rgc, > >> niters, niters_skip, > >> might_wrap_p); > >>> needs to be an "if" that (for LOOP_VINFO_USING_DECREMENTING_IV_P) >>> is only executed on the first iteration. > > Is it correct like this? > > FOR_EACH_VEC_ELT (*controls, i, rgc) > if (!rgc->controls.is_empty ()) > { > /* First try using permutes. This adds a single vector > instruction to the loop for each mask, but needs no extra > loop invariants or IVs. */ > unsigned int nmasks = i + 1; > if (use_masks_p && (nmasks & 1) == 0) > { > rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1]; > if (!half_rgc->controls.is_empty () > && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc)) > continue; > } > > /* See whether zero-based IV would ever generate all-false masks > or zero length before wrapping around. */ > bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); > > /* Set up all controls for this group. */ > test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo, > &preheader_seq, > &header_seq, > loop_cond_gsi, rgc, > niters, niters_skip, > might_wrap_p); > > /* Decrement IV only run vect_set_loop_controls_directly once. */ > if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) > break; > } I meant something like: FOR_EACH_VEC_ELT (*controls, i, rgc) if (!rgc->controls.is_empty ()) { /* First try using permutes. This adds a single vector instruction to the loop for each mask, but needs no extra loop invariants or IVs. */ unsigned int nmasks = i + 1; if (use_masks_p && (nmasks & 1) == 0) { rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1]; if (!half_rgc->controls.is_empty () && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc)) continue; } if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) || !LOOP_VINFO_DECREMENTING_IV_STEP (loop_info)) { /* See whether zero-based IV would ever generate all-false masks or zero length before wrapping around. */ bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); /* Set up all controls for this group. */ test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq, &header_seq, loop_cond_gsi, rgc, niters, niters_skip, might_wrap_p); } if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) && rgc->controls.length () > 1) ...use vect_adjust_loop_lens_control... } where LOOP_VINFO_DECREMENTING_IV_STEP (loop_info) is "S" from my previous review. vect_set_loop_controls_directly would then set LOOP_VINFO_DECREMENTING_IV_STEP but would not call vect_adjust_loop_lens_control. But like I say, this is all based on the assumption that we should have a single IV and scale it up for later rgroups. If you'd prefer separate IVs then that's fine. But then I think it's less clear why we have: > + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) > + && rgc->max_nscalars_per_iter == 1 > + && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0]) > + { > + /* Multiple rgroup (non-SLP): > + ... > + _38 = (unsigned long) n_12(D); > + ... > + # ivtmp_38 = PHI <ivtmp_39(3), 100(2)> > + ... > + _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>; > + loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>; > + _41 = _40 - loop_len_21; > + loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>; > + _42 = _40 - loop_len_20; > + loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>; > + _43 = _40 - loop_len_19; > + loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>; > + ... > + vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0); > + ... > + vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0); > + ... > + vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0); > + ... > + vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0); > + vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>; > + vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>; > + vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>; > + ... > + .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0); > + ivtmp_39 = ivtmp_38 - _40; > + ... > + if (ivtmp_39 != 0) > + goto <bb 3>; [92.31%] > + else > + goto <bb 4>; [7.69%] > + */ > + rgroup_controls *sub_rgc > + = &(*controls)[nmasks / rgc->controls.length () - 1]; > + if (!sub_rgc->controls.is_empty ()) > + { > + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); > + vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, > + sub_rgc, NULL_TREE); > + continue; > + } > + } In other words, why is this different from what vect_set_loop_controls_directly would do? Thanks, Richard ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 2023-05-24 13:26 ` 钟居哲 @ 2023-05-24 14:01 ` Richard Sandiford 2023-05-24 14:10 ` 钟居哲 ` (2 more replies) 0 siblings, 3 replies; 16+ messages in thread From: Richard Sandiford @ 2023-05-24 14:01 UTC (permalink / raw) To: 钟居哲; +Cc: gcc-patches, rguenther 钟居哲 <juzhe.zhong@rivai.ai> writes: >>> In other words, why is this different from what >>>vect_set_loop_controls_directly would do? > Oh, I see. You are confused that why I do not make multiple-rgroup vec_trunk > handling inside "vect_set_loop_controls_directly". > > Well. Frankly, I just replicate the handling of ARM SVE: > unsigned int nmasks = i + 1; > if (use_masks_p && (nmasks & 1) == 0) > { > rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1]; > if (!half_rgc->controls.is_empty () > && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc)) > continue; > } > > /* Try to use permutes to define the masks in DEST_RGM using the masks > in SRC_RGM, given that the former has twice as many masks as the > latter. Return true on success, adding any new statements to SEQ. */ > > static bool > vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, > rgroup_controls *src_rgm) > { > tree src_masktype = src_rgm->type; > tree dest_masktype = dest_rgm->type; > machine_mode src_mode = TYPE_MODE (src_masktype); > insn_code icode1, icode2; > if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter > && (icode1 = optab_handler (vec_unpacku_hi_optab, > src_mode)) != CODE_FOR_nothing > && (icode2 = optab_handler (vec_unpacku_lo_optab, > src_mode)) != CODE_FOR_nothing) > { > /* Unpacking the source masks gives at least as many mask bits as > we need. We can then VIEW_CONVERT any excess bits away. */ > machine_mode dest_mode = insn_data[icode1].operand[0].mode; > gcc_assert (dest_mode == insn_data[icode2].operand[0].mode); > tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode); > for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) > { > tree src = src_rgm->controls[i / 2]; > tree dest = dest_rgm->controls[i]; > tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1) > ? VEC_UNPACK_HI_EXPR > : VEC_UNPACK_LO_EXPR); > gassign *stmt; > if (dest_masktype == unpack_masktype) > stmt = gimple_build_assign (dest, code, src); > else > { > tree temp = make_ssa_name (unpack_masktype); > stmt = gimple_build_assign (temp, code, src); > gimple_seq_add_stmt (seq, stmt); > stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR, > build1 (VIEW_CONVERT_EXPR, > dest_masktype, temp)); > } > gimple_seq_add_stmt (seq, stmt); > } > return true; > } > vec_perm_indices indices[2]; > if (dest_masktype == src_masktype > && interleave_supported_p (&indices[0], src_masktype, 0) > && interleave_supported_p (&indices[1], src_masktype, 1)) > { > /* The destination requires twice as many mask bits as the source, so > we can use interleaving permutes to double up the number of bits. */ > tree masks[2]; > for (unsigned int i = 0; i < 2; ++i) > masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]); > for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) > { > tree src = src_rgm->controls[i / 2]; > tree dest = dest_rgm->controls[i]; > gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR, > src, src, masks[i & 1]); > gimple_seq_add_stmt (seq, stmt); > } > return true; > } > return false; > } > > I know this is just optimization for ARM SVE with sub_rgc (int16) is half size of rgc (int8). > But when I just copy the codes from ARM SVE and make it general for all cases (int8 <-> int64). > They all work well and codegen is good. > > If you don't like this way, would you mind give me some suggestions? It's not a case of disliking one approach or disliking another. There are two separate parts of this: one specific and one general. The specific part is that the code had: rgroup_controls *sub_rgc = &(*controls)[nmasks / rgc->controls.length () - 1]; if (!sub_rgc->controls.is_empty ()) { tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, sub_rgc, NULL_TREE); continue; } But AIUI, nmasks is always equal to rgc->controls.length () (if rgc->controls is non-empty). So I think this always used (*controls)[0] as the source rgroup. And I think that's the only case that would work, since vect_adjust_loop_lens_control only reads from sub_rgc once. Is that right? It would be possible to make other source rgroups work, but it would involve reading every element of sub_rgc->controls, not just sub_rgc->controls[0]. (This is also what the SVE code does.) The general point is that there are two extreme approaches: (1) Use one IV, and base everything off that. (2) Use a separate IV for every rgroup. It's also possible to do something between these two extremes. (And we should, if it gives better code.) The code above reuses an existing IV. Doing that for all multi-control rgroups would give (1). At the other extreme, (2) would involve calling vect_set_loop_controls_directly for every rgroup. Both approaches are fine. I'm not against one or the other. What I didn't understand was why your patch only reuses existing IVs for max_nscalars_per_iter == 1. Was it to avoid having to do a multiplication (well, really a shift left) when moving from one rgroup to another? E.g. if one rgroup had; nscalars_per_iter == 2 && factor == 1 and another had: nscalars_per_iter == 4 && factor == 1 then we would need to mulitply by 2 when going from the first rgroup to the second. If so, avoiding a multiplication seems like a good reason for the choice you were making in the path. But we then need to check max_nscalars_per_iter == 1 for both the source rgroup and the destination rgroup, not just the destination. And I think the condition for “no multiplication needed” should be that: max_nscalars_per_iter * factor (aka "nitems_per_iter") is the same for both groups. Thanks, Richard ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 2023-05-24 14:01 ` Richard Sandiford @ 2023-05-24 14:10 ` 钟居哲 2023-05-24 14:57 ` Richard Sandiford [not found] ` <2023052422100415521814@rivai.ai> 2023-05-24 14:35 ` 钟居哲 2 siblings, 1 reply; 16+ messages in thread From: 钟居哲 @ 2023-05-24 14:10 UTC (permalink / raw) To: richard.sandiford; +Cc: gcc-patches, rguenther [-- Attachment #1: Type: text/plain, Size: 8471 bytes --] >> Both approaches are fine. I'm not against one or the other. >> What I didn't understand was why your patch only reuses existing IVs >> for max_nscalars_per_iter == 1. Was it to avoid having to do a >> multiplication (well, really a shift left) when moving from one >> rgroup to another? E.g. if one rgroup had; >> nscalars_per_iter == 2 && factor == 1 >> and another had: >> nscalars_per_iter == 4 && factor == 1 >> then we would need to mulitply by 2 when going from the first rgroup >> to the second. >> If so, avoiding a multiplication seems like a good reason for the choice >> you were making in the path. But we then need to check >> max_nscalars_per_iter == 1 for both the source rgroup and the >> destination rgroup, not just the destination. And I think the >> condition for “no multiplication needed” should be that: Oh, I didn't realize such complicated problem. Frankly, I didn't understand well rgroup. Sorry about that :). I just remember last time you said I need to handle multiple-rgroup not only for SLP but also non-SLP (which is vec_pack_trunk that I tested). Then I asked you when is non-SLP, you said max_nscalars_per_iter == 1. Then I use max_nscalars_per_iter == 1 here (I didn't really lean very well from this, just add it as you said). Actually, I just want to hanlde multip-rgroup for non-SLP here, I am trying to avoid multiplication and I think scalar multiplication (not cost too much) is fine in modern CPU. So, what do you suggest that I handle multiple-rgroup for non-SLP. Thanks. juzhe.zhong@rivai.ai From: Richard Sandiford Date: 2023-05-24 22:01 To: 钟居哲 CC: gcc-patches; rguenther Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 钟居哲 <juzhe.zhong@rivai.ai> writes: >>> In other words, why is this different from what >>>vect_set_loop_controls_directly would do? > Oh, I see. You are confused that why I do not make multiple-rgroup vec_trunk > handling inside "vect_set_loop_controls_directly". > > Well. Frankly, I just replicate the handling of ARM SVE: > unsigned int nmasks = i + 1; > if (use_masks_p && (nmasks & 1) == 0) > { > rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1]; > if (!half_rgc->controls.is_empty () > && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc)) > continue; > } > > /* Try to use permutes to define the masks in DEST_RGM using the masks > in SRC_RGM, given that the former has twice as many masks as the > latter. Return true on success, adding any new statements to SEQ. */ > > static bool > vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, > rgroup_controls *src_rgm) > { > tree src_masktype = src_rgm->type; > tree dest_masktype = dest_rgm->type; > machine_mode src_mode = TYPE_MODE (src_masktype); > insn_code icode1, icode2; > if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter > && (icode1 = optab_handler (vec_unpacku_hi_optab, > src_mode)) != CODE_FOR_nothing > && (icode2 = optab_handler (vec_unpacku_lo_optab, > src_mode)) != CODE_FOR_nothing) > { > /* Unpacking the source masks gives at least as many mask bits as > we need. We can then VIEW_CONVERT any excess bits away. */ > machine_mode dest_mode = insn_data[icode1].operand[0].mode; > gcc_assert (dest_mode == insn_data[icode2].operand[0].mode); > tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode); > for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) > { > tree src = src_rgm->controls[i / 2]; > tree dest = dest_rgm->controls[i]; > tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1) > ? VEC_UNPACK_HI_EXPR > : VEC_UNPACK_LO_EXPR); > gassign *stmt; > if (dest_masktype == unpack_masktype) > stmt = gimple_build_assign (dest, code, src); > else > { > tree temp = make_ssa_name (unpack_masktype); > stmt = gimple_build_assign (temp, code, src); > gimple_seq_add_stmt (seq, stmt); > stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR, > build1 (VIEW_CONVERT_EXPR, > dest_masktype, temp)); > } > gimple_seq_add_stmt (seq, stmt); > } > return true; > } > vec_perm_indices indices[2]; > if (dest_masktype == src_masktype > && interleave_supported_p (&indices[0], src_masktype, 0) > && interleave_supported_p (&indices[1], src_masktype, 1)) > { > /* The destination requires twice as many mask bits as the source, so > we can use interleaving permutes to double up the number of bits. */ > tree masks[2]; > for (unsigned int i = 0; i < 2; ++i) > masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]); > for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) > { > tree src = src_rgm->controls[i / 2]; > tree dest = dest_rgm->controls[i]; > gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR, > src, src, masks[i & 1]); > gimple_seq_add_stmt (seq, stmt); > } > return true; > } > return false; > } > > I know this is just optimization for ARM SVE with sub_rgc (int16) is half size of rgc (int8). > But when I just copy the codes from ARM SVE and make it general for all cases (int8 <-> int64). > They all work well and codegen is good. > > If you don't like this way, would you mind give me some suggestions? It's not a case of disliking one approach or disliking another. There are two separate parts of this: one specific and one general. The specific part is that the code had: rgroup_controls *sub_rgc = &(*controls)[nmasks / rgc->controls.length () - 1]; if (!sub_rgc->controls.is_empty ()) { tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, sub_rgc, NULL_TREE); continue; } But AIUI, nmasks is always equal to rgc->controls.length () (if rgc->controls is non-empty). So I think this always used (*controls)[0] as the source rgroup. And I think that's the only case that would work, since vect_adjust_loop_lens_control only reads from sub_rgc once. Is that right? It would be possible to make other source rgroups work, but it would involve reading every element of sub_rgc->controls, not just sub_rgc->controls[0]. (This is also what the SVE code does.) The general point is that there are two extreme approaches: (1) Use one IV, and base everything off that. (2) Use a separate IV for every rgroup. It's also possible to do something between these two extremes. (And we should, if it gives better code.) The code above reuses an existing IV. Doing that for all multi-control rgroups would give (1). At the other extreme, (2) would involve calling vect_set_loop_controls_directly for every rgroup. Both approaches are fine. I'm not against one or the other. What I didn't understand was why your patch only reuses existing IVs for max_nscalars_per_iter == 1. Was it to avoid having to do a multiplication (well, really a shift left) when moving from one rgroup to another? E.g. if one rgroup had; nscalars_per_iter == 2 && factor == 1 and another had: nscalars_per_iter == 4 && factor == 1 then we would need to mulitply by 2 when going from the first rgroup to the second. If so, avoiding a multiplication seems like a good reason for the choice you were making in the path. But we then need to check max_nscalars_per_iter == 1 for both the source rgroup and the destination rgroup, not just the destination. And I think the condition for “no multiplication needed” should be that: max_nscalars_per_iter * factor (aka "nitems_per_iter") is the same for both groups. Thanks, Richard ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 2023-05-24 14:10 ` 钟居哲 @ 2023-05-24 14:57 ` Richard Sandiford 2023-05-24 14:58 ` 钟居哲 0 siblings, 1 reply; 16+ messages in thread From: Richard Sandiford @ 2023-05-24 14:57 UTC (permalink / raw) To: 钟居哲; +Cc: gcc-patches, rguenther 钟居哲 <juzhe.zhong@rivai.ai> writes: >>> Both approaches are fine. I'm not against one or the other. > >>> What I didn't understand was why your patch only reuses existing IVs >>> for max_nscalars_per_iter == 1. Was it to avoid having to do a >>> multiplication (well, really a shift left) when moving from one >>> rgroup to another? E.g. if one rgroup had; > >>> nscalars_per_iter == 2 && factor == 1 > >>> and another had: > >>> nscalars_per_iter == 4 && factor == 1 > >>> then we would need to mulitply by 2 when going from the first rgroup >>> to the second. > >>> If so, avoiding a multiplication seems like a good reason for the choice >>> you were making in the path. But we then need to check >>> max_nscalars_per_iter == 1 for both the source rgroup and the >>> destination rgroup, not just the destination. And I think the >>> condition for “no multiplication needed” should be that: > > Oh, I didn't realize such complicated problem. Frankly, I didn't understand well > rgroup. Sorry about that :). > > I just remember last time you said I need to handle multiple-rgroup > not only for SLP but also non-SLP (which is vec_pack_trunk that I tested). > Then I asked you when is non-SLP, you said max_nscalars_per_iter == 1. Yeah, max_nscalars_per_iter == 1 is the right way of checking for non-SLP. But I'm never been convinced that SLP vs. non-SLP is a meaningful distinction for this patch (that is, the parts that don't use SELECT_VL). SLP vs. non-SLP matters for SELECT_VL. But the rgroup abstraction should mean that SLP vs. non-SLP doesn't matter otherwise. Thanks, Richard ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 2023-05-24 14:57 ` Richard Sandiford @ 2023-05-24 14:58 ` 钟居哲 0 siblings, 0 replies; 16+ messages in thread From: 钟居哲 @ 2023-05-24 14:58 UTC (permalink / raw) To: richard.sandiford; +Cc: gcc-patches, rguenther [-- Attachment #1: Type: text/plain, Size: 2110 bytes --] Yeah. Thanks. I have sent V14: https://gcc.gnu.org/pipermail/gcc-patches/2023-May/619478.html which I found there is no distinction between SLP and non-SLP. Could you review it? I think it's more reasonable now. Thanks. juzhe.zhong@rivai.ai From: Richard Sandiford Date: 2023-05-24 22:57 To: 钟居哲 CC: gcc-patches; rguenther Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 钟居哲 <juzhe.zhong@rivai.ai> writes: >>> Both approaches are fine. I'm not against one or the other. > >>> What I didn't understand was why your patch only reuses existing IVs >>> for max_nscalars_per_iter == 1. Was it to avoid having to do a >>> multiplication (well, really a shift left) when moving from one >>> rgroup to another? E.g. if one rgroup had; > >>> nscalars_per_iter == 2 && factor == 1 > >>> and another had: > >>> nscalars_per_iter == 4 && factor == 1 > >>> then we would need to mulitply by 2 when going from the first rgroup >>> to the second. > >>> If so, avoiding a multiplication seems like a good reason for the choice >>> you were making in the path. But we then need to check >>> max_nscalars_per_iter == 1 for both the source rgroup and the >>> destination rgroup, not just the destination. And I think the >>> condition for “no multiplication needed” should be that: > > Oh, I didn't realize such complicated problem. Frankly, I didn't understand well > rgroup. Sorry about that :). > > I just remember last time you said I need to handle multiple-rgroup > not only for SLP but also non-SLP (which is vec_pack_trunk that I tested). > Then I asked you when is non-SLP, you said max_nscalars_per_iter == 1. Yeah, max_nscalars_per_iter == 1 is the right way of checking for non-SLP. But I'm never been convinced that SLP vs. non-SLP is a meaningful distinction for this patch (that is, the parts that don't use SELECT_VL). SLP vs. non-SLP matters for SELECT_VL. But the rgroup abstraction should mean that SLP vs. non-SLP doesn't matter otherwise. Thanks, Richard ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <2023052422100415521814@rivai.ai>]
* Re: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support [not found] ` <2023052422100415521814@rivai.ai> @ 2023-05-24 14:11 ` 钟居哲 2023-05-24 14:22 ` 钟居哲 1 sibling, 0 replies; 16+ messages in thread From: 钟居哲 @ 2023-05-24 14:11 UTC (permalink / raw) To: richard.sandiford; +Cc: gcc-patches, rguenther [-- Attachment #1: Type: text/plain, Size: 8981 bytes --] >> Actually, I just want to hanlde multip-rgroup for non-SLP here, I am trying to avoid multiplication and I think >> scalar multiplication (not cost too much) is fine in modern CPU. Sorry for incorrect typo. I didn't try to avoid multiplication and I think multiplication is fine. juzhe.zhong@rivai.ai From: 钟居哲 Date: 2023-05-24 22:10 To: richard.sandiford CC: gcc-patches; rguenther Subject: Re: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support >> Both approaches are fine. I'm not against one or the other. >> What I didn't understand was why your patch only reuses existing IVs >> for max_nscalars_per_iter == 1. Was it to avoid having to do a >> multiplication (well, really a shift left) when moving from one >> rgroup to another? E.g. if one rgroup had; >> nscalars_per_iter == 2 && factor == 1 >> and another had: >> nscalars_per_iter == 4 && factor == 1 >> then we would need to mulitply by 2 when going from the first rgroup >> to the second. >> If so, avoiding a multiplication seems like a good reason for the choice >> you were making in the path. But we then need to check >> max_nscalars_per_iter == 1 for both the source rgroup and the >> destination rgroup, not just the destination. And I think the >> condition for “no multiplication needed” should be that: Oh, I didn't realize such complicated problem. Frankly, I didn't understand well rgroup. Sorry about that :). I just remember last time you said I need to handle multiple-rgroup not only for SLP but also non-SLP (which is vec_pack_trunk that I tested). Then I asked you when is non-SLP, you said max_nscalars_per_iter == 1. Then I use max_nscalars_per_iter == 1 here (I didn't really lean very well from this, just add it as you said). Actually, I just want to hanlde multip-rgroup for non-SLP here, I am trying to avoid multiplication and I think scalar multiplication (not cost too much) is fine in modern CPU. So, what do you suggest that I handle multiple-rgroup for non-SLP. Thanks. juzhe.zhong@rivai.ai From: Richard Sandiford Date: 2023-05-24 22:01 To: 钟居哲 CC: gcc-patches; rguenther Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 钟居哲 <juzhe.zhong@rivai.ai> writes: >>> In other words, why is this different from what >>>vect_set_loop_controls_directly would do? > Oh, I see. You are confused that why I do not make multiple-rgroup vec_trunk > handling inside "vect_set_loop_controls_directly". > > Well. Frankly, I just replicate the handling of ARM SVE: > unsigned int nmasks = i + 1; > if (use_masks_p && (nmasks & 1) == 0) > { > rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1]; > if (!half_rgc->controls.is_empty () > && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc)) > continue; > } > > /* Try to use permutes to define the masks in DEST_RGM using the masks > in SRC_RGM, given that the former has twice as many masks as the > latter. Return true on success, adding any new statements to SEQ. */ > > static bool > vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, > rgroup_controls *src_rgm) > { > tree src_masktype = src_rgm->type; > tree dest_masktype = dest_rgm->type; > machine_mode src_mode = TYPE_MODE (src_masktype); > insn_code icode1, icode2; > if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter > && (icode1 = optab_handler (vec_unpacku_hi_optab, > src_mode)) != CODE_FOR_nothing > && (icode2 = optab_handler (vec_unpacku_lo_optab, > src_mode)) != CODE_FOR_nothing) > { > /* Unpacking the source masks gives at least as many mask bits as > we need. We can then VIEW_CONVERT any excess bits away. */ > machine_mode dest_mode = insn_data[icode1].operand[0].mode; > gcc_assert (dest_mode == insn_data[icode2].operand[0].mode); > tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode); > for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) > { > tree src = src_rgm->controls[i / 2]; > tree dest = dest_rgm->controls[i]; > tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1) > ? VEC_UNPACK_HI_EXPR > : VEC_UNPACK_LO_EXPR); > gassign *stmt; > if (dest_masktype == unpack_masktype) > stmt = gimple_build_assign (dest, code, src); > else > { > tree temp = make_ssa_name (unpack_masktype); > stmt = gimple_build_assign (temp, code, src); > gimple_seq_add_stmt (seq, stmt); > stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR, > build1 (VIEW_CONVERT_EXPR, > dest_masktype, temp)); > } > gimple_seq_add_stmt (seq, stmt); > } > return true; > } > vec_perm_indices indices[2]; > if (dest_masktype == src_masktype > && interleave_supported_p (&indices[0], src_masktype, 0) > && interleave_supported_p (&indices[1], src_masktype, 1)) > { > /* The destination requires twice as many mask bits as the source, so > we can use interleaving permutes to double up the number of bits. */ > tree masks[2]; > for (unsigned int i = 0; i < 2; ++i) > masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]); > for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) > { > tree src = src_rgm->controls[i / 2]; > tree dest = dest_rgm->controls[i]; > gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR, > src, src, masks[i & 1]); > gimple_seq_add_stmt (seq, stmt); > } > return true; > } > return false; > } > > I know this is just optimization for ARM SVE with sub_rgc (int16) is half size of rgc (int8). > But when I just copy the codes from ARM SVE and make it general for all cases (int8 <-> int64). > They all work well and codegen is good. > > If you don't like this way, would you mind give me some suggestions? It's not a case of disliking one approach or disliking another. There are two separate parts of this: one specific and one general. The specific part is that the code had: rgroup_controls *sub_rgc = &(*controls)[nmasks / rgc->controls.length () - 1]; if (!sub_rgc->controls.is_empty ()) { tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, sub_rgc, NULL_TREE); continue; } But AIUI, nmasks is always equal to rgc->controls.length () (if rgc->controls is non-empty). So I think this always used (*controls)[0] as the source rgroup. And I think that's the only case that would work, since vect_adjust_loop_lens_control only reads from sub_rgc once. Is that right? It would be possible to make other source rgroups work, but it would involve reading every element of sub_rgc->controls, not just sub_rgc->controls[0]. (This is also what the SVE code does.) The general point is that there are two extreme approaches: (1) Use one IV, and base everything off that. (2) Use a separate IV for every rgroup. It's also possible to do something between these two extremes. (And we should, if it gives better code.) The code above reuses an existing IV. Doing that for all multi-control rgroups would give (1). At the other extreme, (2) would involve calling vect_set_loop_controls_directly for every rgroup. Both approaches are fine. I'm not against one or the other. What I didn't understand was why your patch only reuses existing IVs for max_nscalars_per_iter == 1. Was it to avoid having to do a multiplication (well, really a shift left) when moving from one rgroup to another? E.g. if one rgroup had; nscalars_per_iter == 2 && factor == 1 and another had: nscalars_per_iter == 4 && factor == 1 then we would need to mulitply by 2 when going from the first rgroup to the second. If so, avoiding a multiplication seems like a good reason for the choice you were making in the path. But we then need to check max_nscalars_per_iter == 1 for both the source rgroup and the destination rgroup, not just the destination. And I think the condition for “no multiplication needed” should be that: max_nscalars_per_iter * factor (aka "nitems_per_iter") is the same for both groups. Thanks, Richard ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support [not found] ` <2023052422100415521814@rivai.ai> 2023-05-24 14:11 ` 钟居哲 @ 2023-05-24 14:22 ` 钟居哲 1 sibling, 0 replies; 16+ messages in thread From: 钟居哲 @ 2023-05-24 14:22 UTC (permalink / raw) To: richard.sandiford; +Cc: gcc-patches, rguenther [-- Attachment #1: Type: text/plain, Size: 8817 bytes --] Oh. I just realize the follow you design is working well for vec_pack_trunk too. Will send V13 patch soon. Thanks. juzhe.zhong@rivai.ai From: 钟居哲 Date: 2023-05-24 22:10 To: richard.sandiford CC: gcc-patches; rguenther Subject: Re: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support >> Both approaches are fine. I'm not against one or the other. >> What I didn't understand was why your patch only reuses existing IVs >> for max_nscalars_per_iter == 1. Was it to avoid having to do a >> multiplication (well, really a shift left) when moving from one >> rgroup to another? E.g. if one rgroup had; >> nscalars_per_iter == 2 && factor == 1 >> and another had: >> nscalars_per_iter == 4 && factor == 1 >> then we would need to mulitply by 2 when going from the first rgroup >> to the second. >> If so, avoiding a multiplication seems like a good reason for the choice >> you were making in the path. But we then need to check >> max_nscalars_per_iter == 1 for both the source rgroup and the >> destination rgroup, not just the destination. And I think the >> condition for “no multiplication needed” should be that: Oh, I didn't realize such complicated problem. Frankly, I didn't understand well rgroup. Sorry about that :). I just remember last time you said I need to handle multiple-rgroup not only for SLP but also non-SLP (which is vec_pack_trunk that I tested). Then I asked you when is non-SLP, you said max_nscalars_per_iter == 1. Then I use max_nscalars_per_iter == 1 here (I didn't really lean very well from this, just add it as you said). Actually, I just want to hanlde multip-rgroup for non-SLP here, I am trying to avoid multiplication and I think scalar multiplication (not cost too much) is fine in modern CPU. So, what do you suggest that I handle multiple-rgroup for non-SLP. Thanks. juzhe.zhong@rivai.ai From: Richard Sandiford Date: 2023-05-24 22:01 To: 钟居哲 CC: gcc-patches; rguenther Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 钟居哲 <juzhe.zhong@rivai.ai> writes: >>> In other words, why is this different from what >>>vect_set_loop_controls_directly would do? > Oh, I see. You are confused that why I do not make multiple-rgroup vec_trunk > handling inside "vect_set_loop_controls_directly". > > Well. Frankly, I just replicate the handling of ARM SVE: > unsigned int nmasks = i + 1; > if (use_masks_p && (nmasks & 1) == 0) > { > rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1]; > if (!half_rgc->controls.is_empty () > && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc)) > continue; > } > > /* Try to use permutes to define the masks in DEST_RGM using the masks > in SRC_RGM, given that the former has twice as many masks as the > latter. Return true on success, adding any new statements to SEQ. */ > > static bool > vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, > rgroup_controls *src_rgm) > { > tree src_masktype = src_rgm->type; > tree dest_masktype = dest_rgm->type; > machine_mode src_mode = TYPE_MODE (src_masktype); > insn_code icode1, icode2; > if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter > && (icode1 = optab_handler (vec_unpacku_hi_optab, > src_mode)) != CODE_FOR_nothing > && (icode2 = optab_handler (vec_unpacku_lo_optab, > src_mode)) != CODE_FOR_nothing) > { > /* Unpacking the source masks gives at least as many mask bits as > we need. We can then VIEW_CONVERT any excess bits away. */ > machine_mode dest_mode = insn_data[icode1].operand[0].mode; > gcc_assert (dest_mode == insn_data[icode2].operand[0].mode); > tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode); > for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) > { > tree src = src_rgm->controls[i / 2]; > tree dest = dest_rgm->controls[i]; > tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1) > ? VEC_UNPACK_HI_EXPR > : VEC_UNPACK_LO_EXPR); > gassign *stmt; > if (dest_masktype == unpack_masktype) > stmt = gimple_build_assign (dest, code, src); > else > { > tree temp = make_ssa_name (unpack_masktype); > stmt = gimple_build_assign (temp, code, src); > gimple_seq_add_stmt (seq, stmt); > stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR, > build1 (VIEW_CONVERT_EXPR, > dest_masktype, temp)); > } > gimple_seq_add_stmt (seq, stmt); > } > return true; > } > vec_perm_indices indices[2]; > if (dest_masktype == src_masktype > && interleave_supported_p (&indices[0], src_masktype, 0) > && interleave_supported_p (&indices[1], src_masktype, 1)) > { > /* The destination requires twice as many mask bits as the source, so > we can use interleaving permutes to double up the number of bits. */ > tree masks[2]; > for (unsigned int i = 0; i < 2; ++i) > masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]); > for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) > { > tree src = src_rgm->controls[i / 2]; > tree dest = dest_rgm->controls[i]; > gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR, > src, src, masks[i & 1]); > gimple_seq_add_stmt (seq, stmt); > } > return true; > } > return false; > } > > I know this is just optimization for ARM SVE with sub_rgc (int16) is half size of rgc (int8). > But when I just copy the codes from ARM SVE and make it general for all cases (int8 <-> int64). > They all work well and codegen is good. > > If you don't like this way, would you mind give me some suggestions? It's not a case of disliking one approach or disliking another. There are two separate parts of this: one specific and one general. The specific part is that the code had: rgroup_controls *sub_rgc = &(*controls)[nmasks / rgc->controls.length () - 1]; if (!sub_rgc->controls.is_empty ()) { tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, sub_rgc, NULL_TREE); continue; } But AIUI, nmasks is always equal to rgc->controls.length () (if rgc->controls is non-empty). So I think this always used (*controls)[0] as the source rgroup. And I think that's the only case that would work, since vect_adjust_loop_lens_control only reads from sub_rgc once. Is that right? It would be possible to make other source rgroups work, but it would involve reading every element of sub_rgc->controls, not just sub_rgc->controls[0]. (This is also what the SVE code does.) The general point is that there are two extreme approaches: (1) Use one IV, and base everything off that. (2) Use a separate IV for every rgroup. It's also possible to do something between these two extremes. (And we should, if it gives better code.) The code above reuses an existing IV. Doing that for all multi-control rgroups would give (1). At the other extreme, (2) would involve calling vect_set_loop_controls_directly for every rgroup. Both approaches are fine. I'm not against one or the other. What I didn't understand was why your patch only reuses existing IVs for max_nscalars_per_iter == 1. Was it to avoid having to do a multiplication (well, really a shift left) when moving from one rgroup to another? E.g. if one rgroup had; nscalars_per_iter == 2 && factor == 1 and another had: nscalars_per_iter == 4 && factor == 1 then we would need to mulitply by 2 when going from the first rgroup to the second. If so, avoiding a multiplication seems like a good reason for the choice you were making in the path. But we then need to check max_nscalars_per_iter == 1 for both the source rgroup and the destination rgroup, not just the destination. And I think the condition for “no multiplication needed” should be that: max_nscalars_per_iter * factor (aka "nitems_per_iter") is the same for both groups. Thanks, Richard ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 2023-05-24 14:01 ` Richard Sandiford 2023-05-24 14:10 ` 钟居哲 [not found] ` <2023052422100415521814@rivai.ai> @ 2023-05-24 14:35 ` 钟居哲 2 siblings, 0 replies; 16+ messages in thread From: 钟居哲 @ 2023-05-24 14:35 UTC (permalink / raw) To: richard.sandiford; +Cc: gcc-patches, rguenther [-- Attachment #1: Type: text/plain, Size: 7066 bytes --] Hi. Richard. I have sent V13: https://gcc.gnu.org/pipermail/gcc-patches/2023-May/619475.html It looks more reasonable now. Could you continue review it again? Thanks. juzhe.zhong@rivai.ai From: Richard Sandiford Date: 2023-05-24 22:01 To: 钟居哲 CC: gcc-patches; rguenther Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 钟居哲 <juzhe.zhong@rivai.ai> writes: >>> In other words, why is this different from what >>>vect_set_loop_controls_directly would do? > Oh, I see. You are confused that why I do not make multiple-rgroup vec_trunk > handling inside "vect_set_loop_controls_directly". > > Well. Frankly, I just replicate the handling of ARM SVE: > unsigned int nmasks = i + 1; > if (use_masks_p && (nmasks & 1) == 0) > { > rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1]; > if (!half_rgc->controls.is_empty () > && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc)) > continue; > } > > /* Try to use permutes to define the masks in DEST_RGM using the masks > in SRC_RGM, given that the former has twice as many masks as the > latter. Return true on success, adding any new statements to SEQ. */ > > static bool > vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, > rgroup_controls *src_rgm) > { > tree src_masktype = src_rgm->type; > tree dest_masktype = dest_rgm->type; > machine_mode src_mode = TYPE_MODE (src_masktype); > insn_code icode1, icode2; > if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter > && (icode1 = optab_handler (vec_unpacku_hi_optab, > src_mode)) != CODE_FOR_nothing > && (icode2 = optab_handler (vec_unpacku_lo_optab, > src_mode)) != CODE_FOR_nothing) > { > /* Unpacking the source masks gives at least as many mask bits as > we need. We can then VIEW_CONVERT any excess bits away. */ > machine_mode dest_mode = insn_data[icode1].operand[0].mode; > gcc_assert (dest_mode == insn_data[icode2].operand[0].mode); > tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode); > for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) > { > tree src = src_rgm->controls[i / 2]; > tree dest = dest_rgm->controls[i]; > tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1) > ? VEC_UNPACK_HI_EXPR > : VEC_UNPACK_LO_EXPR); > gassign *stmt; > if (dest_masktype == unpack_masktype) > stmt = gimple_build_assign (dest, code, src); > else > { > tree temp = make_ssa_name (unpack_masktype); > stmt = gimple_build_assign (temp, code, src); > gimple_seq_add_stmt (seq, stmt); > stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR, > build1 (VIEW_CONVERT_EXPR, > dest_masktype, temp)); > } > gimple_seq_add_stmt (seq, stmt); > } > return true; > } > vec_perm_indices indices[2]; > if (dest_masktype == src_masktype > && interleave_supported_p (&indices[0], src_masktype, 0) > && interleave_supported_p (&indices[1], src_masktype, 1)) > { > /* The destination requires twice as many mask bits as the source, so > we can use interleaving permutes to double up the number of bits. */ > tree masks[2]; > for (unsigned int i = 0; i < 2; ++i) > masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]); > for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) > { > tree src = src_rgm->controls[i / 2]; > tree dest = dest_rgm->controls[i]; > gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR, > src, src, masks[i & 1]); > gimple_seq_add_stmt (seq, stmt); > } > return true; > } > return false; > } > > I know this is just optimization for ARM SVE with sub_rgc (int16) is half size of rgc (int8). > But when I just copy the codes from ARM SVE and make it general for all cases (int8 <-> int64). > They all work well and codegen is good. > > If you don't like this way, would you mind give me some suggestions? It's not a case of disliking one approach or disliking another. There are two separate parts of this: one specific and one general. The specific part is that the code had: rgroup_controls *sub_rgc = &(*controls)[nmasks / rgc->controls.length () - 1]; if (!sub_rgc->controls.is_empty ()) { tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, sub_rgc, NULL_TREE); continue; } But AIUI, nmasks is always equal to rgc->controls.length () (if rgc->controls is non-empty). So I think this always used (*controls)[0] as the source rgroup. And I think that's the only case that would work, since vect_adjust_loop_lens_control only reads from sub_rgc once. Is that right? It would be possible to make other source rgroups work, but it would involve reading every element of sub_rgc->controls, not just sub_rgc->controls[0]. (This is also what the SVE code does.) The general point is that there are two extreme approaches: (1) Use one IV, and base everything off that. (2) Use a separate IV for every rgroup. It's also possible to do something between these two extremes. (And we should, if it gives better code.) The code above reuses an existing IV. Doing that for all multi-control rgroups would give (1). At the other extreme, (2) would involve calling vect_set_loop_controls_directly for every rgroup. Both approaches are fine. I'm not against one or the other. What I didn't understand was why your patch only reuses existing IVs for max_nscalars_per_iter == 1. Was it to avoid having to do a multiplication (well, really a shift left) when moving from one rgroup to another? E.g. if one rgroup had; nscalars_per_iter == 2 && factor == 1 and another had: nscalars_per_iter == 4 && factor == 1 then we would need to mulitply by 2 when going from the first rgroup to the second. If so, avoiding a multiplication seems like a good reason for the choice you were making in the path. But we then need to check max_nscalars_per_iter == 1 for both the source rgroup and the destination rgroup, not just the destination. And I think the condition for “no multiplication needed” should be that: max_nscalars_per_iter * factor (aka "nitems_per_iter") is the same for both groups. Thanks, Richard ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support 2023-05-24 11:23 ` Richard Sandiford 2023-05-24 11:52 ` 钟居哲 @ 2023-05-24 12:19 ` 钟居哲 1 sibling, 0 replies; 16+ messages in thread From: 钟居哲 @ 2023-05-24 12:19 UTC (permalink / raw) To: richard.sandiford; +Cc: gcc-patches, rguenther [-- Attachment #1.1: Type: text/plain, Size: 14380 bytes --] Hi, Richard. For step 1. I have write this patch. Could you take a look at it? Thanks. juzhe.zhong@rivai.ai From: Richard Sandiford Date: 2023-05-24 19:23 To: juzhe.zhong CC: gcc-patches; rguenther Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support Sorry for the slow review. I needed some time to go through this patch and surrounding code to understand it, and to understand why it wasn't structured the way I was expecting. I've got some specific comments below, and then a general comment about how I think we should structure this. juzhe.zhong@rivai.ai writes: > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai> > > gcc/ChangeLog: > > * tree-vect-loop-manip.cc (vect_adjust_loop_lens_control): New function. > (vect_set_loop_controls_directly): Add decrement IV support. > (vect_set_loop_condition_partial_vectors): Ditto. > * tree-vect-loop.cc: Ditto. > * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro. > > --- > gcc/tree-vect-loop-manip.cc | 184 +++++++++++++++++++++++++++++++++++- > gcc/tree-vect-loop.cc | 10 ++ > gcc/tree-vectorizer.h | 8 ++ > 3 files changed, 199 insertions(+), 3 deletions(-) > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc > index ff6159e08d5..94b38d1e0fb 100644 > --- a/gcc/tree-vect-loop-manip.cc > +++ b/gcc/tree-vect-loop-manip.cc > @@ -385,6 +385,66 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, > return false; > } > > +/* Try to use adjust loop lens for non-SLP multiple-rgroups. > + > + _36 = MIN_EXPR <ivtmp_34, VF>; > + > + First length (MIN (X, VF/N)): > + loop_len_15 = MIN_EXPR <_36, VF/N>; > + > + Second length: > + tmp = _36 - loop_len_15; > + loop_len_16 = MIN (tmp, VF/N); > + > + Third length: > + tmp2 = tmp - loop_len_16; > + loop_len_17 = MIN (tmp2, VF/N); > + > + Last length: > + loop_len_18 = tmp2 - loop_len_17; > +*/ > + > +static void > +vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq, > + rgroup_controls *dest_rgm, > + rgroup_controls *src_rgm, tree step) > +{ > + tree ctrl_type = dest_rgm->type; > + poly_uint64 nitems_per_ctrl > + = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor; > + tree length_limit = build_int_cst (iv_type, nitems_per_ctrl); > + > + for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) > + { > + if (!step) > + step = src_rgm->controls[i / dest_rgm->controls.length ()]; Could you explain this index? It looks like it will always be 0 due to the range of i. Since this is the only use of src_rgm, it might be cleaner to drop src_rgm and only pass the step. > + tree ctrl = dest_rgm->controls[i]; > + if (i == 0) > + { > + /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */ > + gassign *assign > + = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit); > + gimple_seq_add_stmt (seq, assign); > + } > + else if (i == dest_rgm->controls.length () - 1) > + { > + /* Last iteration: Remain capped to the range [0, VF/N]. */ > + gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step, > + dest_rgm->controls[i - 1]); > + gimple_seq_add_stmt (seq, assign); > + } > + else > + { > + /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */ > + step = gimple_build (seq, MINUS_EXPR, iv_type, step, > + dest_rgm->controls[i - 1]); > + gassign *assign > + = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit); > + gimple_seq_add_stmt (seq, assign); > + } > + } > +} Not your fault, but the structure seems kind-of awkward, since it's really a MINUS_EXPR for i != 0 followed by a MIN_EXPR for i != last. But I agree that it probably has to be written as above given that the final destination is fixed in advance. > + > /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions > for all the rgroup controls in RGC and return a control that is nonzero > when the loop needs to iterate. Add any new preheader statements to > @@ -468,9 +528,78 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, > gimple_stmt_iterator incr_gsi; > bool insert_after; > standard_iv_increment_position (loop, &incr_gsi, &insert_after); > - create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE, > - loop, &incr_gsi, insert_after, &index_before_incr, > - &index_after_incr); > + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) > + { > + nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total); > + tree step = make_ssa_name (iv_type); > + /* Create decrement IV. */ > + create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi, > + insert_after, &index_before_incr, &index_after_incr); > + tree temp = gimple_build (header_seq, MIN_EXPR, iv_type, > + index_before_incr, nitems_step); > + gimple_seq_add_stmt (header_seq, gimple_build_assign (step, temp)); Probably simpler to build the MIN_EXPR as a gimple_build_assign, with "step" as its destination. No simplification is likely given that the input is a freshly-minted IV. > + > + if (rgc->max_nscalars_per_iter == 1) I'm not sure this is the correct condition. Isn't the split really between whether rgc->controls.length () is 1 or greater than 1? It's possible (even common) for rgc->controls.length () == 1 && rgc->max_nscalars_per_iter != 1, and in that case, vect_adjust_loop_lens_control will create an unnecessary MIN_EXPR. When rgc->controls.length () is 1, it would be good to use rgc->controls[0] as "step" above, rather than make a new SSA name. Thanks for putting the code here though. It looks like the right place to me. More on this below. > + { > + /* single rgroup: > + ... > + _10 = (unsigned long) count_12(D); > + ... > + # ivtmp_9 = PHI <ivtmp_35(6), _10(5)> > + _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>; > + ... > + vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0); > + ... > + ivtmp_35 = ivtmp_9 - _36; > + ... > + if (ivtmp_35 != 0) > + goto <bb 4>; [83.33%] > + else > + goto <bb 5>; [16.67%] > + */ > + gassign *assign = gimple_build_assign (rgc->controls[0], step); > + gimple_seq_add_stmt (header_seq, assign); > + } > + else > + { > + /* Multiple rgroup (SLP): > + ... > + _38 = (unsigned long) bnd.7_29; > + _39 = _38 * 2; > + ... > + # ivtmp_41 = PHI <ivtmp_42(6), _39(5)> > + ... > + _43 = MIN_EXPR <ivtmp_41, 32>; > + loop_len_26 = MIN_EXPR <_43, 16>; > + loop_len_25 = _43 - loop_len_26; > + ... > + .LEN_STORE (_6, 8B, loop_len_26, ...); > + ... > + .LEN_STORE (_25, 8B, loop_len_25, ...); > + _33 = loop_len_26 / 2; > + ... > + .LEN_STORE (_8, 16B, _33, ...); > + _36 = loop_len_25 / 2; > + ... > + .LEN_STORE (_15, 16B, _36, ...); > + ivtmp_42 = ivtmp_41 - _43; > + ... > + if (ivtmp_42 != 0) > + goto <bb 4>; [83.33%] > + else > + goto <bb 5>; [16.67%] > + */ > + vect_adjust_loop_lens_control (iv_type, header_seq, rgc, NULL, step); > + } > + return index_after_incr; > + } > + else > + { > + /* Create increment IV. */ > + create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE, > + loop, &incr_gsi, insert_after, &index_before_incr, > + &index_after_incr); > + } Probably better to have no "else" and instead fall through to this create_iv. It makes it clearer that the decrementing IV case doesn't continue beyond the "if" above. > > tree zero_index = build_int_cst (compare_type, 0); > tree test_index, test_limit, first_limit; > @@ -753,6 +882,55 @@ vect_set_loop_condition_partial_vectors (class loop *loop, > continue; > } > > + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) > + && rgc->max_nscalars_per_iter == 1 > + && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0]) > + { > + /* Multiple rgroup (non-SLP): > + ... > + _38 = (unsigned long) n_12(D); > + ... > + # ivtmp_38 = PHI <ivtmp_39(3), 100(2)> > + ... > + _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>; > + loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>; > + _41 = _40 - loop_len_21; > + loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>; > + _42 = _40 - loop_len_20; > + loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>; > + _43 = _40 - loop_len_19; > + loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>; > + ... > + vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0); > + ... > + vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0); > + ... > + vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0); > + ... > + vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0); > + vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>; > + vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>; > + vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>; > + ... > + .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0); > + ivtmp_39 = ivtmp_38 - _40; > + ... > + if (ivtmp_39 != 0) > + goto <bb 3>; [92.31%] > + else > + goto <bb 4>; [7.69%] > + */ > + rgroup_controls *sub_rgc > + = &(*controls)[nmasks / rgc->controls.length () - 1]; > + if (!sub_rgc->controls.is_empty ()) > + { > + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); > + vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, > + sub_rgc, NULL_TREE); > + continue; > + } > + } I don't think this is right, since if we have 1-control, 2-control and 4-control groups, the fan-out for the 4-control group must be based on the 1-control group rather than any general sub_rgc. Using any other rgroup would mean that the input was already clamped by a MIN_EXPR (because that input would itself have been created by vect_adjust_loop_lens_control). > + > /* See whether zero-based IV would ever generate all-false masks > or zero length before wrapping around. */ > bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index cf10132b0bf..53ac4d2a6c2 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -2725,6 +2725,16 @@ start_over: > && !vect_verify_loop_lens (loop_vinfo)) > LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > > + /* If we're vectorizing an loop that uses length "controls" and > + can iterate more than once, we apply decrementing IV approach > + in loop control. */ > + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) > + && !LOOP_VINFO_LENS (loop_vinfo).is_empty () > + && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) > + && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo), > + LOOP_VINFO_VECT_FACTOR (loop_vinfo)))) I think this also needs to check: LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0 ----- I think another way to think about the patch is this: Things would be easy if there was always a 1-control group. Then we could create a decrementing IV for that 1-control group, in the way that you do above. All other groups would use the 1-control group as input. They would fan that input out using vect_adjust_loop_lens_control. However, there isn't guaranteed to be 1-control group, and creating a fake one might not be easy. We'd need to calculate valid values for the factor, type, nscalars_per_iter, etc. fields. So when there is no 1-control group, it's probably easier to create a decrementing IV based on the first non-empty group. This is what your patch does. And I think it's the right approach. However, the IV created in this way is "special". It doesn't exist as part of the normal rgroup framework. It's something created outside the rgroup framework to make the controls easier to populate. In your case, this is the case where "step" is created and is not assigned directly to rgc->controls. Given that, I think we should structure the code as follows, changing the behaviour only for LOOP_VINFO_USING_DECREMENTING_IV_P: (1) In vect_set_loop_condition_partial_vectors, for the first iteration of: FOR_EACH_VEC_ELT (*controls, i, rgc) if (!rgc->controls.is_empty ()) call vect_set_loop_controls_directly. That is: /* See whether zero-based IV would ever generate all-false masks or zero length before wrapping around. */ bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); /* Set up all controls for this group. */ test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq, &header_seq, loop_cond_gsi, rgc, niters, niters_skip, might_wrap_p); needs to be an "if" that (for LOOP_VINFO_USING_DECREMENTING_IV_P) is only executed on the first iteration. (2) vect_set_loop_controls_directly calculates "step" as in your patch. If rgc has 1 control, this step is the SSA name created for that control. Otherwise the step is a fresh SSA name, as in your patch. (3) vect_set_loop_controls_directly stores this step somewhere for later use, probably in LOOP_VINFO. Let's use "S" to refer to this stored step. (4) After the vect_set_loop_controls_directly call above, and outside the "if" statement that now contains vect_set_loop_controls_directly, check whether rgc->controls.length () > 1. If so, use vect_adjust_loop_lens_control to set the controls based on S. Then the only caller of vect_adjust_loop_lens_control is vect_set_loop_condition_partial_vectors. And the starting step for vect_adjust_loop_lens_control is always S. It sounds complicated when I write it like that, but I think it will be clearer in code. Hope that makes sense. Please let me know if not. Thanks, Richard [-- Attachment #2: step1.patch --] [-- Type: application/octet-stream, Size: 4312 bytes --] diff --git a/riscv-gcc/gcc/tree-vect-loop-manip.cc b/riscv-gcc/gcc/tree-vect-loop-manip.cc index ff6159e08..9b2ee5350 100644 --- a/riscv-gcc/gcc/tree-vect-loop-manip.cc +++ b/riscv-gcc/gcc/tree-vect-loop-manip.cc @@ -468,6 +468,18 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, gimple_stmt_iterator incr_gsi; bool insert_after; standard_iv_increment_position (loop, &incr_gsi, &insert_after); + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) + { + nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total); + tree step = rgc->controls.length () == 1 ? rgc->controls[0] + : make_ssa_name (iv_type); + gimple_seq_add_stmt (header_seq, + gimple_build_assign (step, MIN_EXPR, nitems_total, + nitems_step)); + return step; + } + + /* Create increment IV. */ create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE, loop, &incr_gsi, insert_after, &index_before_incr, &index_after_incr); @@ -764,6 +776,13 @@ vect_set_loop_condition_partial_vectors (class loop *loop, loop_cond_gsi, rgc, niters, niters_skip, might_wrap_p); + + /* Decrement IV only run vect_set_loop_controls_directly once. */ + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) + { + LOOP_VINFO_DECREMENTING_IV_STEP (loop_vinfo) = test_ctrl; + break; + } } /* Emit all accumulated statements. */ diff --git a/riscv-gcc/gcc/tree-vect-loop.cc b/riscv-gcc/gcc/tree-vect-loop.cc index cf10132b0..456f50fa7 100644 --- a/riscv-gcc/gcc/tree-vect-loop.cc +++ b/riscv-gcc/gcc/tree-vect-loop.cc @@ -973,6 +973,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) vectorizable (false), can_use_partial_vectors_p (param_vect_partial_vector_usage != 0), using_partial_vectors_p (false), + using_decrementing_iv_p (false), + decrementing_iv_step (NULL_TREE), epil_using_partial_vectors_p (false), partial_load_store_bias (0), peeling_for_gaps (false), @@ -2725,6 +2727,17 @@ start_over: && !vect_verify_loop_lens (loop_vinfo)) LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + /* If we're vectorizing an loop that uses length "controls" and + can iterate more than once, we apply decrementing IV approach + in loop control. */ + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) + && !LOOP_VINFO_LENS (loop_vinfo).is_empty () + && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0 + && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo), + LOOP_VINFO_VECT_FACTOR (loop_vinfo)))) + LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true; + /* If we're vectorizing an epilogue loop, the vectorized loop either needs to be able to handle fewer than VF scalars, or needs to have a lower VF than the main loop. */ diff --git a/riscv-gcc/gcc/tree-vectorizer.h b/riscv-gcc/gcc/tree-vectorizer.h index 02d2ad6fb..7ed079f54 100644 --- a/riscv-gcc/gcc/tree-vectorizer.h +++ b/riscv-gcc/gcc/tree-vectorizer.h @@ -818,6 +818,16 @@ public: the vector loop can handle fewer than VF scalars. */ bool using_partial_vectors_p; + /* True if we've decided to use a decrementing loop control IV that counts + scalars. This can be done for any loop that: + + (a) uses length "controls"; and + (b) can iterate more than once. */ + bool using_decrementing_iv_p; + + /* The variable amount step for decrement IV. */ + tree decrementing_iv_step; + /* True if we've decided to use partially-populated vectors for the epilogue of loop. */ bool epil_using_partial_vectors_p; @@ -890,6 +900,8 @@ public: #define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p +#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p +#define LOOP_VINFO_DECREMENTING_IV_STEP(L) (L)->decrementing_iv_step #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) \ (L)->epil_using_partial_vectors_p #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias ^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2023-05-24 14:59 UTC | newest] Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2023-05-22 8:38 [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support juzhe.zhong 2023-05-23 12:32 ` juzhe.zhong 2023-05-24 11:23 ` Richard Sandiford 2023-05-24 11:52 ` 钟居哲 2023-05-24 12:41 ` Richard Sandiford 2023-05-24 12:51 ` Richard Biener 2023-05-24 13:27 ` 钟居哲 2023-05-24 13:26 ` 钟居哲 2023-05-24 14:01 ` Richard Sandiford 2023-05-24 14:10 ` 钟居哲 2023-05-24 14:57 ` Richard Sandiford 2023-05-24 14:58 ` 钟居哲 [not found] ` <2023052422100415521814@rivai.ai> 2023-05-24 14:11 ` 钟居哲 2023-05-24 14:22 ` 钟居哲 2023-05-24 14:35 ` 钟居哲 2023-05-24 12:19 ` 钟居哲
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).