Hi, Richi. > 1. Target is using loop MASK as the partial vector loop control. >> I don't think it checks for this? I am not sure whether I understand EXTRACT_LAST correctly. But if target doesn't use loop MASK for partial vector loop control, how does target use EXTRACT_LAST? Since EXTRACT_LAST is always extracting the last element of the vector according to MASK operand. > But we don't really know this at this point? The only thing we know > is that nothing set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false. Yes. So I am try to use 'get_len_load_store' to check whether target support LEN loop control. Well, I admit it's not a good idea. > I think it should work to change the direct_internal_fn_supported_p > check for IFN_EXTRACT_LAST to a "poitive" one guarding > gcc_assert (ncopies == 1 && !slp_node); > vect_record_loop_mask (loop_vinfo, > &LOOP_VINFO_MASKS (loop_vinfo), > 1, vectype, NULL); > and in the else branch check for VEC_EXTRACT support and if present > record a loop len. Just in this case this particular order would > be important. Do you mean change the codes as follows :? - if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, - OPTIMIZE_FOR_SPEED)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't operate on partial vectors " - "because the target doesn't support extract " - "last reduction.\n"); - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; - } - else if (slp_node) if (slp_node) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "can't operate on partial vectors " "because an SLP statement is live after " "the loop.\n"); LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; } else if (ncopies > 1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "can't operate on partial vectors " "because ncopies is greater than 1.\n"); LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; } else { gcc_assert (ncopies == 1 && !slp_node); if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, OPTIMIZE_FOR_SPEED)) vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo), 1, vectype, NULL); else vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo), 1, vectype, 1); } Thanks. juzhe.zhong@rivai.ai From: Richard Biener Date: 2023-08-11 18:21 To: juzhe.zhong@rivai.ai CC: gcc-patches; richard.sandiford; linkw; krebbel Subject: Re: Re: [PATCH V3] VECT: Support loop len control on EXTRACT_LAST vectorization On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote: > Hi, Richi. > > >> So how can we resolve the issue when a non-VL operation like > >> .VEC_EXTRACT is used for _len support? > > Do you mean non-VL extract last operation (I am sorry that not sure whether I understand your question correctly)? > If yes, the answer is for RVV, we are reusing the same flow as ARM SVE (BIT_FILED_REF approach), see the example below: > > https://godbolt.org/z/cqrWrY8q4 > > #define EXTRACT_LAST(TYPE) \ > TYPE __attribute__ ((noinline, noclone)) \ > test_##TYPE (TYPE *x, int n, TYPE value) \ > { \ > TYPE last; \ > for (int j = 0; j < 64; ++j) \ > { \ > last = x[j]; \ > x[j] = last * value; \ > } \ > return last; \ > } > > #define TEST_ALL(T) \ > T (uint8_t) \ > > TEST_ALL (EXTRACT_LAST) > > vect_cst__22 = {value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D)}; > vect_last_11.6_3 = MEM [(uint8_t *)x_10(D)]; > vect__4.7_23 = vect_last_11.6_3 * vect_cst__22; > MEM [(uint8_t *)x_10(D)] = vect__4.7_23; > _21 = BIT_FIELD_REF ; > > This approach works perfectly for both RVV and ARM SVE for non-VL and non-MASK EXTRACT_LAST operation. > > >> So, why do we test for get_len_load_store_mode and not just for > >> VEC_EXTRACT? > > Before answer this question, let me first elaborate how ARM SVE is doing with MASK EXTRACT_LAST. > > Here is the example: > https://godbolt.org/z/8cTv1jqMb > > ARM SVE IR: > > [local count: 955630224]: > # ivtmp_31 = PHI > > # loop_mask_22 = PHI -----> For RVV, we want this to be loop_len = SELECT_VL; > > _7 = &MEM [(uint8_t *)x_11(D) + ivtmp_31 * 1]; > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22); > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; > .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27); > ivtmp_32 = ivtmp_31 + POLY_INT_CST [16, 16]; > _1 = (unsigned int) ivtmp_32; > > next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... }); > > if (next_mask_35 != { 0, ... }) > goto ; [89.00%] > else > goto ; [11.00%] > > [local count: 105119324]: > > _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23); [tail call] ----> Use the last mask generated in BB 4, so for RVV, we are using the loop_len. > > So this patch is trying to optimize the codegen with simulating same flow as ARM SVE but with replacing 'loop_mask_22' (This is generated in BB4) into 'loop_len'. > > For ARM SVE, they only check whether target support EXTRACT_LAST pattern, this pattern is supported means: > > 1. Target is using loop MASK as the partial vector loop control. I don't think it checks for this? > 2. extract_last optab is enabled in the backend. > > So for RVV, we are also checking same conditions: > > 1. Target is using loop LEN as the partial vector loop control (I use get_len_load_store_mode to check whether target is using loop LEN as the partial vector loop control). But we don't really know this at this point? The only thing we know is that nothing set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false. > 2. vec_extract optab is enabled in the backend. > > An alternative approach is that we can adding EXTRACT_LAST_LEN internal FN, then we can only check this like ARM SVE only check EXTRACT_LAST. I think it should work to change the direct_internal_fn_supported_p check for IFN_EXTRACT_LAST to a "poitive" one guarding gcc_assert (ncopies == 1 && !slp_node); vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo), 1, vectype, NULL); and in the else branch check for VEC_EXTRACT support and if present record a loop len. Just in this case this particular order would be important. > >> can we double-check this on powerpc and s390? > > Sure, I hope it can be beneficial to powerpc and s390. > And, I think Richard's comments are also very important so I am gonna wait for it. Yeah, just to double-check the bias stuff works correctly. Richard. > Thanks. > > > juzhe.zhong@rivai.ai > > From: Richard Biener > Date: 2023-08-11 15:01 > To: Ju-Zhe Zhong > CC: gcc-patches; richard.sandiford; linkw; krebbel > Subject: Re: [PATCH V3] VECT: Support loop len control on EXTRACT_LAST vectorization > On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote: > > > From: Ju-Zhe Zhong > > > > Hi, Richard and Richi. > > > > This patch add support live vectorization by VEC_EXTRACT for LEN loop control. > > > > Consider this following case: > > > > #include > > > > #define EXTRACT_LAST(TYPE) \ > > TYPE __attribute__ ((noinline, noclone)) \ > > test_##TYPE (TYPE *x, int n, TYPE value) \ > > { \ > > TYPE last; \ > > for (int j = 0; j < n; ++j) \ > > { \ > > last = x[j]; \ > > x[j] = last * value; \ > > } \ > > return last; \ > > } > > > > #define TEST_ALL(T) \ > > T (uint8_t) \ > > > > TEST_ALL (EXTRACT_LAST) > > > > ARM SVE IR: > > > > Preheader: > > max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... }); > > > > Loop: > > ... > > # loop_mask_22 = PHI > > ... > > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22); > > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; > > .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27); > > ... > > next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... }); > > ... > > > > Epilogue: > > _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23); > > > > For RVV since we prefer len in loop control, after this patch for RVV: > > > > Loop: > > ... > > loop_len_22 = SELECT_VL; > > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22); > > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; > > .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27); > > ... > > > > Epilogue: > > _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23); > > > > Details of this approach: > > > > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p' to enable live vectorization > > for LEN loop control. > > > > This function we check whether target support: > > - Use LEN as the loop control. > > - Support VEC_EXTRACT optab. > > > > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true. > > > > 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1). > > > > The only difference between mask and len is that len is using length generated by SELECT_VL and > > use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE. > > > > Bootstrap and Regression on X86 passed. > > > > Tested on ARM QEMU. > > > > Ok for trunk? > > > > gcc/ChangeLog: > > > > * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function. > > (vectorizable_live_operation): Add loop len control. > > > > --- > > gcc/tree-vect-loop.cc | 76 +++++++++++++++++++++++++++++++++++++++---- > > 1 file changed, 70 insertions(+), 6 deletions(-) > > > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > > index bf8d677b584..809b73b966c 100644 > > --- a/gcc/tree-vect-loop.cc > > +++ b/gcc/tree-vect-loop.cc > > @@ -8963,6 +8963,27 @@ vect_can_vectorize_without_simd_p (code_helper code) > > && vect_can_vectorize_without_simd_p (tree_code (code))); > > } > > > > +/* Return true if target supports extract last vectorization with LEN. */ > > + > > +static bool > > +vect_can_vectorize_extract_last_with_len_p (tree vectype) > > +{ > > + /* Return false if target doesn't support LEN in loop control. */ > > + machine_mode vmode; > > + machine_mode vec_mode = TYPE_MODE (vectype); > > + if (!VECTOR_MODE_P (vec_mode)) > > + return false; > > + if (!get_len_load_store_mode (vec_mode, true).exists (&vmode) > > + || !get_len_load_store_mode (vec_mode, false).exists (&vmode)) > > + return false; > > So this "hidden" bit in the end decides whether to ... > > > + /* Target need to support VEC_EXTRACT to extract the last active element. */ > > + return convert_optab_handler (vec_extract_optab, > > + vec_mode, > > + TYPE_MODE (TREE_TYPE (vectype))) > > + != CODE_FOR_nothing; > > +} > > + > > /* Create vector init for vectorized iv. */ > > static tree > > vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr, > > @@ -10279,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > > if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) > > { > > if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, > > - OPTIMIZE_FOR_SPEED)) > > + OPTIMIZE_FOR_SPEED) > > + && !vect_can_vectorize_extract_last_with_len_p (vectype)) > > { > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > @@ -10308,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > > else > > { > > gcc_assert (ncopies == 1 && !slp_node); > > - vect_record_loop_mask (loop_vinfo, > > - &LOOP_VINFO_MASKS (loop_vinfo), > > - 1, vectype, NULL); > > + if (vect_can_vectorize_extract_last_with_len_p (vectype)) > > + vect_record_loop_len (loop_vinfo, > > + &LOOP_VINFO_LENS (loop_vinfo), > > + 1, vectype, 1); > > .. record a loop_len here. I think powerpc at least has .VEC_EXTRACT as > well but of course .VEC_EXTRACT support itself doesn't have anything to > do with 'len' support. > > x86 has .VEC_SET but not yet .VEC_EXTRACT, if it gets .VEC_EXTRACT > its partial vector support still wants masks, not lens (and once > we record both we fail). > > So how can we resolve the issue when a non-VL operation like > .VEC_EXTRACT is used for _len support? > > Note x86 doens't yet support IFN_EXTRACT_LAST either. > > So, why do we test for get_len_load_store_mode and not just for > VEC_EXTRACT? > > > + else > > + vect_record_loop_mask (loop_vinfo, > > + &LOOP_VINFO_MASKS (loop_vinfo), > > + 1, vectype, NULL); > > } > > } > > /* ??? Enable for loop costing as well. */ > > @@ -10336,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > > gimple *vec_stmt; > > if (slp_node) > > { > > - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); > > + gcc_assert (!loop_vinfo > > + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) > > + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))); > > > > /* Get the correct slp vectorized stmt. */ > > vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry]; > > @@ -10380,7 +10409,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > > > > gimple_seq stmts = NULL; > > tree new_tree; > > - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > > + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) > > + { > > + /* Emit: > > + > > + SCALAR_RES = VEC_EXTRACT > > + > > + where VEC_LHS is the vectorized live-out result and MASK is > > + the loop mask for the final iteration. */ > > + gcc_assert (ncopies == 1 && !slp_node); > > + gimple_seq tem = NULL; > > + gimple_stmt_iterator gsi = gsi_last (tem); > > + tree len > > + = vect_get_loop_len (loop_vinfo, &gsi, > > + &LOOP_VINFO_LENS (loop_vinfo), > > + 1, vectype, 0, 0); > > + > > + /* BIAS - 1. */ > > + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > > + tree bias_minus_one > > + = int_const_binop (MINUS_EXPR, > > + build_int_cst (TREE_TYPE (len), biasval), > > + build_one_cst (TREE_TYPE (len))); > > + > > + /* LAST_INDEX = LEN + (BIAS - 1). */ > > + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len), > > + len, bias_minus_one); > > + > > + /* SCALAR_RES = VEC_EXTRACT . */ > > + tree scalar_res > > + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype), > > + vec_lhs_phi, last_index); > > + > > can we double-check this on powerpc and s390? > > Thanks, > Richard. > > > + /* Convert the extracted vector element to the scalar type. */ > > + new_tree = gimple_convert (&stmts, lhs_type, scalar_res); > > + } > > + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > > { > > /* Emit: > > > > -- Richard Biener SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)