diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index bf033e31c1c..dc2756f83e9 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -5637,7 +5637,8 @@
 (define_expand "len_load_v16qi"
   [(match_operand:V16QI 0 "vlogical_operand")
    (match_operand:V16QI 1 "memory_operand")
-   (match_operand:QI 2 "gpc_reg_operand")]
+   (match_operand:QI 2 "gpc_reg_operand")
+   (match_operand:QI 3 "zero_constant")]
   "TARGET_P9_VECTOR && TARGET_64BIT"
 {
   rtx mem = XEXP (operands[1], 0);
@@ -5651,6 +5652,7 @@
   [(match_operand:V16QI 0 "memory_operand")
    (match_operand:V16QI 1 "vlogical_operand")
    (match_operand:QI 2 "gpc_reg_operand")
+   (match_operand:QI 3 "zero_constant")
   ]
   "TARGET_P9_VECTOR && TARGET_64BIT"
 {
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 2b41cb7fb7b..265c76f1609 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5213,7 +5213,10 @@
 which must be a vector mode.  Operand 2 has whichever integer mode the
 target prefers.  If operand 2 exceeds the number of elements in mode
 @var{m}, the behavior is undefined.  If the target prefers the length
 to be measured in bytes rather than elements, it should only implement
-this pattern for vectors of @code{QI} elements.
+this pattern for vectors of @code{QI} elements.  Operand 3 specifies
+a bias predicate that determines whether a length of zero is permitted
+or not.  If permitted, the predicate should only allow a zero immediate,
+otherwise it should only allow an immediate value of -1.
 
 This pattern is not allowed to @code{FAIL}.
 
@@ -5226,7 +5229,10 @@
 a vector mode.  Operand 2 has whichever integer mode the target prefers.
 If operand 2 exceeds the number of elements in mode @var{m}, the behavior
 is undefined.  If the target prefers the length to be measured in bytes
 rather than elements, it should only implement this pattern for vectors
-of @code{QI} elements.
+of @code{QI} elements.  Operand 3 specifies a bias predicate that
+determines whether a length of zero is permitted or not.  If permitted,
+the predicate should only allow a zero constant, otherwise it should
+only allow an immediate value of -1.
 
 This pattern is not allowed to @code{FAIL}.
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 8312d08aab2..90fdc440248 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -2696,9 +2696,9 @@ expand_call_mem_ref (tree type, gcall *stmt, int index)
 static void
 expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 {
-  class expand_operand ops[3];
-  tree type, lhs, rhs, maskt;
-  rtx mem, target, mask;
+  class expand_operand ops[4];
+  tree type, lhs, rhs, maskt, biast;
+  rtx mem, target, mask, bias;
   insn_code icode;
 
   maskt = gimple_call_arg (stmt, 2);
@@ -2723,11 +2723,20 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
   create_output_operand (&ops[0], target, TYPE_MODE (type));
   create_fixed_operand (&ops[1], mem);
   if (optab == len_load_optab)
-    create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
-                                 TYPE_UNSIGNED (TREE_TYPE (maskt)));
+    {
+      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
+                                   TYPE_UNSIGNED (TREE_TYPE (maskt)));
+      biast = gimple_call_arg (stmt, 3);
+      bias = expand_normal (biast);
+      create_input_operand (&ops[3], bias, QImode);
+      expand_insn (icode, 4, ops);
+    }
   else
+    {
     create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
-  expand_insn (icode, 3, ops);
+      expand_insn (icode, 3, ops);
+    }
+
   if (!rtx_equal_p (target, ops[0].value))
     emit_move_insn (target, ops[0].value);
 }
@@ -2741,9 +2750,9 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 static void
 expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 {
-  class expand_operand ops[3];
-  tree type, lhs, rhs, maskt;
-  rtx mem, reg, mask;
+  class expand_operand ops[4];
+  tree type, lhs, rhs, maskt, biast;
+  rtx mem, reg, mask, bias;
   insn_code icode;
 
   maskt = gimple_call_arg (stmt, 2);
@@ -2766,11 +2775,19 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
   create_fixed_operand (&ops[0], mem);
   create_input_operand (&ops[1], reg, TYPE_MODE (type));
   if (optab == len_store_optab)
-    create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
-                                 TYPE_UNSIGNED (TREE_TYPE (maskt)));
+    {
+      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
+                                   TYPE_UNSIGNED (TREE_TYPE (maskt)));
+      biast = gimple_call_arg (stmt, 4);
+      bias = expand_normal (biast);
+      create_input_operand (&ops[3], bias, QImode);
+      expand_insn (icode, 4, ops);
+    }
   else
-    create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
-  expand_insn (icode, 3, ops);
+    {
+      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
+      expand_insn (icode, 3, ops);
+    }
 }
 
 #define expand_mask_store_optab_fn expand_partial_store_optab_fn
@@ -4172,6 +4189,29 @@ internal_check_ptrs_fn_supported_p (internal_fn ifn, tree type,
           && insn_operand_matches (icode, 4, GEN_INT (align)));
 }
 
+/* Return the supported bias for the len_load IFN.  For now we only support
+   the biases of 0 and -1 (in case 0 is not an allowable length for len_load).
+   If none of the biases match what the backend provides, return
+   VECT_PARTIAL_BIAS_UNSUPPORTED.  */
+
+signed char
+internal_len_load_bias (internal_fn ifn, machine_mode mode)
+{
+  optab optab = direct_internal_fn_optab (ifn);
+  insn_code icode = direct_optab_handler (optab, mode);
+
+  if (icode != CODE_FOR_nothing)
+    {
+      /* For now we only support biases of 0 or -1.  Try both of them.  */
+      if (insn_operand_matches (icode, 3, GEN_INT (0)))
+        return 0;
+      if (insn_operand_matches (icode, 3, GEN_INT (-1)))
+        return -1;
+    }
+
+  return VECT_PARTIAL_BIAS_UNSUPPORTED;
+}
+
 /* Expand STMT as though it were a call to internal function FN.  */
 
 void
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 19d0f849a5a..c057df1327a 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -227,6 +227,10 @@ extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
                                                     tree, tree, int);
 extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
                                                 poly_uint64, unsigned int);
+#define VECT_PARTIAL_BIAS_UNSUPPORTED 127
+
+extern signed char internal_len_load_bias (internal_fn ifn,
+                                           machine_mode);
 
 extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
                                     bool, bool, bool, bool, tree *);
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 4988c93fdb6..79cf5029faf 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -436,7 +436,14 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
   tree length_limit = NULL_TREE;
   /* For length, we need length_limit to ensure length in range.  */
   if (!use_masks_p)
-    length_limit = build_int_cst (compare_type, nitems_per_ctrl);
+    {
+      int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_BIAS (loop_vinfo);
+      if (partial_load_bias == 0)
+        length_limit = build_int_cst (compare_type, nitems_per_ctrl);
+      else
+        length_limit = build_int_cst (compare_type,
+                                      nitems_per_ctrl + partial_load_bias);
+    }
 
   /* Calculate the maximum number of item values that the rgroup
      handles in total, the number that it handles for each iteration
@@ -560,8 +567,12 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
         {
           /* Previous controls will cover BIAS items.  This control covers the
              next batch.  */
+          tree bias_tree;
           poly_uint64 bias = nitems_per_ctrl * i;
-          tree bias_tree = build_int_cst (compare_type, bias);
+          int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_BIAS (loop_vinfo);
+          if (partial_load_bias != 0)
+            bias -= partial_load_bias;
+          bias_tree = build_int_cst (compare_type, bias);
 
           /* See whether the first iteration of the vector loop is known
              to have a full control.  */
@@ -574,7 +585,7 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
              TEST_LIMIT, prefer to use the same 0-based IV for each control
              and adjust the bound down by BIAS.  */
           tree this_test_limit = test_limit;
-          if (i != 0)
+          if (i != 0 || (i == 0 && partial_load_bias != 0))
             {
               this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
                                               compare_type, this_test_limit,
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index e94356d76e9..705439cb380 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1163,6 +1163,21 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
     return false;
 
+  machine_mode len_load_mode = get_len_load_store_mode
+    (loop_vinfo->vector_mode, false).require ();
+
+  signed char partial_load_bias = internal_len_load_bias
+    (IFN_LEN_LOAD, len_load_mode);
+
+  LOOP_VINFO_PARTIAL_LOAD_BIAS (loop_vinfo) = partial_load_bias;
+
+  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
+     len_loads with a length of zero.  In order to avoid that we prohibit
+     more than one loop length here.  */
+  if (partial_load_bias == -1
+      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
+    return false;
+
   unsigned int max_nitems_per_iter = 1;
   unsigned int i;
   rgroup_controls *rgl;
@@ -4125,6 +4140,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
          here.  */
 
       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
+      signed char partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_BIAS (loop_vinfo);
       bool need_iterate_p
         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
            && !vect_known_niters_smaller_than_vf (loop_vinfo));
@@ -4157,6 +4173,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
              for each since start index is zero.  */
           prologue_stmts += num_vectors;
 
+          /* If we have a non-zero partial load bias, we need one MINUS
+             and a MAX to adjust the load length.  */
+          if (partial_load_bias != 0)
+            prologue_stmts += 2;
+
           /* Each may need two MINs and one MINUS to update lengths in body
              for next iteration.  */
           if (need_iterate_p)
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 17849b575b7..57ecafb7965 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -8289,9 +8289,16 @@ vectorizable_store (vec_info *vinfo,
                                                        gsi);
                       vec_oprnd = var;
                     }
+
+                  /* Check which bias value to use.  */
+                  signed char biasval = internal_len_load_bias
+                    (IFN_LEN_STORE, new_vmode);
+
+                  tree bias = build_int_cst (intQI_type_node, biasval);
                   gcall *call
-                    = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
-                                                  ptr, final_len, vec_oprnd);
+                    = gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr,
+                                                  ptr, final_len, vec_oprnd,
+                                                  bias);
                   gimple_call_set_nothrow (call, true);
                   vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
                   new_stmt = call;
@@ -9588,24 +9595,32 @@ vectorizable_load (vec_info *vinfo,
                                                          vec_num * j + i);
                     tree ptr = build_int_cst (ref_type,
                                               align * BITS_PER_UNIT);
+
+                    machine_mode vmode = TYPE_MODE (vectype);
+                    opt_machine_mode new_ovmode
+                      = get_len_load_store_mode (vmode, true);
+                    machine_mode new_vmode = new_ovmode.require ();
+                    tree qi_type = unsigned_intQI_type_node;
+                    tree new_vtype
+                      = build_vector_type_for_mode (qi_type, new_vmode);
+
+                    /* Check which bias value to use.  */
+                    signed char biasval = internal_len_load_bias
+                      (IFN_LEN_LOAD, new_vmode);
+
+                    tree bias = build_int_cst (intQI_type_node, biasval);
+
                     gcall *call
-                      = gimple_build_call_internal (IFN_LEN_LOAD, 3,
+                      = gimple_build_call_internal (IFN_LEN_LOAD, 4,
                                                     dataref_ptr, ptr,
-                                                    final_len);
+                                                    final_len, bias);
                     gimple_call_set_nothrow (call, true);
                     new_stmt = call;
                     data_ref = NULL_TREE;
 
                     /* Need conversion if it's wrapped with VnQI.  */
-                    machine_mode vmode = TYPE_MODE (vectype);
-                    opt_machine_mode new_ovmode
-                      = get_len_load_store_mode (vmode, true);
-                    machine_mode new_vmode = new_ovmode.require ();
                     if (vmode != new_vmode)
                       {
-                        tree qi_type = unsigned_intQI_type_node;
-                        tree new_vtype
-                          = build_vector_type_for_mode (qi_type, new_vmode);
                         tree var = vect_get_new_ssa_name (new_vtype,
                                                           vect_simple_var);
                         gimple_set_lhs (call, var);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index c4c5678e7f1..b59f5f6643c 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -749,6 +749,11 @@ public:
      epilogue of loop.  */
   bool epil_using_partial_vectors_p;
 
+  /* The bias for len_load and len_store.  For now, only 0 and -1 are
+     supported.  -1 must be used when a backend does not support
+     len_load/len_store with a length of zero.  */
+  signed char partial_load_bias;
+
   /* When we have grouped data accesses with gaps, we may introduce invalid
      memory accesses.  We peel the last iteration of the loop to prevent
      this.  */
@@ -814,6 +819,7 @@ public:
 #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L)      (L)->using_partial_vectors_p
 #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
   (L)->epil_using_partial_vectors_p
+#define LOOP_VINFO_PARTIAL_LOAD_BIAS(L)            (L)->partial_load_bias
 #define LOOP_VINFO_VECT_FACTOR(L)                  (L)->vectorization_factor
 #define LOOP_VINFO_MAX_VECT_FACTOR(L)              (L)->max_vectorization_factor
 #define LOOP_VINFO_MASKS(L)                        (L)->masks
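
For illustration only (not part of the patch): a minimal sketch of the kind of loop this machinery targets.  The function below is made up; only the IFN names, argument positions, and the bias values 0/-1 come from the hunks above.

/* Illustrative sketch, not from the patch: a simple copy loop of the kind
   that partial-vector loop vectorization turns into LEN_LOAD / LEN_STORE
   internal-function calls.  With this patch, the LEN_LOAD call gains a
   fourth argument and the LEN_STORE call a fifth one carrying the bias
   (0 or -1) that internal_len_load_bias queried from the backend's
   len_load/len_store pattern.  */
void
copy (unsigned char *restrict dst, unsigned char *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = src[i];
}

On a target whose len_load pattern only accepts a bias of -1, vect_verify_loop_lens above additionally rejects loops that would need more than one length control, so a length of zero is never handed to the pattern.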