diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 8ea38118b05769bd6fcb1d22d902a50979cfd953..9c3097b841838a451846c5c65054ab6759d860a9 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -63,7 +63,7 @@ extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *); extern bool arm_q_bit_access (void); extern bool arm_ge_bits_access (void); extern bool arm_target_insn_ok_for_lob (rtx); - +extern rtx arm_attempt_dlstp_transform (rtx); #ifdef RTX_CODE enum reg_class arm_mode_base_reg_class (machine_mode); diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 31f2a7b9d4688dde69d1435e24cf885e8544be71..de6a507e4cfde0cb81081ed5769c7545a9ff48fe 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -470,6 +470,9 @@ static const struct attribute_spec arm_attribute_table[] = #undef TARGET_SCHED_REORDER #define TARGET_SCHED_REORDER arm_sched_reorder +#undef TARGET_ALLOW_ELEMENTWISE_DOLOOP_P +#define TARGET_ALLOW_ELEMENTWISE_DOLOOP_P arm_allow_elementwise_doloop_p + #undef TARGET_REGISTER_MOVE_COST #define TARGET_REGISTER_MOVE_COST arm_register_move_cost @@ -34168,8 +34171,504 @@ arm_target_insn_ok_for_lob (rtx insn) return single_succ_p (bb) && single_pred_p (bb) - && single_succ_edge (bb)->dest == single_pred_edge (bb)->src - && contains_no_active_insn_p (bb); + && single_succ_edge (bb)->dest == single_pred_edge (bb)->src; +} + +static int +arm_mve_get_vctp_lanes (rtx x) +{ + if (GET_CODE (x) == SET && GET_CODE (XEXP (x, 1)) == UNSPEC) + { + switch (XINT (XEXP (x, 1), 1)) + { + case VCTP8Q: + return 16; + case VCTP16Q: + return 8; + case VCTP32Q: + return 4; + case VCTP64Q: + return 2; + default: + break; + } + } + return 0; +} + +/* Check if an insn requires the use of the VPR_REG, if it does, return the + sub-rtx of the VPR_REG. The `type` argument controls whether + this function should: + * For type == 0, check all operands, including the OUT operands, + and return the first occurance of the VPR_REG. + * For type == 1, only check the input operands. + * For type == 2, only check the output operands. + (INOUT operands are considered both as input and output operands) +*/ +static rtx +arm_get_required_vpr_reg (rtx_insn *insn, int type = 0) +{ + gcc_assert (type == 0 || type == 1 + || type == 2); + if (!NONJUMP_INSN_P (insn)) + return NULL_RTX; + + bool requires_vpr; + extract_constrain_insn (insn); + int n_operands = recog_data.n_operands; + if (recog_data.n_alternatives == 0) + return NULL_RTX; + + /* Fill in recog_op_alt with information about the constraints of + this insn. */ + preprocess_constraints (insn); + + for (int op = 0; op < n_operands; op++) + { + requires_vpr = true; + if (type == 1 && (recog_data.operand_type[op] == OP_OUT + || recog_data.operand_type[op] == OP_INOUT)) + continue; + else if (type == 2 && (recog_data.operand_type[op] == OP_IN + || recog_data.operand_type[op] == OP_INOUT)) + continue; + + /* Iterate through alternatives of operand "op" in recog_op_alt and + identify if the operand is required to be the VPR. */ + for (int alt = 0; alt < recog_data.n_alternatives; alt++) + { + const operand_alternative *op_alt + = &recog_op_alt[alt * n_operands]; + /* Fetch the reg_class for each entry and check it against the + * VPR_REG reg_class. */ + if (alternative_class (op_alt, op) != VPR_REG) + requires_vpr = false; + } + /* If all alternatives of the insn require the VPR reg for this operand, + it means that either this is VPR-generating instruction, like a vctp, + vcmp, etc., or it is a VPT-predicated insruction. Return the subrtx + of the VPR reg operand. */ + if (requires_vpr) + return recog_data.operand[op]; + } + return NULL_RTX; +} + +static rtx +arm_get_required_vpr_reg_ret_val (rtx_insn *insn) +{ + return arm_get_required_vpr_reg (insn, 2); +} + +static rtx +arm_get_required_vpr_reg_param (rtx_insn *insn) +{ + return arm_get_required_vpr_reg (insn, 1); +} + +/* Scan the basic block of a loop body for a vctp instruction. If there is + at least vctp instruction, return the first rtx_insn *. */ + +static rtx_insn * +arm_mve_get_loop_vctp (basic_block bb) +{ + rtx_insn *insn = BB_HEAD (bb); + + /* Now scan through all the instruction patterns and + pick out any MVE instructions. */ + FOR_BB_INSNS (bb, insn) + if (INSN_P (insn)) + if (arm_mve_get_vctp_lanes (PATTERN (insn)) != 0) + return insn; + return NULL; +} + +rtx +arm_attempt_dlstp_transform (rtx label) +{ + int decrementnum; + basic_block body = BLOCK_FOR_INSN (label)->prev_bb; + + /* Ensure that the bb is within a loop that has all required metadata. */ + if (!body->loop_father || !body->loop_father->header + || !body->loop_father->simple_loop_desc) + return GEN_INT (1); + basic_block pre_loop_bb1 = loop_preheader_edge (body->loop_father)->src; + basic_block pre_loop_bb2 = pre_loop_bb1->prev_bb; + rtx count = simple_loop_desc (body->loop_father)->niter_expr; + rtx shift_expr = NULL_RTX; + rtx initial_compare = NULL_RTX; + + /* Doloop can only be done "elementwise" with predicated dlstp/letp if it + contains a VCTP on the number of elements processed by the loop. + Find the VCTP predicate generation inside the loop body BB. */ + rtx_insn *vctp_insn = arm_mve_get_loop_vctp (body); + if (!vctp_insn) + return GEN_INT (1); + + /* Additionally, the iteration counter must only get decremented by the + number of MVE lanes (as per the data type). + There are only two types of loops that can be turned into dlstp/letp loops: + A) Loops of the form: + while (num_of_elem > 0) + { + p = vctp (num_of_elem) + n -= num_of_lanes; + } + B) Loops of the form: + int num_of_iters = (num_of_elem + num_of_lanes - 1) / num_of_lanes + for (i = 0; i < num_of_iters; i++) + { + p = vctp (num_of_elem) + n -= num_of_lanes; + } + + These can be verified through the "count" variable in the middle-end + (a.k.a. get_simple_loop_desc (loop)->desc->niter_expr). This is the + expression used to calculate the number of iterations that the loop would + execute for a standard dls/le loop. + + For dlstp/letp we only support cases where this is a power of 2, so from + "count" we want to extract something like: + ( [l/a] shiftrt: (x) (const_int y)) + For loops of form A), "count" is already a shiftrt expression. + For loops of form B), "count" gets given as: + (plus: (not (i)) (num_of_iters)) + with setup happening in a previous basic block. Here we need to verify: + * That i is _always_ initialized to (const_int 0) + * That num_of_iters is a shiftrt expression. + */ + if (GET_CODE (count) == LSHIFTRT + || GET_CODE (count) == ASHIFTRT) + { + + shift_expr = count; + /* In this situation where we are looping on a decreasing number of + elements, a dlstp/letp loop can only work if the looping ends when the + element counter reaches zero and not some other value (e.g. n > 0 + works, not n > 1), or we can incorrectly end up running one additional + iteration. To by-pass any hoisting that the compiler may have done + with the first arg to `count`, we can instead look at the bb before + the loop preheader: this should end with a cmp+jump pair, where the + cmp needs to be: (const_int 0). */ + if (!pre_loop_bb2 || !BB_END (pre_loop_bb2) + || !prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb2)) + || !INSN_P (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb2)))) + return GEN_INT (1); + else + initial_compare + = PATTERN (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb2))); + + if (!(initial_compare && GET_CODE (initial_compare) == SET + && cc_register (XEXP (initial_compare, 0), VOIDmode) + && GET_CODE (XEXP (initial_compare, 1)) == COMPARE + && CONST_INT_P (XEXP (XEXP (initial_compare, 1), 1)) + && INTVAL (XEXP (XEXP (initial_compare, 1), 1)) == 0)) + return GEN_INT (1); + } + else if (GET_CODE (count) == PLUS) + { + if (!(GET_CODE (XEXP (count, 0)) == NOT + && REG_P (XEXP (XEXP (count, 0), 0)) && REG_P (XEXP (count, 1)))) + return GEN_INT (1); + else + { + /* Verify the first argument to the plus */ + rtx loop_counter = XEXP (XEXP (count, 0), 0); + df_ref loop_counter_init_def = NULL; + loop_counter_init_def + = df_bb_regno_last_def_find (pre_loop_bb1, REGNO (loop_counter)); + rtx loop_counter_init + = PATTERN (DF_REF_INSN (loop_counter_init_def)); + if (!(loop_counter_init_def && GET_CODE (loop_counter_init) == SET + && CONST_INT_P (XEXP (loop_counter_init, 1)) + && INTVAL (XEXP (loop_counter_init, 1)) == 0)) + return GEN_INT (1); + + /* Verify the first argument to the plus */ + rtx count_max = XEXP (count, 1); + df_ref counter_max_def = NULL; + counter_max_def = DF_REG_DEF_CHAIN (REGNO (count_max)); + if (counter_max_def + && (GET_CODE (XEXP (single_set (DF_REF_INSN (counter_max_def)), + 1)) + == LSHIFTRT + || GET_CODE ( + XEXP (single_set (DF_REF_INSN (counter_max_def)), 1)) + == ASHIFTRT)) + { + shift_expr + = XEXP (single_set (DF_REF_INSN (counter_max_def)), 1); + } + else + return GEN_INT (1); + } + } + else + return GEN_INT (1); + + /* Check the validity of the shift: the second operand needs to be a + constant. */ + if (!CONSTANT_P (XEXP (shift_expr, 1))) + return GEN_INT (1); + /* Extract the loop decrement from the [A/L]SHIFTR 2nd operand of count. */ + decrementnum = (1 << (INTVAL (XEXP (shift_expr, 1)))); + /* Ensure it matches the number of lanes of the vctp instruction. */ + if (decrementnum != arm_mve_get_vctp_lanes (PATTERN (vctp_insn))) + return GEN_INT (1); + + rtx_insn *insn = 0; + rtx_insn *cur_insn = 0; + rtx_insn *seq; + rtx vctp_vpr_generated = NULL_RTX; + rtx insn_vpr_reg_operand = NULL_RTX; + + /* Scan through the insns in the loop bb and emit the transformed bb + insns to a sequence. */ + start_sequence (); + FOR_BB_INSNS (body, insn) + { + if (GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn)) + continue; + else if (NOTE_P (insn)) + emit_note ((enum insn_note)NOTE_KIND (insn)); + else if (DEBUG_INSN_P (insn)) + emit_debug_insn (PATTERN (insn)); + else if (!INSN_P (insn)) + { + end_sequence (); + return GEN_INT (1); + } + /* When we find the vctp instruction: This may be followed by + a sign-extend insn to SImode. If it is, then save the + sign-extended REG into vctp_vpr_generated. If there is no + sign-extend, then store the raw output of the vctp. + For any VPT-predicated instructions we need to ensure that + the VPR they use is the same as the one given here and + they often consume the output of a subreg of the SImode + sign-extended VPR-reg. As a result, comparing against the + output of the sign-extend is more likely to succeed. + This code also guarantees to us that the vctp comes before + any instructions that use the VPR within the loop, for the + dlstp/letp transform to succeed. */ + else if (insn == vctp_insn) + { + rtx_insn *next_use1 = NULL; + df_ref use; + for (use = DF_REG_USE_CHAIN ( + DF_REF_REGNO (DF_INSN_INFO_DEFS (DF_INSN_INFO_GET (insn)))); + use; use = DF_REF_NEXT_REG (use)) + if (!next_use1 && NONDEBUG_INSN_P (DF_REF_INSN (use))) + next_use1 = DF_REF_INSN (use); + + if (GET_CODE (SET_SRC (single_set (next_use1))) == SIGN_EXTEND) + { + rtx_insn *next_use2 = NULL; + for (use = DF_REG_USE_CHAIN (DF_REF_REGNO ( + DF_INSN_INFO_DEFS (DF_INSN_INFO_GET (next_use1)))); + use; use = DF_REF_NEXT_REG (use)) + if (!next_use2 && NONDEBUG_INSN_P (DF_REF_INSN (use))) + next_use2 = DF_REF_INSN (use); + + if (GET_CODE (SET_SRC (single_set (next_use2))) == SUBREG) + vctp_vpr_generated = XEXP (PATTERN (next_use2), 0); + } + + if (!vctp_vpr_generated) + vctp_vpr_generated = XEXP (PATTERN (insn), 0); + /* Also emit a USE of the source register of the vctp. + This holds the number of elements being processed + by the loop. This later gets stored into `count` + for the middle-end to initialise the loop counter. */ + emit_use (XVECEXP (XEXP (PATTERN (insn), 1), 0, 0)); + continue; + } + /* If the insn pattern requires the use of the VPR value from the + vctp as an input parameter. */ + else if ((insn_vpr_reg_operand = arm_get_required_vpr_reg (insn)) + && !arm_get_required_vpr_reg_ret_val (insn) + && rtx_equal_p (vctp_vpr_generated, insn_vpr_reg_operand)) + { + gcc_assert (MVE_VPT_PREDICATED_INSN_P (insn)); + int new_icode = get_attr_mve_unpredicated_insn (insn); + extract_insn (insn); + rtx arr[8]; + int j = 0; + + /* When transforming a VPT-predicated instruction + into its unpredicated equivalent we need to drop + the VPR operand and we may need to also drop a + merge "vuninit" input operand, depending on the + instruction pattern. Here ensure that we have at + most a two-operand difference between the two + instrunctions. */ + int n_operands_diff + = recog_data.n_operands - insn_data[new_icode].n_operands; + gcc_assert (n_operands_diff > 0 && n_operands_diff <= 2); + + /* Then, loop through the operands of the predicated + instruction, and retain the ones that map to the + unpredicated instruction. */ + for (int i = 0; i < recog_data.n_operands; i++) + { + /* Ignore the VPR and, if needed, the vuninit + operand. */ + if (insn_vpr_reg_operand == recog_data.operand[i] + || (n_operands_diff == 2 + && !strcmp (recog_data.constraints[i], "0"))) + continue; + else + { + arr[j] = recog_data.operand[i]; + j++; + } + } + + /* Finally, emit the upredicated instruction. */ + switch (j) + { + case 1: + emit_insn (GEN_FCN (new_icode) (arr[0])); + break; + case 2: + emit_insn (GEN_FCN (new_icode) (arr[0], arr[1])); + break; + case 3: + emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2])); + break; + case 4: + emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], + arr[3])); + break; + case 5: + emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], arr[3], + arr[4])); + break; + case 6: + emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], arr[3], + arr[4], arr[5])); + break; + case 7: + emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], arr[3], + arr[4], arr[5], arr[6])); + break; + default: + gcc_unreachable (); + } + } + /* If within the loop we have an MVE vector instruction that is + unpredicated, the dlstp/letp looping will add automatic + predication to it. This ONLY is safe if it is not a store insn + and either the output of this insn is fed to an insn where we + know that it won't affect the result (e.g. a predicated insn). */ + else if (MVE_VPT_UNPREDICATED_INSN_P (insn) + && !arm_get_required_vpr_reg_ret_val (insn)) + { + if (mve_memory_operand (SET_DEST (single_set (insn)), + GET_MODE (SET_DEST (single_set (insn))))) + { + end_sequence (); + return GEN_INT (1); + } + /* Scan through the insn that USE the DEF of this insn + and bail out if any one is: + a) Outside the current basic block. + b) Not a predicable insn. + c) An unpredicated store insn. + Here we allow unpredicated vector insns: we expect + that this analysis will also be run, so we don't need + to follow them through to their DEFs). */ + df_ref insn_def = NULL; + insn_def = DF_INSN_INFO_DEFS (DF_INSN_INFO_GET (insn)); + for (df_ref use = DF_REG_USE_CHAIN (DF_REF_REGNO (insn_def)); use; + use = DF_REF_NEXT_REG (use)) + { + rtx_insn *next_use_insn = DF_REF_INSN (use); + if (NONDEBUG_INSN_P (next_use_insn)) + { + rtx next_insn_set_dest = SET_DEST (single_set (next_use_insn)); + if (BLOCK_FOR_INSN (insn) != BLOCK_FOR_INSN (next_use_insn) + || (NONDEBUG_INSN_P (next_use_insn) + && (!MVE_VPT_PREDICABLE_INSN_P (next_use_insn) + || (MVE_VPT_UNPREDICATED_INSN_P (next_use_insn) + && mve_memory_operand ( + next_insn_set_dest, + GET_MODE (next_insn_set_dest)))))) + { + end_sequence (); + return GEN_INT (1); + } + } + } + emit_insn (PATTERN (insn)); + } + /* Instructions that aren't predicable MVE vector instructions and + don't require the VPR can be carried over as-is. */ + else + { + df_ref insn_uses = NULL; + FOR_EACH_INSN_USE (insn_uses, insn) + { + for (df_ref def = DF_REG_DEF_CHAIN (DF_REF_REGNO (insn_uses)); def; + def = DF_REF_NEXT_REG (def)) + { + rtx vpr = arm_get_required_vpr_reg_ret_val (DF_REF_INSN (def)); + if (vpr && rtx_equal_p (vctp_vpr_generated, vpr)) + { + end_sequence (); + return GEN_INT (1); + } + } + } + emit_insn (PATTERN (insn)); + } + } + seq = get_insns (); + end_sequence (); + + /* Re-write the entire BB contents with the transformed + sequence. */ + FOR_BB_INSNS_SAFE (body, insn, cur_insn) + if (!(GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn))) + delete_insn (insn); + for (insn = seq; NEXT_INSN (insn); insn = NEXT_INSN (insn)) + if (NOTE_P (insn)) + emit_note_after ((enum insn_note)NOTE_KIND (insn), BB_END (body)); + else if (DEBUG_INSN_P (insn)) + emit_debug_insn_after (PATTERN (insn), BB_END (body)); + else + emit_insn_after (PATTERN (insn), BB_END (body)); + + emit_jump_insn_after (PATTERN (insn), BB_END (body)); + return GEN_INT (decrementnum); +} + +/* Target hook to the number of elements to be processed by a dlstp/letp loop + into `count` to intialise the counter register. The number of elements was + previously extracted from the vctp insn and placed into a USE rtx. + We only check that the doloop_end pattern successfully decrements by a + number other than -1 for a valid dlstp/letp loop. No other checking is + needed as that was done previously. */ + +rtx +arm_allow_elementwise_doloop_p (rtx count, rtx label, rtx doloop) +{ + if (doloop + && INTVAL (XEXP (SET_SRC (XVECEXP (PATTERN (doloop), 0, 1)), 1)) != -1) + { + basic_block body = BLOCK_FOR_INSN (label)->prev_bb; + rtx_insn* insn; + FOR_BB_INSNS (body, insn) + { + if (INSN_P (insn) && GET_CODE (PATTERN (insn)) == USE) + { + rtx num_elem_reg = copy_rtx (XEXP (PATTERN (insn), 0)); + delete_insn (insn); + return num_elem_reg; + } + } + } + return count; } #if CHECKING_P diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 29062cd6fb3c61c22417ef8fc25abd7819d2c034..aa2fdac22f3c7884c18ebf197b26a2d54da44c93 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -1464,7 +1464,9 @@ (define_int_attr mode1 [(VCTP8Q "8") (VCTP16Q "16") (VCTP32Q "32") (VCTP64Q "64") (VCTP8Q_M "8") (VCTP16Q_M "16") - (VCTP32Q_M "32") (VCTP64Q_M "64")]) + (VCTP32Q_M "32") (VCTP64Q_M "64") + (DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32") + (DLSTP64 "64")]) ;; Both kinds of return insn. (define_code_iterator RETURNS [return simple_return]) @@ -1773,6 +1775,8 @@ (define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48]) (define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48]) (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U]) +(define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32 + DLSTP64]) ;; Define iterators for VCMLA operations (define_int_iterator VCMLA_OP [UNSPEC_VCMLA diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 03f816ff0f28b27c8835f19894889fa6066c1ad2..f08acfebf6a73767b8ff4fe7dbc3f0a2bcf0ad44 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -11097,3 +11097,38 @@ } DONE; }) + +;; Originally expanded by 'predicated_doloop_end'. +(define_insn "*predicated_doloop_end_internal" + [(set (pc) + (if_then_else + (ge (plus:SI (reg:SI LR_REGNUM) + (match_operand:SI 0 "const_int_operand" "")) + (const_int 0)) + (label_ref (match_operand 1 "" "")) + (pc))) + (set (reg:SI LR_REGNUM) + (plus:SI (reg:SI LR_REGNUM) (match_dup 0))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT && TARGET_HAVE_LOB && TARGET_HAVE_MVE && TARGET_THUMB2" + { + if (get_attr_length (insn) == 4) + return "letp\t%|lr, %l1"; + else + return "subs\t%|lr, #%0;bgt\t%l1"; + } + [(set (attr "length") + (if_then_else + (ltu (minus (pc) (match_dup 1)) (const_int 1024)) + (const_int 4) + (const_int 6))) + (set_attr "type" "branch")]) + +(define_insn "dlstp_insn" + [ + (set (reg:SI LR_REGNUM) + (unspec:SI [(match_operand:SI 0 "s_register_operand" "r")] + DLSTP)) + ] + "TARGET_32BIT && TARGET_HAVE_LOB && TARGET_HAVE_MVE && TARGET_THUMB2" + "dlstp.\t%|lr, %0") \ No newline at end of file diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index b2309a5216550104f29a71f33506d421f79f5e2c..c5955d599721343f36dec9805665734eca5cdb21 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -1613,7 +1613,7 @@ (use (match_operand 1 "" ""))] ; label "TARGET_32BIT" " - { +{ /* Currently SMS relies on the do-loop pattern to recognize loops where (1) the control part consists of all insns defining and/or using a certain 'count' register and (2) the loop count can be @@ -1623,41 +1623,67 @@ Also used to implement the low over head loops feature, which is part of the Armv8.1-M Mainline Low Overhead Branch (LOB) extension. */ - if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB)) - { - rtx s0; - rtx bcomp; - rtx loc_ref; - rtx cc_reg; - rtx insn; - rtx cmp; - - if (GET_MODE (operands[0]) != SImode) - FAIL; - - s0 = operands [0]; - - /* Low over head loop instructions require the first operand to be LR. */ - if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands [1])) - s0 = gen_rtx_REG (SImode, LR_REGNUM); - - if (TARGET_THUMB2) - insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1))); - else - insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1))); - - cmp = XVECEXP (PATTERN (insn), 0, 0); - cc_reg = SET_DEST (cmp); - bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx); - loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [1]); - emit_jump_insn (gen_rtx_SET (pc_rtx, - gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, - loc_ref, pc_rtx))); - DONE; - } - else - FAIL; - }") + if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB)) + { + rtx s0; + rtx bcomp; + rtx loc_ref; + rtx cc_reg; + rtx insn; + rtx cmp; + rtx decrement_num; + + if (GET_MODE (operands[0]) != SImode) + FAIL; + + s0 = operands[0]; + + if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands[1])) + { + s0 = gen_rtx_REG (SImode, LR_REGNUM); + + /* If we have a compatibe MVE target, try and analyse the loop + contents to determine if we can use predicated dlstp/letp + looping. */ + if (TARGET_HAVE_MVE && TARGET_THUMB2 + && (decrement_num = arm_attempt_dlstp_transform (operands[1])) + && (INTVAL (decrement_num) != 1)) + { + insn = emit_insn + (gen_thumb2_addsi3_compare0 + (s0, s0, GEN_INT ((-1) * (INTVAL (decrement_num))))); + cmp = XVECEXP (PATTERN (insn), 0, 0); + cc_reg = SET_DEST (cmp); + bcomp = gen_rtx_GE (VOIDmode, cc_reg, const0_rtx); + loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]); + emit_jump_insn (gen_rtx_SET (pc_rtx, + gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, + loc_ref, pc_rtx))); + DONE; + } + + /* Otherwise, try standard decrement-by-one dls/le looping. */ + if (TARGET_THUMB2) + insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, + GEN_INT (-1))); + else + insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1))); + + cmp = XVECEXP (PATTERN (insn), 0, 0); + cc_reg = SET_DEST (cmp); + bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx); + loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]); + emit_jump_insn (gen_rtx_SET (pc_rtx, + gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, + loc_ref, pc_rtx))); + DONE; + } + else + FAIL; + } + else + FAIL; +}") (define_insn "*clear_apsr" [(unspec_volatile:SI [(const_int 0)] VUNSPEC_CLRM_APSR) @@ -1755,7 +1781,37 @@ { if (REGNO (operands[0]) == LR_REGNUM) { - emit_insn (gen_dls_insn (operands[0])); + /* Pick out the number by which we are decrementing the loop counter + in every iteration. If it's > 1, then use dlstp. */ + int const_int_dec_num + = abs (INTVAL (XEXP (XEXP (XVECEXP (PATTERN (operands[1]), 0, 1), + 1), + 1))); + switch (const_int_dec_num) + { + case 16: + emit_insn (gen_dlstp8_insn (operands[0])); + break; + + case 8: + emit_insn (gen_dlstp16_insn (operands[0])); + break; + + case 4: + emit_insn (gen_dlstp32_insn (operands[0])); + break; + + case 2: + emit_insn (gen_dlstp64_insn (operands[0])); + break; + + case 1: + emit_insn (gen_dls_insn (operands[0])); + break; + + default: + gcc_unreachable (); + } DONE; } else diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 7748e78437943ca0cd0d8909330ea8d3b4948ae3..744e7ab5731b5010ce280a046c5a94f6f590c350 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -579,6 +579,10 @@ VCTP16Q VCTP32Q VCTP64Q + DLSTP8 + DLSTP16 + DLSTP32 + DLSTP64 VPNOT VCREATEQ_F VCVTQ_N_TO_F_S diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 8fe49c2ba3dbb4efd6143772082c4f8f5cc9728d..6b9d38e34cc9a81c06a7ba51699feb705ffd6981 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -11796,6 +11796,14 @@ loops, and will help ivopts to make some decisions. The default version of this hook returns false. @end deftypefn +@deftypefn {Target Hook} rtx TARGET_ALLOW_ELEMENTWISE_DOLOOP_P (rtx @var{count}, rtx @var{label}, rtx @var{doloop}) +This target hook allows the target to support loop-doloop optimisations +where the value that gets put into the loop counter register is not a +pre-calculation of the number of iteration of the loop. For instance, +the value used can be the number of elements that the loop will process. +The default version of this hook returns the same rtx it was given. +@end deftypefn + @deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P Return true if the target supports hardware count register for decrement and branch. diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 62c49ac46de69f5bdd50770ba00ceb375436df2d..2e60922f6b68a034b18eb04d481554ea8983d254 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -7732,6 +7732,8 @@ to by @var{ce_info}. @hook TARGET_PREDICT_DOLOOP_P +@hook TARGET_ALLOW_ELEMENTWISE_DOLOOP_P + @hook TARGET_HAVE_COUNT_REG_DECR_P @hook TARGET_DOLOOP_COST_FOR_GENERIC diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc index 30b45c8071a709d9f0d03037c3876b532727e6f0..e143c4d0e5e688841795600c9e14e76ee9e977e9 100644 --- a/gcc/loop-doloop.cc +++ b/gcc/loop-doloop.cc @@ -85,29 +85,29 @@ doloop_condition_get (rtx_insn *doloop_pat) forms: 1) (parallel [(set (pc) (if_then_else (condition) - (label_ref (label)) - (pc))) - (set (reg) (plus (reg) (const_int -1))) - (additional clobbers and uses)]) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -n))) + (additional clobbers and uses)]) The branch must be the first entry of the parallel (also required by jump.cc), and the second entry of the parallel must be a set of the loop counter register. Some targets (IA-64) wrap the set of the loop counter in an if_then_else too. - 2) (set (reg) (plus (reg) (const_int -1)) - (set (pc) (if_then_else (reg != 0) - (label_ref (label)) - (pc))). + 2) (set (reg) (plus (reg) (const_int -n)) + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))). Some targets (ARM) do the comparison before the branch, as in the following form: - 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0))) - (set (reg) (plus (reg) (const_int -1)))]) - (set (pc) (if_then_else (cc == NE) - (label_ref (label)) - (pc))) */ + 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -n), 0))) + (set (reg) (plus (reg) (const_int -n)))]) + (set (pc) (if_then_else (cc == NE) + (label_ref (label)) + (pc))) */ pattern = PATTERN (doloop_pat); @@ -143,7 +143,7 @@ doloop_condition_get (rtx_insn *doloop_pat) || GET_CODE (cmp_arg1) != PLUS) return 0; reg_orig = XEXP (cmp_arg1, 0); - if (XEXP (cmp_arg1, 1) != GEN_INT (-1) + if (!CONST_INT_P (XEXP (cmp_arg1, 1)) || !REG_P (reg_orig)) return 0; cc_reg = SET_DEST (cmp_orig); @@ -156,7 +156,8 @@ doloop_condition_get (rtx_insn *doloop_pat) { /* We expect the condition to be of the form (reg != 0) */ cond = XEXP (SET_SRC (cmp), 0); - if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx) + if ((GET_CODE (cond) != NE && GET_CODE (cond) != GE) + || XEXP (cond, 1) != const0_rtx) return 0; } } @@ -173,14 +174,14 @@ doloop_condition_get (rtx_insn *doloop_pat) if (! REG_P (reg)) return 0; - /* Check if something = (plus (reg) (const_int -1)). + /* Check if something = (plus (reg) (const_int -n)). On IA-64, this decrement is wrapped in an if_then_else. */ inc_src = SET_SRC (inc); if (GET_CODE (inc_src) == IF_THEN_ELSE) inc_src = XEXP (inc_src, 1); if (GET_CODE (inc_src) != PLUS || XEXP (inc_src, 0) != reg - || XEXP (inc_src, 1) != constm1_rtx) + || !CONST_INT_P (XEXP (inc_src, 1))) return 0; /* Check for (set (pc) (if_then_else (condition) @@ -211,42 +212,49 @@ doloop_condition_get (rtx_insn *doloop_pat) || (GET_CODE (XEXP (condition, 0)) == PLUS && XEXP (XEXP (condition, 0), 0) == reg)) { - if (GET_CODE (pattern) != PARALLEL) /* For the second form we expect: - (set (reg) (plus (reg) (const_int -1)) - (set (pc) (if_then_else (reg != 0) - (label_ref (label)) - (pc))). + (set (reg) (plus (reg) (const_int -n)) + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))). - is equivalent to the following: + If n == 1, that is equivalent to the following: - (parallel [(set (pc) (if_then_else (reg != 1) - (label_ref (label)) - (pc))) - (set (reg) (plus (reg) (const_int -1))) - (additional clobbers and uses)]) + (parallel [(set (pc) (if_then_else (reg != 1) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -1))) + (additional clobbers and uses)]) - For the third form we expect: + For the third form we expect: - (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0)) - (set (reg) (plus (reg) (const_int -1)))]) - (set (pc) (if_then_else (cc == NE) - (label_ref (label)) - (pc))) + (parallel [(set (cc) (compare ((plus (reg) (const_int -n)), 0)) + (set (reg) (plus (reg) (const_int -n)))]) + (set (pc) (if_then_else (cc == NE) + (label_ref (label)) + (pc))) - which is equivalent to the following: + Which also for n == 1 is equivalent to the following: - (parallel [(set (cc) (compare (reg, 1)) - (set (reg) (plus (reg) (const_int -1))) - (set (pc) (if_then_else (NE == cc) - (label_ref (label)) - (pc))))]) + (parallel [(set (cc) (compare (reg, 1)) + (set (reg) (plus (reg) (const_int -1))) + (set (pc) (if_then_else (NE == cc) + (label_ref (label)) + (pc))))]) - So we return the second form instead for the two cases. + So we return the second form instead for the two cases. + For the "elementwise" form where the decrement number isn't -1, + the final value may be exceeded, so use GE instead of NE. */ - condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx); + if (GET_CODE (pattern) != PARALLEL) + { + if (INTVAL (XEXP (inc_src, 1)) != -1) + condition = gen_rtx_fmt_ee (GE, VOIDmode, inc_src, const0_rtx); + else + condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);; + } return condition; } @@ -685,17 +693,6 @@ doloop_optimize (class loop *loop) return false; } - max_cost - = COSTS_N_INSNS (param_max_iterations_computation_cost); - if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop)) - > max_cost) - { - if (dump_file) - fprintf (dump_file, - "Doloop: number of iterations too costly to compute.\n"); - return false; - } - if (desc->const_iter) iterations = widest_int::from (rtx_mode_t (desc->niter_expr, mode), UNSIGNED); @@ -722,6 +719,23 @@ doloop_optimize (class loop *loop) doloop_reg = gen_reg_rtx (mode); rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label); + /* Not all targets need to pre-calculate the number of the iterations of + the loop, they instead work by storing the number of elements in the + counter_reg and decrementing that. Call the appropriate target hook to + change the value of count. */ + count = targetm.allow_elementwise_doloop_p (count, start_label, doloop_seq); + + max_cost + = COSTS_N_INSNS (param_max_iterations_computation_cost); + if (set_src_cost (count, mode, optimize_loop_for_speed_p (loop)) + > max_cost) + { + if (dump_file) + fprintf (dump_file, + "Doloop: number of iterations too costly to compute.\n"); + return false; + } + word_mode_size = GET_MODE_PRECISION (word_mode); word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1; if (! doloop_seq diff --git a/gcc/target.def b/gcc/target.def index 082a7c62f34d7e73abb046bf949f4b8cc1ed3cf6..e2b672d7575b8bc7c19c41d63043f578debfd1ed 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -4411,6 +4411,16 @@ The default version of this hook returns false.", bool, (class loop *loop), default_predict_doloop_p) +DEFHOOK +(allow_elementwise_doloop_p, + "This target hook allows the target to support loop-doloop optimisations\n\ +where the value that gets put into the loop counter register is not a\n\ +pre-calculation of the number of iteration of the loop. For instance,\n\ +the value used can be the number of elements that the loop will process.\n\ +The default version of this hook returns the same rtx it was given.", + rtx, (rtx count, rtx label, rtx doloop), + default_allow_elementwise_doloop_p) + DEFHOOKPOD (have_count_reg_decr_p, "Return true if the target supports hardware count register for decrement\n\ diff --git a/gcc/targhooks.h b/gcc/targhooks.h index 3ca25ab6edb59b98225e7f22c0703730e569c30b..67f364c947bb51ada94ad5794fde2d58243936e7 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -88,6 +88,7 @@ extern bool default_fixed_point_supported_p (void); extern bool default_has_ifunc_p (void); extern bool default_predict_doloop_p (class loop *); +extern rtx default_allow_elementwise_doloop_p (rtx, rtx, rtx); extern machine_mode default_preferred_doloop_mode (machine_mode); extern const char * default_invalid_within_doloop (const rtx_insn *); diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc index d9e61552ad5a7b7a92eb172b011075cf7f29cc11..5d6c42e408527105361818efd65dd6e5b30f77dc 100644 --- a/gcc/targhooks.cc +++ b/gcc/targhooks.cc @@ -661,6 +661,12 @@ default_predict_doloop_p (class loop *loop ATTRIBUTE_UNUSED) return false; } +rtx +default_allow_elementwise_doloop_p (rtx count, rtx, rtx) +{ + return count; +} + /* By default, just use the input MODE itself. */ machine_mode diff --git a/gcc/testsuite/gcc.target/arm/dlstp-compile-asm.c b/gcc/testsuite/gcc.target/arm/dlstp-compile-asm.c index acf0836050c19b983feeaf97c3e52e1318bb194d..ed933a401e7888576377209295aa9bbcd9038889 100644 --- a/gcc/testsuite/gcc.target/arm/dlstp-compile-asm.c +++ b/gcc/testsuite/gcc.target/arm/dlstp-compile-asm.c @@ -140,10 +140,162 @@ TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vbrsrq, _m) TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshlq, _m) TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshrq, _m) +/* Now test some more configurations. */ + +/* Test a for loop format of decrementing to zero */ +int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7}; +void test1 (int32_t *b, int num_elems) +{ + for (int i = num_elems; i >= 0; i-= 4) + { + mve_pred16_t p = vctp32q (i); + int32x4_t va = vldrwq_z_s32 (&(a[i]), p); + vstrwq_p_s32 (b + i, va, p); + } +} + +/* Iteration counter counting up to num_iter. */ +void test2 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 0; i < num_iter; i++) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n-=16; + } +} + +/* Using an unpredicated arithmetic instruction within the loop. */ +void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + uint8x16_t vc = vaddq_u8 (va, vb); + uint8x16_t vd = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + vstrbq_p_u8 (d, vd, p); + n-=16; + } +} + +/* Using a different VPR value for one instruction in the loop. */ +void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + + + +/* Generating and using a constant VPR value in the loop, with a vctp. */ +void test12 (int32_t *a, int32_t *b, int32_t *c, int n, int g) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q (g); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Generating and using a different VPR value in the loop, with a vctp. */ +void test122 (int32_t *a, int32_t *b, int32_t *c, int n, int g) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q (g); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + g++; + } +} + +/* Generating and using a different VPR value in the loop, with a vctp_m. */ +void test13 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p2 = vctp32q_m (n, p1); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p2); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Generating and using a different VPR value in the loop, with a vcmp. */ +void test14 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + mve_pred16_t p1 = vcmpeqq_s32 (va, vb); + int32x4_t vc = vaddq_x_s32 (va, vb, p1); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Generating and using a different VPR value in the loop, with a vcmp_m. */ +void test15 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p2); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + /* The final number of DLSTPs currently is calculated by the number of - `TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY.*` macros * 6. */ -/* { dg-final { scan-assembler-times {\tdlstp} 144 } } */ -/* { dg-final { scan-assembler-times {\tletp} 144 } } */ -/* { dg-final { scan-assembler-not "\tvctp\t" } } */ -/* { dg-final { scan-assembler-not "\tvpst\t" } } */ -/* { dg-final { scan-assembler-not "p0" } } */ + `TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY.*` macros * 6 + 9. */ +/* { dg-final { scan-assembler-times {\tdlstp} 153 } } */ +/* { dg-final { scan-assembler-times {\tletp} 153 } } */ diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int16x8.c b/gcc/testsuite/gcc.target/arm/dlstp-int16x8.c new file mode 100644 index 0000000000000000000000000000000000000000..b1c03b618e0dcc7d4268f2f004663b6e332a402a --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/dlstp-int16x8.c @@ -0,0 +1,68 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */ + +#include +#include +#include +#include "lob.h" + +void __attribute__ ((noinline)) test (int16_t *a, int16_t *b, int16_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + int16x8_t va = vldrhq_z_s16 (a, p); + int16x8_t vb = vldrhq_z_s16 (b, p); + int16x8_t vc = vaddq_x_s16 (va, vb, p); + vstrhq_p_s16 (c, vc, p); + c+=8; + a+=8; + b+=8; + n-=8; + } +} + +int main () +{ + int i; + int16_t temp1[N]; + int16_t temp2[N]; + int16_t temp3[N]; + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus16 (temp1, temp2, temp3, 0); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus16 (temp1, temp2, temp3, 1); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 7); + check_plus16 (temp1, temp2, temp3, 7); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 8); + check_plus16 (temp1, temp2, temp3, 8); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 9); + check_plus16 (temp1, temp2, temp3, 9); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 16); + check_plus16 (temp1, temp2, temp3, 16); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 17); + check_plus16 (temp1, temp2, temp3, 17); + + reset_data16 (temp1, temp2, temp3, N); +} + +/* { dg-final { scan-assembler-times {\tdlstp.16} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int32x4.c b/gcc/testsuite/gcc.target/arm/dlstp-int32x4.c new file mode 100644 index 0000000000000000000000000000000000000000..4a3eb0577be0631ab4e07f6f75c2c802b535c88c --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/dlstp-int32x4.c @@ -0,0 +1,68 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */ + +#include +#include +#include +#include "lob.h" + +void __attribute__ ((noinline)) test (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} + +int main () +{ + int i; + int32_t temp1[N]; + int32_t temp2[N]; + int32_t temp3[N]; + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus32 (temp1, temp2, temp3, 0); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus32 (temp1, temp2, temp3, 1); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 3); + check_plus32 (temp1, temp2, temp3, 3); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 4); + check_plus32 (temp1, temp2, temp3, 4); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 5); + check_plus32 (temp1, temp2, temp3, 5); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 8); + check_plus32 (temp1, temp2, temp3, 8); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 9); + check_plus32 (temp1, temp2, temp3, 9); + + reset_data32 (temp1, temp2, temp3, N); +} + +/* { dg-final { scan-assembler-times {\tdlstp.32} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int64x2.c b/gcc/testsuite/gcc.target/arm/dlstp-int64x2.c new file mode 100644 index 0000000000000000000000000000000000000000..f05ff834363e2a47a9faddb6b8b09b60eb94a3c2 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/dlstp-int64x2.c @@ -0,0 +1,68 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */ + +#include +#include +#include +#include "lob.h" + +void __attribute__ ((noinline)) test (int64_t *a, int64_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp64q (n); + int64x2_t va = vldrdq_gather_offset_z_s64 (a, vcreateq_u64 (8, 0), p); + vstrdq_scatter_offset_p_s64 (c, vcreateq_u64 (8, 0), va, p); + c+=2; + a+=2; + n-=2; + } +} + +int main () +{ + int i; + int64_t temp1[N]; + int64_t temp3[N]; + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 0); + check_memcpy64 (temp1, temp3, 0); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 1); + check_memcpy64 (temp1, temp3, 1); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 2); + check_memcpy64 (temp1, temp3, 2); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 3); + check_memcpy64 (temp1, temp3, 3); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 4); + check_memcpy64 (temp1, temp3, 4); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 5); + check_memcpy64 (temp1, temp3, 5); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 6); + check_memcpy64 (temp1, temp3, 6); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 7); + check_memcpy64 (temp1, temp3, 7); + + reset_data64 (temp1, temp3, N); +} + +/* { dg-final { scan-assembler-times {\tdlstp.64} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int8x16.c b/gcc/testsuite/gcc.target/arm/dlstp-int8x16.c new file mode 100644 index 0000000000000000000000000000000000000000..f281ba7848d1ba3321124e996eb5bea44977c2ab --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/dlstp-int8x16.c @@ -0,0 +1,68 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */ + +#include +#include +#include +#include "lob.h" + +void __attribute__ ((noinline)) test (int8_t *a, int8_t *b, int8_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + int8x16_t vb = vldrbq_z_s8 (b, p); + int8x16_t vc = vaddq_x_s8 (va, vb, p); + vstrbq_p_s8 (c, vc, p); + c+=16; + a+=16; + b+=16; + n-=16; + } +} + +int main () +{ + int i; + int8_t temp1[N]; + int8_t temp2[N]; + int8_t temp3[N]; + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus8 (temp1, temp2, temp3, 0); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus8 (temp1, temp2, temp3, 1); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 15); + check_plus8 (temp1, temp2, temp3, 15); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 16); + check_plus8 (temp1, temp2, temp3, 16); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 17); + check_plus8 (temp1, temp2, temp3, 17); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 32); + check_plus8 (temp1, temp2, temp3, 32); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 33); + check_plus8 (temp1, temp2, temp3, 33); + + reset_data8 (temp1, temp2, temp3, N); +} + +/* { dg-final { scan-assembler-times {\tdlstp.8} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/dlstp-invalid-asm.c b/gcc/testsuite/gcc.target/arm/dlstp-invalid-asm.c new file mode 100644 index 0000000000000000000000000000000000000000..c0d049360285f427837b7d815a3348a815cd41fa --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/dlstp-invalid-asm.c @@ -0,0 +1,189 @@ + +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */ + +#include + +/* Terminating on a non-zero number of elements. */ +void test1 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + while (n > 1) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Similar, terminating on a non-zero number of elements, but in a for loop + format. */ +int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7}; +void test2 (int32_t *b, int num_elems) +{ + for (int i = num_elems; i >= 2; i-= 4) + { + mve_pred16_t p = vctp32q (i); + int32x4_t va = vldrwq_z_s32 (&(a[i]), p); + vstrwq_p_s32 (b + i, va, p); + } +} + +/* Iteration counter counting up to num_iter, with a non-zero starting num. */ +void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 1; i < num_iter; i++) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Iteration counter counting up to num_iter, with a larger increment */ +void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 0; i < num_iter; i+=2) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Using an unpredicated store instruction within the loop. */ +void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_u8 (va, vb); + uint8x16_t vd = vaddq_x_u8 (va, vb, p); + vstrbq_u8 (d, vd); + n -= 16; + } +} + +/* Using an unpredicated store outside the loop. */ +void test16 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx) +{ + uint8_t sum = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p); + vx = vaddq_u8 (vx, vc); + a += 16; + b += 16; + n -= 16; + } + vstrbq_u8 (c, vx); +} + +/* Using an unpredicated op where the result is valid outside the bb. */ +uint8_t test17 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx) +{ + uint8_t sum = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p); + sum += vaddvq_u8 (vc); + a += 16; + b += 16; + n -= 16; + } + return sum; +} + +/* Using a VPR that gets modified within the loop. */ +void test7 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + p++; + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using a VPR that gets re-generated within the loop. */ +void test8 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + mve_pred16_t p = vctp32q (n); + while (n > 0) + { + int32x4_t va = vldrwq_z_s32 (a, p); + p = vctp32q (n); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using vctp32q_m instead of vctp32q. */ +void test9 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q_m (n, p0); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Generating and using a different VPR value in the loop, with a vctp_m that is tied to the base vctp VPR. */ +void test10 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q_m (n, p); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p1); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* { dg-final { scan-assembler-not "\tdlstp" } } */ +/* { dg-final { scan-assembler-not "\tletp" } } */ diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h index feaae7cc89959b3147368980120700bbc3e85ecb..3941fe7a8b620e62a5f742722be1ba2d031f5a8d 100644 --- a/gcc/testsuite/gcc.target/arm/lob.h +++ b/gcc/testsuite/gcc.target/arm/lob.h @@ -1,15 +1,131 @@ #include - +#include /* Common code for lob tests. */ #define NO_LOB asm volatile ("@ clobber lr" : : : "lr" ) -#define N 10000 +#define N 100 + +static void +reset_data (int *a, int *b, int *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data8 (int8_t *a, int8_t *b, int8_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data16 (int16_t *a, int16_t *b, int16_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data32 (int32_t *a, int32_t *b, int32_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data64 (int64_t *a, int64_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (c, 0, x * sizeof (*c)); +} + +static void +check_plus (int *a, int *b, int *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus8 (int8_t *a, int8_t *b, int8_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus16 (int16_t *a, int16_t *b, int16_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus32 (int32_t *a, int32_t *b, int32_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} static void -reset_data (int *a, int *b, int *c) +check_memcpy64 (int64_t *a, int64_t *c, int x) { - memset (a, -1, N * sizeof (*a)); - memset (b, -1, N * sizeof (*b)); - memset (c, -1, N * sizeof (*c)); + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != a[i]) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } } diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c index ba5c82cd55c582c96a18ad417a3041e43d843613..c8ce653a5c39fb1ffcf82a6e584d9a0467a130c0 100644 --- a/gcc/testsuite/gcc.target/arm/lob1.c +++ b/gcc/testsuite/gcc.target/arm/lob1.c @@ -54,29 +54,18 @@ loop3 (int *a, int *b, int *c) } while (i < N); } -void -check (int *a, int *b, int *c) -{ - for (int i = 0; i < N; i++) - { - NO_LOB; - if (c[i] != a[i] + b[i]) - abort (); - } -} - int main (void) { - reset_data (a, b, c); + reset_data (a, b, c, N); loop1 (a, b ,c); - check (a, b ,c); - reset_data (a, b, c); + check_plus (a, b, c, N); + reset_data (a, b, c, N); loop2 (a, b ,c); - check (a, b ,c); - reset_data (a, b, c); + check_plus (a, b, c, N); + reset_data (a, b, c, N); loop3 (a, b ,c); - check (a, b ,c); + check_plus (a, b, c, N); return 0; } diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c index 17b6124295e8ae9e1cb57e41fa43a954b3390eec..4fe116e2c2be3748d1bb6da7bb9092db8f962abc 100644 --- a/gcc/testsuite/gcc.target/arm/lob6.c +++ b/gcc/testsuite/gcc.target/arm/lob6.c @@ -79,14 +79,14 @@ check (void) int main (void) { - reset_data (a1, b1, c1); - reset_data (a2, b2, c2); + reset_data (a1, b1, c1, N); + reset_data (a2, b2, c2, N); loop1 (a1, b1, c1); ref1 (a2, b2, c2); check (); - reset_data (a1, b1, c1); - reset_data (a2, b2, c2); + reset_data (a1, b1, c1, N); + reset_data (a2, b2, c2, N); loop2 (a1, b1, c1); ref2 (a2, b2, c2); check ();