diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index aea472bfbb9deaa8e925756963c7c5cc6fdc0d09..10ffc713f7744252655e9d78510df5cef98c2355 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -65,7 +65,7 @@ extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
 extern bool arm_q_bit_access (void);
 extern bool arm_ge_bits_access (void);
 extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern rtx arm_attempt_dlstp_transform (rtx);
 #ifdef RTX_CODE
 enum reg_class arm_mode_base_reg_class (machine_mode);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 1d285051fa6d8e6c05813801eb678e58da25714d..d4ac04e095f01f42c1c4611074c3499a01242716 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -472,6 +472,9 @@ static const struct attribute_spec arm_attribute_table[] =
 #undef TARGET_SCHED_REORDER
 #define TARGET_SCHED_REORDER arm_sched_reorder
 
+#undef TARGET_ALLOW_ELEMENTWISE_DOLOOP_P
+#define TARGET_ALLOW_ELEMENTWISE_DOLOOP_P arm_allow_elementwise_doloop_p
+
 #undef TARGET_REGISTER_MOVE_COST
 #define TARGET_REGISTER_MOVE_COST arm_register_move_cost
 
@@ -33067,7 +33070,7 @@ arm_block_set_vect (rtx dstbase,
     return arm_block_set_unaligned_vect (dstbase, length, value, align);
 }
 
-/* Expand string store operation.  Firstly we try to do that by using
+/* Expand string store operation.  First we try to do that by using
    vectorization instructions, then try with ARM unaligned access and
    double-word store if profitable.  OPERANDS[0] is the destination,
    OPERANDS[1] is the number of bytes, operands[2] is the value to
@@ -34395,8 +34398,563 @@ arm_target_insn_ok_for_lob (rtx insn)
 
   return single_succ_p (bb)
	 && single_pred_p (bb)
-	 && single_succ_edge (bb)->dest == single_pred_edge (bb)->src
-	 && contains_no_active_insn_p (bb);
+	 && single_succ_edge (bb)->dest == single_pred_edge (bb)->src;
+}
+
+static int
+arm_mve_get_vctp_lanes (rtx x)
+{
+  if (GET_CODE (x) == SET && GET_CODE (XEXP (x, 1)) == UNSPEC
+      && XINT (XEXP (x, 1), 1) == VCTP)
+    {
+      switch (GET_MODE (XEXP (x, 1)))
+	{
+	case V16BImode:
+	  return 16;
+	case V8BImode:
+	  return 8;
+	case V4BImode:
+	  return 4;
+	case V2QImode:
+	  return 2;
+	default:
+	  break;
+	}
+    }
+  return 0;
+}
+
+/* Check if an insn requires the use of the VPR_REG and, if it does, return
+   the sub-rtx of the VPR_REG.  The `type` argument controls which operands
+   this function should check:
+   * For type == 0, check all operands, including the OUT operands,
+     and return the first occurrence of the VPR_REG.
+   * For type == 1, only check the input operands.
+   * For type == 2, only check the output operands.
+   (INOUT operands are considered both as input and output operands.)  */
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn, unsigned int type = 0)
+{
+  gcc_assert (type < 3);
+  if (!NONJUMP_INSN_P (insn))
+    return NULL_RTX;
+
+  bool requires_vpr;
+  extract_constrain_insn (insn);
+  int n_operands = recog_data.n_operands;
+  if (recog_data.n_alternatives == 0)
+    return NULL_RTX;
+
+  /* Fill in recog_op_alt with information about the constraints of
+     this insn.  */
+  preprocess_constraints (insn);
+
+  for (int op = 0; op < n_operands; op++)
+    {
+      requires_vpr = true;
+      if (type == 1 && (recog_data.operand_type[op] == OP_OUT
+			|| recog_data.operand_type[op] == OP_INOUT))
+	continue;
+      else if (type == 2 && (recog_data.operand_type[op] == OP_IN
+			     || recog_data.operand_type[op] == OP_INOUT))
+	continue;
+
+      /* Iterate through alternatives of operand "op" in recog_op_alt and
+	 identify if the operand is required to be the VPR.  */
+      for (int alt = 0; alt < recog_data.n_alternatives; alt++)
+	{
+	  const operand_alternative *op_alt
+	    = &recog_op_alt[alt * n_operands];
+	  /* Fetch the reg_class for each entry and check it against the
+	     VPR_REG reg_class.  */
+	  if (alternative_class (op_alt, op) != VPR_REG)
+	    requires_vpr = false;
+	}
+      /* If all alternatives of the insn require the VPR reg for this operand,
+	 it means that either this is a VPR-generating instruction, like a
+	 vctp, vcmp, etc., or it is a VPT-predicated instruction.  Return the
+	 sub-rtx of the VPR reg operand.  */
+      if (requires_vpr)
+	return recog_data.operand[op];
+    }
+  return NULL_RTX;
+}
+
+static rtx
+ALWAYS_INLINE ATTRIBUTE_UNUSED
+arm_get_required_vpr_reg_ret_val (rtx_insn *insn)
+{
+  return arm_get_required_vpr_reg (insn, 2);
+}
+
+static rtx
+ALWAYS_INLINE
+arm_get_required_vpr_reg_param (rtx_insn *insn)
+{
+  return arm_get_required_vpr_reg (insn, 1);
+}
+
+/* Scan the basic block of a loop body for a vctp instruction.  If there is
+   at least one vctp instruction, return the first one.  */
+
+static rtx_insn *
+arm_mve_get_loop_vctp (basic_block bb)
+{
+  rtx_insn *insn = BB_HEAD (bb);
+
+  /* Now scan through all the instruction patterns and
+     pick out any MVE instructions.  */
+  FOR_BB_INSNS (bb, insn)
+    if (INSN_P (insn))
+      if (arm_mve_get_vctp_lanes (PATTERN (insn)) != 0)
+	return insn;
+  return NULL;
+}
+
+/* Recursively scan through the DF chain backwards within the basic block and
+   determine if any of the USEs of the original insn (or the USEs of the insns
+   where they were DEF-ed, etc., recursively) were affected by implicit VPT
+   predication of an MVE_VPT_UNPREDICATED_INSN_P in a dlstp/letp loop.  */
+
+static bool
+arm_mve_check_df_for_implic_predic (rtx_insn *insn, rtx_insn *loop_vctp,
+				    rtx vctp_vpr_generated)
+{
+  rtx insn_vpr_reg_operand = NULL_RTX;
+
+  /* Exit the recursion with "true" when we find an unpredicated insn, or
+     with "false" if we find the dlstp/letp loop vctp or a correctly
+     dlstp/letp predicated insn.  */
+  if (MVE_VPT_UNPREDICATED_INSN_P (insn) && insn != loop_vctp)
+    return true;
+  else if (insn == loop_vctp
+	   || ((insn_vpr_reg_operand = arm_get_required_vpr_reg_param (insn))
+	       && rtx_equal_p (vctp_vpr_generated, insn_vpr_reg_operand)))
+    return false;
+
+  /* For each USE in the instruction we are called for, we will loop backwards
+     up the BB to try and find if it was DEF-ed within the dlstp/letp loop.
+     If we do find such a DEF, then recurse on its INSN.  The intention here
+     is to find if any of the inputs to the current instruction were affected
+     by implicit predication due to being MVE_VPT_UNPREDICATED_INSN_Ps in a
+     dlstp/letp loop.  */
+  df_ref insn_uses = NULL;
+  FOR_EACH_INSN_USE (insn_uses, insn)
+    {
+      /* Starting from the current insn, scan backwards through the insn chain
+	 until BB_HEAD: "for each insn in the BB prior to the current one".  */
+      rtx_insn *prev_insn = NULL;
+      for (prev_insn = insn;
+	   prev_insn
+	   && prev_insn != PREV_INSN (BB_HEAD (BLOCK_FOR_INSN (insn)));
+	   prev_insn = PREV_INSN (prev_insn))
+	{
+	  /* Look at all the DEFs of that previous insn: if one of them is on
+	     the same REG as our current insn, then recurse in order to check
+	     that insn's USEs.  If any of these insns return true as
+	     MVE_VPT_UNPREDICATED_INSN_Ps, then the whole chain is affected
+	     by the change in behaviour from being placed in a dlstp/letp
+	     loop.  */
+	  df_ref prev_insn_defs = NULL;
+	  FOR_EACH_INSN_DEF (prev_insn_defs, prev_insn)
+	    {
+	      if (DF_REF_REGNO (insn_uses) == DF_REF_REGNO (prev_insn_defs)
+		  && insn != prev_insn
+		  && BLOCK_FOR_INSN (insn) == BLOCK_FOR_INSN (prev_insn)
+		  && arm_mve_check_df_for_implic_predic (prev_insn,
+							 loop_vctp,
+							 vctp_vpr_generated))
+		return true;
+	    }
+	}
+    }
+  return false;
+}
+
+/* Attempt to transform the loop contents of a loop basic block from VPT
+   predicated insns into unpredicated insns for a dlstp/letp loop.  */
+
+rtx
+arm_attempt_dlstp_transform (rtx label)
+{
+  int decrementnum;
+  basic_block body = BLOCK_FOR_INSN (label)->prev_bb;
+
+  /* Ensure that the bb is within a loop that has all required metadata.  */
+  if (!body->loop_father || !body->loop_father->header
+      || !body->loop_father->simple_loop_desc)
+    return GEN_INT (1);
+  basic_block pre_loop_bb1 = loop_preheader_edge (body->loop_father)->src;
+  basic_block pre_loop_bb2 = pre_loop_bb1->prev_bb;
+  rtx count = simple_loop_desc (body->loop_father)->niter_expr;
+  rtx shift_expr = NULL_RTX;
+  rtx initial_compare = NULL_RTX;
+
+  /* Doloop can only be done "elementwise" with predicated dlstp/letp if it
+     contains a VCTP on the number of elements processed by the loop.
+     Find the VCTP predicate generation inside the loop body BB.  */
+  rtx_insn *vctp_insn = arm_mve_get_loop_vctp (body);
+  if (!vctp_insn)
+    return GEN_INT (1);
+
+  /* Additionally, the iteration counter must only get decremented by the
+     number of MVE lanes (as per the data type).
+     There are only two types of loops that can be turned into dlstp/letp
+     loops:
+       A) Loops of the form:
+	    while (num_of_elem > 0)
+	      {
+		p = vctp (num_of_elem);
+		num_of_elem -= num_of_lanes;
+	      }
+       B) Loops of the form:
+	    int num_of_iters = (num_of_elem + num_of_lanes - 1) / num_of_lanes;
+	    for (i = 0; i < num_of_iters; i++)
+	      {
+		p = vctp (num_of_elem);
+		num_of_elem -= num_of_lanes;
+	      }
+
+     These can be verified through the "count" variable in the middle-end
+     (a.k.a. get_simple_loop_desc (loop)->niter_expr).  This is the
+     expression used to calculate the number of iterations that the loop
+     would execute for a standard dls/le loop.
+
+     For dlstp/letp we only support cases where this is a power of 2, so from
+     "count" we want to extract something like:
+       ([l/a]shiftrt: (x) (const_int y))
+     For loops of form A), "count" is already a shiftrt expression.
+     For loops of form B), "count" gets given as:
+       (plus: (not (i)) (num_of_iters))
+     with the setup happening in a previous basic block.  Here we need to
+     verify:
+       * That i is _always_ initialized to (const_int 0).
+       * That num_of_iters is a shiftrt expression.  */
+  if (GET_CODE (count) == LSHIFTRT
+      || GET_CODE (count) == ASHIFTRT)
+    {
+      shift_expr = count;
+      /* In this situation, where we are looping on a decreasing number of
+	 elements, a dlstp/letp loop can only work if the looping ends when
+	 the element counter reaches zero and not some other value (e.g.
+	 n > 0 works, but n > 1 does not), or we can incorrectly end up
+	 running one additional iteration.  To bypass any hoisting that the
+	 compiler may have done with the first arg to `count`, we can instead
+	 look at the bb before the loop preheader: this should end with a
+	 cmp+jump pair, where the cmp needs to be against (const_int 0).
*/ + if (!pre_loop_bb2 || !BB_END (pre_loop_bb2) + || !prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb2)) + || !INSN_P (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb2)))) + return GEN_INT (1); + else + initial_compare + = PATTERN (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb2))); + + if (!(initial_compare && GET_CODE (initial_compare) == SET + && cc_register (XEXP (initial_compare, 0), VOIDmode) + && GET_CODE (XEXP (initial_compare, 1)) == COMPARE + && CONST_INT_P (XEXP (XEXP (initial_compare, 1), 1)) + && INTVAL (XEXP (XEXP (initial_compare, 1), 1)) == 0)) + return GEN_INT (1); + } + else if (GET_CODE (count) == PLUS) + { + if (!(GET_CODE (XEXP (count, 0)) == NOT + && REG_P (XEXP (XEXP (count, 0), 0)) && REG_P (XEXP (count, 1)))) + return GEN_INT (1); + else + { + /* Verify the first argument to the plus. */ + rtx loop_counter = XEXP (XEXP (count, 0), 0); + df_ref loop_counter_init_def = NULL; + loop_counter_init_def + = df_bb_regno_last_def_find (pre_loop_bb1, REGNO (loop_counter)); + rtx loop_counter_init + = PATTERN (DF_REF_INSN (loop_counter_init_def)); + if (!(loop_counter_init_def && GET_CODE (loop_counter_init) == SET + && CONST_INT_P (XEXP (loop_counter_init, 1)) + && INTVAL (XEXP (loop_counter_init, 1)) == 0)) + return GEN_INT (1); + + /* Verify the second argument to the plus. */ + rtx count_max = XEXP (count, 1); + df_ref counter_max_def = NULL; + counter_max_def = DF_REG_DEF_CHAIN (REGNO (count_max)); + if (counter_max_def + && (GET_CODE (XEXP (single_set (DF_REF_INSN (counter_max_def)), + 1)) + == LSHIFTRT + || GET_CODE ( + XEXP (single_set (DF_REF_INSN (counter_max_def)), 1)) + == ASHIFTRT)) + { + shift_expr + = XEXP (single_set (DF_REF_INSN (counter_max_def)), 1); + } + else + return GEN_INT (1); + } + } + else + return GEN_INT (1); + + /* Check the validity of the shift: the second operand needs to be a + constant. */ + if (!CONSTANT_P (XEXP (shift_expr, 1))) + return GEN_INT (1); + /* Extract the loop decrement from the [A/L]SHIFTR 2nd operand of count. */ + decrementnum = (1 << (INTVAL (XEXP (shift_expr, 1)))); + /* Ensure it matches the number of lanes of the vctp instruction. */ + if (decrementnum != arm_mve_get_vctp_lanes (PATTERN (vctp_insn))) + return GEN_INT (1); + + rtx_insn *insn = 0; + rtx_insn *cur_insn = 0; + rtx_insn *seq; + rtx vctp_vpr_generated = NULL_RTX; + + /* Scan through the insns in the loop bb and emit the transformed bb + insns to a sequence. */ + start_sequence (); + FOR_BB_INSNS (body, insn) + { + rtx insn_vpr_reg_operand = NULL_RTX; + if (GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn)) + continue; + else if (NOTE_P (insn)) + emit_note ((enum insn_note) NOTE_KIND (insn)); + else if (DEBUG_INSN_P (insn)) + emit_debug_insn (PATTERN (insn)); + else if (!INSN_P (insn)) + { + end_sequence (); + return GEN_INT (1); + } + /* When we find the vctp instruction: This may be followed by + a zero-extend insn to SImode. If it is, then save the + zero-extended REG into vctp_vpr_generated. If there is no + zero-extend, then store the raw output of the vctp. + For any VPT-predicated instructions we need to ensure that + the VPR they use is the same as the one given here and + they often consume the output of a subreg of the SImode + zero-extended VPR-reg. As a result, comparing against the + output of the zero-extend is more likely to succeed. + This code also guarantees to us that the vctp comes before + any instructions that use the VPR within the loop, for the + dlstp/letp transform to succeed. 
+	 */
+      else if (insn == vctp_insn)
+	{
+	  rtx_insn *next_use1 = NULL;
+	  df_ref use;
+	  for (use = DF_REG_USE_CHAIN
+		       (DF_REF_REGNO (DF_INSN_INFO_DEFS (DF_INSN_INFO_GET (insn))));
+	       use; use = DF_REF_NEXT_REG (use))
+	    if (!next_use1 && NONDEBUG_INSN_P (DF_REF_INSN (use)))
+	      next_use1 = DF_REF_INSN (use);
+
+	  if (GET_CODE (SET_SRC (single_set (next_use1))) == ZERO_EXTEND)
+	    {
+	      rtx_insn *next_use2 = NULL;
+	      for (use = DF_REG_USE_CHAIN
+			   (DF_REF_REGNO (DF_INSN_INFO_DEFS
+					    (DF_INSN_INFO_GET (next_use1))));
+		   use; use = DF_REF_NEXT_REG (use))
+		if (!next_use2 && NONDEBUG_INSN_P (DF_REF_INSN (use)))
+		  next_use2 = DF_REF_INSN (use);
+
+	      if (GET_CODE (SET_SRC (single_set (next_use2))) == SUBREG)
+		vctp_vpr_generated = XEXP (PATTERN (next_use2), 0);
+	    }
+
+	  if (!vctp_vpr_generated)
+	    {
+	      end_sequence ();
+	      return GEN_INT (1);
+	    }
+	  /* Also emit a USE of the source register of the vctp.
+	     This holds the number of elements being processed
+	     by the loop.  This later gets stored into `count`
+	     for the middle-end to initialise the loop counter.  */
+	  emit_use (XVECEXP (XEXP (PATTERN (insn), 1), 0, 0));
+	  continue;
+	}
+      /* If the insn pattern requires the use of the VPR value from the
+	 vctp as an input parameter.  */
+      else if ((insn_vpr_reg_operand = arm_get_required_vpr_reg_param (insn))
+	       && rtx_equal_p (vctp_vpr_generated, insn_vpr_reg_operand))
+	{
+	  gcc_assert (MVE_VPT_PREDICATED_INSN_P (insn));
+	  int new_icode = get_attr_mve_unpredicated_insn (insn);
+	  extract_insn (insn);
+	  rtx arr[8];
+	  int j = 0;
+
+	  /* When transforming a VPT-predicated instruction
+	     into its unpredicated equivalent we need to drop
+	     the VPR operand and we may need to also drop a
+	     merge "vuninit" input operand, depending on the
+	     instruction pattern.  Here ensure that we have at
+	     most a two-operand difference between the two
+	     instructions.  */
+	  int n_operands_diff
+	    = recog_data.n_operands - insn_data[new_icode].n_operands;
+	  gcc_assert (n_operands_diff > 0 && n_operands_diff <= 2);
+
+	  /* Then, loop through the operands of the predicated
+	     instruction, and retain the ones that map to the
+	     unpredicated instruction.  */
+	  for (int i = 0; i < recog_data.n_operands; i++)
+	    {
+	      /* Ignore the VPR and, if needed, the vuninit
+		 operand.  */
+	      if (insn_vpr_reg_operand == recog_data.operand[i]
+		  || (n_operands_diff == 2
+		      && !strcmp (recog_data.constraints[i], "0")))
+		continue;
+	      else
+		{
+		  arr[j] = recog_data.operand[i];
+		  j++;
+		}
+	    }
+
+	  /* Finally, emit the unpredicated instruction.  */
+	  switch (j)
+	    {
+	    case 1:
+	      emit_insn (GEN_FCN (new_icode) (arr[0]));
+	      break;
+	    case 2:
+	      emit_insn (GEN_FCN (new_icode) (arr[0], arr[1]));
+	      break;
+	    case 3:
+	      emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2]));
+	      break;
+	    case 4:
+	      emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+					      arr[3]));
+	      break;
+	    case 5:
+	      emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], arr[3],
+					      arr[4]));
+	      break;
+	    case 6:
+	      emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], arr[3],
+					      arr[4], arr[5]));
+	      break;
+	    case 7:
+	      emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], arr[3],
+					      arr[4], arr[5], arr[6]));
+	      break;
+	    default:
+	      gcc_unreachable ();
+	    }
+	}
+      /* If the insn isn't VPT predicated on vctp_vpr_generated, we need to
+	 make sure that it is still valid within the dlstp/letp loop.  */
+      else
+	{
+	  /* None of the registers USE-d by the instruction can be the VPR
+	     vctp_vpr_generated.  This blocks the optimisation if there are
+	     any instructions that use the optimised-out VPR value in any way
+	     other than as a VPT block predicate.  */
+	  df_ref insn_uses = NULL;
+	  FOR_EACH_INSN_USE (insn_uses, insn)
+	    {
+	      if (rtx_equal_p (vctp_vpr_generated, DF_REF_REG (insn_uses)))
+		{
+		  end_sequence ();
+		  return GEN_INT (1);
+		}
+	    }
+	  /* If within the loop we have an MVE vector instruction that is
+	     unpredicated, the dlstp/letp looping will add implicit
+	     predication to it.  This will result in a change in behaviour
+	     of the instruction, so we need to find out if any instructions
+	     that feed into the current instruction were implicitly
+	     predicated.  */
+	  if (arm_mve_check_df_for_implic_predic (insn, vctp_insn,
+						  vctp_vpr_generated))
+	    {
+	      if (mve_memory_operand (SET_DEST (single_set (insn)),
+				      GET_MODE (SET_DEST (single_set (insn)))))
+		{
+		  end_sequence ();
+		  return GEN_INT (1);
+		}
+
+	      /* Next, if we have identified that the current DEF will be
+		 modified by such implicit predication, scan through all the
+		 insns that USE it and bail out if any one is outside the
+		 current basic block.  */
+	      df_ref insn_def = NULL;
+	      insn_def = DF_INSN_INFO_DEFS (DF_INSN_INFO_GET (insn));
+	      if (insn_def)
+		{
+		  for (df_ref use = DF_REG_USE_CHAIN (DF_REF_REGNO (insn_def));
+		       use; use = DF_REF_NEXT_REG (use))
+		    {
+		      rtx_insn *next_use_insn = DF_REF_INSN (use);
+		      if (NONDEBUG_INSN_P (next_use_insn))
+			{
+			  rtx next_insn_set_dest
+			    = SET_DEST (single_set (next_use_insn));
+			  if (BLOCK_FOR_INSN (insn)
+			      != BLOCK_FOR_INSN (next_use_insn))
+			    {
+			      end_sequence ();
+			      return GEN_INT (1);
+			    }
+			}
+		    }
+		}
+	    }
+	  emit_insn (PATTERN (insn));
+	}
+    }
+  seq = get_insns ();
+  end_sequence ();
+
+  /* Re-write the entire BB contents with the transformed
+     sequence.  */
+  FOR_BB_INSNS_SAFE (body, insn, cur_insn)
+    if (!(GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn)))
+      delete_insn (insn);
+  for (insn = seq; NEXT_INSN (insn); insn = NEXT_INSN (insn))
+    if (NOTE_P (insn))
+      emit_note_after ((enum insn_note) NOTE_KIND (insn), BB_END (body));
+    else if (DEBUG_INSN_P (insn))
+      emit_debug_insn_after (PATTERN (insn), BB_END (body));
+    else
+      emit_insn_after (PATTERN (insn), BB_END (body));
+
+  emit_jump_insn_after (PATTERN (insn), BB_END (body));
+  return GEN_INT (decrementnum);
+}
+
+/* Target hook to place the number of elements to be processed by a
+   dlstp/letp loop into `count`, to initialise the loop counter register.
+   The number of elements was previously extracted from the vctp insn and
+   placed into a USE rtx.
+   We only check that the doloop_end pattern successfully decrements by a
+   number other than -1 for a valid dlstp/letp loop.  No other checking is
+   needed as that was done previously.  */
+
+rtx
+arm_allow_elementwise_doloop_p (rtx count, rtx label, rtx doloop)
+{
+  if (doloop
+      && INTVAL (XEXP (SET_SRC (XVECEXP (PATTERN (doloop), 0, 1)), 1)) != -1)
+    {
+      basic_block body = BLOCK_FOR_INSN (label)->prev_bb;
+      rtx_insn *insn;
+      FOR_BB_INSNS (body, insn)
+	{
+	  if (INSN_P (insn) && GET_CODE (PATTERN (insn)) == USE)
+	    {
+	      rtx num_elem_reg = copy_rtx (XEXP (PATTERN (insn), 0));
+	      delete_insn (insn);
+	      return num_elem_reg;
+	    }
+	}
+    }
+  return count;
+}
 
 #if CHECKING_P
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 39895ad62aa3afd55d3cbc92c55b45bc56710bcb..95024fe01612c0dfcd3a7dc97d3f1eda1fccc063 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1464,6 +1464,10 @@
		       (VADCIQ_M_S "s")
		       (SQRSHRL_64 "64") (SQRSHRL_48 "48") (UQRSHLL_64 "64")
		       (UQRSHLL_48 "48") (VSHLCQ_M_S "s") (VSHLCQ_M_U "u")])
+
+(define_int_attr mode1 [(DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32")
+			(DLSTP64 "64")])
+
 ;; Both kinds of return insn.
 (define_code_iterator RETURNS [return simple_return])
 (define_code_attr return_str [(return "") (simple_return "simple_")])
@@ -1769,6 +1773,8 @@
 (define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48])
 (define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48])
 (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
+(define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32
+			    DLSTP64])
 
 ;; Define iterators for VCMLA operations
 (define_int_iterator VCMLA_OP [UNSPEC_VCMLA
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 854b8ab935f82ad0eb99e6af9852ce8154cf9d9d..8dd3567759e5d794b1039c114ccb13765a131304 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -11100,3 +11100,38 @@
 }
   DONE;
 })
+
+;; Originally expanded by 'predicated_doloop_end'.
+(define_insn "*predicated_doloop_end_internal"
+  [(set (pc)
+	(if_then_else
+	   (ge (plus:SI (reg:SI LR_REGNUM)
+			(match_operand:SI 0 "const_int_operand" ""))
+	       (const_int 0))
+	 (label_ref (match_operand 1 "" ""))
+	 (pc)))
+   (set (reg:SI LR_REGNUM)
+	(plus:SI (reg:SI LR_REGNUM) (match_dup 0)))
+   (clobber (reg:CC CC_REGNUM))]
+  "TARGET_32BIT && TARGET_HAVE_LOB && TARGET_HAVE_MVE && TARGET_THUMB2"
+  {
+    if (get_attr_length (insn) == 4)
+      return "letp\t%|lr, %l1";
+    else
+      return "subs\t%|lr, #%0;bgt\t%l1";
+  }
+  [(set (attr "length")
+	(if_then_else
+	   (ltu (minus (pc) (match_dup 1)) (const_int 1024))
+	   (const_int 4)
+	   (const_int 6)))
+   (set_attr "type" "branch")])
+
+(define_insn "dlstp<mode1>_insn"
+  [
+    (set (reg:SI LR_REGNUM)
+	 (unspec:SI [(match_operand:SI 0 "s_register_operand" "r")]
+	  DLSTP))
+  ]
+  "TARGET_32BIT && TARGET_HAVE_LOB && TARGET_HAVE_MVE && TARGET_THUMB2"
+  "dlstp.<mode1>\t%|lr, %0")
\ No newline at end of file
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index e1e013befa7a67ddbf517bf22797bdaeeb96b94f..3beb4d4b48cd47c8e946fba53c49975ff8271ac1 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -1613,7 +1613,7 @@
    (use (match_operand 1 "" ""))]	; label
   "TARGET_32BIT"
   "
- {
+{
   /* Currently SMS relies on the do-loop pattern to recognize loops
      where (1) the control part consists of all insns defining and/or
      using a certain 'count' register and (2) the loop count can be
@@ -1623,41 +1623,67 @@
      Also used to implement the low over head loops feature, which is
      part of the Armv8.1-M Mainline Low Overhead Branch (LOB) extension.  */
-  if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
-    {
-      rtx s0;
-      rtx bcomp;
-      rtx loc_ref;
-      rtx cc_reg;
-      rtx insn;
-      rtx cmp;
-
-      if (GET_MODE (operands[0]) != SImode)
-	FAIL;
-
-      s0 = operands [0];
-
-      /* Low over head loop instructions require the first operand to be LR.  */
-      if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands [1]))
-	s0 = gen_rtx_REG (SImode, LR_REGNUM);
-
-      if (TARGET_THUMB2)
-	insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));
-      else
-	insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
-
-      cmp = XVECEXP (PATTERN (insn), 0, 0);
-      cc_reg = SET_DEST (cmp);
-      bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
-      loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [1]);
-      emit_jump_insn (gen_rtx_SET (pc_rtx,
-				   gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
-							 loc_ref, pc_rtx)));
-      DONE;
-    }
-  else
-    FAIL;
-  }")
+  if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
+    {
+      rtx s0;
+      rtx bcomp;
+      rtx loc_ref;
+      rtx cc_reg;
+      rtx insn;
+      rtx cmp;
+      rtx decrement_num;
+
+      if (GET_MODE (operands[0]) != SImode)
+	FAIL;
+
+      s0 = operands[0];
+
+      if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands[1]))
+	{
+	  s0 = gen_rtx_REG (SImode, LR_REGNUM);
+
+	  /* If we have a compatible MVE target, try and analyse the loop
+	     contents to determine if we can use predicated dlstp/letp
+	     looping.  */
+	  if (TARGET_HAVE_MVE && TARGET_THUMB2
+	      && (decrement_num = arm_attempt_dlstp_transform (operands[1]))
+	      && (INTVAL (decrement_num) != 1))
+	    {
+	      insn = emit_insn
+		      (gen_thumb2_addsi3_compare0
+			  (s0, s0, GEN_INT ((-1) * (INTVAL (decrement_num)))));
+	      cmp = XVECEXP (PATTERN (insn), 0, 0);
+	      cc_reg = SET_DEST (cmp);
+	      bcomp = gen_rtx_GE (VOIDmode, cc_reg, const0_rtx);
+	      loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
+	      emit_jump_insn (gen_rtx_SET (pc_rtx,
+					   gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
+								 loc_ref, pc_rtx)));
+	      DONE;
+	    }
+
+	  /* Otherwise, try standard decrement-by-one dls/le looping.  */
+	  if (TARGET_THUMB2)
+	    insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0,
+							  GEN_INT (-1)));
+	  else
+	    insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
+
+	  cmp = XVECEXP (PATTERN (insn), 0, 0);
+	  cc_reg = SET_DEST (cmp);
+	  bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
+	  loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
+	  emit_jump_insn (gen_rtx_SET (pc_rtx,
+				       gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
+							     loc_ref, pc_rtx)));
+	  DONE;
+	}
+      else
+	FAIL;
+    }
+  else
+    FAIL;
+}")
 
 (define_insn "*clear_apsr"
   [(unspec_volatile:SI [(const_int 0)] VUNSPEC_CLRM_APSR)
@@ -1755,7 +1781,37 @@
 {
   if (REGNO (operands[0]) == LR_REGNUM)
     {
-      emit_insn (gen_dls_insn (operands[0]));
+      /* Pick out the number by which we are decrementing the loop counter
+	 in every iteration.  If it's > 1, then use dlstp.  */
+      int const_int_dec_num
+	= abs (INTVAL (XEXP (XEXP (XVECEXP (PATTERN (operands[1]), 0, 1),
+				   1),
+			     1)));
+      switch (const_int_dec_num)
+	{
+	case 16:
+	  emit_insn (gen_dlstp8_insn (operands[0]));
+	  break;
+
+	case 8:
+	  emit_insn (gen_dlstp16_insn (operands[0]));
+	  break;
+
+	case 4:
+	  emit_insn (gen_dlstp32_insn (operands[0]));
+	  break;
+
+	case 2:
+	  emit_insn (gen_dlstp64_insn (operands[0]));
+	  break;
+
+	case 1:
+	  emit_insn (gen_dls_insn (operands[0]));
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
       DONE;
     }
   else
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 84384ee798de363b874c41a16dc5daae34eccb94..e533c60215a49fe0aabfa8b5fb6d988faf3c4f51 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -581,6 +581,10 @@
   VADDLVQ_U
   VCTP
   VCTP_M
+  DLSTP8
+  DLSTP16
+  DLSTP32
+  DLSTP64
   VPNOT
   VCREATEQ_F
   VCVTQ_N_TO_F_S
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index c6c891972d1e58cd163b259ba96a599d62326865..4e3fcb1a845ed176386be41b3ace9f067fef9361 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11796,6 +11796,14 @@ loops, and will help ivopts to make some decisions.
 The default version of this hook returns false.
 @end deftypefn
 
+@deftypefn {Target Hook} rtx TARGET_ALLOW_ELEMENTWISE_DOLOOP_P (rtx @var{count}, rtx @var{label}, rtx @var{doloop})
+This target hook allows the target to support loop-doloop optimisations
+where the value that gets put into the loop counter register is not a
+pre-calculation of the number of iterations of the loop.  For instance,
+the value used can be the number of elements that the loop will process.
+The default version of this hook returns the same rtx it was given.
+@end deftypefn
+
 @deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P
 Return true if the target supports hardware count register for decrement
 and branch.
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 613b2534149415f442163d599503efaf423b673b..f3e74dfd553c2b5a5857668f6835e762ce2c2616 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7732,6 +7732,8 @@ to by @var{ce_info}.
 
 @hook TARGET_PREDICT_DOLOOP_P
 
+@hook TARGET_ALLOW_ELEMENTWISE_DOLOOP_P
+
 @hook TARGET_HAVE_COUNT_REG_DECR_P
 
 @hook TARGET_DOLOOP_COST_FOR_GENERIC
diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc
index 4feb0a25ab9331b7124df900f73c9fc6fb3eb10b..38024e223439cac1ca00c0fe2af87d7628c8a815 100644
--- a/gcc/loop-doloop.cc
+++ b/gcc/loop-doloop.cc
@@ -85,29 +85,29 @@ doloop_condition_get (rtx_insn *doloop_pat)
      forms:
 
      1)  (parallel [(set (pc) (if_then_else (condition)
-	  			            (label_ref (label))
-				            (pc)))
-	 	     (set (reg) (plus (reg) (const_int -1)))
-		     (additional clobbers and uses)])
+					    (label_ref (label))
+					    (pc)))
+		     (set (reg) (plus (reg) (const_int -n)))
+		     (additional clobbers and uses)])
 
      The branch must be the first entry of the parallel (also required
     by jump.cc), and the second entry of the parallel must be a set of
     the loop counter register.  Some targets (IA-64) wrap the set of
     the loop counter in an if_then_else too.
 
-     2)  (set (reg) (plus (reg) (const_int -1))
-	 (set (pc) (if_then_else (reg != 0)
-	 			 (label_ref (label))
-				 (pc))).
+     2)  (set (reg) (plus (reg) (const_int -n))
+	 (set (pc) (if_then_else (reg != 0)
+				 (label_ref (label))
+				 (pc))).
Some targets (ARM) do the comparison before the branch, as in the following form: - 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0))) - (set (reg) (plus (reg) (const_int -1)))]) - (set (pc) (if_then_else (cc == NE) - (label_ref (label)) - (pc))) */ + 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -n), 0))) + (set (reg) (plus (reg) (const_int -n)))]) + (set (pc) (if_then_else (cc == NE) + (label_ref (label)) + (pc))) */ pattern = PATTERN (doloop_pat); @@ -143,7 +143,7 @@ doloop_condition_get (rtx_insn *doloop_pat) || GET_CODE (cmp_arg1) != PLUS) return 0; reg_orig = XEXP (cmp_arg1, 0); - if (XEXP (cmp_arg1, 1) != GEN_INT (-1) + if (!CONST_INT_P (XEXP (cmp_arg1, 1)) || !REG_P (reg_orig)) return 0; cc_reg = SET_DEST (cmp_orig); @@ -156,7 +156,8 @@ doloop_condition_get (rtx_insn *doloop_pat) { /* We expect the condition to be of the form (reg != 0) */ cond = XEXP (SET_SRC (cmp), 0); - if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx) + if ((GET_CODE (cond) != NE && GET_CODE (cond) != GE) + || XEXP (cond, 1) != const0_rtx) return 0; } } @@ -173,14 +174,14 @@ doloop_condition_get (rtx_insn *doloop_pat) if (! REG_P (reg)) return 0; - /* Check if something = (plus (reg) (const_int -1)). + /* Check if something = (plus (reg) (const_int -n)). On IA-64, this decrement is wrapped in an if_then_else. */ inc_src = SET_SRC (inc); if (GET_CODE (inc_src) == IF_THEN_ELSE) inc_src = XEXP (inc_src, 1); if (GET_CODE (inc_src) != PLUS || XEXP (inc_src, 0) != reg - || XEXP (inc_src, 1) != constm1_rtx) + || !CONST_INT_P (XEXP (inc_src, 1))) return 0; /* Check for (set (pc) (if_then_else (condition) @@ -211,42 +212,49 @@ doloop_condition_get (rtx_insn *doloop_pat) || (GET_CODE (XEXP (condition, 0)) == PLUS && XEXP (XEXP (condition, 0), 0) == reg)) { - if (GET_CODE (pattern) != PARALLEL) /* For the second form we expect: - (set (reg) (plus (reg) (const_int -1)) - (set (pc) (if_then_else (reg != 0) - (label_ref (label)) - (pc))). + (set (reg) (plus (reg) (const_int -n)) + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))). - is equivalent to the following: + If n == 1, that is equivalent to the following: - (parallel [(set (pc) (if_then_else (reg != 1) - (label_ref (label)) - (pc))) - (set (reg) (plus (reg) (const_int -1))) - (additional clobbers and uses)]) + (parallel [(set (pc) (if_then_else (reg != 1) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -1))) + (additional clobbers and uses)]) - For the third form we expect: + For the third form we expect: - (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0)) - (set (reg) (plus (reg) (const_int -1)))]) - (set (pc) (if_then_else (cc == NE) - (label_ref (label)) - (pc))) + (parallel [(set (cc) (compare ((plus (reg) (const_int -n)), 0)) + (set (reg) (plus (reg) (const_int -n)))]) + (set (pc) (if_then_else (cc == NE) + (label_ref (label)) + (pc))) - which is equivalent to the following: + Which also for n == 1 is equivalent to the following: - (parallel [(set (cc) (compare (reg, 1)) - (set (reg) (plus (reg) (const_int -1))) - (set (pc) (if_then_else (NE == cc) - (label_ref (label)) - (pc))))]) + (parallel [(set (cc) (compare (reg, 1)) + (set (reg) (plus (reg) (const_int -1))) + (set (pc) (if_then_else (NE == cc) + (label_ref (label)) + (pc))))]) - So we return the second form instead for the two cases. + So we return the second form instead for the two cases. + For the "elementwise" form where the decrement number isn't -1, + the final value may be exceeded, so use GE instead of NE. 
+     */
 
-  condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
+  if (GET_CODE (pattern) != PARALLEL)
+    {
+      if (INTVAL (XEXP (inc_src, 1)) != -1)
+	condition = gen_rtx_fmt_ee (GE, VOIDmode, inc_src, const0_rtx);
+      else
+	condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
+    }
 
   return condition;
 }
@@ -685,17 +693,6 @@ doloop_optimize (class loop *loop)
       return false;
     }
 
-  max_cost
-    = COSTS_N_INSNS (param_max_iterations_computation_cost);
-  if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop))
-      > max_cost)
-    {
-      if (dump_file)
-	fprintf (dump_file,
-		 "Doloop: number of iterations too costly to compute.\n");
-      return false;
-    }
-
   if (desc->const_iter)
     iterations = widest_int::from (rtx_mode_t (desc->niter_expr, mode),
				   UNSIGNED);
@@ -722,6 +719,23 @@ doloop_optimize (class loop *loop)
   doloop_reg = gen_reg_rtx (mode);
   rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
 
+  /* Not all targets need to pre-calculate the number of iterations of the
+     loop; some instead work by storing the number of elements in the
+     counter_reg and decrementing that.  Call the appropriate target hook to
+     change the value of count.  */
+  count = targetm.allow_elementwise_doloop_p (count, start_label, doloop_seq);
+
+  max_cost
+    = COSTS_N_INSNS (param_max_iterations_computation_cost);
+  if (set_src_cost (count, mode, optimize_loop_for_speed_p (loop))
+      > max_cost)
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "Doloop: number of iterations too costly to compute.\n");
+      return false;
+    }
+
   word_mode_size = GET_MODE_PRECISION (word_mode);
   word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1;
   if (! doloop_seq
diff --git a/gcc/target.def b/gcc/target.def
index db8af0cbe81624513f114fc9bbd8be61d855f409..e799cd97de50b814b0428c0f2960912a6091d330 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4411,6 +4411,16 @@ The default version of this hook returns false.",
 bool, (class loop *loop),
 default_predict_doloop_p)
 
+DEFHOOK
+(allow_elementwise_doloop_p,
+ "This target hook allows the target to support loop-doloop optimisations\n\
+where the value that gets put into the loop counter register is not a\n\
+pre-calculation of the number of iterations of the loop.
For instance,\n\ +the value used can be the number of elements that the loop will process.\n\ +The default version of this hook returns the same rtx it was given.", + rtx, (rtx count, rtx label, rtx doloop), + default_allow_elementwise_doloop_p) + DEFHOOKPOD (have_count_reg_decr_p, "Return true if the target supports hardware count register for decrement\n\ diff --git a/gcc/targhooks.h b/gcc/targhooks.h index a1df260f5483dc84f18d8f12c5202484a32d5bb7..e94ac1fe09dd48bd24210a8c8982f7a59e4cbc63 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -88,6 +88,7 @@ extern bool default_fixed_point_supported_p (void); extern bool default_has_ifunc_p (void); extern bool default_predict_doloop_p (class loop *); +extern rtx default_allow_elementwise_doloop_p (rtx, rtx, rtx); extern machine_mode default_preferred_doloop_mode (machine_mode); extern const char * default_invalid_within_doloop (const rtx_insn *); diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc index fe0116521feaf32187e7bc113bf93b1805852c79..de4eea9ec0302a6bc4e5274893925aecf2444676 100644 --- a/gcc/targhooks.cc +++ b/gcc/targhooks.cc @@ -661,6 +661,12 @@ default_predict_doloop_p (class loop *loop ATTRIBUTE_UNUSED) return false; } +rtx +default_allow_elementwise_doloop_p (rtx count, rtx, rtx) +{ + return count; +} + /* By default, just use the input MODE itself. */ machine_mode diff --git a/gcc/testsuite/gcc.target/arm/dlstp-compile-asm.c b/gcc/testsuite/gcc.target/arm/dlstp-compile-asm.c index acf0836050c19b983feeaf97c3e52e1318bb194d..e036b0386260a547a3a2e42d8f34f0913793e593 100644 --- a/gcc/testsuite/gcc.target/arm/dlstp-compile-asm.c +++ b/gcc/testsuite/gcc.target/arm/dlstp-compile-asm.c @@ -140,10 +140,196 @@ TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vbrsrq, _m) TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshlq, _m) TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshrq, _m) +/* Now test some more configurations. */ + +/* Test a for loop format of decrementing to zero */ +int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7}; +void test1 (int32_t *b, int num_elems) +{ + for (int i = num_elems; i >= 0; i-= 4) + { + mve_pred16_t p = vctp32q (i); + int32x4_t va = vldrwq_z_s32 (&(a[i]), p); + vstrwq_p_s32 (b + i, va, p); + } +} + +/* Iteration counter counting up to num_iter. */ +void test2 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 0; i < num_iter; i++) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n-=16; + } +} + +/* Using an unpredicated arithmetic instruction within the loop. */ +void test3 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + uint8x16_t vc = vaddq_u8 (va, vb); + uint8x16_t vd = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + vstrbq_p_u8 (d, vd, p); + n-=16; + } +} + +/* Using a different VPR value for one instruction in the loop. */ +void test4 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Generating and using a constant VPR value in the loop, with a vctp. 
*/ +void test5 (int32_t *a, int32_t *b, int32_t *c, int n, int g) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q (g); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Generating and using a different VPR value in the loop, with a vctp. */ +void test6 (int32_t *a, int32_t *b, int32_t *c, int n, int g) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q (g); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + g++; + } +} + +/* Generating and using a different VPR value in the loop, with a vctp_m. */ +void test7 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p2 = vctp32q_m (n, p1); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p2); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Generating and using a different VPR value in the loop, with a vctp_m that is tied to the base vctp VPR. */ +void test8 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q_m (n, p); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p1); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Generating and using a different VPR value in the loop, with a vcmp. */ +void test9 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + mve_pred16_t p1 = vcmpeqq_s32 (va, vb); + int32x4_t vc = vaddq_x_s32 (va, vb, p1); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Generating and using a different VPR value in the loop, with a vcmp_m. */ +void test10 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p2); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Generating and using a different VPR value in the loop, with a vcmp_m that is tied to the base vctp VPR. */ +void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p2); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + /* The final number of DLSTPs currently is calculated by the number of - `TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY.*` macros * 6. */ -/* { dg-final { scan-assembler-times {\tdlstp} 144 } } */ -/* { dg-final { scan-assembler-times {\tletp} 144 } } */ -/* { dg-final { scan-assembler-not "\tvctp\t" } } */ -/* { dg-final { scan-assembler-not "\tvpst\t" } } */ -/* { dg-final { scan-assembler-not "p0" } } */ + `TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY.*` macros * 6 + 11. 
*/ +/* { dg-final { scan-assembler-times {\tdlstp} 155 } } */ +/* { dg-final { scan-assembler-times {\tletp} 155 } } */ diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int16x8.c b/gcc/testsuite/gcc.target/arm/dlstp-int16x8.c new file mode 100644 index 0000000000000000000000000000000000000000..b1c03b618e0dcc7d4268f2f004663b6e332a402a --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/dlstp-int16x8.c @@ -0,0 +1,68 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */ + +#include +#include +#include +#include "lob.h" + +void __attribute__ ((noinline)) test (int16_t *a, int16_t *b, int16_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + int16x8_t va = vldrhq_z_s16 (a, p); + int16x8_t vb = vldrhq_z_s16 (b, p); + int16x8_t vc = vaddq_x_s16 (va, vb, p); + vstrhq_p_s16 (c, vc, p); + c+=8; + a+=8; + b+=8; + n-=8; + } +} + +int main () +{ + int i; + int16_t temp1[N]; + int16_t temp2[N]; + int16_t temp3[N]; + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus16 (temp1, temp2, temp3, 0); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus16 (temp1, temp2, temp3, 1); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 7); + check_plus16 (temp1, temp2, temp3, 7); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 8); + check_plus16 (temp1, temp2, temp3, 8); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 9); + check_plus16 (temp1, temp2, temp3, 9); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 16); + check_plus16 (temp1, temp2, temp3, 16); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 17); + check_plus16 (temp1, temp2, temp3, 17); + + reset_data16 (temp1, temp2, temp3, N); +} + +/* { dg-final { scan-assembler-times {\tdlstp.16} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int32x4.c b/gcc/testsuite/gcc.target/arm/dlstp-int32x4.c new file mode 100644 index 0000000000000000000000000000000000000000..4a3eb0577be0631ab4e07f6f75c2c802b535c88c --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/dlstp-int32x4.c @@ -0,0 +1,68 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */ + +#include +#include +#include +#include "lob.h" + +void __attribute__ ((noinline)) test (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} + +int main () +{ + int i; + int32_t temp1[N]; + int32_t temp2[N]; + int32_t temp3[N]; + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus32 (temp1, temp2, temp3, 0); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); 
+ check_plus32 (temp1, temp2, temp3, 1); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 3); + check_plus32 (temp1, temp2, temp3, 3); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 4); + check_plus32 (temp1, temp2, temp3, 4); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 5); + check_plus32 (temp1, temp2, temp3, 5); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 8); + check_plus32 (temp1, temp2, temp3, 8); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 9); + check_plus32 (temp1, temp2, temp3, 9); + + reset_data32 (temp1, temp2, temp3, N); +} + +/* { dg-final { scan-assembler-times {\tdlstp.32} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int64x2.c b/gcc/testsuite/gcc.target/arm/dlstp-int64x2.c new file mode 100644 index 0000000000000000000000000000000000000000..f05ff834363e2a47a9faddb6b8b09b60eb94a3c2 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/dlstp-int64x2.c @@ -0,0 +1,68 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */ + +#include +#include +#include +#include "lob.h" + +void __attribute__ ((noinline)) test (int64_t *a, int64_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp64q (n); + int64x2_t va = vldrdq_gather_offset_z_s64 (a, vcreateq_u64 (8, 0), p); + vstrdq_scatter_offset_p_s64 (c, vcreateq_u64 (8, 0), va, p); + c+=2; + a+=2; + n-=2; + } +} + +int main () +{ + int i; + int64_t temp1[N]; + int64_t temp3[N]; + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 0); + check_memcpy64 (temp1, temp3, 0); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 1); + check_memcpy64 (temp1, temp3, 1); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 2); + check_memcpy64 (temp1, temp3, 2); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 3); + check_memcpy64 (temp1, temp3, 3); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 4); + check_memcpy64 (temp1, temp3, 4); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 5); + check_memcpy64 (temp1, temp3, 5); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 6); + check_memcpy64 (temp1, temp3, 6); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 7); + check_memcpy64 (temp1, temp3, 7); + + reset_data64 (temp1, temp3, N); +} + +/* { dg-final { scan-assembler-times {\tdlstp.64} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int8x16.c b/gcc/testsuite/gcc.target/arm/dlstp-int8x16.c new file mode 100644 index 0000000000000000000000000000000000000000..f281ba7848d1ba3321124e996eb5bea44977c2ab --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/dlstp-int8x16.c @@ -0,0 +1,68 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options 
"-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */ + +#include +#include +#include +#include "lob.h" + +void __attribute__ ((noinline)) test (int8_t *a, int8_t *b, int8_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + int8x16_t vb = vldrbq_z_s8 (b, p); + int8x16_t vc = vaddq_x_s8 (va, vb, p); + vstrbq_p_s8 (c, vc, p); + c+=16; + a+=16; + b+=16; + n-=16; + } +} + +int main () +{ + int i; + int8_t temp1[N]; + int8_t temp2[N]; + int8_t temp3[N]; + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus8 (temp1, temp2, temp3, 0); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus8 (temp1, temp2, temp3, 1); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 15); + check_plus8 (temp1, temp2, temp3, 15); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 16); + check_plus8 (temp1, temp2, temp3, 16); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 17); + check_plus8 (temp1, temp2, temp3, 17); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 32); + check_plus8 (temp1, temp2, temp3, 32); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 33); + check_plus8 (temp1, temp2, temp3, 33); + + reset_data8 (temp1, temp2, temp3, N); +} + +/* { dg-final { scan-assembler-times {\tdlstp.8} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/dlstp-invalid-asm.c b/gcc/testsuite/gcc.target/arm/dlstp-invalid-asm.c new file mode 100644 index 0000000000000000000000000000000000000000..b250625ce426400ad6be1edcb77e854cc6b79dc9 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/dlstp-invalid-asm.c @@ -0,0 +1,210 @@ + +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */ + +#include + +/* Terminating on a non-zero number of elements. */ +void test1 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + while (n > 1) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Similar, terminating on a non-zero number of elements, but in a for loop + format. */ +int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7}; +void test2 (int32_t *b, int num_elems) +{ + for (int i = num_elems; i >= 2; i-= 4) + { + mve_pred16_t p = vctp32q (i); + int32x4_t va = vldrwq_z_s32 (&(a[i]), p); + vstrwq_p_s32 (b + i, va, p); + } +} + +/* Iteration counter counting up to num_iter, with a non-zero starting num. 
*/ +void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 1; i < num_iter; i++) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Iteration counter counting up to num_iter, with a larger increment */ +void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 0; i < num_iter; i+=2) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Using an unpredicated store instruction within the loop. */ +void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_u8 (va, vb); + uint8x16_t vd = vaddq_x_u8 (va, vb, p); + vstrbq_u8 (d, vd); + n -= 16; + } +} + +/* Using an unpredicated vcmp to generate a new predicate value in the + loop and then using it in a store insn. */ +void test14 (int32_t *a, int32_t *b, int32x4_t vc, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + mve_pred16_t p1 = vcmpeqq_s32 (va, vc); + vstrwq_p_s32 (c, vc, p1); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using an unpredicated store outside the loop. */ +void test6 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p); + vx = vaddq_u8 (vx, vc); + a += 16; + b += 16; + n -= 16; + } + vstrbq_u8 (c, vx); +} + +/* Using an unpredicated op with a scalar output, where the result is valid + outside the bb. */ +uint8_t test7 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx) +{ + uint8_t sum = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p); + sum += vaddvq_u8 (vc); + a += 16; + b += 16; + n -= 16; + } + return sum; +} + +/* Using an unpredicated op with a scalar output, then a scalar op, + where the result is valid outside the bb. */ +uint8_t test8 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx, int g) +{ + uint8_t sum = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p); + sum += vaddvq_u8 (vc); + sum += g; + a += 16; + b += 16; + n -= 16; + } + return sum; +} + +/* Using a VPR that gets modified within the loop. */ +void test9 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + p++; + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using a VPR that gets re-generated within the loop. 
*/ +void test10 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + mve_pred16_t p = vctp32q (n); + while (n > 0) + { + int32x4_t va = vldrwq_z_s32 (a, p); + p = vctp32q (n); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using vctp32q_m instead of vctp32q. */ +void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q_m (n, p0); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* { dg-final { scan-assembler-not "\tdlstp" } } */ +/* { dg-final { scan-assembler-not "\tletp" } } */ diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h index feaae7cc89959b3147368980120700bbc3e85ecb..3941fe7a8b620e62a5f742722be1ba2d031f5a8d 100644 --- a/gcc/testsuite/gcc.target/arm/lob.h +++ b/gcc/testsuite/gcc.target/arm/lob.h @@ -1,15 +1,131 @@ #include - +#include /* Common code for lob tests. */ #define NO_LOB asm volatile ("@ clobber lr" : : : "lr" ) -#define N 10000 +#define N 100 + +static void +reset_data (int *a, int *b, int *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data8 (int8_t *a, int8_t *b, int8_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data16 (int16_t *a, int16_t *b, int16_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data32 (int32_t *a, int32_t *b, int32_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data64 (int64_t *a, int64_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (c, 0, x * sizeof (*c)); +} + +static void +check_plus (int *a, int *b, int *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus8 (int8_t *a, int8_t *b, int8_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus16 (int16_t *a, int16_t *b, int16_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus32 (int32_t *a, int32_t *b, int32_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} static void -reset_data (int *a, int *b, int *c) +check_memcpy64 (int64_t *a, int64_t *c, int x) { - memset (a, -1, N * sizeof (*a)); - memset (b, -1, N * sizeof (*b)); - memset (c, -1, N * sizeof (*c)); + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != a[i]) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } } diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c index ba5c82cd55c582c96a18ad417a3041e43d843613..c8ce653a5c39fb1ffcf82a6e584d9a0467a130c0 100644 --- a/gcc/testsuite/gcc.target/arm/lob1.c +++ 
b/gcc/testsuite/gcc.target/arm/lob1.c @@ -54,29 +54,18 @@ loop3 (int *a, int *b, int *c) } while (i < N); } -void -check (int *a, int *b, int *c) -{ - for (int i = 0; i < N; i++) - { - NO_LOB; - if (c[i] != a[i] + b[i]) - abort (); - } -} - int main (void) { - reset_data (a, b, c); + reset_data (a, b, c, N); loop1 (a, b ,c); - check (a, b ,c); - reset_data (a, b, c); + check_plus (a, b, c, N); + reset_data (a, b, c, N); loop2 (a, b ,c); - check (a, b ,c); - reset_data (a, b, c); + check_plus (a, b, c, N); + reset_data (a, b, c, N); loop3 (a, b ,c); - check (a, b ,c); + check_plus (a, b, c, N); return 0; } diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c index 17b6124295e8ae9e1cb57e41fa43a954b3390eec..4fe116e2c2be3748d1bb6da7bb9092db8f962abc 100644 --- a/gcc/testsuite/gcc.target/arm/lob6.c +++ b/gcc/testsuite/gcc.target/arm/lob6.c @@ -79,14 +79,14 @@ check (void) int main (void) { - reset_data (a1, b1, c1); - reset_data (a2, b2, c2); + reset_data (a1, b1, c1, N); + reset_data (a2, b2, c2, N); loop1 (a1, b1, c1); ref1 (a2, b2, c2); check (); - reset_data (a1, b1, c1); - reset_data (a2, b2, c2); + reset_data (a1, b1, c1, N); + reset_data (a2, b2, c2, N); loop2 (a1, b1, c1); ref2 (a2, b2, c2); check ();