diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index f2e3d905dbb..7a6d24a8006 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -7366,7 +7366,8 @@ ;; knows what to generate. (define_expand "doloop_end" [(use (match_operand 0 "" "")) ; loop pseudo - (use (match_operand 1 "" ""))] ; label + (use (match_operand 1 "" "")) ; label + (use (match_operand 2 "" ""))] ; decrement constant "optimize > 0 && flag_modulo_sched" { rtx s0; diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index 458d3edf716..a4b3d1addbf 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -4985,12 +4985,13 @@ archs4x, archs4xd" (pc))) (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1))) (unspec:SI [(const_int 0)] UNSPEC_ARC_LP) - (clobber (match_dup 2))])] + (clobber (match_dup 3)) + (match_operand 2 "" "")])] "" { if (GET_MODE (operands[0]) != SImode) FAIL; - operands[2] = gen_rtx_SCRATCH (SImode); + operands[3] = gen_rtx_SCRATCH (SImode); }) (define_insn "arc_lp" diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 550272facd1..7684620f0f4 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -63,7 +63,7 @@ extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *); extern bool arm_q_bit_access (void); extern bool arm_ge_bits_access (void); extern bool arm_target_insn_ok_for_lob (rtx); - +extern rtx arm_attempt_dlstp_transform (rtx, rtx); #ifdef RTX_CODE enum reg_class arm_mode_base_reg_class (machine_mode); diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index ee8f1babf8a..72d0187eb00 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -470,6 +470,9 @@ static const struct attribute_spec arm_attribute_table[] = #undef TARGET_SCHED_REORDER #define TARGET_SCHED_REORDER arm_sched_reorder +#undef TARGET_ALLOW_ELEMENTWISE_DOLOOP_P +#define TARGET_ALLOW_ELEMENTWISE_DOLOOP_P arm_allow_elementwise_doloop_p + #undef 
TARGET_REGISTER_MOVE_COST #define TARGET_REGISTER_MOVE_COST arm_register_move_cost @@ -34138,8 +34141,370 @@ arm_target_insn_ok_for_lob (rtx insn) return single_succ_p (bb) && single_pred_p (bb) - && single_succ_edge (bb)->dest == single_pred_edge (bb)->src - && contains_no_active_insn_p (bb); + && single_succ_edge (bb)->dest == single_pred_edge (bb)->src; +} + +static int +arm_mve_get_vctp_lanes (rtx x) +{ + if (GET_CODE (x) == SET && GET_CODE (XEXP (x, 1)) == UNSPEC) + { + switch (XINT (XEXP (x, 1), 1)) + { + case VCTP8Q: + return 16; + case VCTP16Q: + return 8; + case VCTP32Q: + return 4; + case VCTP64Q: + return 2; + default: + break; + } + } + return 0; +} + +/* Check if an insn requires the use of the VPR_REG, if it does, return the + sub-rtx of the matched operand. If there are more than one operand (e.g. an + input operand and an output operand) that use VPR_REG, return the first + occurance, which is usually the output operand. */ + +static rtx +arm_get_required_vpr_reg (rtx_insn *insn) +{ + if (!NONJUMP_INSN_P (insn)) + return NULL_RTX; + + bool requires_vpr; + extract_constrain_insn (insn); + int n_operands = recog_data.n_operands; + if (recog_data.n_alternatives == 0) + return NULL_RTX; + + /* Fill in recog_op_alt with information about the constraints of + this insn. */ + preprocess_constraints (insn); + + for (int use = 0; use < n_operands; use++) + { + requires_vpr = true; + /* Iterate through alternatives of operand "use" in recog_op_alt and + identify if the operand is required to be the VPR. */ + for (int alt1 = 0; alt1 < recog_data.n_alternatives; alt1++) + { + const operand_alternative *op_alt1 + = &recog_op_alt[alt1 * n_operands]; + /* Fetch the reg_class for each entry and check it against the + * VPR_REG reg_class. 
*/ + if (alternative_class (op_alt1, use) != VPR_REG) + requires_vpr = false; + } + /* If all alternatives of the insn require the VPR reg for this operand, + it means that either this is a VPR-generating instruction, like a vctp, + vcmp, etc., or it is a VPT-predicated instruction. Return the sub-rtx + of the VPR reg operand. */ + if (requires_vpr) + return recog_data.operand[use]; + } + return NULL_RTX; +} + +/* Scan the basic block of a loop body for a vctp instruction. If there is + exactly one unique vctp instruction, return its rtx_insn *. */ + +static rtx_insn * +arm_mve_get_loop_unique_vctp (basic_block bb) +{ + rtx_insn *insn = BB_HEAD (bb); + rtx_insn *vctp_op = NULL; + + /* Now scan through all the instruction patterns and + pick out any MVE instructions. */ + FOR_BB_INSNS (bb, insn) + { + if (INSN_P (insn)) + { + /* First check if this is a vctp instruction. There needs to be + exactly one vctp instruction within the loop. */ + if (arm_mve_get_vctp_lanes (PATTERN (insn)) != 0) + { + /* If we already found one vctp instruction, then the + loop is not consistent internally. */ + if (vctp_op) + return NULL; + + vctp_op = insn; + } + } + } + return vctp_op; +} + +rtx +arm_attempt_dlstp_transform (rtx label, rtx count) +{ + int decrementnum; + basic_block body = BLOCK_FOR_INSN (label)->prev_bb; + rtx initial_compare; + /* Doloop can only be done "elementwise" with predicated dlstp/letp + when the iteration counter gets decremented by the number of MVE + lanes. This can be extracted from the `count`, which is the expression + used to calculate the number of iterations that the loop would execute + for a standard dls/le loop. Since we only support cases where this is a + power of 2, we can assume that this expression arrives here as: + (lshiftrt: (A) (const_int y)) + Then we can extract the decrementnum from y. 
*/ + if (GET_CODE (count) == LSHIFTRT && ARITHMETIC_P (XEXP (count, 0)) + /* There is one final condition that needs to be met for the loop to be + transformable: dlstp/letp will continue looping until there are + elements still to process. This can only work if the looping ends + when the element counter reaches zero and not some other value + (e.g. n > 0 works, not n > 1), or we can incorrectly end up running + one additional iteration. To by-pass any hoisting that the compiler + may have done with the `A` in `count` above, we can instead look up + to the bb before the loop preheader: this should end with a cmp+jump + pair, where the cmp needs to be with (const_int 0). */ + && loop_preheader_edge (body->loop_father)->src->prev_bb + && BB_END (loop_preheader_edge (body->loop_father)->src->prev_bb) + && PREV_INSN (BB_END (loop_preheader_edge (body->loop_father) + ->src->prev_bb)) + && INSN_P (PREV_INSN (BB_END (loop_preheader_edge (body->loop_father) + ->src->prev_bb))) + && (initial_compare + = PATTERN (PREV_INSN (BB_END (loop_preheader_edge (body->loop_father) + ->src->prev_bb)))) + && GET_CODE (initial_compare) == SET + && cc_register (XEXP (initial_compare, 0), VOIDmode) + && GET_CODE (XEXP (initial_compare, 1)) == COMPARE + && CONST_INT_P (XEXP (XEXP (initial_compare, 1), 1)) + && INTVAL (XEXP (XEXP (initial_compare, 1), 1)) == 0) + { + /* Extract the integer decrement from the LSHIFTRT condition. */ + decrementnum = (1 << (INTVAL (XEXP (count, 1)))); + /* Find the vctp predicate generation inside the loop body BB. */ + rtx_insn *vctp_insn = arm_mve_get_loop_unique_vctp (body); + /* If we have successfully found exactly one vctp predicate-generating + instruction within the loop and the number by which we decrement the + loop counter in each iteration matches the number of lanes of the + vctp instruction, we can attempt to turn this into a dlstp/letp loop. 
+ */ + if (!vctp_insn + || decrementnum != arm_mve_get_vctp_lanes (PATTERN (vctp_insn))) + return GEN_INT (1); + + rtx_insn *insn = 0; + rtx_insn *cur_insn = 0; + rtx_insn *seq; + rtx vctp_vpr_generated = NULL_RTX; + rtx insn_vpr_reg_operand = NULL_RTX; + int new_icode; + + /* Scan through the insns in the loop bb and emit the transformed bb + insns to a sequence. */ + start_sequence (); + FOR_BB_INSNS (body, insn) + { + if (GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn)) + continue; + else if (NOTE_P (insn)) + emit_note ((enum insn_note) NOTE_KIND (insn)); + else if (!INSN_P (insn)) + { + end_sequence (); + return GEN_INT (1); + } + /* When we find the vctp instruction: This may be followed by + a sign-extend insn to SImode. If it is, then save the + sign-extended REG into vctp_vpr_generated. If there is no + sign-extend, then store the raw output of the vctp. + For any VPT-predicated instructions we need to ensure that + the VPR they use is the same as the one given here and + they often consume the output of a subreg of the SImode + sign-extended VPR-reg. As a result, comparing against the + output of the sign-extend is more likely to succeed. + This code also guarantees to us that the vctp comes before + any instructions that use the VPR within the loop, for the + dlstp/letp transform to succeed. */ + else if (insn == vctp_insn) + { + if (GET_CODE ( + XEXP (PATTERN (next_nonnote_nondebug_insn_bb (insn)), 1)) + == SIGN_EXTEND + && GET_CODE ( + XEXP (PATTERN (next_nonnote_nondebug_insn_bb ( + next_nonnote_nondebug_insn_bb (insn))), + 1)) + == SUBREG) + vctp_vpr_generated + = XEXP (PATTERN (next_nonnote_nondebug_insn_bb ( + next_nonnote_nondebug_insn_bb (insn))), + 0); + else + vctp_vpr_generated = XEXP (PATTERN (insn), 0); + /* Also emit a USE of the source register of the vctp. + This holds the number of elements being processed + by the loop. This later gets stored into `count`. 
+ */ + emit_use (XVECEXP (XEXP (PATTERN (insn), 1), 0, 0)); + continue; + } + /* If the insn pattern requires the use of the VPR, then it + is a VPT-predicated instruction. */ + else if ((insn_vpr_reg_operand = arm_get_required_vpr_reg (insn)) + != NULL_RTX) + { + /* If the VPR value is different to the one generated by + the vctp, then fail the conversion. */ + if (!rtx_equal_p (vctp_vpr_generated, insn_vpr_reg_operand)) + { + end_sequence (); + return GEN_INT (1); + } + /* If the insn does use the same VPR as the one generated + by the vctp, it will need to be transformed into its + non-predicated version. Also ensure that it's a valid + recog-ed instruction with the mve_unpredicated_insn + attribute. */ + else if (recog_memoized (insn) >= 0 + && (new_icode = get_attr_mve_unpredicated_insn (insn))) + { + extract_insn (insn); + rtx arr[8]; + int j = 0; + + /* When transforming a VPT-predicated instruction + into its unpredicated equivalent we need to drop + the VPR operand and we may need to also drop a + merge "vuninit" input operand, depending on the + instruction pattern. Here ensure that we have at + most a two-operand difference between the two + instructions. */ + int n_operands_diff = recog_data.n_operands + - insn_data[new_icode].n_operands; + gcc_assert (n_operands_diff > 0 && n_operands_diff <= 2); + + /* Then, loop through the operands of the predicated + instruction, and retain the ones that map to the + unpredicated instruction. */ + for (int i = 0; i < recog_data.n_operands; i++) + { + /* Ignore the VPR and, if needed, the vuninit + operand. */ + if (insn_vpr_reg_operand == recog_data.operand[i] + || (n_operands_diff == 2 + && !strcmp (recog_data.constraints[i], "0"))) + continue; + else + { + arr[j] = recog_data.operand[i]; + j++; + } + } + + /* Finally, emit the unpredicated instruction. 
*/ + switch (j) + { + case 1: + emit_insn (GEN_FCN (new_icode) (arr[0])); + break; + case 2: + emit_insn (GEN_FCN (new_icode) (arr[0], arr[1])); + break; + case 3: + emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], + arr[2])); + break; + case 4: + emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], + arr[3])); + break; + case 5: + emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], + arr[3], arr[4])); + break; + case 6: + emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], + arr[3], arr[4], + arr[5])); + break; + case 7: + emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], + arr[3], arr[4], arr[5], + arr[6])); + break; + default: + gcc_unreachable (); + } + } + /* If we can't identify the INSN as being either + for deletion or to re-map, then we don't know how to + handle it, so fail the whole conversion. */ + else + { + end_sequence (); + return GEN_INT (1); + } + } + /* Instructions that don't require the VPR can be carried + over as-is. */ + else if (DEBUG_INSN_P (insn)) + emit_debug_insn (PATTERN (insn)); + else + emit_insn (PATTERN (insn)); + } + seq = get_insns (); + end_sequence (); + + /* Re-write the entire BB contents with the transformed + sequence. */ + FOR_BB_INSNS_SAFE (body, insn, cur_insn) + if (!(GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn))) + delete_insn (insn); + for (insn = seq; NEXT_INSN (insn); insn = NEXT_INSN (insn)) + if (NOTE_P (insn)) + emit_note_after ((enum insn_note) NOTE_KIND (insn), BB_END (body)); + else if (DEBUG_INSN_P (insn)) + emit_debug_insn_after (PATTERN (insn), BB_END (body)); + else + emit_insn_after (PATTERN (insn), BB_END (body)); + + emit_jump_insn_after (PATTERN (insn), BB_END (body)); + return GEN_INT (decrementnum); + } + /* Bail out: we can't use dlstp/letp, so return 1 to allow loop-doloop to try + the standard dls/le pair. 
*/ + return GEN_INT (1); +} + +/* Target hook to store the number of elements to be processed by a dlstp/letp loop + into `count` to initialise the counter register. The number of elements was + previously extracted from the vctp insn and placed into a USE rtx. + We only check that the doloop_end pattern successfully decrements by a + number other than -1 for a valid dlstp/letp loop. No other checking is + needed as that was done previously. */ + +rtx +arm_allow_elementwise_doloop_p (rtx count, rtx label, rtx doloop) +{ + if (doloop + && INTVAL (XEXP (SET_SRC (XVECEXP (PATTERN (doloop), 0, 1)), 1)) != -1 + && ARITHMETIC_P (XEXP (count, 0))) + { + basic_block body = BLOCK_FOR_INSN (label)->prev_bb; + rtx_insn* insn; + FOR_BB_INSNS (body, insn) + { + if (INSN_P (insn) && GET_CODE (PATTERN (insn)) == USE) + { + rtx num_elem_reg = copy_rtx (XEXP (PATTERN (insn), 0)); + delete_insn (insn); + return num_elem_reg; + } + } + } + return count; } #if CHECKING_P diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 29062cd6fb3..aa2fdac22f3 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -1464,7 +1464,9 @@ (define_int_attr mode1 [(VCTP8Q "8") (VCTP16Q "16") (VCTP32Q "32") (VCTP64Q "64") (VCTP8Q_M "8") (VCTP16Q_M "16") - (VCTP32Q_M "32") (VCTP64Q_M "64")]) + (VCTP32Q_M "32") (VCTP64Q_M "64") + (DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32") + (DLSTP64 "64")]) ;; Both kinds of return insn. 
(define_code_iterator RETURNS [return simple_return]) @@ -1773,6 +1775,8 @@ (define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48]) (define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48]) (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U]) +(define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32 + DLSTP64]) ;; Define iterators for VCMLA operations (define_int_iterator VCMLA_OP [UNSPEC_VCMLA diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index b1c8c1c569f..3baddab5905 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -10837,3 +10837,38 @@ } DONE; }) + +;; Originally expanded by 'predicated_doloop_end'. +(define_insn "*predicated_doloop_end_internal" + [(set (pc) + (if_then_else + (ge (plus:SI (reg:SI LR_REGNUM) + (match_operand:SI 0 "const_int_operand" "")) + (const_int 0)) + (label_ref (match_operand 1 "" "")) + (pc))) + (set (reg:SI LR_REGNUM) + (plus:SI (reg:SI LR_REGNUM) (match_dup 0))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT && TARGET_HAVE_LOB && TARGET_HAVE_MVE && TARGET_THUMB2" + { + if (get_attr_length (insn) == 4) + return "letp\t%|lr, %l1"; + else + return "subs\t%|lr, #%0;bgt\t%l1"; + } + [(set (attr "length") + (if_then_else + (ltu (minus (pc) (match_dup 1)) (const_int 1024)) + (const_int 4) + (const_int 6))) + (set_attr "type" "branch")]) + +(define_insn "dlstp_insn" + [ + (set (reg:SI LR_REGNUM) + (unspec:SI [(match_operand:SI 0 "s_register_operand" "r")] + DLSTP)) + ] + "TARGET_32BIT && TARGET_HAVE_LOB && TARGET_HAVE_MVE && TARGET_THUMB2" + "dlstp.\t%|lr, %0") \ No newline at end of file diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index b2309a52165..a21e2909872 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -1610,10 +1610,11 @@ ;; knows what to generate. 
(define_expand "doloop_end" [(use (match_operand 0 "" "")) ; loop pseudo - (use (match_operand 1 "" ""))] ; label + (use (match_operand 1 "" "")) ; label + (use (match_operand 2 "" ""))] ; decrement constant "TARGET_32BIT" " - { +{ /* Currently SMS relies on the do-loop pattern to recognize loops where (1) the control part consists of all insns defining and/or using a certain 'count' register and (2) the loop count can be @@ -1623,41 +1624,68 @@ Also used to implement the low over head loops feature, which is part of the Armv8.1-M Mainline Low Overhead Branch (LOB) extension. */ - if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB)) - { - rtx s0; - rtx bcomp; - rtx loc_ref; - rtx cc_reg; - rtx insn; - rtx cmp; - - if (GET_MODE (operands[0]) != SImode) - FAIL; - - s0 = operands [0]; - - /* Low over head loop instructions require the first operand to be LR. */ - if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands [1])) - s0 = gen_rtx_REG (SImode, LR_REGNUM); - - if (TARGET_THUMB2) - insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1))); - else - insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1))); - - cmp = XVECEXP (PATTERN (insn), 0, 0); - cc_reg = SET_DEST (cmp); - bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx); - loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [1]); - emit_jump_insn (gen_rtx_SET (pc_rtx, - gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, - loc_ref, pc_rtx))); - DONE; - } - else - FAIL; - }") + if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB)) + { + rtx s0; + rtx bcomp; + rtx loc_ref; + rtx cc_reg; + rtx insn; + rtx cmp; + rtx decrement_num; + + if (GET_MODE (operands[0]) != SImode) + FAIL; + + s0 = operands[0]; + + if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands[1])) + { + s0 = gen_rtx_REG (SImode, LR_REGNUM); + + /* If we have a compatible MVE target, try and analyse the loop + contents to determine if we can use predicated dlstp/letp + looping. 
*/ + if (TARGET_HAVE_MVE && TARGET_THUMB2 + && (decrement_num = arm_attempt_dlstp_transform (operands[1], + operands[2])) + && (INTVAL (decrement_num) != 1)) + { + insn = emit_insn + (gen_thumb2_addsi3_compare0 + (s0, s0, GEN_INT ((-1) * (INTVAL (decrement_num))))); + cmp = XVECEXP (PATTERN (insn), 0, 0); + cc_reg = SET_DEST (cmp); + bcomp = gen_rtx_GE (VOIDmode, cc_reg, const0_rtx); + loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]); + emit_jump_insn (gen_rtx_SET (pc_rtx, + gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, + loc_ref, pc_rtx))); + DONE; + } + + /* Otherwise, try standard decrement-by-one dls/le looping. */ + if (TARGET_THUMB2) + insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, + GEN_INT (-1))); + else + insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1))); + + cmp = XVECEXP (PATTERN (insn), 0, 0); + cc_reg = SET_DEST (cmp); + bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx); + loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]); + emit_jump_insn (gen_rtx_SET (pc_rtx, + gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, + loc_ref, pc_rtx))); + DONE; + } + else + FAIL; + } + else + FAIL; +}") (define_insn "*clear_apsr" [(unspec_volatile:SI [(const_int 0)] VUNSPEC_CLRM_APSR) @@ -1755,7 +1783,37 @@ { if (REGNO (operands[0]) == LR_REGNUM) { - emit_insn (gen_dls_insn (operands[0])); + /* Pick out the number by which we are decrementing the loop counter + in every iteration. If it's > 1, then use dlstp. 
*/ + int const_int_dec_num + = abs (INTVAL (XEXP (XEXP (XVECEXP (PATTERN (operands[1]), 0, 1), + 1), + 1))); + switch (const_int_dec_num) + { + case 16: + emit_insn (gen_dlstp8_insn (operands[0])); + break; + + case 8: + emit_insn (gen_dlstp16_insn (operands[0])); + break; + + case 4: + emit_insn (gen_dlstp32_insn (operands[0])); + break; + + case 2: + emit_insn (gen_dlstp64_insn (operands[0])); + break; + + case 1: + emit_insn (gen_dls_insn (operands[0])); + break; + + default: + gcc_unreachable (); + } DONE; } else diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 7748e784379..744e7ab5731 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -579,6 +579,10 @@ VCTP16Q VCTP32Q VCTP64Q + DLSTP8 + DLSTP16 + DLSTP32 + DLSTP64 VPNOT VCREATEQ_F VCVTQ_N_TO_F_S diff --git a/gcc/config/bfin/bfin.md b/gcc/config/bfin/bfin.md index 56b24726bc2..3b31ee3a1ee 100644 --- a/gcc/config/bfin/bfin.md +++ b/gcc/config/bfin/bfin.md @@ -1959,7 +1959,8 @@ (plus:SI (match_dup 0) (const_int -1))) (unspec [(const_int 0)] UNSPEC_LSETUP_END) - (clobber (match_dup 2)) + (clobber (match_dup 3)) + (match_operand 2 "" "") (clobber (reg:BI REG_CC))])] ; match_scratch "" { @@ -1967,7 +1968,7 @@ if (GET_MODE (operands[0]) != SImode) FAIL; bfin_hardware_loop (); - operands[2] = gen_rtx_SCRATCH (SImode); + operands[3] = gen_rtx_SCRATCH (SImode); }) (define_insn "loop_end" diff --git a/gcc/config/c6x/c6x.md b/gcc/config/c6x/c6x.md index 60110410d0b..de4f4a56d99 100644 --- a/gcc/config/c6x/c6x.md +++ b/gcc/config/c6x/c6x.md @@ -1429,13 +1429,14 @@ (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1))) - (clobber (match_dup 2))])] ; match_scratch + (clobber (match_dup 3)) ; match_scratch + (match_operand 2 "" "")])] "TARGET_INSNS_64PLUS && optimize" { /* The loop optimizer doesn't check the predicates... 
*/ if (GET_MODE (operands[0]) != SImode) FAIL; - operands[2] = gen_rtx_SCRATCH (SImode); + operands[3] = gen_rtx_SCRATCH (SImode); }) (define_insn "mvilc" diff --git a/gcc/config/ia64/ia64.md b/gcc/config/ia64/ia64.md index 5d1d47da55b..5b24bd76ace 100644 --- a/gcc/config/ia64/ia64.md +++ b/gcc/config/ia64/ia64.md @@ -3956,7 +3956,8 @@ (define_expand "doloop_end" [(use (match_operand 0 "" "")) ; loop pseudo - (use (match_operand 1 "" ""))] ; label + (use (match_operand 1 "" "")) ; label + (use (match_operand 2 "" ""))] ; decrement constant "" { if (GET_MODE (operands[0]) != DImode) diff --git a/gcc/config/pdp11/pdp11.md b/gcc/config/pdp11/pdp11.md index a46efc1ef78..aa359955c3c 100644 --- a/gcc/config/pdp11/pdp11.md +++ b/gcc/config/pdp11/pdp11.md @@ -332,7 +332,8 @@ (pc))) (set (match_dup 0) (plus:HI (match_dup 0) - (const_int -1)))])] + (const_int -1))) + (match_operand 2 "" "")])] "TARGET_40_PLUS" "{ if (GET_MODE (operands[0]) != HImode) diff --git a/gcc/config/pru/pru.md b/gcc/config/pru/pru.md index bdc5ad79ba0..990cdfe0974 100644 --- a/gcc/config/pru/pru.md +++ b/gcc/config/pru/pru.md @@ -1636,7 +1636,8 @@ (define_expand "doloop_end" [(use (match_operand 0 "nonimmediate_operand")) - (use (label_ref (match_operand 1 "")))] + (use (label_ref (match_operand 1 ""))) + (use (match_operand 2 "" ""))] ; decrement constant "TARGET_OPT_LOOP" { if (GET_CODE (operands[0]) == REG && GET_MODE (operands[0]) == QImode) diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index ad5a4cf2ef8..fdaeb6a02ca 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -13422,7 +13422,8 @@ (define_expand "doloop_end" [(use (match_operand 0)) ; loop pseudo - (use (match_operand 1))] ; label + (use (match_operand 1)) ; label + (use (match_operand 2 "" ""))] ; decrement constant "" { if (GET_MODE (operands[0]) != Pmode) diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 962927c3112..bad317fdba7 100644 --- a/gcc/config/s390/s390.md 
+++ b/gcc/config/s390/s390.md @@ -9780,7 +9780,8 @@ (define_expand "doloop_end" [(use (match_operand 0 "" "")) ; loop pseudo - (use (match_operand 1 "" ""))] ; label + (use (match_operand 1 "" "")) ; label + (use (match_operand 2 "" ""))] ; decrement constant "" { if (GET_MODE (operands[0]) == SImode) diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md index 59a7b216433..5b8d74d6029 100644 --- a/gcc/config/sh/sh.md +++ b/gcc/config/sh/sh.md @@ -6404,7 +6404,8 @@ (pc))) (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1))) - (clobber (reg:SI T_REG))])] + (clobber (reg:SI T_REG)) + (match_operand 2 "" "")])] "TARGET_SH2" { if (GET_MODE (operands[0]) != SImode) diff --git a/gcc/config/v850/v850.md b/gcc/config/v850/v850.md index 6ca31e3f43f..8d0812abac7 100644 --- a/gcc/config/v850/v850.md +++ b/gcc/config/v850/v850.md @@ -1434,7 +1434,8 @@ (define_expand "doloop_end" [(use (match_operand 0 "" "")) ; loop pseudo - (use (match_operand 1 "" ""))] ; label + (use (match_operand 1 "" "")) ; label + (use (match_operand 2 "" ""))] ; decrement constant "TARGET_V850E3V5_UP && TARGET_LOOP" { rtx loop_cnt = operands[0]; diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md index 608110c20bc..57a4e2ca353 100644 --- a/gcc/config/xtensa/xtensa.md +++ b/gcc/config/xtensa/xtensa.md @@ -2016,13 +2016,14 @@ (plus:SI (match_dup 0) (const_int -1))) (unspec [(const_int 0)] UNSPEC_LSETUP_END) - (clobber (match_dup 2))])] ; match_scratch + (clobber (match_dup 3)) ; match_scratch + (match_operand 2 "" "")])] "TARGET_LOOPS && optimize" { /* The loop optimizer doesn't check the predicates... */ if (GET_MODE (operands[0]) != SImode) FAIL; - operands[2] = gen_rtx_SCRATCH (SImode); + operands[3] = gen_rtx_SCRATCH (SImode); }) diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 110f8dfa0a9..a8cfcd7497d 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -11774,6 +11774,14 @@ loops, and will help ivopts to make some decisions. 
The default version of this hook returns false. @end deftypefn +@deftypefn {Target Hook} rtx TARGET_ALLOW_ELEMENTWISE_DOLOOP_P (rtx @var{count}, rtx @var{label}, rtx @var{doloop}) +This target hook allows the target to support loop-doloop optimisations +where the value that gets put into the loop counter register is not a +pre-calculation of the number of iteration of the loop. For instance, +the value used can be the number of elements that the loop will process. +The default version of this hook returns the same rtx it was given. +@end deftypefn + @deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P Return true if the target supports hardware count register for decrement and branch. diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 501ddf147e4..024711cfdb9 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -7730,6 +7730,8 @@ to by @var{ce_info}. @hook TARGET_PREDICT_DOLOOP_P +@hook TARGET_ALLOW_ELEMENTWISE_DOLOOP_P + @hook TARGET_HAVE_COUNT_REG_DECR_P @hook TARGET_DOLOOP_COST_FOR_GENERIC diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc index 30b45c8071a..cbbc662f16b 100644 --- a/gcc/loop-doloop.cc +++ b/gcc/loop-doloop.cc @@ -85,29 +85,29 @@ doloop_condition_get (rtx_insn *doloop_pat) forms: 1) (parallel [(set (pc) (if_then_else (condition) - (label_ref (label)) - (pc))) - (set (reg) (plus (reg) (const_int -1))) - (additional clobbers and uses)]) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -n))) + (additional clobbers and uses)]) The branch must be the first entry of the parallel (also required by jump.cc), and the second entry of the parallel must be a set of the loop counter register. Some targets (IA-64) wrap the set of the loop counter in an if_then_else too. - 2) (set (reg) (plus (reg) (const_int -1)) - (set (pc) (if_then_else (reg != 0) - (label_ref (label)) - (pc))). + 2) (set (reg) (plus (reg) (const_int -n)) + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))). 
Some targets (ARM) do the comparison before the branch, as in the following form: - 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0))) - (set (reg) (plus (reg) (const_int -1)))]) - (set (pc) (if_then_else (cc == NE) - (label_ref (label)) - (pc))) */ + 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -n), 0))) + (set (reg) (plus (reg) (const_int -n)))]) + (set (pc) (if_then_else (cc == NE) + (label_ref (label)) + (pc))) */ pattern = PATTERN (doloop_pat); @@ -143,7 +143,7 @@ doloop_condition_get (rtx_insn *doloop_pat) || GET_CODE (cmp_arg1) != PLUS) return 0; reg_orig = XEXP (cmp_arg1, 0); - if (XEXP (cmp_arg1, 1) != GEN_INT (-1) + if (!CONST_INT_P (XEXP (cmp_arg1, 1)) || !REG_P (reg_orig)) return 0; cc_reg = SET_DEST (cmp_orig); @@ -156,7 +156,8 @@ doloop_condition_get (rtx_insn *doloop_pat) { /* We expect the condition to be of the form (reg != 0) */ cond = XEXP (SET_SRC (cmp), 0); - if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx) + if ((GET_CODE (cond) != NE && GET_CODE (cond) != GE) + || XEXP (cond, 1) != const0_rtx) return 0; } } @@ -173,14 +174,14 @@ doloop_condition_get (rtx_insn *doloop_pat) if (! REG_P (reg)) return 0; - /* Check if something = (plus (reg) (const_int -1)). + /* Check if something = (plus (reg) (const_int -n)). On IA-64, this decrement is wrapped in an if_then_else. */ inc_src = SET_SRC (inc); if (GET_CODE (inc_src) == IF_THEN_ELSE) inc_src = XEXP (inc_src, 1); if (GET_CODE (inc_src) != PLUS || XEXP (inc_src, 0) != reg - || XEXP (inc_src, 1) != constm1_rtx) + || !CONST_INT_P (XEXP (inc_src, 1))) return 0; /* Check for (set (pc) (if_then_else (condition) @@ -211,42 +212,49 @@ doloop_condition_get (rtx_insn *doloop_pat) || (GET_CODE (XEXP (condition, 0)) == PLUS && XEXP (XEXP (condition, 0), 0) == reg)) { - if (GET_CODE (pattern) != PARALLEL) /* For the second form we expect: - (set (reg) (plus (reg) (const_int -1)) - (set (pc) (if_then_else (reg != 0) - (label_ref (label)) - (pc))). 
+ (set (reg) (plus (reg) (const_int -n)) + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))). - is equivalent to the following: + If n == 1, that is equivalent to the following: - (parallel [(set (pc) (if_then_else (reg != 1) - (label_ref (label)) - (pc))) - (set (reg) (plus (reg) (const_int -1))) - (additional clobbers and uses)]) + (parallel [(set (pc) (if_then_else (reg != 1) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -1))) + (additional clobbers and uses)]) - For the third form we expect: + For the third form we expect: - (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0)) - (set (reg) (plus (reg) (const_int -1)))]) - (set (pc) (if_then_else (cc == NE) - (label_ref (label)) - (pc))) + (parallel [(set (cc) (compare ((plus (reg) (const_int -n)), 0)) + (set (reg) (plus (reg) (const_int -n)))]) + (set (pc) (if_then_else (cc == NE) + (label_ref (label)) + (pc))) - which is equivalent to the following: + Which also for n == 1 is equivalent to the following: - (parallel [(set (cc) (compare (reg, 1)) - (set (reg) (plus (reg) (const_int -1))) - (set (pc) (if_then_else (NE == cc) - (label_ref (label)) - (pc))))]) + (parallel [(set (cc) (compare (reg, 1)) + (set (reg) (plus (reg) (const_int -1))) + (set (pc) (if_then_else (NE == cc) + (label_ref (label)) + (pc))))]) - So we return the second form instead for the two cases. + So we return the second form instead for the two cases. + For the "elementwise" form where the decrement number isn't -1, + the final value may be exceeded, so use GE instead of NE. 
*/ - condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx); + if (GET_CODE (pattern) != PARALLEL) + { + if (INTVAL (XEXP (inc_src, 1)) != -1) + condition = gen_rtx_fmt_ee (GE, VOIDmode, inc_src, const0_rtx); + else + condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx); + } return condition; } @@ -685,17 +693,6 @@ doloop_optimize (class loop *loop) return false; } - max_cost - = COSTS_N_INSNS (param_max_iterations_computation_cost); - if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop)) - > max_cost) - { - if (dump_file) - fprintf (dump_file, - "Doloop: number of iterations too costly to compute.\n"); - return false; - } - if (desc->const_iter) iterations = widest_int::from (rtx_mode_t (desc->niter_expr, mode), UNSIGNED); @@ -720,7 +717,25 @@ count = copy_rtx (desc->niter_expr); start_label = block_label (desc->in_edge->dest); doloop_reg = gen_reg_rtx (mode); - rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label); + rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label, + count); + + /* Not all targets need to pre-calculate the number of iterations of + the loop; they instead work by storing the number of elements in the + counter_reg and decrementing that. Call the appropriate target hook to + change the value of count. 
*/ + count = targetm.allow_elementwise_doloop_p (count, start_label, doloop_seq); + + max_cost + = COSTS_N_INSNS (param_max_iterations_computation_cost); + if (set_src_cost (count, mode, optimize_loop_for_speed_p (loop)) + > max_cost) + { + if (dump_file) + fprintf (dump_file, + "Doloop: number of iterations too costly to compute.\n"); + return false; + } word_mode_size = GET_MODE_PRECISION (word_mode); word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1; @@ -737,7 +752,7 @@ doloop_optimize (class loop *loop) else count = lowpart_subreg (word_mode, count, mode); PUT_MODE (doloop_reg, word_mode); - doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label); + doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label, count); } if (! doloop_seq) { diff --git a/gcc/target-insns.def b/gcc/target-insns.def index de8c0092f98..b77b7972426 100644 --- a/gcc/target-insns.def +++ b/gcc/target-insns.def @@ -48,7 +48,7 @@ DEF_TARGET_INSN (casesi, (rtx x0, rtx x1, rtx x2, rtx x3, rtx x4)) DEF_TARGET_INSN (check_stack, (rtx x0)) DEF_TARGET_INSN (clear_cache, (rtx x0, rtx x1)) DEF_TARGET_INSN (doloop_begin, (rtx x0, rtx x1)) -DEF_TARGET_INSN (doloop_end, (rtx x0, rtx x1)) +DEF_TARGET_INSN (doloop_end, (rtx x0, rtx x1, rtx x2)) DEF_TARGET_INSN (eh_return, (rtx x0)) DEF_TARGET_INSN (epilogue, (void)) DEF_TARGET_INSN (exception_receiver, (void)) diff --git a/gcc/target.def b/gcc/target.def index a3d3b04a165..e74724c8a13 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -4392,6 +4392,16 @@ The default version of this hook returns false.", bool, (class loop *loop), default_predict_doloop_p) +DEFHOOK +(allow_elementwise_doloop_p, + "This target hook allows the target to support loop-doloop optimisations\n\ +where the value that gets put into the loop counter register is not a\n\ +pre-calculation of the number of iterations of the loop.
For instance,\n\ +the value used can be the number of elements that the loop will process.\n\ +The default version of this hook returns the same rtx it was given.", + rtx, (rtx count, rtx label, rtx doloop), + default_allow_elementwise_doloop_p) + DEFHOOKPOD (have_count_reg_decr_p, "Return true if the target supports hardware count register for decrement\n\ diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc index d17d393baed..04d08056dd7 100644 --- a/gcc/targhooks.cc +++ b/gcc/targhooks.cc @@ -661,6 +661,12 @@ default_predict_doloop_p (class loop *loop ATTRIBUTE_UNUSED) return false; } +rtx +default_allow_elementwise_doloop_p (rtx count, rtx, rtx) +{ + return count; +} + /* By default, just use the input MODE itself. */ machine_mode diff --git a/gcc/targhooks.h b/gcc/targhooks.h index ecce55ebe79..f9aba2e0813 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -88,6 +88,7 @@ extern bool default_fixed_point_supported_p (void); extern bool default_has_ifunc_p (void); extern bool default_predict_doloop_p (class loop *); +extern rtx default_allow_elementwise_doloop_p (rtx, rtx, rtx); extern machine_mode default_preferred_doloop_mode (machine_mode); extern const char * default_invalid_within_doloop (const rtx_insn *); diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int16x8.c b/gcc/testsuite/gcc.target/arm/dlstp-int16x8.c new file mode 100644 index 00000000000..a61f02ed3a2 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/dlstp-int16x8.c @@ -0,0 +1,68 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */ + +#include +#include +#include +#include "lob.h" + +void __attribute__ ((noinline)) test (int16_t *a, int16_t *b, int16_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + int16x8_t va = vldrhq_z_s16 (a, p); + 
int16x8_t vb = vldrhq_z_s16 (b, p); + int16x8_t vc = vaddq_x_s16 (va, vb, p); + vstrhq_p_s16 (c, vc, p); + c+=8; + a+=8; + b+=8; + n-=8; + } +} + +int main () +{ + int i; + int16_t temp1[N]; + int16_t temp2[N]; + int16_t temp3[N]; + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus16 (temp1, temp2, temp3, 0); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus16 (temp1, temp2, temp3, 1); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 7); + check_plus16 (temp1, temp2, temp3, 7); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 8); + check_plus16 (temp1, temp2, temp3, 8); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 9); + check_plus16 (temp1, temp2, temp3, 9); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 16); + check_plus16 (temp1, temp2, temp3, 16); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 17); + check_plus16 (temp1, temp2, temp3, 17); + + reset_data16 (temp1, temp2, temp3, N); +} + +/* { dg-final { scan-assembler-times {\tdlstp.16\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp\t} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp\t" } } */ +/* { dg-final { scan-assembler-not "\tvpst\t" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int32x4.c b/gcc/testsuite/gcc.target/arm/dlstp-int32x4.c new file mode 100644 index 00000000000..31a7264ae26 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/dlstp-int32x4.c @@ -0,0 +1,68 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */ + +#include +#include +#include +#include "lob.h" + +void __attribute__ ((noinline)) 
test (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} + +int main () +{ + int i; + int32_t temp1[N]; + int32_t temp2[N]; + int32_t temp3[N]; + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus32 (temp1, temp2, temp3, 0); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus32 (temp1, temp2, temp3, 1); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 3); + check_plus32 (temp1, temp2, temp3, 3); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 4); + check_plus32 (temp1, temp2, temp3, 4); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 5); + check_plus32 (temp1, temp2, temp3, 5); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 8); + check_plus32 (temp1, temp2, temp3, 8); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 9); + check_plus32 (temp1, temp2, temp3, 9); + + reset_data32 (temp1, temp2, temp3, N); +} + +/* { dg-final { scan-assembler-times {\tdlstp.32\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp\t} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp\t" } } */ +/* { dg-final { scan-assembler-not "\tvpst\t" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int64x2.c b/gcc/testsuite/gcc.target/arm/dlstp-int64x2.c new file mode 100644 index 00000000000..a09c01884da --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/dlstp-int64x2.c @@ -0,0 +1,68 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp 
-mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */ + +#include +#include +#include +#include "lob.h" + +void __attribute__ ((noinline)) test (int64_t *a, int64_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp64q (n); + int64x2_t va = vldrdq_gather_offset_z_s64 (a, vcreateq_u64 (8, 0), p); + vstrdq_scatter_offset_p_s64 (c, vcreateq_u64 (8, 0), va, p); + c+=2; + a+=2; + n-=2; + } +} + +int main () +{ + int i; + int64_t temp1[N]; + int64_t temp3[N]; + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 0); + check_memcpy64 (temp1, temp3, 0); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 1); + check_memcpy64 (temp1, temp3, 1); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 2); + check_memcpy64 (temp1, temp3, 2); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 3); + check_memcpy64 (temp1, temp3, 3); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 4); + check_memcpy64 (temp1, temp3, 4); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 5); + check_memcpy64 (temp1, temp3, 5); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 6); + check_memcpy64 (temp1, temp3, 6); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 7); + check_memcpy64 (temp1, temp3, 7); + + reset_data64 (temp1, temp3, N); +} + +/* { dg-final { scan-assembler-times {\tdlstp.64\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp\t} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp\t" } } */ +/* { dg-final { scan-assembler-not "\tvpst\t" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int8x16.c b/gcc/testsuite/gcc.target/arm/dlstp-int8x16.c new file mode 100644 index 00000000000..49fbd4c16a2 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/dlstp-int8x16.c @@ -0,0 +1,68 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { 
dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */ + +#include +#include +#include +#include "lob.h" + +void __attribute__ ((noinline)) test (int8_t *a, int8_t *b, int8_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + int8x16_t vb = vldrbq_z_s8 (b, p); + int8x16_t vc = vaddq_x_s8 (va, vb, p); + vstrbq_p_s8 (c, vc, p); + c+=16; + a+=16; + b+=16; + n-=16; + } +} + +int main () +{ + int i; + int8_t temp1[N]; + int8_t temp2[N]; + int8_t temp3[N]; + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus8 (temp1, temp2, temp3, 0); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus8 (temp1, temp2, temp3, 1); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 15); + check_plus8 (temp1, temp2, temp3, 15); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 16); + check_plus8 (temp1, temp2, temp3, 16); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 17); + check_plus8 (temp1, temp2, temp3, 17); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 32); + check_plus8 (temp1, temp2, temp3, 32); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 33); + check_plus8 (temp1, temp2, temp3, 33); + + reset_data8 (temp1, temp2, temp3, N); +} + +/* { dg-final { scan-assembler-times {\tdlstp.8\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp\t} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp\t" } } */ +/* { dg-final { scan-assembler-not "\tvpst\t" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h index feaae7cc899..3941fe7a8b6 100644 --- a/gcc/testsuite/gcc.target/arm/lob.h +++ b/gcc/testsuite/gcc.target/arm/lob.h @@ -1,15 +1,131 @@ #include - +#include /* Common code for lob tests. 
*/ #define NO_LOB asm volatile ("@ clobber lr" : : : "lr" ) -#define N 10000 +#define N 100 + +static void +reset_data (int *a, int *b, int *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data8 (int8_t *a, int8_t *b, int8_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data16 (int16_t *a, int16_t *b, int16_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data32 (int32_t *a, int32_t *b, int32_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data64 (int64_t *a, int64_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (c, 0, x * sizeof (*c)); +} + +static void +check_plus (int *a, int *b, int *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus8 (int8_t *a, int8_t *b, int8_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus16 (int16_t *a, int16_t *b, int16_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus32 (int32_t *a, int32_t *b, int32_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} static void -reset_data (int *a, int *b, int *c) +check_memcpy64 (int64_t *a, int64_t *c, int x) { - memset (a, -1, N * sizeof (*a)); - memset (b, -1, N * 
sizeof (*b)); - memset (c, -1, N * sizeof (*c)); + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != a[i]) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } } diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c index ba5c82cd55c..c8ce653a5c3 100644 --- a/gcc/testsuite/gcc.target/arm/lob1.c +++ b/gcc/testsuite/gcc.target/arm/lob1.c @@ -54,29 +54,18 @@ loop3 (int *a, int *b, int *c) } while (i < N); } -void -check (int *a, int *b, int *c) -{ - for (int i = 0; i < N; i++) - { - NO_LOB; - if (c[i] != a[i] + b[i]) - abort (); - } -} - int main (void) { - reset_data (a, b, c); + reset_data (a, b, c, N); loop1 (a, b ,c); - check (a, b ,c); - reset_data (a, b, c); + check_plus (a, b, c, N); + reset_data (a, b, c, N); loop2 (a, b ,c); - check (a, b ,c); - reset_data (a, b, c); + check_plus (a, b, c, N); + reset_data (a, b, c, N); loop3 (a, b ,c); - check (a, b ,c); + check_plus (a, b, c, N); return 0; } diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c index 17b6124295e..4fe116e2c2b 100644 --- a/gcc/testsuite/gcc.target/arm/lob6.c +++ b/gcc/testsuite/gcc.target/arm/lob6.c @@ -79,14 +79,14 @@ check (void) int main (void) { - reset_data (a1, b1, c1); - reset_data (a2, b2, c2); + reset_data (a1, b1, c1, N); + reset_data (a2, b2, c2, N); loop1 (a1, b1, c1); ref1 (a2, b2, c2); check (); - reset_data (a1, b1, c1); - reset_data (a2, b2, c2); + reset_data (a1, b1, c1, N); + reset_data (a2, b2, c2, N); loop2 (a1, b1, c1); ref2 (a2, b2, c2); check ();