On 06/11/2023 17:29, Stamatis Markianos-Wright wrote: > > On 06/11/2023 11:24, Richard Sandiford wrote: >> Stamatis Markianos-Wright writes: >>>> One of the main reasons for reading the arm bits was to try to answer >>>> the question: if we switch to a downcounting loop with a GE condition, >>>> how do we make sure that the start value is not a large unsigned >>>> number that is interpreted as negative by GE?  E.g. if the loop >>>> originally counted up in steps of N and used an LTU condition, >>>> it could stop at a value in the range [INT_MAX + 1, UINT_MAX]. >>>> But the loop might never iterate if we start counting down from >>>> most values in that range. >>>> >>>> Does the patch handle that? >>> So AFAICT this is actually handled in the generic code in >>> `doloop_valid_p`: >>> >>> This kind of loops fail because of they are "desc->infinite", then no >>> loop-doloop conversion is attempted at all (even for standard dls/le >>> loops) >>> >>> Thanks to that check I haven't been able to trigger anything like the >>> behaviour you describe, do you think the doloop_valid_p checks are >>> robust enough? >> The loops I was thinking of are provably not infinite though. E.g.: >> >>    for (unsigned int i = 0; i < UINT_MAX - 100; ++i) >>      ... >> >> is known to terminate.  And doloop conversion is safe with the normal >> count-down-by-1 approach, so I don't think current code would need >> to reject it.  I.e. a conversion to: >> >>    unsigned int i = UINT_MAX - 101; >>    do >>      ... >>    while (--i != ~0U); >> >> would be safe, but a conversion to: >> >>    int i = UINT_MAX - 101; >>    do >>      ... >>    while ((i -= step, i > 0)); >> >> wouldn't, because the loop body would only be executed once. >> >> I'm only going off the name "infinite" though :)  It's possible that >> it has more connotations than that. >> >> Thanks, >> Richard > > Ack, yep, I see what you mean now, and yep, that kind of loop does > indeed pass through doloop_valid_p > > Interestingly , in the v8-M Arm ARM this is done with: > > ``` > > boolean IsLastLowOverheadLoop(INSTR_EXEC_STATE_Type state) > // This does not check whether a loop is currently active. > // If the PE were in a loop, would this be the last one? > return UInt(state.LoopCount) <= (1 << (4 - LTPSIZE)); > > ``` > > So architecturally the asm we output would be ok (except maybe the > "branch too far subs;bgt;lctp" fallback at > `predicated_doloop_end_internal` (maybe that should be `bhi`))... But > now GE: isn't looking like an accurate representation of this > operation in the compiler. > > I'm wondering if I should try to make `predicated_doloop_end_internal` > contain a comparison along the lines of: > (gtu: (plus: (LR) (const_int -num_lanes)) (const_int num_lanes_minus_1)) > > I'll give that a try :) > > The only reason I'd chosen to go with GE earlier, tbh, was because of > the existing handling of GE in loop-doloop.cc > > Let me know if any other ideas come to your mind! > > > Cheers, > > Stam It looks like I've had success with the below (diff to previous patch), trimmed a bit to only the functionally interesting things:: diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index 368d5138ca1..54dd4ee564b 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -1649,16 +1649,28 @@            && (decrement_num = arm_attempt_dlstp_transform (operands[1]))            && (INTVAL (decrement_num) != 1))          { -          insn = emit_insn -              (gen_thumb2_addsi3_compare0 -              (s0, s0, GEN_INT ((-1) * (INTVAL (decrement_num))))); -          cmp = XVECEXP (PATTERN (insn), 0, 0); -          cc_reg = SET_DEST (cmp); -          bcomp = gen_rtx_GE (VOIDmode, cc_reg, const0_rtx);            loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]); -          emit_jump_insn (gen_rtx_SET (pc_rtx, -                       gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, -                                 loc_ref, pc_rtx))); +          switch (INTVAL (decrement_num)) +        { +          case 2: +            insn = emit_jump_insn (gen_predicated_doloop_end_internal2 +                        (s0, loc_ref)); +            break; +          case 4: +            insn = emit_jump_insn (gen_predicated_doloop_end_internal4 +                        (s0, loc_ref)); +            break; +          case 8: +            insn = emit_jump_insn (gen_predicated_doloop_end_internal8 +                        (s0, loc_ref)); +            break; +          case 16: +            insn = emit_jump_insn (gen_predicated_doloop_end_internal16 +                        (s0, loc_ref)); +            break; +          default: +            gcc_unreachable (); +        }            DONE;          }      } diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 93905583b18..c083f965fa9 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -6922,23 +6922,24 @@  ;; Originally expanded by 'predicated_doloop_end'.  ;; In the rare situation where the branch is too far, we do also need to  ;; revert FPSCR.LTPSIZE back to 0x100 after the last iteration. -(define_insn "*predicated_doloop_end_internal" +(define_insn "predicated_doloop_end_internal"    [(set (pc)      (if_then_else -       (ge (plus:SI (reg:SI LR_REGNUM) -            (match_operand:SI 0 "const_int_operand" "")) -        (const_int 0)) -     (label_ref (match_operand 1 "" "")) +       (gtu (unspec:SI [(plus:SI (match_operand:SI 0 "s_register_operand" "=r") +                     (const_int ))] +        LETP) +        (const_int )) +     (match_operand 1 "" "")       (pc))) -   (set (reg:SI LR_REGNUM) -    (plus:SI (reg:SI LR_REGNUM) (match_dup 0))) +   (set (match_dup 0) +    (plus:SI (match_dup 0) (const_int )))     (clobber (reg:CC CC_REGNUM))]    "TARGET_HAVE_MVE"    {      if (get_attr_length (insn) == 4)        return "letp\t%|lr, %l1";      else -      return "subs\t%|lr, #%n0\n\tbgt\t%l1\n\tlctp"; +      return "subs\t%|lr, #\n\tbhi\t%l1\n\tlctp";    }    [(set (attr "length")      (if_then_else @@ -6947,11 +6948,11 @@          (const_int 6)))     (set_attr "type" "branch")]) -(define_insn "dlstp_insn" +(define_insn "dlstp_insn"    [      (set (reg:SI LR_REGNUM)       (unspec:SI [(match_operand:SI 0 "s_register_operand" "r")]        DLSTP))    ]    "TARGET_HAVE_MVE" -  "dlstp.\t%|lr, %0") +  "dlstp.\t%|lr, %0") diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc index 6a72700a127..47fdef989b4 100644 --- a/gcc/loop-doloop.cc +++ b/gcc/loop-doloop.cc @@ -185,6 +185,7 @@ doloop_condition_get (rtx_insn *doloop_pat)        || XEXP (inc_src, 0) != reg        || !CONST_INT_P (XEXP (inc_src, 1)))      return 0; +  int dec_num = abs (INTVAL (XEXP (inc_src, 1)));    /* Check for (set (pc) (if_then_else (condition)                                         (label_ref (label)) @@ -199,21 +200,32 @@ doloop_condition_get (rtx_insn *doloop_pat)    /* Extract loop termination condition.  */    condition = XEXP (SET_SRC (cmp), 0); -  /* We expect a GE or NE comparison with 0 or 1.  */ -  if ((GET_CODE (condition) != GE -       && GET_CODE (condition) != NE) -      || (XEXP (condition, 1) != const0_rtx -          && XEXP (condition, 1) != const1_rtx)) +  /* We expect a GE or NE comparison with 0 or 1, or a GTU comparison with +     dec_num - 1.  */ +  if (!((GET_CODE (condition) == GE +     || GET_CODE (condition) == NE) +    && (XEXP (condition, 1) == const0_rtx +        || XEXP (condition, 1) == const1_rtx )) +      &&!(GET_CODE (condition) == GTU +      && ((INTVAL (XEXP (condition, 1))) == (dec_num - 1))))      return 0; -  if ((XEXP (condition, 0) == reg) +  /* For the ARM special case of having a GTU: re-form the condition without +     the unspec for the benefit of the middle-end.  */ +  if (GET_CODE (condition) == GTU) +    { +      condition = gen_rtx_fmt_ee (GTU, VOIDmode, inc_src, GEN_INT (dec_num - 1)); +      return condition; +    } +  else if ((XEXP (condition, 0) == reg)        /* For the third case:  */        || ((cc_reg != NULL_RTX)        && (XEXP (condition, 0) == cc_reg)        && (reg_orig == reg)) @@ -245,20 +257,11 @@ doloop_condition_get (rtx_insn *doloop_pat)                         (label_ref (label))                         (pc))))]) -    So we return the second form instead for the two cases when n == 1. - -    For n > 1, the final value may be exceeded, so use GE instead of NE. +    So we return the second form instead for the two cases.       */ -     if (GET_CODE (pattern) != PARALLEL) -       { -    if (INTVAL (XEXP (inc_src, 1)) != -1) -      condition = gen_rtx_fmt_ee (GE, VOIDmode, inc_src, const0_rtx); -    else -      condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);; -       } - +    condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);      return condition; -   } +    }    /* ??? If a machine uses a funny comparison, we could return a       canonicalized form here.  */ @@ -501,7 +504,8 @@ doloop_modify (class loop *loop, class niter_desc *desc,      case GE:        /* Currently only GE tests against zero are supported.  */        gcc_assert (XEXP (condition, 1) == const0_rtx); - +      /* FALLTHRU */ +    case GTU:        noloop = constm1_rtx; diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index a6a7ff507a5..9398702cddd 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -2673,8 +2673,16 @@  (define_int_attr mrrc [(VUNSPEC_MRRC "mrrc") (VUNSPEC_MRRC2 "mrrc2")])  (define_int_attr MRRC [(VUNSPEC_MRRC "MRRC") (VUNSPEC_MRRC2 "MRRC2")]) -(define_int_attr mode1 [(DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32") -            (DLSTP64 "64")]) +(define_int_attr dlstp_elemsize [(DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32") +                 (DLSTP64 "64")]) + +(define_int_attr letp_num_lanes [(LETP8 "16") (LETP16 "8") (LETP32 "4") +                 (LETP64 "2")]) +(define_int_attr letp_num_lanes_neg [(LETP8 "-16") (LETP16 "-8") (LETP32 "-4") +                     (LETP64 "-2")]) + +(define_int_attr letp_num_lanes_minus_1 [(LETP8 "15") (LETP16 "7") (LETP32 "3") +                     (LETP64 "1")])  (define_int_attr opsuffix [(UNSPEC_DOT_S "s8")                 (UNSPEC_DOT_U "u8") @@ -2921,6 +2929,8 @@  (define_int_iterator VQSHLUQ_N [VQSHLUQ_N_S])  (define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32                     DLSTP64]) +(define_int_iterator LETP [LETP8 LETP16 LETP32 +               LETP64])  ;; Define iterators for VCMLA operations  (define_int_iterator VCMLA_OP [UNSPEC_VCMLA        /* The iteration count does not need incrementing for a GE test.  */ diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 12ae4c4f820..2d6f27c14f4 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -587,6 +587,10 @@    DLSTP16    DLSTP32    DLSTP64 +  LETP8 +  LETP16 +  LETP32 +  LETP64    VPNOT    VCREATEQ_F    VCVTQ_N_TO_F_S I've attached the whole [2/2] patch diff with this change and the required comment changes in doloop_condition_get. WDYT? Thanks, Stam > >