public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [RFC] Aarch64: Replace nested FP min/max with conditionals for TX2
@ 2020-09-09 15:50 Anton Youdkevitch
  2020-09-10 10:03 ` Richard Biener
  0 siblings, 1 reply; 4+ messages in thread
From: Anton Youdkevitch @ 2020-09-09 15:50 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 545 bytes --]

ThunderxT2 chip has an odd property that nested scalar FP min and max are
slower than logically the same sequence of compares and branches.

Here is the patch where I'm trying to implement that transformation.
Please advise if the "combine" pass (actually after the pass itself) is the
appropriate place to do this.

I was considering the possibility to implement this in aarch64.md
(which would be much cleaner) but didn't manage to figure out how
to make fmin/fmax survive until later passes and replace them only
then.

-- 
  Thanks,
  Anton

[-- Attachment #2: 0001-WIP-MIN-to-conditionals-1.patch --]
[-- Type: application/octet-stream, Size: 11400 bytes --]

From fd15f48a4c2cfc3cf51f941b95558629768c1de6 Mon Sep 17 00:00:00 2001
From: Anton Youdkevitch <anton.youdkevitch@bell-sw.com>
Date: Tue, 8 Sep 2020 06:07:54 -0700
Subject: [PATCH] WIP MIN to conditionals

Trying to revert nested mins conditionals at
later rtl passes.

Disabled cmove conversion of conditionals if
a conditional operates on FP types.

Added target hook to query if the machine
benefits from using this transformation.

Created new extra tune flag AARCH64_EXTRA_TUNING_MINMAX_TO_COND
and added it to TX2 tuning flags.

Added new -mminmax-to-cond target flag.

Implemented aarch64 target hook to for
TARGET_MINMAX_TO_COND_PROFITABLE_P.
---
 gcc/combine.c                               | 140 +++++++++++++++++++++++++++-
 gcc/config/aarch64/aarch64-tuning-flags.def |   2 +
 gcc/config/aarch64/aarch64.c                |  13 ++-
 gcc/config/aarch64/aarch64.opt              |   4 +
 gcc/doc/tm.texi                             |   5 +
 gcc/doc/tm.texi.in                          |   2 +
 gcc/ifcvt.c                                 |  14 +++
 gcc/target.def                              |   7 ++
 8 files changed, 185 insertions(+), 2 deletions(-)

diff --git a/gcc/combine.c b/gcc/combine.c
index 4fee114..f43e1ec 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -495,8 +495,9 @@ static bool unmentioned_reg_p (rtx, rtx);
 static void record_truncated_values (rtx *, void *);
 static bool reg_truncated_to_mode (machine_mode, const_rtx);
 static rtx gen_lowpart_or_truncate (machine_mode, rtx);
-\f
+static void subst_minmax_with_cond();
 
+\f
 /* It is not safe to use ordinary gen_lowpart in combine.
    See comments in gen_lowpart_for_combine.  */
 #undef RTL_HOOKS_GEN_LOWPART
@@ -1532,6 +1533,12 @@ retry:
   default_rtl_profile ();
   clear_bb_flags ();
   new_direct_jump_p |= purge_all_dead_edges ();
+
+  /* the way the edges are created while doing subst
+     min/max makes the optimizer purge FT blocks */
+  if ( targetm.minmax_to_cond_profitable_p ())
+    subst_minmax_with_cond ();
+
   new_direct_jump_p |= delete_noop_moves ();
 
   /* Clean up.  */
@@ -6476,6 +6483,137 @@ combine_simplify_rtx (rtx x, machine_mode op0_mode, int in_dest,
   return x;
 }
 \f
+
+static void
+subst_minmax_with_cond()
+{
+  basic_block bb;
+
+  FOR_EACH_BB_REVERSE_FN (bb, cfun)
+    {
+      rtx_insn *insn, *next;
+
+      for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
+	{
+	  next = NEXT_INSN (insn);
+	  if (!NONJUMP_INSN_P (insn) || insn->deleted ()
+	      || GET_CODE (PATTERN (insn)) != SET
+	      || !SCALAR_FLOAT_MODE_P (GET_MODE (SET_SRC (PATTERN (insn)))))
+	    continue;
+
+	  rtx x = SET_SRC (PATTERN (insn));
+	  rtx_code code1 = GET_CODE (x);
+
+	  if (code1 != SMIN && code1 != UMIN &&
+	      code1 != SMAX && code1 != UMAX)
+	    continue;
+
+	  rtx_insn *use = NULL;
+
+	  if (!find_single_use (SET_DEST (PATTERN (insn)), insn, &use))
+	    continue;
+
+	  rtx_code code2 = GET_CODE (SET_SRC (PATTERN (use)));
+
+	  if (code2 != SMIN && code2 != UMIN &&
+	      code2 != SMAX && code2 != UMAX)
+	    continue;
+
+	  bool signed1 = (code1 == SMIN || code1 == SMAX);
+	  bool signed2 = (code2 == SMIN || code2 == SMAX);
+
+	  if (signed1 != signed2)
+	    continue;
+
+	  rtx a = XEXP (x, 0);
+	  rtx b = XEXP (x, 1);
+	  rtx c = XEXP (SET_SRC (PATTERN (use)), 1);
+
+	  if (GET_CODE (a) != REG || GET_CODE (b) != REG
+	      || GET_CODE (c) != REG)
+	    continue;
+
+	  rtx_code_label *lab1;
+	  rtx_insn *ij1, *ij2;
+	  rtx res = gen_reg_rtx (GET_MODE (a));
+	  basic_block bb1 = bb;
+
+	  /*
+	   BB1
+	     mov res, a
+	     cmp res,b
+	     b.gt l1
+	   BB2
+             l1:
+             mov res, b
+           BB3 (falltrough from BB)
+             cmp res, b
+             b.gt l2
+           BB4
+             l2:
+             mov res, c
+           BB5 (falltrough from BB3)
+             mov ret, res
+	   */
+	  start_sequence ();
+	  emit_move_insn (res, a);
+
+	  machine_mode cc_mode = SELECT_CC_MODE (GT, res, a);
+	  rtx cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+	  emit_insn (gen_rtx_SET (cc_reg, gen_rtx_COMPARE (cc_mode, res, b)));
+	  lab1 = gen_label_rtx ();
+	  rtx z;
+	  if (code1 == SMIN || code1 == UMIN)
+	    z = gen_rtx_GT (VOIDmode, cc_reg, const0_rtx);
+	  else
+	    z = gen_rtx_LT (VOIDmode, cc_reg, const0_rtx);
+
+	  z = gen_rtx_IF_THEN_ELSE (VOIDmode, z,
+				    gen_rtx_LABEL_REF (Pmode, lab1), pc_rtx);
+	  ij1 = emit_jump_insn (gen_rtx_SET (pc_rtx, z));
+
+	  emit_label (lab1);
+	  rtx_insn* mov1 = emit_move_insn (res, b);
+	  emit_insn (gen_rtx_SET (cc_reg, gen_rtx_COMPARE (cc_mode, res, c)));
+	  lab1 = gen_label_rtx ();
+	  if (code2 == SMIN || code2 == UMIN)
+	    z = gen_rtx_GT (VOIDmode, cc_reg, const0_rtx);
+	  else
+	    z = gen_rtx_LT (VOIDmode, cc_reg, const0_rtx);
+	  z = gen_rtx_IF_THEN_ELSE (VOIDmode, z,
+				    gen_rtx_LABEL_REF (Pmode, lab1), pc_rtx);
+	  ij2 = emit_jump_insn (gen_rtx_SET (pc_rtx, z));
+	  emit_label (lab1);
+	  rtx_insn* mov2 = emit_move_insn (res, c);
+
+	  /* reuse dest part of the second MIN insn */
+	  emit_move_insn (SET_DEST (PATTERN (use)), res);
+
+	  rtx_insn* seq = get_insns ();
+	  end_sequence ();
+	  emit_insn_before (seq, insn);
+
+	  /* changing CFG to accomodate the conditional branches */
+	  edge e1 = split_block (bb1, ij1);
+	  edge e2 = split_block (e1->dest, mov1);
+	  make_edge (bb1, e2->dest, EDGE_FALLTHRU);
+	  e1->flags &= ~EDGE_FALLTHRU; /* this is a conditional jump, remove
+					  FALLTHRU flag */
+	  bb1 = e2->dest;
+	  e1 = split_block (bb1, ij2);
+	  e2 = split_block (e1->dest, mov2);
+	  make_edge (bb1, e2->dest, EDGE_FALLTHRU);
+	  e1->flags &= ~EDGE_FALLTHRU;
+
+	  delete_insn (insn);
+	  delete_insn (use);
+	  break;
+	}
+    }
+}
+
+
+
 /* Simplify X, an IF_THEN_ELSE expression.  Return the new expression.  */
 
 static rtx
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index ccef3c0..86dc8bc 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -46,4 +46,6 @@ AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS)
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
 
+AARCH64_EXTRA_TUNING_OPTION ("minmax_to_cond", MINMAX_TO_COND)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2a0fd71..ef78f6c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1268,7 +1268,7 @@ static const struct tune_params thunderx2t99_tunings =
   2,	/* min_div_recip_mul_df.  */
   0,	/* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_MINMAX_TO_COND),	/* tune_flags.  */
   &thunderx2t99_prefetch_tune
 };
 
@@ -23041,6 +23041,14 @@ aarch64_invalid_unary_op (int op, const_tree type)
   return NULL;
 }
 
+static bool
+aarch64_minmax_to_cond_profitable_p ()
+{
+  return ((aarch64_tune_params.extra_tuning_flags &
+	  AARCH64_EXTRA_TUNE_MINMAX_TO_COND) &&
+	  aarch64_flag_minmax_to_cond);
+}
+
 /* Return the diagnostic message string if the binary operation OP is
    not permitted on TYPE1 and TYPE2, NULL otherwise.  */
 
@@ -23898,6 +23906,9 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_ASM_FUNCTION_EPILOGUE
 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
 
+#undef TARGET_MINMAX_TO_COND_PROFITABLE_P
+#define TARGET_MINMAX_TO_COND_PROFITABLE_P aarch64_minmax_to_cond_profitable_p
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-aarch64.h"
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 5170361..4c0bad8 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -275,3 +275,7 @@ The number of Newton iterations for calculating the reciprocal for float type.
 Target Joined UInteger Var(aarch64_double_recp_precision) Init(2) IntegerRange(1, 5) Param
 The number of Newton iterations for calculating the reciprocal for double type.  The precision of division is proportional to this param when division approximation is enabled.  The default value is 2.
 
+mminmax-to-cond
+Target Var(aarch64_flag_minmax_to_cond) Optimization
+Convert nested minmax instructions into the sequence
+of conditionals
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 41b9e10..250e33f 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -12200,6 +12200,11 @@ This target hook can be used to generate a target-specific code
  emits a @code{speculation_barrier} instruction if that is defined.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_MINMAX_TO_COND_PROFITABLE_P (void)
+There are cases when it is profitable to replace nested min/max
+  instructions by the sequence of conditionals.
+@end deftypefn
+
 @deftypefn {Target Hook} void TARGET_RUN_TARGET_SELFTESTS (void)
 If selftests are enabled, run any selftests for this target.
 @end deftypefn
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 3be984b..39c5848 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -8179,4 +8179,6 @@ maintainer is familiar with.
 
 @hook TARGET_SPECULATION_SAFE_VALUE
 
+@hook TARGET_MINMAX_TO_COND_PROFITABLE_P
+
 @hook TARGET_RUN_TARGET_SELFTESTS
diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
index a9ea7b1..9091e6a 100644
--- a/gcc/ifcvt.c
+++ b/gcc/ifcvt.c
@@ -1768,6 +1768,11 @@ noce_try_cmove (struct noce_if_info *if_info)
   if (!noce_simple_bbs (if_info))
     return FALSE;
 
+  /* don't bother to convert branches to cmoves for FP */
+  if (SCALAR_FLOAT_MODE_P (GET_MODE (XEXP (if_info->cond, 0)))
+      && targetm.minmax_to_cond_profitable_p ())
+    return FALSE;
+
   if ((CONSTANT_P (if_info->a) || register_operand (if_info->a, VOIDmode))
       && (CONSTANT_P (if_info->b) || register_operand (if_info->b, VOIDmode)))
     {
@@ -2048,6 +2053,11 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
   rtx cond = if_info->cond;
   rtx_insn *ifcvt_seq;
 
+  /* don't bother to convert branches to cmoves for FP */
+  if (SCALAR_FLOAT_MODE_P (GET_MODE (XEXP (if_info->cond, 0)))
+      && targetm.minmax_to_cond_profitable_p ())
+    return FALSE;
+
   /* A conditional move from two memory sources is equivalent to a
      conditional on their addresses followed by a load.  Don't do this
      early because it'll screw alias analysis.  Note that we've
@@ -2462,6 +2472,10 @@ noce_try_minmax (struct noce_if_info *if_info)
   if (!noce_simple_bbs (if_info))
     return FALSE;
 
+  if (SCALAR_FLOAT_MODE_P (GET_MODE (XEXP (if_info->cond, 0)))
+      && targetm.minmax_to_cond_profitable_p ())
+    return FALSE;
+
   /* ??? Reject modes with NaNs or signed zeros since we don't know how
      they will be resolved with an SMIN/SMAX.  It wouldn't be too hard
      to get the target to tell us...  */
diff --git a/gcc/target.def b/gcc/target.def
index f2f314e..b59b5e7 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3228,6 +3228,13 @@ DEFHOOK
  bool, (ao_ref *ref),
  default_ref_may_alias_errno)
 
+DEFHOOK
+(minmax_to_cond_profitable_p,
+ "There are cases when it is profitable to replace nested min/max\n\
+  instructions by the sequence of conditionals.",
+  bool, (void), hook_bool_void_false)
+
+
 /* Support for named address spaces.  */
 #undef HOOK_PREFIX
 #define HOOK_PREFIX "TARGET_ADDR_SPACE_"
-- 
2.7.4


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC] Aarch64: Replace nested FP min/max with conditionals for TX2
  2020-09-09 15:50 [RFC] Aarch64: Replace nested FP min/max with conditionals for TX2 Anton Youdkevitch
@ 2020-09-10 10:03 ` Richard Biener
       [not found]   ` <CAMDt1gTW5a_nhC5CeoURzdr4abZkbxSz7u46X9QnKpN6X8n6cA@mail.gmail.com>
  0 siblings, 1 reply; 4+ messages in thread
From: Richard Biener @ 2020-09-10 10:03 UTC (permalink / raw)
  To: Anton Youdkevitch; +Cc: GCC Patches

On Wed, Sep 9, 2020 at 5:51 PM Anton Youdkevitch
<anton.youdkevitch@bell-sw.com> wrote:
>
> ThunderxT2 chip has an odd property that nested scalar FP min and max are
> slower than logically the same sequence of compares and branches.

Always for any input data?

> Here is the patch where I'm trying to implement that transformation.
> Please advise if the "combine" pass (actually after the pass itself) is the
> appropriate place to do this.
>
> I was considering the possibility to implement this in aarch64.md
> (which would be much cleaner) but didn't manage to figure out how
> to make fmin/fmax survive until later passes and replace them only
> then.

+             || !SCALAR_FLOAT_MODE_P (GET_MODE (SET_SRC (PATTERN (insn)))))
+           continue;
...
+         if (code1 != SMIN && code1 != UMIN &&
+             code1 != SMAX && code1 != UMAX)
+           continue;

you shouldn't see U{MIN,MAX} for float data.

May I suggest to instead to this in a peephole2 or in another late
machine-specific pass?

Are nested vector FP min/max fast?

Richard.


>
> --
>   Thanks,
>   Anton

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC] Aarch64: Replace nested FP min/max with conditionals for TX2
       [not found]   ` <CAMDt1gTW5a_nhC5CeoURzdr4abZkbxSz7u46X9QnKpN6X8n6cA@mail.gmail.com>
@ 2020-09-11  6:42     ` Richard Biener
  2020-09-12  8:29       ` Anton Youdkevitch
  0 siblings, 1 reply; 4+ messages in thread
From: Richard Biener @ 2020-09-11  6:42 UTC (permalink / raw)
  To: Anton Youdkevitch, GCC Patches

On Fri, Sep 11, 2020 at 8:27 AM Anton Youdkevitch
<anton.youdkevitch@bell-sw.com> wrote:
>
> Richard,
>
> On Thu, Sep 10, 2020 at 12:03 PM Richard Biener <richard.guenther@gmail.com> wrote:
>>
>> On Wed, Sep 9, 2020 at 5:51 PM Anton Youdkevitch
>> <anton.youdkevitch@bell-sw.com> wrote:
>> >
>> > ThunderxT2 chip has an odd property that nested scalar FP min and max are
>> > slower than logically the same sequence of compares and branches.
>>
>> Always for any input data?
>
> If you mean the data that makes it choose all the combinations of
> taken/not taken branches then yes — the results for synthetics are always
> the same (+60%). I didn't check Inf/NaNs, though, as in such
> cases performance is not a concern.

I specifically was suggesting to measure the effect of branch mispredicts.
You'll have the case of the first branch being mispredicted, the second
branch being mispredicted and both branches being mispredicted.
So how's the worst case behaving in comparison to the FP min/max
back-to-back case?

Btw, did you try to use conditional moves / conditional compares (IIRC
arm has some weird ccmp that might or might not come in handy)?

>> > Here is the patch where I'm trying to implement that transformation.
>> > Please advise if the "combine" pass (actually after the pass itself) is the
>> > appropriate place to do this.
>> >
>> > I was considering the possibility to implement this in aarch64.md
>> > (which would be much cleaner) but didn't manage to figure out how
>> > to make fmin/fmax survive until later passes and replace them only
>> > then.
>>
>> +             || !SCALAR_FLOAT_MODE_P (GET_MODE (SET_SRC (PATTERN (insn)))))
>> +           continue;
>> ...
>> +         if (code1 != SMIN && code1 != UMIN &&
>> +             code1 != SMAX && code1 != UMAX)
>> +           continue;
>>
>> you shouldn't see U{MIN,MAX} for float data.
>
> OK, thanks. Will fix that.
>
>>
>>
>> May I suggest to instead to this in a peephole2 or in another late
>> machine-specific pass?
>
> Yes, sure, I'm basically asking for any suggestion. My idea is to move
> it as late as possible since messing with control flow is generally a bad
> idea. The current implementation is just a proof of concept. Do you
> think it's worth to postpone it until, let's say, shorten or peephole2
> would be enough?

I think doing it as late as possible, possibly after sched2, is best
since presumably the slowness really depends on back-to-back
min(max(..)) (what about min (min (..))?), so if there's enough other
instructions inbetween they behave reasonable again.

Did you try if scheduling some insns inbetween the min/max operation
would improve things?  Thus, might it be reasonable to adjust the
machine description to artificially constrain min/max latency?

>>
>>
>> Are nested vector FP min/max fast?
>
> The vector min/max are as fast as the scalar ones (ironically) it is that utilizing the vector
> compare and branch will much be slower: it's not just the fact the ASIMD compare does
> not affect CC register and additional processing is required but also the number of branches
> to deal with all the individual elements of the vector in the mixed case. It seemed pretty much
> a deadend so I didn't bother to touch it.

OK, I wasn't thinking of applyin the same transform to vector code but using
vector instructions in place of the scalar ones instead of branchy code.  But if
that doesn't make a difference ...

Richard.

> --
>   Thanks,
>   Anton
>
>
>
>>
>> Richard.
>>
>>
>> >
>> > --
>> >   Thanks,
>> >   Anton

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC] Aarch64: Replace nested FP min/max with conditionals for TX2
  2020-09-11  6:42     ` Richard Biener
@ 2020-09-12  8:29       ` Anton Youdkevitch
  0 siblings, 0 replies; 4+ messages in thread
From: Anton Youdkevitch @ 2020-09-12  8:29 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches

On Fri, Sep 11, 2020 at 8:43 AM Richard Biener <richard.guenther@gmail.com>
wrote:

> On Fri, Sep 11, 2020 at 8:27 AM Anton Youdkevitch
> <anton.youdkevitch@bell-sw.com> wrote:
> >
> > Richard,
> >
> > On Thu, Sep 10, 2020 at 12:03 PM Richard Biener <
> richard.guenther@gmail.com> wrote:
> >>
> >> On Wed, Sep 9, 2020 at 5:51 PM Anton Youdkevitch
> >> <anton.youdkevitch@bell-sw.com> wrote:
> >> >
> >> > ThunderxT2 chip has an odd property that nested scalar FP min and max
> are
> >> > slower than logically the same sequence of compares and branches.
> >>
> >> Always for any input data?
> >
> > If you mean the data that makes it choose all the combinations of
> > taken/not taken branches then yes — the results for synthetics are always
> > the same (+60%). I didn't check Inf/NaNs, though, as in such
> > cases performance is not a concern.
>
> I specifically was suggesting to measure the effect of branch mispredicts.
> You'll have the case of the first branch being mispredicted, the second
> branch being mispredicted and both branches being mispredicted.
> So how's the worst case behaving in comparison to the FP min/max
> back-to-back case?
>
Yes, I measured all the four cases. However, since the data was static this
might be just training the branch predictor. The thing is that even the
best case
has 3 FP insns vs 2 FP mins/maxes and is still almost two times faster. The
worst case has 5 FP insns.


>
> Btw, did you try to use conditional moves / conditional compares (IIRC
> arm has some weird ccmp that might or might not come in handy)?
>
I did. FP conditional moves are notoriously slow on TX2. The implementation
that uses FP cmoves is several times worse than min/max or branched ones.


>
> >> > Here is the patch where I'm trying to implement that transformation.
> >> > Please advise if the "combine" pass (actually after the pass itself)
> is the
> >> > appropriate place to do this.
> >> >
> >> > I was considering the possibility to implement this in aarch64.md
> >> > (which would be much cleaner) but didn't manage to figure out how
> >> > to make fmin/fmax survive until later passes and replace them only
> >> > then.
> >>
> >> +             || !SCALAR_FLOAT_MODE_P (GET_MODE (SET_SRC (PATTERN
> (insn)))))
> >> +           continue;
> >> ...
> >> +         if (code1 != SMIN && code1 != UMIN &&
> >> +             code1 != SMAX && code1 != UMAX)
> >> +           continue;
> >>
> >> you shouldn't see U{MIN,MAX} for float data.
> >
> > OK, thanks. Will fix that.
> >
> >>
> >>
> >> May I suggest to instead to this in a peephole2 or in another late
> >> machine-specific pass?
> >
> > Yes, sure, I'm basically asking for any suggestion. My idea is to move
> > it as late as possible since messing with control flow is generally a bad
> > idea. The current implementation is just a proof of concept. Do you
> > think it's worth to postpone it until, let's say, shorten or peephole2
> > would be enough?
>
> I think doing it as late as possible, possibly after sched2, is best
> since presumably the slowness really depends on back-to-back
> min(max(..)) (what about min (min (..))?), so if there's enough other
> instructions inbetween they behave reasonable again.
>
OK, understood. Thanks!


>
> Did you try if scheduling some insns inbetween the min/max operation
> would improve things?  Thus, might it be reasonable to adjust the
> machine description to artificially constrain min/max latency?
>
Good point, thanks. The main difference for branched vs non-branched
versions is CPU utilization, so, proper scheduling might (should?)
change this.


>
> >>
> >>
> >> Are nested vector FP min/max fast?
> >
> > The vector min/max are as fast as the scalar ones (ironically) it is
> that utilizing the vector
> > compare and branch will be much slower: it's not just the fact the ASIMD
> compare does
> > not affect CC register and additional processing is required but also
> the number of branches
> > to deal with all the individual elements of the vector in the mixed
> case. It seemed pretty much
> > a deadend so I didn't bother to touch it.
>
> OK, I wasn't thinking of applying the same transform to vector code but
> using
> vector instructions in place of the scalar ones instead of branchy code.
> But if
> that doesn't make a difference ...
>
No, in this case it does not.

-- 
  Thanks,
  Anton


>
> Richard.
>
> > --
> >   Thanks,
> >   Anton
> >
> >
> >
> >>
> >> Richard.
> >>
> >>
> >> >
> >> > --
> >> >   Thanks,
> >> >   Anton
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2020-09-12  8:29 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-09 15:50 [RFC] Aarch64: Replace nested FP min/max with conditionals for TX2 Anton Youdkevitch
2020-09-10 10:03 ` Richard Biener
     [not found]   ` <CAMDt1gTW5a_nhC5CeoURzdr4abZkbxSz7u46X9QnKpN6X8n6cA@mail.gmail.com>
2020-09-11  6:42     ` Richard Biener
2020-09-12  8:29       ` Anton Youdkevitch

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).