public inbox for gcc-patches@gcc.gnu.org
* [PATCH] combine: Allow combining two insns to two insns
@ 2018-07-24 17:18 Segher Boessenkool
  2018-07-24 21:13 ` Jeff Law
                   ` (4 more replies)
  0 siblings, 5 replies; 18+ messages in thread
From: Segher Boessenkool @ 2018-07-24 17:18 UTC (permalink / raw)
  To: gcc-patches; +Cc: Segher Boessenkool

This patch allows combine to combine two insns into two.  This helps
in many cases, by reducing instruction path length, and also allowing
further combinations to happen.  PR85160 is a typical example of code
that it can improve.

This patch does not allow such combinations if either of the original
instructions was a simple move instruction.  In those cases combining
the two instructions increases register pressure without improving the
code.  With this move test, register pressure no longer increases
noticeably, as far as I can tell.

(At first I also didn't allow either of the resulting insns to be a
move instruction.  But that is actually a very good thing to have, as
should have been obvious).
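
As a hand-written illustration of the kind of code this helps (a
sketch, not a testcase from this patch):

  unsigned int
  f (unsigned int x)
  {
    unsigned int y = x & 255;     /* feeds both shifts below */
    return (y << 8) | (y << 20);
  }

Before this patch the mask and the first shift stay as two dependent
insns; with it, combine can rewrite them as two independent insns of
equal cost (on aarch64, an AND plus a UBFIZ), shortening the critical
path.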

Tested for many months; tested on about 30 targets.

I'll commit this later this week if there are no objections.


Segher


2018-07-24  Segher Boessenkool  <segher@kernel.crashing.org>

	PR rtl-optimization/85160
	* combine.c (is_just_move): New function.
	(try_combine): Allow combining two instructions into two if neither of
	the original instructions was a move.

---
 gcc/combine.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/gcc/combine.c b/gcc/combine.c
index cfe0f19..d64e84d 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -2604,6 +2604,17 @@ can_split_parallel_of_n_reg_sets (rtx_insn *insn, int n)
   return true;
 }
 
+/* Return whether X is just a single set, with the source
+   a general_operand.  */
+static bool
+is_just_move (rtx x)
+{
+  if (INSN_P (x))
+    x = PATTERN (x);
+
+  return (GET_CODE (x) == SET && general_operand (SET_SRC (x), VOIDmode));
+}
+
 /* Try to combine the insns I0, I1 and I2 into I3.
    Here I0, I1 and I2 appear earlier than I3.
    I0 and I1 can be zero; then we combine just I2 into I3, or I1 and I2 into
@@ -2668,6 +2679,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
   int swap_i2i3 = 0;
   int split_i2i3 = 0;
   int changed_i3_dest = 0;
+  bool i2_was_move = false, i3_was_move = false;
 
   int maxreg;
   rtx_insn *temp_insn;
@@ -3059,6 +3071,10 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
       return 0;
     }
 
+  /* Record whether i2 and i3 are trivial moves.  */
+  i2_was_move = is_just_move (i2);
+  i3_was_move = is_just_move (i3);
+
   /* Record whether I2DEST is used in I2SRC and similarly for the other
      cases.  Knowing this will help in register status updating below.  */
   i2dest_in_i2src = reg_overlap_mentioned_p (i2dest, i2src);
@@ -4014,8 +4030,10 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
 	   && XVECLEN (newpat, 0) == 2
 	   && GET_CODE (XVECEXP (newpat, 0, 0)) == SET
 	   && GET_CODE (XVECEXP (newpat, 0, 1)) == SET
-	   && (i1 || set_noop_p (XVECEXP (newpat, 0, 0))
-		  || set_noop_p (XVECEXP (newpat, 0, 1)))
+	   && (i1
+	       || set_noop_p (XVECEXP (newpat, 0, 0))
+	       || set_noop_p (XVECEXP (newpat, 0, 1))
+	       || (!i2_was_move && !i3_was_move))
 	   && GET_CODE (SET_DEST (XVECEXP (newpat, 0, 0))) != ZERO_EXTRACT
 	   && GET_CODE (SET_DEST (XVECEXP (newpat, 0, 0))) != STRICT_LOW_PART
 	   && GET_CODE (SET_DEST (XVECEXP (newpat, 0, 1))) != ZERO_EXTRACT
-- 
1.8.3.1


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-24 17:18 [PATCH] combine: Allow combining two insns to two insns Segher Boessenkool
@ 2018-07-24 21:13 ` Jeff Law
  2018-07-25  8:28 ` Richard Biener
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 18+ messages in thread
From: Jeff Law @ 2018-07-24 21:13 UTC (permalink / raw)
  To: Segher Boessenkool, gcc-patches

On 07/24/2018 11:18 AM, Segher Boessenkool wrote:
> This patch allows combine to combine two insns into two.  This helps
> in many cases, by reducing instruction path length, and also allowing
> further combinations to happen.  PR85160 is a typical example of code
> that it can improve.
> 
> This patch does not allow such combinations if either of the original
> instructions was a simple move instruction.  In those cases combining
> the two instructions increases register pressure without improving the
> code.  With this move test, register pressure no longer increases
> noticeably, as far as I can tell.
> 
> (At first I also didn't allow either of the resulting insns to be a
> move instruction.  But that is actually a very good thing to have, as
> should have been obvious).
> 
> Tested for many months; tested on about 30 targets.
> 
> I'll commit this later this week if there are no objections.
> 
> 
> Segher
> 
> 
> 2018-07-24  Segher Boessenkool  <segher@kernel.crashing.org>
> 
> 	PR rtl-optimization/85160
> 	* combine.c (is_just_move): New function.
> 	(try_combine): Allow combining two instructions into two if neither of
> 	the original instructions was a move.
I've had several instances where a 2->2 combination would be useful
through the years.  I didn't save any of those examples though...  Good
to see the limitation being addressed.

jeff


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-24 17:18 [PATCH] combine: Allow combining two insns to two insns Segher Boessenkool
  2018-07-24 21:13 ` Jeff Law
@ 2018-07-25  8:28 ` Richard Biener
  2018-07-25  9:50   ` Segher Boessenkool
  2018-07-31 12:39   ` H.J. Lu
  2018-07-25 13:47 ` David Malcolm
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 18+ messages in thread
From: Richard Biener @ 2018-07-25  8:28 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: GCC Patches

On Tue, Jul 24, 2018 at 7:18 PM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> This patch allows combine to combine two insns into two.  This helps
> in many cases, by reducing instruction path length, and also allowing
> further combinations to happen.  PR85160 is a typical example of code
> that it can improve.
>
> This patch does not allow such combinations if either of the original
> instructions was a simple move instruction.  In those cases combining
> the two instructions increases register pressure without improving the
> code.  With this move test, register pressure no longer increases
> noticeably, as far as I can tell.
>
> (At first I also didn't allow either of the resulting insns to be a
> move instruction.  But that is actually a very good thing to have, as
> should have been obvious).
>
> Tested for many months; tested on about 30 targets.
>
> I'll commit this later this week if there are no objections.

Sounds good - but, _any_ testcase?  Please! ;)

Richard.

>
> Segher
>
>
> 2018-07-24  Segher Boessenkool  <segher@kernel.crashing.org>
>
>         PR rtl-optimization/85160
>         * combine.c (is_just_move): New function.
>         (try_combine): Allow combining two instructions into two if neither of
>         the original instructions was a move.
>
> ---
>  gcc/combine.c | 22 ++++++++++++++++++++--
>  1 file changed, 20 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/combine.c b/gcc/combine.c
> index cfe0f19..d64e84d 100644
> --- a/gcc/combine.c
> +++ b/gcc/combine.c
> @@ -2604,6 +2604,17 @@ can_split_parallel_of_n_reg_sets (rtx_insn *insn, int n)
>    return true;
>  }
>
> +/* Return whether X is just a single set, with the source
> +   a general_operand.  */
> +static bool
> +is_just_move (rtx x)
> +{
> +  if (INSN_P (x))
> +    x = PATTERN (x);
> +
> +  return (GET_CODE (x) == SET && general_operand (SET_SRC (x), VOIDmode));
> +}
> +
>  /* Try to combine the insns I0, I1 and I2 into I3.
>     Here I0, I1 and I2 appear earlier than I3.
>     I0 and I1 can be zero; then we combine just I2 into I3, or I1 and I2 into
> @@ -2668,6 +2679,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
>    int swap_i2i3 = 0;
>    int split_i2i3 = 0;
>    int changed_i3_dest = 0;
> +  bool i2_was_move = false, i3_was_move = false;
>
>    int maxreg;
>    rtx_insn *temp_insn;
> @@ -3059,6 +3071,10 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
>        return 0;
>      }
>
> +  /* Record whether i2 and i3 are trivial moves.  */
> +  i2_was_move = is_just_move (i2);
> +  i3_was_move = is_just_move (i3);
> +
>    /* Record whether I2DEST is used in I2SRC and similarly for the other
>       cases.  Knowing this will help in register status updating below.  */
>    i2dest_in_i2src = reg_overlap_mentioned_p (i2dest, i2src);
> @@ -4014,8 +4030,10 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
>            && XVECLEN (newpat, 0) == 2
>            && GET_CODE (XVECEXP (newpat, 0, 0)) == SET
>            && GET_CODE (XVECEXP (newpat, 0, 1)) == SET
> -          && (i1 || set_noop_p (XVECEXP (newpat, 0, 0))
> -                 || set_noop_p (XVECEXP (newpat, 0, 1)))
> +          && (i1
> +              || set_noop_p (XVECEXP (newpat, 0, 0))
> +              || set_noop_p (XVECEXP (newpat, 0, 1))
> +              || (!i2_was_move && !i3_was_move))
>            && GET_CODE (SET_DEST (XVECEXP (newpat, 0, 0))) != ZERO_EXTRACT
>            && GET_CODE (SET_DEST (XVECEXP (newpat, 0, 0))) != STRICT_LOW_PART
>            && GET_CODE (SET_DEST (XVECEXP (newpat, 0, 1))) != ZERO_EXTRACT
> --
> 1.8.3.1
>


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-25  8:28 ` Richard Biener
@ 2018-07-25  9:50   ` Segher Boessenkool
  2018-07-25 10:37     ` Richard Biener
  2018-07-31 12:39   ` H.J. Lu
  1 sibling, 1 reply; 18+ messages in thread
From: Segher Boessenkool @ 2018-07-25  9:50 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches

On Wed, Jul 25, 2018 at 10:28:30AM +0200, Richard Biener wrote:
> On Tue, Jul 24, 2018 at 7:18 PM Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
> >
> > This patch allows combine to combine two insns into two.  This helps
> > in many cases, by reducing instruction path length, and also allowing
> > further combinations to happen.  PR85160 is a typical example of code
> > that it can improve.
> >
> > This patch does not allow such combinations if either of the original
> > instructions was a simple move instruction.  In those cases combining
> > the two instructions increases register pressure without improving the
> > code.  With this move test, register pressure no longer increases
> > noticeably, as far as I can tell.
> >
> > (At first I also didn't allow either of the resulting insns to be a
> > move instruction.  But that is actually a very good thing to have, as
> > should have been obvious).
> >
> > Tested for many months; tested on about 30 targets.
> >
> > I'll commit this later this week if there are no objections.
> 
> Sounds good - but, _any_ testcase?  Please! ;)

I only have target-specific ones.  Most *simple* ones will already be
optimised by current code (via 3->2 combination).  But I've now got one
that trunk does not optimise, and it can be confirmed by looking at
the resulting machine code even (no need to look at the combine dump,
which is a very good thing).  And it is a proper thing to test even: it
tests that some source is compiled to properly optimised machine code.

Any other kind of testcase is worse than useless, of course.

Testing that it results in working code isn't very feasible or useful either.
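
Schematically, such a test looks like the following; the target, the
function body, and the scanned mnemonic here are placeholders, not
the committed test:

  /* { dg-do compile { target aarch64*-*-* } } */
  /* { dg-options "-O2" } */

  unsigned long
  f (unsigned long x)
  {
    return (x & 0xff) << 8;
  }

  /* { dg-final { scan-assembler-times {\mubfiz\M} 1 } } */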


Segher


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-25  9:50   ` Segher Boessenkool
@ 2018-07-25 10:37     ` Richard Biener
  0 siblings, 0 replies; 18+ messages in thread
From: Richard Biener @ 2018-07-25 10:37 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: GCC Patches

On Wed, Jul 25, 2018 at 11:50 AM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> On Wed, Jul 25, 2018 at 10:28:30AM +0200, Richard Biener wrote:
> > On Tue, Jul 24, 2018 at 7:18 PM Segher Boessenkool
> > <segher@kernel.crashing.org> wrote:
> > >
> > > This patch allows combine to combine two insns into two.  This helps
> > > in many cases, by reducing instruction path length, and also allowing
> > > further combinations to happen.  PR85160 is a typical example of code
> > > that it can improve.
> > >
> > > This patch does not allow such combinations if either of the original
> > > instructions was a simple move instruction.  In those cases combining
> > > the two instructions increases register pressure without improving the
> > > code.  With this move test, register pressure no longer increases
> > > noticeably, as far as I can tell.
> > >
> > > (At first I also didn't allow either of the resulting insns to be a
> > > move instruction.  But that is actually a very good thing to have, as
> > > should have been obvious).
> > >
> > > Tested for many months; tested on about 30 targets.
> > >
> > > I'll commit this later this week if there are no objections.
> >
> > Sounds good - but, _any_ testcase?  Please! ;)
>
> I only have target-specific ones.

Works for me.

>  Most *simple* ones will already be
> optimised by current code (via 3->2 combination).  But I've now got one
> that trunk does not optimise, and it can be confirmed by looking at
> the resulting machine code even (no need to look at the combine dump,
> which is a very good thing).  And it is a proper thing to test even: it
> tests that some source is compiled to properly optimised machine code.
>
> Any other kind of testcase is worse than useless, of course.
>
> Testing that it results in working code isn't very feasible or useful either.
>
>
> Segher


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-24 17:18 [PATCH] combine: Allow combining two insns to two insns Segher Boessenkool
  2018-07-24 21:13 ` Jeff Law
  2018-07-25  8:28 ` Richard Biener
@ 2018-07-25 13:47 ` David Malcolm
  2018-07-25 14:19   ` Segher Boessenkool
  2018-07-30 16:09 ` Segher Boessenkool
  2018-08-02  5:52 ` Toon Moene
  4 siblings, 1 reply; 18+ messages in thread
From: David Malcolm @ 2018-07-25 13:47 UTC (permalink / raw)
  To: Segher Boessenkool, gcc-patches

On Tue, 2018-07-24 at 17:18 +0000, Segher Boessenkool wrote:
> This patch allows combine to combine two insns into two.  This helps
> in many cases, by reducing instruction path length, and also allowing
> further combinations to happen.  PR85160 is a typical example of code
> that it can improve.
> 
> This patch does not allow such combinations if either of the original
> instructions was a simple move instruction.  In those cases combining
> the two instructions increases register pressure without improving the
> code.  With this move test, register pressure no longer increases
> noticeably, as far as I can tell.
> 
> (At first I also didn't allow either of the resulting insns to be a
> move instruction.  But that is actually a very good thing to have, as
> should have been obvious).
> 
> Tested for many months; tested on about 30 targets.
> 
> I'll commit this later this week if there are no objections.
> 
> 
> Segher
> 
> 
> 2018-07-24  Segher Boessenkool  <segher@kernel.crashing.org>
> 
> 	PR rtl-optimization/85160
> 	* combine.c (is_just_move): New function.
> 	(try_combine): Allow combining two instructions into two if neither of
> 	the original instructions was a move.
> 
> ---
>  gcc/combine.c | 22 ++++++++++++++++++++--
>  1 file changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/combine.c b/gcc/combine.c
> index cfe0f19..d64e84d 100644
> --- a/gcc/combine.c
> +++ b/gcc/combine.c
> @@ -2604,6 +2604,17 @@ can_split_parallel_of_n_reg_sets (rtx_insn *insn, int n)
>    return true;
>  }
>  
> +/* Return whether X is just a single set, with the source
> +   a general_operand.  */
> +static bool
> +is_just_move (rtx x)
> +{
> +  if (INSN_P (x))
> +    x = PATTERN (x);
> +
> +  return (GET_CODE (x) == SET && general_operand (SET_SRC (x), VOIDmode));
> +}

If I'm reading it right, the patch only calls this function on i2 and
i3, which are known to be rtx_insn *, rather than just rtx.

Hence the only way in which GET_CODE (x) can be SET is if the INSN_P
pattern test sets x to PATTERN (x) immediately above: it can't be a SET
otherwise - but this isn't obvious from the code.

Can this function take an rtx_insn * instead?  Maybe something like:

/* Return whether INSN's pattern is just a single set, with the source
   a general_operand.  */
static bool
is_just_move_p (rtx_insn *insn)
{
  if (!INSN_P (insn))
    return false;

  rtx x = PATTERN (insn);
  return (GET_CODE (x) == SET && general_operand (SET_SRC (x), VOIDmode));
}

or similar?

[...snip...]

Thanks; I hope this is constructive.
Dave


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-25 13:47 ` David Malcolm
@ 2018-07-25 14:19   ` Segher Boessenkool
  0 siblings, 0 replies; 18+ messages in thread
From: Segher Boessenkool @ 2018-07-25 14:19 UTC (permalink / raw)
  To: David Malcolm; +Cc: gcc-patches

On Wed, Jul 25, 2018 at 09:47:31AM -0400, David Malcolm wrote:
> > +/* Return whether X is just a single set, with the source
> > +   a general_operand.  */
> > +static bool
> > +is_just_move (rtx x)
> > +{
> > +  if (INSN_P (x))
> > +    x = PATTERN (x);
> > +
> > +  return (GET_CODE (x) == SET && general_operand (SET_SRC (x), VOIDmode));
> > +}
> 
> If I'm reading it right, the patch only calls this function on i2 and
> i3, which are known to be rtx_insn *, rather than just rtx.

I used to also have

  is_just_move (XVECEXP (newpat, 0, 0))

etc.; during most of combine you do not have instructions, just patterns.
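
If both call styles are wanted, a pair of overloads could keep the
insn callers type-safe while still accepting bare patterns.  A sketch
only, not what this patch does:

  /* Pattern form: X is a pattern, e.g. one arm of a PARALLEL.  */
  static bool
  is_just_move (rtx x)
  {
    return (GET_CODE (x) == SET && general_operand (SET_SRC (x), VOIDmode));
  }

  /* Insn form: check the insn's pattern.  */
  static bool
  is_just_move (rtx_insn *insn)
  {
    return INSN_P (insn) && is_just_move (PATTERN (insn));
  }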


Segher


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-24 17:18 [PATCH] combine: Allow combining two insns to two insns Segher Boessenkool
                   ` (2 preceding siblings ...)
  2018-07-25 13:47 ` David Malcolm
@ 2018-07-30 16:09 ` Segher Boessenkool
  2018-07-31 12:34   ` Christophe Lyon
  2018-08-02  5:52 ` Toon Moene
  4 siblings, 1 reply; 18+ messages in thread
From: Segher Boessenkool @ 2018-07-30 16:09 UTC (permalink / raw)
  To: gcc-patches

On Tue, Jul 24, 2018 at 05:18:41PM +0000, Segher Boessenkool wrote:
> This patch allows combine to combine two insns into two.  This helps
> in many cases, by reducing instruction path length, and also allowing
> further combinations to happen.  PR85160 is a typical example of code
> that it can improve.
> 
> This patch does not allow such combinations if either of the original
> instructions was a simple move instruction.  In those cases combining
> the two instructions increases register pressure without improving the
> code.  With this move test, register pressure no longer increases
> noticeably, as far as I can tell.
> 
> (At first I also didn't allow either of the resulting insns to be a
> move instruction.  But that is actually a very good thing to have, as
> should have been obvious).
> 
> Tested for many months; tested on about 30 targets.
> 
> I'll commit this later this week if there are no objections.

Done now, with the testcase at https://gcc.gnu.org/ml/gcc-patches/2018-07/msg01856.html .


Segher


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-30 16:09 ` Segher Boessenkool
@ 2018-07-31 12:34   ` Christophe Lyon
  2018-07-31 12:59     ` Richard Sandiford
  2018-07-31 13:57     ` Segher Boessenkool
  0 siblings, 2 replies; 18+ messages in thread
From: Christophe Lyon @ 2018-07-31 12:34 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: gcc Patches

On Mon, 30 Jul 2018 at 18:09, Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> On Tue, Jul 24, 2018 at 05:18:41PM +0000, Segher Boessenkool wrote:
> > This patch allows combine to combine two insns into two.  This helps
> > in many cases, by reducing instruction path length, and also allowing
> > further combinations to happen.  PR85160 is a typical example of code
> > that it can improve.
> >
> > This patch does not allow such combinations if either of the original
> > instructions was a simple move instruction.  In those cases combining
> > the two instructions increases register pressure without improving the
> > code.  With this move test, register pressure no longer increases
> > noticeably, as far as I can tell.
> >
> > (At first I also didn't allow either of the resulting insns to be a
> > move instruction.  But that is actually a very good thing to have, as
> > should have been obvious).
> >
> > Tested for many months; tested on about 30 targets.
> >
> > I'll commit this later this week if there are no objections.
>
> Done now, with the testcase at https://gcc.gnu.org/ml/gcc-patches/2018-07/msg01856.html .
>

Hi,

Since this was committed, I've noticed regressions
on aarch64:
FAIL: gcc.dg/zero_bits_compound-1.c scan-assembler-not \\(and:

on arm-none-linux-gnueabi
FAIL: gfortran.dg/actual_array_constructor_1.f90   -O1  execution test

On aarch64, I've also noticed a few other regressions but I'm not yet
100% sure it's caused by this patch (bisect running):
    gcc.target/aarch64/ashltidisi.c scan-assembler-times asr 4
    gcc.target/aarch64/sve/var_stride_2.c -march=armv8.2-a+sve
scan-assembler-times \\tadd\\tx[0-9]+, x[0-9]+, x[0-9]+, lsl 10\\n 2
    gcc.target/aarch64/sve/var_stride_4.c -march=armv8.2-a+sve
scan-assembler-times \\tlsl\\tx[0-9]+, x[0-9]+, 10\\n 2
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmeq\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
#0\\.0\\n 7
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmeq\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
z[0-9]+\\.d\\n 14
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmeq\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
#0\\.0\\n 5
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmeq\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
z[0-9]+\\.s\\n 10
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
#0\\.0\\n 21
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
z[0-9]+\\.d\\n 42
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
#0\\.0\\n 15
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
z[0-9]+\\.s\\n 30
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmgt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
#0\\.0\\n 21
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmgt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
z[0-9]+\\.d\\n 42
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmgt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
#0\\.0\\n 15
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmgt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
z[0-9]+\\.s\\n 30
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
#0\\.0\\n 21
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
z[0-9]+\\.d\\n 42
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
#0\\.0\\n 15
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
z[0-9]+\\.s\\n 30
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmlt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
#0\\.0\\n 21
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmlt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
z[0-9]+\\.d\\n 42
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmlt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
#0\\.0\\n 15
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmlt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
z[0-9]+\\.s\\n 30
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmne\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
#0\\.0\\n 21
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmne\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
z[0-9]+\\.d\\n 42
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmne\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
#0\\.0\\n 15
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmne\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
z[0-9]+\\.s\\n 30
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmuo\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
z[0-9]+\\.d\\n 252
    gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmuo\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
z[0-9]+\\.s\\n 180
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmeq\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
#0\\.0 14
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmeq\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
z[0-9]+\\.d 28
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmeq\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
#0\\.0 10
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmeq\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
z[0-9]+\\.s 20
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
#0\\.0 21
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
z[0-9]+\\.d 42
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
#0\\.0 15
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
z[0-9]+\\.s 30
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmgt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
#0\\.0 28
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmgt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
z[0-9]+\\.d 56
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmgt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
#0\\.0 20
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmgt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
z[0-9]+\\.s 40
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
#0\\.0 21
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
z[0-9]+\\.d 42
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
#0\\.0 15
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
z[0-9]+\\.s 30
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmlt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
#0\\.0 28
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmlt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
z[0-9]+\\.d 56
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmlt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
#0\\.0 20
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmlt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
z[0-9]+\\.s 40
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmne\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
#0\\.0 7
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmne\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
z[0-9]+\\.d 14
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmne\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
#0\\.0 5
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmne\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
z[0-9]+\\.s 10
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmuo\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
z[0-9]+\\.d 63
    gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
scan-assembler-times \\tfcmuo\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
z[0-9]+\\.s 45

>
> Segher


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-25  8:28 ` Richard Biener
  2018-07-25  9:50   ` Segher Boessenkool
@ 2018-07-31 12:39   ` H.J. Lu
  2018-07-31 14:08     ` Segher Boessenkool
  1 sibling, 1 reply; 18+ messages in thread
From: H.J. Lu @ 2018-07-31 12:39 UTC (permalink / raw)
  To: Richard Biener; +Cc: Segher Boessenkool, GCC Patches

On Wed, Jul 25, 2018 at 1:28 AM, Richard Biener
<richard.guenther@gmail.com> wrote:
> On Tue, Jul 24, 2018 at 7:18 PM Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
>>
>> This patch allows combine to combine two insns into two.  This helps
>> in many cases, by reducing instruction path length, and also allowing
>> further combinations to happen.  PR85160 is a typical example of code
>> that it can improve.
>>
>> This patch does not allow such combinations if either of the original
>> instructions was a simple move instruction.  In those cases combining
>> the two instructions increases register pressure without improving the
>> code.  With this move test, register pressure no longer increases
>> noticeably, as far as I can tell.
>>
>> (At first I also didn't allow either of the resulting insns to be a
>> move instruction.  But that is actually a very good thing to have, as
>> should have been obvious).
>>
>> Tested for many months; tested on about 30 targets.
>>
>> I'll commit this later this week if there are no objections.
>
> Sounds good - but, _any_ testcase?  Please! ;)
>

Here is a testcase:

For

---
#define N 16
float f[N];
double d[N];
int n[N];

__attribute__((noinline)) void
f3 (void)
{
  int i;
  for (i = 0; i < N; i++)
    d[i] = f[i];
}
---

r263067 improved -O3 -mavx2 -mtune=generic -m64 from

.cfi_startproc
vmovaps f(%rip), %xmm2
vmovaps f+32(%rip), %xmm3
vinsertf128 $0x1, f+16(%rip), %ymm2, %ymm0
vcvtps2pd %xmm0, %ymm1
vextractf128 $0x1, %ymm0, %xmm0
vmovaps %xmm1, d(%rip)
vextractf128 $0x1, %ymm1, d+16(%rip)
vcvtps2pd %xmm0, %ymm0
vmovaps %xmm0, d+32(%rip)
vextractf128 $0x1, %ymm0, d+48(%rip)
vinsertf128 $0x1, f+48(%rip), %ymm3, %ymm0
vcvtps2pd %xmm0, %ymm1
vextractf128 $0x1, %ymm0, %xmm0
vmovaps %xmm1, d+64(%rip)
vextractf128 $0x1, %ymm1, d+80(%rip)
vcvtps2pd %xmm0, %ymm0
vmovaps %xmm0, d+96(%rip)
vextractf128 $0x1, %ymm0, d+112(%rip)
vzeroupper
ret
.cfi_endproc

to

.cfi_startproc
vcvtps2pd f(%rip), %ymm0
vmovaps %xmm0, d(%rip)
vextractf128 $0x1, %ymm0, d+16(%rip)
vcvtps2pd f+16(%rip), %ymm0
vmovaps %xmm0, d+32(%rip)
vextractf128 $0x1, %ymm0, d+48(%rip)
vcvtps2pd f+32(%rip), %ymm0
vextractf128 $0x1, %ymm0, d+80(%rip)
vmovaps %xmm0, d+64(%rip)
vcvtps2pd f+48(%rip), %ymm0
vextractf128 $0x1, %ymm0, d+112(%rip)
vmovaps %xmm0, d+96(%rip)
vzeroupper
ret
.cfi_endproc

This is:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86752

H.J.


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-31 12:34   ` Christophe Lyon
@ 2018-07-31 12:59     ` Richard Sandiford
  2018-07-31 13:57     ` Segher Boessenkool
  1 sibling, 0 replies; 18+ messages in thread
From: Richard Sandiford @ 2018-07-31 12:59 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: Segher Boessenkool, gcc Patches

Christophe Lyon <christophe.lyon@linaro.org> writes:
> On Mon, 30 Jul 2018 at 18:09, Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
>>
>> On Tue, Jul 24, 2018 at 05:18:41PM +0000, Segher Boessenkool wrote:
>> > This patch allows combine to combine two insns into two.  This helps
>> > in many cases, by reducing instruction path length, and also allowing
>> > further combinations to happen.  PR85160 is a typical example of code
>> > that it can improve.
>> >
>> > This patch does not allow such combinations if either of the original
>> > instructions was a simple move instruction.  In those cases combining
>> > the two instructions increases register pressure without improving the
>> > code.  With this move test, register pressure no longer increases
>> > noticeably, as far as I can tell.
>> >
>> > (At first I also didn't allow either of the resulting insns to be a
>> > move instruction.  But that is actually a very good thing to have, as
>> > should have been obvious).
>> >
>> > Tested for many months; tested on about 30 targets.
>> >
>> > I'll commit this later this week if there are no objections.
>>
>> Done now, with the testcase at
>> https://gcc.gnu.org/ml/gcc-patches/2018-07/msg01856.html .
>>
>
> Hi,
>
> Since this was committed, I've noticed regressions
> on aarch64:
> FAIL: gcc.dg/zero_bits_compound-1.c scan-assembler-not \\(and:
>
> on arm-none-linux-gnueabi
> FAIL: gfortran.dg/actual_array_constructor_1.f90   -O1  execution test
>
> On aarch64, I've also noticed a few other regressions but I'm not yet
> 100% sure it's caused by this patch (bisect running):
>     gcc.target/aarch64/ashltidisi.c scan-assembler-times asr 4
>     gcc.target/aarch64/sve/var_stride_2.c -march=armv8.2-a+sve
> scan-assembler-times \\tadd\\tx[0-9]+, x[0-9]+, x[0-9]+, lsl 10\\n 2
>     gcc.target/aarch64/sve/var_stride_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tlsl\\tx[0-9]+, x[0-9]+, 10\\n 2
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmeq\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> #0\\.0\\n 7
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmeq\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> z[0-9]+\\.d\\n 14
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmeq\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> #0\\.0\\n 5
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmeq\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> z[0-9]+\\.s\\n 10
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> #0\\.0\\n 21
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> z[0-9]+\\.d\\n 42
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> #0\\.0\\n 15
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> z[0-9]+\\.s\\n 30
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmgt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> #0\\.0\\n 21
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmgt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> z[0-9]+\\.d\\n 42
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmgt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> #0\\.0\\n 15
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmgt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> z[0-9]+\\.s\\n 30
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> #0\\.0\\n 21
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> z[0-9]+\\.d\\n 42
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> #0\\.0\\n 15
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> z[0-9]+\\.s\\n 30
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmlt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> #0\\.0\\n 21
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmlt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> z[0-9]+\\.d\\n 42
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmlt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> #0\\.0\\n 15
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmlt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> z[0-9]+\\.s\\n 30
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmne\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> #0\\.0\\n 21
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmne\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> z[0-9]+\\.d\\n 42
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmne\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> #0\\.0\\n 15
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmne\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> z[0-9]+\\.s\\n 30
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmuo\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> z[0-9]+\\.d\\n 252
>     gcc.target/aarch64/sve/vcond_4.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmuo\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> z[0-9]+\\.s\\n 180
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmeq\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> #0\\.0 14
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmeq\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> z[0-9]+\\.d 28
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmeq\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> #0\\.0 10
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmeq\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> z[0-9]+\\.s 20
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> #0\\.0 21
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> z[0-9]+\\.d 42
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> #0\\.0 15
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> z[0-9]+\\.s 30
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmgt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> #0\\.0 28
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmgt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> z[0-9]+\\.d 56
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmgt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> #0\\.0 20
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmgt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> z[0-9]+\\.s 40
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> #0\\.0 21
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> z[0-9]+\\.d 42
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> #0\\.0 15
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> z[0-9]+\\.s 30
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmlt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> #0\\.0 28
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmlt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> z[0-9]+\\.d 56
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmlt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> #0\\.0 20
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmlt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> z[0-9]+\\.s 40
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmne\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> #0\\.0 7
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmne\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> z[0-9]+\\.d 14
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmne\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> #0\\.0 5
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmne\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> z[0-9]+\\.s 10
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmuo\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d,
> z[0-9]+\\.d 63
>     gcc.target/aarch64/sve/vcond_5.c -march=armv8.2-a+sve
> scan-assembler-times \\tfcmuo\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s,
> z[0-9]+\\.s 45

The SVE failures were caused by it.  What combine is doing is definitely
valid though, since it's converting two dependent instructions into two
independent instructions of equal cost.

I think the fix would be to have proper support for conditional
comparisons, but that's not a short-term thing.  I've filed PR86753
in the meantime and will probably XFAIL the tests for now.
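
For reference, the XFAIL is just an extra selector on the dg-final
directive; schematically (the pattern and count below are
placeholders, not the real test lines):

  /* { dg-final { scan-assembler-times {\tfcmuo\t} 252 { xfail *-*-* } } } */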

Thanks,
Richard


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-31 12:34   ` Christophe Lyon
  2018-07-31 12:59     ` Richard Sandiford
@ 2018-07-31 13:57     ` Segher Boessenkool
  2018-07-31 15:37       ` Richard Earnshaw (lists)
  2018-08-01  8:27       ` Christophe Lyon
  1 sibling, 2 replies; 18+ messages in thread
From: Segher Boessenkool @ 2018-07-31 13:57 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc Patches

Hi Christophe,

On Tue, Jul 31, 2018 at 02:34:06PM +0200, Christophe Lyon wrote:
> Since this was committed, I've noticed regressions
> on aarch64:
> FAIL: gcc.dg/zero_bits_compound-1.c scan-assembler-not \\(and:

This went from
        and     w0, w0, 255
        lsl     w1, w0, 8
        orr     w0, w1, w0, lsl 20
        ret
to
        and     w1, w0, 255
        ubfiz   w0, w0, 8, 8
        orr     w0, w0, w1, lsl 20
        ret
so it's neither an improvement nor a regression, just different code.
The testcase wants no ANDs in the RTL.


> on arm-none-linux-gnueabi
> FAIL: gfortran.dg/actual_array_constructor_1.f90   -O1  execution test

That sounds bad.  Open a PR, maybe?


> On aarch64, I've also noticed a few other regressions but I'm not yet
> 100% sure it's caused by this patch (bisect running):
>     gcc.target/aarch64/ashltidisi.c scan-assembler-times asr 4

 ushift_53_i:
-       uxtw    x1, w0
-       lsl     x0, x1, 53
-       lsr     x1, x1, 11
+       lsr     w1, w0, 11
+       lsl     x0, x0, 53
        ret

 shift_53_i:
-       sxtw    x1, w0
-       lsl     x0, x1, 53
-       asr     x1, x1, 11
+       sbfx    x1, x0, 11, 21
+       lsl     x0, x0, 53
        ret

Both are improvements afais.  The number of asr insns changes, sure.


>     gcc.target/aarch64/sve/var_stride_2.c -march=armv8.2-a+sve
> scan-assembler-times \\tadd\\tx[0-9]+, x[0-9]+, x[0-9]+, lsl 10\\n 2

Skipping all the SVE tests, sorry.  Richard says they look like
improvements, and exactly of the expected kind.  :-)


Segher


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-31 12:39   ` H.J. Lu
@ 2018-07-31 14:08     ` Segher Boessenkool
  0 siblings, 0 replies; 18+ messages in thread
From: Segher Boessenkool @ 2018-07-31 14:08 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Richard Biener, GCC Patches

On Tue, Jul 31, 2018 at 05:39:37AM -0700, H.J. Lu wrote:
> For
> 
> ---
> #define N 16
> float f[N];
> double d[N];
> int n[N];
> 
> __attribute__((noinline)) void
> f3 (void)
> {
>   int i;
>   for (i = 0; i < N; i++)
>     d[i] = f[i];
> }
> ---
> 
> r263067 improved -O3 -mavx2 -mtune=generic -m64 from
> 
> .cfi_startproc
> vmovaps f(%rip), %xmm2
> vmovaps f+32(%rip), %xmm3
> vinsertf128 $0x1, f+16(%rip), %ymm2, %ymm0
> vcvtps2pd %xmm0, %ymm1
> vextractf128 $0x1, %ymm0, %xmm0
> vmovaps %xmm1, d(%rip)
> vextractf128 $0x1, %ymm1, d+16(%rip)
> vcvtps2pd %xmm0, %ymm0
> vmovaps %xmm0, d+32(%rip)
> vextractf128 $0x1, %ymm0, d+48(%rip)
> vinsertf128 $0x1, f+48(%rip), %ymm3, %ymm0
> vcvtps2pd %xmm0, %ymm1
> vextractf128 $0x1, %ymm0, %xmm0
> vmovaps %xmm1, d+64(%rip)
> vextractf128 $0x1, %ymm1, d+80(%rip)
> vcvtps2pd %xmm0, %ymm0
> vmovaps %xmm0, d+96(%rip)
> vextractf128 $0x1, %ymm0, d+112(%rip)
> vzeroupper
> ret
> .cfi_endproc
> 
> to
> 
> .cfi_startproc
> vcvtps2pd f(%rip), %ymm0
> vmovaps %xmm0, d(%rip)
> vextractf128 $0x1, %ymm0, d+16(%rip)
> vcvtps2pd f+16(%rip), %ymm0
> vmovaps %xmm0, d+32(%rip)
> vextractf128 $0x1, %ymm0, d+48(%rip)
> vcvtps2pd f+32(%rip), %ymm0
> vextractf128 $0x1, %ymm0, d+80(%rip)
> vmovaps %xmm0, d+64(%rip)
> vcvtps2pd f+48(%rip), %ymm0
> vextractf128 $0x1, %ymm0, d+112(%rip)
> vmovaps %xmm0, d+96(%rip)
> vzeroupper
> ret
> .cfi_endproc

I cannot really read AVX, but that looks like better code alright :-)


Segher


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-31 13:57     ` Segher Boessenkool
@ 2018-07-31 15:37       ` Richard Earnshaw (lists)
  2018-08-01  8:27       ` Christophe Lyon
  1 sibling, 0 replies; 18+ messages in thread
From: Richard Earnshaw (lists) @ 2018-07-31 15:37 UTC (permalink / raw)
  To: Segher Boessenkool, Christophe Lyon; +Cc: gcc Patches

On 31/07/18 14:57, Segher Boessenkool wrote:
> Hi Christophe,
> 
> On Tue, Jul 31, 2018 at 02:34:06PM +0200, Christophe Lyon wrote:
>> Since this was committed, I've noticed regressions
>> on aarch64:
>> FAIL: gcc.dg/zero_bits_compound-1.c scan-assembler-not \\(and:
> 
> This went from
>         and     w0, w0, 255
>         lsl     w1, w0, 8

These are sequentially dependent.

>         orr     w0, w1, w0, lsl 20
>         ret
> to
>         and     w1, w0, 255
>         ubfiz   w0, w0, 8, 8

These can run in parallel.

So the change is a good one!  On a super-scalar machine we save a cycle.

R.

>         orr     w0, w0, w1, lsl 20
>         ret
> so it's neither an improvement nor a regression, just different code.
> The testcase wants no ANDs in the RTL.
> 
> 
>> on arm-none-linux-gnueabi
>> FAIL: gfortran.dg/actual_array_constructor_1.f90   -O1  execution test
> 
> That sounds bad.  Open a PR, maybe?
> 
> 
>> On aarch64, I've also noticed a few other regressions but I'm not yet
>> 100% sure it's caused by this patch (bisect running):
>>     gcc.target/aarch64/ashltidisi.c scan-assembler-times asr 4
> 
>  ushift_53_i:
> -       uxtw    x1, w0
> -       lsl     x0, x1, 53
> -       lsr     x1, x1, 11
> +       lsr     w1, w0, 11
> +       lsl     x0, x0, 53
>         ret
> 
>  shift_53_i:
> -       sxtw    x1, w0
> -       lsl     x0, x1, 53
> -       asr     x1, x1, 11
> +       sbfx    x1, x0, 11, 21
> +       lsl     x0, x0, 53
>         ret
> 
> Both are improvements afais.  The number of asr insns changes, sure.
> 
> 
>>     gcc.target/aarch64/sve/var_stride_2.c -march=armv8.2-a+sve
>> scan-assembler-times \\tadd\\tx[0-9]+, x[0-9]+, x[0-9]+, lsl 10\\n 2
> 
> Skipping all the SVE tests, sorry.  Richard says they look like
> improvements, and exactly of the expected kind.  :-)
> 
> 
> Segher
> 


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-31 13:57     ` Segher Boessenkool
  2018-07-31 15:37       ` Richard Earnshaw (lists)
@ 2018-08-01  8:27       ` Christophe Lyon
  2018-08-01  9:40         ` Segher Boessenkool
  1 sibling, 1 reply; 18+ messages in thread
From: Christophe Lyon @ 2018-08-01  8:27 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: gcc Patches

On Tue, 31 Jul 2018 at 15:57, Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> Hi Christophe,
>
> On Tue, Jul 31, 2018 at 02:34:06PM +0200, Christophe Lyon wrote:
> > Since this was committed, I've noticed regressions
> > on aarch64:
> > FAIL: gcc.dg/zero_bits_compound-1.c scan-assembler-not \\(and:
>
> This went from
>         and     w0, w0, 255
>         lsl     w1, w0, 8
>         orr     w0, w1, w0, lsl 20
>         ret
> to
>         and     w1, w0, 255
>         ubfiz   w0, w0, 8, 8
>         orr     w0, w0, w1, lsl 20
>         ret
> so it's neither an improvement nor a regression, just different code.
> The testcase wants no ANDs in the RTL.

I didn't try to manually regenerate the code before and after the patch,
but if there was "and     w0, w0, 255" before the patch, why did the test pass?

> > on arm-none-linux-gnueabi
> > FAIL: gfortran.dg/actual_array_constructor_1.f90   -O1  execution test
>
> That sounds bad.  Open a PR, maybe?
>
I've just filed PR86771

> > On aarch64, I've also noticed a few other regressions but I'm not yet
> > 100% sure it's caused by this patch (bisect running):
> >     gcc.target/aarch64/ashltidisi.c scan-assembler-times asr 4
>
>  ushift_53_i:
> -       uxtw    x1, w0
> -       lsl     x0, x1, 53
> -       lsr     x1, x1, 11
> +       lsr     w1, w0, 11
> +       lsl     x0, x0, 53
>         ret
>
>  shift_53_i:
> -       sxtw    x1, w0
> -       lsl     x0, x1, 53
> -       asr     x1, x1, 11
> +       sbfx    x1, x0, 11, 21
> +       lsl     x0, x0, 53
>         ret
>
> Both are improvements afais.  The number of asr insns changes, sure.
>
Looks like an easy "fix" would be to change to "scan-assembler-times asr 3",
but maybe the aarch64 maintainers want to add more checks here
(lsl/lsr counts).

Don't you include arm/aarch64 in the 30 targets you used for testing?

>
> >     gcc.target/aarch64/sve/var_stride_2.c -march=armv8.2-a+sve
> > scan-assembler-times \\tadd\\tx[0-9]+, x[0-9]+, x[0-9]+, lsl 10\\n 2
>
> Skipping all the SVE tests, sorry.  Richard says they look like
> improvements, and exactly of the expected kind.  :-)
>
>
> Segher


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-08-01  8:27       ` Christophe Lyon
@ 2018-08-01  9:40         ` Segher Boessenkool
  2018-08-01 10:52           ` Christophe Lyon
  0 siblings, 1 reply; 18+ messages in thread
From: Segher Boessenkool @ 2018-08-01  9:40 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc Patches

On Wed, Aug 01, 2018 at 10:27:31AM +0200, Christophe Lyon wrote:
> On Tue, 31 Jul 2018 at 15:57, Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
> > On Tue, Jul 31, 2018 at 02:34:06PM +0200, Christophe Lyon wrote:
> > > Since this was committed, I've noticed regressions
> > > on aarch64:
> > > FAIL: gcc.dg/zero_bits_compound-1.c scan-assembler-not \\(and:
> >
> > This went from
> >         and     w0, w0, 255
> >         lsl     w1, w0, 8
> >         orr     w0, w1, w0, lsl 20
> >         ret
> > to
> >         and     w1, w0, 255
> >         ubfiz   w0, w0, 8, 8
> >         orr     w0, w0, w1, lsl 20
> >         ret
> > so it's neither an improvement nor a regression, just different code.
> > The testcase wants no ANDs in the RTL.
> 
> I didn't try to manually regenerate the code before and after the patch,
> but if there was "and     w0, w0, 255" before the patch, why did the test pass?

It wasn't an AND in RTL (it was a ZERO_EXTEND).
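
For illustration (hand-written, not actual dump output): with -dP the
RTL pattern is printed next to each instruction, so the same
"and w0, w0, 255" can show up as either

  // (set (reg:SI 1) (zero_extend:SI (subreg:QI (reg:SI 0) 0)))

or

  // (set (reg:SI 1) (and:SI (reg:SI 0) (const_int 255)))

and only the latter matches the test's scan-assembler-not \\(and:
pattern.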

> > > on arm-none-linux-gnueabi
> > > FAIL: gfortran.dg/actual_array_constructor_1.f90   -O1  execution test
> >
> > That sounds bad.  Open a PR, maybe?
> >
> I've just filed PR86771

Thanks.


Segher


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-08-01  9:40         ` Segher Boessenkool
@ 2018-08-01 10:52           ` Christophe Lyon
  0 siblings, 0 replies; 18+ messages in thread
From: Christophe Lyon @ 2018-08-01 10:52 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: gcc Patches

On Wed, 1 Aug 2018 at 11:40, Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> On Wed, Aug 01, 2018 at 10:27:31AM +0200, Christophe Lyon wrote:
> > On Tue, 31 Jul 2018 at 15:57, Segher Boessenkool
> > <segher@kernel.crashing.org> wrote:
> > > On Tue, Jul 31, 2018 at 02:34:06PM +0200, Christophe Lyon wrote:
> > > > Since this was committed, I've noticed regressions
> > > > on aarch64:
> > > > FAIL: gcc.dg/zero_bits_compound-1.c scan-assembler-not \\(and:
> > >
> > > This went from
> > >         and     w0, w0, 255
> > >         lsl     w1, w0, 8
> > >         orr     w0, w1, w0, lsl 20
> > >         ret
> > > to
> > >         and     w1, w0, 255
> > >         ubfiz   w0, w0, 8, 8
> > >         orr     w0, w0, w1, lsl 20
> > >         ret
> > > so it's neither an improvement nor a regression, just different code.
> > > The testcase wants no ANDs in the RTL.
> >
> > I didn't try to manually regenerate the code before and after the patch,
> > but if there was "and     w0, w0, 255" before the patch, why did the test pass?
>
> It wasn't an AND in RTL (it was a ZERO_EXTEND).
>

Indeed, I missed the -dP in the options.

> > > > on arm-none-linux-gnueabi
> > > > FAIL: gfortran.dg/actual_array_constructor_1.f90   -O1  execution test
> > >
> > > That sounds bad.  Open a PR, maybe?
> > >
> > I've just filed PR86771
>
> Thanks.
>
>
> Segher


* Re: [PATCH] combine: Allow combining two insns to two insns
  2018-07-24 17:18 [PATCH] combine: Allow combining two insns to two insns Segher Boessenkool
                   ` (3 preceding siblings ...)
  2018-07-30 16:09 ` Segher Boessenkool
@ 2018-08-02  5:52 ` Toon Moene
  4 siblings, 0 replies; 18+ messages in thread
From: Toon Moene @ 2018-08-02  5:52 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1099 bytes --]

On 07/24/2018 07:18 PM, Segher Boessenkool wrote:

> This patch allows combine to combine two insns into two.  This helps
> in many cases, by reducing instruction path length, and also allowing
> further combinations to happen.  PR85160 is a typical example of code
> that it can improve.

I cannot state with certainty that the improvements to our most 
notorious routine between 8.2 and current trunk are solely due to this 
change, but the differences are telling (see attached Fortran code - the 
analysis is about the third loop).

Number of instructions for this loop (Skylake i9-7900).

gfortran82 -S -Ofast -march=native -mtune=native:

   458 verint.s.82.loop3

gfortran90 -S -Ofast -march=native -mtune=native:

   396 verint.s.90.loop3

But the most stunning difference is the use of the stack [ nn(rsp) ] - 
see the attached files ...

-- 
Toon Moene - e-mail: toon@moene.org - phone: +31 346 214290
Saturnushof 14, 3738 XG  Maartensdijk, The Netherlands
At home: http://moene.org/~toon/; weather: http://moene.org/~hirlam/
Progress of GNU Fortran: http://gcc.gnu.org/wiki/GFortran#news

[-- Attachment #2: verint.f --]
[-- Type: text/x-fortran, Size: 12959 bytes --]

# 1 "/scratch/hirlam/hl_home/MPI/lib/src/grdy/verint.F"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "/scratch/hirlam/hl_home/MPI/lib/src/grdy/verint.F"
c Library:grdy $RCSfile$, $Revision: 7536 $
c checked in by $Author: ovignes $ at $Date: 2009-12-18 14:23:36 +0100 (Fri, 18 Dec 2009) $
c $State$, $Locker$
c $Log$
c Revision 1.3  1999/04/22 09:30:45  DagBjoerge
c MPP code
c
c Revision 1.2  1999/03/09 10:23:13  GerardCats
c Add SGI paralllellisation directives DOACROSS
c
c Revision 1.1  1996/09/06 13:12:18  GCats
c Created from grdy.apl, 1 version 2.6.1, by Gerard Cats
c
      SUBROUTINE VERINT (
     I   KLON   , KLAT   , KLEV   , KINT  , KHALO
     I , KLON1  , KLON2  , KLAT1  , KLAT2
     I , KP     , KQ     , KR
     R , PARG   , PRES
     R , PALFH  , PBETH
     R , PALFA  , PBETA  , PGAMA   )
C
C*******************************************************************
C
C  VERINT - THREE DIMENSIONAL INTERPOLATION
C
C  PURPOSE:
C
C  THREE DIMENSIONAL INTERPOLATION
C
C  INPUT PARAMETERS:
C
C  KLON      NUMBER OF GRIDPOINTS IN X-DIRECTION
C  KLAT      NUMBER OF GRIDPOINTS IN Y-DIRECTION
C  KLEV      NUMBER OF VERTICAL LEVELS
C  KINT      TYPE OF INTERPOLATION
C            = 1 - LINEAR
C            = 2 - QUADRATIC
C            = 3 - CUBIC
C            = 4 - MIXED CUBIC/LINEAR
C  KLON1     FIRST GRIDPOINT IN X-DIRECTION
C  KLON2     LAST  GRIDPOINT IN X-DIRECTION
C  KLAT1     FIRST GRIDPOINT IN Y-DIRECTION
C  KLAT2     LAST  GRIDPOINT IN Y-DIRECTION
C  KP        ARRAY OF INDEXES FOR HORIZONTAL DISPLACEMENTS
C  KQ        ARRAY OF INDEXES FOR HORIZONTAL DISPLACEMENTS
C  KR        ARRAY OF INDEXES FOR VERTICAL   DISPLACEMENTS
C  PARG      ARRAY OF ARGUMENTS
C  PALFH     ALFA HAT
C  PBETH     BETA HAT
C  PALFA     ARRAY OF WEIGHTS IN X-DIRECTION
C  PBETA     ARRAY OF WEIGHTS IN Y-DIRECTION
C  PGAMA     ARRAY OF WEIGHTS IN VERTICAL DIRECTION
C
C  OUTPUT PARAMETERS:
C
C  PRES      INTERPOLATED FIELD
C
C  HISTORY:
C
C  J.E. HAUGEN       1      1992
C
C*******************************************************************
C
      IMPLICIT NONE
C
      INTEGER KLON   , KLAT   , KLEV   , KINT   , KHALO,
     I        KLON1  , KLON2  , KLAT1  , KLAT2
C
      INTEGER   KP(KLON,KLAT), KQ(KLON,KLAT), KR(KLON,KLAT)
      REAL    PARG(2-KHALO:KLON+KHALO-1,2-KHALO:KLAT+KHALO-1,KLEV)  ,   
     R        PRES(KLON,KLAT)     ,
     R       PALFH(KLON,KLAT)     ,  PBETH(KLON,KLAT)  ,
     R       PALFA(KLON,KLAT,4)   ,  PBETA(KLON,KLAT,4),
     R       PGAMA(KLON,KLAT,4)
C
      INTEGER JX, JY, IDX, IDY, ILEV
      REAL Z1MAH, Z1MBH
C
      IF (KINT.EQ.1) THEN
C  LINEAR INTERPOLATION
C
      DO JY = KLAT1,KLAT2
      DO JX = KLON1,KLON2
         IDX  = KP(JX,JY)
         IDY  = KQ(JX,JY)
         ILEV = KR(JX,JY)
C
         PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV-1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV-1) ) )
C    +
     +               + PGAMA(JX,JY,2)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV  ) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV  ) ) )
      ENDDO
      ENDDO
C
      ELSE
     +IF (KINT.EQ.2) THEN
C  QUADRATIC INTERPOLATION
C
      DO JY = KLAT1,KLAT2
      DO JX = KLON1,KLON2
         IDX  = KP(JX,JY)
         IDY  = KQ(JX,JY)
         ILEV = KR(JX,JY)
C
         PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV-1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY  ,ILEV-1) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV-1) ) )
C    +
     +               + PGAMA(JX,JY,2)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV  ) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY  ,ILEV  ) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV  ) ) )
C    +
     +               + PGAMA(JX,JY,3)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV+1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY  ,ILEV+1) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV+1) ) )
      ENDDO
      ENDDO
C
      ELSE
     +IF (KINT.EQ.3) THEN
C  CUBIC INTERPOLATION
C
      DO JY = KLAT1,KLAT2
      DO JX = KLON1,KLON2
         IDX  = KP(JX,JY)
         IDY  = KQ(JX,JY)
         ILEV = KR(JX,JY)
C
         PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV-2)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV-2)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-2,ILEV-2)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV-2) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-2)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-2)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV-2)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-2) ) 
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV-2)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV-2)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV-2)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV-2) )
     + + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV-2)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV-2)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY+1,ILEV-2)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV-2) ) )
C    +
     +               + PGAMA(JX,JY,2)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-2,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV-1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-1) ) 
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV-1) )
     + + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV-1) ) )
C    +
     +               + PGAMA(JX,JY,3)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-2,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV  ) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV  ) ) 
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV  ) )
     + + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV  ) ) )
C    +
     +               + PGAMA(JX,JY,4)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-2,ILEV+1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV+1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV+1) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV+1) )
     + + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV+1) ) )
      ENDDO
      ENDDO
C
      ELSE
     +IF (KINT.EQ.4) THEN
C  MIXED CUBIC/LINEAR INTERPOLATION
C
      DO JY = KLAT1,KLAT2
      DO JX = KLON1,KLON2
         IDX  = KP(JX,JY)
         IDY  = KQ(JX,JY)
         ILEV = KR(JX,JY)
C
         Z1MAH = 1.0 - PALFH(JX,JY)
         Z1MBH = 1.0 - PBETH(JX,JY)
C
         PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
     +   PBETH(JX,JY)  *( PALFH(JX,JY)  *PARG(IDX-1,IDY-1,ILEV-2)
     +                  + Z1MAH         *PARG(IDX  ,IDY-1,ILEV-2) )
     + + Z1MBH         *( PALFH(JX,JY)  *PARG(IDX-1,IDY  ,ILEV-2)
     +                  + Z1MAH         *PARG(IDX  ,IDY  ,ILEV-2) ) )
C    +
     +               + PGAMA(JX,JY,4)*(
C    +
     +   PBETH(JX,JY)  *( PALFH(JX,JY)  *PARG(IDX-1,IDY-1,ILEV+1)
     +                  + Z1MAH         *PARG(IDX  ,IDY-1,ILEV+1) )
     + + Z1MBH         *( PALFH(JX,JY)  *PARG(IDX-1,IDY  ,ILEV+1)
     +                  + Z1MAH         *PARG(IDX  ,IDY  ,ILEV+1) ) )
C    +
     +               + PGAMA(JX,JY,2)*(
C    +
     +   PBETA(JX,JY,1)*( PALFH(JX,JY)  *PARG(IDX-1,IDY-2,ILEV-1)
     +                  + Z1MAH         *PARG(IDX  ,IDY-2,ILEV-1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-1) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV-1) )
     + + PBETA(JX,JY,4)*( PALFH(JX,JY)  *PARG(IDX-1,IDY+1,ILEV-1)
     +                  + Z1MAH         *PARG(IDX  ,IDY+1,ILEV-1) ) )
C    +
     +               + PGAMA(JX,JY,3)*(
C    +
     +   PBETA(JX,JY,1)*( PALFH(JX,JY)  *PARG(IDX-1,IDY-2,ILEV  )
     +                  + Z1MAH         *PARG(IDX  ,IDY-2,ILEV  ) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV  ) ) 
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV  ) )
     + + PBETA(JX,JY,4)*( PALFH(JX,JY)  *PARG(IDX-1,IDY+1,ILEV  )
     +                  + Z1MAH         *PARG(IDX  ,IDY+1,ILEV  ) ) )
      ENDDO
      ENDDO
C
      ENDIF
C
      RETURN
      END

[-- Attachment #3: verint.s.90.loop3.stack --]
[-- Type: text/plain, Size: 229 bytes --]

	movq	32(%rsp), %rdx
	movq	72(%rsp), %rdx
	movq	80(%rsp), %rdx
	vmovaps	%ymm18, 296(%rsp)
	movq	56(%rsp), %rdx
	movq	48(%rsp), %rdx
	movq	64(%rsp), %rdx
	movq	40(%rsp), %rdx
	vaddps	296(%rsp), %ymm19, %ymm19
	cmpq	24(%rsp), %rax

[-- Attachment #4: verint.s.82.loop3.stack --]
[-- Type: text/plain, Size: 3128 bytes --]

	movq	1384(%rsp), %rdx
	vmovaps	%ymm29, 2184(%rsp)
	vmovaps	%ymm30, 2216(%rsp)
	vmovaps	%ymm28, 2248(%rsp)
	vmovaps	%ymm17, 2280(%rsp)
	vmovaps	%ymm19, 2312(%rsp)
	vmovaps	%ymm31, 2344(%rsp)
	vmovaps	%ymm25, 2376(%rsp)
	vmovaps	%ymm21, 2408(%rsp)
	vmovaps	%ymm16, 2440(%rsp)
	vmovaps	%ymm30, 2472(%rsp)
	vmovaps	%ymm13, 2504(%rsp)
	vmovaps	%ymm11, 2536(%rsp)
	vmovaps	%ymm28, 2568(%rsp)
	vmovaps	%ymm4, 2600(%rsp)
	vmovaps	%ymm19, 2632(%rsp)
	vmovaps	%ymm17, 2664(%rsp)
	vmovaps	%ymm16, 2696(%rsp)
	vmovaps	%ymm19, 2728(%rsp)
	movq	1256(%rsp), %rdx
	vmovaps	%ymm31, 2760(%rsp)
	vmovaps	%ymm21, 2792(%rsp)
	vmovaps	%ymm31, 2152(%rsp)
	vmovaps	%ymm17, 2824(%rsp)
	vmovaps	%ymm21, 2856(%rsp)
	vmovaps	%ymm17, 2888(%rsp)
	vmovaps	%ymm21, 2920(%rsp)
	vmovaps	%ymm16, 2952(%rsp)
	vmovaps	%ymm11, 2984(%rsp)
	vmovaps	%ymm31, 3016(%rsp)
	vmovaps	%ymm21, 3048(%rsp)
	vmovaps	%ymm31, 3080(%rsp)
	vmovaps	%ymm21, 3112(%rsp)
	vmovaps	%ymm29, 3144(%rsp)
	vmovaps	%ymm31, 3176(%rsp)
	vmovaps	%ymm16, 3208(%rsp)
	vmovaps	%ymm29, 3240(%rsp)
	vmovaps	%ymm4, 3272(%rsp)
	vmovaps	%ymm31, 3304(%rsp)
	vmovaps	%ymm29, 3336(%rsp)
	vmovaps	%ymm26, 3368(%rsp)
	vmovaps	%ymm25, 3400(%rsp)
	vmovaps	%ymm26, 3432(%rsp)
	vmovaps	%ymm25, 3464(%rsp)
	vmovaps	%ymm25, 3496(%rsp)
	vmovaps	%ymm31, 3528(%rsp)
	vmovaps	%ymm2, 3560(%rsp)
	vmovaps	%ymm29, 3592(%rsp)
	vmovaps	3144(%rsp), %ymm8
	vshuff32x4	$0, 3176(%rsp), %ymm8, %ymm0
	vshuff32x4	$0, 3208(%rsp), %ymm11, %ymm11
	vmovaps	3016(%rsp), %ymm10
	vmovaps	2152(%rsp), %ymm31
	vshuff32x4	$0, 3272(%rsp), %ymm21, %ymm21
	vshuff32x4	$0, 3240(%rsp), %ymm15, %ymm15
	vshuff32x4	$0, 3048(%rsp), %ymm10, %ymm0
	vmovaps	3080(%rsp), %ymm10
	vshuff32x4	$0, 3112(%rsp), %ymm10, %ymm5
	vmovaps	2632(%rsp), %ymm5
	vshuff32x4	$0, 2664(%rsp), %ymm5, %ymm8
	vmovaps	2696(%rsp), %ymm5
	vshuff32x4	$0, 2728(%rsp), %ymm5, %ymm5
	vmovaps	2760(%rsp), %ymm15
	vshuff32x4	$0, 2792(%rsp), %ymm15, %ymm9
	vmovaps	2824(%rsp), %ymm15
	vmovaps	2888(%rsp), %ymm0
	vshuff32x4	$0, 2920(%rsp), %ymm0, %ymm8
	vmovaps	2952(%rsp), %ymm0
	vshuff32x4	$0, 2984(%rsp), %ymm0, %ymm0
	vshuff32x4	$0, 2856(%rsp), %ymm15, %ymm8
	vshuff32x4	$0, 3560(%rsp), %ymm19, %ymm19
	vmovaps	3304(%rsp), %ymm7
	vshuff32x4	$0, 3336(%rsp), %ymm7, %ymm0
	vmovaps	3368(%rsp), %ymm7
	vshuff32x4	$0, 3528(%rsp), %ymm13, %ymm13
	vshuff32x4	$0, 3400(%rsp), %ymm7, %ymm1
	vmovaps	3432(%rsp), %ymm18
	vshuff32x4	$0, 3592(%rsp), %ymm20, %ymm20
	vshuff32x4	$0, 3464(%rsp), %ymm18, %ymm18
	vshuff32x4	$0, 3496(%rsp), %ymm26, %ymm1
	vmovaps	2184(%rsp), %ymm29
	movq	1224(%rsp), %rdx
	vmovaps	2440(%rsp), %ymm20
	vshuff32x4	$0, 2216(%rsp), %ymm29, %ymm0
	vmovaps	2248(%rsp), %ymm29
	vshuff32x4	$0, 2280(%rsp), %ymm29, %ymm1
	vmovaps	2312(%rsp), %ymm29
	movq	1288(%rsp), %rdx
	vshuff32x4	$0, 2344(%rsp), %ymm29, %ymm1
	vmovaps	2376(%rsp), %ymm29
	vshuff32x4	$0, 2408(%rsp), %ymm29, %ymm2
	vshuff32x4	$0, 2472(%rsp), %ymm20, %ymm1
	vmovaps	2504(%rsp), %ymm20
	vshuff32x4	$0, 2600(%rsp), %ymm28, %ymm3
	vshuff32x4	$0, 2536(%rsp), %ymm20, %ymm2
	movq	1320(%rsp), %rdx
	vshuff32x4	$0, 2568(%rsp), %ymm30, %ymm2
	movq	1352(%rsp), %rdx
	movq	1192(%rsp), %rdx
	cmpq	1160(%rsp), %rax

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2018-08-02  5:52 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-24 17:18 [PATCH] combine: Allow combining two insns to two insns Segher Boessenkool
2018-07-24 21:13 ` Jeff Law
2018-07-25  8:28 ` Richard Biener
2018-07-25  9:50   ` Segher Boessenkool
2018-07-25 10:37     ` Richard Biener
2018-07-31 12:39   ` H.J. Lu
2018-07-31 14:08     ` Segher Boessenkool
2018-07-25 13:47 ` David Malcolm
2018-07-25 14:19   ` Segher Boessenkool
2018-07-30 16:09 ` Segher Boessenkool
2018-07-31 12:34   ` Christophe Lyon
2018-07-31 12:59     ` Richard Sandiford
2018-07-31 13:57     ` Segher Boessenkool
2018-07-31 15:37       ` Richard Earnshaw (lists)
2018-08-01  8:27       ` Christophe Lyon
2018-08-01  9:40         ` Segher Boessenkool
2018-08-01 10:52           ` Christophe Lyon
2018-08-02  5:52 ` Toon Moene

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).