[PATCH] Unswitching outer loops.

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] Unswitching outer loops.
@ 2015-07-10 10:03 Yuri Rumyantsev
  2015-07-14 11:07 ` Richard Biener
  0 siblings, 1 reply; 17+ messages in thread
From: Yuri Rumyantsev @ 2015-07-10 10:03 UTC (permalink / raw)
  To: gcc-patches, Richard Biener, Igor Zamyatin

[-- Attachment #1: Type: text/plain, Size: 1136 bytes --]

Hi All,

Here is presented simple transformation which tries to hoist out of
outer-loop a check on zero trip count for inner-loop. This is very
restricted transformation since it accepts outer-loops with very
simple cfg, as for example:
    acc = 0;
   for (i = 1; i <= m; i++) {
      for (j = 0; j < n; j++)
         if (l[j] == i) { v[j] = acc; acc++; };
      acc <<= 1;
   }

Note that degenerative outer loop (without inner loop) will be
completely deleted as dead code.
The main goal of this transformation was to convert outer-loop to form
accepted by outer-loop vectorization (such test-case is also included
to patch).

Bootstrap and regression testing did not show any new failures.

Is it OK for trunk?

ChangeLog:
2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>

* tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
"gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
(tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
(tree_unswitch_outer_loop): New function.

gcc/testsuite/ChangeLog:
* gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
* gcc.dg/vect/vect-outer-simd-3.c: New test.

[-- Attachment #2: patch.3 --]
[-- Type: application/octet-stream, Size: 9639 bytes --]

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/unswitch-outer-loop-1.c b/gcc/testsuite/gcc.dg/tree-ssa/unswitch-outer-loop-1.c
new file mode 100755
index 0000000..296d18f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/unswitch-outer-loop-1.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details" } */
+
+void foo (int *v, int n, int m, int *l)
+{
+   int acc, i, j;
+
+   acc = 0;
+   for (i = 1; i <= m; i++) {
+      for (j = 0; j < n; j++)
+	if (l[j] == i)
+	  {
+	    v[j] = acc; acc++;
+	  }
+      acc <<= 1;
+   }
+}
+
+/* Outer loop should be unswitched.  */
+/* { dg-final { scan-tree-dump "Unswitching outer loop" "unswitch" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-simd-3.c b/gcc/testsuite/gcc.dg/vect/vect-outer-simd-3.c
new file mode 100644
index 0000000..891afbc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-outer-simd-3.c
@@ -0,0 +1,78 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-funswitch-loops -fopenmp-simd -ffast-math" } */
+#include <stdlib.h>
+#include "tree-vect.h"
+#define N 64
+
+float *px, *py;
+float *tx, *ty;
+float *x1, *z1, *t1, *t2;
+int bound[N];
+
+static void inline bar(const float cx, float cy,
+                         float *vx, float *vy, int n)
+{
+  int j;
+    for (j = 0; j < n; ++j)
+    {
+        const float dx  = cx - px[j];
+        const float dy  = cy - py[j];
+        *vx               -= dx * tx[j];
+        *vy               -= dy * ty[j];
+    }
+}
+
+__attribute__((noinline, noclone)) void foo1 ()
+{
+  int i;
+  int n = bound[63];
+#pragma omp simd
+  for (i=0; i<N; i++)
+    bar(px[i], py[i], x1+i, z1+i, n);
+}
+
+__attribute__((noinline, noclone)) void foo2 ()
+{
+  volatile int i;
+  int n = bound[63];
+  for (i=0; i<N; i++)
+    bar(px[i], py[i], x1+i, z1+i, n);
+}
+
+
+int main()
+{
+  float *X = (float*)malloc(N * 8 * sizeof (float));
+  int i;
+  check_vect ();
+  px = &X[0];
+  py = &X[N * 1];
+  tx = &X[N * 2];
+  ty = &X[N * 3];
+  x1 = &X[N * 4];
+  z1 = &X[N * 5];
+  t1 = &X[N * 6];
+  t2 = &X[N * 7];
+
+  for (i=0; i<N; i++)
+    {
+      px[i] = (float) (i+2);
+      tx[i] = (float) (i+1);
+      py[i] = (float) (i+4);
+      ty[i] = (float) (i+3);
+      x1[i] = z1[i] = 1.0f;
+      bound[i] = i + 1;
+    }
+  foo1 ();  /* vector variant.  */
+  for (i=0; i<N;i++)
+    {
+      t1[i] = x1[i]; x1[i] = 1.0f;
+      t2[i] = z1[i]; z1[i] = 1.0f;
+    }
+  foo2 ();  /* scalar variant.  */
+  for (i=0; i<N; i++)
+    if (x1[i] != t1[i] || z1[i] != t2[i])
+      abort ();
+  return 0;
+}
+/* { dg-final { scan-tree-dump "OUTER LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
index 0be9142..befde7a 100644
--- a/gcc/tree-ssa-loop-unswitch.c
+++ b/gcc/tree-ssa-loop-unswitch.c
@@ -48,6 +48,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "tree-pass.h"
 #include "tree-inline.h"
+#include "tree-cfgcleanup.h"
+#include "gimple-iterator.h"
 
 /* This file implements the loop unswitching, i.e. transformation of loops like
 
@@ -88,6 +90,7 @@ along with GCC; see the file COPYING3.  If not see
 static struct loop *tree_unswitch_loop (struct loop *, basic_block, tree);
 static bool tree_unswitch_single_loop (struct loop *, int);
 static tree tree_may_unswitch_on (basic_block, struct loop *);
+static bool tree_unswitch_outer_loop (struct loop *);
 
 /* Main entry point.  Perform loop unswitching on all suitable loops.  */
 
@@ -112,6 +115,13 @@ tree_ssa_unswitch_loops (void)
           continue;
         }
 
+      /* Try to unswitch outer loop.  */
+      if (tree_unswitch_outer_loop (loop))
+	{
+	  changed = true;
+	  continue;
+	}
+
       /* The loop should not be too large, to limit code growth. */
       if (tree_num_loop_insns (loop, &eni_size_weights)
           > (unsigned) PARAM_VALUE (PARAM_MAX_UNSWITCH_INSNS))
@@ -412,6 +422,180 @@ tree_unswitch_loop (struct loop *loop,
 		       REG_BR_PROB_BASE - prob_true, false);
 }
 
+/* Try to unswitch outer loop if any to support vectorization of outer loop
+   with non-constant invariant upper bound of innermost loop.  In fact it tries
+   to hoist out a check on zero trip count for innermost loop.
+   Returns true if unswitching took place.  */
+static bool
+tree_unswitch_outer_loop (struct loop *inner_loop)
+{
+  struct loop *loop = loop_outer (inner_loop);
+  basic_block header;
+  basic_block inner_header;
+  gimple stmt;
+  basic_block exit_bb;
+  tree cond, cond_new;
+  ssa_op_iter iter;
+  tree use;
+  basic_block def_bb, merge_bb;
+  gimple def;
+  struct loop *nloop;
+  unsigned prob_true;
+  edge edge_true, edge_false;
+  gcond *cond_stmt;
+  gphi_iterator gsi;
+
+  if (!loop || loop->next)
+    return false;
+
+  if (!single_exit (inner_loop) || !single_exit (loop))
+    return false;
+  header = loop->header;
+  stmt = last_stmt (header);
+  if (!stmt || gimple_code (stmt) != GIMPLE_COND)
+    return false;
+  /* Determine exit bb and header of inner loop.  */
+  inner_header = EDGE_SUCC (header, 0)->dest;
+  if (!flow_bb_inside_loop_p (loop, inner_header))
+    return false;
+  /* Skip empty bb.  */
+  if (empty_block_p (inner_header))
+    {
+      if (!single_succ_p (inner_header))
+	return false;
+      inner_header = single_succ (inner_header);
+    }
+  exit_bb = EDGE_SUCC (header, 1)->dest;
+  if (!flow_bb_inside_loop_p (loop, exit_bb))
+    return false;
+  if (empty_block_p (exit_bb))
+    {
+      if (!single_succ_p (exit_bb))
+	return false;
+      exit_bb = single_succ (exit_bb);
+    }
+  if (inner_header != inner_loop->header)
+    {
+      basic_block tmp = inner_header;
+      if (exit_bb != inner_loop->header)
+	return false;
+      inner_header = exit_bb;
+      exit_bb = tmp;
+    }
+  if (EDGE_COUNT (exit_bb->succs) != 2)
+    return false;;
+  gcc_assert (last_stmt (inner_header));
+  if (EDGE_COUNT (exit_bb->preds) != 2)
+    return false;
+  gcc_assert (last_stmt (exit_bb));
+  if (!loop_exit_edge_p (loop, EDGE_SUCC (exit_bb, 0)))
+    {
+      if (!loop_exit_edge_p (loop, EDGE_SUCC (exit_bb, 1))
+	  || EDGE_SUCC (exit_bb, 0)->dest != loop->latch)
+	return false;
+    }
+  else if (EDGE_SUCC (exit_bb, 1)->dest != loop->latch)
+    return false;
+  /* Is condition loop invariant?  */
+  FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
+    {
+      def = SSA_NAME_DEF_STMT (use);
+      def_bb = gimple_bb (def);
+      if (def_bb && flow_bb_inside_loop_p (loop, def_bb))
+	return false;
+    }
+
+  cond = build2 (gimple_cond_code(stmt), boolean_type_node,
+		 gimple_cond_lhs (stmt), gimple_cond_rhs (stmt));
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, ";; Unswitching outer loop\n");
+  initialize_original_copy_tables ();
+  extract_true_false_edges_from_block (header, &edge_true, &edge_false);
+  prob_true = edge_true->probability;
+  nloop = loop_version (loop, unshare_expr (cond), NULL, prob_true, prob_true,
+			REG_BR_PROB_BASE - prob_true, false);
+  if (!nloop)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf(dump_file, "Can't create version!\n");
+      free_original_copy_tables ();
+      return false;
+    }
+
+  if (!nloop->force_vectorize)
+    nloop->force_vectorize = true;
+  if (loop->safelen != 0)
+    nloop->safelen = loop->safelen;
+  header = loop->header;
+  stmt = last_stmt (header);
+  cond_new = simplify_using_entry_checks (loop, cond);
+  cond_stmt = as_a <gcond *> (stmt);
+  if (integer_nonzerop (cond_new))
+    gimple_cond_set_condition_from_tree (cond_stmt, boolean_true_node);
+  else if (integer_zerop (cond_new))
+    gimple_cond_set_condition_from_tree (cond_stmt, boolean_false_node);
+  else
+    gcc_unreachable ();
+
+  header = nloop->header;
+  stmt = last_stmt (header);
+  cond_new = simplify_using_entry_checks (nloop, cond);
+  cond_stmt = as_a <gcond *> (stmt);
+  if (integer_nonzerop (cond_new))
+    gimple_cond_set_condition_from_tree (cond_stmt, boolean_true_node);
+  else if (integer_zerop (cond_new))
+    gimple_cond_set_condition_from_tree (cond_stmt, boolean_false_node);
+  else
+    gcc_unreachable ();
+
+  merge_bb = single_exit (loop)->dest;
+  if (EDGE_COUNT (merge_bb->preds) >= 2)
+    split_edge (single_exit (loop));
+  /* Clean-up cfg to remove useless one-argument phi in exit block of
+     outer-loop.  */
+  cleanup_tree_cfg ();
+  mark_virtual_operands_for_renaming (cfun);
+  update_ssa (TODO_update_ssa);
+
+  /* Determine which loop is loop nest.  */
+  if (!loop->inner)
+    {
+      gcc_assert (nloop->inner);
+      loop = nloop;
+    }
+  exit_bb = single_exit (loop)->src;
+  gcc_assert (EDGE_COUNT (exit_bb->preds) == 1);
+  /* Perform substitution of rhs of phi nodes aka copy propagation since
+     vectorizer does not accept phi nodes in non-header bb.  */
+  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
+    {
+      gphi *phi;
+      tree arg;
+      gimple stmt;
+      imm_use_iterator imm_iter;
+      use_operand_p use_p;
+
+      phi = gsi.phi ();
+      arg = gimple_phi_arg_def (phi, 0);
+      if (TREE_CODE (arg) == SSA_NAME)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf(dump_file, "Remove phi in exit block of outer-loop\n");
+	  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, PHI_RESULT (phi))
+	    {
+	      if (stmt != phi)
+		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+		  SET_USE (use_p, arg);
+	    }
+          remove_phi_node (&gsi, true);
+	}
+      else
+	gsi_next (&gsi);
+    }
+  free_original_copy_tables ();
+  return true;
+}
+
 /* Loop unswitching pass.  */
 
 namespace {

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-07-10 10:03 [PATCH] Unswitching outer loops Yuri Rumyantsev
@ 2015-07-14 11:07 ` Richard Biener
  2015-07-23 15:21   ` Yuri Rumyantsev
  0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2015-07-14 11:07 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: gcc-patches, Igor Zamyatin

On Fri, Jul 10, 2015 at 12:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Hi All,
>
> Here is presented simple transformation which tries to hoist out of
> outer-loop a check on zero trip count for inner-loop. This is very
> restricted transformation since it accepts outer-loops with very
> simple cfg, as for example:
>     acc = 0;
>    for (i = 1; i <= m; i++) {
>       for (j = 0; j < n; j++)
>          if (l[j] == i) { v[j] = acc; acc++; };
>       acc <<= 1;
>    }
>
> Note that degenerative outer loop (without inner loop) will be
> completely deleted as dead code.
> The main goal of this transformation was to convert outer-loop to form
> accepted by outer-loop vectorization (such test-case is also included
> to patch).
>
> Bootstrap and regression testing did not show any new failures.
>
> Is it OK for trunk?

I think this is

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=23855

as well.  It has a patch adding a invariant loop guard hoisting
phase to loop-header copying.  Yeah, it needs updating to
trunk again I suppose.  It's always non-stage1 when I come
back to that patch.

Your patch seems to be very specific and only handles outer
loops of innermost loops.

Richard.

> ChangeLog:
> 2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>
>
> * tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
> "gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
> (tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
> (tree_unswitch_outer_loop): New function.
>
> gcc/testsuite/ChangeLog:
> * gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
> * gcc.dg/vect/vect-outer-simd-3.c: New test.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-07-14 11:07 ` Richard Biener
@ 2015-07-23 15:21   ` Yuri Rumyantsev
  2015-07-28 11:00     ` Richard Biener
  0 siblings, 1 reply; 17+ messages in thread
From: Yuri Rumyantsev @ 2015-07-23 15:21 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Igor Zamyatin

Hi Richard,

I checked that both test-cases from 23855 are sucessfully unswitched
by proposed patch. I understand that it does not catch deeper loop
nest as
   for (i=0; i<10; i++)
     for (j=0;j<n;j++)
        for (k=0;k<20;k++)
  ...
but duplication of middle-loop does not look reasonable.

Here is dump for your second test-case:

void foo(int *ie, int *je, double *x)
{
  int i, j;
  for (j=0; j<*je; ++j)
    for (i=0; i<*ie; ++i)
      x[i+j] = 0.0;
}
grep -i unswitch t6.c.119t.unswitch
;; Unswitching outer loop

Yuri.

2015-07-14 14:06 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Fri, Jul 10, 2015 at 12:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> Hi All,
>>
>> Here is presented simple transformation which tries to hoist out of
>> outer-loop a check on zero trip count for inner-loop. This is very
>> restricted transformation since it accepts outer-loops with very
>> simple cfg, as for example:
>>     acc = 0;
>>    for (i = 1; i <= m; i++) {
>>       for (j = 0; j < n; j++)
>>          if (l[j] == i) { v[j] = acc; acc++; };
>>       acc <<= 1;
>>    }
>>
>> Note that degenerative outer loop (without inner loop) will be
>> completely deleted as dead code.
>> The main goal of this transformation was to convert outer-loop to form
>> accepted by outer-loop vectorization (such test-case is also included
>> to patch).
>>
>> Bootstrap and regression testing did not show any new failures.
>>
>> Is it OK for trunk?
>
> I think this is
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=23855
>
> as well.  It has a patch adding a invariant loop guard hoisting
> phase to loop-header copying.  Yeah, it needs updating to
> trunk again I suppose.  It's always non-stage1 when I come
> back to that patch.
>
> Your patch seems to be very specific and only handles outer
> loops of innermost loops.
>
> Richard.
>
>> ChangeLog:
>> 2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>
>> * tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
>> "gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
>> (tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
>> (tree_unswitch_outer_loop): New function.
>>
>> gcc/testsuite/ChangeLog:
>> * gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
>> * gcc.dg/vect/vect-outer-simd-3.c: New test.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-07-23 15:21   ` Yuri Rumyantsev
@ 2015-07-28 11:00     ` Richard Biener
  2015-07-31 12:07       ` Yuri Rumyantsev
  0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2015-07-28 11:00 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: gcc-patches, Igor Zamyatin

On Thu, Jul 23, 2015 at 4:45 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Hi Richard,
>
> I checked that both test-cases from 23855 are sucessfully unswitched
> by proposed patch. I understand that it does not catch deeper loop
> nest as
>    for (i=0; i<10; i++)
>      for (j=0;j<n;j++)
>         for (k=0;k<20;k++)
>   ...
> but duplication of middle-loop does not look reasonable.
>
> Here is dump for your second test-case:
>
> void foo(int *ie, int *je, double *x)
> {
>   int i, j;
>   for (j=0; j<*je; ++j)
>     for (i=0; i<*ie; ++i)
>       x[i+j] = 0.0;
> }
> grep -i unswitch t6.c.119t.unswitch
> ;; Unswitching outer loop

I was saying that why go with a limited approach when a patch (in
unknown state...)
is available that does it more generally?  Also unswitching is quite
expensive compared
to "moving" the invariant condition.

In your patch:

+  if (!nloop->force_vectorize)
+    nloop->force_vectorize = true;
+  if (loop->safelen != 0)
+    nloop->safelen = loop->safelen;

I see no guard on force_vectorize so = true looks bogus here.  Please just use
copy_loop_info.

+  if (integer_nonzerop (cond_new))
+    gimple_cond_set_condition_from_tree (cond_stmt, boolean_true_node);
+  else if (integer_zerop (cond_new))
+    gimple_cond_set_condition_from_tree (cond_stmt, boolean_false_node);

gimple_cond_make_true/false (cond_stmt);

btw, seems odd that we have to recompute which loop is the true / false variant
when we just fed a guard condition to loop_version.  Can't we statically
determine whether loop or nloop has the in-loop condition true or false?

+  /* Clean-up cfg to remove useless one-argument phi in exit block of
+     outer-loop.  */
+  cleanup_tree_cfg ();

I know unswitching is already O(number-of-unswitched-loops * size-of-function)
because it updates SSA form after each individual unswitching (and it does that
because it invokes itself recursively on unswitched loops).  But do you really
need to invoke CFG cleanup here?

Richard.

> Yuri.
>
> 2015-07-14 14:06 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Fri, Jul 10, 2015 at 12:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>> Hi All,
>>>
>>> Here is presented simple transformation which tries to hoist out of
>>> outer-loop a check on zero trip count for inner-loop. This is very
>>> restricted transformation since it accepts outer-loops with very
>>> simple cfg, as for example:
>>>     acc = 0;
>>>    for (i = 1; i <= m; i++) {
>>>       for (j = 0; j < n; j++)
>>>          if (l[j] == i) { v[j] = acc; acc++; };
>>>       acc <<= 1;
>>>    }
>>>
>>> Note that degenerative outer loop (without inner loop) will be
>>> completely deleted as dead code.
>>> The main goal of this transformation was to convert outer-loop to form
>>> accepted by outer-loop vectorization (such test-case is also included
>>> to patch).
>>>
>>> Bootstrap and regression testing did not show any new failures.
>>>
>>> Is it OK for trunk?
>>
>> I think this is
>>
>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=23855
>>
>> as well.  It has a patch adding a invariant loop guard hoisting
>> phase to loop-header copying.  Yeah, it needs updating to
>> trunk again I suppose.  It's always non-stage1 when I come
>> back to that patch.
>>
>> Your patch seems to be very specific and only handles outer
>> loops of innermost loops.
>>
>> Richard.
>>
>>> ChangeLog:
>>> 2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>
>>> * tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
>>> "gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
>>> (tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
>>> (tree_unswitch_outer_loop): New function.
>>>
>>> gcc/testsuite/ChangeLog:
>>> * gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
>>> * gcc.dg/vect/vect-outer-simd-3.c: New test.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-07-28 11:00     ` Richard Biener
@ 2015-07-31 12:07       ` Yuri Rumyantsev
  2015-07-31 15:54         ` Jeff Law
  2015-08-03  7:27         ` Richard Biener
  0 siblings, 2 replies; 17+ messages in thread
From: Yuri Rumyantsev @ 2015-07-31 12:07 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Igor Zamyatin

Hi Richard,

I learned your updated patch for 23825 and it is more general in
comparison with my.
I'd like to propose you a compromise - let's consider my patch only
for force-vectorize outer loop only to allow outer-loop
vecctorization. Note that your approach will not hoist invariant
guards if loops contains something else except for inner-loop, i.e. it
won't be empty for taken branch.
I also would like to answer on your last question - CFG cleanup is
invoked to perform deletion of single-argument phi nodes from tail
block through substitution - such phi's prevent outer-loop
vectorization. But it is clear that such transformation can be done
other pass.

What is your opinion?

Yuri.

2015-07-28 13:50 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Thu, Jul 23, 2015 at 4:45 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> Hi Richard,
>>
>> I checked that both test-cases from 23855 are sucessfully unswitched
>> by proposed patch. I understand that it does not catch deeper loop
>> nest as
>>    for (i=0; i<10; i++)
>>      for (j=0;j<n;j++)
>>         for (k=0;k<20;k++)
>>   ...
>> but duplication of middle-loop does not look reasonable.
>>
>> Here is dump for your second test-case:
>>
>> void foo(int *ie, int *je, double *x)
>> {
>>   int i, j;
>>   for (j=0; j<*je; ++j)
>>     for (i=0; i<*ie; ++i)
>>       x[i+j] = 0.0;
>> }
>> grep -i unswitch t6.c.119t.unswitch
>> ;; Unswitching outer loop
>
> I was saying that why go with a limited approach when a patch (in
> unknown state...)
> is available that does it more generally?  Also unswitching is quite
> expensive compared
> to "moving" the invariant condition.
>
> In your patch:
>
> +  if (!nloop->force_vectorize)
> +    nloop->force_vectorize = true;
> +  if (loop->safelen != 0)
> +    nloop->safelen = loop->safelen;
>
> I see no guard on force_vectorize so = true looks bogus here.  Please just use
> copy_loop_info.
>
> +  if (integer_nonzerop (cond_new))
> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_true_node);
> +  else if (integer_zerop (cond_new))
> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_false_node);
>
> gimple_cond_make_true/false (cond_stmt);
>
> btw, seems odd that we have to recompute which loop is the true / false variant
> when we just fed a guard condition to loop_version.  Can't we statically
> determine whether loop or nloop has the in-loop condition true or false?
>
> +  /* Clean-up cfg to remove useless one-argument phi in exit block of
> +     outer-loop.  */
> +  cleanup_tree_cfg ();
>
> I know unswitching is already O(number-of-unswitched-loops * size-of-function)
> because it updates SSA form after each individual unswitching (and it does that
> because it invokes itself recursively on unswitched loops).  But do you really
> need to invoke CFG cleanup here?
>
> Richard.
>
>> Yuri.
>>
>> 2015-07-14 14:06 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>> On Fri, Jul 10, 2015 at 12:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>> Hi All,
>>>>
>>>> Here is presented simple transformation which tries to hoist out of
>>>> outer-loop a check on zero trip count for inner-loop. This is very
>>>> restricted transformation since it accepts outer-loops with very
>>>> simple cfg, as for example:
>>>>     acc = 0;
>>>>    for (i = 1; i <= m; i++) {
>>>>       for (j = 0; j < n; j++)
>>>>          if (l[j] == i) { v[j] = acc; acc++; };
>>>>       acc <<= 1;
>>>>    }
>>>>
>>>> Note that degenerative outer loop (without inner loop) will be
>>>> completely deleted as dead code.
>>>> The main goal of this transformation was to convert outer-loop to form
>>>> accepted by outer-loop vectorization (such test-case is also included
>>>> to patch).
>>>>
>>>> Bootstrap and regression testing did not show any new failures.
>>>>
>>>> Is it OK for trunk?
>>>
>>> I think this is
>>>
>>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=23855
>>>
>>> as well.  It has a patch adding a invariant loop guard hoisting
>>> phase to loop-header copying.  Yeah, it needs updating to
>>> trunk again I suppose.  It's always non-stage1 when I come
>>> back to that patch.
>>>
>>> Your patch seems to be very specific and only handles outer
>>> loops of innermost loops.
>>>
>>> Richard.
>>>
>>>> ChangeLog:
>>>> 2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>
>>>> * tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
>>>> "gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
>>>> (tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
>>>> (tree_unswitch_outer_loop): New function.
>>>>
>>>> gcc/testsuite/ChangeLog:
>>>> * gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
>>>> * gcc.dg/vect/vect-outer-simd-3.c: New test.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-07-31 12:07       ` Yuri Rumyantsev
@ 2015-07-31 15:54         ` Jeff Law
  2015-08-03  7:27         ` Richard Biener
  1 sibling, 0 replies; 17+ messages in thread
From: Jeff Law @ 2015-07-31 15:54 UTC (permalink / raw)
  To: Yuri Rumyantsev, Richard Biener; +Cc: gcc-patches, Igor Zamyatin

On 07/31/2015 05:17 AM, Yuri Rumyantsev wrote:
> Hi Richard,
>
> I learned your updated patch for 23825 and it is more general in
> comparison with my.
> I'd like to propose you a compromise - let's consider my patch only
> for force-vectorize outer loop only to allow outer-loop
> vecctorization. Note that your approach will not hoist invariant
> guards if loops contains something else except for inner-loop, i.e. it
> won't be empty for taken branch.
> I also would like to answer on your last question - CFG cleanup is
> invoked to perform deletion of single-argument phi nodes from tail
> block through substitution - such phi's prevent outer-loop
> vectorization. But it is clear that such transformation can be done
> other pass.
A single-argument PHI is a degenerate.  phi-only-cprop might be your 
friend :-)

jeff

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-07-31 12:07       ` Yuri Rumyantsev
  2015-07-31 15:54         ` Jeff Law
@ 2015-08-03  7:27         ` Richard Biener
       [not found]           ` <CAEoMCqSorkh1WmFtVB_huC2hbcVr8uc1EYaRaNVe1g+5hVuzPw@mail.gmail.com>
  1 sibling, 1 reply; 17+ messages in thread
From: Richard Biener @ 2015-08-03  7:27 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: gcc-patches, Igor Zamyatin

On Fri, Jul 31, 2015 at 1:17 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Hi Richard,
>
> I learned your updated patch for 23825 and it is more general in
> comparison with my.
> I'd like to propose you a compromise - let's consider my patch only
> for force-vectorize outer loop only to allow outer-loop
> vecctorization.

I don't see why we should special-case that if the approach in 23825
is sensible.

> Note that your approach will not hoist invariant
> guards if loops contains something else except for inner-loop, i.e. it
> won't be empty for taken branch.

Yes, it does not perform unswitching but guard hoisting.  Note that this
is originally Zdenek Dvoraks patch.

> I also would like to answer on your last question - CFG cleanup is
> invoked to perform deletion of single-argument phi nodes from tail
> block through substitution - such phi's prevent outer-loop
> vectorization. But it is clear that such transformation can be done
> other pass.

Hmm, I wonder why the copy_prop pass after unswitching does not
get rid of them?

> What is your opinion?

My opinion is that if we want to enhance unswitching to catch this
(or similar) cases then we should make it a lot more general than
your pattern-matching approach.  I see nothing that should prevent
us from considering unswitching non-innermost loops in general.
It should be only a cost consideration to not do non-innermost loop
unswitching (in addition to maybe a --param specifying the maximum
depth of a loop nest to unswitch).

So my first thought when seeing your patch still holds - the patch
looks very much too specific.

Richard.

> Yuri.
>
> 2015-07-28 13:50 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Thu, Jul 23, 2015 at 4:45 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>> Hi Richard,
>>>
>>> I checked that both test-cases from 23855 are sucessfully unswitched
>>> by proposed patch. I understand that it does not catch deeper loop
>>> nest as
>>>    for (i=0; i<10; i++)
>>>      for (j=0;j<n;j++)
>>>         for (k=0;k<20;k++)
>>>   ...
>>> but duplication of middle-loop does not look reasonable.
>>>
>>> Here is dump for your second test-case:
>>>
>>> void foo(int *ie, int *je, double *x)
>>> {
>>>   int i, j;
>>>   for (j=0; j<*je; ++j)
>>>     for (i=0; i<*ie; ++i)
>>>       x[i+j] = 0.0;
>>> }
>>> grep -i unswitch t6.c.119t.unswitch
>>> ;; Unswitching outer loop
>>
>> I was saying that why go with a limited approach when a patch (in
>> unknown state...)
>> is available that does it more generally?  Also unswitching is quite
>> expensive compared
>> to "moving" the invariant condition.
>>
>> In your patch:
>>
>> +  if (!nloop->force_vectorize)
>> +    nloop->force_vectorize = true;
>> +  if (loop->safelen != 0)
>> +    nloop->safelen = loop->safelen;
>>
>> I see no guard on force_vectorize so = true looks bogus here.  Please just use
>> copy_loop_info.
>>
>> +  if (integer_nonzerop (cond_new))
>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_true_node);
>> +  else if (integer_zerop (cond_new))
>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_false_node);
>>
>> gimple_cond_make_true/false (cond_stmt);
>>
>> btw, seems odd that we have to recompute which loop is the true / false variant
>> when we just fed a guard condition to loop_version.  Can't we statically
>> determine whether loop or nloop has the in-loop condition true or false?
>>
>> +  /* Clean-up cfg to remove useless one-argument phi in exit block of
>> +     outer-loop.  */
>> +  cleanup_tree_cfg ();
>>
>> I know unswitching is already O(number-of-unswitched-loops * size-of-function)
>> because it updates SSA form after each individual unswitching (and it does that
>> because it invokes itself recursively on unswitched loops).  But do you really
>> need to invoke CFG cleanup here?
>>
>> Richard.
>>
>>> Yuri.
>>>
>>> 2015-07-14 14:06 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>> On Fri, Jul 10, 2015 at 12:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>> Hi All,
>>>>>
>>>>> Here is presented simple transformation which tries to hoist out of
>>>>> outer-loop a check on zero trip count for inner-loop. This is very
>>>>> restricted transformation since it accepts outer-loops with very
>>>>> simple cfg, as for example:
>>>>>     acc = 0;
>>>>>    for (i = 1; i <= m; i++) {
>>>>>       for (j = 0; j < n; j++)
>>>>>          if (l[j] == i) { v[j] = acc; acc++; };
>>>>>       acc <<= 1;
>>>>>    }
>>>>>
>>>>> Note that degenerative outer loop (without inner loop) will be
>>>>> completely deleted as dead code.
>>>>> The main goal of this transformation was to convert outer-loop to form
>>>>> accepted by outer-loop vectorization (such test-case is also included
>>>>> to patch).
>>>>>
>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>
>>>>> Is it OK for trunk?
>>>>
>>>> I think this is
>>>>
>>>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=23855
>>>>
>>>> as well.  It has a patch adding a invariant loop guard hoisting
>>>> phase to loop-header copying.  Yeah, it needs updating to
>>>> trunk again I suppose.  It's always non-stage1 when I come
>>>> back to that patch.
>>>>
>>>> Your patch seems to be very specific and only handles outer
>>>> loops of innermost loops.
>>>>
>>>> Richard.
>>>>
>>>>> ChangeLog:
>>>>> 2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>
>>>>> * tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
>>>>> "gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
>>>>> (tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
>>>>> (tree_unswitch_outer_loop): New function.
>>>>>
>>>>> gcc/testsuite/ChangeLog:
>>>>> * gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
>>>>> * gcc.dg/vect/vect-outer-simd-3.c: New test.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
       [not found]                 ` <CAFiYyc2O9i690A0LZ0+jEOP8nkyz8Btc0YAb469aMgnRaVsmsQ@mail.gmail.com>
@ 2015-09-30 11:40                   ` Yuri Rumyantsev
  2015-10-05 10:57                     ` Richard Biener
  0 siblings, 1 reply; 17+ messages in thread
From: Yuri Rumyantsev @ 2015-09-30 11:40 UTC (permalink / raw)
  To: Richard Biener, gcc-patches, Igor Zamyatin

[-- Attachment #1: Type: text/plain, Size: 10321 bytes --]

Hi Richard,

I re-designed outer loop unswitching using basic idea of 23855 patch -
hoist invariant guard if loop is empty without guard. Note that this
was added to loop unswitching pass with simple modifications - using
another loop iterator etc.

Bootstrap and regression testing did not show any new failures.
What is your opinion?

Thanks.

ChangeLog:
2015-09-30  Yuri Rumyantsev  <ysrumyan@gmail.com>

* tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
"cfghooks.h", add prototypes for introduced new functions.
(tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
checks on ability of loop unswitching to tree_unswitch_single_loop;
invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
on innermost loop check.
(tree_unswitch_single_loop): Add all required checks on ability of
loop unswitching under zero recursive level guard.
(tree_unswitch_outer_loop): New function.
(find_loop_guard): Likewise.
(empty_bb_without_guard_p): Likewise.
(used_outside_loop_p): Likewise.
(hoist_guard): Likewise.
(check_exit_phi): Likewise.

   gcc/testsuite/ChangeLog:
* gcc.dg/loop-unswitch-2.c: New test.

2015-09-16 11:26 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> Yeah, as said, the patch wasn't fully ready and it also felt odd to do
> this hoisting in loop header copying.  Integrating it
> with LIM would be a better fit eventually.
>
> Note that we did agree to go forward with your original patch just
> making it more "generically" perform outer loop
> unswitching.  Did you explore that idea further?
>
>
>
> On Tue, Sep 15, 2015 at 6:00 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> Thanks Richard.
>>
>> I found one more issue that could not be fixed simply. In 23855 you
>> consider the following test-case:
>> void foo(int *ie, int *je, double *x)
>> {
>>   int i, j;
>>   for (j=0; j<*je; ++j)
>>     for (i=0; i<*ie; ++i)
>>       x[i+j] = 0.0;
>> }
>> and proposed to hoist up a check on *ie out of loop. It requires
>> memref alias analysis since in general x and ie can alias (if their
>> types are compatible - int *ie & int * x). Such analysis is performed
>> by pre or lim passes. Without such analysis we can not hoist a test on
>> non-zero for *ie out of loop using 238565 patch.
>>  The second concern is that proposed copy header algorithm changes
>> loop structure significantly and it is not accepted by vectorizer
>> since latch is not empty (such transformation assumes loop peeling for
>> one iteration. So I can propose to implement simple guard hoisting
>> without copying header and tail blocks (if it is possible).
>>
>> I will appreciate you for any advice or help since without such
>> hoisting we are not able to perform outer loop vectorization for
>> important benchmark.
>> and
>>
>> 2015-09-15 14:22 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>> On Thu, Sep 3, 2015 at 6:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>> Hi Richard,
>>>>
>>>> I started learning, tuning and debugging patch proposed in 23855 and
>>>> discovered thta it does not work properly.
>>>> So I wonder is it tested patch and it should work?
>>>
>>> I don't remember, but as it wasn't committed it certainly wasn't ready.
>>>
>>>> Should it accept for hoisting the following loop nest
>>>>   for (i=0; i<n; i++) {
>>>>     s = 0;
>>>>     for (j=0; j<m; j++)
>>>>       s += a[i] * b[j];
>>>>     c[i] = s;
>>>>   }
>>>> Note that i-loop will nit be empty if m is equal to 0.
>>>
>>> if m is equal to 0 then we still have the c[i] = s store, no?  Of course
>>> we could unswitch the outer loop on m == 0 but simple hoisting wouldn't work.
>>>
>>> Richard.
>>>
>>>> 2015-08-03 10:27 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>> On Fri, Jul 31, 2015 at 1:17 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>> Hi Richard,
>>>>>>
>>>>>> I learned your updated patch for 23825 and it is more general in
>>>>>> comparison with my.
>>>>>> I'd like to propose you a compromise - let's consider my patch only
>>>>>> for force-vectorize outer loop only to allow outer-loop
>>>>>> vecctorization.
>>>>>
>>>>> I don't see why we should special-case that if the approach in 23825
>>>>> is sensible.
>>>>>
>>>>>> Note that your approach will not hoist invariant
>>>>>> guards if loops contains something else except for inner-loop, i.e. it
>>>>>> won't be empty for taken branch.
>>>>>
>>>>> Yes, it does not perform unswitching but guard hoisting.  Note that this
>>>>> is originally Zdenek Dvoraks patch.
>>>>>
>>>>>> I also would like to answer on your last question - CFG cleanup is
>>>>>> invoked to perform deletion of single-argument phi nodes from tail
>>>>>> block through substitution - such phi's prevent outer-loop
>>>>>> vectorization. But it is clear that such transformation can be done
>>>>>> other pass.
>>>>>
>>>>> Hmm, I wonder why the copy_prop pass after unswitching does not
>>>>> get rid of them?
>>>>>
>>>>>> What is your opinion?
>>>>>
>>>>> My opinion is that if we want to enhance unswitching to catch this
>>>>> (or similar) cases then we should make it a lot more general than
>>>>> your pattern-matching approach.  I see nothing that should prevent
>>>>> us from considering unswitching non-innermost loops in general.
>>>>> It should be only a cost consideration to not do non-innermost loop
>>>>> unswitching (in addition to maybe a --param specifying the maximum
>>>>> depth of a loop nest to unswitch).
>>>>>
>>>>> So my first thought when seeing your patch still holds - the patch
>>>>> looks very much too specific.
>>>>>
>>>>> Richard.
>>>>>
>>>>>> Yuri.
>>>>>>
>>>>>> 2015-07-28 13:50 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>> On Thu, Jul 23, 2015 at 4:45 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>> Hi Richard,
>>>>>>>>
>>>>>>>> I checked that both test-cases from 23855 are sucessfully unswitched
>>>>>>>> by proposed patch. I understand that it does not catch deeper loop
>>>>>>>> nest as
>>>>>>>>    for (i=0; i<10; i++)
>>>>>>>>      for (j=0;j<n;j++)
>>>>>>>>         for (k=0;k<20;k++)
>>>>>>>>   ...
>>>>>>>> but duplication of middle-loop does not look reasonable.
>>>>>>>>
>>>>>>>> Here is dump for your second test-case:
>>>>>>>>
>>>>>>>> void foo(int *ie, int *je, double *x)
>>>>>>>> {
>>>>>>>>   int i, j;
>>>>>>>>   for (j=0; j<*je; ++j)
>>>>>>>>     for (i=0; i<*ie; ++i)
>>>>>>>>       x[i+j] = 0.0;
>>>>>>>> }
>>>>>>>> grep -i unswitch t6.c.119t.unswitch
>>>>>>>> ;; Unswitching outer loop
>>>>>>>
>>>>>>> I was saying that why go with a limited approach when a patch (in
>>>>>>> unknown state...)
>>>>>>> is available that does it more generally?  Also unswitching is quite
>>>>>>> expensive compared
>>>>>>> to "moving" the invariant condition.
>>>>>>>
>>>>>>> In your patch:
>>>>>>>
>>>>>>> +  if (!nloop->force_vectorize)
>>>>>>> +    nloop->force_vectorize = true;
>>>>>>> +  if (loop->safelen != 0)
>>>>>>> +    nloop->safelen = loop->safelen;
>>>>>>>
>>>>>>> I see no guard on force_vectorize so = true looks bogus here.  Please just use
>>>>>>> copy_loop_info.
>>>>>>>
>>>>>>> +  if (integer_nonzerop (cond_new))
>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_true_node);
>>>>>>> +  else if (integer_zerop (cond_new))
>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_false_node);
>>>>>>>
>>>>>>> gimple_cond_make_true/false (cond_stmt);
>>>>>>>
>>>>>>> btw, seems odd that we have to recompute which loop is the true / false variant
>>>>>>> when we just fed a guard condition to loop_version.  Can't we statically
>>>>>>> determine whether loop or nloop has the in-loop condition true or false?
>>>>>>>
>>>>>>> +  /* Clean-up cfg to remove useless one-argument phi in exit block of
>>>>>>> +     outer-loop.  */
>>>>>>> +  cleanup_tree_cfg ();
>>>>>>>
>>>>>>> I know unswitching is already O(number-of-unswitched-loops * size-of-function)
>>>>>>> because it updates SSA form after each individual unswitching (and it does that
>>>>>>> because it invokes itself recursively on unswitched loops).  But do you really
>>>>>>> need to invoke CFG cleanup here?
>>>>>>>
>>>>>>> Richard.
>>>>>>>
>>>>>>>> Yuri.
>>>>>>>>
>>>>>>>> 2015-07-14 14:06 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>> On Fri, Jul 10, 2015 at 12:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>> Hi All,
>>>>>>>>>>
>>>>>>>>>> Here is presented simple transformation which tries to hoist out of
>>>>>>>>>> outer-loop a check on zero trip count for inner-loop. This is very
>>>>>>>>>> restricted transformation since it accepts outer-loops with very
>>>>>>>>>> simple cfg, as for example:
>>>>>>>>>>     acc = 0;
>>>>>>>>>>    for (i = 1; i <= m; i++) {
>>>>>>>>>>       for (j = 0; j < n; j++)
>>>>>>>>>>          if (l[j] == i) { v[j] = acc; acc++; };
>>>>>>>>>>       acc <<= 1;
>>>>>>>>>>    }
>>>>>>>>>>
>>>>>>>>>> Note that degenerative outer loop (without inner loop) will be
>>>>>>>>>> completely deleted as dead code.
>>>>>>>>>> The main goal of this transformation was to convert outer-loop to form
>>>>>>>>>> accepted by outer-loop vectorization (such test-case is also included
>>>>>>>>>> to patch).
>>>>>>>>>>
>>>>>>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>>>>>>
>>>>>>>>>> Is it OK for trunk?
>>>>>>>>>
>>>>>>>>> I think this is
>>>>>>>>>
>>>>>>>>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=23855
>>>>>>>>>
>>>>>>>>> as well.  It has a patch adding a invariant loop guard hoisting
>>>>>>>>> phase to loop-header copying.  Yeah, it needs updating to
>>>>>>>>> trunk again I suppose.  It's always non-stage1 when I come
>>>>>>>>> back to that patch.
>>>>>>>>>
>>>>>>>>> Your patch seems to be very specific and only handles outer
>>>>>>>>> loops of innermost loops.
>>>>>>>>>
>>>>>>>>> Richard.
>>>>>>>>>
>>>>>>>>>> ChangeLog:
>>>>>>>>>> 2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>>>>>>
>>>>>>>>>> * tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
>>>>>>>>>> "gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
>>>>>>>>>> (tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
>>>>>>>>>> (tree_unswitch_outer_loop): New function.
>>>>>>>>>>
>>>>>>>>>> gcc/testsuite/ChangeLog:
>>>>>>>>>> * gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
>>>>>>>>>> * gcc.dg/vect/vect-outer-simd-3.c: New test.

[-- Attachment #2: patch.new --]
[-- Type: application/octet-stream, Size: 14901 bytes --]

diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-2.c b/gcc/testsuite/gcc.dg/loop-unswitch-2.c
new file mode 100644
index 0000000..012c07b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details" } */
+
+void foo (float **a, float **b, float *c, int n, int m, int l)
+{
+  int i,j,k;
+  float s;
+  for (i=0; i<l; i++)
+    {
+      for (j=0; j<n; j++)
+	for (k=0; k<m; k++)
+	  c[i] += a[i][k] * b[k][j];
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "guard hoisted" 2 "unswitch" } } */ 
diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
index a273638..9385503 100644
--- a/gcc/tree-ssa-loop-unswitch.c
+++ b/gcc/tree-ssa-loop-unswitch.c
@@ -39,6 +39,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "tree-pass.h"
 #include "tree-inline.h"
+#include "gimple-iterator.h"
+#include "cfghooks.h"
 
 /* This file implements the loop unswitching, i.e. transformation of loops like
 
@@ -79,6 +81,12 @@ along with GCC; see the file COPYING3.  If not see
 static struct loop *tree_unswitch_loop (struct loop *, basic_block, tree);
 static bool tree_unswitch_single_loop (struct loop *, int);
 static tree tree_may_unswitch_on (basic_block, struct loop *);
+static bool tree_unswitch_outer_loop (struct loop *);
+static edge find_loop_guard (struct loop *);
+static bool empty_bb_without_guard_p (struct loop *, basic_block);
+static bool used_outside_loop_p (struct loop *, tree);
+static void hoist_guard (struct loop *, edge);
+static bool check_exit_phi (struct loop *);
 
 /* Main entry point.  Perform loop unswitching on all suitable loops.  */
 
@@ -87,42 +95,15 @@ tree_ssa_unswitch_loops (void)
 {
   struct loop *loop;
   bool changed = false;
-  HOST_WIDE_INT iterations;
 
-  /* Go through inner loops (only original ones).  */
-  FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
+  /* Go through all loops starting from innermost.  */
+  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
     {
-      if (dump_file && (dump_flags & TDF_DETAILS))
-        fprintf (dump_file, ";; Considering loop %d\n", loop->num);
-
-      /* Do not unswitch in cold regions. */
-      if (optimize_loop_for_size_p (loop))
-        {
-          if (dump_file && (dump_flags & TDF_DETAILS))
-            fprintf (dump_file, ";; Not unswitching cold loops\n");
-          continue;
-        }
-
-      /* The loop should not be too large, to limit code growth. */
-      if (tree_num_loop_insns (loop, &eni_size_weights)
-          > (unsigned) PARAM_VALUE (PARAM_MAX_UNSWITCH_INSNS))
-        {
-          if (dump_file && (dump_flags & TDF_DETAILS))
-            fprintf (dump_file, ";; Not unswitching, loop too big\n");
-          continue;
-        }
-
-      /* If the loop is not expected to iterate, there is no need
-	 for unswitching.  */
-      iterations = estimated_loop_iterations_int (loop);
-      if (iterations >= 0 && iterations <= 1)
-	{
-          if (dump_file && (dump_flags & TDF_DETAILS))
-            fprintf (dump_file, ";; Not unswitching, loop is not expected to iterate\n");
-          continue;
-	}
-
-      changed |= tree_unswitch_single_loop (loop, 0);
+      if (!loop->inner)
+	/* Unswitch innermost loop.  */
+	changed |= tree_unswitch_single_loop (loop, 0);
+      else
+	changed |= tree_unswitch_outer_loop (loop);
     }
 
   if (changed)
@@ -216,6 +197,39 @@ tree_unswitch_single_loop (struct loop *loop, int num)
   tree cond = NULL_TREE;
   gimple stmt;
   bool changed = false;
+  HOST_WIDE_INT iterations;
+
+  /* Perform initial tests if unswitch is eligible.  */
+  if (num == 0)
+    {
+      /* Do not unswitch in cold regions. */
+      if (optimize_loop_for_size_p (loop))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, ";; Not unswitching cold loops\n");
+	  return false;
+	}
+
+      /* The loop should not be too large, to limit code growth. */
+      if (tree_num_loop_insns (loop, &eni_size_weights)
+	  > (unsigned) PARAM_VALUE (PARAM_MAX_UNSWITCH_INSNS))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, ";; Not unswitching, loop too big\n");
+	  return false;
+	}
+
+      /* If the loop is not expected to iterate, there is no need
+	 for unswitching.  */
+      iterations = estimated_loop_iterations_int (loop);
+      if (iterations >= 0 && iterations <= 1)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, ";; Not unswitching, loop is not expected"
+		     " to iterate\n");
+	  return false;
+	}
+    }
 
   i = 0;
   bbs = get_loop_body (loop);
@@ -403,6 +417,359 @@ tree_unswitch_loop (struct loop *loop,
 		       REG_BR_PROB_BASE - prob_true, false);
 }
 
+/* Unswitch outer loops by hoisting invariant guard on
+   inner loop without code duplication.  */
+static bool
+tree_unswitch_outer_loop (struct loop *loop)
+{
+  edge exit, guard;
+
+  gcc_assert (loop->inner);
+  if (loop->inner->next)
+    return false;
+  /* Accept loops with single exit only.  */
+  exit = single_exit (loop);
+  if (!exit)
+    return false;
+  /* Check that phi argument of exit edge is not defined inside loop.  */
+  if (!check_exit_phi (loop))
+    return false;
+  /* Loop must not be infinite.  */
+  if (!finite_loop_p (loop))
+    return false;
+  guard = find_loop_guard (loop);
+  if (guard)
+    {
+      hoist_guard (loop, guard);
+      update_ssa (TODO_update_ssa);
+      return true;
+    }
+  return false;
+}
+
+/* Checks if the body of the LOOP is within an invariant guard.  If this
+   is the case, returns the edge that jumps over the real body of the loop,
+   otherwise returns NULL.  */
+
+static edge
+find_loop_guard (struct loop *loop)
+{
+  basic_block header = loop->header;
+  edge guard_edge, te, fe;
+  /* bitmap processed, known_invariants;*/
+  basic_block *body = NULL;
+  unsigned i;
+  tree use;
+  ssa_op_iter iter;
+
+  /* We check for the following situation:
+
+     while (1)
+       {
+	 [header]]
+         loop_phi_nodes;
+	 something1;
+	 if (cond1)
+	   body;
+	 nvar = phi(orig, bvar) ... for all variables changed in body;
+	 [guard_end]
+	 something2;
+	 if (cond2)
+	   break;
+	 something3;
+       }
+
+     where:
+
+     1) cond1 is loop invariant
+     2) If cond1 is false, then the loop is essentially empty; i.e.,
+	a) nothing in something1, something2 and something3 has side
+	   effects
+	b) anything defined in something1, something2 and something3
+	   is not used outside of the loop.  */
+
+  while (single_succ_p (header))
+    header = single_succ (header);
+  if (!last_stmt (header)
+      || gimple_code (last_stmt (header)) != GIMPLE_COND)
+    return NULL;
+
+  extract_true_false_edges_from_block (header, &te, &fe);
+  if (!flow_bb_inside_loop_p (loop, te->dest)
+      || !flow_bb_inside_loop_p (loop, fe->dest))
+    return NULL;
+
+  if (just_once_each_iteration_p (loop, te->dest)
+      || (single_succ_p (te->dest)
+	  && just_once_each_iteration_p (loop, single_succ (te->dest))))
+    {
+      if (just_once_each_iteration_p (loop, fe->dest))
+	return NULL;
+      guard_edge = te;
+    }
+  else if (just_once_each_iteration_p (loop, fe->dest)
+	   || (single_succ_p (fe->dest)
+	       && just_once_each_iteration_p (loop, single_succ (fe->dest))))
+    guard_edge = fe;
+  else
+    return NULL;
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file,
+	     "Considering guard %d -> %d in loop %d\n",
+	     guard_edge->src->index, guard_edge->dest->index, loop->num);
+  /* Check if condition operands do not have definitions inside loop since
+     any bb copying is not performed.  */
+  FOR_EACH_SSA_TREE_OPERAND (use, last_stmt (header), iter, SSA_OP_USE)
+    {
+      gimple def = SSA_NAME_DEF_STMT (use);
+      basic_block def_bb = gimple_bb (def);
+      if (def_bb
+          && flow_bb_inside_loop_p (loop, def_bb))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "  guard operands have definitions"
+				" inside loop\n");
+	  return NULL;
+	}
+    }
+
+  body = get_loop_body_in_dom_order (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      if (body[i]->loop_father != loop)
+	continue;
+      if (!empty_bb_without_guard_p (loop, body[i]))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "  block %d has side effects\n", body[i]->index);
+	  guard_edge = NULL;
+	  goto end;
+	}
+    }
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "  suitable to hoist\n");
+end:
+  if (body)
+    free (body);
+  return guard_edge;
+}
+
+/* Returns true if
+   1) no statement in BB has side effects
+   2) assuming that edge GUARD is always taken, all definitions in BB
+      are noy used outside of the loop.
+   KNOWN_INVARIANTS is a set of ssa names we know to be invariant, and
+   PROCESSED is a set of ssa names for that we already tested whether they
+   are invariant or not.  */
+
+static bool
+empty_bb_without_guard_p (struct loop *loop, basic_block bb)
+{
+  basic_block exit_bb = single_exit (loop)->src;
+  bool may_be_used_outside = (bb == exit_bb
+			      || !dominated_by_p (CDI_DOMINATORS, bb, exit_bb));
+  tree name;
+  ssa_op_iter op_iter;
+
+  /* Phi nodes do not have side effects, but their results might be used
+     outside of the loop.  */
+  if (may_be_used_outside)
+    {
+      for (gphi_iterator gsi = gsi_start_phis (bb);
+	   !gsi_end_p (gsi); gsi_next (&gsi))
+	{
+	  gphi *phi = gsi.phi ();
+	  name = PHI_RESULT (phi);
+	  if (virtual_operand_p (name))
+	    continue;
+
+	  if (used_outside_loop_p (loop, name))
+	    return false;
+	}
+    }
+
+  for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple stmt = gsi_stmt (gsi);
+      if (gimple_has_side_effects (stmt))
+	return false;
+
+      if (gimple_vdef(stmt))
+	return false;
+
+      FOR_EACH_SSA_TREE_OPERAND (name, stmt, op_iter, SSA_OP_DEF)
+	{
+	  if (may_be_used_outside
+	      && used_outside_loop_p (loop, name))
+	    return false;
+	}
+    }
+  return true;
+}
+
+/* Return true if NAME is used outside of LOOP.  */
+
+static bool
+used_outside_loop_p (struct loop *loop, tree name)
+{
+  imm_use_iterator it;
+  use_operand_p use;
+
+  FOR_EACH_IMM_USE_FAST (use, it, name)
+    {
+      gimple stmt = USE_STMT (use);
+      if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
+	return true;
+    }
+
+  return false;
+}
+
+/* Moves the check of GUARD outside of LOOP.  */
+
+static void
+hoist_guard (struct loop *loop, edge guard)
+{
+  edge exit = single_exit (loop);
+  edge preh = loop_preheader_edge (loop);
+  basic_block pre_header = preh->src;
+  basic_block bb;
+  edge te, fe, e, tmpe, new_edge;
+  gimple stmt;
+  basic_block guard_bb = guard->src;
+  gimple_stmt_iterator gsi;
+  int flags = 0;
+  bool fix_dom_of_exit;
+
+  gcc_assert (single_succ_p (pre_header));
+  bb = get_immediate_dominator (CDI_DOMINATORS, exit->dest);
+  fix_dom_of_exit = flow_bb_inside_loop_p (loop, bb);
+  /* Hoist GUARD out of loop.  */
+  gsi = gsi_last_bb (guard_bb);
+  stmt = gsi_stmt (gsi);
+  gcc_assert (gimple_code (stmt) == GIMPLE_COND);
+  extract_true_false_edges_from_block (guard_bb, &te, &fe);
+  tmpe = (guard == te) ? fe : te;
+  tmpe->flags &=~ (EDGE_TRUE_VALUE | EDGE_FALSE_VALUE);
+  flags = tmpe->flags;
+  tmpe->flags |= EDGE_FALLTHRU;
+  gsi_remove (&gsi, false);
+  bb = guard->dest;
+  remove_edge (guard);
+  /* Update dominance for destination of GUARD.  */
+  if (EDGE_COUNT (bb->preds) == 0)
+    {
+      basic_block s_bb;
+      gcc_assert (single_succ_p (bb));
+      s_bb = single_succ (bb);
+      delete_basic_block (bb);
+      if (single_pred_p (s_bb))
+	set_immediate_dominator (CDI_DOMINATORS, s_bb, single_pred (s_bb));
+      else
+	{
+	  basic_block dom_bb = recompute_dominator (CDI_DOMINATORS, s_bb);
+	  gcc_assert (dom_bb);
+	  set_immediate_dominator (CDI_DOMINATORS, s_bb, dom_bb);
+	}
+    }
+  else if (single_pred_p (bb))
+    set_immediate_dominator (CDI_DOMINATORS, bb, single_pred (bb));
+  else
+    {
+      basic_block dom_bb = recompute_dominator (CDI_DOMINATORS, bb);;
+      gcc_assert (dom_bb);
+      set_immediate_dominator (CDI_DOMINATORS, bb, dom_bb);
+    }
+  /* Insert guard to PRE_HEADER.  */
+  if (!empty_block_p (pre_header))
+    gsi = gsi_last_bb (pre_header);
+  else
+    gsi = gsi_start_bb (pre_header);
+  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+  /* Create new loop pre-header.  */
+  e = split_block (pre_header, last_stmt (pre_header));
+  gcc_assert (loop_preheader_edge (loop)->src == e->dest);
+  if (guard == fe)
+    {
+      e->flags = EDGE_TRUE_VALUE;
+      flags |= EDGE_FALSE_VALUE;
+    }
+  else
+    {
+      e->flags = EDGE_FALSE_VALUE;
+      flags |= EDGE_TRUE_VALUE;
+    }
+  new_edge = make_edge (pre_header, exit->dest, flags);
+  if (fix_dom_of_exit)
+    set_immediate_dominator (CDI_DOMINATORS, exit->dest, pre_header);
+  update_stmt (gsi_stmt (gsi));
+  /* Add NEW_ADGE argument for all phi in post-header block.  */
+  bb = exit->dest;
+  for (gphi_iterator gsi = gsi_start_phis (bb);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      /* edge_iterator ei; */
+      tree arg;
+      if (virtual_operand_p (gimple_phi_result (phi)))
+	{
+	  arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
+	  add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
+	}
+      else
+	{
+	  /* Use exit edge argument.  */
+	  arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+	  add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
+	}
+    }
+
+  mark_virtual_operands_for_renaming (cfun);
+  update_ssa (TODO_update_ssa);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "  guard hoisted.\n");
+}
+
+/* Return true if phi argument for exit edge can be used
+   for edge around loop.  */
+
+static bool
+check_exit_phi (struct loop *loop)
+{
+  edge exit = single_exit (loop);
+  basic_block pre_header = loop_preheader_edge (loop)->src;
+
+  for (gphi_iterator gsi = gsi_start_phis (exit->dest);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      tree arg;
+      gimple def;
+      basic_block def_bb;
+      if (virtual_operand_p (gimple_phi_result (phi)))
+	continue;
+      arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+      if (TREE_CODE (arg) != SSA_NAME)
+	continue;
+      def = SSA_NAME_DEF_STMT (arg);
+      if (!def)
+	continue;
+      def_bb = gimple_bb (def);
+      if (!def_bb)
+	continue;
+      if (!dominated_by_p (CDI_DOMINATORS, pre_header, def_bb))
+	/* Definition inside loop!  */
+	return false;
+      /* Check loop closed phi invariant.  */
+      if (!flow_bb_inside_loop_p (def_bb->loop_father, pre_header))
+	return false;
+    }
+  return true;
+}
+
 /* Loop unswitching pass.  */
 
 namespace {

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-09-30 11:40                   ` Yuri Rumyantsev
@ 2015-10-05 10:57                     ` Richard Biener
  2015-10-05 13:13                       ` Yuri Rumyantsev
  0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2015-10-05 10:57 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: gcc-patches, Igor Zamyatin

On Wed, Sep 30, 2015 at 12:46 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Hi Richard,
>
> I re-designed outer loop unswitching using basic idea of 23855 patch -
> hoist invariant guard if loop is empty without guard. Note that this
> was added to loop unswitching pass with simple modifications - using
> another loop iterator etc.
>
> Bootstrap and regression testing did not show any new failures.
> What is your opinion?

Overall it looks good.  Some comments below - a few more testcases would
be nice as well.

+  /* Loop must not be infinite.  */
+  if (!finite_loop_p (loop))
+    return false;

why's that?

+  body = get_loop_body_in_dom_order (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      if (body[i]->loop_father != loop)
+       continue;
+      if (!empty_bb_without_guard_p (loop, body[i]))

I wonder if there is a better way to iterate over the interesting
blocks and PHIs
we need to check for side-effects (and thus we maybe can avoid gathering
the loop in DOM order).

+      FOR_EACH_SSA_TREE_OPERAND (name, stmt, op_iter, SSA_OP_DEF)
+       {
+         if (may_be_used_outside

may_be_used_outside can be hoisted above the loop.  I wonder if we can take
advantage of loop-closed SSA form here (and the fact we have a single exit
from the loop).  Iterating over exit dest PHIs and determining whether the
exit edge DEF is inside the loop part it may not be should be enough.

+  gcc_assert (single_succ_p (pre_header));

that should be always true.

+  gsi_remove (&gsi, false);
+  bb = guard->dest;
+  remove_edge (guard);
+  /* Update dominance for destination of GUARD.  */
+  if (EDGE_COUNT (bb->preds) == 0)
+    {
+      basic_block s_bb;
+      gcc_assert (single_succ_p (bb));
+      s_bb = single_succ (bb);
+      delete_basic_block (bb);
+      if (single_pred_p (s_bb))
+       set_immediate_dominator (CDI_DOMINATORS, s_bb, single_pred (s_bb));

all this massaging should be simplified by leaving it to CFG cleanup by
simply adjusting the CONDs condition to always true/false.  There is
gimple_cond_make_{true,false} () for this (would be nice to have a variant
taking a bool).

+  new_edge = make_edge (pre_header, exit->dest, flags);
+  if (fix_dom_of_exit)
+    set_immediate_dominator (CDI_DOMINATORS, exit->dest, pre_header);
+  update_stmt (gsi_stmt (gsi));

the update_stmt should be not necessary, it's done by gsi_insert_after already.

+  /* Add NEW_ADGE argument for all phi in post-header block.  */
+  bb = exit->dest;
+  for (gphi_iterator gsi = gsi_start_phis (bb);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      /* edge_iterator ei; */
+      tree arg;
+      if (virtual_operand_p (gimple_phi_result (phi)))
+       {
+         arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
+         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
+       }
+      else
+       {
+         /* Use exit edge argument.  */
+         arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);

Hum.  How is it ok to use the exit edge argument for the edge that skips
the loop?  Why can't you always use the pre-header edge value?
That is, if we have

 for(i=0;i<m;++i)
   {
     if (n > 0)
    {
     for (;;)
       {
       }
     }
   }
  ... = i;

then i is used after the loop and the correct value to use if
n > 0 is false is '0'.  Maybe this way we can also relax
what check_exit_phi does?  IMHO the only restriction is
if sth defined inside the loop before the header check for
the inner loop is used after the loop.

Thanks,
Richard.

> Thanks.
>
> ChangeLog:
> 2015-09-30  Yuri Rumyantsev  <ysrumyan@gmail.com>
>
> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
> "cfghooks.h", add prototypes for introduced new functions.
> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
> checks on ability of loop unswitching to tree_unswitch_single_loop;
> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
> on innermost loop check.
> (tree_unswitch_single_loop): Add all required checks on ability of
> loop unswitching under zero recursive level guard.
> (tree_unswitch_outer_loop): New function.
> (find_loop_guard): Likewise.
> (empty_bb_without_guard_p): Likewise.
> (used_outside_loop_p): Likewise.
> (hoist_guard): Likewise.
> (check_exit_phi): Likewise.
>
>    gcc/testsuite/ChangeLog:
> * gcc.dg/loop-unswitch-2.c: New test.
>
> 2015-09-16 11:26 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> Yeah, as said, the patch wasn't fully ready and it also felt odd to do
>> this hoisting in loop header copying.  Integrating it
>> with LIM would be a better fit eventually.
>>
>> Note that we did agree to go forward with your original patch just
>> making it more "generically" perform outer loop
>> unswitching.  Did you explore that idea further?
>>
>>
>>
>> On Tue, Sep 15, 2015 at 6:00 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>> Thanks Richard.
>>>
>>> I found one more issue that could not be fixed simply. In 23855 you
>>> consider the following test-case:
>>> void foo(int *ie, int *je, double *x)
>>> {
>>>   int i, j;
>>>   for (j=0; j<*je; ++j)
>>>     for (i=0; i<*ie; ++i)
>>>       x[i+j] = 0.0;
>>> }
>>> and proposed to hoist up a check on *ie out of loop. It requires
>>> memref alias analysis since in general x and ie can alias (if their
>>> types are compatible - int *ie & int * x). Such analysis is performed
>>> by pre or lim passes. Without such analysis we can not hoist a test on
>>> non-zero for *ie out of loop using 238565 patch.
>>>  The second concern is that proposed copy header algorithm changes
>>> loop structure significantly and it is not accepted by vectorizer
>>> since latch is not empty (such transformation assumes loop peeling for
>>> one iteration. So I can propose to implement simple guard hoisting
>>> without copying header and tail blocks (if it is possible).
>>>
>>> I will appreciate you for any advice or help since without such
>>> hoisting we are not able to perform outer loop vectorization for
>>> important benchmark.
>>> and
>>>
>>> 2015-09-15 14:22 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>> On Thu, Sep 3, 2015 at 6:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>> Hi Richard,
>>>>>
>>>>> I started learning, tuning and debugging patch proposed in 23855 and
>>>>> discovered thta it does not work properly.
>>>>> So I wonder is it tested patch and it should work?
>>>>
>>>> I don't remember, but as it wasn't committed it certainly wasn't ready.
>>>>
>>>>> Should it accept for hoisting the following loop nest
>>>>>   for (i=0; i<n; i++) {
>>>>>     s = 0;
>>>>>     for (j=0; j<m; j++)
>>>>>       s += a[i] * b[j];
>>>>>     c[i] = s;
>>>>>   }
>>>>> Note that i-loop will nit be empty if m is equal to 0.
>>>>
>>>> if m is equal to 0 then we still have the c[i] = s store, no?  Of course
>>>> we could unswitch the outer loop on m == 0 but simple hoisting wouldn't work.
>>>>
>>>> Richard.
>>>>
>>>>> 2015-08-03 10:27 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>> On Fri, Jul 31, 2015 at 1:17 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>> Hi Richard,
>>>>>>>
>>>>>>> I learned your updated patch for 23825 and it is more general in
>>>>>>> comparison with my.
>>>>>>> I'd like to propose you a compromise - let's consider my patch only
>>>>>>> for force-vectorize outer loop only to allow outer-loop
>>>>>>> vecctorization.
>>>>>>
>>>>>> I don't see why we should special-case that if the approach in 23825
>>>>>> is sensible.
>>>>>>
>>>>>>> Note that your approach will not hoist invariant
>>>>>>> guards if loops contains something else except for inner-loop, i.e. it
>>>>>>> won't be empty for taken branch.
>>>>>>
>>>>>> Yes, it does not perform unswitching but guard hoisting.  Note that this
>>>>>> is originally Zdenek Dvoraks patch.
>>>>>>
>>>>>>> I also would like to answer on your last question - CFG cleanup is
>>>>>>> invoked to perform deletion of single-argument phi nodes from tail
>>>>>>> block through substitution - such phi's prevent outer-loop
>>>>>>> vectorization. But it is clear that such transformation can be done
>>>>>>> other pass.
>>>>>>
>>>>>> Hmm, I wonder why the copy_prop pass after unswitching does not
>>>>>> get rid of them?
>>>>>>
>>>>>>> What is your opinion?
>>>>>>
>>>>>> My opinion is that if we want to enhance unswitching to catch this
>>>>>> (or similar) cases then we should make it a lot more general than
>>>>>> your pattern-matching approach.  I see nothing that should prevent
>>>>>> us from considering unswitching non-innermost loops in general.
>>>>>> It should be only a cost consideration to not do non-innermost loop
>>>>>> unswitching (in addition to maybe a --param specifying the maximum
>>>>>> depth of a loop nest to unswitch).
>>>>>>
>>>>>> So my first thought when seeing your patch still holds - the patch
>>>>>> looks very much too specific.
>>>>>>
>>>>>> Richard.
>>>>>>
>>>>>>> Yuri.
>>>>>>>
>>>>>>> 2015-07-28 13:50 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>> On Thu, Jul 23, 2015 at 4:45 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>> Hi Richard,
>>>>>>>>>
>>>>>>>>> I checked that both test-cases from 23855 are sucessfully unswitched
>>>>>>>>> by proposed patch. I understand that it does not catch deeper loop
>>>>>>>>> nest as
>>>>>>>>>    for (i=0; i<10; i++)
>>>>>>>>>      for (j=0;j<n;j++)
>>>>>>>>>         for (k=0;k<20;k++)
>>>>>>>>>   ...
>>>>>>>>> but duplication of middle-loop does not look reasonable.
>>>>>>>>>
>>>>>>>>> Here is dump for your second test-case:
>>>>>>>>>
>>>>>>>>> void foo(int *ie, int *je, double *x)
>>>>>>>>> {
>>>>>>>>>   int i, j;
>>>>>>>>>   for (j=0; j<*je; ++j)
>>>>>>>>>     for (i=0; i<*ie; ++i)
>>>>>>>>>       x[i+j] = 0.0;
>>>>>>>>> }
>>>>>>>>> grep -i unswitch t6.c.119t.unswitch
>>>>>>>>> ;; Unswitching outer loop
>>>>>>>>
>>>>>>>> I was saying that why go with a limited approach when a patch (in
>>>>>>>> unknown state...)
>>>>>>>> is available that does it more generally?  Also unswitching is quite
>>>>>>>> expensive compared
>>>>>>>> to "moving" the invariant condition.
>>>>>>>>
>>>>>>>> In your patch:
>>>>>>>>
>>>>>>>> +  if (!nloop->force_vectorize)
>>>>>>>> +    nloop->force_vectorize = true;
>>>>>>>> +  if (loop->safelen != 0)
>>>>>>>> +    nloop->safelen = loop->safelen;
>>>>>>>>
>>>>>>>> I see no guard on force_vectorize so = true looks bogus here.  Please just use
>>>>>>>> copy_loop_info.
>>>>>>>>
>>>>>>>> +  if (integer_nonzerop (cond_new))
>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_true_node);
>>>>>>>> +  else if (integer_zerop (cond_new))
>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_false_node);
>>>>>>>>
>>>>>>>> gimple_cond_make_true/false (cond_stmt);
>>>>>>>>
>>>>>>>> btw, seems odd that we have to recompute which loop is the true / false variant
>>>>>>>> when we just fed a guard condition to loop_version.  Can't we statically
>>>>>>>> determine whether loop or nloop has the in-loop condition true or false?
>>>>>>>>
>>>>>>>> +  /* Clean-up cfg to remove useless one-argument phi in exit block of
>>>>>>>> +     outer-loop.  */
>>>>>>>> +  cleanup_tree_cfg ();
>>>>>>>>
>>>>>>>> I know unswitching is already O(number-of-unswitched-loops * size-of-function)
>>>>>>>> because it updates SSA form after each individual unswitching (and it does that
>>>>>>>> because it invokes itself recursively on unswitched loops).  But do you really
>>>>>>>> need to invoke CFG cleanup here?
>>>>>>>>
>>>>>>>> Richard.
>>>>>>>>
>>>>>>>>> Yuri.
>>>>>>>>>
>>>>>>>>> 2015-07-14 14:06 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>> On Fri, Jul 10, 2015 at 12:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>> Hi All,
>>>>>>>>>>>
>>>>>>>>>>> Here is presented simple transformation which tries to hoist out of
>>>>>>>>>>> outer-loop a check on zero trip count for inner-loop. This is very
>>>>>>>>>>> restricted transformation since it accepts outer-loops with very
>>>>>>>>>>> simple cfg, as for example:
>>>>>>>>>>>     acc = 0;
>>>>>>>>>>>    for (i = 1; i <= m; i++) {
>>>>>>>>>>>       for (j = 0; j < n; j++)
>>>>>>>>>>>          if (l[j] == i) { v[j] = acc; acc++; };
>>>>>>>>>>>       acc <<= 1;
>>>>>>>>>>>    }
>>>>>>>>>>>
>>>>>>>>>>> Note that degenerative outer loop (without inner loop) will be
>>>>>>>>>>> completely deleted as dead code.
>>>>>>>>>>> The main goal of this transformation was to convert outer-loop to form
>>>>>>>>>>> accepted by outer-loop vectorization (such test-case is also included
>>>>>>>>>>> to patch).
>>>>>>>>>>>
>>>>>>>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>>>>>>>
>>>>>>>>>>> Is it OK for trunk?
>>>>>>>>>>
>>>>>>>>>> I think this is
>>>>>>>>>>
>>>>>>>>>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=23855
>>>>>>>>>>
>>>>>>>>>> as well.  It has a patch adding a invariant loop guard hoisting
>>>>>>>>>> phase to loop-header copying.  Yeah, it needs updating to
>>>>>>>>>> trunk again I suppose.  It's always non-stage1 when I come
>>>>>>>>>> back to that patch.
>>>>>>>>>>
>>>>>>>>>> Your patch seems to be very specific and only handles outer
>>>>>>>>>> loops of innermost loops.
>>>>>>>>>>
>>>>>>>>>> Richard.
>>>>>>>>>>
>>>>>>>>>>> ChangeLog:
>>>>>>>>>>> 2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>>>>>>>
>>>>>>>>>>> * tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
>>>>>>>>>>> "gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
>>>>>>>>>>> (tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
>>>>>>>>>>> (tree_unswitch_outer_loop): New function.
>>>>>>>>>>>
>>>>>>>>>>> gcc/testsuite/ChangeLog:
>>>>>>>>>>> * gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
>>>>>>>>>>> * gcc.dg/vect/vect-outer-simd-3.c: New test.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-10-05 10:57                     ` Richard Biener
@ 2015-10-05 13:13                       ` Yuri Rumyantsev
  2015-10-06  7:59                         ` Richard Biener
  0 siblings, 1 reply; 17+ messages in thread
From: Yuri Rumyantsev @ 2015-10-05 13:13 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Igor Zamyatin

Thanks Richard.
I'd like to answer on your last comment related to using of exit edge
argument for edge that skips loop.
Let's consider the following test-case:

#include <stdlib.h>
#define N 32
float *foo(int ustride, int size, float *src)
{
   float *buffer, *p;
   int i, k;

   if (!src)
    return NULL;

   buffer = (float *) malloc(N * size * sizeof(float));

   if(buffer)
      for(i=0, p=buffer; i<N; i++, src+=ustride)
for(k=0; k<size; k++)
 *p++ = src[k];

   return buffer;
}

Before adding new edge we have in post-header bb:
  <bb 9>:
  # _6 = PHI <0B(8), buffer_20(16)>
  return _6;

It is clear that we must preserve function semantic and transform it to
_6 = PHI <0B(12), buffer_19(9), buffer_19(4)>


2015-10-05 13:57 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Wed, Sep 30, 2015 at 12:46 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> Hi Richard,
>>
>> I re-designed outer loop unswitching using basic idea of 23855 patch -
>> hoist invariant guard if loop is empty without guard. Note that this
>> was added to loop unswitching pass with simple modifications - using
>> another loop iterator etc.
>>
>> Bootstrap and regression testing did not show any new failures.
>> What is your opinion?
>
> Overall it looks good.  Some comments below - a few more testcases would
> be nice as well.
>
> +  /* Loop must not be infinite.  */
> +  if (!finite_loop_p (loop))
> +    return false;
>
> why's that?
>
> +  body = get_loop_body_in_dom_order (loop);
> +  for (i = 0; i < loop->num_nodes; i++)
> +    {
> +      if (body[i]->loop_father != loop)
> +       continue;
> +      if (!empty_bb_without_guard_p (loop, body[i]))
>
> I wonder if there is a better way to iterate over the interesting
> blocks and PHIs
> we need to check for side-effects (and thus we maybe can avoid gathering
> the loop in DOM order).
>
> +      FOR_EACH_SSA_TREE_OPERAND (name, stmt, op_iter, SSA_OP_DEF)
> +       {
> +         if (may_be_used_outside
>
> may_be_used_outside can be hoisted above the loop.  I wonder if we can take
> advantage of loop-closed SSA form here (and the fact we have a single exit
> from the loop).  Iterating over exit dest PHIs and determining whether the
> exit edge DEF is inside the loop part it may not be should be enough.
>
> +  gcc_assert (single_succ_p (pre_header));
>
> that should be always true.
>
> +  gsi_remove (&gsi, false);
> +  bb = guard->dest;
> +  remove_edge (guard);
> +  /* Update dominance for destination of GUARD.  */
> +  if (EDGE_COUNT (bb->preds) == 0)
> +    {
> +      basic_block s_bb;
> +      gcc_assert (single_succ_p (bb));
> +      s_bb = single_succ (bb);
> +      delete_basic_block (bb);
> +      if (single_pred_p (s_bb))
> +       set_immediate_dominator (CDI_DOMINATORS, s_bb, single_pred (s_bb));
>
> all this massaging should be simplified by leaving it to CFG cleanup by
> simply adjusting the CONDs condition to always true/false.  There is
> gimple_cond_make_{true,false} () for this (would be nice to have a variant
> taking a bool).
>
> +  new_edge = make_edge (pre_header, exit->dest, flags);
> +  if (fix_dom_of_exit)
> +    set_immediate_dominator (CDI_DOMINATORS, exit->dest, pre_header);
> +  update_stmt (gsi_stmt (gsi));
>
> the update_stmt should be not necessary, it's done by gsi_insert_after already.
>
> +  /* Add NEW_ADGE argument for all phi in post-header block.  */
> +  bb = exit->dest;
> +  for (gphi_iterator gsi = gsi_start_phis (bb);
> +       !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      gphi *phi = gsi.phi ();
> +      /* edge_iterator ei; */
> +      tree arg;
> +      if (virtual_operand_p (gimple_phi_result (phi)))
> +       {
> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
> +       }
> +      else
> +       {
> +         /* Use exit edge argument.  */
> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>
> Hum.  How is it ok to use the exit edge argument for the edge that skips
> the loop?  Why can't you always use the pre-header edge value?
> That is, if we have
>
>  for(i=0;i<m;++i)
>    {
>      if (n > 0)
>     {
>      for (;;)
>        {
>        }
>      }
>    }
>   ... = i;
>
> then i is used after the loop and the correct value to use if
> n > 0 is false is '0'.  Maybe this way we can also relax
> what check_exit_phi does?  IMHO the only restriction is
> if sth defined inside the loop before the header check for
> the inner loop is used after the loop.
>
> Thanks,
> Richard.
>
>> Thanks.
>>
>> ChangeLog:
>> 2015-09-30  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>
>> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
>> "cfghooks.h", add prototypes for introduced new functions.
>> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
>> checks on ability of loop unswitching to tree_unswitch_single_loop;
>> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
>> on innermost loop check.
>> (tree_unswitch_single_loop): Add all required checks on ability of
>> loop unswitching under zero recursive level guard.
>> (tree_unswitch_outer_loop): New function.
>> (find_loop_guard): Likewise.
>> (empty_bb_without_guard_p): Likewise.
>> (used_outside_loop_p): Likewise.
>> (hoist_guard): Likewise.
>> (check_exit_phi): Likewise.
>>
>>    gcc/testsuite/ChangeLog:
>> * gcc.dg/loop-unswitch-2.c: New test.
>>
>> 2015-09-16 11:26 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>> Yeah, as said, the patch wasn't fully ready and it also felt odd to do
>>> this hoisting in loop header copying.  Integrating it
>>> with LIM would be a better fit eventually.
>>>
>>> Note that we did agree to go forward with your original patch just
>>> making it more "generically" perform outer loop
>>> unswitching.  Did you explore that idea further?
>>>
>>>
>>>
>>> On Tue, Sep 15, 2015 at 6:00 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>> Thanks Richard.
>>>>
>>>> I found one more issue that could not be fixed simply. In 23855 you
>>>> consider the following test-case:
>>>> void foo(int *ie, int *je, double *x)
>>>> {
>>>>   int i, j;
>>>>   for (j=0; j<*je; ++j)
>>>>     for (i=0; i<*ie; ++i)
>>>>       x[i+j] = 0.0;
>>>> }
>>>> and proposed to hoist up a check on *ie out of loop. It requires
>>>> memref alias analysis since in general x and ie can alias (if their
>>>> types are compatible - int *ie & int * x). Such analysis is performed
>>>> by pre or lim passes. Without such analysis we can not hoist a test on
>>>> non-zero for *ie out of loop using 238565 patch.
>>>>  The second concern is that proposed copy header algorithm changes
>>>> loop structure significantly and it is not accepted by vectorizer
>>>> since latch is not empty (such transformation assumes loop peeling for
>>>> one iteration. So I can propose to implement simple guard hoisting
>>>> without copying header and tail blocks (if it is possible).
>>>>
>>>> I will appreciate you for any advice or help since without such
>>>> hoisting we are not able to perform outer loop vectorization for
>>>> important benchmark.
>>>> and
>>>>
>>>> 2015-09-15 14:22 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>> On Thu, Sep 3, 2015 at 6:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>> Hi Richard,
>>>>>>
>>>>>> I started learning, tuning and debugging patch proposed in 23855 and
>>>>>> discovered thta it does not work properly.
>>>>>> So I wonder is it tested patch and it should work?
>>>>>
>>>>> I don't remember, but as it wasn't committed it certainly wasn't ready.
>>>>>
>>>>>> Should it accept for hoisting the following loop nest
>>>>>>   for (i=0; i<n; i++) {
>>>>>>     s = 0;
>>>>>>     for (j=0; j<m; j++)
>>>>>>       s += a[i] * b[j];
>>>>>>     c[i] = s;
>>>>>>   }
>>>>>> Note that i-loop will nit be empty if m is equal to 0.
>>>>>
>>>>> if m is equal to 0 then we still have the c[i] = s store, no?  Of course
>>>>> we could unswitch the outer loop on m == 0 but simple hoisting wouldn't work.
>>>>>
>>>>> Richard.
>>>>>
>>>>>> 2015-08-03 10:27 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>> On Fri, Jul 31, 2015 at 1:17 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>> Hi Richard,
>>>>>>>>
>>>>>>>> I learned your updated patch for 23825 and it is more general in
>>>>>>>> comparison with my.
>>>>>>>> I'd like to propose you a compromise - let's consider my patch only
>>>>>>>> for force-vectorize outer loop only to allow outer-loop
>>>>>>>> vecctorization.
>>>>>>>
>>>>>>> I don't see why we should special-case that if the approach in 23825
>>>>>>> is sensible.
>>>>>>>
>>>>>>>> Note that your approach will not hoist invariant
>>>>>>>> guards if loops contains something else except for inner-loop, i.e. it
>>>>>>>> won't be empty for taken branch.
>>>>>>>
>>>>>>> Yes, it does not perform unswitching but guard hoisting.  Note that this
>>>>>>> is originally Zdenek Dvoraks patch.
>>>>>>>
>>>>>>>> I also would like to answer on your last question - CFG cleanup is
>>>>>>>> invoked to perform deletion of single-argument phi nodes from tail
>>>>>>>> block through substitution - such phi's prevent outer-loop
>>>>>>>> vectorization. But it is clear that such transformation can be done
>>>>>>>> other pass.
>>>>>>>
>>>>>>> Hmm, I wonder why the copy_prop pass after unswitching does not
>>>>>>> get rid of them?
>>>>>>>
>>>>>>>> What is your opinion?
>>>>>>>
>>>>>>> My opinion is that if we want to enhance unswitching to catch this
>>>>>>> (or similar) cases then we should make it a lot more general than
>>>>>>> your pattern-matching approach.  I see nothing that should prevent
>>>>>>> us from considering unswitching non-innermost loops in general.
>>>>>>> It should be only a cost consideration to not do non-innermost loop
>>>>>>> unswitching (in addition to maybe a --param specifying the maximum
>>>>>>> depth of a loop nest to unswitch).
>>>>>>>
>>>>>>> So my first thought when seeing your patch still holds - the patch
>>>>>>> looks very much too specific.
>>>>>>>
>>>>>>> Richard.
>>>>>>>
>>>>>>>> Yuri.
>>>>>>>>
>>>>>>>> 2015-07-28 13:50 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>> On Thu, Jul 23, 2015 at 4:45 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>> Hi Richard,
>>>>>>>>>>
>>>>>>>>>> I checked that both test-cases from 23855 are sucessfully unswitched
>>>>>>>>>> by proposed patch. I understand that it does not catch deeper loop
>>>>>>>>>> nest as
>>>>>>>>>>    for (i=0; i<10; i++)
>>>>>>>>>>      for (j=0;j<n;j++)
>>>>>>>>>>         for (k=0;k<20;k++)
>>>>>>>>>>   ...
>>>>>>>>>> but duplication of middle-loop does not look reasonable.
>>>>>>>>>>
>>>>>>>>>> Here is dump for your second test-case:
>>>>>>>>>>
>>>>>>>>>> void foo(int *ie, int *je, double *x)
>>>>>>>>>> {
>>>>>>>>>>   int i, j;
>>>>>>>>>>   for (j=0; j<*je; ++j)
>>>>>>>>>>     for (i=0; i<*ie; ++i)
>>>>>>>>>>       x[i+j] = 0.0;
>>>>>>>>>> }
>>>>>>>>>> grep -i unswitch t6.c.119t.unswitch
>>>>>>>>>> ;; Unswitching outer loop
>>>>>>>>>
>>>>>>>>> I was saying that why go with a limited approach when a patch (in
>>>>>>>>> unknown state...)
>>>>>>>>> is available that does it more generally?  Also unswitching is quite
>>>>>>>>> expensive compared
>>>>>>>>> to "moving" the invariant condition.
>>>>>>>>>
>>>>>>>>> In your patch:
>>>>>>>>>
>>>>>>>>> +  if (!nloop->force_vectorize)
>>>>>>>>> +    nloop->force_vectorize = true;
>>>>>>>>> +  if (loop->safelen != 0)
>>>>>>>>> +    nloop->safelen = loop->safelen;
>>>>>>>>>
>>>>>>>>> I see no guard on force_vectorize so = true looks bogus here.  Please just use
>>>>>>>>> copy_loop_info.
>>>>>>>>>
>>>>>>>>> +  if (integer_nonzerop (cond_new))
>>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_true_node);
>>>>>>>>> +  else if (integer_zerop (cond_new))
>>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_false_node);
>>>>>>>>>
>>>>>>>>> gimple_cond_make_true/false (cond_stmt);
>>>>>>>>>
>>>>>>>>> btw, seems odd that we have to recompute which loop is the true / false variant
>>>>>>>>> when we just fed a guard condition to loop_version.  Can't we statically
>>>>>>>>> determine whether loop or nloop has the in-loop condition true or false?
>>>>>>>>>
>>>>>>>>> +  /* Clean-up cfg to remove useless one-argument phi in exit block of
>>>>>>>>> +     outer-loop.  */
>>>>>>>>> +  cleanup_tree_cfg ();
>>>>>>>>>
>>>>>>>>> I know unswitching is already O(number-of-unswitched-loops * size-of-function)
>>>>>>>>> because it updates SSA form after each individual unswitching (and it does that
>>>>>>>>> because it invokes itself recursively on unswitched loops).  But do you really
>>>>>>>>> need to invoke CFG cleanup here?
>>>>>>>>>
>>>>>>>>> Richard.
>>>>>>>>>
>>>>>>>>>> Yuri.
>>>>>>>>>>
>>>>>>>>>> 2015-07-14 14:06 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>> On Fri, Jul 10, 2015 at 12:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>> Hi All,
>>>>>>>>>>>>
>>>>>>>>>>>> Here is presented simple transformation which tries to hoist out of
>>>>>>>>>>>> outer-loop a check on zero trip count for inner-loop. This is very
>>>>>>>>>>>> restricted transformation since it accepts outer-loops with very
>>>>>>>>>>>> simple cfg, as for example:
>>>>>>>>>>>>     acc = 0;
>>>>>>>>>>>>    for (i = 1; i <= m; i++) {
>>>>>>>>>>>>       for (j = 0; j < n; j++)
>>>>>>>>>>>>          if (l[j] == i) { v[j] = acc; acc++; };
>>>>>>>>>>>>       acc <<= 1;
>>>>>>>>>>>>    }
>>>>>>>>>>>>
>>>>>>>>>>>> Note that degenerative outer loop (without inner loop) will be
>>>>>>>>>>>> completely deleted as dead code.
>>>>>>>>>>>> The main goal of this transformation was to convert outer-loop to form
>>>>>>>>>>>> accepted by outer-loop vectorization (such test-case is also included
>>>>>>>>>>>> to patch).
>>>>>>>>>>>>
>>>>>>>>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>>>>>>>>
>>>>>>>>>>>> Is it OK for trunk?
>>>>>>>>>>>
>>>>>>>>>>> I think this is
>>>>>>>>>>>
>>>>>>>>>>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=23855
>>>>>>>>>>>
>>>>>>>>>>> as well.  It has a patch adding a invariant loop guard hoisting
>>>>>>>>>>> phase to loop-header copying.  Yeah, it needs updating to
>>>>>>>>>>> trunk again I suppose.  It's always non-stage1 when I come
>>>>>>>>>>> back to that patch.
>>>>>>>>>>>
>>>>>>>>>>> Your patch seems to be very specific and only handles outer
>>>>>>>>>>> loops of innermost loops.
>>>>>>>>>>>
>>>>>>>>>>> Richard.
>>>>>>>>>>>
>>>>>>>>>>>> ChangeLog:
>>>>>>>>>>>> 2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>>>>>>>>
>>>>>>>>>>>> * tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
>>>>>>>>>>>> "gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
>>>>>>>>>>>> (tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
>>>>>>>>>>>> (tree_unswitch_outer_loop): New function.
>>>>>>>>>>>>
>>>>>>>>>>>> gcc/testsuite/ChangeLog:
>>>>>>>>>>>> * gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
>>>>>>>>>>>> * gcc.dg/vect/vect-outer-simd-3.c: New test.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-10-05 13:13                       ` Yuri Rumyantsev
@ 2015-10-06  7:59                         ` Richard Biener
  2015-10-06 11:41                           ` Yuri Rumyantsev
  0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2015-10-06  7:59 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: gcc-patches, Igor Zamyatin

On Mon, Oct 5, 2015 at 3:13 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Thanks Richard.
> I'd like to answer on your last comment related to using of exit edge
> argument for edge that skips loop.
> Let's consider the following test-case:
>
> #include <stdlib.h>
> #define N 32
> float *foo(int ustride, int size, float *src)
> {
>    float *buffer, *p;
>    int i, k;
>
>    if (!src)
>     return NULL;
>
>    buffer = (float *) malloc(N * size * sizeof(float));
>
>    if(buffer)
>       for(i=0, p=buffer; i<N; i++, src+=ustride)
> for(k=0; k<size; k++)
>  *p++ = src[k];
>
>    return buffer;
> }
>
> Before adding new edge we have in post-header bb:
>   <bb 9>:
>   # _6 = PHI <0B(8), buffer_20(16)>
>   return _6;
>
> It is clear that we must preserve function semantic and transform it to
> _6 = PHI <0B(12), buffer_19(9), buffer_19(4)>

Ah, yeah.  I was confusing the loop exit of the inner vs. the outer loop.

Richard.

>
> 2015-10-05 13:57 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Wed, Sep 30, 2015 at 12:46 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>> Hi Richard,
>>>
>>> I re-designed outer loop unswitching using basic idea of 23855 patch -
>>> hoist invariant guard if loop is empty without guard. Note that this
>>> was added to loop unswitching pass with simple modifications - using
>>> another loop iterator etc.
>>>
>>> Bootstrap and regression testing did not show any new failures.
>>> What is your opinion?
>>
>> Overall it looks good.  Some comments below - a few more testcases would
>> be nice as well.
>>
>> +  /* Loop must not be infinite.  */
>> +  if (!finite_loop_p (loop))
>> +    return false;
>>
>> why's that?
>>
>> +  body = get_loop_body_in_dom_order (loop);
>> +  for (i = 0; i < loop->num_nodes; i++)
>> +    {
>> +      if (body[i]->loop_father != loop)
>> +       continue;
>> +      if (!empty_bb_without_guard_p (loop, body[i]))
>>
>> I wonder if there is a better way to iterate over the interesting
>> blocks and PHIs
>> we need to check for side-effects (and thus we maybe can avoid gathering
>> the loop in DOM order).
>>
>> +      FOR_EACH_SSA_TREE_OPERAND (name, stmt, op_iter, SSA_OP_DEF)
>> +       {
>> +         if (may_be_used_outside
>>
>> may_be_used_outside can be hoisted above the loop.  I wonder if we can take
>> advantage of loop-closed SSA form here (and the fact we have a single exit
>> from the loop).  Iterating over exit dest PHIs and determining whether the
>> exit edge DEF is inside the loop part it may not be should be enough.
>>
>> +  gcc_assert (single_succ_p (pre_header));
>>
>> that should be always true.
>>
>> +  gsi_remove (&gsi, false);
>> +  bb = guard->dest;
>> +  remove_edge (guard);
>> +  /* Update dominance for destination of GUARD.  */
>> +  if (EDGE_COUNT (bb->preds) == 0)
>> +    {
>> +      basic_block s_bb;
>> +      gcc_assert (single_succ_p (bb));
>> +      s_bb = single_succ (bb);
>> +      delete_basic_block (bb);
>> +      if (single_pred_p (s_bb))
>> +       set_immediate_dominator (CDI_DOMINATORS, s_bb, single_pred (s_bb));
>>
>> all this massaging should be simplified by leaving it to CFG cleanup by
>> simply adjusting the CONDs condition to always true/false.  There is
>> gimple_cond_make_{true,false} () for this (would be nice to have a variant
>> taking a bool).
>>
>> +  new_edge = make_edge (pre_header, exit->dest, flags);
>> +  if (fix_dom_of_exit)
>> +    set_immediate_dominator (CDI_DOMINATORS, exit->dest, pre_header);
>> +  update_stmt (gsi_stmt (gsi));
>>
>> the update_stmt should be not necessary, it's done by gsi_insert_after already.
>>
>> +  /* Add NEW_ADGE argument for all phi in post-header block.  */
>> +  bb = exit->dest;
>> +  for (gphi_iterator gsi = gsi_start_phis (bb);
>> +       !gsi_end_p (gsi); gsi_next (&gsi))
>> +    {
>> +      gphi *phi = gsi.phi ();
>> +      /* edge_iterator ei; */
>> +      tree arg;
>> +      if (virtual_operand_p (gimple_phi_result (phi)))
>> +       {
>> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
>> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>> +       }
>> +      else
>> +       {
>> +         /* Use exit edge argument.  */
>> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
>> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>>
>> Hum.  How is it ok to use the exit edge argument for the edge that skips
>> the loop?  Why can't you always use the pre-header edge value?
>> That is, if we have
>>
>>  for(i=0;i<m;++i)
>>    {
>>      if (n > 0)
>>     {
>>      for (;;)
>>        {
>>        }
>>      }
>>    }
>>   ... = i;
>>
>> then i is used after the loop and the correct value to use if
>> n > 0 is false is '0'.  Maybe this way we can also relax
>> what check_exit_phi does?  IMHO the only restriction is
>> if sth defined inside the loop before the header check for
>> the inner loop is used after the loop.
>>
>> Thanks,
>> Richard.
>>
>>> Thanks.
>>>
>>> ChangeLog:
>>> 2015-09-30  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>
>>> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
>>> "cfghooks.h", add prototypes for introduced new functions.
>>> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
>>> checks on ability of loop unswitching to tree_unswitch_single_loop;
>>> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
>>> on innermost loop check.
>>> (tree_unswitch_single_loop): Add all required checks on ability of
>>> loop unswitching under zero recursive level guard.
>>> (tree_unswitch_outer_loop): New function.
>>> (find_loop_guard): Likewise.
>>> (empty_bb_without_guard_p): Likewise.
>>> (used_outside_loop_p): Likewise.
>>> (hoist_guard): Likewise.
>>> (check_exit_phi): Likewise.
>>>
>>>    gcc/testsuite/ChangeLog:
>>> * gcc.dg/loop-unswitch-2.c: New test.
>>>
>>> 2015-09-16 11:26 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>> Yeah, as said, the patch wasn't fully ready and it also felt odd to do
>>>> this hoisting in loop header copying.  Integrating it
>>>> with LIM would be a better fit eventually.
>>>>
>>>> Note that we did agree to go forward with your original patch just
>>>> making it more "generically" perform outer loop
>>>> unswitching.  Did you explore that idea further?
>>>>
>>>>
>>>>
>>>> On Tue, Sep 15, 2015 at 6:00 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>> Thanks Richard.
>>>>>
>>>>> I found one more issue that could not be fixed simply. In 23855 you
>>>>> consider the following test-case:
>>>>> void foo(int *ie, int *je, double *x)
>>>>> {
>>>>>   int i, j;
>>>>>   for (j=0; j<*je; ++j)
>>>>>     for (i=0; i<*ie; ++i)
>>>>>       x[i+j] = 0.0;
>>>>> }
>>>>> and proposed to hoist up a check on *ie out of loop. It requires
>>>>> memref alias analysis since in general x and ie can alias (if their
>>>>> types are compatible - int *ie & int * x). Such analysis is performed
>>>>> by pre or lim passes. Without such analysis we can not hoist a test on
>>>>> non-zero for *ie out of loop using 238565 patch.
>>>>>  The second concern is that proposed copy header algorithm changes
>>>>> loop structure significantly and it is not accepted by vectorizer
>>>>> since latch is not empty (such transformation assumes loop peeling for
>>>>> one iteration. So I can propose to implement simple guard hoisting
>>>>> without copying header and tail blocks (if it is possible).
>>>>>
>>>>> I will appreciate you for any advice or help since without such
>>>>> hoisting we are not able to perform outer loop vectorization for
>>>>> important benchmark.
>>>>> and
>>>>>
>>>>> 2015-09-15 14:22 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>> On Thu, Sep 3, 2015 at 6:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>> Hi Richard,
>>>>>>>
>>>>>>> I started learning, tuning and debugging patch proposed in 23855 and
>>>>>>> discovered thta it does not work properly.
>>>>>>> So I wonder is it tested patch and it should work?
>>>>>>
>>>>>> I don't remember, but as it wasn't committed it certainly wasn't ready.
>>>>>>
>>>>>>> Should it accept for hoisting the following loop nest
>>>>>>>   for (i=0; i<n; i++) {
>>>>>>>     s = 0;
>>>>>>>     for (j=0; j<m; j++)
>>>>>>>       s += a[i] * b[j];
>>>>>>>     c[i] = s;
>>>>>>>   }
>>>>>>> Note that i-loop will nit be empty if m is equal to 0.
>>>>>>
>>>>>> if m is equal to 0 then we still have the c[i] = s store, no?  Of course
>>>>>> we could unswitch the outer loop on m == 0 but simple hoisting wouldn't work.
>>>>>>
>>>>>> Richard.
>>>>>>
>>>>>>> 2015-08-03 10:27 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>> On Fri, Jul 31, 2015 at 1:17 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>> Hi Richard,
>>>>>>>>>
>>>>>>>>> I learned your updated patch for 23825 and it is more general in
>>>>>>>>> comparison with my.
>>>>>>>>> I'd like to propose you a compromise - let's consider my patch only
>>>>>>>>> for force-vectorize outer loop only to allow outer-loop
>>>>>>>>> vecctorization.
>>>>>>>>
>>>>>>>> I don't see why we should special-case that if the approach in 23825
>>>>>>>> is sensible.
>>>>>>>>
>>>>>>>>> Note that your approach will not hoist invariant
>>>>>>>>> guards if loops contains something else except for inner-loop, i.e. it
>>>>>>>>> won't be empty for taken branch.
>>>>>>>>
>>>>>>>> Yes, it does not perform unswitching but guard hoisting.  Note that this
>>>>>>>> is originally Zdenek Dvoraks patch.
>>>>>>>>
>>>>>>>>> I also would like to answer on your last question - CFG cleanup is
>>>>>>>>> invoked to perform deletion of single-argument phi nodes from tail
>>>>>>>>> block through substitution - such phi's prevent outer-loop
>>>>>>>>> vectorization. But it is clear that such transformation can be done
>>>>>>>>> other pass.
>>>>>>>>
>>>>>>>> Hmm, I wonder why the copy_prop pass after unswitching does not
>>>>>>>> get rid of them?
>>>>>>>>
>>>>>>>>> What is your opinion?
>>>>>>>>
>>>>>>>> My opinion is that if we want to enhance unswitching to catch this
>>>>>>>> (or similar) cases then we should make it a lot more general than
>>>>>>>> your pattern-matching approach.  I see nothing that should prevent
>>>>>>>> us from considering unswitching non-innermost loops in general.
>>>>>>>> It should be only a cost consideration to not do non-innermost loop
>>>>>>>> unswitching (in addition to maybe a --param specifying the maximum
>>>>>>>> depth of a loop nest to unswitch).
>>>>>>>>
>>>>>>>> So my first thought when seeing your patch still holds - the patch
>>>>>>>> looks very much too specific.
>>>>>>>>
>>>>>>>> Richard.
>>>>>>>>
>>>>>>>>> Yuri.
>>>>>>>>>
>>>>>>>>> 2015-07-28 13:50 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>> On Thu, Jul 23, 2015 at 4:45 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>
>>>>>>>>>>> I checked that both test-cases from 23855 are sucessfully unswitched
>>>>>>>>>>> by proposed patch. I understand that it does not catch deeper loop
>>>>>>>>>>> nest as
>>>>>>>>>>>    for (i=0; i<10; i++)
>>>>>>>>>>>      for (j=0;j<n;j++)
>>>>>>>>>>>         for (k=0;k<20;k++)
>>>>>>>>>>>   ...
>>>>>>>>>>> but duplication of middle-loop does not look reasonable.
>>>>>>>>>>>
>>>>>>>>>>> Here is dump for your second test-case:
>>>>>>>>>>>
>>>>>>>>>>> void foo(int *ie, int *je, double *x)
>>>>>>>>>>> {
>>>>>>>>>>>   int i, j;
>>>>>>>>>>>   for (j=0; j<*je; ++j)
>>>>>>>>>>>     for (i=0; i<*ie; ++i)
>>>>>>>>>>>       x[i+j] = 0.0;
>>>>>>>>>>> }
>>>>>>>>>>> grep -i unswitch t6.c.119t.unswitch
>>>>>>>>>>> ;; Unswitching outer loop
>>>>>>>>>>
>>>>>>>>>> I was saying that why go with a limited approach when a patch (in
>>>>>>>>>> unknown state...)
>>>>>>>>>> is available that does it more generally?  Also unswitching is quite
>>>>>>>>>> expensive compared
>>>>>>>>>> to "moving" the invariant condition.
>>>>>>>>>>
>>>>>>>>>> In your patch:
>>>>>>>>>>
>>>>>>>>>> +  if (!nloop->force_vectorize)
>>>>>>>>>> +    nloop->force_vectorize = true;
>>>>>>>>>> +  if (loop->safelen != 0)
>>>>>>>>>> +    nloop->safelen = loop->safelen;
>>>>>>>>>>
>>>>>>>>>> I see no guard on force_vectorize so = true looks bogus here.  Please just use
>>>>>>>>>> copy_loop_info.
>>>>>>>>>>
>>>>>>>>>> +  if (integer_nonzerop (cond_new))
>>>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_true_node);
>>>>>>>>>> +  else if (integer_zerop (cond_new))
>>>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_false_node);
>>>>>>>>>>
>>>>>>>>>> gimple_cond_make_true/false (cond_stmt);
>>>>>>>>>>
>>>>>>>>>> btw, seems odd that we have to recompute which loop is the true / false variant
>>>>>>>>>> when we just fed a guard condition to loop_version.  Can't we statically
>>>>>>>>>> determine whether loop or nloop has the in-loop condition true or false?
>>>>>>>>>>
>>>>>>>>>> +  /* Clean-up cfg to remove useless one-argument phi in exit block of
>>>>>>>>>> +     outer-loop.  */
>>>>>>>>>> +  cleanup_tree_cfg ();
>>>>>>>>>>
>>>>>>>>>> I know unswitching is already O(number-of-unswitched-loops * size-of-function)
>>>>>>>>>> because it updates SSA form after each individual unswitching (and it does that
>>>>>>>>>> because it invokes itself recursively on unswitched loops).  But do you really
>>>>>>>>>> need to invoke CFG cleanup here?
>>>>>>>>>>
>>>>>>>>>> Richard.
>>>>>>>>>>
>>>>>>>>>>> Yuri.
>>>>>>>>>>>
>>>>>>>>>>> 2015-07-14 14:06 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>>> On Fri, Jul 10, 2015 at 12:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>>> Hi All,
>>>>>>>>>>>>>
>>>>>>>>>>>>> Here is presented simple transformation which tries to hoist out of
>>>>>>>>>>>>> outer-loop a check on zero trip count for inner-loop. This is very
>>>>>>>>>>>>> restricted transformation since it accepts outer-loops with very
>>>>>>>>>>>>> simple cfg, as for example:
>>>>>>>>>>>>>     acc = 0;
>>>>>>>>>>>>>    for (i = 1; i <= m; i++) {
>>>>>>>>>>>>>       for (j = 0; j < n; j++)
>>>>>>>>>>>>>          if (l[j] == i) { v[j] = acc; acc++; };
>>>>>>>>>>>>>       acc <<= 1;
>>>>>>>>>>>>>    }
>>>>>>>>>>>>>
>>>>>>>>>>>>> Note that degenerative outer loop (without inner loop) will be
>>>>>>>>>>>>> completely deleted as dead code.
>>>>>>>>>>>>> The main goal of this transformation was to convert outer-loop to form
>>>>>>>>>>>>> accepted by outer-loop vectorization (such test-case is also included
>>>>>>>>>>>>> to patch).
>>>>>>>>>>>>>
>>>>>>>>>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Is it OK for trunk?
>>>>>>>>>>>>
>>>>>>>>>>>> I think this is
>>>>>>>>>>>>
>>>>>>>>>>>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=23855
>>>>>>>>>>>>
>>>>>>>>>>>> as well.  It has a patch adding a invariant loop guard hoisting
>>>>>>>>>>>> phase to loop-header copying.  Yeah, it needs updating to
>>>>>>>>>>>> trunk again I suppose.  It's always non-stage1 when I come
>>>>>>>>>>>> back to that patch.
>>>>>>>>>>>>
>>>>>>>>>>>> Your patch seems to be very specific and only handles outer
>>>>>>>>>>>> loops of innermost loops.
>>>>>>>>>>>>
>>>>>>>>>>>> Richard.
>>>>>>>>>>>>
>>>>>>>>>>>>> ChangeLog:
>>>>>>>>>>>>> 2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>>>>>>>>>
>>>>>>>>>>>>> * tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
>>>>>>>>>>>>> "gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
>>>>>>>>>>>>> (tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
>>>>>>>>>>>>> (tree_unswitch_outer_loop): New function.
>>>>>>>>>>>>>
>>>>>>>>>>>>> gcc/testsuite/ChangeLog:
>>>>>>>>>>>>> * gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
>>>>>>>>>>>>> * gcc.dg/vect/vect-outer-simd-3.c: New test.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-10-06  7:59                         ` Richard Biener
@ 2015-10-06 11:41                           ` Yuri Rumyantsev
  2015-10-06 12:21                             ` Richard Biener
  0 siblings, 1 reply; 17+ messages in thread
From: Yuri Rumyantsev @ 2015-10-06 11:41 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Igor Zamyatin

[-- Attachment #1: Type: text/plain, Size: 17303 bytes --]

Richard,

Here is updated patch which reflects almost all your remarks:
1. Use ordinary get_loop_body.
2. Delete useless asserts.
3. Use check on iterated loop instead of finite_loop_p.
4. Do not update CFG by adjusting the CONDs condition to always true/false.
5. Add couple tests.

ChangeLog:
2015-10-06  Yuri Rumyantsev  <ysrumyan@gmail.com>

* tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
"cfghooks.h", add prototypes for introduced new functions.
(tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
checks on ability of loop unswitching to tree_unswitch_single_loop;
invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
on innermost loop check.
(tree_unswitch_single_loop): Add all required checks on ability of
loop unswitching under zero recursive level guard.
(tree_unswitch_outer_loop): New function.
(find_loop_guard): Likewise.
(empty_bb_without_guard_p): Likewise.
(used_outside_loop_p): Likewise.
(hoist_guard): Likewise.
(check_exit_phi): Likewise.

   gcc/testsuite/ChangeLog:
* gcc.dg/loop-unswitch-2.c: New test.
* gcc.dg/loop-unswitch-3.c: Likewise.
* gcc.dg/loop-unswitch-4.c: Likewise.

2015-10-06 10:59 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Mon, Oct 5, 2015 at 3:13 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> Thanks Richard.
>> I'd like to answer on your last comment related to using of exit edge
>> argument for edge that skips loop.
>> Let's consider the following test-case:
>>
>> #include <stdlib.h>
>> #define N 32
>> float *foo(int ustride, int size, float *src)
>> {
>>    float *buffer, *p;
>>    int i, k;
>>
>>    if (!src)
>>     return NULL;
>>
>>    buffer = (float *) malloc(N * size * sizeof(float));
>>
>>    if(buffer)
>>       for(i=0, p=buffer; i<N; i++, src+=ustride)
>> for(k=0; k<size; k++)
>>  *p++ = src[k];
>>
>>    return buffer;
>> }
>>
>> Before adding new edge we have in post-header bb:
>>   <bb 9>:
>>   # _6 = PHI <0B(8), buffer_20(16)>
>>   return _6;
>>
>> It is clear that we must preserve function semantic and transform it to
>> _6 = PHI <0B(12), buffer_19(9), buffer_19(4)>
>
> Ah, yeah.  I was confusing the loop exit of the inner vs. the outer loop.
>
> Richard.
>
>>
>> 2015-10-05 13:57 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>> On Wed, Sep 30, 2015 at 12:46 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>> Hi Richard,
>>>>
>>>> I re-designed outer loop unswitching using basic idea of 23855 patch -
>>>> hoist invariant guard if loop is empty without guard. Note that this
>>>> was added to loop unswitching pass with simple modifications - using
>>>> another loop iterator etc.
>>>>
>>>> Bootstrap and regression testing did not show any new failures.
>>>> What is your opinion?
>>>
>>> Overall it looks good.  Some comments below - a few more testcases would
>>> be nice as well.
>>>
>>> +  /* Loop must not be infinite.  */
>>> +  if (!finite_loop_p (loop))
>>> +    return false;
>>>
>>> why's that?
>>>
>>> +  body = get_loop_body_in_dom_order (loop);
>>> +  for (i = 0; i < loop->num_nodes; i++)
>>> +    {
>>> +      if (body[i]->loop_father != loop)
>>> +       continue;
>>> +      if (!empty_bb_without_guard_p (loop, body[i]))
>>>
>>> I wonder if there is a better way to iterate over the interesting
>>> blocks and PHIs
>>> we need to check for side-effects (and thus we maybe can avoid gathering
>>> the loop in DOM order).
>>>
>>> +      FOR_EACH_SSA_TREE_OPERAND (name, stmt, op_iter, SSA_OP_DEF)
>>> +       {
>>> +         if (may_be_used_outside
>>>
>>> may_be_used_outside can be hoisted above the loop.  I wonder if we can take
>>> advantage of loop-closed SSA form here (and the fact we have a single exit
>>> from the loop).  Iterating over exit dest PHIs and determining whether the
>>> exit edge DEF is inside the loop part it may not be should be enough.
>>>
>>> +  gcc_assert (single_succ_p (pre_header));
>>>
>>> that should be always true.
>>>
>>> +  gsi_remove (&gsi, false);
>>> +  bb = guard->dest;
>>> +  remove_edge (guard);
>>> +  /* Update dominance for destination of GUARD.  */
>>> +  if (EDGE_COUNT (bb->preds) == 0)
>>> +    {
>>> +      basic_block s_bb;
>>> +      gcc_assert (single_succ_p (bb));
>>> +      s_bb = single_succ (bb);
>>> +      delete_basic_block (bb);
>>> +      if (single_pred_p (s_bb))
>>> +       set_immediate_dominator (CDI_DOMINATORS, s_bb, single_pred (s_bb));
>>>
>>> all this massaging should be simplified by leaving it to CFG cleanup by
>>> simply adjusting the CONDs condition to always true/false.  There is
>>> gimple_cond_make_{true,false} () for this (would be nice to have a variant
>>> taking a bool).
>>>
>>> +  new_edge = make_edge (pre_header, exit->dest, flags);
>>> +  if (fix_dom_of_exit)
>>> +    set_immediate_dominator (CDI_DOMINATORS, exit->dest, pre_header);
>>> +  update_stmt (gsi_stmt (gsi));
>>>
>>> the update_stmt should be not necessary, it's done by gsi_insert_after already.
>>>
>>> +  /* Add NEW_ADGE argument for all phi in post-header block.  */
>>> +  bb = exit->dest;
>>> +  for (gphi_iterator gsi = gsi_start_phis (bb);
>>> +       !gsi_end_p (gsi); gsi_next (&gsi))
>>> +    {
>>> +      gphi *phi = gsi.phi ();
>>> +      /* edge_iterator ei; */
>>> +      tree arg;
>>> +      if (virtual_operand_p (gimple_phi_result (phi)))
>>> +       {
>>> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
>>> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>>> +       }
>>> +      else
>>> +       {
>>> +         /* Use exit edge argument.  */
>>> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
>>> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>>>
>>> Hum.  How is it ok to use the exit edge argument for the edge that skips
>>> the loop?  Why can't you always use the pre-header edge value?
>>> That is, if we have
>>>
>>>  for(i=0;i<m;++i)
>>>    {
>>>      if (n > 0)
>>>     {
>>>      for (;;)
>>>        {
>>>        }
>>>      }
>>>    }
>>>   ... = i;
>>>
>>> then i is used after the loop and the correct value to use if
>>> n > 0 is false is '0'.  Maybe this way we can also relax
>>> what check_exit_phi does?  IMHO the only restriction is
>>> if sth defined inside the loop before the header check for
>>> the inner loop is used after the loop.
>>>
>>> Thanks,
>>> Richard.
>>>
>>>> Thanks.
>>>>
>>>> ChangeLog:
>>>> 2015-09-30  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>
>>>> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
>>>> "cfghooks.h", add prototypes for introduced new functions.
>>>> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
>>>> checks on ability of loop unswitching to tree_unswitch_single_loop;
>>>> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
>>>> on innermost loop check.
>>>> (tree_unswitch_single_loop): Add all required checks on ability of
>>>> loop unswitching under zero recursive level guard.
>>>> (tree_unswitch_outer_loop): New function.
>>>> (find_loop_guard): Likewise.
>>>> (empty_bb_without_guard_p): Likewise.
>>>> (used_outside_loop_p): Likewise.
>>>> (hoist_guard): Likewise.
>>>> (check_exit_phi): Likewise.
>>>>
>>>>    gcc/testsuite/ChangeLog:
>>>> * gcc.dg/loop-unswitch-2.c: New test.
>>>>
>>>> 2015-09-16 11:26 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>> Yeah, as said, the patch wasn't fully ready and it also felt odd to do
>>>>> this hoisting in loop header copying.  Integrating it
>>>>> with LIM would be a better fit eventually.
>>>>>
>>>>> Note that we did agree to go forward with your original patch just
>>>>> making it more "generically" perform outer loop
>>>>> unswitching.  Did you explore that idea further?
>>>>>
>>>>>
>>>>>
>>>>> On Tue, Sep 15, 2015 at 6:00 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>> Thanks Richard.
>>>>>>
>>>>>> I found one more issue that could not be fixed simply. In 23855 you
>>>>>> consider the following test-case:
>>>>>> void foo(int *ie, int *je, double *x)
>>>>>> {
>>>>>>   int i, j;
>>>>>>   for (j=0; j<*je; ++j)
>>>>>>     for (i=0; i<*ie; ++i)
>>>>>>       x[i+j] = 0.0;
>>>>>> }
>>>>>> and proposed to hoist up a check on *ie out of loop. It requires
>>>>>> memref alias analysis since in general x and ie can alias (if their
>>>>>> types are compatible - int *ie & int * x). Such analysis is performed
>>>>>> by pre or lim passes. Without such analysis we can not hoist a test on
>>>>>> non-zero for *ie out of loop using 238565 patch.
>>>>>>  The second concern is that proposed copy header algorithm changes
>>>>>> loop structure significantly and it is not accepted by vectorizer
>>>>>> since latch is not empty (such transformation assumes loop peeling for
>>>>>> one iteration. So I can propose to implement simple guard hoisting
>>>>>> without copying header and tail blocks (if it is possible).
>>>>>>
>>>>>> I will appreciate you for any advice or help since without such
>>>>>> hoisting we are not able to perform outer loop vectorization for
>>>>>> important benchmark.
>>>>>> and
>>>>>>
>>>>>> 2015-09-15 14:22 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>> On Thu, Sep 3, 2015 at 6:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>> Hi Richard,
>>>>>>>>
>>>>>>>> I started learning, tuning and debugging patch proposed in 23855 and
>>>>>>>> discovered thta it does not work properly.
>>>>>>>> So I wonder is it tested patch and it should work?
>>>>>>>
>>>>>>> I don't remember, but as it wasn't committed it certainly wasn't ready.
>>>>>>>
>>>>>>>> Should it accept for hoisting the following loop nest
>>>>>>>>   for (i=0; i<n; i++) {
>>>>>>>>     s = 0;
>>>>>>>>     for (j=0; j<m; j++)
>>>>>>>>       s += a[i] * b[j];
>>>>>>>>     c[i] = s;
>>>>>>>>   }
>>>>>>>> Note that i-loop will nit be empty if m is equal to 0.
>>>>>>>
>>>>>>> if m is equal to 0 then we still have the c[i] = s store, no?  Of course
>>>>>>> we could unswitch the outer loop on m == 0 but simple hoisting wouldn't work.
>>>>>>>
>>>>>>> Richard.
>>>>>>>
>>>>>>>> 2015-08-03 10:27 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>> On Fri, Jul 31, 2015 at 1:17 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>> Hi Richard,
>>>>>>>>>>
>>>>>>>>>> I learned your updated patch for 23825 and it is more general in
>>>>>>>>>> comparison with my.
>>>>>>>>>> I'd like to propose you a compromise - let's consider my patch only
>>>>>>>>>> for force-vectorize outer loop only to allow outer-loop
>>>>>>>>>> vecctorization.
>>>>>>>>>
>>>>>>>>> I don't see why we should special-case that if the approach in 23825
>>>>>>>>> is sensible.
>>>>>>>>>
>>>>>>>>>> Note that your approach will not hoist invariant
>>>>>>>>>> guards if loops contains something else except for inner-loop, i.e. it
>>>>>>>>>> won't be empty for taken branch.
>>>>>>>>>
>>>>>>>>> Yes, it does not perform unswitching but guard hoisting.  Note that this
>>>>>>>>> is originally Zdenek Dvoraks patch.
>>>>>>>>>
>>>>>>>>>> I also would like to answer on your last question - CFG cleanup is
>>>>>>>>>> invoked to perform deletion of single-argument phi nodes from tail
>>>>>>>>>> block through substitution - such phi's prevent outer-loop
>>>>>>>>>> vectorization. But it is clear that such transformation can be done
>>>>>>>>>> other pass.
>>>>>>>>>
>>>>>>>>> Hmm, I wonder why the copy_prop pass after unswitching does not
>>>>>>>>> get rid of them?
>>>>>>>>>
>>>>>>>>>> What is your opinion?
>>>>>>>>>
>>>>>>>>> My opinion is that if we want to enhance unswitching to catch this
>>>>>>>>> (or similar) cases then we should make it a lot more general than
>>>>>>>>> your pattern-matching approach.  I see nothing that should prevent
>>>>>>>>> us from considering unswitching non-innermost loops in general.
>>>>>>>>> It should be only a cost consideration to not do non-innermost loop
>>>>>>>>> unswitching (in addition to maybe a --param specifying the maximum
>>>>>>>>> depth of a loop nest to unswitch).
>>>>>>>>>
>>>>>>>>> So my first thought when seeing your patch still holds - the patch
>>>>>>>>> looks very much too specific.
>>>>>>>>>
>>>>>>>>> Richard.
>>>>>>>>>
>>>>>>>>>> Yuri.
>>>>>>>>>>
>>>>>>>>>> 2015-07-28 13:50 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>> On Thu, Jul 23, 2015 at 4:45 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>>
>>>>>>>>>>>> I checked that both test-cases from 23855 are sucessfully unswitched
>>>>>>>>>>>> by proposed patch. I understand that it does not catch deeper loop
>>>>>>>>>>>> nest as
>>>>>>>>>>>>    for (i=0; i<10; i++)
>>>>>>>>>>>>      for (j=0;j<n;j++)
>>>>>>>>>>>>         for (k=0;k<20;k++)
>>>>>>>>>>>>   ...
>>>>>>>>>>>> but duplication of middle-loop does not look reasonable.
>>>>>>>>>>>>
>>>>>>>>>>>> Here is dump for your second test-case:
>>>>>>>>>>>>
>>>>>>>>>>>> void foo(int *ie, int *je, double *x)
>>>>>>>>>>>> {
>>>>>>>>>>>>   int i, j;
>>>>>>>>>>>>   for (j=0; j<*je; ++j)
>>>>>>>>>>>>     for (i=0; i<*ie; ++i)
>>>>>>>>>>>>       x[i+j] = 0.0;
>>>>>>>>>>>> }
>>>>>>>>>>>> grep -i unswitch t6.c.119t.unswitch
>>>>>>>>>>>> ;; Unswitching outer loop
>>>>>>>>>>>
>>>>>>>>>>> I was saying that why go with a limited approach when a patch (in
>>>>>>>>>>> unknown state...)
>>>>>>>>>>> is available that does it more generally?  Also unswitching is quite
>>>>>>>>>>> expensive compared
>>>>>>>>>>> to "moving" the invariant condition.
>>>>>>>>>>>
>>>>>>>>>>> In your patch:
>>>>>>>>>>>
>>>>>>>>>>> +  if (!nloop->force_vectorize)
>>>>>>>>>>> +    nloop->force_vectorize = true;
>>>>>>>>>>> +  if (loop->safelen != 0)
>>>>>>>>>>> +    nloop->safelen = loop->safelen;
>>>>>>>>>>>
>>>>>>>>>>> I see no guard on force_vectorize so = true looks bogus here.  Please just use
>>>>>>>>>>> copy_loop_info.
>>>>>>>>>>>
>>>>>>>>>>> +  if (integer_nonzerop (cond_new))
>>>>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_true_node);
>>>>>>>>>>> +  else if (integer_zerop (cond_new))
>>>>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_false_node);
>>>>>>>>>>>
>>>>>>>>>>> gimple_cond_make_true/false (cond_stmt);
>>>>>>>>>>>
>>>>>>>>>>> btw, seems odd that we have to recompute which loop is the true / false variant
>>>>>>>>>>> when we just fed a guard condition to loop_version.  Can't we statically
>>>>>>>>>>> determine whether loop or nloop has the in-loop condition true or false?
>>>>>>>>>>>
>>>>>>>>>>> +  /* Clean-up cfg to remove useless one-argument phi in exit block of
>>>>>>>>>>> +     outer-loop.  */
>>>>>>>>>>> +  cleanup_tree_cfg ();
>>>>>>>>>>>
>>>>>>>>>>> I know unswitching is already O(number-of-unswitched-loops * size-of-function)
>>>>>>>>>>> because it updates SSA form after each individual unswitching (and it does that
>>>>>>>>>>> because it invokes itself recursively on unswitched loops).  But do you really
>>>>>>>>>>> need to invoke CFG cleanup here?
>>>>>>>>>>>
>>>>>>>>>>> Richard.
>>>>>>>>>>>
>>>>>>>>>>>> Yuri.
>>>>>>>>>>>>
>>>>>>>>>>>> 2015-07-14 14:06 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>>>> On Fri, Jul 10, 2015 at 12:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>>>> Hi All,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Here is presented simple transformation which tries to hoist out of
>>>>>>>>>>>>>> outer-loop a check on zero trip count for inner-loop. This is very
>>>>>>>>>>>>>> restricted transformation since it accepts outer-loops with very
>>>>>>>>>>>>>> simple cfg, as for example:
>>>>>>>>>>>>>>     acc = 0;
>>>>>>>>>>>>>>    for (i = 1; i <= m; i++) {
>>>>>>>>>>>>>>       for (j = 0; j < n; j++)
>>>>>>>>>>>>>>          if (l[j] == i) { v[j] = acc; acc++; };
>>>>>>>>>>>>>>       acc <<= 1;
>>>>>>>>>>>>>>    }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Note that degenerative outer loop (without inner loop) will be
>>>>>>>>>>>>>> completely deleted as dead code.
>>>>>>>>>>>>>> The main goal of this transformation was to convert outer-loop to form
>>>>>>>>>>>>>> accepted by outer-loop vectorization (such test-case is also included
>>>>>>>>>>>>>> to patch).
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Is it OK for trunk?
>>>>>>>>>>>>>
>>>>>>>>>>>>> I think this is
>>>>>>>>>>>>>
>>>>>>>>>>>>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=23855
>>>>>>>>>>>>>
>>>>>>>>>>>>> as well.  It has a patch adding a invariant loop guard hoisting
>>>>>>>>>>>>> phase to loop-header copying.  Yeah, it needs updating to
>>>>>>>>>>>>> trunk again I suppose.  It's always non-stage1 when I come
>>>>>>>>>>>>> back to that patch.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Your patch seems to be very specific and only handles outer
>>>>>>>>>>>>> loops of innermost loops.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Richard.
>>>>>>>>>>>>>
>>>>>>>>>>>>>> ChangeLog:
>>>>>>>>>>>>>> 2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> * tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
>>>>>>>>>>>>>> "gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
>>>>>>>>>>>>>> (tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
>>>>>>>>>>>>>> (tree_unswitch_outer_loop): New function.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> gcc/testsuite/ChangeLog:
>>>>>>>>>>>>>> * gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
>>>>>>>>>>>>>> * gcc.dg/vect/vect-outer-simd-3.c: New test.

[-- Attachment #2: patch.update --]
[-- Type: application/octet-stream, Size: 16578 bytes --]

diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-2.c b/gcc/testsuite/gcc.dg/loop-unswitch-2.c
new file mode 100644
index 0000000..df91d20
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details" } */
+
+void foo (float **a, float **b, float *c, int n, int m, int l)
+{
+  int i,j,k;
+  float s;
+  for (i=0; i<l; i++)
+    {
+      for (j=0; j<n; j++)
+	for (k=0; k<m; k++)
+	  c[i] += a[i][k] * b[k][j];
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "guard hoisted" 2 "unswitch" } } */
diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-3.c b/gcc/testsuite/gcc.dg/loop-unswitch-3.c
new file mode 100644
index 0000000..0ddf54a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-3.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details" } */
+
+#include <stdlib.h>
+#define N 32
+float *foo(int ustride, int size, float *src)
+{
+   float *buffer, *p;
+   int i, k;
+
+   if (!src)
+    return NULL;
+
+   buffer = (float *) malloc(N * size * sizeof(float));
+
+   if(buffer) 
+      for(i=0, p=buffer; i<N; i++, src+=ustride)
+	for(k=0; k<size; k++)
+	  *p++ = src[k];
+
+   return buffer;
+}
+
+/* { dg-final { scan-tree-dump-times "guard hoisted" 1 "unswitch" } } */ 
+
diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-4.c b/gcc/testsuite/gcc.dg/loop-unswitch-4.c
new file mode 100644
index 0000000..e4a7f2e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-4.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -funswitch-loops" } */
+
+#include <stdlib.h>
+__attribute__ ((noinline))
+void foo (float **a, float **b, float *c, int n, int m, int l)
+{
+  int i,j,k;
+  float s;
+  for (i=0; i<l; i++)
+    for (j=0; j<n; j++)
+      for (k=0; k<m; k++)
+	c[i] += a[i][k] * b[k][j];
+}
+
+int main()
+{
+  const int N = 32;
+  float **ar1, **ar2;
+  float *res;
+  int i, j;
+  ar1 = (float **)malloc (N * sizeof (float*));
+  ar2 = (float **)malloc (N * sizeof (float*));
+  res = (float *)malloc( N * sizeof (float));
+  for (i=0; i<N; i++)
+    {
+      ar1[i] = (float*)malloc (N * sizeof (float));
+      ar2[i] = (float*)malloc (N * sizeof (float));
+    }
+  for (i=0; i<N; i++)
+    {
+      for (j=0; j<N; j++)
+	{
+	  ar1[i][j] = 2.0f;
+	  ar2[i][j] = 1.5f;
+	}
+      res[i] = 0.0f;
+    }
+  foo (ar1, ar2, res, N, N, N);
+  for (i=0; i<N; i++)
+    if (res[i] != 3072.0f)
+      abort();
+  for (i=0; i<N; i++)
+    res[i] = 0.0f;
+  foo (ar1, ar2, res, N, 0, N);
+  for (i=0; i<N; i++)
+    if (res[i] != 0.0f)
+      abort();
+  return 0;
+}
+
diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
index a273638..791aea2 100644
--- a/gcc/tree-ssa-loop-unswitch.c
+++ b/gcc/tree-ssa-loop-unswitch.c
@@ -39,6 +39,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "tree-pass.h"
 #include "tree-inline.h"
+#include "gimple-iterator.h"
+#include "cfghooks.h"
 
 /* This file implements the loop unswitching, i.e. transformation of loops like
 
@@ -79,6 +81,12 @@ along with GCC; see the file COPYING3.  If not see
 static struct loop *tree_unswitch_loop (struct loop *, basic_block, tree);
 static bool tree_unswitch_single_loop (struct loop *, int);
 static tree tree_may_unswitch_on (basic_block, struct loop *);
+static bool tree_unswitch_outer_loop (struct loop *);
+static edge find_loop_guard (struct loop *);
+static bool empty_bb_without_guard_p (struct loop *, basic_block);
+static bool used_outside_loop_p (struct loop *, tree);
+static void hoist_guard (struct loop *, edge);
+static bool check_exit_phi (struct loop *);
 
 /* Main entry point.  Perform loop unswitching on all suitable loops.  */
 
@@ -87,42 +95,15 @@ tree_ssa_unswitch_loops (void)
 {
   struct loop *loop;
   bool changed = false;
-  HOST_WIDE_INT iterations;
 
-  /* Go through inner loops (only original ones).  */
-  FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
+  /* Go through all loops starting from innermost.  */
+  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
     {
-      if (dump_file && (dump_flags & TDF_DETAILS))
-        fprintf (dump_file, ";; Considering loop %d\n", loop->num);
-
-      /* Do not unswitch in cold regions. */
-      if (optimize_loop_for_size_p (loop))
-        {
-          if (dump_file && (dump_flags & TDF_DETAILS))
-            fprintf (dump_file, ";; Not unswitching cold loops\n");
-          continue;
-        }
-
-      /* The loop should not be too large, to limit code growth. */
-      if (tree_num_loop_insns (loop, &eni_size_weights)
-          > (unsigned) PARAM_VALUE (PARAM_MAX_UNSWITCH_INSNS))
-        {
-          if (dump_file && (dump_flags & TDF_DETAILS))
-            fprintf (dump_file, ";; Not unswitching, loop too big\n");
-          continue;
-        }
-
-      /* If the loop is not expected to iterate, there is no need
-	 for unswitching.  */
-      iterations = estimated_loop_iterations_int (loop);
-      if (iterations >= 0 && iterations <= 1)
-	{
-          if (dump_file && (dump_flags & TDF_DETAILS))
-            fprintf (dump_file, ";; Not unswitching, loop is not expected to iterate\n");
-          continue;
-	}
-
-      changed |= tree_unswitch_single_loop (loop, 0);
+      if (!loop->inner)
+	/* Unswitch innermost loop.  */
+	changed |= tree_unswitch_single_loop (loop, 0);
+      else
+	changed |= tree_unswitch_outer_loop (loop);
     }
 
   if (changed)
@@ -216,6 +197,39 @@ tree_unswitch_single_loop (struct loop *loop, int num)
   tree cond = NULL_TREE;
   gimple stmt;
   bool changed = false;
+  HOST_WIDE_INT iterations;
+
+  /* Perform initial tests if unswitch is eligible.  */
+  if (num == 0)
+    {
+      /* Do not unswitch in cold regions. */
+      if (optimize_loop_for_size_p (loop))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, ";; Not unswitching cold loops\n");
+	  return false;
+	}
+
+      /* The loop should not be too large, to limit code growth. */
+      if (tree_num_loop_insns (loop, &eni_size_weights)
+	  > (unsigned) PARAM_VALUE (PARAM_MAX_UNSWITCH_INSNS))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, ";; Not unswitching, loop too big\n");
+	  return false;
+	}
+
+      /* If the loop is not expected to iterate, there is no need
+	 for unswitching.  */
+      iterations = estimated_loop_iterations_int (loop);
+      if (iterations >= 0 && iterations <= 1)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, ";; Not unswitching, loop is not expected"
+		     " to iterate\n");
+	  return false;
+	}
+    }
 
   i = 0;
   bbs = get_loop_body (loop);
@@ -403,6 +417,347 @@ tree_unswitch_loop (struct loop *loop,
 		       REG_BR_PROB_BASE - prob_true, false);
 }
 
+/* Unswitch outer loops by hoisting invariant guard on
+   inner loop without code duplication.  */
+static bool
+tree_unswitch_outer_loop (struct loop *loop)
+{
+  edge exit, guard;
+  HOST_WIDE_INT iterations;
+
+  gcc_assert (loop->inner);
+  if (loop->inner->next)
+    return false;
+  /* Accept loops with single exit only.  */
+  exit = single_exit (loop);
+  if (!exit)
+    return false;
+  /* Check that phi argument of exit edge is not defined inside loop.  */
+  if (!check_exit_phi (loop))
+    return false;
+  /* If the loop is not expected to iterate, there is no need
+      for unswitching.  */
+  iterations = estimated_loop_iterations_int (loop);
+  if (iterations >= 0 && iterations <= 1)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, ";; Not unswitching, loop is not expected"
+		 " to iterate\n");
+	return false;
+    }
+
+  guard = find_loop_guard (loop);
+  if (guard)
+    {
+      hoist_guard (loop, guard);
+      update_ssa (TODO_update_ssa);
+      return true;
+    }
+  return false;
+}
+
+/* Checks if the body of the LOOP is within an invariant guard.  If this
+   is the case, returns the edge that jumps over the real body of the loop,
+   otherwise returns NULL.  */
+
+static edge
+find_loop_guard (struct loop *loop)
+{
+  basic_block header = loop->header;
+  edge guard_edge, te, fe;
+  /* bitmap processed, known_invariants;*/
+  basic_block *body = NULL;
+  unsigned i;
+  tree use;
+  ssa_op_iter iter;
+
+  /* We check for the following situation:
+
+     while (1)
+       {
+	 [header]]
+         loop_phi_nodes;
+	 something1;
+	 if (cond1)
+	   body;
+	 nvar = phi(orig, bvar) ... for all variables changed in body;
+	 [guard_end]
+	 something2;
+	 if (cond2)
+	   break;
+	 something3;
+       }
+
+     where:
+
+     1) cond1 is loop invariant
+     2) If cond1 is false, then the loop is essentially empty; i.e.,
+	a) nothing in something1, something2 and something3 has side
+	   effects
+	b) anything defined in something1, something2 and something3
+	   is not used outside of the loop.  */
+
+  while (single_succ_p (header))
+    header = single_succ (header);
+  if (!last_stmt (header)
+      || gimple_code (last_stmt (header)) != GIMPLE_COND)
+    return NULL;
+
+  extract_true_false_edges_from_block (header, &te, &fe);
+  if (!flow_bb_inside_loop_p (loop, te->dest)
+      || !flow_bb_inside_loop_p (loop, fe->dest))
+    return NULL;
+
+  if (just_once_each_iteration_p (loop, te->dest)
+      || (single_succ_p (te->dest)
+	  && just_once_each_iteration_p (loop, single_succ (te->dest))))
+    {
+      if (just_once_each_iteration_p (loop, fe->dest))
+	return NULL;
+      guard_edge = te;
+    }
+  else if (just_once_each_iteration_p (loop, fe->dest)
+	   || (single_succ_p (fe->dest)
+	       && just_once_each_iteration_p (loop, single_succ (fe->dest))))
+    guard_edge = fe;
+  else
+    return NULL;
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file,
+	     "Considering guard %d -> %d in loop %d\n",
+	     guard_edge->src->index, guard_edge->dest->index, loop->num);
+  /* Check if condition operands do not have definitions inside loop since
+     any bb copying is not performed.  */
+  FOR_EACH_SSA_TREE_OPERAND (use, last_stmt (header), iter, SSA_OP_USE)
+    {
+      gimple def = SSA_NAME_DEF_STMT (use);
+      basic_block def_bb = gimple_bb (def);
+      if (def_bb
+          && flow_bb_inside_loop_p (loop, def_bb))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "  guard operands have definitions"
+				" inside loop\n");
+	  return NULL;
+	}
+    }
+
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      if (body[i]->loop_father != loop)
+	continue;
+      if (!empty_bb_without_guard_p (loop, body[i]))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "  block %d has side effects\n", body[i]->index);
+	  guard_edge = NULL;
+	  goto end;
+	}
+    }
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "  suitable to hoist\n");
+end:
+  if (body)
+    free (body);
+  return guard_edge;
+}
+
+/* Returns true if
+   1) no statement in BB has side effects
+   2) assuming that edge GUARD is always taken, all definitions in BB
+      are noy used outside of the loop.
+   KNOWN_INVARIANTS is a set of ssa names we know to be invariant, and
+   PROCESSED is a set of ssa names for that we already tested whether they
+   are invariant or not.  */
+
+static bool
+empty_bb_without_guard_p (struct loop *loop, basic_block bb)
+{
+  basic_block exit_bb = single_exit (loop)->src;
+  bool may_be_used_outside = (bb == exit_bb
+			      || !dominated_by_p (CDI_DOMINATORS, bb, exit_bb));
+  tree name;
+  ssa_op_iter op_iter;
+
+  /* Phi nodes do not have side effects, but their results might be used
+     outside of the loop.  */
+  if (may_be_used_outside)
+    {
+      for (gphi_iterator gsi = gsi_start_phis (bb);
+	   !gsi_end_p (gsi); gsi_next (&gsi))
+	{
+	  gphi *phi = gsi.phi ();
+	  name = PHI_RESULT (phi);
+	  if (virtual_operand_p (name))
+	    continue;
+
+	  if (used_outside_loop_p (loop, name))
+	    return false;
+	}
+    }
+
+  for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple stmt = gsi_stmt (gsi);
+      if (gimple_has_side_effects (stmt))
+	return false;
+
+      if (gimple_vdef(stmt))
+	return false;
+
+      FOR_EACH_SSA_TREE_OPERAND (name, stmt, op_iter, SSA_OP_DEF)
+	{
+	  if (may_be_used_outside
+	      && used_outside_loop_p (loop, name))
+	    return false;
+	}
+    }
+  return true;
+}
+
+/* Return true if NAME is used outside of LOOP.  */
+
+static bool
+used_outside_loop_p (struct loop *loop, tree name)
+{
+  imm_use_iterator it;
+  use_operand_p use;
+
+  FOR_EACH_IMM_USE_FAST (use, it, name)
+    {
+      gimple stmt = USE_STMT (use);
+      if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
+	return true;
+    }
+
+  return false;
+}
+
+/* Moves the check of GUARD outside of LOOP.  */
+
+static void
+hoist_guard (struct loop *loop, edge guard)
+{
+  edge exit = single_exit (loop);
+  edge preh = loop_preheader_edge (loop);
+  basic_block pre_header = preh->src;
+  basic_block bb;
+  edge te, fe, e, new_edge;
+  gimple stmt;
+  basic_block guard_bb = guard->src;
+  gimple_stmt_iterator gsi;
+  int flags = 0;
+  bool fix_dom_of_exit;
+  gcond *cond_stmt, *new_cond_stmt;
+
+  bb = get_immediate_dominator (CDI_DOMINATORS, exit->dest);
+  fix_dom_of_exit = flow_bb_inside_loop_p (loop, bb);
+  gsi = gsi_last_bb (guard_bb);
+  stmt = gsi_stmt (gsi);
+  gcc_assert (gimple_code (stmt) == GIMPLE_COND);
+  cond_stmt = as_a <gcond *> (stmt);
+  extract_true_false_edges_from_block (guard_bb, &te, &fe);
+  /* Insert guard to PRE_HEADER.  */
+  if (!empty_block_p (pre_header))
+    gsi = gsi_last_bb (pre_header);
+  else
+    gsi = gsi_start_bb (pre_header);
+  /* Create copy of COND_STMT.  */
+  new_cond_stmt = gimple_build_cond (gimple_cond_code (cond_stmt),
+				     gimple_cond_lhs (cond_stmt),
+				     gimple_cond_rhs (cond_stmt),
+				     NULL_TREE, NULL_TREE);
+  gsi_insert_after (&gsi, new_cond_stmt, GSI_NEW_STMT);
+  /* Convert COND_STMT to true/false conditional.  */
+  if (guard == te)
+    gimple_cond_make_false (cond_stmt);
+  else
+    gimple_cond_make_true (cond_stmt);
+  update_stmt (cond_stmt);
+  /* Create new loop pre-header.  */
+  e = split_block (pre_header, last_stmt (pre_header));
+  gcc_assert (loop_preheader_edge (loop)->src == e->dest);
+  if (guard == fe)
+    {
+      e->flags = EDGE_TRUE_VALUE;
+      flags |= EDGE_FALSE_VALUE;
+    }
+  else
+    {
+      e->flags = EDGE_FALSE_VALUE;
+      flags |= EDGE_TRUE_VALUE;
+    }
+  new_edge = make_edge (pre_header, exit->dest, flags);
+  if (fix_dom_of_exit)
+    set_immediate_dominator (CDI_DOMINATORS, exit->dest, pre_header);
+  /* Add NEW_ADGE argument for all phi in post-header block.  */
+  bb = exit->dest;
+  for (gphi_iterator gsi = gsi_start_phis (bb);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      /* edge_iterator ei; */
+      tree arg;
+      if (virtual_operand_p (gimple_phi_result (phi)))
+	{
+	  arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
+	  add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
+	}
+      else
+	{
+	  /* Use exit edge argument.  */
+	  arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+	  add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
+	}
+    }
+
+  mark_virtual_operands_for_renaming (cfun);
+  update_ssa (TODO_update_ssa);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "  guard hoisted.\n");
+}
+
+/* Return true if phi argument for exit edge can be used
+   for edge around loop.  */
+
+static bool
+check_exit_phi (struct loop *loop)
+{
+  edge exit = single_exit (loop);
+  basic_block pre_header = loop_preheader_edge (loop)->src;
+
+  for (gphi_iterator gsi = gsi_start_phis (exit->dest);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      tree arg;
+      gimple def;
+      basic_block def_bb;
+      if (virtual_operand_p (gimple_phi_result (phi)))
+	continue;
+      arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+      if (TREE_CODE (arg) != SSA_NAME)
+	continue;
+      def = SSA_NAME_DEF_STMT (arg);
+      if (!def)
+	continue;
+      def_bb = gimple_bb (def);
+      if (!def_bb)
+	continue;
+      if (!dominated_by_p (CDI_DOMINATORS, pre_header, def_bb))
+	/* Definition inside loop!  */
+	return false;
+      /* Check loop closed phi invariant.  */
+      if (!flow_bb_inside_loop_p (def_bb->loop_father, pre_header))
+	return false;
+    }
+  return true;
+}
+
 /* Loop unswitching pass.  */
 
 namespace {

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-10-06 11:41                           ` Yuri Rumyantsev
@ 2015-10-06 12:21                             ` Richard Biener
  2015-10-07  9:53                               ` Yuri Rumyantsev
  0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2015-10-06 12:21 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: gcc-patches, Igor Zamyatin

On Tue, Oct 6, 2015 at 1:41 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Richard,
>
> Here is updated patch which reflects almost all your remarks:
> 1. Use ordinary get_loop_body.
> 2. Delete useless asserts.
> 3. Use check on iterated loop instead of finite_loop_p.
> 4. Do not update CFG by adjusting the CONDs condition to always true/false.
> 5. Add couple tests.

+  /* Add NEW_ADGE argument for all phi in post-header block.  */
+  bb = exit->dest;
+  for (gphi_iterator gsi = gsi_start_phis (bb);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      /* edge_iterator ei; */
+      tree arg;
+      if (virtual_operand_p (gimple_phi_result (phi)))
+       {
+         arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
+         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);

now I know what confused me - here you are looking at a loop exit PHI
but querying with the preheader edge index.  I think you need to walk
the loop header PHIs to find the PHI for the virtual operand and use that
to get the PHI arg from?

The side-effect / used-outside code is still the same.  What matters
is side-effects outside of the loop-header protected code region, not
blocks excluding the inner loop.  Say,

  for (;;)
    {
      if (invariant-guard)
        {
           printf ("Blah");
           for (;;)
             ;
        }
    }

would still ok to be unswitched.  So instead of

+      if (body[i]->loop_father != loop)
+       continue;

it would be

       if (dominated_by_p (CDI_DOMINATORS, body[i], header)
           && !dominated_by_p (CDI_DOMINATORS, body[i], fe->dest))

with the obvious improvement to the patch to not only consider header checks
in the outer loop header but in the pre-header block of the inner loop.

And I still think you should walk the exit PHIs args to see whether they
are defined in the non-guarded region of the outer loop instead of walking
all uses of all defs.

Note that I think you miss endless loops as side-effects if that endless
loop occurs through a irreducible region (thus not reflected in the
loop tree).  Thus you should reject BB_IRREDUCIBLE_LOOP blocks
in the non-guarded region as well.

It seems to me that protecting adjacent loops with a single guard is
also eligible for hoisting thus the restriction on loop->inner->next
should become a restriction on no loops (or irreducible regions)
in the non-guarded region.

Most things can be improved as followup, but at least the
virtual PHI arg thing needs to be sorted out.

Thanks,
Richard.


> ChangeLog:
> 2015-10-06  Yuri Rumyantsev  <ysrumyan@gmail.com>
>
> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
> "cfghooks.h", add prototypes for introduced new functions.
> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
> checks on ability of loop unswitching to tree_unswitch_single_loop;
> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
> on innermost loop check.
> (tree_unswitch_single_loop): Add all required checks on ability of
> loop unswitching under zero recursive level guard.
> (tree_unswitch_outer_loop): New function.
> (find_loop_guard): Likewise.
> (empty_bb_without_guard_p): Likewise.
> (used_outside_loop_p): Likewise.
> (hoist_guard): Likewise.
> (check_exit_phi): Likewise.
>
>    gcc/testsuite/ChangeLog:
> * gcc.dg/loop-unswitch-2.c: New test.
> * gcc.dg/loop-unswitch-3.c: Likewise.
> * gcc.dg/loop-unswitch-4.c: Likewise.
>
> 2015-10-06 10:59 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Mon, Oct 5, 2015 at 3:13 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>> Thanks Richard.
>>> I'd like to answer on your last comment related to using of exit edge
>>> argument for edge that skips loop.
>>> Let's consider the following test-case:
>>>
>>> #include <stdlib.h>
>>> #define N 32
>>> float *foo(int ustride, int size, float *src)
>>> {
>>>    float *buffer, *p;
>>>    int i, k;
>>>
>>>    if (!src)
>>>     return NULL;
>>>
>>>    buffer = (float *) malloc(N * size * sizeof(float));
>>>
>>>    if(buffer)
>>>       for(i=0, p=buffer; i<N; i++, src+=ustride)
>>> for(k=0; k<size; k++)
>>>  *p++ = src[k];
>>>
>>>    return buffer;
>>> }
>>>
>>> Before adding new edge we have in post-header bb:
>>>   <bb 9>:
>>>   # _6 = PHI <0B(8), buffer_20(16)>
>>>   return _6;
>>>
>>> It is clear that we must preserve function semantic and transform it to
>>> _6 = PHI <0B(12), buffer_19(9), buffer_19(4)>
>>
>> Ah, yeah.  I was confusing the loop exit of the inner vs. the outer loop.
>>
>> Richard.
>>
>>>
>>> 2015-10-05 13:57 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>> On Wed, Sep 30, 2015 at 12:46 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>> Hi Richard,
>>>>>
>>>>> I re-designed outer loop unswitching using basic idea of 23855 patch -
>>>>> hoist invariant guard if loop is empty without guard. Note that this
>>>>> was added to loop unswitching pass with simple modifications - using
>>>>> another loop iterator etc.
>>>>>
>>>>> Bootstrap and regression testing did not show any new failures.
>>>>> What is your opinion?
>>>>
>>>> Overall it looks good.  Some comments below - a few more testcases would
>>>> be nice as well.
>>>>
>>>> +  /* Loop must not be infinite.  */
>>>> +  if (!finite_loop_p (loop))
>>>> +    return false;
>>>>
>>>> why's that?
>>>>
>>>> +  body = get_loop_body_in_dom_order (loop);
>>>> +  for (i = 0; i < loop->num_nodes; i++)
>>>> +    {
>>>> +      if (body[i]->loop_father != loop)
>>>> +       continue;
>>>> +      if (!empty_bb_without_guard_p (loop, body[i]))
>>>>
>>>> I wonder if there is a better way to iterate over the interesting
>>>> blocks and PHIs
>>>> we need to check for side-effects (and thus we maybe can avoid gathering
>>>> the loop in DOM order).
>>>>
>>>> +      FOR_EACH_SSA_TREE_OPERAND (name, stmt, op_iter, SSA_OP_DEF)
>>>> +       {
>>>> +         if (may_be_used_outside
>>>>
>>>> may_be_used_outside can be hoisted above the loop.  I wonder if we can take
>>>> advantage of loop-closed SSA form here (and the fact we have a single exit
>>>> from the loop).  Iterating over exit dest PHIs and determining whether the
>>>> exit edge DEF is inside the loop part it may not be should be enough.
>>>>
>>>> +  gcc_assert (single_succ_p (pre_header));
>>>>
>>>> that should be always true.
>>>>
>>>> +  gsi_remove (&gsi, false);
>>>> +  bb = guard->dest;
>>>> +  remove_edge (guard);
>>>> +  /* Update dominance for destination of GUARD.  */
>>>> +  if (EDGE_COUNT (bb->preds) == 0)
>>>> +    {
>>>> +      basic_block s_bb;
>>>> +      gcc_assert (single_succ_p (bb));
>>>> +      s_bb = single_succ (bb);
>>>> +      delete_basic_block (bb);
>>>> +      if (single_pred_p (s_bb))
>>>> +       set_immediate_dominator (CDI_DOMINATORS, s_bb, single_pred (s_bb));
>>>>
>>>> all this massaging should be simplified by leaving it to CFG cleanup by
>>>> simply adjusting the CONDs condition to always true/false.  There is
>>>> gimple_cond_make_{true,false} () for this (would be nice to have a variant
>>>> taking a bool).
>>>>
>>>> +  new_edge = make_edge (pre_header, exit->dest, flags);
>>>> +  if (fix_dom_of_exit)
>>>> +    set_immediate_dominator (CDI_DOMINATORS, exit->dest, pre_header);
>>>> +  update_stmt (gsi_stmt (gsi));
>>>>
>>>> the update_stmt should be not necessary, it's done by gsi_insert_after already.
>>>>
>>>> +  /* Add NEW_ADGE argument for all phi in post-header block.  */
>>>> +  bb = exit->dest;
>>>> +  for (gphi_iterator gsi = gsi_start_phis (bb);
>>>> +       !gsi_end_p (gsi); gsi_next (&gsi))
>>>> +    {
>>>> +      gphi *phi = gsi.phi ();
>>>> +      /* edge_iterator ei; */
>>>> +      tree arg;
>>>> +      if (virtual_operand_p (gimple_phi_result (phi)))
>>>> +       {
>>>> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
>>>> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>>>> +       }
>>>> +      else
>>>> +       {
>>>> +         /* Use exit edge argument.  */
>>>> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
>>>> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>>>>
>>>> Hum.  How is it ok to use the exit edge argument for the edge that skips
>>>> the loop?  Why can't you always use the pre-header edge value?
>>>> That is, if we have
>>>>
>>>>  for(i=0;i<m;++i)
>>>>    {
>>>>      if (n > 0)
>>>>     {
>>>>      for (;;)
>>>>        {
>>>>        }
>>>>      }
>>>>    }
>>>>   ... = i;
>>>>
>>>> then i is used after the loop and the correct value to use if
>>>> n > 0 is false is '0'.  Maybe this way we can also relax
>>>> what check_exit_phi does?  IMHO the only restriction is
>>>> if sth defined inside the loop before the header check for
>>>> the inner loop is used after the loop.
>>>>
>>>> Thanks,
>>>> Richard.
>>>>
>>>>> Thanks.
>>>>>
>>>>> ChangeLog:
>>>>> 2015-09-30  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>
>>>>> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
>>>>> "cfghooks.h", add prototypes for introduced new functions.
>>>>> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
>>>>> checks on ability of loop unswitching to tree_unswitch_single_loop;
>>>>> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
>>>>> on innermost loop check.
>>>>> (tree_unswitch_single_loop): Add all required checks on ability of
>>>>> loop unswitching under zero recursive level guard.
>>>>> (tree_unswitch_outer_loop): New function.
>>>>> (find_loop_guard): Likewise.
>>>>> (empty_bb_without_guard_p): Likewise.
>>>>> (used_outside_loop_p): Likewise.
>>>>> (hoist_guard): Likewise.
>>>>> (check_exit_phi): Likewise.
>>>>>
>>>>>    gcc/testsuite/ChangeLog:
>>>>> * gcc.dg/loop-unswitch-2.c: New test.
>>>>>
>>>>> 2015-09-16 11:26 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>> Yeah, as said, the patch wasn't fully ready and it also felt odd to do
>>>>>> this hoisting in loop header copying.  Integrating it
>>>>>> with LIM would be a better fit eventually.
>>>>>>
>>>>>> Note that we did agree to go forward with your original patch just
>>>>>> making it more "generically" perform outer loop
>>>>>> unswitching.  Did you explore that idea further?
>>>>>>
>>>>>>
>>>>>>
>>>>>> On Tue, Sep 15, 2015 at 6:00 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>> Thanks Richard.
>>>>>>>
>>>>>>> I found one more issue that could not be fixed simply. In 23855 you
>>>>>>> consider the following test-case:
>>>>>>> void foo(int *ie, int *je, double *x)
>>>>>>> {
>>>>>>>   int i, j;
>>>>>>>   for (j=0; j<*je; ++j)
>>>>>>>     for (i=0; i<*ie; ++i)
>>>>>>>       x[i+j] = 0.0;
>>>>>>> }
>>>>>>> and proposed to hoist up a check on *ie out of loop. It requires
>>>>>>> memref alias analysis since in general x and ie can alias (if their
>>>>>>> types are compatible - int *ie & int * x). Such analysis is performed
>>>>>>> by pre or lim passes. Without such analysis we can not hoist a test on
>>>>>>> non-zero for *ie out of loop using 238565 patch.
>>>>>>>  The second concern is that proposed copy header algorithm changes
>>>>>>> loop structure significantly and it is not accepted by vectorizer
>>>>>>> since latch is not empty (such transformation assumes loop peeling for
>>>>>>> one iteration. So I can propose to implement simple guard hoisting
>>>>>>> without copying header and tail blocks (if it is possible).
>>>>>>>
>>>>>>> I will appreciate you for any advice or help since without such
>>>>>>> hoisting we are not able to perform outer loop vectorization for
>>>>>>> important benchmark.
>>>>>>> and
>>>>>>>
>>>>>>> 2015-09-15 14:22 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>> On Thu, Sep 3, 2015 at 6:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>> Hi Richard,
>>>>>>>>>
>>>>>>>>> I started learning, tuning and debugging patch proposed in 23855 and
>>>>>>>>> discovered thta it does not work properly.
>>>>>>>>> So I wonder is it tested patch and it should work?
>>>>>>>>
>>>>>>>> I don't remember, but as it wasn't committed it certainly wasn't ready.
>>>>>>>>
>>>>>>>>> Should it accept for hoisting the following loop nest
>>>>>>>>>   for (i=0; i<n; i++) {
>>>>>>>>>     s = 0;
>>>>>>>>>     for (j=0; j<m; j++)
>>>>>>>>>       s += a[i] * b[j];
>>>>>>>>>     c[i] = s;
>>>>>>>>>   }
>>>>>>>>> Note that i-loop will nit be empty if m is equal to 0.
>>>>>>>>
>>>>>>>> if m is equal to 0 then we still have the c[i] = s store, no?  Of course
>>>>>>>> we could unswitch the outer loop on m == 0 but simple hoisting wouldn't work.
>>>>>>>>
>>>>>>>> Richard.
>>>>>>>>
>>>>>>>>> 2015-08-03 10:27 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>> On Fri, Jul 31, 2015 at 1:17 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>
>>>>>>>>>>> I learned your updated patch for 23825 and it is more general in
>>>>>>>>>>> comparison with my.
>>>>>>>>>>> I'd like to propose you a compromise - let's consider my patch only
>>>>>>>>>>> for force-vectorize outer loop only to allow outer-loop
>>>>>>>>>>> vecctorization.
>>>>>>>>>>
>>>>>>>>>> I don't see why we should special-case that if the approach in 23825
>>>>>>>>>> is sensible.
>>>>>>>>>>
>>>>>>>>>>> Note that your approach will not hoist invariant
>>>>>>>>>>> guards if loops contains something else except for inner-loop, i.e. it
>>>>>>>>>>> won't be empty for taken branch.
>>>>>>>>>>
>>>>>>>>>> Yes, it does not perform unswitching but guard hoisting.  Note that this
>>>>>>>>>> is originally Zdenek Dvoraks patch.
>>>>>>>>>>
>>>>>>>>>>> I also would like to answer on your last question - CFG cleanup is
>>>>>>>>>>> invoked to perform deletion of single-argument phi nodes from tail
>>>>>>>>>>> block through substitution - such phi's prevent outer-loop
>>>>>>>>>>> vectorization. But it is clear that such transformation can be done
>>>>>>>>>>> other pass.
>>>>>>>>>>
>>>>>>>>>> Hmm, I wonder why the copy_prop pass after unswitching does not
>>>>>>>>>> get rid of them?
>>>>>>>>>>
>>>>>>>>>>> What is your opinion?
>>>>>>>>>>
>>>>>>>>>> My opinion is that if we want to enhance unswitching to catch this
>>>>>>>>>> (or similar) cases then we should make it a lot more general than
>>>>>>>>>> your pattern-matching approach.  I see nothing that should prevent
>>>>>>>>>> us from considering unswitching non-innermost loops in general.
>>>>>>>>>> It should be only a cost consideration to not do non-innermost loop
>>>>>>>>>> unswitching (in addition to maybe a --param specifying the maximum
>>>>>>>>>> depth of a loop nest to unswitch).
>>>>>>>>>>
>>>>>>>>>> So my first thought when seeing your patch still holds - the patch
>>>>>>>>>> looks very much too specific.
>>>>>>>>>>
>>>>>>>>>> Richard.
>>>>>>>>>>
>>>>>>>>>>> Yuri.
>>>>>>>>>>>
>>>>>>>>>>> 2015-07-28 13:50 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>>> On Thu, Jul 23, 2015 at 4:45 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>>>
>>>>>>>>>>>>> I checked that both test-cases from 23855 are sucessfully unswitched
>>>>>>>>>>>>> by proposed patch. I understand that it does not catch deeper loop
>>>>>>>>>>>>> nest as
>>>>>>>>>>>>>    for (i=0; i<10; i++)
>>>>>>>>>>>>>      for (j=0;j<n;j++)
>>>>>>>>>>>>>         for (k=0;k<20;k++)
>>>>>>>>>>>>>   ...
>>>>>>>>>>>>> but duplication of middle-loop does not look reasonable.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Here is dump for your second test-case:
>>>>>>>>>>>>>
>>>>>>>>>>>>> void foo(int *ie, int *je, double *x)
>>>>>>>>>>>>> {
>>>>>>>>>>>>>   int i, j;
>>>>>>>>>>>>>   for (j=0; j<*je; ++j)
>>>>>>>>>>>>>     for (i=0; i<*ie; ++i)
>>>>>>>>>>>>>       x[i+j] = 0.0;
>>>>>>>>>>>>> }
>>>>>>>>>>>>> grep -i unswitch t6.c.119t.unswitch
>>>>>>>>>>>>> ;; Unswitching outer loop
>>>>>>>>>>>>
>>>>>>>>>>>> I was saying that why go with a limited approach when a patch (in
>>>>>>>>>>>> unknown state...)
>>>>>>>>>>>> is available that does it more generally?  Also unswitching is quite
>>>>>>>>>>>> expensive compared
>>>>>>>>>>>> to "moving" the invariant condition.
>>>>>>>>>>>>
>>>>>>>>>>>> In your patch:
>>>>>>>>>>>>
>>>>>>>>>>>> +  if (!nloop->force_vectorize)
>>>>>>>>>>>> +    nloop->force_vectorize = true;
>>>>>>>>>>>> +  if (loop->safelen != 0)
>>>>>>>>>>>> +    nloop->safelen = loop->safelen;
>>>>>>>>>>>>
>>>>>>>>>>>> I see no guard on force_vectorize so = true looks bogus here.  Please just use
>>>>>>>>>>>> copy_loop_info.
>>>>>>>>>>>>
>>>>>>>>>>>> +  if (integer_nonzerop (cond_new))
>>>>>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_true_node);
>>>>>>>>>>>> +  else if (integer_zerop (cond_new))
>>>>>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_false_node);
>>>>>>>>>>>>
>>>>>>>>>>>> gimple_cond_make_true/false (cond_stmt);
>>>>>>>>>>>>
>>>>>>>>>>>> btw, seems odd that we have to recompute which loop is the true / false variant
>>>>>>>>>>>> when we just fed a guard condition to loop_version.  Can't we statically
>>>>>>>>>>>> determine whether loop or nloop has the in-loop condition true or false?
>>>>>>>>>>>>
>>>>>>>>>>>> +  /* Clean-up cfg to remove useless one-argument phi in exit block of
>>>>>>>>>>>> +     outer-loop.  */
>>>>>>>>>>>> +  cleanup_tree_cfg ();
>>>>>>>>>>>>
>>>>>>>>>>>> I know unswitching is already O(number-of-unswitched-loops * size-of-function)
>>>>>>>>>>>> because it updates SSA form after each individual unswitching (and it does that
>>>>>>>>>>>> because it invokes itself recursively on unswitched loops).  But do you really
>>>>>>>>>>>> need to invoke CFG cleanup here?
>>>>>>>>>>>>
>>>>>>>>>>>> Richard.
>>>>>>>>>>>>
>>>>>>>>>>>>> Yuri.
>>>>>>>>>>>>>
>>>>>>>>>>>>> 2015-07-14 14:06 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>>>>> On Fri, Jul 10, 2015 at 12:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>>>>> Hi All,
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Here is presented simple transformation which tries to hoist out of
>>>>>>>>>>>>>>> outer-loop a check on zero trip count for inner-loop. This is very
>>>>>>>>>>>>>>> restricted transformation since it accepts outer-loops with very
>>>>>>>>>>>>>>> simple cfg, as for example:
>>>>>>>>>>>>>>>     acc = 0;
>>>>>>>>>>>>>>>    for (i = 1; i <= m; i++) {
>>>>>>>>>>>>>>>       for (j = 0; j < n; j++)
>>>>>>>>>>>>>>>          if (l[j] == i) { v[j] = acc; acc++; };
>>>>>>>>>>>>>>>       acc <<= 1;
>>>>>>>>>>>>>>>    }
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Note that degenerative outer loop (without inner loop) will be
>>>>>>>>>>>>>>> completely deleted as dead code.
>>>>>>>>>>>>>>> The main goal of this transformation was to convert outer-loop to form
>>>>>>>>>>>>>>> accepted by outer-loop vectorization (such test-case is also included
>>>>>>>>>>>>>>> to patch).
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Is it OK for trunk?
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I think this is
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=23855
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> as well.  It has a patch adding a invariant loop guard hoisting
>>>>>>>>>>>>>> phase to loop-header copying.  Yeah, it needs updating to
>>>>>>>>>>>>>> trunk again I suppose.  It's always non-stage1 when I come
>>>>>>>>>>>>>> back to that patch.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Your patch seems to be very specific and only handles outer
>>>>>>>>>>>>>> loops of innermost loops.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Richard.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> ChangeLog:
>>>>>>>>>>>>>>> 2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> * tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
>>>>>>>>>>>>>>> "gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
>>>>>>>>>>>>>>> (tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
>>>>>>>>>>>>>>> (tree_unswitch_outer_loop): New function.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> gcc/testsuite/ChangeLog:
>>>>>>>>>>>>>>> * gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
>>>>>>>>>>>>>>> * gcc.dg/vect/vect-outer-simd-3.c: New test.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-10-06 12:21                             ` Richard Biener
@ 2015-10-07  9:53                               ` Yuri Rumyantsev
  2015-10-07 15:26                                 ` Yuri Rumyantsev
  2015-10-09 19:05                                 ` H.J. Lu
  0 siblings, 2 replies; 17+ messages in thread
From: Yuri Rumyantsev @ 2015-10-07  9:53 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Igor Zamyatin

[-- Attachment #1: Type: text/plain, Size: 21738 bytes --]

Richard,

I've fixed adding virtual phi argument and add check on irreducible basic block.
New patch is attached.

I checked it for bootstrap and regression testing, no new failures.

ChangeLog:
2015-10-07  Yuri Rumyantsev  <ysrumyan@gmail.com>

* tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
"cfghooks.h", add prototypes for introduced new functions.
(tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
checks on ability of loop unswitching to tree_unswitch_single_loop;
invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
on innermost loop check.
(tree_unswitch_single_loop): Add all required checks on ability of
loop unswitching under zero recursive level guard.
(tree_unswitch_outer_loop): New function.
(find_loop_guard): Likewise.
(empty_bb_without_guard_p): Likewise.
(used_outside_loop_p): Likewise.
(get_vop_from_header): Likewise.
(hoist_guard): Likewise.
(check_exit_phi): Likewise.

   gcc/testsuite/ChangeLog:
* gcc.dg/loop-unswitch-2.c: New test.
* gcc.dg/loop-unswitch-3.c: Likewise.
* gcc.dg/loop-unswitch-4.c: Likewise.


2015-10-06 15:21 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Tue, Oct 6, 2015 at 1:41 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> Richard,
>>
>> Here is updated patch which reflects almost all your remarks:
>> 1. Use ordinary get_loop_body.
>> 2. Delete useless asserts.
>> 3. Use check on iterated loop instead of finite_loop_p.
>> 4. Do not update CFG by adjusting the CONDs condition to always true/false.
>> 5. Add couple tests.
>
> +  /* Add NEW_ADGE argument for all phi in post-header block.  */
> +  bb = exit->dest;
> +  for (gphi_iterator gsi = gsi_start_phis (bb);
> +       !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      gphi *phi = gsi.phi ();
> +      /* edge_iterator ei; */
> +      tree arg;
> +      if (virtual_operand_p (gimple_phi_result (phi)))
> +       {
> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>
> now I know what confused me - here you are looking at a loop exit PHI
> but querying with the preheader edge index.  I think you need to walk
> the loop header PHIs to find the PHI for the virtual operand and use that
> to get the PHI arg from?
>
> The side-effect / used-outside code is still the same.  What matters
> is side-effects outside of the loop-header protected code region, not
> blocks excluding the inner loop.  Say,
>
>   for (;;)
>     {
>       if (invariant-guard)
>         {
>            printf ("Blah");
>            for (;;)
>              ;
>         }
>     }
>
> would still ok to be unswitched.  So instead of
>
> +      if (body[i]->loop_father != loop)
> +       continue;
>
> it would be
>
>        if (dominated_by_p (CDI_DOMINATORS, body[i], header)
>            && !dominated_by_p (CDI_DOMINATORS, body[i], fe->dest))
>
> with the obvious improvement to the patch to not only consider header checks
> in the outer loop header but in the pre-header block of the inner loop.
>
> And I still think you should walk the exit PHIs args to see whether they
> are defined in the non-guarded region of the outer loop instead of walking
> all uses of all defs.
>
> Note that I think you miss endless loops as side-effects if that endless
> loop occurs through a irreducible region (thus not reflected in the
> loop tree).  Thus you should reject BB_IRREDUCIBLE_LOOP blocks
> in the non-guarded region as well.
>
> It seems to me that protecting adjacent loops with a single guard is
> also eligible for hoisting thus the restriction on loop->inner->next
> should become a restriction on no loops (or irreducible regions)
> in the non-guarded region.
>
> Most things can be improved as followup, but at least the
> virtual PHI arg thing needs to be sorted out.
>
> Thanks,
> Richard.
>
>
>> ChangeLog:
>> 2015-10-06  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>
>> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
>> "cfghooks.h", add prototypes for introduced new functions.
>> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
>> checks on ability of loop unswitching to tree_unswitch_single_loop;
>> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
>> on innermost loop check.
>> (tree_unswitch_single_loop): Add all required checks on ability of
>> loop unswitching under zero recursive level guard.
>> (tree_unswitch_outer_loop): New function.
>> (find_loop_guard): Likewise.
>> (empty_bb_without_guard_p): Likewise.
>> (used_outside_loop_p): Likewise.
>> (hoist_guard): Likewise.
>> (check_exit_phi): Likewise.
>>
>>    gcc/testsuite/ChangeLog:
>> * gcc.dg/loop-unswitch-2.c: New test.
>> * gcc.dg/loop-unswitch-3.c: Likewise.
>> * gcc.dg/loop-unswitch-4.c: Likewise.
>>
>> 2015-10-06 10:59 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>> On Mon, Oct 5, 2015 at 3:13 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>> Thanks Richard.
>>>> I'd like to answer on your last comment related to using of exit edge
>>>> argument for edge that skips loop.
>>>> Let's consider the following test-case:
>>>>
>>>> #include <stdlib.h>
>>>> #define N 32
>>>> float *foo(int ustride, int size, float *src)
>>>> {
>>>>    float *buffer, *p;
>>>>    int i, k;
>>>>
>>>>    if (!src)
>>>>     return NULL;
>>>>
>>>>    buffer = (float *) malloc(N * size * sizeof(float));
>>>>
>>>>    if(buffer)
>>>>       for(i=0, p=buffer; i<N; i++, src+=ustride)
>>>> for(k=0; k<size; k++)
>>>>  *p++ = src[k];
>>>>
>>>>    return buffer;
>>>> }
>>>>
>>>> Before adding new edge we have in post-header bb:
>>>>   <bb 9>:
>>>>   # _6 = PHI <0B(8), buffer_20(16)>
>>>>   return _6;
>>>>
>>>> It is clear that we must preserve function semantic and transform it to
>>>> _6 = PHI <0B(12), buffer_19(9), buffer_19(4)>
>>>
>>> Ah, yeah.  I was confusing the loop exit of the inner vs. the outer loop.
>>>
>>> Richard.
>>>
>>>>
>>>> 2015-10-05 13:57 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>> On Wed, Sep 30, 2015 at 12:46 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>> Hi Richard,
>>>>>>
>>>>>> I re-designed outer loop unswitching using basic idea of 23855 patch -
>>>>>> hoist invariant guard if loop is empty without guard. Note that this
>>>>>> was added to loop unswitching pass with simple modifications - using
>>>>>> another loop iterator etc.
>>>>>>
>>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>> What is your opinion?
>>>>>
>>>>> Overall it looks good.  Some comments below - a few more testcases would
>>>>> be nice as well.
>>>>>
>>>>> +  /* Loop must not be infinite.  */
>>>>> +  if (!finite_loop_p (loop))
>>>>> +    return false;
>>>>>
>>>>> why's that?
>>>>>
>>>>> +  body = get_loop_body_in_dom_order (loop);
>>>>> +  for (i = 0; i < loop->num_nodes; i++)
>>>>> +    {
>>>>> +      if (body[i]->loop_father != loop)
>>>>> +       continue;
>>>>> +      if (!empty_bb_without_guard_p (loop, body[i]))
>>>>>
>>>>> I wonder if there is a better way to iterate over the interesting
>>>>> blocks and PHIs
>>>>> we need to check for side-effects (and thus we maybe can avoid gathering
>>>>> the loop in DOM order).
>>>>>
>>>>> +      FOR_EACH_SSA_TREE_OPERAND (name, stmt, op_iter, SSA_OP_DEF)
>>>>> +       {
>>>>> +         if (may_be_used_outside
>>>>>
>>>>> may_be_used_outside can be hoisted above the loop.  I wonder if we can take
>>>>> advantage of loop-closed SSA form here (and the fact we have a single exit
>>>>> from the loop).  Iterating over exit dest PHIs and determining whether the
>>>>> exit edge DEF is inside the loop part it may not be should be enough.
>>>>>
>>>>> +  gcc_assert (single_succ_p (pre_header));
>>>>>
>>>>> that should be always true.
>>>>>
>>>>> +  gsi_remove (&gsi, false);
>>>>> +  bb = guard->dest;
>>>>> +  remove_edge (guard);
>>>>> +  /* Update dominance for destination of GUARD.  */
>>>>> +  if (EDGE_COUNT (bb->preds) == 0)
>>>>> +    {
>>>>> +      basic_block s_bb;
>>>>> +      gcc_assert (single_succ_p (bb));
>>>>> +      s_bb = single_succ (bb);
>>>>> +      delete_basic_block (bb);
>>>>> +      if (single_pred_p (s_bb))
>>>>> +       set_immediate_dominator (CDI_DOMINATORS, s_bb, single_pred (s_bb));
>>>>>
>>>>> all this massaging should be simplified by leaving it to CFG cleanup by
>>>>> simply adjusting the CONDs condition to always true/false.  There is
>>>>> gimple_cond_make_{true,false} () for this (would be nice to have a variant
>>>>> taking a bool).
>>>>>
>>>>> +  new_edge = make_edge (pre_header, exit->dest, flags);
>>>>> +  if (fix_dom_of_exit)
>>>>> +    set_immediate_dominator (CDI_DOMINATORS, exit->dest, pre_header);
>>>>> +  update_stmt (gsi_stmt (gsi));
>>>>>
>>>>> the update_stmt should be not necessary, it's done by gsi_insert_after already.
>>>>>
>>>>> +  /* Add NEW_ADGE argument for all phi in post-header block.  */
>>>>> +  bb = exit->dest;
>>>>> +  for (gphi_iterator gsi = gsi_start_phis (bb);
>>>>> +       !gsi_end_p (gsi); gsi_next (&gsi))
>>>>> +    {
>>>>> +      gphi *phi = gsi.phi ();
>>>>> +      /* edge_iterator ei; */
>>>>> +      tree arg;
>>>>> +      if (virtual_operand_p (gimple_phi_result (phi)))
>>>>> +       {
>>>>> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
>>>>> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>>>>> +       }
>>>>> +      else
>>>>> +       {
>>>>> +         /* Use exit edge argument.  */
>>>>> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
>>>>> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>>>>>
>>>>> Hum.  How is it ok to use the exit edge argument for the edge that skips
>>>>> the loop?  Why can't you always use the pre-header edge value?
>>>>> That is, if we have
>>>>>
>>>>>  for(i=0;i<m;++i)
>>>>>    {
>>>>>      if (n > 0)
>>>>>     {
>>>>>      for (;;)
>>>>>        {
>>>>>        }
>>>>>      }
>>>>>    }
>>>>>   ... = i;
>>>>>
>>>>> then i is used after the loop and the correct value to use if
>>>>> n > 0 is false is '0'.  Maybe this way we can also relax
>>>>> what check_exit_phi does?  IMHO the only restriction is
>>>>> if sth defined inside the loop before the header check for
>>>>> the inner loop is used after the loop.
>>>>>
>>>>> Thanks,
>>>>> Richard.
>>>>>
>>>>>> Thanks.
>>>>>>
>>>>>> ChangeLog:
>>>>>> 2015-09-30  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>>
>>>>>> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
>>>>>> "cfghooks.h", add prototypes for introduced new functions.
>>>>>> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
>>>>>> checks on ability of loop unswitching to tree_unswitch_single_loop;
>>>>>> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
>>>>>> on innermost loop check.
>>>>>> (tree_unswitch_single_loop): Add all required checks on ability of
>>>>>> loop unswitching under zero recursive level guard.
>>>>>> (tree_unswitch_outer_loop): New function.
>>>>>> (find_loop_guard): Likewise.
>>>>>> (empty_bb_without_guard_p): Likewise.
>>>>>> (used_outside_loop_p): Likewise.
>>>>>> (hoist_guard): Likewise.
>>>>>> (check_exit_phi): Likewise.
>>>>>>
>>>>>>    gcc/testsuite/ChangeLog:
>>>>>> * gcc.dg/loop-unswitch-2.c: New test.
>>>>>>
>>>>>> 2015-09-16 11:26 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>> Yeah, as said, the patch wasn't fully ready and it also felt odd to do
>>>>>>> this hoisting in loop header copying.  Integrating it
>>>>>>> with LIM would be a better fit eventually.
>>>>>>>
>>>>>>> Note that we did agree to go forward with your original patch just
>>>>>>> making it more "generically" perform outer loop
>>>>>>> unswitching.  Did you explore that idea further?
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On Tue, Sep 15, 2015 at 6:00 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>> Thanks Richard.
>>>>>>>>
>>>>>>>> I found one more issue that could not be fixed simply. In 23855 you
>>>>>>>> consider the following test-case:
>>>>>>>> void foo(int *ie, int *je, double *x)
>>>>>>>> {
>>>>>>>>   int i, j;
>>>>>>>>   for (j=0; j<*je; ++j)
>>>>>>>>     for (i=0; i<*ie; ++i)
>>>>>>>>       x[i+j] = 0.0;
>>>>>>>> }
>>>>>>>> and proposed to hoist up a check on *ie out of loop. It requires
>>>>>>>> memref alias analysis since in general x and ie can alias (if their
>>>>>>>> types are compatible - int *ie & int * x). Such analysis is performed
>>>>>>>> by pre or lim passes. Without such analysis we can not hoist a test on
>>>>>>>> non-zero for *ie out of loop using 238565 patch.
>>>>>>>>  The second concern is that proposed copy header algorithm changes
>>>>>>>> loop structure significantly and it is not accepted by vectorizer
>>>>>>>> since latch is not empty (such transformation assumes loop peeling for
>>>>>>>> one iteration. So I can propose to implement simple guard hoisting
>>>>>>>> without copying header and tail blocks (if it is possible).
>>>>>>>>
>>>>>>>> I will appreciate you for any advice or help since without such
>>>>>>>> hoisting we are not able to perform outer loop vectorization for
>>>>>>>> important benchmark.
>>>>>>>> and
>>>>>>>>
>>>>>>>> 2015-09-15 14:22 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>> On Thu, Sep 3, 2015 at 6:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>> Hi Richard,
>>>>>>>>>>
>>>>>>>>>> I started learning, tuning and debugging patch proposed in 23855 and
>>>>>>>>>> discovered thta it does not work properly.
>>>>>>>>>> So I wonder is it tested patch and it should work?
>>>>>>>>>
>>>>>>>>> I don't remember, but as it wasn't committed it certainly wasn't ready.
>>>>>>>>>
>>>>>>>>>> Should it accept for hoisting the following loop nest
>>>>>>>>>>   for (i=0; i<n; i++) {
>>>>>>>>>>     s = 0;
>>>>>>>>>>     for (j=0; j<m; j++)
>>>>>>>>>>       s += a[i] * b[j];
>>>>>>>>>>     c[i] = s;
>>>>>>>>>>   }
>>>>>>>>>> Note that i-loop will nit be empty if m is equal to 0.
>>>>>>>>>
>>>>>>>>> if m is equal to 0 then we still have the c[i] = s store, no?  Of course
>>>>>>>>> we could unswitch the outer loop on m == 0 but simple hoisting wouldn't work.
>>>>>>>>>
>>>>>>>>> Richard.
>>>>>>>>>
>>>>>>>>>> 2015-08-03 10:27 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>> On Fri, Jul 31, 2015 at 1:17 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>>
>>>>>>>>>>>> I learned your updated patch for 23825 and it is more general in
>>>>>>>>>>>> comparison with my.
>>>>>>>>>>>> I'd like to propose you a compromise - let's consider my patch only
>>>>>>>>>>>> for force-vectorize outer loop only to allow outer-loop
>>>>>>>>>>>> vecctorization.
>>>>>>>>>>>
>>>>>>>>>>> I don't see why we should special-case that if the approach in 23825
>>>>>>>>>>> is sensible.
>>>>>>>>>>>
>>>>>>>>>>>> Note that your approach will not hoist invariant
>>>>>>>>>>>> guards if loops contains something else except for inner-loop, i.e. it
>>>>>>>>>>>> won't be empty for taken branch.
>>>>>>>>>>>
>>>>>>>>>>> Yes, it does not perform unswitching but guard hoisting.  Note that this
>>>>>>>>>>> is originally Zdenek Dvoraks patch.
>>>>>>>>>>>
>>>>>>>>>>>> I also would like to answer on your last question - CFG cleanup is
>>>>>>>>>>>> invoked to perform deletion of single-argument phi nodes from tail
>>>>>>>>>>>> block through substitution - such phi's prevent outer-loop
>>>>>>>>>>>> vectorization. But it is clear that such transformation can be done
>>>>>>>>>>>> other pass.
>>>>>>>>>>>
>>>>>>>>>>> Hmm, I wonder why the copy_prop pass after unswitching does not
>>>>>>>>>>> get rid of them?
>>>>>>>>>>>
>>>>>>>>>>>> What is your opinion?
>>>>>>>>>>>
>>>>>>>>>>> My opinion is that if we want to enhance unswitching to catch this
>>>>>>>>>>> (or similar) cases then we should make it a lot more general than
>>>>>>>>>>> your pattern-matching approach.  I see nothing that should prevent
>>>>>>>>>>> us from considering unswitching non-innermost loops in general.
>>>>>>>>>>> It should be only a cost consideration to not do non-innermost loop
>>>>>>>>>>> unswitching (in addition to maybe a --param specifying the maximum
>>>>>>>>>>> depth of a loop nest to unswitch).
>>>>>>>>>>>
>>>>>>>>>>> So my first thought when seeing your patch still holds - the patch
>>>>>>>>>>> looks very much too specific.
>>>>>>>>>>>
>>>>>>>>>>> Richard.
>>>>>>>>>>>
>>>>>>>>>>>> Yuri.
>>>>>>>>>>>>
>>>>>>>>>>>> 2015-07-28 13:50 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>>>> On Thu, Jul 23, 2015 at 4:45 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I checked that both test-cases from 23855 are sucessfully unswitched
>>>>>>>>>>>>>> by proposed patch. I understand that it does not catch deeper loop
>>>>>>>>>>>>>> nest as
>>>>>>>>>>>>>>    for (i=0; i<10; i++)
>>>>>>>>>>>>>>      for (j=0;j<n;j++)
>>>>>>>>>>>>>>         for (k=0;k<20;k++)
>>>>>>>>>>>>>>   ...
>>>>>>>>>>>>>> but duplication of middle-loop does not look reasonable.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Here is dump for your second test-case:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> void foo(int *ie, int *je, double *x)
>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>   int i, j;
>>>>>>>>>>>>>>   for (j=0; j<*je; ++j)
>>>>>>>>>>>>>>     for (i=0; i<*ie; ++i)
>>>>>>>>>>>>>>       x[i+j] = 0.0;
>>>>>>>>>>>>>> }
>>>>>>>>>>>>>> grep -i unswitch t6.c.119t.unswitch
>>>>>>>>>>>>>> ;; Unswitching outer loop
>>>>>>>>>>>>>
>>>>>>>>>>>>> I was saying that why go with a limited approach when a patch (in
>>>>>>>>>>>>> unknown state...)
>>>>>>>>>>>>> is available that does it more generally?  Also unswitching is quite
>>>>>>>>>>>>> expensive compared
>>>>>>>>>>>>> to "moving" the invariant condition.
>>>>>>>>>>>>>
>>>>>>>>>>>>> In your patch:
>>>>>>>>>>>>>
>>>>>>>>>>>>> +  if (!nloop->force_vectorize)
>>>>>>>>>>>>> +    nloop->force_vectorize = true;
>>>>>>>>>>>>> +  if (loop->safelen != 0)
>>>>>>>>>>>>> +    nloop->safelen = loop->safelen;
>>>>>>>>>>>>>
>>>>>>>>>>>>> I see no guard on force_vectorize so = true looks bogus here.  Please just use
>>>>>>>>>>>>> copy_loop_info.
>>>>>>>>>>>>>
>>>>>>>>>>>>> +  if (integer_nonzerop (cond_new))
>>>>>>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_true_node);
>>>>>>>>>>>>> +  else if (integer_zerop (cond_new))
>>>>>>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_false_node);
>>>>>>>>>>>>>
>>>>>>>>>>>>> gimple_cond_make_true/false (cond_stmt);
>>>>>>>>>>>>>
>>>>>>>>>>>>> btw, seems odd that we have to recompute which loop is the true / false variant
>>>>>>>>>>>>> when we just fed a guard condition to loop_version.  Can't we statically
>>>>>>>>>>>>> determine whether loop or nloop has the in-loop condition true or false?
>>>>>>>>>>>>>
>>>>>>>>>>>>> +  /* Clean-up cfg to remove useless one-argument phi in exit block of
>>>>>>>>>>>>> +     outer-loop.  */
>>>>>>>>>>>>> +  cleanup_tree_cfg ();
>>>>>>>>>>>>>
>>>>>>>>>>>>> I know unswitching is already O(number-of-unswitched-loops * size-of-function)
>>>>>>>>>>>>> because it updates SSA form after each individual unswitching (and it does that
>>>>>>>>>>>>> because it invokes itself recursively on unswitched loops).  But do you really
>>>>>>>>>>>>> need to invoke CFG cleanup here?
>>>>>>>>>>>>>
>>>>>>>>>>>>> Richard.
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Yuri.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> 2015-07-14 14:06 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>>>>>> On Fri, Jul 10, 2015 at 12:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>>>>>> Hi All,
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Here is presented simple transformation which tries to hoist out of
>>>>>>>>>>>>>>>> outer-loop a check on zero trip count for inner-loop. This is very
>>>>>>>>>>>>>>>> restricted transformation since it accepts outer-loops with very
>>>>>>>>>>>>>>>> simple cfg, as for example:
>>>>>>>>>>>>>>>>     acc = 0;
>>>>>>>>>>>>>>>>    for (i = 1; i <= m; i++) {
>>>>>>>>>>>>>>>>       for (j = 0; j < n; j++)
>>>>>>>>>>>>>>>>          if (l[j] == i) { v[j] = acc; acc++; };
>>>>>>>>>>>>>>>>       acc <<= 1;
>>>>>>>>>>>>>>>>    }
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Note that degenerative outer loop (without inner loop) will be
>>>>>>>>>>>>>>>> completely deleted as dead code.
>>>>>>>>>>>>>>>> The main goal of this transformation was to convert outer-loop to form
>>>>>>>>>>>>>>>> accepted by outer-loop vectorization (such test-case is also included
>>>>>>>>>>>>>>>> to patch).
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Is it OK for trunk?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I think this is
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=23855
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> as well.  It has a patch adding a invariant loop guard hoisting
>>>>>>>>>>>>>>> phase to loop-header copying.  Yeah, it needs updating to
>>>>>>>>>>>>>>> trunk again I suppose.  It's always non-stage1 when I come
>>>>>>>>>>>>>>> back to that patch.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Your patch seems to be very specific and only handles outer
>>>>>>>>>>>>>>> loops of innermost loops.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Richard.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> ChangeLog:
>>>>>>>>>>>>>>>> 2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> * tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
>>>>>>>>>>>>>>>> "gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
>>>>>>>>>>>>>>>> (tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
>>>>>>>>>>>>>>>> (tree_unswitch_outer_loop): New function.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> gcc/testsuite/ChangeLog:
>>>>>>>>>>>>>>>> * gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
>>>>>>>>>>>>>>>> * gcc.dg/vect/vect-outer-simd-3.c: New test.

[-- Attachment #2: patch.update2 --]
[-- Type: application/octet-stream, Size: 17376 bytes --]

diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-2.c b/gcc/testsuite/gcc.dg/loop-unswitch-2.c
new file mode 100644
index 0000000..95622a6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details" } */
+
+void foo (float **a, float **b, float *c, int n, int m, int l)
+{
+  int i,j,k;
+  float s;
+  for (i=0; i<l; i++)
+    {
+      for (j=0; j<n; j++)
+	for (k=0; k<m; k++)
+	  c[i] += a[i][k] * b[k][j];
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "guard hoisted" 2 "unswitch" } } */
diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-3.c b/gcc/testsuite/gcc.dg/loop-unswitch-3.c
new file mode 100644
index 0000000..d043eb6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-3.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details" } */
+
+#include <stdlib.h>
+#define N 32
+float *foo(int ustride, int size, float *src)
+{
+   float *buffer, *p;
+   int i, k;
+
+   if (!src)
+    return NULL;
+
+   buffer = (float *) malloc(N * size * sizeof(float));
+
+   if(buffer)
+      for(i=0, p=buffer; i<N; i++, src+=ustride)
+	for(k=0; k<size; k++)
+	  *p++ = src[k];
+
+   return buffer;
+}
+
+/* { dg-final { scan-tree-dump-times "guard hoisted" 1 "unswitch" } } */
+
diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-4.c b/gcc/testsuite/gcc.dg/loop-unswitch-4.c
new file mode 100644
index 0000000..e4a7f2e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-4.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -funswitch-loops" } */
+
+#include <stdlib.h>
+__attribute__ ((noinline))
+void foo (float **a, float **b, float *c, int n, int m, int l)
+{
+  int i,j,k;
+  float s;
+  for (i=0; i<l; i++)
+    for (j=0; j<n; j++)
+      for (k=0; k<m; k++)
+	c[i] += a[i][k] * b[k][j];
+}
+
+int main()
+{
+  const int N = 32;
+  float **ar1, **ar2;
+  float *res;
+  int i, j;
+  ar1 = (float **)malloc (N * sizeof (float*));
+  ar2 = (float **)malloc (N * sizeof (float*));
+  res = (float *)malloc( N * sizeof (float));
+  for (i=0; i<N; i++)
+    {
+      ar1[i] = (float*)malloc (N * sizeof (float));
+      ar2[i] = (float*)malloc (N * sizeof (float));
+    }
+  for (i=0; i<N; i++)
+    {
+      for (j=0; j<N; j++)
+	{
+	  ar1[i][j] = 2.0f;
+	  ar2[i][j] = 1.5f;
+	}
+      res[i] = 0.0f;
+    }
+  foo (ar1, ar2, res, N, N, N);
+  for (i=0; i<N; i++)
+    if (res[i] != 3072.0f)
+      abort();
+  for (i=0; i<N; i++)
+    res[i] = 0.0f;
+  foo (ar1, ar2, res, N, 0, N);
+  for (i=0; i<N; i++)
+    if (res[i] != 0.0f)
+      abort();
+  return 0;
+}
+
diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
index a273638..ed5a64a 100644
--- a/gcc/tree-ssa-loop-unswitch.c
+++ b/gcc/tree-ssa-loop-unswitch.c
@@ -39,6 +39,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "tree-pass.h"
 #include "tree-inline.h"
+#include "gimple-iterator.h"
+#include "cfghooks.h"
 
 /* This file implements the loop unswitching, i.e. transformation of loops like
 
@@ -79,6 +81,13 @@ along with GCC; see the file COPYING3.  If not see
 static struct loop *tree_unswitch_loop (struct loop *, basic_block, tree);
 static bool tree_unswitch_single_loop (struct loop *, int);
 static tree tree_may_unswitch_on (basic_block, struct loop *);
+static bool tree_unswitch_outer_loop (struct loop *);
+static edge find_loop_guard (struct loop *);
+static bool empty_bb_without_guard_p (struct loop *, basic_block);
+static bool used_outside_loop_p (struct loop *, tree);
+static void hoist_guard (struct loop *, edge);
+static bool check_exit_phi (struct loop *);
+static tree get_vop_from_header (struct loop *);
 
 /* Main entry point.  Perform loop unswitching on all suitable loops.  */
 
@@ -87,42 +96,15 @@ tree_ssa_unswitch_loops (void)
 {
   struct loop *loop;
   bool changed = false;
-  HOST_WIDE_INT iterations;
 
-  /* Go through inner loops (only original ones).  */
-  FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
+  /* Go through all loops starting from innermost.  */
+  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
     {
-      if (dump_file && (dump_flags & TDF_DETAILS))
-        fprintf (dump_file, ";; Considering loop %d\n", loop->num);
-
-      /* Do not unswitch in cold regions. */
-      if (optimize_loop_for_size_p (loop))
-        {
-          if (dump_file && (dump_flags & TDF_DETAILS))
-            fprintf (dump_file, ";; Not unswitching cold loops\n");
-          continue;
-        }
-
-      /* The loop should not be too large, to limit code growth. */
-      if (tree_num_loop_insns (loop, &eni_size_weights)
-          > (unsigned) PARAM_VALUE (PARAM_MAX_UNSWITCH_INSNS))
-        {
-          if (dump_file && (dump_flags & TDF_DETAILS))
-            fprintf (dump_file, ";; Not unswitching, loop too big\n");
-          continue;
-        }
-
-      /* If the loop is not expected to iterate, there is no need
-	 for unswitching.  */
-      iterations = estimated_loop_iterations_int (loop);
-      if (iterations >= 0 && iterations <= 1)
-	{
-          if (dump_file && (dump_flags & TDF_DETAILS))
-            fprintf (dump_file, ";; Not unswitching, loop is not expected to iterate\n");
-          continue;
-	}
-
-      changed |= tree_unswitch_single_loop (loop, 0);
+      if (!loop->inner)
+	/* Unswitch innermost loop.  */
+	changed |= tree_unswitch_single_loop (loop, 0);
+      else
+	changed |= tree_unswitch_outer_loop (loop);
     }
 
   if (changed)
@@ -216,6 +198,39 @@ tree_unswitch_single_loop (struct loop *loop, int num)
   tree cond = NULL_TREE;
   gimple stmt;
   bool changed = false;
+  HOST_WIDE_INT iterations;
+
+  /* Perform initial tests if unswitch is eligible.  */
+  if (num == 0)
+    {
+      /* Do not unswitch in cold regions. */
+      if (optimize_loop_for_size_p (loop))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, ";; Not unswitching cold loops\n");
+	  return false;
+	}
+
+      /* The loop should not be too large, to limit code growth. */
+      if (tree_num_loop_insns (loop, &eni_size_weights)
+	  > (unsigned) PARAM_VALUE (PARAM_MAX_UNSWITCH_INSNS))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, ";; Not unswitching, loop too big\n");
+	  return false;
+	}
+
+      /* If the loop is not expected to iterate, there is no need
+	 for unswitching.  */
+      iterations = estimated_loop_iterations_int (loop);
+      if (iterations >= 0 && iterations <= 1)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, ";; Not unswitching, loop is not expected"
+		     " to iterate\n");
+	  return false;
+	}
+    }
 
   i = 0;
   bbs = get_loop_body (loop);
@@ -403,6 +418,374 @@ tree_unswitch_loop (struct loop *loop,
 		       REG_BR_PROB_BASE - prob_true, false);
 }
 
+/* Unswitch outer loops by hoisting invariant guard on
+   inner loop without code duplication.  */
+static bool
+tree_unswitch_outer_loop (struct loop *loop)
+{
+  edge exit, guard;
+  HOST_WIDE_INT iterations;
+
+  gcc_assert (loop->inner);
+  if (loop->inner->next)
+    return false;
+  /* Accept loops with single exit only.  */
+  exit = single_exit (loop);
+  if (!exit)
+    return false;
+  /* Check that phi argument of exit edge is not defined inside loop.  */
+  if (!check_exit_phi (loop))
+    return false;
+  /* If the loop is not expected to iterate, there is no need
+      for unswitching.  */
+  iterations = estimated_loop_iterations_int (loop);
+  if (iterations >= 0 && iterations <= 1)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, ";; Not unswitching, loop is not expected"
+		 " to iterate\n");
+	return false;
+    }
+
+  guard = find_loop_guard (loop);
+  if (guard)
+    {
+      hoist_guard (loop, guard);
+      update_ssa (TODO_update_ssa);
+      return true;
+    }
+  return false;
+}
+
+/* Checks if the body of the LOOP is within an invariant guard.  If this
+   is the case, returns the edge that jumps over the real body of the loop,
+   otherwise returns NULL.  */
+
+static edge
+find_loop_guard (struct loop *loop)
+{
+  basic_block header = loop->header;
+  edge guard_edge, te, fe;
+  /* bitmap processed, known_invariants;*/
+  basic_block *body = NULL;
+  unsigned i;
+  tree use;
+  ssa_op_iter iter;
+
+  /* We check for the following situation:
+
+     while (1)
+       {
+	 [header]]
+         loop_phi_nodes;
+	 something1;
+	 if (cond1)
+	   body;
+	 nvar = phi(orig, bvar) ... for all variables changed in body;
+	 [guard_end]
+	 something2;
+	 if (cond2)
+	   break;
+	 something3;
+       }
+
+     where:
+
+     1) cond1 is loop invariant
+     2) If cond1 is false, then the loop is essentially empty; i.e.,
+	a) nothing in something1, something2 and something3 has side
+	   effects
+	b) anything defined in something1, something2 and something3
+	   is not used outside of the loop.  */
+
+  while (single_succ_p (header))
+    header = single_succ (header);
+  if (!last_stmt (header)
+      || gimple_code (last_stmt (header)) != GIMPLE_COND)
+    return NULL;
+
+  extract_true_false_edges_from_block (header, &te, &fe);
+  if (!flow_bb_inside_loop_p (loop, te->dest)
+      || !flow_bb_inside_loop_p (loop, fe->dest))
+    return NULL;
+
+  if (just_once_each_iteration_p (loop, te->dest)
+      || (single_succ_p (te->dest)
+	  && just_once_each_iteration_p (loop, single_succ (te->dest))))
+    {
+      if (just_once_each_iteration_p (loop, fe->dest))
+	return NULL;
+      guard_edge = te;
+    }
+  else if (just_once_each_iteration_p (loop, fe->dest)
+	   || (single_succ_p (fe->dest)
+	       && just_once_each_iteration_p (loop, single_succ (fe->dest))))
+    guard_edge = fe;
+  else
+    return NULL;
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file,
+	     "Considering guard %d -> %d in loop %d\n",
+	     guard_edge->src->index, guard_edge->dest->index, loop->num);
+  /* Check if condition operands do not have definitions inside loop since
+     any bb copying is not performed.  */
+  FOR_EACH_SSA_TREE_OPERAND (use, last_stmt (header), iter, SSA_OP_USE)
+    {
+      gimple def = SSA_NAME_DEF_STMT (use);
+      basic_block def_bb = gimple_bb (def);
+      if (def_bb
+          && flow_bb_inside_loop_p (loop, def_bb))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "  guard operands have definitions"
+				" inside loop\n");
+	  return NULL;
+	}
+    }
+
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      basic_block bb = body[i];
+      if (bb->loop_father != loop)
+	continue;
+      if (bb->flags & BB_IRREDUCIBLE_LOOP)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "Block %d is marked as irreducible in loop\n",
+		      bb->index);
+	  guard_edge = NULL;
+	  goto end;
+	}
+      if (!empty_bb_without_guard_p (loop, bb))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "  block %d has side effects\n", bb->index);
+	  guard_edge = NULL;
+	  goto end;
+	}
+    }
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "  suitable to hoist\n");
+end:
+  if (body)
+    free (body);
+  return guard_edge;
+}
+
+/* Returns true if
+   1) no statement in BB has side effects
+   2) assuming that edge GUARD is always taken, all definitions in BB
+      are noy used outside of the loop.
+   KNOWN_INVARIANTS is a set of ssa names we know to be invariant, and
+   PROCESSED is a set of ssa names for that we already tested whether they
+   are invariant or not.  */
+
+static bool
+empty_bb_without_guard_p (struct loop *loop, basic_block bb)
+{
+  basic_block exit_bb = single_exit (loop)->src;
+  bool may_be_used_outside = (bb == exit_bb
+			      || !dominated_by_p (CDI_DOMINATORS, bb, exit_bb));
+  tree name;
+  ssa_op_iter op_iter;
+
+  /* Phi nodes do not have side effects, but their results might be used
+     outside of the loop.  */
+  if (may_be_used_outside)
+    {
+      for (gphi_iterator gsi = gsi_start_phis (bb);
+	   !gsi_end_p (gsi); gsi_next (&gsi))
+	{
+	  gphi *phi = gsi.phi ();
+	  name = PHI_RESULT (phi);
+	  if (virtual_operand_p (name))
+	    continue;
+
+	  if (used_outside_loop_p (loop, name))
+	    return false;
+	}
+    }
+
+  for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple stmt = gsi_stmt (gsi);
+      if (gimple_has_side_effects (stmt))
+	return false;
+
+      if (gimple_vdef(stmt))
+	return false;
+
+      FOR_EACH_SSA_TREE_OPERAND (name, stmt, op_iter, SSA_OP_DEF)
+	{
+	  if (may_be_used_outside
+	      && used_outside_loop_p (loop, name))
+	    return false;
+	}
+    }
+  return true;
+}
+
+/* Return true if NAME is used outside of LOOP.  */
+
+static bool
+used_outside_loop_p (struct loop *loop, tree name)
+{
+  imm_use_iterator it;
+  use_operand_p use;
+
+  FOR_EACH_IMM_USE_FAST (use, it, name)
+    {
+      gimple stmt = USE_STMT (use);
+      if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
+	return true;
+    }
+
+  return false;
+}
+
+/* Return argument for loop preheader edge in header virtual phi if any.  */
+
+static tree
+get_vop_from_header (struct loop *loop)
+{
+  for (gphi_iterator gsi = gsi_start_phis (loop->header);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      if (!virtual_operand_p (gimple_phi_result (phi)))
+	continue;
+      return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
+    }
+  return NULL_TREE;
+}
+
+/* Move the check of GUARD outside of LOOP.  */
+
+static void
+hoist_guard (struct loop *loop, edge guard)
+{
+  edge exit = single_exit (loop);
+  edge preh = loop_preheader_edge (loop);
+  basic_block pre_header = preh->src;
+  basic_block bb;
+  edge te, fe, e, new_edge;
+  gimple stmt;
+  basic_block guard_bb = guard->src;
+  gimple_stmt_iterator gsi;
+  int flags = 0;
+  bool fix_dom_of_exit;
+  gcond *cond_stmt, *new_cond_stmt;
+
+  bb = get_immediate_dominator (CDI_DOMINATORS, exit->dest);
+  fix_dom_of_exit = flow_bb_inside_loop_p (loop, bb);
+  gsi = gsi_last_bb (guard_bb);
+  stmt = gsi_stmt (gsi);
+  gcc_assert (gimple_code (stmt) == GIMPLE_COND);
+  cond_stmt = as_a <gcond *> (stmt);
+  extract_true_false_edges_from_block (guard_bb, &te, &fe);
+  /* Insert guard to PRE_HEADER.  */
+  if (!empty_block_p (pre_header))
+    gsi = gsi_last_bb (pre_header);
+  else
+    gsi = gsi_start_bb (pre_header);
+  /* Create copy of COND_STMT.  */
+  new_cond_stmt = gimple_build_cond (gimple_cond_code (cond_stmt),
+				     gimple_cond_lhs (cond_stmt),
+				     gimple_cond_rhs (cond_stmt),
+				     NULL_TREE, NULL_TREE);
+  gsi_insert_after (&gsi, new_cond_stmt, GSI_NEW_STMT);
+  /* Convert COND_STMT to true/false conditional.  */
+  if (guard == te)
+    gimple_cond_make_false (cond_stmt);
+  else
+    gimple_cond_make_true (cond_stmt);
+  update_stmt (cond_stmt);
+  /* Create new loop pre-header.  */
+  e = split_block (pre_header, last_stmt (pre_header));
+  gcc_assert (loop_preheader_edge (loop)->src == e->dest);
+  if (guard == fe)
+    {
+      e->flags = EDGE_TRUE_VALUE;
+      flags |= EDGE_FALSE_VALUE;
+    }
+  else
+    {
+      e->flags = EDGE_FALSE_VALUE;
+      flags |= EDGE_TRUE_VALUE;
+    }
+  new_edge = make_edge (pre_header, exit->dest, flags);
+  if (fix_dom_of_exit)
+    set_immediate_dominator (CDI_DOMINATORS, exit->dest, pre_header);
+  /* Add NEW_ADGE argument for all phi in post-header block.  */
+  bb = exit->dest;
+  for (gphi_iterator gsi = gsi_start_phis (bb);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      tree arg;
+      if (virtual_operand_p (gimple_phi_result (phi)))
+	{
+	  arg = get_vop_from_header (loop);
+	  if (arg == NULL_TREE)
+	    /* Use exit edge argument.  */
+	    arg =  PHI_ARG_DEF_FROM_EDGE (phi, exit);
+	  add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
+	}
+      else
+	{
+	  /* Use exit edge argument.  */
+	  arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+	  add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
+	}
+    }
+
+  mark_virtual_operands_for_renaming (cfun);
+  update_ssa (TODO_update_ssa);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "  guard hoisted.\n");
+}
+
+/* Return true if phi argument for exit edge can be used
+   for edge around loop.  */
+
+static bool
+check_exit_phi (struct loop *loop)
+{
+  edge exit = single_exit (loop);
+  basic_block pre_header = loop_preheader_edge (loop)->src;
+
+  for (gphi_iterator gsi = gsi_start_phis (exit->dest);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      tree arg;
+      gimple def;
+      basic_block def_bb;
+      if (virtual_operand_p (gimple_phi_result (phi)))
+	continue;
+      arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+      if (TREE_CODE (arg) != SSA_NAME)
+	continue;
+      def = SSA_NAME_DEF_STMT (arg);
+      if (!def)
+	continue;
+      def_bb = gimple_bb (def);
+      if (!def_bb)
+	continue;
+      if (!dominated_by_p (CDI_DOMINATORS, pre_header, def_bb))
+	/* Definition inside loop!  */
+	return false;
+      /* Check loop closed phi invariant.  */
+      if (!flow_bb_inside_loop_p (def_bb->loop_father, pre_header))
+	return false;
+    }
+  return true;
+}
+
 /* Loop unswitching pass.  */
 
 namespace {

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-10-07  9:53                               ` Yuri Rumyantsev
@ 2015-10-07 15:26                                 ` Yuri Rumyantsev
  2015-10-08 12:31                                   ` Richard Biener
  2015-10-09 19:05                                 ` H.J. Lu
  1 sibling, 1 reply; 17+ messages in thread
From: Yuri Rumyantsev @ 2015-10-07 15:26 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Igor Zamyatin

[-- Attachment #1: Type: text/plain, Size: 22457 bytes --]

Richard,

I noticed that 'gimple' type was changed and send you updated patch.

Thanks.
Yuri.

2015-10-07 12:53 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
> Richard,
>
> I've fixed adding virtual phi argument and add check on irreducible basic block.
> New patch is attached.
>
> I checked it for bootstrap and regression testing, no new failures.
>
> ChangeLog:
> 2015-10-07  Yuri Rumyantsev  <ysrumyan@gmail.com>
>
> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
> "cfghooks.h", add prototypes for introduced new functions.
> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
> checks on ability of loop unswitching to tree_unswitch_single_loop;
> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
> on innermost loop check.
> (tree_unswitch_single_loop): Add all required checks on ability of
> loop unswitching under zero recursive level guard.
> (tree_unswitch_outer_loop): New function.
> (find_loop_guard): Likewise.
> (empty_bb_without_guard_p): Likewise.
> (used_outside_loop_p): Likewise.
> (get_vop_from_header): Likewise.
> (hoist_guard): Likewise.
> (check_exit_phi): Likewise.
>
>    gcc/testsuite/ChangeLog:
> * gcc.dg/loop-unswitch-2.c: New test.
> * gcc.dg/loop-unswitch-3.c: Likewise.
> * gcc.dg/loop-unswitch-4.c: Likewise.
>
>
> 2015-10-06 15:21 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Tue, Oct 6, 2015 at 1:41 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>> Richard,
>>>
>>> Here is updated patch which reflects almost all your remarks:
>>> 1. Use ordinary get_loop_body.
>>> 2. Delete useless asserts.
>>> 3. Use check on iterated loop instead of finite_loop_p.
>>> 4. Do not update CFG by adjusting the CONDs condition to always true/false.
>>> 5. Add couple tests.
>>
>> +  /* Add NEW_ADGE argument for all phi in post-header block.  */
>> +  bb = exit->dest;
>> +  for (gphi_iterator gsi = gsi_start_phis (bb);
>> +       !gsi_end_p (gsi); gsi_next (&gsi))
>> +    {
>> +      gphi *phi = gsi.phi ();
>> +      /* edge_iterator ei; */
>> +      tree arg;
>> +      if (virtual_operand_p (gimple_phi_result (phi)))
>> +       {
>> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
>> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>>
>> now I know what confused me - here you are looking at a loop exit PHI
>> but querying with the preheader edge index.  I think you need to walk
>> the loop header PHIs to find the PHI for the virtual operand and use that
>> to get the PHI arg from?
>>
>> The side-effect / used-outside code is still the same.  What matters
>> is side-effects outside of the loop-header protected code region, not
>> blocks excluding the inner loop.  Say,
>>
>>   for (;;)
>>     {
>>       if (invariant-guard)
>>         {
>>            printf ("Blah");
>>            for (;;)
>>              ;
>>         }
>>     }
>>
>> would still ok to be unswitched.  So instead of
>>
>> +      if (body[i]->loop_father != loop)
>> +       continue;
>>
>> it would be
>>
>>        if (dominated_by_p (CDI_DOMINATORS, body[i], header)
>>            && !dominated_by_p (CDI_DOMINATORS, body[i], fe->dest))
>>
>> with the obvious improvement to the patch to not only consider header checks
>> in the outer loop header but in the pre-header block of the inner loop.
>>
>> And I still think you should walk the exit PHIs args to see whether they
>> are defined in the non-guarded region of the outer loop instead of walking
>> all uses of all defs.
>>
>> Note that I think you miss endless loops as side-effects if that endless
>> loop occurs through a irreducible region (thus not reflected in the
>> loop tree).  Thus you should reject BB_IRREDUCIBLE_LOOP blocks
>> in the non-guarded region as well.
>>
>> It seems to me that protecting adjacent loops with a single guard is
>> also eligible for hoisting thus the restriction on loop->inner->next
>> should become a restriction on no loops (or irreducible regions)
>> in the non-guarded region.
>>
>> Most things can be improved as followup, but at least the
>> virtual PHI arg thing needs to be sorted out.
>>
>> Thanks,
>> Richard.
>>
>>
>>> ChangeLog:
>>> 2015-10-06  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>
>>> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
>>> "cfghooks.h", add prototypes for introduced new functions.
>>> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
>>> checks on ability of loop unswitching to tree_unswitch_single_loop;
>>> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
>>> on innermost loop check.
>>> (tree_unswitch_single_loop): Add all required checks on ability of
>>> loop unswitching under zero recursive level guard.
>>> (tree_unswitch_outer_loop): New function.
>>> (find_loop_guard): Likewise.
>>> (empty_bb_without_guard_p): Likewise.
>>> (used_outside_loop_p): Likewise.
>>> (hoist_guard): Likewise.
>>> (check_exit_phi): Likewise.
>>>
>>>    gcc/testsuite/ChangeLog:
>>> * gcc.dg/loop-unswitch-2.c: New test.
>>> * gcc.dg/loop-unswitch-3.c: Likewise.
>>> * gcc.dg/loop-unswitch-4.c: Likewise.
>>>
>>> 2015-10-06 10:59 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>> On Mon, Oct 5, 2015 at 3:13 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>> Thanks Richard.
>>>>> I'd like to answer on your last comment related to using of exit edge
>>>>> argument for edge that skips loop.
>>>>> Let's consider the following test-case:
>>>>>
>>>>> #include <stdlib.h>
>>>>> #define N 32
>>>>> float *foo(int ustride, int size, float *src)
>>>>> {
>>>>>    float *buffer, *p;
>>>>>    int i, k;
>>>>>
>>>>>    if (!src)
>>>>>     return NULL;
>>>>>
>>>>>    buffer = (float *) malloc(N * size * sizeof(float));
>>>>>
>>>>>    if(buffer)
>>>>>       for(i=0, p=buffer; i<N; i++, src+=ustride)
>>>>> for(k=0; k<size; k++)
>>>>>  *p++ = src[k];
>>>>>
>>>>>    return buffer;
>>>>> }
>>>>>
>>>>> Before adding new edge we have in post-header bb:
>>>>>   <bb 9>:
>>>>>   # _6 = PHI <0B(8), buffer_20(16)>
>>>>>   return _6;
>>>>>
>>>>> It is clear that we must preserve function semantic and transform it to
>>>>> _6 = PHI <0B(12), buffer_19(9), buffer_19(4)>
>>>>
>>>> Ah, yeah.  I was confusing the loop exit of the inner vs. the outer loop.
>>>>
>>>> Richard.
>>>>
>>>>>
>>>>> 2015-10-05 13:57 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>> On Wed, Sep 30, 2015 at 12:46 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>> Hi Richard,
>>>>>>>
>>>>>>> I re-designed outer loop unswitching using basic idea of 23855 patch -
>>>>>>> hoist invariant guard if loop is empty without guard. Note that this
>>>>>>> was added to loop unswitching pass with simple modifications - using
>>>>>>> another loop iterator etc.
>>>>>>>
>>>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>>> What is your opinion?
>>>>>>
>>>>>> Overall it looks good.  Some comments below - a few more testcases would
>>>>>> be nice as well.
>>>>>>
>>>>>> +  /* Loop must not be infinite.  */
>>>>>> +  if (!finite_loop_p (loop))
>>>>>> +    return false;
>>>>>>
>>>>>> why's that?
>>>>>>
>>>>>> +  body = get_loop_body_in_dom_order (loop);
>>>>>> +  for (i = 0; i < loop->num_nodes; i++)
>>>>>> +    {
>>>>>> +      if (body[i]->loop_father != loop)
>>>>>> +       continue;
>>>>>> +      if (!empty_bb_without_guard_p (loop, body[i]))
>>>>>>
>>>>>> I wonder if there is a better way to iterate over the interesting
>>>>>> blocks and PHIs
>>>>>> we need to check for side-effects (and thus we maybe can avoid gathering
>>>>>> the loop in DOM order).
>>>>>>
>>>>>> +      FOR_EACH_SSA_TREE_OPERAND (name, stmt, op_iter, SSA_OP_DEF)
>>>>>> +       {
>>>>>> +         if (may_be_used_outside
>>>>>>
>>>>>> may_be_used_outside can be hoisted above the loop.  I wonder if we can take
>>>>>> advantage of loop-closed SSA form here (and the fact we have a single exit
>>>>>> from the loop).  Iterating over exit dest PHIs and determining whether the
>>>>>> exit edge DEF is inside the loop part it may not be should be enough.
>>>>>>
>>>>>> +  gcc_assert (single_succ_p (pre_header));
>>>>>>
>>>>>> that should be always true.
>>>>>>
>>>>>> +  gsi_remove (&gsi, false);
>>>>>> +  bb = guard->dest;
>>>>>> +  remove_edge (guard);
>>>>>> +  /* Update dominance for destination of GUARD.  */
>>>>>> +  if (EDGE_COUNT (bb->preds) == 0)
>>>>>> +    {
>>>>>> +      basic_block s_bb;
>>>>>> +      gcc_assert (single_succ_p (bb));
>>>>>> +      s_bb = single_succ (bb);
>>>>>> +      delete_basic_block (bb);
>>>>>> +      if (single_pred_p (s_bb))
>>>>>> +       set_immediate_dominator (CDI_DOMINATORS, s_bb, single_pred (s_bb));
>>>>>>
>>>>>> all this massaging should be simplified by leaving it to CFG cleanup by
>>>>>> simply adjusting the CONDs condition to always true/false.  There is
>>>>>> gimple_cond_make_{true,false} () for this (would be nice to have a variant
>>>>>> taking a bool).
>>>>>>
>>>>>> +  new_edge = make_edge (pre_header, exit->dest, flags);
>>>>>> +  if (fix_dom_of_exit)
>>>>>> +    set_immediate_dominator (CDI_DOMINATORS, exit->dest, pre_header);
>>>>>> +  update_stmt (gsi_stmt (gsi));
>>>>>>
>>>>>> the update_stmt should be not necessary, it's done by gsi_insert_after already.
>>>>>>
>>>>>> +  /* Add NEW_ADGE argument for all phi in post-header block.  */
>>>>>> +  bb = exit->dest;
>>>>>> +  for (gphi_iterator gsi = gsi_start_phis (bb);
>>>>>> +       !gsi_end_p (gsi); gsi_next (&gsi))
>>>>>> +    {
>>>>>> +      gphi *phi = gsi.phi ();
>>>>>> +      /* edge_iterator ei; */
>>>>>> +      tree arg;
>>>>>> +      if (virtual_operand_p (gimple_phi_result (phi)))
>>>>>> +       {
>>>>>> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
>>>>>> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>>>>>> +       }
>>>>>> +      else
>>>>>> +       {
>>>>>> +         /* Use exit edge argument.  */
>>>>>> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
>>>>>> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>>>>>>
>>>>>> Hum.  How is it ok to use the exit edge argument for the edge that skips
>>>>>> the loop?  Why can't you always use the pre-header edge value?
>>>>>> That is, if we have
>>>>>>
>>>>>>  for(i=0;i<m;++i)
>>>>>>    {
>>>>>>      if (n > 0)
>>>>>>     {
>>>>>>      for (;;)
>>>>>>        {
>>>>>>        }
>>>>>>      }
>>>>>>    }
>>>>>>   ... = i;
>>>>>>
>>>>>> then i is used after the loop and the correct value to use if
>>>>>> n > 0 is false is '0'.  Maybe this way we can also relax
>>>>>> what check_exit_phi does?  IMHO the only restriction is
>>>>>> if sth defined inside the loop before the header check for
>>>>>> the inner loop is used after the loop.
>>>>>>
>>>>>> Thanks,
>>>>>> Richard.
>>>>>>
>>>>>>> Thanks.
>>>>>>>
>>>>>>> ChangeLog:
>>>>>>> 2015-09-30  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>>>
>>>>>>> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
>>>>>>> "cfghooks.h", add prototypes for introduced new functions.
>>>>>>> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
>>>>>>> checks on ability of loop unswitching to tree_unswitch_single_loop;
>>>>>>> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
>>>>>>> on innermost loop check.
>>>>>>> (tree_unswitch_single_loop): Add all required checks on ability of
>>>>>>> loop unswitching under zero recursive level guard.
>>>>>>> (tree_unswitch_outer_loop): New function.
>>>>>>> (find_loop_guard): Likewise.
>>>>>>> (empty_bb_without_guard_p): Likewise.
>>>>>>> (used_outside_loop_p): Likewise.
>>>>>>> (hoist_guard): Likewise.
>>>>>>> (check_exit_phi): Likewise.
>>>>>>>
>>>>>>>    gcc/testsuite/ChangeLog:
>>>>>>> * gcc.dg/loop-unswitch-2.c: New test.
>>>>>>>
>>>>>>> 2015-09-16 11:26 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>> Yeah, as said, the patch wasn't fully ready and it also felt odd to do
>>>>>>>> this hoisting in loop header copying.  Integrating it
>>>>>>>> with LIM would be a better fit eventually.
>>>>>>>>
>>>>>>>> Note that we did agree to go forward with your original patch just
>>>>>>>> making it more "generically" perform outer loop
>>>>>>>> unswitching.  Did you explore that idea further?
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On Tue, Sep 15, 2015 at 6:00 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>> Thanks Richard.
>>>>>>>>>
>>>>>>>>> I found one more issue that could not be fixed simply. In 23855 you
>>>>>>>>> consider the following test-case:
>>>>>>>>> void foo(int *ie, int *je, double *x)
>>>>>>>>> {
>>>>>>>>>   int i, j;
>>>>>>>>>   for (j=0; j<*je; ++j)
>>>>>>>>>     for (i=0; i<*ie; ++i)
>>>>>>>>>       x[i+j] = 0.0;
>>>>>>>>> }
>>>>>>>>> and proposed to hoist up a check on *ie out of loop. It requires
>>>>>>>>> memref alias analysis since in general x and ie can alias (if their
>>>>>>>>> types are compatible - int *ie & int * x). Such analysis is performed
>>>>>>>>> by pre or lim passes. Without such analysis we can not hoist a test on
>>>>>>>>> non-zero for *ie out of loop using 238565 patch.
>>>>>>>>>  The second concern is that proposed copy header algorithm changes
>>>>>>>>> loop structure significantly and it is not accepted by vectorizer
>>>>>>>>> since latch is not empty (such transformation assumes loop peeling for
>>>>>>>>> one iteration. So I can propose to implement simple guard hoisting
>>>>>>>>> without copying header and tail blocks (if it is possible).
>>>>>>>>>
>>>>>>>>> I will appreciate you for any advice or help since without such
>>>>>>>>> hoisting we are not able to perform outer loop vectorization for
>>>>>>>>> important benchmark.
>>>>>>>>> and
>>>>>>>>>
>>>>>>>>> 2015-09-15 14:22 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>> On Thu, Sep 3, 2015 at 6:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>
>>>>>>>>>>> I started learning, tuning and debugging patch proposed in 23855 and
>>>>>>>>>>> discovered thta it does not work properly.
>>>>>>>>>>> So I wonder is it tested patch and it should work?
>>>>>>>>>>
>>>>>>>>>> I don't remember, but as it wasn't committed it certainly wasn't ready.
>>>>>>>>>>
>>>>>>>>>>> Should it accept for hoisting the following loop nest
>>>>>>>>>>>   for (i=0; i<n; i++) {
>>>>>>>>>>>     s = 0;
>>>>>>>>>>>     for (j=0; j<m; j++)
>>>>>>>>>>>       s += a[i] * b[j];
>>>>>>>>>>>     c[i] = s;
>>>>>>>>>>>   }
>>>>>>>>>>> Note that i-loop will nit be empty if m is equal to 0.
>>>>>>>>>>
>>>>>>>>>> if m is equal to 0 then we still have the c[i] = s store, no?  Of course
>>>>>>>>>> we could unswitch the outer loop on m == 0 but simple hoisting wouldn't work.
>>>>>>>>>>
>>>>>>>>>> Richard.
>>>>>>>>>>
>>>>>>>>>>> 2015-08-03 10:27 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>>> On Fri, Jul 31, 2015 at 1:17 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>>>
>>>>>>>>>>>>> I learned your updated patch for 23825 and it is more general in
>>>>>>>>>>>>> comparison with my.
>>>>>>>>>>>>> I'd like to propose you a compromise - let's consider my patch only
>>>>>>>>>>>>> for force-vectorize outer loop only to allow outer-loop
>>>>>>>>>>>>> vecctorization.
>>>>>>>>>>>>
>>>>>>>>>>>> I don't see why we should special-case that if the approach in 23825
>>>>>>>>>>>> is sensible.
>>>>>>>>>>>>
>>>>>>>>>>>>> Note that your approach will not hoist invariant
>>>>>>>>>>>>> guards if loops contains something else except for inner-loop, i.e. it
>>>>>>>>>>>>> won't be empty for taken branch.
>>>>>>>>>>>>
>>>>>>>>>>>> Yes, it does not perform unswitching but guard hoisting.  Note that this
>>>>>>>>>>>> is originally Zdenek Dvoraks patch.
>>>>>>>>>>>>
>>>>>>>>>>>>> I also would like to answer on your last question - CFG cleanup is
>>>>>>>>>>>>> invoked to perform deletion of single-argument phi nodes from tail
>>>>>>>>>>>>> block through substitution - such phi's prevent outer-loop
>>>>>>>>>>>>> vectorization. But it is clear that such transformation can be done
>>>>>>>>>>>>> other pass.
>>>>>>>>>>>>
>>>>>>>>>>>> Hmm, I wonder why the copy_prop pass after unswitching does not
>>>>>>>>>>>> get rid of them?
>>>>>>>>>>>>
>>>>>>>>>>>>> What is your opinion?
>>>>>>>>>>>>
>>>>>>>>>>>> My opinion is that if we want to enhance unswitching to catch this
>>>>>>>>>>>> (or similar) cases then we should make it a lot more general than
>>>>>>>>>>>> your pattern-matching approach.  I see nothing that should prevent
>>>>>>>>>>>> us from considering unswitching non-innermost loops in general.
>>>>>>>>>>>> It should be only a cost consideration to not do non-innermost loop
>>>>>>>>>>>> unswitching (in addition to maybe a --param specifying the maximum
>>>>>>>>>>>> depth of a loop nest to unswitch).
>>>>>>>>>>>>
>>>>>>>>>>>> So my first thought when seeing your patch still holds - the patch
>>>>>>>>>>>> looks very much too specific.
>>>>>>>>>>>>
>>>>>>>>>>>> Richard.
>>>>>>>>>>>>
>>>>>>>>>>>>> Yuri.
>>>>>>>>>>>>>
>>>>>>>>>>>>> 2015-07-28 13:50 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>>>>> On Thu, Jul 23, 2015 at 4:45 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I checked that both test-cases from 23855 are sucessfully unswitched
>>>>>>>>>>>>>>> by proposed patch. I understand that it does not catch deeper loop
>>>>>>>>>>>>>>> nest as
>>>>>>>>>>>>>>>    for (i=0; i<10; i++)
>>>>>>>>>>>>>>>      for (j=0;j<n;j++)
>>>>>>>>>>>>>>>         for (k=0;k<20;k++)
>>>>>>>>>>>>>>>   ...
>>>>>>>>>>>>>>> but duplication of middle-loop does not look reasonable.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Here is dump for your second test-case:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> void foo(int *ie, int *je, double *x)
>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>   int i, j;
>>>>>>>>>>>>>>>   for (j=0; j<*je; ++j)
>>>>>>>>>>>>>>>     for (i=0; i<*ie; ++i)
>>>>>>>>>>>>>>>       x[i+j] = 0.0;
>>>>>>>>>>>>>>> }
>>>>>>>>>>>>>>> grep -i unswitch t6.c.119t.unswitch
>>>>>>>>>>>>>>> ;; Unswitching outer loop
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I was saying that why go with a limited approach when a patch (in
>>>>>>>>>>>>>> unknown state...)
>>>>>>>>>>>>>> is available that does it more generally?  Also unswitching is quite
>>>>>>>>>>>>>> expensive compared
>>>>>>>>>>>>>> to "moving" the invariant condition.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> In your patch:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> +  if (!nloop->force_vectorize)
>>>>>>>>>>>>>> +    nloop->force_vectorize = true;
>>>>>>>>>>>>>> +  if (loop->safelen != 0)
>>>>>>>>>>>>>> +    nloop->safelen = loop->safelen;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I see no guard on force_vectorize so = true looks bogus here.  Please just use
>>>>>>>>>>>>>> copy_loop_info.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> +  if (integer_nonzerop (cond_new))
>>>>>>>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_true_node);
>>>>>>>>>>>>>> +  else if (integer_zerop (cond_new))
>>>>>>>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_false_node);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> gimple_cond_make_true/false (cond_stmt);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> btw, seems odd that we have to recompute which loop is the true / false variant
>>>>>>>>>>>>>> when we just fed a guard condition to loop_version.  Can't we statically
>>>>>>>>>>>>>> determine whether loop or nloop has the in-loop condition true or false?
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> +  /* Clean-up cfg to remove useless one-argument phi in exit block of
>>>>>>>>>>>>>> +     outer-loop.  */
>>>>>>>>>>>>>> +  cleanup_tree_cfg ();
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I know unswitching is already O(number-of-unswitched-loops * size-of-function)
>>>>>>>>>>>>>> because it updates SSA form after each individual unswitching (and it does that
>>>>>>>>>>>>>> because it invokes itself recursively on unswitched loops).  But do you really
>>>>>>>>>>>>>> need to invoke CFG cleanup here?
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Richard.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Yuri.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> 2015-07-14 14:06 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>>>>>>> On Fri, Jul 10, 2015 at 12:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>>>>>>> Hi All,
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Here is presented simple transformation which tries to hoist out of
>>>>>>>>>>>>>>>>> outer-loop a check on zero trip count for inner-loop. This is very
>>>>>>>>>>>>>>>>> restricted transformation since it accepts outer-loops with very
>>>>>>>>>>>>>>>>> simple cfg, as for example:
>>>>>>>>>>>>>>>>>     acc = 0;
>>>>>>>>>>>>>>>>>    for (i = 1; i <= m; i++) {
>>>>>>>>>>>>>>>>>       for (j = 0; j < n; j++)
>>>>>>>>>>>>>>>>>          if (l[j] == i) { v[j] = acc; acc++; };
>>>>>>>>>>>>>>>>>       acc <<= 1;
>>>>>>>>>>>>>>>>>    }
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Note that degenerative outer loop (without inner loop) will be
>>>>>>>>>>>>>>>>> completely deleted as dead code.
>>>>>>>>>>>>>>>>> The main goal of this transformation was to convert outer-loop to form
>>>>>>>>>>>>>>>>> accepted by outer-loop vectorization (such test-case is also included
>>>>>>>>>>>>>>>>> to patch).
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Is it OK for trunk?
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I think this is
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=23855
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> as well.  It has a patch adding a invariant loop guard hoisting
>>>>>>>>>>>>>>>> phase to loop-header copying.  Yeah, it needs updating to
>>>>>>>>>>>>>>>> trunk again I suppose.  It's always non-stage1 when I come
>>>>>>>>>>>>>>>> back to that patch.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Your patch seems to be very specific and only handles outer
>>>>>>>>>>>>>>>> loops of innermost loops.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Richard.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> ChangeLog:
>>>>>>>>>>>>>>>>> 2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> * tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
>>>>>>>>>>>>>>>>> "gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
>>>>>>>>>>>>>>>>> (tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
>>>>>>>>>>>>>>>>> (tree_unswitch_outer_loop): New function.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> gcc/testsuite/ChangeLog:
>>>>>>>>>>>>>>>>> * gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
>>>>>>>>>>>>>>>>> * gcc.dg/vect/vect-outer-simd-3.c: New test.

[-- Attachment #2: patch.fixed --]
[-- Type: application/octet-stream, Size: 17376 bytes --]

diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-2.c b/gcc/testsuite/gcc.dg/loop-unswitch-2.c
new file mode 100644
index 0000000..5ebf608
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details" } */
+
+void foo (float **a, float **b, float *c, int n, int m, int l)
+{
+  int i,j,k;
+  float s;
+  for (i=0; i<l; i++)
+    for (j=0; j<n; j++)
+      for (k=0; k<m; k++)
+	c[i] += a[i][k] * b[k][j];
+}
+
+/* { dg-final { scan-tree-dump-times "guard hoisted" 2 "unswitch" } } */
+
diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-3.c b/gcc/testsuite/gcc.dg/loop-unswitch-3.c
new file mode 100644
index 0000000..e355286
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-3.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details" } */
+
+#include <stdlib.h>
+#define N 32
+float *foo(int ustride, int size, float *src)
+{
+   float *buffer, *p;
+   int i, k;
+
+   if (!src)
+    return NULL;
+
+   buffer = (float *) malloc(N * size * sizeof(float));
+
+   if(buffer)
+      for(i=0, p=buffer; i<N; i++, src+=ustride)
+	for(k=0; k<size; k++)
+	  *p++ = src[k];
+
+   return buffer;
+}
+
+/* { dg-final { scan-tree-dump-times "guard hoisted" 1 "unswitch" } } */
+
+
diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-4.c b/gcc/testsuite/gcc.dg/loop-unswitch-4.c
new file mode 100644
index 0000000..320a1cd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-4.c
@@ -0,0 +1,52 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -funswitch-loops" } */
+
+#include <stdlib.h>
+__attribute__ ((noinline))
+void foo (float **a, float **b, float *c, int n, int m, int l)
+{
+  int i,j,k;
+  float s;
+  for (i=0; i<l; i++)
+    for (j=0; j<n; j++)
+      for (k=0; k<m; k++)
+	c[i] += a[i][k] * b[k][j];
+}
+
+int main()
+{
+  const int N = 32;
+  float **ar1, **ar2;
+  float *res;
+  int i, j;
+  ar1 = (float **)malloc (N * sizeof (float*));
+  ar2 = (float **)malloc (N * sizeof (float*));
+  res = (float *)malloc( N * sizeof (float));
+  for (i=0; i<N; i++)
+    {
+      ar1[i] = (float*)malloc (N * sizeof (float));
+      ar2[i] = (float*)malloc (N * sizeof (float));
+    }
+  for (i=0; i<N; i++)
+    {
+      for (j=0; j<N; j++)
+	{
+	  ar1[i][j] = 2.0f;
+	  ar2[i][j] = 1.5f;
+	}
+      res[i] = 0.0f;
+    }
+  foo (ar1, ar2, res, N, N, N);
+  for (i=0; i<N; i++)
+    if (res[i] != 3072.0f)
+      abort();
+  for (i=0; i<N; i++)
+    res[i] = 0.0f;
+  foo (ar1, ar2, res, N, 0, N);
+  for (i=0; i<N; i++)
+    if (res[i] != 0.0f)
+      abort();
+ 
+  return 0;
+}
+
diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
index 0b54612..4328d6a 100644
--- a/gcc/tree-ssa-loop-unswitch.c
+++ b/gcc/tree-ssa-loop-unswitch.c
@@ -39,6 +39,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "tree-pass.h"
 #include "tree-inline.h"
+#include "gimple-iterator.h"
+#include "cfghooks.h"
 
 /* This file implements the loop unswitching, i.e. transformation of loops like
 
@@ -79,6 +81,13 @@ along with GCC; see the file COPYING3.  If not see
 static struct loop *tree_unswitch_loop (struct loop *, basic_block, tree);
 static bool tree_unswitch_single_loop (struct loop *, int);
 static tree tree_may_unswitch_on (basic_block, struct loop *);
+static bool tree_unswitch_outer_loop (struct loop *);
+static edge find_loop_guard (struct loop *);
+static bool empty_bb_without_guard_p (struct loop *, basic_block);
+static bool used_outside_loop_p (struct loop *, tree);
+static void hoist_guard (struct loop *, edge);
+static bool check_exit_phi (struct loop *);
+static tree get_vop_from_header (struct loop *);
 
 /* Main entry point.  Perform loop unswitching on all suitable loops.  */
 
@@ -87,42 +96,15 @@ tree_ssa_unswitch_loops (void)
 {
   struct loop *loop;
   bool changed = false;
-  HOST_WIDE_INT iterations;
 
-  /* Go through inner loops (only original ones).  */
-  FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
+  /* Go through all loops starting from innermost.  */
+  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
     {
-      if (dump_file && (dump_flags & TDF_DETAILS))
-        fprintf (dump_file, ";; Considering loop %d\n", loop->num);
-
-      /* Do not unswitch in cold regions. */
-      if (optimize_loop_for_size_p (loop))
-        {
-          if (dump_file && (dump_flags & TDF_DETAILS))
-            fprintf (dump_file, ";; Not unswitching cold loops\n");
-          continue;
-        }
-
-      /* The loop should not be too large, to limit code growth. */
-      if (tree_num_loop_insns (loop, &eni_size_weights)
-          > (unsigned) PARAM_VALUE (PARAM_MAX_UNSWITCH_INSNS))
-        {
-          if (dump_file && (dump_flags & TDF_DETAILS))
-            fprintf (dump_file, ";; Not unswitching, loop too big\n");
-          continue;
-        }
-
-      /* If the loop is not expected to iterate, there is no need
-	 for unswitching.  */
-      iterations = estimated_loop_iterations_int (loop);
-      if (iterations >= 0 && iterations <= 1)
-	{
-          if (dump_file && (dump_flags & TDF_DETAILS))
-            fprintf (dump_file, ";; Not unswitching, loop is not expected to iterate\n");
-          continue;
-	}
-
-      changed |= tree_unswitch_single_loop (loop, 0);
+      if (!loop->inner)
+	/* Unswitch innermost loop.  */
+	changed |= tree_unswitch_single_loop (loop, 0);
+      else
+	changed |= tree_unswitch_outer_loop (loop);
     }
 
   if (changed)
@@ -216,6 +198,39 @@ tree_unswitch_single_loop (struct loop *loop, int num)
   tree cond = NULL_TREE;
   gimple *stmt;
   bool changed = false;
+  HOST_WIDE_INT iterations;
+
+  /* Perform initial tests if unswitch is eligible.  */
+  if (num == 0)
+    {
+      /* Do not unswitch in cold regions. */
+      if (optimize_loop_for_size_p (loop))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, ";; Not unswitching cold loops\n");
+	  return false;
+	}
+
+      /* The loop should not be too large, to limit code growth. */
+      if (tree_num_loop_insns (loop, &eni_size_weights)
+	  > (unsigned) PARAM_VALUE (PARAM_MAX_UNSWITCH_INSNS))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, ";; Not unswitching, loop too big\n");
+	  return false;
+	}
+
+      /* If the loop is not expected to iterate, there is no need
+	 for unswitching.  */
+      iterations = estimated_loop_iterations_int (loop);
+      if (iterations >= 0 && iterations <= 1)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, ";; Not unswitching, loop is not expected"
+		     " to iterate\n");
+	  return false;
+	}
+    }
 
   i = 0;
   bbs = get_loop_body (loop);
@@ -403,6 +418,374 @@ tree_unswitch_loop (struct loop *loop,
 		       REG_BR_PROB_BASE - prob_true, false);
 }
 
+/* Unswitch outer loops by hoisting invariant guard on
+   inner loop without code duplication.  */
+static bool
+tree_unswitch_outer_loop (struct loop *loop)
+{
+  edge exit, guard;
+  HOST_WIDE_INT iterations;
+
+  gcc_assert (loop->inner);
+  if (loop->inner->next)
+    return false;
+  /* Accept loops with single exit only.  */
+  exit = single_exit (loop);
+  if (!exit)
+    return false;
+  /* Check that phi argument of exit edge is not defined inside loop.  */
+  if (!check_exit_phi (loop))
+    return false;
+  /* If the loop is not expected to iterate, there is no need
+      for unswitching.  */
+  iterations = estimated_loop_iterations_int (loop);
+  if (iterations >= 0 && iterations <= 1)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, ";; Not unswitching, loop is not expected"
+		 " to iterate\n");
+	return false;
+    }
+
+  guard = find_loop_guard (loop);
+  if (guard)
+    {
+      hoist_guard (loop, guard);
+      update_ssa (TODO_update_ssa);
+      return true;
+    }
+  return false;
+}
+
+/* Checks if the body of the LOOP is within an invariant guard.  If this
+   is the case, returns the edge that jumps over the real body of the loop,
+   otherwise returns NULL.  */
+
+static edge
+find_loop_guard (struct loop *loop)
+{
+  basic_block header = loop->header;
+  edge guard_edge, te, fe;
+  /* bitmap processed, known_invariants;*/
+  basic_block *body = NULL;
+  unsigned i;
+  tree use;
+  ssa_op_iter iter;
+
+  /* We check for the following situation:
+
+     while (1)
+       {
+	 [header]]
+         loop_phi_nodes;
+	 something1;
+	 if (cond1)
+	   body;
+	 nvar = phi(orig, bvar) ... for all variables changed in body;
+	 [guard_end]
+	 something2;
+	 if (cond2)
+	   break;
+	 something3;
+       }
+
+     where:
+
+     1) cond1 is loop invariant
+     2) If cond1 is false, then the loop is essentially empty; i.e.,
+	a) nothing in something1, something2 and something3 has side
+	   effects
+	b) anything defined in something1, something2 and something3
+	   is not used outside of the loop.  */
+
+  while (single_succ_p (header))
+    header = single_succ (header);
+  if (!last_stmt (header)
+      || gimple_code (last_stmt (header)) != GIMPLE_COND)
+    return NULL;
+
+  extract_true_false_edges_from_block (header, &te, &fe);
+  if (!flow_bb_inside_loop_p (loop, te->dest)
+      || !flow_bb_inside_loop_p (loop, fe->dest))
+    return NULL;
+
+  if (just_once_each_iteration_p (loop, te->dest)
+      || (single_succ_p (te->dest)
+	  && just_once_each_iteration_p (loop, single_succ (te->dest))))
+    {
+      if (just_once_each_iteration_p (loop, fe->dest))
+	return NULL;
+      guard_edge = te;
+    }
+  else if (just_once_each_iteration_p (loop, fe->dest)
+	   || (single_succ_p (fe->dest)
+	       && just_once_each_iteration_p (loop, single_succ (fe->dest))))
+    guard_edge = fe;
+  else
+    return NULL;
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file,
+	     "Considering guard %d -> %d in loop %d\n",
+	     guard_edge->src->index, guard_edge->dest->index, loop->num);
+  /* Check if condition operands do not have definitions inside loop since
+     any bb copying is not performed.  */
+  FOR_EACH_SSA_TREE_OPERAND (use, last_stmt (header), iter, SSA_OP_USE)
+    {
+      gimple *def = SSA_NAME_DEF_STMT (use);
+      basic_block def_bb = gimple_bb (def);
+      if (def_bb
+          && flow_bb_inside_loop_p (loop, def_bb))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "  guard operands have definitions"
+				" inside loop\n");
+	  return NULL;
+	}
+    }
+
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      basic_block bb = body[i];
+      if (bb->loop_father != loop)
+	continue;
+      if (bb->flags & BB_IRREDUCIBLE_LOOP)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "Block %d is marked as irreducible in loop\n",
+		      bb->index);
+	  guard_edge = NULL;
+	  goto end;
+	}
+      if (!empty_bb_without_guard_p (loop, bb))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "  block %d has side effects\n", bb->index);
+	  guard_edge = NULL;
+	  goto end;
+	}
+    }
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "  suitable to hoist\n");
+end:
+  if (body)
+    free (body);
+  return guard_edge;
+}
+
+/* Returns true if
+   1) no statement in BB has side effects
+   2) assuming that edge GUARD is always taken, all definitions in BB
+      are noy used outside of the loop.
+   KNOWN_INVARIANTS is a set of ssa names we know to be invariant, and
+   PROCESSED is a set of ssa names for that we already tested whether they
+   are invariant or not.  */
+
+static bool
+empty_bb_without_guard_p (struct loop *loop, basic_block bb)
+{
+  basic_block exit_bb = single_exit (loop)->src;
+  bool may_be_used_outside = (bb == exit_bb
+			      || !dominated_by_p (CDI_DOMINATORS, bb, exit_bb));
+  tree name;
+  ssa_op_iter op_iter;
+
+  /* Phi nodes do not have side effects, but their results might be used
+     outside of the loop.  */
+  if (may_be_used_outside)
+    {
+      for (gphi_iterator gsi = gsi_start_phis (bb);
+	   !gsi_end_p (gsi); gsi_next (&gsi))
+	{
+	  gphi *phi = gsi.phi ();
+	  name = PHI_RESULT (phi);
+	  if (virtual_operand_p (name))
+	    continue;
+
+	  if (used_outside_loop_p (loop, name))
+	    return false;
+	}
+    }
+
+  for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+      if (gimple_has_side_effects (stmt))
+	return false;
+
+      if (gimple_vdef(stmt))
+	return false;
+
+      FOR_EACH_SSA_TREE_OPERAND (name, stmt, op_iter, SSA_OP_DEF)
+	{
+	  if (may_be_used_outside
+	      && used_outside_loop_p (loop, name))
+	    return false;
+	}
+    }
+  return true;
+}
+
+/* Return true if NAME is used outside of LOOP.  */
+
+static bool
+used_outside_loop_p (struct loop *loop, tree name)
+{
+  imm_use_iterator it;
+  use_operand_p use;
+
+  FOR_EACH_IMM_USE_FAST (use, it, name)
+    {
+      gimple *stmt = USE_STMT (use);
+      if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
+	return true;
+    }
+
+  return false;
+}
+
+/* Return argument for loop preheader edge in header virtual phi if any.  */
+
+static tree
+get_vop_from_header (struct loop *loop)
+{
+  for (gphi_iterator gsi = gsi_start_phis (loop->header);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      if (!virtual_operand_p (gimple_phi_result (phi)))
+	continue;
+      return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
+    }
+  return NULL_TREE;
+}
+
+/* Move the check of GUARD outside of LOOP.  */
+
+static void
+hoist_guard (struct loop *loop, edge guard)
+{
+  edge exit = single_exit (loop);
+  edge preh = loop_preheader_edge (loop);
+  basic_block pre_header = preh->src;
+  basic_block bb;
+  edge te, fe, e, new_edge;
+  gimple *stmt;
+  basic_block guard_bb = guard->src;
+  gimple_stmt_iterator gsi;
+  int flags = 0;
+  bool fix_dom_of_exit;
+  gcond *cond_stmt, *new_cond_stmt;
+
+  bb = get_immediate_dominator (CDI_DOMINATORS, exit->dest);
+  fix_dom_of_exit = flow_bb_inside_loop_p (loop, bb);
+  gsi = gsi_last_bb (guard_bb);
+  stmt = gsi_stmt (gsi);
+  gcc_assert (gimple_code (stmt) == GIMPLE_COND);
+  cond_stmt = as_a <gcond *> (stmt);
+  extract_true_false_edges_from_block (guard_bb, &te, &fe);
+  /* Insert guard to PRE_HEADER.  */
+  if (!empty_block_p (pre_header))
+    gsi = gsi_last_bb (pre_header);
+  else
+    gsi = gsi_start_bb (pre_header);
+  /* Create copy of COND_STMT.  */
+  new_cond_stmt = gimple_build_cond (gimple_cond_code (cond_stmt),
+				     gimple_cond_lhs (cond_stmt),
+				     gimple_cond_rhs (cond_stmt),
+				     NULL_TREE, NULL_TREE);
+  gsi_insert_after (&gsi, new_cond_stmt, GSI_NEW_STMT);
+  /* Convert COND_STMT to true/false conditional.  */
+  if (guard == te)
+    gimple_cond_make_false (cond_stmt);
+  else
+    gimple_cond_make_true (cond_stmt);
+  update_stmt (cond_stmt);
+  /* Create new loop pre-header.  */
+  e = split_block (pre_header, last_stmt (pre_header));
+  gcc_assert (loop_preheader_edge (loop)->src == e->dest);
+  if (guard == fe)
+    {
+      e->flags = EDGE_TRUE_VALUE;
+      flags |= EDGE_FALSE_VALUE;
+    }
+  else
+    {
+      e->flags = EDGE_FALSE_VALUE;
+      flags |= EDGE_TRUE_VALUE;
+    }
+  new_edge = make_edge (pre_header, exit->dest, flags);
+  if (fix_dom_of_exit)
+    set_immediate_dominator (CDI_DOMINATORS, exit->dest, pre_header);
+  /* Add NEW_ADGE argument for all phi in post-header block.  */
+  bb = exit->dest;
+  for (gphi_iterator gsi = gsi_start_phis (bb);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      tree arg;
+      if (virtual_operand_p (gimple_phi_result (phi)))
+	{
+	  arg = get_vop_from_header (loop);
+	  if (arg == NULL_TREE)
+	    /* Use exit edge argument.  */
+	    arg =  PHI_ARG_DEF_FROM_EDGE (phi, exit);
+	  add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
+	}
+      else
+	{
+	  /* Use exit edge argument.  */
+	  arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+	  add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
+	}
+    }
+
+  mark_virtual_operands_for_renaming (cfun);
+  update_ssa (TODO_update_ssa);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "  guard hoisted.\n");
+}
+
+/* Return true if phi argument for exit edge can be used
+   for edge around loop.  */
+
+static bool
+check_exit_phi (struct loop *loop)
+{
+  edge exit = single_exit (loop);
+  basic_block pre_header = loop_preheader_edge (loop)->src;
+
+  for (gphi_iterator gsi = gsi_start_phis (exit->dest);
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      tree arg;
+      gimple *def;
+      basic_block def_bb;
+      if (virtual_operand_p (gimple_phi_result (phi)))
+	continue;
+      arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+      if (TREE_CODE (arg) != SSA_NAME)
+	continue;
+      def = SSA_NAME_DEF_STMT (arg);
+      if (!def)
+	continue;
+      def_bb = gimple_bb (def);
+      if (!def_bb)
+	continue;
+      if (!dominated_by_p (CDI_DOMINATORS, pre_header, def_bb))
+	/* Definition inside loop!  */
+	return false;
+      /* Check loop closed phi invariant.  */
+      if (!flow_bb_inside_loop_p (def_bb->loop_father, pre_header))
+	return false;
+    }
+  return true;
+}
+
 /* Loop unswitching pass.  */
 
 namespace {

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-10-07 15:26                                 ` Yuri Rumyantsev
@ 2015-10-08 12:31                                   ` Richard Biener
  0 siblings, 0 replies; 17+ messages in thread
From: Richard Biener @ 2015-10-08 12:31 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: gcc-patches, Igor Zamyatin

On Wed, Oct 7, 2015 at 5:26 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Richard,
>
> I noticed that 'gimple' type was changed and send you updated patch.

Ok.

Thanks,
Richard.

> Thanks.
> Yuri.
>
> 2015-10-07 12:53 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>> Richard,
>>
>> I've fixed adding virtual phi argument and add check on irreducible basic block.
>> New patch is attached.
>>
>> I checked it for bootstrap and regression testing, no new failures.
>>
>> ChangeLog:
>> 2015-10-07  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>
>> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
>> "cfghooks.h", add prototypes for introduced new functions.
>> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
>> checks on ability of loop unswitching to tree_unswitch_single_loop;
>> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
>> on innermost loop check.
>> (tree_unswitch_single_loop): Add all required checks on ability of
>> loop unswitching under zero recursive level guard.
>> (tree_unswitch_outer_loop): New function.
>> (find_loop_guard): Likewise.
>> (empty_bb_without_guard_p): Likewise.
>> (used_outside_loop_p): Likewise.
>> (get_vop_from_header): Likewise.
>> (hoist_guard): Likewise.
>> (check_exit_phi): Likewise.
>>
>>    gcc/testsuite/ChangeLog:
>> * gcc.dg/loop-unswitch-2.c: New test.
>> * gcc.dg/loop-unswitch-3.c: Likewise.
>> * gcc.dg/loop-unswitch-4.c: Likewise.
>>
>>
>> 2015-10-06 15:21 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>> On Tue, Oct 6, 2015 at 1:41 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>> Richard,
>>>>
>>>> Here is updated patch which reflects almost all your remarks:
>>>> 1. Use ordinary get_loop_body.
>>>> 2. Delete useless asserts.
>>>> 3. Use check on iterated loop instead of finite_loop_p.
>>>> 4. Do not update CFG by adjusting the CONDs condition to always true/false.
>>>> 5. Add couple tests.
>>>
>>> +  /* Add NEW_ADGE argument for all phi in post-header block.  */
>>> +  bb = exit->dest;
>>> +  for (gphi_iterator gsi = gsi_start_phis (bb);
>>> +       !gsi_end_p (gsi); gsi_next (&gsi))
>>> +    {
>>> +      gphi *phi = gsi.phi ();
>>> +      /* edge_iterator ei; */
>>> +      tree arg;
>>> +      if (virtual_operand_p (gimple_phi_result (phi)))
>>> +       {
>>> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
>>> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>>>
>>> now I know what confused me - here you are looking at a loop exit PHI
>>> but querying with the preheader edge index.  I think you need to walk
>>> the loop header PHIs to find the PHI for the virtual operand and use that
>>> to get the PHI arg from?
>>>
>>> The side-effect / used-outside code is still the same.  What matters
>>> is side-effects outside of the loop-header protected code region, not
>>> blocks excluding the inner loop.  Say,
>>>
>>>   for (;;)
>>>     {
>>>       if (invariant-guard)
>>>         {
>>>            printf ("Blah");
>>>            for (;;)
>>>              ;
>>>         }
>>>     }
>>>
>>> would still ok to be unswitched.  So instead of
>>>
>>> +      if (body[i]->loop_father != loop)
>>> +       continue;
>>>
>>> it would be
>>>
>>>        if (dominated_by_p (CDI_DOMINATORS, body[i], header)
>>>            && !dominated_by_p (CDI_DOMINATORS, body[i], fe->dest))
>>>
>>> with the obvious improvement to the patch to not only consider header checks
>>> in the outer loop header but in the pre-header block of the inner loop.
>>>
>>> And I still think you should walk the exit PHIs args to see whether they
>>> are defined in the non-guarded region of the outer loop instead of walking
>>> all uses of all defs.
>>>
>>> Note that I think you miss endless loops as side-effects if that endless
>>> loop occurs through a irreducible region (thus not reflected in the
>>> loop tree).  Thus you should reject BB_IRREDUCIBLE_LOOP blocks
>>> in the non-guarded region as well.
>>>
>>> It seems to me that protecting adjacent loops with a single guard is
>>> also eligible for hoisting thus the restriction on loop->inner->next
>>> should become a restriction on no loops (or irreducible regions)
>>> in the non-guarded region.
>>>
>>> Most things can be improved as followup, but at least the
>>> virtual PHI arg thing needs to be sorted out.
>>>
>>> Thanks,
>>> Richard.
>>>
>>>
>>>> ChangeLog:
>>>> 2015-10-06  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>
>>>> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
>>>> "cfghooks.h", add prototypes for introduced new functions.
>>>> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
>>>> checks on ability of loop unswitching to tree_unswitch_single_loop;
>>>> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
>>>> on innermost loop check.
>>>> (tree_unswitch_single_loop): Add all required checks on ability of
>>>> loop unswitching under zero recursive level guard.
>>>> (tree_unswitch_outer_loop): New function.
>>>> (find_loop_guard): Likewise.
>>>> (empty_bb_without_guard_p): Likewise.
>>>> (used_outside_loop_p): Likewise.
>>>> (hoist_guard): Likewise.
>>>> (check_exit_phi): Likewise.
>>>>
>>>>    gcc/testsuite/ChangeLog:
>>>> * gcc.dg/loop-unswitch-2.c: New test.
>>>> * gcc.dg/loop-unswitch-3.c: Likewise.
>>>> * gcc.dg/loop-unswitch-4.c: Likewise.
>>>>
>>>> 2015-10-06 10:59 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>> On Mon, Oct 5, 2015 at 3:13 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>> Thanks Richard.
>>>>>> I'd like to answer on your last comment related to using of exit edge
>>>>>> argument for edge that skips loop.
>>>>>> Let's consider the following test-case:
>>>>>>
>>>>>> #include <stdlib.h>
>>>>>> #define N 32
>>>>>> float *foo(int ustride, int size, float *src)
>>>>>> {
>>>>>>    float *buffer, *p;
>>>>>>    int i, k;
>>>>>>
>>>>>>    if (!src)
>>>>>>     return NULL;
>>>>>>
>>>>>>    buffer = (float *) malloc(N * size * sizeof(float));
>>>>>>
>>>>>>    if(buffer)
>>>>>>       for(i=0, p=buffer; i<N; i++, src+=ustride)
>>>>>> for(k=0; k<size; k++)
>>>>>>  *p++ = src[k];
>>>>>>
>>>>>>    return buffer;
>>>>>> }
>>>>>>
>>>>>> Before adding new edge we have in post-header bb:
>>>>>>   <bb 9>:
>>>>>>   # _6 = PHI <0B(8), buffer_20(16)>
>>>>>>   return _6;
>>>>>>
>>>>>> It is clear that we must preserve function semantic and transform it to
>>>>>> _6 = PHI <0B(12), buffer_19(9), buffer_19(4)>
>>>>>
>>>>> Ah, yeah.  I was confusing the loop exit of the inner vs. the outer loop.
>>>>>
>>>>> Richard.
>>>>>
>>>>>>
>>>>>> 2015-10-05 13:57 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>> On Wed, Sep 30, 2015 at 12:46 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>> Hi Richard,
>>>>>>>>
>>>>>>>> I re-designed outer loop unswitching using basic idea of 23855 patch -
>>>>>>>> hoist invariant guard if loop is empty without guard. Note that this
>>>>>>>> was added to loop unswitching pass with simple modifications - using
>>>>>>>> another loop iterator etc.
>>>>>>>>
>>>>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>>>> What is your opinion?
>>>>>>>
>>>>>>> Overall it looks good.  Some comments below - a few more testcases would
>>>>>>> be nice as well.
>>>>>>>
>>>>>>> +  /* Loop must not be infinite.  */
>>>>>>> +  if (!finite_loop_p (loop))
>>>>>>> +    return false;
>>>>>>>
>>>>>>> why's that?
>>>>>>>
>>>>>>> +  body = get_loop_body_in_dom_order (loop);
>>>>>>> +  for (i = 0; i < loop->num_nodes; i++)
>>>>>>> +    {
>>>>>>> +      if (body[i]->loop_father != loop)
>>>>>>> +       continue;
>>>>>>> +      if (!empty_bb_without_guard_p (loop, body[i]))
>>>>>>>
>>>>>>> I wonder if there is a better way to iterate over the interesting
>>>>>>> blocks and PHIs
>>>>>>> we need to check for side-effects (and thus we maybe can avoid gathering
>>>>>>> the loop in DOM order).
>>>>>>>
>>>>>>> +      FOR_EACH_SSA_TREE_OPERAND (name, stmt, op_iter, SSA_OP_DEF)
>>>>>>> +       {
>>>>>>> +         if (may_be_used_outside
>>>>>>>
>>>>>>> may_be_used_outside can be hoisted above the loop.  I wonder if we can take
>>>>>>> advantage of loop-closed SSA form here (and the fact we have a single exit
>>>>>>> from the loop).  Iterating over exit dest PHIs and determining whether the
>>>>>>> exit edge DEF is inside the loop part it may not be should be enough.
>>>>>>>
>>>>>>> +  gcc_assert (single_succ_p (pre_header));
>>>>>>>
>>>>>>> that should be always true.
>>>>>>>
>>>>>>> +  gsi_remove (&gsi, false);
>>>>>>> +  bb = guard->dest;
>>>>>>> +  remove_edge (guard);
>>>>>>> +  /* Update dominance for destination of GUARD.  */
>>>>>>> +  if (EDGE_COUNT (bb->preds) == 0)
>>>>>>> +    {
>>>>>>> +      basic_block s_bb;
>>>>>>> +      gcc_assert (single_succ_p (bb));
>>>>>>> +      s_bb = single_succ (bb);
>>>>>>> +      delete_basic_block (bb);
>>>>>>> +      if (single_pred_p (s_bb))
>>>>>>> +       set_immediate_dominator (CDI_DOMINATORS, s_bb, single_pred (s_bb));
>>>>>>>
>>>>>>> all this massaging should be simplified by leaving it to CFG cleanup by
>>>>>>> simply adjusting the CONDs condition to always true/false.  There is
>>>>>>> gimple_cond_make_{true,false} () for this (would be nice to have a variant
>>>>>>> taking a bool).
>>>>>>>
>>>>>>> +  new_edge = make_edge (pre_header, exit->dest, flags);
>>>>>>> +  if (fix_dom_of_exit)
>>>>>>> +    set_immediate_dominator (CDI_DOMINATORS, exit->dest, pre_header);
>>>>>>> +  update_stmt (gsi_stmt (gsi));
>>>>>>>
>>>>>>> the update_stmt should be not necessary, it's done by gsi_insert_after already.
>>>>>>>
>>>>>>> +  /* Add NEW_ADGE argument for all phi in post-header block.  */
>>>>>>> +  bb = exit->dest;
>>>>>>> +  for (gphi_iterator gsi = gsi_start_phis (bb);
>>>>>>> +       !gsi_end_p (gsi); gsi_next (&gsi))
>>>>>>> +    {
>>>>>>> +      gphi *phi = gsi.phi ();
>>>>>>> +      /* edge_iterator ei; */
>>>>>>> +      tree arg;
>>>>>>> +      if (virtual_operand_p (gimple_phi_result (phi)))
>>>>>>> +       {
>>>>>>> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
>>>>>>> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>>>>>>> +       }
>>>>>>> +      else
>>>>>>> +       {
>>>>>>> +         /* Use exit edge argument.  */
>>>>>>> +         arg = PHI_ARG_DEF_FROM_EDGE (phi, exit);
>>>>>>> +         add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>>>>>>>
>>>>>>> Hum.  How is it ok to use the exit edge argument for the edge that skips
>>>>>>> the loop?  Why can't you always use the pre-header edge value?
>>>>>>> That is, if we have
>>>>>>>
>>>>>>>  for(i=0;i<m;++i)
>>>>>>>    {
>>>>>>>      if (n > 0)
>>>>>>>     {
>>>>>>>      for (;;)
>>>>>>>        {
>>>>>>>        }
>>>>>>>      }
>>>>>>>    }
>>>>>>>   ... = i;
>>>>>>>
>>>>>>> then i is used after the loop and the correct value to use if
>>>>>>> n > 0 is false is '0'.  Maybe this way we can also relax
>>>>>>> what check_exit_phi does?  IMHO the only restriction is
>>>>>>> if sth defined inside the loop before the header check for
>>>>>>> the inner loop is used after the loop.
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Richard.
>>>>>>>
>>>>>>>> Thanks.
>>>>>>>>
>>>>>>>> ChangeLog:
>>>>>>>> 2015-09-30  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>>>>
>>>>>>>> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
>>>>>>>> "cfghooks.h", add prototypes for introduced new functions.
>>>>>>>> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
>>>>>>>> checks on ability of loop unswitching to tree_unswitch_single_loop;
>>>>>>>> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
>>>>>>>> on innermost loop check.
>>>>>>>> (tree_unswitch_single_loop): Add all required checks on ability of
>>>>>>>> loop unswitching under zero recursive level guard.
>>>>>>>> (tree_unswitch_outer_loop): New function.
>>>>>>>> (find_loop_guard): Likewise.
>>>>>>>> (empty_bb_without_guard_p): Likewise.
>>>>>>>> (used_outside_loop_p): Likewise.
>>>>>>>> (hoist_guard): Likewise.
>>>>>>>> (check_exit_phi): Likewise.
>>>>>>>>
>>>>>>>>    gcc/testsuite/ChangeLog:
>>>>>>>> * gcc.dg/loop-unswitch-2.c: New test.
>>>>>>>>
>>>>>>>> 2015-09-16 11:26 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>> Yeah, as said, the patch wasn't fully ready and it also felt odd to do
>>>>>>>>> this hoisting in loop header copying.  Integrating it
>>>>>>>>> with LIM would be a better fit eventually.
>>>>>>>>>
>>>>>>>>> Note that we did agree to go forward with your original patch just
>>>>>>>>> making it more "generically" perform outer loop
>>>>>>>>> unswitching.  Did you explore that idea further?
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On Tue, Sep 15, 2015 at 6:00 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>> Thanks Richard.
>>>>>>>>>>
>>>>>>>>>> I found one more issue that could not be fixed simply. In 23855 you
>>>>>>>>>> consider the following test-case:
>>>>>>>>>> void foo(int *ie, int *je, double *x)
>>>>>>>>>> {
>>>>>>>>>>   int i, j;
>>>>>>>>>>   for (j=0; j<*je; ++j)
>>>>>>>>>>     for (i=0; i<*ie; ++i)
>>>>>>>>>>       x[i+j] = 0.0;
>>>>>>>>>> }
>>>>>>>>>> and proposed to hoist up a check on *ie out of loop. It requires
>>>>>>>>>> memref alias analysis since in general x and ie can alias (if their
>>>>>>>>>> types are compatible - int *ie & int * x). Such analysis is performed
>>>>>>>>>> by pre or lim passes. Without such analysis we can not hoist a test on
>>>>>>>>>> non-zero for *ie out of loop using 238565 patch.
>>>>>>>>>>  The second concern is that proposed copy header algorithm changes
>>>>>>>>>> loop structure significantly and it is not accepted by vectorizer
>>>>>>>>>> since latch is not empty (such transformation assumes loop peeling for
>>>>>>>>>> one iteration. So I can propose to implement simple guard hoisting
>>>>>>>>>> without copying header and tail blocks (if it is possible).
>>>>>>>>>>
>>>>>>>>>> I will appreciate you for any advice or help since without such
>>>>>>>>>> hoisting we are not able to perform outer loop vectorization for
>>>>>>>>>> important benchmark.
>>>>>>>>>> and
>>>>>>>>>>
>>>>>>>>>> 2015-09-15 14:22 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>> On Thu, Sep 3, 2015 at 6:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>>
>>>>>>>>>>>> I started learning, tuning and debugging patch proposed in 23855 and
>>>>>>>>>>>> discovered thta it does not work properly.
>>>>>>>>>>>> So I wonder is it tested patch and it should work?
>>>>>>>>>>>
>>>>>>>>>>> I don't remember, but as it wasn't committed it certainly wasn't ready.
>>>>>>>>>>>
>>>>>>>>>>>> Should it accept for hoisting the following loop nest
>>>>>>>>>>>>   for (i=0; i<n; i++) {
>>>>>>>>>>>>     s = 0;
>>>>>>>>>>>>     for (j=0; j<m; j++)
>>>>>>>>>>>>       s += a[i] * b[j];
>>>>>>>>>>>>     c[i] = s;
>>>>>>>>>>>>   }
>>>>>>>>>>>> Note that i-loop will nit be empty if m is equal to 0.
>>>>>>>>>>>
>>>>>>>>>>> if m is equal to 0 then we still have the c[i] = s store, no?  Of course
>>>>>>>>>>> we could unswitch the outer loop on m == 0 but simple hoisting wouldn't work.
>>>>>>>>>>>
>>>>>>>>>>> Richard.
>>>>>>>>>>>
>>>>>>>>>>>> 2015-08-03 10:27 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>>>> On Fri, Jul 31, 2015 at 1:17 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I learned your updated patch for 23825 and it is more general in
>>>>>>>>>>>>>> comparison with my.
>>>>>>>>>>>>>> I'd like to propose you a compromise - let's consider my patch only
>>>>>>>>>>>>>> for force-vectorize outer loop only to allow outer-loop
>>>>>>>>>>>>>> vecctorization.
>>>>>>>>>>>>>
>>>>>>>>>>>>> I don't see why we should special-case that if the approach in 23825
>>>>>>>>>>>>> is sensible.
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Note that your approach will not hoist invariant
>>>>>>>>>>>>>> guards if loops contains something else except for inner-loop, i.e. it
>>>>>>>>>>>>>> won't be empty for taken branch.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Yes, it does not perform unswitching but guard hoisting.  Note that this
>>>>>>>>>>>>> is originally Zdenek Dvoraks patch.
>>>>>>>>>>>>>
>>>>>>>>>>>>>> I also would like to answer on your last question - CFG cleanup is
>>>>>>>>>>>>>> invoked to perform deletion of single-argument phi nodes from tail
>>>>>>>>>>>>>> block through substitution - such phi's prevent outer-loop
>>>>>>>>>>>>>> vectorization. But it is clear that such transformation can be done
>>>>>>>>>>>>>> other pass.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Hmm, I wonder why the copy_prop pass after unswitching does not
>>>>>>>>>>>>> get rid of them?
>>>>>>>>>>>>>
>>>>>>>>>>>>>> What is your opinion?
>>>>>>>>>>>>>
>>>>>>>>>>>>> My opinion is that if we want to enhance unswitching to catch this
>>>>>>>>>>>>> (or similar) cases then we should make it a lot more general than
>>>>>>>>>>>>> your pattern-matching approach.  I see nothing that should prevent
>>>>>>>>>>>>> us from considering unswitching non-innermost loops in general.
>>>>>>>>>>>>> It should be only a cost consideration to not do non-innermost loop
>>>>>>>>>>>>> unswitching (in addition to maybe a --param specifying the maximum
>>>>>>>>>>>>> depth of a loop nest to unswitch).
>>>>>>>>>>>>>
>>>>>>>>>>>>> So my first thought when seeing your patch still holds - the patch
>>>>>>>>>>>>> looks very much too specific.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Richard.
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Yuri.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> 2015-07-28 13:50 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>>>>>> On Thu, Jul 23, 2015 at 4:45 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I checked that both test-cases from 23855 are sucessfully unswitched
>>>>>>>>>>>>>>>> by proposed patch. I understand that it does not catch deeper loop
>>>>>>>>>>>>>>>> nest as
>>>>>>>>>>>>>>>>    for (i=0; i<10; i++)
>>>>>>>>>>>>>>>>      for (j=0;j<n;j++)
>>>>>>>>>>>>>>>>         for (k=0;k<20;k++)
>>>>>>>>>>>>>>>>   ...
>>>>>>>>>>>>>>>> but duplication of middle-loop does not look reasonable.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Here is dump for your second test-case:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> void foo(int *ie, int *je, double *x)
>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>   int i, j;
>>>>>>>>>>>>>>>>   for (j=0; j<*je; ++j)
>>>>>>>>>>>>>>>>     for (i=0; i<*ie; ++i)
>>>>>>>>>>>>>>>>       x[i+j] = 0.0;
>>>>>>>>>>>>>>>> }
>>>>>>>>>>>>>>>> grep -i unswitch t6.c.119t.unswitch
>>>>>>>>>>>>>>>> ;; Unswitching outer loop
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I was saying that why go with a limited approach when a patch (in
>>>>>>>>>>>>>>> unknown state...)
>>>>>>>>>>>>>>> is available that does it more generally?  Also unswitching is quite
>>>>>>>>>>>>>>> expensive compared
>>>>>>>>>>>>>>> to "moving" the invariant condition.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> In your patch:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> +  if (!nloop->force_vectorize)
>>>>>>>>>>>>>>> +    nloop->force_vectorize = true;
>>>>>>>>>>>>>>> +  if (loop->safelen != 0)
>>>>>>>>>>>>>>> +    nloop->safelen = loop->safelen;
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I see no guard on force_vectorize so = true looks bogus here.  Please just use
>>>>>>>>>>>>>>> copy_loop_info.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> +  if (integer_nonzerop (cond_new))
>>>>>>>>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_true_node);
>>>>>>>>>>>>>>> +  else if (integer_zerop (cond_new))
>>>>>>>>>>>>>>> +    gimple_cond_set_condition_from_tree (cond_stmt, boolean_false_node);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> gimple_cond_make_true/false (cond_stmt);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> btw, seems odd that we have to recompute which loop is the true / false variant
>>>>>>>>>>>>>>> when we just fed a guard condition to loop_version.  Can't we statically
>>>>>>>>>>>>>>> determine whether loop or nloop has the in-loop condition true or false?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> +  /* Clean-up cfg to remove useless one-argument phi in exit block of
>>>>>>>>>>>>>>> +     outer-loop.  */
>>>>>>>>>>>>>>> +  cleanup_tree_cfg ();
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I know unswitching is already O(number-of-unswitched-loops * size-of-function)
>>>>>>>>>>>>>>> because it updates SSA form after each individual unswitching (and it does that
>>>>>>>>>>>>>>> because it invokes itself recursively on unswitched loops).  But do you really
>>>>>>>>>>>>>>> need to invoke CFG cleanup here?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Richard.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Yuri.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 2015-07-14 14:06 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>>>>>>>>>>>> On Fri, Jul 10, 2015 at 12:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>>>>>>>>>>>>> Hi All,
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Here is presented simple transformation which tries to hoist out of
>>>>>>>>>>>>>>>>>> outer-loop a check on zero trip count for inner-loop. This is very
>>>>>>>>>>>>>>>>>> restricted transformation since it accepts outer-loops with very
>>>>>>>>>>>>>>>>>> simple cfg, as for example:
>>>>>>>>>>>>>>>>>>     acc = 0;
>>>>>>>>>>>>>>>>>>    for (i = 1; i <= m; i++) {
>>>>>>>>>>>>>>>>>>       for (j = 0; j < n; j++)
>>>>>>>>>>>>>>>>>>          if (l[j] == i) { v[j] = acc; acc++; };
>>>>>>>>>>>>>>>>>>       acc <<= 1;
>>>>>>>>>>>>>>>>>>    }
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Note that degenerative outer loop (without inner loop) will be
>>>>>>>>>>>>>>>>>> completely deleted as dead code.
>>>>>>>>>>>>>>>>>> The main goal of this transformation was to convert outer-loop to form
>>>>>>>>>>>>>>>>>> accepted by outer-loop vectorization (such test-case is also included
>>>>>>>>>>>>>>>>>> to patch).
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Is it OK for trunk?
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I think this is
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=23855
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> as well.  It has a patch adding a invariant loop guard hoisting
>>>>>>>>>>>>>>>>> phase to loop-header copying.  Yeah, it needs updating to
>>>>>>>>>>>>>>>>> trunk again I suppose.  It's always non-stage1 when I come
>>>>>>>>>>>>>>>>> back to that patch.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Your patch seems to be very specific and only handles outer
>>>>>>>>>>>>>>>>> loops of innermost loops.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Richard.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> ChangeLog:
>>>>>>>>>>>>>>>>>> 2015-07-10  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> * tree-ssa-loop-unswitch.c: Include "tree-cfgcleanup.h" and
>>>>>>>>>>>>>>>>>> "gimple-iterator.h", add prototype for tree_unswitch_outer_loop.
>>>>>>>>>>>>>>>>>> (tree_ssa_unswitch_loops): Add invoke of tree_unswitch_outer_loop.
>>>>>>>>>>>>>>>>>> (tree_unswitch_outer_loop): New function.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> gcc/testsuite/ChangeLog:
>>>>>>>>>>>>>>>>>> * gcc.dg/tree-ssa/unswitch-outer-loop-1.c: New test.
>>>>>>>>>>>>>>>>>> * gcc.dg/vect/vect-outer-simd-3.c: New test.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Unswitching outer loops.
  2015-10-07  9:53                               ` Yuri Rumyantsev
  2015-10-07 15:26                                 ` Yuri Rumyantsev
@ 2015-10-09 19:05                                 ` H.J. Lu
  1 sibling, 0 replies; 17+ messages in thread
From: H.J. Lu @ 2015-10-09 19:05 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: Richard Biener, gcc-patches, Igor Zamyatin

On Wed, Oct 7, 2015 at 2:53 AM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Richard,
>
> I've fixed adding virtual phi argument and add check on irreducible basic block.
> New patch is attached.
>
> I checked it for bootstrap and regression testing, no new failures.
>
> ChangeLog:
> 2015-10-07  Yuri Rumyantsev  <ysrumyan@gmail.com>
>
> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
> "cfghooks.h", add prototypes for introduced new functions.
> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
> checks on ability of loop unswitching to tree_unswitch_single_loop;
> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
> on innermost loop check.
> (tree_unswitch_single_loop): Add all required checks on ability of
> loop unswitching under zero recursive level guard.
> (tree_unswitch_outer_loop): New function.
> (find_loop_guard): Likewise.
> (empty_bb_without_guard_p): Likewise.
> (used_outside_loop_p): Likewise.
> (get_vop_from_header): Likewise.
> (hoist_guard): Likewise.
> (check_exit_phi): Likewise.
>
>    gcc/testsuite/ChangeLog:
> * gcc.dg/loop-unswitch-2.c: New test.
> * gcc.dg/loop-unswitch-3.c: Likewise.
> * gcc.dg/loop-unswitch-4.c: Likewise.
>

This caused:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67909


H.J.

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2015-10-09 19:05 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-07-10 10:03 [PATCH] Unswitching outer loops Yuri Rumyantsev
2015-07-14 11:07 ` Richard Biener
2015-07-23 15:21   ` Yuri Rumyantsev
2015-07-28 11:00     ` Richard Biener
2015-07-31 12:07       ` Yuri Rumyantsev
2015-07-31 15:54         ` Jeff Law
2015-08-03  7:27         ` Richard Biener
     [not found]           ` <CAEoMCqSorkh1WmFtVB_huC2hbcVr8uc1EYaRaNVe1g+5hVuzPw@mail.gmail.com>
     [not found]             ` <CAFiYyc1nCCyF-4BH2hPWkKpmXnaQFQ34RMM5TTuHjZxZ25crrA@mail.gmail.com>
     [not found]               ` <CAEoMCqSRsER9ZGgnX9eJgZJyN4EwkpxzWWk1FHRxWNiEW0HVCg@mail.gmail.com>
     [not found]                 ` <CAFiYyc2O9i690A0LZ0+jEOP8nkyz8Btc0YAb469aMgnRaVsmsQ@mail.gmail.com>
2015-09-30 11:40                   ` Yuri Rumyantsev
2015-10-05 10:57                     ` Richard Biener
2015-10-05 13:13                       ` Yuri Rumyantsev
2015-10-06  7:59                         ` Richard Biener
2015-10-06 11:41                           ` Yuri Rumyantsev
2015-10-06 12:21                             ` Richard Biener
2015-10-07  9:53                               ` Yuri Rumyantsev
2015-10-07 15:26                                 ` Yuri Rumyantsev
2015-10-08 12:31                                   ` Richard Biener
2015-10-09 19:05                                 ` H.J. Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).