public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, stage1][PR65443] Add transform_to_exit_first_loop_alt
@ 2015-03-27 14:10 Tom de Vries
  2015-04-03 12:40 ` Tom de Vries
  0 siblings, 1 reply; 13+ messages in thread
From: Tom de Vries @ 2015-03-27 14:10 UTC (permalink / raw)
  To: GCC Patches; +Cc: Richard Biener, Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 1492 bytes --]

Hi,

this patch fixes PR65443, a todo in the parloops pass for function 
transform_to_exit_first_loop:
...
    TODO: the common case is that latch of the loop is empty and immediately
    follows the loop exit.  In this case, it would be better not to copy the
    body of the loop, but only move the entry of the loop directly before the
    exit check and increase the number of iterations of the loop by one.
    This may need some additional preconditioning in case NIT = ~0.
...

The current implementation of transform_to_exit_first_loop transforms the loop 
by moving and duplicating the loop body. This patch transforms the loop into the 
same normal form, but without the duplication, if that's possible (determined by 
try_transform_to_exit_first_loop_alt).

The actual transformation, done by transform_to_exit_first_loop_alt transforms 
this loop:
...
      bb preheader:
      ...
      goto <bb header>

      <bb header>:
      ...
      if (ivtmp < n)
        goto <bb latch>;
      else
        goto <bb exit>;

      <bb latch>:
      ivtmp = ivtmp + inc;
      goto <bb header>
...

into this one:
...
      bb preheader:
      ...
      goto <bb newheader>

      <bb header>:
      ...
      goto <bb latch>;

      <bb newheader>:
      if (ivtmp < n + 1)
        goto <bb header>;
      else
        goto <bb exit>;

      <bb latch>:
      ivtmp = ivtmp + inc;
      goto <bb newheader>
...

Bootstrapped and reg-tested on x86_64.

OK for stage1 trunk?

Thanks,
- Tom

[-- Attachment #2: 0001-Add-transform_to_exit_first_loop_alt.patch --]
[-- Type: text/x-patch, Size: 18627 bytes --]

Add transform_to_exit_first_loop_alt

2015-03-27  Tom de Vries  <tom@codesourcery.com>

	PR tree-optimization/65443
	* tree-parloops.c (replace_imm_uses, replace_uses_in_bb_by)
	(replace_uses_in_bbs_by, transform_to_exit_first_loop_alt)
	(try_transform_to_exit_first_loop_alt): New function.
	(transform_to_exit_first_loop): Use
	try_transform_to_exit_first_loop_alt.

	* gcc.dg/parloops-exit-first-loop-alt-2.c: New test.
	* gcc.dg/parloops-exit-first-loop-alt-3.c: New test.
	* gcc.dg/parloops-exit-first-loop-alt.c: New test.

	* testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c: New test.
	* testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c: New test.
	* testsuite/libgomp.c/parloops-exit-first-loop-alt.c: New test.
---
 .../gcc.dg/parloops-exit-first-loop-alt-2.c        |  27 ++
 .../gcc.dg/parloops-exit-first-loop-alt-3.c        |  26 ++
 .../gcc.dg/parloops-exit-first-loop-alt.c          |  27 ++
 gcc/tree-parloops.c                                | 344 ++++++++++++++++++++-
 .../libgomp.c/parloops-exit-first-loop-alt-2.c     |  48 +++
 .../libgomp.c/parloops-exit-first-loop-alt-3.c     |  29 ++
 .../libgomp.c/parloops-exit-first-loop-alt.c       |  48 +++
 7 files changed, 538 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c
 create mode 100644 gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c
 create mode 100644 gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c
 create mode 100644 libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c
 create mode 100644 libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c
 create mode 100644 libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c

diff --git a/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c
new file mode 100644
index 0000000..a4d6cb2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target pthread } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2 -fdump-tree-parloops" } */
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+
+    for (i = 0; i < N; ++i)
+      c[i] = a[i] + b[i];
+}
+
+/* Three times three array accesses:
+   - three in f._loopfn.0
+   - three in the parallel
+   - three in the low iteration count loop
+   Crucially, none for a peeled off last iteration following the parallel.  */
+/* { dg-final { scan-tree-dump-times "(?n)\\\[i" 9 "parloops" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops" } } */
diff --git a/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c
new file mode 100644
index 0000000..c7ab51f88
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target pthread } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2 -fdump-tree-parloops" } */
+
+unsigned int *a;
+
+unsigned int __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+  unsigned int sum = 1;
+
+  for (i = 0; i < n; ++i)
+    sum += a[i];
+
+  return sum;
+}
+
+/* Three array accesses:
+   - one in f._loopfn.0
+   - one in the parallel
+   - one in the low iteration count loop
+   Crucially, none for a peeled off last iteration following the parallel.  */
+/* { dg-final { scan-tree-dump-times "(?n)\\\* 4" 3 "parloops" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops" } } */
diff --git a/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c
new file mode 100644
index 0000000..42ca3ac
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target pthread } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2 -fdump-tree-parloops" } */
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+
+  for (i = 0; i < n; ++i)
+    c[i] = a[i] + b[i];
+}
+
+/* Three times three array accesses:
+   - three in f._loopfn.0
+   - three in the parallel
+   - three in the low iteration count loop
+   Crucially, none for a peeled off last iteration following the parallel.  */
+/* { dg-final { scan-tree-dump-times "(?n)\\\[i" 9 "parloops" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops" } } */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index fbb9eeb..a60efbe 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -1495,17 +1495,327 @@ create_loop_fn (location_t loc)
   return decl;
 }
 
-/* Moves the exit condition of LOOP to the beginning of its header, and
-   duplicates the part of the last iteration that gets disabled to the
-   exit of the loop.  NIT is the number of iterations of the loop
-   (used to initialize the variables in the duplicated part).
-
-   TODO: the common case is that latch of the loop is empty and immediately
-   follows the loop exit.  In this case, it would be better not to copy the
-   body of the loop, but only move the entry of the loop directly before the
-   exit check and increase the number of iterations of the loop by one.
-   This may need some additional preconditioning in case NIT = ~0.
-   REDUCTION_LIST describes the reductions in LOOP.  */
+/* Replace uses of VAL in iterator IMM_ITER.  */
+
+static void
+replace_imm_uses (tree val, imm_use_iterator *imm_iter)
+{
+  use_operand_p use_p;
+
+  FOR_EACH_IMM_USE_ON_STMT (use_p, *imm_iter)
+    SET_USE (use_p, val);
+}
+
+/* Replace uses of NAME by VAL in block BB.  */
+
+static void
+replace_uses_in_bb_by (tree name, tree val, basic_block bb)
+{
+  gimple use_stmt;
+  imm_use_iterator imm_iter;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
+    {
+      if (gimple_bb (use_stmt) != bb)
+	continue;
+
+      replace_imm_uses (val, &imm_iter);
+    }
+}
+
+/* Replace uses of NAME by VAL in blocks BBS.  */
+
+static void
+replace_uses_in_bbs_by (tree name, tree val, bitmap bbs)
+{
+  gimple use_stmt;
+  imm_use_iterator imm_iter;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
+    {
+      if (!bitmap_bit_p (bbs, gimple_bb (use_stmt)->index))
+	continue;
+
+      replace_imm_uses (val, &imm_iter);
+    }
+}
+
+/*
+   Do transformation from:
+
+     bb preheader:
+     ...
+     goto <bb header>
+
+     <bb header>:
+     ...
+     if (ivtmp < n)
+       goto <bb latch>;
+     else
+       goto <bb exit>;
+
+     <bb latch>:
+     ivtmp = ivtmp + inc;
+     goto <bb header>
+
+   to:
+
+     bb preheader:
+     ...
+     goto <bb newheader>
+
+     <bb header>:
+     ...
+     goto <bb latch>;
+
+     <bb newheader>:
+     if (ivtmp < n + 1)
+       goto <bb header>;
+     else
+       goto <bb exit>;
+
+     <bb latch>:
+     ivtmp = ivtmp + inc;
+     goto <bb newheader>
+
+   Moves the exit condition of LOOP to the beginning of its header.
+   REDUCTION_LIST describes the reductions in LOOP.  BOUND is the new loop
+   bound.  */
+
+static void
+transform_to_exit_first_loop_alt (struct loop *loop,
+				  reduction_info_table_type *reduction_list,
+				  tree bound)
+{
+  basic_block header = loop->header;
+  basic_block latch = loop->latch;
+  edge exit = single_dom_exit (loop);
+  basic_block exit_block = exit->dest;
+  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
+  tree control = gimple_cond_lhs (cond_stmt);
+  gphi *phi, *nphi;
+  tree res, new_res, init, inc_res;
+
+  /* Gather the bbs dominated by the exit block.  */
+  bitmap exit_dominated = BITMAP_ALLOC (NULL);
+  bitmap_set_bit (exit_dominated, exit_block->index);
+  vec<basic_block> exit_dominated_vec
+    = get_dominated_by (CDI_DOMINATORS, exit_block);
+
+  int i;
+  basic_block dom_bb;
+  FOR_EACH_VEC_ELT (exit_dominated_vec, i, dom_bb)
+    bitmap_set_bit (exit_dominated, dom_bb->index);
+
+  exit_dominated_vec.release ();
+
+  /* Create the new_header block.  */
+  basic_block new_header = split_block_before_cond_jump (exit->src);
+  edge split_edge = single_pred_edge (new_header);
+
+  /* We could redirect entry to new_header, but we want to keep the phis on
+     header intact.  So we create new_entry here, and remove entry later.  */
+  edge entry = loop_preheader_edge (loop);
+  edge new_entry = make_edge (entry->src, new_header, EDGE_FALLTHRU);
+  new_entry->probability = entry->probability;
+
+  /* We could redirect post_inc_edge to new_header, but we want to keep the phis
+     on header intact.  So we create new_post_inc_edge here, and remove
+     post_inc_edge later.  */
+  edge post_inc_edge = single_succ_edge (latch);
+  edge new_post_inc_edge = make_edge (latch, new_header, EDGE_FALLTHRU);
+  new_post_inc_edge->probability = REG_BR_PROB_BASE;
+
+  /* Redirect pre_inc_edge to header.  */
+  edge pre_inc_edge = single_pred_edge (latch);
+  edge new_latch_edge = redirect_edge_and_branch (pre_inc_edge, header);
+
+  /* Redirect split_edge to latch.  */
+  redirect_edge_and_branch (split_edge, latch);
+
+  /* Set the new loop bound.  */
+  gimple_cond_set_rhs (cond_stmt, bound);
+
+  /* Repair the ssa.  */
+  for (gphi_iterator gsi = gsi_start_phis (header);
+       !gsi_end_p (gsi);
+       gsi_next (&gsi))
+    {
+      phi = gsi.phi ();
+      res = PHI_RESULT (phi);
+
+      /* Create new phi.  */
+      new_res = copy_ssa_name (res, phi);
+      nphi = create_phi_node (new_res, new_header);
+
+      /* Set the entry argument of the new phi.  */
+      init = gimple_phi_arg_def (phi, entry->dest_idx);
+      add_phi_arg (nphi, init, new_entry, UNKNOWN_LOCATION);
+
+      replace_uses_in_bb_by (res, new_res, new_header);
+
+      /* Set the back argument of the new phi.  */
+      inc_res = gimple_phi_arg_def (phi, 1 - entry->dest_idx);
+      add_phi_arg (nphi, inc_res, new_post_inc_edge, UNKNOWN_LOCATION);
+
+      /* Fixup old phi.  */
+      add_phi_arg (phi, new_res, new_latch_edge, UNKNOWN_LOCATION);
+
+      /* Loop-closed ssa does not hold for virtuals, so we cannot get away with
+	 exit_block only.  */
+      replace_uses_in_bbs_by (inc_res, new_res, exit_dominated);
+
+      struct reduction_info *red = reduction_phi (reduction_list, phi);
+      gcc_assert (virtual_operand_p (res)
+		  || res == control
+		  || red != NULL);
+
+      if (red)
+	{
+	  /* Register the new reduction phi.  */
+	  red->reduc_phi = nphi;
+	  gimple_set_uid (red->reduc_phi, red->reduc_version);
+	}
+    }
+  BITMAP_FREE (exit_dominated);
+
+  /* Register the reduction exit phis.  */
+  for (gphi_iterator gsi = gsi_start_phis (exit_block);
+       !gsi_end_p (gsi);
+       gsi_next (&gsi))
+    {
+      phi = gsi.phi ();
+      res = PHI_RESULT (phi);
+      if (virtual_operand_p (res))
+	continue;
+
+      tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+      gimple reduc_phi = SSA_NAME_DEF_STMT (val);
+      struct reduction_info *red = reduction_phi (reduction_list, reduc_phi);
+      if (red != NULL)
+	red->keep_res = phi;
+    }
+
+  /* We no longer need the phis on the old loop header.  Remove replaced
+     edges.  */
+  remove_edge (entry);
+  remove_edge (post_inc_edge);
+
+  /* Fix up loop structure.  TODO: Check whether this is sufficient.  */
+  loop->header = new_header;
+
+  /* Recalculate dominance info.  */
+  free_dominance_info (CDI_DOMINATORS);
+  calculate_dominance_info (CDI_DOMINATORS);
+}
+
+/* Tries to moves the exit condition of LOOP to the beginning of its header
+   without duplication of the loop body.  NIT is the number of iterations of the
+   loop.  REDUCTION_LIST describes the reductions in LOOP.  Return true if
+   transformation is successful.  */
+
+static bool
+try_transform_to_exit_first_loop_alt (struct loop *loop,
+				      reduction_info_table_type *reduction_list,
+				      tree nit)
+{
+  /* Check whether the latch contains a single statement.  */
+  if (!gimple_seq_singleton_p (bb_seq (loop->latch)))
+    return true;
+
+  /* Check whether the latch contains the loop iv increment.  */
+  edge back = single_succ_edge (loop->latch);
+  edge exit = single_dom_exit (loop);
+  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
+  tree control = gimple_cond_lhs (cond_stmt);
+  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (control));
+  tree inc_res = gimple_phi_arg_def (phi, back->dest_idx);
+  if (gimple_bb (SSA_NAME_DEF_STMT (inc_res)) != loop->latch)
+    return false;
+
+  /* Check whether there's no code between the loop condition and the latch.  */
+  if (!single_pred_p (loop->latch)
+      || single_pred (loop->latch) != exit->src)
+    return false;
+
+  tree alt_bound = NULL_TREE;
+  tree nit_type = TREE_TYPE (nit);
+
+  /* Figure out whether nit + 1 overflows.  */
+  if (TREE_CODE (nit) == INTEGER_CST)
+    {
+      if (!tree_int_cst_equal (nit, TYPE_MAXVAL (nit_type)))
+	{
+	  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
+				       nit, build_one_cst (nit_type));
+
+	  gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
+	}
+      else
+	{
+	  /* Todo: Figure out if we can trigger this, if it's worth to handle
+	     optimally, and if we can handle it optimally.  */
+	}
+    }
+  else
+    {
+      gcc_assert (TREE_CODE (nit) == SSA_NAME);
+
+      gimple def = SSA_NAME_DEF_STMT (nit);
+
+      if (def
+	  && is_gimple_assign (def)
+	  && gimple_assign_rhs_code (def) == PLUS_EXPR)
+	{
+	  tree op1 = gimple_assign_rhs1 (def);
+	  tree op2 = gimple_assign_rhs2 (def);
+	  if (integer_minus_onep (op1))
+	    alt_bound = op2;
+	  else if (integer_minus_onep (op2))
+	    alt_bound = op1;
+	}
+
+      /* There is a number of test-cases for which we don't get an alt_bound
+	 here: they're listed here, with the lhs of the last stmt as the nit:
+
+	 libgomp.graphite/force-parallel-1.c:
+	 _21 = (signed long) N_6(D);
+	 _19 = _21 + -1;
+	 _7 = (unsigned long) _19;
+
+	 libgomp.graphite/force-parallel-2.c:
+	 _33 = (signed long) N_9(D);
+	 _16 = _33 + -1;
+	 _37 = (unsigned long) _16;
+
+	 libgomp.graphite/force-parallel-5.c:
+	 <bb 6>:
+	 # graphite_IV.5_46 = PHI <0(5), graphite_IV.5_47(11)>
+	 <bb 7>:
+	 _33 = (unsigned long) graphite_IV.5_46;
+
+	 g++.dg/tree-ssa/pr34355.C:
+	 _2 = (unsigned int) i_9;
+	 _3 = 4 - _2;
+
+	 gcc.dg/pr53849.c:
+	 _5 = d.0_11 + -2;
+	 _18 = (unsigned int) _5;
+
+	 We will be able to handle some of these cases, if we can determine when
+	 it's safe to look past casts.  */
+    }
+
+  if (alt_bound == NULL_TREE)
+    return false;
+
+  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
+  return true;
+}
+
+/* Moves the exit condition of LOOP to the beginning of its header.  NIT is the
+   number of iterations of the loop.  REDUCTION_LIST describes the reductions in
+   LOOP.  */
 
 static void
 transform_to_exit_first_loop (struct loop *loop,
@@ -1522,6 +1832,18 @@ transform_to_exit_first_loop (struct loop *loop,
   gcond *cond_stmt, *cond_nit;
   tree nit_1;
 
+  /* The common case is that latch of the loop is empty (apart from the
+     increment) and immediately follows the loop exit test.  Attempt to move the
+     entry of the loop directly before the exit check and increase the number of
+     iterations of the loop by one.  */
+  if (try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
+    return;
+
+  /* Fall back on the method that handles more cases, but duplicates the loop
+     body: move the exit condition of LOOP to the beginning of its header, and
+     duplicate the part of the last iteration that gets disabled to the exit of
+     the loop.  */
+
   split_block_after_labels (loop->header);
   orig_header = single_succ (loop->header);
   hpred = single_succ_edge (loop->header);
diff --git a/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c
new file mode 100644
index 0000000..eb5e11f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (void)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    c[i] = a[i] + b[i];
+}
+
+int
+main (void)
+{
+  int i, j;
+
+  /* Complexify loop to inhibit parloops.  */
+  for (j = 0; j < 100; ++j)
+    for (i = 0; i < 10; i++)
+      {
+	int k = i + (10 * j);
+	a[k] = k;
+	b[k] = (k * 3) % 7;
+	c[k] = k * 2;
+      }
+
+  f ();
+
+  for (i = 0; i < N; i++)
+    {
+      unsigned int actual = c[i];
+      unsigned int expected = i + ((i * 3) % 7);
+      if (actual != expected)
+	abort ();
+    }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c
new file mode 100644
index 0000000..b426b3f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2" } */
+
+unsigned int *a;
+
+unsigned int __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+  unsigned int sum = 1;
+
+  for (i = 0; i < n; ++i)
+    sum += a[i];
+
+  return sum;
+}
+
+int
+main (void)
+{
+  unsigned int res;
+  unsigned int array[4000];
+  int i;
+  for (i = 0; i < 4000; ++i)
+    array[i] = i % 7;
+  a = &array[0];
+  res = f (4000);
+  return !(res == 11995);
+}
diff --git a/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c
new file mode 100644
index 0000000..d7d4003
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+
+  for (i = 0; i < n; ++i)
+    c[i] = a[i] + b[i];
+}
+
+int
+main (void)
+{
+  int i, j;
+
+  /* Complexify loop to inhibit parloops.  */
+  for (j = 0; j < 100; ++j)
+    for (i = 0; i < 10; i++)
+      {
+	int k = i + (10 * j);
+	a[k] = k;
+	b[k] = (k * 3) % 7;
+	c[k] = k * 2;
+      }
+
+  f (N);
+
+  for (i = 0; i < N; i++)
+    {
+      unsigned int actual = c[i];
+      unsigned int expected = i + ((i * 3) % 7);
+      if (actual != expected)
+	abort ();
+    }
+
+  return 0;
+}
-- 
1.9.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH, stage1][PR65443] Add transform_to_exit_first_loop_alt
  2015-03-27 14:10 [PATCH, stage1][PR65443] Add transform_to_exit_first_loop_alt Tom de Vries
@ 2015-04-03 12:40 ` Tom de Vries
  2015-04-15 13:39   ` [PING][PATCH][PR65443] " Tom de Vries
  0 siblings, 1 reply; 13+ messages in thread
From: Tom de Vries @ 2015-04-03 12:40 UTC (permalink / raw)
  To: GCC Patches; +Cc: Richard Biener, Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 1706 bytes --]

On 27-03-15 15:10, Tom de Vries wrote:
> Hi,
>
> this patch fixes PR65443, a todo in the parloops pass for function
> transform_to_exit_first_loop:
> ...
>     TODO: the common case is that latch of the loop is empty and immediately
>     follows the loop exit.  In this case, it would be better not to copy the
>     body of the loop, but only move the entry of the loop directly before the
>     exit check and increase the number of iterations of the loop by one.
>     This may need some additional preconditioning in case NIT = ~0.
> ...
>
> The current implementation of transform_to_exit_first_loop transforms the loop
> by moving and duplicating the loop body. This patch transforms the loop into the
> same normal form, but without the duplication, if that's possible (determined by
> try_transform_to_exit_first_loop_alt).
>
> The actual transformation, done by transform_to_exit_first_loop_alt transforms
> this loop:
> ...
>       bb preheader:
>       ...
>       goto <bb header>
>
>       <bb header>:
>       ...
>       if (ivtmp < n)
>         goto <bb latch>;
>       else
>         goto <bb exit>;
>
>       <bb latch>:
>       ivtmp = ivtmp + inc;
>       goto <bb header>
> ...
>
> into this one:
> ...
>       bb preheader:
>       ...
>       goto <bb newheader>
>
>       <bb header>:
>       ...
>       goto <bb latch>;
>
>       <bb newheader>:
>       if (ivtmp < n + 1)
>         goto <bb header>;
>       else
>         goto <bb exit>;
>
>       <bb latch>:
>       ivtmp = ivtmp + inc;
>       goto <bb newheader>
> ...
>

Updated patch, now using redirect_edge_var_map and flush_pending_stmts.

Bootstrapped and reg-tested on x86_64.

OK for stage1 trunk?

Thanks,
- Tom



[-- Attachment #2: 0001-Add-transform_to_exit_first_loop_alt.patch --]
[-- Type: text/x-patch, Size: 18868 bytes --]

Add transform_to_exit_first_loop_alt

2015-03-27  Tom de Vries  <tom@codesourcery.com>

	PR tree-optimization/65443
	* tree-parloops.c (replace_imm_uses, replace_uses_in_bb_by)
	(replace_uses_in_bbs_by, transform_to_exit_first_loop_alt)
	(try_transform_to_exit_first_loop_alt): New function.
	(transform_to_exit_first_loop): Use
	try_transform_to_exit_first_loop_alt.

	* gcc.dg/parloops-exit-first-loop-alt-2.c: New test.
	* gcc.dg/parloops-exit-first-loop-alt-3.c: New test.
	* gcc.dg/parloops-exit-first-loop-alt.c: New test.

	* testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c: New test.
	* testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c: New test.
	* testsuite/libgomp.c/parloops-exit-first-loop-alt.c: New test.
---
 .../gcc.dg/parloops-exit-first-loop-alt-2.c        |  27 ++
 .../gcc.dg/parloops-exit-first-loop-alt-3.c        |  26 ++
 .../gcc.dg/parloops-exit-first-loop-alt.c          |  27 ++
 gcc/tree-parloops.c                                | 341 ++++++++++++++++++++-
 .../libgomp.c/parloops-exit-first-loop-alt-2.c     |  48 +++
 .../libgomp.c/parloops-exit-first-loop-alt-3.c     |  29 ++
 .../libgomp.c/parloops-exit-first-loop-alt.c       |  48 +++
 7 files changed, 534 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c
 create mode 100644 gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c
 create mode 100644 gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c
 create mode 100644 libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c
 create mode 100644 libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c
 create mode 100644 libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c

diff --git a/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c
new file mode 100644
index 0000000..a4d6cb2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target pthread } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2 -fdump-tree-parloops" } */
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+
+    for (i = 0; i < N; ++i)
+      c[i] = a[i] + b[i];
+}
+
+/* Three times three array accesses:
+   - three in f._loopfn.0
+   - three in the parallel
+   - three in the low iteration count loop
+   Crucially, none for a peeled off last iteration following the parallel.  */
+/* { dg-final { scan-tree-dump-times "(?n)\\\[i" 9 "parloops" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops" } } */
diff --git a/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c
new file mode 100644
index 0000000..c7ab51f88
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target pthread } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2 -fdump-tree-parloops" } */
+
+unsigned int *a;
+
+unsigned int __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+  unsigned int sum = 1;
+
+  for (i = 0; i < n; ++i)
+    sum += a[i];
+
+  return sum;
+}
+
+/* Three array accesses:
+   - one in f._loopfn.0
+   - one in the parallel
+   - one in the low iteration count loop
+   Crucially, none for a peeled off last iteration following the parallel.  */
+/* { dg-final { scan-tree-dump-times "(?n)\\\* 4" 3 "parloops" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops" } } */
diff --git a/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c
new file mode 100644
index 0000000..42ca3ac
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target pthread } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2 -fdump-tree-parloops" } */
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+
+  for (i = 0; i < n; ++i)
+    c[i] = a[i] + b[i];
+}
+
+/* Three times three array accesses:
+   - three in f._loopfn.0
+   - three in the parallel
+   - three in the low iteration count loop
+   Crucially, none for a peeled off last iteration following the parallel.  */
+/* { dg-final { scan-tree-dump-times "(?n)\\\[i" 9 "parloops" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops" } } */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index 62a6444..acb0010 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -78,6 +78,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "plugin-api.h"
 #include "ipa-ref.h"
 #include "cgraph.h"
+#include "tree-ssa.h"
 
 /* This pass tries to distribute iterations of loops into several threads.
    The implementation is straightforward -- for each loop we test whether its
@@ -1487,17 +1488,322 @@ create_loop_fn (location_t loc)
   return decl;
 }
 
-/* Moves the exit condition of LOOP to the beginning of its header, and
-   duplicates the part of the last iteration that gets disabled to the
-   exit of the loop.  NIT is the number of iterations of the loop
-   (used to initialize the variables in the duplicated part).
+/* Replace uses of VAL in iterator IMM_ITER.  */
 
-   TODO: the common case is that latch of the loop is empty and immediately
-   follows the loop exit.  In this case, it would be better not to copy the
-   body of the loop, but only move the entry of the loop directly before the
-   exit check and increase the number of iterations of the loop by one.
-   This may need some additional preconditioning in case NIT = ~0.
-   REDUCTION_LIST describes the reductions in LOOP.  */
+static void
+replace_imm_uses (tree val, imm_use_iterator *imm_iter)
+{
+  use_operand_p use_p;
+
+  FOR_EACH_IMM_USE_ON_STMT (use_p, *imm_iter)
+    SET_USE (use_p, val);
+}
+
+/* Replace uses of NAME by VAL in block BB.  */
+
+static void
+replace_uses_in_bb_by (tree name, tree val, basic_block bb)
+{
+  gimple use_stmt;
+  imm_use_iterator imm_iter;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
+    {
+      if (gimple_bb (use_stmt) != bb)
+	continue;
+
+      replace_imm_uses (val, &imm_iter);
+    }
+}
+
+/* Replace uses of NAME by VAL in blocks BBS.  */
+
+static void
+replace_uses_in_bbs_by (tree name, tree val, bitmap bbs)
+{
+  gimple use_stmt;
+  imm_use_iterator imm_iter;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
+    {
+      if (!bitmap_bit_p (bbs, gimple_bb (use_stmt)->index))
+	continue;
+
+      replace_imm_uses (val, &imm_iter);
+    }
+}
+
+/* Do transformation from:
+
+     bb preheader:
+     ...
+     goto <bb header>
+
+     <bb header>:
+     ...
+     if (ivtmp < n)
+       goto <bb latch>;
+     else
+       goto <bb exit>;
+
+     <bb latch>:
+     ivtmp = ivtmp + inc;
+     goto <bb header>
+
+   to:
+
+     bb preheader:
+     ...
+     goto <bb newheader>
+
+     <bb header>:
+     ...
+     goto <bb latch>;
+
+     <bb newheader>:
+     if (ivtmp < n + 1)
+       goto <bb header>;
+     else
+       goto <bb exit>;
+
+     <bb latch>:
+     ivtmp = ivtmp + inc;
+     goto <bb newheader>
+
+   Moves the exit condition of LOOP to the beginning of its header.
+   REDUCTION_LIST describes the reductions in LOOP.  BOUND is the new loop
+   bound.  */
+
+static void
+transform_to_exit_first_loop_alt (struct loop *loop,
+				  reduction_info_table_type *reduction_list,
+				  tree bound)
+{
+  basic_block header = loop->header;
+  basic_block latch = loop->latch;
+  edge exit = single_dom_exit (loop);
+  basic_block exit_block = exit->dest;
+  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
+  tree control = gimple_cond_lhs (cond_stmt);
+  edge e;
+
+  /* Gather the bbs dominated by the exit block.  */
+  bitmap exit_dominated = BITMAP_ALLOC (NULL);
+  bitmap_set_bit (exit_dominated, exit_block->index);
+  vec<basic_block> exit_dominated_vec
+    = get_dominated_by (CDI_DOMINATORS, exit_block);
+
+  int i;
+  basic_block dom_bb;
+  FOR_EACH_VEC_ELT (exit_dominated_vec, i, dom_bb)
+    bitmap_set_bit (exit_dominated, dom_bb->index);
+
+  exit_dominated_vec.release ();
+
+  /* Create the new_header block.  */
+  basic_block new_header = split_block_before_cond_jump (exit->src);
+  edge split_edge = single_pred_edge (new_header);
+
+  /* Redirect entry edge to new_header.  */
+  edge entry = loop_preheader_edge (loop);
+  e = redirect_edge_and_branch (entry, new_header);
+  gcc_assert (e == entry);
+
+  /* Redirect post_inc_edge to new_header.  */
+  edge post_inc_edge = single_succ_edge (latch);
+  e = redirect_edge_and_branch (post_inc_edge, new_header);
+  gcc_assert (e == post_inc_edge);
+
+  /* Redirect post_cond_edge to header.  */
+  edge post_cond_edge = single_pred_edge (latch);
+  e = redirect_edge_and_branch (post_cond_edge, header);
+  gcc_assert (e == post_cond_edge);
+
+  /* Redirect split_edge to latch.  */
+  e = redirect_edge_and_branch (split_edge, latch);
+  gcc_assert (e == split_edge);
+
+  /* Set the new loop bound.  */
+  gimple_cond_set_rhs (cond_stmt, bound);
+
+  /* Repair the ssa.  */
+  vec<edge_var_map> *v = redirect_edge_var_map_vector (post_inc_edge);
+  edge_var_map *vm;
+  gphi_iterator gsi;
+  for (gsi = gsi_start_phis (header), i = 0;
+       !gsi_end_p (gsi) && v->iterate (i, &vm);
+       gsi_next (&gsi), i++)
+    {
+      gphi *phi = gsi.phi ();
+      tree res = PHI_RESULT (phi);
+
+      /* Create new phi.  */
+      tree new_res = copy_ssa_name (res, phi);
+      gphi *nphi = create_phi_node (new_res, new_header);
+
+      replace_uses_in_bb_by (res, new_res, new_header);
+
+      /* Fixup old phi.  */
+      add_phi_arg (phi, new_res, post_cond_edge, UNKNOWN_LOCATION);
+
+      /* Loop-closed ssa does not hold for virtuals, so we cannot get away with
+	 exit_block only.  */
+      tree inc_res = redirect_edge_var_map_def (vm);
+      replace_uses_in_bbs_by (inc_res, new_res, exit_dominated);
+
+      struct reduction_info *red = reduction_phi (reduction_list, phi);
+      gcc_assert (virtual_operand_p (res)
+		  || res == control
+		  || red != NULL);
+
+      if (red)
+	{
+	  /* Register the new reduction phi.  */
+	  red->reduc_phi = nphi;
+	  gimple_set_uid (red->reduc_phi, red->reduc_version);
+	}
+    }
+  gcc_assert (gsi_end_p (gsi) && !v->iterate (i, &vm));
+  BITMAP_FREE (exit_dominated);
+
+  /* Set the entry argument of the new phis.  */
+  flush_pending_stmts (entry);
+
+  /* Set the back argument of the new phis.  */
+  flush_pending_stmts (post_inc_edge);
+
+  /* Register the reduction exit phis.  */
+  for (gphi_iterator gsi = gsi_start_phis (exit_block);
+       !gsi_end_p (gsi);
+       gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      tree res = PHI_RESULT (phi);
+      if (virtual_operand_p (res))
+	continue;
+
+      tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+      gimple reduc_phi = SSA_NAME_DEF_STMT (val);
+      struct reduction_info *red = reduction_phi (reduction_list, reduc_phi);
+      if (red != NULL)
+	red->keep_res = phi;
+    }
+
+  /* Fix up loop structure.  TODO: Check whether this is sufficient.  */
+  loop->header = new_header;
+
+  /* Recalculate dominance info.  */
+  free_dominance_info (CDI_DOMINATORS);
+  calculate_dominance_info (CDI_DOMINATORS);
+}
+
+/* Tries to moves the exit condition of LOOP to the beginning of its header
+   without duplication of the loop body.  NIT is the number of iterations of the
+   loop.  REDUCTION_LIST describes the reductions in LOOP.  Return true if
+   transformation is successful.  */
+
+static bool
+try_transform_to_exit_first_loop_alt (struct loop *loop,
+				      reduction_info_table_type *reduction_list,
+				      tree nit)
+{
+  /* Check whether the latch contains a single statement.  */
+  if (!gimple_seq_singleton_p (bb_seq (loop->latch)))
+    return true;
+
+  /* Check whether the latch contains the loop iv increment.  */
+  edge back = single_succ_edge (loop->latch);
+  edge exit = single_dom_exit (loop);
+  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
+  tree control = gimple_cond_lhs (cond_stmt);
+  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (control));
+  tree inc_res = gimple_phi_arg_def (phi, back->dest_idx);
+  if (gimple_bb (SSA_NAME_DEF_STMT (inc_res)) != loop->latch)
+    return false;
+
+  /* Check whether there's no code between the loop condition and the latch.  */
+  if (!single_pred_p (loop->latch)
+      || single_pred (loop->latch) != exit->src)
+    return false;
+
+  tree alt_bound = NULL_TREE;
+  tree nit_type = TREE_TYPE (nit);
+
+  /* Figure out whether nit + 1 overflows.  */
+  if (TREE_CODE (nit) == INTEGER_CST)
+    {
+      if (!tree_int_cst_equal (nit, TYPE_MAXVAL (nit_type)))
+	{
+	  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
+				       nit, build_one_cst (nit_type));
+
+	  gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
+	}
+      else
+	{
+	  /* Todo: Figure out if we can trigger this, if it's worth to handle
+	     optimally, and if we can handle it optimally.  */
+	}
+    }
+  else
+    {
+      gcc_assert (TREE_CODE (nit) == SSA_NAME);
+
+      gimple def = SSA_NAME_DEF_STMT (nit);
+
+      if (def
+	  && is_gimple_assign (def)
+	  && gimple_assign_rhs_code (def) == PLUS_EXPR)
+	{
+	  tree op1 = gimple_assign_rhs1 (def);
+	  tree op2 = gimple_assign_rhs2 (def);
+	  if (integer_minus_onep (op1))
+	    alt_bound = op2;
+	  else if (integer_minus_onep (op2))
+	    alt_bound = op1;
+	}
+
+      /* There is a number of test-cases for which we don't get an alt_bound
+	 here: they're listed here, with the lhs of the last stmt as the nit:
+
+	 libgomp.graphite/force-parallel-1.c:
+	 _21 = (signed long) N_6(D);
+	 _19 = _21 + -1;
+	 _7 = (unsigned long) _19;
+
+	 libgomp.graphite/force-parallel-2.c:
+	 _33 = (signed long) N_9(D);
+	 _16 = _33 + -1;
+	 _37 = (unsigned long) _16;
+
+	 libgomp.graphite/force-parallel-5.c:
+	 <bb 6>:
+	 # graphite_IV.5_46 = PHI <0(5), graphite_IV.5_47(11)>
+	 <bb 7>:
+	 _33 = (unsigned long) graphite_IV.5_46;
+
+	 g++.dg/tree-ssa/pr34355.C:
+	 _2 = (unsigned int) i_9;
+	 _3 = 4 - _2;
+
+	 gcc.dg/pr53849.c:
+	 _5 = d.0_11 + -2;
+	 _18 = (unsigned int) _5;
+
+	 We will be able to handle some of these cases, if we can determine when
+	 it's safe to look past casts.  */
+    }
+
+  if (alt_bound == NULL_TREE)
+    return false;
+
+  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
+  return true;
+}
+
+/* Moves the exit condition of LOOP to the beginning of its header.  NIT is the
+   number of iterations of the loop.  REDUCTION_LIST describes the reductions in
+   LOOP.  */
 
 static void
 transform_to_exit_first_loop (struct loop *loop,
@@ -1882,8 +2188,19 @@ gen_parallel_loop (struct loop *loop,
   /* Base all the induction variables in LOOP on a single control one.  */
   canonicalize_loop_ivs (loop, &nit, true);
 
-  /* Ensure that the exit condition is the first statement in the loop.  */
-  transform_to_exit_first_loop (loop, reduction_list, nit);
+  /* Ensure that the exit condition is the first statement in the loop.
+     The common case is that latch of the loop is empty (apart from the
+     increment) and immediately follows the loop exit test.  Attempt to move the
+     entry of the loop directly before the exit check and increase the number of
+     iterations of the loop by one.  */
+  if (!try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
+    {
+      /* Fall back on the method that handles more cases, but duplicates the
+	 loop body: move the exit condition of LOOP to the beginning of its
+	 header, and duplicate the part of the last iteration that gets disabled
+	 to the exit of the loop.  */
+      transform_to_exit_first_loop (loop, reduction_list, nit);
+    }
 
   /* Generate initializations for reductions.  */
   if (reduction_list->elements () > 0)
diff --git a/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c
new file mode 100644
index 0000000..eb5e11f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (void)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    c[i] = a[i] + b[i];
+}
+
+int
+main (void)
+{
+  int i, j;
+
+  /* Complexify loop to inhibit parloops.  */
+  for (j = 0; j < 100; ++j)
+    for (i = 0; i < 10; i++)
+      {
+	int k = i + (10 * j);
+	a[k] = k;
+	b[k] = (k * 3) % 7;
+	c[k] = k * 2;
+      }
+
+  f ();
+
+  for (i = 0; i < N; i++)
+    {
+      unsigned int actual = c[i];
+      unsigned int expected = i + ((i * 3) % 7);
+      if (actual != expected)
+	abort ();
+    }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c
new file mode 100644
index 0000000..b426b3f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2" } */
+
+unsigned int *a;
+
+unsigned int __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+  unsigned int sum = 1;
+
+  for (i = 0; i < n; ++i)
+    sum += a[i];
+
+  return sum;
+}
+
+int
+main (void)
+{
+  unsigned int res;
+  unsigned int array[4000];
+  int i;
+  for (i = 0; i < 4000; ++i)
+    array[i] = i % 7;
+  a = &array[0];
+  res = f (4000);
+  return !(res == 11995);
+}
diff --git a/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c
new file mode 100644
index 0000000..d7d4003
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+
+  for (i = 0; i < n; ++i)
+    c[i] = a[i] + b[i];
+}
+
+int
+main (void)
+{
+  int i, j;
+
+  /* Complexify loop to inhibit parloops.  */
+  for (j = 0; j < 100; ++j)
+    for (i = 0; i < 10; i++)
+      {
+	int k = i + (10 * j);
+	a[k] = k;
+	b[k] = (k * 3) % 7;
+	c[k] = k * 2;
+      }
+
+  f (N);
+
+  for (i = 0; i < N; i++)
+    {
+      unsigned int actual = c[i];
+      unsigned int expected = i + ((i * 3) % 7);
+      if (actual != expected)
+	abort ();
+    }
+
+  return 0;
+}
-- 
1.9.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PING][PATCH][PR65443] Add transform_to_exit_first_loop_alt
  2015-04-03 12:40 ` Tom de Vries
@ 2015-04-15 13:39   ` Tom de Vries
  2015-04-20 12:26     ` Richard Biener
  0 siblings, 1 reply; 13+ messages in thread
From: Tom de Vries @ 2015-04-15 13:39 UTC (permalink / raw)
  To: GCC Patches; +Cc: Richard Biener, Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 1821 bytes --]

On 03-04-15 14:39, Tom de Vries wrote:
> On 27-03-15 15:10, Tom de Vries wrote:
>> Hi,
>>
>> this patch fixes PR65443, a todo in the parloops pass for function
>> transform_to_exit_first_loop:
>> ...
>>     TODO: the common case is that latch of the loop is empty and immediately
>>     follows the loop exit.  In this case, it would be better not to copy the
>>     body of the loop, but only move the entry of the loop directly before the
>>     exit check and increase the number of iterations of the loop by one.
>>     This may need some additional preconditioning in case NIT = ~0.
>> ...
>>
>> The current implementation of transform_to_exit_first_loop transforms the loop
>> by moving and duplicating the loop body. This patch transforms the loop into the
>> same normal form, but without the duplication, if that's possible (determined by
>> try_transform_to_exit_first_loop_alt).
>>
>> The actual transformation, done by transform_to_exit_first_loop_alt transforms
>> this loop:
>> ...
>>       bb preheader:
>>       ...
>>       goto <bb header>
>>
>>       <bb header>:
>>       ...
>>       if (ivtmp < n)
>>         goto <bb latch>;
>>       else
>>         goto <bb exit>;
>>
>>       <bb latch>:
>>       ivtmp = ivtmp + inc;
>>       goto <bb header>
>> ...
>>
>> into this one:
>> ...
>>       bb preheader:
>>       ...
>>       goto <bb newheader>
>>
>>       <bb header>:
>>       ...
>>       goto <bb latch>;
>>
>>       <bb newheader>:
>>       if (ivtmp < n + 1)
>>         goto <bb header>;
>>       else
>>         goto <bb exit>;
>>
>>       <bb latch>:
>>       ivtmp = ivtmp + inc;
>>       goto <bb newheader>
>> ...
>>
>
> Updated patch, now using redirect_edge_var_map and flush_pending_stmts.
>
> Bootstrapped and reg-tested on x86_64.
>
> OK for stage1 trunk?
>

Ping.

Thanks,
- Tom


[-- Attachment #2: 0001-Add-transform_to_exit_first_loop_alt.patch --]
[-- Type: text/x-patch, Size: 18868 bytes --]

Add transform_to_exit_first_loop_alt

2015-03-27  Tom de Vries  <tom@codesourcery.com>

	PR tree-optimization/65443
	* tree-parloops.c (replace_imm_uses, replace_uses_in_bb_by)
	(replace_uses_in_bbs_by, transform_to_exit_first_loop_alt)
	(try_transform_to_exit_first_loop_alt): New function.
	(transform_to_exit_first_loop): Use
	try_transform_to_exit_first_loop_alt.

	* gcc.dg/parloops-exit-first-loop-alt-2.c: New test.
	* gcc.dg/parloops-exit-first-loop-alt-3.c: New test.
	* gcc.dg/parloops-exit-first-loop-alt.c: New test.

	* testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c: New test.
	* testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c: New test.
	* testsuite/libgomp.c/parloops-exit-first-loop-alt.c: New test.
---
 .../gcc.dg/parloops-exit-first-loop-alt-2.c        |  27 ++
 .../gcc.dg/parloops-exit-first-loop-alt-3.c        |  26 ++
 .../gcc.dg/parloops-exit-first-loop-alt.c          |  27 ++
 gcc/tree-parloops.c                                | 341 ++++++++++++++++++++-
 .../libgomp.c/parloops-exit-first-loop-alt-2.c     |  48 +++
 .../libgomp.c/parloops-exit-first-loop-alt-3.c     |  29 ++
 .../libgomp.c/parloops-exit-first-loop-alt.c       |  48 +++
 7 files changed, 534 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c
 create mode 100644 gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c
 create mode 100644 gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c
 create mode 100644 libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c
 create mode 100644 libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c
 create mode 100644 libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c

diff --git a/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c
new file mode 100644
index 0000000..a4d6cb2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target pthread } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2 -fdump-tree-parloops" } */
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+
+    for (i = 0; i < N; ++i)
+      c[i] = a[i] + b[i];
+}
+
+/* Three times three array accesses:
+   - three in f._loopfn.0
+   - three in the parallel
+   - three in the low iteration count loop
+   Crucially, none for a peeled off last iteration following the parallel.  */
+/* { dg-final { scan-tree-dump-times "(?n)\\\[i" 9 "parloops" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops" } } */
diff --git a/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c
new file mode 100644
index 0000000..c7ab51f88
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target pthread } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2 -fdump-tree-parloops" } */
+
+unsigned int *a;
+
+unsigned int __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+  unsigned int sum = 1;
+
+  for (i = 0; i < n; ++i)
+    sum += a[i];
+
+  return sum;
+}
+
+/* Three array accesses:
+   - one in f._loopfn.0
+   - one in the parallel
+   - one in the low iteration count loop
+   Crucially, none for a peeled off last iteration following the parallel.  */
+/* { dg-final { scan-tree-dump-times "(?n)\\\* 4" 3 "parloops" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops" } } */
diff --git a/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c
new file mode 100644
index 0000000..42ca3ac
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target pthread } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2 -fdump-tree-parloops" } */
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+
+  for (i = 0; i < n; ++i)
+    c[i] = a[i] + b[i];
+}
+
+/* Three times three array accesses:
+   - three in f._loopfn.0
+   - three in the parallel
+   - three in the low iteration count loop
+   Crucially, none for a peeled off last iteration following the parallel.  */
+/* { dg-final { scan-tree-dump-times "(?n)\\\[i" 9 "parloops" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops" } } */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index 62a6444..acb0010 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -78,6 +78,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "plugin-api.h"
 #include "ipa-ref.h"
 #include "cgraph.h"
+#include "tree-ssa.h"
 
 /* This pass tries to distribute iterations of loops into several threads.
    The implementation is straightforward -- for each loop we test whether its
@@ -1487,17 +1488,322 @@ create_loop_fn (location_t loc)
   return decl;
 }
 
-/* Moves the exit condition of LOOP to the beginning of its header, and
-   duplicates the part of the last iteration that gets disabled to the
-   exit of the loop.  NIT is the number of iterations of the loop
-   (used to initialize the variables in the duplicated part).
+/* Replace uses of VAL in iterator IMM_ITER.  */
 
-   TODO: the common case is that latch of the loop is empty and immediately
-   follows the loop exit.  In this case, it would be better not to copy the
-   body of the loop, but only move the entry of the loop directly before the
-   exit check and increase the number of iterations of the loop by one.
-   This may need some additional preconditioning in case NIT = ~0.
-   REDUCTION_LIST describes the reductions in LOOP.  */
+static void
+replace_imm_uses (tree val, imm_use_iterator *imm_iter)
+{
+  use_operand_p use_p;
+
+  FOR_EACH_IMM_USE_ON_STMT (use_p, *imm_iter)
+    SET_USE (use_p, val);
+}
+
+/* Replace uses of NAME by VAL in block BB.  */
+
+static void
+replace_uses_in_bb_by (tree name, tree val, basic_block bb)
+{
+  gimple use_stmt;
+  imm_use_iterator imm_iter;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
+    {
+      if (gimple_bb (use_stmt) != bb)
+	continue;
+
+      replace_imm_uses (val, &imm_iter);
+    }
+}
+
+/* Replace uses of NAME by VAL in blocks BBS.  */
+
+static void
+replace_uses_in_bbs_by (tree name, tree val, bitmap bbs)
+{
+  gimple use_stmt;
+  imm_use_iterator imm_iter;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
+    {
+      if (!bitmap_bit_p (bbs, gimple_bb (use_stmt)->index))
+	continue;
+
+      replace_imm_uses (val, &imm_iter);
+    }
+}
+
+/* Do transformation from:
+
+     bb preheader:
+     ...
+     goto <bb header>
+
+     <bb header>:
+     ...
+     if (ivtmp < n)
+       goto <bb latch>;
+     else
+       goto <bb exit>;
+
+     <bb latch>:
+     ivtmp = ivtmp + inc;
+     goto <bb header>
+
+   to:
+
+     bb preheader:
+     ...
+     goto <bb newheader>
+
+     <bb header>:
+     ...
+     goto <bb latch>;
+
+     <bb newheader>:
+     if (ivtmp < n + 1)
+       goto <bb header>;
+     else
+       goto <bb exit>;
+
+     <bb latch>:
+     ivtmp = ivtmp + inc;
+     goto <bb newheader>
+
+   Moves the exit condition of LOOP to the beginning of its header.
+   REDUCTION_LIST describes the reductions in LOOP.  BOUND is the new loop
+   bound.  */
+
+static void
+transform_to_exit_first_loop_alt (struct loop *loop,
+				  reduction_info_table_type *reduction_list,
+				  tree bound)
+{
+  basic_block header = loop->header;
+  basic_block latch = loop->latch;
+  edge exit = single_dom_exit (loop);
+  basic_block exit_block = exit->dest;
+  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
+  tree control = gimple_cond_lhs (cond_stmt);
+  edge e;
+
+  /* Gather the bbs dominated by the exit block.  */
+  bitmap exit_dominated = BITMAP_ALLOC (NULL);
+  bitmap_set_bit (exit_dominated, exit_block->index);
+  vec<basic_block> exit_dominated_vec
+    = get_dominated_by (CDI_DOMINATORS, exit_block);
+
+  int i;
+  basic_block dom_bb;
+  FOR_EACH_VEC_ELT (exit_dominated_vec, i, dom_bb)
+    bitmap_set_bit (exit_dominated, dom_bb->index);
+
+  exit_dominated_vec.release ();
+
+  /* Create the new_header block.  */
+  basic_block new_header = split_block_before_cond_jump (exit->src);
+  edge split_edge = single_pred_edge (new_header);
+
+  /* Redirect entry edge to new_header.  */
+  edge entry = loop_preheader_edge (loop);
+  e = redirect_edge_and_branch (entry, new_header);
+  gcc_assert (e == entry);
+
+  /* Redirect post_inc_edge to new_header.  */
+  edge post_inc_edge = single_succ_edge (latch);
+  e = redirect_edge_and_branch (post_inc_edge, new_header);
+  gcc_assert (e == post_inc_edge);
+
+  /* Redirect post_cond_edge to header.  */
+  edge post_cond_edge = single_pred_edge (latch);
+  e = redirect_edge_and_branch (post_cond_edge, header);
+  gcc_assert (e == post_cond_edge);
+
+  /* Redirect split_edge to latch.  */
+  e = redirect_edge_and_branch (split_edge, latch);
+  gcc_assert (e == split_edge);
+
+  /* Set the new loop bound.  */
+  gimple_cond_set_rhs (cond_stmt, bound);
+
+  /* Repair the ssa.  */
+  vec<edge_var_map> *v = redirect_edge_var_map_vector (post_inc_edge);
+  edge_var_map *vm;
+  gphi_iterator gsi;
+  for (gsi = gsi_start_phis (header), i = 0;
+       !gsi_end_p (gsi) && v->iterate (i, &vm);
+       gsi_next (&gsi), i++)
+    {
+      gphi *phi = gsi.phi ();
+      tree res = PHI_RESULT (phi);
+
+      /* Create new phi.  */
+      tree new_res = copy_ssa_name (res, phi);
+      gphi *nphi = create_phi_node (new_res, new_header);
+
+      replace_uses_in_bb_by (res, new_res, new_header);
+
+      /* Fixup old phi.  */
+      add_phi_arg (phi, new_res, post_cond_edge, UNKNOWN_LOCATION);
+
+      /* Loop-closed ssa does not hold for virtuals, so we cannot get away with
+	 exit_block only.  */
+      tree inc_res = redirect_edge_var_map_def (vm);
+      replace_uses_in_bbs_by (inc_res, new_res, exit_dominated);
+
+      struct reduction_info *red = reduction_phi (reduction_list, phi);
+      gcc_assert (virtual_operand_p (res)
+		  || res == control
+		  || red != NULL);
+
+      if (red)
+	{
+	  /* Register the new reduction phi.  */
+	  red->reduc_phi = nphi;
+	  gimple_set_uid (red->reduc_phi, red->reduc_version);
+	}
+    }
+  gcc_assert (gsi_end_p (gsi) && !v->iterate (i, &vm));
+  BITMAP_FREE (exit_dominated);
+
+  /* Set the entry argument of the new phis.  */
+  flush_pending_stmts (entry);
+
+  /* Set the back argument of the new phis.  */
+  flush_pending_stmts (post_inc_edge);
+
+  /* Register the reduction exit phis.  */
+  for (gphi_iterator gsi = gsi_start_phis (exit_block);
+       !gsi_end_p (gsi);
+       gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      tree res = PHI_RESULT (phi);
+      if (virtual_operand_p (res))
+	continue;
+
+      tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+      gimple reduc_phi = SSA_NAME_DEF_STMT (val);
+      struct reduction_info *red = reduction_phi (reduction_list, reduc_phi);
+      if (red != NULL)
+	red->keep_res = phi;
+    }
+
+  /* Fix up loop structure.  TODO: Check whether this is sufficient.  */
+  loop->header = new_header;
+
+  /* Recalculate dominance info.  */
+  free_dominance_info (CDI_DOMINATORS);
+  calculate_dominance_info (CDI_DOMINATORS);
+}
+
+/* Tries to moves the exit condition of LOOP to the beginning of its header
+   without duplication of the loop body.  NIT is the number of iterations of the
+   loop.  REDUCTION_LIST describes the reductions in LOOP.  Return true if
+   transformation is successful.  */
+
+static bool
+try_transform_to_exit_first_loop_alt (struct loop *loop,
+				      reduction_info_table_type *reduction_list,
+				      tree nit)
+{
+  /* Check whether the latch contains a single statement.  */
+  if (!gimple_seq_singleton_p (bb_seq (loop->latch)))
+    return true;
+
+  /* Check whether the latch contains the loop iv increment.  */
+  edge back = single_succ_edge (loop->latch);
+  edge exit = single_dom_exit (loop);
+  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
+  tree control = gimple_cond_lhs (cond_stmt);
+  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (control));
+  tree inc_res = gimple_phi_arg_def (phi, back->dest_idx);
+  if (gimple_bb (SSA_NAME_DEF_STMT (inc_res)) != loop->latch)
+    return false;
+
+  /* Check whether there's no code between the loop condition and the latch.  */
+  if (!single_pred_p (loop->latch)
+      || single_pred (loop->latch) != exit->src)
+    return false;
+
+  tree alt_bound = NULL_TREE;
+  tree nit_type = TREE_TYPE (nit);
+
+  /* Figure out whether nit + 1 overflows.  */
+  if (TREE_CODE (nit) == INTEGER_CST)
+    {
+      if (!tree_int_cst_equal (nit, TYPE_MAXVAL (nit_type)))
+	{
+	  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
+				       nit, build_one_cst (nit_type));
+
+	  gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
+	}
+      else
+	{
+	  /* Todo: Figure out if we can trigger this, if it's worth to handle
+	     optimally, and if we can handle it optimally.  */
+	}
+    }
+  else
+    {
+      gcc_assert (TREE_CODE (nit) == SSA_NAME);
+
+      gimple def = SSA_NAME_DEF_STMT (nit);
+
+      if (def
+	  && is_gimple_assign (def)
+	  && gimple_assign_rhs_code (def) == PLUS_EXPR)
+	{
+	  tree op1 = gimple_assign_rhs1 (def);
+	  tree op2 = gimple_assign_rhs2 (def);
+	  if (integer_minus_onep (op1))
+	    alt_bound = op2;
+	  else if (integer_minus_onep (op2))
+	    alt_bound = op1;
+	}
+
+      /* There is a number of test-cases for which we don't get an alt_bound
+	 here: they're listed here, with the lhs of the last stmt as the nit:
+
+	 libgomp.graphite/force-parallel-1.c:
+	 _21 = (signed long) N_6(D);
+	 _19 = _21 + -1;
+	 _7 = (unsigned long) _19;
+
+	 libgomp.graphite/force-parallel-2.c:
+	 _33 = (signed long) N_9(D);
+	 _16 = _33 + -1;
+	 _37 = (unsigned long) _16;
+
+	 libgomp.graphite/force-parallel-5.c:
+	 <bb 6>:
+	 # graphite_IV.5_46 = PHI <0(5), graphite_IV.5_47(11)>
+	 <bb 7>:
+	 _33 = (unsigned long) graphite_IV.5_46;
+
+	 g++.dg/tree-ssa/pr34355.C:
+	 _2 = (unsigned int) i_9;
+	 _3 = 4 - _2;
+
+	 gcc.dg/pr53849.c:
+	 _5 = d.0_11 + -2;
+	 _18 = (unsigned int) _5;
+
+	 We will be able to handle some of these cases, if we can determine when
+	 it's safe to look past casts.  */
+    }
+
+  if (alt_bound == NULL_TREE)
+    return false;
+
+  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
+  return true;
+}
+
+/* Moves the exit condition of LOOP to the beginning of its header.  NIT is the
+   number of iterations of the loop.  REDUCTION_LIST describes the reductions in
+   LOOP.  */
 
 static void
 transform_to_exit_first_loop (struct loop *loop,
@@ -1882,8 +2188,19 @@ gen_parallel_loop (struct loop *loop,
   /* Base all the induction variables in LOOP on a single control one.  */
   canonicalize_loop_ivs (loop, &nit, true);
 
-  /* Ensure that the exit condition is the first statement in the loop.  */
-  transform_to_exit_first_loop (loop, reduction_list, nit);
+  /* Ensure that the exit condition is the first statement in the loop.
+     The common case is that latch of the loop is empty (apart from the
+     increment) and immediately follows the loop exit test.  Attempt to move the
+     entry of the loop directly before the exit check and increase the number of
+     iterations of the loop by one.  */
+  if (!try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
+    {
+      /* Fall back on the method that handles more cases, but duplicates the
+	 loop body: move the exit condition of LOOP to the beginning of its
+	 header, and duplicate the part of the last iteration that gets disabled
+	 to the exit of the loop.  */
+      transform_to_exit_first_loop (loop, reduction_list, nit);
+    }
 
   /* Generate initializations for reductions.  */
   if (reduction_list->elements () > 0)
diff --git a/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c
new file mode 100644
index 0000000..eb5e11f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (void)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    c[i] = a[i] + b[i];
+}
+
+int
+main (void)
+{
+  int i, j;
+
+  /* Complexify loop to inhibit parloops.  */
+  for (j = 0; j < 100; ++j)
+    for (i = 0; i < 10; i++)
+      {
+	int k = i + (10 * j);
+	a[k] = k;
+	b[k] = (k * 3) % 7;
+	c[k] = k * 2;
+      }
+
+  f ();
+
+  for (i = 0; i < N; i++)
+    {
+      unsigned int actual = c[i];
+      unsigned int expected = i + ((i * 3) % 7);
+      if (actual != expected)
+	abort ();
+    }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c
new file mode 100644
index 0000000..b426b3f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2" } */
+
+unsigned int *a;
+
+unsigned int __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+  unsigned int sum = 1;
+
+  for (i = 0; i < n; ++i)
+    sum += a[i];
+
+  return sum;
+}
+
+int
+main (void)
+{
+  unsigned int res;
+  unsigned int array[4000];
+  int i;
+  for (i = 0; i < 4000; ++i)
+    array[i] = i % 7;
+  a = &array[0];
+  res = f (4000);
+  return !(res == 11995);
+}
diff --git a/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c
new file mode 100644
index 0000000..d7d4003
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+
+  for (i = 0; i < n; ++i)
+    c[i] = a[i] + b[i];
+}
+
+int
+main (void)
+{
+  int i, j;
+
+  /* Complexify loop to inhibit parloops.  */
+  for (j = 0; j < 100; ++j)
+    for (i = 0; i < 10; i++)
+      {
+	int k = i + (10 * j);
+	a[k] = k;
+	b[k] = (k * 3) % 7;
+	c[k] = k * 2;
+      }
+
+  f (N);
+
+  for (i = 0; i < N; i++)
+    {
+      unsigned int actual = c[i];
+      unsigned int expected = i + ((i * 3) % 7);
+      if (actual != expected)
+	abort ();
+    }
+
+  return 0;
+}
-- 
1.9.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PING][PATCH][PR65443] Add transform_to_exit_first_loop_alt
  2015-04-15 13:39   ` [PING][PATCH][PR65443] " Tom de Vries
@ 2015-04-20 12:26     ` Richard Biener
  2015-05-14 11:47       ` Tom de Vries
  0 siblings, 1 reply; 13+ messages in thread
From: Richard Biener @ 2015-04-20 12:26 UTC (permalink / raw)
  To: Tom de Vries; +Cc: GCC Patches, Jakub Jelinek

On Wed, 15 Apr 2015, Tom de Vries wrote:

> On 03-04-15 14:39, Tom de Vries wrote:
> > On 27-03-15 15:10, Tom de Vries wrote:
> > > Hi,
> > > 
> > > this patch fixes PR65443, a todo in the parloops pass for function
> > > transform_to_exit_first_loop:
> > > ...
> > >     TODO: the common case is that latch of the loop is empty and
> > > immediately
> > >     follows the loop exit.  In this case, it would be better not to copy
> > > the
> > >     body of the loop, but only move the entry of the loop directly before
> > > the
> > >     exit check and increase the number of iterations of the loop by one.
> > >     This may need some additional preconditioning in case NIT = ~0.
> > > ...
> > > 
> > > The current implementation of transform_to_exit_first_loop transforms the
> > > loop
> > > by moving and duplicating the loop body. This patch transforms the loop
> > > into the
> > > same normal form, but without the duplication, if that's possible
> > > (determined by
> > > try_transform_to_exit_first_loop_alt).
> > > 
> > > The actual transformation, done by transform_to_exit_first_loop_alt
> > > transforms
> > > this loop:
> > > ...
> > >       bb preheader:
> > >       ...
> > >       goto <bb header>
> > > 
> > >       <bb header>:
> > >       ...
> > >       if (ivtmp < n)
> > >         goto <bb latch>;
> > >       else
> > >         goto <bb exit>;
> > > 
> > >       <bb latch>:
> > >       ivtmp = ivtmp + inc;
> > >       goto <bb header>
> > > ...
> > > 
> > > into this one:
> > > ...
> > >       bb preheader:
> > >       ...
> > >       goto <bb newheader>
> > > 
> > >       <bb header>:
> > >       ...
> > >       goto <bb latch>;
> > > 
> > >       <bb newheader>:
> > >       if (ivtmp < n + 1)
> > >         goto <bb header>;
> > >       else
> > >         goto <bb exit>;
> > > 
> > >       <bb latch>:
> > >       ivtmp = ivtmp + inc;
> > >       goto <bb newheader>
> > > ...
> > > 
> > 
> > Updated patch, now using redirect_edge_var_map and flush_pending_stmts.
> > 
> > Bootstrapped and reg-tested on x86_64.
> > 
> > OK for stage1 trunk?
> > 
> 
> Ping.

+static void
+replace_imm_uses (tree val, imm_use_iterator *imm_iter)
+{
+  use_operand_p use_p;
+
+  FOR_EACH_IMM_USE_ON_STMT (use_p, *imm_iter)
+    SET_USE (use_p, val);

Use propagate_value.  Why this odd interface passing a imm_iter?!

In fact most of the "repair SSA" in transform_to_exit_first_loop_alt
looks too ad-hoc to me ... it also looks somewhat familiar to other
code ...

Unfortunately the comment before the function isn't in SSA form
so it's hard to follow the transform.

I consider the parloops code bitrotten, no repair possible, so
I might be convinced to not care about new spaghetti in there...

+  /* Fix up loop structure.  TODO: Check whether this is sufficient.  */
+  loop->header = new_header;
+

no, surely not.  Number of iterations (and estimates) are off
after the transform and loop->latch might also need updating.

"Safest" is to simply schedule a fixup (but you'll lose any
loop annotation in that process).

+  /* Figure out whether nit + 1 overflows.  */
+  if (TREE_CODE (nit) == INTEGER_CST)
+    {
+      if (!tree_int_cst_equal (nit, TYPE_MAXVAL (nit_type)))

in case nit_type is a pointer type TYPE_MAXVAL will be NULL I think.

Is the whole exercise for performance?  In that case using an
entirely new, unsigned IV, that runs from 0 to niter should
be easiest and just run-time guard that niter == +INF case?

For the graphite case, can't you make graphite generate the
loops exit-first in the first place?

The testcases are just correctness ones?  That is, there isn't
any testcase that checks whether the new code is exercised?
(no extra debugging dumping?)

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PING][PATCH][PR65443] Add transform_to_exit_first_loop_alt
  2015-04-20 12:26     ` Richard Biener
@ 2015-05-14 11:47       ` Tom de Vries
  2015-05-26 11:05         ` Richard Biener
  2015-06-01  8:12         ` [gomp4,committed,PR65443] Add transform_to_exit_first_loop_al Tom de Vries
  0 siblings, 2 replies; 13+ messages in thread
From: Tom de Vries @ 2015-05-14 11:47 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches, Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 6357 bytes --]

On 20-04-15 14:25, Richard Biener wrote:
> On Wed, 15 Apr 2015, Tom de Vries wrote:
>
>> On 03-04-15 14:39, Tom de Vries wrote:
>>> On 27-03-15 15:10, Tom de Vries wrote:
>>>> Hi,
>>>>
>>>> this patch fixes PR65443, a todo in the parloops pass for function
>>>> transform_to_exit_first_loop:
>>>> ...
>>>>      TODO: the common case is that latch of the loop is empty and
>>>> immediately
>>>>      follows the loop exit.  In this case, it would be better not to copy
>>>> the
>>>>      body of the loop, but only move the entry of the loop directly before
>>>> the
>>>>      exit check and increase the number of iterations of the loop by one.
>>>>      This may need some additional preconditioning in case NIT = ~0.
>>>> ...
>>>>
>>>> The current implementation of transform_to_exit_first_loop transforms the
>>>> loop
>>>> by moving and duplicating the loop body. This patch transforms the loop
>>>> into the
>>>> same normal form, but without the duplication, if that's possible
>>>> (determined by
>>>> try_transform_to_exit_first_loop_alt).
>>>>
>>>> The actual transformation, done by transform_to_exit_first_loop_alt
>>>> transforms
>>>> this loop:
>>>> ...
>>>>        bb preheader:
>>>>        ...
>>>>        goto <bb header>
>>>>
>>>>        <bb header>:
>>>>        ...
>>>>        if (ivtmp < n)
>>>>          goto <bb latch>;
>>>>        else
>>>>          goto <bb exit>;
>>>>
>>>>        <bb latch>:
>>>>        ivtmp = ivtmp + inc;
>>>>        goto <bb header>
>>>> ...
>>>>
>>>> into this one:
>>>> ...
>>>>        bb preheader:
>>>>        ...
>>>>        goto <bb newheader>
>>>>
>>>>        <bb header>:
>>>>        ...
>>>>        goto <bb latch>;
>>>>
>>>>        <bb newheader>:
>>>>        if (ivtmp < n + 1)
>>>>          goto <bb header>;
>>>>        else
>>>>          goto <bb exit>;
>>>>
>>>>        <bb latch>:
>>>>        ivtmp = ivtmp + inc;
>>>>        goto <bb newheader>
>>>> ...
>>>>
>>>
>>> Updated patch, now using redirect_edge_var_map and flush_pending_stmts.
>>>
>>> Bootstrapped and reg-tested on x86_64.
>>>
>>> OK for stage1 trunk?
>>>
>>
>> Ping.
>

Hi Richard,

thanks for the review.

> +static void
> +replace_imm_uses (tree val, imm_use_iterator *imm_iter)
> +{
> +  use_operand_p use_p;
> +
> +  FOR_EACH_IMM_USE_ON_STMT (use_p, *imm_iter)
> +    SET_USE (use_p, val);
>
> Use propagate_value.  Why this odd interface passing a imm_iter?!
>

The replace_imm_uses function is common code factored out of 
replace_uses_in_bb_by and replace_uses_in_bbs_by. I'm not sure what is odd about 
the interface of the replace_imm_uses function, but I've removed the function.

I tried using propagate_value, but that one doesn't like virtuals.

> In fact most of the "repair SSA" in transform_to_exit_first_loop_alt
> looks too ad-hoc to me ... it also looks somewhat familiar to other
> code ...
>
> Unfortunately the comment before the function isn't in SSA form
> so it's hard to follow the transform.
>

I've added the ssa bits in the transformation comment before the function, and 
updated variable names and comments in the function.

> I consider the parloops code bitrotten, no repair possible, so
> I might be convinced to not care about new spaghetti in there...
>
> +  /* Fix up loop structure.  TODO: Check whether this is sufficient.  */
> +  loop->header = new_header;
> +
>
> no, surely not.  Number of iterations (and estimates) are off
> after the transform

The loop is cancelled at the end of gen_parallel_loop, and the estimations are 
removed.  We only seem to be using a limited set of the loop data until the loop 
is cancelled. Updated comment accordingly.

> and loop->latch might also need updating.
>

That one actually stays the same. Updated comment accordingly.

> "Safest" is to simply schedule a fixup (but you'll lose any
> loop annotation in that process).
>

Since the loop is cancelled, AFAIU we don't need that, but... You're referring 
to the annotations mentioned in replace_loop_annotate_in_block? Losing the 
annotations seems to be a generic problem of scheduling such a fixup, not 
particular to this patch. Do you have a concern related to the patch and these 
annotations?

> +  /* Figure out whether nit + 1 overflows.  */
> +  if (TREE_CODE (nit) == INTEGER_CST)
> +    {
> +      if (!tree_int_cst_equal (nit, TYPE_MAXVAL (nit_type)))
>
> in case nit_type is a pointer type TYPE_MAXVAL will be NULL I think.
>

We've done ivcanon just before this point, so AFAIU we can assume we're dealing 
with an unsigned integer.

> Is the whole exercise for performance?

I'm trying to fix the todo in the code for parloops, which is about performance, 
though I don't expect a large gain.

OTOH, my main focus is a related oacc kernels issue. For an oacc kernels region, 
the entire loop is enclosed in a kernels region. Peeling of the last iteration 
means I have to guard that iteration to run on only one thread, which currently 
isn't done, so in that sense it's a correctness issue as well.

> In that case using an
> entirely new, unsigned IV, that runs from 0 to niter should be easiest and

Introducting such an IV would mean that we don't have to update the IV, but 
still we have to connect the new IV to the uses of the old IV.

The current special handling of the IV variable is actually not that elaborate:
...
+      /* Replace ivtmp_a with ivtmp_c in condition 'if (ivtmp_a < n)'.  */
+      replace_uses_in_bb_by (res_a, res_c, new_header);
...
So I'm not sure it's easier.

just run-time guard that niter == +INF case?

That doesn't work out nicely for the oacc kernels case. I need to know 
compile-time wheter I can parallelize or not.

> For the graphite case, can't you make graphite generate the
> loops exit-first in the first place?
>

Perhaps, but that doesn't make a difference for the oacc kernels case.

> The testcases are just correctness ones?  That is, there isn't
> any testcase that checks whether the new code is exercised?
> (no extra debugging dumping?)
>

There are 3 new test patterns, each with a libgomp/testsuite/libgomp.c and a 
gcc/testsuite/gcc.dg variant, so 6 new test-cases in total. The 
gcc/testsuite/gcc.dg variant checks that the new code is exercised. The 
libgomp/testsuite/libgomp.c variant checks that the new code generates correct code.

Reposting updated patch (for brevity, without testcases) below.

Thanks,
- Tom


[-- Attachment #2: 0002-Add-transform_to_exit_first_loop_alt.patch --]
[-- Type: text/x-patch, Size: 14010 bytes --]

Add transform_to_exit_first_loop_alt

2015-04-14  Tom de Vries  <tom@codesourcery.com>

	PR tree-optimization/65443
	* tree-parloops.c (replace_imm_uses, replace_uses_in_bb_by)
	(replace_uses_in_bbs_by, transform_to_exit_first_loop_alt)
	(try_transform_to_exit_first_loop_alt): New function.
	(transform_to_exit_first_loop): Use
	try_transform_to_exit_first_loop_alt.
---
 gcc/tree-parloops.c | 405 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 393 insertions(+), 12 deletions(-)

diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index 080d35e..8168632 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -78,6 +78,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "plugin-api.h"
 #include "ipa-ref.h"
 #include "cgraph.h"
+#include "tree-ssa.h"
 
 /* This pass tries to distribute iterations of loops into several threads.
    The implementation is straightforward -- for each loop we test whether its
@@ -1487,17 +1488,386 @@ create_loop_fn (location_t loc)
   return decl;
 }
 
-/* Moves the exit condition of LOOP to the beginning of its header, and
-   duplicates the part of the last iteration that gets disabled to the
-   exit of the loop.  NIT is the number of iterations of the loop
-   (used to initialize the variables in the duplicated part).
+/* Replace uses of NAME by VAL in block BB.  */
 
-   TODO: the common case is that latch of the loop is empty and immediately
-   follows the loop exit.  In this case, it would be better not to copy the
-   body of the loop, but only move the entry of the loop directly before the
-   exit check and increase the number of iterations of the loop by one.
-   This may need some additional preconditioning in case NIT = ~0.
-   REDUCTION_LIST describes the reductions in LOOP.  */
+static void
+replace_uses_in_bb_by (tree name, tree val, basic_block bb)
+{
+  gimple use_stmt;
+  imm_use_iterator imm_iter;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
+    {
+      if (gimple_bb (use_stmt) != bb)
+	continue;
+
+      use_operand_p use_p;
+      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+	SET_USE (use_p, val);
+    }
+}
+
+/* Replace uses of NAME by VAL in blocks BBS.  */
+
+static void
+replace_uses_in_bbs_by (tree name, tree val, bitmap bbs)
+{
+  gimple use_stmt;
+  imm_use_iterator imm_iter;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
+    {
+      if (!bitmap_bit_p (bbs, gimple_bb (use_stmt)->index))
+	continue;
+
+      use_operand_p use_p;
+      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+	SET_USE (use_p, val);
+    }
+}
+
+/* Do transformation from:
+
+     <bb preheader>:
+     ...
+     goto <bb header>
+
+     <bb header>:
+     ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
+     sum_a = PHI <sum_init (preheader), sum_b (latch)>
+     ...
+     use (ivtmp_a)
+     ...
+     sum_b = sum_a + sum_update
+     ...
+     if (ivtmp_a < n)
+       goto <bb latch>;
+     else
+       goto <bb exit>;
+
+     <bb latch>:
+     ivtmp_b = ivtmp_a + 1;
+     goto <bb header>
+
+     <bb exit>:
+     sum_z = PHI <sum_b (cond[1])>
+
+     [1] Where <bb cond> is single_pred (bb latch); In the simplest case,
+	 that's <bb header>.
+
+   to:
+
+     <bb preheader>:
+     ...
+     goto <bb newheader>
+
+     <bb header>:
+     ivtmp_a = PHI <ivtmp_c (latch)>
+     sum_a = PHI <sum_c (latch)>
+     ...
+     use (ivtmp_a)
+     ...
+     sum_b = sum_a + sum_update
+     ...
+     goto <bb latch>;
+
+     <bb newheader>:
+     ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
+     sum_c = PHI <sum_init (preheader), sum_b (latch)>
+     if (ivtmp_c < n + 1)
+       goto <bb header>;
+     else
+       goto <bb exit>;
+
+     <bb latch>:
+     ivtmp_b = ivtmp_a + 1;
+     goto <bb newheader>
+
+     <bb exit>:
+     sum_z = PHI <sum_c (newheader)>
+
+
+   In unified diff format:
+
+      <bb preheader>:
+      ...
+-     goto <bb header>
++     goto <bb newheader>
+
+      <bb header>:
+-     ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
+-     sum_a = PHI <sum_init (preheader), sum_b (latch)>
++     ivtmp_a = PHI <ivtmp_c (latch)>
++     sum_a = PHI <sum_c (latch)>
+      ...
+      use (ivtmp_a)
+      ...
+      sum_b = sum_a + sum_update
+      ...
+-     if (ivtmp_a < n)
+-       goto <bb latch>;
++     goto <bb latch>;
++
++     <bb newheader>:
++     ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
++     sum_c = PHI <sum_init (preheader), sum_b (latch)>
++     if (ivtmp_c < n + 1)
++       goto <bb header>;
+      else
+	goto <bb exit>;
+
+      <bb latch>:
+      ivtmp_b = ivtmp_a + 1;
+-     goto <bb header>
++     goto <bb newheader>
+
+      <bb exit>:
+-     sum_z = PHI <sum_b (cond[1])>
++     sum_z = PHI <sum_c (newheader)>
+
+   Note: the example does not show any virtual phis, but these are handled more
+   or less as reductions.
+
+
+   Moves the exit condition of LOOP to the beginning of its header.
+   REDUCTION_LIST describes the reductions in LOOP.  BOUND is the new loop
+   bound.  */
+
+static void
+transform_to_exit_first_loop_alt (struct loop *loop,
+				  reduction_info_table_type *reduction_list,
+				  tree bound)
+{
+  basic_block header = loop->header;
+  basic_block latch = loop->latch;
+  edge exit = single_dom_exit (loop);
+  basic_block exit_block = exit->dest;
+  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
+  tree control = gimple_cond_lhs (cond_stmt);
+  edge e;
+
+  /* Gather the bbs dominated by the exit block.  */
+  bitmap exit_dominated = BITMAP_ALLOC (NULL);
+  bitmap_set_bit (exit_dominated, exit_block->index);
+  vec<basic_block> exit_dominated_vec
+    = get_dominated_by (CDI_DOMINATORS, exit_block);
+
+  int i;
+  basic_block dom_bb;
+  FOR_EACH_VEC_ELT (exit_dominated_vec, i, dom_bb)
+    bitmap_set_bit (exit_dominated, dom_bb->index);
+
+  exit_dominated_vec.release ();
+
+  /* Create the new_header block.  */
+  basic_block new_header = split_block_before_cond_jump (exit->src);
+  edge split_edge = single_pred_edge (new_header);
+
+  /* Redirect entry edge to new_header.  */
+  edge entry = loop_preheader_edge (loop);
+  e = redirect_edge_and_branch (entry, new_header);
+  gcc_assert (e == entry);
+
+  /* Redirect post_inc_edge to new_header.  */
+  edge post_inc_edge = single_succ_edge (latch);
+  e = redirect_edge_and_branch (post_inc_edge, new_header);
+  gcc_assert (e == post_inc_edge);
+
+  /* Redirect post_cond_edge to header.  */
+  edge post_cond_edge = single_pred_edge (latch);
+  e = redirect_edge_and_branch (post_cond_edge, header);
+  gcc_assert (e == post_cond_edge);
+
+  /* Redirect split_edge to latch.  */
+  e = redirect_edge_and_branch (split_edge, latch);
+  gcc_assert (e == split_edge);
+
+  /* Set the new loop bound.  */
+  gimple_cond_set_rhs (cond_stmt, bound);
+
+  /* Repair the ssa.  */
+  vec<edge_var_map> *v = redirect_edge_var_map_vector (post_inc_edge);
+  edge_var_map *vm;
+  gphi_iterator gsi;
+  for (gsi = gsi_start_phis (header), i = 0;
+       !gsi_end_p (gsi) && v->iterate (i, &vm);
+       gsi_next (&gsi), i++)
+    {
+      gphi *phi = gsi.phi ();
+      tree res_a = PHI_RESULT (phi);
+
+      /* Create new phi.  */
+      tree res_c = copy_ssa_name (res_a, phi);
+      gphi *nphi = create_phi_node (res_c, new_header);
+
+      /* Replace ivtmp_a with ivtmp_c in condition 'if (ivtmp_a < n)'.  */
+      replace_uses_in_bb_by (res_a, res_c, new_header);
+
+      /* Replace ivtmp/sum_b with ivtmp/sum_c in header phi.  */
+      add_phi_arg (phi, res_c, post_cond_edge, UNKNOWN_LOCATION);
+
+      /* Replace sum_b with sum_c in exit phi.  Loop-closed ssa does not hold
+	 for virtuals, so we cannot get away with exit_block only.  */
+      tree res_b = redirect_edge_var_map_def (vm);
+      replace_uses_in_bbs_by (res_b, res_c, exit_dominated);
+
+      struct reduction_info *red = reduction_phi (reduction_list, phi);
+      gcc_assert (virtual_operand_p (res_a)
+		  || res_a == control
+		  || red != NULL);
+
+      if (red)
+	{
+	  /* Register the new reduction phi.  */
+	  red->reduc_phi = nphi;
+	  gimple_set_uid (red->reduc_phi, red->reduc_version);
+	}
+    }
+  gcc_assert (gsi_end_p (gsi) && !v->iterate (i, &vm));
+  BITMAP_FREE (exit_dominated);
+
+  /* Set the preheader argument of the new phis to ivtmp/sum_init.  */
+  flush_pending_stmts (entry);
+
+  /* Set the latch arguments of the new phis to ivtmp/sum_b.  */
+  flush_pending_stmts (post_inc_edge);
+
+  /* Register the reduction exit phis.  */
+  for (gphi_iterator gsi = gsi_start_phis (exit_block);
+       !gsi_end_p (gsi);
+       gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      tree res_z = PHI_RESULT (phi);
+      if (virtual_operand_p (res_z))
+	continue;
+
+      tree res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+      gimple reduc_phi = SSA_NAME_DEF_STMT (res_c);
+      struct reduction_info *red = reduction_phi (reduction_list, reduc_phi);
+      if (red != NULL)
+	red->keep_res = phi;
+    }
+
+  /* We're going to cancel the loop at the end of gen_parallel_loop, but until
+     then we're still using some fields, so only bother about fields that are
+     still used: header and latch.
+     The loop has a new header bb, so we update it.  The latch bb stays the
+     same.  */
+  loop->header = new_header;
+
+  /* Recalculate dominance info.  */
+  free_dominance_info (CDI_DOMINATORS);
+  calculate_dominance_info (CDI_DOMINATORS);
+}
+
+/* Tries to moves the exit condition of LOOP to the beginning of its header
+   without duplication of the loop body.  NIT is the number of iterations of the
+   loop.  REDUCTION_LIST describes the reductions in LOOP.  Return true if
+   transformation is successful.  */
+
+static bool
+try_transform_to_exit_first_loop_alt (struct loop *loop,
+				      reduction_info_table_type *reduction_list,
+				      tree nit)
+{
+  /* Check whether the latch contains a single statement.  */
+  if (!gimple_seq_singleton_p (bb_seq (loop->latch)))
+    return true;
+
+  /* Check whether the latch contains the loop iv increment.  */
+  edge back = single_succ_edge (loop->latch);
+  edge exit = single_dom_exit (loop);
+  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
+  tree control = gimple_cond_lhs (cond_stmt);
+  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (control));
+  tree inc_res = gimple_phi_arg_def (phi, back->dest_idx);
+  if (gimple_bb (SSA_NAME_DEF_STMT (inc_res)) != loop->latch)
+    return false;
+
+  /* Check whether there's no code between the loop condition and the latch.  */
+  if (!single_pred_p (loop->latch)
+      || single_pred (loop->latch) != exit->src)
+    return false;
+
+  tree alt_bound = NULL_TREE;
+  tree nit_type = TREE_TYPE (nit);
+
+  /* Figure out whether nit + 1 overflows.  */
+  if (TREE_CODE (nit) == INTEGER_CST)
+    {
+      if (!tree_int_cst_equal (nit, TYPE_MAXVAL (nit_type)))
+	{
+	  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
+				       nit, build_one_cst (nit_type));
+
+	  gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
+	}
+      else
+	{
+	  /* Todo: Figure out if we can trigger this, if it's worth to handle
+	     optimally, and if we can handle it optimally.  */
+	}
+    }
+  else
+    {
+      gcc_assert (TREE_CODE (nit) == SSA_NAME);
+
+      gimple def = SSA_NAME_DEF_STMT (nit);
+
+      if (def
+	  && is_gimple_assign (def)
+	  && gimple_assign_rhs_code (def) == PLUS_EXPR)
+	{
+	  tree op1 = gimple_assign_rhs1 (def);
+	  tree op2 = gimple_assign_rhs2 (def);
+	  if (integer_minus_onep (op1))
+	    alt_bound = op2;
+	  else if (integer_minus_onep (op2))
+	    alt_bound = op1;
+	}
+
+      /* There is a number of test-cases for which we don't get an alt_bound
+	 here: they're listed here, with the lhs of the last stmt as the nit:
+
+	 libgomp.graphite/force-parallel-1.c:
+	 _21 = (signed long) N_6(D);
+	 _19 = _21 + -1;
+	 _7 = (unsigned long) _19;
+
+	 libgomp.graphite/force-parallel-2.c:
+	 _33 = (signed long) N_9(D);
+	 _16 = _33 + -1;
+	 _37 = (unsigned long) _16;
+
+	 libgomp.graphite/force-parallel-5.c:
+	 <bb 6>:
+	 # graphite_IV.5_46 = PHI <0(5), graphite_IV.5_47(11)>
+	 <bb 7>:
+	 _33 = (unsigned long) graphite_IV.5_46;
+
+	 g++.dg/tree-ssa/pr34355.C:
+	 _2 = (unsigned int) i_9;
+	 _3 = 4 - _2;
+
+	 gcc.dg/pr53849.c:
+	 _5 = d.0_11 + -2;
+	 _18 = (unsigned int) _5;
+
+	 We will be able to handle some of these cases, if we can determine when
+	 it's safe to look past casts.  */
+    }
+
+  if (alt_bound == NULL_TREE)
+    return false;
+
+  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
+  return true;
+}
+
+/* Moves the exit condition of LOOP to the beginning of its header.  NIT is the
+   number of iterations of the loop.  REDUCTION_LIST describes the reductions in
+   LOOP.  */
 
 static void
 transform_to_exit_first_loop (struct loop *loop,
@@ -1882,8 +2252,19 @@ gen_parallel_loop (struct loop *loop,
   /* Base all the induction variables in LOOP on a single control one.  */
   canonicalize_loop_ivs (loop, &nit, true);
 
-  /* Ensure that the exit condition is the first statement in the loop.  */
-  transform_to_exit_first_loop (loop, reduction_list, nit);
+  /* Ensure that the exit condition is the first statement in the loop.
+     The common case is that latch of the loop is empty (apart from the
+     increment) and immediately follows the loop exit test.  Attempt to move the
+     entry of the loop directly before the exit check and increase the number of
+     iterations of the loop by one.  */
+  if (!try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
+    {
+      /* Fall back on the method that handles more cases, but duplicates the
+	 loop body: move the exit condition of LOOP to the beginning of its
+	 header, and duplicate the part of the last iteration that gets disabled
+	 to the exit of the loop.  */
+      transform_to_exit_first_loop (loop, reduction_list, nit);
+    }
 
   /* Generate initializations for reductions.  */
   if (reduction_list->elements () > 0)
-- 
1.9.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PING][PATCH][PR65443] Add transform_to_exit_first_loop_alt
  2015-05-14 11:47       ` Tom de Vries
@ 2015-05-26 11:05         ` Richard Biener
  2015-06-04  8:37           ` Tom de Vries
  2015-06-01  8:12         ` [gomp4,committed,PR65443] Add transform_to_exit_first_loop_al Tom de Vries
  1 sibling, 1 reply; 13+ messages in thread
From: Richard Biener @ 2015-05-26 11:05 UTC (permalink / raw)
  To: Tom de Vries; +Cc: GCC Patches, Jakub Jelinek

On Thu, 14 May 2015, Tom de Vries wrote:

> On 20-04-15 14:25, Richard Biener wrote:
> > On Wed, 15 Apr 2015, Tom de Vries wrote:
> > 
> > > On 03-04-15 14:39, Tom de Vries wrote:
> > > > On 27-03-15 15:10, Tom de Vries wrote:
> > > > > Hi,
> > > > > 
> > > > > this patch fixes PR65443, a todo in the parloops pass for function
> > > > > transform_to_exit_first_loop:
> > > > > ...
> > > > >      TODO: the common case is that latch of the loop is empty and
> > > > > immediately
> > > > >      follows the loop exit.  In this case, it would be better not to
> > > > > copy
> > > > > the
> > > > >      body of the loop, but only move the entry of the loop directly
> > > > > before
> > > > > the
> > > > >      exit check and increase the number of iterations of the loop by
> > > > > one.
> > > > >      This may need some additional preconditioning in case NIT = ~0.
> > > > > ...
> > > > > 
> > > > > The current implementation of transform_to_exit_first_loop transforms
> > > > > the
> > > > > loop
> > > > > by moving and duplicating the loop body. This patch transforms the
> > > > > loop
> > > > > into the
> > > > > same normal form, but without the duplication, if that's possible
> > > > > (determined by
> > > > > try_transform_to_exit_first_loop_alt).
> > > > > 
> > > > > The actual transformation, done by transform_to_exit_first_loop_alt
> > > > > transforms
> > > > > this loop:
> > > > > ...
> > > > >        bb preheader:
> > > > >        ...
> > > > >        goto <bb header>
> > > > > 
> > > > >        <bb header>:
> > > > >        ...
> > > > >        if (ivtmp < n)
> > > > >          goto <bb latch>;
> > > > >        else
> > > > >          goto <bb exit>;
> > > > > 
> > > > >        <bb latch>:
> > > > >        ivtmp = ivtmp + inc;
> > > > >        goto <bb header>
> > > > > ...
> > > > > 
> > > > > into this one:
> > > > > ...
> > > > >        bb preheader:
> > > > >        ...
> > > > >        goto <bb newheader>
> > > > > 
> > > > >        <bb header>:
> > > > >        ...
> > > > >        goto <bb latch>;
> > > > > 
> > > > >        <bb newheader>:
> > > > >        if (ivtmp < n + 1)
> > > > >          goto <bb header>;
> > > > >        else
> > > > >          goto <bb exit>;
> > > > > 
> > > > >        <bb latch>:
> > > > >        ivtmp = ivtmp + inc;
> > > > >        goto <bb newheader>
> > > > > ...
> > > > > 
> > > > 
> > > > Updated patch, now using redirect_edge_var_map and flush_pending_stmts.
> > > > 
> > > > Bootstrapped and reg-tested on x86_64.
> > > > 
> > > > OK for stage1 trunk?
> > > > 
> > > 
> > > Ping.
> > 
> 
> Hi Richard,
> 
> thanks for the review.

Sorry for the delay (too many things to review, too much other work...)

> > +static void
> > +replace_imm_uses (tree val, imm_use_iterator *imm_iter)
> > +{
> > +  use_operand_p use_p;
> > +
> > +  FOR_EACH_IMM_USE_ON_STMT (use_p, *imm_iter)
> > +    SET_USE (use_p, val);
> > 
> > Use propagate_value.  Why this odd interface passing a imm_iter?!
> > 
> 
> The replace_imm_uses function is common code factored out of
> replace_uses_in_bb_by and replace_uses_in_bbs_by. I'm not sure what is odd
> about the interface of the replace_imm_uses function, but I've removed the
> function.
> 
> I tried using propagate_value, but that one doesn't like virtuals.
>
> > In fact most of the "repair SSA" in transform_to_exit_first_loop_alt
> > looks too ad-hoc to me ... it also looks somewhat familiar to other
> > code ...
> > 
> > Unfortunately the comment before the function isn't in SSA form
> > so it's hard to follow the transform.
> > 
> 
> I've added the ssa bits in the transformation comment before the function, and
> updated variable names and comments in the function.
> 
> > I consider the parloops code bitrotten, no repair possible, so
> > I might be convinced to not care about new spaghetti in there...
> > 
> > +  /* Fix up loop structure.  TODO: Check whether this is sufficient.  */
> > +  loop->header = new_header;
> > +
> > 
> > no, surely not.  Number of iterations (and estimates) are off
> > after the transform
> 
> The loop is cancelled at the end of gen_parallel_loop, and the estimations are
> removed.  We only seem to be using a limited set of the loop data until the
> loop is cancelled. Updated comment accordingly.
> 
> > and loop->latch might also need updating.
> > 
> 
> That one actually stays the same. Updated comment accordingly.
> 
> > "Safest" is to simply schedule a fixup (but you'll lose any
> > loop annotation in that process).
> > 
> 
> Since the loop is cancelled, AFAIU we don't need that, but... You're 
> referring to the annotations mentioned in 
> replace_loop_annotate_in_block? Losing the annotations seems to be a 
> generic problem of scheduling such a fixup, not particular to this 
> patch. Do you have a concern related to the patch and these annotations?

For example such annotations (or others added by OpenMP like the
safe_len stuff).  Generally fixups preserve those _iff_ the loop
header stays the same.  So in your case doing

  loop->header = new_header;
  set_loops_state (LOOPS_NEED_FIXUP);

would probably "work".  But yes, if the loop is cancelled anyway
there is no point jumping through hoops.

> > +  /* Figure out whether nit + 1 overflows.  */
> > +  if (TREE_CODE (nit) == INTEGER_CST)
> > +    {
> > +      if (!tree_int_cst_equal (nit, TYPE_MAXVAL (nit_type)))
> > 
> > in case nit_type is a pointer type TYPE_MAXVAL will be NULL I think.
> > 
> 
> We've done ivcanon just before this point, so AFAIU we can assume we're
> dealing with an unsigned integer.
> 
> > Is the whole exercise for performance?
> 
> I'm trying to fix the todo in the code for parloops, which is about
> performance, though I don't expect a large gain.
> 
> OTOH, my main focus is a related oacc kernels issue. For an oacc kernels
> region, the entire loop is enclosed in a kernels region. Peeling of the last
> iteration means I have to guard that iteration to run on only one thread,
> which currently isn't done, so in that sense it's a correctness issue as well.

But as the patch doesn't cover all cases you still have a correctness 
issue to solve, no?  It seems to me the last iteration should simply
_not_ be in the oacc kernels region?

> > In that case using an
> > entirely new, unsigned IV, that runs from 0 to niter should be easiest and
> 
> Introducting such an IV would mean that we don't have to update the IV, but
> still we have to connect the new IV to the uses of the old IV.
> 
> The current special handling of the IV variable is actually not that
> elaborate:
> ...
> +      /* Replace ivtmp_a with ivtmp_c in condition 'if (ivtmp_a < n)'.  */
> +      replace_uses_in_bb_by (res_a, res_c, new_header);
> ...
> So I'm not sure it's easier.
> 
> just run-time guard that niter == +INF case?
> 
> That doesn't work out nicely for the oacc kernels case. I need to know
> compile-time wheter I can parallelize or not.

Hmm, I see.

> > For the graphite case, can't you make graphite generate the
> > loops exit-first in the first place?
> > 
> 
> Perhaps, but that doesn't make a difference for the oacc kernels case.
> 
> > The testcases are just correctness ones?  That is, there isn't
> > any testcase that checks whether the new code is exercised?
> > (no extra debugging dumping?)
> > 
> 
> There are 3 new test patterns, each with a libgomp/testsuite/libgomp.c and a
> gcc/testsuite/gcc.dg variant, so 6 new test-cases in total. The
> gcc/testsuite/gcc.dg variant checks that the new code is exercised. The
> libgomp/testsuite/libgomp.c variant checks that the new code generates correct
> code.
> 
> Reposting updated patch (for brevity, without testcases) below.

I'm ok with the patch and count on you to fix eventual fallout ;)

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [gomp4,committed,PR65443] Add transform_to_exit_first_loop_al
  2015-05-14 11:47       ` Tom de Vries
  2015-05-26 11:05         ` Richard Biener
@ 2015-06-01  8:12         ` Tom de Vries
  1 sibling, 0 replies; 13+ messages in thread
From: Tom de Vries @ 2015-06-01  8:12 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches, Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 6697 bytes --]

[ was: Re: [PING][PATCH][PR65443] Add transform_to_exit_first_loop_alt ]

On 14/05/15 13:42, Tom de Vries wrote:
> On 20-04-15 14:25, Richard Biener wrote:
>> On Wed, 15 Apr 2015, Tom de Vries wrote:
>>
>>> On 03-04-15 14:39, Tom de Vries wrote:
>>>> On 27-03-15 15:10, Tom de Vries wrote:
>>>>> Hi,
>>>>>
>>>>> this patch fixes PR65443, a todo in the parloops pass for function
>>>>> transform_to_exit_first_loop:
>>>>> ...
>>>>>      TODO: the common case is that latch of the loop is empty and
>>>>> immediately
>>>>>      follows the loop exit.  In this case, it would be better not
>>>>> to copy
>>>>> the
>>>>>      body of the loop, but only move the entry of the loop directly
>>>>> before
>>>>> the
>>>>>      exit check and increase the number of iterations of the loop
>>>>> by one.
>>>>>      This may need some additional preconditioning in case NIT = ~0.
>>>>> ...
>>>>>
>>>>> The current implementation of transform_to_exit_first_loop
>>>>> transforms the
>>>>> loop
>>>>> by moving and duplicating the loop body. This patch transforms the
>>>>> loop
>>>>> into the
>>>>> same normal form, but without the duplication, if that's possible
>>>>> (determined by
>>>>> try_transform_to_exit_first_loop_alt).
>>>>>
>>>>> The actual transformation, done by transform_to_exit_first_loop_alt
>>>>> transforms
>>>>> this loop:
>>>>> ...
>>>>>        bb preheader:
>>>>>        ...
>>>>>        goto <bb header>
>>>>>
>>>>>        <bb header>:
>>>>>        ...
>>>>>        if (ivtmp < n)
>>>>>          goto <bb latch>;
>>>>>        else
>>>>>          goto <bb exit>;
>>>>>
>>>>>        <bb latch>:
>>>>>        ivtmp = ivtmp + inc;
>>>>>        goto <bb header>
>>>>> ...
>>>>>
>>>>> into this one:
>>>>> ...
>>>>>        bb preheader:
>>>>>        ...
>>>>>        goto <bb newheader>
>>>>>
>>>>>        <bb header>:
>>>>>        ...
>>>>>        goto <bb latch>;
>>>>>
>>>>>        <bb newheader>:
>>>>>        if (ivtmp < n + 1)
>>>>>          goto <bb header>;
>>>>>        else
>>>>>          goto <bb exit>;
>>>>>
>>>>>        <bb latch>:
>>>>>        ivtmp = ivtmp + inc;
>>>>>        goto <bb newheader>
>>>>> ...
>>>>>
>>>>
>>>> Updated patch, now using redirect_edge_var_map and flush_pending_stmts.
>>>>
>>>> Bootstrapped and reg-tested on x86_64.
>>>>
>>>> OK for stage1 trunk?
>>>>
>>>
>>> Ping.
>>
>
> Hi Richard,
>
> thanks for the review.
>
>> +static void
>> +replace_imm_uses (tree val, imm_use_iterator *imm_iter)
>> +{
>> +  use_operand_p use_p;
>> +
>> +  FOR_EACH_IMM_USE_ON_STMT (use_p, *imm_iter)
>> +    SET_USE (use_p, val);
>>
>> Use propagate_value.  Why this odd interface passing a imm_iter?!
>>
>
> The replace_imm_uses function is common code factored out of
> replace_uses_in_bb_by and replace_uses_in_bbs_by. I'm not sure what is
> odd about the interface of the replace_imm_uses function, but I've
> removed the function.
>
> I tried using propagate_value, but that one doesn't like virtuals.
>
>> In fact most of the "repair SSA" in transform_to_exit_first_loop_alt
>> looks too ad-hoc to me ... it also looks somewhat familiar to other
>> code ...
>>
>> Unfortunately the comment before the function isn't in SSA form
>> so it's hard to follow the transform.
>>
>
> I've added the ssa bits in the transformation comment before the
> function, and updated variable names and comments in the function.
>
>> I consider the parloops code bitrotten, no repair possible, so
>> I might be convinced to not care about new spaghetti in there...
>>
>> +  /* Fix up loop structure.  TODO: Check whether this is sufficient.  */
>> +  loop->header = new_header;
>> +
>>
>> no, surely not.  Number of iterations (and estimates) are off
>> after the transform
>
> The loop is cancelled at the end of gen_parallel_loop, and the
> estimations are removed.  We only seem to be using a limited set of the
> loop data until the loop is cancelled. Updated comment accordingly.
>
>> and loop->latch might also need updating.
>>
>
> That one actually stays the same. Updated comment accordingly.
>
>> "Safest" is to simply schedule a fixup (but you'll lose any
>> loop annotation in that process).
>>
>
> Since the loop is cancelled, AFAIU we don't need that, but... You're
> referring to the annotations mentioned in
> replace_loop_annotate_in_block? Losing the annotations seems to be a
> generic problem of scheduling such a fixup, not particular to this
> patch. Do you have a concern related to the patch and these annotations?
>
>> +  /* Figure out whether nit + 1 overflows.  */
>> +  if (TREE_CODE (nit) == INTEGER_CST)
>> +    {
>> +      if (!tree_int_cst_equal (nit, TYPE_MAXVAL (nit_type)))
>>
>> in case nit_type is a pointer type TYPE_MAXVAL will be NULL I think.
>>
>
> We've done ivcanon just before this point, so AFAIU we can assume we're
> dealing with an unsigned integer.
>
>> Is the whole exercise for performance?
>
> I'm trying to fix the todo in the code for parloops, which is about
> performance, though I don't expect a large gain.
>
> OTOH, my main focus is a related oacc kernels issue. For an oacc kernels
> region, the entire loop is enclosed in a kernels region. Peeling of the
> last iteration means I have to guard that iteration to run on only one
> thread, which currently isn't done, so in that sense it's a correctness
> issue as well.
>
>> In that case using an
>> entirely new, unsigned IV, that runs from 0 to niter should be easiest
>> and
>
> Introducting such an IV would mean that we don't have to update the IV,
> but still we have to connect the new IV to the uses of the old IV.
>
> The current special handling of the IV variable is actually not that
> elaborate:
> ...
> +      /* Replace ivtmp_a with ivtmp_c in condition 'if (ivtmp_a < n)'.  */
> +      replace_uses_in_bb_by (res_a, res_c, new_header);
> ...
> So I'm not sure it's easier.
>
> just run-time guard that niter == +INF case?
>
> That doesn't work out nicely for the oacc kernels case. I need to know
> compile-time wheter I can parallelize or not.
>
>> For the graphite case, can't you make graphite generate the
>> loops exit-first in the first place?
>>
>
> Perhaps, but that doesn't make a difference for the oacc kernels case.
>
>> The testcases are just correctness ones?  That is, there isn't
>> any testcase that checks whether the new code is exercised?
>> (no extra debugging dumping?)
>>
>
> There are 3 new test patterns, each with a libgomp/testsuite/libgomp.c
> and a gcc/testsuite/gcc.dg variant, so 6 new test-cases in total. The
> gcc/testsuite/gcc.dg variant checks that the new code is exercised. The
> libgomp/testsuite/libgomp.c variant checks that the new code generates
> correct code.
>

Committed to gomp-4_0-branch.

Thanks,
- Tom


[-- Attachment #2: 0001-Add-transform_to_exit_first_loop_alt.patch --]
[-- Type: text/x-patch, Size: 19918 bytes --]

Add transform_to_exit_first_loop_alt

2015-05-28  Tom de Vries  <tom@codesourcery.com>

	PR tree-optimization/65443
	* tree-parloops.c (replace_imm_uses, replace_uses_in_bb_by)
	(replace_uses_in_bbs_by, transform_to_exit_first_loop_alt)
	(try_transform_to_exit_first_loop_alt): New function.
	(transform_to_exit_first_loop): Use
	try_transform_to_exit_first_loop_alt.

	* gcc.dg/parloops-exit-first-loop-alt-2.c: New test.
	* gcc.dg/parloops-exit-first-loop-alt-3.c: New test.
	* gcc.dg/parloops-exit-first-loop-alt.c: New test.

	* testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c: New test.
	* testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c: New test.
	* testsuite/libgomp.c/parloops-exit-first-loop-alt.c: New test.


diff --git a/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c
new file mode 100644
index 0000000..a4d6cb2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-2.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target pthread } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2 -fdump-tree-parloops" } */
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+
+    for (i = 0; i < N; ++i)
+      c[i] = a[i] + b[i];
+}
+
+/* Three times three array accesses:
+   - three in f._loopfn.0
+   - three in the parallel
+   - three in the low iteration count loop
+   Crucially, none for a peeled off last iteration following the parallel.  */
+/* { dg-final { scan-tree-dump-times "(?n)\\\[i" 9 "parloops" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops" } } */
diff --git a/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c
new file mode 100644
index 0000000..c7ab51f88
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt-3.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target pthread } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2 -fdump-tree-parloops" } */
+
+unsigned int *a;
+
+unsigned int __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+  unsigned int sum = 1;
+
+  for (i = 0; i < n; ++i)
+    sum += a[i];
+
+  return sum;
+}
+
+/* Three array accesses:
+   - one in f._loopfn.0
+   - one in the parallel
+   - one in the low iteration count loop
+   Crucially, none for a peeled off last iteration following the parallel.  */
+/* { dg-final { scan-tree-dump-times "(?n)\\\* 4" 3 "parloops" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops" } } */
diff --git a/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c
new file mode 100644
index 0000000..42ca3ac
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/parloops-exit-first-loop-alt.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target pthread } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2 -fdump-tree-parloops" } */
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+
+  for (i = 0; i < n; ++i)
+    c[i] = a[i] + b[i];
+}
+
+/* Three times three array accesses:
+   - three in f._loopfn.0
+   - three in the parallel
+   - three in the low iteration count loop
+   Crucially, none for a peeled off last iteration following the parallel.  */
+/* { dg-final { scan-tree-dump-times "(?n)\\\[i" 9 "parloops" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops" } } */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index e10179d..f698eea 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -78,6 +78,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "plugin-api.h"
 #include "ipa-ref.h"
 #include "cgraph.h"
+#include "tree-ssa.h"
 
 /* This pass tries to distribute iterations of loops into several threads.
    The implementation is straightforward -- for each loop we test whether its
@@ -1487,17 +1488,386 @@ create_loop_fn (location_t loc)
   return decl;
 }
 
-/* Moves the exit condition of LOOP to the beginning of its header, and
-   duplicates the part of the last iteration that gets disabled to the
-   exit of the loop.  NIT is the number of iterations of the loop
-   (used to initialize the variables in the duplicated part).
+/* Replace uses of NAME by VAL in block BB.  */
 
-   TODO: the common case is that latch of the loop is empty and immediately
-   follows the loop exit.  In this case, it would be better not to copy the
-   body of the loop, but only move the entry of the loop directly before the
-   exit check and increase the number of iterations of the loop by one.
-   This may need some additional preconditioning in case NIT = ~0.
-   REDUCTION_LIST describes the reductions in LOOP.  */
+static void
+replace_uses_in_bb_by (tree name, tree val, basic_block bb)
+{
+  gimple use_stmt;
+  imm_use_iterator imm_iter;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
+    {
+      if (gimple_bb (use_stmt) != bb)
+	continue;
+
+      use_operand_p use_p;
+      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+	SET_USE (use_p, val);
+    }
+}
+
+/* Replace uses of NAME by VAL in blocks BBS.  */
+
+static void
+replace_uses_in_bbs_by (tree name, tree val, bitmap bbs)
+{
+  gimple use_stmt;
+  imm_use_iterator imm_iter;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
+    {
+      if (!bitmap_bit_p (bbs, gimple_bb (use_stmt)->index))
+	continue;
+
+      use_operand_p use_p;
+      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+	SET_USE (use_p, val);
+    }
+}
+
+/* Do transformation from:
+
+     <bb preheader>:
+     ...
+     goto <bb header>
+
+     <bb header>:
+     ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
+     sum_a = PHI <sum_init (preheader), sum_b (latch)>
+     ...
+     use (ivtmp_a)
+     ...
+     sum_b = sum_a + sum_update
+     ...
+     if (ivtmp_a < n)
+       goto <bb latch>;
+     else
+       goto <bb exit>;
+
+     <bb latch>:
+     ivtmp_b = ivtmp_a + 1;
+     goto <bb header>
+
+     <bb exit>:
+     sum_z = PHI <sum_b (cond[1])>
+
+     [1] Where <bb cond> is single_pred (bb latch); In the simplest case,
+	 that's <bb header>.
+
+   to:
+
+     <bb preheader>:
+     ...
+     goto <bb newheader>
+
+     <bb header>:
+     ivtmp_a = PHI <ivtmp_c (latch)>
+     sum_a = PHI <sum_c (latch)>
+     ...
+     use (ivtmp_a)
+     ...
+     sum_b = sum_a + sum_update
+     ...
+     goto <bb latch>;
+
+     <bb newheader>:
+     ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
+     sum_c = PHI <sum_init (preheader), sum_b (latch)>
+     if (ivtmp_c < n + 1)
+       goto <bb header>;
+     else
+       goto <bb exit>;
+
+     <bb latch>:
+     ivtmp_b = ivtmp_a + 1;
+     goto <bb newheader>
+
+     <bb exit>:
+     sum_z = PHI <sum_c (newheader)>
+
+
+   In unified diff format:
+
+      <bb preheader>:
+      ...
+-     goto <bb header>
++     goto <bb newheader>
+
+      <bb header>:
+-     ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
+-     sum_a = PHI <sum_init (preheader), sum_b (latch)>
++     ivtmp_a = PHI <ivtmp_c (latch)>
++     sum_a = PHI <sum_c (latch)>
+      ...
+      use (ivtmp_a)
+      ...
+      sum_b = sum_a + sum_update
+      ...
+-     if (ivtmp_a < n)
+-       goto <bb latch>;
++     goto <bb latch>;
++
++     <bb newheader>:
++     ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
++     sum_c = PHI <sum_init (preheader), sum_b (latch)>
++     if (ivtmp_c < n + 1)
++       goto <bb header>;
+      else
+	goto <bb exit>;
+
+      <bb latch>:
+      ivtmp_b = ivtmp_a + 1;
+-     goto <bb header>
++     goto <bb newheader>
+
+      <bb exit>:
+-     sum_z = PHI <sum_b (cond[1])>
++     sum_z = PHI <sum_c (newheader)>
+
+   Note: the example does not show any virtual phis, but these are handled more
+   or less as reductions.
+
+
+   Moves the exit condition of LOOP to the beginning of its header.
+   REDUCTION_LIST describes the reductions in LOOP.  BOUND is the new loop
+   bound.  */
+
+static void
+transform_to_exit_first_loop_alt (struct loop *loop,
+				  reduction_info_table_type *reduction_list,
+				  tree bound)
+{
+  basic_block header = loop->header;
+  basic_block latch = loop->latch;
+  edge exit = single_dom_exit (loop);
+  basic_block exit_block = exit->dest;
+  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
+  tree control = gimple_cond_lhs (cond_stmt);
+  edge e;
+
+  /* Gather the bbs dominated by the exit block.  */
+  bitmap exit_dominated = BITMAP_ALLOC (NULL);
+  bitmap_set_bit (exit_dominated, exit_block->index);
+  vec<basic_block> exit_dominated_vec
+    = get_dominated_by (CDI_DOMINATORS, exit_block);
+
+  int i;
+  basic_block dom_bb;
+  FOR_EACH_VEC_ELT (exit_dominated_vec, i, dom_bb)
+    bitmap_set_bit (exit_dominated, dom_bb->index);
+
+  exit_dominated_vec.release ();
+
+  /* Create the new_header block.  */
+  basic_block new_header = split_block_before_cond_jump (exit->src);
+  edge split_edge = single_pred_edge (new_header);
+
+  /* Redirect entry edge to new_header.  */
+  edge entry = loop_preheader_edge (loop);
+  e = redirect_edge_and_branch (entry, new_header);
+  gcc_assert (e == entry);
+
+  /* Redirect post_inc_edge to new_header.  */
+  edge post_inc_edge = single_succ_edge (latch);
+  e = redirect_edge_and_branch (post_inc_edge, new_header);
+  gcc_assert (e == post_inc_edge);
+
+  /* Redirect post_cond_edge to header.  */
+  edge post_cond_edge = single_pred_edge (latch);
+  e = redirect_edge_and_branch (post_cond_edge, header);
+  gcc_assert (e == post_cond_edge);
+
+  /* Redirect split_edge to latch.  */
+  e = redirect_edge_and_branch (split_edge, latch);
+  gcc_assert (e == split_edge);
+
+  /* Set the new loop bound.  */
+  gimple_cond_set_rhs (cond_stmt, bound);
+
+  /* Repair the ssa.  */
+  vec<edge_var_map> *v = redirect_edge_var_map_vector (post_inc_edge);
+  edge_var_map *vm;
+  gphi_iterator gsi;
+  for (gsi = gsi_start_phis (header), i = 0;
+       !gsi_end_p (gsi) && v->iterate (i, &vm);
+       gsi_next (&gsi), i++)
+    {
+      gphi *phi = gsi.phi ();
+      tree res_a = PHI_RESULT (phi);
+
+      /* Create new phi.  */
+      tree res_c = copy_ssa_name (res_a, phi);
+      gphi *nphi = create_phi_node (res_c, new_header);
+
+      /* Replace ivtmp_a with ivtmp_c in condition 'if (ivtmp_a < n)'.  */
+      replace_uses_in_bb_by (res_a, res_c, new_header);
+
+      /* Replace ivtmp/sum_b with ivtmp/sum_c in header phi.  */
+      add_phi_arg (phi, res_c, post_cond_edge, UNKNOWN_LOCATION);
+
+      /* Replace sum_b with sum_c in exit phi.  Loop-closed ssa does not hold
+	 for virtuals, so we cannot get away with exit_block only.  */
+      tree res_b = redirect_edge_var_map_def (vm);
+      replace_uses_in_bbs_by (res_b, res_c, exit_dominated);
+
+      struct reduction_info *red = reduction_phi (reduction_list, phi);
+      gcc_assert (virtual_operand_p (res_a)
+		  || res_a == control
+		  || red != NULL);
+
+      if (red)
+	{
+	  /* Register the new reduction phi.  */
+	  red->reduc_phi = nphi;
+	  gimple_set_uid (red->reduc_phi, red->reduc_version);
+	}
+    }
+  gcc_assert (gsi_end_p (gsi) && !v->iterate (i, &vm));
+  BITMAP_FREE (exit_dominated);
+
+  /* Set the preheader argument of the new phis to ivtmp/sum_init.  */
+  flush_pending_stmts (entry);
+
+  /* Set the latch arguments of the new phis to ivtmp/sum_b.  */
+  flush_pending_stmts (post_inc_edge);
+
+  /* Register the reduction exit phis.  */
+  for (gphi_iterator gsi = gsi_start_phis (exit_block);
+       !gsi_end_p (gsi);
+       gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      tree res_z = PHI_RESULT (phi);
+      if (virtual_operand_p (res_z))
+	continue;
+
+      tree res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+      gimple reduc_phi = SSA_NAME_DEF_STMT (res_c);
+      struct reduction_info *red = reduction_phi (reduction_list, reduc_phi);
+      if (red != NULL)
+	red->keep_res = phi;
+    }
+
+  /* We're going to cancel the loop at the end of gen_parallel_loop, but until
+     then we're still using some fields, so only bother about fields that are
+     still used: header and latch.
+     The loop has a new header bb, so we update it.  The latch bb stays the
+     same.  */
+  loop->header = new_header;
+
+  /* Recalculate dominance info.  */
+  free_dominance_info (CDI_DOMINATORS);
+  calculate_dominance_info (CDI_DOMINATORS);
+}
+
+/* Tries to moves the exit condition of LOOP to the beginning of its header
+   without duplication of the loop body.  NIT is the number of iterations of the
+   loop.  REDUCTION_LIST describes the reductions in LOOP.  Return true if
+   transformation is successful.  */
+
+static bool
+try_transform_to_exit_first_loop_alt (struct loop *loop,
+				      reduction_info_table_type *reduction_list,
+				      tree nit)
+{
+  /* Check whether the latch contains a single statement.  */
+  if (!gimple_seq_singleton_p (bb_seq (loop->latch)))
+    return true;
+
+  /* Check whether the latch contains the loop iv increment.  */
+  edge back = single_succ_edge (loop->latch);
+  edge exit = single_dom_exit (loop);
+  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
+  tree control = gimple_cond_lhs (cond_stmt);
+  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (control));
+  tree inc_res = gimple_phi_arg_def (phi, back->dest_idx);
+  if (gimple_bb (SSA_NAME_DEF_STMT (inc_res)) != loop->latch)
+    return false;
+
+  /* Check whether there's no code between the loop condition and the latch.  */
+  if (!single_pred_p (loop->latch)
+      || single_pred (loop->latch) != exit->src)
+    return false;
+
+  tree alt_bound = NULL_TREE;
+  tree nit_type = TREE_TYPE (nit);
+
+  /* Figure out whether nit + 1 overflows.  */
+  if (TREE_CODE (nit) == INTEGER_CST)
+    {
+      if (!tree_int_cst_equal (nit, TYPE_MAXVAL (nit_type)))
+	{
+	  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
+				       nit, build_one_cst (nit_type));
+
+	  gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
+	}
+      else
+	{
+	  /* Todo: Figure out if we can trigger this, if it's worth to handle
+	     optimally, and if we can handle it optimally.  */
+	}
+    }
+  else
+    {
+      gcc_assert (TREE_CODE (nit) == SSA_NAME);
+
+      gimple def = SSA_NAME_DEF_STMT (nit);
+
+      if (def
+	  && is_gimple_assign (def)
+	  && gimple_assign_rhs_code (def) == PLUS_EXPR)
+	{
+	  tree op1 = gimple_assign_rhs1 (def);
+	  tree op2 = gimple_assign_rhs2 (def);
+	  if (integer_minus_onep (op1))
+	    alt_bound = op2;
+	  else if (integer_minus_onep (op2))
+	    alt_bound = op1;
+	}
+
+      /* There is a number of test-cases for which we don't get an alt_bound
+	 here: they're listed here, with the lhs of the last stmt as the nit:
+
+	 libgomp.graphite/force-parallel-1.c:
+	 _21 = (signed long) N_6(D);
+	 _19 = _21 + -1;
+	 _7 = (unsigned long) _19;
+
+	 libgomp.graphite/force-parallel-2.c:
+	 _33 = (signed long) N_9(D);
+	 _16 = _33 + -1;
+	 _37 = (unsigned long) _16;
+
+	 libgomp.graphite/force-parallel-5.c:
+	 <bb 6>:
+	 # graphite_IV.5_46 = PHI <0(5), graphite_IV.5_47(11)>
+	 <bb 7>:
+	 _33 = (unsigned long) graphite_IV.5_46;
+
+	 g++.dg/tree-ssa/pr34355.C:
+	 _2 = (unsigned int) i_9;
+	 _3 = 4 - _2;
+
+	 gcc.dg/pr53849.c:
+	 _5 = d.0_11 + -2;
+	 _18 = (unsigned int) _5;
+
+	 We will be able to handle some of these cases, if we can determine when
+	 it's safe to look past casts.  */
+    }
+
+  if (alt_bound == NULL_TREE)
+    return false;
+
+  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
+  return true;
+}
+
+/* Moves the exit condition of LOOP to the beginning of its header.  NIT is the
+   number of iterations of the loop.  REDUCTION_LIST describes the reductions in
+   LOOP.  */
 
 static void
 transform_to_exit_first_loop (struct loop *loop,
@@ -1961,8 +2331,19 @@ gen_parallel_loop (struct loop *loop,
   /* Base all the induction variables in LOOP on a single control one.  */
   canonicalize_loop_ivs (loop, &nit, true);
 
-  /* Ensure that the exit condition is the first statement in the loop.  */
-  transform_to_exit_first_loop (loop, reduction_list, nit);
+  /* Ensure that the exit condition is the first statement in the loop.
+     The common case is that latch of the loop is empty (apart from the
+     increment) and immediately follows the loop exit test.  Attempt to move the
+     entry of the loop directly before the exit check and increase the number of
+     iterations of the loop by one.  */
+  if (!try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
+    {
+      /* Fall back on the method that handles more cases, but duplicates the
+	 loop body: move the exit condition of LOOP to the beginning of its
+	 header, and duplicate the part of the last iteration that gets disabled
+	 to the exit of the loop.  */
+      transform_to_exit_first_loop (loop, reduction_list, nit);
+    }
 
   /* Generate initializations for reductions.  */
   if (reduction_list->elements () > 0)
diff --git a/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c
new file mode 100644
index 0000000..eb5e11f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-2.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (void)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    c[i] = a[i] + b[i];
+}
+
+int
+main (void)
+{
+  int i, j;
+
+  /* Complexify loop to inhibit parloops.  */
+  for (j = 0; j < 100; ++j)
+    for (i = 0; i < 10; i++)
+      {
+	int k = i + (10 * j);
+	a[k] = k;
+	b[k] = (k * 3) % 7;
+	c[k] = k * 2;
+      }
+
+  f ();
+
+  for (i = 0; i < N; i++)
+    {
+      unsigned int actual = c[i];
+      unsigned int expected = i + ((i * 3) % 7);
+      if (actual != expected)
+	abort ();
+    }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c
new file mode 100644
index 0000000..b426b3f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt-3.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2" } */
+
+unsigned int *a;
+
+unsigned int __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+  unsigned int sum = 1;
+
+  for (i = 0; i < n; ++i)
+    sum += a[i];
+
+  return sum;
+}
+
+int
+main (void)
+{
+  unsigned int res;
+  unsigned int array[4000];
+  int i;
+  for (i = 0; i < 4000; ++i)
+    array[i] = i % 7;
+  a = &array[0];
+  res = f (4000);
+  return !(res == 11995);
+}
diff --git a/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c
new file mode 100644
index 0000000..d7d4003
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/parloops-exit-first-loop-alt.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 1000
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+
+void __attribute__((noclone,noinline))
+f (unsigned int n)
+{
+  int i;
+
+  for (i = 0; i < n; ++i)
+    c[i] = a[i] + b[i];
+}
+
+int
+main (void)
+{
+  int i, j;
+
+  /* Complexify loop to inhibit parloops.  */
+  for (j = 0; j < 100; ++j)
+    for (i = 0; i < 10; i++)
+      {
+	int k = i + (10 * j);
+	a[k] = k;
+	b[k] = (k * 3) % 7;
+	c[k] = k * 2;
+      }
+
+  f (N);
+
+  for (i = 0; i < N; i++)
+    {
+      unsigned int actual = c[i];
+      unsigned int expected = i + ((i * 3) % 7);
+      if (actual != expected)
+	abort ();
+    }
+
+  return 0;
+}
-- 
1.9.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PING][PATCH][PR65443] Add transform_to_exit_first_loop_alt
  2015-05-26 11:05         ` Richard Biener
@ 2015-06-04  8:37           ` Tom de Vries
  2015-06-08 10:44             ` Tom de Vries
  0 siblings, 1 reply; 13+ messages in thread
From: Tom de Vries @ 2015-06-04  8:37 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches, Jakub Jelinek

On 26/05/15 12:39, Richard Biener wrote:
> On Thu, 14 May 2015, Tom de Vries wrote:
>
>> On 20-04-15 14:25, Richard Biener wrote:
>>> On Wed, 15 Apr 2015, Tom de Vries wrote:
>>>
>>>> On 03-04-15 14:39, Tom de Vries wrote:
>>>>> On 27-03-15 15:10, Tom de Vries wrote:
>>>>>> Hi,
>>>>>>
>>>>>> this patch fixes PR65443, a todo in the parloops pass for function
>>>>>> transform_to_exit_first_loop:
>>>>>> ...
>>>>>>       TODO: the common case is that latch of the loop is empty and
>>>>>> immediately
>>>>>>       follows the loop exit.  In this case, it would be better not to
>>>>>> copy
>>>>>> the
>>>>>>       body of the loop, but only move the entry of the loop directly
>>>>>> before
>>>>>> the
>>>>>>       exit check and increase the number of iterations of the loop by
>>>>>> one.
>>>>>>       This may need some additional preconditioning in case NIT = ~0.
>>>>>> ...
>>>>>>
>>>>>> The current implementation of transform_to_exit_first_loop transforms
>>>>>> the
>>>>>> loop
>>>>>> by moving and duplicating the loop body. This patch transforms the
>>>>>> loop
>>>>>> into the
>>>>>> same normal form, but without the duplication, if that's possible
>>>>>> (determined by
>>>>>> try_transform_to_exit_first_loop_alt).
>>>>>>
>>>>>> The actual transformation, done by transform_to_exit_first_loop_alt
>>>>>> transforms
>>>>>> this loop:
>>>>>> ...
>>>>>>         bb preheader:
>>>>>>         ...
>>>>>>         goto <bb header>
>>>>>>
>>>>>>         <bb header>:
>>>>>>         ...
>>>>>>         if (ivtmp < n)
>>>>>>           goto <bb latch>;
>>>>>>         else
>>>>>>           goto <bb exit>;
>>>>>>
>>>>>>         <bb latch>:
>>>>>>         ivtmp = ivtmp + inc;
>>>>>>         goto <bb header>
>>>>>> ...
>>>>>>
>>>>>> into this one:
>>>>>> ...
>>>>>>         bb preheader:
>>>>>>         ...
>>>>>>         goto <bb newheader>
>>>>>>
>>>>>>         <bb header>:
>>>>>>         ...
>>>>>>         goto <bb latch>;
>>>>>>
>>>>>>         <bb newheader>:
>>>>>>         if (ivtmp < n + 1)
>>>>>>           goto <bb header>;
>>>>>>         else
>>>>>>           goto <bb exit>;
>>>>>>
>>>>>>         <bb latch>:
>>>>>>         ivtmp = ivtmp + inc;
>>>>>>         goto <bb newheader>
>>>>>> ...
>>>>>>
>>>>>
>>>>> Updated patch, now using redirect_edge_var_map and flush_pending_stmts.
>>>>>
>>>>> Bootstrapped and reg-tested on x86_64.
>>>>>
>>>>> OK for stage1 trunk?
>>>>>
>>>>
>>>> Ping.
>>>
>>
>> Hi Richard,
>>
>> thanks for the review.
>
> Sorry for the delay (too many things to review, too much other work...)
>

Np, thanks for the review.

>>> +static void
>>> +replace_imm_uses (tree val, imm_use_iterator *imm_iter)
>>> +{
>>> +  use_operand_p use_p;
>>> +
>>> +  FOR_EACH_IMM_USE_ON_STMT (use_p, *imm_iter)
>>> +    SET_USE (use_p, val);
>>>
>>> Use propagate_value.  Why this odd interface passing a imm_iter?!
>>>
>>
>> The replace_imm_uses function is common code factored out of
>> replace_uses_in_bb_by and replace_uses_in_bbs_by. I'm not sure what is odd
>> about the interface of the replace_imm_uses function, but I've removed the
>> function.
>>
>> I tried using propagate_value, but that one doesn't like virtuals.
>>
>>> In fact most of the "repair SSA" in transform_to_exit_first_loop_alt
>>> looks too ad-hoc to me ... it also looks somewhat familiar to other
>>> code ...
>>>
>>> Unfortunately the comment before the function isn't in SSA form
>>> so it's hard to follow the transform.
>>>
>>
>> I've added the ssa bits in the transformation comment before the function, and
>> updated variable names and comments in the function.
>>
>>> I consider the parloops code bitrotten, no repair possible, so
>>> I might be convinced to not care about new spaghetti in there...
>>>
>>> +  /* Fix up loop structure.  TODO: Check whether this is sufficient.  */
>>> +  loop->header = new_header;
>>> +
>>>
>>> no, surely not.  Number of iterations (and estimates) are off
>>> after the transform
>>
>> The loop is cancelled at the end of gen_parallel_loop, and the estimations are
>> removed.  We only seem to be using a limited set of the loop data until the
>> loop is cancelled. Updated comment accordingly.
>>
>>> and loop->latch might also need updating.
>>>
>>
>> That one actually stays the same. Updated comment accordingly.
>>
>>> "Safest" is to simply schedule a fixup (but you'll lose any
>>> loop annotation in that process).
>>>
>>
>> Since the loop is cancelled, AFAIU we don't need that, but... You're
>> referring to the annotations mentioned in
>> replace_loop_annotate_in_block? Losing the annotations seems to be a
>> generic problem of scheduling such a fixup, not particular to this
>> patch. Do you have a concern related to the patch and these annotations?
>
> For example such annotations (or others added by OpenMP like the
> safe_len stuff).  Generally fixups preserve those _iff_ the loop
> header stays the same.  So in your case doing
>
>    loop->header = new_header;
>    set_loops_state (LOOPS_NEED_FIXUP);
>
> would probably "work".  But yes, if the loop is cancelled anyway
> there is no point jumping through hoops.
>
>>> +  /* Figure out whether nit + 1 overflows.  */
>>> +  if (TREE_CODE (nit) == INTEGER_CST)
>>> +    {
>>> +      if (!tree_int_cst_equal (nit, TYPE_MAXVAL (nit_type)))
>>>
>>> in case nit_type is a pointer type TYPE_MAXVAL will be NULL I think.
>>>
>>
>> We've done ivcanon just before this point, so AFAIU we can assume we're
>> dealing with an unsigned integer.
>>
>>> Is the whole exercise for performance?
>>
>> I'm trying to fix the todo in the code for parloops, which is about
>> performance, though I don't expect a large gain.
>>
>> OTOH, my main focus is a related oacc kernels issue. For an oacc kernels
>> region, the entire loop is enclosed in a kernels region. Peeling of the last
>> iteration means I have to guard that iteration to run on only one thread,
>> which currently isn't done, so in that sense it's a correctness issue as well.
>
> But as the patch doesn't cover all cases you still have a correctness
> issue to solve, no?  It seems to me the last iteration should simply
> _not_ be in the oacc kernels region?
>

Correct. On gomp-4_0-branch, I've committed r223849 (submitted at 
https://gcc.gnu.org/ml/gcc-patches/2015-06/msg00077.html). If we don't 
manage to do the alt transformation for a loop in an oacc kernels 
region, we abandon parallelization of that loop.

FWIW, another way to deal with this problem is to guard the peeled off 
last loop iteration such that only one accelerator thread executes it. 
This is a generic problem, that we'll have to solve for the case that 
the user specifies sequential code in a kernels region. But if the user 
didn't specify sequential code, then we better not have the compiler 
introduce it.

>>> In that case using an
>>> entirely new, unsigned IV, that runs from 0 to niter should be easiest and
>>
>> Introducting such an IV would mean that we don't have to update the IV, but
>> still we have to connect the new IV to the uses of the old IV.
>>
>> The current special handling of the IV variable is actually not that
>> elaborate:
>> ...
>> +      /* Replace ivtmp_a with ivtmp_c in condition 'if (ivtmp_a < n)'.  */
>> +      replace_uses_in_bb_by (res_a, res_c, new_header);
>> ...
>> So I'm not sure it's easier.
>>
>> just run-time guard that niter == +INF case?
>>
>> That doesn't work out nicely for the oacc kernels case. I need to know
>> compile-time wheter I can parallelize or not.
>
> Hmm, I see.
>
>>> For the graphite case, can't you make graphite generate the
>>> loops exit-first in the first place?
>>>
>>
>> Perhaps, but that doesn't make a difference for the oacc kernels case.
>>
>>> The testcases are just correctness ones?  That is, there isn't
>>> any testcase that checks whether the new code is exercised?
>>> (no extra debugging dumping?)
>>>
>>
>> There are 3 new test patterns, each with a libgomp/testsuite/libgomp.c and a
>> gcc/testsuite/gcc.dg variant, so 6 new test-cases in total. The
>> gcc/testsuite/gcc.dg variant checks that the new code is exercised. The
>> libgomp/testsuite/libgomp.c variant checks that the new code generates correct
>> code.
>>
>> Reposting updated patch (for brevity, without testcases) below.
>
> I'm ok with the patch and count on you to fix eventual fallout ;)
>

Great, will do.

I'll merge this patch from gomp-4_0-branch to trunk, retest and commit.

Thanks,
- Tom

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PING][PATCH][PR65443] Add transform_to_exit_first_loop_alt
  2015-06-04  8:37           ` Tom de Vries
@ 2015-06-08 10:44             ` Tom de Vries
  2015-06-08 11:08               ` Richard Biener
  2015-06-08 15:59               ` Thomas Schwinge
  0 siblings, 2 replies; 13+ messages in thread
From: Tom de Vries @ 2015-06-08 10:44 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches, Jakub Jelinek, schwab

[-- Attachment #1: Type: text/plain, Size: 622 bytes --]

On 04/06/15 10:28, Tom de Vries wrote:
>> I'm ok with the patch and count on you to fix eventual fallout ;)
>>
>
> Great, will do.

And here is the fallout:
* PR66442 - [6 regression] FAIL: gcc.dg/autopar/pr46885.c (test for
   excess errors)

There are two problems in try_transform_to_exit_first_loop_alt:
1. In case the latch is not a singleton bb, the function should return
    false rather than true.
2. The check for singleton bb should ignore debug-insns.

Attached patch fixes these problems.

Bootstrapped and reg-tested on x86_64.

Verified by Andreas to fix the problem on m68k.

OK for trunk?

Thanks,
- Tom


[-- Attachment #2: 0001-Fix-try_transform_to_exit_first_loop_alt.patch --]
[-- Type: text/x-patch, Size: 2067 bytes --]

Fix try_transform_to_exit_first_loop_alt

2015-06-06  Tom de Vries  <tom@codesourcery.com>

	PR tree-optimization/66442
	* gimple-iterator.h (gimple_seq_nondebug_singleton_p): Add function.
	* tree-parloops.c (try_transform_to_exit_first_loop_alt): Return false
	if the loop latch is not a singleton.  Use
	gimple_seq_nondebug_singleton_p instead of gimple_seq_singleton_p.
---
 gcc/gimple-iterator.h | 29 +++++++++++++++++++++++++++++
 gcc/tree-parloops.c   |  4 ++--
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/gcc/gimple-iterator.h b/gcc/gimple-iterator.h
index 87e943a..76fa456 100644
--- a/gcc/gimple-iterator.h
+++ b/gcc/gimple-iterator.h
@@ -345,4 +345,33 @@ gsi_seq (gimple_stmt_iterator i)
   return *i.seq;
 }
 
+/* Determine whether SEQ is a nondebug singleton.  */
+
+static inline bool
+gimple_seq_nondebug_singleton_p (gimple_seq seq)
+{
+  gimple_stmt_iterator gsi;
+
+  /* Find a nondebug gimple.  */
+  gsi.ptr = gimple_seq_first (seq);
+  gsi.seq = &seq;
+  gsi.bb = NULL;
+  while (!gsi_end_p (gsi)
+	 && is_gimple_debug (gsi_stmt (gsi)))
+    gsi_next (&gsi);
+
+  /* No nondebug gimple found, not a singleton.  */
+  if (gsi_end_p (gsi))
+    return false;
+
+  /* Find a next nondebug gimple.  */
+  gsi_next (&gsi);
+  while (!gsi_end_p (gsi)
+	 && is_gimple_debug (gsi_stmt (gsi)))
+    gsi_next (&gsi);
+
+  /* Only a singleton if there's no next nondebug gimple.  */
+  return gsi_end_p (gsi);
+}
+
 #endif /* GCC_GIMPLE_ITERATOR_H */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index 02f44eb..c4b83fe 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -1769,8 +1769,8 @@ try_transform_to_exit_first_loop_alt (struct loop *loop,
 				      tree nit)
 {
   /* Check whether the latch contains a single statement.  */
-  if (!gimple_seq_singleton_p (bb_seq (loop->latch)))
-    return true;
+  if (!gimple_seq_nondebug_singleton_p (bb_seq (loop->latch)))
+    return false;
 
   /* Check whether the latch contains the loop iv increment.  */
   edge back = single_succ_edge (loop->latch);
-- 
1.9.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PING][PATCH][PR65443] Add transform_to_exit_first_loop_alt
  2015-06-08 10:44             ` Tom de Vries
@ 2015-06-08 11:08               ` Richard Biener
  2015-06-08 15:59               ` Thomas Schwinge
  1 sibling, 0 replies; 13+ messages in thread
From: Richard Biener @ 2015-06-08 11:08 UTC (permalink / raw)
  To: Tom de Vries; +Cc: GCC Patches, Jakub Jelinek, schwab

On Mon, 8 Jun 2015, Tom de Vries wrote:

> On 04/06/15 10:28, Tom de Vries wrote:
> > > I'm ok with the patch and count on you to fix eventual fallout ;)
> > > 
> > 
> > Great, will do.
> 
> And here is the fallout:
> * PR66442 - [6 regression] FAIL: gcc.dg/autopar/pr46885.c (test for
>   excess errors)
> 
> There are two problems in try_transform_to_exit_first_loop_alt:
> 1. In case the latch is not a singleton bb, the function should return
>    false rather than true.
> 2. The check for singleton bb should ignore debug-insns.
> 
> Attached patch fixes these problems.
> 
> Bootstrapped and reg-tested on x86_64.
> 
> Verified by Andreas to fix the problem on m68k.
> 
> OK for trunk?

Ok.

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PING][PATCH][PR65443] Add transform_to_exit_first_loop_alt
  2015-06-08 10:44             ` Tom de Vries
  2015-06-08 11:08               ` Richard Biener
@ 2015-06-08 15:59               ` Thomas Schwinge
  2015-06-08 22:12                 ` Tom de Vries
  1 sibling, 1 reply; 13+ messages in thread
From: Thomas Schwinge @ 2015-06-08 15:59 UTC (permalink / raw)
  To: Tom de Vries; +Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 2940 bytes --]

Hi Tom!

On Mon, 8 Jun 2015 12:43:01 +0200, Tom de Vries <Tom_deVries@mentor.com> wrote:
> There are two problems in try_transform_to_exit_first_loop_alt:
> 1. In case the latch is not a singleton bb, the function should return
>     false rather than true.
> 2. The check for singleton bb should ignore debug-insns.
> 
> Attached patch fixes these problems.

> Fix try_transform_to_exit_first_loop_alt

> 	PR tree-optimization/66442
> 	* gimple-iterator.h (gimple_seq_nondebug_singleton_p): Add function.
> 	* tree-parloops.c (try_transform_to_exit_first_loop_alt): Return false
> 	if the loop latch is not a singleton.  Use
> 	gimple_seq_nondebug_singleton_p instead of gimple_seq_singleton_p.

Per my testing, the backport of this patch that you committed to
gomp-4_0-branch, r224219, introduces a number of regressions in your
OpenACC kernels test cases, specifically the »scan-tree-dump-times
parloops_oacc_kernels "(?n)pragma omp target
oacc_parallel.*num_gangs\\(32\\)" 1« tests.  Would you please have a
look?


Grüße,
 Thomas


>  gcc/gimple-iterator.h | 29 +++++++++++++++++++++++++++++
>  gcc/tree-parloops.c   |  4 ++--
>  2 files changed, 31 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/gimple-iterator.h b/gcc/gimple-iterator.h
> index 87e943a..76fa456 100644
> --- a/gcc/gimple-iterator.h
> +++ b/gcc/gimple-iterator.h
> @@ -345,4 +345,33 @@ gsi_seq (gimple_stmt_iterator i)
>    return *i.seq;
>  }
>  
> +/* Determine whether SEQ is a nondebug singleton.  */
> +
> +static inline bool
> +gimple_seq_nondebug_singleton_p (gimple_seq seq)
> +{
> +  gimple_stmt_iterator gsi;
> +
> +  /* Find a nondebug gimple.  */
> +  gsi.ptr = gimple_seq_first (seq);
> +  gsi.seq = &seq;
> +  gsi.bb = NULL;
> +  while (!gsi_end_p (gsi)
> +	 && is_gimple_debug (gsi_stmt (gsi)))
> +    gsi_next (&gsi);
> +
> +  /* No nondebug gimple found, not a singleton.  */
> +  if (gsi_end_p (gsi))
> +    return false;
> +
> +  /* Find a next nondebug gimple.  */
> +  gsi_next (&gsi);
> +  while (!gsi_end_p (gsi)
> +	 && is_gimple_debug (gsi_stmt (gsi)))
> +    gsi_next (&gsi);
> +
> +  /* Only a singleton if there's no next nondebug gimple.  */
> +  return gsi_end_p (gsi);
> +}
> +
>  #endif /* GCC_GIMPLE_ITERATOR_H */
> diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
> index 02f44eb..c4b83fe 100644
> --- a/gcc/tree-parloops.c
> +++ b/gcc/tree-parloops.c
> @@ -1769,8 +1769,8 @@ try_transform_to_exit_first_loop_alt (struct loop *loop,
>  				      tree nit)
>  {
>    /* Check whether the latch contains a single statement.  */
> -  if (!gimple_seq_singleton_p (bb_seq (loop->latch)))
> -    return true;
> +  if (!gimple_seq_nondebug_singleton_p (bb_seq (loop->latch)))
> +    return false;
>  
>    /* Check whether the latch contains the loop iv increment.  */
>    edge back = single_succ_edge (loop->latch);
> -- 
> 1.9.1
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 472 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PING][PATCH][PR65443] Add transform_to_exit_first_loop_alt
  2015-06-08 15:59               ` Thomas Schwinge
@ 2015-06-08 22:12                 ` Tom de Vries
  2015-06-09  7:47                   ` Tom de Vries
  0 siblings, 1 reply; 13+ messages in thread
From: Tom de Vries @ 2015-06-08 22:12 UTC (permalink / raw)
  To: Thomas Schwinge; +Cc: GCC Patches

On 08/06/15 17:55, Thomas Schwinge wrote:
> Hi Tom!
>
> On Mon, 8 Jun 2015 12:43:01 +0200, Tom de Vries <Tom_deVries@mentor.com> wrote:
>> There are two problems in try_transform_to_exit_first_loop_alt:
>> 1. In case the latch is not a singleton bb, the function should return
>>      false rather than true.
>> 2. The check for singleton bb should ignore debug-insns.
>>
>> Attached patch fixes these problems.
>
>> Fix try_transform_to_exit_first_loop_alt
>
>> 	PR tree-optimization/66442
>> 	* gimple-iterator.h (gimple_seq_nondebug_singleton_p): Add function.
>> 	* tree-parloops.c (try_transform_to_exit_first_loop_alt): Return false
>> 	if the loop latch is not a singleton.  Use
>> 	gimple_seq_nondebug_singleton_p instead of gimple_seq_singleton_p.
>
> Per my testing, the backport of this patch that you committed to
> gomp-4_0-branch, r224219, introduces a number of regressions in your
> OpenACC kernels test cases, specifically the »scan-tree-dump-times
> parloops_oacc_kernels "(?n)pragma omp target
> oacc_parallel.*num_gangs\\(32\\)" 1« tests.  Would you please have a
> look?
>
>

Hi Thomas,

I seem to have committed (to both trunk and gomp-4_0-branch) an older 
version of the patch, which contained an incorrect version of 
gimple_seq_nondebug_singleton_p.

I'll correct the mistake tomorrow morning.

Thanks,
- Tom

> Grüße,
>   Thomas
>
>
>>   gcc/gimple-iterator.h | 29 +++++++++++++++++++++++++++++
>>   gcc/tree-parloops.c   |  4 ++--
>>   2 files changed, 31 insertions(+), 2 deletions(-)
>>
>> diff --git a/gcc/gimple-iterator.h b/gcc/gimple-iterator.h
>> index 87e943a..76fa456 100644
>> --- a/gcc/gimple-iterator.h
>> +++ b/gcc/gimple-iterator.h
>> @@ -345,4 +345,33 @@ gsi_seq (gimple_stmt_iterator i)
>>     return *i.seq;
>>   }
>>
>> +/* Determine whether SEQ is a nondebug singleton.  */
>> +
>> +static inline bool
>> +gimple_seq_nondebug_singleton_p (gimple_seq seq)
>> +{
>> +  gimple_stmt_iterator gsi;
>> +
>> +  /* Find a nondebug gimple.  */
>> +  gsi.ptr = gimple_seq_first (seq);
>> +  gsi.seq = &seq;
>> +  gsi.bb = NULL;
>> +  while (!gsi_end_p (gsi)
>> +	 && is_gimple_debug (gsi_stmt (gsi)))
>> +    gsi_next (&gsi);
>> +
>> +  /* No nondebug gimple found, not a singleton.  */
>> +  if (gsi_end_p (gsi))
>> +    return false;
>> +
>> +  /* Find a next nondebug gimple.  */
>> +  gsi_next (&gsi);
>> +  while (!gsi_end_p (gsi)
>> +	 && is_gimple_debug (gsi_stmt (gsi)))
>> +    gsi_next (&gsi);
>> +
>> +  /* Only a singleton if there's no next nondebug gimple.  */
>> +  return gsi_end_p (gsi);
>> +}
>> +
>>   #endif /* GCC_GIMPLE_ITERATOR_H */
>> diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
>> index 02f44eb..c4b83fe 100644
>> --- a/gcc/tree-parloops.c
>> +++ b/gcc/tree-parloops.c
>> @@ -1769,8 +1769,8 @@ try_transform_to_exit_first_loop_alt (struct loop *loop,
>>   				      tree nit)
>>   {
>>     /* Check whether the latch contains a single statement.  */
>> -  if (!gimple_seq_singleton_p (bb_seq (loop->latch)))
>> -    return true;
>> +  if (!gimple_seq_nondebug_singleton_p (bb_seq (loop->latch)))
>> +    return false;
>>
>>     /* Check whether the latch contains the loop iv increment.  */
>>     edge back = single_succ_edge (loop->latch);
>> --
>> 1.9.1
>>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PING][PATCH][PR65443] Add transform_to_exit_first_loop_alt
  2015-06-08 22:12                 ` Tom de Vries
@ 2015-06-09  7:47                   ` Tom de Vries
  0 siblings, 0 replies; 13+ messages in thread
From: Tom de Vries @ 2015-06-09  7:47 UTC (permalink / raw)
  To: Thomas Schwinge; +Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1623 bytes --]

On 09/06/15 00:05, Tom de Vries wrote:
> On 08/06/15 17:55, Thomas Schwinge wrote:
>> Hi Tom!
>>
>> On Mon, 8 Jun 2015 12:43:01 +0200, Tom de Vries
>> <Tom_deVries@mentor.com> wrote:
>>> There are two problems in try_transform_to_exit_first_loop_alt:
>>> 1. In case the latch is not a singleton bb, the function should return
>>>      false rather than true.
>>> 2. The check for singleton bb should ignore debug-insns.
>>>
>>> Attached patch fixes these problems.
>>
>>> Fix try_transform_to_exit_first_loop_alt
>>
>>>     PR tree-optimization/66442
>>>     * gimple-iterator.h (gimple_seq_nondebug_singleton_p): Add function.
>>>     * tree-parloops.c (try_transform_to_exit_first_loop_alt): Return
>>> false
>>>     if the loop latch is not a singleton.  Use
>>>     gimple_seq_nondebug_singleton_p instead of gimple_seq_singleton_p.
>>
>> Per my testing, the backport of this patch that you committed to
>> gomp-4_0-branch, r224219, introduces a number of regressions in your
>> OpenACC kernels test cases, specifically the »scan-tree-dump-times
>> parloops_oacc_kernels "(?n)pragma omp target
>> oacc_parallel.*num_gangs\\(32\\)" 1« tests.  Would you please have a
>> look?
>>
>>
>
> Hi Thomas,
>
> I seem to have committed (to both trunk and gomp-4_0-branch) an older
> version of the patch, which contained an incorrect version of
> gimple_seq_nondebug_singleton_p.
>
> I'll correct the mistake tomorrow morning.

Committed attached patch to trunk and propagated to gomp-4_0-branch.

Committed as obvious, since it changes gimple_seq_nondebug_singleton_p 
into the tested and approved version.

Thanks,
- Tom


[-- Attachment #2: 0001-Fix-gimple_seq_nondebug_singleton_p.patch --]
[-- Type: text/x-patch, Size: 1555 bytes --]

Fix gimple_seq_nondebug_singleton_p

2015-06-09  Tom de Vries  <tom@codesourcery.com>

	* gimple-iterator.h (gimple_seq_nondebug_singleton_p): Don't
	always return false.
---
 gcc/gimple-iterator.h | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/gcc/gimple-iterator.h b/gcc/gimple-iterator.h
index d08245e..76fa456 100644
--- a/gcc/gimple-iterator.h
+++ b/gcc/gimple-iterator.h
@@ -351,33 +351,27 @@ static inline bool
 gimple_seq_nondebug_singleton_p (gimple_seq seq)
 {
   gimple_stmt_iterator gsi;
+
+  /* Find a nondebug gimple.  */
   gsi.ptr = gimple_seq_first (seq);
   gsi.seq = &seq;
   gsi.bb = NULL;
-
-  /* Not a singleton if the sequence is empty.  */
-  if (gsi_end_p (gsi))
-    return false;
-
-  /* Find a nondebug gimple.  */
   while (!gsi_end_p (gsi)
 	 && is_gimple_debug (gsi_stmt (gsi)))
     gsi_next (&gsi);
 
-  /* Not a nondebug singleton if there's no nondebug gimple.  */
-  if (is_gimple_debug (gsi_stmt (gsi)))
+  /* No nondebug gimple found, not a singleton.  */
+  if (gsi_end_p (gsi))
     return false;
 
-  /* Find the next nondebug gimple.  */
+  /* Find a next nondebug gimple.  */
+  gsi_next (&gsi);
   while (!gsi_end_p (gsi)
 	 && is_gimple_debug (gsi_stmt (gsi)))
     gsi_next (&gsi);
 
-  /* If there's a next nondebug gimple, it's not a nondebug singleton.  */
-  if (!gsi_end_p (gsi))
-    return false;
-
-  return true;
+  /* Only a singleton if there's no next nondebug gimple.  */
+  return gsi_end_p (gsi);
 }
 
 #endif /* GCC_GIMPLE_ITERATOR_H */
-- 
1.9.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2015-06-09  6:00 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-27 14:10 [PATCH, stage1][PR65443] Add transform_to_exit_first_loop_alt Tom de Vries
2015-04-03 12:40 ` Tom de Vries
2015-04-15 13:39   ` [PING][PATCH][PR65443] " Tom de Vries
2015-04-20 12:26     ` Richard Biener
2015-05-14 11:47       ` Tom de Vries
2015-05-26 11:05         ` Richard Biener
2015-06-04  8:37           ` Tom de Vries
2015-06-08 10:44             ` Tom de Vries
2015-06-08 11:08               ` Richard Biener
2015-06-08 15:59               ` Thomas Schwinge
2015-06-08 22:12                 ` Tom de Vries
2015-06-09  7:47                   ` Tom de Vries
2015-06-01  8:12         ` [gomp4,committed,PR65443] Add transform_to_exit_first_loop_al Tom de Vries

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).