public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* Preliminary patch for PR23820 and PR24309
@ 2007-10-11 21:32 Sebastian Pop
  2007-10-12  1:16 ` Zdenek Dvorak
  0 siblings, 1 reply; 26+ messages in thread
From: Sebastian Pop @ 2007-10-11 21:32 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 520 bytes --]

Hi,

Attached is a patch that fixes both problems with the lambda
framework.  While testing, I observed that ltrans-3.c now fails
because there are more than 3 basic blocks in the outer loop, making
the new implementation of the perfect_nest_p saying that this is not a
perfect nest.

What is the name of the pass that cannonicalizes the CFG removing
empty basic blocks containing only labels, or just a goto, as the
following one?


 bb_9 (preds = {bb_7 }, succs = {bb_10 })
    {
    <bb 9>:

    }

Thanks,
Sebastian

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 803_pr24309.diff --]
[-- Type: text/x-patch; name="803_pr24309.diff", Size: 8755 bytes --]

Index: tree-loop-linear.c
===================================================================
--- tree-loop-linear.c	(revision 129032)
+++ tree-loop-linear.c	(working copy)
@@ -244,6 +244,46 @@ try_interchange_loops (lambda_trans_matr
   return trans;
 }
 
+/* Return the number of nested loops in LOOP_NEST, or 0 if the loops
+   are not perfectly nested.  */
+
+static unsigned int
+perfect_loop_nest_depth (struct loop *loop_nest)
+{
+  struct loop *temp;
+  unsigned int depth = 1;
+
+  /* If it's not a loop nest, we don't want it.  We also don't handle
+     sibling loops properly, which are loops of the following form:
+
+     | for (i = 0; i < 50; i++)
+     |   {
+     |     for (j = 0; j < 50; j++)
+     |       {
+     |        ...
+     |       }
+     |     for (j = 0; j < 50; j++)
+     |       {
+     |        ...
+     |       }
+     |   }
+  */
+
+  if (!loop_nest->inner || !single_exit (loop_nest))
+    return 0;
+
+  for (temp = loop_nest->inner; temp; temp = temp->inner)
+    {
+      /* If we have a sibling loop or multiple exit edges, jump ship.  */
+      if (temp->next || !single_exit (temp))
+	return 0;
+
+      depth++;
+    }
+
+  return depth;
+}
+
 /* Perform a set of linear transforms on loops.  */
 
 void
@@ -263,47 +303,18 @@ linear_transform_loops (void)
       unsigned int depth = 0;
       VEC (ddr_p, heap) *dependence_relations;
       VEC (data_reference_p, heap) *datarefs;
-      struct loop *temp;
       lambda_loopnest before, after;
       lambda_trans_matrix trans;
-      bool problem = false;
       struct obstack lambda_obstack;
       gcc_obstack_init (&lambda_obstack);
 
-      /* If it's not a loop nest, we don't want it.
-         We also don't handle sibling loops properly, 
-         which are loops of the following form:
-         for (i = 0; i < 50; i++)
-           {
-             for (j = 0; j < 50; j++)
-               {
-	        ...
-               }
-             for (j = 0; j < 50; j++)
-               {
-                ...
-               }
-           } */
-      if (!loop_nest->inner || !single_exit (loop_nest))
+      depth = perfect_loop_nest_depth (loop_nest);
+      if (depth == 0)
 	continue;
+
       VEC_truncate (tree, oldivs, 0);
       VEC_truncate (tree, invariants, 0);
-      depth = 1;
-      for (temp = loop_nest->inner; temp; temp = temp->inner)
-	{
-	  /* If we have a sibling loop or multiple exit edges, jump ship.  */
-	  if (temp->next || !single_exit (temp))
-	    {
-	      problem = true;
-	      break;
-	    }
-	  depth ++;
-	}
-      if (problem)
-	continue;
 
-      /* Analyze data references and dependence relations using scev.  */      
- 
       datarefs = VEC_alloc (data_reference_p, heap, 10);
       dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
       compute_data_dependences_for_loop (loop_nest, true, &datarefs,
@@ -353,12 +364,12 @@ linear_transform_loops (void)
 	  print_lambda_loopnest (dump_file, after, 'u');
 	}
 
-      lambda_loopnest_to_gcc_loopnest (loop_nest, oldivs, invariants,
-				       &remove_ivs,
-                                       after, trans, &lambda_obstack);
-      modified = true;
+      modified = lambda_loopnest_to_gcc_loopnest (loop_nest, oldivs, 
+						  invariants, &remove_ivs,
+						  after, trans,
+						  &lambda_obstack);
 
-      if (dump_file)
+      if (modified && dump_file)
 	fprintf (dump_file, "Successfully transformed loop.\n");
 
     free_and_continue:
Index: testsuite/gcc.dg/tree-ssa/pr23820.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr23820.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/pr23820.c	(revision 0)
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-loop-linear" } */
+
+int t [2][4];
+
+void foo (void)
+{
+  int i, j, k, v;
+  float e;
+  for (;;)
+    {
+      v = 0;
+      for (j = 0; j < 2; j ++)
+        {
+          for (k = 2; k < 4; k ++)
+            {
+              e = 0.0;
+              for (i = 0; i < 4; i ++)
+                e += t [j][i];
+              if (e)
+                v = j;
+            }
+        }
+      t [v][0] = 0;
+    }
+}
Index: testsuite/gcc.dg/tree-ssa/pr24309.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr24309.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/pr24309.c	(revision 0)
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-loop-linear" } */
+
+float weight[10];
+void lsp_weight_quant(float *x, char *cdbk)
+{
+   int i,j;
+   float dist;
+   int best_id=0;
+   for (i=0;i<16;i++)
+   {
+      for (j=0;j<10;j++)
+         dist=dist+weight[j];
+      if (dist<0)
+         best_id=i;
+   }
+   x[j] = cdbk[best_id*10+j];
+}
Index: lambda.h
===================================================================
--- lambda.h	(revision 129032)
+++ lambda.h	(working copy)
@@ -204,7 +204,7 @@ lambda_loopnest gcc_loopnest_to_lambda_l
 						 VEC(tree,heap) **,
                                                  VEC(tree,heap) **,
                                                  struct obstack *);
-void lambda_loopnest_to_gcc_loopnest (struct loop *,
+bool lambda_loopnest_to_gcc_loopnest (struct loop *,
 				      VEC(tree,heap) *, VEC(tree,heap) *,
 				      VEC(tree,heap) **,
                                       lambda_loopnest, lambda_trans_matrix,
Index: lambda-code.c
===================================================================
--- lambda-code.c	(revision 129032)
+++ lambda-code.c	(working copy)
@@ -1675,6 +1675,37 @@ remove_iv (tree iv_stmt)
     }
 }
 
+/* Return true if we are sure that the code generation cannot be
+   performed for this loop nest.  */
+
+static bool
+lambda_cannot_generate_loop_nest (VEC (tree, heap) *old_ivs)
+{
+  unsigned i;
+  tree oldiv;
+
+  for (i = 0; VEC_iterate (tree, old_ivs, i, oldiv); i++)
+    {
+      imm_use_iterator imm_iter;
+      tree oldiv_def;
+      tree oldiv_stmt = SSA_NAME_DEF_STMT (oldiv);
+      tree stmt;
+
+      if (TREE_CODE (oldiv_stmt) == PHI_NODE)
+        oldiv_def = PHI_RESULT (oldiv_stmt);
+      else
+	oldiv_def = SINGLE_SSA_TREE_OPERAND (oldiv_stmt, SSA_OP_DEF);
+      gcc_assert (oldiv_def != NULL_TREE);
+
+      FOR_EACH_IMM_USE_STMT (stmt, imm_iter, oldiv_def)
+        {
+	  if (TREE_CODE (stmt) == PHI_NODE)
+	    return true;
+	}
+    }
+
+  return false;
+}
 
 /* Transform a lambda loopnest NEW_LOOPNEST, which had TRANSFORM applied to
    it, back into gcc code.  This changes the
@@ -1686,9 +1717,11 @@ remove_iv (tree iv_stmt)
    INVARIANTS is a vector of loop invariants from the old loopnest.
    NEW_LOOPNEST is the new lambda loopnest to replace OLD_LOOPNEST with.
    TRANSFORM is the matrix transform that was applied to OLD_LOOPNEST to get 
-   NEW_LOOPNEST.  */
+   NEW_LOOPNEST.  
+
+   Returns true when the code generation succeeded.  */
 
-void
+bool
 lambda_loopnest_to_gcc_loopnest (struct loop *old_loopnest,
 				 VEC(tree,heap) *old_ivs,
 				 VEC(tree,heap) *invariants,
@@ -1705,6 +1738,14 @@ lambda_loopnest_to_gcc_loopnest (struct 
   
   block_stmt_iterator bsi;
 
+  if (lambda_cannot_generate_loop_nest (old_ivs))
+    {
+      if (dump_file)
+	fprintf (dump_file, "Lambda code generation failed to produce code.\n");
+
+      return false;
+    }
+
   if (dump_file)
     {
       transform = lambda_trans_matrix_inverse (transform);
@@ -1865,6 +1906,8 @@ lambda_loopnest_to_gcc_loopnest (struct 
       VEC_safe_push (tree, heap, *remove_ivs, oldiv_stmt);
     }
   VEC_free (tree, heap, new_ivs);
+
+  return true;
 }
 
 /* Return TRUE if this is not interesting statement from the perspective of
@@ -1969,18 +2012,30 @@ bool
 perfect_nest_p (struct loop *loop)
 {
   basic_block *bbs;
-  size_t i;
+  size_t i, n_bb = 0;
   tree exit_cond;
 
+  /* Loops at depth 0 are perfect nests.  */
   if (!loop->inner)
     return true;
+
   bbs = get_loop_body (loop);
   exit_cond = get_loop_exit_condition (loop);
+
   for (i = 0; i < loop->num_nodes; i++)
     {
       if (bbs[i]->loop_father == loop)
 	{
 	  block_stmt_iterator bsi;
+
+	  /* A nest is not perfect when there are more than three
+	     basic blocks per non innermost loop.  */
+	  if (n_bb++ >= 3)
+	    {
+	      free (bbs);
+	      return false;
+	    }
+
 	  for (bsi = bsi_start (bbs[i]); !bsi_end_p (bsi); bsi_next (&bsi))
 	    {
 	      tree stmt = bsi_stmt (bsi);
@@ -1993,11 +2048,10 @@ perfect_nest_p (struct loop *loop)
 	    }
 	}
     }
+
   free (bbs);
-  /* See if the inner loops are perfectly nested as well.  */
-  if (loop->inner)    
-    return perfect_nest_p (loop->inner);
-  return true;
+
+  return perfect_nest_p (loop->inner);
 }
 
 /* Replace the USES of X in STMT, or uses with the same step as X with Y.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-11 21:32 Preliminary patch for PR23820 and PR24309 Sebastian Pop
@ 2007-10-12  1:16 ` Zdenek Dvorak
  2007-10-12 21:08   ` Sebastian Pop
  0 siblings, 1 reply; 26+ messages in thread
From: Zdenek Dvorak @ 2007-10-12  1:16 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: GCC Patches

Hello,

> What is the name of the pass that cannonicalizes the CFG removing
> empty basic blocks containing only labels, or just a goto, as the
> following one?
> 
> 
>  bb_9 (preds = {bb_7 }, succs = {bb_10 })
>     {
>     <bb 9>:
> 
>     }

cfg cleanup does that in gcc.  

Zdenek

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-12  1:16 ` Zdenek Dvorak
@ 2007-10-12 21:08   ` Sebastian Pop
  2007-10-12 21:21     ` Richard Guenther
                       ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Sebastian Pop @ 2007-10-12 21:08 UTC (permalink / raw)
  To: Zdenek Dvorak; +Cc: GCC Patches

On 10/11/07, Zdenek Dvorak <rakdver@kam.mff.cuni.cz> wrote:
>
> cfg cleanup does that in gcc.
>

That's not helping removing these BBs as they are just before a BB
with more than one entry, so they are considered as loop latch BBs.

I'm still thinking about how to adapt the analyzer to not reject
ltrans-3.c and not to fail on the two PRs, but the pattern in these
three cases is exactly the same, I cannot distinguish in between: all
the statements outside the innermost loop are just either conditions,
or the update of the main induction variable.

I would like to have a more strict definition of perfectly nested
loops, by forbidding more than a COND_EXPR in the body of the outer
loops: i.e. the only COND_EXPR of the outer loop should be the exit
condition.  But this is too strict for the case in ltrans-3.c.

Do we have an option to XFAIL ltrans-3.c?  And of course, file an
optimization PR for the following problem:

The code for ltrans-3.c is

double u[1782225];
int foo(int N, int *res)
{
  unsigned int i, j;
  double sum = 0;
      for (i = 0; i < N; i++)
	{
	  for (j = 0; j < N; j++)
	    {
	      sum = sum + u[i + 1335 * j];
	    }
	}
      *res = sum + N;
}

we check that N != 0 before entering the first loop, and also inside
this loop, we check for the exact same expression N != 0 before
entering the innermost loop.  N not being modified in between these
two checks, the second cond_expr is redundant.  VRP or a constant
propagator could be adapted to optimize this kind of double checking.

Sebastian

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-12 21:08   ` Sebastian Pop
@ 2007-10-12 21:21     ` Richard Guenther
  2007-10-12 22:10       ` Zdenek Dvorak
  2007-10-12 22:16     ` Zdenek Dvorak
  2007-10-15 17:49     ` Sebastian Pop
  2 siblings, 1 reply; 26+ messages in thread
From: Richard Guenther @ 2007-10-12 21:21 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Zdenek Dvorak, GCC Patches

On 10/12/07, Sebastian Pop <sebpop@gmail.com> wrote:
> On 10/11/07, Zdenek Dvorak <rakdver@kam.mff.cuni.cz> wrote:
> >
> > cfg cleanup does that in gcc.
> >
>
> That's not helping removing these BBs as they are just before a BB
> with more than one entry, so they are considered as loop latch BBs.
>
> I'm still thinking about how to adapt the analyzer to not reject
> ltrans-3.c and not to fail on the two PRs, but the pattern in these
> three cases is exactly the same, I cannot distinguish in between: all
> the statements outside the innermost loop are just either conditions,
> or the update of the main induction variable.
>
> I would like to have a more strict definition of perfectly nested
> loops, by forbidding more than a COND_EXPR in the body of the outer
> loops: i.e. the only COND_EXPR of the outer loop should be the exit
> condition.  But this is too strict for the case in ltrans-3.c.
>
> Do we have an option to XFAIL ltrans-3.c?  And of course, file an
> optimization PR for the following problem:
>
> The code for ltrans-3.c is
>
> double u[1782225];
> int foo(int N, int *res)
> {
>   unsigned int i, j;
>   double sum = 0;
>       for (i = 0; i < N; i++)
>         {
>           for (j = 0; j < N; j++)
>             {
>               sum = sum + u[i + 1335 * j];
>             }
>         }
>       *res = sum + N;
> }
>
> we check that N != 0 before entering the first loop, and also inside
> this loop, we check for the exact same expression N != 0 before
> entering the innermost loop.  N not being modified in between these
> two checks, the second cond_expr is redundant.  VRP or a constant
> propagator could be adapted to optimize this kind of double checking.

loop header copying is run after the first VRP pass, but DOM usually
catches this.  Now in this case DOM needs enabling transformations.

Why do we run loop header copying so late?  If it doesn't cause
problems if would run it as first optimization after inlining (which
obviously fixes the problems you see, the first VRP pass removes
the comparison).  The last place in the pass pipeline that still
works is just before FRE.

Richard.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-12 21:21     ` Richard Guenther
@ 2007-10-12 22:10       ` Zdenek Dvorak
  2007-10-13 12:23         ` Richard Guenther
  0 siblings, 1 reply; 26+ messages in thread
From: Zdenek Dvorak @ 2007-10-12 22:10 UTC (permalink / raw)
  To: Richard Guenther; +Cc: Sebastian Pop, GCC Patches

Hello,

> loop header copying is run after the first VRP pass, but DOM usually
> catches this.  Now in this case DOM needs enabling transformations.
> 
> Why do we run loop header copying so late?  If it doesn't cause
> problems if would run it as first optimization after inlining (which
> obviously fixes the problems you see, the first VRP pass removes
> the comparison).  The last place in the pass pipeline that still
> works is just before FRE.

in some cases, moving loop header copying earlier led to missed
optimizations.  Like e.g. in the following code, before loop header
copying:

i = 0;
while (1)
  {
    if (i >= n) break;

    if (i >= n)
      error ("access outside of array");
    a[i] = i;

    i++;
  }

the second check for i >= n will obviously be eliminated.  After loop
header copying,

i = 0;
if (0 < n)
  {
    while (1)
      {
        if (i >= n)
          error ("access outside of array");
        a[i] = i;

        i++;
        if (i >= n) break;
      }
  }

it is much harder to determine that the condition is redundant
(certainly we were not able to do that before VRP was introduced; I am
not sure whether VRP handles this case or not).  So we run loop
header copying only after some basic cleanups (especially DOM)
had chance to simplify such conditions.

Zdenek

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-12 21:08   ` Sebastian Pop
  2007-10-12 21:21     ` Richard Guenther
@ 2007-10-12 22:16     ` Zdenek Dvorak
  2007-10-12 22:32       ` Sebastian Pop
  2007-10-15 17:49     ` Sebastian Pop
  2 siblings, 1 reply; 26+ messages in thread
From: Zdenek Dvorak @ 2007-10-12 22:16 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: GCC Patches

Hello,

> On 10/11/07, Zdenek Dvorak <rakdver@kam.mff.cuni.cz> wrote:
> >
> > cfg cleanup does that in gcc.
> >
> 
> That's not helping removing these BBs as they are just before a BB
> with more than one entry, so they are considered as loop latch BBs.

yes, these blocks cannot be removed as long as we keep the loops in
the shape when latch has just one successor.  They are removed once
we no longer update the loops.  I am not sure, do you suggest that
we should not do this, or why are you asking?  Certainly, the
presence of the empty latch block does not make the analysis in ltrans
any harder (not any simpler, of course).

> I'm still thinking about how to adapt the analyzer to not reject
> ltrans-3.c and not to fail on the two PRs, but the pattern in these
> three cases is exactly the same, I cannot distinguish in between: all
> the statements outside the innermost loop are just either conditions,
> or the update of the main induction variable.
> 
> I would like to have a more strict definition of perfectly nested
> loops, by forbidding more than a COND_EXPR in the body of the outer
> loops: i.e. the only COND_EXPR of the outer loop should be the exit
> condition.

Yes, this seems to be the right condition.

> But this is too strict for the case in ltrans-3.c.
> 
> Do we have an option to XFAIL ltrans-3.c?  And of course, file an
> optimization PR for the following problem:
> 
> The code for ltrans-3.c is
> 
> double u[1782225];
> int foo(int N, int *res)
> {
>   unsigned int i, j;
>   double sum = 0;
>       for (i = 0; i < N; i++)
> 	{
> 	  for (j = 0; j < N; j++)
> 	    {
> 	      sum = sum + u[i + 1335 * j];
> 	    }
> 	}
>       *res = sum + N;
> }
> 
> we check that N != 0 before entering the first loop, and also inside
> this loop, we check for the exact same expression N != 0 before
> entering the innermost loop.  N not being modified in between these
> two checks, the second cond_expr is redundant.  VRP or a constant
> propagator could be adapted to optimize this kind of double checking.

Both VRP and DOM should be able to eliminate such a condition, already.

Zdenek

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-12 22:16     ` Zdenek Dvorak
@ 2007-10-12 22:32       ` Sebastian Pop
  2007-10-12 23:05         ` Sebastian Pop
  0 siblings, 1 reply; 26+ messages in thread
From: Sebastian Pop @ 2007-10-12 22:32 UTC (permalink / raw)
  To: Zdenek Dvorak; +Cc: GCC Patches

On 10/12/07, Zdenek Dvorak <rakdver@kam.mff.cuni.cz> wrote:
> yes, these blocks cannot be removed as long as we keep the loops in
> the shape when latch has just one successor.  They are removed once
> we no longer update the loops.  I am not sure, do you suggest that
> we should not do this, or why are you asking?

No.  I was just looking for a pass that would allowed me to clean the
CFG in such a way that I can then count the number of BBs per loop for
determining whether it is a perfect nest or not.

> Certainly, the
> presence of the empty latch block does not make the analysis in ltrans
> any harder (not any simpler, of course).
>

True, I could look at the number of useful statements per block.

> > two checks, the second cond_expr is redundant.  VRP or a constant
> > propagator could be adapted to optimize this kind of double checking.
>
> Both VRP and DOM should be able to eliminate such a condition, already.
>

True, if I'm dumping the value ranges that VRP ends with, we have

N.0_28: [1, +INF]  EQUIVALENCES: { N.0_8 } (1 elements)

that is enough to deduce that "if (N != 0)" is true.  I'm looking into
why VRP doesn't trigger the elimination of the condition.

Sebastian

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-12 22:32       ` Sebastian Pop
@ 2007-10-12 23:05         ` Sebastian Pop
  2007-10-12 23:18           ` Sebastian Pop
  0 siblings, 1 reply; 26+ messages in thread
From: Sebastian Pop @ 2007-10-12 23:05 UTC (permalink / raw)
  To: Zdenek Dvorak; +Cc: GCC Patches

On 10/12/07, Sebastian Pop <sebpop@gmail.com> wrote:
> True, if I'm dumping the value ranges that VRP ends with, we have
>
> N.0_28: [1, +INF]  EQUIVALENCES: { N.0_8 } (1 elements)
>
> that is enough to deduce that "if (N != 0)" is true.  I'm looking into
> why VRP doesn't trigger the elimination of the condition.
>

That's really strange, as after first VRP we don't have any N!=0
condition.  I suppose that DCE is in fault there: in passes.c we have
the following sequence around the first VRP:

      NEXT_PASS (pass_vrp);
      NEXT_PASS (pass_dce);
      NEXT_PASS (pass_cselim);

Now if I'm duplicating VRP just after DCE,

      NEXT_PASS (pass_vrp);
      NEXT_PASS (pass_dce);
      NEXT_PASS (pass_vrp);
      NEXT_PASS (pass_cselim);

the N!=0 in the outer loop disappears, and the ltrans-3.c testcase
passes with the stricter definition of the perfect nested loop.

Sebastian

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-12 23:05         ` Sebastian Pop
@ 2007-10-12 23:18           ` Sebastian Pop
  2007-10-13  0:06             ` Zdenek Dvorak
  2007-10-13 12:26             ` Richard Guenther
  0 siblings, 2 replies; 26+ messages in thread
From: Sebastian Pop @ 2007-10-12 23:18 UTC (permalink / raw)
  To: Diego Novillo, Zdenek Dvorak; +Cc: GCC Patches

On 10/12/07, Sebastian Pop <sebpop@gmail.com> wrote:
> On 10/12/07, Sebastian Pop <sebpop@gmail.com> wrote:
> > True, if I'm dumping the value ranges that VRP ends with, we have
> >
> > N.0_28: [1, +INF]  EQUIVALENCES: { N.0_8 } (1 elements)
> >
> > that is enough to deduce that "if (N != 0)" is true.  I'm looking into
> > why VRP doesn't trigger the elimination of the condition.
> >
>
> That's really strange, as after first VRP we don't have any N!=0
> condition.  I suppose that DCE is in fault there: in passes.c we have
> the following sequence around the first VRP:
>
>       NEXT_PASS (pass_vrp);
>       NEXT_PASS (pass_dce);
>       NEXT_PASS (pass_cselim);
>
> Now if I'm duplicating VRP just after DCE,
>
>       NEXT_PASS (pass_vrp);
>       NEXT_PASS (pass_dce);
>       NEXT_PASS (pass_vrp);
>       NEXT_PASS (pass_cselim);
>
> the N!=0 in the outer loop disappears, and the ltrans-3.c testcase
> passes with the stricter definition of the perfect nested loop.
>

The following patch solves the problem.  Diego, would that be
acceptable?  Any strong reason to have DCE after VRP?

Sebastian

Index: passes.c
===================================================================
--- passes.c	(revision 129276)
+++ passes.c	(working copy)
@@ -567,8 +567,8 @@ init_optimization_passes (void)
       NEXT_PASS (pass_forwprop);
       NEXT_PASS (pass_copy_prop);
       NEXT_PASS (pass_merge_phi);
-      NEXT_PASS (pass_vrp);
       NEXT_PASS (pass_dce);
+      NEXT_PASS (pass_vrp);
       NEXT_PASS (pass_cselim);
       NEXT_PASS (pass_dominator);
       /* The only const/copy propagation opportunities left after

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-12 23:18           ` Sebastian Pop
@ 2007-10-13  0:06             ` Zdenek Dvorak
  2007-10-13 12:26             ` Richard Guenther
  1 sibling, 0 replies; 26+ messages in thread
From: Zdenek Dvorak @ 2007-10-13  0:06 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Diego Novillo, GCC Patches

Hello,

> > > True, if I'm dumping the value ranges that VRP ends with, we have
> > >
> > > N.0_28: [1, +INF]  EQUIVALENCES: { N.0_8 } (1 elements)
> > >
> > > that is enough to deduce that "if (N != 0)" is true.  I'm looking into
> > > why VRP doesn't trigger the elimination of the condition.
> > >
> >
> > That's really strange, as after first VRP we don't have any N!=0
> > condition.  I suppose that DCE is in fault there: in passes.c we have
> > the following sequence around the first VRP:
> >
> >       NEXT_PASS (pass_vrp);
> >       NEXT_PASS (pass_dce);
> >       NEXT_PASS (pass_cselim);
> >
> > Now if I'm duplicating VRP just after DCE,
> >
> >       NEXT_PASS (pass_vrp);
> >       NEXT_PASS (pass_dce);
> >       NEXT_PASS (pass_vrp);
> >       NEXT_PASS (pass_cselim);
> >
> > the N!=0 in the outer loop disappears, and the ltrans-3.c testcase
> > passes with the stricter definition of the perfect nested loop.
> >
> 
> The following patch solves the problem.

why does this patch help?  It seems to me that dce should not affect
vrp.

> Diego, would that be acceptable?  Any strong reason to have DCE after
> VRP?

Well, vrp may create dead code, so the having dce after vrp seems more
natural.

Zdenek

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-12 22:10       ` Zdenek Dvorak
@ 2007-10-13 12:23         ` Richard Guenther
  0 siblings, 0 replies; 26+ messages in thread
From: Richard Guenther @ 2007-10-13 12:23 UTC (permalink / raw)
  To: Zdenek Dvorak; +Cc: Sebastian Pop, GCC Patches

On 10/13/07, Zdenek Dvorak <rakdver@kam.mff.cuni.cz> wrote:
> Hello,
>
> > loop header copying is run after the first VRP pass, but DOM usually
> > catches this.  Now in this case DOM needs enabling transformations.
> >
> > Why do we run loop header copying so late?  If it doesn't cause
> > problems if would run it as first optimization after inlining (which
> > obviously fixes the problems you see, the first VRP pass removes
> > the comparison).  The last place in the pass pipeline that still
> > works is just before FRE.
>
> in some cases, moving loop header copying earlier led to missed
> optimizations.  Like e.g. in the following code, before loop header
> copying:
>
> i = 0;
> while (1)
>   {
>     if (i >= n) break;
>
>     if (i >= n)
>       error ("access outside of array");
>     a[i] = i;
>
>     i++;
>   }
>
> the second check for i >= n will obviously be eliminated.  After loop
> header copying,
>
> i = 0;
> if (0 < n)
>   {
>     while (1)
>       {
>         if (i >= n)
>           error ("access outside of array");
>         a[i] = i;
>
>         i++;
>         if (i >= n) break;
>       }
>   }
>
> it is much harder to determine that the condition is redundant
> (certainly we were not able to do that before VRP was introduced; I am
> not sure whether VRP handles this case or not).  So we run loop
> header copying only after some basic cleanups (especially DOM)
> had chance to simplify such conditions.

I realize this, though for the testcase DOM after CH but before
loop optimizations doesn't get the simplification because simple
copy-prop through PHI is missing.  For your example above I think
VRP should still be able to remove the redundant comparison
(at least for signed i).

Richard.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-12 23:18           ` Sebastian Pop
  2007-10-13  0:06             ` Zdenek Dvorak
@ 2007-10-13 12:26             ` Richard Guenther
  2007-10-13 16:06               ` Sebastian Pop
  1 sibling, 1 reply; 26+ messages in thread
From: Richard Guenther @ 2007-10-13 12:26 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Diego Novillo, Zdenek Dvorak, GCC Patches

On 10/13/07, Sebastian Pop <sebpop@gmail.com> wrote:
> On 10/12/07, Sebastian Pop <sebpop@gmail.com> wrote:
> > On 10/12/07, Sebastian Pop <sebpop@gmail.com> wrote:
> > > True, if I'm dumping the value ranges that VRP ends with, we have
> > >
> > > N.0_28: [1, +INF]  EQUIVALENCES: { N.0_8 } (1 elements)
> > >
> > > that is enough to deduce that "if (N != 0)" is true.  I'm looking into
> > > why VRP doesn't trigger the elimination of the condition.
> > >
> >
> > That's really strange, as after first VRP we don't have any N!=0
> > condition.  I suppose that DCE is in fault there: in passes.c we have
> > the following sequence around the first VRP:
> >
> >       NEXT_PASS (pass_vrp);
> >       NEXT_PASS (pass_dce);
> >       NEXT_PASS (pass_cselim);
> >
> > Now if I'm duplicating VRP just after DCE,
> >
> >       NEXT_PASS (pass_vrp);
> >       NEXT_PASS (pass_dce);
> >       NEXT_PASS (pass_vrp);
> >       NEXT_PASS (pass_cselim);
> >
> > the N!=0 in the outer loop disappears, and the ltrans-3.c testcase
> > passes with the stricter definition of the perfect nested loop.
> >
>
> The following patch solves the problem.  Diego, would that be
> acceptable?  Any strong reason to have DCE after VRP?

Huh?  That patch should make zero difference.

Richard.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-13 12:26             ` Richard Guenther
@ 2007-10-13 16:06               ` Sebastian Pop
  2007-10-13 17:19                 ` Richard Guenther
  0 siblings, 1 reply; 26+ messages in thread
From: Sebastian Pop @ 2007-10-13 16:06 UTC (permalink / raw)
  To: Richard Guenther; +Cc: GCC Patches

On 10/13/07, Richard Guenther <richard.guenther@gmail.com> wrote:
> Huh?  That patch should make zero difference.
>

However, it does.  I double checked that without that patch ltrans-3
fails, and with the patch it passes.  Another sequence that works for
ltrans-3 is duplicating DCE before VRP.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-13 16:06               ` Sebastian Pop
@ 2007-10-13 17:19                 ` Richard Guenther
  2007-10-13 17:35                   ` Richard Guenther
  0 siblings, 1 reply; 26+ messages in thread
From: Richard Guenther @ 2007-10-13 17:19 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: GCC Patches

On 10/13/07, Sebastian Pop <sebpop@gmail.com> wrote:
> On 10/13/07, Richard Guenther <richard.guenther@gmail.com> wrote:
> > Huh?  That patch should make zero difference.
> >
>
> However, it does.  I double checked that without that patch ltrans-3
> fails, and with the patch it passes.  Another sequence that works for
> ltrans-3 is duplicating DCE before VRP.

I checked as well and the dumps after these both passes are the same.
Also both passes are before loop-header copying.  The difference it
seems to make is that DCE removes a forwarder block which it doesn't
do if it runs after VRP.  For some reason.

Richard.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-13 17:19                 ` Richard Guenther
@ 2007-10-13 17:35                   ` Richard Guenther
  2007-10-13 17:43                     ` Sebastian Pop
  2007-10-14  2:25                     ` Zdenek Dvorak
  0 siblings, 2 replies; 26+ messages in thread
From: Richard Guenther @ 2007-10-13 17:35 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: GCC Patches

On 10/13/07, Richard Guenther <richard.guenther@gmail.com> wrote:
> On 10/13/07, Sebastian Pop <sebpop@gmail.com> wrote:
> > On 10/13/07, Richard Guenther <richard.guenther@gmail.com> wrote:
> > > Huh?  That patch should make zero difference.
> > >
> >
> > However, it does.  I double checked that without that patch ltrans-3
> > fails, and with the patch it passes.  Another sequence that works for
> > ltrans-3 is duplicating DCE before VRP.
>
> I checked as well and the dumps after these both passes are the same.
> Also both passes are before loop-header copying.  The difference it
> seems to make is that DCE removes a forwarder block which it doesn't
> do if it runs after VRP.  For some reason.

Nope.  VRP inserts loop preheaders.  So, does

Index: tree-vrp.c
===================================================================
--- tree-vrp.c  (revision 129282)
+++ tree-vrp.c  (working copy)
@@ -6089,7 +6089,7 @@ vrp_finalize (void)
 static unsigned int
 execute_vrp (void)
 {
-  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
   rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
   scev_initialize ();

fix it as well?

Richard.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-13 17:35                   ` Richard Guenther
@ 2007-10-13 17:43                     ` Sebastian Pop
  2007-10-14  2:25                     ` Zdenek Dvorak
  1 sibling, 0 replies; 26+ messages in thread
From: Sebastian Pop @ 2007-10-13 17:43 UTC (permalink / raw)
  To: Richard Guenther; +Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 850 bytes --]

On 10/13/07, Richard Guenther <richard.guenther@gmail.com> wrote:
> Nope.  VRP inserts loop preheaders.  So, does
>
> Index: tree-vrp.c
> ===================================================================
> --- tree-vrp.c  (revision 129282)
> +++ tree-vrp.c  (working copy)
> @@ -6089,7 +6089,7 @@ vrp_finalize (void)
>  static unsigned int
>  execute_vrp (void)
>  {
> -  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
> +  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
>    rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
>    scev_initialize ();
>
> fix it as well?
>

Nope, and there are a bunch of other tree-ssa.exp fails with that patch.
Attached is the patch that I used and that allows the ltrans-3 to pass.
This patch passed bootstrap on amd64-linux and test with one extra fail:
gcc.dg/fold-compare-2.c

Sebastian

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 805_pr24309.diff --]
[-- Type: text/x-patch; name="805_pr24309.diff", Size: 9774 bytes --]

Index: tree-loop-linear.c
===================================================================
--- tree-loop-linear.c	(revision 129276)
+++ tree-loop-linear.c	(working copy)
@@ -244,6 +244,46 @@ try_interchange_loops (lambda_trans_matr
   return trans;
 }
 
+/* Return the number of nested loops in LOOP_NEST, or 0 if the loops
+   are not perfectly nested.  */
+
+static unsigned int
+perfect_loop_nest_depth (struct loop *loop_nest)
+{
+  struct loop *temp;
+  unsigned int depth = 1;
+
+  /* If it's not a loop nest, we don't want it.  We also don't handle
+     sibling loops properly, which are loops of the following form:
+
+     | for (i = 0; i < 50; i++)
+     |   {
+     |     for (j = 0; j < 50; j++)
+     |       {
+     |        ...
+     |       }
+     |     for (j = 0; j < 50; j++)
+     |       {
+     |        ...
+     |       }
+     |   }
+  */
+
+  if (!loop_nest->inner || !single_exit (loop_nest))
+    return 0;
+
+  for (temp = loop_nest->inner; temp; temp = temp->inner)
+    {
+      /* If we have a sibling loop or multiple exit edges, jump ship.  */
+      if (temp->next || !single_exit (temp))
+	return 0;
+
+      depth++;
+    }
+
+  return depth;
+}
+
 /* Perform a set of linear transforms on loops.  */
 
 void
@@ -263,47 +303,18 @@ linear_transform_loops (void)
       unsigned int depth = 0;
       VEC (ddr_p, heap) *dependence_relations;
       VEC (data_reference_p, heap) *datarefs;
-      struct loop *temp;
       lambda_loopnest before, after;
       lambda_trans_matrix trans;
-      bool problem = false;
       struct obstack lambda_obstack;
       gcc_obstack_init (&lambda_obstack);
 
-      /* If it's not a loop nest, we don't want it.
-         We also don't handle sibling loops properly, 
-         which are loops of the following form:
-         for (i = 0; i < 50; i++)
-           {
-             for (j = 0; j < 50; j++)
-               {
-	        ...
-               }
-             for (j = 0; j < 50; j++)
-               {
-                ...
-               }
-           } */
-      if (!loop_nest->inner || !single_exit (loop_nest))
+      depth = perfect_loop_nest_depth (loop_nest);
+      if (depth == 0)
 	continue;
+
       VEC_truncate (tree, oldivs, 0);
       VEC_truncate (tree, invariants, 0);
-      depth = 1;
-      for (temp = loop_nest->inner; temp; temp = temp->inner)
-	{
-	  /* If we have a sibling loop or multiple exit edges, jump ship.  */
-	  if (temp->next || !single_exit (temp))
-	    {
-	      problem = true;
-	      break;
-	    }
-	  depth ++;
-	}
-      if (problem)
-	continue;
 
-      /* Analyze data references and dependence relations using scev.  */      
- 
       datarefs = VEC_alloc (data_reference_p, heap, 10);
       dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
       compute_data_dependences_for_loop (loop_nest, true, &datarefs,
@@ -353,12 +364,12 @@ linear_transform_loops (void)
 	  print_lambda_loopnest (dump_file, after, 'u');
 	}
 
-      lambda_loopnest_to_gcc_loopnest (loop_nest, oldivs, invariants,
-				       &remove_ivs,
-                                       after, trans, &lambda_obstack);
-      modified = true;
+      modified = lambda_loopnest_to_gcc_loopnest (loop_nest, oldivs, 
+						  invariants, &remove_ivs,
+						  after, trans,
+						  &lambda_obstack);
 
-      if (dump_file)
+      if (modified && dump_file)
 	fprintf (dump_file, "Successfully transformed loop.\n");
 
     free_and_continue:
Index: testsuite/gcc.dg/tree-ssa/pr23820.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr23820.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/pr23820.c	(revision 0)
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-loop-linear" } */
+
+int t [2][4];
+
+void foo (void)
+{
+  int i, j, k, v;
+  float e;
+  for (;;)
+    {
+      v = 0;
+      for (j = 0; j < 2; j ++)
+        {
+          for (k = 2; k < 4; k ++)
+            {
+              e = 0.0;
+              for (i = 0; i < 4; i ++)
+                e += t [j][i];
+              if (e)
+                v = j;
+            }
+        }
+      t [v][0] = 0;
+    }
+}
Index: testsuite/gcc.dg/tree-ssa/ltrans-3.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ltrans-3.c	(revision 129276)
+++ testsuite/gcc.dg/tree-ssa/ltrans-3.c	(working copy)
@@ -17,5 +17,5 @@ int foo(int N, int *res)
       *res = sum + N;
 }
 
-/* { dg-final { scan-tree-dump-times "transformed loop" 1 "ltrans"} } */ 
+/* { dg-final { scan-tree-dump-times "transformed loop" 1 "ltrans" } } */ 
 /* { dg-final { cleanup-tree-dump "ltrans" } } */
Index: testsuite/gcc.dg/tree-ssa/pr24309.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr24309.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/pr24309.c	(revision 0)
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-loop-linear" } */
+
+float weight[10];
+void lsp_weight_quant(float *x, char *cdbk)
+{
+   int i,j;
+   float dist;
+   int best_id=0;
+   for (i=0;i<16;i++)
+   {
+      for (j=0;j<10;j++)
+         dist=dist+weight[j];
+      if (dist<0)
+         best_id=i;
+   }
+   x[j] = cdbk[best_id*10+j];
+}
Index: lambda.h
===================================================================
--- lambda.h	(revision 129276)
+++ lambda.h	(working copy)
@@ -204,7 +204,7 @@ lambda_loopnest gcc_loopnest_to_lambda_l
 						 VEC(tree,heap) **,
                                                  VEC(tree,heap) **,
                                                  struct obstack *);
-void lambda_loopnest_to_gcc_loopnest (struct loop *,
+bool lambda_loopnest_to_gcc_loopnest (struct loop *,
 				      VEC(tree,heap) *, VEC(tree,heap) *,
 				      VEC(tree,heap) **,
                                       lambda_loopnest, lambda_trans_matrix,
Index: lambda-code.c
===================================================================
--- lambda-code.c	(revision 129276)
+++ lambda-code.c	(working copy)
@@ -1675,6 +1675,37 @@ remove_iv (tree iv_stmt)
     }
 }
 
+/* Return true if we are sure that the code generation cannot be
+   performed for this loop nest.  */
+
+static bool
+lambda_cannot_generate_loop_nest (VEC (tree, heap) *old_ivs)
+{
+  unsigned i;
+  tree oldiv;
+
+  for (i = 0; VEC_iterate (tree, old_ivs, i, oldiv); i++)
+    {
+      imm_use_iterator imm_iter;
+      tree oldiv_def;
+      tree oldiv_stmt = SSA_NAME_DEF_STMT (oldiv);
+      tree stmt;
+
+      if (TREE_CODE (oldiv_stmt) == PHI_NODE)
+        oldiv_def = PHI_RESULT (oldiv_stmt);
+      else
+	oldiv_def = SINGLE_SSA_TREE_OPERAND (oldiv_stmt, SSA_OP_DEF);
+      gcc_assert (oldiv_def != NULL_TREE);
+
+      FOR_EACH_IMM_USE_STMT (stmt, imm_iter, oldiv_def)
+        {
+	  if (TREE_CODE (stmt) == PHI_NODE)
+	    return true;
+	}
+    }
+
+  return false;
+}
 
 /* Transform a lambda loopnest NEW_LOOPNEST, which had TRANSFORM applied to
    it, back into gcc code.  This changes the
@@ -1686,9 +1717,11 @@ remove_iv (tree iv_stmt)
    INVARIANTS is a vector of loop invariants from the old loopnest.
    NEW_LOOPNEST is the new lambda loopnest to replace OLD_LOOPNEST with.
    TRANSFORM is the matrix transform that was applied to OLD_LOOPNEST to get 
-   NEW_LOOPNEST.  */
+   NEW_LOOPNEST.  
+
+   Returns true when the code generation succeeded.  */
 
-void
+bool
 lambda_loopnest_to_gcc_loopnest (struct loop *old_loopnest,
 				 VEC(tree,heap) *old_ivs,
 				 VEC(tree,heap) *invariants,
@@ -1705,6 +1738,14 @@ lambda_loopnest_to_gcc_loopnest (struct 
   
   block_stmt_iterator bsi;
 
+  if (lambda_cannot_generate_loop_nest (old_ivs))
+    {
+      if (dump_file)
+	fprintf (dump_file, "Lambda code generation failed to produce code.\n");
+
+      return false;
+    }
+
   if (dump_file)
     {
       transform = lambda_trans_matrix_inverse (transform);
@@ -1865,6 +1906,8 @@ lambda_loopnest_to_gcc_loopnest (struct 
       VEC_safe_push (tree, heap, *remove_ivs, oldiv_stmt);
     }
   VEC_free (tree, heap, new_ivs);
+
+  return true;
 }
 
 /* Return TRUE if this is not interesting statement from the perspective of
@@ -1972,32 +2015,42 @@ perfect_nest_p (struct loop *loop)
   size_t i;
   tree exit_cond;
 
+  /* Loops at depth 0 are perfect nests.  */
   if (!loop->inner)
     return true;
+
   bbs = get_loop_body (loop);
   exit_cond = get_loop_exit_condition (loop);
+
   for (i = 0; i < loop->num_nodes; i++)
     {
       if (bbs[i]->loop_father == loop)
 	{
 	  block_stmt_iterator bsi;
+
 	  for (bsi = bsi_start (bbs[i]); !bsi_end_p (bsi); bsi_next (&bsi))
 	    {
 	      tree stmt = bsi_stmt (bsi);
+
+	      if (TREE_CODE (stmt) == COND_EXPR
+		  && exit_cond != stmt)
+		goto non_perfectly_nested;
+
 	      if (stmt == exit_cond
 		  || not_interesting_stmt (stmt)
 		  || stmt_is_bumper_for_loop (loop, stmt))
 		continue;
+
+	    non_perfectly_nested:
 	      free (bbs);
 	      return false;
 	    }
 	}
     }
+
   free (bbs);
-  /* See if the inner loops are perfectly nested as well.  */
-  if (loop->inner)    
-    return perfect_nest_p (loop->inner);
-  return true;
+
+  return perfect_nest_p (loop->inner);
 }
 
 /* Replace the USES of X in STMT, or uses with the same step as X with Y.
Index: passes.c
===================================================================
--- passes.c	(revision 129276)
+++ passes.c	(working copy)
@@ -567,8 +567,8 @@ init_optimization_passes (void)
       NEXT_PASS (pass_forwprop);
       NEXT_PASS (pass_copy_prop);
       NEXT_PASS (pass_merge_phi);
-      NEXT_PASS (pass_vrp);
       NEXT_PASS (pass_dce);
+      NEXT_PASS (pass_vrp);
       NEXT_PASS (pass_cselim);
       NEXT_PASS (pass_dominator);
       /* The only const/copy propagation opportunities left after

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-13 17:35                   ` Richard Guenther
  2007-10-13 17:43                     ` Sebastian Pop
@ 2007-10-14  2:25                     ` Zdenek Dvorak
  1 sibling, 0 replies; 26+ messages in thread
From: Zdenek Dvorak @ 2007-10-14  2:25 UTC (permalink / raw)
  To: Richard Guenther; +Cc: Sebastian Pop, GCC Patches

Hello,

> On 10/13/07, Richard Guenther <richard.guenther@gmail.com> wrote:
> > On 10/13/07, Sebastian Pop <sebpop@gmail.com> wrote:
> > > On 10/13/07, Richard Guenther <richard.guenther@gmail.com> wrote:
> > > > Huh?  That patch should make zero difference.
> > > >
> > >
> > > However, it does.  I double checked that without that patch ltrans-3
> > > fails, and with the patch it passes.  Another sequence that works for
> > > ltrans-3 is duplicating DCE before VRP.
> >
> > I checked as well and the dumps after these both passes are the same.
> > Also both passes are before loop-header copying.  The difference it
> > seems to make is that DCE removes a forwarder block which it doesn't
> > do if it runs after VRP.  For some reason.
> 
> Nope.  VRP inserts loop preheaders.  So, does
> 
> Index: tree-vrp.c
> ===================================================================
> --- tree-vrp.c  (revision 129282)
> +++ tree-vrp.c  (working copy)
> @@ -6089,7 +6089,7 @@ vrp_finalize (void)
>  static unsigned int
>  execute_vrp (void)
>  {
> -  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
> +  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
>    rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
>    scev_initialize ();
> 
> fix it as well?

this change should cause ICEs, as we assume that loops have
just a single latch edge (and recorded exits) in scev code.

Zdenek

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-12 21:08   ` Sebastian Pop
  2007-10-12 21:21     ` Richard Guenther
  2007-10-12 22:16     ` Zdenek Dvorak
@ 2007-10-15 17:49     ` Sebastian Pop
  2007-10-15 18:13       ` Zdenek Dvorak
  2007-10-21 20:58       ` Eric Botcazou
  2 siblings, 2 replies; 26+ messages in thread
From: Sebastian Pop @ 2007-10-15 17:49 UTC (permalink / raw)
  To: GCC Patches; +Cc: Daniel Berlin

[-- Attachment #1: Type: text/plain, Size: 674 bytes --]

On 10/12/07, Sebastian Pop <sebpop@gmail.com> wrote:
> Do we have an option to XFAIL ltrans-3.c?  And of course, file an
> optimization PR

As we know that we can fix this optimization PR, and because I still
don't see how to implement a solution for that optimization PR, and
because the patch that I'm attaching, with ltrans-3 XFAIL-ed, passes
bootstrap and test on amd64-linux fixing three ICE PRs 23820, 24309,
and 33766, is the patch okay for trunk?

After this patch, as we won't have other PRs open on the loop
interchange (otherwise ping me on these and I'll give them a look),
would it be possible to enable loop interchange at -O3 or -O2 level?

Thanks,
Sebastian

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 807_pr24309.diff --]
[-- Type: text/x-patch; name="807_pr24309.diff", Size: 6549 bytes --]

Index: tree-loop-linear.c
===================================================================
--- tree-loop-linear.c	(revision 129276)
+++ tree-loop-linear.c	(working copy)
@@ -244,6 +244,46 @@ try_interchange_loops (lambda_trans_matr
   return trans;
 }
 
+/* Return the number of nested loops in LOOP_NEST, or 0 if the loops
+   are not perfectly nested.  */
+
+static unsigned int
+perfect_loop_nest_depth (struct loop *loop_nest)
+{
+  struct loop *temp;
+  unsigned int depth = 1;
+
+  /* If it's not a loop nest, we don't want it.  We also don't handle
+     sibling loops properly, which are loops of the following form:
+
+     | for (i = 0; i < 50; i++)
+     |   {
+     |     for (j = 0; j < 50; j++)
+     |       {
+     |        ...
+     |       }
+     |     for (j = 0; j < 50; j++)
+     |       {
+     |        ...
+     |       }
+     |   }
+  */
+
+  if (!loop_nest->inner || !single_exit (loop_nest))
+    return 0;
+
+  for (temp = loop_nest->inner; temp; temp = temp->inner)
+    {
+      /* If we have a sibling loop or multiple exit edges, jump ship.  */
+      if (temp->next || !single_exit (temp))
+	return 0;
+
+      depth++;
+    }
+
+  return depth;
+}
+
 /* Perform a set of linear transforms on loops.  */
 
 void
@@ -263,47 +303,18 @@ linear_transform_loops (void)
       unsigned int depth = 0;
       VEC (ddr_p, heap) *dependence_relations;
       VEC (data_reference_p, heap) *datarefs;
-      struct loop *temp;
       lambda_loopnest before, after;
       lambda_trans_matrix trans;
-      bool problem = false;
       struct obstack lambda_obstack;
       gcc_obstack_init (&lambda_obstack);
 
-      /* If it's not a loop nest, we don't want it.
-         We also don't handle sibling loops properly, 
-         which are loops of the following form:
-         for (i = 0; i < 50; i++)
-           {
-             for (j = 0; j < 50; j++)
-               {
-	        ...
-               }
-             for (j = 0; j < 50; j++)
-               {
-                ...
-               }
-           } */
-      if (!loop_nest->inner || !single_exit (loop_nest))
+      depth = perfect_loop_nest_depth (loop_nest);
+      if (depth == 0)
 	continue;
+
       VEC_truncate (tree, oldivs, 0);
       VEC_truncate (tree, invariants, 0);
-      depth = 1;
-      for (temp = loop_nest->inner; temp; temp = temp->inner)
-	{
-	  /* If we have a sibling loop or multiple exit edges, jump ship.  */
-	  if (temp->next || !single_exit (temp))
-	    {
-	      problem = true;
-	      break;
-	    }
-	  depth ++;
-	}
-      if (problem)
-	continue;
 
-      /* Analyze data references and dependence relations using scev.  */      
- 
       datarefs = VEC_alloc (data_reference_p, heap, 10);
       dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
       compute_data_dependences_for_loop (loop_nest, true, &datarefs,
Index: testsuite/gcc.dg/tree-ssa/pr23820.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr23820.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/pr23820.c	(revision 0)
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-loop-linear" } */
+
+int t [2][4];
+
+void foo (void)
+{
+  int i, j, k, v;
+  float e;
+  for (;;)
+    {
+      v = 0;
+      for (j = 0; j < 2; j ++)
+        {
+          for (k = 2; k < 4; k ++)
+            {
+              e = 0.0;
+              for (i = 0; i < 4; i ++)
+                e += t [j][i];
+              if (e)
+                v = j;
+            }
+        }
+      t [v][0] = 0;
+    }
+}
Index: testsuite/gcc.dg/tree-ssa/ltrans-3.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ltrans-3.c	(revision 129276)
+++ testsuite/gcc.dg/tree-ssa/ltrans-3.c	(working copy)
@@ -17,5 +17,5 @@ int foo(int N, int *res)
       *res = sum + N;
 }
 
-/* { dg-final { scan-tree-dump-times "transformed loop" 1 "ltrans"} } */ 
+/* { dg-final { scan-tree-dump-times "transformed loop" 1 "ltrans" { xfail *-*-* } } } */ 
 /* { dg-final { cleanup-tree-dump "ltrans" } } */
Index: testsuite/gcc.dg/tree-ssa/pr33766.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr33766.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/pr33766.c	(revision 0)
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-loop-linear" } */
+
+float
+fxt1_quantize_ALPHA1()
+{
+        int j1;
+        int i;
+        float *tv;
+        for (j1 = 1; j1; j1++) {
+                float e;
+                for (i = 1; i; i++)
+                        e = tv[i];
+                if (e)
+                        i = j1;
+        }
+        return tv[i];
+}
+
Index: testsuite/gcc.dg/tree-ssa/pr24309.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr24309.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/pr24309.c	(revision 0)
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-loop-linear" } */
+
+float weight[10];
+void lsp_weight_quant(float *x, char *cdbk)
+{
+   int i,j;
+   float dist;
+   int best_id=0;
+   for (i=0;i<16;i++)
+   {
+      for (j=0;j<10;j++)
+         dist=dist+weight[j];
+      if (dist<0)
+         best_id=i;
+   }
+   x[j] = cdbk[best_id*10+j];
+}
Index: lambda-code.c
===================================================================
--- lambda-code.c	(revision 129276)
+++ lambda-code.c	(working copy)
@@ -1972,32 +1972,42 @@ perfect_nest_p (struct loop *loop)
   size_t i;
   tree exit_cond;
 
+  /* Loops at depth 0 are perfect nests.  */
   if (!loop->inner)
     return true;
+
   bbs = get_loop_body (loop);
   exit_cond = get_loop_exit_condition (loop);
+
   for (i = 0; i < loop->num_nodes; i++)
     {
       if (bbs[i]->loop_father == loop)
 	{
 	  block_stmt_iterator bsi;
+
 	  for (bsi = bsi_start (bbs[i]); !bsi_end_p (bsi); bsi_next (&bsi))
 	    {
 	      tree stmt = bsi_stmt (bsi);
+
+	      if (TREE_CODE (stmt) == COND_EXPR
+		  && exit_cond != stmt)
+		goto non_perfectly_nested;
+
 	      if (stmt == exit_cond
 		  || not_interesting_stmt (stmt)
 		  || stmt_is_bumper_for_loop (loop, stmt))
 		continue;
+
+	    non_perfectly_nested:
 	      free (bbs);
 	      return false;
 	    }
 	}
     }
+
   free (bbs);
-  /* See if the inner loops are perfectly nested as well.  */
-  if (loop->inner)    
-    return perfect_nest_p (loop->inner);
-  return true;
+
+  return perfect_nest_p (loop->inner);
 }
 
 /* Replace the USES of X in STMT, or uses with the same step as X with Y.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-15 17:49     ` Sebastian Pop
@ 2007-10-15 18:13       ` Zdenek Dvorak
  2007-10-16 21:56         ` Sebastian Pop
  2007-10-21 20:58       ` Eric Botcazou
  1 sibling, 1 reply; 26+ messages in thread
From: Zdenek Dvorak @ 2007-10-15 18:13 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: GCC Patches, Daniel Berlin

Hello,

> On 10/12/07, Sebastian Pop <sebpop@gmail.com> wrote:
> > Do we have an option to XFAIL ltrans-3.c?  And of course, file an
> > optimization PR
> 
> As we know that we can fix this optimization PR, and because I still
> don't see how to implement a solution for that optimization PR, and
> because the patch that I'm attaching, with ltrans-3 XFAIL-ed, passes
> bootstrap and test on amd64-linux fixing three ICE PRs 23820, 24309,
> and 33766, is the patch okay for trunk?
> 
> After this patch, as we won't have other PRs open on the loop
> interchange (otherwise ping me on these and I'll give them a look),
> would it be possible to enable loop interchange at -O3 or -O2 level?

I think enabling it at -O3 for now would be a good idea,

Zdenek

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-15 18:13       ` Zdenek Dvorak
@ 2007-10-16 21:56         ` Sebastian Pop
  0 siblings, 0 replies; 26+ messages in thread
From: Sebastian Pop @ 2007-10-16 21:56 UTC (permalink / raw)
  To: Zdenek Dvorak; +Cc: GCC Patches, Daniel Berlin

On 10/15/07, Zdenek Dvorak <rakdver@kam.mff.cuni.cz> wrote:
> > After this patch, as we won't have other PRs open on the loop
> > interchange (otherwise ping me on these and I'll give them a look),
> > would it be possible to enable loop interchange at -O3 or -O2 level?
>
> I think enabling it at -O3 for now would be a good idea,
>

I just bootstrapped and tested a patch with loop interchange
enabled at -O3, and there were 6 regressions with it:

gfortran.sum gfortran.dg/alloc_comp_assign_2.f90
gfortran.sum gfortran.dg/pr19928-2.f90
gfortran.sum gfortran.fortran-torture/execute/scalarize2.f90
gfortran.sum gfortran.fortran-torture/execute/scalarize.f90
gfortran.sum gfortran.fortran-torture/execute/where_1.f90
gfortran.sum gfortran.fortran-torture/execute/where_6.f90

They are all failing the execution test for -O3.  I'll have a look at
these fails before proposing a patch to enable loop interchange
at -O3.

-- 
Sebastian dot Pop at AMD dot com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-15 17:49     ` Sebastian Pop
  2007-10-15 18:13       ` Zdenek Dvorak
@ 2007-10-21 20:58       ` Eric Botcazou
  2007-10-22 15:14         ` Sebastian Pop
  1 sibling, 1 reply; 26+ messages in thread
From: Eric Botcazou @ 2007-10-21 20:58 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: gcc-patches, Daniel Berlin

Hi Sebastian,

> As we know that we can fix this optimization PR, and because I still
> don't see how to implement a solution for that optimization PR, and
> because the patch that I'm attaching, with ltrans-3 XFAIL-ed, passes
> bootstrap and test on amd64-linux fixing three ICE PRs 23820, 24309,
> and 33766, is the patch okay for trunk?

2007-10-19  Sebastian Pop  <sebastian.pop@amd.com>

	PR tree-optimization/23820
	PR tree-optimization/24309
	PR tree-optimization/33766
	* testsuite/gcc.dg/tree-ssa/pr23820.c: New.
	* testsuite/gcc.dg/tree-ssa/pr24309.c: New.
	* testsuite/gcc.dg/tree-ssa/pr33766.c: New.
	* testsuite/gcc.dg/tree-ssa/ltrans-3.c: XFAILed.
	* tree-loop-linear.c (perfect_loop_nest_depth): New.
	(linear_transform_loops): Use perfect_loop_nest_depth.
	* lambda-code.c (perfect_nest_p): Outer loops in perfect nests 
	should have a single condition: their exit.


Entries for the testsuite must go to testsuite/ChangeLog.

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-21 20:58       ` Eric Botcazou
@ 2007-10-22 15:14         ` Sebastian Pop
  2007-10-22 15:15           ` Eric Botcazou
  0 siblings, 1 reply; 26+ messages in thread
From: Sebastian Pop @ 2007-10-22 15:14 UTC (permalink / raw)
  To: Eric Botcazou; +Cc: gcc-patches, Daniel Berlin

On 10/21/07, Eric Botcazou <ebotcazou@libertysurf.fr> wrote:
> Entries for the testsuite must go to testsuite/ChangeLog.
>

Sorry, I did not knew of this rule.  I was just following what others did:
if I'm grepping "testsuite/" in the gcc/ChangeLog's, there are dozens
of occurences back to 1998.  In the future, I will avoid putting the
testsuite entries in the main changelog.

Sebastian
--
AMD - GNU Tools

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-22 15:14         ` Sebastian Pop
@ 2007-10-22 15:15           ` Eric Botcazou
  0 siblings, 0 replies; 26+ messages in thread
From: Eric Botcazou @ 2007-10-22 15:15 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: gcc-patches, Daniel Berlin

> Sorry, I did not knew of this rule.  I was just following what others did:
> if I'm grepping "testsuite/" in the gcc/ChangeLog's, there are dozens
> of occurences back to 1998.

You're right, that never occured to me before, just for yours. :-)

> In the future, I will avoid putting the testsuite entries in the main
> changelog. 

Thanks.

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-17 15:47 ` Sebastian Pop
@ 2007-10-18 10:01   ` Ayal Zaks
  0 siblings, 0 replies; 26+ messages in thread
From: Ayal Zaks @ 2007-10-18 10:01 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Daniel Berlin, gcc-patches, Zdenek Dvorak

"Sebastian Pop" <sebpop@gmail.com> wrote on 17/10/2007 17:27:58:

> Thanks Ayal, Ira and Victor for raising these problems.
>
> On 10/17/07, Ayal Zaks <ZAKS@il.ibm.com> wrote:
> >
> > 1. Are the statistics gathered by gather_interchange_stats() invariant,
in
> > the sense that you can pre-gather them once per loop and cache them,
> > instead of re-gathering them twice for each pair of loops? This may
save
> > compile time.
> >
>
> Yes, the information gathered is supposed to be invariant if the
> function is called twice on the exact same loop.  After a loop
> transformation, the information should be different.

Sure, but the transformation is done at the end, after collecting all the
information (multiple times).

>
> With Jan Sjodin, we also started to look at how to gather this
> information from the lambda representation instead of gathering it
> from the gimple/data-ref representation.  This is something that we
> have to address if we want to compose lamdba transforms (for example
> implement skewing followed by interchange), as otherwise we have to
> transform the code back to gimple, then start again the data-ref
> analysis, and finally gather the info.

Good. There too, it should suffice to collect the information once per
loop.

>
> > 2. The priority function
> > >     if (dependence_steps_i < dependence_steps_j
> > >         || nb_deps_not_carried_by_i > nb_deps_not_carried_by_j
> > >         || double_int_ucmp (access_strides_i, access_strides_j) < 0)
> >
> > is not 'stable' in the sense that you may prefer to interchange i with
j,
> > and if asked again prefer to interchange back. If point 1 above is
true,
> > you are (bubble) sorting the loops according to their priorities, using
> > unstable comparisons. (Don't iterate until convergence ;-).
> >
>
> Right.  The intended effects of the priority function are multiple:
>
>    /* Heuristics for loop interchange profitability:
>
>       1. (spatial locality) Inner loops should have smallest
>               dependence steps.
>
>       2. (spatial locality) Inner loops should contain more
>       dependence relations not carried by the loop.
>
>       3. (temporal locality) Inner loops should have smallest
>          array access strides.
>    */
>
> It can be true that a decision for better spatial locality harms the
> temporal locality, and vice versa, and so you don't want to iterate
> until convergence ;-), unless you define an order on the effects,
> stating your preferences between spatial and temporal locality.
>

or some weighted average/majority test of the different criteria. I expect
one could tune this decision-making process empirically, e.g. cost model.

> > 3. Can dist
> > >       else if (dist < 0)
> > >         (*dependence_steps) += -dist;
> >
> > be negative?
> >
>
> int dist = DDR_DIST_VECT (ddr, j)[loop_depth (loop) - loop_depth
(first_loop)];
>
> Yes dist can be negative, as dist is one of the components of the
> distance vector, not necessarily the loop that carries the dependence.
> For example, for the distance vector (1, -1), dist = -1 if we're
> looking at the inner loop.
>

Ah, right.

> >
> > 4. Not too clear an example ...:
> >
>
> Could you hint me at how to improve it?  The intent of the example is
> to illustrate the inputs and outputs of the function.

some explanation how the outputs are derived from the inputs could help.

>  IIRC the
> example is adapted from the swim cpu2000 benchmark: ltrans-1.c.

ok (swim is Fortran, no?); a snapshot of the relevant code can help.

Thanks, Ayal.

>
> > >    Example: for the following loop,
> > >
> > >    | loop_1 runs 1335 times
> > >    |   loop_2 runs 1335 times
> > >    |     A[{{0, +, 1}_1, +, 1335}_2]
> > >    |     B[{{0, +, 1}_1, +, 1335}_2]
> > >    |   endloop_2
> > >    |   A[{0, +, 1336}_1]
> > >    | endloop_1
> > >
> > >    gather_interchange_stats (in loop_1) will return
> > >    DEPENDENCE_STEPS = 3002
> > >    NB_DEPS_NOT_CARRIED_BY_LOOP = 5
> > >    ACCESS_STRIDES = 10694
> > >
> > >    gather_interchange_stats (in loop_2) will return
> > >    DEPENDENCE_STEPS = 3000
> > >    NB_DEPS_NOT_CARRIED_BY_LOOP = 7
> > >    ACCESS_STRIDES = 8010
> >
>
> --
> Sebastian dot Pop at AMD dot com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
  2007-10-17 13:16 Ayal Zaks
@ 2007-10-17 15:47 ` Sebastian Pop
  2007-10-18 10:01   ` Ayal Zaks
  0 siblings, 1 reply; 26+ messages in thread
From: Sebastian Pop @ 2007-10-17 15:47 UTC (permalink / raw)
  To: Ayal Zaks; +Cc: Daniel Berlin, Zdenek Dvorak, gcc-patches

Thanks Ayal, Ira and Victor for raising these problems.

On 10/17/07, Ayal Zaks <ZAKS@il.ibm.com> wrote:
>
> 1. Are the statistics gathered by gather_interchange_stats() invariant, in
> the sense that you can pre-gather them once per loop and cache them,
> instead of re-gathering them twice for each pair of loops? This may save
> compile time.
>

Yes, the information gathered is supposed to be invariant if the
function is called twice on the exact same loop.  After a loop
transformation, the information should be different.

With Jan Sjodin, we also started to look at how to gather this
information from the lambda representation instead of gathering it
from the gimple/data-ref representation.  This is something that we
have to address if we want to compose lamdba transforms (for example
implement skewing followed by interchange), as otherwise we have to
transform the code back to gimple, then start again the data-ref
analysis, and finally gather the info.

> 2. The priority function
> >     if (dependence_steps_i < dependence_steps_j
> >         || nb_deps_not_carried_by_i > nb_deps_not_carried_by_j
> >         || double_int_ucmp (access_strides_i, access_strides_j) < 0)
>
> is not 'stable' in the sense that you may prefer to interchange i with j,
> and if asked again prefer to interchange back. If point 1 above is true,
> you are (bubble) sorting the loops according to their priorities, using
> unstable comparisons. (Don't iterate until convergence ;-).
>

Right.  The intended effects of the priority function are multiple:

	/* Heuristics for loop interchange profitability:

	   1. (spatial locality) Inner loops should have smallest
              dependence steps.

	   2. (spatial locality) Inner loops should contain more
	   dependence relations not carried by the loop.

	   3. (temporal locality) Inner loops should have smallest
	      array access strides.
	*/

It can be true that a decision for better spatial locality harms the
temporal locality, and vice versa, and so you don't want to iterate
until convergence ;-), unless you define an order on the effects,
stating your preferences between spatial and temporal locality.

> 3. Can dist
> >       else if (dist < 0)
> >         (*dependence_steps) += -dist;
>
> be negative?
>

int dist = DDR_DIST_VECT (ddr, j)[loop_depth (loop) - loop_depth (first_loop)];

Yes dist can be negative, as dist is one of the components of the
distance vector, not necessarily the loop that carries the dependence.
For example, for the distance vector (1, -1), dist = -1 if we're
looking at the inner loop.

>
> 4. Not too clear an example ...:
>

Could you hint me at how to improve it?  The intent of the example is
to illustrate the inputs and outputs of the function.  IIRC the
example is adapted from the swim cpu2000 benchmark: ltrans-1.c.

> >    Example: for the following loop,
> >
> >    | loop_1 runs 1335 times
> >    |   loop_2 runs 1335 times
> >    |     A[{{0, +, 1}_1, +, 1335}_2]
> >    |     B[{{0, +, 1}_1, +, 1335}_2]
> >    |   endloop_2
> >    |   A[{0, +, 1336}_1]
> >    | endloop_1
> >
> >    gather_interchange_stats (in loop_1) will return
> >    DEPENDENCE_STEPS = 3002
> >    NB_DEPS_NOT_CARRIED_BY_LOOP = 5
> >    ACCESS_STRIDES = 10694
> >
> >    gather_interchange_stats (in loop_2) will return
> >    DEPENDENCE_STEPS = 3000
> >    NB_DEPS_NOT_CARRIED_BY_LOOP = 7
> >    ACCESS_STRIDES = 8010
>

-- 
Sebastian dot Pop at AMD dot com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Preliminary patch for PR23820 and PR24309
@ 2007-10-17 13:16 Ayal Zaks
  2007-10-17 15:47 ` Sebastian Pop
  0 siblings, 1 reply; 26+ messages in thread
From: Ayal Zaks @ 2007-10-17 13:16 UTC (permalink / raw)
  To: Sebastian Pop, Daniel Berlin; +Cc: Zdenek Dvorak, gcc-patches


> On 10/15/07, Zdenek Dvorak <rakdver@kam.mff.cuni.cz> wrote:
> > > After this patch, as we won't have other PRs open on the loop
> > > interchange (otherwise ping me on these and I'll give them a look),
> > > would it be possible to enable loop interchange at -O3 or -O2 level?
> >
> > I think enabling it at -O3 for now would be a good idea,
> >
>
> I just bootstrapped and tested a patch with loop interchange
> enabled at -O3, and there were 6 regressions with it:
>
> gfortran.sum gfortran.dg/alloc_comp_assign_2.f90
> gfortran.sum gfortran.dg/pr19928-2.f90
> gfortran.sum gfortran.fortran-torture/execute/scalarize2.f90
> gfortran.sum gfortran.fortran-torture/execute/scalarize.f90
> gfortran.sum gfortran.fortran-torture/execute/where_1.f90
> gfortran.sum gfortran.fortran-torture/execute/where_6.f90
>
> They are all failing the execution test for -O3.  I'll have a look at
> these fails before proposing a patch to enable loop interchange
> at -O3.

A couple of points to consider/clarify:

1. Are the statistics gathered by gather_interchange_stats() invariant, in
the sense that you can pre-gather them once per loop and cache them,
instead of re-gathering them twice for each pair of loops? This may save
compile time.

2. The priority function
>     if (dependence_steps_i < dependence_steps_j
>         || nb_deps_not_carried_by_i > nb_deps_not_carried_by_j
>         || double_int_ucmp (access_strides_i, access_strides_j) < 0)

is not 'stable' in the sense that you may prefer to interchange i with j,
and if asked again prefer to interchange back. If point 1 above is true,
you are (bubble) sorting the loops according to their priorities, using
unstable comparisons. (Don't iterate until convergence ;-).

3. Can dist
>       else if (dist < 0)
>         (*dependence_steps) += -dist;

be negative?


4. Not too clear an example ...:

>    Example: for the following loop,
>
>    | loop_1 runs 1335 times
>    |   loop_2 runs 1335 times
>    |     A[{{0, +, 1}_1, +, 1335}_2]
>    |     B[{{0, +, 1}_1, +, 1335}_2]
>    |   endloop_2
>    |   A[{0, +, 1336}_1]
>    | endloop_1
>
>    gather_interchange_stats (in loop_1) will return
>    DEPENDENCE_STEPS = 3002
>    NB_DEPS_NOT_CARRIED_BY_LOOP = 5
>    ACCESS_STRIDES = 10694
>
>    gather_interchange_stats (in loop_2) will return
>    DEPENDENCE_STEPS = 3000
>    NB_DEPS_NOT_CARRIED_BY_LOOP = 7
>    ACCESS_STRIDES = 8010

Thanks to Ira and Victor,
Ayal.

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2007-10-22 14:36 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-10-11 21:32 Preliminary patch for PR23820 and PR24309 Sebastian Pop
2007-10-12  1:16 ` Zdenek Dvorak
2007-10-12 21:08   ` Sebastian Pop
2007-10-12 21:21     ` Richard Guenther
2007-10-12 22:10       ` Zdenek Dvorak
2007-10-13 12:23         ` Richard Guenther
2007-10-12 22:16     ` Zdenek Dvorak
2007-10-12 22:32       ` Sebastian Pop
2007-10-12 23:05         ` Sebastian Pop
2007-10-12 23:18           ` Sebastian Pop
2007-10-13  0:06             ` Zdenek Dvorak
2007-10-13 12:26             ` Richard Guenther
2007-10-13 16:06               ` Sebastian Pop
2007-10-13 17:19                 ` Richard Guenther
2007-10-13 17:35                   ` Richard Guenther
2007-10-13 17:43                     ` Sebastian Pop
2007-10-14  2:25                     ` Zdenek Dvorak
2007-10-15 17:49     ` Sebastian Pop
2007-10-15 18:13       ` Zdenek Dvorak
2007-10-16 21:56         ` Sebastian Pop
2007-10-21 20:58       ` Eric Botcazou
2007-10-22 15:14         ` Sebastian Pop
2007-10-22 15:15           ` Eric Botcazou
2007-10-17 13:16 Ayal Zaks
2007-10-17 15:47 ` Sebastian Pop
2007-10-18 10:01   ` Ayal Zaks

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).