public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] outerloop parallelization support
@ 2009-10-22 13:00 Razya Ladelsky
  2009-10-22 16:35 ` Sebastian Pop
  2009-10-23  6:25 ` Eric Botcazou
  0 siblings, 2 replies; 4+ messages in thread
From: Razya Ladelsky @ 2009-10-22 13:00 UTC (permalink / raw)
  To: gcc-patches, Richard Guenther; +Cc: Zdenek Dvorak, Alon Dayan, gcc-graphite

[-- Attachment #1: Type: text/plain, Size: 400 bytes --]

Hi,
Following this thread 
http://gcc.gnu.org/ml/gcc-patches/2009-10/msg00686.html
I made some changes to the patch as requested, and tested bootstrap,
testsuite, and spec with autopar enabled (on powerpc linux).
The patch does not introduce any new regressions.
 If there are no other objections, I'll commit it later today.
I also added more testcases that include reductions.



Thanks,
Razya




[-- Attachment #2: outer-4.c --]
[-- Type: application/octet-stream, Size: 888 bytes --]

/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */

void abort (void);

int g_sum=0;
int x[500][500];

__attribute__((noinline))
void parloop (int N)
{
  int i, j;
  int sum;

  /* Double reduction is currently not supported, outer loop is not 
     parallelized.  Inner reduction is detected, inner loop is 
     parallelized.  */
  sum = 0;
  for (i = 0; i < N; i++)
    for (j = 0; j < N; j++)
      sum += x[i][j];

  g_sum = sum;
}

int main(void)
{
  parloop(500);

  return 0;
}


/* Check that the outer loop is NOT parallelized (double reduction is not
   supported yet), while the inner loop is.  */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 0 "parloops" } } */
/* { dg-final { scan-tree-dump-times "parallelizing inner loop" 1 "parloops" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

[-- Attachment #3: outer-5.c --]
[-- Type: application/octet-stream, Size: 1078 bytes --]

/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */

void abort (void);

int x[500][500];
int y[500];
int g_sum=0;

__attribute__((noinline))
void init (int i, int j)
{
  x[i][j]=1;
}

__attribute__((noinline))
void parloop (int N)
{
  int i, j;
  int sum;

  /* Inner cycle is currently not supported, outer loop is not 
     parallelized.  Inner reduction is detected, inner loop is 
     parallelized.  */
  for (i = 0; i < N; i++)
  {
    sum = 0;
    for (j = 0; j < N; j++)
      sum += x[i][j];
    y[i]=sum;
  }
  g_sum = sum;
}

int main(void)
{
  int i,j;
  for (i = 0; i < 500; i++) 
    for (j = 0; j < 500; j++)
      init(i, j);
  
  parloop(500);

  return 0;
}


/* Check that the outer loop is NOT parallelized (the inner scalar cycle
   on 'sum' is not supported yet), while the inner loop is.  */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 0 "parloops" } } */
/* { dg-final { scan-tree-dump-times "parallelizing inner loop" 1 "parloops" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

[-- Attachment #4: outer-6.c --]
[-- Type: application/octet-stream, Size: 983 bytes --]

/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */

void abort (void);

int x[500][500];
int y[500];
int g_sum=0;

__attribute__((noinline))
void init (int i, int j)
{
  x[i][j]=1;
}

__attribute__((noinline))
void parloop (int N)
{
  int i, j;
  int sum;

  /* Outer loop reduction, outerloop is parallelized.  */ 
  sum=0;
  for (i = 0; i < N; i++)
  {
    for (j = 0; j < N; j++)
      y[i]=x[i][j];
    sum += y[i];
  }
  g_sum = sum;
}

int main(void)
{
  int i,j;
  for (i = 0; i < 500; i++) 
    for (j = 0; j < 500; j++)
      init(i, j);
  
  parloop(500);

  return 0;
}


/* Check that outer loop is parallelized.  */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops" } } */
/* { dg-final { scan-tree-dump-times "parallelizing inner loop" 0 "parloops" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

[-- Attachment #5: outer-1.c --]
[-- Type: application/octet-stream, Size: 745 bytes --]

/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */

void abort (void);

void parloop (int N)
{
  int i, j;
  int x[10000][10000];

  for (i = 0; i < N; i++)
    for (j = 0; j < N; j++)
      x[i][j] = i + j + 3;

  for (i = 0; i < N; i++)
    for (j = 0; j < N; j++)
      if (x[i][j] != i + j + 3)
	abort ();
}

int main(void)
{
  parloop(10000);

  return 0;
}


/* Check that outer loop is parallelized.  */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops" } } */
/* { dg-final { scan-tree-dump-times "loopfn" 5 "optimized" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

[-- Attachment #6: outer-2.c --]
[-- Type: application/octet-stream, Size: 766 bytes --]

/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */

void abort (void);

void parloop (int N)
{
  int i, j,ii;
  int x[400][10][400];

for (ii = 0; ii < N; ii++)
  for (i = 0; i < N; i++)
    for (j = 0; j < N; j++)
      x[i][j][ii] = ii+i + j + 3;

for (ii = 0; ii < N; ii++)
  for (i = 0; i < N;i++)
    for (j = 0; j < N; j++)
      if (x[i][j][ii] != ii+i + j + 3)
	abort ();
}

int main(void)
{
  parloop(400);

  return 0;
}

/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops" } } */
/* { dg-final { scan-tree-dump-times "loopfn" 5 "optimized" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

[-- Attachment #7: outer-3.c --]
[-- Type: application/octet-stream, Size: 739 bytes --]

/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */

void abort (void);

void parloop (int N)
{
  int i, j;
  int x[500][500];

  for (i = 0; i < N; i++)
    for (j = 0; j < i; j++)
      x[i][j] = i + j + 3;

  for (i = 0; i < N; i++)
    for (j = 0; j < i; j++)
      if (x[i][j] != i + j + 3)
	abort ();
}

int main(void)
{
  parloop(500);

  return 0;
}


/* Check that outer loop is parallelized.  */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops" } } */
/* { dg-final { scan-tree-dump-times "loopfn" 5 "optimized" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

[-- Attachment #8: ChangeLog.txt --]
[-- Type: text/plain, Size: 1333 bytes --]

2009-10-22  Razya Ladelsky  <razya@il.ibm.com>

       	* cfgloopmanip.c  (duplicate_subloops): Export.
    	* tree-parloops.c (loop_parallel_p): Dump if loop is innermost.
	(transform_to_exit_first_loop): Duplicate bbs starting from 
	header up to loop->latch instead of exit->src.
	Initialize control variable to the correct number of iterations.
	(gather_scalar_reductions): Do not register double reductions.
	(parallelize_loops): Dump which loop is tested.
	Indicate whether the parallelized loop is inner or not.
	Remove the innermost-loop requirement.
	* cfgloop.h (duplicate_subloops): Export.
	* tree-cfg.c (add_phi_args_after_redirect): New function.
	(gimple_duplicate_sese_tail): Remove the no-subloops constraint.
	Call duplicate_subloops.
	Update number of iterations at the exit condition.
	Don't redirect nexits always to the loop exit.
	Redirect copied edges from latch to the loop exit.
	* testsuite/libgomp.graphite/force-parallel-2.c: Adjust scan.
	* testsuite/gcc.dg/autopar/outer-1.c: New testcase.
	* testsuite/gcc.dg/autopar/outer-2.c: New testcase.
	* testsuite/gcc.dg/autopar/outer-3.c: New testcase.
	* testsuite/gcc.dg/autopar/outer-4.c: New testcase.
	* testsuite/gcc.dg/autopar/outer-5.c: New testcase.
	* testsuite/gcc.dg/autopar/outer-6.c: New testcase.

[-- Attachment #9: autopar_outer.txt --]
[-- Type: text/plain, Size: 14186 bytes --]

Index: libgomp/testsuite/libgomp.graphite/force-parallel-2.c
===================================================================
--- libgomp/testsuite/libgomp.graphite/force-parallel-2.c	(revision 153056)
+++ libgomp/testsuite/libgomp.graphite/force-parallel-2.c	(working copy)
@@ -23,7 +23,7 @@ int main(void)
 }
 
 /* Check that parallel code generation part make the right answer.  */
-/* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 1 "graphite" } } */
+/* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 2 "graphite" } } */
 /* { dg-final { cleanup-tree-dump "graphite" } } */
 /* { dg-final { scan-tree-dump-times "loopfn" 5 "optimized" } } */
 /* { dg-final { cleanup-tree-dump "parloops" } } */
Index: gcc/cfgloopmanip.c
===================================================================
--- gcc/cfgloopmanip.c	(revision 153056)
+++ gcc/cfgloopmanip.c	(working copy)
@@ -32,7 +32,6 @@ along with GCC; see the file COPYING3.  
 #include "output.h"
 #include "tree-flow.h"
 
-static void duplicate_subloops (struct loop *, struct loop *);
 static void copy_loops_to (struct loop **, int,
 			   struct loop *);
 static void loop_redirect_edge (edge, basic_block);
@@ -886,7 +885,7 @@ duplicate_loop (struct loop *loop, struc
 
 /* Copies structure of subloops of LOOP into TARGET loop, placing
    newly created loops into loop tree.  */
-static void
+void
 duplicate_subloops (struct loop *loop, struct loop *target)
 {
   struct loop *aloop, *cloop;
Index: gcc/tree-parloops.c
===================================================================
--- gcc/tree-parloops.c	(revision 153056)
+++ gcc/tree-parloops.c	(working copy)
@@ -255,7 +255,13 @@ loop_parallel_p (struct loop *loop)
   bool ret = false;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
-    fprintf (dump_file, "\nConsidering loop %d\n", loop->num);
+  {
+    fprintf (dump_file, "Considering loop %d\n", loop->num);
+    if (!loop->inner)
+      fprintf (dump_file, "loop is innermost\n");
+    else 
+      fprintf (dump_file, "loop NOT innermost\n");
+   }
 
   /* Check for problems with dependences.  If the loop can be reversed,
      the iterations are independent.  */
@@ -1289,8 +1295,9 @@ transform_to_exit_first_loop (struct loo
   bool ok;
   edge exit = single_dom_exit (loop), hpred;
   tree control, control_name, res, t;
-  gimple phi, nphi, cond_stmt, stmt;
+  gimple phi, nphi, cond_stmt, stmt, cond_nit;
   gimple_stmt_iterator gsi;
+  tree nit_1;
 
   split_block_after_labels (loop->header);
   orig_header = single_succ (loop->header);
@@ -1308,7 +1315,6 @@ transform_to_exit_first_loop (struct loo
       res = PHI_RESULT (phi);
       t = make_ssa_name (SSA_NAME_VAR (res), phi);
       SET_PHI_RESULT (phi, t);
-
       nphi = create_phi_node (res, orig_header);
       SSA_NAME_DEF_STMT (res) = nphi;
       add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);
@@ -1320,10 +1326,11 @@ transform_to_exit_first_loop (struct loo
 	  control = t;
 	}
     }
-
   bbs = get_loop_body_in_dom_order (loop);
-  for (n = 0; bbs[n] != exit->src; n++)
+
+  for (n = 0; bbs[n] != loop->latch; n++)
     continue;
+  n--;
   nbbs = XNEWVEC (basic_block, n);
   ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
 				   bbs + 1, n, nbbs);
@@ -1358,7 +1365,6 @@ transform_to_exit_first_loop (struct loo
 	  struct reduction_info *red;
 
 	  tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
-
 	  red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
 	  if (red)
 	    {
@@ -1374,12 +1380,15 @@ transform_to_exit_first_loop (struct loo
     }
   gcc_assert (control_name != NULL_TREE);
 
-  /* Initialize the control variable to NIT.  */
+  /* Initialize the control variable to number of iterations 
+     according to the rhs of the exit condition.  */
   gsi = gsi_after_labels (ex_bb);
-  nit = force_gimple_operand_gsi (&gsi,
-				  fold_convert (TREE_TYPE (control_name), nit),
+  cond_nit = last_stmt (exit->src); 
+  nit_1 =  gimple_cond_rhs (cond_nit);
+  nit_1 = force_gimple_operand_gsi (&gsi,
+				  fold_convert (TREE_TYPE (control_name), nit_1),
 				  false, NULL_TREE, false, GSI_SAME_STMT);
-  stmt = gimple_build_assign (control_name, nit);
+  stmt = gimple_build_assign (control_name, nit_1);
   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
   SSA_NAME_DEF_STMT (control_name) = stmt;
 }
@@ -1740,7 +1749,7 @@ gather_scalar_reductions (loop_p loop, h
 	&& simple_loop_info)
 	{
            gimple reduc_stmt = vect_is_simple_reduction (simple_loop_info, phi, true, &double_reduc);
-	   if (reduc_stmt)
+	   if (reduc_stmt && !double_reduc)
               build_new_reduction (reduction_list, reduc_stmt, phi);
         }
     }
@@ -1890,15 +1899,32 @@ parallelize_loops (void)
   FOR_EACH_LOOP (li, loop, 0)
     {
       htab_empty (reduction_list);
-
-      /* If we use autopar in graphite pass, we use it's marked dependency
+      if (dump_file && (dump_flags & TDF_DETAILS))
+      {
+        fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
+	if (loop->inner)
+	  fprintf (dump_file, "loop %d is not innermost\n",loop->num);
+	else
+	  fprintf (dump_file, "loop %d is innermost\n",loop->num);
+      }
+      
+      /* If we use autopar in graphite pass, we use its marked dependency
       checking results.  */
       if (flag_loop_parallelize_all && !loop->can_be_parallel)
+      {
+        if (dump_file && (dump_flags & TDF_DETAILS))
+	   fprintf (dump_file, "loop is not parallel according to graphite\n");
 	continue;
+      }
 
-      /* FIXME: Only consider innermost loops with just one exit.  */
-      if (loop->inner || !single_dom_exit (loop))
+      if (!single_dom_exit (loop))
+      {
+       
+        if (dump_file && (dump_flags & TDF_DETAILS))
+	  fprintf (dump_file, "loop is !single_dom_exit\n");
+		
 	continue;
+      }
 
       if (/* And of course, the loop must be parallelizable.  */
 	  !can_duplicate_loop_p (loop)
@@ -1915,7 +1941,7 @@ parallelize_loops (void)
 	      /* Do not bother with loops in cold areas.  */
 	      || optimize_loop_nest_for_size_p (loop)))
 	continue;
-
+ 
       if (!try_get_loop_niter (loop, &niter_desc))
 	continue;
 
@@ -1926,6 +1952,14 @@ parallelize_loops (void)
 	continue;
 
       changed = true;
+      if (dump_file && (dump_flags & TDF_DETAILS))
+      {
+        fprintf (dump_file, "parallelizing ");
+	if (loop->inner)
+	  fprintf (dump_file, "outer loop\n");
+	else
+	  fprintf (dump_file, "inner loop\n");
+      } 
       gen_parallel_loop (loop, reduction_list, 
 			 n_threads, &niter_desc);
       verify_flow_info ();
Index: gcc/cfgloop.h
===================================================================
--- gcc/cfgloop.h	(revision 153056)
+++ gcc/cfgloop.h	(working copy)
@@ -288,6 +288,7 @@ extern edge create_empty_if_region_on_ed
 extern struct loop *create_empty_loop_on_edge (edge, tree, tree, tree, tree,
 					       tree *, tree *, struct loop *);
 extern struct loop * duplicate_loop (struct loop *, struct loop *);
+extern void duplicate_subloops (struct loop *, struct loop *);
 extern bool duplicate_loop_to_header_edge (struct loop *, edge, 
 					   unsigned, sbitmap, edge,
  					   VEC (edge, heap) **, int);
Index: gcc/tree-cfg.c
===================================================================
--- gcc/tree-cfg.c	(revision 153056)
+++ gcc/tree-cfg.c	(working copy)
@@ -4850,6 +4850,31 @@ gimple_duplicate_bb (basic_block bb)
   return new_bb;
 }
 
+/* Add phi arguments to the phi nodes in E_COPY->dest according to 
+   the phi arguments coming from the equivalent edge at
+   the phi nodes of DEST.  */
+
+static void
+add_phi_args_after_redirect (edge e_copy, edge orig_e)
+{
+   gimple_stmt_iterator psi, psi_copy;
+   gimple phi, phi_copy;
+   tree def; 
+   
+    for (psi = gsi_start_phis (orig_e->dest),
+       psi_copy = gsi_start_phis (e_copy->dest);
+       !gsi_end_p (psi);
+       gsi_next (&psi), gsi_next (&psi_copy))
+    {
+
+      phi = gsi_stmt (psi);
+      phi_copy = gsi_stmt (psi_copy);
+      def = PHI_ARG_DEF_FROM_EDGE (phi, orig_e);
+      add_phi_arg (phi_copy, def, e_copy,
+                   gimple_phi_arg_location_from_edge (phi, orig_e));
+    }
+}
+
 /* Adds phi node arguments for edge E_COPY after basic block duplication.  */
 
 static void
@@ -5131,9 +5156,14 @@ gimple_duplicate_sese_tail (edge entry A
   int total_freq = 0, exit_freq = 0;
   gcov_type total_count = 0, exit_count = 0;
   edge exits[2], nexits[2], e;
-  gimple_stmt_iterator gsi;
+  gimple_stmt_iterator gsi,gsi1;
   gimple cond_stmt;
-  edge sorig, snew;
+  edge sorig, snew, orig_e;
+  basic_block exit_bb;
+  edge_iterator ei;
+  VEC (edge, heap) *redirect_edges;
+  basic_block iters_bb, orig_src;
+  tree new_rhs;
 
   gcc_assert (EDGE_COUNT (exit->src->succs) == 2);
   exits[0] = exit;
@@ -5149,17 +5179,13 @@ gimple_duplicate_sese_tail (edge entry A
      it will work, but the resulting code will not be correct.  */
   for (i = 0; i < n_region; i++)
     {
-      /* We do not handle subloops, i.e. all the blocks must belong to the
-	 same loop.  */
-      if (region[i]->loop_father != orig_loop)
-	return false;
-
       if (region[i] == orig_loop->latch)
 	return false;
     }
 
   initialize_original_copy_tables ();
   set_loop_copy (orig_loop, loop);
+  duplicate_subloops (orig_loop, loop);
 
   if (!region_copy)
     {
@@ -5225,8 +5251,36 @@ gimple_duplicate_sese_tail (edge entry A
   cond_stmt = last_stmt (exit->src);
   gcc_assert (gimple_code (cond_stmt) == GIMPLE_COND);
   cond_stmt = gimple_copy (cond_stmt);
+ 
+ /* If the block consisting of the exit condition has the latch as 
+    successor, then the body of the loop is executed before 
+    the exit condition is tested.  In such case, moving the 
+    condition to the entry, causes that the loop will iterate  
+    one less iteration (which is the wanted outcome, since we 
+    peel out the last iteration).  If the body is executed after 
+    the condition, moving the condition to the entry requires 
+    decrementing one iteration.  */
+  if (exits[1]->dest == orig_loop->latch)
+    new_rhs = gimple_cond_rhs (cond_stmt);
+  else
+  {
+    new_rhs = fold_build2 (MINUS_EXPR, TREE_TYPE (gimple_cond_rhs (cond_stmt)),
+			   gimple_cond_rhs (cond_stmt), 
+			   build_int_cst (TREE_TYPE (gimple_cond_rhs (cond_stmt)), 1));
+
+    if (TREE_CODE (gimple_cond_rhs (cond_stmt)) == SSA_NAME)
+      {
+	iters_bb = gimple_bb (SSA_NAME_DEF_STMT (gimple_cond_rhs (cond_stmt)));
+	for (gsi1 = gsi_start_bb (iters_bb); !gsi_end_p (gsi1); gsi_next (&gsi1))
+	  if (gsi_stmt (gsi1)==SSA_NAME_DEF_STMT (gimple_cond_rhs (cond_stmt)))
+	    break;
+		 
+	new_rhs = force_gimple_operand_gsi (&gsi1, new_rhs, true,
+					    NULL_TREE,false,GSI_CONTINUE_LINKING);
+      }
+  }   
+  gimple_cond_set_rhs (cond_stmt, unshare_expr (new_rhs)); 
   gimple_cond_set_lhs (cond_stmt, unshare_expr (gimple_cond_lhs (cond_stmt)));
-  gimple_cond_set_rhs (cond_stmt, unshare_expr (gimple_cond_rhs (cond_stmt)));
   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
 
   sorig = single_succ_edge (switch_bb);
@@ -5238,25 +5292,87 @@ gimple_duplicate_sese_tail (edge entry A
 
   /* Add the PHI node arguments.  */
   add_phi_args_after_copy (region_copy, n_region, snew);
-
+  
   /* Get rid of now superfluous conditions and associated edges (and phi node
      arguments).  */
+  exit_bb = exit->dest;
+ 
   e = redirect_edge_and_branch (exits[0], exits[1]->dest);
   PENDING_STMT (e) = NULL;
-  e = redirect_edge_and_branch (nexits[1], nexits[0]->dest);
-  PENDING_STMT (e) = NULL;
-
+ 
+  /* If the block consisting of the exit condition has the latch as 
+     successor, then the body of the loop is executed before 
+     the exit condition is tested.  
+     
+     { body  }
+     { cond  } (exit[0])  -> { latch }
+        |      
+	V (exit[1])
+	 
+     { exit_bb }
+     
+     
+     In such case, the equivalent copied edge nexits[1]
+     (for the peeled iteration) needs to be redirected to exit_bb.
+     
+     Otherwise, 
+     
+     { cond  } (exit[0])  -> { body }
+        |
+	V (exit[1])
+     
+     { exit_bb }
+    
+     
+     exit[0] is pointing to the body of the loop,
+     and the equivalent nexits[0] needs to be redirected to 
+     the copied body (of the peeled iteration).  */ 
+    
+  if (exits[1]->dest == orig_loop->latch)
+    e = redirect_edge_and_branch (nexits[1], nexits[0]->dest);
+  else
+    e = redirect_edge_and_branch (nexits[0], nexits[1]->dest);
+  PENDING_STMT (e) = NULL; 
+  
+  redirect_edges = VEC_alloc (edge, heap, 10);
+  
+  for (i = 0; i < n_region; i++)
+    region_copy[i]->flags |= BB_DUPLICATED;
+  
+  /* Iterate all incoming edges to latch.  All those coming from 
+     copied bbs will be redirected to exit_bb.  */
+  FOR_EACH_EDGE (e, ei, orig_loop->latch->preds)
+    {
+      if (e->src->flags & BB_DUPLICATED)
+        VEC_safe_push (edge, heap, redirect_edges, e);
+    }
+  
+  for (i = 0; i < n_region; i++)
+    region_copy[i]->flags &= ~BB_DUPLICATED;
+  
+  for (i = 0; VEC_iterate (edge, redirect_edges, i, e); ++i)
+    {
+      e = redirect_edge_and_branch (e, exit_bb);
+      PENDING_STMT (e) = NULL;
+      orig_src = get_bb_original (e->src);
+      orig_e = find_edge (orig_src, orig_loop->latch);
+      add_phi_args_after_redirect (e, orig_e);
+    }
+  
+  VEC_free (edge, heap, redirect_edges);
+  
+  
   /* Anything that is outside of the region, but was dominated by something
      inside needs to update dominance info.  */
   iterate_fix_dominators (CDI_DOMINATORS, doms, false);
   VEC_free (basic_block, heap, doms);
-
+  
   /* Update the SSA web.  */
   update_ssa (TODO_update_ssa);
-
+  
   if (free_region_copy)
     free (region_copy);
-
+  
   free_original_copy_tables ();
   return true;
 }
=

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] outerloop parallelization support
  2009-10-22 13:00 [PATCH] outerloop parallelization support Razya Ladelsky
@ 2009-10-22 16:35 ` Sebastian Pop
  2009-10-22 17:21   ` Razya Ladelsky
  2009-10-23  6:25 ` Eric Botcazou
  1 sibling, 1 reply; 4+ messages in thread
From: Sebastian Pop @ 2009-10-22 16:35 UTC (permalink / raw)
  To: Razya Ladelsky
  Cc: gcc-patches, Richard Guenther, Zdenek Dvorak, Alon Dayan, gcc-graphite

On Thu, Oct 22, 2009 at 07:52, Razya Ladelsky <RAZYA@il.ibm.com> wrote:
> Hi,
> Following this thread
> http://gcc.gnu.org/ml/gcc-patches/2009-10/msg00686.html
> I made some changes to the patch as requested, and tested bootstrap,
> testsuite.
> and spec with autopar enabled (on powerpc linux).
> The patch does not introduce any new regressions.
>  If there are no other objections, I'll commit it later today.


> +        fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
> +	  fprintf (dump_file, "loop %d is not innermost\n",loop->num);
> +	  fprintf (dump_file, "loop %d is innermost\n",loop->num);
> +  gimple_stmt_iterator gsi,gsi1;
> +					    NULL_TREE,false,GSI_CONTINUE_LINKING);
> +	  if (gsi_stmt (gsi1)==SSA_NAME_DEF_STMT (gimple_cond_rhs (cond_stmt)))

Please fix the missing spaces.

> +    the exit consition is tested.  In such case, moving the

s/consition/condition/

> +     the exit consition is tested.

s/consition/condition/

> +     copied bbs will be redicrecred to exit_bb.  */

s/redicrecred/redirected/

> -
> +
>    /* Update the SSA web.  */
>    update_ssa (TODO_update_ssa);
> -
> +
>    if (free_region_copy)
>      free (region_copy);
> -
> +
>    free_original_copy_tables ();
>    return true;
>  }

You are adding indentation spaces, please remove them on empty lines.

> I also added more testcases that include reductions.

I see that in outer-[4,5].c the outer loops are not parallelized.

From outer-4.c
> /* { dg-final { scan-tree-dump-times "parallelizing outer loop" 0 "parloops" } } */
> /* { dg-final { scan-tree-dump-times "parallelizing inner loop" 1 "parloops" }
>  * } */

Why do you indent this } on the next line?  Please put the brace on
the previous line, similarly in other testcases.

/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

Why do you need to dump the -fdump-tree-optimized?  Please remove the
cleanup and the flag from all the testcases where you do not grep into
"optimized".

>   /* Double reduction is currently not supported, outer loop is not
>      parallelized.  Inner reduction is detected, inner loop is
>      parallelized.  */
>   sum = 0;
>   for (i = 0; i < N; i++)
>     for (j = 0; j < N; j++)
>       sum += x[i][j];

In this case you should probably xfail the "parallelizing outer loop"
as otherwise we won't have an indication of what changed in the
testsuite when we will have support for parallelizing the outer loop:

/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1
"parloops" { xfail *-*-* } } } */

What is the problem to parallelize the outermost loop in a reduction?
Can you open a bug report for this problem?

Thanks,
Sebastian Pop
--
AMD / Open Source Compiler Engineering / GNU Tools

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] outerloop parallelization support
  2009-10-22 16:35 ` Sebastian Pop
@ 2009-10-22 17:21   ` Razya Ladelsky
  0 siblings, 0 replies; 4+ messages in thread
From: Razya Ladelsky @ 2009-10-22 17:21 UTC (permalink / raw)
  To: Sebastian Pop
  Cc: Alon Dayan, gcc-graphite, gcc-patches, Zdenek Dvorak, Richard Guenther

Sebastian Pop <sebpop@gmail.com> wrote on 22/10/2009 18:29:31:

> Sebastian Pop <sebpop@gmail.com> 
> 22/10/2009 18:29
> 
> To
> 
> Razya Ladelsky/Haifa/IBM@IBMIL
> 
> cc
> 
> gcc-patches@gcc.gnu.org, Richard Guenther 
> <richard.guenther@gmail.com>, Zdenek Dvorak 
> <rakdver@kam.mff.cuni.cz>, Alon Dayan/Haifa/IBM@IBMIL, gcc-
> graphite@googlegroups.com
> 
> Subject
> 
> Re: [PATCH] outerloop parallelization support
> 
> On Thu, Oct 22, 2009 at 07:52, Razya Ladelsky <RAZYA@il.ibm.com> wrote:
> > Hi,
> > Following this thread
> > http://gcc.gnu.org/ml/gcc-patches/2009-10/msg00686.html
> > I made some changes to the patch as requested, and tested bootstrap,
> > testsuite.
> > and spec with autopar enabled (on powerpc linux).
> > The patch does not introduce any new regressions.
> >  If there are no other objections, I'll commit it later today.
> 
> 
>

Thanks Sebastian.
I corrected the typos and the indentation issues.


> > I also added more testcases that include reductions.
> 
> I see that in outer-[4,5].c the outer loops are not parallelized.
> 
> From outer-4.c
> > /* { dg-final { scan-tree-dump-times "parallelizing outer loop" 0 
> "parloops" } } */
> > /* { dg-final { scan-tree-dump-times "parallelizing inner loop" 1 
> "parloops" }
> >  * } */

True. parallelizing the outer loop is correct, but this type of reduction
is currently not handled. 

> >   /* Double reduction is currently not supported, outer loop is not
> >      parallelized.  Inner reduction is detected, inner loop is
> >      parallelized.  */
> >   sum = 0;
> >   for (i = 0; i < N; i++)
> >     for (j = 0; j < N; j++)
> >       sum += x[i][j];
> 
> In this case you should probably xfail the "parallelizing outer loop"
> as otherwise we won't have an indication of what changed in the
> testsuite when we will have support for parallelizing the outer loop:
> 
> /* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1
> "parloops" { xfail *-*-* } } } */
> 

Yes, I was debating about that. I will.

> What is the problem to parallelize the outermost loop in a reduction?
> Can you open a bug report for this problem?
> 

There are a few cases that are parallelizable but not handled yet.
For example, the double reduction:

for i
  for j 
    sum+=

I need to make sure that the code generation can handle this reduction 
properly.

another case:

for i
  for j 
    sum+=
  =sum

the reduction is not "carried" by the outer loop, so it will not be 
detected 
as such when analyzing the outerloop. 
When the analysis detects a scalar dependency that is not a reduction, it 
determines 
that the loop is non-parallelizable.
The analysis needs to be adjusted to detect such inner cycles and 
determine 
that the outer loop is parallelizable.

Thanks,
Razya

> Thanks,
> Sebastian Pop
> --
> AMD / Open Source Compiler Engineering / GNU Tools

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] outerloop parallelization support
  2009-10-22 13:00 [PATCH] outerloop parallelization support Razya Ladelsky
  2009-10-22 16:35 ` Sebastian Pop
@ 2009-10-23  6:25 ` Eric Botcazou
  1 sibling, 0 replies; 4+ messages in thread
From: Eric Botcazou @ 2009-10-23  6:25 UTC (permalink / raw)
  To: Razya Ladelsky
  Cc: gcc-patches, Richard Guenther, Zdenek Dvorak, Alon Dayan, gcc-graphite

> http://gcc.gnu.org/ml/gcc-patches/2009-10/msg00686.html
> I made some changes to the patch as requested, and tested bootstrap,
> testsuite.
> and spec with autopar enabled (on powerpc linux).
> The patch does not introduce any new regressions.
>  If there are no other objections, I'll commit it later today.
> I also added more testcases that include reductions.

ChangeLog for the testsuite must go in testsuite/ChangeLog.

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2009-10-23  5:46 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-10-22 13:00 [PATCH] outerloop parallelization support Razya Ladelsky
2009-10-22 16:35 ` Sebastian Pop
2009-10-22 17:21   ` Razya Ladelsky
2009-10-23  6:25 ` Eric Botcazou

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).