public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [gomp4]
@ 2015-10-15 17:12 Nathan Sidwell
  0 siblings, 0 replies; 2+ messages in thread
From: Nathan Sidwell @ 2015-10-15 17:12 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 457 bytes --]

I've committed this to gomp4 branch.

It's the next in the series moving partitioning decisions into the target 
compiler.  This patch moves the updating of the IFN_GOACC_LOOP internal 
function's mask and chunking parameters.  After reconstructing the OpenACC 
loops, we scan the block(s) just after the header marker looking for these 
functions, and set the determined partitioning mask and chunking.

The next patch will complete this transition.

nathan

[-- Attachment #2: gomp4-loop-subst.patch --]
[-- Type: text/x-patch, Size: 9264 bytes --]

2015-10-15  Nathan Sidwell  <nathan@codesourcery.com>

	* omp-low.c (struct oacc_loop): Add chunk_size and head_end
	fields.
	(extract_omp_for_data): Don't extract OpenACC partitioning or
	chunk size here.
	(lower_oacc_head_mark): Substitute gang_static size.
	(expand_oacc_for): Don't specify parallel region chunking or
	partitioning here.
	(oacc_xform_loop): Stride a single worker partition.  Add
	conversions for chunk size.
	(new_oacc_loop_raw): Initialize new fields.
	(new_oacc_loop): Set chunk_size.
	(oacc_loop_walk): Set head_end.
	(oacc_loop_xform_loop): New.
	(oacc_loop_process): Call it.

Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 228842)
+++ gcc/omp-low.c	(working copy)
@@ -255,11 +255,10 @@ struct oacc_loop
 
   tree routine;  /* Pseudo-loop enclosing a routine.  */
 
-  /* Partitioning mask.  */
-  unsigned mask;
-
-  /* Partitioning flags.  */
-  unsigned flags;
+  unsigned mask;   /* Partitioning mask.  */
+  unsigned flags;   /* Partitioning flags.  */
+  tree chunk_size;   /* Chunk size.  */
+  gcall *head_end; /* Final marker of head sequence.  */
 };
 
 /*  Flags for an OpenACC loop.  */
@@ -791,31 +790,6 @@ extract_omp_for_data (gomp_for *for_stmt
       fd->loop.step = build_int_cst (TREE_TYPE (fd->loop.v), 1);
       fd->loop.cond_code = LT_EXPR;
     }
-
-  /* For OpenACC loops, force a chunk size of one, unless a gang loop
-     contains a static argument.  This avoids the default scheduling where
-     several subsequent iterations are being executed by the same thread.  */
-  if (gimple_omp_for_kind (for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
-    {
-      gcc_assert (fd->chunk_size == NULL_TREE);
-
-      tree gang = find_omp_clause (gimple_omp_for_clauses (for_stmt),
-				   OMP_CLAUSE_GANG);
-      tree chunk_size = NULL_TREE;
-
-      if (gang)
-	{
-	  chunk_size = OMP_CLAUSE_GANG_STATIC_EXPR (gang);
-
-	  /* gang (static:*) is represented by -1.  */
-	  if (chunk_size == integer_minus_one_node)
-	    chunk_size = NULL_TREE;
-	}
-      else
-	chunk_size = build_int_cst (TREE_TYPE (fd->loop.v), 1);
-
-      fd->chunk_size = chunk_size;
-    }
 }
 
 
@@ -4944,11 +4918,15 @@ lower_oacc_head_mark (location_t loc, tr
 	case OMP_CLAUSE_GANG:
 	  tag |= OLF_DIM_GANG;
 	  gang_static = OMP_CLAUSE_GANG_STATIC_EXPR (c);
+	  /* static:* is represented by -1, and we can ignore it, as
+	     scheduling is always static.  */
+	  if (gang_static && integer_minus_onep (gang_static))
+	    gang_static = NULL_TREE;
 	  levels++;
 	  break;
 
 	case OMP_CLAUSE_WORKER:
-	  tag |=  OLF_DIM_WORKER;
+	  tag |= OLF_DIM_WORKER;
 	  levels++;
 	  break;
 
@@ -4980,7 +4958,11 @@ lower_oacc_head_mark (location_t loc, tr
 
  done:
   if (gang_static)
-    tag |= OLF_GANG_STATIC;
+    {
+      if (DECL_P  (gang_static))
+	gang_static = build_outer_var_ref (gang_static, ctx);
+      tag |= OLF_GANG_STATIC;
+    }
 
   /* In a parallel region, loops are implicitly INDEPENDENT.  */
   if (is_oacc_parallel (ctx))
@@ -8819,8 +8801,8 @@ expand_oacc_for (struct omp_region *regi
   enum tree_code cond_code = fd->loop.cond_code;
   enum tree_code plus_code = PLUS_EXPR;
 
-  tree chunk_size = fd->chunk_size;
-  tree gwv = build_int_cst (integer_type_node, region->gwv_this);
+  tree chunk_size = integer_one_node;
+  tree gwv = integer_zero_node;
   tree iter_type = TREE_TYPE (v);
   tree diff_type = iter_type;
   tree plus_type = iter_type;
@@ -8873,7 +8855,7 @@ expand_oacc_for (struct omp_region *regi
   tree step = create_tmp_var (diff_type, ".step");
   bool up = cond_code == LT_EXPR;
   tree dir = build_int_cst (diff_type, up ? +1 : -1);
-  bool chunking = chunk_size != NULL_TREE;
+  bool chunking = !gimple_in_ssa_p (cfun);;
   bool negating;
 
   /* SSA instances.  */
@@ -8902,6 +8884,8 @@ expand_oacc_for (struct omp_region *regi
     {
       offset_init = gimple_omp_for_index (for_stmt, 0);
       gcc_assert (integer_zerop (fd->loop.n1));
+      /* The SSA parallelizer does gang parallelism.  */
+      gwv = build_int_cst (integer_type_node, GOMP_DIM_MASK (GOMP_DIM_GANG));
     }
 
   if (fd->collapse > 1)
@@ -15642,11 +15626,12 @@ oacc_xform_loop (gcall *call)
 
   if (integer_zerop (chunk_size))
     {
-      /* If we're at the gang or worker level, we want each to execute
-	 a contiguous run of iterations.  Otherwise we want each
-	 element to stride.  */
-      striding = !(outer_mask & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
-				 | GOMP_DIM_MASK (GOMP_DIM_GANG)));
+      /* If we're at the gang or (worker with vector), we want each to
+	 execute a contiguous run of iterations.  Otherwise we want
+	 each element to stride.  */
+      striding = !((outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
+		   || ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+		       && (outer_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))));
       chunking = false;
     }
   else
@@ -15671,6 +15656,7 @@ oacc_xform_loop (gcall *call)
 	     = (range - dir) / (chunks * step * num_threads) + dir  */
 	  tree per = expand_oacc_get_num_threads (&seq, mask);
 	  per = fold_convert (type, per);
+	  chunk_size = fold_convert (type, chunk_size);
 	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
 	  per = fold_build2 (MULT_EXPR, type, per, step);
 	  r = build2 (MINUS_EXPR, type, range, dir);
@@ -15706,8 +15692,10 @@ oacc_xform_loop (gcall *call)
 
 	  if (chunking)
 	    {
+	      chunk_size = fold_convert (diff_type, chunk_size);
+
 	      span = inner_size;
-	      span = fold_convert (type, span);
+	      span = fold_convert (diff_type, span);
 	      span = fold_build2 (MULT_EXPR, diff_type, span, chunk_size);
 	    }
 	  else
@@ -15754,6 +15742,8 @@ oacc_xform_loop (gcall *call)
 	  
 	  if (chunking)
 	    {
+	      chunk_size = fold_convert (diff_type, chunk_size);
+
 	      span = expand_oacc_get_num_threads (&seq, inner_mask);
 	      span = fold_convert (diff_type, span);
 	      span = fold_build2 (MULT_EXPR, diff_type, span, chunk_size);
@@ -15899,6 +15889,8 @@ new_oacc_loop_raw (oacc_loop *parent, lo
   loop->routine = NULL_TREE;
 
   loop->mask = loop->flags = 0;
+  loop->chunk_size = 0;
+  loop->head_end = NULL;
 
   return loop;
 }
@@ -15922,6 +15914,11 @@ new_oacc_loop (oacc_loop *parent, gcall
 
   loop->flags = TREE_INT_CST_LOW (gimple_call_arg (head, 2));
 
+  tree chunk_size = integer_zero_node;
+  if (loop->flags & OLF_GANG_STATIC)
+    chunk_size = gimple_call_arg (head,3);
+  loop->chunk_size = chunk_size;
+
   /* Set the mask from the incoming flags.
      TODO: Be smarter and more flexible.  */
   loop->mask = ((loop->flags >> OLF_DIM_BASE)
@@ -16086,6 +16083,8 @@ oacc_loop_walk (oacc_loop *loop, basic_b
 	      marker = 0;
 	      if (code == IFN_UNIQUE_OACC_TAIL_MARK)
 		loop = finish_oacc_loop (loop);
+	      else
+		loop->head_end = call;
 	    }
 	  else
 	    {
@@ -16113,7 +16112,6 @@ oacc_loop_walk (oacc_loop *loop, basic_b
 	    }
 	}
     }
-
   gcc_assert (!remaining && !marker);
 
   /* Walk successor blocks.  */
@@ -16202,6 +16200,47 @@ oacc_loop_xform_head_tail (gcall *from,
  break2:;
 }
 
+/* Transform the IFN_GOACC_LOOP internal functions by providing the
+   determined partitioning mask and chunking argument.  */
+
+static void
+oacc_loop_xform_loop (gcall *end_marker, tree mask_arg, tree chunk_arg)
+{
+  gimple_stmt_iterator gsi = gsi_for_stmt (end_marker);
+  
+  for (;;)
+    {
+      for (; !gsi_end_p (gsi); gsi_next (&gsi))
+	{
+	  gimple *stmt = gsi_stmt (gsi);
+
+	  if (!is_gimple_call (stmt))
+	    continue;
+
+	  gcall *call = as_a <gcall *> (stmt);
+      
+	  if (!gimple_call_internal_p (call))
+	    continue;
+
+	  if (gimple_call_internal_fn (call) != IFN_GOACC_LOOP)
+	    continue;
+
+	  *gimple_call_arg_ptr (call, 5) = mask_arg;
+	  *gimple_call_arg_ptr (call, 4) = chunk_arg;
+	  if (TREE_INT_CST_LOW (gimple_call_arg (call, 0))
+	      == IFN_GOACC_LOOP_BOUND)
+	    goto break2;
+	}
+
+      /* If we didn't see LOOP_BOUND, it should be in the single
+	 successor block.  */
+      basic_block bb = single_succ (gsi_bb (gsi));
+      gsi = gsi_start_bb (bb);
+    }
+
+ break2:;
+}
+
 /* Process the discovered OpenACC loops, setting the correct
    partitioning level etc.  */
 
@@ -16215,19 +16254,26 @@ oacc_loop_process (oacc_loop *loop)
   unsigned mask = loop->mask;
   unsigned dim = GOMP_DIM_GANG;
 
-  if (mask)
-    for (ix = 0; ix != GOMP_DIM_MAX && loop->heads[ix]; ix++)
-      {
-	gcc_assert (mask);
+  if (mask && !loop->routine)
+    {
+      tree mask_arg = build_int_cst (unsigned_type_node, mask);
+      tree chunk_arg = loop->chunk_size;
 
-	while (!(GOMP_DIM_MASK (dim) & mask))
-	  dim++;
+      oacc_loop_xform_loop (loop->head_end, mask_arg, chunk_arg);
 
-	oacc_loop_xform_head_tail (loop->heads[ix], dim);
-	oacc_loop_xform_head_tail (loop->tails[ix], dim);
+      for (ix = 0; ix != GOMP_DIM_MAX && loop->heads[ix]; ix++)
+	{
+	  gcc_assert (mask);
 
-	mask ^= GOMP_DIM_MASK (dim);
-      }
+	  while (!(GOMP_DIM_MASK (dim) & mask))
+	    dim++;
+
+	  oacc_loop_xform_head_tail (loop->heads[ix], dim);
+	  oacc_loop_xform_head_tail (loop->tails[ix], dim);
+
+	  mask ^= GOMP_DIM_MASK (dim);
+	}
+    }
   else
     gcc_assert (!loop->heads[1] && !loop->tails[1]
 		&& (loop->routine || !loop->parent

^ permalink raw reply	[flat|nested] 2+ messages in thread

* [gomp4]
@ 2015-10-09 14:14 Nathan Sidwell
  0 siblings, 0 replies; 2+ messages in thread
From: Nathan Sidwell @ 2015-10-09 14:14 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 130 bytes --]

I've applied this to gomp4 branch.

1) ports the break fix in gimple-fold from trunk

2) fixes missing tab in ptx output.

nathan

[-- Attachment #2: clean.patch --]
[-- Type: text/x-patch, Size: 1288 bytes --]

2015-10-09  Nathan Sidwell  <nathan@acm.org>

	* config/nvptx/nvptx.c (nvptx_init_axis_predicate): Fix output
	formatting.

	PR 67861
	* gimple-fold.c (gimple_fold_builtin): Add break after
	BUILT_IN_PRINTF_CHK, BUILT_IN_VPRINTF_CHK folding.

Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 228656)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -629,8 +629,7 @@ static void
 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
 {
   fprintf (file, "\t{\n");
-      
-  fprintf (file, "\t.reg.u32\t%%%s;\n", name);
+  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
   fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
   fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
   fprintf (file, "\t}\n");
Index: gcc/gimple-fold.c
===================================================================
--- gcc/gimple-fold.c	(revision 228656)
+++ gcc/gimple-fold.c	(working copy)
@@ -2890,6 +2890,7 @@ gimple_fold_builtin (gimple_stmt_iterato
 					   n == 3
 					   ? gimple_call_arg (stmt, 2)
 					   : NULL_TREE, fcode);
+      break;
     case BUILT_IN_ACC_ON_DEVICE:
       return gimple_fold_builtin_acc_on_device (gsi,
 						gimple_call_arg (stmt, 0));

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2015-10-15 17:12 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-15 17:12 [gomp4] Nathan Sidwell
  -- strict thread matches above, loose matches on Subject: below --
2015-10-09 14:14 [gomp4] Nathan Sidwell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).