public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [gomp] openacc reduction cleanup
@ 2015-06-30 18:07 Cesar Philippidis
  2015-07-01 13:45 ` Cesar Philippidis
  0 siblings, 1 reply; 2+ messages in thread
From: Cesar Philippidis @ 2015-06-30 18:07 UTC (permalink / raw)
  To: gcc-patches

This patch removes a lot of the crufty code which was necessary for
arbitrary-sized vector reductions. The new plan going forward is to fix
vector_length to a size such that vector loops don't require any
synchronization after the loop has terminated. In the case of nvptx
targets, vector_length = warp_sz, which is currently 32 threads. I'll
follow up this patch with another one which adds support for tree
reductions in vector loops at a later date.

This patch has been applied to gomp-4_0-branch.

Cesar

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [gomp] openacc reduction cleanup
  2015-06-30 18:07 [gomp] openacc reduction cleanup Cesar Philippidis
@ 2015-07-01 13:45 ` Cesar Philippidis
  0 siblings, 0 replies; 2+ messages in thread
From: Cesar Philippidis @ 2015-07-01 13:45 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 174 bytes --]

On 06/30/2015 11:01 AM, Cesar Philippidis wrote:

> This patch has been applied to gomp-4_0-branch.

It was brought to my attention that I forgot to attach the patch.

Cesar

[-- Attachment #2: reduction-cleanup.diff --]
[-- Type: text/x-patch, Size: 34409 bytes --]

2015-06-30  Cesar Philippidis  <cesar@codesourcery.com>

	gcc/
	* omp-low.c (oacc_get_reduction_array_id): Remove.
	(oacc_parallel_max_reduction_array_size): Remove.
	(install_array_var_ganglocal): Remove.
	(oacc_outermost_parallel_kernels_context): Remove.
	(oacc_inside_routine): Remove.
	(is_oacc_multithreaded): Remove.
	(oacc_needs_global_memory): Remove.
	(oacc_max_threads): Remove.
	(scan_sharing_clauses): Don't allocate an array for openacc reductions.
	(oacc_lower_reduction_var_helper): Remove.
	(lower_reduction_clauses): Handle all OpenACC reductions using atomics.
	(oacc_gimple_assign): Remove.
	(oacc_initialize_reduction_data): Remove.
	(oacc_serial_reduction): Remove.
	(oacc_finalize_reduction_data): Remove.
	(oacc_process_reduction_data_helper): Remove.
	(oacc_process_reduction_data): Remove.
	(lower_omp_target): Don't call oacc_process_reduction_data anymore.

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 24fac7c..11ac909 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -305,18 +305,6 @@ static basic_block oacc_broadcast (basic_block, basic_block,
       *handled_ops_p = false; \
       break;
 
-/* Helper function to get the name of the array containing the partial
-   reductions for OpenACC reductions.  */
-static const char *
-oacc_get_reduction_array_id (tree node)
-{
-  const char *id = IDENTIFIER_POINTER (DECL_NAME (node));
-  int len = strlen ("OACC") + strlen (id);
-  char *temp_name = XALLOCAVEC (char, len + 1);
-  snprintf (temp_name, len + 1, "OACC%s", id);
-  return IDENTIFIER_POINTER (get_identifier (temp_name));
-}
-
 static bool
 is_oacc_parallel (omp_context *ctx)
 {
@@ -326,68 +314,6 @@ is_oacc_parallel (omp_context *ctx)
 	      == GF_OMP_TARGET_KIND_OACC_PARALLEL));
 }
 
-/* Determine the maximum number of threads available to the current omp
-   context for a parallel OpenACC reduction.  */
-
-static tree
-oacc_parallel_max_reduction_array_size (omp_context *ctx, tree clauses,
-					gimple_seq *seqp)
-{
-  tree nthreads;
-  bool gangs, workers, vectors;
-  omp_context *oc;
-
-  gangs = find_omp_clause (clauses, OMP_CLAUSE_GANG) != NULL_TREE;
-  workers = find_omp_clause (clauses, OMP_CLAUSE_WORKER) != NULL_TREE;
-  vectors = find_omp_clause (clauses, OMP_CLAUSE_VECTOR) != NULL_TREE;
-
-  /* num_gangs, num_workers and vector_length is set in innermost parallel
-     gimple stmt.  Find that stmt and extract those values.  */
-
-  for (oc = ctx; !is_oacc_parallel (oc); oc = oc->outer)
-    ;
-
-  clauses = gimple_omp_target_clauses (oc->stmt);
-
-  /* FIXME: this assignment may be useless.  */
-  nthreads = create_tmp_var (sizetype, NULL);
-  gimplify_assign (nthreads, build_int_cst (sizetype, 1), seqp);
-
-  if (vectors)
-    {
-      tree v = find_omp_clause (clauses, OMP_CLAUSE_VECTOR_LENGTH);
-      if (v)
-	{
-	  //tree t = fold_convert (sizetype, OMP_CLAUSE_VECTOR_LENGTH_EXPR (v));
-	  gimplify_assign (nthreads, build_int_cst (sizetype, 1), seqp);
-	}
-    }
-
-  if (workers)
-    {
-      tree w = find_omp_clause (clauses, OMP_CLAUSE_NUM_WORKERS);
-      if (w)
-	{
-	  tree t = fold_convert (sizetype, OMP_CLAUSE_NUM_WORKERS_EXPR (w));
-	  gimple stmt = gimple_build_assign (nthreads, MULT_EXPR, nthreads, t);
-	  gimple_seq_add_stmt (seqp, stmt);
-	}
-    }
-
-  if (gangs)
-    {
-      tree g = find_omp_clause (clauses, OMP_CLAUSE_NUM_GANGS);
-      if (g)
-	{
-	  tree t = fold_convert (sizetype, OMP_CLAUSE_NUM_GANGS_EXPR (g));
-	  gimple stmt = gimple_build_assign(nthreads, MULT_EXPR, nthreads, t);
-	  gimple_seq_add_stmt (seqp, stmt);
-	}
-    }
-
-  return nthreads;
-}
-
 /* Holds offload tables with decls.  */
 vec<tree, va_gc> *offload_funcs, *offload_vars;
 
@@ -1583,14 +1509,6 @@ install_var_ganglocal (tree decl, omp_context *ctx)
   return new_var;
 }
 
-static tree
-install_array_var_ganglocal (tree decl, tree type, tree size, omp_context *ctx)
-{
-  tree ptr = alloc_var_ganglocal (decl, type, ctx, size);
-  insert_decl_map (&ctx->cb, decl, ptr);
-  return ptr;
-}
-
 /* Debugging dumps for parallel regions.  */
 void dump_omp_region (FILE *, struct omp_region *, int);
 void debug_omp_region (struct omp_region *);
@@ -1870,140 +1788,6 @@ fixup_child_record_type (omp_context *ctx)
     = build_qualified_type (build_reference_type (type), TYPE_QUAL_RESTRICT);
 }
 
-static omp_context *
-oacc_outermost_parallel_kernels_context (omp_context *ctx)
-{
-  omp_context *octx = ctx;
-
-  while (octx)
-    {
-      if (gimple_code (octx->stmt) == GIMPLE_OMP_TARGET)
-	{
-	  int kind = gimple_omp_target_kind (octx->stmt);
-	  if (kind == GF_OMP_TARGET_KIND_OACC_PARALLEL
-	      || kind == GF_OMP_TARGET_KIND_OACC_KERNELS)
-	    return octx;
-	}
-
-      octx = octx->outer;
-    }
-
-  return ctx;
-}
-
-/* OpenACC routines contain acc loops without any kernels or parallel
-   regions.  If the omp_context isn't a descendant of a kernels or
-   parallel region, then it must be inside a routine.  */
-
-static bool
-oacc_inside_routine (omp_context *ctx)
-{
-  omp_context *octx = ctx;
-
-  while (octx)
-    {
-      if (gimple_code (octx->stmt) == GIMPLE_OMP_TARGET)
-	{
-	  int kind = gimple_omp_target_kind (octx->stmt);
-	  if (kind == GF_OMP_TARGET_KIND_OACC_PARALLEL
-	      || kind == GF_OMP_TARGET_KIND_OACC_KERNELS)
-	    return false;
-	}
-
-      octx = octx->outer;
-    }
-
-  return true;
-}
-
-/* Returns true if an oacc context is to be executed in parallel.  OpenACC
-   kernels get translated into OpenACC parallel regions, so don't consider
-   them as candidates for multithreaded processing.  */
-
-static bool
-is_oacc_multithreaded (tree clauses, omp_context *ctx)
-{
-  omp_context *octx = oacc_outermost_parallel_kernels_context (ctx);
-
-  /* This is true if thies stmt is inside a routine or a kernels region.  */
-  if (!is_oacc_parallel (octx))
-    return false;
-
-  if (find_omp_clause (clauses, OMP_CLAUSE_GANG))
-    return true;
-
-  if (find_omp_clause (clauses, OMP_CLAUSE_WORKER))
-    return true;
-
-  if (find_omp_clause (clauses, OMP_CLAUSE_VECTOR))
-    return true;
-
-  return false;
-}
-
-/* Helper function to determine if a set of clauses uses global memory.
-   All reductions involving gangs or are in a parallel construction are
-   need atomic.  */
-
-static bool
-oacc_needs_global_memory (tree clauses, omp_context *ctx)
-{
-  if (is_oacc_parallel (ctx))
-    return true;
-
-  if (oacc_inside_routine (ctx))
-    return true;
-
-  return find_omp_clause (clauses, OMP_CLAUSE_GANG) != NULL_TREE;
-}
-
-/* Determine the maximum number of threads used by an openacc loop.
-   TODO: Add support for kernels regions.  */
-
-static tree
-oacc_max_threads (omp_context *ctx)
-{
-  tree nthreads, vector_length, num_gangs, num_workers, clauses, oclauses, c;
-  omp_context *octx;
-
-  octx = oacc_outermost_parallel_kernels_context (ctx);
-  clauses = gimple_omp_for_clauses (ctx->stmt);
-  oclauses = gimple_omp_target_clauses (octx->stmt);
-
-  num_gangs = build_int_cst (size_type_node, 1);
-  num_workers = build_int_cst (size_type_node, 1);
-  vector_length = build_int_cst (size_type_node, 1);
-  nthreads = build_int_cst (size_type_node, 1);
-
-  /* Find num_gangs, num_workers and vector_length.  FIXME: Vector reduction
-     also need to scan outer loops for worker threads so that memeory is
-     allocated for all of the vector lanes in each worker thread.  */
-  for (c = oclauses; c ; c = OMP_CLAUSE_CHAIN (c))
-    {
-      tree size = OMP_CLAUSE_OPERAND (c, 0);
-
-      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_GANGS)
-	num_gangs = fold_build1 (NOP_EXPR, size_type_node, size);
-      else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_WORKERS)
-	num_workers = fold_build1 (NOP_EXPR, size_type_node, size);
-      else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR_LENGTH)
-	vector_length = fold_build1 (NOP_EXPR, size_type_node, size);
-    }
-
-  /* Calculate nthreads.  */
-  for (c = clauses; c ; c = OMP_CLAUSE_CHAIN (c))
-    {
-      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_GANG)
-	nthreads = fold_build2 (MULT_EXPR, sizetype, nthreads, num_gangs);
-      else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_WORKER)
-	nthreads = fold_build2 (MULT_EXPR, sizetype, nthreads, num_workers);
-      else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR)
-	nthreads = fold_build2 (MULT_EXPR, sizetype, nthreads, vector_length);
-    }
-
-  return nthreads;
-}
-
 /* Instantiate decls as necessary in CTX to satisfy the data sharing
    specified by CLAUSES.  */
 
@@ -2100,9 +1884,6 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
 	case OMP_CLAUSE_LINEAR:
 	  decl = OMP_CLAUSE_DECL (c);
 
-	  if (is_gimple_omp_oacc (ctx->stmt))
-	    scan_array_reductions = true;
-
 	do_private:
 	  if (is_variable_sized (decl))
 	    {
@@ -2427,43 +2208,7 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
   if (scan_array_reductions)
     {
     for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
-      if (is_gimple_omp_oacc (ctx->stmt)
-	  && OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
-	  && is_oacc_multithreaded (clauses, ctx)
-	  && !oacc_needs_global_memory (clauses, ctx))
-	{
-	  /* Create a decl for the reduction array.  */
-	  tree var = OMP_CLAUSE_DECL (c);
-	  tree type = get_base_type (var);
-	  tree array;
-
-	    array = create_tmp_var (type,
-				    oacc_get_reduction_array_id (var));
-
-	  omp_context *octx = ctx;
-	  tree size = fold_build2 (MULT_EXPR, sizetype, TYPE_SIZE_UNIT (type),
-				   oacc_max_threads (ctx));
-
-	  /* This array must be inserted into the field map, which is only
-	     present in the outermost omp context.  */
-	  octx = oacc_outermost_parallel_kernels_context (ctx);
-
-	  if (octx != NULL && octx->field_map != NULL)
-	    {
-		array = install_array_var_ganglocal (array, type, size, octx);
-
-	      /* Insert it into the current context.  */
-	      splay_tree_insert (ctx->reduction_map, (splay_tree_key)
-				 oacc_get_reduction_array_id (var),
-				 (splay_tree_value) array);
-	      splay_tree_insert (ctx->reduction_map,
-				 (splay_tree_key) array,
-				 (splay_tree_value) array);
-	    }
-	  else
-	    sorry ("Reductions are not supported inside routines yet.");
-	}
-      else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
+      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
 	  && OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
 	{
 	  scan_omp (&OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c), ctx);
@@ -4911,58 +4656,6 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list,
     gimple_seq_add_stmt (stmt_list, gimple_build_label (label));
 }
 
-static void
-oacc_lower_reduction_var_helper (gimple_seq *stmt_seqp, omp_context *ctx,
-				 tree tid, tree var, tree new_var)
-{
-  /* The atomic operation at the end of the reduction creates
-     unnecessary write contention on accelerators.  To work around
-     this, create an array to store the partial reductions. Later, in
-     lower_omp_for (for openacc), the values of array will be
-     combined.  */
-
-  tree t = NULL_TREE, array, x;
-  tree type = get_base_type (var);
-  gimple stmt;
-
-  /* Now insert the partial reductions into the array.  */
-
-  /* Find the reduction array.  */
-
-  tree ptype = build_pointer_type (type);
-
-  t = lookup_oacc_reduction (oacc_get_reduction_array_id (var), ctx);
-  t = build_receiver_ref (t, false, ctx->outer);
-
-  array = create_tmp_var (ptype);
-  gimplify_assign (array, t, stmt_seqp);
-
-  tree ptr = create_tmp_var (TREE_TYPE (array));
-
-  /* Find the reduction array.  */
-
-  /* testing a unary conversion.  */
-  tree offset = create_tmp_var (sizetype);
-  gimplify_assign (offset, TYPE_SIZE_UNIT (type),
-		   stmt_seqp);
-  t = create_tmp_var (sizetype);
-  gimplify_assign (t, unshare_expr (fold_build1 (NOP_EXPR, sizetype, tid)),
-		   stmt_seqp);
-  stmt = gimple_build_assign (offset, MULT_EXPR, offset, t);
-  gimple_seq_add_stmt (stmt_seqp, stmt);
-
-  /* Offset expression.  Does the POINTER_PLUS_EXPR take care
-     of adding sizeof(var) to the array?  */
-  ptr = create_tmp_var (ptype);
-  stmt = gimple_build_assign (unshare_expr (ptr), POINTER_PLUS_EXPR, array,
-			      offset);
-  gimple_seq_add_stmt (stmt_seqp, stmt);
-
-  /* Move the local sum to gfc$sum[i].  */
-  x = unshare_expr (build_simple_mem_ref (ptr));
-  stmt = gimplify_assign (x, new_var, stmt_seqp);
-}
-
 /* Returns true if this reduction operation is compatible with atomics.  */
 
 static bool
@@ -5057,13 +4750,6 @@ expand_oacc_get_thread_num (gimple_seq *seq, int gwv_bits)
   return res;
 }
 
-
-static void
-oacc_finalize_reduction_data (tree clauses, tree nthreads, tree tid,
-			      gimple_seq *stmt_seqp, omp_context *ctx,
-			      bool is_accelerator);
-
-
 /* Generate code to implement the REDUCTION clauses.  OpenACC reductions
    are usually executed in parallel, but they fallback to sequential code for
    known single-threaded regions.  */
@@ -5073,7 +4759,7 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
 {
   gimple_seq sub_seq = NULL;
   gimple stmt;
-  tree x, c, tid = NULL_TREE;
+  tree x, c;
   int count = 0;
 
   /* SIMD reductions are handled in lower_rec_input_clauses.  */
@@ -5098,16 +4784,6 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
   if (count == 0)
     return;
 
-  /* Initialize thread info for OpenACC.  */
-  if (is_gimple_omp_oacc (ctx->stmt)
-      && !(is_oacc_multithreaded (clauses, ctx)
-	   && oacc_needs_global_memory (clauses, ctx))
-      && !is_oacc_parallel (ctx))
-    {
-      /* Get the current thread id.  */
-      tid = expand_oacc_get_thread_num (stmt_seqp, ctx->gwv_this);
-    }
-
   for (c = clauses; c ; c = OMP_CLAUSE_CHAIN (c))
     {
       tree var, ref, new_var;
@@ -5135,17 +4811,12 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
 
       if (count == 1 && is_atomic_compatible_reduction (var, ctx))
 	{
-	  if (!is_gimple_omp_oacc (ctx->stmt)
-	      || (is_gimple_omp_oacc (ctx->stmt)
-		  && (!is_oacc_multithreaded (clauses, ctx)
-		      || oacc_needs_global_memory (clauses, ctx))))
-	    {
 	  tree addr = build_fold_addr_expr_loc (clause_loc, ref);
 
 	  addr = save_expr (addr);
 
 	  if (is_gimple_omp_oacc (ctx->stmt)
-	      && !oacc_needs_global_memory (clauses, ctx))
+	      && (ctx->gwv_this == 0))
 	    {
 	      /* This reduction is done sequentially in OpenACC by a single
 		 thread.  There is no need to use atomics.  */
@@ -5163,78 +4834,6 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
 	    }
 
 	  return;
-	    }
-	  else
-	    {
-	  /* The atomic add at the end of the sum creates unnecessary
-	     write contention on accelerators.  To work around this,
-	     create an array to store the partial reductions. Later, in
-	     lower_omp_for (for openacc), the values of array will be
-	     combined.  */
-
-	  tree t, array, nthreads, ptr;
-	  tree type = get_base_type (var);
-	  omp_context *octx;
-	  gimple stmt;
-
-	  /* Find the total number of threads.  A reduction clause
-	     only appears inside a loop construction or a combined
-	     parallel and loop construct.  */
-	  nthreads = expand_oacc_get_num_threads (stmt_seqp, ctx->gwv_this);
-
-	  /* Now insert the partial reductions into the array.  */
-
-	  /* Find the reduction array.  */
-
-	  tree ptype = build_pointer_type (type);
-
-	  t = lookup_oacc_reduction (oacc_get_reduction_array_id (var), ctx);
-	  
-	  /* Find the outermost ctx where the array variable has been
-	     declared.  */
-
-	  octx = oacc_outermost_parallel_kernels_context (ctx);
-
-	  if (oacc_needs_global_memory (clauses, ctx))
-	    {
-	      t = build_receiver_ref (t, false, octx);
-	      array = create_tmp_var (ptype, NULL);
-	      gimplify_assign (array, t, stmt_seqp);
-	    }
-	  else
-	    array = t;
-
-	  /* Find the reduction array.  */
-
-	  /* testing a unary conversion.  */
-	  tree offset = create_tmp_var (sizetype, NULL);
-	  gimplify_assign (offset, TYPE_SIZE_UNIT (type),
-			   stmt_seqp);
-	  t = create_tmp_var (sizetype, NULL);
-	  gimplify_assign (t, unshare_expr (fold_build1 (NOP_EXPR, sizetype,
-							 tid)),
-			   stmt_seqp);
-	  stmt = gimple_build_assign (offset, MULT_EXPR, offset, t);
-	  gimple_seq_add_stmt (stmt_seqp, stmt);
-
-	  /* Offset expression.  Does the POINTER_PLUS_EXPR take care
-	     of adding sizeof(var) to the array?  */
-	  ptr = create_tmp_var (ptype, NULL);
-	  stmt = gimple_build_assign (unshare_expr(ptr), POINTER_PLUS_EXPR,
-				      array, offset);
-	  gimple_seq_add_stmt (stmt_seqp, stmt);
-
-	  /* Move the local sum to OACCsum[i].  */
-	  x = unshare_expr (build_simple_mem_ref (ptr));
-	  stmt = gimplify_assign (x, new_var, stmt_seqp);
-
-	  /* Synchronize the threads and finish up the reduction.  */
-
-	  oacc_finalize_reduction_data (clauses, nthreads, tid, stmt_seqp, ctx,
-					true);
-
-	  return;
-	    }
 	}
       else if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
 	{
@@ -5253,14 +4852,7 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
 	}
       else
 	{
-	  if (is_gimple_omp_oacc (ctx->stmt)
-	      && is_oacc_multithreaded (clauses, ctx)
-	      && !oacc_needs_global_memory (clauses, ctx))
-	    {
-	      oacc_lower_reduction_var_helper (stmt_seqp, ctx, tid, var,
-					       new_var);
-	    }
-	  else if (is_oacc_parallel (ctx) && is_reference (var))
+	  if (is_oacc_parallel (ctx) && is_reference (var))
 	    {
 	      tree t1, t2;
 	      tree type = TREE_TYPE (new_var);
@@ -5285,11 +4877,6 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
 	}
     }
 
-  if (is_gimple_omp_oacc (ctx->stmt)
-      && is_oacc_multithreaded (clauses, ctx)
-      && !oacc_needs_global_memory (clauses, ctx))
-    return;
-
   stmt = gimple_build_call (builtin_decl_explicit (BUILT_IN_GOMP_ATOMIC_START),
 			    0);
   gimple_seq_add_stmt (stmt_seqp, stmt);
@@ -11337,471 +10924,6 @@ make_pass_expand_omp_ssa (gcc::context *ctxt)
   return new pass_expand_omp_ssa (ctxt);
 }
 \f
-/* Routines to lower OMP directives into OMP-GIMPLE.  */
-
-/* Helper function to preform, potentially COMPLEX_TYPE, operation and
-   convert it to gimple.  */
-static void
-oacc_gimple_assign (tree dest, tree_code op, tree src, gimple_seq *seq)
-{
-  gimple stmt;
-
-  if (TREE_CODE (TREE_TYPE (dest)) != COMPLEX_TYPE)
-    {
-      stmt = gimple_build_assign (dest, op, dest, src);
-      gimple_seq_add_stmt (seq, stmt);
-      return;
-    }
-
-  tree t = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)));
-  tree rdest = fold_build1 (REALPART_EXPR, TREE_TYPE (TREE_TYPE (dest)), dest);
-  gimplify_assign (t, rdest, seq);
-  rdest = t;
-
-  t = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)));
-  tree idest = fold_build1 (IMAGPART_EXPR, TREE_TYPE (TREE_TYPE (dest)), dest);
-  gimplify_assign (t, idest, seq);
-  idest = t;
-
-  t = create_tmp_var (TREE_TYPE (TREE_TYPE (src)));
-  tree rsrc = fold_build1 (REALPART_EXPR, TREE_TYPE (TREE_TYPE (src)), src);
-  gimplify_assign (t, rsrc, seq);
-  rsrc = t;
-
-  t = create_tmp_var (TREE_TYPE (TREE_TYPE (src)));
-  tree isrc = fold_build1 (IMAGPART_EXPR, TREE_TYPE (TREE_TYPE (src)), src);
-  gimplify_assign (t, isrc, seq);
-  isrc = t;
-
-  tree r = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)));
-  tree i = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)));
-  tree result;
-
-  if (op == PLUS_EXPR)
-    {
-      stmt = gimple_build_assign (r, op, rdest, rsrc);
-      gimple_seq_add_stmt (seq, stmt);
-
-      stmt = gimple_build_assign (i, op, idest, isrc);
-      gimple_seq_add_stmt (seq, stmt);
-    }
-  else if (op == MULT_EXPR)
-    {
-      /* Let x = a + ib = dest, y = c + id = src.
-	 x * y = (ac - bd) + i(ad + bc)  */
-      tree ac = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)));
-      tree bd = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)));
-      tree ad = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)));
-      tree bc = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)));
-
-      stmt = gimple_build_assign (ac, MULT_EXPR, rdest, rsrc);
-      gimple_seq_add_stmt (seq, stmt);
-
-      stmt = gimple_build_assign (bd, MULT_EXPR, idest, isrc);
-      gimple_seq_add_stmt (seq, stmt);
-
-      stmt = gimple_build_assign (r, MINUS_EXPR, ac, bd);
-      gimple_seq_add_stmt (seq, stmt);
-
-      stmt = gimple_build_assign (ad, MULT_EXPR, rdest, isrc);
-      gimple_seq_add_stmt (seq, stmt);
-
-      stmt = gimple_build_assign (bd, MULT_EXPR, idest, rsrc);
-      gimple_seq_add_stmt (seq, stmt);
-
-      stmt = gimple_build_assign (i, PLUS_EXPR, ad, bc);
-      gimple_seq_add_stmt (seq, stmt);
-    }
-  else
-    gcc_unreachable ();
-
-  result = build2 (COMPLEX_EXPR, TREE_TYPE (dest), r, i);
-  gimplify_assign (dest, result, seq);
-}
-
-/* Helper function to initialize local data for the reduction arrays.
-   The reduction arrays need to be placed inside the calling function
-   for accelerators, or else the host won't be able to preform the final
-   reduction.  */
-
-static void
-oacc_initialize_reduction_data (tree clauses, tree nthreads,
-				gimple_seq *stmt_seqp, omp_context *ctx)
-{
-  tree c, t, oc;
-  gimple stmt;
-  omp_context *octx;
-
-  /* Find the innermost PARALLEL openmp context.  This is needed to insert
-     new data mappings to the containing parallel/kernels region.  */
-  octx = oacc_outermost_parallel_kernels_context (ctx);
-
-  /* Extract the clauses.  */
-  oc = gimple_omp_target_clauses (octx->stmt);
-
-  /* Find the last parallel/kernels region clause.  */
-  for (; oc && OMP_CLAUSE_CHAIN (oc); oc = OMP_CLAUSE_CHAIN (oc))
-    ;
-
-  /* Allocate arrays for each reduction variable.  */
-  for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
-    {
-      if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION)
-	continue;
-
-      if (!oacc_needs_global_memory (clauses, ctx))
-	continue;
-
-      tree var = OMP_CLAUSE_DECL (c);
-      tree type = get_base_type (var);
-      tree array = lookup_oacc_reduction (oacc_get_reduction_array_id (var),
-					  ctx);
-      tree size, call;
-
-      /* Calculate size of the reduction array.  */
-      t = create_tmp_var (TREE_TYPE (nthreads));
-      stmt = gimple_build_assign (t, MULT_EXPR, nthreads,
-				  fold_convert (TREE_TYPE (nthreads),
-						TYPE_SIZE_UNIT (type)));
-      gimple_seq_add_stmt (stmt_seqp, stmt);
-
-      size = create_tmp_var (sizetype);
-      gimplify_assign (size, fold_build1 (NOP_EXPR, sizetype, t), stmt_seqp);
-
-      /* Now allocate memory for it.  FIXME: Allocating memory for the
-	 reduction array may be unnecessary once the final reduction is able
-	 to be preformed on the accelerator.  Instead of allocating memory on
-	 the host side, it could just be allocated on the accelerator.  */
-      call = unshare_expr (builtin_decl_explicit (BUILT_IN_ALLOCA));
-      stmt = gimple_build_call (call, 1, size);
-      gimple_call_set_lhs (stmt, array);
-      gimple_seq_add_stmt (stmt_seqp, stmt);
-
-      /* Map this array into the accelerator.  */
-
-      /* Add the reduction array to the list of clauses.  */
-      /* FIXME: Currently, these variables must be placed in the outer
-	 most clause so that copy-out works.  */
-      tree x = array;
-      t = build_omp_clause (gimple_location (ctx->stmt), OMP_CLAUSE_MAP);
-      OMP_CLAUSE_SET_MAP_KIND (t, GOMP_MAP_FORCE_FROM);
-      OMP_CLAUSE_DECL (t) = x;
-      OMP_CLAUSE_CHAIN (t) = NULL;
-      if (oc)
-	OMP_CLAUSE_CHAIN (oc) = t;
-      else
-	gimple_omp_target_set_clauses (as_a <gomp_target *> (octx->stmt), t);
-      OMP_CLAUSE_SIZE (t) = size;
-      oc = t;
-    }
-}
-
-/* Serial reduction helper.  */
-
-static void
-oacc_serial_reduction (tree clauses, tree nthreads, tree tid, 
-		       gimple_seq *stmt_seqp, omp_context *ctx,
-		       hash_map<tree, tree> &reduction_vars,
-		       bool is_accelerator)
-{
-  tree c, x, var, orig_var, array, loop_header, loop_body, loop_exit, type;
-  tree next, reduction_exit, t1, t2;
-  gimple stmt;
-
-  if (is_accelerator)
-    {
-      /* Synchronize all of the threads.  */
-      tree call = builtin_decl_explicit (BUILT_IN_SYNC_SYNCHRONIZE);
-      stmt = gimple_build_call (call, 0);
-      gimple_seq_add_stmt (stmt_seqp, stmt);
-      
-      /* Jump to the exit label if tid != 0.  Ensure that TID considers
-	 the global TID (gang, worker and vector) not just the local
-	 loop-specific TID.  */
-      
-      next = create_artificial_label (UNKNOWN_LOCATION);
-      reduction_exit = create_artificial_label (UNKNOWN_LOCATION);
-      
-      t1 = create_tmp_var (sizetype, NULL);
-      t2 = create_tmp_var (sizetype, NULL);
-      gimplify_assign (t1, fold_build1 (NOP_EXPR, sizetype, tid),
-		       stmt_seqp);
-      gimplify_assign (t2, fold_build1 (NOP_EXPR, sizetype,
-				    integer_zero_node),
-		       stmt_seqp);
-      stmt = gimple_build_cond (NE_EXPR, t1, t2, reduction_exit, next);
-      gimple_seq_add_stmt (stmt_seqp, stmt);
-      gimple_seq_add_stmt (stmt_seqp, gimple_build_label (next));
-    }
-
-  /* Create for loop.
-
-     let var = the original reduction variable
-     let array = reduction variable array
-
-     for (i = 0; i < nthreads; i++)
-       var op= array[i]
- */
-
-  loop_header = create_artificial_label (UNKNOWN_LOCATION);
-  loop_body = create_artificial_label (UNKNOWN_LOCATION);
-  loop_exit = create_artificial_label (UNKNOWN_LOCATION);
-
-  /* Create and initialize an index variable.  */
-  tree ix = create_tmp_var (sizetype);
-  gimplify_assign (ix, fold_build1 (NOP_EXPR, sizetype, integer_zero_node),
-		   stmt_seqp);
-
-  /* Insert the loop header label here.  */
-  gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_header));
-
-  /* Exit loop if ix >= nthreads.  */
-  x = create_tmp_var (sizetype);
-  gimplify_assign (x, fold_build1 (NOP_EXPR, sizetype, nthreads), stmt_seqp);
-  stmt = gimple_build_cond (GE_EXPR, ix, x, loop_exit, loop_body);
-  gimple_seq_add_stmt (stmt_seqp, stmt);
-
-  /* Insert the loop body label here.  */
-  gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_body));
-
-  /* Collapse each reduction array, one element at a time.  */
-  for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
-    {
-      if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION)
-	continue;
-
-      tree_code reduction_code = OMP_CLAUSE_REDUCTION_CODE (c);
-
-      /* reduction(-:var) sums up the partial results, so it acts
-	 identically to reduction(+:var).  */
-      if (reduction_code == MINUS_EXPR)
-        reduction_code = PLUS_EXPR;
-
-      /* Set up reduction variable var.  */
-      orig_var = OMP_CLAUSE_DECL (c);
-      type = get_base_type (orig_var);
-
-      if (is_accelerator)
-	orig_var = lookup_decl_in_outer_ctx (orig_var, ctx);
-
-      /* Lookup the local-private reduction variable.  */
-      var = *reduction_vars.get (orig_var);
-
-      array = lookup_oacc_reduction (oacc_get_reduction_array_id
-				     (OMP_CLAUSE_DECL (c)), ctx);
-
-      /* Calculate the array offset.  */
-      tree offset = create_tmp_var (sizetype);
-      gimplify_assign (offset, TYPE_SIZE_UNIT (type), stmt_seqp);
-      stmt = gimple_build_assign (offset, MULT_EXPR, offset, ix);
-      gimple_seq_add_stmt (stmt_seqp, stmt);
-
-      tree ptr = create_tmp_var (TREE_TYPE (array));
-      stmt = gimple_build_assign (ptr, POINTER_PLUS_EXPR, array, offset);
-      gimple_seq_add_stmt (stmt_seqp, stmt);
-
-      /* Extract array[ix] into mem.  */
-      tree mem = create_tmp_var (type);
-      gimplify_assign (mem, build_simple_mem_ref (ptr), stmt_seqp);
-
-      /* Find the original reduction variable.  */
-      if (is_reference (var))
-	var = build_simple_mem_ref (var);
-
-      tree t = create_tmp_var (type);
-
-      x = lang_hooks.decls.omp_clause_assign_op (c, t, var);
-      gimplify_and_add (unshare_expr(x), stmt_seqp);
-
-      /* var = var op mem */
-      switch (OMP_CLAUSE_REDUCTION_CODE (c))
-	{
-	case TRUTH_ANDIF_EXPR:
-	case TRUTH_ORIF_EXPR:
-	  t = fold_build2 (OMP_CLAUSE_REDUCTION_CODE (c), integer_type_node,
-			   t, mem);
-	  gimplify_and_add (t, stmt_seqp);
-	  break;
-	default:
-	  /* The lhs isn't a gimple_reg when var is COMPLEX_TYPE.  */
-	  oacc_gimple_assign (t, OMP_CLAUSE_REDUCTION_CODE (c), mem,
-			      stmt_seqp);
-	}
-
-      t = fold_build1 (NOP_EXPR, TREE_TYPE (var), t);
-      x = lang_hooks.decls.omp_clause_assign_op (c, var, t);
-      gimplify_and_add (unshare_expr(x), stmt_seqp);
-    }
-
-  /* Increment the induction variable.  */
-  tree one = fold_build1 (NOP_EXPR, sizetype, integer_one_node);
-  stmt = gimple_build_assign (ix, PLUS_EXPR, ix, one);
-  gimple_seq_add_stmt (stmt_seqp, stmt);
-
-  /* Go back to the top of the loop.  */
-  gimple_seq_add_stmt (stmt_seqp, gimple_build_goto (loop_header));
-
-  /* Place the loop exit label here.  */
-  gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_exit));
-
-  /* Finally, update the original reduction variables.  */
-  for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
-    {
-      if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION)
-	continue;
-
-      /* Set up reduction variable var.  */
-      orig_var = OMP_CLAUSE_DECL (c);
-      if (is_accelerator)
-	orig_var = lookup_decl_in_outer_ctx (orig_var, ctx);
-      type = get_base_type (orig_var);
-
-      var = *reduction_vars.get (orig_var);
-      x = lang_hooks.decls.omp_clause_assign_op (c, orig_var, var);
-      gimplify_and_add (unshare_expr (x), stmt_seqp);
-    }
-
-  if (is_accelerator)
-    gimple_seq_add_stmt (stmt_seqp, gimple_build_label (reduction_exit));
-}
-
-/* Helper function to finalize local data for the reduction arrays. The
-   reduction array needs to be reduced to the original reduction variable.
-
-   FIXME: This function assumes that there are vector_length threads in
-   total.  Also, it assumes that there are at least vector_length iterations
-   in the for loop.  
-
-   FIXME: This function should also take reduction variables which are also
-   private.
-*/
-
-static void
-oacc_finalize_reduction_data (tree clauses, tree nthreads, tree tid,
-			      gimple_seq *stmt_seqp, omp_context *ctx,
-			      bool is_accelerator)
-{
-  gcc_assert (is_gimple_omp_oacc (ctx->stmt));
-  tree c, x, var, orig_var, type;
-
-  /* Create a private copy of each original reduction variable to preform
-     the intermediate reduction result.  The original reduction variable
-     cannot be used because it may need to be updated at the end of a loop
-     or parallel region depending on whether a gang clause is present.  */
-
-  /* Create a hash_map for the gang-local temporary reduction variables.  */
-  hash_map<tree, tree> reduction_vars;
-
-  for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
-    {
-      if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION)
-	continue;
-
-      /* Set up reduction variable var.  */
-      orig_var = OMP_CLAUSE_DECL (c);
-
-      if (is_accelerator)
-	orig_var = lookup_decl_in_outer_ctx (orig_var, ctx);
-
-      type = get_base_type (orig_var);
-      var = create_tmp_var (type, NULL);
-      x = lang_hooks.decls.omp_clause_assign_op (c, var, orig_var);
-      gimplify_and_add (unshare_expr (x), stmt_seqp);
-      reduction_vars.put (orig_var, var);
-    }
-
-    oacc_serial_reduction (clauses, nthreads, tid, stmt_seqp, ctx,
-			   reduction_vars, is_accelerator);
-}
-
-static void
-oacc_process_reduction_data_helper (gimple_seq stmt, gimple_seq *in_stmt_seqp,
-				    gimple_seq *out_stmt_seqp /* TODO */ ATTRIBUTE_UNUSED, tree clauses,
-				    omp_context *ctx)
-{
-  switch (gimple_code (stmt))
-    {
-    case GIMPLE_OMP_FOR:
-      clauses = gimple_omp_for_clauses (stmt);
-      break;
-    case GIMPLE_OMP_TARGET:
-      clauses = gimple_omp_target_clauses (stmt);
-      break;
-    default:
-      clauses = NULL;
-    }
-
-  /* Allocate an array of num_gangs threads for the parallel reduction, and
-     finalize the result when the parallel region exits.  */
-  if (is_oacc_parallel (ctx)
-      && find_omp_clause (clauses, OMP_CLAUSE_REDUCTION))
-    {
-      tree nthreads;
-      nthreads = oacc_parallel_max_reduction_array_size (ctx, clauses,
-							 in_stmt_seqp);
-
-      if (!oacc_needs_global_memory (clauses, ctx))
-	oacc_initialize_reduction_data (clauses, nthreads, in_stmt_seqp,
-					ctx);
-      }
-}
-
-/* Scan through all of the gimple stmts searching for an OMP_FOR_EXPR, and
-   scan that for reductions.  Allocate memory for the arrays of partial
-   reductions and process them as necessary.  All memory involving gangs
-   is allocated via a memory map, otherwise gang-local shared memory is
-   used.  */
-
-static void
-oacc_process_reduction_data (gimple_seq *body, gimple_seq *in_stmt_seqp,
-			gimple_seq *out_stmt_seqp, omp_context *ctx)
-{
-  gimple_stmt_iterator gsi;
-  gimple_seq inner = NULL;
-  gimple stmt;
-
-  gsi = gsi_start (*body);
-  while (!gsi_end_p (gsi))
-    {
-      stmt = gsi_stmt (gsi);
-      /* A collapse clause may have inserted a new bind block.  */
-      if (gbind *bind_stmt = dyn_cast <gbind *> (stmt))
-	{
-	  /* Recursively scan for nested loops.  */
-	  inner = gimple_bind_body (bind_stmt);
-	  oacc_process_reduction_data (&inner, in_stmt_seqp, out_stmt_seqp,
-				       ctx);	  
-	}
-      else if (gimple_code (stmt) == GIMPLE_OMP_FOR)
-	{
-	  tree clauses, c;
-	  bool reduction_found = false;
-	  
-	  stmt = gsi_stmt (gsi);
-	  clauses = gimple_omp_for_clauses (stmt);
-
-	  /* Search for a reduction clause.  */
-	  for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
-	    if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION)
-	      {
-		reduction_found = true;
-		break;
-	      }
-      
-	  if (reduction_found && is_oacc_multithreaded (clauses, ctx)
-	      && !oacc_needs_global_memory (clauses, ctx))
-	    oacc_process_reduction_data_helper (stmt, in_stmt_seqp,
-						out_stmt_seqp, clauses,
-						ctx);
-
-	  /* Now recursively scan for nested loops.  */
-	  oacc_process_reduction_data (gimple_omp_body_ptr (stmt),
-				       in_stmt_seqp, out_stmt_seqp, ctx);
-	}
-      gsi_next (&gsi);
-    }
-}
-
 /* Routines to lower OpenMP directives into OMP-GIMPLE.  */
 
 /* If ctx is a worksharing context inside of a cancellable parallel
@@ -13228,7 +12350,6 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
   if (has_reduction)
     {
       lower_rec_input_clauses (clauses, &irlist, &orlist, ctx, NULL);
-      oacc_process_reduction_data (&tgt_body, &irlist, &orlist, ctx);
       lower_reduction_clauses (clauses, &orlist, ctx);
     }
 

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2015-07-01 13:45 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-30 18:07 [gomp] openacc reduction cleanup Cesar Philippidis
2015-07-01 13:45 ` Cesar Philippidis

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).