public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc/devel/omp/gcc-11] openacc: Remove unnecessary barriers (gimple worker partitioning/broadcast)
@ 2021-05-13 16:15 Kwok Yeung
  0 siblings, 0 replies; only message in thread
From: Kwok Yeung @ 2021-05-13 16:15 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:56c407d4fb7446b68323a8b01594732f428ec4b3

commit 56c407d4fb7446b68323a8b01594732f428ec4b3
Author: Julian Brown <julian@codesourcery.com>
Date:   Thu Feb 13 06:13:34 2020 -0800

    openacc: Remove unnecessary barriers (gimple worker partitioning/broadcast)
    
    This is an optimisation for middle-end worker-partitioning support (used
    to support multiple workers on AMD GCN).  At present, barriers may be
    emitted in cases where they aren't needed and cannot be optimised away.
    This patch stops the extraneous barriers from being emitted in the
    first place.
    
    One exception to the above (where the barrier is still needed) is for
    predicated blocks of code that perform a write to gang-private shared
    memory from one worker.  We must execute a barrier before other workers
    read that shared memory location.
    
    2020-07-15  Julian Brown  <julian@codesourcery.com>
    
    gcc/
            * config/gcn/gcn.c (gimple.h): Include.
            (gcn_fork_join): Emit barrier for worker-level joins.
            * omp-sese.c (find_local_vars_to_propagate): Add writes_gangprivate
            bitmap parameter.  Set bit for blocks containing gang-private variable
            writes.
            (worker_single_simple): Don't emit barrier after predicated block.
            (worker_single_copy): Don't emit barrier if we're not broadcasting
            anything and the block contains no gang-private writes.
            (neuter_worker_single): Don't predicate blocks that only contain NOPs
            or internal marker functions.  Pass has_gangprivate_write argument to
            worker_single_copy.
            (oacc_do_neutering): Add writes_gangprivate bitmap handling.

Diff:
---
 gcc/ChangeLog.omp    |  15 +++++++
 gcc/config/gcn/gcn.c |   9 +++-
 gcc/omp-sese.c       | 115 +++++++++++++++++++++++++++++++++++++++------------
 3 files changed, 112 insertions(+), 27 deletions(-)

diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp
index 882a2fbf473..06d9aa25de7 100644
--- a/gcc/ChangeLog.omp
+++ b/gcc/ChangeLog.omp
@@ -1,3 +1,18 @@
+2020-07-15  Julian Brown  <julian@codesourcery.com>
+
+	* config/gcn/gcn.c (gimple.h): Include.
+	(gcn_fork_join): Emit barrier for worker-level joins.
+	* omp-sese.c (find_local_vars_to_propagate): Add writes_gangprivate
+	bitmap parameter.  Set bit for blocks containing gang-private variable
+	writes.
+	(worker_single_simple): Don't emit barrier after predicated block.
+	(worker_single_copy): Don't emit barrier if we're not broadcasting
+	anything and the block contains no gang-private writes.
+	(neuter_worker_single): Don't predicate blocks that only contain NOPs
+	or internal marker functions.  Pass has_gangprivate_write argument to
+	worker_single_copy.
+	(oacc_do_neutering): Add writes_gangprivate bitmap handling.
+
 2020-07-15  Julian Brown  <julian@codesourcery.com>
 
 	* config/gcn/gcn-valu.md (scatter<mode>_insn_1offset_ds<exec_scatter>):
diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index 017e24a2474..80f1430cca8 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -50,6 +50,7 @@
 #include "varasm.h"
 #include "intl.h"
 #include "rtl-iter.h"
+#include "gimple.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -4988,9 +4989,15 @@ gcn_oacc_dim_pos (int dim)
 /* Implement TARGET_GOACC_FORK_JOIN.  */
 
 static bool
-gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
+gcn_fork_join (gcall *call, const int *ARG_UNUSED (dims),
 	       bool ARG_UNUSED (is_fork))
 {
+  tree arg = gimple_call_arg (call, 2);
+  unsigned axis = TREE_INT_CST_LOW (arg);
+
+  if (!is_fork && axis == GOMP_DIM_WORKER && dims[axis] != 1)
+    return true;
+
   return false;
 }
 
diff --git a/gcc/omp-sese.c b/gcc/omp-sese.c
index c279cf1708a..ad80316f735 100644
--- a/gcc/omp-sese.c
+++ b/gcc/omp-sese.c
@@ -767,16 +767,19 @@ static void
 find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
 			      hash_set<tree> *partitioned_var_uses,
 			      hash_set<tree> *gangprivate_vars,
+			      bitmap writes_gangprivate,
 			      vec<propagation_set *> *prop_set)
 {
   unsigned mask = outer_mask | par->mask;
 
   if (par->inner)
     find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
-				  gangprivate_vars, prop_set);
+				  gangprivate_vars, writes_gangprivate,
+				  prop_set);
   if (par->next)
     find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
-				  gangprivate_vars, prop_set);
+				  gangprivate_vars, writes_gangprivate,
+				  prop_set);
 
   if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
     {
@@ -797,8 +800,7 @@ find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
 		  if (!VAR_P (var)
 		      || is_global_var (var)
 		      || AGGREGATE_TYPE_P (TREE_TYPE (var))
-		      || !partitioned_var_uses->contains (var)
-		      || gangprivate_vars->contains (var))
+		      || !partitioned_var_uses->contains (var))
 		    continue;
 
 		  if (stmt_may_clobber_ref_p (stmt, var))
@@ -812,6 +814,14 @@ find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
 			  fprintf (dump_file, "\n");
 			}
 
+		      if (gangprivate_vars->contains (var))
+			{
+			  /* If we write a gang-private variable, we want a
+			     barrier at the end of the block.  */
+			  bitmap_set_bit (writes_gangprivate, block->index);
+			  continue;
+			}
+
 		      if (!(*prop_set)[block->index])
 			(*prop_set)[block->index] = new propagation_set;
 
@@ -923,14 +933,6 @@ worker_single_simple (basic_block from, basic_block to,
 	    }
 	}
     }
-
-  gsi = gsi_start_bb (skip_block);
-
-  decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
-  gimple *acc_bar = gimple_build_call (decl, 0);
-
-  gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
-  update_stmt (acc_bar);
 }
 
 /* This is a copied and renamed omp-low.c:omp_build_component_ref.  */
@@ -1008,7 +1010,7 @@ worker_single_copy (basic_block from, basic_block to,
 		    hash_set<tree> *def_escapes_block,
 		    hash_set<tree> *worker_partitioned_uses,
 		    tree record_type, unsigned HOST_WIDE_INT placement,
-		    bool isolate_broadcasts)
+		    bool isolate_broadcasts, bool has_gangprivate_write)
 {
   /* If we only have virtual defs, we'll have no record type, but we still want
      to emit single_copy_start and (particularly) single_copy_end to act as
@@ -1089,14 +1091,19 @@ worker_single_copy (basic_block from, basic_block to,
   edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE);
   ef->probability = et->probability.invert ();
 
-  decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
-  gimple *acc_bar = gimple_build_call (decl, 0);
-
   gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block);
-  gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);
-
   cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE);
-  gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);
+
+  if (record_type != char_type_node || has_gangprivate_write)
+    {
+      decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
+      gimple *acc_bar = gimple_build_call (decl, 0);
+
+      gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);
+      gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);
+    }
+  else
+    gsi_insert_before (&bar_gsi, cond, GSI_NEW_STMT);
 
   edge et2 = split_block (barrier_block, cond);
   et2->flags &= ~EDGE_FALLTHRU;
@@ -1258,7 +1265,8 @@ neuter_worker_single (parallel_g *par, unsigned outer_mask,
 		      bitmap worker_single, bitmap vector_single,
 		      vec<propagation_set *> *prop_set,
 		      hash_set<tree> *partitioned_var_uses,
-		      blk_offset_map_t *blk_offset_map)
+		      blk_offset_map_t *blk_offset_map,
+		      bitmap writes_gangprivate)
 {
   unsigned mask = outer_mask | par->mask;
 
@@ -1344,10 +1352,57 @@ neuter_worker_single (parallel_g *par, unsigned outer_mask,
 	      (*prop_set)[block->index] = 0;
 	    }
 
-	  tree record_type = (tree) block->aux;
+	  bool only_marker_fns = true;
+	  bool join_block = false;
+
+	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
+	       !gsi_end_p (gsi);
+	       gsi_next (&gsi))
+	    {
+	      gimple *stmt = gsi_stmt (gsi);
+	      if (gimple_code (stmt) == GIMPLE_CALL
+		  && gimple_call_internal_p (stmt, IFN_UNIQUE))
+		{
+		  enum ifn_unique_kind k = ((enum ifn_unique_kind)
+		    TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
+		  if (k != IFN_UNIQUE_OACC_PRIVATE
+		      && k != IFN_UNIQUE_OACC_JOIN
+		      && k != IFN_UNIQUE_OACC_FORK
+		      && k != IFN_UNIQUE_OACC_HEAD_MARK
+		      && k != IFN_UNIQUE_OACC_TAIL_MARK)
+		    only_marker_fns = false;
+		  else if (k == IFN_UNIQUE_OACC_JOIN)
+		    /* The JOIN marker is special in that it *cannot* be
+		       predicated for worker zero, because it may be lowered
+		       to a barrier instruction and all workers must typically
+		       execute that barrier.  We shouldn't be doing any
+		       broadcasts from the join block anyway.  */
+		    join_block = true;
+		}
+	      else if (gimple_code (stmt) == GIMPLE_CALL
+		       && gimple_call_internal_p (stmt, IFN_GOACC_LOOP))
+		/* Empty.  */;
+	      else if (gimple_nop_p (stmt))
+		/* Empty.  */;
+	      else
+		only_marker_fns = false;
+	    }
+
+	  /* We can skip predicating this block for worker zero if the only
+	     thing it contains is marker functions that will be removed in the
+	     oaccdevlow pass anyway.
+	     Don't do this if the block has (any) phi nodes, because those
+	     might define SSA names that need broadcasting.
+	     TODO: We might be able to skip transforming blocks that only
+	     contain some other trivial statements too.  */
+	  if (only_marker_fns && !phi_nodes (block))
+	    continue;
+
+	  gcc_assert (!join_block);
 
 	  if (has_defs)
 	    {
+	      tree record_type = (tree) block->aux;
 	      std::pair<unsigned HOST_WIDE_INT, bool> *off_rngalloc
 		= blk_offset_map->get (block);
 	      gcc_assert (!record_type || off_rngalloc);
@@ -1355,9 +1410,12 @@ neuter_worker_single (parallel_g *par, unsigned outer_mask,
 		= off_rngalloc ? off_rngalloc->first : 0;
 	      bool range_allocated
 		= off_rngalloc ? off_rngalloc->second : true;
+	      bool has_gangprivate_write
+		= bitmap_bit_p (writes_gangprivate, block->index);
 	      worker_single_copy (block, block, &def_escapes_block,
 				  &worker_partitioned_uses, record_type,
-				  offset, !range_allocated);
+				  offset, !range_allocated,
+				  has_gangprivate_write);
 	    }
 	  else
 	    worker_single_simple (block, block, &def_escapes_block);
@@ -1394,10 +1452,12 @@ neuter_worker_single (parallel_g *par, unsigned outer_mask,
 
   if (par->inner)
     neuter_worker_single (par->inner, mask, worker_single, vector_single,
-			  prop_set, partitioned_var_uses, blk_offset_map);
+			  prop_set, partitioned_var_uses, blk_offset_map,
+			  writes_gangprivate);
   if (par->next)
     neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
-			  prop_set, partitioned_var_uses, blk_offset_map);
+			  prop_set, partitioned_var_uses, blk_offset_map,
+			  writes_gangprivate);
 }
 
 
@@ -1595,11 +1655,13 @@ oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo,
 
   hash_set<tree> partitioned_var_uses;
   hash_set<tree> gangprivate_vars;
+  auto_bitmap writes_gangprivate;
 
   find_gangprivate_vars (&gangprivate_vars);
   find_partitioned_var_uses (par, mask, &partitioned_var_uses);
   find_local_vars_to_propagate (par, mask, &partitioned_var_uses,
-				&gangprivate_vars, &prop_set);
+				&gangprivate_vars, writes_gangprivate,
+				&prop_set);
 
   FOR_ALL_BB_FN (bb, cfun)
     {
@@ -1749,7 +1811,8 @@ oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo,
   sbitmap_vector_free (reachable);
 
   neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
-			&partitioned_var_uses, &blk_offset_map);
+			&partitioned_var_uses, &blk_offset_map,
+			writes_gangprivate);
 
   prop_set.release ();


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-05-13 16:15 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-13 16:15 [gcc/devel/omp/gcc-11] openacc: Remove unnecessary barriers (gimple worker partitioning/broadcast) Kwok Yeung

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).