* [gomp4] Worker-single predication
@ 2015-06-01 15:59 Bernd Schmidt
2015-06-02 10:29 ` Thomas Schwinge
` (2 more replies)
0 siblings, 3 replies; 5+ messages in thread
From: Bernd Schmidt @ 2015-06-01 15:59 UTC (permalink / raw)
To: GCC Patches; +Cc: Jakub Jelinek
[-- Attachment #1: Type: text/plain, Size: 572 bytes --]
This extends the previous vector-single support to also handle
worker-level predication. We can't use the shfl insn because workers
will live across multiple warps, so we use a location in memory to
broadcast the branch target.
This also fixes the oversight where basic blocks inside a parallel
region but outside all loops weren't being predicated.
A special case is added for worker-single vector-partitioned; we add a
jump over the entire loop that is taken by the inactive workers and add
no predication inside this loop.
Committed on gomp-4_0-branch.
Bernd
[-- Attachment #2: wpred3.diff --]
[-- Type: text/x-patch, Size: 18832 bytes --]
Index: gcc/ChangeLog.gomp
===================================================================
--- gcc/ChangeLog.gomp (revision 223974)
+++ gcc/ChangeLog.gomp (working copy)
@@ -1,3 +1,29 @@
+2015-06-01 Bernd Schmidt <bernds@codesourcery.com>
+
+ * gimple.h (struct gimple_statement_omp_parallel_layout): Add a
+ broadcast_array field.
+ (gimple_omp_target_broadcast_array,
+ gimple_omp_target_set_broadcast_array): New inline functions.
+ * omp-low.c (struct omp_region): Add a broadcast_array field.
+ (find_omp_target_region_data): Initialize it. Change STMT arg to
+ be a gomp_target *. All callers changed.
+ (struct omp_context): Add worker_sync_elt field.
+ (alloc_var_ganglocal): Properly handle a NULL underlying_var.
+ (oacc_alloc_broadcast_storage): Likewise.
+ (required_predication_mask): New function.
+ (requires_vector_predicate): Remove. All callers changed to use
+ required_predication_mask.
+ (generate_vector_broadcast): Return a statement suitable as a
+ block splitting point.
+ (generate_oacc_broadcast, make_predication_test): New static
+ functions.
+ (predicate_bb): New arg MASK. All callers changed. Use
+ generate_oacc_broadcast and use the split point it returns.
+ Handle WSVP regions by jumping across entire loops. Use
+ make_predication_test. Correctly handle GIMPLE_OMP_RETURN.
+ (lower_omp_target): Call oacc_alloc_broadcast_storage. Call
+ gimple_omp_target_set_broadcast_array on the stmt.
+
2015-06-01 Tom de Vries <tom@codesourcery.com>
* omp-low.c (expand_omp_target): Fix GIMPLE_OMP_ENTRY_END handling for
Index: gcc/gimple.h
===================================================================
--- gcc/gimple.h (revision 223974)
+++ gcc/gimple.h (working copy)
@@ -580,6 +580,10 @@ struct GTY((tag("GSS_OMP_PARALLEL_LAYOUT
/* [ WORD 11 ]
Size of the gang-local memory to allocate. */
tree ganglocal_size;
+
+ /* [ WORD 12 ]
+ A pointer to the array to be used for broadcasting across threads. */
+ tree broadcast_array;
};
/* GIMPLE_OMP_PARALLEL or GIMPLE_TASK */
@@ -5243,6 +5247,25 @@ gimple_omp_target_set_ganglocal_size (go
}
+/* Return the pointer to the broadcast array associated with OMP_TARGET GS. */
+
+static inline tree
+gimple_omp_target_broadcast_array (const gomp_target *omp_target_stmt)
+{
+ return omp_target_stmt->broadcast_array;
+}
+
+
+/* Set PTR to be the broadcast array associated with OMP_TARGET
+ GS. */
+
+static inline void
+gimple_omp_target_set_broadcast_array (gomp_target *omp_target_stmt, tree ptr)
+{
+ omp_target_stmt->broadcast_array = ptr;
+}
+
+
/* Return the clauses associated with OMP_TEAMS GS. */
static inline tree
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c (revision 223974)
+++ gcc/omp-low.c (working copy)
@@ -162,6 +162,8 @@ struct omp_region
/* For an OpenACC loop, the level of parallelism requested. */
int gwv_this;
+
+ tree broadcast_array;
};
/* Levels of parallelism as defined by OpenACC. Increasing numbers
@@ -248,6 +250,11 @@ typedef struct omp_context
of workers. */
tree worker_var;
tree worker_count;
+
+ /* For offloaded regions, at runtime this variable holds a pointer
+ to the location that should be used for thread
+ synchronization. */
+ tree worker_sync_elt;
} omp_context;
/* A structure holding the elements of:
@@ -1490,7 +1497,7 @@ alloc_var_ganglocal (tree underlying_var
/* If this is a pointer mapping, then we need to create too mappings, one
for the pointer, and another to the data. Add the offset for the pointer
to size. */
- bool pointer = is_reference (underlying_var);
+ bool pointer = underlying_var ? is_reference (underlying_var) : false;
if (pointer)
size = fold_build2 (PLUS_EXPR, TREE_TYPE (size), size,
@@ -9394,6 +9401,25 @@ expand_omp_atomic (struct omp_region *re
expand_omp_atomic_mutex (load_bb, store_bb, addr, loaded_val, stored_val);
}
+/* Allocate storage for OpenACC worker threads in CTX to broadcast
+ condition results. CLAUSES are the clauses of the parallel construct. */
+
+static void
+oacc_alloc_broadcast_storage (omp_context *ctx, tree clauses)
+{
+ tree vull_type_node = build_qualified_type (long_long_unsigned_type_node,
+ TYPE_QUAL_VOLATILE);
+ tree uptr_node = build_pointer_type (vull_type_node);
+
+ tree clause = find_omp_clause (clauses, OMP_CLAUSE_NUM_WORKERS);
+ tree host_count = integer_one_node;
+ if (clause)
+ host_count = OMP_CLAUSE_NUM_WORKERS_EXPR (clause);
+
+ ctx->worker_sync_elt
+ = alloc_var_ganglocal (NULL_TREE, long_long_unsigned_type_node,
+ ctx, TYPE_SIZE_UNIT (long_long_unsigned_type_node));
+}
/* Expand the GIMPLE_OMP_TARGET starting at REGION. */
@@ -10080,7 +10106,8 @@ find_omp_for_region_data (struct omp_reg
OMP_TARGET STMT. */
static void
-find_omp_target_region_data (struct omp_region *region, gimple stmt)
+find_omp_target_region_data (struct omp_region *region,
+ gomp_target *stmt)
{
if (!is_gimple_omp_oacc (stmt))
return;
@@ -10092,6 +10119,7 @@ find_omp_target_region_data (struct omp_
region->gwv_this |= MASK_WORKER;
if (find_omp_clause (clauses, OMP_CLAUSE_VECTOR_LENGTH))
region->gwv_this |= MASK_VECTOR;
+ region->broadcast_array = gimple_omp_target_broadcast_array (stmt);
}
/* Helper for build_omp_regions. Scan the dominator tree starting at
@@ -10163,7 +10191,8 @@ build_omp_regions_1 (basic_block bb, str
case GF_OMP_TARGET_KIND_OACC_PARALLEL:
case GF_OMP_TARGET_KIND_OACC_KERNELS:
case GF_OMP_TARGET_KIND_OACC_DATA:
- find_omp_target_region_data (region, stmt);
+ find_omp_target_region_data (region,
+ as_a <gomp_target *> (stmt));
break;
case GF_OMP_TARGET_KIND_UPDATE:
case GF_OMP_TARGET_KIND_OACC_UPDATE:
@@ -10244,34 +10273,54 @@ enclosing_target_region (omp_region *reg
return region;
}
-/* Return true if basic blocks in REGION require OpenACC vector
- predication. */
-static bool
-requires_vector_predicate (struct omp_region *region)
+/* Return a mask of GWV_ values indicating the kind of OpenACC
+ predication required for basic blocks in REGION. */
+
+static int
+required_predication_mask (omp_region *region)
{
while (region
&& region->type != GIMPLE_OMP_FOR && region->type != GIMPLE_OMP_TARGET)
region = region->outer;
if (!region)
- return false;
- omp_region *outer_target = enclosing_target_region (region);
- if (!outer_target || (outer_target->gwv_this & MASK_VECTOR) == 0)
- return false;
- if (region->type == GIMPLE_OMP_FOR && (region->gwv_this & MASK_VECTOR) == 0)
- return true;
- return false;
+ return 0;
+
+ int outer_masks = region->gwv_this;
+ omp_region *outer_target = region;
+ while (outer_target != NULL && outer_target->type != GIMPLE_OMP_TARGET)
+ {
+ if (outer_target->type == GIMPLE_OMP_FOR)
+ outer_masks |= outer_target->gwv_this;
+ outer_target = outer_target->outer;
+ }
+ if (!outer_target)
+ return 0;
+
+ int mask = 0;
+ if ((outer_target->gwv_this & MASK_WORKER) != 0
+ && (region->type == GIMPLE_OMP_TARGET
+ || (outer_masks & MASK_WORKER) == 0))
+ mask |= MASK_WORKER;
+ if ((outer_target->gwv_this & MASK_VECTOR) != 0
+ && (region->type == GIMPLE_OMP_TARGET
+ || (outer_masks & MASK_VECTOR) == 0))
+ mask |= MASK_VECTOR;
+ return mask;
}
/* Generate a broadcast across OpenACC vector threads (a warp on GPUs)
- so that VAR is broadcast to DEST_VAR. The new statements are
- added after WHERE. */
-static void
+ so that VAR is broadcast to DEST_VAR. The new statements are added
+ after WHERE. Return the stmt after which the block should be split. */
+
+static gimple
generate_vector_broadcast (tree dest_var, tree var,
gimple_stmt_iterator &where)
{
+ gimple retval = gsi_stmt (where);
tree vartype = TREE_TYPE (var);
tree call_arg_type = unsigned_type_node;
enum built_in_function fn = BUILT_IN_GOACC_THREAD_BROADCAST;
+
if (TYPE_PRECISION (vartype) > TYPE_PRECISION (call_arg_type))
{
fn = BUILT_IN_GOACC_THREAD_BROADCAST_LL;
@@ -10299,16 +10348,106 @@ generate_vector_broadcast (tree dest_var
gsi_insert_after (&where, conv2, GSI_CONTINUE_LINKING);
}
gimple_call_set_lhs (call, casted_dest);
+ return retval;
+}
+
+/* Generate a broadcast across OpenACC threads in REGION so that VAR
+ is broadcast to DEST_VAR. MASK specifies the parallelism level and
+ thereby the broadcast method. If it is equal to MASK_VECTOR, we
+ can use a warp broadcast, otherwise we fall back to memory
+ store/load. */
+
+static gimple
+generate_oacc_broadcast (omp_region *region, tree dest_var, tree var,
+ gimple_stmt_iterator &where, int mask)
+{
+ if (mask == MASK_VECTOR)
+ return generate_vector_broadcast (dest_var, var, where);
+
+ omp_region *parent = enclosing_target_region (region);
+
+ tree elttype = build_qualified_type (TREE_TYPE (var), TYPE_QUAL_VOLATILE);
+ tree ptr = create_tmp_var (build_pointer_type (elttype));
+ gassign *cast1 = gimple_build_assign (ptr, NOP_EXPR,
+ parent->broadcast_array);
+ gsi_insert_after (&where, cast1, GSI_NEW_STMT);
+ gassign *st = gimple_build_assign (build_simple_mem_ref (ptr), var);
+ gsi_insert_after (&where, st, GSI_NEW_STMT);
+ tree fndecl = builtin_decl_explicit (BUILT_IN_GOACC_THREADBARRIER);
+ gcall *sync_bar = gimple_build_call (fndecl, 0);
+ gsi_insert_after (&where, sync_bar, GSI_NEW_STMT);
+
+ gassign *cast2 = gimple_build_assign (ptr, NOP_EXPR,
+ parent->broadcast_array);
+ gsi_insert_after (&where, cast2, GSI_NEW_STMT);
+ gassign *ld = gimple_build_assign (dest_var, build_simple_mem_ref (ptr));
+ gsi_insert_after (&where, ld, GSI_NEW_STMT);
+
+ return st;
}
-/* Apply OpenACC vector predication to basic block BB which is in
- region PARENT. */
+/* Build a test for OpenACC predication. TRUE_EDGE is the edge that should be
+ taken if the block should be executed. SKIP_DEST_BB is the destination to
+ jump to otherwise. MASK specifies the type of predication, it can contain
+ the bits MASK_VECTOR and/or MASK_WORKER. */
static void
-predicate_bb (basic_block bb, struct omp_region *parent)
+make_predication_test (edge true_edge, basic_block skip_dest_bb, int mask)
{
- if (!requires_vector_predicate (parent))
+ basic_block cond_bb = true_edge->src;
+
+ gimple_stmt_iterator tmp_gsi = gsi_last_bb (cond_bb);
+ tree decl = builtin_decl_explicit (BUILT_IN_GOACC_TID);
+
+ tree vvar = NULL_TREE, wvar = NULL_TREE;
+ tree comp_var = NULL_TREE;
+ if (mask & MASK_VECTOR)
+ {
+ gimple call = gimple_build_call (decl, 1, integer_zero_node);
+ vvar = create_tmp_var (unsigned_type_node);
+ comp_var = vvar;
+ gimple_call_set_lhs (call, vvar);
+ gsi_insert_after (&tmp_gsi, call, GSI_NEW_STMT);
+ }
+ if (mask & MASK_WORKER)
+ {
+ gimple call = gimple_build_call (decl, 1, integer_one_node);
+ wvar = create_tmp_var (unsigned_type_node);
+ comp_var = wvar;
+ gimple_call_set_lhs (call, wvar);
+ gsi_insert_after (&tmp_gsi, call, GSI_NEW_STMT);
+ }
+ if (wvar && vvar)
+ {
+ comp_var = create_tmp_var (unsigned_type_node);
+ gassign *ior = gimple_build_assign (comp_var, BIT_IOR_EXPR, wvar, vvar);
+ gsi_insert_after (&tmp_gsi, ior, GSI_NEW_STMT);
+ }
+ tree cond = build2 (EQ_EXPR, boolean_type_node, comp_var,
+ fold_convert (unsigned_type_node, integer_zero_node));
+ gimple cond_stmt = gimple_build_cond_empty (cond);
+ gsi_insert_after (&tmp_gsi, cond_stmt, GSI_NEW_STMT);
+
+ true_edge->flags = EDGE_TRUE_VALUE;
+ make_edge (cond_bb, skip_dest_bb, EDGE_FALSE_VALUE);
+}
+
+/* Apply OpenACC predication to basic block BB which is in
+ region PARENT. MASK has a bitmask of levels that need to be
+ applied; MASK_VECTOR and/or MASK_WORKER may be set. */
+
+static void
+predicate_bb (basic_block bb, struct omp_region *parent, int mask)
+{
+ /* We handle worker-single vector-partitioned loops by jumping
+ around them if not in the controlling worker. Don't insert
+ unnecessary (and incorrect) predication. */
+ if (parent->type == GIMPLE_OMP_FOR
+ && (parent->gwv_this & MASK_VECTOR))
+ mask &= ~MASK_WORKER;
+
+ if (mask == 0)
return;
gimple_stmt_iterator gsi;
@@ -10336,9 +10475,11 @@ predicate_bb (basic_block bb, struct omp
gsi_insert_before (&gsi, asgn, GSI_CONTINUE_LINKING);
gimple_stmt_iterator gsi_asgn = gsi_for_stmt (asgn);
- generate_vector_broadcast (broadcast_cond, cond_var, gsi_asgn);
+ gimple splitpoint = generate_oacc_broadcast (parent, broadcast_cond,
+ cond_var, gsi_asgn,
+ mask);
- edge e = split_block (bb, asgn);
+ edge e = split_block (bb, splitpoint);
skip_dest_bb = e->dest;
gimple_cond_set_condition (as_a <gcond *> (stmt), EQ_EXPR,
@@ -10354,9 +10495,10 @@ predicate_bb (basic_block bb, struct omp
gsi_insert_before (&gsi, asgn, GSI_CONTINUE_LINKING);
gimple_stmt_iterator gsi_asgn = gsi_for_stmt (asgn);
- generate_vector_broadcast (new_var, var, gsi_asgn);
+ gimple splitpoint = generate_oacc_broadcast (parent, new_var, var,
+ gsi_asgn, mask);
- edge e = split_block (bb, asgn);
+ edge e = split_block (bb, splitpoint);
skip_dest_bb = e->dest;
gimple_switch_set_index (sstmt, new_var);
@@ -10364,21 +10506,58 @@ predicate_bb (basic_block bb, struct omp
else if (is_gimple_omp (stmt))
{
gsi_prev (&gsi);
+ gimple split_stmt = gsi_stmt (gsi);
+
+ /* First, see if we must predicate away an entire loop. */
+ if (gimple_code (stmt) == GIMPLE_OMP_FOR)
+ {
+ omp_region *inner;
+ inner = *bb_region_map->get (FALLTHRU_EDGE (bb)->dest);
+ skip_dest_bb = single_succ (inner->exit);
+ gcc_assert (inner->entry == bb);
+ if ((inner->gwv_this & (MASK_VECTOR | MASK_WORKER)) == MASK_VECTOR
+ && (mask & MASK_WORKER) != 0)
+ {
+ gimple_stmt_iterator head_gsi = gsi_start_bb (bb);
+ gsi_prev (&head_gsi);
+ edge e0 = split_block (bb, gsi_stmt (head_gsi));
+
+ if (!split_stmt)
+ {
+ /* The simple case: nothing here except the for,
+ so we just need to make one branch around the
+ entire loop. */
+ inner->entry = e0->dest;
+ make_predication_test (e0, skip_dest_bb,
+ mask & ~MASK_VECTOR);
+ return;
+ }
+ basic_block for_block = e0->dest;
+ /* The general case, make two conditions - a full one around the
+ code preceding the for, and one branch around the loop. */
+ edge e1 = split_block (for_block, split_stmt);
+ basic_block bb3 = e1->dest;
+ edge e2 = split_block (for_block, split_stmt);
+ basic_block bb2 = e2->dest;
+
+ make_predication_test (e0, bb2, mask);
+ make_predication_test (single_pred_edge (bb3), skip_dest_bb,
+ mask & ~MASK_VECTOR);
+ inner->entry = bb3;
+ return;
+ }
+ }
+
/* Only a few statements need special treatment. */
if (gimple_code (stmt) != GIMPLE_OMP_FOR
- && gimple_code (stmt) != GIMPLE_OMP_CONTINUE)
+ && gimple_code (stmt) != GIMPLE_OMP_CONTINUE
+ && gimple_code (stmt) != GIMPLE_OMP_RETURN)
{
edge e = single_succ_edge (bb);
skip_dest_bb = e->dest;
- if (gimple_code (stmt) == GIMPLE_OMP_RETURN)
- {
- gcc_assert (parent->exit == bb);
- adjust_bb_ptr = &parent->exit;
- }
}
else
{
- gimple split_stmt = gsi_stmt (gsi);
if (!split_stmt)
return;
edge e = split_block (bb, split_stmt);
@@ -10388,6 +10567,11 @@ predicate_bb (basic_block bb, struct omp
gcc_assert (parent->cont == bb);
parent->cont = skip_dest_bb;
}
+ else if (gimple_code (stmt) == GIMPLE_OMP_RETURN)
+ {
+ gcc_assert (parent->exit == bb);
+ parent->exit = skip_dest_bb;
+ }
else if (gimple_code (stmt) == GIMPLE_OMP_FOR)
{
omp_region *inner;
@@ -10412,37 +10596,18 @@ predicate_bb (basic_block bb, struct omp
gimple_stmt_iterator head_gsi = gsi_start_bb (bb);
gsi_prev (&head_gsi);
edge e2 = split_block (bb, gsi_stmt (head_gsi));
- basic_block cond_bb = e2->src;
-
- if (adjust_bb_ptr)
- *adjust_bb_ptr = e2->dest;
-
- gimple_stmt_iterator tmp_gsi = gsi_last_bb (cond_bb);
-
- tree decl = builtin_decl_explicit (BUILT_IN_GOACC_TID);
- gimple call = gimple_build_call (decl, 1, integer_zero_node);
- tree tmp_var = create_tmp_var (unsigned_type_node);
- gimple_call_set_lhs (call, tmp_var);
-
- gsi_insert_after (&tmp_gsi, call, GSI_NEW_STMT);
-
- tree cond = build2 (EQ_EXPR, boolean_type_node, tmp_var,
- fold_convert (unsigned_type_node, integer_zero_node));
- gimple cond_stmt = gimple_build_cond_empty (cond);
- gsi_insert_after (&tmp_gsi, cond_stmt, GSI_CONTINUE_LINKING);
-
- e2->flags = EDGE_TRUE_VALUE;
- make_edge (cond_bb, skip_dest_bb, EDGE_FALSE_VALUE);
+ make_predication_test (e2, skip_dest_bb, mask);
}
}
/* Walk the dominator tree starting at BB to collect basic blocks in
WORKLIST which need OpenACC vector predication applied to them. */
+
static void
find_predicatable_bbs (basic_block bb, vec<basic_block> &worklist)
{
struct omp_region *parent = *bb_region_map->get (bb);
- if (requires_vector_predicate (parent))
+ if (required_predication_mask (parent) != 0)
worklist.safe_push (bb);
basic_block son;
for (son = first_dom_son (CDI_DOMINATORS, bb);
@@ -10453,6 +10618,7 @@ find_predicatable_bbs (basic_block bb, v
/* Apply OpenACC vector predication to all basic blocks. HEAD_BB is the
first. */
+
static void
predicate_omp_regions (basic_block head_bb)
{
@@ -10461,7 +10627,11 @@ predicate_omp_regions (basic_block head_
int i;
basic_block bb;
FOR_EACH_VEC_ELT (worklist, i, bb)
- predicate_bb (bb, *bb_region_map->get (bb));
+ {
+ omp_region *region = *bb_region_map->get (bb);
+ int mask = required_predication_mask (region);
+ predicate_bb (bb, region, mask);
+ }
}
/* Main entry point for expanding OMP-GIMPLE into runtime calls. */
@@ -12503,7 +12673,10 @@ lower_omp_target (gimple_stmt_iterator *
orlist = NULL;
if (is_gimple_omp_oacc (stmt))
- oacc_init_count_vars (ctx, clauses);
+ {
+ oacc_init_count_vars (ctx, clauses);
+ oacc_alloc_broadcast_storage (ctx, clauses);
+ }
if (has_reduction)
{
@@ -12790,7 +12963,7 @@ lower_omp_target (gimple_stmt_iterator *
gsi_insert_seq_before (gsi_p, sz_ilist, GSI_SAME_STMT);
gimple_omp_target_set_ganglocal_size (stmt, sz);
-
+ gimple_omp_target_set_broadcast_array (stmt, ctx->worker_sync_elt);
pop_gimplify_context (NULL);
}
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [gomp4] Worker-single predication
2015-06-01 15:59 [gomp4] Worker-single predication Bernd Schmidt
@ 2015-06-02 10:29 ` Thomas Schwinge
2015-06-02 11:07 ` Thomas Schwinge
2015-06-02 18:03 ` Cesar Philippidis
2 siblings, 0 replies; 5+ messages in thread
From: Thomas Schwinge @ 2015-06-02 10:29 UTC (permalink / raw)
To: Bernd Schmidt; +Cc: Jakub Jelinek, GCC Patches
[-- Attachment #1: Type: text/plain, Size: 4150 bytes --]
Hi Bernd!
On Mon, 1 Jun 2015 17:58:51 +0200, Bernd Schmidt <bernds@codesourcery.com> wrote:
> This extends the previous vector-single support to also handle
> worker-level predication.
Thanks!
> --- gcc/omp-low.c (revision 223974)
> +++ gcc/omp-low.c (working copy)
> +/* Allocate storage for OpenACC worker threads in CTX to broadcast
> + condition results. CLAUSES are the clauses of the parallel construct. */
> +
> +static void
> +oacc_alloc_broadcast_storage (omp_context *ctx, tree clauses)
> +{
> + tree vull_type_node = build_qualified_type (long_long_unsigned_type_node,
> + TYPE_QUAL_VOLATILE);
> + tree uptr_node = build_pointer_type (vull_type_node);
> +
> + tree clause = find_omp_clause (clauses, OMP_CLAUSE_NUM_WORKERS);
> + tree host_count = integer_one_node;
> + if (clause)
> + host_count = OMP_CLAUSE_NUM_WORKERS_EXPR (clause);
> +
> + ctx->worker_sync_elt
> + = alloc_var_ganglocal (NULL_TREE, long_long_unsigned_type_node,
> + ctx, TYPE_SIZE_UNIT (long_long_unsigned_type_node));
> +}
> @@ -12503,7 +12673,10 @@ lower_omp_target (gimple_stmt_iterator *
> orlist = NULL;
>
> if (is_gimple_omp_oacc (stmt))
> - oacc_init_count_vars (ctx, clauses);
> + {
> + oacc_init_count_vars (ctx, clauses);
> + oacc_alloc_broadcast_storage (ctx, clauses);
> + }
>
> if (has_reduction)
> {
A few warnings/errors resulting in bootstrap failures. Not yet committed
-- probably you meant to do something with host_count?
commit f0a9e05f8b16436767e4f899580b8f3e753d228f
Author: Thomas Schwinge <thomas@codesourcery.com>
Date: Tue Jun 2 12:07:35 2015 +0200
Resolve bootstrap failures
... introduced in r223989.
[...]/source-gcc/gcc/omp-low.c: In function 'void oacc_alloc_broadcast_storage(omp_context*, tree)':
[...]/source-gcc/gcc/omp-low.c:9412:8: error: unused variable 'uptr_node' [-Werror=unused-variable]
tree uptr_node = build_pointer_type (vull_type_node);
^
[...]/source-gcc/gcc/omp-low.c:9415:8: error: variable 'host_count' set but not used [-Werror=unused-but-set-variable]
tree host_count = integer_one_node;
^
[...]/source-gcc/gcc/omp-low.c: In function 'void predicate_bb(basic_block, omp_region*, int)':
[...]/source-gcc/gcc/omp-low.c:10462:16: error: unused variable 'adjust_bb_ptr' [-Werror=unused-variable]
basic_block *adjust_bb_ptr = NULL;
^
---
gcc/omp-low.c | 14 ++------------
1 file changed, 2 insertions(+), 12 deletions(-)
diff --git gcc/omp-low.c gcc/omp-low.c
index 01e5d4b..ace9e24 100644
--- gcc/omp-low.c
+++ gcc/omp-low.c
@@ -9405,17 +9405,8 @@ expand_omp_atomic (struct omp_region *region)
condition results. CLAUSES are the clauses of the parallel construct. */
static void
-oacc_alloc_broadcast_storage (omp_context *ctx, tree clauses)
+oacc_alloc_broadcast_storage (omp_context *ctx)
{
- tree vull_type_node = build_qualified_type (long_long_unsigned_type_node,
- TYPE_QUAL_VOLATILE);
- tree uptr_node = build_pointer_type (vull_type_node);
-
- tree clause = find_omp_clause (clauses, OMP_CLAUSE_NUM_WORKERS);
- tree host_count = integer_one_node;
- if (clause)
- host_count = OMP_CLAUSE_NUM_WORKERS_EXPR (clause);
-
ctx->worker_sync_elt
= alloc_var_ganglocal (NULL_TREE, long_long_unsigned_type_node,
ctx, TYPE_SIZE_UNIT (long_long_unsigned_type_node));
@@ -10459,7 +10450,6 @@ predicate_bb (basic_block bb, struct omp_region *parent, int mask)
return;
basic_block skip_dest_bb = NULL;
- basic_block *adjust_bb_ptr = NULL;
if (gimple_code (stmt) == GIMPLE_OMP_ENTRY_END)
return;
@@ -12675,7 +12665,7 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
if (is_gimple_omp_oacc (stmt))
{
oacc_init_count_vars (ctx, clauses);
- oacc_alloc_broadcast_storage (ctx, clauses);
+ oacc_alloc_broadcast_storage (ctx);
}
if (has_reduction)
Grüße,
Thomas
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 472 bytes --]
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [gomp4] Worker-single predication
2015-06-01 15:59 [gomp4] Worker-single predication Bernd Schmidt
2015-06-02 10:29 ` Thomas Schwinge
@ 2015-06-02 11:07 ` Thomas Schwinge
2015-06-03 11:20 ` Bernd Schmidt
2015-06-02 18:03 ` Cesar Philippidis
2 siblings, 1 reply; 5+ messages in thread
From: Thomas Schwinge @ 2015-06-02 11:07 UTC (permalink / raw)
To: Bernd Schmidt; +Cc: Jakub Jelinek, GCC Patches
[-- Attachment #1: Type: text/plain, Size: 3477 bytes --]
Hi Bernd!
On Mon, 1 Jun 2015 17:58:51 +0200, Bernd Schmidt <bernds@codesourcery.com> wrote:
> This extends the previous vector-single support to also handle
> worker-level predication. [...]
This causes the following regressions; would you please have a look?
[-PASS:-]{+FAIL: g++.dg/goacc/template.C -std=c++11 (internal compiler error)+}
{+FAIL:+} g++.dg/goacc/template.C -std=c++11 (test for excess errors)
[-PASS:-]{+FAIL: g++.dg/goacc/template.C -std=c++14 (internal compiler error)+}
{+FAIL:+} g++.dg/goacc/template.C -std=c++14 (test for excess errors)
[-PASS:-]{+FAIL: g++.dg/goacc/template.C -std=c++98 (internal compiler error)+}
{+FAIL:+} g++.dg/goacc/template.C -std=c++98 (test for excess errors)
spawn [...]/build-gcc/gcc/testsuite/g++3/../../xg++ -B[...]/build-gcc/gcc/testsuite/g++3/../../ [...]/source-gcc/gcc/testsuite/g++.dg/goacc/template.C -fno-diagnostics-show-caret -fdiagnostics-color=never -nostdinc++ -I[...]/build-gcc/x86_64-unknown-linux-gnu/libstdc++-v3/include/x86_64-unknown-linux-gnu -I[...]/build-gcc/x86_64-unknown-linux-gnu/libstdc++-v3/include -I[...]/source-gcc/libstdc++-v3/libsupc++ -I[...]/source-gcc/libstdc++-v3/include/backward -I[...]/source-gcc/libstdc++-v3/testsuite/util -fmessage-length=0 -std=c++98 -fopenacc -S -o template.s
[...]/source-gcc/gcc/testsuite/g++.dg/goacc/template.C: In function 'T oacc_parallel_copy(T) [with T = int]':
[...]/source-gcc/gcc/testsuite/g++.dg/goacc/template.C:90:10: internal compiler error: in as_a, at is-a.h:192
0xbb8a60 as_a<gomp_atomic_load*, gimple_statement_base>
[...]/source-gcc/gcc/is-a.h:192
0xbb8a60 expand_omp_atomic
[...]/source-gcc/gcc/omp-low.c:9349
0xbb8a60 expand_omp
[...]/source-gcc/gcc/omp-low.c:10068
0xbb6d98 expand_omp
[...]/source-gcc/gcc/omp-low.c:10029
0xbbddd7 execute_expand_omp
[...]/source-gcc/gcc/omp-low.c:10659
PASS: gfortran.dg/goacc/parallel-tree.f95 -O scan-tree-dump-times original "
PASS: gfortran.dg/goacc/parallel-tree.f95 -O scan-tree-dump-times original "private\\(v\\)" 1
PASS: gfortran.dg/goacc/parallel-tree.f95 -O scan-tree-dump-times original "reduction\\(max:q\\)" 1
PASS: gfortran.dg/goacc/parallel-tree.f95 -O scan-tree-dump-times original "vector_length" 1
[-PASS:-]{+FAIL: gfortran.dg/goacc/parallel-tree.f95 -O (internal compiler error)+}
{+FAIL:+} gfortran.dg/goacc/parallel-tree.f95 -O (test for excess errors)
spawn [...]/build-gcc/gcc/testsuite/gfortran4/../../gfortran -B[...]/build-gcc/gcc/testsuite/gfortran4/../../ -B[...]/build-gcc/x86_64-unknown-linux-gnu/./libgfortran/ [...]/source-gcc/gcc/testsuite/gfortran.dg/goacc/parallel-tree.f95 -fno-diagnostics-show-caret -fdiagnostics-color=never -O -fopenacc -fdump-tree-original -S -o parallel-tree.s
[...]/source-gcc/gcc/testsuite/gfortran.dg/goacc/parallel-tree.f95:14:0: internal compiler error: in as_a, at is-a.h:192
0xa57220 as_a<gomp_atomic_load*, gimple_statement_base>
[...]/source-gcc/gcc/is-a.h:192
0xa57220 expand_omp_atomic
[...]/source-gcc/gcc/omp-low.c:9349
0xa57220 expand_omp
[...]/source-gcc/gcc/omp-low.c:10068
0xa55558 expand_omp
[...]/source-gcc/gcc/omp-low.c:10029
0xa5c597 execute_expand_omp
[...]/source-gcc/gcc/omp-low.c:10659
Grüße,
Thomas
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 472 bytes --]
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [gomp4] Worker-single predication
2015-06-01 15:59 [gomp4] Worker-single predication Bernd Schmidt
2015-06-02 10:29 ` Thomas Schwinge
2015-06-02 11:07 ` Thomas Schwinge
@ 2015-06-02 18:03 ` Cesar Philippidis
2 siblings, 0 replies; 5+ messages in thread
From: Cesar Philippidis @ 2015-06-02 18:03 UTC (permalink / raw)
To: Bernd Schmidt, GCC Patches; +Cc: Jakub Jelinek
On 06/01/2015 08:58 AM, Bernd Schmidt wrote:
> This extends the previous vector-single support to also handle
> worker-level predication. We can't use the shfl insn because workers
> will live across multiple warps, so we use a location in memory to
> broadcast the branch target.
> This also fixes the oversight where basic blocks inside a parallel
> region but outside all loops weren't being predicated.
>
> A special case is added for worker-single vector-partitioned; we add a
> jump over the entire loop that is taken by the inactive workers and add
> no predication inside this loop.
>
> Committed on gomp-4_0-branch.
Thanks. This fixed the problems that I was seeing with variables outside
of acc loops.
I see that calls are being predicated at the moment. Those will need
special handling once we tackle acc routines.
Cesar
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [gomp4] Worker-single predication
2015-06-02 11:07 ` Thomas Schwinge
@ 2015-06-03 11:20 ` Bernd Schmidt
0 siblings, 0 replies; 5+ messages in thread
From: Bernd Schmidt @ 2015-06-03 11:20 UTC (permalink / raw)
To: Thomas Schwinge; +Cc: Jakub Jelinek, GCC Patches
[-- Attachment #1: Type: text/plain, Size: 490 bytes --]
On 06/02/2015 01:06 PM, Thomas Schwinge wrote:
> On Mon, 1 Jun 2015 17:58:51 +0200, Bernd Schmidt <bernds@codesourcery.com> wrote:
>> This extends the previous vector-single support to also handle
>> worker-level predication. [...]
>
> This causes the following regressions; would you please have a look?
I committed the following to take care of some problems with the patch.
The main change is to handle atomic regions similarly to WSVP loops -
just jump over them entirely.
Bernd
[-- Attachment #2: wpfixups.diff --]
[-- Type: text/x-patch, Size: 5173 bytes --]
Index: gcc/ChangeLog.gomp
===================================================================
--- gcc/ChangeLog.gomp (revision 224072)
+++ gcc/ChangeLog.gomp (working copy)
@@ -1,3 +1,10 @@
+2015-06-02 Bernd Schmidt <bernds@codesourcery.com>
+
+ * omp-low.c (oacc_alloc_broadcast_storage): Remove unused parameter
+ CLAUSES and unnecessary code. All callers changed.
+ (expand_omp_target): Only look for entry-end statement if offloaded.
+ (predicate_bb): Extend code to also jump around atomic regions.
+
2015-06-01 Tom de Vries <tom@codesourcery.com>
Revert:
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c (revision 224072)
+++ gcc/omp-low.c (working copy)
@@ -9402,23 +9402,17 @@ expand_omp_atomic (struct omp_region *re
}
/* Allocate storage for OpenACC worker threads in CTX to broadcast
- condition results. CLAUSES are the clauses of the parallel construct. */
+ condition results. */
static void
-oacc_alloc_broadcast_storage (omp_context *ctx, tree clauses)
+oacc_alloc_broadcast_storage (omp_context *ctx)
{
tree vull_type_node = build_qualified_type (long_long_unsigned_type_node,
- TYPE_QUAL_VOLATILE);
- tree uptr_node = build_pointer_type (vull_type_node);
-
- tree clause = find_omp_clause (clauses, OMP_CLAUSE_NUM_WORKERS);
- tree host_count = integer_one_node;
- if (clause)
- host_count = OMP_CLAUSE_NUM_WORKERS_EXPR (clause);
+ TYPE_QUAL_VOLATILE);
ctx->worker_sync_elt
- = alloc_var_ganglocal (NULL_TREE, long_long_unsigned_type_node,
- ctx, TYPE_SIZE_UNIT (long_long_unsigned_type_node));
+ = alloc_var_ganglocal (NULL_TREE, vull_type_node, ctx,
+ TYPE_SIZE_UNIT (vull_type_node));
}
/* Expand the GIMPLE_OMP_TARGET starting at REGION. */
@@ -9512,7 +9506,7 @@ expand_omp_target (struct omp_region *re
}
basic_block entry_succ_bb = single_succ (entry_bb);
- if (!gimple_in_ssa_p (cfun))
+ if (offloaded && !gimple_in_ssa_p (cfun))
{
gsi = gsi_last_bb (entry_succ_bb);
if (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_ENTRY_END)
@@ -10447,7 +10441,7 @@ predicate_bb (basic_block bb, struct omp
&& (parent->gwv_this & MASK_VECTOR))
mask &= ~MASK_WORKER;
- if (mask == 0)
+ if (mask == 0 || parent->type == GIMPLE_OMP_ATOMIC_LOAD)
return;
gimple_stmt_iterator gsi;
@@ -10459,7 +10453,6 @@ predicate_bb (basic_block bb, struct omp
return;
basic_block skip_dest_bb = NULL;
- basic_block *adjust_bb_ptr = NULL;
if (gimple_code (stmt) == GIMPLE_OMP_ENTRY_END)
return;
@@ -10507,29 +10500,33 @@ predicate_bb (basic_block bb, struct omp
{
gsi_prev (&gsi);
gimple split_stmt = gsi_stmt (gsi);
+ enum gimple_code code = gimple_code (stmt);
- /* First, see if we must predicate away an entire loop. */
- if (gimple_code (stmt) == GIMPLE_OMP_FOR)
+ /* First, see if we must predicate away an entire loop or atomic region. */
+ if (code == GIMPLE_OMP_FOR
+ || code == GIMPLE_OMP_ATOMIC_LOAD)
{
omp_region *inner;
inner = *bb_region_map->get (FALLTHRU_EDGE (bb)->dest);
skip_dest_bb = single_succ (inner->exit);
gcc_assert (inner->entry == bb);
- if ((inner->gwv_this & (MASK_VECTOR | MASK_WORKER)) == MASK_VECTOR
- && (mask & MASK_WORKER) != 0)
+ if (code != GIMPLE_OMP_FOR
+ || ((inner->gwv_this & (MASK_VECTOR | MASK_WORKER)) == MASK_VECTOR
+ && (mask & MASK_WORKER) != 0))
{
gimple_stmt_iterator head_gsi = gsi_start_bb (bb);
gsi_prev (&head_gsi);
edge e0 = split_block (bb, gsi_stmt (head_gsi));
-
- if (!split_stmt)
+ int mask2 = mask;
+ if (code == GIMPLE_OMP_FOR)
+ mask2 &= ~MASK_VECTOR;
+ if (!split_stmt || code != GIMPLE_OMP_FOR)
{
/* The simple case: nothing here except the for,
so we just need to make one branch around the
entire loop. */
inner->entry = e0->dest;
- make_predication_test (e0, skip_dest_bb,
- mask & ~MASK_VECTOR);
+ make_predication_test (e0, skip_dest_bb, mask2);
return;
}
basic_block for_block = e0->dest;
@@ -10542,7 +10539,7 @@ predicate_bb (basic_block bb, struct omp
make_predication_test (e0, bb2, mask);
make_predication_test (single_pred_edge (bb3), skip_dest_bb,
- mask & ~MASK_VECTOR);
+ mask2);
inner->entry = bb3;
return;
}
@@ -10550,8 +10547,8 @@ predicate_bb (basic_block bb, struct omp
/* Only a few statements need special treatment. */
if (gimple_code (stmt) != GIMPLE_OMP_FOR
- && gimple_code (stmt) != GIMPLE_OMP_CONTINUE
- && gimple_code (stmt) != GIMPLE_OMP_RETURN)
+ && gimple_code (stmt) != GIMPLE_OMP_CONTINUE
+ && gimple_code (stmt) != GIMPLE_OMP_RETURN)
{
edge e = single_succ_edge (bb);
skip_dest_bb = e->dest;
@@ -12675,7 +12672,7 @@ lower_omp_target (gimple_stmt_iterator *
if (is_gimple_omp_oacc (stmt))
{
oacc_init_count_vars (ctx, clauses);
- oacc_alloc_broadcast_storage (ctx, clauses);
+ oacc_alloc_broadcast_storage (ctx);
}
if (has_reduction)
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2015-06-03 11:17 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-01 15:59 [gomp4] Worker-single predication Bernd Schmidt
2015-06-02 10:29 ` Thomas Schwinge
2015-06-02 11:07 ` Thomas Schwinge
2015-06-03 11:20 ` Bernd Schmidt
2015-06-02 18:03 ` Cesar Philippidis
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).