* [gomp4]
@ 2015-10-15 17:12 Nathan Sidwell
0 siblings, 0 replies; 2+ messages in thread
From: Nathan Sidwell @ 2015-10-15 17:12 UTC (permalink / raw)
To: GCC Patches
[-- Attachment #1: Type: text/plain, Size: 457 bytes --]
I've committed this to gomp4 branch.
It's the next in the series moving partitioning decisions into the target
compiler. This patch moves the updating of the IFN_GOACC_LOOP internal
function's mask and chunking parameters. After reconstructing the OpenACC
loops, we scan the block(s) just after the header marker looking for these
functions, and set the determined partitioning mask and chunking.
The next patch will complete this transition.
nathan
[-- Attachment #2: gomp4-loop-subst.patch --]
[-- Type: text/x-patch, Size: 9264 bytes --]
2015-10-15 Nathan Sidwell <nathan@codesourcery.com>
* omp-low.c (struct oacc_loop): Add chunk_size and head_end
fields.
(extract_omp_for_data): Don't extract OpenACC partitioning or
chunk size here.
(lower_oacc_head_mark): Substitute gang_static size.
(expand_oacc_for): Don't specify parallel region chunking or
partitioning here.
(oacc_xform_loop): Stride a single worker partition. Add
conversions for chunk size.
(new_oacc_loop_raw): Initialize new fields.
(new_oacc_loop): Set chunk_size.
(oacc_loop_walk): Set head_end.
(oacc_loop_xform_loop): New.
(oacc_loop_process): Call it.
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c (revision 228842)
+++ gcc/omp-low.c (working copy)
@@ -255,11 +255,10 @@ struct oacc_loop
tree routine; /* Pseudo-loop enclosing a routine. */
- /* Partitioning mask. */
- unsigned mask;
-
- /* Partitioning flags. */
- unsigned flags;
+ unsigned mask; /* Partitioning mask. */
+ unsigned flags; /* Partitioning flags. */
+ tree chunk_size; /* Chunk size. */
+ gcall *head_end; /* Final marker of head sequence. */
};
/* Flags for an OpenACC loop. */
@@ -791,31 +790,6 @@ extract_omp_for_data (gomp_for *for_stmt
fd->loop.step = build_int_cst (TREE_TYPE (fd->loop.v), 1);
fd->loop.cond_code = LT_EXPR;
}
-
- /* For OpenACC loops, force a chunk size of one, unless a gang loop
- contains a static argument. This avoids the default scheduling where
- several subsequent iterations are being executed by the same thread. */
- if (gimple_omp_for_kind (for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
- {
- gcc_assert (fd->chunk_size == NULL_TREE);
-
- tree gang = find_omp_clause (gimple_omp_for_clauses (for_stmt),
- OMP_CLAUSE_GANG);
- tree chunk_size = NULL_TREE;
-
- if (gang)
- {
- chunk_size = OMP_CLAUSE_GANG_STATIC_EXPR (gang);
-
- /* gang (static:*) is represented by -1. */
- if (chunk_size == integer_minus_one_node)
- chunk_size = NULL_TREE;
- }
- else
- chunk_size = build_int_cst (TREE_TYPE (fd->loop.v), 1);
-
- fd->chunk_size = chunk_size;
- }
}
@@ -4944,11 +4918,15 @@ lower_oacc_head_mark (location_t loc, tr
case OMP_CLAUSE_GANG:
tag |= OLF_DIM_GANG;
gang_static = OMP_CLAUSE_GANG_STATIC_EXPR (c);
+ /* static:* is represented by -1, and we can ignore it, as
+ scheduling is always static. */
+ if (gang_static && integer_minus_onep (gang_static))
+ gang_static = NULL_TREE;
levels++;
break;
case OMP_CLAUSE_WORKER:
- tag |= OLF_DIM_WORKER;
+ tag |= OLF_DIM_WORKER;
levels++;
break;
@@ -4980,7 +4958,11 @@ lower_oacc_head_mark (location_t loc, tr
done:
if (gang_static)
- tag |= OLF_GANG_STATIC;
+ {
+ if (DECL_P (gang_static))
+ gang_static = build_outer_var_ref (gang_static, ctx);
+ tag |= OLF_GANG_STATIC;
+ }
/* In a parallel region, loops are implicitly INDEPENDENT. */
if (is_oacc_parallel (ctx))
@@ -8819,8 +8801,8 @@ expand_oacc_for (struct omp_region *regi
enum tree_code cond_code = fd->loop.cond_code;
enum tree_code plus_code = PLUS_EXPR;
- tree chunk_size = fd->chunk_size;
- tree gwv = build_int_cst (integer_type_node, region->gwv_this);
+ tree chunk_size = integer_one_node;
+ tree gwv = integer_zero_node;
tree iter_type = TREE_TYPE (v);
tree diff_type = iter_type;
tree plus_type = iter_type;
@@ -8873,7 +8855,7 @@ expand_oacc_for (struct omp_region *regi
tree step = create_tmp_var (diff_type, ".step");
bool up = cond_code == LT_EXPR;
tree dir = build_int_cst (diff_type, up ? +1 : -1);
- bool chunking = chunk_size != NULL_TREE;
+ bool chunking = !gimple_in_ssa_p (cfun);;
bool negating;
/* SSA instances. */
@@ -8902,6 +8884,8 @@ expand_oacc_for (struct omp_region *regi
{
offset_init = gimple_omp_for_index (for_stmt, 0);
gcc_assert (integer_zerop (fd->loop.n1));
+ /* The SSA parallelizer does gang parallelism. */
+ gwv = build_int_cst (integer_type_node, GOMP_DIM_MASK (GOMP_DIM_GANG));
}
if (fd->collapse > 1)
@@ -15642,11 +15626,12 @@ oacc_xform_loop (gcall *call)
if (integer_zerop (chunk_size))
{
- /* If we're at the gang or worker level, we want each to execute
- a contiguous run of iterations. Otherwise we want each
- element to stride. */
- striding = !(outer_mask & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
- | GOMP_DIM_MASK (GOMP_DIM_GANG)));
+ /* If we're at the gang or (worker with vector), we want each to
+ execute a contiguous run of iterations. Otherwise we want
+ each element to stride. */
+ striding = !((outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
+ || ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ && (outer_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))));
chunking = false;
}
else
@@ -15671,6 +15656,7 @@ oacc_xform_loop (gcall *call)
= (range - dir) / (chunks * step * num_threads) + dir */
tree per = expand_oacc_get_num_threads (&seq, mask);
per = fold_convert (type, per);
+ chunk_size = fold_convert (type, chunk_size);
per = fold_build2 (MULT_EXPR, type, per, chunk_size);
per = fold_build2 (MULT_EXPR, type, per, step);
r = build2 (MINUS_EXPR, type, range, dir);
@@ -15706,8 +15692,10 @@ oacc_xform_loop (gcall *call)
if (chunking)
{
+ chunk_size = fold_convert (diff_type, chunk_size);
+
span = inner_size;
- span = fold_convert (type, span);
+ span = fold_convert (diff_type, span);
span = fold_build2 (MULT_EXPR, diff_type, span, chunk_size);
}
else
@@ -15754,6 +15742,8 @@ oacc_xform_loop (gcall *call)
if (chunking)
{
+ chunk_size = fold_convert (diff_type, chunk_size);
+
span = expand_oacc_get_num_threads (&seq, inner_mask);
span = fold_convert (diff_type, span);
span = fold_build2 (MULT_EXPR, diff_type, span, chunk_size);
@@ -15899,6 +15889,8 @@ new_oacc_loop_raw (oacc_loop *parent, lo
loop->routine = NULL_TREE;
loop->mask = loop->flags = 0;
+ loop->chunk_size = 0;
+ loop->head_end = NULL;
return loop;
}
@@ -15922,6 +15914,11 @@ new_oacc_loop (oacc_loop *parent, gcall
loop->flags = TREE_INT_CST_LOW (gimple_call_arg (head, 2));
+ tree chunk_size = integer_zero_node;
+ if (loop->flags & OLF_GANG_STATIC)
+ chunk_size = gimple_call_arg (head,3);
+ loop->chunk_size = chunk_size;
+
/* Set the mask from the incoming flags.
TODO: Be smarter and more flexible. */
loop->mask = ((loop->flags >> OLF_DIM_BASE)
@@ -16086,6 +16083,8 @@ oacc_loop_walk (oacc_loop *loop, basic_b
marker = 0;
if (code == IFN_UNIQUE_OACC_TAIL_MARK)
loop = finish_oacc_loop (loop);
+ else
+ loop->head_end = call;
}
else
{
@@ -16113,7 +16112,6 @@ oacc_loop_walk (oacc_loop *loop, basic_b
}
}
}
-
gcc_assert (!remaining && !marker);
/* Walk successor blocks. */
@@ -16202,6 +16200,47 @@ oacc_loop_xform_head_tail (gcall *from,
break2:;
}
+/* Transform the IFN_GOACC_LOOP internal functions by providing the
+ determined partitioning mask and chunking argument. */
+
+static void
+oacc_loop_xform_loop (gcall *end_marker, tree mask_arg, tree chunk_arg)
+{
+ gimple_stmt_iterator gsi = gsi_for_stmt (end_marker);
+
+ for (;;)
+ {
+ for (; !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+
+ if (!is_gimple_call (stmt))
+ continue;
+
+ gcall *call = as_a <gcall *> (stmt);
+
+ if (!gimple_call_internal_p (call))
+ continue;
+
+ if (gimple_call_internal_fn (call) != IFN_GOACC_LOOP)
+ continue;
+
+ *gimple_call_arg_ptr (call, 5) = mask_arg;
+ *gimple_call_arg_ptr (call, 4) = chunk_arg;
+ if (TREE_INT_CST_LOW (gimple_call_arg (call, 0))
+ == IFN_GOACC_LOOP_BOUND)
+ goto break2;
+ }
+
+ /* If we didn't see LOOP_BOUND, it should be in the single
+ successor block. */
+ basic_block bb = single_succ (gsi_bb (gsi));
+ gsi = gsi_start_bb (bb);
+ }
+
+ break2:;
+}
+
/* Process the discovered OpenACC loops, setting the correct
partitioning level etc. */
@@ -16215,19 +16254,26 @@ oacc_loop_process (oacc_loop *loop)
unsigned mask = loop->mask;
unsigned dim = GOMP_DIM_GANG;
- if (mask)
- for (ix = 0; ix != GOMP_DIM_MAX && loop->heads[ix]; ix++)
- {
- gcc_assert (mask);
+ if (mask && !loop->routine)
+ {
+ tree mask_arg = build_int_cst (unsigned_type_node, mask);
+ tree chunk_arg = loop->chunk_size;
- while (!(GOMP_DIM_MASK (dim) & mask))
- dim++;
+ oacc_loop_xform_loop (loop->head_end, mask_arg, chunk_arg);
- oacc_loop_xform_head_tail (loop->heads[ix], dim);
- oacc_loop_xform_head_tail (loop->tails[ix], dim);
+ for (ix = 0; ix != GOMP_DIM_MAX && loop->heads[ix]; ix++)
+ {
+ gcc_assert (mask);
- mask ^= GOMP_DIM_MASK (dim);
- }
+ while (!(GOMP_DIM_MASK (dim) & mask))
+ dim++;
+
+ oacc_loop_xform_head_tail (loop->heads[ix], dim);
+ oacc_loop_xform_head_tail (loop->tails[ix], dim);
+
+ mask ^= GOMP_DIM_MASK (dim);
+ }
+ }
else
gcc_assert (!loop->heads[1] && !loop->tails[1]
&& (loop->routine || !loop->parent
^ permalink raw reply [flat|nested] 2+ messages in thread
* [gomp4]
@ 2015-10-09 14:14 Nathan Sidwell
0 siblings, 0 replies; 2+ messages in thread
From: Nathan Sidwell @ 2015-10-09 14:14 UTC (permalink / raw)
To: GCC Patches
[-- Attachment #1: Type: text/plain, Size: 130 bytes --]
I've applied this to gomp4 branch.
1) ports the break fix in gimple-fold from trunk
2) fixes missing tab in ptx output.
nathan
[-- Attachment #2: clean.patch --]
[-- Type: text/x-patch, Size: 1288 bytes --]
2015-10-09 Nathan Sidwell <nathan@acm.org>
* config/nvptx/nvptx.c (nvptx_init_axis_predicate): Fix output
formatting.
PR 67861
* gimple-fold.c (gimple_fold_builtin): Add break after
BUILT_IN_PRINTF_CHK, BUILT_IN_VPRINTF_CHK folding.
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c (revision 228656)
+++ gcc/config/nvptx/nvptx.c (working copy)
@@ -629,8 +629,7 @@ static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
fprintf (file, "\t{\n");
-
- fprintf (file, "\t.reg.u32\t%%%s;\n", name);
+ fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
fprintf (file, "\t}\n");
Index: gcc/gimple-fold.c
===================================================================
--- gcc/gimple-fold.c (revision 228656)
+++ gcc/gimple-fold.c (working copy)
@@ -2890,6 +2890,7 @@ gimple_fold_builtin (gimple_stmt_iterato
n == 3
? gimple_call_arg (stmt, 2)
: NULL_TREE, fcode);
+ break;
case BUILT_IN_ACC_ON_DEVICE:
return gimple_fold_builtin_acc_on_device (gsi,
gimple_call_arg (stmt, 0));
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2015-10-15 17:12 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-15 17:12 [gomp4] Nathan Sidwell
-- strict thread matches above, loose matches on Subject: below --
2015-10-09 14:14 [gomp4] Nathan Sidwell
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).