public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc/devel/omp/gcc-12] Various OpenACC reduction enhancements - ME and nvptx changes
@ 2022-06-29 14:34 Kwok Yeung
0 siblings, 0 replies; only message in thread
From: Kwok Yeung @ 2022-06-29 14:34 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:bce2c92cfec2ae1eb9d79e36dff5a220b688bfa1
commit bce2c92cfec2ae1eb9d79e36dff5a220b688bfa1
Author: Julian Brown <julian@codesourcery.com>
Date: Tue Feb 12 15:06:55 2019 -0800
Various OpenACC reduction enhancements - ME and nvptx changes
Parts of the first posting got lost in the second posting, above.
This version hopefully contains everything.
2018-10-30 Cesar Philippidis <cesar@codesourcery.com>
gcc/
* config/nvptx/nvptx.cc (nvptx_propagate_unified): New.
(nvptx_split_blocks): Call it for cond_uni insn.
(nvptx_expand_cond_uni): New.
(enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
(nvptx_init_builtins): Initialize it.
(nvptx_expand_builtin): Handle NVPTX_BUILTIN_COND_UNI.
(nvptx_generate_vector_shuffle): Change integral SHIFT operand to
tree BITS operand.
(nvptx_vector_reduction): New.
(nvptx_adjust_reduction_type): New.
(nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
(nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
(nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
Use nvptx_adjust_reduction_type to adjust the type of ref_to_res.
(nvptx_goacc_reduction_teardown): Use nvptx_adjust_reduction_type to adjust the type of ref_to_res.
* config/nvptx/nvptx.md (cond_uni): New pattern.
Diff:
---
gcc/ChangeLog.omp | 19 ++++
gcc/config/nvptx/nvptx.cc | 230 +++++++++++++++++++++++++++++++++++++++++-----
gcc/config/nvptx/nvptx.md | 7 ++
gcc/gimplify.cc | 8 +-
gcc/omp-general.h | 2 +-
gcc/omp-low.cc | 29 +++++-
gcc/omp-offload.cc | 18 ++++
7 files changed, 283 insertions(+), 30 deletions(-)
diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp
index 30b4ab9005e..d1a9bcef523 100644
--- a/gcc/ChangeLog.omp
+++ b/gcc/ChangeLog.omp
@@ -1,3 +1,22 @@
+2018-10-30 Cesar Philippidis <cesar@codesourcery.com>
+
+ * config/nvptx/nvptx.cc (nvptx_propagate_unified): New.
+ (nvptx_split_blocks): Call it for cond_uni insn.
+ (nvptx_expand_cond_uni): New.
+ (enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
+ (nvptx_init_builtins): Initialize it.
+ (nvptx_expand_builtin): Handle NVPTX_BUILTIN_COND_UNI.
+ (nvptx_generate_vector_shuffle): Change integral SHIFT operand to
+ tree BITS operand.
+ (nvptx_vector_reduction): New.
+ (nvptx_adjust_reduction_type): New.
+ (nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
+ (nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
+ (nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
+ Use nvptx_adjust_reduction_type to adjust the type of ref_to_res.
+ (nvptx_goacc_reduction_teardown): Use nvptx_adjust_reduction_type to adjust the type of ref_to_res.
+ * config/nvptx/nvptx.md (cond_uni): New pattern.
+
2018-06-29 Cesar Philippidis <cesar@codesourcery.com>
James Norris <jnorris@codesourcery.com>
diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc
index e4297e2d6c3..af383925f9b 100644
--- a/gcc/config/nvptx/nvptx.cc
+++ b/gcc/config/nvptx/nvptx.cc
@@ -3471,6 +3471,52 @@ nvptx_mach_vector_length ()
return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
}
+/* UNIFIED is a cond_uni insn. Find the branch insn it affects, and
+ mark that branch's condition as unified (by wrapping it in an UNSPEC_BR_UNIFIED). We expect to be in a single block. */
+
+static void
+nvptx_propagate_unified (rtx_insn *unified)
+{
+ rtx_insn *probe = unified;
+ rtx cond_reg = SET_DEST (PATTERN (unified));
+ rtx pat = NULL_RTX;
+
+ /* Find the comparison. (We could skip this and simply scan to the
+ block's terminating branch, if we didn't care for self
+ checking.) NOTE(review): if the scan runs out of insns after the first iteration, PAT still holds an earlier insn's pattern, so the gcc_assert (pat) below would not fire -- confirm. */
+ for (;;)
+ {
+ probe = next_real_insn (probe);
+ if (!probe)
+ break;
+ pat = PATTERN (probe);
+
+ if (GET_CODE (pat) == SET
+ && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE
+ && XEXP (SET_SRC (pat), 0) == cond_reg)
+ break;
+ gcc_assert (NONJUMP_INSN_P (probe));
+ }
+ gcc_assert (pat);
+ rtx pred_reg = SET_DEST (pat);
+
+ /* Find the branch: the first jump insn following the comparison. */
+ do
+ probe = NEXT_INSN (probe);
+ while (!JUMP_P (probe));
+
+ pat = PATTERN (probe);
+ rtx itec = XEXP (SET_SRC (pat), 0);
+ gcc_assert (XEXP (itec, 0) == pred_reg);
+
+ /* Mark the branch's condition as unified. */
+ rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg),
+ UNSPEC_BR_UNIFIED);
+ bool ok = validate_change (probe, &XEXP (itec, 0), unspec, false);
+
+ gcc_assert (ok);
+}
+
/* Loop structure of the function. The entire function is described as
a NULL loop. */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:struct parallel_g'. */
@@ -3574,6 +3620,9 @@ nvptx_split_blocks (bb_insn_map_t *map)
continue;
switch (recog_memoized (insn))
{
+ case CODE_FOR_cond_uni:
+ nvptx_propagate_unified (insn);
+ /* FALLTHROUGH */
default:
seen_insn = true;
continue;
@@ -6134,6 +6183,21 @@ nvptx_expand_cmp_swap (tree exp, rtx target,
return target;
}
+/* Expander for the compare unified builtin. Expand argument 0 of EXP in MODE, emit a cond_uni insn setting TARGET from it, and return TARGET. If IGNORE is nonzero, emit nothing and return TARGET as is. */
+
+static rtx
+nvptx_expand_cond_uni (tree exp, rtx target, machine_mode mode, int ignore)
+{
+ if (ignore)
+ return target;
+
+ rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
+ NULL_RTX, mode, EXPAND_NORMAL);
+
+ emit_insn (gen_cond_uni (target, src));
+
+ return target;
+}
/* Codes for all the NVPTX builtins. */
enum nvptx_builtins
@@ -6146,6 +6210,7 @@ enum nvptx_builtins
NVPTX_BUILTIN_CMP_SWAPLL,
NVPTX_BUILTIN_MEMBAR_GL,
NVPTX_BUILTIN_MEMBAR_CTA,
+ NVPTX_BUILTIN_COND_UNI,
NVPTX_BUILTIN_MAX
};
@@ -6188,6 +6253,7 @@ nvptx_init_builtins (void)
DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
DEF (MEMBAR_GL, "membar_gl", (VOID, VOID, NULL_TREE));
DEF (MEMBAR_CTA, "membar_cta", (VOID, VOID, NULL_TREE));
+ DEF (COND_UNI, "cond_uni", (integer_type_node, integer_type_node, NULL_TREE));
#undef DEF
#undef ST
@@ -6231,6 +6297,9 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
emit_insn (gen_nvptx_membar_cta ());
return NULL_RTX;
+ case NVPTX_BUILTIN_COND_UNI:
+ return nvptx_expand_cond_uni (exp, target, mode, ignore);
+
default: gcc_unreachable ();
}
}
@@ -6579,7 +6648,7 @@ nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
static void
nvptx_generate_vector_shuffle (location_t loc,
- tree dest_var, tree var, unsigned shift,
+ tree dest_var, tree var, tree bits,
gimple_seq *seq)
{
unsigned fn = NVPTX_BUILTIN_SHUFFLE;
@@ -6602,7 +6671,6 @@ nvptx_generate_vector_shuffle (location_t loc,
}
tree call = nvptx_builtin_decl (fn, true);
- tree bits = build_int_cst (unsigned_type_node, shift);
tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
tree expr;
@@ -6892,6 +6960,129 @@ nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
return nvptx_lockfull_update (loc, gsi, ptr, var, op, level);
}
+/* Emit a vector-level reduction loop. OLD_VAR is the incoming
+ variable to reduce (valid in each vector), OP is the reduction
+ operator. LOC is the location used for the emitted code and GSI the insertion point, which is re-established after each block split. Return the reduced value (an SSA var).
+
+ The code we generate looks like:
+ unsigned old_shift = DIM_SIZE(VECTOR);
+ do
+ {
+ shift = PHI (old_shift, new_shift);
+ var = PHI (old_var, new_var);
+ new_shift = shift >> 1;
+ other_var = VSHUFFLE (var, new_shift);
+ new_var = var OP other_var;
+ cond_var = builtin_cond_uni (new_shift);
+ }
+ while (cond_var > 1);
+
+ The builtin_cond_uni expands to a cond_uni instruction, which is
+ processed in nvptx_split_blocks to mark the loop's terminating
+ branch instruction. */
+
+static tree
+nvptx_vector_reduction (location_t loc, gimple_stmt_iterator *gsi,
+ tree old_var, tree_code op)
+{
+ tree var_type = TREE_TYPE (old_var);
+
+ /* Emit old_shift = DIM_SIZE(VECTOR) */
+ tree old_shift = make_ssa_name (integer_type_node);
+ tree dim = build_int_cst (integer_type_node, GOMP_DIM_VECTOR);
+ gcall *call = gimple_build_call_internal (IFN_GOACC_DIM_SIZE, 1, dim);
+ gimple_set_lhs (call, old_shift);
+ gimple_set_location (call, loc);
+ gsi_insert_before (gsi, call, GSI_SAME_STMT);
+
+ /* Split the block just after the init stmts. */
+ basic_block pre_bb = gsi_bb (*gsi);
+ edge pre_edge = split_block (pre_bb, call);
+ pre_edge->probability = profile_probability::even ();
+ basic_block loop_bb = pre_edge->dest;
+ pre_bb = pre_edge->src;
+ /* Reset the iterator. */
+ *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+ tree shift = make_ssa_name (integer_type_node);
+ tree new_shift = make_ssa_name (integer_type_node);
+ tree var = make_ssa_name (var_type);
+ tree other_var = make_ssa_name (var_type);
+ tree new_var = make_ssa_name (var_type);
+
+ /* Build and insert the loop body. */
+ gimple_seq loop_seq = NULL;
+
+ /* new_shift = shift >> 1 */
+ tree shift_expr = fold_build2 (RSHIFT_EXPR, integer_type_node,
+ shift, integer_one_node);
+ gimplify_assign (new_shift, shift_expr, &loop_seq);
+
+ /* other_var = shuffle (var, new_shift) */
+ nvptx_generate_vector_shuffle (loc, other_var, var, new_shift, &loop_seq);
+ /* new_var = var OP other_var */
+ tree red_expr = fold_build2 (op, var_type, var, other_var);
+ gimplify_assign (new_var, red_expr, &loop_seq);
+
+ /* Mark the iterator variable as unified. */
+ tree cond_var = make_ssa_name (integer_type_node);
+ tree uni_fn = nvptx_builtin_decl (NVPTX_BUILTIN_COND_UNI, true);
+ tree uni_expr = build_call_expr_loc (loc, uni_fn, 1, new_shift);
+ gimplify_assign (cond_var, uni_expr, &loop_seq);
+
+ gcond *cond = gimple_build_cond (LE_EXPR, cond_var, integer_one_node,
+ NULL_TREE, NULL_TREE);
+ gimple_seq_add_stmt (&loop_seq, cond);
+
+ gsi_insert_seq_before (gsi, loop_seq, GSI_SAME_STMT);
+
+ /* Split the block just after the loop stmts. */
+ edge post_edge = split_block (loop_bb, cond);
+ post_edge->probability = profile_probability::even ();
+ basic_block post_bb = post_edge->dest;
+ loop_bb = post_edge->src;
+ *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+ /* Create the loop: the exit edge to POST_BB has its TRUE_VALUE and FALLTHRU flags toggled (presumably turning split_block's fallthru edge into the true edge of the LE_EXPR test -- confirm), and a new false edge loops back to LOOP_BB. */
+ post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
+ edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
+ loop_edge->probability = profile_probability::even ();
+ set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
+ set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
+
+ gphi *shift_phi = create_phi_node (shift, loop_bb);
+ add_phi_arg (shift_phi, old_shift, pre_edge, loc);
+ add_phi_arg (shift_phi, new_shift, loop_edge, loc);
+
+ gphi *var_phi = create_phi_node (var, loop_bb);
+ add_phi_arg (var_phi, old_var, pre_edge, loc);
+ add_phi_arg (var_phi, new_var, loop_edge, loc);
+
+ loop *loop = alloc_loop ();
+ loop->header = loop_bb;
+ loop->latch = loop_bb;
+ add_loop (loop, loop_bb->loop_father);
+
+ return new_var;
+}
+
+/* Dummy reduction vars that have GOMP_MAP_FIRSTPRIVATE_POINTER data
+ mappings get retyped to (void *). Adjust the type of VAR to TYPE
+ as appropriate: return VAR itself if it already points to TYPE, otherwise append to SEQ a NOP_EXPR conversion of VAR to pointer-to-TYPE and return the new SSA name holding it. */
+
+static tree
+nvptx_adjust_reduction_type (tree var, tree type, gimple_seq *seq)
+{
+ if (TREE_TYPE (TREE_TYPE (var)) == type)
+ return var;
+
+ tree ptype = build_pointer_type (type);
+ tree t = make_ssa_name (ptype);
+ tree expr = fold_build1 (NOP_EXPR, ptype, var);
+ gimple_seq_add_stmt (seq, gimple_build_assign (t, expr));
+ return t;
+}
+
/* NVPTX implementation of GOACC_REDUCTION_SETUP. */
static void
@@ -6911,7 +7102,11 @@ nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
tree ref_to_res = gimple_call_arg (call, 1);
if (!integer_zerop (ref_to_res))
- var = build_simple_mem_ref (ref_to_res);
+ {
+ ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var),
+ &seq);
+ var = build_simple_mem_ref (ref_to_res);
+ }
}
if (level == GOMP_DIM_WORKER
@@ -7039,22 +7234,7 @@ nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
push_gimplify_context (true);
if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
- {
- /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
- but that requires a method of emitting a unified jump at the
- gimple level. */
- for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
- {
- tree other_var = make_ssa_name (TREE_TYPE (var));
- nvptx_generate_vector_shuffle (gimple_location (call),
- other_var, var, shfl, &seq);
-
- r = make_ssa_name (TREE_TYPE (var));
- gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
- var, other_var), &seq);
- var = r;
- }
- }
+ r = nvptx_vector_reduction (gimple_location (call), &gsi, var, op);
else
{
tree accum = NULL_TREE;
@@ -7073,7 +7253,11 @@ nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
else if (integer_zerop (ref_to_res))
r = var;
else
- accum = ref_to_res;
+ {
+ ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var),
+ &seq);
+ accum = ref_to_res;
+ }
if (accum)
{
@@ -7124,7 +7308,11 @@ nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
tree ref_to_res = gimple_call_arg (call, 1);
if (!integer_zerop (ref_to_res))
- gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
+ {
+ ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var),
+ &seq);
+ gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
+ }
}
if (lhs)
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 8ed685027b5..995f52b9492 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -942,6 +942,13 @@
"%J0\\tbra.uni\\t%l1;"
[(set_attr "predicable" "no")])
+(define_insn "cond_uni"
+ [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
+ (unspec:SI [(match_operand:SI 1 "nvptx_nonmemory_operand" "R")]
+ UNSPEC_BR_UNIFIED))]
+ ""
+ "%.\\tmov%t0\\t%0, %1; // unified")
+
(define_expand "cbranch<mode>4"
[(set (pc)
(if_then_else (match_operator 0 "nvptx_comparison_operator"
diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc
index e6540314299..8c44d0c4617 100644
--- a/gcc/gimplify.cc
+++ b/gcc/gimplify.cc
@@ -11776,9 +11776,11 @@ gimplify_adjust_omp_clauses (gimple_seq *pre_p, gimple_seq body, tree *list_p,
case OMP_CLAUSE_TASK_REDUCTION:
decl = OMP_CLAUSE_DECL (c);
/* OpenACC reductions need a present_or_copy data clause.
- Add one if necessary. Emit error when the reduction is private. */
- if (ctx->region_type == ORT_ACC_PARALLEL
- || ctx->region_type == ORT_ACC_SERIAL)
+ Add one if necessary. Emit error when the reduction is
+ private. */
+ if (DECL_P (decl) &&
+ (ctx->region_type == ORT_ACC_PARALLEL
+ || ctx->region_type == ORT_ACC_SERIAL))
{
n = splay_tree_lookup (ctx->variables, (splay_tree_key) decl);
if (n->value & (GOVD_PRIVATE | GOVD_FIRSTPRIVATE))
diff --git a/gcc/omp-general.h b/gcc/omp-general.h
index 7a94831e8f5..e68f48d269b 100644
--- a/gcc/omp-general.h
+++ b/gcc/omp-general.h
@@ -33,7 +33,7 @@ enum oacc_loop_flags {
OLF_GANG_STATIC = 1u << 3, /* Gang partitioning is static (has op). */
OLF_TILE = 1u << 4, /* Tiled loop. */
OLF_REDUCTION = 1u << 5, /* Reduction loop. */
-
+
/* Explicitly specified loop axes. */
OLF_DIM_BASE = 6,
OLF_DIM_GANG = 1u << (OLF_DIM_BASE + GOMP_DIM_GANG),
diff --git a/gcc/omp-low.cc b/gcc/omp-low.cc
index e714e2c460b..70a6dc3c399 100644
--- a/gcc/omp-low.cc
+++ b/gcc/omp-low.cc
@@ -7650,6 +7650,7 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
tree ref_to_res = NULL_TREE;
tree incoming, outgoing, v1, v2, v3;
bool is_private = false;
+ bool is_fpp = false;
enum tree_code rcode = OMP_CLAUSE_REDUCTION_CODE (c);
if (rcode == MINUS_EXPR)
@@ -7711,19 +7712,37 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
is_private = true;
goto do_lookup;
}
+ else if (OMP_CLAUSE_CODE (cls) == OMP_CLAUSE_MAP
+ && (OMP_CLAUSE_MAP_KIND (cls)
+ == GOMP_MAP_FIRSTPRIVATE_POINTER)
+ && orig == OMP_CLAUSE_DECL (cls))
+ {
+ is_fpp = true;
+ goto do_lookup;
+ }
}
do_lookup:
/* This is the outermost construct with this reduction,
see if there's a mapping for it. */
if (gimple_code (outer->stmt) == GIMPLE_OMP_TARGET
- && maybe_lookup_field (orig, outer) && !is_private)
+ && (maybe_lookup_field (orig, outer) || is_fpp) && !is_private)
{
- ref_to_res = build_receiver_ref (orig, false, outer);
- if (omp_privatize_by_reference (orig))
- ref_to_res = build_simple_mem_ref (ref_to_res);
-
tree type = TREE_TYPE (var);
+
+ if (is_fpp)
+ {
+ tree x = create_tmp_var (type);
+ gimplify_assign (x, lookup_decl (orig, outer), fork_seq);
+ ref_to_res = x;
+ }
+ else
+ {
+ ref_to_res = build_receiver_ref (orig, false, outer);
+ if (omp_privatize_by_reference (orig))
+ ref_to_res = build_simple_mem_ref (ref_to_res);
+ }
+
if (POINTER_TYPE_P (type))
type = TREE_TYPE (type);
diff --git a/gcc/omp-offload.cc b/gcc/omp-offload.cc
index ad4e772015e..78c2982da5e 100644
--- a/gcc/omp-offload.cc
+++ b/gcc/omp-offload.cc
@@ -1623,6 +1623,13 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
non-innermost available level. */
unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
+ /* Orphan reductions cannot have gang partitioning. */
+ if ((loop->flags & OLF_REDUCTION)
+ && oacc_get_fn_attrib (current_function_decl)
+ && !lookup_attribute ("omp target entrypoint",
+ DECL_ATTRIBUTES (current_function_decl)))
+ this_mask = GOMP_DIM_MASK (GOMP_DIM_WORKER);
+
/* Find the first outermost available partition. */
while (this_mask <= outer_mask)
this_mask <<= 1;
@@ -1774,6 +1781,17 @@ default_goacc_reduction (gcall *call)
if (!integer_zerop (ref_to_res))
{
+ /* Dummy reduction vars that have GOMP_MAP_FIRSTPRIVATE_POINTER data
+ mappings get retyped to (void *). Adjust the type of ref_to_res
+ as appropriate. */
+ if (TREE_TYPE (TREE_TYPE (ref_to_res)) != TREE_TYPE (var))
+ {
+ tree ptype = build_pointer_type (TREE_TYPE (var));
+ tree t = make_ssa_name (ptype);
+ tree expr = fold_build1 (NOP_EXPR, ptype, ref_to_res);
+ gimple_seq_add_stmt (&seq, gimple_build_assign (t, expr));
+ ref_to_res = t;
+ }
tree dst = build_simple_mem_ref (ref_to_res);
tree src = var;
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2022-06-29 14:34 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-29 14:34 [gcc/devel/omp/gcc-12] Various OpenACC reduction enhancements - ME and nvptx changes Kwok Yeung
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).