From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1729) id 911903842AE8; Wed, 29 Jun 2022 14:34:05 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 911903842AE8 Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Kwok Yeung To: gcc-cvs@gcc.gnu.org Subject: [gcc/devel/omp/gcc-12] Various OpenACC reduction enhancements - ME and nvptx changes X-Act-Checkin: gcc X-Git-Author: Julian Brown X-Git-Refname: refs/heads/devel/omp/gcc-12 X-Git-Oldrev: 3a5e525489f2f808093ae1f12b5d2b406f571ec7 X-Git-Newrev: bce2c92cfec2ae1eb9d79e36dff5a220b688bfa1 Message-Id: <20220629143405.911903842AE8@sourceware.org> Date: Wed, 29 Jun 2022 14:34:05 +0000 (GMT) X-BeenThere: gcc-cvs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 29 Jun 2022 14:34:05 -0000 https://gcc.gnu.org/g:bce2c92cfec2ae1eb9d79e36dff5a220b688bfa1 commit bce2c92cfec2ae1eb9d79e36dff5a220b688bfa1 Author: Julian Brown Date: Tue Feb 12 15:06:55 2019 -0800 Various OpenACC reduction enhancements - ME and nvptx changes Parts of the first posting got lost in the second posting, above. This version hopefully contains everything. 2018-10-30 Cesar Philippidis gcc/ * config/nvptx/nvptx.cc (nvptx_propagate_unified): New. (nvptx_split_blocks): Call it for cond_uni insn. (nvptx_expand_cond_uni): New. (enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI. (nvptx_init_builtins): Initialize it. (nvptx_expand_builtin): (nvptx_generate_vector_shuffle): Change integral SHIFT operand to tree BITS operand. (nvptx_vector_reduction): New. (nvptx_adjust_reduction_type): New. (nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res. (nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist. (nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector. 
Use it to adjust the type of ref_to_res. (nvptx_goacc_reduction_teardown): * config/nvptx/nvptx.md (cond_uni): New pattern. Diff: --- gcc/ChangeLog.omp | 19 ++++ gcc/config/nvptx/nvptx.cc | 230 +++++++++++++++++++++++++++++++++++++++++----- gcc/config/nvptx/nvptx.md | 7 ++ gcc/gimplify.cc | 8 +- gcc/omp-general.h | 2 +- gcc/omp-low.cc | 29 +++++- gcc/omp-offload.cc | 18 ++++ 7 files changed, 283 insertions(+), 30 deletions(-) diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp index 30b4ab9005e..d1a9bcef523 100644 --- a/gcc/ChangeLog.omp +++ b/gcc/ChangeLog.omp @@ -1,3 +1,22 @@ +2018-10-30 Cesar Philippidis + + * config/nvptx/nvptx.cc (nvptx_propagate_unified): New. + (nvptx_split_blocks): Call it for cond_uni insn. + (nvptx_expand_cond_uni): New. + (enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI. + (nvptx_init_builtins): Initialize it. + (nvptx_expand_builtin): + (nvptx_generate_vector_shuffle): Change integral SHIFT operand to + tree BITS operand. + (nvptx_vector_reduction): New. + (nvptx_adjust_reduction_type): New. + (nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res. + (nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist. + (nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector. + Use it to adjust the type of ref_to_res. + (nvptx_goacc_reduction_teardown): + * config/nvptx/nvptx.md (cond_uni): New pattern. + 2018-06-29 Cesar Philippidis James Norris diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index e4297e2d6c3..af383925f9b 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -3471,6 +3471,52 @@ nvptx_mach_vector_length () return cfun->machine->axis_dim[MACH_VECTOR_LENGTH]; } +/* UNIFIED is a cond_uni insn. Find the branch insn it affects, and + mark that as unified. We expect to be in a single block. 
*/ + +static void +nvptx_propagate_unified (rtx_insn *unified) +{ + rtx_insn *probe = unified; + rtx cond_reg = SET_DEST (PATTERN (unified)); + rtx pat = NULL_RTX; + + /* Find the comparison. (We could skip this and simply scan to the + block's terminating branch, if we didn't care for self + checking.) */ + for (;;) + { + probe = next_real_insn (probe); + if (!probe) + break; + pat = PATTERN (probe); + + if (GET_CODE (pat) == SET + && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE + && XEXP (SET_SRC (pat), 0) == cond_reg) + break; + gcc_assert (NONJUMP_INSN_P (probe)); + } + gcc_assert (pat); + rtx pred_reg = SET_DEST (pat); + + /* Find the branch. */ + do + probe = NEXT_INSN (probe); + while (!JUMP_P (probe)); + + pat = PATTERN (probe); + rtx itec = XEXP (SET_SRC (pat), 0); + gcc_assert (XEXP (itec, 0) == pred_reg); + + /* Mark the branch's condition as unified. */ + rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg), + UNSPEC_BR_UNIFIED); + bool ok = validate_change (probe, &XEXP (itec, 0), unspec, false); + + gcc_assert (ok); +} + /* Loop structure of the function. The entire function is described as a NULL loop. */ /* See also 'gcc/omp-oacc-neuter-broadcast.cc:struct parallel_g'. */ @@ -3574,6 +3620,9 @@ nvptx_split_blocks (bb_insn_map_t *map) continue; switch (recog_memoized (insn)) { + case CODE_FOR_cond_uni: + nvptx_propagate_unified (insn); + /* FALLTHROUGH */ default: seen_insn = true; continue; @@ -6134,6 +6183,21 @@ nvptx_expand_cmp_swap (tree exp, rtx target, return target; } +/* Expander for the compare unified builtin. */ + +static rtx +nvptx_expand_cond_uni (tree exp, rtx target, machine_mode mode, int ignore) +{ + if (ignore) + return target; + + rtx src = expand_expr (CALL_EXPR_ARG (exp, 0), + NULL_RTX, mode, EXPAND_NORMAL); + + emit_insn (gen_cond_uni (target, src)); + + return target; +} /* Codes for all the NVPTX builtins. 
*/ enum nvptx_builtins @@ -6146,6 +6210,7 @@ enum nvptx_builtins NVPTX_BUILTIN_CMP_SWAPLL, NVPTX_BUILTIN_MEMBAR_GL, NVPTX_BUILTIN_MEMBAR_CTA, + NVPTX_BUILTIN_COND_UNI, NVPTX_BUILTIN_MAX }; @@ -6188,6 +6253,7 @@ nvptx_init_builtins (void) DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE)); DEF (MEMBAR_GL, "membar_gl", (VOID, VOID, NULL_TREE)); DEF (MEMBAR_CTA, "membar_cta", (VOID, VOID, NULL_TREE)); + DEF (COND_UNI, "cond_uni", (integer_type_node, integer_type_node, NULL_TREE)); #undef DEF #undef ST @@ -6231,6 +6297,9 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), emit_insn (gen_nvptx_membar_cta ()); return NULL_RTX; + case NVPTX_BUILTIN_COND_UNI: + return nvptx_expand_cond_uni (exp, target, mode, ignore); + default: gcc_unreachable (); } } @@ -6579,7 +6648,7 @@ nvptx_get_shared_red_addr (tree type, tree offset, bool vector) static void nvptx_generate_vector_shuffle (location_t loc, - tree dest_var, tree var, unsigned shift, + tree dest_var, tree var, tree bits, gimple_seq *seq) { unsigned fn = NVPTX_BUILTIN_SHUFFLE; @@ -6602,7 +6671,6 @@ nvptx_generate_vector_shuffle (location_t loc, } tree call = nvptx_builtin_decl (fn, true); - tree bits = build_int_cst (unsigned_type_node, shift); tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN); tree expr; @@ -6892,6 +6960,129 @@ nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi, return nvptx_lockfull_update (loc, gsi, ptr, var, op, level); } +/* Emit a vector-level reduction loop. OLD_VAR is the incoming + variable to reduce (valid in each vector), OP is the reduction + operator. Return the reduced value (an SSA var). 
+ + The code we generate looks like: + unsigned old_shift = DIM_SIZE(VECTOR); + do + { + shift = PHI (old_shift, new_shift); + var = PHI (old_var, new_var); + new_shift = shift >> 1; + other_var = VSHUFFLE (var, new_shift); + new_var = var OP other_var; + cond_var = builtin_cond_uni (new_shift); + } + while (cond_var > 1); + + The builtin_cond_uni expands to a cond_uni instruction, which is + processed in nvptx_split_blocks to mark the loop's terminating + branch instruction. */ + +static tree +nvptx_vector_reduction (location_t loc, gimple_stmt_iterator *gsi, + tree old_var, tree_code op) +{ + tree var_type = TREE_TYPE (old_var); + + /* Emit old_shift = DIM_SIZE(VECTOR) */ + tree old_shift = make_ssa_name (integer_type_node); + tree dim = build_int_cst (integer_type_node, GOMP_DIM_VECTOR); + gcall *call = gimple_build_call_internal (IFN_GOACC_DIM_SIZE, 1, dim); + gimple_set_lhs (call, old_shift); + gimple_set_location (call, loc); + gsi_insert_before (gsi, call, GSI_SAME_STMT); + + /* Split the block just after the init stmts. */ + basic_block pre_bb = gsi_bb (*gsi); + edge pre_edge = split_block (pre_bb, call); + pre_edge->probability = profile_probability::even (); + basic_block loop_bb = pre_edge->dest; + pre_bb = pre_edge->src; + /* Reset the iterator. */ + *gsi = gsi_for_stmt (gsi_stmt (*gsi)); + + tree shift = make_ssa_name (integer_type_node); + tree new_shift = make_ssa_name (integer_type_node); + tree var = make_ssa_name (var_type); + tree other_var = make_ssa_name (var_type); + tree new_var = make_ssa_name (var_type); + + /* Build and insert the loop body. 
*/ + gimple_seq loop_seq = NULL; + + /* new_shift = shift >> 1 */ + tree shift_expr = fold_build2 (RSHIFT_EXPR, integer_type_node, + shift, integer_one_node); + gimplify_assign (new_shift, shift_expr, &loop_seq); + + /* other_var = shuffle (var, shift) */ + nvptx_generate_vector_shuffle (loc, other_var, var, new_shift, &loop_seq); + /* new_var = var OP other_var */ + tree red_expr = fold_build2 (op, var_type, var, other_var); + gimplify_assign (new_var, red_expr, &loop_seq); + + /* Mark the iterator variable as unified. */ + tree cond_var = make_ssa_name (integer_type_node); + tree uni_fn = nvptx_builtin_decl (NVPTX_BUILTIN_COND_UNI, true); + tree uni_expr = build_call_expr_loc (loc, uni_fn, 1, new_shift); + gimplify_assign (cond_var, uni_expr, &loop_seq); + + gcond *cond = gimple_build_cond (LE_EXPR, cond_var, integer_one_node, + NULL_TREE, NULL_TREE); + gimple_seq_add_stmt (&loop_seq, cond); + + gsi_insert_seq_before (gsi, loop_seq, GSI_SAME_STMT); + + /* Split the block just after the loop stmts. */ + edge post_edge = split_block (loop_bb, cond); + post_edge->probability = profile_probability::even (); + basic_block post_bb = post_edge->dest; + loop_bb = post_edge->src; + *gsi = gsi_for_stmt (gsi_stmt (*gsi)); + + /* Create the loop. 
*/ + post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU; + edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE); + loop_edge->probability = profile_probability::even (); + set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb); + set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb); + + gphi *shift_phi = create_phi_node (shift, loop_bb); + add_phi_arg (shift_phi, old_shift, pre_edge, loc); + add_phi_arg (shift_phi, new_shift, loop_edge, loc); + + gphi *var_phi = create_phi_node (var, loop_bb); + add_phi_arg (var_phi, old_var, pre_edge, loc); + add_phi_arg (var_phi, new_var, loop_edge, loc); + + loop *loop = alloc_loop (); + loop->header = loop_bb; + loop->latch = loop_bb; + add_loop (loop, loop_bb->loop_father); + + return new_var; +} + +/* Dummy reduction vars that have GOMP_MAP_FIRSTPRIVATE_POINTER data + mappings gets retyped to (void *). Adjust the type of VAR to TYPE + as appropriate. */ + +static tree +nvptx_adjust_reduction_type (tree var, tree type, gimple_seq *seq) +{ + if (TREE_TYPE (TREE_TYPE (var)) == type) + return var; + + tree ptype = build_pointer_type (type); + tree t = make_ssa_name (ptype); + tree expr = fold_build1 (NOP_EXPR, ptype, var); + gimple_seq_add_stmt (seq, gimple_build_assign (t, expr)); + return t; +} + /* NVPTX implementation of GOACC_REDUCTION_SETUP. */ static void @@ -6911,7 +7102,11 @@ nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa) tree ref_to_res = gimple_call_arg (call, 1); if (!integer_zerop (ref_to_res)) - var = build_simple_mem_ref (ref_to_res); + { + ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var), + &seq); + var = build_simple_mem_ref (ref_to_res); + } } if (level == GOMP_DIM_WORKER @@ -7039,22 +7234,7 @@ nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa) push_gimplify_context (true); if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE) - { - /* Emit binary shuffle tree. TODO. 
Emit this as an actual loop, - but that requires a method of emitting a unified jump at the - gimple level. */ - for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1) - { - tree other_var = make_ssa_name (TREE_TYPE (var)); - nvptx_generate_vector_shuffle (gimple_location (call), - other_var, var, shfl, &seq); - - r = make_ssa_name (TREE_TYPE (var)); - gimplify_assign (r, fold_build2 (op, TREE_TYPE (var), - var, other_var), &seq); - var = r; - } - } + r = nvptx_vector_reduction (gimple_location (call), &gsi, var, op); else { tree accum = NULL_TREE; @@ -7073,7 +7253,11 @@ nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa) else if (integer_zerop (ref_to_res)) r = var; else - accum = ref_to_res; + { + ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var), + &seq); + accum = ref_to_res; + } if (accum) { @@ -7124,7 +7308,11 @@ nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa) tree ref_to_res = gimple_call_arg (call, 1); if (!integer_zerop (ref_to_res)) - gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq); + { + ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var), + &seq); + gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq); + } } if (lhs) diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 8ed685027b5..995f52b9492 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -942,6 +942,13 @@ "%J0\\tbra.uni\\t%l1;" [(set_attr "predicable" "no")]) +(define_insn "cond_uni" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (unspec:SI [(match_operand:SI 1 "nvptx_nonmemory_operand" "R")] + UNSPEC_BR_UNIFIED))] + "" + "%.\\tmov%t0\\t%0, %1; // unified") + (define_expand "cbranch4" [(set (pc) (if_then_else (match_operator 0 "nvptx_comparison_operator" diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc index e6540314299..8c44d0c4617 100644 --- a/gcc/gimplify.cc +++ b/gcc/gimplify.cc @@ -11776,9 +11776,11 @@ gimplify_adjust_omp_clauses (gimple_seq 
*pre_p, gimple_seq body, tree *list_p, case OMP_CLAUSE_TASK_REDUCTION: decl = OMP_CLAUSE_DECL (c); /* OpenACC reductions need a present_or_copy data clause. - Add one if necessary. Emit error when the reduction is private. */ - if (ctx->region_type == ORT_ACC_PARALLEL - || ctx->region_type == ORT_ACC_SERIAL) + Add one if necessary. Emit error when the reduction is + private. */ + if (DECL_P (decl) && + (ctx->region_type == ORT_ACC_PARALLEL + || ctx->region_type == ORT_ACC_SERIAL)) { n = splay_tree_lookup (ctx->variables, (splay_tree_key) decl); if (n->value & (GOVD_PRIVATE | GOVD_FIRSTPRIVATE)) diff --git a/gcc/omp-general.h b/gcc/omp-general.h index 7a94831e8f5..e68f48d269b 100644 --- a/gcc/omp-general.h +++ b/gcc/omp-general.h @@ -33,7 +33,7 @@ enum oacc_loop_flags { OLF_GANG_STATIC = 1u << 3, /* Gang partitioning is static (has op). */ OLF_TILE = 1u << 4, /* Tiled loop. */ OLF_REDUCTION = 1u << 5, /* Reduction loop. */ - + /* Explicitly specified loop axes. */ OLF_DIM_BASE = 6, OLF_DIM_GANG = 1u << (OLF_DIM_BASE + GOMP_DIM_GANG), diff --git a/gcc/omp-low.cc b/gcc/omp-low.cc index e714e2c460b..70a6dc3c399 100644 --- a/gcc/omp-low.cc +++ b/gcc/omp-low.cc @@ -7650,6 +7650,7 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner, tree ref_to_res = NULL_TREE; tree incoming, outgoing, v1, v2, v3; bool is_private = false; + bool is_fpp = false; enum tree_code rcode = OMP_CLAUSE_REDUCTION_CODE (c); if (rcode == MINUS_EXPR) @@ -7711,19 +7712,37 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner, is_private = true; goto do_lookup; } + else if (OMP_CLAUSE_CODE (cls) == OMP_CLAUSE_MAP + && (OMP_CLAUSE_MAP_KIND (cls) + == GOMP_MAP_FIRSTPRIVATE_POINTER) + && orig == OMP_CLAUSE_DECL (cls)) + { + is_fpp = true; + goto do_lookup; + } } do_lookup: /* This is the outermost construct with this reduction, see if there's a mapping for it. 
*/ if (gimple_code (outer->stmt) == GIMPLE_OMP_TARGET - && maybe_lookup_field (orig, outer) && !is_private) + && (maybe_lookup_field (orig, outer) || is_fpp) && !is_private) { - ref_to_res = build_receiver_ref (orig, false, outer); - if (omp_privatize_by_reference (orig)) - ref_to_res = build_simple_mem_ref (ref_to_res); - tree type = TREE_TYPE (var); + + if (is_fpp) + { + tree x = create_tmp_var (type); + gimplify_assign (x, lookup_decl (orig, outer), fork_seq); + ref_to_res = x; + } + else + { + ref_to_res = build_receiver_ref (orig, false, outer); + if (omp_privatize_by_reference (orig)) + ref_to_res = build_simple_mem_ref (ref_to_res); + } + if (POINTER_TYPE_P (type)) type = TREE_TYPE (type); diff --git a/gcc/omp-offload.cc b/gcc/omp-offload.cc index ad4e772015e..78c2982da5e 100644 --- a/gcc/omp-offload.cc +++ b/gcc/omp-offload.cc @@ -1623,6 +1623,13 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask, non-innermost available level. */ unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG); + /* Orphan reductions cannot have gang partitioning. */ + if ((loop->flags & OLF_REDUCTION) + && oacc_get_fn_attrib (current_function_decl) + && !lookup_attribute ("omp target entrypoint", + DECL_ATTRIBUTES (current_function_decl))) + this_mask = GOMP_DIM_MASK (GOMP_DIM_WORKER); + /* Find the first outermost available partition. */ while (this_mask <= outer_mask) this_mask <<= 1; @@ -1774,6 +1781,17 @@ default_goacc_reduction (gcall *call) if (!integer_zerop (ref_to_res)) { + /* Dummy reduction vars that have GOMP_MAP_FIRSTPRIVATE_POINTER data + mappings gets retyped to (void *). Adjust the type of ref_to_res + as appropriate. 
*/ + if (TREE_TYPE (TREE_TYPE (ref_to_res)) != TREE_TYPE (var)) + { + tree ptype = build_pointer_type (TREE_TYPE (var)); + tree t = make_ssa_name (ptype); + tree expr = fold_build1 (NOP_EXPR, ptype, ref_to_res); + gimple_seq_add_stmt (&seq, gimple_build_assign (t, expr)); + ref_to_res = t; + } tree dst = build_simple_mem_ref (ref_to_res); tree src = var;