From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 66582 invoked by alias); 5 Oct 2018 14:07:10 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Received: (qmail 66571 invoked by uid 89); 5 Oct 2018 14:07:09 -0000 Authentication-Results: sourceware.org; auth=none X-Spam-SWARE-Status: No, score=-26.9 required=5.0 tests=BAYES_00,GIT_PATCH_0,GIT_PATCH_1,GIT_PATCH_2,GIT_PATCH_3,SPF_PASS autolearn=ham version=3.3.2 spammy=5660, sk:build_c, EXPAND_NORMAL, Codes X-HELO: mx1.suse.de Received: from mx2.suse.de (HELO mx1.suse.de) (195.135.220.15) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Fri, 05 Oct 2018 14:07:06 +0000 Received: from relay1.suse.de (unknown [195.135.220.254]) by mx1.suse.de (Postfix) with ESMTP id A7CCBAF71; Fri, 5 Oct 2018 14:07:03 +0000 (UTC) Subject: Re: [patch] various OpenACC reduction enhancements - ME and nvptx changes To: Cesar Philippidis , "gcc-patches@gcc.gnu.org" , Jakub Jelinek References: <27100797-1930-5c5f-ec68-c5888be08088@codesourcery.com> From: Tom de Vries Message-ID: <30208224-2a8a-c50a-d663-39b1bc8d3e93@suse.de> Date: Fri, 05 Oct 2018 14:09:00 -0000 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Thunderbird/60.0 MIME-Version: 1.0 In-Reply-To: <27100797-1930-5c5f-ec68-c5888be08088@codesourcery.com> Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit X-IsSubscribed: yes X-SW-Source: 2018-10/txt/msg00327.txt.bz2 On 6/29/18 8:19 PM, Cesar Philippidis wrote: > The attached patch includes the nvptx and GCC ME reductions enhancements. > > Is this patch OK for trunk? It bootstrapped / regression tested cleanly > for x86_64 with nvptx offloading. > These need fixing: ... === ERROR type #5: trailing whitespace (4 error(s)) === gcc/config/nvptx/nvptx.c:5139:0:██ gcc/config/nvptx/nvptx.c:5660:8: do█ gcc/config/nvptx/nvptx.c:5702:0:██ gcc/config/nvptx/nvptx.c:5726:0:██ ... > gcc/ > * config/nvptx/nvptx.c (nvptx_propagate_unified): New. > (nvptx_split_blocks): Call it for cond_uni insn. > (nvptx_expand_cond_uni): New. > (enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI. > (nvptx_init_builtins): Initialize it. > (nvptx_expand_builtin): > (nvptx_generate_vector_shuffle): Change integral SHIFT operand to > tree BITS operand. > (nvptx_vector_reduction): New. > (nvptx_adjust_reduction_type): New. > (nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res. > (nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist. > (nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector. > Use it to adjust the type of ref_to_res. > (nvptx_goacc_reduction_teardown): > * config/nvptx/nvptx.md (cond_uni): New pattern. > diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c > index 5608bee8a8d..33ec3db1153 100644 > --- a/gcc/config/nvptx/nvptx.c > +++ b/gcc/config/nvptx/nvptx.c > @@ -2863,6 +2863,52 @@ nvptx_reorg_uniform_simt () > } > } > > +/* UNIFIED is a cond_uni insn. Find the branch insn it affects, and > + mark that as unified. We expect to be in a single block. */ > + > +static void > +nvptx_propagate_unified (rtx_insn *unified) > +{ > + rtx_insn *probe = unified; > + rtx cond_reg = SET_DEST (PATTERN (unified)); > + rtx pat = NULL_RTX; > + > + /* Find the comparison. (We could skip this and simply scan to he > + blocks' terminating branch, if we didn't care for self > + checking.) */ > + for (;;) > + { > + probe = next_real_insn (probe); > + if (!probe) > + break; > + pat = PATTERN (probe); > + > + if (GET_CODE (pat) == SET > + && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE > + && XEXP (SET_SRC (pat), 0) == cond_reg) > + break; > + gcc_assert (NONJUMP_INSN_P (probe)); > + } > + gcc_assert (pat); > + rtx pred_reg = SET_DEST (pat); > + > + /* Find the branch. */ > + do > + probe = NEXT_INSN (probe); > + while (!JUMP_P (probe)); > + > + pat = PATTERN (probe); > + rtx itec = XEXP (SET_SRC (pat), 0); > + gcc_assert (XEXP (itec, 0) == pred_reg); > + > + /* Mark the branch's condition as unified. */ > + rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg), > + UNSPEC_BR_UNIFIED); > + bool ok = validate_change (probe, &XEXP (itec, 0), unspec, false); > + > + gcc_assert (ok); > +} > + > /* Loop structure of the function. The entire function is described as > a NULL loop. */ > > @@ -2964,6 +3010,9 @@ nvptx_split_blocks (bb_insn_map_t *map) > continue; > switch (recog_memoized (insn)) > { > + case CODE_FOR_cond_uni: > + nvptx_propagate_unified (insn); > + /* FALLTHROUGH */ > default: > seen_insn = true; > continue; > @@ -5080,6 +5129,21 @@ nvptx_expand_cmp_swap (tree exp, rtx target, > return target; > } > > +/* Expander for the compare unified builtin. */ > + > +static rtx > +nvptx_expand_cond_uni (tree exp, rtx target, machine_mode mode, int ignore) > +{ > + if (ignore) > + return target; > + > + rtx src = expand_expr (CALL_EXPR_ARG (exp, 0), > + NULL_RTX, mode, EXPAND_NORMAL); > + > + emit_insn (gen_cond_uni (target, src)); > + > + return target; > +} > > /* Codes for all the NVPTX builtins. */ > enum nvptx_builtins > @@ -5089,6 +5153,7 @@ enum nvptx_builtins > NVPTX_BUILTIN_WORKER_ADDR, > NVPTX_BUILTIN_CMP_SWAP, > NVPTX_BUILTIN_CMP_SWAPLL, > + NVPTX_BUILTIN_COND_UNI, > NVPTX_BUILTIN_MAX > }; > > @@ -5126,6 +5191,7 @@ nvptx_init_builtins (void) > (PTRVOID, ST, UINT, UINT, NULL_TREE)); > DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE)); > DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE)); > + DEF (COND_UNI, "cond_uni", (integer_type_node, integer_type_node, NULL_TREE)); > > #undef DEF > #undef ST > @@ -5158,6 +5224,9 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), > case NVPTX_BUILTIN_CMP_SWAPLL: > return nvptx_expand_cmp_swap (exp, target, mode, ignore); > > + case NVPTX_BUILTIN_COND_UNI: > + return nvptx_expand_cond_uni (exp, target, mode, ignore); > + > default: gcc_unreachable (); > } > } > @@ -5284,7 +5353,7 @@ nvptx_get_worker_red_addr (tree type, tree offset) > > static void > nvptx_generate_vector_shuffle (location_t loc, > - tree dest_var, tree var, unsigned shift, > + tree dest_var, tree var, tree bits, > gimple_seq *seq) > { > unsigned fn = NVPTX_BUILTIN_SHUFFLE; > @@ -5307,7 +5376,6 @@ nvptx_generate_vector_shuffle (location_t loc, > } > > tree call = nvptx_builtin_decl (fn, true); > - tree bits = build_int_cst (unsigned_type_node, shift); > tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN); > tree expr; > > @@ -5583,6 +5651,126 @@ nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi, > return nvptx_lockfull_update (loc, gsi, ptr, var, op); > } > > +/* Emit a vector-level reduction loop. OLD_VAR is the incoming > + variable to reduce (valid in each vector), OP is the reduction > + operator. Return the reduced value (an SSA var). > + > + The code we generate looks like: > + unsigned old_shift = DIM_SIZE(VECTOR); > + do > + { > + shift = PHI (old_shift, new_shift); > + var = PHI (old_var, new_var); > + new_shift = shift >> 1; > + other_var = VSHUFFLE (var, new_shift); > + new_var = var OP other_var; > + cond_var = builtin_cond_uni (new_shift); > + } > + while (cond_var > 1); > + > + The builtin_cond_ini expands to a cond_uni instruction, which is > + processed in nvpts_split_blocks to mark the loop's terminating > + branch instruction. */ > + > +static tree > +nvptx_vector_reduction (location_t loc, gimple_stmt_iterator *gsi, > + tree old_var, tree_code op) > +{ > + tree var_type = TREE_TYPE (old_var); > + > + /* Emit old_shift = DIM_SIZE(VECTOR) */ > + tree old_shift = make_ssa_name (integer_type_node); > + tree dim = build_int_cst (integer_type_node, GOMP_DIM_VECTOR); > + gcall *call = gimple_build_call_internal (IFN_GOACC_DIM_SIZE, 1, dim); > + gimple_set_lhs (call, old_shift); > + gimple_set_location (call, loc); > + gsi_insert_before (gsi, call, GSI_SAME_STMT); > + > + /* Split the block just after the init stmts. */ > + basic_block pre_bb = gsi_bb (*gsi); > + edge pre_edge = split_block (pre_bb, call); > + basic_block loop_bb = pre_edge->dest; > + pre_bb = pre_edge->src; > + /* Reset the iterator. */ > + *gsi = gsi_for_stmt (gsi_stmt (*gsi)); > + > + tree shift = make_ssa_name (integer_type_node); > + tree new_shift = make_ssa_name (integer_type_node); > + tree var = make_ssa_name (var_type); > + tree other_var = make_ssa_name (var_type); > + tree new_var = make_ssa_name (var_type); > + > + /* Build and insert the loop body. */ > + gimple_seq loop_seq = NULL; > + > + /* new_shift = shift >> 1 */ > + tree shift_expr = fold_build2 (RSHIFT_EXPR, integer_type_node, > + shift, integer_one_node); > + gimplify_assign (new_shift, shift_expr, &loop_seq); > + > + /* other_var = shuffle (var, shift) */ > + nvptx_generate_vector_shuffle (loc, other_var, var, new_shift, &loop_seq); > + /* new_var = var OP other_var */ > + tree red_expr = fold_build2 (op, var_type, var, other_var); > + gimplify_assign (new_var, red_expr, &loop_seq); > + > + /* Mark the iterator variable as unified. */ > + tree cond_var = make_ssa_name (integer_type_node); > + tree uni_fn = nvptx_builtin_decl (NVPTX_BUILTIN_COND_UNI, true); > + tree uni_expr = build_call_expr_loc (loc, uni_fn, 1, new_shift); > + gimplify_assign (cond_var, uni_expr, &loop_seq); > + > + gcond *cond = gimple_build_cond (LE_EXPR, cond_var, integer_one_node, > + NULL_TREE, NULL_TREE); > + gimple_seq_add_stmt (&loop_seq, cond); > + > + gsi_insert_seq_before (gsi, loop_seq, GSI_SAME_STMT); > + > + /* Split the block just after the loop stmts. */ > + edge post_edge = split_block (loop_bb, cond); > + basic_block post_bb = post_edge->dest; > + loop_bb = post_edge->src; > + *gsi = gsi_for_stmt (gsi_stmt (*gsi)); > + > + /* Create the loop. */ > + post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU; Edges need probabilities, as in nvptx_lockless_update, nvptx_lockfull_update and nvptx_goacc_reduction_init. > + edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE); > + set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb); > + set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb); > + > + gphi *shift_phi = create_phi_node (shift, loop_bb); > + add_phi_arg (shift_phi, old_shift, pre_edge, loc); > + add_phi_arg (shift_phi, new_shift, loop_edge, loc); > + > + gphi *var_phi = create_phi_node (var, loop_bb); > + add_phi_arg (var_phi, old_var, pre_edge, loc); > + add_phi_arg (var_phi, new_var, loop_edge, loc); > + > + loop *loop = alloc_loop (); > + loop->header = loop_bb; > + loop->latch = loop_bb; > + add_loop (loop, loop_bb->loop_father); > + > + return new_var; > +} > + > +/* Dummy reduction vars that have GOMP_MAP_FIRSTPRIVATE_POINTER data > + mappings gets retyped to (void *). Adjust the type of VAR to TYPE > + as appropriate. */ > + > +static tree > +nvptx_adjust_reduction_type (tree var, tree type, gimple_seq *seq) > +{ > + if (TREE_TYPE (TREE_TYPE (var)) == type) > + return var; > + > + tree ptype = build_pointer_type (type); > + tree t = make_ssa_name (ptype); > + tree expr = fold_build1 (NOP_EXPR, ptype, var); > + gimple_seq_add_stmt (seq, gimple_build_assign (t, expr)); > + return t; > +} > + > /* NVPTX implementation of GOACC_REDUCTION_SETUP. */ > > static void > @@ -5602,7 +5790,11 @@ nvptx_goacc_reduction_setup (gcall *call) > tree ref_to_res = gimple_call_arg (call, 1); > > if (!integer_zerop (ref_to_res)) > - var = build_simple_mem_ref (ref_to_res); > + { > + ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var), > + &seq); > + var = build_simple_mem_ref (ref_to_res); > + } > } > > if (level == GOMP_DIM_WORKER) > @@ -5702,7 +5894,11 @@ nvptx_goacc_reduction_init (gcall *call) > init = var; > } > > - gimplify_assign (lhs, init, &seq); > + /* The LHS may be NULL if a reduction variable on a parallel > + construct is initialized to some constant inside the parallel > + region. */ > + if (lhs) > + gimplify_assign (lhs, init, &seq); > } > > pop_gimplify_context (NULL); > @@ -5727,22 +5923,7 @@ nvptx_goacc_reduction_fini (gcall *call) > push_gimplify_context (true); > > if (level == GOMP_DIM_VECTOR) > - { > - /* Emit binary shuffle tree. TODO. Emit this as an actual loop, > - but that requires a method of emitting a unified jump at the > - gimple level. */ > - for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1) > - { > - tree other_var = make_ssa_name (TREE_TYPE (var)); > - nvptx_generate_vector_shuffle (gimple_location (call), > - other_var, var, shfl, &seq); > - > - r = make_ssa_name (TREE_TYPE (var)); > - gimplify_assign (r, fold_build2 (op, TREE_TYPE (var), > - var, other_var), &seq); > - var = r; > - } > - } > + r = nvptx_vector_reduction (gimple_location (call), &gsi, var, op); > else > { > tree accum = NULL_TREE; > @@ -5760,7 +5941,11 @@ nvptx_goacc_reduction_fini (gcall *call) > else if (integer_zerop (ref_to_res)) > r = var; > else > - accum = ref_to_res; > + { > + ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var), > + &seq); > + accum = ref_to_res; > + } > > if (accum) > { > @@ -5809,7 +5994,11 @@ nvptx_goacc_reduction_teardown (gcall *call) > tree ref_to_res = gimple_call_arg (call, 1); > > if (!integer_zerop (ref_to_res)) > - gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq); > + { > + ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var), > + &seq); > + gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq); > + } > } > > if (lhs) > diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md > index 2988f5dfa91..79c4c061841 100644 > --- a/gcc/config/nvptx/nvptx.md > +++ b/gcc/config/nvptx/nvptx.md > @@ -600,6 +600,13 @@ > "%J0\\tbra.uni\\t%l1;" > [(set_attr "predicable" "false")]) > > +(define_insn "cond_uni" > + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") > + (unspec:SI [(match_operand:SI 1 "nvptx_nonmemory_operand" "R")] > + UNSPEC_BR_UNIFIED))] > + "" > + "%.\\tmov%t0\\t%0, %1; // unified") > + > (define_expand "cbranch4" > [(set (pc) > (if_then_else (match_operator 0 "nvptx_comparison_operator" Otherwise, nvptx part LGTM. Thanks, - Tom