public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: Tom de Vries <tdevries@suse.de>
To: Cesar Philippidis <cesar@codesourcery.com>,
	"gcc-patches@gcc.gnu.org" <gcc-patches@gcc.gnu.org>,
	Jakub Jelinek <jakub@redhat.com>
Subject: Re: [patch] various OpenACC reduction enhancements - ME and nvptx changes
Date: Fri, 05 Oct 2018 14:09:00 -0000	[thread overview]
Message-ID: <30208224-2a8a-c50a-d663-39b1bc8d3e93@suse.de> (raw)
In-Reply-To: <27100797-1930-5c5f-ec68-c5888be08088@codesourcery.com>

On 6/29/18 8:19 PM, Cesar Philippidis wrote:
> The attached patch includes the nvptx and GCC ME reductions enhancements.
> 
> Is this patch OK for trunk? It bootstrapped / regression tested cleanly
> for x86_64 with nvptx offloading.
> 

These need fixing:
...
=== ERROR type #5: trailing whitespace (4 error(s)) ===
gcc/config/nvptx/nvptx.c:5139:0:██
gcc/config/nvptx/nvptx.c:5660:8:      doâ–ˆ
gcc/config/nvptx/nvptx.c:5702:0:██
gcc/config/nvptx/nvptx.c:5726:0:██
...


> 	gcc/
> 	* config/nvptx/nvptx.c (nvptx_propagate_unified): New.
> 	(nvptx_split_blocks): Call it for cond_uni insn.
> 	(nvptx_expand_cond_uni): New.
> 	(enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
> 	(nvptx_init_builtins): Initialize it.
> 	(nvptx_expand_builtin):
> 	(nvptx_generate_vector_shuffle): Change integral SHIFT operand to
> 	tree BITS operand.
> 	(nvptx_vector_reduction): New.
> 	(nvptx_adjust_reduction_type): New.
> 	(nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
> 	(nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
> 	(nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
> 	Use it to adjust the type of ref_to_res.
> 	(nvptx_goacc_reduction_teardown):
> 	* config/nvptx/nvptx.md (cond_uni): New pattern.

> diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
> index 5608bee8a8d..33ec3db1153 100644
> --- a/gcc/config/nvptx/nvptx.c
> +++ b/gcc/config/nvptx/nvptx.c
> @@ -2863,6 +2863,52 @@ nvptx_reorg_uniform_simt ()
>      }
>  }
>  
> +/* UNIFIED is a cond_uni insn.  Find the branch insn it affects, and
> +   mark that as unified.  We expect to be in a single block.  */
> +
> +static void
> +nvptx_propagate_unified (rtx_insn *unified)
> +{
> +  rtx_insn *probe = unified;
> +  rtx cond_reg = SET_DEST (PATTERN (unified));
> +  rtx pat = NULL_RTX;
> +
> +  /* Find the comparison.  (We could skip this and simply scan to he
> +     blocks' terminating branch, if we didn't care for self
> +     checking.)  */
> +  for (;;)
> +    {
> +      probe = next_real_insn (probe);
> +      if (!probe)
> +	break;
> +      pat = PATTERN (probe);
> +
> +      if (GET_CODE (pat) == SET
> +	  && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE
> +	  && XEXP (SET_SRC (pat), 0) == cond_reg)
> +	break;
> +      gcc_assert (NONJUMP_INSN_P (probe));
> +    }
> +  gcc_assert (pat);
> +  rtx pred_reg = SET_DEST (pat);
> +
> +  /* Find the branch.  */
> +  do
> +    probe = NEXT_INSN (probe);
> +  while (!JUMP_P (probe));
> +
> +  pat = PATTERN (probe);
> +  rtx itec = XEXP (SET_SRC (pat), 0);
> +  gcc_assert (XEXP (itec, 0) == pred_reg);
> +
> +  /* Mark the branch's condition as unified.  */
> +  rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg),
> +			       UNSPEC_BR_UNIFIED);
> +  bool ok = validate_change (probe, &XEXP (itec, 0), unspec, false);
> +
> +  gcc_assert (ok);
> +}
> +
>  /* Loop structure of the function.  The entire function is described as
>     a NULL loop.  */
>  
> @@ -2964,6 +3010,9 @@ nvptx_split_blocks (bb_insn_map_t *map)
>  	    continue;
>  	  switch (recog_memoized (insn))
>  	    {
> +	    case CODE_FOR_cond_uni:
> +	      nvptx_propagate_unified (insn);
> +	      /* FALLTHROUGH */
>  	    default:
>  	      seen_insn = true;
>  	      continue;
> @@ -5080,6 +5129,21 @@ nvptx_expand_cmp_swap (tree exp, rtx target,
>    return target;
>  }
>  
> +/* Expander for the compare unified builtin.  */
> +
> +static rtx
> +nvptx_expand_cond_uni (tree exp, rtx target, machine_mode mode, int ignore)
> +{
> +  if (ignore)
> +    return target;
> +  
> +  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
> +			 NULL_RTX, mode, EXPAND_NORMAL);
> +
> +  emit_insn (gen_cond_uni (target, src));
> +
> +  return target;
> +}
>  
>  /* Codes for all the NVPTX builtins.  */
>  enum nvptx_builtins
> @@ -5089,6 +5153,7 @@ enum nvptx_builtins
>    NVPTX_BUILTIN_WORKER_ADDR,
>    NVPTX_BUILTIN_CMP_SWAP,
>    NVPTX_BUILTIN_CMP_SWAPLL,
> +  NVPTX_BUILTIN_COND_UNI,
>    NVPTX_BUILTIN_MAX
>  };
>  
> @@ -5126,6 +5191,7 @@ nvptx_init_builtins (void)
>         (PTRVOID, ST, UINT, UINT, NULL_TREE));
>    DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
>    DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
> +  DEF (COND_UNI, "cond_uni", (integer_type_node, integer_type_node, NULL_TREE));
>  
>  #undef DEF
>  #undef ST
> @@ -5158,6 +5224,9 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
>      case NVPTX_BUILTIN_CMP_SWAPLL:
>        return nvptx_expand_cmp_swap (exp, target, mode, ignore);
>  
> +    case NVPTX_BUILTIN_COND_UNI:
> +      return nvptx_expand_cond_uni (exp, target, mode, ignore);
> +
>      default: gcc_unreachable ();
>      }
>  }
> @@ -5284,7 +5353,7 @@ nvptx_get_worker_red_addr (tree type, tree offset)
>  
>  static void
>  nvptx_generate_vector_shuffle (location_t loc,
> -			       tree dest_var, tree var, unsigned shift,
> +			       tree dest_var, tree var, tree bits,
>  			       gimple_seq *seq)
>  {
>    unsigned fn = NVPTX_BUILTIN_SHUFFLE;
> @@ -5307,7 +5376,6 @@ nvptx_generate_vector_shuffle (location_t loc,
>      }
>    
>    tree call = nvptx_builtin_decl (fn, true);
> -  tree bits = build_int_cst (unsigned_type_node, shift);
>    tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
>    tree expr;
>  
> @@ -5583,6 +5651,126 @@ nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
>      return nvptx_lockfull_update (loc, gsi, ptr, var, op);
>  }
>  
> +/* Emit a vector-level reduction loop.  OLD_VAR is the incoming
> +   variable to reduce (valid in each vector), OP is the reduction
> +   operator.  Return the reduced value (an SSA var).
> +
> +   The code we generate looks like:
> +      unsigned old_shift = DIM_SIZE(VECTOR);
> +      do 
> +	{
> +	  shift = PHI (old_shift, new_shift);
> +	  var = PHI (old_var, new_var);
> +	  new_shift = shift >> 1;
> +	  other_var = VSHUFFLE (var, new_shift);
> +	  new_var = var OP other_var;
> +	  cond_var = builtin_cond_uni (new_shift);
> +	}
> +	while (cond_var > 1);
> +
> +  The builtin_cond_ini expands to a cond_uni instruction, which is
> +  processed in nvpts_split_blocks to mark the loop's terminating
> +  branch instruction.  */
> +
> +static tree
> +nvptx_vector_reduction (location_t loc, gimple_stmt_iterator *gsi,
> +			tree old_var, tree_code op)
> +{
> +  tree var_type = TREE_TYPE (old_var);
> +
> +  /*  Emit old_shift = DIM_SIZE(VECTOR) */
> +  tree old_shift = make_ssa_name (integer_type_node);
> +  tree dim = build_int_cst (integer_type_node, GOMP_DIM_VECTOR);
> +  gcall *call = gimple_build_call_internal (IFN_GOACC_DIM_SIZE, 1, dim);
> +  gimple_set_lhs (call, old_shift);
> +  gimple_set_location (call, loc);
> +  gsi_insert_before (gsi, call, GSI_SAME_STMT);
> +
> +  /* Split the block just after the init stmts.  */
> +  basic_block pre_bb = gsi_bb (*gsi);
> +  edge pre_edge = split_block (pre_bb, call);
> +  basic_block loop_bb = pre_edge->dest;
> +  pre_bb = pre_edge->src;
> +  /* Reset the iterator.  */
> +  *gsi = gsi_for_stmt (gsi_stmt (*gsi));
> +
> +  tree shift = make_ssa_name (integer_type_node);
> +  tree new_shift = make_ssa_name (integer_type_node);
> +  tree var = make_ssa_name (var_type);
> +  tree other_var = make_ssa_name (var_type);
> +  tree new_var = make_ssa_name (var_type);
> +  
> +  /* Build and insert the loop body.  */
> +  gimple_seq loop_seq = NULL;
> +
> +  /* new_shift = shift >> 1 */
> +  tree shift_expr = fold_build2 (RSHIFT_EXPR, integer_type_node,
> +				 shift, integer_one_node);
> +  gimplify_assign (new_shift, shift_expr, &loop_seq);
> +
> +  /* other_var = shuffle (var, shift) */
> +  nvptx_generate_vector_shuffle (loc, other_var, var, new_shift, &loop_seq);
> +  /* new_var = var  OP other_var */
> +  tree red_expr = fold_build2 (op, var_type, var, other_var);
> +  gimplify_assign (new_var, red_expr, &loop_seq);
> +
> +  /* Mark the iterator variable as unified.  */
> +  tree cond_var = make_ssa_name (integer_type_node);
> +  tree uni_fn = nvptx_builtin_decl (NVPTX_BUILTIN_COND_UNI, true);
> +  tree uni_expr = build_call_expr_loc (loc, uni_fn, 1, new_shift);
> +  gimplify_assign (cond_var,  uni_expr, &loop_seq);
> +
> +  gcond *cond = gimple_build_cond (LE_EXPR, cond_var, integer_one_node,
> +				   NULL_TREE, NULL_TREE);
> +  gimple_seq_add_stmt (&loop_seq, cond);
> +  
> +  gsi_insert_seq_before (gsi, loop_seq, GSI_SAME_STMT);
> +
> +  /* Split the block just after the loop stmts.  */
> +  edge post_edge = split_block (loop_bb, cond);
> +  basic_block post_bb = post_edge->dest;
> +  loop_bb = post_edge->src;
> +  *gsi = gsi_for_stmt (gsi_stmt (*gsi));
> +
> +  /* Create the loop.  */
> +  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;

Edges need probabilities, as in nvptx_lockless_update,
nvptx_lockfull_update and nvptx_goacc_reduction_init.

> +  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
> +  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
> +  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
> +
> +  gphi *shift_phi = create_phi_node (shift, loop_bb);
> +  add_phi_arg (shift_phi, old_shift, pre_edge, loc);
> +  add_phi_arg (shift_phi, new_shift, loop_edge, loc);
> +
> +  gphi *var_phi = create_phi_node (var, loop_bb);
> +  add_phi_arg (var_phi, old_var, pre_edge, loc);
> +  add_phi_arg (var_phi, new_var, loop_edge, loc);
> +
> +  loop *loop = alloc_loop ();
> +  loop->header = loop_bb;
> +  loop->latch = loop_bb;
> +  add_loop (loop, loop_bb->loop_father);
> +
> +  return new_var;
> +}
> +
> +/* Dummy reduction vars that have GOMP_MAP_FIRSTPRIVATE_POINTER data
> +   mappings gets retyped to (void *).  Adjust the type of VAR to TYPE
> +   as appropriate.  */
> +
> +static tree
> +nvptx_adjust_reduction_type (tree var, tree type, gimple_seq *seq)
> +{
> +  if (TREE_TYPE (TREE_TYPE (var)) == type)
> +    return var;
> +
> +  tree ptype = build_pointer_type (type);
> +  tree t = make_ssa_name (ptype);
> +  tree expr = fold_build1 (NOP_EXPR, ptype, var);
> +  gimple_seq_add_stmt (seq, gimple_build_assign (t, expr));
> +  return t;
> +}
> +
>  /* NVPTX implementation of GOACC_REDUCTION_SETUP.  */
>  
>  static void
> @@ -5602,7 +5790,11 @@ nvptx_goacc_reduction_setup (gcall *call)
>        tree ref_to_res = gimple_call_arg (call, 1);
>  
>        if (!integer_zerop (ref_to_res))
> -	var = build_simple_mem_ref (ref_to_res);
> +	{
> +	  ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var),
> +						    &seq);
> +	  var = build_simple_mem_ref (ref_to_res);
> +	}
>      }
>    
>    if (level == GOMP_DIM_WORKER)
> @@ -5702,7 +5894,11 @@ nvptx_goacc_reduction_init (gcall *call)
>  	    init = var;
>  	}
>  
> -      gimplify_assign (lhs, init, &seq);
> +      /* The LHS may be NULL if a reduction variable on a parallel
> +	 construct is initialized to some constant inside the parallel
> +	 region.  */
> +      if (lhs)
> +	gimplify_assign (lhs, init, &seq);
>      }
>  
>    pop_gimplify_context (NULL);
> @@ -5727,22 +5923,7 @@ nvptx_goacc_reduction_fini (gcall *call)
>    push_gimplify_context (true);
>  
>    if (level == GOMP_DIM_VECTOR)
> -    {
> -      /* Emit binary shuffle tree.  TODO. Emit this as an actual loop,
> -	 but that requires a method of emitting a unified jump at the
> -	 gimple level.  */
> -      for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
> -	{
> -	  tree other_var = make_ssa_name (TREE_TYPE (var));
> -	  nvptx_generate_vector_shuffle (gimple_location (call),
> -					 other_var, var, shfl, &seq);
> -
> -	  r = make_ssa_name (TREE_TYPE (var));
> -	  gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
> -					   var, other_var), &seq);
> -	  var = r;
> -	}
> -    }
> +    r = nvptx_vector_reduction (gimple_location (call), &gsi, var, op);
>    else
>      {
>        tree accum = NULL_TREE;
> @@ -5760,7 +5941,11 @@ nvptx_goacc_reduction_fini (gcall *call)
>        else if (integer_zerop (ref_to_res))
>  	r = var;
>        else
> -	accum = ref_to_res;
> +	{
> +	  ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var),
> +						    &seq);
> +	  accum = ref_to_res;
> +	}
>  
>        if (accum)
>  	{
> @@ -5809,7 +5994,11 @@ nvptx_goacc_reduction_teardown (gcall *call)
>        tree ref_to_res = gimple_call_arg (call, 1);
>  
>        if (!integer_zerop (ref_to_res))
> -	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
> +	{
> +	  ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var),
> +						    &seq);
> +	  gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
> +	}
>      }
>  
>    if (lhs)
> diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
> index 2988f5dfa91..79c4c061841 100644
> --- a/gcc/config/nvptx/nvptx.md
> +++ b/gcc/config/nvptx/nvptx.md
> @@ -600,6 +600,13 @@
>    "%J0\\tbra.uni\\t%l1;"
>    [(set_attr "predicable" "false")])
>  
> +(define_insn "cond_uni"
> +  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
> +          (unspec:SI [(match_operand:SI 1 "nvptx_nonmemory_operand" "R")]
> +  		     UNSPEC_BR_UNIFIED))]
> +  ""
> +  "%.\\tmov%t0\\t%0, %1; // unified")
> +
>  (define_expand "cbranch<mode>4"
>    [(set (pc)
>  	(if_then_else (match_operator 0 "nvptx_comparison_operator"

Otherwise, nvptx part LGTM.

Thanks,
- Tom

  reply	other threads:[~2018-10-05 14:07 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-06-29 18:20 [patch] various OpenACC reduction enhancements Cesar Philippidis
2018-06-29 18:22 ` [patch] various OpenACC reduction enhancements - ME and nvptx changes Cesar Philippidis
2018-10-05 14:09   ` Tom de Vries [this message]
2018-10-30 20:09     ` Cesar Philippidis
2018-12-04 12:29   ` Jakub Jelinek
2018-12-04 15:54     ` Tom de Vries
2018-12-13 15:56       ` Julian Brown
2018-06-29 18:23 ` [patch] various OpenACC reduction enhancements - FE changes Cesar Philippidis
2018-12-04 12:57   ` Jakub Jelinek
2018-12-13 14:12     ` Julian Brown
2018-12-18 13:06       ` Jakub Jelinek
2018-06-29 18:38 ` [patch] various OpenACC reduction enhancements - test cases Cesar Philippidis
2018-12-04 12:59   ` Jakub Jelinek
2018-12-13 14:14     ` Julian Brown

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=30208224-2a8a-c50a-d663-39b1bc8d3e93@suse.de \
    --to=tdevries@suse.de \
    --cc=cesar@codesourcery.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=jakub@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).