On Thu, 16 May 2019, linkw@linux.ibm.com wrote:

> From: Kewen Lin <linkw@linux.ibm.com>
> 
> Hi,
> 
> Previous version link:
> https://gcc.gnu.org/ml/gcc-patches/2019-05/msg00654.html
> 
> Comparing with the previous version, I moved the generic
> parts of rs6000 target hook to IVOPTs.  But I still kept
> the target hook as previous which checks some target
> specific criteria like innermost, max iteration counts
> etc, and checks for invalid stmt in loop.  The reason
> I decided not to move this part to generic is they are
> not generic enough.
> 
> 1) For the part of target specific criteria, if we want
> to put it in generic, we can call the hook
> targetm.can_use_doloop_p, which requires us to
> prepare those four parameters, but each target only needs
> one or two parameters, it means we will evaluate some
> things which aren't required for that target.  So I'd like
> to leave this part to target hook.
> 2) For the other part of target invalid stmt check, as the
> hook invalid_within_doloop grep data shows, no all targets
> need to check whether invalid instructions exist in doloop.
> If we scan all stmts as generic, it can waste time for those
> targets which don't need to check.  Besides, the scope of
> the current check on SWITCH in rs6000 hook is wide, later
> if we want it more exact, we may need to check more stmts
> instead of single.  To let target hook scan the BBs/stmts
> by itself is also more flexible.
> 
> Bootstrapped and regression testing ongoing on powerpc64le.
> 
> Any more comments?
> 
> gcc/ChangeLog
> 
> 2019-05-17  Kewen Lin  <linkw@gcc.gnu.org>
> 
> 	PR middle-end/80791
> 	* target.def (predict_doloop_p): New hook.
> 	* targhooks.h (default_predict_doloop_p): New declaration.
> 	* targhooks.c (default_predict_doloop_p): New function.
> 	* doc/tm.texi.in (TARGET_PREDICT_DOLOOP_P): New hook.
> 	* doc/tm.texi: Regenerate.
> 	* config/rs6000/rs6000.c (invalid_insn_for_doloop_p): New function.
> 	(rs6000_predict_doloop_p): Likewise.
> 	(TARGET_PREDICT_DOLOOP_P): New macro.
> 	* tree-ssa-loop-ivopts.c (generic_predict_doloop_p): New function. 
> 	(costly_iter_for_doloop_p): Likewise.
> 
> ---
>  gcc/config/rs6000/rs6000.c |  79 +++++++++++++++++++++++++++++++++-
>  gcc/doc/tm.texi            |   8 ++++
>  gcc/doc/tm.texi.in         |   2 +
>  gcc/target.def             |   9 ++++
>  gcc/targhooks.c            |  13 ++++++
>  gcc/targhooks.h            |   1 +
>  gcc/tree-ssa-loop-ivopts.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
>  7 files changed, 216 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index a21f4f7..2fd52d7 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -83,6 +83,7 @@
>  #include "tree-ssa-propagate.h"
>  #include "tree-vrp.h"
>  #include "tree-ssanames.h"
> +#include "tree-cfg.h"
>  
>  /* This file should be included last.  */
>  #include "target-def.h"
> @@ -1914,6 +1915,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
>  #undef TARGET_CAN_USE_DOLOOP_P
>  #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
>  
> +#undef TARGET_PREDICT_DOLOOP_P
> +#define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
> +
>  #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
>  #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
>  
> @@ -39436,7 +39440,80 @@ rs6000_mangle_decl_assembler_name (tree decl, tree id)
>    return id;
>  }
>  
> -
> +/* Check whether there are some instructions preventing doloop transformation
> +   inside loop body, mainly for instructions which are possible to kill CTR.
> +
> +   Return true if some invalid insn exits, otherwise return false.  */
> +
> +static bool
> +invalid_insn_for_doloop_p (struct loop *loop)
> +{
> +  basic_block *body = get_loop_body (loop);

You leak memory here.  I still think all of this function should
be in generic code for now.

> +  gimple_stmt_iterator gsi;
> +
> +  for (unsigned i = 0; i < loop->num_nodes; i++)
> +    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
> +      {
> +	gimple *stmt = gsi_stmt (gsi);
> +	if (is_gimple_call (stmt) && !gimple_call_internal_p (stmt)
> +	    && !is_inexpensive_builtin (gimple_call_fndecl (stmt)))
> +	  {
> +	    if (dump_file && (dump_flags & TDF_DETAILS))
> +	      fprintf (dump_file,
> +		       "predict doloop failure due to finding call.\n");
> +	    return true;
> +	  }
> +	if (computed_goto_p (stmt))
> +	  {
> +	    if (dump_file && (dump_flags & TDF_DETAILS))
> +	      fprintf (dump_file, "predict doloop failure due to"
> +				  "finding computed jump.\n");
> +	    return true;
> +	  }
> +
> +	/* TODO: Now this hook is expected to be called in ivopts, which is
> +	   before switchlower1/switchlower2.  Checking for SWITCH at this point
> +	   will eliminate some good candidates.  But since there are only a few
> +	   cases having _a_ switch statement in the innermost loop, it can be a
> +	   low priority enhancement.  */
> +	if (gimple_code (stmt) == GIMPLE_SWITCH)
> +	  {
> +	    if (dump_file && (dump_flags & TDF_DETAILS))
> +	      fprintf (dump_file,
> +		       "predict doloop failure due to finding switch.\n");
> +	    return true;
> +	  }
> +      }
> +
> +  return false;
> +}
> +
> +/* Predict whether the given loop in gimple will be transformed in the RTL
> +   doloop_optimize pass.  This is for rs6000 target specific.  */
> +
> +static bool
> +rs6000_predict_doloop_p (struct loop *loop)
> +{
> +  gcc_assert (loop);
> +
> +  /* On rs6000, targetm.can_use_doloop_p is actually
> +     can_use_doloop_if_innermost.  Just ensure it's innermost.  */

So the point is that ppc only has a single counter register, right?

So the better way would be to expose that via a target hook somehow.
Or simply restrict IVOPTs processing to innermost loops for now.

> +  if (loop->inner != NULL)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	fprintf (dump_file, "predict doloop failure due to"
> +			    "no innermost.\n");
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize targetm.invalid_within_doloop, check for
> +     CALL, tablejump, computed_jump.  */
> +  if (invalid_insn_for_doloop_p (loop))
> +    return false;
> +
> +  return true;
> +}
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
>  
>  #include "gt-rs6000.h"
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 8c8978b..e595b8b 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -11603,6 +11603,14 @@ function version at run-time for a given set of function versions.
>  body must be generated.
>  @end deftypefn
>  
> +@deftypefn {Target Hook} bool TARGET_PREDICT_DOLOOP_P (struct loop *@var{loop})
> +Return true if we can predict it is possible to use low-overhead loops
> +for a particular loop.  The parameter @var{loop} is a pointer to the loop
> +which is going to be checked.  This target hook is required only when the
> +target supports low-overhead loops, and will help some earlier middle-end
> +passes to make some decisions.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
>  Return true if it is possible to use low-overhead loops (@code{doloop_end}
>  and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index fe1194e..e047734 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -7942,6 +7942,8 @@ to by @var{ce_info}.
>  
>  @hook TARGET_GENERATE_VERSION_DISPATCHER_BODY
>  
> +@hook TARGET_PREDICT_DOLOOP_P
> +
>  @hook TARGET_CAN_USE_DOLOOP_P
>  
>  @hook TARGET_INVALID_WITHIN_DOLOOP
> diff --git a/gcc/target.def b/gcc/target.def
> index 66cee07..0a80de3 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -4227,6 +4227,15 @@ DEFHOOK
>  rtx, (machine_mode mode, rtx result, rtx val, rtx failval),
>   default_speculation_safe_value)
>   
> +DEFHOOK
> +(predict_doloop_p,
> + "Return true if we can predict it is possible to use low-overhead loops\n\
> +for a particular loop.  The parameter @var{loop} is a pointer to the loop\n\
> +which is going to be checked.  This target hook is required only when the\n\
> +target supports low-overhead loops, and will help some earlier middle-end\n\
> +passes to make some decisions.",
> + bool, (struct loop *loop),
> + default_predict_doloop_p)
>  
>  DEFHOOK
>  (can_use_doloop_p,
> diff --git a/gcc/targhooks.c b/gcc/targhooks.c
> index 318f7e9..56ed4cc 100644
> --- a/gcc/targhooks.c
> +++ b/gcc/targhooks.c
> @@ -643,6 +643,19 @@ default_has_ifunc_p (void)
>    return HAVE_GNU_INDIRECT_FUNCTION;
>  }
>  
> +/* True if we can predict this loop is possible to be transformed to a
> +   low-overhead loop, otherwise returns false.
> +
> +   By default, false is returned, as this hook's applicability should be
> +   verified for each target.  Target maintainers should re-define the hook
> +   if the target can take advantage of it.  */
> +
> +bool
> +default_predict_doloop_p (struct loop *loop ATTRIBUTE_UNUSED)
> +{
> +  return false;
> +}
> +
>  /* NULL if INSN insn is valid within a low-overhead loop, otherwise returns
>     an error message.
>  
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 5943627..70bfb17 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -85,6 +85,7 @@ extern bool default_fixed_point_supported_p (void);
>  
>  extern bool default_has_ifunc_p (void);
>  
> +extern bool default_predict_doloop_p (struct loop *);
>  extern const char * default_invalid_within_doloop (const rtx_insn *);
>  
>  extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
> diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
> index a44b4cb..ed7f2a5 100644
> --- a/gcc/tree-ssa-loop-ivopts.c
> +++ b/gcc/tree-ssa-loop-ivopts.c
> @@ -3776,6 +3776,111 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data)
>    return NULL_TREE;
>  }
>  
> +/* Check whether number of iteration computation is too costly for doloop
> +   transformation.  It expands the gimple sequence to equivalent RTL insn
> +   sequence, then evaluate the cost.
> +
> +   Return true if it's costly, otherwise return false.  */
> +
> +static bool
> +costly_iter_for_doloop_p (struct loop *loop, tree niters)
> +{

I already said this but I expect IVOPTs to already cost initial
value computation of new IVs.  I also doubt it's worth
going into RTL details here, estimate_num_insns should be
enough, like via using expression_expensive_p from SCEV analysis
which is already used by IVOPTs.

You try to be perfect here but end up adding more
magic numbers (PARAM_MAX_ITERATIONS_COMPUTATION_COST).

> +  tree type = TREE_TYPE (niters);
> +  unsigned cost = 0;
> +  bool speed = optimize_loop_for_speed_p (loop);
> +  int regno = LAST_VIRTUAL_REGISTER + 1;
> +  walk_tree (&niters, prepare_decl_rtl, &regno, NULL);
> +  start_sequence ();
> +  expand_expr (niters, NULL_RTX, TYPE_MODE (type), EXPAND_NORMAL);
> +  rtx_insn *seq = get_insns ();
> +  end_sequence ();
> +
> +  for (; seq; seq = NEXT_INSN (seq))
> +    {
> +      if (!INSN_P (seq))
> +	continue;
> +      rtx body = PATTERN (seq);
> +      if (GET_CODE (body) == SET)
> +	{
> +	  rtx set_val = XEXP (body, 1);
> +	  enum rtx_code code = GET_CODE (set_val);
> +	  enum rtx_class cls = GET_RTX_CLASS (code);
> +	  /* For now, we only consider these two RTX classes, to match what we
> +	 get in doloop_optimize, excluding operations like zero/sign extend.  */
> +	  if (cls == RTX_BIN_ARITH || cls == RTX_COMM_ARITH)
> +	    cost += set_src_cost (set_val, GET_MODE (set_val), speed);
> +	}
> +    }
> +  unsigned max_cost
> +    = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST));
> +  if (cost > max_cost)
> +    return true;
> +
> +  return false;
> +}
> +
> +/* Predict whether the given loop will be transformed in the RTL
> +   doloop_optimize pass.  Attempt to duplicate as many doloop_optimize checks
> +   as possible.  This is only for target independent checks, see
> +   targetm.predict_doloop_p for the target dependent ones.
> +
> +   Some RTL specific checks seems unable to be checked in gimple, if any new
> +   checks or easy checks _are_ missing here, please add them.  */
> +
> +static bool
> +generic_predict_doloop_p (struct ivopts_data *data)
> +{
> +  struct loop *loop = data->current_loop;
> +
> +  /* Call target hook for target dependent checks.  */
> +  if (!targetm.predict_doloop_p (loop))
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	fprintf (dump_file, "predict doloop failure due to"
> +			    "target specific checks.\n");
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize, check iteration description to know it's
> +     suitable or not.  */
> +  edge exit = loop_latch_edge (loop);
> +  struct tree_niter_desc *niter_desc = niter_for_exit (data, exit);
> +  if (niter_desc == NULL)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	fprintf (dump_file, "predict doloop failure due to"
> +			    "unexpected niters.\n");
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize, check whether iteration count too small
> +     and not profitable.  */
> +  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
> +  if (est_niter == -1)
> +    est_niter = get_likely_max_loop_iterations_int (loop);
> +  if (est_niter >= 0 && est_niter < 3)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	fprintf (dump_file,
> +		 "predict doloop failure due to"
> +		 "too few iterations (%u).\n",
> +		 (unsigned int) est_niter);
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize, check whether number of iterations too costly
> +     to compute.  */
> +  if (costly_iter_for_doloop_p (loop, niter_desc->niter))
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	fprintf (dump_file, "predict doloop failure due to"
> +			    "costly niter computation.\n");
> +      return false;
> +    }
> +
> +  return true;
> +}
> +
>  /* Determines cost of the computation of EXPR.  */
>  
>  static unsigned
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Linux GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany;
GF: Felix Imendörffer, Mary Higgins, Sri Rasiah; HRB 21284 (AG Nürnberg)